def remove(self, key):
    """
    Delete the given key on the configured OPS server.

    @param key: key to delete
    @return: None
    """
    logcm.print_info("Remove ops key (key : %s)" % key)
    # Build the request URL for the 'del' action.
    target_url = self.get_url('del', key)
    # Fire the request at the OPS server; the response body is discarded.
    webcm.read_url(target_url, self.cfg['encoding'])
def update(self, key, val):
    """
    Update the value stored under the given key on the OPS server.

    @param key: key to update
    @param val: new value
    @return: None
    """
    logcm.print_info("Update ops value (key : %s, val : %s )" % (key, val))
    # Build the request URL for the 'update' action.
    target_url = self.get_url('update', key, val)
    # Fire the request at the OPS server; the response body is discarded.
    webcm.read_url(target_url, self.cfg['encoding'])
def add(self, key, val):
    """
    Add a new key/value pair on the OPS server.

    @param key: key to add
    @param val: value to store
    @return: None
    """
    logcm.print_info("Add ops value (key : %s, val : %s )" % (key, val))
    # Build the request URL for the 'add' action.
    target_url = self.get_url('add', key, val)
    # Fire the request at the OPS server; the response body is discarded.
    webcm.read_url(target_url, self.cfg['encoding'])
def crawl_from_url(page_url, next_page_select, img_select, tag_select, local_path, encoding='utf-8', page_no=1):
    """
    Download all matching images from a page, then recurse into the next page
    discovered via a CSS selector, until no next-page link is found.

    @param page_url: URL of the page to crawl
    @param next_page_select: CSS select expression locating the next-page link
    @param img_select: CSS select expression locating the images
    @param tag_select: CSS select expression locating the tags
    @param local_path: directory where downloaded files are stored
    @param encoding: page character encoding
    @param page_no: current page number (used for logging/naming)
    @return: None
    """
    logcm.print_info("crawl_from_url Page.%d start..." % page_no)

    # Fetch and parse the current page.
    html = webcm.read_url(page_url, encoding)
    soup = htmlcm.to_soup(html)

    # Save every image on this page.
    webcm.down_img(soup, page_url, img_select, tag_select, local_path, page_no)

    # Guard clause: stop when no next-page link can be resolved.
    next_page_url = htmlcm.next_page(soup, page_url, next_page_select)
    if next_page_url is None:
        logcm.print_info("End\n")
        return

    logcm.print_info("NextPageUrl is " + next_page_url)
    crawl_from_url(next_page_url, next_page_select, img_select, tag_select,
                   local_path, encoding, page_no + 1)
def load(self):
    """
    Fetch every key/value pair from the OPS server's settings page.

    @return: dict mapping keys to values, or None if the page could not be read
    """
    logcm.print_info("Loading all ops values ...")

    # Request the OPS settings page.
    page_url = self.get_url('load')
    html = webcm.read_url(page_url, self.cfg['encoding'])
    if html is None:
        return None

    soup = htmlcm.to_soup(html)
    result = {}
    rows = soup.select("body table tr")
    # Skip the header row; each data row keeps the key in column 3
    # and the value in column 4 (0-based indices 2 and 3).
    for row in rows[1:]:
        cells = row.select("td")
        if len(cells) == 4:
            result[cells[2].string] = cells[3].string
    return result
def crawl_with_format(page_url, next_page_format, img_select, tag_select, local_path, encoding='utf-8', page_no=1):
    """
    Download all matching images from a page, then recurse into the next page
    built from a URL template, stopping when a page yields no new downloads.

    @param page_url: URL of the page to crawl
    @param next_page_format: next-page URL template containing '[page_no]'
    @param img_select: CSS select expression locating the images
    @param tag_select: CSS select expression locating the tags
    @param local_path: directory where downloaded files are stored
    @param encoding: page character encoding
    @param page_no: current page number (used for logging/naming)
    @return: None
    """
    logcm.print_info("......crawl_with_format Page." + str(page_no) + "......")

    # Fetch and parse the current page.
    html = webcm.read_url(page_url, encoding)
    soup = htmlcm.to_soup(html)

    # Guard clause: a page that produces no downloads (nothing found, or
    # every file already exists) terminates the crawl.
    downloaded = webcm.down_img(soup, page_url, img_select, tag_select, local_path, page_no)
    if downloaded == 0:
        logcm.print_info("Not found image End\n")
        return

    # Derive the next page's URL by substituting the page number
    # into the template, then recurse.
    next_no = page_no + 1
    next_page_url = next_page_format.replace('[page_no]', str(next_no))
    logcm.print_info("NextPageUrl is " + next_page_url)
    crawl_with_format(next_page_url, next_page_format, img_select, tag_select,
                      local_path, encoding, next_no)
""" import jieba.analyse import matplotlib.pyplot as plt from common import htmlcm from common import logcm from common import webcm from common import wordscm from scipy.misc import imread from wordcloud import WordCloud # 加载内容URL的内容 url = 'http://www.leadbankmap.com/baogao/detail_4800.html' encoding = 'utf-8' html = webcm.read_url(url, encoding) content = htmlcm.clean_html(html) # 加载停用词列表 stopwords = wordscm.load_stopwords('./data', 'words_stop1.txt', encoding) # 导入stopwords jieba.analyse.set_stop_words('./data/words_stop1.txt') # 文字频率排行 seg = jieba.analyse.textrank(content, topK=50, withWeight=False, allowPOS=('nt', 'n', 'nv')) cut_text = " ".join(seg) logcm.print_obj(cut_text, 'cut_text')