def save_article(self, categoryName, url, imgurl):
    """Fetch one 投资界 article page, extract its fields and POST it.

    Returns True when the article was saved or skipped (fetch failure,
    missing content); False when the publish time fails verification,
    which callers use as a "stop paging" signal.
    """
    logger.info("视频网页地址:" + url)
    data = HttpUtil.get_html(url)
    if data is None:
        # Fetch failure: skip this article but let the caller keep crawling.
        return True
    title = self.obtainInfo.find_title(data)
    public_time = self.obtainInfo.find_time(data)
    if not DateUtil.verify_time(public_time):
        # Article too old / bad timestamp: tell the caller to stop.
        return False
    subject = self.obtainInfo.find_subject(data)
    context = self.obtainInfo.find_context(data)
    tags = self.obtainInfo.find_tags(data)
    if context is None:
        # BUG FIX: the original fell through and referenced `bs64` before
        # assignment (NameError) when no content was extracted.
        return True
    # Base64-encode the article body for transport ("加密" in the original
    # comment — it is an encoding, not encryption).
    # BUG FIX: str(bytes) in Python 3 yields "b'...'"; decode instead.
    bs64 = base64.b64encode(context.encode('utf-8')).decode('ascii')
    p = paramater(categoryName, title, '', '', str(subject), bs64,
                  imgurl, tags, '投资界', url, public_time)
    over_dict = p.__dict__
    # Round-trip through json to normalise the payload before posting.
    result = json.dumps(over_dict, ensure_ascii=False)
    js = json.loads(result)
    HttpUtil.post(js)
    return True
def save_article(self, categoryName, tag, url, imgurl):
    """Fetch a 36Kr article, split author/time info, and POST the article.

    Returns True on save or skip, False when the publish time fails
    verification (signals the caller to stop paging).
    """
    data = HttpUtil.get_html(url)
    if data is None:
        return True
    title = self.obtainInfo.find_title(data)
    authors = self.obtainInfo.find_author_info(data)
    context = self.obtainInfo.find_context(data)
    subject = self.obtainInfo.find_subject(data)
    tags = tag
    author = ''
    public_time = ''
    # The author-info list mixes the author name and the publish time; the
    # entry containing ':' is assumed to be the timestamp — TODO confirm
    # against find_author_info's output.
    for v in authors:
        if v.find(':') > 0:
            public_time = v
        else:
            author = v
    # Dates like "2019年01月02日" are converted to the standard format first.
    if public_time.find('年') > 0:
        public_time = DateUtil.time_transfer(public_time)
    if not DateUtil.verify_time(public_time):
        return False
    if context is None:
        # BUG FIX: the original fell through and used `bs64` unbound
        # (NameError) when no content was extracted.
        return True
    # Base64-encode the article body for transport.
    # BUG FIX: str(bytes) in Python 3 produces "b'...'"; decode instead.
    bs64 = base64.b64encode(context.encode('utf-8')).decode('ascii')
    p = paramater(categoryName, title, author, author, str(subject), bs64,
                  imgurl, tags, '36Kr网', url, public_time)
    over_dict = p.__dict__
    result = json.dumps(over_dict, ensure_ascii=False)
    js = json.loads(result)
    HttpUtil.post(js)
    return True
def save_article(self, categoryName, url, imgurl):
    """Fetch one article page, extract fields via soup, and POST it.

    Returns True on save or skip, False when the publish time fails
    verification (signals the caller to stop paging).
    """
    logger.info("视频网页地址:" + url)
    data = HttpUtil.get_html(url)
    if data is None:
        return True
    soup_obj = self.obtainInfo.get_soup_obj(data)
    title = self.obtainInfo.get_title(soup_obj)
    public_time = self.obtainInfo.get_time(soup_obj)
    # BUG FIX: str.find() returns -1 (truthy) when '前' is absent, so the
    # original condition replaced almost every real timestamp with "now".
    # Relative times ("N分钟前" etc.) are the ones meant to be replaced.
    if public_time is None or '前' in public_time:
        public_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    if not DateUtil.verify_time(public_time):
        return False
    subject = self.obtainInfo.get_desc(soup_obj)
    context = self.obtainInfo.get_content(soup_obj)
    tags = self.obtainInfo.get_tag(soup_obj)
    author = self.obtainInfo.get_author(soup_obj)
    if context is None:
        # BUG FIX: the original referenced `bs64` unbound when no content
        # was extracted.
        return True
    # Base64-encode the article body for transport.
    # BUG FIX: str(bytes) in Python 3 yields "b'...'"; decode instead.
    bs64 = base64.b64encode(context.encode('utf-8')).decode('ascii')
    p = paramater(categoryName, title, author, author, str(subject), bs64,
                  imgurl, tags, self.default_value_source, url, public_time)
    over_dict = p.__dict__
    result = json.dumps(over_dict, ensure_ascii=False)
    js = json.loads(result)
    # 请求数据 — post the assembled payload.
    HttpUtil.post(js)
    return True
def save_video(self, categoryName, tags, url, img, time):
    """Fetch a 金斧子 video page, wrap the player URL in an iframe, POST it.

    NOTE(review): the `time` parameter shadows the stdlib `time` module
    inside this method; kept unrenamed for caller compatibility.
    """
    logger.info("视频网页地址:" + url)
    data = HttpUtil.get_html(url)
    if data is None:
        return
    context = self.obtainInfo.find_video_context(data)
    if context is None:
        return
    # Embed the extracted player URL in a fixed-size iframe.
    context = ('<iframe width="680" height="480" src="' + context
               + '" frameborder=0 allowfullscreen></iframe>')
    author = self.obtainInfo.find_video_author(data)
    title = self.obtainInfo.find_video_title(data)
    subject = title
    # The original re-checked `context is not None` here, but context was
    # rebuilt above and cannot be None — the check was dead code.
    # Base64-encode the iframe markup for transport.
    # BUG FIX: str(bytes) in Python 3 yields "b'...'"; decode instead.
    bs64 = base64.b64encode(context.encode('utf-8')).decode('ascii')
    p = paramater(categoryName, title, author, author, str(subject), bs64,
                  img, tags, '金斧子', url, time)
    over_dict = p.__dict__
    result = json.dumps(over_dict, ensure_ascii=False)
    js = json.loads(result)
    HttpUtil.post(js)
def run(self):
    """Crawl the first listing page, then follow b_id-based pagination
    until an empty result page is returned."""
    logger.info("开始线程:", self.thread_id)
    act_url = self.url
    logger.info(act_url)
    html = HttpUtil.get_html(act_url)
    if html is None:
        return
    # First page: articles plus the cursor (b_id) for the next request.
    pages, b_id = self.obtain.find_pages1(html.decode("UTF-8"))
    for article_url, article_img in pages.items():
        self.save.save_article(self.categoryName, self.tag,
                               article_url, article_img)
    # Follow the cursor until the API stops returning articles.
    while True:
        act_url = self.sub_url + '&b_id=' + str(b_id) + '&per_page=30'
        logger.info('分页URL:' + act_url)
        html = HttpUtil.get_html(act_url)
        if html is None:
            return
        pages, b_id = self.obtain.find_pages2(html.decode("UTF-8"))
        if not pages:
            return
        for article_url, article_img in pages.items():
            self.save.save_article(self.categoryName, self.tag,
                                   article_url, article_img)
def run(self):
    """Crawl listing pages pN.html until an empty page or a stale article.

    NOTE(review): assumes `logger` is stdlib logging — the original passed
    extra args without %-placeholders, which raises a formatting error in
    the handler and loses the thread id; confirm `logger`'s type.
    """
    logger.info("开始线程:%s", self.thread_id)
    page_no = 0
    while True:
        page_no += 1
        act_url = self.url + 'p' + str(page_no) + '.html'
        logger.info(act_url)
        html = HttpUtil.get_html(act_url)
        if html is None:
            # Fetch failure: skip this listing page, try the next one.
            continue
        pages = self.obtain.find_article_pages(html.decode("UTF-8"))
        if len(pages) == 0:
            return
        for key, desc in pages.items():
            if not self.save.save_article(self.categoryName, self.tag,
                                          key, desc):
                # save_article returned False: article too old, stop
                # crawling entirely (replaces the original flag/double-break).
                return

def __del__(self):
    # BUG FIX: the original passed thread_id as the log *message* and the
    # text as an ignored argument; format both into one message.
    logger.info("%s 线程结束!)", self.thread_id)
def run(self):
    """Crawl numbered listing pages, dispatching the parser on self.type.

    type 1 and type 2 select different page-parsing strategies; any other
    value stops the crawl.
    """
    logger.info("开始线程:%s", self.thread_id)
    page_no = 0
    while True:
        page_no += 1
        act_url = self.url + str(page_no) + '-10.shtml'
        logger.info(act_url)
        html = HttpUtil.get_html(act_url)
        if html is None:
            # Fetch failure: skip this listing page, try the next one.
            continue
        if self.type == 1:
            pages = self.obtain.find_pages1(html.decode("UTF-8"))
        elif self.type == 2:
            pages = self.obtain.find_pages2(html.decode("UTF-8"))
        else:
            # BUG FIX: the original left `pages` unbound (NameError) — or
            # silently reused the previous iteration's value — for any
            # other type.
            break
        if len(pages) == 0:
            break
        for key, value in pages.items():
            if not self.save.save_article(self.categoryName, key, value):
                # Stale article: stop crawling (the original's `num`
                # counter was never read and has been dropped).
                return
def run(self):
    """Iterate '?page=N' listing pages and save every article found,
    sleeping one second between articles."""
    logger.info("开始线程:", self.thread_id)
    page_no = 0
    while True:
        page_no += 1
        act_url = self.url + "?page=" + str(page_no)
        logger.info(act_url)
        # 这里是先拿到这个界面所有的链接 — fetch the listing page first.
        html = HttpUtil.get_html(act_url)
        if html is None:
            return
        result = self.obtain.find_page_info_by_html_str(html.decode("UTF-8"))
        if not result:
            break
        saved = 0
        ok = True
        # 这里去循环请求 — request each article in turn.
        for link, desc in result.items():
            ok = self.save.save_article(self.categoryName, link, desc)
            if not ok:
                break
            saved += 1
            time.sleep(1)
        if not ok:
            break
def run(self):
    """Walk numbered listing pages, resolving links in two steps and
    saving each article until an empty page or a stale article."""
    logger.info("开始线程:", self.thread_id)
    page_no = 0
    while True:
        page_no += 1
        act_url = self.url + str(page_no)
        logger.info(act_url)
        html = HttpUtil.get_html(act_url)
        if html is None:
            return
        # Two-step extraction: raw page list, then per-page details.
        pages = self.obtain.find_pages(html.decode("UTF-8"))
        result = self.obtain.find_page_info(pages)
        if not result:
            break
        saved = 0
        ok = True
        for link, desc in result.items():
            ok = self.save.save_article(self.categoryName, link, desc)
            if not ok:
                break
            saved += 1
        if not ok:
            break
def run(self):
    """Crawl numbered video listing pages and save each video found.

    NOTE(review): assumes `logger` is stdlib logging; the original passed
    thread_id as an extra arg with no placeholder — confirm logger's type.
    """
    logger.info("开始线程:%s", self.thread_id)
    page_no = 0
    while True:
        page_no += 1
        act_url = self.url + str(page_no)
        logger.info(act_url)
        html = HttpUtil.get_html(act_url)
        if html is None:
            # Fetch failure: skip this listing page, try the next one.
            continue
        pages, times = self.obtain.find_video_pages(html.decode("UTF-8"))
        if len(pages) == 0:
            return
        # BUG FIX: the original initialised `num = 0` but never advanced
        # it, so every video on the page reused times[0]; pair each entry
        # with its own timestamp instead.
        for idx, (video_url, img) in enumerate(pages.items()):
            public_time = times[idx]
            # BUG FIX: str.find() returns -1 (truthy) when '前' is absent,
            # inverting the intended check.  Relative times ("N分钟前")
            # are the ones replaced with "now".
            if public_time is None or '前' in public_time:
                public_time = time.strftime("%Y-%m-%d %H:%M:%S",
                                            time.localtime())
            if not DateUtil.verify_time(public_time):
                # Video too old: stop paging entirely.
                return
            self.save.save_video(self.categoryName, self.tag,
                                 video_url, img, public_time)

def __del__(self):
    # BUG FIX: the original passed thread_id as the log *message*; format
    # both pieces into one message.
    logger.info("%s 线程结束!)", self.thread_id)
from com.unif.util.HttpUtil import HttpUtil

# Target endpoint: "http://192.168.30.153:8087/section/findSecById"
# Fire a single lookup request with a placeholder section id.
payload = {'id': 0}
HttpUtil.post(payload)