import asyncio
import functools
import json
import logging
import re
import time

import requests

# Project-local names assumed to be importable elsewhere in this repo:
# Baidu_news_upload_data, Get_baidu_article and html_encode_list.
log = logging.getLogger(__name__)


def get(args, send_type):
    '''Fetch the upload-data dict for one search request and log how long it took.'''
    medium_type = 1
    start_time = time.time()
    upload_data_dict = async_run(args, send_type, medium_type)
    need_time = time.time() - start_time
    log.info(f"Coroutine call finished -- took {need_time}s")
    return upload_data_dict
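
# The start/stop timing in get() is a pattern that tends to repeat across a
# crawler like this; a minimal sketch of factoring it into a decorator. The
# decorator name is an illustrative assumption, not part of the original code.
def timed(func):
    '''Log how long the wrapped function takes, even if it raises.'''
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()
        try:
            return func(*args, **kwargs)
        finally:
            # finally: the duration is logged on both success and failure.
            log.info(f"{func.__name__} took {time.time() - start_time:.3f}s")
    return wrapper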

def async_run(args, send_type, medium_type):
    '''Drive the Baidu upload coroutine to completion on a fresh event loop.'''
    log.info("Starting coroutine call")
    baidu_news_upload_data = Baidu_news_upload_data(args, send_type, medium_type)
    baidu_run = baidu_news_upload_data.baidu_run()
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        return loop.run_until_complete(baidu_run)
    finally:
        loop.close()
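
# async_run() creates and closes the event loop by hand. On Python 3.7+ the
# same lifecycle (fresh loop, run to completion, close) is what asyncio.run()
# provides; a minimal sketch of the equivalent, assuming no loop is already
# running in the calling thread (the function name here is illustrative):
def async_run_modern(args, send_type, medium_type):
    '''Run the Baidu upload coroutine on a throwaway event loop.'''
    baidu_news_upload_data = Baidu_news_upload_data(args, send_type, medium_type)
    return asyncio.run(baidu_news_upload_data.baidu_run())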

def test(args, send_type, words):
    '''Smoke test: search each word, then fetch every article in the results.'''
    error_counts = 0
    for word in words:
        args['word'] = word
        upload_data_dict = get(args, send_type)
        for result in upload_data_dict['results']:
            article_args = {'url': result['Url']}
            get_baidu_article = Get_baidu_article(article_args, send_type, 0)
            new_data, error_count = get_baidu_article.run()
            error_counts += error_count
            print(new_data)
    log.info(f'Error count - {error_counts}')

# Likely a method of Get_baidu_article (the class body is elided in this excerpt).
def get_article_html(self, url):
    '''Fetch a detail page and return its source decoded with the right charset.'''
    # "Connection: close" tears the connection down after each request.
    headers = {"User-Agent": self.UA, "Connection": "close"}
    try:
        res = requests.get(url=url, headers=headers, verify=False, timeout=60)
        if res.status_code == 200:
            try:
                res.json()
                # JSON responses are not article HTML; log and return an empty
                # string so callers always get a str back.
                log.info(f"Source of url {url} is JSON")
                return ""
            except ValueError:
                charset = re.search('<meta.*?charset(.*?)>', res.text)
                if charset:
                    charset = re.sub('"|=|/', '', charset.group(1)).lower()
                    # Map the sniffed value onto a known encoding name.
                    for html_encode in html_encode_list:
                        if html_encode in charset:
                            charset = html_encode
                    if charset == 'unicode':
                        charset = 'unicode_escape'
                    if 'huaxia' in url:  # site-specific override
                        charset = 'gb2312'
                    result = res.content.decode(charset, 'ignore')
                else:
                    # No meta charset: try the default decode, then fall back
                    # to requests' own guess.
                    try:
                        result = res.content.decode()
                    except UnicodeDecodeError:
                        result = res.text
        else:
            self.error_count += 1
            result = ""
            log.error(f"Request for detail page of url {url} failed, status code {res.status_code}")
    except Exception as e:
        self.error_count += 1
        result = ""
        log.error(f"Failed to access url {url} - reason: {e}")
    return result
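
# A standalone check of the meta-charset sniffing used in get_article_html(),
# run against a hand-written HTML snippet. The sample markup and the candidate
# encodings below are illustrative assumptions; the real module reads the
# candidates from html_encode_list.
def _demo_charset_sniffing():
    sample = '<html><head><meta content="text/html; charset=gb2312"></head></html>'
    match = re.search('<meta.*?charset(.*?)>', sample)
    # The capture here is '=gb2312"'; stripping quotes, '=' and '/' leaves the
    # bare encoding name, which the membership loop then canonicalises.
    charset = re.sub('"|=|/', '', match.group(1)).lower()
    for html_encode in ['utf-8', 'gbk', 'gb2312', 'unicode']:
        if html_encode in charset:
            charset = html_encode
    print(charset)  # -> gb2312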

if __name__ == '__main__':
    # Batch smoke test, kept for reference:
    # words = ['广州', '皮肤', '伙伴', '冒险', '合作', '手游', '杭州', '豪车', '造型', '国内']
    # args = {
    #     'word': '',
    #     'rows': 30,
    #     'page': 1,
    # }
    # send_type = 1
    # baidu_article_main(args, send_type)
    error_count = 0
    args = {
        'url': 'http://news.changsha.cn/xctt/html/110187/20200114/65709.shtml',
    }
    send_type = 1
    get_baidu_article = Get_baidu_article(args, send_type, error_count)
    new_data, error_count = get_baidu_article.run()
    log.info(f'Data -- {json.dumps(new_data, ensure_ascii=False, indent=4)}')
    # print('Error count', error_count)