# Example #1 (0)
def get(args, send_type):
    '''Run the async upload pipeline synchronously and log how long it took.

    Returns the upload-data dict produced by the coroutine run.
    '''
    medium_type = 1
    start = time.time()
    upload_data_dict = async_run(args, send_type, medium_type)
    elapsed = time.time() - start
    logger.info(f"调用协程完成---所用时间{elapsed}")
    return upload_data_dict
# Example #2 (0)
def async_run(args, send_type, medium_type):
    '''Run the Baidu-news upload coroutine to completion on a private event loop.

    Builds the project uploader, drives its ``baidu_run()`` coroutine on a
    fresh loop, and returns the resulting upload-data dict.
    '''
    logger.info("调用协程开始")
    baidu_news_upload_data = Baidu_news_upload_data(args, send_type,
                                                    medium_type)
    baidu_run = baidu_news_upload_data.baidu_run()
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        upload_data_dict = loop.run_until_complete(baidu_run)
        return upload_data_dict
    finally:
        loop.close()
        # Bug fix: the closed loop was left installed as the thread's current
        # event loop, so a later asyncio.get_event_loop() would hand back a
        # dead loop. Clear it so the thread state is clean again.
        asyncio.set_event_loop(None)
# Example #3 (0)
def test(args, send_type, words):
    '''Exercise the pipeline for every keyword and tally article-fetch errors.'''
    error_counts = 0
    for word in words:
        args['word'] = word
        upload_data_dict = get(args, send_type)
        for result in upload_data_dict['results']:
            article_args = {'url': result['Url']}
            fetcher = Get_baidu_article(article_args, send_type, 0)
            new_data, error_count = fetcher.run()
            error_counts += error_count
            print(new_data)
    log.info(f'错误数量-{error_counts}')
# Example #4 (0)
 def get_article_html(self, url):
     '''Fetch *url* and return its page source decoded with the detected charset.

     Returns "" on HTTP or network failure (and increments self.error_count);
     returns None when the response body is JSON rather than an HTML page.
     '''
     headers = {"User-Agent": self.UA}
     try:
         res = requests.get(url=url,
                            headers=headers,
                            verify=False,
                            timeout=60)
         if res.status_code == 200:
             try:
                 # A parseable JSON body means this is an API response,
                 # not an article page — nothing to decode.
                 res.json()
                 log.info(f"该url-{url}源码为json格式")
                 return
             except ValueError:
                 # Not JSON: sniff the charset declared in the HTML meta tag.
                 charset = re.search('<meta.*?charset(.*?)>', res.text)
                 if charset:
                     charset = re.sub('"|=|/', '', charset.group(1)).lower()
                     # Normalize to a known encoding name when one matches.
                     for html_encode in html_encode_list:
                         if html_encode in charset:
                             charset = html_encode
                     if charset == 'unicode':
                         charset = 'unicode_escape'
                     # huaxia pages mis-declare their charset — force gb2312.
                     if 'huaxia' in url:
                         charset = 'gb2312'
                     result = res.content.decode(charset, 'ignore')
                 else:
                     # No declared charset: try UTF-8, fall back to
                     # requests' own guess.
                     try:
                         result = res.content.decode()
                     except UnicodeDecodeError:
                         result = res.text
         else:
             self.error_count += 1
             result = ""
             log.error(f"请求该url-{url}的详情页出错,状态码-{res.status_code}")
     except Exception as e:
         self.error_count += 1
         result = ""
         log.error(f'访问该url-{url}失败-原因-{str(e)}')
     return result
# Example #5 (0)
        results = upload_data_dict['results']
        for result in results:
            url = result['Url']
            srgs = {'url': url}
            get_baidu_article = Get_baidu_article(srgs, send_type, 0)
            new_data, error_count = get_baidu_article.run()
            error_counts += error_count
            print(new_data)
    log.info(f'错误数量-{error_counts}')


if __name__ == '__main__':
    # Single-article smoke test: fetch one known detail page and
    # pretty-print the parsed result. (The keyword-batch entry point,
    # baidu_article_main with a words list, is driven elsewhere.)
    error_count = 0
    args = {
        'url': 'http://news.changsha.cn/xctt/html/110187/20200114/65709.shtml',
    }
    send_type = 1
    fetcher = Get_baidu_article(args, send_type, error_count)
    new_data, error_count = fetcher.run()
    log.info(f'数据--{json.dumps(new_data, ensure_ascii=False, indent=4)}')