Example #1
def get_all_article():

    article = []
    nav_url = find_data(url_base)
    article_num = 0
    article_list = []
    #nav_url.append(url_base)
    parse_log.info('Start crawling all article links for today....')
    for i in nav_url:
        # iterate over the navigation-bar links
        page_num, art_list = get_page(i['type_url'])
        if not page_num:
            # skip categories that have no articles
            continue
        parse_log.debug('Fetched {0} article links'.format(page_num))
        article_list.append({
            'type_name': i['type_name'],
            'art_list': art_list
        })

    upload_end = []

    total = 0

    for i in article_list:
        #upload_type = news_type[i['type_name']]
        waiting_upload = i['art_list']
        #parse_log.debug('Start uploading, {0} items pending'.format(len(waiting_upload)))
        total = total + len(waiting_upload)
        for a in waiting_upload:
            if a not in upload_end:
                article_title, artibody = handle_article(a)
                if not artibody or not article_title:
                    continue
                article.append({
                    'title': article_title,
                    'body': artibody,
                    'type': i['type_name'],
                    'url': a,
                    'source_url': url_base
                })

                parse_log.debug(u'Fetched article content: column {0}.....{1}/{2}........{3}'.format(
                    i['type_name'],
                    waiting_upload.index(a),
                    len(waiting_upload),
                    a,
                ))
                upload_end.append(a)

            else:
                parse_log.debug('Already exists')
                continue

    parse_log.info('Fetched {0} articles'.format(len(article)))
    return article
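Example #1 depends on several module-level names that are not part of the snippet: url_base, find_data, get_page, handle_article, and parse_log. The stubs below are only a sketch of the interfaces the example appears to call, inferred from usage; the return shapes and the placeholder URL are assumptions, not the project's actual code.

# Hypothetical stubs illustrating the interfaces Example #1 seems to expect.
import logging

url_base = 'http://example.com'            # assumed: the site's entry page (placeholder URL)
parse_log = logging.getLogger('parser')    # assumed: an ordinary logging.Logger

def find_data(base_url):
    # assumed: returns the navigation entries as dicts with a category name and URL
    return [{'type_name': 'finance', 'type_url': base_url + '/finance/'}]

def get_page(type_url):
    # assumed: returns (number of article links found, list of article URLs)
    return 1, [type_url + 'article-1.html']

def handle_article(article_url):
    # assumed: downloads one article and returns (title, body); either may be empty
    return 'title', '<p>body</p>'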
Example #2
def get_all_article():

    article = []
    nav_url = find_data(url_base)
    article_num = 0
    article_list = []
    # nav_url.append(url_base)
    parse_log.info('Start crawling all article links for today....')
    for i in nav_url:
        # iterate over the navigation-bar links
        page_num, art_list = get_page(i['type_url'], i['type_name'])
        if not page_num:
            # skip categories that have no articles
            continue
        article_list.extend(art_list)

    parse_log.info('Fetched {0} links'.format(len(article_list)))
    cache = Cache_file()
    url_list = cache.read()

    total = 0
    upload_end = []

    def get_wenzhang(i):
        # skip URLs that were already crawled in a previous run
        if i['wenzhang_url'] in url_list:
            return
        article_title, artibody = handle_article(i['wenzhang_url'])
        url_list.append(i['wenzhang_url'])
        if not artibody or not article_title:
            return
        return {
            'title': article_title,
            'body': artibody,
            'type': i['type_name'],
            'url': i['wenzhang_url'],
            'source_url': url_base
        }

    # fetch article bodies concurrently with 4 worker threads
    pool = ThreadPool(4)
    results = pool.map(get_wenzhang, article_list)
    pool.close()
    pool.join()
    # drop entries for articles that were skipped or failed to parse
    results2 = [i for i in results if i is not None]

    cache.save(url_list)
    #parse_log.info('Fetched {0} articles'.format(len(results2)))
    return results2
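In Example #2, get_all_article also needs a thread pool and a Cache_file helper that persists the list of already-crawled URLs between runs; neither appears in the snippet. The sketch below is one way they could look: the ThreadPool import path is standard library, and read()/save() match how the example calls them, but the JSON-file implementation and its default path are assumptions.

# Hypothetical sketch of the pieces Example #2 assumes but does not show.
from multiprocessing.pool import ThreadPool  # thread-backed Pool, as used above

import json
import os

class Cache_file(object):
    """Assumed helper: keeps the crawled-URL list on disk between runs."""

    def __init__(self, path='url_cache.json'):
        self.path = path

    def read(self):
        # return the previously saved URL list, or an empty list on the first run
        if os.path.exists(self.path):
            with open(self.path) as f:
                return json.load(f)
        return []

    def save(self, url_list):
        # overwrite the cache with the updated URL list
        with open(self.path, 'w') as f:
            json.dump(url_list, f)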
def main():
    upload_goto = []
    # upload target info

    article_list = []
    try:
        xinlang = parse_xinlang.get_all_article()
        parse_log.info('Fetched {0} Sina news articles'.format(len(xinlang)))
        article_list.extend(xinlang)
    except Exception:
        # a failure in one source should not stop the others
        pass

    try:
        huijinwang = parse_huijinwang.get_all_article()
        parse_log.info('Fetched {0} Huijinwang articles'.format(len(huijinwang)))
        article_list.extend(huijinwang)
    except Exception:
        pass

    try:
        wangyicaijin = parse_wangyicaijin.get_all_article()
        parse_log.info('Fetched {0} NetEase Finance articles'.format(len(wangyicaijin)))
        article_list.extend(wangyicaijin)
    except Exception:
        pass

    try:
        rong360 = spider_rong360.get_all_article()
        parse_log.info('Fetched {0} rong360 articles'.format(len(rong360)))
        article_list.extend(rong360)
    except Exception:
        pass

    try:
        south_money = spider_south.get_all_article()
        parse_log.info('Fetched {0} Southmoney articles'.format(len(south_money)))
        article_list.extend(south_money)
    except Exception:
        pass

    try:
        jingjiwang = spider_jingjiwang.get_all_article()
        parse_log.info('Fetched {0} China Economic Net articles'.format(len(jingjiwang)))
        article_list.extend(jingjiwang)
    except Exception:
        pass

    parse_log.info('Fetched {0} articles in total, saving to database'.format(len(article_list)))
    for i in article_list:
        time_now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        d = {
            '_id': i['title'],
            'title': i['title'],
            'body': str(i['body']),
            'type': i['type'],
            'url': i['url'],
            'source_url': i['source_url'],
            'date': today_date,
            'create_time': time_now
        }
        insert_data(d, 'news_data')

    today_article = find_data({'date': today_date}, 'news_data')
    # materialize the cursor so it can be shuffled
    today_article = [i for i in today_article]
    random.shuffle(today_article)
    wait_upload = list(today_article)
    parse_log.info('{0} articles pending upload today'.format(len(wait_upload)))

    for i in upload_goto:
        # cap the batch at this target's upload_max before pushing it
        wait_upload = wait_upload[:i['upload_max']]
        total = len(wait_upload)
        for a in wait_upload:
            upload(a, i)
        parse_log.info('Finished uploading {0} articles'.format(total))
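main() in turn relies on the per-site parser modules (parse_xinlang, parse_huijinwang, and so on), on today_date, and on three storage/upload helpers: insert_data, find_data, and upload. Using the article title as _id suggests a MongoDB-style store in which re-inserting the same title raises a duplicate-key error, which would act as de-duplication across runs; note that this find_data (query plus collection) is different from the navigation-scraping find_data used inside the parser modules. The sketch below is written under those assumptions; the pymongo backend, database and collection names, and the shape of the upload target dict are guesses, not the original implementation.

# Hypothetical storage/upload helpers for main(); backend and names are assumptions.
import datetime

from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError

client = MongoClient('localhost', 27017)
db = client['news']
today_date = datetime.date.today().strftime('%Y-%m-%d')

def insert_data(doc, collection):
    # with the title as _id, a second insert of the same title is rejected,
    # which gives a simple de-duplication step
    try:
        db[collection].insert_one(doc)
    except DuplicateKeyError:
        pass

def find_data(query, collection):
    # return a cursor over the matching documents
    return db[collection].find(query)

def upload(article, target):
    # target is assumed to look like {'upload_url': ..., 'upload_max': 30};
    # the real implementation would POST the article to that endpoint
    pass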