Code Example #1
def main(start, end, kw):
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q)
    default_headers = deepcopy(DEFAULT_HEADERS)
    default_headers.update({'Referer': 'http://www.guokr.com/scientific/'})
    save_path = SCRIPT_CONFIG['SAVE_PATH']
    book_name = '果壳网'
    task = Task.make_task({
        'url': API_URL.format(start),
        'method': 'GET',
        'meta': {
            'headers': default_headers,
            'verify': False
        },
        'parser': parser_list,
        'priority': 0,
        'save': {
            'cursor': start,
            'start': start,
            'end': end,
            'kw': kw,
            'save_path': SCRIPT_CONFIG['SAVE_PATH'],
        },
        'retry': 3,
    })
    iq.put(task)
    # Init DB
    with ArticleDB(save_path, VERSION=0) as db:
        pass  # ensure the database file exists before the crawl starts

    crawler.start()

    items = []

    with ArticleDB(save_path) as db:
        items.extend(db.select_article())
        db.insert_meta_data(['BOOK_NAME', book_name])
        db.increase_version()

    with HTML2Kindle(items, save_path, book_name,
                     MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
        html2kindle.make_metadata(window=kw.get('window', 50))
        html2kindle.make_book_multi(save_path)

    if kw.get('email'):
        with SendEmail2Kindle() as s:
            s.send_all_mobi(SCRIPT_CONFIG['SAVE_PATH'])
    os._exit(0)
Code Example #2
def resulter_collection(task):
    with ArticleDB(task['save']['save_path']) as article_db:

        # Get the collection name and insert it into the meta table
        global GET_BOOK_NAME_FLAG
        if GET_BOOK_NAME_FLAG is False:
            try:
                article_db.insert_meta_data(
                    ['BOOK_NAME', '知乎收藏夹_' + task['save']['book_name']],
                    update=False)
                GET_BOOK_NAME_FLAG = True
            except Exception:
                # The meta row may already exist; ignore and move on.
                pass
        article_db.insert_article(task['parsed_data'])
Code Example #3
File: __init__.py  Project: sml920505/web2kindle
def make_mobi(path, window=50, multi=True):
    from web2kindle.libs.db import ArticleDB
    from web2kindle import MAIN_CONFIG
    from web2kindle.libs.html2kindle import HTML2Kindle

    if not path:
        import os
        path = os.getcwd()

    items = []
    with ArticleDB(path) as db:
        db.reset_version()
        items.extend(db.select_article())
        book_name = db.select_meta('BOOK_NAME')

    if items:
        with HTML2Kindle(items, path, book_name, MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
            html2kindle.make_metadata(window)
            if multi:
                html2kindle.make_book_multi(path)
            else:
                html2kindle.make_book(path)
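
A minimal usage sketch for make_mobi above. The directory path here is hypothetical; the function expects an ArticleDB database under that directory, as produced by the crawl scripts in the other examples:

from web2kindle import make_mobi

# Hypothetical save directory left behind by an earlier crawl run.
# multi=False builds a single book instead of splitting by window.
make_mobi('/data/web2kindle/zhihu_daily_20171211', window=50, multi=False)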
Code Example #4
def main(start, end, kw):
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q, MAIN_CONFIG.get('PARSER_WORKER', 1),
                      MAIN_CONFIG.get('DOWNLOADER_WORKER', 1),
                      MAIN_CONFIG.get('RESULTER_WORKER', 1))

    default_headers = deepcopy(DEFAULT_HEADERS)
    default_headers.update({'Referer': 'http://www.guokr.com/scientific/'})
    save_path = SCRIPT_CONFIG['SAVE_PATH']
    book_name = '果壳网'
    task = Task.make_task({
        'url': API_URL.format(start),
        'method': 'GET',
        'meta': {
            'headers': default_headers,
            'verify': False
        },
        'parser': parser_list,
        'priority': 0,
        'save': {
            'cursor': start,
            'start': start,
            'end': end,
            'kw': kw,
            'save_path': SCRIPT_CONFIG['SAVE_PATH'],
        },
        'retry': 10,
        'retry_delay': 10
    })
    iq.put(task)
    # Init DB and seed the dedup set with article ids from previous runs
    with ArticleDB(save_path, VERSION=0) as db:
        article_ids = db.select_all_article_id()
    if article_ids:
        for each in article_ids:
            ARTICLE_ID_SET.add(each[0])

    crawler.start()

    items = []
    with ArticleDB(save_path) as db:
        items.extend(db.select_article())
        db.insert_meta_data(['BOOK_NAME', book_name])
        db.increase_version()
        db.reset()

    if items:
        new = True
        with HTML2Kindle(items, save_path, book_name,
                         MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
            html2kindle.make_metadata(window=kw.get('window', 50))
            html2kindle.make_book_multi(save_path)
    else:
        LOG.log_it('No new items', 'INFO')
        new = False

    if new and kw.get('email'):
        with SendEmail2Kindle() as s:
            s.send_all_mobi(SCRIPT_CONFIG['SAVE_PATH'])
Code Example #5
def resulter_content(task):
    LOG.log_it("正在将任务 {} 插入数据库".format(task['tid']), 'INFO')
    with ArticleDB(task['save']['save_path']) as article_db:
        article_db.insert_article(task['parsed_data'])
Code Example #6
def main(zhuanti_list, start, end, kw):
    """start默认1;end为结束页数,每页9个"""
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q,
                      MAIN_CONFIG.get('PARSER_WORKER', 1),
                      MAIN_CONFIG.get('DOWNLOADER_WORKER', 1),
                      MAIN_CONFIG.get('RESULTER_WORKER', 1))

    start = int(start)
    end = int(end)

    for zhuanti in zhuanti_list:
        new_header = deepcopy(DEFAULT_HEADERS)
        new_header.update({'Referer': BASE_URL.format(zhuanti)})

        # Use the topic number as the subdirectory name
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'], str(zhuanti))

        if kw.get('order_by') == 'comment':
            order_by = ORDER_COMMENT
        elif kw.get('order_by') == 'add':
            order_by = ORDER_ADD
        elif kw.get('order_by') == 'top':
            order_by = ORDER_TOP
        else:
            # Default: add
            order_by = ORDER_ADD

        task = Task.make_task({
            'url': API_URL.format(zhuanti, order_by, start),
            'method': 'GET',
            'meta': {'headers': new_header, 'verify': False},
            'parser': parser_list,
            'priority': 0,
            'save': {'cursor': start,
                     'save_path': save_path,
                     'start': start,
                     'end': end,
                     'kw': kw,
                     'name': zhuanti,
                     'order_by': order_by},
            'retry': 10,
            'retry_delay': 10
        })

        iq.put(task)

        # Init DB and load article ids saved in previous runs
        with ArticleDB(save_path, VERSION=0) as db:
            article_ids = db.select_all_article_id()

        # Deduplicate using a set
        if article_ids:
            for each in article_ids:
                ARTICLE_ID_SET.add(each[0])

    # Start the crawler
    crawler.start()

    # Start building the ebooks
    for zhuanti in zhuanti_list:
        items = []
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'], str(zhuanti))
        with ArticleDB(save_path, VERSION=0) as db:
            # Read all articles
            items.extend(db.select_article())
            # Get the topic name from the database
            book_name = db.select_meta('BOOK_NAME')
            # Bump the database version
            db.increase_version()
            # Database cleanup
            db.reset()

        if items:
            with HTML2Kindle(items, save_path, book_name, MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
                html2kindle.make_metadata(window=kw.get('window', 50))
                html2kindle.make_book_multi(save_path)

            if kw.get('email'):
                with SendEmail2Kindle() as s:
                    s.send_all_mobi(save_path)
        else:
            LOG.log_it('No new items', 'INFO')
Code Example #7
def parser_list(task):
    response = task['response']
    new_tasks = []
    to_next = True

    if not response:
        raise RetryDownload

    try:
        text = response.text
        bs = BeautifulSoup(text, 'lxml')
    except Exception as e:
        LOG.log_it('Failed to parse the page (if this keeps happening while the '
                   'browser can open the site normally, the site code may have '
                   'changed; please notify the developer). ERRINFO:{}'
                   .format(str(e)), 'WARN')
        raise RetryDownload

    book_name = bs.title.string if bs.title else task['save']['name']

    # Insert the collection name into the meta table
    with ArticleDB(task['save']['save_path']) as article_db:
        article_db.insert_meta_data(['BOOK_NAME', format_file_name('简书专题_' + book_name)], update=False)

    # Reverse the order
    items = bs.select('a.title')
    items.reverse()

    for item in items:
        # Skip the download if the article is already in the database
        url = 'https://www.jianshu.com' + item.attrs['href']
        if md5string(url) in ARTICLE_ID_SET:
            to_next = False
            continue

        try:
            title = item.string
        except Exception:
            LOG.log_it('Failed to parse the title (if this keeps happening while '
                       'the browser can open the site normally, the site code may '
                       'have changed; please notify the developer).', 'WARN')
            raise RetryDownload

        new_task = Task.make_task({
            'url': url,
            'method': 'GET',
            'meta': task['meta'],
            'parser': parser_content,
            'resulter': resulter_content,
            'priority': 5,
            'save': task['save'],
            'title': title,
        })
        new_tasks.append(new_task)

    # Next page
    if to_next and len(items) != 0:
        if task['save']['cursor'] < task['save']['end']:
            next_page_task = deepcopy(task)
            next_page_task.update(
                {'url': API_URL.format(task['save']['name'], task['save']['order_by'], task['save']['cursor'] + 1)})
            next_page_task['save'].update({'cursor': next_page_task['save']['cursor'] + 1})
            new_tasks.append(next_page_task)

    return None, new_tasks
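
The skip-and-stop logic in parser_list relies on a module-level set that each main() seeds from the database before crawling. A condensed sketch of that shared pattern, assuming md5string (used above) returns a stable digest of the URL:

ARTICLE_ID_SET = set()

def seed_dedup_set(db):
    # select_all_article_id() returns rows whose first column is the stored
    # article id; loading them lets parser_list skip already-saved URLs.
    for row in db.select_all_article_id():
        ARTICLE_ID_SET.add(row[0])

def is_new(url):
    # parser_list also sets to_next = False on the first known URL, so
    # pagination stops once the crawl reaches previously saved items.
    return md5string(url) not in ARTICLE_ID_SET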
Code Example #8
def main(zhuanlan_name_list, start, end, kw):
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q)

    for zhuanlan_name in zhuanlan_name_list:
        new_header = deepcopy(DEFAULT_HEADERS)
        new_header.update(
            {'Referer': 'https://zhuanlan.zhihu.com/{}'.format(zhuanlan_name)})
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'],
                                 str(zhuanlan_name))

        task = Task.make_task({
            'url': 'https://zhuanlan.zhihu.com/api/columns/{}/posts'
                   '?limit=20&offset={}'.format(zhuanlan_name, start),
            'method': 'GET',
            'meta': {
                'headers': new_header,
                'verify': False
            },
            'parser': parser_list,
            'priority': 0,
            'save': {
                'cursor': start,
                'save_path': save_path,
                'start': start,
                'end': end,
                'kw': kw,
                'name': zhuanlan_name
            },
            'retry': 3,
        })

        iq.put(task)
        # Init DB
        with ArticleDB(save_path, VERSION=0) as db:
            pass  # ensure the database file exists before the crawl starts

    crawler.start()
    for zhuanlan_name in zhuanlan_name_list:
        items = []
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'],
                                 str(zhuanlan_name))
        with ArticleDB(save_path, VERSION=0) as db:
            db.insert_meta_data(['BOOK_NAME', zhuanlan_name])
            items.extend(db.select_article())
            db.increase_version()

        with HTML2Kindle(items, save_path, zhuanlan_name,
                         MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
            html2kindle.make_metadata(window=kw.get('window', 50))
            html2kindle.make_book_multi(save_path)

    if kw.get('email'):
        for zhuanlan_name in zhuanlan_name_list:
            with SendEmail2Kindle() as s:
                s.send_all_mobi(
                    os.path.join(SCRIPT_CONFIG['SAVE_PATH'],
                                 str(zhuanlan_name)))

    os._exit(0)
Code Example #9
def main(start, end, kw):
    # start and end are dates like '2017-12-11' and '2017-12-12'
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q)
    try:
        start_l = [int(p) for p in start.split('-')]
        end_l = [int(p) for p in end.split('-')]
        start_t = int(
            datetime.datetime(start_l[0], start_l[1],
                              start_l[2]).timestamp()) + 60 * 60 * 24
        end_t = int(
            datetime.datetime(end_l[0], end_l[1], end_l[2]).timestamp())
    except Exception:
        LOG.log_it('Invalid date format', 'WARN')
        traceback.print_exc()
        return

    global API_URL
    api_url_by_type = {
        'business': API_BUSINESS,
        'intelligent': API_INTELLIGENT,
        'design': API_DESIGN,
        'fashion': API_FASHION,
        'entertainment': API_ENTERTAINMENT,
        'city': API_CITY,
        'game': API_GAME,
        'long': API_LONG,
    }
    kw.setdefault('type', 'home')
    if kw['type'] in api_url_by_type:
        API_URL = api_url_by_type[kw['type']]

    new_header = deepcopy(SCRIPT_CONFIG.get('DEFAULT_HEADERS'))
    new_header.update({'Referer': 'https://www.qdaily.com/'})
    save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'],
                             'qdaily_{}'.format(kw['type']))
    book_name = '好奇心日报_{}_{}_{}'.format(kw['type'], start, end)
    task = Task.make_task({
        'url': API_URL.format(start_t),
        'method': 'GET',
        'meta': {
            'headers': new_header,
            'verify': False
        },
        'parser': parser_list,
        'priority': 0,
        'save': {
            'cursor': start_t,
            'save_path': save_path,
            'start': start_t,
            'end': end_t,
            'kw': kw,
            'page': 1,
            'name': book_name,
        },
        'retry': 3,
    })
    iq.put(task)
    # Init DB
    with ArticleDB(save_path, VERSION=0) as db:
        pass  # ensure the database file exists before the crawl starts

    crawler.start()

    items = []
    with ArticleDB(save_path) as db:
        items.extend(db.select_article())
        db.insert_meta_data(['BOOK_NAME', book_name])
        db.increase_version()

    with HTML2Kindle(items, save_path, book_name,
                     MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
        html2kindle.make_metadata(window=kw.get('window', 50))
        html2kindle.make_book_multi(save_path)

    if kw.get('email'):
        with SendEmail2Kindle() as s:
            s.send_all_mobi(save_path)
    os._exit(0)
Code Example #10
def main(zhuanlan_name_list, start, end, kw):
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q, MAIN_CONFIG.get('PARSER_WORKER', 1),
                      MAIN_CONFIG.get('DOWNLOADER_WORKER', 1),
                      MAIN_CONFIG.get('RESULTER_WORKER', 1))
    new = True

    for zhuanlan_name in zhuanlan_name_list:
        new_header = deepcopy(DEFAULT_HEADERS)
        new_header.update(
            {'Referer': 'https://zhuanlan.zhihu.com/{}'.format(zhuanlan_name)})
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'],
                                 str(zhuanlan_name))

        task = Task.make_task({
            'url': 'https://zhuanlan.zhihu.com/api/columns/{}/posts'
                   '?limit=20&offset={}'.format(zhuanlan_name, start),
            'method': 'GET',
            'meta': {
                'headers': new_header,
                'verify': False
            },
            'parser': parser_list,
            'priority': 0,
            'save': {
                'cursor': start,
                'save_path': save_path,
                'start': start,
                'end': end,
                'kw': kw,
                'name': zhuanlan_name
            },
            'retry': 10,
            'retry_delay': 10
        })

        iq.put(task)
        # Init DB and seed the dedup set with article ids from previous runs
        with ArticleDB(save_path, VERSION=0) as db:
            article_ids = db.select_all_article_id()
        if article_ids:
            for each in article_ids:
                ARTICLE_ID_SET.add(each[0])

    crawler.start()
    for zhuanlan_name in zhuanlan_name_list:
        items = []
        book_name = '知乎专栏_{}'.format(zhuanlan_name)
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'],
                                 str(zhuanlan_name))
        with ArticleDB(save_path, VERSION=0) as db:
            db.insert_meta_data(['BOOK_NAME', zhuanlan_name])
            items.extend(db.select_article())
            db.increase_version()
            db.reset()

        if items:
            new = True
            with HTML2Kindle(items, save_path, book_name,
                             MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
                html2kindle.make_metadata(window=kw.get('window', 50))
                html2kindle.make_book_multi(save_path)
        else:
            LOG.log_it('No new items', 'INFO')
            new = False

    if new and kw.get('email'):
        for zhuanlan_name in zhuanlan_name_list:
            with SendEmail2Kindle() as s:
                s.send_all_mobi(
                    os.path.join(SCRIPT_CONFIG['SAVE_PATH'],
                                 str(zhuanlan_name)))
Code Example #11
def main(start, end, kw):
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q, MAIN_CONFIG.get('PARSER_WORKER', 1),
                      MAIN_CONFIG.get('DOWNLOADER_WORKER', 1),
                      MAIN_CONFIG.get('RESULTER_WORKER', 1))

    new_header = deepcopy(DEFAULT_HEADERS)

    global IS_TODAY_URL
    if start is None:
        IS_TODAY_URL = True
        save_path = os.path.join(
            SCRIPT_CONFIG['SAVE_PATH'],
            'zhihu_daily_' + get_datetime_string('%Y%m%d'))
        book_name = '知乎日报_' + get_datetime_string('%Y%m%d')
    else:
        if end is None:
            end = datetime.datetime.fromtimestamp(
                time.time()).strftime('%Y%m%d')

        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'],
                                 'zhihu_daily_{}_{}'.format(start, end))
        book_name = '知乎日报_{}_{}'.format(start, end)
        IS_TODAY_URL = False

    url = TODAY_URL if IS_TODAY_URL else YESTERDAY_URL.format(start)

    task = Task.make_task({
        'url': url,
        'method': 'GET',
        'meta': {
            'headers': new_header,
            'verify': False
        },
        'parser': parser_list,
        'priority': 0,
        'save': {
            'cursor': start,
            'save_path': save_path,
            'start': start,
            'end': end,
            'kw': kw
        },
        'retry': 99,
        'retry_delay': 10
    })

    iq.put(task)

    # Init DB and seed the dedup set with article ids from previous runs
    with ArticleDB(save_path, VERSION=0) as db:
        article_ids = db.select_all_article_id()
    if article_ids:
        for each in article_ids:
            ARTICLE_ID_SET.add(each[0])

    crawler.start()

    items = []
    with ArticleDB(save_path, VERSION=0) as db:
        db.insert_meta_data(['BOOK_NAME', book_name])
        items.extend(db.select_article())
        db.increase_version()
        db.reset()

    if items:
        new = True
        with HTML2Kindle(items, save_path, book_name,
                         MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
            html2kindle.make_metadata(window=kw.get('window', 50))
            html2kindle.make_book_multi(save_path)
    else:
        LOG.log_it('No new items', 'INFO')
        new = False

    if new and kw.get('email'):
        with SendEmail2Kindle() as s:
            s.send_all_mobi(save_path)
Code Example #12
def main(collection_num_list, start, end, kw):
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q, MAIN_CONFIG.get('PARSER_WORKER', 1),
                      MAIN_CONFIG.get('DOWNLOADER_WORKER', 1),
                      MAIN_CONFIG.get('RESULTER_WORKER', 1))
    new = True

    for collection_num in collection_num_list:
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'],
                                 str(collection_num))

        task = Task.make_task({
            'url': 'https://www.zhihu.com/collection/{}?page={}'.format(
                collection_num, start),
            'method': 'GET',
            'meta': {
                'headers': DEFAULT_HEADERS,
                'verify': False
            },
            'parser': parser_collection,
            'resulter': resulter_collection,
            'priority': 0,
            'save': {
                'start': start,
                'end': end,
                'kw': kw,
                'save_path': save_path,
                'name': collection_num,
            },
            'retry': 10,
            'retry_delay': 10
        })
        iq.put(task)
        # Init DB and seed the dedup set with article ids from previous runs
        with ArticleDB(save_path, VERSION=0) as db:
            article_ids = db.select_all_article_id()
        if article_ids:
            for each in article_ids:
                ARTICLE_ID_SET.add(each[0])

    crawler.start()
    for collection_num in collection_num_list:
        items = []
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'],
                                 str(collection_num))
        with ArticleDB(save_path) as db:
            items.extend(db.select_article())
            book_name = db.select_meta('BOOK_NAME')
            db.increase_version()
            db.reset()

        if items:
            new = True
            with HTML2Kindle(items, save_path, book_name,
                             MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
                html2kindle.make_metadata(window=kw.get('window', 50))
                html2kindle.make_book_multi(save_path)
        else:
            LOG.log_it('No new items', 'INFO')
            new = False

    if new and kw.get('email'):
        for collection_num in collection_num_list:
            save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'],
                                     str(collection_num))
            with SendEmail2Kindle() as s:
                s.send_all_mobi(save_path)
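
Every main() above follows the same pipeline: seed a Task, run the crawler, read articles back from the database, build the book, and optionally email it. A stripped-down skeleton distilled from the examples; run_pipeline is a hypothetical helper, and the web2kindle APIs are assumed to behave exactly as shown above:

def run_pipeline(save_path, book_name, first_task, kw):
    iq, oq, result_q = PriorityQueue(), PriorityQueue(), Queue()
    crawler = Crawler(iq, oq, result_q)

    iq.put(first_task)          # seed the crawl with the first list page
    with ArticleDB(save_path, VERSION=0) as db:
        pass                    # ensure the database file exists
    crawler.start()             # runs until the task queues drain

    with ArticleDB(save_path) as db:
        items = db.select_article()
        db.insert_meta_data(['BOOK_NAME', book_name])
        db.increase_version()

    if items:
        with HTML2Kindle(items, save_path, book_name,
                         MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
            html2kindle.make_metadata(window=kw.get('window', 50))
            html2kindle.make_book_multi(save_path)
        if kw.get('email'):
            with SendEmail2Kindle() as s:
                s.send_all_mobi(save_path)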