Example #1
0
def send_mobi(path):
    """Hand ``path`` to ``SendEmail2Kindle.send_all_mobi``.

    Args:
        path: Directory passed to ``send_all_mobi``. Any falsy value
            (``None``, ``''``) is replaced by the current working directory.
    """
    import os

    target_dir = path if path else os.getcwd()

    # Imported lazily, inside the function, exactly as the original did.
    from web2kindle.libs.send_email import SendEmail2Kindle

    with SendEmail2Kindle() as mailer:
        mailer.send_all_mobi(target_dir)
def main(start, end, kw):
    """Queue the first list-page request, run the crawler, then build the book.

    Args:
        start: Formatted into ``API_URL`` for the first request and stored as
            the task's initial ``cursor`` and ``start``.
        end: Stored in the task's ``save`` dict for the parser to read.
        kw: Option mapping. ``window`` (default 50) is forwarded to
            ``make_metadata``; a truthy ``email`` triggers sending.

    Note:
        This function does not return: it ends with ``os._exit(0)``.
    """
    in_queue = PriorityQueue()
    out_queue = PriorityQueue()
    results = Queue()
    crawler = Crawler(in_queue, out_queue, results)

    # Work on a private copy so the shared default headers stay untouched.
    headers = deepcopy(DEFAULT_HEADERS)
    headers['Referer'] = 'http://www.guokr.com/scientific/'

    save_path = SCRIPT_CONFIG['SAVE_PATH']
    book_name = '果壳网'

    request_meta = {'headers': headers, 'verify': False}
    task_state = {
        'cursor': start,
        'start': start,
        'end': end,
        'kw': kw,
        'save_path': SCRIPT_CONFIG['SAVE_PATH'],
    }
    first_task = Task.make_task({
        'url': API_URL.format(start),
        'method': 'GET',
        'meta': request_meta,
        'parser': parser_list,
        'priority': 0,
        'save': task_state,
        'retry': 3,
    })
    in_queue.put(first_task)

    # Open and close the database once before crawling; nothing is read here.
    # Presumably this creates/initialises the store - confirm in ArticleDB.
    with ArticleDB(save_path, VERSION=0):
        pass

    crawler.start()

    with ArticleDB(save_path) as db:
        items = list(db.select_article())
        db.insert_meta_data(['BOOK_NAME', book_name])
        db.increase_version()

    kindlegen_path = MAIN_CONFIG.get('KINDLEGEN_PATH')
    with HTML2Kindle(items, save_path, book_name, kindlegen_path) as builder:
        builder.make_metadata(window=kw.get('window', 50))
        builder.make_book_multi(save_path)

    if kw.get('email'):
        with SendEmail2Kindle() as mailer:
            mailer.send_all_mobi(SCRIPT_CONFIG['SAVE_PATH'])

    # Immediate process exit without normal interpreter cleanup.
    # NOTE(review): presumably needed to stop lingering crawler workers.
    os._exit(0)
Example #3
0
def main(start, end, kw):
    """Queue the first list-page request, crawl, and build a book if needed.

    Args:
        start: Formatted into ``API_URL`` for the first request and stored as
            the task's initial ``cursor`` and ``start``.
        end: Stored in the task's ``save`` dict for the parser to read.
        kw: Option mapping. ``window`` (default 50) is forwarded to
            ``make_metadata``; a truthy ``email`` triggers sending, but only
            when the crawl produced at least one article.
    """
    in_queue = PriorityQueue()
    out_queue = PriorityQueue()
    results = Queue()
    crawler = Crawler(
        in_queue,
        out_queue,
        results,
        MAIN_CONFIG.get('PARSER_WORKER', 1),
        MAIN_CONFIG.get('DOWNLOADER_WORKER', 1),
        MAIN_CONFIG.get('RESULTER_WORKER', 1),
    )

    # Work on a private copy so the shared default headers stay untouched.
    headers = deepcopy(DEFAULT_HEADERS)
    headers['Referer'] = 'http://www.guokr.com/scientific/'

    save_path = SCRIPT_CONFIG['SAVE_PATH']
    book_name = '果壳网'

    request_meta = {'headers': headers, 'verify': False}
    task_state = {
        'cursor': start,
        'start': start,
        'end': end,
        'kw': kw,
        'save_path': SCRIPT_CONFIG['SAVE_PATH'],
    }
    first_task = Task.make_task({
        'url': API_URL.format(start),
        'method': 'GET',
        'meta': request_meta,
        'parser': parser_list,
        'priority': 0,
        'save': task_state,
        'retry': 10,
        'retry_delay': 10,
    })
    in_queue.put(first_task)

    # Load the ids already stored and register them in ARTICLE_ID_SET.
    # Presumably the parser consults this set to skip known articles.
    with ArticleDB(save_path, VERSION=0) as db:
        known_ids = db.select_all_article_id()
    for row in known_ids or ():
        ARTICLE_ID_SET.add(row[0])

    crawler.start()

    with ArticleDB(save_path) as db:
        items = list(db.select_article())
        db.insert_meta_data(['BOOK_NAME', book_name])
        db.increase_version()
        db.reset()

    # Nothing was selected: log it and skip both book building and mailing.
    if not items:
        LOG.log_it('无新项目', 'INFO')
        return

    kindlegen_path = MAIN_CONFIG.get('KINDLEGEN_PATH')
    with HTML2Kindle(items, save_path, book_name, kindlegen_path) as builder:
        builder.make_metadata(window=kw.get('window', 50))
        builder.make_book_multi(save_path)

    if kw.get('email'):
        with SendEmail2Kindle() as mailer:
            mailer.send_all_mobi(SCRIPT_CONFIG['SAVE_PATH'])
Example #4
0
def main(zhuanti_list, start, end, kw):
    """Crawl each topic ("zhuanti") and build one book per topic.

    ``start`` defaults to 1; ``end`` is the last page number, with 9 items
    per page (translated from the original docstring).

    Args:
        zhuanti_list: Topic identifiers; each is formatted into ``BASE_URL``
            and ``API_URL`` and used as the sub-directory name.
        start: First page; converted with ``int``.
        end: Last page; converted with ``int``.
        kw: Option mapping. ``order_by`` may be ``'comment'``, ``'add'`` or
            ``'top'`` (anything else behaves like ``'add'``); ``window``
            (default 50) is forwarded to ``make_metadata``; a truthy
            ``email`` mails each topic that produced a book.
    """
    in_queue = PriorityQueue()
    out_queue = PriorityQueue()
    results = Queue()
    crawler = Crawler(
        in_queue,
        out_queue,
        results,
        MAIN_CONFIG.get('PARSER_WORKER', 1),
        MAIN_CONFIG.get('DOWNLOADER_WORKER', 1),
        MAIN_CONFIG.get('RESULTER_WORKER', 1),
    )

    start = int(start)
    end = int(end)

    for zhuanti in zhuanti_list:
        headers = deepcopy(DEFAULT_HEADERS)
        headers['Referer'] = BASE_URL.format(zhuanti)

        # The topic identifier doubles as the sub-directory name.
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'], str(zhuanti))

        # 'add' and every unrecognised value both map to ORDER_ADD.
        requested_order = kw.get('order_by')
        if requested_order == 'comment':
            order_by = ORDER_COMMENT
        elif requested_order == 'top':
            order_by = ORDER_TOP
        else:
            order_by = ORDER_ADD

        task_state = {
            'cursor': start,
            'save_path': save_path,
            'start': start,
            'end': end,
            'kw': kw,
            'name': zhuanti,
            'order_by': order_by,
        }
        in_queue.put(Task.make_task({
            'url': API_URL.format(zhuanti, order_by, start),
            'method': 'GET',
            'meta': {'headers': headers, 'verify': False},
            'parser': parser_list,
            'priority': 0,
            'save': task_state,
            'retry': 10,
            'retry_delay': 10,
        }))

        # Read the ids already stored for this topic.
        with ArticleDB(save_path, VERSION=0) as db:
            known_ids = db.select_all_article_id()

        # Collect them in a set for de-duplication.
        for row in known_ids or ():
            ARTICLE_ID_SET.add(row[0])

    # Run the crawler.
    crawler.start()

    # Build the e-books, one per topic.
    kindlegen_path = MAIN_CONFIG.get('KINDLEGEN_PATH')
    for zhuanti in zhuanti_list:
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'], str(zhuanti))
        with ArticleDB(save_path, VERSION=0) as db:
            # Read all articles.
            items = list(db.select_article())
            # Fetch the topic's name from the database.
            book_name = db.select_meta('BOOK_NAME')
            # Bump the database version.
            db.increase_version()
            # Database wrap-up work.
            db.reset()

        if not items:
            LOG.log_it('无新项目', 'INFO')
            continue

        with HTML2Kindle(items, save_path, book_name, kindlegen_path) as builder:
            builder.make_metadata(window=kw.get('window', 50))
            builder.make_book_multi(save_path)

        if kw.get('email'):
            with SendEmail2Kindle() as mailer:
                mailer.send_all_mobi(save_path)
Example #5
0
def main(zhuanlan_name_list, start, end, kw):
    """Crawl each Zhihu column and build one book per column.

    Args:
        zhuanlan_name_list: Column names; each is formatted into the request
            URL and the Referer header, and used as the sub-directory name.
        start: Formatted into the URL as ``offset`` and stored as the task's
            initial ``cursor`` and ``start``.
        end: Stored in the task's ``save`` dict for the parser to read.
        kw: Option mapping. ``window`` (default 50) is forwarded to
            ``make_metadata``; a truthy ``email`` mails every column folder.

    Note:
        This function does not return: it ends with ``os._exit(0)``.
    """
    in_queue = PriorityQueue()
    out_queue = PriorityQueue()
    results = Queue()
    crawler = Crawler(in_queue, out_queue, results)

    for zhuanlan_name in zhuanlan_name_list:
        headers = deepcopy(DEFAULT_HEADERS)
        headers['Referer'] = 'https://zhuanlan.zhihu.com/{}'.format(
            zhuanlan_name)
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'],
                                 str(zhuanlan_name))

        first_url = (
            'https://zhuanlan.zhihu.com/api/columns/{}/posts?limit=20&offset={}'
            .format(zhuanlan_name, start))
        task_state = {
            'cursor': start,
            'save_path': save_path,
            'start': start,
            'end': end,
            'kw': kw,
            'name': zhuanlan_name,
        }
        in_queue.put(Task.make_task({
            'url': first_url,
            'method': 'GET',
            'meta': {'headers': headers, 'verify': False},
            'parser': parser_list,
            'priority': 0,
            'save': task_state,
            'retry': 3,
        }))

        # Open and close the database once before crawling; nothing is read.
        # Presumably this creates/initialises the store - confirm in ArticleDB.
        with ArticleDB(save_path, VERSION=0):
            pass

    crawler.start()

    kindlegen_path = MAIN_CONFIG.get('KINDLEGEN_PATH')
    for zhuanlan_name in zhuanlan_name_list:
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'],
                                 str(zhuanlan_name))
        with ArticleDB(save_path, VERSION=0) as db:
            db.insert_meta_data(['BOOK_NAME', zhuanlan_name])
            items = list(db.select_article())
            db.increase_version()

        with HTML2Kindle(items, save_path, zhuanlan_name,
                         kindlegen_path) as builder:
            builder.make_metadata(window=kw.get('window', 50))
            builder.make_book_multi(save_path)

    if kw.get('email'):
        for zhuanlan_name in zhuanlan_name_list:
            column_dir = os.path.join(SCRIPT_CONFIG['SAVE_PATH'],
                                      str(zhuanlan_name))
            # A fresh mailer context per column, as in the original flow.
            with SendEmail2Kindle() as mailer:
                mailer.send_all_mobi(column_dir)

    # Immediate process exit without normal interpreter cleanup.
    # NOTE(review): presumably needed to stop lingering crawler workers.
    os._exit(0)
Example #6
0
def main(start, end, kw):
    """Crawl one qdaily channel for a date range and build a book from it.

    Args:
        start: Start date as ``'YYYY-MM-DD'`` (split on ``'-'``),
            e.g. ``'2017-12-11'``.
        end: End date in the same format, e.g. ``'2017-12-12'``.
        kw: Option mapping, mutated in place: when ``type`` is missing it is
            set to ``'home'``. Recognised ``type`` values select the matching
            ``API_*`` URL; ``'home'`` and unrecognised values leave
            ``API_URL`` as it currently is. ``window`` (default 50) is
            forwarded to ``make_metadata``; a truthy ``email`` triggers
            sending.

    Note:
        * On a malformed date the function logs a warning, prints the
          traceback and returns ``None`` without crawling.
        * The module-level ``API_URL`` is rebound, so the selection persists
          for later calls in the same process.
        * On the normal path the function does not return: it ends with
          ``os._exit(0)``.
    """
    global API_URL

    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q)

    try:
        start_l = [int(part) for part in start.split('-')]
        end_l = [int(part) for part in end.split('-')]
        # Naive datetimes, so timestamps are taken in local time. The cursor
        # starts one day (86400 s) after midnight of the start date.
        start_t = int(
            datetime.datetime(start_l[0], start_l[1],
                              start_l[2]).timestamp()) + 60 * 60 * 24
        end_t = int(
            datetime.datetime(end_l[0], end_l[1], end_l[2]).timestamp())
    except Exception:
        # Was a bare ``except:``, which also swallowed KeyboardInterrupt and
        # SystemExit; ordinary parsing errors are still reported as before.
        LOG.log_it('日期格式错误', 'WARN')
        traceback.print_exc()
        return

    url_by_type = {
        'business': API_BUSINESS,
        'intelligent': API_INTELLIGENT,
        'design': API_DESIGN,
        'fashion': API_FASHION,
        'entertainment': API_ENTERTAINMENT,
        'city': API_CITY,
        'game': API_GAME,
        'long': API_LONG,
    }
    if 'type' in kw:
        # 'home' and unknown types fall through to the current API_URL.
        API_URL = url_by_type.get(kw['type'], API_URL)
    else:
        kw.update({'type': 'home'})

    new_header = deepcopy(SCRIPT_CONFIG.get('DEFAULT_HEADERS'))
    new_header.update({'Referer': 'https://www.qdaily.com/'})
    save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'],
                             'qdaily_{}'.format(kw['type']))
    book_name = '好奇心日报_{}_{}_{}'.format(kw['type'], start, end)
    task = Task.make_task({
        'url': API_URL.format(start_t),
        'method': 'GET',
        'meta': {
            'headers': new_header,
            'verify': False
        },
        'parser': parser_list,
        'priority': 0,
        'save': {
            'cursor': start_t,
            'save_path': save_path,
            'start': start_t,
            'end': end_t,
            'kw': kw,
            'page': 1,
            'name': book_name,
        },
        'retry': 3,
    })
    iq.put(task)

    # Open and close the database once before crawling; nothing is read here.
    # Presumably this creates/initialises the store - confirm in ArticleDB.
    with ArticleDB(save_path, VERSION=0):
        pass

    crawler.start()

    with ArticleDB(save_path) as db:
        items = list(db.select_article())
        db.insert_meta_data(['BOOK_NAME', book_name])
        db.increase_version()

    with HTML2Kindle(items, save_path, book_name,
                     MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
        html2kindle.make_metadata(window=kw.get('window', 50))
        html2kindle.make_book_multi(save_path)

    if kw.get('email'):
        with SendEmail2Kindle() as s:
            s.send_all_mobi(save_path)

    # Immediate process exit without normal interpreter cleanup.
    # NOTE(review): presumably needed to stop lingering crawler workers.
    os._exit(0)
Example #7
0
def main(zhuanlan_name_list, start, end, kw):
    """Crawl each Zhihu column, build a book per column, mail the new ones.

    Args:
        zhuanlan_name_list: Column names; each is formatted into the request
            URL and the Referer header, and used as the sub-directory name.
        start: Formatted into the URL as ``offset`` and stored as the task's
            initial ``cursor`` and ``start``.
        end: Stored in the task's ``save`` dict for the parser to read.
        kw: Option mapping. ``window`` (default 50) is forwarded to
            ``make_metadata``; a truthy ``email`` mails every column for
            which a book was built in this run.
    """
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q, MAIN_CONFIG.get('PARSER_WORKER', 1),
                      MAIN_CONFIG.get('DOWNLOADER_WORKER', 1),
                      MAIN_CONFIG.get('RESULTER_WORKER', 1))

    for zhuanlan_name in zhuanlan_name_list:
        new_header = deepcopy(DEFAULT_HEADERS)
        new_header.update(
            {'Referer': 'https://zhuanlan.zhihu.com/{}'.format(zhuanlan_name)})
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'],
                                 str(zhuanlan_name))

        task = Task.make_task({
            'url':
            'https://zhuanlan.zhihu.com/api/columns/{}/posts?limit=20&offset={}'
            .format(zhuanlan_name, start),
            'method': 'GET',
            'meta': {
                'headers': new_header,
                'verify': False
            },
            'parser': parser_list,
            'priority': 0,
            'save': {
                'cursor': start,
                'save_path': save_path,
                'start': start,
                'end': end,
                'kw': kw,
                'name': zhuanlan_name
            },
            'retry': 10,
            'retry_delay': 10
        })
        iq.put(task)

        # Load the ids already stored and register them in ARTICLE_ID_SET.
        # Presumably the parser consults this set to skip known articles.
        with ArticleDB(save_path, VERSION=0) as db:
            known_ids = db.select_all_article_id()
        for row in known_ids or ():
            ARTICLE_ID_SET.add(row[0])

    crawler.start()

    # Save paths of the columns that actually produced a book. The original
    # kept one boolean that every loop iteration overwrote, so only the LAST
    # column decided whether any email was sent at all.
    updated_paths = []
    for zhuanlan_name in zhuanlan_name_list:
        book_name = '知乎专栏_{}'.format(zhuanlan_name)
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'],
                                 str(zhuanlan_name))
        with ArticleDB(save_path, VERSION=0) as db:
            db.insert_meta_data(['BOOK_NAME', zhuanlan_name])
            items = list(db.select_article())
            db.increase_version()
            db.reset()

        if not items:
            LOG.log_it('无新项目', 'INFO')
            continue

        with HTML2Kindle(items, save_path, book_name,
                         MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
            html2kindle.make_metadata(window=kw.get('window', 50))
            html2kindle.make_book_multi(save_path)
        updated_paths.append(save_path)

    if kw.get('email'):
        for save_path in updated_paths:
            # A fresh mailer context per column, as in the original flow.
            with SendEmail2Kindle() as s:
                s.send_all_mobi(save_path)
Example #8
0
def main(start, end, kw):
    """Crawl Zhihu Daily for today or a date range and build a book.

    Args:
        start: ``None`` selects ``TODAY_URL``; otherwise the value is
            formatted into ``YESTERDAY_URL`` and into the folder/book names.
        end: Used only when ``start`` is given; ``None`` is replaced by the
            current local date formatted as ``%Y%m%d``.
        kw: Option mapping. ``window`` (default 50) is forwarded to
            ``make_metadata``; a truthy ``email`` triggers sending, but only
            when the crawl produced at least one article.

    Note:
        The module-level ``IS_TODAY_URL`` flag is rebound on every call.
    """
    global IS_TODAY_URL

    in_queue = PriorityQueue()
    out_queue = PriorityQueue()
    results = Queue()
    crawler = Crawler(
        in_queue,
        out_queue,
        results,
        MAIN_CONFIG.get('PARSER_WORKER', 1),
        MAIN_CONFIG.get('DOWNLOADER_WORKER', 1),
        MAIN_CONFIG.get('RESULTER_WORKER', 1),
    )

    headers = deepcopy(DEFAULT_HEADERS)

    if start is None:
        IS_TODAY_URL = True
        folder = 'zhihu_daily_' + get_datetime_string('%Y%m%d')
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'], folder)
        book_name = '知乎日报_' + get_datetime_string('%Y%m%d')
        url = TODAY_URL
    else:
        IS_TODAY_URL = False
        if end is None:
            right_now = datetime.datetime.fromtimestamp(time.time())
            end = right_now.strftime('%Y%m%d')
        folder = 'zhihu_daily_{}_{}'.format(start, end)
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'], folder)
        book_name = '知乎日报_{}_{}'.format(start, end)
        url = YESTERDAY_URL.format(start)

    task_state = {
        'cursor': start,
        'save_path': save_path,
        'start': start,
        'end': end,
        'kw': kw,
    }
    in_queue.put(Task.make_task({
        'url': url,
        'method': 'GET',
        'meta': {'headers': headers, 'verify': False},
        'parser': parser_list,
        'priority': 0,
        'save': task_state,
        'retry': 99,
        'retry_delay': 10,
    }))

    # Load the ids already stored and register them in ARTICLE_ID_SET.
    # Presumably the parser consults this set to skip known articles.
    with ArticleDB(save_path, VERSION=0) as db:
        known_ids = db.select_all_article_id()
    for row in known_ids or ():
        ARTICLE_ID_SET.add(row[0])

    crawler.start()

    with ArticleDB(save_path, VERSION=0) as db:
        db.insert_meta_data(['BOOK_NAME', book_name])
        items = list(db.select_article())
        db.increase_version()
        db.reset()

    # Nothing was selected: log it and skip both book building and mailing.
    if not items:
        LOG.log_it('无新项目', 'INFO')
        return

    kindlegen_path = MAIN_CONFIG.get('KINDLEGEN_PATH')
    with HTML2Kindle(items, save_path, book_name, kindlegen_path) as builder:
        builder.make_metadata(window=kw.get('window', 50))
        builder.make_book_multi(save_path)

    if kw.get('email'):
        with SendEmail2Kindle() as mailer:
            mailer.send_all_mobi(save_path)
Example #9
0
def main(collection_num_list, start, end, kw):
    """Crawl each Zhihu collection, build a book per collection, mail new ones.

    Args:
        collection_num_list: Collection identifiers; each is formatted into
            the request URL and used as the sub-directory name.
        start: Formatted into the URL as ``page`` and stored in the task's
            ``save`` dict.
        end: Stored in the task's ``save`` dict for the parser to read.
        kw: Option mapping. ``window`` (default 50) is forwarded to
            ``make_metadata``; a truthy ``email`` mails every collection for
            which a book was built in this run.
    """
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q, MAIN_CONFIG.get('PARSER_WORKER', 1),
                      MAIN_CONFIG.get('DOWNLOADER_WORKER', 1),
                      MAIN_CONFIG.get('RESULTER_WORKER', 1))

    for collection_num in collection_num_list:
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'],
                                 str(collection_num))

        task = Task.make_task({
            'url':
            'https://www.zhihu.com/collection/{}?page={}'.format(
                collection_num, start),
            'method': 'GET',
            'meta': {
                'headers': DEFAULT_HEADERS,
                'verify': False
            },
            'parser': parser_collection,
            'resulter': resulter_collection,
            'priority': 0,
            'retry': 10,
            'save': {
                'start': start,
                'end': end,
                'kw': kw,
                'save_path': save_path,
                'name': collection_num,
            },
            'retry_delay': 10
        })
        iq.put(task)

        # Load the ids already stored and register them in ARTICLE_ID_SET.
        # Presumably the parser consults this set to skip known articles.
        with ArticleDB(save_path, VERSION=0) as db:
            known_ids = db.select_all_article_id()
        for row in known_ids or ():
            ARTICLE_ID_SET.add(row[0])

    crawler.start()

    # Save paths of the collections that actually produced a book. The
    # original kept one boolean that every loop iteration overwrote, so only
    # the LAST collection decided whether any email was sent at all.
    updated_paths = []
    for collection_num in collection_num_list:
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'],
                                 str(collection_num))
        with ArticleDB(save_path) as db:
            items = list(db.select_article())
            book_name = db.select_meta('BOOK_NAME')
            db.increase_version()
            db.reset()

        if not items:
            LOG.log_it('无新项目', 'INFO')
            continue

        with HTML2Kindle(items, save_path, book_name,
                         MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
            html2kindle.make_metadata(window=kw.get('window', 50))
            html2kindle.make_book_multi(save_path)
        updated_paths.append(save_path)

    if kw.get('email'):
        for save_path in updated_paths:
            # A fresh mailer context per collection, as in the original flow.
            with SendEmail2Kindle() as s:
                s.send_all_mobi(save_path)