Example #1
# """
# if __name__ == "__main__":
#     nbd_spyder = NbdSpyder(config.DATABASE_NAME, config.COLLECTION_NAME_NBD)
#     nbd_spyder.get_historical_news(start_page=684)
#
#     Deduplication(config.DATABASE_NAME, config.COLLECTION_NAME_NBD).run()
#     DeNull(config.DATABASE_NAME, config.COLLECTION_NAME_NBD).run()
"""
Example-2:
爬取实时新闻数据
"""
if __name__ == '__main__':
    from Kite.database import Database
    from Kite import config

    from Leorio.tokenization import Tokenization

    from Killua.denull import DeNull
    from Killua.deduplication import Deduplication

    # NbdSpyder is used below but was not imported in the original
    # snippet; this module path is assumed from the project layout.
    from Gon.nbdspyder import NbdSpyder

    import threading

    # If there is no historical data, crawl from scratch; if historical data
    # already exists, resume crawling from the latest timestamp in it.
    # E.g. if the most recent news in the database is "2020-12-09 20:37:10",
    # crawling resumes from that time.
    nbd_spyder = NbdSpyder(config.DATABASE_NAME, config.COLLECTION_NAME_NBD)
    nbd_spyder.get_historical_news()

    Deduplication(config.DATABASE_NAME, config.COLLECTION_NAME_NBD).run()
    DeNull(config.DATABASE_NAME, config.COLLECTION_NAME_NBD).run()

    nbd_spyder.get_realtime_news()
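
For reference, a minimal sketch of how the resume point could be inspected by hand, assuming the Database.get_data API shown in Example #2 also works for the NBD collection (the "Date" key and the DataFrame return type are assumptions mirrored from Example #2, not confirmed for this collection):

from Kite.database import Database
from Kite import config

# Query only the Date column of the NBD collection and print the most
# recent timestamp, i.e. where get_historical_news() would resume.
db = Database()
df = db.get_data(config.DATABASE_NAME, config.COLLECTION_NAME_NBD, keys=["Date"])
if df is not None and len(df) > 0:
    print("latest crawled news:", max(df["Date"].to_list()))
else:
    print("no historical data yet; crawling will start from scratch")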
Example #2
from Kite.database import Database
from Kite import config

from Killua.denull import DeNull
from Killua.deduplication import Deduplication

# CnStockSpyder's module path is assumed from the project layout.
from Gon.cnstockspyder import CnStockSpyder

import threading

obj = Database()
df = obj.get_data(config.DATABASE_NAME,
                  config.COLLECTION_NAME_CNSTOCK,
                  keys=["Date", "Category"])

cnstock_spyder = CnStockSpyder(config.DATABASE_NAME,
                               config.COLLECTION_NAME_CNSTOCK)
# Backfill the historical data first: e.g. if news has been crawled up to
# 2020-12-01 but the realtime crawler starts on 2020-12-23, the news from
# 2020-12-02 to 2020-12-23 is fetched automatically before going realtime.
for url_to_be_crawled, type_chn in config.WEBSITES_LIST_TO_BE_CRAWLED_CNSTOCK.items():
    # Look up the timestamp of the most recent record for this category.
    latest_date_in_db = max(df[df.Category == type_chn]["Date"].to_list())
    cnstock_spyder.get_historical_news(url_to_be_crawled,
                                       category_chn=type_chn,
                                       start_date=latest_date_in_db)

Deduplication(config.DATABASE_NAME, config.COLLECTION_NAME_CNSTOCK).run()
DeNull(config.DATABASE_NAME, config.COLLECTION_NAME_CNSTOCK).run()

# Start realtime crawling in parallel, one thread per site section.
thread_list = []
for url, type_chn in config.WEBSITES_LIST_TO_BE_CRAWLED_CNSTOCK.items():
    thread = threading.Thread(target=cnstock_spyder.get_realtime_news,
                              args=(url, type_chn, 60))
    thread_list.append(thread)
for thread in thread_list:
    thread.start()
for thread in thread_list:
    thread.join()
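
Because each get_realtime_news call presumably polls its page in an endless loop (the 60 above looks like a refresh interval in seconds), the join() calls block indefinitely. A hedged variant for interactive runs marks the workers as daemon threads so Ctrl-C stops the whole process cleanly; this is a generic threading pattern, not part of the project's API:

import threading
import time

workers = []
for url, type_chn in config.WEBSITES_LIST_TO_BE_CRAWLED_CNSTOCK.items():
    # daemon=True: the thread is killed when the main thread exits,
    # so a KeyboardInterrupt in the loop below ends the crawl.
    thread = threading.Thread(target=cnstock_spyder.get_realtime_news,
                              args=(url, type_chn, 60),
                              daemon=True)
    thread.start()
    workers.append(thread)

try:
    while any(t.is_alive() for t in workers):
        time.sleep(1)
except KeyboardInterrupt:
    print("stopping realtime crawl")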