def get_all_news_about_specific_stock(self, database_name,
                                       collection_name):
     # Check the keys of the first document in collection_name to see whether RelatedStockCodes exists;
     # if not, the stock codes mentioned in each news item have not yet been saved into a new column
     _keys_list = list(
         next(
             self.database.get_collection(database_name,
                                          collection_name).find()).keys())
     if "RelatedStockCodes" not in _keys_list:
         tokenization = Tokenization(import_module="jieba",
                                     user_dict="./Leorio/financedict.txt")
         tokenization.update_news_database_rows(database_name,
                                                collection_name)
     # Create a collection named after each stock code
     stock_symbol_list = self.database.get_data(
         config.STOCK_DATABASE_NAME,
         config.COLLECTION_NAME_STOCK_BASIC_INFO,
         keys=["symbol"])["symbol"].to_list()
     col_names = self.database.connect_database(
         config.ALL_NEWS_OF_SPECIFIC_STOCK_DATABASE).list_collection_names(
             session=None)
     for symbol in stock_symbol_list:
         if symbol not in col_names:
             _collection = self.database.get_collection(
                 config.ALL_NEWS_OF_SPECIFIC_STOCK_DATABASE, symbol)
             _tmp_num_stat = 0
             for row in self.database.get_collection(
                 database_name, collection_name).find():  # cursor over all news documents
                 # symbol presumably carries a 2-char exchange prefix (e.g. "sh600000"),
                 # while RelatedStockCodes stores bare codes, hence the [2:] slice
                 if symbol[2:] in row["RelatedStockCodes"].split(" "):
                     # get the label for each window of n days after the news was published
                     _tmp_dict = {}
                     for label_days, key_name in self.label_range.items():
                         _tmp_res = self._label_news(
                             datetime.datetime.strptime(
                                 row["Date"].split(" ")[0], "%Y-%m-%d"),
                             symbol, label_days)
                         _tmp_dict.update({key_name: _tmp_res})
                     _data = {
                         "Date": row["Date"],
                         "Url": row["Url"],
                         "Title": row["Title"],
                         "Article": row["Article"],
                         "OriDB": database_name,
                         "OriCOL": collection_name
                     }
                     _data.update(_tmp_dict)
                     _collection.insert_one(_data)
                     _tmp_num_stat += 1
             logging.info(
                 "there are {} news mentioned {} in {} collection need to be fetched ... "
                 .format(_tmp_num_stat, symbol, collection_name))
         else:
             logging.info(
                 "{} has fetched all related news from {}...".format(
                     symbol, collection_name))
         break  # NOTE: exits after handling the first symbol; remove to process every stock code
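# A hedged sketch of what self._label_news(date, symbol, n_days) is assumed to
# do (its implementation is not shown in this listing): compare the close price
# n_days after publication with the close on the publication date and return a
# trend label. The price collection layout and field names below are
# assumptions, not the project's actual schema.
import datetime


def _label_news_sketch(price_collection, news_date, symbol, n_days):
    """Return "up", "down" or None for the n_days horizon after news_date."""
    start = price_collection.find_one(
        {"symbol": symbol, "date": news_date.strftime("%Y-%m-%d")})
    end = price_collection.find_one(
        {"symbol": symbol,
         "date": (news_date + datetime.timedelta(days=n_days)).strftime("%Y-%m-%d")})
    if not start or not end:
        return None  # quote missing (weekend, holiday or suspension)
    return "up" if end["close"] >= start["close"] else "down"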
Example #2
def get_all_news_about_specific_stock(self, database_name,
                                       collection_name):
     # Check the keys of the first document in collection_name to see whether RelatedStockCodes exists;
     # if not, the stock codes mentioned in each news item have not yet been saved into a new column
     _keys_list = list(
         next(
             self.database.get_collection(database_name,
                                          collection_name).find()).keys())
     if "RelatedStockCodes" not in _keys_list:
         tokenization = Tokenization(import_module="jieba",
                                     user_dict="./Leorio/financedict.txt")
         tokenization.update_news_database_rows(database_name,
                                                collection_name)
     # Create a collection named after each stock code
     stock_code_list = self.database.get_data(
         "stock", "basic_info", keys=["code"])["code"].to_list()
     for code in stock_code_list:
         _collection = self.database.get_collection(
             config.ALL_NEWS_OF_SPECIFIC_STOCK_DATABASE, code)
         _tmp_num_stat = 0
         for row in self.database.get_collection(
                 database_name, collection_name).find():  # cursor over all news documents
             if code in row["RelatedStockCodes"].split(" "):
                 _collection.insert_one({
                     "Date": row["Date"],
                     "Url": row["Url"],
                     "Title": row["Title"],
                     "Article": row["Article"],
                     "OriDB": database_name,
                     "OriCOL": collection_name
                 })
                 _tmp_num_stat += 1
         logging.info(
             "there are {} news mentioned {} in {} collection ... ".format(
                 _tmp_num_stat, code, collection_name))
Example #3
#     cnstock_spyder.get_historical_news(url_to_be_crawled, category_chn=type_chn)
#     logging.info("finished ...")
#     time.sleep(30)
#
# jrj_spyder = JrjSpyder(config.DATABASE_NAME, config.COLLECTION_NAME_JRJ)
# jrj_spyder.get_historical_news(config.WEBSITES_LIST_TO_BE_CRAWLED_JRJ, "2020-12-04", "2020-12-08")
#
# nbd_spyder = NbdSpyder(config.DATABASE_NAME, config.COLLECTION_NAME_NBD)
# nbd_spyder.get_historical_news(684)

# 2. Extract the stocks mentioned in each news item and store their stock codes in a new column of the collection
from Leorio.tokenization import Tokenization

tokenization = Tokenization(import_module="jieba",
                            user_dict="./Leorio/financedict.txt")
tokenization.update_news_database_rows(config.DATABASE_NAME, "cnstock")
# tokenization.update_news_database_rows(config.DATABASE_NAME, "nbd")
# tokenization.update_news_database_rows(config.DATABASE_NAME, "jrj")

# 3. Deduplicate the historical data
from Killua.deduplication import Deduplication

Deduplication("finnewshunter", "cnstock").run()
# Deduplication("finnewshunter", "nbd").run()
# Deduplication("finnewshunter", "jrj").run()  # 暂时只有jrj需要去重

# 4. Remove rows containing null values from the historical data
from Killua.denull import DeNull

# DeNull("finnewshunter", "cnstock").run()
# DeNull("finnewshunter", "nbd").run()
from Kite.database import Database
from Kite import config
from concurrent import futures
import threading

obj = Database()
df = obj.get_data(config.DATABASE_NAME, config.COLLECTION_NAME_CNSTOCK, keys=["Date", "Category"])

cnstock_spyder = CnStockSpyder(config.DATABASE_NAME, config.COLLECTION_NAME_CNSTOCK)
# First backfill the history: e.g. if data has only been crawled up to 2020-12-01 but the realtime
# crawler starts on 2020-12-23, the news from 2020-12-02 to 2020-12-23 is fetched automatically first
for url_to_be_crawled, type_chn in config.WEBSITES_LIST_TO_BE_CRAWLED_CNSTOCK.items():
    # look up the date of the most recent record for this category (type_chn)
    latest_date_in_db = max(df[df.Category == type_chn]["Date"].to_list())
    cnstock_spyder.get_historical_news(url_to_be_crawled, category_chn=type_chn, start_date=latest_date_in_db)

tokenization = Tokenization(import_module="jieba", user_dict=config.USER_DEFINED_DICT_PATH)
tokenization.update_news_database_rows(config.DATABASE_NAME, config.COLLECTION_NAME_CNSTOCK)
Deduplication(config.DATABASE_NAME, config.COLLECTION_NAME_CNSTOCK).run()
DeNull(config.DATABASE_NAME, config.COLLECTION_NAME_CNSTOCK).run()

# start multi-threaded realtime crawling in parallel
thread_list = []
for url, type_chn in config.WEBSITES_LIST_TO_BE_CRAWLED_CNSTOCK.items():
    thread = threading.Thread(target=cnstock_spyder.get_realtime_news, args=(url, type_chn, 60))
    thread_list.append(thread)
for thread in thread_list:
    thread.start()
for thread in thread_list:
    thread.join()
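# The script above imports concurrent.futures but never uses it; the same
# fan-out can be written with a thread pool. This is a hedged alternative
# sketch, shown instead of the raw threading version rather than in addition
# to it; it assumes get_realtime_news(url, type_chn, interval) blocks and polls
# every `interval` seconds, as the threading variant implies.
with futures.ThreadPoolExecutor(
        max_workers=len(config.WEBSITES_LIST_TO_BE_CRAWLED_CNSTOCK)) as executor:
    tasks = [
        executor.submit(cnstock_spyder.get_realtime_news, url, type_chn, 60)
        for url, type_chn in config.WEBSITES_LIST_TO_BE_CRAWLED_CNSTOCK.items()
    ]
    futures.wait(tasks)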