def get_all_news_about_specific_stock(self, database_name, collection_name):
    # Check the keys of one document in collection_name to see whether it already
    # contains "RelatedStockCodes". If not, the stock codes mentioned in each news
    # item have not yet been extracted into a separate column, so run the tokenizer first.
    _keys_list = list(
        next(
            self.database.get_collection(database_name,
                                         collection_name).find()).keys())
    if "RelatedStockCodes" not in _keys_list:
        tokenization = Tokenization(import_module="jieba",
                                    user_dict="./Leorio/financedict.txt")
        tokenization.update_news_database_rows(database_name, collection_name)
    # Create one collection per stock symbol, named after the symbol.
    stock_symbol_list = self.database.get_data(
        config.STOCK_DATABASE_NAME,
        config.COLLECTION_NAME_STOCK_BASIC_INFO,
        keys=["symbol"])["symbol"].to_list()
    col_names = self.database.connect_database(
        config.ALL_NEWS_OF_SPECIFIC_STOCK_DATABASE).list_collection_names(
            session=None)
    for symbol in stock_symbol_list:
        if symbol not in col_names:
            _collection = self.database.get_collection(
                config.ALL_NEWS_OF_SPECIFIC_STOCK_DATABASE, symbol)
            _tmp_num_stat = 0
            for row in self.database.get_collection(
                    database_name, collection_name).find():  # cursor over all news rows
                # symbol carries a two-character exchange prefix (e.g. "sh600000"),
                # while RelatedStockCodes stores bare codes, so strip it before matching.
                if symbol[2:] in row["RelatedStockCodes"].split(" "):
                    # Compute the label for each horizon of n days after publication.
                    _tmp_dict = {}
                    for label_days, key_name in self.label_range.items():
                        _tmp_res = self._label_news(
                            datetime.datetime.strptime(
                                row["Date"].split(" ")[0], "%Y-%m-%d"),
                            symbol, label_days)
                        _tmp_dict.update({key_name: _tmp_res})
                    _data = {
                        "Date": row["Date"],
                        "Url": row["Url"],
                        "Title": row["Title"],
                        "Article": row["Article"],
                        "OriDB": database_name,
                        "OriCOL": collection_name
                    }
                    _data.update(_tmp_dict)
                    _collection.insert_one(_data)
                    _tmp_num_stat += 1
            logging.info(
                "there are {} news mentioned {} in {} collection need to be fetched ... "
                .format(_tmp_num_stat, symbol, collection_name))
        else:
            logging.info(
                "{} has fetched all related news from {}...".format(
                    symbol, collection_name))
            break  # stop as soon as an already-created per-symbol collection is found
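# A minimal query sketch (not part of the project) for the per-symbol collections that
# get_all_news_about_specific_stock fills. It talks to MongoDB directly with pymongo and
# assumes a local instance; it also assumes that config.ALL_NEWS_OF_SPECIFIC_STOCK_DATABASE
# names the MongoDB database, and "sh600000" is a hypothetical per-symbol collection.
import pymongo
from Kite import config

client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client[config.ALL_NEWS_OF_SPECIFIC_STOCK_DATABASE]
collection = db["sh600000"]  # one collection per stock symbol

# Print the five most recent labelled news documents for this symbol; "Date" is a
# "YYYY-MM-DD ..." string, so lexicographic sorting matches chronological order.
for doc in collection.find().sort("Date", pymongo.DESCENDING).limit(5):
    print(doc["Date"], doc["Title"])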
def get_all_news_about_specific_stock(self, database_name, collection_name):
    # Check the keys of one document in collection_name to see whether it already
    # contains "RelatedStockCodes". If not, the stock codes mentioned in each news
    # item have not yet been extracted into a separate column, so run the tokenizer first.
    _keys_list = list(
        next(
            self.database.get_collection(database_name,
                                         collection_name).find()).keys())
    if "RelatedStockCodes" not in _keys_list:
        tokenization = Tokenization(import_module="jieba",
                                    user_dict="./Leorio/financedict.txt")
        tokenization.update_news_database_rows(database_name, collection_name)
    # Create one collection per stock code, named after the code.
    stock_code_list = self.database.get_data("stock", "basic_info",
                                             keys=["code"])["code"].to_list()
    for code in stock_code_list:
        _collection = self.database.get_collection(
            config.ALL_NEWS_OF_SPECIFIC_STOCK_DATABASE, code)
        _tmp_num_stat = 0
        for row in self.database.get_collection(
                database_name, collection_name).find():  # cursor over all news rows
            if code in row["RelatedStockCodes"].split(" "):
                _collection.insert_one({
                    "Date": row["Date"],
                    "Url": row["Url"],
                    "Title": row["Title"],
                    "Article": row["Article"],
                    "OriDB": database_name,
                    "OriCOL": collection_name
                })
                _tmp_num_stat += 1
        logging.info(
            "there are {} news mentioned {} in {} collection ... ".format(
                _tmp_num_stat, code, collection_name))
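# A tiny self-contained illustration (hypothetical values) of the matching step used
# above: RelatedStockCodes is a space-separated string of stock codes, so membership
# is tested on the split list rather than with a substring check, which avoids
# partial-code false positives.
related_stock_codes = "600000 000001 300750"       # hypothetical field value
print("000001" in related_stock_codes.split(" "))  # True  -> the news is kept
print("0001" in related_stock_codes.split(" "))    # False -> no partial matches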
# cnstock_spyder.get_historical_news(url_to_be_crawled, category_chn=type_chn)
# logging.info("finished ...")
# time.sleep(30)
#
# jrj_spyder = JrjSpyder(config.DATABASE_NAME, config.COLLECTION_NAME_JRJ)
# jrj_spyder.get_historical_news(config.WEBSITES_LIST_TO_BE_CRAWLED_JRJ, "2020-12-04", "2020-12-08")
#
# nbd_spyder = NbdSpyder(config.DATABASE_NAME, config.COLLECTION_NAME_NBD)
# nbd_spyder.get_historical_news(684)

# 2. Extract the stocks mentioned in each news item and store their codes in a new
#    column of the collection.
from Leorio.tokenization import Tokenization

tokenization = Tokenization(import_module="jieba",
                            user_dict="./Leorio/financedict.txt")
tokenization.update_news_database_rows(config.DATABASE_NAME, "cnstock")
# tokenization.update_news_database_rows(config.DATABASE_NAME, "nbd")
# tokenization.update_news_database_rows(config.DATABASE_NAME, "jrj")

# 3. Deduplicate the historical data.
from Killua.deduplication import Deduplication

Deduplication("finnewshunter", "cnstock").run()
# Deduplication("finnewshunter", "nbd").run()
# Deduplication("finnewshunter", "jrj").run()  # for now only jrj needs deduplication

# 4. Remove rows that contain null values from the historical data.
from Killua.denull import DeNull

# DeNull("finnewshunter", "cnstock").run()
# DeNull("finnewshunter", "nbd").run()
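# Illustrative only: one way the duplicate removal in step 3 could be done with raw
# pymongo. This is NOT the Killua.deduplication implementation; it merely sketches the
# idea on an assumed (Title, Date) key, using the "finnewshunter"/"cnstock" database and
# collection names from the calls above and a local MongoDB instance.
import pymongo

client = pymongo.MongoClient("mongodb://localhost:27017/")
collection = client["finnewshunter"]["cnstock"]

seen = set()
for doc in collection.find({}, {"Title": 1, "Date": 1}):
    key = (doc.get("Title"), doc.get("Date"))
    if key in seen:
        collection.delete_one({"_id": doc["_id"]})  # drop the later duplicate
    else:
        seen.add(key)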
from Kite.database import Database
from Kite import config
from Leorio.tokenization import Tokenization
from Killua.deduplication import Deduplication
from Killua.denull import DeNull
# Assumed import path for the CnStockSpyder class, based on the project layout.
from Gon.cnstockspyder import CnStockSpyder

from concurrent import futures
import threading

obj = Database()
df = obj.get_data(config.DATABASE_NAME,
                  config.COLLECTION_NAME_CNSTOCK,
                  keys=["Date", "Category"])

cnstock_spyder = CnStockSpyder(config.DATABASE_NAME,
                               config.COLLECTION_NAME_CNSTOCK)

# Back-fill the historical data first. For example, if data has been crawled up to
# 2020-12-01 but the realtime crawler is started on 2020-12-23, automatically crawl
# the news from 2020-12-02 to 2020-12-23 before switching to realtime mode.
for url_to_be_crawled, type_chn in config.WEBSITES_LIST_TO_BE_CRAWLED_CNSTOCK.items():
    # Look up the date of the most recent record for this category (type_chn).
    latest_date_in_db = max(df[df.Category == type_chn]["Date"].to_list())
    cnstock_spyder.get_historical_news(url_to_be_crawled,
                                       category_chn=type_chn,
                                       start_date=latest_date_in_db)

tokenization = Tokenization(import_module="jieba",
                            user_dict=config.USER_DEFINED_DICT_PATH)
tokenization.update_news_database_rows(config.DATABASE_NAME,
                                       config.COLLECTION_NAME_CNSTOCK)
Deduplication(config.DATABASE_NAME, config.COLLECTION_NAME_CNSTOCK).run()
DeNull(config.DATABASE_NAME, config.COLLECTION_NAME_CNSTOCK).run()

# Start realtime crawling in parallel, one thread per website/category.
thread_list = []
for url, type_chn in config.WEBSITES_LIST_TO_BE_CRAWLED_CNSTOCK.items():
    thread = threading.Thread(target=cnstock_spyder.get_realtime_news,
                              args=(url, type_chn, 60))
    thread_list.append(thread)
for thread in thread_list:
    thread.start()
for thread in thread_list:
    thread.join()
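# An alternative sketch of the realtime stage using concurrent.futures (already imported
# above) instead of hand-managed threading.Thread objects. It assumes the same
# cnstock_spyder.get_realtime_news(url, type_chn, 60) call as above and behaves the same
# way: the script blocks until the long-running crawlers return or raise.
with futures.ThreadPoolExecutor(
        max_workers=len(config.WEBSITES_LIST_TO_BE_CRAWLED_CNSTOCK)) as executor:
    tasks = [
        executor.submit(cnstock_spyder.get_realtime_news, url, type_chn, 60)
        for url, type_chn in config.WEBSITES_LIST_TO_BE_CRAWLED_CNSTOCK.items()
    ]
    futures.wait(tasks)  # wait for all crawler tasks (normally they run indefinitely)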