import os
import time
import logging

from gensim import corpora, models
from gensim.matutils import corpus2dense
from sklearn import preprocessing

# Project-local modules; the exact import paths below are assumed from the repository
# layout (Kite.database and Hisoka.classifier are also imported in __main__ below).
from Kite import config
from Kite import utils
from Kite.database import Database
from Leorio.tokenization import Tokenization
from Hisoka.classifier import Classifier

class TopicModelling(object):

    def __init__(self):
        self.tokenization = Tokenization(
            import_module="jieba",
            user_dict=config.USER_DEFINED_DICT_PATH,
            chn_stop_words_dir=config.CHN_STOP_WORDS_PATH)
        self.database = Database()
        self.classifier = Classifier()

    def create_dictionary(self, raw_documents_list, save_path=None, is_saved=False):
        """
        Associate every token in the corpus with a unique id, i.e. build the vocabulary.
        :param raw_documents_list: list of raw documents, each element is one text,
                                   e.g. ["洗尽铅华...", "风雨赶路人...", ...]
        :param save_path: path used to serialize the corpora.Dictionary object
        :param is_saved: whether to serialize the dictionary to save_path
        """
        documents_token_list = []
        for doc in raw_documents_list:
            documents_token_list.append(self.tokenization.cut_words(doc))
        _dict = corpora.Dictionary(documents_token_list)
        # Find tokens that occur only once in the whole corpus
        once_items = [
            _dict[tokenid] for tokenid, docfreq in _dict.dfs.items()
            if docfreq == 1
        ]
        # Remove the once-occurring tokens from every document's token list
        for _id, token_list in enumerate(documents_token_list):
            documents_token_list[_id] = list(
                filter(lambda token: token not in once_items, token_list))
        # Edge case: if every token of a document occurs only once, its token list
        # becomes empty after filtering, so drop that document
        documents_token_list = [
            token_list for token_list in documents_token_list
            if (len(token_list) != 0)
        ]
        # Ids of the once-occurring tokens
        once_ids = [
            tokenid for tokenid, docfreq in _dict.dfs.items()
            if docfreq == 1
        ]
        # Remove the once-occurring tokens from the dictionary
        _dict.filter_tokens(once_ids)
        # Close the id gaps left by the removed tokens
        _dict.compactify()
        if is_saved and save_path:
            _dict.save(save_path)
            logging.info(
                "new generated dictionary saved in path -> {} ...".format(save_path))

        return _dict, documents_token_list

    def renew_dictionary(self, old_dict_path, new_raw_documents_list,
                         new_dict_path=None, is_saved=False):
        documents_token_list = []
        for doc in new_raw_documents_list:
            documents_token_list.append(self.tokenization.cut_words(doc))
        _dict = corpora.Dictionary.load(old_dict_path)
        _dict.add_documents(documents_token_list)
        if new_dict_path:
            old_dict_path = new_dict_path
        if is_saved:
            _dict.save(old_dict_path)
            logging.info(
                "updated dictionary by another raw documents serialized in {} ... ".format(
                    old_dict_path))

        return _dict, documents_token_list

    def create_bag_of_word_representation(self, raw_documents_list, old_dict_path=None,
                                          new_dict_path=None, bow_vector_save_path=None,
                                          is_saved_dict=False):
        if old_dict_path:
            # If an old corpus dictionary exists, update it with previously unseen tokens
            corpora_dictionary, documents_token_list = self.renew_dictionary(
                old_dict_path, raw_documents_list, new_dict_path=new_dict_path)
        else:
            # Otherwise build a new dictionary from scratch
            start_time = time.time()
            corpora_dictionary, documents_token_list = self.create_dictionary(
                raw_documents_list, save_path=new_dict_path, is_saved=is_saved_dict)
            end_time = time.time()
            logging.info(
                "there are {} mins spent to create a new dictionary ... ".format(
                    (end_time - start_time) / 60))
        # Build the bag-of-words vector of each document (the corpus) with the dictionary
        start_time = time.time()
        bow_vector = [
            corpora_dictionary.doc2bow(doc_token)
            for doc_token in documents_token_list
        ]
        end_time = time.time()
        logging.info(
            "there are {} mins spent to calculate bow-vector ... ".format(
                (end_time - start_time) / 60))
".format( (end_time - start_time) / 60)) if bow_vector_save_path: corpora.MmCorpus.serialize(bow_vector_save_path, bow_vector) return documents_token_list, corpora_dictionary, bow_vector @staticmethod def transform_vectorized_corpus(corpora_dictionary, bow_vector, model_type="lda", model_save_path=None): # 如何没有保存任何模型,重新训练的情况下,可以选择该函数 model_vector = None if model_type == "lsi": # LSI(Latent Semantic Indexing)模型,将文本从词袋向量或者词频向量(更好),转为一个低维度的latent空间 # 对于现实语料,目标维度在200-500被认为是"黄金标准" model_tfidf = models.TfidfModel(bow_vector) # model_tfidf.save("model_tfidf.tfidf") tfidf_vector = model_tfidf[bow_vector] model = models.LsiModel(tfidf_vector, id2word=corpora_dictionary, num_topics=config.TOPIC_NUMBER) # 初始化模型 model_vector = model[tfidf_vector] if model_save_path: model.save(model_save_path) elif model_type == "lda": model = models.LdaModel(bow_vector, id2word=corpora_dictionary, num_topics=config.TOPIC_NUMBER) # 初始化模型 model_vector = model[bow_vector] if model_save_path: model.save(model_save_path) elif model_type == "tfidf": model = models.TfidfModel(bow_vector) # 初始化 # model = models.TfidfModel.load("model_tfidf.tfidf") model_vector = model[bow_vector] # 将整个语料进行转换 if model_save_path: model.save(model_save_path) return model_vector def classify_stock_news(self, unseen_raw_document, database_name, collection_name, label_name="60DaysLabel", topic_model_type="lda", classifier_model="svm", ori_dict_path=None, bowvec_save_path=None, is_saved_bow_vector=False): historical_raw_documents_list = [] Y = [] for row in self.database.get_collection(database_name, collection_name).find(): if label_name in row.keys(): if row[label_name] != "": historical_raw_documents_list.append(row["Article"]) Y.append(row[label_name]) logging.info( "fetch symbol '{}' historical news with label '{}' from [DB:'{}' - COL:'{}'] ... " .format(collection_name, label_name, database_name, collection_name)) le = preprocessing.LabelEncoder() Y = le.fit_transform(Y) logging.info( "encode historical label list by sklearn preprocessing for training ... " ) label_name_list = le.classes_ # ['中性' '利好' '利空'] -> [0, 1, 2] # 根据历史新闻数据库创建词典,以及计算每个历史新闻的词袋向量;如果历史数据库创建的字典存在,则加载进内存 # 用未见过的新闻tokens去更新该词典 if not os.path.exists(ori_dict_path): if not os.path.exists(bowvec_save_path): _, _, historical_bow_vec = self.create_bag_of_word_representation( historical_raw_documents_list, new_dict_path=ori_dict_path, bow_vector_save_path=bowvec_save_path, is_saved_dict=True) logging.info( "create dictionary of historical news, and serialized in path -> {} ... " .format(ori_dict_path)) logging.info( "create bow-vector of historical news, and serialized in path -> {} ... " .format(bowvec_save_path)) else: _, _, _ = self.create_bag_of_word_representation( historical_raw_documents_list, new_dict_path=ori_dict_path, is_saved_dict=True) logging.info( "create dictionary of historical news, and serialized in path -> {} ... " .format(ori_dict_path)) else: if not os.path.exists(bowvec_save_path): _, _, historical_bow_vec = self.create_bag_of_word_representation( historical_raw_documents_list, new_dict_path=ori_dict_path, bow_vector_save_path=bowvec_save_path, is_saved_dict=True) logging.info( "historical news dictionary existed, which saved in path -> {}, but not the historical bow-vector" " ... 
".format(ori_dict_path)) else: historical_bow_vec_mmcorpus = corpora.MmCorpus( bowvec_save_path ) # type -> <gensim.corpora.mmcorpus.MmCorpus> historical_bow_vec = [] for _bow in historical_bow_vec_mmcorpus: historical_bow_vec.append(_bow) logging.info( "both historical news dictionary and bow-vector existed, load historical bow-vector to memory ... " ) start_time = time.time() updated_dictionary_with_old_and_unseen_news, unssen_documents_token_list = self.renew_dictionary( ori_dict_path, [unseen_raw_document], is_saved=True) end_time = time.time() logging.info( "renew dictionary with unseen news tokens, and serialized in path -> {}, " "which took {} mins ... ".format(ori_dict_path, (end_time - start_time) / 60)) unseen_bow_vector = [ updated_dictionary_with_old_and_unseen_news.doc2bow(doc_token) for doc_token in unssen_documents_token_list ] updated_bow_vector_with_old_and_unseen_news = [] updated_bow_vector_with_old_and_unseen_news.extend(historical_bow_vec) updated_bow_vector_with_old_and_unseen_news.extend(unseen_bow_vector) # 原先updated_bow_vector_with_old_and_unseen_news是list类型, # 但是经过下面序列化后重新加载进来的类型是gensim.corpora.mmcorpus.MmCorpus if is_saved_bow_vector and bowvec_save_path: corpora.MmCorpus.serialize( bowvec_save_path, updated_bow_vector_with_old_and_unseen_news ) # 保存更新后的bow向量,即包括新旧新闻的bow向量集 logging.info( "combined bow vector(type -> 'list') generated by historical news with unseen bow " "vector to create a new one ... ") if topic_model_type == "lsi": start_time = time.time() updated_tfidf_model_vector = self.transform_vectorized_corpus( updated_dictionary_with_old_and_unseen_news, updated_bow_vector_with_old_and_unseen_news, model_type="tfidf" ) # type -> <gensim.interfaces.TransformedCorpus object> end_time = time.time() logging.info( "regenerated TF-IDF model vector by updated dictionary and updated bow-vector, " "which took {} mins ... ".format((end_time - start_time) / 60)) start_time = time.time() model = models.LsiModel( updated_tfidf_model_vector, id2word=updated_dictionary_with_old_and_unseen_news, num_topics=config.TOPIC_NUMBER) # 初始化模型 model_vector = model[ updated_tfidf_model_vector] # type -> <gensim.interfaces.TransformedCorpus object> end_time = time.time() logging.info( "regenerated LSI model vector space by updated TF-IDF model vector space, " "which took {} mins ... ".format((end_time - start_time) / 60)) elif topic_model_type == "lda": start_time = time.time() model_vector = self.transform_vectorized_corpus( updated_dictionary_with_old_and_unseen_news, updated_bow_vector_with_old_and_unseen_news, model_type="lda") end_time = time.time() logging.info( "regenerated LDA model vector space by updated dictionary and bow-vector, " "which took {} mins ... ".format((end_time - start_time) / 60)) # 将gensim.interfaces.TransformedCorpus类型的lsi模型向量转为numpy矩阵 start_time = time.time() latest_matrix = corpus2dense(model_vector, num_terms=model_vector.obj.num_terms).T end_time = time.time() logging.info( "transform {} model vector space to numpy.adarray, " "which took {} mins ... ".format(topic_model_type.upper(), (end_time - start_time) / 60)) # 利用历史数据的话题模型向量(或特征),进一步训练新闻分类器 start_time = time.time() train_x, train_y, test_x, test_y = utils.generate_training_set( latest_matrix[:-1, :], Y) clf = self.classifier.train(train_x, train_y, test_x, test_y, model_type=classifier_model) end_time = time.time() logging.info( "finished training by sklearn {} using latest {} model vector space, which took {} mins ... 
" .format(classifier_model.upper(), topic_model_type.upper(), (end_time - start_time) / 60)) label_id = clf.predict(latest_matrix[-1, :].reshape(1, -1))[0] return label_name_list[label_id]
if __name__ == "__main__": from Hisoka.classifier import Classifier from Kite.database import Database from sklearn import preprocessing database = Database() topicmodelling = TopicModelling() raw_documents_list = [] Y = [] for row in database.get_collection("stocknews", "sz000001").find(): if "30DaysLabel" in row.keys(): raw_documents_list.append(row["Article"]) Y.append(row["30DaysLabel"]) le = preprocessing.LabelEncoder() Y = le.fit_transform(Y) _, corpora_dictionary, corpus = topicmodelling.create_bag_of_word_representation( raw_documents_list) model_vector = topicmodelling.transform_vectorized_corpus( corpora_dictionary, corpus, model_type="lsi") csr_matrix = utils.convert_to_csr_matrix(model_vector) train_x, train_y, test_x, test_y = utils.generate_training_set( csr_matrix, Y) classifier = Classifier() classifier.svm(train_x, train_y, test_x, test_y) # lsi Tue, 15 Dec 2020 14:54:08 classifier.py[line:54] INFO train_pred: 0.9829 test_pred: 0.703 (只是去掉停用词、tab符以及空格符) # lsi Tue, 15 Dec 2020 17:00:58 classifier.py[line:54] INFO train_pred: 0.9852 test_pred: 0.7492(去掉不含中文的词以及只有一个字符的词) # lda Tue, 15 Dec 2020 17:29:56 classifier.py[line:54] INFO train_pred: 0.9498 test_pred: 0.7426(去掉不含中文的词以及只有一个字符的词)