import os

import gensim
from gensim.corpora import MmCorpus


def _train_lda(vectorizer, corpora_path, id2word_path, model_dir,
               model_fname=model_fname, num_topics=10):
    """Train and save a bow- or tfidf-based LDA model.

    Trains the model on the corpus stored in {corpora_path} and the gensim
    dictionary stored in {id2word_path}, then saves it under {model_dir}.

    Args:
        vectorizer(str) :- vectorization method, choices=["bow", "tfidf"]
        corpora_path(path) :- .txt file holding the corpus
        id2word_path(path) :- file holding the gensim dictionary
        model_dir(path) :- directory in which the gensim LDA model is saved
        model_fname(path) :- model file name (defaults to the module-level
            constant model_fname)
        num_topics(int) :- LDA hyperparameter, the number of topics
    """
    if vectorizer not in ("bow", "tfidf"):
        raise ValueError("vectorizer must be bow or tfidf")
    if not os.path.isdir(model_dir):
        raise OSError(f"{model_dir} doesn't exist")

    corpora = []
    with open(corpora_path, 'r', encoding="utf8") as fp:
        for line in fp:
            corpora.append(line.strip())

    id2word = gensim.corpora.Dictionary.load(id2word_path)
    corpus = [id2word.doc2bow(doc.split(" ")) for doc in corpora]
    # For tfidf, the idf weights must be computed first: reweight the bow
    # corpus before training. corpus_tfidf_mm is assumed to be a module-level
    # path constant for the serialized corpus.
    if vectorizer == "tfidf":
        tfidf = gensim.models.TfidfModel(corpus)
        MmCorpus.serialize(corpus_tfidf_mm, tfidf[corpus])
        corpus = MmCorpus(corpus_tfidf_mm)

    model = gensim.models.LdaModel(corpus=corpus, id2word=id2word,
                                   num_topics=num_topics)

    model_path = os.path.join(model_dir, vectorizer)
    make_dir(model_path)  # project helper: create the directory if missing
    model_path = os.path.join(model_path, model_fname)
    if not os.path.isfile(model_path):
        model.save(model_path)
        print('model saved')
    else:
        print(f"{model_path} already exists")
    return model
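# A minimal usage sketch of _train_lda (the paths below are illustrative
# assumptions, not taken from the repo). Wrapped in a demo helper so that
# importing this module stays side-effect free.
def _demo_train_lda():
    lda = _train_lda(vectorizer="tfidf",
                     corpora_path="data/corpora.txt",
                     id2word_path="models/id2word.dict",
                     model_dir="models",
                     num_topics=20)
    # Inspect what was learned: show_topics yields (topic_id, word-weight
    # string) pairs.
    for topic_id, topic in lda.show_topics(num_topics=5, num_words=8):
        print(topic_id, topic)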
def _transform_corpora(self, normalizer, corpora_dir, corpora_path,
                       id2word_path):
    """Transform the corpora.

    1. Collect every .txt file under {corpora_dir} as raw corpora.
    2. Preprocess each line of those files and write it as one line of
       {corpora_path}.
    3. Save the id2word dictionary to {id2word_path}.

    Args:
        normalizer :- tokenizer applied to each line when self.is_text is True
        corpora_dir(path) :- directory containing the corpora files
        corpora_path(path) :- .txt file gathering all the corpora
        id2word_path(path) :- gensim dictionary file
    """
    corpora = []
    if not os.path.isdir(corpora_dir):
        raise OSError(f"{corpora_dir} doesn't exist")
    if not os.path.isdir(os.path.dirname(corpora_path)):
        raise OSError(f"{os.path.dirname(corpora_path)} doesn't exist")
    if not os.path.isdir(os.path.dirname(os.path.dirname(id2word_path))):
        raise OSError(f"the grandparent directory of {id2word_path} doesn't exist")

    with open(corpora_path, 'a', encoding="utf8") as output_tfidf:
        for file in os.listdir(corpora_dir):
            if file.endswith('txt'):
                file = os.path.join(corpora_dir, file)
                print(file + ' read')
                with open(file, encoding="utf8") as f:
                    for line in f:
                        # Tokenize raw text; pre-tokenized input is split on spaces.
                        if self.is_text:
                            words = normalizer.tokenize(line)
                        else:
                            words = line.split(" ")
                        if len(words) > 0:
                            corpora.append(words)
                            output_tfidf.write('{}\n'.format(" ".join(words)))

    id2word = gensim.corpora.Dictionary(corpora)
    parent_dir = os.path.dirname(id2word_path)
    make_dir(parent_dir)
    if not os.path.isfile(id2word_path):
        id2word.save(id2word_path)
        print('id2word saved')
    else:
        print(id2word_path, ' already exists')
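# Usage sketch for _transform_corpora, kept as a comment because it needs an
# instance of the enclosing class (hypothetical names: `processor` stands for
# such an instance with is_text set, `normalizer` for any object exposing a
# .tokenize(str) -> list-of-tokens method; the paths are illustrative):
#
#   processor._transform_corpora(normalizer,
#                                corpora_dir="data/raw",
#                                corpora_path="data/corpora.txt",
#                                id2word_path="models/id2word.dict")
#
# Afterwards data/corpora.txt holds one whitespace-joined document per line,
# which is exactly the format _train_lda above reads back.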
def _create_corporaListAndCorporaText(normalizer, corpora_source, corpora_txt,
                                      id2word_fname):
    '''Collect every .txt file under {corpora_source} as raw corpora,
    preprocess each of their lines and write it as one line of {corpora_txt},
    and save the id2word dictionary to {id2word_fname}.

    Args:
        normalizer :- tokenizer applied to each line
        corpora_source(path) :- directory containing the corpora files
        corpora_txt(path) :- .txt file gathering all the corpora
        id2word_fname(path) :- gensim dictionary file
    '''
    corpora = []
    if not os.path.isdir(corpora_source):
        raise OSError(f"{corpora_source} doesn't exist")
    if not os.path.isdir(os.path.dirname(corpora_txt)):
        raise OSError(f"{os.path.dirname(corpora_txt)} doesn't exist")
    if not os.path.isdir(os.path.dirname(os.path.dirname(id2word_fname))):
        raise OSError(f"the grandparent directory of {id2word_fname} doesn't exist")

    with open(corpora_txt, 'a', encoding="utf8") as output_tfidf:
        for file in os.listdir(corpora_source):
            if file.endswith('txt'):
                file = os.path.join(corpora_source, file)
                print(file + ' read')
                with open(file, encoding="utf8") as f:
                    for line in f:
                        words = normalizer.tokenize(line)
                        if len(words) > 0:
                            corpora.append(words)
                            output_tfidf.write('{}\n'.format(" ".join(words)))

    id2word = gensim.corpora.Dictionary(corpora)
    parent_dir = os.path.dirname(id2word_fname)
    make_dir(parent_dir)
    if not os.path.isfile(id2word_fname):
        id2word.save(id2word_fname)
        print('id2word saved')
    else:
        print(id2word_fname, ' already exists')
def _createAndSave_lda_tfidf(corpora_txt, id2word_fname, ldaModel_save_repo,
                             num_topics=10):
    '''Train and save a tfidf-based LDA model.

    Trains the model on the corpus stored in {corpora_txt} and the gensim
    dictionary stored in {id2word_fname}, with {num_topics} topics, then
    saves it under {ldaModel_save_repo}.

    Args:
        corpora_txt(path) :- .txt file holding the corpus
        id2word_fname(path) :- file holding the gensim dictionary
        ldaModel_save_repo(path) :- directory in which the gensim LDA model is saved
        num_topics(int) :- LDA hyperparameter, the number of topics
    '''
    if not os.path.isdir(ldaModel_save_repo):
        raise OSError(f"{ldaModel_save_repo} doesn't exist")

    corpora = []
    with open(corpora_txt, 'r', encoding="utf8") as fp:
        for line in fp:
            corpora.append(line.strip())

    id2word = gensim.corpora.Dictionary.load(id2word_fname)
    corpus = [id2word.doc2bow(doc.split(" ")) for doc in corpora]
    # Reweight the bow corpus by idf so the model is actually tfidf-based.
    tfidf = gensim.models.TfidfModel(corpus)
    MmCorpus.serialize('corpus_tfidf.mm', tfidf[corpus])
    mm = MmCorpus('corpus_tfidf.mm')
    lda_tfidf = gensim.models.LdaModel(corpus=mm, id2word=id2word,
                                       num_topics=num_topics)

    model_dir = os.path.join(ldaModel_save_repo, 'gensim_tfidf')
    make_dir(model_dir)
    model_path = os.path.join(model_dir, 'crawl_news.model')
    if not os.path.isfile(model_path):
        lda_tfidf.save(model_path)
        print('lda_tfidf saved')
    else:
        print(model_path, 'already exists')
    return lda_tfidf
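# Sketch: reload a saved model and infer topics for one new document. The
# default paths are illustrative assumptions; point them at the files the
# functions above actually wrote.
def _demo_load_and_infer(id2word_fname="models/id2word.dict",
                         model_path="models/gensim_tfidf/crawl_news.model"):
    id2word = gensim.corpora.Dictionary.load(id2word_fname)
    lda = gensim.models.LdaModel.load(model_path)
    # New documents must be tokenized the same way as at training time.
    bow = id2word.doc2bow("some whitespace tokenized text".split(" "))
    # get_document_topics returns (topic_id, probability) pairs.
    print(lda.get_document_topics(bow))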