Ejemplo n.º 1
0
    def _train_lda(vectorizer,
                   corpora_path,
                   id2word_path,
                   model_dir,
                   model_fname=model_fname,
                   num_topics=10):
        """Train and save an LDA model over bow or tfidf vectors.

        Trains an LDA model from the corpus stored in {corpora_path} and the
        gensim dictionary stored in {id2word_path}, then saves the model
        under {model_dir}/{vectorizer}/{model_fname}.

        Args:
            vectorizer(str) :- vectorization method, choices=["bow", "tfidf"]
            corpora_path(path) :- .txt file holding the corpus, one
                space-separated document per line
            id2word_path(path) :- file holding the saved gensim dictionary
            model_dir(path) :- directory to save the gensim LDA model under
            model_fname(path) :- model file name
            num_topics(int) :- LDA hyperparameter, number of topics

        Raises:
            AssertionError :- if vectorizer is not "bow" or "tfidf"
            OSError :- if model_dir does not exist

        Returns:
            the trained gensim.models.LdaModel
        """
        # Raise directly instead of `assert` so the check survives `python -O`;
        # AssertionError is kept so existing callers' except clauses still match.
        if vectorizer not in ("bow", "tfidf"):
            raise AssertionError("vectorizer must be bow or tfidf")

        if not os.path.isdir(model_dir):
            raise OSError(model_dir, "doesn't exist")

        with open(corpora_path, 'r', encoding="utf8") as fp:
            corpora = [line.strip() for line in fp]
        id2word = gensim.corpora.Dictionary.load(id2word_path)
        corpus = [id2word.doc2bow(doc.split(" ")) for doc in corpora]

        # For tfidf we must actually compute idf weights; the original
        # referenced an undefined `corpus_tfidf_mm` and serialized the plain
        # bow corpus. Write the MmCorpus next to the model directory.
        if vectorizer == "tfidf":
            tfidf = gensim.models.TfidfModel(corpus)
            corpus_tfidf_mm = os.path.join(model_dir, "corpus_tfidf.mm")
            MmCorpus.serialize(corpus_tfidf_mm, tfidf[corpus])
            corpus = MmCorpus(corpus_tfidf_mm)

        model = gensim.models.LdaModel(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics)

        model_path = os.path.join(model_dir, vectorizer)
        make_dir(model_path)
        model_path = os.path.join(model_path, model_fname)
        # Never overwrite an existing model file.
        if not os.path.isfile(model_path):
            model.save(model_path)
            print('model saved')
        else:
            print(f"{model_path} already exists")
        return model
Ejemplo n.º 2
0
    def _transform_corpora(self, normalizer, corpora_dir, corpora_path,
                           id2word_path):
        """Normalize corpus files and build a gensim dictionary.

        1. Collects every .txt file under {corpora_dir} as corpus material.
        2. Each preprocessed line is appended as one line to {corpora_path}.
        3. Saves the resulting id2word dictionary to {id2word_path}.

        Args:
            normalizer :- object with a .tokenize(line) method; only used
                when self.is_text is truthy (raw text input)
            corpora_dir(path) :- directory containing the corpus files
            corpora_path(path) :- .txt file aggregating all corpora
            id2word_path(path) :- gensim dictionary file

        Raises:
            OSError :- if corpora_dir, the parent of corpora_path, or the
                grandparent of id2word_path does not exist
        """
        corpora = []
        if not os.path.isdir(corpora_dir):
            raise OSError(corpora_dir, "doesn't exist")

        if not os.path.isdir(os.path.dirname(corpora_path)):
            raise OSError(os.path.dirname(corpora_path), " doesn't exist")

        if not os.path.isdir(os.path.dirname(os.path.dirname(id2word_path))):
            raise OSError("the grandparent directory of ", id2word_path,
                          " doesnt't exist")

        # Append mode preserved: repeated runs keep extending corpora_path.
        # Context managers replace the original unclosed-on-exception handle
        # and the redundant f.close() after a `with` block.
        with open(corpora_path, 'a', encoding="utf8") as output_tfidf:
            for fname in os.listdir(corpora_dir):
                if not fname.endswith('txt'):
                    continue
                fname = os.path.join(corpora_dir, fname)
                print(fname + ' read')
                with open(fname, encoding="utf8") as f:
                    for line in f:
                        # Raw text gets tokenized; pre-tokenized input is
                        # simply split on single spaces.
                        if self.is_text:
                            words = normalizer.tokenize(line)
                        else:
                            words = line.split(" ")
                        if words:
                            corpora.append(words)
                            output_tfidf.write('{}\n'.format(" ".join(words)))

        id2word = gensim.corpora.Dictionary(corpora)

        parent_dir = os.path.dirname(id2word_path)
        make_dir(parent_dir)
        # Never overwrite an existing dictionary file.
        if not os.path.isfile(id2word_path):
            id2word.save(id2word_path)
            print('id2word saved')
        else:
            print(id2word_path, ' already exists')
Ejemplo n.º 3
0
    def _create_corporaListAndCorporaText(normalizer, corpora_source,
                                          corpora_txt, id2word_fname):
        '''Normalize corpus files and build a gensim dictionary.

        Collects every .txt file under {corpora_source} as corpus material,
        appends each preprocessed line as one line to {corpora_txt}, and
        saves the resulting id2word dictionary to {id2word_fname}.

        Args:
            normalizer :- object with a .tokenize(line) method
            corpora_source(path) :- directory containing the corpus files
            corpora_txt(path) :- .txt file aggregating all corpora
            id2word_fname(path) :- gensim dictionary file

        Raises:
            OSError :- if corpora_source, the parent of corpora_txt, or the
                grandparent of id2word_fname does not exist
        '''
        corpora = []
        if not os.path.isdir(corpora_source):
            raise OSError(corpora_source, "doesn't exist")

        if not os.path.isdir(os.path.dirname(corpora_txt)):
            raise OSError(os.path.dirname(corpora_txt), " doesn't exist")

        if not os.path.isdir(os.path.dirname(os.path.dirname(id2word_fname))):
            raise OSError("the grandparent directory of ", id2word_fname,
                          " doesnt't exist")

        # Append mode preserved: repeated runs keep extending corpora_txt.
        # Context managers replace the original unclosed-on-exception handle
        # and the redundant f.close() after a `with` block.
        with open(corpora_txt, 'a', encoding="utf8") as output_tfidf:
            for fname in os.listdir(corpora_source):
                if not fname.endswith('txt'):
                    continue
                fname = os.path.join(corpora_source, fname)
                print(fname + ' read')
                with open(fname, encoding="utf8") as f:
                    for line in f:
                        words = normalizer.tokenize(line)
                        if words:
                            corpora.append(words)
                            output_tfidf.write('{}\n'.format(" ".join(words)))

        id2word = gensim.corpora.Dictionary(corpora)

        parent_dir = os.path.dirname(id2word_fname)
        make_dir(parent_dir)
        # Never overwrite an existing dictionary file.
        if not os.path.isfile(id2word_fname):
            id2word.save(id2word_fname)
            print('id2word saved')
        else:
            print(id2word_fname, ' already exists')
Ejemplo n.º 4
0
    def _createAndSave_lda_tfidf(corpora_txt,
                                 id2word_fname,
                                 ldaModel_save_repo,
                                 num_topics=10):
        '''Train and save a tfidf-based LDA model.

        Trains an lda_tfidf model from the corpus stored in {corpora_txt}
        and the gensim dictionary stored in {id2word_fname}, with
        {num_topics} topics, and saves the model under
        {ldaModel_save_repo}/gensim_tfidf/.

        Args:
            corpora_txt(path) :- .txt file holding the corpus, one
                space-separated document per line
            id2word_fname(path) :- file holding the saved gensim dictionary
            ldaModel_save_repo(path) :- directory to save the LDA model under
            num_topics(int) :- LDA hyperparameter, number of topics

        Raises:
            OSError :- if ldaModel_save_repo does not exist

        Returns:
            the trained gensim.models.LdaModel
        '''
        if not os.path.isdir(ldaModel_save_repo):
            raise OSError(ldaModel_save_repo, "doesn't exist")

        with open(corpora_txt, 'r', encoding="utf8") as fp:
            corpora = [line.strip() for line in fp]
        id2word = gensim.corpora.Dictionary.load(id2word_fname)

        bow_corpus = [id2word.doc2bow(doc.split(" ")) for doc in corpora]
        # Actually apply idf weighting: the original serialized the plain bow
        # vectors despite the "tfidf" name. The MmCorpus is written inside the
        # model repo instead of the process CWD.
        tfidf = gensim.models.TfidfModel(bow_corpus)
        mm_path = os.path.join(ldaModel_save_repo, 'corpus_tfidf.mm')
        MmCorpus.serialize(mm_path, tfidf[bow_corpus])
        mm = MmCorpus(mm_path)
        lda_tfidf = gensim.models.LdaModel(corpus=mm,
                                           id2word=id2word,
                                           num_topics=num_topics)

        # os.path.join replaces string '+' path concatenation.
        model_dir = os.path.join(ldaModel_save_repo, 'gensim_tfidf')
        make_dir(model_dir)
        model_path = os.path.join(model_dir, 'crawl_news.model')
        # Never overwrite an existing model file.
        if not os.path.isfile(model_path):
            lda_tfidf.save(model_path)
            print('lda_tfidf saved')
        else:
            print(model_path, 'already exists')
        return lda_tfidf