Example #1
    def lda(self, column, method='mallet', save_model=None, load_model=None):
        if method == 'mallet':
            print("Mallet LDA")
        else:
            raise ValueError("Invalid paramater for LDA.method: {}".format(method))
        tmp_dir = os.path.join(tempfile.gettempdir(), "mallet_lda/")
        if not os.path.exists(tmp_dir):
            os.makedirs(tmp_dir)

        if not hasattr(self, "vocab"):
            self.__learn_vocab(column)

        if len(self.__bag_of_words) != 0:
            docs, id2word = self.__bag_of_words[column]
        else:
            docs, id2word = self.__get_bag_of_words(column)
        model = LdaMallet(mallet_path=self.mallet_path,
                          id2word=id2word,
                          prefix=tmp_dir,
                          num_topics=self.num_topics,
                          iterations=self.lda_max_iter,
                          optimize_interval=20)
        model.train(docs)
        doc_topics = list()
        for doc_vec in model.read_doctopics(model.fdoctopics()):
            topic_ids, vecs = zip(*doc_vec)
            doc_topics.append(np.array(vecs))
        self.features["lda"] = np.array(doc_topics)
        self.feature_names["lda"] = model.get_topics()
        return
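The method above is an excerpt from a larger wrapper class (it relies on self.mallet_path, self.num_topics, self.lda_max_iter and private bag-of-words helpers). A minimal, self-contained sketch of the same pattern, assuming gensim < 4.0 (where the Mallet wrapper lives in gensim.models.wrappers) and a placeholder Mallet binary path, could look like this:

import os
import tempfile

import numpy as np
from gensim.corpora import Dictionary
from gensim.models.wrappers import LdaMallet  # gensim < 4.0

# toy corpus; replace with real tokenized documents
docs = [["human", "machine", "interface"],
        ["graph", "trees", "minors"]]
id2word = Dictionary(docs)
bow = [id2word.doc2bow(doc) for doc in docs]

tmp_dir = os.path.join(tempfile.gettempdir(), "mallet_lda/")
os.makedirs(tmp_dir, exist_ok=True)

model = LdaMallet(mallet_path="/path/to/mallet",  # assumption: adjust to your install
                  id2word=id2word,
                  prefix=tmp_dir,
                  num_topics=2,
                  iterations=50,
                  optimize_interval=20)
model.train(bow)

# collect one dense topic vector per document, as in the example above
doc_topics = []
for doc_vec in model.read_doctopics(model.fdoctopics()):
    topic_ids, vecs = zip(*doc_vec)
    doc_topics.append(np.array(vecs))
doc_topics = np.array(doc_topics)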
Example #2
def fit_lda(prefix, tokenized_docs, id2word,
            mallet_path=os.environ["MALLET_PATH"],
            num_topics=500, iterations=500):

    if not os.path.isdir(prefix):
        os.makedirs(prefix)

    if os.path.exists(os.path.join(prefix, "saved_model.pkl")):
        return utils.SaveLoad.load(os.path.join(prefix, "saved_model.pkl"))
    elif tokenized_docs is None:
        raise ValueError("LDA model not found at {}/{}".format(prefixed, "saved_model.pkl"))

    if mallet_path is None or mallet_path == "":
        raise ValueError("No mallet path specified")

    corpus = [id2word.doc2bow(tokens) for tokens in tokenized_docs.values.tolist()]

    lda_model = LdaMallet(mallet_path=mallet_path,
                          prefix=prefix,
                          corpus=corpus,
                          id2word=id2word,
                          iterations=iterations,
                          workers=4,
                          num_topics=num_topics,
                          optimize_interval=20)
    lda_model.save(os.path.join(prefix, "saved_model.pkl"))
    id2word.save_as_text(os.path.join(prefix, "id2word"))

    # save clean lda weights for later analysis
    W = lda_model.get_topics()
    W = pd.DataFrame(W).rename(columns=id2word)
    W.index = pd.Series(["lda.{}".format(i) for i in range(len(W))], name="topic_id")
    W.to_csv(os.path.join(prefix, "lda_weights.csv"))
    return lda_model
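A hypothetical call of fit_lda() (the corpus, prefix directory, and Mallet path below are assumptions): tokenized_docs is expected to be a pandas Series of token lists. Note that the mallet_path default is read from os.environ["MALLET_PATH"] at function definition time, so that variable must exist when the module is imported even if an explicit path is passed later.

import pandas as pd
from gensim.corpora import Dictionary

tokenized_docs = pd.Series([["human", "machine", "interface"],
                            ["graph", "trees", "minors"]])
id2word = Dictionary(tokenized_docs.tolist())

# prefix directory and Mallet binary path are placeholders
lda_model = fit_lda("lda_output/", tokenized_docs, id2word,
                    mallet_path="/path/to/mallet",
                    num_topics=10, iterations=100)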
Example #3
class LdaMalletHandler(TransformerMixin, BaseEstimator):
    def __init__(self,
                 n_components=100,
                 mallet_path=None,
                 prefix=None,
                 iterations=1000,
                 vectorizer=None):
        self.n_components = n_components
        self.mallet_path = mallet_path
        self.prefix = prefix
        self.iterations = iterations
        self.vectorizer = vectorizer

    def vect2gensim(self, vectorizer, dtmatrix):
        # transform sparse matrix into gensim corpus and dictionary
        corpus_vect_gensim = Sparse2Corpus(dtmatrix, documents_columns=False)
        dictionary = Dictionary.from_corpus(
            corpus_vect_gensim,
            id2word=dict(
                (id, word) for word, id in vectorizer.vocabulary_.items()))
        return (corpus_vect_gensim, dictionary)

    def fit(self, X, y=None):
        print('vect2gensim')
        corpus, dictionary = self.vect2gensim(self.vectorizer, X)
        self.model = LdaMallet(self.mallet_path,
                               iterations=self.iterations,
                               corpus=corpus,
                               num_topics=self.n_components,
                               id2word=dictionary)
        return self

    def transform(self, X):
        corpus = Sparse2Corpus(X, documents_columns=False)
        doc_topic = self.model[corpus]
        mat = np.zeros((X.shape[0], self.n_components), dtype=np.float64)
        for did, doc in enumerate(doc_topic):
            for topic in doc:
                mat[did][topic[0]] = topic[1]
        return mat

    def get_doc_topic_matrix(self):
        arr = []
        # read Mallet's per-document topic file; the first two columns are the
        # document index and source name, so keep only the topic proportions
        with open(self.model.fdoctopics(), "r") as f:
            lines = f.read().splitlines()
        for line in lines:
            arr.append(line.split()[2:])
        return np.array(arr, dtype=np.float64)

    def get_topic_words_matrix(self):
        return self.model.get_topics()
def main():
    num_topics = 10
    #doc_topics_path='C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\151文章\\mallet模型\\10_3_doctopics.txt'
    MALLET_PATH = os.path.join(r"D:\Mallet", "mallet-2.0.8", "bin",
                               "mallet.bat")  # r"D:\Mallet\mallet-2.0.8\bin"
    texts = wenzhang_Lemmatizer1.texts2
    dictionary = corpora.Dictionary(texts)
    dictionary.save('dictionary_mallet_10_3.dictionary')
    #dictionary = corpora.Dictionary.load('dictionary_mallet_10_3.dictionary')
    word_id = dictionary.token2id
    corpus = [dictionary.doc2bow(text) for text in texts]
    # corpora.MmCorpus.serialize('corpus_mallet_10_3.mm', corpus)  # save the corpus
    # corpus = corpora.MmCorpus('corpus_wenzhang.mm')  # load it
    # print(os.path.abspath('corpus.mm'))
    mallet_lda_model = LdaMallet(mallet_path=MALLET_PATH,
                                 corpus=corpus,
                                 num_topics=num_topics,
                                 id2word=dictionary)
    mallet_lda_model.save(
        'C:\\Users\\asus\\Desktop\\测试\\model\\mallet_lda_model_10_3.model')
    #mallet_lda_model = LdaMallet.load('C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\151文章\\mallet模型\\mallet_lda_model_10_3.model')
    topic_words20 = mallet_lda_model.show_topics(num_topics=num_topics,
                                                 num_words=20)
    # print(topic_words20)
    writetopic_wordToExcleFile(
        topic_words20,
        'C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\151文章\\topic_words20_10_3.xls'
    )
    topic_words = mallet_lda_model.get_topics()
    print(len(topic_words), len(topic_words[0]))
    doc_topics = txt_to_numpy(mallet_lda_model.fdoctopics())  #doc_topics_path
    #print(mallet_lda_model.fdoctopics())
    writedoc_topicToExcleFile(
        doc_topics,
        'C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\151文章\\doc_topics20_10_3'
    )
    return texts, word_id, topic_words, doc_topics, num_topics
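A minimal usage sketch for the LdaMalletHandler class from Example #3, assuming its imports (scikit-learn's BaseEstimator/TransformerMixin, gensim's Sparse2Corpus, Dictionary and LdaMallet, and numpy) are available and that a Mallet binary exists at the placeholder path:

from sklearn.feature_extraction.text import CountVectorizer

docs = ["human machine interface",
        "graph of trees and minors",
        "machine learning on graphs"]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(docs)

lda = LdaMalletHandler(n_components=2,
                       mallet_path="/path/to/mallet",  # assumption
                       iterations=100,
                       vectorizer=vectorizer)
lda.fit(X)
doc_topic = lda.transform(X)               # shape (n_docs, n_components)
topic_word = lda.get_topic_words_matrix()  # shape (n_components, vocab_size)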