class LDARecommender(Recommender):
    def __init__(self):
        pass

    def preprocess(self, text):
        return preprocessing.cleanTokens(text)

    def train(self, train_filename):
        print("train LDA")
        train_name = os.path.basename(train_filename)
        model_filename = train_name + ".lda_model"
        # Build the corpus up front: recommend() needs self.corpus.dictionary even when a
        # previously trained model is only loaded from disk.
        self.corpus = preprocessing.GensimCorpus(train_filename)
        if os.path.isfile(model_filename):
            self.model = LdaMallet.load(model_filename)
        else:
            self.model = LdaMallet(mallet_path,
                                   self.corpus,
                                   num_topics=100,
                                   id2word=self.corpus.dictionary)
            self.model.save(model_filename)
            topics_str = self.model.show_topics(num_topics=-1)
            with open(train_name + ".lda_model.topics", 'w') as topics_file:
                topics_file.write(str(topics_str))

    def recommend(self, input_text):
        input_bow = self.corpus.dictionary.doc2bow(self.preprocess(input_text))
        input_topics = self.model[input_bow]
        print("lda topics: " + str(input_topics))
        return input_text
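
# A minimal usage sketch for LDARecommender above, kept as comments because the class
# depends on the project's Recommender base class, its preprocessing module and a
# configured mallet_path (all assumptions taken from the snippet, not shown here):
#
#     recommender = LDARecommender()
#     recommender.train("corpus.txt")        # trains or reloads corpus.txt.lda_model
#     recommender.recommend("query text")    # prints the inferred topic mixture
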
class LdaMalletHandler:
    def __init__(self, mallet_path):
        self.mallet_path = mallet_path

    def run_model(self, model_name, corpus, **kwargs):
        self.model_name = model_name
        self.dictionary = Dictionary(corpus)
        corpus_bow = [self.dictionary.doc2bow(text) for text in corpus]
        os.makedirs("ldamodels/" + model_name, exist_ok=True)
        self.model = LdaMallet(self.mallet_path, corpus_bow, id2word=self.dictionary,
                               prefix="./ldamodels/" + model_name + "/", **kwargs)

    def save_model(self):
        self.model.save("ldamodels/"+self.model_name+"/model.model")
        self.dictionary.save("ldamodels/"+self.model_name+"/dict.dict")

    def load_model(self, model_name):
        self.model_name = model_name
        self.dictionary  = corpora.Dictionary.load("ldamodels/"+self.model_name+"/dict.dict")
        self.model = LdaMallet.load("ldamodels/"+self.model_name+"/model.model")
        self.model.mallet_path = self.mallet_path
    
    def doc_topics(self, doc_idx):
        if not hasattr(self, 'doc_retriever'):
            self.doc_retriever = DocumentRetriever(self.model.fdoctopics())
        return self.doc_retriever.doc_topics(doc_idx)
    
    def ext_doc_topics(self, ext_doc):
        doc_bow = self.dictionary.doc2bow(ext_doc)
        doc_topics = self.model[doc_bow]
        doc_topics.sort(key=lambda x: x[1], reverse=True)
        return doc_topics

    def ext_doc_n_most_similar(self, ext_doc, n=5, metric='cosine'):
        if not hasattr(self, 'doc_retriever'):
            self.doc_retriever = DocumentRetriever(self.model.fdoctopics())
        doc_bow = self.dictionary.doc2bow(ext_doc)
        doc_topics = self.model[doc_bow]
        topics = [probability for _, probability in doc_topics]
        most_similar = self.doc_retriever.n_most_similar(topics, n=n, metric=metric)
        return most_similar

    def n_most_representative(self, topic, n=3):
        if not hasattr(self, 'doc_retriever'):
            self.doc_retriever = DocumentRetriever(self.model.fdoctopics())
        topics = np.zeros(self.model.num_topics)
        topics[topic] = 1
        most_similar = self.doc_retriever.n_most_similar(topics, n=n)
        return most_similar
        
    def get_string_topics(self, num_topics=-1, num_words=10):
        if num_topics == -1:
            num_topics = self.model.num_topics
        string_topics = []
        for topic in self.model.print_topics(num_topics=num_topics, num_words=num_words):
            # print_topics returns strings like '0.1*"word1" + 0.09*"word2"'; keep the words only
            splitted = topic[1].split("\"")
            result = [splitted[2 * i + 1] for i in range(len(splitted) // 2)]
            string_topics.append(" ".join(result))
        return string_topics
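
# A minimal usage sketch for LdaMalletHandler above; the MALLET binary location and the
# tiny tokenized corpus are assumptions, and the class relies on gensim's Dictionary and
# LdaMallet plus the project's DocumentRetriever being importable:
if __name__ == "__main__":
    docs = [["human", "machine", "interface"],
            ["graph", "minors", "survey"],
            ["graph", "trees", "paths", "survey"]]
    handler = LdaMalletHandler(mallet_path="mallet-2.0.8/bin/mallet")
    handler.run_model("demo_model", docs, num_topics=2, iterations=50)
    handler.save_model()
    print(handler.get_string_topics(num_words=5))        # top words per topic
    print(handler.ext_doc_topics(["graph", "survey"]))   # topic mixture of a new doc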
Example #3
def fit_lda(prefix, tokenized_docs, id2word,
            mallet_path=os.environ.get("MALLET_PATH"),
            num_topics=500, iterations=500):

    if not os.path.isdir(prefix):
        os.makedirs(prefix)

    if os.path.exists(os.path.join(prefix, "saved_model.pkl")):
        return utils.SaveLoad.load(os.path.join(prefix, "saved_model.pkl"))
    elif tokenized_docs is None:
        raise ValueError("LDA model not found at {}/{}".format(prefix, "saved_model.pkl"))

    if mallet_path is None or mallet_path == "":
        raise ValueError("No mallet path specified")

    corpus = [id2word.doc2bow(tokens) for tokens in tokenized_docs.values.tolist()]

    lda_model = LdaMallet(mallet_path=mallet_path,
                          prefix=prefix,
                          corpus=corpus,
                          id2word=id2word,
                          iterations=iterations,
                          workers=4,
                          num_topics=num_topics,
                          optimize_interval=20)
    lda_model.save(os.path.join(prefix, "saved_model.pkl"))
    id2word.save_as_text(os.path.join(prefix, "id2word"))

    # save clean lda weights for later analysis
    W = lda_model.get_topics()
    W = pd.DataFrame(W).rename(columns=id2word)
    W.index = pd.Series(["lda.{}".format(i) for i in range(len(W))], name="topic_id")
    W.to_csv(os.path.join(prefix, "lda_weights.csv"))
    return lda_model
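
# A minimal usage sketch for fit_lda above, assuming pandas/gensim are imported as in the
# snippet and the MALLET_PATH environment variable points at a working MALLET install;
# the documents, dictionary and output prefix below are illustrative assumptions:
if __name__ == "__main__":
    from gensim.corpora import Dictionary
    docs = pd.Series([["graph", "minors", "survey"],
                      ["human", "machine", "interface"],
                      ["graph", "trees", "paths"]])
    id2word = Dictionary(docs.tolist())
    model = fit_lda("lda_demo/", docs, id2word, num_topics=2, iterations=100)
    print(model.show_topics(num_topics=2, num_words=5))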
Example #4
def run():
    # Get the Preprocessed Dataset
    df = pd.read_pickle('./data/tmp/preprocessed.pkl')

    if os.path.isfile('./models/MALLET/mallet_model.pkl'):
        # Let's not do any model retraining without building in topic stability constraints
        #     e.g. number of docs or tokens now in different topics

        seen = False  # Data we provide is new and unseen for the model
        with open('./models/MALLET/mallet_model.pkl', 'rb') as modelfile:
            topic_model = pickle.load(modelfile)

        with open('./models/MALLET/mallet_dict.pkl', 'rb') as dictfile:
            dictionary = pickle.load(dictfile)
            df['bow'] = df['tokens'].apply(dictionary.doc2bow)

    else:
        seen = True  # any data we provide is used to train the model
        with Timer('Train the LDA Model'):
            test_range = (5, 50)
            df, corpus, dictionary = get_corpus_and_dict(df, 'tokens')
            list_of_models, scores = topic_count_selection(
                dictionary, corpus, list(df['tokens']), test_range)

            plot_coherence(
                test_range,
                scores).savefig('./models/MALLET/ModelCoherence.png')

            # Let's save the model with highest coherence
            num_topics = test_range[0] + scores.index(max(scores)) + 1
            topic_model = LdaMallet('/home/hadoop/Mallet-master/bin/mallet',
                                    corpus=corpus,
                                    num_topics=num_topics,
                                    id2word=dictionary,
                                    iterations=1000,
                                    prefix=f'{os.getcwd()}/models/MALLET/',
                                    random_seed=42)

            print(f"* Chosen Model with {num_topics} topics")
            with open('./models/MALLET/mallet_model.pkl', 'wb') as modelfile:
                topic_model.save(modelfile)
            with open('./models/MALLET/mallet_corpus.pkl', 'wb') as corpusfile:
                pickle.dump(corpus, corpusfile)
            with open('./models/MALLET/mallet_dict.pkl', 'wb') as dictfile:
                pickle.dump(dictionary, dictfile)

    df = get_topic_model_scores(df, topic_model, seen=seen)
    df.to_pickle('./data/tmp/scored.pkl')

    print("\nSample")
    print(df.head(), "\n")
    def set_model(self, lang: str, data_version: int, dictionary_version: float, model_version: str, param_name: str, param_version: int,
                  model_file_path: str, language_processed_data: list):
        my_path = os.path.abspath(os.path.dirname(__file__))
        logging.info("---- Creating LDA Mallet model")
        logging.info("------ Getting LDA Mallet model file")
        mallet_path = os.path.join(my_path, "../../statics/mallet-2.0.8/bin/mallet")
        # Accessing the dictionary once forces gensim to populate id2token, which the
        # id2word argument below relies on.
        temp = self.essentials.dictionary[0]
        model = LdaMallet(mallet_path,
                          corpus=self.essentials.corpus, num_topics=self.number_of_topics,
                          id2word=self.essentials.dictionary.id2token)
        model.save(model_file_path)
        self.model = model
        logging.info("---- LDA Mallet model is created")

        metrics = self.get_model_evaluation_metrics(language_processed_data)
        parameters = self.get_model_parameters()
        self.write_model_evaluation_metrics(lang, data_version, dictionary_version, model_version,param_name, param_version, metrics, parameters)
        return
def get_topics(num, corpus, id2word, output_dir, all_sentences):
    print(num)
    ldamallet = LdaMallet(args.mallet_dir,
                          corpus=corpus,
                          num_topics=num,
                          prefix=output_dir + "/" + str(num),
                          workers=4,
                          id2word=id2word,
                          iterations=1000,
                          random_seed=42)
    coherence_model_ldamallet = CoherenceModel(model=ldamallet,
                                               texts=all_sentences,
                                               dictionary=id2word,
                                               coherence='c_v')
    coherence_ldamallet = coherence_model_ldamallet.get_coherence()
    print('\nCoherence Score: ', coherence_ldamallet)
    keywords = {i: ", ".join([word for word, prop in ldamallet.show_topic(i)]) for i in range(ldamallet.num_topics)}
    with open(output_dir + "/" + str(num) + '_words.json', 'w') as f:
        f.write(json.dumps(keywords))
    ldamallet.save(output_dir + "/" + str(num))
    #ldamallet.show_topics(num_topics=num, formatted=True)
    return coherence_ldamallet
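
# A minimal sketch of how get_topics above could be used to sweep a few topic counts and
# keep the most coherent one; the candidate counts and the surrounding corpus, dictionary
# and output directory are assumptions:
def pick_best_topic_count(corpus, id2word, output_dir, all_sentences, candidates=(10, 20, 30)):
    scores = {num: get_topics(num, corpus, id2word, output_dir, all_sentences)
              for num in candidates}
    best = max(scores, key=scores.get)
    print("best topic count:", best, "coherence:", scores[best])
    return best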
def main():
    num_topics = 10
    #doc_topics_path='C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\151文章\\mallet模型\\10_3_doctopics.txt'
    MALLET_PATH = os.path.join(r"D:\Mallet", "mallet-2.0.8", "bin",
                               "mallet.bat")  # r"D:\Mallet\mallet-2.0.8\bin"
    texts = wenzhang_Lemmatizer1.texts2
    dictionary = corpora.Dictionary(texts)
    dictionary.save('dictionary_mallet_10_3.dictionary')
    #dictionary = corpora.Dictionary.load('dictionary_mallet_10_3.dictionary')
    word_id = dictionary.token2id
    corpus = [dictionary.doc2bow(text) for text in texts]
    # corpora.MmCorpus.serialize('corpus_mallet_10_3.mm', corpus)  # save corpus
    # corpus = corpora.MmCorpus('corpus_wenzhang.mm')  # load corpus
    # print(os.path.abspath('corpus.mm'))
    mallet_lda_model = LdaMallet(mallet_path=MALLET_PATH,
                                 corpus=corpus,
                                 num_topics=num_topics,
                                 id2word=dictionary)
    mallet_lda_model.save(
        'C:\\Users\\asus\\Desktop\\测试\\model\\mallet_lda_model_10_3.model')
    #mallet_lda_model = LdaMallet.load('C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\151文章\\mallet模型\\mallet_lda_model_10_3.model')
    topic_words20 = mallet_lda_model.show_topics(num_topics=num_topics,
                                                 num_words=20)
    # print(topic_words20)
    writetopic_wordToExcleFile(
        topic_words20,
        'C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\151文章\\topic_words20_10_3.xls'
    )
    topic_words = mallet_lda_model.get_topics()
    print(len(topic_words), len(topic_words[0]))
    doc_topics = txt_to_numpy(mallet_lda_model.fdoctopics())  #doc_topics_path
    #print(mallet_lda_model.fdoctopics())
    writedoc_topicToExcleFile(
        doc_topics,
        'C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\151文章\\doc_topics20_10_3'
    )
    return texts, word_id, topic_words, doc_topics, num_topics
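
# A minimal usage sketch for main() above, kept as comments because the input texts and
# output paths inside the function are machine-specific assumptions:
#
#     texts, word_id, topic_words, doc_topics, num_topics = main()
#     print(num_topics, len(texts), topic_words.shape)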
    # TOO BIG TO SERIALIZE
    # Save the Dict and Corpus
    try:
        corpora.MmCorpus.serialize('mag_bow_corpus.mm',
                                   corpus)  # save corpus to disk
    except OverflowError:
        # Don't save corpus, call LDA directly
        print("Overflow while saving corpus, skip and train.")
        ldamallet = LdaMallet(mallet_path,
                              corpus=corpus,
                              num_topics=300,
                              id2word=id2word_dictionary)
        print('LDA Model trained')

        try:
            ldamallet.save('ldamallet_mag.model')
        except OverflowError:
            print("Trying to pickle model using protocol 4")
            with open('ldamallet_mag.model', 'wb') as pick:
                pickle.dump(ldamallet, pick, protocol=pickle.HIGHEST_PROTOCOL)
        print("Lda model saved to disk")

        # Show Topics
        pprint(ldamallet.show_topics(formatted=False))

        # Compute Coherence Score
        coherence_model_ldamallet = CoherenceModel(
            model=ldamallet,
            texts=data_stemmed,
            dictionary=id2word_dictionary,
            coherence='c_v')
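
        # A minimal follow-up, assuming the coherence_model_ldamallet constructed above:
        coherence_ldamallet = coherence_model_ldamallet.get_coherence()
        print('\nCoherence Score: ', coherence_ldamallet)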
Example #9
DICT_PATH  = 'docs.dict'
MODEL_PATH = 'docs.model'

raw_corpus = ["Human machine interface for lab abc computer applications",
              "A survey of user opinion of computer system response time",
              "The EPS user interface management system",
              "System and human system engineering testing of EPS",
              "Relation of user perceived response time to error measurement",
              "The generation of random binary unordered trees",
              "The intersection graph of paths in trees",
              "Graph minors IV Widths of trees and well quasi ordering",
              "Graph minors A survey"]
docs = [doc.split() for doc in raw_corpus]

if exists(MODEL_PATH):
    print('Testing...\n')
    dict = corpora.Dictionary.load(DICT_PATH)
    lda  = LdaMallet.load(MODEL_PATH)
    for doc in docs:
        topics = lda[dict.doc2bow(doc)]
        print(topics, doc)
else:
    print('Training...\n')
    dictionary = corpora.Dictionary(docs)
    dictionary.save(DICT_PATH)
    corpus = [dictionary.doc2bow(text) for text in docs]

    lda = LdaMallet(MALLET_PATH, corpus=corpus,
                    num_topics=3, workers=60, id2word=dictionary, iterations=50, prefix=PREFIX)
    lda.save(MODEL_PATH)
Example #10
try:
    corpus = corpora.MmCorpus('unpaywallmag_bow_corpus.mm')
except FileNotFoundError:
    corpus = [id2word_dictionary.doc2bow(textlist) for textlist in tqdm(data_stemmed)]
    print("Doc2Bow corpus created")
    # TOO BIG TO SERIALIZE
    # Save the Dict and Corpus
    try:
        corpora.MmCorpus.serialize('unpaywallmag_bow_corpus.mm', corpus)  # save corpus to disk
    except OverflowError:
        # Don't save corpus, call LDA directly
        print("Overflow while saving corpus, skip and train.")
        ldamallet = LdaMallet(mallet_path, corpus=corpus, num_topics=300, id2word=id2word_dictionary)
        print('LDA Model trained')

        try:
            ldamallet.save('ldamallet_model.model')
        except OverflowError:
            print("Trying to pickle model using protocol 4")
            with open('ldamallet_model.model', 'wb') as pick:
                pickle.dump(ldamallet, pick, protocol=pickle.HIGHEST_PROTOCOL)
        print("Lda model saved to disk")

        # Show Topics
        pprint(ldamallet.show_topics(formatted=False))

        # Compute Coherence Score
        coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_stemmed,
                                           dictionary=id2word_dictionary, coherence='c_v')

        coherence_ldamallet = coherence_model_ldamallet.get_coherence()
        print('\nCoherence Score: ', coherence_ldamallet)
Example #11
def extract_features(max_documents=50000000,
                     max_words_per_doc=50000000,
                     incl_tf=True,
                     incl_df=True,
                     incl_graph=True,
                     incl_w2v=True,
                     incl_topic_model=True,
                     incl_atm=True):
    
    ######### SIMPLE FREQUENCY MEASURES ######################################################
    if incl_df or incl_tf or incl_graph:
        doc_cnt = max_documents
        # set containers:
        tf, df, network = Counter(), Counter(), nx.Graph()
        doc_ner_idx = {}
        dir_ner_iterator = DirectoryIterator(path_pattern='../workspace/wikified_periodicals/*.wikified',
                                         max_documents=max_documents,
                                         max_words_per_doc=max_words_per_doc,
                                         get='wiki')
        dir_filename_iterator = DirectoryIterator(path_pattern='../workspace/wikified_periodicals/*.wikified',
                                         max_documents=max_documents,
                                         max_words_per_doc=max_words_per_doc,
                                         get='filename')
        for filename, words in zip(dir_filename_iterator, dir_ner_iterator):
            # count the ners:
            ner_cnt = Counter()
            ner_cnt.update(words)
            if ner_cnt:
                # collect which ners appear in which doc:
                doc_ner_idx[os.path.basename(filename)] = set([n for n in ner_cnt])
                # update global tf and df:
                for k, v in ner_cnt.items():
                    tf[k] += v
                    df[k] += 1
                # update nodes in network:
                for ner in ner_cnt:
                    if ner not in network:
                        network.add_node(ner)
                # update edges in network:
                for ner1, ner2 in combinations(ner_cnt, 2):
                    try:
                        network[ner1][ner2]['weight'] += 1
                    except KeyError:
                        network.add_edge(ner1, ner2, weight=1)
        
        # dump for reuse:
        pickle.dump(tf, open('../workspace/tf.m', 'wb'))
        pickle.dump(df, open('../workspace/df.m', 'wb'))
        pickle.dump(doc_ner_idx, open('../workspace/doc_ner_idx.m', 'wb'))
        pickle.dump(network, open('../workspace/nx.m', 'wb'))
        
        # scale network values:
        max_weight = float(max([network[n1][n2]['weight']\
                            for n1, n2 in network.edges_iter()]))
        for n1, n2 in network.edges_iter():
            network[n1][n2]['weight'] /= max_weight
        nx.write_gexf(network,
                      '../workspace/dbnl_network.gexf',
                      prettyprint=True)
    
    ######### WORD2VEC MODEL ######################################################
    if incl_w2v:
        # build w2v model:
        dir_w2v_iterator = DirectoryIterator(path_pattern='../workspace/wikified_periodicals/*.wikified',
                                         max_documents=max_documents,
                                         max_words_per_doc=max_words_per_doc,
                                         get='w2v')
        w2v_model = Word2Vec(dir_w2v_iterator, window=15, min_count=10,
                                         size=150, workers=10, negative=5)
        w2v_model.init_sims(replace=True)
        w2v_model.save(os.path.abspath('../workspace/w2v_model.m'))

    ######### STANDARD TOPIC MODEL ######################################################
    if incl_topic_model:
        # build vocab for lda:
        vocab_lda_iterator = DirectoryIterator(path_pattern='../workspace/wikified_periodicals/*.wikified',
                                         max_documents=max_documents,
                                         max_words_per_doc=max_words_per_doc,
                                         get='lda_vocab')
        lda_dict = corpora.Dictionary(vocab_lda_iterator)
        lda_dict.filter_extremes(no_below=25, no_above=0.5, keep_n=5000)
        
        # build lda model:
        dir_lda_iterator = DirectoryIterator(path_pattern='../workspace/wikified_periodicals/*.wikified',
                                         max_documents=max_documents,
                                         max_words_per_doc=max_words_per_doc,
                                         get='lda',
                                         lda_dict=lda_dict)
        lda_workspace_path = '../workspace/mallet_output/'
        if not os.path.isdir(lda_workspace_path):
            os.mkdir(lda_workspace_path)
        mallet_path = '/home/mike/GitRepos/dbnl/code/mallet-2.0.8RC2/bin/mallet'
        lda_model = LdaMallet(mallet_path, dir_lda_iterator, num_topics=150,
                                       id2word=lda_dict, iterations=1900,
                                       prefix=lda_workspace_path)
        lda_model.save('../workspace/lda_model.m')

    ######### AUTHOR TOPIC MODEL ######################################################
    if incl_atm:
        # build vocab for lda:
        vocab_lda_iterator = DirectoryIterator(path_pattern='../workspace/wikified_periodicals/*.wikified',
                                         max_documents=max_documents,
                                         max_words_per_doc=max_words_per_doc,
                                         get='lda_vocab')
        lda_dict = corpora.Dictionary(vocab_lda_iterator)
        lda_dict.filter_extremes(no_below=25, no_above=0.5, keep_n=5000)
        lda_dict.compactify()
        atm_vocab = []
        for i, w in lda_dict.items():
            atm_vocab.append(w)
        print(len(atm_vocab), 'vocab')
        atm_vocab = tuple(atm_vocab)
        corpus, doc_author = [], []
        for filename in sorted(glob.glob('../workspace/wikified_periodicals/*.wikified')):
            doc_words, auth_set = [], set()
            max_documents -= 1
            if max_documents % 100 == 0:
                print('\t-', max_documents, 'to go')
            if max_documents <= 1:
                break
            word_cnt = max_words_per_doc
            for line in codecs.open(filename, 'r', encoding='utf8'):
                comps = line.strip().split('\t')
                if comps:
                    idx, token, lemma, pos, pos_conf, ner, wiki = comps
                    if wiki != 'X':
                        auth_set.add(wiki)
                    elif pos.startswith(('N(', 'ADJ(')):
                        try:
                            doc_words.append(atm_vocab.index(token.lower()))
                        except ValueError:
                            pass
                word_cnt -= 1
                if word_cnt <= 0:
                    break
            if auth_set and doc_words:
                corpus.append(sorted(doc_words))
                doc_author.append(sorted(list(auth_set)))
        atm_author_idx = {}
        for i1, authors in enumerate(doc_author):
            for i2, auth in enumerate(authors):
                if auth not in atm_author_idx:
                    atm_author_idx[auth] = len(atm_author_idx)
                doc_author[i1][i2] = atm_author_idx[auth]
        n_topic = 30
        atm_model = AuthorTopicModel(n_doc=len(corpus),
                                     n_voca=len(atm_vocab),
                                     n_topic=n_topic,
                                     n_author=len(atm_author_idx))
        atm_model.fit(corpus, doc_author, max_iter=10)
        for k in range(n_topic):
            top_words = get_top_words(atm_model.TW, atm_vocab, k, 10)
            print('topic ', k , ','.join(top_words))
        author_id = 7
        fig = plt.figure(figsize=(12,6))
        plt.bar(range(n_topic), atm_model.AT[author_id]/np.sum(atm_model.AT[author_id]))
        #plt.title(author_idx[author_id])
        plt.xticks(np.arange(n_topic)+0.5, ['\n'.join(get_top_words(atm_model.TW, atm_vocab, k, 10)) for k in range(n_topic)])
        #plt.show()
        plt.savefig('atm1.pdf')
        pickle.dump(atm_vocab, open('../workspace/atm_vocab.m', 'wb'))
        pickle.dump(atm_model, open('../workspace/atm_model.m', 'wb'))
        pickle.dump(atm_author_idx, open('../workspace/atm_author_idx.m', 'wb'))
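
# A minimal usage sketch for extract_features above; the hard-coded workspace layout,
# MALLET path and the DirectoryIterator/AuthorTopicModel helpers it relies on are all
# assumptions carried over from the snippet:
if __name__ == "__main__":
    extract_features(max_documents=1000,
                     max_words_per_doc=5000,
                     incl_graph=False,
                     incl_w2v=False,
                     incl_atm=False)  # frequency counts plus the standard topic model only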
Example #12
    # TOO BIG TO SERIALIZE
    # Save the Dict and Corpus
    try:
        corpora.MmCorpus.serialize('arxivmag_bow_corpus.mm',
                                   corpus)  # save corpus to disk
    except OverflowError:
        # Don't save corpus, call LDA directly
        print("Overflow while saving corpus, skip and train.")
        ldamallet = LdaMallet(mallet_path,
                              corpus=corpus,
                              num_topics=300,
                              id2word=id2word_dictionary)
        print('LDA Model trained')

        try:
            ldamallet.save('ldamallet_arxiv.model')
        except OverflowError:
            print("Trying to pickle model using protocol 4")
            with open('ldamallet_arxiv.model', 'wb') as pick:
                pickle.dump(ldamallet, pick, protocol=pickle.HIGHEST_PROTOCOL)
        print("Lda model saved to disk")

        # Show Topics
        pprint(ldamallet.show_topics(formatted=False))

        # Compute Coherence Score
        coherence_model_ldamallet = CoherenceModel(
            model=ldamallet,
            texts=data_stemmed,
            dictionary=id2word_dictionary,
            coherence='c_v')
def generate_topics(data, topics, gender):
    dictionary = Dictionary(data)
    corpus = [dictionary.doc2bow(text) for text in data]
    print('performing topic modeling with', topics, 'topics')
    ldamodel = LdaMallet(TopicModeling.MALLET_PATH, corpus=corpus, num_topics=topics, id2word=dictionary)
    ldamodel.save('ldamodel.' + gender + '.' + str(topics))
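
# A minimal usage sketch for generate_topics above, kept as comments because
# TopicModeling.MALLET_PATH and the tokenized document lists (female_docs/male_docs are
# hypothetical names) are assumptions:
#
#     for topics in (10, 20, 30):
#         generate_topics(female_docs, topics, 'female')
#         generate_topics(male_docs, topics, 'male')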
        print(
            'Train an LDA model over the given corpus using the given dictionary.'
        )
        print('If num_topics is not specified, use the default of 100.')
        print(
            'If num_passes is specified, makes multiple passes over the corpus.'
        )
        print('This uses MALLET to train a topic model.')
    else:
        _, mm_fname, dict_fname, model_fname = sys.argv[:4]
        num_topics = int(sys.argv[4]) if len(sys.argv) >= 5 else 100

        try:
            mallet_path = sep.join(
                [os.environ['MALLET_HOME'], 'bin', 'mallet'])
        except KeyError:
            logging.error('please set the MALLET_HOME environment variable to '
                          'the root directory of your MALLET installation')
            exit()

        mm = MmCorpus(mm_fname)
        id2word = Dictionary.load(dict_fname)

        lda_model = LdaMallet(mallet_path,
                              corpus=normalize_langs(mm),
                              id2word=id2word,
                              num_topics=num_topics,
                              prefix=model_fname[:-6],
                              iterations=100)
        lda_model.save(model_fname)
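
        # A hypothetical invocation of the script above (the script name is an assumption;
        # MALLET_HOME must point at the root of the MALLET installation):
        #
        #     MALLET_HOME=/opt/mallet python train_mallet_lda.py corpus.mm corpus.dict corpus.model 100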
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    words_freq = pd.DataFrame(words_freq, columns=['word', 'count'])
    acronyms = words_freq[words_freq.word.str.len() <= 3]
    acronyms.to_csv('acronyms.csv')

    if train:
        print('begin training mallet LDA model')
        mallet_lda_model = LdaMallet(path_to_mallet_binary,
                                     corpus=bow_corpus,
                                     iterations=3900,
                                     num_topics=140,
                                     alpha=60,
                                     id2word=dictionary,
                                     prefix=path_to_mallet_output,
                                     workers=multiprocessing.cpu_count())
        mallet_lda_model.save('{}lda_model.pkl'.format(path_to_mallet_output))
        # mallet_lda_model.save('{}lda_model_{}.pkl'.format(path_to_mallet_output, uuid))
        print('calculate model coherence C_v score')
        coherence_model_lda = CoherenceModel(model=mallet_lda_model,
                                             texts=docs,
                                             dictionary=dictionary,
                                             coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()
        print('model coherence score: {}'.format(coherence_lda))
    else:
        print('load mallet LDA model')
        # mallet_lda_model = LdaMallet.load('{}lda_model.pkl'.format(path_to_mallet_output))
        mallet_lda_model = LdaMallet.load('{}lda_model_{}.pkl'.format(
            path_to_mallet_output, uuid))

    # convert the model to gensim format
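
    # A minimal sketch of that conversion, assuming the mallet_lda_model above;
    # malletmodel2ldamodel is gensim's helper for turning the MALLET wrapper into a
    # native LdaModel (useful for get_document_topics, pyLDAvis, etc.):
    from gensim.models.wrappers.ldamallet import malletmodel2ldamodel
    gensim_lda_model = malletmodel2ldamodel(mallet_lda_model)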
Example #16
class NlPipe:
    def __init__(self,
                 list_of_docs,
                 path,
                 document_ids=None,
                 language_model="en_core_web_lg",
                 tagger=False,
                 parser=False,
                 ner=False,
                 categorization=False,
                 remove_stopwords=True,
                 remove_punctuation=True,
                 set_lower=True,
                 remove_num=True,
                 expand_stopwords=True,
                 language_detection=False,
                 allowed_languages=frozenset({'en'}),
                 no_processes=None):
        """
        :param list_of_docs: List of strings where every document is one string.
        :param path: Working directory used to cache preprocessed data, dictionaries and intermediate model files.
        :param document_ids: The ids of the documents, matching the order of the list_of_docs
        :param language_model: Spacy language model to be used for text preprocessing
        :param tagger: Use spacy part-of-speech tagger.
        :param parser: Use spacy to annotate syntactic dependencies in documents.
        :param ner: Use spacy for entity recognition and annotation.
        :param categorization: Use spacy to assign document labels
        :param remove_stopwords: Remove stop words during text preprocessing.
        :param remove_punctuation: Remove punctuation during text preprocessing.
        :param set_lower: Convert all strings to lowercase during text preprocessing.
        :param remove_num: Remove numeric characters during text preprocessing.
        :param expand_stopwords: Remove non-alpha-characters in stop words and add them to the stop words.
        :param language_detection: Detect language of docs.
        :param allowed_languages: Allowed languages for the documents.
        :param no_processes: Number of processes to use; defaults to the number of physical cores minus one.
        """
        self.path = path
        self.pipe_disable = []
        if not tagger:
            self.pipe_disable.append("tagger")
        if not parser:
            self.pipe_disable.append("parser")
        if not ner:
            self.pipe_disable.append("ner")
        if not categorization:
            self.pipe_disable.append("textcat")
        self.remove_punctuation = remove_punctuation
        self.remove_stop_words = remove_stopwords
        self.remove_num = remove_num
        self.set_lower = set_lower
        self.input_docs = list_of_docs
        self.document_ids = np.array(document_ids) if document_ids is not None else None
        self.use_gpu = spacy.prefer_gpu()
        self.nlp = spacy.load(language_model)
        if expand_stopwords:
            stops = [stop for stop in self.nlp.Defaults.stop_words]
            for stop in stops:
                self.nlp.Defaults.stop_words.add(re.sub(r"[\W]", "", stop))
        self.spacy_docs = None
        self.preprocessed_docs = None
        self.bag_of_words = None
        self.preprocessing_batch_size = 50000
        if no_processes is None:
            self.processes = psutil.cpu_count(logical=False) - 1
        else:
            self.processes = no_processes
        self.lda_model = None
        self.result_df = None
        self.word_topic_df = None
        self.allowed_languages = allowed_languages
        self.language_detection = language_detection
        self.id2word = None
        self.coherence_dict = None
        self.max_df = None
        self.min_df = None
        self.use_phrases = None
        self.filter_extremes_value = None
        self.keep_n = None
        self.keep_tokens = None

    def enable_pipe_component(self, component):
        """
        Method to enable components of the spacy pipeline after initialization of the class.
        :param component: Component to enable (see https://spacy.io/usage/processing-pipelines/ for available
        components).
        """
        if component in self.pipe_disable:
            self.pipe_disable.remove(component)

    def disable_pipe_component(self, component):
        """
        Method to disable components of the spacy pipeline after initialization of the class.
        :param component: Component to disable (see https://spacy.io/usage/processing-pipelines/ for available
        components).
        """
        if component not in self.pipe_disable:
            self.pipe_disable.append(component)

    def preprocess_spacy(self,
                         load_existing=True,
                         save_data=True,
                         filter_loaded=None):
        """
        Method to preprocess the documents using spacy with the enabled pipeline components.
        """
        if os.path.exists(
                f"{self.path}text_df_preprocessed_spacy") and load_existing:
            preprocessed_df = pd.read_pickle(
                f"{self.path}text_df_preprocessed_spacy")
            if filter_loaded is None:
                self.spacy_docs = preprocessed_df['preprocessed_text'].to_list(
                )
            else:
                self.spacy_docs = preprocessed_df['preprocessed_text'].loc[
                    filter_loaded].to_list()
        else:
            if self.language_detection:
                self.spacy_docs = [
                    doc for doc in tqdm(self.nlp.pipe(
                        self.input_docs,
                        disable=self.pipe_disable,
                        n_process=self.processes,
                        batch_size=self.preprocessing_batch_size),
                                        desc="Preprocessing text with spacy: ")
                    if detect(doc.text) in self.allowed_languages
                ]
            else:
                self.spacy_docs = []
                for doc in tqdm(self.nlp.pipe(
                        self.input_docs,
                        disable=self.pipe_disable,
                        n_process=self.processes,
                        batch_size=self.preprocessing_batch_size),
                                desc="Preprocessing spacy"):
                    self.spacy_docs.append(doc)
            if save_data:
                temp_df = pd.DataFrame([self.document_ids,
                                        self.spacy_docs]).transpose()
                temp_df.columns = ['thread_id', 'preprocessed_text']
                temp_df.to_pickle(f"{self.path}text_df_preprocessed_spacy")

    def preprocess(self, load_existing=True, filter_loaded=None):
        """
        Remove stop words, numbers and punctuation as well as lower-case all of the tokens, depending on the settings
        passed to the class during initialization.
        """
        if os.path.exists(
                f"{self.path}/text_df_preprocessed") and load_existing:
            print("Found preprocessed data. Loading")
            preprocessed_df = pd.read_pickle(
                f"{self.path}/text_df_preprocessed")
            if filter_loaded is None:
                self.preprocessed_docs = preprocessed_df[
                    'preprocessed_text'].to_list()
                print('Preprocessed data loaded.')
            else:
                self.preprocessed_docs = preprocessed_df[
                    'preprocessed_text'].loc[filter_loaded].to_list()
                if isinstance(self.document_ids, np.ndarray):
                    self.document_ids = self.document_ids[filter_loaded]
                print(
                    f'{sum(filter_loaded)} preprocessed docs of {len(self.input_docs)} docs loaded.'
                )
        else:
            self.preprocessed_docs = []
            if not self.spacy_docs:
                self.preprocess_spacy()
            for spacy_doc in tqdm(
                    self.spacy_docs,
                    desc="Removing stop words/punctuation/numeric chars: "):
                doc = []
                for token in spacy_doc:
                    # todo: check if useful condition
                    if not self.remove_stop_words and token.is_stop:
                        word = token.text
                    elif token.is_stop:
                        continue
                    else:
                        word = token.lemma_
                    if self.set_lower:
                        word = word.lower()
                    if self.remove_num:
                        word = re.sub(r"[\d]", "", word)
                    if self.remove_punctuation:
                        word = re.sub(r"[\W]", "", word)
                    if len(word) >= 2 and word != "wbr":
                        doc.append(word)
                self.preprocessed_docs.append(doc)
            temp_df = pd.DataFrame([self.document_ids, self.preprocessed_docs]).\
                transpose()
            temp_df.columns = ['thread_id', 'preprocessed_text']
            temp_df.to_pickle(f"{self.path}/text_df_preprocessed")

    def create_bag_of_words(self,
                            filter_extremes=True,
                            min_df=5,
                            max_df=0.5,
                            keep_n=100000,
                            keep_tokens=None,
                            use_phrases=None,
                            bigram_min_count=1000,
                            bigram_threshold=100,
                            trigram_threshold=100,
                            load_existing=True,
                            tfidf=False):
        """
        :param filter_extremes: En-/Disable filtering of tokens that occur too frequent/not frequent enough
        (https://radimrehurek.com/gensim/corpora/dictionary.html)
        :param min_df: Keep only tokens that appear in at least n documents (see link above)
        :param max_df: Keep only tokens that appear in less than the fraction of documents (see link above)
        :param keep_n: Keep only n most frequent tokens (see link above)
        :param keep_tokens: Iterable of tokens not to be remove (see link above)
        :param use_phrases: Set to "bigram" or "trigram" if the use of Gensim Phrases
        (https://radimrehurek.com/gensim/models/phrases.html) is wanted. Will create bigrams/trigrams of frequently
        co-occurring words (e.g. "new", "york" => "new_york").
        :param bigram_min_count: Minimum occurrence of bigrams to be considered by Gensim Phrases.
        :param bigram_threshold: Threshold for Gensim Phrases bigram settings.
        :param trigram_threshold: Threshold for Gensim Phrases trigram settings.
        """
        if use_phrases not in {None, "bigram", "trigram"}:
            raise Exception(
                "Please use a valid option (None, 'bigram' or 'trigram') to make use of this function."
            )
        #todo: check logic
        else:
            if use_phrases == "bigram" and not isinstance(
                    bigram_threshold, int) and not isinstance(
                        bigram_min_count, int):
                raise Exception(
                    "Thresholds or minimum count for bigrams/trigrams not integer. Please provide "
                    "threshold and minimum count for bigrams (and trigrams) as integer."
                )
            elif use_phrases == "trigram" and (not isinstance(bigram_threshold, int)
                    or not isinstance(trigram_threshold, int) or not isinstance(bigram_min_count, int)):
                raise Exception(
                    "Thresholds or minimum count for bigrams/trigrams not integer. Please provide "
                    "threshold and minimum count for bigrams (and trigrams) as integer."
                )

        if not self.preprocessed_docs:
            self.preprocess()
        if os.path.exists(f"{self.path}/gensim_dict_{filter_extremes}_{min_df}_{max_df}_{use_phrases}") \
                and load_existing:
            self.load_dict(
                path=
                f"{self.path}/gensim_dict_{filter_extremes}_{min_df}_{max_df}_{use_phrases}"
            )
            self.filter_extremes_value = filter_extremes
            self.min_df = min_df
            self.max_df = max_df
            self.use_phrases = use_phrases
        else:
            #todo: add auto check for existing dictionary here.
            if use_phrases == "bigram" or use_phrases == "trigram":
                self.create_bigrams(bigram_min_count=bigram_min_count,
                                    bigram_threshold=bigram_threshold)
            if use_phrases == "trigram":
                self.create_bigrams(bigram_min_count=bigram_min_count,
                                    bigram_threshold=bigram_threshold)
                self.create_trigrams(trigram_threshold=trigram_threshold)
            self.create_dictionary(filter_extremes=filter_extremes,
                                   min_df=min_df,
                                   max_df=max_df,
                                   keep_n=keep_n,
                                   keep_tokens=keep_tokens,
                                   use_phrases=use_phrases)
        self.create_bag_of_words_matrix(tfidf=tfidf)

    def create_bigrams(self, bigram_min_count, bigram_threshold):
        self.bigram_phrases = Phrases(self.preprocessed_docs,
                                      min_count=bigram_min_count,
                                      threshold=bigram_threshold)
        self.bigram_phraser = Phraser(self.bigram_phrases)
        self.preprocessed_docs = [
            self.bigram_phraser[doc]
            for doc in tqdm(self.preprocessed_docs, desc="Extracting bigrams")
        ]

    def create_trigrams(self, trigram_threshold):
        trigram_phrases = Phrases(self.bigram_phrases[self.preprocessed_docs],
                                  threshold=trigram_threshold)
        trigram_phraser = Phraser(trigram_phrases)
        self.preprocessed_docs = [
            trigram_phraser[self.bigram_phraser[doc]]
            for doc in tqdm(self.preprocessed_docs, desc="Extracting trigrams")
        ]

    def create_bag_of_words_matrix(self, tfidf=False):
        self.bag_of_words = [
            self.id2word.doc2bow(doc)
            for doc in tqdm(self.preprocessed_docs,
                            desc='Creating bag of words')
        ]
        if tfidf:
            self.create_tfidf()

    def create_dictionary(self, filter_extremes, min_df, max_df, keep_n,
                          keep_tokens, use_phrases):
        print('Creating dictionary.')
        self.id2word = corpora.Dictionary(self.preprocessed_docs)
        # todo: add autosave of dictionary here
        self.max_df = max_df
        self.min_df = min_df
        self.use_phrases = use_phrases
        self.filter_extremes_value = filter_extremes
        self.keep_n = keep_n
        self.keep_tokens = keep_tokens
        if filter_extremes:
            self.filter_extremes(min_df=self.min_df,
                                 max_df=self.max_df,
                                 keep_n=self.keep_n,
                                 keep_tokens=self.keep_tokens)
        self.save_dict(
            path=
            f"{self.path}/gensim_dict_{filter_extremes}_{min_df}_{max_df}_{use_phrases}"
        )

    def filter_extremes(self, min_df, max_df, keep_n, keep_tokens=[]):
        self.filter_extremes_value = True
        self.max_df = max_df
        self.min_df = min_df
        self.keep_n = keep_n
        self.keep_tokens = keep_tokens
        self.id2word.filter_extremes(no_below=self.min_df,
                                     no_above=self.max_df,
                                     keep_n=keep_n,
                                     keep_tokens=keep_tokens)

    def create_tfidf(self):
        tfidf_model = TfidfModel(self.bag_of_words)
        self.bag_of_words = [
            tfidf_model[vector]
            for vector in tqdm(self.bag_of_words,
                               desc="Creating tf-idf matrix")
        ]

    def create_lda_model(self,
                         no_topics=10,
                         random_state=42,
                         passes=5,
                         alpha='auto',
                         eta=None,
                         workers=None,
                         chunksize=2000):
        """
        :param no_topics: Number of topics that are to be explored by lda model
        :param random_state: Random state for reproducible results (default 42, gensim default is None)
        :param passes: Number of times the whole corpus is processed.
        :param alpha: set topic-document distribution prior alpha to "symmetric" or "asymmetric"
        (gensim default is "symmetric")
        :param eta: Word-topic distribution prior eta (beta)
        :param workers: Number of workers to use; defaults to self.processes. There seems to be a bug in gensim
        where 1 worker already uses all available cores and a higher number of workers results in a load bigger
        than the number of cores.
        :param chunksize: chunksize parameter of gensim
        """
        if eta is None:
            eta = 1 / no_topics
        if workers is None:
            workers = self.processes
        if self.bag_of_words is None:
            self.create_bag_of_words()
        self.lda_model = LdaMulticore(corpus=self.bag_of_words,
                                      id2word=self.id2word,
                                      num_topics=no_topics,
                                      eta=eta,
                                      workers=workers,
                                      random_state=random_state,
                                      alpha=alpha,
                                      passes=passes,
                                      chunksize=chunksize)

    def create_mallet_lda_model(self,
                                no_topics,
                                random_state=42,
                                workers=None,
                                mallet_path="mallet-2.0.8/bin/mallet",
                                iterations=1000,
                                custom_prefix=None):
        """
        Method to create a mallet lda model using gensim wrapper for lda mallet
        :param no_topics: Number of topics for lda model
        :param random_state: Random state to be able to reproduce model creation
        :param workers: Number of workers to use
        :param mallet_path: path to mallet binary, e.g. "mallet-2.0.8/bin/mallet"
        :param iterations: Number of sampling iterations over the corpus
        """
        if workers is None:
            workers = self.processes
        if self.bag_of_words is None:
            self.create_bag_of_words()
        if custom_prefix is None:
            prefix = f"{self.path}mallet_temp_"
        else:
            prefix = f"{self.path}mallet_temp_{custom_prefix}_"
        self.lda_model = LdaMallet(num_topics=no_topics,
                                   mallet_path=mallet_path,
                                   corpus=self.bag_of_words,
                                   id2word=self.id2word,
                                   random_seed=random_state,
                                   iterations=iterations,
                                   workers=workers,
                                   prefix=prefix)
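
    # A minimal usage sketch for this method, assuming an NlPipe instance built on a small
    # corpus and a local MALLET install (my_docs and the paths below are hypothetical):
    #
    #     pipe = NlPipe(list_of_docs=my_docs, path="./nlp_workdir/")
    #     pipe.create_bag_of_words()
    #     pipe.create_mallet_lda_model(no_topics=20, mallet_path="mallet-2.0.8/bin/mallet")
    #     print(pipe.calculate_coherence().get_coherence())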

    def calculate_coherence(self,
                            model=None,
                            coherence_score='c_v',
                            workers=None):
        """
        Method to calculate the coherence score of a given lda model. The model can either be provided or will be taken
        from the class.
        :param model: Model to use instead of the model saved within the class.
        :param coherence_score: Coherence score to calculate
        :param workers: Number of workers to use for coherence evaluation.
        :return: Return coherence model, which also contains the coherence score of a model.
        """
        if workers is None:
            workers = self.processes
        if model is None:
            model = self.lda_model
        if coherence_score != 'u_mass':
            coherence_model = CoherenceModel(model=model,
                                             texts=self.preprocessed_docs,
                                             dictionary=self.id2word,
                                             coherence=coherence_score,
                                             processes=workers)
        else:
            coherence_model = CoherenceModel(model=model,
                                             corpus=self.bag_of_words,
                                             dictionary=self.id2word,
                                             coherence=coherence_score,
                                             processes=workers)
        return coherence_model

    def search_best_model(self,
                          topic_list=frozenset({2, 3, 4, 5, 10, 15, 20, 25}),
                          alphas=[0.9, 0.5, 0.1],
                          etas=['auto', 0.9, 0.5, 0.1],
                          save_best_model=True,
                          save_models=False,
                          return_best_model=False,
                          passes=1,
                          coherence_scores=['c_v'],
                          chunksize=2000,
                          workers=None,
                          coherence_suffix=None):
        #todo: save best model within class.
        """
        Method to search for the best lda model for a given number of topics. The best model will be determined by its
        coherence score.
        :param topic_list: Iterable of integers of topics to test the coherence score for.
        :param alphas: Iterable of floats between 0 and 1 for determining the dirichlet prior of the lda model.
        :param save_best_model: Set to true if the best model has to be saved within the class.
        :param save_models: If set to false (default) only the coherence score for each combination of numbers of topics
        and alphas will be saved. If set to true, the lda model, the coherence score and the coherence model will be
        saved.
        :param return_best_model: If true, the method will return the best found model and the number of topics of this
        model.
        :return: Number of topics for the best result and the model with the best result of the coherence score
        """
        if coherence_suffix is None:
            path = f"{self.path}coherence_results"
        else:
            path = f"{self.path}coherence_results_{coherence_suffix}"
        if os.path.exists(path):
            print("coherence results found")
            with open(path, "rb") as f:
                self.coherence_dict = pickle.load(f)
        else:
            self.coherence_dict = {}
        if workers is None:
            workers = self.processes
        if return_best_model and not save_best_model:
            raise Exception(
                "To return the best model, the parameter save_best_model has to be set to True."
            )
        if self.coherence_dict and save_best_model:
            try:
                best_score = self.coherence_dict['best_score']
            except KeyError:
                best_score = 0
        else:
            best_score = 0
        for no_topics in tqdm(topic_list,
                              desc="Calculating topic coherences: "):
            for alpha in tqdm(alphas, desc='Alphas'):
                for eta in tqdm(etas, desc='Etas'):
                    coherence_key = f"no={no_topics}-a={alpha}-e={eta}-filter={self.filter_extremes_value}" \
                                    f"-min_df={self.min_df}-max_df={self.max_df}-phrases={self.use_phrases}" \
                                    f"-k_n={self.keep_n}-k_t={self.keep_tokens}"
                    if coherence_key in self.coherence_dict.keys():
                        print("coherence value found, skipping")
                        continue
                    else:
                        self.create_lda_model(no_topics=no_topics,
                                              alpha=alpha,
                                              eta=eta,
                                              passes=passes,
                                              chunksize=chunksize,
                                              workers=workers)
                        self.coherence_dict[coherence_key] = {}
                        if save_models:
                            self.coherence_dict[coherence_key][
                                "lda_model"] = self.lda_model
                        for coherence_score in coherence_scores:
                            coherence_model = self.calculate_coherence(
                                coherence_score=coherence_score,
                                workers=workers)
                            coherence_result = coherence_model.get_coherence()
                            if save_models:
                                self.coherence_dict[coherence_key][
                                    "coherence_model"] = coherence_model
                            self.coherence_dict[coherence_key][
                                coherence_score] = coherence_result
                            if save_best_model and coherence_result > best_score:
                                self.coherence_dict[
                                    "best_score"] = coherence_result
                                self.coherence_dict[
                                    "best_model"] = self.lda_model
                                self.coherence_dict[
                                    "best_topic_no"] = no_topics
                                self.coherence_dict["best_alpha"] = alpha
                                self.coherence_dict["best_eta"] = eta
                            if coherence_result > best_score:
                                best_score = coherence_result
                        with open(path, "wb") as f:
                            pickle.dump(self.coherence_dict, f)
        if return_best_model:
            #returns number of topics and the lda_model
            return self.coherence_dict["best_topic_no"], self.coherence_dict[
                "best_model"]

    def search_best_model_mallet(self,
                                 topic_list=frozenset(
                                     {2, 3, 4, 5, 10, 15, 20, 25}),
                                 save_best_model=True,
                                 save_models=False,
                                 return_best_model=False,
                                 coherence_scores=['c_v'],
                                 workers=None,
                                 coherence_workers=None,
                                 coherence_suffix=None,
                                 random_state=42,
                                 mallet_path="mallet-2.0.8/bin/mallet",
                                 iterations=1000):
        """

        :param topic_list:
        :param save_best_model:
        :param save_models:
        :param return_best_model:
        :param coherence_scores:
        :param workers:
        :param coherence_suffix:
        :param random_state:
        :param mallet_path:
        :param iterations:
        :return:
        """
        if coherence_suffix is None:
            path = f"{self.path}coherence_results_mallet"
        else:
            path = f"{self.path}coherence_results_mallet_{coherence_suffix}"
        if os.path.exists(path):
            print("coherence results found")
            with open(path, "rb") as f:
                self.coherence_dict = pickle.load(f)
        else:
            self.coherence_dict = {}
        if workers is None:
            workers = self.processes
        if coherence_workers is None:
            coherence_workers = self.processes
        if return_best_model and not save_best_model:
            raise Exception(
                "To return the best model, the parameter save_best_model has to be set to True."
            )
        if self.coherence_dict and save_best_model:
            try:
                best_score = self.coherence_dict['best_score']
            except KeyError:
                best_score = 0
        else:
            best_score = 0
        for no_topics in tqdm(topic_list,
                              desc="Calculating topic coherences: "):
            coherence_key = f"mallet-no={no_topics}-filter={self.filter_extremes_value}" \
                            f"-min_df={self.min_df}-max_df={self.max_df}-phrases={self.use_phrases}" \
                            f"-k_n={self.keep_n}-k_t={self.keep_tokens}"
            if coherence_key in self.coherence_dict.keys():
                print("coherence value found, skipping")
                continue
            else:
                self.create_mallet_lda_model(no_topics=no_topics,
                                             workers=workers,
                                             random_state=random_state,
                                             mallet_path=mallet_path,
                                             iterations=iterations)
                self.coherence_dict[coherence_key] = {}
                if save_models:
                    self.coherence_dict[coherence_key][
                        "lda_model"] = self.lda_model
                for coherence_score in coherence_scores:
                    coherence_model = self.calculate_coherence(
                        coherence_score=coherence_score,
                        workers=coherence_workers)
                    coherence_result = coherence_model.get_coherence()
                    if save_models:
                        self.coherence_dict[coherence_key][
                            "coherence_model"] = coherence_model
                    self.coherence_dict[coherence_key][
                        coherence_score] = coherence_result
                    if save_best_model and coherence_result > best_score:
                        self.coherence_dict["best_score"] = coherence_result
                        self.coherence_dict["best_model"] = self.lda_model
                        self.coherence_dict["best_topic_no"] = no_topics
                        self.coherence_dict[
                            "best_alpha"] = self.lda_model.alpha
                    if coherence_result > best_score:
                        best_score = coherence_result
                with open(path, "wb") as f:
                    pickle.dump(self.coherence_dict, f)
        if return_best_model:
            # returns the number of topics and the lda_model
            return self.coherence_dict["best_topic_no"], self.coherence_dict[
                "best_model"]

    def create_document_topic_df(self, model=None, no_topics=10):
        """
        Creates a dataframe containing the result of the LDA model for each document. Will set the topic with the
        highest share within the document as the dominant topic.
        :param model: LDA model to use for the calculation of the topic distribution of each document.
        :param no_topics: Number of topics in case no LDA model is provided.
        """
        if model is None:
            model = self.lda_model
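        # The LdaMallet wrapper does not implement get_document_topics, so it is first
        # converted to a native gensim LdaModel (note that this conversion is approximate).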
        if isinstance(model, LdaMallet):
            model = malletmodel2ldamodel(model)
        topic_result_list = []
        for doc in model.get_document_topics(bow=self.bag_of_words):
            temp_dict = {}
            for topic, probability in doc:
                temp_dict[topic] = probability
            topic_result_list.append(temp_dict)
        self.result_df = pd.DataFrame(data=topic_result_list,
                                      columns=range(model.num_topics))
        self.result_df = self.result_df.fillna(0)
        if self.document_ids is not None and not self.language_detection:
            self.result_df.index = self.document_ids
        elif self.document_ids is not None and self.language_detection:
            raise NotImplementedError(
                "Using document ids and language detection together is not implemented (yet)."
            )
        dominant_topic = np.argmax(self.result_df.values, axis=1)
        self.result_df['dominant_topic'] = dominant_topic

    def plot_document_topic_distribution(self):
        # TODO: log normalize
        if self.result_df is None:
            raise Exception(
                "Please create the topic distribution dataframe using the 'create_document_topic_df' "
                "method")
        counter = Counter(self.result_df.dominant_topic)
        topic_dict = OrderedDict(
            sorted(counter.items(), key=lambda x: x[1], reverse=True))
        plt.figure(figsize=(10, 6))
        g = sns.barplot(x=list(topic_dict.values()),
                        y=list(topic_dict.keys()),
                        order=list(topic_dict.keys()),
                        orient='h')
        g.set_ylabel("topic number")
        g.set_xlabel("count")
        plt.show()

    def evaluate_model(self, no_words=30):
        # TODO: update for gensim
        keywords = np.array(self.vectorizer.get_feature_names())
        topic_keywords = []
        for topic_weights in self.lda_model.components_:
            top_keyword_locations = (-topic_weights).argsort()[:no_words]
            topic_keywords.append(keywords.take(top_keyword_locations))
        self.word_topic_df = pd.DataFrame(
            topic_keywords, columns=[f"word_{x}" for x in range(no_words)])

    def evaluate_pyldavis(self, model=None, use_jupyter=None):
        """
        Method for a visual evaluation of the LDA topic model using pyldavis.
        :param model: LDA model that is to be evaluated. If 'None', it will use the last model that has been saved
        within the class.
        :param use_jupyter: sets how the pyLDAvis panel is displayed. If None (default), the method tries to detect
        whether it is run from a Jupyter notebook and chooses the display mode accordingly.
        :return:
        """
        if model is None:
            if self.lda_model is None:
                raise Exception(
                    "Please create a LDA model for evaluation before running this method."
                )
            model = self.lda_model
        if isinstance(model, LdaMallet):
            model = malletmodel2ldamodel(model)
        panel = pyLDAvis.gensim.prepare(model, self.bag_of_words, self.id2word)
        if use_jupyter is None:
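            # Heuristic: infer whether the code runs inside a Jupyter notebook from the name
            # of the launching executable stored in the '_' environment variable.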
            try:
                is_jupyter = os.environ['_'].split(
                    "/")[-1] == "jupyter-notebook"
                if is_jupyter:
                    pyLDAvis.enable_notebook()
            except KeyError:
                is_jupyter = False
            if is_jupyter:
                pyLDAvis.display(panel)
            else:
                pyLDAvis.show(panel)
        else:
            if use_jupyter:
                pyLDAvis.enable_notebook()
                pyLDAvis.display(panel)
            elif not use_jupyter:
                pyLDAvis.show(panel)

    def print_bow(self, doc_positions):
        print([[(self.id2word[token_id], freq) for token_id, freq in doc]
               for doc in compress(self.bag_of_words, doc_positions)])

    def save_model(self, path):
        self.lda_model.save(path)

    def load_model(self, path):
        self.lda_model = LdaMulticore.load(path)

    def save_dict(self, path):
        self.id2word.save(path)
        print("dict saved")

    def load_dict(self, path):
        self.id2word = corpora.Dictionary.load(path)
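
# A minimal, hypothetical usage sketch of the class above. The class name and constructor
# arguments are not shown in this excerpt, so "TopicModeler" and its arguments are
# placeholders only:
#
#   tm = TopicModeler(...)
#   best_no, best_model = tm.search_best_model_mallet(topic_list={5, 10, 20},
#                                                      return_best_model=True)
#   tm.create_document_topic_df(model=best_model)
#   tm.plot_document_topic_distribution()
#   tm.evaluate_pyldavis(model=best_model)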
Example #17
import pandas as pd
from gensim.models.wrappers import LdaMallet
from gensim.models.wrappers.ldamallet import malletmodel2ldamodel
from gensim.models import CoherenceModel

path_to_mallet_binary = 'd:/mallet-2.0.8/bin/mallet'
output_path = 'd:/code/gc_text_analysis/mallet_output/'
num_topics = 140
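# bow_docs (bag-of-words corpus), dictionary (gensim Dictionary) and ngram_docs
# (tokenized documents) are assumed to have been built in an earlier, omitted part
# of this example.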
model = LdaMallet(path_to_mallet_binary,
                  corpus=bow_docs,
                  workers=4,
                  iterations=2000,
                  num_topics=num_topics,
                  id2word=dictionary,
                  prefix=output_path)

model.save('gc_lda_model.pkl')

dictionary.id2token = dict((v, k) for k, v in dictionary.token2id.items())
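# dictionary.dfs maps each token id to its document frequency (the number of documents
# containing that token), so the 'count' column below is a document frequency rather
# than a raw term count.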
words_freq = [(dictionary.id2token[id], cnt)
              for id, cnt in dictionary.dfs.items()]
words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
words_freq = pd.DataFrame(words_freq, columns=['word', 'count'])

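# The 'c_v' coherence measure needs the tokenized texts (ngram_docs) in addition to the
# dictionary; it cannot be computed from the bag-of-words corpus alone.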
coherence_model_lda = CoherenceModel(model=model,
                                     texts=ngram_docs,
                                     dictionary=dictionary,
                                     coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()

topics = model.show_topics(num_topics=num_topics,
                           num_words=10,