Example #1
def topic_model_coherence_generator(texts, start_topic_count, end_topic_count,
                                    step):
    models = []
    coherence_scores = []
    for topic_nums in tqdm(range(start_topic_count, end_topic_count + 1,
                                 step)):
        dictionary_path = 'dictionary_mallet122_' + str(
            topic_nums) + '.dictionary'
        dictionary = corpora.Dictionary.load(dictionary_path)
        corpus = [dictionary.doc2bow(text) for text in texts]
        mallet_path = 'C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\122文章\\mallet模型\\dictionary_mallet122_' + str(
            topic_nums) + '.model'
        mallet_lda_model = LdaMallet.load(mallet_path)
        cv_coherence_model_mallet_lda = CoherenceModel(model=mallet_lda_model,
                                                       corpus=corpus,
                                                       texts=texts,
                                                       dictionary=dictionary,
                                                       coherence='c_v')
        coherence_score = cv_coherence_model_mallet_lda.get_coherence()
        coherence_scores.append(coherence_score)
    return coherence_scores
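A minimal usage sketch for the helper above, assuming `texts` is the tokenized corpus and that the dictionary/model files referenced inside the function already exist on disk; the topic-count range is illustrative only.

# Illustrative only: scan topic counts 5..50 in steps of 5 and pick the best c_v score.
coherence_scores = topic_model_coherence_generator(texts, start_topic_count=5,
                                                   end_topic_count=50, step=5)
topic_counts = list(range(5, 51, 5))
best_count = topic_counts[coherence_scores.index(max(coherence_scores))]
print('Best number of topics by c_v coherence:', best_count)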
Example #2
def calculate_entropy_mallet_models():  # output to csv files.
    with open(corpus_path, 'rb') as f:
        corpus = pickle.load(f)

    index = 0
    dataset = pd.read_csv(dataset_csv_path)
    for model_path in models_path:
        lda_model = LdaMallet.load(model_path)
        lda_model = models.wrappers.ldamallet.malletmodel2ldamodel(
            lda_model, iterations=iteration)

        df = pd.DataFrame()
        pbar = tqdm.tqdm(total=len(lda_model[corpus]))

        for i, row in enumerate(lda_model[corpus]):
            topic_dist = sorted(row, key=lambda x: (x[1]), reverse=True)
            rs_string = ''
            topic_entropy = 0
            for topic in topic_dist:
                rs_string = rs_string + 'Topic ' + str(topic[0] +
                                                       1) + ': ' + str(
                                                           topic[1]) + '; '
                # Shannon entropy term: -p * log2(p)
                topic_entropy = topic_entropy + (-topic[1] * math.log2(topic[1]))
            df = df.append(pd.Series([
                str(i), dataset['Submission_Num'][i], rs_string,
                str(topic_entropy), dataset['Submission_Text'][i]
            ]),
                           ignore_index=True)
            pbar.update(1)
        df.columns = [
            'Document_No', 'Submission_Num', 'Probabilities', 'Entropy',
            'Submission_Text'
        ]

        csv_file_result_path = f'./turn-in/{bigram_threshold}/model_entropy/{num_topics[index]}.csv'
        index = index + 1
        create_file(csv_file_result_path)
        df.to_csv(csv_file_result_path, index=False)
        pbar.close()
Example #3
def generate_topic_weight_terms():
    with open(corpus_path, 'rb') as f:
        corpus = pickle.load(f)

    pbar = tqdm.tqdm(total=len(models_path))
    i = 0
    for model_path in models_path:
        lda_model = LdaMallet.load(model_path)
        lda_model = models.wrappers.ldamallet.malletmodel2ldamodel(
            lda_model, iterations=iteration)
        df = topics_proportion(lda_model=lda_model,
                               corpus=corpus,
                               num_topics=num_topics[i])
        df.sort_values(by=['Topic'], ascending=True,
                       inplace=True)  # Sort columns in ascending order

        output_path = f'./turn-in/{bigram_threshold}/topic_proportion_terms/{num_topics[i]}.csv'
        create_file(output_path)
        df.to_csv(output_path, index=False)
        pbar.update(1)
        i = i + 1
    pbar.close()
def format_topics_sentences_mallet(ldamodel:LdaMallet, corpus, texts):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num, topn=8)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(
                    pd.Series([int(topic_num), round(prop_topic, 4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return (sent_topics_df)
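A short, hedged usage sketch for `format_topics_sentences_mallet`; `mallet_lda_model`, `corpus`, and `texts` are assumed to come from the surrounding examples.

# Illustrative only: build the per-document dominant-topic table and name the text column.
df_dominant = format_topics_sentences_mallet(ldamodel=mallet_lda_model,
                                             corpus=corpus, texts=texts)
df_dominant.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords', 'Text']
print(df_dominant.head())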
Example #5
    def setUp(self):
        # Suppose given below are the topics which two different LdaModels come up with.
        # `topics1` is clearly better as it has a clear distinction between system-human
        # interaction and graphs. Hence both the coherence measures for `topics1` should be
        # greater.
        self.topics1 = [['human', 'computer', 'system', 'interface'],
                        ['graph', 'minors', 'trees', 'eps']]
        self.topics2 = [['user', 'graph', 'minors', 'system'],
                        ['time', 'graph', 'survey', 'minors']]
        self.ldamodel = LdaModel(corpus=self.corpus,
                                 id2word=self.dictionary,
                                 num_topics=2,
                                 passes=0,
                                 iterations=0)

        mallet_home = os.environ.get('MALLET_HOME', None)
        self.mallet_path = os.path.join(mallet_home, 'bin',
                                        'mallet') if mallet_home else None
        if self.mallet_path:
            self.malletmodel = LdaMallet(mallet_path=self.mallet_path,
                                         corpus=self.corpus,
                                         id2word=self.dictionary,
                                         num_topics=2,
                                         iterations=0)

        vw_path = os.environ.get('VOWPAL_WABBIT_PATH', None)
        if not vw_path:
            logging.info(
                "Environment variable 'VOWPAL_WABBIT_PATH' not specified, skipping sanity checks for LDA Model"
            )
            self.vw_path = None
        else:
            self.vw_path = vw_path
            self.vwmodel = LdaVowpalWabbit(self.vw_path,
                                           corpus=self.corpus,
                                           id2word=self.dictionary,
                                           num_topics=2,
                                           passes=0)
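To make the comment at the top of `setUp` concrete, here is a hedged sketch of scoring the two hand-built topic lists directly with `CoherenceModel`, no trained model required; `topics1`, `topics2`, `texts`, and `dictionary` stand in for the test fixtures above.

# Illustrative only: c_v coherence for each candidate topic list.
cm_good = CoherenceModel(topics=topics1, texts=texts, dictionary=dictionary, coherence='c_v')
cm_bad = CoherenceModel(topics=topics2, texts=texts, dictionary=dictionary, coherence='c_v')
print(cm_good.get_coherence(), cm_bad.get_coherence())  # the first score is expected to be higher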
def build_lda_model(dictionary, corpus, lda_params, use_mallet=True):
    num_topics, alpha, beta = lda_params

    if use_mallet:
        mallet_path = 'C:/mallet/mallet-2.0.8/bin/mallet.bat'
        lda_model = LdaMallet(mallet_path,
                              corpus=corpus,
                              id2word=dictionary,
                              num_topics=num_topics,
                              alpha=alpha)
    else:
        lda_model = LdaModel(corpus=corpus,
                             id2word=dictionary,
                             num_topics=num_topics,
                             random_state=33,
                             update_every=1,
                             chunksize=100,
                             passes=10,
                             alpha=alpha,
                             eta=beta,
                             per_word_topics=True)

    return lda_model
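A brief, hedged usage sketch for `build_lda_model`; the `(num_topics, alpha, beta)` tuple and the prebuilt `dictionary`/`corpus` are assumptions, and note that `beta` is only used on the non-MALLET branch.

# Illustrative only: train a 10-topic model through the MALLET wrapper.
lda_params = (10, 50, 0.01)  # (num_topics, alpha, beta); beta is ignored when use_mallet=True
model = build_lda_model(dictionary, corpus, lda_params, use_mallet=True)
for topic in model.print_topics(num_topics=10, num_words=10):
    print(topic)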
Example #7
    def _make_mallet_model(self, corpus_filepath, path_to_mallet, remove_stopwords, corpus_language, num_topics, **kwargs):
        """Returns a gensim-created topic model (class LdaMallet), and assigns class
        attributes _docs (an OrderedDict containing the preprocessed corpus documents)
        and _vocabulary (the corpus vocabulary (iter of str)). This function lowercases
        all words in the corpus, and removes stopwords if remove_stopwords is True.
        The keys for the document dictionary are unique document ids of the format
        "doc<i>" where <i> is the number of the document in the corpus."""
        munged_corpus = munge.corpus_to_doc_tokens(corpus_filepath)

        # make corpus lowercase, remove stopwords
        if remove_stopwords:
            stop_words = stopwords.words(corpus_language)
            prepped_corpus = [
                [word.lower() for word in doc if word.lower() not in stop_words] for doc in munged_corpus]
        else:
            prepped_corpus = [[word.lower() for word in doc]
                              for doc in munged_corpus]
        # TODO (7/12/19 faunam): make lowercasing corpus optional

        id_to_word = corpora.Dictionary(prepped_corpus)
        term_document_frequency = [
            id_to_word.doc2bow(doc) for doc in prepped_corpus]
        mallet_model = LdaMallet(path_to_mallet, corpus=term_document_frequency,
                                 id2word=id_to_word, num_topics=num_topics, **kwargs)

        docs = OrderedDict(("doc" + str(i), " ".join(doc))
                           for i, doc in enumerate(prepped_corpus))
        full_corpus = munge.corpus_to_documents(corpus_filepath)
        full_docs = OrderedDict(("doc" + str(i), doc)
                                for i, doc in enumerate(full_corpus))

        self._docs = docs
        self._full_docs = full_docs
        self._vocabulary = [word for word in id_to_word.values()]

        return mallet_model
def main():
    num_topics = 10
    #doc_topics_path='C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\151文章\\mallet模型\\10_3_doctopics.txt'
    MALLET_PATH = os.path.join(r"D:\Mallet", "mallet-2.0.8", "bin",
                               "mallet.bat")  # r"D:\Mallet\mallet-2.0.8\bin"
    texts = wenzhang_Lemmatizer1.texts2
    dictionary = corpora.Dictionary(texts)
    dictionary.save('dictionary_mallet_10_3.dictionary')
    #dictionary = corpora.Dictionary.load('dictionary_mallet_10_3.dictionary')
    word_id = dictionary.token2id
    corpus = [dictionary.doc2bow(text) for text in texts]
    # corpora.MmCorpus.serialize('corpus_mallet_10_3.mm', corpus)  # save the corpus
    # corpus = corpora.MmCorpus('corpus_wenzhang.mm')  # load it
    # print(os.path.abspath('corpus.mm'))
    mallet_lda_model = LdaMallet(mallet_path=MALLET_PATH,
                                 corpus=corpus,
                                 num_topics=num_topics,
                                 id2word=dictionary)
    mallet_lda_model.save(
        'C:\\Users\\asus\\Desktop\\测试\\model\\mallet_lda_model_10_3.model')
    #mallet_lda_model = LdaMallet.load('C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\151文章\\mallet模型\\mallet_lda_model_10_3.model')
    topic_words20 = mallet_lda_model.show_topics(num_topics=num_topics,
                                                 num_words=20)
    # print(topic_words20)
    writetopic_wordToExcleFile(
        topic_words20,
        'C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\151文章\\topic_words20_10_3.xls'
    )
    topic_words = mallet_lda_model.get_topics()
    print(len(topic_words), len(topic_words[0]))
    doc_topics = txt_to_numpy(mallet_lda_model.fdoctopics())  #doc_topics_path
    #print(mallet_lda_model.fdoctopics())
    writedoc_topicToExcleFile(
        doc_topics,
        'C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\151文章\\doc_topics20_10_3'
    )
    return texts, word_id, topic_words, doc_topics, num_topics
Example #9
count_vectorizer = CountVectorizer(stop_words=stopwords.words('english'))
count_vectorizer.fit(docs)
doc_word = count_vectorizer.transform(docs).transpose()
corpus = matutils.Sparse2Corpus(doc_word)

# vocab creation
word2id = dict(count_vectorizer.vocabulary_)  # token -> id
id2word = dict((v, k) for k, v in count_vectorizer.vocabulary_.items())  # id -> token
dictionary = corpora.Dictionary()
dictionary.id2token = id2word
dictionary.token2id = word2id

# topic modeling
ldamallet = LdaMallet(MALLET_PATH,
                      corpus=corpus,
                      num_topics=num_topics,
                      id2word=id2word,
                      iterations=400)

# save topic model to file
topic_file = open("english_topics_{}.pkl".format(sys.argv[1]), "wb")
pickle.dump(ldamallet.show_topics(formatted=False, num_topics=num_topics),
            topic_file)
topic_file.close()

# get NPMI coherence
coherence = CoherenceModel(model=ldamallet,
                           texts=texts,
                           dictionary=dictionary,
                           coherence='c_npmi')
print("coherence:", coherence.get_coherence())
docs_train = docs[:2000]
docs_test = docs[2000:]
dictionary = corpora.Dictionary(docs_train)

# Filter terms that occur in more than 50% of docs
dictionary.filter_extremes(no_above=0.5)

# Convert to document term matrix (corpus)
doc_term_mat_train = [dictionary.doc2bow(doc) for doc in docs_train]
doc_term_mat_test = [dictionary.doc2bow(doc) for doc in docs_test]

path_to_mallet_binary = r'C:\mallet\bin\mallet'
if __name__ == "__main__":
    model = LdaMallet(path_to_mallet_binary,
                      corpus=doc_term_mat_train,
                      alpha=5,
                      num_topics=10,
                      id2word=dictionary,
                      optimize_interval=50)

    topics = model.print_topics()
    for topic in topics:
        print(topic)

    # Compute Coherence Score for base model
    coherence_model_lda = CoherenceModel(model=model,
                                         corpus=doc_term_mat_train,
                                         texts=docs_train,
                                         dictionary=dictionary,
                                         coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    gensim_model = ldamallet.malletmodel2ldamodel(model)
Example #11
hybrid_weights.extend(hybrid_weights)
hybrid_weights = np.array(hybrid_weights)
# Convert to probabilities
hybrid_weights = hybrid_weights / hybrid_weights.sum()

# GLOBAL num_items_to_pick (with replacement) -- high number: one million
num_picks = 1000000

# LOAD MODELS
loadmodstart = time()
id2word_dictionary = corpora.Dictionary.load(
    '/home/ashwath/Programs/ACLAAn/LDA/aclmag.dict')
corpus = corpora.MmCorpus(
    '/home/ashwath/Programs/ACLAAn/LDA/aclmag_bow_corpus.mm')
try:
    ldamallet = LdaMallet.load(
        '/home/ashwath/Programs/ACLAAn/LDA/lda_model.model')
    vec_bow_test = id2word_dictionary.doc2bow(['test'])
    vec_ldamallet = ldamallet[vec_bow_test]
except subprocess.CalledProcessError:
    print("LDA MALLET COULDN'T READ INSTANCE FILE. USING NORMAL LDA INSTEAD")
    ldamallet = LdaModel.load(
        '/home/ashwath/Programs/ACLAAn/LDA/ldanormal_acl.model')

#index = similarities.MatrixSimilarity(ldamallet[corpus])
#index.save("simIndex.index")
malletindex = similarities.MatrixSimilarity.load(
    '/home/ashwath/Programs/ACLAAn/LDA/simIndexAcl.index')
with open(
        '/home/ashwath/Programs/ACLAAn/LDA/docid_to_magid_training_acl.pickle',
        'rb') as pick:
    docid_to_magid = pickle.load(pick)
Example #12
DICT_PATH  = 'docs.dict'
MODEL_PATH = 'docs.model'

raw_corpus = ["Human machine interface for lab abc computer applications",
              "A survey of user opinion of computer system response time",
              "The EPS user interface management system",
              "System and human system engineering testing of EPS",
              "Relation of user perceived response time to error measurement",
              "The generation of random binary unordered trees",
              "The intersection graph of paths in trees",
              "Graph minors IV Widths of trees and well quasi ordering",
              "Graph minors A survey"]
docs = [doc.split() for doc in raw_corpus]

if exists(MODEL_PATH):
    print('Testing...\n')
    dictionary = corpora.Dictionary.load(DICT_PATH)
    lda = LdaMallet.load(MODEL_PATH)
    for doc in docs:
        topics = lda[dictionary.doc2bow(doc)]
        print(topics, doc)
else:
    print('Training...\n')
    dictionary = corpora.Dictionary(docs)
    dictionary.save(DICT_PATH)
    corpus = [dictionary.doc2bow(text) for text in docs]

    lda = LdaMallet(MALLET_PATH, corpus=corpus,
                    num_topics=3, workers=60, id2word=dictionary, iterations=50, prefix=PREFIX)
    lda.save(MODEL_PATH)
Example #13
# --+ write plot to file
out_f = os.path.join("analysis", "topicModeling", ".output", "pr_coherence_scores.pdf")
plt.savefig(out_f, transparent=True, bbox_inches="tight", pad_inches=0)


# %% topic model estimation
"""
I focus on two models:
    - 8 topics, ~ local optimum
    - 30 topics, ~ global optimum
"""

# model with 8 topics
# --+ estimate model
lda_8 = LdaMallet(
    mallet_path, corpus=corpus, id2word=dictionary, num_topics=8, random_seed=123
)
# --+ print topics (20 words per topic)
lda_8.print_topics(num_topics=8, num_words=20)
# --+ translate topic modeling outcome
lda_8 = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(lda_8)

# --+ term-to-topic probabilities (10 words per topic)
top_terms_line = lda_8.show_topics(num_topics=8, num_words=10)
# ----+ rearrange data on top 10 terms per topic
top_terms_m = []
for i in top_terms_line:
    topic_num = i[0]
    prob_terms = i[1].split("+")
    for term_sort, term in enumerate(prob_terms):
        weight = float(term.split("*")[0])
def load_mallet_model(artefacts_path='./artefacts', suffix=''):
    model_path = str(Path(artefacts_path) / 'model')
    if suffix:
        model_path = model_path + f'_{suffix}'
    model = LdaMallet.load(model_path)
    return model
try:
    corpus = corpora.MmCorpus('mag_bow_corpus.mm')
except FileNotFoundError:
    corpus = [
        id2word_dictionary.doc2bow(textlist) for textlist in tqdm(data_stemmed)
    ]
    print("Doc2Bow corpus created")
    # TOO BIG TO SERIALIZE
    # Save the Dict and Corpus
    try:
        corpora.MmCorpus.serialize('mag_bow_corpus.mm',
                                   corpus)  # save corpus to disk
    except OverflowError:
        # Don't save corpus, call LDA directly
        print("Overflow while saving corpus, skip and train.")
        ldamallet = LdaMallet(mallet_path,
                              corpus=corpus,
                              num_topics=300,
                              id2word=id2word_dictionary)
        print('LDA Model trained')

        try:
            ldamallet.save('ldamallet_mag.model')
        except OverflowError:
            print("Trying to pickle model using protocol 4")
            with open('ldamallet_mag.model', 'wb') as pick:
                pick.dump(ldamallet, pick, protocol=pickle.HIGHEST_PROTOCOL)
        print("Lda model saved to disk")

        # Show Topics
        pprint(ldamallet.show_topics(formatted=False))

        # Compute Coherence Score
Example #16
def main():
    logger.info('-' * 80)
    logger.info('Loading data')
    corpus = load_corpus(args.dataset_dir)

    logger.info('-' * 80)
    logger.info('Make dictionary')

    dictionary = Dictionary(corpus)
    # Filter out words that occur in fewer than 20 documents, or in more than 50% of the documents.
    dictionary.filter_extremes(no_below=TOKEN_MIN_DOCS,
                               no_above=TOKEN_MAX_DOCS_FRAC)

    vocab_path = os.path.join(args.dump_dir, 'vocab.txt')
    with open(vocab_path, 'w') as f:
        f.write("\n".join(dictionary.itervalues()) + '\n')

    # Bag-of-words representation of the documents.
    bow_corpus = [dictionary.doc2bow(doc) for doc in corpus]

    logger.info(f'Number of unique tokens: {len(dictionary)}')
    logger.info(f'Number of documents: {len(bow_corpus)}')

    logger.info('-' * 80)
    logger.info('Training model')

    callbacks = []
    if 'perplexity' in args.callbacks:
        perplexity_metric = PerplexityMetric(corpus=bow_corpus)
        callbacks.append(perplexity_metric)
    if 'coherence' in args.callbacks:
        coherence_metric = CoherenceMetric(texts=corpus,
                                           dictionary=dictionary,
                                           coherence=args.coherence,
                                           topn=args.topn)
        callbacks.append(coherence_metric)

    model_path = os.path.join(args.dump_dir, 'lda.model')
    if args.model == 'lda':
        model = LdaModel(corpus=bow_corpus,
                         num_topics=args.num_topics,
                         id2word=dictionary,
                         passes=args.num_epochs,
                         update_every=1,
                         eval_every=args.eval_every,
                         iterations=args.iterations,
                         alpha='auto',
                         eta='auto',
                         chunksize=args.batch_size,
                         callbacks=callbacks,
                         log_dir=args.log_dir,
                         model_dir=model_path)
    elif args.model == 'multicore_lda':
        model = LdaMulticore(corpus=bow_corpus,
                             num_topics=args.num_topics,
                             id2word=dictionary,
                             passes=args.num_epochs,
                             eval_every=args.eval_every,
                             iterations=args.iterations,
                             eta='auto',
                             chunksize=args.batch_size,
                             workers=args.workers,
                             callbacks=callbacks,
                             log_dir=args.log_dir,
                             model_dir=model_path)
    elif args.model == 'mallet_lda':
        model = LdaMallet(args.mallet_path,
                          corpus=bow_corpus,
                          num_topics=args.num_topics,
                          id2word=dictionary,
                          workers=args.workers,
                          prefix=os.path.join(args.dump_dir, 'mallet_'),
                          iterations=args.iterations)
    elif args.model == 'gensim_lda':
        model = GensimLdaModel(corpus=bow_corpus,
                               num_topics=args.num_topics,
                               id2word=dictionary,
                               passes=args.num_epochs,
                               update_every=1,
                               eval_every=args.eval_every,
                               iterations=args.iterations,
                               alpha='auto',
                               eta='auto',
                               chunksize=args.batch_size)
    elif args.model == 'gensim_multicore_lda':
        model = GensimLdaMulticore(corpus=bow_corpus,
                                   num_topics=args.num_topics,
                                   id2word=dictionary,
                                   passes=args.num_epochs,
                                   eval_every=args.eval_every,
                                   iterations=args.iterations,
                                   eta='auto',
                                   chunksize=args.batch_size,
                                   workers=args.workers)

    model.save(model_path)

    logger.info('-' * 80)

    if args.model != 'mallet_lda':
        top_topics = model.top_topics(texts=corpus, coherence='c_v')
        # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
        avg_topic_coherence = sum([t[1] for t in top_topics]) / args.num_topics
        logger.info(f'Average topic coherence: {avg_topic_coherence:.4f}.')
        for topic_idx, (topic_words, topic_score) in enumerate(top_topics):
            logger.info(f'Topic #{topic_idx} ({topic_score:.4f}): ' +
                        " ".join((t[1] for t in topic_words[:5])))
        logger.info(
            f'Perplexity: {np.exp2(-model.log_perplexity(bow_corpus)):.4f}')
    else:
        pprint(model.show_topics(formatted=False))

    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=model,
                                         texts=corpus,
                                         dictionary=dictionary,
                                         coherence=args.coherence,
                                         topn=args.topn)
    coherence_lda = coherence_model_lda.get_coherence()
    logger.info(f'Coherence : {coherence_lda:.4f}')
Example #17
def extract_features(max_documents=50000000,
                     max_words_per_doc=50000000,
                     incl_tf=True,
                     incl_df=True,
                     incl_graph=True,
                     incl_w2v=True,
                     incl_topic_model=True,
                     incl_atm=True):
    
    ######### SIMPLE FREQUENCY MEASURES ######################################################
    if incl_df or incl_tf or incl_graph:
        doc_cnt = max_documents
        # set containers:
        tf, df, network = Counter(), Counter(), nx.Graph()
        doc_ner_idx = {}
        dir_ner_iterator = DirectoryIterator(path_pattern='../workspace/wikified_periodicals/*.wikified',
                                         max_documents=max_documents,
                                         max_words_per_doc=max_words_per_doc,
                                         get='wiki')
        dir_filename_iterator = DirectoryIterator(path_pattern='../workspace/wikified_periodicals/*.wikified',
                                         max_documents=max_documents,
                                         max_words_per_doc=max_words_per_doc,
                                         get='filename')
        for filename, words in zip(dir_filename_iterator, dir_ner_iterator):
            # count the ners:
            ner_cnt = Counter()
            ner_cnt.update(words)
            if ner_cnt:
                # collect which ners appear in which doc:
                doc_ner_idx[os.path.basename(filename)] = set([n for n in ner_cnt])
                # update global tf and df:
                for k, v in ner_cnt.items():
                    tf[k] += v
                    df[k] += 1
                # update nodes in network:
                for ner in ner_cnt:
                    if ner not in network:
                        network.add_node(ner)
                # update edges in network:
                for ner1, ner2 in combinations(ner_cnt, 2):
                    try:
                        network[ner1][ner2]['weight'] += 1
                    except KeyError:
                        network.add_edge(ner1, ner2, weight=1)
        
        # dump for reuse:
        pickle.dump(tf, open('../workspace/tf.m', 'wb'))
        pickle.dump(df, open('../workspace/df.m', 'wb'))
        pickle.dump(doc_ner_idx, open('../workspace/doc_ner_idx.m', 'wb'))
        pickle.dump(network, open('../workspace/nx.m', 'wb'))
        
        # scale network values:
        max_weight = float(max([network[n1][n2]['weight']\
                            for n1, n2 in network.edges_iter()]))
        for n1, n2 in network.edges_iter():
            network[n1][n2]['weight'] /= max_weight
        nx.write_gexf(network,
                      '../workspace/dbnl_network.gexf',
                      prettyprint=True)
    
    ######### WORD2VEC MODEL ######################################################
    if incl_w2v:
        # build w2v model:
        dir_w2v_iterator = DirectoryIterator(path_pattern='../workspace/wikified_periodicals/*.wikified',
                                         max_documents=max_documents,
                                         max_words_per_doc=max_words_per_doc,
                                         get='w2v')
        w2v_model = Word2Vec(dir_w2v_iterator, window=15, min_count=10,
                                         size=150, workers=10, negative=5)
        w2v_model.init_sims(replace=True)
        w2v_model.save(os.path.abspath('../workspace/w2v_model.m'))

    ######### STANDARD TOPIC MODEL ######################################################
    if incl_topic_model:
        # build vocab for lda:
        vocab_lda_iterator = DirectoryIterator(path_pattern='../workspace/wikified_periodicals/*.wikified',
                                         max_documents=max_documents,
                                         max_words_per_doc=max_words_per_doc,
                                         get='lda_vocab')
        lda_dict = corpora.Dictionary(vocab_lda_iterator)
        lda_dict.filter_extremes(no_below=25, no_above=0.5, keep_n=5000)
        
        # build lda model:
        dir_lda_iterator = DirectoryIterator(path_pattern='../workspace/wikified_periodicals/*.wikified',
                                         max_documents=max_documents,
                                         max_words_per_doc=max_words_per_doc,
                                         get='lda',
                                         lda_dict=lda_dict)
        lda_workspace_path = '../workspace/mallet_output/'
        if not os.path.isdir(lda_workspace_path):
            os.mkdir(lda_workspace_path)
        mallet_path = '/home/mike/GitRepos/dbnl/code/mallet-2.0.8RC2/bin/mallet'
        lda_model = LdaMallet(mallet_path, dir_lda_iterator, num_topics=150,
                                       id2word=lda_dict, iterations=1900,
                                       prefix=lda_workspace_path)
        lda_model.save('../workspace/lda_model.m')

    ######### AUTHOR TOPIC MODEL ######################################################
    if incl_atm:
        # build vocab for lda:
        vocab_lda_iterator = DirectoryIterator(path_pattern='../workspace/wikified_periodicals/*.wikified',
                                         max_documents=max_documents,
                                         max_words_per_doc=max_words_per_doc,
                                         get='lda_vocab')
        lda_dict = corpora.Dictionary(vocab_lda_iterator)
        lda_dict.filter_extremes(no_below=25, no_above=0.5, keep_n=5000)
        lda_dict.compactify()
        atm_vocab = []
        for i, w in lda_dict.items():
            atm_vocab.append(w)
        print(len(atm_vocab), 'vocab')
        atm_vocab = tuple(atm_vocab)
        corpus, doc_author = [], []
        for filename in sorted(glob.glob('../workspace/wikified_periodicals/*.wikified')):
            doc_words, auth_set = [], set()
            max_documents -= 1
            if max_documents % 100 == 0:
                print('\t-', max_documents, 'to go')
            if max_documents <= 1:
                break
            word_cnt = max_words_per_doc
            for line in codecs.open(filename, 'r', encoding='utf8'):
                comps = line.strip().split('\t')
                if comps:
                    idx, token, lemma, pos, pos_conf, ner, wiki = comps
                    if wiki != 'X':
                        auth_set.add(wiki)
                    elif pos.startswith(('N(', 'ADJ(')):
                        try:
                            doc_words.append(atm_vocab.index(token.lower()))
                        except:
                            pass
                word_cnt -= 1
                if word_cnt <= 0:
                    break
            if auth_set and doc_words:
                corpus.append(sorted(doc_words))
                doc_author.append(sorted(list(auth_set)))
        atm_author_idx = {}
        for i1, authors in enumerate(doc_author):
            for i2, auth in enumerate(authors):
                if auth not in atm_author_idx:
                    atm_author_idx[auth] = len(atm_author_idx)
                doc_author[i1][i2] = atm_author_idx[auth]
        n_topic = 30
        atm_model = AuthorTopicModel(n_doc=len(corpus),
                                     n_voca=len(atm_vocab),
                                     n_topic=n_topic,
                                     n_author=len(atm_author_idx))
        atm_model.fit(corpus, doc_author, max_iter=10)
        for k in range(n_topic):
            top_words = get_top_words(atm_model.TW, atm_vocab, k, 10)
            print('topic ', k , ','.join(top_words))
        author_id = 7
        fig = plt.figure(figsize=(12,6))
        plt.bar(range(n_topic), atm_model.AT[author_id]/np.sum(atm_model.AT[author_id]))
        #plt.title(author_idx[author_id])
        plt.xticks(np.arange(n_topic)+0.5, ['\n'.join(get_top_words(atm_model.TW, atm_vocab, k, 10)) for k in range(n_topic)])
        #plt.show()
        plt.savefig('atm1.pdf')
        pickle.dump(atm_vocab, open('../workspace/atm_vocab.m', 'wb'))
        pickle.dump(atm_model, open('../workspace/atm_model.m', 'wb'))
        pickle.dump(atm_author_idx, open('../workspace/atm_author_idx.m', 'wb'))
Example #18
    # Create the vocabulary
    for ii in files:
        doc_scanner.scan(tokenize_file(ii))

    # Initialize the documents
    docs = doc_scanner.docs
    dictionary = Dictionary(docs)
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    # start = time.time()
    # gensim_lda = gensim.models.LdaModel(corpus, id2word=dictionary, num_topics=args.num_topics, iterations=args.num_iterations)
    # time_took = time.time() - start
    # report(gensim_lda.print_topics(num_topics=10, num_words=50), filename="gensim", limit=50)
    # print(("Total time it took: %0.5f seconds" % (time_took)))

    mallet_file = "/home/jihwangk/Desktop/GitDir/Mallet/bin/mallet"
    # start = time.time()
    mallet_lda = LdaMallet(mallet_file,
                           corpus=corpus,
                           num_topics=args.num_topics,
                           id2word=dictionary,
                           iterations=args.num_iterations)
    # time_took = time.time() - start
    mallet_lda = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(
        mallet_lda, iterations=args.num_iterations)
    report(mallet_lda.print_topics(num_topics=10, num_words=50),
           filename="mallet",
           limit=50)
    # print(("Total time it took: %0.5f seconds" % (time_took)))
Example #19
def topic_model(W, K, N):
    """

    :param w: min_number of words per segment
    :param k: number of topics
    :param n: number of iterations
    :return:
    """
    print("\n-----LDA CONCEPT DETECITON-----")
    print('MODEL:', hash((W, K, N)), W, K, N)
    corpus = load_from_csv(CORPUS_PATH)

    # Create CountVectorizer to get Document-Term matrix

    stop_words = load_stop_words("../data/stopwords-fr.txt")
    vectorizer = CountVectorizer(lowercase=True,
                                 max_df=MAX_DF,
                                 min_df=MIN_DF,
                                 token_pattern=r"(?u)\b\w\w\w+\b")

    proc_corpus, proc_corpus_text_only = remove_short_segs(
        corpus, vectorizer, W)
    proc_corpus_text_only = [seg.split() for seg in proc_corpus_text_only]
    proc_stop_words = []

    for i in range(len(proc_corpus_text_only)):
        proc_stop_words.append([])
        for j in range(len(proc_corpus_text_only[i])):
            if proc_corpus_text_only[i][j] not in stop_words and len(
                    proc_corpus_text_only[i][j]) >= 3:
                proc_stop_words[i].append(proc_corpus_text_only[i][j])

    # train vectorizer on corpus
    print('Corpus Size:', len(proc_stop_words))
    id2word = Dictionary(proc_stop_words)
    corp = [id2word.doc2bow(text) for text in proc_stop_words]

    # print("Number of Features: " + str(len(feature_names)))

    # redirect stdout for capturing LL/token

    # initialize model
    path_to_mallet_binary = "../mallet_git/bin/mallet"

    mallet_model = LdaMallet(path_to_mallet_binary,
                             corpus=corp,
                             num_topics=K,
                             id2word=id2word,
                             optimize_interval=20,
                             random_seed=9,
                             iterations=N)

    u_mass = CoherenceModel(model=mallet_model,
                            texts=proc_stop_words,
                            corpus=corp,
                            coherence='u_mass')
    c_v = CoherenceModel(model=mallet_model,
                         texts=proc_stop_words,
                         corpus=corp,
                         coherence='c_v')
    c_uci = CoherenceModel(model=mallet_model,
                           texts=proc_stop_words,
                           corpus=corp,
                           coherence='c_uci')
    c_npmi = CoherenceModel(model=mallet_model,
                            texts=proc_stop_words,
                            corpus=corp,
                            coherence='c_npmi')

    u_mass_val = u_mass.get_coherence()
    c_v_val = c_v.get_coherence()
    c_uci_val = c_uci.get_coherence()
    c_npmi_val = c_npmi.get_coherence()

    print('U_MASS_VAL:', u_mass_val)
    print('C_V_VAL:', c_v_val)
    print('C_UCI_VAL:', c_uci_val)
    print('C_NPMI_VAL:', c_npmi_val)

    return 0
#Approach 2

!wget http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
!unzip mallet-2.0.8.zip

def install_java():
  !apt-get install -y openjdk-8-jdk-headless -qq > /dev/null      #install openjdk
  os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"     #set environment variable
  !java -version       #check java version
install_java()

os.environ['MALLET_HOME'] = '/content/mallet-2.0.8'
mallet_path = '/content/mallet-2.0.8/bin/mallet'

#create model
ldamallet = LdaMallet(mallet_path, corpus=doc_term_matrix, num_topics=8, id2word=dictionary)

pprint(ldamallet.show_topics(formatted=False))
gensimmodel= gensim.models.wrappers.ldamallet.malletmodel2ldamodel(ldamallet)

#create wrapper for visualization
ldamallet_display = pyLDAvis.gensim.prepare(gensimmodel, doc_term_matrix, dictionary, sort_topics=False)
pyLDAvis.save_html(ldamallet_display,open("ldamallet_8_topics.html","w"))

# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=reviews, dictionary=dictionary, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\n Mallet Coherence Score: ', coherence_ldamallet)

#Generate Tags
def get_reviews_to_process(text):
Example #21
def main():
    print("\n-----LDA CONCEPT DETECITON-----")
    corpus = load_from_csv(CORPUS_PATH)

    # Create CountVectorizer to get Document-Term matrix

    stop_words = load_stop_words("data/stopwords-fr.txt")
    vectorizer = CountVectorizer(lowercase=True, max_df=MAX_DF, min_df=MIN_DF, token_pattern=r"(?u)\b\w\w\w+\b")

    proc_corpus, proc_corpus_text_only = remove_short_segs(corpus, vectorizer)
    proc_corpus_text_only = [seg.split() for seg in proc_corpus_text_only]
    proc_stop_words = []

    for i in range(len(proc_corpus_text_only)):
        proc_stop_words.append([])
        for j in range(len(proc_corpus_text_only[i])):
            if proc_corpus_text_only[i][j] not in stop_words and len(proc_corpus_text_only[i][j]) >= 3:
                proc_stop_words[i].append(proc_corpus_text_only[i][j])

    # train vectorizer on corpus

    id2word = Dictionary(proc_stop_words)
    corp = [id2word.doc2bow(text) for text in proc_stop_words]

    # print("Number of Features: " + str(len(feature_names)))

    # initialize model
    path_to_mallet_binary = "/Users/fnascime/Dev/mallet/mallet-2.0.8/bin/mallet"

    coherence_values = []

    for seed in range(20):

        mallet_model = LdaMallet(path_to_mallet_binary, corpus=corp, num_topics=16, id2word=id2word, optimize_interval=20,
                                 random_seed=seed, iterations=1000)

        gensim_model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(mallet_model)
        coherencemodel = CoherenceModel(model=gensim_model, texts=proc_stop_words, dictionary=id2word, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())


    best_coherence = 0
    best_seed = 999
    for index, coherence in enumerate(coherence_values):
        print("Seed: ", index, " -> Coherence: ", coherence)
        if coherence > best_coherence:
            best_coherence = coherence
            best_seed = index

    print(" *** Summary ***")
    print(" Best Seed     : ", best_seed)
    print(" Best coherence: ", best_coherence)
    print(" Median        : ", median(coherence_values))
    print(" Mean          : ", mean(coherence_values))
    print(" Stdev         : ", stdev(coherence_values))


    #doc_topics = list(mallet_model.read_doctopics(mallet_model.fdoctopics(), renorm=False))
    #topic_word = TopicWord(mallet_model)
    #topic_word.get_topic_word()
    #topic_word.write_to_csv("output/topic_" +str(mallet_model.random_seed) + "_" + str(mallet_model.iterations) + "_" + str(mallet_model.num_topics) + ".csv")

    #topic_doc = TopicDoc(mallet_model)
    #topic_doc.get_topic_doc()
    #topic_doc.write_to_csv("output/topic_doc"+str(mallet_model.random_seed)+ "_" + str(mallet_model.iterations)+ "_" + str(mallet_model.num_topics) + ".csv", num_docs=50)

    return 0
def lda_mallet(mallet_path, corpus, dictionary, num_topics):
    lda_model = LdaMallet(mallet_path,
                          corpus=corpus,
                          id2word=dictionary,
                          num_topics=num_topics)
    return lda_model
Example #23
# # 9 LDA Mallet Model

# Having completed our topic modeling with the variational Bayes algorithm in Gensim's LDA, we now explore Mallet's LDA (more accurate but slower), which uses Gibbs sampling (Markov chain Monte Carlo) through Gensim's wrapper package.
# 
# Mallet's LDA model tends to be more accurate because Gibbs sampling draws one variable at a time, conditioned on all other variables.

# In[20]:


import os
from gensim.models.wrappers import LdaMallet
os.environ.update({'MALLET_HOME':r'/Users/Mick/Desktop/mallet/'}) # Set environment
mallet_path = '/Users/Mick/Desktop/mallet/bin/mallet'             # Update this path

# Build the LDA Mallet Model
ldamallet = LdaMallet(mallet_path,corpus=corpus,num_topics=7,id2word=id2word) # Here we selected 7 topics again
pprint(ldamallet.show_topics(formatted=False))


# After building the LDA Mallet model using Gensim's wrapper package, we see our 7 new topics along with the top 10 keywords and their corresponding weights that make up each topic.

# ## 9.1 LDA Mallet Model Performance

# In[21]:


# Compute coherence score
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=id2word, coherence="c_v")
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)
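As a hedged side-by-side check of the "more accurate" claim above, the baseline variational-Bayes model can be scored with the same measure; `lda_model` is assumed to be the Gensim LdaModel trained earlier in this notebook.

# Compare the baseline LdaModel against the Mallet model on the same c_v measure
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence="c_v")
print('Baseline LDA Coherence Score: ', coherence_model_lda.get_coherence())
print('Mallet LDA Coherence Score:   ', coherence_ldamallet)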
Example #24
import gensim
from gensim.models.wrappers import LdaMallet
# If mallet doesn't work, use normal LDA.
from gensim.models.ldamodel import LdaModel
ldamallet = LdaMallet.load(
    '/home/ashwath/Programs/MAGCS/LDA/ldamallet_mag50.model')
lda = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(
    ldamallet, gamma_threshold=0.001, iterations=50)
lda.save('lda_mag50.model')
        print(
            'Train an LDA model over the given corpus using the given dictionary.'
        )
        print('If num_topics is not specified, use the default of 100.')
        print(
            'If num_passes is specified, makes multiple passes over the corpus.'
        )
        print('This uses MALLET to train a topic model.')
    else:
        _, mm_fname, dict_fname, model_fname = sys.argv[:4]
        num_topics = int(sys.argv[4]) if len(sys.argv) >= 5 else 100

        try:
            mallet_path = sep.join(
                [os.environ['MALLET_HOME'], 'bin', 'mallet'])
        except KeyError:
            logging.error('please set the MALLET_HOME environment variable to '
                          'the root directory of your MALLET installation')
            exit()

        mm = MmCorpus(mm_fname)
        id2word = Dictionary.load(dict_fname)

        lda_model = LdaMallet(mallet_path,
                              corpus=normalize_langs(mm),
                              id2word=id2word,
                              num_topics=num_topics,
                              prefix=model_fname[:-6],
                              iterations=100)
        lda_model.save(model_fname)
Example #26
def load_gensim_file(file_name):
    return LdaMallet.load('data/gensim_models/' + file_name)
Example #27
class NlPipe:
    def __init__(self,
                 list_of_docs,
                 path,
                 document_ids=None,
                 language_model="en_core_web_lg",
                 tagger=False,
                 parser=False,
                 ner=False,
                 categorization=False,
                 remove_stopwords=True,
                 remove_punctuation=True,
                 set_lower=True,
                 remove_num=True,
                 expand_stopwords=True,
                 language_detection=False,
                 allowed_languages=frozenset({'en'}),
                 no_processes=None):
        """
        :param list_of_docs: List of strings where every document is one string.
        :param document_ids: The ids of the documents, matching the order of the list_of_docs
        :param language_model: Spacy language model to be used for text preprocessing
        :param tagger: Use spacy part-of-speech tagger.
        :param parser: Use spacy to annotate syntactic dependencies in documents.
        :param ner: Use spacy for entity recognition and annotation.
        :param categorization: Use spacy to assign document labels
        :param remove_stopwords: Remove stop words during text preprocessing.
        :param remove_punctuation: Remove punctuation during text preprocessing.
        :param set_lower: Convert all strings to lowercase during text preprocessing.
        :param remove_num: Remove numeric characters during text preprocessing.
        :param expand_stopwords: Remove non-alpha-characters in stop words and add them to the stop words.
        :param language_detection: Detect language of docs.
        :param allowed_languages: Allowed languages for the documents.
        """
        self.path = path
        self.pipe_disable = []
        if not tagger:
            self.pipe_disable.append("tagger")
        if not parser:
            self.pipe_disable.append("parser")
        if not ner:
            self.pipe_disable.append("ner")
        if not categorization:
            self.pipe_disable.append("textcat")
        self.remove_punctuation = remove_punctuation
        self.remove_stop_words = remove_stopwords
        self.remove_num = remove_num
        self.set_lower = set_lower
        self.input_docs = list_of_docs
        self.document_ids = np.array(document_ids)
        self.use_gpu = spacy.prefer_gpu()
        self.nlp = spacy.load(language_model)
        if expand_stopwords:
            stops = [stop for stop in self.nlp.Defaults.stop_words]
            for stop in stops:
                self.nlp.Defaults.stop_words.add(re.sub(r"[\W]", "", stop))
        self.spacy_docs = None
        self.preprocessed_docs = None
        self.bag_of_words = None
        self.preprocessing_batch_size = 50000
        if no_processes is None:
            self.processes = psutil.cpu_count(logical=False) - 1
        else:
            self.processes = no_processes
        self.lda_model = None
        self.result_df = None
        self.word_topic_df = None
        self.allowed_languages = allowed_languages
        self.language_detection = language_detection
        self.id2word = None
        self.coherence_dict = None
        self.max_df = None
        self.min_df = None
        self.use_phrases = None
        self.filter_extremes_value = None
        self.keep_n = None
        self.keep_tokens = None

    def enable_pipe_component(self, component):
        """
        Method to enable components of the spacy pipeline after initialization of the class.
        :param component: Component to enable (see https://spacy.io/usage/processing-pipelines/ for available
        components).
        """
        if component in self.pipe_disable:
            self.pipe_disable.remove(component)

    def disable_pipe_component(self, component):
        """
        Method to disable components of the spacy pipeline after initialization of the class.
        :param component: Component to disable (see https://spacy.io/usage/processing-pipelines/ for available
        components).
        """
        if component not in self.pipe_disable:
            self.pipe_disable.append(component)

    def preprocess_spacy(self,
                         load_existing=True,
                         save_data=True,
                         filter_loaded=None):
        """
        Method to preprocess the documents using spacy with the enabled pipeline components.
        """
        if os.path.exists(
                f"{self.path}text_df_preprocessed_spacy") and load_existing:
            preprocessed_df = pd.read_pickle(
                f"{self.path}text_df_preprocessed_spacy")
            if filter_loaded is None:
                self.spacy_docs = preprocessed_df['preprocessed_text'].to_list(
                )
            else:
                self.spacy_docs = preprocessed_df['preprocessed_text'].loc[
                    filter_loaded].to_list()
        else:
            if self.language_detection:
                self.spacy_docs = [
                    doc for doc in tqdm(self.nlp.pipe(
                        self.input_docs,
                        disable=self.pipe_disable,
                        n_process=self.processes,
                        batch_size=self.preprocessing_batch_size),
                                        desc="Preprocessing text with spacy: ")
                    if detect(doc.text) in self.allowed_languages
                ]
            else:
                self.spacy_docs = []
                for doc in tqdm(self.nlp.pipe(
                        self.input_docs,
                        disable=self.pipe_disable,
                        n_process=self.processes,
                        batch_size=self.preprocessing_batch_size),
                                desc="Preprocessing spacy"):
                    self.spacy_docs.append(doc)
            if save_data:
                temp_df = pd.DataFrame([self.document_ids,
                                        self.spacy_docs]).transpose()
                temp_df.columns = ['thread_id', 'preprocessed_text']
                temp_df.to_pickle(f"{self.path}text_df_preprocessed_spacy")

    def preprocess(self, load_existing=True, filter_loaded=None):
        """
        Remove stop words, numbers, and punctuation, and lowercase all tokens, depending on the settings
        passed to the class during initialization.
        """
        if os.path.exists(
                f"{self.path}/text_df_preprocessed") and load_existing:
            print("Found preprocessed data. Loading")
            preprocessed_df = pd.read_pickle(
                f"{self.path}/text_df_preprocessed")
            if filter_loaded is None:
                self.preprocessed_docs = preprocessed_df[
                    'preprocessed_text'].to_list()
                print('Preprocessed data loaded.')
            else:
                self.preprocessed_docs = preprocessed_df[
                    'preprocessed_text'].loc[filter_loaded].to_list()
                if isinstance(self.document_ids, np.ndarray):
                    self.document_ids = self.document_ids[filter_loaded]
                print(
                    f'{sum(filter_loaded)} preprocessed docs of {len(self.input_docs)} docs loaded.'
                )
        else:
            self.preprocessed_docs = []
            if not self.spacy_docs:
                self.preprocess_spacy()
            for spacy_doc in tqdm(
                    self.spacy_docs,
                    desc="Removing stop words/punctuation/numeric chars: "):
                doc = []
                for token in spacy_doc:
                    # todo: check if useful condition
                    if not self.remove_stop_words and token.is_stop:
                        word = token.text
                    elif token.is_stop:
                        continue
                    else:
                        word = token.lemma_
                    if self.set_lower:
                        word = word.lower()
                    if self.remove_num:
                        word = re.sub(r"[\d]", "", word)
                    if self.remove_punctuation:
                        word = re.sub(r"[\W]", "", word)
                    if len(word) >= 2 and word != "wbr":
                        doc.append(word)
                self.preprocessed_docs.append(doc)
            temp_df = pd.DataFrame([self.document_ids, self.preprocessed_docs]).\
                transpose()
            temp_df.columns = ['thread_id', 'preprocessed_text']
            temp_df.to_pickle(f"{self.path}/text_df_preprocessed")

    def create_bag_of_words(self,
                            filter_extremes=True,
                            min_df=5,
                            max_df=0.5,
                            keep_n=100000,
                            keep_tokens=None,
                            use_phrases=None,
                            bigram_min_count=1000,
                            bigram_threshold=100,
                            trigram_threshold=100,
                            load_existing=True,
                            tfidf=False):
        """
        :param filter_extremes: En-/Disable filtering of tokens that occur too frequent/not frequent enough
        (https://radimrehurek.com/gensim/corpora/dictionary.html)
        :param min_df: Keep only tokens that appear in at least n documents (see link above)
        :param max_df: Keep only tokens that appear in less than the fraction of documents (see link above)
        :param keep_n: Keep only n most frequent tokens (see link above)
        :param keep_tokens: Iterable of tokens not to be remove (see link above)
        :param use_phrases: Set to "bigram" or "trigram" to use Gensim Phrases
        (https://radimrehurek.com/gensim/models/phrases.html), which merges frequently
        co-occurring words into single tokens (e.g. "new", "york" => "new_york").
        :param bigram_min_count: Minimum occurrence of bigrams to be considered by Gensim Phrases.
        :param bigram_threshold: Threshold for Gensim Phrases bigram settings.
        :param trigram_threshold: Threshold for Gensim Phrases trigram settings.
        """
        if use_phrases not in {None, "bigram", "trigram"}:
            raise Exception(
                "Please use valid option (None, 'bigram' or 'trigram) to make use of this function."
            )
        #todo: check logic
        else:
            if use_phrases == "bigram" and not isinstance(
                    bigram_threshold, int) and not isinstance(
                        bigram_min_count, int):
                raise Exception(
                    "Thresholds or minimum count for bigrams/trigrams not integer. Please provide "
                    "threshold and minimum count for bigrams (and trigrams) as integer."
                )
            elif use_phrases == "trigram" and not isinstance(bigram_threshold, int) \
                    or not isinstance(trigram_threshold, int) or not isinstance(bigram_min_count, int):
                raise Exception(
                    "Thresholds or minimum count for bigrams/trigrams not integer. Please provide "
                    "threshold and minimum count for bigrams (and trigrams) as integer."
                )

        if not self.preprocessed_docs:
            self.preprocess()
        if os.path.exists(f"{self.path}/gensim_dict_{filter_extremes}_{min_df}_{max_df}_{use_phrases}") \
                and load_existing:
            self.load_dict(
                path=
                f"{self.path}/gensim_dict_{filter_extremes}_{min_df}_{max_df}_{use_phrases}"
            )
            self.filter_extremes_value = filter_extremes
            self.min_df = min_df
            self.max_df = max_df
            self.use_phrases = use_phrases
        else:
            #todo: add auto check for existing dictionary here.
            if use_phrases == "bigram" or use_phrases == "trigram":
                self.create_bigrams(bigram_min_count=bigram_min_count,
                                    bigram_threshold=bigram_threshold)
            if use_phrases == "trigram":
                self.create_bigrams(bigram_min_count=bigram_min_count,
                                    bigram_threshold=bigram_threshold)
                self.create_trigrams(trigram_threshold=trigram_threshold)
            self.create_dictionary(filter_extremes=filter_extremes,
                                   min_df=min_df,
                                   max_df=max_df,
                                   keep_n=keep_n,
                                   keep_tokens=keep_tokens,
                                   use_phrases=use_phrases)
        self.create_bag_of_words_matrix(tfidf=tfidf)

    def create_bigrams(self, bigram_min_count, bigram_threshold):
        self.bigram_phrases = Phrases(self.preprocessed_docs,
                                      min_count=bigram_min_count,
                                      threshold=bigram_threshold)
        self.bigram_phraser = Phraser(self.bigram_phrases)
        self.preprocessed_docs = [
            self.bigram_phraser[doc]
            for doc in tqdm(self.preprocessed_docs, desc="Extracting bigrams")
        ]
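
    # Illustration (hedged sketch, not part of the original class) of what the
    # Phrases/Phraser step above does: frequently co-occurring tokens are merged
    # into a single token. The tiny corpus and the low min_count/threshold below
    # are assumptions for demonstration only; the class defaults (1000 / 100)
    # target much larger corpora.
    #
    #   from gensim.models.phrases import Phrases, Phraser
    #   docs = [["new", "york", "is", "big"], ["new", "york", "city"]]
    #   bigram = Phraser(Phrases(docs, min_count=1, threshold=1))
    #   bigram[["new", "york", "city"]]   # -> ["new_york", "city"]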

    def create_trigrams(self, trigram_threshold):
        trigram_phrases = Phrases(self.bigram_phrases[self.preprocessed_docs],
                                  threshold=trigram_threshold)
        trigram_phraser = Phraser(trigram_phrases)
        self.preprocessed_docs = [
            trigram_phraser[self.bigram_phraser[doc]]
            for doc in tqdm(self.preprocessed_docs, desc="Extracting trigrams")
        ]

    def create_bag_of_words_matrix(self, tfidf=False):
        self.bag_of_words = [
            self.id2word.doc2bow(doc)
            for doc in tqdm(self.preprocessed_docs,
                            desc='Creating bag of words')
        ]
        if tfidf:
            self.create_tfidf()

    def create_dictionary(self, filter_extremes, min_df, max_df, keep_n,
                          keep_tokens, use_phrases):
        print('Creating dictionary.')
        self.id2word = corpora.Dictionary(self.preprocessed_docs)
        # todo: add autosave of dictionary here
        self.max_df = max_df
        self.min_df = min_df
        self.use_phrases = use_phrases
        self.filter_extremes_value = filter_extremes
        self.keep_n = keep_n
        self.keep_tokens = keep_tokens
        if filter_extremes:
            self.filter_extremes(min_df=self.min_df,
                                 max_df=self.max_df,
                                 keep_n=self.keep_n,
                                 keep_tokens=self.keep_tokens)
        self.save_dict(
            path=
            f"{self.path}/gensim_dict_{filter_extremes}_{min_df}_{max_df}_{use_phrases}"
        )

    def filter_extremes(self, min_df, max_df, keep_n, keep_tokens=[]):
        self.filter_extremes_value = True
        self.max_df = max_df
        self.min_df = min_df
        self.keep_n = keep_n
        self.keep_tokens = keep_tokens
        self.id2word.filter_extremes(no_below=self.min_df,
                                     no_above=self.max_df,
                                     keep_n=keep_n,
                                     keep_tokens=keep_tokens)

    def create_tfidf(self):
        tfidf_model = TfidfModel(self.bag_of_words)
        self.bag_of_words = [
            tfidf_model[vector]
            for vector in tqdm(self.bag_of_words,
                               desc="Creating tf-idf matrix")
        ]

    def create_lda_model(self,
                         no_topics=10,
                         random_state=42,
                         passes=5,
                         alpha='auto',
                         eta=None,
                         workers=None,
                         chunksize=2000):
        """
        :param no_topics: Number of topics to be explored by the lda model
        :param random_state: Random state for reproducible results (default 42, gensim default is None)
        :param passes: Number of times the whole corpus is processed.
        :param alpha: Document-topic distribution prior alpha, e.g. "symmetric", "asymmetric" or "auto"
        (gensim default is "symmetric")
        :param eta: Word-topic distribution prior eta (beta). If None, a symmetric prior of 1/no_topics is used.
        :param workers: Number of workers to use. If None, the number of processes configured on the class is used.
        Note that gensim appears to use all available cores even with a single worker, so higher values can produce
        a load larger than the number of cores.
        :param chunksize: chunksize parameter passed on to gensim's LdaMulticore
        """
        if eta is None:
            eta = 1 / no_topics
        if workers is None:
            workers = self.processes
        if self.bag_of_words is None:
            self.create_bag_of_words_matrix()
        self.lda_model = LdaMulticore(corpus=self.bag_of_words,
                                      id2word=self.id2word,
                                      num_topics=no_topics,
                                      eta=eta,
                                      workers=workers,
                                      random_state=random_state,
                                      alpha=alpha,
                                      passes=passes,
                                      chunksize=chunksize)
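
    # Usage sketch (assumes an instance of this class named `tm` whose corpus has
    # already been prepared, i.e. `tm.id2word` and `tm.bag_of_words` exist):
    #
    #   tm.create_lda_model(no_topics=15, passes=10, alpha='asymmetric')
    #   tm.lda_model.print_topics(num_topics=15, num_words=10)
    #
    # With eta=None the method falls back to a symmetric prior of 1/no_topics.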

    def create_mallet_lda_model(self,
                                no_topics,
                                random_state=42,
                                workers=None,
                                mallet_path="mallet-2.0.8/bin/mallet",
                                iterations=1000,
                                custom_prefix=None):
        """
        Method to create a mallet lda model using gensim wrapper for lda mallet
        :param no_topics: Number of topics for lda model
        :param random_state: Random state to be able to reprocude model creation
        :param workers: Number of workers to use
        :param mallet_path: path to mallet binary, e.g. "mallet-2.0.8/bin/mallet"
        :param iterations: iterations over the corpus?!
        """
        if workers is None:
            workers = self.processes
        if self.bag_of_words is None:
            self.create_bag_of_words_matrix()
        if custom_prefix is None:
            prefix = f"{self.path}mallet_temp_"
        else:
            prefix = f"{self.path}mallet_temp_{custom_prefix}_"
        self.lda_model = LdaMallet(num_topics=no_topics,
                                   mallet_path=mallet_path,
                                   corpus=self.bag_of_words,
                                   id2word=self.id2word,
                                   random_seed=random_state,
                                   iterations=iterations,
                                   workers=workers,
                                   prefix=prefix)
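
    # Usage sketch (assumes `tm` is an instance of this class and a Mallet 2.0.8
    # binary is available at the given path):
    #
    #   tm.create_mallet_lda_model(no_topics=20,
    #                              mallet_path="mallet-2.0.8/bin/mallet",
    #                              iterations=500)
    #   tm.lda_model.show_topics(num_topics=20, num_words=10, formatted=False)
    #
    # The prefix built above keeps Mallet's temporary files apart, so models
    # trained with different `custom_prefix` values do not overwrite each other.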

    def calculate_coherence(self,
                            model=None,
                            coherence_score='c_v',
                            workers=None):
        """
        Method to calculate the coherence score of a given lda model. The model can either be provided or will be taken
        from the class.
        :param model: Model to use instead of the model saved within the class.
        :param coherence_score: Coherence score to calculate
        :param workers: Number of workers to use for coherence evaluation.
        :return: Return coherence model, which also contains the coherence score of a model.
        """
        if workers is None:
            workers = self.processes
        if model is None:
            model = self.lda_model
        if coherence_score != 'u_mass':
            coherence_model = CoherenceModel(model=model,
                                             texts=self.preprocessed_docs,
                                             dictionary=self.id2word,
                                             coherence=coherence_score,
                                             processes=workers)
        else:
            coherence_model = CoherenceModel(model=model,
                                             corpus=self.bag_of_words,
                                             dictionary=self.id2word,
                                             coherence=coherence_score,
                                             processes=workers)
        return coherence_model
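
    # Usage sketch (assumes `tm` is an instance of this class with a fitted model
    # stored in `tm.lda_model`):
    #
    #   cm = tm.calculate_coherence(coherence_score='c_v')
    #   print(cm.get_coherence())            # aggregate coherence score
    #   print(cm.get_coherence_per_topic())  # one value per topic
    #
    # 'c_v' needs the tokenised texts, while 'u_mass' only needs the bag-of-words
    # corpus, which is why the method branches above.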

    def search_best_model(self,
                          topic_list=frozenset({2, 3, 4, 5, 10, 15, 20, 25}),
                          alphas=[0.9, 0.5, 0.1],
                          etas=['auto', 0.9, 0.5, 0.1],
                          save_best_model=True,
                          save_models=False,
                          return_best_model=False,
                          passes=1,
                          coherence_scores=['c_v'],
                          chunksize=2000,
                          workers=None,
                          coherence_suffix=None):
        #todo: save best model within class.
        """
        Method to search for the best lda model for a given number of topics. The best model will be determined by its
        coherence score.
        :param topic_list: Iterable of integers giving the numbers of topics to test the coherence score for.
        :param alphas: Iterable of floats between 0 and 1 determining the document-topic prior of the lda model.
        :param etas: Iterable of values (floats or 'auto') for the word-topic prior eta.
        :param save_best_model: Set to True if the best model is to be saved within the class.
        :param save_models: If set to False (default), only the coherence score for each combination of number of
        topics, alpha and eta will be saved. If set to True, the lda model, the coherence score and the coherence
        model will be saved as well.
        :param return_best_model: If True, the method will return the best found model and its number of topics.
        :param passes: Number of passes over the corpus for each trained model.
        :param coherence_scores: Iterable of coherence measures to calculate (e.g. 'c_v', 'u_mass').
        :param chunksize: chunksize parameter passed on to gensim.
        :param workers: Number of workers; if None, the number of processes configured on the class is used.
        :param coherence_suffix: Optional suffix for the file the coherence results are pickled to.
        :return: Number of topics for the best result and the model with the best coherence score
        """
        if coherence_suffix is None:
            path = f"{self.path}coherence_results"
        else:
            path = f"{self.path}coherence_results_{coherence_suffix}"
        if os.path.exists(path):
            print("coherence results found")
            with open(path, "rb") as f:
                self.coherence_dict = pickle.load(f)
        else:
            self.coherence_dict = {}
        if workers is None:
            workers = self.processes
        if return_best_model and not save_best_model:
            raise Exception(
                "To return the best model, the parameter save_best_model has to be set to True."
            )
        if self.coherence_dict and save_best_model:
            try:
                best_score = self.coherence_dict['best_score']
            except KeyError:
                best_score = 0
        else:
            best_score = 0
        for no_topics in tqdm(topic_list,
                              desc="Calculating topic coherences: "):
            for alpha in tqdm(alphas, desc='Alphas'):
                for eta in tqdm(etas, desc='Etas'):
                    coherence_key = f"no={no_topics}-a={alpha}-e={eta}-filter={self.filter_extremes_value}" \
                                    f"-min_df={self.min_df}-max_df={self.max_df}-phrases={self.use_phrases}" \
                                    f"-k_n={self.keep_n}-k_t={self.keep_tokens}"
                    if coherence_key in self.coherence_dict.keys():
                        print("coherence value found, skipping")
                        continue
                    else:
                        self.create_lda_model(no_topics=no_topics,
                                              alpha=alpha,
                                              eta=eta,
                                              passes=passes,
                                              chunksize=chunksize,
                                              workers=workers)
                        self.coherence_dict[coherence_key] = {}
                        if save_models:
                            self.coherence_dict[coherence_key][
                                "lda_model"] = self.lda_model
                        for coherence_score in coherence_scores:
                            coherence_model = self.calculate_coherence(
                                coherence_score=coherence_score,
                                workers=workers)
                            coherence_result = coherence_model.get_coherence()
                            if save_models:
                                self.coherence_dict[coherence_key][
                                    "coherence_model"] = coherence_model
                            self.coherence_dict[coherence_key][
                                coherence_score] = coherence_result
                            if save_best_model and coherence_result > best_score:
                                self.coherence_dict[
                                    "best_score"] = coherence_result
                                self.coherence_dict[
                                    "best_model"] = self.lda_model
                                self.coherence_dict[
                                    "best_topic_no"] = no_topics
                                self.coherence_dict["best_alpha"] = alpha
                                self.coherence_dict["best_eta"] = eta
                            if coherence_result > best_score:
                                best_score = coherence_result
                        with open(path, "wb") as f:
                            pickle.dump(self.coherence_dict, f)
        if return_best_model:
            #returns number of topics and the lda_model
            return self.coherence_dict["best_topic_no"], self.coherence_dict[
                "best_model"]

    def search_best_model_mallet(self,
                                 topic_list=frozenset(
                                     {2, 3, 4, 5, 10, 15, 20, 25}),
                                 save_best_model=True,
                                 save_models=False,
                                 return_best_model=False,
                                 coherence_scores=['c_v'],
                                 workers=None,
                                 coherence_workers=None,
                                 coherence_suffix=None,
                                 random_state=42,
                                 mallet_path="mallet-2.0.8/bin/mallet",
                                 iterations=1000):
        """

        :param topic_list:
        :param save_best_model:
        :param save_models:
        :param return_best_model:
        :param coherence_scores:
        :param workers:
        :param coherence_suffix:
        :param random_state:
        :param mallet_path:
        :param iterations:
        :return:
        """
        if coherence_suffix is None:
            path = f"{self.path}coherence_results_mallet"
        else:
            path = f"{self.path}coherence_results_mallet_{coherence_suffix}"
        if os.path.exists(path):
            print("coherence results found")
            with open(path, "rb") as f:
                self.coherence_dict = pickle.load(f)
        else:
            self.coherence_dict = {}
        if workers is None:
            workers = self.processes
        if coherence_workers is None:
            coherence_workers = self.processes
        if return_best_model and not save_best_model:
            raise Exception(
                "To return the best model, the parameter save_best_model has to be set to True."
            )
        if self.coherence_dict and save_best_model:
            try:
                best_score = self.coherence_dict['best_score']
            except KeyError:
                best_score = 0
        else:
            best_score = 0
        for no_topics in tqdm(topic_list,
                              desc="Calculating topic coherences: "):
            coherence_key = f"mallet-no={no_topics}-filter={self.filter_extremes_value}" \
                            f"-min_df={self.min_df}-max_df={self.max_df}-phrases={self.use_phrases}" \
                            f"-k_n={self.keep_n}-k_t={self.keep_tokens}"
            if coherence_key in self.coherence_dict.keys():
                print("coherence value found, skipping")
                continue
            else:
                self.create_mallet_lda_model(no_topics=no_topics,
                                             workers=workers,
                                             random_state=random_state,
                                             mallet_path=mallet_path,
                                             iterations=iterations)
                self.coherence_dict[coherence_key] = {}
                if save_models:
                    self.coherence_dict[coherence_key][
                        "lda_model"] = self.lda_model
                for coherence_score in coherence_scores:
                    coherence_model = self.calculate_coherence(
                        coherence_score=coherence_score,
                        workers=coherence_workers)
                    coherence_result = coherence_model.get_coherence()
                    if save_models:
                        self.coherence_dict[coherence_key][
                            "coherence_model"] = coherence_model
                    self.coherence_dict[coherence_key][
                        coherence_score] = coherence_result
                    if save_best_model and coherence_result > best_score:
                        self.coherence_dict["best_score"] = coherence_result
                        self.coherence_dict["best_model"] = self.lda_model
                        self.coherence_dict["best_topic_no"] = no_topics
                        self.coherence_dict[
                            "best_alpha"] = self.lda_model.alpha
                    if coherence_result > best_score:
                        best_score = coherence_result
                with open(path, "wb") as f:
                    pickle.dump(self.coherence_dict, f)
        if return_best_model:
            #returns number of topics and the lda_model
            return self.coherence_dict["best_topic_no"], self.coherence_dict[
                "best_model"]

    def create_document_topic_df(self, model=None, no_topics=10):
        """
        Creates a dataframe containing the result of the LDA model for each document. Will set the topic with the
        highest share within the document as the dominant topic.
        :param model: LDA model to use for the calculation of the topic distribution of each document.
        :param no_topics: Number of topics in case no LDA model is provided.
        """
        if model is None:
            model = self.lda_model
        if isinstance(model, LdaMallet):
            model = malletmodel2ldamodel(model)
        topic_result_list = []
        for doc in model.get_document_topics(bow=self.bag_of_words):
            temp_dict = {}
            for topic, probability in doc:
                temp_dict[topic] = probability
            topic_result_list.append(temp_dict)
        self.result_df = pd.DataFrame(data=topic_result_list,
                                      columns=range(model.num_topics))
        self.result_df = self.result_df.fillna(0)
        if self.document_ids is not None and not self.language_detection:
            self.result_df.index = self.document_ids
        elif self.document_ids is not None and self.language_detection:
            raise Warning(
                "Using document ids and language detection together is not implemented (yet)."
            )
        dominant_topic = np.argmax(self.result_df.values, axis=1)
        self.result_df['dominant_topic'] = dominant_topic

    def plot_document_topic_distribution(self):
        #todo: log normalize
        if self.result_df is None:
            raise Exception(
                "Please create the topic distribution dataframe using the 'create_document_topic_df' "
                "method")
        counter = Counter(self.result_df.dominant_topic)
        topic_dict = OrderedDict(
            sorted(counter.items(), key=lambda x: x[1], reverse=True))
        plt.figure(figsize=(10, 6))
        g = sns.barplot(x=list(topic_dict.values()),
                        y=list(topic_dict.keys()),
                        order=list(topic_dict.keys()),
                        orient='h')
        g.set_ylabel("topic number")
        g.set_xlabel("count")
        plt.show()

    def evaluate_model(self, no_words=30):
        #todo: update for gensim
        keywords = np.array(self.vectorizer.get_feature_names())
        topic_keywords = []
        for topic_weights in self.lda_model.components_:
            top_keyword_locations = (-topic_weights).argsort()[:no_words]
            topic_keywords.append(keywords.take(top_keyword_locations))
        self.word_topic_df = pd.DataFrame(
            topic_keywords, columns=[f"word_{x}" for x in range(no_words)])

    def evaluate_pyldavis(self, model=None, use_jupyter=None):
        """
        Method for a visual evaluation of the LDA topic model using pyldavis.
        :param model: LDA model that is to be evaluated. If 'None', it will use the last model that has been saved
        within the class.
        :param use_jupyter: Sets how the pyLDAvis panel is displayed. If None (default), the method tries to detect
        whether it is run from a Jupyter notebook and chooses the display method accordingly.
        :return:
        """
        if model is None:
            if self.lda_model is None:
                raise Exception(
                    "Please create a LDA model for evaluation before running this method."
                )
            model = self.lda_model
        if isinstance(model, LdaMallet):
            model = malletmodel2ldamodel(model)
        panel = pyLDAvis.gensim.prepare(model, self.bag_of_words, self.id2word)
        if use_jupyter is None:
            try:
                is_jupyter = os.environ['_'].split(
                    "/")[-1] == "jupyter-notebook"
                if is_jupyter:
                    pyLDAvis.enable_notebook()
            except KeyError:
                is_jupyter = False
            if is_jupyter:
                pyLDAvis.display(panel)
            else:
                pyLDAvis.show(panel)
        else:
            if use_jupyter:
                pyLDAvis.enable_notebook()
                pyLDAvis.display(panel)
            elif not use_jupyter:
                pyLDAvis.show(panel)
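
    # Usage sketch (assumes `tm` holds a fitted gensim or Mallet LDA model):
    #
    #   tm.evaluate_pyldavis()                  # starts a local server outside Jupyter
    #   tm.evaluate_pyldavis(use_jupyter=True)  # renders inline in a notebook
    #
    # Mallet models are converted with malletmodel2ldamodel first, because
    # pyLDAvis only understands native gensim LdaModel objects.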

    def print_bow(self, doc_positions):
        print([[(self.id2word[token_id], freq) for token_id, freq in doc]
               for doc in compress(self.bag_of_words, doc_positions)])

    def save_model(self, path):
        self.lda_model.save(path)

    def load_model(self, path):
        self.lda_model = LdaMulticore.load(path)

    def save_dict(self, path):
        self.id2word.save(path)
        print("dict saved")

    def load_dict(self, path):
        self.id2word = corpora.Dictionary.load(path)
Exemple #28
0
def main():
    print("\n-----LDA CONCEPT DETECTION-----")

    # check command line
    # sys.argv[0] is the script name, so the four arguments are sys.argv[1] to sys.argv[4]
    if len(sys.argv) != 5:
        print(HELP_MESSAGE)
        quit(1)

    if not sys.argv[1].isdigit():
        print(HELP_MESSAGE)
        print("<num_topics> must be numeric")
        quit(1)

    if not sys.argv[2].isdigit():
        print(HELP_MESSAGE)
        print("<num_iterations> must be numeric")
        quit(1)

    if not sys.argv[3].endswith(".html"):
        print(HELP_MESSAGE)
        print("<visualization_file_path> must end with '.html'")
        quit(1)

    if not sys.argv[4].endswith(".csv"):
        print(HELP_MESSAGE)
        print("<corpus_csv_file> must end with '.csv'")
        quit(1)

    num_topics = int(sys.argv[1])
    num_iter = int(sys.argv[2])
    vis_file_path = sys.argv[3]
    corpus_csv_file = sys.argv[4]

    # load corpus
    corpus = load_from_csv(corpus_csv_file)

    # create CountVectorizer to help remove short segments
    stop_words = load_stop_words("../../data/stopwords-fr.txt")
    vectorizer = CountVectorizer(lowercase=True,
                                 max_df=MAX_DF,
                                 min_df=MIN_DF,
                                 token_pattern=r"(?u)\b\w\w\w+\b")

    # remove short segments from the corpus
    proc_corpus, proc_corpus_text_only = remove_short_segs(corpus, vectorizer)
    proc_corpus_text_only = [seg.split() for seg in proc_corpus_text_only]

    # remove stop words from the corpus
    proc_stop_words = []
    for i in range(len(proc_corpus_text_only)):
        proc_stop_words.append([])
        for j in range(len(proc_corpus_text_only[i])):
            if proc_corpus_text_only[i][j] not in stop_words and len(
                    proc_corpus_text_only[i][j]) >= 3:
                proc_stop_words[i].append(proc_corpus_text_only[i][j])

    # vectorize text with gensim's Dictionary
    id2word = Dictionary(proc_stop_words)
    corp = [id2word.doc2bow(text) for text in proc_stop_words]

    # run mallet lda model
    path_to_mallet_binary = "Mallet/bin/mallet"
    mallet_model = LdaMallet(path_to_mallet_binary,
                             corpus=corp,
                             num_topics=num_topics,
                             id2word=id2word,
                             optimize_interval=20,
                             random_seed=4,
                             iterations=num_iter)

    # convert to gensim model to build visualization
    gensim_model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(
        mallet_model)
    vis = pyLDAvis.gensim.prepare(gensim_model, corp, id2word)

    # save visualization to the path given on the command line
    pyLDAvis.save_html(vis, vis_file_path)

    return 0
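
# Hedged sketch of an entry point for the example above (not part of the original
# snippet); the script name is a placeholder:
#   python <script.py> <num_topics> <num_iterations> <visualization_file_path>.html <corpus_csv_file>.csv
if __name__ == "__main__":
    main()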
f = open("discursos_all.txt", "r")
discursos_file = f.read()
f.close()

res = eval(discursos_file)

elapsed_time = time.time() - start_time
print(time.strftime("Discursos importados, demorou %H:%M:%S:%m",
                    time.gmtime(elapsed_time)))

start_time = time.time()

data = [a.split() for a in res]

dictionary = Dictionary(data)

corpus = [dictionary.doc2bow(t) for t in data]

mallet_path = 'X:\\Programs\\mallet\\mallet-2.0.8\\bin\\mallet.bat'

lda = LdaMallet(mallet_path, corpus=corpus, id2word=dictionary, num_topics=500)

elapsed_time = time.time() - start_time
print(time.strftime("Lda model criado, demorou %H:%M:%S:%m", time.gmtime(elapsed_time)))

with open("topics_500_latest.txt", 'w+') as f:
    for index, topic in lda.show_topics(formatted=False, num_words=15):
        f.write('[{}] - '.format(index))
        f.write(', '.join(str(line[0]) for line in topic))
        f.write('\n')
from HyperDoc2Vec import *

snowball = SnowballStemmer(language='english')
nlp = spacy.load('en', disable=['parser', 'ner'])
nlp.Defaults.stop_words |= {'table', 'ref', 'formula', 'citation', 'cit', 'references',
                            'fig', 'figure', 'abstract', 'introduction',
                            'description', 'conclusion', 'results', 'discussion'}
mallet_path = '/home/ashwath/mallet-2.0.8/bin/mallet'


# LOAD MODELS
loadmodstart = time()
id2word_dictionary = corpora.Dictionary.load('/home/ashwath/Programs/ArxivCS/LDA/arxivmag.dict')
corpus = corpora.MmCorpus('/home/ashwath/Programs/ArxivCS/LDA/arxivmag_bow_corpus.mm')
try:
    ldamallet = LdaMallet.load('/home/ashwath/Programs/ArxivCS/LDA/ldamallet_arxiv.model')
    vec_bow_test = id2word_dictionary.doc2bow(['test'])
    vec_ldamallet = ldamallet[vec_bow_test]
except subprocess.CalledProcessError:
    print("LDA MALLET COULDN'T READ INSTANCE FILE. USING NORMAL LDA INSTEAD")
    ldamallet = LdaModel.load('/home/ashwath/Programs/ArxivCS/LDA/lda_arxiv.model')
    
malletindex = similarities.MatrixSimilarity.load('/home/ashwath/Programs/ArxivCS/LDA/simIndexArxiv.index')
with open('/home/ashwath/Programs/ArxivCS/LDA/docid_to_magid_training_arxiv.pickle', 'rb') as pick:
    docid_to_magid = pickle.load(pick)

hd2vmodel = HyperDoc2Vec.load('/home/ashwath/Programs/ArxivCS/hyperdoc2vec_arxivmag/models/hd2v_arxivmag.model')
print("MODELS took {} seconds to load".format(time()-loadmodstart))

def remove_stopwords(context):
    #print("Removing stop words.")