print(dictionary)


class MyCorpus(object):
    def __iter__(self):
        for line in all_questions:
            if line:
                yield dictionary.doc2bow(line.lower().split())


corpus_memory_friendly = MyCorpus()
i = 0
# for vector in corpus_memory_friendly:  # load one vector into memory at a time
#     print(vector)
#     i+=1
#     if i==40:
#         break
lsi = LsiModel(corpus_memory_friendly, id2word=dictionary, num_topics=300)
corpora.MmCorpus.serialize('datadump/lsi_data/train_corpora.mm',
                           corpus_memory_friendly)
dictionary.save('datadump/lsi_data/train.dict')
lsi.save("datadump/lsi_data/lsi_model")
# dictionary = corpora.Dictionary(corpus)
# corpus_gensim = [dictionary.doc2bow(doc) for doc in corpus]
# tfidf = TfidfModel(corpus_gensim)
# corpus_tfidf = tfidf[corpus_gensim]
# lsi = LsiModel(corpus_tfidf, id2word=dictionary, num_topics=200)
# lsi_index = MatrixSimilarity(lsi[corpus_tfidf])
# sims['ng20']['LSI'] = np.array([lsi_index[lsi[corpus_tfidf[i]]]
#                                 for i in range(len(corpus))])
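
# A minimal sketch (not part of the original snippet) of reloading the saved
# dictionary, corpus, and model and projecting a query into LSI space; the
# query string below is only a placeholder.
from gensim import corpora
from gensim.models import LsiModel

loaded_dict = corpora.Dictionary.load('datadump/lsi_data/train.dict')
loaded_corpus = corpora.MmCorpus('datadump/lsi_data/train_corpora.mm')
loaded_lsi = LsiModel.load("datadump/lsi_data/lsi_model")

query_bow = loaded_dict.doc2bow("example question text".lower().split())
print(loaded_lsi[query_bow])  # list of (topic_id, weight) pairs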
Example #2
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 29 12:58:29 2020

@author: cvicentm
"""

from gensim.test.utils import common_dictionary, common_corpus
from gensim.models import LsiModel

model = LsiModel(common_corpus, id2word=common_dictionary)
vectorized_corpus = model[common_corpus] 
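
# A quick check (a sketch, not in the original): iterating over the transformed
# corpus yields one list of (topic_id, weight) pairs per document.
for doc_vector in vectorized_corpus:
    print(doc_vector)  # each entry is a (topic_id, weight) pair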
Example #3
wandb.init(config=config, project="topical_language_generation_sweeps")

#data preparation
cached_dir = "/home/rohola/cached_models"
tokenizer = TransformerGPT2Tokenizer(cached_dir)
dataset = TopicalDataset(config.dataset_dir, tokenizer)

docs = [doc for doc in dataset]

dictionary = Dictionary(docs)
dictionary.filter_extremes(no_below=config.no_below, no_above=config.no_above)

corpus = [dictionary.doc2bow(doc) for doc in docs]
tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

lsi_model = LsiModel(
    corpus_tfidf,
    id2word=dictionary,
    num_topics=config.num_topics,
)

#cm = CoherenceModel(model=lsi_model, corpus=corpus, coherence='u_mass')
cm = CoherenceModel(model=lsi_model,
                    texts=docs,
                    dictionary=dictionary,
                    coherence='c_w2v')
# coherence = cm.get_coherence()
# print("coherence: ", coherence)
wandb.log({"coherence": cm.get_coherence()})
Example #4
    # note: no_below / no_above / keep_tokens are filter_extremes() options,
    # not Dictionary() constructor arguments; the filtering is applied below
    vocab = Dictionary(tweets)

vocab.filter_extremes(no_below=NO_BELOW,
                      no_above=NO_ABOVE,
                      keep_n=KEEP_N,
                      keep_tokens=set(KEEP_TOKENS))
print(' len(vocab) after filtering: {}'.format(len(vocab.dfs)))

# no time at all, just a bookkeeping step, doesn't actually compute anything
tfidf = TfidfModel(id2word=vocab, dictionary=vocab)
tfidf.save(os.path.join(BIGDATA_PATH, 'tfidf{}.pkl'.format(len(vocab.dfs))))

tweets = [vocab.doc2bow(tw) for tw in tweets]
json.dump(tweets,
          gzip.open(os.path.join(BIGDATA_PATH, 'tweet_bows.json.gz'), 'w'))

gc.collect()

# LSA is a more useful name than LSI
lsa = LsiModel(tfidf[tweets],
               num_topics=200,
               id2word=vocab,
               extra_samples=100,
               power_iters=2)

# these models can be big
lsa.save(os.path.join(BIGDATA_PATH, 'lsa_tweets'))
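
# A minimal reload sketch (not in the original snippet), assuming the same
# BIGDATA_PATH used for saving above.
lsa = LsiModel.load(os.path.join(BIGDATA_PATH, 'lsa_tweets'))
print(lsa.show_topics(num_topics=5, num_words=8))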
def main():
    conf = SparkConf().setAppName("Program Number 1")
    sc = SparkContext(conf=conf)
    sc.setLogLevel("ERROR")

    # creates Spark Session
    spark = SparkSession.builder.appName("Program Number 1").getOrCreate()

    # tweets folder address on HDFS server -  ignore files with .tmp extensions (Flume active files).
    inputpath = "hdfs://hdfs input path"

    spark.conf.set("spark.sql.shuffle.partitions", 1)

    # get the raw tweets from HDFS
    raw_tweets = spark.read.format("json").option(
        "inferSchema", "true").option("mode", "dropMalformed").load(inputpath)

    # get the tweet text from the raw data: convert it to lower case, drop duplicate tweets (re-tweets), and add an index for each tweet
    tweets = raw_tweets.select(
        functions.lower(functions.col("text"))).withColumnRenamed(
            "lower(text)", "text").distinct().withColumn(
                "id", functions.monotonically_increasing_id())

    # Create a tokenizer that filters out tokens shorter than 4 characters and strips symbols like $, #, ...
    tokenizer = RegexTokenizer().setPattern("[\\W_]+").setMinTokenLength(
        4).setInputCol("text").setOutputCol("tokens")

    # Tokenize tweets
    tokenized_tweets = tokenizer.transform(tweets)
    remover = StopWordsRemover().setInputCol("tokens").setOutputCol("cleaned")

    # remove stopwords
    cleaned_tweets = remover.transform(tokenized_tweets)

    # create a vocabulary of words that appear in at least two different tweets, with maximum vocabulary size 20000.
    vectorizer = CountVectorizer().setInputCol("cleaned").setOutputCol(
        "features").setVocabSize(20000).setMinDF(2).fit(cleaned_tweets)
    wordVectors = vectorizer.transform(cleaned_tweets).select("id", "features")

    # LDA
    # create a Latent Dirichlet Allocation model and run it on our data with 25 iterations and 5 topics
    lda = LDA(k=5, maxIter=25)
    # fit the model on data
    ldaModel = lda.fit(wordVectors)
    # create topics based on LDA
    lda_topics = ldaModel.describeTopics()
    # show LDA topics

    # ______________________________________________________________________________________________________________
    # LSA
    clean_tweets_list = []
    tweet_list = []
    # for creating the document term matrix for the LSIModel as input
    # this is needed as LSI needs tuples of (vocabulary_index, frequency) form
    for tweet_row in wordVectors.select('features').collect():
        tweet_list.clear()
        # reading the SparseVector of 'features' column (hence the 0 index) and zipping them to a list
        # idx = vocabulary_index, val=frequency of that word in that tweet
        for idx, val in zip(tweet_row[0].indices, tweet_row[0].values):
            # converting the frequency from float to integer
            tweet_list.append((idx, int(val)))
        clean_tweets_list.append(tweet_list[:])

    # calling the LSIModel and passing the number of topics as 5
    lsa_model = LsiModel(clean_tweets_list, num_topics=5)
    # show LSA topics

    # ______________________________________________________________________________________________________________
    # #Comparison

    # get the weights and indices of words from LDA topics in format of List[list[]]
    lda_wordIndices = [row['termIndices'] for row in lda_topics.collect()]
    lda_wordWeights = [row['termWeights'] for row in lda_topics.collect()]

    # get the weights and indices of words from LSA topics in the format of a numpy array with 5*wordCount shape.
    # each element is the weight of the corresponding word in that specific topic.
    lsa_weightsMatrix = lsa_model.get_topics()

    # function to calculate the similarity between an lsa topic and an lda topic.
    def topic_similarity_calculator(lsa_t, lda_t):
        (lda_index, lda_weight) = lda_t
        sum = 0
        for index, weight in zip(lda_index, lda_weight):
            sum = sum + (np.abs(lsa_t[index] * weight))
        return sum

    # run the similarity function on 25 possibilities (5 LSA * 5 LDA)
    similarity = []
    eachLSA = []
    for i in range(0, 5):
        eachLSA.clear()
        for j in range(0, 5):
            temp = topic_similarity_calculator(
                lsa_weightsMatrix[i], (lda_wordIndices[j], lda_wordWeights[j]))
            eachLSA.append(temp)
        similarity.append(eachLSA[:])

    # Print the similarity table
    # each row is a LDA topic and each column is an LSA topic.
    print(" ")
    print("Similarity table")

    def similarity_print(s):
        i = 1
        print("|--------------------------------------------------------|")
        print("|      |  LSA 1  |  LSA 2  |  LSA 3  |  LSA 4  |  LSA 5  |")
        print("|--------------------------------------------------------|")
        for one, two, three, four, five in zip(*s):
            print(
                '|LDA {} | {:+1.4f} | {:+1.4f} | {:+1.4f} | {:+1.4f} | {:+1.4f} |'
                .format(i, one, two, three, four, five))
            print("|--------------------------------------------------------|")
            i = i + 1
    # print the similarity matrix
    similarity_print(similarity)

    # ______________________________________________________________________________________________________________
    # Final result Table
    # Manually found the following Topics to be similar
    # (LSA1 - LDA1)
    # (LSA5 - LDA2)
    # rest are alone
    lsa_words_idx = []
    for idx, curr_topic in enumerate(lsa_weightsMatrix):
        lsa_words_idx.append(np.abs(curr_topic).argsort()[-10:][::-1])
    lsa_topics_bow = {}
    lda_topics_bow = {}
    lsa_bow_list = []
    lda_bow_list = []
    for curr_idx, (lda_topic,
                   lsa_topic) in enumerate(zip(lda_wordIndices,
                                               lsa_words_idx)):
        lsa_bow_list.clear()
        lda_bow_list.clear()
        for idx in range(10):
            lsa_bow_list.append(vectorizer.vocabulary[lsa_topic[idx]])
            lda_bow_list.append(vectorizer.vocabulary[lda_topic[idx]])
        lsa_topics_bow[curr_idx] = lsa_bow_list[:]
        lda_topics_bow[curr_idx] = lda_bow_list[:]

    results = []
    names = []
    # Creating word dictionary for LDA2 and LSA5
    lda2_lsa5 = lda_topics_bow[1][:]
    for word in (lsa_topics_bow[4]):
        if word not in lda2_lsa5:
            lda2_lsa5.append(word)

    # Creating word dictionary for LDA1 and LSA1
    lda1_lsa1 = lda_topics_bow[0][:]
    for word in (lsa_topics_bow[0]):
        if word not in lda1_lsa1:
            lda1_lsa1.append(word)
    results.append(lda1_lsa1)
    names.append("LDA1 - LSA1 ")
    results.append(lda2_lsa5)
    names.append("LDA2 - LSA5 ")
    results.append(lda_topics_bow[2])
    names.append("LDA3        ")
    results.append(lda_topics_bow[3])
    names.append("LDA4        ")
    results.append(lda_topics_bow[4])
    names.append("LDA5        ")
    results.append(lsa_topics_bow[1])
    names.append("LSA2        ")
    results.append(lsa_topics_bow[2])
    names.append("LSA3        ")
    results.append(lsa_topics_bow[3])
    names.append("LSA4        ")
    #printing the topics and related words
    print(" ")
    print("Topics Table")
    print(
        "|------------------------------------------------------------------------------------------|"
    )
    print(
        "|    Topic     |  Significant Words                                                    |"
    )
    print(
        "|------------------------------------------------------------------------------------------|"
    )
    for name, r in zip(names, results):
        print('| {} |  {} |'.format(name, r))
        print(
            "|------------------------------------------------------------------------------------------|"
        )

    print(" ")
    print(" ")
Example #6
    Purpose: create the term dictionary of our corpus and convert the list of documents (corpus) into a Document-Term Matrix
    Output : term dictionary and Document Term Matrix
    """
    # Creating the term dictionary of our corpus, where every unique term is assigned an index.
    dictionary = corpora.Dictionary(doc_clean)
    # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
    # return the term dictionary and the document-term matrix
    return dictionary, doc_term_matrix

# LSA - Topic Modelling
## Applying the model to the LONGIT corpus
number_of_topics = 1

words = 100

document_list,titles = load_data("",'./corpus_files/prod_all_txt/corpus_longit.csv')

clean_text = preprocess_data(document_list)

dictionary,doc_term_matrix = prepare_corpus(clean_text)

lsamodel = LsiModel(doc_term_matrix, num_topics = number_of_topics, id2word = dictionary)  # train model

print(lsamodel.print_topics(num_topics = number_of_topics, num_words = words))

output_file = open('./corpus_files/tm_csv/topic_modelling.csv', mode = 'w', encoding = 'utf8')

output_file.write("Topic modelling of the LONGIT corpus: "+str(lsamodel.print_topics(num_topics = number_of_topics, num_words = words)))

output_file.close()
Example #7
# 3. create bag of words
dictionary = gensim.corpora.Dictionary(processed_docs)
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]


document_num = 30
bow_doc_x = bow_corpus[document_num]
print(bow_corpus[10])
#
# for i in range(len(bow_doc_x)):
#     print("Word {} (\"{}\") appears {} time.".format(bow_doc_x[i][0],
#                                                      dictionary[bow_doc_x[i][0]],
#                                                      bow_doc_x[i][1]))
#
#
lsamodel = LsiModel(bow_corpus, num_topics=7, id2word=dictionary)  # train model
print(lsamodel.print_topics(num_topics=7, num_words=10))
for idx, topic in lsamodel.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")


from gensim.test.utils import datapath

# Save model to disk.
temp_file = datapath("lsa_model_optimized")
lsamodel.save(temp_file)
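
# A minimal sketch (not in the original snippet) of reloading the model saved
# above via the same datapath handle.
loaded_lsa = LsiModel.load(temp_file)
print(loaded_lsa.print_topics(num_topics=7, num_words=10))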

# Load the test data (jokes) from disk.
df_test_jokes = pd.read_csv("JokeText.csv")
if False:
Example #8
        def topicmodiling():
            l=[]
            text=''
            for i in range(len(dfs)):
                for j in dfs[i]:
                    if(j=='\n'):
                        j=' '
                        text=text+j
                    else:
                        text=text + j
                l.append(text)
                text=''
            for i in l:
                text=text+i+"\n"
            nlp=English()
            doc = nlp(text)
            texts, article = [], []
            for w in doc:
                # if it's not a stop word, punctuation mark, number or email-like token, add it to our article
                if not w.is_stop and not w.is_punct and not w.like_num and not w.like_email:
                    # we add the lemmatized version of the word
                    article.append(w.lemma_)
                # if it's a new line, it means we're onto our next document
                if w.text == '\n':
                    texts.append(article)
                    article = []
            bigram = gensim.models.Phrases(texts)
            texts = [bigram[line] for line in texts]
            for i in texts:
                for j in i:
                    if(j=='\n'): 
                        i.remove(j)
            dictionary = Dictionary(texts)

            corpus = [dictionary.doc2bow(text) for text in texts]
            dictionary.token2id
            dictionary 
            lsimodel = LsiModel(corpus=corpus, num_topics=5, id2word=dictionary)
            a=lsimodel.show_topics(num_topics=5)  # Showing only the top 5 topics
            b=[]
            for i in range(0,len(a)):
                b.append(a[i][1].split('+'))   
            k=[]
            for i in range(0,len(b)):
                k.append(b[i][0:5])
            top1=[]
            for i in range(0,5):
                top1.append(k[0][i].split('*'))   
            top2=[]
            for i in range(0,5):
                top2.append(k[1][i].split('*'))   
            top3=[]
            for i in range(0,5):
                top3.append(k[2][i].split('*'))  
            top4=[]
            for i in range(0,5):
                top4.append(k[3][i].split('*'))   
            df1 = DataFrame (top1,columns=['Topic 1 weight','Topic 1 words'])
            df2 = DataFrame (top2,columns=['Topic 2 weight','Topic 2 words'])
            df3 = DataFrame (top3,columns=['Topic 3 weight','Topic 3 words'])
            df4 = DataFrame (top4,columns=['Topic 4 weight','Topic 4 words'])
            result = pd.concat([df1, df2,df3,df4], axis=1)
            for col in result.columns:
                result[col]=result[col].str.replace('"','')
                result[col]=result[col].str.replace('-','')
            return result
Example #9
def create_LSI(corpus, dictionary, num_topics):
    return LsiModel(corpus, num_topics=num_topics, id2word=dictionary)
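
# A hedged usage sketch for the helper above; `bow_corpus` and `dictionary`
# stand in for a bag-of-words corpus and gensim Dictionary prepared as in the
# earlier examples.
lsi = create_LSI(bow_corpus, dictionary, num_topics=10)
print(lsi.print_topics(num_topics=10, num_words=10))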
Example #10
def process_page(all_documents, order_text, unorder_text, order_list,
                 unorder_list, ocr_values, page_ocr):
    # Count n-gram frequencies and calculate cosine similarity between two docs.
    counts = CountVectorizer(ngram_range=(1, 5))
    counts_matrix = counts.fit_transform(all_documents)
    cos = cosine_similarity(counts_matrix[0:1], counts_matrix)
    #     print('Count Vectorizer', cos[0][1])
    ocr_values.append(cos[0][1])

    # Calculate tf-idf cosine similarity (nltk or spacy text the same)
    tokenize = lambda doc: doc.lower().split(" ")
    tfidf = TfidfVectorizer(norm='l2',
                            min_df=0,
                            use_idf=True,
                            smooth_idf=False,
                            sublinear_tf=True,
                            tokenizer=tokenize,
                            ngram_range=(1, 5))
    tfidf_matrix = tfidf.fit_transform(all_documents)

    cos = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix)
    #     print('TF-IDF Vectorizer', cos[0][1])
    ocr_values.append(cos[0][1])

    #     # Calculate similarity using GLOVE and SPACY
    #     order_doc = nlp(order_text)
    #     unorder_doc = nlp(unorder_text)
    #     sim_doc = order_doc.similarity(unorder_doc)
    # #     print('Spacy GLOVE', sim_doc)
    #     #https://stats.stackexchange.com/questions/304217/how-is-the-similarity-method-in-spacy-computed
    #     ocr_values.append(sim_doc)

    # Calculate jaccard ratio. Takes list of tokens
    jac = 1 - distance.jaccard(order_list, unorder_list)
    #     print('Jaccard', jac)
    ocr_values.append(jac)

    # use gensim's similarity matrix and lsi to calculate cosine
    all_tokens = [order_list, unorder_list]
    dictionary = Dictionary(all_tokens)
    corpus = [dictionary.doc2bow(text) for text in all_tokens]
    lsi = LsiModel(corpus, id2word=dictionary, num_topics=2)
    sim = MatrixSimilarity(lsi[corpus])
    lsi_cos = [t[1][1] for t in list(enumerate(sim))]
    lsi_cos = lsi_cos[0]
    #     print('LSI', lsi_cos)
    ocr_values.append(lsi_cos)
    #https://radimrehurek.com/gensim/tut3.html

    #     align = align_pages(order_text, unorder_text)
    # #     print('smw', align)
    #     ocr_values.append(align)
    #     print(ocr_values)
    if os.path.isfile(page_ocr):
        final_metrics = pd.read_csv(page_ocr)
        ocr_values.append(datetime.date.today())
        final_metrics.loc[len(final_metrics.index)] = ocr_values
        final_metrics.to_csv(page_ocr, index=False)
    else:
        ocr_values.append(datetime.date.today())
        cols = [
            'first_issue_date', 'first_page_number', 'second_issue_date',
            'second_page_number', 'countsvec_cos', 'tfidfvec_cos',
            'jaccard_sim', 'lsi_cos', 'date_run'
        ]
        final_df = pd.DataFrame([ocr_values], columns=cols)
        final_df.to_csv(page_ocr, index=False)
def topic_analysis(corpus, dictionary, models_path, technique):

    import uuid
    uuid = str(uuid.uuid4())
    print("[BLOCK] Starting models for context")
    sys.stdout.flush()

    if technique == "all" or technique == "hdp":
        t1 = time()
        # HDP model
        model = HdpModel(corpus, id2word=dictionary)
        model.save("%s/hdp_%s" % (models_path, uuid))
        del model
        t2 = time()
        print("[BLOCK] Training time for HDP model: %s" % (round(t2 - t1, 2)))
        sys.stdout.flush()

    if technique == "all" or technique == "ldap":
        t1 = time()
        # Parallel LDA model
        model = LdaMulticore(corpus,
                             id2word=dictionary,
                             num_topics=100,
                             workers=23,
                             passes=20)
        model.save("%s/lda_parallel_%s" % (models_path, uuid))
        del model
        t2 = time()
        print("[BLOCK] Training time for LDA multicore: %s" %
              (round(t2 - t1, 2)))
    sys.stdout.flush()

    if technique == "all" or technique == "lsa":
        t1 = time()
        # LSA model
        model = LsiModel(corpus, id2word=dictionary, num_topics=400)
        model.save("%s/lsa_%s" % (models_path, uuid))
        del model
        t2 = time()
        print("[BLOCK] Training time for LSA: %s" % (round(t2 - t1, 2)))
        sys.stdout.flush()

    if technique == "all" or technique == "ldao":
        t1 = time()
        # Online LDA model
        model = LdaModel(corpus,
                         id2word=dictionary,
                         num_topics=100,
                         update_every=1,
                         chunksize=10000,
                         passes=5)
        model.save("%s/lda_online_%s" % (models_path, uuid))
        t2 = time()
        print("[BLOCK] Training time for LDA online: %s" % (round(t2 - t1, 2)))
        sys.stdout.flush()

    if technique == "all" or technique == "lda":
        t1 = time()
        # Offline LDA model
        model = LdaModel(corpus,
                         id2word=dictionary,
                         num_topics=100,
                         update_every=0,
                         passes=20)
        model.save("%s/lda_offline_%s" % (models_path, uuid))
        del model
        t2 = time()
        print("[BLOCK] Training time for LDA offline: %s" %
              (round(t2 - t1, 2)))
        sys.stdout.flush()
Example #12
            text += text_p[i].text
        text = text.lower()

        # tokenize text into sentences
        sentences = sent_tokenize(text)
        text_length = find_text_length(sentences)
        tokens = tokenize(text)

        # create dictionary of tokens
        dictionary = corpora.Dictionary(tokens)
        sent_term_matrix = [
            dictionary.doc2bow(sentence) for sentence in tokens
        ]
        # Gensim's LsiModel performs Truncated SVD such that dimension of S = numtopics
        # if numtopics is not specified, performs SVD
        lsamodel = LsiModel(sent_term_matrix, id2word=dictionary)

        # Grab matrix V from SVD, A = USV^t
        V = corpus2dense(lsamodel[sent_term_matrix], len(
            lsamodel.projection.s)).T / lsamodel.projection.s

        # Output sentence with the longest vector lengths, no repeats
        lengths = find_length(lsamodel.projection.s, V)
        if (len(sentences) < 5):
            num_sentences = len(sentences)
        else:
            num_sentences = 5
        indices = find_indices(
            lengths, num_sentences)  # number of sentences printed = 5

        scores = []  # rouge scores
Example #13

corpus = MyCorpus(test_data_dir)  # create a dictionary
print(corpus)
# for vector in corpus: # convert each document to a bag-of-word vector
#     print vector

tfidf = TfidfModel(corpus)
#print(tfidf[some_doc])

topics = 20
num_clusters = 8
passes = 1

print "Create LSI model"
lsi_model = LsiModel(corpus, id2word=corpus.dictionary, num_topics=topics)
corpus_lsi = lsi_model[corpus]

print "Create LDA model"
lda_model = LdaModel(corpus,
                     id2word=corpus.dictionary,
                     num_topics=topics,
                     passes=passes)
corpus_lda = lda_model[corpus]

print "Done creating models"

# print "*********************"
# print "\n\nPrint LSI model\n"
# topic_id = 0
# for topic in lsi_model.show_topics(num_words=5):
Example #14
 def compute_lsi(self, num_topics=None):
     lsi = LsiModel(self.wiki_tfidf_corpus,
                    num_topics=num_topics,
                    id2word=self.wiki_dict)
     return lsi
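
# Hypothetical usage of the method above, assuming `wiki` is an instance of the
# enclosing wrapper class with `wiki_tfidf_corpus` and `wiki_dict` already built.
lsi = wiki.compute_lsi(num_topics=200)  # `wiki` is a hypothetical instance
print(lsi.show_topics(num_topics=5))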
Example #15

# for item in docs:
#     corpus.append(list(word_tokenize(reuters.raw(item))))

# print(corpus[3])

# tfidf = TfidfModel(corpus)
# print(tfidf[doc[0]])
#tfidf.save('/tmp/foo.tfidf_model')


documents = [tokenize(reuters.raw(docs[0]))] #for file_id in docs[0]]
dictionary = Dictionary(documents)
#for item in docs[0]:
topics.append(reuters.categories(docs[0]))
corpus = [dictionary.doc2bow(d) for d in documents]
tfidf_model = TfidfModel(corpus, id2word=dictionary)
tfidf_values = tfidf_model[corpus]
#dict(tfidf_model[dictionary.doc2bow(tokenize(reuters.raw(docs[0])))])
# print tfidf_values[dictionary.token2id['year']]                     # 0.0367516096888
# print tfidf_values[dictionary.token2id['following']]                # 0.0538505795815
# print tfidf_values[dictionary.token2id['provided']]                 # 0.0683210467787
# print tfidf_values[dictionary.token2id['structural']]               # 0.0945807226371
# print tfidf_values[dictionary.token2id['japanese']]                 # 0.107960637598
# print tfidf_values[dictionary.token2id['downtrend']]                # 0.122670341446
#print(documents[2])
#print(corpus[2])

lsi = LsiModel(tfidf_values, num_topics=len(topics))
print(lsi[tfidf_values]) # project some document into LSI space
# In[6]:


for i in range(len(cols)):
    dictionary.add_documents([ [cols[i]] ])
    curr_id = dictionary.doc2idx([ cols[i] ])[0]
    print(i)
    for j in range( len(Patients) ):
        corpus_tfidf[j].append( ( curr_id , data.at[ j , i+1 ] ) )


# In[7]:


lsi = LsiModel(corpus_tfidf, id2word=dictionary)
lsi_index = MatrixSimilarity(lsi[corpus_tfidf])
sims['files']['LSI'] = np.array([lsi_index[lsi[corpus_tfidf[i]]] for i in range(len(corpus))])


# In[8]:


sims


# In[9]:


ind=np.unravel_index(np.argmax(sims['files']['LSI'],axis=None), sims['files']['LSI'].shape)
data_lemmatized = make_bigrams(data_words)
dct = corpora.Dictionary(data_lemmatized)
texts = data_lemmatized
corpus = [dct.doc2bow(text) for text in texts]
lda_model = LdaModel.load('lda_model_full2')

for c in lda_model[corpus[5:8]]:
    print("Document Topics      : ", c[0])
    print("Word id, Topics      : ", c[1][:3])
    print("Phi Values (word id) : ", c[2][:2])
    print("Word, Topics         : ", [(dct[wd], topic) for wd, topic in c[1][:2]])
    print("Phi Values (word)    : ", [(dct[wd], topic) for wd, topic in c[2][:2]])
    print("------------------------------------------------------\n")

lsi_model = LsiModel(corpus=corpus, id2word=dct, num_topics=7, decay=0.5)
pprint(lsi_model.print_topics(-1))


def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    sent_topics_df = pd.DataFrame()
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(
                    pd.Series([int(topic_num), round(prop_topic, 4), topic_keywords]), ignore_index=True)
            else:
Example #18
def recommend(docs_path,
              dict_path,
              use_fos_annot=False,
              pp_dict_path=None,
              np_dict_path=None,
              lda_preselect=False,
              combine_train_contexts=True):
    """ Recommend
    """

    test = []
    train_mids = []
    train_texts = []
    train_foss = []
    train_ppann = []
    train_nps = []
    foss = []
    tmp_bag = []
    adjacent_cit_map = {}

    if pp_dict_path and False:
        prind('loading predpatt dictionary')
        pp_dictionary = corpora.Dictionary.load(pp_dict_path)
        pp_num_unique_tokens = len(pp_dictionary.keys())
        use_predpatt_model = True
        if not combine_train_contexts:
            prind(('usage of predpatt model is not implemented for not'
                   'combining train contexts.\nexiting.'))
            sys.exit()
    else:
        use_predpatt_model = False
        pp_dictionary = None

    if np_dict_path:
        prind('loading noun phrase dictionary')
        np_dictionary = corpora.Dictionary.load(np_dict_path)
        np_num_unique_tokens = len(np_dictionary.keys())
        use_noun_phrase_model = True
    else:
        use_noun_phrase_model = False
        np_dictionary = None

    prind('checking file length')
    num_lines = sum(1 for line in open(docs_path))

    # # for MAG eval
    # mag_id2year = {}
    # with open('MAG_CS_en_year_map.csv') as f:
    #     for line in f:
    #         pid, year = line.strip().split(',')
    #         mag_id2year[pid] = int(year)
    # # /for MAG eval

    prind('train/test splitting')
    with open(docs_path) as f:
        for idx, line in enumerate(f):
            if idx == 0:
                tmp_bag_current_mid = line.split('\u241E')[0]
            if idx % 10000 == 0:
                prind('{}/{} lines'.format(idx, num_lines))
            cntxt_foss = []
            cntxt_ppann = []
            cntxt_nps = []
            # handle varying CSV formats
            vals = line.split('\u241E')
            if use_noun_phrase_model:
                cntxt_nps = vals[-1]
                if '\u241D' in cntxt_nps:  # includes NP<marker> variant
                    np_all, np_marker = cntxt_nps.split('\u241D')
                    cntxt_nps = np_marker  # mby use both for final eval
                cntxt_nps = [np for np in cntxt_nps.strip().split('\u241F')]
                vals = vals[:-1]
            if len(vals) == 4:
                mid, adjacent, in_doc, text = vals
            elif len(vals) == 5:
                if use_predpatt_model:
                    mid, adjacent, in_doc, text, pp_annot_json = vals
                else:
                    mid, adjacent, in_doc, text, fos_annot = vals
            elif len(vals) == 6:
                mid, adjacent, in_doc, text, fos_annot, pp_annot_json = vals
            else:
                prind('input file format can not be parsed\nexiting')
                sys.exit()
            if len(vals) in [5, 6] and use_fos_annot:
                cntxt_foss = [
                    f.strip() for f in fos_annot.split('\u241F')
                    if len(f.strip()) > 0
                ]
                foss.extend(cntxt_foss)
            if use_predpatt_model:
                if '\u241F' in pp_annot_json:  # includes alternative version
                    ppann, ppann_alt = pp_annot_json.split('\u241F')
                    pp_annot_json = ppann
                cntxt_ppann = json.loads(pp_annot_json)
            # create adjacent map for later use in eval
            if mid not in adjacent_cit_map:
                adjacent_cit_map[mid] = []
            if len(adjacent) > 0:
                adj_cits = adjacent.split('\u241F')
                for adj_cit in adj_cits:
                    if adj_cit not in adjacent_cit_map[mid]:
                        adjacent_cit_map[mid].append(adj_cit)
            # fill texts
            if mid != tmp_bag_current_mid or idx == num_lines - 1:
                # tmp_bag now contains all lines sharing ID tmp_bag_current_mid
                num_contexts = len(tmp_bag)
                sub_bags_dict = {}
                for item in tmp_bag:
                    item_in_doc = item[0]
                    item_text = item[1]
                    item_foss = item[2]
                    item_ppann = item[3]
                    item_nps = item[4]
                    if item_in_doc not in sub_bags_dict:
                        sub_bags_dict[item_in_doc] = []
                    sub_bags_dict[item_in_doc].append(
                        [item_text, item_foss, item_ppann, item_nps])
                if len(sub_bags_dict) < 2:
                    # can't split, reset bag, next
                    tmp_bag = []
                    tmp_bag_current_mid = mid
                    continue
                order = sorted(sub_bags_dict,
                               key=lambda k: len(sub_bags_dict[k]),
                               reverse=True)
                # ↑ keys for sub_bags_dict, ordered for largest bag to smallest

                min_num_train = math.floor(num_contexts * 0.8)
                train_tups = []
                test_tups = []
                for jdx, sub_bag_key in enumerate(order):
                    sb_tup = sub_bags_dict[sub_bag_key]
                    # if sub_bag_key[1:3] == '06':  # time split ACL
                    # if mag_id2year[sub_bag_key] > 2017:  # time split MAG
                    # if sub_bag_key[:2] == '17':  # time split arXiv
                    if len(train_tups
                           ) > min_num_train or jdx == len(order) - 1:
                        test_tups.extend(sb_tup)
                    else:
                        train_tups.extend(sb_tup)
                test.extend([
                    [
                        tmp_bag_current_mid,  # mid
                        tup[0],  # text
                        tup[1],  # fos
                        sum_weighted_term_lists(tup[2], pp_dictionary),  # pp
                        tup[3]  # nps
                    ] for tup in test_tups
                ])
                if combine_train_contexts:
                    # combine train contexts per cited doc
                    train_text_combined = ' '.join(tup[0]
                                                   for tup in train_tups)
                    train_mids.append(tmp_bag_current_mid)
                    train_texts.append(train_text_combined.split())
                    train_foss.append(
                        [fos for tup in train_tups for fos in tup[1]])
                    train_ppann.append(
                        sum_weighted_term_lists(
                            sum([tup[2] for tup in train_tups], []),
                            pp_dictionary))
                    train_nps.append(
                        [np for tup in train_tups for np in tup[3]])
                else:
                    # don't combine train contexts per cited doc
                    for tup in train_tups:
                        train_mids.append(tmp_bag_current_mid)
                        train_texts.append(tup[0].split())
                        train_foss.append([fos for fos in tup[1]])
                        train_nps.append([np for np in tup[1]])
                # reset bag
                tmp_bag = []
                tmp_bag_current_mid = mid
            tmp_bag.append([in_doc, text, cntxt_foss, cntxt_ppann, cntxt_nps])
    prind('loading dictionary')
    dictionary = corpora.Dictionary.load(dict_path)
    num_unique_tokens = len(dictionary.keys())
    prind('building corpus')
    corpus = [dictionary.doc2bow(text) for text in train_texts]

    if use_fos_annot:
        prind('preparing FoS model')
        mlb = MultiLabelBinarizer()
        mlb.fit([foss])
        train_foss_matrix = mlb.transform(train_foss)
        train_foss_set_sizes = np.sum(train_foss_matrix, 1)
    prind('generating TFIDF model')
    tfidf = models.TfidfModel(corpus)
    prind('preparing similarities')
    index = similarities.SparseMatrixSimilarity(tfidf[corpus],
                                                num_features=num_unique_tokens)

    bm25 = BM25(corpus)
    average_idf = sum(map(lambda k: float(bm25.idf[k]),
                          bm25.idf.keys())) / len(bm25.idf.keys())

    if lda_preselect:
        orig_index = index.index.copy()

        prind('generating LDA/LSI model')
        lda = LsiModel(tfidf[corpus], id2word=dictionary, num_topics=100)
        prind('preparing similarities')
        lda_index = similarities.SparseMatrixSimilarity(
            lda[tfidf[corpus]], num_features=num_unique_tokens)

    if use_predpatt_model:
        prind('preparing claim similarities')
        pp_tfidf = models.TfidfModel(train_ppann)
        pp_index = similarities.SparseMatrixSimilarity(
            pp_tfidf[train_ppann], num_features=pp_num_unique_tokens)

    if use_noun_phrase_model:
        prind('preparing noun phrase similarities')
        np_corpus = [np_dictionary.doc2bow(nps) for nps in train_nps]
        np_index = similarities.SparseMatrixSimilarity(
            np_corpus, num_features=np_num_unique_tokens)

    # models: BoW, NP<marker>, Claim, Claim+BoW
    eval_models = [{
        'name': 'bow'
    }, {
        'name': 'np'
    }, {
        'name': 'claim'
    }, {
        'name': 'claim+bow'
    }]
    for mi in range(len(eval_models)):
        eval_models[mi]['num_cur'] = 0
        eval_models[mi]['num_top'] = 0
        eval_models[mi]['num_top_5'] = 0
        eval_models[mi]['num_top_10'] = 0
        eval_models[mi]['ndcg_sums'] = [0] * AT_K
        eval_models[mi]['map_sums'] = [0] * AT_K
        eval_models[mi]['mrr_sums'] = [0] * AT_K
        eval_models[mi]['recall_sums'] = [0] * AT_K
    prind('test set size: {}\n- - - - - - - -'.format(len(test)))
    for test_item_idx, tpl in enumerate(test):
        if test_item_idx > 0 and test_item_idx % 10000 == 0:
            save_results(docs_path,
                         num_lines,
                         len(test),
                         eval_models,
                         suffix='_tmp')
        test_mid = tpl[0]
        # if test_mid not in train_mids:
        #     # not testable
        #     continue
        test_text = bow_preprocess_string(tpl[1])
        if use_fos_annot:
            test_foss_vec = mlb.transform([tpl[2]])
            dot_prods = train_foss_matrix.dot(
                test_foss_vec.transpose()).transpose()[0]
            with np.errstate(divide='ignore', invalid='ignore'):
                fos_sims = np.nan_to_num(dot_prods / train_foss_set_sizes)
            fos_sims_list = list(enumerate(fos_sims))
            fos_sims_list.sort(key=lambda tup: tup[1], reverse=True)
            fos_ranking = [s[0] for s in fos_sims_list]
            fos_boost = np.where(dot_prods >= dot_prods.max() - 1)[0].tolist()
            top_dot_prod = dot_prods[-1]
        if use_predpatt_model:
            pp_sims = pp_index[pp_tfidf[tpl[3]]]
            pp_sims_list = list(enumerate(pp_sims))
            pp_sims_list.sort(key=lambda tup: tup[1], reverse=True)
            pp_ranking = [s[0] for s in pp_sims_list]
        if use_noun_phrase_model:
            np_sims = np_index[np_dictionary.doc2bow(tpl[4])]
            np_sims_list = list(enumerate(np_sims))
            np_sims_list.sort(key=lambda tup: tup[1], reverse=True)
            np_ranking = [s[0] for s in np_sims_list]
        test_bow = dictionary.doc2bow(test_text)
        if lda_preselect:
            # pre select in LDA/LSI space
            lda_sims = lda_index[lda[tfidf[test_bow]]]
            lda_sims_list = list(enumerate(lda_sims))
            lda_sims_list.sort(key=lambda tup: tup[1], reverse=True)
            lda_ranking = [s[0] for s in lda_sims_list]
            lda_picks = lda_ranking[:1000]
            index.index = orig_index[lda_picks]
        sims = index[tfidf[test_bow]]
        sims_list = list(enumerate(sims))
        sims_list.sort(key=lambda tup: tup[1], reverse=True)
        bow_ranking = [s[0] for s in sims_list]

        bm25_scores = list(enumerate(bm25.get_scores(test_bow, average_idf)))
        bm25_scores.sort(key=lambda tup: tup[1], reverse=True)
        bm25_ranking = [s[0] for s in bm25_scores]

        if lda_preselect:
            # translate back from listing in LDA/LSI pick subset to global listing
            bow_ranking = [lda_picks[r] for r in bow_ranking]
        if use_fos_annot:
            boost_ranking = fos_boost_ranking(bow_ranking, fos_boost,
                                              top_dot_prod)
        if not combine_train_contexts:
            seen = set()
            seen_add = seen.add
            final_ranking = [
                x for x in final_ranking
                if not (train_mids[x] in seen or seen_add(train_mids[x]))
            ]
        if use_predpatt_model:
            sims_comb = combine_simlists(sims, pp_sims, [2, 1])
            comb_sims_list = list(enumerate(sims_comb))
            comb_sims_list.sort(key=lambda tup: tup[1], reverse=True)
            comb_ranking = [s[0] for s in comb_sims_list]

        for mi in range(len(eval_models)):
            if mi == 0:
                final_ranking = bow_ranking
            elif mi == 1:
                final_ranking = np_ranking
            elif mi == 2:
                final_ranking = pp_ranking
            elif mi == 3:
                final_ranking = comb_ranking
            rank = len(bow_ranking)  # assign worst possible
            for idx, doc_id in enumerate(final_ranking):
                if train_mids[doc_id] == test_mid:
                    rank = idx + 1
                    break
                if idx >= 10:
                    break
            dcgs = [0] * AT_K
            idcgs = [0] * AT_K
            precs = [0] * AT_K
            num_rel_at = [0] * AT_K
            num_rel = 1 + len(adjacent_cit_map[test_mid])
            num_rel_at_k = 0
            for i in range(AT_K):
                relevant = False
                placement = i + 1
                doc_id = final_ranking[i]
                result_mid = train_mids[doc_id]
                if result_mid == test_mid:
                    relevance = 1
                    num_rel_at_k += 1
                    relevant = True
                elif result_mid in adjacent_cit_map[test_mid]:
                    relevance = .5
                    num_rel_at_k += 1
                    relevant = True
                else:
                    relevance = 0
                num_rel_at[i] = num_rel_at_k
                if relevant:
                    precs[i] = num_rel_at_k / placement
                denom = math.log2(placement + 1)
                dcg_numer = math.pow(2, relevance) - 1
                for j in range(i, AT_K):
                    dcgs[j] += dcg_numer / denom
                if placement == 1:
                    ideal_rel = 1
                elif placement <= num_rel:
                    ideal_rel = .5
                else:
                    ideal_rel = 0
                idcg_numer = math.pow(2, ideal_rel) - 1
                for j in range(i, AT_K):
                    # note this^    we go 0~9, 1~9, 2~9, ..., 9
                    idcgs[j] += idcg_numer / denom
            for i in range(AT_K):
                eval_models[mi]['ndcg_sums'][i] += dcgs[i] / idcgs[i]
                eval_models[mi]['map_sums'][i] += sum(precs[:i + 1]) / max(
                    num_rel_at[i], 1)
                if rank <= i + 1:
                    eval_models[mi]['mrr_sums'][i] += 1 / rank
                    eval_models[mi]['recall_sums'][i] += 1
            if rank == 1:
                eval_models[mi]['num_top'] += 1
            if rank <= 5:
                eval_models[mi]['num_top_5'] += 1
            if rank <= 10:
                eval_models[mi]['num_top_10'] += 1
            eval_models[mi]['num_cur'] += 1
            prind('- - - - - {}/{} - - - - -'.format(eval_models[0]['num_cur'],
                                                     len(test)))
            prind('#1: {}'.format(eval_models[0]['num_top']))
            prind('in top 5: {}'.format(eval_models[0]['num_top_5']))
            prind('in top 10: {}'.format(eval_models[0]['num_top_10']))
            prind('ndcg@5: {}'.format(eval_models[0]['ndcg_sums'][4] /
                                      eval_models[0]['num_cur']))
            prind('map@5: {}'.format(eval_models[0]['map_sums'][4] /
                                     eval_models[0]['num_cur']))
            prind('mrr@5: {}'.format(eval_models[0]['mrr_sums'][4] /
                                     eval_models[0]['num_cur']))
            prind('recall@5: {}'.format(eval_models[0]['recall_sums'][4] /
                                        eval_models[0]['num_cur']))

    for mi in range(len(eval_models)):
        eval_models[mi]['num_applicable'] = eval_models[mi]['num_cur']
        eval_models[mi]['ndcg_results'] = [
            sm / eval_models[mi]['num_cur']
            for sm in eval_models[mi]['ndcg_sums']
        ]
        eval_models[mi]['map_results'] = [
            sm / eval_models[mi]['num_cur']
            for sm in eval_models[mi]['map_sums']
        ]
        eval_models[mi]['mrr_results'] = [
            sm / eval_models[mi]['num_cur']
            for sm in eval_models[mi]['mrr_sums']
        ]
        eval_models[mi]['recall_results'] = [
            sm / eval_models[mi]['num_cur']
            for sm in eval_models[mi]['recall_sums']
        ]

    return eval_models, num_lines, len(test)
Example #19
# %%
filepath = "./data/train_input.csv"
corpus = Sentences(filepath, loop=False)
dict = Dictionary(corpus, prune_at=DICTLENGTH)
dict.filter_extremes(no_below= 2, no_above= 0.8)

# %%
for i, bow in enumerate(TFIDF_corpus(corpus, dict)):
    print("---------%s----------" %(i, ))
    print(len(bow))
    print(bow)
    if(i == 20):
        break;
# %%
embed_size = SVDSIZE
lsi = LsiModel(TFIDF_corpus(corpus, dict), num_topics= embed_size)
comments_embd = lsi[TFIDF_corpus(corpus, dict)]

# %%
for i, embd in enumerate(comments_embd):
    print(i, len(embd))

# %%
labels = pd.read_csv('./data/train_input.csv', usecols = ['label', ], squeeze = True)

x = np.zeros((NUMSAMPLES, embed_size))
y = np.zeros((NUMSAMPLES, 1))

count = 0
for i, (embed, l) in enumerate(zip(comments_embd, labels)):
    hidden = [item[1] for item in embed]
    FILENAME = 'panda_corpus.txt'

    panda_g = corpus(FILENAME)

    si = SenSimi(panda_g)

    panda_raw = si.reconstructdata()
    print(type(panda_raw))

    bowlist = si.bowcorpus(panda_raw)
    print(bowlist[1])

    panda_tfidfmodel = si.tfidfmodel(bowlist)
    panda_tfidf = panda_tfidfmodel[bowlist]
    # FIXME: cannot build the index from the full corpus - it exceeds the numpy.array limit
    # FIXME: if only part of the corpus is used, the sentences being compared are not covered
    #        by the feature vectors (basis) of the index matrix
    print('using lsi model...')
    panda_lsi = LsiModel(corpus=panda_tfidf,
                         id2word=si.word_dict,
                         num_topics=300)
    index = similarities.MatrixSimilarity(panda_lsi[panda_tfidf])
    good = ['可爱', '萌', '喜欢', '国宝', '神奇']
    good_bow = si.word_dict.doc2bow(good)
    good_tfidf = panda_tfidfmodel[good_bow]
    good_lsi = panda_lsi[good_tfidf]
    simi = index[good_lsi]
    simi_list = list(simi)
    print(max(simi_list))
    where = simi_list.index(max(simi_list))
    print(panda_raw[where])
nltk.download('punkt')

clean_text = re.sub(r'%(.*)\n', '', file_contents)
clean_text = re.sub(r'\s+', ' ', clean_text)

words = nltk.word_tokenize(clean_text)
n = 5
ngrams = {}
for i in range(len(words) - n):
    gram = ' '.join(words[i:i + n])
    if gram not in ngrams.keys():
        ngrams[gram] = []
    ngrams[gram].append(words[i + n])

from gensim import corpora
from gensim.models import TfidfModel
from gensim.models import LsiModel
from gensim.similarities import MatrixSimilarity

dictionary = corpora.Dictionary(clean_text)
corpus_gensim = [dictionary.doc2bow(doc) for doc in clean_text]
tfidf_text = TfidfModel(corpus_gensim)
corpus_tfidf_text = tfidf_text[corpus_gensim]
lsi_text = LsiModel(corpus_tfidf_text, id2word=dictionary, num_topics=10)
lsi_index_text = MatrixSimilarity(lsi_text[corpus_tfidf_text])
sims['clean_text']['LSI'] = np.array([
    lsi_index_text[lsi_text[corpus_tfidf_text[i]]]
    for i in range(len(clean_text))
])
Example #22
def main():
    # --- arguments ---
    (dataset, version, _, _, nbs_topics, _, _, cache_in_memory, use_callbacks,
     tfidf, args) = parse_args()

    model_class = 'LSImodel'
    _split_ = "_split" if use_callbacks else ""

    data_name = f'{dataset}_{version}_{tfidf}'
    data_dir = join(LDA_PATH, version, tfidf)

    # --- logging ---
    logger = init_logging(name=data_name,
                          basic=False,
                          to_stdout=True,
                          to_file=True)
    logg = logger.info
    log_args(logger, args)

    # --- load dict ---
    logg('Loading dictionary')
    data_file = join(data_dir, f'{data_name}.dict')
    dictionary = Dictionary.load(data_file)

    # --- load corpus ---
    logg('Loading corpus')
    data_file = join(data_dir, f'{data_name}.mm')
    corpus = MmCorpus(data_file)
    if cache_in_memory:
        logg('Reading corpus into RAM')
        corpus = list(corpus)
    if use_callbacks:
        train, test = split_corpus(corpus)
    else:
        train, test = corpus, []
    logg(f'size of... train_set={len(train)}, test_set={len(test)}')

    # --- train ---
    topn = 20
    columns = [f'term{x}'
               for x in range(topn)] + [f'weight{x}' for x in range(topn)]
    for nbtopics in nbs_topics:
        gc.collect()

        logg(f'Running {model_class} with {nbtopics} topics')
        model = LsiModel(corpus=train, num_topics=nbtopics, id2word=dictionary)

        model_dir = join(LSI_PATH, version, tfidf, f'{_split_}')
        model_path = join(model_dir,
                          f'{dataset}_{model_class}{_split_}_{nbtopics}')
        if not exists(model_dir):
            makedirs(model_dir)

        # --- save topics ---
        topics = model.show_topics(num_words=topn, formatted=False)
        topics = [list(chain(*zip(*topic[1]))) for topic in topics]
        topics = pd.DataFrame(topics, columns=columns)
        logg(f'Saving topics to {model_path}.csv')
        topics.to_csv(f'{model_path}.csv')

        # --- save model ---
        logg(f'Saving model to {model_path}')
        model.save(model_path)

    # --- done ---
    logg(f'\n'
         f'----- end -----\n'
         f'----- {dataset.upper()} -----\n'
         f'{"#" * 50}\n')
Example #23
        continue
    time1 = time.time()
    print('Training-set document vectors generated. Starting model training......')

    num_topics = 2 + int(len(corpus) / 250)
    if num_topics >= 20:
        num_topics = 10
    num_words = (num_topics - 2) * 2 + 10
    print('This department has %d articles in total; they will be grouped into %d topics with %d keywords......' %
          (len(corpus), num_topics, num_words))
    # ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=50)
    # result = ldamodel.print_topics(num_topics=num_topics, num_words=num_words)
    # doc_lda = ldamodel[corpus]
    model = LsiModel(
        corpus,
        id2word=dictionary,
        num_topics=num_topics,
    )
    doc_lda = model[corpus]
    result = model.print_topics(num_topics=num_topics, num_words=num_words)
    time2 = time.time()
    print('Model training time:', time2 - time1)
    print('LDA model training finished. Inserting into database......')

    for n in range(len(doc_lda)):
        Topic = doc_lda[n]
        if len(Topic) == 0:
            prams = (institution_paper_list[n][0], institution + "其他",
                     json.dumps({}, ensure_ascii=False),
                     json.dumps({}, ensure_ascii=False))
            sql = 'insert into lda2 values(%s,%s,%s,%s)'
    # Learn an LSI model from the tf-idf vectors.
    if True:
        
        # The number of topics to use.
        num_topics = 300
        
        # Load the tf-idf corpus back from disk.
        corpus_tfidf = MmCorpus('./data/corpus_tfidf.mm')        
        
        # Train LSI
        print('\nLearning LSI model from the tf-idf vectors...')
        t0 = time.time()
        
        # Build the LSI model
        # This took 2hrs. and 7min. on my machine.
        model_lsi = LsiModel(corpus_tfidf, num_topics=num_topics, id2word=dictionary)   
    
        print('    Building LSI model took %s' % formatTime(time.time() - t0))

        # Write out the LSI model to disk.
        # The LSI model is big but not as big as the corpus.
        # The largest piece is the projection matrix:
        #  100,000 words x 300 topics x 8-bytes per val x (1MB / 2^20 bytes) = ~229MB
        #  This is saved as `lsi.lsi_model.projection.u.npy` 
        model_lsi.save('./data/lsi.lsi_model')
    
    # ========= STEP 6: Convert articles to LSI with index ========
    # Transform corpus to LSI space and index it
    if True:
        
        print('\nApplying LSI model to all vectors...')
Example #25
# Transform arbitrary documents by getting them into the same BOW vector space created by your training corpus
documents = ["Some iterable", "containing multiple", "documents", "..."]
bow_documents = (dictionary.doc2bow(tokenize_func(document))
                 for document in documents
                 )  # use a generator expression because...
logent_documents = logent_transformation[
    bow_documents]  # ...transformation is done during iteration of documents using generators, so this uses constant memory

### Chained transformations
# This builds a new corpus from iterating over documents of bow_corpus as transformed to log entropy representation.
# Will also take many hours if bow_corpus is the Wikipedia corpus created above.
logent_corpus = MmCorpus(corpus=logent_transformation[bow_corpus])

# Creates LSI transformation model from log entropy corpus representation. Takes several hours with Wikipedia corpus.
lsi_transformation = LsiModel(corpus=logent_corpus,
                              id2word=dictionary,
                              num_topics=400)

# Alternative way of performing same operation as above, but with implicit chaining
# lsi_transformation = LsiModel(corpus=logent_transformation[bow_corpus], id2word=dictionary,
#    num_topics=400)

# Can persist transformation models, too.
logent_transformation.save("logent.model")
lsi_transformation.save("lsi.model")

### Similarities (the best part)
from gensim.similarities import Similarity

# This index corpus consists of what you want to compare future queries against
index_documents = [
linked = linkage(sims['texts']['LDA'], 'complete')

plt.figure(figsize=(10, 20))
plt.title('LDA Clustering Dendrogram')
dendrogram(linked,
           orientation='left',
           labels=Index_of_files,
           distance_sort='descending',
           show_leaf_counts=True)
plt.show()

#################################################      Run LSI Model      ############################################
# you can change the number of topics (num_topics=20) and see different results

lsi = LsiModel(corpus_tfidf, id2word=dictionary, num_topics=20)
lsi_index = MatrixSimilarity(lsi[corpus_tfidf])
sims['texts']['LSI'] = np.array(
    [lsi_index[lsi[corpus_tfidf[i]]] for i in range(len(corpus))])

#################################################   visualization of LSI algorithm with dendrogram #####################
# you can see the result ===> Figure_3.png
linked = linkage(sims['texts']['LSI'], 'complete')

plt.figure(figsize=(10, 20))
plt.title('LSI Clustering Dendrogram')
dendrogram(linked,
           orientation='left',
           labels=Index_of_files,
           distance_sort='descending',
           show_leaf_counts=True)
Example #27
texts = [bigram[line] for line in texts]

# In[10]:

dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# We're now done with a very important part of any text analysis - the data cleaning and setting up of the corpus. Keep in mind that we created the corpus this way because that's how gensim requires it - most algorithms still require you to clean the data set as we did, by removing stop words and numbers, adding the lemmatized form of each word, and using bigrams.

# ### LSI
#
# LSI stands for Latent Semantic Indexing - it is a popular information retrieval method which works by decomposing the original matrix of words to maintain key topics. Gensim's implementation uses an SVD.

# In[11]:

lsimodel = LsiModel(corpus=corpus, num_topics=10, id2word=dictionary)

# In[12]:

lsimodel.show_topics(num_topics=5)  # Showing only the top 5 topics

# ### HDP
#
# HDP, the Hierarchical Dirichlet Process, is an unsupervised topic model which figures out the number of topics on its own.

# In[13]:

hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)

# In[14]:
def main(chosen_model_no=0, num_items_displayed=10, use_spacy=False, use_soft_cosine_similarity=False,
         num_topics=None, no_below=5, no_above=0.5, normalize_vectors=False):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    if num_topics is None:
        num_topics = 100

    possible_model_names = [
        'tf_idf',  # 0
        'lsi_bow', 'lsi_tf_idf',  # 1, 2
        'rp_bow', 'rp_tf_idf',  # 3, 4
        'lda_bow', 'lda_tf_idf',  # 5, 6
        'hdp_bow', 'hdp_tf_idf',  # 7, 8
        'word2vec',  # 9
    ]
    chosen_model_name = possible_model_names[chosen_model_no]
    print(chosen_model_name)

    game_names, _ = load_game_names(include_genres=False, include_categories=False)

    steam_tokens = load_tokens()

    nlp = spacy.load('en_core_web_lg')

    documents = list(steam_tokens.values())

    dct = Dictionary(documents)
    print(len(dct))
    dct.filter_extremes(no_below=no_below, no_above=no_above)
    print(len(dct))

    corpus = [dct.doc2bow(doc) for doc in documents]

    # Pre-processing

    pre_process_corpus_with_tf_idf = chosen_model_name.endswith('_tf_idf')

    tfidf_model = TfidfModel(corpus, id2word=dct, normalize=normalize_vectors)

    if pre_process_corpus_with_tf_idf:
        # Caveat: the leading underscore is important. Do not use this pre-processing if the chosen model is Tf-Idf!
        print('Corpus as Tf-Idf')
        pre_processed_corpus = tfidf_model[corpus]
    else:
        print('Corpus as Bag-of-Words')
        pre_processed_corpus = corpus

    # Model

    model = None
    wv = None
    index2word_set = None

    if chosen_model_name == 'tf_idf':
        print('Term Frequency * Inverse Document Frequency (Tf-Idf)')
        model = tfidf_model

    elif chosen_model_name.startswith('lsi'):
        print('Latent Semantic Indexing (LSI/LSA)')
        model = LsiModel(pre_processed_corpus, id2word=dct, num_topics=num_topics)

    elif chosen_model_name.startswith('rp'):
        print('Random Projections (RP)')
        model = RpModel(pre_processed_corpus, id2word=dct, num_topics=num_topics)

    elif chosen_model_name.startswith('lda'):
        print('Latent Dirichlet Allocation (LDA)')
        model = LdaModel(pre_processed_corpus, id2word=dct, num_topics=num_topics)

    elif chosen_model_name.startswith('hdp'):
        print('Hierarchical Dirichlet Process (HDP)')
        model = HdpModel(pre_processed_corpus, id2word=dct)

    elif chosen_model_name == 'word2vec':
        use_a_lot_of_ram = False

        if use_a_lot_of_ram:
            model = None

            print('Loading Word2Vec based on Google News')
            # Warning: this takes a lot of time and uses a ton of RAM!
            wv = KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin.gz', binary=True)
        else:
            if use_spacy:
                print('Using Word2Vec with spaCy')
            else:
                print('Training Word2Vec')

                model = Word2Vec(documents)

                wv = model.wv

        if not use_spacy:
            wv.init_sims(replace=normalize_vectors)

            index2word_set = set(wv.index2word)

    else:
        print('No model specified.')
        model = None

    if chosen_model_name != 'word2vec':
        if not use_soft_cosine_similarity:
            index = MatrixSimilarity(model[pre_processed_corpus], num_best=10, num_features=len(dct))
        else:
            w2v_model = Word2Vec(documents)
            similarity_index = WordEmbeddingSimilarityIndex(w2v_model.wv)
            similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dct, tfidf_model, nonzero_limit=100)
            index = SoftCosineSimilarity(model[pre_processed_corpus], similarity_matrix)
    else:
        index = None

    query_app_ids = load_benchmarked_app_ids(append_hard_coded_app_ids=True)

    app_ids = list(int(app_id) for app_id in steam_tokens.keys())

    matches_as_app_ids = []

    for query_count, query_app_id in enumerate(query_app_ids):
        print('[{}/{}] Query appID: {} ({})'.format(query_count + 1, len(query_app_ids),
                                                    query_app_id, get_app_name(query_app_id, game_names)))

        query = steam_tokens[str(query_app_id)]

        if use_spacy:
            spacy_query = Doc(nlp.vocab, query)
        else:
            spacy_query = None

        if chosen_model_name != 'word2vec':
            vec_bow = dct.doc2bow(query)
            if pre_process_corpus_with_tf_idf:
                pre_processed_vec = tfidf_model[vec_bow]
            else:
                pre_processed_vec = vec_bow
            vec_lsi = model[pre_processed_vec]
            sims = index[vec_lsi]

            if use_soft_cosine_similarity:
                sims = enumerate(sims)

            similarity_scores_as_tuples = [(str(app_ids[i]), sim) for (i, sim) in sims]
            similarity_scores = reformat_similarity_scores_for_doc2vec(similarity_scores_as_tuples)
        else:
            if use_spacy:
                similarity_scores = {}
                for app_id in steam_tokens:
                    reference_sentence = steam_tokens[app_id]
                    spacy_reference = Doc(nlp.vocab, reference_sentence)
                    similarity_scores[app_id] = spacy_query.similarity(spacy_reference)
            else:
                query_sentence = filter_out_words_not_in_vocabulary(query, index2word_set)

                similarity_scores = {}

                counter = 0
                num_games = len(steam_tokens)

                for app_id in steam_tokens:
                    counter += 1

                    if (counter % 1000) == 0:
                        print('[{}/{}] appID = {} ({})'.format(counter, num_games, app_id, game_names[app_id]))

                    reference_sentence = steam_tokens[app_id]
                    reference_sentence = filter_out_words_not_in_vocabulary(reference_sentence, index2word_set)

                    try:
                        similarity_scores[app_id] = wv.n_similarity(query_sentence, reference_sentence)
                    except ZeroDivisionError:
                        similarity_scores[app_id] = 0

        similar_app_ids = print_most_similar_sentences(similarity_scores, num_items_displayed=num_items_displayed,
                                                       verbose=False)
        matches_as_app_ids.append(similar_app_ids)

    print_ranking(query_app_ids,
                  matches_as_app_ids,
                  only_print_banners=True)

    return
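
# A minimal usage sketch (hypothetical invocation, not part of the original script):
# chosen_model_no=2 selects 'lsi_tf_idf' from the list above, i.e. LSI on Tf-Idf vectors.
if __name__ == '__main__':
    main(chosen_model_no=2, num_items_displayed=10)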
Example #29
0
    # go through each word in each data_text row, drop stopwords and non-alphabetic tokens, and write the result back to that row
    data_text.iloc[idx]['headline_text'] = [word for word in data_text.iloc[idx]['headline_text'].split(' ') if word not in stopwords.words() and word.isalpha()]

    # print logs to monitor progress
    if idx % 1000 == 0:
        sys.stdout.write('\rc = ' + str(idx) + ' / ' + str(len(data_text)))
        
pickle.dump(data_text, open('data_text.dat', 'wb'))

train_headlines = [value[0] for value in data_text.iloc[0:].values]
num_topics = 10

id2word = gensim.corpora.Dictionary(train_headlines)
corpus = [id2word.doc2bow(text) for text in train_headlines]
lda = ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics)
lsimodel = LsiModel(corpus=corpus, num_topics=num_topics, id2word=id2word)


def get_topics(model, num_topics):
    word_dict = {}
    for i in range(num_topics):
        words = model.show_topic(i, topn=20)
        word_dict['Topic # ' + '{:02d}'.format(i + 1)] = [word for word, _ in words]
    return pd.DataFrame(word_dict)

get_topics(lda, num_topics)
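
# The same helper also works for the LSI model built above, which makes a quick
# side-by-side comparison of LDA and LSI topics possible (a minimal sketch):
get_topics(lsimodel, num_topics)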

pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda, corpus, id2word)

Example #30
0
    lda_coherence_cv = CoherenceModel(model=lda_model,
                                      texts=processed_emails,
                                      dictionary=dictionary,
                                      coherence='c_v')
    lda_coherence_umass = CoherenceModel(model=lda_model,
                                         texts=processed_emails,
                                         dictionary=dictionary,
                                         coherence='u_mass')

    lD_name = "saved/models/LDA/lda" + str(j_index) + ".model"
    lD_coh_cv = "saved/models/LDA/cv_lda" + str(j_index) + ".coherence"
    lD_coh_um = "saved/models/LDA/umass_lda" + str(j_index) + ".coherence"

    # save the models to the disk
    lda_model.save(lD_name)
    lda_coherence_cv.save(lD_coh_cv)
    lda_coherence_umass.save(lD_coh_um)

    lsa_model = LsiModel(tfidf_vectors, num_topics=top, id2word=dictionary)
    lsa_coherence_cv = CoherenceModel(model=lsa_model,
                                      texts=processed_emails,
                                      dictionary=dictionary,
                                      coherence='c_v')
    lsa_coherence_umass = CoherenceModel(model=lsa_model,
                                         texts=processed_emails,
                                         dictionary=dictionary,
                                         coherence='u_mass')

    lS_name = "saved/models/LSA/lsa" + str(j_index) + ".model"
    lS_coh_cv = "saved/models/LSA/cv_lsa" + str(j_index) + ".coherence"
    lS_coh_um = "saved/models/LSA/umass_lsa" + str(j_index) + ".coherence"

    lsa_model.save(lS_name)
    lsa_coherence_cv.save(lS_coh_cv)
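
    # A minimal sketch of loading a persisted coherence object back later to compare
    # runs (assumes the file saved above exists on disk):
    loaded_lsa_cv = CoherenceModel.load(lS_coh_cv)
    print("LSA c_v coherence:", loaded_lsa_cv.get_coherence())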