print(dictionary)


class MyCorpus(object):
    def __iter__(self):
        for line in all_questions:
            if line:
                yield dictionary.doc2bow(line.lower().split())


corpus_memory_friendly = MyCorpus()
i = 0
# for vector in corpus_memory_friendly:  # load one vector into memory at a time
#     print(vector)
#     i += 1
#     if i == 40:
#         break

lsi = LsiModel(corpus_memory_friendly, id2word=dictionary, num_topics=300)

corpora.MmCorpus.serialize('datadump/lsi_data/train_corpora.mm', corpus_memory_friendly)
dictionary.save('datadump/lsi_data/train.dict')
lsi.save("datadump/lsi_data/lsi_model")

# dictionary = corpora.Dictionary(corpus)
# corpus_gensim = [dictionary.doc2bow(doc) for doc in corpus]
# tfidf = TfidfModel(corpus_gensim)
# corpus_tfidf = tfidf[corpus_gensim]
# lsi = LsiModel(corpus_tfidf, id2word=dictionary, num_topics=200)
# lsi_index = MatrixSimilarity(lsi[corpus_tfidf])
# sims['ng20']['LSI'] = np.array([lsi_index[lsi[corpus_tfidf[i]]]
#                                 for i in range(len(corpus))])
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 29 12:58:29 2020

@author: cvicentm
"""
from gensim.test.utils import common_dictionary, common_corpus
from gensim.models import LsiModel

model = LsiModel(common_corpus, id2word=common_dictionary)
vectorized_corpus = model[common_corpus]
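A minimal follow-on sketch (not part of the original snippet) showing how the trained model and the transformed corpus can be inspected; the topic and word counts here are arbitrary illustration values:

# Inspect the fitted LSI space: each document becomes a list of (topic_id, weight) pairs.
for doc_vector in vectorized_corpus:
    print(doc_vector)

# Print the top words of the first two latent topics (topic count here is illustrative).
for topic_id, topic in model.print_topics(num_topics=2, num_words=5):
    print(topic_id, topic)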
wandb.init(config=config, project="topical_language_generation_sweeps")

# data preparation
cached_dir = "/home/rohola/cached_models"
tokenizer = TransformerGPT2Tokenizer(cached_dir)
dataset = TopicalDataset(config.dataset_dir, tokenizer)

docs = [doc for doc in dataset]

dictionary = Dictionary(docs)
dictionary.filter_extremes(no_below=config.no_below, no_above=config.no_above)

corpus = [dictionary.doc2bow(doc) for doc in docs]
tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

lsi_model = LsiModel(
    corpus_tfidf,
    id2word=dictionary,
    num_topics=config.num_topics,
)

# cm = CoherenceModel(model=lsi_model, corpus=corpus, coherence='u_mass')
cm = CoherenceModel(model=lsi_model, texts=docs, dictionary=dictionary, coherence='c_w2v')
# coherence = cm.get_coherence()
# print("coherence: ", coherence)
wandb.log({"coherence": cm.get_coherence()})
vocab = Dictionary(tweets)
vocab.filter_extremes(no_below=NO_BELOW, no_above=NO_ABOVE, keep_n=KEEP_N, keep_tokens=set(KEEP_TOKENS))
print(' len(vocab) after filtering: {}'.format(len(vocab.dfs)))

# no time at all, just a bookkeeping step, doesn't actually compute anything
tfidf = TfidfModel(id2word=vocab, dictionary=vocab)
tfidf.save(os.path.join(BIGDATA_PATH, 'tfidf{}.pkl'.format(len(vocab.dfs))))

tweets = [vocab.doc2bow(tw) for tw in tweets]
json.dump(tweets, gzip.open(os.path.join(BIGDATA_PATH, 'tweet_bows.json.gz'), 'wt'))
gc.collect()

# LSA is a more useful name than LSI
lsa = LsiModel(tfidf[tweets], num_topics=200, id2word=vocab, extra_samples=100, power_iters=2)

# these models can be big
lsa.save(os.path.join(BIGDATA_PATH, 'lsa_tweets'))
def main():
    conf = SparkConf().setAppName("Program Number 1")
    sc = SparkContext(conf=conf)
    sc.setLogLevel("ERROR")

    # create Spark session
    spark = SparkSession.builder.appName("Program Number 1").getOrCreate()

    # tweets folder address on HDFS server - ignore files with .tmp extensions (Flume active files)
    inputpath = "hdfs://hdfs input path"
    spark.conf.set("spark.sql.shuffle.partitions", 1)

    # get the raw tweets from HDFS
    raw_tweets = spark.read.format("json").option(
        "inferSchema", "true").option("mode", "dropMalformed").load(inputpath)

    # get the tweet text from the raw data, lower-cased, de-duplicated (drops re-tweets),
    # and with an index added for each tweet
    tweets = raw_tweets.select(
        functions.lower(functions.col("text"))).withColumnRenamed(
            "lower(text)", "text").distinct().withColumn(
                "id", functions.monotonically_increasing_id())

    # create a tokenizer that filters away tokens with length < 4 and gets rid of symbols like $, #, ...
    tokenizer = RegexTokenizer().setPattern("[\\W_]+").setMinTokenLength(
        4).setInputCol("text").setOutputCol("tokens")

    # tokenize tweets
    tokenized_tweets = tokenizer.transform(tweets)

    remover = StopWordsRemover().setInputCol("tokens").setOutputCol("cleaned")

    # remove stopwords
    cleaned_tweets = remover.transform(tokenized_tweets)

    # create a vector of words that appeared in at least two different tweets, and set maximum vocab size to 20000
    vectorizer = CountVectorizer().setInputCol("cleaned").setOutputCol(
        "features").setVocabSize(20000).setMinDF(2).fit(cleaned_tweets)

    wordVectors = vectorizer.transform(cleaned_tweets).select("id", "features")

    # LDA
    # create a Latent Dirichlet Allocation model and run it on our data with 25 iterations and 5 topics
    lda = LDA(k=5, maxIter=25)
    # fit the model on data
    ldaModel = lda.fit(wordVectors)
    # create topics based on LDA
    lda_topics = ldaModel.describeTopics()
    # show LDA topics

    # ______________________________________________________________________________________________________________
    # LSA
    clean_tweets_list = []
    tweet_list = []

    # build the document-term matrix used as input for the LsiModel;
    # this is needed as LSI expects tuples of (vocabulary_index, frequency) form
    for tweet_row in wordVectors.select('features').collect():
        tweet_list.clear()
        # read the SparseVector of the 'features' column (hence the 0 index) and zip it into a list
        # idx = vocabulary_index, val = frequency of that word in that tweet
        for idx, val in zip(tweet_row[0].indices, tweet_row[0].values):
            # convert the frequency from float to integer
            tweet_list.append((idx, int(val)))
        clean_tweets_list.append(tweet_list[:])

    # call the LsiModel, passing the number of topics as 5
    lsa_model = LsiModel(clean_tweets_list, num_topics=5)
    # show LSA topics

    # ______________________________________________________________________________________________________________
    # Comparison
    # get the weights and indices of words from LDA topics in format of List[list[]]
    lda_wordIndices = [row['termIndices'] for row in lda_topics.collect()]
    lda_wordWeights = [row['termWeights'] for row in lda_topics.collect()]

    # get the weights of words from LSA topics as a numpy array with 5*wordCount shape;
    # each element is the weight of the corresponding word in that specific topic
    lsa_weightsMatrix = lsa_model.get_topics()

    # function to calculate the similarity between an LSA topic and an LDA topic
    def topic_similarity_calculator(lsa_t, lda_t):
        (lda_index, lda_weight) = lda_t
        sum = 0
        for index, weight in zip(lda_index, lda_weight):
            sum = sum + (np.abs(lsa_t[index] * weight))
        return sum

    # run the similarity function on 25 possibilities (5 LSA * 5 LDA)
    similarity = []
    eachLSA = []
    for i in range(0, 5):
        eachLSA.clear()
        for j in range(0, 5):
            temp = topic_similarity_calculator(
                lsa_weightsMatrix[i], (lda_wordIndices[j], lda_wordWeights[j]))
            eachLSA.append(temp)
        similarity.append(eachLSA[:])

    # print the similarity table
    # each row is an LDA topic and each column is an LSA topic
    print(" ")
    print("Similarity table")

    def similarity_print(s):
        i = 1
        print("|--------------------------------------------------------|")
        print("|      |  LSA 1  |  LSA 2  |  LSA 3  |  LSA 4  |  LSA 5  |")
        print("|--------------------------------------------------------|")
        for one, two, three, four, five in zip(*s):
            print(
                '|LDA {} | {:+1.4f} | {:+1.4f} | {:+1.4f} | {:+1.4f} | {:+1.4f} |'
                .format(i, one, two, three, four, five))
            print("|--------------------------------------------------------|")
            i = i + 1

    # prints the similarity matrix
    similarity_print(similarity)

    # ______________________________________________________________________________________________________________
    # Final result table
    # Manually found the following topics to be similar:
    #   (LSA1 - LDA1)
    #   (LSA5 - LDA2)
    # the rest stand alone
    lsa_words_idx = []
    for idx, curr_topic in enumerate(lsa_weightsMatrix):
        lsa_words_idx.append(np.abs(curr_topic).argsort()[-10:][::-1])

    lsa_topics_bow = {}
    lda_topics_bow = {}
    lsa_bow_list = []
    lda_bow_list = []
    for curr_idx, (lda_topic, lsa_topic) in enumerate(zip(lda_wordIndices, lsa_words_idx)):
        lsa_bow_list.clear()
        lda_bow_list.clear()
        for idx in range(10):
            lsa_bow_list.append(vectorizer.vocabulary[lsa_topic[idx]])
            lda_bow_list.append(vectorizer.vocabulary[lda_topic[idx]])
        lsa_topics_bow[curr_idx] = lsa_bow_list[:]
        lda_topics_bow[curr_idx] = lda_bow_list[:]

    results = []
    names = []

    # creating the word list for LDA2 and LSA5
    lda2_lsa5 = lda_topics_bow[1][:]
    for word in lsa_topics_bow[4]:
        if word not in lda2_lsa5:
            lda2_lsa5.append(word)

    # creating the word list for LDA1 and LSA1
    lda1_lsa1 = lda_topics_bow[0][:]
    for word in lsa_topics_bow[0]:
        if word not in lda1_lsa1:
            lda1_lsa1.append(word)

    results.append(lda1_lsa1)
    names.append("LDA1 - LSA1 ")
    results.append(lda2_lsa5)
    names.append("LDA2 - LSA5 ")
    results.append(lda_topics_bow[2])
    names.append("LDA3 ")
    results.append(lda_topics_bow[3])
    names.append("LDA4 ")
    results.append(lda_topics_bow[4])
    names.append("LDA5 ")
    results.append(lsa_topics_bow[1])
    names.append("LSA2 ")
    results.append(lsa_topics_bow[2])
    names.append("LSA3 ")
    results.append(lsa_topics_bow[3])
    names.append("LSA4 ")

    # printing the topics and related words
    print(" ")
    print("Topics Table")
    print("|------------------------------------------------------------------------------------------|")
    print("| Topic | Significant Words |")
    print("|------------------------------------------------------------------------------------------|")
    for name, r in zip(names, results):
        print('| {} | {} |'.format(name, r))
        print("|------------------------------------------------------------------------------------------|")
    print(" ")
    print(" ")
    Purpose: create the term dictionary of our corpus and convert the list of documents (corpus) into a Document-Term Matrix
    Output : term dictionary and Document-Term Matrix
    """
    # Creating the term dictionary of our corpus, where every unique term is assigned an index.
    dictionary = corpora.Dictionary(doc_clean)
    # Converting the list of documents (corpus) into a Document-Term Matrix using the dictionary prepared above.
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
    # inputs for the LSA model
    return dictionary, doc_term_matrix


# LSA - Topic Modelling
# Apply the model to the LONGIT corpus
number_of_topics = 1
words = 100

document_list, titles = load_data("", './corpus_files/prod_all_txt/corpus_longit.csv')
clean_text = preprocess_data(document_list)
dictionary, doc_term_matrix = prepare_corpus(clean_text)
lsamodel = LsiModel(doc_term_matrix, num_topics=number_of_topics, id2word=dictionary)  # train model
print(lsamodel.print_topics(num_topics=number_of_topics, num_words=words))

output_file = open('./corpus_files/tm_csv/topic_modelling.csv', mode='w', encoding='utf8')
output_file.write("Topic modelling du corpus LONGIT : " + str(lsamodel.print_topics(num_topics=number_of_topics, num_words=words)))
output_file.close()
# 3. create bag of words
dictionary = gensim.corpora.Dictionary(processed_docs)
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

document_num = 30
bow_doc_x = bow_corpus[document_num]
print(bow_corpus[10])

# for i in range(len(bow_doc_x)):
#     print("Word {} (\"{}\") appears {} time.".format(bow_doc_x[i][0],
#                                                      dictionary[bow_doc_x[i][0]],
#                                                      bow_doc_x[i][1]))

lsamodel = LsiModel(bow_corpus, num_topics=7, id2word=dictionary)  # train model
print(lsamodel.print_topics(num_topics=7, num_words=10))

for idx, topic in lsamodel.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic))
    print("\n")

from gensim.test.utils import datapath

# Save model to disk.
temp_file = datapath("lsa_model_optimized")
lsamodel.save(temp_file)

# Load a potentially pretrained model from disk.
df_test_jokes = pd.read_csv("JokeText.csv")

if False:
def topicmodiling():
    l = []
    text = ''
    for i in range(len(dfs)):
        for j in dfs[i]:
            if j == '\n':
                j = ' '
                text = text + j
            else:
                text = text + j
        l.append(text)
        text = ''
    for i in l:
        text = text + i + "\n"

    nlp = English()
    doc = nlp(text)
    texts, article = [], []
    for w in doc:
        # if it's not a stop word, punctuation mark, number or e-mail, add it to our article
        if w.is_stop == False and w.is_punct == False and w.like_num == False and w.like_email == False:
            # we add the lemmatized version of the word
            article.append(w.lemma_)
        # if it's a new line, it means we're onto our next document
        if w.text == '\n':
            texts.append(article)
            article = []

    bigram = gensim.models.Phrases(texts)
    texts = [bigram[line] for line in texts]
    for i in texts:
        for j in i:
            if j == '\n':
                i.remove(j)

    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    dictionary.token2id

    lsimodel = LsiModel(corpus=corpus, num_topics=5, id2word=dictionary)
    a = lsimodel.show_topics(num_topics=5)  # showing only the top 5 topics

    b = []
    for i in range(0, len(a)):
        b.append(a[i][1].split('+'))
    k = []
    for i in range(0, len(b)):
        k.append(b[i][0:5])

    top1 = []
    for i in range(0, 5):
        top1.append(k[0][i].split('*'))
    top2 = []
    for i in range(0, 5):
        top2.append(k[1][i].split('*'))
    top3 = []
    for i in range(0, 5):
        top3.append(k[2][i].split('*'))
    top4 = []
    for i in range(0, 5):
        top4.append(k[3][i].split('*'))

    df1 = DataFrame(top1, columns=['Topic 1 weight', 'Topic 1 words'])
    df2 = DataFrame(top2, columns=['Topic 2 weight', 'Topic 2 words'])
    df3 = DataFrame(top3, columns=['Topic 3 weight', 'Topic 3 words'])
    df4 = DataFrame(top4, columns=['Topic 4 weight', 'Topic 4 words'])
    result = pd.concat([df1, df2, df3, df4], axis=1)
    for col in result.columns:
        result[col] = result[col].str.replace('"', '')
        result[col] = result[col].str.replace('-', '')
    return result
def create_LSI(corpus, dictionary, num_topics):
    return LsiModel(corpus, num_topics=num_topics, id2word=dictionary)
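A minimal, self-contained usage sketch for a helper like this; the toy documents and variable names below are illustrative and not from the original project:

from gensim.corpora import Dictionary
from gensim.models import LsiModel


def create_LSI(corpus, dictionary, num_topics):
    return LsiModel(corpus, num_topics=num_topics, id2word=dictionary)


# Toy corpus: two tiny "documents", already tokenized.
docs = [["latent", "semantic", "indexing"], ["singular", "value", "decomposition"]]
dictionary = Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]

lsi = create_LSI(corpus, dictionary, num_topics=2)
print(lsi.print_topics(num_topics=2, num_words=3))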
def process_page(all_documents, order_text, unorder_text, order_list, unorder_list, ocr_values, page_ocr):
    # Count n-gram frequencies and calculate cosine similarity between the two docs.
    counts = CountVectorizer(ngram_range=(1, 5))
    counts_matrix = counts.fit_transform(all_documents)
    cos = cosine_similarity(counts_matrix[0:1], counts_matrix)
    # print('Count Vectorizer', cos[0][1])
    ocr_values.append(cos[0][1])

    # Calculate tf-idf cosine similarity (nltk or spacy text the same)
    tokenize = lambda doc: doc.lower().split(" ")
    tfidf = TfidfVectorizer(norm='l2', min_df=0, use_idf=True, smooth_idf=False,
                            sublinear_tf=True, tokenizer=tokenize, ngram_range=(1, 5))
    tfidf_matrix = tfidf.fit_transform(all_documents)
    cos = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix)
    # print('TF-IDF Vectorizer', cos[0][1])
    ocr_values.append(cos[0][1])

    # # Calculate similarity using GLOVE and SPACY
    # order_doc = nlp(order_text)
    # unorder_doc = nlp(unorder_text)
    # sim_doc = order_doc.similarity(unorder_doc)
    # # print('Spacy GLOVE', sim_doc)
    # # https://stats.stackexchange.com/questions/304217/how-is-the-similarity-method-in-spacy-computed
    # ocr_values.append(sim_doc)

    # Calculate the Jaccard ratio. Takes lists of tokens.
    jac = 1 - distance.jaccard(order_list, unorder_list)
    # print('Jaccard', jac)
    ocr_values.append(jac)

    # Use gensim's similarity matrix and LSI to calculate the cosine.
    # https://radimrehurek.com/gensim/tut3.html
    all_tokens = [order_list, unorder_list]
    dictionary = Dictionary(all_tokens)
    corpus = [dictionary.doc2bow(text) for text in all_tokens]
    lsi = LsiModel(corpus, id2word=dictionary, num_topics=2)
    sim = MatrixSimilarity(lsi[corpus])
    lsi_cos = [t[1][1] for t in list(enumerate(sim))]
    lsi_cos = lsi_cos[0]
    # print('LSI', lsi_cos)
    ocr_values.append(lsi_cos)

    # align = align_pages(order_text, unorder_text)
    # # print('smw', align)
    # ocr_values.append(align)

    # print(ocr_values)
    if os.path.isfile(page_ocr):
        final_metrics = pd.read_csv(page_ocr)
        ocr_values.append(datetime.date.today())
        final_metrics.loc[len(final_metrics.index)] = ocr_values
        final_metrics.to_csv(page_ocr, index=False)
    else:
        ocr_values.append(datetime.date.today())
        cols = [
            'first_issue_date', 'first_page_number', 'second_issue_date',
            'second_page_number', 'countsvec_cos', 'tfidfvec_cos',
            'jaccard_sim', 'lsi_cos', 'date_run'
        ]
        final_df = pd.DataFrame([ocr_values], columns=cols)
        final_df.to_csv(page_ocr, index=False)
def topic_analysis(corpus, dictionary, models_path, technique):
    import uuid
    uuid = str(uuid.uuid4())
    print("[BLOCK] Starting models for context")
    sys.stdout.flush()

    if technique == "all" or technique == "hdp":
        t1 = time()
        # HDP model
        model = HdpModel(corpus, id2word=dictionary)
        model.save("%s/hdp_%s" % (models_path, uuid))
        del model
        t2 = time()
        print("[BLOCK] Training time for HDP model: %s" % (round(t2 - t1, 2)))
        sys.stdout.flush()

    if technique == "all" or technique == "ldap":
        t1 = time()
        # Parallel LDA model
        model = LdaMulticore(corpus, id2word=dictionary, num_topics=100, workers=23, passes=20)
        model.save("%s/lda_parallel_%s" % (models_path, uuid))
        del model
        t2 = time()
        print("[BLOCK] Training time for LDA multicore: %s" % (round(t2 - t1, 2)))
        sys.stdout.flush()

    if technique == "all" or technique == "lsa":
        t1 = time()
        # LSA model
        model = LsiModel(corpus, id2word=dictionary, num_topics=400)
        model.save("%s/lsa_%s" % (models_path, uuid))
        del model
        t2 = time()
        print("[BLOCK] Training time for LSA: %s" % (round(t2 - t1, 2)))
        sys.stdout.flush()

    if technique == "all" or technique == "ldao":
        t1 = time()
        # Online LDA model
        model = LdaModel(corpus, id2word=dictionary, num_topics=100, update_every=1, chunksize=10000, passes=5)
        model.save("%s/lda_online_%s" % (models_path, uuid))
        t2 = time()
        print("[BLOCK] Training time for LDA online: %s" % (round(t2 - t1, 2)))
        sys.stdout.flush()

    if technique == "all" or technique == "lda":
        t1 = time()
        # Offline LDA model
        model = LdaModel(corpus, id2word=dictionary, num_topics=100, update_every=0, passes=20)
        model.save("%s/lda_offline_%s" % (models_path, uuid))
        del model
        t2 = time()
        print("[BLOCK] Training time for LDA offline: %s" % (round(t2 - t1, 2)))
        sys.stdout.flush()
text += text_p[i].text
text = text.lower()

# tokenize text into sentences
sentences = sent_tokenize(text)
text_length = find_text_length(sentences)
tokens = tokenize(text)

# create dictionary of tokens
dictionary = corpora.Dictionary(tokens)
sent_term_matrix = [dictionary.doc2bow(sentence) for sentence in tokens]

# Gensim's LsiModel performs truncated SVD such that the dimension of S = num_topics;
# if num_topics is not specified, it performs SVD with gensim's default number of topics
lsamodel = LsiModel(sent_term_matrix, id2word=dictionary)

# Grab matrix V from the SVD, A = U S V^T
V = corpus2dense(lsamodel[sent_term_matrix], len(lsamodel.projection.s)).T / lsamodel.projection.s

# Output the sentences with the longest vector lengths, no repeats
lengths = find_length(lsamodel.projection.s, V)
if len(sentences) < 5:
    num_sentences = len(sentences)
else:
    num_sentences = 5
indices = find_indices(lengths, num_sentences)  # number of sentences printed = 5

scores = []  # rouge scores
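A small hedged sketch (toy data and names chosen purely for illustration) of the document-vector recovery used above: gensim's LsiModel keeps U and S in `projection`, and applying the model to the corpus yields the columns of S·V^T per document, so dividing by S recovers rows of V:

import numpy as np
from gensim.corpora import Dictionary
from gensim.models import LsiModel
from gensim.matutils import corpus2dense

# Toy "sentences", already tokenized; purely illustrative data.
sentences = [["cats", "like", "milk"], ["dogs", "like", "bones"], ["cats", "and", "dogs"]]
dictionary = Dictionary(sentences)
bow = [dictionary.doc2bow(s) for s in sentences]

lsi = LsiModel(bow, id2word=dictionary, num_topics=2)

# lsi[bow] yields each document in LSI space, i.e. columns of S * V^T;
# corpus2dense turns that sparse stream into a (num_topics, num_documents) matrix.
sv_t = corpus2dense(lsi[bow], len(lsi.projection.s))
V = sv_t.T / lsi.projection.s   # divide out the singular values to get V's rows
print(V.shape)                  # (num_documents, num_topics)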
corpus = MyCorpus(test_data_dir)  # create a dictionary
print(corpus)

# for vector in corpus:  # convert each document to a bag-of-words vector
#     print(vector)

tfidf = TfidfModel(corpus)
# print(tfidf[some_doc])

topics = 20
num_clusters = 8
passes = 1

print("Create LSI model")
lsi_model = LsiModel(corpus, id2word=corpus.dictionary, num_topics=topics)
corpus_lsi = lsi_model[corpus]

print("Create LDA model")
lda_model = LdaModel(corpus, id2word=corpus.dictionary, num_topics=topics, passes=passes)
corpus_lda = lda_model[corpus]

print("Done creating models")

# print("*********************")
# print("\n\nPrint LSI model\n")
# topic_id = 0
# for topic in lsi_model.show_topics(num_words=5):
def compute_lsi(self, num_topics=None):
    lsi = LsiModel(self.wiki_tfidf_corpus, num_topics=num_topics, id2word=self.wiki_dict)
    return lsi
# for item in docs:
#     corpus.append(list(word_tokenize(reuters.raw(item))))
# print(corpus[3])
# tfidf = TfidfModel(corpus)
# print(tfidf[doc[0]])
# tfidf.save('/tmp/foo.tfidf_model')

documents = [tokenize(reuters.raw(docs[0]))]  # for file_id in docs[0]
dictionary = Dictionary(documents)

# for item in docs[0]:
topics.append(reuters.categories(docs[0]))

corpus = [dictionary.doc2bow(d) for d in documents]
tfidf_model = TfidfModel(corpus, id2word=dictionary)
tfidf_values = tfidf_model[corpus]
# dict(tfidf_model[dictionary.doc2bow(tokenize(reuters.raw(docs[0])))])

# print tfidf_values[dictionary.token2id['year']]        # 0.0367516096888
# print tfidf_values[dictionary.token2id['following']]   # 0.0538505795815
# print tfidf_values[dictionary.token2id['provided']]    # 0.0683210467787
# print tfidf_values[dictionary.token2id['structural']]  # 0.0945807226371
# print tfidf_values[dictionary.token2id['japanese']]    # 0.107960637598
# print tfidf_values[dictionary.token2id['downtrend']]   # 0.122670341446

# print(documents[2])
# print(corpus[2])

lsi = LsiModel(tfidf_values, num_topics=len(topics))
print(lsi[tfidf_values])  # project some documents into LSI space
# In[6]:

for i in range(len(cols)):
    dictionary.add_documents([[cols[i]]])
    curr_id = dictionary.doc2idx([cols[i]])[0]
    print(i)
    for j in range(len(Patients)):
        corpus_tfidf[j].append((curr_id, data.at[j, i + 1]))

# In[7]:

lsi = LsiModel(corpus_tfidf, id2word=dictionary)
lsi_index = MatrixSimilarity(lsi[corpus_tfidf])
sims['files']['LSI'] = np.array([lsi_index[lsi[corpus_tfidf[i]]] for i in range(len(corpus))])

# In[8]:

sims

# In[9]:

ind = np.unravel_index(np.argmax(sims['files']['LSI'], axis=None), sims['files']['LSI'].shape)
data_lemmatized = make_bigrams(data_words)
id2word = corpora.Dictionary(data_lemmatized)
texts = data_lemmatized
corpus = [id2word.doc2bow(text) for text in texts]

lda_model = LdaModel.load('lda_model_full2')

for c in lda_model[corpus[5:8]]:
    print("Document Topics      : ", c[0])
    print("Word id, Topics      : ", c[1][:3])
    print("Phi Values (word id) : ", c[2][:2])
    print("Word, Topics         : ", [(dct[wd], topic) for wd, topic in c[1][:2]])
    print("Phi Values (word)    : ", [(dct[wd], topic) for wd, topic in c[2][:2]])
    print("------------------------------------------------------\n")

lsi_model = LsiModel(corpus=corpus, id2word=dct, num_topics=7, decay=0.5)
pprint(lsi_model.print_topics(-1))


def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    sent_topics_df = pd.DataFrame()
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(
                    pd.Series([int(topic_num), round(prop_topic, 4), topic_keywords]),
                    ignore_index=True)
            else:
def recommend(docs_path, dict_path, use_fos_annot=False, pp_dict_path=None,
              np_dict_path=None, lda_preselect=False,
              combine_train_contexts=True):
    """ Recommend """

    test = []
    train_mids = []
    train_texts = []
    train_foss = []
    train_ppann = []
    train_nps = []
    foss = []
    tmp_bag = []
    adjacent_cit_map = {}

    if pp_dict_path and False:
        prind('loading predpatt dictionary')
        pp_dictionary = corpora.Dictionary.load(pp_dict_path)
        pp_num_unique_tokens = len(pp_dictionary.keys())
        use_predpatt_model = True
        if not combine_train_contexts:
            prind(('usage of predpatt model is not implemented for not'
                   'combining train contexts.\nexiting.'))
            sys.exit()
    else:
        use_predpatt_model = False
        pp_dictionary = None

    if np_dict_path:
        prind('loading noun phrase dictionary')
        np_dictionary = corpora.Dictionary.load(np_dict_path)
        np_num_unique_tokens = len(np_dictionary.keys())
        use_noun_phrase_model = True
    else:
        use_noun_phrase_model = False
        np_dictionary = None

    prind('checking file length')
    num_lines = sum(1 for line in open(docs_path))

    # # for MAG eval
    # mag_id2year = {}
    # with open('MAG_CS_en_year_map.csv') as f:
    #     for line in f:
    #         pid, year = line.strip().split(',')
    #         mag_id2year[pid] = int(year)
    # # /for MAG eval

    prind('train/test splitting')
    with open(docs_path) as f:
        for idx, line in enumerate(f):
            if idx == 0:
                tmp_bag_current_mid = line.split('\u241E')[0]
            if idx % 10000 == 0:
                prind('{}/{} lines'.format(idx, num_lines))
            cntxt_foss = []
            cntxt_ppann = []
            cntxt_nps = []
            # handle varying CSV formats
            vals = line.split('\u241E')
            if use_noun_phrase_model:
                cntxt_nps = vals[-1]
                if '\u241D' in cntxt_nps:
                    # includes NP<marker> variant
                    np_all, np_marker = cntxt_nps.split('\u241D')
                    cntxt_nps = np_marker  # mby use both for final eval
                cntxt_nps = [np for np in cntxt_nps.strip().split('\u241F')]
                vals = vals[:-1]
            if len(vals) == 4:
                mid, adjacent, in_doc, text = vals
            elif len(vals) == 5:
                if use_predpatt_model:
                    mid, adjacent, in_doc, text, pp_annot_json = vals
                else:
                    mid, adjacent, in_doc, text, fos_annot = vals
            elif len(vals) == 6:
                mid, adjacent, in_doc, text, fos_annot, pp_annot_json = vals
            else:
                prind('input file format can not be parsed\nexiting')
                sys.exit()
            if len(vals) in [5, 6] and use_fos_annot:
                cntxt_foss = [
                    f.strip() for f in fos_annot.split('\u241F')
                    if len(f.strip()) > 0
                ]
                foss.extend(cntxt_foss)
            if use_predpatt_model:
                if '\u241F' in pp_annot_json:
                    # includes alternative version
                    ppann, ppann_alt = pp_annot_json.split('\u241F')
                    pp_annot_json = ppann
                cntxt_ppann = json.loads(pp_annot_json)
            # create adjacent map for later use in eval
            if mid not in adjacent_cit_map:
                adjacent_cit_map[mid] = []
            if len(adjacent) > 0:
                adj_cits = adjacent.split('\u241F')
                for adj_cit in adj_cits:
                    if adj_cit not in adjacent_cit_map[mid]:
                        adjacent_cit_map[mid].append(adj_cit)
            # fill texts
            if mid != tmp_bag_current_mid or idx == num_lines - 1:
                # tmp_bag now contains all lines sharing ID tmp_bag_current_mid
                num_contexts = len(tmp_bag)
                sub_bags_dict = {}
                for item in tmp_bag:
                    item_in_doc = item[0]
                    item_text = item[1]
                    item_foss = item[2]
                    item_ppann = item[3]
                    item_nps = item[4]
                    if item_in_doc not in sub_bags_dict:
                        sub_bags_dict[item_in_doc] = []
                    sub_bags_dict[item_in_doc].append(
                        [item_text, item_foss, item_ppann, item_nps])
                if len(sub_bags_dict) < 2:
                    # can't split, reset bag, next
                    tmp_bag = []
                    tmp_bag_current_mid = mid
                    continue
                order = sorted(sub_bags_dict,
                               key=lambda k: len(sub_bags_dict[k]),
                               reverse=True)
                # ↑ keys for sub_bags_dict, ordered from largest bag to smallest
                min_num_train = math.floor(num_contexts * 0.8)
                train_tups = []
                test_tups = []
                for jdx, sub_bag_key in enumerate(order):
                    sb_tup = sub_bags_dict[sub_bag_key]
                    # if sub_bag_key[1:3] == '06':  # time split ACL
                    # if mag_id2year[sub_bag_key] > 2017:  # time split MAG
                    # if sub_bag_key[:2] == '17':  # time split arXiv
                    if len(train_tups) > min_num_train or jdx == len(order) - 1:
                        test_tups.extend(sb_tup)
                    else:
                        train_tups.extend(sb_tup)
                test.extend([
                    [
                        tmp_bag_current_mid,                             # mid
                        tup[0],                                          # text
                        tup[1],                                          # fos
                        sum_weighted_term_lists(tup[2], pp_dictionary),  # pp
                        tup[3]                                           # nps
                    ]
                    for tup in test_tups
                ])
                if combine_train_contexts:
                    # combine train contexts per cited doc
                    train_text_combined = ' '.join(tup[0] for tup in train_tups)
                    train_mids.append(tmp_bag_current_mid)
                    train_texts.append(train_text_combined.split())
                    train_foss.append(
                        [fos for tup in train_tups for fos in tup[1]])
                    train_ppann.append(
                        sum_weighted_term_lists(
                            sum([tup[2] for tup in train_tups], []),
                            pp_dictionary))
                    train_nps.append(
                        [np for tup in train_tups for np in tup[3]])
                else:
                    # don't combine train contexts per cited doc
                    for tup in train_tups:
                        train_mids.append(tmp_bag_current_mid)
                        train_texts.append(tup[0].split())
                        train_foss.append([fos for fos in tup[1]])
                        train_nps.append([np for np in tup[1]])
                # reset bag
                tmp_bag = []
                tmp_bag_current_mid = mid
            tmp_bag.append([in_doc, text, cntxt_foss, cntxt_ppann, cntxt_nps])

    prind('loading dictionary')
    dictionary = corpora.Dictionary.load(dict_path)
    num_unique_tokens = len(dictionary.keys())
    prind('building corpus')
    corpus = [dictionary.doc2bow(text) for text in train_texts]

    if use_fos_annot:
        prind('preparing FoS model')
        mlb = MultiLabelBinarizer()
        mlb.fit([foss])
        train_foss_matrix = mlb.transform(train_foss)
        train_foss_set_sizes = np.sum(train_foss_matrix, 1)

    prind('generating TFIDF model')
    tfidf = models.TfidfModel(corpus)
    prind('preparing similarities')
    index = similarities.SparseMatrixSimilarity(tfidf[corpus],
                                                num_features=num_unique_tokens)
    bm25 = BM25(corpus)
    average_idf = sum(map(lambda k: float(bm25.idf[k]),
                          bm25.idf.keys())) / len(bm25.idf.keys())

    if lda_preselect:
        orig_index = index.index.copy()
        prind('generating LDA/LSI model')
        lda = LsiModel(tfidf[corpus], id2word=dictionary, num_topics=100)
        prind('preparing similarities')
        lda_index = similarities.SparseMatrixSimilarity(
            lda[tfidf[corpus]], num_features=num_unique_tokens)

    if use_predpatt_model:
        prind('preparing claim similarities')
        pp_tfidf = models.TfidfModel(train_ppann)
        pp_index = similarities.SparseMatrixSimilarity(
            pp_tfidf[train_ppann], num_features=pp_num_unique_tokens)

    if use_noun_phrase_model:
        prind('preparing noun phrase similarities')
        np_corpus = [np_dictionary.doc2bow(nps) for nps in train_nps]
        np_index = similarities.SparseMatrixSimilarity(
            np_corpus, num_features=np_num_unique_tokens)

    # models: BoW, NP<marker>, Claim, Claim+BoW
    eval_models = [{'name': 'bow'}, {'name': 'np'}, {'name': 'claim'},
                   {'name': 'claim+bow'}]
    for mi in range(len(eval_models)):
        eval_models[mi]['num_cur'] = 0
        eval_models[mi]['num_top'] = 0
        eval_models[mi]['num_top_5'] = 0
        eval_models[mi]['num_top_10'] = 0
        eval_models[mi]['ndcg_sums'] = [0] * AT_K
        eval_models[mi]['map_sums'] = [0] * AT_K
        eval_models[mi]['mrr_sums'] = [0] * AT_K
        eval_models[mi]['recall_sums'] = [0] * AT_K

    prind('test set size: {}\n- - - - - - - -'.format(len(test)))
    for test_item_idx, tpl in enumerate(test):
        if test_item_idx > 0 and test_item_idx % 10000 == 0:
            save_results(docs_path, num_lines, len(test), eval_models,
                         suffix='_tmp')
        test_mid = tpl[0]
        # if test_mid not in train_mids:
        #     # not testable
        #     continue
        test_text = bow_preprocess_string(tpl[1])
        if use_fos_annot:
            test_foss_vec = mlb.transform([tpl[2]])
            dot_prods = train_foss_matrix.dot(
                test_foss_vec.transpose()).transpose()[0]
            with np.errstate(divide='ignore', invalid='ignore'):
                fos_sims = np.nan_to_num(dot_prods / train_foss_set_sizes)
            fos_sims_list = list(enumerate(fos_sims))
            fos_sims_list.sort(key=lambda tup: tup[1], reverse=True)
            fos_ranking = [s[0] for s in fos_sims_list]
            fos_boost = np.where(dot_prods >= dot_prods.max() - 1)[0].tolist()
            top_dot_prod = dot_prods[-1]
        if use_predpatt_model:
            pp_sims = pp_index[pp_tfidf[tpl[3]]]
            pp_sims_list = list(enumerate(pp_sims))
            pp_sims_list.sort(key=lambda tup: tup[1], reverse=True)
            pp_ranking = [s[0] for s in pp_sims_list]
        if use_noun_phrase_model:
            np_sims = np_index[np_dictionary.doc2bow(tpl[4])]
            np_sims_list = list(enumerate(np_sims))
            np_sims_list.sort(key=lambda tup: tup[1], reverse=True)
            np_ranking = [s[0] for s in np_sims_list]
        test_bow = dictionary.doc2bow(test_text)
        if lda_preselect:
            # pre-select in LDA/LSI space
            lda_sims = lda_index[lda[tfidf[test_bow]]]
            lda_sims_list = list(enumerate(lda_sims))
            lda_sims_list.sort(key=lambda tup: tup[1], reverse=True)
            lda_ranking = [s[0] for s in lda_sims_list]
            lda_picks = lda_ranking[:1000]
            index.index = orig_index[lda_picks]
        sims = index[tfidf[test_bow]]
        sims_list = list(enumerate(sims))
        sims_list.sort(key=lambda tup: tup[1], reverse=True)
        bow_ranking = [s[0] for s in sims_list]
        bm25_scores = list(enumerate(bm25.get_scores(test_bow, average_idf)))
        bm25_scores.sort(key=lambda tup: tup[1], reverse=True)
        bm25_ranking = [s[0] for s in bm25_scores]
        if lda_preselect:
            # translate back from listing in LDA/LSI pick subset to global listing
            bow_ranking = [lda_picks[r] for r in bow_ranking]
        if use_fos_annot:
            boost_ranking = fos_boost_ranking(bow_ranking, fos_boost,
                                              top_dot_prod)
        if not combine_train_contexts:
            seen = set()
            seen_add = seen.add
            final_ranking = [
                x for x in final_ranking
                if not (train_mids[x] in seen or seen_add(train_mids[x]))
            ]
        if use_predpatt_model:
            sims_comb = combine_simlists(sims, pp_sims, [2, 1])
            comb_sims_list = list(enumerate(sims_comb))
            comb_sims_list.sort(key=lambda tup: tup[1], reverse=True)
            comb_ranking = [s[0] for s in comb_sims_list]
        for mi in range(len(eval_models)):
            if mi == 0:
                final_ranking = bow_ranking
            elif mi == 1:
                final_ranking = np_ranking
            elif mi == 2:
                final_ranking = pp_ranking
            elif mi == 3:
                final_ranking = comb_ranking
            rank = len(bow_ranking)  # assign worst possible
            for idx, doc_id in enumerate(final_ranking):
                if train_mids[doc_id] == test_mid:
                    rank = idx + 1
                    break
                if idx >= 10:
                    break
            dcgs = [0] * AT_K
            idcgs = [0] * AT_K
            precs = [0] * AT_K
            num_rel_at = [0] * AT_K
            num_rel = 1 + len(adjacent_cit_map[test_mid])
            num_rel_at_k = 0
            for i in range(AT_K):
                relevant = False
                placement = i + 1
                doc_id = final_ranking[i]
                result_mid = train_mids[doc_id]
                if result_mid == test_mid:
                    relevance = 1
                    num_rel_at_k += 1
                    relevant = True
                elif result_mid in adjacent_cit_map[test_mid]:
                    relevance = .5
                    num_rel_at_k += 1
                    relevant = True
                else:
                    relevance = 0
                num_rel_at[i] = num_rel_at_k
                if relevant:
                    precs[i] = num_rel_at_k / placement
                denom = math.log2(placement + 1)
                dcg_numer = math.pow(2, relevance) - 1
                for j in range(i, AT_K):
                    dcgs[j] += dcg_numer / denom
                if placement == 1:
                    ideal_rel = 1
                elif placement <= num_rel:
                    ideal_rel = .5
                else:
                    ideal_rel = 0
                idcg_numer = math.pow(2, ideal_rel) - 1
                for j in range(i, AT_K):
                    # note this^ we go 0~9, 1~9, 2~9, ..., 9
                    idcgs[j] += idcg_numer / denom
            for i in range(AT_K):
                eval_models[mi]['ndcg_sums'][i] += dcgs[i] / idcgs[i]
                eval_models[mi]['map_sums'][i] += sum(precs[:i + 1]) / max(
                    num_rel_at[i], 1)
                if rank <= i + 1:
                    eval_models[mi]['mrr_sums'][i] += 1 / rank
                    eval_models[mi]['recall_sums'][i] += 1
            if rank == 1:
                eval_models[mi]['num_top'] += 1
            if rank <= 5:
                eval_models[mi]['num_top_5'] += 1
            if rank <= 10:
                eval_models[mi]['num_top_10'] += 1
            eval_models[mi]['num_cur'] += 1
        prind('- - - - - {}/{} - - - - -'.format(eval_models[0]['num_cur'],
                                                 len(test)))
        prind('#1: {}'.format(eval_models[0]['num_top']))
        prind('in top 5: {}'.format(eval_models[0]['num_top_5']))
        prind('in top 10: {}'.format(eval_models[0]['num_top_10']))
        prind('ndcg@5: {}'.format(eval_models[0]['ndcg_sums'][4] /
                                  eval_models[0]['num_cur']))
        prind('map@5: {}'.format(eval_models[0]['map_sums'][4] /
                                 eval_models[0]['num_cur']))
        prind('mrr@5: {}'.format(eval_models[0]['mrr_sums'][4] /
                                 eval_models[0]['num_cur']))
        prind('recall@5: {}'.format(eval_models[0]['recall_sums'][4] /
                                    eval_models[0]['num_cur']))

    for mi in range(len(eval_models)):
        eval_models[mi]['num_applicable'] = eval_models[mi]['num_cur']
        eval_models[mi]['ndcg_results'] = [
            sm / eval_models[mi]['num_cur']
            for sm in eval_models[mi]['ndcg_sums']
        ]
        eval_models[mi]['map_results'] = [
            sm / eval_models[mi]['num_cur']
            for sm in eval_models[mi]['map_sums']
        ]
        eval_models[mi]['mrr_results'] = [
            sm / eval_models[mi]['num_cur']
            for sm in eval_models[mi]['mrr_sums']
        ]
        eval_models[mi]['recall_results'] = [
            sm / eval_models[mi]['num_cur']
            for sm in eval_models[mi]['recall_sums']
        ]
    return eval_models, num_lines, len(test)
# %%
filepath = "./data/train_input.csv"
corpus = Sentences(filepath, loop=False)
dict = Dictionary(corpus, prune_at=DICTLENGTH)
dict.filter_extremes(no_below=2, no_above=0.8)

# %%
for i, bow in enumerate(TFIDF_corpus(corpus, dict)):
    print("---------%s----------" % (i, ))
    print(len(bow))
    print(bow)
    if i == 20:
        break

# %%
embed_size = SVDSIZE
lsi = LsiModel(TFIDF_corpus(corpus, dict), num_topics=embed_size)
comments_embd = lsi[TFIDF_corpus(corpus, dict)]

# %%
for i, embd in enumerate(comments_embd):
    print(i, len(embd))

# %%
labels = pd.read_csv('./data/train_input.csv', usecols=['label', ], squeeze=True)
x = np.zeros((NUMSAMPLES, embed_size))
y = np.zeros((NUMSAMPLES, 1))
count = 0
for i, (embed, l) in enumerate(zip(comments_embd, labels)):
    hidden = [item[1] for item in embed]
FILENAME = 'panda_corpus.txt'
panda_g = corpus(FILENAME)
si = SenSimi(panda_g)
panda_raw = si.reconstructdata()
print(type(panda_raw))
bowlist = si.bowcorpus(panda_raw)
print(bowlist[1])
panda_tfidfmodel = si.tfidfmodel(bowlist)
panda_tfidf = panda_tfidfmodel[bowlist]

# FIXME: cannot build the index from the full corpus, it exceeds the numpy.array limit
# FIXME: if only part of the corpus is used, the sentences being compared are not covered
#        by the feature vectors (basis) of the index matrix
print('using lsi model...')
panda_lsi = LsiModel(corpus=panda_tfidf, id2word=si.word_dict, num_topics=300)
index = similarities.MatrixSimilarity(panda_lsi[panda_tfidf])

good = ['可爱', '萌', '喜欢', '国宝', '神奇']
good_bow = si.word_dict.doc2bow(good)
good_tfidf = panda_tfidfmodel[good_bow]
good_lsi = panda_lsi[good_tfidf]
simi = index[good_lsi]
simi_list = list(simi)
print(max(simi_list))
where = simi_list.index(max(simi_list))
print(panda_raw[where])
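The FIXME notes that MatrixSimilarity keeps the whole index in one in-memory numpy array. A common workaround (a hedged sketch, not from the original code; the shard prefix path is an assumption) is gensim's disk-sharded Similarity class:

from gensim import similarities

# Disk-backed, sharded index: shards are written next to the given prefix path,
# so the full corpus does not have to fit in a single in-memory array.
index = similarities.Similarity('/tmp/panda_lsi_index',
                                panda_lsi[panda_tfidf],
                                num_features=300)  # num_features = LSI topic count
simi = index[good_lsi]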
nltk.download('punkt')

clean_text = re.sub(r'%(.*)\n', '', file_contents)
clean_text = re.sub(r'\s+', ' ', clean_text)
words = nltk.word_tokenize(clean_text)

n = 5
ngrams = {}
for i in range(len(words) - n):
    gram = ' '.join(words[i:i + n])
    if gram not in ngrams.keys():
        ngrams[gram] = []
    ngrams[gram].append(words[i + n])

from gensim import corpora
from gensim.models import TfidfModel
from gensim.models import LsiModel
from gensim.similarities import MatrixSimilarity

dictionary = corpora.Dictionary(clean_text)
corpus_gensim = [dictionary.doc2bow(doc) for doc in clean_text]
tfidf_text = TfidfModel(corpus_gensim)
corpus_tfidf_text = tfidf_text[corpus_gensim]
lsi_text = LsiModel(corpus_tfidf_text, id2word=dictionary, num_topics=10)
lsi_index_text = MatrixSimilarity(lsi_text[corpus_tfidf_text])
sims['clean_text']['LSI'] = np.array([
    lsi_index_text[lsi_text[corpus_tfidf_text[i]]]
    for i in range(len(clean_text))
])
def main():
    # --- arguments ---
    (dataset, version, _, _, nbs_topics, _, _,
     cache_in_memory, use_callbacks, tfidf, args) = parse_args()

    model_class = 'LSImodel'
    _split_ = "_split" if use_callbacks else ""

    data_name = f'{dataset}_{version}_{tfidf}'
    data_dir = join(LDA_PATH, version, tfidf)

    # --- logging ---
    logger = init_logging(name=data_name, basic=False, to_stdout=True, to_file=True)
    logg = logger.info
    log_args(logger, args)

    # --- load dict ---
    logg('Loading dictionary')
    data_file = join(data_dir, f'{data_name}.dict')
    dictionary = Dictionary.load(data_file)

    # --- load corpus ---
    logg('Loading corpus')
    data_file = join(data_dir, f'{data_name}.mm')
    corpus = MmCorpus(data_file)
    if cache_in_memory:
        logg('Reading corpus into RAM')
        corpus = list(corpus)
    if use_callbacks:
        train, test = split_corpus(corpus)
    else:
        train, test = corpus, []
    logg(f'size of... train_set={len(train)}, test_set={len(test)}')

    # --- train ---
    topn = 20
    columns = [f'term{x}' for x in range(topn)] + [f'weight{x}' for x in range(topn)]
    for nbtopics in nbs_topics:
        gc.collect()
        logg(f'Running {model_class} with {nbtopics} topics')
        model = LsiModel(corpus=train, num_topics=nbtopics, id2word=dictionary)

        model_dir = join(LSI_PATH, version, tfidf, f'{_split_}')
        model_path = join(model_dir, f'{dataset}_{model_class}{_split_}_{nbtopics}')
        if not exists(model_dir):
            makedirs(model_dir)

        # --- save topics ---
        topics = model.show_topics(num_words=topn, formatted=False)
        topics = [list(chain(*zip(*topic[1]))) for topic in topics]
        topics = pd.DataFrame(topics, columns=columns)
        logg(f'Saving topics to {model_path}.csv')
        topics.to_csv(f'{model_path}.csv')

        # --- save model ---
        logg(f'Saving model to {model_path}')
        model.save(model_path)

    # --- done ---
    logg(f'\n'
         f'----- end -----\n'
         f'----- {dataset.upper()} -----\n'
         f'{"#" * 50}\n')
        continue
    time1 = time.time()
    print('Training-set text vectors generated. Starting model training......')
    num_topics = 2 + int(len(corpus) / 250)
    if num_topics >= 20:
        num_topics = 10
    num_words = (num_topics - 2) * 2 + 10
    print('This department has %d articles in total; splitting into %d topics with %d keywords each......'
          % (len(corpus), num_topics, num_words))
    # ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=50)
    # result = ldamodel.print_topics(num_topics=num_topics, num_words=num_words)
    # doc_lda = ldamodel[corpus]
    model = LsiModel(
        corpus,
        id2word=dictionary,
        num_topics=num_topics,
    )
    doc_lda = model[corpus]
    result = model.print_topics(num_topics=num_topics, num_words=num_words)
    time2 = time.time()
    print('Model training time:', time2 - time1)
    print('LSI model training finished. Inserting into database......')
    for n in range(len(doc_lda)):
        Topic = doc_lda[n]
        if len(Topic) == 0:
            prams = (institution_paper_list[n][0], institution + "其他",
                     json.dumps({}, ensure_ascii=False),
                     json.dumps({}, ensure_ascii=False))
            sql = 'insert into lda2 values(%s,%s,%s,%s)'
# Learn an LSI model from the tf-idf vectors.
if True:
    # The number of topics to use.
    num_topics = 300

    # Load the tf-idf corpus back from disk.
    corpus_tfidf = MmCorpus('./data/corpus_tfidf.mm')

    # Train LSI
    print('\nLearning LSI model from the tf-idf vectors...')
    t0 = time.time()

    # Build the LSI model
    # This took 2 hrs. and 7 min. on my machine.
    model_lsi = LsiModel(corpus_tfidf, num_topics=num_topics, id2word=dictionary)

    print('    Building LSI model took %s' % formatTime(time.time() - t0))

    # Write out the LSI model to disk.
    # The LSI model is big but not as big as the corpus.
    # The largest piece is the projection matrix:
    #   100,000 words x 300 topics x 8 bytes per val x (1MB / 2^20 bytes) = ~229MB
    # This is saved as `lsi.lsi_model.projection.u.npy`
    model_lsi.save('./data/lsi.lsi_model')

# ========= STEP 6: Convert articles to LSI with index ========
# Transform corpus to LSI space and index it
if True:
    print('\nApplying LSI model to all vectors...')
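The snippet breaks off at the start of step 6; a hedged sketch of what applying the saved model and indexing the corpus typically looks like in gensim (paths mirror the ones above, the index file name is an assumption):

from gensim.models import LsiModel
from gensim.corpora import MmCorpus
from gensim.similarities import MatrixSimilarity

# Load the artifacts saved in the previous steps (paths as used above).
model_lsi = LsiModel.load('./data/lsi.lsi_model')
corpus_tfidf = MmCorpus('./data/corpus_tfidf.mm')

# Project every tf-idf vector into the 300-dimensional LSI space and index it
# for fast cosine-similarity queries.
index = MatrixSimilarity(model_lsi[corpus_tfidf], num_features=model_lsi.num_topics)
index.save('./data/lsi_index.index')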
# Transform arbitrary documents by getting them into the same BoW vector space created by your training corpus
documents = ["Some iterable", "containing multiple", "documents", "..."]
bow_documents = (dictionary.doc2bow(tokenize_func(document))
                 for document in documents)  # use a generator expression because...
logent_documents = logent_transformation[bow_documents]
# ...transformation is done during iteration of documents using generators, so this uses constant memory

### Chained transformations
# This builds a new corpus from iterating over documents of bow_corpus as transformed to log entropy representation.
# Will also take many hours if bow_corpus is the Wikipedia corpus created above.
logent_corpus = MmCorpus(corpus=logent_transformation[bow_corpus])

# Creates the LSI transformation model from the log entropy corpus representation. Takes several hours with the Wikipedia corpus.
lsi_transformation = LsiModel(corpus=logent_corpus, id2word=dictionary, num_topics=400)

# Alternative way of performing the same operation as above, but with implicit chaining
# lsi_transformation = LsiModel(corpus=logent_transformation[bow_corpus], id2word=dictionary,
#                               num_topics=400)

# Can persist transformation models, too.
logent_transformation.save("logent.model")
lsi_transformation.save("lsi.model")

### Similarities (the best part)
from gensim.similarities import Similarity

# This index corpus consists of what you want to compare future queries against
index_documents = [
linked = linkage(sims['texts']['LDA'], 'complete')
plt.figure(figsize=(10, 20))
plt.title('LDA Clustering Dendrogram')
dendrogram(linked,
           orientation='left',
           labels=Index_of_files,
           distance_sort='descending',
           show_leaf_counts=True)
plt.show()

################################################# Run LSI Model ############################################
# you can change the number of topics (num_topics=20) and see different results
lsi = LsiModel(corpus_tfidf, id2word=dictionary, num_topics=20)
lsi_index = MatrixSimilarity(lsi[corpus_tfidf])
sims['texts']['LSI'] = np.array(
    [lsi_index[lsi[corpus_tfidf[i]]] for i in range(len(corpus))])

########################### visualization of the LSI algorithm with a dendrogram #####################
# you can see the result ===> Figure_3.png
linked = linkage(sims['texts']['LSI'], 'complete')
plt.figure(figsize=(10, 20))
plt.title('LSI Clustering Dendrogram')
dendrogram(linked,
           orientation='left',
           labels=Index_of_files,
           distance_sort='descending',
           show_leaf_counts=True)
texts = [bigram[line] for line in texts]

# In[10]:

dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# We're now done with a very important part of any text analysis - the data cleaning and setting up of the corpus.
# It must be kept in mind that we created the corpus the way we did because that's how gensim requires it - most
# algorithms still require one to clean the data set the way we did, by removing stop words and numbers, adding
# the lemmatized form of the word, and using bigrams.

# ### LSI
#
# LSI stands for Latent Semantic Indexing - it is a popular information retrieval method which works by decomposing
# the original matrix of words to maintain key topics. Gensim's implementation uses an SVD.

# In[11]:

lsimodel = LsiModel(corpus=corpus, num_topics=10, id2word=dictionary)

# In[12]:

lsimodel.show_topics(num_topics=5)  # Showing only the top 5 topics

# ### HDP
#
# HDP, the Hierarchical Dirichlet Process, is an unsupervised topic model which figures out the number of topics on its own.

# In[13]:

hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)

# In[14]:
def main(chosen_model_no=0, num_items_displayed=10, use_spacy=False, use_soft_cosine_similarity=False,
         num_topics=None, no_below=5, no_above=0.5, normalize_vectors=False):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    if num_topics is None:
        num_topics = 100

    possible_model_names = [
        'tf_idf',                 # 0
        'lsi_bow', 'lsi_tf_idf',  # 1, 2
        'rp_bow', 'rp_tf_idf',    # 3, 4
        'lda_bow', 'lda_tf_idf',  # 5, 6
        'hdp_bow', 'hdp_tf_idf',  # 7, 8
        'word2vec',               # 9
    ]
    chosen_model_name = possible_model_names[chosen_model_no]
    print(chosen_model_name)

    game_names, _ = load_game_names(include_genres=False, include_categories=False)

    steam_tokens = load_tokens()

    nlp = spacy.load('en_core_web_lg')

    documents = list(steam_tokens.values())

    dct = Dictionary(documents)
    print(len(dct))
    dct.filter_extremes(no_below=no_below, no_above=no_above)
    print(len(dct))

    corpus = [dct.doc2bow(doc) for doc in documents]

    # Pre-processing
    pre_process_corpus_with_tf_idf = chosen_model_name.endswith('_tf_idf')

    tfidf_model = TfidfModel(corpus, id2word=dct, normalize=normalize_vectors)

    if pre_process_corpus_with_tf_idf:
        # Caveat: the leading underscore is important. Do not use this pre-processing if the chosen model is Tf-Idf!
        print('Corpus as Tf-Idf')
        pre_processed_corpus = tfidf_model[corpus]
    else:
        print('Corpus as Bag-of-Words')
        pre_processed_corpus = corpus

    # Model
    model = None
    wv = None
    index2word_set = None

    if chosen_model_name == 'tf_idf':
        print('Term Frequency * Inverse Document Frequency (Tf-Idf)')
        model = tfidf_model
    elif chosen_model_name.startswith('lsi'):
        print('Latent Semantic Indexing (LSI/LSA)')
        model = LsiModel(pre_processed_corpus, id2word=dct, num_topics=num_topics)
    elif chosen_model_name.startswith('rp'):
        print('Random Projections (RP)')
        model = RpModel(pre_processed_corpus, id2word=dct, num_topics=num_topics)
    elif chosen_model_name.startswith('lda'):
        print('Latent Dirichlet Allocation (LDA)')
        model = LdaModel(pre_processed_corpus, id2word=dct, num_topics=num_topics)
    elif chosen_model_name.startswith('hdp'):
        print('Hierarchical Dirichlet Process (HDP)')
        model = HdpModel(pre_processed_corpus, id2word=dct)
    elif chosen_model_name == 'word2vec':
        use_a_lot_of_ram = False
        if use_a_lot_of_ram:
            model = None
            print('Loading Word2Vec based on Google News')
            # Warning: this takes a lot of time and uses a ton of RAM!
            wv = KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin.gz', binary=True)
        else:
            if use_spacy:
                print('Using Word2Vec with spaCy')
            else:
                print('Training Word2Vec')
                model = Word2Vec(documents)
                wv = model.wv
        if not use_spacy:
            wv.init_sims(replace=normalize_vectors)
            index2word_set = set(wv.index2word)
    else:
        print('No model specified.')
        model = None

    if chosen_model_name != 'word2vec':
        if not use_soft_cosine_similarity:
            index = MatrixSimilarity(model[pre_processed_corpus], num_best=10, num_features=len(dct))
        else:
            w2v_model = Word2Vec(documents)
            similarity_index = WordEmbeddingSimilarityIndex(w2v_model.wv)
            similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dct, tfidf_model, nonzero_limit=100)
            index = SoftCosineSimilarity(model[pre_processed_corpus], similarity_matrix)
    else:
        index = None

    query_app_ids = load_benchmarked_app_ids(append_hard_coded_app_ids=True)

    app_ids = list(int(app_id) for app_id in steam_tokens.keys())

    matches_as_app_ids = []

    for query_count, query_app_id in enumerate(query_app_ids):
        print('[{}/{}] Query appID: {} ({})'.format(query_count + 1, len(query_app_ids),
                                                    query_app_id, get_app_name(query_app_id, game_names)))

        query = steam_tokens[str(query_app_id)]

        if use_spacy:
            spacy_query = Doc(nlp.vocab, query)
        else:
            spacy_query = None

        if chosen_model_name != 'word2vec':
            vec_bow = dct.doc2bow(query)
            if pre_process_corpus_with_tf_idf:
                pre_preoccessed_vec = tfidf_model[vec_bow]
            else:
                pre_preoccessed_vec = vec_bow
            vec_lsi = model[pre_preoccessed_vec]
            sims = index[vec_lsi]

            if use_soft_cosine_similarity:
                sims = enumerate(sims)

            similarity_scores_as_tuples = [(str(app_ids[i]), sim) for (i, sim) in sims]
            similarity_scores = reformat_similarity_scores_for_doc2vec(similarity_scores_as_tuples)
        else:
            if use_spacy:
                similarity_scores = {}
                for app_id in steam_tokens:
                    reference_sentence = steam_tokens[app_id]
                    spacy_reference = Doc(nlp.vocab, reference_sentence)
                    similarity_scores[app_id] = spacy_query.similarity(spacy_reference)
            else:
                query_sentence = filter_out_words_not_in_vocabulary(query, index2word_set)

                similarity_scores = {}

                counter = 0
                num_games = len(steam_tokens)

                for app_id in steam_tokens:
                    counter += 1

                    if (counter % 1000) == 0:
                        print('[{}/{}] appID = {} ({})'.format(counter, num_games, app_id, game_names[app_id]))

                    reference_sentence = steam_tokens[app_id]
                    reference_sentence = filter_out_words_not_in_vocabulary(reference_sentence, index2word_set)

                    try:
                        similarity_scores[app_id] = wv.n_similarity(query_sentence, reference_sentence)
                    except ZeroDivisionError:
                        similarity_scores[app_id] = 0

        similar_app_ids = print_most_similar_sentences(similarity_scores,
                                                       num_items_displayed=num_items_displayed,
                                                       verbose=False)
        matches_as_app_ids.append(similar_app_ids)

    print_ranking(query_app_ids, matches_as_app_ids, only_print_banners=True)

    return
    # go through each word in each data_text row, remove stopwords, and set them on the index
    data_text.iloc[idx]['headline_text'] = [
        word for word in data_text.iloc[idx]['headline_text'].split(' ')
        if word not in stopwords.words() and word.isalpha()
    ]

    # print logs to monitor output
    if idx % 1000 == 0:
        sys.stdout.write('\rc = ' + str(idx) + ' / ' + str(len(data_text)))

pickle.dump(data_text, open('data_text.dat', 'wb'))

train_headlines = [value[0] for value in data_text.iloc[0:].values]

num_topics = 10

id2word = gensim.corpora.Dictionary(train_headlines)
corpus = [id2word.doc2bow(text) for text in train_headlines]

lda = ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics)
lsimodel = LsiModel(corpus=corpus, num_topics=num_topics, id2word=id2word)


def get_topics(model, num_topics):
    word_dict = {}
    for i in range(num_topics):
        words = model.show_topic(i, topn=20)
        word_dict['Topic # ' + '{:02d}'.format(i + 1)] = [i[0] for i in words]
    return pd.DataFrame(word_dict)


get_topics(lda, num_topics)

pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda, corpus, id2word)
                                        coherence='c_v')
    lda_coherence_umass = CoherenceModel(model=lda_model, texts=processed_emails,
                                         dictionary=dictionary, coherence='u_mass')

    lD_name = "saved/models/LDA/lda" + str(j_index) + ".model"
    lD_coh_cv = "saved/models/LDA/cv_lda" + str(j_index) + ".coherence"
    lD_coh_um = "saved/models/LDA/umass_lda" + str(j_index) + ".coherence"

    # save the models to the disk
    lda_model.save(lD_name)
    lda_coherence_cv.save(lD_coh_cv)
    lda_coherence_umass.save(lD_coh_um)

    lsa_model = LsiModel(tfidf_vectors, num_topics=top, id2word=dictionary)
    lsa_coherence_cv = CoherenceModel(model=lsa_model, texts=processed_emails,
                                      dictionary=dictionary, coherence='c_v')
    lsa_coherence_umass = CoherenceModel(model=lsa_model, texts=processed_emails,
                                         dictionary=dictionary, coherence='u_mass')

    lS_name = "saved/models/LSA/lsa" + str(j_index) + ".model"
    lS_coh_cv = "saved/models/LSA/cv_lsa" + str(j_index) + ".coherence"
    lS_coh_um = "saved/models/LSA/umass_lsa" + str(j_index) + ".coherence"

    lsa_model.save(lS_name)
    lsa_coherence_cv.save(lS_coh_cv)