import math

from gensim import corpora, models, similarities


def recommend(docs_path):
    """Build a TF-IDF recommender from a citation-context CSV and evaluate it
    (top-1/5/10 hit rate, NDCG@5, MAP@5) on a per-document train/test split.
    """

    with open(docs_path) as f:
        lines = f.readlines()
    test = []
    train_aids = []
    train_texts = []
    tmp_bag = []
    tmp_bag_current_aid = lines[0].split(',')[0]
    texts = []  # for dictionary generation
    adjacent_cit_map = {}
    for idx, line in enumerate(lines):
        aid, adjacent, in_doc, text = line.split(',')
        # create adjacent map for later use in eval
        if aid not in adjacent_cit_map:
            adjacent_cit_map[aid] = []
        if len(adjacent) > 2:
            adj_cits = adjacent[1:-1].split('|')
            for adj_cit in adj_cits:
                if adj_cit not in adjacent_cit_map[aid]:
                    adjacent_cit_map[aid].append(adj_cit)
        # fill texts
        text = text.replace('[]', '')
        texts.append(text.split())
        if aid != tmp_bag_current_aid or idx == len(lines) - 1:
            # tmp_bag now contains all lines sharing ID tmp_bag_current_aid
            num_contexts = len(tmp_bag)
            sub_bags_dict = {}
            for item in tmp_bag:
                item_in_doc = item[0]
                item_text = item[1]
                if item_in_doc not in sub_bags_dict:
                    sub_bags_dict[item_in_doc] = []
                sub_bags_dict[item_in_doc].append(item_text)
            order = sorted(sub_bags_dict,
                           key=lambda k: len(sub_bags_dict[k]),
                           reverse=True)
            # ↑ keys for sub_bags_dict, ordered from largest bag to smallest
            min_num_train = math.floor(num_contexts * 0.8)
            train_texts_comb = []
            test_texts = []
            # TODO: how to do k-fold cross val with this?
            for jdx, sub_bag_key in enumerate(order):
                sb_texts = sub_bags_dict[sub_bag_key]
                if len(train_texts_comb) > min_num_train or jdx == len(order) - 1:
                    test_texts.extend(sb_texts)
                else:
                    train_texts_comb.extend(sb_texts)
            l_tr = len(train_texts_comb)
            l_te = len(test_texts)
            l_tr_perc = (l_tr / (l_tr + l_te)) * 100
            l_te_perc = (l_te / (l_tr + l_te)) * 100
            test.extend([(tmp_bag_current_aid, txt) for txt in test_texts])
            # because we use BOW we can just combine train docs here
            train_text_combined = ' '.join(txt for txt in train_texts_comb)
            train_aids.append(tmp_bag_current_aid)
            train_texts.append(train_text_combined.split())
            # reset bag
            tmp_bag = []
            tmp_bag_current_aid = aid
        tmp_bag.append([in_doc, text])

    # average number of adjacent docs
    # adj_sum = 0
    # for k, v in adjacent_cit_map.items():
    #     adj_sum += len(v)
    # print(adj_sum/len(adjacent_cit_map))

    dictionary = corpora.Dictionary(texts)
    # dictionary.save('1712_test.dict')
    num_unique_tokens = len(dictionary.keys())
    # print(dictionary.token2id)
    corpus = [dictionary.doc2bow(text) for text in train_texts]
    # corpora.MmCorpus.serialize('1712_test_corpus.mm', corpus)
    # print(corpus)
    tfidf = models.TfidfModel(corpus)

    num_cur = 0
    num_top = 0
    num_top_5 = 0
    num_top_10 = 0
    ndcg_sum_5 = 0
    map_sum_5 = 0
    print('test set size: {}\n- - - - - - - -'.format(len(test)))
    index = similarities.SparseMatrixSimilarity(tfidf[corpus],
                                                num_features=num_unique_tokens)
    for tpl in test:
        test_aid = tpl[0]
        test_text = tpl[1].split()
        test_bow = dictionary.doc2bow(test_text)
        sims = index[tfidf[test_bow]]
        sims_list = list(enumerate(sims))
        sims_list.sort(key=lambda tup: tup[1], reverse=True)
        # print('correct: {}'.format(test_aid))
        # print('- - - - - - - -')
        # for idx, sim in enumerate(sims_list[:11]):
        #     pre = '{} '.format(idx)
        #     if train_aids[sim[0]] == test_aid:
        #         pre += '✔ '
        #     else:
        #         pre += ' '
        #     print('{}{}: {}'.format(pre, sim[1], train_aids[sim[0]]))
        rank = len(sims_list)
        for idx, sim in enumerate(sims_list):
            if train_aids[sim[0]] == test_aid:
                rank = idx + 1
                break
            if idx >= 10:
                break
        dcg = 0
        idcg = 0
        num_rel = 1 + len(adjacent_cit_map[test_aid])
        for i in range(5):
            placement = i + 1
            sim = sims_list[i]
            result_aid = train_aids[sim[0]]
            if result_aid == test_aid:
                relevance = 1
            elif result_aid in adjacent_cit_map[test_aid]:
                relevance = .5
            else:
                relevance = 0
            denom = math.log2(placement + 1)
            dcg_numer = math.pow(2, relevance) - 1
            dcg += dcg_numer / denom
            if placement == 1:
                ideal_rel = 1
            elif placement <= num_rel:
                ideal_rel = .5
            else:
                ideal_rel = 0
            idcg_numer = math.pow(2, ideal_rel) - 1
            idcg += idcg_numer / denom
        ndcg = dcg / idcg
        if rank == 1:
            num_top += 1
        if rank <= 5:
            num_top_5 += 1
            map_sum_5 += 1 / rank
        ndcg_sum_5 += ndcg
        if rank <= 10:
            num_top_10 += 1
        num_cur += 1
        print('- - - - - {}/{} - - - - -'.format(num_cur, len(test)))
        print('#1: {}'.format(num_top))
        print('in top 5: {}'.format(num_top_5))
        print('in top 10: {}'.format(num_top_10))
        print('ndcg@5: {}'.format(ndcg_sum_5 / num_cur))
        print('map@5: {}'.format(map_sum_5 / num_cur))
def trainDictionary(alltokens, productid, outpath):
    dictionary = corpora.Dictionary(alltokens)
    dictionary.save(os.path.join(outpath, "dictionary.dict"))
from gensim import corpora, models, similarities
import codecs
import json
import pyLDAvis
import pyLDAvis.gensim

with codecs.open("../input/hafez_Train3cls_cls3.txt", "r", 'UTF-8') as myfile:
    documents = myfile.readlines()
with codecs.open("../../stop-words_persian_1_fa.txt", "r", 'UTF-8') as myfile:
    stoplist = myfile.read().split()

# textha = [[word for word in document.lower().split() if word not in stoplist]
#           for document in matns]
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in documents]

# remove words that appear only once
all_tokens = sum(texts, [])
tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
# textha = [[word for word in text if word not in tokens_once]
#           for text in textha]
texts = [[word for word in text if word not in tokens_once] for text in texts]

# loghatname = corpora.Dictionary(textha)
dictionary = corpora.Dictionary(texts)
# maincorpus = [loghatname.doc2bow(text) for text in textha]
corpus = [dictionary.doc2bow(text) for text in texts]

lda = models.LdaModel(corpus, id2word=dictionary, num_topics=20, passes=10)

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
vis
# read in SomethingsNotRightTextOnly.txt (tab-separated id/body pairs)
with open('./SomethingsNotRightTextOnly.txt') as f:
    reader = csv.reader(f, delimiter='\t')
    id_body = list(reader)

# separate out posts
posts = [row[1] for row in id_body]

# build topic model based on posts
## make dictionary
# stop_words = set(stopwords.words('english'))
stop_words = load_stop_words()
words = [[word for word in re.split(r'\W+', post.lower())
          if (word not in stop_words and word != "")] for post in posts]
dictionary = corpora.Dictionary(words)

## build corpus
corpus = [dictionary.doc2bow(word) for word in words if word != ""]

## initialize lda model
lda = ldamulticore.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=40
)

## print topics
topics = lda.print_topics(num_words=10, num_topics=-1)
# for topic in topics:
#     print(topic)
import os
import tempfile
import datetime

from six import iteritems
from flask import Flask
from gensim import corpora

app = Flask(__name__)
app.config["DEBUG"] = True

TEMP_FOLDER = tempfile.gettempdir()
print('Folder "{}" will be used to save temporary dictionary and corpus.'.format(TEMP_FOLDER))

# collect statistics about all tokens
dictionary = corpora.Dictionary(
    line.lower().split('|')[0].split()
    for line in open('huangke/jieba_brand_segged.txt'))
stoplist = []

# remove stop words and words that appear only once
stop_ids = [
    dictionary.token2id[stopword] for stopword in stoplist
    if stopword in dictionary.token2id
]
once_ids = [
    tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq == 1
]
dictionary.filter_tokens(stop_ids + once_ids)
print(dictionary)
# freq_words = ['comment']
# for i in freq_words:
#     stopset.append(i)

text_corpus = []
for doc in fo:
    temp_doc = tokenize(doc.strip())
    current_doc = []
    for word in range(len(temp_doc)):
        # keep only non-stopword nouns (NN, NNS, NNP, NNPS)
        if (temp_doc[word][0] not in stopset) and (
                temp_doc[word][1] == 'NN' or temp_doc[word][1] == 'NNS' or
                temp_doc[word][1] == 'NNP' or temp_doc[word][1] == 'NNPS'):
            current_doc.append(temp_doc[word][0])
    text_corpus.append(current_doc)

dictionary = corpora.Dictionary(text_corpus)
# print(dictionary)
# dictionary.save('myDict.dict')
# print(dictionary.token2id)
corpus = [dictionary.doc2bow(text) for text in text_corpus]
# print(corpus)
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2,
                                           id2word=dictionary, passes=100)
for topics in ldamodel.print_topics(num_topics=2, num_words=10):
    print(topics, "\n")
data_words = list(sentence_to_words(tweets_week))

# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# lemmatization
data_lemmatized = lemmatization(data_words_bigrams)

# Create Dictionary
id2word = corpora.Dictionary(data_words_bigrams)
# id2word.filter_n_most_frequent(int(len(id2word)*0.005))
texts = data_lemmatized.copy()

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# Build LDA Model
# lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
#                                             id2word=id2word,
#                                             num_topics=8,
#                                             random_state=100,
#                                             update_every=1,
infolder = '../../' + DataSet + 'Data/'
outfolder = '../../' + DataSet + 'Submissions/'
savedfolder = DataSet + "Saved/"

testPostsFile = infolder + DataSet + "testPosts.json"
trainPostsFile = infolder + DataSet + "trainPosts.json"

# First, we make a dictionary of words used in the titles
with Files([open(trainPostsFile), open(testPostsFile)]) as myFiles:
    try:
        dictionary = corpora.dictionary.Dictionary.load(savedfolder + "dictionary.saved")
    except:
        dictionary = corpora.Dictionary(doc for doc in myFiles)
        stop_ids = [
            dictionary.token2id[stopword] for stopword in stop_words
            if stopword in dictionary.token2id
        ]
        # infreq_ids = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems() if docfreq < 50]
        # dictionary.filter_tokens(stop_ids + infreq_ids)
        # remove stop words and words that appear infrequently
        dictionary.filter_tokens(stop_ids)
        dictionary.compactify()  # remove gaps in id sequence after words that were removed
        dictionary.save(savedfolder + "dictionary.saved")

    try:
        tfidf = models.tfidfmodel.TfidfModel.load(savedfolder + "tfidf.saved")
    except:
import jieba, os
from gensim import corpora, models, similarities

train_set = []
walk = os.walk('C:\\Users\\Sun Yutian\\Desktop\\test')
for root, dirs, files in walk:
    for name in files:
        f = open(os.path.join(root, name), 'r')
        raw = f.read()
        word_list = list(jieba.cut(raw, cut_all=False, HMM=True))
        train_set.append(word_list)

dic = corpora.Dictionary(train_set)
corpus = [dic.doc2bow(text) for text in train_set]
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
lda = models.LdaModel(corpus_tfidf, id2word=dic, num_topics=10)
corpus_lda = lda[corpus_tfidf]

f = open('dat', 'r')
raw = f.read()
word_list = list(jieba.cut(raw, cut_all=False, HMM=True))
vec_bow = dic.doc2bow(word_list)
vec_lda = lda[vec_bow]

index = similarities.MatrixSimilarity(lda[corpus])
sims = index[vec_lda]
print(list(enumerate(sims)))
    while node:
        if node.feature.split(",")[0] == "名詞":  # keep nouns only
            keywords.append(node.surface)
        node = node.next
    golo.append(keywords)
    return keywords


if __name__ == "__main__":
    for i in line_list:
        keywords = extractKeyword(i)

    # stop_word_list = ["ため","これ","それ","的","(",")","0","1","2","3","4","5","6","7","8","9","1","2","3","4","5","6","7","8","9","日","私","たち","こと","自分","自身","さん"]
    # golo = [[word for word in keywords if word not in stop_word_list] for keywords in golo]

    # build the feature-word dictionary
    dictionary = corpora.Dictionary(golo)
    # drop low-frequency words and words that appear in 20% or more of the documents
    dictionary.filter_extremes(no_below=2, no_above=0.2)

    # build the corpus: the set of feature vectors over all texts
    corpus = [dictionary.doc2bow(keywords) for keywords in golo]
    corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)  # store to disk, for later use
    # print(corpus)

    # load and train the LDA model; the number of topics (user segments) is set here
    lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=5)

    # print the topics
    for topic in lda.show_topics(-1, 8):
        print(topic)
def prepare_for_modeling(data_path, model_type="LDA-KeyWords",
                         params={"TEXT_prepared_df": pd.DataFrame({}),
                                 "save_LDA_dictionary_path": "./output/lda_keywords/dictionary.pickle",
                                 "words_column": "all_key_words"},
                         verbose=1):
    if model_type == "LDA-KeyWords":
        """
        params={"TEXT_prepared_df": pd.DataFrame({}),
                "save_LDA_dictionary_path": "./output/lda_keywords/dictionary.pickle",
                "words_column": "all_key_words"}
        """
        if len(params['TEXT_prepared_df']) > 0:
            # load data for LDA
            df_data = params['TEXT_prepared_df']
            if verbose == 2:
                print("loaded data shape:", df_data.shape)
        else:
            if verbose == 2:
                print("No data is provided")
            return False

        words_column = params['words_column']
        df_data[words_column] = df_data[words_column].apply(
            lambda x: [w.replace(' ', '_') for w in x if len(w) > 1])

        # get all unique key-words
        tmp_list = df_data[words_column].tolist()
        set_of_words = set([w for sublist in tmp_list for w in sublist])
        if verbose == 2:
            print('\nNumber of unique key-words for topic modeling dictionary:',
                  len(set_of_words))

        # delete empty lists of words
        df_data = df_data[df_data[words_column].apply(len) > 0]

        # create a vocabulary for the LDA model
        dictionary = corpora.Dictionary(df_data[words_column])

        # save dictionary
        with open(params["save_LDA_dictionary_path"], 'wb') as f:
            # Pickle the LDA dictionary using the highest protocol available.
            pickle.dump(dictionary, f, pickle.HIGHEST_PROTOCOL)
        if verbose == 2:
            print("LDA dictionary file is saved to:", params["save_LDA_dictionary_path"])
            print('\nNumber of texts processed: ', dictionary.num_docs)
            print('Number of extracted key-words: ', len(dictionary.token2id))
            print('\nEach text is represented by a list of ', len(dictionary.token2id),
                  " tuples: \n\t\t(key-word's index in bag-of-words dictionary, key-word's term frequency)")

        # count the number of occurrences of each distinct token in each document
        df_data['doc2bow'] = df_data['all_key_words'].apply(
            lambda x: dictionary.doc2bow(x))

    if model_type == "LDA":
        """
        params={"TEXT_prepared_df": pd.DataFrame({}),
                "save_LDA_dictionary_path": "./output/lda/dictionary.pickle",
                "text_column": "text"}
        """
        if len(params['TEXT_prepared_df']) > 0:
            # load data for LDA
            df_data = params['TEXT_prepared_df']
            print("loaded data shape:", df_data.shape)
        elif len(data_path) > 0:
            print("Preparing data for LDA...")
            df_data = pd.read_csv(data_path)
            df_data['list_of_lemmas'] = df_data[params['text_column']].apply(
                lambda text: get_list_of_lemmas(text))
            print("Data for LDA shape:", df_data.shape)
        else:
            return False

        # get all unique lemmas
        tmp_list = df_data['list_of_lemmas'].apply(set).apply(list).tolist()
        list_of_words = [w for sublist in tmp_list for w in sublist]

        # count words' document frequencies in the corpus
        w_freq_counter = collections.Counter(list_of_words)
        s_w_freq = pd.Series(w_freq_counter)
        if verbose == 2:
            print('\nTotal number of unique Lemmas: ', len(s_w_freq))
            print("\nDistribution of lemmas' document counts: ")
            print(pd.DataFrame(s_w_freq.describe(percentiles=[
                0.55, 0.65, 0.75, 0.85, 0.95, 0.97, 0.99])).T)

        # select upper and lower boundary for lemmas' count
        up_pct = s_w_freq.quantile(0.99)
        low_pct = 3  # s_w_freq.quantile(0.50)
        if verbose == 2:
            print("\nDeleting too frequent and too rare words...")
            print('Lemma count upper bound:', up_pct)
            print('Lemma count lower bound:', low_pct)

        # select Lemmas
        selected_words = set(s_w_freq[(s_w_freq > low_pct) & (s_w_freq <= up_pct)].index)
        if verbose == 2:
            print('\nList of words for topic modeling dictionary is reduced from',
                  len(s_w_freq), 'to', len(selected_words))

        # select words in each article if they belong to the chosen list of words
        df_data['selected_words'] = df_data['list_of_lemmas'].apply(
            lambda x: [l for l in x if l in selected_words])

        # delete empty lists of words
        df_data = df_data[df_data['selected_words'].apply(len) > 0]

        # create a vocabulary for the LDA model
        dictionary = corpora.Dictionary(df_data['selected_words'])

        # save dictionary
        with open(params["save_LDA_dictionary_path"], 'wb') as f:
            # Pickle the LDA dictionary using the highest protocol available.
            pickle.dump(dictionary, f, pickle.HIGHEST_PROTOCOL)
        if verbose == 2:
            print("LDA dictionary file is saved to:", params["save_LDA_dictionary_path"])
            print('\nNumber of texts processed: ', dictionary.num_docs)
            print('Number of extracted lemmas: ', len(dictionary.token2id))
            print('\nEach text is represented by a list of ', len(dictionary.token2id),
                  " tuples: \n\t\t(lemma's index in bag-of-words dictionary, lemma's term frequency)")

        # count the number of occurrences of each distinct token in each document
        df_data['doc2bow'] = df_data['selected_words'].apply(
            lambda x: dictionary.doc2bow(x))

    return df_data
def return_topic_figures(n_topics=5):
    """Creates plotly visualizations generated from topic model

    Args:
        n_topics = number of topics to generate from articles, default 5

    Returns:
        figures (list): list containing the plotly visualizations
    """
    ### import data ###
    data = return_keywords()
    data_for_topics = data["abstract_kw"].apply(
        lambda x: list(ast.literal_eval(x).keys()))

    ### Build topic model ###
    # parameters
    n_topics = n_topics

    # Create Dictionary
    id2word = corpora.Dictionary(data_for_topics)

    # Create Corpus: Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in data_for_topics]

    # Build LDA model
    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=n_topics,
        random_state=100,
        update_every=1,
        chunksize=10,
        passes=10,
        alpha="symmetric",
        iterations=100,
        per_word_topics=True,
    )

    topics = lda_model.show_topics(formatted=False)
    data_flat = [w for w_list in data_for_topics for w in w_list]
    counter = Counter(data_flat)

    out = []
    for i, topic in topics:
        for word, weight in topic:
            out.append([word, i, weight, counter[word]])

    df = pd.DataFrame(out, columns=["word", "topic_id", "importance", "word_count"])

    specs = np.full((ceil(n_topics / 2), 2), {"secondary_y": True})
    topic_bar_charts = make_subplots(
        rows=ceil(n_topics / 2),
        cols=2,
        specs=specs.tolist(),
        horizontal_spacing=0.1,
        vertical_spacing=0.15,
    )

    row, col = (0, 0)
    for topic in range(n_topics):
        if (topic % 2) != 0:
            col = 2
        else:
            col = 1
            row += 1
        color = px.colors.qualitative.Vivid[topic]
        topic_bar_charts.add_trace(
            go.Bar(
                x=df.loc[df.topic_id == topic, "word"],
                y=df.loc[df.topic_id == topic, "word_count"],
                width=0.5,
                opacity=0.3,
                marker_color=color,
                name=("Topic " + str(topic) + " word count"),
            ),
            secondary_y=False,
            row=row,
            col=col,
        )
        topic_bar_charts.add_trace(
            go.Bar(
                x=df.loc[df.topic_id == topic, "word"],
                y=df.loc[df.topic_id == topic, "importance"],
                width=0.2,
                marker_color=color,
                name=("Topic " + str(topic) + " weight"),
            ),
            secondary_y=True,
            row=row,
            col=col,
        )

    topic_bar_charts.update_layout(barmode="overlay")
    topic_bar_charts.update_layout(height=800, width=1000,
                                   margin=dict(l=50, r=50, t=50, b=100))

    # append all charts
    figures = [dict(data=topic_bar_charts)]

    return figures
def LDA_post(infile, outfile, topic=14):
    docs = []
    # f = open(infile, 'r')
    # line = f.readline()
    # while line:
    #     docs.append(line.lower().split('\t')[1])
    #     line = f.readline()
    # f.close()
    with open(infile, 'r') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',', quotechar='"')
        header = next(spamreader)
        for row in spamreader:
            docs.append(row[1])

    texts = []
    widgets = [FormatLabel('Processed: %(value)d records (in: %(elapsed)s)')]
    pbar = ProgressBar(widgets=widgets)
    for doc in pbar((doc for doc in docs)):
        texts.append([word for word in wordProcBase.tokenize_tweet(doc)
                      if word not in stopwords.words('english')])
        # doc = wordProcBase.tokenize5(doc.decode('utf-8'))
        # texts.append([word for word in doc if word not in stopwords.words('english')])
    pbar.finish()
    pprint.pprint(texts)
    # return

    # create a Gensim dictionary from the texts
    dictionary = corpora.Dictionary(texts)

    # remove extremes
    dictionary.filter_extremes(no_below=1, no_above=0.85)

    # convert the dictionary to a bag-of-words corpus for reference
    corpus = [dictionary.doc2bow(text) for text in texts]

    print('Applying LDA...')
    lda = models.LdaModel(corpus, num_topics=topic, id2word=dictionary,
                          update_every=1, chunksize=10000, passes=100,
                          minimum_probability=0.001)
    topics = lda.show_topics(num_topics=topic, num_words=5)
    # pprint.pprint(lda.print_topics(num_topics=topic))
    # pprint.pprint(topics)

    print('Writing results into file...')
    # write the results to the output file
    with open(outfile, 'w') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',', quotechar='"')
        top_prob = lda.get_document_topics(corpus)  # a list of (topic_id, topic_probability) 2-tuples
        index = 1
        for prob in top_prob:
            string = [0 for i in range(topic)]
            prob = sorted(prob, key=operator.itemgetter(0), reverse=False)
            for i, p in prob:
                string[i] = p
            spamwriter.writerow(string)
            index += 1
    return

    '''
    # reading unseen data
    '''
    print('Reading unseen data...')
    unseen = _MAIN_DIR_ + "/Data/VA_Proc/emtion_tweets/survey/google_survey_data.csv"
    docs = []
    with open(unseen, 'r') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in spamreader:
            docs.append(row[1])
    texts = []
    for doc in docs:
        texts.append([word for word in wordProcBase.tokenize3(doc.decode('utf-8'))
                      if word not in stopwords.words('english')])
    dictionary = corpora.Dictionary(texts)
    dictionary.filter_extremes(no_below=1, no_above=0.85)
    corpus = [dictionary.doc2bow(text) for text in texts]
    with open(outfile, 'w') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',', quotechar='"')
        top_prob = lda.get_document_topics(corpus)
        index = 1
        for prob in top_prob:
            string = [index]
            for i in range(0, len(prob)):
                string.append(prob[i][1])
            spamwriter.writerow(string)
            index += 1
def English(documents):
    # Log
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    # Reference text file processing
    texts = [[word for word in document.lower().split()]
             for document in documents[1:]]
    print(texts)

    # Count token frequencies and keep only tokens that appear more than once
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    texts1 = [[token for token in text if frequency[token] > 1] for text in texts]
    print(texts1)

    # Build a dictionary
    dictionary = corpora.Dictionary(texts1)
    print(dictionary.token2id)

    # doc2bow the texts against the dictionary to get a new corpus
    corpus = [dictionary.doc2bow(text) for text in texts1]

    # Building a TF-IDF model
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    for doc in corpus_tfidf:
        print(doc)

    # Document frequencies and inverse document frequencies
    print(tfidf.dfs)
    print(tfidf.idfs)

    # Training the Lsi model
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)
    lsi.print_topics(2)

    # Map documents into two-dimensional topic space with Lsi model
    corpus_lsi = lsi[corpus_tfidf]
    for doc in corpus_lsi:
        print(doc)

    # Calculate sparse matrix similarity
    index = similarities.MatrixSimilarity(lsi[corpus])

    # Object text file processing
    query = documents[0]
    print(query)

    # doc2bow builds a bag of words model, turning the file into a sparse vector
    query_bow = dictionary.doc2bow(query.lower().split())
    print(query_bow)

    # Map documents into 2D topic space with Lsi model
    query_lsi = lsi[query_bow]
    print(query_lsi)

    # Calculate cosine similarity
    sims = index[query_lsi]
    sims = list(sims)
    return sims
def buildTokenDict(self):
    '''
    assign an id to each word in self.segResponses
    '''
    self.tokenDictionary = corpora.Dictionary(self.segResponses)
    logging.info("Bag-of-words dictionary built: " + str(self.tokenDictionary))
args = parser.parse_args()
n_topics = args.n_topics
n_docs = 0
input_file = args.input
# input_file='/medargsia/iarroyof/Volumen de 384 GB/data/GUs_textform_noPeriods.txt'
# input_file='lsa_example.csv'
# input_file='wiki_sample/wiki_75_AA.txt.cln'
# input_file='wiki_sample/wiki_77_AA.txt'

# A little stopwords list
stoplist = set('for a of the and to in _ [ ]'.split())

# Do not load the text corpus into memory, but stream it!
fille = corpus_streamer(input_file, strings=True)
dictionary = corpora.Dictionary(line.lower().split() for line in fille)  # open(input_file)

# remove stop words and words that appear only once
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist
            if stopword in dictionary.token2id]
once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs)
            if docfreq == 1]
dictionary.filter_tokens(stop_ids + once_ids)
# remove gaps in id sequence after words that were removed
dictionary.compactify()
# Store the dictionary
dictionary.save('lsa_mini.dict')

# Reading sentences from file into a list of strings.
# Use instead streaming objects:
# Load stored word-id map (dictionary)
stream_it = corpus_streamer(input_file, dictionary=dictionary)
# for vector in stream_it:  # load one vector into memory at a time
category = df['category']
dates = df['dates']
heads = df['heads']
cats = df.category.unique()
unique_dates = df.dates.unique()

text_words = np.load('text_words.npy')
text_words_nostops = np.load('text_words_nostops.npy')
data_lemmatized = np.load('data_lemmatized.npy')

id2word = corpora.Dictionary(data_lemmatized)
print('0.5')
corpus = np.load('corpus.npy')

range_per_date = {}  # [[0,0] for i in range(len(unique_dates))]
for d in unique_dates:
    range_per_date[d] = [0, 0]
print(1)

for d in unique_dates:
    found = 0
    for i in range(len(dates)):
## remove stopwords
swords = open(r'C:\Users\Z\Desktop\NI\한국어불용어100.txt', encoding='UTF8').read()
stop_words = re.findall('[가-힣]+', swords)

tokenized_doc = news_df['clean_doc'].apply(lambda x: x.split())  # tokenization
tokenized_doc = tokenized_doc.apply(
    lambda x: [item for item in x if item not in stop_words])  # remove the stopwords

#####################################
# LDA
tokenized_doc[:5]

from gensim import corpora
dictionary = corpora.Dictionary(tokenized_doc)
corpus = [dictionary.doc2bow(text) for text in tokenized_doc]
print(corpus[1])  # print the BOW of the second news item; the first document has index 0

import gensim
NUM_TOPICS = 5  # number of topics, k=5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=NUM_TOPICS,
                                           id2word=dictionary, passes=15)
topics = ldamodel.print_topics(num_words=10)
for topic in topics:
    print(topic)

for i, topic_list in enumerate(ldamodel[corpus]):
    if i == 5:
# Handling Japanese and choosing which words to keep is easier in KH Coder.
# So (a little clumsy, but) we build a Dictionary that gensim can work with
# from the document-term matrix.
documents = []
words = df.columns.values[1:]  # drop the leftmost ID column
for row in df.values:
    word_counts = row[1:]
    document_bow = []  # bag of words for each post
    for word_index, count in enumerate(word_counts):
        for i in range(count):
            document_bow.append(words[word_index])
    if len(document_bow) > 0:
        documents.append(document_bow)

# build the dictionary that maps each word to an id
dictionary = corpora.Dictionary(documents)
# turn documents into a corpus
corpus = list(map(dictionary.doc2bow, documents))

# build a TF-IDF model
test_model = models.TfidfModel(corpus)
# apply the model to the corpus
corpus_tfidf = test_model[corpus]

start, stop, step = 2, 30, 1
plot_graph(documents, start, stop, step, dictionary, corpus_tfidf)

number_of_topics = 7
words = 10
def __init__(self, corpus_dir):
    self.corpus_dir = corpus_dir
    # Build a gensim dictionary of the unique tokens found in corpus_dir.
    self.dictionary = corpora.Dictionary(iter_documents(corpus_dir))
    # filter_extremes with no_below=1 / no_above=1 keeps every token; tighten
    # these bounds to drop rare words and stopword-like frequent words.
    self.dictionary.filter_extremes(no_below=1, no_above=1)
        ]
        tokens_stemmed = [self.stemmer.stem(x) for x in tokens_stopwords]
        return tokens_stemmed


if __name__ == '__main__':
    input_file = 'data_topic_modeling.txt'
    data = load_data(input_file)

    preprocessor = Preprocessor()
    processed_tokens = [preprocessor.process(x) for x in data]

    dict_tokens = corpora.Dictionary(processed_tokens)
    corpus = [dict_tokens.doc2bow(text) for text in processed_tokens]

    num_topic = 2
    num_words = 4
    ldamodel = models.ldamodel.LdaModel(corpus, num_topics=num_topic,
                                        id2word=dict_tokens, passes=25)

    print('Most contributing words to the topics:')
    for item in ldamodel.print_topics(num_topics=num_topic, num_words=num_words):
        print('Topic', item[0], '==>', item[1])
def get_docs_corpus(self):
    dictionary = corpora.Dictionary(self.docs_words)
    for doc_words in self.docs_words:
        yield dictionary.doc2bow(doc_words)
def process(self, process_file, cluster_ResFileName):
    try:
        # 1. load titles and segmented words
        flag, lines = self.load_processfile(process_file)
        if flag == False:
            logging.error("load error")
            return False, "load error"
        # the segmentation result has a different format than the other methods
        title_list, sen_seg_list = self.seg_words(lines)

        # # Convert the words into a term-frequency matrix; element a[i][j] is the
        # # frequency of word j in document i (an alternative approach).
        # vectorizer = CountVectorizer()
        # x = vectorizer.fit_transform(sen_seg_list)
        # weight = x.toarray()
        #
        # model = lda.LDA(n_topics=5, n_iter=100, random_state=1)
        # model.fit(np.asarray(weight))  # model.fit_transform(X) is also available
        # topic_word = model.topic_word_  # model.components_ also works
        # print(topic_word)
        # # document-topic distribution
        # doc_topic = model.doc_topic_
        # print(doc_topic)
        # # numpy.savetxt('100.csv', doc_topic, delimiter = ',')  # save the document-topic distribution

        # 2. extract features with the LDA model
        # build the dictionary
        dictionary = corpora.Dictionary(sen_seg_list)
        # using the dictionary, map each document's words to a sparse vector and
        # collect the vectors into a list (the sparse corpus)
        corpus = [dictionary.doc2bow(words) for words in sen_seg_list]

        lda = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary,
                                       num_topics=15)
        lda.save('zhwiki_lda.model')
        lda = models.ldamodel.LdaModel.load('zhwiki_lda.model')

        # print all topics with their top words
        for topic in lda.print_topics(num_words=500):
            print(topic)

        # topic matrix
        ldainfer = lda.inference(corpus)[0]
        # topic inference
        print(lda.inference(corpus))
        np.savetxt('100tag.csv', lda.inference(corpus), delimiter=',',
                   fmt='%s')  # save the resulting document-topic distribution

        k = self.evaluate_km(ldainfer)

        # 3. KMeans; for large data volumes use Mini-Batch-KMeans
        km = KMeans(n_clusters=k)
        km.fit(ldainfer)
        print(Counter(km.labels_))  # print how many items fall in each cluster
        # print(km.cluster_centers_)  # cluster centers

        # store the cluster each sample belongs to
        clusterRes = codecs.open(cluster_ResFileName, 'w', encoding='UTF-8')
        count = 1
        while count <= len(km.labels_):
            clusterRes.write(str(title_list[count - 1]) + '\t' + str(km.labels_[count - 1]))
            clusterRes.write('\r\n')
            count = count + 1
        clusterRes.close()
    except:
        logging.error(traceback.format_exc())
        return False, "process fail"
def get_docs_LSI_model(self):
    LSI_model = models.LsiModel(corpus=self.get_docs_corpus(),
                                id2word=corpora.Dictionary(self.docs_words),
                                num_topics=2)
    return LSI_model
lemma = WordNetLemmatizer()

# Function to lemmatize and remove the stopwords
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = "".join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

# Creating a list of documents from the complaints column
list_of_docs = df["message"].tolist()

# Implementing the function for all the complaints of list_of_docs
doc_clean = [clean(doc).split() for doc in list_of_docs]

# Code starts here
dictionary = corpora.Dictionary(doc_clean)
doc_term_matrix = [dictionary.doc2bow(text) for text in doc_clean]
lsimodel = LsiModel(corpus=doc_term_matrix, num_topics=5, id2word=dictionary)
pprint(lsimodel.print_topics())

# --------------
from gensim.models import LdaModel
from gensim.models import CoherenceModel

# doc_term_matrix - Word matrix created in the last task
# dictionary - Dictionary created in the last task

# Function to calculate coherence values
def main():
    corpus = {}
    with open('corpus_data/preprocessedf_corpus.json') as corpus:
        corpus = json.loads(corpus.read().encode('utf-8'))

    corpus_2 = defaultdict(str)
    for artist, songlist in corpus.items():
        for song in songlist:
            lyrics = song['lyrics'].strip('\\')
            corpus_2[artist] += lyrics

    features = {}
    with open('corpus_data/artist_features.json') as features:
        features = json.loads(features.read())

    finalcorpus = []
    for artist, lyrics in corpus_2.items():
        d = {}
        d['artist'] = artist
        d['lyrics'] = lyrics
        d['pos'] = features[artist]['pos_counts']
        finalcorpus.append(d)

    df = pd.DataFrame(finalcorpus)

    # nltk.download('wordnet')
    from nltk.corpus import wordnet as wn

    def get_lemma(word):
        lemma = wn.morphy(word)
        if lemma is None:
            return word
        else:
            return lemma

    """TOPIC MODELING HOPEFULLY"""
    import re
    from nltk import word_tokenize
    from nltk.corpus import stopwords

    STOPWORDS = stopwords.words('english')
    PROFANITY = set()
    with open('corpus_data/rapsw.txt') as infile:
        infile = infile.read()
        infile = infile.split()
        for el in infile:
            PROFANITY.add(el)

    def clean_text(text, ar):
        tokenized_text = word_tokenize(text.lower())
        tokenized_text = [token for token in tokenized_text if len(token) > 5]
        cleaned_text = [
            t for t in tokenized_text if re.match(r'[a-zA-Z\-][a-zA-Z\-]{2,}', t)
        ]
        if ar == 'sw':
            cleaned_text = [t for t in cleaned_text if t not in STOPWORDS]
        if ar == 'lm':
            cleaned_text = [get_lemma(token) for token in cleaned_text]
        if ar == 'rw':
            cleaned_text = [
                token for token in cleaned_text if token not in PROFANITY
            ]
        return cleaned_text

    for index, row in df.iterrows():
        row['lyrics'] = clean_text(row['lyrics'], sys.argv[1])

    from gensim import models, corpora
    from gensim.corpora.dictionary import Dictionary
    from gensim.test.utils import common_corpus, common_texts, get_tmpfile

    all_lyrics = []
    all_artists = []
    for index, row in df.iterrows():
        all_lyrics.append(row['lyrics'])
        all_artists.append(row['artist'])

    # common_dictionary = Dictionary(common_texts)
    # common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
    # lda_model = models.LdaModel(common_corpus, num_topics=10)

    dictionary = corpora.Dictionary(all_lyrics)
    corpus = [dictionary.doc2bow(text) for text in all_lyrics]

    NUM_TOPICS = 25
    lda_model = models.LdaModel(corpus=corpus, num_topics=25,
                                id2word=dictionary, passes=20)
    topics = lda_model.print_topics(num_words=4)
    print('LDA Topics')
    for topic in topics:
        print(topic)

    lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS,
                                id2word=dictionary)
    topics = lsi_model.print_topics(num_words=4)
    print('LSI TOPICS')
    for topic in topics:
        print(topic)

    from gensim import similarities

    text = ""
    with open(sys.argv[2]) as inf:
        inf = inf.read()
        text = inf

    bow = dictionary.doc2bow(clean_text(text, sys.argv[1]))

    lda_index = similarities.MatrixSimilarity(lda_model[corpus])
    lsi_index = similarities.MatrixSimilarity(lsi_model[corpus])

    # Let's perform some queries
    similarities = lda_index[lda_model[bow]]
    # Sort the similarities
    similarities = sorted(enumerate(similarities), key=lambda item: -item[1])

    similaritiesLSI = lsi_index[lsi_model[bow]]
    similaritiesLSI = sorted(enumerate(similaritiesLSI), key=lambda item: -item[1])

    # Top most similar documents:
    # print(similarities[:10])
    # [(104, 0.87591344), (178, 0.86124849), (31, 0.8604598), (77, 0.84932965), (85, 0.84843522), (135, 0.84421808), (215, 0.84184396), (353, 0.84038532), (254, 0.83498049), (13, 0.82832891)]

    # Let's see what's the most similar document
    document_id, similarity = similarities[0]
    document_id2, similarityLSI = similaritiesLSI[0]
    # print(all_lyrics[document_id][:1000])

    print("LDA : TOP 5 Similar ARTISTS")
    for el in similarities[:5]:
        print(all_artists[el[0]])
    print('')
    print('LSI : Top 5 Similar Artists')
    for el in similaritiesLSI[:5]:
        print(all_artists[el[0]])
# Count word frequencies
frequency = defaultdict(int)  # maps each word to its frequency
for text in texts:  # iterate over each text document in our corpus
    for token in text:
        frequency[token] += 1

# Only keep words that appear more than once
processed_corpus = [[token for token in text if frequency[token] > 1]
                    for text in texts]
pprint.pprint(processed_corpus)

# Feed the processed corpus into a corpora.Dictionary object; this is our most
# important object: it maps each token to an integer id and tracks its frequencies
dictionary = corpora.Dictionary(processed_corpus)
print(dictionary)

# The token2id attribute returns a dictionary of our tokens and their ids
pprint.pprint(dictionary.token2id)

# The dfs attribute records, for each token id, how many documents contain that token
dictionary.dfs

# Save output dictionary into a text file for later use
dictionary.save_as_text("dict_text.txt")

## Comparison of new document with corpus
# We can convert our entire original corpus to a list of vectors:
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]
def LDA(trainSet, testSet, topics=10, times=200):
    # topics: # of topics in the result
    # times: # of passes during training
    # tokenizer = RegexpTokenizer(r'\w+')

    # create English stop words list
    en_stop = get_stop_words('en')

    # Create p_stemmer of class PorterStemmer
    # p_stemmer = PorterStemmer()

    # create sample documents
    # r1 = "1, 2, 3"
    # r2 = "2, 3, 4"
    # r3 = "1, 3, 5"
    # r4 = "2, 4, 5"
    # r5 = "1, 5, 6"
    # compile sample documents into a list
    # r_set = [r1, r2, r3, r4, r5]
    # print(r_set)

    # list for tokenized documents in loop
    texts = []
    freq = []

    # gradient.csv
    with open('gradient.csv') as myfile:
        csv_reader = csv.reader(myfile)
        for row in csv_reader:
            x = ",".join(row)
            dishList = x.split(',')
            dishList = stem_words(dishList)
            newList = []
            for dish in dishList:
                dish = re.sub('[^A-Za-z]', '', str(dish))
                # print(dish)
                # tokens = tokenizer.tokenize(raw)
                # remove stop words from tokens
                # stopped_tokens = [i for i in raw if not i in en_stop]
                # stem tokens
                # stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
                # add tokens to list
                if dish not in en_stop:
                    dish = dish.lower()
                    newList.append(dish)
                    freq.append(dish)
            # print(newList)
            texts.append(newList)

    print("Filter High Freq Words")
    top = Counter(freq).most_common(1000)
    # print(top)
    topList = []
    for i in top:
        topList.append(i[0])
    topList.remove('allrecip')
    topList.remove('recip')
    topList.remove('martha')
    # topList.remove('re')
    topList.remove('stewart')
    topList.remove('myrecip')
    topList.remove('recipe')
    topList.remove('recipes')
    topList.remove('street')
    topList.remove('epicuri')
    topList.remove('edamam')

    final = []
    for i in texts:
        partFinal = []
        for j in i:
            if j in topList:
                partFinal.append(j)
        final.append(partFinal)

    # turn our tokenized documents into an id <-> term dictionary
    dictionary = corpora.Dictionary(final)
    # print(dictionary)

    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in final]
    # print(corpus)

    print("Train Model")
    # generate LDA model
    flag = False
    while not flag:
        ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=topics,
                                                   id2word=dictionary, passes=times)
        ldamodel.save('lda.model')
        flag = True
        japanFlag = False
        for entry in ldamodel.print_topics():
            print(entry)
            dishes = str(entry[1]).split("+")
            weight = []
            name = []
            for dish in dishes:
                dish = str(dish).split("*")
                # print(dish[0])
                weight.append(float(dish[0]))
                name.append(re.sub('[^A-Za-z]', '', str(dish[1])))
            if max(weight) <= 0.002:
                flag = False
                break
            if "thai" in name and "chines" in name:
                flag = False
                break
            if "thai" in name and "indian" in name:
                flag = False
                break
            if "japanes" in name:
                japanFlag = True
        if not japanFlag:
            flag = False
    # print(ldamodel.print_topics())
    '''
def summarizer(name, file):
    filename = file.rstrip('.pdf') + '.txt'
    f = open(filename, 'r')
    abstract = 0
    text = ""
    stemmer = PorterStemmer()

    # for abstract extraction
    for sentence in f.readlines():
        sentence = sentence.rstrip('\n')
        if abstract and len(sentence) > 0:
            text += sentence + "\n"
        elif stemmer.stem(sentence.lower()) == stemmer.stem(name) and abstract == 0:
            abstract = 1
        if len(sentence) > 0 and sentence == sentence.upper() and abstract == 1 and \
                stemmer.stem(sentence.lower()) != stemmer.stem(name):
            break

    content = sent_tokenize(text)
    content.pop()
    scores = []
    avg_score = 0.0
    for sent in content:
        score = 0.0
        for word in sent.split():
            if word.lower() in tf_values:
                score += tf_values[word.lower()]
        scores.append(score)
        # print("for", sent, "score :", score)
        avg_score += score
    # print(tf_values["Citation".lower()])
    try:
        avg_score /= len(content)
    except:
        print("file can't be used")
        return
    # print(avg_score)

    summary = ""
    for i, j in enumerate(scores):
        if j >= avg_score:
            summary += content[i]

    final_summary = "By TF-IDF\n"
    final_summary += name.upper() + " " + "Summary: \n" + " " + summary + "\n"
    final_summary += "\tOriginal " + name + " : " + str(len(text)) + " chars\n"
    final_summary += "\tReduced Size : " + str(len(summary)) + " chars\n"
    final_summary += "\tcompression ratio : " + str((len(summary)) / (len(text))) + "\n\n"
    output.write(final_summary)

    # code for LDA
    data = []
    lda_stem = PorterStemmer()
    for sent in content:
        words = []
        for word in word_tokenize(sent):
            if word not in get_stop_words('en') and word not in stopwords.words("english") and word != '.':
                words.append(lda_stem.stem(word))
        data.append(words)

    dictionary = corpora.Dictionary(data)
    corpus = [dictionary.doc2bow(sent) for sent in data]

    # LDA model initialization
    topic_used = 3  # number of topics to summarize
    ldamodel = models.ldamodel.LdaModel(corpus, num_topics=topic_used,
                                        id2word=dictionary, passes=100)

    prob_sum_for_each_topic = [0.0 for i in range(topic_used)]
    data_desc = ldamodel.get_document_topics(corpus)
    print(data_desc)
    for data in data_desc:
        print(data)
        for topic_data in data:
            prob_sum_for_each_topic[topic_data[0]] += topic_data[1]
            print(topic_data[1])

    important_topic = prob_sum_for_each_topic.index(max(prob_sum_for_each_topic))
    threshold = prob_sum_for_each_topic[important_topic] / len(data_desc)

    lda_summary = ""
    for index, word_data in enumerate(data_desc):
        if word_data[important_topic][1] > threshold:
            lda_summary += content[index]
    """for index, sent in enumerate(content):
        print(data_desc[important_topic][index][1])
        if data_desc[important_topic][index][1] > threshold:
            lda_summary += content[index]"""

    final_summary_lda = "By LDA\n"
    final_summary_lda += name.upper() + " " + "Summary: \n" + " " + lda_summary + "\n"
    final_summary_lda += "\tOriginal " + name + " : " + str(len(text)) + " chars\n"
    final_summary_lda += "\tReduced Size : " + str(len(lda_summary)) + " chars\n"
    final_summary_lda += "\tcompression ratio : " + str((len(lda_summary)) / (len(text))) + "\n\n"
    output.write(final_summary_lda)
    f.close()
file_docs = []
filenames = []
document_for_test = "the_man_in_the_brown_suit"

for file in os.listdir("data"):
    if file.endswith(".txt"):
        if file in ["stopwords.txt", f"{document_for_test}.txt"]:
            # skip the held-out book: we want to find out which of the other
            # books it is most similar to
            continue
        print(file)
        filenames.append(file)
        file_docs.append(open("data/" + file, "r", encoding="UTF-8").read())
# print(file_docs)

gen_words_list = [[w.lower() for w in word_tokenize(text)] for text in file_docs]
dictionary = corpora.Dictionary(gen_words_list)
corpus = [dictionary.doc2bow(gen_words) for gen_words in gen_words_list]

tf_idf = models.TfidfModel(corpus)
sims = similarities.Similarity("data", tf_idf[corpus], num_features=len(dictionary))

file2_sentence_list = []
with open(f"data/{document_for_test}.txt", "r", encoding="UTF-8") as f:
    tokens = sent_tokenize(f.read())
    for line in tokens:
        file2_sentence_list.append(line)

for line in file2_sentence_list:
    query_doc = [w.lower() for w in word_tokenize(line)]
    query_doc_bow = dictionary.doc2bow(query_doc)
    query_doc_tf_idf = tf_idf[query_doc_bow]
    # print("Comparing Result:", sims[query_doc_tf_idf])
    similarity_list = list(sims[query_doc_tf_idf])
    most_similar_index = similarity_list.index(max(similarity_list))