def test_dict_interface(self):
    """Check that Dictionary offers the Python 2 dict-style API on Python 2 and 3."""
    dictionary = Dictionary(self.texts)
    self.assertTrue(isinstance(dictionary, Mapping))

    # keys() and values() must line up pairwise with items().
    paired = list(zip(dictionary.keys(), dictionary.values()))
    self.assertEqual(paired, list(dictionary.items()))

    # The iter* members are expected to exist even on Python 3 and to yield
    # the same content as their non-iter counterparts.
    for plain, lazy in (
            (dictionary.items, dictionary.iteritems),
            (dictionary.keys, dictionary.iterkeys),
            (dictionary.values, dictionary.itervalues)):
        self.assertEqual(list(plain()), list(lazy()))
def _get_weights_lda(text, stopwords, lda_dictionary, lda, w2idx_embedding,
                     target_word, lda_topics=False):
    """Compute an L2-normalised weight per word of ``text`` from its LDA topics.

    NOTE(review): two parts of the implementation are elided in this file
    (marked ``code removed``): deriving ``lda_topics`` when it is not supplied,
    and filling the topic/word matrix.  Several locals below are not read by
    the visible code and are presumably consumed by the elided parts, so they
    are kept.

    Args:
        text: Tokenised document; passed to ``doc2bow`` and ``Dictionary``.
        stopwords: Not used by the visible code.
        lda_dictionary: Dictionary of the LDA model.
        lda: LDA model; only ``expElogbeta`` is read here.
        w2idx_embedding: Container of words that have an embedding.  Only
            words contained in it are returned.
        target_word: Not used by the visible code.
        lda_topics: Sequence that is reshaped to ``(len(lda_topics), 2)``;
            column 0 is read as topic ids, column 1 presumably as the
            membership probability of each topic.

    Returns:
        Tuple ``(weight_words, lda_topics)`` where ``weight_words`` maps each
        word with a non-zero weight (and an embedding) to that weight.
    """
    my_punctuation = '!"#$%&\'()*+,-./:;<=>?@[]^_`{|}~'
    new_corpus = lda_dictionary.doc2bow(text)
    if not lda_topics:
        ''' code removed '''

    number_of_topics_in_current_document = len(lda_topics)
    topic_word_matrix = lda.expElogbeta
    aux = np.reshape(lda_topics, (len(lda_topics), 2))
    topics = aux[:, 0]
    p_pertenencia_a_topics = aux[:, 1]
    word2id_in_file = Dictionary([text]).token2id
    word2id_global_lda = lda_dictionary.token2id

    # Initialize a matrix whose values will be each word's weight.
    # Shape: [number_of_topics_in_current_document, number_of_words]
    topic_filewords_matrix = np.zeros(
        (number_of_topics_in_current_document, len(word2id_in_file)),
        dtype=float)
    for k_top, topic in enumerate(topics):
        for word in word2id_in_file.keys():
            ''' code removed '''

    # Sum over the topic axis to get each word's accumulated weight across
    # all topics of the document.
    weight = np.sum(topic_filewords_matrix, axis=0)

    # L2-normalise.  Skip the division for an all-zero vector: dividing by a
    # zero norm would turn every weight into NaN, and NaN passes the ``!= 0``
    # filter below.
    norm = np.sum(weight ** 2) ** 0.5
    if norm > 0:
        weight = weight / norm

    # This is a plain function, so the embedding vocabulary is the
    # ``w2idx_embedding`` argument (the previous ``self.w2idx_embedding``
    # raised NameError as soon as a non-zero weight was encountered).
    weight_words = {
        word: ww
        for word, ww in zip(word2id_in_file.keys(), weight)
        if ww != 0 and word in w2idx_embedding
    }
    return weight_words, lda_topics
class MyCorpus(object):
    """Streamed bag-of-words corpus over a text file with one document per line.

    The vocabulary is built once in the constructor; iterating the corpus
    re-reads the file and yields one bag-of-words vector per line.
    """

    def __init__(self, input_file, K):
        """Build the dictionary from ``input_file`` and keep at most ``K`` tokens.

        Args:
            input_file: Path of a text file; each line is split on whitespace.
            K: Passed to ``filter_extremes`` as ``keep_n``.
        """
        self.K = K
        self.input_file = input_file
        self.dictionary = Dictionary()
        with open(input_file, "rt") as f:
            # One streaming call instead of one add_documents() call per line.
            self.dictionary.add_documents(line.split() for line in f)
        self.dictionary.filter_extremes(no_below=2, no_above=0.5, keep_n=K)

    def __iter__(self):
        """Yield the bag-of-words vector of every line of the input file."""
        with open(self.input_file, "rt") as f:
            for line in f:
                yield self.dictionary.doc2bow(line.rstrip().split())

    def __str__(self):
        # Uses this instance's dictionary throughout; the non-zero count used
        # to be read from an unrelated global named ``corpus``.
        return "MyCorpus({} documents, {} features, {} non-zero entries)".format(
            self.dictionary.num_docs,
            len(self.dictionary.keys()),
            self.dictionary.num_nnz)

    def __repr__(self):
        return "MyCorpus('{}', {})".format(self.input_file, self.K)
def gemsim_tfidf(corpus):
    """Print the document most similar to the corpus' two top-scoring words.

    Each line of ``corpus`` is lower-cased, stripped of ``.``, ``|`` and ``,``
    and split on whitespace.  Words are scored by document frequency times
    collection frequency; the two best are used as a TF-IDF query against a
    similarity index of the whole corpus.  Intermediate results are printed.
    """
    from gensim.corpora import Dictionary
    from gensim.models import TfidfModel

    tokenized = [re.sub(r'[.|,]', '', line.lower()).split() for line in corpus]
    dct = Dictionary(tokenized)
    corpus_as_bow = [dct.doc2bow(tokens) for tokens in tokenized]
    print(tokenized)

    # (id, token, score) triples, best score first (stable for ties).
    scored = sorted(
        [(key, dct[key], dct.dfs[key] * dct.cfs[key]) for key in dct.keys()],
        key=lambda item: item[2],
        reverse=True)
    keywords_doc = [token for _, token, _ in scored[:2]]
    print(keywords_doc)
    print(dct.token2id)

    model_trained = TfidfModel(corpus_as_bow)
    for doc in model_trained[corpus_as_bow]:
        print(doc)

    index = similarities.MatrixSimilarity(model_trained[corpus_as_bow])
    keywords_tfidf = model_trained[dct.doc2bow(keywords_doc)]
    print(keywords_tfidf)

    sims = index[keywords_tfidf]
    print(sims)
    print(max(sims))
    best = np.argmax(sims)
    print("most similar doc is {}".format(tokenized[best]))
def test1():
    """Cluster words by an SVD of their co-occurrence matrix and plot them."""
    sentences = [['我吴彦祖', '我张学友'], ['吴彦祖我', '张学友我刘德华吴彦祖'], ['酸奶芝士', '芝士酸奶'], ['芝士蛋糕', '酸奶芝士蛋糕']]
    # NOTE(review): each element handed to jieba.lcut is a list of strings,
    # not a string - confirm that this is what the tokenizer expects.
    ls_of_words = list(map(jieba.lcut, sentences))
    dt = Dictionary(ls_of_words).token2id
    ls_of_wids = [[dt[word] for word in words] for words in ls_of_words]

    dimension = len(dt)  # number of distinct tokens
    matrix = np.matrix([[0] * dimension] * dimension)
    for wids in ls_of_wids:
        co_occurrence_matrix(matrix, wids)
    print(matrix)

    # Singular Value Decomposition
    U, s, Vh = np.linalg.svd(matrix, full_matrices=False)

    # Clustering on the first two left singular vectors.
    X = -U[:, 0:2]
    labels = KMeans(n_clusters=2).fit(X).labels_
    colors = ('y', 'g')
    mp.rcParams['font.sans-serif'] = ['SimHei']  # font able to render Chinese

    for word, i in dt.items():
        x_pos, y_pos = X[i, 1], X[i, 0]
        mp.scatter(x_pos, y_pos, c=colors[labels[i]], s=400, alpha=0.4)
        mp.text(x_pos, y_pos, word, ha='center', va='center')
    mp.show()
class TermFrequency(object):
    """Term-frequency representation of a document collection."""

    def __init__(self, documents):
        logging.info("Creating Term Frequency")
        self.id2Word = Dictionary(documents)
        self.num_unique_words = len(self.id2Word)
        self.distance_matrix = self.to_term_frequency_matrix(documents)

    def to_term_frequency_vector(self, document):
        """Return the sparse bag-of-words vector of one document."""
        return self.id2Word.doc2bow(document)

    def to_binary_vector(self, document):
        """Return a dense 0/1 vector marking which vocabulary terms occur."""
        sparse = self.id2Word.doc2bow(document)
        vocabulary_size = len(self.id2Word.keys())
        dense = sparse2full(sparse, vocabulary_size)
        # Any positive count becomes 1.
        return np.array(dense > 0, dtype=int)

    def to_term_frequency_matrix(self, documents):
        """Return one sparse bag-of-words vector per document."""
        return list(map(self.to_term_frequency_vector, documents))

    def binary_matrix(self):
        """Return the dense term-frequency matrix with positive cells set to 1."""
        full_matrix = MatrixHelper.gensim_to_python_mdarray(
            self.distance_matrix, self.num_unique_words)
        binary_rows = []
        for row in full_matrix:
            binary_rows.append([0 if cell <= 0 else 1 for cell in row])
        return binary_rows
class TermFrequency(object):
    """Term-frequency representation of a document collection."""

    def __init__(self, documents):
        logging.info("Creating Term Frequency")
        self.id2Word = Dictionary(documents)
        self.num_unique_words = len(self.id2Word)
        self.distance_matrix = self.to_term_frequency_matrix(documents)

    def to_term_frequency_vector(self, document):
        """Return the sparse bag-of-words vector of one document."""
        return self.id2Word.doc2bow(document)

    def to_binary_vector(self, document):
        """Return a dense 0/1 vector marking which vocabulary terms occur."""
        sparse = self.id2Word.doc2bow(document)
        vocabulary_size = len(self.id2Word.keys())
        dense = sparse2full(sparse, vocabulary_size)
        # Any positive count becomes 1.
        return np.array(dense > 0, dtype=int)

    def to_term_frequency_matrix(self, documents):
        """Return one sparse bag-of-words vector per document."""
        return list(map(self.to_term_frequency_vector, documents))

    def binary_matrix(self):
        """Return the dense term-frequency matrix with positive cells set to 1."""
        full_matrix = MatrixHelper.gensim_to_python_mdarray(
            self.distance_matrix, self.num_unique_words)
        binary_rows = []
        for row in full_matrix:
            binary_rows.append([0 if cell <= 0 else 1 for cell in row])
        return binary_rows
class TFIDF():
    """TF-IDF model trained on the preprocessed database records."""

    def __init__(self):
        pass

    def preprocess_tfidf(self):
        """Return the processed text of every database record."""
        return [process_text(r) for r in get_db_records()]

    def create_tfidf_model(self):
        """Build the dictionary (tokens in at least 50 documents) and the model."""
        self.dataset = self.preprocess_tfidf()
        self.dct = Dictionary(self.dataset)
        self.dct.filter_extremes(no_below=50)
        corpus = [self.dct.doc2bow(line) for line in self.dataset]
        self.model = TfidfModel(corpus)

    def infer_tfidf(self):
        """Return a function mapping a token list to a dense TF-IDF vector.

        The returned vector has one entry per token id from 0 up to the
        largest id in the dictionary; ids absent from the document are 0.
        """
        def infer(vector):
            # Size by the largest id rather than by whichever id keys()
            # happens to list last; an empty dictionary yields [].
            dim = max(self.dct.keys(), default=-1) + 1
            dense = [0] * dim
            # Single pass over the sparse result instead of rescanning it
            # once per dimension.
            for token_id, weight in self.model[self.dct.doc2bow(vector)]:
                dense[token_id] = weight
            return dense

        return infer

    @staticmethod
    def load(filename):
        """Unpickle and return the object stored in ``filename``."""
        # NOTE: unpickling can execute arbitrary code - only load files that
        # come from a trusted source.
        with open(filename, "rb") as f:
            return pickle.load(f)
def test_dict_interface(self):
    """Check that Dictionary offers the Python 2 dict-style API on Python 2 and 3."""
    dictionary = Dictionary(self.texts)
    self.assertTrue(isinstance(dictionary, Mapping))

    # keys() and values() must line up pairwise with items().
    paired = list(zip(dictionary.keys(), dictionary.values()))
    self.assertEqual(paired, list(dictionary.items()))

    # The iter* members are expected to exist even on Python 3 and to yield
    # the same content as their non-iter counterparts.
    for plain, lazy in (
            (dictionary.items, dictionary.iteritems),
            (dictionary.keys, dictionary.iterkeys),
            (dictionary.values, dictionary.itervalues)):
        self.assertEqual(list(plain()), list(lazy()))

    # XXX Do we want list results from the dict members in Py3 too?
    if PY3:
        return
    for accessor in (dictionary.items, dictionary.keys, dictionary.values):
        self.assertTrue(isinstance(accessor(), list))
def do_ir2(db, param):
    """Cluster project titles by the bigrams they contain and store the result.

    Every project title is turned into a binary vector over the bigram
    vocabulary, the vectors are clustered with k-means, and one document per
    cluster is written to the ``g8.ir2`` Mongo collection.

    Args:
        db: Only printed; the database names used below are hard-coded.
        param: Number of clusters (converted with ``int``).
    """
    # A single formatted string prints identically on Python 2 and 3.
    print('Computazione di IR2 %s %s ...' % (db, param))

    # Built once instead of on every call of words().
    english_stopwords = set(nltk.corpus.stopwords.words('english'))

    def words(text):
        """Lower-case and tokenise ``text``, dropping punctuation and stopwords."""
        return [w for w in nltk.word_tokenize(text.lower())
                if w not in string.punctuation and w not in english_stopwords]

    class BigramsCorpus:
        """Iterates a Mongo collection, yielding each ``_id`` as a one-token document."""

        def __init__(self, db, collection):
            self.client = MongoClient()[db][collection]

        def __iter__(self):
            for doc in self.client.find():
                yield [doc['_id']]

        def __len__(self):
            return self.client.count()

    bigram_corpus = BigramsCorpus('cordis', 'bi_grams')
    bigrams = Dictionary(bigram_corpus)

    project = {'$project': {'_id': 0, 'title': 1, 'reference': 1}}
    project_corpus = MongoCorpus('cordis', 'projects', aggregate=[project])

    n = max(bigrams.keys())

    def title_vector(doc):
        """Return the binary bigram-presence vector of a project's title."""
        title_bigrams = [' '.join(pair) for pair in nltk.bigrams(words(doc['title']))]
        vector = [0] * (n + 1)
        for bigram_id, _ in bigrams.doc2bow(title_bigrams):
            vector[bigram_id] = 1
        return vector

    dataset = [title_vector(doc) for doc in project_corpus]

    alg = KMeans(n_clusters=int(param))
    alg.fit(dataset)

    clusters = defaultdict(list)
    for doc in project_corpus:
        label = alg.predict([title_vector(doc)])[0]
        # predict() returns NumPy integers, which Mongo cannot encode
        # ("InvalidDocument: Cannot encode object: 0"); store plain ints.
        clusters[int(label)].append(doc['reference'])

    mongo_clusters = [{'cluster': k, 'projects': v} for k, v in clusters.items()]
    print(mongo_clusters)

    # Save to the Mongo collection.
    mongo = MongoClient()['g8']['ir2']
    mongo.insert_many(mongo_clusters)
    print('Fatto!')
def tfidf_train(table, tokens_col, tf_weighing='n', df_weighing='t', document_normalization='c'):
    """Fit a TF-IDF model on a token column and attach the sparse vectors.

    Args:
        table: Input table; it is copied, not modified.
        tokens_col: Name of the column holding the token lists.
        tf_weighing, df_weighing, document_normalization: Concatenated in this
            order into the ``smartirs`` argument of ``TfidfModel``.

    Returns:
        Dict with ``out_table`` (the copy plus a ``sparse_vectors`` column)
        and ``fit_model`` (dictionary, model and a markdown report).
    """
    out_table = table.copy()
    documents = out_table[tokens_col]
    smartirs = tf_weighing + df_weighing + document_normalization

    dictionary = Dictionary(documents)
    bow_corpus = [dictionary.doc2bow(tokens) for tokens in documents]
    model = TfidfModel(bow_corpus, smartirs=smartirs)
    tfidf_corpus = list(model[bow_corpus])
    # corpus2csc is terms x documents; transpose to one row per document.
    sparse_matrix = corpus2csc(
        tfidf_corpus, num_terms=len(dictionary.token2id)).T

    # Vocabulary listing (id -> word) for the report.
    words = list(dictionary.values())
    indices = list(dictionary.keys())
    vocabulary = pd.DataFrame({'indice': indices, 'word': words})

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
        | ## Dictionary
        | {table1}
        """.format(table1=pandasDF2MD(vocabulary))))

    out_table['sparse_vectors'] = sparse_encode(
        sparse_matrix)['sparse_vectors']

    fit_model = {
        'dictionary': dictionary,
        'model': model,
        'report': rb.get(),
    }
    return {'out_table': out_table, 'fit_model': fit_model}
# Notebook-style scratch cells: map the text columns to dictionary ids, pad
# them, split off a validation set and start defining a Keras model.
# NOTE(review): the leading `word2vec.` looks like the tail of a statement that
# was cut off before this fragment - confirm against the full notebook.
word2vec. train.text.apply(lambda x: (map(x,dictionary.token2id)))
# Look up the id of a single token.
list(map(dictionary.token2id.get,['allah']))
dictionary.token2id
# Replace every test document by the list of its token ids.
test.text=test.text.apply(lambda x: dictionary.doc2idx(x))
test_text=pad_sequences(test.text)
#test_target=test.target.values
test_text.shape,train_text.shape
len(dictionary.keys())
from sklearn.model_selection import train_test_split
# Hold out 20% of the training data for validation.
train_x,val_x,train_y,val_y=train_test_split(train_text,train_target,test_size=.2)
from keras.models import Sequential
from keras.layers import Embedding, Dense, Dropout,LSTM
vocab_size=len(dictionary.keys())
input_dimension=train_x.shape[1]
def create_model():
    # NOTE(review): the definition may continue beyond this fragment; no
    # further layers or return statement are visible here.
    model=Sequential()
    model.add(Embedding(vocab_size,16,input_length=input_dimension))
# Split the raw text into documents, keep letters and spaces only, and build a
# term-by-document count matrix over the filtered gensim vocabulary.
data = data.split(" = ")
datagensim = []
regex = re.compile('[^a-zA-Z ]')
for d in data:
    # Drop every character that is not an ASCII letter or a space.
    test = regex.sub('', d)
    # Skip short fragments; keep only words longer than three characters.
    if len(test) > 100:
        datagensim.append([word for word in test.split(" ") if len(word) > 3])

dct = Dictionary(datagensim)
dct.filter_extremes(no_below=2, no_above=0.9)
dct.compactify()

# X[token_id, document_index] is the count of the token in the document.
X = np.zeros((len(dct.keys()), len(datagensim)), int)
i = 0
bow = []
datagensimClean = []
for d in datagensim:
    idx = dct.doc2idx(d)
    # Keep only tokens that survived filter_extremes (doc2idx gives -1 for
    # the others).  Pairing tokens with their ids avoids reusing ``i`` as the
    # comprehension index, which shadowed the column counter (and overwrote
    # it on Python 2, where comprehension variables leak).
    dC = [token for token, token_id in zip(d, idx) if token_id > -1]
    tmp = dct.doc2bow(dC)
    datagensimClean.append(dC)
    bow.append(tmp)
    for key, value in tmp:
        X[key, i] = value
    i += 1
datagensim = datagensimClean
# Fragment: tail of a per-author loop that accumulates paper tokens, followed
# by the TF-IDF export of the author corpus.
# NOTE(review): the enclosing `if` and loops start before this fragment, so the
# nesting of the first five statements is reconstructed - confirm.
        elif (el['paperId'] in S2_Proto) and (S2_Proto[el['paperId']] in paper_ids):
            # Paper known through the S2_Proto id mapping: count it and add its tokens.
            npapers = npapers + 1
            tokens = tokens + paper_corpus[paper_ids.index(
                S2_Proto[el['paperId']])]
    author_npapers.append(npapers)
    author_corpus.append(tokens)

dct = Dictionary(author_corpus)  # fit dictionary
corpus_bow = [dct.doc2bow(line) for line in author_corpus]  # convert corpus to BoW format
model = TfidfModel(corpus_bow)
corpus_tfidf = [model[el] for el in corpus_bow]
vocab_tfidf = [dct[el] for el in dct.keys()]

# Vocabulary file: one "<position>:<word>" line per dictionary entry.
with vocab_tfidf_file.open('w') as fout:
    [
        fout.write(str(idx) + ':' + wd + '\n')
        for idx, wd in enumerate(vocab_tfidf)
    ]

# Corpus file: "<author_name_with_underscores> <npapers> <id>:<weight> ..." per author.
with corpus_tfidf_file.open('w') as fout:
    for an, anp, doc_tfidf in zip(author_names, author_npapers,
                                  corpus_tfidf):
        fout.write(an.replace(' ', '_') + ' ' + str(anp))
        for token in doc_tfidf:
            fout.write(' ' + str(token[0]) + ':' + str(token[1]))
        fout.write('\n')
# Clean the first 200 documents, keep letters and spaces only, and build a
# term-by-document count matrix over the filtered gensim vocabulary.
datagensim = []
regex = re.compile('[^a-zA-Z ]')
for d in data[:200]:
    # Drop every character that is not an ASCII letter or a space.
    test = regex.sub('', d)
    # Skip short fragments; keep lower-cased words longer than two characters.
    if len(test) > 100:
        datagensim.append([word.lower() for word in test.split(" ") if len(word) > 2])

dct = Dictionary(datagensim)
dct.filter_extremes(keep_n=50000, no_above=0.8)
dct.compactify()

# X[token_id, document_index] is the count of the token in the document.
X = np.zeros((len(dct.keys()), len(datagensim)), int)
i = 0
bow = []
datagensimClean = []
for d in datagensim:
    idx = dct.doc2idx(d)
    # Keep only tokens that survived filter_extremes (doc2idx gives -1 for
    # the others).  Pairing tokens with their ids avoids reusing ``i`` as the
    # comprehension index, which shadowed the column counter (and overwrote
    # it on Python 2, where comprehension variables leak).
    dC = [token for token, token_id in zip(d, idx) if token_id > -1]
    tmp = dct.doc2bow(dC)
    datagensimClean.append(dC)
    bow.append(tmp)
    for key, value in tmp:
        X[key, i] = value
    i += 1
datagensim = datagensimClean
def analyze(originfile, all=False):
    """Start a local CoreNLP server and write frequency / bag-of-words files.

    Args:
        originfile: Passed to ``helper.getKeywords`` to obtain the keywords.
        all: When true, build ``resources/bow/all.csv`` from the existing
            ``resources/corpus_tok_all.csv`` and
            ``resources/bow/allfreq/stanford/all.txt`` (both must already
            exist; the code that produced them used to live here as
            commented-out blocks and is available in version control).
            Otherwise tokenise the per-keyword review CSVs and write their
            frequency and bag-of-words files.  The parameter name shadows the
            builtin and is kept for existing callers.
    """
    keywords = helper.getKeywords(originfile)
    nlp_wrapper = _start_corenlp_server()
    print("Number of processors: ", mp.cpu_count())
    if all:
        _build_bow_for_all_reviews()
    else:
        _build_bow_per_keyword(keywords, nlp_wrapper)


def _start_corenlp_server():
    """(Re)start the CoreNLP server on port 9000 and return a client for it."""
    os.chdir('./resources/stanford-corenlp-full-2018-10-05')
    try:
        # Free the port in case an earlier server is still running.
        os.system('kill $(lsof -t -i:9000)')
        cmd = 'java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -annotators "tokenize,ssplit,pos,lemma,parse,sentiment" -port 9000 -timeout 10000000000000 &'
        time.sleep(4)
        print("starting nlp service")
        # The trailing "&" backgrounds the server, so a shell is required.
        with open(os.devnull, "w") as devnull:
            subprocess.call(cmd, shell=True, stderr=devnull, stdout=devnull)
        time.sleep(4)
        print("nlp service started")
    finally:
        # Restore the working directory even if launching failed.
        os.chdir('../../')
    return StanfordCoreNLP('http://localhost:9000')


def _map_with_timeout(worker, docs, initargs, timeout):
    """Map ``worker`` over ``docs`` in a fresh pool; return None on timeout.

    The pool is always shut down before returning, so a hung worker can no
    longer leave ``cpu_count() * 2`` processes behind.
    """
    pool = mp.Pool(initializer=init_globals, processes=mp.cpu_count() * 2,
                   initargs=initargs)
    try:
        results = pool.map_async(worker, docs).get(timeout=timeout)
        pool.close()
    except (TimeoutError, mp.TimeoutError):
        # AsyncResult.get raises multiprocessing.TimeoutError, which is not
        # the builtin TimeoutError; catch both so the fallback runs whichever
        # name this module has in scope.
        results = None
        pool.terminate()
    except Exception:
        pool.terminate()
        pool.join()
        raise
    pool.join()
    return results


def _binary_bow_rows(corpus_tok, top_tokens, progress_every=0):
    """Return ``(rows, toplen)`` for the bag-of-words CSV.

    ``rows[i]`` has a 1 at the position of every top token that occurs in
    ``corpus_tok[i][1]``; ``toplen`` is the largest combined length of the
    details and token lists, used to pad the CSV rows.
    """
    # First occurrence wins, matching list.index().
    position = {}
    for pos, tok in enumerate(top_tokens):
        position.setdefault(tok, pos)

    rows = []
    toplen = 0
    for i, entry in enumerate(corpus_tok):
        if progress_every and i % progress_every == 0:
            print(i)
            print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
        row = [0] * len(top_tokens)
        toplen = max(toplen, len(entry[0]) + len(entry[1]))
        for tok in entry[1]:
            pos = position.get(tok)
            if pos is not None:
                row[pos] = 1
        rows.append(row)
    return rows, toplen


def _write_bow_csv(path, corpus_tok, rows, toplen, top_tokens, progress_every=0):
    """Write details, tokens, padding and the 0/1 columns of every document."""
    with open(path, mode='w') as file:
        writer = csv.writer(file, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerow([''] * toplen + top_tokens)
        for j, (entry, row) in enumerate(zip(corpus_tok, rows), start=1):
            if progress_every and j % progress_every == 0:
                print(j)
                print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
            content = entry[0] + entry[1]
            writer.writerow(content + [''] * (toplen - len(content)) + row)


def _parse_review_details(raw):
    """Parse the stringified details list of one review into four fields.

    NOTE(review): the slicing offsets depend on the exact text layout of
    ``corpus_tok_all.csv``; statement order is preserved from the original.
    """
    raw = raw.replace('[', '').replace(']', '')
    det = raw.split(',')
    if 'São Tomé' in det[1]:  # São Tomé and PrÃ\\\\xadncipe
        det[1] = ' ' + 'São Tomé and PrÃ\xadncipe' + ' '
    if det[1][-1] != "'":  # country containing a comma, e.g. "France, Metro"
        if 'Ivoire' in det[1]:  # Cote d'Ivoire
            det[1] = det[1].replace('\\', '')
            det[2] = det[2][1:]
        else:
            # Re-join the two halves of the country and shift the rest left.
            det[1] = det[1] + ',' + det[2]
            for j in range(2, len(det) - 1):
                det[j] = det[j + 1]
            del det[len(det) - 1]
    det = det[:3]
    desc = raw.split(',')[-1]
    det[0] = det[0][1:-1]
    det[1] = det[1][2:-1]
    det[2] = det[2][2:-1]
    det.append(desc[3:-1])
    return det


def _build_bow_for_all_reviews():
    """Build ``resources/bow/all.csv`` from the pre-computed corpus and frequencies."""
    print("all")
    # NOTE(review): nothing left in this branch uses the connection; it is
    # still opened in case db_connection() is relied on for side effects -
    # confirm and drop if not.
    conn = db.db_connection()
    db.db_operator(conn)

    corpus_tok = []
    with open('./resources/corpus_tok_all.csv', mode='r') as file:
        reader = csv.reader(file, delimiter='|', quotechar='"')
        for n, row in enumerate(reader, start=1):
            if n % 100000 == 0:
                print(n)
            corpus_tok.append(row)
    print("len corpus_tok= " + str(len(corpus_tok)))

    # Only the 500 most frequent entries are read.  Each line is the str() of
    # a tuple; strip quotes, parentheses and spaces and split it into fields.
    strip_tuple_chars = str.maketrans('', '', '\'"() ')
    freq = []
    with open('./resources/bow/allfreq/stanford/all.txt', mode='r') as file:
        reader = csv.reader(file, delimiter='|', quotechar='"')
        for n, row in enumerate(reader):
            if n == 500:
                break
            freq.append(tuple(row[0].translate(strip_tuple_chars).split(',')))

    strip_list_chars = str.maketrans('', '', "' []")
    for i, row in enumerate(corpus_tok):
        if i % 100000 == 0:
            print(i)
        row[0] = _parse_review_details(row[0])
        row[1] = row[1].translate(strip_list_chars).split(',')

    print("writing bow file")
    top_tokens = [f[1] for f in freq[:400]]
    print("corpus_tok_len= " + str(len(corpus_tok)))
    rows, toplen = _binary_bow_rows(corpus_tok, top_tokens, progress_every=100000)
    print("len corpus_bow keys= " + str(len(rows)))
    print("got corpus_bow")
    print("corpus_bow_len " + str(len(rows)))
    _write_bow_csv('resources/bow/all.csv', corpus_tok, rows, toplen, top_tokens,
                   progress_every=100000)
    print("over")


def _token_frequencies(dictionary, documents):
    """Return per-token frequency tuples sorted by document frequency.

    Each tuple is ``(id, token, count, count / total_tokens, n_docs_with_token,
    n_docs_with_token / n_docs, rank)``.
    """
    from collections import Counter

    # One pass over the corpus instead of list.count() per vocabulary entry.
    term_counts = Counter(tok for doc in documents for tok in doc)
    doc_counts = Counter(tok for doc in documents for tok in set(doc))
    total_tokens = sum(term_counts.values())
    n_docs = len(documents)

    freq = []
    for n, token_id in enumerate(dictionary, start=1):
        if n % 1000 == 0:
            print("analyzing token " + str(n))
        token = dictionary.get(token_id)
        count = term_counts[token]
        docs_with_token = doc_counts[token]
        freq.append((token_id, token, count, count / total_tokens,
                     docs_with_token, docs_with_token / n_docs))
    freq.sort(key=lambda tup: tup[5], reverse=True)
    return [entry + (rank,) for rank, entry in enumerate(freq)]


def _tokenize_keyword_reviews(keyword, emotion, nlp_wrapper):
    """Tokenise the keyword/emotion CSV in 400 batches of 1000 rows."""
    pool_args = (Value('i', 1), SpellChecker(), nlp_wrapper,)
    pool_args = (pool_args[1], pool_args[0], pool_args[2]) and (pool_args[0], pool_args[1], pool_args[2])
    counter, spell = pool_args[0], pool_args[1]
    initargs = (counter, spell, nlp_wrapper,)

    corpus_tok_all = []
    for i in range(400):
        print(str(i))
        offset = i * 1000
        limit = 1000
        print("starting reading")
        print("limit=" + str(limit))
        print("offset=" + str(offset))
        # NOTE(review): the file handle is left for getRawCorpus to manage, as
        # before; it is not visible here whether the helper reads eagerly.
        raw_corpus = helper.getRawCorpus(
            csv_file=open('resources/csvs/' + keyword + '_' + emotion.lower() + '.csv', mode='r',
                          encoding="utf8", newline='\n'),
            additionaldetails=True, limit=limit, offset=offset)

        print("starting analysis")
        corpus_tok = _map_with_timeout(
            thread_function_row_only, [doc for doc in raw_corpus], initargs, 30)
        if corpus_tok is None:
            # The batch timed out: retry one document at a time so that a
            # single problematic review only loses itself.
            print("timeout error")
            corpus_tok = []
            for doc in raw_corpus:
                single = _map_with_timeout(thread_function_row_only, [doc], initargs, 30)
                if single is None:
                    print(str(doc) + " caused Exception")
                    corpus_tok.append(None)
                else:
                    corpus_tok.append(single[0])

        corpus_tok_reduced = [r for r in corpus_tok if r is not None]
        print("len corpus_tok: " + str(len(corpus_tok)))
        print("len corpus_tok_reduced: " + str(len(corpus_tok_reduced)))
        corpus_tok_all += corpus_tok_reduced
        print("len corpus_tok_all: " + str(len(corpus_tok_all)))
    return corpus_tok_all


def _process_keyword(keyword, emotion, nlp_wrapper):
    """Tokenise one keyword/emotion pair and write its frequency and BoW files."""
    start_time = time.time()
    print(keyword + ' ---- ' + time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))

    corpus_tok = _tokenize_keyword_reviews(keyword, emotion, nlp_wrapper)
    print("len corpus_tok: " + str(len(corpus_tok)))

    if len(corpus_tok) > 0:
        corpustokonly = [r[1] for r in corpus_tok]
        print("doing bigrams")
        # Bigrams (joined with "_") are added to the documents alongside the
        # single words.  A bigram must occur in at least 0.1% of the corpus
        # size to be kept.
        bigram = Phrases(corpustokonly, min_count=0.001 * len(corpus_tok))
        for tokens in corpustokonly:
            # ``tokens`` is the same list object as the document's token list
            # in corpus_tok, so extending it updates both views.
            found = [token for token in bigram[tokens] if '_' in token]
            tokens.extend(found)

        from gensim.corpora import Dictionary
        print("writing frequence file")
        # Create a dictionary representation of the documents.
        dictionary = Dictionary(corpustokonly)
        print("len dictionary = " + str(len(dictionary.keys())))
        freq = _token_frequencies(dictionary, corpustokonly)

        os.makedirs('resources/bow/allfreq/stanford/', exist_ok=True)
        with open('resources/bow/allfreq/stanford/' + keyword + '_' + emotion.lower() + '.txt', 'w') as f:
            for item in freq:
                f.write(str(item) + '\n')

        print("writing bow file")
        top_tokens = [entry[1] for entry in freq[:500]]
        rows, toplen = _binary_bow_rows(corpus_tok, top_tokens)
        _write_bow_csv('resources/bow/' + keyword + '_' + emotion.lower() + '.csv',
                       corpus_tok, rows, toplen, top_tokens)

    print('------------------------------------------------------')
    print(str(time.time() - start_time) + ' seconds to compute ' + keyword + ' ' + emotion)


def _build_bow_per_keyword(keywords, nlp_wrapper):
    """Process the keyword/emotion pairs that are currently enabled."""
    print("not all")
    for emotion in ['Good', 'Bad']:
        print("begin " + emotion)
        for keyword in list(keywords.keys()):
            # Only the "cleaning"/"Good" combination is processed at present.
            if emotion == 'Good' and keyword == 'cleaning':
                _process_keyword(keyword, emotion, nlp_wrapper)
class LdaSelector(QObject):
    """Build a bag-of-words corpus for one journal/year and fit LDA models on it.

    ``build_corpus`` reads ``data/journal_year/<journal>/<year>.csv`` (relative
    to this file's directory), ``train_test`` plots held-out perplexity for a
    range of topic counts, and ``select_lda`` writes the top words of every
    topic to ``data/lda_topic/<journal>/<year>.txt``.
    """

    # Signal emitted when the train/test perplexity sweep has finished.
    train_test_over_msg = pyqtSignal()
    # Signal emitted when topic-word extraction has finished. Arguments:
    # journal name (str), year (str), number of topics extracted (int).
    select_over_msg = pyqtSignal(str, str, int)
    # Directory of the current file; all data paths are built from it.
    abspath = os.path.dirname(__file__)

    def __init__(self):
        QObject.__init__(self)
        # Start with empty instances (not the bare types) so the attributes
        # are usable even before build_corpus() has been called.
        self.dictionary = Dictionary()
        self.corpus_a = []

    def perplexity(self, ldamodel, testset, dictionary, size_dictionary,
                   num_topics):
        """Calculate the perplexity of an LDA model on a test set.

        Args:
            ldamodel: trained model exposing ``show_topic`` and
                ``get_document_topics``.
            testset: iterable of documents, each a sequence of
                ``(word_id, weight)`` pairs.
            dictionary: mapping from word id to word, e.g.
                ``{7822: 'deferment', 1841: 'circuitry', ...}``.
            size_dictionary: number of words requested from every topic.
            num_topics: number of topics in ``ldamodel``.

        Returns:
            ``exp(-sum(log p(w)) / sum(weights))`` over the whole test set.

        Raises:
            ZeroDivisionError: if the test set carries no word weight at all.
        """
        # One {word: p(word | topic)} mapping per topic.
        topic_word_list = [
            dict(ldamodel.show_topic(topic_id, size_dictionary))
            for topic_id in range(num_topics)
        ]

        prob_doc_sum = 0.0
        testset_word_num = 0
        for doc in testset:
            # Look topics up by id rather than by position: the model may
            # leave very unlikely topics out of the returned list, in which
            # case positional indexing would read the wrong topic or fail.
            doc_topics = dict(
                ldamodel.get_document_topics(doc, minimum_probability=0))
            prob_doc = 0.0  # log-probability of this document
            doc_word_num = 0  # total word weight of this document
            for word_id, num in doc:
                doc_word_num += num
                word = dictionary[word_id]
                # p(w) = sum_z p(z | d) * p(w | z)
                prob_word = 0.0
                for topic_id in range(num_topics):
                    prob_topic = doc_topics.get(topic_id, 0.0)
                    prob_word += prob_topic * topic_word_list[topic_id][word]
                # NOTE(review): the log term is not multiplied by ``num``
                # although the denominator sums ``num`` -- confirm intended.
                prob_doc += math.log(prob_word)
            prob_doc_sum += prob_doc
            testset_word_num += doc_word_num

        # perplexity = exp(-sum(log p(d)) / sum(N_d))
        return math.exp(-prob_doc_sum / testset_word_num)

    def build_corpus(self, journal, year):
        """Read one journal/year file and populate ``dictionary``/``corpus_a``.

        Every line of the file is lower-cased, tokenized and stripped of
        English stop words, punctuation, a fixed list of uninformative words
        and any token that contains a digit.

        Args:
            journal: journal name, used as a directory name.
            year: year as a string, used as the CSV file stem.
        """
        input_path = self.abspath + '/data/journal_year/' + journal + '/'
        input_filename = year + '.csv'
        # Close the file deterministically instead of leaking the handle.
        with open(input_path + input_filename, 'r', encoding='utf-8',
                  errors='ignore') as input_file:
            input_lines = input_file.readlines()

        # Punctuation and enumeration markers ("a.", "b.", ...).
        english_punctuations = [
            ',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@',
            '#', '$', '%', '’', '≤', 'a.', 'b.', 'c.', 'd.', 'e.', 'm.',
            'n.', 'p.', 'f.', 'g.', 'h.', 'i.', 'j.', 'k.', 'l.', 'o.', 'q.',
            'r.', 's.', 't.', 'u.', 'v.', 'w.', 'x.', 'y.', 'z.'
        ]
        # Frequent words that carry no topical information.
        dropword = [
            'model', 'method', 'published', 'results', 'using', 'study',
            'The', '\'\'', '``', 'two', 'paper', 'online'
        ]
        # One set gives O(1) membership tests and lets a single pass replace
        # the separate stop-word / punctuation / drop-word filters.
        excluded = set(stopwords.words('english'))
        excluded.update(english_punctuations)
        excluded.update(dropword)

        train_set = []
        for text in input_lines:
            # Lower-case, then tokenize.
            tokens = word_tokenize(text.lower())
            # Drop excluded tokens and anything containing a digit.
            train_set.append([
                word for word in tokens
                if word not in excluded and not re.search(r'\d', word)
            ])

        # Build the training corpus.
        self.dictionary = Dictionary(train_set)
        self.dictionary.filter_extremes(no_below=5, no_above=0.5)
        self.corpus_a = [self.dictionary.doc2bow(text) for text in train_set]

    def train_test(self, journal, year, upper_bound, lower_bound, step):
        """Plot test perplexity for every topic count in a range.

        Topic counts are ``range(lower_bound, upper_bound, step)``. The
        TF-IDF weighted corpus is split 80/20 into train and test parts, one
        model is trained per topic count, and ``train_test_over_msg`` is
        emitted after the plot has been shown.

        Args:
            journal: journal name passed to ``build_corpus``.
            year: year string passed to ``build_corpus``.
            upper_bound: exclusive upper end of the topic-count range.
            lower_bound: inclusive lower end of the topic-count range.
            step: increment between successive topic counts.
        """
        self.build_corpus(journal, year)
        # Split into training and test sets.
        tfidf = models.TfidfModel(self.corpus_a)
        corpus = tfidf[self.corpus_a]
        p = int(len(corpus) * .8)
        cp_train = corpus[0:p]
        cp_test = corpus[p:]
        # Train one LDA model per candidate topic count.
        grid = dict()
        for topic in range(lower_bound, upper_bound, step):
            lda = LdaModel(corpus=cp_train,
                           id2word=self.dictionary,
                           num_topics=topic,
                           passes=2,
                           update_every=0,
                           alpha='auto',
                           iterations=500)
            test_perplexity = self.perplexity(lda, cp_test, self.dictionary,
                                              len(self.dictionary.keys()),
                                              topic)
            print(topic)
            print(test_perplexity)
            grid[topic] = [test_perplexity]

        df = pd.DataFrame(grid)
        plt.figure(figsize=(14, 8), dpi=120)
        plt.subplot(221)
        plt.plot(df.columns.values, df.iloc[0].values, '#007A99', linewidth=2)
        plt.xticks(df.columns.values)
        plt.ylabel(journal + '_' + year + '_test_perplexity')
        plt.show()
        self.train_test_over_msg.emit()

    def select_lda(self, journal, year, num_topics):
        """Train LDA on the full corpus and write each topic's top 30 words.

        One line is written per topic, formatted as
        ``word:prob,word:prob,...``. ``select_over_msg`` is emitted once the
        file has been written.

        Args:
            journal: journal name, used for input and output directories.
            year: year string, used for input and output file stems.
            num_topics: number of topics to extract.
        """
        self.build_corpus(journal, year)
        lda = LdaModel(corpus=self.corpus_a,
                       id2word=self.dictionary,
                       num_topics=num_topics,
                       passes=2,
                       update_every=0,
                       alpha='auto',
                       iterations=500)
        # Output.
        output_path = self.abspath + '/data/lda_topic/' + journal + '/'
        output_filename = year + '.txt'
        with open(output_path + output_filename, 'w', newline='',
                  encoding='UTF-8') as f:
            for i in range(num_topics):
                # Query the model once per topic instead of once per field.
                topic_terms = lda.show_topic(i, topn=30)
                input_str = ','.join(
                    word + ':' + str(prob) for word, prob in topic_terms)
                f.write(input_str + '\n')
        self.select_over_msg.emit(journal, year, num_topics)
def prepare_text_for_fitting(full_texts, sentences, nlp, **kwargs):
    """Build TF-IDF, GloVe-weighted and Word2Vec-weighted sentence matrices.

    Each of the three matrices is cached in a pickle file in the current
    working directory and is loaded from there when the file already exists.

    Args:
        full_texts: the whole corpus as one string; chapters are separated
            by six consecutive newlines.
        sentences: iterable of sentence strings to vectorize.
        nlp: callable language pipeline; its result is passed to
            ``tokenize`` and exposes ``.vector``.
        **kwargs: accepted for call compatibility; not used.

    Returns:
        Tuple ``(sent_vecs, sent_glove_mat, sent_w2v_mat)``.
    """
    w2v_size = 250  # dimensionality of the Word2Vec embedding

    # Grab and parse the chapters/sentences from the input corpus.
    chapters = full_texts.split('\n\n\n\n\n\n')
    p_chapters = [
        tokenize(nlp(chapter_return(chapter))) for chapter in chapters
    ]
    p_sentences = [tokenize(nlp(sentence)) for sentence in sentences]

    # Create the gensim dictionary and carefully filter the high/low
    # occurring words.
    text_dict = Dictionary(p_chapters)
    text_dict.filter_extremes(no_below=4, no_above=0.22)
    print(len(text_dict))
    text_dict.compactify()
    # Look one entry up by id; presumably this forces the dictionary to
    # build its id -> token table before it is indexed below.
    text_dict[list(text_dict.keys())[0]]

    # Get the bag-of-words representation of each chapter.
    chap_corpus = [text_dict.doc2bow(c) for c in p_chapters]

    # The GloVe vector representation of each word in all of the chapters.
    tf_idf_glove = np.vstack(
        [nlp(text_dict[i]).vector for i in range(len(text_dict))])

    # Create a normed set of the vectors for easy similarity scoring.
    normed_vecs = copy.deepcopy(tf_idf_glove)
    for i, nv in enumerate(normed_vecs):
        normed_vecs[i] = nv / np.linalg.norm(nv)

    # Get the bag-of-words rep. for each applicable sentence. If a word is
    # not in the dictionary, we grab and weight the most similar available
    # word.
    sent_corpus = [
        get_sent_bow(s, text_dict, nlp, preload=normed_vecs)
        for s in p_sentences
    ]

    # NOTE: the caches below are read with pickle; only load files that this
    # function wrote itself.
    # Could use atn or ntn as well as ltn.
    tfidf_cache = 'tf_idf_sent_mat_samp4.pckl'
    if os.path.isfile(tfidf_cache):
        with open(tfidf_cache, 'rb') as fh:
            sent_vecs = pickle.load(fh)
    else:
        # Create a TF-IDF model for the text as a whole.
        model_tfidf = TfidfModel(chap_corpus,
                                 id2word=text_dict,
                                 smartirs='ltn')
        model_tfidf.save('tfidf_model_samp4')
        # Apply the model to each word in the applicable sentences.
        sent_tfidf = model_tfidf[sent_corpus]
        # Unpack each TF-IDF vector.
        sent_vecs = np.vstack(
            [sparse2full(c, len(text_dict)) for c in sent_tfidf])
        with open(tfidf_cache, 'wb') as fh:
            pickle.dump(sent_vecs, fh)

    glove_cache = 'glove_sent_mat_samp4.pckl'
    if os.path.isfile(glove_cache):
        with open(glove_cache, 'rb') as fh:
            sent_glove_mat = pickle.load(fh)
    else:
        # Weight the GloVe vector representation by the TF-IDF values.
        sent_glove_mat = np.dot(sent_vecs, tf_idf_glove)
        with open(glove_cache, 'wb') as fh:
            pickle.dump(sent_glove_mat, fh)

    # The same file name is used for the existence check, the load and the
    # dump. Previously the check looked for 'sent_w2v_mat_samp4.pckl' while
    # the dump wrote 'w2v_sent_mat_samp4.pckl', so the cache was never hit.
    w2v_cache = 'w2v_sent_mat_samp4.pckl'
    if os.path.isfile(w2v_cache):
        with open(w2v_cache, 'rb') as fh:
            sent_w2v_mat = pickle.load(fh)
    else:
        # Create a 250 element Word2Vec modeller.
        model_w2v = Word2Vec(p_chapters, size=w2v_size, window=7)
        # Train it over 10 epochs.
        model_w2v.train(p_chapters,
                        total_examples=model_w2v.corpus_count,
                        epochs=10)
        model_w2v.init_sims()
        model_w2v.save('word2vec_model_samp4')

        # Collect the dictionary ids for the intersection of the w2v and
        # text vocabularies.
        ids = [
            text_dict.token2id[k] for k in model_w2v.wv.vocab
            if k in text_dict.token2id
        ]
        # Renumber the surviving ids densely, in ascending order of the
        # original id.
        renum_dict = {
            old_id: new_id for new_id, old_id in enumerate(sorted(ids))
        }
        # Key the subset dictionary by the NEW ids so that it lines up with
        # the renumbered corpus columns built below.
        filt_dict = {
            new_id: text_dict[old_id]
            for old_id, new_id in renum_dict.items()
        }

        # Subset corpus: keep only words known to Word2Vec, renumbered.
        filt_sent_corp = [[(renum_dict[word_id], weight)
                           for word_id, weight in sent_bow
                           if word_id in renum_dict]
                          for sent_bow in sent_corpus]

        # TF-IDF model over the new, smaller vocabulary.
        tdidf_w2v = TfidfModel(filt_sent_corp,
                               id2word=filt_dict,
                               smartirs='ltn')
        sent_w2v_tdidf = tdidf_w2v[filt_sent_corp]
        # Appropriate TF-IDF vectors.
        w2v_tfidf_vecs = np.vstack(
            [sparse2full(c, len(filt_dict)) for c in sent_w2v_tdidf])

        # Collect the Word2Vec vectors in new-id order: row i must describe
        # the same word as TF-IDF column i (dict key order gives no such
        # guarantee).
        w2v_vecs = np.array([
            model_w2v.wv[filt_dict[new_id]]
            for new_id in range(len(filt_dict))
        ])
        w2v_vecs.shape = (len(filt_dict), w2v_size)
        sent_w2v_mat = np.dot(w2v_tfidf_vecs, w2v_vecs)
        with open(w2v_cache, 'wb') as fh:
            pickle.dump(sent_w2v_mat, fh)

    return sent_vecs, sent_glove_mat, sent_w2v_mat
def lda_scratch(topic_num, alpha, beta, passes):
    """Fit LDA with collapsed Gibbs sampling and print each topic's top words.

    Documents are read from ``filtered_data_file``, one whitespace-tokenized
    document per line. After ``passes`` sweeps over every token, the ten
    highest-weighted words of every topic are printed together with their
    normalized weight. Nothing is returned.

    Args:
        topic_num: number of topics.
        alpha: document-topic prior added to every document/topic count.
        beta: topic-word prior added to every topic/word count.
        passes: number of full Gibbs sweeps over the corpus.
    """
    with open(filtered_data_file, 'r', encoding="UTF-8") as f:
        docs_list = [line.split() for line in f]

    dictionary = Dictionary(docs_list)
    token2id = dictionary.token2id
    docs_idx_list = [[token2id[word] for word in doc] for doc in docs_list]

    doc_num = len(docs_list)
    word_num = len(dictionary.keys())

    n_d_k = np.zeros((doc_num, topic_num))  # tokens of doc d in topic k
    n_k_w = np.zeros((topic_num, word_num))  # tokens of word w in topic k
    n_k = np.zeros((topic_num, ))  # tokens in topic k
    z = {}  # (doc index, token position) -> current topic

    # Random initial topic assignment for every token.
    for d, doc in enumerate(docs_idx_list):
        for w_index, w in enumerate(doc):
            k = np.random.randint(0, topic_num)
            n_d_k[d, k] += 1
            n_k_w[k, w] += 1
            n_k[k] += 1
            z[(d, w_index)] = k

    # Fold the priors into the counts once, so the sampling step below can
    # use the arrays directly.
    beta_per_word = np.ones((word_num, )) * beta
    n_d_k = n_d_k + np.ones((topic_num, )) * alpha
    n_k_w = n_k_w + beta_per_word
    # The per-topic total needs the summed word prior as well. Without it
    # the denominator reaches 0 when a topic's only token is removed, and
    # the sampling distribution becomes inf/NaN.
    n_k = n_k + np.sum(beta_per_word)

    for i_pass in range(passes):
        print("I_Pass: {}".format(i_pass))
        for d, doc in enumerate(docs_idx_list):
            for w_index, word in enumerate(doc):
                # Remove the token's current assignment from the counts.
                topic = z[(d, w_index)]
                n_d_k[d, topic] -= 1
                n_k_w[topic, word] -= 1
                n_k[topic] -= 1

                # p(z = k | rest) is proportional to
                # (n_dk + alpha) * (n_kw + beta) / (n_k + V * beta).
                temp_phi = n_k_w[:, word] / n_k
                p_z_k = n_d_k[d] * temp_phi
                new_topic = np.random.multinomial(
                    1, p_z_k / np.sum(p_z_k)).argmax()

                # Record the newly sampled assignment.
                z[(d, w_index)] = new_topic
                n_d_k[d, new_topic] += 1
                n_k_w[new_topic, word] += 1
                n_k[new_topic] += 1

    for k_i in range(topic_num):
        print("K: {}".format(k_i))
        # Indices of the ten largest weights, highest first.
        arg_list = (n_k_w[k_i]).argsort()[-10:]
        for arg_index in reversed(arg_list):
            print("{}: {}".format(
                dictionary[arg_index],
                n_k_w[k_i, arg_index] / np.sum(n_k_w[k_i])),
                  end="\t")
        print("\n")
# Print the matrix wcounts = dict() for x in occurrences.keys(): wcounts[x] = 0 for d in corpus: for w in d: wcounts[w] += 1 graphd = {"nodes": [], "links": []} ease_id_dict = dict() for x in dct.keys(): ease_id_dict[dct[x]] = x for x in wcounts.keys(): graphd["nodes"].append({ "id": x, "word": x, "count": wcounts[x], "linked": 0 }) edges = dict() for x in occurrences.keys(): for y in occurrences[x].keys(): st1 = x + ":" + y st2 = y + ":" + x