def cosine_string(a, b, char=True, move=False):
    t = w.tf_idf(a, b, char=char)
    cosine = round(simi.cosine_similarity(t.A[0], t.A[1]), 15)
    if not move:
        return cosine
    # average the plain cosine score with the character-shift ("perpindahan") score
    p = perpindahan(a, b)
    pindah = round(simi.cosine_similarity(p[0], p[1]), 15)
    return (cosine + pindah) / 2

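# A minimal, self-contained sketch of the kind of score cosine_string computes for two
# strings, using plain character counts instead of the project's `w.tf_idf` / `simi`
# helpers (those names, and `perpindahan`, are project-specific and not shown here).
import math
from collections import Counter

def char_cosine(a, b):
    """Cosine similarity of two strings over character counts."""
    ca, cb = Counter(a), Counter(b)
    dot = sum(ca[ch] * cb[ch] for ch in set(ca) | set(cb))
    norm = math.sqrt(sum(v * v for v in ca.values())) * math.sqrt(sum(v * v for v in cb.values()))
    return dot / norm if norm else 0.0

# Example: similar strings score close to 1.0.
print(round(char_cosine("kucing", "kucing hitam"), 4))
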
def calculate_corpus_tf_idf(corpus_reader):
    '''
    Calculate TF*IDF weight for each document in given corpus.
    -Input: CorpusReader (either Tagged or PlainText)
    -Return: (1) A dictionary whose keys=document name and values=dictionary
                 with terms for keys and TF*IDF weights for values
             (2) A dictionary whose keys=terms and values=their IDF
    '''
    st_time = time.time()

    # Term Frequency for each category
    tfs_per_document = defaultdict(Counter)
    for document in corpus_reader.fileids():
        terms_in_document = corpus_reader.words(document)
        tfs_per_document[document] = tf_idf.tf(terms_in_document)

    # Inverse Document Frequency
    idfs = tf_idf.idf(tfs_per_document)

    # key is folder name, value is a list of (term, tfidf score) pairs
    tfidfs_per_document = defaultdict(defaultdict)
    for document, tfs in tfs_per_document.iteritems():
        tfidfs_per_document[document] = tf_idf.tf_idf(tfs, idfs, len(tfs_per_document))

    print "time to compute TF-IDF weights for corpus: %.3f sec" % (time.time() - st_time)
    return tfidfs_per_document, idfs

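# A rough sketch of the `tf_idf` helper interface this snippet appears to assume
# (tf() -> Counter of term frequencies, idf() -> per-term IDF, tf_idf() -> per-term
# weights). The actual module is not shown, so treat these as illustrative stand-ins.
import math
from collections import Counter

def tf(terms):
    return Counter(terms)

def idf(tfs_per_document):
    n_docs = len(tfs_per_document)
    doc_freq = Counter()
    for term_counts in tfs_per_document.values():
        doc_freq.update(term_counts.keys())
    return {term: math.log(float(n_docs) / df) for term, df in doc_freq.items()}

def tf_idf(tfs, idfs, n_docs):
    # unseen terms fall back to the maximal IDF, log(n_docs)
    return {term: freq * idfs.get(term, math.log(n_docs)) for term, freq in tfs.items()}
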
def execute():
    table = tf_idf()
    text_output.delete('1.0', tk.END)
    folder_name = folder
    num_of_files = len([
        name for name in os.listdir(folder)
        if os.path.isfile(os.path.join(folder, name))
    ]) + 1

    for x in range(1, num_of_files):
        file_name = folder_name + '/' + str(x).zfill(2) + '.txt'
        table.add_file(file_name)

    top_k = int(entry_top_k.get())
    for x in range(1, num_of_files):
        target_file = folder_name + '/' + str(x).zfill(2) + '.txt'
        var = 'Top ' + str(top_k) + ' of tf-idf in ' + os.path.basename(target_file) + ' : \n'
        text_output.insert('end', var)
        var = table.get_tf_idf(target_file, top_k)
        text_output.insert('end', var)
        text_output.insert('end', '\n\n')

    keyword = entry_keyword.get()
    var = 'tf-idf of key word "' + keyword + '" : \n'
    text_output.insert('end', var)
    var = table.similarities([keyword])
    for x in var:
        x[0] = os.path.basename(x[0])
    text_output.insert('end', var)
    return 0

def classify_article_words(article_words, corpus_tfidfs_per_category, corpus_idf):
    '''
    Find the category that best matches the given article among the given categories.
    -Input: (1) list of article terms
            (2) TF*IDF weights for each document in corpus
            (3) IDF for the entire corpus
    -Return: top category and a dictionary with match score for the article with each category
    '''
    st_time = time.time()

    # Find article TF and TFIDF
    article_tfs = tf_idf.tf(article_words)
    article_tfidfs = tf_idf.tf_idf(article_tfs, corpus_idf, len(corpus_tfidfs_per_category))

    # find best match among categories
    sim_scores = defaultdict()
    for cat_name, cat_tfidf_scores in corpus_tfidfs_per_category.iteritems():
        cos_sim_time = time.time()
        sim_scores[cat_name] = \
            cosine_sim.cosine_similarity_dict(article_tfidfs, cat_tfidf_scores)

    # sort by value (match score), descending
    match = sorted(sim_scores.iteritems(), key=operator.itemgetter(1), reverse=True)[0][0]
    return match, sim_scores

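# `cosine_sim.cosine_similarity_dict` is not shown; a plausible minimal version over
# sparse {term: weight} dictionaries might look like this (illustrative only).
import math

def cosine_similarity_dict(vec_a, vec_b):
    """Cosine similarity of two sparse term->weight dictionaries."""
    dot = sum(vec_a[t] * vec_b[t] for t in set(vec_a) & set(vec_b))
    norm_a = math.sqrt(sum(w * w for w in vec_a.values()))
    norm_b = math.sqrt(sum(w * w for w in vec_b.values()))
    return dot / (norm_a * norm_b) if norm_a and norm_b else 0.0
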
def data_to_dataset(datas, code_and_dates=None, labels=None, comments=None,
                    return_format="APPLY_FOR_NLTK_CLASSIFY"):
    """
    Either provide dates and codes per entry in `datas`, or pass `code_and_dates`
    as a list of (code, date) pairs; labels are derived from them.
    For APPLY_FOR_NLTK_CLASSIFY: entries that are too recent for the next day's
    stock price to be known go into the second returned list as a feature set only;
    all other entries go into the first returned list as (featureset, label).
    """
    assert ((code_and_dates is not None) | (labels is not None))
    if not labels:
        assert (len(datas) == len(code_and_dates))
        labels = [up_or_down(*I) for I in code_and_dates]

    words_and_importent_in_docs = tf_idf.tf_idf(datas)

    def build_feature():
        # keep the five highest-weighted terms per document
        return [
            sorted(I.items(), key=lambda X: (X[1], X[0]))[::-1][:5]
            for I in words_and_importent_in_docs.values()
        ]

    features = build_feature()
    if return_format == "APPLY_FOR_NLTK_CLASSIFY":
        return [({W: V for W, V in F}, L, *CC) for F, L, CC in zip(features, labels, comments) if L is not None], \
               [({W: V for W, V in F}, None, *CC) for F, L, CC in zip(features, labels, comments) if L is None]
    else:
        return features, labels

def data_to_dataset(datas, code_and_dates=None, labels=None,
                    return_format="APPLY_FOR_NLTK_CLASSIFY"):
    """
    Either provide dates and codes per entry in `datas`, or pass `code_and_dates`
    as a list of (code, date) pairs; labels are derived from them.
    For APPLY_FOR_NLTK_CLASSIFY: entries that are too recent for the next day's
    stock price to be known go into the second returned list as a feature set only;
    all other entries go into the first returned list as (featureset, label).
    """
    assert ((code_and_dates is not None) | (labels is not None))
    if not labels:
        assert (len(datas) == len(code_and_dates))
        labels = [up_or_down(*I) for I in code_and_dates]

    words_and_importent_in_docs = tf_idf.tf_idf(datas)

    def build_feature():
        return [sorted(I.items(), key=lambda X: (X[1], X[0]))[::-1][:5]
                for I in words_and_importent_in_docs.values()]

    features = build_feature()
    if return_format == "APPLY_FOR_NLTK_CLASSIFY":
        # Full (weighted) feature set; considerably less efficient.
        return [({W: V for W, V in F}, L) for F, L in zip(features, labels) if L is not None], \
               [{W: V for W, V in F} for F, L in zip(features, labels) if L is None]
        # Unreachable as written: presence-only (binary) feature variant.
        return [({W: 1 for i, (W, V) in enumerate(F)}, L) for F, L in zip(features, labels) if L is not None], \
               [{W: 1 for i, (W, V) in enumerate(F)} for F, L in zip(features, labels) if L is None]
    else:
        return features, labels

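# A hedged usage sketch: in APPLY_FOR_NLTK_CLASSIFY format, the first returned list is
# shaped like NLTK's (feature_dict, label) training pairs, so it could plausibly be fed
# to nltk.NaiveBayesClassifier. The contents of `datas` / `code_and_dates` are
# project-specific and assumed here.
import nltk

def train_classifier(datas, code_and_dates):
    labeled, unlabeled = data_to_dataset(datas, code_and_dates=code_and_dates)
    classifier = nltk.NaiveBayesClassifier.train(labeled)
    # Predict a label for each entry whose next-day price was still unknown.
    return classifier, [classifier.classify(featureset) for featureset in unlabeled]
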
def train(outputfile, corpus):
    print("Computing most significant ngrams for", corpus)
    tfidf = tf_idf(corpus)
    top_200 = top_n(tfidf, 200)

    print("Dumping training output...")
    with open(outputfile, 'wb') as file:
        pickle.dump(top_200, file)
    print("Output saved at", outputfile)

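# `tf_idf` and `top_n` here belong to the surrounding language-identification project
# and are not shown. A minimal stand-in that builds character-n-gram TF-IDF profiles
# and keeps the top-n entries could look like this (illustrative assumptions only).
import math
from collections import Counter

def char_ngrams(text, n=3):
    return [text[i:i + n] for i in range(len(text) - n + 1)]

def ngram_tf_idf(corpus, n=3):
    """corpus: list of documents -> {ngram: tf-idf weight} aggregated over the corpus."""
    per_doc = [Counter(char_ngrams(doc, n)) for doc in corpus]
    doc_freq = Counter()
    totals = Counter()
    for counts in per_doc:
        doc_freq.update(counts.keys())
        totals.update(counts)
    n_docs = len(per_doc)
    return {g: totals[g] * math.log(1 + n_docs / doc_freq[g]) for g in totals}

def top_n(tfidf, n):
    return sorted(tfidf.items(), key=lambda kv: kv[1], reverse=True)[:n]
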
def get_website_scores(question):
    from tf_idf import tf_idf
    from conf.website_conf import website_list
    from website_corpus import get_corpus_list

    # fetch the corpus for each website
    corpus_list = get_corpus_list(website_list)
    words, word_weights = tf_idf(corpus_list)
    for i in range(len(word_weights)):
        for j in range(len(words)):
            print words[j], word_weights[i][j]

def run_tf_idf():
    if not os.path.exists('tf_idf_top2000.xlsx'):
        print('start tf-idf method ...')
        print(type(corpus[0]))
        start_time = time.time()
        keywords_tf_idf = tf_idf(corpus)
        print(keywords_tf_idf)
        print('total time taken:', time.time() - start_time, 's')
    res_path = 'tf_idf_top2000.xlsx'
    return res_path

def getConsultapadrao():
    M = [
        'O peã e o caval são pec de xadrez. O caval é o melhor do jog.',
        'A jog envolv a torr, o peã e o rei.',
        'O peã lac o boi',
        'Caval de rodei!',
        'Polic o jog no xadrez.'
    ]
    stopwords = ['a', 'o', 'e', 'é', 'de', 'do', 'no', 'são']
    q = 'xadrez peã caval torr'
    separadores = [' ', ',', '.', '!', '?']

    (ponderada_docs, ponderada_consulta) = tf_idf.tf_idf(M, stopwords, q, separadores)
    return modelo_vetorial(ponderada_docs, ponderada_consulta)

def selector(self, method):
    """
    Selector for the chosen method.
    """
    # Init given method
    # TF-IDF
    if self.method == "tf-idf":
        self.tf_idf = tf_idf(self.data)
    # Word to vec
    elif self.method == "word2vec":
        self.word2vec = word2vec(self.data)
    elif self.method == "doc2vec":
        self.doc2vec = doc2vec(self.data)

def get_top_n_website_scores(question, n=5):
    from tf_idf import tf_idf, get_corpus_list
    from conf.website_conf import website_list
    from jieba_split import split_word_only
    from cosine import batch_get_sort_scores
    from wikipedia_expansion import get_question_expansion_corpus

    if debug_flag:
        print 'question:', question

    website_corpus = get_corpus_list(website_list)
    question_corpus = get_question_expansion_corpus(question)
    # question_corpus = split_word_only(question)
    website_corpus.append(question_corpus)
    words, words_weight = tf_idf(website_corpus)
    return batch_get_sort_scores(words_weight, website_list)[0:n]

def test():
    docs = [
        '我 今天 心情 很好,但是 看到 不想 看到 的 人 了',
        '我 今天 心情 很好,且 看到 想 看到 的 人 了',
        '我 今天 心情 很 不好,但是 看到 想 看到 的 人 了',
        '我 今天 心情 很 不好,且 看到 不想 看到 的 人 了',
        '我 今天 心情 很好,且 看到 想 看到 的 人 了'
    ]
    from tf_idf import tf_idf
    words, words_weight = tf_idf(docs)
    for i, x in enumerate(words_weight):
        print docs[i]
        for j, y in enumerate(words_weight[i]):
            print words[j], words_weight[i][j]
        print

    for i in range(0, 4):
        score = get_cossimi(words_weight[i], words_weight[4])
        print 'score', i, ':', score
    print

def common_emoji_cl(fname, m, names=False):
    # high tf-idf emoji of each cluster
    print 'Extract terms from %s' % fname
    [emoji, emoji_cl] = extract_data(fname)
    N = len(emoji)
    k = len(emoji_cl)
    N_cl = [len(emoji_cl[i]) for i in xrange(1, k + 1)]

    print 'Count emoji tf'
    counter = Counter()
    counter = tf_idf.count(emoji, counter, ref=emoji_dict, type='emoji')
    # counter_cl[i] holds the tf counter for cluster i+1
    counter_cl = [tf_idf.tf(counter, emoji_cl[i + 1], ref=emoji_dict, type='emoji')
                  for i in xrange(k)]

    print 'Calculate cluster emoji tf-idf'
    tfIdf = [tf_idf.tf_idf(counter, counter_cl[i], N, N_cl[i]) for i in xrange(k)]
    term = tfIdf[0][0]
    tfIdf = [tfIdf[i][1] for i in xrange(k)]

    print 'Write results'
    write_common_emoji_cl(fname, tfIdf, names)

def lang_detect(inputfolder, corpus):
    for dirpath, dirnames, filenames in os.walk(inputfolder):
        for inputfile in corpus:
            lang_scores = defaultdict(int)
            ngrams_input = tf_idf([inputfile])

            for filename in filenames:
                path = dirpath + "/" + filename
                # Open languages profiles for comparison:
                with open(path, 'rb') as file:
                    ngrams = pickle.load(file)
                for ngram, tfidf in ngrams:
                    if ngram in ngrams_input:
                        lang_scores[path] += tfidf * ngrams_input[ngram]

            # Language detection:
            best = (None, 0)
            for lang, score in lang_scores.items():
                print(f'\t{lang}: {score:.10f}')
                if score > best[1]:
                    best = (lang, score)
            print(f'Detected {inputfile} to be from this language: {best[0]}\n')
        break  # Only toplevel dir

def merge(filepath=""): #path of chunk file #merges the feature vectors together so that they can be used in Naive bayes #return a dictionary ( sentenceid: featureVector) tree = ET.parse(filepath) root = tree.getroot() coherence_dict = centroid_coherence(root) #sentid: coherence tf_isf_dict = tf_isf(root, 1) #sentid: total tf_isfscore tf_idf_dict = tf_idf(root) length_dict, position_dict = sentence_length_position(root) title_simm_dict = title_simm_main1(filepath) is_question_dict = is_question(root) #print title_simm_dict ans = {} for sentid in coherence_dict.iterkeys(): d = { "coherence": coherence_dict[sentid], "tfIsf": 0, "tfIdf": 0, "length": 0, "position": 0, "titleSimm": 0, "isQues": 0 } if tf_isf_dict.has_key(sentid): d["tfIsf"] = tf_isf_dict[sentid] if tf_idf_dict.has_key(sentid): d["tfIdf"] = tf_idf_dict[sentid] if length_dict.has_key(sentid): d["length"] = length_dict[sentid] if position_dict.has_key(sentid): d["position"] = position_dict[sentid] if title_simm_dict.has_key(sentid): d["titleSimm"] = title_simm_dict[sentid] if is_question_dict.has_key(sentid): d["isQues"] = is_question_dict[sentid] ans[sentid] = d return ans
def run_intersection():
    print('start intersection ...')
    start_time = time.time()

    if not os.path.exists('tf_idf_top2000.xlsx'):
        # tf-idf
        print('start tf-idf method ...')
        start_time = time.time()
        keywords_tf_idf = tf_idf(corpus)
        print(keywords_tf_idf)
        print('total time taken:', time.time() - start_time, 's')

    if not os.path.exists('chi_square_top2000.xlsx'):
        # chi_square
        print('start chi_square method ...')
        start_time = time.time()
        keywords_chi_square = chi_square(DATA_PATH, DICT_PATH, corpus)
        print(keywords_chi_square)
        print('total time taken:', time.time() - start_time, 's')

    if not os.path.exists('word2vec_huffman_top2000_multiprocessing.xlsx'):
        # word2vec extra data processing
        categories = len(corpus)
        for i in range(categories):
            corpus[i] = corpus[i].split()
        # word2vec_huffman_softmax
        print('start huffman method ...')
        start_time = time.time()
        keywords_huffman = word2vec_huffman(corpus, MODEL_HUFFMAN_PATH)
        print(keywords_huffman)
        print('total time taken:', time.time() - start_time, 's')

    tf_idf_res = pd.read_excel('tf_idf_top2000.xlsx', sheet_name=None)
    word2vec_res = pd.read_excel('word2vec_huffman_top2000_multiprocessing.xlsx', sheet_name=None)
    chi_square_res = pd.read_excel('chi_square_top2000.xlsx', sheet_name=None)

    sheet_names = list(tf_idf_res.keys())
    sheet_num = len(sheet_names)
    writer = pd.ExcelWriter('intersection.xlsx')
    for i in range(sheet_num):
        new_sheet = []
        this_tf_idf = tf_idf_res[sheet_names[i]]
        this_word2vec = word2vec_res[sheet_names[i]]
        this_chi_square = chi_square_res[sheet_names[i]]
        for word in this_word2vec.values.tolist():
            word = word[0]
            if word not in new_sheet:
                new_sheet.append(word)
        for word in this_chi_square.values.tolist():
            word = word[0]
            if word not in new_sheet:
                new_sheet.append(word)
        for word in this_tf_idf.values.tolist():
            word = word[0]
            if word not in new_sheet:
                new_sheet.append(word)
        intersection = pd.DataFrame({'word': new_sheet})
        intersection.to_excel(writer, sheet_name=sheet_names[i], index=None)
    writer.save()
    writer.close()

    print('total time taken:', time.time() - start_time, 's')
    res_path = 'intersection.xlsx'
    return res_path

def tf_idf_cl(term_cl, user, counter, top_user, N, term_remove):
    names = list(set(user))
    m = len(user)
    count = [len([user[i] for i in xrange(m) if user[i] == name]) for name in names]
    idx = np.argsort(count)[::-1]
    if top_user > len(idx):
        top_user = len(idx)
        print '---- cluster has less than %d unique users' % top_user
    names = [names[i] for i in idx[:top_user]]
    term_user = init_clusters(len(names), names)
    for name in names:
        term_user[name] = [term_cl[i] for i in xrange(m) if user[i] == name]

    print '-- Count cls-term tf'
    counter_i = copy.copy(counter)
    counter_i.subtract(counter)
    counter_term = tf_idf.tf(counter_i, term_cl, 'term')

    print '-- Count user-term tf'
    counter_user = init_clusters(len(names), names)
    for name in names:
        counter_i = copy.copy(counter_term)
        counter_i.subtract(counter_term)
        counter_user[name] = tf_idf.tf(counter_i, term_user[name], 'term')

    print '-- Clean counters'
    if '' in counter.keys():
        del counter['']
        del counter_term['']
        for name in names:
            del counter_user[name]['']
    for term in counter_term:
        if counter_term[term] == 0:
            term_remove += [term]
    for term in term_remove:
        del counter_term[term]
        for name in names:
            del counter_user[name][term]

    print '-- Calculate cls-term tfidf'
    term_tfidf = tf_idf.tf_idf(counter, counter_term, N)

    print '-- Calculate user-term tfidf'
    user_tfidf = init_clusters(len(names), names)
    for name in names:
        user_tfidf[name] = tf_idf.tf_idf(counter_term, counter_user[name], m)

    print '-- Calculate term norm-tfidf'
    # user_tfidf = [max([user_tfidf[name][term] for name in names]) for term in term_tfidf]
    user_tfidf = [np.std([user_tfidf[name][term] for name in names]) for term in term_tfidf]
    # user_tfidf = [0 for term in term_tfidf]
    term = term_tfidf.keys()
    n = len(term)
    tfidf = [term_tfidf[term[i]] / (1 + user_tfidf[i]) for i in xrange(n)]
    return [(term[i], tfidf[i]) for i in xrange(n)]

def tf_idf_test(fname, cl=0, sep='\t'):
    data = file('txt\\' + fname + '.txt').readlines()[1:]
    n = len(data)
    data = [data[i][:-1].split(sep) for i in xrange(n)]
    user = [data[i][1] for i in xrange(n) if int(data[i][0]) == cl]
    term = [data[i][-1].split(',') for i in xrange(n)]
    term_cl = [term[i] for i in xrange(n) if int(data[i][0]) == cl]

    names = list(set(user))
    m = len(user)
    count = [len([user[i] for i in xrange(m) if user[i] == name]) for name in names]
    idx = np.argsort(count)[::-1]
    # N = int(len(idx) * 0.1)
    N = 50
    names = [names[i] for i in idx[:N]]
    term_user = init_clusters(len(names), names)
    for name in names:
        term_user[name] = [term_cl[i] for i in xrange(m) if user[i] == name]

    print 'Count all tf'
    counter = Counter()
    counter = tf_idf.tf(counter, term, 'term')

    print 'Count cl tf'
    counter_i = copy.copy(counter)
    counter_i.subtract(counter)
    counter_term = tf_idf.tf(counter_i, term_cl, 'term')
    remove = []
    for term in counter_term:
        if counter_term[term] == 0:
            remove += [term]
    for term in remove:
        del counter_term[term]

    print 'Count user tf'
    print '#_name', len(names)
    counter_user = init_clusters(len(names), names)
    for name in names:
        print '-- %s' % name
        counter_i = copy.copy(counter_term)
        counter_i.subtract(counter_term)
        counter_user[name] = tf_idf.tf(counter_i, term_user[name], 'term')

    print 'Calculate cl tfidf'
    term_tfidf = tf_idf.tf_idf(counter, counter_term, n)

    print 'Calculate user tfidf'
    user_tfidf = init_clusters(len(names), names)
    for name in names:
        user_tfidf[name] = tf_idf.tf_idf(counter_term, counter_user[name], m)

    print 'Sort tfidf'
    user_tfidf = [max([user_tfidf[name][term] for name in names]) for term in term_tfidf]
    # user_tfidf = [np.std([user_tfidf[name][term] for name in names]) for term in term_tfidf]
    term = term_tfidf.keys()
    n = len(term)
    tfidf = [term_tfidf[term[i]] / (1 + user_tfidf[i]) for i in xrange(n)]
    term_tfidf = [term_tfidf[term[i]] for i in xrange(n)]

    f = os.open('txt\\tfidf_test.txt', os.O_RDWR | os.O_CREAT)
    idx = np.argsort(term_tfidf)[::-1]
    os.write(f, 'term_tfidf,' + ','.join([term[i] for i in idx[:10]]) + '\n')
    os.write(f, ',' + ','.join(['%0.4f' % term_tfidf[i] for i in idx[:10]]) + '\n')
    idx = np.argsort(user_tfidf)[::-1]
    os.write(f, 'user_tfidf,' + ','.join([term[i] for i in idx[:10]]) + '\n')
    os.write(f, ',' + ','.join(['%0.4f' % user_tfidf[i] for i in idx[:10]]) + '\n')
    idx = np.argsort(tfidf)[::-1]
    os.write(f, 'tfidf,' + ','.join([term[i] for i in idx[:10]]) + '\n')
    os.write(f, ',' + ','.join(['%0.4f' % tfidf[i] for i in idx[:10]]) + '\n')
    os.close(f)

def compare_bars(bar, lat1, lon1, miles):
    con = connect("bar_data_test.db")
    c1 = con.cursor()
    print bar
    review11 = c1.execute("SELECT REVIEW FROM BARS WHERE name like '%s' " % bar)
    review1 = c1.fetchone()
    # print review1
    for review in review1:
        r1 = review
        # print r1
    old_category1 = c1.execute("SELECT CATEGORY FROM BARS WHERE name like '%s'" % bar)
    old_category = c1.fetchone()
    c = [category_new for category_new in old_category]
    c = clean_content(c).lower()
    print c

    # **************
    def unique_list(l):
        ulist = []
        [ulist.append(x) for x in l if x not in ulist]
        return ulist

    comp = ' '.join(unique_list(c.split()))
    comp = comp.split()
    categry_match = []
    comp_list = []
    sort_before = []
    length = ''
    # print comp

    def words_in_string(word_list, a_string):
        return set(word_list).intersection(a_string.split())

    rows = c1.execute("SELECT NAME, CATEGORY from BARS ")
    for row in rows:
        print row[1]
        i = 0
        for word in words_in_string(comp, row[1]):
            print(word)
            i += 1
        categry_match.append([i, row[0]])
    categry_match.sort(key=lambda x: x[0], reverse=True)

    j = 0
    while j < 5:
        bar_match = categry_match[j][1]
        review22 = c1.execute("SELECT REVIEW FROM BARS WHERE NAME like '%s'" % bar_match)
        review2 = c1.fetchall()
        for review in review2:
            r2 = clean_content(review)
            # print r1,r2
            compare_bar = tf_idf.tf_idf(r1, r2)[0]
            comp_list.append(compare_bar[1])
            sort_before.append(bar_match)
        j += 1

    comp = comp_list
    bar_list = sort_before
    bars = []
    p = zip(comp, bar_list)
    # print p
    x = sorted(p, key=itemgetter(0), reverse=True)
    print x

    c1.execute('DROP TABLE IF EXISTS BAR_MATCH')
    con.commit()
    sql = """CREATE TABLE `BAR_MATCH` (
        NAME TEXT,
        LATTITUDE TEXT,
        LONGITUDE TEXT
    )"""
    c1.execute(sql)
    for row in x:
        bars = row[1]
        y11 = c1.execute("SELECT NAME, ADDRESS FROM BARS WHERE NAME like '%s'" % bars)
        y = c1.fetchall()
        for y1 in y:
            print y1
            lattitude, longitude = getDistance.findLocation(y1[1])
            print lattitude, longitude
            c1.execute("insert into BAR_MATCH values (?,?,?)", (y1[0], longitude, lattitude))
            con.commit()

pruned_data_5.reset_index(inplace=True)

# pairs_user = list(partition_users(pruned_data_5, 5, xf.SampleN(1)))
pairs_user = list(sample_users(pruned_data_5, 5, 12000, xf.SampleN(1)))

pickle_out = open("sample_user.pickle", "wb")
pickle.dump(pairs_user, pickle_out)
pickle_out.close()

truth = pd.concat((p.test for p in pairs_user))
# truth.to_csv(r'results/steam/pruned_5.csv')


def algo_eval(path, algo, dataset):
    evaluation = batch.MultiEval(path=path, predict=False, recommend=100)
    evaluation.add_algorithms(algos=algo)
    evaluation.add_datasets(data=dataset)
    evaluation.run()


algo_ii = item_knn.ItemItem(20, center=False, aggregate='sum')
# algo_uu = user_knn.UserUser(30, center=False, aggregate='sum')
algo_pop = basic.Popular()
algo_mf = ImplicitMF(40)
algo_bpr = BPR()
algo_tf_idf = tf_idf()
algo_LDA = LDA()

algo_eval('results/steam/all_algo_sample_user',
          [algo_LDA, algo_tf_idf, algo_ii, algo_pop, algo_mf, algo_bpr],
          pairs_user)

import tkinter as tk
import tkinter.filedialog as tkfd
import os
from tkinter.scrolledtext import ScrolledText
from tf_idf import tf_idf

table = tf_idf()

main_window = tk.Tk()
main_window.title("TF-IDF")
main_window.geometry("1250x620")


def open_filedialog():
    global folder
    folder = tkfd.askdirectory(initialdir=os.path.dirname(__file__) + '/..', )
    entry_dir_name.delete(0, tk.END)
    entry_dir_name.insert(0, folder)
    print(folder)
    return 0


def execute():
    table = tf_idf()
    text_output.delete('1.0', tk.END)
    folder_name = folder
    num_of_files = len([
        name for name in os.listdir(folder)
        if os.path.isfile(os.path.join(folder, name))
    ]) + 1
    for x in range(1, num_of_files):

from morphological_analysis import analysis
from tf_idf import tf_idf

BLOG = 'mana_blog.csv'
LEMMAS = '/home/output/mana_lemmas.csv'

analysis(BLOG, LEMMAS)
tf_idf(LEMMAS)

@app.route('/search/results', methods=['GET', 'POST'])
def search_request():
    # print(request.form["input"])
    search_term = request.form.get("input")
    # search_term = flask.request.args.get('name')
    Q = cosine_similarity(books_data=books_data,
                          DF=DF,
                          tf_idf=tf_idf,
                          total_vocab=total_vocab,
                          total_vocab_size=total_vocab_size,
                          k=10,
                          query=search_term)
    print(Q)
    return render_template('results.html', res=Q)


# def index():
#     return render_template('index.html', variable = Q)


if __name__ == "__main__":
    load_data = False
    if not load_data:
        books_data = load_file()
        N = books_data.shape[0]
        processed_bookname, processed_text = process_data(books_data)
        DF, total_vocab_size, total_vocab = build_DF(N, processed_text, processed_bookname)
        tf_idf, df = tf_idf(N, processed_text, processed_bookname)
        # Q = cosine_similarity(books_data=books_data, DF=DF, tf_idf=tf_idf,
        #                       total_vocab=total_vocab, total_vocab_size=total_vocab_size, k=10,
        #                       query="The evening of the day on which Mr Gibson had been to see the squire")
    app.run(debug=True)

print(train_x.shape)
print(valid_x.shape)
print(train_y.shape)
print(valid_y.shape)

encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

# Count vectorizing the training data
train = []
train = bow.bag_of_words(train_data, X, train_x, valid_x)
xtrain_count = train[0]
xvalid_count = train[1]

# Tf-IDF of training data
tf_idf = []
tf_idf = tfi.tf_idf(train_data, X, train_x, valid_x)
xtrain_tfidf = tf_idf[0]
xvalid_tfidf = tf_idf[1]
xtrain_tfidf_ngram = tf_idf[2]
xvalid_tfidf_ngram = tf_idf[3]
xtrain_tfidf_ngram_chars = tf_idf[4]
xvalid_tfidf_ngram_chars = tf_idf[5]

# Naive Bayes classifier implementation
nb.Naive_Bayes(xtrain_count, xvalid_count, train_y, valid_y, my_tags,
               xtrain_tfidf, xvalid_tfidf, xtrain_tfidf_ngram, xvalid_tfidf_ngram,
               xtrain_tfidf_ngram_chars, xvalid_tfidf_ngram_chars)

# Bernoulli Naive Bayes classifier implementation
bnb.Bernoulli_Naive_Bayes(xtrain_count, xvalid_count, train_y, valid_y, my_tags,
                          xtrain_tfidf, xvalid_tfidf,

from Co_Occurrence import CoOccur
from cosine import cosine_sim
from tf_idf import tf_idf
from Co_Occurrence import docFreq
import json
import timeit
from file_read_write import file_read_write

query = input()
start = timeit.default_timer()

coOccur_obj = CoOccur(None)  # a CoOccur temporary object to tokenize the query
query_tf = tf_idf()

query_list = list(query.split(" "))
query_list = coOccur_obj.spell_check(query_list)
empty_str = " "
query = empty_str.join(query_list)

# tokenize query, stem and remove stop words
query_refined = coOccur_obj.tokenize(query)
print(query_refined)
query_refined.sort()

file_reader_object = file_read_write()

# search query in cache
file_reader_object.cache_reader(query_refined, start)

# read dataset of .txt files into dataframe
df = file_reader_object.dataset_reader()
# df = pd.DataFrame(data, columns=['headline', 'brief', 'article', 'type', 'filename'])

def results(algo=None):
    print("algorithm:", algo if algo in ALGOS else None)
    samples = 500  # up to 2000
    print("sample size:", samples)
    keys = glob.glob('Inspec/keys/*.key')
    res = [0] * samples

    if algo == 'textrank':
        # load a spaCy model, depending on language, scale, etc.
        nlp = spacy.load("en_core_web_sm")
        # add PyTextRank to the spaCy pipeline
        tr = pytextrank.TextRank()
        nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
    elif algo == 'sentiment_pos' or algo == 'sentiment_pos_tfidf':
        sid = SentimentIntensityAnalyzer()

    for i, key in enumerate(keys[:samples]):
        # get actual keywords
        key_file = open(key)
        whitespace = re.compile(r"\s+")
        # remove whitespace and convert to lowercase
        actual = [
            whitespace.sub(" ", w).strip().lower()
            for w in key_file.readlines()
        ]

        # get text document corresponding to current key
        num = re.findall(r'\d+', key)[0]
        doc = 'Inspec/docsutf8/{}.txt'.format(num)

        # get extracted keywords
        if algo == 'rake':
            extracted = rake(doc)
        elif algo == 'textrank':
            extracted = textrank(doc, nlp)
        elif algo == 'window':
            extracted = window(doc)
        elif algo == 'window_w_tf_idf':
            extracted = window_w_tf_idf(doc)
        elif algo == 'tf_idf':
            extracted = tf_idf(doc)
        elif algo == 'sentiment_pos':
            extracted = sentiment_pos(doc, sid)
        elif algo == 'sentiment_pos_tfidf':
            extracted = sentiment_pos_tfidf(doc, sid)
        else:
            extracted = extract(doc)

        # calculate results
        tp = len(set(extracted).intersection(set(actual)))  # number of true positives
        precision = tp / len(extracted)
        recall = tp / len(actual)
        f_measure = (2 * precision * recall) / (precision + recall) if precision + recall else 0
        res[i] = (precision, recall, f_measure)

    # calculate average results
    avg_res = [sum(x) / len(x) for x in zip(*res)]
    print("precision: {}, recall: {}, F-measure: {}".format(*avg_res))

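# For reference, the same precision / recall / F-measure computation on a toy example
# (the keyword sets here are made up purely for illustration).
extracted = {"neural network", "tf-idf", "keyword extraction"}
actual = {"tf-idf", "keyword extraction", "text mining", "inspec"}

tp = len(extracted & actual)                          # 2 true positives
precision = tp / len(extracted)                       # 2 / 3 ≈ 0.667
recall = tp / len(actual)                             # 2 / 4 = 0.5
f_measure = 2 * precision * recall / (precision + recall)   # ≈ 0.571
print(precision, recall, f_measure)
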
for j, y in enumerate(words_weight[i]):
    print words[j], words_weight[i][j]
print

for i in range(0, 4):
    score = get_cossimi(words_weight[i], words_weight[4])
    print 'score', i, ':', score
print


if __name__ == '__main__':
    # question = '我在哪里可以吃到海鲜意面'
    question = '怎么学好编程'
    from tf_idf import tf_idf, get_corpus_list
    from conf.website_conf import website_list
    website_corpus = get_corpus_list(website_list)

    from wikipedia_expansion import get_question_expansion_corpus
    # question_corpus = get_question_expansion_corpus(question)
    from jieba_split import split_word_only
    question_corpus = split_word_only(question)

    website_corpus.append(question_corpus)
    words, words_weight = tf_idf(website_corpus)
    scores = batch_get_sort_scores(words_weight, website_list)
    for i in scores:
        print i[0], ':', i[1]

def getXY(input, algo, model, test=0, k=25):
    """
    input: the preprocessed corpus
    algo:  name of the term-weighting method to use
    model: name of the model to use
    test = 0 : record the vocabulary seen in the files and build the vocabulary (training set)
    test = 1 : do not build a vocabulary, use the one already built (test set)
    """
    global package
    # global voca_list
    # global labelset_list
    # global vocafreq_list
    # global weights_list
    # global doclist
    # global docname
    corpus = preprocess(input, package, test, k)
    labelset = package["labelset"]  # values determined by preprocess
    voca = package["voca"]
    level = 2
    mod = 0
    if algo == "tf_idf":
        weights = tf_idf(corpus, test, package)
        mod = 1
    elif algo == "tf_dc":
        weights = tf_dc(corpus, test, package)
    elif algo == "tf_bdc":
        weights = tf_bdc(corpus, test, package)
    elif algo == "iqf_qf_icf":
        weights = iqf_qf_icf(corpus, test, package)
    elif algo == "tf_eccd":
        weights = tf_eccd(corpus, test, package)
    elif algo == "tf_ig":
        weights = tf_ig(corpus, test, package)
    elif algo == "tf_rf":
        weights = tf_rf(corpus, test, package)
        level = 3
    elif algo == "tf_chi":
        weights = tf_chi(corpus, test, package)
        level = 3
    elif algo == "tf_mrf":
        weights = tf_mrf(corpus, test, package)
        level = 3
    elif algo == "tf_nrf":
        weights = tf_nrf(corpus, test, package)
        level = 3
    elif algo == "tf_vc":
        weights = tf_vc(corpus, test, package)
    # print weights

    X = []
    Y = []  # label set
    count = 0
    vocalen = len(voca)
    for doc in corpus:
        if count % 1000 == 0:
            print(str(count) + "/" + str(len(corpus)))
        # print('weights\'s size:')
        # print(sys.getsizeof(weights))
        # print(sys.getsizeof(X))
        # process = psutil.Process(os.getpid())
        # print('Used Memory:', process.memory_info().rss / 1024 / 1024, 'MB')
        # print(memory_usage_psutil())
        count += 1

        # process label: append the label to the fixed labelset so label positions
        # stay consistent, then truncate; np.argmax returns the index of the maximum
        labelset.append(doc["label"])
        Y.append(int(np.argmax(one_hot(labelset)[-1])))
        labelset = labelset[:-1]  # reset labelset

        # process words: as with labels, voca fixes the positions
        temvocalist = list(voca) + list(doc["split_sentence"])
        tem_one_hot = one_hot(temvocalist)[vocalen:]  # truncate

        # for word in range(len(tem_one_hot)):  # .shape[0]
        for word in range(tem_one_hot.shape[0]):
            temlabel = doc["label"]  # earn
            temword = doc["split_sentence"][word]
            temdoc = doc["document"]  # earn638
            # print("\ntem_one_hot:")
            # print(tem_one_hot)
            # print("\n")
            # weights -- term frequency * weight
            if level == 2:
                if mod == 0:  # supervised
                    tem_one_hot[word] *= weights[temlabel][temword]
                else:  # unsupervised
                    tem_one_hot[word] *= weights[temdoc][temword]
            else:
                tem_one_hot[word] *= weights[temlabel][temdoc][temword]

        # empty array
        try:
            tem_one_hot = np.max(tem_one_hot, axis=0)  # drop extra rows, keep each column's maximum
        except ValueError:
            # tem_one_hot = tem_one_hot[0]
            # print(tem_one_hot)
            pass

        if model.lower() == "knn":
            tem_one_hot = preprocessing.normalize(
                np.array(tem_one_hot).reshape(1, -1), norm='l2')  # convert to a matrix
            # print(tem_one_hot.toarray())
            # tem_one_hot = np.full(tem_one_hot)
            # print(tem_one_hot)
            # convert the sparse matrix back to a dense one!
            # print(type(tem_one_hot.toarray()))
            X.append(np.squeeze(tem_one_hot.toarray().tolist()))
            # print(tem_one_hot.toarray().tolist())
        # X.append(tem_one_hot)
    # print(np.array(X))
    # print(Y)
    return X, Y  # squeeze reduces dimensionality, e.g. 2-D to 1-D

def compare_restaurants(rest, lat1, lon1, miles):
    con = connect("restinfo.db")
    c1 = con.cursor()
    rest = rest.replace("'", "")
    review1 = c1.execute("SELECT REVIEW FROM RESTAURANTS WHERE name like '%s' " % rest)
    for review in review1:
        r1 = review[0]

    old_category = c1.execute("SELECT CATEGORY FROM RESTAURANTS WHERE name like '%s'" % rest)
    old_category = c1.fetchone()
    for category_new in old_category:
        c = category_new.lower()
    print c

    comp = ''
    rest_list = ''
    comp_list = []
    sort_before = []
    length = ''

    category_match = c1.execute("SELECT NAME FROM RESTAURANTS WHERE CATEGORY like '%s'" % c)
    myqueryrecords = c1.fetchall()
    i = 0
    length = len(myqueryrecords)
    while i < len(myqueryrecords):
        rest_match = myqueryrecords[i][0]
        print rest_match
        c1.execute("SELECT ADDRESS FROM RESTAURANTS WHERE NAME like '%s'" % rest_match)
        rest_address = c1.fetchone()
        rest_add = rest_address[0]
        print rest_add
        rest_match = rest_match.replace("'", "")
        lat2, lon2 = findLocation(rest_add)
        # print lat2, lon2
        rest_dist = findDistance(lat1, lon1, lat2, lon2)
        print rest_dist
        print miles
        if rest_dist < float(miles):
            print rest_match
            review2 = c1.execute("SELECT REVIEW FROM RESTAURANTS WHERE NAME like '%s'" % rest_match)
            for review in review2:
                r2 = review[0]
                compare_rest = tf_idf.tf_idf(r1, r2)[0]
                comp_list.append(compare_rest[1])
                sort_before.append(myqueryrecords[i][0])
            comp = comp_list
            rest_list = sort_before
            restaurants = []
            p = zip(comp, rest_list)
            x = sorted(p, key=itemgetter(0), reverse=True)
        i += 1

    c1.execute('DROP TABLE IF EXISTS RESTAURANTS_MATCH')
    con.commit()
    sql = """CREATE TABLE RESTAURANTS_MATCH (
        NAME FLOAT,
        LATTITUDE FLOAT,
        LONGITUDE FLOAT,
        ADDRESS FLOAT
    )"""
    print x
    c1.execute(sql)
    for row in x:
        print row[1]
        restaurants = row[1].replace("'", "")
        y11 = c1.execute("SELECT NAME, ADDRESS FROM RESTAURANTS WHERE NAME like '%s'" % restaurants)
        y = c1.fetchall()
        for y1 in y:
            latitude, longitude = findLocation(y1[1])
            print y1[0], latitude, longitude
            c1.execute("insert into RESTAURANTS_MATCH (name, lattitude, longitude, address) VALUES (?,?,?,?)",
                       (y1[0], latitude, longitude, y1[1]))
            con.commit()

#print("\n"+str(x) +":"+str(len(fileContent))) sentenceList = fileContent.split((u"ред")) sentenceListSum = fileContentSummary.split((u"ред")) no_of_sentence = len(sentenceList) for y in range(0, no_of_sentence): multi = math.pow(10, len(str(y))) sentId = ((x * multi) + y) / multi sentenceDict[sentId] = sentenceList[y] #sentenceDict[((x * multi) + y ) / multi] = sentenceList[y] if (sentenceDict[sentId] in sentenceListSum): sentenceDictLabel[sentId] = 'Y' else: sentenceDictLabel[sentId] = 'N' sentenceDictLen[sentId] = len(sentenceDict[sentId].split()) #call for tf-idf res_idf = tf_idf(sentenceDict) #call for tf-isf res_isf = tf_isf(sentenceDict) #merge into one dictionary featureVec = merge(sentenceDict) # ----TestData Set------------------------------------- #read test data fileNameTest = 'complete_corpus\\testFile\\testInput' + str(1) + ".txt" #fileContentTest = read_from_file(fileNameTest) fileContentTest = tokenize_testFile(fileNameTest) no_of_sentence_test = len(fileContentTest) for y in range(0, no_of_sentence_test): multi = math.pow(10, len(str(y))) sentenceDictTest[((1 * multi) + y) / multi] = fileContentTest[y]
elif "adaboo" in a[1]: adaboo_flag = 1 elif "rf" in a[1]: rf_flag = 1 elif "nb" in a[1]: nb_flag = 1 elif "bagging" in a[1]: bagging_flag = 1 training_matrix = input.input("5k_spring_2016_training_dataset.txt", 15000, 40293) testing_matrix = input.input("5k_spring_2016_testing_dataset.txt", 15000, 40293) training_label = input.label("5k_spring_2016_label_training.txt", 15000) training_matrix, testing_matrix, combine = input_preprocess(training_matrix, testing_matrix) #Getting tf-idf of the matrix tf_idf_combine = tf_idf.tf_idf(combine, combine) tf_idf_training_matrix = tf_idf.tf_idf(combine, training_matrix) tf_idf_testing_matrix = tf_idf.tf_idf(combine, testing_matrix) if (lca_flag): print("Doing LCA") train = lca(tf_idf_combine, tf_idf_training_matrix) test = lca(tf_idf_combine, tf_idf_testing_matrix) elif (pca_flag): print("Doing PCA") train = pca_preprocess(tf_idf_combine, tf_idf_training_matrix) test = pca_preprocess(tf_idf_combine, tf_idf_testing_matrix) else: print("Doing No Reduction") train = tf_idf_training_matrix test = tf_idf_testing_matrix
# text_process.save_dict_dict(patent2title_stem_freq, "patent_title_stem_freq")
# text_process.save_dict(stem2term, "stem2term")
# save end

title_stem_freq = load_dict("title_freq")
abstr_stem_freq = load_dict("abstr_freq")
patent2abstr_stem_freq = load_dict_dict("patent_abstr_stem_freq")
patent2title_stem_freq = load_dict_dict("patent_title_stem_freq")
stem2term = load_dict("stem2term")
# print title_stem_freq
# print abstr_stem_freq
# print patent2title_stem_freq

patent2descr_stem_score = tf_idf.tf_idf(patent2abstr_stem_freq)
patent2title_stem_score = tf_idf.tf_idf(patent2title_stem_freq)

patent2stem_score = {}
for patent, stem_score in patent2descr_stem_score.items():
    if patent not in patent2stem_score.keys():
        patent2stem_score[patent] = {}
    for k, v in stem_score.items():
        if k in patent2stem_score[patent].keys():
            patent2stem_score[patent][k] += v
        else:
            patent2stem_score[patent][k] = v

# for patent, stem_score in patent2title_stem_score.items():
#     if patent not in patent2stem_score.keys():

    if term in x2_by_term:
        x2_by_term[term] += x2
    else:
        x2_by_term[term] = x2

# divide by categories to get average
print "calculating average MI, X2 for terms"
num_cats = len(cats)
for term in mi_by_term:
    mi_by_term[term] /= num_cats
for term in x2_by_term:
    x2_by_term[term] /= num_cats

# for each term: compute TF-IDF, FREQ
tf_idf_by_term, freq_by_term = tf_idf.tf_idf(list(terms), doc_terms, vocab_size, set_select)

# save (term, value) pairs: MI
print "saving (term, value) pairs: MI"
top_mi = sorted(mi_by_term.items(), key=lambda (k, v): v, reverse=True)
with open(features + "/top_mi.p", 'wb') as file:
    pickle.dump(top_mi, file)

# save (term, value) pairs: X2
print "saving (term, value) pairs: X2"
top_x2 = sorted(x2_by_term.items(), key=lambda (k, v): v, reverse=True)
with open(features + "/top_x2.p", 'wb') as file:
    pickle.dump(top_x2, file)

# save (term, value) pairs: TF-IDF
print "saving (term, value) pairs: TF-IDF"

def extract_tfidf(word_list):
    return tf_idf.tf_idf(word_list)

def getXY(input, algo, model, test=0):
    """
    input: the preprocessed corpus
    algo:  name of the term-weighting method to use
    model: name of the model to use
    test = 0 : record the vocabulary seen in the files and build the vocabulary (training set)
    test = 1 : do not build a vocabulary, use the one already built (test set)
    """
    global package
    corpus = preprocess(input, package, test)
    labelset = package["labelset"]
    voca = package["voca"]
    level = 2
    mod = 0
    if algo == "tf_idf":
        weights = tf_idf(corpus, test, package)
        mod = 1
    elif algo == "tf_dc":
        weights = tf_dc(corpus, test, package)
    elif algo == "tf_bdc":
        weights = tf_bdc(corpus, test, package)
    elif algo == "iqf_qf_icf":
        weights = iqf_qf_icf(corpus, test, package)
    elif algo == "tf_eccd":
        weights = tf_eccd(corpus, test, package)
    elif algo == "tf_ig":
        weights = tf_ig(corpus, test, package)
    elif algo == "tf_rf":
        weights = tf_rf(corpus, test, package)
        level = 3
    elif algo == "tf_chi":
        weights = tf_chi(corpus, test, package)
        level = 3
    # print weights

    X = []
    Y = []
    count = 0
    vocalen = len(voca)
    for doc in corpus:
        if count % 100 == 0:
            print str(count) + "/" + str(len(corpus))
        count += 1

        # process label
        labelset.append(doc["label"])
        Y.append(int(np.argmax(one_hot(labelset)[-1])))
        labelset = labelset[:-1]

        # process words
        temvocalist = voca + doc["split_sentence"]
        tem_one_hot = one_hot(temvocalist)[vocalen:]
        for word in range(len(tem_one_hot)):
            temlabel = doc["label"]
            temword = doc["split_sentence"][word]
            temdoc = doc["document"]
            if level == 2:
                if mod == 0:
                    tem_one_hot[word] *= weights[temlabel][temword]
                else:
                    tem_one_hot[word] *= weights[temdoc][temword]
            else:
                tem_one_hot[word] *= weights[temlabel][temdoc][temword]

        tem_one_hot = np.max(tem_one_hot, axis=0)
        if model.lower() == "knn":
            tem_one_hot = preprocessing.normalize(
                np.array(tem_one_hot).reshape(1, -1), norm='l2')
        X.append(tem_one_hot)
    return np.squeeze(X), Y

import math
from xml.etree.ElementTree import re
from collections import defaultdict
from InputParser import *
import tf_idf

tf_idf_obj = tf_idf.tf_idf()

# The query needs to be vectorized; the tf of the query is based only on the query,
# but the idf of the query is based on the whole document collection.
# Then we compute the relationship between the query and each document and sort by score.
# The existing InputParser is reused here, since it is the tokenizer.


# This function computes the cosine similarity of one document to the query.
# Only a computational function.
def cosine(vector1, vector2):
    # if vector2.count(0) + 1 >= len(vector2):
    #     return 0
    top = sum([vector1[i] * vector2[i] for i in range(len(vector1))])
    bottom = math.sqrt(sum([pow(i, 2) for i in vector1]))
    return float(top) / float(bottom) + (len(vector2) - vector2.count(0)) * 2 if bottom > 0 else 0


# The steps are as follows:
# 1. The InputParser first parses all the files and collects all the data.
# 2. Get the query, split it by the pattern used in the InputParser, and remove the stopwords.
# 3. Compute the tf-idf vector of the query and store it.
# 4. Compute the tf-idf vector of each document and store it.
# 5. Compute the cosine value comparing the query with each document.
# 6. Sort the cosine values in decreasing order, then select the top K documents to return.

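# A self-contained sketch of steps 3-6 above (query vectorization, per-document tf-idf,
# cosine scoring, top-K selection) using plain dictionaries; the real InputParser and
# tf_idf classes are not shown, so this is illustrative only.
import math
from collections import Counter

def tfidf_vectors(tokenized_docs):
    n = len(tokenized_docs)
    df = Counter()
    for doc in tokenized_docs:
        df.update(set(doc))
    idf = {t: math.log(n / df[t]) for t in df}
    vecs = [{t: c * idf[t] for t, c in Counter(doc).items()} for doc in tokenized_docs]
    return vecs, idf

def rank(query_tokens, tokenized_docs, k=5):
    doc_vecs, idf = tfidf_vectors(tokenized_docs)
    q_vec = {t: c * idf.get(t, 0.0) for t, c in Counter(query_tokens).items()}

    def cos(a, b):
        dot = sum(a[t] * b.get(t, 0.0) for t in a)
        na = math.sqrt(sum(v * v for v in a.values()))
        nb = math.sqrt(sum(v * v for v in b.values()))
        return dot / (na * nb) if na and nb else 0.0

    scores = [(i, cos(q_vec, d)) for i, d in enumerate(doc_vecs)]
    return sorted(scores, key=lambda s: s[1], reverse=True)[:k]

# Example: rank three tiny documents against the query "chess knight".
docs = [["chess", "pawn", "knight"], ["football", "goal"], ["knight", "rook", "chess", "game"]]
print(rank(["chess", "knight"], docs, k=2))
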