def tfidf_calculation_multistage(path_list_1, path_list_2):
    """Run TF-IDF over two folder lists with fresh TFIDF instances and
    assert that the final results agree.

    Only the result of the LAST folder in each list is compared (earlier
    iterations are overwritten), mirroring the original multi-stage check.

    Parameters
    ----------
    path_list_1, path_list_2 : iterable of str
        Folders passed to ``tfidf.TFIDF().get_tf_idf``.

    Raises
    ------
    AssertionError
        If the per-entry ``tfidf`` values of the two final results differ.
    """
    engine = tfidf.TFIDF()
    a = None
    for folder in path_list_1:
        a = engine.get_tf_idf(folder)

    # Fresh instance for the second stage, as in the original flow.
    engine = tfidf.TFIDF()
    b = None
    for folder in path_list_2:
        b = engine.get_tf_idf(folder)

    # Loop variables renamed: the original comprehensions shadowed the
    # TFIDF instance name `t`, which obscured intent.
    a_features = [entry.tfidf for entry in a]
    b_features = [entry.tfidf for entry in b]
    print(a_features)
    assert (a_features == b_features)
def tfidf_obj_creation_perf(dataset_path):
    """Smoke/perf-test TFIDF computation on *dataset_path*.

    Runs ``get_tf_idf`` on the dataset path and writes the stringified
    result to ``/tmp/result.txt``.

    Parameters
    ----------
    dataset_path : str
        Folder handed to ``tfidf.TFIDF().get_tf_idf``.
    """
    engine = tfidf.TFIDF()
    ans = None
    # Loop kept (single element) to mirror the multi-folder call pattern
    # used elsewhere in this file; `folder` replaces the original `dir`,
    # which shadowed the builtin.
    for folder in [dataset_path]:
        ans = engine.get_tf_idf(folder)
    with open("/tmp/result.txt", 'w') as f:
        f.write(str(ans))
def encode_tfidf_model(document_type, word_thresh=1):
    """Load (or build and cache) the TF-IDF model for *document_type*.

    The model is pickled at the path given by ``utils.TFIDF_TEMPLATE``.
    A cached file is loaded directly; otherwise the model is computed
    from the story corpus and dumped for future calls.
    """
    tfidf_fname = utils.TFIDF_TEMPLATE % (document_type, word_thresh)
    check_save_directory(filename=tfidf_fname)

    # Fast path: a previously computed model is cached on disk.
    if os.path.exists(tfidf_fname):
        with open(tfidf_fname, 'rb') as fid:
            return pickle.load(fid)

    # Gather normalized word lists, one list per movie, in sorted key order.
    story, _ = mqa.get_story_qa_data('full', document_type)
    sorted_movies = sorted(story.keys())
    all_words_use = []
    for imdb_key in sorted_movies:
        movie_words = []
        for sentence in story[imdb_key]:
            norm_sentence = utils.normalize_stemming(
                utils.normalize_alphanumeric(sentence.lower()))
            movie_words.extend(norm_sentence.split(' '))
        all_words_use.append(movie_words)

    # Compute TF-IDF over the threshold-filtered vocabulary.
    TFIDF = tfidfcalc.TFIDF(sorted_movies)
    TFIDF.get_filtered_vocabulary(all_words_use, word_thresh=word_thresh)
    TFIDF.compute_tfidf(all_words_use)

    # Cache the model so later calls hit the fast path above.
    with open(tfidf_fname, 'wb') as fid:
        pickle.dump(TFIDF, fid)
    return TFIDF
def calcTfIdf(totalList, chidDict, keyword="digest"):
    """Build a TF-IDF model over totalList and write a per-class TF-IDF
    matrix to '<keyword>_tfidf.txt' (tab-separated; one token per row,
    one column per class in chidDict). Python 2 code (print statements,
    iteritems, explicit utf-8 encoding of unicode tokens)."""
    # Build the tf-idf model over the full document list.
    print "tf-idf calculating..."
    myTfIdf = tfidf.TFIDF(totalList)
    # Compute idf values for every token.
    idfDict = myTfIdf.getIdf()
    # Show tokens (with their idf) that appear in more than 1% of all
    # articles — idf < 2 is the project's threshold for "very common".
    print[(x.encode('utf-8'), idfDict[x]) for x in idfDict if idfDict[x] < 2]
    # Compute tf values for the words of each class (chid -> documents).
    tfidfMatrix = {}
    for index, value in chidDict.iteritems():
        tfDict = myTfIdf.getTf(value)
        # Combine into TF-IDF values and store them keyed by token, then
        # by class id, in tfidfMatrix.
        for one in myTfIdf.getTfIdf(tfDict, idfDict):
            oneClass = tfidfMatrix.setdefault(one[0], dict())
            oneClass[index] = one[1]
    # Persist tfidfMatrix: header row of class names, then one row per
    # token with its per-class score (0.0 where the token is absent).
    with open("%s_tfidf.txt" % keyword, 'w+') as outfile:
        classNames = list(chidDict.keys())
        print >> outfile, "\t%s" % ('\t'.join([str(x) for x in classNames]))
        for token, classDict in tfidfMatrix.iteritems():
            List = [token.encode("utf-8")]
            for one in classNames:
                List.append(str(classDict.get(one, 0.0)))
            print >> outfile, '\t'.join(List)
def RankedSearch(index, querypath, n):
    """Run ranked TF-IDF retrieval for every query in *querypath*.

    Each line of the query file is tokenized, stop-word-filtered and
    stemmed, then scored against *index* via ``tfidf.TFIDF``. The top 10
    hits per query are printed; up to 1000 hits per query are written to
    ``results.ranked.txt``.

    Parameters
    ----------
    index : project inverted index passed through to ``tfidf.TFIDF``.
    querypath : str
        Path to a text file with one query per line.
    n : passed through to ``tfidf.TFIDF`` (collection size/limit).

    Returns
    -------
    list
        One ranked result list per query, in file order.
    """
    # Query processing: strip line endings, then tokenize / remove stops / stem.
    # (A no-op `query.split(" ")` whose result was discarded has been removed.)
    queries = []
    with open(querypath, 'r') as f:
        for line in f:
            query = line.strip("\n").strip("\r")
            query = ts.stem_tokens(ts.removestops(ts.tokenizeText(query)))
            queries.append(query)

    # TF-IDF ranking per query, echoing the top 10 to stdout.
    resultmatrix = []
    for query in queries:
        result = tfidf.TFIDF(query, index, n)
        print(str(query))
        print(str(len(result)) + " documents found. Top 10:")
        for line in result[:10]:
            print(
                str(line[0]) + " 0 " + str(line[2]) + " 0 " +
                str(line[4]) + " 0")
        resultmatrix.append(result)
        print("\n")

    # Write at most the top 1000 results per query (plain slice replaces
    # the original `if len(...) > 1000` guard — identical behavior).
    with open("results.ranked.txt", 'w') as f:
        for i, results in enumerate(resultmatrix):
            f.write("Query: " + str(queries[i]) + "\n")
            for line in results[:1000]:
                f.write(
                    str(line[0]) + " 0 " + str(line[2]) + " 0 " +
                    str(line[4]) + " 0 \n")
            f.write("\n")
    return resultmatrix
def __init__(self, ftp_folder, data_dir, tfidf_storage_dir, interval):
    """Set up the TF-IDF generator worker and start its background thread.

    Args:
        ftp_folder: source FTP folder the worker pulls data from.
        data_dir: local directory for working data.
        tfidf_storage_dir: directory where TF-IDF output is stored.
        interval: interval value used by the worker
            (presumably a polling period in seconds — confirm in
            tfidf_generater).
    """
    # Process-safe queue used to hand work to/from the generator.
    self.queue = multiprocessing.SimpleQueue()
    self.ftp_folder = ftp_folder
    self.data_dir = data_dir
    self.tfidf_storage_dir = tfidf_storage_dir
    self.interval = interval
    # NOTE(review): directory auto-creation is disabled; callers must
    # ensure data_dir / tfidf_storage_dir exist — confirm intent.
    #if not os.path.exists(self.data_dir ):
    #    os.makedirs(self.data_dir)
    #if not os.path.exists(self.tfidf_storage_dir ):
    #    os.makedirs(self.tfidf_storage_dir)
    self.tfidf_obj = tfidf.TFIDF()
    # Local host IP, resolved once at startup.
    self.hostname = socket.gethostbyname(socket.gethostname())
    # Background worker thread; do_exit is set before start so the loop
    # (presumably checked in tfidf_generater) sees a defined flag.
    self.daemon = threading.Thread(target=self.tfidf_generater, args=())
    self.do_exit = False
    self.daemon.start()
def basic_tfidf_obj(path):
    """Smoke-test TF-IDF on *path*: print the tfidf value of the first
    returned entry."""
    results = tfidf.TFIDF().get_tf_idf(path)
    first = results[0]
    print(first.tfidf)