def main():
    content = load_stories()

    # TF-IDF tester
    tfidf_handler = TFIDF(use_idf=True, ngram=(1, 3))
    tfidf_handler.analyse_corpus(document=content[0])
    df = tfidf_handler.extract_keywords(doc_idx=0)
    print(df.sort_values(by='tfidf', ascending=False))
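The TFIDF handler used in main() is not shown anywhere in these snippets. Below is a minimal sketch of the interface it appears to expose (analyse_corpus, extract_keywords, a 'tfidf' score column), backed here by scikit-learn's TfidfVectorizer; the backing choice, and the assumption that `document` is a list of texts, are mine, not the source's.

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

class TFIDF:
    """Sketch of the handler interface used in main() above (assumed)."""

    def __init__(self, use_idf=True, ngram=(1, 1)):
        self.vectorizer = TfidfVectorizer(use_idf=use_idf, ngram_range=ngram)
        self.matrix = None

    def analyse_corpus(self, document):
        # Assumes 'document' is a list of texts to fit tf-idf over.
        self.matrix = self.vectorizer.fit_transform(document)

    def extract_keywords(self, doc_idx=0):
        # Pull the nonzero tf-idf scores for one document as a DataFrame.
        row = self.matrix[doc_idx].toarray().ravel()
        df = pd.DataFrame({'term': self.vectorizer.get_feature_names_out(),
                           'tfidf': row})
        return df[df['tfidf'] > 0]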
import os


class FeatureSelector:
    def __init__(self):
        self.table = TFIDF()

    def run(self):
        """Generate the features using the top-N algorithm."""
        for dir_name in os.listdir("../data/groups/"):
            if dir_name == '.DS_Store':
                continue
            for file_name in os.listdir("../data/groups/%s" % dir_name):
                if file_name == '.DS_Store':
                    continue
                document_name = "%s/%s" % (dir_name, file_name)
                with open("../data/groups/%s" % document_name, 'r') as f:
                    self.table.add_document(document_name, f.read().lower())

        new_data_set = self.table.top_n_words(3)
        for document_name, words in new_data_set.items():
            directory_name, file_name = document_name.split('/')
            path_name = "../data/features/%s" % directory_name
            if not os.path.exists(path_name):
                os.makedirs(path_name)
            with open("%s/%s" % (path_name, file_name), 'w') as f:
                for word in words:
                    f.write(word)
                    f.write("\n")
class FeatureSelector:
    def __init__(self):
        self.table = TFIDF()

    def run(self, index_file):
        """Generate the features using the top-N algorithm."""
        with open(index_file) as f:
            lines = f.readlines()
        for line in lines:
            name = line[:-1]
            with open("../data/scoped/%s" % name, 'r') as d:
                document = Document(d.read())
            self.table.add_document(name, document.content_lower)

        new_data_set = self.table.top_n_words(10)
        for document_name, words in new_data_set.items():
            with open("../data/scoped/%s" % document_name, 'r') as d:
                document = Document(d.read())
            path_name = "../data/features/%s" % document_name
            with open(path_name, 'w') as f:
                for word in words:
                    # Repeat each feature word once per occurrence in the document.
                    for _ in range(document.count(word)):
                        f.write(word)
                        f.write("\n")
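Both FeatureSelector variants above assume a TFIDF table exposing add_document(name, text) and top_n_words(n), which maps each document name to its n highest-scoring terms. That class is not included in the snippets; here is a minimal sketch of the interface, with assumed details (whitespace tokenization, unsmoothed idf):

import math
from collections import Counter

class TFIDF:
    """Minimal sketch of the table interface used by FeatureSelector (assumed)."""

    def __init__(self):
        self.docs = {}  # document name -> term counts

    def add_document(self, name, text):
        self.docs[name] = Counter(text.split())

    def top_n_words(self, n):
        num_docs = len(self.docs)
        df = Counter()  # document frequency per term
        for counts in self.docs.values():
            df.update(counts.keys())
        result = {}
        for name, counts in self.docs.items():
            total = sum(counts.values())
            scores = {
                term: (count / total) * math.log(num_docs / df[term])
                for term, count in counts.items()
            }
            result[name] = sorted(scores, key=scores.get, reverse=True)[:n]
        return result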
def analysis_data(segment, idf, tfidf, collections):
    """Given the names of MongoDB collections, run word segmentation over the
    input data set and compute TF, IDF, and TF*IDF."""
    all_count = sum(counts[c] for c in collections)
    logger.info('calculate {}.'.format(collections))
    tfidf_obj = TFIDF(
        collection_names=collections,
        all_count=all_count,
    )
    if segment:
        tfidf_obj.generate_segment_requests_using_pool()
        # tfidf_obj.generate_segment_requests()
    if idf:
        tfidf_obj.generate_idf_requests()
    if tfidf:
        tfidf_obj.generate_tf_idf_requests()
import numpy
import scipy.linalg


class LSA:
    def __init__(self):
        self.TF = TFIDF()
        self.articles_dir = "articles/"
        self.summaries_dir = "summaries/"
        self.keywords_dir = "keywords/"

    def _build_matrix(self, filename):
        """Read the file and build the term-by-sentence weight matrix."""
        with open(filename) as f:
            text = f.read()
        sentences = tokenize(text, "sentence", return_spans=False)
        wc = {}
        clean_sentences = []
        for sent in sentences:
            clean_sent = {}
            for word in tokenize(sent, "word", return_spans=False):
                word = self.TF.clean(word)
                clean_sent[word] = 1
                wc[word] = wc.get(word, 0) + 1
            clean_sentences.append(clean_sent)
        vocab = list(wc.keys())  # fixed order so rows can be indexed back to words
        matrix = [[self.TF.weight(word, wc[word]) if word in sent else 0
                   for sent in clean_sentences]
                  for word in vocab]
        return numpy.array(matrix), vocab, sentences

    def keywords(self, filename, num_topics=5, keywords_per_topic=3):
        matrix, vocab, _ = self._build_matrix(filename)
        U, s, Vh = scipy.linalg.svd(matrix, full_matrices=False)
        keywords = []
        for topic in range(num_topics):
            try:
                # Words with the most extreme loadings on this topic come first.
                words = sorted(enumerate(U[:, topic]), key=lambda x: x[1])
            except IndexError:
                print("Problem indexing numpy array for", filename, "on topic", topic)
                continue
            added = 0
            word_index = 0
            while added < keywords_per_topic and word_index < len(words):
                if vocab[words[word_index][0]] not in keywords:
                    keywords.append(vocab[words[word_index][0]])
                    added += 1
                word_index += 1
        return ", ".join(keywords)

    def summarize(self, filename, num_sentences=5):
        matrix, _, sentences = self._build_matrix(filename)
        U, s, Vh = scipy.linalg.svd(matrix, full_matrices=False)
        # Scale each topic's sentence loadings by its singular value.
        D = numpy.diag(s) @ Vh
        summary_sentence_indices = []
        topic = 0
        while len(summary_sentence_indices) < num_sentences:
            sent_weights = D[topic, :]
            top_sents = sorted(enumerate(sent_weights), key=lambda x: x[1])
            for sent in top_sents:
                if sent[0] > 0 and sent[0] not in summary_sentence_indices:
                    summary_sentence_indices.append(sent[0])
                    break
            topic += 1
        summary = ""
        for i in sorted(summary_sentence_indices):
            summary += sentences[i] + "\n"
        return summary
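The LSA class relies on just two TFIDF helpers, clean(word) and weight(word, count), neither of which is shown. A plausible minimal sketch follows; both bodies are assumptions, not the source's implementation:

import math
import string

class TFIDF:
    def clean(self, word):
        # Normalize a token: lowercase and strip surrounding punctuation (assumed).
        return word.lower().strip(string.punctuation)

    def weight(self, word, count):
        # Log-scaled term weight (assumed); a real implementation would likely
        # also fold in an idf term from a background corpus.
        return 1 + math.log(count)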
def calculate_tfidf(x_df: pd.DataFrame) -> pd.DataFrame:
    """Calculate the tf-idf matrix of the 'text' column and return it as a
    DataFrame."""
    tfidf_obj = TFIDF(x_df["text"])
    return tfidf_obj()
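Here TFIDF is used as a callable object: constructed from the 'text' column, then invoked to yield the tf-idf DataFrame. A minimal sketch of such a wrapper, delegating to scikit-learn's TfidfVectorizer; the delegation is an assumption, only the call shape comes from the snippet:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

class TFIDF:
    def __init__(self, texts: pd.Series):
        self.texts = texts

    def __call__(self) -> pd.DataFrame:
        # One row per input text, one column per vocabulary term.
        vectorizer = TfidfVectorizer()
        matrix = vectorizer.fit_transform(self.texts)
        return pd.DataFrame(
            matrix.toarray(),
            columns=vectorizer.get_feature_names_out(),
            index=self.texts.index,
        )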
from tf_idf import TFIDF, TFIDF_stop_word

print("++++++++++++++++++++++++++++++++ TEST Part I ++++++++++++++++++++++++++++++++")

# Test class properties.
corpus = TFIDF(lowercase=True, v_max=15, n_grams=2)
# corpus.fit(['a', 'b', 'c', 'd'])
# assert corpus.v_max == 4
# print(corpus.vocabulary)

# Test methods.
s = [
    "This is a test: What do you want from me?",
    "nothing to clean here",
    ":?:mickey mouse!@.,#",
]
assert corpus.remove_bad_chars(s) == [
    "This is a test What do you want from me",
    "nothing to clean here",
    "mickey mouse!@#",
]

print("TEST 1:")
test_corpus = [
    'The hotel and the stay were great',
    'This was a great stay',
    'Great stay in a great destination',
    'Great destination',
]
corpus.fit(test_corpus)
print(f"vocabulary: {corpus.vocabulary}")
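The assert above pins down remove_bad_chars: it strips ':', '?', '.', and ',' but leaves '!', '@', and '#' intact. A minimal standalone sketch that satisfies it; the exact character set is inferred from the assert, not taken from tf_idf's source:

def remove_bad_chars(docs):
    """Strip the characters the assert above treats as bad: ':', '?', '.', ','."""
    table = str.maketrans('', '', ':?.,')
    return [doc.translate(table) for doc in docs]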