class TextIndexer:
    __textCollection = None

    def __init__(self, documents):
        self.__textCollection = TextCollection(documents)

    def idf(self, term):
        return self.__textCollection.idf(term)

    def tf(self, term, text):
        return self.__textCollection.tf(term, text)
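# A minimal usage sketch for TextIndexer (assumption: the sample documents
# below are invented; TextCollection expects already-tokenized texts).
from nltk.text import TextCollection

docs = [['this', 'is', 'sentence', 'one'],
        ['this', 'is', 'sentence', 'two']]
indexer = TextIndexer(docs)
print(indexer.tf('one', docs[0]))   # term frequency of 'one' within the first document
print(indexer.idf('one'))           # inverse document frequency of 'one' across both documents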
def Generate_keyword(obj, length):
    orig_file = './Data/' + obj + '/' + obj + '.xlsx'
    data = xlrd.open_workbook(filename=orig_file)
    sheet = data.sheet_by_index(1)
    review_head = np.array(sheet.col_values(12))[1:]
    review_body = np.array(sheet.col_values(13))[1:]
    review_all = []
    for i in range(length):
        review = review_head[i] + " " + review_body[i]
        review_all.append(review)
    review_all = np.array(review_all)

    # make review tokens
    tokens = []
    for i, review in enumerate(review_all):
        review = review.lower()
        replacer = RegexpReplacer()
        review = replacer.replace(review)
        remove = str.maketrans('', '', string.punctuation)
        review = review.translate(remove)
        token = nltk.word_tokenize(review)
        token = [w for w in token if w == 'not' or w not in stopwords.words('english')]
        s = nltk.stem.SnowballStemmer('english')
        token = [s.stem(ws) for ws in token]
        tokens.append(token)

    token_file = './Data/' + obj + '/tokens.pkl'
    f = open(token_file, 'wb')
    pickle.dump(tokens, f)
    f.close()

    corpus = TextCollection(tokens)
    tf = {}
    tf_idf = {}
    for review in tokens:
        for word in review:
            if word not in tf:
                tf_ = corpus.tf(word, corpus)
                tf[word] = tf_
            if word not in tf_idf:
                tf_idf_ = corpus.tf_idf(word, corpus)
                tf_idf[word] = tf_idf_
    tf_sorted = sorted(tf.items(), key=lambda item: item[1], reverse=True)
    tf_idf_sorted = sorted(tf_idf.items(), key=lambda item: item[1], reverse=True)
    pd.DataFrame(tf_sorted).to_csv('./Data/' + obj + '/tf_sorted.csv')
    pd.DataFrame(tf_idf_sorted).to_csv('./Data/' + obj + '/tf_idf_sorted.csv')
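# Hypothetical call of Generate_keyword (assumption: ./Data/hair_dryer/hair_dryer.xlsx
# exists with review headlines in column 12 and bodies in column 13, and holds at
# least 1000 data rows; the object name and length here are illustrative only).
Generate_keyword('hair_dryer', 1000)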
def computeTFIDF_text(texts, singletext):
    # texts: list of sentence strings (the corpus); singletext: a single sentence string
    texts = [nltk.word_tokenize(text) for text in texts]  # tokenize every sentence in the corpus
    corpus = TextCollection(texts)
    words = nltk.word_tokenize(singletext)  # token list of the single sentence
    tfidf_words = {}  # tf-idf score per word
    for word in words:
        idf = corpus.idf(word)        # idf
        tf = corpus.tf(word, words)   # tf
        tfidf = idf * tf
        tfidf_words[word] = tfidf
    return tfidf_words
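# A small, made-up example of computeTFIDF_text (assumes nltk's 'punkt'
# tokenizer data is available): scores each word of one sentence against a
# three-sentence corpus.
sample_texts = ['this is sentence one', 'this is sentence two', 'this is sentence three']
print(computeTFIDF_text(sample_texts, 'this is sentence one'))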
def preprocess(self, text):
    # text = text.split(" ")
    text = word_tokenize(text)
    if self.display:
        print "After Tokenizing"
        print text
        print "\n\n"
    text = [w.strip().lower() for w in text
            if w.strip() not in ENGLISH_STOPWORDS and len(w.strip()) > 2]
    tc = TextCollection([text])
    words = list(set(tc))
    word_tf = {word: tc.tf(word, text) * len(text) for word in words}
    return word_tf
def attrexplore(corpus):
    # s = "in douglas r. stinson, editor,.proc. crypto 93,.lecture notes in computer science no. 773..pages 278-291..1994..avrim blum, merrick furst, michael kearns, and richard j. lipton..springer,.cryptographic primitives based on hard learning problems.."
    # ss = SenToken(raw=s)
    # print(ss)
    # for sent in ss:
    #     print(sent)
    nltkCorpus = TextCollection(corpus)
    print(nltkCorpus.idf(term='this'))
    print(idf(term='this', corpus=corpus))
    print(nltkCorpus.tf(term='this', text='this is sentence four'))
    print(tf_idf(term='this', doc='this is sentence four', corpus=corpus))
    fdist = nltk.FreqDist(WordTokener(sent=corpus[0]))
    print(fdist.tabulate())
def compute_tf_idf_similarity(query: str, content: str, type: str) -> float:
    """
    Compute the mean tf-idf or tf similarity between one content sentence and
    multiple query words.

    :param query: a string containing all key words, separated by single spaces
    :param content: the content string relevant to this query
    :param type: "tf_idf" or "tf", selecting which score is averaged
    :return: average tf-idf or tf similarity
    """
    sents = [word_tokenize(content), word_tokenize("")]  # add one empty document to smooth
    corpus = TextCollection(sents)  # build the corpus
    result_list = []
    for key_word in query.strip(" ").split(" "):
        if type == "tf_idf":
            result_list.append(corpus.tf_idf(key_word, corpus))
        elif type == "tf":
            result_list.append(corpus.tf(key_word, corpus))
        else:
            raise KeyError
    return sum(result_list) / len(result_list)
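# Hypothetical calls of compute_tf_idf_similarity (the query and content strings
# are invented for illustration; query terms are separated by single spaces).
print(compute_tf_idf_similarity('sentence one', 'this is sentence one', 'tf_idf'))
print(compute_tf_idf_similarity('sentence one', 'this is sentence one', 'tf'))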
def splitter(fileName):
    with open(fileName, 'r') as f:
        d = json.load(f)
    # assuming the file is evenly divisible by 10
    for i in range(3):
        shuffle(d)
    d = d[:1000]
    corpus = []
    corpusList = []
    classifiers = []
    for i in range(len(d)):
        d[i]['tAbstract'] = tknize(d[i]['abstract'])
        corpus.extend(d[i]['tAbstract'])
        corpusList.extend(d[i]['tAbstract'])
        if d[i]['type'] not in classifiers:
            classifiers.append(d[i]['type'])
    # initialize numpy array
    for i in range(len(d)):
        d[i]['vector'] = numpy.empty([len(corpusList)])
    tc = TC(corpus)
    print("Starting vector calculation")
    for doc in d:
        place = 0
        for word in corpusList:
            idf = tc.idf(word)
            tf = tc.tf(word, doc['tAbstract'])
            # create a vector that is guaranteed to be in the same order for each doc, as
            # each doc appends the tf-idf score of the word to its vector at the same time
            doc['vector'][place] = idf * tf
            place += 1
    return d, classifiers
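# Hypothetical usage of splitter (assumption: the file name is made up and must
# point to a JSON list of records, each with 'abstract' and 'type' fields;
# tknize is the tokenizer used above).
docs, classifiers = splitter('./data/abstracts.json')
print(len(docs), classifiers)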
# tokens = nltk.word_tokenize(f2)
corpus = TextCollection(chong)  # build the corpus
# print(corpus)  # print the corpus

sents_1 = sent_tokenize(f1)
chong_1 = [word_tokenize(sent) for sent in sents_1]
corpus_1 = TextCollection(chong_1)
# print(corpus_1)

# compute the tf value of each filtered word in the corpus
for a in filtered_words:
    tf = corpus_1.tf(a, corpus_1)  # 1/12
    # print(a, "tf:", tf)
    # print(type(a))
    # compute the idf value in the corpus
    idf = corpus.idf(a)  # log(3/1)
    # print(a, "idf:", idf)
    tf_idf = tf * idf
    d = dict.fromkeys([a], tf_idf)
    # print(d)
    Word_dict.update(d)
print(len(features))

from nltk.text import TextCollection
from nltk.tokenize import word_tokenize

# First, build the corpus
sents = ['this is sentence one', 'this is sentence two', 'this is sentence three']
sents = [word_tokenize(sent) for sent in sents]  # tokenize each sentence
print(sents)  # print the tokenized sentences
corpus = TextCollection(sents)  # build the corpus
print(corpus)  # print the corpus

# compute the tf value of "one" in the corpus
tf = corpus.tf('one', corpus)  # 1/12
print(tf)

# compute the idf value of "one" in the corpus
idf = corpus.idf('one')  # log(3/1)
print(idf)

# compute the tf-idf value of "one" in the corpus
tf_idf = corpus.tf_idf('one', corpus)
print(tf_idf)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

x_train = [
    'TF-IDF 主要 思想 是',
    '算法 一个 重要 特点 可以 脱离 语料库 背景',
def processSimilarity(self):
    # Similarity analysis
    # Find all synonyms of all words
    wordsGroup = []
    if not os.path.exists('featureGroups.txt'):
        marker = 0
        matches = []
        group = []
        for w in self.CandidateTerms:
            if w != "" and not (type(w) is list):
                for word in w.split():
                    for synset in wn.synsets(word):
                        for synonym in synset.lemmas():
                            matches = [
                                term for term in self.CandidateTerms
                                if not (type(term) is list) and (
                                    synonym.name() in term.split()
                                    and term not in matches)
                            ]
                            matches = self.RemoveDuplicates(matches)
                            if len(matches) > 0:
                                # Construct words group
                                group.extend(matches)
                if len(group) > 0:
                    wordsGroup.append(('FG' + str(marker), copy.deepcopy(group)))
                    for val in group:
                        with open("featureGroups.txt", "a") as fg:
                            fg.write('FG' + str(marker) + ' -> ' + val + '\n')
                    group = []
                    marker = marker + 1
    else:
        added = False
        with open("featureGroups.txt", "r") as fg:
            grouptext = fg.read()
        lines = grouptext.split("\n")
        for line in lines:
            arr = line.split("->")
            for (m, v) in wordsGroup:
                if m == arr[0]:
                    v.append(arr[1])
                    added = True
                    break
            if not added:
                if len(arr) > 1:
                    wordsGroup.append((arr[0], [arr[1]]))
            added = False
    textcombine = ' '
    for i, s, t in self.tokenized_sentence_array:
        textcombine = (textcombine + ''.join(s))
    corpuscol = TextCollection([textcombine])
    for g in wordsGroup:
        for w in g:
            cnt = 0
            weight = 0.0
            for t in w:
                weight = weight + corpuscol.tf(t, textcombine)
                cnt = cnt + 1
            self.WeightedCandidateTerm.append((w, weight / cnt))
    with open("weightedGroups.txt", "a") as wg:
        wg.write(str(self.WeightedCandidateTerm).strip('[]'))
    return wordsGroup
class CitationSearch:
    def __init__(self, pairs, mode='eng', stopwords_flag=True):
        self.pair_dict = {}
        self.ids = [pair[0] for pair in pairs]
        self.tfidfs = []
        self.mode = mode
        self.stopwords_flag = stopwords_flag
        docs = [pair[1] for pair in pairs]
        self.docs = [self.preprocess(doc) for doc in docs]
        for id, text in zip(self.ids, self.docs):
            self.pair_dict[id] = text
        self.corpus = TextCollection(self.docs)
        self.query = []

    def preprocess(self, raw):
        if self.mode == 'eng':
            return self.preprocess_eng(raw)
        if self.mode == 'ru':
            return self.preprocess_ru(raw)
        if self.mode == 'eng+ru':
            return self.preprocess_eng(raw) + self.preprocess_ru(raw)

    def preprocess_eng(self, raw):
        doc = []
        text = re.findall(words_eng, raw)
        for token in text:
            token = token.lower()
            if self.stopwords_flag:
                if token not in stopwords_eng:
                    doc.append(stemmer.stem(token))
            else:
                doc.append(stemmer.stem(token))
        return doc

    def preprocess_ru(self, raw):
        doc = []
        text = re.findall(words_ru, raw)
        for token in text:
            if self.stopwords_flag:
                if token not in stopwords_ru:
                    doc.append(morph.parse(token)[0].normal_form)
            else:
                doc.append(morph.parse(token)[0].normal_form)
        return doc

    def get_tf(self, term, document):
        return self.corpus.tf(term, document)

    def get_idf(self, term):
        return self.corpus.idf(term)

    @staticmethod
    def normalize_cosine(doc, doc_vecs):
        counter = Counter(doc)
        cosine_norm = np.sqrt(np.sum(np.array(list(dict(counter).values())) ** 2))
        doc_vector = np.array(doc_vecs) / cosine_norm
        return doc_vector

    def tfidf_docs(self):
        doc_vectors = []
        for doc in self.docs:
            doc_tfs = {}
            for term in doc:
                doc_tfs[term] = self.get_tf(term, doc) * self.get_idf(term)
            doc_vector = self.normalize_cosine(doc, list(doc_tfs.values()))
            doc_tfidfs = {}
            for term, vec in zip(doc_tfs, doc_vector):
                doc_tfidfs[term] = vec
            doc_vectors.append(doc_tfidfs)
        self.tfidfs = doc_vectors

    def tfidf_queries(self, query):
        self.query = self.preprocess(query)
        query_tfsidfs = {}
        for term in self.query:
            query_tfsidfs[term] = self.get_tf(term, self.query) * self.get_idf(term)
        return query_tfsidfs

    def query_relevance(self, query):
        tfidf = self.tfidf_queries(query)
        query_vec = list(tfidf.values())
        doc_vecs = []
        for doc in self.tfidfs:
            doc_vec = []
            for term_query in tfidf:
                if term_query in doc:
                    doc_vec.append(doc[term_query])
                else:
                    doc_vec.append(0)
            doc_vecs.append(doc_vec)
        cosines = []
        for vec in doc_vecs:
            if np.any(vec):
                cosines.append(1 - cosine(vec, query_vec))
            else:
                cosines.append(0)
        relevance_ids = [
            text_id for _, text_id in sorted(zip(cosines, self.ids),
                                             key=(lambda x: x[0]), reverse=True)
        ]
        cosines.sort(reverse=True)
        most_relevant = relevance_ids[0]
        relevant_candidates = [relevance_ids[0]]
        for cos in range(1, len(cosines)):
            if cosines[0] - cosines[cos] <= 0.000001:
                relevant_candidates.append(relevance_ids[cos])
        if len(relevant_candidates) > 1:
            tiebreaker = []
            for id in relevant_candidates:
                rel_text = self.pair_dict[id]
                absent_words = 0
                for word in rel_text:
                    if word not in self.query:
                        absent_words += 1
                tiebreaker.append(absent_words)
            relevant_candidates = [
                text_id for _, text_id in sorted(zip(tiebreaker, relevant_candidates),
                                                 key=(lambda x: x[0]))
            ]
            most_relevant = relevant_candidates[0]
        return most_relevant, cosines[0]
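# A minimal usage sketch for CitationSearch (assumption: the module-level names
# it relies on, i.e. the words_eng/words_ru regexes, stemmer, morph and the
# stopword sets, are defined elsewhere in the original module; the pairs below
# are illustrative only).
pairs = [(1, 'Deep learning approaches to citation matching'),
         (2, 'A survey of statistical machine translation')]
searcher = CitationSearch(pairs, mode='eng')
searcher.tfidf_docs()  # vectorize the document collection before querying
best_id, score = searcher.query_relevance('citation matching with deep learning')
print(best_id, score)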
def write_tfidf_file(xmlcollection, nltk_textcollection):
    """
    Writes a tf*idf matrix file with all tf*idf values for each document,
    row by row. The columns represent the (alphabetically ordered) stems
    available in the whole collection.

    @param xmlcollection: Collection of XML documents, type collection
    @param nltk_textcollection: NLTK TextCollection of all the stems
    """
    idf_file = get_stems_file(measure="_idf")
    avg_words_per_doc = len(xmlcollection.get_words()) / \
                        len(xmlcollection.get_docs())

    if not exists(idf_file):
        write_idf_file(xmlcollection, nltk_textcollection)
    idf_dict = DictFromFile(idf_file)

    tfidf_dict = dict()
    high_tfidf_stems = set()

    collection_stems = list(xmlcollection.get_stems(uniq=True))
    print "Length of collection, all stems:", len(collection_stems)

    # Remove most frequent (idf<2) / stop stems (or qualifying as such),
    # and most rare stems (max(idf)), as they are of no help to
    # separate / make up clusters
    collection_stems = get_classification_stems(collection_stems, idf_dict)
    print "Length of collection, cluster stems:", len(collection_stems)

    f = open(get_tfidf_matrix_file(), "w", get_def_enc())
    for doc in xmlcollection.get_docs():
        doc_stems = doc.get_stems()
        col = TextCollection("")
        stdout.write(doc.get_id())
        idf_row = ""
        stdout.write(" (")
        for stem in sorted(collection_stems):
            tf = col.tf(stem, doc_stems)
            # Reweight tf values, to get more classification words and
            # compensate for the very different document sizes available.
            # Idea: Accounts for average document length, but also for the
            # number of times a word effectively occurs in a specific
            # document; other variations can be thought of (using log) or
            # maximal tf values.
            # Note: The clustering works better with (in general) smaller
            # values.
            if tf > 0.0:
                tf = 1.0 / avg_words_per_doc * tf
            # If nothing applies: tf is 0.0
            tfidf = tf * float(idf_dict[stem])
            tfidf_dict[stem] = tfidf

            # We may find here some threshold that makes sense
            if (tfidf > 0.0):
                stdout.write(stem + ", ")
                high_tfidf_stems.add(stem)
            idf_row += str(tfidf) + " "
        f.write(idf_row + "\n")
        stdout.write(")\n")
    f.close()
    print "List length of high value tf*idf terms:", len(high_tfidf_stems)

    sorted_tfidf_dict = \
        sorted(tfidf_dict.iteritems(), reverse=True, key=operator.itemgetter(1))
    f = open(get_stems_file(measure="_tfidf_sorted"), "w", get_def_enc())
    for pair in sorted_tfidf_dict:
        f.write(str(pair[1]) + " " + pair[0] + "\n")
    f.close()
    ]
    s = nltk.stem.SnowballStemmer('english')
    token = [s.stem(ws) for ws in token]
    tokens.append(token)

f = open('../Data/hair_dryer/tokens_dryer.pkl', 'wb')
pickle.dump(tokens, f)
f.close()

# build the corpus
corpus = TextCollection(tokens)
tf = {}
tf_idf = {}
for review in tokens:
    for word in review:
        if word not in tf:
            tf_ = corpus.tf(word, corpus)
            tf[word] = tf_
        if word not in tf_idf:
            tf_idf_ = corpus.tf_idf(word, corpus)
            tf_idf[word] = tf_idf_
tf_sorted = sorted(tf.items(), key=lambda item: item[1], reverse=True)
tf_idf_sorted = sorted(tf_idf.items(), key=lambda item: item[1], reverse=True)
pd.DataFrame(tf_sorted).to_csv('../Data/hair_dryer/tf_sorted_dryer.csv')
pd.DataFrame(tf_idf_sorted).to_csv('../Data/hair_dryer/tf_idf_sorted_dryer.csv')
        str(editdistance) + "; this may take a while!"
xmlcollection.get_words_by_editdistance(editdistance=editdistance,
                                        no_of_most_freq=no_of_topwords)

# Write the found sets to disk; also write most frequent words to disk.
xmlcollection.write_words_by_editdistance(editdistance=editdistance)
xmlcollection.write_topwords(no_of_words=no_of_topwords)
print "Top words written to disk."

# XXX: BIG F**K UP ################################## FIX FIX FIX #####
# Print idf, tf and tf-idf values for the term "CCC", in document
# no. 42 - for testing.
nltk_textcollection = TextCollection(xmlcollection.get_words())
print "idf: " + str(nltk_textcollection.idf("CCC"))
print "tf: " + str(nltk_textcollection.tf("CCC",
    TextCollection(xmlcollection.get_doc(42).get_tokens())))
print "tf_idf: " + str(nltk_textcollection.tf_idf("CCC",
    TextCollection(xmlcollection.get_doc(42).get_tokens())))

# Do that now systematically for all documents
print "Documents where tf is bigger than 0:"
cnt = 0
for doc in xmlcollection.get_docs():
    tf = nltk_textcollection.tf("CCC", TextCollection(doc.get_tokens()))
    stdout.write(str(tf) + ", ")
    cnt += 1
    if cnt == 10:
        print
        cnt = 0
    if tf > 0.0:
        print "\n" + doc.get_xml_filename()
from __future__ import print_function
from nltk.corpus import PlaintextCorpusReader
from nltk.text import TextCollection

# Load all the files in the corpus root, and calculate tf, idf, and tf_idf
# on them, and on a specific term.
if __name__ == "__main__":
    corpus_root = '../data/source_data'
    corpus = PlaintextCorpusReader(corpus_root, '[a-zA-Z \-]*\.txt')
    ids = corpus.fileids()
    collection = TextCollection(corpus)

    # for x, word in enumerate(corpus.words(ids[0])[:200]):
    #     print(x, word)

    source = ids[0]
    term = corpus.words(source)[107]
    doc = corpus.words(ids[2])
    print("Source: ", source)
    print("TF of: ", term, ": ", collection.tf(term, doc))
    print("IDF of: ", term, ": ", collection.idf(term))
    print("tf_Idf of:", term, ": ", collection.tf_idf(term, doc))
    novel_data = open(file).read()
    cleaned_data = ''.join(re.findall(r'[\u4e00-\u9fa5]', novel_data))
    wordlist = jieba.lcut(cleaned_data)
    return wordlist


# texts of different kinds
text1 = text(file='./text/new2.txt')
# text2 = text(file='./texts/caijing.txt')
# text3 = text(file='./texts/xinwen.txt')
# text4 = text(file='./texts/keji.txt')

# wrap the text list in a TextCollection
mytexts = TextCollection([text1])
dict_key = {}

# iterate over the word list, match against computer-related nouns,
# and pick the top-ranked words
wordlist = text_wordlsit(sys.argv[1])
for wod in wordlist:
    if len(wod) < 3:
        continue
    cfd = mytexts.tf(wod, text1)
    dict_key[wod] = cfd

listdic = sorted(dict_key.items(), key=lambda d: d[1], reverse=True)
print(listdic[:5])
# A proper lexicon is missing, and the word segmentation does not seem very good either.