def getDomainUnigram(self, directory=None):
    collocations = set()   # collocation items
    ewordlists = list()    # list of lists of words
    # extract words from essays
    if directory is not None:
        doclist = os.listdir(directory)
        for essay in doclist:
            dir_essay = directory + '/' + essay
            etext = open(dir_essay, 'r').read()
            tokens = nltk.wordpunct_tokenize(etext)
            tokens = [word.lower() for word in tokens]
            # stemming
            if self._stemoption:
                st = PorterStemmer()
                tokens = [st.stem(t) for t in tokens]
            # extract the collocations for the given essay
            e_bigram = set(Mytext(tokens).collocations())
            collocations = collocations | e_bigram
            ewordlists.append(tokens)
    else:
        # use the mapped essays to calculate the candidate bigrams;
        # the mapessay function must be called first
        for ins in self._data:
            if ins['essay'] is not None:
                etext = open(ins['essay'], 'r').read()
                tokens = nltk.wordpunct_tokenize(etext)
                tokens = [word.lower() for word in tokens]
                # stemming
                if self._stemoption:
                    st = PorterStemmer()
                    tokens = [st.stem(t) for t in tokens]
                # extract the collocations for the given essay
                e_bigram = set(Mytext(tokens).collocations())
                collocations = collocations | e_bigram
                ewordlists.append(tokens)

    # collection of all essays under the specified directory / associated essays
    collection_text = TextCollection(ewordlists)

    itemlist = list()
    for (a, b) in collocations:
        itemlist.append(a)
        itemlist.append(b)
    itemlist = list(set(itemlist))

    word_idf = []
    for i in range(len(itemlist)):
        word_idf.append((collection_text.idf(itemlist[i]), itemlist[i]))
    word_idf = sorted(word_idf, key=operator.itemgetter(0))

    ave = 0
    if len(word_idf) != 0:
        ave = sum(map(operator.itemgetter(0), word_idf)) / len(word_idf)

    wlist = [j for (i, j) in word_idf if i < ave]
    return wlist
class TextIndexer:
    __textCollection = None

    def __init__(self, documents):
        self.__textCollection = TextCollection(documents)

    def idf(self, term):
        return self.__textCollection.idf(term)

    def tf(self, term, text):
        return self.__textCollection.tf(term, text)
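A minimal usage sketch for the TextIndexer wrapper above; the sample documents and the `docs`/`indexer` names are illustrative assumptions, not taken from the original code.

# Minimal usage sketch (assumes `from nltk.text import TextCollection` is in scope
# for the TextIndexer class above; the documents are made-up examples).
docs = [
    ['the', 'cat', 'sat', 'on', 'the', 'mat'],
    ['the', 'dog', 'sat', 'on', 'the', 'log'],
    ['cats', 'and', 'dogs', 'play'],
]
indexer = TextIndexer(docs)
print(indexer.idf('cat'))          # rarer terms receive a higher idf
print(indexer.tf('the', docs[0]))  # term frequency within one document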
def tfidf_extraction(self, subset=None):
    if subset is not None:
        data = self.data[subset]
    else:
        data = self.data
    get_idf = TextCollection(data.Tokenize.to_list())
    word_list = list(set([w for l in data.Tokenize.to_list() for w in l]))
    full_winfo = [[word, idf, tag[1]]
                  for word, idf, tag in zip(word_list,
                                            [get_idf.idf(i) for i in word_list],
                                            nltk.pos_tag(word_list))]
    self.keywords = pd.DataFrame(
        [i for i in full_winfo
         if i[2] in ["JJ", "NNP", "VBP", "VBG", "VBD", "VBN", "CD", "NN", "NNPS", "RB", "IN"]
         and not is_number(i[0])],
        columns=["word", "idf", "tag"]
    ).sort_values(by="idf", ascending=True).reset_index(drop=False)
    self.full_words = pd.DataFrame(
        full_winfo, columns=["word", "idf", "tag"]
    ).sort_values(by="idf", ascending=True).reset_index(drop=False)
    self.enable_topk = True
def computeTFIDF_text(texts, singletext):
    # texts is a list of sentence strings (the corpus); singletext is a single sentence string
    texts = [nltk.word_tokenize(text) for text in texts]  # tokenize every sentence in the corpus
    corpus = TextCollection(texts)
    words = nltk.word_tokenize(singletext)  # token list for the single sentence
    tfidf_words = {}  # compute tf-idf for each word
    for word in words:
        idf = corpus.idf(word)
        tf = corpus.tf(word, words)
        tfidf = idf * tf
        tfidf_words[word] = tfidf
    return tfidf_words
def calculate_idf(words, corpus):
    """
    Calculate the idf of words by using a corpus
    :param words: The words to calculate their idf
    :param corpus: The corpus to use in the calculation
    :return: dict of {word: idf}
    """
    words = set(words)
    # print("Loading corpus to calculate idf...")
    corpus_collection = TextCollection(corpus)
    idfs = {}
    for word in words:
        idfs[word] = corpus_collection.idf(word)
    return idfs
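A small, hypothetical call to calculate_idf, assuming a tiny tokenized corpus; NLTK's TextCollection.idf returns log(N / df), so a word appearing in 1 of 3 texts scores higher than one appearing in 2 of 3.

# Hypothetical example corpus (token lists), not from the original code.
example_corpus = [
    ['the', 'cat', 'sat', 'on', 'the', 'mat'],
    ['the', 'dog', 'barked'],
    ['a', 'cat', 'and', 'a', 'dog'],
]
# 'cat' and 'dog' occur in 2 of 3 texts (idf = log 1.5), 'mat' in 1 of 3 (idf = log 3);
# the key order of the result may vary because `words` is converted to a set.
print(calculate_idf(['cat', 'dog', 'mat'], example_corpus))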
def IDF(qi, t):
    if t == 'all':
        corpus = TextCollection(body_split)
    elif t == 'ques':
        corpus = TextCollection(Q_body_split)
    elif t == 'ans':
        corpus = TextCollection(A_body_split)
    elif t == 'btitle':
        corpus = TextCollection(B_title_split)
    elif t == 'bsubt':
        corpus = TextCollection(B_subt_split)
    elif t == 'bsummary':
        corpus = TextCollection(B_summary_split)
    idf = corpus.idf(qi)
    return idf
def attrexplore(corpus):
    # s = "in douglas r. stinson, editor,.proc. crypto 93,.lecture notes in computer science no. 773..pages 278-291..1994..avrim blum, merrick furst, michael kearns, and richard j. lipton..springer,.cryptographic primitives based on hard learning problems.."
    # ss = SenToken(raw=s)
    # print(ss)
    # for sent in ss:
    #     print(sent)
    nltkCorpus = TextCollection(corpus)
    print(nltkCorpus.idf(term='this'))
    print(idf(term='this', corpus=corpus))
    print(nltkCorpus.tf(term='this', text='this is sentence four'))
    print(tf_idf(term='this', doc='this is sentence four', corpus=corpus))
    fdist = nltk.FreqDist(WordTokener(sent=corpus[0]))
    print(fdist.tabulate())
def splitter(fileName):
    with open(fileName, 'r') as f:
        d = json.load(f)
    # assuming the file is evenly divisible by 10
    for i in range(3):
        shuffle(d)
    d = d[:1000]
    corpus = []
    corpusList = []
    classifiers = []
    for i in range(len(d)):
        d[i]['tAbstract'] = tknize(d[i]['abstract'])
        corpus.extend(d[i]['tAbstract'])
        corpusList.extend(d[i]['tAbstract'])
        if d[i]['type'] not in classifiers:
            classifiers.append(d[i]['type'])
    # initialize numpy array
    for i in range(len(d)):
        d[i]['vector'] = numpy.empty([len(corpusList)])
    tc = TC(corpus)
    print("Starting vector calculation")
    for doc in d:
        place = 0
        for word in corpusList:
            idf = tc.idf(word)
            tf = tc.tf(word, doc['tAbstract'])
            # create a vector that is guaranteed to be in the same order for each doc, as
            # each doc appends the tf-idf score of the word to its vector at the same time
            doc['vector'][place] = idf * tf
            place += 1
    return d, classifiers
def bag_of_word(self):
    sub_cl = TextCollection(self.data["Tokenize"].to_list())
    sub_collection = list(set([word for text in self.data["Tokenize"].to_list() for word in text]))
    sub_dict = {}
    print("\n>> Extracting Bag-of-word Vector with TF-IDF...")
    for i in tqdm(sub_collection):
        sub_dict[i] = sub_cl.idf(i)
    sub_pos = {}
    index = 0
    for i in tqdm(sub_collection):
        sub_pos[i] = index
        index += 1
    sub_len = len(sub_collection)

    def d2v(word_list):
        v = np.zeros(sub_len)
        for i in word_list:
            v[sub_pos[i]] = sub_dict[i]
        return v

    tqdm.pandas(desc="Processing")
    self.sub_pos = sub_pos
    BaggingVector = self.data["Tokenize"].progress_apply(lambda x: d2v(x))
    print("\n>> Extracting Finished...")
    return BaggingVector, sub_pos
def calculate_IDFs(self, courses):
    '''
    This function calculates inverse document frequencies for each word in courses
    :param courses: Structure containing courses with title and description
    :return: IDFs - a dictionary containing all words as keys with their inverse document frequencies as values
    '''
    mytexts = []
    IDF_scores = {}
    for course in courses:
        # combine title and description into one string, weighting the title
        text = course.title + " " + course.description
        text = self.preprocess(text)
        # initialize the dictionary with all occurring words
        text_without_stopwords = ""
        mystopwords = stopwords.words("english") + stopwords.words("german")
        for word in nltk.word_tokenize(text):
            if word not in mystopwords:
                IDF_scores[word] = 0.0
                text_without_stopwords = text_without_stopwords + " %s" % word
        mytexts.append(text_without_stopwords)
    myTextCollection = TextCollection(mytexts)
    for word in IDF_scores:
        IDF_scores[word] = myTextCollection.idf(word)
    return IDF_scores
chong_1 = [word_tokenize(sent) for sent in sents_1]
corpus_1 = TextCollection(chong_1)
# print(corpus_1)
# print("_________________________")
# compute the tf value of each filtered word in the corpus
for a in filtered_words:
    tf = corpus_1.tf(a, corpus_1)  # e.g. 1/12
    # print(a, "tf:", tf)
    # print(type(a))
    # compute the idf value of the word in the corpus
    idf = corpus_1.idf(a)  # e.g. log(3/1)
    # print(a, "idf:", idf)
    tf_idf = tf * idf
    d = dict.fromkeys([a], tf_idf)
    # print(d)
    Word_dict.update(d)
# print("_________________________")
# print(Word_dict)
# from_high_to_low = sorted(Word_dict.items(), key=lambda d: d[1], reverse=False)
from_high_to_low = Counter(Word_dict).most_common()  # returns a list sorted by the dict values, largest first
# print(from_high_to_low)
for i in from_high_to_low:
" words by edit distance " + \ str(editdistance) + "; this may take a while!" xmlcollection.get_words_by_editdistance(editdistance=editdistance, no_of_most_freq=no_of_topwords) # Write the found sets to disk; also write most frequent words to disk. xmlcollection.write_words_by_editdistance(editdistance=editdistance) xmlcollection.write_topwords(no_of_words=no_of_topwords) print "Top words written to disk." # XXX: BIG F**K UP ################################## FIX FIX FIX ##### # Print idf, tf and tf-idf values for the term "CCC", in document # no. 42 - for testing. nltk_textcollection = TextCollection(xmlcollection.get_words()) print "idf: " + str(nltk_textcollection.idf("CCC")) print "tf: " + str(nltk_textcollection.tf("CCC", TextCollection(xmlcollection.get_doc(42).get_tokens()))) print "tf_idf: " + str(nltk_textcollection.tf_idf("CCC", TextCollection(xmlcollection.get_doc(42).get_tokens()))) # Do that now systematically for all documents print "Document where tf is bigger 0:" cnt = 0 for doc in xmlcollection.get_docs(): tf = nltk_textcollection.tf("CCC", TextCollection(doc.get_tokens())) stdout.write(str(tf) + ", ") cnt += 1 if cnt == 10: print cnt = 0
# First, build the corpus
sents = [
    'this is sentence one',
    'this is sentence two',
    'this is sentence three'
]
sents = [word_tokenize(sent) for sent in sents]  # tokenize each sentence
print(sents)   # show the tokenized sentences
corpus = TextCollection(sents)  # build the corpus
print(corpus)  # show the corpus

# compute the tf value of "one" in the corpus
tf = corpus.tf('one', corpus)  # 1/12
print(tf)

# compute the idf value of "one" in the corpus
idf = corpus.idf('one')  # log(3/1)
print(idf)

# compute the tf-idf value of "one" in the corpus
tf_idf = corpus.tf_idf('one', corpus)
print(tf_idf)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

x_train = [
    'TF-IDF 主要 思想 是',
    '算法 一个 重要 特点 可以 脱离 语料库 背景',
    '如果 一个 网页 被 很多 其他 网页 链接 说明 网页 重要'
]
x_test = ['原始 文本 进行 标记', '主要 思想']
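The scikit-learn part of the snippet above stops after defining x_train and x_test; below is a minimal sketch of the usual CountVectorizer + TfidfTransformer continuation, with variable names assumed rather than taken from the original.

# Sketch of the typical scikit-learn pipeline for the x_train / x_test lists above;
# this continuation is an assumption, not the original author's code.
vectorizer = CountVectorizer()
transformer = TfidfTransformer()
# learn the vocabulary and raw counts from the training texts, then apply idf weighting
x_train_counts = vectorizer.fit_transform(x_train)
x_train_tfidf = transformer.fit_transform(x_train_counts)
# reuse the fitted vocabulary and idf weights on the test texts
x_test_counts = vectorizer.transform(x_test)
x_test_tfidf = transformer.transform(x_test_counts)
print(vectorizer.get_feature_names_out())
print(x_train_tfidf.toarray())
print(x_test_tfidf.toarray())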
from __future__ import print_function
from nltk.corpus import PlaintextCorpusReader
from nltk.text import TextCollection

# load all the files in the corpus root,
# and calculate tf, idf, and tf_idf on them, and on a specific term
if __name__ == "__main__":
    corpus_root = '../data/source_data'
    corpus = PlaintextCorpusReader(corpus_root, r'[a-zA-Z \-]*\.txt')
    ids = corpus.fileids()
    collection = TextCollection(corpus)

    # for x, word in enumerate(corpus.words(ids[0])[:200]):
    #     print(x, word)

    source = ids[0]
    term = corpus.words(source)[107]
    doc = corpus.words(ids[2])

    print("Source: ", source)
    print("TF of: ", term, ": ", collection.tf(term, doc))
    print("IDF of: ", term, ": ", collection.idf(term))
    print("tf_idf of:", term, ": ", collection.tf_idf(term, doc))
class CitationSearch:
    def __init__(self, pairs, mode='eng', stopwords_flag=True):
        self.pair_dict = {}
        self.ids = [pair[0] for pair in pairs]
        self.tfidfs = []
        self.mode = mode
        self.stopwords_flag = stopwords_flag
        docs = [pair[1] for pair in pairs]
        self.docs = [self.preprocess(doc) for doc in docs]
        for id, text in zip(self.ids, self.docs):
            self.pair_dict[id] = text
        self.corpus = TextCollection(self.docs)
        self.query = []

    def preprocess(self, raw):
        if self.mode == 'eng':
            return self.preprocess_eng(raw)
        if self.mode == 'ru':
            return self.preprocess_ru(raw)
        if self.mode == 'eng+ru':
            return self.preprocess_eng(raw) + self.preprocess_ru(raw)

    def preprocess_eng(self, raw):
        doc = []
        text = re.findall(words_eng, raw)
        for token in text:
            token = token.lower()
            if self.stopwords_flag:
                if token not in stopwords_eng:
                    doc.append(stemmer.stem(token))
            else:
                doc.append(stemmer.stem(token))
        return doc

    def preprocess_ru(self, raw):
        doc = []
        text = re.findall(words_ru, raw)
        for token in text:
            if self.stopwords_flag:
                if token not in stopwords_ru:
                    doc.append(morph.parse(token)[0].normal_form)
            else:
                doc.append(morph.parse(token)[0].normal_form)
        return doc

    def get_tf(self, term, document):
        return self.corpus.tf(term, document)

    def get_idf(self, term):
        return self.corpus.idf(term)

    @staticmethod
    def normalize_cosine(doc, doc_vecs):
        counter = Counter(doc)
        cosine_norm = np.sqrt(np.sum(np.array(list(dict(counter).values())) ** 2))
        doc_vector = np.array(doc_vecs) / cosine_norm
        return doc_vector

    def tfidf_docs(self):
        doc_vectors = []
        for doc in self.docs:
            doc_tfs = {}
            for term in doc:
                doc_tfs[term] = self.get_tf(term, doc) * self.get_idf(term)
            doc_vector = self.normalize_cosine(doc, list(doc_tfs.values()))
            doc_tfidfs = {}
            for term, vec in zip(doc_tfs, doc_vector):
                doc_tfidfs[term] = vec
            doc_vectors.append(doc_tfidfs)
        self.tfidfs = doc_vectors

    def tfidf_queries(self, query):
        self.query = self.preprocess(query)
        query_tfsidfs = {}
        for term in self.query:
            query_tfsidfs[term] = self.get_tf(term, self.query) * self.get_idf(term)
        return query_tfsidfs

    def query_relevance(self, query):
        tfidf = self.tfidf_queries(query)
        query_vec = list(tfidf.values())
        doc_vecs = []
        for doc in self.tfidfs:
            doc_vec = []
            for term_query in tfidf:
                if term_query in doc:
                    doc_vec.append(doc[term_query])
                else:
                    doc_vec.append(0)
            doc_vecs.append(doc_vec)
        cosines = []
        for vec in doc_vecs:
            if np.any(vec):
                cosines.append(1 - cosine(vec, query_vec))
            else:
                cosines.append(0)
        relevance_ids = [
            text_id for _, text_id in sorted(
                zip(cosines, self.ids), key=(lambda x: x[0]), reverse=True)
        ]
        cosines.sort(reverse=True)
        most_relevant = relevance_ids[0]
        relevant_candidates = [relevance_ids[0]]
        for cos in range(1, len(cosines)):
            if cosines[0] - cosines[cos] <= 0.000001:
                relevant_candidates.append(relevance_ids[cos])
        if len(relevant_candidates) > 1:
            tiebreaker = []
            for id in relevant_candidates:
                rel_text = self.pair_dict[id]
                absent_words = 0
                for word in rel_text:
                    if word not in self.query:
                        absent_words += 1
                tiebreaker.append(absent_words)
            relevant_candidates = [
                text_id for _, text_id in sorted(
                    zip(tiebreaker, relevant_candidates), key=(lambda x: x[0]))
            ]
            most_relevant = relevant_candidates[0]
        return most_relevant, cosines[0]
# return uniqueResult
########################################################################################################################
######### Test with topN words from all over the corpus ################################################################
# all_words = nltk.FreqDist(w for w in dialect.words())
# topN = all_words.most_common(4000)
########################################################################################################################
########## Test with getting topN with highest idf #####################################################################
uniqueWords = []
for i in text:
    if i not in uniqueWords:
        uniqueWords.append(i)
print('unique words count', len(uniqueWords))

wordsIDF = [(word, x.idf(word)) for word in uniqueWords]
# print('finished getting idf for all unique words', wordsIDF)
sortedidfs = sorted(wordsIDF, key=itemgetter(1), reverse=True)
# print('sortedidfs', sortedidfs)
print('length of words with idfs', len(sortedidfs))

eliminateBiggerThanOne = []
for i in sortedidfs:
    if i[1] < 1:
        eliminateBiggerThanOne.append(i)
print('eliminated bigger than one', eliminateBiggerThanOne)
print('length of elimination', len(eliminateBiggerThanOne))

topN = []
from nltk.text import TextCollection

# TextCollection expects a list of texts; tokenize each sentence so idf counts
# whole words rather than substrings.
mytext = TextCollection([
    'The cat hit that dog'.lower().split(),
    'This is a fat dog'.lower().split(),
    'I like this dog'.lower().split(),
    'I like to go to school'.lower().split(),
])
print(mytext.idf('this'))