def __vectorize(self, corpus):
    corpus = [list(self.__tokenize(doc)) for doc in corpus]
    texts = TextCollection(corpus)
    for doc in corpus:
        yield {term: texts.tf_idf(term, doc) for term in doc}
def compute_tfidf(text, filename):
    numPara = len(text)
    print "there should be this many para in the text file ", numPara
    colList = []
    paragraphWords = []
    for i in range(numPara):
        paragraphWords = word_tokenize(text[i])
        colList.append(paragraphWords)
    collection = TextCollection(colList)
    for paraList in colList:
        dict = {}
        for term in paraList:
            print term, "has weight: ", collection.tf_idf(term, paraList)
            dict[term] = collection.tf_idf(term, paraList)
        '''
        print "BEFORE <><><><><<><<>><><><><><><><>><>< ", type(dict)
        for key, value in dict.iteritems():
            print key, " ", value
        '''
        d = sortDict(dict)
        print "AFTER SORTED <><><><><<><<>><><><><><><><>><>< ", type(d)
        textFile = open(filename, "a")
        textFile.write("\n")
        for key, value in d:
            s = str(key) + "\t" + str(value) + "\n"
            textFile.write(s)
def sentenceAlignment(simpleParas, normalParas, pairedPara):
    for key, value in pairedPara.items():
        # key is simple and value in normal
        print "**********************************"
        print "PARAGRAPH"
        print "##################################"
        SPara = simpleParas[key]
        NPara = normalParas[value]
        # given two paragraphs, it returns a list of all the sentences where each
        # sentence is a list of words, with a list of simple sentence list and
        # normal sentence list
        colList, sslist, nslist = formSentenceList(SPara, NPara)
        collection = TextCollection(colList)
        dict = {}
        for sentence in colList:
            weight = 0
            for term in sentence:
                weight = collection.tf_idf(term, sentence)
                print "TERM -> ", term, "is", weight
                # what if the term is already in the dic, we need to add the weight
                if (term not in dict):
                    dict[term] = weight
                # dict[term] = weight
        # dict = sortDict(dict)
        print "================================================================"
        '''
def get_tf_idf_dict_nltk(
        self,
        column_type="review_body",
        save_path="tf_idf_value/hair_dryer_tf_idf_dict.csv"):
    '''
    ### nltk version
    it's super slow so don't use it
    '''
    reviews = self.raw_df[column_type].tolist()
    # get clean header
    reviews_list_cleaned = clean_tsv(reviews)
    # get all words
    words = set()
    for reviews in reviews_list_cleaned:
        for review in reviews:
            words.add(review)
    words = list(words)
    corpus = TextCollection(reviews_list_cleaned)
    tf_idf = []
    for word in words:
        tf_idf.append(corpus.tf_idf(word, corpus))
    df = pd.DataFrame({"word": words, "tf-idf": tf_idf})
    df.to_csv(save_path, encoding='utf-8')
def vectorize_t(corpus):
    # corpus = [tokenize(doc) for doc in corpus]
    texts = TextCollection(corpus)
    return {
        term: texts.tf_idf(term, corpus)
        for term in corpus
    }
def ranking(reuters, corpus, docids, palavras):
    '''Builds a ranking of the retrieved texts, the first being the most relevant

    Args:
        reuters: corpus from nltk
        corpus: dictionary mapping index to text
        docids: indices of the retrieved texts
        palavras: tokenized words of the query

    Returns:
        List with all the indices already ranked
    '''
    rank = {}
    tc = TextCollection(reuters)
    for e in docids:
        rank[e] = 0
        for i in palavras:
            rank[e] += tc.tf_idf(i, corpus[e])
    rank = {
        k: v
        for k, v in reversed(sorted(rank.items(), key=lambda item: item[1]))
    }
    return rank.keys()
def nltk_tf_idf(corpus_one, file_name):
    print('-----starting nltk_tf_idf')
    corpus_one = [nltk.word_tokenize(doc) for doc in corpus_one]
    texts = TextCollection(corpus_one)
    for doc in corpus_one:
        yield {term: texts.tf_idf(term, doc) for term in doc}
def getDomainUnigram(self, directory=None):
    collocations = set()  # collocation items
    ewordlists = list()   # list of lists of words
    # extract words from essays
    if directory is not None:
        doclist = os.listdir(directory)
        for essay in doclist:
            dir_essay = directory + '/' + essay
            etext = open(dir_essay, 'r').read()
            tokens = nltk.wordpunct_tokenize(etext)
            tokens = [word.lower() for word in tokens]
            # stemming
            if self._stemoption == True:
                st = PorterStemmer()
                tokens = [st.stem(t) for t in tokens]
            # extract the collocation for the given essay
            e_bigram = set(Mytext(tokens).collocations())
            collocations = collocations | e_bigram
            ewordlists.append(tokens)
    else:
        # using the mapped essays to calculate the candidate bigrams
        # need to call the mapessay function first
        for ins in self._data:
            if ins['essay'] is not None:
                etext = open(ins['essay'], 'r').read()
                tokens = nltk.wordpunct_tokenize(etext)
                tokens = [word.lower() for word in tokens]
                # stemming
                if self._stemoption == True:
                    st = PorterStemmer()
                    tokens = [st.stem(t) for t in tokens]
                # extract the collocation for the given essay
                e_bigram = set(Mytext(tokens).collocations())
                collocations = collocations | e_bigram
                ewordlists.append(tokens)

    # get collection of all essays under the specified directory / associated essays
    collection_text = TextCollection(ewordlists)

    itemlist = list()
    for (a, b) in collocations:
        itemlist.append(a)
        itemlist.append(b)
    itemlist = list(set(itemlist))

    word_idf = []
    for i in range(len(itemlist)):
        word_idf.append((collection_text.idf(itemlist[i]), itemlist[i]))
    word_idf = sorted(word_idf, key=operator.itemgetter(0))

    ave = 0
    if len(word_idf) != 0:
        ave = sum(map(operator.itemgetter(0), word_idf)) / len(word_idf)
    wlist = [j for (i, j) in word_idf if i < ave]
    return wlist
def tf_idf(self):
    corpus = [
        list(self.cr.tokenize_strip_punct(desc)) for desc in self.cr.texts()
    ]
    texts = TextCollection(corpus)
    for desc in corpus:
        yield {term: texts.tf_idf(term, desc) for term in desc}
def nltk_tfidf_vectorize(corpus):
    from nltk.text import TextCollection

    corpus = [list(tokenize(doc)) for doc in corpus]
    texts = TextCollection(corpus)
    for doc in corpus:
        yield {term: texts.tf_idf(term, doc) for term in doc}
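# A minimal usage sketch for the generator above (not from the original source).
# It assumes `tokenize` is any callable that yields tokens for a document string;
# here a hypothetical wrapper around nltk.word_tokenize stands in for it, and the
# two document strings are made up for illustration.
import nltk

def tokenize(doc):
    # hypothetical tokenizer assumed by nltk_tfidf_vectorize
    return nltk.word_tokenize(doc.lower())

docs = ["The cat sat on the mat.", "The dog chased the cat."]
for vector in nltk_tfidf_vectorize(docs):
    # each yielded dict maps a token to its tf-idf weight within that document
    print(sorted(vector.items(), key=lambda kv: kv[1], reverse=True)[:3])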
def compute_tf_idf(question, messages):
    import math

    texts = [question.keywords]
    total_length = 0
    for m in messages:
        total_length += len(m.keywords)
        text = Text(tokens=m.keywords)
        texts.append(text)
    text_collection = TextCollection(texts)

    question_tfidf_score = 0
    for k in question.keywords:
        tf_idf = text_collection.tf_idf(k, texts[0])
        question_tfidf_score += tf_idf
    if question_tfidf_score == 0:
        question_tfidf_score = 0.2
    if total_length == 0:
        total_length = 1
    length_factor = len(question.keywords) / total_length
    score = length_factor * math.log2(question_tfidf_score * 10)
    base_score = score
    if base_score == 0:
        base_score = 1
    print(question.content, question_tfidf_score, length_factor, score)
    print("^^^^^^^^^^^^^^^^^^^^^^^^^^")

    scores = []
    total_score = score
    print("Math", math)
    for i in range(0, len(messages)):
        tf_idf_i = 0
        for k in messages[i].keywords:
            tf_idf = text_collection.tf_idf(k, texts[i + 1])
            tf_idf_i += tf_idf
        if tf_idf_i == 0:
            continue
        length_factor = len(messages[i].keywords) / total_length
        score = length_factor * math.log2(tf_idf_i * 10)
        scores.append(score)
        total_score += score
        print(messages[i].content, tf_idf_i, length_factor, score)
        print("++++++++++++++++++++++++++++++++")
    # print(scores)

    averaged_scores = []
    last_message = question
    results = [last_message]
    for i in range(0, len(scores)):
        averaged_score = scores[i] / base_score
        averaged_scores.append(averaged_score)
        if averaged_score < 0.52:
            last_message.comments.append(messages[i])
        else:
            last_message = messages[i]
            results.append(last_message)
    print(averaged_scores)
    return results
def vectorize(corpus):
    corpus_tokenized = [list(tokenize(doc)) for doc in corpus]
    texts = TextCollection(corpus_tokenized)
    for doc in corpus_tokenized:
        # note: this return hands back only the first document's tf-idf dict
        return {
            term: texts.tf_idf(term, doc)
            for term in doc
        }
def tf_idf_vectorize_nltk(corpus):
    print(corpus)
    # corpus = [tokenize(doc) for doc in corpus]
    texts = TextCollection(corpus)
    print(texts)
    for doc in corpus:
        yield {
            term: texts.tf_idf(term, doc)
            for term in doc
        }
def build_text_collections():
    text_collections = {}
    sample_size = 4
    for category in ["news", "learned", "fiction"]:
        texts = []
        for fileid in nltk.corpus.brown.fileids(categories=category)[:sample_size]:
            texts.append(tokenize(nltk.corpus.brown.raw(fileid)))
        text_collections[category] = TextCollection(texts)
    text_collections["all"] = TextCollection(text_collections.values())
    return text_collections
class TextIndexer:
    __textCollection = None

    def __init__(self, documents):
        self.__textCollection = TextCollection(documents)

    def idf(self, term):
        return self.__textCollection.idf(term)

    def tf(self, term, text):
        return self.__textCollection.tf(term, text)
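# A minimal usage sketch for TextIndexer (not from the original source), assuming
# each document is passed as a list of tokens, which is the form TextCollection
# handles most predictably; the two token lists are made up for illustration.
docs = [
    ["the", "cat", "sat"],
    ["the", "dog", "barked"],
]
indexer = TextIndexer(docs)
print(indexer.idf("cat"))          # larger for terms that occur in few documents
print(indexer.tf("the", docs[0]))  # relative frequency of "the" in the first document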
def __init__(self, pairs, mode='eng', stopwords_flag=True):
    self.pair_dict = {}
    self.ids = [pair[0] for pair in pairs]
    self.tfidfs = []
    self.mode = mode
    self.stopwords_flag = stopwords_flag
    docs = [pair[1] for pair in pairs]
    self.docs = [self.preprocess(doc) for doc in docs]
    for id, text in zip(self.ids, self.docs):
        self.pair_dict[id] = text
    self.corpus = TextCollection(self.docs)
    self.query = []
def train(self, trainfile=None):
    print "training WeightedTweetClassifier"
    self.readTrainingData((trainfile or self.trainfile))
    for tweet in self.trainingTweets:
        # lowercase, remove punctuation
        nopunct = string.lower(
            tweet.tweet.translate(string.maketrans("", ""), string.punctuation))
        tweet.tweet = nopunct
    # add all Tweets to our TextCollection. This automatically creates a TF-IDF model
    self.textCollection = TextCollection(
        [tweet.tweet for tweet in self.trainingTweets])
def tfidf_extraction(self, subset=None):
    if subset is not None:
        data = self.data[subset]
    else:
        data = self.data
    get_idf = TextCollection(data.Tokenize.to_list())
    word_list = list(set([w for l in data.Tokenize.to_list() for w in l]))
    full_winfo = [[word, idf, tag[1]]
                  for word, idf, tag in zip(word_list,
                                            [get_idf.idf(i) for i in word_list],
                                            nltk.pos_tag(word_list))]
    self.keywords = pd.DataFrame(
        [i for i in full_winfo
         if i[2] in ["JJ", "NNP", "VBP", 'VBG', 'VBD', 'VBN', 'CD', 'NN',
                     'NNPS', 'RB', 'IN'] and not is_number(i[0])],
        columns=["word", "idf", "tag"]).sort_values(
            by="idf", ascending=True).reset_index(drop=False)
    self.full_words = pd.DataFrame(
        full_winfo, columns=["word", "idf", "tag"]).sort_values(
            by="idf", ascending=True).reset_index(drop=False)
    self.enable_topk = True
def run_main():
    text1 = 'I like the movie so much '
    text2 = 'That is a good movie '
    text3 = 'This is a great one '
    text4 = 'That is a really bad movie '
    text5 = 'This is a terrible movie'

    tf_analy = TextCollection([text1, text2, text3, text4, text5])

    new_text = 'That one is a good movie. This is so good!'
    word = 'That'
    tf_idf_val = tf_analy.tf_idf(word, new_text)
    print(tf_idf_val)
def Generate_keyword(obj, length):
    orig_file = './Data/' + obj + '/' + obj + '.xlsx'
    data = xlrd.open_workbook(filename=orig_file)
    sheet = data.sheet_by_index(1)
    review_head = np.array(sheet.col_values(12))[1:]
    review_body = np.array(sheet.col_values(13))[1:]
    review_all = []
    for i in range(length):
        review = review_head[i] + " " + review_body[i]
        review_all.append(review)
    review_all = np.array(review_all)

    # make review tokens
    tokens = []
    for i, review in enumerate(review_all):
        review = review.lower()
        replacer = RegexpReplacer()
        review = replacer.replace(review)
        remove = str.maketrans('', '', string.punctuation)
        review = review.translate(remove)
        token = nltk.word_tokenize(review)
        token = [w for w in token
                 if w == 'not' or not w in stopwords.words('english')]
        s = nltk.stem.SnowballStemmer('english')
        token = [s.stem(ws) for ws in token]
        tokens.append(token)

    token_file = './Data/' + obj + '/tokens.pkl'
    f = open(token_file, 'wb')
    pickle.dump(tokens, f)
    f.close()

    corpus = TextCollection(tokens)
    tf = {}
    tf_idf = {}
    for review in tokens:
        for word in review:
            if word not in tf:
                tf_ = corpus.tf(word, corpus)
                tf[word] = tf_
            if word not in tf_idf:
                tf_idf_ = corpus.tf_idf(word, corpus)
                tf_idf[word] = tf_idf_
    tf_sorted = sorted(tf.items(), key=lambda item: item[1], reverse=True)
    tf_idf_sorted = sorted(tf_idf.items(), key=lambda item: item[1], reverse=True)
    pd.DataFrame(tf_sorted).to_csv('./Data/' + obj + '/tf_sorted.csv')
    pd.DataFrame(tf_idf_sorted).to_csv('./Data/' + obj + '/tf_idf_sorted.csv')
def calculate_idf(words, corpus):
    """
    Calculate the idf of words by using a corpus

    :param words: The words to calculate their idf
    :param corpus: The corpus to use in the calculation
    :return: dict of {word: idf}
    """
    words = set(words)
    # print("Loading corpus to calculate idf...")
    corpus_collection = TextCollection(corpus)
    idfs = {}
    for word in words:
        idfs[word] = corpus_collection.idf(word)
    return idfs
def computeTFIDF_text(texts, singletext):
    # texts is a list of sentence strings (the corpus); singletext is a single sentence string
    texts = [nltk.word_tokenize(text) for text in texts]  # tokenize every sentence in the corpus
    corpus = TextCollection(texts)
    words = nltk.word_tokenize(singletext)  # word list
    tfidf_words = {}
    # compute tf-idf
    for word in words:
        # idf
        idf = corpus.idf(word)
        # tf
        tf = corpus.tf(word, words)
        tfidf = idf * tf
        tfidf_words[word] = tfidf
    return tfidf_words
def preprocess(self, text):
    # text = text.split(" ");
    text = word_tokenize(text)
    if self.display:
        print "After Tokenizing"
        print text
        print "\n\n"
    text = [w.strip().lower() for w in text
            if not w.strip() in ENGLISH_STOPWORDS and len(w.strip()) > 2]
    tc = TextCollection([text])
    words = list(set(tc))
    word_tf = {word: tc.tf(word, text) * len(text) for word in words}
    return word_tf
def getEmmaChapter():
    from nltk.text import TextCollection
    # from nltk.text import *
    import nltk
    # from nltk.book import text1, text2, text3

    gutenberg = TextCollection(nltk.corpus.gutenberg)
    # ----- IDF EXAMPLE ----
    # print(gutenberg.idf('Dick'))
    # ----- IDF EXAMPLE ----

    i = 2  # line 2 to line 166 is chapter 1
    emma = nltk.corpus.gutenberg.sents('austen-emma.txt')
    # for l in emma:
    chapterText = ''
    while i < 167:
        # print(str(i) + ': ')
        k = 0
        l = emma[i]
        line = ''
        for w in l:
            line += l[k] + ' '
            k = k + 1
        # print(str(i) + ': ' + line + '\n')
        chapterText += line + '\n'
        i = i + 1
    print(chapterText)
    return
def attrexplore(corpus):
    # s = "in douglas r. stinson, editor,.proc. crypto 93,.lecture notes in computer science no. 773..pages 278-291..1994..avrim blum, merrick furst, michael kearns, and richard j. lipton..springer,.cryptographic primitives based on hard learning problems.."
    # ss = SenToken(raw=s)
    # print(ss)
    # for sent in ss:
    #     print(sent)
    nltkCorpus = TextCollection(corpus)
    print(nltkCorpus.idf(term='this'))
    print(idf(term='this', corpus=corpus))
    print(nltkCorpus.tf(term='this', text='this is sentence four'))
    print(tf_idf(term='this', doc='this is sentence four', corpus=corpus))
    fdist = nltk.FreqDist(WordTokener(sent=corpus[0]))
    print(fdist.tabulate())
def main():
    # Get text from folder or file
    # TODO Change the folder corpus to the upper level!
    texts = load_text_data(config_path)
    if not texts:
        print "No texts found"
        return

    # Dictionary that will hold all the ngrams and their values, for each measure (dict of dicts)
    scored_ngrams = {}

    # Create a list of Document objects with the texts. Pretreat them also.
    list_documents = []
    for label, text in texts.iteritems():
        list_documents.append(Document(text, stem=config_stem, name=label))
    # list_documents = TextCollection([Document(text, stem=config_stem, name=label)
    #                                  for label, text in texts.items()][:])
    list_documents = TextCollection(list_documents)

    global config_ngram
    if config_ngram == 0:
        config_ngram = 1

    ######################################### N GRAM EXTRACTION #################################################
    # Now do the ngram extraction
    for ng in range(2, config_ngram + 1):
        ngrams = get_any_ngrams(list_documents, ngram=ng, k=config_top_k,
                                min_tok_len=config_min_tok_len,
                                min_freq=config_min_tok_freq)
        scored_ngrams = update_dict_values(scored_ngrams, ngrams)

    scored_ngrams = update_dict_values(scored_ngrams,
                                       get_concordances(list_documents, scored_ngrams))
    make_tables(scored_ngrams, results_folder=config_output)
    return
def getTopic2(text):
    # clean input
    stop = open('stopwords.txt').read()
    l = []
    src = [w.strip(" .,?!") for w in nltk.word_tokenize(text.lower())
           if w not in stop]
    candidates = nltk.FreqDist(w for w in src if len(w) > 3)
    candidates = candidates.keys()[:10]

    # initialize vectors
    brown = TextCollection(nltk.corpus.brown)
    for w in candidates:
        l.append((w, brown.tf_idf(w, candidates)))
    vectors = [array(l)]

    # initialize the clusterer
    clusterer = nltk.cluster.kmeans.KMeansClusterer(10, euclidean_distance)
    clusterer.cluster(vectors, True)

    # pick the one closest to the center of the largest
    clusterer.Means().Max()
    o = [l for l in clusterer.Means()]
    # o = [(clusterer.classify(l.index(i)), l.index(i)) for i in range(len(l))]
    o.reverse()
    print o.pop().index(1)
def text_classification():
    """
    Text classification
    :return:
    """
    text1 = 'I like the movie so much '
    text2 = 'That is a good movie '
    text3 = 'This is a great one '
    text4 = 'That is a really bad movie '
    text5 = 'This is a terrible movie'

    # build the TextCollection object
    tc = TextCollection([text1, text2, text3, text4, text5])
    new_text = 'That one is a good movie. This is so good!'
    word = 'That'
    tf_idf_val = tc.tf_idf(word, new_text)
    print('TF-IDF value of {}: {}'.format(word, tf_idf_val))
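# Note on run_main and text_classification above: because raw strings are passed
# in, TextCollection iterates characters and tf_idf counts substring matches, so
# the printed value is not a word-level TF-IDF. A minimal sketch of the tokenized
# variant (same texts; the nltk.word_tokenize step is the addition):
import nltk
from nltk.text import TextCollection

docs = ['I like the movie so much', 'That is a good movie',
        'This is a great one', 'That is a really bad movie',
        'This is a terrible movie']
tc = TextCollection([nltk.word_tokenize(d) for d in docs])
new_tokens = nltk.word_tokenize('That one is a good movie. This is so good!')
print(tc.tf_idf('That', new_tokens))  # word-level tf-idf of "That" in the new text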
def retrieve_results(n_percentile):
    search_queries = parse_trec('documents/irg_queries.trec')
    search_collections = parse_trec('documents/irg_collection_clean.trec')
    # search_collections = parse_trec('documents/irg_collection_short.trec')
    # search_collections = eliminate_stopwords(search_collections)
    # write_collection_doc(search_collections, 'documents/irg_collection_clean.trec')

    print('======= Statistics =======')
    print(f'Queries: {len(search_queries)}')
    print(f'Collections: {len(search_collections)}')
    print(f'Removal of {int((1-n_percentile)*100)}%-ile')
    print('==========================')

    # TF-IDF
    document_results = []
    for search_query_id, search_query_text in search_queries.items():
        print(f'Current query id: {search_query_id}, text: "{search_query_text}"')
        terms = search_query_text.split(' ')
        documents = keep_n_percentile_most_relevant_words(search_collections,
                                                          search_query_text,
                                                          n=n_percentile)
        document_scores = {}
        search_texts_collection = TextCollection(documents.values())
        for document_id, document_text in documents.items():
            for term in terms:
                current_score = document_scores.get(document_id, 0.0)
                document_scores[document_id] = current_score + \
                    search_texts_collection.tf_idf(term, document_text)
        rank = 1
        for document_id, document_scores in sorted(document_scores.items(),
                                                   key=lambda kv: kv[1],
                                                   reverse=True):
            if rank <= 1000:
                document_results.append(
                    Result(search_query_id, document_id, rank, document_scores))
            rank += 1

    result_writer(document_results,
                  f'IE_result_keep_{int(n_percentile*100)}_percentile.trec')
    print('Done')
def compute_tf_idf_similarity(query: str, content: str, type: str) -> float:
    """
    Compute the mean tf-idf or tf similarity for one sentence with multiple query words.

    :param query: a string containing all key words, separated by single spaces
    :param content: string list with every content relevant to this query.
    :return: average tf-idf or tf similarity.
    """
    sents = [word_tokenize(content), word_tokenize("")]  # add one empty file to smooth.
    corpus = TextCollection(sents)  # build the corpus
    result_list = []
    for key_word in query.strip(" ").split(" "):
        if type == "tf_idf":
            result_list.append(corpus.tf_idf(key_word, corpus))
        elif type == "tf":
            result_list.append(corpus.tf(key_word, corpus))
        else:
            raise KeyError
    return sum(result_list) / len(result_list)
def compute_tfidf(text, filename):
    numPara = len(text)
    print "there should be this many para in the text file ", numPara
    colList = []
    paragraphWords = []
    for i in range(numPara):
        paragraphWords = word_tokenize(text[i])
        colList.append(paragraphWords)
    collection = TextCollection(colList)
    for paraList in colList:
        dict = {}
        for term in paraList:
            dict[term] = collection.tf_idf(term, paraList)
        d = sortDict(dict)
        textFile = open(filename, "a")
        textFile.write("\n")
        for key, value in d:
            s = str(key) + "\t" + str(value) + "\n"
            textFile.write(s)
def __init__(self, doc: str, vec_size: int, alpha=0.06):
    @has_vec_set(doc)
    def get_vec_set(doc_vec):
        res = {}
        cur_line = 0
        # tc = TextCollection(self.doc)
        while 1:
            try:
                cur_words = tf_idf_sort(doc_vec.doc, doc_vec.tc, cur_line)
                for w, v in cur_words:
                    if w in res:
                        res[w] = max(res[w], v)
                    else:
                        res[w] = v
            except IndexError:
                break
            cur_line += 1
            print("{} \r".format(cur_line), end='')
        return res

    self.doc = read_comments(doc)
    self.tc = TextCollection(self.doc)
    self.vec_set = get_vec_set(self)
    self.vec_set = [(w, self.vec_set[w]) for w in self.vec_set]
    self.vec_set = DataFrame(self.vec_set)
    Max = self.vec_set[1].max()
    Min = self.vec_set[1].min()
    self.vec_set[1] = self.vec_set[1].apply(lambda x: (x - Min) / (Max - Min))
    self.vec_set[1] = self.vec_set[1].apply(lambda x: x * (1 - alpha))
    self.vec_set = zip(self.vec_set[0], self.vec_set[1])
    self.vec_set = {w: v for w, v in self.vec_set}

    G = Graph(doc, True)
    tex_rank_key_word = DataFrame(key_word(G, 10, 5000))
    Min = tex_rank_key_word[1].min()
    Max = tex_rank_key_word[1].max()
    tex_rank_key_word[1] = tex_rank_key_word[1].apply(
        lambda x: alpha * (x - Min) / (Max - Min))
    tex_rank_key_word = list(zip(tex_rank_key_word[0], tex_rank_key_word[1]))
    self.vec_set = [(w, self.vec_set[w]) for w, v in tex_rank_key_word
                    if self.vec_set[w] >= alpha]
    # for w, v in tex_rank_key_word:
    #     if w in self.vec_set:
    #         self.vec_set[w] += v
    #     else:
    #         self.vec_set[w] = v
    # self.vec_set = sorted([(w, self.vec_set[w]) for w in self.vec_set], key=lambda x: x[1], reverse=True)
    self.vec_set = sorted(self.vec_set, key=lambda x: x[1], reverse=True)
    print(len(self.vec_set))
    self.vec_size = vec_size
def train_NB_tfidf_nltk(train_data, test_data, all_rev):
    all_rev = [nltk.word_tokenize(rev) for rev in all_rev]
    corpus = TextCollection(all_rev)
    labels = train_data['label']
    train_rev = train_data['review']
    test_rev = test_data['review']  # assumed column, mirroring train_data['review']
    ID = test_data['ID']
    lab = get_lab(labels)

    fs_train = []
    print(train_rev[0])
    for i in range(0, len(train_rev)):
        cut_rev = nltk.word_tokenize(train_rev[i])
        fs_dict = {}
        for j in range(0, len(cut_rev)):
            fs_dict[cut_rev[j]] = corpus.tf_idf(cut_rev[j], train_rev[i])
        fs_train.append((fs_dict, int(lab[i])))

    fs_test = []
    for i in range(0, len(test_rev)):
        cut_rev = nltk.word_tokenize(test_rev[i])
        fs_dict = {}
        for j in range(0, len(cut_rev)):
            fs_dict[cut_rev[j]] = corpus.tf_idf(cut_rev[j], test_rev[i])
        fs_test.append(fs_dict)

    classifier = nltk.NaiveBayesClassifier.train(fs_train)
    label = 1
    train_score = []
    test_score = []
    for i in range(0, len(fs_train)):
        dist = classifier.prob_classify(fs_train[i][0])
        train_score.append(dist.prob(label))
    train_score = np.array(train_score, dtype="float32")
    for i in range(0, len(fs_test)):
        dist = classifier.prob_classify(fs_test[i])
        test_score.append(dist.prob(label))
    test_score = np.array(test_score, dtype="float32")

    print("AUC: ", cal_auc(train_score, lab))
    result = pd.DataFrame({'ID': ID.T, 'Pred': test_score.T})
    result.to_csv("./result.csv", index=None)
csv_read = pd.read_csv("positive-words.csv", header=0) positive_words = list(csv_read.Positive) csv_read = pd.read_csv("stopwords.csv", header=0) stop = list(csv_read.stopwords) negation = ['no','not','never','n\'t','cannot'] intensify = ['very','really','extremely','absolutely','highly'] """ Create a corpus of text """ reviews=[] for z in reviewsx: n=''.join(x for x in z if x in string.printable) o=' '.join(n.split()) reviews.append(o) reviewcollection = TextCollection(word_tokenize(r) for r in reviews) #package a list of tokenized reviews reviewset = [word_tokenize(r) for r in reviews] """ add the pos/neg lists to a coded dictionary """ subj_dict = {} for w in negative_words: subj_dict[w] = 'NEG' for w in positive_words: subj_dict[w] = 'POS' rating_dict = {} rating_dict['NEG']= -1 rating_dict['IRR']= 0 rating_dict['POS']= 1 rating_dict['negate']= 2
print "Finding forms for the top " + str(no_of_topwords) + \ " words by edit distance " + \ str(editdistance) + "; this may take a while!" xmlcollection.get_words_by_editdistance(editdistance=editdistance, no_of_most_freq=no_of_topwords) # Write the found sets to disk; also write most frequent words to disk. xmlcollection.write_words_by_editdistance(editdistance=editdistance) xmlcollection.write_topwords(no_of_words=no_of_topwords) print "Top words written to disk." # XXX: BIG F**K UP ################################## FIX FIX FIX ##### # Print idf, tf and tf-idf values for the term "CCC", in document # no. 42 - for testing. nltk_textcollection = TextCollection(xmlcollection.get_words()) print "idf: " + str(nltk_textcollection.idf("CCC")) print "tf: " + str(nltk_textcollection.tf("CCC", TextCollection(xmlcollection.get_doc(42).get_tokens()))) print "tf_idf: " + str(nltk_textcollection.tf_idf("CCC", TextCollection(xmlcollection.get_doc(42).get_tokens()))) # Do that now systematically for all documents print "Document where tf is bigger 0:" cnt = 0 for doc in xmlcollection.get_docs(): tf = nltk_textcollection.tf("CCC", TextCollection(doc.get_tokens())) stdout.write(str(tf) + ", ") cnt += 1 if cnt == 10: print
def alignText(simpleParas, normalParas, pairedPara):
    # print simpleParas, len(simpleParas)
    # print normalParas, len(normalParas)
    for key, value in pairedPara.items():
        # key is simple and value in normal
        SPara = simpleParas[key]
        NPara = normalParas[value]
        print "=================Paragraphs were above======================================"
        # given two paragraphs, it returns a list of all the sentences where each
        # sentence is a list of words, with a list of simple sentence list and
        # normal sentence list
        colList, sslist, nslist = formSentenceList(SPara, NPara)
        collection = TextCollection(colList)
        # this is a list of Word object
        wordsWithWeight = []
        dict = {}
        for sentence in colList:
            weight = 0
            for term in sentence:
                if term not in PUNCTLIST or term not in STOPWORDS or term not in commonAuxilaryVerbs:
                    weight = collection.tf_idf(term, sentence)
                    # what if the term is already in the dic, we need to add the weight
                    if (term not in dict):
                        w = Word(term, "", "")
                        w.setWeight(weight)
                        wordsWithWeight.append(w)
                        # dict[term] = weight
                    # dict[term] = weight
        # dict = sortDict(dict)

        temp = []
        for sentence in sslist:
            tokSen = word_tokenize(sentence)
            temp.append(tokSen)
        sslist = temp
        temp = []
        for sentence in nslist:
            tokSen = word_tokenize(sentence)
            temp.append(tokSen)
        nslist = temp

        for simpleLine in sslist:
            stringSimpleLine = listToString(simpleLine)
            # semantic part
            simplefilename = "sentence1.txt"
            SFile = open(simplefilename, "w+")
            SFile.write(stringSimpleLine)
            SFile.close()
            parseFile("sentence1.txt")
            # if failed to parse, skip this sentence and continue
            if verifyParsedFile("parsedsentence1.txt") == False:
                continue
            buildClause("parsedsentence1.txt", "one")
            # end semantic part
            maxSimilarity = 0
            for normalLine in nslist:
                stringNormalLine = listToString(normalLine)
                # semantic part
                normalfilename = "sentence2.txt"
                NFile = open(normalfilename, "w+")
                NFile.write(stringNormalLine)
                NFile.close()
                parseFile("sentence2.txt")
                # check whether parsing was done properly
                # if failed to parse, skip this sentence and continue
                if verifyParsedFile("parsedsentence2.txt") == False:
                    continue
                # end semantic part
                # buildClause("parsedsentence1.txt", "one")
                buildClause("parsedsentence2.txt", "two")
                sentence1Words = []
                sentence2Words = []
                # makeContextFile(n1, v1, n2, v2)
                sentence1Words, sentence2Words = makeContextFile(n1, v1, n2, v2)
                # allWords is a dictionary of words:tfidf. I converted this to a
                # dictionary from a list of wordsWithWeight for convenience
                allWords = {}
                for w in wordsWithWeight:
                    allWords[w.getValue()] = w.getWeight()
                numerator1 = 0
                denominator1 = 0
                for word in sentence1Words:
                    if (word.getValue() in allWords):
                        tfidf = allWords[word.getValue()]
                        semanticWeight = word.getWeight()
                        numerator1 = numerator1 + (semanticWeight * tfidf)
                        denominator1 = denominator1 + allWords[word.getValue()]
                if (denominator1 == 0):
                    denominator1 = 1
                partA = numerator1 / denominator1
                numerator2 = 0
                denominator2 = 0
                for word in sentence2Words:
                    # print "dic index:->", word.getValue(), "value: ", allWords[word.getValue()]
                    if (word.getValue() in allWords):
                        tfidf = allWords[word.getValue()]
                        semanticWeight = word.getWeight()
                        numerator2 = numerator2 + (semanticWeight * tfidf)
                        denominator2 = denominator2 + allWords[word.getValue()]
                if (denominator2 == 0):
                    denominator2 = 1
                partB = numerator2 / denominator2
                SIMILARITY = (partA + partB) / 2
                print "><><><><><><><><><><><><><><><><><><><><><><"
                print stringSimpleLine
                print "--------------------------------------------"
                print stringNormalLine
                print "Similarity Score -----> ", SIMILARITY
                print "><><><><><><><><><><><><><><><><><><><><><><"
def write_tfidf_file(xmlcollection, nltk_textcollection):
    """
    Writes a tf*idf matrix file with all tf*idf values for each
    document, row by row. The columns represent the (alphabetically
    ordered) stems available in the whole collection.

    @param xmlcollection: Collection of XML documents, type collection
    @param nltk_textcollection: NLTK TextCollection of all the stems
    """
    idf_file = get_stems_file(measure="_idf")
    avg_words_per_doc = len(xmlcollection.get_words()) / \
        len(xmlcollection.get_docs())

    if not exists(idf_file):
        write_idf_file(xmlcollection, nltk_textcollection)
    idf_dict = DictFromFile(idf_file)

    tfidf_dict = dict()
    high_tfidf_stems = set()
    collection_stems = list(xmlcollection.get_stems(uniq=True))
    print "Length of collection, all stems:", len(collection_stems)

    # Remove most frequent (idf<2) / stop stems (or qualifying
    # as such), and most rare stems (max(idf)), as they are of no
    # help to separate / make up clusters
    collection_stems = get_classification_stems(collection_stems, idf_dict)
    print "Length of collection, cluster stems:", len(collection_stems)

    f = open(get_tfidf_matrix_file(), "w", get_def_enc())
    for doc in xmlcollection.get_docs():
        doc_stems = doc.get_stems()
        col = TextCollection("")
        stdout.write(doc.get_id())
        idf_row = ""
        stdout.write(" (")
        for stem in sorted(collection_stems):
            tf = col.tf(stem, doc_stems)
            # Reweight tf values, to get more classification words
            # and compensate for the very different document sizes
            # available
            # Idea: Accounts for average document length, but also for
            #       the number of times a word effectively occurs in a
            #       specific document; other variations can be thought of
            #       (using log) or maximal tf values
            # Note: The clustering works better with (in general)
            #       smaller values
            if tf > 0.0:
                tf = 1.0 / avg_words_per_doc * tf
            # If nothing applies: tf is 0.0
            tfidf = tf * float(idf_dict[stem])
            tfidf_dict[stem] = tfidf
            # We may find here some threshold that makes sense
            if (tfidf > 0.0):
                stdout.write(stem + ", ")
                high_tfidf_stems.add(stem)
            idf_row += str(tfidf) + " "
        f.write(idf_row + "\n")
        stdout.write(")\n")
    f.close()
    print "List length of high value tf*idf terms:", len(high_tfidf_stems)

    sorted_tfidf_dict = \
        sorted(tfidf_dict.iteritems(), reverse=True, key=operator.itemgetter(1))
    f = open(get_stems_file(measure="_tfidf_sorted"), "w", get_def_enc())
    for pair in sorted_tfidf_dict:
        f.write(str(pair[1]) + " " + pair[0] + "\n")
    f.close()
import pymongo
from pymongo import Connection

MONGODB_PORT = 27017

import nltk
from nltk.corpus import brown
from nltk.text import TextCollection

mongodb = Connection("localhost", MONGODB_PORT)['cablegate']
browntext = TextCollection(brown.words(categories=['news', 'government']))

count = 0
for ng in mongodb.ngrams.find(timeout=False):
    mongodb.ngrams.update(
        {"_id": ng["_id"]},
        {"$set": {
            "tfidf": browntext.tf_idf(ng['label'],
                                      brown.words(categories=['news', 'government']))
        }})
    count += 1
print "updated tfidf for %d topics" % count
class WeightedTweetClassifier(TweetClassifier):
    """
    Basic idea:
        train TF-IDF model on training data
        filter out all words that we do not have clues for
        multiply all remaining term weights with the corresponding
        clues (+1, -1, 0), and sum the results
    """

    def __init__(self, dictfile=None, trainfile=None, datafile=None, outfile=None):
        # Call the superclass constructor
        super(WeightedTweetClassifier, self).__init__(trainfile, datafile, outfile)
        self.stemmer = PorterStemmer()
        self.trainfile = trainfile
        self.datafile = datafile
        self.outfile = outfile

        # this contains the clues we were given: {"clue": 1.0, "clue2": -1.0 ... }
        self.clueValues = {}

        # the NLTK TextCollection class is used because it provides TF-IDF functionality.
        self.textCollection = None

        # read the clues
        self.readDictionary(dictfile)

        # for saving sentiment scores, so they can be meaningfully used later on
        # by e.g. the Joint Classifier
        self.scores = {}

    def readDictionary(self, dictfile=None):
        """
        read the dictionary file. +1, -1 or 0 is saved as a sentiment for each
        (stemmed) term in self.clueValues
        TODO: maybe we don't want to stem, but instead use the provided POS tags?
              could be a separate classifier though
        """
        with open(dictfile, "r") as dictdata:
            for line in dictdata.readlines():
                fields = line.split(" ")
                token = self.stemmer.stem(fields[2].split("=")[1].strip())
                polarity = fields[5].split("=")[1].strip()
                self.clueValues[token] = (1.0 if polarity == "positive"
                                          else (-1.0 if polarity == "negative" else 0.0))

    def train(self, trainfile=None):
        print "training WeightedTweetClassifier"
        self.readTrainingData((trainfile or self.trainfile))
        for tweet in self.trainingTweets:
            # lowercase, remove punctuation
            nopunct = string.lower(
                tweet.tweet.translate(string.maketrans("", ""), string.punctuation))
            tweet.tweet = nopunct
        # add all Tweets to our TextCollection. This automatically creates a TF-IDF model
        self.textCollection = TextCollection(
            [tweet.tweet for tweet in self.trainingTweets])

    def classifyTweets(self, datafile=None, outfile=None):
        print "reading dataset"
        self.readDataset(datafile)
        print "classifying Tweets with weighted classifier"
        for tweet in self.evalTweets:
            # score = sum of TF-IDF weighted terms which carry sentiment
            tokens = string.lower(
                tweet.tweet.translate(string.maketrans("", ""),
                                      string.punctuation)).split(" ")
            score = sum([self.textCollection.tf_idf(token, tweet.tweet) *
                         self.clueValues.get(self.stemmer.stem(token), 0)
                         for token in tokens])
            self.scores[(tweet.id1, tweet.id2)] = score

            # Any score very close or equal to 0 is judged to be neutral.
            tweet.sentiment = ("neutral" if abs(score) < 0.01
                               else ("negative" if score < 0 else "positive"))
def __init__(self, documents): self.__textCollection = TextCollection(documents)
from __future__ import print_function
from nltk.corpus import PlaintextCorpusReader
from nltk.text import TextCollection

# load all the files in the corpus root,
# and calculate tf, idf, and tf_idf on them, and on a specific term
if __name__ == "__main__":
    corpus_root = '../data/source_data'
    corpus = PlaintextCorpusReader(corpus_root, '[a-zA-Z \-]*\.txt')
    ids = corpus.fileids()
    collection = TextCollection(corpus)

    # for x, word in enumerate(corpus.words(ids[0])[:200]):
    #     print(x, word)

    source = ids[0]
    term = corpus.words(source)[107]
    doc = corpus.words(ids[2])

    print("Source: ", source)
    print("TF of: ", term, ": ", collection.tf(term, doc))
    print("IDF of: ", term, ": ", collection.idf(term))
    print("tf_Idf of:", term, ": ", collection.tf_idf(term, doc))
# USE THIS SECTION FOR TESTING
# extract all words (IN TESTING)
if test:
    wfile = open('words-list.txt', 'r')
    for line in wfile:
        words.append(line.strip())
    wfile.close()

# print some more information
print '\nNumber of tweets: ' + str(len(tweets))
print 'Number of words occurring >1 time: ' + str(len(words))
print 'Number of words occurring 1 time: ' + str(len(words1))

# create .arff file for Weka
texts = TextCollection(tweets)
arff = open('tweets_sentiment.arff', "w")
wc = 0

# header
arff.write("@relation sentiment_analysis\n\n")
arff.write("@attribute numPosEmots numeric\n")
arff.write("@attribute numNegEmots numeric\n")
arff.write("@attribute numQuest numeric\n")
arff.write("@attribute numExclam numeric\n")
arff.write("@attribute numPosGaz numeric\n")
arff.write("@attribute numNegGaz numeric\n")
for word in words:
    arff.write("@attribute word_")
    sub_w = re.subn('[^a-zA-Z]', 'X', word)
    arff.write(sub_w[0])
from nltk.text import TextCollection

f = open("cant.txt", "r")
cont = f.read()
emails = cont.split('GROUP')
words = [email.replace('\n', ' ').split() for email in emails]
f.close()

generator = TextCollection(words)
generator.generate(150)
from nltk.text import TextCollection

f = open("bible.txt", "r")
cont = f.read()
emails = cont.split('BOOK OF ')
words = [email.replace('\n', ' ').split() for email in emails]
# print words
f.close()

generator = TextCollection(words)
# generator.generate(10)
# generator.generate(25)
generator.generate(1000)
from nltk.text import TextCollection

f = open("cuil.txt", "r")
cont = f.read()
emails = cont.split('Cuils')
words = [email.replace('\n', ' ').split() for email in emails]
f.close()

generator = TextCollection(words)
generator.generate(80)