def compute_tfidf(text, filename):
    numPara = len(text)
    print "there should be this many para in the text file ", numPara
    colList = []
    paragraphWords = []
    for i in range(numPara):
        paragraphWords = word_tokenize(text[i])
        colList.append(paragraphWords)
    collection = TextCollection(colList)
    for paraList in colList:
        dict = {}
        for term in paraList:
            print term, "has weight: ", collection.tf_idf(term, paraList)
            dict[term] = collection.tf_idf(term, paraList)
        '''
        print "BEFORE <><><><><<><<>><><><><><><><>><>< ", type(dict)
        for key, value in dict.iteritems():
            print key, " ", value
        '''
        d = sortDict(dict)
        print "AFTER SORTED <><><><><<><<>><><><><><><><>><>< ", type(d)
        textFile = open(filename, "a")
        textFile.write("\n")
        for key, value in d:
            s = str(key) + "\t" + str(value) + "\n"
            textFile.write(s)
def compute_tf_idf(question, messages):
    import math
    texts = [question.keywords]
    total_length = 0
    for m in messages:
        total_length += len(m.keywords)
        text = Text(tokens=m.keywords)
        texts.append(text)
    text_collection = TextCollection(texts)
    question_tfidf_score = 0
    for k in question.keywords:
        tf_idf = text_collection.tf_idf(k, texts[0])
        question_tfidf_score += tf_idf
    if question_tfidf_score == 0:
        question_tfidf_score = 0.2
    if total_length == 0:
        total_length = 1
    length_factor = len(question.keywords) / total_length
    score = length_factor * math.log2(question_tfidf_score * 10)
    base_score = score
    if base_score == 0:
        base_score = 1
    print(question.content, question_tfidf_score, length_factor, score)
    print("^^^^^^^^^^^^^^^^^^^^^^^^^^")
    scores = []
    total_score = score
    print("Math", math)
    for i in range(0, len(messages)):
        tf_idf_i = 0
        for k in messages[i].keywords:
            tf_idf = text_collection.tf_idf(k, texts[i + 1])
            tf_idf_i += tf_idf
        if tf_idf_i == 0:
            continue
        length_factor = len(messages[i].keywords) / total_length
        score = length_factor * math.log2(tf_idf_i * 10)
        scores.append(score)
        total_score += score
        print(messages[i].content, tf_idf_i, length_factor, score)
        print("++++++++++++++++++++++++++++++++")
    # print(scores)
    averaged_scores = []
    last_message = question
    results = [last_message]
    for i in range(0, len(scores)):
        averaged_score = scores[i] / base_score
        averaged_scores.append(averaged_score)
        if averaged_score < 0.52:
            last_message.comments.append(messages[i])
        else:
            last_message = messages[i]
            results.append(last_message)
    print(averaged_scores)
    return results
def nltk_tf_idf(corpus_one, file_name):
    print('-----starting nltk_tf_idf')
    corpus_one = [nltk.word_tokenize(doc) for doc in corpus_one]
    texts = TextCollection(corpus_one)
    for doc in corpus_one:
        yield {term: texts.tf_idf(term, doc) for term in doc}
def vectorize_t(corpus):
    # corpus = [tokenize(doc) for doc in corpus]
    texts = TextCollection(corpus)
    return {
        term: texts.tf_idf(term, corpus)
        for term in corpus
    }
def sentenceAlignment(simpleParas, normalParas, pairedPara):
    for key, value in pairedPara.items():
        # key is simple and value is normal
        print "**********************************"
        print "PARAGRAPH"
        print "##################################"
        SPara = simpleParas[key]
        NPara = normalParas[value]
        # given two paragraphs, it returns a list of all the sentences where each sentence
        # is a list of words, with a list of simple sentence list and normal sentence list
        colList, sslist, nslist = formSentenceList(SPara, NPara)
        collection = TextCollection(colList)
        dict = {}
        for sentence in colList:
            weight = 0
            for term in sentence:
                weight = collection.tf_idf(term, sentence)
                print "TERM -> ", term, "is", weight
                # what if the term is already in the dict, we need to add the weight
                if term not in dict:
                    dict[term] = weight
                # dict[term] = weight
        #dict = sortDict(dict)
        print "================================================================"
        '''
def __vectorize(self, corpus):
    corpus = [list(self.__tokenize(doc)) for doc in corpus]
    texts = TextCollection(corpus)
    for doc in corpus:
        yield {term: texts.tf_idf(term, doc) for term in doc}
def get_tf_idf_dict_nltk(
        self, column_type="review_body",
        save_path="tf_idf_value/hair_dryer_tf_idf_dict.csv"):
    '''
    ### nltk version
    it's super slow so don't use it
    '''
    reviews = self.raw_df[column_type].tolist()
    # get clean header
    reviews_list_cleaned = clean_tsv(reviews)
    # get all words
    words = set()
    for reviews in reviews_list_cleaned:
        for review in reviews:
            words.add(review)
    words = list(words)
    corpus = TextCollection(reviews_list_cleaned)
    tf_idf = []
    for word in words:
        tf_idf.append(corpus.tf_idf(word, corpus))
    df = pd.DataFrame({"word": words, "tf-idf": tf_idf})
    df.to_csv(save_path, encoding='utf-8')
def ranking(reuters, corpus, docids, palavras):
    '''Ranks the retrieved texts so that the most relevant one comes first.

    Args:
        reuters: corpus coming from nltk
        corpus: dictionary mapping each index to its text
        docids: indices of the retrieved texts
        palavras: tokenized words of the query

    Returns:
        List with all the indices, already ranked
    '''
    rank = {}
    tc = TextCollection(reuters)
    for e in docids:
        rank[e] = 0
        for i in palavras:
            rank[e] += tc.tf_idf(i, corpus[e])
    rank = {
        k: v
        for k, v in reversed(sorted(rank.items(), key=lambda item: item[1]))
    }
    return rank.keys()
def tf_idf(self):
    corpus = [
        list(self.cr.tokenize_strip_punct(desc)) for desc in self.cr.texts()
    ]
    texts = TextCollection(corpus)
    for desc in corpus:
        yield {term: texts.tf_idf(term, desc) for term in desc}
def nltk_tfidf_vectorize(corpus):
    from nltk.text import TextCollection

    corpus = [list(tokenize(doc)) for doc in corpus]
    texts = TextCollection(corpus)
    for doc in corpus:
        yield {term: texts.tf_idf(term, doc) for term in doc}
def vectorize(corpus):
    corpus_tokenized = [list(tokenize(doc)) for doc in corpus]
    texts = TextCollection(corpus_tokenized)
    for doc in corpus_tokenized:
        # yield one tf-idf dict per document (a bare return here would exit
        # after the first document)
        yield {
            term: texts.tf_idf(term, doc)
            for term in doc
        }
def tf_idf_vectorize_nltk(corpus):
    print(corpus)
    #corpus = [tokenize(doc) for doc in corpus]
    texts = TextCollection(corpus)
    print(texts)
    for doc in corpus:
        yield {
            term: texts.tf_idf(term, doc)
            for term in doc
        }
def run_main():
    text1 = 'I like the movie so much '
    text2 = 'That is a good movie '
    text3 = 'This is a great one '
    text4 = 'That is a really bad movie '
    text5 = 'This is a terrible movie'

    tf_analy = TextCollection([text1, text2, text3, text4, text5])

    new_text = 'That one is a good movie. This is so good!'
    word = 'That'
    tf_idf_val = tf_analy.tf_idf(word, new_text)
    print(tf_idf_val)
def Generate_keyword(obj, length):
    orig_file = './Data/' + obj + '/' + obj + '.xlsx'
    data = xlrd.open_workbook(filename=orig_file)
    sheet = data.sheet_by_index(1)
    review_head = np.array(sheet.col_values(12))[1:]
    review_body = np.array(sheet.col_values(13))[1:]
    review_all = []
    for i in range(length):
        review = review_head[i] + " " + review_body[i]
        review_all.append(review)
    review_all = np.array(review_all)

    # make review tokens
    tokens = []
    for i, review in enumerate(review_all):
        review = review.lower()
        replacer = RegexpReplacer()
        review = replacer.replace(review)
        remove = str.maketrans('', '', string.punctuation)
        review = review.translate(remove)
        token = nltk.word_tokenize(review)
        token = [w for w in token if w == 'not' or not w in stopwords.words('english')]
        s = nltk.stem.SnowballStemmer('english')
        token = [s.stem(ws) for ws in token]
        tokens.append(token)

    token_file = './Data/' + obj + '/tokens.pkl'
    f = open(token_file, 'wb')
    pickle.dump(tokens, f)
    f.close()

    corpus = TextCollection(tokens)
    tf = {}
    tf_idf = {}
    for review in tokens:
        for word in review:
            if word not in tf:
                tf_ = corpus.tf(word, corpus)
                tf[word] = tf_
            if word not in tf_idf:
                tf_idf_ = corpus.tf_idf(word, corpus)
                tf_idf[word] = tf_idf_
    tf_sorted = sorted(tf.items(), key=lambda item: item[1], reverse=True)
    tf_idf_sorted = sorted(tf_idf.items(), key=lambda item: item[1], reverse=True)
    pd.DataFrame(tf_sorted).to_csv('./Data/' + obj + '/tf_sorted.csv')
    pd.DataFrame(tf_idf_sorted).to_csv('./Data/' + obj + '/tf_idf_sorted.csv')
def train_NB_tfidf_nltk(train_data, test_data, all_rev):
    all_rev = [nltk.word_tokenize(rev) for rev in all_rev]
    corpus = TextCollection(all_rev)
    labels = train_data['label']
    train_rev = train_data['review']
    test_rev = test_data['review']  # assumed column name, mirroring train_data
    ID = test_data['ID']
    lab = get_lab(labels)
    fs_train = []
    print(train_rev[0])
    for i in range(0, len(train_rev)):
        cut_rev = nltk.word_tokenize(train_rev[i])
        fs_dict = {}
        for j in range(0, len(cut_rev)):
            fs_dict[cut_rev[j]] = corpus.tf_idf(cut_rev[j], train_rev[i])
        fs_train.append((fs_dict, int(lab[i])))
    fs_test = []
    for i in range(0, len(test_rev)):
        cut_rev = nltk.word_tokenize(test_rev[i])
        fs_dict = {}
        for j in range(0, len(cut_rev)):
            fs_dict[cut_rev[j]] = corpus.tf_idf(cut_rev[j], test_rev[i])
        fs_test.append(fs_dict)
    classifier = nltk.NaiveBayesClassifier.train(fs_train)
    label = 1
    train_score = []
    test_score = []
    for i in range(0, len(fs_train)):
        dist = classifier.prob_classify(fs_train[i][0])
        train_score.append(dist.prob(label))
    train_score = np.array(train_score, dtype="float32")
    for i in range(0, len(fs_test)):
        dist = classifier.prob_classify(fs_test[i])
        test_score.append(dist.prob(label))
    test_score = np.array(test_score, dtype="float32")
    print("AUC: ", cal_auc(train_score, lab))
    result = pd.DataFrame({'ID': ID.T, 'Pred': test_score.T})
    result.to_csv("./result.csv", index=None)
def text_classification():
    """
    Text classification
    :return:
    """
    text1 = 'I like the movie so much '
    text2 = 'That is a good movie '
    text3 = 'This is a great one '
    text4 = 'That is a really bad movie '
    text5 = 'This is a terrible movie'

    # build the TextCollection object
    tc = TextCollection([text1, text2, text3, text4, text5])
    new_text = 'That one is a good movie. This is so good!'
    word = 'That'
    tf_idf_val = tc.tf_idf(word, new_text)
    print('TF-IDF value of {}: {}'.format(word, tf_idf_val))
def retrieve_results(n_percentile):
    search_queries = parse_trec('documents/irg_queries.trec')
    search_collections = parse_trec('documents/irg_collection_clean.trec')
    # search_collections = parse_trec('documents/irg_collection_short.trec')
    # search_collections = eliminate_stopwords(search_collections)
    # write_collection_doc(search_collections, 'documents/irg_collection_clean.trec')

    print('======= Statistics =======')
    print(f'Queries: {len(search_queries)}')
    print(f'Collections: {len(search_collections)}')
    print(f'Removal of {int((1-n_percentile)*100)}%-ile')
    print('==========================')

    # TF-IDF
    document_results = []
    for search_query_id, search_query_text in search_queries.items():
        print(f'Current query id: {search_query_id}, text: "{search_query_text}"')
        terms = search_query_text.split(' ')
        documents = keep_n_percentile_most_relevant_words(search_collections,
                                                          search_query_text,
                                                          n=n_percentile)
        document_scores = {}
        search_texts_collection = TextCollection(documents.values())
        for document_id, document_text in documents.items():
            for term in terms:
                current_score = document_scores.get(document_id, 0.0)
                document_scores[document_id] = current_score + \
                    search_texts_collection.tf_idf(term, document_text)
        rank = 1
        for document_id, document_score in sorted(document_scores.items(),
                                                  key=lambda kv: kv[1],
                                                  reverse=True):
            if rank <= 1000:
                document_results.append(
                    Result(search_query_id, document_id, rank, document_score))
            rank += 1
    result_writer(document_results,
                  f'IE_result_keep_{int(n_percentile*100)}_percentile.trec')
    print('Done')
def compute_tf_idf_similarity(query: str, content: str, type: str) -> float:
    """
    Compute the mean tf-idf or tf similarity of one piece of content for a multi-word query.

    :param query: a string containing all the key words, separated by single spaces
    :param content: the content string relevant to this query
    :return: average tf-idf or tf similarity
    """
    sents = [word_tokenize(content), word_tokenize("")]  # add one empty text to smooth
    corpus = TextCollection(sents)  # build the corpus
    result_list = []
    for key_word in query.strip(" ").split(" "):
        if type == "tf_idf":
            result_list.append(corpus.tf_idf(key_word, corpus))
        elif type == "tf":
            result_list.append(corpus.tf(key_word, corpus))
        else:
            raise KeyError
    return sum(result_list) / len(result_list)
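# --- Note on the empty-text smoothing used above ---
# A minimal illustration (the sentence below is made up for the example, not
# taken from the original code): with a single text in the collection, every
# term that occurs has idf = log(1/1) = 0, so all tf-idf scores collapse to
# zero; adding word_tokenize("") as a second, empty text keeps idf positive.
from nltk import word_tokenize
from nltk.text import TextCollection

content = "the cat sat on the mat"
single = TextCollection([word_tokenize(content)])
smoothed = TextCollection([word_tokenize(content), word_tokenize("")])

print(single.tf_idf("cat", word_tokenize(content)))    # 0.0, since idf = log(1/1)
print(smoothed.tf_idf("cat", word_tokenize(content)))  # > 0, since idf = log(2/1)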
def compute_tfidf(text, filename):
    numPara = len(text)
    print "there should be this many para in the text file ", numPara
    colList = []
    paragraphWords = []
    for i in range(numPara):
        paragraphWords = word_tokenize(text[i])
        colList.append(paragraphWords)
    collection = TextCollection(colList)
    for paraList in colList:
        dict = {}
        for term in paraList:
            dict[term] = collection.tf_idf(term, paraList)
        d = sortDict(dict)
        textFile = open(filename, "a")
        textFile.write("\n")
        for key, value in d:
            s = str(key) + "\t" + str(value) + "\n"
            textFile.write(s)
standard_position_dict = position_lookup(standard_freq_vector)
# print(standard_position_dict)

sentence = 'this is cool'
freq_vector = [0] * size
tokens = nltk.word_tokenize(sentence)
for word in tokens:
    try:
        freq_vector[standard_position_dict[word]] += 1
    except KeyError:
        continue
# print(freq_vector)

corpus = TextCollection(['this is sentence one',
                         'this is sentence two',
                         'this is sentence three'])
standard_vocab = []
for i in standard_freq_vector:
    standard_vocab.append(i[0])
# print(corpus.tf('is', 'this is sentence four'))

new_sentence = 'this is sentence five'
for word in standard_vocab:
    print(corpus.tf_idf(word, new_sentence))
# 3. Text classification and TF-IDF

# 3.1 TF-IDF in NLTK
from nltk.text import TextCollection

text1 = 'I like the movie so much '
text2 = 'That is a good movie '
text3 = 'This is a great one '
text4 = 'That is a really bad movie '
text5 = 'This is a terrible movie'

# create the TextCollection object
tc = TextCollection([text1, text2, text3, text4, text5])
new_text = 'That one is a good movie. This is so good!'
word = 'That'
tf_idf_val = tc.tf_idf(word, new_text)
print('TF-IDF value of {}: {}'.format(word, tf_idf_val))

# 3.2 TF-IDF in sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
feat = vectorizer.fit_transform([text1, text2, text3, text4, text5])
print(vectorizer.get_feature_names())
feat_arrary = feat.toarray()
print(feat_arrary.shape)
print(feat_arrary[0, :])
print(vectorizer.transform([new_text]).toarray())

# 3.3 TF-IDF for Chinese text
ch_text1 = ' 非常失望,剧本完全敷衍了事,主线剧情没突破大家可以理解,可所有的人物都缺乏动机,' \
           '正邪之间、妇联内部都没什么火花。团结-分裂-团结的三段式虽然老套但其实也可以利用积' \
def nltk_tfidf_vectorize(lists_of_tokens):
    texts = TextCollection(lists_of_tokens)
    for article in lists_of_tokens:
        yield {term: texts.tf_idf(term, article) for term in article}
#!/usr/bin/env python
# _*_ coding:utf-8 _*_
from nltk.text import TextCollection

# First, put all the documents into the TextCollection class.
# It handles sentence splitting, the statistics, and the computation for you.
corpus = TextCollection(
    ['this is sentence one', 'this is sentence two', 'this is sentence three'])

# tf-idf can then be computed directly
# (term: some term from a sentence, text: that sentence)
print(corpus.tf_idf('this', 'this is sentence four'))
# 0.444342

# Likewise, how do we get a fixed-size vector to represent every sentence?
# For each new sentence
new_sentence = 'this is sentence five'
# iterate over every word in the vocabulary:
for word in standard_vocab:
    print(corpus.tf_idf(word, new_sentence))
# we end up with a very long vector (length = size of the whole vocabulary)
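# --- Sketch: building the standard_vocab used above ---
# The snippet above iterates over standard_vocab without defining it. This is
# an assumed construction (the variable name comes from the snippet, the way it
# is built here is a guess): collect every token of the corpus in a fixed order
# so that each sentence maps onto a vector of the same length.
from nltk.text import TextCollection

corpus_sentences = ['this is sentence one',
                    'this is sentence two',
                    'this is sentence three']
corpus = TextCollection(corpus_sentences)
standard_vocab = sorted({w for sent in corpus_sentences for w in sent.split()})

new_sentence = 'this is sentence five'
vector = [corpus.tf_idf(word, new_sentence) for word in standard_vocab]
print(len(vector), vector)  # one fixed-length tf-idf vector per sentence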
xmlcollection.get_words_by_editdistance(editdistance=editdistance,
                                        no_of_most_freq=no_of_topwords)

# Write the found sets to disk; also write most frequent words to disk.
xmlcollection.write_words_by_editdistance(editdistance=editdistance)
xmlcollection.write_topwords(no_of_words=no_of_topwords)
print "Top words written to disk."

# XXX: BIG F**K UP ################################## FIX FIX FIX #####
# Print idf, tf and tf-idf values for the term "CCC", in document
# no. 42 - for testing.
nltk_textcollection = TextCollection(xmlcollection.get_words())
print "idf: " + str(nltk_textcollection.idf("CCC"))
print "tf: " + str(nltk_textcollection.tf("CCC",
    TextCollection(xmlcollection.get_doc(42).get_tokens())))
print "tf_idf: " + str(nltk_textcollection.tf_idf("CCC",
    TextCollection(xmlcollection.get_doc(42).get_tokens())))

# Do that now systematically for all documents
print "Documents where tf is bigger than 0:"
cnt = 0
for doc in xmlcollection.get_docs():
    tf = nltk_textcollection.tf("CCC", TextCollection(doc.get_tokens()))
    stdout.write(str(tf) + ", ")
    cnt += 1
    if cnt == 10:
        print
        cnt = 0
    if tf > 0.0:
        print "\n" + doc.get_xml_filename()
# -*- coding: utf-8 -*-
from nltk.text import TextCollection

__author__ = 'Alan Hou'

corpus = TextCollection(['this is sentence one',
                         'this is sentence two',
                         'this is sentence three'])

# compute tf-idf directly
print(corpus.tf_idf('this', 'this is sentence four'))
def prepare_Custom(data, common, count, training=True):
    X = []
    Y = []
    wordsid = ["" for x in range(count)]
    common = {k: common[k] for k in list(common)[:count]}
    default = np.zeros(count)
    i = 0
    for word in common:
        default[i] = 0
        wordsid[i] = word
        i = i + 1
    wordWeights = []
    for i in range(count):
        wordWeights.append(1 / (i + 1))
    wordWeights = np.array(wordWeights)
    if (training):
        global Text
        Text = []
        for data_point in data:
            Text.append(data_point['original_text'].lower())
    corpus = TextCollection(Text)
    for data_point in data:
        occur = default.copy()  # start from a fresh zero vector for each data point
        x = []
        Tf_ide = []
        data_point["num_words"] = len(data_point['text'])
        data_point['sentiment'] = np.abs(
            nltk_sentiment(data_point['original_text']))
        data_point['exclam'] = data_point['original_text'].count('!')
        data_point['hash'] = data_point['original_text'].count('#')
        popularScore = 0
        for word in data_point['text']:
            if word in wordsid:
                occur[wordsid.index(word)] += 1
                popularScore += 1
        for word in common:
            # tf-idf features
            x.append(corpus.tf_idf(word, data_point['original_text']))
        # popularity-frequency (normalized)
        x.append(popularScore / data_point["num_words"])
        # transform(children) feature
        x.append(np.log(data_point['children']**2 + 1))
        # not new
        x.append(data_point['controversiality'])
        # not new
        x.append(int(data_point['is_root']))
        # compute an index of common words
        x.append(occur.dot(wordWeights.T))
        # sentiment analysis transformed
        x.append(np.log(np.abs(data_point['sentiment'])**2 + 1))
        # count of exclamation points
        x.append(data_point['original_text'].count('!'))
        # count of hashtags
        x.append(data_point['original_text'].count('#'))
        x.append(1)
        X.append(x)
        Y.append(data_point['popularity_score'])
    return np.array(X), np.array(Y)
]
sents = [word_tokenize(sent) for sent in sents]  # tokenize each sentence
print(sents)  # print the tokenized result

corpus = TextCollection(sents)  # build the corpus
print(corpus)  # print the corpus

# compute the tf value of "one" in the corpus
tf = corpus.tf('one', corpus)  # 1/12
print(tf)

# compute the idf value of "one" in the corpus
idf = corpus.idf('one')  # log(3/1)
print(idf)

# compute the tf-idf value of "one" in the corpus
tf_idf = corpus.tf_idf('one', corpus)
print(tf_idf)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

x_train = [
    'TF-IDF 主要 思想 是', '算法 一个 重要 特点 可以 脱离 语料库 背景',
    '如果 一个 网页 被 很多 其他 网页 链接 说明 网页 重要'
]
x_test = ['原始 文本 进行 标记', '主要 思想']

# CountVectorizer turns the texts into a term-frequency matrix;
# element a[i][j] is the frequency of word j in document i
vectorizer = CountVectorizer(max_features=10)
# TfidfTransformer computes the tf-idf weight of every word
tf_idf_transformer = TfidfTransformer()
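# --- Sanity check for the 1/12 and log(3/1) annotations above ---
# The real `sents` list is built earlier in that fragment; the three sentences
# below are stand-ins (an assumption), chosen so that the corpus has 12 tokens
# and "one" appears in exactly one of the three texts, matching the annotations.
from math import log
from nltk import word_tokenize
from nltk.text import TextCollection

sents = [word_tokenize(s) for s in ['this is sentence one',
                                    'this is sentence two',
                                    'this is sentence three']]
corpus = TextCollection(sents)

tf_manual = corpus.count('one') / len(corpus.tokens)           # 1/12
idf_manual = log(len(sents) / sum('one' in s for s in sents))  # log(3/1)
print(tf_manual, corpus.tf('one', corpus))
print(idf_manual, corpus.idf('one'))
print(tf_manual * idf_manual, corpus.tf_idf('one', corpus))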
import pymongo
from pymongo import Connection

MONGODB_PORT = 27017

import nltk
from nltk.corpus import brown
from nltk.text import TextCollection

mongodb = Connection("localhost", MONGODB_PORT)['cablegate']
browntext = TextCollection(brown.words(categories=['news', 'government']))
count = 0
for ng in mongodb.ngrams.find(timeout=False):
    mongodb.ngrams.update(
        {"_id": ng["_id"]},
        {"$set": {"tfidf": browntext.tf_idf(ng['label'],
                                            brown.words(categories=['news', 'government']))}})
    count += 1
print "updated tfidf for %d topics" % count
        if i in string.punctuation:
            # if the character is a punctuation mark, replace it with a space
            s['text'] = s['text'].replace(i, " ")
    sentence.append(s['text'])

sents = [word_tokenize(sent) for sent in sentence]
corpus = TextCollection(sents)
tf_idf = []
for sen in sents:
    td = []
    for data in sen:
        elem = []
        data = data.lower()
        if data not in stop_words:
            td.append(corpus.tf_idf(data, corpus))
    tf_idf.append(td)

# cosine = []
count_s = []
aspect_cosine = []
for i in range(len(sents)):
    sentences_vector = []
    for w in sents[i]:
        w = w.lower()
        if w not in stop_words:
            try:
                word_index = list(words_index.keys())[list(
                    words_index.values()).index(w)]
                sentences_vector.append(wordVectors[word_index])
            except ValueError:
topN = []
for i in eliminateBiggerThanOne[:2000]:
    topN.append(i[0])
print('topN', topN)

########################################################################
## Getting TF-IDF of the TopN words to get max and min ##
s = [d for (d, c) in sentences]
tfList = []
for sen in s:
    for word in sen:
        if word in topN:
            tfList.append(x.tf_idf(word, sen))
print(len(tfList))

max_tfidf = max(tfList)  # renamed to avoid shadowing the built-in max()
print('max', max_tfidf)
min_tfidf = min(tfList)  # renamed to avoid shadowing the built-in min()
print('min', min_tfidf)
res = max_tfidf - min_tfidf
print('res', res)
half = res / 2
print('half', half)
twoThird = (max_tfidf + half) / 2
print('twoThird', twoThird)
quarter = half / 2
print('quarter', quarter)
def alignText(simpleParas, normalParas, pairedPara):
    #print simpleParas, len(simpleParas)
    #print normalParas, len(normalParas)
    for key, value in pairedPara.items():
        # key is simple and value is normal
        SPara = simpleParas[key]
        NPara = normalParas[value]
        print "=================Paragraphs were above======================================"
        # given two paragraphs, it returns a list of all the sentences where each sentence
        # is a list of words, with a list of simple sentence list and normal sentence list
        colList, sslist, nslist = formSentenceList(SPara, NPara)
        collection = TextCollection(colList)
        # this is a list of Word objects
        wordsWithWeight = []
        dict = {}
        for sentence in colList:
            weight = 0
            for term in sentence:
                if term not in PUNCTLIST and term not in STOPWORDS and term not in commonAuxilaryVerbs:
                    weight = collection.tf_idf(term, sentence)
                    # what if the term is already in the dict, we need to add the weight
                    if term not in dict:
                        w = Word(term, "", "")
                        w.setWeight(weight)
                        wordsWithWeight.append(w)
                        #dict[term] = weight
                    # dict[term] = weight
        #dict = sortDict(dict)
        temp = []
        for sentence in sslist:
            tokSen = word_tokenize(sentence)
            temp.append(tokSen)
        sslist = temp
        temp = []
        for sentence in nslist:
            tokSen = word_tokenize(sentence)
            temp.append(tokSen)
        nslist = temp
        for simpleLine in sslist:
            stringSimpleLine = listToString(simpleLine)
            # semantic part
            simplefilename = "sentence1.txt"
            SFile = open(simplefilename, "w+")
            SFile.write(stringSimpleLine)
            SFile.close()
            parseFile("sentence1.txt")
            # if failed to parse, skip this sentence and continue
            if verifyParsedFile("parsedsentence1.txt") == False:
                continue
            buildClause("parsedsentence1.txt", "one")
            # end semantic part
            maxSimilarity = 0
            for normalLine in nslist:
                stringNormalLine = listToString(normalLine)
                # semantic part
                normalfilename = "sentence2.txt"
                NFile = open(normalfilename, "w+")
                NFile.write(stringNormalLine)
                NFile.close()
                parseFile("sentence2.txt")
                # check whether parsing was done properly
                # if failed to parse, skip this sentence and continue
                if verifyParsedFile("parsedsentence2.txt") == False:
                    continue
                # end semantic part
                #buildClause("parsedsentence1.txt", "one")
                buildClause("parsedsentence2.txt", "two")
                sentence1Words = []
                sentence2Words = []
                #makeContextFile(n1, v1, n2, v2)
                sentence1Words, sentence2Words = makeContextFile(n1, v1, n2, v2)
                # allWords is a dictionary of word:tfidf. I converted this to a dictionary
                # from a list of wordsWithWeight for convenience
                allWords = {}
                for w in wordsWithWeight:
                    allWords[w.getValue()] = w.getWeight()
                numerator1 = 0
                denominator1 = 0
                for word in sentence1Words:
                    if word.getValue() in allWords:
                        tfidf = allWords[word.getValue()]
                        semanticWeight = word.getWeight()
                        numerator1 = numerator1 + (semanticWeight * tfidf)
                        denominator1 = denominator1 + allWords[word.getValue()]
                if denominator1 == 0:
                    denominator1 = 1
                partA = numerator1 / denominator1
                numerator2 = 0
                denominator2 = 0
                for word in sentence2Words:
                    #print "dict index:->", word.getValue(), "value: ", allWords[word.getValue()]
                    if word.getValue() in allWords:
                        tfidf = allWords[word.getValue()]
                        semanticWeight = word.getWeight()
                        numerator2 = numerator2 + (semanticWeight * tfidf)
                        denominator2 = denominator2 + allWords[word.getValue()]
                if denominator2 == 0:
                    denominator2 = 1
                partB = numerator2 / denominator2
                SIMILARITY = (partA + partB) / 2
                print "><><><><><><><><><><><><><><><><><><><><><><"
                print stringSimpleLine
                print "--------------------------------------------"
                print stringNormalLine
                print "Similarity Score -----> ", SIMILARITY
                print "><><><><><><><><><><><><><><><><><><><><><><"
from __future__ import print_function
from nltk.corpus import PlaintextCorpusReader
from nltk.text import TextCollection

# load all the files in the corpus root,
# and calculate tf, idf, and tf_idf on them, and on a specific term
if __name__ == "__main__":
    corpus_root = '../data/source_data'
    corpus = PlaintextCorpusReader(corpus_root, '[a-zA-Z \-]*\.txt')
    ids = corpus.fileids()
    collection = TextCollection(corpus)

    #for x, word in enumerate(corpus.words(ids[0])[:200]):
    #    print(x, word)

    source = ids[0]
    term = corpus.words(source)[107]
    doc = corpus.words(ids[2])

    print("Source: ", source)
    print("TF of: ", term, ": ", collection.tf(term, doc))
    print("IDF of: ", term, ": ", collection.idf(term))
    print("tf_Idf of:", term, ": ", collection.tf_idf(term, doc))
#coding:utf-8
import nltk
from nltk.text import TextCollection

data = "Hello world!"
tokens = nltk.word_tokenize(data)
print(tokens)
print("---------------------------------------")

corpus = TextCollection(
    ['this is sentence one', 'this is sentence two', 'this is sentence three'])
print(corpus.tf_idf("this", "this is sentence four"))

import numpy as np
from numpy import dot

a = np.array([1, 0])
p = np.array([[.9, .1], [.5, .5]])
n = dot(a, p)
for i in range(1000):
    # n = dot(a,p)
    n = dot(n, p)
print("res::", n)
class WeightedTweetClassifier(TweetClassifier):
    """
    Basic idea:
        train TF-IDF model on training data
        filter out all words that we do not have clues for
        multiply all remaining term weights with the corresponding clues (+1, -1, 0),
        and sum the results
    """

    def __init__(self, dictfile=None, trainfile=None, datafile=None, outfile=None):
        # Call the superclass constructor
        super(WeightedTweetClassifier, self).__init__(trainfile, datafile, outfile)
        self.stemmer = PorterStemmer()
        self.trainfile = trainfile
        self.datafile = datafile
        self.outfile = outfile

        # this contains the clues we were given: {"clue": 1.0, "clue2": -1.0 ... }
        self.clueValues = {}

        # the NLTK TextCollection class is used because it provides TF-IDF functionality.
        self.textCollection = None

        # read the clues
        self.readDictionary(dictfile)

        # for saving sentiment scores, so they can be meaningfully used later on
        # by e.g. the Joint Classifier
        self.scores = {}

    def readDictionary(self, dictfile=None):
        """
        read the dictionary file. +1, -1 or 0 is saved as a sentiment for each
        (stemmed) term in self.clueValues
        TODO: maybe we don't want to stem, but instead use the provided POS tags?
        could be a separate classifier though
        """
        with open(dictfile, "r") as dictdata:
            for line in dictdata.readlines():
                fields = line.split(" ")
                token = self.stemmer.stem(fields[2].split("=")[1].strip())
                polarity = fields[5].split("=")[1].strip()
                self.clueValues[token] = (1.0 if polarity == "positive" else
                                          (-1.0 if polarity == "negative" else 0.0))

    def train(self, trainfile=None):
        print "training WeightedTweetClassifier"
        self.readTrainingData((trainfile or self.trainfile))
        for tweet in self.trainingTweets:
            # lowercase, remove punctuation
            nopunct = string.lower(tweet.tweet.translate(string.maketrans("", ""),
                                                         string.punctuation))
            tweet.tweet = nopunct

        # add all Tweets to our TextCollection. This automatically creates a TF-IDF model
        self.textCollection = TextCollection([tweet.tweet for tweet in self.trainingTweets])

    def classifyTweets(self, datafile=None, outfile=None):
        print "reading dataset"
        self.readDataset(datafile)

        print "classifying Tweets with weighted classifier"
        for tweet in self.evalTweets:
            # score = sum of TF-IDF weighted terms which carry sentiment
            tokens = string.lower(tweet.tweet.translate(string.maketrans("", ""),
                                                        string.punctuation)).split(" ")
            score = sum([self.textCollection.tf_idf(token, tweet.tweet) *
                         self.clueValues.get(self.stemmer.stem(token), 0)
                         for token in tokens])
            self.scores[(tweet.id1, tweet.id2)] = score

            # Any score very close or equal to 0 is judged to be neutral.
            tweet.sentiment = ("neutral" if abs(score) < 0.01 else
                               ("negative" if score < 0 else "positive"))
aspect_keywords.append(keywords_vector)

sentence = []
for s in sentences:
    for i in s['text']:
        if i in string.punctuation:
            # if the character is a punctuation mark, replace it with a space
            s['text'] = s['text'].replace(i, " ")
    sentence.append(s['text'])

sents = [word_tokenize(sent) for sent in sentence]
corpus = TextCollection(sents)
tf_idf = []
for sen in sents:
    td = []
    for data in sen:
        elem = []
        data = data.lower()
        if data not in stop_words:
            # print(data)
            td.append(corpus.tf_idf(data, corpus))
    tf_idf.append(td)

for aspect in aspect_keywords:
    for vector in aspect[1:]:
        print(
            deal_data.cosine(aspect[0], vector) * corpus.tf_idf('food', corpus))
    print('\n')
arff.write("@relation sentiment_analysis\n\n") arff.write("@attribute numPosEmots numeric\n") arff.write("@attribute numNegEmots numeric\n") arff.write("@attribute numQuest numeric\n") arff.write("@attribute numExclam numeric\n") arff.write("@attribute numPosGaz numeric\n") arff.write("@attribute numNegGaz numeric\n") for word in words: arff.write("@attribute word_") sub_w = re.subn('[^a-zA-Z]', 'X', word) arff.write(sub_w[0]) if sub_w[1] > 0: arff.write('_' + str(wc)) wc += 1 arff.write(" numeric\n") arff.write("@attribute class {POS, NEG, OTHER}\n\n") arff.write("@data\n") # data for i in xrange(len(tweets)): arff.write(str(emots_count[i][0]) + ',' + str(emots_count[i][1]) + ',') arff.write(str(punct_count[i][0]) + ',' + str(punct_count[i][1]) + ',') arff.write(str(gaz_count[i][0]) + ',' + str(gaz_count[i][1]) + ',') for j in xrange(len(words)): #loop through unigrams arff.write(str(texts.tf_idf(words[j], tweets[i])) + ',') arff.write(sentiments[i] + '\n') arff.close() print '\nFinished pre-processing! The ARFF file for Weka has been created.'