def testKnownTFIDF(self):
    """
    Testing to see whether the tfidf values for arbitrarily selected words
    in the articles correspond with manually calculated values.
    """
    articleList = []
    theList = []
    for string in self.strings:
        articleList.append(tfidf.tf(string))
    for string in self.theTwentyFive:
        theList.append(tfidf.tf(string))
    idfArtDict = tfidf.idf(articleList)
    idfTheDict = tfidf.idf(theList)
    tfidfArtList = tfidf.tfidf(idfArtDict, articleList)
    tfidfTheList = tfidf.tfidf(idfTheDict, theList)
    self.assertEqual(tfidfArtList[1]["Meditation"], math.log10(6 / 1) * (1 / 19))
    self.assertEqual(tfidfArtList[2]["books"], math.log10(6 / 1) * (1 / 18))
    self.assertEqual(tfidfArtList[5]["the"], math.log10(6 / 3) * (5 / 5))
    self.assertEqual(tfidfTheList[3]["the"], math.log10(5 / 5) * (5 / 5))
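# Worked value for the first assertion above -- a sketch of the arithmetic the test
# encodes, assuming tfidf.tf returns count/doc_length and tfidf.idf returns
# log10(num_docs / docs_containing_term) (Python 3 division):
#   "Meditation" appears once in a 19-word article and in 1 of the 6 articles,
#   so its expected score is log10(6/1) * (1/19).
import math
print(math.log10(6 / 1) * (1 / 19))  # ~= 0.0410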
def main():
    features = lsi(tfidf('small_train.txt'), 100)
    classes = get_classes('small_train.txt', 0.7, 5)
    prob = svm_problem(classes, features)
    param = svm_parameter('-t 0 -c 4 -b 1')
    m = svm_train(prob, param)
    test_features = lsi(tfidf('small_test.txt'), 100)
    test_classes = get_classes('small_test.txt', 0.7, 5)
    p_label, p_acc, p_val = svm_predict(test_classes, test_features, m)
def get_acc(trainfilename, testfilename, d, a, r):
    # Use the filename arguments rather than hard-coded paths so the function
    # actually honours its parameters.
    features = lsi(tfidf(trainfilename), d)
    classes = get_classes(trainfilename, a, r)
    prob = svm_problem(classes, features)
    param = svm_parameter('-t 0 -c 4 -b 1')
    m = svm_train(prob, param)
    test_features = lsi(tfidf(testfilename), d)
    test_classes = get_classes(testfilename, a, r)
    p_label, p_acc, p_val = svm_predict(test_classes, test_features, m)
    return p_acc
def scrape():
    try:
        querystring = request.args.get("querystring")
        print("Received scrape query: " + querystring)
        scraper_values, stuff, index = scraper_df(querystring, 100)
        tfidf_values = tfidf(scraper_values)
        tfidf_pd_values = tfidf_df(scraper_values, stuff, querystring)
        db.insert({"querystring": querystring,
                   "tfidf": tfidf_values,
                   "tfidf_pd_values": tfidf_pd_values})
        indexStorage[querystring] = index
        return jsonify({"tfidf": tfidf_values, "specifics": tfidf_pd_values})
    except Exception as e:
        # Fall back to a cached result if the scrape fails.
        print(e)
        queryer = Query()
        if len(db.search(queryer.querystring == querystring)):
            cached = db.search(queryer.querystring == querystring)[0]
            return jsonify({"tfidf": cached["tfidf"],
                            "specifics": cached["tfidf_pd_values"]})
        else:
            return jsonify({"tfidf": "ERROR", "specifics": "ERROR"})
def generate_weights(documents, lexicon):
    """
    function: generate_weights
    --------------------------
    perform tf-idf to generate importance scores for words in documents
    :param documents: list of documents to use in calculations
    :returns: dictionary of dictionaries: {"id_" : {"word" : score,...}}
    """
    # weight = { 'document' : { 'word' : score,... },... }
    weights = dict()
    m = tfidf()
    print('Adding documents for TF-IDF...')
    for i, document in enumerate(documents):
        m.addDocument(i, document['words']['title'] + document['words']['body'])
        weights[i] = dict()
    # generate dictionary of { "word", "score" } pairs for each document
    print('Generating weight scores for words; this WILL take time...')
    for word in lexicon['title'] & lexicon['body']:
        m.get_similarities(word, weights, 'smooth', 1.25)
    for word in lexicon['title'] - lexicon['body']:
        m.get_similarities(word, weights, 'smooth', 1.1)
    for word in lexicon['body'] - lexicon['title']:
        m.get_similarities(word, weights, 'smooth')
    return weights
def getDocumentSimilarity(self, queryTermsMap, trainDoc):
    tfIdfHelper = tfidf(self.wordDF, self.numDocs, self.meanDocLen)
    similarity = 0.0
    for word in trainDoc.wordsMap:
        if queryTermsMap.has_key(word):
            similarity += tfIdfHelper.getTfIdf(queryTermsMap[word], word, trainDoc)
    return similarity
def set_up_table(self, tokens):
    table = tfidf.tfidf()
    for i, token in enumerate(tokens):
        table.addDocument(self.ids[i], token)
    return table
def build_tfidf(foreignBrand):
    datas = []
    rs = []
    merge_to_series = False
    (brand, vendor, series) = foreignBrand.split(u"#")
    #print brand.encode("utf-8") + ":" + maker.encode("utf-8")
    res = external_spec_col.find({
        'site': sys.argv[1],
        'brand.name': brand,
        "vendor.name": vendor,
        "series.name": series
    })
    if res.count() == 1:
        r = res[0]
        if r['model'].get('key', '') == '0':
            merge_to_series = True
        data = [
            r['model']['id'],
            r['brand']['name'],
            r['vendor']['name'],
            r['series']['name'],
            r['model']['name'],
            r['model'].get('year', '') or ''
        ]
        datas.append(data)
        if merge_to_series:
            data = datas[0]
            k = "#".join(data)
            return k
    for r in res:
        data = [
            r['model']['id'],
            r['brand']['name'],
            r['vendor']['name'],
            r['series']['name'],
            r['model']['name'],
            r['model'].get('year', '') or ''
        ]
        datas.append(data)
        rs.append(r)
    #print "------------------------------"
    engine_dict = {}
    if len(rs) == 0:
        print "empty engine map " + foreignBrand
        #return {}
    for i in xrange(len(rs)):
        e = getExternalEngine(rs[i])
        if engine_dict.has_key(e):
            tf = engine_dict[e]
        else:
            tf = tfidf.tfidf()
            engine_dict[e] = tf
        data = datas[i]
        k = "#".join(data)
        n = "#".join(data[1:])
        c = word_bag(n)
        tf.addDocument(k, c)
    #if engine_dict == {}:
    #    print len(rs)
    #    print len(datas)
    return engine_dict
def preprocess():
    #dirname = "Genres"
    dirname = "Data/Train"
    #dirname = str(sys.argv[1])
    #dirname = "C:/Users/Megha/Documents/studies/544/HW1/project/SpamorHam/train"
    global labels, x, sq, idf
    for dirpath, dirs, files in os.walk(dirname):
        for filename in files:
            if filename[filename.rfind(".") + 1:] == "txt":
                i = filename
                x[i] = {}
                fname = os.path.join(dirpath, filename)
                f = open(fname, "r", encoding="latin1")
                text = f.read()
                f.close()
                tokens = text.split()
                for token in tokens:
                    token = strip_nonalnum_re(token)
                    if token.lower() in stopwords:
                        continue
                    if token not in x[i]:
                        x[i][token] = 1
                    else:
                        x[i][token] += 1
    x, idf, sq = tfidf.tfidf(x)
def set_up_table2(self, ids):
    table = tfidf.tfidf()
    # print ids
    for id in ids:
        token = self.dict[id]
        table.addDocument(id, token)
    return table
def process_files(in_dir):
    global WORDS, TRAIN
    WORDS = in_dir + 'words'
    TRAIN = in_dir + 'train.dat'
    CORRECT_LIST = []
    matches = []
    #CORRECT_LIST = (open(in_dir+'train_corr','r')).readlines()
    for line in open(in_dir + 'train_corr', 'r').readlines():
        if line.replace('\n', '') != '':
            CORRECT_LIST.append(line.replace('\n', ''))
            matches.append(line.replace('\n', ''))
    #print CORRECT_LIST
    for line in open(in_dir + 'train_wrong', 'r').readlines():
        if line.replace('\n', '') != '':
            matches.append(line.replace('\n', ''))
    #print matches
    #matches = (open(in_dir+'train_corr','r')).readlines() + (open(in_dir+'train_wrong','r')).readlines()
    words_per_file = dict()
    words = set()
    length = str(len(matches))
    count = 0
    user_counter = 0
    # first iteration through matches to get all words
    for match in matches:
        count += 1
        print str(count) + " of " + length + " training example preprocessing done"
        t = codeParser(match)
        #if 'tweet' in get_all_words(match):
        #    user_counter += 1
        #    print user_counter
        words_per_file[match] = t.get_compressed()
        words = words.union(words_per_file[match])
    print "preliminary processing done"
    train_f = open(TRAIN, 'w')
    # second iteration through matches to get all word counts
    count = 0
    calc_freq = tfidf(words, words_per_file)
    print "Done calculating idfs"
    #print "PARAM " + str(calc_freq.getIDF('@param'))
    for match in matches:
        count += 1
        print str(count) + " of " + length + " training examples done"
        if match in CORRECT_LIST:
            #print calc_freq.term_freq('tweet', words_per_file[match])
            train_f.write('+1')
            #print len(words_per_file[match])
        else:
            train_f.write('-1')
        train_f.write(str(get_word_counts(words_per_file[match], words, calc_freq)))
        train_f.write('\n')
    train_f.close()
    print "Results printed to file: " + str(TRAIN)
def analyzeBlogs(blogList):
    # Analyze blog with tfidf, and other word analysis.
    outputWordsArr = []
    namesCount, religionCount, weaponryCount, governmentCount, wordCount = 0, 0, 0, 0, 0
    for i, blog in enumerate(blogList):
        scores = {}
        wordCount = 0
        print("Top words in document {}".format(i + 1))
        for word in blog.words:
            flag = True
            word = word.lower()  # Everything is in lowercase.
            for punc in terms.punctuation():
                if punc in word:
                    flag = False
            wordCount += 1
            if flag:
                scores[word] = tfidf.tfidf(word, blog, blogList)  # run tfidf
                # increment count based on content to find word densities.
                if word in terms.governmentTerms():
                    governmentCount += 1
                if word in terms.weaponsTerms():
                    weaponryCount += 1
                if word in terms.femaleNames() or word in terms.maleNames():
                    namesCount += 1
                if word in terms.religiousTerms():
                    religionCount += 1
        sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)  # sort the words
        for word, score in sorted_words[0:10]:
            print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))
            outputWordsArr.append((word, round(score, 10)))
        print("---------------------------------------------------------")
    # Gathering the density scores of each of these defined features, and creating the returning data type
    analysisOutputs = AnalysisObject(namesCount / wordCount, religionCount / wordCount,
                                     weaponryCount / wordCount, governmentCount / wordCount,
                                     outputWordsArr)
    return analysisOutputs
def find_parallels():
    sim_matrix = []
    count = 0
    with codecs.open("data/only_shorashim.json", 'rb', encoding='utf8') as fp:
        shorash_obj = json.load(fp)
    all_words = set()
    for k, v in shorash_obj.items():
        if k == u"Obadiah":
            continue
        all_words |= set(reduce_list(reduce_list(v)))
    table = tfidf.tfidf(all_words)
    input_ketaim_into_table(shorash_obj, table, 2)
    for keta_obj, keta_dict in table.documents.items():
        similarities = table.similarities(keta_obj, 5)
        sim_matrix += similarities
        for similarity in similarities:
            if similarity[1] > 0.5:
                print u"{}\n\t{}\n\t{}".format(similarity[1], keta_obj, similarity[0])
        pass
def main():
    #cleanData()
    #sys.exit()

    # Initialize feature set.
    file = open('feature_set.txt', 'r')
    #file = open('feature_set_2000.txt','r')
    feature_set = file.read().split('\n')
    file.close()

    # Initialize the table that contains all information of replies.
    #file = open('jieba_data2.txt','r')
    file = open('jieba_data_full.txt', 'r')
    lines = file.readlines()
    file.close()

    table = tfidf.tfidf()
    for iter in range(len(lines)):
        reply_info = lines[iter].split('(#SEP#)')
        rate_flag = reply_info[0]
        reply_length = reply_info[1]
        lines_num = reply_info[2]
        dividedWords_list = reply_info[-2].lower().split()
        # eliminate replies of replies.
        if 'quote' not in dividedWords_list:
            table.addDocument(str(iter), rate_flag, reply_length, lines_num, dividedWords_list)
            # print 'lines_num:', lines_num, 'reply_length:', reply_length
    #getFeatureSet(table)
    #sys.exit()

    # Get feature set of every sample.
    sample_list = table.documents
    infoPairs_list = []
    for sample_info in sample_list:
        rate_flag = sample_info[1]
        word_list = sample_info[-1].keys()
        reply_length = sample_info[2]
        lines_num = sample_info[3]
        text_features = getTextFeatures(word_list, reply_length, lines_num, feature_set)
        infoPairs_list.append((text_features, rate_flag))

    # Train a decision tree (or naive bayes) classifier and test it.
    size = int(len(infoPairs_list) * 0.1)
    train_set, test_set = infoPairs_list[size:], infoPairs_list[:size]
    print 'Training classifier ...'
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    #classifier = nltk.DecisionTreeClassifier.train(train_set)
    print 'Classifier has been trained!'
    #print classifier.pseudocode(depth=3)
    print nltk.classify.accuracy(classifier, test_set)

    # Save the classifier above.
    file = open('myNaiveBayesClassifier.pickle', 'wb')
    #file = open('myDecisionTreeClassifier.pickle', 'wb')
    pickle.dump(classifier, file)
    file.close()
    print 'my classifier has been saved.'
def Three_Ka_thing(notes):
    A = 3
    U = []
    for i in range(len(notes) - (A - 1)):
        x = tfidf.tfidf(CombSep(notes[i:i + A]))
        y = {k: v for k, v in sorted(x.items(), key=lambda item: item[1])}
        U.append(list(y.keys())[-5:])
    return U
def main():
    table = tfidf.tfidf()
    table.addDocument("foo", ["alpha", "bravo", "charlie", "delta", "echo",
                              "foxtrot", "golf", "hotel", "alpha"])
    table.addDocument("bar", ["alpha", "bravo", "charlie", "india", "juliet", "kilo"])
    table.addDocument("baz", ["kilo", "lima", "mike", "november"])
    print table.similarities(["alpha", "bravo", "charlie"])
    # => [['foo', 0.6875], ['bar', 0.75], ['baz', 0.0]]
def get_results(query):
    tf = tfidf()
    processed_query = tf.pre_processing(query)
    relevant_docs = tf.get_relevant_docs(processed_query)
    query_vector, idf_vector, tf_vector = tf.build_query_vector(processed_query)
    sorted_score_list, tf_new, idf_new, tf_idf_new = tf.similarity(relevant_docs, query_vector,
                                                                   idf_vector, tf_vector)
    search_result = tf.get_movie_info(sorted_score_list, tf_new, idf_new, tf_idf_new)
    return search_result, processed_query
def get_tfidf_doc(docTitle):
    sortedTitles = tfidf.tfidf(docTitle, nTitles=20)
    documents = list()
    for title in sortedTitles:
        wikisearch = wikipedia.WikipediaPage(title[0])
        wikicontent = wikisearch.links
        documents.append(wikicontent)
    return sortedTitles, documents
def best_match_asset():
    try:
        query = request.json['query']
        return tfidf(query)
    except Exception:
        # Exception (rather than a bare except) still covers a missing or malformed
        # JSON body without swallowing KeyboardInterrupt/SystemExit.
        return 'Query not submitted'
def main():
    # k value needs to be changed according to the user
    kmean = kmeans(30)
    start_time = time.time()
    kmean.read_doc()
    kmean.documents = kmean.documents[1:]
    tfidf_v = tfidf(kmean.documents)
    kmean.cluster(tfidf_v.tfidf_docs)
    print("Total time taken:", time.time() - start_time)
def get_feature_string_by_document(self, _set, document):
    """Returns a string with the right format for features of the specified document."""
    label = self.sets[_set][document]['label']
    line = "{} ".format(label)
    for word in self.sets[_set][document]['words']:
        line += "{}:{} ".format(self.dictionary[word]['id'], tfidf.tfidf(word, document, self))
    line += "\n"
    return line
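# Hypothetical illustration of the string built above (ids and scores invented):
# for a document labeled 1 whose words map to dictionary ids 4 and 17, the result
# is the svmlight-style line "1 4:0.0312 17:0.0087 \n" (label, then id:score pairs).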
def loadTfidf(locations):
    table = tfidf.tfidf()
    for l in locations:
        print l.split(" ")
        # The original called ' '.split(l), which never tokenises the location;
        # split the location string on spaces instead.
        table.addDocument(l, l.split(' '))
    print '\nPREDICT:'
    for f in table.similarities(['Fillmore']):
        if f[1] > 0:
            print f
def process_files(in_dir):
    words_per_file = dict()
    '''for root, dirnames, filenames in os.walk(DIR_NAME):
        for filename in fnmatch.filter(filenames, MATCH):
            matches.append(os.path.join(root, filename))
    '''
    global WORDS, TRAIN
    WORDS = in_dir + 'words'
    TEST = in_dir + 'test.dat'
    CORRECT_LIST = []
    matches = []
    #CORRECT_LIST = (open(in_dir+'train_corr','r')).readlines()
    for line in open(in_dir + 'test_corr', 'r').readlines():
        if line.replace('\n', '') != '':
            CORRECT_LIST.append(line.replace('\n', ''))
            matches.append(line.replace('\n', ''))
    #print CORRECT_LIST
    for line in open(in_dir + 'test_wrong', 'r').readlines():
        if line.replace('\n', '') != '':
            matches.append(line.replace('\n', ''))
    words = get_all_words_file()
    length = str(len(matches))
    count = 0
    # first iteration through matches to get all words
    for match in matches:
        count += 1
        print str(count) + " of " + length + " test example preprocessing done"
        words_per_file[match] = get_all_words(match)
    print "preliminary processing done"
    train_f = open(TEST, 'w')
    # second iteration through matches to get all word counts
    count = 0
    calc_freq = tfidf(words, words_per_file)
    print "Done calculating idfs"
    #print calc_freq.getIDF('tweet')
    for match in matches:
        count += 1
        print str(count) + " of " + length + " testing examples done"
        if match in CORRECT_LIST:
            train_f.write('0')
        else:
            train_f.write('0')
        train_f.write(str(get_word_counts(words_per_file[match], words, calc_freq)))
        train_f.write('\n')
    train_f.close()
    print "Results printed to file: " + str(TEST)
def get_related_doc():
    tfidf.initFromFile("tfidf.txt")
    sortedTitles = tfidf.tfidf("United States", nTitles=4)
    documents = list()
    for title in sortedTitles:
        wikisearch = wikipedia.page(title[0])
        wikicontent = wikisearch.content
        wikicontent = re.sub(r'[=]+[^=]*[=]+', '', wikicontent)
        documents.append(sent_tokenize(wikicontent))
    return sortedTitles, documents
def evaluate_keywords(job_posts, evaluated_idf, selection=0.1):
    for j in job_posts:
        tfidfs = {}
        for token in j['tokens']:
            if token in tfidfs:
                continue
            tfidfs[token] = tfidf(token, j['tokens'], evaluated_idf)
        sorted_tfidfs = sorted(tfidfs.items(), key=lambda x: -x[1])
        top_count = int(round(len(sorted_tfidfs) * selection))
        keywords = [i[0] for i in sorted_tfidfs[:top_count]]
        j['keywords'] = keywords
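# Minimal usage sketch for evaluate_keywords (illustrative data only; assumes the
# module-level tfidf(token, tokens, idf) helper used above returns a float score):
posts = [{'tokens': ['python', 'sql', 'sql', 'etl', 'airflow']}]
idf = {'python': 0.2, 'sql': 0.1, 'etl': 1.3, 'airflow': 1.7}
evaluate_keywords(posts, idf, selection=0.5)
print(posts[0]['keywords'])  # top half of unique tokens by score, e.g. ['airflow', 'etl']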
def get_keyword(data: list[Weibo], stopwords=set()) -> list[list[str]]:
    comments_flat: list[list[str]] = map(
        lambda w: reduce(lambda x, y: x + y.words, w.comments, []), data)
    idf = tfidf.idf(comments_flat)
    weibo_keywd = []
    for id, time, total, comments in data:
        all_text = reduce(lambda x, y: x + y.words, comments, [])
        weibo_keywd.append(tfidf.tfidf(all_text, idf, stopwords=stopwords))
    return weibo_keywd
def __init__(self, urls=[]):
    # Intermediate data will be handled here: urls, extracted text, terms, clusters, etc.
    # list of urls and their labels, ranking scores
    # e.g.: urls = [["nature.com", 1, 0.9], ["sport.com", 0, 0.01]]
    # list of terms and their labels, ranking scores
    # e.g.: terms = [["science", 1, 0.9], ["sport", 0, 0.02]]
    self.urls_set = set(urls)
    self.positive_urls_set = set()
    self.negative_urls_set = set()
    self.tfidf = tfidf.tfidf()
    self.memex_home = environ['MEMEX_HOME']
def make_comparisons(url):
    arts = Article.objects.all()
    if len(arts) == 0:
        content = "Like pages to get started!"
        comparisons = None
    else:
        url_list = []
        for article in arts:
            url_list.append(article.url)
        # Check if we have already liked/disliked it
        if url in url_list:
            if Article.objects.get(url=url).response == 'L':
                content = 'ALREADY LIKED (100%)'
                comparisons = None
            elif Article.objects.get(url=url).response == 'D':
                content = 'ALREADY DISLIKED (100%)'
                comparisons = None
            else:
                raise AttributeError('Somehow we managed to neither like nor dislike this webpage.')
        # What to do if we haven't
        elif url not in url_list:
            bodies_like = ''
            bodies_dislike = ''
            # Grab likes and dislikes
            for article in arts:
                if article.response == 'L':
                    bodies_like += ' '
                    bodies_like += str(unidecode(article.body_text))
                elif article.response == 'D':
                    bodies_dislike += ' '
                    bodies_dislike += str(unidecode(article.body_text))
                else:
                    raise ValueError('Unknown rating encountered. Contact admin to reset database.')
            bodies_like_words = bodies_like.split()
            bodies_dislike_words = bodies_dislike.split()
            table = tfidf.tfidf()
            table.addDocument('likes', bodies_like_words)
            table.addDocument('dislikes', bodies_dislike_words)
            compare_text = get_html_text(url)
            comparisons = table.similarities(compare_text)
            if comparisons[0][1] > comparisons[1][1]:
                content = 'LIKE ({0}% Certain)'.format(round(100. * comparisons[0][1], 2))
            elif comparisons[0][1] < comparisons[1][1]:
                content = 'DISLIKE ({0}% Certain)'.format(round(100. * comparisons[1][1], 2))
            elif comparisons[0][1] == comparisons[1][1]:
                content = 'NEUTRAL ({0}% Certain)'.format(round(100. * comparisons[0][1], 2))
            else:
                raise ValueError('You should never receive this error. '
                                 'If so please send the admin a message saying so...')
        # 'How did you get here?' Error
        else:
            raise LookupError("This URL both does not exist in the database and does not, not exist in the database. "
                              "Might be time to fall into a solipsistic coma and hope Apocalypse Now isn't real.")
    return content, comparisons
def runSearch(searchWords):
    with open("data/jokes.json") as jokeFile:
        jokeDict = {joke["title"]: joke for joke in json.load(jokeFile)}
    table = tfidf.tfidf()
    for joke in jokeDict.values():
        table.addDocument(joke["title"], extract_word_list(joke["content"]))
    for joke in top_joke_list(jokeDict, table, searchWords):
        print("*** " + joke["title"])
        print(joke["content"])
        print("=======================")
def search(query, metric):
    query = preprocess(query)
    if metric == 'tfidf':
        result = tfidf(query)
    elif metric == 'bm25':
        result = bm25(query)
    elif metric == 'fasttext':
        result = fasttext(query)
    elif metric == 'elmo':
        result = elmo(query)
    else:
        # Fail loudly instead of hitting UnboundLocalError on `result` below.
        raise ValueError('Unknown metric: {}'.format(metric))
    return result
def load_table():
    cr = csv.reader(open("lines.csv", "rb"))
    table = tfidf.tfidf()
    for r in cr:
        id = r[0]
        words = (r[1] + " " + r[2]).lower()
        words = words.replace('-', ' ')
        words = words.replace('.', ' ')
        tokens = nltk.word_tokenize(words)
        table.addDocument(id, tokens)
    return table
def tf_idf(directory, ext):
    table = tfidf.tfidf()
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(ext):
                # os.path.join avoids dropping the separator between root and file.
                filename = os.path.join(root, file)
                try:
                    print file
                    content = open(filename, 'rb').read()
                    opcodeList = content.split(",")
                    table.addDocument(file, opcodeList)
                except Exception as err:
                    print str(err)
    print table.similarities(["nop"])
def compute_tfidf_matrix(corpus_dir):
    t = tfidf.tfidf()
    for path, subdirs, files in os.walk(corpus_dir):
        for name in files:
            f = os.path.join(path, name)
            with codecs.open(f, 'rb', 'utf-8') as i:
                tokens = []
                for line in i:
                    # Skip <doc> tags
                    if not regex.match(ur'</?doc', line):
                        l_tokens = regex.split(ur'[^\p{L}]+', line.lower())
                        tokens += [token for token in l_tokens if token and token not in STOPWORDS]
                t.addDocument(f, tokens)
def tfidf_per_genre(plot_wc=False):
    data = pd.read_csv(config.dataset_dir + 'final_data.csv')
    genres_file = open(config.dataset_dir + 'unique_genres.txt', 'r')
    genre_list = [genre.strip('\n') for genre in genres_file.readlines()]
    directory = config.dataset_dir + 'output/sentiment_word_texts/'
    book_list = []
    index = tfidf.create_index(directory)
    for genre in genre_list:
        genre = genre.replace('/', ' ')
        score_dict = {}
        book_list = []
        books_of_genre = data.loc[data['genre'] == genre]
        for book in books_of_genre['filename']:
            book_list.append(book)
        try:
            tf_matrix, genre_tokens = tfidf.create_tf_matrix(directory, book_list, genre)
            for term in genre_tokens:
                score = tfidf.tfidf(term, genre, directory, index, tf_matrix)
                score_dict[term] = score
            scores_file = open(config.dataset_dir + 'output/top200_per_genre/' + genre + '.txt', 'w')
            for w in sorted(score_dict, key=score_dict.get, reverse=True):
                # '%s\n' writes one term per line (the original '%s/n' was a typo).
                scores_file.write('%s\n' % w)
            scores_file.close()
            print('success')
            if plot_wc:
                font_path = config.dataset_dir + 'Open_Sans_Condensed/OpenSansCondensed-Light.ttf'
                create_wordcloud(score_dict, genre)
        except ZeroDivisionError:
            continue
        except ValueError:
            continue
def cal_similarity(mail):
    tmp_tfidf = tfidf()
    similarity = []
    title_sim = []
    for i in mail.text:
        tmp_tfidf.addDocument(i.index, i.remove_stop_ver)
    for i in mail.text:
        tmp_vector = []
        tmp = tmp_tfidf.similarities(i.remove_stop_ver)
        for j in tmp:
            tmp_vector.append(j[1])
        similarity.append(tmp_vector)
    #print(similarity)
    tmp_ti = tmp_tfidf.similarities(mail.remove_stop_ver_title)
    for i in tmp_ti:
        title_sim.append(i[1])
    return numpy.asarray(similarity), numpy.asarray(title_sim)
def process_files():
    words_per_file = dict()
    matches = []
    for root, dirnames, filenames in os.walk(DIR_NAME):
        for filename in fnmatch.filter(filenames, MATCH):
            matches.append(os.path.join(root, filename))
    words = set()
    length = str(len(matches))
    count = 0
    user_counter = 0
    # first iteration through matches to get all words
    for match in matches:
        count += 1
        print str(count) + " of " + length + " training example preprocessing done"
        #if 'tweet' in get_all_words(match):
        #    user_counter += 1
        #    print user_counter
        words_per_file[match] = get_all_words(match)
        words = words.union(words_per_file[match])
    print "preliminary processing done"
    train_f = open(TRAIN, 'w')
    # second iteration through matches to get all word counts
    pattern = MATCH.replace('*', '')
    count = 0
    calc_freq = tfidf(words, words_per_file)
    print "Done calculating idfs"
    #print calc_freq.getIDF('tweet')
    for match in matches:
        count += 1
        print str(count) + " of " + length + " training examples done"
        if CORRECT_PATTERN in match:
            #if match in CORRECT:
            #if match.replace(DIR_NAME+'/'+pattern,'') in CORRECT:
            #print calc_freq.term_freq('tweet', words_per_file[match])
            train_f.write('+1')
            #print len(words_per_file[match])
        else:
            train_f.write('-1')
        train_f.write(str(get_word_counts(words_per_file[match], words, calc_freq)))
        train_f.write('\n')
    train_f.close()
    print "Results printed to file: " + str(TRAIN)
def makeVector(self, wordString, weighting):
    """ @pre: unique(vectorIndex) """
    # Initialise vector with 0's
    vector = [0] * len(self.vectorKeywordIndex)
    wordList = self.parser.tokenise(wordString)
    wordList = self.parser.removeStopWords(wordList)
    documentString = " ".join(wordList)
    blob = tb(documentString)
    ### tf weighting
    for word in wordList:
        if weighting == 'tf':
            vector[self.vectorKeywordIndex[word]] += 1 / len(wordList)  # Use simple Term Count Model
            # vector[self.vectorKeywordIndex[word]] = tfidf.tf(word, blob)
        elif weighting == 'tfidf':
            vector[self.vectorKeywordIndex[word]] = tfidf.tfidf(word, blob, self.blobList)
    return vector
def generate_weights(documents, lexicon):
    weights = dict()
    m = tfidf()
    print('Adding documents for TF-IDF...')
    for i, document in enumerate(documents):
        m.addDocument(i, document['words']['title'] + document['words']['body'])
        weights[i] = dict()
    print('Generating weight scores for words; this WILL take time...')
    for word in lexicon['title'] & lexicon['body']:
        m.get_similarities(word, weights, 'smooth', 1.25)
    for word in lexicon['title'] - lexicon['body']:
        m.get_similarities(word, weights, 'smooth', 1.1)
    for word in lexicon['body'] - lexicon['title']:
        m.get_similarities(word, weights, 'smooth')
    return weights
def __call__(self, docname, values):
    terms = []
    fd = nltk.probability.FreqDist()
    for (term, (inTitle, position), n, N, d) in values:
        #relativePos = float(position)/m
        term_str = ' '.join(term)
        if inTitle:
            terms.append(term_str)
        else:
            score = tfidf.tfidf(n, N, d, self.doccount)
            #score *= relative_pos
            fd.inc(term_str, score)
    # top upper_fraction of terms
    n = int(self.upper_fraction * len(fd))
    terms += fd.keys()[:n]
    yield docname, separator.join(terms)
def jump_to_relevant_paragraph(question, html):
    paragraphs = []
    table = tfidf.tfidf()
    input_soup = bs4.BeautifulSoup(html)
    for index, paragraph in enumerate(input_soup.find_all("p")):
        paragraphs.append(paragraph.text)
        words = extract_words(paragraph.text)
        table.addDocument(index, words)
    question_words = extract_words(question)
    most_relevant_paragraph_index = max(table.similarities(question_words),
                                        key=lambda (paragraph_index, score): score)[0]
    output = []
    for paragraph in paragraphs[most_relevant_paragraph_index:]:
        output.append("".join(["<p>", paragraph, "</p>"]))
    return "".join(output)
def compareDocToCenter(self, document, wordDFMap, numDocs, meanDocLen):
    if document.centerScore > 0.0:
        return
    if self.numCenterWords > self.queryLength:
        self.numCenterWords = self.queryLength
    tfIdf = tfidf(wordDFMap, numDocs, meanDocLen)
    document.centerScore = 0.0
    i = 0
    wordsUsed = 0
    while wordsUsed < self.numCenterWords:
        word = self.sortedTF[i][0]
        count = self.sortedTF[i][1]
        i += 1
        if tfIdf.isStopWord(word):
            continue
        wordsUsed += 1
        if document.wordsMap.has_key(word):
            document.centerScore += tfIdf.getTfIdf(count, word, document)
    self.combinedCenterSim += document.centerScore
def bert_answers(question, no_answers):
    # Insert question - the tf-idf operation is performed first and the top n BERT answers are returned.
    pars = tfidf(question, part_noctable)
    # Select TF-IDF results above threshold.
    top_results = pars.loc[pars['Match Percentage'] >= 10]
    top_results = top_results['Paragraph']
    # Crop results to no_answers, unless top_results holds fewer than that number.
    if len(top_results) > no_answers:
        crop_results = top_results.head(no_answers)
    else:
        crop_results = top_results
    #print(crop_results)
    for index, row in crop_results.iteritems():
        print("Bert Answer {}".format(index + 1))
        print(row)
        print(bert(question, row))
def store_tfidf(self, title, origin, url, text, amount, lang, publish_tsd):
    self.origin = origin
    if len(text) > 0:
        sentences = tfidf(text, amount, lang)
        if len(sentences) == amount:
            article = models.Article(title=title,
                                     origin=origin,
                                     insert_tsd=timezone.now(),
                                     publish_tsd=publish_tsd,
                                     original_url=url,
                                     bp1=sentences[0],
                                     bp2=sentences[1],
                                     bp3=sentences[2],
                                     bp4=sentences[3],
                                     bp5=sentences[4])
            article.save()
            self.processed_articles += 1
            self.logger.info(
                f'{self.processed_articles:03d} / {self.max_articles:03d}')
def build_tfidf(foreignBrand):
    datas = []
    rs = []
    merge_to_series = False
    (brand, vendor, series) = foreignBrand.split(u"#")
    #print brand.encode("utf-8") + ":" + maker.encode("utf-8")
    res = external_spec_col.find({
        'site': sys.argv[1],
        'brand.name': brand,
        "vendor.name": vendor,
        "series.name": series
    })
    for r in res:
        if r['model']['key'] == '0':
            merge_to_series = True
        data = [
            r['model']['id'],
            r['brand']['name'],
            r['vendor']['name'],
            r['series']['name'],
            r['model']['name'],
            r['model'].get('year', '') or ''
        ]
        datas.append(data)
        rs.append(r)
    if merge_to_series:
        data = datas[0]
        k = "#".join(data)
        return k
    #engine_dict = {}
    #for i in xrange(len(rs)):
    #    e = getExternalEngine(rs[i])
    #if len(datas) == 0:
    #    #print foreignBrand
    #    return None
    tf = tfidf.tfidf()
    for data in datas:
        k = "#".join(data)
        n = "#".join(data[1:])
        c = word_bag(n)
        tf.addDocument(k, c)
    return tf
def submit_selected_urls(self, positive, negative):
    # Perform ranking and diversifying on all urls with regard to the positive urls
    #
    # Args:
    #   labeled_urls: a list of pairs <url, label>. Label 1 means positive and 0 means negative.
    # Returns:
    #   urls: list of urls with ranking scores

    # Test new positive and negative examples with existing classifier
    # If accuracy above threshold classify pages
    # Ranking
    # Diversification
    documents = {}
    other = []
    all_docs = get_bag_of_words(list(self.urls_set))
    for url in positive:
        if url in all_docs:
            self.positive_urls_set.add(url)
            self.negative_urls_set.discard(url)
    for url in negative:
        if url in all_docs:
            self.negative_urls_set.add(url)
            self.positive_urls_set.discard(url)
    for url in all_docs.keys():
        content = all_docs[url]
        if (len(self.negative_urls_set) == 0) or (url not in self.negative_urls_set):
            documents[url] = content
            if url not in self.positive_urls_set:
                other.append(url)
    self.tfidf = tfidf.tfidf(documents)
    chdir(self.memex_home + '/seed_crawler/ranking')
    ranker = rank.rank()
    [ranked_urls, scores] = ranker.results(self.tfidf, self.positive_urls_set, other)
    return [ranked_urls, scores]  # classified, ranked, diversified
def set_up_table(self, k):
    table = tfidf.tfidf()
    size = len(self.clusters['tier' + str(k)])
    i = 0
    keys = self.clusters['tier' + str(k)].keys()
    self.new_pics = {}
    while i < size:
        pics = self.clusters['tier' + str(k)][keys[i]]
        id = ''
        tokens = []
        j = 0
        while j < len(pics):
            id = id + pics[j]
            tokens = tokens + self.pics[pics[j]]['tokens']
            j = j + 1
        self.new_pics[id] = tokens
        i = i + 1
    for id in self.new_pics.keys():
        table.addDocument(id, self.new_pics[id])
    return table
def generate_weights(documents, lexicon):
    """
    function: generate_weights
    --------------------------
    perform tf-idf to generate importance scores for words in documents
    :param documents: list of documents to use in calculations
    :returns: dictionary of dictionaries: {"id_" : {"word" : score,...}}
    """
    # weight = { 'document' : { 'word' : score,... },... }
    weights = dict()
    m = tfidf()
    print('Adding documents for TF-IDF...')
    for i, document in enumerate(documents):
        m.addDocument(i, document['words']['title'] + document['words']['body'])
        weights[i] = dict()
    # generate dictionary of { "word", "score" } pairs for each document
    print('Generating weight scores for words; this WILL take time...')
    for word in lexicon['title'] | lexicon['body']:
        # UNCOMMENT FOR SANITY
        # print('Generating weights for word:', word)
        m.get_similarities(word, weights)
    return weights
with open(f, "r") as handle:
    data = json.loads(handle.read())
    for j in data:
        cmd = j["name"]
        desc = j["description"]
        flags = {
            (cmd, flag["name"]): flag["description"]
            for flag in j["optionDesc"]
        }
        cmd_docs[cmd] = tokenize(desc)
        for k, desc in flags.items():
            descs[k] = desc
            toks = tokenize(desc)
            cmd_docs[cmd] += toks
            flag_docs[k] = toks

print("Indexing...")
import tfidf
engine = tfidf.tfidf()
flag_engines = {}
for cmd, toks in cmd_docs.items():
    engine.addDocument(cmd, toks)
    flag_engines[cmd] = tfidf.tfidf()
    for (cmdd, flag), flag_toks in flag_docs.items():
        if cmdd == cmd:
            flag_engines[cmd].addDocument(flag, flag_toks)

print("Running...")
while True:
    try:
        inp = raw_input("> ")
    except EOFError:
        break
    toks = tokenize(inp)
def process_files(in_dir):
    words_per_file = dict()
    t_words_per_file = dict()
    '''for root, dirnames, filenames in os.walk(DIR_NAME):
        for filename in fnmatch.filter(filenames, MATCH):
            matches.append(os.path.join(root, filename))
    '''
    global WORDS, TRAIN
    WORDS = in_dir + 'words'
    TEST = in_dir + 'test.dat'
    ## SHOULD TFIDF be calc adding in test_data?
    TRAIN_SET = []
    for line in open(in_dir + 'train_corr', 'r').readlines():
        if line.replace('\n', '') != '':
            TRAIN_SET.append(line.replace('\n', ''))
    for line in open(in_dir + 'train_wrong', 'r').readlines():
        if line.replace('\n', '') != '':
            TRAIN_SET.append(line.replace('\n', ''))
    for t_match in TRAIN_SET:
        t_words_per_file[t_match] = get_all_words(t_match)
    CORRECT_LIST = []
    matches = []
    for line in open(in_dir + 'test_corr', 'r').readlines():
        if line.replace('\n', '') != '':
            CORRECT_LIST.append(line.replace('\n', ''))
            matches.append(line.replace('\n', ''))
    for line in open(in_dir + 'test_wrong', 'r').readlines():
        if line.replace('\n', '') != '':
            matches.append(line.replace('\n', ''))
    words = get_all_words_file()
    length = str(len(matches))
    count = 0
    # first iteration through matches to get all words
    for match in matches:
        count += 1
        print str(count) + " of " + length + " test example preprocessing done"
        words_per_file[match] = get_all_words(match)
    print "preliminary processing done"
    train_f = open(TEST, 'w')
    # second iteration through matches to get all word counts
    count = 0
    # TODO: should this be t_words_per_file, words_per_file or the combination???
    calc_freq = tfidf(words, t_words_per_file)
    print "Done calculating idfs"
    #print "PARAM " + str(calc_freq.getIDF('@param'))
    for match in matches:
        count += 1
        print str(count) + " of " + length + " testing examples done"
        if match in CORRECT_LIST:
            train_f.write('+1')
        else:
            train_f.write('-1')
        train_f.write(str(get_word_counts(words_per_file[match], words, calc_freq)))
        train_f.write('\n')
    train_f.close()
    print "Results printed to file: " + str(TEST)
def pre_proc_code(run_pre_proc):
    # grab all corpus files
    # pre-write the tfidfs to file so they can be read from file
    if not run_pre_proc:
        matches = []
        words_per_file = dict()
        for root, dirnames, filenames in os.walk(CORPUS_1):
            for filename in fnmatch.filter(filenames, "*.java_*"):
                matches.append(os.path.join(root, filename))
        for root, dirnames, filenames in os.walk(CORPUS_2):
            for filename in fnmatch.filter(filenames, "*.java_*"):
                matches.append(os.path.join(root, filename))
        words = set()
        for match in matches:
            print match
            words_per_file[match] = get_all_words(match)
            words = words.union(words_per_file[match])
        '''
        print "Writing words per file to file tmp/words_per_file.csv'"
        f_words_pf = open(TMP_DIR+'words_per_file_tmp.csv', 'w')
        for w in words_per_file:
            f_words_pf.write(w + ',' + str(words_per_file[w]) + '\n')
        f_words_pf.close()
        print "Writing all words to file tmp/words_tmp.csv'"
        f_words = open(TMP_DIR+'words_tmp.csv', 'w')
        for word in words:
            f_words.write(word + '\n')
        f_words.close()
        '''
        print "calculating idfs"
        calc_freq = tfidf(words, words_per_file)
        #print "actually calculating idfs"
        idfs = calc_freq.getAllIdfs()
        print "Writing idf values to file tmp/idfs_tmp.csv'"
        f_idfs = open(TMP_DIR + 'idfs_tmp.csv', 'w')
        for key in idfs:
            if key != '':
                f_idfs.write(key + ',' + str(idfs[key]) + '\n')
        f_idfs.close()
        # calculate feature vectors
        feature_vectors_dict = dict()
        f_feature_vecs = open(TMP_DIR + '/feature_vecs_tmp.csv', 'w')
        for match in matches:
            wrd_cnts = str(get_word_counts(words_per_file[match], words, calc_freq))
            f_feature_vecs.write(match + ',' + wrd_cnts + '\n')
            feature_vectors_dict[match] = wrd_cnts
    else:
        print "Reading values from files in tmp/ directory"
        words = set()
        '''
        f_words = open(TMP_DIR+'words_tmp.csv', 'r')
        for word in f_words.readlines():
            words.add(word.replace('\n',''))
        f_words.close()
        '''
        idfs = dict()
        f_idfs = open(TMP_DIR + 'idfs_tmp.csv', 'r')
        for line in f_idfs.readlines():
            comma = line.split(',')
            idfs[comma[0]] = float((comma[1]).replace('\n', ''))
            #print comma[0] + ',' + comma[1].replace('\n','')
        f_idfs.close()
        feature_vectors_dict = dict()
        with open(TMP_DIR + 'feature_vecs_tmp.csv', 'r') as csvfile:
            reader = csv.reader(csvfile)
            for row in reader:
                feature_vectors_dict[row[0]] = (row[1]).replace('\n', '')
                #print (row[0] + ',' + row[1])
        csvfile.close()
        '''
        words_per_file = dict()
        with open(TMP_DIR+'words_per_file_tmp.csv', 'r') as csvfile:
            reader = csv.reader(csvfile)
            for row in reader:
                words_per_file[row[0]] = eval(row[1])
        csvfile.close()
        '''
    # return feature vectors (computed or read from file) and idfs
    return feature_vectors_dict, idfs
from bs4 import BeautifulSoup
soup = BeautifulSoup(open('cwec_v2.8.xml'), 'xml')
from nltk.stem import WordNetLemmatizer
st = WordNetLemmatizer()
stemmer = SnowballStemmer("english")
print('*' * 50)
i = 0
z = 0
stopwords = nltk.corpus.stopwords.words('english')
g = open('Data2/words.txt', 'w+')
g.close()
for x in soup.Weakness_Catalog.Weaknesses.find_all('Weakness'):
    table = tfidf.tfidf()
    name = x.attrs['Name']
    # Get description summary and format text.
    s = x.Description.Description_Summary.get_text()
    s = s.lower()
    s = "".join(c for c in s if c not in string.punctuation)
    a = s.split()
    a = [w for w in a if w.lower() not in stopwords]
    a = [stemmer.stem(w) for w in a]
    c = Counter(a)
    # See if there is an extended description before grabbing it.
    if hasattr(x.Description, 'Extended_Description'):
        if x.Description.Extended_Description is None:
            z += 1
            p = ""
        else:
def rocchio():
    all_documents, term_dictionary, doc_lengths = textmanip.get_documents()
    all_queries = textmanip.get_queries()
    avg_doc_len = numpy.average(doc_lengths)
    # calculate the tfidf weighted sum for each document-query pair
    query_doc_similar_list = tfidf.tfidf(term_dictionary, all_queries, all_documents, doc_lengths)

    # 1. Get top relevant docs
    most_rel_docs = list()
    for q, query in enumerate(all_queries):
        if len(all_documents) < 10:
            rel_docs = 2 if len(all_documents) >= 2 else len(all_documents)
        # sort descending, then keep the top rel_docs entries
        # (the original passed reverse=True to append, which is invalid)
        most_rel_docs.append(sorted(query_doc_similar_list[q], reverse=True)[:rel_docs])

    # 2. Calculate all the TFIDF of the words in the relevant all_documents
    #    Add the values together: for every word, add up the weight given to it by each document
    #    Get the 10 most important words
    #    Add those words to the query
    #    Run TFIDF again with new query
    query_weight_list = list()
    for docList in most_rel_docs:
        doc_weights = list()
        for doc in docList:
            currentDocDict = all_documents[doc]
            word_weights = dict()
            for word in term_dictionary:
                word_weights[word] = textmanip.tfidfWordWeighting(word, doc, term_dictionary,
                                                                  all_documents, doc_lengths, avg_doc_len)
            doc_weights.append(word_weights)
        query_weight_list.append(doc_weights)

    # average out the values obtained for the weights across all docs (per query)
    # -> you get one weight list per query
    avg_query_w_list = list()
    for i in range(len(all_queries)):
        averageWordWeights = dict()
        for word in term_dictionary:
            addedWeight = 0
            for j in range(len(most_rel_docs[i])):
                weightDict = query_weight_list[i][j]
                addedWeight += weightDict[word]
            averageWordWeights[word] = addedWeight / (len(most_rel_docs[i]) * 1.00)
        avg_query_w_list.append(averageWordWeights)

    rocchio_q = copy.deepcopy(all_queries)
    for i in range(len(rocchio_q)):
        # get the max weighted words in the avg_query_w_list[i] dictionary
        sortedListOfWords = sorted(avg_query_w_list[i].items(), key=operator.itemgetter(1), reverse=True)
        wordsInTheRelevantDocs = list()
        for j in range(len(sortedListOfWords)):
            wordsInTheRelevantDocs.append(sortedListOfWords[j][0])
        for word in rocchio_q[i]:
            if word not in wordsInTheRelevantDocs:
                rocchio_q[i][word] = A * rocchio_q[i][word]
        for j in range(len(sortedListOfWords)):
            wordToAdd = sortedListOfWords[j][0]
            weightToAdd = sortedListOfWords[j][1]
            if wordToAdd in rocchio_q[i]:
                rocchio_q[i][wordToAdd] = (A * rocchio_q[i][wordToAdd]) + (B * weightToAdd)
            else:
                rocchio_q[i][wordToAdd] = B * weightToAdd

    queryDocSimilaryList = tfidf.tfidf(term_dictionary, rocchio_q, all_documents, doc_lengths)
    textmanip.outputResults(queryDocSimilaryList, 'best.top')
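# The query-update loop above follows the standard Rocchio form without the
# negative-feedback term: q_new[w] = A * q_orig[w] + B * avg_weight_in_relevant_docs[w],
# where A and B are the module-level mixing constants used in the code.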