def Feedback(self, searchList):
    queryVector = self.buildQueryVector(searchList)
    '''
    self.vectorIDF = [float(2048.0/x) for x in self.vectorIDF]
    self.vectorIDF = [float(math.log10(x)) for x in self.vectorIDF]
    self.tfidf = [map(lambda (a,b):a*b,zip(self.vectorIDF, documentVector))
                  for documentVector in self.documentVectors]
    '''
    ratings = [util.cosine(queryVector, documentVector) for documentVector in self.tfidf]
    maxone = 0
    targetone = 0
    for i in range(len(ratings)):
        if ratings[i] > maxone:
            maxone = ratings[i]
            targetone = i
    newqueryVector = []
    for i in range(len(queryVector)):
        newqueryVector.append(queryVector[i] + 0.5 * self.documentVectors[targetone][i])
    #print(queryVector)
    #print(newqueryVector)
    ratings2 = [util.cosine(newqueryVector, documentVector) for documentVector in self.tfidf]
    #print(ratings)
    return ratings2
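
# Illustration only (not part of the original code): the re-weighting that Feedback
# applies, shown on toy vectors. The 0.5 weight and the use of the single top-ranked
# document mirror the method above; the numbers are made up.
query = [1.0, 0.0, 2.0]
top_doc = [0.0, 4.0, 2.0]          # document with the highest cosine score
new_query = [q + 0.5 * d for q, d in zip(query, top_doc)]
print(new_query)                   # [1.0, 2.0, 3.0]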
def feedbacksearch(self, searchList, wordString, flag):
    queryVector = self.buildQueryVector(searchList, flag)
    feedback = self.makeVector(wordString, flag)
    # new query = original query + 0.5 * feedback vector
    for index in range(0, len(queryVector)):
        queryVector[index] = float(queryVector[index] + feedback[index] * 0.5)
    ratings = [util.cosine(queryVector, documentVector) for documentVector in self.documentVectors]
    return ratings
def baseline_average_cos(article_num, num_times, print_out=True, equal_prob=True):
    '''
    Generates two random actual articles of source article article_num
    Computes the cosine similarity
    Does this num_times and outputs the lowest cos, highest, and average
    '''
    lowest = float("inf")
    highest = float("-inf")
    total = 0.0
    for _ in xrange(num_times):
        if equal_prob:
            a1 = source_articles.spin_articles(article_num)[0]
            a2 = source_articles.spin_articles(article_num)[0]
        else:
            a1, a2 = source_articles.spin_articles(article_num, 2)
        cos_sim = cosine(a1, a2)
        lowest = min(lowest, cos_sim)
        highest = max(highest, cos_sim)
        total += cos_sim
    average = total / num_times
    if print_out:
        print 'Ran {0} times for article {1}'.format(num_times, article_num)
        if equal_prob:
            print 'Generated with equal probability'
        else:
            print 'Generated using heuristic to produce low cosine'
        print 'Lowest : {0}\nHighest : {1}\nAverage : {2}\n'.format(lowest, highest, average)
def search(self, searchList):
    """ search for documents that match based on a list of terms """
    queryVector = self.buildQueryVector(searchList)
    ratings = [util.cosine(queryVector, documentVector) for documentVector in self.documentVectors]
    #ratings.sort(reverse=True)
    return ratings
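
# The snippets above and below all delegate to a util.cosine helper whose source is not
# shown here. A minimal sketch of what such a helper presumably computes (cosine
# similarity over plain lists of term weights); the real util module may differ.
import math

def cosine_sketch(u, v):
    """Cosine similarity: dot(u, v) / (|u| * |v|); 0.0 if either vector is all zeros."""
    dot = sum(a * b for a, b in zip(u, v))
    norm_u = math.sqrt(sum(a * a for a in u))
    norm_v = math.sqrt(sum(b * b for b in v))
    if norm_u == 0 or norm_v == 0:
        return 0.0
    return dot / (norm_u * norm_v)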
def related(self, documentId):
    """ find documents that are related to the document indexed by passed Id within the document Vectors"""
    rating_dic = {}
    for key, value in self.documentVectors.items():
        rating = util.cosine(self.documentVectors[documentId], value)
        rating_dic[key] = rating
    return rating_dic
def related(self, documentId):
    """ find documents that are related to the document indexed by passed Id within the document Vectors"""
    ratings = [util.cosine(self.documentVectors[documentId], documentVector)
               for documentVector in self.documentVectors]
    return ratings
def search(self, searchList, compare, flag):
    queryVector = self.buildQueryVector(searchList, flag)
    if compare == "cos":
        ratings = [util.cosine(queryVector, documentVector) for documentVector in self.documentVectors]
    elif compare == "dis":
        ratings = [util.Euclidean(queryVector, documentVector) for documentVector in self.documentVectors]
    return ratings
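
# Several variants above and below also rank by Euclidean distance, under different helper
# names (util.Euclidean, util.euclidean, util.euclid, util.euc_distance,
# util.Euclidean_Distance). A minimal sketch of what such a helper presumably computes;
# note that smaller distances mean closer matches, hence the ascending sorts used later.
import math

def euclidean_sketch(u, v):
    """Euclidean distance between two equal-length weight vectors."""
    return math.sqrt(sum((a - b) ** 2 for a, b in zip(u, v)))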
def related(self, documentId):
    """ find documents that are related to the document indexed by passed Id within the document Vectors"""
    ratings = {}
    for key, value in self.documentVectors.items():
        rating = util.cosine(self.documentVectors[documentId], value)
        #ratings.sort(reverse=True)
        ratings[key] = rating
    return ratings
def TF_Cosine(self, query):
    queryTFVector = self.makeTfVector(query)
    tf_cos = []
    for documentTFVector in self.documentTFVectors:
        tf_cos.append(util.cosine(queryTFVector, documentTFVector))
    return tf_cos
def searchTFIDFWithCosine(self, searchList):
    """ search for documents that match based on a list of terms """
    queryVector = self.buildTFIDFQueryVector(searchList)
    ratings = [util.cosine(queryVector, documentVector) for documentVector in self.TFIDFVectors]
    return ratings
def relevence_search(self, searchVector, formula="cosine", weighting='tf'):
    ratings = {}
    for key, value in self.documentVectors.items():
        if formula == "cosine":
            rating = util.cosine(searchVector, value)
        elif formula == "euclidean":
            rating = util.euclidean(searchVector, value)
        ratings[key] = rating
    ratings = {k: v for k, v in sorted(ratings.items(), key=lambda item: item[1], reverse=True)}
    return ratings
def get_cosines(src_articles):
    positive_cosines = []
    negative_cosines = []
    while len(positive_cosines) < NUM_POSITIVES:
        num = random.randint(0, src_articles.count - 1)
        a1, a2 = src_articles.spin_dissimilar_articles(num, 2)
        positive_cosines.append(cosine(" ".join(a1), " ".join(a2)))
    while len(negative_cosines) < NUM_NEGATIVES:
        num1 = random.randint(0, src_articles.count - 1)
        similar_articles = list(src_articles.get_very_similar_articles(num1))
        if not similar_articles:
            continue
        num2 = random.choice(similar_articles)
        a1 = " ".join(src_articles.spin_articles(num1)[0])
        a2 = " ".join(src_articles.spin_articles(num2)[0])
        negative_cosines.append(cosine(a1, a2))
    return positive_cosines, negative_cosines
def searchTf(self, query):
    """ search for documents that match based on a list of terms """
    queryVector = self.makeTfVector(query)
    tf_cos = [util.cosine(queryVector, documentVector) for documentVector in self.tfVectors]
    tf_dist = [util.euclidean(queryVector, documentVector) for documentVector in self.tfVectors]
    return [tf_cos, tf_dist]
def search(self, searchList, formula="cosine", weighting="tf"):
    """ search for documents that match based on a list of terms """
    ratings = {}
    queryVector = self.buildQueryVector(searchList, weighting)
    for key, value in self.documentVectors.items():
        if formula == "cosine":
            rating = util.cosine(queryVector, value)
        elif formula == "euclidean":
            rating = util.euclidean(queryVector, value)
        ratings[key] = rating
    ratings = {k: v for k, v in sorted(ratings.items(), key=lambda item: item[1], reverse=True)}
    return ratings
def search_nltk(self, searchList, method="1"):
    rating_dic = {}
    for key, value in self.documentVectors.items():
        rating_dic[key] = util.cosine(searchList, value)
    result = {
        k: v
        for k, v in sorted(rating_dic.items(), key=lambda item: item[1], reverse=True)
    }
    return list(result.items())[:10]
def tf_idf_search(self, searchList):
    queryVector = self.buildQueryVector(searchList)
    # print (queryVector)
    self.tfidVectors = util.tf_idf(self.documentVectors)
    # print self.tfidVectors
    ratings = [util.cosine(queryVector, documentVector) for documentVector in self.tfidVectors]
    # ratings = [util.cosine(queryVector, util.tf_idf(documentVectors)) for documentVector in self.documentVectors]
    # print(ratings)
    return ratings
def f_search(self, searchList, doc, way):
    """ search for documents that match based on a list of terms """
    queryVector = self.buildQueryVector(searchList)
    fVector = self.makeTagVector(doc)
    for i in range(0, len(queryVector)):
        queryVector[i] += fVector[i]
    if way == "cosine":
        ratings = [util.cosine(queryVector, documentVector) for documentVector in self.documentVectors]
    elif way == "euclid":
        ratings = [util.euclid(queryVector, documentVector) for documentVector in self.documentVectors]
    #ratings.sort(reverse=True)
    return ratings
def baseline_cos_different_articles(a1_num, a2_num, num_times):
    lowest = float("inf")
    highest = float("-inf")
    total = 0.0
    for _ in xrange(num_times):
        a1 = source_articles.spin_articles(a1_num)[0]
        a2 = source_articles.spin_articles(a2_num)[0]
        cos_sim = cosine(a1, a2)
        lowest = min(lowest, cos_sim)
        highest = max(highest, cos_sim)
        total += cos_sim
    average = total / num_times
    return lowest, highest, average
def search1and2(self, searchList, compare, flag):
    """ search for documents that match based on a list of terms """
    queryVector = self.buildQueryVector(searchList)
    if compare == "cos":
        ratings = [util.cosine(queryVector, documentVector) for documentVector in self.documentVectors]
    elif compare == "el":
        ratings = [util.Euclidean(queryVector, documentVector) for documentVector in self.documentVectors]
    return self.Sort(ratings, flag)
def search(self, searchList, method="0"):
    """ search for documents that match based on a list of terms """
    rating_dic = {}
    queryVector = self.buildQueryVector(searchList, method)
    for key, value in self.documentVectors.items():
        rating_dic[key] = util.cosine(queryVector, value)
    result = {
        k: v
        for k, v in sorted(rating_dic.items(), key=lambda item: item[1], reverse=True)
    }
    return list(result.items())[:10]
def search(self, relevanceType):
    """ search for documents that match based on a list of terms """
    self.queryVector = self.buildQueryVector(self.queryList)
    if relevanceType == 'cs':
        ratings = [util.cosine(self.queryVector, documentVector) for documentVector in self.documentVectors]
    elif relevanceType == 'eu':
        ratings = [util.euclidean(self.queryVector, documentVector) for documentVector in self.documentVectors]
    return ratings
def tfidf(self, queryVector, flag):
    # reweight the query: tf * log10(N / df), where 7034 is presumably the collection size
    for i in range(len(queryVector)):
        if self.idf[i] > 0:
            queryVector[i] = queryVector[i] * math.log10(7034.0 / self.idf[i])
    if flag == "cos":
        ratings = [util.cosine(queryVector, documentVector) for documentVector in self.documentVectors]
    elif flag == "el":
        ratings = [util.Euclidean(queryVector, documentVector) for documentVector in self.documentVectors]
    return ratings
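
# Illustration only: the query re-weighting applied by tfidf() above, i.e.
# tf * log10(N / df) with N assumed to be the collection size (7034 in that snippet).
# The numbers below are made up.
import math

N = 7034
tf = 3                                    # raw term frequency in the query
df = 10                                   # documents containing the term (self.idf[i] above)
weight = tf * math.log10(float(N) / df)   # 3 * log10(703.4) ~= 8.54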
def searchRelevantFeedback(self, QueryFeedback):
    # first element is query, second element is relevant feedback
    query = self.buildTFIDFQueryVector(QueryFeedback[0])
    feedback = self.buildTFIDFQueryVector(QueryFeedback[1])
    NewQuery = [0] * len(query)
    for i in range(len(query)):
        NewQuery[i] = 1 * query[i] + 0.5 * feedback[i]
    ratings = [util.cosine(NewQuery, documentVector) for documentVector in self.TFIDFVectors]
    return ratings
def search(self, searchList, way, idf=False):
    """ search for documents that match based on a list of terms """
    queryVector = self.buildQueryVector(searchList)
    if idf:
        queryVector = self.tfidf(queryVector)
    # print(queryVector)
    if way == 'cos':
        ratings = [util.cosine(queryVector, documentVector) for documentVector in self.documentVectors]
    else:
        ratings = [util.euc_distance(queryVector, documentVector) for documentVector in self.documentVectors]
    #ratings.sort(reverse=True)
    return ratings
def feedback(self, first_doc):
    '''
    get first result from #1-3, then get noun and verb to make new query
    to get relevance feedback
    '''
    text = nltk.word_tokenize(first_doc)
    pos_tagged = nltk.pos_tag(text)
    feedbackQueryList = [
        e[0] for e in filter(lambda x: x[1][:2] == 'NN' or x[1][:2] == 'VB', pos_tagged)
    ]
    feedbackQueryVector = self.buildQueryVector(feedbackQueryList)
    if len(self.queryVector) == 0:
        self.queryVector = self.buildQueryVector(self.queryList)
    queryVector = [e + 0.5 * a for e, a in zip(self.queryVector, feedbackQueryVector)]
    ratings = [util.cosine(queryVector, documentVector) for documentVector in self.documentVectors]
    return ratings
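
# Illustration only: how the NN/VB prefix filter in feedback() behaves on a small
# sentence (reusing the example sentence from the comment further below). Requires the
# nltk 'punkt' and 'averaged_perceptron_tagger' data; exact tags can vary by tagger version.
import nltk

tokens = nltk.word_tokenize("Jimmy shares songs via the computer network.")
tagged = nltk.pos_tag(tokens)
# e.g. [('Jimmy', 'NNP'), ('shares', 'VBZ'), ('songs', 'NNS'), ('via', 'IN'), ...]
nouns_and_verbs = [word for word, tag in tagged if tag[:2] in ('NN', 'VB')]
# keeps roughly ['Jimmy', 'shares', 'songs', 'computer', 'network'] as the feedback terms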
def get_comparison_stats(self, article1, article2, classified_article1, classified_article2, do_print=True):
    '''
    returns the cosine similarity between the two articles, between their classified
    versions, and between every article / classified-article cross pair
    '''
    a1_a2 = cosine(" ".join(article1), " ".join(article2))
    ca1_ca2 = cosine(" ".join(classified_article1), " ".join(classified_article2))
    a1_ca1 = cosine(" ".join(classified_article1), " ".join(article1))
    a2_ca2 = cosine(" ".join(classified_article2), " ".join(article2))
    a1_ca2 = cosine(" ".join(classified_article2), " ".join(article1))
    a2_ca1 = cosine(" ".join(classified_article1), " ".join(article2))
    if do_print:
        print "COSINE OF ARTICLES: {0}".format(a1_a2)
        print "COSINE OF CLASSIFIED ARTICLES: {0}".format(ca1_ca2)
        print "cosine of A1 and classified A1: {0}".format(a1_ca1)
        print "cosine of A2 and classified A2: {0}".format(a2_ca2)
        print "cosine of A1 and classified A2: {0}".format(a1_ca2)
        print "cosine of A2 and classified A1: {0}".format(a2_ca1)
        print "ratio: {0}".format(((a2_ca1 / a1_ca1) + (a1_ca2 / a2_ca2)) / 2)
    return (a1_a2, ca1_ca2, a1_ca1, a2_ca2, a1_ca2, a2_ca1)
print('TF Weighting + Cosine Similarity:')
print_top5(top5_tf_cos)
print('TF Weighting + Euclidean Distance:')
print_top5(top5_tf_dist)
print('TF-IDF Weighting + Cosine Similarity:')
print_top5(top5_tfidf_cos)
print('TF-IDF Weighting + Euclidean Distance:')
print_top5(top5_tfidf_dist)

# for Q2 Relevance feedback
newSearchIndex = indexList.index(top5_tfidf_cos[0][0])
documents = doc[newSearchIndex]
feedBackVector = vectorSpace.makeVecRelevance(documents)
ansVec = searchVector + feedBackVector
finalScore = [util.cosine(ansVec, docVec) for docVec in vectorSpace.tfidfVec]
Q2 = sorted(list(zip(indexList, finalScore)), reverse=True, key=itemgetter(1))[:5]
print('Relevance Feedback + TF-IDF Weighting + Cosine Similarity:')
print_top5(Q2)

time_end = datetime.datetime.now()
time_cost = time_end - time_start
print("cost time :", time_cost)
def agreement(self, index1, index2):
    return util.cosine(self.allIdeas[index1], self.allIdeas[index2])
def searchtfidfcosine(self, searchList):
    queryVector = self.buildQueryVector(searchList)
    queryVector = self.computeidf(queryVector)
    tempVectors = [self.computeidf(documentVector) for documentVector in self.documentVectors]
    ratings = [util.cosine(queryVector, documentVector) for documentVector in tempVectors]
    return ratings
def main(query):
    # create vector space model instance
    vectorSpace = VectorSpace(documents)

    # calculate the different weighting/similarity combinations
    tf_cos = vectorSpace.TF_Cosine(query)
    tf_euclidean = vectorSpace.TF_Euclidean(query)
    tfidf_cos = vectorSpace.TFIDF_Cosine(query)
    tfidf_euclidean = vectorSpace.TFIDF_Euclidean(query)

    # sort and keep the top five scores
    top5_tf_cos = sorted(list(zip(indexList, tf_cos)), reverse=True, key=lambda x: x[1])[:5]
    top5_tf_euclidean = sorted(list(zip(indexList, tf_euclidean)), reverse=False, key=lambda x: x[1])[:5]
    top5_tfidf_cos = sorted(list(zip(indexList, tfidf_cos)), reverse=True, key=lambda x: x[1])[:5]
    top5_tfidf_euclidean = sorted(list(zip(indexList, tfidf_euclidean)), reverse=False, key=lambda x: x[1])[:5]

    # print the output
    print('Term Frequency Weighting + Cosine Similarity:')
    print_top(top5_tf_cos)
    print('Term Frequency Weighting + Euclidean Distance:')
    print_top(top5_tf_euclidean)
    print('TF-IDF Weighting + Cosine Similarity:')
    print_top(top5_tfidf_cos)
    print('TF-IDF Weighting + Euclidean Distance:')
    print_top(top5_tfidf_euclidean)

    # Relevance Feedback
    # get the document ranked first by tfidf + cosine similarity for the given query
    indx_fb = indexList.index(top5_tfidf_cos[0][0])
    fb = documents[indx_fb]

    # the new query term weighting scheme is [1 * original query + 0.5 * feedback query]
    feedback_vector = vectorSpace.makeFeedbackVector(fb)
    query_vector = np.array(vectorSpace.makeTfIdfVector(query))
    rf_vector = query_vector + feedback_vector

    # evaluate the relevance vector against each document with tfidf + cosine similarity
    rf_tfidf_cos = []
    for documentTFIDFVector in vectorSpace.documentTFIDFVectors:
        rf_tfidf_cos.append(util.cosine(rf_vector, documentTFIDFVector))
    top5_rf_tfidf_cos = sorted(list(zip(indexList, rf_tfidf_cos)), reverse=True, key=lambda x: x[1])[:5]

    # print the output
    print('Relevance Feedback + TF-IDF Weighting + Cosine Similarity:')
    print_top(top5_rf_tfidf_cos)
        for documentVector in self.documentVectors
    ]
    return ratings

def IDFCOS(self, searchList):
    queryVector = self.buildQueryVector(searchList)
    self.vectorIDF = [float(2048.0 / x) for x in self.vectorIDF]
    self.vectorIDF = [float(math.log10(x)) for x in self.vectorIDF]
    self.tfidf = [
        [a * b for a, b in zip(self.vectorIDF, documentVector)]
        for documentVector in self.documentVectors
    ]
    #print(tfidf)
    ratings = [util.cosine(queryVector, documentVector) for documentVector in self.tfidf]
    #print(ratings)
    return ratings

def IDFED(self, searchList):
    queryVector = self.buildQueryVector(searchList)
    #print(self.vectorIDF)
    #tfidf = [map(lambda (a,b):a*b,zip(self.vectorIDF, documentVector)) for documentVector in self.documentVectors]
    #print(tfidf)
    ratings = [
        util.Euclidean_Distance(queryVector, documentVector)
        for documentVector in self.tfidf
    ]
def cal_fq_tfidf_cs(vectorSpace, files, query):
    # Feedback Query + TF-IDF Weighting + Cosine Similarity
    #
    # step 1
    # step 2
    # step 3
    # step 4
    sorted_indices, _ = cal_tfidf_cs(vectorSpace, files, query)

    # The new query term weighting = [1 * original query + 0.5 * feedback query]
    '''
    For instance, suppose the index vector is ["network", "computer", "share", "ask", "soccer", "song"],
    the query is "network", and the content of the feedback document is:
        Jimmy shares songs via the computer network.
    Then we will get a new query vector like this:
        1 * [1, 0, 0, 0, 0, 0] + 0.5 * [1, 1, 1, 0, 0, 1] = [1.5, 0.5, 0.5, 0, 0, 0.5]
    '''
    # get the top-ranked document vector
    first_vector = vectorSpace.documentVectors[sorted_indices[0]]
    my_dict = vectorSpace.vectorKeywordIndex

    def get_key(val):
        for key, value in my_dict.items():
            if val == value:
                return key
        return "key doesn't exist"

    # map the vector's items back into words
    words = []
    for i in range(len(first_vector)):
        if first_vector[i] > 0:
            for j in range(first_vector[i]):
                words.append(get_key(i))

    # POS-tag the words
    tagged = nltk.pos_tag(words)
    feedback = [0] * len(vectorSpace.vectorKeywordIndex)

    # keep the noun and verb words and turn them into a feedback vector
    pos = ['NN', 'VB', 'VBP', 'VBD', 'VBG']
    for tup in tagged:
        if tup[1] in pos:
            feedback[my_dict[tup[0]]] += 1
    feedback = np.array(feedback)

    queryVector = vectorSpace.buildQueryVector([query])
    new_query = queryVector + 0.5 * feedback
    new_query = list(new_query)

    # feedback rating
    print("Feedback Queries + TF-IDF Weighting + Cosine Similarity feedback")
    scores = [util.cosine(new_query, documentVector) for documentVector in vectorSpace.documentVectors]

    # indices of the N largest elements in the list
    indices = np.argpartition(scores, -5)[-5:]

    # save as (index, value)
    d = {}
    for i in indices:
        d[i] = scores[i]

    # sort the dict by value instead of key
    sd = sorted(d.items(), key=lambda item: item[1], reverse=True)
    sorted_indices = [s[0] for s in sd]
    sorted_scores = [s[1] for s in sd]

    # find the docid
    docid = []
    for index in sorted_indices:
        x = files[index]
        docid.append(os.path.splitext(x)[0])

    round_score = [round(score, 6) for score in sorted_scores]
    d = {'docID': docid, 'Score': round_score}
    return pd.DataFrame(data=d)
def searchTFidfCos(self, searchList):
    searchVec = self.makeVectorforTFidf(searchList)
    vector = [util.cosine(searchVec, docVec) for docVec in self.tfidfVec]
    return vector
def related(self, documentId):
    """ find documents that are related to the document indexed by passed Id within the document Vectors"""
    ratings = [util.cosine(self.documentVectors[documentId], documentVector)
               for documentVector in self.documentVectors]
    ratings.sort(reverse=True)
    return ratings
        # print(my_dict[tup[0]])
        feedback[my_dict[tup[0]]] += 1

# print(feedback)
feedback = np.array(feedback)

queryVector = vectorSpace.buildQueryVector(["drill wood sharp"])
new_query = queryVector + 0.5 * feedback
new_query = list(new_query)
print(new_query)

# feedback rating
print("TF-IDF Weighting + Cosine Similarity feedback")
scores = [
    util.cosine(new_query, documentVector)
    for documentVector in vectorSpace.documentVectors
]

# Indices of N largest elements in list
indices = np.argpartition(scores, -5)[-5:]

# save as (index, value)
d = {}
for i in indices:
    d[i] = scores[i]

# sort dict by value instead of key
sd = sorted(d.items(), key=lambda item: item[1], reverse=True)
# print(sd)
top5_tf_cos = sorted(list(zip(indexes, tf_cos)), reverse=True, key=sortByRatings)[:5]
top5_tf_dist = sorted(list(zip(indexes, tf_dist)), reverse=False, key=sortByRatings)[:5]
top5_tfidf_cos = sorted(list(zip(indexes, tfidf_cos)), reverse=True, key=sortByRatings)[:5]
top5_tfidf_dist = sorted(list(zip(indexes, tfidf_dist)), reverse=False, key=sortByRatings)[:5]

print('Term Frequency Weighting + Cosine Similarity:')
printResult(top5_tf_cos)
print('Term Frequency Weighting + Euclidean Distance:')
printResult(top5_tf_dist)
print('TF-IDF Weighting + Cosine Similarity:')
printResult(top5_tfidf_cos)
print('TF-IDF Weighting + Euclidean Distance:')
printResult(top5_tfidf_dist)

# create the feedback-relevance vector
newQueryIndex = indexes.index(top5_tfidf_cos[0][0])
doc = contents[newQueryIndex]
feedbackVector = vectorspace.getRelevanceFeedbackVector(doc)
qfVector = queryVector + feedbackVector

# compute the scores and re-rank
scores = [util.cosine(qfVector, documentVector) for documentVector in vectorspace.tfidfVectors]
relevanceFeedback = sorted(list(zip(indexes, scores)), reverse=True, key=sortByRatings)[:5]
print('Feedback Queries + TF-IDF Weighting + Cosine Similarity:')
printResult(relevanceFeedback)
from VectorSpace import *
import util

f = open('data/data.txt', 'r')
descs = []
descs.append(f.read())
f.close()

f = open('data/id3v23.txt', 'r')
descs.append(f.read())
f.close()

vs = VectorSpace(descs)
print util.cosine(vs.documentVectors[0], vs.documentVectors[1])
print util.cosine(vs.documentVectors[1], vs.documentVectors[0])
def s3(t1, terms):
    return {t2 for t2 in terms if t1 != t2 and len(t1) > 3 and cosine(t1, t2) > 0.8}