def find_closest(word_vector, vocab, number, metric):
    """Return the `number` vocabulary entries most similar to `word_vector`.

    Parameters
    ----------
    word_vector : array-like
        Embedding of the query word.
    vocab : list of (word, vector) pairs
        Vocabulary to search (indexed by position below, so it must be a
        sequence, not a generator).
    number : int
        How many nearest neighbours to return.
    metric : str
        Similarity metric; only 'cosine' is supported — any other value
        yields empty result lists.

    Returns
    -------
    (list of float, list of (word, vector))
        Similarity scores and the matching vocab entries, in ascending
        similarity order (most similar last).
    """
    vectors = np.array(word_vector)
    similarity = []
    if metric == 'cosine':
        for entry in vocab:
            similarity.append([entry[0], cosine_sim(entry[1], vectors)])
    simi = [x[1] for x in similarity]
    # BUG FIX: the slice was hard-coded to [-10:], silently ignoring the
    # `number` parameter (the original computed -number but never used it).
    top_n = np.argsort(simi)[-number:]
    top_n_sim = []
    wiki_closest = []
    for j in top_n:
        top_n_sim.append(simi[j])
        wiki_closest.append(vocab[j])
    return top_n_sim, wiki_closest
def ICS(reviews):
    """Compute an Individual Customer Similarity score per (member, product).

    For each member who reviewed the same product more than once, the score
    is the average pairwise cosine similarity between that member's review
    texts for the product (a high value suggests near-duplicate reviews).

    Parameters
    ----------
    reviews : iterable of dict
        Each review needs "memberId", "productId" and "reviewText" keys;
        entries whose ids are the string "None" are skipped.

    Returns
    -------
    defaultdict
        Nested mapping: result[member][productId] -> average similarity.
        Members/products without a score are absent (defaulting to 0).
    """
    customerToProductDictionary = defaultdict(lambda: defaultdict(lambda: []))
    ICS_Dictionary = defaultdict(lambda: defaultdict(lambda: 0))
    # Group review texts by (member, product).
    for review in reviews:
        if review["productId"] != "None" and review["memberId"] != "None":
            customerToProductDictionary[review["memberId"]][review["productId"]].append(review["reviewText"])
    for member in customerToProductDictionary:
        for productId in customerToProductDictionary[member]:
            total = 0
            cnt = 0
            texts = customerToProductDictionary[member][productId]
            # Only members who reviewed the same product repeatedly are scored.
            if len(texts) > 1:
                for i in texts:
                    for j in texts:
                        # Identical texts are skipped, matching the original
                        # `i != j` comparison (which compares text content).
                        if i != j:
                            # BUG FIX: accumulate with += instead of plain
                            # assignment — the original kept only the LAST
                            # pair's similarity yet divided by the full pair
                            # count, so the "average" was wrong.
                            total += cosine_sim(i, j)
                            cnt += 1
            if cnt != 0:
                ICS_Dictionary[member][productId] = 1.0 * total / cnt  # average
    return ICS_Dictionary
def main():
    """Build an artist-artist similarity table from pickled tag vectors.

    Loads "tagged" (a dict keyed by artist id), computes the cosine
    similarity of every unordered pair of distinct artists, keeps pairs
    scoring above 0.50, stores each kept score symmetrically, and pickles
    the table to "item_sim50".
    """
    tagged = pickle.load(open("tagged", "rb"))
    itemsim = {}
    pair_counter = 1
    for first in tagged:
        for second in tagged:
            # Progress heartbeat: one line per million comparisons.
            if pair_counter % 1000000 == 0:
                print(pair_counter / 1000000, "M")
            # Compare each unordered pair exactly once; never an artist
            # with itself.
            if int(second) > int(first):
                score = cosine_sim(tagged[first], tagged[second])
                # Only similarities above the 0.50 threshold are kept.
                if score > 0.50:
                    itemsim.setdefault(first, {})[second] = score
                    itemsim.setdefault(second, {})[first] = score
            pair_counter += 1
    pickle.dump(itemsim, open("item_sim50", "wb"))
def GMCS(group):
    """Group Member Content Similarity.

    For each group member, averages the cosine similarity over all
    unordered pairs of that member's own review texts, then returns the
    mean of those per-member averages.

    Parameters
    ----------
    group : sequence
        Each element is indexable; element[1] is a list of review dicts
        containing a "reviewText" key.

    Returns
    -------
    float
        Mean per-member average self-similarity. Members with fewer than
        two reviews contribute 0.0; an empty group returns 0.0.
    """
    # ROBUSTNESS FIX: the original divided by len(group) unconditionally
    # and crashed on an empty group.
    if not group:
        return 0.0
    member_scores = []
    for member in group:
        reviews = member[1]
        total = 0.0
        pairs = 0
        # Every unordered pair of this member's reviews.
        for x in range(len(reviews) - 1):
            for y in range(x + 1, len(reviews)):
                total += cosine_sim(reviews[x]["reviewText"], reviews[y]["reviewText"])
                pairs += 1
        # ROBUSTNESS FIX: the original divided by the pair count
        # unconditionally, raising ZeroDivisionError for any member with
        # fewer than two reviews.
        member_scores.append(total / pairs if pairs else 0.0)
    return float(sum(member_scores)) / len(group)
def main():
    """Build (or load a cached copy of) a user-user similarity map.

    Tries to unpickle "user_sim01"; on a cache miss it computes cosine
    similarities between all pairs of distinct listen profiles, keeps
    scores above 0.01, and pickles the result. Finally prints the total
    number of stored similarity relations.
    """
    listen = get_listen()
    try:
        # Fast path: reuse the previously pickled similarity matrix.
        sim = pickle.load(open("user_sim01", "rb"))
    except FileNotFoundError:
        # Cache miss: compute similarity for every ordered pair of
        # distinct users, then persist the table for next time.
        sim = {}
        for u1 in listen:
            sim[u1] = {}
            for u2 in listen:
                if u1 != u2:
                    score = cosine_sim(listen[u1], listen[u2])
                    # Only similarities above the 0.01 threshold are kept.
                    if score > 0.01:
                        sim[u1][u2] = score
        pickle.dump(sim, open("user_sim01", "wb"))
    total = 0
    # count the total amount of similarity relations
    for u1 in sim:
        for u2 in sim[u1]:
            total += 1
    print(total)
    # NOTE(review): the triple-quoted string below begins a commented-out
    # code block that continues beyond this chunk of the file.
    """u1 = "2"
def CS(reviews):
    """Content Similarity: average cosine similarity over every ordered
    pair of review texts in `reviews` (self-pairs included, so identical
    texts pull the average upward)."""
    texts = [review["reviewText"] for review in reviews]
    scores = []
    # Full cross product, mirroring the original's ordered-pair average.
    for first in texts:
        for second in texts:
            scores.append(cosine_sim(first, second))
    return avg(scores)
def recommender(user_train, user_test, movie_train, k):
    """Predict ratings for unrated test movies via user-based collaborative
    filtering with IUF-weighted cosine similarity.

    Parameters
    ----------
    user_train : dict
        user_train[user][movie] -> rating (0 means unrated).
    user_test : dict
        user_test[user][movie] -> rating; movies rated 0 get predictions.
    movie_train : dict
        movie_train[movie][user] -> rating, used to derive IUF weights.
    k : int
        Number of top (by absolute similarity) neighbours to use.

    Returns
    -------
    dict
        result[user][movie] -> predicted integer rating for each movie the
        test user had not rated.
    """
    # Per-user mean of the non-zero training ratings (0 if none).
    user_avg = {}
    for user in user_train:
        count2 = 0
        rating_sum = 0  # renamed from `sum` to stop shadowing the builtin
        for movie in user_train[user]:
            if user_train[user][movie] != 0:
                rating_sum = rating_sum + user_train[user][movie]
                count2 = count2 + 1
        if count2 == 0:
            avg = 0
        else:
            avg = float(rating_sum) / float(count2)
        user_avg[user] = avg
    # Inverse User Frequency per movie: log10(1000 / #raters).
    IUF_train = {}
    for movie in movie_train:
        count2 = 0
        for user in movie_train[movie]:
            if movie_train[movie][user] != 0:
                count2 = count2 + 1
        if count2 == 0:
            # BUG FIX: the original wrote `IUF==0` (a comparison, not an
            # assignment), which raised NameError on the first such movie
            # and otherwise stored a stale IUF from a previous iteration.
            IUF = 0
        else:
            a = 1000 / count2
            IUF = log10(a)
        IUF_train[movie] = IUF
    # Signed and absolute similarity of each test user to each train user.
    sim_user = {}
    sim_userabs = {}
    for user in user_test:
        dict1 = {}
        dict2 = {}
        for user1 in user_train:
            if user != user1:
                sim = cosine_sim(user_train[user1], user_test[user], IUF_train)
                dict1[user1] = sim
                dict2[user1] = abs(sim)
        sim_user[user] = dict1
        sim_userabs[user] = dict2
    # Predict a rating for every movie the test user has not rated.
    reco_dict1 = {}
    for user in user_test:
        dict2 = {}
        for movie in user_test[user]:
            if user_test[user][movie] == 0:
                # Neighbours ordered by |similarity|, strongest first.
                lista = sorted(sim_userabs[user].items(), key=itemgetter(1), reverse=True)
                count = 0
                topklist = []
                for user1, sim in lista:
                    # Only neighbours who actually rated this movie count.
                    if user_train[user1][movie] != 0:
                        topklist.append(user1)
                        count = count + 1
                    if count == k:
                        break
                rating = reco_cosine(user, movie, user_train, user_test, topklist, sim_user, user_avg, k)
                a = int(rating)  # truncate, matching the original behavior
                dict2[movie] = a
        reco_dict1[user] = dict2
    return reco_dict1
# Query-ranking script (Python 2 syntax): scores each training query's URLs
# with a weighted cosine similarity, then boosts every score by the smallest
# window of text containing all query terms found in the document's body,
# title, or anchors.
from util import *
from cosine_sim import cosine_sim
import sys

if __name__ == "__main__":
    # Relative field weights passed to cosine_sim.
    cosine_sim_weights = [0.50, 0.30, 0.20]
    window_weight = 100
    window_function = (lambda v : 1.0 / v)
    dictionary = read_dictionary()
    data = sys.argv[1]
    queries = read_train_data(data)
    corpus = read_corpus()
    for q in queries:
        scored_urls = cosine_sim(q, cosine_sim_weights, dictionary, corpus)
        # output the urls in order
        print "query: " + q.query_terms
        boosted_urls = []
        for (s, u) in scored_urls:
            # Smallest query-term window across the three document fields.
            min_dist = u.minimum_body_window(q.query_terms)
            min_dist = min(min_dist, u.minimum_title_window(q.query_terms))
            min_dist = min(min_dist, u.minimum_anchor_window(q.query_terms))
            # Subtract the query length, then add 1 so a perfect window
            # yields distance 1 rather than 0 (avoids 1/0 below).
            min_dist -= len(q.query_terms.split())
            # 1/0 isn't a thing
            min_dist += 1
            # maps it in the range [B,1]
            B = 1.0 + window_function(min_dist) * (window_weight - 1)
            boosted_urls.append((s * B, u))
        # Highest boosted score first.
        boosted_urls.sort(reverse=True)
        for (s, u) in boosted_urls:
            print " url: " + u.url
    # NOTE(review): this block looks truncated here — it opens 'Weights' for
    # writing but only reverses the weight list; the actual write presumably
    # continues beyond this chunk. Confirm against the full file.
    with open('Weights', 'w') as f:
        cosine_sim_weights.reverse()