def svd_protocol_evaluation(data_path, params):
	solr = "http://localhost:8983/solr/grrecsys"
	test_c = consumption(ratings_path=data_path + 'test/test_N20.data', rel_thresh=0, with_ratings=True)
	# This should be test_c, but since includeRated=False it makes no difference
	train_c = consumption(ratings_path=data_path + 'eval_train_N20.data', rel_thresh=0, with_ratings=False)
	svd = pyreclab.SVD(dataset=data_path + 'eval_train_N20.data',
	                   dlmchar=b',', header=False, usercol=0, itemcol=1, ratingcol=2)
	svd.train(factors=params['f'], maxiter=params['mi'], lr=params['lr'], lamb=params['lamb'])
	# Renamed the unused 2nd and 3rd return values so they don't shadow the built-in map()
	recommendationList, _map, _ndcg = svd.testrec(input_file=data_path + 'test/test_N20.data',
	                                              dlmchar=b',', header=False,
	                                              usercol=0, itemcol=1, ratingcol=2,
	                                              topn=100, relevance_threshold=0, includeRated=False)
	MRRs = dict((N, []) for N in [5, 10, 15, 20])
	nDCGs = dict((N, []) for N in [5, 10, 15, 20])
	APs = dict((N, []) for N in [5, 10, 15, 20])
	Rprecs = dict((N, []) for N in [5, 10, 15, 20])

	for userId, recList in recommendationList.items():
		# This step is redundant: according to Gabriel, testrec() does not return consumed items
		book_recs = remove_consumed(user_consumption=train_c[userId], rec_list=recList)
		book_recs = recs_cleaner(solr=solr, consumpt=train_c[userId], recs=book_recs[:100])
		recs = user_ranked_recs(user_recs=book_recs, user_consumpt=test_c[userId])

		for N in [5, 10, 15, 20]:
			mini_recs = dict((k, recs[k]) for k in list(recs.keys())[:N])  # Python 3.x: list() is required, .keys() returns a view
			MRRs[N].append(MRR(recs=mini_recs, rel_thresh=1))
			nDCGs[N].append(nDCG(recs=mini_recs, alt_form=False, rel_thresh=False))
			APs[N].append(AP_at_N(n=N, recs=recs, rel_thresh=1))
			Rprecs[N].append(R_precision(n_relevants=N, recs=mini_recs))

	for N in [5, 10, 15, 20]:
		with open('TwitterRatings/funkSVD/clean/protocol.txt', 'a') as file:
			file.write("N=%s, nDCG=%s, MAP=%s, MRR=%s, R-precision=%s\n" % \
				(N, mean(nDCGs[N]), mean(APs[N]), mean(MRRs[N]), mean(Rprecs[N])))
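# Usage sketch (hypothetical values, not the tuned hyperparameters from the
# experiments): 'f' = latent factors, 'mi' = max iterations, 'lr' = learning
# rate, 'lamb' = regularization, matching the keys read by svd.train() above.
#
#   svd_protocol_evaluation(data_path='TwitterRatings/funkSVD/data/',
#                           params={'f': 100, 'mi': 100, 'lr': 0.01, 'lamb': 0.1})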
def nDCGMAP_calculator(data_path, params, topN, output_filename):
	user_consumption = consumption(ratings_path=data_path + 'ratings.total', rel_thresh=0, with_ratings=True)
	svd = pyreclab.SVD(dataset=data_path + 'ratings.train',  # data_path+'train/train.'+str(i)
	                   dlmchar=b',', header=False, usercol=0, itemcol=1, ratingcol=2)
	svd.train(factors=params['f'], maxiter=params['mi'], lr=params['lr'], lamb=params['lamb'])
	recommendationList = svd.testrec(input_file=data_path + 'test/' + os.listdir(data_path + 'test/')[0],  # data_path+'val/val.'+str(i)
	                                 dlmchar=b',', header=False,
	                                 usercol=0, itemcol=1, ratingcol=2,
	                                 topn=100, includeRated=False)
	MRR_thresh4 = []
	MRR_thresh3 = []
	nDCGs_bin_thresh4 = dict((n, []) for n in topN)
	nDCGs_bin_thresh3 = dict((n, []) for n in topN)
	nDCGs_normal = dict((n, []) for n in topN)
	nDCGs_altform = dict((n, []) for n in topN)
	APs_thresh4 = dict((n, []) for n in topN)
	APs_thresh3 = dict((n, []) for n in topN)
	APs_thresh2 = dict((n, []) for n in topN)

	# testrec() returns a tuple; index 0 holds the per-user recommendation dict
	for userId in recommendationList[0]:
		recs = user_ranked_recs(user_recs=recommendationList[0][userId], user_consumpt=user_consumption[userId])
		MRR_thresh4.append(MRR(recs=recs, rel_thresh=4))
		MRR_thresh3.append(MRR(recs=recs, rel_thresh=3))
		for n in topN:
			mini_recs = dict((k, recs[k]) for k in list(recs.keys())[:n])  # Python 3.x: list() is required, .keys() returns a view
			nDCGs_bin_thresh4[n].append(nDCG(recs=mini_recs, alt_form=False, rel_thresh=4))
			nDCGs_bin_thresh3[n].append(nDCG(recs=mini_recs, alt_form=False, rel_thresh=3))
			nDCGs_normal[n].append(nDCG(recs=mini_recs, alt_form=False, rel_thresh=False))
			nDCGs_altform[n].append(nDCG(recs=mini_recs, alt_form=True, rel_thresh=False))
			APs_thresh4[n].append(AP_at_N(n=n, recs=recs, rel_thresh=4))
			APs_thresh3[n].append(AP_at_N(n=n, recs=recs, rel_thresh=3))
			APs_thresh2[n].append(AP_at_N(n=n, recs=recs, rel_thresh=2))

	with open('TwitterRatings/funkSVD/' + output_filename, 'a') as file:
		for n in topN:
			file.write("N=%s, normal nDCG=%s, alternative nDCG=%s, bin nDCG(rel_thresh=4)=%s, bin nDCG(rel_thresh=3)=%s, MAP(rel_thresh=4)=%s, MAP(rel_thresh=3)=%s, MAP(rel_thresh=2)=%s, MRR(rel_thresh=4)=%s, MRR(rel_thresh=3)=%s\n" % \
				(n, mean(nDCGs_normal[n]), mean(nDCGs_altform[n]), mean(nDCGs_bin_thresh4[n]), mean(nDCGs_bin_thresh3[n]), mean(APs_thresh4[n]), mean(APs_thresh3[n]), mean(APs_thresh2[n]), mean(MRR_thresh4), mean(MRR_thresh3)))
def save_testing_recommendations(data_path, which_model, metric, representation):
	solr = 'http://localhost:8983/solr/grrecsys'
	test_c = consumption(ratings_path=data_path + 'test/test_N20.data', rel_thresh=0, with_ratings=True)
	train_c = consumption(ratings_path=data_path + 'eval_train_N20.data', rel_thresh=0, with_ratings=False)
	docs2vec = np.load('./w2v-tmp/' + which_model + '/docs2vec_' + which_model + '.npy').item()
	users2vec = np.load('./w2v-tmp/' + which_model + '/users2vec_' + representation + '_' + which_model + '.npy').item()
	recommendations = {}
	i = 1
	for userId in test_c:
		logging.info("MODE 2. {0} of {1}. User ID: {2}".format(i, len(test_c), userId))
		i += 1
		# Distance from this user's vector to every book vector
		distances = dict((bookId, 0.0) for bookId in docs2vec)
		for bookId in docs2vec:
			if metric == 'angular':
				distances[bookId] = spatial.distance.cosine(users2vec[userId], docs2vec[bookId])
			elif metric == 'euclidean':
				distances[bookId] = spatial.distance.euclidean(users2vec[userId], docs2vec[bookId])
		sorted_sims = sorted(distances.items(), key=operator.itemgetter(1), reverse=False)  # [(<grId>, SMALLEST dist), ..., (<grId>, LARGEST dist)]
		book_recs = [bookId for bookId, sim in sorted_sims]
		book_recs = remove_consumed(user_consumption=train_c[userId], rec_list=book_recs)
		book_recs = recs_cleaner(solr=solr, consumpt=train_c[userId], recs=book_recs[:50])
		recommendations[userId] = book_recs
	np.save('TwitterRatings/recommended_items/w2v_op2gbangular.npy', recommendations)
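# Compatibility note: docs2vec/users2vec are pickled dicts saved with np.save().
# On NumPy >= 1.16.3, where np.load() defaults to allow_pickle=False, these
# loads would need np.load(path, allow_pickle=True).item() instead.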
def statistics_protocol(data_path, N, folds):
	logging.info("N={N}".format(N=N))
	all_c = consumption(ratings_path=data_path + 'eval_all_N' + str(N) + '.data', rel_thresh=0, with_ratings=True)

	# #users, #items, #ratings, avg. rating
	items = set()
	users = set()
	ratings = []
	with open(data_path + 'eval_all_N' + str(N) + '.data', 'r') as f:
		for line in f:
			(userId, itemId, rating) = line.split(',')
			items.add(itemId)
			users.add(userId)
			ratings.append(int(rating))
	logging.info("#users={users}".format(users=len(users)))
	logging.info("#items={items}".format(items=len(items)))
	logging.info("#ratings={ratings}".format(ratings=len(ratings)))
	logging.info("avg. rating={mean}±{stdev}".format(mean=mean(ratings), stdev=stdev(ratings)))

	# Ratings per item
	item_ratings = dict((itemId, []) for itemId in items)
	for user in all_c:
		for item in all_c[user]:
			item_ratings[item].append(all_c[user][item])
	ratings_per_item = []
	for item in item_ratings:
		ratings_per_item.append(len(item_ratings[item]))
	logging.info("avg. ratings per item: {mean}±{stdev}".format(mean=mean(ratings_per_item), stdev=stdev(ratings_per_item)))

	# Ratings per user
	ratings_per_user = []
	for user in all_c:
		ratings_per_user.append(len(all_c[user]))
	logging.info("avg. ratings per user: {mean}±{stdev}".format(mean=mean(ratings_per_user), stdev=stdev(ratings_per_user)))

	# Sparsity / density
	count = 0
	for freq in ratings_per_item:
		count += freq / float(len(users))
	count = count / float(len(items))
	count = count * 100
	logging.info("density: {}".format(count))
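# The density loop above is an incremental form of
#     density = 100 * #ratings / (#users * #items)
# e.g. 1,000 ratings over 100 users and 200 items gives a density of 5%.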
def option1_protocol_evaluation(data_path, which_model, metric):
	solr = 'http://localhost:8983/solr/grrecsys'
	# userId='113447232' 285597345
	test_c = consumption(ratings_path=data_path + 'test/test_N20.data', rel_thresh=0, with_ratings=True)
	train_c = consumption(ratings_path=data_path + 'eval_train_N20.data', rel_thresh=0, with_ratings=False)
	MRRs = dict((N, []) for N in [5, 10, 15, 20])
	nDCGs = dict((N, []) for N in [5, 10, 15, 20])
	APs = dict((N, []) for N in [5, 10, 15, 20])
	Rprecs = dict((N, []) for N in [5, 10, 15, 20])
	docs2vec = np.load('./w2v-tmp/' + which_model + '/docs2vec_' + which_model + '.npy').item()
	if which_model == 'twit':
		vector_size = 200
	else:
		vector_size = 300
	t = AnnoyIndex(vector_size, metric=metric)
	t.load('./w2v-tmp/' + which_model + '/doc_vecs_t100_' + metric + '_' + which_model + '.tree')
	num_to_grId = np.load('./w2v-tmp/' + which_model + '/num_to_grId_' + metric + '_' + which_model + '.npy').item()
	grId_to_num = np.load('./w2v-tmp/' + which_model + '/grId_to_num_' + metric + '_' + which_model + '.npy').item()

	i = 1
	for userId in test_c:
		logging.info("MODE 1. {0} of {1}. User ID: {2}".format(i, len(test_c), userId))
		i += 1
		book_recs = []
		for bookId in train_c[userId]:
			try:
				docs = t.get_nns_by_item(grId_to_num[bookId], 500)
				book_recs.append([str(num_to_grId[doc_num]) for doc_num in docs])
			except KeyError:
				logging.info("{} IS ONE OF THE BOOKS WHOSE HTML COULD NOT BE DOWNLOADED. CONTINUING WITH THE NEXT BOOK..".format(bookId))
				continue
		if not book_recs:  # every consumed book failed the lookup above
			continue
		book_recs = flatten_list(list_of_lists=book_recs, rows=len(book_recs[0]))  # rows=len(sorted_sims)
		book_recs = remove_consumed(user_consumption=train_c[userId], rec_list=book_recs)
		book_recs = recs_cleaner(solr=solr, consumpt=train_c[userId], recs=book_recs[:50])
		try:
			recs = user_ranked_recs(user_recs=book_recs, user_consumpt=test_c[userId])
		except KeyError:
			logging.info("User {0} from the (full) train fold not found in the 'test' fold".format(userId))
			continue

		for N in [5, 10, 15, 20]:
			mini_recs = dict((k, recs[k]) for k in list(recs.keys())[:N])
			MRRs[N].append(MRR(recs=mini_recs, rel_thresh=1))
			nDCGs[N].append(nDCG(recs=mini_recs, alt_form=False, rel_thresh=False))
			APs[N].append(AP_at_N(n=N, recs=recs, rel_thresh=1))
			Rprecs[N].append(R_precision(n_relevants=N, recs=mini_recs))

	with open('TwitterRatings/word2vec/clean/option1_protocol_' + which_model + '.txt', 'a') as file:
		file.write("METRIC: %s\n" % (metric))
		for N in [5, 10, 15, 20]:
			file.write("N=%s, nDCG=%s, MAP=%s, MRR=%s, R-precision=%s\n" % \
				(N, mean(nDCGs[N]), mean(APs[N]), mean(MRRs[N]), mean(Rprecs[N])))
def option2_protocol_evaluation(data_path, which_model, metric, representation):
	solr = 'http://localhost:8983/solr/grrecsys'
	test_c = consumption(ratings_path=data_path + 'test/test_N20.data', rel_thresh=0, with_ratings=True)
	train_c = consumption(ratings_path=data_path + 'eval_train_N20.data', rel_thresh=0, with_ratings=False)
	MRRs = dict((N, []) for N in [5, 10, 15, 20])
	nDCGs = dict((N, []) for N in [5, 10, 15, 20])
	APs = dict((N, []) for N in [5, 10, 15, 20])
	Rprecs = dict((N, []) for N in [5, 10, 15, 20])
	docs2vec = np.load('./w2v-tmp/' + which_model + '/docs2vec_' + which_model + '.npy').item()
	users2vec = np.load('./w2v-tmp/' + which_model + '/users2vec_' + representation + '_' + which_model + '.npy').item()

	i = 1
	for userId in test_c:
		logging.info("MODE 2. {0} of {1}. User ID: {2}".format(i, len(test_c), userId))
		i += 1
		distances = dict((bookId, 0.0) for bookId in docs2vec)
		for bookId in docs2vec:
			if metric == 'angular':
				distances[bookId] = spatial.distance.cosine(users2vec[userId], docs2vec[bookId])
			elif metric == 'euclidean':
				distances[bookId] = spatial.distance.euclidean(users2vec[userId], docs2vec[bookId])
		sorted_sims = sorted(distances.items(), key=operator.itemgetter(1), reverse=False)  # [(<grId>, SMALLEST dist), ..., (<grId>, LARGEST dist)]
		book_recs = [bookId for bookId, sim in sorted_sims]
		book_recs = remove_consumed(user_consumption=train_c[userId], rec_list=book_recs)
		book_recs = recs_cleaner(solr=solr, consumpt=train_c[userId], recs=book_recs[:50])
		try:
			recs = user_ranked_recs(user_recs=book_recs, user_consumpt=test_c[userId])
		except KeyError:
			logging.info("User {0} from the (full) train fold not found in the 'test' fold".format(userId))
			continue

		for N in [5, 10, 15, 20]:
			mini_recs = dict((k, recs[k]) for k in list(recs.keys())[:N])
			MRRs[N].append(MRR(recs=mini_recs, rel_thresh=1))
			nDCGs[N].append(nDCG(recs=mini_recs, alt_form=False, rel_thresh=False))
			APs[N].append(AP_at_N(n=N, recs=recs, rel_thresh=1))
			Rprecs[N].append(R_precision(n_relevants=N, recs=mini_recs))

	with open('TwitterRatings/word2vec/clean/option2_protocol_' + which_model + '.txt', 'a') as file:
		file.write("METRIC: %s \t REPR: %s\n" % (metric, representation))
		for N in [5, 10, 15, 20]:
			file.write("N=%s, nDCG=%s, MAP=%s, MRR=%s, R-precision=%s\n" % \
				(N, mean(nDCGs[N]), mean(APs[N]), mean(MRRs[N]), mean(Rprecs[N])))
def evaluation_set_with_authors(db_conn, N, folds, out_path):
	"""ONLY run this AFTER evaluation_set(). Saves the train and test sets
	together with the authors of the books consumed by each user."""
	data_path = 'TwitterRatings/funkSVD/data/'
	c = db_conn.cursor()
	c.execute("SELECT DISTINCT user_reviews.bookId, authors.id, authors.name "
	          "FROM user_reviews "
	          "INNER JOIN authors "
	          "ON user_reviews.bookId=authors.bookId;")
	all_rows = c.fetchall()

	books = {}
	for tupl in all_rows:
		bookId, authorId, author_name = tupl
		if str(bookId) not in books:
			books[str(bookId)] = []
		if str(authorId) not in books[str(bookId)]:
			books[str(bookId)].append(str(authorId))

	def write_with_authors(consumption_dict, out_file):
		# Writes "user,item,rating,timestamp,author1,author2,author3" rows,
		# keeping at most 3 authors per book and padding with 0 when fewer are known.
		with open(out_file, 'w') as f:
			for user, d in consumption_dict.items():
				for item, tupl in d.items():
					s = '{user},{item},{rating},{timestamp}'.format(user=user, item=item, rating=tupl[0], timestamp=tupl[1])
					for i in range(3):
						try:
							author = books[item][i]
						except IndexError:
							author = 0
						if i != 2:
							s += ',{author}'.format(author=author)
						else:
							s += ',{author}\n'.format(author=author)
					f.write(s)

	logging.info("Saving test..")
	test = consumption(ratings_path=data_path + 'test/test_N' + str(N) + '.data', rel_thresh=0, with_ratings=True, with_timestamps=True)
	write_with_authors(test, out_path + 'test/test_N' + str(N) + '.data')

	logging.info("Saving train..")
	train = consumption(ratings_path=data_path + 'eval_train_N' + str(N) + '.data', rel_thresh=0, with_ratings=True, with_timestamps=True)
	write_with_authors(train, out_path + 'eval_train_N' + str(N) + '.data')

	for j in range(1, folds):
		logging.info("Saving validation folds and aggregated training folds. Fold #{}".format(j))
		val_f = consumption(ratings_path=data_path + 'val/val_N' + str(N) + '.' + str(j), rel_thresh=0, with_ratings=True, with_timestamps=True)
		write_with_authors(val_f, out_path + 'val/val_N' + str(N) + '.' + str(j))
		train_f = consumption(ratings_path=data_path + 'train/train_N' + str(N) + '.' + str(j), rel_thresh=0, with_ratings=True, with_timestamps=True)
		write_with_authors(train_f, out_path + 'train/train_N' + str(N) + '.' + str(j))

	logging.info("Saving total..")
	everything = consumption(ratings_path=data_path + 'eval_all_N' + str(N) + '.data', rel_thresh=0, with_ratings=True, with_timestamps=True)
	write_with_authors(everything, out_path + 'eval_all_N' + str(N) + '.data')
def option1_protocol_evaluation(data_path, N, model):
	# userId='113447232' user_bookId='17310690'
	test_c = consumption(ratings_path=data_path + 'test/test_N' + str(N) + '.data', rel_thresh=0, with_ratings=True)
	train_c = consumption(ratings_path=data_path + 'eval_train_N' + str(N) + '.data', rel_thresh=0, with_ratings=False)
	MRRs = []
	nDCGs = []
	APs = []
	Rprecs = []
	flat_docs = np.load('./w2v-tmp/flattened_docs_fea05b2.npy').item()
	num_to_grId = np.load('./w2v-tmp/num_to_grId.npy').item()
	grId_to_num = np.load('./w2v-tmp/grId_to_num.npy').item()
	t = AnnoyIndex(300)
	t.load('./w2v-tmp/doc_vecs_t100.tree')
	num_best = 20

	i = 1
	for userId in test_c:
		logging.info("MODE 1. {0} of {1}. User ID: {2}".format(i, len(test_c), userId))
		i += 1
		# stream_url = solr + '/query?rows=1000&q=goodreadsId:{ids}'
		# ids_string = encoded_itemIds(item_list=train_c[userId])
		# url = stream_url.format(ids=ids_string)
		# response = json.loads(urlopen(url).read().decode('utf8'))
		# try:
		# 	docs = response['response']['docs']
		# except TypeError as e:
		# 	continue
		book_recs = []
		book_recs_cos = []
		for user_bookId in train_c[userId]:  # for user_doc in docs:
			try:
				docs = t.get_nns_by_item(grId_to_num[user_bookId], 4)
				book_recs_cos += [str(num_to_grId[doc_num]) for doc_num in docs]
			except KeyError:
				logging.info("{} IS ONE OF THE BOOKS WHOSE HTML COULD NOT BE DOWNLOADED. CONTINUING WITH THE NEXT BOOK..".format(user_bookId))
				continue
		# Remove consumed items from this first list, since get_nns_by_item() includes them
		book_recs_cos = [bookId for bookId in book_recs_cos if bookId not in train_c[userId]]

		wmd_corpus = []
		num_to_grId_wmd = {}
		j = 0
		for grId in book_recs_cos:
			wmd_corpus.append(flat_docs[grId])
			num_to_grId_wmd[j] = grId
			j += 1
		grId_to_num_wmd = {v: k for k, v in num_to_grId_wmd.items()}
		index = WmdSimilarity(wmd_corpus, model, num_best=num_best, normalize_w2v_and_replace=False)
		for user_bookId in train_c[userId]:
			r = index[flat_docs[user_bookId]]
			book_recs.append([num_to_grId_wmd[id] for id, score in r])
		# wmds = dict((bookId, 0.0) for bookId in flat_docs)
		# user_bookId = str(user_doc['goodreadsId'][0])  # id of a book consumed by the user
		# for bookId in flat_docs:  # ids of books in the DB
		# 	if bookId == user_bookId: continue
		# 	wmds[bookId] = model.wmdistance(flat_docs[bookId], flat_docs[user_bookId])  # 1 - dist = similarity
		# sorted_sims = sorted(wmds.items(), key=operator.itemgetter(1), reverse=False)  # [(<grId>, HIGHEST sim), ..., (<grId>, lowest sim)]
		# book_recs.append([bookId for bookId, sim in sorted_sims])
		if not book_recs:  # every consumed book failed the lookup above
			continue
		book_recs = flatten_list(list_of_lists=book_recs, rows=len(book_recs[0]))
		book_recs = remove_consumed(user_consumption=train_c[userId], rec_list=book_recs)
		try:
			recs = user_ranked_recs(user_recs=book_recs, user_consumpt=test_c[userId])
		except KeyError:
			logging.info("User {0} from the (full) train fold not found in the 'test' fold".format(userId))
			continue

		mini_recs = dict((k, recs[k]) for k in list(recs.keys())[:N])  # Python 3.x: .keys() returns a view, not a list
		nDCGs.append(nDCG(recs=mini_recs, alt_form=False, rel_thresh=False))
		APs.append(AP_at_N(n=N, recs=mini_recs, rel_thresh=1))
		MRRs.append(MRR(recs=mini_recs, rel_thresh=1))
		Rprecs.append(R_precision(n_relevants=N, recs=mini_recs))

	with open('TwitterRatings/word2vec/option1_protocol_wmd.txt', 'a') as file:
		file.write("N=%s, normal nDCG=%s, MAP=%s, MRR=%s, R-precision=%s\n" % \
			(N, mean(nDCGs), mean(APs), mean(MRRs), mean(Rprecs)))
def option2_protocol_evaluation(data_path, N, model):
	test_c = consumption(ratings_path=data_path + 'test/test_N' + str(N) + '.data', rel_thresh=0, with_ratings=True)
	train_c = consumption(ratings_path=data_path + 'eval_train_N' + str(N) + '.data', rel_thresh=0, with_ratings=False)
	MRRs = []
	nDCGs = []
	APs = []
	Rprecs = []
	flat_docs = np.load('./w2v-tmp/flattened_docs_fea05b2.npy').item()
	flat_users = np.load('./w2v-tmp/flattened_users_fea05b2.npy').item()
	docs2vec = np.load('./w2v-tmp/docs2vec.npy').item()
	users2vec = np.load('./w2v-tmp/users2vec.npy').item()
	num_best = 20

	i = 1
	for userId in test_c:
		logging.info("MODE 2. {0} of {1}. User ID: {2}".format(i, len(test_c), userId))
		i += 1
		# wmds = dict((bookId, 0.0) for bookId in flat_docs)
		# for bookId in flat_docs:
		# 	wmds[bookId] = model.wmdistance(flat_users[userId], flat_docs[bookId])
		cosines = dict((bookId, 0.0) for bookId in docs2vec)
		for bookId in docs2vec:
			cosines[bookId] = 1 - spatial.distance.cosine(users2vec[userId], docs2vec[bookId])  # 1 - dist = similarity
		sorted_sims = sorted(cosines.items(), key=operator.itemgetter(1), reverse=True)  # [(<grId>, HIGHEST sim), ..., (<grId>, lowest sim)]
		book_recs_cos = [bookId for bookId, sim in sorted_sims]
		book_recs_cos = remove_consumed(user_consumption=train_c[userId], rec_list=book_recs_cos)

		wmd_corpus = []
		num_to_grId_wmd = {}
		j = 0
		for grId in book_recs_cos[:50]:
			wmd_corpus.append(flat_docs[grId])
			num_to_grId_wmd[j] = grId
			j += 1
		grId_to_num_wmd = {v: k for k, v in num_to_grId_wmd.items()}
		# Build a WMD index over a subset of (50) items recommended to the user by cosine similarity
		index = WmdSimilarity(wmd_corpus, model, num_best=num_best, normalize_w2v_and_replace=False)
		r = index[flat_users[userId]]
		book_recs = [num_to_grId_wmd[id] for id, score in r]
		# sorted_sims = sorted(wmds.items(), key=operator.itemgetter(1), reverse=False)  # [(<grId>, HIGHEST sim), ..., (<grId>, lowest sim)]
		# book_recs = [bookId for bookId, sim in sorted_sims]
		book_recs = remove_consumed(user_consumption=train_c[userId], rec_list=book_recs)
		try:
			recs = user_ranked_recs(user_recs=book_recs, user_consumpt=test_c[userId])
		except KeyError:
			logging.info("User {0} from the (full) train fold not found in the 'test' fold".format(userId))
			continue

		mini_recs = dict((k, recs[k]) for k in list(recs.keys())[:N])  # Python 3.x: .keys() returns a view, not a list
		nDCGs.append(nDCG(recs=mini_recs, alt_form=False, rel_thresh=False))
		APs.append(AP_at_N(n=N, recs=mini_recs, rel_thresh=1))
		MRRs.append(MRR(recs=mini_recs, rel_thresh=1))
		Rprecs.append(R_precision(n_relevants=N, recs=mini_recs))

	with open('TwitterRatings/word2vec/option2_protocol_wmd.txt', 'a') as file:
		file.write("N=%s, normal nDCG=%s, MAP=%s, MRR=%s, R-precision=%s\n" % \
			(N, mean(nDCGs), mean(APs), mean(MRRs), mean(Rprecs)))
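# Design note: both WMD evaluations above are two-stage rankers. A cheap first
# stage (Annoy nearest neighbours in option 1, plain cosine similarity in
# option 2) builds a candidate shortlist, and gensim's far costlier
# WmdSimilarity index re-ranks only that shortlist (num_best=20) instead of
# scoring the whole catalog with model.wmdistance(), as the commented-out
# exhaustive versions did.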
def PRF_calculator(params, folds, topN):
	ratings_train, ratings_test = [], []
	with open('TwitterRatings/funkSVD/ratings.train', 'r') as f:
		for line in f:
			ratings_train.append(line.strip())
	with open('TwitterRatings/funkSVD/ratings.test', 'r') as f:
		for line in f:
			ratings_test.append(line.strip())
	preferred_consumption = consumption(ratings_path='TwitterRatings/funkSVD/ratings.test', rel_thresh=4, with_ratings=False)

	for n in topN:
		precision_folds, recall_folds = [], []
		# for _ in range(0, folds):
		# 	ratingsSampler(ratings_train, 'TwitterRatings/funkSVD/ratings_temp.train', 0.8)
		# 	ratingsSampler(ratings_test, 'TwitterRatings/funkSVD/ratings_temp.test', 0.8)
		svd = pyreclab.SVD(dataset='TwitterRatings/funkSVD/ratings.train',  # or ratings_temp.train
		                   dlmchar=b',', header=False, usercol=0, itemcol=1, ratingcol=2)
		svd.train(factors=params['f'], maxiter=params['mi'], lr=params['lr'], lamb=params['lamb'])
		recommendationList = svd.testrec(input_file='TwitterRatings/funkSVD/ratings.test',  # or ratings_temp.test
		                                 dlmchar=b',', header=False,
		                                 usercol=0, itemcol=1, ratingcol=2,
		                                 topn=n, includeRated=False)

		users_precisions, users_recalls = [], []
		for userId in recommendationList[0]:
			recs = set(recommendationList[0][userId])
			cons = set(preferred_consumption[userId])
			tp = len(recs & cons)  # recommended and preferred
			fp = len(recs - cons)  # recommended but not preferred
			fn = len(cons - recs)  # preferred but not recommended
			users_precisions.append(float(tp) / (tp + fp))
			try:
				users_recalls.append(float(tp) / (tp + fn))
			except ZeroDivisionError:
				continue
		precision_folds.append(mean(users_precisions))
		recall_folds.append(mean(users_recalls))
		p = mean(precision_folds)
		r = mean(recall_folds)
		f = 2 * p * r / (p + r)  # F1: harmonic mean of precision and recall

		with open('TwitterRatings/funkSVD/recall.txt', 'a') as file:
			file.write("N=%s, P=%s, R=%s, F=%s\n" % (n, p, r, f))
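# Minimal driver sketch, assuming this module is run as a script. The
# hyperparameter values below are illustrative placeholders, not the tuned
# values from the original experiments.
if __name__ == '__main__':
	params = {'f': 1000, 'mi': 100, 'lr': 0.01, 'lamb': 0.1}
	PRF_calculator(params=params, folds=5, topN=[5, 10, 15, 20])
	svd_protocol_evaluation(data_path='TwitterRatings/funkSVD/data/', params=params)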