def main(train_file, user_item_side_information_file, hierarchy_file, test_file):
    A = tsv_to_matrix(train_file)
    B = tsv_to_matrix(user_item_side_information_file)
    hierarchy = json.loads(open(hierarchy_file).read())

    lda = LDAHierarquical(B, hierarchy, topics=15)

    ##### REMOVE_IT: exploratory block measuring topic overlap between users
    ##### and their test cities
    def important_topics(x, topics):
        if not x:
            return x
        # Sort topic indices by weight and keep the first `topics` of them
        # (ascending order, so this keeps the smallest weights).
        transf = sorted(enumerate(x), key=lambda t: t[1])
        return [t[0] for t in transf[:topics]]

    topics = 3
    coincidencias = []

    # Load the test matrix once, instead of once per user inside the loop
    T = tsv_to_matrix(test_file)

    for user in range(1, 101):
        # Topics of this user
        user_topics = important_topics(lda.model['users'][user], topics)

        # Topics of the user's test cities
        cities = T[user].nonzero()[0]
        cities_topics = [important_topics(lda.model['cities'].get(city, []), topics)
                         for city in cities]

        topics_compared = 0
        coinc = 0
        for city_topic in cities_topics:
            if city_topic:
                coinc += len(set(user_topics) & set(city_topic))
                topics_compared += len(user_topics)

        # Guard on topics_compared (not just the city count) to avoid a
        # division by zero when the user has no topics
        perc = coinc / float(topics_compared) if topics_compared else -1
        coincidencias.append([coinc, topics_compared, perc])

    with open('/tmp/coincidencias.json', 'w') as out:
        out.write(json.dumps(coincidencias))
    #####

    W = slim_train(A)
    recommendations = slim_lda_recommender(A, W, lda)
    compute_precision(recommendations, test_file)
def main(train_file, part_file, test_file):
    AG = tsv_to_matrix(train_file, 942, 1682)
    AP = tsv_to_matrix(part_file, 942, 1682)

    W1 = slim_train(AG)
    W2 = slim_train(AP)

    # Blend the two models, sweeping the mixing weight from 0.0 to 1.0.
    # Float division is required here: under Python 2, i / 10 truncates
    # to 0 for every i < 10.
    for i in range(0, 11):
        gu = i / 10.0
        W = gu * W1 + (1 - gu) * W2
        print(gu)
        recommendations = slim_recommender(AP, W)
        compute_precision(recommendations, test_file)
def main(train_file, part_file):
    AG = tsv_to_matrix(train_file, 942, 1682)
    AP = tsv_to_matrix(part_file, 942, 1682)

    W1 = slim_train(AG)
    W2 = slim_train(AP)

    # With weights 0 and 1 this blend reduces to W2; W1 is kept only so the
    # mixing weights are easy to tweak.
    W = 0 * W1 + 1 * W2

    recommendations = slim_recommender(AP, W)
    compute_precision(recommendations, part_file)
def main(train_file, user_sideinformation_file, test_file, normalize):
    A = tsv_to_matrix(train_file)
    B = tsv_to_matrix(user_sideinformation_file)

    if normalize:
        B = normalize_values(B)

    A, B = make_compatible(A, B)

    W = sslim_train(A, B)

    recommendations = slim_recommender(A, W)

    return compute_precision(recommendations, test_file)
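# NOTE (added sketch): sslim_train is defined elsewhere in this repo. Below
# is a minimal sketch of the cSLIM idea from "Sparse Linear Methods with
# Side Information for Top-N Recommendations", assuming the stacked-matrix
# formulation suggested by the vstack import in the SSLIM scripts; names
# and the beta default are illustrative, not the repo's actual code.
def sslim_train_sketch(A, B, beta=1.0):
    from scipy.sparse import vstack
    # Stack user feedback A on top of the beta-weighted side information B,
    # then learn a single item-item W on the stacked matrix with plain SLIM.
    M = vstack((A, beta * B))
    return slim_train(M)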
def main(train_file, user_item_side_information_file, hierarchy_file, test_file):
    A = tsv_to_matrix(train_file)
    B = tsv_to_matrix(user_item_side_information_file)
    hierarchy = json.loads(open(hierarchy_file).read())

    W = slim_train(A)

    ## LDA
    #lda = LDAHierarquical(B, hierarchy, topics=20)
    #recommendations_lda = slim_lda_recommender(A, W, lda)
    #compute_precision(recommendations_lda, test_file)
    ###

    recommendations_slim = slim_recommender(A, W)

    ## HSLIM
    from hslim import (handle_user_bias, hierarchy_factory, normalize_wline,
                       generate_subitem_hierarchy)
    hierarchy = hierarchy_factory(hierarchy_file)

    K = slim_train(handle_user_bias(B))
    Wline = generate_subitem_hierarchy(K, W, hierarchy)
    WlineNorm = normalize_wline(Wline)
    recommendations_other = slim_recommender(A, WlineNorm)
    ###

    kendall_tau_values = []
    differences_values = []
    for u in recommendations_slim:
        ranking_slim = recommendations_slim[u][:RANKING_UNTIL]
        ranking_other = recommendations_other[u][:RANKING_UNTIL]
        kendall_tau_values.append(kendalltau(ranking_slim, ranking_other))
        differences_values.append(RANKING_UNTIL - len(set(ranking_slim) & set(ranking_other)))

    # Differences
    plt.hist(differences_values)
    plt.xlabel('Size of difference')
    plt.ylabel('Amount of rankings')
    plt.title('Differences (novelty) between rankings')
    show_matplot_fig()

    # Ranking comparison
    plt.figure()
    plt.hist([i[0] for i in kendall_tau_values])
    plt.xlabel('KendallTau Distance SLIM/SLIM LDA')
    plt.ylabel('Number of occurrences')
    plt.title('Comparison between rankings')
    show_matplot_fig()
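# NOTE (added example): scipy.stats.kendalltau returns a (statistic, pvalue)
# pair, which is why the histogram above keeps only i[0] from each result.
# A tiny sanity check of that API:
from scipy.stats import kendalltau
tau, p_value = kendalltau([1, 2, 3, 4, 5], [1, 2, 3, 5, 4])
# One swapped pair out of 10: tau = (9 - 1) / 10 = 0.8
assert abs(tau - 0.8) < 1e-9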
def main(train_file, user_sideinformation_file, test_file):
    A = tsv_to_matrix(train_file)
    B = tsv_to_matrix(user_sideinformation_file)

    # Dead experimental block kept for reference: exporting the matrices to
    # .mat files (note: `useritem_featureitem` is not defined in this scope).
    """
    from util import mm2csr
    mm2csr(A, '/tmp/train.mat')
    mm2csr(useritem_featureitem, '/tmp/train_feature.mat')
    C = tsv_to_matrix(test_file)
    mm2csr(C, '/tmp/test.mat')
    """

    W = sslim_train(A, B)

    recommendations = slim_recommender(A, W)

    compute_precision(recommendations, test_file)
def main(train_file, test_file):
    A = tsv_to_matrix(train_file)

    W = slim_train(A)

    recommendations = slim_recommender(A, W)

    compute_precision_as_an_oracle(recommendations, test_file)
def main(train_file, user_sideinformation_file, test_file, normalize):
    A = tsv_to_matrix(train_file)
    B = tsv_to_matrix(user_sideinformation_file)

    if normalize:
        B = normalize_values(B)

    A, B = make_compatible(A, B)

    W = sslim_train(A, B)
    save_matrix(W, 'sslim_oracle_wmatrix.tsv')

    recommendations = slim_recommender(A, W)

    precisions = compute_precision_as_an_oracle(recommendations, test_file)

    return precisions
def main(train_file, test_file):
    A = tsv_to_matrix(train_file)

    W = slim_train(A)

    recommendations = slim_recommender(A, W)

    return compute_precision(recommendations, test_file)
def main(train_file):
    from sklearn.cluster import KMeans

    data = tsv_to_matrix(train_file, 942, 1682)
    kmeans = KMeans(n_clusters=2, random_state=0).fit(data)
    print(data.shape)

    k = 4  # unused here: the KMeans above is configured with n_clusters=2

    # cluster_centers_ and labels_ are the fitted attributes (note the
    # trailing underscores)
    centroids = kmeans.cluster_centers_
    labels = kmeans.labels_
    print(centroids)
def main(train_file):
    data = tsv_to_matrix(train_file, 942, 1682).toarray()

    k = 8
    # data.shape is (942, 1682); shape[1] is the feature dimension
    dim = data.shape[1]

    iteration = 10
    for i in range(iteration):
        # Call cuckoo search to refine the centroids: seed it with the first
        # k points on the first pass, then with the previous k-means
        # centroids on later passes.
        if i == 0:
            centroids = cuckoo_search(k, dim, data[:k])
        else:
            centroids = cuckoo_search(k, dim, new_centroids)

        # k-means picking the first k points as centroids
        #centroids = data[:k]
        print("shape[0]: " + str(centroids.shape[0]))
        print("shape[1]: " + str(centroids.shape[1]))

        clusters, labels, new_centroids = kmeans(k, centroids, data, "first")

        if i == iteration - 1:
            trains = [np.array(cluster) for cluster in clusters]

            with open('./data/k8/k8_label.txt', 'w') as label_file:
                for label in labels:
                    label_file.write("%s\n" % str(label))

            for train in trains:
                print(len(train))
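# NOTE (added sketch): the kmeans/cuckoo_search helpers are project code not
# shown here. A minimal NumPy k-means matching the call signature used above
# (k, centroids, data, init_mode) is sketched below under that assumption;
# it is illustrative, not the project's actual implementation.
import numpy as np

def kmeans_sketch(k, centroids, data, init_mode, iters=20):
    # init_mode is accepted for signature compatibility but unused here.
    centroids = np.asarray(centroids, dtype=float).copy()
    for _ in range(iters):
        # Assign each point to its nearest centroid (Euclidean distance).
        dists = np.linalg.norm(data[:, None, :] - centroids[None, :, :], axis=2)
        labels = dists.argmin(axis=1)
        # Move each centroid to the mean of its assigned points.
        for c in range(k):
            members = data[labels == c]
            if len(members):
                centroids[c] = members.mean(axis=0)
    clusters = [data[labels == c].tolist() for c in range(k)]
    return clusters, labels, centroids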
def main(train_file, user_sideinformation_file, hierarchy_file, test_file):
    A = tsv_to_matrix(train_file)
    B = tsv_to_matrix(user_sideinformation_file, A.shape[0], A.shape[1])
    hierarchy = hierarchy_factory(hierarchy_file)

    # Learning using SLIM.
    # We handle user bias only in B, because B holds explicit evaluations.
    K = slim_train(handle_user_bias(B))
    W = slim_train(A)

    Wline = generate_subitem_hierarchy(K, W, hierarchy)
    WlineNorm = normalize_wline(Wline)

    #recommendations = slim_recommender(A, W + 0.2 * WlineNorm)
    recommendations = slim_recommender(A, WlineNorm)

    # Check whether the predictor is just or not:
    #user_cities = np.array([ map(hierarchy, B[i].nonzero()[0].tolist()) for i in range(B.shape[0]) ])
    #G = tsv_to_matrix(test_file)
    #print('MUST BE EMPTY: ', set(G[1].nonzero()[0]) & set(user_cities[1]))
    ### ---- END REMOVE_ME

    compute_precision(recommendations, test_file)
def main(train_file):
    data = tsv_to_matrix(train_file, 942, 1682).toarray()

    k = 4
    # k-means picking the first k points as centroids
    centroids = data[:k]
    clusters, labels = kmeans(k, centroids, data, "first")

    train0 = np.array(clusters[0])
    train1 = np.array(clusters[1])
    train2 = np.array(clusters[2])
    train3 = np.array(clusters[3])

    with open('label.txt', 'w') as label_file:
        for label in labels:
            label_file.write("%s\n" % str(label))

    print(train0.shape)
read "SLIM: Sparse LInear Methods for Top-N Recommender Systems". """ from sklearn.linear_model import SGDRegressor from util import tsv_to_matrix, generate_slices from util.metrics import compute_precision from util.recommender import slim_recommender import numpy as np import multiprocessing import ctypes from util import parse_args import simplejson as json args = parse_args() # Loading matrices A = tsv_to_matrix(args.train) # Loading shared array to be used in results shared_array_base = multiprocessing.Array(ctypes.c_double, A.shape[1] ** 2) shared_array = np.ctypeslib.as_array(shared_array_base.get_obj()) shared_array = shared_array.reshape(A.shape[1], A.shape[1]) # because in SLIM each column is independent we can use make this work in # parallel def work(params, W=shared_array): from_j = params[0] to_j = params[1] M = params[2] model = params[3] counter = 0
from util import (tsv_to_matrix, generate_slices, make_compatible,
                  normalize_values, save_matrix)
from util.metrics import compute_precision
import numpy as np
import multiprocessing
import ctypes
from scipy.sparse import vstack
import datetime
from util import parse_args
import simplejson as json

print('>>> Start: %s' % datetime.datetime.now())

args = parse_args(side_information=True, beta=True)

# Loading matrices
A = tsv_to_matrix(args.train)
B = tsv_to_matrix(args.side_information)

if args.normalize:
    B = normalize_values(B)

A, B = make_compatible(A, B)

# Loading shared array to be used in results
shared_array_base = multiprocessing.Array(ctypes.c_double, A.shape[1] ** 2)
shared_array = np.ctypeslib.as_array(shared_array_base.get_obj())
shared_array = shared_array.reshape(A.shape[1], A.shape[1])

# We create a work function to fit each one of the columns of our W matrix;
# because in SLIM each column is independent, we can do this work in
# parallel
def main(train_file, part_file, test_file):
    AG = tsv_to_matrix(train_file, 942, 1682)
    AP = tsv_to_matrix(part_file, 942, 1682)

    W1 = slim_train(AG)
    W2 = slim_train(AP)

    # Each row r holds (HR, ARHR) for mixing weight gu = r * 0.05, one
    # matrix per top-N cutoff.
    k = 2
    matrix_5 = np.zeros((21, k))
    matrix_10 = np.zeros((21, k))
    matrix_15 = np.zeros((21, k))
    matrix_20 = np.zeros((21, k))

    for i in range(0, 105, 5):
        gu = i / 100.0  # float division so gu is not truncated under Python 2
        W = gu * W1 + (1 - gu) * W2
        print("gu: " + str(gu))
        recommendations = slim_recommender(AP, W)
        top5, top10, top15, top20 = compute_precision(recommendations, test_file)
        row = i // 5
        for j in range(k):
            matrix_5[row][j] = top5[j]
            matrix_10[row][j] = top10[j]
            matrix_15[row][j] = top15[j]
            matrix_20[row][j] = top20[j]

    matrices = [matrix_5, matrix_10, matrix_15, matrix_20]

    # Best HR over all gu values (best gu followed by its HR, per cutoff),
    # plus the HR at gu = 1 (pure W1).
    hr_values = []
    hr_values1 = []
    for m in matrices:
        index, value = max(enumerate(m[:, 0]), key=operator.itemgetter(1))
        hr_values.extend([index * 0.05, value])
        hr_values1.append(m[20][0])

    # Same for ARHR.
    arhr_values = []
    arhr_values1 = []
    for m in matrices:
        index, value = max(enumerate(m[:, 1]), key=operator.itemgetter(1))
        arhr_values.extend([index * 0.05, value])
        arhr_values1.append(m[20][1])

    print('k8 top5: %s' % matrix_5)
    print('k8 top10: %s' % matrix_10)
    print('k8 top15: %s' % matrix_15)
    print('k8 top20: %s' % matrix_20)
    print('Max HR: %s' % hr_values)
    print('HR at gu = 1: %s' % hr_values1)
    print('Max ARHR: %s' % arhr_values)
    print('ARHR at gu = 1: %s' % arhr_values1)
read "SLIM: Sparse LInear Methods for Top-N Recommender Systems". """ from sklearn.linear_model import SGDRegressor from util import tsv_to_matrix, generate_slices from util.metrics import compute_precision from util.recommender import slim_recommender import numpy as np import multiprocessing import ctypes from util import parse_args import simplejson as json args = parse_args() # Loading matrices A = tsv_to_matrix(args.train) # Loading shared array to be used in results shared_array_base = multiprocessing.Array(ctypes.c_double, A.shape[1]**2) shared_array = np.ctypeslib.as_array(shared_array_base.get_obj()) shared_array = shared_array.reshape(A.shape[1], A.shape[1]) # because in SLIM each column is independent we can use make this work in # parallel def work(params, W=shared_array): from_j = params[0] to_j = params[1] M = params[2] model = params[3] counter = 0
read "Sparse Linear Methods with Side Information for Top-N Recommendations" """ from sklearn.linear_model import SGDRegressor import numpy as np from recommender import slim_recommender from util import tsv_to_matrix, split_train_test, generate_slices from metrics import compute_precision import multiprocessing import ctypes import sys from scipy.sparse import vstack train_file, user_sideinformation_file, test_file = sys.argv[1:] # Loading matrices A = tsv_to_matrix(train_file) B = tsv_to_matrix(user_sideinformation_file) # Loading shared array to be used in results shared_array_base = multiprocessing.Array(ctypes.c_double, A.shape[1]**2) shared_array = np.ctypeslib.as_array(shared_array_base.get_obj()) shared_array = shared_array.reshape(A.shape[1], A.shape[1]) # We create a work function to fit each one of the columns of our W matrix, # because in SLIM each column is independent we can use make this work in # parallel def work(params, W=shared_array): from_j = params[0] to_j = params[1] M = params[2]
SLIM Parallel implementation. To understand deeply how it works we encourage
you to read "SLIM: Sparse LInear Methods for Top-N Recommender Systems".
"""
from sklearn.linear_model import SGDRegressor
from util import tsv_to_matrix, generate_slices
from metrics import compute_precision
from recommender import slim_recommender
import numpy as np
import multiprocessing
import ctypes
import sys

train_file, test_file = sys.argv[1:]

# Loading matrices
A = tsv_to_matrix(train_file)

# Loading shared array to be used in results
shared_array_base = multiprocessing.Array(ctypes.c_double, A.shape[1] ** 2)
shared_array = np.ctypeslib.as_array(shared_array_base.get_obj())
shared_array = shared_array.reshape(A.shape[1], A.shape[1])

# We create a work function to fit each one of the columns of our W matrix;
# because in SLIM each column is independent, we can do this work in
# parallel
def work(params, W=shared_array):
    from_j = params[0]
    to_j = params[1]
    M = params[2]
    model = params[3]
    counter = 0
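# NOTE (added sketch): the body of work() is truncated in every copy above.
# A minimal continuation under the SLIM per-column formulation is sketched
# below: column a_j of M is the regression target, column j of the input is
# zeroed so an item cannot predict itself, and the learned coefficients
# (negatives clipped to zero, as SLIM requires W >= 0) become column j of W.
# This is an illustrative reconstruction, not the repo's exact code; M is
# assumed to be a scipy.sparse.lil_matrix so column assignment works.
def work_sketch(params, W=shared_array):
    from_j, to_j, M, model = params
    for j in range(from_j, to_j):
        # Target: the j-th column of M, kept aside as a dense 1-D array.
        mlinej = M[:, j].copy()
        # Zero column j so item j is not used to predict itself.
        M[:, j] = 0
        model.fit(M, mlinej.toarray().ravel())
        # Restore the column before moving on.
        M[:, j] = mlinej
        w = model.coef_
        # Clip negative similarities to satisfy the non-negativity constraint.
        w[w < 0] = 0
        for el in w.nonzero()[0]:
            W[el, j] = w[el]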
read "Sparse Linear Methods with Side Information for Top-N Recommendations" """ from sklearn.linear_model import SGDRegressor import numpy as np from recommender import slim_recommender from util import tsv_to_matrix, split_train_test, generate_slices from metrics import compute_precision import multiprocessing import ctypes import sys from scipy.sparse import vstack train_file, user_sideinformation_file, test_file = sys.argv[1:] # Loading matrices A = tsv_to_matrix(train_file) B = tsv_to_matrix(user_sideinformation_file) # Loading shared array to be used in results shared_array_base = multiprocessing.Array(ctypes.c_double, A.shape[1]**2) shared_array = np.ctypeslib.as_array(shared_array_base.get_obj()) shared_array = shared_array.reshape(A.shape[1], A.shape[1]) # We create a work function to fit each one of the columns of our W matrix, # because in SLIM each column is independent we can use make this work in # parallel def work(params, W=shared_array): from_j = params[0] to_j = params[1] M = params[2] model = params[3]