def fit(self, URM, n_factors=10, learning_rate=1e-4, epochs=10,
        user_regularization=0.001, positive_item_regularization=0.001,
        negative_item_regularization=0.001, evaluate_every=1):
    self.URM = URM
    self.epochs = epochs
    self.n_users = self.URM.shape[0]
    self.n_items = self.URM.shape[1]

    e = MFBPR_Epoch(URM,
                    n_factors=n_factors,
                    learning_rate=learning_rate,
                    user_regularization=user_regularization,
                    positive_item_regularization=positive_item_regularization,
                    negative_item_regularization=negative_item_regularization)

    print('Fitting MFBPR...')
    for numEpoch in range(self.epochs):
        print('Epoch:', numEpoch)
        e.epochIteration()
        if (numEpoch + 1) % evaluate_every == 0:
            self.user_factors, self.item_factors = e.get_user_item_factors()
            recs = self.recommend_batch(userids=d.get_target_playlists())
            self.evaluate(recs, d.get_urm_test_1())
    self.user_factors, self.item_factors = e.get_user_item_factors()

    # Check how well the model ranks the test set: draw, as positive, a sample
    # that is in the test set but not in the training set, plus a random unseen
    # item as negative, and count how often the negative outscores the positive.
    trials = 10000
    count_wrong = 0
    test = d.get_urm_test_1()
    for _ in range(trials):
        user_id = np.random.choice(self.n_users)
        user_seen_items = d.get_urm()[user_id, :].indices
        test_items = test[user_id, :].indices
        pos_item_id = np.random.choice(test_items)

        neg_item_selected = False
        while not neg_item_selected:
            neg_item_id = np.random.randint(0, self.n_items)
            if neg_item_id not in user_seen_items:
                neg_item_selected = True

        xui = np.dot(self.user_factors[user_id, :], self.item_factors[pos_item_id, :])
        xuj = np.dot(self.user_factors[user_id, :], self.item_factors[neg_item_id, :])
        xuij = xui - xuj
        if xuij < 0:
            count_wrong += 1
            # print('u: {}, i: {}, j: {}. xui - xuj: {}'.format(user_id, pos_item_id, neg_item_id, xuij))
    print('percentage of wrong preferences in test set: {}'.format(count_wrong / trials))
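# A self-contained sketch of the SGD step a BPR epoch is expected to perform on
# (user, positive, negative) triples. This is an assumption: MFBPR_Epoch is not
# shown here, so this follows the standard BPR-MF update (Rendle et al., 2009)
# with the same hyperparameter names used above.
import numpy as np

rng = np.random.default_rng(0)
n_factors, lr, reg = 10, 1e-4, 0.001
P = rng.normal(scale=0.1, size=(5, n_factors))    # user factors
Q = rng.normal(scale=0.1, size=(8, n_factors))    # item factors

u, i, j = 0, 1, 2                                 # sampled user, positive item, negative item
x_uij = P[u] @ Q[i] - P[u] @ Q[j]                 # pairwise preference score, as in the check above
sig = 1.0 / (1.0 + np.exp(x_uij))                 # sigmoid(-x_uij), the gradient scaling

p_u = P[u].copy()                                 # keep the pre-update user factors
P[u] += lr * (sig * (Q[i] - Q[j]) - reg * P[u])   # push the user towards i and away from j
Q[i] += lr * (sig * p_u - reg * Q[i])             # raise the positive item's score
Q[j] += lr * (-sig * p_u - reg * Q[j])            # lower the negative item's score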
def fit(self, URM_train=d.get_urm_train_1(), epochs=190, URM_test=d.get_urm_test_1(),
        user_ids=d.get_target_playlists(), batch_size=1000, validate_every_N_epochs=2,
        start_validation_after_N_epochs=191, lambda_i=0.0, lambda_j=0.0,
        learning_rate=0.01, topK=1500, sgd_mode='adagrad'):
    """
    Train the model, learning the weight matrix W.

    :param URM_train: (csr_matrix) the URM used to train the model, either the full one or the train split
    :param epochs: (int) number of training epochs
    :param URM_test: (csr_matrix) needed if we'd like to perform validation
    :param user_ids: (list) needed if we'd like to perform validation
    :param batch_size: (int) after how many samples the parameters should be updated
    :param validate_every_N_epochs: (int) how often the MAP evaluation should be displayed
    :param start_validation_after_N_epochs: (int) first epoch at which validation kicks in
    :param lambda_i: (float) first regularization term
    :param lambda_j: (float) second regularization term
    :param learning_rate: (float) algorithm learning rate
    :param topK: (int) how many elements should be taken into account while computing URM*W
    :param sgd_mode: (string) optimization algorithm
    """
    self.URM_train = URM_train.T
    self.n_users = URM_train.shape[0]
    self.n_items = URM_train.shape[1]
    self.sgd_mode = sgd_mode

    print('before cython')
    from cythoncompiled.SLIM_BPR.SLIM_BPR_Cython_Epoch import SLIM_BPR_Cython_Epoch
    self.cythonEpoch = SLIM_BPR_Cython_Epoch(self.URM_train,
                                             sparse_weights=False,
                                             topK=topK,
                                             learning_rate=learning_rate,
                                             li_reg=lambda_i,
                                             lj_reg=lambda_j,
                                             batch_size=100,
                                             symmetric=True,
                                             sgd_mode=sgd_mode)
    print('after cython')

    # Call the parent's fit to start the actual training
    self._fit_alreadyInitialized(epochs=epochs,
                                 logFile=None,
                                 URM_test=URM_test,
                                 user_ids=user_ids,
                                 filterTopPop=False,
                                 minRatingsPerUser=1,
                                 batch_size=batch_size,
                                 validate_every_N_epochs=validate_every_N_epochs,
                                 start_validation_after_N_epochs=start_validation_after_N_epochs,
                                 lambda_i=lambda_i,
                                 lambda_j=lambda_j,
                                 learning_rate=learning_rate,
                                 topK=topK)
    print('after already_initialized')
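# What SLIM-BPR ultimately produces, per the docstring above, is an item-item
# weight matrix W with estimated scores R_hat = URM * W (keeping topK nonzeros).
# A minimal standalone sketch of that scoring step on random stand-in data:
import scipy.sparse as sps

urm_toy = sps.random(30, 40, density=0.10, format='csr', random_state=0)
W_toy = sps.random(40, 40, density=0.05, format='csr', random_state=1)  # stands in for the learned W
r_hat_toy = urm_toy.dot(W_toy)  # a score for every (user, item) pair
print(r_hat_toy.shape)          # (30, 40)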
def run(self, urm_train=None, urm=None, urm_test=None, targetids=None, factors=100,
        regularization=0.01, iterations=100, alpha=25, with_scores=False, export=True,
        verbose=True):
    """
    Run the model and export the results to a file

    Parameters
    ----------
    urm_train : csr_matrix, URM used to train the model. If None, use: data.get_urm_train_1().
        This should be the entire URM for which the targetids correspond to the row indexes.
    urm_test : csr_matrix, URM on which to test the model. If None, use: data.get_urm_test_1()
    targetids : list, target user ids. If None, use: data.get_target_playlists()
    factors : int, number of latent factors
    regularization : float, regularization term
    iterations : int, number of training iterations
    alpha : int, scaling parameter passed to fit

    Returns
    -------
    recs: (list) recommendations
    map10: (float) MAP10 for the provided recommendations
    """
    _urm = data.get_urm_train_1()
    _icm = data.get_icm()
    _urm_test = data.get_urm_test_1()
    _targetids = data.get_target_playlists()
    # _targetids = data.get_all_playlists()

    start = time.time()

    urm_train = _urm if urm_train is None else urm_train
    # urm = _urm if urm is None else urm
    urm_test = _urm_test if urm_test is None else urm_test
    targetids = _targetids if targetids is None else targetids

    self.fit(urm=urm_train, factors=factors, regularization=regularization,
             iterations=iterations, alpha=alpha)
    recs = self.recommend_batch(userids=targetids, with_scores=with_scores, verbose=verbose)

    map10 = None
    if len(recs) > 0:
        map10 = self.evaluate(recs, test_urm=urm_test, verbose=verbose)
    else:
        log.warning('No recommendations available, skip evaluation')

    if export:
        exportcsv(recs, path='submission', name=self.name, verbose=verbose)

    if verbose:
        log.info('Run in: {:.2f}s'.format(time.time() - start))

    return recs, map10
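# The (factors, regularization, iterations, alpha) signature matches implicit-
# feedback ALS (Hu, Koren, Volinsky 2008), where alpha rescales the raw
# interactions R into confidences C = 1 + alpha * R. This reading of alpha is
# an assumption about this model's fit; the weighting itself is shown below on
# a toy matrix:
import numpy as np

R = np.array([[1, 0, 1],
              [0, 1, 0]])   # toy implicit URM
alpha = 25
C = 1 + alpha * R           # confidence 26 for observed pairs, 1 for unobserved ones
print(C)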
def run(self, distance, ucm_train=None, urm=None, urm_test=None, targetids=None, k=100,
        shrink=10, threshold=0, implicit=True, alpha=None, beta=None, l=None, c=None,
        with_scores=False, export=True, verbose=True):
    """
    Run the model and export the results to a file

    Parameters
    ----------
    distance : str, distance metric
    targetids : list, target user ids. If None, use: data.get_target_playlists()
    k : int, K nearest neighbours to consider
    shrink : float, shrink term used in the normalization
    threshold : float, all the values under this value are cut from the final result
    implicit : bool, if True, treat the URM as implicit, otherwise consider explicit
        ratings (real values) in the URM

    Returns
    -------
    recs: (list) recommendations
    map10: (float) MAP10 for the provided recommendations
    """
    start = time.time()

    _ucm_train = data.get_ucm_train()
    _urm = data.get_urm_train_1()
    _urm_test = data.get_urm_test_1()
    _targetids = data.get_target_playlists()

    ucm_train = _ucm_train if ucm_train is None else ucm_train
    urm = _urm if urm is None else urm
    urm_test = _urm_test if urm_test is None else urm_test
    targetids = _targetids if targetids is None else targetids

    self.fit(ucm_train, k=k, distance=distance, alpha=alpha, beta=beta, c=c, l=l,
             shrink=shrink, threshold=threshold, implicit=implicit)
    recs = self.recommend_batch(targetids, urm=urm, with_scores=with_scores, verbose=verbose)

    map10 = None
    if len(recs) > 0:
        map10 = self.evaluate(recs, test_urm=urm_test, verbose=verbose)
    else:
        log.warning('No recommendations available, skip evaluation')

    if export:
        exportcsv(recs, path='submission', name='{}_{}'.format(self.name, distance), verbose=verbose)

    if verbose:
        log.info('Run in: {:.2f}s'.format(time.time() - start))

    return recs, map10
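# How the shrink term documented above typically enters the similarity (a
# standard shrunk-cosine formulation; the repo's distance implementations may
# differ in detail): sim(i, j) = (v_i . v_j) / (||v_i|| * ||v_j|| + shrink),
# which damps similarities supported by few co-occurrences.
import numpy as np

v_i = np.array([1.0, 1.0, 0.0])
v_j = np.array([1.0, 0.0, 0.0])
shrink = 10
sim = (v_i @ v_j) / (np.linalg.norm(v_i) * np.linalg.norm(v_j) + shrink)
print(sim)  # ~0.088, far below the raw cosine of ~0.707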
def validate(self, user_ids=d.get_target_playlists(), log_path=None, normalize_similarity=[False],
             damp_coeff=[1], add_zeros_quota=[1], loss_tolerance=[1e-6], iteration_limit=[30],
             use_incremental=[False]):
    if log_path is not None:
        orig_stdout = sys.stdout
        f = open(log_path + '/' + self.name + ' ' + time.strftime('_%H-%M-%S') + ' '
                 + time.strftime('%d-%m-%Y') + '.txt', 'w')
        sys.stdout = f

    for ns in normalize_similarity:
        for dc in damp_coeff:
            for adq in add_zeros_quota:
                for lt in loss_tolerance:
                    for il in iteration_limit:
                        for ui in use_incremental:
                            print(self._print(normalize_similarity=ns, add_zeros_quota=adq,
                                              loss_tolerance=lt, iteration_limit=il,
                                              damp_coeff=dc, use_incremental=ui))
                            self.fit(ICM=d.get_icm(), URM_train=d.get_urm_train_1(),
                                     normalize_similarity=ns, add_zeros_quota=adq,
                                     loss_tolerance=lt, iteration_limit=il, damp_coeff=dc,
                                     use_incremental=ui)
                            recs = self.recommend_batch(user_ids, urm=d.get_urm_train_1())
                            self.evaluate(recs, d.get_urm_test_1())

    if log_path is not None:
        sys.stdout = orig_stdout
        f.close()
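# The six nested loops above enumerate the Cartesian product of the parameter
# grids; itertools.product expresses the same enumeration more compactly. A
# sketch of the equivalent iteration only (the grids shown are the defaults
# from the signature above):
from itertools import product

grids = ([False], [1], [1], [1e-6], [30], [False])
for ns, dc, adq, lt, il, ui in product(*grids):
    print(ns, dc, adq, lt, il, ui)  # one combination per fit/evaluate cycle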
def run(self, distance, h, k=100, shrink=10, threshold=0, alpha=None, beta=None, l=None,
        c=None, export=True, verbose=True):
    """
    Run the model and export the results to a file

    Parameters
    ----------
    distance : str, distance metric
    k : int, K nearest neighbours to consider
    shrink : float, shrink term used in the normalization
    threshold : float, all the values under this value are cut from the final result

    Returns
    -------
    recs: (list) recommendations
    map10: (float) MAP10 for the provided recommendations
    """
    start = time.time()

    self.fit(k=k, distance=distance, shrink=shrink, alpha=alpha, beta=beta, l=l, c=c,
             verbose=verbose)
    recs = self.recommend_batch(N=10, filter_already_liked=True, verbose=verbose)

    map10 = None
    if len(recs) > 0:
        map10 = self.evaluate(recs, test_urm=data.get_urm_test_1(), verbose=verbose)
    else:
        log.warning('No recommendations available, skip evaluation')

    if export:
        exportcsv(recs, path='submission', name='{}_{}'.format(self.name, distance), verbose=verbose)

    if verbose:
        log.info('Run in: {:.2f}s'.format(time.time() - start))

    return recs, map10
def validate(l1_ratio_array, alpha_array, max_iter_array, topK_array,
             userids=data.get_target_playlists(), urm_train=data.get_urm_train_1(),
             urm_test=data.get_urm_test_1(), filter_already_liked=True, items_to_exclude=[],
             N=10, verbose=True, write_on_file=True):
    """
    Grid search over the SLIM ElasticNet hyperparameters: train a model for every
    combination of the provided l1_ratio, alpha, max_iter and topK values, evaluate
    it with MAP@10 and optionally log the results to a CSV file.
    """
    # create the initial model
    recommender = SLIMElasticNetRecommender()

    path = 'validation_results/'
    name = 'slim_rmse'
    folder = time.strftime('%d-%m-%Y')
    filename = '{}/{}/{}{}.csv'.format(path, folder, name, time.strftime('_%H-%M-%S'))
    # create dir if not exists
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    with open(filename, 'w') as out:
        for l in l1_ratio_array:
            for a in alpha_array:
                for m in max_iter_array:
                    for k in topK_array:
                        # train the model with the parameters
                        if verbose:
                            print('\n\nTraining slim_rmse with\n l1_ratio: {}\n alpha: {}\n'
                                  'Iterations: {}\n topK: {}'.format(l, a, m, k))
                            print('\n training phase...')
                        recommender.fit(urm=urm_train, l1_ratio=l, alpha=a, max_iter=m, topK=k)

                        # get the recommendations from the trained model
                        recommendations = recommender.recommend_batch(userids=userids, N=N,
                                                                      filter_already_liked=filter_already_liked,
                                                                      items_to_exclude=items_to_exclude)
                        # evaluate the model with map@10
                        map10 = recommender.evaluate(recommendations, test_urm=urm_test)
                        if verbose:
                            print('map@10: {}'.format(map10))

                        # write the results to the validation file
                        if write_on_file:
                            out.write('\n\nl1_ratio: {}\n alpha: {}\n Iterations: {}\n '
                                      'topK: {}\n evaluation map@10: {}'.format(l, a, m, k, map10))
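# A hypothetical invocation of the grid search above; these value ranges are
# illustrative only, not tuned values from this project:
validate(l1_ratio_array=[0.1, 0.5],
         alpha_array=[1e-4, 1e-3],
         max_iter_array=[100],
         topK_array=[100, 400])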
def run(self, urm_train=None, urm=None, urm_test=None, targetids=None, factors=100,
        regularization=0.01, iterations=100, alpha=25, with_scores=False, export=True,
        verbose=True):
    """
    Run the model and export the results to a file

    Returns
    -------
    :return: recs: (list) recommendations
    :return: map10: (float) MAP10 for the provided recommendations
    """
    _urm_train = data.get_urm_train_1()
    _urm = data.get_urm()
    _icm = data.get_icm()
    _urm_test = data.get_urm_test_1()
    _targetids = data.get_target_playlists()
    # _targetids = data.get_all_playlists()

    start = time.time()

    urm_train = _urm_train if urm_train is None else urm_train
    urm = _urm if urm is None else urm
    urm_test = _urm_test if urm_test is None else urm_test
    targetids = _targetids if targetids is None else targetids

    # NOTE: the ElasticNet hyperparameters are hardcoded here; the factors,
    # regularization, iterations and alpha arguments are not forwarded to fit
    self.fit(l1_ratio=0.1, positive_only=True, alpha=1e-4, fit_intercept=False, copy_X=False,
             precompute=False, selection='random', max_iter=100, topK=100, tol=1e-4,
             workers=multiprocessing.cpu_count())
    recs = self.recommend_batch(userids=targetids, with_scores=with_scores, verbose=verbose)

    map10 = None
    if len(recs) > 0:
        map10 = self.evaluate(recs, test_urm=urm_test, verbose=verbose)
    else:
        log.warning('No recommendations available, skip evaluation')

    if export:
        exportcsv(recs, path='submission', name=self.name, verbose=verbose)

    if verbose:
        log.info('Run in: {:.2f}s'.format(time.time() - start))

    return recs, map10
"""
If this file is executed, test the SLIM ElasticNet recommender
"""
if __name__ == '__main__':
    rec = SLIMElasticNetRecommender()
    rec.fit(urm=data.get_urm_train_1(), max_iter=1, topK=400, alpha=1e-4, l1_ratio=0.5)
    recs = rec.recommend_batch(userids=data.get_target_playlists())
    rec.evaluate(recommendations=recs, test_urm=data.get_urm_test_1())
log.warning('(s) Save the similarity matrix')
log.warning('(v) Validate the model')
log.warning('(e) Export the submission')
log.warning('(x) Exit')
arg = input()[0]
print()

if arg == 't':
    model = P3alphaRecommender(data.get_urm_train_1())
    model.fit(topK=900, alpha=1.2, min_rating=0, implicit=True, normalize_similarity=False)
    recs = model.recommend_batch(data.get_target_playlists())
    evaluate(recs, test_urm=data.get_urm_test_1())
elif arg == 'r':
    log.info('Wanna save for evaluation (y/n)?')
    if input()[0] == 'y':
        model = P3alphaRecommender(data.get_urm())
        path = 'raw_data/saved_r_hat_evaluation/'
    else:
        model = P3alphaRecommender(data.get_urm_train_1())
        path = 'raw_data/saved_r_hat/'
    model.fit(topK=500, alpha=1.7, min_rating=1, normalize_similarity=True)
    print('Saving the R^...')
    # use the sparse .dot product: np.dot is not aware of scipy sparse matrices
    r_hat = sps.csr_matrix(model.URM_train[data.get_target_playlists()].dot(model.W_sparse))
def validate(self, factors_array, iteration_array, urm_train=data.get_urm_train_1(),
             urm_test=data.get_urm_test_1(), verbose=True, write_on_file=True,
             userids=data.get_target_playlists(), N=10, filter_already_liked=True,
             items_to_exclude=[]):
    # create the initial model
    recommender = Pure_SVD()

    path = 'validation_results/'
    name = 'pure_SVD'
    folder = time.strftime('%d-%m-%Y')
    filename = '{}/{}/{}{}.csv'.format(path, folder, name, time.strftime('_%H-%M-%S'))
    # create dir if not exists
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    with open(filename, 'w') as out:
        for f in factors_array:
            for i in iteration_array:
                # train the model with the parameters
                if verbose:
                    print('\n\nTraining PURE_SVD with\n Factors: {}\n Iterations: {}\n'.format(f, i))
                    print('\n training phase...')
                recommender.fit(urm_train=urm_train, num_factors=f, iteration=i)

                # get the recommendations from the trained model
                recommendations = recommender.recommend_batch(userids=userids, N=N,
                                                              filter_already_liked=filter_already_liked,
                                                              items_to_exclude=items_to_exclude)
                # evaluate the model with map@10
                map10 = recommender.evaluate(recommendations, test_urm=urm_test)
                if verbose:
                    print('map@10: {}'.format(map10))

                # write the results to the validation file
                if write_on_file:
                    out.write('\n\nFactors: {}\n Iterations: {}\n evaluation map@10: {}'.format(f, i, map10))
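# What PureSVD computes under the hood (an assumption about this repo's
# Pure_SVD, but standard for the technique): a rank-k truncated SVD of the URM,
# with estimated scores R_hat = U_k * diag(s_k) * V_k^T. A standalone sketch on
# a random matrix:
import numpy as np
import scipy.sparse as sps
from scipy.sparse.linalg import svds

urm_toy = sps.random(50, 80, density=0.05, format='csr', random_state=0)
u, s, vt = svds(urm_toy, k=10)  # rank-10 factorization
r_hat_toy = (u * s) @ vt        # dense (50, 80) matrix of estimated scores
print(r_hat_toy.shape)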
def wizard_hybrid():
    SIM_MATRIX = ['saved_sim_matrix', 'saved_sim_matrix_evaluation']
    R_HAT = ['saved_r_hat', 'saved_r_hat_evaluation']
    SAVE = ['saved_sim_matrix', 'saved_r_hat']
    EVALUATE = ['saved_sim_matrix_evaluation', 'saved_r_hat_evaluation']

    start = time.time()
    matrices_array, folder, models = hb.create_matrices_array()
    print('matrices loaded in {:.2f} s'.format(time.time() - start))
    log.success('You have loaded: {}'.format(models))

    NORMALIZATION_MODE = normalization_mode_selection()

    if folder in SAVE:
        WEIGHTS = weights_selection(models)
        if folder in SIM_MATRIX:
            name, urm_filter_tracks, rel_path = option_selection_save('SIM')
            hybrid_rec = HybridSimilarity(matrices_array, normalization_mode=NORMALIZATION_MODE,
                                          urm_filter_tracks=urm_filter_tracks)
            sps.save_npz('raw_data/' + rel_path + name, hybrid_rec.get_r_hat(weights_array=WEIGHTS))
        if folder in R_HAT:
            name, urm_filter_tracks, rel_path, EXPORT = option_selection_save('R_HAT')
            hybrid_rec = HybridRHat(matrices_array, normalization_mode=NORMALIZATION_MODE,
                                    urm_filter_tracks=urm_filter_tracks)
            if EXPORT:
                N = ask_number_recommendations()
                recommendations = hybrid_rec.recommend_batch(weights_array=WEIGHTS,
                                                             target_userids=data.get_target_playlists(),
                                                             N=N)
                exportcsv(recommendations, path='submission', name=name)
            else:
                sps.save_npz('raw_data/' + rel_path + name, hybrid_rec.get_r_hat(weights_array=WEIGHTS))
    elif folder in EVALUATE:
        log.success('|WHAT DO YOU WANT TO DO ???|')
        log.warning('\'1\' BAYESIAN SEARCH VALIDATION')
        log.warning('\'2\' HAND CRAFTED WEIGHTS')
        mode = input()[0]

        # BAYESIAN SEARCH
        if mode == '1':
            log.success('|SELECT A NUMBER OF |||ITERATIONS||| FOR THE ALGORITHM|')
            iterations = int(input())
            urm_filter_tracks = data.get_urm_train_1()
            if folder in SIM_MATRIX:
                hybrid_rec = HybridSimilarity(matrices_array, normalization_mode=NORMALIZATION_MODE,
                                              urm_filter_tracks=urm_filter_tracks)
            if folder in R_HAT:
                hybrid_rec = HybridRHat(matrices_array, normalization_mode=NORMALIZATION_MODE,
                                        urm_filter_tracks=urm_filter_tracks)
            hybrid_rec.validate(iterations=iterations, urm_test=data.get_urm_test_1(),
                                userids=data.get_target_playlists())
        # MANUAL WEIGHTS
        elif mode == '2':
            WEIGHTS = weights_selection(models)
            urm_filter_tracks = data.get_urm_train_1()
            chose = option_selection_evaluation_2()  # save, evaluate or export to csv
            if chose == 's':
                log.success('|CHOOSE A NAME FOR THE MATRIX...|')
                name = input()
                if folder in SIM_MATRIX:
                    type = 'SIM'
                    hybrid_rec = HybridSimilarity(matrices_array, normalization_mode=NORMALIZATION_MODE,
                                                  urm_filter_tracks=urm_filter_tracks)
                elif folder in R_HAT:
                    type = 'R_HAT'
                    hybrid_rec = HybridRHat(matrices_array, normalization_mode=NORMALIZATION_MODE,
                                            urm_filter_tracks=urm_filter_tracks)
                sps.save_npz('raw_data/saved_r_hat_evaluation/' + name,
                             hybrid_rec.get_r_hat(weights_array=WEIGHTS))
                sym_rec = symmetric_recommender_creator(models, type, NORMALIZATION_MODE,
                                                        urm_filter_tracks=data.get_urm_train_2())
                sps.save_npz('raw_data/saved_r_hat_evaluation_2/' + name,
                             sym_rec.get_r_hat(weights_array=WEIGHTS))
            elif chose == 'e':
                if folder in SIM_MATRIX:
                    type = 'SIM'
                    hybrid_rec = HybridSimilarity(matrices_array, normalization_mode=NORMALIZATION_MODE,
                                                  urm_filter_tracks=urm_filter_tracks)
                elif folder in R_HAT:
                    type = 'R_HAT'
                    hybrid_rec = HybridRHat(matrices_array, normalization_mode=NORMALIZATION_MODE,
                                            urm_filter_tracks=urm_filter_tracks)
                N = ask_number_recommendations()
                print('Recommending...')
                recs = hybrid_rec.recommend_batch(weights_array=WEIGHTS,
                                                  target_userids=data.get_target_playlists(),
                                                  N=N)
                hybrid_rec.evaluate(recommendations=recs, test_urm=data.get_urm_test_1())

                # export the recommendations
                log.success('Do you want to save the CSV with these recommendations? (y/n)')
                if input()[0] == 'y':
                    export_csv_wizard(recs)

                sym_rec = symmetric_recommender_creator(models, type, NORMALIZATION_MODE,
                                                        urm_filter_tracks=data.get_urm_train_2())
                recs2 = sym_rec.recommend_batch(weights_array=WEIGHTS,
                                                target_userids=data.get_target_playlists())
                sym_rec.evaluate(recommendations=recs2, test_urm=data.get_urm_test_2())
            elif chose == 'c':
                if folder in R_HAT:
                    hybrid_rec = HybridRHat(matrices_array, normalization_mode=NORMALIZATION_MODE,
                                            urm_filter_tracks=urm_filter_tracks)
                    N = ask_number_recommendations()
                    print('Recommending...')
                    recs = hybrid_rec.recommend_batch(weights_array=WEIGHTS,
                                                      target_userids=data.get_target_playlists(),
                                                      N=N)
                    export_csv_wizard(recs)
                else:
                    log.error('not implemented yet')
    else:
        log.error('WRONG FOLDER')
# input_data_tensor = Variable(torch.from_numpy(np.asarray(input_data, dtype=np.int64))).to(self.device)
input_data_tensor = Variable(input_data).to(self.device)
label_tensor = Variable(label).to(self.device)

user_coordinates = input_data_tensor[:, 0]
item_coordinates = input_data_tensor[:, 1]

# FORWARD pass
prediction = self.pyTorchModel(user_coordinates, item_coordinates)

# Pass prediction and label, removing the last empty dimension of the prediction
loss = self.lossFunction(prediction.view(-1), label_tensor)

# BACKWARD pass
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()


def get_r_hat(self, load_from_file=False, path=''):
    pass

def run(self):
    pass


m = MF_MSE_PyTorch()
m.fit(d.get_urm_train_1(), user_ids=d.get_target_playlists(), URM_test=d.get_urm_test_1())
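# A plausible definition of the pyTorchModel invoked in the forward pass above
# (a sketch under assumptions: the repo's actual model class is not shown).
# MF as two embedding tables whose rows are dotted to produce a score:
import torch

class MFModelSketch(torch.nn.Module):
    def __init__(self, n_users, n_items, n_factors=10):
        super().__init__()
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        self.item_factors = torch.nn.Embedding(n_items, n_factors)

    def forward(self, user_coordinates, item_coordinates):
        # elementwise product summed over the factor axis == per-pair dot product
        return (self.user_factors(user_coordinates)
                * self.item_factors(item_coordinates)).sum(dim=1)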
""" if not at_k > 0: log.error('Invalid value of k {}'.format(at_k)) return aps = 0.0 for r in recommendations: row = test_urm.getrow(r[0]).indices m = min(at_k, len(row)) ap = 0.0 n_elems_found = 0.0 for j in range(1, m+1): if r[j] in row: n_elems_found += 1 ap = ap + n_elems_found/j if m > 0: ap = ap/m aps = aps + ap result = aps/len(recommendations) if verbose: log.warning('MAP: {}'.format(result)) return result rec = AsySVD() rec.fit(data.get_urm_train_1()) recs = rec.recommend_batch(data.get_target_playlists()) rec.evaluate(recs, data.get_urm_test_1())
              distance=model.SIM_SPLUS, k=600, alpha=0.25, beta=0.5, shrink=10, l=0.25, c=0.5)
    print('Saving the similarity matrix...')
    sps.save_npz('raw_data/saved_sim_matrix_evaluation_2/{}'.format(model.name),
                 model.get_sim_matrix())
elif arg == 'v':
    # model.validate(iterations=10, urm_train=data.get_urm_train_1(), urm_test=data.get_urm_test_1(),
    #                targetids=data.get_target_playlists(), distance=model.SIM_SPLUS,
    #                k=(100, 600), alpha=(0, 2), beta=(0, 2), shrink=(0, 100), l=(0, 1), c=(0, 1))
    model.validate(iterations=10,
                   urm_train=data.get_urm_train_1(),
                   urm_test=data.get_urm_test_1(),
                   targetids=data.get_target_playlists(),
                   distance=model.SIM_RP3BETA,
                   k=(100, 600), alpha=(0, 2), beta=(0, 2), shrink=(0, 100), l=1, c=1)
    # model.test(distance=CFItemBased.SIM_P3ALPHA, k=300, alpha=(0, 2), shrink=(0, 100))
elif arg == 'x':
    pass
else:
    log.error('Wrong option!')