def fit(self, data, test_split=True, test_portion=0.1, search_parameter_space=False):
    # Set train_data, test_data, user_ids etc. unless we are searching the
    # parameter space, in which case the existing train-test split is reused.
    if not search_parameter_space:
        self.test_split = test_split
        self.__set_data(data, test_portion)

    print('Initializing features for Users and Items...')
    initial = initializer(self.user_ids, self.item_ids,
                          self.initialization_method, self.n_latent, 0, 0)
    user_features, item_features = initial.initialize_latent_vectors()

    bu = dict([(key, 0) for key in self.train_data_user_ids])
    bi = dict([(key, 0) for key in self.train_data_item_ids])

    if not self.biased:
        global_mean = 0
    else:
        global_mean = self.all_ratings_in_train / len(self.train_data)

    for current_epoch in range(self.n_epochs):
        if self.verbose:
            print("Processing epoch {}".format(current_epoch))

        # (Re)initialize numerators and denominators to zero
        self.user_num = dict([(key, np.zeros(self.n_latent)) for key in self.train_data_user_ids])
        self.user_denom = dict([(key, np.zeros(self.n_latent)) for key in self.train_data_user_ids])
        self.item_num = dict([(key, np.zeros(self.n_latent)) for key in self.train_data_item_ids])
        self.item_denom = dict([(key, np.zeros(self.n_latent)) for key in self.train_data_item_ids])

        for u, i, r in self.train_data:
            # Compute current estimation and error
            dot = 0  # <q_i, p_u>
            for f in range(self.n_latent):
                dot += user_features[u][f] * item_features[i][f]
            est = global_mean + bu[u] + bi[i] + dot
            err = r - est

            # Update biases
            if self.biased:
                bu[u] += self.lr_bu * (err - self.reg_bu * bu[u])
                bi[i] += self.lr_bi * (err - self.reg_bi * bi[i])

            # Accumulate numerators and denominators
            for f in range(self.n_latent):
                self.user_num[u][f] += item_features[i][f] * r
                self.user_denom[u][f] += item_features[i][f] * est
                self.item_num[i][f] += user_features[u][f] * r
                self.item_denom[i][f] += user_features[u][f] * est

        # Update user factors with the multiplicative rule
        for u in self.train_data_user_ids:
            n_ratings = self.user_n_ratings[u]
            for f in range(self.n_latent):
                self.user_denom[u][f] += n_ratings * self.reg_user_features * user_features[u][f]
                user_features[u][f] *= self.user_num[u][f] / self.user_denom[u][f]

        # Update item factors with the multiplicative rule
        for i in self.train_data_item_ids:
            n_ratings = self.item_n_ratings[i]
            for f in range(self.n_latent):
                self.item_denom[i][f] += n_ratings * self.reg_item_features * item_features[i][f]
                item_features[i][f] *= self.item_num[i][f] / self.item_denom[i][f]

        # Calculate errors
        train_error = Test.rmse_error(
            self.train_data, user_features, item_features)

        # Show error to client
        if self.test_split:
            test_error = Test.rmse_error(
                self.test_data, user_features, item_features)
            print('Epoch Number: {}/{} Training RMSE: {:.2f} Test RMSE: {:.2f}'.format(
                current_epoch + 1, self.n_epochs, train_error, test_error))
        else:
            print('Epoch Number: {}/{} Training RMSE: {:.2f}'.format(
                current_epoch + 1, self.n_epochs, train_error))

    self.bu = bu
    self.bi = bi
    self.user_features = user_features
    self.item_features = item_features
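# --- Illustration (not part of the class) ----------------------------------
# The factor updates above implement the regularized NMF multiplicative rule
# (shown for a single user factor f; items are handled symmetrically):
#     p_uf <- p_uf * (sum_{i in I_u} q_if * r_ui)
#                  / (sum_{i in I_u} q_if * est_ui + |I_u| * reg * p_uf)
# Because the rule multiplies by a ratio of non-negative terms, the factors
# stay non-negative as long as they are initialized non-negative. A minimal
# numeric sketch of one such update for a user with a single rated item,
# assuming plain NumPy arrays and no bias terms (all names illustrative):
#
#     import numpy as np
#     p_u = np.array([0.4, 0.3])                   # current user factors
#     q_i = np.array([0.5, 0.2])                   # factors of the rated item
#     r, reg, n_ratings = 4.0, 0.06, 1
#     est = np.dot(p_u, q_i)                       # current rating estimate
#     num = q_i * r                                # accumulated numerator
#     denom = q_i * est + n_ratings * reg * p_u    # denominator + regularization
#     p_u *= num / denom                           # multiplicative update
# ----------------------------------------------------------------------------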
def fit(self, data, test_split=True, test_portion=0.1, search_parameter_space=False):
    # Set train_data, test_data, user_ids etc. if search_parameter_space is False.
    # If True, this lets us search the parameter space with the same train-test split.
    if not search_parameter_space:
        self.test_split = test_split
        self.__set_data(data, test_portion)

    # Initialization
    print('Initializing features for Users and Items...')
    initial = initializer(self.user_ids, self.item_ids,
                          self.initialization_method, self.n_latent,
                          self.init_mean, self.init_std)
    self.user_features, self.item_features = initial.initialize_latent_vectors()

    # Training
    print('Starting training...')
    error_counter = 0
    for epoch in range(self.max_epoch):
        # Update user and item features for each (user, item, rating) pair
        for user, item, rating in self.train_data:
            error = rating - np.dot(self.user_features[user], self.item_features[item])

            # Copy the user vector so both updates use the pre-update values.
            # (A plain assignment would alias the NumPy array, and the item
            # update below would see the already-updated user features.)
            temp = self.user_features[user].copy()

            self.user_features[user] += self.learning_rate * \
                (error * self.item_features[item] - self.regularization * self.user_features[user])
            self.item_features[item] += self.learning_rate * \
                (error * temp - self.regularization * self.item_features[item])

        # Calculate errors
        error_counter += 1
        train_error = Test.rmse_error(
            self.train_data, self.user_features, self.item_features)

        # Show error to client
        if self.test_split:
            test_error = Test.rmse_error(
                self.test_data, self.user_features, self.item_features)
            print('Epoch Number: {}/{} Training RMSE: {:.2f} Test RMSE: {:.2f}'.format(
                epoch + 1, self.max_epoch, train_error, test_error))
        else:
            print('Epoch Number: {}/{} Training RMSE: {:.2f}'.format(
                epoch + 1, self.max_epoch, train_error))

        # Save the best features whenever the test error improves
        if self.test_split and test_error < self.min_test_error:
            self.min_test_error = test_error
            self.best_user_features = copy.deepcopy(self.user_features)
            self.best_item_features = copy.deepcopy(self.item_features)
            error_counter = 0
        # Without a test split, track the best training error instead
        elif not self.test_split and train_error < self.min_train_error:
            self.min_train_error = train_error
            self.best_user_features = copy.deepcopy(self.user_features)
            self.best_item_features = copy.deepcopy(self.item_features)
            error_counter = 0

        # Break if the error didn't improve for the last n epochs and early stopping is set
        if self.early_stopping and error_counter >= self.early_stopping:
            print("Test error didn't get lower for the last {} epochs. Training is stopped.".format(
                error_counter))
            print('Best test error is: {:.2f}. Best features are saved.'.format(
                self.min_test_error))
            break

    print('Training has ended...')
    self.user_features = copy.deepcopy(self.best_user_features)
    self.item_features = copy.deepcopy(self.best_item_features)
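# --- Illustration (not part of the class) ----------------------------------
# The inner loop above performs the standard regularized matrix-factorization
# SGD step for each observed (user, item, rating) triple:
#     e_ui = r_ui - <p_u, q_i>
#     p_u <- p_u + lr * (e_ui * q_i - reg * p_u)
#     q_i <- q_i + lr * (e_ui * p_u_old - reg * q_i)
# where p_u_old is the user vector before its update (hence the `.copy()`).
# A minimal usage sketch, assuming ratings come as (user, item, rating)
# triples; the class name `MatrixFactorization` below is illustrative, not
# taken from the source:
#
#     data = [('u1', 'i1', 4.0), ('u1', 'i2', 3.0), ('u2', 'i1', 5.0)]
#     model = MatrixFactorization()
#     model.fit(data, test_split=True, test_portion=0.1)
# ----------------------------------------------------------------------------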
def fit(self, data, test_split=True, test_portion=0.1, search_parameter_space=False):
    # Local aliases for the learning rates and regularization constants
    lr_bu = self.lr_bu
    lr_bi = self.lr_bi
    lr_pu = self.lr_pu
    lr_qi = self.lr_qi
    lr_yj = self.lr_yj
    reg_bu = self.reg_bu
    reg_bi = self.reg_bi
    reg_pu = self.reg_pu
    reg_qi = self.reg_qi
    reg_yj = self.reg_yj

    if not search_parameter_space:
        self.test_split = test_split
        self.__set_data(data, test_portion)

    print('Initializing features for Users and Items...')
    initial = initializer(self.user_ids, self.item_ids,
                          self.initialization_method, self.n_latent,
                          self.init_mean, self.init_std)
    self.pu, self.qi = initial.initialize_latent_vectors(initalization_method='normal')
    _, self.yj = initial.initialize_latent_vectors(initalization_method='normal')

    self.bu = dict([(key, 0) for key in self.train_data_user_ids])
    self.bi = dict([(key, 0) for key in self.train_data_item_ids])

    if not self.biased:
        global_mean = 0
    else:
        global_mean = self.all_ratings_in_train / len(self.train_data)

    for current_epoch in range(self.n_epochs):
        if self.verbose:
            print("Processing epoch {}".format(current_epoch))

        for u, i, r in self.train_data:
            # Items rated by u
            self.Iu = [item for item in self.user_existing_ratings[u]]
            self.sqrt_Iu = np.sqrt(len(self.Iu))

            # Implicit feedback term: |I_u|^(-1/2) * sum of y_j over items rated by u
            self.u_impl_fdb = np.zeros(self.n_latent, np.double)
            for j in self.Iu:
                for f in range(self.n_latent):
                    self.u_impl_fdb[f] += self.yj[j][f] / self.sqrt_Iu

            # Compute current error
            dot = 0
            for f in range(self.n_latent):
                dot += self.qi[i][f] * (self.pu[u][f] + self.u_impl_fdb[f])
            err = r - (global_mean + self.bu[u] + self.bi[i] + dot)

            # Update biases
            self.bu[u] += lr_bu * (err - reg_bu * self.bu[u])
            self.bi[i] += lr_bi * (err - reg_bi * self.bi[i])

            # Update factors; the q_i gradient includes the implicit term,
            # matching the prediction rule above
            for f in range(self.n_latent):
                puf = self.pu[u][f]
                qif = self.qi[i][f]
                self.pu[u][f] += lr_pu * (err * qif - reg_pu * puf)
                self.qi[i][f] += lr_qi * (err * (puf + self.u_impl_fdb[f]) - reg_qi * qif)
                for j in self.Iu:
                    self.yj[j][f] += lr_yj * (err * qif / self.sqrt_Iu - reg_yj * self.yj[j][f])

        # Calculate errors
        train_error = Test.rmse_error(
            self.train_data, self.pu, self.qi)

        # Show error to client
        if self.test_split:
            test_error = Test.rmse_error(
                self.test_data, self.pu, self.qi)
            print('Epoch Number: {}/{} Training RMSE: {:.2f} Test RMSE: {:.2f}'.format(
                current_epoch + 1, self.n_epochs, train_error, test_error))
        else:
            print('Epoch Number: {}/{} Training RMSE: {:.2f}'.format(
                current_epoch + 1, self.n_epochs, train_error))
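# --- Illustration (not part of the class) ----------------------------------
# The model trained above is SVD++ (Koren, 2008): the prediction for (u, i) is
#     r_hat_ui = mu + b_u + b_i + <q_i, p_u + |I_u|^(-1/2) * sum_{j in I_u} y_j>
# where I_u is the set of items rated by u, and the y_j factors capture the
# implicit signal that u rated j at all, regardless of the rating value.
# A minimal numeric sketch of the implicit-feedback term, assuming factors are
# stored as dicts of NumPy arrays as in this class (all names illustrative):
#
#     import numpy as np
#     yj = {'i1': np.array([0.1, 0.2]), 'i2': np.array([0.3, 0.1])}
#     Iu = ['i1', 'i2']                                # items rated by u
#     u_impl_fdb = sum(yj[j] for j in Iu) / np.sqrt(len(Iu))
# ----------------------------------------------------------------------------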