def __item_rating_dictionary_to_user_vector(self, item_ratings):
    if self.normalization:
        # Normalize the rating values while preserving the pairing
        # between items and ratings.
        ord_item_ratings = OrderedDict(item_ratings)
        items = ord_item_ratings.keys()
        counts = normalize(ord_item_ratings.values())
        for i in xrange(len(items)):
            item_ratings[items[i]] = counts[i]
    with h5py.File(self.h5filename, 'r') as model:
        # Build a dense vector indexed by the item positions stored in
        # the model, leaving unrated items at zero.
        user_vector = np.zeros(len(model['items']))
        for item_id in model['items']:
            if self.__unescape(item_id) in item_ratings:
                user_vector[model['items'][item_id][()]] = \
                    item_ratings[self.__unescape(item_id)]
    return user_vector
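# The `normalize` helper called throughout this module is not defined in
# this excerpt. The sketch below is a minimal, assumed implementation
# (module-level, plain L2 unit-norm scaling with an all-zero guard); the
# actual helper may differ.
def normalize(vector):
    # Coerce lists and h5py-backed rows to a float ndarray.
    vector = np.asarray(vector, dtype=float)
    norm = np.linalg.norm(vector)
    if norm == 0:
        return vector
    return vector / norm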
def train(self, csv_path):
    with h5py.File(self.h5filename) as model, open(csv_path, 'rb') as f_csv:
        logging.info("Loading data from %s" % csv_path)
        if self.training_sample_size is not None:
            logging.info("Using sample size %d" % self.training_sample_size)
        # Start with a small ratings matrix and grow it on demand.
        matrix_size = (100, 100)
        logging.info("Starting with a %dx%d ratings matrix in HDF5 file"
                     % matrix_size)
        model.create_dataset('ratings', matrix_size, maxshape=(None, None))
        model['ratings'][...] = 0
        # Map escaped user/item IDs to their row/column indices.
        users_index = model.create_group('users')
        items_index = model.create_group('items')
        row_counter = 0
        user_counter = 0
        item_counter = 0
        for user, item, rating in csv.reader(
                f_csv, delimiter=self.training_csv_delimiter):
            try:
                if not self.__escape(user) in users_index:
                    users_index[self.__escape(user)] = user_counter
                    user_counter += 1
                if not self.__escape(item) in items_index:
                    items_index[self.__escape(item)] = item_counter
                    item_counter += 1
                # Double the matrix along whichever axis ran out of room.
                if user_counter >= model['ratings'].shape[0]:
                    matrix_size = (matrix_size[0] * 2, matrix_size[1])
                    logging.info("Increasing ratings matrix in HDF5 file "
                                 "to a %dx%d dimension" % matrix_size)
                    model['ratings'].resize(matrix_size)
                if item_counter >= model['ratings'].shape[1]:
                    matrix_size = (matrix_size[0], matrix_size[1] * 2)
                    logging.info("Increasing ratings matrix in HDF5 file "
                                 "to a %dx%d dimension" % matrix_size)
                    model['ratings'].resize(matrix_size)
                row = model['ratings'][users_index[self.__escape(user)][()]]
                row[items_index[self.__escape(item)][()]] = int(rating)
                model['ratings'][users_index[self.__escape(user)][()]] = row
                if self.training_sample_size is not None:
                    row_counter += 1
                    if row_counter >= self.training_sample_size:
                        break
            except Exception:
                logging.warning("Error storing rating for (%s, %s, %s), "
                                "skipping" % (user, item, rating))
        # Shrink the matrix to the number of users/items actually seen.
        matrix_size = (user_counter, item_counter)
        logging.info("Reducing ratings matrix in HDF5 file to a %dx%d "
                     "dimension" % matrix_size)
        model['ratings'].resize(matrix_size)
        if self.normalization:
            logging.info("Normalizing ratings matrix in HDF5 file")
            for i in xrange(user_counter):
                user_vector = model['ratings'][i][()]
                model['ratings'][i] = normalize(user_vector)
        #logging.info("Running non-negative matrix factorization in disk")
        #mfact = pymf.NMF(model['ratings'], num_bases=self.training_rank)
        logging.info("Running singular value decomposition on disk")
        mfact = pymf.SVD(model['ratings'])
        mfact.factorize()
        logging.info("Storing U, S and V matrices in HDF5 file")
        # Users' latent factors.
        #model['W'] = mfact.W
        #model['W'] = np.dot(mfact.U, np.sqrt(mfact.S))
        model['U'] = mfact.U
        model['S'] = mfact.S
        # Items' latent factors.
        #model['H'] = mfact.H
        #model['H'] = np.dot(np.sqrt(mfact.S), mfact.V)
        model['V'] = mfact.V
        logging.info("Training completed")
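# A minimal usage sketch for training. The HDF5/CSV file names, the
# delimiter and the rank below are hypothetical placeholders; the CSV is
# assumed to hold (user, item, rating) rows as read by train().
#
#     model = LatentFactorsModel('model.h5')
#     model.set_training_csv_delimiter(',')
#     model.set_training_rank(50)
#     model.train('ratings.csv')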
def k_fold_cross_validation(self, original_csv_path, k=10,
                            given_fraction=0.8, feature_sampling=None,
                            max_features=None, output_filename=None):
    self.enable_normalization()
    with open(original_csv_path, 'rb') as f_csv:
        user_set = set([])
        item_set = set([])
        # Load all user and item IDs.
        logging.info("Loading all user and item IDs from %s"
                     % original_csv_path)
        for user, item, rating in csv.reader(
                f_csv, delimiter=self.training_csv_delimiter):
            user_set.add(user)
            item_set.add(item)
        user_set = list(user_set)
        # Generate k partitions.
        logging.info("Partitioning the user data into %d sets" % k)
        random.shuffle(user_set)
        folds = self.__partition(user_set, k)
        # Create a temporary CSV file for each partition.
        fold_csv_files = []
        fold_csv_writers = []
        for fold in folds:
            tmp_file = NamedTemporaryFile(delete=False)
            fold_csv_files.append(tmp_file)
            fold_csv_writers.append(
                csv.writer(tmp_file, delimiter=self.training_csv_delimiter))
        # Create user-fold_index dictionary.
        user_fold_index = {}
        has_item = []
        for i in xrange(len(folds)):
            has_item.append(set([]))
            for user in folds[i]:
                user_fold_index[user] = i
        hdf5_filenames = []
        for i in xrange(len(folds)):
            tmpfile = NamedTemporaryFile(delete=False)
            tmpfile.close()
            hdf5_filenames.append(tmpfile.name)
        # Restart reading the CSV file and fill the per-fold CSVs.
        logging.info("Creating training and test sets")
        f_csv.seek(0)
        item_vector = list(item_set)
        for user, item, rating in csv.reader(
                f_csv, delimiter=self.training_csv_delimiter):
            rating = int(rating)
            for i in xrange(len(fold_csv_writers)):
                if user_fold_index[user] == i:
                    # Store test set.
                    with h5py.File(hdf5_filenames[i]) as model:
                        if not 'test' in model:
                            model.create_dataset(
                                'test', (len(folds[i]), len(item_vector)),
                                dtype='f')
                            model['test'][...] = 0
                        model['test'][folds[i].index(user),
                                      item_vector.index(item)] = rating
                else:
                    # Create training data.
                    fold_csv_writers[i].writerow([user, item, rating])
                    has_item[i].add(item)
        # Normalize test sets (use a distinct inner loop variable so the
        # outer index is not shadowed).
        logging.info("Normalizing test sets")
        for i in xrange(len(hdf5_filenames)):
            with h5py.File(hdf5_filenames[i]) as model:
                for j in xrange(model['test'].shape[0]):
                    model['test'][j] = normalize(model['test'][j])
        # Add zero values for missing items for a user from another fold
        # (fix to guarantee that all items are included).
        logging.info("Adding reference to missing items")
        for i in xrange(len(has_item)):
            not_i = 0
            if i == 0:
                not_i = 1
            for item in item_set - has_item[i]:
                fold_csv_writers[i].writerow([folds[not_i][0], item, 0])
        # Create a model for each training set.
        logging.info("Training models for each of the %d folds" % k)
        for i in xrange(len(fold_csv_files)):
            model = LatentFactorsModel(hdf5_filenames[i])
            model.set_training_csv_delimiter(self.training_csv_delimiter)
            model.set_training_rank(self.training_rank)
            model.set_training_sample_size(self.training_sample_size)
            fold_csv_files[i].close()
            model.train(fold_csv_files[i].name)
        # Predict and evaluate all folds.
        logging.info("Using test sets to evaluate models in k=%d folds" % k)
        fold_mae = []
        fold_rmse = []
        n_feature_fold_mae = {}
        n_features = None
        feature_sizes = None
        for i in xrange(len(hdf5_filenames)):
            with h5py.File(hdf5_filenames[i]) as model:
                # V is stored as (rank x items), so the number of latent
                # features is the number of rows.
                n_features = model['V'].shape[0]
                n_max_features = n_features
                if max_features is not None:
                    n_max_features = max_features
                if feature_sampling is not None:
                    #feature_sizes = (range(1, min(n_max_features/feature_sampling, 20))
                    #                 + range(n_max_features / feature_sampling,
                    #                         n_max_features,
                    #                         n_max_features / feature_sampling))
                    # Sample feature counts on a log scale from 1 up to
                    # n_max_features.
                    feature_sizes = sorted(set(
                        [int(round(x))
                         for x in np.logspace(0, np.log10(n_max_features),
                                              feature_sampling)]))
                    logging.info("Calculating MAE for the following number "
                                 "of features: %s"
                                 % ', '.join([str(s) for s in feature_sizes]))
                query_prediction_indices_tuples = []
                n_feature_query_prediction_indices_tuples = {}
                for user_vector in model['test']:
                    # Randomly hide given_fraction of the known ratings
                    # and remember their indices for evaluation.
                    query_user_vector = np.array(
                        [rating for rating in user_vector])
                    idx_ratings = np.flatnonzero(query_user_vector)
                    random.shuffle(idx_ratings)
                    idx_ratings = idx_ratings[
                        0:int(math.ceil(len(idx_ratings) * given_fraction))]
                    query_user_vector[idx_ratings] = 0
                    query_user_vector = list(query_user_vector)
                    idx_ratings = list(idx_ratings)
                    # Fold the query vector into the latent space, then
                    # reconstruct it to obtain predictions.
                    projected_user_vector = np.dot(
                        query_user_vector,
                        np.dot(model['V'][:].T, np.linalg.inv(model['S'])))
                    predictions = normalize(
                        np.dot(projected_user_vector,
                               np.dot(model['S'], model['V'])))
                    query_prediction_indices_tuples.append(
                        (user_vector, predictions, idx_ratings))
                    if feature_sampling is not None:
                        for size in feature_sizes:
                            # Repeat the projection using only the first
                            # `size` latent features.
                            projected_user_vector = np.dot(
                                query_user_vector,
                                np.dot(model['V'][0:size].T,
                                       np.linalg.inv(
                                           model['S'][0:size, 0:size])))
                            predictions = normalize(
                                np.dot(projected_user_vector,
                                       np.dot(model['S'][0:size, 0:size],
                                              model['V'][0:size])))
                            if not size in n_feature_query_prediction_indices_tuples:
                                n_feature_query_prediction_indices_tuples[size] = []
                            n_feature_query_prediction_indices_tuples[size].append(
                                (user_vector, predictions, idx_ratings))
                fold_mae.append(self.mean_absolute_error(
                    query_prediction_indices_tuples))
                fold_rmse.append(self.root_mean_squared_error(
                    query_prediction_indices_tuples))
                if feature_sampling is not None:
                    for size in feature_sizes:
                        if not size in n_feature_fold_mae:
                            n_feature_fold_mae[size] = []
                        n_feature_fold_mae[size].append(
                            self.mean_absolute_error(
                                n_feature_query_prediction_indices_tuples[size]))
            logging.info("Fold %d done" % (i + 1,))
        out = None
        writer = None
        if output_filename is not None:
            out = open(output_filename, 'wb')
            writer = csv.writer(out, delimiter=self.training_csv_delimiter)
        avg_mae = np.mean(fold_mae)
        std_mae = np.std(fold_mae)
        logging.info("MAE(n_features=%d): %f +/- %f"
                     % (n_features, avg_mae, std_mae))
        logging.info("RMSE(n_features=%d): %f +/- %f"
                     % (n_features, np.mean(fold_rmse), np.std(fold_rmse)))
        if writer is not None:
            logging.info("Storing evaluation results in %s" % output_filename)
            writer.writerow(["features", "avg.mae", "std.mae"])
        if feature_sampling is not None:
            for size in n_feature_fold_mae:
                # Use separate names so the overall MAE computed above is
                # not overwritten before it is written out below.
                size_avg_mae = np.mean(n_feature_fold_mae[size])
                size_std_mae = np.std(n_feature_fold_mae[size])
                if writer is None:
                    logging.info("MAE(n_features=%d) = %f +/- %f"
                                 % (size, size_avg_mae, size_std_mae))
                else:
                    writer.writerow([size, size_avg_mae, size_std_mae])
        if writer is not None:
            writer.writerow([n_features, avg_mae, std_mae])
        if out is not None:
            out.close()
        # Delete temporary files.
        logging.info("Deleting temporary files")
        for fold_csv_file, hdf5_filename in zip(fold_csv_files,
                                                hdf5_filenames):
            os.unlink(fold_csv_file.name)
            os.unlink(hdf5_filename)
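# A minimal usage sketch for evaluation. File names and parameter values
# are hypothetical placeholders; feature_sampling controls how many
# log-spaced feature counts are evaluated for the MAE-vs-features curve.
#
#     model = LatentFactorsModel('model.h5')
#     model.set_training_csv_delimiter(',')
#     model.set_training_rank(50)
#     model.k_fold_cross_validation('ratings.csv', k=10,
#                                   given_fraction=0.8,
#                                   feature_sampling=20,
#                                   output_filename='mae_vs_features.csv')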
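# The projection used during evaluation doubles as a recipe for scoring a
# new user against a trained model: fold the rating vector into the
# latent space through V and S^-1, then reconstruct it through S and V.
# The function below is a hypothetical helper sketching that step under
# the same layout assumptions as the code above (S is rank x rank, V is
# rank x items); it is not part of this class.
def predict_scores(h5filename, query_user_vector):
    with h5py.File(h5filename, 'r') as model:
        S = model['S'][:]
        V = model['V'][:]
        # (items,) -> (rank,): latent coordinates of the query user.
        projected = np.dot(query_user_vector, np.dot(V.T, np.linalg.inv(S)))
        # (rank,) -> (items,): predicted scores for every item.
        return normalize(np.dot(projected, np.dot(S, V)))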