def get_dataset(dataset_id):
    BUILTIN_DATASETS = {
        'ml-100k': {
            'path': './ml-100k/u.data',
            'line_format': 'user item rating timestamp',
            'rating_scale': (1, 5),
            'sep': '\t'
        },
        'ml-1m': {
            'path': './ml-1m/ratings.dat',
            'line_format': 'user item rating timestamp',
            'rating_scale': (1, 5),
            'sep': '::'
        },
        'ml-10m': {
            'path': './ml-10M100K/ratings.dat',
            'line_format': 'user item rating timestamp',
            'rating_scale': (1, 5),
            'sep': '::'
        }
    }
    dataset_props = BUILTIN_DATASETS[dataset_id]
    return Dataset.load_from_file(
        dataset_props.get('path'),
        Reader(line_format=dataset_props.get('line_format'),
               rating_scale=dataset_props.get('rating_scale'),
               sep=dataset_props.get('sep')))
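# A minimal usage sketch for get_dataset (assumptions: the MovieLens files exist at the
# relative paths hard-coded above, and Dataset/Reader are imported from surprise):
from surprise import SVD, accuracy
from surprise.model_selection import train_test_split

data = get_dataset('ml-100k')
trainset, testset = train_test_split(data, test_size=0.2)
algo = SVD()
algo.fit(trainset)
accuracy.rmse(algo.test(testset))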
def svd_ratings_predicate(observed_ratings_df, truth_ratings_df, fold='0', phase='eval'):
    """
    pmf_ratings Predicates
    """
    print("SVD predicates")
    svd_model = SVD()
    reader = Reader(rating_scale=(0.2, 1))
    train_dataset = Dataset.load_from_df(
        df=observed_ratings_df.reset_index().loc[:, ['userId', 'movieId', 'rating']],
        reader=reader)
    svd_model.fit(train_dataset.build_full_trainset())

    # make predictions
    predictions = pd.DataFrame(index=truth_ratings_df.index, columns=['rating'])
    for row in truth_ratings_df.loc[:, ['rating']].iterrows():
        uid = row[0][0]
        iid = row[0][1]
        predictions.loc[(uid, iid), 'rating'] = svd_model.predict(uid, iid).est

    write(predictions, 'svd_rating_obs', fold, phase)
def __init__(self, love_matrix: np.ndarray, model: AlgoBase, test_size: float = 0.2):
    self.love_matrix = love_matrix
    self.model = model
    # NOTE: rating_scale is expected to be defined as a class attribute; it is not set here.
    self.reader = Reader(rating_scale=self.rating_scale)
    self.test_size = test_size
def evaluate_prediction(df, sample_num, dir_attack_profiles, attack):
    """
    :param df:
    :param sample_num:
    :param dir_attack_profiles:
    :param attack:
    :return: the final positions, the final scores, the sample number, and the target item id
    """
    print("\t\t\t\tAttack {0}".format(attack))
    target_item_id = int(attack.split('_')[2].split('.')[0])
    df_attack = pd.read_csv(os.path.join(dir_attack_profiles, attack))

    # Reduce the number of shilling profiles with respect to the maximum
    perc_of_shilling_users = round(
        cfg.attackSizePercentage / max(cfg.size_of_attacks), 2)
    shilling_users = df_attack.userId.unique()
    df_attack = df_attack[df_attack.userId.isin(
        shilling_users[:int(len(shilling_users) * perc_of_shilling_users)])]
    shilling_ids = list(df_attack['userId'].unique())

    df_attack = df_attack.append(df, ignore_index=True).reset_index()
    algo = get_algo(df_attack)

    # First train a recommender algorithm on the sample dataset.
    if cfg.model in [cfg.ncf]:
        algo.fit()
    else:
        reader = Reader(line_format='user item rating',
                        rating_scale=cfg.rating_scale.get(cfg.dataset))
        data = Dataset.load_from_df(df_attack[['userId', 'itemId', 'rating']], reader)
        trainset = data.build_full_trainset()
        algo.fit(trainset)

    # Then predict ratings for all pairs (u, i) that are NOT in the training set.
    # We are evaluating on a single item.
    # predictions = algo.test(testset)
    # rec_list, final_positions, final_scores = get_rec_list_faster(predictions, target_item_id)
    # final_positions, final_scores = get_rec_list_faster(predictions, target_item_id, shilling_ids)

    print('\t\t\t\tEvaluating post prediction')
    if cfg.model in [cfg.ncf]:
        final_positions, final_scores = algo.test([target_item_id], shilling_ids)
    else:
        testset = trainset.build_anti_testset()
        predictions = algo.test(testset)
        final_positions, final_scores = get_rec_list_faster(
            predictions, target_item_id, shilling_ids)
    print('\t\t\t\tEnd Evaluation of post prediction')

    return final_positions, final_scores, sample_num, target_item_id
def load_builtin(cls, name='ml-100k', prompt=True):
    """Load a built-in dataset.

    If the dataset has not already been loaded, it will be downloaded and
    saved. You will have to split your dataset using the :meth:`split
    <DatasetAutoFolds.split>` method. See an example in the :ref:`User
    Guide <cross_validate_example>`.

    Args:
        name(:obj:`string`): The name of the built-in dataset to load.
            Accepted values are 'ml-100k', 'ml-1m', and 'jester'.
            Default is 'ml-100k'.
        prompt(:obj:`bool`): Prompt before downloading if dataset is not
            already on disk. Default is True.

    Returns:
        A :obj:`Dataset` object.

    Raises:
        ValueError: If the ``name`` parameter is incorrect.
    """
    try:
        dataset = BUILTIN_DATASETS[name]
    except KeyError:
        raise ValueError('unknown dataset ' + name +
                         '. Accepted values are ' +
                         ', '.join(BUILTIN_DATASETS.keys()) + '.')

    # if dataset does not exist, offer to download it
    if not os.path.isfile(dataset.path):
        answered = not prompt
        while not answered:
            print('Dataset ' + name + ' could not be found. Do you want '
                  'to download it? [Y/n] ', end='')
            choice = input().lower()

            if choice in ['yes', 'y', '', 'omg this is so nice of you!!']:
                answered = True
            elif choice in ['no', 'n', 'hell no why would i want that?!']:
                answered = True
                print("Ok then, I'm out!")
                sys.exit()

        download_builtin_dataset(name)

    reader = Reader(**dataset.reader_params)

    return cls.load_from_file(file_path=dataset.path, reader=reader,
                              item_path=dataset.item_path)
def perform_operation(self):
    self.LOG_HANDLE.info(
        "Running the collaborative filtering algorithms...")

    latest_ratings_file_name = self.get_latest_output_file_name(
        configurations.RATINGS_FILE_IN_REQUIRED_FORMAT_FILE_NAME, next=False)[1]
    latest_ratings_file_location = os.path.join(
        configurations.OUTPUT_FILES_DIRECTORY, latest_ratings_file_name)
    self.LOG_HANDLE.info("Running recommender models on the file here: " +
                         latest_ratings_file_location)
    print("Running all recommender models")

    # Params from here: http://surprise.readthedocs.io/en/stable/reader.html
    reader = Reader(sep=constants.COMMA_STR)

    # Params from here: http://surprise.readthedocs.io/en/stable/dataset.html
    ratings_dataset = Dataset.load_from_file(latest_ratings_file_location, reader)

    # Divide the data set into the training and test sets
    trainset, testset = train_test_split(
        ratings_dataset, test_size=model_params.test_set_size)

    # Add different algorithms here - Removed SVD PP algorithm
    collaborative_algorithms = [
        normal_algo_wrapper(),
        knn_algo_wrapper(),
        svd_algo_wrapper()
    ]

    rmse_values = {}
    for collaborative_algorithm in collaborative_algorithms:
        print("Started Algorithm: " + collaborative_algorithm.algo_name)
        rmse_values[collaborative_algorithm.algo_name] = \
            collaborative_algorithm.evaluate_on_test(trainset, testset)
        collaborative_algorithm.perform_grid_search_with_cv(ratings_dataset)
        print("Completed Algorithm: " + collaborative_algorithm.algo_name)

    print("All recommender models have been run...")

    plt.scatter(rmse_values.keys(), rmse_values.values())
    plt.xlabel('Collaborative filtering algorithm')
    plt.ylabel('Root mean square error (RMSE) on test predictions')
    plt.show()
def main():
    with open('random_recommender_config.json', 'r') as f:
        config = json.load(f)

    path = config['path']
    separator = config['separator']
    n_folds = config['n_folds']

    output_recommendation_file_path = path + '<output_recommendation_file_path>'
    input_file_path = path + '<input_file_path>'
    ratings_file_path = path + '<ratings_file_path>'
    random_path = output_recommendation_file_path + 'random/'

    reader = Reader(line_format='user item rating timestamp', sep=' ')
    recommender = RandomRecommender(ratings_file_path=ratings_file_path,
                                    separator=separator)
    recommender.recommend_rival(n_folds=n_folds,
                                train_test_file_path=input_file_path,
                                reader=reader,
                                recommendation_file_path=random_path)
def estimate_preference(self, user_id, item_id):
    """
    Estimate the preference value by a specific user.
    :param user_id: Id of the user to recommend.
    :param item_id: Id of the item to recommend.
    :return: The estimated preference by the specific recommender.
    """
    # train file:
    df_ratings = self.rating_data_model.df_ratings

    # A reader is still needed but only the rating_scale param is required.
    reader = Reader(rating_scale=(self.rating_data_model.get_min_preference(),
                                  self.rating_data_model.get_max_preference()))

    # The columns must correspond to user id, item id and ratings (in that order).
    train_data = Dataset.load_from_df(
        df_ratings[['user_id', 'item_id', 'rating']], reader)
    trainset = train_data.build_full_trainset()

    # Train recommendation input_model:
    self.model.fit(trainset)

    # predict() maps the raw ids to inner ids and returns a Prediction namedtuple;
    # its .est field holds the estimated rating.
    return float(self.model.predict(uid=user_id, iid=item_id).est)
def evaluate_prediction(sample_path, sample_num):
    """
    :param sample_path: the absolute path of the sample, used to read the data sample csv file
    :param sample_num: the number of the sample under analysis
    :return: the elaborated dataframe and the sample name
    """
    # Load the dataset (download it if needed).
    df = pd.read_csv(os.path.join(project_dir, sample_path))
    target_items = \
        pd.read_csv(os.path.join(project_dir, cfg.data, cfg.dataset, cfg.target_items),
                    usecols=['itemId'])['itemId'].tolist()

    try:
        algo = get_algo(df)
    except Exception as e:
        print(e)

    # First train a recommender algorithm on the sample dataset.
    print("\t\t\t\tFit {0}{1}".format(sample_path, sample_num))
    if cfg.model in [cfg.ncf]:
        algo.fit()
    else:
        reader = Reader(line_format='user item rating',
                        rating_scale=cfg.rating_scale.get(cfg.dataset))
        data = Dataset.load_from_df(df[['userId', 'itemId', 'rating']], reader)
        trainset = data.build_full_trainset()
        algo.fit(trainset)
    print("\t\t\t\tEND - Fit {0}{1}".format(sample_path, sample_num))

    # Then predict ratings for all pairs (u, i) that are NOT in the training set.
    print("\t\t\t\tPredict {0}{1}".format(sample_path, sample_num))
    if cfg.model in [cfg.ncf]:
        initial_positions, initial_scores = algo.test(target_items[:])
    else:
        testset = trainset.build_anti_testset()
        predictions = algo.test(testset)
        initial_positions, initial_scores = get_rec_list(
            predictions, target_items[:])
    print("\t\t\t\tEND - Predict {0}{1}".format(sample_path, sample_num))
    # rec_list, initial_positions, initial_scores = get_rec_list(predictions, target_items[:])

    print("\t\t\t\tStoring Initial Predictions {0}{1}".format(sample_path, sample_num))
    initial_prediction = {
        'initial_positions': initial_positions,
        'initial_scores': initial_scores
    }
    save_obj(
        initial_prediction,
        os.path.join(project_dir, cfg.model, cfg.results, cfg.dataset,
                     cfg.initial_prediction))
    # if cfg.save_full_rec_list:
    #     # Save Also FULL REC LIST
    #     save_obj(rec_list,
    #              os.path.join(project_dir, cfg.model, cfg.results, cfg.dataset,
    #                           'Full_{0}'.format(cfg.initial_prediction)))
    print("\t\t\t\tEND - Store Initial Positions {0}{1}".format(sample_path, sample_num))
def user_defined_file(file: Path) -> surprise.dataset.DatasetAutoFolds:
    reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
    data = Dataset.load_from_file(file, reader)
    return data
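# Minimal usage sketch for user_defined_file. The csv path below is hypothetical; the
# file is assumed to have a header row and comma-separated user,item,rating,timestamp
# columns, matching the Reader configured above.
from pathlib import Path

data = user_defined_file(Path('ratings.csv'))
trainset = data.build_full_trainset()
print(trainset.n_users, trainset.n_items)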
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.reader import Reader
import os
import pandas as pd
from sklearn.decomposition import PCA

if __name__ == "__main__":
    os.chdir("C:\\Users\\22560\\Documents\\iptv")
    behavior = pd.read_csv(
        "C:\\Users\\22560\\Documents\\iptv\\originalData\\behavior.csv")

    reader = Reader(rating_scale=(behavior['MEDIACOUNT'].min(),
                                  behavior['MEDIACOUNT'].max()))
    data = Dataset.load_from_df(
        behavior[['newUserID', 'TV_NAME', 'MEDIACOUNT']], reader)

    trainset, testset = train_test_split(data, test_size=.25)

    algo = SVD()
    algo.fit(trainset)
    predictions = algo.test(testset)
    accuracy.rmse(predictions)

    userattr = algo.pu
    itemattr = algo.qi

    pca = PCA(n_components=2)
    behavior.head()
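    # The script instantiates PCA but stops before using it. A sketch of the presumable
    # next step (an assumption, not part of the original): project the learned item
    # factors (algo.qi) down to two components for plotting or inspection.
    item_factors_2d = pca.fit_transform(itemattr)
    print(item_factors_2d.shape)  # (n_items, 2)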
# We'll use the famous SVD algorithm.
algo = SVD()

# Run 5-fold cross-validation and print results
ratings = all_ratings.iloc[0:10**6]
data2 = pd.DataFrame({
    'userID': ratings['user'],
    'itemID': ratings['movie'],
    'rating': ratings["rating"]
})
data2['rating'].describe()
data = Dataset.load_from_df(
    data2, Reader(line_format=u'user item rating', rating_scale=(1, 5)))
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# define a cross-validation iterator
kf = KFold(n_splits=5)
time1 = datetime.now()
for trainset, testset in kf.split(data):
    time2 = datetime.now()
    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)
    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)
    print("hi:{}".format(time2 - time1))
    time1 = time2
train_directory = "./data/02_intermediate/"
submission_directory = "./data/07_model_output/"
model_directory = "./data/06_models/"

file_train = f"{train_directory}opiniones_train.csv"
file_test = f"{train_directory}opiniones_test.csv"

train = pd.read_csv(file_train)
test = pd.read_csv(file_test)

genre_means, genre_book_means = calculate_means(train)
global_mean = train.puntuacion.mean()
users_in_train = set(train.usuario.values)

scale = (1.0, 10.0)
reader = Reader(rating_scale=scale)
data_train = Dataset.load_from_df(
    train[["usuario", "libro", "puntuacion"]], reader)
trainset = data_train.build_full_trainset()

# testing_algorithm(train)

# SVD
param_grid = {
    "n_factors": [70, 80, 90, 100, 110, 120, 130, 140, 150, 160],
    "n_epochs": [100],
    "lr_all": [0.002, 0.005, 0.01, 0.05],
    "reg_all": [0.1, 0.4, 0.6],
    "random_state": [0, 5, 42],
}
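# The grid above is defined but the search itself is not shown in this excerpt. A minimal
# sketch of how such a grid is typically consumed with Surprise's GridSearchCV; the cv
# value and the choice to refit the best model on the full trainset are assumptions, not
# taken from the original script.
from surprise import SVD
from surprise.model_selection import GridSearchCV

gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3, n_jobs=-1)
gs.fit(data_train)
print(gs.best_score["rmse"], gs.best_params["rmse"])
best_svd = gs.best_estimator["rmse"]
best_svd.fit(trainset)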
def main():
    # Load data
    reader = Reader(sep=',', rating_scale=(0.0, 5.0), skip_lines=1)
    allMoives = Dataset.load_from_file('ratings.csv', reader=reader)
    popMoives = Dataset.load_from_file('popular.csv', reader=reader)
    unpopMoives = Dataset.load_from_file('unpopular.csv', reader=reader)
    varMoives = Dataset.load_from_file('variance.csv', reader=reader)
    binary = []
    binary.append(Dataset.load_from_file('bin2.5.csv', reader=reader))
    binary.append(Dataset.load_from_file('bin3.csv', reader=reader))
    binary.append(Dataset.load_from_file('bin3.5.csv', reader=reader))
    binary.append(Dataset.load_from_file('bin4.csv', reader=reader))
    with open('movies.csv', 'r', encoding='utf8') as f:
        reader = csv.reader(f, delimiter=',', quotechar='"')
        next(reader, None)
        movies = {int(movie[0]): movie[2] for movie in reader}

    # NMFs
    ks = range(2, 52, 2)
    mae, rmse = [0] * len(ks), [0] * len(ks)

    def nmf(dataName, data, biased=True):
        print('Start building NMF with ' + dataName + '!')
        for i, k in enumerate(ks):
            nmf = NMF(n_factors=k, biased=biased)
            scores = cross_validate(nmf, data, cv=10)
            mae[i] = scores['test_mae'].mean()
            rmse[i] = scores['test_rmse'].mean()
            print('k = ' + str(k) + ' finished!')
        plt.figure()
        plt.subplot(211)
        plt.plot(ks, mae)
        plt.xlabel('k')
        plt.ylabel('mean absolute error')
        plt.title('Mean absolute error vs. k of ' + dataName)
        plt.subplot(212)
        plt.plot(ks, rmse)
        plt.xlabel('k')
        plt.ylabel('root mean squared error')
        plt.title('Root mean squared error vs. k of ' + dataName)
        print('mae:')
        print(mae)
        print('rmse:')
        print(rmse)
        print('Finish building NMF with ' + dataName + '!')

    # Q17
    nmf('all movies', allMoives)
    # Q18
    optimalK = 4
    print('The optimal number of latent factors is ' + str(optimalK))
    # Q19
    nmf('popular movies', popMoives)
    # Q20
    nmf('unpopular movies', unpopMoives)
    # Q21
    nmf('high variance movies', varMoives)

    # Draw ROC Curve
    thresholds = [2.5, 3, 3.5, 4]

    def drawRoc(model, i, k):
        print('Start drawing ROC curve of NMF with optimal k = ' + str(k) +
              ', threshold = ' + str(thresholds[i]) + '!')
        train, test = train_test_split(binary[i], train_size=0.9, test_size=0.1)
        model.fit(train)
        labels = model.test(test)
        y_true = [label.r_ui for label in labels]
        y_pred = [label.est for label in labels]
        fpr, tpr, _ = roc_curve(y_true, y_pred)
        roc_auc = auc(fpr, tpr)
        plt.figure()
        plt.plot(fpr, tpr, color='darkorange', lw=2,
                 label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlim([-0.05, 1.05])
        plt.ylim([-0.05, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC curve of NMF with optimal k = ' + str(k) +
                  ', threshold = ' + str(thresholds[i]))
        plt.legend(loc="lower right")
        print('Finish drawing ROC curve of NMF with optimal k = ' + str(k) +
              ', threshold = ' + str(thresholds[i]) + '!')

    # Q22
    # use a separate name so the nmf() helper above is not shadowed (it is called again below)
    nmf_model = NMF(n_factors=optimalK)
    for i in range(len(thresholds)):
        drawRoc(nmf_model, i, optimalK)

    # Q23
    print("Start finding top K!")
    k, col = 20, 5
    nmf_model = NMF(n_factors=k)
    trainAllMovies = allMoives.build_full_trainset()
    nmf_model.fit(trainAllMovies)
    ids = [[] for _ in range(col)]
    for i in range(col):
        factors = nmf_model.qi[:, i]
        s = sorted([[i, factor] for i, factor in enumerate(factors)],
                   key=lambda x: x[1], reverse=True)
        for k in range(10):
            ids[i].append(s[k][0])
    genres = [[] for _ in range(col)]
    for i in range(col):
        for j in range(10):
            genres[i].append(movies[int(trainAllMovies.to_raw_iid(ids[i][j]))])
    for i in range(col):
        print('Col ' + str(i + 1) + ':')
        for genre in genres[i]:
            print(genre, end=', ')
        print('')
    print("Finish finding top K!")

    # Q24
    nmf('all movies', allMoives, True)
    # Q25
    optimalKBiased = 2
    print('The optimal number of latent factors is ' + str(optimalKBiased))
    # Q26
    nmf('popular movies', popMoives, True)
    # Q27
    nmf('unpopular movies', unpopMoives, True)
    # Q28
    nmf('high variance movies', varMoives, True)
    # Q29
    optimalKBiased = 2
    nmfBiased = NMF(n_factors=optimalKBiased, biased=True)
    for i in range(len(thresholds)):
        drawRoc(nmfBiased, i, optimalKBiased)

    plt.show()
def prepare_boycott_task(i, experimental_iteration, args, config, ratings_df, seed_base,
                         outname, algo_name, algo, head_items, data):
    """
    To simulate a boycott, we need to figure out which ratings are being held out
    For large datasets and large boycotts (e.g. 50% of ML-20M) this is very slow
    So we need to parallelize it
    That's the purpose of this function
    """
    if config['type'] == 'individual_users':
        row = experimental_iteration[1]
        identifier = row.user_id
        name = 'individual'
        if args.indices != 'all':
            if identifier < args.indices[0] or identifier > args.indices[1]:
                return
        boycott_uid_set = set([row.user_id])
        like_boycotters_uid_set = set([])
    elif config['type'] in [
        'sample_users', 'gender', 'age', 'power', 'state',
        'genre', 'genre_strict', 'occupation',
    ]:
        identifier = i
        name = experimental_iteration['name']
        possible_boycotters_df = experimental_iteration['df']
        print(name)
        print(possible_boycotters_df.head())
        if args.userfrac != 1.0:
            boycotters_df = possible_boycotters_df.sample(
                frac=args.userfrac, random_state=(seed_base + i) * 2)
        else:
            boycotters_df = possible_boycotters_df
        boycott_uid_set = set(boycotters_df.user_id)
        like_boycotters_df = possible_boycotters_df.drop(boycotters_df.index)
        like_boycotters_uid_set = set(like_boycotters_df.user_id)

    tic = time.time()
    mask_boycott_ratings = ratings_df.user_id.isin(boycott_uid_set)
    non_boycott_user_ratings_df = ratings_df[~mask_boycott_ratings]  # makes a df copy
    print('isin time: {}'.format(time.time() - tic))

    boycott_ratings_df = None
    boycott_user_lingering_ratings_df = None
    tic = time.time()
    # BAD (slow) CODE warning: this part is pretty slow when simulating large boycotts for large datasets (e.g. 90% of ML-20M)
    # room for improvement
    if args.ratingfrac == 1.0:
        # skip this complicated stuff!
        boycott_ratings_df = ratings_df[mask_boycott_ratings]
        # copy the df but drop all rows
        boycott_user_lingering_ratings_df = boycott_ratings_df.drop(boycott_ratings_df.index)
    else:
        for uid in boycott_uid_set:
            ratings_belonging_to_user = ratings_df[ratings_df.user_id == uid]
            if args.ratingfrac != 1.0:
                boycott_ratings_for_user = ratings_belonging_to_user.sample(
                    frac=args.ratingfrac, random_state=(seed_base + i) * 3)
            else:
                boycott_ratings_for_user = ratings_belonging_to_user
            lingering_ratings_for_user = ratings_belonging_to_user.drop(boycott_ratings_for_user.index)
            if boycott_ratings_df is None:
                boycott_ratings_df = boycott_ratings_for_user
            else:
                boycott_ratings_df = pd.concat([boycott_ratings_df, boycott_ratings_for_user])
            if boycott_user_lingering_ratings_df is None:
                boycott_user_lingering_ratings_df = lingering_ratings_for_user
            else:
                boycott_user_lingering_ratings_df = pd.concat(
                    [boycott_user_lingering_ratings_df, lingering_ratings_for_user])
    print('going through each uid time: {}'.format(time.time() - tic))

    print('Iteration: {}'.format(i))
    print('Boycott ratings: {}, Lingering Ratings from Boycott Users: {}'.format(
        len(boycott_ratings_df.index), len(boycott_user_lingering_ratings_df.index)
    ))
    all_non_boycott_ratings_df = pd.concat(
        [non_boycott_user_ratings_df, boycott_user_lingering_ratings_df])
    print('Created dataframes', psutil.virtual_memory().used / (1024**3))
    nonboycott = Dataset.load_from_df(
        all_non_boycott_ratings_df[['user_id', 'movie_id', 'rating']],
        reader=Reader()
    )  # makes a copy
    boycott = Dataset.load_from_df(
        boycott_ratings_df[['user_id', 'movie_id', 'rating']],
        reader=Reader()
    )  # makes a copy
    # why are the Dataset objects taking up 4GB when the dataframe is only 760 MB???
    print('nonboycott.raw_ratings size', sys.getsizeof(nonboycott.raw_ratings))
    print('Created dataset objects', psutil.virtual_memory().used / (1024**3))
    identifier = str(identifier).zfill(4)
    num_users = len(all_non_boycott_ratings_df.user_id.value_counts())
    num_movies = len(all_non_boycott_ratings_df.movie_id.value_counts())
    num_ratings = len(all_non_boycott_ratings_df.index)

    # make sure to save the set of boycott ids and like boycott ids
    experiment_identifier_to_uid_sets = {identifier: {}}
    experiment_identifier_to_uid_sets[identifier]['boycott_uid_set'] = ';'.join(
        str(x) for x in boycott_uid_set)
    experiment_identifier_to_uid_sets[identifier]['like_boycotters_uid_set'] = ';'.join(
        str(x) for x in like_boycotters_uid_set)

    save_path = outname.replace(
        'results/', 'predictions/boycotts/{}__'.format(identifier)).replace('.csv', '_')
    if args.save_path == 'False':
        print('Since you passed --save_path False, predictions will NOT BE SAVED')
        save_path = None
    elif args.save_path is None:
        save_path = os.getcwd() + '/' + save_path
    else:
        save_path = args.save_path + '/' + save_path

    if args.load_path == 'False':
        load_path = None
    elif args.load_path is None:
        load_path = os.getcwd() + '/predictions/standards/{}_{}_'.format(args.dataset, algo_name)
    else:
        load_path = args.load_path + '/standards/{}_{}_'.format(args.dataset, algo_name)

    load_boycotts_path = save_path
    if args.load_boycotts_path is None:
        load_boycotts_path = None
    return (
        algo_name, algo, nonboycott, boycott, boycott_uid_set,
        like_boycotters_uid_set,
        MEASURES, NUM_FOLDS, False,
        identifier,
        num_ratings, num_users, num_movies, name, head_items,
        save_path, load_path, load_boycotts_path, data
    ), experiment_identifier_to_uid_sets
                     xbins = dict(start = 0, end = 50, size = 2))
layout = go.Layout(title = 'Distribution Of Number of Ratings Per User',
                   xaxis = dict(title = 'Ratings Per User'),
                   yaxis = dict(title = 'Count'),
                   bargap = 0.2)
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

## Find the k for KNN algorithm
file_path = os.path.expanduser('/Users/wuyanxu/Desktop/finaldata.csv')
reader = Reader(line_format='item user rating', sep=',')
data = Dataset.load_from_file(file_path, reader=reader)

sim_options = {'name': 'cosine',
               'user_based': True
               }

min_mean = float("inf")
optimal_k = 1
for k in [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]:
    algo = KNNBasic(sim_options=sim_options, k=k)
    x = cross_validate(algo, data, verbose=True)
    cur_mean = np.mean(x['test_rmse'])
    if(cur_mean < min_mean):
df_train_ratings = pd.read_csv(
    './using-data/{}/train_ratings.csv'.format(data),
    header=0,
    dtype={
        'userId': np.int32,
        'movieId': np.int32,
        'rating': np.float64
    })
df_eval_ratings = pd.read_csv('./using-data/{}/eval_ratings.csv'.format(data),
                              header=0,
                              dtype={
                                  'userId': np.int32,
                                  'movieId': np.int32,
                                  'rating': np.float64
                              })

reader = Reader(rating_scale=(1, 5))
train_data = Dataset.load_from_df(
    df_train_ratings.loc[:, ['userId', 'movieId', 'rating']], reader)
test_data = Dataset.load_from_df(
    df_eval_ratings.loc[:, ['userId', 'movieId', 'rating']], reader)
trainset = train_data.build_full_trainset()
_, testset = train_test_split(test_data, test_size=.999)

lu = [0.2, 0.02, 0.002]
lv = [0.2, 0.02, 0.002]
for u in lu:
    for v in lv:
        algo_pmf = SVD(n_factors=50, lr_all=0.005,
from steven.ratings_residuals_histogram import single_histogram, double_histogram
from steven.steven_baselines import MeanOfMeans

FILE_DIRECTORY = os.path.split(os.path.realpath(__file__))[0]
DATA_DIRECTORY = os.path.join(
    os.path.split(FILE_DIRECTORY)[0], 'data', 'movies')

if __name__ == "__main__":
    # Read data
    df = pd.read_csv(os.path.join(DATA_DIRECTORY, 'ratings.csv'))

    # Drop unneeded column 'timestamp'
    df.drop('timestamp', axis=1, inplace=True)

    # Load the data into the surprise format
    reader = Reader()
    data = Dataset.load_from_df(df, reader=reader)

    # Train ALS model
    print('Using ALS')
    bsl_options = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5}
    trainset, testset = train_test_split(data, test_size=0.25)
    algo = BaselineOnly(bsl_options=bsl_options)
    predictions = algo.fit(trainset).test(testset)

    # Get the RMSE of our predictions
    rmse = accuracy.rmse(predictions)

    # Get the cross-validated RMSE of our predictions
    cv_results = cross_validate(algo, data)
    cv_rmse = cv_results['test_rmse'].mean()
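    # For comparison, BaselineOnly also supports SGD-fitted baselines. A hedged sketch;
    # the hyperparameter values below are illustrative, not taken from this script.
    bsl_options_sgd = {'method': 'sgd', 'learning_rate': 0.005, 'n_epochs': 20}
    algo_sgd = BaselineOnly(bsl_options=bsl_options_sgd)
    sgd_rmse = accuracy.rmse(algo_sgd.fit(trainset).test(testset))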
def main(args):
    """
    Run the sandbox experiments
    """
    out_prefix = 'out/' if args.send_to_out else ""
    times = OrderedDict()
    times['start'] = time.time()
    algos = ALGOS
    if args.movie_mean:
        algos = {
            'MovieMean': MovieMean(),
            'GlobalMean': GlobalMean(),
        }
    algos_for_standards = ALGOS_FOR_STANDARDS
    dfs = get_dfs(args.dataset)
    head_items = load_head_items(args.dataset)
    times['dfs_loaded'] = time.time() - times['start']
    print('Got dataframes, took {} seconds'.format(times['dfs_loaded']))
    print('Total examples: {}'.format(len(dfs['ratings'].index)))
    ratings_df, users_df, movies_df = dfs['ratings'], dfs['users'], dfs['movies']
    if args.mode == 'info':
        print(ratings_df.memory_usage(index=True))
        print(users_df.memory_usage(index=True))
        print(movies_df.memory_usage(index=True))
        print(ratings_df.info())
        print(users_df.info())
        return
    data = Dataset.load_from_df(
        ratings_df[['user_id', 'movie_id', 'rating']],
        reader=Reader()
    )
    times['data_constructed'] = time.time() - times['dfs_loaded']

    # note to reader: why are precision, recall, and ndcg all stuffed together in one string?
    # this ensures they will be computed all at once. Evaluation code will split them up for presentation
    metric_names = []
    for measure in MEASURES:
        if '_' in measure:
            splitnames = measure.lower().split('_')
            metric_names += splitnames
            metric_names += [x + '_frac' for x in splitnames]
            metric_names += ['tail' + x for x in splitnames]
        else:
            metric_names.append(measure.lower())
    metric_names = get_metric_names()

    if args.compute_standards:
        standard_results = defaultdict(list)
        for algo_name in algos_for_standards:
            for _ in range(args.num_standards):
                filename_ratingcv_standards = out_prefix + 'standard_results/{}_ratingcv_standards_for_{}.json'.format(
                    args.dataset, algo_name)
                print('Computing standard results for {}'.format(algo_name))
                if args.save_path is False:
                    save_path = None
                elif args.save_path is None:
                    save_path = os.getcwd() + '/' + out_prefix + 'predictions/standards/{}_{}_'.format(
                        args.dataset, algo_name)
                else:
                    save_path = args.save_path
                if 'KNN' in algo_name and args.dataset == 'ml-20m':
                    # running this in parallel runs out of memory with KNN
                    results = cross_validate_custom(
                        algos_for_standards[algo_name], data,
                        Dataset.load_from_df(pd.DataFrame(), reader=Reader()),
                        [], [], MEASURES, NUM_FOLDS, n_jobs=1,
                        head_items=head_items, save_path=save_path)
                else:
                    results = cross_validate_custom(
                        algos_for_standards[algo_name], data,
                        Dataset.load_from_df(pd.DataFrame(), reader=Reader()),
                        [], [], MEASURES, NUM_FOLDS,
                        head_items=head_items, save_path=save_path)
                saved_results = {}
                for metric in metric_names:
                    saved_results[metric] = np.mean(results[metric + '_all'])
                    # frac_key = metric + '_frac_all'
                    # if frac_key in results:
                    #     saved_results[frac_key] = np.mean(results[frac_key])
                with open(filename_ratingcv_standards, 'w') as f:
                    json.dump(saved_results, f)
                standard_results[algo_name].append(saved_results)
            standard_results_df = pd.DataFrame(standard_results[algo_name])
            print(standard_results_df.mean())
            standard_results_df.mean().to_csv(
                '{}'.format(filename_ratingcv_standards).replace(
                    '.json', '_{}.csv'.format(args.num_standards)))

    experiment_configs = []
    if args.grouping == 'individual_users':
        experiment_configs += [{'type': 'individual_users', 'size': None}]
    elif args.grouping == 'sample':
        if args.sample_sizes:
            experiment_configs += [
                {
                    'type': 'sample_users',
                    'size': sample_size
                } for sample_size in args.sample_sizes]
        else:
            raise ValueError(
                'When using grouping="sample", you must provide a set of sample sizes'
            )
    elif args.grouping in [
        'gender', 'age', 'power', 'state', 'genre', 'genre_strict', 'occupation',
    ]:
        experiment_configs += [{'type': args.grouping, 'size': None}]
    else:
        experiment_configs = []

    uid_to_error = {}
    experimental_iterations = []
    seed_base = args.indices[0]
    for config in experiment_configs:
        outname = out_prefix + concat_output_filename(
            args.dataset, config['type'], args.userfrac, args.ratingfrac,
            config['size'], args.num_samples, args.indices
        )
        if config['type'] == 'individual_users':
            experimental_iterations = list(users_df.iterrows())
        elif config['type'] == 'sample_users':
            experimental_iterations = [{
                'df': users_df.sample(config['size'], random_state=seed_base + index),  # copies user_df
                'name': '{} user sample'.format(config['size'])
            } for index in range(args.num_samples)]
        elif config['type'] == 'gender':
            for _ in range(args.num_samples):
                experimental_iterations += group_by_gender(users_df)
        elif config['type'] == 'age':
            for _ in range(args.num_samples):
                experimental_iterations += group_by_age(users_df)
        elif config['type'] == 'state':
            for _ in range(args.num_samples):
                experimental_iterations += group_by_state(users_df, dataset=args.dataset)
        elif config['type'] == 'genre':
            for _ in range(args.num_samples):
                experimental_iterations += group_by_genre(
                    users_df=users_df, ratings_df=ratings_df,
                    movies_df=movies_df, dataset=args.dataset)
        elif config['type'] == 'genre_strict':
            for _ in range(args.num_samples):
                experimental_iterations += group_by_genre_strict(
                    users_df=users_df, ratings_df=ratings_df,
                    movies_df=movies_df, dataset=args.dataset)
        elif config['type'] == 'power':
            for _ in range(args.num_samples):
                experimental_iterations += group_by_power(
                    users_df=users_df, ratings_df=ratings_df, dataset=args.dataset)
        elif config['type'] == 'occupation':
            for _ in range(args.num_samples):
                experimental_iterations += group_by_occupation(users_df)

        experiment_identifier_to_uid_sets = {}
        for algo_name in algos:
            prep_boycott_tasks = (
                delayed(prepare_boycott_task)(
                    i, experimental_iteration, args, config, ratings_df,
                    seed_base, outname, algo_name, algos[algo_name], head_items, data
                ) for i, experimental_iteration in enumerate(experimental_iterations)
            )
            simulate_boycott_tasks = []
            tic = time.time()
            out = Parallel(n_jobs=-1, verbose=5, max_nbytes=None)((x for x in prep_boycott_tasks))
            for task_args, d in out:
                simulate_boycott_tasks.append(delayed(task)(*task_args))
                experiment_identifier_to_uid_sets.update(d)
            print('parallelized prep_boycott_task took {} seconds'.format(time.time() - tic))

            print('About to run Parallel() with {} tasks'.format(len(simulate_boycott_tasks)))
            out_dicts = Parallel(n_jobs=-1, verbose=5)((x for x in simulate_boycott_tasks))
            for d in out_dicts:
                res = d['subset_results']
                algo_name = d['algo_name']
                uid = str(d['identifier']) + '_' + d['algo_name']
                uid_to_error[uid] = {
                    'num_ratings': d['num_ratings'],
                    'num_users': d['num_users'],
                    'num_movies': d['num_movies'],
                    'name': d['name'],
                    'algo_name': d['algo_name'],
                }
                for metric in metric_names + ['fit_time', 'test_times', 'num_tested']:
                    for group in ['all', 'non-boycott', 'boycott', 'like-boycott', 'all-like-boycott']:
                        key = '{}_{}'.format(metric, group)
                        # if group in ['boycott', ]:
                        #     val = np.nanmean(res[key])
                        vals = res.get(key)
                        if vals:
                            val = np.mean(res[key])
                            uid_to_error[uid].update({
                                key: val,
                            })
                        standards_key = 'standards_' + key
                        standards_vals = res.get(standards_key)
                        if standards_vals:
                            standards_val = np.mean(res[standards_key])
                            uid_to_error[uid].update({
                                standards_key: standards_val,
                            })

        err_df = pd.DataFrame.from_dict(uid_to_error, orient='index')
        uid_sets_outname = outname.replace('results/', 'uid_sets/uid_sets_')
        pd.DataFrame.from_dict(experiment_identifier_to_uid_sets, orient='index').to_csv(uid_sets_outname)
        if args.movie_mean:
            outname = outname.replace('results/', 'results/MOVIEMEAN_')
        err_df.to_csv(outname)

    print('Full runtime was: {} for {} experimental iterations'.format(
        time.time() - times['start'], len(experimental_iterations)))