def __init__(self, folds_files=None, reader=None):
    """Initialize the dataset from explicit (train, test) file pairs.

    Args:
        folds_files: Iterable of ``(train_file, test_file)`` path tuples.
            Required; a ``ValueError`` is raised when omitted.
        reader: Reader used to parse the rating files.

    Raises:
        ValueError: If ``folds_files`` is ``None`` or any listed file does
            not exist on disk.
    """
    Dataset.__init__(self, reader)

    # BUG FIX: with the default folds_files=None, the loop below used to
    # raise a confusing TypeError ('NoneType' is not iterable). Fail fast
    # with an explicit message instead.
    if folds_files is None:
        raise ValueError('folds_files parameter is required.')

    self.folds_files = folds_files

    # check that all files actually exist.
    for train_test_files in self.folds_files:
        for f in train_test_files:
            if not os.path.isfile(os.path.expanduser(f)):
                raise ValueError('File ' + str(f) + ' does not exist.')
def create_train_set(self, train_data):
    """Build a surprise ``Trainset`` from raw rating tuples.

    Args:
        train_data: List of raw ratings, each shaped as
            ``(user, item, rating, timestamp)``.

    Returns:
        A surprise ``Trainset`` built from ``train_data``.
    """
    return Dataset(self.reader).construct_trainset(train_data)
def svd_ratings_predicate(observed_ratings_df, truth_ratings_df, fold='0', phase='eval'):
    """Fit an SVD model on the observed ratings and write an estimated
    rating for every (user, item) pair indexed by the truth frame.
    """
    print("SVD predicates")
    model = SVD()
    scale_reader = Reader(rating_scale=(0.2, 1))
    observed = observed_ratings_df.reset_index().loc[:, ['userId', 'movieId', 'rating']]
    train_dataset = Dataset.load_from_df(df=observed, reader=scale_reader)
    model.fit(train_dataset.build_full_trainset())

    # Predict one rating per (userId, movieId) index entry of the truth frame.
    predicted = pd.DataFrame(index=truth_ratings_df.index, columns=['rating'])
    for index, _ in truth_ratings_df.loc[:, ['rating']].iterrows():
        user, item = index[0], index[1]
        predicted.loc[(user, item), 'rating'] = model.predict(user, item).est

    write(predicted, 'svd_rating_obs', fold, phase)
def test_unknown_user_or_item():
    """Ensure that all algorithms act gracefully when asked to predict a
    rating of an unknown user, an unknown item, and when both are unknown.
    """
    reader = Reader(line_format='user item rating', sep=' ',
                    skip_lines=3, rating_scale=(1, 5))
    train_path = os.path.dirname(os.path.realpath(__file__)) + '/custom_train'
    data = Dataset.load_from_file(file_path=train_path, reader=reader)

    # Exhaust the folds iterator so trainset/testset end up bound.
    for trainset, testset in data.folds():
        pass

    algo_classes = (NormalPredictor, BaselineOnly, KNNBasic, KNNWithMeans,
                    KNNBaseline, SVD, SVDpp)
    for algo_class in algo_classes:
        algorithm = algo_class()
        algorithm.train(trainset)
        # None of these may raise, whichever side is unknown.
        algorithm.predict(0, 'unknown_item')
        algorithm.predict('unkown_user', 0)
        algorithm.predict('unkown_user', 'unknown_item')
def test_performances():
    """Check the dict returned by evaluate(); also exercise dumping."""
    here = os.path.dirname(os.path.realpath(__file__))
    fold_pairs = [(here + '/custom_train', here + '/custom_test')]
    parser = Reader(line_format='user item rating', sep=' ',
                    skip_lines=3, rating_scale=(1, 5))
    data = Dataset.load_from_folds(folds_files=fold_pairs, reader=parser)

    dump_location = tempfile.mkdtemp()  # temporary dump directory
    results = evaluate(NormalPredictor(), data, measures=['RmSe', 'Mae'],
                       with_dump=True, dump_dir=dump_location, verbose=2)
    shutil.rmtree(dump_location)  # clean the temporary directory up

    print(results)
    # Measure names must resolve case-insensitively to the same object.
    assert results['RMSE'] is results['rmse']
    assert results['MaE'] is results['mae']
def recommend_rival(self, n_folds, train_test_file_path, reader, recommendation_file_path):
    """ Prepare the predictions to take them to RiVaL Toolkit.

    :param n_folds: Number of folds.
    :param train_test_file_path: Path with train and input_test files.
    :param reader: Reader used to parse the per-fold csv files.
    :param recommendation_file_path: Path where the suitable files to run
        RiVaL Toolkit are saved.
    :return: The suitable files to run RiVaL Toolkit are saved.
    """
    for i in range(n_folds):
        print('Fold: ', i)

        # train file:
        timestart = time.time()
        train_file_name = train_test_file_path + 'train_bin_verified_sep_' + str(i) + '.csv'
        train_data = Dataset(reader=reader)
        raw_trainset = train_data.read_ratings(file_name=train_file_name)
        trainset = train_data.construct_trainset(raw_trainset)
        timeend = time.time()
        print('Train file loading time: ', (timeend - timestart), 'seconds')

        # Train recommendation input_model:
        timestart = time.time()
        self.model.fit(trainset)
        timeend = time.time()
        print('Training time: ', (timeend - timestart), 'seconds')

        # input_test file:
        timestart = time.time()
        test_file_name = train_test_file_path + 'test_bin_verified_sep_' + str(i) + '.csv'
        test_data = Dataset(reader=reader)
        raw_testset = test_data.read_ratings(file_name=test_file_name)
        testset = test_data.construct_testset(raw_testset)
        timeend = time.time()
        print('Load time of the input_test file: ', (timeend - timestart), 'seconds')

        # Predictions:
        timestart = time.time()
        predictions = self.model.test(testset)
        # BUG FIX: the recommendations file was opened but never closed,
        # leaking a handle per fold and risking unflushed output. A context
        # manager guarantees flush + close. (Predictions are
        # (uid, iid, r_ui, est, details) tuples; r_ui is not written out.)
        output_path = recommendation_file_path + 'recs_' + str(i) + '.csv'
        with open(output_path, 'w') as recs_file:
            for pred in predictions:
                user_id = pred[0]
                item_id = pred[1]
                rating_estimated = pred[3]
                recs_file.write(user_id + "\t" + item_id + "\t" + str(rating_estimated) + '\n')
        timeend = time.time()
        print('Prediction time: ', (timeend - timestart), 'seconds')
def estimate_preference(self, user_id, item_id): """ Estimate the preference value by a specific user. :param user_id: Id of the user to recommend. :param item_id: Id of the item to recommend. :return: The estimate preference by the sepecific recommender. """ # train file: df_ratings = self.rating_data_model.df_ratings # A reader is still needed but only the rating_scale param is requiered. reader = Reader(rating_scale=(self.rating_data_model.get_min_preference(), self.rating_data_model.get_max_preference())) train_data = Dataset(reader=reader) # The columns must correspond to user id, item id and ratings (in that order). raw_trainset = train_data.load_from_df(df_ratings[['user_id', 'item_id', 'rating']], reader) trainset = train_data.construct_trainset(raw_trainset.raw_ratings) # Train recommendation input_model: self.model.fit(trainset) return float(self.model.estimate(u=user_id, i=item_id)[0])
def test_knns():
    """Ensure the k and min_k parameters are effective for knn algorithms."""
    # Fixture files: 10% of the ml-100k u1.base / u1.test splits.
    base_dir = os.path.dirname(__file__)
    fold = (os.path.join(base_dir, './u1_ml100k_train'),
            os.path.join(base_dir, './u1_ml100k_test'))
    data = Dataset.load_from_folds([fold], Reader('ml-100k'))

    # KNNWithMeans and KNNBaseline have fallback answers when there are not
    # enough neighbors, so only KNNBasic can be checked strictly here.
    k_max, k_min = 20, 5
    for algo_class in (KNNBasic, ):
        algo = algo_class(k=k_max, min_k=k_min)
        for trainset, testset in data.folds():
            algo.train(trainset)
            for prediction in algo.test(testset):
                if not prediction.details['was_impossible']:
                    assert k_min <= prediction.details['actual_k'] <= k_max
def get_surprise_ml_100k(split_ratio):
    """Load the built-in ml-100k dataset and split it into train/test sets.

    Args:
        split_ratio: Split fraction forwarded positionally to
            ``train_test_split``.

    Returns:
        A ``(trainset, testset)`` tuple.
    """
    dataset = Dataset.load_builtin()
    return train_test_split(dataset, split_ratio)
def main():
    """Run the NMF questions (Q17-Q29): cross-validate over a k grid,
    plot error curves and ROC curves, and inspect top latent factors."""
    # Load data
    reader = Reader(sep=',', rating_scale=(0.0, 5.0), skip_lines=1)
    allMoives = Dataset.load_from_file('ratings.csv', reader=reader)
    popMoives = Dataset.load_from_file('popular.csv', reader=reader)
    unpopMoives = Dataset.load_from_file('unpopular.csv', reader=reader)
    varMoives = Dataset.load_from_file('variance.csv', reader=reader)
    binary = []
    binary.append(Dataset.load_from_file('bin2.5.csv', reader=reader))
    binary.append(Dataset.load_from_file('bin3.csv', reader=reader))
    binary.append(Dataset.load_from_file('bin3.5.csv', reader=reader))
    binary.append(Dataset.load_from_file('bin4.csv', reader=reader))
    # movieId -> genre string lookup (header row skipped).
    with open('movies.csv', 'r', encoding='utf8') as f:
        csv_reader = csv.reader(f, delimiter=',', quotechar='"')
        next(csv_reader, None)
        movies = {int(movie[0]): movie[2] for movie in csv_reader}

    # NMFs
    ks = range(2, 52, 2)
    mae, rmse = [0] * len(ks), [0] * len(ks)

    def nmf(dataName, data, biased=False):
        """10-fold cross-validate NMF for each k and plot MAE/RMSE vs. k.

        BUG FIX: the default used to be biased=True, so the 'unbiased'
        questions (Q17-Q21) silently ran the biased model; surprise's NMF
        itself defaults to biased=False.
        """
        print('Start building NMF with ' + dataName + '!')
        for i, k in enumerate(ks):
            model = NMF(n_factors=k, biased=biased)
            scores = cross_validate(model, data, cv=10)
            mae[i] = scores['test_mae'].mean()
            rmse[i] = scores['test_rmse'].mean()
            print('k = ' + str(k) + ' finished!')
        plt.figure()
        plt.subplot(211)
        plt.plot(ks, mae)
        plt.xlabel('k')
        plt.ylabel('mean absolute error')
        plt.title('Mean absolute error vs. k of ' + dataName)
        plt.subplot(212)
        plt.plot(ks, rmse)
        plt.xlabel('k')
        plt.ylabel('root mean squared error')
        plt.title('Root mean squared error vs. k of ' + dataName)
        print('mae:')
        print(mae)
        print('rmse:')
        print(rmse)
        print('Finish building NMF with ' + dataName + '!')

    # Q17
    nmf('all movies', allMoives)
    # Q18
    optimalK = 4
    print('The optimal number of latent factors is ' + str(optimalK))
    # Q19
    nmf('popular movies', popMoives)
    # Q20
    nmf('unpopular movies', unpopMoives)
    # Q21
    nmf('high variance movies', varMoives)

    # Draw ROC Curve
    thresholds = [2.5, 3, 3.5, 4]

    def drawRoc(model, i, k):
        """Fit *model* on a 90/10 split of binary[i] and plot its ROC curve."""
        print('Start drawing ROC curve of NMF with optimal k = ' + str(k) +
              ', threshold = ' + str(thresholds[i]) + '!')
        train, test = train_test_split(binary[i], train_size=0.9, test_size=0.1)
        model.fit(train)
        labels = model.test(test)
        y_true = [label.r_ui for label in labels]
        y_pred = [label.est for label in labels]
        fpr, tpr, _ = roc_curve(y_true, y_pred)
        roc_auc = auc(fpr, tpr)
        plt.figure()
        plt.plot(fpr, tpr, color='darkorange', lw=2,
                 label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlim([-0.05, 1.05])
        plt.ylim([-0.05, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC curve of NMF with optimal k = ' + str(k) +
                  ', threshold = ' + str(thresholds[i]))
        plt.legend(loc="lower right")
        print('Finish drawing ROC curve of NMF with optimal k = ' + str(k) +
              ', threshold = ' + str(thresholds[i]) + '!')

    # Q22
    # BUG FIX: the NMF instance used to be bound to the name `nmf`, shadowing
    # the helper function above — the later Q24-Q28 calls nmf('all movies', ...)
    # then raised TypeError (an NMF instance is not callable).
    nmf_model = NMF(n_factors=optimalK)
    for i in range(len(thresholds)):
        drawRoc(nmf_model, i, optimalK)

    # Q23
    print("Start finding top K!")
    k, col = 20, 5
    nmf_model = NMF(n_factors=k)
    trainAllMovies = allMoives.build_full_trainset()
    nmf_model.fit(trainAllMovies)
    ids = [[] for _ in range(col)]
    for i in range(col):
        factors = nmf_model.qi[:, i]
        # Rank items by their value on latent factor i, keep the top 10.
        s = sorted([[j, factor] for j, factor in enumerate(factors)],
                   key=lambda x: x[1], reverse=True)
        for j in range(10):
            ids[i].append(s[j][0])
    genres = [[] for _ in range(col)]
    for i in range(col):
        for j in range(10):
            genres[i].append(movies[int(trainAllMovies.to_raw_iid(ids[i][j]))])
    for i in range(col):
        print('Col ' + str(i + 1) + ':')
        for genre in genres[i]:
            print(genre, end=', ')
        print('')
    print("Finish finding top K!")

    # Q24
    nmf('all movies', allMoives, True)
    # Q25
    optimalKBiased = 2
    # BUG FIX: 'str' + int raised TypeError; convert before concatenating.
    print('The optimal number of latent factors is ' + str(optimalKBiased))
    # Q26
    nmf('popular movies', popMoives, True)
    # Q27
    nmf('unpopular movies', unpopMoives, True)
    # Q28
    nmf('high variance movies', varMoives, True)
    # Q29
    optimalKBiased = 2
    nmfBiased = NMF(n_factors=optimalKBiased, biased=True)
    for i in range(len(thresholds)):
        drawRoc(nmfBiased, i, optimalKBiased)

    plt.show()
head(10) #%% Most active users -- Check correlation of numbers with rating/time? ratings.\ groupby('User')['Recipe'].\ count().\ sort_values(ascending=False).\ head(10) #%% Distribution of Ratings print(ratings.Rating.describe()) print(set(ratings.Rating)) #%% Build train - test split reader = Reader(rating_scale=(1, 5)) data = Dataset.load_from_df(ratings, reader) random.seed(42) random.shuffle(data.raw_ratings) cut_off = int(len(data.raw_ratings) * 0.75) train_ratings = data.raw_ratings[:cut_off] test_ratings = data.raw_ratings[cut_off:] data.raw_ratings = train_ratings #%% Evaluate baseline on all, bias and test error def evaluator(algo, df, cv_method, verbose = False): """ wrapper to streamline evaluation """
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
import os

from surprise.prediction_algorithms import SVD
from surprise.prediction_algorithms import SVDpp
from surprise.dataset import Dataset
from surprise.dataset import Reader
from surprise.evaluate import evaluate

# the test and train files are from the ml-100k dataset (10% of u1.base and
# 10 % of u1.test)
train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
data = Dataset.load_from_folds([(train_file, test_file)], Reader('ml-100k'))


def test_SVD_parameters():
    """Ensure that all parameters are taken into account."""

    # The baseline against which to compare.
    algo = SVD(n_factors=1, n_epochs=1)
    rmse_default = evaluate(algo, data, measures=['rmse'])['rmse']

    # n_factors: a different factor count must change the cv RMSE.
    algo = SVD(n_factors=2, n_epochs=1)
    rmse_factors = evaluate(algo, data, measures=['rmse'])['rmse']
    assert rmse_default != rmse_factors

    # n_epochs
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import KNNBaseline
from surprise import SVD
from surprise import BaselineOnly
from surprise import SVDpp
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering
from surprise.accuracy import rmse
from surprise.model_selection import train_test_split
from surprise import accuracy

# 'freq' ratings run on a 1-7 scale.
# NOTE(review): Reader, Dataset, NormalPredictor and df_c1 are used below but
# not defined in this chunk — presumably imported/defined earlier; confirm.
reader = Reader(rating_scale=(1, 7))
data = Dataset.load_from_df(df_c1[['Smart Card_', 'Class.1_', 'freq']], reader)

# getting the most effective Algorithm for Recommendation System
# Benchmark every surprise algorithm on the same dataset (the loop body
# continues beyond this chunk).
benchmark = []
for algorithm in [
        SVD(),
        NMF(),
        SVDpp(),
        SlopeOne(),
        NormalPredictor(),
        KNNBaseline(),
        KNNBasic(),
        KNNWithMeans(),
        KNNWithZScore(),
        BaselineOnly(),
        CoClustering()
mtarix_toGO['Norm_Tot_Amnt']= (mtarix_toGO['Mean_amount'] -min_amt)/max_amt #lower_bound = min(mtarix_toGO['Log_Mean_Amount']) #upper_bound = max(mtarix_toGO['Log_Mean_Amount']) #print lower_bound #print upper_bound # Remove the outliers dfx=mtarix_toGO[mtarix_toGO['Norm_Tot_Amnt'] <= 0.4] lower_bound = min(dfx['Norm_Tot_Amnt']) upper_bound = max(dfx['Norm_Tot_Amnt']) print 'Lower Bound normalized spending =',lower_bound print 'Upper Bound normalized spending =',upper_bound print 'Number of Transactions remaining after removing Outliers::',mtarix_toGO.shape[0] #define the reader with upper and lower bounds , also now we are predicting Normalized Total Amount column reader_x = Reader(rating_scale = (lower_bound,upper_bound)) data = Dataset.load_from_df(df=dfx[['CustomerID','StockCode','Norm_Tot_Amnt']],reader=reader_x) #for i in range(9): # print (data.raw_ratings[0][2] - data.df['Log_Mean_amount'][0]) print 'difference in processed and pre-processed dataset = ',(data.raw_ratings[0][2] - data.df['Norm_Tot_Amnt'][0]) import time start_time = time.time() #param_grid = {'n_factors':[2,5,10,50],'n_epochs': [10,50,100], 'lr_bu': [0.1,0.01,0.001,0.0001],'lr_bi': [0.1,0.01,0.001,0.0001],'reg_bi': [0.1,0.01,0.001,0.0001],'reg_bu': [0.1,0.01,0.001,0.0001],'reg_qi': [0.1,0.01,0.001,0.0001],'reg_pu': [0.1,0.01,0.001,0.0001]} param_grid = {'n_factors':[5,10,50,100],'n_epochs': [5,10,20,50,100], 'lr_all': [0.1,0.01,0.001],'reg_all': [0.1,0.01,0.001} grid_search = GridSearchCV(SVDpp, param_grid, measures=['rmse', 'mae'], cv=3, n_jobs=1)
del df4['geo_point_2d']
df_f = df4.join(df5)
# Keep only rows whose ARRONDISSEMENT mentions 'paris'.
df_i = df_f.loc[df_f["ARRONDISSEMENT"].str.contains('paris') == True]
# NOTE(review): a single row is dropped by hard-coded label 204726 —
# presumably a known bad record; confirm and document why.
df_i = df_i.drop(204726)
# Label-encode the categorical columns in place.
df_i['ARRONDISSEMENT'] = encoder.fit_transform(df_i['ARRONDISSEMENT'])
df_i['LIEU/ADRESSE'] = encoder.fit_transform(df_i['LIEU/ADRESSE'])
df_i['STADE'] = encoder.fit_transform(df_i['STADE'])
df_a = df_i.loc[df_i["ALLERGIE"] == 1]
# NOTE(review): the 'rating' column fed to surprise here is GENRE with a
# 1-164151 scale, which looks like an id range rather than a rating — verify.
reader = Reader(rating_scale=(1, 164151))
df_etude_2 = Dataset.load_from_df(df_a[['LATITUDE', 'LONGITUDE', 'GENRE']], reader)
# Density-based clustering on the standardized rows.
X = StandardScaler().fit_transform(df_a)
algo5 = DBSCAN(eps=0.3, min_samples=7).fit(X)
labels = algo5.labels_
# DBSCAN labels noise as -1; exclude it from the cluster count.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
train_2, test_2 = train_test_split(df_etude_2, test_size=.25)
algo = SVD()
predictions_2 = algo.fit(train_2).test(test_2)
lat = []
lng = []
# Loop body continues beyond this chunk.
for i in predictions_2:
def main():
    """Command-line entry point: parse arguments, build the chosen
    algorithm and dataset, then run cross-validation."""

    class MyParser(argparse.ArgumentParser):
        '''A parser which prints the help message when an error occurs. Taken from
        http://stackoverflow.com/questions/4042452/display-help-message-with-python-argparse-when-script-is-called-without-any-argu.'''  # noqa

        def error(self, message):
            # Show full usage on any parse error instead of the terse default.
            sys.stderr.write('error: %s\n' % message)
            self.print_help()
            sys.exit(2)

    parser = MyParser(
        description='Evaluate the performance of a rating prediction ' +
        'algorithm ' +
        'on a given dataset using cross validation. You can use a built-in ' +
        'or a custom dataset, and you can choose to automatically split the ' +
        'dataset into folds, or manually specify train and test files. ' +
        'Please refer to the documentation page ' +
        '(http://surprise.readthedocs.io/) for more details.',
        epilog="""Example:\n
        surprise -algo SVD -params "{'n_epochs': 5, 'verbose': True}"
        -load-builtin ml-100k -n-folds 3""")

    # Map of CLI names to algorithm classes; also doubles as the choices list.
    algo_choices = {
        'NormalPredictor': NormalPredictor,
        'BaselineOnly': BaselineOnly,
        'KNNBasic': KNNBasic,
        'KNNBaseline': KNNBaseline,
        'KNNWithMeans': KNNWithMeans,
        'SVD': SVD,
        'SVDpp': SVDpp,
        'NMF': NMF,
        'SlopeOne': SlopeOne,
        'CoClustering': CoClustering,
    }

    parser.add_argument('-algo', type=str,
                        choices=algo_choices,
                        help='The prediction algorithm to use. ' +
                        'Allowed values are ' +
                        ', '.join(algo_choices.keys()) + '.',
                        metavar='<prediction algorithm>')

    parser.add_argument('-params', type=str, metavar='<algorithm parameters>',
                        default='{}',
                        help='A kwargs dictionary that contains all the ' +
                        'algorithm parameters.' +
                        'Example: "{\'n_epochs\': 10}".')

    parser.add_argument('-load-builtin', type=str, dest='load_builtin',
                        metavar='<dataset name>', default='ml-100k',
                        help='The name of the built-in dataset to use.' +
                        'Allowed values are ' +
                        ', '.join(dataset.BUILTIN_DATASETS.keys()) +
                        '. Default is ml-100k.')

    parser.add_argument('-load-custom', type=str, dest='load_custom',
                        metavar='<file path>', default=None,
                        help='A file path to custom dataset to use. ' +
                        'Ignored if ' +
                        '-loadbuiltin is set. The -reader parameter needs ' +
                        'to be set.')

    parser.add_argument('-folds-files', type=str, dest='folds_files',
                        metavar='<train1 test1 train2 test2... >',
                        default=None,
                        help='A list of custom train and test files. ' +
                        'Ignored if -load-builtin or -load-custom is set. '
                        'The -reader parameter needs to be set.')

    parser.add_argument('-reader', type=str, metavar='<reader>', default=None,
                        help='A Reader to read the custom dataset. Example: ' +
                        '"Reader(line_format=\'user item rating timestamp\',' +
                        ' sep=\'\\t\')"')

    parser.add_argument('-n-folds', type=int, dest='n_folds',
                        metavar="<number of folds>",
                        default=5,
                        help='The number of folds for cross-validation. ' +
                        'Default is 5.')

    parser.add_argument('-seed', type=int, metavar='<random seed>',
                        default=None,
                        help='The seed to use for RNG. ' +
                        'Default is the current system time.')

    parser.add_argument('--with-dump', dest='with_dump', action='store_true',
                        help='Dump the algorithm ' +
                        'results in a file (one file per fold). ' +
                        'Default is False.')

    parser.add_argument('-dump-dir', dest='dump_dir', type=str,
                        metavar='<dir>',
                        default=None,
                        help='Where to dump the files. Ignored if ' +
                        'with-dump is not set. Default is ' +
                        os.path.join(get_dataset_dir(), 'dumps/'))

    parser.add_argument('--clean', dest='clean', action='store_true',
                        help='Remove the ' + get_dataset_dir() +
                        ' directory and exit.')

    parser.add_argument('-v', '--version', action='version',
                        version=__version__)

    args = parser.parse_args()

    if args.clean:
        folder = get_dataset_dir()
        shutil.rmtree(folder)
        print('Removed', folder)
        exit()

    # setup RNG
    rd.seed(args.seed)
    np.random.seed(args.seed)

    # setup algorithm
    # NOTE(review): eval() on a command-line string executes arbitrary code;
    # acceptable for a local CLI, never for untrusted input.
    params = eval(args.params)
    if args.algo is None:
        parser.error('No algorithm was specified.')
    algo = algo_choices[args.algo](**params)

    # setup dataset
    if args.load_custom is not None:  # load custom and split
        if args.reader is None:
            parser.error('-reader parameter is needed.')
        reader = eval(args.reader)
        data = Dataset.load_from_file(args.load_custom, reader=reader)
        cv = KFold(n_splits=args.n_folds, random_state=args.seed)
    elif args.folds_files is not None:  # load from files
        if args.reader is None:
            parser.error('-reader parameter is needed.')
        reader = eval(args.reader)
        # Pair up the whitespace-separated paths as (train, test) tuples.
        folds_files = args.folds_files.split()
        folds_files = [(folds_files[i], folds_files[i + 1])
                       for i in range(0, len(folds_files) - 1, 2)]
        data = Dataset.load_from_folds(folds_files=folds_files, reader=reader)
        cv = PredefinedKFold()
    else:  # load builtin dataset and split
        data = Dataset.load_builtin(args.load_builtin)
        cv = KFold(n_splits=args.n_folds, random_state=args.seed)

    cross_validate(algo, data, cv=cv, verbose=True)
def main():
    """Command-line entry point: parse arguments, build the chosen
    algorithm and dataset, then run cross-validation."""

    class MyParser(argparse.ArgumentParser):
        '''A parser which prints the help message when an error occurs. Taken from
        http://stackoverflow.com/questions/4042452/display-help-message-with-python-argparse-when-script-is-called-without-any-argu.'''  # noqa

        def error(self, message):
            # Show full usage on any parse error instead of the terse default.
            sys.stderr.write('error: %s\n' % message)
            self.print_help()
            sys.exit(2)

    parser = MyParser(
        description='Evaluate the performance of a rating prediction ' +
        'algorithm ' +
        'on a given dataset using cross validation. You can use a built-in ' +
        'or a custom dataset, and you can choose to automatically split the ' +
        'dataset into folds, or manually specify train and test files. ' +
        'Please refer to the documentation page ' +
        '(http://surprise.readthedocs.io/) for more details.',
        epilog="""Example:\n
        surprise -algo SVD -params "{'n_epochs': 5, 'verbose': True}"
        -load-builtin ml-100k -n-folds 3""")

    # Map of CLI names to algorithm classes; also doubles as the choices list.
    algo_choices = {
        'NormalPredictor': NormalPredictor,
        'BaselineOnly': BaselineOnly,
        'KNNBasic': KNNBasic,
        'KNNBaseline': KNNBaseline,
        'KNNWithMeans': KNNWithMeans,
        'SVD': SVD,
        'SVDpp': SVDpp,
        'NMF': NMF,
        'SlopeOne': SlopeOne,
        'CoClustering': CoClustering,
    }

    parser.add_argument('-algo', type=str,
                        choices=algo_choices,
                        help='The prediction algorithm to use. ' +
                        'Allowed values are ' +
                        ', '.join(algo_choices.keys()) + '.',
                        metavar='<prediction algorithm>')

    parser.add_argument('-params', type=str, metavar='<algorithm parameters>',
                        default='{}',
                        help='A kwargs dictionary that contains all the ' +
                        'algorithm parameters.' +
                        'Example: "{\'n_epochs\': 10}".'
                        )

    parser.add_argument('-load-builtin', type=str, dest='load_builtin',
                        metavar='<dataset name>', default='ml-100k',
                        help='The name of the built-in dataset to use.' +
                        'Allowed values are ' +
                        ', '.join(dataset.BUILTIN_DATASETS.keys()) +
                        '. Default is ml-100k.'
                        )

    parser.add_argument('-load-custom', type=str, dest='load_custom',
                        metavar='<file path>', default=None,
                        help='A file path to custom dataset to use. ' +
                        'Ignored if ' +
                        '-loadbuiltin is set. The -reader parameter needs ' +
                        'to be set.'
                        )

    parser.add_argument('-folds-files', type=str, dest='folds_files',
                        metavar='<train1 test1 train2 test2... >',
                        default=None,
                        help='A list of custom train and test files. ' +
                        'Ignored if -load-builtin or -load-custom is set. '
                        'The -reader parameter needs to be set.'
                        )

    parser.add_argument('-reader', type=str, metavar='<reader>', default=None,
                        help='A Reader to read the custom dataset. Example: ' +
                        '"Reader(line_format=\'user item rating timestamp\',' +
                        ' sep=\'\\t\')"'
                        )

    parser.add_argument('-n-folds', type=int, dest='n_folds',
                        metavar="<number of folds>",
                        default=5,
                        help='The number of folds for cross-validation. ' +
                        'Default is 5.'
                        )

    parser.add_argument('-seed', type=int, metavar='<random seed>',
                        default=None,
                        help='The seed to use for RNG. ' +
                        'Default is the current system time.'
                        )

    parser.add_argument('--with-dump', dest='with_dump', action='store_true',
                        help='Dump the algorithm ' +
                        'results in a file (one file per fold). ' +
                        'Default is False.'
                        )

    parser.add_argument('-dump-dir', dest='dump_dir', type=str,
                        metavar='<dir>',
                        default=None,
                        help='Where to dump the files. Ignored if ' +
                        'with-dump is not set. Default is ' +
                        os.path.join(get_dataset_dir(), 'dumps/')
                        )

    parser.add_argument('--clean', dest='clean', action='store_true',
                        help='Remove the ' + get_dataset_dir() +
                        ' directory and exit.'
                        )

    parser.add_argument('-v', '--version', action='version',
                        version=__version__)

    args = parser.parse_args()

    if args.clean:
        folder = get_dataset_dir()
        shutil.rmtree(folder)
        print('Removed', folder)
        exit()

    # setup RNG
    rd.seed(args.seed)
    np.random.seed(args.seed)

    # setup algorithm
    # NOTE(review): eval() on a command-line string executes arbitrary code;
    # acceptable for a local CLI, never for untrusted input.
    params = eval(args.params)
    if args.algo is None:
        parser.error('No algorithm was specified.')
    algo = algo_choices[args.algo](**params)

    # setup dataset
    if args.load_custom is not None:  # load custom and split
        if args.reader is None:
            parser.error('-reader parameter is needed.')
        reader = eval(args.reader)
        data = Dataset.load_from_file(args.load_custom, reader=reader)
        cv = KFold(n_splits=args.n_folds, random_state=args.seed)
    elif args.folds_files is not None:  # load from files
        if args.reader is None:
            parser.error('-reader parameter is needed.')
        reader = eval(args.reader)
        # Pair up the whitespace-separated paths as (train, test) tuples.
        folds_files = args.folds_files.split()
        folds_files = [(folds_files[i], folds_files[i + 1])
                       for i in range(0, len(folds_files) - 1, 2)]
        data = Dataset.load_from_folds(folds_files=folds_files, reader=reader)
        cv = PredefinedKFold()
    else:  # load builtin dataset and split
        data = Dataset.load_builtin(args.load_builtin)
        cv = KFold(n_splits=args.n_folds, random_state=args.seed)

    cross_validate(algo, data, cv=cv, verbose=True)