def learnModel(self, X):
    """Fit a WARPMFRecommender on X and return the learned factors.

    The recommender is configured from this object's ``k``, ``alpha``,
    ``lmbda``, ``batchSize`` and ``maxTrials`` attributes.

    Returns the ``(U, V)`` attributes of the fitted recommender
    (presumably the user and item factor matrices -- confirm against
    the WARPMFRecommender implementation in use).
    """
    # NOTE(review): the arguments are passed positionally, so their meaning
    # depends on the WARPMFRecommender constructor signature -- verify the
    # order still matches the installed version.
    model = WARPMFRecommender(
        self.k,
        self.alpha,
        self.lmbda,
        self.batchSize,
        self.maxTrials
    )
    model.fit(X)
    factors = (model.U, model.V)
    return factors
def main():
    '''
    Train a reranking recommender and save it to disk.

    Expects three command line arguments: the input file format, the path
    to the training data and the path of the output file. An item
    similarity model (CosineKNNRecommender) and a WARP matrix
    factorization model are combined in a RerankingRecommender, which is
    fitted on the training matrix and then saved.

    Raises SystemExit with a usage message if fewer than three arguments
    are supplied.
    '''
    import sys

    # Check the command line before importing mrec so that a missing
    # argument is reported as a usage error instead of a bare IndexError.
    if len(sys.argv) < 4:
        raise SystemExit('usage: {0} FILE_FORMAT FILEPATH OUTFILE'.format(sys.argv[0]))

    from mrec import load_sparse_matrix, save_recommender
    from mrec.sparse import fast_sparse_matrix
    from mrec.item_similarity.knn import CosineKNNRecommender
    from mrec.mf.warp import WARPMFRecommender
    from mrec.reranking_recommender import RerankingRecommender

    file_format = sys.argv[1]
    filepath = sys.argv[2]
    outfile = sys.argv[3]

    # load training set as scipy sparse matrix
    train = load_sparse_matrix(file_format, filepath)

    item_sim_model = CosineKNNRecommender(k=100)
    mf_model = WARPMFRecommender(d=80, gamma=0.01, C=100.0, max_iters=25000, validation_iters=1000, batch_size=10)
    recommender = RerankingRecommender(item_sim_model, mf_model, num_candidates=100)

    recommender.fit(train)

    save_recommender(recommender, outfile)
def testPredictionMethods(train_filename, eval_item_filename, user_means_filename):
    '''
    compare predictions generated by the different approaches
    computes pairwise list overlap and average recall for each method

    Each line of eval_item_filename is parsed as three tab-separated
    fields: a user id, a comma-separated list of ground truth items and a
    comma-separated list of random unrated items.

    Returns a (recalls, overlaps) pair of dicts. recalls is keyed by method
    name and overlaps by 'method1_method2'. Both hold sums accumulated over
    all evaluated users; the per-user averages are only logged, not
    returned.
    '''
    logging.info('testing predictions with data files {0}; {1}; {2}...'.format(train_filename, eval_item_filename, user_means_filename))

    mrec_train_data = load_fast_sparse_matrix('tsv', train_filename)

    mrec_recommender = CosineKNNRecommender(config.NEIGHBOURHOOD_SIZE)
    mrec_recommender.fit(mrec_train_data)

    warp_recommender = WARPMFRecommender(d=50, gamma=0.01, C=100.0)
    warp_recommender.fit(mrec_train_data.X)

    train_data = trainData.TrainData(train_filename, user_means_filename)
    _, _, Q = sparsesvd(train_data.rating_matrix.tocsc(), config.FACTOR_MODEL_SIZE)

    recalls = {}
    overlaps = {}
    top_recs = {}
    user_counter = 0.0
    methods = ['mrec', 'warp', 'mf', 'ub_classic', 'ib_classic', 'ub_damping', 'ib_damping', 'ub_non', 'ib_non']

    # (key suffix in top_recs, mode name passed to the TrainData recommenders)
    neighbourhood_modes = [('classic', 'classic'), ('damping', 'self_damping'), ('non', 'non_normalized')]

    # Text mode: the lines are split with str separators below, which fails
    # on the bytes objects a binary-mode file yields under Python 3.
    with open(eval_item_filename, 'r') as eval_file:
        for line in eval_file:
            data = line.split('\t')
            user_id = data[0]
            ground_truth_items = data[1].split(',')
            random_unrated_items = data[2].rstrip('\n').split(',')

            evaluation_item_ids = ground_truth_items + random_unrated_items

            # for each prediction method, compute topN recommendations once per user
            mrec_user_index = int(user_id) - config.MREC_INDEX_OFFSET

            predictions = mrec_recommender.recommend_items(mrec_train_data.X, mrec_user_index, max_items=10000, return_scores=True)
            top_recs['mrec'] = topNLists.getTopNList(predictions, evaluation_item_ids=evaluation_item_ids)

            predictions = warp_recommender.recommend_items(mrec_train_data.X, mrec_user_index, max_items=10000, return_scores=True)
            top_recs['warp'] = topNLists.getTopNList(predictions, evaluation_item_ids=evaluation_item_ids)

            predictions = train_data.getFactorBasedRecommendations(user_id, Q, evaluation_item_ids)
            top_recs['mf'] = topNLists.getTopNList(predictions)

            # user-based then item-based for each mode, in the same order as
            # the entries of `methods`
            for suffix, mode in neighbourhood_modes:
                predictions = train_data.getUserBasedRecommendations(user_id, evaluation_item_ids, mode)
                top_recs['ub_' + suffix] = topNLists.getTopNList(predictions)

                predictions = train_data.getItemBasedRecommendations(user_id, evaluation_item_ids, mode)
                top_recs['ib_' + suffix] = topNLists.getTopNList(predictions)

            # then, use the computed topN lists to update recall and overlap values
            for method1 in methods:
                recalls[method1] = recalls.get(method1, 0) + topNLists.getRecall(ground_truth_items, top_recs[method1])

                for method2 in methods:
                    dict_key = method1 + '_' + method2
                    overlaps[dict_key] = overlaps.get(dict_key, 0) + topNLists.computeRecommendationListOverlap(top_recs[method1], top_recs[method2])

            user_counter += 1.0
            logging.info('Tested user {0}. Current recalls: {1}. Current overlaps: {2}'.format(
                user_id,
                [(k, v/user_counter) for k, v in recalls.items()],
                [(k, v/user_counter) for k, v in overlaps.items()]))

    return recalls, overlaps
parser = OptionParser() parser.add_option('-m', '--main_split_dir', dest='main_split_dir', help='directory containing 50/50 splits for main evaluation') parser.add_option('-l', '--loo_split_dir', dest='loo_split_dir', help='directory containing LOO splits for hit rate evaluation') parser.add_option('-n', '--num_splits', dest='num_splits', type='int', default=5, help='number of splits in each directory (default: %default)') (opts, args) = parser.parse_args() if not (opts.main_split_dir or opts.loo_split_dir) or not opts.num_splits: parser.print_help() raise SystemExit print('doing a grid search for regularization parameters...') params = {'d': [100], 'gamma': [0.01], 'C': [100], 'max_iter': [100000], 'validation_iters': [500]} models = [WARPMFRecommender(**a) for a in ParameterGrid(params)] for train in glob: # get test # load em both up # put them into something that returns train,test.keys(),test in a generator() # test is a dict id->[id,id,...] if opts.main_split_dir: generate_main_metrics = generate_metrics(get_known_items_from_dict, compute_main_metrics) main_metrics = run_evaluation(models, retrain_recommender, load_splits(opts.main_split_dir, opts.num_splits), opts.num_splits, generate_main_metrics) print_report(models, main_metrics)
gamma_values = [0.01, 0.001, 0.0001] filenames = dataPreprocessing.loadData(mode='beyond_accuracy') # 5-fold cross-validation for iteration, (train_filename, test_filename, user_means_filename, eval_item_filename) in enumerate(filenames, 1): mrec_train_data = load_fast_sparse_matrix('tsv', train_filename) # create the training data and required recommendation models train_data = trainData.TrainData(train_filename, user_means_filename) for factor_value, C_value, gamma_value in product( factor_values, C_values, gamma_values): warp_recommender = WARPMFRecommender(d=factor_value, gamma=gamma_value, C=C_value) warp_recommender.fit(mrec_train_data.X) logging.info('running fold {0} with f={1}, C={2}, g={3}...'.format( iteration, factor_value, C_value, gamma_value)) recall = 0 evaluation_cases = 0 with open(eval_item_filename, 'rb') as eval_file: for line in eval_file: data = line.split('\t') user_id = data[0] ground_truth_items = data[1].split(',') random_unrated_items = data[2].rstrip('\n').split(',')
def main():
    '''
    Train one recommender model per training file, in parallel on IPython engines.

    Command line options select the input format, the training file glob,
    the output directory and the model type (slim | knn | wrmf | warp |
    popularity) together with its hyperparameters. For every file matched
    by --train a model is trained by the runner matching the model type
    and written under --outdir.

    The popularity baseline is fitted in-process and needs no engines; all
    other models require --num_engines to be non-zero.

    Prints the option help and raises SystemExit when a required option is
    missing or when an unknown model type or knn metric is requested.
    '''
    import os
    import logging
    import glob
    import subprocess
    from optparse import OptionParser

    logging.basicConfig(level=logging.INFO,format='[%(asctime)s] %(levelname)s: %(message)s')

    parser = OptionParser()
    parser.add_option('-n','--num_engines',dest='num_engines',type='int',default=0,help='number of IPython engines to use')
    parser.add_option('--input_format',dest='input_format',help='format of training dataset(s) tsv | csv | mm (matrixmarket) | fsm (fast_sparse_matrix)')
    parser.add_option('--train',dest='train',help='glob specifying path(s) to training dataset(s) IMPORTANT: must be in quotes if it includes the * wildcard')
    parser.add_option('--outdir',dest='outdir',help='directory for output files')
    parser.add_option('--overwrite',dest='overwrite',action='store_true',help='overwrite existing files in outdir')
    parser.add_option('--model',dest='model',default='slim',help='type of model to train: slim | knn | wrmf | warp | popularity (default: %default)')
    parser.add_option('--max_sims',dest='max_sims',type='int',default=100,help='max similar items to output for each training item (default: %default)')
    parser.add_option('--learner',dest='learner',default='sgd',help='underlying learner for SLIM learner: sgd | elasticnet | fs_sgd (default: %default)')
    parser.add_option('--l1_reg',dest='l1_reg',type='float',default=0.001,help='l1 regularization constant (default: %default)')
    parser.add_option('--l2_reg',dest='l2_reg',type='float',default=0.0001,help='l2 regularization constant (default: %default)')
    parser.add_option('--metric',dest='metric',default='cosine',help='metric for knn recommender: cosine | dot (default: %default)')
    parser.add_option('--num_factors',dest='num_factors',type='int',default=80,help='number of latent factors (default: %default)')
    parser.add_option('--alpha',dest='alpha',type='float',default=1.0,help='wrmf confidence constant (default: %default)')
    parser.add_option('--lbda',dest='lbda',type='float',default=0.015,help='wrmf regularization constant (default: %default)')
    parser.add_option('--als_iters',dest='als_iters',type='int',default=15,help='number of als iterations (default: %default)')
    parser.add_option('--gamma',dest='gamma',type='float',default=0.01,help='warp learning rate (default: %default)')
    parser.add_option('--C',dest='C',type='float',default=100.0,help='warp regularization constant (default: %default)')
    parser.add_option('--item_feature_format',dest='item_feature_format',help='format of item features tsv | csv | mm (matrixmarket) | npz (numpy arrays)')
    parser.add_option('--item_features',dest='item_features',help='path to sparse item features in tsv format (item_id,feature_id,val)')
    parser.add_option('--popularity_method',dest='popularity_method',default='count',help='how to compute popularity for baseline recommender: count | sum | avg | thresh (default: %default)')
    parser.add_option('--popularity_thresh',dest='popularity_thresh',type='float',default=0,help='ignore scores below this when computing popularity for baseline recommender (default: %default)')
    parser.add_option('--packer',dest='packer',default='json',help='packer for IPython.parallel (default: %default)')
    parser.add_option('--add_module_paths',dest='add_module_paths',help='optional comma-separated list of paths to append to pythonpath (useful if you need to import uninstalled modules to IPython engines on a cluster)')

    (opts,args) = parser.parse_args()

    # The popularity baseline is fitted in this process, so it is the only
    # model that can run without any IPython engines.
    needs_engines = opts.model != 'popularity'
    if not opts.input_format or not opts.train or not opts.outdir or (needs_engines and not opts.num_engines):
        parser.print_help()
        raise SystemExit

    # Third-party imports are deferred until the options have been validated
    # so that usage errors are reported even when they are not importable.
    from mrec import load_fast_sparse_matrix, save_recommender
    from mrec.item_similarity.slim import SLIM
    from mrec.item_similarity.knn import CosineKNNRecommender, DotProductKNNRecommender
    from mrec.mf.wrmf import WRMFRecommender
    from mrec.mf.warp import WARPMFRecommender
    from mrec.mf.warp2 import WARP2MFRecommender
    from mrec.popularity import ItemPopularityRecommender
    from mrec.parallel.item_similarity import ItemSimilarityRunner
    from mrec.parallel.wrmf import WRMFRunner
    from mrec.parallel.warp import WARPMFRunner

    opts.train = os.path.abspath(os.path.expanduser(opts.train))
    opts.outdir = os.path.abspath(os.path.expanduser(opts.outdir))

    trainfiles = glob.glob(opts.train)

    if opts.model == 'popularity':
        # special case, don't need to run in parallel
        subprocess.check_call(['mkdir','-p',opts.outdir])
        for trainfile in trainfiles:
            logging.info('processing {0}...'.format(trainfile))
            model = ItemPopularityRecommender(method=opts.popularity_method,thresh=opts.popularity_thresh)
            dataset = load_fast_sparse_matrix(opts.input_format,trainfile)
            model.fit(dataset)
            modelfile = get_modelfile(trainfile,opts.outdir)
            save_recommender(model,modelfile)
        logging.info('done')
        return

    # Only the parallel models need IPython, so import it after the
    # in-process popularity case has returned.
    from IPython.parallel import Client

    # create an ipython client
    c = Client(packer=opts.packer)
    view = c.load_balanced_view()

    if opts.add_module_paths:
        c[:].execute('import sys')
        for path in opts.add_module_paths.split(','):
            logging.info('adding {0} to pythonpath on all engines'.format(path))
            c[:].execute("sys.path.append('{0}')".format(path))

    if opts.model == 'slim':
        if opts.learner == 'fs_sgd':
            num_selected_features = 2 * opts.max_sims  # preselect this many candidate similar items
            model = SLIM(l1_reg=opts.l1_reg,l2_reg=opts.l2_reg,model=opts.learner,num_selected_features=num_selected_features)
        else:
            model = SLIM(l1_reg=opts.l1_reg,l2_reg=opts.l2_reg,model=opts.learner)
    elif opts.model == 'knn':
        if opts.metric == 'cosine':
            model = CosineKNNRecommender(k=opts.max_sims)
        elif opts.metric == 'dot':
            model = DotProductKNNRecommender(k=opts.max_sims)
        else:
            parser.print_help()
            raise SystemExit('unknown metric: {0}'.format(opts.metric))
    elif opts.model == 'wrmf':
        model = WRMFRecommender(d=opts.num_factors,alpha=opts.alpha,lbda=opts.lbda,num_iters=opts.als_iters)
    elif opts.model == 'warp':
        # Floor division keeps the factor count an integer on Python 3 as
        # well; plain '/' would produce a float there.
        num_factors_per_engine = max(opts.num_factors//opts.num_engines,1)
        if opts.item_features:
            model = WARP2MFRecommender(d=num_factors_per_engine,gamma=opts.gamma,C=opts.C)
        else:
            model = WARPMFRecommender(d=num_factors_per_engine,gamma=opts.gamma,C=opts.C)
    else:
        parser.print_help()
        raise SystemExit('unknown model type: {0}'.format(opts.model))

    for trainfile in trainfiles:
        logging.info('processing {0}...'.format(trainfile))
        modelfile = get_modelfile(trainfile,opts.outdir)
        if opts.model == 'wrmf':
            runner = WRMFRunner()
            factorsdir = get_factorsdir(trainfile,opts.outdir)
            runner.run(view,model,opts.input_format,trainfile,opts.num_engines,factorsdir,modelfile)
        elif opts.model == 'warp':
            runner = WARPMFRunner()
            modelsdir = get_modelsdir(trainfile,opts.outdir)
            runner.run(view,model,opts.input_format,trainfile,opts.item_feature_format,opts.item_features,opts.num_engines,modelsdir,opts.overwrite,modelfile)
        else:
            runner = ItemSimilarityRunner()
            simsdir = get_simsdir(trainfile,opts.outdir)
            simsfile = get_simsfile(trainfile,opts.outdir)
            runner.run(view,model,opts.input_format,trainfile,opts.num_engines,simsdir,opts.overwrite,opts.max_sims,simsfile,modelfile)
# create the training data and required recommendation models train_data = trainData.TrainData(train_filename, user_means_filename) Q = None library_recommender = None if options.algorithm == 'mf': _, _, Q = sparsesvd(train_data.rating_matrix.tocsc(), config.FACTOR_MODEL_SIZE) elif options.algorithm == 'mrec': mrec_train_data = load_fast_sparse_matrix('tsv', train_filename) library_recommender = CosineKNNRecommender(config.NEIGHBOURHOOD_SIZE) library_recommender.fit(mrec_train_data) elif options.algorithm == 'warp': mrec_train_data = load_fast_sparse_matrix('tsv', train_filename) library_recommender = WARPMFRecommender(d=config.FACTOR_MODEL_SIZE, gamma=0.01, C=options.cvalue) library_recommender.fit(mrec_train_data.X) elif options.algorithm in ['ub', 'ib']: pass else: raise ValueError('Wrong rec. algorithm entered. Choose between ub, ib, mf, mrec, and warp') # run the beyondAccuracy for all users in the .eval file logging.info('running beyondAccuracy with {0}...'.format(eval_item_filename)) evaluation_cases = 0 with open(eval_item_filename,'rb') as eval_file: for line in eval_file: data = line.split('\t') user_id = data[0]