def compute_factors(task):
    """
    WRMF update method to run on an IPython engine. This reads from file and
    writes back to file, only filepaths and an empty model need to be passed.
    """
    # import modules needed on engine
    import os
    import numpy as np
    from mrec import load_fast_sparse_matrix

    model,input_format,trainfile,factor_type,get_indices,init_fixed_factors,fixed_factor_files,start,end,workdir = task

    data = load_fast_sparse_matrix(input_format,trainfile)
    if fixed_factor_files:
        H = np.vstack([np.load(f) for f in fixed_factor_files])
    else:
        H = init_fixed_factors(model,data)
    HH = H.T.dot(H)

    W = np.zeros(((end-start),model.d))
    for j in xrange(start,end):
        indices = get_indices(data,j)
        if indices.size:
            W[j-start,:] = model.update(indices,H,HH)
    np.save(os.path.join(workdir,'{0}.{1}.npy'.format(factor_type,start)),W)

    return start,end
def process(task):
    """
    Training task to run on an ipython engine.
    """
    # import modules required by engine
    import os
    import subprocess
    from mrec import load_fast_sparse_matrix

    model,input_format,trainfile,outdir,start,end,max_similar_items = task

    # initialise the model
    dataset = load_fast_sparse_matrix(input_format,trainfile)
    if hasattr(model,'similarity_matrix'):
        # clear out any existing similarity matrix
        model.similarity_matrix = None

    # write sims directly to file as we compute them
    outfile = os.path.join(outdir,'sims.{0}-{1}.tsv'.format(start,end))
    out = open(outfile,'w')
    for j in xrange(start,end):
        w = model.get_similar_items(j,max_similar_items=max_similar_items,dataset=dataset)
        for k,v in w:
            print >>out,'{0}\t{1}\t{2}'.format(j+1,k+1,v)  # write as 1-indexed
    out.close()

    # record success
    cmd = ['touch',os.path.join(outdir,'{0}-{1}.SUCCESS'.format(start,end))]
    subprocess.check_call(cmd)

    # return the range that we've processed
    return start,end
def compute_factors(task):
    """
    WRMF update method to run on an IPython engine. This reads from file and
    writes back to file, only filepaths and an empty model need to be passed.
    """
    # import modules needed on engine
    import os
    import numpy as np
    from mrec import load_fast_sparse_matrix

    model, input_format, trainfile, factor_type, get_indices, init_fixed_factors, fixed_factor_files, start, end, workdir = task

    data = load_fast_sparse_matrix(input_format, trainfile)
    if fixed_factor_files:
        H = np.vstack([np.load(f) for f in fixed_factor_files])
    else:
        H = init_fixed_factors(model, data)
    HH = H.T.dot(H)

    W = np.zeros(((end - start), model.d))
    for j in xrange(start, end):
        indices = get_indices(data, j)
        if indices.size:
            W[j - start, :] = model.update(indices, H, HH)
    np.save(os.path.join(workdir, '{0}.{1}.npy'.format(factor_type, start)), W)

    return start, end
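# The compute_factors() task above saves one '<factor_type>.<start>.npy' array per
# (start, end) chunk. Below is a minimal sketch of how a driver might stitch those
# chunks back into a single factor matrix; assemble_factors() and completed_ranges
# are hypothetical names, and the sketch assumes the chunks cover contiguous,
# non-overlapping row ranges.
import os
import numpy as np

def assemble_factors(workdir, factor_type, starts):
    # load each per-chunk array in row order and stack into one (num_rows x d) matrix
    chunks = [np.load(os.path.join(workdir, '{0}.{1}.npy'.format(factor_type, start)))
              for start in sorted(starts)]
    return np.vstack(chunks)

# e.g. U = assemble_factors(workdir, 'U', [start for start, end in completed_ranges])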
def process(view,opts,modelfile,trainfile,testfile,outdir,evaluator):
    logging.info('finding number of users...')
    dataset = load_fast_sparse_matrix(opts.input_format,trainfile)
    num_users,num_items = dataset.shape
    del dataset

    recsdir = get_recsdir(trainfile,opts.outdir)
    logging.info('creating recs directory {0}...'.format(recsdir))
    subprocess.check_call(['mkdir','-p',recsdir])

    done = []
    if not opts.overwrite:
        logging.info('checking for existing output recs...')
        done.extend(find_done(recsdir))
        if done:
            logging.info('found {0} output files'.format(len(done)))

    logging.info('creating tasks...')
    tasks = create_tasks(modelfile,opts.input_format,trainfile,opts.test_input_format,testfile,recsdir,num_users,opts.num_engines,done,evaluator)

    logging.info('running in parallel across ipython engines...')
    results = []
    results.append(view.map_async(predict.run,tasks,retries=2))

    # wait for tasks to complete
    processed = [r.get() for r in results]

    logging.info('checking output files...')
    done = find_done(recsdir)
    remaining = len(tasks) - len(done)

    if remaining == 0:
        logging.info('SUCCESS: all tasks completed')
        logging.info('concatenating {0} partial output files...'.format(len(done)))
        paths = [os.path.join(recsdir,'recs.{0}-{1}.tsv'.format(start,end)) for start,end in done]
        cmd = ['cat']+paths
        recsfile = get_recsfile(trainfile,outdir)
        subprocess.check_call(cmd,stdout=open(recsfile,'w'))
        logging.info('removing partial output files...')
        rmtree(recsdir)
        logging.info('done')

        # aggregate metrics from each task
        avg_metrics = defaultdict(float)
        tot_count = 0
        for results in processed:
            for cum_metrics,count in results:
                for m,val in cum_metrics.iteritems():
                    avg_metrics[m] += val
                tot_count += count
        for m in avg_metrics:
            avg_metrics[m] /= float(tot_count)
    else:
        logging.error('FAILED: {0}/{1} tasks did not complete successfully'.format(remaining,len(tasks)))
        logging.error('try rerunning the command to retry the remaining tasks')
        avg_metrics = None

    return read_recommender_description(modelfile),avg_metrics
def main():
    parser = OptionParser()
    parser.add_option('-d','--dataset',dest='dataset',help='path to dataset')
    parser.add_option('--input_format',dest='input_format',help='format of training dataset(s) tsv | csv | mm (matrixmarket) | fsm (fast_sparse_matrix)')
    parser.add_option('--l1_min',dest='l1_min',type='float',help='min l1 constant to try (expected to be a power of 10)')
    parser.add_option('--l1_max',dest='l1_max',type='float',help='max l1 constant to try (expected to be a power of 10)')
    parser.add_option('--l2_min',dest='l2_min',type='float',help='min l2 constant to try (expected to be a power of 10)')
    parser.add_option('--l2_max',dest='l2_max',type='float',help='max l2 constant to try (expected to be a power of 10)')
    parser.add_option('--max_sims',dest='max_sims',type='int',default=2000,help='max desired number of positive item similarity weights (default: %default)')
    parser.add_option('--min_sims',dest='min_sims',type='int',default=15,help='min desired number of positive item similarity weights (default: %default)')
    parser.add_option('--max_sparse',dest='max_sparse',type='float',default=0.01,help='max allowable proportion of items with less than min_sims positive similarity weights (default: %default)')
    parser.add_option('--num_samples',dest='num_samples',type='int',default=100,help='number of sample items to evaluate for each regularization setting')
    parser.add_option('--packer',dest='packer',default='json',help='packer for IPython.parallel (default: %default)')
    parser.add_option('--add_module_paths',dest='add_module_paths',help='comma-separated list of paths to append to pythonpath to enable import of uninstalled modules')

    (opts,args) = parser.parse_args()
    if not opts.dataset or not opts.input_format or not opts.l1_min or not opts.l1_max or not opts.l2_min or not opts.l2_max:
        parser.print_help()
        raise SystemExit

    logging.basicConfig(level=logging.INFO,format='[%(asctime)s] %(levelname)s: %(message)s')

    dataset = load_fast_sparse_matrix(opts.input_format,opts.dataset)

    params = {'l1_reg':pow_range(opts.l1_min,opts.l1_max),
              'l2_reg':pow_range(opts.l2_min,opts.l2_max)}
    num_items = dataset.shape[1]
    sample_items = random.sample(xrange(num_items),opts.num_samples)

    logging.info('preparing tasks for a grid search of these values:')
    logging.info(params)
    tasks = [(args,dataset,opts.min_sims,sample_items) for args in ParameterGrid(params)]

    c = Client(packer=opts.packer)
    view = c.load_balanced_view()
    if opts.add_module_paths:
        c[:].execute('import sys')
        for path in opts.add_module_paths.split(','):
            logging.info('adding {0} to pythonpath on all engines'.format(path))
            c[:].execute("sys.path.append('{0}')".format(path))

    logging.info('running {0} tasks in parallel...'.format(len(tasks)))
    results = view.map(estimate_sparsity,tasks,ordered=False)

    candidates = [(args,nsims,nsparse,nneg) for args,nsims,nsparse,nneg in results
                  if nsims <= opts.max_sims and nsparse <= opts.max_sparse]
    if candidates:
        best = min(candidates,key=itemgetter(1))
        print 'best parameter setting: {0}'.format(best[0])
        print 'mean # positive similarity weights per item = {0:.3}'.format(best[1])
        print 'proportion of items with fewer than {0} positive similarity weights = {1:.3}'.format(opts.min_sims,best[2])
        print 'mean # negative similarity weights per item = {0:.3}'.format(best[3])
    else:
        print 'no parameter settings satisfied the conditions, try increasing --min_sims, --max_sims or --max_sparse'
def process(view,opts,model,trainfile,outdir):
    logging.info('finding number of items...')
    dataset = load_fast_sparse_matrix(opts.input_format,trainfile)
    num_users,num_items = dataset.shape
    del dataset

    simsdir = get_simsdir(trainfile,outdir)
    logging.info('creating sims directory {0}...'.format(simsdir))
    subprocess.check_call(['mkdir','-p',simsdir])

    done = []
    if not opts.overwrite:
        logging.info('checking for existing output sims...')
        done.extend(find_done(simsdir))
        if done:
            logging.info('found {0} output files'.format(len(done)))

    logging.info('creating tasks...')
    tasks = create_tasks(model,opts.input_format,trainfile,simsdir,num_items,opts.num_engines,opts.max_sims,done)

    logging.info('running in parallel across ipython engines...')
    results = []
    results.append(view.map_async(train.run,tasks,retries=2))

    # wait for tasks to complete
    processed = [r.get() for r in results]

    logging.info('checking output files...')
    done = find_done(simsdir)
    remaining = len(tasks) - len(done)

    if remaining == 0:
        logging.info('SUCCESS: all tasks completed')
        logging.info('concatenating {0} partial output files...'.format(len(done)))
        paths = [os.path.join(simsdir,'sims.{0}-{1}.tsv'.format(start,end)) for start,end in done]
        cmd = ['cat']+paths
        simsfile = get_simsfile(trainfile,outdir)
        subprocess.check_call(cmd,stdout=open(simsfile,'w'))
        logging.info('removing partial output files...')
        rmtree(simsdir)
        model.load_similarity_matrix(simsfile,num_items)
        modelfile = get_modelfile(trainfile,outdir)
        save_recommender(model,modelfile)
        logging.info('done')
    else:
        logging.error('FAILED: {0}/{1} tasks did not complete successfully'.format(remaining,len(tasks)))
        logging.error('try rerunning the command to retry the remaining tasks')
def process(task):
    """
    Training task to run on an ipython engine.
    """
    # import modules required by engine
    import os
    import subprocess
    from mrec import load_fast_sparse_matrix

    model, input_format, trainfile, outdir, start, end, max_similar_items = task

    # initialise the model
    dataset = load_fast_sparse_matrix(input_format, trainfile)
    if hasattr(model, 'similarity_matrix'):
        # clear out any existing similarity matrix to trigger recomputation of
        # the item-item similarities from the users' ratings.
        model.similarity_matrix = None

    # write sims directly to file as we compute them
    outfile = os.path.join(outdir, 'sims.{0}-{1}.tsv'.format(start, end))
    out = open(outfile, 'w')
    for j in xrange(start, end):
        w = model.get_similar_items(j, max_similar_items=max_similar_items, dataset=dataset)
        for k, v in w:
            print >> out, '{0}\t{1}\t{2}'.format(j + 1, k + 1, v)  # write as 1-indexed
    out.close()

    # record success
    cmd = ['touch', os.path.join(outdir, '{0}-{1}.SUCCESS'.format(start, end))]
    subprocess.check_call(cmd)

    # return the range that we've processed
    return start, end
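# Each task above records completion by touching a '<start>-<end>.SUCCESS' marker in
# outdir, and the driver-level process(view, ...) functions in this file call
# find_done() to recover which ranges finished. A minimal sketch of such a helper,
# assuming only the marker-file naming used above (this is an illustrative
# implementation, not necessarily mrec's own):
import glob
import os

def find_done(outdir):
    done = []
    for path in glob.glob(os.path.join(outdir, '*.SUCCESS')):
        # e.g. '0-2500.SUCCESS' -> (0, 2500)
        start, end = os.path.basename(path)[:-len('.SUCCESS')].split('-')
        done.append((int(start), int(end)))
    return done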
def main():
    parser = OptionParser()
    parser.add_option('-d', '--dataset', dest='dataset', help='path to dataset')
    parser.add_option('--input_format', dest='input_format', help='format of training dataset(s) tsv | csv | mm (matrixmarket) | fsm (fast_sparse_matrix)')
    parser.add_option('--l1_min', dest='l1_min', type='float', help='min l1 constant to try (expected to be a power of 10)')
    parser.add_option('--l1_max', dest='l1_max', type='float', help='max l1 constant to try (expected to be a power of 10)')
    parser.add_option('--l2_min', dest='l2_min', type='float', help='min l2 constant to try (expected to be a power of 10)')
    parser.add_option('--l2_max', dest='l2_max', type='float', help='max l2 constant to try (expected to be a power of 10)')
    parser.add_option('--max_sims', dest='max_sims', type='int', default=2000, help='max desired number of positive item similarity weights (default: %default)')
    parser.add_option('--min_sims', dest='min_sims', type='int', default=15, help='min desired number of positive item similarity weights (default: %default)')
    parser.add_option('--max_sparse', dest='max_sparse', type='float', default=0.01, help='max allowable proportion of items with less than min_sims positive similarity weights (default: %default)')
    parser.add_option('--num_samples', dest='num_samples', type='int', default=100, help='number of sample items to evaluate for each regularization setting')
    parser.add_option('--packer', dest='packer', default='json', help='packer for IPython.parallel (default: %default)')
    parser.add_option('--add_module_paths', dest='add_module_paths', help='comma-separated list of paths to append to pythonpath to enable import of uninstalled modules')

    (opts, args) = parser.parse_args()
    if not opts.dataset or not opts.input_format or not opts.l1_min or not opts.l1_max or not opts.l2_min or not opts.l2_max:
        parser.print_help()
        raise SystemExit

    logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(levelname)s: %(message)s')

    dataset = load_fast_sparse_matrix(opts.input_format, opts.dataset)

    params = {'l1_reg': pow_range(opts.l1_min, opts.l1_max),
              'l2_reg': pow_range(opts.l2_min, opts.l2_max)}
    num_items = dataset.shape[1]
    sample_items = random.sample(range(num_items), opts.num_samples)

    logging.info('preparing tasks for a grid search of these values:')
    logging.info(params)
    tasks = [(args, dataset, opts.min_sims, sample_items) for args in ParameterGrid(params)]

    c = Client(packer=opts.packer)
    view = c.load_balanced_view()
    if opts.add_module_paths:
        c[:].execute('import sys')
        for path in opts.add_module_paths.split(','):
            logging.info('adding {0} to pythonpath on all engines'.format(path))
            c[:].execute("sys.path.append('{0}')".format(path))

    logging.info('running {0} tasks in parallel...'.format(len(tasks)))
    results = view.map(estimate_sparsity, tasks, ordered=False)

    candidates = [(args, nsims, nsparse, nneg) for args, nsims, nsparse, nneg in results
                  if nsims <= opts.max_sims and nsparse <= opts.max_sparse]
    if candidates:
        best = min(candidates, key=itemgetter(1))
        print('best parameter setting: {0}'.format(best[0]))
        print('mean # positive similarity weights per item = {0:.3}'.format(best[1]))
        print('proportion of items with fewer than {0} positive similarity weights = {1:.3}'.format(opts.min_sims, best[2]))
        print('mean # negative similarity weights per item = {0:.3}'.format(best[3]))
    else:
        print('no parameter settings satisfied the conditions, try increasing --min_sims, --max_sims or --max_sparse')
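# The grid search above builds its parameter grid with pow_range(), which is not shown
# in this snippet. Given that the --l1_min/--l1_max/--l2_min/--l2_max values are
# documented as powers of 10, a plausible (hypothetical) implementation would simply
# enumerate the powers of 10 between the two endpoints, inclusive:
import math

def pow_range(lo, hi):
    # enumerate 10**k for every integer exponent between lo and hi (both powers of 10)
    return [10 ** k for k in range(int(round(math.log10(lo))), int(round(math.log10(hi))) + 1)]

# e.g. pow_range(0.001, 0.1) -> [0.001, 0.01, 0.1]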
print 'loading test data...'
data = """\
%%MatrixMarket matrix coordinate real general
3 5 9
1 1 1
1 2 1
1 3 1
1 4 1
2 2 1
2 3 1
2 5 1
3 3 1
3 4 1
"""
print data
dataset = load_fast_sparse_matrix('mm',StringIO.StringIO(data))
num_users,num_items = dataset.shape

model = SLIM()
num_samples = 2

def output(i,j,val):
    # convert back to 1-indexed
    print '{0}\t{1}\t{2:.3f}'.format(i+1,j+1,val)

print 'computing some item similarities...'
print 'item\tsim\tweight'
# if we want we can compute these individually without calling fit()
for i in random.sample(xrange(num_items),num_samples):
    for j,weight in model.get_similar_items(i,max_similar_items=10,dataset=dataset):
        output(i,j,weight)
print('loading test data...')
data = """\
%%MatrixMarket matrix coordinate real general
3 5 9
1 1 1
1 2 1
1 3 1
1 4 1
2 2 1
2 3 1
2 5 1
3 3 1
3 4 1
"""
print(data)
dataset = load_fast_sparse_matrix('mm', StringIO.StringIO(data))
num_users, num_items = dataset.shape

model = CosineKNNRecommender(k=2)
num_samples = 2

def output(i, j, val):
    # convert back to 1-indexed
    print('{0}\t{1}\t{2:.3f}'.format(i + 1, j + 1, val))

print('computing some item similarities...')
print('item\tsim\tweight')
# if we want we can compute these individually without calling fit()
for i in random.sample(range(num_items), num_samples):
    for j, weight in model.get_similar_items(i, max_similar_items=2, dataset=dataset):
        output(i, j, weight)
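# The similarities above are computed item-by-item without calling fit(); to get
# per-user recommendations the model can instead be fitted on the whole matrix and
# queried with recommend_items(), as the other examples in this file do. A minimal
# sketch (user 0 is arbitrary; it assumes that with return_scores=True each entry is
# an (item, score) pair, as consumed elsewhere in these snippets):
model.fit(dataset)
for item, score in model.recommend_items(dataset.X, 0, max_items=3, return_scores=True):
    print('recommend item {0} to user 1 with score {1:.3f}'.format(item + 1, score))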
def testPredictionMethods(train_filename, eval_item_filename, user_means_filename):
    '''
    compare predictions generated by the different approaches
    computes pairwise list overlap and average recall for each method
    '''
    logging.info('testing predictions with data files {0}; {1}; {2}...'.format(train_filename, eval_item_filename, user_means_filename))

    mrec_train_data = load_fast_sparse_matrix('tsv', train_filename)
    mrec_recommender = CosineKNNRecommender(config.NEIGHBOURHOOD_SIZE)
    mrec_recommender.fit(mrec_train_data)
    warp_recommender = WARPMFRecommender(d=50, gamma=0.01, C=100.0)
    warp_recommender.fit(mrec_train_data.X)

    train_data = trainData.TrainData(train_filename, user_means_filename)
    _, _, Q = sparsesvd(train_data.rating_matrix.tocsc(), config.FACTOR_MODEL_SIZE)

    recalls = {}
    overlaps = {}
    top_recs = {}
    user_counter = 0.0
    methods = ['mrec', 'warp', 'mf', 'ub_classic', 'ib_classic', 'ub_damping', 'ib_damping', 'ub_non', 'ib_non']

    with open(eval_item_filename, 'rb') as eval_file:
        for line in eval_file:
            data = line.split('\t')
            user_id = data[0]
            ground_truth_items = data[1].split(',')
            random_unrated_items = data[2].rstrip('\n').split(',')
            evaluation_item_ids = ground_truth_items + random_unrated_items

            # for each prediction method, compute topN recommendations once per user
            predictions1 = mrec_recommender.recommend_items(mrec_train_data.X, int(user_id)-config.MREC_INDEX_OFFSET, max_items=10000, return_scores=True)
            top_recs['mrec'] = topNLists.getTopNList(predictions1, evaluation_item_ids=evaluation_item_ids)
            predictions2 = warp_recommender.recommend_items(mrec_train_data.X, int(user_id)-config.MREC_INDEX_OFFSET, max_items=10000, return_scores=True)
            top_recs['warp'] = topNLists.getTopNList(predictions2, evaluation_item_ids=evaluation_item_ids)
            predictions3 = train_data.getFactorBasedRecommendations(user_id, Q, evaluation_item_ids)
            top_recs['mf'] = topNLists.getTopNList(predictions3)
            predictions4 = train_data.getUserBasedRecommendations(user_id, evaluation_item_ids, 'classic')
            top_recs['ub_classic'] = topNLists.getTopNList(predictions4)
            predictions5 = train_data.getItemBasedRecommendations(user_id, evaluation_item_ids, 'classic')
            top_recs['ib_classic'] = topNLists.getTopNList(predictions5)
            predictions6 = train_data.getUserBasedRecommendations(user_id, evaluation_item_ids, 'self_damping')
            top_recs['ub_damping'] = topNLists.getTopNList(predictions6)
            predictions7 = train_data.getItemBasedRecommendations(user_id, evaluation_item_ids, 'self_damping')
            top_recs['ib_damping'] = topNLists.getTopNList(predictions7)
            predictions8 = train_data.getUserBasedRecommendations(user_id, evaluation_item_ids, 'non_normalized')
            top_recs['ub_non'] = topNLists.getTopNList(predictions8)
            predictions9 = train_data.getItemBasedRecommendations(user_id, evaluation_item_ids, 'non_normalized')
            top_recs['ib_non'] = topNLists.getTopNList(predictions9)

            # then, use the computed topN lists to update recall and overlap values
            for method1 in methods:
                if method1 in recalls:
                    recalls[method1] += topNLists.getRecall(ground_truth_items, top_recs[method1])
                else:
                    recalls[method1] = topNLists.getRecall(ground_truth_items, top_recs[method1])
                for method2 in methods:
                    dict_key = method1 + '_' + method2
                    if dict_key in overlaps:
                        overlaps[dict_key] += topNLists.computeRecommendationListOverlap(top_recs[method1], top_recs[method2])
                    else:
                        overlaps[dict_key] = topNLists.computeRecommendationListOverlap(top_recs[method1], top_recs[method2])

            user_counter += 1.0
            logging.info('Tested user {0}. Current recalls: {1}. Current overlaps: {2}'.format(
                user_id,
                [(k, v/user_counter) for k, v in recalls.items()],
                [(k, v/user_counter) for k, v in overlaps.items()]))

    return recalls, overlaps
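# testPredictionMethods() returns running *sums* of recall and overlap; to report
# figures the way its log message does, the caller divides by the number of evaluated
# users. A minimal sketch of that post-processing, assuming the caller tracks
# num_users_evaluated itself (average_metrics is a hypothetical helper):
def average_metrics(recalls, overlaps, num_users_evaluated):
    n = float(num_users_evaluated)
    avg_recalls = {method: total / n for method, total in recalls.items()}
    avg_overlaps = {pair: total / n for pair, total in overlaps.items()}
    return avg_recalls, avg_overlaps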
def main():
    import os
    import logging
    import glob
    import subprocess
    from optparse import OptionParser
    from IPython.parallel import Client

    from mrec import load_fast_sparse_matrix, save_recommender
    from mrec.item_similarity.slim import SLIM
    from mrec.item_similarity.knn import (CosineKNNRecommender, DotProductKNNRecommender,
                                          AdjustedCosineKNNRecommender, JaccardKNNRecommender)
    from mrec.mf.wrmf import WRMFRecommender
    from mrec.mf.warp import WARPMFRecommender
    from mrec.mf.warp2 import WARP2MFRecommender
    from mrec.popularity import ItemPopularityRecommender
    from mrec.parallel.item_similarity import ItemSimilarityRunner
    from mrec.parallel.wrmf import WRMFRunner
    from mrec.parallel.warp import WARPMFRunner

    logging.basicConfig(level=logging.INFO,format='[%(asctime)s] %(levelname)s: %(message)s')

    parser = OptionParser()
    parser.add_option('-n','--num_engines',dest='num_engines',type='int',default=0,help='number of IPython engines to use')
    parser.add_option('--input_format',dest='input_format',help='format of training dataset(s) tsv | csv | mm (matrixmarket) | fsm (fast_sparse_matrix)')
    parser.add_option('--train',dest='train',help='glob specifying path(s) to training dataset(s) IMPORTANT: must be in quotes if it includes the * wildcard')
    parser.add_option('--outdir',dest='outdir',help='directory for output files')
    parser.add_option('--overwrite',dest='overwrite',action='store_true',help='overwrite existing files in outdir')
    parser.add_option('--model',dest='model',default='slim',help='type of model to train: slim | knn | wrmf | warp | popularity (default: %default)')
    parser.add_option('--max_sims',dest='max_sims',type='int',default=100,help='max similar items to output for each training item (default: %default)')
    parser.add_option('--learner',dest='learner',default='sgd',help='underlying learner for SLIM learner: sgd | elasticnet | fs_sgd (default: %default)')
    parser.add_option('--l1_reg',dest='l1_reg',type='float',default=0.001,help='l1 regularization constant (default: %default)')
    parser.add_option('--l2_reg',dest='l2_reg',type='float',default=0.0001,help='l2 regularization constant (default: %default)')
    parser.add_option('--metric',dest='metric',default='cosine',help='metric for knn recommender: cosine | dot | adjusted_cosine | jaccard (default: %default)')
    parser.add_option('--num_factors',dest='num_factors',type='int',default=80,help='number of latent factors (default: %default)')
    parser.add_option('--alpha',dest='alpha',type='float',default=1.0,help='wrmf confidence constant (default: %default)')
    parser.add_option('--lbda',dest='lbda',type='float',default=0.015,help='wrmf regularization constant (default: %default)')
    parser.add_option('--als_iters',dest='als_iters',type='int',default=15,help='number of als iterations (default: %default)')
    parser.add_option('--gamma',dest='gamma',type='float',default=0.01,help='warp learning rate (default: %default)')
    parser.add_option('--C',dest='C',type='float',default=100.0,help='warp regularization constant (default: %default)')
    parser.add_option('--item_feature_format',dest='item_feature_format',help='format of item features tsv | csv | mm (matrixmarket) | npz (numpy arrays)')
    parser.add_option('--item_features',dest='item_features',help='path to sparse item features in tsv format (item_id,feature_id,val)')
    parser.add_option('--popularity_method',dest='popularity_method',default='count',help='how to compute popularity for baseline recommender: count | sum | avg | thresh (default: %default)')
    parser.add_option('--popularity_thresh',dest='popularity_thresh',type='float',default=0,help='ignore scores below this when computing popularity for baseline recommender (default: %default)')
    parser.add_option('--packer',dest='packer',default='json',help='packer for IPython.parallel (default: %default)')
    parser.add_option('--add_module_paths',dest='add_module_paths',help='optional comma-separated list of paths to append to pythonpath (useful if you need to import uninstalled modules to IPython engines on a cluster)')

    (opts,args) = parser.parse_args()
    if not opts.input_format or not opts.train or not opts.outdir or not opts.num_engines:
        parser.print_help()
        raise SystemExit

    opts.train = os.path.abspath(os.path.expanduser(opts.train))
    opts.outdir = os.path.abspath(os.path.expanduser(opts.outdir))

    trainfiles = glob.glob(opts.train)

    if opts.model == 'popularity':
        # special case, don't need to run in parallel
        subprocess.check_call(['mkdir','-p',opts.outdir])
        for trainfile in trainfiles:
            logging.info('processing {0}...'.format(trainfile))
            model = ItemPopularityRecommender(method=opts.popularity_method,thresh=opts.popularity_thresh)
            dataset = load_fast_sparse_matrix(opts.input_format,trainfile)
            model.fit(dataset)
            modelfile = get_modelfile(trainfile,opts.outdir)
            save_recommender(model,modelfile)
            logging.info('done')
        return

    # create an ipython client
    c = Client(packer=opts.packer)
    view = c.load_balanced_view()

    if opts.add_module_paths:
        c[:].execute('import sys')
        for path in opts.add_module_paths.split(','):
            logging.info('adding {0} to pythonpath on all engines'.format(path))
            c[:].execute("sys.path.append('{0}')".format(path))

    if opts.model == 'slim':
        if opts.learner == 'fs_sgd':
            num_selected_features = 2 * opts.max_sims  # preselect this many candidate similar items
            model = SLIM(l1_reg=opts.l1_reg,l2_reg=opts.l2_reg,model=opts.learner,num_selected_features=num_selected_features)
        else:
            model = SLIM(l1_reg=opts.l1_reg,l2_reg=opts.l2_reg,model=opts.learner)
    elif opts.model == 'knn':
        if opts.metric == 'cosine':
            model = CosineKNNRecommender(k=opts.max_sims)
        elif opts.metric == 'dot':
            model = DotProductKNNRecommender(k=opts.max_sims)
        elif opts.metric == 'adjusted_cosine':
            model = AdjustedCosineKNNRecommender(k=opts.max_sims)
        elif opts.metric == 'jaccard':
            model = JaccardKNNRecommender(k=opts.max_sims)
        else:
            parser.print_help()
            raise SystemExit('unknown metric: {0}'.format(opts.metric))
    elif opts.model == 'wrmf':
        model = WRMFRecommender(d=opts.num_factors,alpha=opts.alpha,lbda=opts.lbda,num_iters=opts.als_iters)
    elif opts.model == 'warp':
        num_factors_per_engine = max(opts.num_factors/opts.num_engines,1)
        if opts.item_features:
            model = WARP2MFRecommender(d=num_factors_per_engine,gamma=opts.gamma,C=opts.C)
        else:
            model = WARPMFRecommender(d=num_factors_per_engine,gamma=opts.gamma,C=opts.C)
    else:
        parser.print_help()
        raise SystemExit('unknown model type: {0}'.format(opts.model))

    for trainfile in trainfiles:
        logging.info('processing {0}...'.format(trainfile))
        modelfile = get_modelfile(trainfile,opts.outdir)
        if opts.model == 'wrmf':
            runner = WRMFRunner()
            factorsdir = get_factorsdir(trainfile,opts.outdir)
            runner.run(view,model,opts.input_format,trainfile,opts.num_engines,factorsdir,modelfile)
        elif opts.model == 'warp':
            runner = WARPMFRunner()
            modelsdir = get_modelsdir(trainfile,opts.outdir)
            runner.run(view,model,opts.input_format,trainfile,opts.item_feature_format,opts.item_features,opts.num_engines,modelsdir,opts.overwrite,modelfile)
        else:
            runner = ItemSimilarityRunner()
            simsdir = get_simsdir(trainfile,opts.outdir)
            simsfile = get_simsfile(trainfile,opts.outdir)
            runner.run(view,model,opts.input_format,trainfile,opts.num_engines,simsdir,opts.overwrite,opts.max_sims,simsfile,modelfile)
config.SPLIT_DIR = os.path.join(config.PACKAGE_DIR, '../grid_search_music_splits/')
config.LABEL_FREQUENCY_THRESHOLD = 10
if not os.path.exists(config.SPLIT_DIR):
    os.makedirs(config.SPLIT_DIR)

factor_values = [25, 50, 75, 100]
C_values = [1.0, 10.0, 100.0, 1000.0]
gamma_values = [0.01, 0.001, 0.0001]

filenames = dataPreprocessing.loadData(mode='beyond_accuracy')

# 5-fold cross-validation
for iteration, (train_filename, test_filename, user_means_filename, eval_item_filename) in enumerate(filenames, 1):
    mrec_train_data = load_fast_sparse_matrix('tsv', train_filename)
    # create the training data and required recommendation models
    train_data = trainData.TrainData(train_filename, user_means_filename)
    for factor_value, C_value, gamma_value in product(factor_values, C_values, gamma_values):
        warp_recommender = WARPMFRecommender(d=factor_value, gamma=gamma_value, C=C_value)
        warp_recommender.fit(mrec_train_data.X)
        logging.info('running fold {0} with f={1}, C={2}, g={3}...'.format(
            iteration, factor_value, C_value, gamma_value))
        recall = 0
def main():
    import os
    import logging
    import glob
    import subprocess
    from optparse import OptionParser
    from IPython.parallel import Client

    from mrec import load_fast_sparse_matrix, save_recommender
    from mrec.item_similarity.slim import SLIM
    from mrec.item_similarity.knn import CosineKNNRecommender, DotProductKNNRecommender
    from mrec.mf.wrmf import WRMFRecommender
    from mrec.mf.warp import WARPMFRecommender
    from mrec.mf.warp2 import WARP2MFRecommender
    from mrec.popularity import ItemPopularityRecommender
    from mrec.parallel.item_similarity import ItemSimilarityRunner
    from mrec.parallel.wrmf import WRMFRunner
    from mrec.parallel.warp import WARPMFRunner

    logging.basicConfig(level=logging.INFO,format='[%(asctime)s] %(levelname)s: %(message)s')

    parser = OptionParser()
    parser.add_option('-n','--num_engines',dest='num_engines',type='int',default=0,help='number of IPython engines to use')
    parser.add_option('--input_format',dest='input_format',help='format of training dataset(s) tsv | csv | mm (matrixmarket) | fsm (fast_sparse_matrix)')
    parser.add_option('--train',dest='train',help='glob specifying path(s) to training dataset(s) IMPORTANT: must be in quotes if it includes the * wildcard')
    parser.add_option('--outdir',dest='outdir',help='directory for output files')
    parser.add_option('--overwrite',dest='overwrite',action='store_true',help='overwrite existing files in outdir')
    parser.add_option('--model',dest='model',default='slim',help='type of model to train: slim | knn | wrmf | warp | popularity (default: %default)')
    parser.add_option('--max_sims',dest='max_sims',type='int',default=100,help='max similar items to output for each training item (default: %default)')
    parser.add_option('--learner',dest='learner',default='sgd',help='underlying learner for SLIM learner: sgd | elasticnet | fs_sgd (default: %default)')
    parser.add_option('--l1_reg',dest='l1_reg',type='float',default=0.001,help='l1 regularization constant (default: %default)')
    parser.add_option('--l2_reg',dest='l2_reg',type='float',default=0.0001,help='l2 regularization constant (default: %default)')
    parser.add_option('--metric',dest='metric',default='cosine',help='metric for knn recommender: cosine | dot (default: %default)')
    parser.add_option('--num_factors',dest='num_factors',type='int',default=80,help='number of latent factors (default: %default)')
    parser.add_option('--alpha',dest='alpha',type='float',default=1.0,help='wrmf confidence constant (default: %default)')
    parser.add_option('--lbda',dest='lbda',type='float',default=0.015,help='wrmf regularization constant (default: %default)')
    parser.add_option('--als_iters',dest='als_iters',type='int',default=15,help='number of als iterations (default: %default)')
    parser.add_option('--gamma',dest='gamma',type='float',default=0.01,help='warp learning rate (default: %default)')
    parser.add_option('--C',dest='C',type='float',default=100.0,help='warp regularization constant (default: %default)')
    parser.add_option('--item_feature_format',dest='item_feature_format',help='format of item features tsv | csv | mm (matrixmarket) | npz (numpy arrays)')
    parser.add_option('--item_features',dest='item_features',help='path to sparse item features in tsv format (item_id,feature_id,val)')
    parser.add_option('--popularity_method',dest='popularity_method',default='count',help='how to compute popularity for baseline recommender: count | sum | avg | thresh (default: %default)')
    parser.add_option('--popularity_thresh',dest='popularity_thresh',type='float',default=0,help='ignore scores below this when computing popularity for baseline recommender (default: %default)')
    parser.add_option('--packer',dest='packer',default='json',help='packer for IPython.parallel (default: %default)')
    parser.add_option('--add_module_paths',dest='add_module_paths',help='optional comma-separated list of paths to append to pythonpath (useful if you need to import uninstalled modules to IPython engines on a cluster)')

    (opts,args) = parser.parse_args()
    if not opts.input_format or not opts.train or not opts.outdir or not opts.num_engines:
        parser.print_help()
        raise SystemExit

    opts.train = os.path.abspath(os.path.expanduser(opts.train))
    opts.outdir = os.path.abspath(os.path.expanduser(opts.outdir))

    trainfiles = glob.glob(opts.train)

    if opts.model == 'popularity':
        # special case, don't need to run in parallel
        subprocess.check_call(['mkdir','-p',opts.outdir])
        for trainfile in trainfiles:
            logging.info('processing {0}...'.format(trainfile))
            model = ItemPopularityRecommender(method=opts.popularity_method,thresh=opts.popularity_thresh)
            dataset = load_fast_sparse_matrix(opts.input_format,trainfile)
            model.fit(dataset)
            modelfile = get_modelfile(trainfile,opts.outdir)
            save_recommender(model,modelfile)
            logging.info('done')
        return

    # create an ipython client
    c = Client(packer=opts.packer)
    view = c.load_balanced_view()

    if opts.add_module_paths:
        c[:].execute('import sys')
        for path in opts.add_module_paths.split(','):
            logging.info('adding {0} to pythonpath on all engines'.format(path))
            c[:].execute("sys.path.append('{0}')".format(path))

    if opts.model == 'slim':
        if opts.learner == 'fs_sgd':
            num_selected_features = 2 * opts.max_sims  # preselect this many candidate similar items
            model = SLIM(l1_reg=opts.l1_reg,l2_reg=opts.l2_reg,model=opts.learner,num_selected_features=num_selected_features)
        else:
            model = SLIM(l1_reg=opts.l1_reg,l2_reg=opts.l2_reg,model=opts.learner)
    elif opts.model == 'knn':
        if opts.metric == 'cosine':
            model = CosineKNNRecommender(k=opts.max_sims)
        elif opts.metric == 'dot':
            model = DotProductKNNRecommender(k=opts.max_sims)
        else:
            parser.print_help()
            raise SystemExit('unknown metric: {0}'.format(opts.metric))
    elif opts.model == 'wrmf':
        model = WRMFRecommender(d=opts.num_factors,alpha=opts.alpha,lbda=opts.lbda,num_iters=opts.als_iters)
    elif opts.model == 'warp':
        num_factors_per_engine = max(opts.num_factors/opts.num_engines,1)
        if opts.item_features:
            model = WARP2MFRecommender(d=num_factors_per_engine,gamma=opts.gamma,C=opts.C)
        else:
            model = WARPMFRecommender(d=num_factors_per_engine,gamma=opts.gamma,C=opts.C)
    else:
        parser.print_help()
        raise SystemExit('unknown model type: {0}'.format(opts.model))

    for trainfile in trainfiles:
        logging.info('processing {0}...'.format(trainfile))
        modelfile = get_modelfile(trainfile,opts.outdir)
        if opts.model == 'wrmf':
            runner = WRMFRunner()
            factorsdir = get_factorsdir(trainfile,opts.outdir)
            runner.run(view,model,opts.input_format,trainfile,opts.num_engines,factorsdir,modelfile)
        elif opts.model == 'warp':
            runner = WARPMFRunner()
            modelsdir = get_modelsdir(trainfile,opts.outdir)
            runner.run(view,model,opts.input_format,trainfile,opts.item_feature_format,opts.item_features,opts.num_engines,modelsdir,opts.overwrite,modelfile)
        else:
            runner = ItemSimilarityRunner()
            simsdir = get_simsdir(trainfile,opts.outdir)
            simsfile = get_simsfile(trainfile,opts.outdir)
            runner.run(view,model,opts.input_format,trainfile,opts.num_engines,simsdir,opts.overwrite,opts.max_sims,simsfile,modelfile)
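# The runners invoked above hand each engine a contiguous (start, end) slice of the
# users or items via create_tasks(), which is not shown in these snippets. A minimal
# sketch of that kind of range splitting, under the assumption that the chunks simply
# partition [0, n) as evenly as possible (split_ranges is a hypothetical helper):
def split_ranges(n, num_engines):
    chunk = (n + num_engines - 1) // num_engines  # ceiling division
    return [(start, min(start + chunk, n)) for start in range(0, n, chunk)]

# e.g. split_ranges(10, 3) -> [(0, 4), (4, 8), (8, 10)]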
# Preparing format for mrec
toy = data.copy()

le = preprocessing.LabelEncoder()
user_num = le.fit_transform(toy['user']) + 1
toy['user'] = user_num

le2 = preprocessing.LabelEncoder()
game_num = le2.fit_transform(toy['item']) + 1
toy['item'] = game_num

# converting play count to 0: did not play, 1: did play
binary = toy.copy()
binary['score'] = binary['score'].map(lambda x: 1)

binary_train = binary.sort('user')[::2]
binary_test = binary.sort('user')[1::2]

binary_train.to_csv('../ge10/train.tsv', sep='\t', header=False, index=False)
binary_test.to_csv('../ge10/test.tsv', sep='\t', header=False, index=False)

# load_fast_sparse_matrix takes the input format as its first argument
dataset = load_fast_sparse_matrix('tsv', '../ge10/train.tsv')
num_users,num_items = dataset.shape

model = SLIM()
model.fit(dataset.X)  # fit the item-item model before generating recommendations
recs = model.batch_recommend_items(dataset.X)
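# batch_recommend_items() works on the 0-indexed columns of dataset.X, while the
# original item names were label-encoded (and shifted by 1) before being written to
# the 1-indexed tsv above. A minimal sketch of mapping one user's recommendations
# back to item names, assuming that each entry of recs is a list of (item_index,
# score) pairs (as with recommend_items(..., return_scores=True) elsewhere in these
# examples) and that the tsv loader undoes the +1 offset:
user = 0
for item_index, score in recs[user]:
    item_name = le2.inverse_transform([item_index])[0]  # undo the LabelEncoder mapping
    print('{0}\t{1:.3f}'.format(item_name, score))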