def main():
    import sys
    from mrec import load_sparse_matrix, save_recommender
    from mrec.item_similarity.knn import CosineKNNRecommender
    from mrec.mf.warp import WARPMFRecommender
    from mrec.reranking_recommender import RerankingRecommender

    file_format = sys.argv[1]
    filepath = sys.argv[2]
    outfile = sys.argv[3]

    # load training set as scipy sparse matrix
    train = load_sparse_matrix(file_format, filepath)

    # candidate generator: item-item cosine similarity
    item_sim_model = CosineKNNRecommender(k=100)
    # reranker: WARP matrix factorization
    mf_model = WARPMFRecommender(d=80, gamma=0.01, C=100.0, max_iters=25000,
                                 validation_iters=1000, batch_size=10)
    recommender = RerankingRecommender(item_sim_model, mf_model,
                                       num_candidates=100)

    recommender.fit(train)

    save_recommender(recommender, outfile)

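# A minimal usage sketch, not part of the original script: it assumes the model
# was saved to 'reranker.npz' by main() above, that 'train.tsv' is the training
# file (both filenames are hypothetical), and that the reranking model exposes
# mrec's standard recommend_items() API once reloaded with load_recommender.
def example_recommend():
    from mrec import load_sparse_matrix, load_recommender
    train = load_sparse_matrix('tsv', 'train.tsv')    # hypothetical path
    recommender = load_recommender('reranker.npz')    # hypothetical path
    # top 10 reranked items (with scores) for user 0
    return recommender.recommend_items(train, 0, max_items=10)
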
def process(task): """ Training task to run on an ipython engine. """ # import modules required by engine import os import subprocess from mrec import load_sparse_matrix, save_recommender model, input_format, trainfile, feature_format, featurefile, outfile, offset, step = task dataset = load_sparse_matrix(input_format, trainfile) if featurefile is not None: # currently runs much faster if features are loaded as a dense matrix item_features = load_sparse_matrix(feature_format, featurefile).toarray() # strip features for any trailing items that don't appear in training set num_items = dataset.shape[1] item_features = item_features[:num_items, :] model.fit(dataset, item_features=item_features) else: model.fit(dataset) save_recommender(model, outfile) # record success cmd = ['touch', '{0}.SUCCESS'.format(outfile)] subprocess.check_call(cmd) # return the offset for the samples that we've learned from return offset
def run(self, view, model, input_format, trainfile, num_engines, workdir, modelfile):
    logging.info('creating factors directory {0}'.format(workdir))
    subprocess.check_call(['mkdir', '-p', workdir])

    logging.info('getting data size')
    data = load_sparse_matrix(input_format, trainfile)
    num_users, num_items = data.shape
    del data

    for it in xrange(model.num_iters):
        logging.info('iteration {0}'.format(it))
        tasks = self.create_tasks(num_users, num_engines, model, input_format,
                                  trainfile, workdir, 'U', get_user_indices,
                                  get_item_factor_files, init_item_factors)
        self.run_tasks(view, tasks)
        tasks = self.create_tasks(num_items, num_engines, model, input_format,
                                  trainfile, workdir, 'V', get_item_indices,
                                  get_user_factor_files,
                                  None)  # won't need to initialize user factors
        self.run_tasks(view, tasks)

    model.U = np.vstack([np.load(f) for f in get_user_factor_files(workdir)])
    model.V = np.vstack([np.load(f) for f in get_item_factor_files(workdir)])

    save_recommender(model, modelfile)

    logging.info('removing partial output files')
    rmtree(workdir)
    logging.info('done')

def process(task): """ Training task to run on an ipython engine. """ # import modules required by engine import os import subprocess from mrec import load_sparse_matrix, save_recommender model,input_format,trainfile,feature_format,featurefile,outfile,offset,step = task dataset = load_sparse_matrix(input_format,trainfile) if featurefile is not None: # currently runs much faster if features are loaded as a dense matrix item_features = load_sparse_matrix(feature_format,featurefile).toarray() # strip features for any trailing items that don't appear in training set num_items = dataset.shape[1] item_features = item_features[:num_items,:] model.fit(dataset,item_features=item_features) else: model.fit(dataset) save_recommender(model,outfile) # record success cmd = ['touch','{0}.SUCCESS'.format(outfile)] subprocess.check_call(cmd) # return the offset for the samples that we've learned from return offset
def process(task): """ Training task to run on an ipython engine. """ # import modules required by engine import os import subprocess from mrec import load_sparse_matrix, save_recommender from mrec.mf.warp import ShuffleSampler model,input_format,trainfile,outfile,offset,step = task # TODO: configure this!!! positive_thresh = 1 dataset = load_sparse_matrix(input_format,trainfile) # TODO: models don't seem to converge, investigate.... #sampler = ShuffleSampler(dataset,positive_thresh,42,offset,step) sampler = ShuffleSampler(dataset,positive_thresh,42) model.fit(dataset,sampler) save_recommender(model,outfile) # record success cmd = ['touch','{0}.SUCCESS'.format(outfile)] subprocess.check_call(cmd) # return the offset for the samples that we've learned from return offset
def main():
    import logging
    import subprocess
    from optparse import OptionParser

    import numpy as np
    from scipy.io import mmread

    from mrec import save_recommender
    from mrec.mf.recommender import MatrixFactorizationRecommender
    from mrec.examples.filename_conventions import get_modelfile

    logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(levelname)s: %(message)s')

    parser = OptionParser()
    parser.add_option('--factor_format', dest='factor_format', help='format of factor files tsv | mm (matrixmarket) | npy (numpy array)')
    parser.add_option('--user_factors', dest='user_factors', help='user factors filepath')
    parser.add_option('--item_factors', dest='item_factors', help='item factors filepath')
    parser.add_option('--train', dest='train', help='filepath to training data, just used to apply naming convention to output model saved here')
    parser.add_option('--outdir', dest='outdir', help='directory for output')
    parser.add_option('--description', dest='description', help='optional description of how factors were computed, will be saved with model so it can be output with evaluation results')

    (opts, args) = parser.parse_args()
    if not opts.factor_format or not opts.user_factors or not opts.item_factors \
            or not opts.outdir:
        parser.print_help()
        raise SystemExit

    model = MatrixFactorizationRecommender()

    logging.info('loading factors...')
    if opts.factor_format == 'npy':
        model.U = np.load(opts.user_factors)
        model.V = np.load(opts.item_factors)
    elif opts.factor_format == 'mm':
        model.U = mmread(opts.user_factors)
        model.V = mmread(opts.item_factors)
    elif opts.factor_format == 'tsv':
        model.U = np.loadtxt(opts.user_factors)
        model.V = np.loadtxt(opts.item_factors)
    else:
        raise ValueError('unknown factor format: {0}'.format(opts.factor_format))

    if opts.description:
        model.description = opts.description

    logging.info('saving model...')
    logging.info('creating output directory {0}...'.format(opts.outdir))
    subprocess.check_call(['mkdir', '-p', opts.outdir])
    modelfile = get_modelfile(opts.train, opts.outdir)
    save_recommender(model, modelfile)
    logging.info('done')

def run(self, view, model, input_format, trainfile, feature_format, featurefile, num_engines, workdir, overwrite, modelfile):
    logging.info('creating models directory {0}...'.format(workdir))
    subprocess.check_call(['mkdir', '-p', workdir])

    done = []
    if not overwrite:
        logging.info('checking for existing output models...')
        done.extend(self.find_done(workdir))
        if done:
            logging.info('found {0} output files'.format(len(done)))

    logging.info('creating tasks...')
    tasks = self.create_tasks(model, input_format, trainfile, feature_format,
                              featurefile, workdir, num_engines, done)

    if tasks:
        logging.info('running in parallel across ipython engines...')
        async_job = view.map_async(process, tasks, retries=2)
        # wait for tasks to complete
        results = async_job.get()
        logging.info('checking output files...')
        done = self.find_done(workdir)
        remaining = len(tasks) - len(done)
    else:
        remaining = 0

    if remaining == 0:
        logging.info('SUCCESS: all tasks completed')
        logging.info('concatenating {0} models...'.format(len(done)))
        for ix in sorted(done):
            partial_model = load_recommender(self.get_modelfile(ix, workdir))
            if ix == 0:
                model = partial_model
            else:
                # concatenate factors
                model.d += partial_model.d
                model.U = np.hstack((model.U, partial_model.U))
                model.V = np.hstack((model.V, partial_model.V))
                if hasattr(model, 'W'):
                    model.W = np.hstack((model.W, partial_model.W))
        save_recommender(model, modelfile)
        logging.info('removing partial output files...')
        rmtree(workdir)
        logging.info('done')
    else:
        logging.error('FAILED: {0}/{1} tasks did not complete successfully'.format(remaining, len(tasks)))
        logging.error('try rerunning the command to retry the remaining tasks')

def run(self, view, model, input_format, trainfile, num_engines, simsdir, overwrite, max_sims, simsfile, modelfile):
    logging.info('finding number of items...')
    dataset = load_sparse_matrix(input_format, trainfile)
    num_users, num_items = dataset.shape
    del dataset
    logging.info('%d users and %d items', num_users, num_items)

    logging.info('creating sims directory {0}...'.format(simsdir))
    subprocess.check_call(['mkdir', '-p', simsdir])

    done = []
    if not overwrite:
        logging.info('checking for existing output sims...')
        done.extend(self.find_done(simsdir))
        if done:
            logging.info('found {0} output files'.format(len(done)))

    logging.info('creating tasks...')
    tasks = self.create_tasks(model, input_format, trainfile, simsdir,
                              num_items, num_engines, max_sims, done)

    if num_engines > 0:
        logging.info('running %d tasks in parallel across ipython engines...', len(tasks))
        async_job = view.map_async(process, tasks, retries=2)
        # wait for tasks to complete
        results = async_job.get()
    else:
        # sequential run to make debugging easier
        logging.info('training similarity model sequentially')
        results = [process(task) for task in tasks]

    logging.info('checking output files...')
    done = self.find_done(simsdir)
    remaining = len(tasks) - len(done)

    if remaining == 0:
        logging.info('SUCCESS: all tasks completed')
        logging.info('concatenating {0} partial output files...'.format(len(done)))
        paths = [os.path.join(simsdir, 'sims.{0}-{1}.tsv'.format(start, end)) for start, end in done]
        cmd = ['cat'] + paths
        subprocess.check_call(cmd, stdout=open(simsfile, 'w'))
        logging.info('removing partial output files...')
        rmtree(simsdir)
        logging.info('loading %d items in %s model from %s', num_items, type(model).__name__, simsfile)
        model.load_similarity_matrix(simsfile, num_items)
        save_recommender(model, modelfile)
        logging.info('done')
    else:
        logging.error('FAILED: {0}/{1} tasks did not complete successfully'.format(remaining, len(tasks)))
        logging.error('try rerunning the command to retry the remaining tasks')

def main():
    import sys
    from mrec import load_sparse_matrix, save_recommender
    from mrec.mf.warp import WARPMFRecommender

    file_format = sys.argv[1]
    filepath = sys.argv[2]
    outfile = sys.argv[3]

    # load training set as scipy sparse matrix
    train = load_sparse_matrix(file_format, filepath)

    model = WARPMFRecommender(d=100, gamma=0.01, C=100.0, batch_size=10)
    model.fit(train)

    save_recommender(model, outfile)

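# Invocation sketch for the script above (argument order from sys.argv:
# file_format, filepath, outfile; the script and data names are illustrative
# only, not from the original source):
#
#   python warp_example.py tsv train.tsv warp_model.npz
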
def main(file_format, filepath, feature_format, feature_file, outfile):
    from mrec import load_sparse_matrix, save_recommender
    from mrec.mf.warp2 import WARP2MFRecommender

    # load training set
    train = load_sparse_matrix(file_format, filepath)

    # load item features, assume they are tsv: item_id,feature_id,val
    X = load_sparse_matrix(feature_format, feature_file).toarray()
    # strip features for any trailing items that don't appear in training set
    num_items = train.shape[1]
    X = X[:num_items, :]

    model = WARP2MFRecommender(d=100, gamma=0.01, C=100.0, batch_size=10)
    model.fit(train, X)

    save_recommender(model, outfile)

def main():
    import sys
    from mrec import load_sparse_matrix, save_recommender
    from mrec.mf.climf import CLiMFRecommender

    file_format = sys.argv[1]
    filepath = sys.argv[2]
    outfile = sys.argv[3]

    # load training set as scipy sparse matrix
    train = load_sparse_matrix(file_format, filepath)

    model = CLiMFRecommender(d=5)
    model.fit(train)

    save_recommender(model, outfile)

def process(view, opts, model, trainfile, outdir):
    logging.info('finding number of items...')
    dataset = load_fast_sparse_matrix(opts.input_format, trainfile)
    num_users, num_items = dataset.shape
    del dataset

    simsdir = get_simsdir(trainfile, outdir)
    logging.info('creating sims directory {0}...'.format(simsdir))
    subprocess.check_call(['mkdir', '-p', simsdir])

    done = []
    if not opts.overwrite:
        logging.info('checking for existing output sims...')
        done.extend(find_done(simsdir))
        if done:
            logging.info('found {0} output files'.format(len(done)))

    logging.info('creating tasks...')
    tasks = create_tasks(model, opts.input_format, trainfile, simsdir,
                         num_items, opts.num_engines, opts.max_sims, done)

    logging.info('running in parallel across ipython engines...')
    results = []
    results.append(view.map_async(train.run, tasks, retries=2))

    # wait for tasks to complete
    processed = [r.get() for r in results]

    logging.info('checking output files...')
    done = find_done(simsdir)
    remaining = len(tasks) - len(done)

    if remaining == 0:
        logging.info('SUCCESS: all tasks completed')
        logging.info('concatenating {0} partial output files...'.format(len(done)))
        paths = [os.path.join(simsdir, 'sims.{0}-{1}.tsv'.format(start, end)) for start, end in done]
        cmd = ['cat'] + paths
        simsfile = get_simsfile(trainfile, outdir)
        subprocess.check_call(cmd, stdout=open(simsfile, 'w'))
        logging.info('removing partial output files...')
        rmtree(simsdir)
        model.load_similarity_matrix(simsfile, num_items)
        modelfile = get_modelfile(trainfile, outdir)
        save_recommender(model, modelfile)
        logging.info('done')
    else:
        logging.error('FAILED: {0}/{1} tasks did not complete successfully'.format(remaining, len(tasks)))
        logging.error('try rerunning the command to retry the remaining tasks')

def run_mrec(d=10, num_iters=4, reg=0.02):
    # d is the dimension of the latent subspace, i.e. the number of groups
    import time
    from mrec import load_sparse_matrix, save_recommender
    from mrec.mf.wrmf import WRMFRecommender

    alpha = 1.0
    start = time.time()
    file_format = "csv"  # file should be csv, with: row,col,data (data may just be ones)
    filepath = PARS['data_dir'] + "/reduced_row_col_num_cutoff_1.5.csv"
    #filepath = PARS['data_dir']+"test_10_mill.csv"
    outfile = make_mrec_outfile(filepath, d, num_iters, reg)
    print outfile
    print 'reading file: %s' % filepath

    # load training set as scipy sparse matrix
    print "loading file"
    train = load_sparse_matrix(file_format, filepath)
    print "loaded file"
    print time.time() - start, "seconds"
    print "size:", train.shape

    print "creating recommender"
    model = WRMFRecommender(d=d, num_iters=num_iters, alpha=alpha, lbda=reg)
    print "training on data"
    print time.time() - start
    model.fit(train)
    print "done training"
    print time.time() - start

    print "saving model"
    save_recommender(model, outfile)
    print "wrote model to: %s" % outfile
    print time.time() - start
    return

    # NOTE: the validation below is unreachable because of the return above
    print "validating"
    data, U, V = read_mrec(mrec_file=outfile)
    plot_file = outfile.replace('.npz', '.png')
    multi_thresh(data, model, thresh_list=None, plot_file=plot_file)
    run_time = (time.time() - start) / 60.0
    print "runtime: %0.3f minutes" % run_time
    print 'done'

def run(self, view, model, input_format, trainfile, num_engines, workdir, overwrite, modelfile):
    logging.info('creating models directory {0}...'.format(workdir))
    subprocess.check_call(['mkdir', '-p', workdir])

    done = []
    if not overwrite:
        logging.info('checking for existing output models...')
        done.extend(self.find_done(workdir))
        if done:
            logging.info('found {0} output files'.format(len(done)))

    logging.info('creating tasks...')
    tasks = self.create_tasks(model, input_format, trainfile, workdir, num_engines, done)

    if tasks:
        logging.info('running in parallel across ipython engines...')
        async_job = view.map_async(process, tasks, retries=2)
        # wait for tasks to complete
        results = async_job.get()
        logging.info('checking output files...')
        done = self.find_done(workdir)
        remaining = len(tasks) - len(done)
    else:
        remaining = 0

    if remaining == 0:
        logging.info('SUCCESS: all tasks completed')
        logging.info('averaging {0} models...'.format(len(done)))
        for ix in sorted(done):
            # average two models at a time to limit memory usage
            partial_model = load_recommender(self.get_modelfile(ix, workdir))
            if ix == 0:
                model = partial_model
            else:
                # incremental mean: after folding in model ix this equals the
                # plain average of the first ix+1 partial factor matrices
                model.U = (ix * model.U + partial_model.U) / float(ix + 1)
                model.V = (ix * model.V + partial_model.V) / float(ix + 1)
        save_recommender(model, modelfile)
        logging.info('removing partial output files...')
        rmtree(workdir)
        logging.info('done')
    else:
        logging.error('FAILED: {0}/{1} tasks did not complete successfully'.format(remaining, len(tasks)))
        logging.error('try rerunning the command to retry the remaining tasks')

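# A small self-contained check (illustrative, not part of the runner above)
# that the incremental update U <- (ix*U + U_ix)/(ix+1) reproduces the plain
# mean of all the partial factor matrices:
def _check_incremental_average():
    import numpy as np
    parts = [np.random.rand(4, 3) for _ in range(5)]
    avg = parts[0]
    for ix in range(1, len(parts)):
        avg = (ix * avg + parts[ix]) / float(ix + 1)
    assert np.allclose(avg, np.mean(parts, axis=0))
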
def main():
    import os
    import logging
    import glob
    import subprocess
    from optparse import OptionParser
    from IPython.parallel import Client

    from mrec import load_fast_sparse_matrix, save_recommender
    from mrec.item_similarity.slim import SLIM
    from mrec.item_similarity.knn import (CosineKNNRecommender,
                                          DotProductKNNRecommender,
                                          AdjustedCosineKNNRecommender,
                                          JaccardKNNRecommender)
    from mrec.mf.wrmf import WRMFRecommender
    from mrec.mf.warp import WARPMFRecommender
    from mrec.mf.warp2 import WARP2MFRecommender
    from mrec.popularity import ItemPopularityRecommender
    from mrec.parallel.item_similarity import ItemSimilarityRunner
    from mrec.parallel.wrmf import WRMFRunner
    from mrec.parallel.warp import WARPMFRunner
    # shared filename helpers used below
    from mrec.examples.filename_conventions import (get_modelfile, get_factorsdir,
                                                    get_modelsdir, get_simsdir,
                                                    get_simsfile)

    logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(levelname)s: %(message)s')

    parser = OptionParser()
    parser.add_option('-n', '--num_engines', dest='num_engines', type='int', default=0, help='number of IPython engines to use')
    parser.add_option('--input_format', dest='input_format', help='format of training dataset(s) tsv | csv | mm (matrixmarket) | fsm (fast_sparse_matrix)')
    parser.add_option('--train', dest='train', help='glob specifying path(s) to training dataset(s) IMPORTANT: must be in quotes if it includes the * wildcard')
    parser.add_option('--outdir', dest='outdir', help='directory for output files')
    parser.add_option('--overwrite', dest='overwrite', action='store_true', help='overwrite existing files in outdir')
    parser.add_option('--model', dest='model', default='slim', help='type of model to train: slim | knn | wrmf | warp | popularity (default: %default)')
    parser.add_option('--max_sims', dest='max_sims', type='int', default=100, help='max similar items to output for each training item (default: %default)')
    parser.add_option('--learner', dest='learner', default='sgd', help='underlying learner for SLIM: sgd | elasticnet | fs_sgd (default: %default)')
    parser.add_option('--l1_reg', dest='l1_reg', type='float', default=0.001, help='l1 regularization constant (default: %default)')
    parser.add_option('--l2_reg', dest='l2_reg', type='float', default=0.0001, help='l2 regularization constant (default: %default)')
    parser.add_option('--metric', dest='metric', default='cosine', help='metric for knn recommender: cosine | dot | adjusted_cosine | jaccard (default: %default)')
    parser.add_option('--num_factors', dest='num_factors', type='int', default=80, help='number of latent factors (default: %default)')
    parser.add_option('--alpha', dest='alpha', type='float', default=1.0, help='wrmf confidence constant (default: %default)')
    parser.add_option('--lbda', dest='lbda', type='float', default=0.015, help='wrmf regularization constant (default: %default)')
    parser.add_option('--als_iters', dest='als_iters', type='int', default=15, help='number of als iterations (default: %default)')
    parser.add_option('--gamma', dest='gamma', type='float', default=0.01, help='warp learning rate (default: %default)')
    parser.add_option('--C', dest='C', type='float', default=100.0, help='warp regularization constant (default: %default)')
    parser.add_option('--item_feature_format', dest='item_feature_format', help='format of item features tsv | csv | mm (matrixmarket) | npz (numpy arrays)')
    parser.add_option('--item_features', dest='item_features', help='path to sparse item features in tsv format (item_id,feature_id,val)')
    parser.add_option('--popularity_method', dest='popularity_method', default='count', help='how to compute popularity for baseline recommender: count | sum | avg | thresh (default: %default)')
    parser.add_option('--popularity_thresh', dest='popularity_thresh', type='float', default=0, help='ignore scores below this when computing popularity for baseline recommender (default: %default)')
    parser.add_option('--packer', dest='packer', default='json', help='packer for IPython.parallel (default: %default)')
    parser.add_option('--add_module_paths', dest='add_module_paths', help='optional comma-separated list of paths to append to pythonpath (useful if you need to import uninstalled modules to IPython engines on a cluster)')

    (opts, args) = parser.parse_args()
    if not opts.input_format or not opts.train or not opts.outdir or not opts.num_engines:
        parser.print_help()
        raise SystemExit

    opts.train = os.path.abspath(os.path.expanduser(opts.train))
    opts.outdir = os.path.abspath(os.path.expanduser(opts.outdir))

    trainfiles = glob.glob(opts.train)

    if opts.model == 'popularity':
        # special case, don't need to run in parallel
        subprocess.check_call(['mkdir', '-p', opts.outdir])
        for trainfile in trainfiles:
            logging.info('processing {0}...'.format(trainfile))
            model = ItemPopularityRecommender(method=opts.popularity_method, thresh=opts.popularity_thresh)
            dataset = load_fast_sparse_matrix(opts.input_format, trainfile)
            model.fit(dataset)
            modelfile = get_modelfile(trainfile, opts.outdir)
            save_recommender(model, modelfile)
            logging.info('done')
        return

    # create an ipython client
    c = Client(packer=opts.packer)
    view = c.load_balanced_view()

    if opts.add_module_paths:
        c[:].execute('import sys')
        for path in opts.add_module_paths.split(','):
            logging.info('adding {0} to pythonpath on all engines'.format(path))
            c[:].execute("sys.path.append('{0}')".format(path))

    if opts.model == 'slim':
        if opts.learner == 'fs_sgd':
            num_selected_features = 2 * opts.max_sims  # preselect this many candidate similar items
            model = SLIM(l1_reg=opts.l1_reg, l2_reg=opts.l2_reg, model=opts.learner, num_selected_features=num_selected_features)
        else:
            model = SLIM(l1_reg=opts.l1_reg, l2_reg=opts.l2_reg, model=opts.learner)
    elif opts.model == 'knn':
        if opts.metric == 'cosine':
            model = CosineKNNRecommender(k=opts.max_sims)
        elif opts.metric == 'dot':
            model = DotProductKNNRecommender(k=opts.max_sims)
        elif opts.metric == 'adjusted_cosine':
            model = AdjustedCosineKNNRecommender(k=opts.max_sims)
        elif opts.metric == 'jaccard':
            model = JaccardKNNRecommender(k=opts.max_sims)
        else:
            parser.print_help()
            raise SystemExit('unknown metric: {0}'.format(opts.metric))
    elif opts.model == 'wrmf':
        model = WRMFRecommender(d=opts.num_factors, alpha=opts.alpha, lbda=opts.lbda, num_iters=opts.als_iters)
    elif opts.model == 'warp':
        num_factors_per_engine = max(opts.num_factors / opts.num_engines, 1)
        if opts.item_features:
            model = WARP2MFRecommender(d=num_factors_per_engine, gamma=opts.gamma, C=opts.C)
        else:
            model = WARPMFRecommender(d=num_factors_per_engine, gamma=opts.gamma, C=opts.C)
    else:
        parser.print_help()
        raise SystemExit('unknown model type: {0}'.format(opts.model))

    for trainfile in trainfiles:
        logging.info('processing {0}...'.format(trainfile))
        modelfile = get_modelfile(trainfile, opts.outdir)
        if opts.model == 'wrmf':
            runner = WRMFRunner()
            factorsdir = get_factorsdir(trainfile, opts.outdir)
            runner.run(view, model, opts.input_format, trainfile, opts.num_engines, factorsdir, modelfile)
        elif opts.model == 'warp':
            runner = WARPMFRunner()
            modelsdir = get_modelsdir(trainfile, opts.outdir)
            runner.run(view, model, opts.input_format, trainfile, opts.item_feature_format, opts.item_features, opts.num_engines, modelsdir, opts.overwrite, modelfile)
        else:
            runner = ItemSimilarityRunner()
            simsdir = get_simsdir(trainfile, opts.outdir)
            simsfile = get_simsfile(trainfile, opts.outdir)
            runner.run(view, model, opts.input_format, trainfile, opts.num_engines, simsdir, opts.overwrite, opts.max_sims, simsfile, modelfile)

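# Example invocation of the CLI above (an assumption: if this main() is mrec's
# examples/train.py it is installed as the mrec_train console script; paths,
# engine count and model choice here are illustrative only):
#
#   mrec_train -n4 --input_format tsv --train "data/train.*" \
#       --outdir models --model knn --metric cosine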