Example #1
def main():
    import sys
    from mrec import load_sparse_matrix, save_recommender
    from mrec.sparse import fast_sparse_matrix
    from mrec.item_similarity.knn import CosineKNNRecommender
    from mrec.mf.warp import WARPMFRecommender
    from mrec.reranking_recommender import RerankingRecommender

    file_format = sys.argv[1]
    filepath = sys.argv[2]
    outfile = sys.argv[3]

    # load training set as scipy sparse matrix
    train = load_sparse_matrix(file_format, filepath)

    item_sim_model = CosineKNNRecommender(k=100)
    mf_model = WARPMFRecommender(d=80,
                                 gamma=0.01,
                                 C=100.0,
                                 max_iters=25000,
                                 validation_iters=1000,
                                 batch_size=10)
    recommender = RerankingRecommender(item_sim_model,
                                       mf_model,
                                       num_candidates=100)

    recommender.fit(train)

    save_recommender(recommender, outfile)
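
Once saved, the model can be loaded back and queried. A minimal usage sketch, assuming mrec's usual load_recommender/recommend_items API (the paths and user id here are illustrative):

from mrec import load_sparse_matrix, load_recommender

train = load_sparse_matrix('tsv', 'train.tsv')       # illustrative path
recommender = load_recommender('recommender.npz')    # illustrative path
# top 20 items for user 0, returned with scores
recs = recommender.recommend_items(train, 0, max_items=20, return_scores=True)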
Example #2
def process(task):
    """
    Training task to run on an ipython engine.
    """

    # import modules required by engine
    import os
    import subprocess
    from mrec import load_sparse_matrix, save_recommender

    model, input_format, trainfile, feature_format, featurefile, outfile, offset, step = task

    dataset = load_sparse_matrix(input_format, trainfile)
    if featurefile is not None:
        # currently runs much faster if features are loaded as a dense matrix
        item_features = load_sparse_matrix(feature_format,
                                           featurefile).toarray()
        # strip features for any trailing items that don't appear in training set
        num_items = dataset.shape[1]
        item_features = item_features[:num_items, :]
        model.fit(dataset, item_features=item_features)
    else:
        model.fit(dataset)
    save_recommender(model, outfile)

    # record success
    cmd = ['touch', '{0}.SUCCESS'.format(outfile)]
    subprocess.check_call(cmd)

    # return the offset for the samples that we've learned from
    return offset
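
For debugging, a task tuple can be built by hand and run locally; the field order mirrors the unpacking above. A sketch with illustrative values (the runner normally assigns outfile, offset and step so that engines cover disjoint shares):

from mrec.mf.warp import WARPMFRecommender

model = WARPMFRecommender(d=20, gamma=0.01, C=100.0)
task = (model,           # recommender to train
        'tsv',           # input_format
        'train.tsv',     # trainfile (illustrative path)
        None,            # feature_format: no item features
        None,            # featurefile
        'model.0.npz',   # outfile for this partial model
        0,               # offset of this engine's share of the samples
        4)               # step, i.e. the number of engines
print(process(task))     # trains locally and returns the offset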
Example #3
    def run(self, view, model, input_format, trainfile, num_engines, workdir,
            modelfile):
        logging.info('creating factors directory {0}'.format(workdir))
        subprocess.check_call(['mkdir', '-p', workdir])

        logging.info('getting data size')
        data = load_sparse_matrix(input_format, trainfile)
        num_users, num_items = data.shape
        del data

        for it in xrange(model.num_iters):
            logging.info('iteration {0}'.format(it))
            tasks = self.create_tasks(num_users, num_engines, model,
                                      input_format, trainfile, workdir, 'U',
                                      get_user_indices, get_item_factor_files,
                                      init_item_factors)
            self.run_tasks(view, tasks)
            tasks = self.create_tasks(
                num_items, num_engines, model, input_format, trainfile,
                workdir, 'V', get_item_indices, get_user_factor_files,
                None)  # won't need to initialize user factors
            self.run_tasks(view, tasks)

        model.U = np.vstack(
            [np.load(f) for f in get_user_factor_files(workdir)])
        model.V = np.vstack(
            [np.load(f) for f in get_item_factor_files(workdir)])

        save_recommender(model, modelfile)

        logging.info('removing partial output files')
        rmtree(workdir)
        logging.info('done')
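
Each half of the loop above holds one factor matrix fixed while the engines update disjoint row ranges of the other, ALS-style. A generic sketch of one user half-step (not mrec's exact WRMF update, which adds per-observation confidence weights):

import numpy as np

def update_user_factors(r_u, V, lbda):
    # r_u: one user's preference row (n_items,); V: item factors (n_items, d)
    # with V fixed, the user's factors solve a ridge regression
    A = V.T.dot(V) + lbda * np.eye(V.shape[1])
    b = V.T.dot(r_u)
    return np.linalg.solve(A, b)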
Example #5
def process(task):
    """
    Training task to run on an ipython engine.
    """

    # import modules required by engine
    import os
    import subprocess
    from mrec import load_sparse_matrix, save_recommender
    from mrec.mf.warp import ShuffleSampler

    model,input_format,trainfile,outfile,offset,step = task

    # TODO: configure this!!!
    positive_thresh = 1

    dataset = load_sparse_matrix(input_format,trainfile)
    # TODO: models don't seem to converge, investigate....
    #sampler = ShuffleSampler(dataset,positive_thresh,42,offset,step)
    sampler = ShuffleSampler(dataset,positive_thresh,42)

    model.fit(dataset,sampler)
    save_recommender(model,outfile)

    # record success
    cmd = ['touch','{0}.SUCCESS'.format(outfile)]
    subprocess.check_call(cmd)

    # return the offset for the samples that we've learned from
    return offset
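
The commented-out call suggests offset and step were intended to shard the shuffled samples round-robin across engines. Purely illustrative (an assumption based on that call, not mrec's internals):

import numpy as np

sample_indices = np.arange(10)   # stand-in for shuffled training sample indices
step = 3                         # e.g. the number of engines
shards = [sample_indices[offset::step] for offset in range(step)]
# -> [array([0, 3, 6, 9]), array([1, 4, 7]), array([2, 5, 8])]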
Example #6
def main():
    import logging
    import subprocess
    from optparse import OptionParser
    import numpy as np
    from scipy.io import mmread

    from mrec import save_recommender
    from mrec.mf.recommender import MatrixFactorizationRecommender
    from mrec.examples.filename_conventions import get_modelfile

    logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(levelname)s: %(message)s')

    parser = OptionParser()
    parser.add_option('--factor_format', dest='factor_format',
                      help='format of factor files tsv | mm (matrixmarket) | npy (numpy array)')
    parser.add_option('--user_factors', dest='user_factors', help='user factors filepath')
    parser.add_option('--item_factors', dest='item_factors', help='item factors filepath')
    parser.add_option('--train', dest='train',
                      help='filepath to training data, just used to apply naming convention to output model saved here')
    parser.add_option('--outdir', dest='outdir', help='directory for output')
    parser.add_option('--description', dest='description',
                      help='optional description of how factors were computed, will be saved with model so it can be output with evaluation results')

    (opts, args) = parser.parse_args()
    if not opts.factor_format or not opts.user_factors or not opts.item_factors \
            or not opts.outdir:
        parser.print_help()
        raise SystemExit

    model = MatrixFactorizationRecommender()

    logging.info('loading factors...')

    if opts.factor_format == 'npy':
        model.U = np.load(opts.user_factors)
        model.V = np.load(opts.item_factors)
    elif opts.factor_format == 'mm':
        model.U = mmread(opts.user_factors)
        model.V = mmread(opts.item_factors)
    elif opts.factor_format == 'tsv':
        model.U = np.loadtxt(opts.user_factors)
        model.V = np.loadtxt(opts.item_factors)
    else:
        raise ValueError('unknown factor format: {0}'.format(opts.factor_format))

    if opts.description:
        model.description = opts.description

    logging.info('saving model...')

    logging.info('creating output directory {0}...'.format(opts.outdir))
    subprocess.check_call(['mkdir', '-p', opts.outdir])

    modelfile = get_modelfile(opts.train, opts.outdir)
    save_recommender(model, modelfile)

    logging.info('done')
Example #8
    def run(self, view, model, input_format, trainfile, feature_format,
            featurefile, num_engines, workdir, overwrite, modelfile):

        logging.info('creating models directory {0}...'.format(workdir))
        subprocess.check_call(['mkdir', '-p', workdir])

        done = []
        if not overwrite:
            logging.info('checking for existing output models...')
            done.extend(self.find_done(workdir))
            if done:
                logging.info('found {0} output files'.format(len(done)))

        logging.info('creating tasks...')
        tasks = self.create_tasks(model, input_format, trainfile,
                                  feature_format, featurefile, workdir,
                                  num_engines, done)

        if tasks:
            logging.info('running in parallel across ipython engines...')
            async_job = view.map_async(process, tasks, retries=2)

            # wait for tasks to complete
            results = async_job.get()

            logging.info('checking output files...')
            done = self.find_done(workdir)
            remaining = len(tasks) - len(done)
        else:
            remaining = 0

        if remaining == 0:
            logging.info('SUCCESS: all tasks completed')
            logging.info('concatenating {0} models...'.format(len(done)))
            for ix in sorted(done):
                partial_model = load_recommender(
                    self.get_modelfile(ix, workdir))
                if ix == 0:
                    model = partial_model
                else:
                    # concatenate factors
                    model.d += partial_model.d
                    model.U = np.hstack((model.U, partial_model.U))
                    model.V = np.hstack((model.V, partial_model.V))
                    if hasattr(model, 'W'):
                        model.W = np.hstack((model.W, partial_model.W))
            save_recommender(model, modelfile)
            logging.info('removing partial output files...')
            rmtree(workdir)
            logging.info('done')
        else:
            logging.error(
                'FAILED: {0}/{1} tasks did not complete successfully'.format(
                    remaining, len(tasks)))
            logging.error(
                'try rerunning the command to retry the remaining tasks')
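
Concatenating factors works because the score matrix is U.dot(V.T): stacking per-engine factor blocks side by side simply sums the partial models' scores. A quick numpy check:

import numpy as np

rng = np.random.RandomState(0)
U1, V1 = rng.rand(5, 8), rng.rand(7, 8)   # partial model 1 (d=8)
U2, V2 = rng.rand(5, 8), rng.rand(7, 8)   # partial model 2 (d=8)

U = np.hstack((U1, U2))   # combined user factors (d=16)
V = np.hstack((V1, V2))   # combined item factors (d=16)

assert np.allclose(U.dot(V.T), U1.dot(V1.T) + U2.dot(V2.T))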
Example #9
    def run(self,view,model,input_format,trainfile,num_engines,simsdir,overwrite,max_sims,simsfile,modelfile):

        logging.info('finding number of items...')
        dataset = load_sparse_matrix(input_format,trainfile)
        num_users,num_items = dataset.shape
        del dataset
        logging.info('%d users and %d items', num_users, num_items)

        logging.info('creating sims directory {0}...'.format(simsdir))
        subprocess.check_call(['mkdir','-p',simsdir])

        done = []
        if not overwrite:
            logging.info('checking for existing output sims...')
            done.extend(self.find_done(simsdir))
            if done:
                logging.info('found {0} output files'.format(len(done)))

        logging.info('creating tasks...')
        tasks = self.create_tasks(model,input_format,trainfile,simsdir,num_items,num_engines,max_sims,done)

        if num_engines > 0:
            logging.info('running %d tasks in parallel across ipython'
                         ' engines...', len(tasks))
            async_job = view.map_async(process,tasks,retries=2)
            # wait for tasks to complete
            results = async_job.get()
        else:
            # run sequentially to make debugging easier
            logging.info('training similarity model sequentially')
            results = [process(task) for task in tasks]

        logging.info('checking output files...')
        done = self.find_done(simsdir)
        remaining = len(tasks) - len(done)
        if remaining == 0:
            logging.info('SUCCESS: all tasks completed')
            logging.info('concatenating {0} partial output files...'.format(len(done)))
            paths = [os.path.join(simsdir,'sims.{0}-{1}.tsv'.format(start,end)) for start,end in done]
            cmd = ['cat']+paths
            subprocess.check_call(cmd,stdout=open(simsfile,'w'))
            logging.info('removing partial output files...')
            rmtree(simsdir)
            logging.info('loading %d items in %s model from %s',
                         num_items, type(model).__name__, simsfile)
            model.load_similarity_matrix(simsfile,num_items)
            save_recommender(model,modelfile)
            logging.info('done')
        else:
            logging.error('FAILED: {0}/{1} tasks did not complete successfully'.format(remaining,len(tasks)))
            logging.error('try rerunning the command to retry the remaining tasks')
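
The concatenated simsfile is then handed to load_similarity_matrix. As a rough sketch of what that involves (this assumes tsv rows of item id, similar item id, score with 0-based ids; it is not mrec's implementation):

import numpy as np
from scipy.sparse import coo_matrix

def load_sims_tsv(simsfile, num_items):
    rows, cols, vals = [], [], []
    with open(simsfile) as f:
        for line in f:
            i, j, v = line.split('\t')
            rows.append(int(i))
            cols.append(int(j))
            vals.append(float(v))
    return coo_matrix((vals, (rows, cols)),
                      shape=(num_items, num_items)).tocsr()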
Example #10
def main():
    import sys
    from mrec import load_sparse_matrix, save_recommender
    from mrec.mf.warp import WARPMFRecommender

    file_format = sys.argv[1]
    filepath = sys.argv[2]
    outfile = sys.argv[3]

    # load training set as scipy sparse matrix
    train = load_sparse_matrix(file_format, filepath)

    model = WARPMFRecommender(d=100, gamma=0.01, C=100.0, batch_size=10)
    model.fit(train)

    save_recommender(model, outfile)
Example #11
def main(file_format, filepath, feature_format, feature_file, outfile):
    from mrec import load_sparse_matrix, save_recommender
    from mrec.mf.warp2 import WARP2MFRecommender

    # load training set
    train = load_sparse_matrix(file_format, filepath)
    # load item features, assume they are tsv: item_id,feature_id,val
    X = load_sparse_matrix(feature_format, feature_file).toarray()
    # strip features for any trailing items that don't appear in training set
    num_items = train.shape[1]
    X = X[:num_items, :]

    model = WARP2MFRecommender(d=100, gamma=0.01, C=100.0, batch_size=10)
    model.fit(train, X)

    save_recommender(model, outfile)
Example #13
def main():
    import sys
    from mrec import load_sparse_matrix, save_recommender
    from mrec.mf.climf import CLiMFRecommender

    file_format = sys.argv[1]
    filepath = sys.argv[2]
    outfile = sys.argv[3]

    # load training set as scipy sparse matrix
    train = load_sparse_matrix(file_format,filepath)

    model = CLiMFRecommender(d=5)
    model.fit(train)

    save_recommender(model,outfile)
Example #16
File: train.py Project: BloodD/mrec
def process(view,opts,model,trainfile,outdir):

    logging.info('finding number of items...')
    dataset = load_fast_sparse_matrix(opts.input_format,trainfile)
    num_users,num_items = dataset.shape
    del dataset

    simsdir = get_simsdir(trainfile,outdir)
    logging.info('creating sims directory {0}...'.format(simsdir))
    subprocess.check_call(['mkdir','-p',simsdir])

    done = []
    if not opts.overwrite:
        logging.info('checking for existing output sims...')
        done.extend(find_done(simsdir))
        if done:
            logging.info('found {0} output files'.format(len(done)))

    logging.info('creating tasks...')
    tasks = create_tasks(model,opts.input_format,trainfile,simsdir,num_items,opts.num_engines,opts.max_sims,done)

    logging.info('running in parallel across ipython engines...')
    results = []
    results.append(view.map_async(train.run,tasks,retries=2))

    # wait for tasks to complete
    processed = [r.get() for r in results]

    logging.info('checking output files...')
    done = find_done(simsdir)
    remaining = len(tasks) - len(done)
    if remaining == 0:
        logging.info('SUCCESS: all tasks completed')
        logging.info('concatenating {0} partial output files...'.format(len(done)))
        paths = [os.path.join(simsdir,'sims.{0}-{1}.tsv'.format(start,end)) for start,end in done]
        cmd = ['cat']+paths
        simsfile = get_simsfile(trainfile,outdir)
        subprocess.check_call(cmd,stdout=open(simsfile,'w'))
        logging.info('removing partial output files...')
        rmtree(simsdir)
        model.load_similarity_matrix(simsfile,num_items)
        modelfile = get_modelfile(trainfile,outdir)
        save_recommender(model,modelfile)
        logging.info('done')
    else:
        logging.error('FAILED: {0}/{1} tasks did not complete successfully'.format(remaining,len(tasks)))
        logging.error('try rerunning the command to retry the remaining tasks')
Example #17
def run_mrec(d=10,num_iters=4,reg=0.02):
    #d is dimension of subspace, i.e. groups
    import sys
    from mrec import load_sparse_matrix, save_recommender
    from mrec.sparse import fast_sparse_matrix
    from mrec.mf.wrmf import WRMFRecommender

    alpha=1.0
    start=time.time()

    file_format = "csv"
    #file should be csv, with: row,col,data
    #data may just be ones
    filepath = PARS['data_dir']+"/reduced_row_col_num_cutoff_1.5.csv"
    #filepath = PARS['data_dir']+"test_10_mill.csv" 
    outfile = make_mrec_outfile(filepath,d,num_iters,reg)
    print outfile
    print 'reading file: %s'%filepath
    # load training set as scipy sparse matrix
    print "loading file"
    train = load_sparse_matrix(file_format,filepath)
    print "loaded file"
    print (time.time()-start),"seconds"
    print "size:",train.shape

    print "creating recommender"
    model = WRMFRecommender(d=d,num_iters=num_iters,alpha=alpha,lbda=reg)
    print "training on data"
    print time.time()-start
    model.fit(train)
    print "done training"
    print time.time()-start
    print "saving model"
    save_recommender(model,outfile)
    print "wrote model to: %s"%outfile
    print time.time()-start

    return
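    # NOTE: the early return above means the validation code below never runs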

    print "validating"
    data,U,V=read_mrec(mrec_file=outfile)
    plot_file=outfile.replace('.npz','.png')
    multi_thresh(data,model,thresh_list=None,plot_file=plot_file)
    run_time=(time.time()-start)/60.0
    print "runtime: %0.3f minutes"%run_time
    print 'done'
Example #19
    def run(self,view,model,input_format,trainfile,num_engines,workdir,overwrite,modelfile):

        logging.info('creating models directory {0}...'.format(workdir))
        subprocess.check_call(['mkdir','-p',workdir])

        done = []
        if not overwrite:
            logging.info('checking for existing output models...')
            done.extend(self.find_done(workdir))
            if done:
                logging.info('found {0} output files'.format(len(done)))

        logging.info('creating tasks...')
        tasks = self.create_tasks(model,input_format,trainfile,workdir,num_engines,done)

        if tasks:
            logging.info('running in parallel across ipython engines...')
            async_job = view.map_async(process,tasks,retries=2)

            # wait for tasks to complete
            results = async_job.get()

            logging.info('checking output files...')
            done = self.find_done(workdir)
            remaining = len(tasks) - len(done)
        else:
            remaining = 0

        if remaining == 0:
            logging.info('SUCCESS: all tasks completed')
            logging.info('averaging {0} models...'.format(len(done)))
            for ix in sorted(done):
                # average two models at a time to limit memory usage
                partial_model = load_recommender(self.get_modelfile(ix,workdir))
                if ix == 0:
                    model = partial_model
                else:
                    model.U = (ix*model.U + partial_model.U)/float(ix+1)
                    model.V = (ix*model.V + partial_model.V)/float(ix+1)
            save_recommender(model,modelfile)
            logging.info('removing partial output files...')
            rmtree(workdir)
            logging.info('done')
        else:
            logging.error('FAILED: {0}/{1} tasks did not complete successfully'.format(remaining,len(tasks)))
            logging.error('try rerunning the command to retry the remaining tasks')
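
The incremental update above folds in the ix-th partial model with weight 1/(ix+1), which is exactly a running mean of the partial factors. A quick numpy check:

import numpy as np

rng = np.random.RandomState(0)
partials = [rng.rand(4, 3) for _ in range(5)]

U = partials[0]
for ix, P in enumerate(partials[1:], start=1):
    U = (ix * U + P) / float(ix + 1)

assert np.allclose(U, np.mean(partials, axis=0))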
Example #22
File: train.py Project: adw/mrec
def main():

    import os
    import logging
    import glob
    import subprocess
    from optparse import OptionParser
    from IPython.parallel import Client

    from mrec import load_fast_sparse_matrix, save_recommender
    from mrec.item_similarity.slim import SLIM
    from mrec.item_similarity.knn import (CosineKNNRecommender, DotProductKNNRecommender,
                                          AdjustedCosineKNNRecommender, JaccardKNNRecommender)
    from mrec.mf.wrmf import WRMFRecommender
    from mrec.mf.warp import WARPMFRecommender
    from mrec.mf.warp2 import WARP2MFRecommender
    from mrec.popularity import ItemPopularityRecommender
    from mrec.parallel.item_similarity import ItemSimilarityRunner
    from mrec.parallel.wrmf import WRMFRunner
    from mrec.parallel.warp import WARPMFRunner
    # filename helpers used below; module path assumed as in Example #6
    from mrec.examples.filename_conventions import get_modelfile, get_modelsdir, get_factorsdir, get_simsdir, get_simsfile

    logging.basicConfig(level=logging.INFO,format='[%(asctime)s] %(levelname)s: %(message)s')

    parser = OptionParser()
    parser.add_option('-n','--num_engines',dest='num_engines',type='int',default=0,help='number of IPython engines to use')
    parser.add_option('--input_format',dest='input_format',help='format of training dataset(s) tsv | csv | mm (matrixmarket) | fsm (fast_sparse_matrix)')
    parser.add_option('--train',dest='train',help='glob specifying path(s) to training dataset(s) IMPORTANT: must be in quotes if it includes the * wildcard')
    parser.add_option('--outdir',dest='outdir',help='directory for output files')
    parser.add_option('--overwrite',dest='overwrite',action='store_true',help='overwrite existing files in outdir')
    parser.add_option('--model',dest='model',default='slim',help='type of model to train: slim | knn | wrmf | warp | popularity (default: %default)')
    parser.add_option('--max_sims',dest='max_sims',type='int',default=100,help='max similar items to output for each training item (default: %default)')
    parser.add_option('--learner',dest='learner',default='sgd',help='underlying learner for SLIM learner: sgd | elasticnet | fs_sgd (default: %default)')
    parser.add_option('--l1_reg',dest='l1_reg',type='float',default=0.001,help='l1 regularization constant (default: %default)')
    parser.add_option('--l2_reg',dest='l2_reg',type='float',default=0.0001,help='l2 regularization constant (default: %default)')
    parser.add_option('--metric',dest='metric',default='cosine',help='metric for knn recommender: cosine | dot | adjusted_cosine | jaccard (default: %default)')
    parser.add_option('--num_factors',dest='num_factors',type='int',default=80,help='number of latent factors (default: %default)')
    parser.add_option('--alpha',dest='alpha',type='float',default=1.0,help='wrmf confidence constant (default: %default)')
    parser.add_option('--lbda',dest='lbda',type='float',default=0.015,help='wrmf regularization constant (default: %default)')
    parser.add_option('--als_iters',dest='als_iters',type='int',default=15,help='number of als iterations (default: %default)')
    parser.add_option('--gamma',dest='gamma',type='float',default=0.01,help='warp learning rate (default: %default)')
    parser.add_option('--C',dest='C',type='float',default=100.0,help='warp regularization constant (default: %default)')
    parser.add_option('--item_feature_format',dest='item_feature_format',help='format of item features tsv | csv | mm (matrixmarket) | npz (numpy arrays)')
    parser.add_option('--item_features',dest='item_features',help='path to sparse item features in tsv format (item_id,feature_id,val)')
    parser.add_option('--popularity_method',dest='popularity_method',default='count',help='how to compute popularity for baseline recommender: count | sum | avg | thresh (default: %default)')
    parser.add_option('--popularity_thresh',dest='popularity_thresh',type='float',default=0,help='ignore scores below this when computing popularity for baseline recommender (default: %default)')
    parser.add_option('--packer',dest='packer',default='json',help='packer for IPython.parallel (default: %default)')
    parser.add_option('--add_module_paths',dest='add_module_paths',help='optional comma-separated list of paths to append to pythonpath (useful if you need to import uninstalled modules to IPython engines on a cluster)')

    (opts,args) = parser.parse_args()
    if not opts.input_format or not opts.train or not opts.outdir or not opts.num_engines:
        parser.print_help()
        raise SystemExit

    opts.train = os.path.abspath(os.path.expanduser(opts.train))
    opts.outdir = os.path.abspath(os.path.expanduser(opts.outdir))

    trainfiles = glob.glob(opts.train)

    if opts.model == 'popularity':
        # special case, don't need to run in parallel
        subprocess.check_call(['mkdir','-p',opts.outdir])
        for trainfile in trainfiles:
            logging.info('processing {0}...'.format(trainfile))
            model = ItemPopularityRecommender(method=opts.popularity_method,thresh=opts.popularity_thresh)
            dataset = load_fast_sparse_matrix(opts.input_format,trainfile)
            model.fit(dataset)
            modelfile = get_modelfile(trainfile,opts.outdir)
            save_recommender(model,modelfile)
        logging.info('done')
        return

    # create an ipython client
    c = Client(packer=opts.packer)
    view = c.load_balanced_view()

    if opts.add_module_paths:
        c[:].execute('import sys')
        for path in opts.add_module_paths.split(','):
            logging.info('adding {0} to pythonpath on all engines'.format(path))
            c[:].execute("sys.path.append('{0}')".format(path))

    if opts.model == 'slim':
        if opts.learner == 'fs_sgd':
            num_selected_features = 2 * opts.max_sims  # preselect this many candidate similar items
            model = SLIM(l1_reg=opts.l1_reg,l2_reg=opts.l2_reg,model=opts.learner,num_selected_features=num_selected_features)
        else:
            model = SLIM(l1_reg=opts.l1_reg,l2_reg=opts.l2_reg,model=opts.learner)
    elif opts.model == 'knn':
        if opts.metric == 'cosine':
            model = CosineKNNRecommender(k=opts.max_sims)
        elif opts.metric == 'dot':
            model = DotProductKNNRecommender(k=opts.max_sims)
        elif opts.metric == 'adjusted_cosine':
            model = AdjustedCosineKNNRecommender(k=opts.max_sims)
        elif opts.metric == 'jaccard':
            model = JaccardKNNRecommender(k=opts.max_sims)
        else:
            parser.print_help()
            raise SystemExit('unknown metric: {0}'.format(opts.metric))
    elif opts.model == 'wrmf':
        model = WRMFRecommender(d=opts.num_factors,alpha=opts.alpha,lbda=opts.lbda,num_iters=opts.als_iters)
    elif opts.model == 'warp':
        num_factors_per_engine = max(opts.num_factors//opts.num_engines,1)  # floor division: correct under Python 2 and 3
        if opts.item_features:
            model = WARP2MFRecommender(d=num_factors_per_engine,gamma=opts.gamma,C=opts.C)
        else:
            model = WARPMFRecommender(d=num_factors_per_engine,gamma=opts.gamma,C=opts.C)
    else:
        parser.print_help()
        raise SystemExit('unknown model type: {0}'.format(opts.model))

    for trainfile in trainfiles:
        logging.info('processing {0}...'.format(trainfile))
        modelfile = get_modelfile(trainfile,opts.outdir)
        if opts.model == 'wrmf':
            runner = WRMFRunner()
            factorsdir = get_factorsdir(trainfile,opts.outdir)
            runner.run(view,model,opts.input_format,trainfile,opts.num_engines,factorsdir,modelfile)
        elif opts.model == 'warp':
            runner = WARPMFRunner()
            modelsdir = get_modelsdir(trainfile,opts.outdir)
            runner.run(view,model,opts.input_format,trainfile,opts.item_feature_format,opts.item_features,opts.num_engines,modelsdir,opts.overwrite,modelfile)
        else:
            runner = ItemSimilarityRunner()
            simsdir = get_simsdir(trainfile,opts.outdir)
            simsfile = get_simsfile(trainfile,opts.outdir)
            runner.run(view,model,opts.input_format,trainfile,opts.num_engines,simsdir,opts.overwrite,opts.max_sims,simsfile,modelfile)
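
For the warp model the requested factor dimension is split evenly across the engines, and the partial models are concatenated afterwards (Example #8 shows the concatenation step). A quick check of the split arithmetic:

num_factors, num_engines = 80, 4
d_per_engine = max(num_factors // num_engines, 1)
assert d_per_engine * num_engines == num_factors   # 20 factors per engine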