Example #1
def process(task):
    """
    Training task to run on an ipython engine.
    """

    # import modules required by engine
    import os
    import subprocess
    from mrec import load_sparse_matrix, save_recommender

    model,input_format,trainfile,feature_format,featurefile,outfile,offset,step = task

    dataset = load_sparse_matrix(input_format,trainfile)
    if featurefile is not None:
        # currently runs much faster if features are loaded as a dense matrix
        item_features = load_sparse_matrix(feature_format,featurefile).toarray()
        # strip features for any trailing items that don't appear in training set
        num_items = dataset.shape[1]
        item_features = item_features[:num_items,:]
        model.fit(dataset,item_features=item_features)
    else:
        model.fit(dataset)
    save_recommender(model,outfile)

    # record success
    cmd = ['touch','{0}.SUCCESS'.format(outfile)]
    subprocess.check_call(cmd)

    # return the offset for the samples that we've learned from
    return offset
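
The task tuples unpacked by process() above are assembled by a driver and fanned out across the engines. A minimal sketch of such a driver, assuming an IPython parallel view and a hypothetical create_tasks helper (neither appears in the example itself):

# Hypothetical driver: one training task per engine, dispatched with
# view.map_async as in the parallel runners later on this page.
import os

def create_tasks(model, input_format, trainfile, feature_format,
                 featurefile, outdir, num_engines):
    tasks = []
    for offset in xrange(num_engines):
        outfile = os.path.join(outdir, 'model.{0}.npz'.format(offset))
        # field order must match the tuple unpacked by process()
        tasks.append((model, input_format, trainfile, feature_format,
                      featurefile, outfile, offset, num_engines))
    return tasks

# async_job = view.map_async(process, tasks, retries=2)
# offsets = async_job.get()  # blocks until every engine has finished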
Example #2
def process(task):
    """
    Training task to run on an ipython engine.
    """

    # import modules required by engine
    import os
    import subprocess
    from mrec import load_sparse_matrix, save_recommender

    model, input_format, trainfile, feature_format, featurefile, outfile, offset, step = task

    dataset = load_sparse_matrix(input_format, trainfile)
    if featurefile is not None:
        # currently runs much faster if features are loaded as a dense matrix
        item_features = load_sparse_matrix(feature_format,
                                           featurefile).toarray()
        # strip features for any trailing items that don't appear in training set
        num_items = dataset.shape[1]
        item_features = item_features[:num_items, :]
        model.fit(dataset, item_features=item_features)
    else:
        model.fit(dataset)
    save_recommender(model, outfile)

    # record success
    cmd = ['touch', '{0}.SUCCESS'.format(outfile)]
    subprocess.check_call(cmd)

    # return the offset for the samples that we've learned from
    return offset
Example #3
def run(task):

    # import modules required by engine
    import os
    import subprocess
    import numpy as np
    from scipy.sparse import coo_matrix

    from mrec import load_sparse_matrix, load_recommender
    from mrec.evaluation import Evaluator

    modelfile, input_format, trainfile, test_input_format, testfile, feature_format, featurefile, outdir, start, end, evaluator, generate = task

    # initialise the model
    model = load_recommender(modelfile)

    outfile = os.path.join(outdir, 'recs.{0}-{1}.tsv'.format(start, end))

    if generate:
        # generate recommendations for our batch of users
        dataset = load_sparse_matrix(input_format, trainfile)
        out = open(outfile, 'w')
        if featurefile is not None:
            # currently runs much faster if features are loaded as a dense matrix
            item_features = load_sparse_matrix(feature_format,
                                               featurefile).toarray()
            # strip features for any trailing items that don't appear in training set
            num_items = dataset.shape[1]
            item_features = item_features[:num_items, :]
            recs = model.range_recommend_items(dataset,
                                               start,
                                               end,
                                               max_items=20,
                                               return_scores=True,
                                               item_features=item_features)
        else:
            recs = model.range_recommend_items(dataset,
                                               start,
                                               end,
                                               max_items=20,
                                               return_scores=True)
        for u, items in zip(xrange(start, end), recs):
            for i, w in items:
                print >> out, '{0}\t{1}\t{2}'.format(u + 1, i + 1,
                                                     w)  # write as 1-indexed
        out.close()

        # record success
        cmd = [
            'touch',
            os.path.join(outdir, '{0}-{1}.SUCCESS'.format(start, end))
        ]
        subprocess.check_call(cmd)

    # load the test data
    testdata = load_sparse_matrix(test_input_format, testfile).tocsr()

    # return evaluation metrics
    return evaluator.process(testdata, outfile, start, end)
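
The start and end fields select a contiguous batch of users, so the driver typically carves the user range into one batch per engine. A small sketch of that batching, where user_batches is a hypothetical helper:

# Hypothetical batching: split num_users into contiguous (start, end)
# ranges, one run() task per range.
def user_batches(num_users, num_engines):
    batch_size = num_users / num_engines + 1  # integer division in Python 2
    for start in xrange(0, num_users, batch_size):
        yield start, min(start + batch_size, num_users)

# tasks = [(modelfile, input_format, trainfile, test_input_format, testfile,
#           feature_format, featurefile, outdir, start, end, evaluator, True)
#          for start, end in user_batches(num_users, num_engines)]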
Example #4
def main(file_format, filepath, feature_format, feature_file, outfile):
    from mrec import load_sparse_matrix, save_recommender

    # load training set
    train = load_sparse_matrix(file_format, filepath)
    # load item features, assume they are tsv: item_id,feature_id,val
    X = load_sparse_matrix(feature_format, feature_file).toarray()
    # strip features for any trailing items that don't appear in training set
    num_items = train.shape[1]
    X = X[:num_items, :]

    model = WARP2MFRecommender(d=100, gamma=0.01, C=100.0, batch_size=10)
    model.fit(train, X)

    save_recommender(model, outfile)
Example #5
def main():
    from optparse import OptionParser

    from mrec import load_sparse_matrix, save_sparse_matrix

    parser = OptionParser()
    parser.add_option('--input_format',dest='input_format',help='format of input dataset tsv | csv | mm (matrixmarket) | csr (scipy.sparse.csr_matrix) | fsm (mrec.sparse.fast_sparse_matrix)')
    parser.add_option('--input',dest='input',help='filepath to input')
    parser.add_option('--output_format',dest='output_format',help='format of output dataset(s) tsv | csv | mm (matrixmarket) | csr (scipy.sparse.csr_matrix) | fsm (mrec.sparse.fast_sparse_matrix)')
    parser.add_option('--output',dest='output',help='filepath for output')

    (opts,args) = parser.parse_args()
    if not opts.input or not opts.output or not opts.input_format or not opts.output_format:
        parser.print_help()
        raise SystemExit

    if opts.output_format == opts.input_format:
        raise SystemExit('input and output format are the same, not doing anything')

    if opts.input_format == 'tsv' and opts.output_format == 'mm':
        # we can do this without loading the data
        tsv2mtx(opts.input,opts.output)
    else:
        data = load_sparse_matrix(opts.input_format,opts.input)
        save_sparse_matrix(data,opts.output_format,opts.output)
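
The same conversion can also be done programmatically; a short sketch using placeholder file names:

# Hypothetical direct use of the conversion functions, equivalent to
# running the script with --input_format tsv --output_format mm.
from mrec import load_sparse_matrix, save_sparse_matrix

data = load_sparse_matrix('tsv', 'ratings.tsv')  # placeholder input path
save_sparse_matrix(data, 'mm', 'ratings.mtx')    # placeholder output path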
Example #6
    def run(self, view, model, input_format, trainfile, num_engines, workdir,
            modelfile):
        logging.info('creating factors directory {0}'.format(workdir))
        subprocess.check_call(['mkdir', '-p', workdir])

        logging.info('getting data size')
        data = load_sparse_matrix(input_format, trainfile)
        num_users, num_items = data.shape
        del data

        for it in xrange(model.num_iters):
            logging.info('iteration {0}'.format(it))
            tasks = self.create_tasks(num_users, num_engines, model,
                                      input_format, trainfile, workdir, 'U',
                                      get_user_indices, get_item_factor_files,
                                      init_item_factors)
            self.run_tasks(view, tasks)
            tasks = self.create_tasks(
                num_items, num_engines, model, input_format, trainfile,
                workdir, 'V', get_item_indices, get_user_factor_files,
                None)  # won't need to initialize user factors
            self.run_tasks(view, tasks)

        model.U = np.vstack(
            [np.load(f) for f in get_user_factor_files(workdir)])
        model.V = np.vstack(
            [np.load(f) for f in get_item_factor_files(workdir)])

        save_recommender(model, modelfile)

        logging.info('removing partial output files')
        rmtree(workdir)
        logging.info('done')
Example #7
def process(task):
    """
    Training task to run on an ipython engine.
    """

    # import modules required by engine
    import os
    import subprocess
    from mrec import load_sparse_matrix, save_recommender
    from mrec.mf.warp import ShuffleSampler

    model,input_format,trainfile,outfile,offset,step = task

    # TODO: configure this!!!
    positive_thresh = 1

    dataset = load_sparse_matrix(input_format,trainfile)
    # TODO: models don't seem to converge, investigate....
    #sampler = ShuffleSampler(dataset,positive_thresh,42,offset,step)
    sampler = ShuffleSampler(dataset,positive_thresh,42)

    model.fit(dataset,sampler)
    save_recommender(model,outfile)

    # record success
    cmd = ['touch','{0}.SUCCESS'.format(outfile)]
    subprocess.check_call(cmd)

    # return the offset for the samples that we've learned from
    return offset
Example #8
def main():
    import sys
    from mrec import load_sparse_matrix, save_recommender
    from mrec.sparse import fast_sparse_matrix
    from mrec.item_similarity.knn import CosineKNNRecommender
    from mrec.mf.warp import WARPMFRecommender
    from mrec.reranking_recommender import RerankingRecommender

    file_format = sys.argv[1]
    filepath = sys.argv[2]
    outfile = sys.argv[3]

    # load training set as scipy sparse matrix
    train = load_sparse_matrix(file_format, filepath)

    item_sim_model = CosineKNNRecommender(k=100)
    mf_model = WARPMFRecommender(d=80,
                                 gamma=0.01,
                                 C=100.0,
                                 max_iters=25000,
                                 validation_iters=1000,
                                 batch_size=10)
    recommender = RerankingRecommender(item_sim_model,
                                       mf_model,
                                       num_candidates=100)

    recommender.fit(train)

    save_recommender(recommender, outfile)
Example #9
def main(file_format,filepath,feature_format,feature_file,outfile):
    from mrec import load_sparse_matrix, save_recommender
    from mrec.sparse import fast_sparse_matrix

    # load training set
    train = load_sparse_matrix(file_format,filepath)
    # load item features, assume they are tsv: item_id,feature_id,val
    X = load_sparse_matrix(feature_format,feature_file).toarray()
    # strip features for any trailing items that don't appear in training set
    num_items = train.shape[1]
    X = X[:num_items,:]

    model = WARP2MFRecommender(d=100,gamma=0.01,C=100.0,batch_size=10)
    model.fit(train,X)

    save_recommender(model,outfile)
Example #10
def run(task):

    # import modules required by engine
    import os
    import subprocess
    import numpy as np
    from scipy.sparse import coo_matrix

    from mrec import load_sparse_matrix, load_recommender
    from mrec.evaluation import Evaluator

    modelfile,input_format,trainfile,test_input_format,testfile,feature_format,featurefile,outdir,start,end,evaluator,generate = task

    # initialise the model
    model = load_recommender(modelfile)

    outfile = os.path.join(outdir,'recs.{0}-{1}.tsv'.format(start,end))

    if generate:
        # generate recommendations for our batch of users
        dataset = load_sparse_matrix(input_format,trainfile)
        out = open(outfile,'w')
        if featurefile is not None:
            # currently runs much faster if features are loaded as a dense matrix
            item_features = load_sparse_matrix(feature_format,featurefile).toarray()
            # strip features for any trailing items that don't appear in training set
            num_items = dataset.shape[1]
            item_features = item_features[:num_items,:]
            recs = model.range_recommend_items(dataset,start,end,max_items=20,return_scores=True,item_features=item_features)
        else:
            recs = model.range_recommend_items(dataset,start,end,max_items=20,return_scores=True)
        for u,items in zip(xrange(start,end),recs):
            for i,w in items:
                print >>out,'{0}\t{1}\t{2}'.format(u+1,i+1,w)  # write as 1-indexed
        out.close()

        # record success
        cmd = ['touch',os.path.join(outdir,'{0}-{1}.SUCCESS'.format(start,end))]
        subprocess.check_call(cmd)

    # load the test data
    testdata = load_sparse_matrix(test_input_format,testfile).tocsr()

    # return evaluation metrics
    return evaluator.process(testdata,outfile,start,end)
Example #11
    def __call__(self):
        from mrec import load_sparse_matrix
        import random
        current_split_path = self.__generate_current_split_path()
        train = load_sparse_matrix('csv', current_split_path)

        import warp

        max_iters,validation_iters,validation = warp.WARPMFRecommender.create_validation_set(train)
        users = validation.keys()
        return train, users, validation
Example #12
def run(task):
    # import modules required by engine

    from mrec import load_sparse_matrix

    input_format, testfile, recsfile, start, end, evaluator = task

    # load the test data
    testdata = load_sparse_matrix(input_format, testfile)

    return evaluator.process(testdata, recsfile, start, end)
Example #13
    def __call__(self):
        from mrec import load_sparse_matrix
        import random
        current_split_path = self.__generate_current_split_path()
        train = load_sparse_matrix('csv', current_split_path)

        import warp

        max_iters, validation_iters, validation = warp.WARPMFRecommender.create_validation_set(
            train)
        users = validation.keys()
        return train, users, validation
Example #14
    def run(self,view,model,input_format,trainfile,num_engines,simsdir,overwrite,max_sims,simsfile,modelfile):

        logging.info('finding number of items...')
        dataset = load_sparse_matrix(input_format,trainfile)
        num_users,num_items = dataset.shape
        del dataset
        logging.info('%d users and %d items', num_users, num_items)

        logging.info('creating sims directory {0}...'.format(simsdir))
        subprocess.check_call(['mkdir','-p',simsdir])

        done = []
        if not overwrite:
            logging.info('checking for existing output sims...')
            done.extend(self.find_done(simsdir))
            if done:
                logging.info('found {0} output files'.format(len(done)))

        logging.info('creating tasks...')
        tasks = self.create_tasks(model,input_format,trainfile,simsdir,num_items,num_engines,max_sims,done)

        if num_engines > 0:
            logging.info('running %d tasks in parallel across ipython'
                         ' engines...', len(tasks))
            async_job = view.map_async(process,tasks,retries=2)
            # wait for tasks to complete
            results = async_job.get()
        else:
            # Sequential run to make it easier for debugging
            logging.info('training similarity model sequentially')
            results = [process(task) for task in tasks]

        logging.info('checking output files...')
        done = self.find_done(simsdir)
        remaining = len(tasks) - len(done)
        if remaining == 0:
            logging.info('SUCCESS: all tasks completed')
            logging.info('concatenating {0} partial output files...'.format(len(done)))
            paths = [os.path.join(simsdir,'sims.{0}-{1}.tsv'.format(start,end)) for start,end in done]
            cmd = ['cat']+paths
            subprocess.check_call(cmd,stdout=open(simsfile,'w'))
            logging.info('removing partial output files...')
            rmtree(simsdir)
            logging.info('loading %d items in %s model from %s',
                         num_items, type(model).__name__, simsfile)
            model.load_similarity_matrix(simsfile,num_items)
            save_recommender(model,modelfile)
            logging.info('done')
        else:
            logging.error('FAILED: {0}/{1} tasks did not complete successfully'.format(remaining,len(tasks)))
            logging.error('try rerunning the command to retry the remaining tasks')
Example #15
def main():

    import os
    import logging
    import glob
    from optparse import OptionParser
    from collections import defaultdict

    from mrec import load_sparse_matrix
    from mrec.evaluation.metrics import compute_main_metrics, compute_hit_rate
    from mrec.evaluation import Evaluator
    from mrec.evaluation.metrics import print_report
    from filename_conventions import get_testfile, get_recsfile

    logging.basicConfig(level=logging.INFO,format='[%(asctime)s] %(levelname)s: %(message)s')

    parser = OptionParser()
    parser.add_option('--input_format',dest='input_format',help='format of training dataset(s) tsv | csv | mm (matrixmarket) | fsm (fast_sparse_matrix)')
    parser.add_option('--test_input_format',dest='test_input_format',default='npz',help='format of test dataset(s) tsv | csv | mm (matrixmarket) | npz (numpy binary)  (default: %default)')
    parser.add_option('--train',dest='train',help='glob specifying path(s) to training dataset(s) IMPORTANT: must be in quotes if it includes the * wildcard')
    parser.add_option('--recsdir',dest='recsdir',help='directory containing tsv files of precomputed recommendations')
    parser.add_option('--metrics',dest='metrics',default='main',help='which set of metrics to compute, main|hitrate (default: %default)')
    parser.add_option('--description',dest='description',help='description of model which generated the recommendations')
    metrics_funcs = {'main':compute_main_metrics,
                     'hitrate':compute_hit_rate}

    (opts,args) = parser.parse_args()
    if not opts.input_format or not opts.train or not opts.recsdir \
            or opts.metrics not in metrics_funcs:
        parser.print_help()
        raise SystemExit

    opts.train = os.path.abspath(os.path.expanduser(opts.train))
    opts.recsdir = os.path.abspath(os.path.expanduser(opts.recsdir))

    evaluator = Evaluator(metrics_funcs[opts.metrics],max_items=20)

    trainfiles = glob.glob(opts.train)

    all_metrics = defaultdict(list)
    for trainfile in trainfiles:
        logging.info('processing {0}...'.format(trainfile))
        testfile = get_testfile(trainfile)
        recsfile = get_recsfile(trainfile,opts.recsdir)
        testdata = load_sparse_matrix(opts.test_input_format,testfile).tocsr()
        cum_metrics,count = evaluator.process(testdata,recsfile,0,testdata.shape[0])
        if cum_metrics is not None:
            for m in cum_metrics:
                all_metrics[m].append(float(cum_metrics[m])/count)

    print_report([opts.description],[all_metrics])
Example #16
def test_save_load_sparse_matrix():
    X = get_random_coo_matrix()
    for fmt in ["tsv", "csv", "npz", "mm", "fsm"]:
        if fmt == "mm":
            suffix = ".mtx"
        elif fmt == "npz" or fmt == "fsm":
            suffix = ".npz"
        else:
            suffix = ""
        f, path = tempfile.mkstemp(suffix=suffix)
        save_sparse_matrix(X, fmt, path)
        Y = load_sparse_matrix(fmt, path)
        assert_sparse_matrix_equal(X, Y)
        os.remove(path)
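
The helpers this test relies on ship with mrec's test suite; hypothetical stand-ins that capture their contract might look like this:

# Hypothetical versions of the test helpers (the real ones live in
# mrec's tests); shown only to make the round-trip contract explicit.
import numpy as np
from scipy.sparse import coo_matrix

def get_random_coo_matrix(rows=10, cols=15, density=0.2):
    dense = np.random.random((rows, cols))
    dense[dense < 1 - density] = 0  # zero entries are absent from the matrix
    return coo_matrix(dense)

def assert_sparse_matrix_equal(X, Y):
    # fast_sparse_matrix wraps its data in .X; scipy matrices convert directly
    to_dense = lambda M: np.asarray(getattr(M, 'X', M).todense())
    assert np.allclose(to_dense(X), to_dense(Y))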
Example #17
def main():
    import sys
    from mrec import load_sparse_matrix, save_recommender
    from mrec.sparse import fast_sparse_matrix

    file_format = sys.argv[1]
    filepath = sys.argv[2]
    outfile = sys.argv[3]

    # load training set as scipy sparse matrix
    train = load_sparse_matrix(file_format,filepath)

    model = WARPMFRecommender(d=100, gamma=0.01, C=100.0, batch_size=10,
                              max_iters=7001, validation_iters=1000,
                              sample_item_rate=0.1)
    model.fit(train)

    save_recommender(model, outfile)
Example #18
def test_save_load_sparse_matrix():
    X = get_random_coo_matrix()
    for fmt in ['tsv','csv','npz','mm','fsm']:
        if fmt == 'mm':
            suffix = '.mtx'
        elif fmt == 'npz' or fmt == 'fsm':
            suffix = '.npz'
        else:
            suffix = ''
        f,path = tempfile.mkstemp(suffix=suffix)
        save_sparse_matrix(X,fmt,path)
        Y = load_sparse_matrix(fmt,path)
        assert_sparse_matrix_equal(X,Y)
        os.remove(path)
Example #19
def run(task):

    # import modules required by engine
    import os
    import subprocess
    import numpy as np
    from scipy.sparse import coo_matrix

    from mrec import load_sparse_matrix, load_recommender
    from mrec.evaluation import Evaluator

    modelfile,input_format,trainfile,test_input_format,testfile,outdir,start,end,evaluator,generate = task

    # initialise the model
    model = load_recommender(modelfile)
    dataset = load_sparse_matrix(input_format,trainfile)

    outfile = os.path.join(outdir,'recs.{0}-{1}.tsv'.format(start,end))

    if generate:
        # generate recommendations for our batch of users
        out = open(outfile,'w')
        recs = model.range_recommend_items(dataset,start,end,max_items=20,return_scores=True)
        for u,items in zip(xrange(start,end),recs):
            for i,w in items:
                print >>out,'{0}\t{1}\t{2}'.format(u+1,i+1,w)  # write as 1-indexed
        out.close()

        # record success
        cmd = ['touch',os.path.join(outdir,'{0}-{1}.SUCCESS'.format(start,end))]
        subprocess.check_call(cmd)

    # load the test data
    testdata = load_sparse_matrix(test_input_format,testfile).tocsr()

    # return evaluation metrics
    return evaluator.process(testdata,outfile,start,end)
Example #20
def run(task):

    # import modules required by engine
    import numpy as np
    from scipy.sparse import coo_matrix
    from collections import defaultdict

    from mrec import load_sparse_matrix

    input_format,testfile,recsfile,start,end,evaluator = task

    # load the test data
    testdata = load_sparse_matrix(input_format,testfile)

    return evaluator.process(testdata,recsfile,start,end)
Example #21
def run(task):

    # import modules required by engine
    import numpy as np
    from scipy.sparse import coo_matrix
    from collections import defaultdict

    from mrec import load_sparse_matrix

    input_format, testfile, recsfile, start, end, evaluator = task

    # load the test data
    testdata = load_sparse_matrix(input_format, testfile)

    return evaluator.process(testdata, recsfile, start, end)
Example #22
def main():
    import sys
    from mrec import load_sparse_matrix, save_recommender

    file_format = sys.argv[1]
    filepath = sys.argv[2]
    outfile = sys.argv[3]

    # load training set as scipy sparse matrix
    train = load_sparse_matrix(file_format, filepath)

    model = WARPMFRecommender(d=100, gamma=0.01, C=100.0, batch_size=10)
    model.fit(train)

    save_recommender(model, outfile)
Example #23
def main():
    import sys
    from mrec import load_sparse_matrix, save_recommender
    from mrec.sparse import fast_sparse_matrix

    file_format = sys.argv[1]
    filepath = sys.argv[2]
    outfile = sys.argv[3]

    # load training set as scipy sparse matrix
    train = load_sparse_matrix(file_format,filepath)

    model = WARPMFRecommender(d=100,gamma=0.01,C=100.0,batch_size=10)
    model.fit(train)

    save_recommender(model,outfile)
Example #24
def main():
    import sys
    from mrec import load_sparse_matrix, save_recommender
    from mrec.mf.climf import CLiMFRecommender

    file_format = sys.argv[1]
    filepath = sys.argv[2]
    outfile = sys.argv[3]

    # load training set as scipy sparse matrix
    train = load_sparse_matrix(file_format, filepath)

    model = CLiMFRecommender(d=5)
    model.fit(train)

    save_recommender(model, outfile)
Example #25
def main():
    import sys
    from mrec import load_sparse_matrix, save_recommender
    from mrec.mf.climf import CLiMFRecommender

    file_format = sys.argv[1]
    filepath = sys.argv[2]
    outfile = sys.argv[3]

    # load training set as scipy sparse matrix
    train = load_sparse_matrix(file_format,filepath)

    model = CLiMFRecommender(d=5)
    model.fit(train)

    save_recommender(model,outfile)
Example #26
def run_mrec(d=10, num_iters=4, reg=0.02):
    #d is dimension of subspace, i.e. groups
    import sys
    from mrec import load_sparse_matrix, save_recommender
    from mrec.sparse import fast_sparse_matrix
    from mrec.mf.wrmf import WRMFRecommender

    alpha = 1.0
    start = time.time()

    file_format = "csv"
    #file should be csv, with: row,col,data
    #data may just be ones
    filepath = PARS['data_dir'] + "/reduced_row_col_num_cutoff_1.5.csv"
    #filepath = PARS['data_dir']+"test_10_mill.csv"
    outfile = make_mrec_outfile(filepath, d, num_iters, reg)
    print outfile
    print 'reading file: %s' % filepath
    # load training set as scipy sparse matrix
    print "loading file"
    train = load_sparse_matrix(file_format, filepath)
    print "loaded file"
    print(time.time() - start), "seconds"
    print "size:", train.shape

    print "creating recommender"
    model = WRMFRecommender(d=d, num_iters=num_iters, alpha=alpha, lbda=reg)
    print "training on data"
    print time.time() - start
    model.fit(train)
    print "done training"
    print time.time() - start
    print "saving model"
    save_recommender(model, outfile)
    print "wrote model to: %s" % outfile
    print time.time() - start

    # NOTE: the early return below intentionally skips the validation step
    return

    print "validating"
    data, U, V = read_mrec(mrec_file=outfile)
    plot_file = outfile.replace('.npz', '.png')
    multi_thresh(data, model, thresh_list=None, plot_file=plot_file)
    run_time = (time.time() - start) / 60.0
    print "runtime: %0.3f minutes" % run_time
    print 'done'
Example #27
def run_mrec(d=10,num_iters=4,reg=0.02):
    #d is dimension of subspace, i.e. groups
    import sys
    from mrec import load_sparse_matrix, save_recommender
    from mrec.sparse import fast_sparse_matrix
    from mrec.mf.wrmf import WRMFRecommender

    alpha=1.0
    start=time.time()

    file_format = "csv"
    #file should be csv, with: row,col,data
    #data may just be ones
    filepath = PARS['data_dir']+"/reduced_row_col_num_cutoff_1.5.csv"
    #filepath = PARS['data_dir']+"test_10_mill.csv" 
    outfile = make_mrec_outfile(filepath,d,num_iters,reg)
    print outfile
    print 'reading file: %s'%filepath
    # load training set as scipy sparse matrix
    print "loading file"
    train = load_sparse_matrix(file_format,filepath)
    print "loaded file"
    print (time.time()-start),"seconds"
    print "size:",train.shape

    print "creating recommender"
    model = WRMFRecommender(d=d,num_iters=num_iters,alpha=alpha,lbda=reg)
    print "training on data"
    print time.time()-start
    model.fit(train)
    print "done training"
    print time.time()-start
    print "saving model"
    save_recommender(model,outfile)
    print "wrote model to: %s"%outfile
    print time.time()-start

    # NOTE: the early return below intentionally skips the validation step
    return

    print "validating"
    data,U,V=read_mrec(mrec_file=outfile)
    plot_file=outfile.replace('.npz','.png')
    multi_thresh(data,model,thresh_list=None,plot_file=plot_file)
    run_time=(time.time()-start)/60.0
    print "runtime: %0.3f minutes"%run_time
    print 'done'
Example #28
def main():
    import sys
    from mrec import load_sparse_matrix, save_recommender
    from mrec.sparse import fast_sparse_matrix

    file_format = sys.argv[1]
    filepath = sys.argv[2]
    outfile = sys.argv[3]

    # load training set as scipy sparse matrix
    train = load_sparse_matrix(file_format, filepath)

    model = WARPMFRecommender(d=100,
                              gamma=0.01,
                              C=100.0,
                              batch_size=10,
                              max_iters=7001,
                              validation_iters=1000,
                              sample_item_rate=0.1)
    model.fit(train)

    save_recommender(model, outfile)
Example #29
def main():
    import sys
    from mrec import load_sparse_matrix, save_recommender
    from mrec.sparse import fast_sparse_matrix
    from mrec.item_similarity.knn import CosineKNNRecommender
    from mrec.mf.warp import WARPMFRecommender
    from mrec.reranking_recommender import RerankingRecommender

    file_format = sys.argv[1]
    filepath = sys.argv[2]
    outfile = sys.argv[3]

    # load training set as scipy sparse matrix
    train = load_sparse_matrix(file_format,filepath)

    item_sim_model = CosineKNNRecommender(k=100)
    mf_model = WARPMFRecommender(d=80,gamma=0.01,C=100.0,max_iters=25000,validation_iters=1000,batch_size=10)
    recommender = RerankingRecommender(item_sim_model,mf_model,num_candidates=100)

    recommender.fit(train)

    save_recommender(recommender,outfile)
Example #30
    def run(self,view,model,input_format,trainfile,num_engines,workdir,modelfile):
        logging.info('creating factors directory {0}'.format(workdir))
        subprocess.check_call(['mkdir','-p',workdir])

        logging.info('getting data size')
        data = load_sparse_matrix(input_format,trainfile)
        num_users,num_items = data.shape
        del data

        for it in xrange(model.num_iters):
            logging.info('iteration {0}'.format(it))
            tasks = self.create_tasks(num_users,num_engines,model,input_format,trainfile,workdir,'U',get_user_indices,get_item_factor_files,init_item_factors)
            self.run_tasks(view,tasks)
            tasks = self.create_tasks(num_items,num_engines,model,input_format,trainfile,workdir,'V',get_item_indices,get_user_factor_files,None)  # won't need to initialize user factors
            self.run_tasks(view,tasks)

        model.U = np.vstack([np.load(f) for f in get_user_factor_files(workdir)])
        model.V = np.vstack([np.load(f) for f in get_item_factor_files(workdir)])

        save_recommender(model,modelfile)

        logging.info('removing partial output files')
        rmtree(workdir)
        logging.info('done')
Example #31
def main():
    import sys
    from mrec import load_sparse_matrix
    from mrec.sparse import fast_sparse_matrix

    # load training set as scipy sparse matrix
    file_format = sys.argv[1]
    filepath = sys.argv[2]
    train = load_sparse_matrix(file_format,filepath)

    model = WARPMFRecommender(d=100,gamma=0.01,C=100,max_iters=100000,validation_iters=500)  # these values work for ml-100k
    sampler = ShuffleSampler(train,1)
    model.fit(train,sampler)

    def output(i,j,val):
        # convert back to 1-indexed
        print '{0}\t{1}\t{2:.3f}'.format(i+1,j+1,val)

    print 'making some recommendations...'
    for u in xrange(20):
        recs = model.recommend_items(train,u)
        for i,score in recs:
            output(u,i,score)

    print 'making batch recommendations...'
    recs = model.batch_recommend_items(train)
    for u in xrange(20):
        for i,score in recs[u]:
            output(u,i,score)

    print 'making range recommendations...'
    for start,end in [(0,2),(2,3)]:
        recs = model.range_recommend_items(train,start,end)
        for u in xrange(start,end):
            for i,score in recs[u-start]:
                output(u,i,score)
Example #32
def main():
    from optparse import OptionParser

    from mrec import load_sparse_matrix, save_sparse_matrix

    parser = OptionParser()
    parser.add_option(
        '--input_format',
        dest='input_format',
        help=
        'format of input dataset tsv | csv | mm (matrixmarket) | csr (scipy.sparse.csr_matrix) | fsm (mrec.sparse.fast_sparse_matrix)'
    )
    parser.add_option('--input', dest='input', help='filepath to input')
    parser.add_option(
        '--output_format',
        dest='output_format',
        help=
        'format of output dataset(s) tsv | csv | mm (matrixmarket) | csr (scipy.sparse.csr_matrix) | fsm (mrec.sparse.fast_sparse_matrix)'
    )
    parser.add_option('--output', dest='output', help='filepath for output')

    (opts, args) = parser.parse_args()
    if not opts.input or not opts.output or not opts.input_format or not opts.output_format:
        parser.print_help()
        raise SystemExit

    if opts.output_format == opts.input_format:
        raise SystemExit(
            'input and output format are the same, not doing anything')

    if opts.input_format == 'tsv' and opts.output_format == 'mm':
        # we can do this without loading the data
        tsv2mtx(opts.input, opts.output)
    else:
        data = load_sparse_matrix(opts.input_format, opts.input)
        save_sparse_matrix(data, opts.output_format, opts.output)
Example #33
import pandas as pd 
import numpy as np 
import random
from mrec import load_sparse_matrix, load_recommender
from in_store_dict import stores


train = load_sparse_matrix('tsv','../data/PATH_TO_DATA_USED_TO_TRAIN_FINAL_MODEL')
model = load_recommender('../../../mrec/PATH_TO_FINAL_MODEL')

next_usr_num = 382716  # NB: writing 382,716 would create a tuple, not a number

# ->  load in users to predict and make into mrec format:
	# item id == label encoded,
	# user id == new numbers starting at next_usr_num (add new user code to label encoded dict),
	# call this table to_predict

cold_starters = ['BIG BASS WHEEL', 'SUPER SHOT', 'WIZARD OF OZ 6 PLAYER PUSHER']

counts = to_predict.groupby('user').count().sort('item')

def predict_one_user(user, store):
	if counts.ix[user] < 3:
		i = 0 
		game = random.choice(cold_starters)
		while store not in stores[game] and i < 1000:  # retry until this store carries the game
			game = random.choice(cold_starters)
			i += 1
		if store in stores[game]:
			return game
		else:
Example #34
    def run(self, view, model, input_format, trainfile, num_engines, simsdir,
            overwrite, max_sims, simsfile, modelfile):

        logging.info('finding number of items...')
        dataset = load_sparse_matrix(input_format, trainfile)
        num_users, num_items = dataset.shape
        del dataset
        logging.info('%d users and %d items', num_users, num_items)

        logging.info('creating sims directory {0}...'.format(simsdir))
        subprocess.check_call(['mkdir', '-p', simsdir])

        done = []
        if not overwrite:
            logging.info('checking for existing output sims...')
            done.extend(self.find_done(simsdir))
            if done:
                logging.info('found {0} output files'.format(len(done)))

        logging.info('creating tasks...')
        tasks = self.create_tasks(model, input_format, trainfile, simsdir,
                                  num_items, num_engines, max_sims, done)

        if num_engines > 0:
            logging.info(
                'running %d tasks in parallel across ipython'
                ' engines...', len(tasks))
            async_job = view.map_async(process, tasks, retries=2)
            # wait for tasks to complete
            results = async_job.get()
        else:
            # Sequential run to make it easier for debugging
            logging.info('training similarity model sequentially')
            results = [process(task) for task in tasks]

        logging.info('checking output files...')
        done = self.find_done(simsdir)
        remaining = len(tasks) - len(done)
        if remaining == 0:
            logging.info('SUCCESS: all tasks completed')
            logging.info('concatenating {0} partial output files...'.format(
                len(done)))
            paths = [
                os.path.join(simsdir, 'sims.{0}-{1}.tsv'.format(start, end))
                for start, end in done
            ]
            cmd = ['cat'] + paths
            subprocess.check_call(cmd, stdout=open(simsfile, 'w'))
            logging.info('removing partial output files...')
            rmtree(simsdir)
            logging.info('loading %d items in %s model from %s', num_items,
                         type(model).__name__, simsfile)
            model.load_similarity_matrix(simsfile, num_items)
            save_recommender(model, modelfile)
            logging.info('done')
        else:
            logging.error(
                'FAILED: {0}/{1} tasks did not complete successfully'.format(
                    remaining, len(tasks)))
            logging.error(
                'try rerunning the command to retry the remaining tasks')
Example #35
def main():

    import os
    import logging
    import glob
    from optparse import OptionParser
    from collections import defaultdict

    from mrec import load_sparse_matrix
    from mrec.evaluation.metrics import compute_main_metrics, compute_hit_rate
    from mrec.evaluation import Evaluator
    from mrec.evaluation.metrics import print_report
    from filename_conventions import get_testfile, get_recsfile

    logging.basicConfig(level=logging.INFO,
                        format='[%(asctime)s] %(levelname)s: %(message)s')

    parser = OptionParser()
    parser.add_option(
        '--input_format',
        dest='input_format',
        help=
        'format of training dataset(s) tsv | csv | mm (matrixmarket) | fsm (fast_sparse_matrix)'
    )
    parser.add_option(
        '--test_input_format',
        dest='test_input_format',
        default='npz',
        help=
        'format of test dataset(s) tsv | csv | mm (matrixmarket) | npz (numpy binary)  (default: %default)'
    )
    parser.add_option(
        '--train',
        dest='train',
        help=
        'glob specifying path(s) to training dataset(s) IMPORTANT: must be in quotes if it includes the * wildcard'
    )
    parser.add_option(
        '--recsdir',
        dest='recsdir',
        help='directory containing tsv files of precomputed recommendations')
    parser.add_option(
        '--metrics',
        dest='metrics',
        default='main',
        help='which set of metrics to compute, main|hitrate (default: %default)'
    )
    parser.add_option(
        '--description',
        dest='description',
        help='description of model which generated the recommendations')
    metrics_funcs = {'main': compute_main_metrics, 'hitrate': compute_hit_rate}

    (opts, args) = parser.parse_args()
    if not opts.input_format or not opts.train or not opts.recsdir \
            or opts.metrics not in metrics_funcs:
        parser.print_help()
        raise SystemExit

    opts.train = os.path.abspath(os.path.expanduser(opts.train))
    opts.recsdir = os.path.abspath(os.path.expanduser(opts.recsdir))

    evaluator = Evaluator(metrics_funcs[opts.metrics], max_items=20)

    trainfiles = glob.glob(opts.train)

    all_metrics = defaultdict(list)
    for trainfile in trainfiles:
        logging.info('processing {0}...'.format(trainfile))
        testfile = get_testfile(trainfile)
        recsfile = get_recsfile(trainfile, opts.recsdir)
        testdata = load_sparse_matrix(opts.test_input_format, testfile).tocsr()
        cum_metrics, count = evaluator.process(testdata, recsfile, 0,
                                               testdata.shape[0])
        if cum_metrics is not None:
            for m in cum_metrics:
                all_metrics[m].append(float(cum_metrics[m]) / count)

    print_report([opts.description], [all_metrics])
Example #36
def get_dataset_size(input_format, datafile):
    logging.info('loading dataset to get size...')
    dataset = load_sparse_matrix(input_format, datafile)
    return dataset.shape[0], dataset.shape[1], dataset.nnz
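
A typical use, assuming the placeholder file name below, is to read the dimensions once and size the per-engine batches without holding the matrix in memory:

# Hypothetical usage of get_dataset_size with a placeholder path.
import logging
num_users, num_items, nnz = get_dataset_size('tsv', 'train.tsv')
logging.info('%d users, %d items, %d non-zeros', num_users, num_items, nnz)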