Esempio n. 1
0
def compute_factors(task):
    """
    WRMF update method to run on an IPython engine.

    This reads from file and writes back to file,
    only filepaths and an empty model need to be passed.

    ``task`` is a 10-tuple:
    ``(model, input_format, trainfile, factor_type, get_indices,
    init_fixed_factors, fixed_factor_files, start, end, workdir)``.

    The fixed factors ``H`` are stacked from ``fixed_factor_files`` when that
    list is non-empty, otherwise created with ``init_fixed_factors(model, data)``.
    One row is computed with ``model.update`` for every ``j`` in
    ``[start, end)``; rows for which ``get_indices(data, j)`` is empty are
    left as zeros.  The result is saved to
    ``<workdir>/<factor_type>.<start>.npy``.

    Returns the tuple ``(start, end)`` that was processed.
    """

    # import modules needed on engine
    import os
    import numpy as np
    from mrec import load_fast_sparse_matrix

    model,input_format,trainfile,factor_type,get_indices,init_fixed_factors,fixed_factor_files,start,end,workdir = task

    data = load_fast_sparse_matrix(input_format,trainfile)

    if fixed_factor_files:
        H = np.vstack([np.load(f) for f in fixed_factor_files])
    else:
        H = init_fixed_factors(model,data)

    # H'H does not depend on j, so compute it once outside the loop
    HH = H.T.dot(H)
    W = np.zeros(((end-start),model.d))
    # range rather than xrange: xrange does not exist on Python 3
    for j in range(start,end):
        indices = get_indices(data,j)
        if indices.size:
            W[j-start,:] = model.update(indices,H,HH)

    np.save(os.path.join(workdir,'{0}.{1}.npy'.format(factor_type,start)),W)

    return start,end
Esempio n. 2
0
def process(task):
    """
    Training task to run on an ipython engine.

    ``task`` is a 7-tuple:
    ``(model, input_format, trainfile, outdir, start, end, max_similar_items)``.

    For every item ``j`` in ``[start, end)`` the similar items returned by
    ``model.get_similar_items`` are written as 1-indexed
    ``item<TAB>similar_item<TAB>weight`` lines to
    ``<outdir>/sims.<start>-<end>.tsv``.  After the file has been written an
    empty ``<outdir>/<start>-<end>.SUCCESS`` marker is created with ``touch``.

    Returns the tuple ``(start, end)`` that was processed.
    """

    # import modules required by engine
    import os
    import subprocess
    from mrec import load_fast_sparse_matrix

    model,input_format,trainfile,outdir,start,end,max_similar_items = task

    # initialise the model
    dataset = load_fast_sparse_matrix(input_format,trainfile)
    if hasattr(model,'similarity_matrix'):
        # clear out any existing similarity matrix
        model.similarity_matrix = None

    # write sims directly to file as we compute them; the with-block closes
    # the file even if get_similar_items raises part way through
    outfile = os.path.join(outdir,'sims.{0}-{1}.tsv'.format(start,end))
    with open(outfile,'w') as out:
        for j in range(start,end):
            w = model.get_similar_items(j,max_similar_items=max_similar_items,dataset=dataset)
            for k,v in w:
                out.write('{0}\t{1}\t{2}\n'.format(j+1,k+1,v))  # write as 1-indexed

    # record success
    cmd = ['touch',os.path.join(outdir,'{0}-{1}.SUCCESS'.format(start,end))]
    subprocess.check_call(cmd)

    # return the range that we've processed
    return start,end
def compute_factors(task):
    """
    WRMF update method to run on an IPython engine.

    This reads from file and writes back to file,
    only filepaths and an empty model need to be passed.

    ``task`` unpacks to ``(model, input_format, trainfile, factor_type,
    get_indices, init_fixed_factors, fixed_factor_files, start, end,
    workdir)``.  The fixed factors are loaded from ``fixed_factor_files`` if
    any are given, otherwise built by ``init_fixed_factors(model, data)``.
    Each index ``j`` in ``[start, end)`` whose ``get_indices(data, j)`` is
    non-empty gets a row from ``model.update``; the other rows stay zero.
    The rows are saved as ``<factor_type>.<start>.npy`` inside ``workdir``.

    Returns ``(start, end)``.
    """

    # import modules needed on engine
    import os
    import numpy as np
    from mrec import load_fast_sparse_matrix

    (model, input_format, trainfile, factor_type, get_indices,
     init_fixed_factors, fixed_factor_files, start, end, workdir) = task

    data = load_fast_sparse_matrix(input_format, trainfile)

    if fixed_factor_files:
        H = np.vstack([np.load(f) for f in fixed_factor_files])
    else:
        H = init_fixed_factors(model, data)

    # the Gram matrix is the same for every row, so it is computed only once
    HH = H.T.dot(H)
    W = np.zeros(((end - start), model.d))
    # range instead of the Python-2-only xrange so this also runs on Python 3
    for row, j in enumerate(range(start, end)):
        indices = get_indices(data, j)
        if indices.size:
            W[row, :] = model.update(indices, H, HH)

    outfile = os.path.join(workdir, '{0}.{1}.npy'.format(factor_type, start))
    np.save(outfile, W)

    return start, end
Esempio n. 4
0
def process(view,opts,modelfile,trainfile,testfile,outdir,evaluator):
    """
    Run the prediction tasks in parallel and merge their partial outputs.

    Creates the recs directory, builds the tasks with create_tasks (passing
    the ranges already found by find_done unless opts.overwrite is set), maps
    predict.run over them on ``view`` and waits for completion.  If the
    number of finished ranges equals the number of tasks, the partial
    ``recs.<start>-<end>.tsv`` files are concatenated into the file named by
    get_recsfile, the recs directory is removed and the per-task cumulative
    metrics are averaged over the total count.

    Returns ``(read_recommender_description(modelfile), avg_metrics)`` where
    ``avg_metrics`` is None if some tasks did not complete.
    """

    logging.info('finding number of users...')
    dataset = load_fast_sparse_matrix(opts.input_format,trainfile)
    num_users,num_items = dataset.shape
    del dataset

    # NOTE(review): the recs directory is derived from opts.outdir while the
    # merged recs file below uses the outdir argument - confirm that callers
    # always pass the same value for both.
    recsdir = get_recsdir(trainfile,opts.outdir)
    logging.info('creating recs directory {0}...'.format(recsdir))
    subprocess.check_call(['mkdir','-p',recsdir])

    done = []
    if not opts.overwrite:
        logging.info('checking for existing output recs...')
        done.extend(find_done(recsdir))
        if done:
            logging.info('found {0} output files'.format(len(done)))

    logging.info('creating tasks...')
    tasks = create_tasks(modelfile,opts.input_format,trainfile,opts.test_input_format,testfile,recsdir,num_users,opts.num_engines,done,evaluator)

    logging.info('running in parallel across ipython engines...')
    results = []
    results.append(view.map_async(predict.run,tasks,retries=2))

    # wait for tasks to complete
    processed = [r.get() for r in results]

    logging.info('checking output files...')
    done = find_done(recsdir)
    remaining = len(tasks) - len(done)

    if remaining == 0:
        logging.info('SUCCESS: all tasks completed')
        logging.info('concatenating {0} partial output files...'.format(len(done)))
        paths = [os.path.join(recsdir,'recs.{0}-{1}.tsv'.format(start,end)) for start,end in done]
        cmd = ['cat']+paths
        recsfile = get_recsfile(trainfile,outdir)
        # close the merged file deterministically instead of leaking the handle
        with open(recsfile,'w') as merged:
            subprocess.check_call(cmd,stdout=merged)
        logging.info('removing partial output files...')
        rmtree(recsdir)
        logging.info('done')

        # aggregate metrics from each task
        avg_metrics = defaultdict(float)
        tot_count = 0
        for batch in processed:
            for cum_metrics,count in batch:
                # items() works on Python 2 and 3, iteritems() only on 2
                for m,val in cum_metrics.items():
                    avg_metrics[m] += val
                tot_count += count
        # with a zero total count there is nothing to normalise by; leave the
        # accumulated values as they are rather than raise ZeroDivisionError
        if tot_count:
            for m in avg_metrics:
                avg_metrics[m] /= float(tot_count)
    else:
        logging.error('FAILED: {0}/{1} tasks did not complete successfully'.format(remaining,len(tasks)))
        logging.error('try rerunning the command to retry the remaining tasks')
        avg_metrics = None

    return read_recommender_description(modelfile),avg_metrics
Esempio n. 5
0
def main():
    """
    Command-line entry point: grid search over l1/l2 regularization constants.

    Parses the options, builds one task per parameter combination produced by
    ParameterGrid, runs estimate_sparsity on each through a load-balanced
    view and prints the candidate with the smallest mean number of positive
    similarity weights among those within --max_sims and --max_sparse.

    Prints the help text and raises SystemExit when the dataset, the input
    format or any of the four l1/l2 bounds is missing.
    """
    parser = OptionParser()
    parser.add_option('-d','--dataset',dest='dataset',help='path to dataset')
    parser.add_option('--input_format',dest='input_format',help='format of training dataset(s) tsv | csv | mm (matrixmarket) | fsm (fast_sparse_matrix)')
    parser.add_option('--l1_min',dest='l1_min',type='float',help='min l1 constant to try (expected to be a power of 10)')
    parser.add_option('--l1_max',dest='l1_max',type='float',help='max l1 constant to try (expected to be a power of 10)')
    parser.add_option('--l2_min',dest='l2_min',type='float',help='min l2 constant to try (expected to be a power of 10)')
    parser.add_option('--l2_max',dest='l2_max',type='float',help='max l2 constant to try (expected to be a power of 10)')
    parser.add_option('--max_sims',dest='max_sims',type='int',default=2000,help='max desired number of positive item similarity weights (default: %default)')
    parser.add_option('--min_sims',dest='min_sims',type='int',default=15,help='min desired number of positive item similarity weights (default: %default)')
    parser.add_option('--max_sparse',dest='max_sparse',type='float',default=0.01,help='max allowable proportion of items with less than min_sims positive similarity weights (default: %default)')
    parser.add_option('--num_samples',dest='num_samples',type='int',default=100,help='number of sample items to evaluate for each regularization setting')
    parser.add_option('--packer',dest='packer',default='json',help='packer for IPython.parallel (default: %default)')
    parser.add_option('--add_module_paths',dest='add_module_paths',help='comma-separated list of paths to append to pythonpath to enable import of uninstalled modules')

    (opts,args) = parser.parse_args()
    if not opts.dataset or not opts.input_format or not opts.l1_min or not opts.l1_max or not opts.l2_min or not opts.l2_max:
        parser.print_help()
        raise SystemExit

    logging.basicConfig(level=logging.INFO,format='[%(asctime)s] %(levelname)s: %(message)s')

    dataset = load_fast_sparse_matrix(opts.input_format,opts.dataset)

    params = {'l1_reg':pow_range(opts.l1_min,opts.l1_max),
              'l2_reg':pow_range(opts.l2_min,opts.l2_max)}
    num_items = dataset.shape[1]
    # range instead of xrange: xrange does not exist on Python 3
    sample_items = random.sample(range(num_items),opts.num_samples)

    logging.info('preparing tasks for a grid search of these values:')
    logging.info(params)
    tasks = [(args,dataset,opts.min_sims,sample_items) for args in ParameterGrid(params)]

    c = Client(packer=opts.packer)
    view = c.load_balanced_view()

    if opts.add_module_paths:
        c[:].execute('import sys')
        for path in opts.add_module_paths.split(','):
            logging.info('adding {0} to pythonpath on all engines'.format(path))
            c[:].execute("sys.path.append('{0}')".format(path))

    logging.info('running {0} tasks in parallel...'.format(len(tasks)))
    results = view.map(estimate_sparsity,tasks,ordered=False)

    candidates = [(args,nsims,nsparse,nneg) for args,nsims,nsparse,nneg in results if nsims <= opts.max_sims and nsparse <= opts.max_sparse]

    # single-argument print(...) produces the same output on Python 2 and 3,
    # whereas the bare print statement is a syntax error on Python 3
    if candidates:
        best = min(candidates,key=itemgetter(1))

        print('best parameter setting: {0}'.format(best[0]))
        print('mean # positive similarity weights per item = {0:.3}'.format(best[1]))
        print('proportion of items with fewer than {0} positive similarity weights = {1:.3}'.format(opts.min_sims,best[2]))
        print('mean # negative similarity weights per item = {0:.3}'.format(best[3]))
    else:
        print('no parameter settings satisfied the conditions, try increasing --min_sims, --max_sims or --max_sparse')
Esempio n. 6
0
File: train.py Progetto: BloodD/mrec
def process(view,opts,model,trainfile,outdir):
    """
    Compute item similarities in parallel and save the resulting model.

    Creates the sims directory, builds the tasks with create_tasks (passing
    the ranges already found by find_done unless opts.overwrite is set), maps
    train.run over them on ``view`` and waits for completion.  If the number
    of finished ranges equals the number of tasks, the partial
    ``sims.<start>-<end>.tsv`` files are concatenated into the file named by
    get_simsfile, the sims directory is removed, the similarity matrix is
    loaded into ``model`` and the model is saved to the path from
    get_modelfile.  Otherwise an error is logged and nothing is saved.
    """

    logging.info('finding number of items...')
    dataset = load_fast_sparse_matrix(opts.input_format,trainfile)
    num_users,num_items = dataset.shape
    del dataset

    simsdir = get_simsdir(trainfile,outdir)
    logging.info('creating sims directory {0}...'.format(simsdir))
    subprocess.check_call(['mkdir','-p',simsdir])

    done = []
    if not opts.overwrite:
        logging.info('checking for existing output sims...')
        done.extend(find_done(simsdir))
        if done:
            logging.info('found {0} output files'.format(len(done)))

    logging.info('creating tasks...')
    tasks = create_tasks(model,opts.input_format,trainfile,simsdir,num_items,opts.num_engines,opts.max_sims,done)

    logging.info('running in parallel across ipython engines...')
    results = []
    results.append(view.map_async(train.run,tasks,retries=2))

    # wait for tasks to complete; the returned values themselves are not
    # used, completion is verified through the files found below
    processed = [r.get() for r in results]

    logging.info('checking output files...')
    done = find_done(simsdir)
    remaining = len(tasks) - len(done)
    if remaining == 0:
        logging.info('SUCCESS: all tasks completed')
        logging.info('concatenating {0} partial output files...'.format(len(done)))
        paths = [os.path.join(simsdir,'sims.{0}-{1}.tsv'.format(start,end)) for start,end in done]
        cmd = ['cat']+paths
        simsfile = get_simsfile(trainfile,outdir)
        # the with-block closes (and flushes) the merged file before it is
        # read back by load_similarity_matrix, instead of leaking the handle
        with open(simsfile,'w') as merged:
            subprocess.check_call(cmd,stdout=merged)
        logging.info('removing partial output files...')
        rmtree(simsdir)
        model.load_similarity_matrix(simsfile,num_items)
        modelfile = get_modelfile(trainfile,outdir)
        save_recommender(model,modelfile)
        logging.info('done')
    else:
        logging.error('FAILED: {0}/{1} tasks did not complete successfully'.format(remaining,len(tasks)))
        logging.error('try rerunning the command to retry the remaining tasks')
def process(task):
    """
    Training task to run on an ipython engine.

    ``task`` unpacks to
    ``(model, input_format, trainfile, outdir, start, end, max_similar_items)``.
    The similar items of every item in ``[start, end)`` are written, one
    1-indexed ``item<TAB>similar_item<TAB>weight`` line each, to
    ``sims.<start>-<end>.tsv`` in ``outdir``; afterwards a
    ``<start>-<end>.SUCCESS`` marker file is touched in the same directory.

    Returns ``(start, end)``.
    """

    # import modules required by engine
    import os
    import subprocess
    from mrec import load_fast_sparse_matrix

    model, input_format, trainfile, outdir, start, end, max_similar_items = task

    # initialise the model
    dataset = load_fast_sparse_matrix(input_format, trainfile)
    if hasattr(model, 'similarity_matrix'):
        # clear out any existing similarity matrix to trigger recomputation of
        # the item-item similarities from the users' ratings.
        model.similarity_matrix = None

    # write sims directly to file as we compute them; the context manager
    # guarantees the file is closed even when a similarity computation fails
    outfile = os.path.join(outdir, 'sims.{0}-{1}.tsv'.format(start, end))
    with open(outfile, 'w') as out:
        for j in range(start, end):
            similar = model.get_similar_items(j,
                                              max_similar_items=max_similar_items,
                                              dataset=dataset)
            # write as 1-indexed
            out.writelines('{0}\t{1}\t{2}\n'.format(j + 1, k + 1, v)
                           for k, v in similar)

    # record success
    marker = os.path.join(outdir, '{0}-{1}.SUCCESS'.format(start, end))
    subprocess.check_call(['touch', marker])

    # return the range that we've processed
    return start, end
Esempio n. 8
0
def main():
    """
    Entry point for the regularization grid search.

    Reads the command-line options, evaluates estimate_sparsity for every
    l1/l2 combination from ParameterGrid on a load-balanced view, and reports
    the setting with the lowest mean count of positive similarity weights
    among those that respect --max_sims and --max_sparse.

    Shows the help and raises SystemExit if the dataset, the input format or
    one of the l1/l2 bounds was not supplied.
    """
    parser = OptionParser()
    parser.add_option('-d', '--dataset', dest='dataset', help='path to dataset')
    parser.add_option(
        '--input_format', dest='input_format',
        help='format of training dataset(s) tsv | csv | mm (matrixmarket) | fsm (fast_sparse_matrix)')
    parser.add_option(
        '--l1_min', dest='l1_min', type='float',
        help='min l1 constant to try (expected to be a power of 10)')
    parser.add_option(
        '--l1_max', dest='l1_max', type='float',
        help='max l1 constant to try (expected to be a power of 10)')
    parser.add_option(
        '--l2_min', dest='l2_min', type='float',
        help='min l2 constant to try (expected to be a power of 10)')
    parser.add_option(
        '--l2_max', dest='l2_max', type='float',
        help='max l2 constant to try (expected to be a power of 10)')
    parser.add_option(
        '--max_sims', dest='max_sims', type='int', default=2000,
        help='max desired number of positive item similarity weights (default: %default)')
    parser.add_option(
        '--min_sims', dest='min_sims', type='int', default=15,
        help='min desired number of positive item similarity weights (default: %default)')
    parser.add_option(
        '--max_sparse', dest='max_sparse', type='float', default=0.01,
        help='max allowable proportion of items with less than min_sims positive similarity weights (default: %default)')
    parser.add_option(
        '--num_samples', dest='num_samples', type='int', default=100,
        help='number of sample items to evaluate for each regularization setting')
    parser.add_option(
        '--packer', dest='packer', default='json',
        help='packer for IPython.parallel (default: %default)')
    parser.add_option(
        '--add_module_paths', dest='add_module_paths',
        help='comma-separated list of paths to append to pythonpath to enable import of uninstalled modules')

    opts, _ = parser.parse_args()

    # every one of these must be given (and truthy) for the search to run
    required = (opts.dataset, opts.input_format,
                opts.l1_min, opts.l1_max, opts.l2_min, opts.l2_max)
    if not all(required):
        parser.print_help()
        raise SystemExit

    logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(levelname)s: %(message)s')

    dataset = load_fast_sparse_matrix(opts.input_format, opts.dataset)

    grid = {
        'l1_reg': pow_range(opts.l1_min, opts.l1_max),
        'l2_reg': pow_range(opts.l2_min, opts.l2_max),
    }
    item_count = dataset.shape[1]
    sample_items = random.sample(range(item_count), opts.num_samples)

    logging.info('preparing tasks for a grid search of these values:')
    logging.info(grid)
    tasks = []
    for setting in ParameterGrid(grid):
        tasks.append((setting, dataset, opts.min_sims, sample_items))

    client = Client(packer=opts.packer)
    view = client.load_balanced_view()

    if opts.add_module_paths:
        client[:].execute('import sys')
        for module_path in opts.add_module_paths.split(','):
            logging.info('adding {0} to pythonpath on all engines'.format(module_path))
            client[:].execute("sys.path.append('{0}')".format(module_path))

    logging.info('running {0} tasks in parallel...'.format(len(tasks)))
    results = view.map(estimate_sparsity, tasks, ordered=False)

    # keep only the settings that stay within both limits
    candidates = []
    for setting, nsims, nsparse, nneg in results:
        if nsims <= opts.max_sims and nsparse <= opts.max_sparse:
            candidates.append((setting, nsims, nsparse, nneg))

    if not candidates:
        print('no parameter settings satisfied the conditions, try increasing --min_sims, --max_sims or --max_sparse')
        return

    best_setting, mean_positive, sparse_share, mean_negative = min(candidates, key=itemgetter(1))

    print('best parameter setting: {0}'.format(best_setting))
    print('mean # positive similarity weights per item = {0:.3}'.format(mean_positive))
    print('proportion of items with fewer than {0} positive similarity weights = {1:.3}'.format(
        opts.min_sims, sparse_share))
    print('mean # negative similarity weights per item = {0:.3}'.format(mean_negative))
Esempio n. 9
0
File: slim.py Progetto: mayahhf/mrec
    print 'loading test data...'
    data = """\
%%MatrixMarket matrix coordinate real general
3 5 9
1	1	1
1	2	1
1	3	1
1	4	1
2	2	1
2	3	1
2	5	1
3	3	1
3	4	1
"""
    print data
    dataset = load_fast_sparse_matrix('mm',StringIO.StringIO(data))
    num_users,num_items = dataset.shape

    model = SLIM()

    num_samples = 2

    def output(i,j,val):
        # convert back to 1-indexed
        print '{0}\t{1}\t{2:.3f}'.format(i+1,j+1,val)

    print 'computing some item similarities...'
    print 'item\tsim\tweight'
    # if we want we can compute these individually without calling fit()
    for i in random.sample(xrange(num_items),num_samples):
        for j,weight in model.get_similar_items(i,max_similar_items=10,dataset=dataset):
Esempio n. 10
0
    print('loading test data...')
    data = """\
%%MatrixMarket matrix coordinate real general
3 5 9
1	1	1
1	2	1
1	3	1
1	4	1
2	2	1
2	3	1
2	5	1
3	3	1
3	4	1
"""
    print(data)
    dataset = load_fast_sparse_matrix('mm', StringIO.StringIO(data))
    num_users, num_items = dataset.shape

    model = CosineKNNRecommender(k=2)

    num_samples = 2

    def output(i, j, val):
        # convert back to 1-indexed
        print('{0}\t{1}\t{2:.3f}'.format(i + 1, j + 1, val))

    print('computing some item similarities...')
    print('item\tsim\tweight')
    # if we want we can compute these individually without calling fit()
    for i in random.sample(range(num_items), num_samples):
        for j, weight in model.get_similar_items(i, max_similar_items=2, dataset=dataset):
Esempio n. 11
0
def testPredictionMethods(train_filename, eval_item_filename, user_means_filename):
    '''
    Run every prediction approach on the same evaluation items and compare them.

    For each line of the evaluation file a top-N list is built per method;
    the recall of each method and the overlap of each ordered pair of methods
    are added to running totals, and the running averages are logged.

    Returns (recalls, overlaps): recalls is keyed by method name, overlaps by
    'method1_method2'. Both hold sums over all users, not averages.
    '''

    logging.info('testing predictions with data files {0}; {1}; {2}...'.format(train_filename, eval_item_filename, user_means_filename))

    mrec_train_data = load_fast_sparse_matrix('tsv', train_filename)

    mrec_recommender = CosineKNNRecommender(config.NEIGHBOURHOOD_SIZE)
    mrec_recommender.fit(mrec_train_data)

    warp_recommender = WARPMFRecommender(d=50, gamma=0.01, C=100.0)
    warp_recommender.fit(mrec_train_data.X)

    train_data = trainData.TrainData(train_filename, user_means_filename)
    _, _, Q = sparsesvd(train_data.rating_matrix.tocsc(), config.FACTOR_MODEL_SIZE)

    methods = ['mrec', 'warp', 'mf', 'ub_classic', 'ib_classic', 'ub_damping', 'ib_damping', 'ub_non', 'ib_non']
    recalls = {}
    overlaps = {}
    top_recs = {}
    user_counter = 0.0

    # recommenders that share the recommend_items interface, in call order
    mrec_models = (('mrec', mrec_recommender), ('warp', warp_recommender))
    # (key suffix, weighting name) for the user-/item-based variants, in call order
    neighbourhood_variants = (('classic', 'classic'), ('damping', 'self_damping'), ('non', 'non_normalized'))

    with open(eval_item_filename,'rb') as eval_file:
        for line in eval_file:
            fields = line.split('\t')
            user_id = fields[0]
            ground_truth_items = fields[1].split(',')
            random_unrated_items = fields[2].rstrip('\n').split(',')

            evaluation_item_ids = ground_truth_items + random_unrated_items

            # for each prediction method, compute topN recommendations once per user
            for key, recommender in mrec_models:
                scored = recommender.recommend_items(mrec_train_data.X, int(user_id)-config.MREC_INDEX_OFFSET, max_items=10000, return_scores=True)
                top_recs[key] = topNLists.getTopNList(scored, evaluation_item_ids=evaluation_item_ids)

            factor_based = train_data.getFactorBasedRecommendations(user_id, Q, evaluation_item_ids)
            top_recs['mf'] = topNLists.getTopNList(factor_based)

            for suffix, weighting in neighbourhood_variants:
                user_based = train_data.getUserBasedRecommendations(user_id, evaluation_item_ids, weighting)
                top_recs['ub_' + suffix] = topNLists.getTopNList(user_based)

                item_based = train_data.getItemBasedRecommendations(user_id, evaluation_item_ids, weighting)
                top_recs['ib_' + suffix] = topNLists.getTopNList(item_based)

            # then, use the computed topN lists to update recall and overlap values
            for method1 in methods:
                recall = topNLists.getRecall(ground_truth_items, top_recs[method1])
                if method1 in recalls:
                    recalls[method1] += recall
                else:
                    recalls[method1] = recall

                for method2 in methods:
                    dict_key = method1 + '_' + method2
                    overlap = topNLists.computeRecommendationListOverlap(top_recs[method1], top_recs[method2])
                    if dict_key in overlaps:
                        overlaps[dict_key] += overlap
                    else:
                        overlaps[dict_key] = overlap

            user_counter += 1.0
            mean_recalls = [(k, v/user_counter) for k,v in recalls.items()]
            mean_overlaps = [(k, v/user_counter) for k,v in overlaps.items()]
            logging.info('Tested user {0}. Current recalls: {1}. Current overlaps: {2}'.\
                         format(user_id, mean_recalls, mean_overlaps))

    return recalls, overlaps
Esempio n. 12
0
File: train.py Progetto: adw/mrec
def main():
    """
    Command-line entry point: train a recommender for each training file.

    The --train glob is expanded and a model of the type selected by --model
    is trained for every matching file.  The 'popularity' model is fitted
    directly in this process; all other models are handed to a runner
    (WRMFRunner, WARPMFRunner or ItemSimilarityRunner) together with a
    load-balanced view obtained from an IPython.parallel Client.

    Prints the help text and raises SystemExit if --input_format, --train,
    --outdir or --num_engines is missing, or if the model type or the knn
    metric is unknown.
    """

    import os
    import logging
    import glob
    import subprocess
    from optparse import OptionParser
    from IPython.parallel import Client

    from mrec import load_fast_sparse_matrix, save_recommender
    from mrec.item_similarity.slim import SLIM
    from mrec.item_similarity.knn import (CosineKNNRecommender, DotProductKNNRecommender,
                                          AdjustedCosineKNNRecommender, JaccardKNNRecommender)
    from mrec.mf.wrmf import WRMFRecommender
    from mrec.mf.warp import WARPMFRecommender
    from mrec.mf.warp2 import WARP2MFRecommender
    from mrec.popularity import ItemPopularityRecommender
    from mrec.parallel.item_similarity import ItemSimilarityRunner
    from mrec.parallel.wrmf import WRMFRunner
    from mrec.parallel.warp import WARPMFRunner

    logging.basicConfig(level=logging.INFO,format='[%(asctime)s] %(levelname)s: %(message)s')

    parser = OptionParser()
    parser.add_option('-n','--num_engines',dest='num_engines',type='int',default=0,help='number of IPython engines to use')
    parser.add_option('--input_format',dest='input_format',help='format of training dataset(s) tsv | csv | mm (matrixmarket) | fsm (fast_sparse_matrix)')
    parser.add_option('--train',dest='train',help='glob specifying path(s) to training dataset(s) IMPORTANT: must be in quotes if it includes the * wildcard')
    parser.add_option('--outdir',dest='outdir',help='directory for output files')
    parser.add_option('--overwrite',dest='overwrite',action='store_true',help='overwrite existing files in outdir')
    parser.add_option('--model',dest='model',default='slim',help='type of model to train: slim | knn | wrmf | warp | popularity (default: %default)')
    parser.add_option('--max_sims',dest='max_sims',type='int',default=100,help='max similar items to output for each training item (default: %default)')
    parser.add_option('--learner',dest='learner',default='sgd',help='underlying learner for SLIM learner: sgd | elasticnet | fs_sgd (default: %default)')
    parser.add_option('--l1_reg',dest='l1_reg',type='float',default=0.001,help='l1 regularization constant (default: %default)')
    parser.add_option('--l2_reg',dest='l2_reg',type='float',default=0.0001,help='l2 regularization constant (default: %default)')
    parser.add_option('--metric',dest='metric',default='cosine',help='metric for knn recommender: cosine | dot | adjusted_cosine | jaccard (default: %default)')
    parser.add_option('--num_factors',dest='num_factors',type='int',default=80,help='number of latent factors (default: %default)')
    parser.add_option('--alpha',dest='alpha',type='float',default=1.0,help='wrmf confidence constant (default: %default)')
    parser.add_option('--lbda',dest='lbda',type='float',default=0.015,help='wrmf regularization constant (default: %default)')
    parser.add_option('--als_iters',dest='als_iters',type='int',default=15,help='number of als iterations (default: %default)')
    parser.add_option('--gamma',dest='gamma',type='float',default=0.01,help='warp learning rate (default: %default)')
    parser.add_option('--C',dest='C',type='float',default=100.0,help='warp regularization constant (default: %default)')
    parser.add_option('--item_feature_format',dest='item_feature_format',help='format of item features tsv | csv | mm (matrixmarket) | npz (numpy arrays)')
    parser.add_option('--item_features',dest='item_features',help='path to sparse item features in tsv format (item_id,feature_id,val)')
    parser.add_option('--popularity_method',dest='popularity_method',default='count',help='how to compute popularity for baseline recommender: count | sum | avg | thresh (default: %default)')
    parser.add_option('--popularity_thresh',dest='popularity_thresh',type='float',default=0,help='ignore scores below this when computing popularity for baseline recommender (default: %default)')
    parser.add_option('--packer',dest='packer',default='json',help='packer for IPython.parallel (default: %default)')
    parser.add_option('--add_module_paths',dest='add_module_paths',help='optional comma-separated list of paths to append to pythonpath (useful if you need to import uninstalled modules to IPython engines on a cluster)')

    (opts,args) = parser.parse_args()
    if not opts.input_format or not opts.train or not opts.outdir or not opts.num_engines:
        parser.print_help()
        raise SystemExit

    opts.train = os.path.abspath(os.path.expanduser(opts.train))
    opts.outdir = os.path.abspath(os.path.expanduser(opts.outdir))

    trainfiles = glob.glob(opts.train)

    if opts.model == 'popularity':
        # special case, don't need to run in parallel
        subprocess.check_call(['mkdir','-p',opts.outdir])
        for trainfile in trainfiles:
            logging.info('processing {0}...'.format(trainfile))
            model = ItemPopularityRecommender(method=opts.popularity_method,thresh=opts.popularity_thresh)
            dataset = load_fast_sparse_matrix(opts.input_format,trainfile)
            model.fit(dataset)
            modelfile = get_modelfile(trainfile,opts.outdir)
            save_recommender(model,modelfile)
        logging.info('done')
        return

    # create an ipython client
    c = Client(packer=opts.packer)
    view = c.load_balanced_view()

    if opts.add_module_paths:
        c[:].execute('import sys')
        for path in opts.add_module_paths.split(','):
            logging.info('adding {0} to pythonpath on all engines'.format(path))
            c[:].execute("sys.path.append('{0}')".format(path))

    if opts.model == 'slim':
        if opts.learner == 'fs_sgd':
            num_selected_features = 2 * opts.max_sims  # preselect this many candidate similar items
            model = SLIM(l1_reg=opts.l1_reg,l2_reg=opts.l2_reg,model=opts.learner,num_selected_features=num_selected_features)
        else:
            model = SLIM(l1_reg=opts.l1_reg,l2_reg=opts.l2_reg,model=opts.learner)
    elif opts.model == 'knn':
        # map each supported --metric value to its recommender class
        knn_recommenders = {
            'cosine': CosineKNNRecommender,
            'dot': DotProductKNNRecommender,
            'adjusted_cosine': AdjustedCosineKNNRecommender,
            'jaccard': JaccardKNNRecommender,
        }
        if opts.metric not in knn_recommenders:
            parser.print_help()
            raise SystemExit('unknown metric: {0}'.format(opts.metric))
        model = knn_recommenders[opts.metric](k=opts.max_sims)
    elif opts.model == 'wrmf':
        model = WRMFRecommender(d=opts.num_factors,alpha=opts.alpha,lbda=opts.lbda,num_iters=opts.als_iters)
    elif opts.model == 'warp':
        # floor division keeps the factor count an int on Python 3 as well;
        # on Python 2 it gives the same result as "/" for these int options
        num_factors_per_engine = max(opts.num_factors//opts.num_engines,1)
        if opts.item_features:
            model = WARP2MFRecommender(d=num_factors_per_engine,gamma=opts.gamma,C=opts.C)
        else:
            model = WARPMFRecommender(d=num_factors_per_engine,gamma=opts.gamma,C=opts.C)
    else:
        parser.print_help()
        raise SystemExit('unknown model type: {0}'.format(opts.model))

    for trainfile in trainfiles:
        logging.info('processing {0}...'.format(trainfile))
        modelfile = get_modelfile(trainfile,opts.outdir)
        if opts.model == 'wrmf':
            runner = WRMFRunner()
            factorsdir = get_factorsdir(trainfile,opts.outdir)
            runner.run(view,model,opts.input_format,trainfile,opts.num_engines,factorsdir,modelfile)
        elif opts.model == 'warp':
            runner = WARPMFRunner()
            modelsdir = get_modelsdir(trainfile,opts.outdir)
            runner.run(view,model,opts.input_format,trainfile,opts.item_feature_format,opts.item_features,opts.num_engines,modelsdir,opts.overwrite,modelfile)
        else:
            # slim and knn both produce item-item similarities
            runner = ItemSimilarityRunner()
            simsdir = get_simsdir(trainfile,opts.outdir)
            simsfile = get_simsfile(trainfile,opts.outdir)
            runner.run(view,model,opts.input_format,trainfile,opts.num_engines,simsdir,opts.overwrite,opts.max_sims,simsfile,modelfile)
Esempio n. 13
0
    config.SPLIT_DIR = os.path.join(config.PACKAGE_DIR,
                                    '../grid_search_music_splits/')
    config.LABEL_FREQUENCY_THRESHOLD = 10
    if not os.path.exists(config.SPLIT_DIR):
        os.makedirs(config.SPLIT_DIR)

    factor_values = [25, 50, 75, 100]
    C_values = [1.0, 10.0, 100.0, 1000.0]
    gamma_values = [0.01, 0.001, 0.0001]

    filenames = dataPreprocessing.loadData(mode='beyond_accuracy')
    # 5-fold cross-validation
    for iteration, (train_filename, test_filename, user_means_filename,
                    eval_item_filename) in enumerate(filenames, 1):

        mrec_train_data = load_fast_sparse_matrix('tsv', train_filename)
        # create the training data and required recommendation models
        train_data = trainData.TrainData(train_filename, user_means_filename)

        for factor_value, C_value, gamma_value in product(
                factor_values, C_values, gamma_values):

            warp_recommender = WARPMFRecommender(d=factor_value,
                                                 gamma=gamma_value,
                                                 C=C_value)
            warp_recommender.fit(mrec_train_data.X)

            logging.info('running fold {0} with f={1}, C={2}, g={3}...'.format(
                iteration, factor_value, C_value, gamma_value))

            recall = 0
Esempio n. 14
0
def main():
    """
    Command-line entry point: train one recommender per training file.

    Parses the options, builds the model selected with --model and trains it
    on every file matched by the --train glob.  The popularity baseline is
    fitted in this process and saved with save_recommender; every other model
    type is handed to the matching parallel runner together with a
    load-balanced view over the IPython engines.

    Raises SystemExit (after printing the help text) when --input_format,
    --train, --outdir or a non-zero --num_engines is missing, or when --model
    or --metric names an unknown value.
    """

    import os
    import logging
    import glob
    from optparse import OptionParser
    from IPython.parallel import Client

    from mrec import load_fast_sparse_matrix, save_recommender
    from mrec.item_similarity.slim import SLIM
    from mrec.item_similarity.knn import CosineKNNRecommender, DotProductKNNRecommender
    from mrec.mf.wrmf import WRMFRecommender
    from mrec.mf.warp import WARPMFRecommender
    from mrec.mf.warp2 import WARP2MFRecommender
    from mrec.popularity import ItemPopularityRecommender
    from mrec.parallel.item_similarity import ItemSimilarityRunner
    from mrec.parallel.wrmf import WRMFRunner
    from mrec.parallel.warp import WARPMFRunner

    logging.basicConfig(level=logging.INFO,format='[%(asctime)s] %(levelname)s: %(message)s')

    parser = OptionParser()
    parser.add_option('-n','--num_engines',dest='num_engines',type='int',default=0,help='number of IPython engines to use')
    parser.add_option('--input_format',dest='input_format',help='format of training dataset(s) tsv | csv | mm (matrixmarket) | fsm (fast_sparse_matrix)')
    parser.add_option('--train',dest='train',help='glob specifying path(s) to training dataset(s) IMPORTANT: must be in quotes if it includes the * wildcard')
    parser.add_option('--outdir',dest='outdir',help='directory for output files')
    parser.add_option('--overwrite',dest='overwrite',action='store_true',help='overwrite existing files in outdir')
    parser.add_option('--model',dest='model',default='slim',help='type of model to train: slim | knn | wrmf | warp | popularity (default: %default)')
    parser.add_option('--max_sims',dest='max_sims',type='int',default=100,help='max similar items to output for each training item (default: %default)')
    parser.add_option('--learner',dest='learner',default='sgd',help='underlying learner for SLIM learner: sgd | elasticnet | fs_sgd (default: %default)')
    parser.add_option('--l1_reg',dest='l1_reg',type='float',default=0.001,help='l1 regularization constant (default: %default)')
    parser.add_option('--l2_reg',dest='l2_reg',type='float',default=0.0001,help='l2 regularization constant (default: %default)')
    parser.add_option('--metric',dest='metric',default='cosine',help='metric for knn recommender: cosine | dot (default: %default)')
    parser.add_option('--num_factors',dest='num_factors',type='int',default=80,help='number of latent factors (default: %default)')
    parser.add_option('--alpha',dest='alpha',type='float',default=1.0,help='wrmf confidence constant (default: %default)')
    parser.add_option('--lbda',dest='lbda',type='float',default=0.015,help='wrmf regularization constant (default: %default)')
    parser.add_option('--als_iters',dest='als_iters',type='int',default=15,help='number of als iterations (default: %default)')
    parser.add_option('--gamma',dest='gamma',type='float',default=0.01,help='warp learning rate (default: %default)')
    parser.add_option('--C',dest='C',type='float',default=100.0,help='warp regularization constant (default: %default)')
    parser.add_option('--item_feature_format',dest='item_feature_format',help='format of item features tsv | csv | mm (matrixmarket) | npz (numpy arrays)')
    parser.add_option('--item_features',dest='item_features',help='path to sparse item features in tsv format (item_id,feature_id,val)')
    parser.add_option('--popularity_method',dest='popularity_method',default='count',help='how to compute popularity for baseline recommender: count | sum | avg | thresh (default: %default)')
    parser.add_option('--popularity_thresh',dest='popularity_thresh',type='float',default=0,help='ignore scores below this when computing popularity for baseline recommender (default: %default)')
    parser.add_option('--packer',dest='packer',default='json',help='packer for IPython.parallel (default: %default)')
    parser.add_option('--add_module_paths',dest='add_module_paths',help='optional comma-separated list of paths to append to pythonpath (useful if you need to import uninstalled modules to IPython engines on a cluster)')

    (opts,args) = parser.parse_args()
    # num_engines defaults to 0, so it has to be given explicitly and be non-zero
    if not opts.input_format or not opts.train or not opts.outdir or not opts.num_engines:
        parser.print_help()
        raise SystemExit

    opts.train = os.path.abspath(os.path.expanduser(opts.train))
    opts.outdir = os.path.abspath(os.path.expanduser(opts.outdir))

    trainfiles = glob.glob(opts.train)
    if not trainfiles:
        # make an empty match visible instead of silently training nothing
        logging.warning('no training files match {0}'.format(opts.train))

    if opts.model == 'popularity':
        # special case, don't need to run in parallel
        # create the output directory in-process rather than shelling out to mkdir
        if not os.path.isdir(opts.outdir):
            os.makedirs(opts.outdir)
        for trainfile in trainfiles:
            logging.info('processing {0}...'.format(trainfile))
            model = ItemPopularityRecommender(method=opts.popularity_method,thresh=opts.popularity_thresh)
            dataset = load_fast_sparse_matrix(opts.input_format,trainfile)
            model.fit(dataset)
            modelfile = get_modelfile(trainfile,opts.outdir)
            save_recommender(model,modelfile)
        logging.info('done')
        return

    # create an ipython client
    c = Client(packer=opts.packer)
    view = c.load_balanced_view()

    if opts.add_module_paths:
        c[:].execute('import sys')
        for path in opts.add_module_paths.split(','):
            logging.info('adding {0} to pythonpath on all engines'.format(path))
            # !r emits a properly escaped string literal, so quotes or
            # backslashes in the path cannot break the statement run remotely
            c[:].execute("sys.path.append({0!r})".format(path))

    if opts.model == 'slim':
        if opts.learner == 'fs_sgd':
            num_selected_features = 2 * opts.max_sims  # preselect this many candidate similar items
            model = SLIM(l1_reg=opts.l1_reg,l2_reg=opts.l2_reg,model=opts.learner,num_selected_features=num_selected_features)
        else:
            model = SLIM(l1_reg=opts.l1_reg,l2_reg=opts.l2_reg,model=opts.learner)
    elif opts.model == 'knn':
        if opts.metric == 'cosine':
            model = CosineKNNRecommender(k=opts.max_sims)
        elif opts.metric == 'dot':
            model = DotProductKNNRecommender(k=opts.max_sims)
        else:
            parser.print_help()
            raise SystemExit('unknown metric: {0}'.format(opts.metric))
    elif opts.model == 'wrmf':
        model = WRMFRecommender(d=opts.num_factors,alpha=opts.alpha,lbda=opts.lbda,num_iters=opts.als_iters)
    elif opts.model == 'warp':
        # explicit floor division: the factor count must stay an int even
        # where '/' means true division; never drop below one factor
        num_factors_per_engine = max(opts.num_factors//opts.num_engines,1)
        if opts.item_features:
            model = WARP2MFRecommender(d=num_factors_per_engine,gamma=opts.gamma,C=opts.C)
        else:
            model = WARPMFRecommender(d=num_factors_per_engine,gamma=opts.gamma,C=opts.C)
    else:
        parser.print_help()
        raise SystemExit('unknown model type: {0}'.format(opts.model))

    # the same model object is passed to the runner for every training file
    for trainfile in trainfiles:
        logging.info('processing {0}...'.format(trainfile))
        modelfile = get_modelfile(trainfile,opts.outdir)
        if opts.model == 'wrmf':
            runner = WRMFRunner()
            factorsdir = get_factorsdir(trainfile,opts.outdir)
            runner.run(view,model,opts.input_format,trainfile,opts.num_engines,factorsdir,modelfile)
        elif opts.model == 'warp':
            runner = WARPMFRunner()
            modelsdir = get_modelsdir(trainfile,opts.outdir)
            runner.run(view,model,opts.input_format,trainfile,opts.item_feature_format,opts.item_features,opts.num_engines,modelsdir,opts.overwrite,modelfile)
        else:
            # slim and knn both go through the item-similarity runner
            runner = ItemSimilarityRunner()
            simsdir = get_simsdir(trainfile,opts.outdir)
            simsfile = get_simsfile(trainfile,opts.outdir)
            runner.run(view,model,opts.input_format,trainfile,opts.num_engines,simsdir,opts.overwrite,opts.max_sims,simsfile,modelfile)
Esempio n. 15
0
# Preparing format for mrec: replace the raw user and item labels with
# integer ids, shifted by one so that they start at 1
toy = data.copy()

le = preprocessing.LabelEncoder()
user_num = le.fit_transform(toy['user']) + 1
toy['user'] = user_num

le2 = preprocessing.LabelEncoder()
game_num = le2.fit_transform(toy['item']) + 1
toy['item'] = game_num

# converting play count to 0: did not play, 1: did play
# (every row present in the data becomes 1)
binary = toy.copy()
binary['score'] = binary['score'].map(lambda x: 1)

# sort once by user, then deal the rows alternately into train and test
binary_by_user = binary.sort('user')
binary_train = binary_by_user[::2]
binary_test = binary_by_user[1::2]

binary_train.to_csv('../ge10/train.tsv', sep='\t', header = False, index = False)
binary_test.to_csv('../ge10/test.tsv', sep='\t', header = False, index = False)

# load_fast_sparse_matrix takes the input format first, then the file path
dataset = load_fast_sparse_matrix('tsv', "../ge10/train.tsv")
num_users,num_items = dataset.shape
model = SLIM()
# the model has to be trained before it can produce recommendations
model.fit(dataset)
recs = model.batch_recommend_items(dataset.X)