Example #1
0
 def learnModel(self, X):
     """Fit a WARPMFRecommender on X and return its U and V attributes.

     The recommender is built from this object's ``k``, ``alpha``, ``lmbda``,
     ``batchSize`` and ``maxTrials`` attributes, passed positionally in that
     order. The result is the tuple ``(U, V)`` read from the fitted model.
     """
     hyperparams = (self.k, self.alpha, self.lmbda, self.batchSize, self.maxTrials)
     model = WARPMFRecommender(*hyperparams)
     model.fit(X)
     return model.U, model.V
Example #2
0
    def learnModel(self, X):
        """Train a WARPMFRecommender on X and hand back ``(U, V)``.

        Constructor arguments are taken, in order, from ``self.k``,
        ``self.alpha``, ``self.lmbda``, ``self.batchSize`` and
        ``self.maxTrials``.
        """
        recommender = WARPMFRecommender(
            self.k,
            self.alpha,
            self.lmbda,
            self.batchSize,
            self.maxTrials,
        )
        recommender.fit(X)

        factors = (recommender.U, recommender.V)
        return factors
def main():
    """Train a reranking recommender and save it to a file.

    Expects three positional command-line arguments:

    1. the input file format handed to ``load_sparse_matrix``,
    2. the path of the training data,
    3. the path the fitted recommender is saved to.

    A ``CosineKNNRecommender`` and a ``WARPMFRecommender`` are combined in a
    ``RerankingRecommender``, which is fitted on the training matrix and then
    written out with ``save_recommender``.

    Raises:
        SystemExit: with a usage message when fewer than three arguments
            are supplied.
    """
    import sys

    # Validate the command line before the heavier project imports so that a
    # missing argument yields a usage message rather than a bare IndexError.
    if len(sys.argv) < 4:
        prog = sys.argv[0] if sys.argv else 'train'
        raise SystemExit(
            'usage: {0} <file_format> <filepath> <outfile>'.format(prog))

    from mrec import load_sparse_matrix, save_recommender
    from mrec.sparse import fast_sparse_matrix
    from mrec.item_similarity.knn import CosineKNNRecommender
    from mrec.mf.warp import WARPMFRecommender
    from mrec.reranking_recommender import RerankingRecommender

    file_format, filepath, outfile = sys.argv[1:4]

    # load training set as scipy sparse matrix
    train = load_sparse_matrix(file_format, filepath)

    item_sim_model = CosineKNNRecommender(k=100)
    mf_model = WARPMFRecommender(d=80,
                                 gamma=0.01,
                                 C=100.0,
                                 max_iters=25000,
                                 validation_iters=1000,
                                 batch_size=10)
    recommender = RerankingRecommender(item_sim_model,
                                       mf_model,
                                       num_candidates=100)

    recommender.fit(train)

    save_recommender(recommender, outfile)
Example #4
0
def testPredictionMethods(train_filename, eval_item_filename, user_means_filename):
    '''
    Compare the top-N lists produced by every prediction method.

    Each line of the evaluation file is split on tabs into a user id, a
    comma-separated list of ground-truth items and a comma-separated list of
    random unrated items. For that user a top-N list is built once per method,
    then the recall of every method and the overlap of every ordered pair of
    methods are added to running totals.

    Returns the tuple (recalls, overlaps). Both dicts hold sums accumulated
    over all users; the per-user averages are only written to the log.
    '''
    logging.info('testing predictions with data files {0}; {1}; {2}...'.format(
        train_filename, eval_item_filename, user_means_filename))

    # models fitted on the mrec representation of the training data
    mrec_train_data = load_fast_sparse_matrix('tsv', train_filename)

    mrec_recommender = CosineKNNRecommender(config.NEIGHBOURHOOD_SIZE)
    mrec_recommender.fit(mrec_train_data)

    warp_recommender = WARPMFRecommender(d=50, gamma=0.01, C=100.0)
    warp_recommender.fit(mrec_train_data.X)

    # models backed by the project's own training data wrapper
    train_data = trainData.TrainData(train_filename, user_means_filename)
    _, _, Q = sparsesvd(train_data.rating_matrix.tocsc(), config.FACTOR_MODEL_SIZE)

    # (result key, fitted recommender), evaluated in this order
    library_models = (
        ('mrec', mrec_recommender),
        ('warp', warp_recommender),
    )
    # (result key, prediction routine, mode argument), evaluated in this order
    neighbourhood_runs = (
        ('ub_classic', train_data.getUserBasedRecommendations, 'classic'),
        ('ib_classic', train_data.getItemBasedRecommendations, 'classic'),
        ('ub_damping', train_data.getUserBasedRecommendations, 'self_damping'),
        ('ib_damping', train_data.getItemBasedRecommendations, 'self_damping'),
        ('ub_non', train_data.getUserBasedRecommendations, 'non_normalized'),
        ('ib_non', train_data.getItemBasedRecommendations, 'non_normalized'),
    )

    recalls, overlaps, top_recs = {}, {}, {}
    user_counter = 0.0
    methods = ['mrec', 'warp', 'mf', 'ub_classic', 'ib_classic', 'ub_damping', 'ib_damping', 'ub_non', 'ib_non']

    # NOTE(review): the file is read in binary mode but split on str
    # separators; under Python 3 that would need text mode - confirm the
    # interpreter this is meant to run on.
    with open(eval_item_filename,'rb') as eval_file:
        for line in eval_file:
            fields = line.split('\t')
            user_id = fields[0]
            ground_truth_items = fields[1].split(',')
            random_unrated_items = fields[2].rstrip('\n').split(',')

            evaluation_item_ids = ground_truth_items + random_unrated_items

            # build the topN list of every method once per user
            for label, model in library_models:
                scored = model.recommend_items(mrec_train_data.X,
                                               int(user_id)-config.MREC_INDEX_OFFSET,
                                               max_items=10000,
                                               return_scores=True)
                top_recs[label] = topNLists.getTopNList(scored, evaluation_item_ids=evaluation_item_ids)

            factor_predictions = train_data.getFactorBasedRecommendations(user_id, Q, evaluation_item_ids)
            top_recs['mf'] = topNLists.getTopNList(factor_predictions)

            for label, predict, mode in neighbourhood_runs:
                top_recs[label] = topNLists.getTopNList(predict(user_id, evaluation_item_ids, mode))

            # fold this user's lists into the running recall / overlap totals
            for method1 in methods:
                recall_gain = topNLists.getRecall(ground_truth_items, top_recs[method1])
                if method1 not in recalls:
                    recalls[method1] = recall_gain
                else:
                    recalls[method1] += recall_gain

                for method2 in methods:
                    dict_key = method1 + '_' + method2
                    overlap_gain = topNLists.computeRecommendationListOverlap(top_recs[method1], top_recs[method2])
                    if dict_key not in overlaps:
                        overlaps[dict_key] = overlap_gain
                    else:
                        overlaps[dict_key] += overlap_gain

            user_counter += 1.0
            mean_recalls = [(k, v/user_counter) for k,v in recalls.items()]
            mean_overlaps = [(k, v/user_counter) for k,v in overlaps.items()]
            logging.info('Tested user {0}. Current recalls: {1}. Current overlaps: {2}'.format(
                user_id, mean_recalls, mean_overlaps))

    return recalls, overlaps
Example #5
0
    # Command-line interface: two optional split directories plus the number
    # of splits expected in each of them.
    parser = OptionParser()
    parser.add_option('-m', '--main_split_dir', dest='main_split_dir',
                      help='directory containing 50/50 splits for main evaluation')
    parser.add_option('-l', '--loo_split_dir', dest='loo_split_dir',
                      help='directory containing LOO splits for hit rate evaluation')
    parser.add_option('-n', '--num_splits', dest='num_splits', type='int', default=5,
                      help='number of splits in each directory (default: %default)')

    (opts, args) = parser.parse_args()
    # Print the usage text and exit when neither split directory was given or
    # when num_splits is zero.
    if not (opts.main_split_dir or opts.loo_split_dir) or not opts.num_splits:
        parser.print_help()
        raise SystemExit

    print('doing a grid search for regularization parameters...')
    # One WARPMFRecommender is built per entry produced by ParameterGrid.
    # NOTE(review): every key maps to a single-element list, so this
    # presumably expands to exactly one model - confirm against the
    # ParameterGrid implementation in use.
    # NOTE(review): the key is spelled 'max_iter'; confirm that this is the
    # keyword WARPMFRecommender actually accepts.
    params = {'d': [100], 'gamma': [0.01], 'C': [100], 'max_iter': [100000], 'validation_iters': [500]}
    models = [WARPMFRecommender(**a) for a in ParameterGrid(params)]

    # NOTE(review): this loop contains only comments, so the code does not
    # parse as written; it reads like an unfinished sketch of the split
    # loading described in the comments below. It also iterates over the bare
    # name `glob` - confirm what that name is bound to.
    for train in glob:
        # get test
        # load em both up
        # put them into something that returns train,test.keys(),test in a generator()
        # test is a dict id->[id,id,...]

    # Main evaluation: only runs when a main split directory was supplied.
    if opts.main_split_dir:
        generate_main_metrics = generate_metrics(get_known_items_from_dict, compute_main_metrics)
        main_metrics = run_evaluation(models,
                                      retrain_recommender,
                                      load_splits(opts.main_split_dir, opts.num_splits),
                                      opts.num_splits,
                                      generate_main_metrics)
        print_report(models, main_metrics)
Example #6
0
    # Candidate values for the `gamma` argument of WARPMFRecommender.
    gamma_values = [0.01, 0.001, 0.0001]

    filenames = dataPreprocessing.loadData(mode='beyond_accuracy')
    # 5-fold cross-validation
    # `iteration` is the 1-based fold number; each entry of `filenames`
    # unpacks into the four file names belonging to that fold.
    for iteration, (train_filename, test_filename, user_means_filename,
                    eval_item_filename) in enumerate(filenames, 1):

        mrec_train_data = load_fast_sparse_matrix('tsv', train_filename)
        # create the training data and required recommendation models
        train_data = trainData.TrainData(train_filename, user_means_filename)

        # Try every combination of factor count, C and gamma.
        # (factor_values and C_values are defined outside the visible code.)
        for factor_value, C_value, gamma_value in product(
                factor_values, C_values, gamma_values):

            warp_recommender = WARPMFRecommender(d=factor_value,
                                                 gamma=gamma_value,
                                                 C=C_value)
            warp_recommender.fit(mrec_train_data.X)

            # Note: this message is emitted after the model has been fitted.
            logging.info('running fold {0} with f={1}, C={2}, g={3}...'.format(
                iteration, factor_value, C_value, gamma_value))

            # Accumulators for this parameter combination; they are updated
            # further down, outside the visible code.
            recall = 0
            evaluation_cases = 0
            with open(eval_item_filename, 'rb') as eval_file:

                # Each line is tab-separated: user id, comma-separated
                # ground-truth items, comma-separated random unrated items.
                for line in eval_file:
                    data = line.split('\t')
                    user_id = data[0]
                    ground_truth_items = data[1].split(',')
                    random_unrated_items = data[2].rstrip('\n').split(',')
Example #7
0
def main():
    """Train one recommender per training file, in parallel on IPython engines.

    Command-line options select the model type (slim, knn, wrmf, warp or
    popularity) and its hyper-parameters. ``--input_format``, ``--train``,
    ``--outdir`` and a non-zero ``--num_engines`` are all required; otherwise
    the usage text is printed and the program exits.

    The popularity model is fitted in-process and saved directly. Every other
    model is dispatched through the matching runner (``WRMFRunner``,
    ``WARPMFRunner`` or ``ItemSimilarityRunner``) using a load-balanced view
    of an ``IPython.parallel`` client.

    Raises:
        SystemExit: when required options are missing, or when ``--model`` or
            ``--metric`` names an unknown choice.
    """

    import os
    import logging
    import glob
    import subprocess
    from optparse import OptionParser
    from IPython.parallel import Client

    from mrec import load_fast_sparse_matrix, save_recommender
    from mrec.item_similarity.slim import SLIM
    from mrec.item_similarity.knn import CosineKNNRecommender, DotProductKNNRecommender
    from mrec.mf.wrmf import WRMFRecommender
    from mrec.mf.warp import WARPMFRecommender
    from mrec.mf.warp2 import WARP2MFRecommender
    from mrec.popularity import ItemPopularityRecommender
    from mrec.parallel.item_similarity import ItemSimilarityRunner
    from mrec.parallel.wrmf import WRMFRunner
    from mrec.parallel.warp import WARPMFRunner

    logging.basicConfig(level=logging.INFO,format='[%(asctime)s] %(levelname)s: %(message)s')

    parser = OptionParser()
    parser.add_option('-n','--num_engines',dest='num_engines',type='int',default=0,help='number of IPython engines to use')
    parser.add_option('--input_format',dest='input_format',help='format of training dataset(s) tsv | csv | mm (matrixmarket) | fsm (fast_sparse_matrix)')
    parser.add_option('--train',dest='train',help='glob specifying path(s) to training dataset(s) IMPORTANT: must be in quotes if it includes the * wildcard')
    parser.add_option('--outdir',dest='outdir',help='directory for output files')
    parser.add_option('--overwrite',dest='overwrite',action='store_true',help='overwrite existing files in outdir')
    parser.add_option('--model',dest='model',default='slim',help='type of model to train: slim | knn | wrmf | warp | popularity (default: %default)')
    parser.add_option('--max_sims',dest='max_sims',type='int',default=100,help='max similar items to output for each training item (default: %default)')
    parser.add_option('--learner',dest='learner',default='sgd',help='underlying learner for SLIM learner: sgd | elasticnet | fs_sgd (default: %default)')
    parser.add_option('--l1_reg',dest='l1_reg',type='float',default=0.001,help='l1 regularization constant (default: %default)')
    parser.add_option('--l2_reg',dest='l2_reg',type='float',default=0.0001,help='l2 regularization constant (default: %default)')
    parser.add_option('--metric',dest='metric',default='cosine',help='metric for knn recommender: cosine | dot (default: %default)')
    parser.add_option('--num_factors',dest='num_factors',type='int',default=80,help='number of latent factors (default: %default)')
    parser.add_option('--alpha',dest='alpha',type='float',default=1.0,help='wrmf confidence constant (default: %default)')
    parser.add_option('--lbda',dest='lbda',type='float',default=0.015,help='wrmf regularization constant (default: %default)')
    parser.add_option('--als_iters',dest='als_iters',type='int',default=15,help='number of als iterations (default: %default)')
    parser.add_option('--gamma',dest='gamma',type='float',default=0.01,help='warp learning rate (default: %default)')
    parser.add_option('--C',dest='C',type='float',default=100.0,help='warp regularization constant (default: %default)')
    parser.add_option('--item_feature_format',dest='item_feature_format',help='format of item features tsv | csv | mm (matrixmarket) | npz (numpy arrays)')
    parser.add_option('--item_features',dest='item_features',help='path to sparse item features in tsv format (item_id,feature_id,val)')
    parser.add_option('--popularity_method',dest='popularity_method',default='count',help='how to compute popularity for baseline recommender: count | sum | avg | thresh (default: %default)')
    parser.add_option('--popularity_thresh',dest='popularity_thresh',type='float',default=0,help='ignore scores below this when computing popularity for baseline recommender (default: %default)')
    parser.add_option('--packer',dest='packer',default='json',help='packer for IPython.parallel (default: %default)')
    parser.add_option('--add_module_paths',dest='add_module_paths',help='optional comma-separated list of paths to append to pythonpath (useful if you need to import uninstalled modules to IPython engines on a cluster)')

    (opts,args) = parser.parse_args()
    # num_engines defaults to 0, so it has to be given explicitly (and be
    # non-zero) for the check below to pass.
    if not opts.input_format or not opts.train or not opts.outdir or not opts.num_engines:
        parser.print_help()
        raise SystemExit

    opts.train = os.path.abspath(os.path.expanduser(opts.train))
    opts.outdir = os.path.abspath(os.path.expanduser(opts.outdir))

    trainfiles = glob.glob(opts.train)

    if opts.model == 'popularity':
        # special case, don't need to run in parallel
        subprocess.check_call(['mkdir','-p',opts.outdir])
        for trainfile in trainfiles:
            logging.info('processing {0}...'.format(trainfile))
            model = ItemPopularityRecommender(method=opts.popularity_method,thresh=opts.popularity_thresh)
            dataset = load_fast_sparse_matrix(opts.input_format,trainfile)
            model.fit(dataset)
            modelfile = get_modelfile(trainfile,opts.outdir)
            save_recommender(model,modelfile)
        logging.info('done')
        return

    # create an ipython client
    c = Client(packer=opts.packer)
    view = c.load_balanced_view()

    if opts.add_module_paths:
        # make the extra module paths importable on every engine
        c[:].execute('import sys')
        for path in opts.add_module_paths.split(','):
            logging.info('adding {0} to pythonpath on all engines'.format(path))
            c[:].execute("sys.path.append('{0}')".format(path))

    if opts.model == 'slim':
        if opts.learner == 'fs_sgd':
            num_selected_features = 2 * opts.max_sims  # preselect this many candidate similar items
            model = SLIM(l1_reg=opts.l1_reg,l2_reg=opts.l2_reg,model=opts.learner,num_selected_features=num_selected_features)
        else:
            model = SLIM(l1_reg=opts.l1_reg,l2_reg=opts.l2_reg,model=opts.learner)
    elif opts.model == 'knn':
        if opts.metric == 'cosine':
            model = CosineKNNRecommender(k=opts.max_sims)
        elif opts.metric == 'dot':
            model = DotProductKNNRecommender(k=opts.max_sims)
        else:
            parser.print_help()
            raise SystemExit('unknown metric: {0}'.format(opts.metric))
    elif opts.model == 'wrmf':
        model = WRMFRecommender(d=opts.num_factors,alpha=opts.alpha,lbda=opts.lbda,num_iters=opts.als_iters)
    elif opts.model == 'warp':
        # Floor division keeps the factor count an integer under true
        # division as well (Python 3 or `from __future__ import division`);
        # for ints it gives the same result as classic Python 2 `/`.
        num_factors_per_engine = max(opts.num_factors//opts.num_engines,1)
        if opts.item_features:
            model = WARP2MFRecommender(d=num_factors_per_engine,gamma=opts.gamma,C=opts.C)
        else:
            model = WARPMFRecommender(d=num_factors_per_engine,gamma=opts.gamma,C=opts.C)
    else:
        parser.print_help()
        raise SystemExit('unknown model type: {0}'.format(opts.model))

    # Train one model per input file with the runner matching the model type.
    for trainfile in trainfiles:
        logging.info('processing {0}...'.format(trainfile))
        modelfile = get_modelfile(trainfile,opts.outdir)
        if opts.model == 'wrmf':
            runner = WRMFRunner()
            factorsdir = get_factorsdir(trainfile,opts.outdir)
            runner.run(view,model,opts.input_format,trainfile,opts.num_engines,factorsdir,modelfile)
        elif opts.model == 'warp':
            runner = WARPMFRunner()
            modelsdir = get_modelsdir(trainfile,opts.outdir)
            runner.run(view,model,opts.input_format,trainfile,opts.item_feature_format,opts.item_features,opts.num_engines,modelsdir,opts.overwrite,modelfile)
        else:
            # slim and knn are both item-similarity models
            runner = ItemSimilarityRunner()
            simsdir = get_simsdir(trainfile,opts.outdir)
            simsfile = get_simsfile(trainfile,opts.outdir)
            runner.run(view,model,opts.input_format,trainfile,opts.num_engines,simsdir,opts.overwrite,opts.max_sims,simsfile,modelfile)
 
 # create the training data and required recommendation models
 train_data = trainData.TrainData(train_filename, user_means_filename)
 
 # Only the branch matching options.algorithm fills these in; they stay None
 # for the algorithms that do not need them.
 Q = None
 library_recommender = None
 
 if options.algorithm == 'mf':
     # keep only the third value returned by sparsesvd
     _, _, Q = sparsesvd(train_data.rating_matrix.tocsc(), config.FACTOR_MODEL_SIZE)
 elif options.algorithm == 'mrec':
     mrec_train_data = load_fast_sparse_matrix('tsv', train_filename)
     library_recommender = CosineKNNRecommender(config.NEIGHBOURHOOD_SIZE)
     library_recommender.fit(mrec_train_data)
 elif options.algorithm == 'warp':
     mrec_train_data = load_fast_sparse_matrix('tsv', train_filename)
     library_recommender = WARPMFRecommender(d=config.FACTOR_MODEL_SIZE, gamma=0.01, C=options.cvalue)
     # unlike the 'mrec' branch, this model is fitted on the .X attribute
     library_recommender.fit(mrec_train_data.X)
 elif options.algorithm in ['ub', 'ib']:
     # nothing to prepare here for the ub / ib algorithms
     pass
 else:
     raise ValueError('Wrong rec. algorithm entered. Choose between ub, ib, mf, mrec, and warp')
 
 
 # run the beyondAccuracy for all users in the .eval file
 logging.info('running beyondAccuracy with {0}...'.format(eval_item_filename))
 evaluation_cases = 0
 with open(eval_item_filename,'rb') as eval_file:
     
     # each line is tab-separated and its first field is the user id
     for line in eval_file:
         data = line.split('\t')
         user_id = data[0]