def main():
    import sys
    from mrec import load_sparse_matrix, save_recommender
    from mrec.sparse import fast_sparse_matrix
    from mrec.item_similarity.knn import CosineKNNRecommender
    from mrec.mf.warp import WARPMFRecommender
    from mrec.reranking_recommender import RerankingRecommender

    file_format = sys.argv[1]
    filepath = sys.argv[2]
    outfile = sys.argv[3]

    # load training set as scipy sparse matrix
    train = load_sparse_matrix(file_format, filepath)

    item_sim_model = CosineKNNRecommender(k=100)
    mf_model = WARPMFRecommender(d=80,
                                 gamma=0.01,
                                 C=100.0,
                                 max_iters=25000,
                                 validation_iters=1000,
                                 batch_size=10)
    recommender = RerankingRecommender(item_sim_model,
                                       mf_model,
                                       num_candidates=100)

    recommender.fit(train)

    save_recommender(recommender, outfile)
Ejemplo n.º 2
0
 def learnModel(self, X): 
     self.X = X 
     self.learner = CosineKNNRecommender(self.k)
     self.learner.fit(X)
Ejemplo n.º 3
0
class KNNRecommender(object): 
    """
    A wrapper for the mrec class CosineKNNRecommender. 
    """
    
    def __init__(self, k): 
        self.k = k 
                
        self.folds = 3
        self.numAucSamples = 100
        self.numProcesses = multiprocessing.cpu_count()
        self.chunkSize = 1
        
    def learnModel(self, X): 
        self.X = X 
        self.learner = CosineKNNRecommender(self.k)
        self.learner.fit(X)
    
    def predict(self, maxItems):
        orderedItems = self.learner.batch_recommend_items(self.X, maxItems, return_scores=False)
        orderedItems = numpy.array(orderedItems)
        return orderedItems 
        
    def modelSelect(self, X): 
        """
        Perform model selection on X and return the best parameters. 
        """
        m, n = X.shape
        cvInds = Sampling.randCrossValidation(self.folds, X.nnz)
        precisions = numpy.zeros((self.ks.shape[0], len(cvInds)))
        
        logging.debug("Performing model selection")
        paramList = []        
        
        for icv, (trainInds, testInds) in enumerate(cvInds):
            Util.printIteration(icv, 1, self.folds, "Fold: ")

            trainX = SparseUtils.submatrix(X, trainInds)
            testX = SparseUtils.submatrix(X, testInds)
            
            testOmegaList = SparseUtils.getOmegaList(testX)
            
            for i, k in enumerate(self.ks): 
                learner = self.copy()
                learner.k = k
                paramList.append((trainX, testX, testOmegaList, learner))
                    
        #pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100)
        #resultsIterator = pool.imap(computePrecision, paramList, self.chunkSize)
        import itertools
        resultsIterator = itertools.imap(computePrecision, paramList)
        
        for icv, (trainInds, testInds) in enumerate(cvInds):        
            for i, k in enumerate(self.ks): 
                tempPrecision = resultsIterator.next()
                precisions[i, icv] = tempPrecision
        
        #pool.terminate()
        
        meanPrecisions = numpy.mean(precisions, 1)
        stdPrecisions = numpy.std(precisions, 1)
        
        logging.debug(meanPrecisions)
        
        k = self.ks[numpy.argmax(meanPrecisions)]

        
        logging.debug("Model parameters: k=" + str(k)) 
        
        self.k = k 
        
        return meanPrecisions, stdPrecisions
    
    def copy(self): 
        learner = KNNRecommender(self.k)
        learner.ks = self.ks
        learner.folds = self.folds 
        learner.numAucSamples = self.numAucSamples
        
        return learner 

    def __str__(self): 
        outputStr = "KnnRecommender: k=" + str(self.k) 
        outputStr += " numAucSamples=" + str(self.numAucSamples)
        
        return outputStr         
Ejemplo n.º 4
0
def testPredictionMethods(train_filename, eval_item_filename, user_means_filename):
    '''
    compare predictions generated by the different approaches
    computes pairwise list overlap and average recall for each method
    '''
    
    logging.info('testing predictions with data files {0}; {1}; {2}...'.format(train_filename, eval_item_filename, user_means_filename))
    
    
    mrec_train_data = load_fast_sparse_matrix('tsv', train_filename)
    
    mrec_recommender = CosineKNNRecommender(config.NEIGHBOURHOOD_SIZE)
    mrec_recommender.fit(mrec_train_data)
    
    warp_recommender = WARPMFRecommender(d=50, gamma=0.01, C=100.0)
    warp_recommender.fit(mrec_train_data.X)
    
    train_data = trainData.TrainData(train_filename, user_means_filename)
    _, _, Q = sparsesvd(train_data.rating_matrix.tocsc(), config.FACTOR_MODEL_SIZE)
    
    recalls = {}
    overlaps = {}
    top_recs = {}
    user_counter = 0.0
    methods = ['mrec', 'warp', 'mf', 'ub_classic', 'ib_classic', 'ub_damping', 'ib_damping', 'ub_non', 'ib_non']
    
    with open(eval_item_filename,'rb') as eval_file:
        for line in eval_file:
            data = line.split('\t')
            user_id = data[0]
            ground_truth_items = data[1].split(',')
            random_unrated_items = data[2].rstrip('\n').split(',')
            
            evaluation_item_ids = ground_truth_items + random_unrated_items
            
            # for each prediction method, compute topN recommendations once per user
            predictions1 = mrec_recommender.recommend_items(mrec_train_data.X, int(user_id)-config.MREC_INDEX_OFFSET, max_items=10000, return_scores=True)
            top_recs['mrec'] = topNLists.getTopNList(predictions1, evaluation_item_ids=evaluation_item_ids)
            
            predictions2 = warp_recommender.recommend_items(mrec_train_data.X, int(user_id)-config.MREC_INDEX_OFFSET, max_items=10000, return_scores=True)
            top_recs['warp'] = topNLists.getTopNList(predictions2, evaluation_item_ids=evaluation_item_ids)
            
            predictions3 = train_data.getFactorBasedRecommendations(user_id, Q, evaluation_item_ids)
            top_recs['mf'] = topNLists.getTopNList(predictions3)
            
            predictions4 = train_data.getUserBasedRecommendations(user_id, evaluation_item_ids, 'classic')
            top_recs['ub_classic'] = topNLists.getTopNList(predictions4)
            
            predictions5 = train_data.getItemBasedRecommendations(user_id, evaluation_item_ids, 'classic')
            top_recs['ib_classic'] = topNLists.getTopNList(predictions5)
            
            predictions6 = train_data.getUserBasedRecommendations(user_id, evaluation_item_ids, 'self_damping')
            top_recs['ub_damping'] = topNLists.getTopNList(predictions6)
            
            predictions7 = train_data.getItemBasedRecommendations(user_id, evaluation_item_ids, 'self_damping')
            top_recs['ib_damping'] = topNLists.getTopNList(predictions7)
            
            predictions8 = train_data.getUserBasedRecommendations(user_id, evaluation_item_ids, 'non_normalized')
            top_recs['ub_non'] = topNLists.getTopNList(predictions8)
            
            predictions9 = train_data.getItemBasedRecommendations(user_id, evaluation_item_ids, 'non_normalized')
            top_recs['ib_non'] = topNLists.getTopNList(predictions9)
            
            # then, use the computed topN lists to update recall and overlap values
            for method1 in methods:
                if method1 in recalls:
                    recalls[method1] += topNLists.getRecall(ground_truth_items, top_recs[method1])
                else:
                    recalls[method1] = topNLists.getRecall(ground_truth_items, top_recs[method1])
                
                for method2 in methods:
                    dict_key = method1 + '_' + method2
                    if dict_key in overlaps:
                        overlaps[dict_key] += topNLists.computeRecommendationListOverlap(top_recs[method1], top_recs[method2])
                    else:
                        overlaps[dict_key] = topNLists.computeRecommendationListOverlap(top_recs[method1], top_recs[method2])
            
            user_counter += 1.0
            logging.info('Tested user {0}. Current recalls: {1}. Current overlaps: {2}'.\
                         format(user_id, [(k, v/user_counter) for k,v in recalls.items()], [(k, v/user_counter) for k,v in overlaps.items()]))
            
    return recalls, overlaps
Ejemplo n.º 5
0
def main():

    import os
    import logging
    import glob
    import subprocess
    from optparse import OptionParser
    from IPython.parallel import Client

    from mrec import load_fast_sparse_matrix, save_recommender
    from mrec.item_similarity.slim import SLIM
    from mrec.item_similarity.knn import CosineKNNRecommender, DotProductKNNRecommender
    from mrec.mf.wrmf import WRMFRecommender
    from mrec.mf.warp import WARPMFRecommender
    from mrec.mf.warp2 import WARP2MFRecommender
    from mrec.popularity import ItemPopularityRecommender
    from mrec.parallel.item_similarity import ItemSimilarityRunner
    from mrec.parallel.wrmf import WRMFRunner
    from mrec.parallel.warp import WARPMFRunner

    logging.basicConfig(level=logging.INFO,format='[%(asctime)s] %(levelname)s: %(message)s')

    parser = OptionParser()
    parser.add_option('-n','--num_engines',dest='num_engines',type='int',default=0,help='number of IPython engines to use')
    parser.add_option('--input_format',dest='input_format',help='format of training dataset(s) tsv | csv | mm (matrixmarket) | fsm (fast_sparse_matrix)')
    parser.add_option('--train',dest='train',help='glob specifying path(s) to training dataset(s) IMPORTANT: must be in quotes if it includes the * wildcard')
    parser.add_option('--outdir',dest='outdir',help='directory for output files')
    parser.add_option('--overwrite',dest='overwrite',action='store_true',help='overwrite existing files in outdir')
    parser.add_option('--model',dest='model',default='slim',help='type of model to train: slim | knn | wrmf | warp | popularity (default: %default)')
    parser.add_option('--max_sims',dest='max_sims',type='int',default=100,help='max similar items to output for each training item (default: %default)')
    parser.add_option('--learner',dest='learner',default='sgd',help='underlying learner for SLIM learner: sgd | elasticnet | fs_sgd (default: %default)')
    parser.add_option('--l1_reg',dest='l1_reg',type='float',default=0.001,help='l1 regularization constant (default: %default)')
    parser.add_option('--l2_reg',dest='l2_reg',type='float',default=0.0001,help='l2 regularization constant (default: %default)')
    parser.add_option('--metric',dest='metric',default='cosine',help='metric for knn recommender: cosine | dot (default: %default)')
    parser.add_option('--num_factors',dest='num_factors',type='int',default=80,help='number of latent factors (default: %default)')
    parser.add_option('--alpha',dest='alpha',type='float',default=1.0,help='wrmf confidence constant (default: %default)')
    parser.add_option('--lbda',dest='lbda',type='float',default=0.015,help='wrmf regularization constant (default: %default)')
    parser.add_option('--als_iters',dest='als_iters',type='int',default=15,help='number of als iterations (default: %default)')
    parser.add_option('--gamma',dest='gamma',type='float',default=0.01,help='warp learning rate (default: %default)')
    parser.add_option('--C',dest='C',type='float',default=100.0,help='warp regularization constant (default: %default)')
    parser.add_option('--item_feature_format',dest='item_feature_format',help='format of item features tsv | csv | mm (matrixmarket) | npz (numpy arrays)')
    parser.add_option('--item_features',dest='item_features',help='path to sparse item features in tsv format (item_id,feature_id,val)')
    parser.add_option('--popularity_method',dest='popularity_method',default='count',help='how to compute popularity for baseline recommender: count | sum | avg | thresh (default: %default)')
    parser.add_option('--popularity_thresh',dest='popularity_thresh',type='float',default=0,help='ignore scores below this when computing popularity for baseline recommender (default: %default)')
    parser.add_option('--packer',dest='packer',default='json',help='packer for IPython.parallel (default: %default)')
    parser.add_option('--add_module_paths',dest='add_module_paths',help='optional comma-separated list of paths to append to pythonpath (useful if you need to import uninstalled modules to IPython engines on a cluster)')

    (opts,args) = parser.parse_args()
    if not opts.input_format or not opts.train or not opts.outdir or not opts.num_engines:
        parser.print_help()
        raise SystemExit

    opts.train = os.path.abspath(os.path.expanduser(opts.train))
    opts.outdir = os.path.abspath(os.path.expanduser(opts.outdir))

    trainfiles = glob.glob(opts.train)

    if opts.model == 'popularity':
        # special case, don't need to run in parallel
        subprocess.check_call(['mkdir','-p',opts.outdir])
        for trainfile in trainfiles:
            logging.info('processing {0}...'.format(trainfile))
            model = ItemPopularityRecommender(method=opts.popularity_method,thresh=opts.popularity_thresh)
            dataset = load_fast_sparse_matrix(opts.input_format,trainfile)
            model.fit(dataset)
            modelfile = get_modelfile(trainfile,opts.outdir)
            save_recommender(model,modelfile)
        logging.info('done')
        return

    # create an ipython client
    c = Client(packer=opts.packer)
    view = c.load_balanced_view()

    if opts.add_module_paths:
        c[:].execute('import sys')
        for path in opts.add_module_paths.split(','):
            logging.info('adding {0} to pythonpath on all engines'.format(path))
            c[:].execute("sys.path.append('{0}')".format(path))

    if opts.model == 'slim':
        if opts.learner == 'fs_sgd':
            num_selected_features = 2 * opts.max_sims  # preselect this many candidate similar items
            model = SLIM(l1_reg=opts.l1_reg,l2_reg=opts.l2_reg,model=opts.learner,num_selected_features=num_selected_features)
        else:
            model = SLIM(l1_reg=opts.l1_reg,l2_reg=opts.l2_reg,model=opts.learner)
    elif opts.model == 'knn':
        if opts.metric == 'cosine':
            model = CosineKNNRecommender(k=opts.max_sims)
        elif opts.metric == 'dot':
            model = DotProductKNNRecommender(k=opts.max_sims)
        else:
            parser.print_help()
            raise SystemExit('unknown metric: {0}'.format(opts.metric))
    elif opts.model == 'wrmf':
        model = WRMFRecommender(d=opts.num_factors,alpha=opts.alpha,lbda=opts.lbda,num_iters=opts.als_iters)
    elif opts.model == 'warp':
        num_factors_per_engine = max(opts.num_factors/opts.num_engines,1)
        if opts.item_features:
            model = WARP2MFRecommender(d=num_factors_per_engine,gamma=opts.gamma,C=opts.C)
        else:
            model = WARPMFRecommender(d=num_factors_per_engine,gamma=opts.gamma,C=opts.C)
    else:
        parser.print_help()
        raise SystemExit('unknown model type: {0}'.format(opts.model))

    for trainfile in trainfiles:
        logging.info('processing {0}...'.format(trainfile))
        modelfile = get_modelfile(trainfile,opts.outdir)
        if opts.model == 'wrmf':
            runner = WRMFRunner()
            factorsdir = get_factorsdir(trainfile,opts.outdir)
            runner.run(view,model,opts.input_format,trainfile,opts.num_engines,factorsdir,modelfile)
        elif opts.model == 'warp':
            runner = WARPMFRunner()
            modelsdir = get_modelsdir(trainfile,opts.outdir)
            runner.run(view,model,opts.input_format,trainfile,opts.item_feature_format,opts.item_features,opts.num_engines,modelsdir,opts.overwrite,modelfile)
        else:
            runner = ItemSimilarityRunner()
            simsdir = get_simsdir(trainfile,opts.outdir)
            simsfile = get_simsfile(trainfile,opts.outdir)
            runner.run(view,model,opts.input_format,trainfile,opts.num_engines,simsdir,opts.overwrite,opts.max_sims,simsfile,modelfile)
 else:
     raise ValueError('Wrong reranking method entered. Choose between: bs | div_c | div_r | sur_c | sur_r | sur_r_n | nov')
     
 result_object.file = result_file
 
 # create the training data and required recommendation models
 train_data = trainData.TrainData(train_filename, user_means_filename)
 
 Q = None
 library_recommender = None
 
 if options.algorithm == 'mf':
     _, _, Q = sparsesvd(train_data.rating_matrix.tocsc(), config.FACTOR_MODEL_SIZE)
 elif options.algorithm == 'mrec':
     mrec_train_data = load_fast_sparse_matrix('tsv', train_filename)
     library_recommender = CosineKNNRecommender(config.NEIGHBOURHOOD_SIZE)
     library_recommender.fit(mrec_train_data)
 elif options.algorithm == 'warp':
     mrec_train_data = load_fast_sparse_matrix('tsv', train_filename)
     library_recommender = WARPMFRecommender(d=config.FACTOR_MODEL_SIZE, gamma=0.01, C=options.cvalue)
     library_recommender.fit(mrec_train_data.X)
 elif options.algorithm in ['ub', 'ib']:
     pass
 else:
     raise ValueError('Wrong rec. algorithm entered. Choose between ub, ib, mf, mrec, and warp')
 
 
 # run the beyondAccuracy for all users in the .eval file
 logging.info('running beyondAccuracy with {0}...'.format(eval_item_filename))
 evaluation_cases = 0
 with open(eval_item_filename,'rb') as eval_file: