Code example #1
File: predict.py Project: inpefess/mrec
def estimate_users_per_task(mb_per_task, input_format, trainfile, modelfile):
    num_users, num_items, nnz = get_dataset_size(input_format, trainfile)
    logging.info('loading model to get size...')
    model = load_recommender(modelfile)
    # we load the training and test data on every task
    # - let's guess that worst case the test data will be the same size
    required_mb_per_task = 2 * (nnz * 16) / ONE_MB
    if isinstance(model, MatrixFactorizationRecommender):
        # we have to load the factors on every task
        required_mb_per_task += ((model.U.size + model.V.size) * 16) / ONE_MB
        if mb_per_task > required_mb_per_task:
            # remaining mem usage is dominated by computed scores:
            users_per_task = ((mb_per_task - required_mb_per_task) * ONE_MB) / (num_items * 16)
    elif isinstance(model, ItemSimilarityRecommender):
        # we have to load the similarity matrix on every task
        required_mb_per_task += (model.similarity_matrix.nnz * 16) / ONE_MB
        if mb_per_task > required_mb_per_task:
            # estimate additional usage from avg items per user and sims per item
            items_per_user = nnz / num_users
            sims_per_item = model.similarity_matrix.nnz / num_items
            users_per_task = ((mb_per_task - required_mb_per_task) * ONE_MB) / (items_per_user * sims_per_item * 16)
    else:
        # assume nothing else to load
        users_per_task = num_users

    if mb_per_task <= required_mb_per_task:
        raise RuntimeError(
            'requires at least {0}MB per task, increase --mb_per_task if you can'.format(required_mb_per_task))

    return int(users_per_task), int(num_users)
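
A minimal sketch of how this estimator might drive task creation; the memory budget and file names below are hypothetical, not taken from mrec:

# hypothetical driver: split users into tasks sized by the estimate above
users_per_task, num_users = estimate_users_per_task(512, 'tsv', 'train.tsv', 'model.npz')
for start in range(0, num_users, users_per_task):
    end = min(start + users_per_task, num_users)
    # each half-open range [start, end) becomes one prediction task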
Code example #2
    def run(self, view, model, input_format, trainfile, feature_format,
            featurefile, num_engines, workdir, overwrite, modelfile):

        logging.info('creating models directory {0}...'.format(workdir))
        subprocess.check_call(['mkdir', '-p', workdir])

        done = []
        if not overwrite:
            logging.info('checking for existing output models...')
            done.extend(self.find_done(workdir))
            if done:
                logging.info('found {0} output files'.format(len(done)))

        logging.info('creating tasks...')
        tasks = self.create_tasks(model, input_format, trainfile,
                                  feature_format, featurefile, workdir,
                                  num_engines, done)

        if tasks:
            logging.info('running in parallel across ipython engines...')
            async_job = view.map_async(process, tasks, retries=2)

            # wait for tasks to complete
            results = async_job.get()

            logging.info('checking output files...')
            done = self.find_done(workdir)
            remaining = len(tasks) - len(done)
        else:
            remaining = 0

        if remaining == 0:
            logging.info('SUCCESS: all tasks completed')
            logging.info('concatenating {0} models...'.format(len(done)))
            for ix in sorted(done):
                partial_model = load_recommender(
                    self.get_modelfile(ix, workdir))
                if ix == 0:
                    model = partial_model
                else:
                    # concatenate factors
                    model.d += partial_model.d
                    model.U = np.hstack((model.U, partial_model.U))
                    model.V = np.hstack((model.V, partial_model.V))
                    if hasattr(model, 'W'):
                        model.W = np.hstack((model.W, partial_model.W))
            save_recommender(model, modelfile)
            logging.info('removing partial output files...')
            rmtree(workdir)
            logging.info('done')
        else:
            logging.error(
                'FAILED: {0}/{1} tasks did not complete successfully'.format(
                    remaining, len(tasks)))
            logging.error(
                'try rerunning the command to retry the remaining tasks')
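
The concatenation branch above appears to combine partial models that each contribute their own block of latent dimensions: model.d grows by partial_model.d and the factor matrices are stacked column-wise. A quick shape check under that assumption:

import numpy as np

# two hypothetical partial models, each contributing d=5 factors for 100 users
U1, U2 = np.zeros((100, 5)), np.zeros((100, 5))
U = np.hstack((U1, U2))
assert U.shape == (100, 10)  # combined d = 5 + 5, matching model.d += partial_model.d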
Code example #3
File: warp.py Project: azizur77/mrec
    def run(self,view,model,input_format,trainfile,num_engines,workdir,overwrite,modelfile):

        logging.info('creating models directory {0}...'.format(workdir))
        subprocess.check_call(['mkdir','-p',workdir])

        done = []
        if not overwrite:
            logging.info('checking for existing output models...')
            done.extend(self.find_done(workdir))
            if done:
                logging.info('found {0} output files'.format(len(done)))

        logging.info('creating tasks...')
        tasks = self.create_tasks(model,input_format,trainfile,workdir,num_engines,done)

        if tasks:
            logging.info('running in parallel across ipython engines...')
            async_job = view.map_async(process,tasks,retries=2)

            # wait for tasks to complete
            results = async_job.get()

            logging.info('checking output files...')
            done = self.find_done(workdir)
            remaining = len(tasks) - len(done)
        else:
            remaining = 0

        if remaining == 0:
            logging.info('SUCCESS: all tasks completed')
            logging.info('averaging {0} models...'.format(len(done)))
            for ix in sorted(done):
                # average two models at a time to limit memory usage
                partial_model = load_recommender(self.get_modelfile(ix,workdir))
                if ix == 0:
                    model = partial_model
                else:
                    model.U = (ix*model.U + partial_model.U)/float(ix+1)
                    model.V = (ix*model.V + partial_model.V)/float(ix+1)
            save_recommender(model,modelfile)
            logging.info('removing partial output files...')
            rmtree(workdir)
            logging.info('done')
        else:
            logging.error('FAILED: {0}/{1} tasks did not complete successfully'.format(remaining,len(tasks)))
            logging.error('try rerunning the command to retry the remaining tasks')
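
The update model.U = (ix*model.U + partial_model.U)/float(ix+1) is the incremental-mean identity, so the loop ends with U equal to the arithmetic mean of all partial factors while only two arrays are ever held in memory. A small numpy check of the identity:

import numpy as np

Us = [np.random.rand(20, 5) for _ in range(4)]  # stand-ins for the partial U factors
U = Us[0]
for ix in range(1, len(Us)):
    U = (ix * U + Us[ix]) / float(ix + 1)  # same update as in the loop above
assert np.allclose(U, np.mean(Us, axis=0))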
Code example #4
File: predict.py Project: KobeDeShow/mrec
def run(task):

    # import modules required by engine
    import os
    import subprocess
    import numpy as np
    from scipy.sparse import coo_matrix

    from mrec import load_sparse_matrix, load_recommender
    from mrec.evaluation import Evaluator

    modelfile,input_format,trainfile,test_input_format,testfile,feature_format,featurefile,outdir,start,end,evaluator,generate = task

    # initialise the model
    model = load_recommender(modelfile)

    outfile = os.path.join(outdir,'recs.{0}-{1}.tsv'.format(start,end))

    if generate:
        # generate recommendations for our batch of users
        dataset = load_sparse_matrix(input_format,trainfile)
        out = open(outfile,'w')
        if featurefile is not None:
            # currently runs much faster if features are loaded as a dense matrix
            item_features = load_sparse_matrix(feature_format,featurefile).toarray()
            # strip features for any trailing items that don't appear in training set
            num_items = dataset.shape[1]
            item_features = item_features[:num_items,:]
            recs = model.range_recommend_items(dataset,start,end,max_items=20,return_scores=True,item_features=item_features)
        else:
            recs = model.range_recommend_items(dataset,start,end,max_items=20,return_scores=True)
        for u,items in zip(xrange(start,end),recs):
            for i,w in items:
                print >>out,'{0}\t{1}\t{2}'.format(u+1,i+1,w)  # write as 1-indexed
        out.close()

        # record success
        cmd = ['touch',os.path.join(outdir,'{0}-{1}.SUCCESS'.format(start,end))]
        subprocess.check_call(cmd)

    # load the test data
    testdata = load_sparse_matrix(test_input_format,testfile).tocsr()

    # return evaluation metrics
    return evaluator.process(testdata,outfile,start,end)
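
For reference, the recs.{start}-{end}.tsv files written above hold tab-separated, 1-indexed user and item ids plus a score. A small helper (not part of mrec) to read one back into 0-indexed form:

def read_recs(path):
    # parse one 'user<TAB>item<TAB>score' line at a time, undoing the 1-indexing
    recs = {}
    for line in open(path):
        u, i, w = line.rstrip('\n').split('\t')
        recs.setdefault(int(u) - 1, []).append((int(i) - 1, float(w)))
    return recs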
Code example #5
File: m_rec.py Project: dave31415/AVSC
    def __init__(self, d=10, num_iters=4, reg=0.02):
        #TODO: clean this up
        filepath = PARS['data_dir'] + "/reduced_row_col_num_cutoff_1.5.csv"
        file_name = '/Users/davej/data/AVSC/reduced.csv'

        self.model_file = make_mrec_outfile(filepath, d=d, num_iters=num_iters, reg=reg)
        self.dictfile_user = file_name.replace('.csv', '_dict_user.csv')
        self.dictfile_item = file_name.replace('.csv', '_dict_item.csv')
        print "loading model in : %s" % self.model_file
        self.model = load_recommender(self.model_file)
        print "loading dictionaries"
        # each dict file holds two-column csv rows; dict() maps first field to second
        self.dict_user = dict(list(csv.reader(open(self.dictfile_user, 'rU'))))
        self.dict_item = dict(list(csv.reader(open(self.dictfile_item, 'rU'))))
        self.nbad = 0
        # kmeans stuff
        self.k_default = 75
        self.alpha = 10.0
        self.mu = 0.30
Code example #6
File: m_rec.py Project: dave31415/AVSC
def read_mrec(mrec_file='reduced.v1_numbers_mrec_d5_iter9_reg0.0150.npz'):
    file_name = mrec_file
    data_file_name = file_name.split('_mrec_')[0] + '.csv'
    model = mrec.load_recommender(file_name)
    U = model.U
    V = model.V
    # one row per user, one column per item
    shape = (U.shape[0], V.shape[0])
    # use zeros, not np.ndarray: entries absent from the file must not be uninitialised memory
    data_matrix = np.zeros(shape, dtype=int)
    line_num = 0
    for line in open(data_file_name, 'r'):
        line_num += 1
        if line_num % 1000000 == 0: print line_num  # progress every million lines
        dat = line.strip().split(',')
        row = int(dat[0]) - 1   # csv ids are 1-indexed
        col = int(dat[1]) - 1
        val = int(float(dat[2]))
        data_matrix[row, col] = val
    return (data_matrix, U, V)
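
With U and V loaded this way, a single user's predicted scores are just one row of the low-rank product, so the full dense matrix never needs to be materialised. A sketch (hypothetical helper, not from the original file):

import numpy as np

def user_scores(U, V, u):
    # predicted affinity of user u for every item, given U (num_users, d) and V (num_items, d)
    return np.dot(U[u], V.T)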
Code example #7
File: m_rec.py Project: Tinku-ari/AVSC
    def __init__(self, d=10, num_iters=4, reg=0.02):
        #TODO: clean this up
        filepath = PARS['data_dir'] + "/reduced_row_col_num_cutoff_1.5.csv"
        file_name = '/Users/davej/data/AVSC/reduced.csv'

        self.model_file = make_mrec_outfile(filepath,
                                            d=d,
                                            num_iters=num_iters,
                                            reg=reg)
        self.dictfile_user = file_name.replace('.csv', '_dict_user.csv')
        self.dictfile_item = file_name.replace('.csv', '_dict_item.csv')
        print "loading model in : %s" % self.model_file
        self.model = load_recommender(self.model_file)
        print "loading dictionaries"
        self.dict_user = dict(list(csv.reader(open(self.dictfile_user, 'rU'))))
        self.dict_item = dict(list(csv.reader(open(self.dictfile_item, 'rU'))))
        self.nbad = 0
        #kmeans stuff
        self.k_default = 75
        self.alpha = 5.0
        self.mu = 0.31
Code example #8
File: predict.py Project: BloodD/mrec
def run(task):

    # import modules required by engine
    import os
    import subprocess
    import numpy as np
    from scipy.sparse import coo_matrix

    from mrec import load_sparse_matrix, load_recommender
    from mrec.evaluation import Evaluator

    modelfile,input_format,trainfile,test_input_format,testfile,outdir,start,end,evaluator,generate = task

    # initialise the model
    model = load_recommender(modelfile)
    dataset = load_sparse_matrix(input_format,trainfile)

    outfile = os.path.join(outdir,'recs.{0}-{1}.tsv'.format(start,end))

    if generate:
        # generate recommendations for our batch of users
        out = open(outfile,'w')
        recs = model.range_recommend_items(dataset,start,end,max_items=20,return_scores=True)
        for u,items in zip(xrange(start,end),recs):
            for i,w in items:
                print >>out,'{0}\t{1}\t{2}'.format(u+1,i+1,w)  # write as 1-indexed
        out.close()

        # record success
        cmd = ['touch',os.path.join(outdir,'{0}-{1}.SUCCESS'.format(start,end))]
        subprocess.check_call(cmd)

    # load the test data
    testdata = load_sparse_matrix(test_input_format,testfile).tocsr()

    # return evaluation metrics
    return evaluator.process(testdata,outfile,start,end)
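
The {start}-{end}.SUCCESS sentinel files written by these tasks let a driver see which user ranges completed without parsing the output. A hypothetical scanner for them (mrec's own bookkeeping may differ):

import os
import re

def find_completed_ranges(outdir):
    # collect the (start, end) pairs whose SUCCESS sentinel file exists
    done = []
    for name in os.listdir(outdir):
        m = re.match(r'(\d+)-(\d+)\.SUCCESS$', name)
        if m:
            done.append((int(m.group(1)), int(m.group(2))))
    return sorted(done)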
Code example #9
File: final_model.py Project: jgriff5/PlayMore
import pandas as pd 
import numpy as np 
import random
from mrec import load_sparse_matrix, load_recommender
from in_store_dict import stores


train = load_sparse_matrix('tsv','../data/PATH_TO_DATA_USED_TO_TRAIN_FINAL_MODEL')
model = load_recommender('../../../mrec/PATH_TO_FINAL_MODEL')

next_usr_num = 382716

# ->  load in users to predict and make into mrec format:
	# item id == label encoded,
	# user id == new numbers starting at next_usr_num (add new user code to label encoded dict),
	# call this table to_predict

cold_starters = ['BIG BASS WHEEL', 'SUPER SHOT', 'WIZARD OF OZ 6 PLAYER PUSHER']

counts = to_predict.groupby('user').count().sort('item')

def predict_one_user(user, store):
    # cold start: with fewer than 3 observed items, recommend a popular fallback game
    if counts.ix[user] < 3:
        i = 0
        game = random.choice(cold_starters)
        # redraw until the chosen game is carried by this store, giving up after 1000 tries
        while store not in stores[game] and i < 1000:
            game = random.choice(cold_starters)
            i += 1
        if store in stores[game]:
            return game
        else:
            return None  # hypothetical fallback; the source example is truncated at this point