Example #1
def experiment_unit_rand_split(exp_id, method, tr_data, te_data, iteration):
    '''
    One iteration of training and testing on a random train/test split. The exp_id
    identifies the experiment so the trained model can be cached and reused via URM.
    '''

    # define mcpl_log style.
    mcpl_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    result_resource_str = 'exp'      + exp_id + \
                          '_method'  + method.unique_str() + \
                          '_iter'    + str(iteration)
    sub_folder = exp_id + '/models/' + method.unique_str()
    # use a sub folder to store the experiment resource.

    # check resource for existing model.
    trained_model = URM.LoadResource(URM.RTYPE_RESULT, result_resource_str,
                                     sub_folder)
    if not trained_model:

        # train model using the training data.
        # NOTE: this is the most time-consuming part.
        mcpl_log('training models...')
        method.train(tr_data)

        # save resource
        trained_model = [method]
        URM.SaveResource(URM.RTYPE_RESULT, result_resource_str, trained_model,
                         sub_folder)

    # compute performance on test data using the model.
    [method] = trained_model
    mcpl_log('computing evaluation metrics on the test data...')
    eval_result = rmse(te_data.data_val,
                       method.predict(te_data.data_row, te_data.data_col))

    return eval_result
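The evaluation above relies on an rmse helper that compares the held-out values with the model's predictions at the same (row, col) positions. A minimal sketch of such a helper, assuming both arguments are equal-length numeric sequences; the name rmse_sketch and its implementation are illustrative, not the repository's actual function.

import numpy as np

def rmse_sketch(true_vals, pred_vals):
    # Root mean squared error between observed and predicted values.
    true_arr = np.asarray(true_vals, dtype=float)
    pred_arr = np.asarray(pred_vals, dtype=float)
    return float(np.sqrt(np.mean((true_arr - pred_arr) ** 2)))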
Example #2
def experiment_leave_k_out(exp_name, daily_data_file, min_occ_user, min_occ_prog, \
                           method_list, leave_k_out, total_iteration, top_n, binary = False):
    '''
    Leave-k-out experiment over a list of methods.

    Parameters
    ----------
    @param exp_name: the experiment name (prefix)
    @param daily_data_file: the data file read by DailyWatchTimeReader
    @param min_occ_user: minimum occurrence for a user to be included
    @param min_occ_prog: minimum occurrence for a program to be included
    @param method_list: a list of recommendation methods to evaluate
    @param leave_k_out: the number of items left out for each user. The k must be
         strictly less than min_occ_user
    @param total_iteration: the number of leave-k-out iterations
    @param top_n: the length of the ranked list used for evaluation
    @param binary: if set to True, the data is binarized (non-zero entries set to 1)

    Returns
    ----------
    @return a dictionary mapping each method's unique_str() to a list of
         per-iteration evaluation results
    '''

    if leave_k_out >= min_occ_user:
        raise ValueError(
            'The k in the leave k out should be strictly less than min_occ_user.'
        )

    # define lko_log style.
    lko_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    # construct exp_id
    if binary:
        exp_id = 'lko_bi_' + exp_name + '_data' +str(hash(daily_data_file)) + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog) \
                      + '_k' + str(leave_k_out) + '_toiter' + str(total_iteration)
    else:
        exp_id = 'lko_' + exp_name + '_data' +str(hash(daily_data_file)) + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog) \
                      + '_k' + str(leave_k_out) + '_toiter' + str(total_iteration)
    lko_log('Experiment ID: ' + exp_id)

    # load data.
    lko_log('Read data...')
    reader = DailyWatchTimeReader()
    data = reader.read_file_with_minval(daily_data_file, min_occ_user,
                                        min_occ_prog)
    lko_log('Data loaded: ' + str(data))

    if binary:
        lko_log('Binarizing data...')
        data.binarize()
    else:
        # normalize
        lko_log('Normalizing data...')
        data.normalize_row()

    result = {}

    for method in method_list:
        # do for each method

        perf_vect = []
        for iteration in range(total_iteration):
            # do for each iteration for each method.

            lko_log('Method: ' + method.unique_str() + ' Iteration: ' +
                    str(iteration))

            # data split of the current iteration.
            split_resource_str = 'exp' + exp_id + '_lvidx_iter' + str(
                iteration)
            split_dir = exp_id + '/lv_idx'
            leave_k_out_idx = URM.LoadResource(URM.RTYPE_RESULT,
                                               split_resource_str, split_dir)
            if not leave_k_out_idx:
                # randomly generate k items from each row/user.
                leave_k_out_idx = ds.leave_k_out(data, leave_k_out)
                URM.SaveResource(URM.RTYPE_RESULT, split_resource_str,
                                 leave_k_out_idx, split_dir)

            # split the k left-out items into a separate test set.
            [data_left, data_tr] = data.leave_k_out(leave_k_out_idx)

            iter_result = experiment_unit_leave_k_out(exp_id, method, \
                                    data_tr, data_left, iteration, top_n)

            perf_vect.append(iter_result)

        result[method.unique_str()] = perf_vect

    return result
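At each iteration, ds.leave_k_out is expected to pick k observed items per user to hold out for testing. A minimal illustration of that sampling idea on a SciPy CSR matrix (rows are users, columns are items); the helper name and return format are hypothetical and do not reflect the repository's ds module.

import random
from scipy.sparse import csr_matrix

def leave_k_out_indices(matrix_csr, k, seed=1):
    # For each user (row), randomly choose k observed item indices to hold out.
    random.seed(seed)
    held_out = {}
    for user in range(matrix_csr.shape[0]):
        observed = matrix_csr[user, :].nonzero()[1].tolist()
        if len(observed) > k:
            held_out[user] = random.sample(observed, k)
    return held_out

# Example: hold out 1 item per user from a tiny 2x3 interaction matrix.
toy = csr_matrix([[1, 0, 2], [0, 3, 4]])
print(leave_k_out_indices(toy, 1))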
Example #3
def experiment_unit_leave_k_out(exp_id, method, data_tr, data_left, iteration,
                                top_n):
    '''
    This method works on the column/row indices of data_tr and data_left;
    data_tr and data_left must be completely aligned both row-wise and column-wise.
    '''

    # define lko_log style.
    lko_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    result_resource_str = 'exp'      + exp_id + \
                          '_method'  + method.unique_str() + \
                          '_iter'    + str(iteration)
    sub_folder = exp_id + '/models/' + method.unique_str()
    # use a sub folder to store the experiment resource.

    # check resource for existing model.
    trained_model = URM.LoadResource(URM.RTYPE_RESULT, result_resource_str,
                                     sub_folder)
    if not trained_model:

        # train model using the training data.
        # NOTE: this is the most time-consuming part.
        lko_log('training models...')
        method.train(data_tr)

        # save resource
        trained_model = [method]
        URM.SaveResource(URM.RTYPE_RESULT, result_resource_str, trained_model,
                         sub_folder)

    # compute performance on test data using the model.
    [method] = trained_model
    lko_log('computing evaluation metrics on the test data...')

    eval_result = {}
    # ranked list.

    col_num = data_left.num_col
    pred_col = range(col_num)

    tr_data_csr = data_tr.get_sparse_matrix().tocsr()
    lo_data_csr = data_left.get_sparse_matrix().tocsr()

    prec_sum = 0.0
    rec_sum = 0.0

    for user_idx in range(data_left.num_row):
        # predict the entire row for this user.
        row_pred = method.predict_row(user_idx, pred_col)

        # rank the columns (the result is a list of column indices).
        srt_col = [
            k[0] for k in sorted(
                enumerate(row_pred), key=lambda x: x[1], reverse=True)
        ]
        # columns observed in training.
        tr_col = set(np.nonzero(tr_data_csr[user_idx, :])[1].tolist())
        # remove the training columns from the ranked list.
        te_srt_col = [col_pos for col_pos in srt_col if col_pos not in tr_col]
        # keep the top n columns (slicing is safe even if fewer remain).
        te_topk_col = te_srt_col[:top_n]
        # held-out (test) column indices.
        lo_col = set(np.nonzero(lo_data_csr[user_idx, :])[1].tolist())

        prec_sum += precision_itemlist(te_topk_col, lo_col)
        rec_sum += recall_itemlist(te_topk_col, lo_col)

    # average precision and recall over users.
    eval_result['prec'] = prec_sum / data_left.num_row
    eval_result['recall'] = rec_sum / data_left.num_row
    eval_result['rmse'] = rmse(
        data_left.data_val,
        method.predict(data_left.data_row, data_left.data_col))
    return eval_result
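precision_itemlist and recall_itemlist compare the top-n recommended columns against the held-out columns for one user. Sketches of what such helpers might compute, assuming plain set-overlap precision and recall; the _sketch names and the edge-case handling are illustrative.

def precision_itemlist_sketch(recommended, relevant):
    # Fraction of recommended items that appear in the relevant (held-out) set.
    if not recommended:
        return 0.0
    return len(set(recommended) & set(relevant)) / float(len(recommended))

def recall_itemlist_sketch(recommended, relevant):
    # Fraction of relevant (held-out) items that were recommended.
    if not relevant:
        return 0.0
    return len(set(recommended) & set(relevant)) / float(len(relevant))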
Example #4
    feedback_data = reader.read_file_with_minval(daily_data_file, min_occ_user,
                                                 min_occ_prog, num_user,
                                                 num_prog)

    exp_id = 'lko_bi_' + exp_name + '_data' + hash_file_str\
                      + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog) \
                      + '_nu' + str(num_user) + '_np' + str(num_prog) \
                      + '_k' + str(leave_k_out) + '_toiter' + str(total_iteration)

    result_resource_str = 'exp'      + exp_id + \
                          '_method'  + method.unique_str() + \
                          '_iter'    + str(iteration)
    sub_folder = exp_id + '/models/' + method.unique_str()
    # use a sub folder to store the experiment resource.

    trained_model = URM.LoadResource(URM.RTYPE_RESULT, result_resource_str,
                                     sub_folder)
    [method] = trained_model

    learnt_genre = method.V

    program_mapping = feedback_data.col_mapping
    program_inv_mapping = {y: x
                           for x, y in program_mapping.items()}
    program_name = [
        program_inv_mapping[i] for i in range(len(program_mapping))
    ]

    sio.savemat("prog_genre_mat.mat", {
        'genre_mat': learnt_genre,
        'prog_name': program_name
    })
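To inspect the exported factors later, the .mat file can be read back with scipy.io. A small usage sketch, assuming the file written above exists in the working directory; variable names are illustrative.

import scipy.io as sio

mat = sio.loadmat("prog_genre_mat.mat")
genre_mat = mat['genre_mat']   # learned latent genre factors for programs
prog_name = mat['prog_name']   # program identifiers, in the same order as saved above
print(genre_mat.shape)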
Example #5
def experiment_rand_split(exp_name, daily_data_file, min_occ_user, min_occ_prog, \
                          method_list,  training_prec, total_iteration):
    '''
    Random-split experiment over a list of methods.

    Parameters
    ----------
    exp_name:        a human-readable experiment name.
    daily_data_file: the data file to read.
    min_occ_user:    minimum occurrence for a user to be included.
    min_occ_prog:    minimum occurrence for a program to be included.
    method_list:     a list of matrix completion models.
    training_prec:   the fraction of rows used for training.
    total_iteration: the number of random-split iterations.

    Returns
    ----------
    a dictionary mapping each method's unique_str() to a list of per-iteration
    RMSE results.
    '''
    # define mcpl_log style.
    mcpl_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    #mcpl_log('Data ID: ' + hash(daily_data_file));

    # here we use a regular hash.
    exp_id = exp_name + '_data' +str(hash(daily_data_file)) + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog) \
                      + '_trprec' + str(training_prec) + '_toiter' + str(total_iteration)

    mcpl_log('Experiment ID: ' + exp_id)

    # save experiment splitting as resources.
    reader = UtilityDataReader()
    data = reader.read_file_with_minval(daily_data_file, min_occ_user,
                                        min_occ_prog)

    # we normalize here before splitting.
    mcpl_log('Normalizing data...')
    data.normalize_row()

    result = {}

    for method in method_list:
        # do for each method

        perf_vect = []
        for iteration in range(total_iteration):
            # do for each iteration for each method;

            mcpl_log('Method: ' + method.unique_str() + ' Iteration: ' +
                     str(iteration))

            # data split of the current iteration.
            split_resource_str = 'exp' + exp_id + '_split_iter' + str(
                iteration)
            split_dir = exp_id + '/split'
            split = URM.LoadResource(URM.RTYPE_RESULT, split_resource_str,
                                     split_dir)
            if not split:
                split = ds.split(data.num_row, training_prec)
                URM.SaveResource(URM.RTYPE_RESULT, split_resource_str, split,
                                 split_dir)

            [split_tr, split_te] = split
            data_tr = data.subdata_row(split_tr)
            data_te = data.subdata_row(split_te)

            iter_result = experiment_unit_rand_split(exp_id, method, data_tr,
                                                     data_te, iteration)

            perf_vect.append(iter_result)

        result[method.unique_str()] = perf_vect

    mcpl_log('Experiment Done [' + exp_id + ']')

    return result
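ds.split is expected to partition the row (user) indices into a training part of fraction training_prec and a test part with the remainder. A minimal sketch of such a splitter; the function name and seeding are illustrative, not the repository's ds module.

import random

def split_rows_sketch(num_row, training_prec, seed=1):
    # Shuffle row indices and cut at the requested training fraction.
    random.seed(seed)
    idx = list(range(num_row))
    random.shuffle(idx)
    cut = int(num_row * training_prec)
    return [idx[:cut], idx[cut:]]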
Example #6
def read_file_with_minval(self, filename, min_duid, min_pid, num_duid=None, num_pid=None, rand_seed=1):
    '''
    This method first goes through the data once and filters out the devices
    and programs whose occurrences fall below the specified values.
    Supports random undersampling.

    Parameters
    ----------
    @param filename: a string with the name and location of the data file to be read.
    @param min_duid: a positive integer. The minimum occurrence of a device for the device to be included.
    @param min_pid:  a positive integer. The minimum occurrence of a program for the program to be included.

    Returns
    ----------
    result: a FeedbackData data structure constructed from the data file. The result also carries
            a genre-program mapping: (result.meta['pggr_pg'][i], result.meta['pggr_gr'][i]) indicates that
            the program at result.meta['pggr_pg'][i] is marked by the genre at result.meta['pggr_gr'][i].
            The genre mapping is in R:/Data/Rovi/genre.csv, and a vintage copy is also kept in the
            datasample/Rovi folder.
    '''

    if num_duid is None and num_pid is None:
        subsample = False
        res_str = 'DWT_RFWMV[' + str(filename) + '][MIN DUID' + str(min_duid) + '][MIN PID' + str(min_pid) + ']'
    elif num_duid is not None and num_pid is not None:
        subsample = True
        res_str = 'DWT_RFWMV[' + str(filename) + '][MIN DUID' + str(min_duid) + '][MIN PID' + str(min_pid) + ']'\
                  + '[NUM DUID' + str(num_duid) + ']' + '[NUM PID' + str(num_pid) + ']'
    else:
        raise ValueError('num_duid and num_pid should be both set or both left as default')

    # Check if the resource is already cached. If not, compute it from the data file and save it.
    if not URM.CheckResource(URM.RTYPE_DATA, res_str):

        Logger.Log('Computing data information...')
        [occur_duid, occur_pid, _, _] = self.read_file_info(filename)
        print str(len(occur_duid)), 'devices', str(len(occur_pid)), 'programs'

        Logger.Log('Generating filtering indices...')
        duidlist = [sel_duid for sel_duid, sel_duidcnt in occur_duid.iteritems() if sel_duidcnt > min_duid]
        pidlist = [sel_pid for sel_pid, sel_pidcnt in occur_pid.iteritems() if sel_pidcnt > min_pid]

        print 'After filtering [MIN_DUID:', str(min_duid), ' MIN_PID:', str(min_pid), ']:',\
            str(len(duidlist)), 'devices', str(len(pidlist)), 'programs'

        # perform random subsampling.
        if subsample:
            random.seed(rand_seed)
            if len(duidlist) > num_duid:
                # subsample DUID.
                random.shuffle(duidlist)
                duidlist = duidlist[:num_duid]

            if len(pidlist) > num_pid:
                # subsample PID.
                random.shuffle(pidlist)
                pidlist = pidlist[:num_pid]

        duidlist = set(duidlist)
        pidlist = set(pidlist)

        # read the raw data file with the filtered ID lists.
        [mapping_duid, mapping_pid, row, col, data, pggr_pg, pggr_gr] \
            = self.read_file_with_id_list(filename, duidlist, pidlist)

        Logger.Log('read_file_with_minval process completed.')

        result = FeedbackData(row, col, data, len(mapping_duid), len(mapping_pid),\
                mapping_duid, mapping_pid, {'pggr_pg': pggr_pg, 'pggr_gr': pggr_gr})

        # save the computed result to the resource cache.
        URM.SaveResource(URM.RTYPE_DATA, res_str, result)
        return result
    else:
        return URM.LoadResource(URM.RTYPE_DATA, res_str)
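The first pass above counts device and program occurrences and keeps only those above the thresholds. A compact sketch of that filtering step using collections.Counter, assuming the raw records are (device_id, program_id) pairs; the function name is illustrative.

from collections import Counter

def filter_by_min_occurrence(events, min_duid, min_pid):
    # events: a list of (device_id, program_id) watch records.
    events = list(events)
    duid_count = Counter(d for d, _ in events)
    pid_count = Counter(p for _, p in events)
    # keep IDs whose occurrence counts exceed the thresholds.
    duidlist = set(d for d, c in duid_count.items() if c > min_duid)
    pidlist = set(p for p, c in pid_count.items() if c > min_pid)
    return duidlist, pidlist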
Example #7
def experiment_coldstart_map(exp_name,     daily_data_file,\
                    min_occ_user, min_occ_prog, num_user, num_prog,\
                    method_list, blind_k_out, total_iteration, max_rank, binary = False):
    '''
    Cold-start (blind-k-out) experiment over a list of methods.

    Parameters
    ----------
    @param exp_name:        the experiment name (prefix)
    @param daily_data_file: a list of data files (or a single file)
    @param min_occ_user:    minimum occurrence for a user to be included (cold-start user criterion)
    @param min_occ_prog:    minimum occurrence for a program to be included (cold-start program criterion)
    @param num_user:        the number of users selected in the experiment
    @param num_prog:        the number of programs selected in the experiment
    @param method_list:     a list of recommendation methods to evaluate
    @param blind_k_out:     the number of programs blinded out. The k must be
         strictly less than min_occ_user
    @param total_iteration: the number of iterations
    @param max_rank:        the maximum rank at which precision/recall/hit rate are computed
    @param binary:          if set to True, the data is binarized (non-zero entries set to 1)

    Returns
    ----------
    @return a dictionary mapping each method's unique_str() to a list of
         per-iteration evaluation results
    '''
    
    print 'Blind k out: k = ', str(blind_k_out);
    print 'Min_occ_user: ', str(min_occ_user), ' Min_occ_prog: ', str(min_occ_prog);
    
    if blind_k_out >= min_occ_user:
        raise ValueError('The k in the leave k out [' + str(blind_k_out) 
                         +'] should be strictly less than min_occ_user [' + str(min_occ_user) +'].'); 
    
    # define lko_log style. 
    lko_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP);
    
    
    if isinstance(daily_data_file, list):    
        hash_file_str = str(hash(tuple(daily_data_file)));
    else:
        hash_file_str = str(hash(daily_data_file));
    
    # construct exp_id
    if binary:
        exp_id = 'cst_bi_' + exp_name + '_data' + hash_file_str\
                      + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog) \
                      + '_nu' + str(num_user) + '_np' + str(num_prog) \
                      + '_k' + str(blind_k_out) + '_toiter' + str(total_iteration);
    else:
        exp_id = 'cst_'    + exp_name + '_data' + hash_file_str\
                      + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog) \
                      + '_nu' + str(num_user) + '_np' + str(num_prog) \
                      + '_k' + str(blind_k_out) + '_toiter' + str(total_iteration);
    lko_log('Experiment ID: ' + exp_id);
    
    # load data. 
    lko_log('Read data...');
    reader = DailyWatchTimeReader();
    data = reader.read_file_with_minval(daily_data_file, min_occ_user, min_occ_prog, num_user, num_prog);
    lko_log('Data loaded: ' + str(data));
    
    if binary:
        lko_log('Binarizing data...');
        data.binarize();
    else:
        # normalize 
        lko_log('Normalizing data...');
        data.normalize_row();
    
    result = {};
    
    for method in method_list:
        # do for each method
    
        perf_vect = [];
        for iteration in range(total_iteration):
            # do for each iteration for each method. 
    
            lko_log('Method: '+ method.unique_str() + ' Iteration: '+ str(iteration));
    
            # data split of the current iteration. 
            split_resource_str = 'exp' + exp_id + '_blind_idx_iter' + str(iteration); 
            split_dir = exp_id + '/blind_idx';
            blind_out_idx = URM.LoadResource(URM.RTYPE_RESULT, split_resource_str, split_dir);
            if not blind_out_idx:
                # randomly generate k items to blind out.
                blind_out_idx   = ds.sample_num(data.num_col, blind_k_out);    
                URM.SaveResource(URM.RTYPE_RESULT, split_resource_str, blind_out_idx, split_dir);
            
            lko_log('Blind index done.');
            # blind out the selected columns into a separate test set.
            [data_tr, data_left] = data.blind_k_out(blind_out_idx); 
            
            lko_log('Start index');
            iter_result = experiment_unit_leave_k_out_map(exp_id, method, \
                                    data_tr, data_left, iteration, max_rank);
            
            perf_vect.append(iter_result);
    
        result[method.unique_str()] = perf_vect;
    
    return result;
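ds.sample_num is expected to draw blind_k_out distinct column (program) indices that will be blinded out for every user. A minimal sketch of such a sampler; the name and seeding are illustrative, not the repository's ds module.

import random

def sample_num_sketch(num_col, k, seed=1):
    # Randomly pick k distinct column indices out of range(num_col).
    random.seed(seed)
    return random.sample(range(num_col), k)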
Example #8
def experiment_unit_leave_k_out_map(exp_id, method, data_tr, data_left, iteration, max_rank):
    '''
    This method works on the column/row indices of data_tr and data_left;
    data_tr and data_left must be completely aligned both row-wise and column-wise. 
    '''
    
    # define lko_log style. 
    lko_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP);
    
    result_resource_str = 'exp'      + exp_id + \
                          '_method'  + method.unique_str() + \
                          '_iter'    + str(iteration);
    sub_folder = exp_id + '/models/' + method.unique_str(); # use a sub folder to store the experiment resource. 
    
    # check resource for existing model.  
    trained_model = URM.LoadResource(URM.RTYPE_RESULT, result_resource_str, sub_folder);
    if not trained_model:
        
        # train model using the training data. 
        # NOTE: this is the most time-consuming part. 
        lko_log('training models...');
        method.train(data_tr);
        
        # save resource
        trained_model = [method];
        URM.SaveResource(URM.RTYPE_RESULT, result_resource_str, trained_model, sub_folder);
    
    # compute performance on test data using the model.    
    [method] = trained_model;
    lko_log('computing evaluation metrics on the test data...');
    
    eval_result = {};
    # ranked list.
    
    col_num  = data_left.num_col;
    pred_col = range(col_num);
    
    tr_data_csr = data_tr.get_sparse_matrix().tocsr();
    lo_data_csr = data_left.get_sparse_matrix().tocsr();
    
    perf_vect_prec = np.zeros(max_rank); # precision 
    perf_vect_rec  = np.zeros(max_rank); # recall 
    perf_vect_hr   = np.zeros(max_rank); # hit rate (Modification of Xia Ning's Paper) 
    
    for user_idx in range(data_left.num_row): 
        # predict the entire row. 
        
        
        # test column index;
        lo_col = set(np.nonzero(lo_data_csr[user_idx, :])[1].tolist());
        
        # there is no testing on this user. 
        if len(lo_col) == 0:
            continue;
        
        #pred_row = [user_idx] * col_num;
        #row_pred = method.predict(pred_row, pred_col);
        row_pred = method.predict_row(user_idx, pred_col);
        
        # rank the column (the result is a list of indices).
        srt_col = [k[0] for k in sorted(enumerate(row_pred), key=lambda x:x[1], reverse=True)];
        
        # trained columns.
        tr_col = set(np.nonzero(tr_data_csr[user_idx, :])[1].tolist());
        
        
        
        # remove the trained column from prediction. 
        # this contains a set of indices that predicted (excluding training items).
        te_srt_col = [col_pos for col_pos in srt_col if col_pos not in tr_col];
        
        # range(max_rank) below iterates over ranks 0 .. max_rank-1.
        
        hit = 0; # number of hits accumulated up to the current rank. 
        
        for rk in range(max_rank):
            # if rk exceeds the length of te_srt_col, keep the current hit count;
            # otherwise, check whether the item at this rank is a hit,
            # i.e., it appears in the held-out set.
            if (rk < len(te_srt_col)) and (te_srt_col[rk] in lo_col):
                hit += 1;
            
            perf_vect_hr[rk]   += float(hit)/len(lo_col); # hit rate
            perf_vect_prec[rk] += float(hit)/(rk+1);          # precision
            perf_vect_rec[rk]  += float(hit)/len(lo_col); # recall

    #normalization over users.
    perf_vect_hr   = perf_vect_hr/data_left.num_row; 
    perf_vect_prec = perf_vect_prec/data_left.num_row;
    perf_vect_rec  = perf_vect_rec/data_left.num_row;
         
    eval_result['hit_rate']  = perf_vect_hr;
    eval_result['precision'] = perf_vect_prec; 
    eval_result['recall']    = perf_vect_rec; 
    eval_result['RMSE']      = rmse(data_left.data_val, method.predict(data_left.data_row, data_left.data_col));
    return eval_result;
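The loop above accumulates, for each rank rk and each user, precision hit/(rk+1) plus recall and hit rate hit/|lo_col|, and then averages over users. A small worked example for a single user with hypothetical data, reproducing the per-rank bookkeeping (hit rate coincides with recall here since both divide by the size of the held-out set):

import numpy as np

ranked = [7, 2, 9, 4, 1]   # predicted ranking with training items already removed
relevant = {2, 4}          # held-out (left-out) items for this user
max_rank = 5

hit = 0
prec_at = np.zeros(max_rank)
rec_at = np.zeros(max_rank)
for rk in range(max_rank):
    if rk < len(ranked) and ranked[rk] in relevant:
        hit += 1
    prec_at[rk] = float(hit) / (rk + 1)
    rec_at[rk] = float(hit) / len(relevant)

print(prec_at)  # [0.   0.5  0.33 0.5  0.4 ] (third value is 1/3)
print(rec_at)   # [0.   0.5  0.5  1.   1.  ]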
Example #9
'''
Created on Jan 28, 2014

@author: jiayu.zhou
'''

import numpy as np
from rs.cache.urm import URM

if __name__ == '__main__':

    a = URM.LoadResource(URM.RTYPE_DATA, 'test001')
    if not a:
        print "not found."

    res = [np.random.rand(3, 5), 'test']
    print res

    URM.SaveResource(URM.RTYPE_DATA, 'test001', res)

    a = URM.LoadResource(URM.RTYPE_DATA, 'test001')
    if not a:
        print "not found."

    print a
def experiment_unit_future_program(exp_id, method, tr_data, te_data, top_k):
    '''
    One unit experiment for future-program recommendation: train the method on
    tr_data and evaluate top-k hit precision for each device in te_data.
    '''

    # define mcpl_log style.
    mcpl_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    result_resource_str = 'model_exp'      + exp_id + \
                          '_method'  + method.unique_str()
    sub_folder = exp_id + '/models/' + method.unique_str()
    # use a sub folder to store the experiment resource.

    # check resource for existing model.
    trained_model = URM.LoadResource(URM.RTYPE_RESULT, result_resource_str,
                                     sub_folder)
    if not trained_model:

        # train model using the training data.
        # NOTE: this is the most time-consuming part.
        mcpl_log('training models...')
        method.train(tr_data)

        # save resource
        trained_model = [method]
        URM.SaveResource(URM.RTYPE_RESULT, result_resource_str, trained_model,
                         sub_folder)

    # compute performance on test data using the model.
    [method] = trained_model
    mcpl_log('computing evaluation metrics on the test data...')

    # compute the score of the programs in the prediction.
    prog_list = te_data.col_mapping.keys()
    # program list

    te_datamat = te_data.get_sparse_matrix().tolil()

    eval_result = []

    # TODO: on a subset of DUID?
    for duid in te_data.row_mapping.keys():  # iterate every element.
        prog_score = method.get_score(duid, prog_list, te_data.meta)
        # get scores of the programs in the list.

        # sort the score (first dimension is the index and the second is the actual prediction value).
        #    NOTE: the first dimension is the order with respect to prog_list
        srt_list = [(k[0], k[1]) for k in sorted(
            enumerate(prog_score), key=lambda x: x[1], reverse=True)]

        srt_list = srt_list[:top_k]
        # truncate to top k.

        [srt_idx, _] = zip(*srt_list)

        # map from prog_list to actual index.
        mapped_srt_idx = [
            te_data.col_mapping[prog_list[idx]] for idx in srt_idx
        ]

        #print te_datamat[te_data.row_mapping[duid], mapped_srt_idx].todense();

        # get the ground truth hit.
        prog_hit = (te_datamat[te_data.row_mapping[duid],
                               mapped_srt_idx].todense().tolist())[0]

        # compute hit precision (now we consider only binary hit).
        eval_result.append(hit_prec(prog_hit))

    return eval_result
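hit_prec summarizes, for one device, how large a share of the top-k recommended programs were actually watched in the test data. A sketch of what such a helper might compute, assuming a binary (or watch-time) hit vector; the _sketch name and edge-case handling are illustrative.

def hit_prec_sketch(prog_hit):
    # prog_hit: values of the test matrix at the top-k recommended programs.
    if not prog_hit:
        return 0.0
    return float(sum(1 for v in prog_hit if v > 0)) / len(prog_hit)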