def experiment_future_program(exp_name, previous_data_files, future_data_file,
                              min_occ_user, min_occ_prog, method_list, top_k):
    '''
    Entry point of the "future program" experiment (top-k precision).

    The data read from previous_data_files is used as training data and the
    data read from future_data_file as test data; every method in
    method_list is then evaluated on that pair by
    experiment_unit_future_program.

    Parameters
    ----------
    exp_name:            a human-readable experiment name; prefix of the
                         experiment id.
    previous_data_files: source of the training data, handed to
                         DailyWatchTimeReader.read_file_with_minval.
    future_data_file:    source of the test data, handed to the same reader.
    min_occ_user:        forwarded to the reader for both data sets.
    min_occ_prog:        forwarded to the reader for both data sets.
    method_list:         a list of recommendation models; each one must
                         provide unique_str().
    top_k:               forwarded unchanged to
                         experiment_unit_future_program.

    Returns
    ----------
    A dict mapping method.unique_str() to the value returned by
    experiment_unit_future_program for that method.
    '''
    def log_exp(text):
        # Every message of this experiment goes to the experiment category.
        Logger.Log(text, Logger.MSG_CATEGORY_EXP)

    occurrence_tag = '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog)
    exp_id = exp_name + occurrence_tag
    log_exp('Experimental ID: ' + exp_id)

    # Read the training data first, then the test data, with the same
    # thresholds.
    watch_reader = DailyWatchTimeReader()
    train_set, test_set = [
        watch_reader.read_file_with_minval(source, min_occ_user, min_occ_prog)
        for source in (previous_data_files, future_data_file)
    ]

    log_exp('Normalization data ...')
    # Only the training data is normalized. The test data is left as read:
    # per the original note it does not need normalizing because the
    # evaluation counts hits.
    train_set.normalize_row()

    outcome = {}
    for model in method_list:
        log_exp('Method: ' + model.unique_str())
        unit_outcome = experiment_unit_future_program(
            exp_id, model, train_set, test_set, top_k)
        outcome[model.unique_str()] = unit_outcome

    log_exp('Experiment Done [' + exp_id + ']')
    return outcome
# Example no. 2
# 0
'''
Created on Feb 13, 2014

@author: jiayu.zhou
'''

import scipy.io as sio

from rs.data.daily_watchtime import DailyWatchTimeReader

if __name__ == '__main__':
    # Reads one day of watch-time data and writes its sparse matrix to a
    # Matlab .mat file as (value, row index, column index) triplets plus the
    # matrix dimensions.
    daily_data_file = "/Users/jiayu.zhou/Data/duid-program-watchTime-genre/20131209/part-r-00000"
    sparse_mat_file = "/Users/jiayu.zhou/Data/duid-program-watchTime-genre/20131209_sparse.mat"

    reader = DailyWatchTimeReader()
    # The two 1s sit in the positions where the experiment functions pass
    # min_occ_user and min_occ_prog.
    data = reader.read_file_with_minval(daily_data_file, 1, 1)

    data_mat = data.get_sparse_matrix()

    ### Alternative: directly save the sparse matrix data structure to Matlab.
    #sio.savemat("/Users/jiayu.zhou/Data/duid-program-watchTime-genre/20131209.mat", {'data': data_mat})

    ### Save the matrix as explicit triplets and its shape.
    # NOTE(review): .row/.col are attributes of scipy COO matrices;
    # presumably get_sparse_matrix() returns one -- confirm.
    sio.savemat(sparse_mat_file,
                {'data': data_mat.data, 'i': data_mat.row, 'j': data_mat.col,
                 'm': data_mat.shape[0], 'n': data_mat.shape[1]})

    # Single-argument call form: prints the same text on Python 2 and 3.
    print('Done')
# Example no. 3
# 0
def experiment_leave_k_out(exp_name, daily_data_file, min_occ_user, min_occ_prog, \
                           method_list, leave_k_out, total_iteration, top_n, binary = False):
    '''
    Run the leave-k-out experiment for every method over several iterations.

    For each iteration a leave-k-out index is loaded from the resource store
    (or generated with ds.leave_k_out and saved when none is stored), the
    data is split with data.leave_k_out, and the pair is evaluated by
    experiment_unit_leave_k_out.

    Parameters
    ----------
    @param exp_name: the experiment name (prefix of the experiment id).
    @param daily_data_file: data source handed to
         DailyWatchTimeReader.read_file_with_minval. A list is accepted when
         building the experiment id (it is hashed as a tuple).
         NOTE(review): presumably the reader accepts a list as well -- confirm.
    @param min_occ_user: forwarded to the reader.
    @param min_occ_prog: forwarded to the reader.

    @param method_list: the methods to evaluate; each must provide
         unique_str().
    @param leave_k_out: leave k out for each user. The k must be strictly
         less than min_occ_user.
    @param total_iteration: number of iterations run for each method.
    @param top_n: forwarded unchanged to experiment_unit_leave_k_out.

    @param binary: if this is set to true then the binary data is used
         (data.binarize()); otherwise the rows are normalized
         (data.normalize_row()).

    Returns
    ----------
    @return a dict mapping method.unique_str() to the list of per-iteration
         results returned by experiment_unit_leave_k_out.

    Raises
    ----------
    ValueError if leave_k_out >= min_occ_user.
    '''

    if leave_k_out >= min_occ_user:
        raise ValueError(
            'The k in the leave k out should be strictly less than min_occ_user.'
        )

    # define lko_log style.
    lko_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    # A list is unhashable, so a list of files is hashed as a tuple.
    # NOTE: hash() of a str differs between processes on Python 3 unless
    # PYTHONHASHSEED is fixed, so the id (and the stored splits named after
    # it) is only stable across runs where string hashing is deterministic.
    if isinstance(daily_data_file, list):
        hash_file_str = str(hash(tuple(daily_data_file)))
    else:
        hash_file_str = str(hash(daily_data_file))

    # construct exp_id; the binary variant only differs in its prefix.
    exp_prefix = 'lko_bi_' if binary else 'lko_'
    exp_id = (exp_prefix + exp_name + '_data' + hash_file_str
              + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog)
              + '_k' + str(leave_k_out) + '_toiter' + str(total_iteration))
    lko_log('Experiment ID: ' + exp_id)

    # load data.
    lko_log('Read data...')
    reader = DailyWatchTimeReader()
    data = reader.read_file_with_minval(daily_data_file, min_occ_user,
                                        min_occ_prog)
    lko_log('Data loaded: ' + str(data))

    if binary:
        lko_log('Binarizing data...')
        data.binarize()
    else:
        # normalize
        lko_log('Normalizing data...')
        data.normalize_row()

    # All split resources of this experiment live in the same directory.
    split_dir = exp_id + '/lv_idx'

    result = {}

    for method in method_list:
        # do for each method

        perf_vect = []
        for iteration in range(total_iteration):
            # do for each iteration for each method.

            lko_log('Method: ' + method.unique_str() + ' Iteration: ' +
                    str(iteration))

            # The resource name depends only on the experiment id and the
            # iteration, so every method reuses the same stored split.
            split_resource_str = 'exp' + exp_id + '_lvidx_iter' + str(
                iteration)
            leave_k_out_idx = URM.LoadResource(URM.RTYPE_RESULT,
                                               split_resource_str, split_dir)
            if not leave_k_out_idx:
                # nothing usable stored: randomly generate k items from each
                # row/user and store them for later runs.
                leave_k_out_idx = ds.leave_k_out(data, leave_k_out)
                URM.SaveResource(URM.RTYPE_RESULT, split_resource_str,
                                 leave_k_out_idx, split_dir)

            # split the k items as a separate data set. The unpacking order
            # (held-out part first, training part second) is kept from the
            # original code -- confirm against data.leave_k_out.
            [data_left, data_tr] = data.leave_k_out(leave_k_out_idx)

            iter_result = experiment_unit_leave_k_out(exp_id, method, \
                                    data_tr, data_left, iteration, top_n)

            perf_vect.append(iter_result)

        result[method.unique_str()] = perf_vect

    lko_log('Experiment Done [' + exp_id + ']')

    return result
# Example no. 4
# 0
def experiment_rand_split(exp_name, daily_data_file, min_occ_user, min_occ_prog, \
                          method_list,  training_prec, total_iteration):
    '''
    Run the random row-split experiment for every method over several
    iterations.

    For each iteration a (train, test) split is loaded from the resource
    store (or generated with ds.split and saved when none is stored), the
    corresponding row subsets are taken from the data, and the pair is
    evaluated by experiment_unit_rand_split.

    Parameters
    ----------
    exp_name:        a human-readable experiment name; prefix of the
                     experiment id.
    daily_data_file: data source handed to
                     DailyWatchTimeReader.read_file_with_minval. A list is
                     accepted when building the experiment id (it is hashed
                     as a tuple).
                     NOTE(review): presumably the reader accepts a list as
                     well -- confirm.
    min_occ_user:    forwarded to the reader.
    min_occ_prog:    forwarded to the reader.
    method_list:     a list of matrix completion models; each must provide
                     unique_str().
    training_prec:   forwarded to ds.split together with data.num_row.
                     NOTE(review): presumably the training proportion --
                     confirm.
    total_iteration: number of iterations run for each method.

    Returns
    ----------
    A dict mapping method.unique_str() to the list of per-iteration results
    returned by experiment_unit_rand_split.
    '''
    # define mcpl_log style.
    mcpl_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    # here we use a regular hash. A list is unhashable, so a list of files
    # is hashed as a tuple.
    # NOTE: hash() of a str differs between processes on Python 3 unless
    # PYTHONHASHSEED is fixed, so the id (and the stored splits named after
    # it) is only stable across runs where string hashing is deterministic.
    if isinstance(daily_data_file, list):
        hash_file_str = str(hash(tuple(daily_data_file)))
    else:
        hash_file_str = str(hash(daily_data_file))

    exp_id = (exp_name + '_data' + hash_file_str
              + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog)
              + '_trprec' + str(training_prec)
              + '_toiter' + str(total_iteration))

    mcpl_log('Experiment ID: ' + exp_id)

    reader = DailyWatchTimeReader()
    data = reader.read_file_with_minval(daily_data_file, min_occ_user, min_occ_prog)

    # we normalize here before splitting.
    mcpl_log('Normalizing data...')
    data.normalize_row()

    # experiment splits are saved as resources, all in the same directory.
    split_dir = exp_id + '/split'

    result = {}

    for method in method_list:
        # do for each method

        perf_vect = []
        for iteration in range(total_iteration):
            # do for each iteration for each method

            mcpl_log('Method: '+ method.unique_str() + ' Iteration: '+ str(iteration))

            # The resource name depends only on the experiment id and the
            # iteration, so every method reuses the same stored split.
            split_resource_str = 'exp' + exp_id + '_split_iter' + str(iteration)
            split = URM.LoadResource(URM.RTYPE_RESULT, split_resource_str, split_dir)
            if not split:
                # nothing usable stored: generate a split and store it.
                split = ds.split(data.num_row, training_prec)
                URM.SaveResource(URM.RTYPE_RESULT, split_resource_str, split, split_dir)

            [split_tr, split_te] = split
            data_tr = data.subdata_row(split_tr)
            data_te = data.subdata_row(split_te)

            iter_result = experiment_unit_rand_split(exp_id, method, data_tr, data_te, iteration)

            perf_vect.append(iter_result)

        result[method.unique_str()] = perf_vect

    mcpl_log('Experiment Done [' + exp_id + ']')

    return result
# Example no. 5
# 0
def experiment_coldstart_map(exp_name, daily_data_file,
                             min_occ_user, min_occ_prog, num_user, num_prog,
                             method_list, blind_k_out, total_iteration, max_rank, binary = False):
    '''
    Run the cold-start ("blind k out") experiment for every method over
    several iterations.

    For each iteration a blind-out index is loaded from the resource store
    (or sampled with ds.sample_num and saved when none is stored), the data
    is split with data.blind_k_out, and the pair is evaluated by
    experiment_unit_leave_k_out_map.

    Parameters
    ----------
    @param exp_name:        the experiment name (prefix of the experiment id).
    @param daily_data_file: a file or a list of files, handed to
         DailyWatchTimeReader.read_file_with_minval.
    @param min_occ_user:    cold start user criteria; forwarded to the reader.
    @param min_occ_prog:    forwarded to the reader.
         NOTE(review): presumably the cold start program criteria -- confirm.
    @param num_user:        the number of users selected in the experiment.
    @param num_prog:        the number of programs selected in the experiment.
    @param method_list:     the methods to evaluate; each must provide
         unique_str().
    @param blind_k_out:     how many indices are sampled out of data.num_col
         and blinded out per iteration. The k must be strictly less than
         min_occ_user.
    @param total_iteration: number of iterations run for each method.
    @param max_rank:        forwarded unchanged to
         experiment_unit_leave_k_out_map.

    @param binary: if this is set to true then the binary data is used
         (data.binarize()); otherwise the rows are normalized
         (data.normalize_row()).

    Returns
    ----------
    @return a dict mapping method.unique_str() to the list of per-iteration
         results returned by experiment_unit_leave_k_out_map.

    Raises
    ----------
    ValueError if blind_k_out >= min_occ_user.
    '''

    # Echo the key settings to stdout. Each line is printed as one string
    # made of "label, space, value", which is exactly what the two-item
    # Python 2 print statement emitted and is also valid on Python 3.
    for label, value in (('Blind k out: k = ', blind_k_out),
                         ('Min_occ_user: ', min_occ_user),
                         ('Min_occ_prog: ', min_occ_prog)):
        print(label + ' ' + str(value))

    if blind_k_out >= min_occ_user:
        raise ValueError('The k in the leave k out [' + str(blind_k_out)
                         +'] should be strictly less than min_occ_user [' + str(min_occ_user) +'].')

    # define lko_log style.
    lko_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    # A list is unhashable, so a list of files is hashed as a tuple.
    # NOTE: hash() of a str differs between processes on Python 3 unless
    # PYTHONHASHSEED is fixed, so the id (and the stored indices named after
    # it) is only stable across runs where string hashing is deterministic.
    if isinstance(daily_data_file, list):
        hash_file_str = str(hash(tuple(daily_data_file)))
    else:
        hash_file_str = str(hash(daily_data_file))

    # construct exp_id; the binary variant only differs in its prefix.
    exp_prefix = 'cst_bi_' if binary else 'cst_'
    exp_id = (exp_prefix + exp_name + '_data' + hash_file_str
              + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog)
              + '_nu' + str(num_user) + '_np' + str(num_prog)
              + '_k' + str(blind_k_out) + '_toiter' + str(total_iteration))
    lko_log('Experiment ID: ' + exp_id)

    # load data.
    lko_log('Read data...')
    reader = DailyWatchTimeReader()
    data = reader.read_file_with_minval(daily_data_file, min_occ_user, min_occ_prog, num_user, num_prog)
    lko_log('Data loaded: ' + str(data))

    if binary:
        lko_log('Binarizing data...')
        data.binarize()
    else:
        # normalize
        lko_log('Normalizing data...')
        data.normalize_row()

    # All blind-out index resources of this experiment share one directory.
    split_dir = exp_id + '/blind_idx'

    result = {}

    for method in method_list:
        # do for each method

        perf_vect = []
        for iteration in range(total_iteration):
            # do for each iteration for each method.

            lko_log('Method: '+ method.unique_str() + ' Iteration: '+ str(iteration))

            # The resource name depends only on the experiment id and the
            # iteration, so every method reuses the same stored index.
            split_resource_str = 'exp' + exp_id + '_blind_idx_iter' + str(iteration)
            blind_out_idx = URM.LoadResource(URM.RTYPE_RESULT, split_resource_str, split_dir)
            if not blind_out_idx:
                # nothing usable stored: randomly generate k items to blind
                # out and store them for later runs.
                blind_out_idx = ds.sample_num(data.num_col, blind_k_out)
                URM.SaveResource(URM.RTYPE_RESULT, split_resource_str, blind_out_idx, split_dir)

            lko_log('Blind index done.')
            # split the k items as a separate data set.
            [data_tr, data_left] = data.blind_k_out(blind_out_idx)

            lko_log('Start index')
            iter_result = experiment_unit_leave_k_out_map(exp_id, method, \
                                    data_tr, data_left, iteration, max_rank)

            perf_vect.append(iter_result)

        result[method.unique_str()] = perf_vect

    lko_log('Experiment Done [' + exp_id + ']')

    return result