Example 1
def split(self, percentage):
    '''
    Get a random split of the data with a specified proportion of rows.
    NOTE: to obtain deterministic splits, use the subdata_row method instead.

    Parameters
    ----------
    @param percentage: the proportion of rows (between 0 and 1) assigned to
                       the first split.

    Returns
    ----------
    @return out: a list [data_split, data_split_comp, selidx_split, selidx_split_comp]
    data_split: a Feedback data set with the given percentage of rows, whose
                indices (in the full data set) are given in selidx_split.
    data_split_comp: a Feedback data set with the remaining 1 - percentage of
                     rows, whose indices are given in selidx_split_comp. This
                     is the complement of data_split.
    selidx_split: the indices of the rows in data_split.
    selidx_split_comp: the indices of the rows in data_split_comp.
    '''
     
    # obtain the row indices of the split and of its complement.
    [selidx_split, selidx_split_comp] = ds.split(self.num_row, percentage)

    # build the two sub-data sets from the selected rows.
    data_split = self.subdata_row(selidx_split)
    data_split_comp = self.subdata_row(selidx_split_comp)

    return [data_split, data_split_comp, selidx_split, selidx_split_comp]
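
A minimal usage sketch, not from the source: here data stands for any
instance of the class that defines split() above, and the 0.8 ratio is an
arbitrary choice.

    # randomly reserve 80% of the rows for training and 20% for testing.
    [data_tr, data_te, idx_tr, idx_te] = data.split(0.8)

    # the two index lists partition the rows of the full data set.
    assert len(idx_tr) + len(idx_te) == data.num_row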
Example 2
def experiment_rand_split(exp_name, daily_data_file, min_occ_user, min_occ_prog,
                          method_list, training_prec, total_iteration):
    '''
    Run a random-split experiment: repeatedly split the data into training
    and test portions, evaluate every method on each split, and collect the
    per-iteration results.

    Parameters
    ----------
    exp_name:        a human-readable experiment name.
    daily_data_file: the data file to read.
    min_occ_user:    the minimum number of occurrences for a user to be kept.
    min_occ_prog:    the minimum number of occurrences for a program to be kept.
    method_list:     a list of matrix completion models.
    training_prec:   the proportion of rows used for training.
    total_iteration: the number of random splits per method.

    Returns
    ----------
    result: a dictionary mapping each method's unique_str() to the list of
            its per-iteration results.
    '''
    # shorthand for logging in the experiment message category.
    mcpl_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    # build a unique experiment ID from the data file and the parameters.
    # NOTE: str hashes are salted per process in Python 3, so hash() gives a
    # stable ID across runs only if PYTHONHASHSEED is fixed.
    exp_id = exp_name + '_data' + str(hash(daily_data_file)) \
           + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog) \
           + '_trprec' + str(training_prec) + '_toiter' + str(total_iteration)

    mcpl_log('Experiment ID: ' + exp_id)

    # read the data, keeping only users/programs with enough occurrences.
    reader = UtilityDataReader()
    data = reader.read_file_with_minval(daily_data_file, min_occ_user,
                                        min_occ_prog)

    # we normalize here before splitting.
    mcpl_log('Normalizing data...')
    data.normalize_row()

    result = {}

    for method in method_list:
        # evaluate each matrix completion method in turn.

        perf_vect = []
        for iteration in range(total_iteration):
            # run one random split and evaluation per iteration.

            mcpl_log('Method: ' + method.unique_str() + ' Iteration: ' +
                     str(iteration))

            # load the data split of the current iteration from the resource
            # cache, or create and save it on a cache miss.
            split_resource_str = 'exp' + exp_id + '_split_iter' + str(
                iteration)
            split_dir = exp_id + '/split'
            split = URM.LoadResource(URM.RTYPE_RESULT, split_resource_str,
                                     split_dir)
            if not split:
                split = ds.split(data.num_row, training_prec)
                URM.SaveResource(URM.RTYPE_RESULT, split_resource_str, split,
                                 split_dir)

            [split_tr, split_te] = split
            data_tr = data.subdata_row(split_tr)
            data_te = data.subdata_row(split_te)

            iter_result = experiment_unit_rand_split(exp_id, method, data_tr,
                                                     data_te, iteration)

            perf_vect.append(iter_result)

        result[method.unique_str()] = perf_vect

    mcpl_log('Experiment Done [' + exp_id + ']')

    return result
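
A hedged usage sketch, not from the source: the file path, thresholds, and
model objects are hypothetical; any objects exposing unique_str() and the
interface expected by experiment_unit_rand_split would do.

    # run 10 random 80/20 splits for two candidate models.
    result = experiment_rand_split('demo_exp', 'data/daily.tsv',
                                   min_occ_user=5, min_occ_prog=5,
                                   method_list=[model_a, model_b],
                                   training_prec=0.8, total_iteration=10)

    # result maps each model's unique_str() to its list of 10 iteration results.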