def experiment_leave_k_out(exp_name, daily_data_file, min_occ_user, min_occ_prog,
                           method_list, leave_k_out, total_iteration, top_n, binary=False):
    '''
    Leave-k-out experiment: for each user, withhold k items for testing,
    train on the rest, and repeat for total_iteration random splits.

    Parameters
    ----------
    @param exp_name: the experiment name (prefix).
    @param daily_data_file: the daily data file to load.
    @param min_occ_user: minimum number of occurrences for a user to be kept.
    @param min_occ_prog: minimum number of occurrences for a program to be kept.
    @param method_list: a list of methods to evaluate.
    @param leave_k_out: leave k out for each user. The k must be strictly less
           than min_occ_user.
    @param total_iteration: the number of random splits to run.
    @param top_n: the N used in the top-N evaluation.
    @param binary: if this is set to True then the binary data is used
           (non-zero set to 1).

    Returns
    ----------
    @return a dictionary mapping each method's unique string to a list of
            per-iteration performance results.
    '''
    if leave_k_out >= min_occ_user:
        raise ValueError('The k in the leave-k-out should be strictly less than min_occ_user.')

    # define lko_log style.
    lko_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    # construct exp_id.
    if binary:
        exp_id = 'lko_bi_' + exp_name + '_data' + str(hash(daily_data_file)) \
               + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog) \
               + '_k' + str(leave_k_out) + '_toiter' + str(total_iteration)
    else:
        exp_id = 'lko_' + exp_name + '_data' + str(hash(daily_data_file)) \
               + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog) \
               + '_k' + str(leave_k_out) + '_toiter' + str(total_iteration)
    lko_log('Experiment ID: ' + exp_id)

    # load data.
    lko_log('Read data...')
    reader = UtilityDataReader()
    data = reader.read_file_with_minval(daily_data_file, min_occ_user, min_occ_prog)
    lko_log('Data loaded: ' + str(data))

    if binary:
        lko_log('Binarizing data...')
        data.binarize()
    else:
        # normalize each row so user activity levels are comparable.
        lko_log('Normalizing data...')
        data.normalize_row()

    result = {}
    for method in method_list:
        # do for each method.
        perf_vect = []
        for iteration in range(total_iteration):
            # do for each iteration for each method.
            lko_log('Method: ' + method.unique_str() + ' Iteration: ' + str(iteration))

            # data split of the current iteration (cached as a resource and
            # reused so every method sees the same splits).
            split_resource_str = 'exp' + exp_id + '_lvidx_iter' + str(iteration)
            split_dir = exp_id + '/lv_idx'
            leave_k_out_idx = URM.LoadResource(URM.RTYPE_RESULT, split_resource_str, split_dir)
            if not leave_k_out_idx:
                # randomly generate k items from each row/user.
                leave_k_out_idx = ds.leave_k_out(data, leave_k_out)
                URM.SaveResource(URM.RTYPE_RESULT, split_resource_str, leave_k_out_idx, split_dir)

            # split the k withheld items into a separate data set.
            [data_left, data_tr] = data.leave_k_out(leave_k_out_idx)

            iter_result = experiment_unit_leave_k_out(exp_id, method,
                                                      data_tr, data_left, iteration, top_n)
            perf_vect.append(iter_result)

        result[method.unique_str()] = perf_vect
    return result
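# Usage (a minimal sketch, not part of this module): the file path and the
# method list below are illustrative assumptions; any objects exposing
# unique_str() and accepted by experiment_unit_leave_k_out would work.
#
#   methods = [...]  # e.g. a list of recommendation models
#   perf = experiment_leave_k_out('demo', 'path/to/daily_file',
#                                 min_occ_user=5, min_occ_prog=5,
#                                 method_list=methods, leave_k_out=2,
#                                 total_iteration=10, top_n=20)
#   # perf[m.unique_str()] holds the ten per-iteration results for method m.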
def experiment_rand_split(exp_name, daily_data_file, min_occ_user, min_occ_prog,
                          method_list, training_prec, total_iteration):
    '''
    Random-split experiment: repeatedly split the user rows into training and
    testing sets and evaluate each method on every split.

    Parameters
    ----------
    exp_name: a human-readable experiment name.
    daily_data_file: the daily data file to load.
    min_occ_user: minimum number of occurrences for a user to be kept.
    min_occ_prog: minimum number of occurrences for a program to be kept.
    method_list: a list of matrix completion models.
    training_prec: the proportion of rows (users) assigned to the training set.
    total_iteration: the number of random splits to run.

    Returns
    ----------
    result: a dictionary mapping each method's unique string to a list of
            per-iteration performance results.
    '''
    # define mcpl_log style.
    mcpl_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    #mcpl_log('Data ID: ' + hash(daily_data_file));

    # here we use a regular hash.
    exp_id = exp_name + '_data' + str(hash(daily_data_file)) \
           + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog) \
           + '_trprec' + str(training_prec) + '_toiter' + str(total_iteration)
    mcpl_log('Experiment ID: ' + exp_id)

    # save experiment splitting as resources.
    reader = UtilityDataReader()
    data = reader.read_file_with_minval(daily_data_file, min_occ_user, min_occ_prog)

    # we normalize here before splitting.
    mcpl_log('Normalizing data...')
    data.normalize_row()

    result = {}
    for method in method_list:
        # do for each method.
        perf_vect = []
        for iteration in range(total_iteration):
            # do for each iteration for each method.
            mcpl_log('Method: ' + method.unique_str() + ' Iteration: ' + str(iteration))

            # data split of the current iteration (cached as a resource and
            # reused so every method sees the same splits).
            split_resource_str = 'exp' + exp_id + '_split_iter' + str(iteration)
            split_dir = exp_id + '/split'
            split = URM.LoadResource(URM.RTYPE_RESULT, split_resource_str, split_dir)
            if not split:
                split = ds.split(data.num_row, training_prec)
                URM.SaveResource(URM.RTYPE_RESULT, split_resource_str, split, split_dir)

            [split_tr, split_te] = split
            data_tr = data.subdata_row(split_tr)
            data_te = data.subdata_row(split_te)

            iter_result = experiment_unit_rand_split(exp_id, method, data_tr, data_te, iteration)
            perf_vect.append(iter_result)

        result[method.unique_str()] = perf_vect

    mcpl_log('Experiment Done [' + exp_id + ']')
    return result
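# Usage (a minimal sketch; the file path and model list are illustrative
# assumptions): an 80/20 random row split, repeated ten times.
#
#   methods = [...]  # matrix completion models exposing unique_str()
#   perf = experiment_rand_split('demo', 'path/to/daily_file',
#                                min_occ_user=5, min_occ_prog=5,
#                                method_list=methods, training_prec=0.8,
#                                total_iteration=10)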
def experiment_tr_te_map(exp_name, train_data_file, test_data_file,
                         train_item_feature_file, test_item_feature_file,
                         max_rank, binary=False):
    '''
    Train/test experiment on separately supplied training and testing files,
    optionally with item (content) feature files.

    Parameters
    ----------
    @param exp_name: the experiment name (prefix).
    @param train_data_file: the training data file, or a list of files.
    @param test_data_file: the testing data file, or a list of files.
    @param train_item_feature_file: optional item feature file(s) for training.
    @param test_item_feature_file: optional item feature file(s) for testing.
    @param max_rank: the maximal N in the top-N computation.
    @param binary: if this is set to True then the binary data is used
           (non-zero set to 1).

    Returns
    ----------
    @return out
    '''
    # initialize utilities.
    trte_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    # processing file name hashing (used for cache string).
    # create hash for a single file or a list of files.
    if isinstance(train_data_file, list):
        hash_file_tr_data_str = str(hash(tuple(train_data_file)))
    else:
        hash_file_tr_data_str = str(hash(train_data_file))

    if isinstance(test_data_file, list):
        hash_file_te_data_str = str(hash(tuple(test_data_file)))
    else:
        hash_file_te_data_str = str(hash(test_data_file))

    if train_item_feature_file:
        if isinstance(train_item_feature_file, list):
            hash_file_tr_item_feature_str = str(hash(tuple(train_item_feature_file)))
        else:
            hash_file_tr_item_feature_str = str(hash(train_item_feature_file))
    else:
        hash_file_tr_item_feature_str = ''

    if test_item_feature_file:
        if isinstance(test_item_feature_file, list):
            hash_file_te_item_feature_str = str(hash(tuple(test_item_feature_file)))
        else:
            hash_file_te_item_feature_str = str(hash(test_item_feature_file))
    else:
        hash_file_te_item_feature_str = ''

    # display information.
    print 'Training data file', train_data_file, ' [hash:', hash_file_tr_data_str, ']'
    if train_item_feature_file:
        print 'Training content feature provided: ', train_item_feature_file, \
            ' [hash:', hash_file_tr_item_feature_str, ']'
    else:
        print 'Training content feature not provided.'

    print 'Testing data file ', test_data_file, ' [hash:', hash_file_te_data_str, ']'
    if test_item_feature_file:
        print 'Testing content feature provided: ', test_item_feature_file, \
            ' [hash:', hash_file_te_item_feature_str, ']'
    else:
        print 'Testing content feature not provided.'

    if binary:
        exp_id_prefix = 'trte_bi_'
    else:
        exp_id_prefix = 'trte_'
    exp_id = exp_id_prefix + exp_name + '_trdata_' + hash_file_tr_data_str \
           + '_tedata_' + hash_file_te_data_str \
           + '_tritemf_' + hash_file_tr_item_feature_str \
           + '_teitemf_' + hash_file_te_item_feature_str
    trte_log('Experiment ID: ' + exp_id)

    # load utility data and feature data.
    trte_log('Read training data...')
    reader = UtilityDataReader(fieldDelimiter='\t')
    tr_data = reader.read_file_with_minval(train_data_file, 0, 0)
    trte_log('Training data loaded: ' + str(tr_data))

    te_data = reader.read_file_with_minval(test_data_file, 0, 0)
    trte_log('Testing data loaded: ' + str(te_data))

    # load item feature data.
    if binary:
        trte_log('Binarizing data...')
        tr_data.binarize()
        te_data.binarize()

    result = {}
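# Usage (a minimal sketch; all file paths are illustrative assumptions, and
# the two feature-file arguments may be falsy to skip content features):
#
#   perf = experiment_tr_te_map('demo',
#                               'path/to/train_file', 'path/to/test_file',
#                               'path/to/train_item_features',
#                               'path/to/test_item_features',
#                               max_rank=20, binary=True)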
#from StringIO import StringIO
from rs.data.utility_data import UtilityDataReader
from rs.data.recdata import FeedbackData

from copy import deepcopy

if __name__ == '__main__':
    #txt = "U1\tD1\t44\n" + "U2\tD2\t10\n" + "U2\tD1\t20\n"
    #for line in StringIO(txt):
    #    print line

    data_file = '../../datasample/agg_duid_pid_watchtime_genre/toy_small_day1'

    reader = UtilityDataReader()
    feedback = reader.readFile(data_file)
    print feedback.col_mapping

    # map each program (column) ID to an illustrative feature value.
    col_feature_map = {}
    col_feature_map['P0001'] = 'feature1'
    col_feature_map['P0002'] = 'feature2'
    col_feature_map['P0003'] = 'feature3'
    col_feature_map['P0004'] = 'feature4'

    # keep an untouched copy, then attach the column features in place.
    feedback2 = deepcopy(feedback)
    feedback.attach_col_feature(col_feature_map)
    print feedback.meta[FeedbackData.METAKEY_COL_FEATURE]
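    # A further sketch (assumption: the same reader API the experiment drivers
    # use above): drop users/programs below a minimum occurrence count before
    # attaching features.
    #
    #   filtered = reader.read_file_with_minval(data_file, 2, 2)
    #   print filtered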