def experiment_unit_rand_split(exp_id, method, tr_data, te_data, iteration):
    '''
    One iteration of training and testing. The experiment ID (exp_id) identifies
    the cached resources (trained model) of this unit.
    '''
    # define mcpl_log style.
    mcpl_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    result_resource_str = 'exp' + exp_id + \
                          '_method' + method.unique_str() + \
                          '_iter' + str(iteration)
    sub_folder = exp_id + '/models/' + method.unique_str()  # use a sub folder to store the experiment resource.

    # check resource for existing model.
    trained_model = URM.LoadResource(URM.RTYPE_RESULT, result_resource_str, sub_folder)
    if not trained_model:
        # train model using the training data.
        # NOTE: this is the most time-consuming part.
        mcpl_log('training models...')
        method.train(tr_data)

        # save resource
        trained_model = [method]
        URM.SaveResource(URM.RTYPE_RESULT, result_resource_str, trained_model, sub_folder)

    # compute performance on test data using the model.
    [method] = trained_model
    mcpl_log('computing evaluation metrics on the test data...')

    eval_result = rmse(te_data.data_val, method.predict(te_data.data_row, te_data.data_col))
    return eval_result
def experiment_leave_k_out(exp_name, daily_data_file, min_occ_user, min_occ_prog, \
                           method_list, leave_k_out, total_iteration, top_n, binary = False):
    '''
    Parameters
    ----------
    @param exp_name:        the experiment name (prefix).
    @param daily_data_file: the daily watch time data file.
    @param min_occ_user:    the minimum occurrences required for a user to be included.
    @param min_occ_prog:    the minimum occurrences required for a program to be included.
    @param method_list:     a list of recommendation methods to evaluate.
    @param leave_k_out:     leave k out for each user. The k must be strictly less than min_occ_user.
    @param total_iteration: the number of random leave-k-out iterations per method.
    @param top_n:           the length of the ranked list used for precision/recall.
    @param binary:          if this is set to True then the binary data is used (non-zero set to 1).

    Returns
    ----------
    @return a dictionary mapping each method's unique_str() to its list of per-iteration results.
    '''
    if leave_k_out >= min_occ_user:
        raise ValueError('The k in the leave k out should be strictly less than min_occ_user.')

    # define lko_log style.
    lko_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    # construct exp_id
    if binary:
        exp_id = 'lko_bi_' + exp_name + '_data' + str(hash(daily_data_file)) \
                 + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog) \
                 + '_k' + str(leave_k_out) + '_toiter' + str(total_iteration)
    else:
        exp_id = 'lko_' + exp_name + '_data' + str(hash(daily_data_file)) \
                 + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog) \
                 + '_k' + str(leave_k_out) + '_toiter' + str(total_iteration)
    lko_log('Experiment ID: ' + exp_id)

    # load data.
    lko_log('Read data...')
    reader = DailyWatchTimeReader()
    data = reader.read_file_with_minval(daily_data_file, min_occ_user, min_occ_prog)
    lko_log('Data loaded: ' + str(data))

    if binary:
        lko_log('Binarizing data...')
        data.binarize()
    else:
        # normalize
        lko_log('Normalizing data...')
        data.normalize_row()

    result = {}

    for method in method_list:
        # do for each method

        perf_vect = []
        for iteration in range(total_iteration):
            # do for each iteration for each method.

            lko_log('Method: ' + method.unique_str() + ' Iteration: ' + str(iteration))

            # data split of the current iteration.
            split_resource_str = 'exp' + exp_id + '_lvidx_iter' + str(iteration)
            split_dir = exp_id + '/lv_idx'
            leave_k_out_idx = URM.LoadResource(URM.RTYPE_RESULT, split_resource_str, split_dir)
            if not leave_k_out_idx:
                # randomly generate k items from each row/user.
                leave_k_out_idx = ds.leave_k_out(data, leave_k_out)
                URM.SaveResource(URM.RTYPE_RESULT, split_resource_str, leave_k_out_idx, split_dir)

            # split the k items per user into a separate data set.
            [data_left, data_tr] = data.leave_k_out(leave_k_out_idx)

            iter_result = experiment_unit_leave_k_out(exp_id, method, \
                                                      data_tr, data_left, iteration, top_n)
            perf_vect.append(iter_result)

        result[method.unique_str()] = perf_vect
    return result
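# Usage sketch (not part of the original module): how a leave-k-out run might be
# driven. The data file path and the RandUniRecommender class below are hypothetical
# placeholders; any method object exposing unique_str()/train()/predict_row()/predict()
# as used above would work.
#
#   methods = [RandUniRecommender()]                                # hypothetical recommender
#   result = experiment_leave_k_out('demo', 'data/watchtime.tsv',   # hypothetical data file
#                                   min_occ_user = 50, min_occ_prog = 300,
#                                   method_list = methods, leave_k_out = 10,
#                                   total_iteration = 3, top_n = 20)
#   # result maps each method's unique_str() to a list of per-iteration
#   # {'prec': ..., 'recall': ..., 'rmse': ...} dictionaries.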
def experiment_unit_leave_k_out(exp_id, method, data_tr, data_left, iteration, top_n):
    '''
    This method works on the column/row index of the data_tr and data_left,
    and the data_tr and data_left must be completely aligned both row-wise
    and column-wise.
    '''
    # define lko_log style.
    lko_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    result_resource_str = 'exp' + exp_id + \
                          '_method' + method.unique_str() + \
                          '_iter' + str(iteration)
    sub_folder = exp_id + '/models/' + method.unique_str()  # use a sub folder to store the experiment resource.

    # check resource for existing model.
    trained_model = URM.LoadResource(URM.RTYPE_RESULT, result_resource_str, sub_folder)
    if not trained_model:
        # train model using the training data.
        # NOTE: this is the most time-consuming part.
        lko_log('training models...')
        method.train(data_tr)

        # save resource
        trained_model = [method]
        URM.SaveResource(URM.RTYPE_RESULT, result_resource_str, trained_model, sub_folder)

    # compute performance on test data using the model.
    [method] = trained_model
    lko_log('computing evaluation metrics on the test data...')

    eval_result = {}

    # ranked list.
    col_num = data_left.num_col
    pred_col = range(col_num)

    tr_data_csr = data_tr.get_sparse_matrix().tocsr()
    lo_data_csr = data_left.get_sparse_matrix().tocsr()

    # accumulate per-user precision/recall and average over users at the end.
    prec_sum = 0.0
    rec_sum = 0.0

    for user_idx in range(data_left.num_row):
        # predict the entire row.
        #pred_row = [user_idx] * col_num;
        #row_pred = method.predict(pred_row, pred_col);
        row_pred = method.predict_row(user_idx, pred_col)

        # rank the columns (the result is a list of indices).
        srt_col = [k[0] for k in sorted(enumerate(row_pred), key=lambda x: x[1], reverse=True)]

        # trained columns.
        tr_col = set(np.nonzero(tr_data_csr[user_idx, :])[1].tolist())

        # remove the trained columns.
        te_srt_col = [col_pos for col_pos in srt_col if col_pos not in tr_col]

        # top-n (slicing truncates safely when fewer than top_n columns remain).
        te_topk_col = te_srt_col[:top_n]

        # test column index.
        lo_col = set(np.nonzero(lo_data_csr[user_idx, :])[1].tolist())

        prec_sum += precision_itemlist(te_topk_col, lo_col)
        rec_sum += recall_itemlist(te_topk_col, lo_col)

    # average precision/recall over users.
    eval_result['prec'] = prec_sum / data_left.num_row
    eval_result['recall'] = rec_sum / data_left.num_row
    eval_result['rmse'] = rmse(data_left.data_val,
                               method.predict(data_left.data_row, data_left.data_col))
    return eval_result
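# Illustrative sketch (hedged: the helper below re-implements the standard
# set-based definitions and is not the project's precision_itemlist/recall_itemlist):
# a top-n list [4, 9, 2] against held-out items {9, 7} gives precision 1/3
# (one of three recommended items is relevant) and recall 1/2 (one of two
# relevant items is recommended).
def _prec_recall_example():
    topk = [4, 9, 2]             # recommended columns for one user.
    relevant = set([9, 7])       # held-out columns of that user.
    n_hit = len([c for c in topk if c in relevant])
    precision = float(n_hit) / len(topk)
    recall = float(n_hit) / len(relevant)
    return precision, recall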
feedback_data = reader.read_file_with_minval(daily_data_file, min_occ_user, min_occ_prog, num_user, num_prog)

exp_id = 'lko_bi_' + exp_name + '_data' + hash_file_str \
         + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog) \
         + '_nu' + str(num_user) + '_np' + str(num_prog) \
         + '_k' + str(leave_k_out) + '_toiter' + str(total_iteration)

result_resource_str = 'exp' + exp_id + \
                      '_method' + method.unique_str() + \
                      '_iter' + str(iteration)
sub_folder = exp_id + '/models/' + method.unique_str()  # use a sub folder to store the experiment resource.

trained_model = URM.LoadResource(URM.RTYPE_RESULT, result_resource_str, sub_folder)
[method] = trained_model

learnt_genre = method.V

program_mapping = feedback_data.col_mapping
program_inv_mapping = {y: x for x, y in program_mapping.items()}
program_name = [program_inv_mapping[i] for i in range(len(program_mapping))]

sio.savemat("prog_genre_mat.mat", {'genre_mat': learnt_genre, 'prog_name': program_name})
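# Sketch (assumes only the standard scipy.io round trip): the exported matrix
# can be inspected by loading the file back, e.g.
#   check = sio.loadmat('prog_genre_mat.mat')
#   print check['genre_mat'].shape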
def experiment_rand_split(exp_name, daily_data_file, min_occ_user, min_occ_prog, \
                          method_list, training_prec, total_iteration):
    '''
    Parameters
    ----------
    exp_name:        a human-readable experiment name.
    daily_data_file: the daily watch time data file.
    min_occ_user:    the minimum occurrences required for a user to be included.
    min_occ_prog:    the minimum occurrences required for a program to be included.
    method_list:     a list of matrix completion models.
    training_prec:   the proportion of rows used for training in each split.
    total_iteration: the number of random splits to evaluate.

    Returns
    ----------
    a dictionary mapping each method's unique_str() to its list of per-iteration RMSE results.
    '''
    # define mcpl_log style.
    mcpl_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    #mcpl_log('Data ID: ' + hash(daily_data_file)); # here we use a regular hash.

    exp_id = exp_name + '_data' + str(hash(daily_data_file)) \
             + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog) \
             + '_trprec' + str(training_prec) + '_toiter' + str(total_iteration)

    mcpl_log('Experiment ID: ' + exp_id)

    # save experiment splitting as resources.
    reader = UtilityDataReader()
    data = reader.read_file_with_minval(daily_data_file, min_occ_user, min_occ_prog)

    # we normalize here before splitting.
    mcpl_log('Normalizing data...')
    data.normalize_row()

    result = {}

    for method in method_list:
        # do for each method

        perf_vect = []
        for iteration in range(total_iteration):
            # do for each iteration for each method.

            mcpl_log('Method: ' + method.unique_str() + ' Iteration: ' + str(iteration))

            # data split of the current iteration.
            split_resource_str = 'exp' + exp_id + '_split_iter' + str(iteration)
            split_dir = exp_id + '/split'
            split = URM.LoadResource(URM.RTYPE_RESULT, split_resource_str, split_dir)
            if not split:
                split = ds.split(data.num_row, training_prec)
                URM.SaveResource(URM.RTYPE_RESULT, split_resource_str, split, split_dir)

            [split_tr, split_te] = split

            data_tr = data.subdata_row(split_tr)
            data_te = data.subdata_row(split_te)

            iter_result = experiment_unit_rand_split(exp_id, method, data_tr, data_te, iteration)

            perf_vect.append(iter_result)

        result[method.unique_str()] = perf_vect

    mcpl_log('Experiment Done [' + exp_id + ']')

    return result
def read_file_with_minval(self, filename, min_duid, min_pid, num_duid = None, num_pid = None, rand_seed = 1):
    '''
    This method first goes through the data once and filters out devices and
    programs whose occurrence counts do not exceed the specified minimum values.
    Random undersampling of the remaining devices/programs is supported via
    num_duid/num_pid (with rand_seed controlling the sampling).

    Parameters
    ----------
    @param filename: a string consisting of the name and location of the data file to be read.
    @param min_duid: a positive integer. A device is included only if it occurs more than min_duid times.
    @param min_pid:  a positive integer. A program is included only if it occurs more than min_pid times.

    Returns
    ----------
    result: a FeedbackData data structure constructed from the data file.
        The result also carries a genre-program mapping:
        (result.meta['pggr_pg'][i], result.meta['pggr_gr'][i]) indicates that the program
        at result.meta['pggr_pg'][i] is marked by the genre at result.meta['pggr_gr'][i].
        The genre mapping is in R:/Data/Rovi/genre.csv, and a vintage copy is also kept
        in the datasample/Rovi folder.
    '''
    if num_duid is None and num_pid is None:
        subsample = False;
        res_str = 'DWT_RFWMV[' + str(filename) + '][MIN DUID' + str(min_duid) + '][MIN PID' + str(min_pid) + ']';
    elif num_duid is not None and num_pid is not None:
        subsample = True;
        res_str = 'DWT_RFWMV[' + str(filename) + '][MIN DUID' + str(min_duid) + '][MIN PID' + str(min_pid) + ']' \
                  + '[NUM DUID' + str(num_duid) + ']' + '[NUM PID' + str(num_pid) + ']';
    else:
        raise ValueError('num_duid and num_pid should be both set or both use default');

    # We check if the current resource is available. If not then load from the data file and save the resource.
    if not URM.CheckResource(URM.RTYPE_DATA, res_str):

        Logger.Log('Computing data information...');
        [occur_duid, occur_pid, _, _] = self.read_file_info(filename);

        print str(len(occur_duid)), 'devices', str(len(occur_pid)), 'programs';

        Logger.Log('Generating filtering indices...');
        duidlist = [sel_duid for sel_duid, sel_duidcnt in occur_duid.iteritems() if sel_duidcnt > min_duid];
        pidlist  = [sel_pid  for sel_pid,  sel_pidcnt  in occur_pid.iteritems()  if sel_pidcnt  > min_pid];

        print 'After filtering [MIN_DUID', str(min_duid), ' MIN_PID:', str(min_pid), ']:', \
            str(len(duidlist)), 'devices', str(len(pidlist)), 'programs';

        # perform random sampling.
        if subsample:
            random.seed(rand_seed);
            if len(duidlist) > num_duid:
                # subsample DUID;
                random.shuffle(duidlist);
                duidlist = duidlist[:num_duid];
            if len(pidlist) > num_pid:
                # subsample PID;
                random.shuffle(pidlist);
                pidlist = pidlist[:num_pid];

        duidlist = set(duidlist);
        pidlist  = set(pidlist);

        # read the raw data file with the list.
        [mapping_duid, mapping_pid, row, col, data, pggr_pg, pggr_gr] \
            = self.read_file_with_id_list(filename, duidlist, pidlist);

        Logger.Log('read_file_with_minval process completed.');

        result = FeedbackData(row, col, data, len(mapping_duid), len(mapping_pid), \
                              mapping_duid, mapping_pid, {'pggr_pg': pggr_pg, 'pggr_gr': pggr_gr});

        # save computed results to resource cache.
        URM.SaveResource(URM.RTYPE_DATA, res_str, result);
        return result;
    else:
        return URM.LoadResource(URM.RTYPE_DATA, res_str);
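# Usage sketch (the data file path below is a placeholder): thanks to the URM
# resource cache, a second call with identical arguments is served from cache
# instead of re-parsing the raw file.
#   reader = DailyWatchTimeReader();
#   fb_data = reader.read_file_with_minval('data/duid_pid_watchtime.tsv', 35, 300);
#   print fb_data.num_row, 'devices x', fb_data.num_col, 'programs';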
def experiment_coldstart_map(exp_name, daily_data_file, \
                             min_occ_user, min_occ_prog, num_user, num_prog, \
                             method_list, blind_k_out, total_iteration, max_rank, binary = False):
    '''
    Parameters
    ----------
    @param exp_name: the experiment name (prefix).
    @param daily_data_file: a list of data files.
    @param min_occ_user: cold-start user criterion (minimum occurrences per user).
    @param min_occ_prog: cold-start program criterion (minimum occurrences per program).
    @param num_user: the number of users selected in the experiment.
    @param num_prog: the number of programs selected in the experiment.
    @param method_list: a list of recommendation methods to evaluate.
    @param blind_k_out: the number of programs blinded out in each iteration. The k must be
                        strictly less than min_occ_user.
    @param total_iteration: the number of random blind-out iterations per method.
    @param max_rank: the maximum rank at which hit rate/precision/recall are evaluated.
    @param binary: if this is set to True then the binary data is used (non-zero set to 1).

    Returns
    ----------
    @return a dictionary mapping each method's unique_str() to its list of per-iteration results.
    '''
    print 'Blind k out: k = ', str(blind_k_out);
    print 'Min_occ_user: ', str(min_occ_user);
    print 'Min_occ_prog: ', str(min_occ_prog);

    if blind_k_out >= min_occ_user:
        raise ValueError('The k in the leave k out [' + str(blind_k_out) + \
                         '] should be strictly less than min_occ_user [' + str(min_occ_user) + '].');

    # define lko_log style.
    lko_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP);

    if isinstance(daily_data_file, list):
        hash_file_str = str(hash(tuple(daily_data_file)));
    else:
        hash_file_str = str(hash(daily_data_file));

    # construct exp_id
    if binary:
        exp_id = 'cst_bi_' + exp_name + '_data' + hash_file_str \
                 + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog) \
                 + '_nu' + str(num_user) + '_np' + str(num_prog) \
                 + '_k' + str(blind_k_out) + '_toiter' + str(total_iteration);
    else:
        exp_id = 'cst_' + exp_name + '_data' + hash_file_str \
                 + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog) \
                 + '_nu' + str(num_user) + '_np' + str(num_prog) \
                 + '_k' + str(blind_k_out) + '_toiter' + str(total_iteration);
    lko_log('Experiment ID: ' + exp_id);

    # load data.
    lko_log('Read data...');
    reader = DailyWatchTimeReader();
    data = reader.read_file_with_minval(daily_data_file, min_occ_user, min_occ_prog, num_user, num_prog);
    lko_log('Data loaded: ' + str(data));

    if binary:
        lko_log('Binarizing data...');
        data.binarize();
    else:
        # normalize
        lko_log('Normalizing data...');
        data.normalize_row();

    result = {};

    for method in method_list:
        # do for each method

        perf_vect = [];
        for iteration in range(total_iteration):
            # do for each iteration for each method.

            lko_log('Method: ' + method.unique_str() + ' Iteration: ' + str(iteration));

            # data split of the current iteration.
            split_resource_str = 'exp' + exp_id + '_blind_idx_iter' + str(iteration);
            split_dir = exp_id + '/blind_idx';
            blind_out_idx = URM.LoadResource(URM.RTYPE_RESULT, split_resource_str, split_dir);
            if not blind_out_idx:
                # randomly generate k programs to blind out.
                blind_out_idx = ds.sample_num(data.num_col, blind_k_out);
                URM.SaveResource(URM.RTYPE_RESULT, split_resource_str, blind_out_idx, split_dir);

            lko_log('Blind index done.');

            # split the k blinded-out programs into a separate data set.
            [data_tr, data_left] = data.blind_k_out(blind_out_idx);

            lko_log('Start index');

            iter_result = experiment_unit_leave_k_out_map(exp_id, method, \
                                                          data_tr, data_left, iteration, max_rank);

            perf_vect.append(iter_result);

        result[method.unique_str()] = perf_vect;

    return result;
def experiment_unit_leave_k_out_map(exp_id, method, data_tr, data_left, iteration, max_rank):
    '''
    This method works on the column/row index of the data_tr and data_left,
    and the data_tr and data_left must be completely aligned both row-wise
    and column-wise.
    '''
    # define lko_log style.
    lko_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP);

    result_resource_str = 'exp' + exp_id + \
                          '_method' + method.unique_str() + \
                          '_iter' + str(iteration);
    sub_folder = exp_id + '/models/' + method.unique_str();  # use a sub folder to store the experiment resource.

    # check resource for existing model.
    trained_model = URM.LoadResource(URM.RTYPE_RESULT, result_resource_str, sub_folder);
    if not trained_model:
        # train model using the training data.
        # NOTE: this is the most time-consuming part.
        lko_log('training models...');
        method.train(data_tr);

        # save resource
        trained_model = [method];
        URM.SaveResource(URM.RTYPE_RESULT, result_resource_str, trained_model, sub_folder);

    # compute performance on test data using the model.
    [method] = trained_model;
    lko_log('computing evaluation metrics on the test data...');

    eval_result = {};

    # ranked list.
    col_num = data_left.num_col;
    pred_col = range(col_num);

    tr_data_csr = data_tr.get_sparse_matrix().tocsr();
    lo_data_csr = data_left.get_sparse_matrix().tocsr();

    perf_vect_prec = np.zeros(max_rank);  # precision
    perf_vect_rec  = np.zeros(max_rank);  # recall
    perf_vect_hr   = np.zeros(max_rank);  # hit rate (modification of Xia Ning's paper)

    for user_idx in range(data_left.num_row):
        # predict the entire row.

        # test column index;
        lo_col = set(np.nonzero(lo_data_csr[user_idx, :])[1].tolist());

        # there is no testing on this user.
        if len(lo_col) == 0:
            continue;

        #pred_row = [user_idx] * col_num;
        #row_pred = method.predict(pred_row, pred_col);
        row_pred = method.predict_row(user_idx, pred_col);

        # rank the columns (the result is a list of indices).
        srt_col = [k[0] for k in sorted(enumerate(row_pred), key=lambda x: x[1], reverse=True)];

        # trained columns.
        tr_col = set(np.nonzero(tr_data_csr[user_idx, :])[1].tolist());

        # remove the trained columns from the prediction.
        # this contains the predicted indices (excluding training items).
        te_srt_col = [col_pos for col_pos in srt_col if col_pos not in tr_col];

        # max_rank results in an array over ranks 0 .. max_rank-1;
        hit = 0;  # the hit variable keeps track of the number of hits up to the current rank.

        for rk in range(max_rank):
            # if rk is beyond the length of te_srt_col, the metrics at rank rk are unchanged;
            # otherwise, a hit occurs when the item ranked at rk is a held-out (relevant) item.
            if (rk < len(te_srt_col)) and (te_srt_col[rk] in lo_col):
                hit += 1;
            perf_vect_hr[rk]   += float(hit)/len(lo_col);  # hit rate
            perf_vect_prec[rk] += float(hit)/(rk + 1);     # precision
            perf_vect_rec[rk]  += float(hit)/len(lo_col);  # recall

    # normalization over users.
    perf_vect_hr   = perf_vect_hr/data_left.num_row;
    perf_vect_prec = perf_vect_prec/data_left.num_row;
    perf_vect_rec  = perf_vect_rec/data_left.num_row;

    eval_result['hit_rate']  = perf_vect_hr;
    eval_result['precision'] = perf_vect_prec;
    eval_result['recall']    = perf_vect_rec;
    eval_result['RMSE']      = rmse(data_left.data_val, \
                                    method.predict(data_left.data_row, data_left.data_col));
    return eval_result;
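# Illustrative sketch (not part of the evaluation pipeline): the per-rank
# accumulation above, applied to a single toy user. With the ranking [7, 3, 9]
# and held-out items {3, 1}, precision@1..3 is [0, 1/2, 1/3] while recall and
# hit rate are both [0, 1/2, 1/2]. All names are local to this sketch.
def _toy_rank_metric_example(max_rank = 3):
    te_srt_col = [7, 3, 9, 1];   # predicted ranking, training items already removed.
    lo_col = set([3, 1]);        # held-out (relevant) items of this user.
    prec = np.zeros(max_rank);
    rec  = np.zeros(max_rank);
    hr   = np.zeros(max_rank);
    hit = 0;
    for rk in range(max_rank):
        if (rk < len(te_srt_col)) and (te_srt_col[rk] in lo_col):
            hit += 1;
        hr[rk]   = float(hit)/len(lo_col);
        prec[rk] = float(hit)/(rk + 1);
        rec[rk]  = float(hit)/len(lo_col);
    return prec, rec, hr;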
'''
Created on Jan 28, 2014

@author: jiayu.zhou
'''

import numpy as np
from rs.cache.urm import URM

if __name__ == '__main__':

    a = URM.LoadResource(URM.RTYPE_DATA, 'test001')
    if not a:
        print "not found."

    res = [np.random.rand(3, 5), 'test']
    print res

    URM.SaveResource(URM.RTYPE_DATA, 'test001', res)

    a = URM.LoadResource(URM.RTYPE_DATA, 'test001')
    if not a:
        print "not found."

    print res
def experiment_unit_future_program(exp_id, method, tr_data, te_data, top_k):
    '''
    Train (or load a cached) model on tr_data, then score the programs of the
    test period for every device and compute the top-k hit precision.
    '''
    # define mcpl_log style.
    mcpl_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    result_resource_str = 'model_exp' + exp_id + \
                          '_method' + method.unique_str()
    sub_folder = exp_id + '/models/' + method.unique_str()  # use a sub folder to store the experiment resource.

    # check resource for existing model.
    trained_model = URM.LoadResource(URM.RTYPE_RESULT, result_resource_str, sub_folder)
    if not trained_model:
        # train model using the training data.
        # NOTE: this is the most time-consuming part.
        mcpl_log('training models...')
        method.train(tr_data)

        # save resource
        trained_model = [method]
        URM.SaveResource(URM.RTYPE_RESULT, result_resource_str, trained_model, sub_folder)

    # compute performance on test data using the model.
    [method] = trained_model
    mcpl_log('computing evaluation metrics on the test data...')

    # compute the score of the programs in the prediction.
    prog_list = te_data.col_mapping.keys()  # program list

    te_datamat = te_data.get_sparse_matrix().tolil()

    eval_result = []

    # TODO: on a subset of DUID?
    for duid in te_data.row_mapping.keys():
        # iterate over every device.

        # get scores of the programs in the list.
        prog_score = method.get_score(duid, prog_list, te_data.meta)

        # sort the scores (the first dimension is the index and the second is the actual prediction value).
        # NOTE: the first dimension is the order with respect to prog_list.
        srt_list = [(k[0], k[1]) for k in sorted(enumerate(prog_score), key=lambda x: x[1], reverse=True)]
        srt_list = srt_list[:top_k]  # truncate to top k.
        [srt_idx, _] = zip(*srt_list)

        # map from prog_list positions to actual column indices.
        mapped_srt_idx = [te_data.col_mapping[prog_list[idx]] for idx in srt_idx]

        #print te_datamat[te_data.row_mapping[duid], mapped_srt_idx].todense();

        # get the ground truth hits.
        prog_hit = (te_datamat[te_data.row_mapping[duid], mapped_srt_idx].todense().tolist())[0]

        # compute hit precision (now we consider only binary hits).
        eval_result.append(hit_prec(prog_hit))

    return eval_result
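# Usage sketch (hypothetical objects): tr_data/te_data are FeedbackData-style
# structures and method must expose train(), unique_str() and
# get_score(duid, prog_list, meta) as used above. Assuming hit_prec returns a
# scalar per device, the mean over devices summarizes the run:
#   scores = experiment_unit_future_program('future_demo', method, tr_data, te_data, top_k = 10)
#   print 'mean hit precision:', sum(scores) / float(len(scores))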