def experiment_future_program(exp_name, previous_data_files, future_data_file, \
                              min_occ_user, min_occ_prog, method_list, top_k):
    '''
    Experiment entrance for future programs, evaluated by top-k precision.

    Parameters
    ----------
    exp_name: a human-readable experiment name.
    previous_data_files: the historical data files used for training.
    future_data_file: the future data file used for testing.
    min_occ_user: the minimum number of occurrences for a user to be included.
    min_occ_prog: the minimum number of occurrences for a program to be included.
    method_list: a list of recommendation models.
    top_k: the cut-off position for top-k precision.

    Returns
    ----------
    result: a dictionary mapping each method's unique_str() to its result.
    '''
    # define mcpl_log style.
    mcpl_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    exp_id = exp_name + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog)
    mcpl_log('Experiment ID: ' + exp_id)

    reader = DailyWatchTimeReader()
    tr_data = reader.read_file_with_minval(previous_data_files, min_occ_user, min_occ_prog)
    te_data = reader.read_file_with_minval(future_data_file, min_occ_user, min_occ_prog)

    mcpl_log('Normalizing data...')
    tr_data.normalize_row()
    # the test data is left unnormalized, because the evaluation only counts hits.

    result = {}

    for method in method_list:
        # do for each method
        mcpl_log('Method: ' + method.unique_str())
        method_result = experiment_unit_future_program(exp_id, method, \
                                                       tr_data, te_data, top_k)
        result[method.unique_str()] = method_result

    mcpl_log('Experiment Done [' + exp_id + ']')

    return result
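# A minimal sketch of the top-k precision this experiment reports. The helper
# below is hypothetical (the actual evaluation lives in
# experiment_unit_future_program, defined elsewhere in the repo); it only
# illustrates the metric: the fraction of the top-k recommended programs that
# the user actually watched in the future period.
def topk_hit_precision(ranked_progs, watched_progs, top_k):
    # ranked_progs: programs sorted by predicted score, best first.
    # watched_progs: the set of programs the user watched in the test period.
    hits = sum(1 for prog in ranked_progs[:top_k] if prog in watched_progs)
    return float(hits) / top_k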
'''
Created on Feb 13, 2014

@author: jiayu.zhou
'''

import scipy.io as sio
from rs.data.daily_watchtime import DailyWatchTimeReader

if __name__ == '__main__':
    daily_data_file = "/Users/jiayu.zhou/Data/duid-program-watchTime-genre/20131209/part-r-00000"

    reader = DailyWatchTimeReader()
    data = reader.read_file_with_minval(daily_data_file, 1, 1)
    data_mat = data.get_sparse_matrix()

    ### directly save the sparse matrix data structure to Matlab.
    #sio.savemat("/Users/jiayu.zhou/Data/duid-program-watchTime-genre/20131209.mat", {'data': data_mat})

    ### save the sparse matrix as coordinate triplets plus the matrix size.
    sio.savemat("/Users/jiayu.zhou/Data/duid-program-watchTime-genre/20131209_sparse.mat", \
                {'data': data_mat.data, 'i': data_mat.row, 'j': data_mat.col, \
                 'm': data_mat.shape[0], 'n': data_mat.shape[1]})

    print 'Done'
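# A round-trip sketch for the triplet format written above: rebuild the sparse
# matrix in Python from the saved arrays to verify the export. The variable
# layout ('data', 'i', 'j', 'm', 'n') matches the savemat call; this check is
# an illustration and was not part of the original script.
import scipy.io as sio
import scipy.sparse as sp

if __name__ == '__main__':
    d = sio.loadmat("/Users/jiayu.zhou/Data/duid-program-watchTime-genre/20131209_sparse.mat")
    # loadmat returns 2-D arrays; ravel() flattens them back to triplet vectors.
    data_mat = sp.coo_matrix((d['data'].ravel(), (d['i'].ravel(), d['j'].ravel())), \
                             shape=(int(d['m']), int(d['n'])))
    print 'Reconstructed matrix of shape: ', data_mat.shape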
def experiment_leave_k_out(exp_name, daily_data_file, min_occ_user, min_occ_prog, \
                           method_list, leave_k_out, total_iteration, top_n, binary = False):
    '''
    Leave-k-out experiment: withhold k items per user and evaluate top-n recovery.

    Parameters
    ----------
    @param exp_name:        the experiment name (prefix).
    @param daily_data_file: the daily watch-time data file.
    @param min_occ_user:    the minimum number of occurrences for a user to be included.
    @param min_occ_prog:    the minimum number of occurrences for a program to be included.
    @param method_list:     a list of recommendation models.
    @param leave_k_out:     leave k out for each user. The k must be strictly less
                            than min_occ_user.
    @param total_iteration: the number of random splits to average over.
    @param top_n:           the cut-off position of the ranked list.
    @param binary:          if set to True, the binary data is used (non-zero set to 1).

    Returns
    ----------
    @return out: a dictionary mapping each method's unique_str() to a list of
                 per-iteration results.
    '''
    if leave_k_out >= min_occ_user:
        raise ValueError('The k in the leave k out should be strictly less than min_occ_user.')

    # define lko_log style.
    lko_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    # construct exp_id
    if binary:
        exp_id = 'lko_bi_' + exp_name + '_data' + str(hash(daily_data_file)) \
               + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog) \
               + '_k' + str(leave_k_out) + '_toiter' + str(total_iteration)
    else:
        exp_id = 'lko_' + exp_name + '_data' + str(hash(daily_data_file)) \
               + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog) \
               + '_k' + str(leave_k_out) + '_toiter' + str(total_iteration)
    lko_log('Experiment ID: ' + exp_id)

    # load data.
    lko_log('Read data...')
    reader = DailyWatchTimeReader()
    data = reader.read_file_with_minval(daily_data_file, min_occ_user, min_occ_prog)
    lko_log('Data loaded: ' + str(data))

    if binary:
        lko_log('Binarizing data...')
        data.binarize()
    else:
        # normalize
        lko_log('Normalizing data...')
        data.normalize_row()

    result = {}

    for method in method_list:
        # do for each method

        perf_vect = []
        for iteration in range(total_iteration):
            # do for each iteration for each method.

            lko_log('Method: ' + method.unique_str() + ' Iteration: ' + str(iteration))

            # data split of the current iteration.
            split_resource_str = 'exp' + exp_id + '_lvidx_iter' + str(iteration)
            split_dir = exp_id + '/lv_idx'
            leave_k_out_idx = URM.LoadResource(URM.RTYPE_RESULT, split_resource_str, split_dir)
            if not leave_k_out_idx:
                # randomly generate k items from each row/user.
                leave_k_out_idx = ds.leave_k_out(data, leave_k_out)
                URM.SaveResource(URM.RTYPE_RESULT, split_resource_str, leave_k_out_idx, split_dir)

            # split out the k withheld items as a separate data set.
            [data_left, data_tr] = data.leave_k_out(leave_k_out_idx)

            iter_result = experiment_unit_leave_k_out(exp_id, method, \
                                                      data_tr, data_left, iteration, top_n)

            perf_vect.append(iter_result)

        result[method.unique_str()] = perf_vect

    return result
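# A minimal sketch of the per-user sampling that ds.leave_k_out presumably
# performs (the actual rs implementation may differ): withhold k randomly
# chosen observed items from each user for testing.
import random

def sample_leave_k_out_idx(user_items, k):
    # user_items: dict mapping user index -> list of observed item indices.
    # Returns a dict mapping user index -> the k item indices withheld for testing.
    return dict((user, random.sample(items, k)) \
                for user, items in user_items.iteritems())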
def experiment_rand_split(exp_name, daily_data_file, min_occ_user, min_occ_prog, \
                          method_list, training_prec, total_iteration):
    '''
    Random-split experiment: split users into training/testing sets and repeat.

    Parameters
    ----------
    exp_name: a human-readable experiment name.
    daily_data_file: the daily watch-time data file.
    min_occ_user: the minimum number of occurrences for a user to be included.
    min_occ_prog: the minimum number of occurrences for a program to be included.
    method_list: a list of matrix completion models.
    training_prec: the fraction of rows (users) used for training.
    total_iteration: the number of random splits to average over.

    Returns
    ----------
    result: a dictionary mapping each method's unique_str() to a list of
            per-iteration results.
    '''
    # define mcpl_log style.
    mcpl_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    #mcpl_log('Data ID: ' + hash(daily_data_file)) # here we use a regular hash.

    exp_id = exp_name + '_data' + str(hash(daily_data_file)) \
           + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog) \
           + '_trprec' + str(training_prec) + '_toiter' + str(total_iteration)
    mcpl_log('Experiment ID: ' + exp_id)

    # save experiment splitting as resources.
    reader = DailyWatchTimeReader()
    data = reader.read_file_with_minval(daily_data_file, min_occ_user, min_occ_prog)

    # we normalize here before splitting.
    mcpl_log('Normalizing data...')
    data.normalize_row()

    result = {}

    for method in method_list:
        # do for each method

        perf_vect = []
        for iteration in range(total_iteration):
            # do for each iteration for each method.

            mcpl_log('Method: ' + method.unique_str() + ' Iteration: ' + str(iteration))

            # data split of the current iteration.
            split_resource_str = 'exp' + exp_id + '_split_iter' + str(iteration)
            split_dir = exp_id + '/split'
            split = URM.LoadResource(URM.RTYPE_RESULT, split_resource_str, split_dir)
            if not split:
                split = ds.split(data.num_row, training_prec)
                URM.SaveResource(URM.RTYPE_RESULT, split_resource_str, split, split_dir)

            [split_tr, split_te] = split
            data_tr = data.subdata_row(split_tr)
            data_te = data.subdata_row(split_te)

            iter_result = experiment_unit_rand_split(exp_id, method, data_tr, data_te, iteration)

            perf_vect.append(iter_result)

        result[method.unique_str()] = perf_vect

    mcpl_log('Experiment Done [' + exp_id + ']')

    return result
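# A minimal sketch of the row split that ds.split presumably performs (the
# actual rs implementation may differ): shuffle the row (user) indices and cut
# at the training fraction, returning [training indices, testing indices].
import random

def split_row_idx(num_row, training_prec):
    idx = range(num_row)
    random.shuffle(idx)
    cut = int(num_row * training_prec)
    return [idx[:cut], idx[cut:]]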
def experiment_coldstart_map(exp_name, daily_data_file, \
                             min_occ_user, min_occ_prog, num_user, num_prog, \
                             method_list, blind_k_out, total_iteration, max_rank, binary = False):
    '''
    Cold-start experiment: blind out k programs entirely and evaluate MAP.

    Parameters
    ----------
    @param exp_name:        the experiment name (prefix).
    @param daily_data_file: a single file or a list of files.
    @param min_occ_user:    cold-start user criterion.
    @param min_occ_prog:    cold-start program criterion.
    @param num_user:        the number of users selected in the experiment.
    @param num_prog:        the number of programs selected in the experiment.
    @param method_list:     a list of recommendation models.
    @param blind_k_out:     the number of programs to blind out. The k must be
                            strictly less than min_occ_user.
    @param total_iteration: the number of random splits to average over.
    @param max_rank:        the maximum rank considered in the MAP evaluation.
    @param binary:          if set to True, the binary data is used (non-zero set to 1).

    Returns
    ----------
    @return out: a dictionary mapping each method's unique_str() to a list of
                 per-iteration results.
    '''
    print 'Blind k out: k = ', str(blind_k_out)
    print 'Min_occ_user: ', str(min_occ_user)
    print 'Min_occ_prog: ', str(min_occ_prog)

    if blind_k_out >= min_occ_user:
        raise ValueError('The k in the leave k out [' + str(blind_k_out) \
                       + '] should be strictly less than min_occ_user [' + str(min_occ_user) + '].')

    # define lko_log style.
    lko_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    if isinstance(daily_data_file, list):
        hash_file_str = str(hash(tuple(daily_data_file)))
    else:
        hash_file_str = str(hash(daily_data_file))

    # construct exp_id
    if binary:
        exp_id = 'cst_bi_' + exp_name + '_data' + hash_file_str \
               + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog) \
               + '_nu' + str(num_user) + '_np' + str(num_prog) \
               + '_k' + str(blind_k_out) + '_toiter' + str(total_iteration)
    else:
        exp_id = 'cst_' + exp_name + '_data' + hash_file_str \
               + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog) \
               + '_nu' + str(num_user) + '_np' + str(num_prog) \
               + '_k' + str(blind_k_out) + '_toiter' + str(total_iteration)
    lko_log('Experiment ID: ' + exp_id)

    # load data.
    lko_log('Read data...')
    reader = DailyWatchTimeReader()
    data = reader.read_file_with_minval(daily_data_file, min_occ_user, min_occ_prog, num_user, num_prog)
    lko_log('Data loaded: ' + str(data))

    if binary:
        lko_log('Binarizing data...')
        data.binarize()
    else:
        # normalize
        lko_log('Normalizing data...')
        data.normalize_row()

    result = {}

    for method in method_list:
        # do for each method

        perf_vect = []
        for iteration in range(total_iteration):
            # do for each iteration for each method.

            lko_log('Method: ' + method.unique_str() + ' Iteration: ' + str(iteration))

            # data split of the current iteration.
            split_resource_str = 'exp' + exp_id + '_blind_idx_iter' + str(iteration)
            split_dir = exp_id + '/blind_idx'
            blind_out_idx = URM.LoadResource(URM.RTYPE_RESULT, split_resource_str, split_dir)
            if not blind_out_idx:
                # randomly generate k programs (columns) to blind out.
                blind_out_idx = ds.sample_num(data.num_col, blind_k_out)
                URM.SaveResource(URM.RTYPE_RESULT, split_resource_str, blind_out_idx, split_dir)

            lko_log('Blind index done.')

            # split out the blinded programs as a separate data set.
            [data_tr, data_left] = data.blind_k_out(blind_out_idx)

            lko_log('Start index')

            iter_result = experiment_unit_leave_k_out_map(exp_id, method, \
                                                          data_tr, data_left, iteration, max_rank)

            perf_vect.append(iter_result)

        result[method.unique_str()] = perf_vect

    return result
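# A minimal sketch of the truncated average precision underlying the MAP
# evaluation. The helper is hypothetical (the actual metric lives in
# experiment_unit_leave_k_out_map, defined elsewhere in the repo), and the
# normalization by min(#relevant, max_rank) is an assumed convention.
def average_precision(ranked_progs, relevant_progs, max_rank):
    # Accumulate precision at every rank that scores a hit, then normalize
    # by the best achievable number of hits within max_rank.
    hits = 0
    score = 0.0
    for rank, prog in enumerate(ranked_progs[:max_rank], start=1):
        if prog in relevant_progs:
            hits += 1
            score += float(hits) / rank
    denom = min(len(relevant_progs), max_rank)
    return score / denom if denom > 0 else 0.0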