def readFile(self, filename):
    '''
    Reads an aggregated file and returns:
    1. duid mapping (from duid to an integer, the row number in the sparse matrix);
    2. pid mapping (from pid to an integer, the column number in the sparse matrix);
    3. core sparse matrix content (row, col, data triplets).
    Note: this method reads data for ALL users.
    '''
    mapping_duid = {}  # store duid->row# mapping
    mapping_pid = {}   # store pid->col# mapping

    row = []
    col = []
    data = []

    lineNum = 0
    with open(filename, 'rb') as csvfile:
        logreader = csv.reader(csvfile, delimiter=self.fieldDelimiter, quotechar='|')
        for logrow in logreader:
            log_duid = logrow[self.fieldMapping['duid']]
            log_pid = logrow[self.fieldMapping['pid']]
            log_watchtime = int(logrow[self.fieldMapping['watchtime']])

            if log_duid not in mapping_duid:
                mapping_duid[log_duid] = len(mapping_duid)
            row.append(mapping_duid[log_duid])

            if log_pid not in mapping_pid:
                mapping_pid[log_pid] = len(mapping_pid)
            col.append(mapping_pid[log_pid])

            data.append(log_watchtime)

            lineNum += 1
            if self.verbose and (lineNum % self.display == 0):
                Logger.Log(str(lineNum) + ' lines read.')

    if self.verbose:
        Logger.Log('Done reading agg log file. ' + str(len(data)) + ' elements read' +
                   ' (' + str(len(mapping_duid)) + ' row/user, ' + str(len(mapping_pid)) + ' col/program).')

    result = FeedbackData(row, col, data, len(mapping_duid), len(mapping_pid),
                          mapping_duid, mapping_pid, [])
    return result
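# --- Usage sketch (illustrative, not part of the original module) ---
# The (row, col, data) triplets produced by readFile map directly onto a SciPy
# COO sparse matrix; FeedbackData exposes them as data_row/data_col/data_val
# (and its get_sparse_matrix() builds the matrix internally). The reader
# instance and file name below are hypothetical.
import scipy.sparse

fb = reader.readFile('agg_log.tsv')  # hypothetical reader instance and file.
mat = scipy.sparse.coo_matrix((fb.data_val, (fb.data_row, fb.data_col)),
                              shape=(fb.num_row, fb.num_col))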
def read_file_info(self, filename):
    '''
    Reads an aggregated file (or a list of files) and collects occurrence
    summaries for programs and devices. This information can be used to
    filter out programs/devices later.
    '''
    occur_duid = {}
    occur_pid = {}

    # turn a single file into a file list.
    if not isinstance(filename, list):
        filename_arr = [filename]
    else:
        filename_arr = filename

    lineNum = 0
    for filename in filename_arr:
        with open(filename, 'rb') as csvfile:
            logreader = csv.reader(csvfile, delimiter=self.fieldDelimiter, quotechar='|')
            for logrow in logreader:
                log_duid = logrow[self.fieldMapping['duid']]
                log_pid = logrow[self.fieldMapping['pid']]

                if log_duid not in occur_duid:
                    occur_duid[log_duid] = 1
                else:
                    occur_duid[log_duid] += 1

                if log_pid not in occur_pid:
                    occur_pid[log_pid] = 1
                else:
                    occur_pid[log_pid] += 1

                lineNum += 1
                if self.verbose and (lineNum % self.display == 0):
                    Logger.Log(str(lineNum) + ' lines read.')

    # count occurrences into bins.
    cnt_duid = {}  # cnt_duid[occurrence] = number of duids with that occurrence.
    for val in occur_duid.values():
        if val not in cnt_duid:
            cnt_duid[val] = 1
        else:
            cnt_duid[val] += 1

    cnt_pid = {}  # cnt_pid[occurrence] = number of pids with that occurrence.
    for val in occur_pid.values():
        if val not in cnt_pid:
            cnt_pid[val] = 1
        else:
            cnt_pid[val] += 1

    return [occur_duid, occur_pid, cnt_duid, cnt_pid]
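# --- Usage sketch (illustrative, not part of the original module) ---
# The occurrence dictionaries returned above are what read_file_with_minval
# uses to build its filtering lists; a minimal version of that filtering step,
# with hypothetical thresholds:
[occur_duid, occur_pid, cnt_duid, cnt_pid] = reader.read_file_info('agg_log.tsv')

min_duid, min_pid = 5, 10  # hypothetical minimum-occurrence thresholds.
duidlist = [duid for duid, cnt in occur_duid.iteritems() if cnt > min_duid]
pidlist = [pid for pid, cnt in occur_pid.iteritems() if cnt > min_pid]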
def read_file_with_id_list(self, filename, duidlist, pidlist):
    '''
    Reads an aggregated file, keeping only the records whose duid and pid
    appear in the specified lists.
    '''
    mapping_duid = {}  # store duid->row# mapping
    mapping_pid = {}   # store pid->col# mapping

    row = []
    col = []
    data = []

    lineNum = 0

    # turn a single file into a file list.
    if not isinstance(filename, list):
        filename_arr = [filename]
    else:
        filename_arr = filename

    for filename in filename_arr:
        with open(filename, 'rb') as csvfile:
            logreader = csv.reader(csvfile, delimiter=self.fieldDelimiter, quotechar='|')
            for logrow in logreader:
                log_duid = logrow[self.fieldMapping['duid']]
                log_pid = logrow[self.fieldMapping['pid']]

                # keep the record only if both duid and pid are in the lists.
                if (log_duid in duidlist) and (log_pid in pidlist):
                    log_watchtime = int(logrow[self.fieldMapping['watchtime']])

                    if log_duid not in mapping_duid:
                        mapping_duid[log_duid] = len(mapping_duid)
                    row.append(mapping_duid[log_duid])

                    if log_pid not in mapping_pid:
                        mapping_pid[log_pid] = len(mapping_pid)
                    col.append(mapping_pid[log_pid])

                    data.append(log_watchtime)

                lineNum += 1
                if self.verbose and (lineNum % self.display == 0):
                    Logger.Log(str(lineNum) + ' lines read.')

    if self.verbose:
        Logger.Log('Done reading agg log file. ' + str(len(data)) + ' elements read' +
                   ' (' + str(len(mapping_duid)) + ' row/user, ' + str(len(mapping_pid)) + ' col/program).')

    return [mapping_duid, mapping_pid, row, col, data]
def experiment_future_program(exp_name, previous_data_files, future_data_file,
                              min_occ_user, min_occ_prog, method_list, top_k):
    '''
    Experiment entry point for future programs, evaluated by top-k precision.

    Parameters
    ----------
    exp_name: a human-readable experiment name.
    previous_data_files: the data files used for training.
    future_data_file: the data file used for testing.
    min_occ_user: minimum occurrence for a user to be included.
    min_occ_prog: minimum occurrence for a program to be included.
    method_list: a list of recommendation models.
    top_k: the cut-off rank for precision.

    Returns
    ----------
    result: a dictionary mapping each method's unique string to its evaluation result.
    '''
    # define mcpl_log style.
    mcpl_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    exp_id = exp_name + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog)
    mcpl_log('Experiment ID: ' + exp_id)

    reader = DailyWatchTimeReader()
    tr_data = reader.read_file_with_minval(previous_data_files, min_occ_user, min_occ_prog)
    te_data = reader.read_file_with_minval(future_data_file, min_occ_user, min_occ_prog)

    mcpl_log('Normalizing training data...')
    tr_data.normalize_row()  # the test data is not normalized, because we evaluate hits.

    result = {}
    for method in method_list:
        # do for each method
        mcpl_log('Method: ' + method.unique_str())
        method_result = experiment_unit_future_program(exp_id, method, tr_data, te_data, top_k)
        result[method.unique_str()] = method_result

    mcpl_log('Experiment Done [' + exp_id + ']')
    return result
def __init__(self, cache_folder=None, use_cache=None):
    '''
    Create a cache manager with the specified cache location.
    '''
    # whether the cache system is used.
    # if turned off, then save/load/check resource will do nothing.
    config_use_cache = ConfigManager.GetBoolean(CFG_SEC_UTILS, CFG_UTILS_USECACHE)
    self.use_cache = config_use_cache if use_cache is None else use_cache
    if not self.use_cache:
        Logger.Log("URM is turned off. All cached resources are not available.",
                   Logger.MSG_CATEGORY_SYSTEM)

    # set up the directory for the cache.
    config_cache_folder = ConfigManager.Get(CFG_SEC_UTILS, CFG_UTILS_CACHEFOLDER)
    self.cache_folder = config_cache_folder if cache_folder is None else cache_folder

    # create the directory if it does not exist.
    self.cacheLocation = self.cache_folder
    if not os.path.exists(self.cacheLocation):
        os.makedirs(self.cacheLocation)
def experiment_unit_rand_split(exp_id, method, tr_data, te_data, iteration):
    '''
    One iteration of training and testing. The experiment ID, method, and
    iteration number together identify the cached model resource.
    '''
    # define mcpl_log style.
    mcpl_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    result_resource_str = 'exp' + exp_id + \
                          '_method' + method.unique_str() + \
                          '_iter' + str(iteration)
    # use a sub folder to store the experiment resource.
    sub_folder = exp_id + '/models/' + method.unique_str()

    # check resource for existing model.
    trained_model = URM.LoadResource(URM.RTYPE_RESULT, result_resource_str, sub_folder)
    if not trained_model:
        # train model using the training data.
        # NOTE: this is the most time-consuming part.
        mcpl_log('training models...')
        method.train(tr_data)

        # save resource
        trained_model = [method]
        URM.SaveResource(URM.RTYPE_RESULT, result_resource_str, trained_model, sub_folder)

    # compute performance on test data using the model.
    [method] = trained_model
    mcpl_log('computing evaluation metrics on the test data...')
    eval_result = rmse(te_data.data_val, method.predict(te_data.data_row, te_data.data_col))
    return eval_result
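# --- Illustrative sketch (not part of the original module) ---
# A minimal version of the rmse helper used above, assuming it is the standard
# root-mean-square error between the observed values and the predictions:
import numpy as np

def rmse_sketch(true_val, pred_val):
    # square root of the mean squared difference between observation and prediction.
    diff = np.asarray(true_val, dtype=float) - np.asarray(pred_val, dtype=float)
    return np.sqrt(np.mean(diff ** 2))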
def experiment_rand_split(exp_name, daily_data_file, min_occ_user, min_occ_prog,
                          method_list, training_prec, total_iteration):
    '''
    Experiment entry point for random training/testing splits.

    Parameters
    ----------
    exp_name: a human-readable experiment name.
    daily_data_file: the data file to be read.
    min_occ_user: minimum occurrence for a user to be included.
    min_occ_prog: minimum occurrence for a program to be included.
    method_list: a list of matrix completion models.
    training_prec: the percentage of rows used for training in each split.
    total_iteration: the number of random splits.

    Returns
    ----------
    result: a dictionary mapping each method's unique string to a list of
    per-iteration evaluation results.
    '''
    # define mcpl_log style.
    mcpl_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    # here we use a regular hash to identify the data file.
    exp_id = exp_name + '_data' + str(hash(daily_data_file)) + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog) \
           + '_trprec' + str(training_prec) + '_toiter' + str(total_iteration)
    mcpl_log('Experiment ID: ' + exp_id)

    # save experiment splitting as resources.
    reader = UtilityDataReader()
    data = reader.read_file_with_minval(daily_data_file, min_occ_user, min_occ_prog)

    # we normalize here before splitting.
    mcpl_log('Normalizing data...')
    data.normalize_row()

    result = {}
    for method in method_list:
        # do for each method
        perf_vect = []
        for iteration in range(total_iteration):
            # do for each iteration for each method.
            mcpl_log('Method: ' + method.unique_str() + ' Iteration: ' + str(iteration))

            # data split of the current iteration.
            split_resource_str = 'exp' + exp_id + '_split_iter' + str(iteration)
            split_dir = exp_id + '/split'
            split = URM.LoadResource(URM.RTYPE_RESULT, split_resource_str, split_dir)
            if not split:
                split = ds.split(data.num_row, training_prec)
                URM.SaveResource(URM.RTYPE_RESULT, split_resource_str, split, split_dir)

            [split_tr, split_te] = split
            data_tr = data.subdata_row(split_tr)
            data_te = data.subdata_row(split_te)

            iter_result = experiment_unit_rand_split(exp_id, method, data_tr, data_te, iteration)
            perf_vect.append(iter_result)

        result[method.unique_str()] = perf_vect

    mcpl_log('Experiment Done [' + exp_id + ']')
    return result
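# --- Illustrative sketch (not part of the original module) ---
# A minimal row split in the spirit of ds.split(num_row, training_prec): shuffle
# the row indices and cut them at the training percentage. The actual ds.split
# implementation may differ.
import random

def split_sketch(num_row, training_prec):
    idx = list(range(num_row))     # all row indices.
    random.shuffle(idx)            # random permutation.
    cut = int(num_row * training_prec)
    return [idx[:cut], idx[cut:]]  # [training rows, testing rows].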
'''

import numpy as np
import scipy.sparse
import scipy.linalg

from rs.utils.log import Logger
from rs.algorithms.recommendation.generic_recalg import CFAlg
from rs.algorithms.optimization.prox import projfun_probability_simplex, \
    proximal, proj_nonneg
from rs.algorithms.optimization.sparsa import Opt_SpaRSA

# an encapsulated logger.
log = lambda message: Logger.Log(HierLat.ALG_NAME + ':' + message, Logger.MSG_CATEGORY_ALGO)


class HierLat(CFAlg):
    '''
    The HierLat algorithm, with several cold-start strategies (see the CS_*
    constants below).
    '''
    ALG_NAME = 'HierLat'

    CS_EQUAL_PROB = 'cs_equal_prob'  # equal probability.
    CS_ALL_AVG = 'cs_all_avg'        # average over all, then normalize.
    CS_OBS_AVG = 'cs_obs_avg'        # average over observed, then normalize.
Created on Jan 31, 2014

@author: Shiyu C. ([email protected])
'''

import numpy as np
import scipy.sparse
import scipy.linalg

from rs.algorithms.recommendation.generic_recalg import CFAlg
from rs.utils.log import Logger

# an encapsulated logger.
log = lambda message: Logger.Log(CF_ONMTF.ALG_NAME + ':' + message, Logger.MSG_CATEGORY_ALGO)


class CF_ONMTF(CFAlg):
    '''
    The CF_ONMTF algorithm.
    '''
    ALG_NAME = 'CF_ONMTF'

    ##################################################################################################

    def __init__(self, latent_factor=20, lamb=1e-3, stop_delta=1e-4, maxiter=1e3, verbose=False):
        '''
        Constructor
@author: Shiyu C. ([email protected])

Modified on Feb 5, 2014 by Jiayu Zhou, added Rec_LMaFit.
'''

import numpy as np
import scipy.sparse
import scipy.linalg

from rs.algorithms.recommendation.generic_recalg import CFAlg
from rs.algorithms.recommendation.recommender_wrapper import Recommender
from rs.utils.log import Logger

# encapsulated loggers.
mcpl_log = lambda message: Logger.Log(LMaFit.ALG_NAME + ':' + message, Logger.MSG_CATEGORY_ALGO)
rec_log = lambda message: Logger.Log(Rec_LMaFit.ALG_NAME + ':' + message, Logger.MSG_CATEGORY_ALGO)


class Rec_LMaFit(Recommender):
    '''
    A recommender wrapper around the LMaFit algorithm (with ensemble capability).
    '''
    ALG_NAME = 'Rec_LMaFit'

    def __init__(self, latent_factors=[1, 2], lamb=[1e-3, 1e-1], stop_delta=1e-4, maxiter=1e3, verbose=False):
        '''
        Constructor.
        '''
        self.models = []
        for model_idx in range(len(latent_factors)):
'''
Created on Jan 27, 2014

@author: jiayu.zhou
'''

from rs.utils.log import Logger

if __name__ == '__main__':
    logger = Logger('./logs')
    logger._log('Hello world', 'TEST', 0)
    logger._log('The second line', 'TEST', 0)
    logger._log('Something else.', 'TEST', 0)
def experiment_leave_k_out(exp_name, daily_data_file, min_occ_user, min_occ_prog,
                           method_list, leave_k_out, total_iteration, top_n, binary=False):
    '''
    Parameters
    ----------
    @param exp_name: the experiment name (prefix).
    @param daily_data_file: the data file to be read.
    @param min_occ_user: minimum occurrence for a user to be included.
    @param min_occ_prog: minimum occurrence for a program to be included.
    @param method_list: a list of recommendation models.
    @param leave_k_out: leave k items out for each user. The k must be strictly
        less than min_occ_user.
    @param total_iteration: the number of iterations.
    @param top_n: the cut-off rank for precision/recall.
    @param binary: if set to True, the binary data is used (non-zeros set to 1).

    Returns
    ----------
    @return a dictionary mapping each method's unique string to a list of
        per-iteration evaluation results.
    '''
    if leave_k_out >= min_occ_user:
        raise ValueError('The k in the leave-k-out should be strictly less than min_occ_user.')

    # define lko_log style.
    lko_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    # construct exp_id.
    if binary:
        exp_id = 'lko_bi_' + exp_name + '_data' + str(hash(daily_data_file)) + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog) \
               + '_k' + str(leave_k_out) + '_toiter' + str(total_iteration)
    else:
        exp_id = 'lko_' + exp_name + '_data' + str(hash(daily_data_file)) + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog) \
               + '_k' + str(leave_k_out) + '_toiter' + str(total_iteration)
    lko_log('Experiment ID: ' + exp_id)

    # load data.
    lko_log('Read data...')
    reader = DailyWatchTimeReader()
    data = reader.read_file_with_minval(daily_data_file, min_occ_user, min_occ_prog)
    lko_log('Data loaded: ' + str(data))

    if binary:
        lko_log('Binarizing data...')
        data.binarize()
    else:
        lko_log('Normalizing data...')
        data.normalize_row()

    result = {}
    for method in method_list:
        # do for each method
        perf_vect = []
        for iteration in range(total_iteration):
            # do for each iteration for each method.
            lko_log('Method: ' + method.unique_str() + ' Iteration: ' + str(iteration))

            # data split of the current iteration.
            split_resource_str = 'exp' + exp_id + '_lvidx_iter' + str(iteration)
            split_dir = exp_id + '/lv_idx'
            leave_k_out_idx = URM.LoadResource(URM.RTYPE_RESULT, split_resource_str, split_dir)
            if not leave_k_out_idx:
                # randomly pick k items from each row/user.
                leave_k_out_idx = ds.leave_k_out(data, leave_k_out)
                URM.SaveResource(URM.RTYPE_RESULT, split_resource_str, leave_k_out_idx, split_dir)

            # split off the k held-out items into a separate data set.
            [data_left, data_tr] = data.leave_k_out(leave_k_out_idx)

            iter_result = experiment_unit_leave_k_out(exp_id, method,
                                                      data_tr, data_left, iteration, top_n)
            perf_vect.append(iter_result)

        result[method.unique_str()] = perf_vect

    return result
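# --- Illustrative sketch (not part of the original module) ---
# A minimal leave-k-out index generator in the spirit of ds.leave_k_out(data, k):
# for each user/row, randomly pick k of its observed entry positions to hold out.
# The actual ds.leave_k_out implementation (and the exact index format it
# returns) may differ; this assumes every kept user has more than k observations,
# which the min_occ_user filter above guarantees.
import random
from collections import defaultdict

def leave_k_out_sketch(data, k):
    # group the positions of the observed entries by row.
    row_entries = defaultdict(list)
    for pos, row_idx in enumerate(data.data_row):
        row_entries[row_idx].append(pos)
    # sample k held-out positions per row.
    held_out = []
    for row_idx, positions in row_entries.iteritems():
        held_out.extend(random.sample(positions, k))
    return set(held_out)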
def read_file_with_id_list(self, filename, duidlist, pidlist, ignore_prog_without_genre=True):
    '''
    Reads an aggregated file, keeping only the records whose duid and pid
    appear in the specified lists. Also collects program-genre mappings.
    '''
    mapping_duid = {}  # store duid->row# mapping
    mapping_pid = {}   # store pid->col# mapping

    row = []
    col = []
    data = []

    pggr_pg = []
    pggr_gr = []
    visited_program_list = set([])

    lineNum = 0

    # turn a single file into a file list.
    if not isinstance(filename, list):
        filename_arr = [filename]
    else:
        filename_arr = filename

    for filename in filename_arr:
        with open(filename, 'rb') as csvfile:
            logreader = csv.reader(csvfile, delimiter=self.fieldDelimiter, quotechar='|')
            for logrow in logreader:
                log_duid = logrow[self.fieldMapping['duid']]
                log_pid = logrow[self.fieldMapping['pid']]
                log_pg_gr = logrow[self.fieldMapping['genre']].strip()

                if not log_pg_gr:
                    Logger.Log('Empty genre information for program ' + log_pid, Logger.MSG_CATEGORY_DATA)
                    if ignore_prog_without_genre:
                        # ignore records whose program has no genre information.
                        continue

                # keep the record only if both duid and pid are in the lists.
                if (log_duid in duidlist) and (log_pid in pidlist):
                    log_watchtime = int(logrow[self.fieldMapping['watchtime']])

                    if log_duid not in mapping_duid:
                        mapping_duid[log_duid] = len(mapping_duid)
                    row.append(mapping_duid[log_duid])

                    if log_pid not in mapping_pid:
                        mapping_pid[log_pid] = len(mapping_pid)
                    col.append(mapping_pid[log_pid])

                    # store program-genre mappings, for programs not visited before.
                    if not mapping_pid[log_pid] in visited_program_list:
                        for pg_gr in log_pg_gr.split(','):
                            if not pg_gr:
                                continue
                            pggr_pg.append(mapping_pid[log_pid])
                            pggr_gr.append(int(pg_gr))
                        visited_program_list.add(mapping_pid[log_pid])

                    data.append(log_watchtime)

                lineNum += 1
                if self.verbose and (lineNum % self.display == 0):
                    Logger.Log(str(lineNum) + ' lines read.')

    if self.verbose:
        Logger.Log('Done reading agg log file. ' + str(len(data)) + ' elements read' +
                   ' (' + str(len(mapping_duid)) + ' row/user, ' + str(len(mapping_pid)) + ' col/program).')

    return [mapping_duid, mapping_pid, row, col, data, pggr_pg, pggr_gr]
def experiment_unit_future_program(exp_id, method, tr_data, te_data, top_k):
    '''
    One unit of the future-program experiment: train (or load) a model and
    compute the top-k hit precision on the test data.
    '''
    # define mcpl_log style.
    mcpl_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    result_resource_str = 'model_exp' + exp_id + \
                          '_method' + method.unique_str()
    # use a sub folder to store the experiment resource.
    sub_folder = exp_id + '/models/' + method.unique_str()

    # check resource for existing model.
    trained_model = URM.LoadResource(URM.RTYPE_RESULT, result_resource_str, sub_folder)
    if not trained_model:
        # train model using the training data.
        # NOTE: this is the most time-consuming part.
        mcpl_log('training models...')
        method.train(tr_data)

        # save resource
        trained_model = [method]
        URM.SaveResource(URM.RTYPE_RESULT, result_resource_str, trained_model, sub_folder)

    # compute performance on test data using the model.
    [method] = trained_model
    mcpl_log('computing evaluation metrics on the test data...')

    # compute the score of the programs in the prediction.
    prog_list = te_data.col_mapping.keys()  # program list.
    te_datamat = te_data.get_sparse_matrix().tolil()

    eval_result = []
    # TODO: on a subset of DUID?
    for duid in te_data.row_mapping.keys():
        # get scores of the programs in the list.
        prog_score = method.get_score(duid, prog_list, te_data.meta)

        # sort by score (the first element is the index and the second is the prediction value).
        # NOTE: the index is the position with respect to prog_list.
        srt_list = [(k[0], k[1]) for k in sorted(enumerate(prog_score), key=lambda x: x[1], reverse=True)]
        srt_list = srt_list[:top_k]  # truncate to top k.
        [srt_idx, _] = zip(*srt_list)

        # map from prog_list positions to actual column indices.
        mapped_srt_idx = [te_data.col_mapping[prog_list[idx]] for idx in srt_idx]

        # get the ground truth hits.
        prog_hit = (te_datamat[te_data.row_mapping[duid], mapped_srt_idx].todense().tolist())[0]

        # compute hit precision (for now we consider only binary hits).
        eval_result.append(hit_prec(prog_hit))

    return eval_result
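# --- Illustrative sketch (not part of the original module) ---
# A minimal version of the hit_prec helper used above, assuming it computes the
# fraction of the top-k recommended programs that the user actually watched
# (non-zero ground-truth entries). The actual helper may differ.
def hit_prec_sketch(prog_hit):
    if not prog_hit:
        return 0.0
    hits = sum(1 for v in prog_hit if v > 0)  # count non-zero (watched) entries.
    return float(hits) / len(prog_hit)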
def log(msg):
    '''
    Logging helper for the cache message category.
    '''
    Logger.Log(msg, Logger.MSG_CATEGORY_CACHE)
'''
Created on Feb 17, 2014

@author: Shiyu C. ([email protected])
'''

import numpy as np
import scipy.sparse

from rs.algorithms.recommendation.generic_recalg import CFAlg
from rs.utils.log import Logger

# an encapsulated logger.
log = lambda message: Logger.Log(item_item_sim.ALG_NAME + ':' + message,
                                 Logger.MSG_CATEGORY_ALGO)


class item_item_sim(CFAlg):

    ALG_NAME = 'item_item_sim'

    def __init__(self, N=3):
        '''
        Constructor
        '''
        # initialize parameters.
        self.N = N

        log('Item-based similarity algorithm created: neighborhood size ' + str(self.N))
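# --- Illustrative sketch (not part of the original module) ---
# A minimal item-based prediction in the spirit of item_item_sim: score an item
# for a user by a cosine-similarity-weighted average over the N most similar
# items the user has observed. This is a generic sketch of the technique, not
# the class's actual train/predict code.
import numpy as np

def item_item_predict_sketch(R, user_idx, item_idx, N=3):
    # R: dense (num_user x num_item) rating matrix, 0 = unobserved.
    R = np.asarray(R, dtype=float)
    target = R[:, item_idx]
    sims = []
    for j in range(R.shape[1]):
        if j == item_idx:
            continue
        denom = np.linalg.norm(target) * np.linalg.norm(R[:, j])
        sim = np.dot(target, R[:, j]) / denom if denom > 0 else 0.0
        sims.append((j, sim))
    # keep the N most similar items that the user has observed.
    sims = [(j, s) for j, s in sorted(sims, key=lambda x: x[1], reverse=True)
            if R[user_idx, j] != 0][:N]
    wsum = sum(s for _, s in sims)
    if wsum == 0:
        return 0.0
    return sum(s * R[user_idx, j] for j, s in sims) / wsum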
'''
Created on Feb 12, 2014

@author: jiayu.zhou
'''

import numpy as np

from rs.algorithms.recommendation.generic_recalg import CFAlg
from rs.utils.log import Logger

# an encapsulated logger.
mcpl_log = lambda message: Logger.Log(SVDPlusPlus.ALG_NAME + ':' + message, Logger.MSG_CATEGORY_ALGO)


class SVDPlusPlus(CFAlg):
    '''
    SVD++ (SVD Plus Plus).
    '''
    ALG_NAME = 'SVD++'

    def __init__(self, params):
        '''
        Constructor
        '''
        # default hyper-parameters.
        self.regularization = 0.015
        self.learn_rate = 0.001
        self.bais_learn_rate = 0.7  # bias learning rate.
        self.bais_reg = 0.33        # bias regularization.
def experiment_coldstart_map(exp_name, daily_data_file,
                             min_occ_user, min_occ_prog, num_user, num_prog,
                             method_list, blind_k_out, total_iteration, max_rank, binary=False):
    '''
    Parameters
    ----------
    @param exp_name: the experiment name (prefix).
    @param daily_data_file: a list of files.
    @param min_occ_user: cold-start user criterion.
    @param min_occ_prog: cold-start program criterion.
    @param num_user: the number of users selected in the experiment.
    @param num_prog: the number of programs selected in the experiment.
    @param method_list: a list of recommendation models.
    @param blind_k_out: the number of programs (columns) to blind out. The k must
        be strictly less than min_occ_user.
    @param binary: if set to True, the binary data is used (non-zeros set to 1).

    Returns
    ----------
    @return a dictionary mapping each method's unique string to a list of
        per-iteration evaluation results.
    '''
    print 'Blind k out: k = ', str(blind_k_out)
    print 'Min_occ_user: ', str(min_occ_user)
    print 'Min_occ_prog: ', str(min_occ_prog)

    if blind_k_out >= min_occ_user:
        raise ValueError('The k in the leave k out [' + str(blind_k_out) +
                         '] should be strictly less than min_occ_user [' + str(min_occ_user) + '].')

    # define lko_log style.
    lko_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    if isinstance(daily_data_file, list):
        hash_file_str = str(hash(tuple(daily_data_file)))
    else:
        hash_file_str = str(hash(daily_data_file))

    # construct exp_id.
    if binary:
        exp_id = 'cst_bi_' + exp_name + '_data' + hash_file_str \
               + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog) \
               + '_nu' + str(num_user) + '_np' + str(num_prog) \
               + '_k' + str(blind_k_out) + '_toiter' + str(total_iteration)
    else:
        exp_id = 'cst_' + exp_name + '_data' + hash_file_str \
               + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog) \
               + '_nu' + str(num_user) + '_np' + str(num_prog) \
               + '_k' + str(blind_k_out) + '_toiter' + str(total_iteration)
    lko_log('Experiment ID: ' + exp_id)

    # load data.
    lko_log('Read data...')
    reader = DailyWatchTimeReader()
    data = reader.read_file_with_minval(daily_data_file, min_occ_user, min_occ_prog, num_user, num_prog)
    lko_log('Data loaded: ' + str(data))

    if binary:
        lko_log('Binarizing data...')
        data.binarize()
    else:
        lko_log('Normalizing data...')
        data.normalize_row()

    result = {}
    for method in method_list:
        # do for each method
        perf_vect = []
        for iteration in range(total_iteration):
            # do for each iteration for each method.
            lko_log('Method: ' + method.unique_str() + ' Iteration: ' + str(iteration))

            # data split of the current iteration.
            split_resource_str = 'exp' + exp_id + '_blind_idx_iter' + str(iteration)
            split_dir = exp_id + '/blind_idx'
            blind_out_idx = URM.LoadResource(URM.RTYPE_RESULT, split_resource_str, split_dir)
            if not blind_out_idx:
                # randomly generate k column indices to blind out.
                blind_out_idx = ds.sample_num(data.num_col, blind_k_out)
                URM.SaveResource(URM.RTYPE_RESULT, split_resource_str, blind_out_idx, split_dir)

            lko_log('Blind index done.')

            # split off the blinded-out columns into a separate data set.
            [data_tr, data_left] = data.blind_k_out(blind_out_idx)

            lko_log('Start index')
            iter_result = experiment_unit_leave_k_out_map(exp_id, method,
                                                          data_tr, data_left, iteration, max_rank)
            perf_vect.append(iter_result)

        result[method.unique_str()] = perf_vect

    return result
def experiment_unit_leave_k_out_map(exp_id, method, data_tr, data_left, iteration, max_rank):
    '''
    This method works on the column/row indices of data_tr and data_left, which
    must be completely aligned both row-wise and column-wise.
    '''
    # define lko_log style.
    lko_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    result_resource_str = 'exp' + exp_id + \
                          '_method' + method.unique_str() + \
                          '_iter' + str(iteration)
    # use a sub folder to store the experiment resource.
    sub_folder = exp_id + '/models/' + method.unique_str()

    # check resource for existing model.
    trained_model = URM.LoadResource(URM.RTYPE_RESULT, result_resource_str, sub_folder)
    if not trained_model:
        # train model using the training data.
        # NOTE: this is the most time-consuming part.
        lko_log('training models...')
        method.train(data_tr)

        # save resource
        trained_model = [method]
        URM.SaveResource(URM.RTYPE_RESULT, result_resource_str, trained_model, sub_folder)

    # compute performance on test data using the model.
    [method] = trained_model
    lko_log('computing evaluation metrics on the test data...')

    eval_result = {}

    # ranked list.
    col_num = data_left.num_col
    pred_col = range(col_num)

    tr_data_csr = data_tr.get_sparse_matrix().tocsr()
    lo_data_csr = data_left.get_sparse_matrix().tocsr()

    perf_vect_prec = np.zeros(max_rank)  # precision at each rank.
    perf_vect_rec = np.zeros(max_rank)   # recall at each rank.
    perf_vect_hr = np.zeros(max_rank)    # hit rate (a modification of Xia Ning's paper).

    for user_idx in range(data_left.num_row):
        # test column indices.
        lo_col = set(np.nonzero(lo_data_csr[user_idx, :])[1].tolist())

        # skip users that have no testing data.
        if len(lo_col) == 0:
            continue

        # predict the entire row.
        row_pred = method.predict_row(user_idx, pred_col)

        # rank the columns (the result is a list of column indices).
        srt_col = [k[0] for k in sorted(enumerate(row_pred), key=lambda x: x[1], reverse=True)]

        # trained columns.
        tr_col = set(np.nonzero(tr_data_csr[user_idx, :])[1].tolist())

        # remove the trained columns from the prediction; this leaves a ranked
        # list of column indices excluding training items.
        te_srt_col = [col_pos for col_pos in srt_col if col_pos not in tr_col]

        # the hit variable keeps track of the number of hits up to the current rank.
        hit = 0
        for rk in range(max_rank):
            # rk iterates over ranks 0..max_rank-1; a hit occurs when the item
            # at the current rank is one of the held-out items.
            if (rk < len(te_srt_col)) and (te_srt_col[rk] in lo_col):
                hit += 1
            perf_vect_hr[rk] += float(hit) / len(lo_col)    # hit rate.
            perf_vect_prec[rk] += float(hit) / (rk + 1)     # precision.
            perf_vect_rec[rk] += float(hit) / len(lo_col)   # recall.

    # normalization over users.
    perf_vect_hr = perf_vect_hr / data_left.num_row
    perf_vect_prec = perf_vect_prec / data_left.num_row
    perf_vect_rec = perf_vect_rec / data_left.num_row

    eval_result['hit_rate'] = perf_vect_hr
    eval_result['precision'] = perf_vect_prec
    eval_result['recall'] = perf_vect_rec
    eval_result['RMSE'] = rmse(data_left.data_val,
                               method.predict(data_left.data_row, data_left.data_col))
    return eval_result
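# --- Worked example (not part of the original module) ---
# Suppose a user holds out items {3, 7} and the ranked prediction (training
# items already removed) starts with [7, 1, 3, ...]. Then at ranks 0, 1, 2:
#   rank 0: item 7 is a hit  -> hit=1, precision=1/1, recall=1/2
#   rank 1: item 1 is a miss -> hit=1, precision=1/2, recall=1/2
#   rank 2: item 3 is a hit  -> hit=2, precision=2/3, recall=2/2
# These per-rank values are summed over users and then divided by the number
# of rows, which is exactly what the loop above accumulates.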
def read_file_with_minval(self, filename, min_duid, min_pid, num_duid=None, num_pid=None, rand_seed=1):
    '''
    This method first goes through the data once, and filters out the devices
    and programs whose occurrences are below the specified values. Supports
    random undersampling.

    Parameters
    ----------
    @param filename: a string consisting of the name and location of the data file to be read.
    @param min_duid: a positive integer. The minimum occurrence of a device for the device to be included.
    @param min_pid: a positive integer. The minimum occurrence of a program for the program to be included.

    Returns
    ----------
    result: a FeedbackData data structure constructed from the data file.
        The result also carries a genre-program mapping:
        (result.meta['pggr_pg'][i], result.meta['pggr_gr'][i]) indicates that the
        program at result.meta['pggr_pg'][i] is marked by the genre at
        result.meta['pggr_gr'][i]. The genre mapping is in R:/Data/Rovi/genre.csv,
        and a vintage copy is also kept in the datasample/Rovi folder.
    '''
    if num_duid is None and num_pid is None:
        subsample = False
        res_str = 'DWT_RFWMV[' + str(filename) + '][MIN DUID' + str(min_duid) + '][MIN PID' + str(min_pid) + ']'
    elif num_duid is not None and num_pid is not None:
        subsample = True
        res_str = 'DWT_RFWMV[' + str(filename) + '][MIN DUID' + str(min_duid) + '][MIN PID' + str(min_pid) + ']' \
                + '[NUM DUID' + str(num_duid) + ']' + '[NUM PID' + str(num_pid) + ']'
    else:
        raise ValueError('num_duid and num_pid should be both set or both left as default')

    # check if the resource is already available; if not, load from the data file and save the resource.
    if not URM.CheckResource(URM.RTYPE_DATA, res_str):
        Logger.Log('Computing data information...')
        [occur_duid, occur_pid, _, _] = self.read_file_info(filename)
        print str(len(occur_duid)), 'devices', str(len(occur_pid)), 'programs'

        Logger.Log('Generating filtering indices...')
        duidlist = [sel_duid for sel_duid, sel_duidcnt in occur_duid.iteritems() if sel_duidcnt > min_duid]
        pidlist = [sel_pid for sel_pid, sel_pidcnt in occur_pid.iteritems() if sel_pidcnt > min_pid]
        print 'After filtering [MIN_DUID:', str(min_duid), ' MIN_PID:', str(min_pid), ']:', \
            str(len(duidlist)), 'devices', str(len(pidlist)), 'programs'

        # perform random subsampling.
        if subsample:
            random.seed(rand_seed)
            if len(duidlist) > num_duid:
                # subsample DUID.
                random.shuffle(duidlist)
                duidlist = duidlist[:num_duid]
            if len(pidlist) > num_pid:
                # subsample PID.
                random.shuffle(pidlist)
                pidlist = pidlist[:num_pid]

        duidlist = set(duidlist)
        pidlist = set(pidlist)

        # read the raw data file with the filtered lists.
        [mapping_duid, mapping_pid, row, col, data, pggr_pg, pggr_gr] \
            = self.read_file_with_id_list(filename, duidlist, pidlist)
        Logger.Log('read_file_with_minval process completed.')

        result = FeedbackData(row, col, data, len(mapping_duid), len(mapping_pid),
                              mapping_duid, mapping_pid, {'pggr_pg': pggr_pg, 'pggr_gr': pggr_gr})

        # save the computed result to the resource cache.
        URM.SaveResource(URM.RTYPE_DATA, res_str, result)
        return result
    else:
        return URM.LoadResource(URM.RTYPE_DATA, res_str)
def experiment_unit_leave_k_out(exp_id, method, data_tr, data_left, iteration, top_n):
    '''
    This method works on the column/row indices of data_tr and data_left, which
    must be completely aligned both row-wise and column-wise.
    '''
    # define lko_log style.
    lko_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    result_resource_str = 'exp' + exp_id + \
                          '_method' + method.unique_str() + \
                          '_iter' + str(iteration)
    # use a sub folder to store the experiment resource.
    sub_folder = exp_id + '/models/' + method.unique_str()

    # check resource for existing model.
    trained_model = URM.LoadResource(URM.RTYPE_RESULT, result_resource_str, sub_folder)
    if not trained_model:
        # train model using the training data.
        # NOTE: this is the most time-consuming part.
        lko_log('training models...')
        method.train(data_tr)

        # save resource
        trained_model = [method]
        URM.SaveResource(URM.RTYPE_RESULT, result_resource_str, trained_model, sub_folder)

    # compute performance on test data using the model.
    [method] = trained_model
    lko_log('computing evaluation metrics on the test data...')

    eval_result = {}

    # ranked list.
    col_num = data_left.num_col
    pred_col = range(col_num)

    tr_data_csr = data_tr.get_sparse_matrix().tocsr()
    lo_data_csr = data_left.get_sparse_matrix().tocsr()

    prec_sum = 0.0
    rec_sum = 0.0
    for user_idx in range(data_left.num_row):
        # predict the entire row.
        row_pred = method.predict_row(user_idx, pred_col)

        # rank the columns (the result is a list of column indices).
        srt_col = [k[0] for k in sorted(enumerate(row_pred), key=lambda x: x[1], reverse=True)]

        # trained columns.
        tr_col = set(np.nonzero(tr_data_csr[user_idx, :])[1].tolist())

        # remove the trained columns.
        te_srt_col = [col_pos for col_pos in srt_col if col_pos not in tr_col]

        # top-n (slicing beyond the list length is safe in Python).
        te_topk_col = te_srt_col[:top_n]

        # test column indices.
        lo_col = set(np.nonzero(lo_data_csr[user_idx, :])[1].tolist())

        # accumulate per-user precision/recall; averaged over users below.
        prec_sum += precision_itemlist(te_topk_col, lo_col)
        rec_sum += recall_itemlist(te_topk_col, lo_col)

    eval_result['prec'] = prec_sum / data_left.num_row
    eval_result['recall'] = rec_sum / data_left.num_row
    eval_result['rmse'] = rmse(data_left.data_val,
                               method.predict(data_left.data_row, data_left.data_col))
    return eval_result
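# --- Illustrative sketch (not part of the original module) ---
# Minimal versions of the precision_itemlist/recall_itemlist helpers used above,
# assuming standard top-N precision and recall over item sets. The actual
# helpers may differ.
def precision_itemlist_sketch(pred_items, true_items):
    # fraction of recommended items that are relevant.
    if not pred_items:
        return 0.0
    return float(len(set(pred_items) & set(true_items))) / len(pred_items)

def recall_itemlist_sketch(pred_items, true_items):
    # fraction of relevant items that are recommended.
    if not true_items:
        return 0.0
    return float(len(set(pred_items) & set(true_items))) / len(true_items)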
def experiment_tr_te_map(exp_name, train_data_file, test_data_file,
                         train_item_feature_file, test_item_feature_file,
                         max_rank, binary=False):
    '''
    Parameters
    ----------
    @param exp_name: the experiment name (prefix).
    @param train_data_file: the training data file(s).
    @param test_data_file: the testing data file(s).
    @param train_item_feature_file: the training item (content) feature file(s), optional.
    @param test_item_feature_file: the testing item (content) feature file(s), optional.
    @param max_rank: the maximal N in the computation.

    Returns
    ----------
    @return out
    '''
    # initialize utilities.
    trte_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    # file-name hashing (used for the cache string).
    # create a hash for a single file or a list of files.
    if isinstance(train_data_file, list):
        hash_file_tr_data_str = str(hash(tuple(train_data_file)))
    else:
        hash_file_tr_data_str = str(hash(train_data_file))

    if isinstance(test_data_file, list):
        hash_file_te_data_str = str(hash(tuple(test_data_file)))
    else:
        hash_file_te_data_str = str(hash(test_data_file))

    if train_item_feature_file:
        if isinstance(train_item_feature_file, list):
            hash_file_tr_item_feature_str = str(hash(tuple(train_item_feature_file)))
        else:
            hash_file_tr_item_feature_str = str(hash(train_item_feature_file))
    else:
        hash_file_tr_item_feature_str = ''

    if test_item_feature_file:
        if isinstance(test_item_feature_file, list):
            hash_file_te_item_feature_str = str(hash(tuple(test_item_feature_file)))
        else:
            hash_file_te_item_feature_str = str(hash(test_item_feature_file))
    else:
        hash_file_te_item_feature_str = ''

    # display information.
    print 'Training data file', train_data_file, ' [hash:', hash_file_tr_data_str, ']'
    if train_item_feature_file:
        print 'Training content feature provided: ', train_item_feature_file, \
            ' [hash:', hash_file_tr_item_feature_str, ']'
    else:
        print 'Training content feature not provided.'

    print 'Testing data file ', test_data_file, ' [hash:', hash_file_te_data_str, ']'
    if test_item_feature_file:
        print 'Testing content feature provided: ', test_item_feature_file, \
            ' [hash:', hash_file_te_item_feature_str, ']'
    else:
        print 'Testing content feature not provided.'

    if binary:
        exp_id_prefix = 'trte_bi_'
    else:
        exp_id_prefix = 'trte_'

    exp_id = exp_id_prefix + exp_name + '_trdata_' + hash_file_tr_data_str \
           + '_tedata_' + hash_file_te_data_str \
           + '_tritemf_' + hash_file_tr_item_feature_str \
           + '_teitemf_' + hash_file_te_item_feature_str
    trte_log('Experiment ID: ' + exp_id)

    # load utility data and feature data.
    trte_log('Read training data...')
    reader = UtilityDataReader(fieldDelimiter='\t')

    tr_data = reader.read_file_with_minval(train_data_file, 0, 0)
    trte_log('Training data loaded: ' + str(tr_data))

    te_data = reader.read_file_with_minval(test_data_file, 0, 0)
    trte_log('Testing data loaded: ' + str(te_data))

    # load item feature data.

    if binary:
        trte_log('Binarizing data...')
        tr_data.binarize()
        te_data.binarize()

    result = {}
'''
Created on Jan 29, 2014

@author: jiayu.zhou
'''

import numpy as np

from rs.algorithms.recommendation.generic_recalg import CFAlg
from rs.utils.log import Logger

# an encapsulated logger.
log = lambda message: Logger.Log(RandUV.ALG_NAME + ':' + message, Logger.MSG_CATEGORY_ALGO)


class RandUV(CFAlg):
    '''
    A random guess recommender (demo).
    '''
    ALG_NAME = 'RANDOM ALGO'

    def __init__(self, latent_factor=5, verbose=False):
        '''
        Constructor
        '''
        # initialize parameters.
        self.latent_factor = latent_factor

        log('dummy algorithm instance created: latent factor ' + str(self.latent_factor))

        self.verbose = verbose
'''

import numpy as np
import csv
import cPickle as pickle
import os  # @UnusedImport
import sys

from scipy.sparse import coo_matrix
from scipy.spatial.distance import cosine

from rs.utils.log import Logger
from rs.data.daily_watchtime import DailyWatchTimeReader
from rs.utils.sparse_matrix import normalize_row

mcpl_log = lambda message: Logger.Log('PROG SIMILARITY: ' + message, Logger.MSG_CATEGORY_EXP)

if __name__ == '__main__':

    if len(sys.argv) >= 3:
        filename = sys.argv[1]
        rovi_daily_file = sys.argv[2]
    else:
        # INPUT: the ROVI daily mapping.
        # Hadoop location: /apps/vddil/rovi_daily
        rovi_daily_file = "/Users/jiayu.zhou/Data/rovi_daily/20131209.tsv"

        # INPUT: the aggregated data file.
        # Hadoop location: /apps/vddil/duid-program-watchTime-genre
        #filename = "/Users/jiayu.zhou/Data/duid-program-watchTime-genre/20131209/part-r-00000"
        filename = "../../datasample/agg_duid_pid_watchtime_genre/20131209_100000"