def plot_measure_of_group(group: List[int], method: str,
                          model: models.ScoreModel, title: str,
                          measure_array: np.array,
                          axis_label: Tuple[str, str], dest_path_tail: str):
    """Plot each user's measures for one model/method on a shared figure.

    For every user in ``group`` the user's result CSV is read and the first
    row's value of every column named in ``measure_array`` is drawn as one
    series labelled ``user<id>``.  The figure is saved below
    ``share.PLOT_TOP/measure_of_group/<model dir>/``.

    Args:
        group: user ids to compare.
        method: method name forwarded to ``util.get_result_path``.
        model: model whose ``get_dir_name()`` selects the result directory.
        title: figure title.
        measure_array: 1-D array of measure (column) names; also used as the
            x tick labels.
        axis_label: ``(xlabel, ylabel)``.
        dest_path_tail: file name appended to the destination directory.
    """
    model_dir = model.get_dir_name()
    dest = share.PLOT_TOP + '/measure_of_group/' + model_dir + '/' + dest_path_tail
    util.init_file(dest)
    pyplot.figure(figsize=(10, 6))
    pyplot.subplot(GS[0, :GRID_WIDTH - 1])

    # The x positions depend only on the number of measures.  Computing them
    # before the loop keeps pyplot.xticks() below working when `group` is
    # empty (they used to be defined only inside the loop).
    measure_count = measure_array.shape[0]
    xs = [x * 0.1 for x in range(measure_count)]
    result_dir = share.RESULT_TOP + '/' + model_dir

    for user_id in group:
        source = util.get_result_path(dir_name=result_dir,
                                      method=method,
                                      user_id=user_id)
        data = pd.read_csv(source)
        ys = [data[measure].values[0] for measure in measure_array]
        label = 'user' + str(user_id)
        if measure_count == 1:
            # A single point cannot be drawn as a line.
            pyplot.scatter(xs, ys, label=label)
        else:
            pyplot.plot(xs, ys, label=label)

    pyplot.title(title)
    pyplot.xlabel(axis_label[0])
    pyplot.ylabel(axis_label[1])
    pyplot.xticks(xs, measure_array)
    pyplot.legend(bbox_to_anchor=(1, 1), loc='upper left', borderaxespad=0)
    pyplot.savefig(dest)
def set_exam_tex_form(group: List[int],
                      compare_dict: Dict[str, Tuple[str, models.ScoreModel]],
                      title: str):
    """Write one comma-separated table per measure for later testing.

    For every measure listed in ``share.MEASURE_TYPE_MEASURE_DICT`` a file
    ``share.TEX_EXAM_SOURCE_TOP/<title>/<measure>`` is written with a
    ``user_id,<key1>,<key2>,...`` header and one row per user.  Each cell is
    the first-row value of that measure in the user's result CSV, rounded to
    ``share.DIGIT`` digits.  A missing result file is reported on stdout and
    leaves the cell empty.

    ``compare_dict`` maps a column key to ``(method, model)``; column order
    follows the dict's iteration order.
    """
    splitter = ','
    dest_head = share.TEX_EXAM_SOURCE_TOP + '/' + title
    column_keys = list(compare_dict)
    header = splitter.join(['user_id'] + column_keys)

    for measure_type in share.MEASURE_TYPE_LIST:
        for measure in share.MEASURE_TYPE_MEASURE_DICT[measure_type]:
            dest = dest_head + '/' + measure
            util.init_file(dest)
            with open(dest, 'wt') as fout:
                fout.write(header + '\n')
                for user_id in group:
                    cells = [str(user_id)]
                    for key in column_keys:
                        method, score_model = compare_dict[key]
                        source = util.get_result_path(
                            dir_name=share.RESULT_TOP + '/' +
                            score_model.get_dir_name(),
                            user_id=user_id,
                            method=method)
                        try:
                            data = pd.read_csv(source)
                            cell = str(
                                round(data[measure].values[0], share.DIGIT))
                        except FileNotFoundError:
                            print(source + ' not found' + '\n')
                            cell = ''  # keep the column, leave it blank
                        cells.append(cell)
                    fout.write(splitter.join(cells) + '\n')
def plot_measure_of_compare(compare_dict: Dict[str, Tuple[str,
                                                          models.ScoreModel]],
                            title: str, group: List[int],
                            measure_array: np.array,
                            axis_label: Tuple[str, str], dest_path_tail: str):
    """Plot the group-averaged measures of every entry in ``compare_dict``.

    For each ``key -> (method, model)`` entry, the first-row value of every
    measure column is averaged over the users in ``group`` and drawn as one
    series labelled ``key``.  The figure is saved to
    ``share.PLOT_TOP/measure_of_compare/<dest_path_tail>``.

    Args:
        compare_dict: label -> ``(method, model)`` pairs to compare.
        title: figure title.
        group: user ids whose results are averaged.
        measure_array: 1-D array of measure (column) names; also used as the
            x tick labels.
        axis_label: ``(xlabel, ylabel)``.
        dest_path_tail: file name appended to the destination directory.
    """
    dest = share.PLOT_TOP + '/measure_of_compare/' + dest_path_tail
    util.init_file(dest)
    pyplot.figure(figsize=(10, 6))
    pyplot.subplot(GS[0, :GRID_WIDTH - 1])

    # Hoisted out of the loop: independent of the compared entry, and needed
    # by pyplot.xticks() even when compare_dict is empty.
    measure_count = measure_array.shape[0]
    xs = [x * 0.1 for x in range(measure_count)]
    group_size = len(group)

    for key, (method, compare) in compare_dict.items():
        result_dir = share.RESULT_TOP + '/' + compare.get_dir_name()
        # Read every user's result file once instead of once per measure.
        user_results = [
            pd.read_csv(
                util.get_result_path(dir_name=result_dir,
                                     method=method,
                                     user_id=user_id)) for user_id in group
        ]
        ys = []
        for measure in measure_array:
            value = .0
            # Same accumulation order as before, so the sums are unchanged.
            for data in user_results:
                value += data[measure].values[0] / group_size
            ys.append(value)
        if measure_count == 1:  # for MAiP: a single point, not a line
            pyplot.scatter(xs, ys, label=key)
        else:
            pyplot.plot(xs, ys, label=key)

    pyplot.title(title)
    pyplot.xlabel(axis_label[0])
    pyplot.ylabel(axis_label[1])
    pyplot.xticks(xs, measure_array)
    pyplot.legend(bbox_to_anchor=(1, 1), loc='upper left', borderaxespad=0)
    pyplot.savefig(dest)
def get_exam_tex_form(compare_list: List[str], compared: str, title: str):
    """Write a '&'-separated table of Mann-Whitney U p-values.

    For every measure, the per-user table
    ``share.TEX_EXAM_SOURCE_TOP/<title>/<measure>`` is read and the column
    ``compared`` is tested against each column in ``compare_list`` with a
    one-sided (``alternative='greater'``) Mann-Whitney U test.  The p-values,
    rounded to ``share.DIGIT`` digits, are written to
    ``share.TEX_EXAM_TOP/<title>``.  When the test raises ``ValueError`` the
    cell becomes ``nan``.

    Args:
        compare_list: column names tested against ``compared``.
        compared: column name of the reference sample.
        title: sub-path used for both the source directory and the output.
    """
    source_h = share.TEX_EXAM_SOURCE_TOP + '/' + title
    dest = share.TEX_EXAM_TOP + '/' + title
    splitter = '&'
    # NOTE(review): these emit a single backslash before the newline /
    # "\hline" (a LaTeX row normally ends with a double backslash).  The
    # emitted bytes are kept exactly as before - confirm intent before
    # changing them.  The backslash before "hline" is now escaped explicitly
    # because "\h" is an invalid escape sequence.
    measure_suffix = '\\\n'
    measure_type_suffix = '\\ \\hline\n'

    header = '$measure$'
    header += ''.join(splitter + compare for compare in compare_list)
    header += measure_type_suffix

    util.init_file(dest)
    with open(dest, 'wt') as fout:
        fout.write(header)
        for measure_type in share.MEASURE_TYPE_LIST:
            measure_array = share.MEASURE_TYPE_MEASURE_DICT[measure_type]
            for i, measure in enumerate(measure_array):
                line = measure
                data = pd.read_csv(source_h + '/' + measure)
                for compare in compare_list:
                    try:
                        # Mann-Whitney U test; only the p-value is reported.
                        _, p = stats.mannwhitneyu(data[compared],
                                                  data[compare],
                                                  alternative='greater')
                        line += splitter + str(round(p, share.DIGIT))
                    except ValueError:
                        line += splitter + str(np.nan)
                # The last measure of a measure type closes with \hline.
                if i + 1 == measure_array.shape[0]:
                    line += measure_type_suffix
                else:
                    line += measure_suffix
                fout.write(line)
def create_cluster(df: pd.DataFrame, n_clusters: int,
                   target_axis_list: List[str], train_id: int, user_id: int,
                   remapping: bool) -> List[pd.DataFrame]:
    """Split ``df`` into ``n_clusters`` frames and cache the assignment.

    When ``share.REUSE_CLUSTER`` is set and the cache file already exists,
    the clusters are loaded with ``read_cluster``.  Otherwise the rows are
    clustered with KMeans on the ``target_axis_list`` columns (or kept as a
    single cluster when ``n_clusters == 1``) and every row is logged to the
    cache file, prefixed with ``n_clusters`` and its cluster number.

    Args:
        df: rows to cluster.
        n_clusters: number of clusters, counted from 1; cluster numbers in
            the log are counted from 0.
        target_axis_list: columns used as clustering features.
        train_id: forwarded to ``util.get_cluster_path``.
        user_id: forwarded to ``util.get_cluster_path``.
        remapping: selects the ``remapping_<bool>`` cache directory.

    Returns:
        One DataFrame per cluster, indexed by cluster number.  Each frame is
        a row subset of ``df`` and keeps its index labels, column order and
        dtypes.
    """
    path = util.get_cluster_path(dir_name=share.CLUSTER_DATA_TOP +
                                 '/remapping_' + str(remapping) + '/cluster_' +
                                 str(n_clusters),
                                 train_id=train_id,
                                 user_id=user_id)
    if share.REUSE_CLUSTER and os.path.isfile(path):
        # Reuse is enabled and the cluster file exists.
        return read_cluster(n_clusters, path)

    # Cluster BEFORE touching the cache file: if fitting fails, no
    # header-only file is left behind for the reuse branch above to find.
    if n_clusters == 1:
        labels = [0] * len(df)
        hotel_cluster = [df]
    else:
        axis_array = np.array(
            [df[score_type].tolist() for score_type in target_axis_list]).T
        labels = KMeans(n_clusters=n_clusters).fit_predict(axis_array)
        # Positional selection replaces the row-by-row DataFrame.append(),
        # which was removed in pandas 2.0 and copied the frame on every call.
        hotel_cluster = [
            df.iloc[np.flatnonzero(labels == cluster_num)]
            for cluster_num in range(n_clusters)
        ]

    util.init_file(path)
    header = ','.join(['n_clusters', 'cluster'] + list(df.columns)) + '\n'
    with open(path, 'wt') as fout:
        fout.write(header)
        for position, cluster_num in enumerate(labels):
            row = df.iloc[position]  # positional: safe with any index
            line = util.get_line_from_series(data=row,
                                             key_list=row.index,
                                             splitter=',',
                                             start=str(n_clusters) + ',' +
                                             str(cluster_num))
            fout.write(line + '\n')
    return hotel_cluster
def log_weight_and_score_model_list(self):
    """Pickle ``self.weight_and_score_model_list`` without its copula entries.

    The list is written to
    ``self.dest_dict['log_weight_and_score_model_list']/<user_train_id_path>``
    as ``(weight, dict)`` pairs, where each dict is the original score-model
    dict minus the ``'copula'`` key.
    """
    dest = (self.dest_dict['log_weight_and_score_model_list'] + '/' +
            self.user_train_id_path)
    util.init_file(dest)
    # The copula object carries '_'-leading variables, which leads to
    # pyper.RError, so everything except the copula is pickled.
    picklable = [(weight, {
        name: entry
        for name, entry in score_model_dict.items() if name != 'copula'
    }) for weight, score_model_dict in self.weight_and_score_model_list]
    with open(dest, 'wb') as fout:
        pickle.dump(picklable, fout)
def log_axis(self):
    """Write the axis-selection state as a one-row CSV file.

    The file ``self.dest_dict['log_axis']/<user_train_id_path>`` receives a
    header line and one data line.  ``const_a``, ``med`` and ``madn`` are
    written as plain values; the dict/list attributes are written as their
    ``str()`` form wrapped in double quotes so the embedded commas stay
    inside one CSV field.
    """
    dest = self.dest_dict['log_axis'] + '/' + self.user_train_id_path
    header = 'left,const_a,med,madn,bound_dict,kl_dict,prod,score_type_list,reduced,tl_score_type_list\n'

    quote = '"'
    quoted_objects = (
        self.bound_dict,
        self.kl_dict,
        self.prod_axis,
        self.score_type_list,
        self.reduced_axis,
        self.tlr_axis,
    )
    fields = ['left', str(self.const_a), str(self.med), str(self.madn)]
    fields.extend(quote + str(obj) + quote for obj in quoted_objects)
    line = ','.join(fields) + '\n'

    util.init_file(dest)
    with open(dest, 'wt') as fout:
        fout.write(header)
        fout.write(line)
def plot_kl_profile(user_id: int, train_id: int, model: models.ScoreModel):
    """Plot the KL divergence of every score type together with the bounds.

    Reads the first row of the model's ``log_axis`` file for the given user
    and training fold.  Each score type is drawn at ``x = kl_dict[score_type]``
    on the normal pdf with ``loc=med`` and ``scale=madn``; the marker shows
    which logged axis list the score type came from.  Every entry of
    ``bound_dict`` is drawn as a vertical line up to the pdf's peak.  The
    figure is saved below ``share.PLOT_TOP/kl_profile/<model dir>/``.

    Args:
        user_id: user whose log is plotted.
        train_id: training fold whose log is plotted.
        model: model providing ``get_dest_dict()`` and ``get_dir_name()``.
    """
    user_train_id_path = util.get_user_train_id_path(user_id=user_id,
                                                     train_id=train_id)
    source = model.get_dest_dict()['log_axis'] + '/' + user_train_id_path
    dest = (share.PLOT_TOP + '/kl_profile/' + model.get_dir_name() + '/' +
            user_train_id_path)
    util.init_file(dest)
    pyplot.figure(figsize=(10, 6))
    pyplot.subplot(GS[0, :GRID_WIDTH - 1])

    row = pd.read_csv(source).iloc[0]  # default quotechar is "
    # SECURITY NOTE: eval() executes whatever the log file contains.  It is
    # kept because the fields are read back as Python objects (the original
    # notes bound_dict is an ordered dict), but this must only ever be
    # pointed at log files this program wrote itself.
    bound_dict = eval(row['bound_dict'])
    marker_axis_list = [
        ('v', eval(row['reduced'])),  # down triangle
        ('.', eval(row['score_type_list'])),  # dot
        ('^', eval(row['prod'])),  # up triangle
        ('o', eval(row['tl_score_type_list'])),  # circle
    ]
    kl_dict = eval(row['kl_dict'])
    med, madn = row['med'], row['madn']

    for marker, axis in marker_axis_list:
        for score_type in axis:
            xs = [kl_dict[score_type]]
            ys = [norm.pdf(x=x, loc=med, scale=madn) for x in xs]
            pyplot.scatter(xs, ys, marker=marker, label=score_type)

    bound_color_dict = {
        'bound1': 'blue',
        'bound2': 'red',
        'bound3': 'green'
    }  # bound3 is optional
    y_top = norm.pdf(x=med, loc=med, scale=madn)
    for key, bound in bound_dict.items():
        # The logged bound_dict can hold keys without a fixed colour (e.g.
        # 'ol'); those fall back to matplotlib's default colour instead of
        # raising KeyError.
        line_style = {}
        if key in bound_color_dict:
            line_style['color'] = bound_color_dict[key]
        pyplot.plot([bound] * 2, [.0, y_top], label=key, **line_style)

    pyplot.title('kl_profile')
    pyplot.legend(bbox_to_anchor=(1, 1), loc='upper left', borderaxespad=0)
    pyplot.savefig(dest)
def get_measure_tex_form(group: List[int],
                         compare_dict: Dict[str, Tuple[str,
                                                       models.ScoreModel]],
                         title: str):
    """Write a '&'-separated table of group-averaged measures.

    One row is written per measure and one column per ``compare_dict`` key
    (in the dict's iteration order).  Each cell is the first-row value of the
    measure in every user's result CSV, averaged over ``group`` and rounded
    to ``share.DIGIT`` digits.  The table goes to
    ``share.TEX_MEASURE_TOP/<title>``.

    If a user's result file is missing, the file name is printed and the
    remaining users of that cell are skipped; the cell then holds the partial
    sum accumulated so far (still divided by the full group size).

    Args:
        group: user ids to average over.
        compare_dict: column key -> ``(method, model)``.
        title: output file name below ``share.TEX_MEASURE_TOP``.
    """
    dest = share.TEX_MEASURE_TOP + '/' + title
    splitter = '&'
    # NOTE(review): these emit a single backslash before the newline /
    # "\hline" (a LaTeX row normally ends with a double backslash).  The
    # emitted bytes are kept exactly as before - confirm intent before
    # changing them.  The backslash before "hline" is now escaped explicitly
    # because "\h" is an invalid escape sequence.
    measure_suffix = '\\\n'
    measure_type_suffix = '\\ \\hline\n'
    group_size = len(group)
    keys = list(compare_dict)

    header = 'measure' + ''.join(splitter + key for key in keys)
    header += measure_suffix

    util.init_file(dest)
    with open(dest, 'wt') as fout:
        fout.write(header)
        for measure_type in share.MEASURE_TYPE_LIST:
            measure_array = share.MEASURE_TYPE_MEASURE_DICT[measure_type]
            for i, measure in enumerate(measure_array):
                line = measure
                for key in keys:
                    # Loop-invariant for all users of this column.
                    method, model = compare_dict[key]
                    result_dir = share.RESULT_TOP + '/' + model.get_dir_name()
                    value = 0.0
                    for user_id in group:
                        source = util.get_result_path(dir_name=result_dir,
                                                      user_id=user_id,
                                                      method=method)
                        try:
                            data = pd.read_csv(source)
                        except FileNotFoundError:
                            print(source + ' not found\n')
                            break
                        value += data[measure].values[0] / group_size
                    line += splitter + str(round(value, share.DIGIT))
                # The last measure of a measure type closes with \hline.
                if i + 1 == measure_array.shape[0]:
                    line += measure_type_suffix
                else:
                    line += measure_suffix
                fout.write(line)
def plot_hist(path: str):
    """Plot histograms of ``chargeScore`` and ``hotelMiddleCharge``.

    Reads the CSV at ``path`` and saves two figures below
    ``share.PLOT_TOP/histgram/``: the distribution of ``chargeScore`` over
    ``[0, 1]`` and the distribution of ``hotelMiddleCharge`` counted in bins
    of ``bin_width`` between ``charge_min`` and ``charge_max``.

    Args:
        path: CSV file containing both columns.
    """
    bin_width = 1000
    charge_min = 1000
    charge_max = 28000
    # Previously referenced but never defined: one bin per `bin_width` step.
    bin_num = (charge_max - charge_min) // bin_width
    charge_xticks = list(
        range(charge_min // bin_width, charge_max // bin_width + 1))

    data = pd.read_csv(path)

    # --- chargeScore ---
    score_xs = data['chargeScore'].values
    # Leading '/' added and trailing '/' dropped so the path is built like
    # every other destination below share.PLOT_TOP.
    score_dest = share.PLOT_TOP + '/histgram/chargeScore'
    util.init_file(score_dest)
    pyplot.hist(score_xs, range=(0.0, 1.0))
    pyplot.title('histgram for chargeScore of all items')
    pyplot.savefig(score_dest)
    pyplot.figure()

    # --- hotelMiddleCharge, expressed in units of bin_width ---
    # Truncating division done directly on the array; the former
    # np.vectorize(..., otypes=int) call is not a valid otypes specification.
    charge_xs = (data['hotelMiddleCharge'].values / bin_width).astype(int)
    charge_dest = share.PLOT_TOP + '/histgram/middleCharge'
    util.init_file(charge_dest)  # was util.initFile
    pyplot.hist(charge_xs,
                range=(charge_min / bin_width, charge_max / bin_width),
                bins=bin_num)
    pyplot.xticks(charge_xticks)
    pyplot.title('histgram for Charge of all items')
    pyplot.savefig(charge_dest)
    pyplot.figure()
def plot_marg_and_rank(model: models.ScoreModel, user_id: str, train_id: int,
                       method: str, boolean_value: int, title: str,
                       denominator_list: List[int], denominator_title,
                       dest_path_tail: str):
    """Plot the user and all-items pdfs with the ranked items marked on them.

    Loads the pickled all-items marginals, the pickled per-user weighted
    marginals and the ranking CSV.  Rows whose ``boolean`` column equals
    ``boolean_value`` are bucketed by ``ranking // rank_space`` and, for each
    score type in ``share.DEFAULT_SCORE_TYPE_LIST``, scattered on the user's
    pdf in one colour per bucket.  One figure per score type is saved below
    ``share.PLOT_TOP/marg_and_boolean/``.  A score type that raises
    ``KeyError`` anywhere while being plotted is skipped silently.

    NOTE: the two input files are loaded with pickle; only point this at
    files written by this program.
    """
    rank_space = 10
    color_list = [
        'red', 'orange', 'yellow', 'green', 'blue', 'violet', 'black'
    ]

    # all-items marginals
    _, _, mapping_id = util.get_score_mapping_param(user_id)
    all_items_marg_path = (model.get_dest_dict()['all_items_marg_dict'] +
                           '/' + mapping_id)
    with open(all_items_marg_path, 'rb') as fin:
        all_marg_dict = pickle.load(fin)

    # per-user marginals
    user_input_path = (
        model.get_dest_dict()['log_weight_and_score_model_list'] + '/' +
        util.get_user_train_id_path(user_id=user_id, train_id=train_id))
    with open(user_input_path, 'rb') as fin:
        user_weight_marg_dict_list = pickle.load(fin)

    # ranking data
    rank_input_path = util.get_result_path(dir_name=share.RANKING_TOP + '/' +
                                           model.get_dir_name(),
                                           method=method,
                                           user_id=user_id,
                                           train_id=train_id)
    rank_data = pd.read_csv(rank_input_path, index_col='id')

    # bucket the selected rows by rank and count the denominator
    bool_data = rank_data[rank_data['boolean'] == boolean_value]
    size = bool_data.shape[0]
    ranking_list = [[] for _ in color_list]
    denominator_size = 0
    for hotel_id, row in bool_data.iterrows():
        bucket = int(row['ranking'] / rank_space)
        ranking_list[bucket].append(hotel_id)
        if row['boolean'] in denominator_list:
            denominator_size += 1

    for score_type in share.DEFAULT_SCORE_TYPE_LIST:
        dest = util.get_result_path(
            dir_name=share.PLOT_TOP + '/marg_and_boolean/' +
            model.get_dir_name() + '/' + dest_path_tail,
            method=method,
            user_id=user_id,
            train_id=train_id) + '/' + score_type
        try:
            all_pdf = all_marg_dict[score_type].pdf
            user_pdf = util.get_pdf_from_weight_marg_dict_list(
                weight_marg_dict_list=user_weight_marg_dict_list,
                score_type=score_type)
            xs = list(
                np.arange(SCORE_SPACE_DICT[score_type][1],
                          SCORE_SPACE_DICT[score_type][0], 0.01))
            ys = [user_pdf(x) for x in xs]
            xs_all = xs
            ys_all = [all_pdf(x) for x in xs_all]

            for bucket, color in enumerate(color_list):
                bool_xs = [
                    bool_data.loc[index][score_type]
                    for index in ranking_list[bucket]
                ]
                bool_ys = [user_pdf(x) for x in bool_xs]
                pyplot.scatter(bool_xs,
                               bool_ys,
                               label='top' + str((bucket + 1) * rank_space),
                               color=color)

            pyplot.plot(xs_all, ys_all, label='all_items')
            pyplot.plot(xs, ys, label='user')
            pyplot.title('pdf and ' + title + ' ' + str(size) + 'items/' +
                         denominator_title + str(denominator_size) +
                         'items ' + 'for ' + score_type)
            pyplot.xlabel('score')
            pyplot.ylabel('pdf')
            pyplot.xticks(
                np.arange(SCORE_SPACE_DICT[score_type][1],
                          SCORE_SPACE_DICT[score_type][0], 0.1))
            pyplot.legend()
            util.init_file(dest)
            pyplot.savefig(dest)
            pyplot.figure()
        except KeyError:
            # score_type reduced by kl_reduced: nothing to plot for it
            continue
sys.stderr.write(share.CLUSTER_DATA_TOP+' exist\n') sys.stderr.write("iloc "+str(iloc)+" is used."+"Retry command+='--iloc=another_loc'\n") sys.exit(share.ERROR_STATUS) #set ppl param per user of group if args['set_ppl']: mapping_id_user_dict={} for user_id in group: path=share.PPL_TOP+'/'+'user'+str(user_id) _,_,mapping_id=util.set_score_mapping_param(path=path,user_id=user_id) if mapping_id in mapping_id_user_dict: mapping_id_user_dict[mapping_id].append(user_id) else: mapping_id_user_dict[mapping_id]=[user_id] util.init_file(share.MAPPING_ID_USER_DICT_PATH) with open(share.MAPPING_ID_USER_DICT_PATH,'wt') as fout: header='mapping_id,user_id' fout.write(header+'\n') object_quotation='"' for mapping_id in mapping_id_user_dict.keys(): line=mapping_id+','+object_quotation+str(mapping_id_user_dict[mapping_id])+object_quotation fout.write(line+'\n') sys.exit() #do plotting from result if args['plot']: plot.set_score_space_dict() plot.test_plot() sys.exit()
def select_axis(self, mapping_id: str, training_data_t: pd.DataFrame,
                training_data_f: pd.DataFrame, all_items: pd.DataFrame,
                axis: List[str]):
    """Rank the score types by KL divergence and choose the axes to use.

    1. When ``mapping_id`` differs from the stored one (or none is stored),
       the all-items marginals are reloaded from their pickle
       (``share.REUSE_PICKLE`` set; exits if the file is missing) or fitted
       per score type and pickled.
    2. For every score type the user marginal is fitted on
       ``training_data_t`` and its KL divergence against the all-items
       marginal is stored in ``self.kl_dict`` (``log1p``-transformed when
       ``self.attn == share.ATTN_INF``).
    3. ``bound1``/``bound2`` are ``med -/+ const_a * madn`` of those values,
       with ``madn = MAD / 0.675``.
    4. Sets ``self.prod_axis`` (above ``bound2``), ``self.tlr_axis``
       (depends on ``self.tlr`` / ``self.tlr_limit``, discrete score types
       removed), ``self.score_type_list`` (above ``bound1``),
       ``self.reduced_axis`` (the rest), ``self.bound_dict``, ``self.med``
       and ``self.madn``.

    ``training_data_f`` is accepted but not used here.
    """
    cache_is_current = self.mapping_id and self.mapping_id == mapping_id
    if not cache_is_current:
        # self.mapping_id is unset or changed: reset the all-items marginals
        self.mapping_id = mapping_id
        print(mapping_id)
        all_items_marg_path = (self.dest_dict['all_items_marg_dict'] + '/' +
                               mapping_id)
        if not share.REUSE_PICKLE:
            # not yet modeled: model and serialize
            util.init_file(all_items_marg_path)
            self.all_items_marg_dict = {}
            for score_type in axis:
                all_marg = marginal.factory_marg(
                    marg_name=self.marg_name, marg_option=self.marg_option)
                all_marg.set_param(training_data=all_items[score_type].values,
                                   score_type=score_type)
                self.all_items_marg_dict[score_type] = all_marg
            with open(all_items_marg_path, 'wb') as fout:
                pickle.dump(self.all_items_marg_dict, fout)
        elif os.path.isfile(all_items_marg_path):
            # already modeled: deserialize
            with open(all_items_marg_path, 'rb') as fin:
                self.all_items_marg_dict = pickle.load(fin)
        else:
            # reuse requested but nothing has been modeled yet: exit
            sys.stderr.write('file ' + all_items_marg_path +
                             ' not found.retry command+=i_reuse_pickle\n')
            sys.exit(share.ERROR_STATUS)

    # KL divergence between the population and this user, per score type
    self.kl_dict = {}
    for score_type in axis:
        self.marg_model.set_param(
            training_data=training_data_t[score_type].values,
            score_type=score_type)
        self.kl_dict[score_type] = (
            util.kl_divergence_between_population_and_users(
                all_marg=self.all_items_marg_dict[score_type],
                attn=self.attn,
                score_type=score_type,
                user_marg=self.marg_model))
    if self.attn == share.ATTN_INF:
        self.kl_dict = {k: np.log1p(v) for k, v in self.kl_dict.items()}

    # robust location / scale of the KL values and the resulting bounds
    kl_values = np.array(list(self.kl_dict.values()))
    med = statistics.median(kl_values)
    mad = statistics.median([abs(x - med) for x in kl_values])
    madn = mad / 0.675
    bound_dict = OrderedDict([
        ('bound1', med - self.const_a * madn),
        ('bound2', med + self.const_a * madn),
    ])
    self.med, self.madn = med, madn

    # score types sorted by attention (largest KL first)
    axis = sorted(axis, key=lambda x: float(self.kl_dict[x]), reverse=True)
    self.prod_axis = [
        x for x in axis if self.kl_dict[x] > bound_dict['bound2']
    ]

    self.tlr_axis = []
    if self.tlr_limit:  # tlr valid
        if self.tlr == share.TLR_NUM_UPPER:
            # use prod_axis; if there is none, take the tlr_limit upper axes
            if self.prod_axis:
                self.tlr_axis = copy.deepcopy(self.prod_axis)
            else:
                self.tlr_axis = axis[0:int(self.tlr_limit)]
        elif self.tlr == share.TLR_OL:
            bound_dict['ol'] = med + madn * float(self.tlr_limit)
            self.tlr_axis = [
                x for x in axis if self.kl_dict[x] > bound_dict['ol']
            ]
        elif self.tlr == share.TLR_PROD:
            self.tlr_axis = list(self.prod_axis)
        elif self.tlr != share.I_TLR:
            sys.stderr.write('invalid trl string')
            sys.exit(share.ERROR_STATUS)

    # Iterates the share list, not self.tlr_axis, so removing is safe here.
    for score_type in share.DISC_SCORE_TYPE_LIST:
        if score_type in self.tlr_axis:
            print('removing disc_score')
            self.tlr_axis.remove(score_type)

    # renew score_type_list
    lower_bound = bound_dict['bound1']
    self.score_type_list = [
        x for x in axis if self.kl_dict[x] > lower_bound
    ]
    self.reduced_axis = [
        x for x in axis if not self.kl_dict[x] > lower_bound
    ]
    self.bound_dict = bound_dict
    if len(self.score_type_list) == 1:
        self.score_type_list = axis

    print('prod ' + str(self.prod_axis))
    print('no_reduced' + str(self.score_type_list))
    print('tlr_axis' + str(self.tlr_axis))
def do_measure(model: models.ScoreModel, group: List[int]):
    """Train, rank and evaluate ``model`` for every user in ``group``.

    For each user the k-folded training/test data is loaded from
    ``share.TRAIN_DATA_TOP/user<id>_kfolded.json``.  For every fold the model
    is trained, its log written, and each ranking it returns (training items
    dropped) is logged and scored with iP / MAiP / nDCG / P plus the ad-hoc
    label task.  The per-method sums are divided by ``share.TRAIN_SIZE`` and
    written as a one-row CSV per method; the collected labels are written
    with ``adhoc_testing_task``.

    Args:
        model: model to evaluate.
        group: user ids to evaluate.
    """
    model_remapping = model.get_remapping()
    banner = '###########################################'
    for user_id in group:
        print(banner)
        print('user = ' + str(user_id))
        print(banner)
        respective_method_measures_dict = {}

        user_k_folded_path = (share.TRAIN_DATA_TOP + '/user' + str(user_id) +
                              '_kfolded.json')
        with open(user_k_folded_path, 'rt') as fin:
            # load train_and_test_data
            kfolded_training_and_test_data_list = json.load(fin)

        if model_remapping:
            remapping, score_mapping_dict, mapping_id = (
                util.get_score_mapping_param(user_id))
        else:
            # remapping invalid for group users
            # NOTE(review): mapping_id is not assigned on this path although
            # model.train() below receives it - confirm which mapping id a
            # non-remapping model should use.
            remapping = False

        if remapping:
            # differs from the default mapping: convert a private copy
            all_items = copy.deepcopy(share.ALL_ITEMS)
            util.convert_score(all_items, score_mapping_dict)
        else:
            all_items = share.ALL_ITEMS  # shared, not copied

        # train and test once per fold
        for train_id, training_and_test_data in enumerate(
                kfolded_training_and_test_data_list):
            training_hotel_list = training_and_test_data['trainingTrue']
            training_false_hotel_list = training_and_test_data[
                'trainingFalse']
            test_hotel_list = training_and_test_data['testTrue']

            model.train(
                training_data_t=pd.DataFrame.from_records(training_hotel_list),
                training_data_f=pd.DataFrame.from_records(
                    training_false_hotel_list),
                all_items=all_items,
                mapping_id=mapping_id,
                train_id=train_id,
                user_id=user_id)
            # log parameter of model.train()
            model.make_log()
            ranking_dict = model.calc_ranking(all_items=all_items)

            test_hotel_id_list = [hotel['id'] for hotel in test_hotel_list]
            training_hotel_id_list = [
                hotel['id'] for hotel in training_hotel_list
            ]
            training_false_hotel_id_list = [
                hotel['id'] for hotel in training_false_hotel_list
            ]

            for method, ranking in ranking_dict.items():
                # training items must not appear in the evaluated ranking
                ranking = ranking.drop(training_hotel_id_list)
                ranking = ranking.drop(training_false_hotel_id_list)
                print(method + '\n')
                print(ranking)
                dest = util.get_result_path(dir_name=share.RANKING_TOP + '/' +
                                            model.get_dir_name(),
                                            method=method,
                                            user_id=user_id,
                                            train_id=train_id)
                util.log_ranking(all_items=all_items,
                                 ranking=ranking,
                                 path=dest,
                                 score_type_list=model.get_score_type_list(),
                                 test_id_list=test_hotel_id_list)

                if method not in respective_method_measures_dict:
                    # first fold for this method: zeroed accumulators
                    temp = {}
                    for measure_type in share.MEASURE_TYPE_LIST:
                        temp[measure_type] = [.0] * (
                            share.MEASURE_TYPE_MEASURE_DICT[measure_type].
                            shape[0])
                    for label_type in share.LABEL_TYPE_LIST:
                        temp[label_type] = []
                    respective_method_measures_dict[method] = temp
                measures = respective_method_measures_dict[method]

                ips = ip(ranking, test_hotel_id_list)
                for i in range(share.MEASURE_TYPE_MEASURE_DICT['iP'].shape[0]):
                    measures['iP'][i] += ips[i]
                # presumably the entry after the 11 iP points is MAiP -
                # verify against ip()
                measures['MAiP'][0] += ips[11]
                for i in range(
                        share.MEASURE_TYPE_MEASURE_DICT['nDCG'].shape[0]):
                    measures['nDCG'][i] += n_dcg(ranking, 5 * (i + 1),
                                                 test_hotel_id_list)
                for i in range(share.MEASURE_TYPE_MEASURE_DICT['P'].shape[0]):
                    measures['P'][i] += precision(ranking, 5 * (i + 1),
                                                  test_hotel_id_list)
                for i, label_type in enumerate(share.LABEL_TYPE_LIST):
                    measures[label_type].extend(
                        adhoc_task(ranking, 10 * (i + 1), test_hotel_id_list))

        # write the fold-averaged measures, one file per method
        for method, respective_measures in (
                respective_method_measures_dict.items()):
            file_name = util.get_result_path(dir_name=share.RESULT_TOP + '/' +
                                             model.get_dir_name(),
                                             method=method,
                                             user_id=user_id)
            util.init_file(file_name)
            with open(file_name, 'wt') as fout:
                header = 'file,user'
                line = file_name + ',user' + str(user_id)
                for measure_type in share.MEASURE_TYPE_LIST:
                    for item, measure in enumerate(
                            share.MEASURE_TYPE_MEASURE_DICT[measure_type]):
                        header += ',' + measure
                        line += ',' + str(
                            respective_measures[measure_type][item] /
                            share.TRAIN_SIZE)
                header += '\n'
                line += '\n'
                fout.write(header + line)

            for label_type in share.LABEL_TYPE_LIST:
                label_file_name = util.get_result_path(
                    dir_name=share.LABEL_TOP + '/' + label_type + '/' +
                    model.get_dir_name(),
                    method=method,
                    user_id=user_id)
                adhoc_testing_task(label_file_name,
                                   respective_measures[label_type])
def adhoc_testing_task(file_name: str, labels: List[bool]):
    """Write ``labels`` to ``file_name`` as one comma-separated line of 1/0.

    Truthy labels become ``'1'``, falsy ones ``'0'``; the line ends with a
    newline.
    """
    encoded = ",".join('1' if lbl else '0' for lbl in labels)
    util.init_file(file_name)
    with open(file_name, 'wt') as fout:
        fout.write(encoded)
        fout.write('\n')