def plot_similarity_array(self):
    """
    Plot the similarity array of the redundancy method with a dendrogram.

    The similarity array and its op_ids are read from
    self.workflow.redundancy_method and passed to fiplt.plot_arr_dendrogram
    together with the operation names and a (2, n_operations) array of
    measures:
        row 0 : number of unmasked entries of self.workflow.stats_good_op
                in the column of each operation
        row 1 : z-scored value (as returned by fap.normalise_masked_array)
                of self.workflow.stats_good_op_comb for each operation
    """
    workflow = self.workflow
    similarity_array = workflow.redundancy_method.similarity_array

    # -- names of the operations contained in the similarity array
    id_name_map = self.map_op_id_name_mult_task(workflow.tasks)
    op_names = hlp.ind_map_subset(
        id_name_map[0],
        id_name_map[1],
        workflow.redundancy_method.similarity_array_op_ids)

    measures = np.zeros((2, len(op_names)))
    # -- index of every similarity array op_id with respect to good_op_ids
    good_op_pos = hlp.ismember(
        workflow.redundancy_method.similarity_array_op_ids,
        workflow.good_op_ids)

    # -- number of problems for which each good performing feature has been calculated
    is_calculated = ~workflow.stats_good_op[:, good_op_pos].mask
    measures[0, :] = is_calculated.sum(axis=0)

    # -- z scored u-stat (for all features) for top features
    normalised = fap.normalise_masked_array(
        workflow.stats_good_op_comb, axis=0, norm_type='zscore')
    z_scores = normalised[0]
    measures[1, :] = z_scores[good_op_pos]

    fiplt.plot_arr_dendrogram(similarity_array,
                              op_names,
                              max_dist_cluster=self.max_dist_cluster,
                              measures=measures)
def reduce_to_good_perf_ops(self, ops_base_vals, good_perf_op_ids, good_op_ids):
    """
    Keep only those columns of ops_base_vals that belong to the op_ids
    listed in good_perf_op_ids.

    Parameters:
    -----------
    ops_base_vals : ndarray
        Array containing the values on which the similarity of the
        operations will be calculated
    good_perf_op_ids : ndarray
        The op_ids of the features we are interested in
    good_op_ids : ndarray
        The op_ids of the columns in ops_base_vals
    Returns:
    --------
    ops_base_perf_vals : ndarray
        ops_base_vals reduced to the columns selected via
        hlp.ismember(good_perf_op_ids, good_op_ids), i.e. the operations
        given in good_perf_op_ids with the same ordering.
    """
    # -- column index in ops_base_vals for every requested op_id
    column_ind = hlp.ismember(good_perf_op_ids, good_op_ids)
    return ops_base_vals[:, column_ind]
def plot_similarity_array(self):
    """
    Plot the similarity array of the redundancy method with a dendrogram.

    The similarity array and its op_ids are read from
    self.workflow.redundancy_method and passed to fiplt.plot_arr_dendrogram
    together with the operation names and a (2, n_operations) array of
    measures:
        row 0 : number of unmasked entries of self.workflow.stats_good_op
                in the column of each operation
        row 1 : z-scored value (as returned by fap.normalise_masked_array)
                of self.workflow.stats_good_op_comb for each operation
    """
    workflow = self.workflow
    similarity_array = workflow.redundancy_method.similarity_array

    # -- names of the operations contained in the similarity array
    id_name_map = self.map_op_id_name_mult_task(workflow.tasks)
    op_names = hlp.ind_map_subset(
        id_name_map[0],
        id_name_map[1],
        workflow.redundancy_method.similarity_array_op_ids)

    measures = np.zeros((2, len(op_names)))
    # -- index of every similarity array op_id with respect to good_op_ids
    good_op_pos = hlp.ismember(
        workflow.redundancy_method.similarity_array_op_ids,
        workflow.good_op_ids)

    # -- number of problems for which each good performing feature has been calculated
    is_calculated = ~workflow.stats_good_op[:, good_op_pos].mask
    measures[0, :] = is_calculated.sum(axis=0)

    # -- z scored u-stat (for all features) for top features
    normalised = fap.normalise_masked_array(
        workflow.stats_good_op_comb, axis=0, norm_type='zscore')
    z_scores = normalised[0]
    measures[1, :] = z_scores[good_op_pos]

    fiplt.plot_arr_dendrogram(similarity_array,
                              op_names,
                              max_dist_cluster=self.max_dist_cluster,
                              measures=measures)
def reduce_to_good_perf_ops(self, ops_base_vals, good_perf_op_ids, good_op_ids):
    """
    Keep only those columns of ops_base_vals that belong to the op_ids
    listed in good_perf_op_ids.

    Parameters:
    -----------
    ops_base_vals : ndarray
        Array containing the values on which the similarity of the
        operations will be calculated
    good_perf_op_ids : ndarray
        The op_ids of the features we are interested in
    good_op_ids : ndarray
        The op_ids of the columns in ops_base_vals
    Returns:
    --------
    ops_base_perf_vals : ndarray
        ops_base_vals reduced to the columns selected via
        hlp.ismember(good_perf_op_ids, good_op_ids), i.e. the operations
        given in good_perf_op_ids with the same ordering.
    """
    # -- column index in ops_base_vals for every requested op_id
    column_ind = hlp.ismember(good_perf_op_ids, good_op_ids)
    return ops_base_vals[:, column_ind]
def collect_stats_good_op_ids(self):
    """
    Gather the combined stats of every task in self.tasks, restricted to the
    operations in self.good_op_ids, and store them in self.stats_good_op.

    For each task a NaN-filled buffer with one row per good operation is
    filled from task.tot_stats, transposed back (column equals feature) and
    stacked vertically with the buffers of the other tasks. Entries that
    stay NaN are masked in the resulting masked array.
    """
    n_good_ops = self.good_op_ids.shape[0]
    per_task_stats = []
    for task in self.tasks:
        # -- buffer is kept transposed (row corresponds to feature) so tasks with
        # -- one and two dimensional tot_stats can be filled the same way
        # NOTE(review): for 2-D tot_stats the buffer's second dimension is
        # tot_stats.shape[0] while rows are read as tot_stats[it] -- confirm
        # the expected layout of task.tot_stats.
        if task.tot_stats.ndim > 1:
            buffer_shape = (n_good_ops, task.tot_stats.shape[0])
        else:
            buffer_shape = n_good_ops
        task_stats_t = np.empty(buffer_shape)
        task_stats_t.fill(np.NaN)

        good_positions = hlp.ismember(task.op_ids,
                                      self.good_op_ids,
                                      is_return_masked_array=True,
                                      return_dtype=int)
        # -- task_pos is the position in task.op_ids, good_pos the position in self.good_op_ids
        for task_pos, good_pos in enumerate(good_positions):
            # -- a masked entry means the operation is not in self.good_op_ids
            if good_pos is np.ma.masked:
                continue
            task_stats_t[good_pos] = task.tot_stats[task_pos].T

        # -- back to the usual ordering: column equals feature
        per_task_stats.append(task_stats_t.T)

    stacked = np.vstack(per_task_stats)
    self.stats_good_op = np.ma.masked_invalid(stacked)
def cat_data_op_subset(file_paths, op_id_top, is_from_old_matlab=False, is_return_masked=True):
    """
    Concatenate the features where op_id is in op_id_top for all HCTSA_loc.mat
    files pointed to by file_paths.
    Warning, this can take a while and the returned data matrix can be very
    large.

    XXX WARNING XXX This only works correctly if all HCTSA_loc.mat files come
    from the same database. Meaning op_ids are the same. Otherwise one would
    have to go through operation names which is only a little more work to
    implement. XXX

    Parameters:
    -----------
    file_paths : list
        list of file paths pointing to the files containing the data
    op_id_top : list,ndarray
        list of operation ids wanted in the concatenated data array
    is_from_old_matlab : bool
        If the HCTSA_loc.mat files are saved from an older version of the
        comp engine. The order of entries is different.
    is_return_masked : boolean
        Saving large masked arrays to disk can lead to memory errors while
        pickling. If this is False the function returns a normal ndarray in
        which unknown entries are set to NaN. This can be converted to a
        masked array with data_all = np.ma.masked_invalid(data_all)
    Returns:
    --------
    data_all : ndarray/masked ndarray
        Concatenated data array (rows of all files stacked vertically), or
        None if file_paths is empty.
    """
    n_op_top = np.array(op_id_top).shape[0]
    # -- collect the per-file arrays and stack them once at the end instead
    # -- of re-copying the growing matrix for every file
    data_parts = []
    for file_path in file_paths:
        # -- single-argument parenthesised form prints identically under Python 2 and 3
        print("Adding data from {:s} \n to complete data matrix".format(file_path))
        data, op = mIO.read_from_mat_file(file_path,
                                          ['TS_DataMat', 'Operations'],
                                          is_from_old_matlab=is_from_old_matlab)
        # -- find the indices in the data for op_id_top
        ind = hlp.ismember(op['id'], op_id_top, is_return_masked_array=True, return_dtype=int)
        # -- if any of the operations was not calculated for this problem
        # -- create an array and copy only valid data; invalid data stays NaN
        # NOTE(review): this compares the index array with the list of op_ids.
        # For equal-length inputs NumPy evaluates `!=` element-wise and the
        # truth test of a multi-element array raises ValueError. Left
        # unchanged because the semantics of hlp.ismember are not visible
        # here -- confirm the intended condition.
        if ind.data != op_id_top:
            # -- create an array filled with NaN.
            # -- This makes later masking of non-existent entries easier
            # -- each column of data_ma corresponds to the op_id in op_id_top with the
            # -- same index (column i in data_ma corresponds to op_id_top[i])
            data_ma = np.empty((data.shape[0], n_op_top))
            data_ma[:] = np.nan
            for it, i in enumerate(ind):
                # -- if i is masked in ind that means that the current operation in data
                # -- is not part of op_id_top. We therefore do not need this operation to
                # -- be included in data_ma.
                if i is not np.ma.masked:
                    data_ma[:, i] = data[:, it]
        # -- otherwise pick all relevant features and also automatically sort them correctly (if necessary)
        else:
            data_ma = np.array(data[:, ind])
        data_parts.append(data_ma)

    # -- nothing to concatenate: keep the historical None result instead of
    # -- handing None to np.ma.masked_invalid
    if not data_parts:
        return None

    if len(data_parts) == 1:
        data_all = data_parts[0]
    else:
        data_all = np.vstack(data_parts)

    # -- Saving a large masked array to disk can lead to Memory errors while using the pickle module.
    if is_return_masked:
        data_all = np.ma.masked_invalid(data_all)
    return data_all
def plot_stat_array(self):
    """
    Plot the classification stat array of the good performing operations
    together with a dendrogram and three panels of per-task measures.

    Layout (all axes are added to one 15 x 15 figure):
        - central matshow of self.ops_base_perf_vals (transposed, so rows of
          the image are operations and columns are tasks), rows ordered by
          the dendrogram leaves, columns ordered by the mean value per task
        - dendrogram computed from self.linkage to the right, with a dashed
          line at self.max_dist_cluster
        - three measure panels above the array: number of classes / average
          samples, minimum classification stat, mean classification stat /
          number of calculated features

    Side effect: self.ops_base_perf_vals is re-assigned with its columns
    re-ordered by hlp.ismember(similarity_array_op_ids, good_perf_op_ids).
    NOTE(review): because the attribute is overwritten in place, a second
    call would apply that re-ordering again -- confirm this method is only
    meant to be called once per instance.
    """
    fig = plt.figure(figsize = ((15,15)))
    # -- plot layout ------------------------------------------------------
    # -- rectangles are passed unchanged to fig.add_axes
    rect_ustat_arr = [0.25,0.175,.5,.5]
    rect_dendr = [0.755,0.175,.145,.5]
    rect_measures0 = [0.25,0.68,0.5,0.1]
    rect_measures1 = [0.25,0.785,0.5,0.1]
    rect_measures2 = [0.25,0.89,0.5,0.1]
    ax_ustat_arr = fig.add_axes(rect_ustat_arr)
    ax_dendr = fig.add_axes(rect_dendr)
    ax_measures00 = fig.add_axes(rect_measures0)
    # -- second y-axis sharing the x-axis of the first measure panel
    ax_measures01 = plt.twinx(ax_measures00)
    ax_measures10 = fig.add_axes(rect_measures1)
    ax_measures10.set_xticklabels([])
    ax_measures20 = fig.add_axes(rect_measures2)
    ax_measures20.set_xticklabels([])
    ax_measures21 = plt.twinx(ax_measures20)
    # -- calculate and plot the dendrogram
    # -- first call (no_plot=True) only yields the full leaf order, the second
    # -- call draws a truncated (lastp, p=50) dendrogram into ax_dendr
    dist_dendrogram = hierarchy.dendrogram(self.linkage, orientation='left',no_plot=True)
    hierarchy.dendrogram(self.linkage, orientation='left',p=50,truncate_mode ='lastp',ax = ax_dendr)
    ax_dendr.set_yticks([])
    ax_dendr.axvline(self.max_dist_cluster,ls='--',c='k')
    # -- plot sorted classification stat array ------------------------------------------
    # -- create index that sort rows to correspond to dendrogram
    feat_sort_ind = dist_dendrogram['leaves']
    # -- sort the good performant features so they have the same order as the similarity array
    sort_ind = hlp.ismember(self.workflow.redundancy_method.similarity_array_op_ids,self.workflow.redundancy_method.good_perf_op_ids)
    self.ops_base_perf_vals = self.ops_base_perf_vals[:,sort_ind]
    # -- create index that sort columns with respect to their mean value
    task_sort_ind = np.argsort(self.ops_base_perf_vals[:,feat_sort_ind].mean(axis=1))
    #all_classes_avg_top = ((all_classes_avg_top - np.ma.mean(all_classes_avg_top,axis=0)) / np.ma.std(all_classes_avg_top,axis=0))
    #all_classes_avg_top = fap.normalise_masked_array(self.ops_base_perf_vals, axis= 1,norm_type = 'zscore')[0]
    all_classes_avg_top = self.ops_base_perf_vals
    # -- plot the operation names as y-axis tick labels
    # -- float() forces true division of the two shape entries
    aspect = all_classes_avg_top.shape[0] / float(all_classes_avg_top.shape[1])
    ax_ustat_arr.matshow(all_classes_avg_top[task_sort_ind,:][:,feat_sort_ind].T,aspect=aspect,origin='bottom')
    ax_ustat_arr.set_yticks(range(len(feat_sort_ind)))
    op_id_name_map = self.map_op_id_name_mult_task(self.workflow.tasks)
    names = hlp.ind_map_subset(op_id_name_map[0],op_id_name_map[1],self.workflow.redundancy_method.similarity_array_op_ids)
    ax_ustat_arr.set_yticklabels(np.array(names)[feat_sort_ind])
    # -- plot the problem names as x axis labels
    ax_ustat_arr.xaxis.tick_bottom()
    ax_ustat_arr.set_xticks(range(all_classes_avg_top.shape[0]))
    ax_ustat_arr.set_xticklabels(self.task_names[task_sort_ind],rotation='vertical')
    # -- plot clusters ----------------------------------
    # -- a white line is drawn wherever the cluster index changes between
    # -- neighbouring (dendrogram-sorted) operations
    cluster_bounds = np.nonzero(np.diff(self.workflow.redundancy_method.cluster_inds[feat_sort_ind]))[0]+0.5
    for cluster_bound in cluster_bounds:
        ax_ustat_arr.axhline(cluster_bound,c='w',lw=2)
    # --------------------------------------------------------------------------------
    # -- calculate and plot measures -------------------------------------------------
    # --------------------------------------------------------------------------------
    # -- nr samples and nr labels --------------------------------------------------
    n_samples_avg = [np.array(self.workflow.tasks[i].ts['n_samples']).mean() for i in task_sort_ind]
    n_classes = [len(set(self.workflow.tasks[i].labels)) for i in task_sort_ind]
    # -- one x position per task, shifted by 0.5
    x_loc = np.arange(0,len(self.workflow.tasks))+0.5
    ax_measures00.scatter(x_loc,n_classes,c='b',s=40)
    ax_measures00.plot(x_loc,n_classes,c='b')
    # -- list comprehension is used only for its side effect (colouring the tick labels)
    [label.set_color('b') for label in ax_measures00.get_yticklabels()]
    ax_measures00.set_ylabel('nr classes')
    ax_measures00.yaxis.label.set_color('b')
    ax_measures00.set_ylim([0,max(n_classes)+1])
    ax_measures00.set_xticklabels([])
    ax_measures01.scatter(x_loc,n_samples_avg,c='r',s=40)
    ax_measures01.plot(x_loc,n_samples_avg,c='r')
    [label.set_color('r') for label in ax_measures01.get_yticklabels()]
    ax_measures01.set_ylabel('avg samples')
    ax_measures01.yaxis.label.set_color('r')
    ax_measures01.set_ylim([0,max(n_samples_avg)+100])
    ax_measures00.set_xlim([0,len(self.workflow.tasks)])
    ax_measures00.set_xticklabels([])
    # -- Classification stat measures --------------------------------------------------
    # -- minimum average classification stat for all features
    ax_measures10.plot(x_loc,np.min(self.workflow.stats_good_op[task_sort_ind,:],axis=1),marker='o',label='min. avg. classification stat all')
    # -- minimum average classification stat for top features
    ax_measures10.plot(x_loc,np.ma.min(self.ops_base_perf_vals[task_sort_ind,:],axis=1),marker='o',label='min. avg. classification stat top')
    # -- average minimum (for each class pair) classification stat for top features
    # XXX This would require task.pair_stats to be available (not saved as intermediate at the moment); then it is trivial to implement
    ax_measures10.legend(loc=2,fontsize='small',labelspacing=.1)
    ax_measures10.set_ylabel('classification stat')
    ax_measures10.set_xlim([0,len(self.workflow.tasks)])
    ax_measures10.set_ylim([0,0.5])
    # -- Classification stat measures and avg operations working--------------------------------------------------
    # -- mean average classification stat for all features
    ax_measures20.plot(x_loc,np.ma.mean(self.workflow.stats_good_op[task_sort_ind,:],axis=1),marker='o')
    [label.set_color('b') for label in ax_measures20.get_yticklabels()]
    ax_measures20.set_ylabel('avrg classification stat all feat')
    ax_measures20.yaxis.label.set_color('b')
    # -- number of successfully calculated features
    ax_measures21.plot(x_loc,[len(self.workflow.tasks[i].op['id']) for i in task_sort_ind],c='r',marker='o')
    [label.set_color('r') for label in ax_measures21.get_yticklabels()]
    ax_measures21.set_ylabel('nr calc feat')
    ax_measures21.yaxis.label.set_color('r')
    ax_measures20.set_xlim([0,len(self.workflow.tasks)])
def plot_stat_array(self):
    """
    Plot the U-stat array of the good performing operations together with a
    dendrogram and three panels of per-task measures.

    Layout (all axes are added to one 15 x 15 figure):
        - central matshow of self.ops_base_perf_vals (transposed, so rows of
          the image are operations and columns are tasks), rows ordered by
          the dendrogram leaves, columns ordered by the mean value per task
        - dendrogram computed from self.linkage to the right, with a dashed
          line at self.max_dist_cluster
        - three measure panels above the array: number of classes / average
          samples, minimum U-score, mean U-score / number of calculated
          features

    Side effect: self.ops_base_perf_vals is re-assigned with its columns
    re-ordered by hlp.ismember(similarity_array_op_ids, good_perf_op_ids).
    NOTE(review): because the attribute is overwritten in place, a second
    call would apply that re-ordering again -- confirm this method is only
    meant to be called once per instance.
    """
    fig = plt.figure(figsize = ((15,15)))
    # -- plot layout ------------------------------------------------------
    # -- rectangles are passed unchanged to fig.add_axes
    rect_ustat_arr = [0.25,0.175,.5,.5]
    rect_dendr = [0.755,0.175,.145,.5]
    rect_measures0 = [0.25,0.68,0.5,0.1]
    rect_measures1 = [0.25,0.785,0.5,0.1]
    rect_measures2 = [0.25,0.89,0.5,0.1]
    ax_ustat_arr = fig.add_axes(rect_ustat_arr)
    ax_dendr = fig.add_axes(rect_dendr)
    ax_measures00 = fig.add_axes(rect_measures0)
    # -- second y-axis sharing the x-axis of the first measure panel
    ax_measures01 = plt.twinx(ax_measures00)
    ax_measures10 = fig.add_axes(rect_measures1)
    ax_measures10.set_xticklabels([])
    ax_measures20 = fig.add_axes(rect_measures2)
    ax_measures20.set_xticklabels([])
    ax_measures21 = plt.twinx(ax_measures20)
    # -- calculate and plot the dendrogram
    # -- first call (no_plot=True) only yields the full leaf order, the second
    # -- call draws a truncated (lastp, p=50) dendrogram into ax_dendr
    dist_dendrogram = hierarchy.dendrogram(self.linkage, orientation='left',no_plot=True)
    hierarchy.dendrogram(self.linkage, orientation='left',p=50,truncate_mode ='lastp',ax = ax_dendr)
    ax_dendr.set_yticks([])
    ax_dendr.axvline(self.max_dist_cluster,ls='--',c='k')
    # -- plot sorted U-Stat array ------------------------------------------
    # -- create index that sort rows to correspond to dendrogram
    feat_sort_ind = dist_dendrogram['leaves']
    # -- sort the good performant features so they have the same order as the similarity array
    sort_ind = hlp.ismember(self.workflow.redundancy_method.similarity_array_op_ids,self.workflow.redundancy_method.good_perf_op_ids)
    self.ops_base_perf_vals = self.ops_base_perf_vals[:,sort_ind]
    # -- create index that sort columns with respect to their mean value
    task_sort_ind = np.argsort(self.ops_base_perf_vals[:,feat_sort_ind].mean(axis=1))
    #all_classes_avg_top = ((all_classes_avg_top - np.ma.mean(all_classes_avg_top,axis=0)) / np.ma.std(all_classes_avg_top,axis=0))
    #all_classes_avg_top = fap.normalise_masked_array(self.ops_base_perf_vals, axis= 1,norm_type = 'zscore')[0]
    all_classes_avg_top = self.ops_base_perf_vals
    # -- plot the operation names as y-axis tick labels
    # -- float() forces true division of the two shape entries
    aspect = all_classes_avg_top.shape[0] / float(all_classes_avg_top.shape[1])
    ax_ustat_arr.matshow(all_classes_avg_top[task_sort_ind,:][:,feat_sort_ind].T,aspect=aspect,origin='bottom')
    ax_ustat_arr.set_yticks(range(len(feat_sort_ind)))
    op_id_name_map = self.map_op_id_name_mult_task(self.workflow.tasks)
    names = hlp.ind_map_subset(op_id_name_map[0],op_id_name_map[1],self.workflow.redundancy_method.similarity_array_op_ids)
    ax_ustat_arr.set_yticklabels(np.array(names)[feat_sort_ind])
    # -- plot the problem names as x axis labels
    ax_ustat_arr.xaxis.tick_bottom()
    ax_ustat_arr.set_xticks(range(all_classes_avg_top.shape[0]))
    ax_ustat_arr.set_xticklabels(self.task_names[task_sort_ind],rotation='vertical')
    # -- plot clusters ----------------------------------
    # -- a white line is drawn wherever the cluster index changes between
    # -- neighbouring (dendrogram-sorted) operations
    cluster_bounds = np.nonzero(np.diff(self.workflow.redundancy_method.cluster_inds[feat_sort_ind]))[0]+0.5
    for cluster_bound in cluster_bounds:
        ax_ustat_arr.axhline(cluster_bound,c='w',lw=2)
    # --------------------------------------------------------------------------------
    # -- calculate and plot measures -------------------------------------------------
    # --------------------------------------------------------------------------------
    # -- nr samples and nr labels --------------------------------------------------
    n_samples_avg = [np.array(self.workflow.tasks[i].ts['n_samples']).mean() for i in task_sort_ind]
    n_classes = [len(set(self.workflow.tasks[i].labels)) for i in task_sort_ind]
    # -- one x position per task, shifted by 0.5
    x_loc = np.arange(0,len(self.workflow.tasks))+0.5
    ax_measures00.scatter(x_loc,n_classes,c='b',s=40)
    ax_measures00.plot(x_loc,n_classes,c='b')
    # -- list comprehension is used only for its side effect (colouring the tick labels)
    [label.set_color('b') for label in ax_measures00.get_yticklabels()]
    ax_measures00.set_ylabel('nr classes')
    ax_measures00.yaxis.label.set_color('b')
    ax_measures00.set_ylim([0,max(n_classes)+1])
    ax_measures00.set_xticklabels([])
    ax_measures01.scatter(x_loc,n_samples_avg,c='r',s=40)
    ax_measures01.plot(x_loc,n_samples_avg,c='r')
    [label.set_color('r') for label in ax_measures01.get_yticklabels()]
    ax_measures01.set_ylabel('avg samples')
    ax_measures01.yaxis.label.set_color('r')
    ax_measures01.set_ylim([0,max(n_samples_avg)+100])
    ax_measures00.set_xlim([0,len(self.workflow.tasks)])
    ax_measures00.set_xticklabels([])
    # -- U-stat measures --------------------------------------------------
    # -- minimum average U-score for all features
    ax_measures10.plot(x_loc,np.min(self.workflow.stats_good_op[task_sort_ind,:],axis=1),marker='o',label='min. avg. U-score all')
    # -- minimum average U-score for top features
    ax_measures10.plot(x_loc,np.ma.min(self.ops_base_perf_vals[task_sort_ind,:],axis=1),marker='o',label='min. avg. U-score top')
    # -- average minimum (for each class pair) U-score for top features
    # XXX This would require task.pair_stats to be available (not saved as intermediate at the moment); then it is trivial to implement
    ax_measures10.legend(loc=2,fontsize='small',labelspacing=.1)
    ax_measures10.set_ylabel('u-score')
    ax_measures10.set_xlim([0,len(self.workflow.tasks)])
    ax_measures10.set_ylim([0,0.5])
    # -- U-stat measures and avg operations working--------------------------------------------------
    # -- mean average U-score for all features
    ax_measures20.plot(x_loc,np.ma.mean(self.workflow.stats_good_op[task_sort_ind,:],axis=1),marker='o')
    [label.set_color('b') for label in ax_measures20.get_yticklabels()]
    ax_measures20.set_ylabel('avrg u-scrore all feat')
    ax_measures20.yaxis.label.set_color('b')
    # -- number of successfully calculated features
    ax_measures21.plot(x_loc,[len(self.workflow.tasks[i].op['id']) for i in task_sort_ind],c='r',marker='o')
    [label.set_color('r') for label in ax_measures21.get_yticklabels()]
    ax_measures21.set_ylabel('nr calc feat')
    ax_measures21.yaxis.label.set_color('r')
    ax_measures20.set_xlim([0,len(self.workflow.tasks)])
def cat_data_op_subset(file_paths,op_id_top,is_from_old_matlab = False,is_return_masked = True):
    """
    Concatenate the features where op_id is in op_id_top for all HCTSA_loc.mat
    files pointed to by file_paths.
    Warning, this can take a while and the returned data matrix can be very
    large.

    XXX WARNING XXX This only works correctly if all HCTSA_loc.mat files come
    from the same database. Meaning op_ids are the same. Otherwise one would
    have to go through operation names which is only a little more work to
    implement. XXX

    Parameters:
    -----------
    file_paths : list
        list of file paths pointing to the files containing the data
    op_id_top : list,ndarray
        list of operation ids wanted in the concatenated data array
    is_from_old_matlab : bool
        If the HCTSA_loc.mat files are saved from an older version of the
        comp engine. The order of entries is different.
    is_return_masked : boolean
        Saving large masked arrays to disk can lead to memory errors while
        pickling. If this is False the function returns a normal ndarray in
        which unknown entries are set to NaN. This can be converted to a
        masked array with data_all = np.ma.masked_invalid(data_all)
    Returns:
    --------
    data_all : ndarray/masked ndarray
        Concatenated data array (rows of all files stacked vertically), or
        None if file_paths is empty.
    """
    n_op_top = np.array(op_id_top).shape[0]
    # -- collect the per-file arrays and stack them once at the end instead
    # -- of re-copying the growing matrix for every file
    data_parts = []
    for file_path in file_paths:
        # -- single-argument parenthesised form prints identically under Python 2 and 3
        print("Adding data from {:s} \n to complete data matrix".format(file_path))
        data, op = mIO.read_from_mat_file(file_path,
                                          ['TS_DataMat', 'Operations'],
                                          is_from_old_matlab=is_from_old_matlab)
        # -- find the indices in the data for op_id_top
        ind = hlp.ismember(op['id'], op_id_top, is_return_masked_array=True, return_dtype=int)
        # -- if any of the operations was not calculated for this problem
        # -- create an array and copy only valid data; invalid data stays NaN
        # NOTE(review): this compares the index array with the list of op_ids.
        # For equal-length inputs NumPy evaluates `!=` element-wise and the
        # truth test of a multi-element array raises ValueError. Left
        # unchanged because the semantics of hlp.ismember are not visible
        # here -- confirm the intended condition.
        if ind.data != op_id_top:
            # -- create an array filled with NaN.
            # -- This makes later masking of non-existent entries easier
            # -- each column of data_ma corresponds to the op_id in op_id_top with the
            # -- same index (column i in data_ma corresponds to op_id_top[i])
            data_ma = np.empty((data.shape[0], n_op_top))
            data_ma[:] = np.nan
            for it, i in enumerate(ind):
                # -- if i is masked in ind that means that the current operation in data
                # -- is not part of op_id_top. We therefore do not need this operation to
                # -- be included in data_ma.
                if i is not np.ma.masked:
                    data_ma[:, i] = data[:, it]
        # -- otherwise pick all relevant features and also automatically sort them correctly (if necessary)
        else:
            data_ma = np.array(data[:, ind])
        data_parts.append(data_ma)

    # -- nothing to concatenate: keep the historical None result instead of
    # -- handing None to np.ma.masked_invalid
    if not data_parts:
        return None

    if len(data_parts) == 1:
        data_all = data_parts[0]
    else:
        data_all = np.vstack(data_parts)

    # -- Saving a large masked array to disk can lead to Memory errors while using the pickle module.
    if is_return_masked:
        data_all = np.ma.masked_invalid(data_all)
    return data_all
# -- number of successfully calculated features ax_measures21.plot(x_loc[:-1], (~all_classes_avg_masked_sort.mask).sum(axis=1)[:-1], c="r", marker="o") [label.set_color("r") for label in ax_measures21.get_yticklabels()] ax_measures21.set_ylabel("nr calc feat") ax_measures21.yaxis.label.set_color("r") ax_measures20.set_xlim([0, problem_paths.shape[0]]) # -- Calculate the average min (for each label pair separately) score for every problem if False: avg_min_u_score = np.ones(problem_paths.shape[0]) * np.NaN ustat_paths = np.array(glob.glob(intermediate_data_root + "/*_ustat.npy")) reg_ex = re.compile("../data/(.*)_ustat.npy") ustat_names = np.array([reg_ex.match(ustat_path).group(1) for ustat_path in ustat_paths]) # -- sort ustat paths to match the problem_paths ustat_sort_ind = hlp.ismember(problem_names, ustat_names) ustat_paths = ustat_paths[ustat_sort_ind] ustat_names = ustat_names[ustat_sort_ind] for i, (ustat_path, mat_file_path) in enumerate(zip(ustat_paths, problem_paths)): ustat = np.load(ustat_path) # -- calculate the scaling factor for every label pairing of the current classification problem u_scale = testst.u_stat_norm_factor(mat_file_path, is_from_old_matlab="True") print ustat_path avg_min_u_score[i] = (np.min(ustat, axis=1) / u_scale).mean() np.save(avg_min_u_score_path, avg_min_u_score) else: avg_min_u_score = np.load(avg_min_u_score_path) # -- average minimum (for each class pair) U-score for top features ax_measures10.plot(x_loc, avg_min_u_score[porblem_sort_ind], marker="o", label="avg. min. U-score all") ax_measures10.legend(loc=2, fontsize="small", labelspacing=0.1)
def plot_stat_array(self): fig = plt.figure(figsize=((15, 15))) # -- plot layout ------------------------------------------------------ #rect_ustat_arr = [0.01,0.01,0.75,0.75] #[0.25,0.175,.5,.5] #rect_dendr = [0.76,0.01,.2175,.75] #[0.755,0.175,.145,.5] rect_ustat_arr = fig.add_axes([0.15, 0.2, 0.7, 0.8]) #rect_dendr = fig.add_axes([0.7, 0.1, 0.145, 0.8]) rect_dendr = fig.add_axes([0.15, 0.8, 0.873, 0.2]) '''rect_measures0 = [0.25,0.68,0.5,0.1] rect_measures1 = [0.25,0.785,0.5,0.1] rect_measures2 = [0.25,0.89,0.5,0.1]''' ax_ustat_arr = fig.add_axes(rect_ustat_arr) ax_dendr = fig.add_axes(rect_dendr) '''ax_measures00 = fig.add_axes(rect_measures0) ax_measures01 = plt.twinx(ax_measures00) ax_measures10 = fig.add_axes(rect_measures1) ax_measures10.set_xticklabels([]) ax_measures20 = fig.add_axes(rect_measures2) ax_measures20.set_xticklabels([]) ax_measures21 = plt.twinx(ax_measures20)''' # -- calculate and plot the dendrogram dist_dendrogram = hierarchy.dendrogram(self.linkage, orientation='top', no_plot=True) hierarchy.dendrogram(self.linkage, orientation='top', p=50, truncate_mode='lastp', ax=ax_dendr) ax_dendr.set_xticks([]) ax_dendr.axvline(self.max_dist_cluster, ls='--', c='k') # -- plot sorted classification stat array ------------------------------------------ # -- create index that sort rows to correspond to dendrogram feat_sort_ind = dist_dendrogram['leaves'] # -- sort the good performant features so they have the same order as the similarity array sort_ind = hlp.ismember( self.workflow.redundancy_method.similarity_array_op_ids, self.workflow.redundancy_method.good_perf_op_ids) self.ops_base_perf_vals = self.ops_base_perf_vals[:, sort_ind] # -- create index that sort columns with respect to their mean value task_sort_ind = np.argsort( self.ops_base_perf_vals[:, feat_sort_ind].mean(axis=1)) #all_classes_avg_top = ((all_classes_avg_top - np.ma.mean(all_classes_avg_top,axis=0)) / np.ma.std(all_classes_avg_top,axis=0)) #all_classes_avg_top = 
fap.normalise_masked_array(self.ops_base_perf_vals, axis= 1,norm_type = 'zscore')[0] all_classes_avg_top = self.ops_base_perf_vals # -- plot the operation names as y-axis tick labels aspect = all_classes_avg_top.shape[0] / float( all_classes_avg_top.shape[1]) im = ax_ustat_arr.matshow( all_classes_avg_top[task_sort_ind, :][:, feat_sort_ind], aspect=aspect, origin='bottom', cmap='turbo') ax_ustat_arr.set_yticks(range(all_classes_avg_top.shape[0])) op_id_name_map = self.map_op_id_name_mult_task(self.workflow.tasks) names = hlp.ind_map_subset( op_id_name_map[0], op_id_name_map[1], self.workflow.redundancy_method.similarity_array_op_ids) ax_ustat_arr.set_yticklabels(self.task_names[task_sort_ind]) # -- plot the problem names as x axis labels ax_ustat_arr.xaxis.tick_bottom() ax_ustat_arr.set_xticks(range(len(feat_sort_ind))) ax_ustat_arr.set_xticklabels(np.array(names)[feat_sort_ind], rotation='vertical') fig.colorbar(im) # -- plot clusters ---------------------------------- '''cluster_bounds = np.nonzero(np.diff(self.workflow.redundancy_method.cluster_inds[feat_sort_ind]))[0]+0.5
marker='o') [label.set_color('r') for label in ax_measures21.get_yticklabels()] ax_measures21.set_ylabel('nr calc feat') ax_measures21.yaxis.label.set_color('r') ax_measures20.set_xlim([0, problem_paths.shape[0]]) # -- Calculate the average min (for each label pair separately) score for every problem if False: avg_min_u_score = np.ones(problem_paths.shape[0]) * np.NaN ustat_paths = np.array(glob.glob(intermediate_data_root + '/*_ustat.npy')) reg_ex = re.compile('../data/(.*)_ustat.npy') ustat_names = np.array( [reg_ex.match(ustat_path).group(1) for ustat_path in ustat_paths]) # -- sort ustat paths to match the problem_paths ustat_sort_ind = hlp.ismember(problem_names, ustat_names) ustat_paths = ustat_paths[ustat_sort_ind] ustat_names = ustat_names[ustat_sort_ind] for i, (ustat_path, mat_file_path) in enumerate(zip(ustat_paths, problem_paths)): ustat = np.load(ustat_path) # -- calculate the scaling factor for every label pairing of the current classification problem u_scale = testst.u_stat_norm_factor(mat_file_path, is_from_old_matlab='True') print ustat_path avg_min_u_score[i] = (np.min(ustat, axis=1) / u_scale).mean() np.save(avg_min_u_score_path, avg_min_u_score) else: avg_min_u_score = np.load(avg_min_u_score_path) # -- average minimum (for each class pair) U-score for top features