def plot_similarity_array(self):
    """
    Plot the similarity array of the redundancy method as a dendrogram plot,
    annotated with two per-feature measures.

    The measures handed to fiplt.plot_arr_dendrogram are stored row-wise:
        row 0 : number of unmasked entries per feature in
                self.workflow.stats_good_op (i.e. number of problems for which
                the feature has been calculated)
        row 1 : z-scored combined stat (normalised over all good features) of
                each plotted feature
    """
    redundancy = self.workflow.redundancy_method
    sim_op_ids = redundancy.similarity_array_op_ids
    similarity = redundancy.similarity_array

    # -- translate the op ids of the similarity array into operation names
    id_name_map = self.map_op_id_name_mult_task(self.workflow.tasks)
    names = hlp.ind_map_subset(id_name_map[0], id_name_map[1], sim_op_ids)

    # -- positions of the plotted op ids inside self.workflow.good_op_ids
    tmp_ind = hlp.ismember(sim_op_ids, self.workflow.good_op_ids)

    # -- z scored stat, normalised over all good features
    z_scored = fap.normalise_masked_array(
        self.workflow.stats_good_op_comb, axis=0, norm_type='zscore')[0]

    measures = np.zeros((2, len(names)))
    # -- number of problems for which each good performing feature has been calculated
    measures[0, :] = (~self.workflow.stats_good_op[:, tmp_ind].mask).sum(axis=0)
    # -- z scored stat for the plotted features only
    measures[1, :] = z_scored[tmp_ind]

    fiplt.plot_arr_dendrogram(similarity,
                              names,
                              max_dist_cluster=self.max_dist_cluster,
                              measures=measures)
def plot_similarity_array(self):
    """
    Plot the similarity array of the redundancy method as a dendrogram plot,
    annotated with two per-feature measures.

    The measures handed to fiplt.plot_arr_dendrogram are stored row-wise:
        row 0 : number of unmasked entries per feature in
                self.workflow.stats_good_op (i.e. number of problems for which
                the feature has been calculated)
        row 1 : z-scored combined stat (normalised over all good features) of
                each plotted feature
    """
    redundancy = self.workflow.redundancy_method
    sim_op_ids = redundancy.similarity_array_op_ids
    similarity = redundancy.similarity_array

    # -- translate the op ids of the similarity array into operation names
    id_name_map = self.map_op_id_name_mult_task(self.workflow.tasks)
    names = hlp.ind_map_subset(id_name_map[0], id_name_map[1], sim_op_ids)

    # -- positions of the plotted op ids inside self.workflow.good_op_ids
    tmp_ind = hlp.ismember(sim_op_ids, self.workflow.good_op_ids)

    # -- z scored stat, normalised over all good features
    z_scored = fap.normalise_masked_array(
        self.workflow.stats_good_op_comb, axis=0, norm_type='zscore')[0]

    measures = np.zeros((2, len(names)))
    # -- number of problems for which each good performing feature has been calculated
    measures[0, :] = (~self.workflow.stats_good_op[:, tmp_ind].mask).sum(axis=0)
    # -- z scored stat for the plotted features only
    measures[1, :] = z_scored[tmp_ind]

    fiplt.plot_arr_dendrogram(similarity,
                              names,
                              max_dist_cluster=self.max_dist_cluster,
                              measures=measures)
def reduce_to_good_perf_ops(self, ops_base_vals, good_perf_op_ids, good_op_ids):
    """
    Select from ops_base_vals the columns belonging to good_perf_op_ids.

    Parameters:
    -----------
    ops_base_vals : nd array
        Array containing the values on which the similarity of the operations
        will be calculated. Its columns are indexed with the result of
        hlp.ismember(good_perf_op_ids, good_op_ids).
    good_perf_op_ids : ndarray
        The op_ids of the features we are interested in
    good_op_ids : ndarray
        The op_ids of the columns in ops_base_vals
    Returns:
    --------
    ops_base_perf_vals : ndarray
        ops_base_vals reduced to the columns picked for good_perf_op_ids,
        in the ordering of good_perf_op_ids.
    """
    # -- column positions (within good_op_ids) of the wanted op ids
    column_ind = hlp.ismember(good_perf_op_ids, good_op_ids)
    return ops_base_vals[:, column_ind]
def reduce_to_good_perf_ops(self, ops_base_vals, good_perf_op_ids, good_op_ids):
    """
    Select from ops_base_vals the columns belonging to good_perf_op_ids.

    Parameters:
    -----------
    ops_base_vals : nd array
        Array containing the values on which the similarity of the operations
        will be calculated. Its columns are indexed with the result of
        hlp.ismember(good_perf_op_ids, good_op_ids).
    good_perf_op_ids : ndarray
        The op_ids of the features we are interested in
    good_op_ids : ndarray
        The op_ids of the columns in ops_base_vals
    Returns:
    --------
    ops_base_perf_vals : ndarray
        ops_base_vals reduced to the columns picked for good_perf_op_ids,
        in the ordering of good_perf_op_ids.
    """
    # -- column positions (within good_op_ids) of the wanted op ids
    column_ind = hlp.ismember(good_perf_op_ids, good_op_ids)
    return ops_base_vals[:, column_ind]
def collect_stats_good_op_ids(self):
    """
    Collect the combined stats of every task in self.tasks, keep only the
    entries belonging to the good operations and store the result in
    self.stats_good_op.

    For every task a NaN-filled buffer with one row per entry of
    self.good_op_ids is created. The stats of every operation of the task
    that is also a good operation are copied into the matching row; rows of
    good operations missing from the task stay NaN. The per-task buffers are
    transposed (so that a column corresponds to a feature), stacked vertically
    and the NaN entries are masked.

    Side effects:
    -------------
    self.stats_good_op : masked ndarray
        Stacked stats of all tasks, invalid (NaN) entries masked.
    """
    n_good_ops = self.good_op_ids.shape[0]
    stats_good_op_tmp = []
    for task in self.tasks:
        # -- create tmp array for good stats for current task. For sake of simplicity when dealing with different
        # -- dimensions of task.tot_stats we transpose stats_good_op_ma_tmp so row corresponds to feature temporarily
        # NOTE(review): for ndim > 1 the second dimension is taken from
        # task.tot_stats.shape[0] although rows are filled from task.tot_stats[it];
        # kept exactly as before -- confirm against the layout of tot_stats.
        if task.tot_stats.ndim > 1:
            stats_good_op_ma_tmp = np.empty((n_good_ops, task.tot_stats.shape[0]))
        else:
            stats_good_op_ma_tmp = np.empty((n_good_ops,))
        # -- np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0
        stats_good_op_ma_tmp[:] = np.nan

        ind = hlp.ismember(task.op_ids,
                           self.good_op_ids,
                           is_return_masked_array=True,
                           return_dtype=int)
        # -- it is position in task.op_ids and i is position in self.good_op_ids
        for it, i in enumerate(ind):
            # -- a non masked i means the entry in task.op_ids is also in self.good_op_ids
            if i is not np.ma.masked:
                stats_good_op_ma_tmp[i] = task.tot_stats[it].T
        # -- We return to the usual ordering: column equals feature
        stats_good_op_tmp.append(stats_good_op_ma_tmp.T)

    self.stats_good_op = np.ma.masked_invalid(np.vstack(stats_good_op_tmp))
def cat_data_op_subset(file_paths, op_id_top, is_from_old_matlab=False, is_return_masked=True):
    """
    Concatenate the features where op_id is in op_id_top for all HCTSA_loc.mat
    files pointed to by file_paths.
    Warning, this can take a while and the returned data matrix can be very large.

    XXX WARNING XXX This only works correctly if all HCTSA_loc.mat files come
    from the same database. Meaning op_ids are the same. Otherwise one would
    have to go through operation names which is only a little more work to
    implement. XXX

    Parameters:
    -----------
    file_paths : list
        list of file paths pointing to the files containing the data
    op_id_top : list,ndarray
        list of operation ids wanted in the concatenated data array
    is_from_old_matlab : bool
        If the HCTSA_loc.mat files are saved from an older version of the comp
        engine. The order of entries is different.
    is_return_masked : boolean
        Saving large masked arrays to disk can lead to memory errors while
        pickling. If this is false the function returns a normal ndarray where
        unknown entries are set to NaN. This can be converted to a masked
        array with data_all = np.ma.masked_invalid(data_all)
    Returns:
    --------
    data_all : ndarray/masked ndarray
        Concatenated data array (rows of all files stacked vertically).
        None if file_paths is empty.
    """
    n_top = np.array(op_id_top).shape[0]
    # -- collect the per file arrays and stack them once at the end instead of
    # -- re-copying the growing array for every file
    data_blocks = []
    for file_path in file_paths:
        # -- parenthesised single argument form works as print statement and function
        print("Adding data from {:s} \n to complete data matrix".format(file_path))
        data, op = mIO.read_from_mat_file(
            file_path, ['TS_DataMat', 'Operations'],
            is_from_old_matlab=is_from_old_matlab)
        # -- find the indices in the data for op_id_top
        ind = hlp.ismember(op['id'], op_id_top,
                           is_return_masked_array=True, return_dtype=int)

        # -- if any of the operations was not calculated for this problem
        # -- create a NaN filled array and copy only the valid data.
        # -- np.array_equal gives a single bool for any pair of shapes, whereas
        # -- `ind.data != op_id_top` is an element-wise comparison whose truth
        # -- value is ambiguous.
        # NOTE(review): this compares index positions with op ids -- confirm intent.
        if not np.array_equal(ind.data, op_id_top):
            # -- each column of data_ma corresponds to the op_id in op_id_top with the
            # -- same index (column i in data_ma corresponds to op_id_top[i])
            data_ma = np.empty((data.shape[0], n_top))
            data_ma[:] = np.nan
            for it, i in enumerate(ind):
                # -- if i is masked in ind that means that the current operation in data
                # -- is not part of op_id_top and does not need to be copied
                if i is not np.ma.masked:
                    data_ma[:, i] = data[:, it]
        # -- otherwise pick all relevant features and also automatically sort them correctly (if necessary)
        else:
            data_ma = np.array(data[:, ind])
        data_blocks.append(data_ma)

    data_all = np.vstack(data_blocks) if data_blocks else None

    # -- Saving a large masked array to disk can lead to Memory errors while using the pickle module.
    if is_return_masked and data_all is not None:
        data_all = np.ma.masked_invalid(data_all)
    return data_all
def corelated_features_mask(data=None, abs_corr_array=None, calc_times=None, op_ids=None):
    """
    Computes a mask that, when applied, removes correlated features from the
    data array.

    Features are visited in order. For every feature that is not yet
    eliminated, all features whose absolute correlation with it exceeds 0.8
    are removed and exactly one representative of that set is kept: the
    fastest one if timing information is given, otherwise the visited feature
    itself.

    Parameters:
    -----------
    data : ndarray
        A data matrix where rows represent training samples and columns
        represent features. Only used if abs_corr_array is None.
    abs_corr_array : ndarray
        The absolute correlation matrix of all features. Has to be given if
        data is None.
    calc_times : ndarray
        Array where the first row corresponds to operation ids and the second
        row to calculation times for these operation ids
    op_ids : ndarray
        The operation ids corresponding to the rows/columns in abs_corr_array
    Returns:
    --------
    mask : ndarray,dtype=bool
        1d array which is True only for the features that are kept.
    abs_corr_array : ndarray
        The absolute correlation matrix (the given one, or the one computed
        from data).
    """
    # -- identity checks: `== None` / `!= None` on an ndarray compare element-wise
    # -- and cannot be used as a condition
    if abs_corr_array is None:
        abs_corr_array = np.abs(np.ma.corrcoef(data, rowvar=0))
    use_timing = calc_times is not None and op_ids is not None

    n_ops = abs_corr_array.shape[0]
    # -- Vector containing 0 for all operations we don't want
    mask = np.ones(n_ops, dtype='bool')
    for i in range(n_ops):
        # -- skip operations that have already been eliminated
        if not mask[i]:
            continue
        # -- remove operations which are highly correlated with operation i
        is_corr = abs_corr_array[i] > 0.8
        mask[is_corr] = 0

        if use_timing:
            # -- Use fastest operation in a set of correlated operations
            # -- find ind in abs_corr_array of correlated features
            ind_corr = np.nonzero(is_corr)[0]
            # -- translate ind_corr to op ids
            op_id_corr = hlp.ind_map_subset(range(n_ops), op_ids, ind_corr)
            # -- get calculation time of correlated op ids
            t_corr = hlp.ind_map_subset(calc_times[0], calc_times[1], op_id_corr)
            if np.nonzero(t_corr)[0].shape[0] == 0:
                # -- no timing information for any of the operations:
                # -- pick the first operation of the set
                op_id_corr_fastest = op_id_corr[0]
            else:
                # -- get op_id of fastest operation in this correlated set
                op_id_corr_fastest = op_id_corr[np.nanargmin(t_corr)]
            # -- get index of fastest operation in this correlated set
            ind_corr_fastest = hlp.ind_map_subset(op_ids, range(n_ops),
                                                  op_id_corr_fastest)
            # -- add fastest index back in
            mask[ind_corr_fastest] = 1
        else:
            # -- Use arbitrary operation in a set of correlated operations
            mask[i] = 1
    return mask, abs_corr_array
# -- Script section: compute the performance correlation of the best max_feat
# -- operations, save the performance ordering and plot the correlation array
# -- as a dendrogram together with two per-feature measures.
# -- load a reference HCTSA file from which the operation ids and names are read
op_ref_HCTSA_path = '/home/philip/work/OperationImportanceProject/results/done/HCTSA_Beef.mat'
op, = mIO.read_from_mat_file(op_ref_HCTSA_path, ['Operations'], is_from_old_matlab=True)
# -- number of top features used for the names and the measures below
max_feat = 50
# -- calculate the correlation
abs_corr_array, sort_good_ind, all_classes_avg_good_norm = idtop.calc_perform_corr_mat( all_classes_avg_good, norm='z-score', max_feat=max_feat)
# -- save the op id's in order of performance (first entry = best performance)
np.save(op_id_order_path, op_id_good[sort_good_ind])
# -- save the permutation vector that would sort the data array containing the good operations only
np.save(sort_good_ind_path, sort_good_ind)
# -- extract the top feature names
names = hlp.ind_map_subset(op['id'], op['name'], op_id_good[sort_good_ind][:max_feat])
# -- Calculate the measures to be plotted
# -- number of unmasked entries per top feature
problems_succ = (~all_classes_avg_good[:, sort_good_ind[:max_feat]].mask).sum( axis=0)
# -- mean of the normalised values per top feature
u_stat_mean = all_classes_avg_good_norm[:, sort_good_ind[:max_feat]].mean(axis=0)
measures = np.vstack((problems_succ, u_stat_mean))
fiplt.plot_arr_dendrogram(abs_corr_array, names, measures=measures)
# -- write the figure to disk before showing it
plt.savefig('/home/philip/Desktop/tmp/figure_tmp/corr_array.png')
plt.show()
def plot_stat_array(self):
    """
    Plot the classification stat array of the good performing features
    (features x tasks) next to the feature dendrogram and three panels of
    per-task measures above it.

    Note: self.ops_base_perf_vals is re-assigned with its columns reordered
    to match the ordering of the similarity array.
    """

    def _style_y_axis(axis, colour, ylabel):
        # -- colour the y tick labels, then set and colour the y axis label
        for tick_label in axis.get_yticklabels():
            tick_label.set_color(colour)
        axis.set_ylabel(ylabel)
        axis.yaxis.label.set_color(colour)

    fig = plt.figure(figsize=(15, 15))
    workflow = self.workflow
    redundancy = workflow.redundancy_method

    # -- plot layout ------------------------------------------------------
    ax_arr = fig.add_axes([0.25, 0.175, .5, .5])
    ax_tree = fig.add_axes([0.755, 0.175, .145, .5])
    ax_classes = fig.add_axes([0.25, 0.68, 0.5, 0.1])
    ax_samples = plt.twinx(ax_classes)
    ax_min_stat = fig.add_axes([0.25, 0.785, 0.5, 0.1])
    ax_min_stat.set_xticklabels([])
    ax_mean_stat = fig.add_axes([0.25, 0.89, 0.5, 0.1])
    ax_mean_stat.set_xticklabels([])
    ax_n_feat = plt.twinx(ax_mean_stat)

    # -- calculate the full dendrogram (for the leaf order) and plot a truncated one
    full_dendrogram = hierarchy.dendrogram(self.linkage, orientation='left', no_plot=True)
    hierarchy.dendrogram(self.linkage, orientation='left', p=50,
                         truncate_mode='lastp', ax=ax_tree)
    ax_tree.set_yticks([])
    ax_tree.axvline(self.max_dist_cluster, ls='--', c='k')

    # -- plot sorted classification stat array ------------------------------
    # -- index that sorts the features to correspond to the dendrogram
    feat_order = full_dendrogram['leaves']
    # -- bring the good performing features into the order of the similarity array
    sim_order = hlp.ismember(redundancy.similarity_array_op_ids,
                             redundancy.good_perf_op_ids)
    self.ops_base_perf_vals = self.ops_base_perf_vals[:, sim_order]
    # -- index that sorts the tasks with respect to their mean value
    task_order = np.argsort(self.ops_base_perf_vals[:, feat_order].mean(axis=1))
    stat_arr = self.ops_base_perf_vals

    n_tasks_arr, n_feat_arr = stat_arr.shape[0], stat_arr.shape[1]
    ax_arr.matshow(stat_arr[task_order, :][:, feat_order].T,
                   aspect=n_tasks_arr / float(n_feat_arr),
                   origin='bottom')
    # -- the operation names are the y-axis tick labels
    ax_arr.set_yticks(range(len(feat_order)))
    id_name_map = self.map_op_id_name_mult_task(workflow.tasks)
    names = hlp.ind_map_subset(id_name_map[0], id_name_map[1],
                               redundancy.similarity_array_op_ids)
    ax_arr.set_yticklabels(np.array(names)[feat_order])
    # -- the problem names are the x-axis tick labels
    ax_arr.xaxis.tick_bottom()
    ax_arr.set_xticks(range(n_tasks_arr))
    ax_arr.set_xticklabels(self.task_names[task_order], rotation='vertical')

    # -- plot clusters: a white line wherever the cluster index changes
    bounds = np.nonzero(np.diff(redundancy.cluster_inds[feat_order]))[0] + 0.5
    for bound in bounds:
        ax_arr.axhline(bound, c='w', lw=2)

    # ----------------------------------------------------------------------
    # -- calculate and plot measures
    # ----------------------------------------------------------------------
    tasks = workflow.tasks
    # -- nr samples and nr labels ------------------------------------------
    n_samples_avg = [np.array(tasks[i].ts['n_samples']).mean() for i in task_order]
    n_classes = [len(set(tasks[i].labels)) for i in task_order]
    x_loc = np.arange(0, len(tasks)) + 0.5

    ax_classes.scatter(x_loc, n_classes, c='b', s=40)
    ax_classes.plot(x_loc, n_classes, c='b')
    _style_y_axis(ax_classes, 'b', 'nr classes')
    ax_classes.set_ylim([0, max(n_classes) + 1])
    ax_classes.set_xticklabels([])

    ax_samples.scatter(x_loc, n_samples_avg, c='r', s=40)
    ax_samples.plot(x_loc, n_samples_avg, c='r')
    _style_y_axis(ax_samples, 'r', 'avg samples')
    ax_samples.set_ylim([0, max(n_samples_avg) + 100])
    ax_classes.set_xlim([0, len(tasks)])
    ax_classes.set_xticklabels([])

    # -- Classification stat measures --------------------------------------
    # -- minimum average classification stat for all features
    ax_min_stat.plot(x_loc,
                     np.min(workflow.stats_good_op[task_order, :], axis=1),
                     marker='o',
                     label='min. avg. classification stat all')
    # -- minimum average classification stat for top features
    ax_min_stat.plot(x_loc,
                     np.ma.min(self.ops_base_perf_vals[task_order, :], axis=1),
                     marker='o',
                     label='min. avg. classification stat top')
    # -- average minimum (for each class pair) classification stat for top features
    # XXX This would require task.pair_stats to be available (not saved as
    # intermediate at the moment); then it is trivial to implement
    ax_min_stat.legend(loc=2, fontsize='small', labelspacing=.1)
    ax_min_stat.set_ylabel('classification stat')
    ax_min_stat.set_xlim([0, len(tasks)])
    ax_min_stat.set_ylim([0, 0.5])

    # -- Classification stat measures and avg operations working -----------
    # -- mean average classification stat for all features
    ax_mean_stat.plot(x_loc,
                      np.ma.mean(workflow.stats_good_op[task_order, :], axis=1),
                      marker='o')
    _style_y_axis(ax_mean_stat, 'b', 'avrg classification stat all feat')
    # -- number of successfully calculated features
    n_calc_feat = [len(tasks[i].op['id']) for i in task_order]
    ax_n_feat.plot(x_loc, n_calc_feat, c='r', marker='o')
    _style_y_axis(ax_n_feat, 'r', 'nr calc feat')
    ax_mean_stat.set_xlim([0, len(tasks)])
def plot_stat_array(self):
    """
    Plot the U-stat array of the good performing features (features x tasks)
    next to the feature dendrogram and three panels of per-task measures
    above it.

    Note: self.ops_base_perf_vals is re-assigned with its columns reordered
    to match the ordering of the similarity array.
    """

    def _style_y_axis(axis, colour, ylabel):
        # -- colour the y tick labels, then set and colour the y axis label
        for tick_label in axis.get_yticklabels():
            tick_label.set_color(colour)
        axis.set_ylabel(ylabel)
        axis.yaxis.label.set_color(colour)

    fig = plt.figure(figsize=(15, 15))
    workflow = self.workflow
    redundancy = workflow.redundancy_method

    # -- plot layout ------------------------------------------------------
    ax_arr = fig.add_axes([0.25, 0.175, .5, .5])
    ax_tree = fig.add_axes([0.755, 0.175, .145, .5])
    ax_classes = fig.add_axes([0.25, 0.68, 0.5, 0.1])
    ax_samples = plt.twinx(ax_classes)
    ax_min_stat = fig.add_axes([0.25, 0.785, 0.5, 0.1])
    ax_min_stat.set_xticklabels([])
    ax_mean_stat = fig.add_axes([0.25, 0.89, 0.5, 0.1])
    ax_mean_stat.set_xticklabels([])
    ax_n_feat = plt.twinx(ax_mean_stat)

    # -- calculate the full dendrogram (for the leaf order) and plot a truncated one
    full_dendrogram = hierarchy.dendrogram(self.linkage, orientation='left', no_plot=True)
    hierarchy.dendrogram(self.linkage, orientation='left', p=50,
                         truncate_mode='lastp', ax=ax_tree)
    ax_tree.set_yticks([])
    ax_tree.axvline(self.max_dist_cluster, ls='--', c='k')

    # -- plot sorted U-Stat array -------------------------------------------
    # -- index that sorts the features to correspond to the dendrogram
    feat_order = full_dendrogram['leaves']
    # -- bring the good performing features into the order of the similarity array
    sim_order = hlp.ismember(redundancy.similarity_array_op_ids,
                             redundancy.good_perf_op_ids)
    self.ops_base_perf_vals = self.ops_base_perf_vals[:, sim_order]
    # -- index that sorts the tasks with respect to their mean value
    task_order = np.argsort(self.ops_base_perf_vals[:, feat_order].mean(axis=1))
    stat_arr = self.ops_base_perf_vals

    n_tasks_arr, n_feat_arr = stat_arr.shape[0], stat_arr.shape[1]
    ax_arr.matshow(stat_arr[task_order, :][:, feat_order].T,
                   aspect=n_tasks_arr / float(n_feat_arr),
                   origin='bottom')
    # -- the operation names are the y-axis tick labels
    ax_arr.set_yticks(range(len(feat_order)))
    id_name_map = self.map_op_id_name_mult_task(workflow.tasks)
    names = hlp.ind_map_subset(id_name_map[0], id_name_map[1],
                               redundancy.similarity_array_op_ids)
    ax_arr.set_yticklabels(np.array(names)[feat_order])
    # -- the problem names are the x-axis tick labels
    ax_arr.xaxis.tick_bottom()
    ax_arr.set_xticks(range(n_tasks_arr))
    ax_arr.set_xticklabels(self.task_names[task_order], rotation='vertical')

    # -- plot clusters: a white line wherever the cluster index changes
    bounds = np.nonzero(np.diff(redundancy.cluster_inds[feat_order]))[0] + 0.5
    for bound in bounds:
        ax_arr.axhline(bound, c='w', lw=2)

    # ----------------------------------------------------------------------
    # -- calculate and plot measures
    # ----------------------------------------------------------------------
    tasks = workflow.tasks
    # -- nr samples and nr labels ------------------------------------------
    n_samples_avg = [np.array(tasks[i].ts['n_samples']).mean() for i in task_order]
    n_classes = [len(set(tasks[i].labels)) for i in task_order]
    x_loc = np.arange(0, len(tasks)) + 0.5

    ax_classes.scatter(x_loc, n_classes, c='b', s=40)
    ax_classes.plot(x_loc, n_classes, c='b')
    _style_y_axis(ax_classes, 'b', 'nr classes')
    ax_classes.set_ylim([0, max(n_classes) + 1])
    ax_classes.set_xticklabels([])

    ax_samples.scatter(x_loc, n_samples_avg, c='r', s=40)
    ax_samples.plot(x_loc, n_samples_avg, c='r')
    _style_y_axis(ax_samples, 'r', 'avg samples')
    ax_samples.set_ylim([0, max(n_samples_avg) + 100])
    ax_classes.set_xlim([0, len(tasks)])
    ax_classes.set_xticklabels([])

    # -- U-stat measures ---------------------------------------------------
    # -- minimum average U-score for all features
    ax_min_stat.plot(x_loc,
                     np.min(workflow.stats_good_op[task_order, :], axis=1),
                     marker='o',
                     label='min. avg. U-score all')
    # -- minimum average U-score for top features
    ax_min_stat.plot(x_loc,
                     np.ma.min(self.ops_base_perf_vals[task_order, :], axis=1),
                     marker='o',
                     label='min. avg. U-score top')
    # -- average minimum (for each class pair) U-score for top features
    # XXX This would require task.pair_stats to be available (not saved as
    # intermediate at the moment); then it is trivial to implement
    ax_min_stat.legend(loc=2, fontsize='small', labelspacing=.1)
    ax_min_stat.set_ylabel('u-score')
    ax_min_stat.set_xlim([0, len(tasks)])
    ax_min_stat.set_ylim([0, 0.5])

    # -- U-stat measures and avg operations working ------------------------
    # -- mean average U-score for all features
    ax_mean_stat.plot(x_loc,
                      np.ma.mean(workflow.stats_good_op[task_order, :], axis=1),
                      marker='o')
    _style_y_axis(ax_mean_stat, 'b', 'avrg u-scrore all feat')
    # -- number of successfully calculated features
    n_calc_feat = [len(tasks[i].op['id']) for i in task_order]
    ax_n_feat.plot(x_loc, n_calc_feat, c='r', marker='o')
    _style_y_axis(ax_n_feat, 'r', 'nr calc feat')
    ax_mean_stat.set_xlim([0, len(tasks)])
# -- Script section: correlation of the top max_feat operations, their linkage,
# -- operation names and problem names, followed by the figure creation.
# abs_corr_array = np.abs(np.ma.corrcoef(all_classes_avg_top, rowvar=0))
# -- calculate the correlation array with respect to performance and mask nan.
abs_corr_array, sort_good_ind, all_classes_avg_good_norm = idtop.calc_perform_corr_mat(
    all_classes_avg[:, op_id_good], norm='z-score', max_feat=max_feat)
all_classes_avg_top = np.ma.masked_invalid(
    all_classes_avg[:, op_id_good][:, sort_good_ind[:max_feat]])
# -- calculate the linkage for the correlation
corr_linkage = idtop.calc_linkage(abs_corr_array)[0]

# -- extract operation names ----------------------------------------------
# -- load a reference HCTSA_loc.mat containing all op_ids
op_ref_HCTSA_path = '/home/philip/work/OperationImportanceProject/results/done/HCTSA_Beef.mat'
op, = mIO.read_from_mat_file(op_ref_HCTSA_path, ['Operations'], is_from_old_matlab=True)
# -- op ids of the top max_feat operations in order of performance
top_id = op_id_good[sort_good_ind][:max_feat]
names = hlp.ind_map_subset(op['id'], op['name'], top_id)

# -- extract problem names --------------------------------------------------
# -- raw string: '\/' is an invalid escape sequence in a normal string literal;
# -- the compiled pattern is unchanged
reg_ex = re.compile(r'.*\/HCTSA_(.*)_N_70_100_reduced.mat')
problem_paths = np.load(problem_names_path)
# -- the problem name is the part of the file name captured by group 1
problem_names = np.array(
    [reg_ex.match(problem_path).group(1) for problem_path in problem_paths])

# ---------------------------------------------------------------------
# -- Plot -------------------------------------------------------------
# ---------------------------------------------------------------------
fig = plt.figure(figsize=(15, 15))
# -- plot layout ------------------------------------------------------
def cat_data_op_subset(file_paths, op_id_top, is_from_old_matlab=False, is_return_masked=True):
    """
    Concatenate the features where op_id is in op_id_top for all HCTSA_loc.mat
    files pointed to by file_paths.
    Warning, this can take a while and the returned data matrix can be very large.

    XXX WARNING XXX This only works correctly if all HCTSA_loc.mat files come
    from the same database. Meaning op_ids are the same. Otherwise one would
    have to go through operation names which is only a little more work to
    implement. XXX

    Parameters:
    -----------
    file_paths : list
        list of file paths pointing to the files containing the data
    op_id_top : list,ndarray
        list of operation ids wanted in the concatenated data array
    is_from_old_matlab : bool
        If the HCTSA_loc.mat files are saved from an older version of the comp
        engine. The order of entries is different.
    is_return_masked : boolean
        Saving large masked arrays to disk can lead to memory errors while
        pickling. If this is false the function returns a normal ndarray where
        unknown entries are set to NaN. This can be converted to a masked
        array with data_all = np.ma.masked_invalid(data_all)
    Returns:
    --------
    data_all : ndarray/masked ndarray
        Concatenated data array (rows of all files stacked vertically).
        None if file_paths is empty.
    """
    n_top = np.array(op_id_top).shape[0]
    # -- collect the per file arrays and stack them once at the end instead of
    # -- re-copying the growing array for every file
    data_blocks = []
    for file_path in file_paths:
        # -- parenthesised single argument form works as print statement and function
        print("Adding data from {:s} \n to complete data matrix".format(file_path))
        data, op = mIO.read_from_mat_file(
            file_path, ['TS_DataMat', 'Operations'],
            is_from_old_matlab=is_from_old_matlab)
        # -- find the indices in the data for op_id_top
        ind = hlp.ismember(op['id'], op_id_top,
                           is_return_masked_array=True, return_dtype=int)

        # -- if any of the operations was not calculated for this problem
        # -- create a NaN filled array and copy only the valid data.
        # -- np.array_equal gives a single bool for any pair of shapes, whereas
        # -- `ind.data != op_id_top` is an element-wise comparison whose truth
        # -- value is ambiguous.
        # NOTE(review): this compares index positions with op ids -- confirm intent.
        if not np.array_equal(ind.data, op_id_top):
            # -- each column of data_ma corresponds to the op_id in op_id_top with the
            # -- same index (column i in data_ma corresponds to op_id_top[i])
            data_ma = np.empty((data.shape[0], n_top))
            data_ma[:] = np.nan
            for it, i in enumerate(ind):
                # -- if i is masked in ind that means that the current operation in data
                # -- is not part of op_id_top and does not need to be copied
                if i is not np.ma.masked:
                    data_ma[:, i] = data[:, it]
        # -- otherwise pick all relevant features and also automatically sort them correctly (if necessary)
        else:
            data_ma = np.array(data[:, ind])
        data_blocks.append(data_ma)

    data_all = np.vstack(data_blocks) if data_blocks else None

    # -- Saving a large masked array to disk can lead to Memory errors while using the pickle module.
    if is_return_masked and data_all is not None:
        data_all = np.ma.masked_invalid(data_all)
    return data_all
def corelated_features_mask(data=None, abs_corr_array=None, calc_times=None, op_ids=None):
    """
    Computes a mask that, when applied, removes correlated features from the
    data array.

    Features are visited in order. For every feature that is not yet
    eliminated, all features whose absolute correlation with it exceeds 0.8
    are removed and exactly one representative of that set is kept: the
    fastest one if timing information is given, otherwise the visited feature
    itself.

    Parameters:
    -----------
    data : ndarray
        A data matrix where rows represent training samples and columns
        represent features. Only used if abs_corr_array is None.
    abs_corr_array : ndarray
        The absolute correlation matrix of all features. Has to be given if
        data is None.
    calc_times : ndarray
        Array where the first row corresponds to operation ids and the second
        row to calculation times for these operation ids
    op_ids : ndarray
        The operation ids corresponding to the rows/columns in abs_corr_array
    Returns:
    --------
    mask : ndarray,dtype=bool
        1d array which is True only for the features that are kept.
    abs_corr_array : ndarray
        The absolute correlation matrix (the given one, or the one computed
        from data).
    """
    # -- identity checks: `== None` / `!= None` on an ndarray compare element-wise
    # -- and cannot be used as a condition
    if abs_corr_array is None:
        abs_corr_array = np.abs(np.ma.corrcoef(data, rowvar=0))
    use_timing = calc_times is not None and op_ids is not None

    n_ops = abs_corr_array.shape[0]
    # -- Vector containing 0 for all operations we don't want
    mask = np.ones(n_ops, dtype='bool')
    for i in range(n_ops):
        # -- skip operations that have already been eliminated
        if not mask[i]:
            continue
        # -- remove operations which are highly correlated with operation i
        is_corr = abs_corr_array[i] > 0.8
        mask[is_corr] = 0

        if use_timing:
            # -- Use fastest operation in a set of correlated operations
            # -- find ind in abs_corr_array of correlated features
            ind_corr = np.nonzero(is_corr)[0]
            # -- translate ind_corr to op ids
            op_id_corr = hlp.ind_map_subset(range(n_ops), op_ids, ind_corr)
            # -- get calculation time of correlated op ids
            t_corr = hlp.ind_map_subset(calc_times[0], calc_times[1], op_id_corr)
            if np.nonzero(t_corr)[0].shape[0] == 0:
                # -- no timing information for any of the operations:
                # -- pick the first operation of the set
                op_id_corr_fastest = op_id_corr[0]
            else:
                # -- get op_id of fastest operation in this correlated set
                op_id_corr_fastest = op_id_corr[np.nanargmin(t_corr)]
            # -- get index of fastest operation in this correlated set
            ind_corr_fastest = hlp.ind_map_subset(op_ids, range(n_ops),
                                                  op_id_corr_fastest)
            # -- add fastest index back in
            mask[ind_corr_fastest] = 1
        else:
            # -- Use arbitrary operation in a set of correlated operations
            mask[i] = 1
    return mask, abs_corr_array
# -- Script section: correlation of the top max_feat operations, their linkage,
# -- operation names and problem names, followed by the figure creation.
# -- calculate the correlation array with respect to performance and mask nan.
abs_corr_array, sort_good_ind, all_classes_avg_good_norm = idtop.calc_perform_corr_mat(
    all_classes_avg[:, op_id_good], norm="z-score", max_feat=max_feat
)
all_classes_avg_top = np.ma.masked_invalid(
    all_classes_avg[:, op_id_good][:, sort_good_ind[:max_feat]]
)
# -- calculate the linkage for the correlation
corr_linkage = idtop.calc_linkage(abs_corr_array)[0]

# -- extract operation names ----------------------------------------------
# -- load a reference HCTSA_loc.mat containing all op_ids
op_ref_HCTSA_path = "/home/philip/work/OperationImportanceProject/results/done/HCTSA_Beef.mat"
op, = mIO.read_from_mat_file(op_ref_HCTSA_path, ["Operations"], is_from_old_matlab=True)
# -- op ids of the top max_feat operations in order of performance
top_id = op_id_good[sort_good_ind][:max_feat]
names = hlp.ind_map_subset(op["id"], op["name"], top_id)

# -- extract problem names --------------------------------------------------
# -- raw string: "\/" is an invalid escape sequence in a normal string literal;
# -- the compiled pattern is unchanged
reg_ex = re.compile(r".*\/HCTSA_(.*)_N_70_100_reduced.mat")
problem_paths = np.load(problem_names_path)
# -- the problem name is the part of the file name captured by group 1
problem_names = np.array(
    [reg_ex.match(problem_path).group(1) for problem_path in problem_paths]
)

# ---------------------------------------------------------------------
# -- Plot -------------------------------------------------------------
# ---------------------------------------------------------------------
fig = plt.figure(figsize=(15, 15))
# -- plot layout ------------------------------------------------------
def plot_stat_array(self): fig = plt.figure(figsize=((15, 15))) # -- plot layout ------------------------------------------------------ #rect_ustat_arr = [0.01,0.01,0.75,0.75] #[0.25,0.175,.5,.5] #rect_dendr = [0.76,0.01,.2175,.75] #[0.755,0.175,.145,.5] rect_ustat_arr = fig.add_axes([0.15, 0.2, 0.7, 0.8]) #rect_dendr = fig.add_axes([0.7, 0.1, 0.145, 0.8]) rect_dendr = fig.add_axes([0.15, 0.8, 0.873, 0.2]) '''rect_measures0 = [0.25,0.68,0.5,0.1] rect_measures1 = [0.25,0.785,0.5,0.1] rect_measures2 = [0.25,0.89,0.5,0.1]''' ax_ustat_arr = fig.add_axes(rect_ustat_arr) ax_dendr = fig.add_axes(rect_dendr) '''ax_measures00 = fig.add_axes(rect_measures0) ax_measures01 = plt.twinx(ax_measures00) ax_measures10 = fig.add_axes(rect_measures1) ax_measures10.set_xticklabels([]) ax_measures20 = fig.add_axes(rect_measures2) ax_measures20.set_xticklabels([]) ax_measures21 = plt.twinx(ax_measures20)''' # -- calculate and plot the dendrogram dist_dendrogram = hierarchy.dendrogram(self.linkage, orientation='top', no_plot=True) hierarchy.dendrogram(self.linkage, orientation='top', p=50, truncate_mode='lastp', ax=ax_dendr) ax_dendr.set_xticks([]) ax_dendr.axvline(self.max_dist_cluster, ls='--', c='k') # -- plot sorted classification stat array ------------------------------------------ # -- create index that sort rows to correspond to dendrogram feat_sort_ind = dist_dendrogram['leaves'] # -- sort the good performant features so they have the same order as the similarity array sort_ind = hlp.ismember( self.workflow.redundancy_method.similarity_array_op_ids, self.workflow.redundancy_method.good_perf_op_ids) self.ops_base_perf_vals = self.ops_base_perf_vals[:, sort_ind] # -- create index that sort columns with respect to their mean value task_sort_ind = np.argsort( self.ops_base_perf_vals[:, feat_sort_ind].mean(axis=1)) #all_classes_avg_top = ((all_classes_avg_top - np.ma.mean(all_classes_avg_top,axis=0)) / np.ma.std(all_classes_avg_top,axis=0)) #all_classes_avg_top = 
fap.normalise_masked_array(self.ops_base_perf_vals, axis= 1,norm_type = 'zscore')[0] all_classes_avg_top = self.ops_base_perf_vals # -- plot the operation names as y-axis tick labels aspect = all_classes_avg_top.shape[0] / float( all_classes_avg_top.shape[1]) im = ax_ustat_arr.matshow( all_classes_avg_top[task_sort_ind, :][:, feat_sort_ind], aspect=aspect, origin='bottom', cmap='turbo') ax_ustat_arr.set_yticks(range(all_classes_avg_top.shape[0])) op_id_name_map = self.map_op_id_name_mult_task(self.workflow.tasks) names = hlp.ind_map_subset( op_id_name_map[0], op_id_name_map[1], self.workflow.redundancy_method.similarity_array_op_ids) ax_ustat_arr.set_yticklabels(self.task_names[task_sort_ind]) # -- plot the problem names as x axis labels ax_ustat_arr.xaxis.tick_bottom() ax_ustat_arr.set_xticks(range(len(feat_sort_ind))) ax_ustat_arr.set_xticklabels(np.array(names)[feat_sort_ind], rotation='vertical') fig.colorbar(im) # -- plot clusters ---------------------------------- '''cluster_bounds = np.nonzero(np.diff(self.workflow.redundancy_method.cluster_inds[feat_sort_ind]))[0]+0.5
# -- Script section: correlation of the top max_feat operations, their linkage,
# -- operation names and problem names, followed by the figure creation.
# -- calculate the correlation array with respect to performance and mask nan.
abs_corr_array, sort_good_ind, all_classes_avg_good_norm = idtop.calc_perform_corr_mat(
    all_classes_avg[:, op_id_good], norm='z-score', max_feat=max_feat)
all_classes_avg_top = np.ma.masked_invalid(
    all_classes_avg[:, op_id_good][:, sort_good_ind[:max_feat]])
# -- calculate the linkage for the correlation
corr_linkage = idtop.calc_linkage(abs_corr_array)[0]

# -- extract operation names ----------------------------------------------
# -- load a reference HCTSA_loc.mat containing all op_ids
op_ref_HCTSA_path = '/home/philip/work/OperationImportanceProject/results/done/HCTSA_Beef.mat'
op, = mIO.read_from_mat_file(op_ref_HCTSA_path, ['Operations'], is_from_old_matlab=True)
# -- op ids of the top max_feat operations in order of performance
top_id = op_id_good[sort_good_ind][:max_feat]
names = hlp.ind_map_subset(op['id'], op['name'], top_id)

# -- extract problem names --------------------------------------------------
# -- raw string: '\/' is an invalid escape sequence in a normal string literal;
# -- the compiled pattern is unchanged
reg_ex = re.compile(r'.*\/HCTSA_(.*)_N_70_100_reduced.mat')
problem_paths = np.load(problem_names_path)
# -- the problem name is the part of the file name captured by group 1
problem_names = np.array(
    [reg_ex.match(problem_path).group(1) for problem_path in problem_paths])

# ---------------------------------------------------------------------
# -- Plot -------------------------------------------------------------
# ---------------------------------------------------------------------
fig = plt.figure(figsize=(15, 15))
# -- plot layout ------------------------------------------------------