def calc_abs_corr(self):
    """
    Compute the correlation-based similarity (distance) matrix over the
    columns of self.ops_base_perf_vals.

    Results are stored on the instance:
        self.similarity_array        : first value returned by
                                       idtop.calc_perform_corr_mat
        self.similarity_array_op_ids : self.good_perf_op_ids reordered by
                                       the sort index returned alongside it
    """
    perf_vals = self.ops_base_perf_vals
    # -- consider every column (feature) of the performance array
    n_features = perf_vals.shape[1]
    # -- norm=None: the best performing features have been picked already,
    #    potentially using normalisation, so none is applied here
    corr_result = idtop.calc_perform_corr_mat(perf_vals,
                                              norm=None,
                                              max_feat=n_features)
    # -- the third returned value is not needed here
    self.similarity_array, order, _ = corr_result
    self.similarity_array_op_ids = self.good_perf_op_ids[order]
def calc_abs_corr(self):
    """
    Build the similarity (distance) matrix for all columns of
    self.ops_base_perf_vals using the correlation approach implemented in
    idtop.calc_perform_corr_mat.

    Sets self.similarity_array to the returned matrix and
    self.similarity_array_op_ids to self.good_perf_op_ids indexed by the
    returned sort index.
    """
    # -- use all columns; no normalisation, as the best performing features
    #    have been picked already, potentially using normalisation
    options = dict(norm=None, max_feat=self.ops_base_perf_vals.shape[1])
    similarity, sorting, _ = idtop.calc_perform_corr_mat(
        self.ops_base_perf_vals, **options)

    self.similarity_array = similarity
    self.similarity_array_op_ids = self.good_perf_op_ids[sorting]
def plot_arr_dendrogram_bak(abs_corr_array, names):
    """
    Draw a dendrogram above the correlation matrix reordered by its leaves.

    Parameters:
    ----------
    abs_corr_array : ndarray
        array containing the correlation matrix
    names : list
        list of strings containing the names of the operations in
        abs_corr_array in the corresponding order.

    Returns:
    --------
    index : list
        list of indices used to reorder the correlation matrix
    """
    # -- axes rectangles in figure coordinates: [left, bottom, width, height]
    dendro_box = [0.25, 0.76, 0.65, 0.23]
    matrix_box = [0.25, 0.1, 0.65, 0.65]
    colorbar_box = [0.91, 0.1, 0.02, 0.65]

    fig = plt.figure(figsize=(10, 10))

    # -- dendrogram panel; its axes are created right before the
    #    hierarchy.dendrogram call, which is given no explicit axes
    ax_top = fig.add_axes(dendro_box)
    linkage = idtop.calc_linkage(abs_corr_array)[0]
    dendro = hierarchy.dendrogram(linkage, orientation='top')
    ax_top.set_xticks([])
    ax_top.set_yticks([])

    # -- matrix panel: rows and columns follow the dendrogram leaf order
    ax_mat = fig.add_axes(matrix_box)
    index = dendro['leaves']
    reordered = abs_corr_array[index, :][:, index]
    image = ax_mat.matshow(reordered,
                           aspect='auto',
                           origin='upper',
                           vmin=0,
                           vmax=1)
    ax_mat.set_xticks([])
    ax_mat.set_yticks(range(len(index)))
    ax_mat.set_yticklabels(np.array(names)[index])

    # -- colorbar panel
    ax_bar = fig.add_axes(colorbar_box)
    plt.colorbar(image, cax=ax_bar)

    return index
def calc_hierch_cluster(self, t=0.2, criterion='distance'):
    """
    Calculate the clustering using hierarchical clustering

    The linkage is computed from self.similarity_array and cut into flat
    clusters with hierarchy.fcluster.

    Parameters:
    -----------
    t : float
        The threshold to apply when forming flat clusters.
    criterion : str, optional
        The criterion to use in forming flat clusters.

    Sets:
    -----
    self.linkage : first element returned by idtop.calc_linkage
    self.cluster_inds : flat cluster label for every entry of
        self.similarity_array_op_ids
    self.cluster_op_id_list : list of lists; entry k holds the op ids that
        carry cluster label k + 1
    """
    self.linkage = idtop.calc_linkage(self.similarity_array)[0]
    self.cluster_inds = hierarchy.fcluster(self.linkage,
                                           t=t,
                                           criterion=criterion)

    # -- map index to op_id and create list of lists representing clusters.
    #    `range` (not the Python-2-only `xrange`) keeps this working on
    #    Python 3 as well.
    self.cluster_op_id_list = [[] for _ in range(self.cluster_inds.max())]
    for i, cluster_ind in enumerate(self.cluster_inds):
        # -- cluster labels are treated as 1-based, hence the -1
        self.cluster_op_id_list[cluster_ind - 1].append(
            self.similarity_array_op_ids[i])
def plot_arr_dendrogram_bak(abs_corr_array, names):
    """
    Plot the correlation matrix, sorted by hierarchical clustering, together
    with the corresponding dendrogram and a colorbar.

    Parameters:
    ----------
    abs_corr_array : ndarray
        array containing the correlation matrix
    names : list
        list of strings containing the names of the operations in
        abs_corr_array in the corresponding order.

    Returns:
    --------
    index : list
        list of indices used to reorder the correlation matrix
    """
    # -- layout of the three panels, each [left, bottom, width, height]
    layout = {
        'dendro': [0.25, 0.76, 0.65, 0.23],
        'matrix': [0.25, 0.1, 0.65, 0.65],
        'color': [0.91, 0.1, 0.02, 0.65],
    }
    figure = plt.figure(figsize=(10, 10))

    # -- Compute and plot dendrogram (axes first, then the dendrogram call,
    #    which receives no explicit axes).
    dendro_axes = figure.add_axes(layout['dendro'])
    tree = hierarchy.dendrogram(idtop.calc_linkage(abs_corr_array)[0],
                                orientation='top')
    dendro_axes.set_xticks([])
    dendro_axes.set_yticks([])

    # -- Plot distance matrix with rows, then columns, in leaf order.
    matrix_axes = figure.add_axes(layout['matrix'])
    index = tree['leaves']
    sorted_rows = abs_corr_array[index, :]
    sorted_matrix = sorted_rows[:, index]
    mappable = matrix_axes.matshow(sorted_matrix, aspect='auto',
                                   origin='upper', vmin=0, vmax=1)
    matrix_axes.set_xticks([])
    tick_positions = range(len(index))
    matrix_axes.set_yticks(tick_positions)
    tick_labels = np.array(names)[index]
    matrix_axes.set_yticklabels(tick_labels)

    # -- Plot colorbar.
    color_axes = figure.add_axes(layout['color'])
    plt.colorbar(mappable, cax=color_axes)

    return index
def calc_hierch_cluster(self, t=0.2, criterion='distance'):
    """
    Calculate the clustering using hierarchical clustering

    Computes a linkage from self.similarity_array, cuts it into flat
    clusters and groups the operation ids by cluster.

    Parameters:
    -----------
    t : float
        The threshold to apply when forming flat clusters.
    criterion : str, optional
        The criterion to use in forming flat clusters.

    Sets:
    -----
    self.linkage : first element returned by idtop.calc_linkage
    self.cluster_inds : array of flat cluster labels from hierarchy.fcluster
    self.cluster_op_id_list : list of lists of op ids, one inner list per
        cluster label (label k is stored at position k - 1)
    """
    self.linkage = idtop.calc_linkage(self.similarity_array)[0]
    self.cluster_inds = hierarchy.fcluster(self.linkage,
                                           t=t,
                                           criterion=criterion)

    # -- map index to op_id and create list of lists representing clusters.
    #    Uses `range` instead of `xrange`, which does not exist on Python 3.
    n_clusters = self.cluster_inds.max()
    self.cluster_op_id_list = [[] for _ in range(n_clusters)]
    for i, cluster_ind in enumerate(self.cluster_inds):
        # -- labels are treated as starting at 1, list positions at 0
        self.cluster_op_id_list[cluster_ind - 1].append(
            self.similarity_array_op_ids[i])
# -- Load the data all_classes_avg = np.load(all_classes_avg_out_path) op_id_good = np.load(op_id_good_path) # -- Mask NaN entires all_classes_avg_good = np.ma.masked_invalid(all_classes_avg[:, op_id_good]) # -- load a reference HCTSA_loc.mat containing all op_ids import modules.misc.PK_matlab_IO as mIO op_ref_HCTSA_path = '/home/philip/work/OperationImportanceProject/results/done/HCTSA_Beef.mat' op, = mIO.read_from_mat_file(op_ref_HCTSA_path, ['Operations'], is_from_old_matlab=True) max_feat = 50 # -- calculate the correlation abs_corr_array, sort_good_ind, all_classes_avg_good_norm = idtop.calc_perform_corr_mat( all_classes_avg_good, norm='z-score', max_feat=max_feat) # -- save the op id's in order of performance (first entry = best performance) np.save(op_id_order_path, op_id_good[sort_good_ind]) # -- sort the permutation vector that would sort the data array containing the good operations only np.save(sort_good_ind_path, sort_good_ind) # -- extract the top feature names names = hlp.ind_map_subset(op['id'], op['name'], op_id_good[sort_good_ind][:max_feat]) # -- Calculate the measures to be plotted problems_succ = (~all_classes_avg_good[:, sort_good_ind[:max_feat]].mask).sum( axis=0) u_stat_mean = all_classes_avg_good_norm[:, sort_good_ind[:max_feat]].mean(axis=0)
def plot_arr_dendrogram(abs_corr_array, names, max_dist_cluster, measures=None):
    """
    Compute dendrogram and create a plot plotting dendrogram and abs_corr_array

    The matrix is reordered by the dendrogram leaves, flat clusters obtained
    by cutting the linkage at max_dist_cluster are outlined with white
    squares, and (if measures is given) one "best" feature per cluster is
    marked on the diagonal and its tick label is set to bold black.

    Parameters:
    ----------
    abs_corr_array : ndarray
        array containing the correlation matrix
    names : list
        list of strings containing the names of the operations in
        abs_corr_array in the corresponding order.
    max_dist_cluster : float
        Maximum distance in the clusters
    measures : ndarray (n_measures x abs_corr_array.shape[0]), optional
        Array containing measures, positions corresponding to positions of
        operations in abs_corr_array. Row 1 is used to select the best
        feature of each cluster (the entry with the smallest value).
        If None, no best-feature markers are drawn.

    Returns:
    --------
    index : list
        list of indices used to reorder the correlation matrix
    """
    figsize = (18, 12)
    # -- axes rectangles in figure coordinates: [left, bottom, width, height]
    rect_dendro = [0.755, 0.02, 0.15, 0.94]
    rect_matrix = [0.175, 0.02, 0.55, 0.94]
    rect_color = [0.92, 0.02, 0.02, 0.94]

    # Compute and plot dendrogram.
    fig = plt.figure(figsize=figsize)
    axdendro = fig.add_axes(rect_dendro)
    corr_linkage = idtop.calc_linkage(abs_corr_array)[0]
    corr_dendrogram = hierarchy.dendrogram(corr_linkage,
                                           orientation='left',
                                           color_threshold=max_dist_cluster)
    axdendro.set_yticks([])
    # -- mark the distance at which the tree is cut into clusters
    axdendro.axvline(max_dist_cluster, ls='--', c='k')
    axdendro.set_xlabel('correlation distance')

    # Plot distance matrix, rows and columns in dendrogram leaf order.
    axmatrix = fig.add_axes(rect_matrix)
    index = corr_dendrogram['leaves']
    abs_corr_array = abs_corr_array[index, :]
    abs_corr_array = abs_corr_array[:, index]
    n_ops = abs_corr_array.shape[0]

    # -- plot the correlation matrix
    vmin = round(np.min(abs_corr_array), 1)
    vmax = 1
    # -- steps of 0.05 in correlation; the number of colormap entries has to
    #    be a positive integer, so round the float and guard against 0
    num_steps = max(1, int(round((vmax - vmin) * 20)))
    im = axmatrix.matshow(abs_corr_array,
                          aspect='auto',
                          origin='lower',
                          vmin=vmin,
                          vmax=vmax,
                          cmap=plt.get_cmap('turbo', num_steps))

    axmatrix.set_xticks([])
    axmatrix.set_yticks(range(len(index)))
    axmatrix.set_yticklabels(np.array(names)[index], fontsize=5.6)

    # Plot colorbar.
    axcolor = fig.add_axes(rect_color)
    cbar = plt.colorbar(im, cax=axcolor)
    cbar.set_label('Pearson correlation')

    # -----------------------------------------------------------------
    # -- calculate and plot clusters ----------------------------------
    # -----------------------------------------------------------------
    cluster_ind = hierarchy.fcluster(corr_linkage,
                                     t=max_dist_cluster,
                                     criterion='distance')

    # -- positions (in leaf order) at which the cluster label changes,
    #    framed by 0 and n_ops
    cluster_bounds = np.hstack(
        (-1, np.nonzero(np.diff(cluster_ind[index]))[0], n_ops - 1)) + 1

    # -- calculate the locations for the cluster squares; cell i is centred
    #    on i, so the square edges sit half a cell before each bound
    patch_bounds = cluster_bounds - .5
    patch_sizes = np.diff(patch_bounds)
    for corner, size in zip(patch_bounds[:-1], patch_sizes):
        axmatrix.add_patch(
            mpl.patches.Rectangle((corner, corner),
                                  size,
                                  size,
                                  fill=False,
                                  ec='w',
                                  lw=2))

    # -----------------------------------------------------------------
    # -- calculate and plot best features -----------------------------
    # -----------------------------------------------------------------
    best_features_marker = []
    if measures is not None:
        measures_dendr = measures[1, index]
        # -- per cluster: position of the smallest value of measure row 1
        best_features_marker = [
            int(start + np.argmin(measures_dendr[start:stop]))
            for start, stop in zip(cluster_bounds[:-1], cluster_bounds[1:])
        ]
        axmatrix.scatter(best_features_marker, best_features_marker, color='w')

    axmatrix.set_xlim([-0.5, n_ops - 0.5])
    axmatrix.set_ylim([-0.5, n_ops - 0.5])

    # -- emphasise the tick labels of the best features
    best_positions = set(best_features_marker)
    for i, text in enumerate(axmatrix.get_yticklabels()):
        if i in best_positions:
            text.set_color('k')
            text.set_weight('bold')

    return index
# -- output/input locations below the intermediate data directory
problem_names_path = intermediate_data_root + 'problem_names.npy'
measures_problems_path = intermediate_data_root + 'measure_problems.npy'

# -- Load the data
all_classes_avg = np.load(all_classes_avg_out_path)
op_id_order = np.load(op_id_order_path)
op_id_good = np.load(op_id_good_path)

# -- number of top features to use
max_feat = 50
# -- NOTE(review): not used within this fragment; presumably the cluster
#    distance threshold used further down -- confirm
max_corr_dist = 0.2

# -- earlier approach, kept for reference (disabled):
# # -- mask all nan values and take top 200 features
# all_classes_avg_top = np.ma.masked_invalid(all_classes_avg[:,op_id_order[:100]])
# # -- calculate the z-score of the u stat array
# all_classes_avg_top = ((all_classes_avg_top.T - np.ma.mean(all_classes_avg_top,axis=1)) / np.ma.std(all_classes_avg_top,axis=1)).T
# abs_corr_array = np.abs(np.ma.corrcoef(all_classes_avg_top, rowvar=0))

# -- calculate the correlation array with respect to performance and mask nan.
#    presumably returns (correlation matrix, performance sort index,
#    z-scored data) -- TODO confirm against idtop.calc_perform_corr_mat
abs_corr_array, sort_good_ind, all_classes_avg_good_norm = idtop.calc_perform_corr_mat(
    all_classes_avg[:, op_id_good], norm='z-score', max_feat=max_feat)

# -- raw values of the first max_feat columns in sorted order, with invalid
#    (NaN/inf) entries masked
all_classes_avg_top = np.ma.masked_invalid(
    all_classes_avg[:, op_id_good][:, sort_good_ind[:max_feat]])

# -- calculate the linkage for the correlation
corr_linkage = idtop.calc_linkage(abs_corr_array)[0]

# -- extract operation names --- ------------------------------------------
# -- load a reference HCTSA_loc.mat containing all op_ids
# NOTE(review): hard-coded absolute, user-specific path
op_ref_HCTSA_path = '/home/philip/work/OperationImportanceProject/results/done/HCTSA_Beef.mat'
op, = mIO.read_from_mat_file(op_ref_HCTSA_path, ['Operations'],
                             is_from_old_matlab=True)

# -- ids of the first max_feat operations in sorted order
top_id = op_id_good[sort_good_ind][:max_feat]
# -- names for the same ids (the slice is recomputed, equal to top_id)
names = hlp.ind_map_subset(op['id'], op['name'],
                           op_id_good[sort_good_ind][:max_feat])
# -- output/input locations below the intermediate data directory
problem_names_path = intermediate_data_root + 'problem_names.npy'
measures_problems_path = intermediate_data_root + 'measure_problems.npy'

# -- Load the data
all_classes_avg = np.load(all_classes_avg_out_path)
op_id_order = np.load(op_id_order_path)
op_id_good = np.load(op_id_good_path)

# -- number of top features to use
max_feat = 50
# -- NOTE(review): not used within this fragment; presumably the cluster
#    distance threshold used further down -- confirm
max_corr_dist = 0.2

# -- calculate the correlation array with respect to performance and mask nan.
#    presumably returns (correlation matrix, performance sort index,
#    z-scored data) -- TODO confirm against idtop.calc_perform_corr_mat
abs_corr_array, sort_good_ind, all_classes_avg_good_norm = idtop.calc_perform_corr_mat(
    all_classes_avg[:, op_id_good], norm='z-score', max_feat=max_feat)

# -- raw values of the first max_feat columns in sorted order, with invalid
#    (NaN/inf) entries masked
all_classes_avg_top = np.ma.masked_invalid(
    all_classes_avg[:, op_id_good][:, sort_good_ind[:max_feat]])

# -- calculate the linkage for the correlation
corr_linkage = idtop.calc_linkage(abs_corr_array)[0]

# -- extract operation names --- ------------------------------------------
# -- load a reference HCTSA_loc.mat containing all op_ids
# NOTE(review): hard-coded absolute, user-specific path
op_ref_HCTSA_path = '/home/philip/work/OperationImportanceProject/results/done/HCTSA_Beef.mat'
op, = mIO.read_from_mat_file(op_ref_HCTSA_path, ['Operations'],
                             is_from_old_matlab=True)

# -- ids of the first max_feat operations in sorted order; reused for the
#    name lookup instead of recomputing the same slice
top_id = op_id_good[sort_good_ind][:max_feat]
names = hlp.ind_map_subset(op['id'], op['name'], top_id)

# -- extract problem names --- ------------------------------------------
# -- raw string: '\/' is not a valid string escape, so the non-raw literal
#    triggers an invalid-escape warning on recent Python versions; the
#    pattern itself is unchanged.
# NOTE(review): the '.' before 'mat' is unescaped and matches any character.
reg_ex = re.compile(r'.*\/HCTSA_(.*)_N_70_100_reduced.mat')
def plot_arr_dendrogram(abs_corr_array, names, max_dist_cluster, measures=None):
    """
    Compute dendrogram and create a plot plotting dendrogram and abs_corr_array

    The matrix is reordered by the dendrogram leaves and the flat clusters
    obtained by cutting the linkage at max_dist_cluster are outlined with
    white squares. If measures is given, a panel above the matrix shows the
    measures, cluster delimiters are drawn in that panel, and one "best"
    feature per cluster is marked on the matrix diagonal.

    Parameters:
    ----------
    abs_corr_array : ndarray
        array containing the correlation matrix
    names : list
        list of strings containing the names of the operations in
        abs_corr_array in the corresponding order.
    max_dist_cluster : float
        Maximum distance in the clusters
    measures : ndarray (n_measures x abs_corr_array.shape[0]), optional
        Array containing measures to be plotted on top of the matrix.
        Positions corresponding to positions of operations in
        abs_corr_array. Row 0 is drawn as a scatter ('problems calculated'),
        row 1 as a line ('z-scored avg u-stat'); row 1 also selects the best
        feature of each cluster (the entry with the smallest value).
        If None, the measures panel and the best-feature markers are
        skipped.

    Returns:
    --------
    index : list
        list of indices used to reorder the correlation matrix
    """
    figsize = (18, 12)
    # -- axes rectangles in figure coordinates: [left, bottom, width, height]
    rect_measures = [0.25, 0.8075, 0.5, 0.15]
    rect_dendro = [0.755, 0.05, 0.15, 0.75]
    rect_matrix = [0.25, 0.05, 0.5, 0.75]
    rect_color = [0.92, 0.05, 0.02, 0.75]

    # Compute and plot dendrogram.
    fig = plt.figure(figsize=figsize)
    axdendro = fig.add_axes(rect_dendro)
    corr_linkage = idtop.calc_linkage(abs_corr_array)[0]
    corr_dendrogram = hierarchy.dendrogram(corr_linkage, orientation='left')
    axdendro.set_yticks([])
    # -- mark the distance at which the tree is cut into clusters
    axdendro.axvline(max_dist_cluster, ls='--', c='k')

    # Plot distance matrix, rows and columns in dendrogram leaf order.
    axmatrix = fig.add_axes(rect_matrix)
    index = corr_dendrogram['leaves']
    abs_corr_array = abs_corr_array[index, :]
    abs_corr_array = abs_corr_array[:, index]
    n_ops = abs_corr_array.shape[0]

    # -- plot the correlation matrix
    im = axmatrix.matshow(abs_corr_array,
                          aspect='auto',
                          origin='lower',
                          vmin=0,
                          vmax=1)
    axmatrix.set_xticks([])
    axmatrix.set_yticks(range(len(index)))
    axmatrix.set_yticklabels(np.array(names)[index])

    # Plot colorbar.
    axcolor = fig.add_axes(rect_color)
    plt.colorbar(im, cax=axcolor)

    # Plot the quality measures (only possible if measures were passed in)
    axmeasure = None
    if measures is not None:
        n_measured = measures.shape[-1]
        # -- x positions shifted by half a step
        x_centres = np.arange(0, n_measured) + 0.5

        axmeasure = fig.add_axes(rect_measures)
        axmeasure.xaxis.set_ticklabels([])
        axmeasure.scatter(x_centres, measures[0, index])
        axmeasure.set_xlim([0, n_measured])
        axmeasure.set_ylabel('problems calculated')
        axmeasure.yaxis.label.set_color('b')
        for label in axmeasure.get_yticklabels():
            label.set_color('b')

        # -- second measure on a twin y-axis sharing the same x-axis
        axmeasure2 = axmeasure.twinx()
        axmeasure2.plot(x_centres, measures[1, index], color='r')
        axmeasure2.set_xlim([0, n_measured])
        for label in axmeasure2.get_yticklabels():
            label.set_color('r')
        axmeasure2.set_ylabel('z-scored avg u-stat')
        axmeasure2.yaxis.label.set_color('r')

    # -----------------------------------------------------------------
    # -- calculate and plot clusters ----------------------------------
    # -----------------------------------------------------------------
    cluster_ind = hierarchy.fcluster(corr_linkage,
                                     t=max_dist_cluster,
                                     criterion='distance')

    # -- positions (in leaf order) at which the cluster label changes,
    #    framed by 0 and n_ops
    cluster_bounds = np.hstack(
        (-1, np.nonzero(np.diff(cluster_ind[index]))[0], n_ops - 1)) + 1

    # -- plot delimiters for measures
    if axmeasure is not None:
        for bound in cluster_bounds:
            axmeasure.axvline(bound, linestyle='--', color='k')

    # -- calculate the locations for the cluster squares; cell i is centred
    #    on i, so the square edges sit half a cell before each bound
    patch_bounds = cluster_bounds - .5
    patch_sizes = np.diff(patch_bounds)
    for corner, size in zip(patch_bounds[:-1], patch_sizes):
        axmatrix.add_patch(
            mpl.patches.Rectangle((corner, corner),
                                  size,
                                  size,
                                  fill=False,
                                  ec='w',
                                  lw=2))

    # -----------------------------------------------------------------
    # -- calculate and plot best features -----------------------------
    # -----------------------------------------------------------------
    best_features_marker = []
    if measures is not None:
        measures_dendr = measures[1, index]
        # -- per cluster: position of the smallest value of measure row 1
        best_features_marker = [
            int(start + np.argmin(measures_dendr[start:stop]))
            for start, stop in zip(cluster_bounds[:-1], cluster_bounds[1:])
        ]
        axmatrix.scatter(best_features_marker, best_features_marker, color='w')

    axmatrix.set_xlim([-0.5, n_ops - 0.5])
    axmatrix.set_ylim([-0.5, n_ops - 0.5])

    # -- emphasise the tick labels of the best features
    best_positions = set(best_features_marker)
    for i, text in enumerate(axmatrix.get_yticklabels()):
        if i in best_positions:
            text.set_color('k')
            text.set_weight('bold')

    return index
# -- Load the data all_classes_avg = np.load(all_classes_avg_out_path) op_id_good = np.load(op_id_good_path) # -- Mask NaN entires all_classes_avg_good = np.ma.masked_invalid(all_classes_avg[:,op_id_good]) # -- load a reference HCTSA_loc.mat containing all op_ids import modules.misc.PK_matlab_IO as mIO op_ref_HCTSA_path = '/home/philip/work/OperationImportanceProject/results/done/HCTSA_Beef.mat' op, = mIO.read_from_mat_file(op_ref_HCTSA_path,['Operations'],is_from_old_matlab = True) max_feat = 50 # -- calculate the correlation abs_corr_array,sort_good_ind,all_classes_avg_good_norm = idtop.calc_perform_corr_mat(all_classes_avg_good,norm='z-score', max_feat = max_feat) # -- save the op id's in order of performance (first entry = best performance) np.save(op_id_order_path,op_id_good[sort_good_ind]) # -- sort the permutation vector that would sort the data array containing the good operations only np.save(sort_good_ind_path,sort_good_ind) # -- extract the top feature names names = hlp.ind_map_subset(op['id'], op['name'], op_id_good[sort_good_ind][:max_feat]) # -- Calculate the measures to be plotted problems_succ = (~all_classes_avg_good[:,sort_good_ind[:max_feat]].mask).sum(axis=0) u_stat_mean = all_classes_avg_good_norm[:,sort_good_ind[:max_feat]].mean(axis=0)