def calc_abs_corr(self):
     """
     Build the similarity array from the columns of self.ops_base_perf_vals.

     Calls idtop.calc_perform_corr_mat on self.ops_base_perf_vals with every column
     included (max_feat equals the number of columns). The first return value is stored
     in self.similarity_array; the second one is used to reorder self.good_perf_op_ids,
     and the result is stored in self.similarity_array_op_ids.
     """
     perf_vals = self.ops_base_perf_vals
     n_columns = perf_vals.shape[1]
     # -- norm=None: the best performing features have been picked already
     # -- (potentially using normalisation), so nothing is normalised again here
     corr_result = idtop.calc_perform_corr_mat(perf_vals,
                                               norm=None,
                                               max_feat=n_columns)
     self.similarity_array, order, _ = corr_result
     # -- keep the op ids aligned with the ordering of the similarity array
     self.similarity_array_op_ids = self.good_perf_op_ids[order]
 def calc_abs_corr(self):
     """
     Build the similarity array from the columns of self.ops_base_perf_vals.

     Calls idtop.calc_perform_corr_mat on self.ops_base_perf_vals with every column
     included (max_feat equals the number of columns). The first return value is stored
     in self.similarity_array; the second one is used to reorder self.good_perf_op_ids,
     and the result is stored in self.similarity_array_op_ids.
     """
     perf_vals = self.ops_base_perf_vals
     n_columns = perf_vals.shape[1]
     # -- norm=None: the best performing features have been picked already
     # -- (potentially using normalisation), so nothing is normalised again here
     corr_result = idtop.calc_perform_corr_mat(perf_vals,
                                               norm=None,
                                               max_feat=n_columns)
     self.similarity_array, order, _ = corr_result
     # -- keep the op ids aligned with the ordering of the similarity array
     self.similarity_array_op_ids = self.good_perf_op_ids[order]
Beispiel #3
0
def plot_arr_dendrogram_bak(abs_corr_array, names):
    """
    Draw a dendrogram on top of the correlation matrix reordered by the dendrogram leaves
    Parameters:
    ----------
    abs_corr_array : ndarray
        array containing the correlation matrix
    names : list
        list of strings containing the names of the operations in abs_corr_array in the
        corresponding order.
    Returns:
    --------
    index : list
        list of indices (dendrogram leaves) used to reorder the correlation matrix
    """
    # -- axes rectangles passed to fig.add_axes
    dendro_rect = [0.25, 0.76, 0.65, 0.23]
    matrix_rect = [0.25, 0.1, 0.65, 0.65]
    colorbar_rect = [0.91, 0.1, 0.02, 0.65]

    canvas = plt.figure(figsize=(10, 10))

    # -- dendrogram panel: created first so that it is the current axes when
    # -- hierarchy.dendrogram draws
    dendro_ax = canvas.add_axes(dendro_rect)
    linkage_arr = idtop.calc_linkage(abs_corr_array)[0]
    dendro = hierarchy.dendrogram(linkage_arr, orientation='top')
    dendro_ax.set_xticks([])
    dendro_ax.set_yticks([])

    # -- matrix panel: rows and columns are both permuted into leaf order
    matrix_ax = canvas.add_axes(matrix_rect)
    index = dendro['leaves']
    reordered = abs_corr_array[index, :][:, index]
    image = matrix_ax.matshow(reordered,
                              aspect='auto',
                              origin='upper',
                              vmin=0,
                              vmax=1)
    matrix_ax.set_xticks([])
    matrix_ax.set_yticks(range(len(index)))
    matrix_ax.set_yticklabels(np.array(names)[index])

    # -- colorbar panel
    colorbar_ax = canvas.add_axes(colorbar_rect)
    plt.colorbar(image, cax=colorbar_ax)
    # -- the figure is not written to disk here (the former savefig call is disabled)

    return index
 def calc_hierch_cluster(self, t=0.2, criterion='distance'):
     """
     Calculate the clustering using hierarchical clustering

     Stores the linkage computed from self.similarity_array in self.linkage, the flat
     cluster label of every entry in self.cluster_inds and, in self.cluster_op_id_list,
     one list of op ids (taken from self.similarity_array_op_ids) per cluster.
     Parameters:
     -----------
     t : float
         The threshold to apply when forming flat clusters.
     criterion : str, optional
         The criterion to use in forming flat clusters.
     """
     self.linkage = idtop.calc_linkage(self.similarity_array)[0]
     self.cluster_inds = hierarchy.fcluster(self.linkage,
                                            t=t,
                                            criterion=criterion)
     # -- map index to op_id and create list of lists representing clusters
     # -- (range instead of xrange: xrange does not exist on Python 3)
     n_clusters = int(self.cluster_inds.max())
     self.cluster_op_id_list = [[] for _ in range(n_clusters)]
     for i, cluster_ind in enumerate(self.cluster_inds):
         # -- cluster labels are used as 1-based positions into the list
         self.cluster_op_id_list[cluster_ind - 1].append(
             self.similarity_array_op_ids[i])
def plot_arr_dendrogram_bak(abs_corr_array, names):
    """
    Draw a dendrogram on top of the correlation matrix reordered by the dendrogram leaves
    Parameters:
    ----------
    abs_corr_array : ndarray
        array containing the correlation matrix
    names : list
        list of strings containing the names of the operations in abs_corr_array in the
        corresponding order.
    Returns:
    --------
    index : list
        list of indices (dendrogram leaves) used to reorder the correlation matrix
    """
    # -- axes rectangles passed to fig.add_axes
    dendro_rect = [0.25, 0.76, 0.65, 0.23]
    matrix_rect = [0.25, 0.1, 0.65, 0.65]
    colorbar_rect = [0.91, 0.1, 0.02, 0.65]

    canvas = plt.figure(figsize=(10, 10))

    # -- dendrogram panel: created first so that it is the current axes when
    # -- hierarchy.dendrogram draws
    dendro_ax = canvas.add_axes(dendro_rect)
    linkage_arr = idtop.calc_linkage(abs_corr_array)[0]
    dendro = hierarchy.dendrogram(linkage_arr, orientation='top')
    dendro_ax.set_xticks([])
    dendro_ax.set_yticks([])

    # -- matrix panel: rows and columns are both permuted into leaf order
    matrix_ax = canvas.add_axes(matrix_rect)
    index = dendro['leaves']
    reordered = abs_corr_array[index, :][:, index]
    image = matrix_ax.matshow(reordered,
                              aspect='auto',
                              origin='upper',
                              vmin=0,
                              vmax=1)
    matrix_ax.set_xticks([])
    matrix_ax.set_yticks(range(len(index)))
    matrix_ax.set_yticklabels(np.array(names)[index])

    # -- colorbar panel
    colorbar_ax = canvas.add_axes(colorbar_rect)
    plt.colorbar(image, cax=colorbar_ax)
    # -- the figure is not written to disk here (the former savefig call is disabled)

    return index
 def calc_hierch_cluster(self, t=0.2, criterion='distance'):
     """
     Calculate the clustering using hierarchical clustering

     Stores the linkage computed from self.similarity_array in self.linkage, the flat
     cluster label of every entry in self.cluster_inds and, in self.cluster_op_id_list,
     one list of op ids (taken from self.similarity_array_op_ids) per cluster.
     Parameters:
     -----------
     t : float
         The threshold to apply when forming flat clusters.
     criterion : str, optional
         The criterion to use in forming flat clusters.
     """
     self.linkage = idtop.calc_linkage(self.similarity_array)[0]
     self.cluster_inds = hierarchy.fcluster(self.linkage,
                                            t=t,
                                            criterion=criterion)
     # -- map index to op_id and create list of lists representing clusters
     # -- (range instead of xrange: xrange does not exist on Python 3)
     n_clusters = int(self.cluster_inds.max())
     self.cluster_op_id_list = [[] for _ in range(n_clusters)]
     for i, cluster_ind in enumerate(self.cluster_inds):
         # -- cluster labels are used as 1-based positions into the list
         self.cluster_op_id_list[cluster_ind - 1].append(
             self.similarity_array_op_ids[i])
Beispiel #7
0
# -- Read the stored arrays from disk
all_classes_avg = np.load(all_classes_avg_out_path)
op_id_good = np.load(op_id_good_path)

# -- Keep the columns of the good operations and mask invalid entries
all_classes_avg_good = np.ma.masked_invalid(all_classes_avg[:, op_id_good])

# -- Read the 'Operations' entry of a reference HCTSA .mat file (used below for op ids and names)
import modules.misc.PK_matlab_IO as mIO
op_ref_HCTSA_path = '/home/philip/work/OperationImportanceProject/results/done/HCTSA_Beef.mat'
(op,) = mIO.read_from_mat_file(op_ref_HCTSA_path, ['Operations'],
                               is_from_old_matlab=True)

max_feat = 50
# -- correlation matrix, sorting permutation and normalised data from the z-scored input
corr_result = idtop.calc_perform_corr_mat(all_classes_avg_good,
                                          norm='z-score',
                                          max_feat=max_feat)
abs_corr_array, sort_good_ind, all_classes_avg_good_norm = corr_result

# -- store the op ids in order of performance (first entry = best performance)
op_id_sorted = op_id_good[sort_good_ind]
np.save(op_id_order_path, op_id_sorted)
# -- store the permutation vector that sorts the array holding only the good operations
np.save(sort_good_ind_path, sort_good_ind)

# -- names of the first max_feat operations in that order
names = hlp.ind_map_subset(op['id'], op['name'], op_id_sorted[:max_feat])

# -- Measures to be plotted, both restricted to the first max_feat sorted columns:
top_ind = sort_good_ind[:max_feat]
# -- number of unmasked (valid) entries per column
problems_succ = (~all_classes_avg_good[:, top_ind].mask).sum(axis=0)
# -- column mean of the normalised data
u_stat_mean = all_classes_avg_good_norm[:, top_ind].mean(axis=0)
Beispiel #8
0
def plot_arr_dendrogram(abs_corr_array,
                        names,
                        max_dist_cluster,
                        measures=None):
    """
    Compute dendrogram and create a plot showing the dendrogram next to abs_corr_array

    The matrix is reordered by the dendrogram leaves. Flat clusters obtained by cutting
    the linkage at max_dist_cluster are outlined with white squares. If measures is
    given, the entry with the smallest measures[1] value inside every cluster is marked
    with a white dot and its tick label is set in bold.
    Parameters:
    ----------
    abs_corr_array : ndarray
        array containing the correlation matrix
    names : list
        list of strings containing the names of the operations in abs_corr_array in the
        corresponding order.
    max_dist_cluster : float
        Maximum distance in the clusters
    measures : ndarray (n_measures x abs_corr_array.shape[0]), optional
        Array containing measures for the operations, positions corresponding to the
        positions of operations in abs_corr_array. Only row 1 is used (to pick the
        marked entry of each cluster). If None, no entries are marked.
    Returns:
    --------
    index : list
        list of indices used to reorder the correlation matrix
    """

    figsize = (18, 12)
    #figsize=(46.81,33.11)
    # -- rect_measures is only needed by the disabled measures panel further down
    rect_measures = [0.25, 0.8075, 0.5, 0.15]
    rect_dendro = [0.755, 0.02, 0.15, 0.94]
    rect_matrix = [0.175, 0.02, 0.55, 0.94]
    rect_color = [0.92, 0.02, 0.02, 0.94]

    # Compute and plot dendrogram.
    fig = plt.figure(figsize=figsize)
    axdendro = fig.add_axes(rect_dendro)
    corr_linkage = idtop.calc_linkage(abs_corr_array)[0]

    corr_dendrogram = hierarchy.dendrogram(corr_linkage,
                                           orientation='left',
                                           color_threshold=max_dist_cluster)
    #axdendro.set_xticks([])
    axdendro.set_yticks([])
    axdendro.axvline(max_dist_cluster, ls='--', c='k')
    axdendro.set_xlabel('correlation distance')

    # Plot distance matrix.
    axmatrix = fig.add_axes(rect_matrix)
    index = corr_dendrogram['leaves']
    abs_corr_array = abs_corr_array[index, :]
    abs_corr_array = abs_corr_array[:, index]

    # -- plot the correlation matrix
    vmin = round(np.min(abs_corr_array), 1)
    vmax = 1
    # -- steps of 0.05 in correlation; the colormap size has to be an integer, so round
    # -- (plain truncation would turn e.g. 13.999... into 13) and use at least one level
    numSteps = max(1, int(round((vmax - vmin) * 20)))
    im = axmatrix.matshow(abs_corr_array,
                          aspect='auto',
                          origin='lower',
                          vmin=vmin,
                          vmax=vmax,
                          cmap=plt.get_cmap('turbo', numSteps))

    axmatrix.set_xticks([])
    axmatrix.set_yticks(range(len(index)))
    #axmatrix.set_yticklabels(np.array(names)[index],fontsize=5)
    axmatrix.set_yticklabels(np.array(names)[index],
                             fontsize=5.6)  #,rotation =45)

    # Plot colorbar.
    axcolor = fig.add_axes(rect_color)
    cbar = plt.colorbar(im, cax=axcolor)
    cbar.set_label('Pearson correlation')

    # Plot the quality measures (disabled)
    # axmeasure = fig.add_axes(rect_measures)
    # axmeasure.xaxis.set_ticklabels([])
    # axmeasure.scatter(np.arange(0,measures.shape[-1])+0.5,measures[0,index])
    # axmeasure.set_xlim([0,measures.shape[-1]])
    # axmeasure.set_ylabel('problems calculated')
    # axmeasure.yaxis.label.set_color('b')
    # [label.set_color('b') for label in axmeasure.get_yticklabels()]
    # axmeasure2 = axmeasure.twinx()
    # axmeasure2.plot(np.arange(0,measures.shape[-1])+0.5,measures[1,index],color='r')
    # axmeasure2.set_xlim([0,measures.shape[-1]])
    #
    # [label.set_color('r') for label in axmeasure2.get_yticklabels()]
    # axmeasure2.set_ylabel('avg classification accuracy')
    # axmeasure2.yaxis.label.set_color('r')

    # -----------------------------------------------------------------
    # -- calculate and plot clusters ----------------------------------
    # -----------------------------------------------------------------
    #cluster_ind = hierarchy.fcluster(link_arr, t=cluster_t, criterion=cluster_criterion)
    cluster_ind = hierarchy.fcluster(corr_linkage,
                                     t=max_dist_cluster,
                                     criterion='distance')

    # -- positions (in leaf order) where the cluster label changes, framed by 0 and the
    # -- number of entries: consecutive pairs delimit one cluster each
    cluster_bounds = np.hstack((-1, np.nonzero(np.diff(
        cluster_ind[index]))[0], abs_corr_array.shape[0] - 1)) + 1
    # (disabled) delimiters for the measures panel:
    # for bound in cluster_bounds:
    #     axmeasure.axvline(bound,linestyle='--',color='k')

    # -- calculate the locations for the cluster squares; shifted by half a cell so
    # -- that the squares run along the cell edges
    patch_bounds = cluster_bounds - .5
    patch_sizes = np.diff(patch_bounds)
    for corner, size in zip(patch_bounds[:-1], patch_sizes):
        axmatrix.add_patch(
            mpl.patches.Rectangle((corner, corner),
                                  size,
                                  size,
                                  fill=False,
                                  ec='w',
                                  lw=2))

    # -----------------------------------------------------------------
    # -- calculate and plot best features -----------------------------
    # -----------------------------------------------------------------
    best_features_marker = []
    if measures is not None:
        # -- row 1 of measures in leaf order; the smallest value per cluster is marked
        measures_dendr = measures[1, index]
        for (i, j) in zip(cluster_bounds[:-1], cluster_bounds[1:]):
            best_features_marker.append(i + np.argmin(measures_dendr[i:j]))
        axmatrix.scatter(best_features_marker, best_features_marker, color='w')

    axmatrix.set_xlim([-0.5, abs_corr_array.shape[0] - 0.5])
    axmatrix.set_ylim([-0.5, abs_corr_array.shape[0] - 0.5])

    # -- emphasise the tick labels of the marked entries
    best_positions = set(int(pos) for pos in best_features_marker)
    for i, text in enumerate(axmatrix.get_yticklabels()):
        if i in best_positions:
            text.set_color('k')
            text.set_weight('bold')

    return index
Beispiel #9
0
# -- Paths of the intermediate result files
problem_names_path = intermediate_data_root + 'problem_names.npy'
measures_problems_path = intermediate_data_root + 'measure_problems.npy'

# -- Read the stored arrays from disk
all_classes_avg = np.load(all_classes_avg_out_path)
op_id_order = np.load(op_id_order_path)
op_id_good = np.load(op_id_good_path)

# -- Number of features to keep and the maximum correlation distance
max_feat = 50
max_corr_dist = 0.2

# -- Disabled alternative: mask all nan values of the top features, z-score and correlate by hand
# all_classes_avg_top = np.ma.masked_invalid(all_classes_avg[:,op_id_order[:100]])
# # -- calculate the z-score of the u stat array
# all_classes_avg_top = ((all_classes_avg_top.T - np.ma.mean(all_classes_avg_top,axis=1)) / np.ma.std(all_classes_avg_top,axis=1)).T
# abs_corr_array = np.abs(np.ma.corrcoef(all_classes_avg_top, rowvar=0))

# -- correlation array, sorting permutation and normalised data for the good operations
good_columns = all_classes_avg[:, op_id_good]
corr_result = idtop.calc_perform_corr_mat(good_columns,
                                          norm='z-score',
                                          max_feat=max_feat)
abs_corr_array, sort_good_ind, all_classes_avg_good_norm = corr_result

# -- first max_feat sorted columns of the good operations, invalid entries masked
# -- (selected afresh from all_classes_avg rather than from the array passed above)
top_columns = all_classes_avg[:, op_id_good][:, sort_good_ind[:max_feat]]
all_classes_avg_top = np.ma.masked_invalid(top_columns)

# -- linkage computed from the correlation array
corr_linkage = idtop.calc_linkage(abs_corr_array)[0]

# -- extract operation names ----------------------------------------------
# -- read the 'Operations' entry of a reference HCTSA .mat file
op_ref_HCTSA_path = '/home/philip/work/OperationImportanceProject/results/done/HCTSA_Beef.mat'
(op,) = mIO.read_from_mat_file(op_ref_HCTSA_path, ['Operations'],
                               is_from_old_matlab=True)

# -- op ids of the first max_feat operations and their names
top_id = op_id_good[sort_good_ind][:max_feat]
names = hlp.ind_map_subset(op['id'],
                           op['name'],
                           op_id_good[sort_good_ind][:max_feat])
# -- Paths of the intermediate result files
problem_names_path = intermediate_data_root + 'problem_names.npy'
measures_problems_path = intermediate_data_root + 'measure_problems.npy'
# -- Load the data
all_classes_avg = np.load(all_classes_avg_out_path)
op_id_order = np.load(op_id_order_path)
op_id_good = np.load(op_id_good_path)
max_feat = 50
max_corr_dist = 0.2
# -- Disabled alternative: mask all nan values of the top features, z-score and correlate by hand
# all_classes_avg_top = np.ma.masked_invalid(all_classes_avg[:,op_id_order[:100]])
# # -- calculate the z-score of the u stat array
# all_classes_avg_top = ((all_classes_avg_top.T - np.ma.mean(all_classes_avg_top,axis=1)) / np.ma.std(all_classes_avg_top,axis=1)).T
# abs_corr_array = np.abs(np.ma.corrcoef(all_classes_avg_top, rowvar=0))

# -- calculate the correlation array with respect to performance and mask nan.
abs_corr_array, sort_good_ind, all_classes_avg_good_norm = idtop.calc_perform_corr_mat(
    all_classes_avg[:, op_id_good], norm='z-score', max_feat=max_feat)
all_classes_avg_top = np.ma.masked_invalid(
    all_classes_avg[:, op_id_good][:, sort_good_ind[:max_feat]])

# -- calculate the linkage for the correlation
corr_linkage = idtop.calc_linkage(abs_corr_array)[0]

# -- extract operation names ----------------------------------------------
# -- load a reference HCTSA_loc.mat containing all op_ids
op_ref_HCTSA_path = '/home/philip/work/OperationImportanceProject/results/done/HCTSA_Beef.mat'
op, = mIO.read_from_mat_file(op_ref_HCTSA_path, ['Operations'],
                             is_from_old_matlab=True)

top_id = op_id_good[sort_good_ind][:max_feat]
names = hlp.ind_map_subset(op['id'], op['name'],
                           op_id_good[sort_good_ind][:max_feat])

# -- extract problem names ------------------------------------------------
# -- raw string: '\/' is not a valid escape in a normal string literal; the compiled
# -- pattern is unchanged. The single group captures the text between 'HCTSA_' and
# -- '_N_70_100_reduced'.
# -- NOTE(review): the '.' before 'mat' is unescaped and matches any character - confirm
# -- whether a literal dot was intended.
reg_ex = re.compile(r'.*\/HCTSA_(.*)_N_70_100_reduced.mat')
def plot_arr_dendrogram(abs_corr_array, names, max_dist_cluster, measures=None):
    """
    Compute dendrogram and create a plot showing the dendrogram next to abs_corr_array

    The matrix is reordered by the dendrogram leaves. Flat clusters obtained by cutting
    the linkage at max_dist_cluster are outlined with white squares. If measures is
    given, rows 0 and 1 are drawn in a panel above the matrix, and the entry with the
    smallest measures[1] value inside every cluster is marked with a white dot and a
    bold tick label.
    Parameters:
    ----------
    abs_corr_array : ndarray
        array containing the correlation matrix
    names : list
        list of strings containing the names of the operations in abs_corr_array in the
        corresponding order.
    max_dist_cluster : float
        Maximum distance in the clusters
    measures : ndarray (n_measures x abs_corr_array.shape[0]), optional
        Array containing measures to be plotted on top of the matrix. Positions
        corresponding to the positions of operations in abs_corr_array. If None, the
        measures panel and the per-cluster markers are skipped.
    Returns:
    --------
    index : list
        list of indices used to reorder the correlation matrix
    """

    figsize = (18, 12)
    #figsize=(46.81,33.11)
    rect_measures = [0.25, 0.8075, 0.5, 0.15]
    rect_dendro = [0.755, 0.05, 0.15, 0.75]
    rect_matrix = [0.25, 0.05, 0.5, 0.75]
    rect_color = [0.92, 0.05, 0.02, 0.75]

    # Compute and plot dendrogram.
    fig = plt.figure(figsize=figsize)
    axdendro = fig.add_axes(rect_dendro)
    corr_linkage = idtop.calc_linkage(abs_corr_array)[0]

    corr_dendrogram = hierarchy.dendrogram(corr_linkage, orientation='left')
    #axdendro.set_xticks([])
    axdendro.set_yticks([])
    axdendro.axvline(max_dist_cluster, ls='--', c='k')

    # Plot distance matrix.
    axmatrix = fig.add_axes(rect_matrix)
    index = corr_dendrogram['leaves']
    abs_corr_array = abs_corr_array[index, :]
    abs_corr_array = abs_corr_array[:, index]

    # -- plot the correlation matrix
    im = axmatrix.matshow(abs_corr_array,
                          aspect='auto',
                          origin='lower',
                          vmin=0,
                          vmax=1)

    axmatrix.set_xticks([])
    axmatrix.set_yticks(range(len(index)))
    #axmatrix.set_yticklabels(np.array(names)[index],fontsize=5)
    axmatrix.set_yticklabels(np.array(names)[index])

    # Plot colorbar.
    axcolor = fig.add_axes(rect_color)
    plt.colorbar(im, cax=axcolor)

    # Plot the quality measures (only when measures were supplied; the default
    # measures=None would otherwise fail on measures.shape)
    axmeasure = None
    if measures is not None:
        n_entries = measures.shape[-1]
        # -- points are placed at cell centres, hence the +0.5
        x_positions = np.arange(0, n_entries) + 0.5

        axmeasure = fig.add_axes(rect_measures)
        axmeasure.xaxis.set_ticklabels([])
        axmeasure.scatter(x_positions, measures[0, index])
        axmeasure.set_xlim([0, n_entries])
        axmeasure.set_ylabel('problems calculated')
        axmeasure.yaxis.label.set_color('b')
        for label in axmeasure.get_yticklabels():
            label.set_color('b')

        # -- second y axis sharing the x axis for row 1 of measures
        axmeasure2 = axmeasure.twinx()
        axmeasure2.plot(x_positions, measures[1, index], color='r')
        axmeasure2.set_xlim([0, n_entries])
        for label in axmeasure2.get_yticklabels():
            label.set_color('r')
        axmeasure2.set_ylabel('z-scored avg u-stat')
        axmeasure2.yaxis.label.set_color('r')

    # -----------------------------------------------------------------
    # -- calculate and plot clusters ----------------------------------
    # -----------------------------------------------------------------
    #cluster_ind = hierarchy.fcluster(link_arr, t=cluster_t, criterion=cluster_criterion)
    cluster_ind = hierarchy.fcluster(corr_linkage,
                                     t=max_dist_cluster,
                                     criterion='distance')

    # -- positions (in leaf order) where the cluster label changes, framed by 0 and the
    # -- number of entries: consecutive pairs delimit one cluster each
    cluster_bounds = np.hstack((-1, np.nonzero(np.diff(
        cluster_ind[index]))[0], abs_corr_array.shape[0] - 1)) + 1
    # -- plot delimiters for measures
    if axmeasure is not None:
        for bound in cluster_bounds:
            axmeasure.axvline(bound, linestyle='--', color='k')

    # -- calculate the locations for the cluster squares; shifted by half a cell so
    # -- that the squares run along the cell edges
    patch_bounds = cluster_bounds - .5
    patch_sizes = np.diff(patch_bounds)
    for corner, size in zip(patch_bounds[:-1], patch_sizes):
        axmatrix.add_patch(
            mpl.patches.Rectangle((corner, corner),
                                  size,
                                  size,
                                  fill=False,
                                  ec='w',
                                  lw=2))

    # -----------------------------------------------------------------
    # -- calculate and plot best features -----------------------------
    # -----------------------------------------------------------------
    best_features_marker = []
    if measures is not None:
        # -- row 1 of measures in leaf order; the smallest value per cluster is marked
        measures_dendr = measures[1, index]
        for (i, j) in zip(cluster_bounds[:-1], cluster_bounds[1:]):
            best_features_marker.append(i + np.argmin(measures_dendr[i:j]))
        axmatrix.scatter(best_features_marker, best_features_marker, color='w')

    axmatrix.set_xlim([-0.5, abs_corr_array.shape[0] - 0.5])
    axmatrix.set_ylim([-0.5, abs_corr_array.shape[0] - 0.5])

    # -- emphasise the tick labels of the marked entries
    best_positions = set(int(pos) for pos in best_features_marker)
    for i, text in enumerate(axmatrix.get_yticklabels()):
        if i in best_positions:
            text.set_color('k')
            text.set_weight('bold')

    return index
# -- Read the stored arrays from disk
all_classes_avg = np.load(all_classes_avg_out_path)
op_id_good = np.load(op_id_good_path)

# -- Keep the columns of the good operations and mask invalid entries
all_classes_avg_good = np.ma.masked_invalid(all_classes_avg[:, op_id_good])

# -- Read the 'Operations' entry of a reference HCTSA .mat file (used below for op ids and names)
import modules.misc.PK_matlab_IO as mIO
op_ref_HCTSA_path = '/home/philip/work/OperationImportanceProject/results/done/HCTSA_Beef.mat'
(op,) = mIO.read_from_mat_file(op_ref_HCTSA_path, ['Operations'],
                               is_from_old_matlab=True)

max_feat = 50
# -- correlation matrix, sorting permutation and normalised data from the z-scored input
corr_result = idtop.calc_perform_corr_mat(all_classes_avg_good,
                                          norm='z-score',
                                          max_feat=max_feat)
abs_corr_array, sort_good_ind, all_classes_avg_good_norm = corr_result

# -- store the op ids in order of performance (first entry = best performance)
op_id_sorted = op_id_good[sort_good_ind]
np.save(op_id_order_path, op_id_sorted)
# -- store the permutation vector that sorts the array holding only the good operations
np.save(sort_good_ind_path, sort_good_ind)

# -- names of the first max_feat operations in that order
names = hlp.ind_map_subset(op['id'], op['name'], op_id_sorted[:max_feat])

# -- Measures to be plotted, both restricted to the first max_feat sorted columns:
top_ind = sort_good_ind[:max_feat]
# -- number of unmasked (valid) entries per column
problems_succ = (~all_classes_avg_good[:, top_ind].mask).sum(axis=0)
# -- column mean of the normalised data
u_stat_mean = all_classes_avg_good_norm[:, top_ind].mean(axis=0)