Example #1
    def plot_similarity_array(self):

        abs_corr_array = self.workflow.redundancy_method.similarity_array

        op_id_name_map = self.map_op_id_name_mult_task(self.workflow.tasks)
        names = hlp.ind_map_subset(
            op_id_name_map[0], op_id_name_map[1],
            self.workflow.redundancy_method.similarity_array_op_ids)
        measures = np.zeros((2, len(names)))

        tmp_ind = hlp.ismember(
            self.workflow.redundancy_method.similarity_array_op_ids,
            self.workflow.good_op_ids)

        # -- number of problems for which each good-performing feature has been calculated
        measures[0, :] = (~self.workflow.stats_good_op[:, tmp_ind].mask).sum(
            axis=0)
        # -- z-scored u-stat (computed over all features) for the top features
        stats_good_op_z_score = fap.normalise_masked_array(
            self.workflow.stats_good_op_comb, axis=0, norm_type='zscore')[0]
        measures[1, :] = stats_good_op_z_score[tmp_ind]

        fiplt.plot_arr_dendrogram(abs_corr_array,
                                  names,
                                  max_dist_cluster=self.max_dist_cluster,
                                  measures=measures)
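
A note on helpers: this snippet leans on two functions from the repository's `hlp` module that are not shown on this page. Their real implementations are not given here, so the following is only a minimal sketch of the semantics the calls above appear to assume: `hlp.ismember(a, b)` returns, for each element of `a`, its index in `b`, and `hlp.ind_map_subset(keys, values, subset)` maps a subset of keys to their associated values.

import numpy as np

def ismember_sketch(a, b):
    # For each element of `a`, the index at which it occurs in `b`.
    # Assumes every element of `a` is present in `b`; the masked variant
    # used in other examples on this page would mask missing elements.
    lookup = {val: ind for ind, val in enumerate(b)}
    return np.array([lookup[val] for val in a])

def ind_map_subset_sketch(keys, values, subset):
    # Return the entries of `values` whose `keys` appear in `subset`,
    # preserving the order of `subset`.
    lookup = dict(zip(keys, values))
    return [lookup[key] for key in subset]

print(ismember_sketch([20, 10], [10, 20, 30]))            # [1 0]
print(ind_map_subset_sketch([10, 20], ['a', 'b'], [20]))  # ['b']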
Example #4
 def reduce_to_good_perf_ops(self, ops_base_vals, good_perf_op_ids,
                             good_op_ids):
     """
     Reduce the ops_base_vals by keeping only the columns corresponding to the op_ids in self.good_perf_op_ids
     Parameters:
     -----------
     ops_base_vals : nd array
         Array containing the values on which the similarity of the operations will be calculated
     good_op_ids : ndarray
         The op_ids of the columns in  ops_base_vals
     good_perf_op_ids : ndarray
         The op_ids of the features we are interested in
     Returns:
     --------
     ops_base_perf_vals : ndarray
         ops_base_vals reduced to contain only operations with ids given in good_perf_op_ids with the same ordering.
     """
     good_perf_ind = hlp.ismember(good_perf_op_ids, good_op_ids)
     ops_base_perf_vals = ops_base_vals[:, good_perf_ind]
     return ops_base_perf_vals
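
A quick usage sketch of `reduce_to_good_perf_ops` with hypothetical toy values (a plain list comprehension stands in for `hlp.ismember`, which is assumed to return, for each requested op_id, its column index in `good_op_ids`):

import numpy as np

ops_base_vals = np.arange(12).reshape(3, 4)   # 3 samples x 4 operations
good_op_ids = np.array([11, 42, 7, 99])       # op_id of each column
good_perf_op_ids = np.array([99, 11])         # the features we want, in this order

# stand-in for hlp.ismember(good_perf_op_ids, good_op_ids)
good_perf_ind = [list(good_op_ids).index(op) for op in good_perf_op_ids]  # [3, 0]

ops_base_perf_vals = ops_base_vals[:, good_perf_ind]
# columns of op 99 and op 11, in that order:
# [[ 3  0]
#  [ 7  4]
#  [11  8]]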
Example #5
 def collect_stats_good_op_ids(self):
     """
     Collect all combined stats for each task and take stats for good operations only
     """
     #stats_good_op_ma = np.empty((data.shape[0],np.array(self.good_op_ids).shape[0]))
     stats_good_op_tmp = []
     #stats_good_op_ma[:] = np.NaN
     for task in self.tasks:
          # -- create a tmp array of good stats for the current task. To handle the different
          # -- possible dimensions of task.tot_stats, we temporarily transpose
          # -- stats_good_op_ma_tmp so that rows correspond to features
          if task.tot_stats.ndim > 1:
              stats_good_op_ma_tmp = np.empty((self.good_op_ids.shape[0], task.tot_stats.shape[0]))
          else:
              stats_good_op_ma_tmp = np.empty(self.good_op_ids.shape[0])
          stats_good_op_ma_tmp[:] = np.nan
         
          ind = hlp.ismember(task.op_ids, self.good_op_ids,
                             is_return_masked_array=True, return_dtype=int)
          # -- `it` is the position in task.op_ids and `i` the position in self.good_op_ids
          for it, i in enumerate(ind):
              if i is not np.ma.masked:  # -- the entry in task.op_ids is also in self.good_op_ids
                  stats_good_op_ma_tmp[i] = task.tot_stats[it].T
         # -- We return to the usual ordering: column equals feature
         stats_good_op_tmp.append(stats_good_op_ma_tmp.T)
     self.stats_good_op = np.ma.masked_invalid(np.vstack(stats_good_op_tmp))
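
The `i is not np.ma.masked` test above is what skips operations that are absent from `self.good_op_ids`. Here is a small self-contained illustration of that idiom, with a hand-built masked index array standing in for the output of `hlp.ismember(..., is_return_masked_array=True, return_dtype=int)`:

import numpy as np

# position of each task op_id inside good_op_ids, masked where absent
ind = np.ma.array([2, 0, 0], mask=[False, False, True], dtype=int)

src = np.array([10.0, 20.0, 30.0])   # stats in task order
dest = np.full(3, np.nan)            # stats in good_op_ids order
for it, i in enumerate(ind):
    if i is not np.ma.masked:        # op_id also occurs in good_op_ids
        dest[i] = src[it]

print(dest)  # [20. nan 10.] -- the masked entry (src[2]) was dropped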
Example #6
def cat_data_op_subset(file_paths,
                       op_id_top,
                       is_from_old_matlab=False,
                       is_return_masked=True):
    """
    Concatenate the features where op_id is in op_id_top for all HCTSA_loc.m files pointed to by file_paths.
    Warning, this can take a while and the returned data matrix can be very large.
    XXX WARNING XXX This only works correctly if all HCTSA_loc.mat files come from the same
    database. Meaning op_ids are the same. Otherwise one would have to go through operation names which is
    only a little more work to implement. XXX
    Parameters:
    -----------
    file_paths : list
        list of file paths pointing to the files containing the data
    op_id_top : list,ndarray
        list of operation ids wanted in the concatenated data array
    is_from_old_matlab : bool
        If the HCTSA_loc.mat files are saved from an older version of the comp engine. The order of entries is different.
    is_return_masked : boolean
        Saving large masked arrays to disk can lead to memory errors while pickling. If this is false funtion
         returns a normal ndarray with unknown entires are set to NaN. This can be converted to a masked array with 
         data_all = np.ma.masked_invalid(data_all)
    Returns:
    --------
    data_all : ndarray/masked ndarray
        Concatenated data array
   """
    is_first = True
    data_all = None

    for file_path in file_paths:
        print "Adding data from {:s} \n to complete data matrix".format(
            file_path)
        data, op = mIO.read_from_mat_file(
            file_path, ['TS_DataMat', 'Operations'],
            is_from_old_matlab=is_from_old_matlab)

        # -- find the indices in the data for op_id_top
        ind = hlp.ismember(op['id'],
                           op_id_top,
                           is_return_masked_array=True,
                           return_dtype=int)
        # -- if any of the operations was not calculated for this problem
        # -- create a masked array and copy only valid data and mask
        # -- invalid data
        if not np.array_equal(ind.data, op_id_top):
            # -- create a masked array filled with NaN.
            # -- This makes later masking of non-existent entries easier
            # -- each column of data_ma corresponds to the op_id in op_id_top with the
            # -- same index (column i in data_ma corresponds to op_id_top[i])

            data_ma = np.empty((data.shape[0], np.array(op_id_top).shape[0]))
            data_ma[:] = np.nan
            for it, i in enumerate(ind):
                # -- if i is masked in ind that means that the current operation in data
                # -- is not part of op_id_top. We therefore do not need this operation to
                # -- be included in data_ma.
                if i is not np.ma.masked:
                    data_ma[:, i] = data[:, it]
        # -- otherwise pick all relevant features and also automatically sort them correctly (if necessary)
        else:
            data_ma = np.array(data[:, ind])

        # -- mask all NaN (not calculated) entries and stick them together
        #data_ma = np.ma.masked_invalid(data_ma)
        if is_first:
            data_all = data_ma
            is_first = False
        else:
            data_all = np.vstack((data_all, data_ma))
    # -- Saving a large masked array to disk can lead to Memory errors while using the pickle module.
    if is_return_masked:
        data_all = np.ma.masked_invalid(data_all)
    return data_all
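
The heart of the masked branch above is aligning the columns present in one file with the positions requested in `op_id_top`. A pure-numpy sketch of that alignment step, with hypothetical ids:

import numpy as np

op_id_top = [3, 8, 15]               # columns we want, in this order
op_ids_in_file = np.array([8, 3])    # operations actually present in one file
data = np.array([[0.1, 0.2],
                 [0.3, 0.4]])        # samples x present operations

pos = {op: j for j, op in enumerate(op_id_top)}

data_ma = np.full((data.shape[0], len(op_id_top)), np.nan)
for it, op in enumerate(op_ids_in_file):
    data_ma[:, pos[op]] = data[:, it]

data_all = np.ma.masked_invalid(data_ma)
# column 0 -> op 3, column 1 -> op 8, column 2 (op 15) is fully masked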
Example #7
def corelated_features_mask(data=None,
                            abs_corr_array=None,
                            calc_times=None,
                            op_ids=None):
    """
    Computes a mask that, when applied, removes correlated features from the data array.
    Parmeters:
    ----------
    data : ndarray
        A data matrix with rows represent training samples and columns represent features
    abs_corr_array : ndarray
        The correlation matrix of all features. Has to be given id data == none
    calc_times : ndarray
        Array where the first row corresponds to operation id's and the second row to calculation times
        for these operation ids
    op_ids : 
        The operation ids corresponding to the rows/columns in abs_corr_array
    Returns:
    -------
    mask : ndarray,dtype=bool
        1d array whose entries are one only for uncorrelated entries.
    abs_corr_arrayabs_corr_array : ndarray
        the correlation matrix
    """

    if abs_corr_array is None:
        abs_corr_array = np.abs(np.ma.corrcoef(data, rowvar=0))

    # -- Vector containing 0 for all operations we don't want
    mask = np.ones(abs_corr_array.shape[0], dtype='bool')

    for i in range(abs_corr_array.shape[0]):
        # -- if the current line represents an operation not yet eliminated
        if mask[i]:
            # -- remove operations which are highly correlated
            mask[(abs_corr_array[i] > 0.8)] = 0
            #----------------------------------------------------------
            # -- Use fastest operation in a set of correlated operations
            #----------------------------------------------------------
            if calc_times is not None and op_ids is not None:
                # -- find ind in abs_corr_array of correlated features
                ind_corr = np.nonzero((abs_corr_array[i] > 0.8))[0]

                # -- translate ind_corr to op ids
                op_id_corr = hlp.ind_map_subset(range(abs_corr_array.shape[0]),
                                                op_ids, ind_corr)

                # -- get calculation time of correlated op ids
                t_corr = hlp.ind_map_subset(calc_times[0], calc_times[1],
                                            op_id_corr)
                # -- check if all entries are None -> no timing information for any of the operations
                if np.nonzero(t_corr)[0].shape[0] == 0:
                    # -- pick the first operation as fastest as there is no timing information
                    op_id_corr_fastest = op_id_corr[0]
                # -- else pick the fastest operation
                else:
                    # -- get op_id of fastest operation in this correlated set
                    op_id_corr_fastest = op_id_corr[np.nanargmin(t_corr)]

                # -- get index of fastest operation in this correlated set
                ind_corr_fastest = hlp.ind_map_subset(
                    op_ids, range(abs_corr_array.shape[0]), op_id_corr_fastest)

                # -- add fastest index back in
                mask[ind_corr_fastest] = 1
            #----------------------------------------------------------
            # -- Use arbitrary operation in a set of correlated operations
            #----------------------------------------------------------
            else:
                mask[i] = 1
    return mask, abs_corr_array
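
To see the greedy 0.8-threshold selection in action, here is a tiny self-contained run with a hand-made correlation matrix and no timing information (so, as in the else branch above, the representative kept for each correlated set is the operation currently being examined):

import numpy as np

abs_corr_array = np.array([[1.0, 0.9, 0.1],
                           [0.9, 1.0, 0.2],
                           [0.1, 0.2, 1.0]])

mask = np.ones(3, dtype=bool)
for i in range(3):
    if mask[i]:
        mask[abs_corr_array[i] > 0.8] = False  # drop the whole correlated set
        mask[i] = True                         # ...then keep one representative

print(mask)  # [ True False  True] -- features 0 and 2 survive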
Example #8
op_ref_HCTSA_path = '/home/philip/work/OperationImportanceProject/results/done/HCTSA_Beef.mat'
op, = mIO.read_from_mat_file(op_ref_HCTSA_path, ['Operations'],
                             is_from_old_matlab=True)

max_feat = 50
# -- calculate the correlation
abs_corr_array, sort_good_ind, all_classes_avg_good_norm = idtop.calc_perform_corr_mat(
    all_classes_avg_good, norm='z-score', max_feat=max_feat)

# -- save the op id's in order of performance (first entry = best performance)
np.save(op_id_order_path, op_id_good[sort_good_ind])
# -- save the permutation vector that would sort the data array containing the good operations only
np.save(sort_good_ind_path, sort_good_ind)

# -- extract the top feature names
names = hlp.ind_map_subset(op['id'], op['name'],
                           op_id_good[sort_good_ind][:max_feat])

# -- Calculate the measures to be plotted
problems_succ = (~all_classes_avg_good[:, sort_good_ind[:max_feat]].mask).sum(
    axis=0)
u_stat_mean = all_classes_avg_good_norm[:,
                                        sort_good_ind[:max_feat]].mean(axis=0)

measures = np.vstack((problems_succ, u_stat_mean))

fiplt.plot_arr_dendrogram(abs_corr_array, names, measures=measures)
plt.savefig('/home/philip/Desktop/tmp/figure_tmp/corr_array.png')

plt.show()
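
`idtop.calc_perform_corr_mat` and `idtop.calc_linkage` are repository functions whose bodies are not shown on this page. Assuming they compute an absolute correlation matrix over the (normalised) feature columns and a hierarchical linkage on the induced distances, a rough scipy equivalent on stand-in data would be:

import numpy as np
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform

rng = np.random.default_rng(0)
feats = rng.normal(size=(30, 5))      # problems x features (stand-in data)

abs_corr_array = np.abs(np.corrcoef(feats, rowvar=False))
dist = 1.0 - abs_corr_array           # correlation distance
np.fill_diagonal(dist, 0.0)

corr_linkage = hierarchy.linkage(squareform(dist, checks=False),
                                 method='average')
dendro = hierarchy.dendrogram(corr_linkage, no_plot=True)
print(dendro['leaves'])               # leaf order used to sort the features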
Example #9
    def plot_stat_array(self):
        
        fig = plt.figure(figsize=(15, 15))
        # -- plot layout ------------------------------------------------------
        
        rect_ustat_arr = [0.25,0.175,.5,.5]
        rect_dendr = [0.755,0.175,.145,.5]
        rect_measures0 = [0.25,0.68,0.5,0.1]
        rect_measures1 = [0.25,0.785,0.5,0.1]
        rect_measures2 = [0.25,0.89,0.5,0.1]
        
        ax_ustat_arr = fig.add_axes(rect_ustat_arr)
        ax_dendr = fig.add_axes(rect_dendr)
        ax_measures00 = fig.add_axes(rect_measures0)
        ax_measures01 = plt.twinx(ax_measures00) 
        ax_measures10 = fig.add_axes(rect_measures1)
        ax_measures10.set_xticklabels([])
        ax_measures20 = fig.add_axes(rect_measures2)
        
        ax_measures20.set_xticklabels([])
        ax_measures21 = plt.twinx(ax_measures20)
        
        # -- calculate and plot the dendrogram
        dist_dendrogram = hierarchy.dendrogram(self.linkage, orientation='left',no_plot=True)
        hierarchy.dendrogram(self.linkage, orientation='left',p=50,truncate_mode ='lastp',ax = ax_dendr)
        
        ax_dendr.set_yticks([])
        ax_dendr.axvline(self.max_dist_cluster,ls='--',c='k')

        # -- plot sorted classification stat array ------------------------------------------

        # -- create index that sort rows to correspond to dendrogram
        feat_sort_ind = dist_dendrogram['leaves']
        # -- sort the good performant features so they have the same order as the similarity array
        sort_ind = hlp.ismember(self.workflow.redundancy_method.similarity_array_op_ids,self.workflow.redundancy_method.good_perf_op_ids)
        self.ops_base_perf_vals = self.ops_base_perf_vals[:,sort_ind]
        
        # -- create index that sort columns with respect to their mean value
        task_sort_ind = np.argsort(self.ops_base_perf_vals[:,feat_sort_ind].mean(axis=1))
        
        #all_classes_avg_top = ((all_classes_avg_top - np.ma.mean(all_classes_avg_top,axis=0)) / np.ma.std(all_classes_avg_top,axis=0))
        #all_classes_avg_top = fap.normalise_masked_array(self.ops_base_perf_vals, axis= 1,norm_type = 'zscore')[0]
        all_classes_avg_top = self.ops_base_perf_vals
        # -- plot the operation names as y-axis tick labels
        aspect = all_classes_avg_top.shape[0] / float(all_classes_avg_top.shape[1])
        ax_ustat_arr.matshow(all_classes_avg_top[task_sort_ind,:][:,feat_sort_ind].T,aspect=aspect,origin='lower')


        ax_ustat_arr.set_yticks(range(len(feat_sort_ind)))

        op_id_name_map = self.map_op_id_name_mult_task(self.workflow.tasks)
        names = hlp.ind_map_subset(op_id_name_map[0],op_id_name_map[1],self.workflow.redundancy_method.similarity_array_op_ids)
        ax_ustat_arr.set_yticklabels(np.array(names)[feat_sort_ind])
        # -- plot the problem names as x axis labels
        ax_ustat_arr.xaxis.tick_bottom()
        ax_ustat_arr.set_xticks(range(all_classes_avg_top.shape[0]))
        ax_ustat_arr.set_xticklabels(self.task_names[task_sort_ind],rotation='vertical')

        # -- plot clusters ----------------------------------
        
        cluster_bounds = np.nonzero(np.diff(self.workflow.redundancy_method.cluster_inds[feat_sort_ind]))[0]+0.5
        for cluster_bound in cluster_bounds:
            ax_ustat_arr.axhline(cluster_bound,c='w',lw=2)
    
        # --------------------------------------------------------------------------------
        # -- calculate and plot measures -------------------------------------------------
        # --------------------------------------------------------------------------------

        # -- nr samples and nr labels --------------------------------------------------
  
        n_samples_avg = [np.array(self.workflow.tasks[i].ts['n_samples']).mean() for i in task_sort_ind]
        n_classes = [len(set(self.workflow.tasks[i].labels)) for i in task_sort_ind]
        x_loc = np.arange(0,len(self.workflow.tasks))+0.5
        ax_measures00.scatter(x_loc,n_classes,c='b',s=40)
        ax_measures00.plot(x_loc,n_classes,c='b')
        [label.set_color('b') for label in ax_measures00.get_yticklabels()]
        ax_measures00.set_ylabel('nr classes')
        ax_measures00.yaxis.label.set_color('b')
        ax_measures00.set_ylim([0,max(n_classes)+1])
        ax_measures00.set_xticklabels([])
    
        ax_measures01.scatter(x_loc,n_samples_avg,c='r',s=40)
        ax_measures01.plot(x_loc,n_samples_avg,c='r')
        
        [label.set_color('r') for label in ax_measures01.get_yticklabels()]
        ax_measures01.set_ylabel('avg samples')
        ax_measures01.yaxis.label.set_color('r')
        ax_measures01.set_ylim([0,max(n_samples_avg)+100])
        
        ax_measures00.set_xlim([0,len(self.workflow.tasks)])
        ax_measures00.set_xticklabels([])
        

        # -- Classification stat measures --------------------------------------------------
        
        # -- minimum average classification stat for all features
        ax_measures10.plot(x_loc,np.min(self.workflow.stats_good_op[task_sort_ind,:],axis=1),marker='o',label='min. avg. classification stat all')

        # -- minimum average classification stat for top features
        ax_measures10.plot(x_loc,np.ma.min(self.ops_base_perf_vals[task_sort_ind,:],axis=1),marker='o',label='min. avg. classification stat top')
            
        # -- average minimum (for each class pair) classification stat for top features
        # XXX This would require task.pair_stats to be available (not saved as intermediate at the moment); then it is trivial to implement
        
        ax_measures10.legend(loc=2,fontsize='small',labelspacing=.1)
        ax_measures10.set_ylabel('classification stat')
        ax_measures10.set_xlim([0,len(self.workflow.tasks)])
        ax_measures10.set_ylim([0,0.5])

        # -- Mean classification stat and number of successfully calculated features --------------------------------------------------
        # -- mean average classification stat for all features
        ax_measures20.plot(x_loc,np.ma.mean(self.workflow.stats_good_op[task_sort_ind,:],axis=1),marker='o')
        [label.set_color('b') for label in ax_measures20.get_yticklabels()]
        ax_measures20.set_ylabel('avg classification stat all feat')
        ax_measures20.yaxis.label.set_color('b')
        
        # -- number of successfully calculated features

        ax_measures21.plot(x_loc,[len(self.workflow.tasks[i].op['id']) for i in task_sort_ind],c='r',marker='o')
        [label.set_color('r') for label in ax_measures21.get_yticklabels()]
        ax_measures21.set_ylabel('nr calc feat')
        ax_measures21.yaxis.label.set_color('r')
        
        ax_measures20.set_xlim([0,len(self.workflow.tasks)])
Example #10
    def plot_stat_array(self):
        
        fig = plt.figure(figsize=(15, 15))
        # -- plot layout ------------------------------------------------------
        
        rect_ustat_arr = [0.25,0.175,.5,.5]
        rect_dendr = [0.755,0.175,.145,.5]
        rect_measures0 = [0.25,0.68,0.5,0.1]
        rect_measures1 = [0.25,0.785,0.5,0.1]
        rect_measures2 = [0.25,0.89,0.5,0.1]
        
        ax_ustat_arr = fig.add_axes(rect_ustat_arr)
        ax_dendr = fig.add_axes(rect_dendr)
        ax_measures00 = fig.add_axes(rect_measures0)
        ax_measures01 = plt.twinx(ax_measures00) 
        ax_measures10 = fig.add_axes(rect_measures1)
        ax_measures10.set_xticklabels([])
        ax_measures20 = fig.add_axes(rect_measures2)
        
        ax_measures20.set_xticklabels([])
        ax_measures21 = plt.twinx(ax_measures20)
        
        # -- calculate and plot the dendrogram
        dist_dendrogram = hierarchy.dendrogram(self.linkage, orientation='left',no_plot=True)
        hierarchy.dendrogram(self.linkage, orientation='left',p=50,truncate_mode ='lastp',ax = ax_dendr)
        
        ax_dendr.set_yticks([])
        ax_dendr.axvline(self.max_dist_cluster,ls='--',c='k')

        # -- plot sorted U-Stat array ------------------------------------------

        # -- create index that sort rows to correspond to dendrogram
        feat_sort_ind = dist_dendrogram['leaves']
        # -- sort the good performant features so they have the same order as the similarity array
        sort_ind = hlp.ismember(self.workflow.redundancy_method.similarity_array_op_ids,self.workflow.redundancy_method.good_perf_op_ids)
        self.ops_base_perf_vals = self.ops_base_perf_vals[:,sort_ind]
        
        # -- create index that sort columns with respect to their mean value
        task_sort_ind = np.argsort(self.ops_base_perf_vals[:,feat_sort_ind].mean(axis=1))
        
        #all_classes_avg_top = ((all_classes_avg_top - np.ma.mean(all_classes_avg_top,axis=0)) / np.ma.std(all_classes_avg_top,axis=0))
        #all_classes_avg_top = fap.normalise_masked_array(self.ops_base_perf_vals, axis= 1,norm_type = 'zscore')[0]
        all_classes_avg_top = self.ops_base_perf_vals
        # -- plot the operation names as y-axis tick labels
        aspect = all_classes_avg_top.shape[0] / float(all_classes_avg_top.shape[1])
        ax_ustat_arr.matshow(all_classes_avg_top[task_sort_ind,:][:,feat_sort_ind].T,aspect=aspect,origin='lower')


        ax_ustat_arr.set_yticks(range(len(feat_sort_ind)))

        op_id_name_map = self.map_op_id_name_mult_task(self.workflow.tasks)
        names = hlp.ind_map_subset(op_id_name_map[0],op_id_name_map[1],self.workflow.redundancy_method.similarity_array_op_ids)
        ax_ustat_arr.set_yticklabels(np.array(names)[feat_sort_ind])
        # -- plot the problem names as x axis labels
        ax_ustat_arr.xaxis.tick_bottom()
        ax_ustat_arr.set_xticks(range(all_classes_avg_top.shape[0]))
        ax_ustat_arr.set_xticklabels(self.task_names[task_sort_ind],rotation='vertical')

        # -- plot clusters ----------------------------------
        
        cluster_bounds = np.nonzero(np.diff(self.workflow.redundancy_method.cluster_inds[feat_sort_ind]))[0]+0.5
        for cluster_bound in cluster_bounds:
            ax_ustat_arr.axhline(cluster_bound,c='w',lw=2)
    
        # --------------------------------------------------------------------------------
        # -- calculate and plot measures -------------------------------------------------
        # --------------------------------------------------------------------------------

        # -- nr samples and nr labels --------------------------------------------------
  
        n_samples_avg = [np.array(self.workflow.tasks[i].ts['n_samples']).mean() for i in task_sort_ind]
        n_classes = [len(set(self.workflow.tasks[i].labels)) for i in task_sort_ind]
        x_loc = np.arange(0,len(self.workflow.tasks))+0.5
        ax_measures00.scatter(x_loc,n_classes,c='b',s=40)
        ax_measures00.plot(x_loc,n_classes,c='b')
        [label.set_color('b') for label in ax_measures00.get_yticklabels()]
        ax_measures00.set_ylabel('nr classes')
        ax_measures00.yaxis.label.set_color('b')
        ax_measures00.set_ylim([0,max(n_classes)+1])
        ax_measures00.set_xticklabels([])
    
        ax_measures01.scatter(x_loc,n_samples_avg,c='r',s=40)
        ax_measures01.plot(x_loc,n_samples_avg,c='r')
        
        [label.set_color('r') for label in ax_measures01.get_yticklabels()]
        ax_measures01.set_ylabel('avg samples')
        ax_measures01.yaxis.label.set_color('r')
        ax_measures01.set_ylim([0,max(n_samples_avg)+100])
        
        ax_measures00.set_xlim([0,len(self.workflow.tasks)])
        ax_measures00.set_xticklabels([])
        

        # -- U-stat measures --------------------------------------------------
        
        # -- minimum average U-score for all features
        ax_measures10.plot(x_loc,np.min(self.workflow.stats_good_op[task_sort_ind,:],axis=1),marker='o',label='min. avg. U-score all')

        # -- minimum average U-score for top features
        ax_measures10.plot(x_loc,np.ma.min(self.ops_base_perf_vals[task_sort_ind,:],axis=1),marker='o',label='min. avg. U-score top')
            
        # -- average minimum (for each class pair) U-score for top features
        # XXX This would require task.pair_stats to be available (not saved as intermediate at the moment); then it is trivial to implement
        
        ax_measures10.legend(loc=2,fontsize='small',labelspacing=.1)
        ax_measures10.set_ylabel('u-score')
        ax_measures10.set_xlim([0,len(self.workflow.tasks)])
        ax_measures10.set_ylim([0,0.5])

        # -- Mean U-score and number of successfully calculated features --------------------------------------------------
        # -- mean average U-score for all features
        ax_measures20.plot(x_loc,np.ma.mean(self.workflow.stats_good_op[task_sort_ind,:],axis=1),marker='o')
        [label.set_color('b') for label in ax_measures20.get_yticklabels()]
        ax_measures20.set_ylabel('avg u-score all feat')
        ax_measures20.yaxis.label.set_color('b')
        
        # -- number of successfully calculated features

        ax_measures21.plot(x_loc,[len(self.workflow.tasks[i].op['id']) for i in task_sort_ind],c='r',marker='o')
        [label.set_color('r') for label in ax_measures21.get_yticklabels()]
        ax_measures21.set_ylabel('nr calc feat')
        ax_measures21.yaxis.label.set_color('r')
        
        ax_measures20.set_xlim([0,len(self.workflow.tasks)])
Example #15
    def plot_stat_array(self):

        fig = plt.figure(figsize=(15, 15))
        # -- plot layout ------------------------------------------------------

        #rect_ustat_arr = [0.01,0.01,0.75,0.75] #[0.25,0.175,.5,.5]
        #rect_dendr = [0.76,0.01,.2175,.75] #[0.755,0.175,.145,.5]
        rect_ustat_arr = [0.15, 0.2, 0.7, 0.8]
        #rect_dendr = [0.7, 0.1, 0.145, 0.8]
        rect_dendr = [0.15, 0.8, 0.873, 0.2]
        '''rect_measures0 = [0.25,0.68,0.5,0.1]
        rect_measures1 = [0.25,0.785,0.5,0.1]
        rect_measures2 = [0.25,0.89,0.5,0.1]'''

        ax_ustat_arr = fig.add_axes(rect_ustat_arr)
        ax_dendr = fig.add_axes(rect_dendr)
        '''ax_measures00 = fig.add_axes(rect_measures0)
        ax_measures01 = plt.twinx(ax_measures00) 
        ax_measures10 = fig.add_axes(rect_measures1)
        ax_measures10.set_xticklabels([])
        ax_measures20 = fig.add_axes(rect_measures2)
        
        ax_measures20.set_xticklabels([])
        ax_measures21 = plt.twinx(ax_measures20)'''

        # -- calculate and plot the dendrogram
        dist_dendrogram = hierarchy.dendrogram(self.linkage,
                                               orientation='top',
                                               no_plot=True)
        hierarchy.dendrogram(self.linkage,
                             orientation='top',
                             p=50,
                             truncate_mode='lastp',
                             ax=ax_dendr)

        ax_dendr.set_xticks([])
        ax_dendr.axhline(self.max_dist_cluster, ls='--', c='k')

        # -- plot sorted classification stat array ------------------------------------------

        # -- create index that sort rows to correspond to dendrogram
        feat_sort_ind = dist_dendrogram['leaves']
        # -- sort the good performant features so they have the same order as the similarity array
        sort_ind = hlp.ismember(
            self.workflow.redundancy_method.similarity_array_op_ids,
            self.workflow.redundancy_method.good_perf_op_ids)
        self.ops_base_perf_vals = self.ops_base_perf_vals[:, sort_ind]
        # -- create index that sort columns with respect to their mean value
        task_sort_ind = np.argsort(
            self.ops_base_perf_vals[:, feat_sort_ind].mean(axis=1))

        #all_classes_avg_top = ((all_classes_avg_top - np.ma.mean(all_classes_avg_top,axis=0)) / np.ma.std(all_classes_avg_top,axis=0))
        #all_classes_avg_top = fap.normalise_masked_array(self.ops_base_perf_vals, axis= 1,norm_type = 'zscore')[0]
        all_classes_avg_top = self.ops_base_perf_vals
        # -- plot the operation names as y-axis tick labels
        aspect = all_classes_avg_top.shape[0] / float(
            all_classes_avg_top.shape[1])
        im = ax_ustat_arr.matshow(
            all_classes_avg_top[task_sort_ind, :][:, feat_sort_ind],
            aspect=aspect,
            origin='lower',
            cmap='turbo')

        ax_ustat_arr.set_yticks(range(all_classes_avg_top.shape[0]))

        op_id_name_map = self.map_op_id_name_mult_task(self.workflow.tasks)
        names = hlp.ind_map_subset(
            op_id_name_map[0], op_id_name_map[1],
            self.workflow.redundancy_method.similarity_array_op_ids)
        ax_ustat_arr.set_yticklabels(self.task_names[task_sort_ind])
        # -- plot the problem names as x axis labels
        ax_ustat_arr.xaxis.tick_bottom()
        ax_ustat_arr.set_xticks(range(len(feat_sort_ind)))
        ax_ustat_arr.set_xticklabels(np.array(names)[feat_sort_ind],
                                     rotation='vertical')

        fig.colorbar(im)

        # -- plot clusters ----------------------------------
        '''cluster_bounds = np.nonzero(np.diff(self.workflow.redundancy_method.cluster_inds[feat_sort_ind]))[0]+0.5
        for cluster_bound in cluster_bounds:
            ax_ustat_arr.axhline(cluster_bound,c='w',lw=2)'''
Example #16
abs_corr_array, sort_good_ind, all_classes_avg_good_norm = idtop.calc_perform_corr_mat(
    all_classes_avg[:, op_id_good], norm='z-score', max_feat=max_feat)
all_classes_avg_top = np.ma.masked_invalid(
    all_classes_avg[:, op_id_good][:, sort_good_ind[:max_feat]])

# -- calculate the linkage for the correlation
corr_linkage = idtop.calc_linkage(abs_corr_array)[0]

# -- extract operation names --- ------------------------------------------
# -- load a reference HCTSA_loc.mat containing all op_ids
op_ref_HCTSA_path = '/home/philip/work/OperationImportanceProject/results/done/HCTSA_Beef.mat'
op, = mIO.read_from_mat_file(op_ref_HCTSA_path, ['Operations'],
                             is_from_old_matlab=True)

top_id = op_id_good[sort_good_ind][:max_feat]
names = hlp.ind_map_subset(op['id'], op['name'],
                           op_id_good[sort_good_ind][:max_feat])

# -- extract problem names --- ------------------------------------------
reg_ex = re.compile(r'.*/HCTSA_(.*)_N_70_100_reduced\.mat')
problem_paths = np.load(problem_names_path)

problem_names = np.array(
    [reg_ex.match(problem_path).group(1) for problem_path in problem_paths])

# ---------------------------------------------------------------------
# -- Plot -------------------------------------------------------------
# ---------------------------------------------------------------------

fig = plt.figure(figsize=(15, 15))
# -- plot layout ------------------------------------------------------