Code Example #1
File: Plotting.py  Project: Philiphorst/op_importance
    def plot_similarity_array(self):

        abs_corr_array = self.workflow.redundancy_method.similarity_array

        op_id_name_map = self.map_op_id_name_mult_task(self.workflow.tasks)
        names = hlp.ind_map_subset(
            op_id_name_map[0], op_id_name_map[1],
            self.workflow.redundancy_method.similarity_array_op_ids)
        measures = np.zeros((2, len(names)))

        tmp_ind = hlp.ismember(
            self.workflow.redundancy_method.similarity_array_op_ids,
            self.workflow.good_op_ids)

        # -- number of problems for which each good performing feature has been calculated
        measures[0, :] = (~self.workflow.stats_good_op[:, tmp_ind].mask).sum(
            axis=0)
        # -- z-scored U-stat (over all features) for the top features
        stats_good_op_z_score = fap.normalise_masked_array(
            self.workflow.stats_good_op_comb, axis=0, norm_type='zscore')[0]
        measures[1, :] = stats_good_op_z_score[tmp_ind]

        fiplt.plot_arr_dendrogram(abs_corr_array,
                                  names,
                                  max_dist_cluster=self.max_dist_cluster,
                                  measures=measures)
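
The snippet above leans on hlp.ismember to turn the similarity array's op ids into column indices of stats_good_op. A minimal sketch of such a helper, assuming (this is not the project's actual implementation) that it returns, for each element of a, the position of its match in b:

import numpy as np

def ismember(a, b):
    """For each element of `a`, return the index of its match in `b`.

    Hypothetical stand-in for hlp.ismember; assumes every element of `a`
    occurs exactly once in `b`.
    """
    b_index = {value: idx for idx, value in enumerate(b)}
    return np.array([b_index[value] for value in a])

# usage as above: tmp_ind = ismember(similarity_array_op_ids, good_op_ids)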
Code Example #2
File: Workflow.py  Project: Philiphorst/op_importance
    def select_good_perf_ops_sort_asc(self):
        """
        Select a subset of well-performing operations, sorted ascending by
        their (optionally normalised) mean statistic across tasks.
        """
        if self.select_good_perf_ops_norm in ('z-score', 'zscore'):
            all_classes_good_norm = fap.normalise_masked_array(
                self.stats_good_op, axis=1, norm_type='zscore')[0]
        elif self.select_good_perf_ops_norm == 'mean-norm':
            all_classes_good_mean = np.ma.masked_invalid(
                np.ma.mean(self.stats_good_op, axis=1))
            all_classes_good_norm = (self.stats_good_op.T / all_classes_good_mean).T
        else:
            all_classes_good_norm = self.stats_good_op

        sort_ind_tmp = np.argsort(all_classes_good_norm.mean(axis=0))

        if self.n_good_perf_ops is None:
            self.stats_good_perf_op_comb = self.stats_good_op_comb[sort_ind_tmp]
            self.good_perf_op_ids = self.good_op_ids[sort_ind_tmp]
        else:
            self.stats_good_perf_op_comb = self.stats_good_op_comb[sort_ind_tmp][:self.n_good_perf_ops]
            self.good_perf_op_ids = self.good_op_ids[sort_ind_tmp][:self.n_good_perf_ops]
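
Several of these snippets call fap.normalise_masked_array(..., norm_type='zscore')[0]. A self-contained sketch of what that z-score branch is assumed to do, standardising a masked array along one axis while ignoring masked entries (the real helper presumably also returns scaling info, hence the [0] indexing):

import numpy as np

def normalise_masked_array_zscore(data, axis=0):
    # z-score along `axis`, ignoring masked entries; hypothetical stand-in
    # for fap.normalise_masked_array(..., norm_type='zscore')
    mean = np.ma.mean(data, axis=axis, keepdims=True)
    std = np.ma.std(data, axis=axis, keepdims=True)
    return (data - mean) / std, mean, std  # index [0] gives the normalised array

# usage:
# stats = np.ma.masked_invalid([[1.0, 2.0, np.nan], [4.0, 0.5, 6.0]])
# z = normalise_masked_array_zscore(stats, axis=1)[0]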
Code Example #3
# -- plot dendrogram --------------------------------------------------
corr_dendrogram = hierarchy.dendrogram(corr_linkage, orientation="left", no_plot=True)
hierarchy.dendrogram(corr_linkage, orientation="left", p=50, truncate_mode="lastp", ax=ax_dendr)
ax_dendr.set_yticks([])

ax_dendr.axvline(max_corr_dist, ls="--", c="k")

# -- plot sorted U-Stat array ------------------------------------------

# -- create index that sorts rows to correspond to the dendrogram
feat_sort_ind = corr_dendrogram["leaves"]
# -- create index that sorts columns with respect to their mean value
problem_sort_ind = np.argsort(all_classes_avg_top[:, feat_sort_ind].mean(axis=1))
print(problem_sort_ind)
all_classes_avg_top = fap.normalise_masked_array(all_classes_avg_top, axis=1, norm_type="zscore")[0]
# -- plot the operation names as y-axis tick labels
ax_ustat_arr.matshow(
    all_classes_avg_top[problem_sort_ind, :][:, feat_sort_ind].T,
    aspect=39 / 50.0,
    origin="lower",  # matplotlib expects "lower"/"upper", not "bottom"
)
ax_ustat_arr.set_yticks(range(len(feat_sort_ind)))
ax_ustat_arr.set_yticklabels(np.array(names)[feat_sort_ind])
# -- plot the problem names as x-axis tick labels
ax_ustat_arr.xaxis.tick_bottom()
ax_ustat_arr.set_xticks(range(all_classes_avg_top.shape[0]))
ax_ustat_arr.set_xticklabels(problem_names[problem_sort_ind], rotation="vertical")

# -- calculate and plot clusters ----------------------------------
cluster_ind = hierarchy.fcluster(corr_linkage, t=max_corr_dist, criterion="distance")
cluster_bounds = np.nonzero(np.diff(cluster_ind[feat_sort_ind]))[0] + 0.5
for cluster_bound in cluster_bounds:
    # -- separate adjacent clusters with a white line (assumed loop body)
    ax_ustat_arr.axhline(cluster_bound, c="w", lw=1)
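
corr_linkage and max_corr_dist are defined earlier in the source file. For context, a self-contained sketch, with hypothetical placeholder data, of how scipy typically produces such a linkage from an absolute-correlation matrix:

import numpy as np
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform

# hypothetical data: a symmetric matrix of absolute correlations between features
rng = np.random.default_rng(0)
features = rng.normal(size=(100, 8))
abs_corr_array = np.abs(np.corrcoef(features, rowvar=False))

# correlation -> distance, condensed form, then average-linkage clustering
corr_dist = 1.0 - abs_corr_array
np.fill_diagonal(corr_dist, 0.0)
corr_linkage = hierarchy.linkage(squareform(corr_dist, checks=False), method="average")

max_corr_dist = 0.2  # assumed threshold; drawn as the dashed line above
print(hierarchy.fcluster(corr_linkage, t=max_corr_dist, criterion="distance"))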
Code Example #4
File: Workflow.py  Project: Philiphorst/op_importance
    def __init__(self, task_names, input_method, stats_method, redundancy_method,
                 combine_tasks_method='mean',
                 combine_tasks_norm=None,
                 select_good_perf_ops_method='sort_asc',
                 select_good_perf_ops_norm='zscore',
                 n_good_perf_ops=None):
        """
        Constructor

        Parameters
        ----------
        task_names : list of str
            A list of task names to be included in this workflow
        input_method : Data_Input
            The data input method used to read the data from disk.
        stats_method : Feature_Stats
            The method used to calculate the statistics
        redundancy_method : Reducing_Redundancy
            The method used to reduce the redundancy in the well performing features
        combine_tasks_method : str
            The name of the method used to combine the statistics of each task into a
            single 1-D array with one entry per operation
        combine_tasks_norm : str
            The name of the normalisation method applied to the stats of each task
            before they are combined
        select_good_perf_ops_method : str
            The name of the method used to sort the operations so the best operations
            come first in self.stats_good_perf_op_comb and self.good_perf_op_ids
        select_good_perf_ops_norm : str
            The name of the norm used when combining the statistics of all tasks for
            each operation
        n_good_perf_ops : int, optional
            Maximum number of entries in self.stats_good_perf_op_comb and
            self.good_perf_op_ids. If None, all good operations are used.
        """
        self.task_names = task_names
        self.input_method = input_method
        self.stats_method = stats_method
        self.redundancy_method = redundancy_method
        self.combine_tasks_norm = combine_tasks_norm

        # -- only 'mean' is currently implemented
        if combine_tasks_method == 'mean':
            self.combine_tasks = self.combine_task_stats_mean

        if combine_tasks_norm == 'zscore':
            self.combine_task_norm_method = lambda y: fap.normalise_masked_array(
                y, axis=1, norm_type='zscore')[0]
        else:
            # -- no normalisation: identity function
            self.combine_task_norm_method = lambda y: y

        if select_good_perf_ops_method == 'sort_asc':
            self.select_good_perf_ops = self.select_good_perf_ops_sort_asc
        self.select_good_perf_ops_norm = select_good_perf_ops_norm

        self.n_good_perf_ops = n_good_perf_ops
        # -- list of Tasks for this workflow
        self.tasks = [Task.Task(task_name, self.input_method, self.stats_method)
                      for task_name in task_names]

        # -- counter array for the number of problems calculated successfully per operation

        # -- place holders
        self.good_op_ids = []
        self.stats_good_op = None
        self.stats_good_op_comb = None
        self.stats_good_perf_op_comb = None
        self.good_perf_op_ids = None
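
A hedged usage sketch of this constructor; the Data_Input, Feature_Stats, and Reducing_Redundancy instances are placeholders, since those classes' constructors do not appear in these snippets:

# hypothetical setup -- the concrete constructors are not shown here
input_method = Data_Input()
stats_method = Feature_Stats()
redundancy_method = Reducing_Redundancy()

workflow = Workflow(['Task_A', 'Task_B'],
                    input_method,
                    stats_method,
                    redundancy_method,
                    combine_tasks_method='mean',
                    combine_tasks_norm='zscore',
                    select_good_perf_ops_method='sort_asc',
                    select_good_perf_ops_norm='zscore',
                    n_good_perf_ops=200)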
Code Example #5
File: Workflow.py  Project: Philiphorst/op_importance
    
    
    # -----------------------------------------------------------------
    # -- Output the results to text file-------------------------------
    # -----------------------------------------------------------------       
    op_id_name_map = plotting.map_op_id_name_mult_task(workflow.tasks)
    # -- write the non-reduced top performing features, indicating the clusters they belong to
    measures = np.zeros((3, len(workflow.good_op_ids)))
    # -- op ids
    measures[0, :] = workflow.good_op_ids
    # -- number of problems for which each good performing feature has been calculated
    measures[1, :] = (~workflow.stats_good_op.mask).sum(axis=0)
    # -- z-scored U-stat
    measures[2, :] = fap.normalise_masked_array(workflow.stats_good_op_comb, axis=0, norm_type='zscore')[0]

    # -- write a text file containing the information shown in the plot
    workflow.redundancy_method.write_cluster_file(result_txt_outpath, op_id_name_map, measures)
    
    
    # -----------------------------------------------------------------    
    # -- show the plot as last task of the script
    # -----------------------------------------------------------------    
    plt.show()

    # -- write the non-reduced top performing features to a text file
#     with open(result_txt_outpath,'wb') as out_result_txt_file:
#         for op_id,op_name,op_U in zip(workflow.good_perf_op_ids,
#                                       hlp.ind_map_subset(op_id_name_map[0],op_id_name_map[1], workflow.good_perf_op_ids),
#                                       workflow.stats_good_perf_op_comb):