def calc_abs_corr(self):
     """
     Populate self.similarity_array and self.similarity_array_op_ids.

     self.ops_base_perf_vals is handed to idtop.calc_perform_corr_mat with
     norm=None and max_feat equal to its number of columns (shape[1]).
     The first returned value is stored as self.similarity_array (the
     original note describes it as a correlation-based distance matrix).
     The second returned value is used to index self.good_perf_op_ids and
     the result is stored as self.similarity_array_op_ids. The third
     returned value is discarded.
     """
     perf_vals = self.ops_base_perf_vals
     n_columns = perf_vals.shape[1]
     # -- norm=None: the best performing features were selected beforehand,
     # -- potentially with normalisation, so none is applied again here
     corr_out = idtop.calc_perform_corr_mat(perf_vals, norm=None, max_feat=n_columns)
     self.similarity_array, op_order, _ = corr_out
     self.similarity_array_op_ids = self.good_perf_op_ids[op_order]
 def calc_abs_corr(self):
     """
     Compute the correlation-based matrix for self.ops_base_perf_vals.

     Calls idtop.calc_perform_corr_mat on self.ops_base_perf_vals with
     norm=None and max_feat set to the column count of that array, then
     stores:
       * self.similarity_array        -- first value returned by the call
       * self.similarity_array_op_ids -- self.good_perf_op_ids indexed by
                                         the second value returned
     The third returned value is not used.
     """
     # -- the feature limit is the full column count of the performance values
     n_feat = self.ops_base_perf_vals.shape[1]
     # -- normalisation is skipped: the best performing features were already
     # -- picked upstream, potentially using normalisation
     sim_array, perm, _ = idtop.calc_perform_corr_mat(
         self.ops_base_perf_vals, norm=None, max_feat=n_feat)
     self.similarity_array = sim_array
     self.similarity_array_op_ids = self.good_perf_op_ids[perm]
Ejemplo n.º 3
0
# -- Read the two input arrays from disk
all_classes_avg = np.load(all_classes_avg_out_path)
op_id_good = np.load(op_id_good_path)

# -- Keep the columns selected by op_id_good and mask invalid (NaN/inf) entries
all_classes_avg_good = np.ma.masked_invalid(all_classes_avg[:, op_id_good])

# -- Read the 'Operations' entry of a reference HCTSA .mat file
# -- (per the original note, this file contains all op_ids)
import modules.misc.PK_matlab_IO as mIO
op_ref_HCTSA_path = '/home/philip/work/OperationImportanceProject/results/done/HCTSA_Beef.mat'
(op,) = mIO.read_from_mat_file(
    op_ref_HCTSA_path, ['Operations'], is_from_old_matlab=True)

max_feat = 50
# -- Correlation of the masked array, z-score normalised, limited to max_feat
(abs_corr_array,
 sort_good_ind,
 all_classes_avg_good_norm) = idtop.calc_perform_corr_mat(
    all_classes_avg_good, norm='z-score', max_feat=max_feat)

# -- op ids reordered by sort_good_ind
# -- (per the original note: order of performance, first entry = best)
op_id_sorted = op_id_good[sort_good_ind]
np.save(op_id_order_path, op_id_sorted)
# -- also store the permutation vector itself (indexes the good-operations array)
np.save(sort_good_ind_path, sort_good_ind)

# -- Names belonging to the first max_feat ids of the reordered op ids
names = hlp.ind_map_subset(op['id'], op['name'], op_id_sorted[:max_feat])

# -- Measures to be plotted, restricted to the first max_feat sorted columns
top_cols = sort_good_ind[:max_feat]
# number of unmasked entries per selected column
problems_succ = (~all_classes_avg_good[:, top_cols].mask).sum(axis=0)
# column-wise mean of the normalised array returned above
u_stat_mean = all_classes_avg_good_norm[:, top_cols].mean(axis=0)
# -- Output locations below intermediate_data_root
problem_names_path = intermediate_data_root + 'problem_names.npy'
measures_problems_path = intermediate_data_root + 'measure_problems.npy'

# -- Settings
max_feat = 50
max_corr_dist = 0.2

# -- Read the input arrays from disk
all_classes_avg = np.load(all_classes_avg_out_path)
op_id_order = np.load(op_id_order_path)
op_id_good = np.load(op_id_good_path)

# -- Disabled earlier variant, kept for reference: mask NaNs for the leading
# -- 100 ids of op_id_order, z-score the array, take the absolute correlation
# all_classes_avg_top = np.ma.masked_invalid(all_classes_avg[:,op_id_order[:100]])
# all_classes_avg_top = ((all_classes_avg_top.T - np.ma.mean(all_classes_avg_top,axis=1)) / np.ma.std(all_classes_avg_top,axis=1)).T
# abs_corr_array = np.abs(np.ma.corrcoef(all_classes_avg_top, rowvar=0))

# -- Correlation array for the op_id_good columns, z-score normalised,
# -- limited to max_feat
(abs_corr_array,
 sort_good_ind,
 all_classes_avg_good_norm) = idtop.calc_perform_corr_mat(
    all_classes_avg[:, op_id_good], norm='z-score', max_feat=max_feat)

# -- Masked copy of the first max_feat sorted columns; sliced afresh from
# -- all_classes_avg rather than from the array passed to the call above
all_classes_avg_top = np.ma.masked_invalid(
    all_classes_avg[:, op_id_good][:, sort_good_ind[:max_feat]])

# -- Linkage for the correlation array (first element of the returned value)
corr_linkage = idtop.calc_linkage(abs_corr_array)[0]

# -- extract operation names ----------------------------------------------
# -- load the 'Operations' entry of a reference HCTSA .mat file
# -- (per the original note, this file contains all op_ids)
op_ref_HCTSA_path = '/home/philip/work/OperationImportanceProject/results/done/HCTSA_Beef.mat'
op, = mIO.read_from_mat_file(op_ref_HCTSA_path, ['Operations'], is_from_old_matlab=True)

# -- ids of the first max_feat operations in sort_good_ind order
top_id = op_id_good[sort_good_ind][:max_feat]
# -- look up the names for exactly those ids; top_id is reused instead of
# -- recomputing the same selection a second time
# NOTE(review): assumes hlp.ind_map_subset does not modify its third argument -- confirm
names = hlp.ind_map_subset(op['id'], op['name'], top_id)

# -- extract problem names -------------------------------------------------
# Raw string literal: the '\/' in the pattern is then not parsed as an
# (invalid) string escape sequence; the pattern text itself is unchanged.
# Group 1 captures the text between '/HCTSA_' and '_N_70_100_reduced'.
reg_ex = re.compile(r'.*\/HCTSA_(.*)_N_70_100_reduced.mat')