def input_task(self, task_name, is_read_feature_data=True):
    """
    Load the operation and time series tables (and optionally the feature
    matrix) of one task from its HCTSA_loc.mat file.

    Parameters:
    -----------
    task_name : string
        Name of the classification task. It is inserted into
        self.path_pattern to obtain the path of the MAT file.
    is_read_feature_data : bool
        If true, the feature data matrix 'TS_DataMat' is read as well
        (default is True).

    Returns:
    --------
    data : ndarray or None
        The feature matrix after it has been passed through
        self.masking_method. Each row corresponds to a timeseries and each
        column to an operation. None if is_read_feature_data is false.
    ts : dict
        Information for all timeseries (rows in data).
        Keys are ['keywords', 'n_samples', 'id', 'filename']
    op : dict
        Information for all contained operations.
        Keys are ['keywords', 'master_id', 'id', 'code_string', 'name']
    """
    # -- assemble the file path
    mat_file_path = self.path_pattern.format(task_name)

    # -- only the meta data is wanted: skip the (large) feature matrix
    if not is_read_feature_data:
        op, ts = mIO.read_from_mat_file(
            mat_file_path,
            ['Operations', 'TimeSeries'],
            is_from_old_matlab=True)
        return None, ts, op

    # -- load the data, operations and timeseries from the matlab file
    data, op, ts = mIO.read_from_mat_file(
        mat_file_path,
        ['TS_DataMat', 'Operations', 'TimeSeries'],
        is_from_old_matlab=True)
    return self.masking_method(data), ts, op
def count_op_calc(mat_file_paths, is_from_old_matlab=False):
    """
    Tally, for every operation id, in how many of the given HCTSA_loc.mat
    files that operation is listed.

    Parameters:
    ----------
    mat_file_paths : list
        Paths to the HCTSA_loc.mat files corresponding to the problems
        considered.
    is_from_old_matlab : bool
        If the HCTSA_loc.mat files are saved from an older version of the
        comp engine. The order of entries is different.

    Returns:
    --------
    count_op_calc_all_problems : ndarray
        Array of length 10000. Entry i is the number of files whose
        'Operations' table contains operation id i.
    """
    n_files_per_op = np.zeros(10000)

    for path in mat_file_paths:
        operations, = mIO.read_from_mat_file(
            path, ['Operations'], is_from_old_matlab=is_from_old_matlab)
        print("Counting which operations calculated in: {:s}".format(path))
        # -- operation ids double as positions in the tally array
        n_files_per_op[operations['id']] += 1

    return n_files_per_op
def count_op_calc(mat_file_paths, is_from_old_matlab=False):
    """
    Count, per operation id, the number of HCTSA_loc.mat files in which the
    operation appears.

    Parameters:
    ----------
    mat_file_paths : list
        Paths to the HCTSA_loc.mat files corresponding to the problems
        considered.
    is_from_old_matlab : bool
        If the HCTSA_loc.mat files are saved from an older version of the
        comp engine. The order of entries is different.

    Returns:
    --------
    count_op_calc_all_problems : ndarray
        Array with 10000 entries where the value at position i is the
        number of files whose 'Operations' table lists operation id i.
    """
    op_counts = np.zeros(10000)

    for current_path in mat_file_paths:
        op_table, = mIO.read_from_mat_file(
            current_path,
            ['Operations'],
            is_from_old_matlab=is_from_old_matlab)
        print("Counting which operations calculated in: {:s}".format(
            current_path))
        # -- the ids index directly into the count array
        present_ids = op_table['id']
        op_counts[present_ids] += 1

    return op_counts
def best_noncorr_op_ind(ind_dict, mask, file_path, op=None,
                        is_from_old_matlab=False):
    """
    Compute the indices for the top features for a specific HCTSA_loc.mat
    file and the corresponding operation ids.

    Parameters:
    -----------
    ind_dict : dict
        Dictionary where keys are file paths and values are the indices in
        the data matrix of HCTSA_loc.mat
    mask : array like
        Mask to reduce the indices given in ind_dict
    file_path : string
        Path to HCTSA_loc.mat file
    op : dict, optional
        Operations dictionary from HCTSA_loc.mat file at file_path. If it
        is None the 'Operations' table is read from file_path.
    is_from_old_matlab : bool
        If the HCTSA_loc.mat files are saved from an older version of the
        comp engine. The order of entries is different.

    Returns:
    --------
    ind_top : array
        Indices of the features combining the information ind_dict and
        mask for the HCTSA_loc.mat file pointed to by file_path
    op_id_top : array
        Operation ids corresponding to ind_top
    """
    ind = np.array(ind_dict[file_path])

    # -- identity test: '== None' would dispatch to the object's __eq__,
    # -- which is elementwise for array-like operation tables
    if op is None:
        op, = mIO.read_from_mat_file(
            file_path, ['Operations'], is_from_old_matlab=is_from_old_matlab)

    # -- first select the rows given for this file, then thin them with mask
    op_id_top = np.array(op['id'])[ind][mask]
    ind_top = ind[mask]
    return ind_top, op_id_top
def input_task_master(self, task_name, is_read_feature_data=True,
                      old_matlab=False):
    """
    Load the operation, time series and master operation tables (and
    optionally the feature matrix) of one task from its HCTSA_loc.mat file.

    Parameters:
    -----------
    task_name : string
        Name of the classification task. It is inserted into
        self.path_pattern to obtain the path of the MAT file.
    is_read_feature_data : bool
        If true, the feature data matrix 'TS_DataMat' is read as well
        (default is True).
    old_matlab : bool
        Forwarded to the reader as is_from_old_matlab (default is False).

    Returns:
    --------
    data : ndarray or None
        The feature matrix after it has been passed through
        self.masking_method. Each row corresponds to a timeseries and each
        column to an operation. None if is_read_feature_data is false.
    ts : dict
        Information for all timeseries (rows in data).
        Keys are ['keywords', 'n_samples', 'id', 'filename']
    op : dict
        Information for all contained operations.
        Keys are ['keywords', 'master_id', 'id', 'code_string', 'name']
    m_op :
        The 'MasterOperations' entry of the MAT file as returned by the
        reader.
    """
    # -- assemble the file path
    mat_file_path = self.path_pattern.format(task_name)
    print("Reading file {}".format(mat_file_path))

    # -- only the meta data is wanted: skip the (large) feature matrix
    if not is_read_feature_data:
        op, ts, m_op = mIO.read_from_mat_file(
            mat_file_path,
            ['Operations', 'TimeSeries', 'MasterOperations'],
            is_from_old_matlab=old_matlab)
        return None, ts, op, m_op

    # -- load the data, operations and timeseries from the matlab file
    data, op, ts, m_op = mIO.read_from_mat_file(
        mat_file_path,
        ['TS_DataMat', 'Operations', 'TimeSeries', 'MasterOperations'],
        is_from_old_matlab=old_matlab)
    return self.masking_method(data), ts, op, m_op
def calculate_ustat_avg_mult_task(mat_file_paths, u_stat_file_paths,
                                  all_classes_avg_out_path='./',
                                  is_from_old_matlab=False):
    """
    For multiple tasks calculate the u statistics for each task averaged
    over all possible label pairs. The results are saved to disk.

    Parameters:
    -----------
    mat_file_paths : list
        List of file paths to the MAT files containing the HCTSA data.
    u_stat_file_paths : list
        File paths of the saved u statistics data in binary npy files.
        Paired position by position with mat_file_paths.
    all_classes_avg_out_path : string
        Path of the output file. It is handed to np.save unchanged, so it
        names the file itself rather than a folder.
    is_from_old_matlab : boolean
        Are the MAT files from older version of the comp engine

    Returns:
    --------
    all_classes_avg : ndarray
        Array of shape (len(u_stat_file_paths), 10000) where each row
        represents a task and column i represents operation with
        op_id = i. Entries that are never written stay NaN.
    """
    # -- initialise the array containing the average u-statistic values for
    # -- all problems and features (np.nan: the np.NAN alias no longer
    # -- exists in NumPy 2.0)
    all_classes_avg = np.full((len(u_stat_file_paths), 10000), np.nan)

    for i, (u_stat_file_path, mat_file_path) in enumerate(
            zip(u_stat_file_paths, mat_file_paths)):
        # -- load the u statistic for every operation and label pairing
        u_stat = np.load(u_stat_file_path)
        # -- calculate the scaling factor for every label pairing of the
        # -- current classification problem
        u_scale = u_stat_norm_factor(
            mat_file_path, is_from_old_matlab=is_from_old_matlab)
        # -- transpose so that the label pairings are the trailing axis and
        # -- broadcast against u_scale, then average over the pairings
        u_stat_avg = (u_stat.T / u_scale).T.mean(axis=0)
        # -- save the average scaled u-statistic for all features to the
        # -- all_classes_avg array. The column number corresponds with the
        # -- operation id
        op, = mIO.read_from_mat_file(
            mat_file_path, ['Operations'],
            is_from_old_matlab=is_from_old_matlab)
        all_classes_avg[i, op['id']] = u_stat_avg

    np.save(all_classes_avg_out_path, all_classes_avg)
    return all_classes_avg
def calculate_ustat_avg_mult_task(mat_file_paths, u_stat_file_paths,
                                  all_classes_avg_out_path='./',
                                  is_from_old_matlab=False):
    """
    For multiple tasks calculate the u statistics for each task averaged
    over all possible label pairs. The results are saved to disk.

    Parameters:
    -----------
    mat_file_paths : list
        List of file paths to the MAT files containing the HCTSA data.
    u_stat_file_paths : list
        File paths of the saved u statistics data in binary npy files.
        Entry k belongs to mat_file_paths[k].
    all_classes_avg_out_path : string
        Path of the file the averaged u statistics are saved to. The value
        is passed to np.save as is (it is a file path, not a folder).
    is_from_old_matlab : boolean
        Are the MAT files from older version of the comp engine

    Returns:
    --------
    all_classes_avg : ndarray
        Array of shape (len(u_stat_file_paths), 10000) where each row
        represents a task and column i represents operation with
        op_id = i. Entries without a value are NaN.
    """
    n_tasks = len(u_stat_file_paths)

    # -- start from an all-NaN array; np.nan is used because the np.NAN
    # -- alias was removed in NumPy 2.0
    all_classes_avg = np.full((n_tasks, 10000), np.nan)

    task_files = zip(u_stat_file_paths, mat_file_paths)
    for i, (u_stat_file_path, mat_file_path) in enumerate(task_files):
        # -- load the u statistic for every operation and label pairing
        u_stat = np.load(u_stat_file_path)

        # -- scaling factor for every label pairing of the current problem
        u_scale = u_stat_norm_factor(
            mat_file_path, is_from_old_matlab=is_from_old_matlab)

        # -- divide every label pairing by its scaling factor and average
        # -- over all label pairs in the current problem
        u_stat_scaled = (u_stat.T / u_scale).T
        u_stat_avg = u_stat_scaled.mean(axis=0)

        # -- the column number corresponds with the operation id
        op, = mIO.read_from_mat_file(
            mat_file_path, ['Operations'],
            is_from_old_matlab=is_from_old_matlab)
        all_classes_avg[i, op['id']] = u_stat_avg

    np.save(all_classes_avg_out_path, all_classes_avg)
    return all_classes_avg
def u_stat_norm_factor(file_name, is_from_old_matlab=False):
    """
    Return the u statistic scaling factor n_1*n_2 where n_i is the number
    of time series with label i. Every entry corresponds to one label
    pairing in the classification problem pointed to by file_name.

    Parameters:
    -----------
    file_name : string
        Filename of HCTSA_loc.mat file containing data of the current
        problem
    is_from_old_matlab : bool
        If the HCTSA_loc.mat files are saved from an older version of the
        comp engine. The order of entries is different.

    Returns:
    --------
    u_scale : array
        Scaling factor for u statistic for every label pairing in the
        current classification problem. The pairings are ordered like
        itertools.combinations over list(set(labels)).
    """
    ts, = mIO.read_from_mat_file(
        file_name, ['TimeSeries'], is_from_old_matlab=is_from_old_matlab)

    # -- the label is the last comma separated entry of the keywords
    labels = [int(x.split(',')[-1]) for x in ts['keywords']]
    # -- keep list(set(...)): the pairing order below is derived from it
    labels_unique = list(set(labels))
    labels = np.array(labels)
    n_labels = len(labels_unique)

    # -- for every label calculate the number of time series with this label
    nr_items_label = [np.nonzero(labels == label)[0].shape[0]
                      for label in labels_unique]

    # -- initialise the u_scale array for all label pairings. Floor division
    # -- keeps the size an integer even when '/' is true division.
    u_scale = np.zeros(n_labels * (n_labels - 1) // 2)

    # -- calculate the scaling factor for the u statistic for every label
    # -- pairing
    for i, (label_ind_0, label_ind_1) in enumerate(
            itertools.combinations(range(n_labels), 2)):
        u_scale[i] = nr_items_label[label_ind_0] * nr_items_label[label_ind_1]

    return u_scale
def u_stat_norm_factor(file_name, is_from_old_matlab=False):
    """
    Return the u statistic scaling factor n_1*n_2 where n_i is the number
    of time series with label i. Every entry corresponds to one label
    pairing in the classification problem pointed to by file_name.

    Parameters:
    -----------
    file_name : string
        Filename of HCTSA_loc.mat file containing data of the current
        problem
    is_from_old_matlab : bool
        If the HCTSA_loc.mat files are saved from an older version of the
        comp engine. The order of entries is different.

    Returns:
    --------
    u_scale : array
        Scaling factor for u statistic for every label pairing in the
        current classification problem, in the order produced by
        itertools.combinations over list(set(labels)).
    """
    ts, = mIO.read_from_mat_file(
        file_name, ['TimeSeries'], is_from_old_matlab=is_from_old_matlab)

    # -- the label is the integer after the last comma of each keyword string
    labels = [int(x.split(',')[-1]) for x in ts['keywords']]
    # -- the order of list(set(...)) fixes the order of the pairings below
    labels_unique = list(set(labels))
    labels = np.array(labels)
    n_labels = len(labels_unique)

    # -- for every label calculate the number of time series with this label
    nr_items_label = []
    for label in labels_unique:
        nr_items_label.append(np.nonzero(labels == label)[0].shape[0])

    # -- number of unordered label pairs; '//' so that the array size stays
    # -- an integer under true division
    n_pairs = n_labels * (n_labels - 1) // 2
    u_scale = np.zeros(n_pairs)

    # -- calculate the scaling factor for the u statistic for every label
    # -- pairing
    label_pairs = itertools.combinations(range(n_labels), 2)
    for i, (label_ind_0, label_ind_1) in enumerate(label_pairs):
        u_scale[i] = nr_items_label[label_ind_0] * nr_items_label[label_ind_1]

    return u_scale
def best_noncorr_op_ind(ind_dict, mask, file_path, op=None,
                        is_from_old_matlab=False):
    """
    Compute the indices for the top features for a specific HCTSA_loc.mat
    file and the corresponding operation ids.

    Parameters:
    -----------
    ind_dict : dict
        Dictionary where keys are file paths and values are the indices in
        the data matrix of HCTSA_loc.mat
    mask : array like
        Mask to reduce the indices given in ind_dict
    file_path : string
        Path to HCTSA_loc.mat file
    op : dict, optional
        Operations dictionary from HCTSA_loc.mat file at file_path. When
        omitted (None) the 'Operations' table is loaded from file_path.
    is_from_old_matlab : bool
        If the HCTSA_loc.mat files are saved from an older version of the
        comp engine. The order of entries is different.

    Returns:
    --------
    ind_top : array
        Indices of the features combining the information ind_dict and
        mask for the HCTSA_loc.mat file pointed to by file_path
    op_id_top : array
        Operation ids corresponding to ind_top
    """
    ind = np.array(ind_dict[file_path])

    # -- 'is None' instead of '== None': the equality operator can be
    # -- overloaded (elementwise on arrays), the identity test cannot
    if op is None:
        op, = mIO.read_from_mat_file(
            file_path, ['Operations'],
            is_from_old_matlab=is_from_old_matlab)

    # -- reduce the indices of this file with the mask ...
    ind_top = ind[mask]
    # -- ... and look up the operation ids at the same positions
    op_id_top = np.array(op['id'])[ind][mask]
    return ind_top, op_id_top
def cat_data_op_subset(file_paths, op_id_top, is_from_old_matlab=False,
                       is_return_masked=True):
    """
    Concatenate the features where op_id is in op_id_top for all
    HCTSA_loc.mat files pointed to by file_paths.
    Warning, this can take a while and the returned data matrix can be very
    large.

    XXX WARNING XXX
    This only works correctly if all HCTSA_loc.mat files come from the same
    database. Meaning op_ids are the same. Otherwise one would have to go
    through operation names which is only a little more work to implement.

    Parameters:
    -----------
    file_paths : list
        list of file paths pointing to the files containing the data
    op_id_top : list,ndarray
        list of operation ids wanted in the concatenated data array
    is_from_old_matlab : bool
        If the HCTSA_loc.mat files are saved from an older version of the
        comp engine. The order of entries is different.
    is_return_masked : boolean
        Saving large masked arrays to disk can lead to memory errors while
        pickling. If this is false the function returns a normal ndarray
        where unknown entries are set to NaN. This can be converted to a
        masked array with data_all = np.ma.masked_invalid(data_all)

    Returns:
    --------
    data_all : ndarray/masked ndarray
        Concatenated data array (rows of all files stacked vertically).
        None if file_paths is empty.
    """
    per_file_data = []

    for file_path in file_paths:
        print("Adding data from {:s} \n to complete data matrix".format(
            file_path))
        data, op = mIO.read_from_mat_file(
            file_path, ['TS_DataMat', 'Operations'],
            is_from_old_matlab=is_from_old_matlab)

        # -- find the indices in the data for op_id_top
        ind = hlp.ismember(op['id'], op_id_top,
                           is_return_masked_array=True, return_dtype=int)

        # -- if any of the operations was not calculated for this problem
        # -- create an array and copy only valid data, leaving the rest NaN.
        # -- np.array_equal yields a single bool (False on a shape
        # -- mismatch); the truth value of 'ind.data != op_id_top' is
        # -- ambiguous for arrays.
        if not np.array_equal(ind.data, op_id_top):
            # -- create an array filled with NaN.
            # -- This makes later masking of non-existent entries easier.
            # -- Column i in data_ma corresponds to op_id_top[i].
            data_ma = np.full(
                (data.shape[0], np.array(op_id_top).shape[0]), np.nan)
            for it, i in enumerate(ind):
                # -- if i is masked in ind that means that the current
                # -- operation in data is not part of op_id_top. We
                # -- therefore do not need this operation in data_ma.
                if i is not np.ma.masked:
                    data_ma[:, i] = data[:, it]
        # -- otherwise pick all relevant features and also automatically
        # -- sort them correctly (if necessary)
        else:
            data_ma = np.array(data[:, ind])

        per_file_data.append(data_ma)

    # -- nothing was read: there is nothing to stack or mask
    if not per_file_data:
        return None

    # -- stack once at the end instead of re-copying inside the loop
    data_all = np.vstack(per_file_data)

    # -- Saving a large masked array to disk can lead to Memory errors while
    # -- using the pickle module, hence masking is optional.
    if is_return_masked:
        data_all = np.ma.masked_invalid(data_all)
    return data_all
# -- locations of the intermediate results below intermediate_data_root
all_classes_avg_out_path = intermediate_data_root + '/all_classes_avg.npy'
op_id_good_path = intermediate_data_root + '/op_id_good.npy'
op_id_order_path = intermediate_data_root + '/op_id_order.npy'
# -- NOTE: the '/' separator was missing on this path only, which placed the
# -- file next to (not inside) intermediate_data_root unless the root ends
# -- with a slash. Adjust any reader that relied on the old location.
sort_good_ind_path = intermediate_data_root + '/sort_good_ind.npy'

# -- Load the data
all_classes_avg = np.load(all_classes_avg_out_path)
op_id_good = np.load(op_id_good_path)

# -- Mask NaN entries
all_classes_avg_good = np.ma.masked_invalid(all_classes_avg[:, op_id_good])

# -- load a reference HCTSA_loc.mat containing all op_ids
import modules.misc.PK_matlab_IO as mIO
op_ref_HCTSA_path = '/home/philip/work/OperationImportanceProject/results/done/HCTSA_Beef.mat'
op, = mIO.read_from_mat_file(op_ref_HCTSA_path, ['Operations'],
                             is_from_old_matlab=True)

max_feat = 50

# -- calculate the correlation
abs_corr_array, sort_good_ind, all_classes_avg_good_norm = idtop.calc_perform_corr_mat(
    all_classes_avg_good, norm='z-score', max_feat=max_feat)

# -- op ids of the good operations reordered by sort_good_ind
op_id_good_sorted = op_id_good[sort_good_ind]

# -- save the op id's in order of performance (first entry = best performance)
np.save(op_id_order_path, op_id_good_sorted)

# -- save the permutation vector that would sort the data array containing
# -- the good operations only
np.save(sort_good_ind_path, sort_good_ind)

# -- extract the top feature names
names = hlp.ind_map_subset(op['id'], op['name'], op_id_good_sorted[:max_feat])
# all_classes_avg_top = np.ma.masked_invalid(all_classes_avg[:,op_id_order[:100]])
# # -- calculate the z-score of the u stat array
# all_classes_avg_top = ((all_classes_avg_top.T - np.ma.mean(all_classes_avg_top,axis=1)) / np.ma.std(all_classes_avg_top,axis=1)).T
# abs_corr_array = np.abs(np.ma.corrcoef(all_classes_avg_top, rowvar=0))

# -- calculate the correlation array with respect to performance and mask nan.
abs_corr_array, sort_good_ind, all_classes_avg_good_norm = idtop.calc_perform_corr_mat(
    all_classes_avg[:, op_id_good], norm='z-score', max_feat=max_feat)

# -- columns of the first max_feat operations in sort_good_ind order, with
# -- NaN entries masked
all_classes_avg_top = np.ma.masked_invalid(
    all_classes_avg[:, op_id_good][:, sort_good_ind[:max_feat]])

# -- calculate the linkage for the correlation
corr_linkage = idtop.calc_linkage(abs_corr_array)[0]

# -- extract operation names --- ------------------------------------------
# -- load a reference HCTSA_loc.mat containing all op_ids
op_ref_HCTSA_path = '/home/philip/work/OperationImportanceProject/results/done/HCTSA_Beef.mat'
op, = mIO.read_from_mat_file(op_ref_HCTSA_path, ['Operations'],
                             is_from_old_matlab=True)

# -- ids of the top operations; reused for the name lookup instead of being
# -- computed a second time
top_id = op_id_good[sort_good_ind][:max_feat]
names = hlp.ind_map_subset(op['id'], op['name'], top_id)

# -- extract problem names --- ------------------------------------------
# -- raw string: '\/' is not a valid string escape, the compiled pattern is
# -- unchanged
reg_ex = re.compile(r'.*\/HCTSA_(.*)_N_70_100_reduced.mat')

problem_paths = np.load(problem_names_path)
problem_names = np.array([reg_ex.match(problem_path).group(1)
                          for problem_path in problem_paths])

# ---------------------------------------------------------------------
# -- Plot -------------------------------------------------------------
# ---------------------------------------------------------------------
def cat_data_op_subset(file_paths, op_id_top, is_from_old_matlab=False,
                       is_return_masked=True):
    """
    Concatenate the features where op_id is in op_id_top for all
    HCTSA_loc.mat files pointed to by file_paths.
    Warning, this can take a while and the returned data matrix can be very
    large.

    XXX WARNING XXX
    This only works correctly if all HCTSA_loc.mat files come from the same
    database. Meaning op_ids are the same. Otherwise one would have to go
    through operation names which is only a little more work to implement.

    Parameters:
    -----------
    file_paths : list
        list of file paths pointing to the files containing the data
    op_id_top : list,ndarray
        list of operation ids wanted in the concatenated data array
    is_from_old_matlab : bool
        If the HCTSA_loc.mat files are saved from an older version of the
        comp engine. The order of entries is different.
    is_return_masked : boolean
        Saving large masked arrays to disk can lead to memory errors while
        pickling. If this is false the function returns a normal ndarray
        where unknown entries are set to NaN. This can be converted to a
        masked array with data_all = np.ma.masked_invalid(data_all)

    Returns:
    --------
    data_all : ndarray/masked ndarray
        Data of all files stacked vertically, one column per entry of
        op_id_top. None if file_paths is empty.
    """
    blocks = []

    for file_path in file_paths:
        print("Adding data from {:s} \n to complete data matrix".format(
            file_path))
        data, op = mIO.read_from_mat_file(
            file_path, ['TS_DataMat', 'Operations'],
            is_from_old_matlab=is_from_old_matlab)

        # -- find the indices in the data for op_id_top
        ind = hlp.ismember(op['id'], op_id_top,
                           is_return_masked_array=True, return_dtype=int)

        # -- a single, unambiguous bool: comparing the arrays with '!=' is
        # -- elementwise and cannot be used as an 'if' condition
        is_identical = np.array_equal(ind.data, op_id_top)

        if is_identical:
            # -- pick all relevant features and also automatically sort
            # -- them correctly (if necessary)
            data_ma = np.array(data[:, ind])
        else:
            # -- some operations were not calculated for this problem:
            # -- start from an all-NaN array (np.nan, the np.NaN alias was
            # -- removed in NumPy 2.0). Column i corresponds to
            # -- op_id_top[i].
            n_wanted = np.array(op_id_top).shape[0]
            data_ma = np.full((data.shape[0], n_wanted), np.nan)
            for it, i in enumerate(ind):
                # -- a masked i means that the current operation in data
                # -- is not part of op_id_top and is therefore skipped
                if i is not np.ma.masked:
                    data_ma[:, i] = data[:, it]

        blocks.append(data_ma)

    # -- no input files: nothing to concatenate
    if not blocks:
        return None

    # -- one vstack over all files avoids re-copying the growing matrix
    data_all = np.vstack(blocks)

    # -- Saving a large masked array to disk can lead to Memory errors while
    # -- using the pickle module, so the masking can be switched off.
    if is_return_masked:
        data_all = np.ma.masked_invalid(data_all)
    return data_all
def u_stat_all_label_file_name(file_name, mask=None, is_from_old_matlab=False):
    """
    Calculate the U-statistic for all label pairings in a HCTSA_loc.mat
    file. The operations can be masked by a boolean array.

    Parameters:
    -----------
    file_name : string
        File name of the HCTSA_loc.mat file containing at least the
        matrices 'TimeSeries' and 'TS_DataMat'
    mask : ndarray dtype='bool', optional
        If given this acts as mask to which features are included in the
        calculation
    is_from_old_matlab : bool
        If the HCTSA_loc.mat files are saved from an older version of the
        comp engine. The order of entries is different.

    Returns:
    --------
    ranks : ndarray of dim (n_labels * (n_labels-1) / 2, nr_features)
        The U-statistic for all features and label pairs, where a row
        represents a pair of labels. Every column represents one feature
    labels_unique : list
        List of all unique labels
    label_ind_list : list
        List of index arrays where entry i holds all row-indices in data
        containing timeseries for labels_unique[i]
    """
    # ---------------------------------------------------------------------
    # load the data
    # ---------------------------------------------------------------------
    ts, data = mIO.read_from_mat_file(
        file_name, ['TimeSeries', 'TS_DataMat'],
        is_from_old_matlab=is_from_old_matlab)

    # ---------------------------------------------------------------------
    # mask the data if required
    # ---------------------------------------------------------------------
    # -- 'is not None': comparing an ndarray with '!= None' is elementwise
    # -- and cannot be used as an 'if' condition
    if mask is not None:
        data = data[:, mask]

    # ---------------------------------------------------------------------
    # extract the unique labels
    # ---------------------------------------------------------------------
    labels = [int(x.split(',')[-1]) for x in ts['keywords']]
    labels_unique = list(set(labels))
    labels = np.array(labels)

    # ---------------------------------------------------------------------
    # get indices for all unique labels
    # ---------------------------------------------------------------------
    label_ind_list = [np.nonzero(labels == label)[0]
                      for label in labels_unique]
    n_labels = len(label_ind_list)

    # ---------------------------------------------------------------------
    # calculate Mann-Whitney u-test
    # ---------------------------------------------------------------------
    # -- '//' keeps the number of label pairs an integer under true division
    n_pairs = n_labels * (n_labels - 1) // 2
    ranks = np.zeros((n_pairs, data.shape[1]))

    for i, (label_ind_0, label_ind_1) in enumerate(
            itertools.combinations(range(n_labels), 2)):
        # -- select the data for the current labels
        data_0 = data[label_ind_list[label_ind_0], :]
        data_1 = data[label_ind_list[label_ind_1], :]

        print("{} / {}".format(i + 1, n_pairs))

        for k in range(data.shape[1]):
            # -- if the feature has one single value across both groups the
            # -- test is skipped and U is set to n_0 * n_1 / 2, i.e. half
            # -- of the largest possible U
            if np.ma.all((data_0[:, k] == data_0[0, k])) and np.ma.all(
                    (data_1[:, k] == data_0[0, k])):
                ranks[i, k] = data_0[:, k].shape[0] * data_1[:, k].shape[0] / 2.
            else:
                # -- NOTE(review): which U value mannwhitneyu reports
                # -- depends on the installed SciPy version - confirm it
                # -- matches the scaling applied to these results later
                ranks[i, k] = stats.mannwhitneyu(data_0[:, k], data_1[:, k])[0]

    return ranks, labels_unique, label_ind_list
def u_stat_all_label_file_name(file_name, mask=None, is_from_old_matlab=False):
    """
    Calculate the U-statistic for all label pairings in a HCTSA_loc.mat
    file. The operations can be masked by a boolean array.

    Parameters:
    -----------
    file_name : string
        File name of the HCTSA_loc.mat file containing at least the
        matrices 'TimeSeries' and 'TS_DataMat'
    mask : ndarray dtype='bool', optional
        If given this acts as mask to which features are included in the
        calculation
    is_from_old_matlab : bool
        If the HCTSA_loc.mat files are saved from an older version of the
        comp engine. The order of entries is different.

    Returns:
    --------
    ranks : ndarray of dim (n_labels * (n_labels-1) / 2, nr_features)
        The U-statistic for all features and label pairs, where a row
        represents a pair of labels. Every column represents one feature
    labels_unique : list
        List of all unique labels
    label_ind_list : list
        List of index arrays where entry i holds all row-indices in data
        containing timeseries for labels_unique[i]
    """
    # ---------------------------------------------------------------------
    # load the data
    # ---------------------------------------------------------------------
    ts, data = mIO.read_from_mat_file(
        file_name, ['TimeSeries', 'TS_DataMat'],
        is_from_old_matlab=is_from_old_matlab)

    # ---------------------------------------------------------------------
    # mask the data if required
    # ---------------------------------------------------------------------
    # -- identity test, because '!=' on an ndarray compares element by
    # -- element and has no single truth value
    if mask is not None:
        data = data[:, mask]

    # ---------------------------------------------------------------------
    # extract the unique labels
    # ---------------------------------------------------------------------
    labels = [int(x.split(',')[-1]) for x in ts['keywords']]
    labels_unique = list(set(labels))
    labels = np.array(labels)

    # ---------------------------------------------------------------------
    # get indices for all unique labels
    # ---------------------------------------------------------------------
    label_ind_list = []
    for label in labels_unique:
        label_ind_list.append(np.nonzero(labels == label)[0])
    n_labels = len(label_ind_list)

    # ---------------------------------------------------------------------
    # calculate Mann-Whitney u-test
    # ---------------------------------------------------------------------
    # -- number of unordered label pairs; floor division so that the array
    # -- dimension is an integer regardless of how '/' divides
    n_pairs = n_labels * (n_labels - 1) // 2
    n_features = data.shape[1]
    ranks = np.zeros((n_pairs, n_features))

    label_pairs = itertools.combinations(range(n_labels), 2)
    for i, (label_ind_0, label_ind_1) in enumerate(label_pairs):
        # -- select the data for the current labels
        data_0 = data[label_ind_list[label_ind_0], :]
        data_1 = data[label_ind_list[label_ind_1], :]

        print("{} / {}".format(i + 1, n_pairs))

        for k in range(n_features):
            feat_0 = data_0[:, k]
            feat_1 = data_1[:, k]
            first_value = data_0[0, k]

            # -- every entry of both groups equals the same value: no test
            # -- is run and U is set to n_0 * n_1 / 2 (half of the largest
            # -- possible U)
            if np.ma.all(feat_0 == first_value) and np.ma.all(
                    feat_1 == first_value):
                ranks[i, k] = feat_0.shape[0] * feat_1.shape[0] / 2.
            else:
                # -- NOTE(review): the U value reported by mannwhitneyu
                # -- differs between SciPy versions - confirm against the
                # -- scaling used for these results
                ranks[i, k] = stats.mannwhitneyu(feat_0, feat_1)[0]

    return ranks, labels_unique, label_ind_list