def test_remove_nans(self):
    # Exercises utils.remove_nans on four input shapes: empty, partially NaN,
    # (almost) entirely NaN, and NaN-free.

    # Empty inputs: both outputs must contain no truthy entries
    cleaned_a, cleaned_b = utils.remove_nans([], [])
    assert not cleaned_a.any()
    assert not cleaned_b.any()

    # Some NaN: only positions 3 and 4 are non-NaN in both inputs
    raw_a = [1, np.nan, np.nan, 4, 5, 6, np.nan]
    raw_b = [np.nan, 2, np.nan, 4, 5, np.nan, 7]
    cleaned_a, cleaned_b = utils.remove_nans(raw_a, raw_b)
    expected = np.array([4, 5])
    assert (cleaned_a == expected).all()
    assert (cleaned_b == expected).all()

    # All NaN: no position is non-NaN in both inputs
    raw_a = [np.nan, np.nan, np.nan, np.nan]
    raw_b = [np.nan, np.nan, np.nan, 7]
    cleaned_a, cleaned_b = utils.remove_nans(raw_a, raw_b)
    assert not cleaned_a.any()
    assert not cleaned_b.any()

    # No NaN: inputs must come back unchanged (as arrays)
    raw_a = list(range(10))
    raw_b = [i * i for i in range(10)]
    cleaned_a, cleaned_b = utils.remove_nans(raw_a, raw_b)
    assert (cleaned_a == np.array(raw_a)).all()
    assert (cleaned_b == np.array(raw_b)).all()
def calculate_FP_sets(initial_corr, samp_var1, samp_var2, infln_metrics,
                      infln_mapping, threshold, fold, fold_value, param):
    """
    Determine which correlations (variable pairs) belong in which
    infln_metric_FP sets.
    ----------------------------------------------------------------------------
    INPUTS
    initial_corr  - Iterable of integer tuples. Variable pairs (i, j) initially
                    classified as significant (forward CUTIE) or insignificant
                    (reverse CUTIE). Note variable pairs (i,j) and (j,i) are
                    double counted.
    samp_var1     - 2D array. Each value in row i col j is the level of
                    variable j corresponding to sample i.
    samp_var2     - 2D array. Same as samp_var1 but for file 2.
    infln_metrics - List. Contains strings of infln_metrics (such as 'cookd').
    infln_mapping - Dictionary. Maps metric name strings (e.g. 'cookd') to the
                    function object that evaluates that metric.
    threshold     - Float. Level of significance testing (after adjusting for
                    multiple comparisons).
    fold          - Boolean. Forwarded unchanged to each metric function.
    fold_value    - Float. Forwarded unchanged to each metric function.
    param         - String. Either 'r' or 'p'; forwarded unchanged to each
                    metric function.

    OUTPUTS
    FP_infln_sets - Dictionary. Key is a particular influence metric, entry is
                    the set of variable pairs for which that metric's `exceeds`
                    array has a non-zero sum (i.e. classified as FP).
    """
    # every requested metric starts with an empty FP set
    FP_infln_sets = {name: set() for name in infln_metrics}

    for pair in initial_corr:
        idx1, idx2 = pair

        # remove nan for influence calculation
        clean_x, clean_y = utils.remove_nans(samp_var1[:, idx1],
                                             samp_var2[:, idx2])

        # influence needs at least two observations in each variable
        if len(clean_x) <= 1 or len(clean_y) <= 1:
            continue

        influence = return_influence(clean_x, clean_y)
        for name in infln_metrics:
            metric_func = infln_mapping[name]
            _, exceeds, _, _ = metric_func(
                idx1, idx2, samp_var1, samp_var2, influence=influence,
                threshold=threshold, fold=fold, fold_value=fold_value,
                param=param)

            # a zero sum means no sample was flagged, i.e. the pair is a TP
            if exceeds.sum() != 0:
                FP_infln_sets[name].add(pair)

    return FP_infln_sets
def test_pointwise_metrics(self):
    # Regression test for the pointwise influence metrics in
    # self.infln_mapping: results are generated once, round-tripped through
    # .npz / .txt files in self.work_dir, and then compared against a fresh
    # computation.

    # generate results and output intermediate file
    pointwise_results = {}
    for t in self.tuples:
        t1, t2 = t
        pointwise_results[str(t)] = {}

        # NaN filtering and the influence object depend only on the pair,
        # so compute them once per tuple instead of once per (p, f)
        x_old = self.samp_var1[:, t1]
        y_old = self.samp_var2[:, t2]
        var1_values, var2_values = utils.remove_nans(x_old, y_old)
        influence = statistics.return_influence(var1_values, var2_values)

        for p in ['p', 'r']:
            pointwise_results[str(t)][p] = {}
            for f in self.infln_mapping:
                arr_0, arr_1, arr_2, arr_3 = self.infln_mapping[f](
                    var1_index=t1, var2_index=t2,
                    samp_var1=self.samp_var1, samp_var2=self.samp_var2,
                    influence=influence, threshold=self.threshold[p],
                    fold=self.fold, fold_value=self.fold_value[p], param=p)

                # save results to compressed object and later text file
                fp = self.work_dir + '_'.join([str(t), p, f, '.npz'])
                np.savez(fp, arr_0, arr_1, arr_2, arr_3)

                results = []
                # context manager closes the archive's file handle
                with np.load(fp) as archive:
                    for key, value in archive.items():
                        results.append(value)
                        np.savetxt(
                            self.work_dir + '_'.join([str(t), p, f, key + '.txt']),
                            value)
                pointwise_results[str(t)][p][f] = results

    # test cutie, cookd, dffits, dsr
    # with inputs mixed with nan and neg values as defined in setUp
    for t in self.tuples:
        t1, t2 = t
        # NOTE(review): unlike the generation pass above, influence here is
        # computed from the raw columns without utils.remove_nans -- confirm
        # this asymmetry is intentional.
        var1_values = self.samp_var1[:, t1]
        var2_values = self.samp_var2[:, t2]
        influence = statistics.return_influence(var1_values, var2_values)
        for p in ['p', 'r']:
            for f in self.infln_mapping:
                results = self.infln_mapping[f](
                    var1_index=t1, var2_index=t2,
                    samp_var1=self.samp_var1, samp_var2=self.samp_var2,
                    influence=influence, threshold=self.threshold[p],
                    fold=self.fold, fold_value=self.fold_value[p], param=p)

                # comparison to 7 decimal places is the default value
                assert_almost_equal(pointwise_results[str(t)][p][f], results)
def compute_kc(new_var1, new_var2):
    """
    Compute Kendall correlation and return p and r values.
    ----------------------------------------------------------------------------
    INPUTS
    new_var1 - Array. Length sample size containing observations for given
               variable from file 1.
    new_var2 - Array. Same as new_var1 but for file 2.

    OUTPUTS
    p_value  - Float. P value reported by scipy.stats.kendalltau on the inputs
               after utils.remove_nans; np.nan if kendalltau raises ValueError.
    r_value  - Float. Kendall tau statistic for the same inputs; np.nan if
               kendalltau raises ValueError.
    """
    clean_x, clean_y = utils.remove_nans(new_var1, new_var2)
    try:
        tau, p_val = scipy.stats.kendalltau(clean_x, clean_y)
    except ValueError:
        # correlation could not be computed for these inputs
        return np.nan, np.nan
    # note the order: p value first, then correlation
    return p_val, tau
def initial_stats(samp_var1, samp_var2, corr_func, paired):
    """
    Helper function for assign_statistics. Applies the supplied correlation
    function to every (var1, var2) column pair after removing NaNs, and
    collects the resulting correlation strengths and p values.
    ----------------------------------------------------------------------------
    INPUTS
    samp_var1 - 2D array. Each value in row i col j is the level of variable j
                corresponding to sample i in the order that the samples are
                presented in samp_ids.
    samp_var2 - 2D array. Same as samp_var1 but for file 2.
    corr_func - Function. Desired function for computing correlation (e.g.
                scipy.stats.pearsonr, scipy.stats.spearmanr,
                scipy.stats.kendalltau). Must return (correlation, pvalue) in
                that order.
    paired    - Boolean. True if variables are paired; in that case pairs with
                var1 index <= var2 index are skipped and keep their initial
                value of 0 in both outputs.

    OUTPUTS
    corrs     - 2D array, n_var1 x n_var2 (dimensions as reported by
                utils.get_param). Entry [i][j] is the correlation between var i
                of file 1 and var j of file 2, or np.nan if corr_func raised
                ValueError.
    pvalues   - 2D array, same shape as corrs. Entry [i][j] is the matching
                p value, or np.nan if corr_func raised ValueError.
    """
    n_var1, n_var2, _ = utils.get_param(samp_var1, samp_var2)
    shape = [n_var1, n_var2]
    corrs = np.zeros(shape)
    pvalues = np.zeros(shape)

    for row in range(n_var1):
        for col in range(n_var2):
            # paired data: skip the diagonal and the upper triangle
            if paired and row <= col:
                continue

            # subset the data matrices into the cols needed
            x_vals, y_vals = utils.remove_nans(samp_var1[:, row],
                                               samp_var2[:, col])
            try:
                corrs[row, col], pvalues[row, col] = corr_func(x_vals, y_vals)
            except ValueError:
                corrs[row, col] = np.nan
                pvalues[row, col] = np.nan

    return corrs, pvalues
def resamplek_cutie(var1_index, var2_index, n_samp, samp_var1, samp_var2,
                    pvalues, corrs, threshold, resample_k, sign, forward,
                    statistic, fold, fold_value, param):
    """
    Perform CUTIE resampling on a given pair of variables and test CUTIE status.
    ----------------------------------------------------------------------------
    INPUTS
    var1_index - Integer. Index of variable in file 1.
    var2_index - Integer. Index of variable in file 2.
    n_samp     - Integer. Number of samples.
    samp_var1  - 2D array. Each value in row i col j is the level of variable j
                 corresponding to sample i in the order that the samples are
                 presented in samp_ids when parsed.
    samp_var2  - 2D array. Same as samp_var1 but for file 2.
    pvalues    - 2D array. Entry row i, col j represents p value of correlation
                 between i-th var1 and j-th var2.
    corrs      - 2D array. Contains values of correlation strength between
                 var i and var j.
    threshold  - Float. Level of significance testing (after adjusting for
                 multiple comparisons).
    resample_k - Integer. Number of samples removed simultaneously; every
                 combination of resample_k samples out of n_samp is evaluated.
    sign       - Integer. -1 or 1, depending on original sign of correlation to
                 check against following re-evaluation.
    forward    - Boolean. True if CUTIE is run in the forward direction, False
                 if reverse. Compared by identity (`is True` / `is False`).
    statistic  - String. Describes analysis being performed. One of 'pearson',
                 'rpearson', 'spearman', 'rspearman', 'kendall', 'rkendall'.
    fold       - Boolean. Determines whether you require the new P value to be
                 a certain fold greater to be classified as a CUTIE.
    fold_value - Float. Determines fold difference constraint imposed on the
                 resampled p-value needed for a correlation to be classified as
                 a CUTIE.
    param      - String. Either 'r' or 'p' depending on whether r value or
                 p value will be used to filter correlations.

    OUTPUTS
    reverse    - 1D array. Index i counts the resamples containing sample i for
                 which the correlation changed sign.
    exceeds    - 1D array. Index i counts the resamples containing sample i for
                 which the recomputed statistic crossed the threshold
                 (became insignificant in forward mode, significant in reverse
                 mode).
    extrema_p  - 1D array. Length n_samp, contains lowest or highest p value
                 observed thus far for a particular sample, depending if reverse
                 or forward CUTIE was run respectively (maintained by
                 update_rev_extrema_rp).
    extrema_r  - 1D array. Same as extrema_p but for R / correlation strength
                 values.

    RAISES
    ValueError - If `statistic` is not one of the recognised names (raised when
                 the first resample is evaluated).
    """
    # initialize indicators and variables
    exceeds, reverse, extrema_p, extrema_r, var1, var2 = utils.init_var_indicators(
        var1_index, var2_index, samp_var1, samp_var2, forward)

    # iteratively delete k samples and recompute statistics; combinations are
    # consumed lazily rather than materialised all at once
    for combo in itertools.combinations(range(n_samp), resample_k):
        indices = list(combo)

        # drop the selected samples (np.isin supersedes the deprecated np.in1d)
        new_var1 = var1[~np.isin(np.arange(len(var1)), indices)]
        new_var2 = var2[~np.isin(np.arange(len(var2)), indices)]

        # remove NaNs
        new_var1, new_var2 = utils.remove_nans(new_var1, new_var2)

        # compute new p_value and r_value depending on statistic
        if statistic in ('pearson', 'rpearson'):
            p_value, r_value = compute_pc(new_var1, new_var2)
        elif statistic in ('spearman', 'rspearman'):
            p_value, r_value = compute_sc(new_var1, new_var2)
        elif statistic in ('kendall', 'rkendall'):
            p_value, r_value = compute_kc(new_var1, new_var2)
        else:
            # previously this fell through to an unbound p_value / r_value
            raise ValueError('Unrecognized statistic: %r' % (statistic,))

        # update reverse, maxp, and minr
        reverse, extrema_p, extrema_r = update_rev_extrema_rp(
            sign, r_value, p_value, indices, reverse, extrema_p, extrema_r,
            forward)

        # check sign reversal
        # NOTE(review): update_rev_extrema_rp also receives and returns
        # `reverse`; if it already counts sign reversals this increments twice
        # -- confirm against that helper.
        if np.sign(r_value) != sign:
            for i in indices:
                reverse[i] += 1

        # Decide whether this resample crosses the threshold. With `fold` set,
        # the value must additionally differ from the original statistic by
        # the fold_value factor.
        flagged = False
        if forward is True:
            if param == 'p':
                # fold change p-value restraint; NaN counts as insignificant
                flagged = np.isnan(p_value) or (
                    p_value > threshold
                    and (not fold
                         or p_value > pvalues[var1_index][var2_index] * fold_value))
            elif param == 'r':
                # fold change r-value restraint; NaN counts as insignificant
                flagged = np.isnan(r_value) or (
                    np.abs(r_value) < threshold
                    and (not fold
                         or np.abs(r_value)
                         < np.abs(corrs[var1_index][var2_index]) * fold_value))
        elif forward is False:
            if param == 'p':
                # fold change p-value restraint; NaN is never flagged here
                flagged = (
                    p_value < threshold
                    and (not fold
                         or p_value < pvalues[var1_index][var2_index] / fold_value))
            elif param == 'r':
                # fold change r-value restraint
                flagged = np.isnan(r_value) or (
                    np.abs(r_value) > threshold
                    and (not fold
                         or np.abs(r_value)
                         > np.abs(corrs[var1_index][var2_index]) * fold_value))

        if flagged:
            for i in indices:
                exceeds[i] += 1

    return reverse, exceeds, extrema_p, extrema_r