def hmsort(self, sort):
    """Sort the rows of every matrix in ``self.data`` in place.

    self.data is assumed to be a nested dict ``{t: {g: {c: 2-D array}}}``
    where each cell-type matrix under one (t, g) has the same row count.

    sort : None  -> leave the data untouched;
           0     -> rank rows by total signal summed over all cell types;
           k > 0 -> rank rows by the row sums of the k-th cell type only.
    Rows end up in descending order of the chosen row score.
    """
    # BUGFIX: the original guard `if not sort` also swallowed sort == 0,
    # which made the sum-over-all-cell-types branch unreachable.  Test
    # explicitly for None so that 0 selects that branch as intended.
    if sort is None:
        return
    for t in list(self.data.keys()):
        for g in self.data[t].keys():
            if sort == 0:
                # Rank rows by their total signal across every cell type.
                sumarr = numpy.sum(
                    [numpy.sum(d, axis=1) for d in list(self.data[t][g].values())],
                    axis=0)
            else:
                # Rank rows by the row sums of the (sort-1)-th cell type.
                sumarr = numpy.sum(list(self.data[t][g].values())[sort - 1], axis=1)
            # Ordinal ranks: 1 = smallest score, n = largest.
            ind = stats.rankdata(sumarr, method='ordinal')
            for c in self.data[t][g].keys():
                d = numpy.empty(shape=self.data[t][g][c].shape)
                for k, ranki in enumerate(ind):
                    # Rank 1 lands on the last row, rank n on the first,
                    # i.e. rows sorted by descending sumarr.
                    d[-ranki, :] = self.data[t][g][c][k, :]
                self.data[t][g][c] = d
def spearman_rs(l1, l2):
    """Compute the Spearman rank correlation coefficient and its p-value.

    l1, l2 : equally long sequences of observations.
    Returns (rs, p) on success; returns -1.0 (after printing an error)
    when either list is empty or the lengths differ.
    """
    # PY3 FIX: `print 'x'` statements are a SyntaxError on Python 3;
    # replaced with print() calls, everything else unchanged.
    if len(l1) == 0 or len(l2) == 0:
        print('ERROR: LISTS CONTAIN NO ELEMENTS!')
        return -1.
    elif len(l1) != len(l2):
        print('ERROR: LISTS HAVE TO HAVE THE SAME LENGTH!')
        return -1.
    l1 = rankdata(l1)
    l2 = rankdata(l2)
    l1_mean = sum(l1) / len(l1)
    l2_mean = sum(l2) / len(l2)
    sum1 = 0.
    sum2 = 0.
    numerator = 0.
    # Spearman rs = Pearson correlation computed on the ranks.
    for i in range(0, len(l1)):
        numerator += (l1[i] - l1_mean) * (l2[i] - l2_mean)
        sum1 += (l1[i] - l1_mean) ** 2
        sum2 += (l2[i] - l2_mean) ** 2
    denum = sqrt(sum1) * sqrt(sum2)
    rs = numerator / denum
    # Student's t statistic for rs with n - 2 degrees of freedom.
    t = len(l1) - 2.
    t /= 1. - rs ** 2
    t = rs * sqrt(t)
    # Student's t is symmetric about zero: evaluate the lower tail.
    if t > 0:
        t_help = (-1.) * t
    else:
        t_help = t
    p = stdtr(len(l1) - 2., t_help)
    return (rs, p)
def spearman_rs(l1, l2):
    """Compute the Spearman rank correlation coefficient with p-value.

    Returns the tuple (rs, p), or -1.0 (after printing an error message)
    for empty or unequal-length inputs.
    """
    # PY3 FIX: Python-2 print statements converted to print() calls.
    if len(l1) == 0 or len(l2) == 0:
        print('ERROR: LISTS CONTAIN NO ELEMENTS!')
        return -1.
    elif len(l1) != len(l2):
        print('ERROR: LISTS HAVE TO HAVE THE SAME LENGTH!')
        return -1.
    l1 = rankdata(l1)
    l2 = rankdata(l2)
    l1_mean = sum(l1) / len(l1)
    l2_mean = sum(l2) / len(l2)
    sum1 = 0.
    sum2 = 0.
    numerator = 0.
    # rs is the Pearson correlation of the two rank vectors.
    for i in range(0, len(l1)):
        numerator += (l1[i] - l1_mean) * (l2[i] - l2_mean)
        sum1 += (l1[i] - l1_mean) ** 2
        sum2 += (l2[i] - l2_mean) ** 2
    denum = sqrt(sum1) * sqrt(sum2)
    rs = numerator / denum
    # Transform rs into a Student's t statistic (n - 2 dof).
    t = len(l1) - 2.
    t /= 1. - rs ** 2
    t = rs * sqrt(t)
    # Use the negative tail; t's distribution is symmetric around zero.
    if t > 0:
        t_help = (-1.) * t
    else:
        t_help = t
    p = stdtr(len(l1) - 2., t_help)
    return (rs, p)
def hmsort(self, sort):
    """Sort the rows of every matrix in ``self.data`` in place.

    self.data is assumed to be a nested dict ``{t: {g: {c: 2-D array}}}``.

    sort : None  -> no-op;
           0     -> rank rows by total signal over all cell types;
           k > 0 -> rank rows by the row sums of the k-th cell type.
    Rows end up in descending order of the chosen score.
    """
    # PY3 FIX: `dict.values()[k]` is not subscriptable on Python 3 -- wrap
    # in list().  BUGFIX: `if not sort` also swallowed sort == 0, making
    # the sum-over-all branch unreachable; test for None explicitly.
    if sort is None:
        return
    for t in list(self.data.keys()):
        for g in self.data[t].keys():
            if sort == 0:
                # Total signal per row, summed across every cell type.
                sumarr = numpy.sum(
                    [numpy.sum(d, axis=1) for d in list(self.data[t][g].values())],
                    axis=0)
            else:
                # Row sums of the (sort-1)-th cell type only.
                sumarr = numpy.sum(list(self.data[t][g].values())[sort - 1], axis=1)
            ind = stats.rankdata(sumarr, method='ordinal')
            for c in self.data[t][g].keys():
                d = numpy.empty(shape=self.data[t][g][c].shape)
                for k, ranki in enumerate(ind):
                    # rank 1 -> last row, rank n -> first row (descending).
                    d[-ranki, :] = self.data[t][g][c][k, :]
                self.data[t][g][c] = d
def get_tied_rank_single(batch_score):
    """Map a 1-D score tensor to normalised descending ranks in (0, 1].

    The highest score maps to 1/n, the lowest to 1, with ties averaged.
    """
    n = batch_score.size(dim=0)
    # Double rankdata keeps tie-averaged ranks stable; shift to 0-based.
    ascending = stats.rankdata(stats.rankdata(batch_score)) - 1
    # Flip so larger scores get smaller rank values, then normalise by n.
    descending = (ascending * -1) + n
    normalised = torch.from_numpy(descending).float()
    return normalised / n
def score_sentence_level(gold, pred):
    """Sentence-level QE metrics.

    Returns ((pearson, MAE, RMSE), (spearman, DeltaAvg)) comparing gold
    scores against predictions.
    """
    gold_ranks = rankdata(gold, method="ordinal")
    pred_ranks = rankdata(pred, method="ordinal")
    pearson = pearsonr(gold, pred)
    mae = mean_absolute_error(gold, pred)
    rmse = np.sqrt(mean_squared_error(gold, pred))
    spearman = spearmanr(gold_ranks, pred_ranks)
    delta_avg = delta_average(gold, pred_ranks)
    return (pearson[0], mae, rmse), (spearman[0], delta_avg)
def ROC_AUC(fg_vals, bg_vals):
    """ROC area under the curve via the rank-sum (Mann-Whitney U) identity.

    Returns None when either score list is empty.
    """
    if len(fg_vals) == 0 or len(bg_vals) == 0:
        return None
    n_fg = len(fg_vals)
    n_total = n_fg + len(bg_vals)
    # Ranks of the foreground scores within the pooled sample vs alone.
    pooled_ranks = stats.rankdata(fg_vals + bg_vals)
    fg_only_ranks = stats.rankdata(fg_vals)
    rank_excess = sum(pooled_ranks[:n_fg]) - sum(fg_only_ranks)
    return rank_excess / (n_fg * (n_total - n_fg))
def __getitem__(self, index):
    """Return one training pair: a random sequence and its normalised ranks.

    Ranks are descending (largest value -> 1/n, smallest -> 1), tie-averaged.
    """
    seq = get_rand_seq(self.seq_len, self.dist)
    n = seq.size
    # 0-based ascending ranks (double rankdata keeps tie averages stable).
    ascending = stats.rankdata(stats.rankdata(seq)) - 1
    # Flip to descending and normalise into (0, 1].
    ranks = torch.from_numpy((ascending * -1) + n).float() / n
    return torch.FloatTensor(seq), torch.FloatTensor(ranks)
def ROC_AUC(fg_vals, bg_vals):
    """Area under the ROC curve computed from rank sums.

    Equivalent to U / (n_fg * n_bg); returns None on empty input.
    """
    if len(fg_vals) == 0 or len(bg_vals) == 0:
        return None
    fg_count = len(fg_vals)
    all_count = fg_count + len(bg_vals)
    within_ranks = stats.rankdata(fg_vals)
    pooled_ranks = stats.rankdata(fg_vals + bg_vals)
    u_stat = sum(pooled_ranks[:fg_count]) - sum(within_ranks)
    return u_stat / (fg_count * (all_count - fg_count))
def MNCP(fg_vals, bg_vals): from scipy.stats import stats from numpy import mean #from pylab import * fg_len = len(fg_vals) total_len = len(fg_vals) + len(bg_vals) fg_rank = stats.rankdata(fg_vals) total_rank = stats.rankdata(fg_vals + bg_vals) slopes = [] for i in range(len(fg_vals)): slope = ((fg_len - fg_rank[i] + 1) / fg_len ) / ((total_len - total_rank[i] + 1)/ total_len) slopes.append(slope) return mean(slopes)
def sentence_level_scores(
        true_targets: List[float],
        predicted_targets: List[float]) -> Tuple[Tuple, Tuple]:
    """Sentence-level evaluation scores.

    Returns ((pearson, MAE, RMSE), (spearman, DeltaAvg)) comparing the
    true targets against the predicted ones.
    """
    gold_ranks = rankdata(true_targets, method="ordinal")  # NOQA
    pred_ranks = rankdata(predicted_targets, method="ordinal")  # NOQA
    pearson_corr = pearsonr(true_targets, predicted_targets)
    mae = mean_absolute_error(true_targets, predicted_targets)
    rmse = np.sqrt(mean_squared_error(true_targets, predicted_targets))
    spearman_corr = spearmanr(gold_ranks, pred_ranks)
    delta_avg = delta_average(true_targets, pred_ranks)
    return (pearson_corr[0], mae, rmse), (spearman_corr[0], delta_avg)
def calc_IDR(theta, r1, r2):
    """Compute local and global IDR values for two replicates.

    Python port of the reference R code:

        idr <- 1 - e.z
        o <- order(idr)
        idr.o <- idr[o]
        idr.rank <- rank(idr.o, ties.method = "max")
        top.mean <- function(index, x) { mean(x[1:index]) }
        IDR.o <- sapply(idr.rank, top.mean, idr.o)
        IDR <- idr
        IDR[o] <- IDR.o

    theta : (mu, sigma, rho, p) -- copula mixture parameters.
    r1, r2 : rank vectors of the signal in each replicate.
    Returns (localIDR, IDR) arrays aligned with the input order.
    """
    mu, sigma, rho, p = theta
    # Map the observed ranks to pseudo-values under the fitted marginals.
    z1 = compute_pseudo_values(r1, mu, sigma, p, EPS=1e-12)
    z2 = compute_pseudo_values(r2, mu, sigma, p, EPS=1e-12)
    # Local IDR: presumably one minus the posterior membership probability
    # of the reproducible component -- confirm against calc_post_membership_prbs.
    localIDR = 1 - calc_post_membership_prbs(numpy.array(theta), z1, z2)
    if idr.FILTER_PEAKS_BELOW_NOISE_MEAN:
        localIDR[z1 + z2 < 0] = 1
    # It doesn't make sense for the IDR values to be smaller than the
    # optimization tolerance.
    localIDR = numpy.clip(localIDR, idr.CONVERGENCE_EPS_DEFAULT, 1)
    # Global IDR: mean of the sorted local IDRs up to each value's
    # max-tie rank, then mapped back to the original peak order.
    local_idr_order = localIDR.argsort()
    ordered_local_idr = localIDR[local_idr_order]
    ordered_local_idr_ranks = rankdata(ordered_local_idr, method='max')
    IDR = []
    for i, rank in enumerate(ordered_local_idr_ranks):
        IDR.append(ordered_local_idr[:rank].mean())
    # argsort of the sort order inverts the permutation.
    IDR = numpy.array(IDR)[local_idr_order.argsort()]
    return localIDR, IDR
def directed_mannwhitneyu(x, y, use_continuity=True):
    """
    Copy of scipy.stats.mannwhitneyu which multiplies the statistic by
    the direction (negative when the smaller U belongs to x).
    """
    x = asarray(x)
    y = asarray(y)
    n1, n2 = len(x), len(y)
    # Rank the pooled sample; the first n1 entries are x's ranks.
    pooled_ranks = rankdata(np.concatenate((x, y)))
    u1 = n1 * n2 + (n1 * (n1 + 1)) / 2.0 - np.sum(pooled_ranks[0:n1], axis=0)
    u2 = n1 * n2 - u1
    bigu, smallu = max(u1, u2), min(u1, u2)
    tie_factor = tiecorrect(pooled_ranks)
    if tie_factor == 0:
        raise ValueError('All numbers are identical in amannwhitneyu')
    sd = np.sqrt(tie_factor * n1 * n2 * (n1 + n2 + 1) / 12.0)
    # Normal approximation, optionally with continuity correction.
    shift = 0.5 if use_continuity else 0.0
    z = abs((bigu - shift - n1 * n2 / 2.0) / sd)
    statistic = eps if smallu == 0 else smallu
    direction = -1 if smallu == u1 else 1
    return statistic * direction, distributions.norm.sf(z)  # (1.0 - zprob(z))
def MNCP(fg_vals, bg_vals): from scipy.stats import stats from numpy import mean #from pylab import * fg_len = len(fg_vals) total_len = len(fg_vals) + len(bg_vals) fg_rank = stats.rankdata(fg_vals) total_rank = stats.rankdata(fg_vals + bg_vals) slopes = [] for i in range(len(fg_vals)): slope = ((fg_len - fg_rank[i] + 1) / fg_len) / ( (total_len - total_rank[i] + 1) / total_len) slopes.append(slope) return mean(slopes)
def calc_IDR(theta, r1, r2):
    """Compute local and global IDR values for two replicates.

    Python port of the reference R code:

        idr <- 1 - e.z
        o <- order(idr)
        idr.o <- idr[o]
        idr.rank <- rank(idr.o, ties.method = "max")
        top.mean <- function(index, x) { mean(x[1:index]) }
        IDR.o <- sapply(idr.rank, top.mean, idr.o)
        IDR <- idr
        IDR[o] <- IDR.o

    theta : (mu, sigma, rho, p) -- copula mixture parameters.
    r1, r2 : rank vectors of the signal in each replicate.
    Returns (localIDR, IDR) arrays aligned with the input order.
    """
    mu, sigma, rho, p = theta
    # Pseudo-values of the ranks under the fitted mixture marginals.
    z1 = compute_pseudo_values(r1, mu, sigma, p, EPS=1e-12)
    z2 = compute_pseudo_values(r2, mu, sigma, p, EPS=1e-12)
    # Local IDR: presumably 1 - posterior membership probability of the
    # reproducible component -- confirm against calc_post_membership_prbs.
    localIDR = 1-calc_post_membership_prbs(numpy.array(theta), z1, z2)
    if idr.FILTER_PEAKS_BELOW_NOISE_MEAN:
        localIDR[z1 + z2 < 0] = 1
    # it doesn't make sense for the IDR values to be smaller than the
    # optimization tolerance
    localIDR = numpy.clip(localIDR, idr.CONVERGENCE_EPS_DEFAULT, 1)
    # Global IDR: running mean of the sorted local IDRs evaluated at each
    # value's max-tie rank, then mapped back to the input order.
    local_idr_order = localIDR.argsort()
    ordered_local_idr = localIDR[local_idr_order]
    ordered_local_idr_ranks = rankdata(
        ordered_local_idr, method='max'
    )
    IDR = []
    for i, rank in enumerate(ordered_local_idr_ranks):
        IDR.append(ordered_local_idr[:rank].mean())
    # argsort of the sort order inverts the permutation.
    IDR = numpy.array(IDR)[local_idr_order.argsort()]
    return localIDR, IDR
def MNCP(fg_vals, bg_vals): fg_len = len(fg_vals) total_len = len(fg_vals) + len(bg_vals) if type(fg_vals) != type(np.array([])): fg_vals = np.array(fg_vals) if type(bg_vals) != type(np.array([])): bg_vals = np.array(bg_vals) fg_rank = stats.rankdata(fg_vals) total_rank = stats.rankdata(np.hstack((fg_vals, bg_vals))) slopes = [] for i in range(len(fg_vals)): slope = ((fg_len - fg_rank[i] + 1) / fg_len ) / ((total_len - total_rank[i] + 1)/ total_len) slopes.append(slope) return np.mean(slopes)
def read_hter(file_name):
    """Read one HTER score per line from *file_name*.

    Returns (method, segments) where method is the file name and segments
    is an array with columns: 1-based segment id, score, ordinal rank.
    """
    with open(file_name) as handle:
        scores = np.array([line.strip() for line in handle], dtype='float')
    segment_ids = np.arange(1, scores.shape[0] + 1)
    ordinal_ranks = rankdata(scores, method='ordinal')
    segments = np.vstack((segment_ids, scores, ordinal_ranks)).T
    return file_name, segments
def calc_global_IDR(localIDR):
    """Convert local IDR values to global IDR values.

    The global IDR of an item is the mean local IDR over all items at
    least as reproducible as it, i.e. the running mean of the sorted
    local IDRs evaluated at each item's max-tie rank.

    localIDR : 1-D numpy array of local IDR values.
    Returns an array aligned with the input order.
    """
    order = localIDR.argsort()
    ordered = localIDR[order]
    # 'max' ranks so that tied values share the end of their tie run.
    ranks = rankdata(ordered, method='max').astype(int)
    # PERF FIX: the original computed ordered[:rank].mean() per element
    # (O(n^2)); a cumulative sum gives the same prefix means in O(n).
    prefix_means = numpy.cumsum(ordered) / numpy.arange(1, ordered.size + 1)
    IDR = prefix_means[ranks - 1]
    # Invert the sort permutation to restore the input order.
    return IDR[order.argsort()]
def calc_global_IDR(localIDR):
    """Convert local IDR values to global IDR values.

    Global IDR = running mean of the sorted local IDRs taken at each
    item's max-tie rank, mapped back to the original order.

    localIDR : 1-D numpy array of local IDR values.
    """
    order = localIDR.argsort()
    ordered = localIDR[order]
    # Ties share the largest rank of their run.
    ranks = rankdata(ordered, method='max').astype(int)
    # PERF FIX: replaces the O(n^2) ordered[:rank].mean() loop with an
    # O(n) cumulative-sum computation of the same prefix means.
    prefix_means = numpy.cumsum(ordered) / numpy.arange(1, ordered.size + 1)
    IDR = prefix_means[ranks - 1]
    return IDR[order.argsort()]
def ROC_AUC(fg_vals, bg_vals):
    """ROC AUC of foreground vs background scores via rank sums.

    Returns None when either input is empty.
    """
    if len(fg_vals) == 0 or len(bg_vals) == 0:
        return None
    n_fg = len(fg_vals)
    n_all = n_fg + len(bg_vals)
    # Coerce plain sequences to ndarrays (exact-type check kept on purpose).
    if type(fg_vals) != np.ndarray:
        fg_vals = np.array(fg_vals)
    if type(bg_vals) != np.ndarray:
        bg_vals = np.array(bg_vals)
    fg_only_ranks = stats.rankdata(fg_vals)
    pooled_ranks = stats.rankdata(np.hstack((fg_vals, bg_vals)))
    u_stat = sum(pooled_ranks[:n_fg]) - sum(fg_only_ranks)
    return u_stat / (n_fg * (n_all - n_fg))
def MNCP(fg_vals, bg_vals): from scipy.stats import stats from numpy import mean,array,hstack #from pylab import * fg_len = len(fg_vals) total_len = len(fg_vals) + len(bg_vals) if type(fg_vals) != type(array([])): fg_vals = array(fg_vals) if type(bg_vals) != type(array([])): bg_vals = array(bg_vals) fg_rank = stats.rankdata(fg_vals) total_rank = stats.rankdata(hstack((fg_vals, bg_vals))) slopes = [] for i in range(len(fg_vals)): slope = ((fg_len - fg_rank[i] + 1) / fg_len ) / ((total_len - total_rank[i] + 1)/ total_len) slopes.append(slope) return mean(slopes)
def ROC_AUC(fg_vals, bg_vals):
    """ROC AUC of foreground vs background scores (rank-sum identity).

    Returns None when either input is empty.
    """
    # FIX: `from scipy.stats import stats` imported the private alias
    # module scipy.stats.stats (deprecated in SciPy 1.8, removed in 1.13).
    from scipy import stats
    from numpy import array, hstack
    if len(fg_vals) == 0 or len(bg_vals) == 0:
        return None
    fg_len = len(fg_vals)
    total_len = len(fg_vals) + len(bg_vals)
    # Coerce plain sequences to ndarrays.
    if type(fg_vals) != type(array([])):
        fg_vals = array(fg_vals)
    if type(bg_vals) != type(array([])):
        bg_vals = array(bg_vals)
    fg_rank = stats.rankdata(fg_vals)
    total_rank = stats.rankdata(hstack((fg_vals, bg_vals)))
    return (sum(total_rank[:fg_len]) - sum(fg_rank)) / (fg_len * (total_len - fg_len))
def mncp(fg_vals, bg_vals): """ Computes the Mean Normalized Conditional Probability (MNCP). MNCP is described in Clarke & Granek, Bioinformatics, 2003. Parameters ---------- fg_vals : array_like The list of values for the positive set. bg_vals : array_like The list of values for the negative set. Returns ------- score : float MNCP score """ fg_len = len(fg_vals) total_len = len(fg_vals) + len(bg_vals) if not isinstance(fg_vals, np.ndarray): fg_vals = np.array(fg_vals) if not isinstance(bg_vals, np.ndarray): bg_vals = np.array(bg_vals) fg_rank = stats.rankdata(fg_vals) total_rank = stats.rankdata(np.hstack((fg_vals, bg_vals))) slopes = [] for i in range(len(fg_vals)): slope = ((fg_len - fg_rank[i] + 1) / fg_len) / ( (total_len - total_rank[i] + 1) / total_len ) slopes.append(slope) return np.mean(slopes)
def relative_ranks(self, y_hat, y):
    """
    Compute mean rank of correct answer in output sorted by probability.

    Result is 0 when the correct choice is ranked first and 1 when it is
    ranked last among the available (non -inf) choices.
    :param y_hat: score matrix, one row per example
    :param y: index of the correct choice per example
    :return: normalised rank per example
    """
    scores = y_hat.squeeze().cpu()
    answers = y.squeeze().cpu()
    # -inf entries mark padded / unavailable choices.
    set_sizes = np.array((~torch.isinf(scores)).sum(1))
    # Rank descending by score (negate), tie-averaged, per row; 0-based.
    row_ranks = stats.rankdata(-scores.detach().numpy(), method='average', axis=1)
    answer_ranks = row_ranks[np.arange(len(answers)), answers] - 1
    return answer_ranks / (set_sizes - 1)
def mncp(fg_vals, bg_vals): """ Computes the Mean Normalized Conditional Probability (MNCP). MNCP is described in Clarke & Granek, Bioinformatics, 2003. Parameters ---------- fg_vals : array_like The list of values for the positive set. bg_vals : array_like The list of values for the negative set. Returns ------- score : float MNCP score """ fg_len = len(fg_vals) total_len = len(fg_vals) + len(bg_vals) if not isinstance(fg_vals, np.ndarray): fg_vals = np.array(fg_vals) if not isinstance(bg_vals, np.ndarray): bg_vals = np.array(bg_vals) fg_rank = stats.rankdata(fg_vals) total_rank = stats.rankdata(np.hstack((fg_vals, bg_vals))) slopes = [] for i in range(len(fg_vals)): slope = ((fg_len - fg_rank[i] + 1) / fg_len ) / ( (total_len - total_rank[i] + 1)/ total_len) slopes.append(slope) return np.mean(slopes)
def main(args):
    """Annotate each record of a JSON-lines file with percentile ranks.

    For every attribute listed in args["attributes_to_bucket"], compute
    the percentile of each record's value over the whole file and write
    every record back out with extra "<attr>_percentile" fields.

    args : mapping with keys "source_json" (input path), "target_json"
        (output path) and "attributes_to_bucket" (iterable of names).
    """
    print(f"Arguments: {args}")
    # attribute name -> raw values, in file order
    attribute_values_to_rank = defaultdict(lambda: list())
    # attribute name -> percentile ranks, in file order
    attribute_percentiles = defaultdict(lambda: list())
    # First pass: collect the values of every attribute to bucket.
    with jsonlines.open(args["source_json"], mode='r') as reader:
        for json_obj in reader:
            for attr in args["attributes_to_bucket"]:
                if attr in json_obj:
                    attribute_values_to_rank[attr].append(json_obj[attr])
    for k, v in attribute_values_to_rank.items():
        attr_value_vec = np.array(v)
        # Cumulative distribution: fraction of records whose value is
        # lower than or equal to this one ('max' rank handles ties).
        attr_perc_rank = stats.rankdata(attr_value_vec, "max") / len(attr_value_vec)
        attribute_percentiles[k].extend(attr_perc_rank.tolist())
    attribute_keys_list = attribute_percentiles.keys()
    attribute_values_list = attribute_percentiles.values()
    # Re-assemble one dict of "<attr>_percentile" fields per record.
    # NOTE(review): zipping the per-attribute lists assumes every record
    # carries ALL bucketed attributes; a record missing one would shift
    # the alignment with the second read pass -- confirm upstream.
    attribute_percentiles_combined = []
    for values in zip(*attribute_values_list):
        percentiles_per_attr = {}
        for i, attr in enumerate(attribute_keys_list):
            percentiles_per_attr[f"{attr}_percentile"] = values[i]
        attribute_percentiles_combined.append(percentiles_per_attr)
    # Second pass: merge the percentile fields into each record and write.
    with jsonlines.open(args["source_json"], mode='r') as reader:
        with jsonlines.open(args["target_json"], mode='w') as writer:
            for json_obj, percentiles in zip(reader, attribute_percentiles_combined):
                out_json_obj = {**json_obj, **percentiles}
                print(out_json_obj)
                writer.write(out_json_obj)
def write_big_submatrix(matrix, chrom, pos1, pos2, sections, section_pos, out,
                        matrix_size, waffle_size, waffle_radii, square_size,
                        metric='loop'):
    """Scan a Hi-C submatrix, score a "waffle" window around each cell and
    write per-cell statistics plus the window itself to *out*.

    matrix : sparse dict {(i, j): value} for the submatrix -- presumably
        interaction counts; confirm with caller.
    chrom, pos1, pos2 : chromosome name and submatrix offsets.
    sections : {chrom: length} upper bounds used to stay inside the chromosome.
    section_pos : {chrom: (start, ...)} genome-wide offset of the chromosome.
    out : writable file-like object receiving one tab-separated line per
        kept cell: pos1, pos2, rho, p-value, peak, flattened waffle.
    metric : 'loop' selects fast_matrix_to_decay_loop, anything else the
        no-loop variant.
    """
    # convert to numpy array for faster querying (faster than list of lists)
    num_matrix = np.asarray(
        [[matrix.get((i, j), 0) for j in range(matrix_size)]
         for i in range(matrix_size)])
    # another numpy array with string to convert to string only once per number
    str_matrix = np.asarray([[
        '{:.3f}'.format(matrix.get((i, j), 0)).rstrip('0').rstrip('.')
        for j in range(matrix_size)
    ] for i in range(matrix_size)])
    # iterate over each cell inside the inner matrix
    # extract a waffle around each cell and do stats
    tpos1 = pos1 + section_pos[chrom][0]
    tpos2 = pos2 + section_pos[chrom][0]
    if metric == 'loop':
        fast_matrix_to_decay = fast_matrix_to_decay_loop
    else:
        fast_matrix_to_decay = fast_matrix_to_decay_noloop
    # Flattened-window index sets used by the decay helper: cells between
    # the center and the diagonal vs the rest of the window.
    between_indexes = [
        j + i * waffle_size for i in range(waffle_radii, waffle_size)
        for j in range(waffle_radii + 1)
    ]
    outside_indexes = [
        j + i * waffle_size for i in range(waffle_radii, waffle_size)
        for j in range(waffle_radii + 1, waffle_size)
    ]
    outside_indexes += [
        j + i * waffle_size for i in range(waffle_radii)
        for j in range(waffle_size)
    ]
    # Ranked distances from the window center (the "x" of the Spearman test).
    dist_from_center = rankdata(pre_matrix_to_decay(waffle_size))
    for i in range(waffle_radii, square_size + waffle_radii):
        # we do not want anything outside chromosome
        if pos1 + i > sections[chrom]:
            break
        for j in range(waffle_radii, square_size + waffle_radii):
            # we do not want anything crossing over the diagonal
            if pos1 + i > pos2 + j:  # - waffle_size:
                continue
            # we do not want anything outside chromosome
            if pos2 + j > sections[chrom]:
                break
            ## get waffle (diagonal of the genomic matrix is located in the
            ## down left, i=waffle_size and j=0)
            waffle = num_matrix[i - waffle_radii:i + waffle_radii + 1,
                                j - waffle_radii:j + waffle_radii + 1]
            # if it's all zeroes we do not want it
            if not waffle.sum():
                continue
            # if it's smaller than expected we do not want it
            if len(waffle) < waffle_size:
                continue
            ## stats
            # spearman: Pearson on ranks is equivalent to spearmanr
            y = fast_matrix_to_decay(waffle, between_indexes, outside_indexes)
            # x, y = matrix_to_decay(waffle, waffle_size, metric=metric)
            rho, pval = pearsonr(dist_from_center, rankdata(y))
            # change this: x can be already known
            # if nan, the matrix is too sparse and we do not want it
            if isnan(rho):
                continue
            # peak intensity at the window center
            peak = get_center(waffle, len(waffle), span=1)
            ## store waffle and stats (string version, one line per cell)
            waffle = str_matrix[i - waffle_radii:i + waffle_radii + 1,
                                j - waffle_radii:j + waffle_radii + 1]
            out.write('{}\t{}\t{:.3g}\t{:.3g}\t{:.3f}\t{}\n'.format(
                tpos1 + i, tpos2 + j, rho, pval, peak,
                ','.join(v for l in waffle for v in l)))
def ROC_AUC_xlim(x_bla, y_bla, xlim=None):
    """Partial ROC AUC of two score lists, integrated up to FPR ``xlim``.

    x_bla, y_bla : score lists for the two classes (copied, not mutated).
    xlim : FPR cut-off; falsy values mean 1.0 (full curve).
    """
    # PY3 FIXES: `dict.keys().sort()`, cmp-function `list.sort`, sorting a
    # lazy `zip`, and `raise "Unequal!"` (string exception) all fail on
    # Python 3; replaced with sorted()/key equivalents preserving the
    # original comparator (rank ascending, index descending on ties).
    x = x_bla[:]
    y = y_bla[:]
    x.sort()
    y.sort()
    # Unique thresholds, ascending (same as the original dict-key trick).
    vals = sorted(set(x + y))
    len_x = float(len(x))
    len_y = float(len(y))
    new_x = []
    new_y = []
    x_p = 0
    y_p = 0
    # Sweep thresholds from high to low, counting values popped off each list.
    for val in vals[::-1]:
        while len(x) > 0 and x[-1] >= val:
            x.pop()
            x_p += 1
        while len(y) > 0 and y[-1] >= val:
            y.pop()
            y_p += 1
        new_y.append((len_x - x_p) / len_x)
        new_x.append((len_y - y_p) / len_y)
    new_x = 1 - array(new_x)
    new_y = 1 - array(new_y)
    x = new_x
    y = new_y
    if len(x) != len(y):
        raise ValueError("Unequal!")
    if not xlim:
        xlim = 1.0
    auc = 0.0
    # Order curve points by rank of x; break ties by larger index first,
    # matching the original cmp function.
    bla = sorted(zip(stats.rankdata(x), range(len(x))),
                 key=lambda pair: (pair[0], -pair[1]))
    prev_x = x[bla[0][1]]
    prev_y = y[bla[0][1]]
    index = 1
    # Trapezoidal integration of the curve up to xlim.
    while index < len(bla) and x[bla[index][1]] <= xlim:
        (rank, i) = bla[index]
        auc += y[i] * (x[i] - prev_x) - ((x[i] - prev_x) * (y[i] - prev_y) / 2.0)
        prev_x = x[i]
        prev_y = y[i]
        index += 1
    # Partial trapezoid for the segment crossing xlim, if any.
    if index < len(bla):
        (rank, i) = bla[index]
        auc += prev_y * (xlim - prev_x) + ((y[i] - prev_y) / (x[i] - prev_x)
                                           * (xlim - prev_x) * (xlim - prev_x) / 2)
    return auc
def wilcoxon_one_sided(x, y=None, zero_method="wilcox", correction=False):
    """
    Calculate the one-tailed Wilcoxon signed-rank test.

    The Wilcoxon signed-rank test tests the null hypothesis that two
    related paired samples come from the same distribution. In particular,
    it tests whether the distribution of the differences x - y is
    symmetric about zero. It is a non-parametric version of the paired
    T-test.

    Parameters
    ----------
    x : array_like
        The first set of measurements.
    y : array_like, optional
        The second set of measurements. If `y` is not given, then the `x`
        array is considered to be the differences between the two sets of
        measurements.
    zero_method : string, {"pratt", "wilcox", "zsplit"}, optional
        "pratt": Pratt treatment: includes zero-differences in the ranking
            process (more conservative)
        "wilcox": Wilcox treatment: discards all zero-differences
        "zsplit": Zero rank split: just like Pratt, but spliting the zero
            rank between positive and negative ones
    correction : bool, optional
        If True, apply continuity correction by adjusting the Wilcoxon
        rank statistic by 0.5 towards the mean value when computing the
        z-statistic. Default is False.

    Returns
    -------
    statistic : float
        The sum of the ranks of the differences above or below zero,
        whichever is smaller.
    pvalue : float
        The one-sided p-value for the test (null hypothesis: x <= y)

    Notes
    -----
    Because the normal approximation is used for the calculations, the
    samples used should be large. A typical rule is to require that
    n > 20.

    References
    ----------
    .. [1] http://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test
    """
    if zero_method not in ["wilcox", "pratt", "zsplit"]:
        raise ValueError("Zero method should be either 'wilcox' "
                         "or 'pratt' or 'zsplit'")
    if y is None:
        # x already holds the paired differences.
        d = asarray(x)
    else:
        x, y = map(asarray, (x, y))
        if len(x) != len(y):
            raise ValueError('Unequal N in wilcoxon.  Aborting.')
        d = x - y
    if zero_method == "wilcox":
        # Keep all non-zero differences
        d = compress(np.not_equal(d, 0), d, axis=-1)
    count = len(d)
    if count < 10:
        warnings.warn(
            "Warning: sample size too small for normal approximation.")
    # Rank the absolute differences, then split the rank sum by sign.
    r = stats.rankdata(abs(d))
    r_plus = np.sum((d > 0) * r, axis=0)
    r_minus = np.sum((d < 0) * r, axis=0)
    if zero_method == "zsplit":
        # Zero differences contribute half their rank to each side.
        r_zero = np.sum((d == 0) * r, axis=0)
        r_plus += r_zero / 2.
        r_minus += r_zero / 2.
    # One-sided statistic: r_minus (not min(r_plus, r_minus) as in the
    # two-sided scipy version), tested against the lower normal tail.
    T = r_minus
    mn = count * (count + 1.) * 0.25
    se = count * (count + 1.) * (2. * count + 1.)
    if zero_method == "pratt":
        # Zeros were ranked but are excluded from the tie correction.
        r = r[d != 0]
    replist, repnum = find_repeats(r)
    if repnum.size != 0:
        # Correction for repeated elements.
        se -= 0.5 * (repnum * (repnum * repnum - 1)).sum()
    se = sqrt(se / 24)
    correction = 0.5 * int(bool(correction)) * np.sign(T - mn)
    z = (T - mn - correction) / se
    # Lower-tail probability => one-sided p-value for H0: x <= y.
    prob = distributions.norm.cdf(z)
    return WilcoxonResult(T, prob)
def roc_auc_xlim(x_bla, y_bla, xlim=0.1):
    """
    Computes the ROC Area Under Curve until a certain FPR value.

    Parameters
    ----------
    fg_vals : array_like
        list of values for positive set
    bg_vals : array_like
        list of values for negative set
    xlim : float, optional
        FPR value

    Returns
    -------
    score : float
        ROC AUC score
    """
    pos = x_bla[:]
    neg = y_bla[:]
    pos.sort()
    neg.sort()
    # Unique thresholds over both lists, ascending.
    seen = {}
    for v in pos + neg:
        seen[v] = 1
    thresholds = sorted(seen.keys())
    n_pos = float(len(pos))
    n_neg = float(len(neg))
    curve_x = []
    curve_y = []
    popped_pos = 0
    popped_neg = 0
    # Sweep thresholds from high to low, popping values as they pass.
    for threshold in thresholds[::-1]:
        while len(pos) > 0 and pos[-1] >= threshold:
            pos.pop()
            popped_pos += 1
        while len(neg) > 0 and neg[-1] >= threshold:
            neg.pop()
            popped_neg += 1
        curve_y.append((n_pos - popped_pos) / n_pos)
        curve_x.append((n_neg - popped_neg) / n_neg)
    x = 1 - np.array(curve_x)
    y = 1 - np.array(curve_y)
    if len(x) != len(y):
        raise ValueError("Unequal!")
    if not xlim:
        xlim = 1.0
    auc = 0.0
    # Pair each point's x-rank with its position, ordered by position.
    pairs = sorted(zip(stats.rankdata(x), range(len(x))), key=lambda p: p[1])
    prev_x = x[pairs[0][1]]
    prev_y = y[pairs[0][1]]
    index = 1
    # Trapezoidal integration up to xlim.
    while index < len(pairs) and x[pairs[index][1]] <= xlim:
        _, i = pairs[index]
        auc += y[i] * (x[i] - prev_x) - ((x[i] - prev_x) * (y[i] - prev_y) / 2.0)
        prev_x = x[i]
        prev_y = y[i]
        index += 1
    # Partial trapezoid for the segment that crosses xlim, if any.
    if index < len(pairs):
        (rank, i) = pairs[index]
        auc += prev_y * (xlim - prev_x) + ((y[i] - prev_y) / (x[i] - prev_x)
                                           * (xlim - prev_x) * (xlim - prev_x) / 2)
    return auc
def ROC_AUC_xlim(x_bla, y_bla, xlim=None):
    """Partial ROC AUC of two score lists, integrated up to FPR ``xlim``.

    x_bla, y_bla : score lists for the two classes (inputs are copied).
    xlim : FPR cut-off; falsy values mean 1.0 (the full curve).
    """
    # PY3 FIXES: `dict.keys().sort()`, cmp-function sorting, sorting a lazy
    # `zip` object, and the string `raise "Unequal!"` are all broken on
    # Python 3.  Replaced with sorted()/key equivalents that preserve the
    # original comparator: rank ascending, index descending on ties.
    x = x_bla[:]
    y = y_bla[:]
    x.sort()
    y.sort()
    vals = sorted(set(x + y))
    len_x = float(len(x))
    len_y = float(len(y))
    new_x = []
    new_y = []
    x_p = 0
    y_p = 0
    # Sweep thresholds from high to low, counting popped values per list.
    for val in vals[::-1]:
        while len(x) > 0 and x[-1] >= val:
            x.pop()
            x_p += 1
        while len(y) > 0 and y[-1] >= val:
            y.pop()
            y_p += 1
        new_y.append((len_x - x_p) / len_x)
        new_x.append((len_y - y_p) / len_y)
    new_x = 1 - array(new_x)
    new_y = 1 - array(new_y)
    x = new_x
    y = new_y
    if len(x) != len(y):
        raise ValueError("Unequal!")
    if not xlim:
        xlim = 1.0
    auc = 0.0
    bla = sorted(zip(stats.rankdata(x), range(len(x))),
                 key=lambda pair: (pair[0], -pair[1]))
    prev_x = x[bla[0][1]]
    prev_y = y[bla[0][1]]
    index = 1
    # Trapezoidal integration of the curve up to xlim.
    while index < len(bla) and x[bla[index][1]] <= xlim:
        (rank, i) = bla[index]
        auc += y[i] * (x[i] - prev_x) - ((x[i] - prev_x) * (y[i] - prev_y) / 2.0)
        prev_x = x[i]
        prev_y = y[i]
        index += 1
    # Partial trapezoid for the segment crossing xlim, if any.
    if index < len(bla):
        (rank, i) = bla[index]
        auc += prev_y * (xlim - prev_x) + ((y[i] - prev_y) / (x[i] - prev_x)
                                           * (xlim - prev_x) * (xlim - prev_x) / 2)
    return auc