Example #1
    def hmsort(self, sort):
        if sort is None:
            pass
        elif sort == 0:  # sort rows by the signal summed over every column and cell type
            for t in list(self.data.keys()):
                for i, g in enumerate(self.data[t].keys()):
                    # print(numpy.sum(data[t][bed].values()[0], axis=1))
                    # print(len(numpy.sum(data[t][bed].values()[0], axis=1)))

                    sumarr = numpy.sum([numpy.sum(d, axis=1) for d in list(self.data[t][g].values())], axis=0)
                    # print(sumarr)
                    # sumarr = numpy.sum(sumarr, axis=1)
                    ind = stats.rankdata(sumarr, method='ordinal')  # The index for further sorting
                    # numpy.fliplr(ind)

                    for j, c in enumerate(self.data[t][g].keys()):
                        d = numpy.empty(shape=self.data[t][g][c].shape)
                        # rank 1 (smallest sum) fills the last row and rank n row 0,
                        # so rows end up ordered by descending total signal
                        for k, ranki in enumerate(ind):
                            d[-ranki, :] = self.data[t][g][c][k, :]
                        self.data[t][g][c] = d
        else:
            for t in list(self.data.keys()):
                for i, g in enumerate(self.data[t].keys()):
                    sumarr = numpy.sum(list(self.data[t][g].values())[sort - 1], axis=1)
                    # print(sumarr)
                    # sumarr = numpy.sum(sumarr, axis=1)
                    ind = stats.rankdata(sumarr, method='ordinal')  # The index for further sorting
                    # list(ind)
                    # print(ind)
                    for j, c in enumerate(self.data[t][g].keys()):
                        d = numpy.empty(shape=self.data[t][g][c].shape)
                        for k, ranki in enumerate(ind):
                            d[-ranki, :] = self.data[t][g][c][k, :]
                        self.data[t][g][c] = d
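A minimal sketch of the reordering trick in isolation, with made-up data: rank 1 (the smallest row sum) lands on the last row and rank n on row 0, so rows come out sorted by descending sum.

import numpy
from scipy import stats

m = numpy.array([[1, 1], [5, 5], [3, 3]])
ind = stats.rankdata(m.sum(axis=1), method='ordinal')  # array([1, 3, 2])
d = numpy.empty_like(m)
for k, ranki in enumerate(ind):
    d[-ranki, :] = m[k, :]
print(d)  # [[5 5] [3 3] [1 1]]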
Example #2
from math import sqrt

from scipy.special import stdtr
from scipy.stats import rankdata


def spearman_rs(l1, l2):
    """Compute the Spearman rank correlation coefficient with its one-sided p-value."""

    if len(l1) == 0 or len(l2) == 0:
        print('ERROR: LISTS CONTAIN NO ELEMENTS!')
        return -1.
    elif len(l1) != len(l2):
        print('ERROR: LISTS HAVE TO HAVE THE SAME LENGTH!')
        return -1.
    l1 = rankdata(l1)
    l2 = rankdata(l2)
    l1_mean = sum(l1) / len(l1)
    l2_mean = sum(l2) / len(l2)
    sum1 = 0.
    sum2 = 0.
    numerator = 0.
    # Compute Spearman rs
    for i in range(0, len(l1)):
        numerator += (l1[i] - l1_mean) * (l2[i] - l2_mean)
        sum1 += (l1[i] - l1_mean)**2
        sum2 += (l2[i] - l2_mean)**2
    denum = sqrt(sum1) * sqrt(sum2)
    rs = numerator / denum
    # Compute Spearman t
    t = len(l1) - 2.
    t /= 1. - rs**2
    t = rs * sqrt(t)
    # if t > 0: change sign, since student's t is axis symmetric around zero
    if t > 0:
        t_help = (-1.) * t
    else:
        t_help = t
    # one-sided p-value from the Student's t CDF
    p = stdtr(len(l1) - 2., t_help)
    return (rs, p)
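A quick cross-check against SciPy's built-in spearmanr on made-up data; spearman_rs returns a one-sided p-value, so it should come out near half of SciPy's two-sided one.

from scipy.stats import spearmanr

a = [1.0, 2.5, 3.1, 4.7, 5.2, 6.0, 7.3, 8.8]
b = [2.1, 1.9, 3.5, 4.1, 6.3, 5.9, 7.0, 9.2]
rs, p = spearman_rs(a, b)
rho, p_two_sided = spearmanr(a, b)
print(rs, rho)            # the rank correlations should agree
print(p, p_two_sided / 2)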
Example #3
import torch
from scipy import stats


def get_tied_rank_single(batch_score):
    n = batch_score.size(dim=0)
    rank = stats.rankdata(batch_score)  # ascending ranks, ties averaged
    rank = stats.rankdata(rank) - 1     # rebase so the smallest score maps to 0
    rank = (rank * -1) + n              # flip: the highest score now maps to 1, the lowest to n
    rank = torch.from_numpy(rank)
    rank = rank.float()
    rank = rank / n                     # normalize into (0, 1]
    return rank
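Hypothetical usage with a small score tensor; the highest score maps to 1/n and the lowest to 1.0, with ties sharing an averaged rank.

scores = torch.tensor([0.1, 0.9, 0.4, 0.9])
print(get_tied_rank_single(scores))  # tensor([1.0000, 0.3750, 0.7500, 0.3750])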
Example #4
import numpy as np
from scipy.stats import pearsonr, rankdata, spearmanr
from sklearn.metrics import mean_absolute_error, mean_squared_error

# delta_average is a project-local helper (presumably the DeltaAvg metric
# from the WMT quality-estimation evaluation scripts)


def score_sentence_level(gold, pred):
    pearson = pearsonr(gold, pred)
    mae = mean_absolute_error(gold, pred)
    rmse = np.sqrt(mean_squared_error(gold, pred))

    spearman = spearmanr(rankdata(gold, method="ordinal"),
                         rankdata(pred, method="ordinal"))
    delta_avg = delta_average(gold, rankdata(pred, method="ordinal"))
    return (pearson[0], mae, rmse), (spearman[0], delta_avg)
Example #5
    def __getitem__(self, index):
        rand_seq = get_rand_seq(self.seq_len, self.dist)
        # rand_seq = np.array([1,3,3,5],dtype=np.float32)
        ranks = stats.rankdata(rand_seq)
        ranks = stats.rankdata(ranks) - 1
        ranks = (ranks * -1) + rand_seq.size
        ranks = torch.from_numpy(ranks)
        ranks = ranks.float()
        ranks = ranks / rand_seq.size

        # zipp_sort_ind = zip(np.argsort(rand_seq)[::-1], range(self.seq_len))

        # ranks = [((y[1] + 1) / float(self.seq_len)) for y in sorted(zipp_sort_ind, key=lambda x: x[0])]

        return torch.FloatTensor(rand_seq), torch.FloatTensor(ranks)
Example #6
from scipy import stats


def ROC_AUC(fg_vals, bg_vals):
    #if len(fg_vals) != len(bg_vals):
    #	return None

    if len(fg_vals) == 0 or len(bg_vals) == 0:
        return None

    fg_len = len(fg_vals)
    total_len = len(fg_vals) + len(bg_vals)

    fg_rank = stats.rankdata(fg_vals)
    total_rank = stats.rankdata(fg_vals + bg_vals)

    # rank-sum (Mann-Whitney U) formulation of the ROC AUC
    return (sum(total_rank[:fg_len]) - sum(fg_rank)) / (fg_len * (total_len - fg_len))
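The rank-sum form above should agree with scikit-learn's roc_auc_score; a sketch on made-up scores, assuming scikit-learn is installed:

from sklearn.metrics import roc_auc_score

fg = [3.2, 2.8, 2.9, 3.5]        # hypothetical positive-set scores
bg = [1.1, 3.0, 2.7, 0.4, 1.8]   # hypothetical negative-set scores
labels = [1] * len(fg) + [0] * len(bg)
print(ROC_AUC(fg, bg), roc_auc_score(labels, fg + bg))  # both print 0.9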
Example #7
from typing import List, Tuple

import numpy as np
from scipy.stats import pearsonr, rankdata, spearmanr
from sklearn.metrics import mean_absolute_error, mean_squared_error

# delta_average is a project-local helper, as in Example #4


def sentence_level_scores(
        true_targets: List[float],
        predicted_targets: List[float]) -> Tuple[Tuple, Tuple]:
    pearson = pearsonr(true_targets, predicted_targets)
    mae = mean_absolute_error(true_targets, predicted_targets)
    rmse = np.sqrt(mean_squared_error(true_targets, predicted_targets))

    spearman = spearmanr(
        rankdata(true_targets, method="ordinal"),  # NOQA
        rankdata(predicted_targets, method="ordinal"),  # NOQA
    )
    delta_avg = delta_average(true_targets,
                              rankdata(predicted_targets, method="ordinal"))

    return (pearson[0], mae, rmse), (spearman[0], delta_avg)
Example #8
def calc_IDR(theta, r1, r2):
    """
    idr <- 1 - e.z
    o <- order(idr)
    idr.o <- idr[o]
    idr.rank <- rank(idr.o, ties.method = "max")
    top.mean <- function(index, x) {
        mean(x[1:index])
    }
    IDR.o <- sapply(idr.rank, top.mean, idr.o)
    IDR <- idr
    IDR[o] <- IDR.o
    """
    mu, sigma, rho, p = theta
    # compute_pseudo_values and calc_post_membership_prbs are helpers from the idr package
    z1 = compute_pseudo_values(r1, mu, sigma, p, EPS=1e-12)
    z2 = compute_pseudo_values(r2, mu, sigma, p, EPS=1e-12)
    localIDR = 1 - calc_post_membership_prbs(numpy.array(theta), z1, z2)
    if idr.FILTER_PEAKS_BELOW_NOISE_MEAN:
        localIDR[z1 + z2 < 0] = 1

    # it doesn't make sense for the IDR values to be smaller than the
    # optimization tolerance
    localIDR = numpy.clip(localIDR, idr.CONVERGENCE_EPS_DEFAULT, 1)
    local_idr_order = localIDR.argsort()
    ordered_local_idr = localIDR[local_idr_order]
    ordered_local_idr_ranks = rankdata(ordered_local_idr, method='max')
    IDR = []
    for i, rank in enumerate(ordered_local_idr_ranks):
        IDR.append(ordered_local_idr[:rank].mean())
    IDR = numpy.array(IDR)[local_idr_order.argsort()]

    return localIDR, IDR
Example #9
import numpy as np
from numpy import asarray
from scipy.stats import distributions, rankdata, tiecorrect

eps = np.finfo(float).eps  # assumed: a module-level constant in the original source


def directed_mannwhitneyu(x, y, use_continuity=True):
    """
    Copy of scipy.stats.mannwhitneyu which multiplies the statistic by the direction.
    """
    x = asarray(x)
    y = asarray(y)
    n1 = len(x)
    n2 = len(y)
    ranked = rankdata(np.concatenate((x, y)))
    rankx = ranked[0:n1]  # get the x-ranks
    u1 = n1 * n2 + (n1 *
                    (n1 + 1)) / 2.0 - np.sum(rankx, axis=0)  # calc U for x
    u2 = n1 * n2 - u1  # remainder is U for y
    bigu = max(u1, u2)
    smallu = min(u1, u2)
    t = tiecorrect(ranked)
    if t == 0:
        raise ValueError('All numbers are identical in mannwhitneyu')
    sd = np.sqrt(t * n1 * n2 * (n1 + n2 + 1) / 12.0)

    if use_continuity:
        # normal approximation for prob calc with continuity correction
        z = abs((bigu - 0.5 - n1 * n2 / 2.0) / sd)
    else:
        z = abs(
            (bigu - n1 * n2 / 2.0) / sd)  # normal approximation for prob calc
    direction = -1 if smallu == u1 else 1  # negative when x tends to be larger than y
    statistic = eps if smallu == 0 else smallu
    return direction * statistic, distributions.norm.sf(z)  # (1.0 - zprob(z))
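A hypothetical call; the sign of the returned statistic encodes direction, negative when x tends to be larger than y.

u_signed, p = directed_mannwhitneyu([5, 6, 7, 8], [1, 2, 3, 4])
print(u_signed, p)  # u_signed is negative here because every x exceeds every y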
Example #10
def MNCP(fg_vals, bg_vals):
    from scipy import stats
    from numpy import mean
    fg_len = len(fg_vals)
    total_len = len(fg_vals) + len(bg_vals)

    fg_rank = stats.rankdata(fg_vals)
    total_rank = stats.rankdata(fg_vals + bg_vals)

    slopes = []
    for i in range(len(fg_vals)):
        slope = ((fg_len - fg_rank[i] + 1) / fg_len) / (
            (total_len - total_rank[i] + 1) / total_len)
        slopes.append(slope)
    return mean(slopes)
Example #11
import numpy as np
from scipy import stats


def MNCP(fg_vals, bg_vals):
    fg_len = len(fg_vals)
    total_len = len(fg_vals) + len(bg_vals)

    if not isinstance(fg_vals, np.ndarray):
        fg_vals = np.array(fg_vals)
    if not isinstance(bg_vals, np.ndarray):
        bg_vals = np.array(bg_vals)

    fg_rank = stats.rankdata(fg_vals)
    total_rank = stats.rankdata(np.hstack((fg_vals, bg_vals)))

    slopes = []
    for i in range(len(fg_vals)):
        slope = ((fg_len - fg_rank[i] + 1) / fg_len) / ((total_len - total_rank[i] + 1) / total_len)
        slopes.append(slope)
    return np.mean(slopes)
Example #12
import numpy as np
from scipy.stats import rankdata


def read_hter(file_name):
    with open(file_name) as f:
        scores = np.array([line.strip() for line in f], dtype='float')
    method = file_name
    # columns: 1-based segment id, score, ordinal rank of the score
    segments = np.vstack((np.arange(1, scores.shape[0] + 1),
                          scores,
                          rankdata(scores, method='ordinal'))).T
    return method, segments
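A minimal round-trip with a throwaway file (the name is illustrative):

with open("scores.hter", "w") as f:
    f.write("0.10\n0.50\n0.30\n")
method, segments = read_hter("scores.hter")
print(segments)  # rows of (id, score, rank): (1, 0.1, 1), (2, 0.5, 3), (3, 0.3, 2)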
Example #13
import numpy
from scipy.stats import rankdata


def calc_global_IDR(localIDR):
    local_idr_order = localIDR.argsort()
    ordered_local_idr = localIDR[local_idr_order]
    ordered_local_idr_ranks = rankdata(ordered_local_idr, method='max')
    IDR = []
    for rank in ordered_local_idr_ranks:
        IDR.append(ordered_local_idr[:rank].mean())
    IDR = numpy.array(IDR)[local_idr_order.argsort()]
    return IDR
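On a toy input, the global IDR is the running mean of the sorted local IDRs, mapped back to the original order:

local = numpy.array([0.30, 0.05, 0.20])
print(calc_global_IDR(local))  # [0.18333... 0.05 0.125]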
Example #14
import numpy as np
from scipy import stats


def ROC_AUC(fg_vals, bg_vals):
    if len(fg_vals) == 0 or len(bg_vals) == 0:
        return None

    fg_len = len(fg_vals)
    total_len = len(fg_vals) + len(bg_vals)

    if not isinstance(fg_vals, np.ndarray):
        fg_vals = np.array(fg_vals)
    if not isinstance(bg_vals, np.ndarray):
        bg_vals = np.array(bg_vals)

    fg_rank = stats.rankdata(fg_vals)
    total_rank = stats.rankdata(np.hstack((fg_vals, bg_vals)))

    return (sum(total_rank[:fg_len]) - sum(fg_rank)) / (fg_len * (total_len - fg_len))
Example #15
def MNCP(fg_vals, bg_vals):
    from numpy import array, hstack, mean, ndarray
    from scipy import stats

    fg_len = len(fg_vals)
    total_len = len(fg_vals) + len(bg_vals)

    if not isinstance(fg_vals, ndarray):
        fg_vals = array(fg_vals)
    if not isinstance(bg_vals, ndarray):
        bg_vals = array(bg_vals)

    fg_rank = stats.rankdata(fg_vals)
    total_rank = stats.rankdata(hstack((fg_vals, bg_vals)))

    slopes = []
    for i in range(len(fg_vals)):
        slope = ((fg_len - fg_rank[i] + 1) / fg_len) / ((total_len - total_rank[i] + 1) / total_len)
        slopes.append(slope)
    return mean(slopes)
Example #16
def ROC_AUC(fg_vals, bg_vals):
    from numpy import array, hstack, ndarray
    from scipy import stats

    if len(fg_vals) == 0 or len(bg_vals) == 0:
        return None

    fg_len = len(fg_vals)
    total_len = len(fg_vals) + len(bg_vals)

    if not isinstance(fg_vals, ndarray):
        fg_vals = array(fg_vals)
    if not isinstance(bg_vals, ndarray):
        bg_vals = array(bg_vals)

    fg_rank = stats.rankdata(fg_vals)
    total_rank = stats.rankdata(hstack((fg_vals, bg_vals)))

    return (sum(total_rank[:fg_len]) - sum(fg_rank)) / (fg_len * (total_len - fg_len))
Example #17
import numpy as np
from scipy import stats


def mncp(fg_vals, bg_vals):
    """
    Computes the Mean Normalized Conditional Probability (MNCP).

    MNCP is described in Clarke & Granek, Bioinformatics, 2003.

    Parameters
    ----------
    fg_vals : array_like
        The list of values for the positive set.

    bg_vals : array_like
        The list of values for the negative set.

    Returns
    -------
    score : float
        MNCP score
    """
    fg_len = len(fg_vals)
    total_len = len(fg_vals) + len(bg_vals)

    if not isinstance(fg_vals, np.ndarray):
        fg_vals = np.array(fg_vals)
    if not isinstance(bg_vals, np.ndarray):
        bg_vals = np.array(bg_vals)

    fg_rank = stats.rankdata(fg_vals)
    total_rank = stats.rankdata(np.hstack((fg_vals, bg_vals)))

    slopes = []
    for i in range(len(fg_vals)):
        slope = ((fg_len - fg_rank[i] + 1) / fg_len) / (
            (total_len - total_rank[i] + 1) / total_len
        )
        slopes.append(slope)

    return np.mean(slopes)
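Hypothetical usage; an MNCP near 1 is what random scoring gives, and larger values mean the positive set ranks above the negatives:

pos = [4.2, 3.9, 3.1, 2.8]       # made-up positive-set scores
neg = [2.5, 1.0, 2.9, 0.7, 1.3]  # made-up negative-set scores
print(mncp(pos, neg))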
Example #18
    def relative_ranks(self, y_hat, y):
        """
        Compute mean rank of correct answer in output sorted by probability
        :param y_hat:
        :param y:
        :return:
        """
        y_hat = y_hat.squeeze().cpu()
        y = y.squeeze().cpu()

        choice_set_lengths = np.array((~torch.isinf(y_hat)).sum(1))
        ranks = stats.rankdata(-y_hat.detach().numpy(), method='average', axis=1)[np.arange(len(y)), y] - 1

        return ranks / (choice_set_lengths - 1)
Example #19
import jsonlines
import numpy as np
from collections import defaultdict
from scipy import stats


def main(args):
    print(f"Arguments: {args}")

    attribute_values_to_rank = defaultdict(list)
    attribute_percentiles = defaultdict(list)

    with jsonlines.open(args["source_json"], mode='r') as reader:
        for json_obj in reader:

            for attr in args["attributes_to_bucket"]:
                if attr in json_obj:
                    attribute_values_to_rank[attr].append(json_obj[attr])

    for k, v in attribute_values_to_rank.items():
        attr_value_vec = np.array(v)
        # Cumulative distribution: the fraction of values that are lower than or equal to this one.
        attr_perc_rank = stats.rankdata(attr_value_vec,
                                        "max") / len(attr_value_vec)
        attribute_percentiles[k].extend(attr_perc_rank.tolist())

    attribute_keys_list = attribute_percentiles.keys()
    attribute_values_list = attribute_percentiles.values()
    attribute_percentiles_combined = []
    for values in zip(*attribute_values_list):
        percentiles_per_attr = {}
        for i, attr in enumerate(attribute_keys_list):
            percentiles_per_attr[f"{attr}_percentile"] = values[i]
        attribute_percentiles_combined.append(percentiles_per_attr)

    with jsonlines.open(args["source_json"], mode='r') as reader:
        with jsonlines.open(args["target_json"], mode='w') as writer:
            for json_obj, percentiles in zip(reader,
                                             attribute_percentiles_combined):
                out_json_obj = {**json_obj, **percentiles}
                print(out_json_obj)
                writer.write(out_json_obj)
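An illustrative invocation (paths and attribute names are hypothetical); each output line carries the original fields plus one "<attr>_percentile" field per bucketed attribute:

main({
    "source_json": "docs.jsonl",
    "target_json": "docs_with_percentiles.jsonl",
    "attributes_to_bucket": ["toxicity", "length"],
})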
Example #20
def write_big_submatrix(matrix,
                        chrom,
                        pos1,
                        pos2,
                        sections,
                        section_pos,
                        out,
                        matrix_size,
                        waffle_size,
                        waffle_radii,
                        square_size,
                        metric='loop'):
    # convert to numpy array for faster querying (faster than list of lists)
    num_matrix = np.asarray(
        [[matrix.get((i, j), 0) for j in range(matrix_size)]
         for i in range(matrix_size)])
    # a second numpy array of strings, so each number is formatted only once
    str_matrix = np.asarray([[
        '{:.3f}'.format(matrix.get((i, j), 0)).rstrip('0').rstrip('.')
        for j in range(matrix_size)
    ] for i in range(matrix_size)])

    # iterate over each cell inside the inner matrix
    # extract a waffle around each cell and do stats
    tpos1 = pos1 + section_pos[chrom][0]
    tpos2 = pos2 + section_pos[chrom][0]

    if metric == 'loop':
        fast_matrix_to_decay = fast_matrix_to_decay_loop
    else:
        fast_matrix_to_decay = fast_matrix_to_decay_noloop

    between_indexes = [
        j + i * waffle_size for i in range(waffle_radii, waffle_size)
        for j in range(waffle_radii + 1)
    ]
    outside_indexes = [
        j + i * waffle_size for i in range(waffle_radii, waffle_size)
        for j in range(waffle_radii + 1, waffle_size)
    ]
    outside_indexes += [
        j + i * waffle_size for i in range(waffle_radii)
        for j in range(waffle_size)
    ]
    dist_from_center = rankdata(pre_matrix_to_decay(waffle_size))
    for i in range(waffle_radii, square_size + waffle_radii):
        # we do not want anything outside chromosome
        if pos1 + i > sections[chrom]:
            break
        for j in range(waffle_radii, square_size + waffle_radii):
            # we do not want anything crossing over the diagonal
            if pos1 + i > pos2 + j:  # - waffle_size:
                continue
            # we do not want anything outside chromosome
            if pos2 + j > sections[chrom]:
                break
            ## get waffle (diagonal of the genomic matrix is located in the down left,
            ## i=waffle_size and j=0)
            waffle = num_matrix[i - waffle_radii:i + waffle_radii + 1,
                                j - waffle_radii:j + waffle_radii + 1]
            # if it's all zeroes we do not want it
            if not waffle.sum():
                continue
            # if it's smaller than expected we do not want it
            if len(waffle) < waffle_size:
                continue
            ## stats
            # spearman
            y = fast_matrix_to_decay(waffle, between_indexes, outside_indexes)
            # x, y = matrix_to_decay(waffle, waffle_size, metric=metric)
            rho, pval = pearsonr(dist_from_center,
                                 rankdata(y))  # equivalent of spearmanr
            # change this: x can be already known
            # if nan, the matrix is too sparse and we do not want it
            if isnan(rho):
                continue
            # peak intensity
            peak = get_center(waffle, len(waffle), span=1)
            ## store waffle and stats
            waffle = str_matrix[i - waffle_radii:i + waffle_radii + 1,
                                j - waffle_radii:j + waffle_radii + 1]
            out.write('{}\t{}\t{:.3g}\t{:.3g}\t{:.3f}\t{}\n'.format(
                tpos1 + i, tpos2 + j, rho, pval, peak,
                ','.join(v for l in waffle for v in l)))
Example #21
import warnings
from collections import namedtuple

import numpy as np
from numpy import asarray, compress, sqrt
from scipy import stats
from scipy.stats import distributions, find_repeats

# assumed: a result type mirroring scipy's
WilcoxonResult = namedtuple('WilcoxonResult', ('statistic', 'pvalue'))


def wilcoxon_one_sided(x, y=None, zero_method="wilcox", correction=False):
    """
    Calculate the one-tailed Wilcoxon signed-rank test.

    The Wilcoxon signed-rank test tests the null hypothesis that two
    related paired samples come from the same distribution. In particular,
    it tests whether the distribution of the differences x - y is symmetric
    about zero. It is a non-parametric version of the paired T-test.

    Parameters
    ----------
    x : array_like
        The first set of measurements.
    y : array_like, optional
        The second set of measurements.  If `y` is not given, then the `x`
        array is considered to be the differences between the two sets of
        measurements.
    zero_method : string, {"pratt", "wilcox", "zsplit"}, optional
        "pratt":
            Pratt treatment: includes zero-differences in the ranking process
            (more conservative)
        "wilcox":
            Wilcox treatment: discards all zero-differences
        "zsplit":
            Zero rank split: just like Pratt, but splitting the zero rank
            between positive and negative ones
    correction : bool, optional
        If True, apply continuity correction by adjusting the Wilcoxon rank
        statistic by 0.5 towards the mean value when computing the
        z-statistic.  Default is False.

    Returns
    -------
    statistic : float
        The sum of the ranks of the differences above or below zero, whichever
        is smaller.
    pvalue : float
        The one-sided p-value for the test (null hypothesis: x <= y)

    Notes
    -----
    Because the normal approximation is used for the calculations, the
    samples used should be large.  A typical rule is to require that
    n > 20.

    References
    ----------
    .. [1] http://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test

    """

    if zero_method not in ["wilcox", "pratt", "zsplit"]:
        raise ValueError("Zero method should be either 'wilcox' "
                         "or 'pratt' or 'zsplit'")

    if y is None:
        d = asarray(x)
    else:
        x, y = map(asarray, (x, y))
        if len(x) != len(y):
            raise ValueError('Unequal N in wilcoxon.  Aborting.')
        d = x - y

    if zero_method == "wilcox":
        # Keep all non-zero differences
        d = compress(np.not_equal(d, 0), d, axis=-1)

    count = len(d)
    if count < 10:
        warnings.warn(
            "Warning: sample size too small for normal approximation.")

    r = stats.rankdata(abs(d))
    r_plus = np.sum((d > 0) * r, axis=0)
    r_minus = np.sum((d < 0) * r, axis=0)

    if zero_method == "zsplit":
        r_zero = np.sum((d == 0) * r, axis=0)
        r_plus += r_zero / 2.
        r_minus += r_zero / 2.

    T = r_minus
    mn = count * (count + 1.) * 0.25
    se = count * (count + 1.) * (2. * count + 1.)

    if zero_method == "pratt":
        r = r[d != 0]

    replist, repnum = find_repeats(r)
    if repnum.size != 0:
        # Correction for repeated elements.
        se -= 0.5 * (repnum * (repnum * repnum - 1)).sum()

    se = sqrt(se / 24)
    correction = 0.5 * int(bool(correction)) * np.sign(T - mn)
    z = (T - mn - correction) / se
    prob = distributions.norm.cdf(z)

    return WilcoxonResult(T, prob)
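A hypothetical paired comparison; a small p-value supports the one-sided alternative that the first sample is larger:

post = [10.9, 10.1, 11.4, 10.9, 10.2, 11.3, 10.0, 10.8, 10.1, 11.5, 10.6, 10.5]
pre = [10.2, 9.8, 11.1, 10.5, 9.9, 10.8, 10.1, 10.4, 9.7, 10.9, 10.3, 10.0]
stat, p = wilcoxon_one_sided(post, pre)
print(stat, p)  # small p rejects H0: post <= pre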
Example #22
import numpy as np
from scipy import stats


def roc_auc_xlim(x_bla, y_bla, xlim=0.1):
    """
    Computes the ROC Area Under Curve until a certain FPR value.

    Parameters
    ----------
    x_bla : array_like
        list of values for the positive set

    y_bla : array_like
        list of values for the negative set

    xlim : float, optional
        FPR value
    
    Returns
    -------
    score : float
        ROC AUC score
    """
    x = x_bla[:]
    y = y_bla[:]

    x.sort()
    y.sort()

    u = {}
    for i in x + y:
        u[i] = 1

    vals = sorted(u.keys())
    
    len_x = float(len(x))
    len_y = float(len(y))
    
    new_x = []
    new_y = []
    
    x_p = 0
    y_p = 0
    for val in vals[::-1]:
        while len(x) > 0 and x[-1] >= val:
            x.pop()
            x_p += 1
        while len(y) > 0 and y[-1] >= val:
            y.pop()
            y_p += 1
        new_y.append((len_x - x_p) / len_x)
        new_x.append((len_y - y_p) / len_y)
    
    #print new_x
    #print new_y
    new_x = 1 - np.array(new_x)
    new_y = 1 - np.array(new_y)
    #plot(new_x, new_y)
    #show()

    x = new_x
    y = new_y

    if len(x) != len(y):
        raise ValueError("Unequal!")

    if not xlim:
        xlim = 1.0

    auc = 0.0
    bla = list(zip(stats.rankdata(x), range(len(x))))
    # sort by rank, breaking ties by descending original index
    bla.sort(key=lambda t: (t[0], -t[1]))
    
    prev_x = x[bla[0][1]]
    prev_y = y[bla[0][1]]
    index = 1

    while index < len(bla) and x[bla[index][1]] <= xlim:

        _, i = bla[index]
        
        auc += y[i] * (x[i] - prev_x) - ((x[i] - prev_x) * (y[i] - prev_y) / 2.0)
        prev_x = x[i]
        prev_y = y[i]
        index += 1
    
    if index < len(bla):
        (rank, i) = bla[index]
        auc += prev_y * (xlim - prev_x) + ((y[i] - prev_y)/(x[i] - prev_x) * (xlim -prev_x) * (xlim - prev_x)/2)
 
    return auc
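A hypothetical partial-AUC call; only the part of the ROC curve with FPR <= xlim contributes:

pos = [3.2, 2.8, 2.9, 3.5, 1.9, 2.2]  # made-up positive scores
neg = [1.1, 2.0, 2.7, 0.4, 1.8, 0.9]  # made-up negative scores
print(roc_auc_xlim(pos, neg, xlim=0.1))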
Example #23
from numpy import array
from scipy import stats


def ROC_AUC_xlim(x_bla, y_bla, xlim=None):  # expects plain Python lists of scores
    x = x_bla[:]
    y = y_bla[:]

    x.sort()
    y.sort()

    u = {}
    for i in x + y:
        u[i] = 1

    vals = sorted(u.keys())

    len_x = float(len(x))
    len_y = float(len(y))

    new_x = []
    new_y = []

    x_p = 0
    y_p = 0
    for val in vals[::-1]:
        while len(x) > 0 and x[-1] >= val:
            x.pop()
            x_p += 1
        while len(y) > 0 and y[-1] >= val:
            y.pop()
            y_p += 1
        new_y.append((len_x - x_p) / len_x)
        new_x.append((len_y - y_p) / len_y)

    #print new_x
    #print new_y
    new_x = 1 - array(new_x)
    new_y = 1 - array(new_y)
    #plot(new_x, new_y)
    #show()

    x = new_x
    y = new_y

    if len(x) != len(y):
        raise ValueError("Unequal!")

    if not xlim:
        xlim = 1.0

    auc = 0.0
    bla = list(zip(stats.rankdata(x), range(len(x))))

    # Python 3 port of the old cmp-based sort: ascending rank,
    # ties broken by descending original index
    bla.sort(key=lambda t: (t[0], -t[1]))

    prev_x = x[bla[0][1]]
    prev_y = y[bla[0][1]]
    index = 1

    while index < len(bla) and x[bla[index][1]] <= xlim:

        (rank, i) = bla[index]

        auc += y[i] * (x[i] - prev_x) - ((x[i] - prev_x) *
                                         (y[i] - prev_y) / 2.0)
        prev_x = x[i]
        prev_y = y[i]
        index += 1

    if index < len(bla):
        (rank, i) = bla[index]
        auc += prev_y * (xlim - prev_x) + ((y[i] - prev_y) / (x[i] - prev_x) *
                                           (xlim - prev_x) *
                                           (xlim - prev_x) / 2)

    return auc