Example #1
def get_error_table_using_percentile_positives_new(err_df, target_scores, num_null):
    """ transfer error statistics in err_df for many target scores and given
    number of estimated null hypothesises 'num_null' """

    num_total = len(target_scores)
    num_alternative = num_total - num_null
    target_scores = np.sort(to_one_dim_array(target_scores))  # ascending

    # optimized helper: number of scores at or above each cutoff
    num_positives = count_num_positives(target_scores.astype(np.float64))

    num_negatives = num_total - num_positives

    # the final coercion is needed because, depending on the magnitude of
    # num_total, numpy may switch to 64 bit floats
    pp = (num_positives.astype(np.float32) / num_total).astype(np.float32)

    # find best matching row in err_df for each percentile_positive in pp:
    imax = find_nearest_matches(err_df.percentile_positive.values, pp)

    qvalues = err_df.qvalue.iloc[imax].values
    svalues = err_df.svalue.iloc[imax].values
    pvalues = err_df.pvalue.iloc[imax].values

    fdr = err_df.FDR.iloc[imax].values
    fdr[fdr < 0.0] = 0.0
    fdr[fdr > 1.0] = 1.0
    fdr[num_positives == 0] = 0.0

    fp = np.round(fdr * num_positives)
    tp = num_positives - fp
    tn = num_null - fp
    fn = num_negatives - tn

    if num_alternative == 0:
        sens = np.zeros_like(tp)  # avoid dividing by zero
    else:
        sens = tp / num_alternative
    sens[sens < 0.0] = 0.0
    sens[sens > 1.0] = 1.0

    df_error = pd.DataFrame(
        dict(qvalue=qvalues,
             svalue=svalues,
             pvalue=pvalues,
             TP=tp,
             FP=fp,
             TN=tn,
             FN=fn,
             FDR=fdr,
             sens=sens,
             cutoff=target_scores),
        columns="qvalue svalue pvalue TP FP TN FN FDR sens cutoff".split(),
    )
    return df_error
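
The helpers called above (to_one_dim_array, count_num_positives, find_nearest_matches) are not shown in these examples. Below is a minimal sketch of what they might look like, inferred purely from how they are called here; the original implementations, which the comments describe as optimized, may well differ:

import numpy as np

def to_one_dim_array(values, dtype=np.float64):
    # flatten arbitrary input (list, nested sequence, ndarray) to a 1-d array
    return np.asarray(values, dtype=dtype).ravel()

def count_num_positives(sorted_values):
    # assumes the input is sorted so that "more extreme" values come last
    # (scores ascending, p-values descending); for each entry, count how
    # many entries, itself and its ties included, are at least as extreme
    n = len(sorted_values)
    counts = np.empty(n, dtype=np.int64)
    i = n - 1
    while i >= 0:
        j = i
        while j > 0 and sorted_values[j - 1] == sorted_values[i]:
            j -= 1  # walk left over ties so equal values share one count
        counts[j:i + 1] = n - j
        i = j - 1
    return counts

def find_nearest_matches(reference, values):
    # brute-force nearest match: for each entry of values, the index of the
    # closest entry in reference (no sort order assumed); memory is O(n * m),
    # which is fine for moderate table sizes
    reference = np.asarray(reference, dtype=np.float64)
    values = np.asarray(values, dtype=np.float64)
    diffs = np.abs(reference[:, None] - values[None, :])
    return diffs.argmin(axis=0)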
Example #2
def get_error_table_using_percentile_positives_new(err_df, target_scores,
                                                   num_null):
    """ transfer error statistics in err_df for many target scores and given
    number of estimated null hypothesises 'num_null' """

    num_total = len(target_scores)
    num_alternative = num_total - num_null
    target_scores = np.sort(to_one_dim_array(target_scores))  # ascending

    # optimized helper: number of scores at or above each cutoff
    num_positives = count_num_positives(target_scores.astype(np.float64))

    num_negatives = num_total - num_positives

    # the final coercion is needed because, depending on the magnitude of
    # num_total, numpy may switch to 64 bit floats
    pp = (num_positives.astype(np.float32) / num_total).astype(np.float32)

    # find best matching row in err_df for each percentile_positive in pp:
    imax = find_nearest_matches(err_df.percentile_positive.values, pp)

    qvalues = err_df.qvalue.iloc[imax].values
    svalues = err_df.svalue.iloc[imax].values
    fdr = err_df.FDR.iloc[imax].values
    fdr[fdr < 0.0] = 0.0
    fdr[fdr > 1.0] = 1.0
    fdr[num_positives == 0] = 0.0

    fp = np.round(fdr * num_positives)
    tp = num_positives - fp
    tn = num_null - fp
    fn = num_negatives - tn

    if num_alternative == 0:
        sens = np.zeros_like(tp)  # avoid dividing by zero
    else:
        sens = tp / num_alternative
    sens[sens < 0.0] = 0.0
    sens[sens > 1.0] = 1.0

    df_error = pd.DataFrame(
        dict(qvalue=qvalues,
             svalue=svalues,
             TP=tp,
             FP=fp,
             TN=tn,
             FN=fn,
             FDR=fdr,
             sens=sens,
             cutoff=target_scores),
        columns="qvalue svalue TP FP TN FN FDR sens cutoff".split(),
    )
    return df_error
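
A toy run of the lookup step both functions above rely on: percentile-positive values computed from the target scores are matched to the nearest rows of err_df, and the resulting indices drive the qvalue/svalue lookups. The numbers here are made up for illustration:

import numpy as np

reference_pp = np.array([1.0, 0.75, 0.5, 0.25])  # err_df.percentile_positive
pp = np.array([0.9, 0.6, 0.3])                   # computed from target scores
diffs = np.abs(reference_pp[:, None] - pp[None, :])
print(diffs.argmin(axis=0))  # [0 2 3] -> row indices used as 'imax'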
Example #3
def get_error_table_from_pvalues_new(p_values, lambda_=0.4, use_pfdr=False):
    """ estimate error table from p_values with method of storey for estimating fdrs and q-values
    """

    # sort descending:
    p_values = np.sort(to_one_dim_array(p_values))[::-1]

    # estimate the number of true null hypotheses with Storey's method:
    num_null = 1.0 / (1.0 - lambda_) * (p_values >= lambda_).sum()
    num_total = len(p_values)

    # optimized: for each threshold, count how many p-values fall at or
    # below it (this replaces an earlier numpy broadcasting approach that
    # compared a column vector against a row vector and summed the columns):
    num_positives = count_num_positives(p_values)
    num_negatives = num_total - num_positives
    pp = 1.0 * num_positives / num_total
    tp = num_positives - num_null * p_values
    fp = num_null * p_values
    tn = num_null * (1.0 - p_values)
    fn = num_negatives - num_null * (1.0 - p_values)

    fdr = fp / num_positives

    # storey published pFDR as an improvement over FDR,
    # see http://www.genomine.org/papers/directfdr.pdf:

    if use_pfdr:
        fac = 1.0 - (1.0 - p_values)**num_total
        fdr /= fac
        # taking the limit p -> 0 yields the value assigned here:
        fdr[p_values == 0] = 1.0 / num_total

    # clip values to the range 0..1
    fdr[fdr < 0.0] = 0.0
    fdr[fdr > 1.0] = 1.0

    # estimate false non-discovery rate
    fnr = fn / num_negatives

    # the analogous pFDR-style correction applies to the FNR:

    if use_pfdr:
        fac = 1.0 - p_values**num_total
        fnr /= fac
        # taking the limit p -> 1 yields the value assigned here:
        fnr[p_values == 1] = 1.0 / num_total

    # clip values to the range 0..1
    fnr[fnr < 0.0] = 0.0
    fnr[fnr > 1.0] = 1.0

    sens = tp / (num_total - num_null)
    # clip values to the range 0..1
    sens[sens < 0.0] = 0.0
    sens[sens > 1.0] = 1.0

    if num_null:
        fpr = fp / num_null
    else:
        # no nulls estimated, so the FPR is identically zero
        fpr = 0.0 * fp

    # assemble statistics as data frame
    df = pd.DataFrame(
        dict(
            pvalue=p_values.flatten().astype(np.float32),
            percentile_positive=pp.flatten().astype(np.float32),
            positive=num_positives.flatten().astype(np.float32),
            negative=num_negatives.flatten().astype(np.float32),
            TP=tp.flatten().astype(np.float32),
            FP=fp.flatten().astype(np.float32),
            TN=tn.flatten().astype(np.float32),
            FN=fn.flatten().astype(np.float32),
            FDR=fdr.flatten().astype(np.float32),
            FNR=fnr.flatten().astype(np.float32),
            sens=sens.flatten().astype(np.float32),
            FPR=fpr.flatten().astype(np.float32),
        ),
        columns="""pvalue percentile_positive positive negative TP FP
                        TN FN FDR FNR sens FPR""".split(),
    )

    # numpy has no cummin/cummax functions, so we compute the running
    # extrema on the dataframe instead:
    df["qvalue"] = df.FDR.cummin()
    df["svalue"] = df.sens[::-1].cummax()[::-1]

    return ErrorStatistics(df, num_null, num_total)
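
A quick numerical look at the pFDR correction used above: fac estimates Pr(R > 0), the probability of at least one rejection at threshold t, which for num_total independent null p-values is 1 - (1 - t)^num_total. The sketch below (illustrative values only) shows the division only matters for very small thresholds:

import numpy as np

m = 1000                          # plays the role of num_total
t = np.array([1e-5, 1e-3, 0.05])  # candidate p-value thresholds
fac = 1.0 - (1.0 - t) ** m
print(fac)  # approx [0.00995, 0.632, 1.0]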
Example #4
def get_error_table_from_pvalues_new(p_values, lambda_=0.4):
    """ estimate error table from p_values with method of storey for estimating fdrs and q-values
    """

    # sort descending:
    p_values = np.sort(to_one_dim_array(p_values))[::-1]

    # estimate the number of true null hypotheses with Storey's method:
    num_null = 1.0 / (1.0 - lambda_) * (p_values >= lambda_).sum()
    num = len(p_values)

    # optimized: for each threshold, count how many p-values fall at or
    # below it (this replaces an earlier numpy broadcasting approach that
    # compared a column vector against a row vector and summed the columns):
    num_positives = count_num_positives(p_values)
    num_negatives = num - num_positives
    pp = 1.0 * num_positives / num
    tp = num_positives - num_null * p_values
    fp = num_null * p_values
    tn = num_null * (1.0 - p_values)
    fn = num_negatives - num_null * (1.0 - p_values)

    fdr = fp / num_positives
    # clip values to the range 0..1
    fdr[fdr < 0.0] = 0.0
    fdr[fdr > 1.0] = 1.0

    sens = tp / (num - num_null)
    # clip values to the range 0..1
    sens[sens < 0.0] = 0.0
    sens[sens > 1.0] = 1.0

    if num_null:
        fpr = fp / num_null
    else:
        # no nulls estimated, so the FPR is identically zero
        fpr = 0.0 * fp

    # assemble statistics as data frame
    error_stat = pd.DataFrame(
        dict(pvalue=p_values.flatten(),
             percentile_positive=pp.flatten(),
             positive=num_positives.flatten(),
             negative=num_negatives.flatten(),
             TP=tp.flatten(),
             FP=fp.flatten(),
             TN=tn.flatten(),
             FN=fn.flatten(),
             FDR=fdr.flatten(),
             sens=sens.flatten(),
             FPR=fpr.flatten()),
        columns="""pvalue percentile_positive positive negative TP FP
                        TN FN FDR sens FPR""".split()
    )

    # numpy has no cummin/cummax functions, so we compute the running
    # extrema on the dataframe instead:
    error_stat["qvalue"] = error_stat.FDR.cummin()
    error_stat["svalue"] = error_stat.sens[::-1].cummax()[::-1]

    return error_stat, num_null, num
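
With the return value of this variant, the percentile-positive transfer from Examples #1 and #2 can be chained directly. A hypothetical end-to-end sketch on synthetic data (all inputs are invented for illustration, and the helper sketches after Example #1 are assumed):

import numpy as np

rng = np.random.default_rng(42)
# mostly uniform null p-values plus a block of small alternative p-values
p_values = np.concatenate([rng.uniform(0.0, 1.0, 800),
                           rng.beta(1.0, 50.0, 200)])
target_scores = rng.normal(0.0, 1.0, 1000)

err_df, num_null, num_total = get_error_table_from_pvalues_new(p_values)
stats = get_error_table_using_percentile_positives_new(err_df, target_scores,
                                                       num_null)
print(stats[["cutoff", "qvalue", "svalue", "sens"]].tail())

The final monotonization step is also easy to see on a toy series: a running minimum over FDR yields monotone q-values, and a reversed running maximum over sensitivity yields the s-values:

import pandas as pd

fdr = pd.Series([0.9, 0.5, 0.6, 0.2, 0.3])
sens = pd.Series([1.0, 0.9, 0.95, 0.5, 0.6])
print(fdr.cummin().tolist())               # [0.9, 0.5, 0.5, 0.2, 0.2]
print(sens[::-1].cummax()[::-1].tolist())  # [1.0, 0.95, 0.95, 0.6, 0.6]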
Example #5
def get_error_table_from_pvalues_new(p_values, lambda_=0.4, use_pfdr=False):
    """ estimate error table from p_values with method of storey for estimating fdrs and q-values
    """

    # sort descending:
    p_values = np.sort(to_one_dim_array(p_values))[::-1]

    # estimate the number of true null hypotheses with Storey's method:
    num_null = 1.0 / (1.0 - lambda_) * (p_values >= lambda_).sum()
    num_total = len(p_values)

    # optimized: for each threshold, count how many p-values fall at or
    # below it (this replaces an earlier numpy broadcasting approach that
    # compared a column vector against a row vector and summed the columns):
    num_positives = count_num_positives(p_values)
    num_negatives = num_total - num_positives
    pp = 1.0 * num_positives / num_total
    tp = num_positives - num_null * p_values
    fp = num_null * p_values
    tn = num_null * (1.0 - p_values)
    fn = num_negatives - num_null * (1.0 - p_values)

    fdr = fp / num_positives

    # storey published pFDR as an improvement over FDR,
    # see http://www.genomine.org/papers/directfdr.pdf:

    if use_pfdr:
        fac = 1.0 - (1.0 - p_values) ** num_total
        fdr /= fac
        # taking the limit p -> 0 yields the value assigned here:
        fdr[p_values == 0] = 1.0 / num_total

    # clip values to the range 0..1
    fdr[fdr < 0.0] = 0.0
    fdr[fdr > 1.0] = 1.0

    sens = tp / (num_total - num_null)
    # clip values to the range 0..1
    sens[sens < 0.0] = 0.0
    sens[sens > 1.0] = 1.0

    if num_null:
        fpr = fp / num_null
    else:
        # no nulls estimated, so the FPR is identically zero
        fpr = 0.0 * fp

    # assemble statistics as data frame
    df = pd.DataFrame(
        dict(pvalue=p_values.flatten().astype(np.float32),
             percentile_positive=pp.flatten().astype(np.float32),
             positive=num_positives.flatten().astype(np.float32),
             negative=num_negatives.flatten().astype(np.float32),
             TP=tp.flatten().astype(np.float32),
             FP=fp.flatten().astype(np.float32),
             TN=tn.flatten().astype(np.float32),
             FN=fn.flatten().astype(np.float32),
             FDR=fdr.flatten().astype(np.float32),
             sens=sens.flatten().astype(np.float32),
             FPR=fpr.flatten().astype(np.float32),
             ),
        columns="""pvalue percentile_positive positive negative TP FP
                        TN FN FDR sens FPR""".split(),
    )

    # numpy has no cummin/cummax functions, so we compute the running
    # extrema on the dataframe instead:
    df["qvalue"] = df.FDR.cummin()
    df["svalue"] = df.sens[::-1].cummax()[::-1]

    return ErrorStatistics(df, num_null, num_total)
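
As a sanity check on the num_null estimate used by all the p-value variants above: with 800 uniform null p-values and 200 small alternative p-values (synthetic, purely illustrative), the estimator should recover roughly 800 nulls, since uniform nulls land above lambda_ with probability 1 - lambda_ while alternatives rarely do:

import numpy as np

rng = np.random.default_rng(0)
p_values = np.concatenate([rng.uniform(0.0, 1.0, 800),  # true nulls
                           rng.beta(1.0, 20.0, 200)])   # alternatives
lambda_ = 0.4
num_null = 1.0 / (1.0 - lambda_) * (p_values >= lambda_).sum()
print(num_null)  # close to 800, up to sampling noise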