import numpy as np
import pandas as pd

# to_one_dim_array, count_num_positives, find_nearest_matches and
# ErrorStatistics are helpers defined elsewhere in this package (the latter
# two in its optimized extension) and are assumed to be in scope here.


def get_error_table_using_percentile_positives_new(err_df, target_scores, num_null):
    """transfer error statistics in err_df to many target scores, given the
    estimated number of null hypotheses 'num_null'"""

    num_total = len(target_scores)
    num_alternative = num_total - num_null
    target_scores = np.sort(to_one_dim_array(target_scores))  # ascending

    # optimized:
    num_positives = count_num_positives(target_scores.astype(np.float64))
    num_negatives = num_total - num_positives

    # the last coercion is needed because, depending on the scale of
    # num_total, numpy switches to 64 bit floats:
    pp = (num_positives.astype(np.float32) / num_total).astype(np.float32)

    # find the best matching row in err_df for each percentile_positive in pp:
    imax = find_nearest_matches(err_df.percentile_positive.values, pp)

    qvalues = err_df.qvalue.iloc[imax].values
    svalues = err_df.svalue.iloc[imax].values
    pvalues = err_df.pvalue.iloc[imax].values

    fdr = err_df.FDR.iloc[imax].values
    fdr[fdr < 0.0] = 0.0
    fdr[fdr > 1.0] = 1.0
    # no positives means no false discoveries:
    fdr[num_positives == 0] = 0.0

    fp = np.round(fdr * num_positives)
    tp = num_positives - fp
    tn = num_null - fp
    fn = num_negatives - tn

    # guard against division by zero if all scores are estimated as null:
    if num_alternative == 0:
        sens = np.zeros_like(tp)
    else:
        sens = tp / num_alternative
    sens[sens < 0.0] = 0.0
    sens[sens > 1.0] = 1.0

    df_error = pd.DataFrame(
        dict(qvalue=qvalues, svalue=svalues, pvalue=pvalues, TP=tp, FP=fp,
             TN=tn, FN=fn, FDR=fdr, sens=sens, cutoff=target_scores),
        columns="qvalue svalue pvalue TP FP TN FN FDR sens cutoff".split(),
    )
    return df_error
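# A minimal pure-numpy sketch of the nearest-match lookup used above, for
# illustration only: the real find_nearest_matches is assumed to come from
# the package's optimized helpers. This sketch assumes `basis` is sorted
# ascending; if it is not, sort it first and map the indices back.
def _find_nearest_matches_sketch(basis, queries):
    basis = np.asarray(basis, dtype=np.float64)
    queries = np.asarray(queries, dtype=np.float64)
    right = np.searchsorted(basis, queries)        # first index with basis >= query
    left = np.clip(right - 1, 0, len(basis) - 1)   # nearest candidate below
    right = np.clip(right, 0, len(basis) - 1)      # nearest candidate above
    choose_right = np.abs(basis[right] - queries) < np.abs(basis[left] - queries)
    return np.where(choose_right, right, left)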
def get_error_table_from_pvalues_new(p_values, lambda_=0.4, use_pfdr=False):
    """estimate error table from p_values with Storey's method for
    estimating fdrs and q-values"""

    # sort descending:
    p_values = np.sort(to_one_dim_array(p_values))[::-1]

    # estimate the number of null hypotheses with Storey's method:
    num_null = 1.0 / (1.0 - lambda_) * (p_values >= lambda_).sum()
    num_total = len(p_values)

    # optimized replacement for numpy broadcasting: comparing a column
    # vector with a row vector yields a matrix of pairwise comparison
    # results, and sum(axis=0) sums up each column:
    num_positives = count_num_positives(p_values)
    num_negatives = num_total - num_positives
    pp = 1.0 * num_positives / num_total

    tp = num_positives - num_null * p_values
    fp = num_null * p_values
    tn = num_null * (1.0 - p_values)
    fn = num_negatives - num_null * (1.0 - p_values)

    fdr = fp / num_positives

    # Storey published pFDR as an improvement over FDR,
    # see http://www.genomine.org/papers/directfdr.pdf:
    if use_pfdr:
        fac = 1.0 - (1.0 - p_values) ** num_total
        fdr /= fac
        # the correction factor is singular at p == 0; taking the limit
        # p -> 0 yields the following factor:
        fdr[p_values == 0] = 1.0 / num_total

    # cut off values to range 0..1:
    fdr[fdr < 0.0] = 0.0
    fdr[fdr > 1.0] = 1.0

    # estimate false non-discovery rate:
    fnr = fn / num_negatives

    # the same pFDR-style correction applied to the non-discovery side:
    if use_pfdr:
        fac = 1.0 - p_values ** num_total
        fnr /= fac
        # the correction factor is singular at p == 1; taking the limit
        # p -> 1 yields the following factor:
        fnr[p_values == 1] = 1.0 / num_total

    # cut off values to range 0..1:
    fnr[fnr < 0.0] = 0.0
    fnr[fnr > 1.0] = 1.0

    sens = tp / (num_total - num_null)

    # cut off values to range 0..1:
    sens[sens < 0.0] = 0.0
    sens[sens > 1.0] = 1.0

    if num_null:
        fpr = fp / num_null
    else:
        fpr = 0.0 * fp

    # assemble statistics as data frame:
    df = pd.DataFrame(
        dict(
            pvalue=p_values.flatten().astype(np.float32),
            percentile_positive=pp.flatten().astype(np.float32),
            positive=num_positives.flatten().astype(np.float32),
            negative=num_negatives.flatten().astype(np.float32),
            TP=tp.flatten().astype(np.float32),
            FP=fp.flatten().astype(np.float32),
            TN=tn.flatten().astype(np.float32),
            FN=fn.flatten().astype(np.float32),
            FDR=fdr.flatten().astype(np.float32),
            FNR=fnr.flatten().astype(np.float32),
            sens=sens.flatten().astype(np.float32),
            FPR=fpr.flatten().astype(np.float32),
        ),
        columns="""pvalue percentile_positive positive negative TP FP
                   TN FN FDR FNR sens FPR""".split(),
    )

    # cummin/cummax are not available in numpy, so we compute them via the
    # data frame here:
    df["qvalue"] = df.FDR.cummin()
    df["svalue"] = df.sens[::-1].cummax()[::-1]

    return ErrorStatistics(df, num_null, num_total)
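# A minimal usage sketch on synthetic data (illustration only): build the
# error table from p-values, then transfer its statistics to target scores.
# This assumes ErrorStatistics is a simple container exposing the fields it
# is constructed with above (df, num_null, num_total).
if __name__ == "__main__":
    rng = np.random.RandomState(42)
    # mixture of null (uniform) and alternative (concentrated near 0) p-values:
    p_values = np.concatenate([rng.uniform(size=800),
                               rng.beta(1.0, 50.0, size=200)])
    stats = get_error_table_from_pvalues_new(p_values, lambda_=0.4)
    print(stats.df[["pvalue", "FDR", "qvalue", "svalue"]].head())

    # map the table onto new target scores; num_null here would normally be
    # the estimated null count among the target scores, a made-up value is
    # used for illustration:
    target_scores = rng.normal(size=100)
    err_table = get_error_table_using_percentile_positives_new(
        stats.df, target_scores, num_null=60)
    print(err_table.head())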