Beispiel #1
0
def calculate_qvalues(res_df, fdr=0.05, qvalue_lambda=None, logger=None):
    """Annotate permutation results with q-values and a nominal p-value threshold.

    ``res_df`` is modified in place and nothing is returned. A ``qval`` column
    is always added; ``pval_nominal_threshold`` is added only when at least one
    phenotype satisfies ``qval <= fdr``.

    Args:
        res_df: DataFrame providing the columns ``pval_perm``, ``pval_beta``,
            ``beta_shape1`` and ``beta_shape2``.
        fdr: q-value cutoff used to call a phenotype significant.
        qvalue_lambda: optional second argument forwarded to ``rfunc.qvalue``;
            when None, ``rfunc.qvalue`` is called with the p-values only.
        logger: object exposing ``write(str)``; a ``SimpleLogger`` is created
            when None.
    """
    if logger is None:
        logger = SimpleLogger()

    logger.write('Computing q-values')
    logger.write(f'  * Number of phenotypes tested: {res_df.shape[0]}')
    r = stats.pearsonr(res_df['pval_perm'], res_df['pval_beta'])[0]
    logger.write(f'  * Correlation between Beta-approximated and empirical p-values: : {r:.4f}')

    # calculate q-values
    if qvalue_lambda is None:
        qval, pi0 = rfunc.qvalue(res_df['pval_beta'])
    else:
        logger.write(f'  * Calculating q-values with lambda = {qvalue_lambda:.3f}')
        qval, pi0 = rfunc.qvalue(res_df['pval_beta'], qvalue_lambda)
    res_df['qval'] = qval
    logger.write(f'  * Proportion of significant phenotypes (1-pi0): {1-pi0:.2f}')
    logger.write(f"  * QTL phenotypes @ FDR {fdr:.2f}: {(res_df['qval'] <= fdr).sum()}")

    # determine global min(p) significance threshold and calculate nominal p-value threshold for each gene
    lb = res_df.loc[res_df['qval'] <= fdr, 'pval_beta'].sort_values()
    ub = res_df.loc[res_df['qval'] > fdr, 'pval_beta'].sort_values()

    if lb.shape[0] > 0:  # significant phenotypes
        # Use positional access: the sorted Series keeps res_df's index labels,
        # so lb[-1] / ub[0] would be label lookups (KeyError or the wrong row
        # on an integer index, deprecated fallback on a label index).
        largest_significant = lb.iloc[-1]
        if ub.shape[0] > 0:
            # threshold sits midway between the last significant and the
            # first non-significant beta-approximated p-value
            smallest_nonsignificant = ub.iloc[0]
            pthreshold = (largest_significant + smallest_nonsignificant) / 2
        else:
            # every phenotype is significant
            pthreshold = largest_significant
        logger.write(f'  * min p-value threshold @ FDR {fdr}: {pthreshold:.6g}')
        res_df['pval_nominal_threshold'] = stats.beta.ppf(pthreshold, res_df['beta_shape1'], res_df['beta_shape2'])
Beispiel #2
0
def calculate_qvalues(res_df, fdr=0.05, qvalue_lambda=None, logger=None):
    """Annotate permutation results with q-values and a nominal p-value threshold.

    ``res_df`` is modified in place and nothing is returned. A ``qval`` column
    is always added; ``pval_nominal_threshold`` is added only when at least one
    phenotype satisfies ``qval <= fdr``.

    Args:
        res_df: DataFrame providing the columns ``pval_perm``, ``pval_beta``,
            ``beta_shape1`` and ``beta_shape2``.
        fdr: q-value cutoff used to call a phenotype significant.
        qvalue_lambda: optional second argument forwarded to ``rfunc.qvalue``;
            when None, ``rfunc.qvalue`` is called with the p-values only.
        logger: object exposing ``write(str)``; a ``SimpleLogger`` is created
            when None.
    """
    if logger is None:
        logger = SimpleLogger()

    logger.write('Computing q-values')
    logger.write('  * Number of phenotypes tested: {}'.format(res_df.shape[0]))
    logger.write(
        '  * Correlation between Beta-approximated and empirical p-values: : {:.4f}'
        .format(stats.pearsonr(res_df['pval_perm'], res_df['pval_beta'])[0]))

    # calculate q-values
    if qvalue_lambda is None:
        qval, pi0 = rfunc.qvalue(res_df['pval_beta'])
    else:
        logger.write('  * Calculating q-values with lambda = {:.3f}'.format(
            qvalue_lambda))
        qval, pi0 = rfunc.qvalue(res_df['pval_beta'], qvalue_lambda)
    res_df['qval'] = qval
    logger.write(
        '  * Proportion of significant phenotypes (1-pi0): {:.2f}'.format(
            1 - pi0))
    logger.write('  * QTL phenotypes @ FDR {:.2f}: {}'.format(
        fdr, np.sum(res_df['qval'] <= fdr)))

    # determine global min(p) significance threshold and calculate nominal p-value threshold for each gene
    lb = res_df.loc[res_df['qval'] <= fdr, 'pval_beta'].sort_values()
    ub = res_df.loc[res_df['qval'] > fdr, 'pval_beta'].sort_values()

    if lb.empty:
        # No significant phenotypes: there is no threshold to derive, so the
        # pval_nominal_threshold column is left out instead of failing on an
        # empty selection.
        return

    # Positional access (.iloc): the sorted Series keeps res_df's index labels,
    # so plain [0] / [-1] would be label lookups rather than first/last.
    largest_significant = lb.iloc[-1]
    if ub.empty:
        # every phenotype is significant
        pthreshold = largest_significant
    else:
        # midpoint between the last significant and the first
        # non-significant beta-approximated p-value
        pthreshold = (largest_significant + ub.iloc[0]) / 2
    logger.write('  * min p-value threshold @ FDR {}: {:.6g}'.format(
        fdr, pthreshold))
    res_df['pval_nominal_threshold'] = stats.beta.ppf(pthreshold,
                                                      res_df['beta_shape1'],
                                                      res_df['beta_shape2'])
Beispiel #3
0
def calculate_qvalues(res_df, fdr=0.05, qvalue_lambda=None, logger=None):
    """Add q-values and a nominal p-value threshold to permutation results.

    Works in place on ``res_df`` and returns nothing. The ``qval`` column is
    always written; ``pval_nominal_threshold`` is written only if some
    phenotype has ``qval <= fdr``. Rows whose ``pval_perm`` or ``pval_beta``
    is not finite are left out of the logged correlation only.
    """
    if logger is None:
        logger = SimpleLogger()

    logger.write('Computing q-values')
    logger.write('  * Number of phenotypes tested: {}'.format(res_df.shape[0]))

    # Pearson correlation restricted to rows where both p-values are finite
    empirical = np.asarray(res_df['pval_perm'], dtype=float)
    approximated = np.asarray(res_df['pval_beta'], dtype=float)
    usable = np.isfinite(empirical) & np.isfinite(approximated)
    correlation = stats.pearsonr(empirical[usable], approximated[usable])[0]
    logger.write(
        '  * Correlation between Beta-approximated and empirical p-values: : {:.4f}'
        .format(correlation))

    # q-values, optionally with a caller-supplied lambda
    if qvalue_lambda is not None:
        logger.write('  * Calculating q-values with lambda = {:.3f}'.format(
            qvalue_lambda))
        qval, pi0 = rfunc.qvalue(res_df['pval_beta'], qvalue_lambda)
    else:
        qval, pi0 = rfunc.qvalue(res_df['pval_beta'])
    res_df['qval'] = qval

    significant_fraction = 1 - pi0
    logger.write(
        '  * Proportion of significant phenotypes (1-pi0): {:.2f}'.format(
            significant_fraction))
    n_significant = np.nansum(res_df['qval'] <= fdr)
    logger.write('  * QTL phenotypes @ FDR {:.2f}: {}'.format(
        fdr, n_significant))

    # global min(p) significance threshold, then the per-gene nominal threshold
    passing = res_df.loc[res_df['qval'] <= fdr, 'pval_beta'].sort_values()
    failing = res_df.loc[res_df['qval'] > fdr, 'pval_beta'].sort_values()

    if passing.shape[0] == 0:
        # nothing significant: no threshold column is produced
        return

    pthreshold = passing.iloc[-1]
    if failing.shape[0] > 0:
        # midpoint between the last passing and the first failing p-value
        pthreshold = (pthreshold + failing.iloc[0]) / 2

    logger.write('  * min p-value threshold @ FDR {}: {:.6g}'.format(
        fdr, pthreshold))
    res_df['pval_nominal_threshold'] = stats.beta.ppf(
        pthreshold, res_df['beta_shape1'], res_df['beta_shape2'])
Beispiel #4
0
    # load eqtl summary statistics
    logging.info('Loading eQTL table.')
    df = read_table(input_file)

    # Compute q-values separately for each gene (value of pheno_col), keeping
    # the variants below the FDR cutoff and one pi0 estimate per gene.
    logging.info('Looping over genes.')
    res = []
    res_pi0 = []
    genes = df[pheno_col].unique()
    for gene in tqdm(genes):
        df_i = df[df[pheno_col] == gene].reset_index(drop=True)
        # presumably load_pvalue provides the `pval` column read below — TODO confirm
        df_i = load_pvalue(df_i, mode=args.mode)
        if df_i.shape[0] < 1:
            continue
        try:
            qval, pi0 = rfunc.qvalue(df_i.pval.values)
        except Exception:
            # Best effort: skip genes whose q-value estimation fails. Catching
            # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
            # still stop the loop.
            logging.info(f'Failed on {gene}')
            continue
        tmp = df_i[[pheno_col, variant_col, 'pval']].copy()
        tmp['qval'] = qval
        res_pi0.append(pd.DataFrame({'phenotype_id': [gene], 'pi0': [pi0]}))
        res.append(tmp[tmp.qval < args.fdr_cutoff].reset_index(drop=True))
    # NOTE: pd.concat raises ValueError when no gene produced a result
    res = pd.concat(res, axis=0)
    res_pi0 = pd.concat(res_pi0, axis=0)

    # save output
    logging.info('Writing output to disk.')
    res.to_parquet(args.output)
    res_pi0.to_csv(args.output_pi0, index=False)