def prepare_expression(counts_df, tpm_df, vcf_lookup_s, sample_frac_threshold=0.2, count_threshold=6, tpm_threshold=0.1, mode='tmm'):
    """
    Filter genes by expression level, then normalize the retained genes.

    Both tables are first restricted to the columns of counts_df that also
    occur in vcf_lookup_s.index. A row (gene) is retained when both hold:
      TPM >= tpm_threshold in >= sample_frac_threshold*samples
      read counts >= count_threshold in >= sample_frac_threshold*samples

    vcf_lookup_s: lookup table mapping sample IDs to VCF IDs
                  (only its index is used here)

    Between-sample normalization modes (matched case-insensitively):
      tmm: TMM from edgeR, computed on all rows of counts_df and
           filtered afterwards
      qn:  quantile normalization of the filtered tpm_df

    Returns the output of rnaseqnorm.inverse_normal_transform.
    Raises ValueError for any other mode.
    """

    # keep only samples present in both the count table and the lookup
    shared_samples = np.intersect1d(counts_df.columns, vcf_lookup_s.index)
    tpm_df = tpm_df[shared_samples]
    counts_df = counts_df[shared_samples]

    # minimum number of samples in which a gene must pass each threshold
    min_samples = sample_frac_threshold * tpm_df.shape[1]

    # expression thresholds
    tpm_pass = (tpm_df >= tpm_threshold).sum(axis=1) >= min_samples
    count_pass = (counts_df >= count_threshold).sum(axis=1) >= min_samples
    keep = (tpm_pass & count_pass).values

    # apply normalization
    normalization = mode.lower()
    if normalization == 'tmm':
        cpm_df = rnaseqnorm.edgeR_cpm(counts_df, normalized_lib_sizes=True)
        return rnaseqnorm.inverse_normal_transform(cpm_df[keep])
    if normalization == 'qn':
        quantile_df = rnaseqnorm.normalize_quantiles(tpm_df.loc[keep])
        return rnaseqnorm.inverse_normal_transform(quantile_df)

    raise ValueError('Unsupported mode {}'.format(mode))
def prepare_expression(counts_df, tpm_df, mode='tmm'):
    """
    Normalize expression between samples.

    This part and the normalization part are adapted from the GTEx official
    pipeline. No expression thresholding or sample filtering is applied
    here; the inputs are normalized as given.

    counts_df: read counts, only used by the 'tmm' mode
    tpm_df:    TPM values, only used by the 'qn' mode

    Between-sample normalization modes (matched case-insensitively):
      tmm: TMM from edgeR
      qn:  quantile normalization

    Returns the output of rnaseqnorm.inverse_normal_transform.
    Raises ValueError for any other mode.
    """
    normalization = mode.lower()

    # apply normalization
    if normalization == 'tmm':
        tmm_counts_df = rnaseqnorm.edgeR_cpm(counts_df,
                                             normalized_lib_sizes=True)
        norm_df = rnaseqnorm.inverse_normal_transform(tmm_counts_df)
    elif normalization == 'qn':
        qn_df = rnaseqnorm.normalize_quantiles(tpm_df)
        norm_df = rnaseqnorm.inverse_normal_transform(qn_df)
    else:
        raise ValueError('Unsupported mode {}'.format(mode))

    return norm_df
# Example no. 3
# 0
def main():
    """
    Normalize a count matrix and write per-sample and per-group mean tables.

    Reads the tab-separated file args.count_matrix (first line skipped,
    first column used as the index), normalizes it with edgeR_cpm, averages
    the normalized columns of each replicate group, and writes two
    tab-separated files into args.output_dir:
      <prefix>mean_expression.tmm.txt  one column of means per group
      <prefix>.tmm.gct                 normalized values for every sample

    Relies on a module-level `args` object providing count_matrix,
    output_dir and prefix.

    Raises ValueError when no group name can be derived from a sample ID.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(args.output_dir, exist_ok=True)

    counts_df = pd.read_csv(args.count_matrix,
                            sep='\t',
                            skiprows=1,
                            index_col=0)

    print("Normalizing...")

    norm_counts_df = edgeR_cpm(counts_df)

    print("Averaging replicate counts...")

    # Group name = run of word characters immediately before the final digit.
    # NOTE(review): \w+ is greedy, so only the LAST digit is stripped
    # ("A12" -> "A1"); confirm this is intended for replicate numbers >= 10.
    pat = re.compile(r'\w+(?=[0-9]+$)')  # regex for groups from sample ids
    groups = OrderedDict()
    for i, sid in enumerate(norm_counts_df.columns):
        match = pat.search(sid)
        if match is None:
            raise ValueError(
                'Cannot derive a replicate group from sample ID {!r}: '
                'expected word characters followed by a trailing digit'
                .format(sid))
        groups.setdefault(match.group(0), []).append(i)

    # get mean across group indices
    mu_norm_counts_df = pd.DataFrame(0,
                                     index=norm_counts_df.index,
                                     columns=list(groups))

    for grp, idc in groups.items():
        # positional selection keeps exactly the columns recorded for the group
        mu_norm_counts_df[grp] = norm_counts_df.iloc[:, idc].mean(axis=1)

    # NOTE(review): unlike the .tmm.gct name below, there is no '.' between
    # the prefix and 'mean_expression'; kept as-is so existing output paths
    # do not change.
    mean_outfile = os.path.join(args.output_dir,
                                args.prefix + 'mean_expression.tmm.txt')
    mu_norm_counts_df.to_csv(mean_outfile, sep='\t')

    gct_outfile = os.path.join(args.output_dir, args.prefix + '.tmm.gct')
    norm_counts_df.to_csv(gct_outfile, sep='\t')

    # report the paths that were actually written
    print("wrote {} (averaged) and {} (w/ replicates)".format(
        mean_outfile, gct_outfile))
def tmm_normalization(counts_df, mask):
    """
    TMM-normalize counts_df, keep the entries selected by mask, and return
    the output of rnaseqnorm.inverse_normal_transform on that selection.

    The normalization runs on the full counts_df; mask is applied afterwards.
    """
    cpm_df = rnaseqnorm.edgeR_cpm(counts_df, normalized_lib_sizes=True)
    selected_df = cpm_df[mask]
    return rnaseqnorm.inverse_normal_transform(selected_df)