def prepare_expression(counts_df, tpm_df, vcf_lookup_s, sample_frac_threshold=0.2,
                       count_threshold=6, tpm_threshold=0.1, mode='tmm'):
    """
    Filter genes by expression level and return normalized expression values.

    Only the samples (columns) found in both counts_df.columns and
    vcf_lookup_s.index are used.

    Genes are thresholded based on the following expression rules:
      TPM >= tpm_threshold in >= sample_frac_threshold*samples
      read counts >= count_threshold in >= sample_frac_threshold*samples

    vcf_lookup_s: lookup table mapping sample IDs to VCF IDs
                  (only its index is read here)

    Between-sample normalization modes:
      tmm: TMM from edgeR
      qn: quantile normalization

    Returns the output of rnaseqnorm.inverse_normal_transform for the genes
    passing both thresholds.

    Raises ValueError if mode is not 'tmm' or 'qn' (case-insensitive).
    """
    # Validate the mode first so an unsupported value fails before any work is done.
    normalization = mode.lower()
    if normalization not in ('tmm', 'qn'):
        raise ValueError('Unsupported mode {}'.format(mode))

    # np.intersect1d returns the sorted, unique sample IDs shared by both inputs.
    ix = np.intersect1d(counts_df.columns, vcf_lookup_s.index)
    tpm_df = tpm_df[ix]
    counts_df = counts_df[ix]
    ns = tpm_df.shape[1]
    min_samples = sample_frac_threshold * ns

    # expression thresholds: a gene must pass both the TPM and the read count rule
    mask = (
        (np.sum(tpm_df >= tpm_threshold, axis=1) >= min_samples) &
        (np.sum(counts_df >= count_threshold, axis=1) >= min_samples)
    ).values

    # apply normalization
    if normalization == 'tmm':
        # CPM is computed from all genes of the selected samples; the mask is applied afterwards.
        tmm_counts_df = rnaseqnorm.edgeR_cpm(counts_df, normalized_lib_sizes=True)
        norm_df = rnaseqnorm.inverse_normal_transform(tmm_counts_df[mask])
    else:
        # Quantile normalization only sees the genes that passed the thresholds.
        qn_df = rnaseqnorm.normalize_quantiles(tpm_df.loc[mask])
        norm_df = rnaseqnorm.inverse_normal_transform(qn_df)

    return norm_df
def prepare_expression(counts_df, tpm_df, mode='tmm'):
    """
    Normalize expression values between samples.

    This part and the normalization part are adapted from the GTEx official pipeline.
    No expression thresholds or sample selection are applied here: every row
    of the input that the chosen mode reads is normalized.

    counts_df: read counts, used only when mode is 'tmm'
    tpm_df: TPM values, used only when mode is 'qn'

    Between-sample normalization modes:
      tmm: TMM from edgeR
      qn: quantile normalization

    Returns the output of rnaseqnorm.inverse_normal_transform.

    Raises ValueError if mode is not 'tmm' or 'qn' (case-insensitive).
    """
    normalization = mode.lower()

    # apply normalization
    if normalization == 'tmm':
        tmm_counts_df = rnaseqnorm.edgeR_cpm(counts_df, normalized_lib_sizes=True)
        norm_df = rnaseqnorm.inverse_normal_transform(tmm_counts_df)
    elif normalization == 'qn':
        qn_df = rnaseqnorm.normalize_quantiles(tpm_df)
        norm_df = rnaseqnorm.inverse_normal_transform(qn_df)
    else:
        raise ValueError('Unsupported mode {}'.format(mode))

    return norm_df
def main():
    """
    Normalize a count matrix and write per-sample and group-averaged tables.

    Reads the tab-separated file args.count_matrix (first line skipped, first
    column used as the index), normalizes it with edgeR_cpm, averages the
    columns that share a group name, and writes two tab-separated files into
    args.output_dir:
      <prefix>mean_expression.tmm.txt  group means
      <prefix>.tmm.gct                 normalized values for every sample

    The group name of a sample is the part of its ID matched by the regex
    below, i.e. the ID with its last digit removed.

    Raises ValueError if a sample ID does not end in a digit preceded by at
    least one word character, since no group name can be derived from it.
    """
    # exist_ok avoids the race between checking for the directory and creating it.
    os.makedirs(args.output_dir, exist_ok=True)

    counts_df = pd.read_csv(args.count_matrix, sep='\t', skiprows=1, index_col=0)

    print("Normalizing...")
    norm_counts_df = edgeR_cpm(counts_df)

    print("Averaging replicate counts...")
    pat = r'\w+(?=[0-9]+$)'  # regex for groups from sample ids
    # NOTE(review): \w+ is greedy and also matches digits, so only the final
    # digit is stripped ('ctl12' -> group 'ctl1'). Confirm this is intended
    # for replicate numbers of two or more digits.
    group_regex = re.compile(pat)
    groups = OrderedDict()
    for i, sid in enumerate(norm_counts_df.columns):
        match = group_regex.search(sid)
        if match is None:
            raise ValueError(
                'Cannot derive a group name from sample ID {!r}: '
                'expected a name ending in a digit'.format(sid))
        groups.setdefault(match.group(0), []).append(i)

    # get mean across group indices
    mu_norm_counts_df = pd.DataFrame(0, index=norm_counts_df.index, columns=list(groups))
    for grp, idc in groups.items():
        group_cols = norm_counts_df.columns[idc]
        mu_norm_counts_df[grp] = norm_counts_df[group_cols].mean(axis=1)

    # NOTE(review): unlike '.tmm.gct' below, this name has no '.' between the
    # prefix and 'mean_expression' — kept as is so existing outputs keep their names.
    outfile = os.path.join(args.output_dir, args.prefix + 'mean_expression.tmm.txt')
    mu_norm_counts_df.to_csv(outfile, sep='\t')

    outfile = os.path.join(args.output_dir, args.prefix + '.tmm.gct')
    norm_counts_df.to_csv(outfile, sep='\t')

    print(
        "wrote to *.expression.txt (averaged) and *.tmm.gct (w/ replicates) to: {}"
        .format(args.output_dir))
def tmm_normalization(counts_df, mask):
    """
    Normalize counts with rnaseqnorm.edgeR_cpm and transform the selected rows.

    counts_df: read counts passed to rnaseqnorm.edgeR_cpm with
               normalized_lib_sizes=True
    mask: selector applied with [] to the CPM table before the inverse
          normal transform (presumably a boolean gene mask; confirm with callers)

    Returns the output of rnaseqnorm.inverse_normal_transform.
    """
    # CPM is computed on the full table; the mask is applied only afterwards.
    cpm_df = rnaseqnorm.edgeR_cpm(counts_df, normalized_lib_sizes=True)
    selected_df = cpm_df[mask]
    return rnaseqnorm.inverse_normal_transform(selected_df)