def prepare_expression(counts_df, tpm_df, vcf_lookup_s, sample_frac_threshold=0.2,
                       count_threshold=6, tpm_threshold=0.1, mode='tmm'):
    """
    Filter genes by expression, then normalize and inverse-normal transform them.

    Samples are first restricted to those found both in counts_df.columns and
    in vcf_lookup_s.index. A gene (row) is kept when both rules hold:
      TPM >= tpm_threshold in >= sample_frac_threshold*samples
      read counts >= count_threshold in >= sample_frac_threshold*samples

    vcf_lookup_s: lookup table mapping sample IDs to VCF IDs
                  (only its index is read here)

    Between-sample normalization modes (case-insensitive):
      tmm: TMM from edgeR, computed on every gene of counts_df; the gene
           filter is applied to the result
      qn:  quantile normalization of the already-filtered tpm_df

    Returns the output of rnaseqnorm.inverse_normal_transform.
    Raises ValueError for any other mode.
    """
    # Samples shared by the counts table and the VCF lookup.
    shared_samples = np.intersect1d(counts_df.columns, vcf_lookup_s.index)
    tpm_df = tpm_df[shared_samples]
    counts_df = counts_df[shared_samples]

    # Minimum number of samples in which a gene must clear each threshold.
    min_samples = sample_frac_threshold * tpm_df.shape[1]

    # Expression thresholds, one boolean per gene.
    passes_tpm = np.sum(tpm_df >= tpm_threshold, axis=1) >= min_samples
    passes_counts = np.sum(counts_df >= count_threshold, axis=1) >= min_samples
    mask = (passes_tpm & passes_counts).values

    # Apply the requested normalization.
    normalization = mode.lower()
    if normalization == 'tmm':
        tmm_counts_df = rnaseqnorm.edgeR_cpm(counts_df, normalized_lib_sizes=True)
        return rnaseqnorm.inverse_normal_transform(tmm_counts_df[mask])
    if normalization == 'qn':
        qn_df = rnaseqnorm.normalize_quantiles(tpm_df.loc[mask])
        return rnaseqnorm.inverse_normal_transform(qn_df)
    raise ValueError('Unsupported mode {}'.format(mode))
def prepare_expression(counts_df, tpm_df, mode='tmm'):
    """
    Normalize expression between samples, then inverse-normal transform it.

    This part and the normalization part are adapted from the official GTEx
    pipeline. Unlike that pipeline, no expression thresholds are applied
    here: every gene in the input is normalized, so any filtering must be
    done by the caller beforehand.

    counts_df: read counts; only used when mode is 'tmm'
    tpm_df:    TPM values; only used when mode is 'qn'

    Between-sample normalization modes (case-insensitive):
      tmm: TMM from edgeR
      qn:  quantile normalization

    Returns the output of rnaseqnorm.inverse_normal_transform.
    Raises ValueError for any other mode.
    """
    normalization = mode.lower()

    if normalization == 'tmm':
        tmm_counts_df = rnaseqnorm.edgeR_cpm(counts_df, normalized_lib_sizes=True)
        return rnaseqnorm.inverse_normal_transform(tmm_counts_df)

    if normalization == 'qn':
        qn_df = rnaseqnorm.normalize_quantiles(tpm_df)
        return rnaseqnorm.inverse_normal_transform(qn_df)

    raise ValueError('Unsupported mode {}'.format(mode))
def _zscore_genes(expression, clip=10.0):
    """Z-score each column, clip to [-clip, clip], then re-center each column."""
    zscores = (expression - np.mean(expression, axis=0)) / np.std(expression, axis=0)
    # A zero-variance column divides by zero and stays NaN through the clip.
    zscores = np.clip(zscores, -clip, clip)
    # Clipping can shift the column mean away from zero, so center again.
    return zscores - np.mean(zscores, axis=0)


def normalize_expression_and_generate_expression_pcs(
        raw_pseudobulk_expression, sample_level_normalization,
        gene_level_normalization, num_pcs, pb_expression_output_root):
    """
    Normalize a pseudobulk expression matrix, save it, and run PCA on it.

    raw_pseudobulk_expression: 2-D numpy array whose columns are genes
        (rows are presumably the pseudobulk samples -- confirm with caller).
    sample_level_normalization: 'qn' applies rnaseqnorm.normalize_quantiles
        to the transposed matrix; any other value leaves the data untouched.
    gene_level_normalization:
        'zscore': per-gene z-score, clipped to [-10, 10] and re-centered
        'ign':    rnaseqnorm.inverse_normal_transform on the transposed
                  matrix (code from GTEx v8)
    num_pcs: forwarded to generate_pca_scores_and_variance_explained.
    pb_expression_output_root: prefix for the three output files
        'normalized_expression.txt', 'pca_scores.txt' and 'pca_pve.txt'.

    Raises ValueError for an unsupported gene_level_normalization. The check
    happens before any computation or file output, so nothing is written in
    that case.
    """
    if gene_level_normalization not in ('zscore', 'ign'):
        raise ValueError(
            '{} gene level normalization method currently not implemented'.format(
                gene_level_normalization))

    ##################################
    # Perform sample level normalization
    ##################################
    if sample_level_normalization == 'qn':
        df = pd.DataFrame(np.transpose(raw_pseudobulk_expression))
        temp_out = rnaseqnorm.normalize_quantiles(df)
        raw_pseudobulk_expression = np.transpose(np.asarray(temp_out))

    ##################################
    # Perform gene level normalization
    ##################################
    if gene_level_normalization == 'zscore':
        normalized_expression = _zscore_genes(raw_pseudobulk_expression)
    else:
        # 'ign' -- code from GTEx v8: project each gene onto a gaussian
        df = pd.DataFrame(np.transpose(raw_pseudobulk_expression))
        norm_df = rnaseqnorm.inverse_normal_transform(df)
        normalized_expression = np.transpose(np.asarray(norm_df))

    # Save normalized pseudobulk gene expression to output file
    pseudobulk_expression_file = pb_expression_output_root + 'normalized_expression.txt'
    np.savetxt(pseudobulk_expression_file, normalized_expression, fmt="%s", delimiter='\t')

    # Run PCA on pseudobulk data
    pca_file = pb_expression_output_root + 'pca_scores.txt'
    pca_ve_file = pb_expression_output_root + 'pca_pve.txt'
    generate_pca_scores_and_variance_explained(
        pseudobulk_expression_file, num_pcs, pca_file, pca_ve_file)
def standardize_expression(tpm_expression_matrix_file, standardized_tpm_expression_matrix_file):
    """
    Quantile normalize and inverse-normal transform a TPM matrix file.

    tpm_expression_matrix_file: tab-delimited text file. The first row holds
        gene names (from the second column on), the first column holds
        sample names (from the second row on), and the remaining cells are
        parsed as floats.
    standardized_tpm_expression_matrix_file: output path. The result is
        written transposed relative to the input: a header line starting
        with 'SampleId' followed by the sample names, then one line per
        gene with that gene's value for each sample.

    The values are passed through rnaseqnorm.normalize_quantiles and then
    rnaseqnorm.inverse_normal_transform, both on the genes x samples
    orientation.
    """
    tpm_full = np.loadtxt(tpm_expression_matrix_file, dtype=str, delimiter='\t')
    samples = tpm_full[1:, 0]
    genes = tpm_full[0, 1:]
    tpm = tpm_full[1:, 1:].astype(float)

    # rnaseqnorm is fed genes as rows and samples as columns, hence the transposes.
    df = pd.DataFrame(np.transpose(tpm))
    quantile_normalized = rnaseqnorm.normalize_quantiles(df)
    norm_df = rnaseqnorm.inverse_normal_transform(quantile_normalized)
    standardized_tpm = np.transpose(np.asarray(norm_df))

    # The context manager closes the file even if a write fails part-way.
    with open(standardized_tpm_expression_matrix_file, 'w') as out:
        # Header line
        out.write('SampleId\t' + '\t'.join(samples) + '\n')
        for gene_num, gene_name in enumerate(genes):
            expr = standardized_tpm[:, gene_num].astype(str)
            out.write(gene_name + '\t' + '\t'.join(expr) + '\n')
def _group_columns_by_replicate(sample_ids):
    """Map each replicate-group name to the column positions of its samples."""
    # NOTE(review): `\w+` is greedy, so only the final digit of an ID is
    # stripped ('ctrl12' -> 'ctrl1'). Replicate numbers >= 10 would therefore
    # land in a different group than 1-9 -- confirm this is intended.
    pattern = re.compile(r'\w+(?=[0-9]+$)')  # regex for groups from sample ids
    groups = OrderedDict()
    for position, sample_id in enumerate(sample_ids):
        match = pattern.search(sample_id)
        if match is None:
            raise ValueError(
                'Cannot derive a replicate group from sample id {!r}: '
                'expected a name ending in a replicate digit'.format(sample_id))
        groups.setdefault(match.group(0), []).append(position)
    return groups


def main():
    """
    Quantile normalize a count matrix and average its replicate columns.

    Reads settings from the module-level `args` namespace:
      args.count_matrix: tab-separated matrix; the first column is the index
      args.output_dir:   created if it does not exist yet
      args.prefix:       file name prefix for both outputs

    Writes two tab-separated files into args.output_dir:
      <prefix>.accessibility.txt: per-group mean of the normalized columns,
          where a group is a sample id with its trailing replicate digit
          removed; groups keep the order of first appearance
      <prefix>.quant_norm.pct:    the normalized matrix with all replicates

    Raises ValueError if a sample id does not end in a digit.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(args.output_dir, exist_ok=True)

    count_matrix_df = pd.read_csv(args.count_matrix, sep='\t', index_col=0)

    print("Normalizing...")
    norm_matrix_df = normalize_quantiles(count_matrix_df)
    print(norm_matrix_df.head())

    print("Averaging replicate counts...")
    groups = _group_columns_by_replicate(list(norm_matrix_df.columns))

    # Mean across each group's columns, selected by position so that the
    # result does not depend on column labels being unique.
    group_means = OrderedDict(
        (grp, norm_matrix_df.iloc[:, idc].mean(axis=1))
        for grp, idc in groups.items())
    mu_norm_matrix_df = pd.DataFrame(group_means, index=norm_matrix_df.index)

    outfile = os.path.join(args.output_dir, args.prefix + '.accessibility.txt')
    mu_norm_matrix_df.to_csv(outfile, sep='\t')

    outfile = os.path.join(args.output_dir, args.prefix + '.quant_norm.pct')
    norm_matrix_df.to_csv(outfile, sep='\t')

    print(
        "wrote to *.accessibility (averaged) and *.quant_norm.pct (w/ replicates) to: {}"
        .format(args.output_dir))
def qn_normalization(tpm_df, mask):
    """
    Quantile normalize the selected rows of tpm_df, then inverse-normal transform.

    tpm_df: table whose rows are selected with `tpm_df.loc[mask]`
    mask:   row selector passed to `.loc`

    Returns the output of rnaseqnorm.inverse_normal_transform applied to the
    output of rnaseqnorm.normalize_quantiles.
    """
    selected_rows = tpm_df.loc[mask]
    return rnaseqnorm.inverse_normal_transform(
        rnaseqnorm.normalize_quantiles(selected_rows)
    )