def run_edgeR(gene_expression, bio_assignment, gene_names, batch_info=None, batch=True):
    """Run an edgeR GLM likelihood-ratio test and return the p-values.

    The inputs are converted with ``conversion_pydataframe`` and pushed
    into the R session behind ``r``, where a DGEList is built, normalised,
    fitted with ``glmFit`` and tested with ``glmLRT``.

    Parameters:
    -----------
    - gene_expression: Expression table; it is transposed (``r.t``) before
      being used as the DGEList counts
    - bio_assignment: Biological group labels, turned into an R factor
    - gene_names: Passed as the ``genes`` argument of ``DGEList``
    - batch_info: Optional batch labels; when None, no batch term is used
    - batch: Whether to add the batch term (ignored if batch_info is None)

    Returns whatever evaluating ``lrt$table$PValue`` in R yields.

    Side effects: assigns ``dge``, ``bio_group``, ``design``, ``fit``,
    ``lrt`` (and ``batch_group`` when batching) in the R session.
    """
    # A batch term is only possible when batch labels were supplied.
    use_batch = bool(batch) and batch_info is not None

    # Pick the branch-dependent R commands up front; they are executed
    # further down in the same order as before.
    # NOTE(review): the hard-coded column names look like they assume
    # two-level factors -- confirm against callers.
    if use_batch:
        design_cmd = """design <- model.matrix(~bio_group+batch_group, data = dge$samples)"""
        colnames_cmd = """colnames(design) <- c("Intercept", "bio", "batch")"""
        lrt_cmd = """lrt <- glmLRT(fit, coef="bio")"""
    else:
        design_cmd = """design <- model.matrix(~bio_group, data = dge$samples)"""
        colnames_cmd = """colnames(design) <- c("Intercept", "bio")"""
        lrt_cmd = """lrt <- glmLRT(fit)"""

    counts_r = conversion_pydataframe(gene_expression)
    groups_r = conversion_pydataframe(bio_assignment)

    dge_obj = r.DGEList(counts=r.t(counts_r), genes=gene_names)
    r.assign("dge", dge_obj)
    r.assign("bio_group", r.factor(groups_r))
    r("dge$samples$bio_group <- bio_group")

    if use_batch:
        batches_r = conversion_pydataframe(batch_info)
        r.assign("batch_group", r.factor(batches_r))
        r("dge$samples$batch_group <- batch_group")

    pipeline = (
        """dge <- suppressWarnings(edgeR::calcNormFactors(dge))""",
        design_cmd,
        colnames_cmd,
        """dge <- estimateDisp(dge, design)""",
        """fit <- glmFit(dge, design)""",
        lrt_cmd,
    )
    for r_command in pipeline:
        r(r_command)

    return r("lrt$table$PValue")
def calc_size_factors(self, method="TMM"):
    """Compute per-sample size factors, scaled so the largest is 1.0.

    A DGEList is built from ``self.count_df`` through the R bridge and
    ``calcNormFactors`` is run on it; each sample's ``lib.size`` is then
    multiplied by its ``norm.factors``.

    Parameters:
    -----------
    - method: Normalization method forwarded to ``calcNormFactors``,
      e.g. 'TMM'

    Returns a pandas Series indexed by ``self.count_df.columns``.
    """
    # NOTE(review): count_df is handed to R as-is; presumably an automatic
    # pandas-to-R conversion is active elsewhere -- confirm.
    dge = r.calcNormFactors(r.DGEList(self.count_df), method=method)

    # Effective library size = raw library size * normalization factor.
    sample_table = dge.rx2("samples")
    lib_sizes = np.array(sample_table.rx2("lib.size"))
    norm_factors = np.array(sample_table.rx2("norm.factors"))
    effective = pd.Series(lib_sizes * norm_factors, index=self.count_df.columns)

    # Rescale so that the maximum size factor is 1.0.
    return effective / effective.max()
def norm_expr_vals(self, ref_col, method="TMM"):
    """
    Normalize expression values relative to a reference sample
    (using TMM normalization by default) and compute counts per million.

    Parameters:
    -----------
    - ref_col: Reference column, forwarded as ``refColumn`` to
      ``calcNormFactors``
    - method: Method to use for normalization, e.g. 'TMM'

    Returns:
    --------
    A tuple ``(r_dge, r_cpm_result)``: the DGEList after
    ``calcNormFactors`` and the result of ``cpm`` on it.

    The first ten entries of each result are printed to stdout.
    """
    # Convert the counts dataframe into an R object and wrap it in a DGEList
    r_counts = conversion_pydataframe(self.exp_obj.counts_df)
    r_dge = r.DGEList(r_counts)

    # Calculate normalization factors
    r_dge = r.calcNormFactors(r_dge, refColumn=ref_col, method=method)
    # Single-argument print(...) prints the same on Python 2 and is valid
    # syntax on Python 3, unlike the bare print statement.
    print(r_dge[0:10])

    # Get counts per million
    r_cpm_result = r.cpm(r_dge)
    print("Counts per million: ")
    print(r_cpm_result[0:10])

    return r_dge, r_cpm_result