def start(self): self.log.info("Correcting expression data for dataset effects.") self.print_arguments() self.log.info("Correcting signature expression data.") if not check_file_exists(self.sign_expr_dc_outpath) or self.force: if self.dataset_df is None: self.dataset_df = load_dataframe(self.dataset_file, header=0, index_col=0, logger=self.log) if self.sign_expr_df is None: self.sign_expr_df = load_dataframe(self.sign_expr_file, header=0, index_col=0, logger=self.log) self.sign_expr_dc_df = self.dataset_correction( self.sign_expr_df, self.dataset_df) save_dataframe(df=self.sign_expr_dc_df, outpath=self.sign_expr_dc_outpath, index=True, header=True, logger=self.log) else: self.log.info("\tSkipping step.")
def save(self):
    print("\tSaving matrix.")
    save_dataframe(df=self.df,
                   outpath=self.outpath,
                   index=True,
                   header=True,
                   logger=self.log)
def save(self):
    save_dataframe(df=self.eqtl_df,
                   outpath=self.outpath,
                   index=False,
                   header=True,
                   logger=self.log)

    save_dataframe(df=self.eqtl_df.loc[:, ["ProbeName", "SNPName"]],
                   outpath=os.path.join(self.outdir, "snp_gene_list.txt"),
                   index=False,
                   header=True,
                   logger=self.log)
def create_covs_file(self):
    # Read the eigenvectors file.
    self.log.info("Loading eigenvectors matrix.")
    eigen_df = load_dataframe(self.eig_file,
                              header=0,
                              index_col=0,
                              nrows=max(self.n_eigen),
                              logger=self.log)

    # Orient the matrix as components x samples.
    if len(set(self.sample_order).intersection(set(eigen_df.columns))) == 0:
        eigen_df = eigen_df.T
    eigen_df.columns = [self.sample_dict[x] if x in self.sample_dict else x
                        for x in eigen_df.columns]
    eigen_df = eigen_df.loc[:, self.sample_order]
    for n_eigen in self.n_eigen:
        save_dataframe(df=eigen_df.iloc[:n_eigen, :],
                       outpath=os.path.join(
                           self.outdir,
                           "first{}PCComponents.txt.gz".format(n_eigen)),
                       index=True,
                       header=True,
                       logger=self.log)

    # Load the deconvolution matrix.
    self.log.info("Loading deconvolution matrix.")
    if self.decon_df is None:
        self.decon_df = load_dataframe(self.decon_file,
                                       header=0,
                                       index_col=0,
                                       logger=self.log)

    # Merge.
    self.log.info("Merging matrices.")
    covs_df = pd.merge(eigen_df.T, self.decon_df,
                       left_index=True, right_index=True)
    covs_df = covs_df.T
    covs_df.index.name = "-"

    # Validate the sample order.
    if not covs_df.columns.equals(self.sample_order):
        covs_df = covs_df[self.sample_order]

    # Remove old dataframes.
    del eigen_df

    return covs_df
def save(self):
    save_dataframe(df=self.gte_df,
                   outpath=self.outpath,
                   index=False,
                   header=False,
                   logger=self.log)

    sample_dataset_df = self.gte_df.iloc[:, [1, 2]]
    sample_dataset_df.columns = ["sample", "dataset"]
    save_dataframe(df=sample_dataset_df,
                   outpath=os.path.join(self.outdir,
                                        "SampleToDataset.txt.gz"),
                   index=False,
                   header=True,
                   logger=self.log)
def pivot_and_save(self, dfm, col, indices, columns):
    print("Pivoting table.", flush=True)
    pivot_df = dfm.pivot(index='covariate', columns='SNP', values=col)

    print("Reordering dataframe.", flush=True)
    pivot_df = pivot_df.loc[indices, columns]
    # Strip the uniqueness suffix ("<name>_<i>") that was appended to make
    # the index and column labels unique before melting.
    pivot_df.index = ["_".join(x.split("_")[:-1]) for x in pivot_df.index]
    pivot_df.index.name = "-"
    pivot_df.columns = ["_".join(x.split("_")[:-1])
                        for x in pivot_df.columns]
    pivot_df.columns.name = None

    print("Saving {} dataframe.".format(col), flush=True)
    save_dataframe(df=pivot_df,
                   outpath=os.path.join(self.work_dir,
                                        "{}_table.txt.gz".format(col)),
                   header=True,
                   index=True)
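
# For reference, a minimal sketch of how `DataFrame.pivot` reshapes the
# molten table above: each unique (index, columns) pair becomes one cell
# holding the corresponding `values` entry. The names below are
# illustrative only.
import pandas as pd

dfm_example = pd.DataFrame({
    "covariate": ["CellTypeA", "CellTypeA", "CellTypeB", "CellTypeB"],
    "SNP": ["rs1", "rs2", "rs1", "rs2"],
    "pvalue": [0.01, 0.20, 0.03, 0.50],
})
wide = dfm_example.pivot(index="covariate", columns="SNP", values="pvalue")
# wide:
# SNP          rs1   rs2
# covariate
# CellTypeA   0.01  0.20
# CellTypeB   0.03  0.50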
def save(self):
    save_dataframe(df=self.eqtl_df,
                   outpath=self.outpath,
                   index=False,
                   header=True,
                   logger=self.log)
def start(self): self.log.info("Starting creating matrices.") self.print_arguments() if self.eqtl_df is None: self.eqtl_df = load_dataframe(self.eqtl_file, header=0, index_col=None, logger=self.log) self.log.info("Parsing genotype input data.") if not check_file_exists(self.geno_outpath) or not check_file_exists( self.alleles_outpath) or self.force: alleles_df, geno_df = self.parse_genotype_file() self.log.info("Reorder, Filter, and save.") self.alleles_df = alleles_df.loc[self.eqtl_df.loc[:, "SNPName"], :] save_dataframe(df=self.alleles_df, outpath=self.alleles_outpath, index=True, header=True, logger=self.log) self.geno_df = geno_df.loc[self.eqtl_df.loc[:, "SNPName"], self.sample_order] save_dataframe(df=self.geno_df, outpath=self.geno_outpath, index=True, header=True, logger=self.log) else: self.log.info("\tSkipping step.") self.log.info("Parsing expression input data.") if not check_file_exists(self.expr_outpath) or not check_file_exists( self.sign_expr_outpath) or self.force: self.log.info("Loading signature matrix.") self.sign_df = load_dataframe(inpath=self.sign_file, header=0, index_col=0, logger=self.log) signature_genes = set(self.sign_df.index.to_list()) self.log.info("Loading gene traslate dict.") self.gene_info_df = load_dataframe(inpath=self.gene_info_file, header=0, index_col=None, logger=self.log) gene_trans_dict = construct_dict_from_df(self.gene_info_df, self.ensg_id, self.hgnc_id) if not check_file_exists(self.expr_outpath) or self.force: self.log.info("Parsing expression data.") self.expr_df, self.sign_expr_df = self.parse_expression_file( self.expr_file, signature_genes, gene_trans_dict, include_decon=self.decon_expr_file is None) if (not check_file_exists(self.sign_expr_outpath) or self.force) and (check_file_exists(self.decon_expr_file)): self.log.info("Parsing deconvolution expression data.") self.log.warning( "Using different expresion file for deconvolution.") _, self.sign_expr_df = self.parse_expression_file( self.decon_expr_file, signature_genes, gene_trans_dict, include_expr=False, remove_ens_version=True) self.log.info("Reorder, Filter, and save.") if self.expr_df is not None: self.expr_df = self.expr_df.loc[self.eqtl_df.loc[:, "ProbeName"], self.sample_order] save_dataframe(df=self.expr_df, outpath=self.expr_outpath, index=True, header=True, logger=self.log) if self.sign_expr_df is not None: self.sign_expr_df = self.sign_expr_df.loc[:, self.sample_order] save_dataframe(df=self.sign_expr_df, outpath=self.sign_expr_outpath, index=True, header=True, logger=self.log) else: self.log.info("\tSkipping step.")
def perform_deconvolution(self):
    if self.sign_df is None:
        # Load the cell type profile file.
        self.log.info("Loading cell type profile matrix.")
        self.sign_df = load_dataframe(self.sign_file,
                                      header=0,
                                      index_col=0,
                                      logger=self.log)

    if self.sign_expr_df is None:
        # Load the cell type expression file.
        self.log.info("Loading cell type expression matrix.")
        self.sign_expr_df = load_dataframe(self.sign_expr_file,
                                           header=0,
                                           index_col=0,
                                           logger=self.log)

    # Filter uninformative genes from the signature matrix.
    sign_df = self.filter(self.sign_df, cutoff=self.min_expr_cutoff)

    # Subset and reorder.
    sign_df, expr_df = self.subset(sign_df, self.sign_expr_df)

    # Transform.
    sign_df = self.perform_log2_transform(sign_df)

    # Shift the data to be positive, if required.
    self.log.info("Shifting data to be positive if required.")
    if sign_df.values.min() < 0:
        self.log.warning("\tSignature matrix is shifted.")
        sign_df = self.perform_shift(sign_df)

    if expr_df.values.min() < 0:
        self.log.warning("\tExpression matrix is shifted.")
        expr_df = self.perform_shift(expr_df)

    self.log.info("Signature shape: {}".format(sign_df.shape))
    self.log.info("Expression shape: {}".format(expr_df.shape))

    # Perform deconvolution per sample.
    self.log.info("Performing partial deconvolution.")
    decon_data = []
    residuals_data = []
    recon_accuracy_data = []
    for _, sample in expr_df.T.iterrows():
        # Model.
        proportions, rnorm = self.nnls(sign_df, sample)

        # Calculate the reconstruction accuracy.
        recon_accuracy = self.calc_reconstruction_accuracy(
            y=sample, X=sign_df, betas=proportions)

        # Save.
        decon_data.append(proportions)
        residuals_data.append(rnorm)
        recon_accuracy_data.append(recon_accuracy)

    decon_df = pd.DataFrame(decon_data,
                            index=expr_df.columns,
                            columns=sign_df.columns)
    residuals_df = pd.Series(residuals_data, index=expr_df.columns)
    recon_accuracy = pd.Series(recon_accuracy_data, index=expr_df.columns)

    self.log.info("Estimated weights:")
    self.log.info(decon_df.mean(axis=0))
    self.log.info("Average reconstruction accuracy: {:.2f} "
                  "[SD: {:.2f}]".format(recon_accuracy.mean(),
                                        recon_accuracy.std()))

    save_dataframe(df=decon_df,
                   outpath=os.path.join(self.outdir, "NNLS_betas.txt.gz"),
                   index=True,
                   header=True,
                   logger=self.log)

    # Make the weights sum up to 1.
    decon_df = self.sum_to_one(decon_df)
    self.log.info("Estimated proportions:")
    self.log.info(decon_df.mean(axis=0))

    # Calculate the average residuals.
    self.log.info("Average residual: {:.2f}".format(residuals_df.mean()))

    save_dataframe(df=decon_df,
                   outpath=os.path.join(
                       self.outdir, "deconvolution_table_complete.txt.gz"),
                   index=True,
                   header=True,
                   logger=self.log)

    if self.cell_type_groups is not None:
        self.log.info("Summing cell types.")
        cell_type_group = np.array(
            [self.cell_type_groups[ct] if ct in self.cell_type_groups
             else ct for ct in decon_df.columns],
            dtype=object)

        cell_types = list(set(cell_type_group))
        cell_types.sort()
        summed_decon_df = pd.DataFrame(np.nan,
                                       index=decon_df.index,
                                       columns=cell_types)
        for ct_group in cell_types:
            summed_decon_df.loc[:, ct_group] = \
                decon_df.loc[:, cell_type_group == ct_group].sum(axis=1)

        decon_df = summed_decon_df

    return decon_df
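
# A minimal, self-contained sketch of the per-sample NNLS step above,
# assuming `self.nnls` wraps scipy.optimize.nnls: solve
# argmin_w ||S w - y||_2 subject to w >= 0, where S is the
# genes x cell types signature matrix and y is one sample's expression.
# The toy names below (signature, bulk_sample) are illustrative only.
import numpy as np
from scipy.optimize import nnls

rng = np.random.default_rng(0)
signature = rng.uniform(0, 10, size=(100, 5))  # genes x cell types
true_props = np.array([0.4, 0.3, 0.15, 0.1, 0.05])
bulk_sample = signature @ true_props           # synthetic bulk profile

weights, rnorm = nnls(signature, bulk_sample)  # rnorm = ||S @ w - y||_2
proportions = weights / weights.sum()          # make the weights sum to 1
print(proportions.round(2))                    # ~ [0.4 0.3 0.15 0.1 0.05]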
def save(self):
    save_dataframe(df=self.decon_df,
                   outpath=self.outpath,
                   index=True,
                   header=True,
                   logger=self.log)
def start(self): print("Starting interaction analyser - combine and plot.") self.print_arguments() # Start the timer. start_time = time.time() print("") print("### Step 1 ###") print("Combine pickle files into dataframe.", flush=True) dataframes = {} for filename in [ self.pvalues_filename, self.coef_filename, self.std_err_filename ]: outpath = os.path.join(self.work_dir, "{}_table.txt.gz".format(filename)) if not check_file_exists(outpath) or self.force: print("Loading {} data.".format(filename), flush=True) columns, data = self.combine_pickles(self.work_dir, filename, columns=True) if len(data) == 0: print("\tNo {} data found.".format(filename)) continue print("Creating {} dataframe.".format(filename), flush=True) df = self.create_df(data, columns) print("Saving {} dataframe.".format(filename), flush=True) save_dataframe(df=df, outpath=outpath, header=True, index=True) dataframes[filename] = df del columns, data, df else: print("Skipping step for {}".format(outpath)) dataframes[filename] = load_dataframe(outpath, header=0, index_col=0) print("") print("### Step 2 ###") print("Calculate t-values", flush=True) outpath = os.path.join(self.work_dir, "{}_table.txt.gz".format(self.tvalue_filename)) if not check_file_exists(outpath) or self.force: if self.coef_filename in dataframes and self.std_err_filename in dataframes: # Calculate t-values coef_df = dataframes[self.coef_filename] std_err_df = dataframes[self.std_err_filename] if not coef_df.columns.identical(std_err_df.columns): overlap = set(coef_df.columns).intersection( set(std_err_df.columns)) if len(overlap) == 0: print("No overlapping eQTLs between coef and std_err " "data frame columns.") else: coef_df = coef_df.loc[:, overlap] std_err_df = std_err_df.loc[:, overlap] if not coef_df.index.identical(std_err_df.index): overlap = set(coef_df.index).intersection( set(std_err_df.index)) if len(overlap) == 0: print("No overlapping eQTLs between coef and std_err " "data frames indices.") else: coef_df = coef_df.loc[overlap, :] std_err_df = std_err_df.loc[overlap, :] if coef_df.columns.identical( std_err_df.columns) and coef_df.index.identical( std_err_df.index): tvalue_df = coef_df / std_err_df print("Saving {} dataframe.".format(self.tvalue_filename), flush=True) save_dataframe(df=tvalue_df, outpath=os.path.join( self.work_dir, "{}_table.txt.gz".format( self.tvalue_filename)), header=True, index=True) else: print("\tNo data found.") else: print("Skipping step.") print("") print("### Step 3 ###") print("Starting other calculations", flush=True) if self.pvalues_filename not in dataframes: print("\tNo pvalues data found.") return pvalue_df = dataframes[self.pvalues_filename] pvalue_df_columns = [ "{}_{}".format(x, i) for i, x in enumerate(pvalue_df.columns) ] pvalue_df.columns = pvalue_df_columns pvalue_df_indices = [ "{}_{}".format(x, i) for i, x in enumerate(pvalue_df.index) ] pvalue_df.index = pvalue_df_indices pvalue_df.reset_index(drop=False, inplace=True) print("Melting dataframe.", flush=True) dfm = pvalue_df.melt(id_vars=["index"]) dfm.columns = ["covariate", "SNP", "pvalue"] dfm["rank"] = dfm.loc[:, "pvalue"].rank(ascending=True) n_signif = dfm[dfm["pvalue"] <= self.alpha].shape[0] n_total = dfm.shape[0] print("\t{}/{} [{:.2f}%] of pvalues < {}".format( n_signif, n_total, (100 / n_total) * n_signif, self.alpha), flush=True) print("Adding z-scores.", flush=True) dfm["zscore"] = stats.norm.isf(dfm["pvalue"]) dfm.loc[dfm["pvalue"] > (1.0 - 1e-16), "zscore"] = -8.209536151601387 dfm.loc[dfm["pvalue"] < 1e-323, "zscore"] = 
38.44939448087599 self.pivot_and_save(dfm, "zscore", pvalue_df_indices, pvalue_df_columns) print("Adding BH-FDR.", flush=True) dfm["BH-FDR"] = dfm["pvalue"] * (n_total / (dfm["rank"] + 1)) dfm.loc[dfm["BH-FDR"] > 1, "BH-FDR"] = 1 prev_bh_fdr = -np.Inf for i in range(n_total): bh_fdr = dfm.loc[i, "BH-FDR"] if bh_fdr > prev_bh_fdr: prev_bh_fdr = bh_fdr else: dfm.loc[i, "BH-FDR"] = prev_bh_fdr n_signif = dfm[dfm["BH-FDR"] <= self.alpha].shape[0] print("\t{}/{} [{:.2f}%] of BH-FDR values < {}".format( n_signif, n_total, (100 / n_total) * n_signif, self.alpha), flush=True) self.pivot_and_save(dfm, "BH-FDR", pvalue_df_indices, pvalue_df_columns) print("Adding permutation FDR.", flush=True) print("\tLoading permutation pvalue data.", flush=True) _, perm_pvalues = self.combine_pickles(self.work_dir, self.perm_pvalues_filename) # perm_pvalues = [random.random() for _ in range(n_total * 10)] print("Sorting p-values.", flush=True) perm_pvalues = sorted(perm_pvalues) if len(perm_pvalues) > 0: n_perm = len(perm_pvalues) / n_total if n_perm != self.n_perm: print("\tWARNING: not all permutation pvalus are present") perm_ranks = [] for pvalue in dfm["pvalue"]: perm_ranks.append(bisect_left(perm_pvalues, pvalue)) dfm["perm-rank"] = perm_ranks dfm["perm-FDR"] = (dfm["perm-rank"] / n_perm) / dfm["rank"] dfm.loc[(dfm.index == 0) | (dfm["perm-rank"] == 0), "perm-FDR"] = 0 dfm.loc[dfm["perm-FDR"] > 1, "perm-FDR"] = 1 self.pivot_and_save(dfm, "perm-FDR", pvalue_df_indices, pvalue_df_columns) print("Saving full dataframe.", flush=True) save_dataframe(df=dfm, outpath=os.path.join(self.work_dir, "molten_table.txt.gz"), header=True, index=True) print("") # Print the time. run_time_min, run_time_sec = divmod(time.time() - start_time, 60) run_time_hour, run_time_min = divmod(run_time_min, 60) print("finished in {} hour(s), {} minute(s) and " "{} second(s).".format(int(run_time_hour), int(run_time_min), int(run_time_sec)), flush=True)
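
# For reference, a minimal sketch of the textbook Benjamini-Hochberg
# adjustment (adjusted p = p * n / rank, then a cumulative minimum taken
# from the largest p-value down). This is a standalone illustration of
# the technique, not the exact monotonicity pass used above.
import numpy as np


def bh_fdr_sketch(pvalues):
    pvalues = np.asarray(pvalues, dtype=float)
    n = pvalues.size
    order = np.argsort(pvalues)                          # ascending p-values
    ranked = pvalues[order] * n / np.arange(1, n + 1)    # p * n / rank
    ranked = np.minimum.accumulate(ranked[::-1])[::-1]   # enforce monotonicity
    fdr = np.empty(n)
    fdr[order] = np.clip(ranked, 0, 1)
    return fdr


print(bh_fdr_sketch([0.001, 0.01, 0.02, 0.8]))
# [0.004      0.02       0.02666667 0.8       ]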
def create_tech_covs_file(self):
    # Load the technical covariates.
    self.log.info("Loading technical covariates matrix.")
    tcov_df = load_dataframe(inpath=self.cov_file,
                             header=0,
                             index_col=0,
                             logger=self.log)

    # Filter on samples and technical covariates.
    self.log.info("Filtering on samples and technical covariates.")
    tcov_df.index = [self.sample_dict[x] if x in self.sample_dict else x
                     for x in tcov_df.index]
    tcov_df = tcov_df.loc[self.sample_order, :].copy()
    save_dataframe(df=tcov_df.T,
                   outpath=os.path.join(
                       self.outdir, "technical_covariates_table.txt.gz"),
                   index=True,
                   header=True,
                   logger=self.log)
    if self.technical_covariates:
        save_dataframe(df=tcov_df.loc[:, self.technical_covariates].T,
                       outpath=os.path.join(
                           self.outdir,
                           "technical_covariates_table_subset.txt.gz"),
                       index=True,
                       header=True,
                       logger=self.log)

    # Load the MDS components.
    self.log.info("Loading MDS matrix.")
    mds_df = load_dataframe(inpath=self.mds_file,
                            header=0,
                            index_col=0,
                            logger=self.log)

    # Filter on samples.
    self.log.info("Filtering on samples.")
    mds_df.index = [self.sample_dict[x] if x in self.sample_dict else x
                    for x in mds_df.index]
    mds_df = mds_df.loc[self.sample_order, :].copy()
    save_dataframe(df=mds_df.T,
                   outpath=os.path.join(self.outdir,
                                        "mds_covariates_table.txt.gz"),
                   index=True,
                   header=True,
                   logger=self.log)

    tmp_combined_df = tcov_df.merge(mds_df,
                                    left_index=True,
                                    right_index=True)
    save_dataframe(df=tmp_combined_df.T,
                   outpath=os.path.join(
                       self.outdir,
                       "technical_and_mds_covariates_table.txt.gz"),
                   index=True,
                   header=True,
                   logger=self.log)

    # Load the dataset matrix.
    self.log.info("Loading dataset matrix.")
    if self.dataset_df is None:
        self.dataset_df = load_dataframe(self.dataset_file,
                                         header=0,
                                         index_col=0,
                                         logger=self.log)

    # Merge.
    self.log.info("Merging matrices.")
    correction_df = reduce(lambda left, right: pd.merge(left,
                                                        right,
                                                        left_index=True,
                                                        right_index=True),
                           [tcov_df, mds_df, self.dataset_df])
    correction_df = correction_df.T
    correction_df.index.name = "-"
    self.log.info("\tCorrection matrix shape: {}".format(
        correction_df.shape))

    # Validate the sample order.
    if not correction_df.columns.equals(self.sample_order):
        correction_df = correction_df[self.sample_order]

    return correction_df
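
# For reference, a minimal sketch of the reduce-merge pattern used above:
# functools.reduce folds pd.merge over a list of frames, joining them all
# on their (sample) index. The toy frames below are illustrative only.
from functools import reduce

import pandas as pd

frames = [
    pd.DataFrame({"cov1": [1, 2]}, index=["s1", "s2"]),
    pd.DataFrame({"mds1": [0.1, 0.2]}, index=["s1", "s2"]),
    pd.DataFrame({"dataset1": [1, 0]}, index=["s1", "s2"]),
]
merged = reduce(lambda left, right: pd.merge(left, right,
                                             left_index=True,
                                             right_index=True),
                frames)
# merged has one row per sample and one column per covariate:
#     cov1  mds1  dataset1
# s1     1   0.1         1
# s2     2   0.2         0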
def save(self):
    save_dataframe(df=self.gte,
                   outpath=self.outpath,
                   index=False,
                   header=False,
                   logger=self.log)