Example #1
    def start(self):
        self.log.info("Correcting expression data for dataset effects.")
        self.print_arguments()

        self.log.info("Correcting signature expression data.")
        if not check_file_exists(self.sign_expr_dc_outpath) or self.force:
            if self.dataset_df is None:
                self.dataset_df = load_dataframe(self.dataset_file,
                                                 header=0,
                                                 index_col=0,
                                                 logger=self.log)

            if self.sign_expr_df is None:
                self.sign_expr_df = load_dataframe(self.sign_expr_file,
                                                   header=0,
                                                   index_col=0,
                                                   logger=self.log)

            self.sign_expr_dc_df = self.dataset_correction(
                self.sign_expr_df, self.dataset_df)
            save_dataframe(df=self.sign_expr_dc_df,
                           outpath=self.sign_expr_dc_outpath,
                           index=True,
                           header=True,
                           logger=self.log)
        else:
            self.log.info("\tSkipping step.")
Example #2
    def save(self):
        print("\tSaving matrix.")
        save_dataframe(df=self.df,
                       outpath=self.outpath,
                       index=True,
                       header=True,
                       logger=self.log)
Example #3
    def save(self):
        save_dataframe(df=self.eqtl_df,
                       outpath=self.outpath,
                       index=False,
                       header=True,
                       logger=self.log)
        save_dataframe(df=self.eqtl_df.loc[:, ["ProbeName", "SNPName"]],
                       outpath=os.path.join(self.outdir, "snp_gene_list.txt"),
                       index=False,
                       header=True,
                       logger=self.log)
Example #4
    def create_covs_file(self):
        # Read the eigenvectors file.
        self.log.info("Loading eigenvectors matrix.")
        eigen_df = load_dataframe(self.eig_file,
                                  header=0,
                                  index_col=0,
                                  nrows=max(self.n_eigen),
                                  logger=self.log)
        if len(set(self.sample_order).intersection(set(
                eigen_df.columns))) == 0:
            eigen_df = eigen_df.T
        eigen_df.columns = [
            self.sample_dict[x] if x in self.sample_dict else x
            for x in eigen_df.columns
        ]
        eigen_df = eigen_df.loc[:, self.sample_order]

        for n_eigen in self.n_eigen:
            save_dataframe(df=eigen_df.iloc[:n_eigen, :],
                           outpath=os.path.join(
                               self.outdir,
                               "first{}PCComponents.txt.gz".format(n_eigen)),
                           index=True,
                           header=True,
                           logger=self.log)

        # Load the deconvolution matrix.
        self.log.info("Loading deconvolution matrix.")
        if self.decon_df is None:
            self.decon_df = load_dataframe(self.decon_file,
                                           header=0,
                                           index_col=0,
                                           logger=self.log)

        # Merge.
        self.log.info("Merging matrices.")
        covs_df = pd.merge(eigen_df.T,
                           self.decon_df,
                           left_index=True,
                           right_index=True)
        covs_df = covs_df.T
        covs_df.index.name = "-"

        # Validate sample order.
        if not covs_df.columns.equals(self.sample_order):
            covs_df = covs_df[self.sample_order]

        # Remove old dataframes.
        del eigen_df

        return covs_df
Example #5
    def save(self):
        save_dataframe(df=self.gte_df,
                       outpath=self.outpath,
                       index=False,
                       header=False,
                       logger=self.log)

        sample_dataset_df = self.gte_df.iloc[:, [1, 2]]
        sample_dataset_df.columns = ["sample", "dataset"]
        save_dataframe(df=sample_dataset_df,
                       outpath=os.path.join(self.outdir,
                                            "SampleToDataset.txt.gz"),
                       index=False,
                       header=True,
                       logger=self.log)
Example #6
    def pivot_and_save(self, dfm, col, indices, columns):
        print("Pivoting table.", flush=True)
        pivot_df = dfm.pivot(index='covariate', columns='SNP', values=col)

        print("Reorder dataframe.")
        pivot_df = pivot_df.loc[indices, columns]
        pivot_df.index = ["_".join(x.split("_")[:-1]) for x in pivot_df.index]
        pivot_df.index.name = "-"
        pivot_df.columns = ["_".join(x.split("_")[:-1]) for x in
                            pivot_df.columns]
        pivot_df.columns.name = None

        print("Saving {} dataframe.".format(col), flush=True)
        save_dataframe(df=pivot_df,
                       outpath=os.path.join(self.work_dir,
                                            "{}_table.txt.gz".format(col)),
                       header=True, index=True)
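
As a side note, DataFrame.pivot here reshapes the molten (long) covariate/SNP/value table into a covariate-by-SNP matrix. A toy illustration with made-up values:

    import pandas as pd

    # Made-up molten frame in the shape produced upstream.
    dfm = pd.DataFrame({"covariate": ["cov1_0", "cov1_0", "cov2_1", "cov2_1"],
                        "SNP": ["rs1_0", "rs2_1", "rs1_0", "rs2_1"],
                        "zscore": [1.3, -0.2, 0.8, 2.1]})
    pivot_df = dfm.pivot(index="covariate", columns="SNP", values="zscore")
    print(pivot_df)
    # SNP        rs1_0  rs2_1
    # covariate
    # cov1_0       1.3   -0.2
    # cov2_1       0.8    2.1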
Example #7
    def save(self):
        save_dataframe(df=self.eqtl_df,
                       outpath=self.outpath,
                       index=False,
                       header=True,
                       logger=self.log)
Example #8
    def start(self):
        self.log.info("Starting creating matrices.")
        self.print_arguments()

        if self.eqtl_df is None:
            self.eqtl_df = load_dataframe(self.eqtl_file,
                                          header=0,
                                          index_col=None,
                                          logger=self.log)

        self.log.info("Parsing genotype input data.")
        if not check_file_exists(self.geno_outpath) or not check_file_exists(
                self.alleles_outpath) or self.force:
            alleles_df, geno_df = self.parse_genotype_file()

            self.log.info("Reorder, Filter, and save.")
            self.alleles_df = alleles_df.loc[self.eqtl_df.loc[:, "SNPName"], :]
            save_dataframe(df=self.alleles_df,
                           outpath=self.alleles_outpath,
                           index=True,
                           header=True,
                           logger=self.log)

            self.geno_df = geno_df.loc[self.eqtl_df.loc[:, "SNPName"],
                                       self.sample_order]
            save_dataframe(df=self.geno_df,
                           outpath=self.geno_outpath,
                           index=True,
                           header=True,
                           logger=self.log)
        else:
            self.log.info("\tSkipping step.")

        self.log.info("Parsing expression input data.")
        if not check_file_exists(self.expr_outpath) or not check_file_exists(
                self.sign_expr_outpath) or self.force:
            self.log.info("Loading signature matrix.")
            self.sign_df = load_dataframe(inpath=self.sign_file,
                                          header=0,
                                          index_col=0,
                                          logger=self.log)
            signature_genes = set(self.sign_df.index.to_list())

            self.log.info("Loading gene traslate dict.")
            self.gene_info_df = load_dataframe(inpath=self.gene_info_file,
                                               header=0,
                                               index_col=None,
                                               logger=self.log)
            gene_trans_dict = construct_dict_from_df(self.gene_info_df,
                                                     self.ensg_id,
                                                     self.hgnc_id)

            if not check_file_exists(self.expr_outpath) or self.force:
                self.log.info("Parsing expression data.")
                self.expr_df, self.sign_expr_df = self.parse_expression_file(
                    self.expr_file,
                    signature_genes,
                    gene_trans_dict,
                    include_decon=self.decon_expr_file is None)

            if (not check_file_exists(self.sign_expr_outpath) or
                    self.force) and (check_file_exists(self.decon_expr_file)):
                self.log.info("Parsing deconvolution expression data.")
                self.log.warning(
                    "Using a different expression file for deconvolution.")
                _, self.sign_expr_df = self.parse_expression_file(
                    self.decon_expr_file,
                    signature_genes,
                    gene_trans_dict,
                    include_expr=False,
                    remove_ens_version=True)

            self.log.info("Reorder, Filter, and save.")
            if self.expr_df is not None:
                self.expr_df = self.expr_df.loc[
                    self.eqtl_df.loc[:, "ProbeName"], self.sample_order]
                save_dataframe(df=self.expr_df,
                               outpath=self.expr_outpath,
                               index=True,
                               header=True,
                               logger=self.log)
            if self.sign_expr_df is not None:
                self.sign_expr_df = self.sign_expr_df.loc[:, self.sample_order]
                save_dataframe(df=self.sign_expr_df,
                               outpath=self.sign_expr_outpath,
                               index=True,
                               header=True,
                               logger=self.log)
        else:
            self.log.info("\tSkipping step.")
Example #9
    def perform_deconvolution(self):
        if self.sign_df is None:
            # Load the celltype profile file.
            self.log.info("Loading cell type profile matrix.")
            self.sign_df = load_dataframe(self.sign_file,
                                          header=0,
                                          index_col=0,
                                          logger=self.log)

        if self.sign_expr_df is None:
            # Load the celltype expression file.
            self.log.info("Loading cell type expression matrix.")
            self.sign_expr_df = load_dataframe(self.sign_expr_file,
                                               header=0,
                                               index_col=0,
                                               logger=self.log)

        # Filter uninformative genes from the signature matrix.
        sign_df = self.filter(self.sign_df, cutoff=self.min_expr_cutoff)

        # Subset and reorder.
        sign_df, expr_df = self.subset(sign_df, self.sign_expr_df)

        # Transform.
        sign_df = self.perform_log2_transform(sign_df)

        # Shift the data to be positive.
        self.log.info("Shifting data to be positive if required")
        if sign_df.values.min() < 0:
            self.log.warning("\tSignature matrix is shifted.")
            sign_df = self.perform_shift(sign_df)

        if expr_df.values.min() < 0:
            self.log.warning("\tExpression matrix is shifted.")
            expr_df = self.perform_shift(expr_df)

        self.log.info("Signature shape: {}".format(sign_df.shape))
        self.log.info("Expression shape: {}".format(expr_df.shape))

        # Perform deconvolution per sample.
        self.log.info("Performing partial deconvolution.")
        decon_data = []
        residuals_data = []
        recon_accuracy_data = []
        for _, sample in expr_df.T.iterrows():
            # Model.
            proportions, rnorm = self.nnls(sign_df, sample)

            # Calculate reconstruction accuracy.
            recon_accuracy = self.calc_reconstruction_accuracy(
                y=sample, X=sign_df, betas=proportions)

            # Save.
            decon_data.append(proportions)
            residuals_data.append(rnorm)
            recon_accuracy_data.append(recon_accuracy)

        decon_df = pd.DataFrame(decon_data,
                                index=expr_df.columns,
                                columns=sign_df.columns)
        residuals_df = pd.Series(residuals_data, index=expr_df.columns)
        recon_accuracy = pd.Series(recon_accuracy_data, index=expr_df.columns)

        self.log.info("Estimated weights:")
        self.log.info(decon_df.mean(axis=0))
        self.log.info(
            "Average reconstruction accuracy: {:.2f} [SD: {:.2f}]".format(
                recon_accuracy.mean(), recon_accuracy.std()))

        save_dataframe(df=decon_df,
                       outpath=os.path.join(self.outdir, "NNLS_betas.txt.gz"),
                       index=True,
                       header=True,
                       logger=self.log)

        # Make the weights sum up to 1.
        decon_df = self.sum_to_one(decon_df)
        self.log.info("Estimated proportions:")
        self.log.info(decon_df.mean(axis=0))

        # Calculate the average residuals.
        self.log.info("Average residual: {:.2f}".format(residuals_df.mean()))

        save_dataframe(df=decon_df,
                       outpath=os.path.join(
                           self.outdir, "deconvolution_table_complete.txt.gz"),
                       index=True,
                       header=True,
                       logger=self.log)

        if self.cell_type_groups is not None:
            self.log.info("Summing cell types.")
            cell_type_group = np.array(
                [self.cell_type_groups[ct] if ct in self.cell_type_groups
                 else ct for ct in decon_df.columns],
                dtype=object)
            cell_types = list(set(cell_type_group))
            cell_types.sort()
            summed_decon_df = pd.DataFrame(np.nan,
                                           index=decon_df.index,
                                           columns=cell_types)
            for ct_group in cell_types:
                mask = cell_type_group == ct_group
                summed_decon_df.loc[:, ct_group] = (
                    decon_df.loc[:, mask].sum(axis=1))

            decon_df = summed_decon_df

        return decon_df
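
The self.nnls, self.calc_reconstruction_accuracy, and self.sum_to_one helpers are not shown above. A sketch of plausible implementations, assuming self.nnls wraps scipy.optimize.nnls and that the reconstruction accuracy is an R^2-style score (both are assumptions, not the original code):

    import numpy as np
    from scipy.optimize import nnls


    def nnls_fit(sign_df, sample):
        # Non-negative least squares: find betas >= 0 minimizing
        # ||sign_df @ betas - sample||_2; returns (betas, residual norm).
        return nnls(sign_df.to_numpy(), sample.to_numpy())


    def calc_reconstruction_accuracy(y, X, betas):
        # Assumed R^2-style definition: 1 - RSS / TSS.
        y = y.to_numpy()
        rss = np.sum((y - X.to_numpy() @ betas) ** 2)
        tss = np.sum((y - y.mean()) ** 2)
        return 1 - rss / tss


    def sum_to_one(decon_df):
        # Rescale each sample's NNLS weights into proportions summing to 1.
        return decon_df.divide(decon_df.sum(axis=1), axis=0)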
Example #10
    def save(self):
        save_dataframe(df=self.decon_df,
                       outpath=self.outpath,
                       index=True,
                       header=True,
                       logger=self.log)
Example #11
    def start(self):
        print("Starting interaction analyser - combine and plot.")
        self.print_arguments()

        # Start the timer.
        start_time = time.time()

        print("")
        print("### Step 1 ###")
        print("Combine pickle files into dataframe.", flush=True)
        dataframes = {}
        for filename in [
                self.pvalues_filename, self.coef_filename,
                self.std_err_filename
        ]:
            outpath = os.path.join(self.work_dir,
                                   "{}_table.txt.gz".format(filename))
            if not check_file_exists(outpath) or self.force:
                print("Loading {} data.".format(filename), flush=True)
                columns, data = self.combine_pickles(self.work_dir,
                                                     filename,
                                                     columns=True)

                if len(data) == 0:
                    print("\tNo {} data found.".format(filename))
                    continue

                print("Creating {} dataframe.".format(filename), flush=True)
                df = self.create_df(data, columns)

                print("Saving {} dataframe.".format(filename), flush=True)
                save_dataframe(df=df, outpath=outpath, header=True, index=True)

                dataframes[filename] = df

                del columns, data, df
            else:
                print("Skipping step for {}".format(outpath))
                dataframes[filename] = load_dataframe(outpath,
                                                      header=0,
                                                      index_col=0)

        print("")
        print("### Step 2 ###")
        print("Calculate t-values", flush=True)
        outpath = os.path.join(self.work_dir,
                               "{}_table.txt.gz".format(self.tvalue_filename))
        if not check_file_exists(outpath) or self.force:
            if (self.coef_filename in dataframes
                    and self.std_err_filename in dataframes):
                # Calculate t-values
                coef_df = dataframes[self.coef_filename]
                std_err_df = dataframes[self.std_err_filename]

                if not coef_df.columns.identical(std_err_df.columns):
                    overlap = set(coef_df.columns).intersection(
                        set(std_err_df.columns))
                    if len(overlap) == 0:
                        print("No overlapping eQTLs between coef and std_err "
                              "data frame columns.")
                    else:
                        # Sets are not valid .loc indexers; use a list.
                        overlap = list(overlap)
                        coef_df = coef_df.loc[:, overlap]
                        std_err_df = std_err_df.loc[:, overlap]
                if not coef_df.index.identical(std_err_df.index):
                    overlap = set(coef_df.index).intersection(
                        set(std_err_df.index))
                    if len(overlap) == 0:
                        print("No overlapping eQTLs between coef and std_err "
                              "data frame indices.")
                    else:
                        overlap = list(overlap)
                        coef_df = coef_df.loc[overlap, :]
                        std_err_df = std_err_df.loc[overlap, :]

                if (coef_df.columns.identical(std_err_df.columns)
                        and coef_df.index.identical(std_err_df.index)):
                    tvalue_df = coef_df / std_err_df

                    print("Saving {} dataframe.".format(self.tvalue_filename),
                          flush=True)
                    save_dataframe(df=tvalue_df,
                                   outpath=os.path.join(
                                       self.work_dir, "{}_table.txt.gz".format(
                                           self.tvalue_filename)),
                                   header=True,
                                   index=True)
            else:
                print("\tNo data found.")
        else:
            print("Skipping step.")

        print("")
        print("### Step 3 ###")
        print("Starting other calculations", flush=True)

        if self.pvalues_filename not in dataframes:
            print("\tNo pvalues data found.")
            return

        pvalue_df = dataframes[self.pvalues_filename]
        pvalue_df_columns = [
            "{}_{}".format(x, i) for i, x in enumerate(pvalue_df.columns)
        ]
        pvalue_df.columns = pvalue_df_columns
        pvalue_df_indices = [
            "{}_{}".format(x, i) for i, x in enumerate(pvalue_df.index)
        ]
        pvalue_df.index = pvalue_df_indices
        pvalue_df.reset_index(drop=False, inplace=True)

        print("Melting dataframe.", flush=True)
        dfm = pvalue_df.melt(id_vars=["index"])
        dfm.columns = ["covariate", "SNP", "pvalue"]
        dfm["rank"] = dfm.loc[:, "pvalue"].rank(ascending=True)
        n_signif = dfm[dfm["pvalue"] <= self.alpha].shape[0]
        n_total = dfm.shape[0]
        print("\t{}/{} [{:.2f}%] of pvalues < {}".format(
            n_signif, n_total, (100 / n_total) * n_signif, self.alpha),
              flush=True)

        print("Adding z-scores.", flush=True)
        dfm["zscore"] = stats.norm.isf(dfm["pvalue"])
        dfm.loc[dfm["pvalue"] > (1.0 - 1e-16), "zscore"] = -8.209536151601387
        dfm.loc[dfm["pvalue"] < 1e-323, "zscore"] = 38.44939448087599
        self.pivot_and_save(dfm, "zscore", pvalue_df_indices,
                            pvalue_df_columns)

        print("Adding BH-FDR.", flush=True)
        dfm["BH-FDR"] = dfm["pvalue"] * (n_total / (dfm["rank"] + 1))
        dfm.loc[dfm["BH-FDR"] > 1, "BH-FDR"] = 1
        # Enforce monotonicity of the adjusted p-values in rank order.
        prev_bh_fdr = -np.inf
        for i in dfm["rank"].sort_values().index:
            bh_fdr = dfm.loc[i, "BH-FDR"]
            if bh_fdr > prev_bh_fdr:
                prev_bh_fdr = bh_fdr
            else:
                dfm.loc[i, "BH-FDR"] = prev_bh_fdr
        n_signif = dfm[dfm["BH-FDR"] <= self.alpha].shape[0]
        print("\t{}/{} [{:.2f}%] of BH-FDR values < {}".format(
            n_signif, n_total, (100 / n_total) * n_signif, self.alpha),
              flush=True)
        self.pivot_and_save(dfm, "BH-FDR", pvalue_df_indices,
                            pvalue_df_columns)

        print("Adding permutation FDR.", flush=True)
        print("\tLoading permutation pvalue data.", flush=True)
        _, perm_pvalues = self.combine_pickles(self.work_dir,
                                               self.perm_pvalues_filename)
        # perm_pvalues = [random.random() for _ in range(n_total * 10)]
        print("Sorting p-values.", flush=True)
        perm_pvalues = sorted(perm_pvalues)

        if len(perm_pvalues) > 0:
            n_perm = len(perm_pvalues) / n_total
            if n_perm != self.n_perm:
                print("\tWARNING: not all permutation pvalus are present")
            perm_ranks = []
            for pvalue in dfm["pvalue"]:
                perm_ranks.append(bisect_left(perm_pvalues, pvalue))
            dfm["perm-rank"] = perm_ranks
            dfm["perm-FDR"] = (dfm["perm-rank"] / n_perm) / dfm["rank"]
            dfm.loc[(dfm.index == 0) | (dfm["perm-rank"] == 0), "perm-FDR"] = 0
            dfm.loc[dfm["perm-FDR"] > 1, "perm-FDR"] = 1

            self.pivot_and_save(dfm, "perm-FDR", pvalue_df_indices,
                                pvalue_df_columns)

        print("Saving full dataframe.", flush=True)
        save_dataframe(df=dfm,
                       outpath=os.path.join(self.work_dir,
                                            "molten_table.txt.gz"),
                       header=True,
                       index=True)
        print("")

        # Print the time.
        run_time_min, run_time_sec = divmod(time.time() - start_time, 60)
        run_time_hour, run_time_min = divmod(run_time_min, 60)
        print("finished in  {} hour(s), {} minute(s) and "
              "{} second(s).".format(int(run_time_hour), int(run_time_min),
                                     int(run_time_sec)),
              flush=True)
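
The manual BH-FDR pass above can be sanity-checked against a library implementation. A small sketch using statsmodels (not part of the original pipeline) on an arbitrary p-value vector:

    import numpy as np
    from statsmodels.stats.multitest import multipletests

    # Any 1-D array of p-values, e.g. dfm["pvalue"].to_numpy().
    pvalues = np.array([0.001, 0.02, 0.04, 0.5, 0.8])
    reject, bh_fdr, _, _ = multipletests(pvalues, alpha=0.05, method="fdr_bh")
    print(bh_fdr)  # Benjamini-Hochberg adjusted p-values.
    print(reject)  # True where the adjusted p-value <= alpha.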
Example #12
    def create_tech_covs_file(self):
        # Load the technical covariates.
        self.log.info("Loading technical covariates matrix.")
        tcov_df = load_dataframe(inpath=self.cov_file,
                                 header=0,
                                 index_col=0,
                                 logger=self.log)

        # Filter on samples and technical covariates.
        self.log.info("Filtering on samples and technical covariates.")
        tcov_df.index = [self.sample_dict[x] if x in self.sample_dict else x
                         for x in tcov_df.index]
        tcov_df = tcov_df.loc[self.sample_order, :].copy()
        save_dataframe(df=tcov_df.T,
                       outpath=os.path.join(
                           self.outdir, "technical_covariates_table.txt.gz"),
                       index=True,
                       header=True,
                       logger=self.log)
        if self.technical_covariates:
            save_dataframe(df=tcov_df.loc[:, self.technical_covariates].T,
                           outpath=os.path.join(
                               self.outdir,
                               "technical_covariates_table_subset.txt.gz"),
                           index=True,
                           header=True,
                           logger=self.log)

        # Load the MDS components.
        self.log.info("Loading MDS matrix.")
        mds_df = load_dataframe(inpath=self.mds_file,
                                header=0,
                                index_col=0,
                                logger=self.log)

        # Filter on samples and MDS components.
        self.log.info("Filtering on samples and MDS components.")
        mds_df.index = [self.sample_dict[x] if x in self.sample_dict else x
                        for x in mds_df.index]
        mds_df = mds_df.loc[self.sample_order, :].copy()

        save_dataframe(df=mds_df.T,
                       outpath=os.path.join(self.outdir,
                                            "mds_covariates_table.txt.gz"),
                       index=True,
                       header=True,
                       logger=self.log)

        tmp_combined_df = tcov_df.merge(mds_df,
                                        left_index=True,
                                        right_index=True)
        save_dataframe(df=tmp_combined_df.T,
                       outpath=os.path.join(
                           self.outdir,
                           "technical_and_mds_covariates_table.txt.gz"),
                       index=True,
                       header=True,
                       logger=self.log)

        # Load the dataset matrix.
        self.log.info("Loading dataset matrix.")
        if self.dataset_df is None:
            self.dataset_df = load_dataframe(self.dataset_file,
                                             header=0,
                                             index_col=0,
                                             logger=self.log)

        # Merge.
        self.log.info("Merging matrices.")
        correction_df = reduce(
            lambda left, right: pd.merge(left, right,
                                         left_index=True, right_index=True),
            [tcov_df, mds_df, self.dataset_df])
        correction_df = correction_df.T
        correction_df.index.name = "-"
        self.log.info("\t Correction matrix shape: {}".format(correction_df.shape))

        # Validate sample order.
        if not correction_df.columns.equals(self.sample_order):
            correction_df = correction_df[self.sample_order]

        return correction_df
Example #13
    def save(self):
        save_dataframe(df=self.gte,
                       outpath=self.outpath,
                       index=False,
                       header=False,
                       logger=self.log)