def save(self):
     """
     Write both cell type tables through ``save_dataframe``.

     ``self.celltype_pcs`` is sent to ``self.pca_outpath`` and
     ``self.celltype_cs`` to ``self.nmf_outpath``; both calls pass
     ``index=True`` and ``header=True``.
     """
     shared_options = {"index": True, "header": True}
     save_dataframe(df=self.celltype_pcs,
                    outpath=self.pca_outpath,
                    **shared_options)
     save_dataframe(df=self.celltype_cs,
                    outpath=self.nmf_outpath,
                    **shared_options)
Esempio n. 2
0
    def create_marker_df(self, inter_df, eqtl_df, outpath):
        """
        Select the cell type mediated eQTLs and write them to disk.

        The interaction matrix is transposed and reduced to the columns
        named ``<prefix>_<celltype>`` whose ``<celltype>`` is listed in
        ``self.celltypes``. Per row, the absolute values are summed per
        cell type and turned into a gini impurity. A row is kept only when
        BOTH its largest absolute value reaches the z-score cutoff AND its
        gini impurity is at most 0.75.

        :param inter_df: DataFrame, the interaction matrix (presumably
                         z-scores, going by the cutoff that is applied).
                         After transposing, its rows must line up
                         positionally with the rows of ``eqtl_df``.
        :param eqtl_df: DataFrame, must contain the columns "SNPName",
                        "ProbeName" and "HGNCName".
        :param outpath: string, path of the merged marker table. One
                        ``<celltype>.txt`` gene list per cell type is
                        written into ``get_dirname(outpath)``.
        :return: DataFrame, the three eQTL columns of the rows that passed
                 both filters, re-indexed from 0.
        """
        inter_df = inter_df.T
        eqtl_df = eqtl_df[["SNPName", "ProbeName", "HGNCName"]]

        # Calculate the z-score cutoff: 0.05 divided by the number of cells
        # in the complete (not yet subset) matrix, mapped through the normal
        # quantile function.
        z_score_cutoff = stats.norm.ppf(
            0.05 / (inter_df.shape[0] * inter_df.shape[1]))
        gini_cutoff = 0.75

        # Subset on the marker genes.
        marker_cols = [colname for colname in inter_df.columns
                       if ("_" in colname)
                       and (colname.split("_")[1] in self.celltypes)]

        # Explicit copy: columns are added below and that must not happen on
        # a slice of the caller's frame.
        marker_df = inter_df.loc[:, marker_cols].copy()
        del inter_df

        # Create a gini dataframe grouped by celltype.
        gini_df = marker_df.abs()
        zscore_mask = (gini_df.max(axis=1) >= abs(z_score_cutoff)).values
        gini_df.columns = [x.split("_")[1] for x in gini_df.columns]
        gini_df = gini_df.T.groupby(gini_df.columns).sum().T

        # Calculate the gini impurity.
        gini_values = gini_df.div(gini_df.sum(axis=1), axis=0).pow(2)
        marker_df["gini_impurity"] = 1 - gini_values.sum(axis=1)
        marker_df["eqtl_celltype"] = gini_values.idxmax(axis=1)
        del gini_df

        # Combine both filters element-wise. `list_a and list_b` evaluates to
        # `list_b` for any non-empty `list_a`, which silently discarded the
        # z-score filter.
        gini_mask = (marker_df["gini_impurity"] <= gini_cutoff).values
        keep_mask = zscore_mask & gini_mask

        # Subset the marker df.
        marker_df = marker_df.loc[keep_mask, :].copy()
        marker_df.index.name = "-"
        marker_df = marker_df.reset_index()

        # Subset the eQTL dataframe with the very same mask so both frames
        # stay aligned row by row.
        eqtl_df = eqtl_df.loc[keep_mask, :].reset_index(drop=True)

        # Merge them together.
        merged_df = pd.concat([marker_df, eqtl_df], axis=1)
        merged_df = merged_df.sort_values(
            by=['eqtl_celltype', 'gini_impurity'])

        # Save the dataframe.
        save_dataframe(df=merged_df, outpath=outpath, header=True, index=False)

        # Save celltype eqtl's HGNC names.
        print("Writing celltype mediated eQTL files.")
        for celltype in marker_df['eqtl_celltype'].unique():
            subset = merged_df.loc[merged_df['eqtl_celltype'] == celltype, :]
            print("\tCelltype: {:20s} {} genes".format(celltype,
                                                       len(subset.index)))
            if len(subset.index) > 0:
                genes = ', '.join(subset['HGNCName'].to_list())
                genes_outpath = os.path.join(get_dirname(outpath),
                                             '{}.txt'.format(celltype))
                # The context manager closes the file even if writing fails.
                with open(genes_outpath, "w") as outfile:
                    outfile.write(genes)

        return eqtl_df
Esempio n. 3
0
    def combine_groups(self, inter_outpath):
        """
        Collect the SNP and sample indices of every group and build (or
        reload) the combined interaction matrix.

        :param inter_outpath: string, path of the combined interaction
                              matrix. It is rebuilt from the per-group files
                              when it does not exist or ``self.force`` is
                              set; otherwise it is loaded from this path.
        :return: tuple ``(snp_mask, sample_mask, inter_df)``: the sorted
                 unique SNP indices, the sorted unique sample indices and
                 the combined interaction DataFrame.
        :raises SystemExit: with status 1 when a group output directory
                            holds no file whose basename matches
                            ``self.inter_regex``.
        """
        print("Combining groups.")

        # Decide once whether the matrix must be rebuilt, so the per-group
        # loading below and the final build-or-load branch cannot disagree.
        rebuild = not check_file_exists(inter_outpath) or self.force

        snp_mask = np.array([], dtype=np.int16)
        sample_mask = np.array([], dtype=np.int16)
        inter_df = None
        n_groups = len(self.group_ids)
        for i, group_id in enumerate(self.group_ids):
            print("  Working on: {:10s} [{}/{} "
                  "{:.2f}%]".format(group_id, i + 1, n_groups,
                                    (100 / n_groups) * (i + 1)))

            # Define the directory names.
            data_indir = os.path.join(self.g_data_indir, group_id)
            inter_indir = os.path.join(self.g_inter_indir, group_id, 'output')

            # Load the group object.
            # NOTE: unpickling executes code stored in the file; only point
            # g_data_indir at output produced by this pipeline.
            with open(os.path.join(data_indir, self.obj_filename), "rb") as f:
                group_object = pickle.load(f)

            # Save the indices.
            snp_mask = np.append(snp_mask, group_object.get_snp_indices())
            sample_mask = np.append(sample_mask,
                                    group_object.get_sample_indices())

            if not rebuild:
                continue

            # Search for the interaction filename; the first match wins.
            inter_inpath = next(
                (path for path in glob.glob(os.path.join(inter_indir, "*"))
                 if re.match(self.inter_regex, get_basename(path))),
                None)
            if inter_inpath is None:
                print("Interaction matrix not found.")
                # Bare exit() is a `site` convenience and reports success
                # (status 0); signal the failure to the calling process.
                raise SystemExit(1)

            # Load the interaction file.
            group_inter_df = load_dataframe(inpath=inter_inpath,
                                            header=0,
                                            index_col=0)

            # Merge them.
            if inter_df is None:
                inter_df = group_inter_df
            else:
                inter_df = inter_df.merge(group_inter_df,
                                          left_index=True,
                                          right_index=True)

        print("Preparing interaction matrix.")
        if rebuild:
            # Sort the matrix according to the indices. snp_mask holds one
            # entry per merged column, in merge order.
            inter_df = inter_df.T
            inter_df["index"] = snp_mask
            inter_df.sort_values(by=['index'], inplace=True)
            inter_df.drop(["index"], axis=1, inplace=True)
            inter_df = inter_df.T

            save_dataframe(df=inter_df,
                           outpath=inter_outpath,
                           index=True,
                           header=True)
        else:
            inter_df = load_dataframe(inpath=inter_outpath,
                                      header=0,
                                      index_col=0)

        # Prepare the masks.
        snp_mask = sorted(set(snp_mask))
        sample_mask = sorted(set(sample_mask))

        return snp_mask, sample_mask, inter_df
Esempio n. 4
0
    def start(self):
        """
        Run the complete combine pipeline.

        Merges the indices of all groups (building or reloading the
        interaction matrix along the way) and then writes the marker, eQTL,
        genotype, alleles, expression and covariate tables, each subset with
        those indices. A table whose output file already exists is skipped
        unless ``self.force`` is set.
        """
        print("Starting combining groups.")
        self.print_arguments()

        # Gather the indices of every group; the interaction matrix is
        # combined as well when that is required.
        inter_outpath = os.path.join(self.outdir, self.inter_filename)
        snp_mask, sample_mask, inter_df = self.combine_groups(inter_outpath)

        print("\nSubsetting data with masks:")
        mask_reports = (
            ("\tSNP mask:\tlength: {}\tlowest index: {}"
             "\thighest index: {}", snp_mask),
            ("\tSample mask:\tlength: {}\tlowest index: {}"
             "\thighest index: {}", sample_mask),
        )
        for template, mask in mask_reports:
            print(template.format(len(mask), min(mask), max(mask)))
        print("")

        # The eQTL file is only needed when the marker table or the eQTL
        # table still has to be produced.
        markers_outpath = os.path.join(self.outdir, self.markers_filename)
        eqtl_outpath = os.path.join(self.outdir, self.eqtl_filename)
        outputs_present = (check_file_exists(eqtl_outpath)
                           and check_file_exists(markers_outpath))
        if not outputs_present or self.force:
            print("Loading eQTL file.")
            full_eqtl_df = load_dataframe(inpath=self.eqtl_inpath,
                                          header=0,
                                          index_col=None)
            eqtl_df = full_eqtl_df.iloc[snp_mask, :]
            del full_eqtl_df

            print("Preparing marker matrix.")
            if check_file_exists(markers_outpath) and not self.force:
                print("\tSkipping step.")
            else:
                self.create_marker_df(inter_df, eqtl_df, markers_outpath)

            print("Preparing eQTL matrix.")
            if check_file_exists(eqtl_outpath) and not self.force:
                print("\tSkipping step.")
            else:
                save_dataframe(outpath=eqtl_outpath,
                               df=eqtl_df,
                               index=False,
                               header=True)
            del eqtl_df

        del inter_df

        def prepare_table(message, filename, inpath, subset):
            # Shared recipe of the remaining tables: announce, then either
            # skip or load -> subset -> save under self.outdir.
            print(message)
            outpath = os.path.join(self.outdir, filename)
            if check_file_exists(outpath) and not self.force:
                print("\tSkipping step.")
                return
            table = subset(load_dataframe(inpath=inpath,
                                          header=0,
                                          index_col=0))
            save_dataframe(outpath=outpath,
                           df=table,
                           index=True,
                           header=True)

        prepare_table("\nPreparing genotype matrix.",
                      self.geno_filename,
                      os.path.join(self.data_indir, self.geno_filename),
                      lambda df: df.iloc[snp_mask, sample_mask])

        prepare_table("\nPreparing alleles matrix.",
                      self.alleles_filename,
                      os.path.join(self.data_indir, self.alleles_filename),
                      lambda df: df.iloc[snp_mask, :])

        prepare_table("\nPreparing expression matrix.",
                      self.expr_filename,
                      os.path.join(self.data_indir, self.expr_filename),
                      lambda df: df.iloc[snp_mask, sample_mask])

        prepare_table("\nPreparing covariate matrix.",
                      self.cov_filename,
                      self.cov_inpath,
                      lambda df: df.iloc[:, sample_mask].copy())
Esempio n. 5
0
 def save(self):
     """
     Pass ``self.deconvolution`` to ``save_dataframe`` with
     ``self.outpath`` as target, ``index=True`` and ``header=True``.
     """
     write_options = dict(index=True, header=True)
     save_dataframe(df=self.deconvolution,
                    outpath=self.outpath,
                    **write_options)
Esempio n. 6
0
    def start(self):
        """
        Write the pickled group object and the data tables of every group.

        For each entry of ``self.groups`` a directory is prepared under
        ``self.outdir``. The group object is pickled to ``group.pkl`` and
        the eQTL, genotype, alleles, expression and covariate frames are
        subset with the group's SNP / sample indices and saved. An output
        that already exists is only rewritten when ``self.force`` is set.
        """
        def needs_writing(path):
            # An output is (re)written when it is missing or when forced.
            return not check_file_exists(path) or self.force

        every = slice(None)  # same selection as a bare ":" inside iloc

        print("Creating groups.")
        n_groups = len(self.groups)
        for i, (group_id, group_obj) in enumerate(self.groups.items()):
            position = i + 1
            print("  Working on: {:10s} [{}/{} "
                  "{:.2f}%]".format(group_id, position, n_groups,
                                    (100 / n_groups) * position))

            # Create the group dir.
            group_dir = os.path.join(self.outdir, group_id)
            prepare_output_dir(group_dir)

            # Store the group object itself.
            pickle_path = os.path.join(group_dir, "group.pkl")
            if needs_writing(pickle_path):
                with open(pickle_path, "wb") as handle:
                    pickle.dump(group_obj, handle)
                print("\tSaved group object: "
                      "{}".format(get_basename(pickle_path)))

            # Get the group indices.
            snp_mask = group_obj.get_snp_indices()
            sample_mask = group_obj.get_sample_indices()

            # (output filename, source frame, rows, columns, write index)
            tables = (
                ("eqtl_table.txt.gz",
                 self.eqtl_df, snp_mask, every, False),
                ("genotype_table.txt.gz",
                 self.geno_df, snp_mask, sample_mask, True),
                ("genotype_alleles.txt.gz",
                 self.alleles_df, snp_mask, every, True),
                ("expression_table.txt.gz",
                 self.expr_df, snp_mask, sample_mask, True),
                ("covariates_table.txt.gz",
                 self.cov_df, every, sample_mask, True),
            )
            for filename, source, rows, cols, with_index in tables:
                outpath = os.path.join(group_dir, filename)
                if not needs_writing(outpath):
                    continue
                group_table = source.iloc[rows, cols].copy()
                save_dataframe(outpath=outpath, df=group_table,
                               index=with_index, header=True)
                del group_table
Esempio n. 7
0
    def work(self, workdir):
        """
        Combine the pickled results found via ``workdir`` into tables,
        derive the FDR tables and create the diagnostic plots.

        Written into ``workdir`` (in this order): ``pvalue_table.txt.gz``,
        ``snp_tvalue_table.txt.gz``, ``inter_tvalue_table.txt.gz``,
        ``interaction_table.txt.gz``, ``perm_fdr_table.txt.gz`` and
        ``bh_fdr_table.txt.gz``. Plotting is delegated to
        ``self.plot_distributions`` and ``self.compare_pvalue_scores``.

        :param workdir: string, directory handed to ``self.combine_pickles``
                        and used as the output directory of every table.
        """

        # NOTE(review): earlier revisions carried a commented-out debugging
        # shortcut here that reloaded pvalue_table.txt.gz,
        # perm_fdr_table.txt.gz and bh_fdr_table.txt.gz from workdir with
        # pd.read_csv(sep="\t", header=0, index_col=0) instead of
        # recomputing them.

        # Combine the pickle files.
        print("Loading pvalue data.", flush=True)
        pcolumns, pvalues_data = self.combine_pickles(workdir,
                                                      self.pvalues_outfile,
                                                      columns=True)

        # Create a pandas dataframe from the nested list.
        print("Creating p-values dataframe.", flush=True)
        pvalue_df = self.create_df(pvalues_data, pcolumns)
        save_dataframe(df=pvalue_df,
                       outpath=os.path.join(workdir, "pvalue_table.txt.gz"),
                       header=True,
                       index=True)

        # Get the pvalues from the dataframe: melt() stacks all columns, so
        # this is one flat array holding every value of the table.
        pvalues = pvalue_df.melt()["value"].values

        print("Loading permutation pvalue data.", flush=True)
        # No columns are requested here, so the first return value is unused.
        _, perm_pvalues = self.combine_pickles(workdir,
                                               self.perm_pvalues_outfile)

        # Visualise distributions.
        print("Visualizing distributions.", flush=True)
        self.plot_distributions(perm_pvalues, pvalues, workdir)

        print("Loading SNP tvalue data.", flush=True)
        snp_tcolumns, snp_tvalues_data = self.combine_pickles(
            workdir, self.snp_tvalues_outfile, columns=True)

        # Create a pandas dataframe from the nested list.
        print("Creating SNP t-values dataframe.", flush=True)
        snp_tvalue_df = self.create_df(snp_tvalues_data, snp_tcolumns)
        save_dataframe(df=snp_tvalue_df,
                       outpath=os.path.join(workdir,
                                            "snp_tvalue_table.txt.gz"),
                       header=True,
                       index=True)

        print("Loading inter tvalue data.", flush=True)
        inter_tcolumns, inter_tvalues_data = self.combine_pickles(
            workdir, self.inter_tvalues_outfile, columns=True)

        # Create a pandas dataframe from the nested list.
        print("Creating inter t-values dataframe.", flush=True)
        inter_tvalue_df = self.create_df(inter_tvalues_data, inter_tcolumns)
        save_dataframe(df=inter_tvalue_df,
                       outpath=os.path.join(workdir,
                                            "inter_tvalue_table.txt.gz"),
                       header=True,
                       index=True)

        # Create a dataframe with z-scores (derived from the p-value table).
        print("Creating Z-score dataframe.", flush=True)
        zscore_df = self.create_zscore_df(pvalue_df)
        save_dataframe(df=zscore_df,
                       outpath=os.path.join(workdir,
                                            "interaction_table.txt.gz"),
                       header=True,
                       index=True)

        # Sort the lists. From here on both names refer to sorted Python
        # lists; the plot above received the unsorted values.
        print("Sorting p-values.", flush=True)
        perm_pvalues = sorted(perm_pvalues)
        pvalues = sorted(pvalues)

        # Create the FDR dataframes.
        print("Creating permutation FDR dataframe.", flush=True)
        perm_fdr_df, perm_cutoff = self.create_perm_fdr_df(
            pvalue_df, pvalues, perm_pvalues, self.n_permutations)
        perm_n_signif = self.count_n_significant(pvalues, perm_cutoff)
        print("\tPermutation FDR: {} p-values < signif. cutoff "
              "{:.2e} [{:.2f}%]".format(perm_n_signif, perm_cutoff,
                                        (100 / len(pvalues)) * perm_n_signif))
        # Write the output file.
        save_dataframe(df=perm_fdr_df,
                       outpath=os.path.join(workdir, "perm_fdr_table.txt.gz"),
                       header=True,
                       index=True)

        print("Creating Benjamini-Hochberg FDR dataframe.", flush=True)
        bh_fdr_df, bh_cutoff = self.create_bh_fdr_df(pvalue_df, pvalues)
        bh_n_signif = self.count_n_significant(pvalues, bh_cutoff)
        print("\tBH FDR: {} p-values < signif. cutoff "
              "{:.2e} [{:.2f}%]".format(bh_n_signif, bh_cutoff,
                                        (100 / len(pvalues)) * bh_n_signif))
        save_dataframe(df=bh_fdr_df,
                       outpath=os.path.join(workdir, "bh_fdr_table.txt.gz"),
                       header=True,
                       index=True)

        # Compare the two pvalue scores.
        print("Creating score visualisation [1/2].", flush=True)
        self.compare_pvalue_scores(pvalue_df, perm_fdr_df, bh_fdr_df, workdir)
Esempio n. 8
0
 def save(self):
     """
     Pass ``self.eqtl_probes`` to ``save_dataframe`` with ``self.outpath``
     as target, ``index=False`` and ``header=True``.
     """
     probes = self.eqtl_probes
     destination = self.outpath
     save_dataframe(df=probes, outpath=destination, index=False, header=True)
Esempio n. 9
0
    def start(self):
        """
        Replace the row / column labels of all tables by generated labels
        and save both the translation tables and the masked tables.

        Labels are ``eqtl_<i>``, ``sample_<i>`` and ``cov_<i>``, sized after
        the shape of ``self.geno_df`` and the row count of ``self.cov_df``.
        The translation tables (unmasked -> masked) are written before any
        frame is relabelled. A frame is relabelled in place, and saved,
        only when its output file is missing or ``self.force`` is set.
        """
        print("Starting creating masked files.")
        self.print_arguments()

        # Get the sizes.
        n_eqtls, n_samples = self.geno_df.shape
        n_covs = self.cov_df.shape[0]

        # Create masks.
        eqtl_mask = ["eqtl_{}".format(x) for x in range(n_eqtls)]
        sample_mask = ["sample_{}".format(x) for x in range(n_samples)]
        cov_mask = ["cov_{}".format(x) for x in range(n_covs)]

        # Create translate dicts.
        # (output filename, original labels, masked labels, skip message)
        print("Creating translation files.")
        translations = (
            ("eqtl_translate_table.txt.gz",
             self.geno_df.index, eqtl_mask,
             "\tSkipping eQTLs translate table."),
            ("sample_translate_table.txt.gz",
             self.geno_df.columns, sample_mask,
             "\tSkipping sample translate table."),
            ("cov_translate_table.txt.gz",
             self.cov_df.index, cov_mask,
             "\tSkipping covariates translate table."),
        )
        for filename, labels, masked, skip_message in translations:
            outpath = os.path.join(self.outdir, filename)
            if check_file_exists(outpath) and not self.force:
                print(skip_message)
                continue
            translate = pd.DataFrame({'unmasked': list(labels),
                                      'masked': masked})
            save_dataframe(outpath=outpath, df=translate,
                           index=False, header=True)
            del translate

        # Start masking the dataframes.
        # (output filename, frame, new index, new columns or None,
        #  skip message)
        print("Start masking files.")
        maskings = (
            ("eqtl_table.txt.gz",
             self.eqtl_df, eqtl_mask, None,
             "\tSkipping eQTL table."),
            ("genotype_table.txt.gz",
             self.geno_df, eqtl_mask, sample_mask,
             "\tSkipping genotype table."),
            ("genotype_alleles.txt.gz",
             self.alleles_df, eqtl_mask, None,
             "\tSkipping genotype alleles tables."),
            ("expression_table.txt.gz",
             self.expr_df, eqtl_mask, sample_mask,
             "\tSkipping expression table."),
            ("covariates_table.txt.gz",
             self.cov_df, cov_mask, sample_mask,
             "\tSkipping covariates table."),
        )
        for filename, table, row_labels, col_labels, skip_message in maskings:
            outpath = os.path.join(self.outdir, filename)
            if check_file_exists(outpath) and not self.force:
                print(skip_message)
                continue
            # Relabelling mutates the frame held by the instance.
            table.index = row_labels
            if col_labels is not None:
                table.columns = col_labels
            save_dataframe(outpath=outpath, df=table,
                           index=True, header=True)
Esempio n. 10
0
 def save(self):
     """
     Pass ``self.covariates`` to ``save_dataframe`` with ``self.outpath``
     as target, ``index=True`` and ``header=True``.
     """
     keep_labels = {"index": True, "header": True}
     save_dataframe(df=self.covariates, outpath=self.outpath, **keep_labels)
Esempio n. 11
0
 def save(self):
     """
     Pass ``self.gte`` to ``save_dataframe`` with ``self.outpath`` as
     target and both ``index`` and ``header`` switched off.
     """
     no_labels = {"index": False, "header": False}
     save_dataframe(df=self.gte, outpath=self.outpath, **no_labels)