Example #1
    def start(self):
        self.log.info("Filtering technical covariates datafile.")
        self.print_arguments()

        # Check if output file exists.
        if not check_file_exists(self.outpath) or self.force:
            # Load the sample info.
            self.log.info("Loading covariates matrix.")
            cov_df = load_dataframe(inpath=self.cov_file,
                                    header=0,
                                    index_col=0,
                                    logger=self.log)

            # Filter on samples and technical covariates.
            self.log.info("Filtering on samples and technical covariates.")
            cov_df.index = [self.sample_dict.get(x, x) for x in cov_df.index]
            tech_cov_df = cov_df.loc[self.sample_order, self.tech_covs].copy()
            del cov_df
            self.log.info("\tNew shape: {}".format(tech_cov_df.shape))

            # Remove technical covariates that are linearly dependent.
            self.log.info("Removing linearly dependent column(s).")
            self.tech_covs_df = self.filter_linear_dependent_covs(tech_cov_df)
            self.log.info("\tNew shape: {}".format(self.tech_covs_df.shape))

            self.save()
        else:
            self.log.info("Skipping step.")
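The filter_linear_dependent_covs helper is not shown above. A minimal
standalone sketch of one common approach, greedily keeping only the columns
that raise the matrix rank of the kept set (all names here are assumptions):

import numpy as np


def filter_linear_dependent_covs(df):
    """Sketch: drop covariate columns that are linear combinations of
    previously kept columns. Assumes a samples-by-covariates numeric frame."""
    kept = []
    for col in df.columns:
        candidate = df[kept + [col]].to_numpy(dtype=float)
        # Keep the column only if it increases the rank of the kept set.
        if np.linalg.matrix_rank(candidate) == len(kept) + 1:
            kept.append(col)
    return df[kept].copy()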
Example #2
    def start(self):
        self.log.info("Correcting expression data for dataset effects.")
        self.print_arguments()

        self.log.info("Correcting signature expression data.")
        if not check_file_exists(self.sign_expr_dc_outpath) or self.force:
            if self.dataset_df is None:
                self.dataset_df = load_dataframe(self.dataset_file,
                                                 header=0,
                                                 index_col=0,
                                                 logger=self.log)

            if self.sign_expr_df is None:
                self.sign_expr_df = load_dataframe(self.sign_expr_file,
                                                   header=0,
                                                   index_col=0,
                                                   logger=self.log)

            self.sign_expr_dc_df = self.dataset_correction(
                self.sign_expr_df, self.dataset_df)
            save_dataframe(df=self.sign_expr_dc_df,
                           outpath=self.sign_expr_dc_outpath,
                           index=True,
                           header=True,
                           logger=self.log)
        else:
            self.log.info("\tSkipping step.")
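dataset_correction is defined elsewhere; a plausible sketch is to regress
every gene on the 0/1 dataset indicator columns and keep the residuals
(genes-by-samples expression and samples-by-datasets indicators assumed):

import numpy as np
import pandas as pd


def dataset_correction(expr_df, dataset_df):
    """Sketch only: remove dataset effects via least-squares residuals."""
    # Align the indicator matrix to the expression columns.
    X = dataset_df.loc[expr_df.columns, :].to_numpy(dtype=float)
    Y = expr_df.to_numpy(dtype=float).T  # samples x genes
    # Fit every gene at once; the residuals are the corrected expression.
    beta, *_ = np.linalg.lstsq(X, Y, rcond=None)
    return pd.DataFrame((Y - X @ beta).T,
                        index=expr_df.index,
                        columns=expr_df.columns)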
Example #3
    def start(self):
        self.log.info("Creating cohort matrix.")
        self.print_arguments()

        # Check if output file exists.
        if not check_file_exists(self.outpath) or self.force:
            # Load the sample info.
            self.log.info("Loading sample information matrix.")
            self.sample_info_df = load_dataframe(inpath=self.inpath,
                                                 header=0,
                                                 index_col=None,
                                                 low_memory=False,
                                                 logger=self.log)

            # Construct sample-cohort dict.
            self.log.info("Creating sample to cohort dict.")
            sample_cohort_dict = construct_dict_from_df(
                self.sample_info_df, self.sample_id, self.cohort_id)

            # Create cohort dataframe.
            self.log.info("Constructing cohort matrix.")
            self.cohort_df = self.create_cohort_df(self.sample_dict,
                                                   self.sample_order,
                                                   sample_cohort_dict)
            self.save()
        else:
            self.log.info("Skipping step.")
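construct_dict_from_df returns a column-to-column mapping and create_cohort_df
presumably builds a 0/1 samples-by-cohorts matrix; hedged sketches of both
(the exact semantics of sample_dict are an assumption):

import pandas as pd


def construct_dict_from_df(df, key_column, value_column):
    # Sketch: map every key column entry to its value column entry.
    return dict(zip(df[key_column], df[value_column]))


def create_cohort_df(sample_dict, sample_order, sample_cohort_dict):
    """Sketch: one-hot cohort indicators, one row per sample in order."""
    cohorts = pd.Series(
        [sample_cohort_dict.get(sample_dict.get(s, s)) for s in sample_order],
        index=sample_order)
    return pd.get_dummies(cohorts).astype(int)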
Example #4
    def start(self):
        self.log.info("Starting deconvolution.")
        self.print_arguments()

        # Check if output file exists.
        if not check_file_exists(self.outpath) or self.force:
            self.decon_df = self.perform_deconvolution()
            self.save()
        else:
            self.log.info("Skipping step.")
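perform_deconvolution is not shown; partial deconvolution pipelines like this
one commonly solve a non-negative least-squares problem per sample against a
signature matrix. A hedged sketch, not necessarily the author's method:

import pandas as pd
from scipy.optimize import nnls


def perform_deconvolution(sign_df, sign_expr_df):
    """Sketch: NNLS cell fraction estimates per sample.

    sign_df: genes x cell types signature matrix.
    sign_expr_df: expression for the same genes, genes x samples.
    """
    genes = sign_df.index.intersection(sign_expr_df.index)
    A = sign_df.loc[genes, :].to_numpy(dtype=float)
    proportions = {}
    for sample in sign_expr_df.columns:
        b = sign_expr_df.loc[genes, sample].to_numpy(dtype=float)
        coef, _ = nnls(A, b)
        total = coef.sum()
        # Normalize to fractions; guard against an all-zero solution.
        proportions[sample] = coef / total if total > 0 else coef
    return pd.DataFrame(proportions, index=sign_df.columns).T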
Example #5
    def start(self):
        self.log.info("Creating covariate file.")
        self.print_arguments()

        # Check if output file exists.
        if not check_file_exists(self.outpath) or self.force:
            self.covs_df = self.create_covs_file()
            self.save()
        else:
            self.log.info("Skipping step.")
Example #6
    def start(self):
        self.log.info("Creating extra covariate file(s).")
        self.print_arguments()

        # Check if output file exists.
        if not check_file_exists(self.outpath) or self.force:
            self.df = self.prepare_matrix()
            self.save()
        else:
            self.log.info("Skipping {}.".format(self.inpath))
Example #7
    def start(self):
        self.log.info("Starting normal transform of the matrix.")
        self.print_arguments()

        # Check if output file exists.
        if not check_file_exists(self.outpath) or self.force:
            self.normalized_df = self.normal_transform()
            self.save()
        else:
            self.log.info("Skipping step.")
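normal_transform is not shown; a rank-based inverse normal transform applied
per row is the usual implementation and would look roughly like this:

import pandas as pd
from scipy import stats


def normal_transform(df):
    """Sketch: rank-based inverse normal transform of every row."""
    ranks = df.rank(axis=1, method="average")
    n = df.shape[1]
    # Map ranks to standard normal quantiles; the 0.5 offset keeps the
    # extremes finite.
    return pd.DataFrame(stats.norm.ppf((ranks - 0.5) / n),
                        index=df.index, columns=df.columns)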
Example #8
    def start(self):
        self.log.info("Combining eQTL probe files.")
        self.print_arguments()

        # Check if output file exists.
        if not check_file_exists(self.outpath) or self.force:
            # Load each GTE file.
            self.log.info("Loading eQTLprobes files.")
            self.eqtl_df = self.combine_files()
            self.save()
        else:
            self.log.info("Skipping step.")
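combine_files is not shown here; for per-dataset eQTL probe files a plain
concatenation suffices. A sketch (the list of input paths is an assumption),
and the GTE variant in Example #11 would follow the same pattern:

import pandas as pd


def combine_files(inpaths):
    # Sketch: stack the per-dataset eQTL probe tables into one frame.
    frames = [pd.read_csv(path, sep="\t", header=0) for path in inpaths]
    return pd.concat(frames, axis=0, ignore_index=True)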
Example #9
    def start(self):
        self.log.info("Creating dataset matrix.")
        self.print_arguments()

        # Check if output file exists.
        if not check_file_exists(self.outpath) or self.force:
            # Create dataset dataframe.
            self.log.info("Constructing dataset matrix.")
            self.dataset_df = self.create_dataset_df(self.dts_dict,
                                                     self.sample_order)
            self.save()
        else:
            self.log.info("Skipping step.")
Example #10
    def validate(self):
        # Check if input files exist.
        for filepath in [
                self.matrix_inpath, self.covariates_inpath,
                self.sample_dict_inpath
        ]:
            if filepath is not None and not check_file_exists(filepath):
                print("File {} does not exist".format(filepath))
                return False

        # Check if correct extension.
        if not self.matrix_inpath.endswith(".txt.gz"):
            print("Matrix input must be in .txt.gz format")
            return False

        return True
Example #11
    def start(self):
        self.log.info("Combining GTE files.")
        self.print_arguments()

        # Check if the GTE output file exists.
        if check_file_exists(self.outpath) and not self.force:
            self.log.info("Skipping step, loading result.")
            self.gte_df = load_dataframe(inpath=self.outpath,
                                         header=None,
                                         index_col=None,
                                         logger=self.log)
        else:
            # Load each GTE file.
            self.log.info("Loading GTE files.")
            self.gte_df = self.combine_files()
            self.save()

        # Construct sample translate dict.
        self.sample_dict = self.create_sample_dict()
        self.sample_order = list(self.gte_df.iloc[:, 1])
        self.dataset_to_samples_dict = self.set_dataset_to_samples_dict()
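Given that sample_order is taken from the second GTE column, the
create_sample_dict call above presumably maps genotype IDs to expression IDs;
a standalone sketch of what the method does with self.gte_df:

def create_sample_dict(gte_df):
    # Sketch: translate genotype IDs (column 0) to expression IDs (column 1).
    return dict(zip(gte_df.iloc[:, 0], gte_df.iloc[:, 1]))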
Example #12
    def start(self):
        self.log.info("Creating matrices.")
        self.print_arguments()

        if self.eqtl_df is None:
            self.eqtl_df = load_dataframe(self.eqtl_file,
                                          header=0,
                                          index_col=None,
                                          logger=self.log)

        self.log.info("Parsing genotype input data.")
        if not check_file_exists(self.geno_outpath) or not check_file_exists(
                self.alleles_outpath) or self.force:
            alleles_df, geno_df = self.parse_genotype_file()

            self.log.info("Reorder, filter, and save.")
            self.alleles_df = alleles_df.loc[self.eqtl_df.loc[:, "SNPName"], :]
            save_dataframe(df=self.alleles_df,
                           outpath=self.alleles_outpath,
                           index=True,
                           header=True,
                           logger=self.log)

            self.geno_df = geno_df.loc[self.eqtl_df.loc[:, "SNPName"],
                                       self.sample_order]
            save_dataframe(df=self.geno_df,
                           outpath=self.geno_outpath,
                           index=True,
                           header=True,
                           logger=self.log)
        else:
            self.log.info("\tSkipping step.")

        self.log.info("Parsing expression input data.")
        if not check_file_exists(self.expr_outpath) or not check_file_exists(
                self.sign_expr_outpath) or self.force:
            self.log.info("Loading signature matrix.")
            self.sign_df = load_dataframe(inpath=self.sign_file,
                                          header=0,
                                          index_col=0,
                                          logger=self.log)
            signature_genes = set(self.sign_df.index.to_list())

            self.log.info("Loading gene translation dict.")
            self.gene_info_df = load_dataframe(inpath=self.gene_info_file,
                                               header=0,
                                               index_col=None,
                                               logger=self.log)
            gene_trans_dict = construct_dict_from_df(self.gene_info_df,
                                                     self.ensg_id,
                                                     self.hgnc_id)

            if not check_file_exists(self.expr_outpath) or self.force:
                self.log.info("Parsing expression data.")
                self.expr_df, self.sign_expr_df = self.parse_expression_file(
                    self.expr_file,
                    signature_genes,
                    gene_trans_dict,
                    include_decon=self.decon_expr_file is None)

            if (not check_file_exists(self.sign_expr_outpath) or
                    self.force) and (check_file_exists(self.decon_expr_file)):
                self.log.info("Parsing deconvolution expression data.")
                self.log.warning(
                    "Using different expression file for deconvolution.")
                _, self.sign_expr_df = self.parse_expression_file(
                    self.decon_expr_file,
                    signature_genes,
                    gene_trans_dict,
                    include_expr=False,
                    remove_ens_version=True)

            self.log.info("Reorder, filter, and save.")
            if self.expr_df is not None:
                self.expr_df = self.expr_df.loc[
                    self.eqtl_df.loc[:, "ProbeName"], self.sample_order]
                save_dataframe(df=self.expr_df,
                               outpath=self.expr_outpath,
                               index=True,
                               header=True,
                               logger=self.log)
            if self.sign_expr_df is not None:
                self.sign_expr_df = self.sign_expr_df.loc[:, self.sample_order]
                save_dataframe(df=self.sign_expr_df,
                               outpath=self.sign_expr_outpath,
                               index=True,
                               header=True,
                               logger=self.log)
        else:
            self.log.info("\tSkipping step.")
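parse_expression_file is not shown; given the signature-gene filter and the
ENSG-to-HGNC dict built above, a chunked reader along these lines is plausible
(the keyword names mirror the call sites, everything else is assumed):

import pandas as pd


def parse_expression_file(inpath, signature_genes, gene_trans_dict,
                          include_expr=True, include_decon=True,
                          remove_ens_version=False):
    """Sketch: split one expression file into a full matrix and a
    signature-gene matrix with translated gene symbols."""
    expr_chunks, sign_chunks = [], []
    for chunk in pd.read_csv(inpath, sep="\t", header=0, index_col=0,
                             chunksize=1000):
        if remove_ens_version:
            chunk.index = [str(x).split(".")[0] for x in chunk.index]
        if include_expr:
            expr_chunks.append(chunk)
        if include_decon:
            # Keep signature genes and translate ENSG IDs to HGNC symbols.
            sign = chunk.loc[chunk.index.isin(signature_genes), :].copy()
            sign.index = [gene_trans_dict.get(x, x) for x in sign.index]
            sign_chunks.append(sign)
    expr_df = pd.concat(expr_chunks, axis=0) if expr_chunks else None
    sign_df = pd.concat(sign_chunks, axis=0) if sign_chunks else None
    return expr_df, sign_df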
Example #13
    def start(self):
        print("Starting interaction analyser - combine and plot.")
        self.print_arguments()

        # Start the timer.
        start_time = time.time()

        print("")
        print("### Step 1 ###")
        print("Combine pickle files into dataframe.", flush=True)
        dataframes = {}
        for filename in [
                self.pvalues_filename, self.coef_filename,
                self.std_err_filename
        ]:
            outpath = os.path.join(self.work_dir,
                                   "{}_table.txt.gz".format(filename))
            if not check_file_exists(outpath) or self.force:
                print("Loading {} data.".format(filename), flush=True)
                columns, data = self.combine_pickles(self.work_dir,
                                                     filename,
                                                     columns=True)

                if len(data) == 0:
                    print("\tNo {} data found.".format(filename))
                    continue

                print("Creating {} dataframe.".format(filename), flush=True)
                df = self.create_df(data, columns)

                print("Saving {} dataframe.".format(filename), flush=True)
                save_dataframe(df=df, outpath=outpath, header=True, index=True)

                dataframes[filename] = df

                del columns, data, df
            else:
                print("Skipping step for {}".format(outpath))
                dataframes[filename] = load_dataframe(outpath,
                                                      header=0,
                                                      index_col=0)

        print("")
        print("### Step 2 ###")
        print("Calculate t-values", flush=True)
        outpath = os.path.join(self.work_dir,
                               "{}_table.txt.gz".format(self.tvalue_filename))
        if not check_file_exists(outpath) or self.force:
            if (self.coef_filename in dataframes and
                    self.std_err_filename in dataframes):
                # Calculate t-values
                coef_df = dataframes[self.coef_filename]
                std_err_df = dataframes[self.std_err_filename]

                if not coef_df.columns.identical(std_err_df.columns):
                    overlap = [x for x in coef_df.columns
                               if x in set(std_err_df.columns)]
                    if len(overlap) == 0:
                        print("No overlapping eQTLs between coef and std_err "
                              "data frame columns.")
                    else:
                        coef_df = coef_df.loc[:, overlap]
                        std_err_df = std_err_df.loc[:, overlap]
                if not coef_df.index.identical(std_err_df.index):
                    overlap = [x for x in coef_df.index
                               if x in set(std_err_df.index)]
                    if len(overlap) == 0:
                        print("No overlapping eQTLs between coef and std_err "
                              "data frame indices.")
                    else:
                        coef_df = coef_df.loc[overlap, :]
                        std_err_df = std_err_df.loc[overlap, :]

                if coef_df.columns.identical(
                        std_err_df.columns) and coef_df.index.identical(
                            std_err_df.index):
                    tvalue_df = coef_df / std_err_df

                    print("Saving {} dataframe.".format(self.tvalue_filename),
                          flush=True)
                    save_dataframe(df=tvalue_df,
                                   outpath=outpath,
                                   header=True,
                                   index=True)
            else:
                print("\tNo data found.")
        else:
            print("Skipping step.")

        print("")
        print("### Step 3 ###")
        print("Starting other calculations", flush=True)

        if self.pvalues_filename not in dataframes:
            print("\tNo pvalues data found.")
            return

        pvalue_df = dataframes[self.pvalues_filename]
        pvalue_df_columns = [
            "{}_{}".format(x, i) for i, x in enumerate(pvalue_df.columns)
        ]
        pvalue_df.columns = pvalue_df_columns
        pvalue_df_indices = [
            "{}_{}".format(x, i) for i, x in enumerate(pvalue_df.index)
        ]
        pvalue_df.index = pvalue_df_indices
        pvalue_df.reset_index(drop=False, inplace=True)

        print("Melting dataframe.", flush=True)
        dfm = pvalue_df.melt(id_vars=["index"])
        dfm.columns = ["covariate", "SNP", "pvalue"]
        dfm["rank"] = dfm.loc[:, "pvalue"].rank(ascending=True)
        n_signif = dfm[dfm["pvalue"] <= self.alpha].shape[0]
        n_total = dfm.shape[0]
        print("\t{}/{} [{:.2f}%] of pvalues < {}".format(
            n_signif, n_total, (100 / n_total) * n_signif, self.alpha),
              flush=True)

        print("Adding z-scores.", flush=True)
        dfm["zscore"] = stats.norm.isf(dfm["pvalue"])
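        # Clip to the finite isf values at the numeric limits of the p-values,
        # i.e. stats.norm.isf(1 - 1e-16) and stats.norm.isf(1e-323).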
        dfm.loc[dfm["pvalue"] > (1.0 - 1e-16), "zscore"] = -8.209536151601387
        dfm.loc[dfm["pvalue"] < 1e-323, "zscore"] = 38.44939448087599
        self.pivot_and_save(dfm, "zscore", pvalue_df_indices,
                            pvalue_df_columns)

        print("Adding BH-FDR.", flush=True)
        dfm["BH-FDR"] = dfm["pvalue"] * (n_total / (dfm["rank"] + 1))
        dfm.loc[dfm["BH-FDR"] > 1, "BH-FDR"] = 1
        prev_bh_fdr = -np.inf
        for i in range(n_total):
            bh_fdr = dfm.loc[i, "BH-FDR"]
            if bh_fdr > prev_bh_fdr:
                prev_bh_fdr = bh_fdr
            else:
                dfm.loc[i, "BH-FDR"] = prev_bh_fdr
        n_signif = dfm[dfm["BH-FDR"] <= self.alpha].shape[0]
        print("\t{}/{} [{:.2f}%] of BH-FDR values < {}".format(
            n_signif, n_total, (100 / n_total) * n_signif, self.alpha),
              flush=True)
        self.pivot_and_save(dfm, "BH-FDR", pvalue_df_indices,
                            pvalue_df_columns)

        print("Adding permutation FDR.", flush=True)
        print("\tLoading permutation pvalue data.", flush=True)
        _, perm_pvalues = self.combine_pickles(self.work_dir,
                                               self.perm_pvalues_filename)
        # perm_pvalues = [random.random() for _ in range(n_total * 10)]
        print("Sorting p-values.", flush=True)
        perm_pvalues = sorted(perm_pvalues)

        if len(perm_pvalues) > 0:
            n_perm = len(perm_pvalues) / n_total
            if n_perm != self.n_perm:
                print("\tWARNING: not all permutation p-values are present")
            perm_ranks = []
            for pvalue in dfm["pvalue"]:
                perm_ranks.append(bisect_left(perm_pvalues, pvalue))
            dfm["perm-rank"] = perm_ranks
            dfm["perm-FDR"] = (dfm["perm-rank"] / n_perm) / dfm["rank"]
            dfm.loc[(dfm.index == 0) | (dfm["perm-rank"] == 0), "perm-FDR"] = 0
            dfm.loc[dfm["perm-FDR"] > 1, "perm-FDR"] = 1

            self.pivot_and_save(dfm, "perm-FDR", pvalue_df_indices,
                                pvalue_df_columns)

        print("Saving full dataframe.", flush=True)
        save_dataframe(df=dfm,
                       outpath=os.path.join(self.work_dir,
                                            "molten_table.txt.gz"),
                       header=True,
                       index=True)
        print("")

        # Print the time.
        run_time_min, run_time_sec = divmod(time.time() - start_time, 60)
        run_time_hour, run_time_min = divmod(run_time_min, 60)
        print("Finished in {} hour(s), {} minute(s) and "
              "{} second(s).".format(int(run_time_hour), int(run_time_min),
                                     int(run_time_sec)),
              flush=True)
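combine_pickles gathers the per-worker pickles written by the analyser in
Example #15. A sketch under the assumption that each worker dumped a list of
rows, with the header as the first element when columns=True:

import glob
import os
import pickle


def combine_pickles(indir, filename, columns=False):
    """Sketch: concatenate every '<filename>*.pkl' found below indir."""
    col_list = None
    data = []
    pattern = os.path.join(indir, "*", "{}*.pkl".format(filename))
    for fpath in sorted(glob.glob(pattern)):
        with open(fpath, "rb") as f:
            content = pickle.load(f)
        if columns and len(content) > 0:
            if col_list is None:
                col_list = content[0]
            content = content[1:]
        data.extend(content)
    return col_list, data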
Example #14
    def start(self):
        """
        The method that serves as the pipeline of the whole program.
        """
        print("Starting interaction analyser.")
        self.print_arguments()

        # Loop over the groups.
        print("Performing interaction analyses.")
        for i, group_indir in enumerate(self.group_indirs):
            # Prepare the input and output directories.
            if self.groups is not None:
                group_id = get_leaf_dir(group_indir)
                group_outdir = os.path.join(self.outdir, group_id)
            else:
                group_id = ""
                group_outdir = self.outdir
            ia_indir = os.path.join(group_outdir, 'input')
            ia_outdir = os.path.join(group_outdir, 'output')
            for outdir in [group_outdir, ia_indir, ia_outdir]:
                prepare_output_dir(outdir)

            # Check if we can find an InteractionZSCoreMatrix
            has_inter_matrix = False
            if not self.force:
                for path in glob.glob(os.path.join(ia_outdir, "*")):
                    if re.match(self.inter_regex, get_basename(path)):
                        has_inter_matrix = True
                        break

            # Stop if we already have the interaction matrix.
            if has_inter_matrix and not self.force:
                continue

            print("\tWorking on: {:15s} [{}/{} "
                  "{:.2f}%]".format(group_id, i + 1, len(self.group_indirs),
                                    (100 / len(self.group_indirs)) * (i + 1)))

            # Prepare the eQTLInteractionAnalyser expected input.
            self.print_string("\n### STEP1 ###\n")
            expected_input = ["Genotypes", "Expression", "Covariates"]
            filenames = [
                self.geno_filename, self.expr_filename, self.cov_filename
            ]
            for exp_ia_infile, filename in zip(expected_input, filenames):
                # Check if the files already exist.
                file1 = os.path.join(ia_indir, exp_ia_infile + ".binary.dat")
                file2 = os.path.join(ia_indir,
                                     exp_ia_infile + ".binary.rows.txt")
                file3 = os.path.join(ia_indir,
                                     exp_ia_infile + ".binary.columns.txt")

                if not check_file_exists(file1) or \
                        not check_file_exists(file2) or \
                        not check_file_exists(file3) or \
                        self.force:
                    self.print_string("\nPreparing {}.".format(filename))

                    # Define the filenames.
                    compr_file = os.path.join(self.indir, group_id,
                                              filename + '.txt.gz')
                    copy_file = os.path.join(ia_indir, filename + '.txt.gz')
                    uncompr_file = os.path.join(ia_indir, filename + '.txt')
                    bin_file = os.path.join(ia_indir,
                                            exp_ia_infile + ".binary")

                    # Copy and decompress the file.
                    self.print_string("\nCopying the input files.")
                    self.copy_file(compr_file, copy_file)
                    self.print_string("\nDecompressing the input files.")
                    self.decompress(copy_file)

                    # Convert to binary.
                    self.print_string("\nConverting files to binary format.")
                    self.convert_to_binary(uncompr_file, bin_file)

                    # Remove the uncompressed file.
                    self.print_string("\nRemoving uncompressed files.")
                    if check_file_exists(uncompr_file):
                        self.print_string(
                            "\tos.remove({})".format(uncompr_file))
                        os.remove(uncompr_file)
                else:
                    self.print_string(
                        "Skipping {} preparation.".format(filename))

            # Prepare the eQTL file.
            self.print_string("\n### STEP2 ###\n")
            eqtl_file = os.path.join(ia_indir, self.eqtl_filename + '.txt')
            if not check_file_exists(eqtl_file) or self.force:
                self.print_string("\nPreparing eQTL file.")
                # Define the filenames.
                compr_file = os.path.join(self.indir, group_id,
                                          self.eqtl_filename + '.txt.gz')
                copy_file = os.path.join(ia_indir,
                                         self.eqtl_filename + '.txt.gz')

                # Copy and decompress the file.
                self.print_string("\nCopying the input files.")
                self.copy_file(compr_file, copy_file)
                self.print_string("\nDecompressing the input files.")
                self.decompress(copy_file)
            else:
                self.print_string("Skipping eQTL preparation.")

            # Execute the program.
            self.print_string("\n### STEP3 ###\n")
            self.print_string("Executing the eQTLInteractionAnalyser.")
            self.execute(ia_indir, ia_outdir, eqtl_file)
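copy_file and decompress are small helpers; minimal sketches consistent with
the .txt.gz naming used above:

import gzip
import shutil


def copy_file(inpath, outpath):
    # Sketch: plain copy of the compressed input file.
    shutil.copyfile(inpath, outpath)


def decompress(inpath):
    # Sketch: unpack '<name>.txt.gz' to '<name>.txt' next to the archive.
    outpath = inpath[:-len(".gz")]
    with gzip.open(inpath, "rb") as fin, open(outpath, "wb") as fout:
        shutil.copyfileobj(fin, fout)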
Example #15
    def start(self):
        self.print_arguments()
        print("Starting Custom Interaction Analyser "
              "[{}]".format(datetime.now().strftime("%d-%m-%Y, %H:%M:%S")))

        # Start the timer.
        start_time = int(time.time())

        # Get the permutation orders.
        permutation_orders = None
        perm_orders_outfile = os.path.join(self.outdir,
                                           self.perm_order_filename + ".pkl")
        if check_file_exists(perm_orders_outfile):
            print("Loading permutation order")
            permutation_orders = self.load_pickle(perm_orders_outfile)

            # Validate the permutation orders for the given input.
            if len(permutation_orders) != (self.n_perm + 1):
                print("\tinvalid")
                permutation_orders = None

            if permutation_orders is not None:
                if permutation_orders[0] is not None:
                    print("\tinvalid")
                    permutation_orders = None
                else:
                    for order in permutation_orders[1:]:
                        if len(order) != self.n_samples:
                            print("\tinvalid")
                            permutation_orders = None
                            break

            if permutation_orders is not None:
                print("\tvalid")

        if permutation_orders is None:
            print("Creating permutation order")
            permutation_orders = self.create_perm_orders()
            self.dump_pickle(permutation_orders, self.outdir,
                             self.perm_order_filename)

        # Start the work.
        print("Start the analysis", flush=True)
        storage = self.work(permutation_orders)

        print("Saving output files", flush=True)
        filename_suffix = "{}_{}".format(self.skip_rows,
                                         self.skip_rows + storage.get_n_rows())
        self.dump_pickle(storage.get_pvalues(),
                         self.outdir,
                         self.pvalues_filename,
                         filename_suffix=filename_suffix,
                         subdir=True,
                         unique=True)
        self.dump_pickle(storage.get_perm_pvalues(),
                         self.outdir,
                         self.perm_pvalues_filename,
                         filename_suffix=filename_suffix,
                         subdir=True,
                         unique=True)
        self.dump_pickle(storage.get_coefficients(),
                         self.outdir,
                         self.coef_filename,
                         filename_suffix=filename_suffix,
                         subdir=True,
                         unique=True)
        self.dump_pickle(storage.get_std_errors(),
                         self.outdir,
                         self.std_err_filename,
                         filename_suffix=filename_suffix,
                         subdir=True,
                         unique=True)

        # Print the process time.
        run_time = int(time.time()) - start_time
        run_time_min, run_time_sec = divmod(run_time, 60)
        run_time_hour, run_time_min = divmod(run_time_min, 60)
        print("Finished in {} hour(s), {} minute(s) and "
              "{} second(s)".format(int(run_time_hour), int(run_time_min),
                                    int(run_time_sec)))
        print("Processed {:.2f} analyses per minute".format(
            (self.n_eqtls * (self.n_perm + 1)) / (run_time / 60)))

        # Shutdown the manager.
        print("Shutting down manager [{}]".format(
            datetime.now().strftime("%d-%m-%Y, %H:%M:%S")),
              flush=True)
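create_perm_orders must produce what the validation at the top of this example
expects: a list of n_perm + 1 entries whose first element is None (the
unpermuted pass) followed by shuffled sample index orders. A sketch:

import numpy as np


def create_perm_orders(n_perm, n_samples):
    # Sketch: None marks the unpermuted analysis; the rest are random
    # orderings of the sample indices, one per permutation.
    return [None] + [list(np.random.permutation(n_samples))
                     for _ in range(n_perm)]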