Example #1
0
    def __init__(self, name, settings_file, alpha, extensions, interest):
        """
        Read the settings, store the arguments and prepare the output folder.

        :param name: string, the name of the base input/output directory.
        :param settings_file: string, the name of the settings file.
        :param alpha: float, the significance cut-off.
        :param extensions: the output figure file type extension(s).
        :param interest: list, the HGNC names to print.
        """
        # Paths are resolved relative to the directory one level above the
        # folder that contains this file.
        base_dir = str(Path(__file__).parent.parent)

        # Load the LocalSettings singleton class and read what is needed.
        self.settings = LocalSettings(base_dir, settings_file)
        read_setting = self.settings.get_setting
        self.covs = read_setting("covariates_to_include")
        # The overview exclusion list is stored lower-cased.
        self.covs_excl_from_overview = [
            covariate.lower()
            for covariate in read_setting("covariates_excl_from_overview")
        ]
        self.max_url_len = read_setting("max_url_length")
        self.maf_cutoff = read_setting("maf_cutoff")
        self.include_top_n = read_setting("include_top_n")

        # Keep the caller supplied arguments.
        self.name, self.alpha = name, alpha
        self.extensions, self.interest = extensions, interest

        # Prepare the output directory.
        output_path = os.path.join(base_dir, name)
        self.outdir = output_path
        prepare_output_dir(self.outdir)
Example #2
0
    def __init__(self, name, settings_file):
        """
        Read the settings and store the folders and filenames that are needed.

        :param name: string, the name of the base input/output directory.
        :param settings_file: string, the name of the settings file.
        """
        # Paths are resolved relative to the directory one level above the
        # folder that contains this file.
        base_dir = str(Path(__file__).parent.parent)

        # Load the LocalSettings singleton class.
        cfg = LocalSettings(base_dir, settings_file)

        # Save the number of permutations.
        self.n_permutations = cfg.get_setting("n_permutations")

        # Define the output directory (it is only composed here).
        self.outdir = os.path.join(base_dir, name)

        # Copy the remaining settings onto the instance: (attribute, key).
        for attribute, key in (
                ("cov_outdir", "covariates_folder"),
                ("tech_cov_outdir", "technical_covariates_folder"),
                ("pvalues_outfile", "actual_pvalues_pickle_filename"),
                ("snp_tvalues_outfile", "snp_tvalues_pickle_filename"),
                ("inter_tvalues_outfile", "inter_tvalues_pickle_filename"),
                ("perm_pvalues_outfile", "permuted_pvalues_pickle_filename"),
        ):
            setattr(self, attribute, cfg.get_setting(key))
Example #3
0
    def __init__(self, name, settings_file, alpha, plots, top, interest,
                 extension, validate):
        """
        Store the arguments, load the settings and prepare the output folder.

        :param name: string, the name of the base input/output directory.
        :param settings_file: string, the name of the settings file.
        :param alpha: float, the significance cut-off.
        :param plots: list, the names of the plots to create.
        :param top: int, the number of top eQTLs to plot.
        :param interest: list, the indices of interest to plot (presumably
                         eQTL indices; verify against the caller).
        :param extension: str, the output figure file type extension.
        :param validate: boolean, whether or not to validate the input.
        """
        # Paths are resolved relative to the directory one level above the
        # folder that contains this file.
        base_dir = str(Path(__file__).parent.parent)

        # Keep the caller supplied arguments.
        (self.name, self.alpha, self.plots, self.top) = (name, alpha,
                                                         plots, top)
        (self.interest, self.extension, self.validate) = (interest,
                                                          extension,
                                                          validate)

        # Load the LocalSettings singleton class.
        self.settings = LocalSettings(base_dir, settings_file)

        # Prepare the output directory.
        output_path = os.path.join(base_dir, name)
        self.outdir = output_path
        prepare_output_dir(self.outdir)
Example #4
0
    def __init__(self, name, settings_file, disease, force_steps):
        """
        Load the settings, store the arguments and prepare the output folder.

        :param name: string, the name of the base input/output directory.
        :param settings_file: string, the name of the settings file.
        :param disease: string, the name of the disease to analyse.
        :param force_steps: list, the names of the steps to force to redo.
        """
        # Paths are resolved relative to the directory one level above the
        # folder that contains this file.
        base_dir = str(Path(__file__).parent.parent)
        output_path = os.path.join(base_dir, name)

        # Load the LocalSettings singleton class.
        self.settings = LocalSettings(base_dir, settings_file)

        # Save the arguments; the step names are converted by
        # create_force_dict before they are stored.
        self.disease = disease
        self.force_dict = self.create_force_dict(force_steps)

        # Prepare the output directory.
        self.outdir = output_path
        prepare_output_dir(self.outdir)
Example #5
0
    def __init__(self, settings_file, groups, force, verbose):
        """
        Load the settings, store the arguments and locate the group folders.

        :param settings_file: string, the name of the settings file.
        :param groups: list, the names of groups to analyse.
        :param force: boolean, whether or not to force to redo each step.
        :param verbose: boolean, whether or not to print each step.
        """
        # Paths are resolved relative to the directory one level above the
        # folder that contains this file.
        base_dir = str(Path(__file__).parent.parent)

        # Load the LocalSettings singleton class.
        cfg = LocalSettings(base_dir, settings_file)

        # Save the settings and the arguments.
        for attribute, key in (
                ("indir", "input_dir"),
                ("tech_covs", "technical_covariates"),
                ("eqtl_ia", "eQTLInteractionAnalyser"),
                ("inter_regex", "interaction_regex"),
        ):
            setattr(self, attribute, cfg.get_setting(key))
        self.groups, self.force, self.verbose = groups, force, verbose

        # Prepare the output directory.
        self.outdir = os.path.join(base_dir, cfg.get_setting("output_dir"))
        prepare_output_dir(self.outdir)

        # Without a group selection the input directory itself is used;
        # otherwise the 'group_*' entries found in it are passed through
        # filter_groups.
        if self.groups is None:
            self.group_indirs = [self.indir]
        else:
            group_pattern = os.path.join(self.indir, 'group_*')
            self.group_indirs = self.filter_groups(glob.glob(group_pattern))

        # Prepare the filenames.
        names = cfg.get_setting("filenames")
        self.eqtl_filename = names["eqtl"]
        self.geno_filename = names["genotype"]
        self.expr_filename = names["expression"]
        self.cov_filename = names["covariate"]
Example #6
0
    def __init__(self, settings_file, groups, force):
        """
        Load the settings, store the arguments and prepare the output folder.

        :param settings_file: string, the name of the settings file.
        :param groups: list, the names of groups to analyse.
        :param force: boolean, whether or not to force to redo each step.
        """
        # Paths are resolved relative to the directory one level above the
        # folder that contains this file.
        base_dir = str(Path(__file__).parent.parent)

        # Load the LocalSettings singleton class.
        cfg = LocalSettings(base_dir, settings_file)

        # Save the input locations: (attribute, settings key). These are
        # assigned before filter_groups is called, as in the original order.
        for attribute, key in (
                ("eqtl_inpath", "eqtl_datafile"),
                ("cov_inpath", "cov_datafile"),
                ("data_indir", "data_dir"),
                ("g_data_indir", "groups_data_dir"),
                ("g_inter_indir", "inter_groups_dir"),
                ("inter_regex", "interaction_regex"),
        ):
            setattr(self, attribute, cfg.get_setting(key))

        # Save the remaining arguments.
        self.group_ids = self.filter_groups(groups)
        self.celltypes = cfg.get_setting("celltypes")
        self.force = force

        # Prepare the output directory.
        self.outdir = os.path.join(base_dir, cfg.get_setting("output_dir"))
        prepare_output_dir(self.outdir)

        # Prepare the filenames: (attribute, key in the 'filenames' setting).
        names = cfg.get_setting("filenames")
        for attribute, key in (
                ("obj_filename", "object"),
                ("eqtl_filename", "eqtl"),
                ("geno_filename", "genotype"),
                ("alleles_filename", "alleles"),
                ("expr_filename", "expression"),
                ("cov_filename", "covariates"),
                ("inter_filename", "interaction"),
                ("markers_filename", "markers"),
        ):
            setattr(self, attribute, names[key])
Example #7
0
class Main:
    """
    Main: this class is the main class that calls all other functionality.
    """

    def __init__(self, name, settings_file, disease, force_steps):
        """
        Initializer of the class.

        :param name: string, the name of the base input/output directory.
        :param settings_file: string, the name of the settings file.
        :param disease: string, the name of the disease to analyse.
        :param force_steps: list, the names of the steps to force to redo.
        """
        # Define the current directory.
        current_dir = str(Path(__file__).parent.parent)

        # Load the LocalSettings singleton class.
        self.settings = LocalSettings(current_dir, settings_file)

        # Save arguments.
        self.disease = disease
        self.force_dict = self.create_force_dict(force_steps)

        # Prepare an output directory.
        self.outdir = os.path.join(current_dir, name)
        prepare_output_dir(self.outdir)

    @staticmethod
    def create_force_dict(force_steps):
        """
        Translate a list of step names into a step -> force flag mapping.

        :param force_steps: list, the names of the steps to force to redo.
                            None or an empty list forces nothing; the exact
                            list ['all'] forces every step. Names that are
                            not a known step are ignored.
        :return: dict, one boolean per pipeline step.
        """
        step_names = ('combine_gte_files',
                      'combine_eqtlprobes',
                      'create_matrices',
                      'perform_celltype_factorization',
                      'create_deconvolution_matrices',
                      'perform_deconvolution',
                      'create_cov_matrix',
                      'mask_matrices',
                      'create_groups',
                      'create_regression_matrix')
        force_dict = dict.fromkeys(step_names, False)
        if not force_steps:
            return force_dict

        if force_steps == ['all']:
            return dict.fromkeys(step_names, True)

        for step in force_steps:
            if step in force_dict:
                force_dict[step] = True

        return force_dict

    def start(self):
        """
        The method that serves as the pipeline of the whole program.

        Steps 1 - 7 are executed in order, each step receiving the results
        of the earlier steps. The program then terminates; see the note
        after step 7.
        """
        # Local import: sys.exit replaces the site-provided exit() helper,
        # which is meant for the interactive interpreter only.
        import sys

        print("Starting program.")
        print("\n### STEP1 ###\n")
        # Step 1. Combine GTE files.
        cgtef = CombineGTEFiles(
            settings=self.settings.get_setting('combine_gte_files'),
            force=self.force_dict['combine_gte_files'],
            outdir=self.outdir)
        cgtef.start()
        cgtef.clear_variables()

        # Step2. Combine eQTL probes files.
        print("\n### STEP2 ###\n")
        cepf = CombineEQTLProbes(
            settings=self.settings.get_setting('combine_eqtlprobes'),
            disease=self.disease,
            force=self.force_dict['combine_eqtlprobes'],
            outdir=self.outdir)
        cepf.start()
        cepf.clear_variables()

        # Step3. Create the ordered unmasked matrices.
        print("\n### STEP3 ###\n")
        cm = CreateMatrices(
            settings=self.settings.get_setting('create_matrices'),
            gte_df=cgtef.get_gte(),
            sample_dict=cgtef.get_sample_dict(),
            sample_order=cgtef.get_sample_order(),
            eqtl_df=cepf.get_eqtlprobes(),
            force=self.force_dict['create_matrices'],
            outdir=self.outdir)
        cm.start()
        cm.clear_variables()

        # Step4. Create the deconvolution matrices.
        print("\n### STEP4 ###\n")
        cdm = CreateDeconvolutionMatrices(
            settings=self.settings.get_setting('create_deconvolution_matrices'),
            expr_file=cm.get_expr_file(),
            expr_df=cm.get_complete_expr_matrix(),
            sample_dict=cgtef.get_sample_dict(),
            sample_order=cgtef.get_sample_order(),
            force=self.force_dict['create_deconvolution_matrices'],
            outdir=self.outdir)
        cdm.start()
        cdm.clear_variables()

        # Step5. Create the celltype PCA file.
        print("\n### STEP5 ###\n")
        pcf = PerformCelltypeFactorization(
            settings=self.settings.get_setting('perform_celltype_factorization'),
            profile_file=cdm.get_celltype_profile_file(),
            profile_df=cdm.get_celltype_profile(),
            ct_expr_file=cdm.get_ct_profile_expr_outpath(),
            force=self.force_dict['perform_celltype_factorization'],
            outdir=self.outdir)
        pcf.start()
        pcf.clear_variables()

        # Step6. Perform the deconvolution.
        # The local is not called 'pd' so that it cannot shadow a
        # module-level pandas alias.
        print("\n### STEP6 ###\n")
        pdc = PerformDeconvolution(
            settings=self.settings.get_setting('perform_deconvolution'),
            profile_file=cdm.get_celltype_profile_file(),
            profile_df=cdm.get_celltype_profile(),
            ct_expr_file=cdm.get_ct_profile_expr_outpath(),
            ct_expr_df=pcf.get_celltype_expression(),
            force=self.force_dict['perform_deconvolution'],
            outdir=self.outdir)
        pdc.start()
        pdc.clear_variables()

        # Step7. Create the covariance matrix.
        print("\n### STEP7 ###\n")
        ccm = CreateCovMatrix(
            settings=self.settings.get_setting('create_cov_matrix'),
            marker_file=cdm.get_markers_outpath(),
            celltype_pcs=pcf.get_celltype_pcs(),
            celltype_cs=pcf.get_celltype_cs(),
            deconvolution=pdc.get_deconvolution(),
            sample_order=cgtef.get_sample_order(),
            force=self.force_dict['create_cov_matrix'],
            outdir=self.outdir)
        ccm.start()
        ccm.clear_variables()

        # NOTE(review): the program stops here (exit status 0), so all the
        # code below this line is currently unreachable. Presumably a
        # work-in-progress stop; remove it to enable the remaining steps.
        sys.exit()

        # Load the complete dataframes.
        print("\n### LOADING SORTED DATAFRAMES ###\n")
        print("Extracting eQTL dataframe.")
        eqtl_df = cepf.get_eqtlprobes()

        print("Loading genotype dataframe.")
        geno_df = load_dataframe(cm.get_geno_outpath(),
                                 header=0,
                                 index_col=0)

        print("Loading alleles dataframe.")
        alleles_df = load_dataframe(cm.get_alleles_outpath(),
                                    header=0,
                                    index_col=0)

        print("Loading expression dataframe.")
        expr_df = load_dataframe(cm.get_expr_outpath(),
                                 header=0,
                                 index_col=0)

        print("Extracting covariates dataframe.")
        cov_df = ccm.get_covariates()

        # Validate the matrices.
        print("Validating matrices.")
        self.validate(eqtl_df.copy(), geno_df, alleles_df, expr_df, cov_df)

        # Step 8. Create the masked matrices.
        print("\n### STEP8 ###\n")
        cmm = MaskMatrices(
            settings=self.settings.get_setting('mask_matrices'),
            eqtl_df=eqtl_df.copy(),
            geno_df=geno_df.copy(),
            alleles_df=alleles_df.copy(),
            expr_df=expr_df.copy(),
            cov_df=cov_df.copy(),
            force=self.force_dict['mask_matrices'],
            outdir=self.outdir)
        cmm.start()
        del cmm

        # # Step 9. Create the group matrices.
        # print("\n### STEP9 ###\n")
        # cg = CreateGroups(
        #     settings=self.settings.get_setting('create_groups'),
        #     eqtl_df=eqtl_df.copy(),
        #     geno_df=geno_df.copy(),
        #     alleles_df=alleles_df.copy(),
        #     expr_df=expr_df.copy(),
        #     cov_df=cov_df.copy(),
        #     groups_file=cm.get_group_outpath(),
        #     force=self.force_dict['create_groups'],
        #     outdir=self.outdir)
        # cg.start()
        # del cg

        # Step 10. Create the regression matrices.
        print("\n### STEP10 ###\n")
        crm = CreateRegressionMatrix(
            settings=self.settings.get_setting('create_regression_matrix'),
            eqtl_df=eqtl_df.copy(),
            geno_df=geno_df.copy(),
            alleles_df=alleles_df.copy(),
            expr_df=expr_df.copy(),
            force=self.force_dict['create_regression_matrix'],
            outdir=self.outdir)
        crm.start()
        del crm

    @staticmethod
    def validate(eqtl_df, geno_df, alleles_df, expr_df, cov_df):
        """
        Check that the rows and the samples of the matrices line up.

        The index of eqtl_df is overwritten in place (the caller passes a
        copy). On a mismatch a message is printed and the program is
        terminated with exit status 1.

        :param eqtl_df: DataFrame, the eQTLs; needs a 'SNPName' column.
        :param geno_df: DataFrame, the genotype matrix.
        :param alleles_df: DataFrame, the alleles matrix.
        :param expr_df: DataFrame, the expression matrix.
        :param cov_df: DataFrame, the covariate matrix.
        :raises SystemExit: if the row or the sample order differs.
        """
        # Local import: sys.exit replaces the site-provided exit() helper
        # and allows a failing exit status to be reported.
        import sys

        # Set the index of the eQTL for comparison.
        eqtl_df.index = eqtl_df["SNPName"]
        eqtl_df.index.name = "-"

        # Check if row order is identical. Index.identical also compares
        # the index name, hence the renaming to "-" above.
        reference_rows = eqtl_df.index
        row_frames = (geno_df, expr_df, alleles_df)
        if not all(reference_rows.identical(df.index) for df in row_frames):
            print("Row order is not identical.")
            sys.exit(1)

        # Check if sample order is identical.
        reference_samples = geno_df.columns
        sample_frames = (expr_df, cov_df)
        if not all(reference_samples.identical(df.columns)
                   for df in sample_frames):
            print("Order of samples are not identical.")
            sys.exit(1)

        print("\tValid.")
Example #8
0
class Main:
    """
    Main: this class is the main class that calls all other functionality.
    """
    def __init__(self, name, settings_file, alpha, extensions, interest):
        """
        Initializer of the class.

        :param name: string, the name of the base input/output directory.
        :param settings_file: string, the name of the settings file.
        :param alpha: float, the significance cut-off.
        :param extensions: list, the output figure file type extensions;
                           one set of plots is created per extension.
        :param interest: list, the HGNC names to print.
        """
        # Define the current directory.
        current_dir = str(Path(__file__).parent.parent)

        # Load the LocalSettings singleton class.
        self.settings = LocalSettings(current_dir, settings_file)
        self.covs = self.settings.get_setting("covariates_to_include")
        # The overview exclusion list is stored lower-cased.
        self.covs_excl_from_overview = [
            x.lower()
            for x in self.settings.get_setting("covariates_excl_from_overview")
        ]
        self.max_url_len = self.settings.get_setting("max_url_length")
        self.maf_cutoff = self.settings.get_setting("maf_cutoff")
        self.include_top_n = self.settings.get_setting("include_top_n")

        # Load the variables.
        self.name = name
        self.alpha = alpha
        self.extensions = extensions
        self.interest = interest

        # Prepare an output directory.
        self.outdir = os.path.join(current_dir, name)
        prepare_output_dir(self.outdir)

    @staticmethod
    def _progress_message(i, n_rows):
        """
        Format the progress line for row i (0-based) out of n_rows rows.

        The percentage is relative to the last row index (n_rows - 1). With
        a single row that index is 0, in which case 100% is reported
        instead of dividing by zero.

        :param i: int, the 0-based index of the current row.
        :param n_rows: int, the total number of rows.
        :return: string, the progress message.
        """
        last = n_rows - 1
        percentage = (100 / last) * i if last > 0 else 100.0
        return "\tprocessing {}/{} [{:.2f}%]".format(i, last, percentage)

    def start(self):
        """
        The method that serves as the pipeline of the whole program.

        Loads the dataframes, builds an Eqtl object for every eQTL that has
        at least one interaction z-score above the significance cut-off,
        collects their data into one dataframe and plots / saves it.
        """
        # Local import: sys.exit replaces the site-provided exit() helper
        # and allows a failing exit status to be reported.
        import sys

        print("Starting visualiser.")
        self.print_arguments()

        # Create the dataset object.
        ds = Dataset(name=self.name, settings=self.settings, alpha=self.alpha)

        # Loading data.
        print("Loading dataframes")
        cov_df = ds.get_cov_df()
        inter_zscore_df = ds.get_inter_cov_zscore_df()
        inter_tvalue_df = ds.get_inter_cov_inter_tvalue_df()
        eqtl_df = ds.get_eqtl_df()
        geno_df = ds.get_geno_df()
        alleles_df = ds.get_alleles_df()
        expr_df = ds.get_expr_df()
        signif_cutoff = ds.get_significance_cutoff()

        # Subset the covariates of interest. The two interaction frames are
        # transposed so that each row belongs to one eQTL.
        cov_names = list(self.covs.keys())
        select_cov_df = cov_df.loc[cov_names, :].copy()
        select_zscore_df = inter_zscore_df.loc[cov_names, :].copy().T
        select_tvalue_df = inter_tvalue_df.loc[cov_names, :].copy().T
        del cov_df, inter_zscore_df, inter_tvalue_df

        if not select_cov_df.index.equals(
                select_zscore_df.columns) or not select_cov_df.index.equals(
                    select_tvalue_df.columns):
            print("Columns do not match.")
            sys.exit(1)

        print("Analysing interactions")
        data = []
        n_eqtls = eqtl_df.shape[0]
        for i, (_, row) in enumerate(eqtl_df.iterrows()):
            # Report every 250 rows and on the last row.
            if (i % 250 == 0) or (i == (n_eqtls - 1)):
                print(self._progress_message(i, n_eqtls))

            # Get the data. Rows are matched by position, not by label.
            genotype = geno_df.iloc[i, :].copy()
            expression = expr_df.iloc[i, :].copy()
            (alleles, _) = alleles_df.iloc[i, :].copy()
            zscores = select_zscore_df.iloc[i, :].copy()
            tvalues = select_tvalue_df.iloc[i, :].copy()

            # These columns are optional in the eQTL dataframe.
            iteration = row["Iteration"] if "Iteration" in row.index else None
            gwas_ids = row["GWASIDS"] if "GWASIDS" in row.index else None
            traits = row["Trait"] if "Trait" in row.index else None

            if max(zscores) > signif_cutoff:
                eqtl = Eqtl(index=i,
                            snp_name=row["SNPName"],
                            probe_name=row["ProbeName"],
                            hgnc_name=row["HGNCName"],
                            iteration=iteration,
                            eqtl_zscore=row["OverallZScore"],
                            gwas_ids=gwas_ids,
                            traits=traits,
                            alleles=alleles,
                            signif_cutoff=signif_cutoff,
                            maf_cutoff=self.maf_cutoff,
                            selections=self.covs,
                            genotype=genotype,
                            expression=expression,
                            covariates=select_cov_df.copy(),
                            inter_zscores=zscores,
                            inter_tvalues=tvalues)
                if self.interest is not None and row[
                        "HGNCName"] in self.interest:
                    eqtl.print_info()
                data.extend(eqtl.get_data())
                del eqtl

        # Create the complete dataframe.
        data_df = pd.DataFrame(data,
                               columns=[
                                   "Index", "SNPName", "ProbeName", "HGNCName",
                                   "Iteration", "N", "MAF", "eQTL", "Inter",
                                   "Covariate", "Interaction", "Direction",
                                   "GWASIDs", "Traits"
                               ])

        # Plot the data.
        print("Creating plots")
        for extension in self.extensions:
            plotter = Plotter(data_df, self.outdir, extension=extension)
            plotter.plot_upsetplot(column="Covariate",
                                   id_col="Index",
                                   exclude=self.covs_excl_from_overview)
            # NOTE(review): data_df has no "ID" column; presumably Plotter
            # derives it - verify against Plotter.plot_upsetplot.
            plotter.plot_upsetplot(column="Iteration", id_col="ID")
            plotter.plot_pie(total=eqtl_df.shape[0],
                             part=len(data_df["Index"].unique()))

        # Save data files.
        print("Saving results")
        saver = Saver(data_df, self.outdir, signif_cutoff, self.max_url_len,
                      self.include_top_n)
        saver.save_all(exclude=self.covs_excl_from_overview)
        saver.save_per_iter(exclude=self.covs_excl_from_overview)
        indices_of_interest = saver.save_per_group()
        print("eQTL indices of interest: {}".format(' '.join(
            [str(x) for x in indices_of_interest])))

    def print_arguments(self):
        """
        Print the arguments and settings this instance was created with.
        """
        print("Arguments:")
        print("  > Output directory: {}".format(self.outdir))
        print("  > Alpha: {}".format(self.alpha))
        print("  > Extensions: {}".format(self.extensions))
        print("  > Interest: {}".format(self.interest))
        print("  > Covariates to include: {}".format(self.covs))
        print("  > Covariates to exclude from overview: {}".format(
            self.covs_excl_from_overview))
        print("  > Max URL length: {}".format(self.max_url_len))
        print("  > MAF cutoff: {}".format(self.maf_cutoff))
        print("  > Include top n: {}".format(self.include_top_n))
        print("")
Example #9
0
    def __init__(self, name, settings_file, skip_rows, n_eqtls, n_samples,
                 verbose):
        """
        Load the settings, prepare the output folder and store the arguments.

        :param name: string, the name of the base input/output directory.
        :param settings_file: string, the name of the settings file.
        :param skip_rows: int, the number of rows to skip.
        :param n_eqtls: int, the number of eqtls in the input files.
        :param n_samples: int, the number of samples in the input files.
        :param verbose: boolean, whether or not to print all update info.
        """
        # Paths are resolved relative to the directory one level above the
        # folder that contains this file.
        base_dir = str(Path(__file__).parent.parent)

        # Load the LocalSettings singleton class.
        cfg = LocalSettings(base_dir, settings_file)

        # Prepare the output directory.
        self.outdir = os.path.join(base_dir, name)
        prepare_output_dir(self.outdir)

        # Compose the input paths: <input_dir>/<name>/<filename>.
        input_dir = cfg.get_setting("input_dir")
        filenames = cfg.get_setting("filenames")

        def input_path(key):
            return os.path.join(input_dir, name, filenames[key])

        self.geno_inpath = input_path("genotype")
        self.expr_inpath = input_path("expression")
        self.cov_inpath = input_path("covariates")

        # Copy the plain settings onto the instance: (attribute, key).
        for attribute, key in (
                ("drop_covs", "drop_covariates"),
                ("tech_covs", "technical_covariates"),
                ("cov_outdir", "covariates_folder"),
                ("tech_cov_outdir", "technical_covariates_folder"),
                ("perm_orders_filename", "permutations_order_pickle_filename"),
                ("pvalues_filename", "actual_pvalues_pickle_filename"),
                ("snp_tvalues_filename", "snp_tvalues_pickle_filename"),
                ("inter_tvalues_filename", "inter_tvalues_pickle_filename"),
                ("perm_pvalues_filename", "permuted_pvalues_pickle_filename"),
                ("n_permutations", "n_permutations"),
        ):
            setattr(self, attribute, cfg.get_setting(key))

        # The end time is the current time (whole seconds) plus the maximum
        # runtime; the panic time lies the configured number of minutes
        # before that end time.
        started = int(time.time())
        runtime_hours = cfg.get_setting("max_runtime_in_hours")
        self.max_end_time = started + runtime_hours * 60 * 60
        panic_minutes = cfg.get_setting("panic_time_in_min")
        self.panic_time = self.max_end_time - (panic_minutes * 60)

        # Keep the caller supplied arguments.
        self.skip_rows, self.n_eqtls = skip_rows, n_eqtls
        self.n_samples, self.verbose = n_samples, verbose