def __init__(self, name, settings_file):
    """
    Initializer of the class.

    :param name: string, the name of the base input/output directory.
    :param settings_file: string, the name of the settings file.
    """
    # Define the current directory.
    current_dir = str(Path(__file__).parent.parent)

    # Load the LocalSettings singleton class.
    settings = LocalSettings(current_dir, settings_file)

    # Save arguments.
    self.n_permutations = settings.get_setting("n_permutations")

    # Prepare an output directory.
    self.outdir = os.path.join(current_dir, name)

    # Get the needed settings.
    self.cov_outdir = settings.get_setting("covariates_folder")
    self.tech_cov_outdir = settings.get_setting(
        "technical_covariates_folder")
    self.pvalues_outfile = settings.get_setting(
        "actual_pvalues_pickle_filename")
    self.snp_tvalues_outfile = settings.get_setting(
        "snp_tvalues_pickle_filename")
    self.inter_tvalues_outfile = settings.get_setting(
        "inter_tvalues_pickle_filename")
    self.perm_pvalues_outfile = settings.get_setting(
        "permuted_pvalues_pickle_filename")
def __init__(self, name, settings_file, alpha, plots, top, interest,
             extension, validate):
    """
    Initializer of the class.

    :param name: string, the name of the base input/output directory.
    :param settings_file: string, the name of the settings file.
    :param alpha: float, the significance cut-off.
    :param plots: list, the names of the plots to create.
    :param top: int, the number of top eQTLs to plot.
    :param interest: list, the indices of eQTLs to plot.
    :param extension: str, the output figure file type extension.
    :param validate: boolean, whether or not to validate the input.
    """
    # Define the current directory.
    current_dir = str(Path(__file__).parent.parent)

    # Load the LocalSettings singleton class.
    self.settings = LocalSettings(current_dir, settings_file)

    # Load the variables.
    self.name = name
    self.alpha = alpha
    self.plots = plots
    self.top = top
    self.interest = interest
    self.extension = extension
    self.validate = validate

    # Prepare an output directory.
    self.outdir = os.path.join(current_dir, name)
    prepare_output_dir(self.outdir)
def __init__(self, settings_file, groups, force, verbose):
    """
    Initializer of the class.

    :param settings_file: string, the name of the settings file.
    :param groups: list, the names of groups to analyse.
    :param force: boolean, whether or not to force to redo each step.
    :param verbose: boolean, whether or not to print each step.
    """
    # Define the current directory.
    current_dir = str(Path(__file__).parent.parent)

    # Load the LocalSettings singleton class.
    settings = LocalSettings(current_dir, settings_file)

    # Save arguments.
    self.indir = settings.get_setting("input_dir")
    self.tech_covs = settings.get_setting("technical_covariates")
    self.eqtl_ia = settings.get_setting("eQTLInteractionAnalyser")
    self.inter_regex = settings.get_setting("interaction_regex")
    self.groups = groups
    self.force = force
    self.verbose = verbose

    # Prepare an output directory.
    self.outdir = os.path.join(current_dir,
                               settings.get_setting("output_dir"))
    prepare_output_dir(self.outdir)

    # Find which groups are in the input directory.
    if self.groups is not None:
        groups_in_indir = glob.glob(os.path.join(self.indir, 'group_*'))
        self.group_indirs = self.filter_groups(groups_in_indir)
    else:
        self.group_indirs = [self.indir]

    # Prepare filenames.
    filenames = settings.get_setting("filenames")
    self.eqtl_filename = filenames["eqtl"]
    self.geno_filename = filenames["genotype"]
    self.expr_filename = filenames["expression"]
    self.cov_filename = filenames["covariate"]
def __init__(self, settings_file, groups, force):
    """
    Initializer of the class.

    :param settings_file: string, the name of the settings file.
    :param groups: list, the names of groups to analyse.
    :param force: boolean, whether or not to force to redo each step.
    """
    # Define the current directory.
    current_dir = str(Path(__file__).parent.parent)

    # Load the LocalSettings singleton class.
    settings = LocalSettings(current_dir, settings_file)

    # Save arguments.
    self.eqtl_inpath = settings.get_setting("eqtl_datafile")
    self.cov_inpath = settings.get_setting("cov_datafile")
    self.data_indir = settings.get_setting("data_dir")
    self.g_data_indir = settings.get_setting("groups_data_dir")
    self.g_inter_indir = settings.get_setting("inter_groups_dir")
    self.inter_regex = settings.get_setting("interaction_regex")
    self.group_ids = self.filter_groups(groups)
    self.celltypes = settings.get_setting("celltypes")
    self.force = force

    # Prepare an output directory.
    self.outdir = os.path.join(current_dir,
                               settings.get_setting("output_dir"))
    prepare_output_dir(self.outdir)

    # Prepare filenames.
    filenames = settings.get_setting("filenames")
    self.obj_filename = filenames["object"]
    self.eqtl_filename = filenames["eqtl"]
    self.geno_filename = filenames["genotype"]
    self.alleles_filename = filenames["alleles"]
    self.expr_filename = filenames["expression"]
    self.cov_filename = filenames["covariates"]
    self.inter_filename = filenames["interaction"]
    self.markers_filename = filenames["markers"]
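# Illustrative sketch only: the settings file read above is not part of this
# excerpt. Based on the keys accessed in the initializer, the "filenames"
# entry is assumed to be a mapping along the lines below; the file names are
# made-up placeholders, not values from the source.
#
# "filenames": {
#     "object": "group.pkl",
#     "eqtl": "eqtl_table.txt.gz",
#     "genotype": "genotype_table.txt.gz",
#     "alleles": "genotype_alleles.txt.gz",
#     "expression": "expression_table.txt.gz",
#     "covariates": "covariates_table.txt.gz",
#     "interaction": "interaction_table.txt.gz",
#     "markers": "marker_genes.txt.gz"
# }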
class Main:
    """
    Main: this class is the main class that calls all other functionality.
    """

    def __init__(self, name, settings_file, disease, force_steps):
        """
        Initializer of the class.

        :param name: string, the name of the base input/output directory.
        :param settings_file: string, the name of the settings file.
        :param disease: string, the name of the disease to analyse.
        :param force_steps: list, the names of the steps to force to redo.
        """
        # Define the current directory.
        current_dir = str(Path(__file__).parent.parent)

        # Load the LocalSettings singleton class.
        self.settings = LocalSettings(current_dir, settings_file)

        # Save arguments.
        self.disease = disease
        self.force_dict = self.create_force_dict(force_steps)

        # Prepare an output directory.
        self.outdir = os.path.join(current_dir, name)
        prepare_output_dir(self.outdir)

    @staticmethod
    def create_force_dict(force_steps):
        force_dict = {'combine_gte_files': False,
                      'combine_eqtlprobes': False,
                      'create_matrices': False,
                      'perform_celltype_factorization': False,
                      'create_deconvolution_matrices': False,
                      'perform_deconvolution': False,
                      'create_cov_matrix': False,
                      'mask_matrices': False,
                      'create_groups': False,
                      'create_regression_matrix': False}

        if force_steps is None or len(force_steps) == 0:
            return force_dict

        if force_steps == ['all']:
            for key in force_dict.keys():
                force_dict[key] = True
        else:
            for step in force_steps:
                if step in force_dict.keys():
                    force_dict[step] = True

        return force_dict

    def start(self):
        """
        The method that serves as the pipeline of the whole program.
        """
        print("Starting program.")

        # Step 1. Combine GTE files.
        print("\n### STEP1 ###\n")
        cgtef = CombineGTEFiles(
            settings=self.settings.get_setting('combine_gte_files'),
            force=self.force_dict['combine_gte_files'],
            outdir=self.outdir)
        cgtef.start()
        cgtef.clear_variables()

        # Step 2. Combine eQTL probes files.
        print("\n### STEP2 ###\n")
        cepf = CombineEQTLProbes(
            settings=self.settings.get_setting('combine_eqtlprobes'),
            disease=self.disease,
            force=self.force_dict['combine_eqtlprobes'],
            outdir=self.outdir)
        cepf.start()
        cepf.clear_variables()

        # Step 3. Create the ordered unmasked matrices.
        print("\n### STEP3 ###\n")
        cm = CreateMatrices(
            settings=self.settings.get_setting('create_matrices'),
            gte_df=cgtef.get_gte(),
            sample_dict=cgtef.get_sample_dict(),
            sample_order=cgtef.get_sample_order(),
            eqtl_df=cepf.get_eqtlprobes(),
            force=self.force_dict['create_matrices'],
            outdir=self.outdir)
        cm.start()
        cm.clear_variables()

        # Step 4. Create the deconvolution matrices.
        print("\n### STEP4 ###\n")
        cdm = CreateDeconvolutionMatrices(
            settings=self.settings.get_setting(
                'create_deconvolution_matrices'),
            expr_file=cm.get_expr_file(),
            expr_df=cm.get_complete_expr_matrix(),
            sample_dict=cgtef.get_sample_dict(),
            sample_order=cgtef.get_sample_order(),
            force=self.force_dict['create_deconvolution_matrices'],
            outdir=self.outdir)
        cdm.start()
        cdm.clear_variables()

        # Step 5. Create the celltype PCA file.
        print("\n### STEP5 ###\n")
        pcf = PerformCelltypeFactorization(
            settings=self.settings.get_setting(
                'perform_celltype_factorization'),
            profile_file=cdm.get_celltype_profile_file(),
            profile_df=cdm.get_celltype_profile(),
            ct_expr_file=cdm.get_ct_profile_expr_outpath(),
            force=self.force_dict['perform_celltype_factorization'],
            outdir=self.outdir)
        pcf.start()
        pcf.clear_variables()

        # Step 6. Perform the deconvolution.
        print("\n### STEP6 ###\n")
        pd = PerformDeconvolution(
            settings=self.settings.get_setting('perform_deconvolution'),
            profile_file=cdm.get_celltype_profile_file(),
            profile_df=cdm.get_celltype_profile(),
            ct_expr_file=cdm.get_ct_profile_expr_outpath(),
            ct_expr_df=pcf.get_celltype_expression(),
            force=self.force_dict['perform_deconvolution'],
            outdir=self.outdir)
        pd.start()
        pd.clear_variables()

        # Step 7. Create the covariate matrix.
        print("\n### STEP7 ###\n")
        ccm = CreateCovMatrix(
            settings=self.settings.get_setting('create_cov_matrix'),
            marker_file=cdm.get_markers_outpath(),
            celltype_pcs=pcf.get_celltype_pcs(),
            celltype_cs=pcf.get_celltype_cs(),
            deconvolution=pd.get_deconvolution(),
            sample_order=cgtef.get_sample_order(),
            force=self.force_dict['create_cov_matrix'],
            outdir=self.outdir)
        ccm.start()
        ccm.clear_variables()

        # Execution stops here; the steps below are currently not reached.
        exit()

        # Load the complete dataframes.
        print("\n### LOADING SORTED DATAFRAMES ###\n")
        print("Extracting eQTL dataframe.")
        eqtl_df = cepf.get_eqtlprobes()

        print("Loading genotype dataframe.")
        geno_df = load_dataframe(cm.get_geno_outpath(),
                                 header=0,
                                 index_col=0)

        print("Loading alleles dataframe.")
        alleles_df = load_dataframe(cm.get_alleles_outpath(),
                                    header=0,
                                    index_col=0)

        print("Loading expression dataframe.")
        expr_df = load_dataframe(cm.get_expr_outpath(),
                                 header=0,
                                 index_col=0)

        print("Extracting covariates dataframe.")
        cov_df = ccm.get_covariates()

        # Validate the matrices.
        print("Validating matrices.")
        self.validate(eqtl_df.copy(), geno_df, alleles_df, expr_df, cov_df)

        # Step 8. Create the masked matrices.
        print("\n### STEP8 ###\n")
        cmm = MaskMatrices(
            settings=self.settings.get_setting('mask_matrices'),
            eqtl_df=eqtl_df.copy(),
            geno_df=geno_df.copy(),
            alleles_df=alleles_df.copy(),
            expr_df=expr_df.copy(),
            cov_df=cov_df.copy(),
            force=self.force_dict['mask_matrices'],
            outdir=self.outdir)
        cmm.start()
        del cmm

        # # Step 9. Create the group matrices.
        # print("\n### STEP9 ###\n")
        # cg = CreateGroups(
        #     settings=self.settings.get_setting('create_groups'),
        #     eqtl_df=eqtl_df.copy(),
        #     geno_df=geno_df.copy(),
        #     alleles_df=alleles_df.copy(),
        #     expr_df=expr_df.copy(),
        #     cov_df=cov_df.copy(),
        #     groups_file=cm.get_group_outpath(),
        #     force=self.force_dict['create_groups'],
        #     outdir=self.outdir)
        # cg.start()
        # del cg

        # Step 10. Create the regression matrices.
        print("\n### STEP10 ###\n")
        crm = CreateRegressionMatrix(
            settings=self.settings.get_setting('create_regression_matrix'),
            eqtl_df=eqtl_df.copy(),
            geno_df=geno_df.copy(),
            alleles_df=alleles_df.copy(),
            expr_df=expr_df.copy(),
            force=self.force_dict['create_regression_matrix'],
            outdir=self.outdir)
        crm.start()
        del crm

    @staticmethod
    def validate(eqtl_df, geno_df, alleles_df, expr_df, cov_df):
        # Set the index of the eQTL dataframe for comparison.
        eqtl_df.index = eqtl_df["SNPName"]
        eqtl_df.index.name = "-"

        # Check if the row order is identical.
        if not (eqtl_df.index.identical(geno_df.index)) or \
                not (eqtl_df.index.identical(expr_df.index)) or \
                not (eqtl_df.index.identical(alleles_df.index)):
            print("Row order is not identical.")
            exit()

        # Check if the sample order is identical.
        if not (geno_df.columns.identical(expr_df.columns)) or \
                not (geno_df.columns.identical(cov_df.columns)):
            print("Order of samples is not identical.")
            exit()

        print("\tValid.")
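# A minimal, hypothetical entry-point sketch for the pipeline class above.
# The real command-line wrapper is not shown in this excerpt; the argument
# values below (output name, settings file, disease, force steps) are
# placeholders, not values taken from the source.
if __name__ == "__main__":
    MAIN = Main(name="output",
                settings_file="default.json",
                disease="",
                force_steps=None)
    MAIN.start()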
class Main:
    """
    Main: this class is the main class that calls all other functionality.
    """

    def __init__(self, name, settings_file, alpha, extensions, interest):
        """
        Initializer of the class.

        :param name: string, the name of the base input/output directory.
        :param settings_file: string, the name of the settings file.
        :param alpha: float, the significance cut-off.
        :param extensions: list, the output figure file type extensions.
        :param interest: list, the HGNC names to print.
        """
        # Define the current directory.
        current_dir = str(Path(__file__).parent.parent)

        # Load the LocalSettings singleton class.
        self.settings = LocalSettings(current_dir, settings_file)
        self.covs = self.settings.get_setting("covariates_to_include")
        self.covs_excl_from_overview = [
            x.lower() for x in
            self.settings.get_setting("covariates_excl_from_overview")
        ]
        self.max_url_len = self.settings.get_setting("max_url_length")
        self.maf_cutoff = self.settings.get_setting("maf_cutoff")
        self.include_top_n = self.settings.get_setting("include_top_n")

        # Load the variables.
        self.name = name
        self.alpha = alpha
        self.extensions = extensions
        self.interest = interest

        # Prepare an output directory.
        self.outdir = os.path.join(current_dir, name)
        prepare_output_dir(self.outdir)

    def start(self):
        """
        The method that serves as the pipeline of the whole program.
        """
        print("Starting visualiser.")
        self.print_arguments()

        # Create the dataset object.
        ds = Dataset(name=self.name,
                     settings=self.settings,
                     alpha=self.alpha)

        # Loading data.
        print("Loading dataframes")
        cov_df = ds.get_cov_df()
        inter_zscore_df = ds.get_inter_cov_zscore_df()
        inter_tvalue_df = ds.get_inter_cov_inter_tvalue_df()
        eqtl_df = ds.get_eqtl_df()
        geno_df = ds.get_geno_df()
        alleles_df = ds.get_alleles_df()
        expr_df = ds.get_expr_df()
        signif_cutoff = ds.get_significance_cutoff()

        # Subset the covariates of interest.
        select_cov_df = cov_df.loc[self.covs.keys(), :].copy()
        select_zscore_df = inter_zscore_df.loc[self.covs.keys(), :].copy().T
        select_tvalue_df = inter_tvalue_df.loc[self.covs.keys(), :].copy().T
        del cov_df, inter_zscore_df, inter_tvalue_df

        if not select_cov_df.index.equals(select_zscore_df.columns) or \
                not select_cov_df.index.equals(select_tvalue_df.columns):
            print("Columns do not match.")
            exit()

        print("Analysing interactions")
        data = []
        for i, (index, row) in enumerate(eqtl_df.iterrows()):
            if (i % 250 == 0) or (i == (eqtl_df.shape[0] - 1)):
                print("\tprocessing {}/{} "
                      "[{:.2f}%]".format(i,
                                         (eqtl_df.shape[0] - 1),
                                         (100 / (eqtl_df.shape[0] - 1)) * i))

            # Get the data.
            genotype = geno_df.iloc[i, :].copy()
            expression = expr_df.iloc[i, :].copy()
            (alleles, _) = alleles_df.iloc[i, :].copy()
            zscores = select_zscore_df.iloc[i, :].copy()
            tvalues = select_tvalue_df.iloc[i, :].copy()

            iteration = None
            if "Iteration" in row.index:
                iteration = row["Iteration"]

            gwas_ids = None
            if "GWASIDS" in row.index:
                gwas_ids = row["GWASIDS"]

            traits = None
            if "Trait" in row.index:
                traits = row["Trait"]

            if max(zscores) > signif_cutoff:
                eqtl = Eqtl(index=i,
                            snp_name=row["SNPName"],
                            probe_name=row["ProbeName"],
                            hgnc_name=row["HGNCName"],
                            iteration=iteration,
                            eqtl_zscore=row["OverallZScore"],
                            gwas_ids=gwas_ids,
                            traits=traits,
                            alleles=alleles,
                            signif_cutoff=signif_cutoff,
                            maf_cutoff=self.maf_cutoff,
                            selections=self.covs,
                            genotype=genotype,
                            expression=expression,
                            covariates=select_cov_df.copy(),
                            inter_zscores=zscores,
                            inter_tvalues=tvalues)

                if self.interest is not None and \
                        row["HGNCName"] in self.interest:
                    eqtl.print_info()

                data.extend(eqtl.get_data())
                del eqtl

        # Create the complete dataframe.
        data_df = pd.DataFrame(data,
                               columns=["Index", "SNPName", "ProbeName",
                                        "HGNCName", "Iteration", "N", "MAF",
                                        "eQTL", "Inter", "Covariate",
                                        "Interaction", "Direction",
                                        "GWASIDs", "Traits"])

        # Plot the data.
        print("Creating plots")
        for extension in self.extensions:
            plotter = Plotter(data_df, self.outdir, extension=extension)
            plotter.plot_upsetplot(column="Covariate",
                                   id_col="Index",
                                   exclude=self.covs_excl_from_overview)
            plotter.plot_upsetplot(column="Iteration", id_col="ID")
            plotter.plot_pie(total=eqtl_df.shape[0],
                             part=len(data_df["Index"].unique()))

        # Save data files.
        print("Saving results")
        saver = Saver(data_df, self.outdir, signif_cutoff,
                      self.max_url_len, self.include_top_n)
        saver.save_all(exclude=self.covs_excl_from_overview)
        saver.save_per_iter(exclude=self.covs_excl_from_overview)
        indices_of_interest = saver.save_per_group()
        print("eQTL indices of interest: {}".format(
            ' '.join([str(x) for x in indices_of_interest])))

    def print_arguments(self):
        print("Arguments:")
        print(" > Output directory: {}".format(self.outdir))
        print(" > Alpha: {}".format(self.alpha))
        print(" > Extensions: {}".format(self.extensions))
        print(" > Interest: {}".format(self.interest))
        print(" > Covariates to include: {}".format(self.covs))
        print(" > Covariates to exclude from overview: {}".format(
            self.covs_excl_from_overview))
        print(" > Max URL length: {}".format(self.max_url_len))
        print(" > MAF cutoff: {}".format(self.maf_cutoff))
        print(" > Include top n: {}".format(self.include_top_n))
        print("")
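# A minimal, hypothetical entry-point sketch for the visualiser class above.
# The real command-line wrapper is not shown in this excerpt; the argument
# values (output name, settings file, alpha, extensions, genes of interest)
# are placeholders, not values taken from the source.
if __name__ == "__main__":
    MAIN = Main(name="output",
                settings_file="default.json",
                alpha=0.05,
                extensions=["png"],
                interest=None)
    MAIN.start()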
def __init__(self, name, settings_file, skip_rows, n_eqtls, n_samples,
             verbose):
    """
    Initializer of the class.

    :param name: string, the name of the base input/output directory.
    :param settings_file: string, the name of the settings file.
    :param skip_rows: int, the number of rows to skip.
    :param n_eqtls: int, the number of eQTLs in the input files.
    :param n_samples: int, the number of samples in the input files.
    :param verbose: boolean, whether or not to print all update info.
    """
    # Define the current directory.
    current_dir = str(Path(__file__).parent.parent)

    # Load the LocalSettings singleton class.
    settings = LocalSettings(current_dir, settings_file)

    # Prepare an output directory.
    self.outdir = os.path.join(current_dir, name)
    prepare_output_dir(self.outdir)

    # Save settings.
    input_dir = settings.get_setting("input_dir")
    filenames = settings.get_setting("filenames")
    self.geno_inpath = os.path.join(input_dir, name, filenames["genotype"])
    self.expr_inpath = os.path.join(input_dir, name, filenames["expression"])
    self.cov_inpath = os.path.join(input_dir, name, filenames["covariates"])
    self.drop_covs = settings.get_setting("drop_covariates")
    self.tech_covs = settings.get_setting("technical_covariates")
    self.cov_outdir = settings.get_setting("covariates_folder")
    self.tech_cov_outdir = settings.get_setting(
        "technical_covariates_folder")
    self.perm_orders_filename = settings.get_setting(
        "permutations_order_pickle_filename")
    self.pvalues_filename = settings.get_setting(
        "actual_pvalues_pickle_filename")
    self.snp_tvalues_filename = settings.get_setting(
        "snp_tvalues_pickle_filename")
    self.inter_tvalues_filename = settings.get_setting(
        "inter_tvalues_pickle_filename")
    self.perm_pvalues_filename = settings.get_setting(
        "permuted_pvalues_pickle_filename")
    self.n_permutations = settings.get_setting("n_permutations")
    self.max_end_time = int(time.time()) + \
        settings.get_setting("max_runtime_in_hours") * 60 * 60
    self.panic_time = self.max_end_time - \
        (settings.get_setting("panic_time_in_min") * 60)
    self.skip_rows = skip_rows
    self.n_eqtls = n_eqtls
    self.n_samples = n_samples
    self.verbose = verbose
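# Illustrative sketch only: the settings file read above is not included in
# this excerpt. Judging from the get_setting() calls, it is assumed to
# contain at least the keys below; every value shown is a made-up
# placeholder, not a value from the source.
#
# {
#     "input_dir": "/path/to/input",
#     "filenames": {"genotype": "genotype_table.txt.gz",
#                   "expression": "expression_table.txt.gz",
#                   "covariates": "covariates_table.txt.gz"},
#     "drop_covariates": [],
#     "technical_covariates": [],
#     "covariates_folder": "covariates",
#     "technical_covariates_folder": "technical_covariates",
#     "permutations_order_pickle_filename": "permutation_orders",
#     "actual_pvalues_pickle_filename": "actual_pvalues",
#     "snp_tvalues_pickle_filename": "snp_tvalues",
#     "inter_tvalues_pickle_filename": "inter_tvalues",
#     "permuted_pvalues_pickle_filename": "permuted_pvalues",
#     "n_permutations": 10,
#     "max_runtime_in_hours": 5,
#     "panic_time_in_min": 30
# }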