def __init__(self, name, settings_file, alpha, extensions, interest):
    """
    Initializer of the class.

    :param name: string, the name of the base input/output directory.
    :param settings_file: string, the name of the settings file.
    :param alpha: float, the significance cut-off.
    :param extensions: str, the output figure file type extension.
    :param interest: list, the HGNC names to print.
    """
    # Keep the call arguments.
    self.name = name
    self.alpha = alpha
    self.extensions = extensions
    self.interest = interest

    # The base directory is the parent of the directory holding this file.
    current_dir = str(Path(__file__).parent.parent)

    # Load the settings and pull out the values used by this class.
    self.settings = LocalSettings(current_dir, settings_file)
    get = self.settings.get_setting
    self.covs = get("covariates_to_include")
    excluded = get("covariates_excl_from_overview")
    # The exclusion list is stored lower-cased.
    self.covs_excl_from_overview = [x.lower() for x in excluded]
    self.max_url_len = get("max_url_length")
    self.maf_cutoff = get("maf_cutoff")
    self.include_top_n = get("include_top_n")

    # Prepare an output directory named after `name`.
    self.outdir = os.path.join(current_dir, name)
    prepare_output_dir(self.outdir)
def __init__(self, settings, profile_file, profile_df, ct_expr_file,
             ct_expr_df, force, outdir):
    """
    The initializer for the class.

    :param settings: the settings, indexed by key.
    :param profile_file: string, the datafile containing the celltype
                         profile.
    :param profile_df: DataFrame, the celltype profile.
    :param ct_expr_file: string, the datafile containing expression of
                         the celltype profiles.
    :param ct_expr_df: the celltype expression (presumably a DataFrame,
                       judging by the name; verify against the caller).
    :param force: boolean, whether or not to force the step to redo.
    :param outdir: string, the output directory.
    """
    # Settings describing the sample / cohort file.
    self.sample_cohort_file = settings["sample_cohort_datafile"]
    identifiers = settings["sample_cohort_identifiers"]
    self.sample_id = identifiers["sample"]
    self.cohort_id = identifiers["cohort"]

    # Inputs handed over by the caller.
    self.force = force
    self.profile_file = profile_file
    self.profile_df = profile_df
    self.ct_expr_file = ct_expr_file
    self.ct_expr_df = ct_expr_df

    # Prepare the output directory and the output path inside it.
    self.outdir = os.path.join(outdir, 'perform_deconvolution')
    prepare_output_dir(self.outdir)
    self.outpath = os.path.join(self.outdir, "deconvolution_table.txt.gz")

    # Placeholder for the result.
    self.deconvolution = None
def __init__(self, settings, eqtl_df, geno_df, alleles_df, expr_df,
             cov_df, groups_file, force, outdir):
    """
    The initializer for the class.

    :param settings: the settings, indexed by key.
    :param eqtl_df: DataFrame, the eQTL probes data.
    :param geno_df: DataFrame, the genotype data.
    :param alleles_df: DataFrame, the alleles data.
    :param expr_df: DataFrame, the expression data.
    :param cov_df: DataFrame, the covariate data.
    :param groups_file: string, path to the (pickled) groups file.
    :param force: boolean, whether or not to force the step to redo.
    :param outdir: string, the output directory.
    """
    self.force = force
    self.eqtl_df = eqtl_df
    self.geno_df = geno_df
    self.alleles_df = alleles_df
    self.expr_df = expr_df
    self.cov_df = cov_df

    # Load the groups.
    # NOTE(review): pickle.load executes arbitrary code from the file; the
    # groups file must come from a trusted source.
    with open(groups_file, "rb") as handle:
        groups_data = pickle.load(handle)

    # Keep only the groups that pass the size thresholds from the settings.
    self.groups = self.filter_groups(
        groups_data,
        settings["min_eqtl_in_group"],
        settings["min_samples_in_group"],
    )
    del groups_data

    # Prepare the output directory.
    self.outdir = os.path.join(outdir, 'create_groups')
    prepare_output_dir(self.outdir)
def __init__(self, dataset, outdir, extension):
    """
    The initializer for the class.

    :param dataset: Dataset, the input data.
    :param outdir: string, the output directory.
    :param extension: str, the output figure file type extension.
    """
    self.extension = extension
    self.outdir = os.path.join(outdir, 'inter_eqtl_effect')
    prepare_output_dir(self.outdir)

    # Set the pdf font type used when exporting figures.
    matplotlib.rcParams['pdf.fonttype'] = 42

    # Pull the required data out of the dataset.
    print("Loading data")
    self.eqtl_df = dataset.get_eqtl_df()
    self.geno_df = dataset.get_geno_df()
    self.expr_df = dataset.get_expr_df()
    self.alleles_df = dataset.get_alleles_df()
    self.cov_df = dataset.get_cov_df()
    self.inter_df = dataset.get_inter_cov_zscore_df()
    self.z_score_cutoff = dataset.get_significance_cutoff()
    colormap = dataset.get_colormap()

    # Derive the color maps used for plotting.
    color_maps = self.create_color_map(colormap)
    self.group_color_map, self.value_color_map = color_maps
    self.sex_color_map = dict(Male=colormap["male"],
                              Female=colormap["female"])
def __init__(self, name, settings_file, alpha, plots, top, interest,
             extension, validate):
    """
    Initializer of the class.

    :param name: string, the name of the base input/output directory.
    :param settings_file: string, the name of the settings file.
    :param alpha: float, the significance cut-off.
    :param plots: list, the names of the plots to create.
    :param top: int, the number of top eQTLs to plot.
    :param interest: list, the indices of eQTLs to plot.
    :param extension: str, the output figure file type extension.
    :param validate: boolean, whether or not to validate the input.
    """
    # Keep the call arguments.
    self.name = name
    self.alpha = alpha
    self.plots = plots
    self.top = top
    self.interest = interest
    self.extension = extension
    self.validate = validate

    # The base directory is the parent of the directory holding this file.
    base_dir = str(Path(__file__).parent.parent)

    # Load the settings.
    self.settings = LocalSettings(base_dir, settings_file)

    # Prepare an output directory named after `name`.
    self.outdir = os.path.join(base_dir, name)
    prepare_output_dir(self.outdir)
def __init__(self, dataset, outdir, extension):
    """
    The initializer for the class.

    :param dataset: Dataset, the input data.
    :param outdir: string, the output directory.
    :param extension: str, the output figure file type extension.
    """
    self.extension = extension
    self.outdir = os.path.join(outdir, 'inter_eqtl_effect_deconvolution')
    prepare_output_dir(self.outdir)

    # Set the pdf font type used when exporting figures.
    matplotlib.rcParams['pdf.fonttype'] = 42

    # Pull the required data out of the dataset.
    print("Loading data")
    self.eqtl_df = dataset.get_eqtl_df()
    self.geno_df = dataset.get_geno_df()
    self.expr_df = dataset.get_expr_df()
    self.alleles_df = dataset.get_alleles_df()
    self.cov_df = dataset.get_cov_df()
    self.inter_df = dataset.get_inter_cov_zscore_df()
    self.celltypes = dataset.get_celltypes()
    self.cellmap_methods = dataset.get_cellmap_methods()
    self.marker_genes = dataset.get_marker_genes()

    # Build the color maps last, after all data attributes are set.
    color_maps = self.create_color_map()
    self.group_color_map, self.value_color_map = color_maps
def __init__(self, dataset, outdir, extension):
    """
    The initializer for the class.

    :param dataset: Dataset, the input data.
    :param outdir: string, the output directory.
    :param extension: str, the output figure file type extension.
    """
    self.extension = extension
    self.outdir = os.path.join(outdir, 'inter_eqtl_celltype_details')
    prepare_output_dir(self.outdir)

    # Set the pdf font type used when exporting figures.
    matplotlib.rcParams['pdf.fonttype'] = 42

    # Pull the required data out of the dataset.
    print("Loading data")
    self.eqtl_df = dataset.get_eqtl_df()
    self.geno_df = dataset.get_geno_df()
    self.zscore_df = dataset.get_inter_cov_zscore_df()
    self.tvalue_df = dataset.get_inter_cov_inter_tvalue_df()
    self.cellmap_methods = dataset.get_cellmap_methods()
    self.marker_genes = dataset.get_marker_genes()
    self.z_score_cutoff = dataset.get_significance_cutoff()
    self.colormap = dataset.get_colormap()
def __init__(self, settings, profile_file, profile_df, ct_expr_file, force,
             outdir):
    """
    The initializer for the class.

    :param settings: the settings (not read by this initializer).
    :param profile_file: string, the datafile containing the celltype
                         profile.
    :param profile_df: DataFrame, the celltype profile.
    :param ct_expr_file: string, the datafile containing expression of
                         the celltype profiles.
    :param force: boolean, whether or not to force the step to redo.
    :param outdir: string, the output directory.
    """
    self.force = force
    self.profile_file = profile_file
    self.profile_df = profile_df
    self.ct_expr_file = ct_expr_file

    # Prepare the output directory and the two output paths inside it.
    step_dir = os.path.join(outdir, 'perform_celltype_factorization')
    self.outdir = step_dir
    prepare_output_dir(step_dir)
    self.pca_outpath = os.path.join(step_dir, "celltype_pca.txt.gz")
    self.nmf_outpath = os.path.join(step_dir, "celltype_nmf.txt.gz")

    # Placeholders for the results.
    self.celltype_expression = None
    self.celltype_pcs = None
    self.celltype_cs = None
def __init__(self, settings, eqtl_df, geno_df, alleles_df, expr_df,
             cov_df, force, outdir):
    """
    The initializer for the class.

    :param settings: the settings (not read by this initializer).
    :param eqtl_df: DataFrame, the eQTL data.
    :param geno_df: DataFrame, the genotype data.
    :param alleles_df: DataFrame, the alleles data.
    :param expr_df: DataFrame, the expression data.
    :param cov_df: DataFrame, the covariate data.
    :param force: boolean, whether or not to force the step to redo.
    :param outdir: string, the output directory.
    """
    self.force = force

    # Keep references to the input matrices.
    self.eqtl_df = eqtl_df
    self.geno_df = geno_df
    self.alleles_df = alleles_df
    self.expr_df = expr_df
    self.cov_df = cov_df

    # Prepare the output directory.
    step_dir = os.path.join(outdir, 'mask_matrices')
    self.outdir = step_dir
    prepare_output_dir(step_dir)
def __init__(self, settings, disease, force, outdir):
    """
    The initializer for the class.

    :param settings: the settings, indexed by key.
    :param disease: string, the name of the disease to analyse.
    :param force: boolean, whether or not to force the step to redo.
    :param outdir: string, the output directory.
    """
    # Values taken from the settings.
    self.indir = settings["input_directory"]
    self.iter_dirname = settings["iteration_dirname"]
    self.in_filename = settings["in_filename"]
    self.n_iterations = settings["iterations"]
    self.snp_to_gwasid_filename = settings["snp_to_gwasid_filename"]
    self.gwasid_to_trait_filename = settings["gwasid_to_trait_filename"]

    # Values handed over by the caller.
    self.disease = disease
    self.force = force

    # Prepare the output directory and the output path inside it.
    step_dir = os.path.join(outdir, 'combine_eqtlprobes')
    self.outdir = step_dir
    prepare_output_dir(step_dir)
    self.outpath = os.path.join(step_dir, "eQTLprobes_combined.txt.gz")

    # Placeholder for the result.
    self.eqtl_probes = None
def __init__(self, name, settings_file, skip_rows, n_eqtls, n_samples,
             verbose):
    """
    Initializer of the class.

    :param name: string, the name of the base input/output directory.
    :param settings_file: string, the name of the settings file.
    :param skip_rows: int, the number of rows to skip.
    :param n_eqtls: int, the number of eqtls in the input files.
    :param n_samples: int, the number of samples in the input files.
    :param verbose: boolean, whether or not to print all update info.
    """
    # Keep the call arguments.
    self.skip_rows = skip_rows
    self.n_eqtls = n_eqtls
    self.n_samples = n_samples
    self.verbose = verbose

    # The base directory is the parent of the directory holding this file.
    base_dir = str(Path(__file__).parent.parent)

    # Load the settings.
    settings = LocalSettings(base_dir, settings_file)
    get = settings.get_setting

    # Prepare an output directory named after `name`.
    self.outdir = os.path.join(base_dir, name)
    prepare_output_dir(self.outdir)

    # Input file locations: <input_dir>/<name>/<filename>.
    data_dir = os.path.join(get("input_dir"), name)
    filenames = get("filenames")
    self.geno_inpath = os.path.join(data_dir, filenames["genotype"])
    self.expr_inpath = os.path.join(data_dir, filenames["expression"])
    self.cov_inpath = os.path.join(data_dir, filenames["covariates"])

    # Covariate handling and output folder / file names.
    self.drop_covs = get("drop_covariates")
    self.tech_covs = get("technical_covariates")
    self.cov_outdir = get("covariates_folder")
    self.tech_cov_outdir = get("technical_covariates_folder")
    self.perm_orders_filename = get("permutations_order_pickle_filename")
    self.pvalues_filename = get("actual_pvalues_pickle_filename")
    self.snp_tvalues_filename = get("snp_tvalues_pickle_filename")
    self.inter_tvalues_filename = get("inter_tvalues_pickle_filename")
    self.perm_pvalues_filename = get("permuted_pvalues_pickle_filename")
    self.n_permutations = get("n_permutations")

    # Time limits, as epoch seconds: the hard end time and an earlier
    # "panic" time that lies panic_time_in_min minutes before it.
    now = int(time.time())
    self.max_end_time = now + get("max_runtime_in_hours") * 60 * 60
    panic_margin = get("panic_time_in_min") * 60
    self.panic_time = self.max_end_time - panic_margin
def __init__(self, df, outdir, extension="png"):
    """
    The initializer for the class.

    :param df: the input data; stored after being passed through
               self.set_df.
    :param outdir: string, the output directory; a 'plots' sub-directory
                   is prepared inside it.
    :param extension: str, the output figure file type extension.
    """
    self.df = self.set_df(df)
    self.extension = extension

    plot_dir = os.path.join(outdir, 'plots')
    self.outdir = plot_dir
    prepare_output_dir(plot_dir)

    # The pdf font type only matters when exporting pdf files.
    if extension == "pdf":
        matplotlib.rcParams['pdf.fonttype'] = 42
def __init__(self, dataset, outdir, extension):
    """
    The initializer for the class.

    :param dataset: Dataset, the input data.
    :param outdir: string, the output directory.
    :param extension: str, the output figure file type extension.
    """
    self.extension = extension
    self.outdir = os.path.join(outdir, 'inter_zscore_dist')
    prepare_output_dir(self.outdir)

    # Set the pdf font type used when exporting figures.
    matplotlib.rcParams['pdf.fonttype'] = 42

    # Pull the required data out of the dataset.
    print("Loading data")
    self.inter_df = dataset.get_inter_cov_zscore_df()
    self.z_score_cutoff = dataset.get_significance_cutoff()
def __init__(self, dataset, outdir, extension):
    """
    The initializer for the class.

    :param dataset: Dataset, the input data.
    :param outdir: string, the output directory.
    :param extension: str, the output figure file type extension.
    """
    self.extension = extension
    self.outdir = os.path.join(outdir, 'covariate_clustermap')
    prepare_output_dir(self.outdir)

    # Set the pdf font type used when exporting figures.
    matplotlib.rcParams['pdf.fonttype'] = 42

    # Pull the required data out of the dataset.
    print("Loading data")
    self.cov_df = dataset.get_cov_df()
    self.cmap = dataset.get_diverging_cmap()
def __init__(self, settings_file, groups, force, verbose):
    """
    Initializer of the class.

    :param settings_file: string, the name of the settings file.
    :param groups: list, the names of groups to analyse; None means the
                   input directory itself is used as the only input.
    :param force: boolean, whether or not to force to redo each step.
    :param verbose: boolean, whether or not to print each step.
    """
    # The base directory is the parent of the directory holding this file.
    base_dir = str(Path(__file__).parent.parent)

    # Load the settings.
    settings = LocalSettings(base_dir, settings_file)
    get = settings.get_setting

    # Store settings and arguments.
    self.indir = get("input_dir")
    self.tech_covs = get("technical_covariates")
    self.eqtl_ia = get("eQTLInteractionAnalyser")
    self.inter_regex = get("interaction_regex")
    self.groups = groups
    self.force = force
    self.verbose = verbose

    # Prepare an output directory.
    self.outdir = os.path.join(base_dir, get("output_dir"))
    prepare_output_dir(self.outdir)

    # Decide which input directories to work on.
    if self.groups is None:
        self.group_indirs = [self.indir]
    else:
        # Candidate group directories are those matching 'group_*'.
        candidates = glob.glob(os.path.join(self.indir, 'group_*'))
        self.group_indirs = self.filter_groups(candidates)

    # Store the filenames.
    filenames = get("filenames")
    self.eqtl_filename = filenames["eqtl"]
    self.geno_filename = filenames["genotype"]
    self.expr_filename = filenames["expression"]
    self.cov_filename = filenames["covariate"]
def __init__(self, settings, marker_file, celltype_pcs, celltype_cs,
             deconvolution, sample_order, force, outdir):
    """
    The initializer for the class.

    :param settings: the settings, indexed by key.
    :param marker_file: string, path to the marker file.
    :param celltype_pcs: DataFrame, the first component from PCA of each
                         celltype expression.
    :param celltype_cs: DataFrame, the first component from NMF of each
                        celltype expression.
    :param deconvolution: DataFrame, the estimated cell count proportions
                          of each celltype per sample.
    :param sample_order: list, order of samples.
    :param force: boolean, whether or not to force the step to redo.
    :param outdir: string, the output directory.
    """
    # Values taken from the settings.
    self.cov_file = settings["covariate_datafile"]
    self.tech_covs = settings["technical_covariates"]
    self.cohorts = settings["cohorts"]
    self.ref_cohort = settings["reference_cohort"]
    self.pheno_file = settings["phenotype_datafile"]
    self.eig_file = settings["eigenvectors_datafile"]
    self.n_eigen = settings["num_eigenvectors"]
    self.eig_bef_cov_corr_file = \
        settings["eigenvectors_before_cov_corr_datafile"]

    # Values handed over by the caller.
    self.force = force
    self.marker_file = marker_file
    self.sample_order = sample_order
    self.celltype_pcs = celltype_pcs
    self.celltype_cs = celltype_cs
    self.deconvolution = deconvolution

    # Prepare the output directory and the output path inside it.
    step_dir = os.path.join(outdir, 'create_cov_matrix')
    self.outdir = step_dir
    prepare_output_dir(step_dir)
    self.outpath = os.path.join(step_dir, "covariates_table.txt.gz")

    # Placeholder for the result and the sex encoding (NaN maps to -1).
    self.covariates = None
    self.sex_dict = {"M": 0, "F": 1, np.nan: -1}
def __init__(self, settings, force, outdir):
    """
    The initializer for the class.

    :param settings: the settings, indexed by key.
    :param force: boolean, whether or not to force the step to redo.
    :param outdir: string, the output directory.
    """
    # The input path is the input directory joined with the filename
    # pattern from the settings.
    self.inpath = os.path.join(settings["input_directory"],
                               settings["filename_regex"])
    self.force = force

    # Prepare the output directory and the output path inside it.
    step_dir = os.path.join(outdir, 'combine_gte_files')
    self.outdir = step_dir
    prepare_output_dir(step_dir)
    self.outpath = os.path.join(step_dir, "GTE_combined.txt.gz")

    # Placeholders for the results.
    self.gte = None
    self.sample_dict = None
    self.sample_order = None
def __init__(self, settings_file, groups, force):
    """
    Initializer of the class.

    :param settings_file: string, the name of the settings file.
    :param groups: list, the names of groups to analyse.
    :param force: boolean, whether or not to force to redo each step.
    """
    # The base directory is the parent of the directory holding this file.
    base_dir = str(Path(__file__).parent.parent)

    # Load the settings.
    settings = LocalSettings(base_dir, settings_file)
    get = settings.get_setting

    # Store the settings. filter_groups is called only after the input
    # locations are set, as in the original statement order.
    self.eqtl_inpath = get("eqtl_datafile")
    self.cov_inpath = get("cov_datafile")
    self.data_indir = get("data_dir")
    self.g_data_indir = get("groups_data_dir")
    self.g_inter_indir = get("inter_groups_dir")
    self.inter_regex = get("interaction_regex")
    self.group_ids = self.filter_groups(groups)
    self.celltypes = get("celltypes")
    self.force = force

    # Prepare an output directory.
    self.outdir = os.path.join(base_dir, get("output_dir"))
    prepare_output_dir(self.outdir)

    # Store the filenames.
    filenames = get("filenames")
    self.obj_filename = filenames["object"]
    self.eqtl_filename = filenames["eqtl"]
    self.geno_filename = filenames["genotype"]
    self.alleles_filename = filenames["alleles"]
    self.expr_filename = filenames["expression"]
    self.cov_filename = filenames["covariates"]
    self.inter_filename = filenames["interaction"]
    self.markers_filename = filenames["markers"]
def __init__(self, name, settings_file, disease, force_steps):
    """
    Initializer of the class.

    :param name: string, the name of the base input/output directory.
    :param settings_file: string, the name of the settings file.
    :param disease: string, the name of the disease to analyse.
    :param force_steps: list, the names of the steps to force to redo.
    """
    # The base directory is the parent of the directory holding this file.
    base_dir = str(Path(__file__).parent.parent)

    # Load the settings.
    self.settings = LocalSettings(base_dir, settings_file)

    # Store the arguments; the force steps are converted by
    # create_force_dict.
    self.disease = disease
    self.force_dict = self.create_force_dict(force_steps)

    # Prepare an output directory named after `name`.
    target_dir = os.path.join(base_dir, name)
    self.outdir = target_dir
    prepare_output_dir(target_dir)
def __init__(self, settings, expr_file, expr_df, sample_dict, sample_order,
             force, outdir):
    """
    The initializer for the class.

    :param settings: the settings, indexed by key.
    :param expr_file: string, the expression data file.
    :param expr_df: DataFrame, the complete expression dataframe.
    :param sample_dict: dictionary, a dictionary for translating unmasked
                        samples to the same format.
    :param sample_order: list, order of samples.
    :param force: boolean, whether or not to force the step to redo.
    :param outdir: string, the output directory.
    """
    # Values taken from the settings.
    self.decon_expr_file = settings["decon_expression_datafile"]
    self.celltype_profile_file = settings["celltype_profile_datafile"]
    self.translate_file = settings["translate_datafile"]
    self.marker_genes_suffix = settings["marker_genes_suffix"]
    self.marker_dict = settings["marker_dict"]

    # Values handed over by the caller.
    self.force = force
    self.expr_file = expr_file
    self.expr_df = expr_df
    self.sample_dict = sample_dict
    self.sample_order = sample_order

    # Prepare the output directory.
    step_dir = os.path.join(outdir, 'create_deconvolution_matrices')
    self.outdir = step_dir
    prepare_output_dir(step_dir)

    # Construct the output paths.
    self.decon_expr_outpath = os.path.join(step_dir,
                                           "decon_expr_table.txt.gz")
    self.ct_profile_expr_outpath = os.path.join(
        step_dir, "ct_profile_expr_table.txt.gz")
    self.markers_outpath = os.path.join(step_dir, "marker_genes.txt.gz")

    # Placeholder for the result.
    self.celltype_profile = None
def save_per_group(self):
    """
    Save self.df split per interaction, per covariate and per direction.

    For every unique value in the "Interaction" column a sub-directory
    '<interaction>_interaction' is prepared inside self.outdir. Inside it
    one file per covariate and one file per covariate / direction
    ("up", "down") is written through self.save.

    An empty subset is skipped at every level. (Previously an empty
    interaction subset, e.g. for a NaN interaction value, returned
    immediately, which dropped all remaining interactions and returned
    an unsorted list that could contain duplicates.)

    :return indices_of_interest: list, the sorted unique values of the
                                 "Index" column over the first self.top
                                 rows of every direction subset.
    """
    indices_of_interest = set()

    for interaction in self.df["Interaction"].unique():
        inter_df = self.df.loc[self.df["Interaction"] == interaction,
                               :].copy()
        inter_df.drop(["Interaction"], axis=1, inplace=True)
        if len(inter_df.index) == 0:
            # Nothing to save for this interaction; go on with the next.
            continue

        out_dir = os.path.join(self.outdir,
                               '{}_interaction'.format(interaction))
        prepare_output_dir(out_dir)

        for covariate in inter_df["Covariate"].unique():
            cov_df = inter_df.loc[inter_df["Covariate"] == covariate,
                                  :].copy()
            cov_df.drop(["Covariate"], axis=1, inplace=True)
            if len(cov_df.index) == 0:
                continue

            # Save all rows of this covariate.
            fpath = os.path.join(out_dir,
                                 "{}_{}.txt".format(interaction, covariate))
            self.save(cov_df, fpath, self.max_url_len, self.signif_cutoff)

            for direction in ["up", "down"]:
                dir_df = cov_df.loc[cov_df["Direction"] == direction,
                                    :].copy()
                dir_df.drop(["Direction"], axis=1, inplace=True)
                if len(dir_df.index) == 0:
                    continue

                # Save the rows of this covariate in one direction.
                fpath = os.path.join(out_dir,
                                     "{}_{}_{}.txt".format(interaction,
                                                           covariate,
                                                           direction))
                self.save(dir_df, fpath, self.max_url_len,
                          self.signif_cutoff)

                # Remember the top rows of this subset.
                indices_of_interest.update(dir_df["Index"][:self.top])

    return sorted(indices_of_interest)
def __init__(self, settings, eqtl_df, geno_df, alleles_df, expr_df, force,
             outdir):
    """
    The initializer for the class.

    :param settings: the settings (not read by this initializer).
    :param eqtl_df: DataFrame, the eQTL probes data.
    :param geno_df: DataFrame, the genotype data.
    :param alleles_df: DataFrame, the alleles data.
    :param expr_df: DataFrame, the expression data.
    :param force: boolean, whether or not to force the step to redo.
    :param outdir: string, the output directory.
    """
    self.force = force

    # Keep references to the input matrices.
    self.eqtl_df = eqtl_df
    self.geno_df = geno_df
    self.alleles_df = alleles_df
    self.expr_df = expr_df

    # Prepare the output directory. Only the output path is stored on the
    # instance, not the directory itself.
    step_dir = os.path.join(outdir, 'create_regression_matrix')
    prepare_output_dir(step_dir)
    self.outpath = os.path.join(step_dir, "regression_table.txt.gz")
def __init__(self, settings, gte_df, sample_dict, sample_order, eqtl_df,
             force, outdir):
    """
    The initializer for the class.

    :param settings: the settings, indexed by key.
    :param gte_df: DataFrame, the combined GTE files in a dataframe.
    :param sample_dict: dictionary, a dictionary for translating unmasked
                        samples to the same format.
    :param sample_order: list, order of samples.
    :param eqtl_df: DataFrame, the combined eQTL probe files in a
                    dataframe.
    :param force: boolean, whether or not to force the step to redo.
    :param outdir: string, the output directory.
    """
    # Values taken from the settings.
    self.geno_file = settings["genotype_datafile"]
    self.expr_file = settings["expression_datafile"]

    # Values handed over by the caller.
    self.force = force
    self.gte_df = gte_df
    self.sample_dict = sample_dict
    self.sample_order = sample_order
    self.eqtl_df = eqtl_df

    # Prepare the output directory.
    step_dir = os.path.join(outdir, 'create_matrices')
    self.outdir = step_dir
    prepare_output_dir(step_dir)

    # Construct the output paths.
    self.geno_outpath = os.path.join(step_dir, "genotype_table.txt.gz")
    self.alleles_outpath = os.path.join(step_dir,
                                        "genotype_alleles.txt.gz")
    self.expr_outpath = os.path.join(step_dir, "expression_table.txt.gz")

    # Placeholder for the result.
    self.complete_expr_matrix = None
def start(self):
    """
    Method to start the manager.

    Loads (or creates and stores) the permutation orders, runs the
    analyses through self.work, pickles the results of the technical
    covariate and covariate containers, and prints timing information.
    """
    self.print_arguments()
    print("Starting Custom Interaction Analyser "
          "[{}]".format(datetime.now().strftime("%d-%m-%Y, %H:%M:%S")))

    # Start the timer (whole seconds).
    start_time = int(time.time())

    # Get the permutation orders.
    permutation_orders = None
    perm_orders_outfile = os.path.join(self.outdir,
                                       self.perm_orders_filename + ".pkl")
    if check_file_exists(perm_orders_outfile):
        print("Loading permutation order")
        permutation_orders = self.load_pickle(perm_orders_outfile)

        # The stored orders are only usable when there is one order per
        # permutation plus one, and every order covers all samples.
        is_valid = (
            len(permutation_orders) == (self.n_permutations + 1)
            and all(len(order) == self.n_samples
                    for order in permutation_orders)
        )
        if is_valid:
            print("\tvalid")
        else:
            print("\tinvalid")
            permutation_orders = None

    if permutation_orders is None:
        print("Creating permutation order")
        permutation_orders = self.create_perm_orders()
        self.dump_pickle(permutation_orders, self.outdir,
                         self.perm_orders_filename)

    # Start the work.
    print("Start the analyses", flush=True)
    storage = self.work(permutation_orders)
    tc_container = storage.get_tech_cov_container()
    c_container = storage.get_cov_container()

    print("Saving output files", flush=True)
    filename_suffix = "{}_{}".format(self.skip_rows, self.n_eqtls)
    for container, outdir in zip([tc_container, c_container],
                                 [self.tech_cov_outdir, self.cov_outdir]):
        full_outdir = os.path.join(self.outdir, outdir)
        prepare_output_dir(full_outdir)

        # Each getter is only called right before its result is dumped.
        outputs = [
            (container.get_pvalues, self.pvalues_filename),
            (container.get_snp_tvalues, self.snp_tvalues_filename),
            (container.get_inter_tvalues, self.inter_tvalues_filename),
            (container.get_perm_pvalues, self.perm_pvalues_filename),
        ]
        for getter, filename in outputs:
            self.dump_pickle(getter(), full_outdir, filename,
                             filename_suffix=filename_suffix,
                             subdir=True, unique=True)

    # Print the process time.
    run_time = int(time.time()) - start_time
    run_time_min, run_time_sec = divmod(run_time, 60)
    run_time_hour, run_time_min = divmod(run_time_min, 60)
    print("Finished in {} hour(s), {} minute(s) and "
          "{} second(s)".format(int(run_time_hour),
                                int(run_time_min),
                                int(run_time_sec)))

    # run_time is in whole seconds and is 0 for runs shorter than one
    # second; count at least one second to avoid a division by zero.
    elapsed_minutes = max(run_time, 1) / 60
    print("Received {:.2f} analyses per minute".format(
        (self.n_eqtls * (self.n_permutations + 1)) / elapsed_minutes))

    # Shutdown the manager.
    print("Shutting down manager [{}]".format(
        datetime.now().strftime("%d-%m-%Y, %H:%M:%S")), flush=True)
def start(self):
    """
    Create one plot per eQTL and per selected method through
    self.plot_forest.

    For every row of self.eqtl_df the genotype row at the same position
    in self.geno_df decides whether the t-values are sign-flipped. The
    t-values and z-scores of the rows whose index starts with the method
    prefix are then combined and plotted into a per-eQTL output
    directory.
    """
    print("Plotting interaction eQTL radar plots.")
    self.print_arguments()

    # Work on a copy: appending to self.cellmap_methods directly would
    # grow the shared list on every call.
    methods = list(self.cellmap_methods)
    methods.append((self.marker_genes, ""))

    n_eqtls = self.eqtl_df.shape[0]

    print("Iterating over eQTLs.")
    for i, (index, row) in enumerate(self.eqtl_df.iterrows()):
        # Extract the useful information from the row.
        snp_name = row["SNPName"]
        probe_name = row["ProbeName"]
        hgnc_name = row["HGNCName"]

        print("\tWorking on: {}\t{}\t{} [{}/{} "
              "{:.2f}%]".format(snp_name, probe_name, hgnc_name,
                                i + 1, n_eqtls,
                                (100 / n_eqtls) * (i + 1)))

        # Check if we need to flip the genotypes: count the alleles on
        # both sides (heterozygotes count once for each side) and flip
        # when the genotype-2 side has more.
        genotype = self.geno_df.iloc[i, :]
        counts = genotype.value_counts()
        for x in [0.0, 1.0, 2.0]:
            if x not in counts:
                counts.loc[x] = 0
        zero_geno_count = (counts[0.0] * 2) + counts[1.0]
        two_geno_count = (counts[2.0] * 2) + counts[1.0]
        flip = -1 if two_geno_count > zero_geno_count else 1

        # Prepare output directory.
        eqtl_outdir = os.path.join(self.outdir,
                                   "{}_{}_{}_{}".format(index, snp_name,
                                                        probe_name,
                                                        hgnc_name))
        prepare_output_dir(eqtl_outdir)

        # Iterate over the methods; only CellMapNNLS is plotted.
        for (prefix, suffix) in methods:
            if prefix != "CellMapNNLS_":
                continue

            name = prefix.replace("_", "") + suffix

            # The (possibly flipped) t-values of this eQTL.
            tvalues = self.tvalue_df.loc[
                      self.tvalue_df.index.str.startswith(prefix), :].copy()
            tvalues = tvalues.iloc[:, i]
            tvalues = tvalues * flip
            tvalues = tvalues.to_frame()

            # The z-scores of this eQTL.
            zscores = self.zscore_df.loc[
                      self.zscore_df.index.str.startswith(prefix), :].copy()
            zscores = zscores.iloc[:, i].to_frame()

            df = tvalues.merge(zscores, left_index=True, right_index=True)
            df.columns = ["tvalue", "zscore"]
            # Strip the method prefix / suffix from the row labels.
            df.index = ["{}".format(x.replace(prefix, "").replace(suffix, ""))
                        for x in df.index]

            self.plot_forest(hgnc_name, name, df, self.z_score_cutoff,
                             eqtl_outdir, self.extension)
def start(self):
    """
    The method that serves as the pipeline of the whole program.

    For every group input directory: prepare the input / output
    directories, skip the group when an interaction matrix is already
    present (unless forced), convert the genotype, expression and
    covariate files to binary format, prepare the eQTL file and finally
    execute the interaction analyser.
    """
    print("Starting interaction analyser.")
    self.print_arguments()

    n_groups = len(self.group_indirs)
    binary_suffixes = (".binary.dat",
                       ".binary.rows.txt",
                       ".binary.columns.txt")

    # Loop over the groups.
    print("Performing interaction analyses.")
    for i, group_indir in enumerate(self.group_indirs):
        # Prepare the input and output directories.
        if self.groups is None:
            group_id = ""
            group_outdir = self.outdir
        else:
            group_id = get_leaf_dir(group_indir)
            group_outdir = os.path.join(self.outdir, group_id)

        ia_indir = os.path.join(group_outdir, 'input')
        ia_outdir = os.path.join(group_outdir, 'output')
        for directory in (group_outdir, ia_indir, ia_outdir):
            prepare_output_dir(directory)

        # Stop if we already have the interaction matrix; the output
        # directory is only searched when the step is not forced.
        if not self.force and any(
                re.match(self.inter_regex, get_basename(path))
                for path in glob.glob(os.path.join(ia_outdir, "*"))):
            continue

        print("\tWorking on: {:15s} [{}/{} "
              "{:.2f}%]".format(group_id, i + 1, n_groups,
                                (100 / n_groups) * (i + 1)))

        # Prepare the input files expected by the analyser.
        self.print_string("\n### STEP1 ###\n")
        inputs = zip(["Genotypes", "Expression", "Covariates"],
                     [self.geno_filename,
                      self.expr_filename,
                      self.cov_filename])
        for exp_ia_infile, filename in inputs:
            # Check if the binary files already exist.
            binary_files = [os.path.join(ia_indir, exp_ia_infile + suffix)
                            for suffix in binary_suffixes]
            all_present = all(check_file_exists(path)
                              for path in binary_files)
            if all_present and not self.force:
                self.print_string(
                    "Skipping {} preparation.".format(filename))
                continue

            self.print_string("\nPreparing {}.".format(filename))

            # Define the filenames.
            compr_file = os.path.join(self.indir, group_id,
                                      filename + '.txt.gz')
            copy_file = os.path.join(ia_indir, filename + '.txt.gz')
            uncompr_file = os.path.join(ia_indir, filename + '.txt')
            bin_file = os.path.join(ia_indir, exp_ia_infile + ".binary")

            # Copy and decompress the file.
            self.print_string("\nCopying the input files.")
            self.copy_file(compr_file, copy_file)
            self.print_string("\nDecompressing the input files.")
            self.decompress(copy_file)

            # Convert to binary.
            self.print_string("\nConverting files to binary format.")
            self.convert_to_binary(uncompr_file, bin_file)

            # Remove the uncompressed file.
            self.print_string("\nRemoving uncompressed files.")
            if check_file_exists(uncompr_file):
                self.print_string("\tos.remove({})".format(uncompr_file))
                os.remove(uncompr_file)

        # Prepare the eQTL file.
        self.print_string("\n### STEP2 ###\n")
        eqtl_file = os.path.join(ia_indir, self.eqtl_filename + '.txt')
        if check_file_exists(eqtl_file) and not self.force:
            self.print_string("Skipping eqtl preparation.")
        else:
            self.print_string("\nPreparing eQTL file.")

            # Define the filenames.
            compr_file = os.path.join(self.indir, group_id,
                                      self.eqtl_filename + '.txt.gz')
            copy_file = os.path.join(ia_indir,
                                     self.eqtl_filename + '.txt.gz')

            # Copy and decompress the file.
            self.print_string("\nCopying the input files.")
            self.copy_file(compr_file, copy_file)
            self.print_string("\nDecompressing the input files.")
            self.decompress(copy_file)

        # Execute the program.
        self.print_string("\n### STEP3 ###\n")
        self.print_string("Executing the eQTLInteractionAnalyser.")
        self.execute(ia_indir, ia_outdir, eqtl_file)
def start(self):
    """
    Write the group object and the per-group tables of every group.

    For each group in self.groups a directory named after the group id is
    prepared inside self.outdir. Every output file is only written when
    it does not exist yet or when self.force is set.
    """
    print("Creating groups.")
    n_groups = len(self.groups)
    for i, (group_id, group_obj) in enumerate(self.groups.items()):
        print(" Working on: {:10s} [{}/{} "
              "{:.2f}%]".format(group_id, i + 1, n_groups,
                                (100 / n_groups) * (i + 1)))

        # Create the group dir.
        group_dir = os.path.join(self.outdir, group_id)
        prepare_output_dir(group_dir)

        # Save the group object itself.
        group_object = os.path.join(group_dir, "group.pkl")
        if not check_file_exists(group_object) or self.force:
            with open(group_object, "wb") as handle:
                pickle.dump(group_obj, handle)
            print("\tSaved group object: "
                  "{}".format(get_basename(group_object)))

        # Get the group indices.
        snp_mask = group_obj.get_snp_indices()
        sample_mask = group_obj.get_sample_indices()

        # (filename, function selecting the group subset, save the index?)
        # The selectors are called within this iteration only, so they
        # always see the masks of the current group.
        tables = [
            ("eqtl_table.txt.gz",
             lambda: self.eqtl_df.iloc[snp_mask, :], False),
            ("genotype_table.txt.gz",
             lambda: self.geno_df.iloc[snp_mask, sample_mask], True),
            ("genotype_alleles.txt.gz",
             lambda: self.alleles_df.iloc[snp_mask, :], True),
            ("expression_table.txt.gz",
             lambda: self.expr_df.iloc[snp_mask, sample_mask], True),
            ("covariates_table.txt.gz",
             lambda: self.cov_df.iloc[:, sample_mask], True),
        ]

        # Check if each output file exists, if not, create it.
        for filename, select, with_index in tables:
            outpath = os.path.join(group_dir, filename)
            if not check_file_exists(outpath) or self.force:
                subset = select().copy()
                save_dataframe(outpath=outpath, df=subset,
                               index=with_index, header=True)
                del subset
def __init__(self, dataset, outdir, extension):
    """
    The initializer for the class.

    :param dataset: Dataset, the input data.
    :param outdir: string, the output directory.
    :param extension: str, the output figure file type format.
    """
    self.extension = extension
    self.outdir = os.path.join(outdir, 'covariates_explained_by_others')
    prepare_output_dir(self.outdir)

    # Set the pdf font type used when exporting figures.
    matplotlib.rcParams['pdf.fonttype'] = 42

    # Pull the required data out of the dataset.
    print("Loading data")
    self.groups = dataset.get_groups()
    self.cov_df = dataset.get_cov_df()
    self.colormap = self.create_color_map()

    # The covariates treated as technical covariates.
    alignment_metrics = [
        "PCT_CODING_BASES",
        "PCT_MRNA_BASES",
        "PCT_INTRONIC_BASES",
        "MEDIAN_3PRIME_BIAS",
        "PCT_USABLE_BASES",
        "PCT_INTERGENIC_BASES",
        "PCT_UTR_BASES",
        # "PF_HQ_ALIGNED_READS" is deliberately left out.
        "PCT_READS_ALIGNED_IN_PAIRS",
        "PCT_CHIMERAS",
        "PF_READS_IMPROPER_PAIRS",
        "PF_HQ_ALIGNED_Q20_BASES",
        "PF_HQ_ALIGNED_BASES",
        "PCT_PF_READS_IMPROPER_PAIRS",
        "PF_READS_ALIGNED",
        "avg_mapped_read_length",
        "avg_input_read_length",
        "uniquely_mapped",
        "total_reads",
        "Total.Sequences_R1",
    ]
    mds_components = ["MDS1", "MDS2", "MDS3", "MDS4"]
    cohorts = [
        "AMPAD-MAYO-V2-EUR",
        "AMPAD-MSBB-V2-EUR",
        "BrainGVEX-V2-EUR",
        "CMC_HBCC_set2-EUR",
        "CMC_HBCC_set3-EUR",
        "CMC-EUR",
        "ENA-EUR",
        "GTEx-EUR",
        "GVEX-EUR",
        "LIBD_1M-EUR",
        "LIBD_h650-EUR",
        "NABEC-H550-EUR",
        "NABEC-H610-EUR",
        "TargetALS-EUR",
        "UCLA_ASD-EUR",
        # "AMPAD-ROSMAP-V2-EUR" is deliberately left out.
    ]
    self.tech_covs = alignment_metrics + mds_components + cohorts