def start(self): self.log.info("Filtering technical covariates datafile.") self.print_arguments() # Check if output file exist. if not check_file_exists(self.outpath) or self.force: # Load the sample info. self.log.info("Loading covariates matrix.") cov_df = load_dataframe(inpath=self.cov_file, header=0, index_col=0, logger=self.log) # Filter on samples and technical covariates. self.log.info("Filtering on samples and technical covariates.") cov_df.index = [ self.sample_dict[x] if x in self.sample_dict else x for x in cov_df.index ] tech_cov_df = cov_df.loc[self.sample_order, self.tech_covs].copy() del cov_df self.log.info("\tNew shape: {}".format(tech_cov_df.shape)) # Remove technical covariates that are linearly dependent. self.log.info("Removing linearly dependent column(s).") self.tech_covs_df = self.filter_linear_dependent_covs(tech_cov_df) self.log.info("\tNew shape: {}".format(self.tech_covs_df.shape)) self.save() else: self.log.info("Skipping step.")
def start(self): self.log.info("Correcting expression data for dataset effects.") self.print_arguments() self.log.info("Correcting signature expression data.") if not check_file_exists(self.sign_expr_dc_outpath) or self.force: if self.dataset_df is None: self.dataset_df = load_dataframe(self.dataset_file, header=0, index_col=0, logger=self.log) if self.sign_expr_df is None: self.sign_expr_df = load_dataframe(self.sign_expr_file, header=0, index_col=0, logger=self.log) self.sign_expr_dc_df = self.dataset_correction( self.sign_expr_df, self.dataset_df) save_dataframe(df=self.sign_expr_dc_df, outpath=self.sign_expr_dc_outpath, index=True, header=True, logger=self.log) else: self.log.info("\tSkipping step.")
def start(self): self.log.info("Starting creating cohort matrix.") self.print_arguments() # Check if output file exist. if not check_file_exists(self.outpath) or self.force: # Load the sample info. self.log.info("Loading sample information matrix.") self.sample_info_df = load_dataframe(inpath=self.inpath, header=0, index_col=None, low_memory=False, logger=self.log) # Construct sample-cohort dict. self.log.info("Creating sample to cohort dict.") sample_cohort_dict = construct_dict_from_df( self.sample_info_df, self.sample_id, self.cohort_id) # Create cohort dataframe. self.log.info("Constructing cohort matrix.") self.cohort_df = self.create_cohort_df(self.sample_dict, self.sample_order, sample_cohort_dict) self.save() else: self.log.info("Skipping step.")
def start(self): self.log.info("Starting deconvolution.") self.print_arguments() # Check if output file exist. if not check_file_exists(self.outpath) or self.force: self.decon_df = self.perform_deconvolution() self.save() else: self.log.info("Skipping step.")
def start(self): self.log.info("Starting creating covariate file.") self.print_arguments() # Check if output file exist. if not check_file_exists(self.outpath) or self.force: self.covs_df = self.create_covs_file() self.save() else: self.log.info("Skipping step.")
def start(self): self.log.info("Starting creating extra covariate file(s).") self.print_arguments() # Check if output file exist. if not check_file_exists(self.outpath) or self.force: self.df = self.prepare_matrix() self.save() else: self.log.info("Skipping {}.".format(self.inpath))
def start(self): self.log.info("Starting normal transforming matrix.") self.print_arguments() # Check if output file exist. if not check_file_exists(self.outpath) or self.force: self.normalized_df = self.normal_transform() self.save() else: self.log.info("Skipping step.")
def start(self): self.log.info("Starting combining eQTL probe files.") self.print_arguments() # Check if output file exist. if not check_file_exists(self.outpath) or self.force: # Load each GTE file. self.log.info("Loading eQTLprobes files.") self.eqtl_df = self.combine_files() self.save() else: self.log.info("Skipping step.")
def start(self): self.log.info("Starting creating dataset matrix.") self.print_arguments() # Check if output file exist. if not check_file_exists(self.outpath) or self.force: # Create dataset dataframe. self.log.info("Constructing dataset matrix.") self.dataset_df = self.create_dataset_df(self.dts_dict, self.sample_order) self.save() else: self.log.info("Skipping step.")
def validate(self):
    # Check if the input files exist.
    for filepath in [self.matrix_inpath,
                     self.covariates_inpath,
                     self.sample_dict_inpath]:
        if filepath is not None and not check_file_exists(filepath):
            print("File {} does not exist".format(filepath))
            return False

    # Check for the correct extension.
    if not self.matrix_inpath.endswith(".txt.gz"):
        print("Matrix input must be in .txt.gz format")
        return False

    return True
def start(self): self.log.info("Starting combining GTE files.") self.print_arguments() # Check if GTE output file exist. if check_file_exists(self.outpath) and not self.force: self.log.info("Skipping step, loading result.") self.gte_df = load_dataframe(inpath=self.outpath, header=None, index_col=None, logger=self.log) else: # Load each GTE file. self.log.info("Loading GTE files.") self.gte_df = self.combine_files() self.save() # Construct sample translate dict. self.sample_dict = self.create_sample_dict() self.sample_order = list(self.gte_df.iloc[:, 1]) self.dataset_to_samples_dict = self.set_dataset_to_samples_dict()
def start(self): self.log.info("Starting creating matrices.") self.print_arguments() if self.eqtl_df is None: self.eqtl_df = load_dataframe(self.eqtl_file, header=0, index_col=None, logger=self.log) self.log.info("Parsing genotype input data.") if not check_file_exists(self.geno_outpath) or not check_file_exists( self.alleles_outpath) or self.force: alleles_df, geno_df = self.parse_genotype_file() self.log.info("Reorder, Filter, and save.") self.alleles_df = alleles_df.loc[self.eqtl_df.loc[:, "SNPName"], :] save_dataframe(df=self.alleles_df, outpath=self.alleles_outpath, index=True, header=True, logger=self.log) self.geno_df = geno_df.loc[self.eqtl_df.loc[:, "SNPName"], self.sample_order] save_dataframe(df=self.geno_df, outpath=self.geno_outpath, index=True, header=True, logger=self.log) else: self.log.info("\tSkipping step.") self.log.info("Parsing expression input data.") if not check_file_exists(self.expr_outpath) or not check_file_exists( self.sign_expr_outpath) or self.force: self.log.info("Loading signature matrix.") self.sign_df = load_dataframe(inpath=self.sign_file, header=0, index_col=0, logger=self.log) signature_genes = set(self.sign_df.index.to_list()) self.log.info("Loading gene traslate dict.") self.gene_info_df = load_dataframe(inpath=self.gene_info_file, header=0, index_col=None, logger=self.log) gene_trans_dict = construct_dict_from_df(self.gene_info_df, self.ensg_id, self.hgnc_id) if not check_file_exists(self.expr_outpath) or self.force: self.log.info("Parsing expression data.") self.expr_df, self.sign_expr_df = self.parse_expression_file( self.expr_file, signature_genes, gene_trans_dict, include_decon=self.decon_expr_file is None) if (not check_file_exists(self.sign_expr_outpath) or self.force) and (check_file_exists(self.decon_expr_file)): self.log.info("Parsing deconvolution expression data.") self.log.warning( "Using different expresion file for deconvolution.") _, self.sign_expr_df = self.parse_expression_file( self.decon_expr_file, signature_genes, gene_trans_dict, include_expr=False, remove_ens_version=True) self.log.info("Reorder, Filter, and save.") if self.expr_df is not None: self.expr_df = self.expr_df.loc[self.eqtl_df.loc[:, "ProbeName"], self.sample_order] save_dataframe(df=self.expr_df, outpath=self.expr_outpath, index=True, header=True, logger=self.log) if self.sign_expr_df is not None: self.sign_expr_df = self.sign_expr_df.loc[:, self.sample_order] save_dataframe(df=self.sign_expr_df, outpath=self.sign_expr_outpath, index=True, header=True, logger=self.log) else: self.log.info("\tSkipping step.")
def start(self): print("Starting interaction analyser - combine and plot.") self.print_arguments() # Start the timer. start_time = time.time() print("") print("### Step 1 ###") print("Combine pickle files into dataframe.", flush=True) dataframes = {} for filename in [ self.pvalues_filename, self.coef_filename, self.std_err_filename ]: outpath = os.path.join(self.work_dir, "{}_table.txt.gz".format(filename)) if not check_file_exists(outpath) or self.force: print("Loading {} data.".format(filename), flush=True) columns, data = self.combine_pickles(self.work_dir, filename, columns=True) if len(data) == 0: print("\tNo {} data found.".format(filename)) continue print("Creating {} dataframe.".format(filename), flush=True) df = self.create_df(data, columns) print("Saving {} dataframe.".format(filename), flush=True) save_dataframe(df=df, outpath=outpath, header=True, index=True) dataframes[filename] = df del columns, data, df else: print("Skipping step for {}".format(outpath)) dataframes[filename] = load_dataframe(outpath, header=0, index_col=0) print("") print("### Step 2 ###") print("Calculate t-values", flush=True) outpath = os.path.join(self.work_dir, "{}_table.txt.gz".format(self.tvalue_filename)) if not check_file_exists(outpath) or self.force: if self.coef_filename in dataframes and self.std_err_filename in dataframes: # Calculate t-values coef_df = dataframes[self.coef_filename] std_err_df = dataframes[self.std_err_filename] if not coef_df.columns.identical(std_err_df.columns): overlap = set(coef_df.columns).intersection( set(std_err_df.columns)) if len(overlap) == 0: print("No overlapping eQTLs between coef and std_err " "data frame columns.") else: coef_df = coef_df.loc[:, overlap] std_err_df = std_err_df.loc[:, overlap] if not coef_df.index.identical(std_err_df.index): overlap = set(coef_df.index).intersection( set(std_err_df.index)) if len(overlap) == 0: print("No overlapping eQTLs between coef and std_err " "data frames indices.") else: coef_df = coef_df.loc[overlap, :] std_err_df = std_err_df.loc[overlap, :] if coef_df.columns.identical( std_err_df.columns) and coef_df.index.identical( std_err_df.index): tvalue_df = coef_df / std_err_df print("Saving {} dataframe.".format(self.tvalue_filename), flush=True) save_dataframe(df=tvalue_df, outpath=os.path.join( self.work_dir, "{}_table.txt.gz".format( self.tvalue_filename)), header=True, index=True) else: print("\tNo data found.") else: print("Skipping step.") print("") print("### Step 3 ###") print("Starting other calculations", flush=True) if self.pvalues_filename not in dataframes: print("\tNo pvalues data found.") return pvalue_df = dataframes[self.pvalues_filename] pvalue_df_columns = [ "{}_{}".format(x, i) for i, x in enumerate(pvalue_df.columns) ] pvalue_df.columns = pvalue_df_columns pvalue_df_indices = [ "{}_{}".format(x, i) for i, x in enumerate(pvalue_df.index) ] pvalue_df.index = pvalue_df_indices pvalue_df.reset_index(drop=False, inplace=True) print("Melting dataframe.", flush=True) dfm = pvalue_df.melt(id_vars=["index"]) dfm.columns = ["covariate", "SNP", "pvalue"] dfm["rank"] = dfm.loc[:, "pvalue"].rank(ascending=True) n_signif = dfm[dfm["pvalue"] <= self.alpha].shape[0] n_total = dfm.shape[0] print("\t{}/{} [{:.2f}%] of pvalues < {}".format( n_signif, n_total, (100 / n_total) * n_signif, self.alpha), flush=True) print("Adding z-scores.", flush=True) dfm["zscore"] = stats.norm.isf(dfm["pvalue"]) dfm.loc[dfm["pvalue"] > (1.0 - 1e-16), "zscore"] = -8.209536151601387 dfm.loc[dfm["pvalue"] < 1e-323, "zscore"] = 
38.44939448087599 self.pivot_and_save(dfm, "zscore", pvalue_df_indices, pvalue_df_columns) print("Adding BH-FDR.", flush=True) dfm["BH-FDR"] = dfm["pvalue"] * (n_total / (dfm["rank"] + 1)) dfm.loc[dfm["BH-FDR"] > 1, "BH-FDR"] = 1 prev_bh_fdr = -np.Inf for i in range(n_total): bh_fdr = dfm.loc[i, "BH-FDR"] if bh_fdr > prev_bh_fdr: prev_bh_fdr = bh_fdr else: dfm.loc[i, "BH-FDR"] = prev_bh_fdr n_signif = dfm[dfm["BH-FDR"] <= self.alpha].shape[0] print("\t{}/{} [{:.2f}%] of BH-FDR values < {}".format( n_signif, n_total, (100 / n_total) * n_signif, self.alpha), flush=True) self.pivot_and_save(dfm, "BH-FDR", pvalue_df_indices, pvalue_df_columns) print("Adding permutation FDR.", flush=True) print("\tLoading permutation pvalue data.", flush=True) _, perm_pvalues = self.combine_pickles(self.work_dir, self.perm_pvalues_filename) # perm_pvalues = [random.random() for _ in range(n_total * 10)] print("Sorting p-values.", flush=True) perm_pvalues = sorted(perm_pvalues) if len(perm_pvalues) > 0: n_perm = len(perm_pvalues) / n_total if n_perm != self.n_perm: print("\tWARNING: not all permutation pvalus are present") perm_ranks = [] for pvalue in dfm["pvalue"]: perm_ranks.append(bisect_left(perm_pvalues, pvalue)) dfm["perm-rank"] = perm_ranks dfm["perm-FDR"] = (dfm["perm-rank"] / n_perm) / dfm["rank"] dfm.loc[(dfm.index == 0) | (dfm["perm-rank"] == 0), "perm-FDR"] = 0 dfm.loc[dfm["perm-FDR"] > 1, "perm-FDR"] = 1 self.pivot_and_save(dfm, "perm-FDR", pvalue_df_indices, pvalue_df_columns) print("Saving full dataframe.", flush=True) save_dataframe(df=dfm, outpath=os.path.join(self.work_dir, "molten_table.txt.gz"), header=True, index=True) print("") # Print the time. run_time_min, run_time_sec = divmod(time.time() - start_time, 60) run_time_hour, run_time_min = divmod(run_time_min, 60) print("finished in {} hour(s), {} minute(s) and " "{} second(s).".format(int(run_time_hour), int(run_time_min), int(run_time_sec)), flush=True)
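# Hedged sketch (illustrative): the BH-FDR step above, written as a
# standalone function on a p-value array. Note the pipeline scales by
# n_total / (rank + 1); the textbook Benjamini-Hochberg estimate divides
# by the rank itself, as done here.
import numpy as np


def bh_fdr(pvalues):
    """Benjamini-Hochberg adjusted p-values for a 1-D array."""
    pvalues = np.asarray(pvalues, dtype=float)
    n = len(pvalues)
    order = np.argsort(pvalues)
    scaled = pvalues[order] * n / np.arange(1, n + 1)
    # Enforce monotonicity from the largest p-value downwards.
    scaled = np.minimum.accumulate(scaled[::-1])[::-1]
    fdr = np.empty(n)
    fdr[order] = np.clip(scaled, 0, 1)
    return fdr


print(bh_fdr([0.001, 0.008, 0.039, 0.041, 0.042, 0.06, 0.074, 0.205]))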
def start(self): """ The method that serves as the pipeline of the whole program. """ print("Starting interaction analyser.") self.print_arguments() # Loop over the groups. print("Performing interaction analyses.") for i, group_indir in enumerate(self.group_indirs): # Prepare the input and output directories. if self.groups is not None: group_id = get_leaf_dir(group_indir) group_outdir = os.path.join(self.outdir, group_id) else: group_id = "" group_outdir = self.outdir ia_indir = os.path.join(group_outdir, 'input') ia_outdir = os.path.join(group_outdir, 'output') for outdir in [group_outdir, ia_indir, ia_outdir]: prepare_output_dir(outdir) # Check if we can find an InteractionZSCoreMatrix has_inter_matrix = False if not self.force: for path in glob.glob(os.path.join(ia_outdir, "*")): if re.match(self.inter_regex, get_basename(path)): has_inter_matrix = True break # Stop if we already have the interaction matrix. if has_inter_matrix and not self.force: continue print("\tWorking on: {:15s} [{}/{} " "{:.2f}%]".format(group_id, i + 1, len(self.group_indirs), (100 / len(self.group_indirs)) * (i + 1))) # Prepare the EQTLInteractioAnalyser expected input. self.print_string("\n### STEP1 ###\n") expected_input = ["Genotypes", "Expression", "Covariates"] filenames = [ self.geno_filename, self.expr_filename, self.cov_filename ] for exp_ia_infile, filename in zip(expected_input, filenames): # Check if the files alreadt exist. file1 = os.path.join(ia_indir, exp_ia_infile + ".binary.dat") file2 = os.path.join(ia_indir, exp_ia_infile + ".binary.rows.txt") file3 = os.path.join(ia_indir, exp_ia_infile + ".binary.columns.txt") if not check_file_exists(file1) or \ not check_file_exists(file2) or \ not check_file_exists(file3) or \ self.force: self.print_string("\nPreparing {}.".format(filename)) # Define the filenames. compr_file = os.path.join(self.indir, group_id, filename + '.txt.gz') copy_file = os.path.join(ia_indir, filename + '.txt.gz') uncompr_file = os.path.join(ia_indir, filename + '.txt') bin_file = os.path.join(ia_indir, exp_ia_infile + ".binary") # Copy and decompressed the file. self.print_string("\nCopying the input files.") self.copy_file(compr_file, copy_file) self.print_string("\nDecompressing the input files.") self.decompress(copy_file) # Convert to binary. self.print_string("\nConverting files to binary format.") self.convert_to_binary(uncompr_file, bin_file) # Remove the uncompressed file. self.print_string("\nRemoving uncompressed files.") if check_file_exists(uncompr_file): self.print_string( "\tos.remove({})".format(uncompr_file)) os.remove(uncompr_file) else: self.print_string( "Skipping {} preparation.".format(filename)) # prepare the eQTL file. self.print_string("\n### STEP2 ###\n") eqtl_file = os.path.join(ia_indir, self.eqtl_filename + '.txt') if not check_file_exists(eqtl_file) or self.force: self.print_string("\nPreparing eQTL file.") # Define the filenames. compr_file = os.path.join(self.indir, group_id, self.eqtl_filename + '.txt.gz') copy_file = os.path.join(ia_indir, self.eqtl_filename + '.txt.gz') # Copy and decompressed the file. self.print_string("\nCopying the input files.") self.copy_file(compr_file, copy_file) self.print_string("\nDecompressing the input files.") self.decompress(copy_file) else: self.print_string("Skipping eqtl preparation.") # execute the program. self.print_string("\n### STEP3 ###\n") self.print_string("Executing the eQTLInteractionAnalyser.") self.execute(ia_indir, ia_outdir, eqtl_file)
def start(self):
    self.print_arguments()
    print("Starting Custom Interaction Analyser "
          "[{}]".format(datetime.now().strftime("%d-%m-%Y, %H:%M:%S")))

    # Start the timer.
    start_time = int(time.time())

    # Get the permutation orders.
    permutation_orders = None
    perm_orders_outfile = os.path.join(self.outdir,
                                       self.perm_order_filename + ".pkl")
    if check_file_exists(perm_orders_outfile):
        print("Loading permutation order")
        permutation_orders = self.load_pickle(perm_orders_outfile)

        # Validate the permutation orders for the given input.
        if len(permutation_orders) != (self.n_perm + 1):
            print("\tinvalid")
            permutation_orders = None
        if permutation_orders is not None and \
                permutation_orders[0] is not None:
            print("\tinvalid")
            permutation_orders = None
        if permutation_orders is not None:
            for order in permutation_orders[1:]:
                if len(order) != self.n_samples:
                    print("\tinvalid")
                    permutation_orders = None
                    break
        if permutation_orders is not None:
            print("\tvalid")
    if permutation_orders is None:
        print("Creating permutation order")
        permutation_orders = self.create_perm_orders()
        self.dump_pickle(permutation_orders, self.outdir,
                         self.perm_order_filename)

    # Start the work.
    print("Start the analysis", flush=True)
    storage = self.work(permutation_orders)

    print("Saving output files", flush=True)
    filename_suffix = "{}_{}".format(self.skip_rows,
                                     self.skip_rows + storage.get_n_rows())
    self.dump_pickle(storage.get_pvalues(), self.outdir,
                     self.pvalues_filename,
                     filename_suffix=filename_suffix,
                     subdir=True, unique=True)
    self.dump_pickle(storage.get_perm_pvalues(), self.outdir,
                     self.perm_pvalues_filename,
                     filename_suffix=filename_suffix,
                     subdir=True, unique=True)
    self.dump_pickle(storage.get_coefficients(), self.outdir,
                     self.coef_filename,
                     filename_suffix=filename_suffix,
                     subdir=True, unique=True)
    self.dump_pickle(storage.get_std_errors(), self.outdir,
                     self.std_err_filename,
                     filename_suffix=filename_suffix,
                     subdir=True, unique=True)

    # Print the process time.
    run_time = int(time.time()) - start_time
    run_time_min, run_time_sec = divmod(run_time, 60)
    run_time_hour, run_time_min = divmod(run_time_min, 60)
    print("Finished in {} hour(s), {} minute(s) and "
          "{} second(s)".format(int(run_time_hour),
                                int(run_time_min),
                                int(run_time_sec)))
    print("Performed {:.2f} analyses per minute".format(
        (self.n_eqtls * (self.n_perm + 1)) / (run_time / 60)))

    # Shut down the manager.
    print("Shutting down manager [{}]".format(
        datetime.now().strftime("%d-%m-%Y, %H:%M:%S")),
        flush=True)
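# Hedged sketch (illustrative; `create_perm_orders` is not shown): the
# validation above implies element 0 is None (the unpermuted analysis)
# followed by n_perm random sample orders of length n_samples. The
# function signature and seed handling below are assumptions.
import numpy as np


def create_perm_orders(n_perm, n_samples, seed=0):
    """Return [None, order_1, ..., order_n_perm] of sample indices."""
    rng = np.random.default_rng(seed)
    orders = [None]  # index 0 marks the real, unpermuted order
    for _ in range(n_perm):
        orders.append(rng.permutation(n_samples))
    return orders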