def save(self):
    """Persist the cell-type PCA and NMF component matrices to disk."""
    for frame, path in ((self.celltype_pcs, self.pca_outpath),
                        (self.celltype_cs, self.nmf_outpath)):
        save_dataframe(df=frame, outpath=path, index=True, header=True)
def create_marker_df(self, inter_df, eqtl_df, outpath):
    """Build and save a marker-gene table annotated with gini impurity.

    Transposes the interaction matrix, keeps only marker columns whose
    "<prefix>_<celltype>" suffix is one of self.celltypes, assigns each
    row to the cell type with the largest (squared, normalized) share of
    absolute z-scores, filters rows on a Bonferroni z-score cutoff and a
    gini-impurity cutoff, merges with the eQTL annotation, saves the
    merged table to `outpath`, and writes one "<celltype>.txt" file of
    HGNC names per cell type.

    :param inter_df: DataFrame, interaction z-scores (covariates x eQTLs).
    :param eqtl_df: DataFrame with at least SNPName/ProbeName/HGNCName,
                    rows aligned with the eQTLs of inter_df.
    :param outpath: str, path for the merged marker table.
    :return: the filtered, re-indexed eqtl_df subset.
    """
    inter_df = inter_df.T
    eqtl_df = eqtl_df[["SNPName", "ProbeName", "HGNCName"]]

    # Bonferroni-corrected z-score cutoff over all tested values.
    z_score_cutoff = stats.norm.ppf(
        0.05 / (inter_df.shape[0] * inter_df.shape[1]))
    gini_cutoff = 0.75

    # Subset on the marker genes.
    marker_cols = [colname for colname in inter_df.columns
                   if "_" in colname
                   and colname.split("_")[1] in self.celltypes]
    marker_df = inter_df.loc[:, marker_cols]
    del inter_df

    # Create a gini dataframe grouped by celltype.
    gini_df = marker_df.copy()
    gini_df = gini_df.abs()
    zscore_mask = list(gini_df.max(axis=1) >= abs(z_score_cutoff))
    gini_df.columns = [x.split("_")[1] for x in gini_df.columns]
    gini_df = gini_df.T.groupby(gini_df.columns).sum().T

    # Calculate the gini impurity; the celltype with the largest squared
    # share is the assigned eQTL celltype.
    gini_values = gini_df.div(gini_df.sum(axis=1), axis=0).pow(2)
    marker_df["gini_impurity"] = 1 - gini_values.sum(axis=1)
    marker_df["eqtl_celltype"] = gini_values.idxmax(axis=1)
    del gini_df

    # Subset on gini impurity AND the z-score cutoff.
    gini_mask = list(marker_df["gini_impurity"] <= gini_cutoff)
    # BUGFIX: the original used 'zscore_mask and gini_mask'. Boolean
    # 'and' between two non-empty lists simply returns the second list,
    # so the z-score filter was silently ignored. Combine element-wise.
    combined_mask = [z and g for z, g in zip(zscore_mask, gini_mask)]

    marker_df = marker_df.loc[combined_mask, :]
    marker_df.index.name = "-"
    marker_df.reset_index(inplace=True)

    # Subset the eQTL dataframe with the same row mask.
    eqtl_df = eqtl_df.loc[combined_mask, :]
    eqtl_df.reset_index(drop=True, inplace=True)

    # Merge them together.
    merged_df = pd.concat([marker_df, eqtl_df], axis=1)
    merged_df = merged_df.sort_values(
        by=['eqtl_celltype', 'gini_impurity'])

    # Save the dataframe.
    save_dataframe(df=merged_df, outpath=outpath, header=True, index=False)

    # Save celltype eqtl's HGNC names.
    print("Writing celltype mediated eQTL files.")
    for celltype in marker_df['eqtl_celltype'].unique():
        subset = merged_df.loc[merged_df['eqtl_celltype'] == celltype, :]
        print("\tCelltype: {:20s} {} genes".format(celltype,
                                                   len(subset.index)))
        if len(subset.index) > 0:
            genes = ', '.join(subset['HGNCName'].to_list())
            # Context manager guarantees the file handle is closed even
            # if the write raises (original used open/write/close).
            with open(os.path.join(get_dirname(outpath),
                                   '{}.txt'.format(celltype)), "w") as outfile:
                outfile.write(genes)

    return eqtl_df
def combine_groups(self, inter_outpath):
    """Combine the per-group indices and interaction matrices.

    For each group id: loads the pickled group object from the group's
    data directory, appends its SNP and sample indices to the running
    masks, and (unless the combined interaction file already exists and
    force is off) locates the group's interaction matrix by regex and
    merges it on index into one combined matrix. The combined matrix is
    then reordered by SNP index and saved to `inter_outpath`, or loaded
    from that path when it already exists.

    :param inter_outpath: str, path for the combined interaction matrix.
    :return: (snp_mask, sample_mask, inter_df) where the masks are
             sorted, de-duplicated index lists.
    """
    print("Combining groups.")
    # Accumulators for the group row/column indices.
    # NOTE(review): int16 caps indices at 32767 — confirm datasets
    # never exceed that many SNPs/samples.
    snp_mask = np.array([], dtype=np.int16)
    sample_mask = np.array([], dtype=np.int16)
    inter_df = None
    for i, group_id in enumerate(self.group_ids):
        print(" Working on: {:10s} [{}/{} "
              "{:.2f}%]".format(group_id, i + 1, len(self.group_ids),
                                (100 / len(self.group_ids)) * (i + 1)))

        # Define the directory names.
        data_indir = os.path.join(self.g_data_indir, group_id)
        inter_indir = os.path.join(self.g_inter_indir, group_id, 'output')

        # Load the group object.
        with open(os.path.join(data_indir, self.obj_filename), "rb") as f:
            group_object = pickle.load(f)

        # Save the indices.
        snp_mask = np.append(snp_mask, group_object.get_snp_indices())
        sample_mask = np.append(sample_mask,
                                group_object.get_sample_indices())

        # Only read the group interaction matrices when the combined
        # output does not already exist (or a rebuild is forced).
        if not check_file_exists(inter_outpath) or self.force:
            # Search for the interaction filename.
            inter_inpath = None
            for path in glob.glob(os.path.join(inter_indir, "*")):
                if re.match(self.inter_regex, get_basename(path)):
                    inter_inpath = path
                    break

            if inter_inpath is None:
                print("Interaction matrix not found.")
                exit()

            # Load the interaction file.
            group_inter_df = load_dataframe(inpath=inter_inpath,
                                            header=0,
                                            index_col=0)

            # Merge them (inner join on the shared index).
            if inter_df is None:
                inter_df = group_inter_df
            else:
                inter_df = inter_df.merge(group_inter_df,
                                          left_index=True,
                                          right_index=True)

    print("Preparing interaction matrix.")
    if not check_file_exists(inter_outpath) or self.force:
        # Sort the matrix according to the indices: transpose so rows
        # are eQTLs, attach the SNP index as a sort key, then restore.
        inter_df = inter_df.T
        inter_df["index"] = snp_mask
        inter_df.sort_values(by=['index'], inplace=True)
        inter_df.drop(["index"], axis=1, inplace=True)
        inter_df = inter_df.T
        save_dataframe(df=inter_df, outpath=inter_outpath,
                       index=True, header=True)
    else:
        inter_df = load_dataframe(inpath=inter_outpath, header=0,
                                  index_col=0)

    # Prepare the masks: de-duplicate and sort.
    snp_mask = sorted(list(set(snp_mask)))
    sample_mask = sorted(list(set(sample_mask)))

    return snp_mask, sample_mask, inter_df
def start(self):
    """
    The method that serves as the pipeline of the whole program.

    Combines the per-group indices and interaction matrix, then subsets
    and writes the eQTL, genotype, alleles, expression, and covariate
    tables using the combined SNP and sample masks. Each output is
    skipped when its file already exists and `self.force` is off.
    """
    print("Starting combining groups.")
    self.print_arguments()

    # Combine the indices of each group and combine the interaction
    # matrix if need be.
    inter_outpath = os.path.join(self.outdir, self.inter_filename)
    snp_mask, sample_mask, inter_df = self.combine_groups(inter_outpath)

    print("\nSubsetting data with masks:")
    print("\tSNP mask:\tlength: {}\tlowest index: {}"
          "\thighest index: {}".format(len(snp_mask), min(snp_mask),
                                       max(snp_mask)))
    print("\tSample mask:\tlength: {}\tlowest index: {}"
          "\thighest index: {}".format(len(sample_mask), min(sample_mask),
                                       max(sample_mask)))
    print("")

    # Load the eQTL file if either the marker df or the eqtl df needs to
    # be created.
    markers_outpath = os.path.join(self.outdir, self.markers_filename)
    eqtl_outpath = os.path.join(self.outdir, self.eqtl_filename)
    # BUGFIX: eqtl_df was only bound inside the conditional below, so the
    # unconditional 'del eqtl_df' raised a NameError when both output
    # files already existed and force was off. Initialize it up front.
    eqtl_df = None
    if not check_file_exists(eqtl_outpath) or \
            not check_file_exists(markers_outpath) or self.force:
        print("Loading eQTL file.")
        eqtl_df = load_dataframe(inpath=self.eqtl_inpath, header=0,
                                 index_col=None)
        eqtl_df = eqtl_df.iloc[snp_mask, :]

    print("Preparing marker matrix.")
    if not check_file_exists(markers_outpath) or self.force:
        self.create_marker_df(inter_df, eqtl_df, markers_outpath)
    else:
        print("\tSkipping step.")

    print("Preparing eQTL matrix.")
    if not check_file_exists(eqtl_outpath) or self.force:
        save_dataframe(outpath=eqtl_outpath, df=eqtl_df,
                       index=False, header=True)
    else:
        print("\tSkipping step.")

    # Release the large frames before loading the next ones.
    del eqtl_df
    del inter_df

    print("\nPreparing genotype matrix.")
    geno_outpath = os.path.join(self.outdir, self.geno_filename)
    if not check_file_exists(geno_outpath) or self.force:
        geno_df = load_dataframe(
            inpath=os.path.join(self.data_indir, self.geno_filename),
            header=0, index_col=0)
        geno_df = geno_df.iloc[snp_mask, sample_mask]
        save_dataframe(outpath=geno_outpath, df=geno_df,
                       index=True, header=True)
        del geno_df
    else:
        print("\tSkipping step.")

    print("\nPreparing alleles matrix.")
    alleles_outpath = os.path.join(self.outdir, self.alleles_filename)
    if not check_file_exists(alleles_outpath) or self.force:
        alleles_df = load_dataframe(
            inpath=os.path.join(self.data_indir, self.alleles_filename),
            header=0, index_col=0)
        alleles_df = alleles_df.iloc[snp_mask, :]
        save_dataframe(outpath=alleles_outpath, df=alleles_df,
                       index=True, header=True)
        del alleles_df
    else:
        print("\tSkipping step.")

    print("\nPreparing expression matrix.")
    expr_outpath = os.path.join(self.outdir, self.expr_filename)
    if not check_file_exists(expr_outpath) or self.force:
        expr_df = load_dataframe(
            inpath=os.path.join(self.data_indir, self.expr_filename),
            header=0, index_col=0)
        expr_df = expr_df.iloc[snp_mask, sample_mask]
        save_dataframe(outpath=expr_outpath, df=expr_df,
                       index=True, header=True)
        del expr_df
    else:
        print("\tSkipping step.")

    print("\nPreparing covariate matrix.")
    cov_outpath = os.path.join(self.outdir, self.cov_filename)
    if not check_file_exists(cov_outpath) or self.force:
        cov_df = load_dataframe(inpath=self.cov_inpath, header=0,
                                index_col=0)
        # Covariates are subset on samples only (all covariate rows kept).
        cov_df = cov_df.iloc[:, sample_mask].copy()
        save_dataframe(outpath=cov_outpath, df=cov_df,
                       index=True, header=True)
        del cov_df
    else:
        print("\tSkipping step.")
def save(self):
    """Write the deconvolution matrix to its output path."""
    save_dataframe(outpath=self.outpath, df=self.deconvolution,
                   header=True, index=True)
def start(self):
    """Create one output directory per group and write its subset tables.

    For every (group_id, group_obj) pair: creates the group directory,
    pickles the group object, and writes the group's eQTL, genotype,
    alleles, expression, and covariate tables, each subset with the SNP
    and sample indices taken from the group object. Every output is
    skipped when its file already exists and `self.force` is off.
    """
    print("Creating groups.")
    for i, (group_id, group_obj) in enumerate(self.groups.items()):
        print(" Working on: {:10s} [{}/{} "
              "{:.2f}%]".format(group_id, i + 1, len(self.groups),
                                (100 / len(self.groups)) * (i + 1)))

        # Create the group dir.
        group_dir = os.path.join(self.outdir, group_id)
        prepare_output_dir(group_dir)

        # Define the output names.
        group_object = os.path.join(group_dir, "group.pkl")
        eqtl_outpath = os.path.join(group_dir, "eqtl_table.txt.gz")
        geno_outpath = os.path.join(group_dir, "genotype_table.txt.gz")
        alleles_outpath = os.path.join(group_dir,
                                       "genotype_alleles.txt.gz")
        expr_outpath = os.path.join(group_dir, "expression_table.txt.gz")
        cov_outpath = os.path.join(group_dir, "covariates_table.txt.gz")

        # Check if output file exist, if not, create it.
        if not check_file_exists(group_object) or self.force:
            with open(group_object, "wb") as f:
                pickle.dump(group_obj, f)
            print("\tSaved group object: "
                  "{}".format(get_basename(group_object)))

        # Get the group indices.
        snp_mask = group_obj.get_snp_indices()
        sample_mask = group_obj.get_sample_indices()

        # Check if output file exist, if not, create it.
        # eQTL table: rows subset on SNPs; written without the index.
        if not check_file_exists(eqtl_outpath) or self.force:
            group_eqtl = self.eqtl_df.iloc[snp_mask, :].copy()
            save_dataframe(outpath=eqtl_outpath, df=group_eqtl,
                           index=False, header=True)
            del group_eqtl

        # Genotype table: subset on both SNPs and samples.
        if not check_file_exists(geno_outpath) or self.force:
            group_geno = self.geno_df.iloc[snp_mask, sample_mask].copy()
            save_dataframe(outpath=geno_outpath, df=group_geno,
                           index=True, header=True)
            del group_geno

        # Alleles table: rows subset on SNPs only.
        if not check_file_exists(alleles_outpath) or self.force:
            group_alleles = self.alleles_df.iloc[snp_mask, :].copy()
            save_dataframe(outpath=alleles_outpath, df=group_alleles,
                           index=True, header=True)
            del group_alleles

        # Expression table: subset on both SNPs and samples.
        if not check_file_exists(expr_outpath) or self.force:
            group_expr = self.expr_df.iloc[snp_mask, sample_mask].copy()
            save_dataframe(outpath=expr_outpath, df=group_expr,
                           index=True, header=True)
            del group_expr

        # Covariates table: columns subset on samples only.
        if not check_file_exists(cov_outpath) or self.force:
            group_cov = self.cov_df.iloc[:, sample_mask].copy()
            save_dataframe(outpath=cov_outpath, df=group_cov,
                           index=True, header=True)
            del group_cov
def work(self, workdir):
    """Combine worker pickles in `workdir` into the final result tables.

    Builds and saves, in order: the p-value table, the SNP and inter
    t-value tables, the z-score (interaction) table, and the permutation
    and Benjamini-Hochberg FDR tables; also plots the p-value
    distributions and a comparison of the two FDR scores.

    Removed a large amount of commented-out dead code (alternative
    loaders, pickle round-trips, and leftover debug `return` markers)
    that obscured the actual flow.

    :param workdir: str, directory holding the worker pickles; all
                    output tables are written there as well.
    """
    # Combine the pickle files.
    print("Loading pvalue data.", flush=True)
    pcolumns, pvalues_data = self.combine_pickles(workdir,
                                                  self.pvalues_outfile,
                                                  columns=True)

    # Create a pandas dataframe from the nested list.
    print("Creating p-values dataframe.", flush=True)
    pvalue_df = self.create_df(pvalues_data, pcolumns)
    save_dataframe(df=pvalue_df,
                   outpath=os.path.join(workdir, "pvalue_table.txt.gz"),
                   header=True, index=True)

    # Get the pvalues from the dataframe.
    pvalues = pvalue_df.melt()["value"].values

    print("Loading permutation pvalue data.", flush=True)
    _, perm_pvalues = self.combine_pickles(workdir,
                                           self.perm_pvalues_outfile)

    # Visualise distributions.
    print("Visualizing distributions.", flush=True)
    self.plot_distributions(perm_pvalues, pvalues, workdir)

    print("Loading SNP tvalue data.", flush=True)
    snp_tcolumns, snp_tvalues_data = self.combine_pickles(
        workdir, self.snp_tvalues_outfile, columns=True)

    # Create a pandas dataframe from the nested list.
    print("Creating SNP t-values dataframe.", flush=True)
    snp_tvalue_df = self.create_df(snp_tvalues_data, snp_tcolumns)
    save_dataframe(df=snp_tvalue_df,
                   outpath=os.path.join(workdir,
                                        "snp_tvalue_table.txt.gz"),
                   header=True, index=True)

    print("Loading inter tvalue data.", flush=True)
    inter_tcolumns, inter_tvalues_data = self.combine_pickles(
        workdir, self.inter_tvalues_outfile, columns=True)

    # Create a pandas dataframe from the nested list.
    print("Creating inter t-values dataframe.", flush=True)
    inter_tvalue_df = self.create_df(inter_tvalues_data, inter_tcolumns)
    save_dataframe(df=inter_tvalue_df,
                   outpath=os.path.join(workdir,
                                        "inter_tvalue_table.txt.gz"),
                   header=True, index=True)

    # Create a dataframe with z-scores.
    print("Creating Z-score dataframe.", flush=True)
    zscore_df = self.create_zscore_df(pvalue_df)
    save_dataframe(df=zscore_df,
                   outpath=os.path.join(workdir,
                                        "interaction_table.txt.gz"),
                   header=True, index=True)

    # Sort the lists for the FDR procedures below.
    print("Sorting p-values.", flush=True)
    perm_pvalues = sorted(perm_pvalues)
    pvalues = sorted(pvalues)

    # Create the FDR dataframes.
    print("Creating permutation FDR dataframe.", flush=True)
    perm_fdr_df, perm_cutoff = self.create_perm_fdr_df(
        pvalue_df, pvalues, perm_pvalues, self.n_permutations)
    perm_n_signif = self.count_n_significant(pvalues, perm_cutoff)
    print("\tPermutation FDR: {} p-values < signif. cutoff "
          "{:.2e} [{:.2f}%]".format(perm_n_signif, perm_cutoff,
                                    (100 / len(pvalues)) * perm_n_signif))

    # Write the output file.
    save_dataframe(df=perm_fdr_df,
                   outpath=os.path.join(workdir, "perm_fdr_table.txt.gz"),
                   header=True, index=True)

    print("Creating Benjamini-Hochberg FDR dataframe.", flush=True)
    bh_fdr_df, bh_cutoff = self.create_bh_fdr_df(pvalue_df, pvalues)
    bh_n_signif = self.count_n_significant(pvalues, bh_cutoff)
    print("\tBH FDR: {} p-values < signif. cutoff "
          "{:.2e} [{:.2f}%]".format(bh_n_signif, bh_cutoff,
                                    (100 / len(pvalues)) * bh_n_signif))
    save_dataframe(df=bh_fdr_df,
                   outpath=os.path.join(workdir, "bh_fdr_table.txt.gz"),
                   header=True, index=True)

    # Compare the two pvalue scores.
    print("Creating score visualisation [1/2].", flush=True)
    self.compare_pvalue_scores(pvalue_df, perm_fdr_df, bh_fdr_df, workdir)
def save(self):
    """Write the eQTL probes table (header only, no index) to disk."""
    save_dataframe(outpath=self.outpath, df=self.eqtl_probes,
                   header=True, index=False)
def start(self):
    """Create masked copies of all tables plus translation tables.

    Generates anonymous labels ("eqtl_N", "sample_N", "cov_N") for the
    eQTLs, samples, and covariates; writes one unmasked->masked
    translation table per label set; then relabels each dataframe's
    axes with the masks and saves it. Every output is skipped when its
    file already exists and `self.force` is off.

    Refactored: the three identical translate-table stanzas and five
    identical relabel-and-save stanzas are factored into private
    helpers; behavior is unchanged.
    """
    print("Starting creating masked files.")
    self.print_arguments()

    # Get the sizes.
    (n_eqtls, n_samples) = self.geno_df.shape
    n_covs = self.cov_df.shape[0]

    # Create masks.
    eqtl_mask = ["eqtl_" + str(x) for x in range(n_eqtls)]
    sample_mask = ["sample_" + str(x) for x in range(n_samples)]
    cov_mask = ["cov_" + str(x) for x in range(n_covs)]

    # Create translate dicts.
    print("Creating translation files.")
    self._save_translate_table(
        os.path.join(self.outdir, "eqtl_translate_table.txt.gz"),
        list(self.geno_df.index), eqtl_mask, "eQTLs")
    self._save_translate_table(
        os.path.join(self.outdir, "sample_translate_table.txt.gz"),
        list(self.geno_df.columns), sample_mask, "sample")
    self._save_translate_table(
        os.path.join(self.outdir, "cov_translate_table.txt.gz"),
        list(self.cov_df.index), cov_mask, "covariates")

    # Start masking the dataframes.
    print("Start masking files.")
    self._mask_and_save(self.eqtl_df,
                        os.path.join(self.outdir, "eqtl_table.txt.gz"),
                        "eQTL table", row_mask=eqtl_mask)
    self._mask_and_save(self.geno_df,
                        os.path.join(self.outdir,
                                     "genotype_table.txt.gz"),
                        "genotype table", row_mask=eqtl_mask,
                        col_mask=sample_mask)
    self._mask_and_save(self.alleles_df,
                        os.path.join(self.outdir,
                                     "genotype_alleles.txt.gz"),
                        "genotype alleles tables", row_mask=eqtl_mask)
    self._mask_and_save(self.expr_df,
                        os.path.join(self.outdir,
                                     "expression_table.txt.gz"),
                        "expression table", row_mask=eqtl_mask,
                        col_mask=sample_mask)
    self._mask_and_save(self.cov_df,
                        os.path.join(self.outdir,
                                     "covariates_table.txt.gz"),
                        "covariates table", row_mask=cov_mask,
                        col_mask=sample_mask)

def _save_translate_table(self, outpath, unmasked, masked, label):
    """Write one unmasked->masked translation table, unless it exists.

    :param outpath: str, destination path of the translation table.
    :param unmasked: list, the original identifiers.
    :param masked: list, the anonymous replacement identifiers.
    :param label: str, name used in the skip message.
    """
    if not check_file_exists(outpath) or self.force:
        translate = pd.DataFrame({'unmasked': unmasked,
                                  'masked': masked})
        save_dataframe(outpath=outpath, df=translate,
                       index=False, header=True)
        del translate
    else:
        print("\tSkipping {} translate table.".format(label))

def _mask_and_save(self, df, outpath, label, row_mask=None,
                   col_mask=None):
    """Relabel a dataframe's axes in place with masks and save it.

    Mutates `df` (the instance attribute passed in), matching the
    original in-place `self.<df>.index = mask` behavior.

    :param df: DataFrame to relabel and save.
    :param outpath: str, destination path.
    :param label: str, name used in the skip message.
    :param row_mask: list or None, replacement index labels.
    :param col_mask: list or None, replacement column labels.
    """
    if not check_file_exists(outpath) or self.force:
        if row_mask is not None:
            df.index = row_mask
        if col_mask is not None:
            df.columns = col_mask
        save_dataframe(outpath=outpath, df=df, index=True, header=True)
    else:
        print("\tSkipping {}.".format(label))
def save(self):
    """Write the covariates table, including index and header, to disk."""
    save_dataframe(outpath=self.outpath, df=self.covariates,
                   header=True, index=True)
def save(self):
    """Write the genotype-to-expression coupling table (no index, no header)."""
    save_dataframe(outpath=self.outpath, df=self.gte,
                   header=False, index=False)