def get_marker_df(self):
    """Return the marker data frame, reading it from disk on first use.

    The frame is loaded from ``<input_dir>/<markers_filename>`` (first row
    as header, no index column), cached on ``self.marker_df`` and followed
    by a call to ``self.validate()``. Subsequent calls return the cache.
    """
    if self.marker_df is not None:
        return self.marker_df

    marker_path = os.path.join(self.input_dir, self.markers_filename)
    self.marker_df = load_dataframe(inpath=marker_path,
                                    header=0,
                                    index_col=False)
    self.validate()
    return self.marker_df
def get_cov_df(self):
    """Return the covariate data frame, reading it from disk on first use.

    The frame is loaded from ``<input_dir>/<cov_filename>`` (first row as
    header, first column as index), cached on ``self.cov_df`` and followed
    by a call to ``self.validate()``. Subsequent calls return the cache.
    """
    if self.cov_df is not None:
        return self.cov_df

    cov_path = os.path.join(self.input_dir, self.cov_filename)
    self.cov_df = load_dataframe(inpath=cov_path, header=0, index_col=0)
    self.validate()
    return self.cov_df
def start(self):
    """Run the cell type factorization step, or reload its saved output.

    When both the PCA and the NMF output files exist and ``self.force`` is
    not set, the two result frames are read back from disk. Otherwise the
    factorization is performed and ``self.save()`` is called.
    """
    print("Starting factorization of celltype profile expression.")
    self.print_arguments()

    outputs_present = (check_file_exists(self.pca_outpath)
                       and check_file_exists(self.nmf_outpath))
    if outputs_present and not self.force:
        # Both result files are available: reuse them.
        print("Skipping step, loading result.")
        self.celltype_pcs = load_dataframe(inpath=self.pca_outpath,
                                           header=0,
                                           index_col=0)
        self.celltype_cs = load_dataframe(inpath=self.nmf_outpath,
                                          header=0,
                                          index_col=0)
        return

    results = self.perform_matrix_factorization()
    self.celltype_expression, self.celltype_pcs, self.celltype_cs = results
    self.save()
def filter_on_trait(self, df):
    """Annotate eQTLs with GWAS IDs / traits and keep the trait of interest.

    Two lookup files are read: ``self.gwasid_to_trait_filename`` (columns
    ``ID`` and ``Trait``) and ``self.snp_to_gwasid_filename`` (columns
    ``RsID`` and ``ID``). For every ``RsID`` the GWAS IDs, and the traits of
    those IDs that have one, are joined into comma separated strings.

    :param df: DataFrame with a ``SNPName`` column. It is modified in
               place: the columns ``GWASIDS`` and ``Trait`` are added and
               rows without a trait are dropped.
    :return: DataFrame, the rows of ``df`` whose ``Trait`` matches
             ``self.disease`` (case-insensitive pattern match).
    """
    # GWAS ID -> trait.
    id_trait_df = load_dataframe(inpath=self.gwasid_to_trait_filename,
                                 header=0,
                                 index_col=False)
    gwas_to_trait = pd.Series(id_trait_df["Trait"].values,
                              index=id_trait_df["ID"]).to_dict()
    del id_trait_df

    # RsID -> "id1, id2, ..." and RsID -> "trait1, trait2, ...".
    gwas_map = {}
    disease_map = {}
    snp_gwas_df = load_dataframe(inpath=self.snp_to_gwasid_filename,
                                 header=0,
                                 index_col=False,
                                 low_memory=False)
    for rs_id, gwas_id in zip(snp_gwas_df["RsID"], snp_gwas_df["ID"]):
        known_ids = gwas_map.get(rs_id)
        if known_ids is None:
            gwas_map[rs_id] = gwas_id
        else:
            gwas_map[rs_id] = "{}, {}".format(known_ids, gwas_id)

        if gwas_id in gwas_to_trait:
            trait = gwas_to_trait[gwas_id]
            known_traits = disease_map.get(rs_id)
            if known_traits is None:
                disease_map[rs_id] = trait
            else:
                disease_map[rs_id] = "{}, {}".format(known_traits, trait)
    del snp_gwas_df

    # SNPs without an entry map to NaN. `na_action` only accepts None or
    # 'ignore', so the default is used here.
    df["GWASIDS"] = df["SNPName"].map(gwas_map)
    df["Trait"] = df["SNPName"].map(disease_map)

    # Subset.
    df.dropna(subset=['Trait'], inplace=True)
    df = df[df['Trait'].str.contains(self.disease, case=False)]

    # The caller assigns the result, so the filtered frame must be returned.
    return df
def get_alleles_df(self):
    """Return the alleles data frame, reading it from disk on first use.

    At most ``self.nrows`` rows are read from
    ``<input_dir>/<alleles_filename>``. When ``self.interest`` is set, only
    those row positions are kept. The result is cached on
    ``self.alleles_df`` and ``self.validate()`` is called after loading.
    """
    if self.alleles_df is not None:
        return self.alleles_df

    alleles_path = os.path.join(self.input_dir, self.alleles_filename)
    loaded = load_dataframe(inpath=alleles_path,
                            header=0,
                            index_col=0,
                            nrows=self.nrows)
    if self.interest is None:
        self.alleles_df = loaded
    else:
        self.alleles_df = loaded.iloc[self.interest, :]
    self.validate()
    return self.alleles_df
def start(self):
    """Run the deconvolution step, or reload its saved output.

    An existing output file is reused unless ``self.force`` is set;
    otherwise the deconvolution is performed and ``self.save()`` is called.
    """
    print("Starting deconvolution.")
    self.print_arguments()

    reuse_existing = check_file_exists(self.outpath) and not self.force
    if not reuse_existing:
        self.deconvolution = self.perform_deconvolution()
        self.save()
        return

    print("Skipping step, loading result.")
    self.deconvolution = load_dataframe(inpath=self.outpath,
                                        header=0,
                                        index_col=0)
def start(self):
    """Create the covariate file, or reload it when it already exists.

    An existing output file is reused unless ``self.force`` is set;
    otherwise the input files are combined and ``self.save()`` is called.
    """
    print("Starting creating covariate file.")
    self.print_arguments()

    reuse_existing = check_file_exists(self.outpath) and not self.force
    if not reuse_existing:
        self.covariates = self.combine_files()
        self.save()
        return

    print("Skipping step, loading result.")
    self.covariates = load_dataframe(inpath=self.outpath,
                                     header=0,
                                     index_col=0)
def combine_files(self):
    """Concatenate every file matching ``self.inpath`` into one frame.

    Each file is read without header and without index column. The frames
    are stacked row-wise with a fresh integer index, after which duplicate
    rows are removed.

    :return: DataFrame, the combined, de-duplicated rows.
    :raises FileNotFoundError: when no file matches the glob pattern.
    """
    frames = [
        load_dataframe(inpath=infile, header=None, index_col=None)
        for infile in glob.glob(self.inpath)
    ]
    if not frames:
        # Without this guard the method would fail later on a None object.
        raise FileNotFoundError(
            "No input files match pattern: {}".format(self.inpath))

    # A single concat avoids re-copying the accumulated rows per file.
    combined = pd.concat(frames, axis=0, ignore_index=True)

    # Remove duplicate entries.
    combined.drop_duplicates(inplace=True)

    return combined
def get_eqtl_and_interactions_df(self):
    """Build the combined eQTL / significant-interaction data frame.

    The eQTL file and the transposed z-score file are loaded and checked
    row by row: both must have the same number of rows and every z-score
    row label must start with the ``SNPName`` of the eQTL on that row. On a
    mismatch a message is printed and the program exits.

    The z-scores are converted to flags: 1 when the value is greater than
    ``self.signif_cutoff``, else 0 (missing values become 0). The flags are
    appended as columns to the eQTL frame and the index is set to
    ``<row number>_<SNPName>_<ProbeName>``.

    :return: DataFrame, also stored on ``self.eqtl_and_interactions_df``.
    """
    # Get the complete input dataframes.
    eqtl_df = load_dataframe(inpath=os.path.join(self.input_dir,
                                                 self.eqtl_filename),
                             header=0,
                             index_col=False)
    zscore_df = load_dataframe(inpath=os.path.join(self.inter_input_dir,
                                                   self.inter_cov_subdir,
                                                   self.zscore_filename),
                               header=0,
                               index_col=0).T

    # Check if the files match up.
    if eqtl_df.shape[0] != zscore_df.shape[0]:
        print("Input files do not match (1).")
        exit()
    for zscore_label, snp_name in zip(zscore_df.index, eqtl_df["SNPName"]):
        if not zscore_label.startswith(snp_name):
            print("Input files do not match (2).")
            exit()

    # Reset the indices.
    eqtl_df.reset_index(drop=True, inplace=True)
    zscore_df.reset_index(drop=True, inplace=True)

    # Replace the z-scores with 1's and 0's (significant vs
    # not-significant). One comparison is used instead of two in-place
    # assignments: with a negative cutoff the zeros written by the first
    # assignment would be flagged as significant by the second. NaN
    # compares False and therefore becomes 0.
    signif_df = (zscore_df > self.signif_cutoff).astype('int8')

    # Combine.
    combined = pd.concat([eqtl_df, signif_df], axis=1)
    combined.index = combined.index.astype(str) + "_" + combined[
        "SNPName"] + "_" + combined["ProbeName"]
    self.eqtl_and_interactions_df = combined
    del eqtl_df, zscore_df, signif_df

    return self.eqtl_and_interactions_df
def get_inter_cov_snp_tvalue_df(self):
    """Return the SNP t-value data frame, reading it from disk on first use.

    The file is located at
    ``<inter_input_dir>/<inter_cov_subdir>/<snp_tvalue_filename>``. When
    ``self.interest`` is set, only those column positions are kept. The
    result is cached on ``self.inter_cov_snp_tvalue_df`` and
    ``self.validate()`` is called after loading.
    """
    if self.inter_cov_snp_tvalue_df is not None:
        return self.inter_cov_snp_tvalue_df

    tvalue_path = os.path.join(self.inter_input_dir,
                               self.inter_cov_subdir,
                               self.snp_tvalue_filename)
    loaded = load_dataframe(inpath=tvalue_path, header=0, index_col=0)
    if self.interest is None:
        self.inter_cov_snp_tvalue_df = loaded
    else:
        self.inter_cov_snp_tvalue_df = loaded.iloc[:, self.interest]
    self.validate()
    return self.inter_cov_snp_tvalue_df
def combine_files(self):
    """Concatenate the per-iteration input files into one frame.

    For iteration ``i`` in ``1..self.n_iterations`` the file
    ``<indir>/<iter_dirname><i>/<in_filename>`` is read and tagged with an
    ``Iteration`` column. The frames are stacked row-wise with a fresh
    integer index, after which duplicate rows are removed.

    :return: DataFrame, the combined, de-duplicated rows.
    :raises ValueError: when ``self.n_iterations`` is smaller than 1.
    """
    frames = []
    for iteration in range(1, self.n_iterations + 1):
        infile = os.path.join(self.indir,
                              self.iter_dirname + str(iteration),
                              self.in_filename)
        df = load_dataframe(inpath=infile, header=0, index_col=False)
        df["Iteration"] = iteration
        frames.append(df)

    if not frames:
        # Without this guard the method would fail later on a None object.
        raise ValueError("No iterations to combine "
                         "(n_iterations={}).".format(self.n_iterations))

    # A single concat avoids re-copying the accumulated rows per file.
    combined = pd.concat(frames, axis=0, ignore_index=True)

    # Remove duplicate entries.
    combined.drop_duplicates(inplace=True)

    return combined
def start(self):
    """Combine the GTE files (or reload the result) and build the lookups.

    An existing output file is reused unless ``self.force`` is set;
    otherwise the GTE files are combined and saved. In both cases the
    sample translate dict and the sample order are (re)built afterwards.
    """
    print("Starting combining GTE files.")
    self.print_arguments()

    reuse_existing = check_file_exists(self.outpath) and not self.force
    if reuse_existing:
        print("Skipping step, loading result.")
        self.gte = load_dataframe(inpath=self.outpath,
                                  header=None,
                                  index_col=None)
    else:
        # Load and combine each GTE file.
        self.gte = self.combine_files()
        self.save()

    # Construct the sample translate dict and the sample order.
    self.sample_dict = self.create_sample_dict()
    self.sample_order = self.set_sample_order()
def start(self):
    """Combine the eQTL probe files, or reload the saved result.

    An existing output file is reused unless ``self.force`` is set.
    Otherwise the probe files are combined, optionally filtered on
    ``self.disease`` (when it is neither empty nor None), stored on
    ``self.eqtl_probes`` and saved.
    """
    print("Starting combining eQTL probe files.")
    self.print_arguments()

    if check_file_exists(self.outpath) and not self.force:
        print("Skipping step, loading result.")
        self.eqtl_probes = load_dataframe(inpath=self.outpath,
                                          header=0,
                                          index_col=False)
        return

    # Load each eQTL probes file.
    print("Loading eQTLprobes files.")
    probes = self.combine_files()

    filter_requested = self.disease != "" and self.disease is not None
    if filter_requested:
        print("Filtering on trait: {}".format(self.disease))
        probes = self.filter_on_trait(probes)

    self.eqtl_probes = probes
    self.save()
def combine_groups(self, inter_outpath):
    """Collect the SNP / sample indices of all groups and the combined
    interaction matrix.

    For every group the pickled group object is loaded to obtain its SNP
    and sample indices. When the combined interaction matrix has to be
    (re)built, the interaction file of each group is located with
    ``self.inter_regex``, loaded and merged on the index; the columns are
    then ordered by the collected SNP indices and the matrix is saved to
    ``inter_outpath``. Otherwise the matrix is read from ``inter_outpath``.

    :param inter_outpath: string, path of the combined interaction matrix.
    :return: tuple of (sorted unique SNP indices, sorted unique sample
             indices, interaction DataFrame).
    """
    print("Combining groups.")

    # Decide once whether the matrix must be built. Re-checking the file
    # system per group (and again after the loop) is redundant and could
    # give inconsistent answers within a single run.
    build_inter = not check_file_exists(inter_outpath) or self.force

    snp_mask = np.array([], dtype=np.int16)
    sample_mask = np.array([], dtype=np.int16)
    inter_df = None
    n_groups = len(self.group_ids)
    for i, group_id in enumerate(self.group_ids):
        print(" Working on: {:10s} [{}/{} "
              "{:.2f}%]".format(group_id, i + 1, n_groups,
                                (100 / n_groups) * (i + 1)))

        # Define the directory names.
        data_indir = os.path.join(self.g_data_indir, group_id)
        inter_indir = os.path.join(self.g_inter_indir, group_id, 'output')

        # Load the group object.
        # NOTE(review): pickle.load can execute arbitrary code; only point
        # this at group files produced by a trusted run of the pipeline.
        with open(os.path.join(data_indir, self.obj_filename), "rb") as f:
            group_object = pickle.load(f)

        # Save the indices.
        snp_mask = np.append(snp_mask, group_object.get_snp_indices())
        sample_mask = np.append(sample_mask,
                                group_object.get_sample_indices())

        if not build_inter:
            continue

        # Search for the interaction filename (first match wins).
        inter_inpath = None
        for path in glob.glob(os.path.join(inter_indir, "*")):
            if re.match(self.inter_regex, get_basename(path)):
                inter_inpath = path
                break
        if inter_inpath is None:
            print("Interaction matrix not found.")
            exit()

        # Load the interaction file.
        group_inter_df = load_dataframe(inpath=inter_inpath,
                                        header=0,
                                        index_col=0)

        # Merge them.
        if inter_df is None:
            inter_df = group_inter_df
        else:
            inter_df = inter_df.merge(group_inter_df,
                                      left_index=True,
                                      right_index=True)

    print("Preparing interaction matrix.")
    if build_inter:
        # Sort the matrix columns according to the collected SNP indices.
        inter_df = inter_df.T
        inter_df["index"] = snp_mask
        inter_df.sort_values(by=['index'], inplace=True)
        inter_df.drop(["index"], axis=1, inplace=True)
        inter_df = inter_df.T

        save_dataframe(df=inter_df, outpath=inter_outpath,
                       index=True, header=True)
    else:
        inter_df = load_dataframe(inpath=inter_outpath,
                                  header=0,
                                  index_col=0)

    # Prepare the masks.
    snp_mask = sorted(set(snp_mask))
    sample_mask = sorted(set(sample_mask))

    return snp_mask, sample_mask, inter_df
def start(self):
    """
    The method that serves as the pipeline of the whole program.

    The SNP and sample masks of all groups are collected, after which the
    marker, eQTL, genotype, alleles, expression and covariate matrices are
    subset with those masks and written to ``self.outdir``. Every output
    that already exists is skipped unless ``self.force`` is set.
    """
    print("Starting combining groups.")
    self.print_arguments()

    # Combine the indices of each group and combine the interaction
    # matrix if need be.
    inter_outpath = os.path.join(self.outdir, self.inter_filename)
    snp_mask, sample_mask, inter_df = self.combine_groups(inter_outpath)

    print("\nSubsetting data with masks:")
    print("\tSNP mask:\tlength: {}\tlowest index: {}"
          "\thighest index: {}".format(len(snp_mask),
                                       min(snp_mask),
                                       max(snp_mask)))
    print("\tSample mask:\tlength: {}\tlowest index: {}"
          "\thighest index: {}".format(len(sample_mask),
                                       min(sample_mask),
                                       max(sample_mask)))
    print("")

    # The eQTL file is only needed when the marker matrix or the eQTL
    # matrix has to be created.
    markers_outpath = os.path.join(self.outdir, self.markers_filename)
    eqtl_outpath = os.path.join(self.outdir, self.eqtl_filename)
    eqtl_needed = (not check_file_exists(eqtl_outpath)
                   or not check_file_exists(markers_outpath)
                   or self.force)
    if eqtl_needed:
        print("Loading eQTL file.")
        eqtl_df = load_dataframe(inpath=self.eqtl_inpath,
                                 header=0,
                                 index_col=None)
        eqtl_df = eqtl_df.iloc[snp_mask, :]

        print("Preparing marker matrix.")
        if check_file_exists(markers_outpath) and not self.force:
            print("\tSkipping step.")
        else:
            self.create_marker_df(inter_df, eqtl_df, markers_outpath)

        print("Preparing eQTL matrix.")
        if check_file_exists(eqtl_outpath) and not self.force:
            print("\tSkipping step.")
        else:
            save_dataframe(outpath=eqtl_outpath, df=eqtl_df,
                           index=False, header=True)
        del eqtl_df

    del inter_df

    # The remaining matrices all follow the same recipe: load, subset by
    # position, save. Each entry is
    # (message, output path, input path, row selector, column selector).
    everything = slice(None)
    jobs = [
        ("\nPreparing genotype matrix.",
         os.path.join(self.outdir, self.geno_filename),
         os.path.join(self.data_indir, self.geno_filename),
         snp_mask, sample_mask),
        ("\nPreparing alleles matrix.",
         os.path.join(self.outdir, self.alleles_filename),
         os.path.join(self.data_indir, self.alleles_filename),
         snp_mask, everything),
        ("\nPreparing expression matrix.",
         os.path.join(self.outdir, self.expr_filename),
         os.path.join(self.data_indir, self.expr_filename),
         snp_mask, sample_mask),
        ("\nPreparing covariate matrix.",
         os.path.join(self.outdir, self.cov_filename),
         self.cov_inpath,
         everything, sample_mask),
    ]
    for message, outpath, inpath, rows, cols in jobs:
        print(message)
        if check_file_exists(outpath) and not self.force:
            print("\tSkipping step.")
            continue

        full_df = load_dataframe(inpath=inpath, header=0, index_col=0)
        subset_df = full_df.iloc[rows, cols]
        save_dataframe(outpath=outpath, df=subset_df,
                       index=True, header=True)
        del full_df, subset_df
def perform_deconvolution(self):
    """Estimate per-sample cell type proportions with NNLS.

    The profile (signature) matrix and the cell type expression matrix are
    loaded when not yet available. The expression is corrected for cohort
    effects, the profile is filtered and log2 transformed, and both are
    shifted when they contain negative values. ``self.nnls`` is then run
    per sample and the resulting weights are rescaled with
    ``self.sum_to_one``.

    :return: DataFrame, one row per sample and one
             ``<prefix>NNLS_<name>`` column per profile column.
    """
    if self.profile_df is None:
        # Load the celltype profile file.
        print("Loading cell type profile matrix.")
        self.profile_df = load_dataframe(self.profile_file,
                                         header=0, index_col=0)

    if self.ct_expr_df is None:
        # Load the celltype expression file.
        print("Loading cell type expression matrix.")
        self.ct_expr_df = load_dataframe(self.ct_expr_file,
                                         header=0, index_col=0)

    print("Loading sample cohort matrix.")
    sample_cohort_df = load_dataframe(self.sample_cohort_file,
                                      header=0, index_col=None)

    # Build the cohort matrix for the samples of the expression matrix.
    sample_names = list(self.ct_expr_df.columns)
    cohort_df = self.create_cohort_df(sample_names,
                                      sample_cohort_df,
                                      self.sample_id,
                                      self.cohort_id)

    # Filter uninformative genes from the signature matrix.
    signature = self.filter(self.profile_df)

    # Subset and reorder.
    signature, expression, cohort_df = self.subset(signature,
                                                   self.ct_expr_df,
                                                   cohort_df)

    # Correct for cohorts.
    expression = self.cohort_correction(expression, cohort_df)

    # Transform.
    signature = self.perform_log2_transform(signature)

    # Shift the data to be positive.
    print("Shifting data to be positive")
    if signature.values.min() < 0:
        signature = self.perform_shift(signature)
    if expression.values.min() < 0:
        expression = self.perform_shift(expression)

    print("Profile shape: {}".format(signature.shape))
    print("Expression shape: {}".format(expression.shape))

    # Perform deconvolution per sample.
    print("Performing partial deconvolution.")
    weights = []
    residuals = []
    for _, sample in expression.T.iterrows():
        proportions, rnorm = self.nnls(signature, sample)
        weights.append(proportions)
        residuals.append(rnorm)

    weight_names = ["{}NNLS_{}".format(*name.split("_"))
                    for name in signature.columns]
    decon_df = pd.DataFrame(weights,
                            index=expression.columns,
                            columns=weight_names)
    residuals_df = pd.Series(residuals, index=expression.columns)

    print("Estimated weights:")
    print(decon_df)
    print(decon_df.mean(axis=0))

    # Make the weights sum up to 1.
    decon_df = self.sum_to_one(decon_df)
    print("Estimated proportions:")
    print(decon_df)
    print(decon_df.mean(axis=0))

    # Report the residuals and their average.
    print(residuals_df)
    print("Average residual: {:.2f}".format(residuals_df.mean()))

    return decon_df
def combine_files(self):
    """Merge all covariate sources into one covariates-by-samples frame.

    Sources: technical covariates and cohort indicators (covariate file),
    sex (phenotype file), eigenvectors, the two eigenvectors before
    covariate correction, marker gene expression, the cell type PCA / NMF
    components and the deconvolution result. They are inner-merged on the
    sample index, transposed and ordered by ``self.sample_order``.

    :return: DataFrame with covariates as rows and samples as columns.
    """
    # Read the covariates file.
    print("Loading covariate matrix.")
    cov_df = load_dataframe(inpath=self.cov_file, header=0, index_col=0)
    tech_cov_df = cov_df[self.tech_covs].copy()
    cohorts_df = cov_df[self.cohorts].copy()
    del cov_df

    # Validate the cohorts: a sample without any cohort flag is assigned
    # to the reference cohort, after which every sample must have one.
    print("Validating cohorts.")
    cohort_counts = cohorts_df.sum(axis=1)
    cohorts_df[self.ref_cohort] = 0
    cohorts_df.loc[cohort_counts == 0, self.ref_cohort] = 1
    if cohorts_df.sum(axis=1).all():
        print("\tValid.")
    else:
        print("\tSome samples do not have a cohort.")
        exit()

    # Read the phenotype file.
    print("Loading phenotype matrix.")
    pheno_df = load_dataframe(inpath=self.pheno_file,
                              header=0,
                              index_col=4,
                              low_memory=False)

    # Combine the two gender columns, keep 'sex.by.expression' as main
    # gender and use 'Gender' when no information is available.
    sex_columns = pheno_df.loc[:, ["Gender", "sex.by.expression"]].replace(
        "no expression available", np.nan)
    del pheno_df
    sex = sex_columns['sex.by.expression'].combine_first(
        sex_columns['Gender'])
    gender_df = sex.to_frame(name="SEX")
    gender_df = gender_df.replace({"SEX": self.sex_dict})
    del sex_columns, sex

    # Read the eigenvectors file.
    print("Loading eigenvectors matrix.")
    eigen_df = load_dataframe(self.eig_file, header=0, index_col=0)
    wanted_components = ["Comp{}".format(x)
                         for x in range(1, self.n_eigen + 1)]
    eigen_df = eigen_df.loc[:, wanted_components]

    # Read the eigenvectors before covariate correction file.
    print("Loading eigenvectors before cov. correction matrix.")
    cov_cor_df = load_dataframe(self.eig_bef_cov_corr_file,
                                header=0,
                                index_col=0)
    cov_cor_df.columns = ["PC1-before-cov-correction",
                          "PC2-before-cov-correction"]

    # Read the marker genes expression file.
    print("Loading marker genes matrix.")
    marker_df = load_dataframe(self.marker_file, header=0, index_col=0)
    marker_df.sort_index(inplace=True)
    marker_df.drop_duplicates(inplace=True)
    marker_df = marker_df.T

    # Merge, left to right, on the index.
    print("Merging matrices.")
    tables = [tech_cov_df, cohorts_df, gender_df, eigen_df, cov_cor_df,
              marker_df, self.celltype_pcs.T, self.celltype_cs.T,
              self.deconvolution]
    comb_cov = tables[0]
    for table in tables[1:]:
        comb_cov = pd.merge(comb_cov, table,
                            left_index=True, right_index=True)
    comb_cov = comb_cov.T
    comb_cov = comb_cov[self.sample_order]
    comb_cov.index.name = "-"
    print("\tShape: {}".format(comb_cov.shape))

    # Remove old dataframes.
    del tables
    del tech_cov_df, cohorts_df, gender_df, eigen_df, cov_cor_df, marker_df

    return comb_cov
def start(self):
    """
    The method that serves as the pipeline of the whole program.

    Steps 1-7 are executed in order, each configured from its own settings
    section and force flag. The program exits right after step 7, so the
    loading / validation code and steps 8 and 10 below it are currently
    never reached.
    """
    print("Starting program.")
    settings = self.settings
    force = self.force_dict
    outdir = self.outdir

    # Step 1. Combine GTE files.
    print("\n### STEP1 ###\n")
    cgtef = CombineGTEFiles(
        settings=settings.get_setting('combine_gte_files'),
        force=force['combine_gte_files'],
        outdir=outdir)
    cgtef.start()
    cgtef.clear_variables()

    # Step 2. Combine eQTL probes files.
    print("\n### STEP2 ###\n")
    cepf = CombineEQTLProbes(
        settings=settings.get_setting('combine_eqtlprobes'),
        disease=self.disease,
        force=force['combine_eqtlprobes'],
        outdir=outdir)
    cepf.start()
    cepf.clear_variables()

    # Step 3. Create the ordered unmasked matrices.
    print("\n### STEP3 ###\n")
    cm = CreateMatrices(
        settings=settings.get_setting('create_matrices'),
        gte_df=cgtef.get_gte(),
        sample_dict=cgtef.get_sample_dict(),
        sample_order=cgtef.get_sample_order(),
        eqtl_df=cepf.get_eqtlprobes(),
        force=force['create_matrices'],
        outdir=outdir)
    cm.start()
    cm.clear_variables()

    # Step 4. Create the deconvolution matrices.
    print("\n### STEP4 ###\n")
    cdm = CreateDeconvolutionMatrices(
        settings=settings.get_setting('create_deconvolution_matrices'),
        expr_file=cm.get_expr_file(),
        expr_df=cm.get_complete_expr_matrix(),
        sample_dict=cgtef.get_sample_dict(),
        sample_order=cgtef.get_sample_order(),
        force=force['create_deconvolution_matrices'],
        outdir=outdir)
    cdm.start()
    cdm.clear_variables()

    # Step 5. Create the celltype PCA file.
    print("\n### STEP5 ###\n")
    pcf = PerformCelltypeFactorization(
        settings=settings.get_setting('perform_celltype_factorization'),
        profile_file=cdm.get_celltype_profile_file(),
        profile_df=cdm.get_celltype_profile(),
        ct_expr_file=cdm.get_ct_profile_expr_outpath(),
        force=force['perform_celltype_factorization'],
        outdir=outdir)
    pcf.start()
    pcf.clear_variables()

    # Step 6. Perform the deconvolution. The local is deliberately not
    # called `pd`, the conventional alias of pandas.
    print("\n### STEP6 ###\n")
    deconvolver = PerformDeconvolution(
        settings=settings.get_setting('perform_deconvolution'),
        profile_file=cdm.get_celltype_profile_file(),
        profile_df=cdm.get_celltype_profile(),
        ct_expr_file=cdm.get_ct_profile_expr_outpath(),
        ct_expr_df=pcf.get_celltype_expression(),
        force=force['perform_deconvolution'],
        outdir=outdir)
    deconvolver.start()
    deconvolver.clear_variables()

    # Step 7. Create the covariance matrix.
    print("\n### STEP7 ###\n")
    ccm = CreateCovMatrix(
        settings=settings.get_setting('create_cov_matrix'),
        marker_file=cdm.get_markers_outpath(),
        celltype_pcs=pcf.get_celltype_pcs(),
        celltype_cs=pcf.get_celltype_cs(),
        deconvolution=deconvolver.get_deconvolution(),
        sample_order=cgtef.get_sample_order(),
        force=force['create_cov_matrix'],
        outdir=outdir)
    ccm.start()
    ccm.clear_variables()

    # NOTE(review): this unconditional exit makes everything below
    # unreachable; confirm whether it is a leftover development stop.
    exit()

    # Load the complete dataframes.
    print("\n### LOADING SORTED DATAFRAMES ###\n")
    print("Extracting eQTL dataframe.")
    eqtl_df = cepf.get_eqtlprobes()

    print("Loading genotype dataframe.")
    geno_df = load_dataframe(cm.get_geno_outpath(),
                             header=0,
                             index_col=0)

    print("Loading alleles dataframe.")
    alleles_df = load_dataframe(cm.get_alleles_outpath(),
                                header=0,
                                index_col=0)

    print("Loading expression dataframe.")
    expr_df = load_dataframe(cm.get_expr_outpath(),
                             header=0,
                             index_col=0)

    print("Extracting covariates dataframe.")
    cov_df = ccm.get_covariates()

    # Validate the matrices.
    print("Validating matrices.")
    self.validate(eqtl_df.copy(), geno_df, alleles_df, expr_df, cov_df)

    # Step 8. Create the masked matrices.
    print("\n### STEP8 ###\n")
    cmm = MaskMatrices(
        settings=settings.get_setting('mask_matrices'),
        eqtl_df=eqtl_df.copy(),
        geno_df=geno_df.copy(),
        alleles_df=alleles_df.copy(),
        expr_df=expr_df.copy(),
        cov_df=cov_df.copy(),
        force=force['mask_matrices'],
        outdir=outdir)
    cmm.start()
    del cmm

    # Step 9 (CreateGroups, settings section 'create_groups') is disabled.

    # Step 10. Create the regression matrices.
    print("\n### STEP10 ###\n")
    crm = CreateRegressionMatrix(
        settings=settings.get_setting('create_regression_matrix'),
        eqtl_df=eqtl_df.copy(),
        geno_df=geno_df.copy(),
        alleles_df=alleles_df.copy(),
        expr_df=expr_df.copy(),
        force=force['create_regression_matrix'],
        outdir=outdir)
    crm.start()
    del crm
def start(self):
    """Write row-aligned genotype, alleles and expression files per eQTL.

    The step is skipped when all three output files exist and
    ``self.force`` is not set. Otherwise existing outputs are removed, the
    genotype and expression matrices are loaded, renamed with
    ``self.sample_dict`` and ordered by ``self.sample_order``, and for each
    row of ``self.eqtl_df`` one line is appended to each of the three
    output files.

    An eQTL is written only when its SNP has exactly one genotype row and
    one alleles row and its probe has exactly one expression row; otherwise
    a message is printed and the eQTL is left out of all three files, so
    that the files stay aligned row by row.
    """
    print("Starting creating matrices.")
    self.print_arguments()

    # Check if output file exist.
    if check_file_exists(self.geno_outpath) and \
            check_file_exists(self.alleles_outpath) and \
            check_file_exists(self.expr_outpath) and \
            not self.force:
        print("Skipping step.")
        return

    # Remove the output files.
    for outfile in [self.geno_outpath, self.alleles_outpath,
                    self.expr_outpath]:
        if os.path.isfile(outfile):
            print("Removing file: {}.".format(outfile))
            os.remove(outfile)

    # Load the genotype matrix file.
    print("Loading genotype matrix.")
    geno_df = load_dataframe(self.geno_file, header=0, index_col=0)
    allele_df = geno_df.loc[:, ["Alleles", "MinorAllele"]].copy()
    geno_df = geno_df.rename(columns=self.sample_dict)
    geno_df = geno_df[self.sample_order]

    # Load the expression matrix file.
    print("Loading expression matrix.")
    expr_df = load_dataframe(self.expr_file, header=0, index_col=0)
    expr_df = expr_df.rename(columns=self.sample_dict)
    self.complete_expr_matrix = expr_df[self.sample_order]

    def single_row(frame, label):
        """Return the only row labelled `label`, or None if absent/ambiguous."""
        try:
            rows = frame.loc[[label], :]
        except KeyError:
            # Current pandas raises for a missing label instead of
            # returning an empty selection.
            return None
        if len(rows.index) != 1:
            return None
        return rows

    # Construct the genotype / expression matrices.
    print("Constructing matrices.")
    geno_str_buffer = ["-" + "\t" + "\t".join(self.sample_order) + "\n"]
    expr_str_buffer = ["-" + "\t" + "\t".join(self.sample_order) + "\n"]
    allele_str_buffer = [
        "-" + "\t" + "\t".join(list(allele_df.columns)) + "\n"
    ]

    n_snps = self.eqtl_df.shape[0]
    last_index = n_snps - 1
    for i, row in self.eqtl_df.iterrows():
        if (i % 250 == 0) or (i == last_index):
            # With a single eQTL, last_index is 0; avoid dividing by it.
            if last_index > 0:
                progress = (100 / last_index) * i
            else:
                progress = 100.0
            print("\tProcessing {}/{} "
                  "[{:.2f}%]".format(i, last_index, progress))

            # Write output files.
            self.write_buffer(self.geno_outpath, geno_str_buffer)
            geno_str_buffer = []

            self.write_buffer(self.expr_outpath, expr_str_buffer)
            expr_str_buffer = []

            self.write_buffer(self.alleles_outpath, allele_str_buffer)
            allele_str_buffer = []

        # Get the row info.
        snp_name = row["SNPName"]
        probe_name = row["ProbeName"]

        # Do every lookup before buffering anything, so that a failing
        # lookup cannot leave a line in only some of the files.
        genotype = single_row(geno_df, snp_name)
        if genotype is None:
            print("SNP: {} gives 0 or >1 "
                  "genotypes.".format(snp_name))
            continue

        alleles = single_row(allele_df, snp_name)
        if alleles is None:
            print("SNP: {} gives 0 or >1 "
                  "alleles.".format(snp_name))
            continue

        expression = single_row(self.complete_expr_matrix, probe_name)
        if expression is None:
            print("Probe: {} gives 0 or >1 expression "
                  "profiles.".format(probe_name))
            continue

        geno_str_buffer.append(
            snp_name + "\t" + "\t".join(
                genotype.iloc[0, :].astype(str).values) + "\n")
        allele_str_buffer.append(
            "{}\t{}\t{}\n".format(snp_name,
                                  alleles.iloc[0]["Alleles"],
                                  alleles.iloc[0]["MinorAllele"]))
        expr_str_buffer.append(
            probe_name + "\t" + "\t".join(
                expression.iloc[0, :].astype(str).values) + "\n")

    # Write the remaining lines.
    if geno_str_buffer:
        self.write_buffer(self.geno_outpath, geno_str_buffer)
    if expr_str_buffer:
        self.write_buffer(self.expr_outpath, expr_str_buffer)
    if allele_str_buffer:
        self.write_buffer(self.alleles_outpath, allele_str_buffer)

    # Remove old dataframes.
    del geno_df, expr_df
def start(self):
    """Create the marker gene and cell type profile expression tables.

    The step is skipped when both output files exist and ``self.force`` is
    not set. The expression matrix is taken from ``self.expr_df`` or loaded
    from file (always from ``self.decon_expr_file`` when that is set), its
    row labels are translated with the translate file, and the rows of the
    marker genes / profile genes are written out.
    """
    print("Starting creating deconvolution matrices.")
    self.print_arguments()

    # Check if output file exist.
    if check_file_exists(self.markers_outpath) and \
            check_file_exists(self.ct_profile_expr_outpath) and \
            not self.force:
        print("Skipping step.")
        return

    # Check which expression file we will use.
    if self.decon_expr_file:
        print("Warning: using a different expression file for "
              "deconvolution than for gene expression. This might take "
              "longer to load.")
        expr_file = self.decon_expr_file
        expr_df = None
    else:
        expr_file = self.expr_file
        expr_df = self.expr_df

    # Load the complete expression file.
    if expr_df is None:
        print("Loading expression matrix.")
        expr_df = load_dataframe(expr_file, header=0, index_col=0)
        expr_df = expr_df.rename(columns=self.sample_dict)
        expr_df = expr_df[self.sample_order]

    # Load the translate file.
    print("Loading translate matrix.")
    trans_df = load_dataframe(self.translate_file,
                              header=0,
                              index_col=None)
    trans_dict = dict(zip(trans_df.loc[:, "ArrayAddress"],
                          trans_df.loc[:, "Symbol"]))

    # Translate the row labels (ArrayAddress) to symbols.
    # NOTE(review): when the frame came from self.expr_df this changes
    # that object's index in place - confirm callers expect that.
    expr_df.index = expr_df.index.map(trans_dict)
    expr_df.index.name = "-"

    # Remove unneeded variables.
    del trans_df, trans_dict

    header_line = "-" + "\t" + "\t".join(self.sample_order) + "\n"

    def expression_lines(prefixed_genes):
        """Return one output line per gene that has exactly one row."""
        lines = []
        for prefix, gene in prefixed_genes:
            if gene not in expr_df.index:
                continue
            expression = expr_df.loc[[gene], :]
            if (len(expression.index)) != 1:
                print("\tMarker gene: {} gives 0 or >1 expression "
                      "profiles.".format(gene))
                continue
            values = "\t".join(expression.iloc[0, :].astype(str).values)
            lines.append(prefix + gene + "\t" + values + "\n")
        return lines

    # Create the marker gene file.
    if not check_file_exists(self.markers_outpath) or self.force:
        if os.path.isfile(self.markers_outpath):
            print("Removing: {}".format(self.markers_outpath))
            os.remove(self.markers_outpath)

        print("Creating marker gene expression table.")
        marker_pairs = (
            (self.marker_genes_suffix + "_" + celltype + "_", marker_gene)
            for celltype, marker_genes in self.marker_dict.items()
            for marker_gene in marker_genes
        )
        marker_str_buffer = [header_line] + expression_lines(marker_pairs)
        self.write_buffer(self.markers_outpath, marker_str_buffer)

    # Create the cell type profile expression file.
    if not check_file_exists(self.ct_profile_expr_outpath) or self.force:
        if os.path.isfile(self.ct_profile_expr_outpath):
            print("Removing: {}".format(self.ct_profile_expr_outpath))
            os.remove(self.ct_profile_expr_outpath)

        # Load the celltype profile file.
        print("Loading cell type profile matrix.")
        self.celltype_profile = load_dataframe(self.celltype_profile_file,
                                               header=0,
                                               index_col=0)

        print("Creating cell type profile expression table.")
        profile_pairs = (("", gene) for gene in self.celltype_profile.index)
        profile_str_buffer = [header_line] + expression_lines(profile_pairs)
        self.write_buffer(self.ct_profile_expr_outpath, profile_str_buffer)
def perform_matrix_factorization(self):
    """Compute the first PCA and NMF component per cell type.

    Every gene of the profile matrix is assigned to the cell type (column)
    with the highest value after ``self.normalize``. Per cell type, the
    expression of its genes is reduced to the first PCA component and,
    after shifting the expression when it contains negative values, to the
    first NMF component.

    :return: tuple of (cell type expression DataFrame, PCA components
             DataFrame, NMF components DataFrame).
    """
    # Load the expression data.
    print("Loading celltype expression data.")
    ct_expr_df = load_dataframe(inpath=self.ct_expr_file,
                                header=0,
                                index_col=0)

    if self.profile_df is None:
        # Load the celltype profile file.
        print("Loading cell type profile matrix.")
        self.profile_df = load_dataframe(self.profile_file,
                                         header=0, index_col=0)

    # Find the genes specific to each celltype.
    gene_celltypes = self.normalize(self.profile_df).idxmax(axis=1)

    def first_components(expr_df, step_message, extract):
        """Return the first component values of every cell type."""
        rows = []
        for celltype in self.profile_df.columns:
            print("\tWorking on: {}".format(celltype))
            ct_genes = gene_celltypes[gene_celltypes == celltype].index
            ct_expr = expr_df.loc[expr_df.index.isin(ct_genes), :]
            print("\t N = {}".format(len(ct_expr.index)))

            # Reduce the expression of these genes to one component.
            print(step_message)
            component = extract(ct_expr)
            rows.append([x[0] for x in list(component)])
        return rows

    def component_labels(template):
        """Build the row labels from the profile column names."""
        return [template.format(*name.split("_"))
                for name in self.profile_df.columns]

    # First PCA component of each celltype subset expression profile.
    print("Performing PCA")
    pca_data = first_components(ct_expr_df,
                                "\t PCA",
                                self.get_first_pca_component)
    celltype_pcs = pd.DataFrame(pca_data,
                                index=component_labels("{}PCA_{}_PC1"),
                                columns=ct_expr_df.columns)

    # Shift the expression to be all positive.
    if ct_expr_df.values.min() < 0:
        shifted_ct_expr = self.perform_shift(ct_expr_df)
    else:
        shifted_ct_expr = ct_expr_df.copy()

    # First NMF component of each celltype subset expression profile.
    print("Performing NMF")
    nmf_data = first_components(shifted_ct_expr,
                                "\t NMF",
                                self.get_first_nmf_component)
    celltype_cs = pd.DataFrame(nmf_data,
                               index=component_labels("{}NMF_{}_C1"),
                               columns=shifted_ct_expr.columns)

    return ct_expr_df, celltype_pcs, celltype_cs
def work(self, permutation_orders):
    """
    Method that does the interaction analysis.

    For each eQTL row and each covariate a null model (without the
    covariate x genotype interaction term) is compared with an alternative
    model (with that term), once per sample order in permutation_orders.
    The t-values and the p-value of each comparison are added to a Storage
    object.

    :param permutation_orders: iterable of sample orders; each order is
                               used to positionally reorder the values of
                               a covariate row. According to the inline
                               comment below, the first order is the
                               normal order and the remainder are random
                               shuffles.
    :return storage: object, a storage object containing all results. It
                     is returned early, without storing the eQTL that is
                     being processed, once the current time exceeds
                     self.panic_time.
    """
    # Load the data. Only the rows skip_rows+1 .. skip_rows+n_eqtls of the
    # genotype and expression files are read (the header is kept).
    print("Loading data", flush=True)
    cov_df = load_dataframe(self.cov_inpath, header=0, index_col=0)
    geno_df = load_dataframe(
        self.geno_inpath,
        header=0,
        index_col=0,
        skiprows=[i for i in range(1, self.skip_rows + 1)],
        nrows=self.n_eqtls)
    expr_df = load_dataframe(
        self.expr_inpath,
        header=0,
        index_col=0,
        skiprows=[i for i in range(1, self.skip_rows + 1)],
        nrows=self.n_eqtls)

    # Drop the covariates we don't want.
    if len(self.drop_covs) > 0:
        cov_df.drop(self.drop_covs, axis=0, inplace=True)

    # Split the covariate table into covariates of interest and technical
    # covariates.
    print("Extracting technical covariates data frame")
    tech_cov_df = cov_df.loc[self.tech_covs, :].copy()
    print("\tShape: {}".format(tech_cov_df.shape))

    # Replace -1 with NaN in the genotype dataframe. This way we can
    # drop missing values.
    geno_df.replace(-1, np.nan, inplace=True)

    # Initialize the storage object with the covariate names split into
    # technical covariates and all other covariates.
    print("Creating storage object")
    tech_cov_names = []
    cov_names = []
    for rowname in cov_df.index:
        if rowname in self.tech_covs:
            tech_cov_names.append(rowname)
        else:
            cov_names.append(rowname)
    storage = Storage(tech_covs=tech_cov_names, covs=cov_names)
    storage.print_info()

    # Start working. row_index is the position within the loaded frames,
    # eqtl_index the position within the complete input files.
    print("Starting interaction analyser", flush=True)
    for row_index, eqtl_index in enumerate(
            [i for i in range(self.skip_rows,
                              self.skip_rows + self.n_eqtls)]):
        print("\tProcessing eQTL {}/{} "
              "[{:.0f}%]".format(row_index + 1,
                                 self.n_eqtls,
                                 (100 / self.n_eqtls) * (row_index + 1)),
              flush=True)

        # Get the complete genotype row for the permutation later.
        genotype_all = geno_df.iloc[row_index, :].copy()

        # Get the positions of the samples with a (non-missing) genotype.
        indices = np.arange(geno_df.shape[1])
        eqtl_indices = indices[~geno_df.iloc[row_index, :].isnull().values]

        # Subset the row and present samples for this eQTL.
        genotype = geno_df.iloc[row_index, eqtl_indices].copy()
        expression = expr_df.iloc[row_index, eqtl_indices].copy()
        technical_covs = tech_cov_df.iloc[:, eqtl_indices].copy()
        covariates = cov_df.iloc[:, eqtl_indices].copy()

        # Create the null model. Null model are all the technical
        # covariates multiplied with the genotype + the SNP.
        tech_inter_matrix = technical_covs.mul(genotype, axis=1)
        tech_inter_matrix.index = [
            "{}_X_SNP".format(x) for x in technical_covs.index
        ]
        intercept = pd.DataFrame(1, index=genotype.index,
                                 columns=["intercept"])
        # Merge the pieces on the sample index into one samples-by-terms
        # matrix.
        base_matrix = reduce(
            lambda left, right: pd.merge(left,
                                         right,
                                         left_index=True,
                                         right_index=True),
            [
                intercept,
                genotype.to_frame(),
                technical_covs.T,
                tech_inter_matrix.T
            ])

        # Initialize variables.
        storage.add_row(eqtl_index, genotype.name)

        # Loop over the covariates.
        for cov_index in range(len(cov_df.index)):
            # Stop processing this eQTL as soon as an error was flagged.
            if storage.has_error():
                break

            # Get the covariate we are processing.
            covariate = covariates.iloc[cov_index, :]
            cov_name = covariate.name

            if self.verbose:
                print("\t\tWorking on '{}'".format(cov_name), flush=True)

            # Add the covariate to the null matrix if it isn't already.
            null_matrix = base_matrix.copy()
            if cov_name not in null_matrix.columns:
                covariate_df = covariate.copy()
                null_matrix = null_matrix.merge(covariate_df.to_frame(),
                                                left_index=True,
                                                right_index=True)

            # Create the null model.
            n_null = null_matrix.shape[0]
            df_null, rss_null, _ = self.create_model(
                null_matrix, expression)

            # if self.verbose:
            #     print("\t\tn_null: {}\tdf_null: {}\trss_null: {}\t".format(n_null, df_null, rss_null))

            # Loop over each permutation sample order. The first order
            # is the normal order and the remainder are random shuffles.
            for order_id, sample_order in enumerate(permutation_orders):
                if storage.has_error():
                    break

                if self.verbose:
                    print("\t\t\tWorking on 'order_{}'".format(order_id),
                          flush=True)

                # Reorder the covariate based on the sample order.
                # Make sure the labels are in the same order, just
                # shuffle the values.
                covariate_all = cov_df.iloc[cov_index, :].copy()
                covariate_all_index = covariate_all.index
                covariate_all = covariate_all.reindex(
                    covariate_all.index[sample_order])
                covariate_all.index = covariate_all_index

                # Calculate the interaction effect of the covariate of
                # interest. Then drop the NA's from the interaction
                # term.
                inter_of_interest = covariate_all * genotype_all
                inter_name = "{}_X_SNP".format(cov_name)
                # Technical covariates already have an interaction column
                # with this name in the null matrix; use a distinct name.
                if inter_name in null_matrix.columns:
                    inter_name = inter_name + "_2"
                inter_of_interest.name = inter_name
                inter_of_interest = inter_of_interest.iloc[eqtl_indices]

                # Check if the drop is identical (see above).
                if not inter_of_interest.index.equals(null_matrix.index):
                    print("\t\t\tError in permutation reordering "
                          "(ID: {})".format(order_id), flush=True)
                    storage.set_error()
                    continue

                # Create the alternative matrix and add the interaction
                # term.
                alt_matrix = null_matrix.copy()
                alt_matrix = alt_matrix.merge(inter_of_interest.to_frame(),
                                              left_index=True,
                                              right_index=True)

                # Create the alternative model.
                n_alt = alt_matrix.shape[0]
                df_alt, rss_alt, alt_tvalues = self.create_model(
                    alt_matrix,
                    expression,
                    tvalue_cols=[genotype.name, inter_name])

                # if self.verbose:
                #     print("\t\t\tn_alt: {}\tdf_alt: {}\trss_alt: {}\talt_tvalues: {}".format(n_alt, df_alt, rss_alt, alt_tvalues))

                # Save the t-values.
                storage.add_value(cov_name, order_id,
                                  "snp_tvalue",
                                  alt_tvalues[genotype.name])
                storage.add_value(cov_name, order_id,
                                  "inter_tvalue",
                                  alt_tvalues[inter_name])

                # Make sure the n's are identical.
                if n_null != n_alt:
                    print("\t\t\tError due to unequal n_null and n_alt",
                          flush=True)
                    storage.set_error()
                    continue

                # Compare the null and alternative model.
                fvalue = self.calc_f_value(rss_null, rss_alt,
                                           df_null, df_alt, n_null)
                pvalue = self.get_p_value(fvalue, df_null, df_alt, n_null)

                # if self.verbose:
                #     print("\t\t\tfvalue: {}\tpvalue: {}".format(fvalue, pvalue))

                # Save the p-values.
                storage.add_value(cov_name, order_id, "pvalue", pvalue)

                # Check whether we are almost running out of time. The
                # storage is returned as-is; store_row() is not called for
                # the eQTL in progress.
                if time.time() > self.panic_time:
                    print("\tPanic!!!", flush=True)
                    return storage

        # Save the results of the eQTL.
        storage.store_row()

    return storage