def get_df(vcf_file, model_name):
    """Load a kipoi-veff-annotated VCF into a score DataFrame plus variant metadata.

    Args:
        vcf_file: path to a VCF annotated by kipoi-veff, readable by KipoiVCFParser.
        model_name: Kipoi model name, used to reformat the score column names.

    Returns:
        (df, meta_info): model-score DataFrame and variant metadata DataFrame,
        both indexed by a "chr:pos:ref:alt" ``variant_uid`` string.
    """
    df = pd.DataFrame(list(KipoiVCFParser(vcf_file)))
    # .copy() so the variant_uid assignment below writes into an independent
    # frame rather than a view of `df` (avoids SettingWithCopyWarning and a
    # potentially lost write).
    meta_info = df[[
        "variant_chr", "variant_pos", "variant_ref", "variant_alt",
        "variant_id"
    ]].copy()
    meta_info["variant_uid"] = (
        df["variant_chr"].astype(str) + ':' + df["variant_pos"].astype(str) +
        ':' + df["variant_ref"] + ':' + df["variant_alt"])
    df.index = meta_info["variant_uid"]
    meta_info.index = meta_info["variant_uid"]
    # The raw variant-description columns are redundant once variant_uid
    # is the index.
    obsolete_variant_columns = [
        "variant_chr", "variant_pos", "variant_ref", "variant_alt",
        "variant_id"
    ]
    df = df[[col for col in df.columns if col not in obsolete_variant_columns]]
    # Drop region-id ("rID") columns emitted by the parser.
    df = df[[col for col in df.columns if "rID" not in col]]
    col_types = ["_LOGIT_REF", "_LOGIT_ALT", "_REF", "_ALT", "_DIFF", "_LOGIT"]
    if model_name == "labranchor":
        # labranchor emits multiple per-position scores; collapse them.
        df = average_labranchor(df, model_name, col_types)
    else:
        df.columns = [
            refmt_col(col, model_name, col_types) for col in df.columns
        ]
    # clump variants together
    df = deduplicate_vars(df)
    # subset meta_info like df and add variant_uid as common ID
    meta_info = meta_info.loc[df.index, :]
    return df, meta_info
def process_file(self, filename):
    """Parse one annotated VCF and record each SNP's score under its model name.

    The model name is derived from the parent directory of *filename*.
    Scores are accumulated into ``self.snps["<chr>-<pos>"][model_name]``.
    """
    model_name = filename.split('/')[-2]
    self.model_names.add(model_name)
    for record in KipoiVCFParser(filename):
        # NOTE(review): the score is taken positionally (6th value of the
        # parsed record) — assumes a fixed KipoiVCFParser field order; confirm.
        score = list(record.values())[5]
        snp_name = f"{record['variant_chr']}-{record['variant_pos']}"
        self.snps.setdefault(snp_name, {})[model_name] = score
def get_df(vcf_file, model_name):
    """Read a kipoi-veff-annotated VCF into a score DataFrame indexed by variant_id.

    Args:
        vcf_file: path to a VCF annotated by kipoi-veff.
        model_name: Kipoi model name, used when reformatting score columns.

    Returns:
        DataFrame of model scores, one row per deduplicated variant.
    """
    records = list(KipoiVCFParser(vcf_file))
    df = pd.DataFrame(records)
    df.index = df["variant_id"]
    # The raw variant-description columns are redundant once variant_id is
    # the index; also drop the parser's region-id ("rID") columns.
    drop_cols = {
        "variant_chr", "variant_pos", "variant_ref", "variant_alt",
        "variant_id"
    }
    kept = [c for c in df.columns if c not in drop_cols and "rID" not in c]
    df = df[kept]
    col_types = ["_LOGIT_REF", "_LOGIT_ALT", "_REF", "_ALT", "_DIFF", "_LOGIT"]
    if model_name == "labranchor":
        df = average_labranchor(df, model_name, col_types)
    else:
        df.columns = [refmt_col(c, model_name, col_types) for c in df.columns]
    # clump variants together
    df = deduplicate_vars(df)
    return df
# get the model model = kipoi.get_model(model_name) # get the dataloader factory Dataloader = model.default_dataloader vcf_path = "../data/test.vcf" # The output vcf path, based on the input file name out_vcf_fpath = vcf_path[:-4] + "%s.vcf" % model_name.replace("/", "_") # The writer object that will output the annotated VCF writer = VcfWriter(model, vcf_path, out_vcf_fpath) # Information extraction from dataloader and model model_info = kipoi_veff.ModelInfoExtractor(model, Dataloader) # vcf_to_region will generate a variant-centered regions when presented a VCF record. vcf_to_region = kipoi_veff.SnvCenteredRg(model_info) dataloader_arguments = {"fasta_file": "../data/fasta_files/chr1.fa"} sp.predict_snvs( model, Dataloader, vcf_path, batch_size=32, dataloader_args=dataloader_arguments, vcf_to_region=vcf_to_region, #evaluation_function_kwargs={'diff_types': {'diff': Diff("mean"), 'deepsea_effect': DeepSEA_effect("mean")}}, sync_pred_writer=writer) vcf_reader = KipoiVCFParser(out_vcf_fpath) entries = [el for el in vcf_reader] #print(pd.DataFrame(entries).head().iloc[:,:7])
def get_clinvar_ext_Xy(clinvar='20180429',
                       keep_variants="^Pathogenic$|^Benign$"):
    """Load the clinvar data

    Builds the feature matrix X and label vector y for ClinVar variants by
    merging the Kipoi modeling table with clinical significance, VEP
    conservation scores, SPIDEX and dbscSNV annotations.

    Args:
        clinvar: clinvar version (publication date)
        keep_variants: regex of variants to keep

    Returns:
        (X_clinvar, y_clinvar): feature DataFrame (without
        ClinicalSignificance and the *_diff columns) and boolean label
        array (True == Pathogenic).
    """

    def variant_id(chrom, pos, ref, alt):
        # Canonical "chr:pos:ref:['alt']" key shared by all merged tables.
        # (Renamed first parameter: the original shadowed the builtin `chr`.)
        return chrom.astype(str) + ":" + pos.astype(
            str) + ":" + ref + ":['" + alt + "']"

    ddir = get_data_dir()
    df = pd.read_csv(
        f"{ddir}/processed/splicing/clinvar/annotated_vcf/{clinvar}.filtered/modeling_df.tsv",
        sep='\t')
    # Keep only Kipoi annotations
    df = df.iloc[:, ~df.columns.str.startswith("other_")]
    # Append clinical significance
    from kipoi_veff.parsers import KipoiVCFParser
    vcf_file = f"{ddir}/processed/splicing/clinvar/{clinvar}.filtered.vcf.gz"
    dfc = pd.DataFrame(list(KipoiVCFParser(vcf_file)))
    dfc['variant_id_old'] = dfc['variant_id']
    dfc['variant_id'] = variant_id(dfc.variant_chr, dfc.variant_pos,
                                   dfc.variant_ref, dfc.variant_alt)
    dfc['ClinicalSignificance'] = dfc['other_CLNSIG']
    df = pd.merge(df,
                  dfc[['variant_id', 'ClinicalSignificance']],
                  on='variant_id',
                  validate="many_to_one").drop_duplicates()
    # add the differences
    df["pathogenic"] = df.ClinicalSignificance == "Pathogenic"
    splicing_models = [
        "MaxEntScan/3prime", "MaxEntScan/5prime", "HAL", "labranchor"
    ]
    for m in splicing_models:
        # BUGFIX: original computed `_ref - _ref`, which is identically
        # zero (or NaN); the intended score difference is alt - ref.
        df[m + "_diff"] = df[m + "_alt"] - df[m + "_ref"]
        df[m + "_isna"] = df[m + "_ref"].isnull().astype(float)
    # Drop variants not scored by any splicing model.
    only_NA_rows = df[[m + "_diff"
                       for m in splicing_models]].isnull().all(axis=1)
    df = df[~only_NA_rows]
    df = df[~df.ClinicalSignificance.isnull()]
    df = df[df.ClinicalSignificance.str.match(keep_variants)]
    # Append conservation scores and dbscSNV from VEP
    df_vep = pd.read_csv(
        f"{ddir}/processed/splicing/clinvar/annotated_vcf/{clinvar}.filtered/VEP.txt.gz",
        sep='\t',
        na_values='-')
    # Split "chr:start-end" Location into separate columns.
    df_vep = df_vep.join(
        df_vep.Location.str.split(":|-", expand=True).rename(columns={
            0: "chr",
            1: "start",
            2: "end"
        }))
    df_vep['start'] = df_vep.start.astype(float)
    df_vep['end'] = df_vep.end.astype(float)
    df_vep['variant_id'] = variant_id(df_vep['chr'],
                                      df_vep.start.astype(int),
                                      df_vep.GIVEN_REF, df_vep.Allele)
    cons_features = [
        "CADD_raw", "CADD_phred", "phyloP46way_placental",
        "phyloP46way_primate"
    ]
    splice_features = ['rf_score', 'MaxEntScan_diff']
    # exclude stop_gained variants
    exclude = df_vep[df_vep.Consequence.str.startswith(
        "stop_gained")]['#Uploaded_variation'].unique()
    df_vep["early_stop"] = df_vep['#Uploaded_variation'].isin(exclude)
    df = pd.merge(df,
                  df_vep[["variant_id", "early_stop"] + cons_features +
                         splice_features].drop_duplicates(["variant_id"]),
                  on=["variant_id"],
                  how='left',
                  validate="many_to_one").drop_duplicates()
    # Append spidex
    df_spidex = pd.read_csv(
        f"{ddir}/raw/splicing/spidex/hg19_spidex.clinvar_{clinvar}.txt",
        sep='\t')
    df_spidex = df_spidex.drop_duplicates()
    df_spidex['variant_id'] = variant_id(df_spidex["#Chr"].astype(str),
                                         df_spidex.Start, df_spidex.Ref,
                                         df_spidex.Alt)
    df = pd.merge(df,
                  df_spidex[['variant_id', 'dpsi_max_tissue', 'dpsi_zscore']],
                  on="variant_id",
                  how='left')
    # Record missingness, then impute missing SPIDEX scores with 0.
    df['dpsi_max_tissue_isna'] = df['dpsi_max_tissue'].isnull()
    df['dpsi_zscore_isna'] = df['dpsi_zscore'].isnull()
    df.loc[df.dpsi_max_tissue.isnull(), "dpsi_max_tissue"] = 0
    df.loc[df.dpsi_zscore.isnull(), "dpsi_zscore"] = 0
    # Append dbscSNV
    dbsc = dd.read_csv(f"{ddir}/raw/splicing/dbscSNV/dbscSNV.chr*",
                       sep='\t',
                       dtype={
                           'chr': 'object'
                       },
                       na_values=".").compute()
    dbsc['variant_id'] = variant_id(dbsc.chr, dbsc.pos, dbsc.ref, dbsc.alt)
    dbsc = dbsc.rename(columns={
        'rf_score': 'dbscSNV_rf_score',
        'ada_score': 'dbscSNV_ada_score'
    })
    df = pd.merge(df, dbsc, on='variant_id', how='left')
    # Record missingness, then impute missing dbscSNV scores with 0.
    df['dbscSNV_rf_score_isna'] = df.dbscSNV_rf_score.isnull()
    df['dbscSNV_ada_score_isna'] = df.dbscSNV_ada_score.isnull()
    df.loc[df.dbscSNV_rf_score.isnull(), 'dbscSNV_rf_score'] = 0
    df.loc[df.dbscSNV_ada_score.isnull(), 'dbscSNV_ada_score'] = 0
    y_clinvar = np.array(df.ClinicalSignificance == "Pathogenic")
    X_clinvar = df.loc[:, df.columns != 'ClinicalSignificance']
    # NOTE: this drops every column containing "diff" from X, including
    # the *_diff features computed above and VEP's MaxEntScan_diff.
    X_clinvar = X_clinvar.iloc[:, ~X_clinvar.columns.str.contains("diff")]
    return X_clinvar, y_clinvar
def create_score_array(self):
    """
    returns an array with the following values [[min, max, score]]

    Merges the Kipoi-annotated VCF with the GWAS table on
    (position, ref, alt), then walks fixed-size windows from 0 to the
    maximum variant position, combining t-stats and Kipoi scores per
    window with the configured combiner function.
    """
    # BUGFIX: use `is None`, not `== None` — if gwas_df is a DataFrame,
    # `== None` compares elementwise and `if` on the result raises
    # ValueError ("truth value of a DataFrame is ambiguous").
    if self.gwas_df is None:
        self.process_gwas_df()
    vcf_reader = KipoiVCFParser(self.chrom_file)
    vcf_df = pd.DataFrame([el for el in vcf_reader])
    # .copy() so the to_numeric assignment below writes into this frame
    # rather than a view of self.gwas_df (SettingWithCopyWarning).
    subset_gwas = self.gwas_df[['location', 'tstat', 'pval', 'ref',
                                'alt']].copy()
    subset_gwas['location'] = pd.to_numeric(subset_gwas['location'])
    vcf_df['variant_pos'] = pd.to_numeric(vcf_df['variant_pos'])
    merged_df = pd.merge(
        vcf_df,
        subset_gwas,
        how='inner',
        left_on=['variant_pos', 'variant_ref', 'variant_alt'],
        right_on=['location', 'ref', 'alt'])
    merged_df = merged_df.dropna(subset=['tstat'])
    merged_df = merged_df.drop_duplicates(subset=['location', 'ref', 'alt'])
    max_val = max(merged_df['variant_pos'])
    final_values = []
    # NOTE(review): windows start at 0 rather than at the first variant
    # position — confirm that leading empty windows are intended.
    range_min = 0
    while range_min < max_val:
        range_max = range_min + self.window_size
        temp_df = merged_df[merged_df['location'].between(
            range_min, range_max)]
        locations = [tuple(x) for x in temp_df[['location', 'ref',
                                                'alt']].values]
        temp_df = temp_df.set_index(['location', 'ref', 'alt'])
        rinverse = None
        if self.function == 'corr':
            # Restrict to variants for which an R-inverse entry exists,
            # accepting either (loc, ref, alt) or (loc, alt, ref) order.
            rinverse, indices = self.gm.get_Rinverse(locations)
            if indices is not None:
                reversed_indices = [(a, c, b) for (a, b, c) in indices]
                temp_df = temp_df[temp_df.index.isin(indices)
                                  | temp_df.index.isin(reversed_indices)]
        # NOTE(review): the Kipoi score column is selected positionally
        # (6th column after set_index) — assumes a fixed parser field
        # order; confirm.
        kipoi_scores = np.array(temp_df[temp_df.columns[5]])
        t_stats = np.array(temp_df['tstat'])
        combiner = FunctionGetter.get_function(self.function,
                                               rinverse=rinverse)
        score = combiner(t_stats, kipoi_scores)
        final_values.append([range_min, range_max, score])
        range_min += self.window_size
    return final_values