Example 1
def get_df(vcf_file, model_name):
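    # KipoiVCFParser iterates the annotated VCF and yields one dict per record,
    # so this builds a DataFrame with one row per variant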
    df = pd.DataFrame(list(KipoiVCFParser(vcf_file)))
    # copy so that adding variant_uid below does not write into a slice of df
    meta_info = df[[
        "variant_chr", "variant_pos", "variant_ref", "variant_alt",
        "variant_id"
    ]].copy()
    meta_info["variant_uid"] = (df["variant_chr"].astype(str) + ':' +
                                df["variant_pos"].astype(str) + ':' +
                                df["variant_ref"] + ':' + df["variant_alt"])
    df.index = meta_info["variant_uid"]
    meta_info.index = meta_info["variant_uid"]
    obsolete_variant_columns = [
        "variant_chr", "variant_pos", "variant_ref", "variant_alt",
        "variant_id"
    ]
    df = df[[col for col in df.columns if col not in obsolete_variant_columns]]
    df = df[[col for col in df.columns if "rID" not in col]]
    col_types = ["_LOGIT_REF", "_LOGIT_ALT", "_REF", "_ALT", "_DIFF", "_LOGIT"]
    if model_name == "labranchor":
        df = average_labranchor(df, model_name, col_types)
    else:
        df.columns = [
            refmt_col(col, model_name, col_types) for col in df.columns
        ]
    # clump variants together
    df = deduplicate_vars(df)
    # subset meta_info to the same rows as df; variant_uid is the shared index
    meta_info = meta_info.loc[df.index, :]
    return df, meta_info
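
A minimal usage sketch of the helper above (the annotated VCF path and model name are hypothetical):

df, meta_info = get_df("predictions.HAL.vcf.gz", "HAL")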
Example 2
    def process_file(self, filename):
        model_name = filename.split('/')[-2]
        self.model_names.add(model_name)
        vcf_reader = KipoiVCFParser(filename)
        for el in vcf_reader:
            chrom = el['variant_chr']
            position = el['variant_pos']
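            # values 0-4 of the parsed record are the variant_chr/pos/ref/alt/id
            # fields; index 5 is assumed to be the model's score column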
            score = list(el.values())[5]
            snp_name = str(chrom) + '-' + str(position)
            if snp_name not in self.snps:
                self.snps[snp_name] = {}
            self.snps[snp_name][model_name] = score
Example 3
def get_df(vcf_file, model_name):
    df = pd.DataFrame(list(KipoiVCFParser(vcf_file)))
    df.index = df["variant_id"]
    obsolete_variant_columns = [
        "variant_chr", "variant_pos", "variant_ref", "variant_alt",
        "variant_id"
    ]
    df = df[[col for col in df.columns if col not in obsolete_variant_columns]]
    df = df[[col for col in df.columns if "rID" not in col]]
    col_types = ["_LOGIT_REF", "_LOGIT_ALT", "_REF", "_ALT", "_DIFF", "_LOGIT"]
    if model_name == "labranchor":
        df = average_labranchor(df, model_name, col_types)
    else:
        df.columns = [
            refmt_col(col, model_name, col_types) for col in df.columns
        ]
    # clump variants together
    df = deduplicate_vars(df)
    return df
Example 4
# get the model
model = kipoi.get_model(model_name)
# get the dataloader factory
Dataloader = model.default_dataloader

vcf_path = "../data/test.vcf"
# The output vcf path, based on the input file name
out_vcf_fpath = vcf_path[:-4] + "%s.vcf" % model_name.replace("/", "_")
# The writer object that will output the annotated VCF
writer = VcfWriter(model, vcf_path, out_vcf_fpath)

# Information extraction from dataloader and model
model_info = kipoi_veff.ModelInfoExtractor(model, Dataloader)
# vcf_to_region generates a variant-centered region when presented with a VCF record.
vcf_to_region = kipoi_veff.SnvCenteredRg(model_info)

dataloader_arguments = {"fasta_file": "../data/fasta_files/chr1.fa"}

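# Run effect prediction for every SNV in vcf_path; the writer streams the
# annotated records to out_vcf_fpath.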
sp.predict_snvs(
    model,
    Dataloader,
    vcf_path,
    batch_size=32,
    dataloader_args=dataloader_arguments,
    vcf_to_region=vcf_to_region,
    #evaluation_function_kwargs={'diff_types': {'diff': Diff("mean"), 'deepsea_effect': DeepSEA_effect("mean")}},
    sync_pred_writer=writer)
vcf_reader = KipoiVCFParser(out_vcf_fpath)
entries = [el for el in vcf_reader]
#print(pd.DataFrame(entries).head().iloc[:,:7])
Example 5
def get_clinvar_ext_Xy(clinvar='20180429',
                       keep_variants="^Pathogenic$|^Benign$"):
    """Load the clinvar data

    Args:
      clinvar: clinvar version (publication date)
      keep_variants: regex of variants to keep
    """
    def variant_id(chr, pos, ref, alt):
        return chr.astype(str) + ":" + pos.astype(
            str) + ":" + ref + ":['" + alt + "']"

    ddir = get_data_dir()
    df = pd.read_csv(
        f"{ddir}/processed/splicing/clinvar/annotated_vcf/{clinvar}.filtered/modeling_df.tsv",
        sep='\t')
    # Keep only Kipoi annotations
    df = df.iloc[:, ~df.columns.str.startswith("other_")]

    # Append clinical significance
    from kipoi_veff.parsers import KipoiVCFParser
    vcf_file = f"{ddir}/processed/splicing/clinvar/{clinvar}.filtered.vcf.gz"
    dfc = pd.DataFrame(list(KipoiVCFParser(vcf_file)))
    dfc['variant_id_old'] = dfc['variant_id']
    dfc['variant_id'] = variant_id(dfc.variant_chr, dfc.variant_pos,
                                   dfc.variant_ref, dfc.variant_alt)
    dfc['ClinicalSignificance'] = dfc['other_CLNSIG']
    df = pd.merge(df,
                  dfc[['variant_id', 'ClinicalSignificance']],
                  on='variant_id',
                  validate="many_to_one").drop_duplicates()

    # add the differences
    df["pathogenic"] = df.ClinicalSignificance == "Pathogenic"

    splicing_models = [
        "MaxEntScan/3prime", "MaxEntScan/5prime", "HAL", "labranchor"
    ]
    for m in splicing_models:
        df[m + "_diff"] = df[m + "_ref"] - df[m + "_ref"]
        df[m + "_isna"] = df[m + "_ref"].isnull().astype(float)

    only_NA_rows = df[[m + "_diff"
                       for m in splicing_models]].isnull().all(axis=1)

    df = df[~only_NA_rows]
    df = df[~df.ClinicalSignificance.isnull()]
    df = df[df.ClinicalSignificance.str.match(keep_variants)]

    # Append conservation scores and dbscSNV from VEP
    df_vep = pd.read_csv(
        f"{ddir}/processed/splicing/clinvar/annotated_vcf/{clinvar}.filtered/VEP.txt.gz",
        sep='\t',
        na_values='-')
    df_vep = df_vep.join(
        df_vep.Location.str.split(":|-", expand=True).rename(columns={
            0: "chr",
            1: "start",
            2: "end"
        }))

    df_vep['start'] = df_vep.start.astype(float)
    df_vep['end'] = df_vep.end.astype(float)
    df_vep['variant_id'] = variant_id(df_vep['chr'], df_vep.start.astype(int),
                                      df_vep.GIVEN_REF, df_vep.Allele)
    cons_features = [
        "CADD_raw", "CADD_phred", "phyloP46way_placental",
        "phyloP46way_primate"
    ]
    splice_features = ['rf_score', 'MaxEntScan_diff']

    # exclude stop_gained variants
    exclude = df_vep[df_vep.Consequence.str.startswith(
        "stop_gained")]['#Uploaded_variation'].unique()
    df_vep["early_stop"] = df_vep['#Uploaded_variation'].isin(exclude)

    df = pd.merge(df,
                  df_vep[["variant_id", "early_stop"] + cons_features +
                         splice_features].drop_duplicates(["variant_id"]),
                  on=["variant_id"],
                  how='left',
                  validate="many_to_one").drop_duplicates()

    # Append spidex
    df_spidex = pd.read_csv(
        f"{ddir}/raw/splicing/spidex/hg19_spidex.clinvar_{clinvar}.txt",
        sep='\t')
    df_spidex = df_spidex.drop_duplicates()
    df_spidex['variant_id'] = variant_id(df_spidex["#Chr"].astype(str),
                                         df_spidex.Start, df_spidex.Ref,
                                         df_spidex.Alt)
    df = pd.merge(df,
                  df_spidex[['variant_id', 'dpsi_max_tissue', 'dpsi_zscore']],
                  on="variant_id",
                  how='left')
    df['dpsi_max_tissue_isna'] = df['dpsi_max_tissue'].isnull()
    df['dpsi_zscore_isna'] = df['dpsi_zscore'].isnull()
    df.loc[df.dpsi_max_tissue.isnull(), "dpsi_max_tissue"] = 0
    df.loc[df.dpsi_zscore.isnull(), "dpsi_zscore"] = 0

    # Append dbscSNV
    dbsc = dd.read_csv(f"{ddir}/raw/splicing/dbscSNV/dbscSNV.chr*",
                       sep='\t',
                       dtype={
                           'chr': 'object'
                       },
                       na_values=".").compute()
    dbsc['variant_id'] = variant_id(dbsc.chr, dbsc.pos, dbsc.ref, dbsc.alt)
    dbsc = dbsc.rename(columns={
        'rf_score': 'dbscSNV_rf_score',
        'ada_score': 'dbscSNV_ada_score'
    })
    df = pd.merge(df, dbsc, on='variant_id', how='left')
    df['dbscSNV_rf_score_isna'] = df.dbscSNV_rf_score.isnull()
    df['dbscSNV_ada_score_isna'] = df.dbscSNV_ada_score.isnull()
    df.loc[df.dbscSNV_rf_score.isnull(), 'dbscSNV_rf_score'] = 0
    df.loc[df.dbscSNV_ada_score.isnull(), 'dbscSNV_ada_score'] = 0

    y_clinvar = np.array(df.ClinicalSignificance == "Pathogenic")
    X_clinvar = df.loc[:, df.columns != 'ClinicalSignificance']
    X_clinvar = X_clinvar.iloc[:, ~X_clinvar.columns.str.contains("diff")]

    return X_clinvar, y_clinvar
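
A minimal usage sketch (uses the defaults documented above and assumes the processed data files exist under get_data_dir()):

X_clinvar, y_clinvar = get_clinvar_ext_Xy(clinvar='20180429',
                                          keep_variants="^Pathogenic$|^Benign$")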
Example 6
    def create_score_array(self):
        """
        returns an array with the following values
        [[min, max, score]]
        """
        if self.gwas_df == None:
            self.process_gwas_df()

        vcf_reader = KipoiVCFParser(self.chrom_file)
        all_elements = [el for el in vcf_reader]
        vcf_df = pd.DataFrame(all_elements)

        subset_gwas = self.gwas_df[['location', 'tstat', 'pval', 'ref', 'alt']].copy()
        subset_gwas['location'] = pd.to_numeric(subset_gwas['location'])

        vcf_df['variant_pos'] = pd.to_numeric(vcf_df['variant_pos'])

        merged_df = pd.merge(
            vcf_df,
            subset_gwas,
            how='inner',
            left_on=['variant_pos', 'variant_ref', 'variant_alt'],
            right_on=['location', 'ref', 'alt'])
        merged_df = merged_df.dropna(subset=['tstat'])
        merged_df = merged_df.drop_duplicates(
            subset=['location', 'ref', 'alt'])
        min_val = min(merged_df['variant_pos'])
        max_val = max(merged_df['variant_pos'])
        #print(merged_df.shape)

        number_of_bases = (max_val - min_val)

        final_values = []

        range_min = 0
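        # scan fixed-size windows (self.window_size bases) across the variant
        # positions and combine the GWAS t-statistics and Kipoi scores per window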
        while range_min < max_val:
            range_max = range_min + self.window_size
            temp_df = merged_df[merged_df['location'].between(
                range_min, range_max)]

            locations = temp_df[['location', 'ref', 'alt']]
            locations = [tuple(x) for x in locations.values]
            temp_df = temp_df.set_index(['location', 'ref', 'alt'])
            rinverse = None
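            # the 'corr' combiner needs R-inverse from self.gm; keep only the
            # variants for which it has entries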
            if self.function == 'corr':
                rinverse, indices = self.gm.get_Rinverse(locations)
                if indices is not None:
                    reversed_indices = [(a, c, b) for (a, b, c) in indices]
                    temp_df = temp_df[temp_df.index.isin(indices)
                                      | temp_df.index.isin(reversed_indices)]

            kipoi_scores = np.array(temp_df[temp_df.columns[5]])
            t_stats = np.array(temp_df['tstat'])

            combiner = FunctionGetter.get_function(self.function,
                                                   rinverse=rinverse)
            score = combiner(t_stats, kipoi_scores)

            final_values.append([range_min, range_max, score])
            range_min += self.window_size

        return final_values