Esempio n. 1
0
    def _prepare_variation(self, var):
        """Collect the database rows describing a single VCF variant.

        Gathers genotype-derived metrics, gemini's custom annotations,
        predicted functional impacts, structural-variant details and the
        packed genotype arrays for ``var``.

        Args:
            var: the variant record being loaded.

        Returns:
            A 3-tuple ``(variant, variant_impacts, extra_fields)``:

            * ``variant`` -- flat list holding the single row destined for
              the VARIANTS table (column order is positional, do not reorder).
            * ``variant_impacts`` -- list of rows, one per predicted impact.
            * ``extra_fields`` -- dict, updated from the severe impact's
              ``extra_fields`` when a severe impact is found, else empty.
        """
        extra_fields = {}
        # Genotype handling is switched on only when neither flag disables it.
        load_genotypes = (not self.args.no_genotypes
                          and not self.args.no_load_genotypes)

        # these metrics require that genotypes are present in the file
        call_rate = None
        hwe_p_value = None
        pi_hat = None
        inbreeding_coeff = None
        hom_ref = het = hom_alt = unknown = None

        # only compute certain metrics if genotypes are available
        if load_genotypes:
            hom_ref = var.num_hom_ref
            hom_alt = var.num_hom_alt
            het = var.num_het
            unknown = var.num_unknown
            call_rate = var.call_rate
            aaf = var.aaf
            hwe_p_value, inbreeding_coeff = \
                popgen.get_hwe_likelihood(hom_ref, het, hom_alt, aaf)
            pi_hat = var.nucl_diversity
        else:
            # fall back to the INFO field; a non-scalar result is reduced
            # to its largest value.
            aaf = infotag.extract_aaf(var)
            if aaf is not None and not isinstance(aaf, (float, int)):
                aaf = max(aaf)

        ############################################################
        # collect annotations from gemini's custom annotation files
        # but only if the size of the variant is < 50kb
        ############################################################
        if var.end - var.POS < 50000:
            pfam_domain = annotations.get_pfamA_domains(var)
            cyto_band = annotations.get_cyto_info(var)
            rs_ids = annotations.get_dbsnp_info(var)
            clinvar_info = annotations.get_clinvar_info(var)
            in_dbsnp = 0 if rs_ids is None else 1
            rmsk_hits = annotations.get_rmsk_info(var)
            in_cpg = annotations.get_cpg_island_info(var)
            in_segdup = annotations.get_segdup_info(var)
            is_conserved = annotations.get_conservation_info(var)
            esp = annotations.get_esp_info(var)
            thousandG = annotations.get_1000G_info(var)
            recomb_rate = annotations.get_recomb_info(var)
            gms = annotations.get_gms(var)
            grc = annotations.get_grc(var)
            in_cse = annotations.get_cse(var)
            encode_tfbs = annotations.get_encode_tfbs(var)
            encode_dnaseI = annotations.get_encode_dnase_clusters(var)
            encode_cons_seg = annotations.get_encode_consensus_segs(var)
            gerp_el = annotations.get_gerp_elements(var)
            vista_enhancers = annotations.get_vista_enhancers(var)
            cosmic_ids = annotations.get_cosmic_info(var)
            fitcons = annotations.get_fitcons(var)
            Exac = annotations.get_exac_info(var)

            # load CADD scores by default
            if self.args.skip_cadd is False:
                cadd_raw, cadd_scaled = annotations.get_cadd_scores(var)
            else:
                cadd_raw, cadd_scaled = None, None

            # load the GERP score for this variant by default.
            gerp_bp = None
            if self.args.skip_gerp_bp is False:
                gerp_bp = annotations.get_gerp_bp(var)
        # the variant is too big to annotate
        else:
            pfam_domain = None
            cyto_band = None
            rs_ids = None
            clinvar_info = annotations.ClinVarInfo()
            in_dbsnp = None
            rmsk_hits = None
            in_cpg = None
            in_segdup = None
            is_conserved = None
            esp = annotations.ESPInfo(None, None, None, None, None)
            thousandG = annotations.ThousandGInfo(None, None, None, None, None, None, None)
            Exac = annotations.ExacInfo(None, None, None, None, None, None, None, None, None, None)
            recomb_rate = None
            gms = annotations.GmsTechs(None, None, None)
            grc = None
            in_cse = None
            encode_tfbs = None
            encode_dnaseI = annotations.ENCODEDnaseIClusters(None, None)
            encode_cons_seg = annotations.ENCODESegInfo(None, None, None, None, None, None)
            gerp_el = None
            vista_enhancers = None
            cosmic_ids = None
            fitcons = None
            cadd_raw = None
            cadd_scaled = None
            gerp_bp = None

        # impacts is a list of impacts for this variant
        impacts = None
        severe_impacts = None
        # impact terms initialized to None for handling unannotated vcf's
        # anno_id in variants is for the trans. with the most severe impact term
        gene = transcript = exon = codon_change = aa_change = aa_length = \
            biotype = consequence = consequence_so = effect_severity = None
        is_coding = is_exonic = is_lof = None
        polyphen_pred = polyphen_score = sift_pred = sift_score = anno_id = None

        if self.args.anno_type is not None:
            impacts = func_impact.interpret_impact(self.args, var, self._effect_fields)
            ranked = [i for i in impacts if i.effect_severity]
            # in case we don't have a severe impact, we still try to use the
            # first impact to annotate the main variants table.
            if not ranked:
                ranked = impacts[:1]
            if ranked:
                im = ranked[0]

                gene = im.gene
                transcript = im.transcript
                exon = im.exon
                codon_change = im.codon_change
                aa_change = im.aa_change
                aa_length = im.aa_length
                biotype = im.biotype
                consequence = im.consequence
                # NOTE(review): consequence_so is not populated from this
                # fallback impact (only from severe_impacts below) -- confirm
                # whether that omission is intentional.
                effect_severity = im.effect_severity
                polyphen_pred = im.polyphen_pred
                polyphen_score = im.polyphen_score
                sift_pred = im.sift_pred
                sift_score = im.sift_score
                anno_id = im.anno_id
                is_exonic = im.is_exonic
                is_coding = im.is_coding
                is_lof = im.is_lof

            # a severe impact, when present, overrides the values above
            severe_impacts = \
                severe_impact.interpret_severe_impact(self.args, var, self._effect_fields)
            if severe_impacts:
                extra_fields.update(severe_impacts.extra_fields)
                gene = severe_impacts.gene
                transcript = severe_impacts.transcript
                exon = severe_impacts.exon
                codon_change = severe_impacts.codon_change
                aa_change = severe_impacts.aa_change
                aa_length = severe_impacts.aa_length
                biotype = severe_impacts.biotype
                consequence = severe_impacts.consequence
                effect_severity = severe_impacts.effect_severity
                polyphen_pred = severe_impacts.polyphen_pred
                polyphen_score = severe_impacts.polyphen_score
                sift_pred = severe_impacts.sift_pred
                sift_score = severe_impacts.sift_score
                anno_id = severe_impacts.anno_id
                is_exonic = severe_impacts.is_exonic
                is_coding = severe_impacts.is_coding
                is_lof = severe_impacts.is_lof
                consequence_so = severe_impacts.so

        # construct the filter string ("." and None both mean "no filter");
        # named vcf_filter so the builtin filter() is not shadowed.
        vcf_filter = None
        if var.FILTER is not None and var.FILTER != ".":
            if isinstance(var.FILTER, list):
                vcf_filter = ";".join(var.FILTER)
            else:
                vcf_filter = var.FILTER

        vcf_id = None
        if var.ID is not None and var.ID != ".":
            vcf_id = var.ID

        # build up numpy arrays for the genotype information.
        # these arrays will be pickled-to-binary, compressed,
        # and loaded as SqlLite BLOB values (see compression.pack_blob)
        gt_phred_ll_homref = gt_phred_ll_het = gt_phred_ll_homalt = None

        if load_genotypes:
            # np.str_ / np.bool_ are used instead of the np.str / np.bool
            # aliases, which were removed from NumPy in 1.24.
            gt_bases = np.array(var.gt_bases, np.str_)  # 'A/G', './.'
            gt_types = np.array(var.gt_types, np.int8)  # -1, 0, 1, 2
            gt_phases = np.array(var.gt_phases, np.bool_)  # T F F
            gt_depths = np.array(var.gt_depths, np.int32)  # 10 37 0
            gt_ref_depths = np.array(var.gt_ref_depths, np.int32)  # 2 21 0 -1
            gt_alt_depths = np.array(var.gt_alt_depths, np.int32)  # 8 16 0 -1
            gt_quals = np.array(var.gt_quals, np.float32)  # 10.78 22 99 -1
            gt_copy_numbers = np.array(var.gt_copy_numbers, np.float32)  # 1.0 2.0 2.1 -1
            gt_phred_likelihoods = get_phred_lik(var.gt_phred_likelihoods)
            if gt_phred_likelihoods is not None:
                # columns 0, 1, 2 hold hom-ref, het and hom-alt respectively
                gt_phred_ll_homref = gt_phred_likelihoods[:, 0]
                gt_phred_ll_het = gt_phred_likelihoods[:, 1]
                gt_phred_ll_homalt = gt_phred_likelihoods[:, 2]

            # tally the genotypes
            self._update_sample_gt_counts(gt_types)
        else:
            gt_bases = gt_types = gt_phases = gt_depths = gt_ref_depths = None
            gt_alt_depths = gt_quals = gt_copy_numbers = None

        info = None if self.args.skip_info_string else var.INFO

        # were functional impacts predicted by SnpEFF or VEP?
        # if so, build up a row for each of the impacts / transcript
        variant_impacts = [
            [self.v_id, idx, impact.gene,
             impact.transcript, impact.is_exonic,
             impact.is_coding, impact.is_lof,
             impact.exon, impact.codon_change,
             impact.aa_change, impact.aa_length,
             impact.biotype, impact.consequence,
             impact.so, impact.effect_severity,
             impact.polyphen_pred, impact.polyphen_score,
             impact.sift_pred, impact.sift_score]
            for idx, impact in enumerate(impacts or [], start=1)
        ]

        # extract structural variants
        sv = svs.StructuralVariant(var)
        ci_left = sv.get_ci_left()
        ci_right = sv.get_ci_right()

        # construct the core variant record.
        # 1 row per variant to VARIANTS table
        chrom = var.CHROM if var.CHROM.startswith("chr") else "chr" + var.CHROM
        variant = [chrom, var.start, var.end,
                   vcf_id, self.v_id, anno_id, var.REF, ','.join([x or "" for x in var.ALT]),
                   var.QUAL, vcf_filter, var.var_type,
                   var.var_subtype, pack_blob(gt_bases), pack_blob(gt_types),
                   pack_blob(gt_phases), pack_blob(gt_depths),
                   pack_blob(gt_ref_depths), pack_blob(gt_alt_depths),
                   pack_blob(gt_quals), pack_blob(gt_copy_numbers),
                   pack_blob(gt_phred_ll_homref),
                   pack_blob(gt_phred_ll_het),
                   pack_blob(gt_phred_ll_homalt),
                   call_rate, in_dbsnp,
                   rs_ids,
                   ci_left[0],
                   ci_left[1],
                   ci_right[0],
                   ci_right[1],
                   sv.get_length(),
                   sv.is_precise(),
                   sv.get_sv_tool(),
                   sv.get_evidence_type(),
                   sv.get_event_id(),
                   sv.get_mate_id(),
                   sv.get_strand(),
                   clinvar_info.clinvar_in_omim,
                   clinvar_info.clinvar_sig,
                   clinvar_info.clinvar_disease_name,
                   clinvar_info.clinvar_dbsource,
                   clinvar_info.clinvar_dbsource_id,
                   clinvar_info.clinvar_origin,
                   clinvar_info.clinvar_dsdb,
                   clinvar_info.clinvar_dsdbid,
                   clinvar_info.clinvar_disease_acc,
                   clinvar_info.clinvar_in_locus_spec_db,
                   clinvar_info.clinvar_on_diag_assay,
                   clinvar_info.clinvar_causal_allele,
                   pfam_domain, cyto_band, rmsk_hits, in_cpg,
                   in_segdup, is_conserved, gerp_bp, gerp_el,
                   hom_ref, het, hom_alt, unknown,
                   aaf, hwe_p_value, inbreeding_coeff, pi_hat,
                   recomb_rate, gene, transcript, is_exonic,
                   is_coding, is_lof, exon, codon_change, aa_change,
                   aa_length, biotype, consequence, consequence_so, effect_severity,
                   polyphen_pred, polyphen_score, sift_pred, sift_score,
                   infotag.get_ancestral_allele(var), infotag.get_rms_bq(var),
                   infotag.get_cigar(var),
                   infotag.get_depth(var), infotag.get_strand_bias(var),
                   infotag.get_rms_map_qual(var), infotag.get_homopol_run(var),
                   infotag.get_map_qual_zero(var),
                   infotag.get_num_of_alleles(var),
                   infotag.get_frac_dels(var),
                   infotag.get_haplotype_score(var),
                   infotag.get_quality_by_depth(var),
                   infotag.get_allele_count(var), infotag.get_allele_bal(var),
                   infotag.in_hm2(var), infotag.in_hm3(var),
                   infotag.is_somatic(var),
                   infotag.get_somatic_score(var),
                   esp.found, esp.aaf_EA,
                   esp.aaf_AA, esp.aaf_ALL,
                   esp.exome_chip, thousandG.found,
                   thousandG.aaf_AMR, thousandG.aaf_EAS, thousandG.aaf_SAS,
                   thousandG.aaf_AFR, thousandG.aaf_EUR,
                   thousandG.aaf_ALL, grc,
                   gms.illumina, gms.solid,
                   gms.iontorrent, in_cse,
                   encode_tfbs,
                   encode_dnaseI.cell_count,
                   encode_dnaseI.cell_list,
                   encode_cons_seg.gm12878,
                   encode_cons_seg.h1hesc,
                   encode_cons_seg.helas3,
                   encode_cons_seg.hepg2,
                   encode_cons_seg.huvec,
                   encode_cons_seg.k562,
                   vista_enhancers,
                   cosmic_ids,
                   pack_blob(info),
                   cadd_raw,
                   cadd_scaled,
                   fitcons,
                   Exac.found,
                   Exac.aaf_ALL,
                   Exac.adj_aaf_ALL,
                   Exac.aaf_AFR, Exac.aaf_AMR,
                   Exac.aaf_EAS, Exac.aaf_FIN,
                   Exac.aaf_NFE, Exac.aaf_OTH,
                   Exac.aaf_SAS]

        return variant, variant_impacts, extra_fields
Esempio n. 2
0
    def _prepare_variation(self, var, anno_keys):
        """Collect the database rows describing a single VCF variant.

        Gathers genotype-derived metrics, gemini's custom annotations,
        predicted functional impacts, structural-variant details and the
        genotype arrays for ``var``.

        Args:
            var: the variant record being loaded.
            anno_keys: mapping from an INFO annotation tag (``"EFF"``,
                ``"ANN"`` or ``"CSQ"``) to the value handed to the matching
                ``geneimpacts`` parser (presumably the per-tag field names --
                verify against the caller). An empty dict means no impacts
                are parsed.

        Returns:
            A 3-tuple ``(variant, variant_impacts, extra_fields)``:

            * ``variant`` -- dict holding the single row destined for the
              VARIANTS table, including ``max_aaf_all`` and the entries of
              ``self._extra_empty``.
            * ``variant_impacts`` -- list of dicts, one per parsed impact.
            * ``extra_fields`` -- dict of extra effect fields taken from the
              top impact (empty when there is none).
        """
        extra_fields = {}
        # Genotype handling is switched on only when neither flag disables it.
        load_genotypes = not (self.args.no_genotypes
                              or self.args.no_load_genotypes)

        # these metrics require that genotypes are present in the file
        call_rate = None
        hwe_p_value = None
        pi_hat = None
        inbreeding_coeff = None
        hom_ref = het = hom_alt = unknown = None

        # only compute certain metrics if genotypes are available
        if load_genotypes:
            hom_ref = var.num_hom_ref
            hom_alt = var.num_hom_alt
            het = var.num_het
            unknown = var.num_unknown
            call_rate = var.call_rate
            aaf = var.aaf
            hwe_p_value, inbreeding_coeff = \
                popgen.get_hwe_likelihood(hom_ref, het, hom_alt, aaf)
            pi_hat = var.nucl_diversity
        else:
            # fall back to the INFO field; a non-scalar result is reduced
            # to its largest value.
            aaf = infotag.extract_aaf(var)
            if aaf is not None and not isinstance(aaf, (float, int)):
                aaf = max(aaf)

        ############################################################
        # collect annotations from gemini's custom annotation files
        # but only if the size of the variant is < 50kb
        ############################################################
        if var.end - var.POS < 50000:
            pfam_domain = annotations.get_pfamA_domains(var)
            cyto_band = annotations.get_cyto_info(var)
            rs_ids = annotations.get_dbsnp_info(var)
            clinvar_info = annotations.get_clinvar_info(var)
            in_dbsnp = 0 if rs_ids is None else 1
            rmsk_hits = annotations.get_rmsk_info(var)
            in_cpg = annotations.get_cpg_island_info(var)
            in_segdup = annotations.get_segdup_info(var)
            is_conserved = annotations.get_conservation_info(var)
            esp = annotations.get_esp_info(var)
            thousandG = annotations.get_1000G_info(var)
            recomb_rate = annotations.get_recomb_info(var)
            gms = annotations.get_gms(var)
            grc = annotations.get_grc(var)
            in_cse = annotations.get_cse(var)
            encode_tfbs = annotations.get_encode_tfbs(var)
            encode_dnaseI = annotations.get_encode_dnase_clusters(var)
            encode_cons_seg = annotations.get_encode_consensus_segs(var)
            gerp_el = annotations.get_gerp_elements(var)
            vista_enhancers = annotations.get_vista_enhancers(var)
            cosmic_ids = annotations.get_cosmic_info(var)
            fitcons = annotations.get_fitcons(var)
            Exac = annotations.get_exac_info(var)

            # load CADD scores by default
            if self.args.skip_cadd is False:
                cadd_raw, cadd_scaled = annotations.get_cadd_scores(var)
            else:
                cadd_raw, cadd_scaled = None, None

            # load the GERP score for this variant by default.
            gerp_bp = None
            if self.args.skip_gerp_bp is False:
                gerp_bp = annotations.get_gerp_bp(var)
        # the variant is too big to annotate
        else:
            pfam_domain = None
            cyto_band = None
            rs_ids = None
            clinvar_info = annotations.ClinVarInfo()
            in_dbsnp = None
            rmsk_hits = None
            in_cpg = None
            in_segdup = None
            is_conserved = None
            # numeric placeholders keep the max() over allele frequencies
            # at the end of this method well-defined.
            esp = annotations.ESPInfo(False, -1, -1, -1, 0)
            thousandG = annotations.EMPTY_1000G
            Exac = annotations.EXAC_EMPTY
            recomb_rate = None
            gms = annotations.GmsTechs(None, None, None)
            grc = None
            in_cse = None
            encode_tfbs = None
            encode_dnaseI = annotations.ENCODEDnaseIClusters(None, None)
            encode_cons_seg = annotations.ENCODESegInfo(None, None, None, None, None, None)
            gerp_el = None
            vista_enhancers = None
            cosmic_ids = None
            fitcons = None
            cadd_raw = None
            cadd_scaled = None
            gerp_bp = None

        # `empty` stands in for "no impact" so the attribute reads below
        # work whether or not an impact was parsed.
        top_impact = empty
        impacts = []
        if anno_keys != {}:
            if self.args.anno_type in ("all", "snpEff"):
                # a missing INFO tag simply means no snpEff annotations
                try:
                    if "EFF" in anno_keys:
                        impacts += [geneimpacts.OldSnpEff(e, anno_keys["EFF"]) for e in var.INFO["EFF"].split(",")]
                    elif "ANN" in anno_keys:
                        impacts += [geneimpacts.SnpEff(e, anno_keys["ANN"]) for e in var.INFO["ANN"].split(",")]
                except KeyError:
                    pass

            if self.args.anno_type in ("all", "VEP"):
                # a missing CSQ key/tag simply means no VEP annotations
                try:
                    impacts += [geneimpacts.VEP(e, anno_keys["CSQ"]) for e in var.INFO["CSQ"].split(",")]
                except KeyError:
                    pass

            # number the impacts 1..n; the same numbering is used for the
            # per-impact rows built further down.
            for i, im in enumerate(impacts, start=1):
                im.anno_id = i
            if impacts:
                top_impact = geneimpacts.Effect.top_severity(impacts)
                # top_severity may hand back several impacts; keep the first
                if isinstance(top_impact, list):
                    top_impact = top_impact[0]

        # construct the filter string ("." and None both mean "no filter");
        # named vcf_filter so the builtin filter() is not shadowed.
        vcf_filter = None
        if var.FILTER is not None and var.FILTER != ".":
            if isinstance(var.FILTER, list):
                vcf_filter = ";".join(var.FILTER)
            else:
                vcf_filter = var.FILTER

        vcf_id = None
        if var.ID is not None and var.ID != ".":
            vcf_id = var.ID
        chrom = var.CHROM if var.CHROM.startswith("chr") else "chr" + var.CHROM

        clinvar_gene_phenotype = None
        if top_impact.gene is not None:
            # the lookup is keyed on the chromosome without its "chr" prefix
            clinvar_gene_phenotype = self.clinvar_chrom_gene_lookup.get((chrom[3:], top_impact.gene))

        # collect the genotype information.
        # these values will be pickled-to-binary, compressed,
        # and loaded as BLOB values (see compression.pack_blob)
        gt_phred_ll_homref = gt_phred_ll_het = gt_phred_ll_homalt = None

        if load_genotypes:
            gt_bases = var.gt_bases
            gt_types = var.gt_types
            gt_phases = var.gt_phases
            gt_depths = var.gt_depths
            gt_ref_depths = var.gt_ref_depths
            gt_alt_depths = var.gt_alt_depths
            gt_quals = var.gt_quals
            # copy numbers are not loaded in this code path
            gt_copy_numbers = None
            gt_phred_ll_homref = var.gt_phred_ll_homref
            gt_phred_ll_het = var.gt_phred_ll_het
            gt_phred_ll_homalt = var.gt_phred_ll_homalt
            # tally the genotypes
            self._update_sample_gt_counts(gt_types)
        else:
            gt_bases = gt_types = gt_phases = gt_depths = gt_ref_depths = None
            gt_alt_depths = gt_quals = gt_copy_numbers = None

        info = None if self.args.skip_info_string else dict(var.INFO)

        # sanity check: the allele-frequency max() below needs numbers
        assert isinstance(thousandG.aaf_AMR, (int, float))
        # were functional impacts predicted by SnpEFF or VEP?
        # if so, build up a row for each of the impacts / transcript
        variant_impacts = []
        for idx, impact in enumerate(impacts, start=1):
            var_impact = dict(variant_id=self.v_id, anno_id=idx, gene=impact.gene,
                              transcript=impact.transcript, is_exonic=impact.is_exonic,
                              is_coding=impact.is_coding, is_lof=impact.is_lof,
                              is_splicing=impact.is_splicing,
                              exon=impact.exon, codon_change=impact.codon_change,
                              aa_change=impact.aa_change, aa_length=impact.aa_length,
                              biotype=impact.biotype, impact=impact.top_consequence,
                              impact_so=impact.so, impact_severity=impact.effect_severity,
                              # key was misspelled "polyphed_pred"; it now
                              # matches the name used in the variant row.
                              polyphen_pred=impact.polyphen_pred, polyphen_score=impact.polyphen_score,
                              sift_pred=impact.sift_pred,
                              sift_score=impact.sift_score)
            variant_impacts.append(var_impact)

        # extract structural variants
        sv = svs.StructuralVariant(var)
        ci_left = sv.get_ci_left()
        ci_right = sv.get_ci_right()

        if top_impact is not empty:
            for dbkey, infokey in self._extra_effect_fields:
                extra_fields[dbkey] = top_impact.effects[infokey]
                # keys ending in "_num" are stored as floats
                if dbkey.endswith("_num"):
                    try:
                        extra_fields[dbkey] = float(extra_fields[dbkey])
                    except ValueError:
                        # sometimes the field is empty.
                        extra_fields[dbkey] = None
        # construct the core variant record.
        # 1 row per variant to VARIANTS table
        variant = dict(chrom=chrom, start=var.start, end=var.end,
                   vcf_id=vcf_id, variant_id=self.v_id, anno_id=top_impact.anno_id,
                   ref=var.REF, alt=','.join([x or "" for x in var.ALT]),
                   qual=var.QUAL, filter=vcf_filter, type=var.var_type,
                   sub_type=var.var_subtype, gts=pack_blob(gt_bases),
                   gt_types=pack_blob(gt_types),
                   gt_phases=pack_blob(gt_phases), gt_depths=pack_blob(gt_depths),
                   gt_ref_depths=pack_blob(gt_ref_depths), gt_alt_depths=pack_blob(gt_alt_depths),
                   gt_quals=pack_blob(gt_quals), gt_copy_numbers=pack_blob(gt_copy_numbers),
                   gt_phred_ll_homref=pack_blob(gt_phred_ll_homref),
                   gt_phred_ll_het=pack_blob(gt_phred_ll_het),
                   gt_phred_ll_homalt=pack_blob(gt_phred_ll_homalt),
                   call_rate=call_rate, in_dbsnp=bool(in_dbsnp),
                   rs_ids=rs_ids,

                   sv_cipos_start_left=ci_left[0],
                   sv_cipos_end_left=ci_left[1],
                   sv_cipos_start_right=ci_right[0],
                   sv_cipos_end_right=ci_right[1],
                   sv_length=sv.get_length(),
                   sv_is_precise=sv.is_precise(),
                   sv_tool=sv.get_sv_tool(),
                   sv_evidence_type=sv.get_evidence_type(),
                   sv_event_id=sv.get_event_id(),
                   sv_mate_id=sv.get_mate_id(),
                   sv_strand=sv.get_strand(),

                   in_omim=bool(clinvar_info.clinvar_in_omim),
                   clinvar_sig=clinvar_info.clinvar_sig,
                   clinvar_disease_name=clinvar_info.clinvar_disease_name,
                   clinvar_dbsource=clinvar_info.clinvar_dbsource,
                   clinvar_dbsource_id=clinvar_info.clinvar_dbsource_id,
                   clinvar_origin=clinvar_info.clinvar_origin,
                   clinvar_dsdb=clinvar_info.clinvar_dsdb,
                   clinvar_dsdbid=clinvar_info.clinvar_dsdbid,
                   clinvar_disease_acc=clinvar_info.clinvar_disease_acc,
                   clinvar_in_locus_spec_db=bool(clinvar_info.clinvar_in_locus_spec_db),
                   clinvar_on_diag_assay=bool(clinvar_info.clinvar_on_diag_assay),
                   clinvar_causal_allele=clinvar_info.clinvar_causal_allele,
                   clinvar_gene_phenotype=clinvar_gene_phenotype,
                   geno2mp_hpo_ct=annotations.get_geno2mp_ct(var),
                   pfam_domain=pfam_domain, cyto_band=cyto_band, rmsk=rmsk_hits,
                   in_cpg_island=bool(in_cpg),
                   in_segdup=bool(in_segdup), is_conserved=bool(is_conserved),
                   gerp_bp_score=gerp_bp, gerp_element_pval=gerp_el,
                   num_hom_ref=hom_ref, num_het=het, num_hom_alt=hom_alt,
                   num_unknown=unknown,
                   aaf=aaf, hwe=hwe_p_value, inbreeding_coeff=inbreeding_coeff,
                   pi=pi_hat,
                   recomb_rate=recomb_rate, gene=top_impact.gene,
                   transcript=top_impact.transcript,
                   is_exonic=top_impact.is_exonic,
                   is_coding=top_impact.is_coding,
                   is_splicing=top_impact.is_splicing,
                   is_lof=top_impact.is_lof, exon=top_impact.exon,
                   codon_change=top_impact.codon_change, aa_change=top_impact.aa_change,
                   aa_length=top_impact.aa_length, biotype=top_impact.biotype,

                   impact=top_impact.top_consequence, impact_so=top_impact.so,
                   impact_severity=top_impact.effect_severity,
                   polyphen_pred=top_impact.polyphen_pred,
                   polyphen_score=top_impact.polyphen_score,
                   sift_pred=top_impact.sift_pred, sift_score=top_impact.sift_score,

                   anc_allele=infotag.get_ancestral_allele(var), rms_bq=infotag.get_rms_bq(var),
                   cigar=infotag.get_cigar(var),
                   depth=infotag.get_depth(var), strand_bias=infotag.get_strand_bias(var),
                   rms_map_qual=infotag.get_rms_map_qual(var), in_hom_run=infotag.get_homopol_run(var),
                   num_mapq_zero=infotag.get_map_qual_zero(var),

                   num_alleles=infotag.get_num_of_alleles(var),
                   num_reads_w_dels=infotag.get_frac_dels(var),
                   haplotype_score=infotag.get_haplotype_score(var),
                   qual_depth=infotag.get_quality_by_depth(var),
                   allele_count=infotag.get_allele_count(var), allele_bal=infotag.get_allele_bal(var),
                   # bools?
                   in_hm2=infotag.in_hm2(var), in_hm3=infotag.in_hm3(var),
                   is_somatic=infotag.is_somatic(var),
                   somatic_score=infotag.get_somatic_score(var),

                   in_esp=esp.found, aaf_esp_ea=esp.aaf_EA,
                   aaf_esp_aa=esp.aaf_AA, aaf_esp_all=esp.aaf_ALL,
                   exome_chip=bool(esp.exome_chip),

                   in_1kg=thousandG.found,
                   aaf_1kg_amr=thousandG.aaf_AMR,
                   aaf_1kg_eas=thousandG.aaf_EAS,
                   aaf_1kg_sas=thousandG.aaf_SAS,
                   aaf_1kg_afr=thousandG.aaf_AFR,
                   aaf_1kg_eur=thousandG.aaf_EUR,
                   aaf_1kg_all=thousandG.aaf_ALL,

                   grc=grc,
                   gms_illumina=gms.illumina,
                   gms_solid=gms.solid,
                   gms_iontorrent=gms.iontorrent, in_cse=in_cse,
                   encode_tfbs=encode_tfbs,
                   encode_dnaseI_cell_count=encode_dnaseI.cell_count,
                   encode_dnaseI_cell_list=encode_dnaseI.cell_list,
                   encode_consensus_gm12878=encode_cons_seg.gm12878,
                   encode_consensus_h1hesc=encode_cons_seg.h1hesc,
                   encode_consensus_helas3=encode_cons_seg.helas3,
                   encode_consensus_hepg2=encode_cons_seg.hepg2,
                   encode_consensus_huvec=encode_cons_seg.huvec,
                   encode_consensus_k562=encode_cons_seg.k562,

                   vista_enhancers=vista_enhancers,
                   cosmic_ids=cosmic_ids,
                   info=pack_blob(info),
                   cadd_raw=cadd_raw,
                   cadd_scaled=cadd_scaled,
                   fitcons=fitcons,

                   in_exac=Exac.found,
                   aaf_exac_all=Exac.aaf_ALL,
                   aaf_adj_exac_all=Exac.adj_aaf_ALL,
                   aaf_adj_exac_afr=Exac.aaf_AFR,
                   aaf_adj_exac_amr=Exac.aaf_AMR,
                   aaf_adj_exac_eas=Exac.aaf_EAS,
                   aaf_adj_exac_fin=Exac.aaf_FIN,
                   aaf_adj_exac_nfe=Exac.aaf_NFE,
                   aaf_adj_exac_oth=Exac.aaf_OTH,
                   aaf_adj_exac_sas=Exac.aaf_SAS,
                   exac_num_het=Exac.num_het,
                   exac_num_hom_alt=Exac.num_hom_alt,
                   exac_num_chroms=Exac.num_chroms)

        # largest population allele frequency across ESP, 1000G and ExAC,
        # floored at -1; the ExAC FIN and OTH populations are not included.
        variant['max_aaf_all'] = max(-1,
                                     variant['aaf_esp_ea'],
                                     variant['aaf_esp_aa'],
                                     variant['aaf_1kg_amr'],
                                     variant['aaf_1kg_eas'],
                                     variant['aaf_1kg_sas'],
                                     variant['aaf_1kg_afr'],
                                     variant['aaf_1kg_eur'],
                                     variant['aaf_adj_exac_afr'],
                                     variant['aaf_adj_exac_amr'],
                                     variant['aaf_adj_exac_eas'],
                                     variant['aaf_adj_exac_nfe'],
                                     variant['aaf_adj_exac_sas'])

        variant.update(self._extra_empty)
        return variant, variant_impacts, extra_fields
Esempio n. 3
0
    def _prepare_variation(self, var, anno_keys):
        """Collect the metrics for a single variant (var) in a VCF file.

        Extracts variant information, variant impacts and extra fields for
        annotation.

        Args:
            var: the variant record. Genotype summaries, coordinates,
                ``INFO``, ``FILTER`` and ``ID`` are read from it.
            anno_keys: mapping from an INFO annotation tag ("EFF", "ANN" or
                "CSQ") to the value handed to the matching ``geneimpacts``
                parser as its second argument. An empty (or missing) mapping
                means no functional impacts are parsed.

        Returns:
            A ``(variant, variant_impacts, extra_fields)`` tuple:
            ``variant`` is the flat list of column values for one row of the
            VARIANTS table, ``variant_impacts`` is a list with one row per
            parsed impact, and ``extra_fields`` is a dict filled from the top
            impact's ``effects`` for every pair in
            ``self._extra_effect_fields``.
        """
        extra_fields = {}
        # these metrics require that genotypes are present in the file
        call_rate = None
        hwe_p_value = None
        pi_hat = None
        inbreeding_coeff = None
        hom_ref = het = hom_alt = unknown = None

        # only compute certain metrics if genotypes are available
        if not self.args.no_genotypes and not self.args.no_load_genotypes:
            hom_ref = var.num_hom_ref
            hom_alt = var.num_hom_alt
            het = var.num_het
            unknown = var.num_unknown
            call_rate = var.call_rate
            aaf = var.aaf
            hwe_p_value, inbreeding_coeff = \
                popgen.get_hwe_likelihood(hom_ref, het, hom_alt, aaf)
            pi_hat = var.nucl_diversity
        else:
            aaf = infotag.extract_aaf(var)
            # a non-scalar result is collapsed to its largest value
            if not isinstance(aaf, (float, int)):
                if aaf is not None:
                    aaf = max(aaf)

        ############################################################
        # collect annotations from gemini's custom annotation files
        # but only if the size of the variant is < 50kb
        ############################################################
        if var.end - var.POS < 50000:
            pfam_domain = annotations.get_pfamA_domains(var)
            cyto_band = annotations.get_cyto_info(var)
            rs_ids = annotations.get_dbsnp_info(var)
            clinvar_info = annotations.get_clinvar_info(var)
            in_dbsnp = 0 if rs_ids is None else 1
            rmsk_hits = annotations.get_rmsk_info(var)
            in_cpg = annotations.get_cpg_island_info(var)
            in_segdup = annotations.get_segdup_info(var)
            is_conserved = annotations.get_conservation_info(var)
            esp = annotations.get_esp_info(var)
            thousandG = annotations.get_1000G_info(var)
            recomb_rate = annotations.get_recomb_info(var)
            gms = annotations.get_gms(var)
            grc = annotations.get_grc(var)
            in_cse = annotations.get_cse(var)
            encode_tfbs = annotations.get_encode_tfbs(var)
            encode_dnaseI = annotations.get_encode_dnase_clusters(var)
            encode_cons_seg = annotations.get_encode_consensus_segs(var)
            gerp_el = annotations.get_gerp_elements(var)
            vista_enhancers = annotations.get_vista_enhancers(var)
            cosmic_ids = annotations.get_cosmic_info(var)
            fitcons = annotations.get_fitcons(var)
            Exac = annotations.get_exac_info(var)

            # load CADD scores by default
            if self.args.skip_cadd is False:
                (cadd_raw, cadd_scaled) = annotations.get_cadd_scores(var)
            else:
                (cadd_raw, cadd_scaled) = (None, None)

            # load the GERP score for this variant by default.
            gerp_bp = None
            if self.args.skip_gerp_bp is False:
                gerp_bp = annotations.get_gerp_bp(var)
        # the variant is too big to annotate
        else:
            pfam_domain = None
            cyto_band = None
            rs_ids = None
            clinvar_info = annotations.ClinVarInfo()
            in_dbsnp = None
            rmsk_hits = None
            in_cpg = None
            in_segdup = None
            is_conserved = None
            esp = annotations.ESPInfo(None, None, None, None, None)
            thousandG = annotations.ThousandGInfo(None, None, None, None, None, None, None)
            Exac = annotations.ExacInfo(None, None, None, None, None, None,
                    None, None, None, None, None, None, None)
            recomb_rate = None
            gms = annotations.GmsTechs(None, None, None)
            grc = None
            in_cse = None
            encode_tfbs = None
            encode_dnaseI = annotations.ENCODEDnaseIClusters(None, None)
            encode_cons_seg = annotations.ENCODESegInfo(None, None, None, None, None, None)
            gerp_el = None
            vista_enhancers = None
            cosmic_ids = None
            fitcons = None
            cadd_raw = None
            cadd_scaled = None
            gerp_bp = None

        # `empty` is the placeholder used when no impact was parsed.
        # NOTE(review): presumably it exposes the same attributes as a real
        # impact with null values -- confirm against its definition.
        top_impact = empty
        impacts = []
        if anno_keys:
            if self.args.anno_type in ("all", "snpEff"):
                try:
                    if "EFF" in anno_keys:
                        impacts += [geneimpacts.OldSnpEff(e, anno_keys["EFF"]) for e in var.INFO["EFF"].split(",")]
                    elif "ANN" in anno_keys:
                        impacts += [geneimpacts.SnpEff(e, anno_keys["ANN"]) for e in var.INFO["ANN"].split(",")]
                except KeyError:
                    # this variant carries no snpEff annotation
                    pass

            # This must be an independent `if`: chained as `elif`, the "all"
            # setting was always consumed by the snpEff branch above and the
            # VEP annotations were silently never loaded.
            if self.args.anno_type in ("all", "VEP"):
                try:
                    impacts += [geneimpacts.VEP(e, anno_keys["CSQ"]) for e in var.INFO["CSQ"].split(",")]
                except KeyError:
                    # no CSQ header key or no CSQ annotation on this variant
                    pass

            # number the impacts (1-based) before picking the most severe one
            for i, im in enumerate(impacts, start=1):
                im.anno_id = i
            if impacts:
                top_impact = geneimpacts.Effect.top_severity(impacts)
                # several impacts may be returned as a list; keep the first
                if isinstance(top_impact, list):
                    top_impact = top_impact[0]

        # construct the filter string
        filter_str = None
        if var.FILTER is not None and var.FILTER != ".":
            if isinstance(var.FILTER, list):
                filter_str = ";".join(var.FILTER)
            else:
                filter_str = var.FILTER

        vcf_id = None
        if var.ID is not None and var.ID != ".":
            vcf_id = var.ID

        # gather the per-sample genotype information.
        # these values will be pickled-to-binary, compressed,
        # and loaded as SQLite BLOB values (see compression.pack_blob)
        gt_phred_ll_homref = gt_phred_ll_het = gt_phred_ll_homalt = None

        if not self.args.no_genotypes and not self.args.no_load_genotypes:
            gt_bases = var.gt_bases
            gt_types = var.gt_types
            gt_phases = var.gt_phases
            gt_depths = var.gt_depths
            gt_ref_depths = var.gt_ref_depths
            gt_alt_depths = var.gt_alt_depths
            gt_quals = var.gt_quals
            # copy numbers are not loaded in this version
            gt_copy_numbers = None
            gt_phred_ll_homref = var.gt_phred_ll_homref
            gt_phred_ll_het = var.gt_phred_ll_het
            gt_phred_ll_homalt = var.gt_phred_ll_homalt
            # tally the genotypes
            self._update_sample_gt_counts(gt_types)
        else:
            gt_bases = gt_types = gt_phases = gt_depths = gt_ref_depths = None
            gt_alt_depths = gt_quals = gt_copy_numbers = None

        if self.args.skip_info_string:
            info = None
        else:
            info = dict(var.INFO)

        # were functional impacts predicted by SnpEFF or VEP?
        # if so, build up a row for each of the impacts / transcript
        variant_impacts = []
        for idx, impact in enumerate(impacts, start=1):
            var_impact = [self.v_id, idx, impact.gene,
                          impact.transcript, impact.is_exonic,
                          impact.is_coding,
                          impact.is_splicing,
                          impact.is_lof,
                          impact.exon, impact.codon_change,
                          impact.aa_change, impact.aa_length,
                          impact.biotype, impact.top_consequence,
                          impact.so, impact.effect_severity,
                          impact.polyphen_pred, impact.polyphen_score,
                          impact.sift_pred, impact.sift_score]
            variant_impacts.append(var_impact)

        # extract structural variants
        sv = svs.StructuralVariant(var)
        ci_left = sv.get_ci_left()
        ci_right = sv.get_ci_right()

        # extra effect fields are only available from a real (parsed) impact
        if top_impact is not empty:
            for dbkey, infokey in self._extra_effect_fields:
                extra_fields[dbkey] = top_impact.effects[infokey]

        # construct the core variant record.
        # 1 row per variant to VARIANTS table
        # (the order of the values below is positional -- do not reorder)
        chrom = var.CHROM if var.CHROM.startswith("chr") else "chr" + var.CHROM
        variant = [chrom, var.start, var.end,
                   vcf_id, self.v_id, top_impact.anno_id, var.REF, ','.join([x or "" for x in var.ALT]),
                   var.QUAL, filter_str, var.var_type,
                   var.var_subtype, pack_blob(gt_bases), pack_blob(gt_types),
                   pack_blob(gt_phases), pack_blob(gt_depths),
                   pack_blob(gt_ref_depths), pack_blob(gt_alt_depths),
                   pack_blob(gt_quals), pack_blob(gt_copy_numbers),
                   pack_blob(gt_phred_ll_homref),
                   pack_blob(gt_phred_ll_het),
                   pack_blob(gt_phred_ll_homalt),
                   call_rate, in_dbsnp,
                   rs_ids,
                   ci_left[0],
                   ci_left[1],
                   ci_right[0],
                   ci_right[1],
                   sv.get_length(),
                   sv.is_precise(),
                   sv.get_sv_tool(),
                   sv.get_evidence_type(),
                   sv.get_event_id(),
                   sv.get_mate_id(),
                   sv.get_strand(),
                   clinvar_info.clinvar_in_omim,
                   clinvar_info.clinvar_sig,
                   clinvar_info.clinvar_disease_name,
                   clinvar_info.clinvar_dbsource,
                   clinvar_info.clinvar_dbsource_id,
                   clinvar_info.clinvar_origin,
                   clinvar_info.clinvar_dsdb,
                   clinvar_info.clinvar_dsdbid,
                   clinvar_info.clinvar_disease_acc,
                   clinvar_info.clinvar_in_locus_spec_db,
                   clinvar_info.clinvar_on_diag_assay,
                   clinvar_info.clinvar_causal_allele,
                   pfam_domain, cyto_band, rmsk_hits, in_cpg,
                   in_segdup, is_conserved, gerp_bp, gerp_el,
                   hom_ref, het, hom_alt, unknown,
                   aaf, hwe_p_value, inbreeding_coeff, pi_hat,
                   recomb_rate,
                   top_impact.gene,
                   top_impact.transcript,
                   top_impact.is_exonic,
                   top_impact.is_coding,
                   top_impact.is_splicing,
                   top_impact.is_lof,
                   top_impact.exon,
                   top_impact.codon_change,
                   top_impact.aa_change,
                   top_impact.aa_length,
                   top_impact.biotype,
                   top_impact.top_consequence,
                   top_impact.so,
                   top_impact.effect_severity,
                   top_impact.polyphen_pred,
                   top_impact.polyphen_score,
                   top_impact.sift_pred,
                   top_impact.sift_score,
                   infotag.get_ancestral_allele(var), infotag.get_rms_bq(var),
                   infotag.get_cigar(var),
                   infotag.get_depth(var), infotag.get_strand_bias(var),
                   infotag.get_rms_map_qual(var), infotag.get_homopol_run(var),
                   infotag.get_map_qual_zero(var),
                   infotag.get_num_of_alleles(var),
                   infotag.get_frac_dels(var),
                   infotag.get_haplotype_score(var),
                   infotag.get_quality_by_depth(var),
                   infotag.get_allele_count(var), infotag.get_allele_bal(var),
                   infotag.in_hm2(var), infotag.in_hm3(var),
                   infotag.is_somatic(var),
                   infotag.get_somatic_score(var),
                   esp.found, esp.aaf_EA,
                   esp.aaf_AA, esp.aaf_ALL,
                   esp.exome_chip, thousandG.found,
                   thousandG.aaf_AMR, thousandG.aaf_EAS, thousandG.aaf_SAS,
                   thousandG.aaf_AFR, thousandG.aaf_EUR,
                   thousandG.aaf_ALL, grc,
                   gms.illumina, gms.solid,
                   gms.iontorrent, in_cse,
                   encode_tfbs,
                   encode_dnaseI.cell_count,
                   encode_dnaseI.cell_list,
                   encode_cons_seg.gm12878,
                   encode_cons_seg.h1hesc,
                   encode_cons_seg.helas3,
                   encode_cons_seg.hepg2,
                   encode_cons_seg.huvec,
                   encode_cons_seg.k562,
                   vista_enhancers,
                   cosmic_ids,
                   pack_blob(info),
                   cadd_raw,
                   cadd_scaled,
                   fitcons,
                   Exac.found,
                   Exac.aaf_ALL,
                   Exac.adj_aaf_ALL,
                   Exac.aaf_AFR, Exac.aaf_AMR,
                   Exac.aaf_EAS, Exac.aaf_FIN,
                   Exac.aaf_NFE, Exac.aaf_OTH,
                   Exac.aaf_SAS,
                   Exac.num_het,
                   Exac.num_hom_alt,
                   Exac.num_chroms]

        return variant, variant_impacts, extra_fields
Esempio n. 4
0
    def _prepare_variation(self, var):
        """Collect the metrics for a single variant (var) in a VCF file.

        Extracts variant information, variant impacts and extra fields for
        annotation.

        Args:
            var: the variant record. Genotype summaries, coordinates,
                ``INFO``, ``FILTER`` and ``ID`` are read from it.

        Returns:
            A ``(variant, variant_impacts, extra_fields)`` tuple:
            ``variant`` is the flat list of column values for one row of the
            VARIANTS table, ``variant_impacts`` is a list with one row per
            interpreted impact, and ``extra_fields`` is a dict taken from the
            most severe impact's ``extra_fields`` (plus ``chrom``, ``start``
            and ``end`` when it is non-empty).
        """
        extra_fields = {}
        # these metrics require that genotypes are present in the file
        call_rate = None
        hwe_p_value = None
        pi_hat = None
        inbreeding_coeff = None
        hom_ref = het = hom_alt = unknown = None

        # only compute certain metrics if genotypes are available
        if not self.args.no_genotypes and not self.args.no_load_genotypes:
            hom_ref = var.num_hom_ref
            hom_alt = var.num_hom_alt
            het = var.num_het
            unknown = var.num_unknown
            call_rate = var.call_rate
            aaf = var.aaf
            hwe_p_value, inbreeding_coeff = \
                popgen.get_hwe_likelihood(hom_ref, het, hom_alt, aaf)
            pi_hat = var.nucl_diversity
        else:
            aaf = infotag.extract_aaf(var)

        ############################################################
        # collect annotations from gemini's custom annotation files
        # but only if the size of the variant is < 50kb
        ############################################################
        if var.end - var.POS < 50000:
            pfam_domain = annotations.get_pfamA_domains(var)
            cyto_band = annotations.get_cyto_info(var)
            rs_ids = annotations.get_dbsnp_info(var)
            clinvar_info = annotations.get_clinvar_info(var)
            in_dbsnp = 0 if rs_ids is None else 1
            rmsk_hits = annotations.get_rmsk_info(var)
            in_cpg = annotations.get_cpg_island_info(var)
            in_segdup = annotations.get_segdup_info(var)
            is_conserved = annotations.get_conservation_info(var)
            esp = annotations.get_esp_info(var)
            thousandG = annotations.get_1000G_info(var)
            recomb_rate = annotations.get_recomb_info(var)
            gms = annotations.get_gms(var)
            grc = annotations.get_grc(var)
            in_cse = annotations.get_cse(var)
            encode_tfbs = annotations.get_encode_tfbs(var)
            encode_dnaseI = annotations.get_encode_dnase_clusters(var)
            encode_cons_seg = annotations.get_encode_consensus_segs(var)
            gerp_el = annotations.get_gerp_elements(var)
            vista_enhancers = annotations.get_vista_enhancers(var)
            cosmic_ids = annotations.get_cosmic_info(var)
            fitcons = annotations.get_fitcons(var)
            Exac = annotations.get_exac_info(var)

            # load CADD scores by default
            if self.args.skip_cadd is False:
                (cadd_raw, cadd_scaled) = annotations.get_cadd_scores(var)
            else:
                (cadd_raw, cadd_scaled) = (None, None)

            # load the GERP score for this variant by default.
            gerp_bp = None
            if self.args.skip_gerp_bp is False:
                gerp_bp = annotations.get_gerp_bp(var)
        # the variant is too big to annotate
        else:
            pfam_domain = None
            cyto_band = None
            rs_ids = None
            clinvar_info = annotations.ClinVarInfo()
            in_dbsnp = None
            rmsk_hits = None
            in_cpg = None
            in_segdup = None
            is_conserved = None
            esp = annotations.ESPInfo(None, None, None, None, None)
            thousandG = annotations.ThousandGInfo(None, None, None, None, None,
                                                  None, None)
            Exac = annotations.ExacInfo(None, None, None, None, None, None,
                                        None, None, None, None)
            recomb_rate = None
            gms = annotations.GmsTechs(None, None, None)
            grc = None
            in_cse = None
            encode_tfbs = None
            encode_dnaseI = annotations.ENCODEDnaseIClusters(None, None)
            encode_cons_seg = annotations.ENCODESegInfo(
                None, None, None, None, None, None)
            gerp_el = None
            vista_enhancers = None
            cosmic_ids = None
            fitcons = None
            cadd_raw = None
            cadd_scaled = None
            gerp_bp = None

        # impacts is a list of impacts for this variant
        impacts = None
        severe_impacts = None
        # impact terms initialized to None for handling unannotated vcf's
        # anno_id in variants is for the trans. with the most severe impact term
        gene = transcript = exon = codon_change = aa_change = aa_length = \
            biotype = consequence = consequence_so = effect_severity = None
        is_coding = is_exonic = is_lof = None
        polyphen_pred = polyphen_score = sift_pred = sift_score = anno_id = None

        if self.args.anno_type is not None:
            impacts = func_impact.interpret_impact(self.args, var,
                                                   self._effect_fields)
            severe_impacts = \
                severe_impact.interpret_severe_impact(self.args, var, self._effect_fields)
            if severe_impacts:
                extra_fields.update(severe_impacts.extra_fields)
                gene = severe_impacts.gene
                transcript = severe_impacts.transcript
                exon = severe_impacts.exon
                codon_change = severe_impacts.codon_change
                aa_change = severe_impacts.aa_change
                aa_length = severe_impacts.aa_length
                biotype = severe_impacts.biotype
                consequence = severe_impacts.consequence
                effect_severity = severe_impacts.effect_severity
                polyphen_pred = severe_impacts.polyphen_pred
                polyphen_score = severe_impacts.polyphen_score
                sift_pred = severe_impacts.sift_pred
                sift_score = severe_impacts.sift_score
                anno_id = severe_impacts.anno_id
                is_exonic = severe_impacts.is_exonic
                is_coding = severe_impacts.is_coding
                is_lof = severe_impacts.is_lof
                consequence_so = severe_impacts.so

        # construct the filter string
        filter_str = None
        if var.FILTER is not None and var.FILTER != ".":
            if isinstance(var.FILTER, list):
                filter_str = ";".join(var.FILTER)
            else:
                filter_str = var.FILTER

        vcf_id = None
        if var.ID is not None and var.ID != ".":
            vcf_id = var.ID

        # build up numpy arrays for the genotype information.
        # these arrays will be pickled-to-binary, compressed,
        # and loaded as SQLite BLOB values (see compression.pack_blob)
        if not self.args.no_genotypes and not self.args.no_load_genotypes:
            # The builtin `str` / `bool` are used instead of `np.str` /
            # `np.bool`: those were only aliases of the builtins and have
            # been removed from NumPy, so the dtypes are unchanged.
            gt_bases = np.array(var.gt_bases, str)  # 'A/G', './.'
            gt_types = np.array(var.gt_types, np.int8)  # -1, 0, 1, 2
            gt_phases = np.array(var.gt_phases, bool)  # T F F
            gt_depths = np.array(var.gt_depths, np.int32)  # 10 37 0
            gt_ref_depths = np.array(var.gt_ref_depths, np.int32)  # 2 21 0 -1
            gt_alt_depths = np.array(var.gt_alt_depths, np.int32)  # 8 16 0 -1
            gt_quals = np.array(var.gt_quals, np.float32)  # 10.78 22 99 -1
            gt_copy_numbers = np.array(var.gt_copy_numbers,
                                       np.float32)  # 1.0 2.0 2.1 -1

            # tally the genotypes
            self._update_sample_gt_counts(gt_types)
        else:
            gt_bases = None
            gt_types = None
            gt_phases = None
            gt_depths = None
            gt_ref_depths = None
            gt_alt_depths = None
            gt_quals = None
            gt_copy_numbers = None

        if self.args.skip_info_string is False:
            info = var.INFO
        else:
            info = None

        # were functional impacts predicted by SnpEFF or VEP?
        # if so, build up a row for each of the impacts / transcript
        variant_impacts = []
        if impacts is not None:
            # impact ids in the rows are 1-based
            for idx, impact in enumerate(impacts, start=1):
                var_impact = [
                    self.v_id, idx, impact.gene, impact.transcript,
                    impact.is_exonic, impact.is_coding, impact.is_lof,
                    impact.exon, impact.codon_change, impact.aa_change,
                    impact.aa_length, impact.biotype, impact.consequence,
                    impact.so, impact.effect_severity, impact.polyphen_pred,
                    impact.polyphen_score, impact.sift_pred, impact.sift_score
                ]
                variant_impacts.append(var_impact)

        # extract structural variants
        sv = svs.StructuralVariant(var)
        ci_left = sv.get_ci_left()
        ci_right = sv.get_ci_right()

        # when extra fields were collected, tag them with the variant's
        # coordinates (the raw var.CHROM, without the "chr" normalization
        # applied to the variant row below)
        if extra_fields:
            extra_fields.update({
                "chrom": var.CHROM,
                "start": var.start,
                "end": var.end
            })

        # construct the core variant record.
        # 1 row per variant to VARIANTS table
        # (the order of the values below is positional -- do not reorder)
        chrom = var.CHROM if var.CHROM.startswith("chr") else "chr" + var.CHROM
        variant = [
            chrom, var.start, var.end, vcf_id, self.v_id, anno_id, var.REF,
            ','.join([x or "" for x in var.ALT
                      ]), var.QUAL, filter_str, var.var_type, var.var_subtype,
            pack_blob(gt_bases),
            pack_blob(gt_types),
            pack_blob(gt_phases),
            pack_blob(gt_depths),
            pack_blob(gt_ref_depths),
            pack_blob(gt_alt_depths),
            pack_blob(gt_quals),
            pack_blob(gt_copy_numbers), call_rate, in_dbsnp, rs_ids,
            ci_left[0], ci_left[1], ci_right[0], ci_right[1],
            sv.get_length(),
            sv.is_precise(),
            sv.get_sv_tool(),
            sv.get_evidence_type(),
            sv.get_event_id(),
            sv.get_mate_id(),
            sv.get_strand(), clinvar_info.clinvar_in_omim,
            clinvar_info.clinvar_sig, clinvar_info.clinvar_disease_name,
            clinvar_info.clinvar_dbsource, clinvar_info.clinvar_dbsource_id,
            clinvar_info.clinvar_origin, clinvar_info.clinvar_dsdb,
            clinvar_info.clinvar_dsdbid, clinvar_info.clinvar_disease_acc,
            clinvar_info.clinvar_in_locus_spec_db,
            clinvar_info.clinvar_on_diag_assay,
            clinvar_info.clinvar_causal_allele, pfam_domain, cyto_band,
            rmsk_hits, in_cpg, in_segdup, is_conserved, gerp_bp, gerp_el,
            hom_ref, het, hom_alt, unknown, aaf, hwe_p_value, inbreeding_coeff,
            pi_hat, recomb_rate, gene, transcript, is_exonic, is_coding,
            is_lof, exon, codon_change, aa_change, aa_length, biotype,
            consequence, consequence_so, effect_severity, polyphen_pred,
            polyphen_score, sift_pred, sift_score,
            infotag.get_ancestral_allele(var),
            infotag.get_rms_bq(var),
            infotag.get_cigar(var),
            infotag.get_depth(var),
            infotag.get_strand_bias(var),
            infotag.get_rms_map_qual(var),
            infotag.get_homopol_run(var),
            infotag.get_map_qual_zero(var),
            infotag.get_num_of_alleles(var),
            infotag.get_frac_dels(var),
            infotag.get_haplotype_score(var),
            infotag.get_quality_by_depth(var),
            infotag.get_allele_count(var),
            infotag.get_allele_bal(var),
            infotag.in_hm2(var),
            infotag.in_hm3(var),
            infotag.is_somatic(var),
            infotag.get_somatic_score(var), esp.found, esp.aaf_EA, esp.aaf_AA,
            esp.aaf_ALL, esp.exome_chip, thousandG.found, thousandG.aaf_AMR,
            thousandG.aaf_EAS, thousandG.aaf_SAS, thousandG.aaf_AFR,
            thousandG.aaf_EUR, thousandG.aaf_ALL, grc, gms.illumina, gms.solid,
            gms.iontorrent, in_cse, encode_tfbs, encode_dnaseI.cell_count,
            encode_dnaseI.cell_list, encode_cons_seg.gm12878,
            encode_cons_seg.h1hesc, encode_cons_seg.helas3,
            encode_cons_seg.hepg2, encode_cons_seg.huvec, encode_cons_seg.k562,
            vista_enhancers, cosmic_ids,
            pack_blob(info), cadd_raw, cadd_scaled, fitcons, Exac.found,
            Exac.aaf_ALL, Exac.adj_aaf_ALL, Exac.aaf_AFR, Exac.aaf_AMR,
            Exac.aaf_EAS, Exac.aaf_FIN, Exac.aaf_NFE, Exac.aaf_OTH,
            Exac.aaf_SAS
        ]

        return variant, variant_impacts, extra_fields
Esempio n. 5
0
    def _prepare_variation(self, var, anno_keys):
        """private method to collect metrics for a single variant (var) in a VCF file.

        Extracts variant information, variant impacts and extra fields for annotation.
        """
        extra_fields = {}
        # these metric require that genotypes are present in the file
        call_rate = None
        hwe_p_value = None
        pi_hat = None
        inbreeding_coeff = None
        hom_ref = het = hom_alt = unknown = None

        # only compute certain metrics if genoypes are available
        if not self.args.no_genotypes and not self.args.no_load_genotypes:
            hom_ref = var.num_hom_ref
            hom_alt = var.num_hom_alt
            het = var.num_het
            unknown = var.num_unknown
            call_rate = var.call_rate
            aaf = var.aaf
            hwe_p_value, inbreeding_coeff = \
                popgen.get_hwe_likelihood(hom_ref, het, hom_alt, aaf)
            pi_hat = var.nucl_diversity
        else:
            aaf = infotag.extract_aaf(var)
            if not isinstance(aaf, (float, int)):
                if aaf is not None:
                    aaf = max(aaf)

        ############################################################
        # collect annotations from gemini's custom annotation files
        # but only if the size of the variant is <= 50kb
        ############################################################
        if var.end - var.POS < 50000:
            pfam_domain = annotations.get_pfamA_domains(var)
            cyto_band = annotations.get_cyto_info(var)
            rs_ids = annotations.get_dbsnp_info(var)
            clinvar_info = annotations.get_clinvar_info(var)
            in_dbsnp = 0 if rs_ids is None else 1
            rmsk_hits = annotations.get_rmsk_info(var)
            in_cpg = annotations.get_cpg_island_info(var)
            in_segdup = annotations.get_segdup_info(var)
            is_conserved = annotations.get_conservation_info(var)
            esp = annotations.get_esp_info(var)
            thousandG = annotations.get_1000G_info(var)
            recomb_rate = annotations.get_recomb_info(var)
            gms = annotations.get_gms(var)
            grc = annotations.get_grc(var)
            in_cse = annotations.get_cse(var)
            encode_tfbs = annotations.get_encode_tfbs(var)
            encode_dnaseI = annotations.get_encode_dnase_clusters(var)
            encode_cons_seg = annotations.get_encode_consensus_segs(var)
            gerp_el = annotations.get_gerp_elements(var)
            vista_enhancers = annotations.get_vista_enhancers(var)
            cosmic_ids = annotations.get_cosmic_info(var)
            fitcons = annotations.get_fitcons(var)
            Exac = annotations.get_exac_info(var)

            #load CADD scores by default
            if self.args.skip_cadd is False:
                (cadd_raw, cadd_scaled) = annotations.get_cadd_scores(var)
            else:
                (cadd_raw, cadd_scaled) = (None, None)

            # load the GERP score for this variant by default.
            gerp_bp = None
            if self.args.skip_gerp_bp is False:
                gerp_bp = annotations.get_gerp_bp(var)
        # the variant is too big to annotate
        else:
            pfam_domain = None
            cyto_band = None
            rs_ids = None
            clinvar_info = annotations.ClinVarInfo()
            in_dbsnp = None
            rmsk_hits = None
            in_cpg = None
            in_segdup = None
            is_conserved = None
            esp = annotations.ESPInfo(None, None, None, None, None)
            thousandG = annotations.ThousandGInfo(None, None, None, None, None,
                                                  None, None)
            Exac = annotations.ExacInfo(None, None, None, None, None, None,
                                        None, None, None, None, None, None,
                                        None)
            recomb_rate = None
            gms = annotations.GmsTechs(None, None, None)
            grc = None
            in_cse = None
            encode_tfbs = None
            encode_dnaseI = annotations.ENCODEDnaseIClusters(None, None)
            encode_cons_seg = annotations.ENCODESegInfo(
                None, None, None, None, None, None)
            gerp_el = None
            vista_enhancers = None
            cosmic_ids = None
            fitcons = None
            cadd_raw = None
            cadd_scaled = None
            gerp_bp = None

        top_impact = empty
        if anno_keys == {}:
            impacts = []
        else:

            impacts = []
            if self.args.anno_type in ("all", "snpEff"):
                try:
                    if "EFF" in anno_keys:
                        impacts += [
                            geneimpacts.OldSnpEff(e, anno_keys["EFF"])
                            for e in var.INFO["EFF"].split(",")
                        ]
                    elif "ANN" in anno_keys:
                        impacts += [
                            geneimpacts.SnpEff(e, anno_keys["ANN"])
                            for e in var.INFO["ANN"].split(",")
                        ]
                except KeyError:
                    pass

            if self.args.anno_type in ("all", "VEP"):
                try:
                    impacts += [
                        geneimpacts.VEP(e, anno_keys["CSQ"])
                        for e in var.INFO["CSQ"].split(",")
                    ]
                except KeyError:
                    pass

            for i, im in enumerate(impacts, start=1):
                im.anno_id = i
            if impacts != []:
                top_impact = geneimpacts.Effect.top_severity(impacts)
                if isinstance(top_impact, list):
                    top_impact = top_impact[0]

        filter = None
        if var.FILTER is not None and var.FILTER != ".":
            if isinstance(var.FILTER, list):
                filter = ";".join(var.FILTER)
            else:
                filter = var.FILTER

        vcf_id = None
        if var.ID is not None and var.ID != ".":
            vcf_id = var.ID
        chrom = var.CHROM if var.CHROM.startswith("chr") else "chr" + var.CHROM

        clinvar_gene_phenotype = None
        if top_impact.gene is not None:
            clinvar_gene_phenotype = self.clinvar_chrom_gene_lookup.get(
                (chrom[3:], top_impact.gene))

        # build up numpy arrays for the genotype information.
        # these arrays will be pickled-to-binary, compressed,
        # and loaded as BLOB values (see compression.pack_blob)
        gt_phred_ll_homref = gt_phred_ll_het = gt_phred_ll_homalt = None

        if not self.args.no_genotypes and not self.args.no_load_genotypes:
            gt_bases = var.gt_bases
            gt_types = var.gt_types
            gt_phases = var.gt_phases
            gt_depths = var.gt_depths
            gt_ref_depths = var.gt_ref_depths
            gt_alt_depths = var.gt_alt_depths
            gt_quals = var.gt_quals
            #gt_copy_numbers = np.array(var.gt_copy_numbers, np.float32)  # 1.0 2.0 2.1 -1
            gt_copy_numbers = None
            gt_phred_ll_homref = var.gt_phred_ll_homref
            gt_phred_ll_het = var.gt_phred_ll_het
            gt_phred_ll_homalt = var.gt_phred_ll_homalt
            # tally the genotypes
            self._update_sample_gt_counts(gt_types)
        else:
            gt_bases = gt_types = gt_phases = gt_depths = gt_ref_depths = None
            gt_alt_depths = gt_quals = gt_copy_numbers = None

        if self.args.skip_info_string:
            info = None
        else:
            info = dict(var.INFO)

        # were functional impacts predicted by snpEff or VEP?
        # if so, build one row per predicted impact / transcript
        variant_impacts = []
        for idx, impact in enumerate(impacts or [], start=1):
            var_impact = [
                self.v_id, idx, impact.gene, impact.transcript,
                impact.is_exonic, impact.is_coding, impact.is_splicing,
                impact.is_lof, impact.exon, impact.codon_change,
                impact.aa_change, impact.aa_length, impact.biotype,
                impact.top_consequence, impact.so, impact.effect_severity,
                impact.polyphen_pred, impact.polyphen_score, impact.sift_pred,
                impact.sift_score
            ]
            variant_impacts.append(var_impact)

        # extract structural variants
        sv = svs.StructuralVariant(var)
        ci_left = sv.get_ci_left()
        ci_right = sv.get_ci_right()

        if top_impact is not empty:
            for dbkey, infokey in self._extra_effect_fields:
                extra_fields[dbkey] = top_impact.effects[infokey]

        # construct the core variant record.
        # 1 row per variant to VARIANTS table
        variant = [
            chrom, var.start, var.end, vcf_id, self.v_id, top_impact.anno_id,
            var.REF, ','.join([x or "" for x in var.ALT]), var.QUAL, filter,
            var.var_type, var.var_subtype,
            pack_blob(gt_bases),
            pack_blob(gt_types),
            pack_blob(gt_phases),
            pack_blob(gt_depths),
            pack_blob(gt_ref_depths),
            pack_blob(gt_alt_depths),
            pack_blob(gt_quals),
            pack_blob(gt_copy_numbers),
            pack_blob(gt_phred_ll_homref),
            pack_blob(gt_phred_ll_het),
            pack_blob(gt_phred_ll_homalt), call_rate, in_dbsnp, rs_ids,
            ci_left[0], ci_left[1], ci_right[0], ci_right[1],
            sv.get_length(),
            sv.is_precise(),
            sv.get_sv_tool(),
            sv.get_evidence_type(),
            sv.get_event_id(),
            sv.get_mate_id(),
            sv.get_strand(), clinvar_info.clinvar_in_omim,
            clinvar_info.clinvar_sig, clinvar_info.clinvar_disease_name,
            clinvar_info.clinvar_dbsource, clinvar_info.clinvar_dbsource_id,
            clinvar_info.clinvar_origin, clinvar_info.clinvar_dsdb,
            clinvar_info.clinvar_dsdbid, clinvar_info.clinvar_disease_acc,
            clinvar_info.clinvar_in_locus_spec_db,
            clinvar_info.clinvar_on_diag_assay,
            clinvar_info.clinvar_causal_allele, clinvar_gene_phenotype,
            pfam_domain, cyto_band, rmsk_hits, in_cpg, in_segdup, is_conserved,
            gerp_bp, gerp_el, hom_ref, het, hom_alt, unknown, aaf, hwe_p_value,
            inbreeding_coeff, pi_hat, recomb_rate, top_impact.gene,
            top_impact.transcript, top_impact.is_exonic, top_impact.is_coding,
            top_impact.is_splicing, top_impact.is_lof, top_impact.exon,
            top_impact.codon_change, top_impact.aa_change,
            top_impact.aa_length, top_impact.biotype,
            top_impact.top_consequence, top_impact.so,
            top_impact.effect_severity, top_impact.polyphen_pred,
            top_impact.polyphen_score, top_impact.sift_pred,
            top_impact.sift_score,
            infotag.get_ancestral_allele(var),
            infotag.get_rms_bq(var),
            infotag.get_cigar(var),
            infotag.get_depth(var),
            infotag.get_strand_bias(var),
            infotag.get_rms_map_qual(var),
            infotag.get_homopol_run(var),
            infotag.get_map_qual_zero(var),
            infotag.get_num_of_alleles(var),
            infotag.get_frac_dels(var),
            infotag.get_haplotype_score(var),
            infotag.get_quality_by_depth(var),
            infotag.get_allele_count(var),
            infotag.get_allele_bal(var),
            infotag.in_hm2(var),
            infotag.in_hm3(var),
            infotag.is_somatic(var),
            infotag.get_somatic_score(var), esp.found, esp.aaf_EA, esp.aaf_AA,
            esp.aaf_ALL, esp.exome_chip, thousandG.found, thousandG.aaf_AMR,
            thousandG.aaf_EAS, thousandG.aaf_SAS, thousandG.aaf_AFR,
            thousandG.aaf_EUR, thousandG.aaf_ALL, grc, gms.illumina, gms.solid,
            gms.iontorrent, in_cse, encode_tfbs, encode_dnaseI.cell_count,
            encode_dnaseI.cell_list, encode_cons_seg.gm12878,
            encode_cons_seg.h1hesc, encode_cons_seg.helas3,
            encode_cons_seg.hepg2, encode_cons_seg.huvec, encode_cons_seg.k562,
            vista_enhancers, cosmic_ids,
            pack_blob(info), cadd_raw, cadd_scaled, fitcons, Exac.found,
            Exac.aaf_ALL, Exac.adj_aaf_ALL, Exac.aaf_AFR, Exac.aaf_AMR,
            Exac.aaf_EAS, Exac.aaf_FIN, Exac.aaf_NFE, Exac.aaf_OTH,
            Exac.aaf_SAS, Exac.num_het, Exac.num_hom_alt, Exac.num_chroms
        ]

        return variant, variant_impacts, extra_fields