예제 #1
0
    def _prepare_variation(self, var):
        """Collect the database rows for a single variant (var) of a VCF file.

        Gathers genotype-derived metrics, annotation lookups, the most severe
        predicted impact and INFO-tag values for the variant.

        Returns a 3-tuple ``(variant, variant_impacts, extra_fields)``:

        * ``variant`` -- flat list of column values, one row for the
          VARIANTS table.  The position of every element matters.
        * ``variant_impacts`` -- one list per predicted impact (empty when
          no annotation type was requested).
        * ``extra_fields`` -- dict of extra values taken from the most severe
          impact, plus chrom/start/end when it is non-empty.
        """
        extra_fields = {}

        # genotype-dependent work is done only when genotypes are both
        # present in the file and requested to be loaded.
        load_genotypes = (not self.args.no_genotypes
                          and not self.args.no_load_genotypes)

        # these metrics require that genotypes are present in the file
        call_rate = None
        hwe_p_value = None
        pi_hat = None
        inbreeding_coeff = None
        hom_ref = het = hom_alt = unknown = None

        # only compute certain metrics if genotypes are available
        if load_genotypes:
            hom_ref = var.num_hom_ref
            hom_alt = var.num_hom_alt
            het = var.num_het
            unknown = var.num_unknown
            call_rate = var.call_rate
            aaf = var.aaf
            hwe_p_value, inbreeding_coeff = \
                popgen.get_hwe_likelihood(hom_ref, het, hom_alt, aaf)
            pi_hat = var.nucl_diversity
        else:
            # without genotypes, fall back to the value taken from the VCF
            # record itself
            aaf = infotag.extract_aaf(var)

        ############################################################
        # collect annotations from gemini's custom annotation files
        # but only if the variant spans fewer than 50,000 bases
        ############################################################
        if var.end - var.POS < 50000:
            pfam_domain = annotations.get_pfamA_domains(var)
            cyto_band = annotations.get_cyto_info(var)
            rs_ids = annotations.get_dbsnp_info(var)
            clinvar_info = annotations.get_clinvar_info(var)
            in_dbsnp = 0 if rs_ids is None else 1
            rmsk_hits = annotations.get_rmsk_info(var)
            in_cpg = annotations.get_cpg_island_info(var)
            in_segdup = annotations.get_segdup_info(var)
            is_conserved = annotations.get_conservation_info(var)
            esp = annotations.get_esp_info(var)
            thousandG = annotations.get_1000G_info(var)
            recomb_rate = annotations.get_recomb_info(var)
            gms = annotations.get_gms(var)
            grc = annotations.get_grc(var)
            in_cse = annotations.get_cse(var)
            encode_tfbs = annotations.get_encode_tfbs(var)
            encode_dnaseI = annotations.get_encode_dnase_clusters(var)
            encode_cons_seg = annotations.get_encode_consensus_segs(var)
            gerp_el = annotations.get_gerp_elements(var)
            vista_enhancers = annotations.get_vista_enhancers(var)
            cosmic_ids = annotations.get_cosmic_info(var)
            fitcons = annotations.get_fitcons(var)
            Exac = annotations.get_exac_info(var)

            # load CADD scores by default
            if self.args.skip_cadd is False:
                (cadd_raw, cadd_scaled) = annotations.get_cadd_scores(var)
            else:
                (cadd_raw, cadd_scaled) = (None, None)

            # load the GERP score for this variant by default.
            gerp_bp = None
            if self.args.skip_gerp_bp is False:
                gerp_bp = annotations.get_gerp_bp(var)
        # the variant is too big to annotate: use empty placeholders so the
        # attribute accesses in the record below still work
        else:
            pfam_domain = None
            cyto_band = None
            rs_ids = None
            clinvar_info = annotations.ClinVarInfo()
            in_dbsnp = None
            rmsk_hits = None
            in_cpg = None
            in_segdup = None
            is_conserved = None
            esp = annotations.ESPInfo(None, None, None, None, None)
            thousandG = annotations.ThousandGInfo(None, None, None, None,
                                                  None, None, None)
            Exac = annotations.ExacInfo(None, None, None, None, None,
                                        None, None, None, None, None)
            recomb_rate = None
            gms = annotations.GmsTechs(None, None, None)
            grc = None
            in_cse = None
            encode_tfbs = None
            encode_dnaseI = annotations.ENCODEDnaseIClusters(None, None)
            encode_cons_seg = annotations.ENCODESegInfo(None, None, None,
                                                        None, None, None)
            gerp_el = None
            vista_enhancers = None
            cosmic_ids = None
            fitcons = None
            cadd_raw = None
            cadd_scaled = None
            gerp_bp = None

        # impact is a list of impacts for this variant
        impacts = None
        severe_impacts = None
        # impact terms initialized to None for handling unannotated vcf's
        # anno_id in variants is for the trans. with the most severe impact term
        gene = transcript = exon = codon_change = aa_change = aa_length = \
            biotype = consequence = consequence_so = effect_severity = None
        is_coding = is_exonic = is_lof = None
        polyphen_pred = polyphen_score = sift_pred = sift_score = anno_id = None

        if self.args.anno_type is not None:
            impacts = func_impact.interpret_impact(
                self.args, var, self._effect_fields)
            severe_impacts = severe_impact.interpret_severe_impact(
                self.args, var, self._effect_fields)
            if severe_impacts:
                extra_fields.update(severe_impacts.extra_fields)
                gene = severe_impacts.gene
                transcript = severe_impacts.transcript
                exon = severe_impacts.exon
                codon_change = severe_impacts.codon_change
                aa_change = severe_impacts.aa_change
                aa_length = severe_impacts.aa_length
                biotype = severe_impacts.biotype
                consequence = severe_impacts.consequence
                effect_severity = severe_impacts.effect_severity
                polyphen_pred = severe_impacts.polyphen_pred
                polyphen_score = severe_impacts.polyphen_score
                sift_pred = severe_impacts.sift_pred
                sift_score = severe_impacts.sift_score
                anno_id = severe_impacts.anno_id
                is_exonic = severe_impacts.is_exonic
                is_coding = severe_impacts.is_coding
                is_lof = severe_impacts.is_lof
                consequence_so = severe_impacts.so

        # construct the filter string ("." and None both mean "no filter").
        # named filter_str so the builtin filter() is not shadowed.
        filter_str = None
        if var.FILTER is not None and var.FILTER != ".":
            if isinstance(var.FILTER, list):
                filter_str = ";".join(var.FILTER)
            else:
                filter_str = var.FILTER

        vcf_id = None
        if var.ID is not None and var.ID != ".":
            vcf_id = var.ID

        # build up numpy arrays for the genotype information.
        # these arrays will be pickled-to-binary, compressed,
        # and loaded as SqlLite BLOB values (see compression.pack_blob)
        if load_genotypes:
            # the builtin str/bool are used as dtypes: the np.str / np.bool
            # aliases were plain aliases of them and no longer exist in
            # current NumPy releases.
            gt_bases = np.array(var.gt_bases, str)  # 'A/G', './.'
            gt_types = np.array(var.gt_types, np.int8)  # -1, 0, 1, 2
            gt_phases = np.array(var.gt_phases, bool)  # T F F
            gt_depths = np.array(var.gt_depths, np.int32)  # 10 37 0
            gt_ref_depths = np.array(var.gt_ref_depths, np.int32)  # 2 21 0 -1
            gt_alt_depths = np.array(var.gt_alt_depths, np.int32)  # 8 16 0 -1
            gt_quals = np.array(var.gt_quals, np.float32)  # 10.78 22 99 -1
            gt_copy_numbers = np.array(var.gt_copy_numbers, np.float32)  # 1.0 2.0 2.1 -1

            # tally the genotypes
            self._update_sample_gt_counts(gt_types)
        else:
            gt_bases = None
            gt_types = None
            gt_phases = None
            gt_depths = None
            gt_ref_depths = None
            gt_alt_depths = None
            gt_quals = None
            gt_copy_numbers = None

        if self.args.skip_info_string is False:
            info = var.INFO
        else:
            info = None

        # were functional impacts predicted by SnpEFF or VEP?
        # if so, build up a row for each of the impacts / transcript
        variant_impacts = []
        if impacts is not None:
            for idx, impact in enumerate(impacts, start=1):
                var_impact = [self.v_id, idx, impact.gene,
                              impact.transcript, impact.is_exonic,
                              impact.is_coding, impact.is_lof,
                              impact.exon, impact.codon_change,
                              impact.aa_change, impact.aa_length,
                              impact.biotype, impact.consequence,
                              impact.so, impact.effect_severity,
                              impact.polyphen_pred, impact.polyphen_score,
                              impact.sift_pred, impact.sift_score]
                variant_impacts.append(var_impact)

        # extract structural variants
        sv = svs.StructuralVariant(var)
        ci_left = sv.get_ci_left()
        ci_right = sv.get_ci_right()

        # construct the core variant record.
        # 1 row per variant to VARIANTS table
        if extra_fields:
            # note: uses the chromosome name exactly as it appears in the VCF,
            # not the "chr"-prefixed form stored in the variant row.
            extra_fields.update({"chrom": var.CHROM, "start": var.start, "end": var.end})
        chrom = var.CHROM if var.CHROM.startswith("chr") else "chr" + var.CHROM
        variant = [chrom, var.start, var.end,
                   vcf_id, self.v_id, anno_id, var.REF, ','.join(var.ALT),
                   var.QUAL, filter_str, var.var_type,
                   var.var_subtype, pack_blob(gt_bases), pack_blob(gt_types),
                   pack_blob(gt_phases), pack_blob(gt_depths),
                   pack_blob(gt_ref_depths), pack_blob(gt_alt_depths),
                   pack_blob(gt_quals), pack_blob(gt_copy_numbers),
                   call_rate, in_dbsnp,
                   rs_ids,
                   ci_left[0],
                   ci_left[1],
                   ci_right[0],
                   ci_right[1],
                   sv.get_length(),
                   sv.is_precise(),
                   sv.get_sv_tool(),
                   sv.get_evidence_type(),
                   sv.get_event_id(),
                   sv.get_mate_id(),
                   sv.get_strand(),
                   clinvar_info.clinvar_in_omim,
                   clinvar_info.clinvar_sig,
                   clinvar_info.clinvar_disease_name,
                   clinvar_info.clinvar_dbsource,
                   clinvar_info.clinvar_dbsource_id,
                   clinvar_info.clinvar_origin,
                   clinvar_info.clinvar_dsdb,
                   clinvar_info.clinvar_dsdbid,
                   clinvar_info.clinvar_disease_acc,
                   clinvar_info.clinvar_in_locus_spec_db,
                   clinvar_info.clinvar_on_diag_assay,
                   clinvar_info.clinvar_causal_allele,
                   pfam_domain, cyto_band, rmsk_hits, in_cpg,
                   in_segdup, is_conserved, gerp_bp, gerp_el,
                   hom_ref, het, hom_alt, unknown,
                   aaf, hwe_p_value, inbreeding_coeff, pi_hat,
                   recomb_rate, gene, transcript, is_exonic,
                   is_coding, is_lof, exon, codon_change, aa_change,
                   aa_length, biotype, consequence, consequence_so, effect_severity,
                   polyphen_pred, polyphen_score, sift_pred, sift_score,
                   infotag.get_ancestral_allele(var), infotag.get_rms_bq(var),
                   infotag.get_cigar(var),
                   infotag.get_depth(var), infotag.get_strand_bias(var),
                   infotag.get_rms_map_qual(var), infotag.get_homopol_run(var),
                   infotag.get_map_qual_zero(var),
                   infotag.get_num_of_alleles(var),
                   infotag.get_frac_dels(var),
                   infotag.get_haplotype_score(var),
                   infotag.get_quality_by_depth(var),
                   infotag.get_allele_count(var), infotag.get_allele_bal(var),
                   infotag.in_hm2(var), infotag.in_hm3(var),
                   infotag.is_somatic(var),
                   infotag.get_somatic_score(var),
                   esp.found, esp.aaf_EA,
                   esp.aaf_AA, esp.aaf_ALL,
                   esp.exome_chip, thousandG.found,
                   thousandG.aaf_AMR, thousandG.aaf_EAS, thousandG.aaf_SAS,
                   thousandG.aaf_AFR, thousandG.aaf_EUR,
                   thousandG.aaf_ALL, grc,
                   gms.illumina, gms.solid,
                   gms.iontorrent, in_cse,
                   encode_tfbs,
                   encode_dnaseI.cell_count,
                   encode_dnaseI.cell_list,
                   encode_cons_seg.gm12878,
                   encode_cons_seg.h1hesc,
                   encode_cons_seg.helas3,
                   encode_cons_seg.hepg2,
                   encode_cons_seg.huvec,
                   encode_cons_seg.k562,
                   vista_enhancers,
                   cosmic_ids,
                   pack_blob(info),
                   cadd_raw,
                   cadd_scaled,
                   fitcons,
                   Exac.found,
                   Exac.aaf_ALL,
                   Exac.adj_aaf_ALL,
                   Exac.aaf_AFR, Exac.aaf_AMR,
                   Exac.aaf_EAS, Exac.aaf_FIN,
                   Exac.aaf_NFE, Exac.aaf_OTH,
                   Exac.aaf_SAS]

        return variant, variant_impacts, extra_fields
예제 #2
0
    def _prepare_variation(self, var, anno_keys):
        """Assemble the rows describing one VCF variant (var).

        Produces the core variant record, one row per predicted functional
        impact, and the extra effect fields taken from the top impact.

        ``anno_keys`` is looked up by annotation INFO key ("EFF", "ANN",
        "CSQ"); the stored value is handed to the matching geneimpacts
        parser.  An empty dict means no impacts are parsed.

        Returns ``(variant, variant_impacts, extra_fields)``.
        """
        extra_fields = {}

        # Genotype-based work happens only when genotypes are available
        # and are meant to be loaded.
        with_genotypes = not (self.args.no_genotypes
                              or self.args.no_load_genotypes)

        # Metrics that can only be derived from genotypes.
        call_rate = hwe_p_value = pi_hat = inbreeding_coeff = None
        hom_ref = het = hom_alt = unknown = None

        if with_genotypes:
            hom_ref = var.num_hom_ref
            hom_alt = var.num_hom_alt
            het = var.num_het
            unknown = var.num_unknown
            call_rate = var.call_rate
            aaf = var.aaf
            hwe_p_value, inbreeding_coeff = popgen.get_hwe_likelihood(
                hom_ref, het, hom_alt, aaf)
            pi_hat = var.nucl_diversity
        else:
            aaf = infotag.extract_aaf(var)
            # A non-scalar result is collapsed to its largest element.
            if aaf is not None and not isinstance(aaf, (float, int)):
                aaf = max(aaf)

        # ----------------------------------------------------------
        # Annotation lookups against gemini's annotation files are
        # skipped for variants spanning 50,000 bases or more.
        # ----------------------------------------------------------
        annotatable = var.end - var.POS < 50000
        if not annotatable:
            # Too large to annotate: fill in empty placeholders.
            pfam_domain = cyto_band = rs_ids = None
            clinvar_info = annotations.ClinVarInfo()
            in_dbsnp = None
            rmsk_hits = in_cpg = in_segdup = is_conserved = None
            esp = annotations.ESPInfo(*[None] * 5)
            thousandG = annotations.ThousandGInfo(*[None] * 7)
            Exac = annotations.ExacInfo(*[None] * 13)
            recomb_rate = None
            gms = annotations.GmsTechs(*[None] * 3)
            grc = in_cse = encode_tfbs = None
            encode_dnaseI = annotations.ENCODEDnaseIClusters(*[None] * 2)
            encode_cons_seg = annotations.ENCODESegInfo(*[None] * 6)
            gerp_el = vista_enhancers = cosmic_ids = fitcons = None
            cadd_raw = cadd_scaled = gerp_bp = None
        else:
            pfam_domain = annotations.get_pfamA_domains(var)
            cyto_band = annotations.get_cyto_info(var)
            rs_ids = annotations.get_dbsnp_info(var)
            clinvar_info = annotations.get_clinvar_info(var)
            in_dbsnp = 1 if rs_ids is not None else 0
            rmsk_hits = annotations.get_rmsk_info(var)
            in_cpg = annotations.get_cpg_island_info(var)
            in_segdup = annotations.get_segdup_info(var)
            is_conserved = annotations.get_conservation_info(var)
            esp = annotations.get_esp_info(var)
            thousandG = annotations.get_1000G_info(var)
            recomb_rate = annotations.get_recomb_info(var)
            gms = annotations.get_gms(var)
            grc = annotations.get_grc(var)
            in_cse = annotations.get_cse(var)
            encode_tfbs = annotations.get_encode_tfbs(var)
            encode_dnaseI = annotations.get_encode_dnase_clusters(var)
            encode_cons_seg = annotations.get_encode_consensus_segs(var)
            gerp_el = annotations.get_gerp_elements(var)
            vista_enhancers = annotations.get_vista_enhancers(var)
            cosmic_ids = annotations.get_cosmic_info(var)
            fitcons = annotations.get_fitcons(var)
            Exac = annotations.get_exac_info(var)

            # CADD scores are loaded unless explicitly skipped.
            cadd_raw = cadd_scaled = None
            if self.args.skip_cadd is False:
                cadd_raw, cadd_scaled = annotations.get_cadd_scores(var)

            # Likewise the per-base GERP score.
            if self.args.skip_gerp_bp is False:
                gerp_bp = annotations.get_gerp_bp(var)
            else:
                gerp_bp = None

        # Parse the functional impacts; `empty` stands in for the top
        # impact when there is none.
        top_impact = empty
        impacts = []
        if anno_keys != {}:
            anno_type = self.args.anno_type

            if anno_type in ("all", "snpEff"):
                # A missing INFO entry simply yields no snpEff impacts.
                try:
                    if "EFF" in anno_keys:
                        parsed = [
                            geneimpacts.OldSnpEff(entry, anno_keys["EFF"])
                            for entry in var.INFO["EFF"].split(",")
                        ]
                        impacts.extend(parsed)
                    elif "ANN" in anno_keys:
                        parsed = [
                            geneimpacts.SnpEff(entry, anno_keys["ANN"])
                            for entry in var.INFO["ANN"].split(",")
                        ]
                        impacts.extend(parsed)
                except KeyError:
                    pass

            if anno_type in ("all", "VEP"):
                try:
                    parsed = [
                        geneimpacts.VEP(entry, anno_keys["CSQ"])
                        for entry in var.INFO["CSQ"].split(",")
                    ]
                    impacts.extend(parsed)
                except KeyError:
                    pass

            # Number the impacts from 1 in the order they were parsed.
            for number, parsed_impact in enumerate(impacts, start=1):
                parsed_impact.anno_id = number

            if impacts:
                top_impact = geneimpacts.Effect.top_severity(impacts)
                # When a list comes back, keep its first element.
                if isinstance(top_impact, list):
                    top_impact = top_impact[0]

        # FILTER: None and "." both mean there is nothing to record.
        raw_filter = var.FILTER
        if raw_filter is None or raw_filter == ".":
            filter_value = None
        elif isinstance(raw_filter, list):
            filter_value = ";".join(raw_filter)
        else:
            filter_value = raw_filter

        raw_id = var.ID
        vcf_id = raw_id if (raw_id is not None and raw_id != ".") else None

        chrom = var.CHROM
        if not chrom.startswith("chr"):
            chrom = "chr" + chrom

        # The lookup is keyed on the chromosome name without its "chr"
        # prefix, together with the gene of the top impact.
        clinvar_gene_phenotype = None
        if top_impact.gene is not None:
            lookup_key = (chrom[3:], top_impact.gene)
            clinvar_gene_phenotype = self.clinvar_chrom_gene_lookup.get(
                lookup_key)

        # Per-sample genotype data; each value is passed through pack_blob
        # when the variant record is built.
        gt_bases = gt_types = gt_phases = None
        gt_depths = gt_ref_depths = gt_alt_depths = None
        gt_quals = gt_copy_numbers = None
        gt_phred_ll_homref = gt_phred_ll_het = gt_phred_ll_homalt = None

        if with_genotypes:
            gt_bases = var.gt_bases
            gt_types = var.gt_types
            gt_phases = var.gt_phases
            gt_depths = var.gt_depths
            gt_ref_depths = var.gt_ref_depths
            gt_alt_depths = var.gt_alt_depths
            gt_quals = var.gt_quals
            # gt_copy_numbers is deliberately left as None here.
            gt_phred_ll_homref = var.gt_phred_ll_homref
            gt_phred_ll_het = var.gt_phred_ll_het
            gt_phred_ll_homalt = var.gt_phred_ll_homalt
            # keep the per-sample genotype tallies up to date
            self._update_sample_gt_counts(gt_types)

        info = None if self.args.skip_info_string else dict(var.INFO)

        # One row per predicted impact / transcript.
        variant_impacts = [
            [
                self.v_id, row_number, imp.gene, imp.transcript,
                imp.is_exonic, imp.is_coding, imp.is_splicing,
                imp.is_lof, imp.exon, imp.codon_change,
                imp.aa_change, imp.aa_length, imp.biotype,
                imp.top_consequence, imp.so, imp.effect_severity,
                imp.polyphen_pred, imp.polyphen_score, imp.sift_pred,
                imp.sift_score,
            ]
            for row_number, imp in enumerate(impacts, start=1)
        ]

        # Structural-variant details.
        sv = svs.StructuralVariant(var)
        ci_left = sv.get_ci_left()
        ci_right = sv.get_ci_right()

        if top_impact is not empty:
            for db_column, info_key in self._extra_effect_fields:
                extra_fields[db_column] = top_impact.effects[info_key]

        # The core record: one row per variant for the VARIANTS table.
        # It is built in segments, but the element order is significant.

        # identity and basic description
        variant = [
            chrom, var.start, var.end, vcf_id, self.v_id, top_impact.anno_id,
            var.REF, ','.join(x or "" for x in var.ALT), var.QUAL,
            filter_value, var.var_type, var.var_subtype,
        ]

        # packed genotype columns
        variant += [
            pack_blob(gt_column)
            for gt_column in (
                gt_bases, gt_types, gt_phases, gt_depths,
                gt_ref_depths, gt_alt_depths, gt_quals, gt_copy_numbers,
                gt_phred_ll_homref, gt_phred_ll_het, gt_phred_ll_homalt,
            )
        ]

        # call rate, dbSNP and structural-variant columns
        variant += [
            call_rate, in_dbsnp, rs_ids,
            ci_left[0], ci_left[1], ci_right[0], ci_right[1],
            sv.get_length(),
            sv.is_precise(),
            sv.get_sv_tool(),
            sv.get_evidence_type(),
            sv.get_event_id(),
            sv.get_mate_id(),
            sv.get_strand(),
        ]

        # ClinVar and geno2mp columns
        variant += [
            clinvar_info.clinvar_in_omim,
            clinvar_info.clinvar_sig,
            clinvar_info.clinvar_disease_name,
            clinvar_info.clinvar_dbsource,
            clinvar_info.clinvar_dbsource_id,
            clinvar_info.clinvar_origin,
            clinvar_info.clinvar_dsdb,
            clinvar_info.clinvar_dsdbid,
            clinvar_info.clinvar_disease_acc,
            clinvar_info.clinvar_in_locus_spec_db,
            clinvar_info.clinvar_on_diag_assay,
            clinvar_info.clinvar_causal_allele,
            clinvar_gene_phenotype,
            annotations.get_geno2mp_ct(var),
        ]

        # region annotations and genotype-derived metrics
        variant += [
            pfam_domain, cyto_band, rmsk_hits, in_cpg, in_segdup,
            is_conserved, gerp_bp, gerp_el,
            hom_ref, het, hom_alt, unknown,
            aaf, hwe_p_value, inbreeding_coeff, pi_hat,
            recomb_rate,
        ]

        # columns describing the top impact
        variant += [
            top_impact.gene,
            top_impact.transcript,
            top_impact.is_exonic,
            top_impact.is_coding,
            top_impact.is_splicing,
            top_impact.is_lof,
            top_impact.exon,
            top_impact.codon_change,
            top_impact.aa_change,
            top_impact.aa_length,
            top_impact.biotype,
            top_impact.top_consequence,
            top_impact.so,
            top_impact.effect_severity,
            top_impact.polyphen_pred,
            top_impact.polyphen_score,
            top_impact.sift_pred,
            top_impact.sift_score,
        ]

        # values obtained through the infotag helpers
        variant += [
            infotag.get_ancestral_allele(var),
            infotag.get_rms_bq(var),
            infotag.get_cigar(var),
            infotag.get_depth(var),
            infotag.get_strand_bias(var),
            infotag.get_rms_map_qual(var),
            infotag.get_homopol_run(var),
            infotag.get_map_qual_zero(var),
            infotag.get_num_of_alleles(var),
            infotag.get_frac_dels(var),
            infotag.get_haplotype_score(var),
            infotag.get_quality_by_depth(var),
            infotag.get_allele_count(var),
            infotag.get_allele_bal(var),
            infotag.in_hm2(var),
            infotag.in_hm3(var),
            infotag.is_somatic(var),
            infotag.get_somatic_score(var),
        ]

        # ESP, 1000 Genomes, GRC, GMS and ENCODE columns
        variant += [
            esp.found, esp.aaf_EA, esp.aaf_AA, esp.aaf_ALL, esp.exome_chip,
            thousandG.found,
            thousandG.aaf_AMR, thousandG.aaf_EAS, thousandG.aaf_SAS,
            thousandG.aaf_AFR, thousandG.aaf_EUR, thousandG.aaf_ALL,
            grc,
            gms.illumina, gms.solid, gms.iontorrent,
            in_cse,
            encode_tfbs,
            encode_dnaseI.cell_count, encode_dnaseI.cell_list,
            encode_cons_seg.gm12878, encode_cons_seg.h1hesc,
            encode_cons_seg.helas3, encode_cons_seg.hepg2,
            encode_cons_seg.huvec, encode_cons_seg.k562,
            vista_enhancers, cosmic_ids,
        ]

        # packed INFO, CADD, fitCons and ExAC columns
        variant += [
            pack_blob(info), cadd_raw, cadd_scaled, fitcons,
            Exac.found, Exac.aaf_ALL, Exac.adj_aaf_ALL,
            Exac.aaf_AFR, Exac.aaf_AMR, Exac.aaf_EAS, Exac.aaf_FIN,
            Exac.aaf_NFE, Exac.aaf_OTH, Exac.aaf_SAS,
            Exac.num_het, Exac.num_hom_alt, Exac.num_chroms,
        ]

        return variant, variant_impacts, extra_fields