def _prepare_variation(self, var):
    """Collect the metrics and annotations for a single VCF variant.

    Private method. Gathers genotype summary statistics (only when
    genotypes are being loaded), gemini's custom annotations, the
    predicted functional impacts, and the INFO-tag derived values
    for ``var``.

    Returns a 2-tuple ``(variant, variant_impacts)``:

    * ``variant`` -- list of column values, one row for the
      VARIANTS table.
    * ``variant_impacts`` -- list of rows, one per impact reported
      for this variant (empty when no annotation type was given).
    """
    # genotype-derived values are only computed when genotypes are loaded
    load_genotypes = (not self.args.no_genotypes
                      and not self.args.no_load_genotypes)

    # these metrics require that genotypes are present in the file
    call_rate = None
    hwe_p_value = None
    pi_hat = None
    inbreeding_coeff = None
    hom_ref = het = hom_alt = unknown = None

    if load_genotypes:
        hom_ref = var.num_hom_ref
        hom_alt = var.num_hom_alt
        het = var.num_het
        unknown = var.num_unknown
        call_rate = var.call_rate
        aaf = var.aaf
        hwe_p_value, inbreeding_coeff = popgen.get_hwe_likelihood(
            hom_ref, het, hom_alt, aaf)
        pi_hat = var.nucl_diversity
    else:
        # without genotypes, fall back to the value taken from the INFO tags
        aaf = infotag.extract_aaf(var)

    ############################################################
    # collect annotations from gemini's custom annotation files
    ############################################################
    pfam_domain = annotations.get_pfamA_domains(var)
    cyto_band = annotations.get_cyto_info(var)
    rs_ids = annotations.get_dbsnp_info(var)
    clinvar_info = annotations.get_clinvar_info(var)
    in_dbsnp = 0 if rs_ids is None else 1
    rmsk_hits = annotations.get_rmsk_info(var)
    in_cpg = annotations.get_cpg_island_info(var)
    in_segdup = annotations.get_segdup_info(var)
    is_conserved = annotations.get_conservation_info(var)
    esp = annotations.get_esp_info(var)
    thousandG = annotations.get_1000G_info(var)
    recomb_rate = annotations.get_recomb_info(var)
    gms = annotations.get_gms(var)
    grc = annotations.get_grc(var)
    in_cse = annotations.get_cse(var)
    encode_tfbs = annotations.get_encode_tfbs(var)
    encode_dnaseI = annotations.get_encode_dnase_clusters(var)
    encode_cons_seg = annotations.get_encode_consensus_segs(var)
    gerp_el = annotations.get_gerp_elements(var)

    # grab the GERP score for this variant only if explicitly asked.
    gerp_bp = None
    if self.args.load_gerp_bp is True:
        gerp_bp = annotations.get_gerp_bp(var)

    # impacts is a list of impacts for this variant
    impacts = None
    severe_impacts = None
    # impact terms are initialized to None for handling unannotated VCFs.
    # anno_id in variants is for the transcript with the most severe
    # impact term
    gene = transcript = exon = codon_change = aa_change = aa_length = \
        biotype = consequence = effect_severity = None
    is_coding = is_exonic = is_lof = None
    polyphen_pred = polyphen_score = sift_pred = sift_score = anno_id = None

    if self.args.anno_type is not None:
        impacts = func_impact.interpret_impact(self.args, var)
        severe_impacts = severe_impact.interpret_severe_impact(
            self.args, var)
        if severe_impacts:
            gene = severe_impacts.gene
            transcript = severe_impacts.transcript
            exon = severe_impacts.exon
            codon_change = severe_impacts.codon_change
            aa_change = severe_impacts.aa_change
            aa_length = severe_impacts.aa_length
            biotype = severe_impacts.biotype
            consequence = severe_impacts.consequence
            effect_severity = severe_impacts.effect_severity
            polyphen_pred = severe_impacts.polyphen_pred
            polyphen_score = severe_impacts.polyphen_score
            sift_pred = severe_impacts.sift_pred
            sift_score = severe_impacts.sift_score
            anno_id = severe_impacts.anno_id
            is_exonic = severe_impacts.is_exonic
            is_coding = severe_impacts.is_coding
            is_lof = severe_impacts.is_lof

    # construct the filter string ("." and None both map to None)
    filter_str = None
    if var.FILTER is not None and var.FILTER != ".":
        if isinstance(var.FILTER, list):
            filter_str = ";".join(var.FILTER)
        else:
            filter_str = var.FILTER

    vcf_id = None
    if var.ID is not None and var.ID != ".":
        vcf_id = var.ID

    # build up numpy arrays for the genotype information.
    # these arrays will be pickled-to-binary, compressed,
    # and loaded as SQLite BLOB values (see compression.pack_blob)
    if load_genotypes:
        # builtin str/bool are used instead of the np.str/np.bool aliases:
        # the aliases pointed at the builtins and were removed from NumPy.
        gt_bases = np.array(var.gt_bases, str)  # 'A/G', './.'
        gt_types = np.array(var.gt_types, np.int8)  # -1, 0, 1, 2
        gt_phases = np.array(var.gt_phases, bool)  # T F F
        gt_depths = np.array(var.gt_depths, np.int32)  # 10 37 0
        gt_ref_depths = np.array(var.gt_ref_depths, np.int32)  # 2 21 0 -1
        gt_alt_depths = np.array(var.gt_alt_depths, np.int32)  # 8 16 0 -1
        gt_quals = np.array(var.gt_quals, np.float32)  # 10.78 22 99 -1

        # tally the genotypes
        self._update_sample_gt_counts(gt_types)
    else:
        gt_bases = gt_types = gt_phases = None
        gt_depths = gt_ref_depths = gt_alt_depths = None
        gt_quals = None

    # were functional impacts predicted by SnpEFF or VEP?
    # if so, build up a row for each of the impacts / transcript
    variant_impacts = []
    if impacts is not None:
        for idx, impact in enumerate(impacts, start=1):
            variant_impacts.append([
                self.v_id, idx,
                impact.gene, impact.transcript,
                impact.is_exonic, impact.is_coding, impact.is_lof,
                impact.exon, impact.codon_change,
                impact.aa_change, impact.aa_length,
                impact.biotype, impact.consequence,
                impact.effect_severity,
                impact.polyphen_pred, impact.polyphen_score,
                impact.sift_pred, impact.sift_score,
            ])

    # construct the core variant record.
    # 1 row per variant to VARIANTS table
    chrom = var.CHROM if var.CHROM.startswith("chr") else "chr" + var.CHROM
    variant = [
        chrom, var.start, var.end,
        vcf_id, self.v_id, anno_id,
        var.REF, ','.join(var.ALT),
        var.QUAL, filter_str, var.var_type, var.var_subtype,
        # genotype blobs
        pack_blob(gt_bases), pack_blob(gt_types), pack_blob(gt_phases),
        pack_blob(gt_depths), pack_blob(gt_ref_depths),
        pack_blob(gt_alt_depths), pack_blob(gt_quals),
        call_rate, in_dbsnp, rs_ids,
        # ClinVar
        clinvar_info.clinvar_in_omim,
        clinvar_info.clinvar_sig,
        clinvar_info.clinvar_disease_name,
        clinvar_info.clinvar_dbsource,
        clinvar_info.clinvar_dbsource_id,
        clinvar_info.clinvar_origin,
        clinvar_info.clinvar_dsdb,
        clinvar_info.clinvar_dsdbid,
        clinvar_info.clinvar_disease_acc,
        clinvar_info.clinvar_in_locus_spec_db,
        clinvar_info.clinvar_on_diag_assay,
        # genome annotations
        pfam_domain, cyto_band, rmsk_hits, in_cpg, in_segdup,
        is_conserved, gerp_bp, gerp_el,
        # population genetics
        hom_ref, het, hom_alt, unknown, aaf,
        hwe_p_value, inbreeding_coeff, pi_hat, recomb_rate,
        # most severe impact
        gene, transcript, is_exonic, is_coding, is_lof,
        exon, codon_change, aa_change, aa_length,
        biotype, consequence, effect_severity,
        polyphen_pred, polyphen_score, sift_pred, sift_score,
        # INFO tag values
        infotag.get_ancestral_allele(var),
        infotag.get_rms_bq(var),
        infotag.get_cigar(var),
        infotag.get_depth(var),
        infotag.get_strand_bias(var),
        infotag.get_rms_map_qual(var),
        infotag.get_homopol_run(var),
        infotag.get_map_qual_zero(var),
        infotag.get_num_of_alleles(var),
        infotag.get_frac_dels(var),
        infotag.get_haplotype_score(var),
        infotag.get_quality_by_depth(var),
        infotag.get_allele_count(var),
        infotag.get_allele_bal(var),
        infotag.in_hm2(var),
        infotag.in_hm3(var),
        infotag.is_somatic(var),
        # ESP
        esp.found, esp.aaf_EA, esp.aaf_AA, esp.aaf_ALL, esp.exome_chip,
        # 1000 Genomes
        thousandG.found, thousandG.aaf_AMR, thousandG.aaf_ASN,
        thousandG.aaf_AFR, thousandG.aaf_EUR, thousandG.aaf_ALL,
        grc, gms.illumina, gms.solid, gms.iontorrent,
        in_cse, encode_tfbs,
        encode_dnaseI.cell_count, encode_dnaseI.cell_list,
        encode_cons_seg.gm12878, encode_cons_seg.h1hesc,
        encode_cons_seg.helas3, encode_cons_seg.hepg2,
        encode_cons_seg.huvec, encode_cons_seg.k562,
    ]
    return variant, variant_impacts
def _prepare_variation(self, var):
    """Collect the metrics and annotations for a single VCF variant.

    Private method. Extracts variant information, variant impacts and
    extra fields for annotation.

    Returns a 3-tuple ``(variant, variant_impacts, extra_fields)``:

    * ``variant`` -- list of column values, one row for the
      VARIANTS table.
    * ``variant_impacts`` -- list of rows, one per impact reported
      for this variant (empty when no annotation type was given).
    * ``extra_fields`` -- dict filled from the most severe impact's
      ``extra_fields``; when non-empty it also carries the variant's
      ``chrom``, ``start`` and ``end``.
    """
    extra_fields = {}

    # genotype-derived values are only computed when genotypes are loaded
    load_genotypes = (not self.args.no_genotypes
                      and not self.args.no_load_genotypes)

    # these metrics require that genotypes are present in the file
    call_rate = None
    hwe_p_value = None
    pi_hat = None
    inbreeding_coeff = None
    hom_ref = het = hom_alt = unknown = None

    if load_genotypes:
        hom_ref = var.num_hom_ref
        hom_alt = var.num_hom_alt
        het = var.num_het
        unknown = var.num_unknown
        call_rate = var.call_rate
        aaf = var.aaf
        hwe_p_value, inbreeding_coeff = popgen.get_hwe_likelihood(
            hom_ref, het, hom_alt, aaf)
        pi_hat = var.nucl_diversity
    else:
        # without genotypes, fall back to the value taken from the INFO tags
        aaf = infotag.extract_aaf(var)

    ############################################################
    # collect annotations from gemini's custom annotation files
    ############################################################
    pfam_domain = annotations.get_pfamA_domains(var)
    cyto_band = annotations.get_cyto_info(var)
    rs_ids = annotations.get_dbsnp_info(var)
    clinvar_info = annotations.get_clinvar_info(var)
    in_dbsnp = 0 if rs_ids is None else 1
    rmsk_hits = annotations.get_rmsk_info(var)
    in_cpg = annotations.get_cpg_island_info(var)
    in_segdup = annotations.get_segdup_info(var)
    is_conserved = annotations.get_conservation_info(var)
    esp = annotations.get_esp_info(var)
    thousandG = annotations.get_1000G_info(var)
    recomb_rate = annotations.get_recomb_info(var)
    gms = annotations.get_gms(var)
    grc = annotations.get_grc(var)
    in_cse = annotations.get_cse(var)
    encode_tfbs = annotations.get_encode_tfbs(var)
    encode_dnaseI = annotations.get_encode_dnase_clusters(var)
    encode_cons_seg = annotations.get_encode_consensus_segs(var)
    gerp_el = annotations.get_gerp_elements(var)
    vista_enhancers = annotations.get_vista_enhancers(var)
    cosmic_ids = annotations.get_cosmic_info(var)

    # load CADD scores by default
    if self.args.skip_cadd is False:
        (cadd_raw, cadd_scaled) = annotations.get_cadd_scores(var)
    else:
        (cadd_raw, cadd_scaled) = (None, None)

    # load the GERP score for this variant by default.
    gerp_bp = None
    if self.args.skip_gerp_bp is False:
        gerp_bp = annotations.get_gerp_bp(var)

    # impacts is a list of impacts for this variant
    impacts = None
    severe_impacts = None
    # impact terms are initialized to None for handling unannotated VCFs.
    # anno_id in variants is for the transcript with the most severe
    # impact term
    gene = transcript = exon = codon_change = aa_change = aa_length = \
        biotype = consequence = consequence_so = effect_severity = None
    is_coding = is_exonic = is_lof = None
    polyphen_pred = polyphen_score = sift_pred = sift_score = anno_id = None

    if self.args.anno_type is not None:
        impacts = func_impact.interpret_impact(
            self.args, var, self._effect_fields)
        severe_impacts = severe_impact.interpret_severe_impact(
            self.args, var, self._effect_fields)
        if severe_impacts:
            extra_fields.update(severe_impacts.extra_fields)
            gene = severe_impacts.gene
            transcript = severe_impacts.transcript
            exon = severe_impacts.exon
            codon_change = severe_impacts.codon_change
            aa_change = severe_impacts.aa_change
            aa_length = severe_impacts.aa_length
            biotype = severe_impacts.biotype
            consequence = severe_impacts.consequence
            effect_severity = severe_impacts.effect_severity
            polyphen_pred = severe_impacts.polyphen_pred
            polyphen_score = severe_impacts.polyphen_score
            sift_pred = severe_impacts.sift_pred
            sift_score = severe_impacts.sift_score
            anno_id = severe_impacts.anno_id
            is_exonic = severe_impacts.is_exonic
            is_coding = severe_impacts.is_coding
            is_lof = severe_impacts.is_lof
            consequence_so = severe_impacts.so

    # construct the filter string ("." and None both map to None)
    filter_str = None
    if var.FILTER is not None and var.FILTER != ".":
        if isinstance(var.FILTER, list):
            filter_str = ";".join(var.FILTER)
        else:
            filter_str = var.FILTER

    vcf_id = None
    if var.ID is not None and var.ID != ".":
        vcf_id = var.ID

    # build up numpy arrays for the genotype information.
    # these arrays will be pickled-to-binary, compressed,
    # and loaded as SQLite BLOB values (see compression.pack_blob)
    if load_genotypes:
        # builtin str/bool are used instead of the np.str/np.bool aliases:
        # the aliases pointed at the builtins and were removed from NumPy.
        gt_bases = np.array(var.gt_bases, str)  # 'A/G', './.'
        gt_types = np.array(var.gt_types, np.int8)  # -1, 0, 1, 2
        gt_phases = np.array(var.gt_phases, bool)  # T F F
        gt_depths = np.array(var.gt_depths, np.int32)  # 10 37 0
        gt_ref_depths = np.array(var.gt_ref_depths, np.int32)  # 2 21 0 -1
        gt_alt_depths = np.array(var.gt_alt_depths, np.int32)  # 8 16 0 -1
        gt_quals = np.array(var.gt_quals, np.float32)  # 10.78 22 99 -1

        # tally the genotypes
        self._update_sample_gt_counts(gt_types)
    else:
        gt_bases = gt_types = gt_phases = None
        gt_depths = gt_ref_depths = gt_alt_depths = None
        gt_quals = None

    # the raw INFO field is stored unless the user opted out
    if self.args.skip_info_string is False:
        info = var.INFO
    else:
        info = None

    # were functional impacts predicted by SnpEFF or VEP?
    # if so, build up a row for each of the impacts / transcript
    variant_impacts = []
    if impacts is not None:
        for idx, impact in enumerate(impacts, start=1):
            variant_impacts.append([
                self.v_id, idx,
                impact.gene, impact.transcript,
                impact.is_exonic, impact.is_coding, impact.is_lof,
                impact.exon, impact.codon_change,
                impact.aa_change, impact.aa_length,
                impact.biotype, impact.consequence, impact.so,
                impact.effect_severity,
                impact.polyphen_pred, impact.polyphen_score,
                impact.sift_pred, impact.sift_score,
            ])

    # construct the core variant record.
    # 1 row per variant to VARIANTS table
    if extra_fields:
        # note: the chromosome is stored here exactly as given in the VCF,
        # without the "chr" prefix normalization applied below
        extra_fields.update({
            "chrom": var.CHROM,
            "start": var.start,
            "end": var.end,
        })

    chrom = var.CHROM if var.CHROM.startswith("chr") else "chr" + var.CHROM
    variant = [
        chrom, var.start, var.end,
        vcf_id, self.v_id, anno_id,
        var.REF, ','.join(var.ALT),
        var.QUAL, filter_str, var.var_type, var.var_subtype,
        # genotype blobs
        pack_blob(gt_bases), pack_blob(gt_types), pack_blob(gt_phases),
        pack_blob(gt_depths), pack_blob(gt_ref_depths),
        pack_blob(gt_alt_depths), pack_blob(gt_quals),
        call_rate, in_dbsnp, rs_ids,
        # ClinVar
        clinvar_info.clinvar_in_omim,
        clinvar_info.clinvar_sig,
        clinvar_info.clinvar_disease_name,
        clinvar_info.clinvar_dbsource,
        clinvar_info.clinvar_dbsource_id,
        clinvar_info.clinvar_origin,
        clinvar_info.clinvar_dsdb,
        clinvar_info.clinvar_dsdbid,
        clinvar_info.clinvar_disease_acc,
        clinvar_info.clinvar_in_locus_spec_db,
        clinvar_info.clinvar_on_diag_assay,
        # genome annotations
        pfam_domain, cyto_band, rmsk_hits, in_cpg, in_segdup,
        is_conserved, gerp_bp, gerp_el,
        # population genetics
        hom_ref, het, hom_alt, unknown, aaf,
        hwe_p_value, inbreeding_coeff, pi_hat, recomb_rate,
        # most severe impact
        gene, transcript, is_exonic, is_coding, is_lof,
        exon, codon_change, aa_change, aa_length,
        biotype, consequence, consequence_so, effect_severity,
        polyphen_pred, polyphen_score, sift_pred, sift_score,
        # INFO tag values
        infotag.get_ancestral_allele(var),
        infotag.get_rms_bq(var),
        infotag.get_cigar(var),
        infotag.get_depth(var),
        infotag.get_strand_bias(var),
        infotag.get_rms_map_qual(var),
        infotag.get_homopol_run(var),
        infotag.get_map_qual_zero(var),
        infotag.get_num_of_alleles(var),
        infotag.get_frac_dels(var),
        infotag.get_haplotype_score(var),
        infotag.get_quality_by_depth(var),
        infotag.get_allele_count(var),
        infotag.get_allele_bal(var),
        infotag.in_hm2(var),
        infotag.in_hm3(var),
        infotag.is_somatic(var),
        # ESP
        esp.found, esp.aaf_EA, esp.aaf_AA, esp.aaf_ALL, esp.exome_chip,
        # 1000 Genomes
        thousandG.found, thousandG.aaf_AMR, thousandG.aaf_ASN,
        thousandG.aaf_AFR, thousandG.aaf_EUR, thousandG.aaf_ALL,
        grc, gms.illumina, gms.solid, gms.iontorrent,
        in_cse, encode_tfbs,
        encode_dnaseI.cell_count, encode_dnaseI.cell_list,
        encode_cons_seg.gm12878, encode_cons_seg.h1hesc,
        encode_cons_seg.helas3, encode_cons_seg.hepg2,
        encode_cons_seg.huvec, encode_cons_seg.k562,
        vista_enhancers, cosmic_ids,
        pack_blob(info),
        cadd_raw, cadd_scaled,
    ]
    return variant, variant_impacts, extra_fields
def _prepare_variation(self, var):
    """Collect the metrics and annotations for a single VCF variant.

    Private method. Gathers genotype summary statistics (only when
    ``no_genotypes`` is not set), gemini's custom annotations, the
    predicted functional impacts, and the INFO-tag derived values
    for ``var``.

    Returns a 2-tuple ``(variant, variant_impacts)``:

    * ``variant`` -- list of column values, one row for the
      VARIANTS table.
    * ``variant_impacts`` -- list of rows, one per impact reported
      for this variant (empty when no annotation type was given).
    """
    # these metrics require that genotypes are present in the file
    call_rate = None
    hwe_p_value = None
    pi_hat = None
    inbreeding_coeff = None
    hom_ref = het = hom_alt = unknown = None

    # only compute certain metrics if genotypes are available
    if not self.args.no_genotypes:
        hom_ref = var.num_hom_ref
        hom_alt = var.num_hom_alt
        het = var.num_het
        unknown = var.num_unknown
        call_rate = var.call_rate
        aaf = var.aaf
        hwe_p_value, inbreeding_coeff = popgen.get_hwe_likelihood(
            hom_ref, het, hom_alt, aaf)
        pi_hat = var.nucl_diversity
    else:
        aaf = extract_aaf(var)

    ############################################################
    # collect annotations from gemini's custom annotation files
    ############################################################
    cyto_band = annotations.get_cyto_info(var)
    dbsnp_info = annotations.get_dbsnp_info(var)
    in_dbsnp = 0 if dbsnp_info.rs_ids is None else 1
    rmsk_hits = annotations.get_rmsk_info(var)
    in_cpg = annotations.get_cpg_island_info(var)
    in_segdup = annotations.get_segdup_info(var)
    is_conserved = annotations.get_conservation_info(var)
    esp = annotations.get_esp_info(var)
    thousandG = annotations.get_1000G_info(var)
    recomb_rate = annotations.get_recomb_info(var)
    gms = annotations.get_gms(var)
    grc = annotations.get_grc(var)
    encode_tfbs = annotations.get_encode_tfbs(var)
    encode_cons_seg = annotations.get_encode_consensus_segs(var)
    encode_segway_seg = annotations.get_encode_segway_segs(var)
    encode_chrhmm_seg = annotations.get_encode_chromhmm_segs(var)

    # impacts is a list of impacts for this variant
    impacts = None
    severe_impacts = None
    # impact terms are initialized to None for handling unannotated VCFs.
    # anno_id in variants is for the transcript with the most severe
    # impact term
    gene = transcript = exon = codon_change = aa_change = aa_length = \
        biotype = consequence = effect_severity = None
    polyphen_pred = polyphen_score = sift_pred = sift_score = anno_id = None

    if self.args.anno_type is not None:
        impacts = func_impact.interpret_impact(self.args, var)
        severe_impacts = severe_impact.interpret_severe_impact(
            self.args, var)
        if severe_impacts:
            gene = severe_impacts.gene
            transcript = severe_impacts.transcript
            exon = severe_impacts.exon
            codon_change = severe_impacts.codon_change
            aa_change = severe_impacts.aa_change
            aa_length = severe_impacts.aa_length
            biotype = severe_impacts.biotype
            consequence = severe_impacts.consequence
            effect_severity = severe_impacts.effect_severity
            polyphen_pred = severe_impacts.polyphen_pred
            polyphen_score = severe_impacts.polyphen_score
            sift_pred = severe_impacts.sift_pred
            sift_score = severe_impacts.sift_score
            anno_id = severe_impacts.anno_id

    # construct the filter string: a missing FILTER is recorded as
    # "PASS", while "." leaves the value as None
    filter_str = None
    if var.FILTER is not None and var.FILTER != ".":
        if isinstance(var.FILTER, list):
            filter_str = ";".join(var.FILTER)
        else:
            filter_str = var.FILTER
    elif var.FILTER is None:
        filter_str = "PASS"

    # build up numpy arrays for the genotype information.
    # these arrays will be pickled-to-binary, compressed,
    # and loaded as SQLite BLOB values (see compression.pack_blob)
    # NOTE(review): these are built even when no_genotypes is set --
    # confirm that is intended.
    # builtin str/bool are used instead of the np.str/np.bool aliases:
    # the aliases pointed at the builtins and were removed from NumPy.
    gt_bases = np.array(var.gt_bases, str)  # 'A/G', './.'
    gt_types = np.array(var.gt_types, np.int8)  # -1, 0, 1, 2
    gt_phases = np.array(var.gt_phases, bool)  # T F F

    # tally the genotypes
    self._update_sample_gt_counts(gt_types)

    # were functional impacts predicted by SnpEFF or VEP?
    # if so, build up a row for each of the impacts / transcript
    variant_impacts = []
    # variant-level flags: set as soon as any single impact reports them
    is_exonic = False
    is_coding = False
    is_lof = False
    if impacts is not None:
        for idx, impact in enumerate(impacts, start=1):
            variant_impacts.append([
                self.v_id, idx,
                impact.gene, impact.transcript,
                impact.exonic, impact.coding, impact.is_lof,
                impact.exon, impact.codon_change,
                impact.aa_change, impact.aa_length,
                impact.biotype, impact.consequence,
                impact.effect_severity,
                impact.polyphen_pred, impact.polyphen_score,
                impact.sift_pred, impact.sift_score,
            ])
            # the explicit "== True" comparisons are kept on purpose:
            # switching to plain truthiness could change the result for
            # flag values that are truthy but not equal to True.
            if impact.exonic == True:  # noqa: E712
                is_exonic = True
            if impact.coding == True:  # noqa: E712
                is_coding = True
            if impact.is_lof == True:  # noqa: E712
                is_lof = True

    # construct the core variant record.
    # 1 row per variant to VARIANTS table
    chrom = var.CHROM if var.CHROM.startswith("chr") else "chr" + var.CHROM
    variant = [
        chrom, var.start, var.end,
        self.v_id, anno_id,
        var.REF, ','.join(var.ALT),
        var.QUAL, filter_str, var.var_type, var.var_subtype,
        # genotype blobs
        pack_blob(gt_bases), pack_blob(gt_types), pack_blob(gt_phases),
        call_rate, in_dbsnp,
        dbsnp_info.rs_ids, dbsnp_info.in_omim, dbsnp_info.clin_sig,
        # genome annotations
        cyto_band, rmsk_hits, in_cpg, in_segdup, is_conserved,
        # population genetics
        hom_ref, het, hom_alt, unknown, aaf,
        hwe_p_value, inbreeding_coeff, pi_hat, recomb_rate,
        # impact summary
        gene, transcript, is_exonic, is_coding, is_lof,
        exon, codon_change, aa_change, aa_length,
        biotype, consequence, effect_severity,
        polyphen_pred, polyphen_score, sift_pred, sift_score,
        # INFO tag values
        infotag.get_ancestral_allele(var),
        infotag.get_rms_bq(var),
        infotag.get_cigar(var),
        infotag.get_depth(var),
        infotag.get_strand_bias(var),
        infotag.get_rms_map_qual(var),
        infotag.get_homopol_run(var),
        infotag.get_map_qual_zero(var),
        infotag.get_num_of_alleles(var),
        infotag.get_frac_dels(var),
        infotag.get_haplotype_score(var),
        infotag.get_quality_by_depth(var),
        infotag.get_allele_count(var),
        infotag.get_allele_bal(var),
        infotag.in_hm2(var),
        infotag.in_hm3(var),
        infotag.is_somatic(var),
        # ESP
        esp.found, esp.aaf_EA, esp.aaf_AA, esp.aaf_ALL, esp.exome_chip,
        # 1000 Genomes
        thousandG.found, thousandG.aaf_AMR, thousandG.aaf_ASN,
        thousandG.aaf_AFR, thousandG.aaf_EUR, thousandG.aaf_ALL,
        grc, gms.illumina, gms.solid, gms.iontorrent,
        encode_tfbs,
        # ENCODE consensus segmentation
        encode_cons_seg.gm12878, encode_cons_seg.h1hesc,
        encode_cons_seg.helas3, encode_cons_seg.hepg2,
        encode_cons_seg.huvec, encode_cons_seg.k562,
        # ENCODE segway segmentation
        encode_segway_seg.gm12878, encode_segway_seg.h1hesc,
        encode_segway_seg.helas3, encode_segway_seg.hepg2,
        encode_segway_seg.huvec, encode_segway_seg.k562,
        # ENCODE chromHMM segmentation
        encode_chrhmm_seg.gm12878, encode_chrhmm_seg.h1hesc,
        encode_chrhmm_seg.helas3, encode_chrhmm_seg.hepg2,
        encode_chrhmm_seg.huvec, encode_chrhmm_seg.k562,
    ]
    return variant, variant_impacts
def _prepare_variation(self, var):
    """Collect the metrics and annotations for a single VCF variant.

    Private method. Extracts variant information, variant impacts and
    extra fields for annotation. Annotation lookups are skipped for
    variants spanning 50000 bases or more; their annotation columns
    are filled with ``None`` placeholders instead.

    Returns a 3-tuple ``(variant, variant_impacts, extra_fields)``:

    * ``variant`` -- list of column values, one row for the
      VARIANTS table.
    * ``variant_impacts`` -- list of rows, one per impact reported
      for this variant (empty when no annotation type was given).
    * ``extra_fields`` -- dict filled from the most severe impact's
      ``extra_fields`` (empty when there is none).
    """
    extra_fields = {}

    # genotype-derived values are only computed when genotypes are loaded
    load_genotypes = (not self.args.no_genotypes
                      and not self.args.no_load_genotypes)

    # these metrics require that genotypes are present in the file
    call_rate = None
    hwe_p_value = None
    pi_hat = None
    inbreeding_coeff = None
    hom_ref = het = hom_alt = unknown = None

    if load_genotypes:
        hom_ref = var.num_hom_ref
        hom_alt = var.num_hom_alt
        het = var.num_het
        unknown = var.num_unknown
        call_rate = var.call_rate
        aaf = var.aaf
        hwe_p_value, inbreeding_coeff = popgen.get_hwe_likelihood(
            hom_ref, het, hom_alt, aaf)
        pi_hat = var.nucl_diversity
    else:
        aaf = infotag.extract_aaf(var)
        # a non-scalar result is reduced to its largest value
        if aaf is not None and not isinstance(aaf, (float, int)):
            aaf = max(aaf)

    ############################################################
    # collect annotations from gemini's custom annotation files
    # but only if the variant spans fewer than 50000 bases
    ############################################################
    if var.end - var.POS < 50000:
        pfam_domain = annotations.get_pfamA_domains(var)
        cyto_band = annotations.get_cyto_info(var)
        rs_ids = annotations.get_dbsnp_info(var)
        clinvar_info = annotations.get_clinvar_info(var)
        in_dbsnp = 0 if rs_ids is None else 1
        rmsk_hits = annotations.get_rmsk_info(var)
        in_cpg = annotations.get_cpg_island_info(var)
        in_segdup = annotations.get_segdup_info(var)
        is_conserved = annotations.get_conservation_info(var)
        esp = annotations.get_esp_info(var)
        thousandG = annotations.get_1000G_info(var)
        recomb_rate = annotations.get_recomb_info(var)
        gms = annotations.get_gms(var)
        grc = annotations.get_grc(var)
        in_cse = annotations.get_cse(var)
        encode_tfbs = annotations.get_encode_tfbs(var)
        encode_dnaseI = annotations.get_encode_dnase_clusters(var)
        encode_cons_seg = annotations.get_encode_consensus_segs(var)
        gerp_el = annotations.get_gerp_elements(var)
        vista_enhancers = annotations.get_vista_enhancers(var)
        cosmic_ids = annotations.get_cosmic_info(var)
        fitcons = annotations.get_fitcons(var)
        exac = annotations.get_exac_info(var)

        # load CADD scores by default
        if self.args.skip_cadd is False:
            (cadd_raw, cadd_scaled) = annotations.get_cadd_scores(var)
        else:
            (cadd_raw, cadd_scaled) = (None, None)

        # load the GERP score for this variant by default.
        gerp_bp = None
        if self.args.skip_gerp_bp is False:
            gerp_bp = annotations.get_gerp_bp(var)
    else:
        # the variant is too big to annotate: use empty placeholders so
        # that the attribute accesses in the variant row below still work
        pfam_domain = None
        cyto_band = None
        rs_ids = None
        clinvar_info = annotations.ClinVarInfo()
        in_dbsnp = None
        rmsk_hits = None
        in_cpg = None
        in_segdup = None
        is_conserved = None
        esp = annotations.ESPInfo(None, None, None, None, None)
        thousandG = annotations.ThousandGInfo(None, None, None, None,
                                              None, None, None)
        exac = annotations.ExacInfo(None, None, None, None, None,
                                    None, None, None, None, None)
        recomb_rate = None
        gms = annotations.GmsTechs(None, None, None)
        grc = None
        in_cse = None
        encode_tfbs = None
        encode_dnaseI = annotations.ENCODEDnaseIClusters(None, None)
        encode_cons_seg = annotations.ENCODESegInfo(None, None, None,
                                                    None, None, None)
        gerp_el = None
        vista_enhancers = None
        cosmic_ids = None
        fitcons = None
        cadd_raw = None
        cadd_scaled = None
        gerp_bp = None

    # impacts is a list of impacts for this variant
    impacts = None
    severe_impacts = None
    # impact terms are initialized to None for handling unannotated VCFs.
    # anno_id in variants is for the transcript with the most severe
    # impact term
    gene = transcript = exon = codon_change = aa_change = aa_length = \
        biotype = consequence = consequence_so = effect_severity = None
    is_coding = is_exonic = is_lof = None
    polyphen_pred = polyphen_score = sift_pred = sift_score = anno_id = None

    if self.args.anno_type is not None:
        impacts = func_impact.interpret_impact(
            self.args, var, self._effect_fields)
        candidates = [i for i in impacts if i.effect_severity]
        # in case we don't have a severe impact, we still try to use the
        # first impact to annotate the main variants table.
        if not candidates and len(impacts) > 0:
            candidates = impacts[:1]
        if candidates:
            im = candidates[0]
            # NOTE(review): consequence_so is not taken from this
            # fallback impact; it is only set from severe_impacts below.
            gene = im.gene
            transcript = im.transcript
            exon = im.exon
            codon_change = im.codon_change
            aa_change = im.aa_change
            aa_length = im.aa_length
            biotype = im.biotype
            consequence = im.consequence
            effect_severity = im.effect_severity
            polyphen_pred = im.polyphen_pred
            polyphen_score = im.polyphen_score
            sift_pred = im.sift_pred
            sift_score = im.sift_score
            anno_id = im.anno_id
            is_exonic = im.is_exonic
            is_coding = im.is_coding
            is_lof = im.is_lof

        # the most severe impact, when available, overrides the fallback
        severe_impacts = severe_impact.interpret_severe_impact(
            self.args, var, self._effect_fields)
        if severe_impacts:
            extra_fields.update(severe_impacts.extra_fields)
            gene = severe_impacts.gene
            transcript = severe_impacts.transcript
            exon = severe_impacts.exon
            codon_change = severe_impacts.codon_change
            aa_change = severe_impacts.aa_change
            aa_length = severe_impacts.aa_length
            biotype = severe_impacts.biotype
            consequence = severe_impacts.consequence
            effect_severity = severe_impacts.effect_severity
            polyphen_pred = severe_impacts.polyphen_pred
            polyphen_score = severe_impacts.polyphen_score
            sift_pred = severe_impacts.sift_pred
            sift_score = severe_impacts.sift_score
            anno_id = severe_impacts.anno_id
            is_exonic = severe_impacts.is_exonic
            is_coding = severe_impacts.is_coding
            is_lof = severe_impacts.is_lof
            consequence_so = severe_impacts.so

    # construct the filter string ("." and None both map to None)
    filter_str = None
    if var.FILTER is not None and var.FILTER != ".":
        if isinstance(var.FILTER, list):
            filter_str = ";".join(var.FILTER)
        else:
            filter_str = var.FILTER

    vcf_id = None
    if var.ID is not None and var.ID != ".":
        vcf_id = var.ID

    # build up numpy arrays for the genotype information.
    # these arrays will be pickled-to-binary, compressed,
    # and loaded as SQLite BLOB values (see compression.pack_blob)
    gt_phred_ll_homref = gt_phred_ll_het = gt_phred_ll_homalt = None
    if load_genotypes:
        # builtin str/bool are used instead of the np.str/np.bool aliases:
        # the aliases pointed at the builtins and were removed from NumPy.
        gt_bases = np.array(var.gt_bases, str)  # 'A/G', './.'
        gt_types = np.array(var.gt_types, np.int8)  # -1, 0, 1, 2
        gt_phases = np.array(var.gt_phases, bool)  # T F F
        gt_depths = np.array(var.gt_depths, np.int32)  # 10 37 0
        gt_ref_depths = np.array(var.gt_ref_depths, np.int32)  # 2 21 0 -1
        gt_alt_depths = np.array(var.gt_alt_depths, np.int32)  # 8 16 0 -1
        gt_quals = np.array(var.gt_quals, np.float32)  # 10.78 22 99 -1
        gt_copy_numbers = np.array(var.gt_copy_numbers,
                                   np.float32)  # 1.0 2.0 2.1 -1
        gt_phred_likelihoods = get_phred_lik(var.gt_phred_likelihoods)
        if gt_phred_likelihoods is not None:
            # one column per genotype class: hom-ref, het, hom-alt
            gt_phred_ll_homref = gt_phred_likelihoods[:, 0]
            gt_phred_ll_het = gt_phred_likelihoods[:, 1]
            gt_phred_ll_homalt = gt_phred_likelihoods[:, 2]

        # tally the genotypes
        self._update_sample_gt_counts(gt_types)
    else:
        gt_bases = gt_types = gt_phases = gt_depths = gt_ref_depths = None
        gt_alt_depths = gt_quals = gt_copy_numbers = None

    # the raw INFO field is stored unless the user opted out
    if self.args.skip_info_string:
        info = None
    else:
        info = var.INFO

    # were functional impacts predicted by SnpEFF or VEP?
    # if so, build up a row for each of the impacts / transcript
    variant_impacts = []
    for idx, impact in enumerate(impacts or [], start=1):
        variant_impacts.append([
            self.v_id, idx,
            impact.gene, impact.transcript,
            impact.is_exonic, impact.is_coding, impact.is_lof,
            impact.exon, impact.codon_change,
            impact.aa_change, impact.aa_length,
            impact.biotype, impact.consequence, impact.so,
            impact.effect_severity,
            impact.polyphen_pred, impact.polyphen_score,
            impact.sift_pred, impact.sift_score,
        ])

    # extract structural variants
    sv = svs.StructuralVariant(var)
    ci_left = sv.get_ci_left()
    ci_right = sv.get_ci_right()

    # construct the core variant record.
    # 1 row per variant to VARIANTS table
    chrom = var.CHROM if var.CHROM.startswith("chr") else "chr" + var.CHROM
    variant = [
        chrom, var.start, var.end,
        vcf_id, self.v_id, anno_id,
        # empty/None ALT entries are written as empty strings
        var.REF, ','.join([x or "" for x in var.ALT]),
        var.QUAL, filter_str, var.var_type, var.var_subtype,
        # genotype blobs
        pack_blob(gt_bases), pack_blob(gt_types), pack_blob(gt_phases),
        pack_blob(gt_depths), pack_blob(gt_ref_depths),
        pack_blob(gt_alt_depths), pack_blob(gt_quals),
        pack_blob(gt_copy_numbers),
        pack_blob(gt_phred_ll_homref),
        pack_blob(gt_phred_ll_het),
        pack_blob(gt_phred_ll_homalt),
        call_rate, in_dbsnp, rs_ids,
        # structural variant details
        ci_left[0], ci_left[1],
        ci_right[0], ci_right[1],
        sv.get_length(),
        sv.is_precise(),
        sv.get_sv_tool(),
        sv.get_evidence_type(),
        sv.get_event_id(),
        sv.get_mate_id(),
        sv.get_strand(),
        # ClinVar
        clinvar_info.clinvar_in_omim,
        clinvar_info.clinvar_sig,
        clinvar_info.clinvar_disease_name,
        clinvar_info.clinvar_dbsource,
        clinvar_info.clinvar_dbsource_id,
        clinvar_info.clinvar_origin,
        clinvar_info.clinvar_dsdb,
        clinvar_info.clinvar_dsdbid,
        clinvar_info.clinvar_disease_acc,
        clinvar_info.clinvar_in_locus_spec_db,
        clinvar_info.clinvar_on_diag_assay,
        clinvar_info.clinvar_causal_allele,
        # genome annotations
        pfam_domain, cyto_band, rmsk_hits, in_cpg, in_segdup,
        is_conserved, gerp_bp, gerp_el,
        # population genetics
        hom_ref, het, hom_alt, unknown, aaf,
        hwe_p_value, inbreeding_coeff, pi_hat, recomb_rate,
        # impact summary
        gene, transcript, is_exonic, is_coding, is_lof,
        exon, codon_change, aa_change, aa_length,
        biotype, consequence, consequence_so, effect_severity,
        polyphen_pred, polyphen_score, sift_pred, sift_score,
        # INFO tag values
        infotag.get_ancestral_allele(var),
        infotag.get_rms_bq(var),
        infotag.get_cigar(var),
        infotag.get_depth(var),
        infotag.get_strand_bias(var),
        infotag.get_rms_map_qual(var),
        infotag.get_homopol_run(var),
        infotag.get_map_qual_zero(var),
        infotag.get_num_of_alleles(var),
        infotag.get_frac_dels(var),
        infotag.get_haplotype_score(var),
        infotag.get_quality_by_depth(var),
        infotag.get_allele_count(var),
        infotag.get_allele_bal(var),
        infotag.in_hm2(var),
        infotag.in_hm3(var),
        infotag.is_somatic(var),
        infotag.get_somatic_score(var),
        # ESP
        esp.found, esp.aaf_EA, esp.aaf_AA, esp.aaf_ALL, esp.exome_chip,
        # 1000 Genomes
        thousandG.found, thousandG.aaf_AMR, thousandG.aaf_EAS,
        thousandG.aaf_SAS, thousandG.aaf_AFR, thousandG.aaf_EUR,
        thousandG.aaf_ALL,
        grc, gms.illumina, gms.solid, gms.iontorrent,
        in_cse, encode_tfbs,
        encode_dnaseI.cell_count, encode_dnaseI.cell_list,
        encode_cons_seg.gm12878, encode_cons_seg.h1hesc,
        encode_cons_seg.helas3, encode_cons_seg.hepg2,
        encode_cons_seg.huvec, encode_cons_seg.k562,
        vista_enhancers, cosmic_ids,
        pack_blob(info),
        cadd_raw, cadd_scaled,
        fitcons,
        # ExAC
        exac.found,
        exac.aaf_ALL, exac.adj_aaf_ALL,
        exac.aaf_AFR, exac.aaf_AMR, exac.aaf_EAS, exac.aaf_FIN,
        exac.aaf_NFE, exac.aaf_OTH, exac.aaf_SAS,
    ]
    return variant, variant_impacts, extra_fields
def _prepare_variation(self, var):
    """Collect the database row data for a single variant (var) in a VCF file.

    Gathers genotype-derived metrics, annotation lookups, predicted
    functional impacts, structural-variant details and INFO-tag values
    for ``var`` and assembles them into the rows to be loaded.

    Reads ``self.args``, ``self.v_id`` and ``self._effect_fields``. When
    genotypes are loaded, it also calls ``self._update_sample_gt_counts``
    with the variant's genotype types.

    Args:
        var: the variant record to process.

    Returns:
        A 3-tuple ``(variant, variant_impacts, extra_fields)``:

        * ``variant`` -- list of column values for the single row that
          goes to the VARIANTS table. The order of the values is
          significant.
        * ``variant_impacts`` -- list with one row (a list) per predicted
          impact; empty when no impacts were interpreted. Rows are
          numbered from 1.
        * ``extra_fields`` -- dict, updated from the most severe impact's
          ``extra_fields`` when a severe impact is available, otherwise
          empty.
    """
    extra_fields = {}

    # these metrics require that genotypes are present in the file
    call_rate = None
    hwe_p_value = None
    pi_hat = None
    inbreeding_coeff = None
    hom_ref = het = hom_alt = unknown = None

    # evaluated once; used both for the metrics and for the genotype columns
    load_genotypes = (not self.args.no_genotypes
                      and not self.args.no_load_genotypes)

    # only compute certain metrics if genotypes are available
    if load_genotypes:
        hom_ref = var.num_hom_ref
        hom_alt = var.num_hom_alt
        het = var.num_het
        unknown = var.num_unknown
        call_rate = var.call_rate
        aaf = var.aaf
        hwe_p_value, inbreeding_coeff = \
            popgen.get_hwe_likelihood(hom_ref, het, hom_alt, aaf)
        pi_hat = var.nucl_diversity
    else:
        aaf = infotag.extract_aaf(var)
        # a non-scalar result is collapsed to its largest value
        if aaf is not None and not isinstance(aaf, (float, int)):
            aaf = max(aaf)

    ############################################################
    # collect annotations from gemini's custom annotation files
    # but only if the size of the variant is <= 50kb
    ############################################################
    if var.end - var.POS < 50000:
        pfam_domain = annotations.get_pfamA_domains(var)
        cyto_band = annotations.get_cyto_info(var)
        rs_ids = annotations.get_dbsnp_info(var)
        clinvar_info = annotations.get_clinvar_info(var)
        in_dbsnp = 0 if rs_ids is None else 1
        rmsk_hits = annotations.get_rmsk_info(var)
        in_cpg = annotations.get_cpg_island_info(var)
        in_segdup = annotations.get_segdup_info(var)
        is_conserved = annotations.get_conservation_info(var)
        esp = annotations.get_esp_info(var)
        thousandG = annotations.get_1000G_info(var)
        recomb_rate = annotations.get_recomb_info(var)
        gms = annotations.get_gms(var)
        grc = annotations.get_grc(var)
        in_cse = annotations.get_cse(var)
        encode_tfbs = annotations.get_encode_tfbs(var)
        encode_dnaseI = annotations.get_encode_dnase_clusters(var)
        encode_cons_seg = annotations.get_encode_consensus_segs(var)
        gerp_el = annotations.get_gerp_elements(var)
        vista_enhancers = annotations.get_vista_enhancers(var)
        cosmic_ids = annotations.get_cosmic_info(var)
        fitcons = annotations.get_fitcons(var)
        Exac = annotations.get_exac_info(var)

        # load CADD scores by default
        if self.args.skip_cadd is False:
            (cadd_raw, cadd_scaled) = annotations.get_cadd_scores(var)
        else:
            (cadd_raw, cadd_scaled) = (None, None)

        # load the GERP score for this variant by default.
        gerp_bp = None
        if self.args.skip_gerp_bp is False:
            gerp_bp = annotations.get_gerp_bp(var)
    else:
        # the variant is too big to annotate: use empty placeholders so
        # the attribute accesses in the variant row below still work.
        pfam_domain = None
        cyto_band = None
        rs_ids = None
        clinvar_info = annotations.ClinVarInfo()
        in_dbsnp = None
        rmsk_hits = None
        in_cpg = None
        in_segdup = None
        is_conserved = None
        esp = annotations.ESPInfo(None, None, None, None, None)
        thousandG = annotations.ThousandGInfo(None, None, None, None,
                                              None, None, None)
        Exac = annotations.ExacInfo(None, None, None, None, None,
                                    None, None, None, None, None)
        recomb_rate = None
        gms = annotations.GmsTechs(None, None, None)
        grc = None
        in_cse = None
        encode_tfbs = None
        encode_dnaseI = annotations.ENCODEDnaseIClusters(None, None)
        encode_cons_seg = annotations.ENCODESegInfo(
            None, None, None, None, None, None)
        gerp_el = None
        vista_enhancers = None
        cosmic_ids = None
        fitcons = None
        cadd_raw = None
        cadd_scaled = None
        gerp_bp = None

    # impacts is a list of impacts for this variant
    impacts = None
    severe_impacts = None

    # impact terms initialized to None for handling unannotated vcf's
    # anno_id in variants is for the trans. with the most severe impact term
    gene = transcript = exon = codon_change = aa_change = aa_length = \
        biotype = consequence = consequence_so = effect_severity = None
    is_coding = is_exonic = is_lof = None
    polyphen_pred = polyphen_score = sift_pred = sift_score = anno_id = None

    if self.args.anno_type is not None:
        impacts = func_impact.interpret_impact(self.args, var,
                                               self._effect_fields)

        # first impact that carries an effect severity, if any
        im = next((i for i in impacts if i.effect_severity), None)
        # in case we don't have a severe impact, we still try to get the
        # first impact to annotate the main variants table.
        if im is None and impacts:
            im = impacts[0]

        if im is not None:
            gene = im.gene
            transcript = im.transcript
            exon = im.exon
            codon_change = im.codon_change
            aa_change = im.aa_change
            aa_length = im.aa_length
            biotype = im.biotype
            consequence = im.consequence
            # keep the SO term in step with the consequence it describes
            consequence_so = im.so
            effect_severity = im.effect_severity
            polyphen_pred = im.polyphen_pred
            polyphen_score = im.polyphen_score
            sift_pred = im.sift_pred
            sift_score = im.sift_score
            anno_id = im.anno_id
            is_exonic = im.is_exonic
            is_coding = im.is_coding
            is_lof = im.is_lof

        # the most severe impact, when available, overrides the fallback
        severe_impacts = \
            severe_impact.interpret_severe_impact(self.args, var,
                                                  self._effect_fields)
        if severe_impacts:
            extra_fields.update(severe_impacts.extra_fields)
            gene = severe_impacts.gene
            transcript = severe_impacts.transcript
            exon = severe_impacts.exon
            codon_change = severe_impacts.codon_change
            aa_change = severe_impacts.aa_change
            aa_length = severe_impacts.aa_length
            biotype = severe_impacts.biotype
            consequence = severe_impacts.consequence
            effect_severity = severe_impacts.effect_severity
            polyphen_pred = severe_impacts.polyphen_pred
            polyphen_score = severe_impacts.polyphen_score
            sift_pred = severe_impacts.sift_pred
            sift_score = severe_impacts.sift_score
            anno_id = severe_impacts.anno_id
            is_exonic = severe_impacts.is_exonic
            is_coding = severe_impacts.is_coding
            is_lof = severe_impacts.is_lof
            consequence_so = severe_impacts.so

    # construct the filter string ("." and None both mean "no filter")
    filter_str = None
    if var.FILTER is not None and var.FILTER != ".":
        if isinstance(var.FILTER, list):
            filter_str = ";".join(var.FILTER)
        else:
            filter_str = var.FILTER

    vcf_id = None
    if var.ID is not None and var.ID != ".":
        vcf_id = var.ID

    # collect the per-sample genotype information.
    # these values will be pickled-to-binary, compressed,
    # and loaded as SQLite BLOB values (see compression.pack_blob)
    gt_phred_ll_homref = gt_phred_ll_het = gt_phred_ll_homalt = None
    if load_genotypes:
        gt_bases = var.gt_bases
        gt_types = var.gt_types
        gt_phases = var.gt_phases
        gt_depths = var.gt_depths
        gt_ref_depths = var.gt_ref_depths
        gt_alt_depths = var.gt_alt_depths
        gt_quals = var.gt_quals
        # copy numbers are not loaded; the column is stored empty
        gt_copy_numbers = None
        gt_phred_ll_homref = var.gt_phred_ll_homref
        gt_phred_ll_het = var.gt_phred_ll_het
        gt_phred_ll_homalt = var.gt_phred_ll_homalt
        # tally the genotypes
        self._update_sample_gt_counts(gt_types)
    else:
        gt_bases = gt_types = gt_phases = gt_depths = gt_ref_depths = None
        gt_alt_depths = gt_quals = gt_copy_numbers = None

    if self.args.skip_info_string:
        info = None
    else:
        info = dict(var.INFO)

    # were functional impacts predicted by SnpEFF or VEP?
    # if so, build up a row for each of the impacts / transcript
    variant_impacts = [
        [self.v_id, idx, impact.gene, impact.transcript,
         impact.is_exonic, impact.is_coding, impact.is_lof,
         impact.exon, impact.codon_change, impact.aa_change,
         impact.aa_length, impact.biotype, impact.consequence,
         impact.so, impact.effect_severity, impact.polyphen_pred,
         impact.polyphen_score, impact.sift_pred, impact.sift_score]
        for idx, impact in enumerate(impacts or [], start=1)
    ]

    # extract structural variants
    sv = svs.StructuralVariant(var)
    ci_left = sv.get_ci_left()
    ci_right = sv.get_ci_right()

    # construct the core variant record.
    # 1 row per variant to VARIANTS table
    # (chromosome names are stored with a "chr" prefix)
    chrom = var.CHROM if var.CHROM.startswith("chr") else "chr" + var.CHROM
    variant = [
        chrom, var.start, var.end,
        vcf_id, self.v_id, anno_id, var.REF,
        ','.join([x or "" for x in var.ALT]),
        var.QUAL, filter_str, var.var_type,
        var.var_subtype,
        pack_blob(gt_bases), pack_blob(gt_types),
        pack_blob(gt_phases), pack_blob(gt_depths),
        pack_blob(gt_ref_depths), pack_blob(gt_alt_depths),
        pack_blob(gt_quals), pack_blob(gt_copy_numbers),
        pack_blob(gt_phred_ll_homref),
        pack_blob(gt_phred_ll_het),
        pack_blob(gt_phred_ll_homalt),
        call_rate, in_dbsnp,
        rs_ids,
        ci_left[0],
        ci_left[1],
        ci_right[0],
        ci_right[1],
        sv.get_length(),
        sv.is_precise(),
        sv.get_sv_tool(),
        sv.get_evidence_type(),
        sv.get_event_id(),
        sv.get_mate_id(),
        sv.get_strand(),
        clinvar_info.clinvar_in_omim,
        clinvar_info.clinvar_sig,
        clinvar_info.clinvar_disease_name,
        clinvar_info.clinvar_dbsource,
        clinvar_info.clinvar_dbsource_id,
        clinvar_info.clinvar_origin,
        clinvar_info.clinvar_dsdb,
        clinvar_info.clinvar_dsdbid,
        clinvar_info.clinvar_disease_acc,
        clinvar_info.clinvar_in_locus_spec_db,
        clinvar_info.clinvar_on_diag_assay,
        clinvar_info.clinvar_causal_allele,
        pfam_domain, cyto_band, rmsk_hits, in_cpg,
        in_segdup, is_conserved,
        gerp_bp, gerp_el,
        hom_ref, het, hom_alt, unknown,
        aaf, hwe_p_value, inbreeding_coeff, pi_hat,
        recomb_rate, gene, transcript, is_exonic,
        is_coding, is_lof, exon, codon_change, aa_change,
        aa_length, biotype, consequence, consequence_so, effect_severity,
        polyphen_pred, polyphen_score, sift_pred, sift_score,
        infotag.get_ancestral_allele(var), infotag.get_rms_bq(var),
        infotag.get_cigar(var),
        infotag.get_depth(var), infotag.get_strand_bias(var),
        infotag.get_rms_map_qual(var), infotag.get_homopol_run(var),
        infotag.get_map_qual_zero(var),
        infotag.get_num_of_alleles(var),
        infotag.get_frac_dels(var),
        infotag.get_haplotype_score(var),
        infotag.get_quality_by_depth(var),
        infotag.get_allele_count(var), infotag.get_allele_bal(var),
        infotag.in_hm2(var), infotag.in_hm3(var),
        infotag.is_somatic(var),
        infotag.get_somatic_score(var),
        esp.found, esp.aaf_EA,
        esp.aaf_AA, esp.aaf_ALL,
        esp.exome_chip, thousandG.found,
        thousandG.aaf_AMR, thousandG.aaf_EAS, thousandG.aaf_SAS,
        thousandG.aaf_AFR, thousandG.aaf_EUR,
        thousandG.aaf_ALL, grc,
        gms.illumina, gms.solid,
        gms.iontorrent, in_cse,
        encode_tfbs,
        encode_dnaseI.cell_count,
        encode_dnaseI.cell_list,
        encode_cons_seg.gm12878,
        encode_cons_seg.h1hesc,
        encode_cons_seg.helas3,
        encode_cons_seg.hepg2,
        encode_cons_seg.huvec,
        encode_cons_seg.k562,
        vista_enhancers,
        cosmic_ids,
        pack_blob(info),
        cadd_raw,
        cadd_scaled,
        fitcons,
        Exac.found,
        Exac.aaf_ALL,
        Exac.adj_aaf_ALL,
        Exac.aaf_AFR, Exac.aaf_AMR,
        Exac.aaf_EAS, Exac.aaf_FIN,
        Exac.aaf_NFE, Exac.aaf_OTH,
        Exac.aaf_SAS,
    ]

    return variant, variant_impacts, extra_fields