def test_unused():
    """Keys appended after the default VEP keys are listed by unused() and
    remain readable through ``effects``."""
    extra = ['XXX', 'YYY']
    csq = 'missense_variant|tTt/tGt|F/C|ENSG00000186092|OR4F5|ENST00000335137|1/1|possibly_damaging(0.568)|deleterious(0)|113/305|protein_coding|xval|yval'

    ann = VEP(csq, keys=VEP.keys + extra)

    leftover = ann.unused()
    assert leftover == extra, leftover
    assert ann.effects['XXX'] == 'xval'
def test_bug():
    """A missense annotation must sort after a splice-region/non-coding one."""
    missense = VEP('missense_variant|tTt/tGt|F/C|ENSG00000186092|OR4F5|ENST00000335137|1/1|possibly_damaging(0.568)|deleterious(0)|113/305|protein_coding')
    splice_region = VEP("splice_region_variant&non_coding_exon_variant&nc_transcript_variant|||ENSG00000223972|DDX11L1|ENST00000456328|2/3||||processed_transcript")

    ranked = [missense, splice_region]
    ranked.sort()

    top = ranked[-1]
    assert top.so == 'missense_variant', top.so
def test_unused():
    """A single key appended after the default VEP keys is listed by unused()
    and remains readable through ``effects``."""
    extra = ['YYY']
    # NOTE(review): the annotation carries two trailing values (xval, yval) but
    # only one extra key is appended, so whether 'YYY' pairs with 'yval'
    # depends on len(VEP.keys) -- confirm against the VEP class.
    csq = 'missense_variant|tTt/tGt|F/C|ENSG00000186092|OR4F5|ENST00000335137|1/1|possibly_damaging(0.568)|deleterious(0)|113/305|protein_coding|xval|yval'

    ann = VEP(csq, keys=VEP.keys + extra)

    leftover = ann.unused()
    assert leftover == extra, leftover
    assert ann.effects['YYY'] == 'yval'
def test_splice():
    """Check the (is_coding, is_exonic, is_splicing) flags of intronic annotations."""
    # Each case: annotation string -> expected (is_coding, is_exonic, is_splicing).
    cases = [
        (
            'splice_acceptor_variant&intron_variant&feature_truncation|||ENSG00000221978|CCNL2|ENST00000408918||||-/226|protein_coding|1',
            (False, False, True),
        ),
        (
            'intron_variant&feature_elongation|||ENSG00000187634|SAMD11|ENST00000341065||||-/589|protein_coding|1',
            (False, False, False),
        ),
    ]
    for csq, expected in cases:
        e = VEP(csq)
        assert (e.is_coding, e.is_exonic, e.is_splicing) == expected
def test_gemini_issue812():
    """A protein_altering_variant annotation must be reported as coding."""
    keys = [
        "Consequence", "Codons", "Amino_acids", "Gene", "SYMBOL", "Feature",
        "EXON", "PolyPhen", "SIFT", "Protein_position", "BIOTYPE", "CANONICAL",
    ]
    csq = 'protein_altering_variant|caGCAGCAGCAGCAGCAACAGCAG/caA|QQQQQQQQ/Q|ENSG00000204842|ATXN2|ENST00000608853|1/25|||14-21/1153|protein_coding|'

    ann = VEP(csq, keys=keys)

    assert ann.is_coding
def test_bug_vcf2db_21():
    """codon_change is read correctly when the annotation has more fields than keys."""
    keys = [
        "Consequence", "Codons", "Amino_acids", "Gene", "SYMBOL", "Feature",
        "EXON", "PolyPhen", "SIFT", "Protein_position", "BIOTYPE", "CANONICAL",
        "DOMAINS", "CLIN_SIG",
    ]
    csq = 'synonymous_variant|tcA/tcG|S|ENSG00000186092|OR4F5|ENST00000335137|1/1|||60/305|protein_coding||Low_complexity_(Seg):seg&Transmembrane_helices:TMhelix&Prints_domain:PR00237&Superfamily_domains:SSF81321&Gene3D:1.20.1070.10&hmmpanther:PTHR26451&hmmpanther:PTHR26451:SF72&PROSITE_profiles:PS50262||||ENST00000335137.3:c.180A>G|ENST00000335137.3:c.180A>G(p.%3D)|||-0.817044|0.039'

    ann = VEP(csq, keys=keys)

    change = ann.codon_change
    assert change == "tcA/tcG", change
def test_32():
    """Every annotation whose text does not contain "intron" must be HIGH impact."""
    keys = (
        "Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|"
        "EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|"
        "Amino_acids|Codons|Existing_variation|DISTANCE|STRAND|FLAGS|"
        "VARIANT_CLASS|SYMBOL_SOURCE|HGNC_ID|CANONICAL|TSL|APPRIS|CCDS|ENSP|"
        "SWISSPROT|TREMBL|UNIPARC|REFSEQ_MATCH|SOURCE|GIVEN_REF|USED_REF|"
        "GENE_PHENO|SIFT|PolyPhen|DOMAINS|HGVS_OFFSET|AF|AFR_AF|AMR_AF|EAS_AF|"
        "EUR_AF|SAS_AF|AA_AF|EA_AF|gnomAD_AF|gnomAD_AFR_AF|gnomAD_AMR_AF|"
        "gnomAD_ASJ_AF|gnomAD_EAS_AF|gnomAD_FIN_AF|gnomAD_NFE_AF|gnomAD_OTH_AF|"
        "gnomAD_SAS_AF|MAX_AF|MAX_AF_POPS|CLIN_SIG|SOMATIC|PHENO|PUBMED|"
        "MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE|MaxEntScan_alt|"
        "MaxEntScan_diff|MaxEntScan_ref|SpliceRegion"
    ).split("|")

    # One entry per transcript annotation of the same deletion.
    annotations = [
        "-|frameshift_variant&start_lost&start_retained_variant|HIGH|HRNR|ENSG00000197915|Transcript|ENST00000368801|protein_coding|2/3||ENST00000368801.2:c.1del|ENSP00000357791.2:p.Met1?|77/9623|1/8553|1/2850|M/X|Atg/tg|rs34061715&COSM111478||-1||deletion|HGNC|HGNC:20846|YES|1|P1|CCDS30859.1|ENSP00000357791|Q86YZ3||UPI00001D7CAD||Ensembl|T|T||||||0.874|0.7337|0.8818|0.9544|0.9592|0.8875|||0.9028|0.7227|0.8276|0.9554|0.9063|0.9541|0.9411|0.9142|0.9069|0.9592|EUR||0&1|0&1|||||||||",
        "-|intron_variant&non_coding_transcript_variant|MODIFIER|FLG-AS1|ENSG00000237975|Transcript|ENST00000420707|antisense_RNA||1/8|ENST00000420707.5:n.159-25632del|||||||rs34061715&COSM111478||1||deletion|HGNC|HGNC:27913||5||||||||Ensembl|T|T|||||10|0.874|0.7337|0.8818|0.9544|0.9592|0.8875|||0.9028|0.7227|0.8276|0.9554|0.9063|0.9541|0.9411|0.9142|0.9069|0.9592|EUR||0&1|0&1|||||||||",
        "-|intron_variant&non_coding_transcript_variant|MODIFIER|FLG-AS1|ENSG00000237975|Transcript|ENST00000593011|antisense_RNA||1/3|ENST00000593011.5:n.296+54843del|||||||rs34061715&COSM111478||1||deletion|HGNC|HGNC:27913||4||||||||Ensembl|T|T|||||10|0.874|0.7337|0.8818|0.9544|0.9592|0.8875|||0.9028|0.7227|0.8276|0.9554|0.9063|0.9541|0.9411|0.9142|0.9069|0.9592|EUR||0&1|0&1|||||||||",
        "-|frameshift_variant&start_lost&start_retained_variant|HIGH|HRNR|388697|Transcript|NM_001009931.2|protein_coding|2/3||NM_001009931.2:c.1del|NP_001009931.1:p.Met1?|80/9632|1/8553|1/2850|M/X|Atg/tg|rs34061715&COSM111478||-1||deletion|EntrezGene|HGNC:20846|YES||||NP_001009931.1||||rseq_mrna_match|RefSeq|T|T||||||0.874|0.7337|0.8818|0.9544|0.9592|0.8875|||0.9028|0.7227|0.8276|0.9554|0.9063|0.9541|0.9411|0.9142|0.9069|0.9592|EUR||0&1|0&1|||||||||",
    ]

    for annotation in annotations:
        eff = VEP(annotation, keys=keys)
        if "intron" in annotation.lower():
            # Intronic annotations are parsed but their severity is not checked.
            continue
        assert eff.impact_severity == "HIGH", (eff.impact_severity, annotation)
def test_canonical_order():
    """With prioritize_canonical set, the canonical annotation sorts last."""
    canonical = VEP(
        "intron_variant&nc_transcript_variant|||ENSG00000223972|DDX11L1|ENST00000450305|||||transcribed_unprocessed_pseudogene|*",
        prioritize_canonical=True,
    )

    # Build a new list so the shared EFFECTS list is left unmodified.
    ranked = sorted(EFFECTS + [canonical])
    lowest, highest = ranked[0], ranked[-1]

    assert highest.is_canonical
    assert lowest.impact_severity == "LOW"
    assert not lowest.is_canonical
def test_veps():
    """Parse every line of vep-csqs.txt.gz and check the derived attributes."""
    path = os.path.join(HERE, "vep-csqs.txt.gz")
    with gzip.open(path, "rt") as veps:
        for raw in veps:
            csq = VEP(raw.strip())
            assert csq.severity in (1, 2, 3)
            assert csq.is_pseudogene in (True, False)
            assert csq.coding in (True, False)
            assert csq.polyphen_value is None or isinstance(csq.polyphen_value, float)
            # Attribute access only: checks that reading .gene does not raise.
            csq.gene
            assert csq.sift_value is None or isinstance(csq.sift_value, float)
def test_weird_vep():
    """Annotations with an empty or '?' consequence still get a valid impact_severity.

    Each entry of ``csqs`` is a comma-separated CSQ value; every individual
    annotation in it is parsed with the extended key list below.
    """
    keys = (
        "Consequence|Codons|Amino_acids|Gene|SYMBOL|Feature|EXON|PolyPhen|SIFT|"
        "Protein_position|BIOTYPE|CANONICAL|CCDS|RadialSVM_score|RadialSVM_pred|"
        "LR_score|LR_pred|CADD_raw|CADD_phred|Reliability_index"
    ).split("|")
    csqs = [
        "?|||117581|TWIST2|NM_001271893.1|1/1||||protein_coding|YES||||||||,non_coding_transcript_exon_variant&non_coding_transcript_variant|||117581|TWIST2|NM_001271893.1_dupl8|1/1||||mRNA|||||||||",
        "non_coding_transcript_exon_variant&non_coding_transcript_variant|||117581|TWIST2|NM_001271893.1_dupl8|1/1||||mRNA|||||||||,?|||117581|TWIST2|NM_001271893.1|1/1||||protein_coding|YES||||||||",
        "?|||115286|SLC25A26|NM_173471.3|1/1||||protein_coding|YES||||||||",
        "|||ENSG00000138190|EXOC6|ENST00000260762||||-/804|protein_coding,|||ENSG00000138190|EXOC6|ENST00000371547||||-/820|protein_coding,|||ENSG00000138190|EXOC6|ENST00000443748||||-/701|protein_coding,NMD_transcript_variant|||ENSG00000138190|EXOC6|ENST00000495132||||-/404|nonsense_mediated_decay,|||ENSG00000138190|EXOC6|ENST00000371552||||-/799|protein_coding",
        "|||ENSG00000013503|POLR3B|ENST00000539066||||-/1075|protein_coding,nc_transcript_variant|||ENSG00000013503|POLR3B|ENST00000549195|||||processed_transcript,|||ENSG00000013503|POLR3B|ENST00000549569||||-/170|protein_coding,|||ENSG00000013503|POLR3B|ENST00000228347||||-/1133|",
        "|||ENSG00000147202|DIAPH2|ENST00000373054||||-/1097|protein_coding,|||ENSG00000147202|DIAPH2|ENST00000355827||||-/1096|protein_coding,|||ENSG00000147202|DIAPH2|ENST00000324765||||-/1101|protein_coding,|||ENSG00000147202|DIAPH2|ENST00000373049||||-/1096|protein_coding,|||ENSG00000147202|DIAPH2|ENST00000373061||||-/1101|protein_coding",
    ]
    # 'MED' is the spelling test_vep asserts for impact_severity; 'MEDIUM' is
    # kept so the check is no stricter than before.
    # NOTE(review): confirm which spelling the VEP class actually returns.
    valid = ('LOW', 'MED', 'MEDIUM', 'HIGH')

    for cs in csqs:
        for c in cs.split(","):
            v = VEP(c, keys)
            assert v.impact_severity in valid, (v.impact_severity, c)
def test_vep():
    """Check every attribute derived from a single missense annotation."""
    csq = 'missense_variant|tTt/tGt|F/C|ENSG00000186092|OR4F5|ENST00000335137|1/1|possibly_damaging(0.568)|deleterious(0)|113/305|protein_coding'
    ann = VEP(csq)

    # Identity of the affected gene / transcript.
    assert ann.gene == 'OR4F5'
    assert ann.transcript == 'ENST00000335137'
    assert ann.aa_change == "F/C", ann.aa_change
    assert ann.consequences == ['missense_variant']

    # Classification and severity.
    assert ann.coding
    assert ann.biotype == "protein_coding"
    assert ann.severity == 2
    assert ann.impact_severity == "MED", ann.impact_severity
    assert not ann.is_pseudogene

    # Predictor fields are split into a numeric value and a class label.
    assert ann.polyphen_value == 0.568, ann.polyphen
    assert ann.polyphen_class == "possibly_damaging", ann.polyphen
    assert ann.sift_value == 0.0, ann.sift
    assert ann.sift_class == "deleterious", ann.sift
gen = record.genotype(s).gt_type if gen: samples_gen_counts[gen] += 1 else: samples_gen_counts[3] += 1 samples_gen_counts[4] = float(samples_gen_counts[1] + 2*samples_gen_counts[2])/2*sum(samples_gen_counts[0:3]) return samples_gen_counts if __name__ == '__main__': #vcf_fname = sys.argv[1] vcf_fname = "/Users/dashazhernakova/Documents/Doby/GenomeRussia/60samples/ps+nov+yak+papers.filtered.from_russia.rsids.VEP.coding.vcf.gz" vcf_reader = vcf.Reader(open(vcf_fname)) VEP.keys = vcf_reader.infos['CSQ'].desc.split(" ")[-1].split("|") sample_groups = ['Pskov', 'Novgorod', 'Yakut', 'Simons', 'Pagani', 'all'] transcr_dict = defaultdict[list] for record in vcf_reader: if not record.ID: record.ID = record.CHROM + ":" + str(record.POS) effects = [] for annot in record.INFO['CSQ']: vep = VEP(annot) if vep.effects['LoF'] == 'HC': sample_mafs = fillSampleMAFs(sample_groups, record) transcr_dict[vep.effects['Feature'] + ":" + vep.effects['Symbol']].append([record.ID, sample_mafs]) for transcr, snps in transcr_dict.items(): for snp in snps: print transcr.split(":")[1] + "\t" + transcr.split(":")[0] + "\t". join(snp)
csq.gene assert isinstance(csq.sift_value, float) or csq.sift_value is None def test_snpeffs(): f = os.path.join(HERE, "snpeff-anns.txt.gz") with gzip.open(f, "rt") as anns: for csq in (SnpEff(l.strip()) for l in anns): assert csq.severity in (1, 2, 3) assert csq.is_pseudogene in (True, False) assert csq.coding in (True, False) assert csq.polyphen_value is None EFFECTS = [ VEP("upstream_gene_variant|||ENSG00000223972|DDX11L1|ENST00000456328|||||processed_transcript" ), VEP("downstream_gene_variant|||ENSG00000227232|WASH7P|ENST00000488147|||||unprocessed_pseudogene" ), VEP("non_coding_exon_variant&nc_transcript_variant|||ENSG00000223972|DDX11L1|ENST00000456328|2/3||||processed_transcript" ), VEP("non_coding_exon_variant&nc_transcript_variant|||ENSG00000223972|DDX11L1|ENST00000456328|2/3||||processed_transcript" ), VEP("splice_region_variant&non_coding_exon_variant&nc_transcript_variant|||ENSG00000223972|DDX11L1|ENST00000456328|2/3||||processed_transcript" ), VEP("splice_region_variant&non_coding_exon_variant&nc_transcript_variant|||ENSG00000223972|DDX11L1|ENST00000456328|2/3||||processed_transcript" ), VEP("splice_region_variant&non_coding_exon_variant&nc_transcript_variant|||ENSG00000223972|DDX11L1|ENST00000456328|2/3||||processed_transcript" ), VEP("intron_variant&nc_transcript_variant|||ENSG00000223972|DDX11L1|ENST00000450305|||||transcribed_unprocessed_pseudogene" ), VEP("intron_variant&nc_transcript_variant|||ENSG00000223972|DDX11L1|ENST00000450305|||||transcribed_unprocessed_pseudogene"
def _info_field(info, key):
    """Return the value of the INFO entry named exactly `key`, or None if absent.

    Matching whole ';'-separated entries avoids the substring false positives
    of searching for "KEY=" anywhere in the INFO column (e.g. "BCSQ=" when
    looking for "CSQ=").
    """
    prefix = key + "="
    for entry in info.split(";"):
        if entry.startswith(prefix):
            return entry[len(prefix):]
    return None


# Stream a VCF from stdin to stdout: header lines are passed through, and a
# record is kept only if at least one of its CSQ annotations satisfies
# isfunctional() and no max_aaf_all value exceeds 0.001.
for i, line in enumerate(sys.stdin):
    if line[0] == "#":
        print(line, end="")
        if "<ID=CSQ," in line:
            # The CSQ header line defines the field order of each annotation.
            csq_keys = get_csq_keys(line)
        continue

    if i % 1000 == 0:
        # Progress indicator on stderr so it does not pollute the VCF output.
        print("filter: %d" % i, file=sys.stderr)

    info = line.rstrip().split("\t")[7]

    csq = _info_field(info, "CSQ")
    if csq is None:
        # No annotation at all means no functional annotation: drop the record
        # instead of failing with ValueError as the substring lookup did.
        continue

    veps = [VEP(c, keys=csq_keys) for c in csq.split(",")]
    if not any(isfunctional(v) for v in veps):
        continue

    max_aaf = _info_field(info, "max_aaf_all")
    if max_aaf is not None:
        if max(map(float, max_aaf.split(","))) > 0.001:
            print("skipping because of max_aaf_all:", line, file=sys.stderr)
            continue

    print(line, end="")
    sys.stdout.flush()