def setUp(self):
    """Build the SNV fixture used by the tests in this case.

    Registers the population tags with Info, then constructs an SNV from
    string fields and stores it on ``self.var``.
    """
    self.pops = [
        "AFR_AF", "AMR_AF", "ASN_AF", "DDD_AF", "EAS_AF",
        "ESP_AF", "EUR_AF", "MAX_AF", "SAS_AF", "UK10K_cohort_AF",
    ]
    Info.set_populations(self.pops)

    # FORMAT keys and the matching sample values, kept on the test case
    self.keys = "GT:DP:AD"
    self.values = "0/1:50:10,10"

    # positional fields: chrom, pos, ID, ref, alt, qual, filter
    fields = ("1", "15000000", ".", "A", "G", "1000", "PASS")
    info_field = "HGNC_ID=1001;CQ=missense_variant;random_tag"

    self.var = SNV(*fields, info=info_field, format=self.keys,
        sample=self.values)
def test_get_low_depth_alleles(self):
    """Check get_low_depth_alleles() against different AC values."""
    one_alt = ('C', )
    two_alts = ('C', 'G')

    # single alt allele with a non-zero allele count
    self.var.info = Info('AC=1')
    found = self.var.get_low_depth_alleles('G', one_alt)
    self.assertEqual(found, [])

    # single alt allele with an allele count of zero
    self.var.info = Info('AC=0')
    found = self.var.get_low_depth_alleles('G', one_alt)
    self.assertEqual(found, ['C'])

    # two alt alleles, both with non-zero counts
    self.var.info = Info('AC=1,1')
    self.var.format = {'AD': '5,10,10'}
    found = self.var.get_low_depth_alleles('G', two_alts)
    self.assertEqual(found, [])

    # two alt alleles, where the second has a count of zero
    self.var.info = Info('AC=1,0')
    self.var.format = {'AD': '5,10,10'}
    found = self.var.get_low_depth_alleles('G', two_alts)
    self.assertEqual(found, ['G'])
def test_parse_gene_symbols_multi_alts_multi_symbols(self):
    """Check parse_gene_symbols() with several symbols for each allele.

    Every symbol field holds two comma-separated entries (one per alt
    allele), each of which holds two pipe-separated symbols.
    """
    info = Info(
        'HGNC_ID=D|X,E|Y;HGNC=D|X,E|Y;SYMBOL=D|X,E|Y;ENSG=D|X,E|Y;'
        'ENST=D|X,E|Y;ENSP=D|X,E|Y;ENSR=D|X,E|Y')
    observed = info.parse_gene_symbols(('G', 'C'), [])

    fields = ('HGNC_ID', 'HGNC', 'SYMBOL', 'ENSG', 'ENST', 'ENSP', 'ENSR')
    # one Symbols entry per alt allele, every field carrying the same value
    expected = [
        Symbols(info=dict((field, 'D|X') for field in fields), idx=0),
        Symbols(info=dict((field, 'E|Y') for field in fields), idx=0),
    ]

    self.assertEqual(observed, expected)
def setUp(self):
    """Build a family, a single X-chromosome trio variant, and a Report."""
    sex = "F"

    # family with a female child; both parents are created with status "1"
    self.trio = self.create_family(sex, "1", "1")

    # the child carries a 0/1 call at X:150, both parents are 0/0
    site = dict(chrom='X', pos=150)
    proband = create_snv(sex, "0/1",
        extra_info='HGNC=TEST;MAX_AF=0.0005', **site)
    mother = create_snv("F", "0/0", **site)
    father = create_snv("M", "0/0", **site)
    self.variants = [TrioGenotypes('X', '150', proband, mother, father)]

    self.report = Report(None, None, None)

    # the populations are registered last, after the variants are built
    populations = [
        "AFR_AF", "AMR_AF", "ASN_AF", "DDD_AF", "EAS_AF",
        "ESP_AF", "EUR_AF", "MAX_AF", "SAS_AF", "UK10K_cohort_AF",
    ]
    Info.set_populations(populations)
def __init__(self, chrom, position, id, ref, alts, qual, filter, info=None,
        format=None, sample=None, gender=None, sum_x_lr2=None, parents=None,
        mnv_code=None):
    """Populate the variant from its definition values.

    Args:
        chrom: chromosome, stored unchanged.
        position: position, converted with int().
        id: variant ID, stored as ``variant_id`` and handed to
            set_mutation_id().
        ref: reference allele.
        alts: comma-separated alternate alleles, split into a tuple.
        qual: stored unchanged.
        filter: stored unchanged.
        info: passed to Info() together with mnv_code.
        format: format keys, only used when sample is also given.
        sample: sample values, only used when format is also given.
        gender: passed to _set_gender() when it is not None.
        sum_x_lr2: stored unchanged.
        parents: stored as ``has_parents``.
        mnv_code: stored, and passed to Info().
    """
    self.chrom = chrom
    self.position = int(position)

    # mutation_id holds "NA" before set_mutation_id() is called
    self.variant_id = id
    self.mutation_id = "NA"
    self.set_mutation_id(self.variant_id)

    self.ref_allele = ref
    self.alt_alleles = tuple(alts.split(','))
    self.mnv_code = mnv_code

    self.qual = qual
    self.filter = filter
    self.sum_x_lr2 = sum_x_lr2
    self.has_parents = parents

    # attributes that start out empty and are filled in later
    self.inheritance_type = None
    self.gender = None
    if gender is not None:
        self._set_gender(gender)

    self.vcf_line = None
    self.format = None
    lacks_sample_data = format is None or sample is None
    if not lacks_sample_data:
        self.add_format(format, sample)

    # the Info object is created before the low depth alleles are looked up
    self.info = Info(info, self.mnv_code)
    masked = self.get_low_depth_alleles(self.ref_allele, self.alt_alleles)
    self.info.set_genes_and_consequence(self.get_chrom(),
        self.get_position(), self.alt_alleles, masked)

    # the genotype is only set when both format and gender are available
    self.genotype = None
    if self.format is not None:
        if self._get_gender() is not None:
            self.set_genotype()
def test_parse_gene_symbols_missing_gene(self):
    """Check parse_gene_symbols() when the INFO string is empty.

    With no symbol fields at all, a single alt allele is expected to give
    one Symbols entry holding an empty dict.
    """
    empty_info = Info('')
    observed = empty_info.parse_gene_symbols(('C', ), [])

    expected = [Symbols(info={}, idx=0)]
    self.assertEqual(observed, expected)
def setUp(self):
    """Register the population tags and create a default Info object."""
    population_tags = "AFR_AF AMR_AF ASN_AF DDD_AF EAS_AF ESP_AF EUR_AF " \
        "MAX_AF SAS_AF UK10K_cohort_AF"
    self.pops = population_tags.split()
    Info.set_populations(self.pops)

    # the Info object which the tests read and modify
    self.info = Info("HGNC_ID=1001;CQ=missense_variant;random_tag")
def test_get_consequence_multiallelic_with_masked(self):
    """Check get_consequences() with two alt alleles, one of them masked."""
    info = Info('CQ=missense_variant,synonymous_variant')

    # 'G' is passed as masked, so only the consequence for 'C' is expected
    observed = info.get_consequences('1', 1000, ('C', 'G'), ['G'])
    self.assertEqual(observed, [['missense_variant']])
def test_get_consequence_multiallelic(self):
    """Check get_consequences() with two alt alleles and nothing masked."""
    info = Info('CQ=missense_variant,synonymous_variant')

    # one list of consequences is expected for each alt allele
    observed = info.get_consequences('1', 1000, ('C', 'G'), [])
    expected = [['missense_variant'], ['synonymous_variant']]
    self.assertEqual(observed, expected)
def test_is_missense_cnv(self):
    """Check that the is_cnv flag changes the result of is_missense()."""
    info = Info('HGNC=ATRX;CQ=coding_sequence_variant;random_tag')
    info.set_genes_and_consequence('1', '15000000', ('G',), [])

    # the same consequence is expected to count as missense only for CNVs
    as_cnv = info.is_missense(is_cnv=True)
    self.assertTrue(as_cnv)
    as_snv = info.is_missense(is_cnv=False)
    self.assertFalse(as_snv)
def setUp(self):
    """Create the default Info object, after registering the populations."""
    pops = []
    for prefix in ("AFR", "AMR", "ASN", "DDD", "EAS", "ESP", "EUR", "MAX",
            "SAS", "UK10K_cohort"):
        pops.append(prefix + "_AF")

    self.pops = pops
    Info.set_populations(self.pops)

    # this is an Info object, built directly from an INFO-style string
    info_string = "HGNC_ID=1001;CQ=missense_variant;random_tag"
    self.info = Info(info_string)
def test_parse_gene_symbols_multi_alts_multi_symbols(self):
    """Check parse_gene_symbols() when each allele has multiple symbols."""
    info = Info(
        'HGNC_ID=D|X,E|Y;HGNC=D|X,E|Y;SYMBOL=D|X,E|Y;ENSG=D|X,E|Y;'
        'ENST=D|X,E|Y;ENSP=D|X,E|Y;ENSR=D|X,E|Y')
    alts = ('G', 'C')
    symbols = info.parse_gene_symbols(alts, [])

    keys = ['HGNC_ID', 'HGNC', 'SYMBOL', 'ENSG', 'ENST', 'ENSP', 'ENSR']
    # the first entry holds the 'D|X' values, the second the 'E|Y' values
    first = Symbols(info=dict.fromkeys(keys, 'D|X'), idx=0)
    second = Symbols(info=dict.fromkeys(keys, 'E|Y'), idx=0)

    self.assertEqual(symbols, [first, second])
def test_find_max_allele_frequency_without_populations(self):
    """Check find_max_allele_frequency() when no populations are set.

    This is a regression test for a problem that only shows up when the
    populations have not been set before the frequency is looked up.
    """
    self.info["MAX_AF"] = "0.005"

    Info.set_populations([])
    try:
        # without any populations to check, no frequency should be found
        self.assertIsNone(self.info.find_max_allele_frequency())
    finally:
        # Restore the populations even if the assertion fails, so that a
        # failure here cannot leak an empty population list into whatever
        # runs next.
        Info.set_populations(self.pops)
def test_get_consequence(self):
    """Check get_consequences() for a single alt allele.

    No last base sites are set on the Info objects here, and the
    consequences are expected to come back unchanged.
    """
    chrom, pos, alts = '1', 1000, ('C',)

    # INFO string, paired with the consequences expected from it
    cases = (
        ('CQ=missense_variant;HGNC=TEST', [['missense_variant']]),
        ('CQ=missense_variant|stop_gained;HGNC=TEST|TEST2',
            [['missense_variant', 'stop_gained']]),
    )

    for info_string, expected in cases:
        info = Info(info_string)
        observed = info.get_consequences(chrom, pos, alts, [])
        self.assertEqual(observed, expected)
def test_parse_gene_symbols_multi_alts_masked_alt(self):
    """Check parse_gene_symbols() when alt alleles are masked."""
    info = Info(
        'HGNC_ID=D|X,E|Y;HGNC=D|X,E|Y;SYMBOL=D|X,E|Y;ENSG=D|X,E|Y;'
        'ENST=D|X,E|Y;ENSP=D|X,E|Y;ENSR=D|X,E|Y')
    alts = ('G', 'C')

    # with 'C' masked, only the entry for the first allele should remain
    keys = ('HGNC_ID', 'HGNC', 'SYMBOL', 'ENSG', 'ENST', 'ENSP', 'ENSR')
    remaining = Symbols(info=dict.fromkeys(keys, 'D|X'), idx=0)
    one_masked = info.parse_gene_symbols(alts, ['C'])
    self.assertEqual(one_masked, [remaining])

    # with both alleles masked, nothing should remain
    both_masked = info.parse_gene_symbols(alts, ['C', 'G'])
    self.assertEqual(both_masked, [])
def test_parse_gene_symbols_multi_alts(self):
    """Check parse_gene_symbols() when there are multiple alt alleles."""
    info = Info(
        'HGNC_ID=D,E;HGNC=D,E;SYMBOL=D,E;ENSG=D,E;ENST=D,E;ENSP=D,E;ENSR=D,E')
    observed = info.parse_gene_symbols(('G', 'C'), [])

    keys = ('HGNC_ID', 'HGNC', 'SYMBOL', 'ENSG', 'ENST', 'ENSP', 'ENSR')
    # one Symbols entry per alt allele: 'D' for the first, 'E' for the second
    expected = [Symbols(info=dict.fromkeys(keys, value), idx=0)
        for value in ('D', 'E')]
    self.assertEqual(observed, expected)

    # Having more alleles than available symbols is expected to raise an
    # error. The opposite case (fewer alleles than symbols) is not checked.
    # NOTE(review): this call uses the self.info fixture rather than the
    # local info object - confirm that is intended.
    too_many_alts = ('G', 'T', 'C')
    with self.assertRaises(IndexError):
        self.info.parse_gene_symbols(too_many_alts, [])
def load_variants(family, pp_filter, pops, known_genes, last_base, sum_x_lr2,
        debug_chrom=None, debug_pos=None):
    """ loads the variants for a trio or singleton

    Args:
        family: Family object containing data for an affected proband
        pp_filter: float between 0 and 1, being the threshold for the PP_DNM
            filter
        pops: list of populations who have minor allele frequencies in INFO
        known_genes: genes known to be involved with genetic disorders.
        last_base: set of sites in genome at conserved last base of exons,
            where we upgrade the severity of variants to loss-of-function.
        sum_x_lr2: sum of mean l2r on x chromosomes for all probands, looked
            up by the person ID of the child. A proband without an entry is
            given a sum of zero.
        debug_chrom: chromosome string, to give more information about why a
            variant fails to pass the filters.
        debug_pos: chromosome position, to give more information about why a
            variant fails to pass the filters.

    Returns:
        list of filtered variants for a trio, as TrioGenotypes objects
    """

    # define several parameters of the variant classes, before initialisation
    for Var in [SNV, CNV]:
        Var.set_known_genes(known_genes)
        Var.set_debug(debug_chrom, debug_pos)

    Info.set_last_base_sites(last_base)
    Info.set_populations(pops)

    # get the sum of mean l2r for the proband, defaulting to zero
    sum_x_lr2_proband = sum_x_lr2.get(family.child.person_id, 0)

    variants = load_trio(family, sum_x_lr2_proband)

    return filter_de_novos(variants, pp_filter)
def test_get_low_depth_alleles_bad_indel(self):
    """Check get_low_depth_alleles() for an indel allele with a depth of 1."""
    self.var.info = Info('AC=1,1')
    self.var.format = {'AD': '5,10,1'}

    # neither alt allele is an indel here, so both should pass
    snv_alts = ('C', 'G')
    flagged = self.var.get_low_depth_alleles('G', snv_alts)
    self.assertEqual(flagged, [])

    # swapping in an indel alt at the low depth position should flag it
    indel_alts = ('C', 'GG')
    flagged = self.var.get_low_depth_alleles('G', indel_alts)
    self.assertEqual(flagged, ['GG'])
def setUp(self):
    """Register the populations and create the default SNV object."""
    tags = "AFR_AF,AMR_AF,ASN_AF,DDD_AF,EAS_AF,ESP_AF,EUR_AF,MAX_AF," \
        "SAS_AF,UK10K_cohort_AF"
    self.pops = tags.split(",")
    Info.set_populations(self.pops)

    # the format keys and sample values are kept on the test case
    self.keys = "GT:DP:AD"
    self.values = "0/1:50:10,10"

    site = ("1", "15000000", ".")      # chrom, pos, ID
    alleles = ("A", "G")               # ref, alt
    quality = ("1000", "PASS")         # qual, filter
    positional = site + alleles + quality

    self.var = SNV(*positional,
        info="HGNC_ID=1001;CQ=missense_variant;random_tag",
        format=self.keys, sample=self.values)
def load_variants(family, pp_filter, pops, known_genes, last_base, sum_x_lr2,
        debug_chrom=None, debug_pos=None):
    """ loads the variants for a trio or singleton

    Args:
        family: Family object containing data for an affected proband
        pp_filter: float between 0 and 1, being the threshold for the PP_DNM
            filter
        pops: list of populations who have minor allele frequencies in INFO
        known_genes: genes known to be involved with genetic disorders.
        last_base: set of sites in genome at conserved last base of exons,
            where we upgrade the severity of variants to loss-of-function.
        sum_x_lr2: sum of mean l2r on x chromosomes for all probands, looked
            up by the person ID of the child. A proband without an entry is
            given a sum of zero.
        debug_chrom: chromosome string, to give more information about why a
            variant fails to pass the filters.
        debug_pos: chromosome position, to give more information about why a
            variant fails to pass the filters.

    Returns:
        list of filtered variants for a trio, as TrioGenotypes objects
    """

    # NOTE: an earlier form of this function stored family.has_parents() in
    # a local that was never read; that unused local has been removed.

    # define several parameters of the variant classes, before initialisation
    for Var in [SNV, CNV]:
        Var.set_known_genes(known_genes)
        Var.set_debug(debug_chrom, debug_pos)

    Info.set_last_base_sites(last_base)
    Info.set_populations(pops)

    # get the sum of mean l2r for the proband, defaulting to zero
    sum_x_lr2_proband = sum_x_lr2.get(family.child.person_id, 0)

    variants = load_trio(family, sum_x_lr2_proband)

    return filter_de_novos(variants, pp_filter)
def setUp(self):
    """Create the family, the trio variant and the Report for the tests."""
    child_sex = "F"
    mother_status, father_status = "1", "1"
    self.trio = self.create_family(child_sex, mother_status, father_status)

    # all three SNVs sit at X:150, and only the child has a 0/1 genotype
    child_snv = create_snv(child_sex, "0/1", chrom='X', pos=150,
        extra_info='HGNC=TEST;MAX_AF=0.0005')
    mother_snv = create_snv("F", "0/0", chrom='X', pos=150)
    father_snv = create_snv("M", "0/0", chrom='X', pos=150)

    trio_variant = TrioGenotypes('X', '150', child_snv, mother_snv,
        father_snv)
    self.variants = [trio_variant]

    self.report = Report(None, None, None)

    # registering the populations is the final step, as in the original
    Info.set_populations(
        ["AFR_AF", "AMR_AF", "ASN_AF", "DDD_AF", "EAS_AF", "ESP_AF",
         "EUR_AF", "MAX_AF", "SAS_AF", "UK10K_cohort_AF"])
def test_parse_gene_symbols_multi_alts(self):
    """Check parse_gene_symbols() with a symbol for each of two alleles."""
    fields = ['HGNC_ID', 'HGNC', 'SYMBOL', 'ENSG', 'ENST', 'ENSP', 'ENSR']

    info = Info(
        'HGNC_ID=D,E;HGNC=D,E;SYMBOL=D,E;ENSG=D,E;ENST=D,E;ENSP=D,E;ENSR=D,E')
    alts = ('G', 'C')
    symbols = info.parse_gene_symbols(alts, [])

    # the first allele should get the 'D' values, the second the 'E' values
    for_first = Symbols(info=dict((key, 'D') for key in fields), idx=0)
    for_second = Symbols(info=dict((key, 'E') for key in fields), idx=0)
    self.assertEqual(symbols, [for_first, for_second])

    # Three alleles is more than the number of available symbols, which is
    # expected to raise. Having fewer alleles than symbols is not checked.
    # NOTE(review): the call below goes through self.info, not the local
    # info object - confirm that this is deliberate.
    alts = ('G', 'T', 'C')
    with self.assertRaises(IndexError):
        self.info.parse_gene_symbols(alts, [])
def test_is_compound_pair_unknown_gene(self):
    """Check that is_compound_pair() excludes pairs for unknown genes."""
    # create both variants first, so that they can be altered afterwards
    first, second = [
        self.create_variant(chrom="1", position=site, sex="F",
            cq="stop_gained")
        for site in ("150", "160")
    ]

    first = self.set_compound_het_var(first, "110")
    second = self.set_compound_het_var(second, "101")
    pair = (first, second)

    # swap in child Info objects which have a consequence, but no gene symbol
    for variant in pair:
        variant.child.info = Info('CQ=missense_variant')
    for variant in pair:
        variant.child.info.set_genes_and_consequence('1', 100, ('G', ), [])

    # the pair should not be accepted as a compound het
    self.assertFalse(self.inh.is_compound_pair(first, second))
def test_parse_gene_symbols_multi_alts_masked_alt(self):
    """Check which entries parse_gene_symbols() keeps with masked alleles."""
    fields = ['HGNC_ID', 'HGNC', 'SYMBOL', 'ENSG', 'ENST', 'ENSP', 'ENSR']
    info = Info(
        'HGNC_ID=D|X,E|Y;HGNC=D|X,E|Y;SYMBOL=D|X,E|Y;ENSG=D|X,E|Y;'
        'ENST=D|X,E|Y;ENSP=D|X,E|Y;ENSR=D|X,E|Y')
    alts = ('G', 'C')

    # masking the second allele should leave the entry for the first one
    observed = info.parse_gene_symbols(alts, ['C'])
    unmasked = Symbols(info=dict((key, 'D|X') for key in fields), idx=0)
    self.assertEqual(observed, [unmasked])

    # masking both alleles should leave an empty list
    observed = info.parse_gene_symbols(alts, ['C', 'G'])
    self.assertEqual(observed, [])
def test_get_consequence_last_base(self):
    """Check get_consequences() for a site listed in the last base sites."""
    chrom, pos = '1', 1000
    alts = ('C',)

    # Each case pairs an INFO string with the consequences expected once the
    # site is in last_base. In the second case the variant is in two genes:
    # the missense consequence should be converted, while the synonymous
    # consequence should be left as it is.
    cases = [
        ('CQ=missense_variant;HGNC=TEST',
            [["conserved_exon_terminus_variant"]]),
        ('CQ=missense_variant|synonymous_variant;HGNC=TEST|TEST1',
            [["conserved_exon_terminus_variant", "synonymous_variant"]]),
    ]

    for info_string, expected in cases:
        info = Info(info_string)
        info.set_genes_and_consequence(chrom, pos, alts, [])

        # mark the site as a last base site on this Info object
        info.last_base = set([("1", 1000)])
        observed = info.get_consequences(chrom, pos, alts, [])
        self.assertEqual(observed, expected)
def test_is_missense_cnv(self):
    """Check is_missense() with is_cnv set to True and to False."""
    site = ('1', '15000000')
    alts = ('G', )

    info = Info('HGNC=ATRX;CQ=coding_sequence_variant;random_tag')
    info.set_genes_and_consequence(site[0], site[1], alts, [])

    # the result is expected to be True with is_cnv=True, False otherwise
    for is_cnv, check in ((True, self.assertTrue), (False, self.assertFalse)):
        check(info.is_missense(is_cnv=is_cnv))
def test_get_consequence(self):
    """Check that get_consequences() returns the CQ values as given."""
    site = ('1', 1000)
    alts = ('C', )

    # a single consequence, with no last base sites set on the Info object
    single = Info('CQ=missense_variant;HGNC=TEST')
    observed = single.get_consequences(site[0], site[1], alts, [])
    self.assertEqual(observed, [['missense_variant']])

    # two pipe-separated consequences for the one alt allele
    double = Info('CQ=missense_variant|stop_gained;HGNC=TEST|TEST2')
    observed = double.get_consequences(site[0], site[1], alts, [])
    self.assertEqual(observed, [['missense_variant', 'stop_gained']])
def tearDown(self):
    """Clear the class-level values that the tests may have set."""
    # drop the known genes from the SNV class
    setattr(SNV, "known_genes", None)

    # replace the populations on Info with an empty list
    no_populations = []
    Info.set_populations(no_populations)
class TestVariantInfoPy(unittest.TestCase):
    """ unit testing of the Info class """

    def setUp(self):
        """ define a default Info object """

        self.pops = ["AFR_AF", "AMR_AF", "ASN_AF", "DDD_AF", "EAS_AF",
            "ESP_AF", "EUR_AF", "MAX_AF", "SAS_AF", "UK10K_cohort_AF"]
        Info.set_populations(self.pops)

        # set up the Info object that the tests read and modify
        self.info = Info("HGNC_ID=1001;CQ=missense_variant;random_tag")

    def tearDown(self):
        """ clear the populations, so they do not leak out of this case """
        Info.set_populations([])

    def test_get_consequence(self):
        """ test that get_consequence works correctly """

        chrom, pos = '1', 1000
        info = Info('CQ=missense_variant;HGNC=TEST')
        alts = ('C',)

        # check that in the absence of any known conserved final exon
        # positions, the consequence is unchanged.
        self.assertEqual(info.get_consequences(chrom, pos, alts, []),
            [['missense_variant']])

        # check a variant with a consequence in each of two genes
        info = Info('CQ=missense_variant|stop_gained;HGNC=TEST|TEST2')
        self.assertEqual(info.get_consequences(chrom, pos, alts, []),
            [['missense_variant', 'stop_gained']])

    def test_get_consequence_last_base(self):
        """ check get_consequence() works with last base of exon changes """

        chrom, pos = '1', 1000
        alts = ('C',)
        info = Info('CQ=missense_variant;HGNC=TEST')
        info.set_genes_and_consequence(chrom, pos, alts, [])

        # Now check that if the variant is at a position where it is a final
        # base in an exon with a conserved base, the consequence gets
        # converted.
        info.last_base = set([("1", 1000)])
        self.assertEqual(info.get_consequences(chrom, pos, alts, []),
            [["conserved_exon_terminus_variant"]])

        # If we have a variant in multiple genes, check that it only alters
        # the missense/splice_region variants, and doesn't alter synonymous
        # variants (since these will be in transcripts where the variant is
        # distant from an exon boundary.)
        info = Info('CQ=missense_variant|synonymous_variant;HGNC=TEST|TEST1')
        info.set_genes_and_consequence(chrom, pos, alts, [])
        info.last_base = set([("1", 1000)])
        self.assertEqual(info.get_consequences(chrom, pos, alts, []),
            [["conserved_exon_terminus_variant", "synonymous_variant"]])

    def test_get_consequence_multiallelic(self):
        """ test that get_consequence works correctly with multiple alleles """

        chrom, pos = '1', 1000
        info = Info('CQ=missense_variant,synonymous_variant')
        alts = ('C', 'G')

        # one list of consequences is expected per alt allele
        self.assertEqual(info.get_consequences(chrom, pos, alts, []),
            [['missense_variant'], ['synonymous_variant']])

    def test_get_consequence_multiallelic_with_masked(self):
        """ test that get_consequence works correctly with a masked allele """

        chrom, pos = '1', 1000
        info = Info('CQ=missense_variant,synonymous_variant')
        alts = ('C', 'G')

        # the 'G' allele is masked, so only the 'C' consequence is expected
        self.assertEqual(info.get_consequences(chrom, pos, alts, ['G']),
            [['missense_variant']])

    def test_parse_gene_symbols(self):
        """ test that parse_gene_symbols() works correctly """

        alts = ('C',)

        # check for when a HGNC key exists
        self.info["HGNC_ID"] = "A"
        genes = self.info.parse_gene_symbols(alts, [])
        self.assertEqual(genes, [Symbols(info={'HGNC_ID': 'A'}, idx=0)])

        # check for when a HGNC key doesn't exist
        del self.info["HGNC_ID"]
        genes = self.info.parse_gene_symbols(alts, [])
        self.assertEqual(genes, [Symbols(info={}, idx=0)])

        # check for multiple gene symbols
        self.info["HGNC_ID"] = "A|B|C"
        genes = self.info.parse_gene_symbols(alts, [])
        self.assertEqual(genes, [Symbols(info={'HGNC_ID': 'A|B|C'}, idx=0)])

        # check for multiple gene symbols, when some are missing
        self.info["HGNC_ID"] = "|.|C"
        genes = self.info.parse_gene_symbols(alts, [])
        self.assertEqual(genes, [Symbols(info={'HGNC_ID': '||C'}, idx=0)])

        # check for multiple gene symbols, when some missing symbols have
        # alternates in other symbol fields.
        self.info["HGNC_ID"] = ".|.|C"
        self.info["HGNC"] = "Z|.|C"
        genes = self.info.parse_gene_symbols(alts, [])
        self.assertEqual(genes,
            [Symbols(info={'HGNC_ID': '||C', 'HGNC': 'Z||C'}, idx=0)])

        # Check that including alternate symbols has the correct precedence
        # order. Note that doing this properly would require checking all of
        # the possible order combinations.
        self.info["HGNC_ID"] = ".|.|C"
        self.info["HGNC"] = "Z|.|C"
        self.info["SYMBOL"] = "A|.|C"
        genes = self.info.parse_gene_symbols(alts, [])
        self.assertEqual(genes,
            [Symbols(info={'HGNC_ID': '||C', 'HGNC': 'Z||C',
                "SYMBOL": "A||C"}, idx=0)])

    def test_parse_gene_symbols_multi_alts(self):
        """ check parse_gene_symbols() when we have multiple alleles """

        info = Info('HGNC_ID=D,E;HGNC=D,E;SYMBOL=D,E;ENSG=D,E;ENST=D,E;'
            'ENSP=D,E;ENSR=D,E')
        alts = ('G', 'C')

        self.assertEqual(info.parse_gene_symbols(alts, []),
            [Symbols(info={'HGNC_ID': 'D', 'HGNC': 'D', 'SYMBOL': 'D',
                'ENSG': 'D', 'ENST': 'D', 'ENSP': 'D', 'ENSR': 'D'}, idx=0),
            Symbols(info={'HGNC_ID': 'E', 'HGNC': 'E', 'SYMBOL': 'E',
                'ENSG': 'E', 'ENST': 'E', 'ENSP': 'E', 'ENSR': 'E'}, idx=0)])

        # if we have more alleles than the available symbols, we get an error
        # NOTE: this doesn't check if we have fewer alleles than symbols
        # NOTE(review): this call uses the self.info fixture rather than the
        # local info object - confirm that is intended.
        alts = ('G', 'T', 'C')
        with self.assertRaises(IndexError):
            self.info.parse_gene_symbols(alts, [])

    def test_parse_gene_symbols_multi_alts_multi_symbols(self):
        """ check parse_gene_symbols() with multiple symbols per allele """

        info = Info('HGNC_ID=D|X,E|Y;HGNC=D|X,E|Y;SYMBOL=D|X,E|Y;ENSG=D|X,E|Y;'
            'ENST=D|X,E|Y;ENSP=D|X,E|Y;ENSR=D|X,E|Y')
        alts = ('G', 'C')

        self.assertEqual(info.parse_gene_symbols(alts, []),
            [Symbols(info={'HGNC_ID': 'D|X', 'HGNC': 'D|X', 'SYMBOL': 'D|X',
                'ENSG': 'D|X', 'ENST': 'D|X', 'ENSP': 'D|X', 'ENSR': 'D|X'},
                idx=0),
            Symbols(info={'HGNC_ID': 'E|Y', 'HGNC': 'E|Y', 'SYMBOL': 'E|Y',
                'ENSG': 'E|Y', 'ENST': 'E|Y', 'ENSP': 'E|Y', 'ENSR': 'E|Y'},
                idx=0)])

    def test_parse_gene_symbols_multi_alts_masked_alt(self):
        """ check parse_gene_symbols() when we mask alt alleles """

        info = Info('HGNC_ID=D|X,E|Y;HGNC=D|X,E|Y;SYMBOL=D|X,E|Y;ENSG=D|X,E|Y;'
            'ENST=D|X,E|Y;ENSP=D|X,E|Y;ENSR=D|X,E|Y')
        alts = ('G', 'C')

        # mask one allele
        self.assertEqual(info.parse_gene_symbols(alts, ['C']),
            [Symbols(info={'HGNC_ID': 'D|X', 'HGNC': 'D|X', 'SYMBOL': 'D|X',
                'ENSG': 'D|X', 'ENST': 'D|X', 'ENSP': 'D|X', 'ENSR': 'D|X'},
                idx=0)])

        # mask both alleles
        self.assertEqual(info.parse_gene_symbols(alts, ['C', 'G']), [])

    def test_parse_gene_symbols_missing_gene(self):
        """ check parse_gene_symbols() when the INFO has no symbol fields """

        # an empty INFO string has no possible source for a gene symbol
        info = Info('')
        alts = ('C', )

        genes = info.parse_gene_symbols(alts, [])
        self.assertEqual(genes, [Symbols(info={}, idx=0)])

    def test_is_lof(self):
        """ test that is_lof() works correctly """

        # The alt alleles are passed as a one-element tuple. Note that ('G')
        # without the trailing comma is just the string 'G'.
        alts = ('G',)

        # check that known LOF consequences return True
        info = Info('CQ=stop_gained;HGNC=TEST')
        info.set_genes_and_consequence('1', 100, alts, [])
        self.assertTrue(info.is_lof())

        # check that known non-LOF consequences return False
        info = Info('CQ=missense_variant;HGNC=TEST')
        info.set_genes_and_consequence('1', 100, alts, [])
        self.assertFalse(info.is_lof())

        # check that null values return False
        info = Info('HGNC=TEST')
        info.set_genes_and_consequence('1', 100, alts, [])
        self.assertFalse(info.is_lof())

        # check when the variant overlaps multiple genes (so has multiple
        # gene symbols and consequences).
        info = Info('CQ=stop_gained|missense_variant;HGNC=ATRX|TTN')
        info.set_genes_and_consequence('1', 100, alts, [])
        self.assertTrue(info.is_lof())
        self.assertTrue(info.is_lof("ATRX"))
        self.assertFalse(info.is_lof("TTN"))

        # check that when we have a MNV, we can lose or gain a LOF annotation
        info.mnv_code = 'masked_stop_gain_mnv'
        self.assertFalse(info.is_lof("ATRX"))

        info.mnv_code = 'modified_stop_gained_mnv'
        self.assertTrue(info.is_lof("TTN"))

    def test_is_missense(self):
        """ test that is_missense() works correctly """

        # The alt alleles are passed as a one-element tuple. Note that ('G')
        # without the trailing comma is just the string 'G'.
        alts = ('G',)

        # check that known missense equivalent consequences return True
        info = Info('CQ=missense_variant;HGNC=TEST')
        info.set_genes_and_consequence('1', 100, alts, [])
        self.assertTrue(info.is_missense(is_cnv=False))

        # check that known LoF equivalent consequences return False
        info = Info('CQ=stop_gained;HGNC=TEST')
        info.set_genes_and_consequence('1', 100, alts, [])
        self.assertFalse(info.is_missense(is_cnv=False))

        # check that null values return False
        info = Info('HGNC=TEST')
        info.set_genes_and_consequence('1', 100, alts, [])
        self.assertFalse(info.is_missense(is_cnv=False))

        # check when the variant overlaps multiple genes (so has multiple
        # gene symbols and consequences).
        info = Info('CQ=missense_variant|synonymous_variant;HGNC=ATRX|TTN')
        info.set_genes_and_consequence('1', 100, alts, [])
        self.assertTrue(info.is_missense(is_cnv=False))
        self.assertTrue(info.is_missense(False, "ATRX"))
        self.assertFalse(info.is_missense(False, "TTN"))

        # check that when we have a MNV, we can lose or gain a missense
        # annotation
        info.mnv_code = 'modified_synonymous_mnv'
        self.assertFalse(info.is_missense(False, "ATRX"))

        info.mnv_code = 'modified_protein_altering_mnv'
        self.assertTrue(info.is_missense(False, "TTN"))

        # check that masked stop gained MNVs are converted to a missense
        info = Info('CQ=stop_gained;HGNC=ATRX')
        info.set_genes_and_consequence('1', 100, alts, [])
        info.mnv_code = 'masked_stop_gain_mnv'
        self.assertTrue(info.is_missense(False))

    def test_is_missense_cnv(self):
        """ test that is_missense() works correctly for CNVs """

        chrom, pos, alts, = '1', '15000000', ('G',)
        info = Info('HGNC=ATRX;CQ=coding_sequence_variant;random_tag')
        info.set_genes_and_consequence(chrom, pos, alts, [])

        # the consequence should only count as missense when is_cnv is True
        self.assertTrue(info.is_missense(is_cnv=True))
        self.assertFalse(info.is_missense(is_cnv=False))

    def test_get_per_gene_consequence(self):
        """ test that get_per_gene_consequence works correctly """

        self.info.symbols = [Symbols(info={'HGNC': 'ATRX'}, idx=0)]
        self.info.consequence = [["missense_variant"]]

        self.assertEqual(self.info.get_per_gene_consequence(None),
            ["missense_variant"])
        self.assertEqual(self.info.get_per_gene_consequence("ATRX"),
            ["missense_variant"])
        self.assertEqual(self.info.get_per_gene_consequence("TEST"), [])

        # check a variant with consequences in multiple genes, that we only
        # pull out the consequences for a single gene
        self.info.symbols = [Symbols(info={'HGNC': 'ATRX|TTN'}, idx=0)]
        self.info.consequence = [["missense_variant", "synonymous_variant"]]
        self.assertEqual(self.info.get_per_gene_consequence("ATRX"),
            ["missense_variant"])
        self.assertEqual(self.info.get_per_gene_consequence("TTN"),
            ["synonymous_variant"])

        # check a symbol where two symbols match, we only use the first
        # consequence
        self.info.symbols = [Symbols(info={'HGNC': 'TEMP|ATRX|TEMP'}, idx=0)]
        self.info.consequence = [["splice_acceptor_variant",
            "missense_variant", "synonymous_variant"]]
        self.assertEqual(self.info.get_per_gene_consequence("TEMP"),
            ["splice_acceptor_variant"])

        # check a symbol with some empty gene symbols
        self.info.symbols = [Symbols(info={'HGNC': '|ATRX|'}, idx=0)]
        self.info.consequence = [["splice_acceptor_variant",
            "missense_variant", "synonymous_variant"]]
        self.assertEqual(self.info.get_per_gene_consequence("ATRX"),
            ["missense_variant"])

    def test_get_allele_frequency(self):
        """ tests that number conversion works as expected """

        # single number returns that number
        self.assertEqual(self.info.get_allele_frequency("1"), 1)

        # two numbers return one number
        self.assertEqual(self.info.get_allele_frequency("1,1"), 1)

        # two numbers return the highest number
        self.assertEqual(self.info.get_allele_frequency("1,2"), 2)

        # number and string return the number
        self.assertEqual(self.info.get_allele_frequency("a,1"), 1)

        # single string value returns None
        self.assertEqual(self.info.get_allele_frequency("a"), None)

        # multiple string values return None
        self.assertEqual(self.info.get_allele_frequency("a,b"), None)

        # a None value returns None
        self.assertEqual(self.info.get_allele_frequency(None), None)

    def test_is_number(self):
        """ tests that we can check if a value represents a number """

        self.assertEqual(self.info.is_number(None), False)
        self.assertEqual(self.info.is_number("5"), True)
        self.assertEqual(self.info.is_number("a"), False)

    def test_find_max_allele_frequency(self):
        """ test if the MAF finder operates correctly """

        # check for var without recorded MAF
        self.assertIsNone(self.info.find_max_allele_frequency())

        # check for single population
        self.info["MAX_AF"] = "0.005"
        self.assertEqual(self.info.find_max_allele_frequency(), 0.005)

        # check for two populations
        self.info["AFR_AF"] = "0.01"
        self.assertEqual(self.info.find_max_allele_frequency(), 0.01)

        # check for all populations (the same tags as set up in setUp)
        for pop in self.pops:
            self.info[pop] = "0.05"
        self.assertEqual(self.info.find_max_allele_frequency(), 0.05)

        # make sure we can handle having None values
        self.info["AFR_AF"] = None
        self.assertEqual(self.info.find_max_allele_frequency(), 0.05)

    def test_find_max_allele_frequency_without_populations(self):
        """ test if the MAF finder operates correctly when we haven't set any
        populations to check
        """

        self.info["MAX_AF"] = "0.005"

        # this is a regression test for a problem that only occurs if the
        # unit tests are run in an order such that the populations might not
        # have been set by previous tests.
        Info.set_populations([])
        try:
            self.assertIsNone(self.info.find_max_allele_frequency())
        finally:
            # reset the populations even when the assertion fails, so that
            # a failure here cannot leave the populations empty
            Info.set_populations(self.pops)
def test_is_lof(self):
    """ test that is_lof() works correctly """

    # The alt alleles are passed as a one-element tuple. Note that ('G')
    # without the trailing comma is just the string 'G', which is not the
    # type that the other tests pass for this argument.
    alts = ('G',)

    # check that known LOF consequences return True
    info = Info('CQ=stop_gained;HGNC=TEST')
    info.set_genes_and_consequence('1', 100, alts, [])
    self.assertTrue(info.is_lof())

    # check that known non-LOF consequences return False
    info = Info('CQ=missense_variant;HGNC=TEST')
    info.set_genes_and_consequence('1', 100, alts, [])
    self.assertFalse(info.is_lof())

    # check that null values return False
    info = Info('HGNC=TEST')
    info.set_genes_and_consequence('1', 100, alts, [])
    self.assertFalse(info.is_lof())

    # check when the variant overlaps multiple genes (so has multiple
    # gene symbols and consequences).
    info = Info('CQ=stop_gained|missense_variant;HGNC=ATRX|TTN')
    info.set_genes_and_consequence('1', 100, alts, [])
    self.assertTrue(info.is_lof())
    self.assertTrue(info.is_lof("ATRX"))
    self.assertFalse(info.is_lof("TTN"))

    # check that when we have a MNV, we can lose or gain a LOF annotation
    info.mnv_code = 'masked_stop_gain_mnv'
    self.assertFalse(info.is_lof("ATRX"))

    info.mnv_code = 'modified_stop_gained_mnv'
    self.assertTrue(info.is_lof("TTN"))
def test_get_consequence_last_base(self):
    """Check the consequences reported at a conserved last base of an exon."""
    chrom, pos = '1', 1000
    alts = ('C', )

    def consequences_at_last_base(info_string):
        # build the Info object, then mark the site as a last base site
        info = Info(info_string)
        info.set_genes_and_consequence(chrom, pos, alts, [])
        info.last_base = set([("1", 1000)])
        return info.get_consequences(chrom, pos, alts, [])

    # a missense variant at a last base site should be converted
    observed = consequences_at_last_base('CQ=missense_variant;HGNC=TEST')
    self.assertEqual(observed, [["conserved_exon_terminus_variant"]])

    # For a variant in multiple genes, only the missense consequence should
    # be converted. The synonymous consequence should be left as it is.
    observed = consequences_at_last_base(
        'CQ=missense_variant|synonymous_variant;HGNC=TEST|TEST1')
    self.assertEqual(observed,
        [["conserved_exon_terminus_variant", "synonymous_variant"]])
def tearDown(self):
    """ clear the populations set on the Info class after each test
    """
    no_populations = []
    Info.set_populations(no_populations)
def test_is_missense(self):
    """ test that is_missense() works correctly
    """

    # NOTE: alts is always passed as a tuple. ('G') without the trailing
    # comma is just the string 'G', which only behaves like a sequence of
    # alleles by accident for single-base alleles.
    alts = ('G', )

    # check that known missense equivalent consequence return True
    info = Info('CQ=missense_variant;HGNC=TEST')
    info.set_genes_and_consequence('1', 100, alts, [])
    self.assertTrue(info.is_missense(is_cnv=False))

    # check that known LoF equivalent consequence returns False
    info = Info('CQ=stop_gained;HGNC=TEST')
    info.set_genes_and_consequence('1', 100, alts, [])
    self.assertFalse(info.is_missense(is_cnv=False))

    # check that null values return False
    info = Info('HGNC=TEST')
    info.set_genes_and_consequence('1', 100, alts, [])
    self.assertFalse(info.is_missense(is_cnv=False))

    # check when the variant overlaps multiple genes (so has multiple
    # gene symbols and consequences).
    info = Info('CQ=missense_variant|synonymous_variant;HGNC=ATRX|TTN')
    info.set_genes_and_consequence('1', 100, alts, [])
    self.assertTrue(info.is_missense(is_cnv=False))
    self.assertTrue(info.is_missense(False, "ATRX"))
    self.assertFalse(info.is_missense(False, "TTN"))

    # check that when we have a MNV, we can lose or gain a missense
    # annotation
    info.mnv_code = 'modified_synonymous_mnv'
    self.assertFalse(info.is_missense(False, "ATRX"))

    info.mnv_code = 'modified_protein_altering_mnv'
    self.assertTrue(info.is_missense(False, "TTN"))

    # check that masked stop gained MNVs are converted to a missense
    info = Info('CQ=stop_gained;HGNC=ATRX')
    info.set_genes_and_consequence('1', 100, alts, [])
    info.mnv_code = 'masked_stop_gain_mnv'
    self.assertTrue(info.is_missense(False))
class Variant(object):
    """ generic functions for variants

    Stores the fields of a single VCF record for one individual (chrom,
    position, ID, alleles, QUAL, FILTER, INFO and the sample FORMAT values),
    and derives the mutation ID, the chromosomal inheritance type and the
    low-depth alleles to mask from them.
    """

    # define some codes used in ped files to identify male and female sexes
    male_codes = {"1", "m", "M", "male"}
    female_codes = {"2", "f", "F", "female"}

    # (start, end) pairs for the pseudoautosomal regions of each sex
    # chromosome. Positions strictly inside a region are treated as autosomal.
    x_pseudoautosomal_regions = [(60001, 2699520), (154930290, 155260560),
        (88456802, 92375509)]
    y_pseudoautosomal_regions = [(10001, 2649520), (59034050, 59363566)]

    known_genes = None

    @classmethod
    def set_known_genes(cls_obj, known_genes):
        """ store the known genes on the class, shared by all instances
        """
        cls_obj.known_genes = known_genes

    def __init__(self, chrom, position, id, ref, alts, qual, filter,
            info=None, format=None, sample=None, gender=None, sum_x_lr2=None,
            parents=None, mnv_code=None):
        """ initialise the object with the definition values

        Args:
            chrom: chromosome string
            position: chromosomal position (converted with int())
            id: string from the VCF ID field
            ref: reference allele
            alts: comma-separated string of alternate alleles
            qual: value of the QUAL field
            filter: value of the FILTER field
            info: INFO text, passed to Info()
            format: colon-separated FORMAT keys for the sample
            sample: colon-separated sample values matching the format keys
            gender: sex code for the individual (one of male_codes or
                female_codes), or None if unknown
            sum_x_lr2: value returned by get_sum_x_lr2()
            parents: value returned by get_has_parents()
            mnv_code: MNV code, passed to Info()

        Raises:
            ValueError: if gender is given, but is not a known sex code
        """

        self.chrom = chrom
        self.position = int(position)

        self.variant_id = id
        self.mutation_id = "NA"
        self.set_mutation_id(self.variant_id)

        self.ref_allele = ref
        self.alt_alleles = tuple(alts.split(','))

        self.mnv_code = mnv_code
        self.qual = qual
        self.filter = filter
        self.sum_x_lr2 = sum_x_lr2
        self.has_parents = parents

        # intialise variables that will be set later
        self.inheritance_type = None
        self.gender = None
        if gender is not None:
            self._set_gender(gender)

        self.vcf_line = None

        self.format = None
        if format is not None and sample is not None:
            self.add_format(format, sample)

        # the low depth alleles need both the INFO and FORMAT values, so the
        # Info object has to exist before we can figure out what to mask
        self.info = Info(info, self.mnv_code)
        masked = self.get_low_depth_alleles(self.ref_allele, self.alt_alleles)
        self.info.set_genes_and_consequence(self.get_chrom(),
            self.get_position(), self.alt_alleles, masked)

        self.genotype = None
        if self.format is not None and self._get_gender() is not None:
            # NOTE(review): set_genotype() is not defined in this class,
            # presumably the SNV/CNV subclasses provide it - confirm.
            self.set_genotype()

    def is_lof(self, gene_symbol=None):
        """ pass through to Info.is_lof() for this variant
        """
        return self.info.is_lof(gene_symbol)

    def is_missense(self, is_cnv, gene_symbol=None):
        """ pass through to Info.is_missense() for this variant
        """
        return self.info.is_missense(is_cnv, gene_symbol)

    def is_synoymous(self, gene_symbol=None):
        """ pass through to Info.is_synoymous() for this variant
        """
        return self.info.is_synoymous(gene_symbol)

    def __repr__(self):
        ''' repr function for Variant objects. SNV(...) and CNV(...) also work
        '''

        def quote(value):
            # wrap values in double quotes, but leave None unquoted
            if value is not None:
                value = '"{}"'.format(value)
            return value

        # reprocess the format dictionary back to the original text strings
        keys, sample = None, None
        if self.format is not None:
            ordered = sorted(self.format)
            keys = quote(':'.join(ordered))
            sample = quote(':'.join([self.format[x] for x in ordered]))

        info = quote(self.info)
        gender = quote(self.gender)
        mnv_code = quote(self.mnv_code)

        return '{}(chrom="{}", position={}, id="{}", ref="{}", alts="{}", ' \
            'qual="{}", filter="{}", info={}, format={}, sample={}, gender={}, ' \
            'mnv_code={})'.format(type(self).__name__, self.chrom,
            self.position, self.variant_id, self.ref_allele,
            ','.join(self.alt_alleles), self.qual, self.filter, info, keys,
            sample, gender, mnv_code)

    def __hash__(self):
        # the hash is derived from the repr text
        return hash(str(self))

    def __eq__(self, other):
        # NOTE(review): equality is defined purely as hash equality
        return hash(self) == hash(other)

    def _set_gender(self, gender):
        """ sets the gender of the individual for the variant

        Args:
            gender: sex code, which must be in male_codes or female_codes

        Raises:
            ValueError: if the code is not a known male or female code
        """

        if gender in self.male_codes:
            self.gender = "male"
        elif gender in self.female_codes:
            self.gender = "female"
        else:
            raise ValueError("unknown gender code")

        # the inheritance type depends on the sex, so refresh it
        self.set_inheritance_type(self.get_position(), self.is_male())

    def _get_gender(self):
        """ returns the normalised gender ("male", "female", or None if unset)
        """
        return self.gender

    def set_mutation_id(self, variant_id):
        """ sets the mutation ID based on the VCF ID field

        The variant ID can be either "." for null value, an rsID, a HGMD ID,
        a COSMIC ID, or any combination of those (including multiple HGMD IDs
        for a single variant).

        Args:
            variant_id: string from the VCF ID field, can be rsID, or a list
                of &-separated IDs, which can include COSMIC and HGMD IDs.
        """

        if variant_id == ".":
            return

        # include everything that isn't an rsID
        ids = [x for x in variant_id.split("&") if not x.startswith("rs")]

        # only overwrite the current mutation ID if we found a non-rsID
        if ids:
            self.mutation_id = ",".join(ids)

    def get_mutation_id(self):
        """ return the mutation ID
        """
        return self.mutation_id

    def is_male(self):
        """ returns True/False for whether the person is male
        """
        return self._get_gender() in self.male_codes

    def add_format(self, keys, values):
        """Parses the FORMAT column from VCF files.

        Args:
            keys: FORMAT text from a line in a VCF file
            values: the values for the format keys
        """

        self.format = dict(zip(keys.split(":"), values.split(":")))

    def get_low_depth_alleles(self, ref, alts):
        ''' get a list of alleles with zero counts, or indels with 1 read

        Some variants have multiple alts, so we need to select the alt with
        the most severe consequence. However, in at least one version of the
        VCFs, one of the alts could have zero counts, which I believe resulted
        from the population based multi-sample calling. We need to drop the
        consequences recorded for zero-count alternate alleles before finding
        the most severe.

        We also want to avoid indels with only one read, because these are
        universally bad calls.

        Args:
            ref: reference allele
            alts: tuple of alt alleles

        Returns:
            list of alleles with sufficiently low depth
        '''

        def is_indel(x, y):
            return len(x) > 1 or len(y) > 1

        # without an AC field, assume every allele has a non-zero count
        allele_counts = ['1'] * len(alts)
        if 'AC' in self.info:
            allele_counts = self.info['AC'].split(',')

        # Without an AD field, assume every allele has sufficient depth. The
        # format is None if the variant was created without FORMAT and sample
        # values, so guard against that before checking for the AD key.
        allele_depths = ['10'] * len(alts)
        if self.format is not None and 'AD' in self.format:
            # the first AD value is dropped, leaving one depth per alt allele
            allele_depths = self.format['AD'].split(',')[1:]

        counts = list(zip(allele_counts, allele_depths))
        assert len(counts) == len(alts)

        # find the positions of alleles where the allele count is zero,
        # or indels with 1 alt read
        pos = set()
        for i, (count, depth) in enumerate(counts):
            if count == '0':
                pos.add(i)
            elif depth == '1' and is_indel(ref, alts[i]):
                pos.add(i)

        # return the alleles with zero-count, so we can mask them out
        return [alts[i] for i in sorted(pos)]

    def add_vcf_line(self, vcf_line):
        """ store the VCF line for the variant
        """
        self.vcf_line = vcf_line

    def get_vcf_line(self):
        """ return the stored VCF line (None if it hasn't been added)
        """
        return self.vcf_line

    def set_inheritance_type(self, pos, is_male):
        """ sets the chromosome type (eg autosomal, or X chromosome type).

        This only does simple string matching on the chromosome. Only "X",
        "chrX", "ChrX", "Y", "chrY" and "ChrY" are recognised as sex
        chromosomes. Every other chromosome string (including numeric codes
        such as "23" or "24") is treated as autosomal. Positions strictly
        within a pseudoautosomal region of a sex chromosome are also treated
        as autosomal.

        Args:
            pos: position on the chromosome
            is_male: True/False for whether the individual is male
        """

        chrom = self.get_chrom()
        if chrom in ("chrX", "ChrX", "X"):
            regions = self.x_pseudoautosomal_regions
            sex_linked = "XChrMale" if is_male else "XChrFemale"
        elif chrom in ("chrY", "ChrY", "Y"):
            regions = self.y_pseudoautosomal_regions
            sex_linked = "YChrMale" if is_male else "YChrFemale"
        else:
            self.inheritance_type = "autosomal"
            return

        # check if the position lies within a pseudoautosomal region
        if any(start < pos < end for start, end in regions):
            self.inheritance_type = "autosomal"
        else:
            self.inheritance_type = sex_linked

    def get_inheritance_type(self):
        """ return the variant chromosomal inheritance type
        """
        return self.inheritance_type

    def get_chrom(self):
        """ return the variant chromosome
        """
        return self.chrom

    def get_position(self):
        """ return the variant chromosomal position
        """
        return self.position

    def get_genotype(self):
        """ return the genotype value
        """
        return self.genotype

    def get_sum_x_lr2(self):
        """ return the sum of mean l2r on x chromsome
        """
        return self.sum_x_lr2

    def get_has_parents(self):
        """returns false for singletons, true for trios
        """
        return self.has_parents
class Variant(object):
    """ generic functions for variants

    Stores the fields of a single VCF record for one individual (chrom,
    position, ID, alleles, QUAL, FILTER, INFO and the sample FORMAT values),
    and derives the mutation ID, the chromosomal inheritance type and the
    low-depth alleles to mask from them.
    """

    # define some codes used in ped files to identify male and female sexes
    male_codes = {"1", "m", "M", "male"}
    female_codes = {"2", "f", "F", "female"}

    # (start, end) pairs for the pseudoautosomal regions of each sex
    # chromosome. Positions strictly inside a region are treated as autosomal.
    x_pseudoautosomal_regions = [(60001, 2699520), (154930290, 155260560),
        (88456802, 92375509)]
    y_pseudoautosomal_regions = [(10001, 2649520), (59034050, 59363566)]

    known_genes = None

    @classmethod
    def set_known_genes(cls_obj, known_genes):
        """ store the known genes on the class, shared by all instances
        """
        cls_obj.known_genes = known_genes

    def __init__(self, chrom, position, id, ref, alts, qual, filter,
            info=None, format=None, sample=None, gender=None, sum_x_lr2=None,
            parents=None, mnv_code=None):
        """ initialise the object with the definition values

        Args:
            chrom: chromosome string
            position: chromosomal position (converted with int())
            id: string from the VCF ID field
            ref: reference allele
            alts: comma-separated string of alternate alleles
            qual: value of the QUAL field
            filter: value of the FILTER field
            info: INFO text, passed to Info()
            format: colon-separated FORMAT keys for the sample
            sample: colon-separated sample values matching the format keys
            gender: sex code for the individual (one of male_codes or
                female_codes), or None if unknown
            sum_x_lr2: value returned by get_sum_x_lr2()
            parents: value returned by get_has_parents()
            mnv_code: MNV code, passed to Info()

        Raises:
            ValueError: if gender is given, but is not a known sex code
        """

        self.chrom = chrom
        self.position = int(position)

        self.variant_id = id
        self.mutation_id = "NA"
        self.set_mutation_id(self.variant_id)

        self.ref_allele = ref
        self.alt_alleles = tuple(alts.split(','))

        self.mnv_code = mnv_code
        self.qual = qual
        self.filter = filter
        self.sum_x_lr2 = sum_x_lr2
        self.has_parents = parents

        # intialise variables that will be set later
        self.inheritance_type = None
        self.gender = None
        if gender is not None:
            self._set_gender(gender)

        self.vcf_line = None

        self.format = None
        if format is not None and sample is not None:
            self.add_format(format, sample)

        # the low depth alleles need both the INFO and FORMAT values, so the
        # Info object has to exist before we can figure out what to mask
        self.info = Info(info, self.mnv_code)
        masked = self.get_low_depth_alleles(self.ref_allele, self.alt_alleles)
        self.info.set_genes_and_consequence(self.get_chrom(),
            self.get_position(), self.alt_alleles, masked)

        self.genotype = None
        if self.format is not None and self._get_gender() is not None:
            # NOTE(review): set_genotype() is not defined in this class,
            # presumably the SNV/CNV subclasses provide it - confirm.
            self.set_genotype()

    def is_lof(self, gene_symbol=None):
        """ pass through to Info.is_lof() for this variant
        """
        return self.info.is_lof(gene_symbol)

    def is_missense(self, is_cnv, gene_symbol=None):
        """ pass through to Info.is_missense() for this variant
        """
        return self.info.is_missense(is_cnv, gene_symbol)

    def is_synoymous(self, gene_symbol=None):
        """ pass through to Info.is_synoymous() for this variant
        """
        return self.info.is_synoymous(gene_symbol)

    def __repr__(self):
        ''' repr function for Variant objects. SNV(...) and CNV(...) also work
        '''

        def quote(value):
            # wrap values in double quotes, but leave None unquoted
            if value is not None:
                value = '"{}"'.format(value)
            return value

        # reprocess the format dictionary back to the original text strings
        keys, sample = None, None
        if self.format is not None:
            ordered = sorted(self.format)
            keys = quote(':'.join(ordered))
            sample = quote(':'.join([self.format[x] for x in ordered]))

        info = quote(self.info)
        gender = quote(self.gender)
        mnv_code = quote(self.mnv_code)

        return '{}(chrom="{}", position={}, id="{}", ref="{}", alts="{}", ' \
            'qual="{}", filter="{}", info={}, format={}, sample={}, gender={}, ' \
            'mnv_code={})'.format(type(self).__name__, self.chrom,
            self.position, self.variant_id, self.ref_allele,
            ','.join(self.alt_alleles), self.qual, self.filter, info, keys,
            sample, gender, mnv_code)

    def __hash__(self):
        # the hash is derived from the repr text
        return hash(str(self))

    def __eq__(self, other):
        # NOTE(review): equality is defined purely as hash equality
        return hash(self) == hash(other)

    def _set_gender(self, gender):
        """ sets the gender of the individual for the variant

        Args:
            gender: sex code, which must be in male_codes or female_codes

        Raises:
            ValueError: if the code is not a known male or female code
        """

        if gender in self.male_codes:
            self.gender = "male"
        elif gender in self.female_codes:
            self.gender = "female"
        else:
            raise ValueError("unknown gender code")

        # the inheritance type depends on the sex, so refresh it
        self.set_inheritance_type(self.get_position(), self.is_male())

    def _get_gender(self):
        """ returns the normalised gender ("male", "female", or None if unset)
        """
        return self.gender

    def set_mutation_id(self, variant_id):
        """ sets the mutation ID based on the VCF ID field

        The variant ID can be either "." for null value, an rsID, a HGMD ID,
        a COSMIC ID, or any combination of those (including multiple HGMD IDs
        for a single variant).

        Args:
            variant_id: string from the VCF ID field, can be rsID, or a list
                of &-separated IDs, which can include COSMIC and HGMD IDs.
        """

        if variant_id == ".":
            return

        # include everything that isn't an rsID
        ids = [x for x in variant_id.split("&") if not x.startswith("rs")]

        # only overwrite the current mutation ID if we found a non-rsID
        if ids:
            self.mutation_id = ",".join(ids)

    def get_mutation_id(self):
        """ return the mutation ID
        """
        return self.mutation_id

    def is_male(self):
        """ returns True/False for whether the person is male
        """
        return self._get_gender() in self.male_codes

    def add_format(self, keys, values):
        """Parses the FORMAT column from VCF files.

        Args:
            keys: FORMAT text from a line in a VCF file
            values: the values for the format keys
        """

        self.format = dict(zip(keys.split(":"), values.split(":")))

    def get_low_depth_alleles(self, ref, alts):
        ''' get a list of alleles with zero counts, or indels with 1 read

        Some variants have multiple alts, so we need to select the alt with
        the most severe consequence. However, in at least one version of the
        VCFs, one of the alts could have zero counts, which I believe resulted
        from the population based multi-sample calling. We need to drop the
        consequences recorded for zero-count alternate alleles before finding
        the most severe.

        We also want to avoid indels with only one read, because these are
        universally bad calls.

        Args:
            ref: reference allele
            alts: tuple of alt alleles

        Returns:
            list of alleles with sufficiently low depth
        '''

        def is_indel(x, y):
            return len(x) > 1 or len(y) > 1

        # without an AC field, assume every allele has a non-zero count
        allele_counts = ['1'] * len(alts)
        if 'AC' in self.info:
            allele_counts = self.info['AC'].split(',')

        # Without an AD field, assume every allele has sufficient depth. The
        # format is None if the variant was created without FORMAT and sample
        # values, so guard against that before checking for the AD key.
        allele_depths = ['10'] * len(alts)
        if self.format is not None and 'AD' in self.format:
            # the first AD value is dropped, leaving one depth per alt allele
            allele_depths = self.format['AD'].split(',')[1:]

        counts = list(zip(allele_counts, allele_depths))
        assert len(counts) == len(alts)

        # find the positions of alleles where the allele count is zero,
        # or indels with 1 alt read
        pos = set()
        for i, (count, depth) in enumerate(counts):
            if count == '0':
                pos.add(i)
            elif depth == '1' and is_indel(ref, alts[i]):
                pos.add(i)

        # return the alleles with zero-count, so we can mask them out
        return [alts[i] for i in sorted(pos)]

    def add_vcf_line(self, vcf_line):
        """ store the VCF line for the variant
        """
        self.vcf_line = vcf_line

    def get_vcf_line(self):
        """ return the stored VCF line (None if it hasn't been added)
        """
        return self.vcf_line

    def set_inheritance_type(self, pos, is_male):
        """ sets the chromosome type (eg autosomal, or X chromosome type).

        This only does simple string matching on the chromosome. Only "X",
        "chrX", "ChrX", "Y", "chrY" and "ChrY" are recognised as sex
        chromosomes. Every other chromosome string (including numeric codes
        such as "23" or "24") is treated as autosomal. Positions strictly
        within a pseudoautosomal region of a sex chromosome are also treated
        as autosomal.

        Args:
            pos: position on the chromosome
            is_male: True/False for whether the individual is male
        """

        chrom = self.get_chrom()
        if chrom in ("chrX", "ChrX", "X"):
            regions = self.x_pseudoautosomal_regions
            sex_linked = "XChrMale" if is_male else "XChrFemale"
        elif chrom in ("chrY", "ChrY", "Y"):
            regions = self.y_pseudoautosomal_regions
            sex_linked = "YChrMale" if is_male else "YChrFemale"
        else:
            self.inheritance_type = "autosomal"
            return

        # check if the position lies within a pseudoautosomal region
        if any(start < pos < end for start, end in regions):
            self.inheritance_type = "autosomal"
        else:
            self.inheritance_type = sex_linked

    def get_inheritance_type(self):
        """ return the variant chromosomal inheritance type
        """
        return self.inheritance_type

    def get_chrom(self):
        """ return the variant chromosome
        """
        return self.chrom

    def get_position(self):
        """ return the variant chromosomal position
        """
        return self.position

    def get_genotype(self):
        """ return the genotype value
        """
        return self.genotype

    def get_sum_x_lr2(self):
        """ return the sum of mean l2r on x chromsome
        """
        return self.sum_x_lr2

    def get_has_parents(self):
        """returns false for singletons, true for trios
        """
        return self.has_parents
class TestVariantInfoPy(unittest.TestCase):
    """ unit testing of the Info class
    """

    def setUp(self):
        """ define a default Info object
        """

        self.pops = ["AFR_AF", "AMR_AF", "ASN_AF", "DDD_AF", "EAS_AF",
            "ESP_AF", "EUR_AF", "MAX_AF", "SAS_AF", "UK10K_cohort_AF"]
        Info.set_populations(self.pops)

        # set up a default Info object, which individual tests can modify
        self.info = Info("HGNC_ID=1001;CQ=missense_variant;random_tag")

    def tearDown(self):
        """ clear the populations, so that tests don't affect each other
        """
        Info.set_populations([])

    def test_get_consequence(self):
        """ test that get_consequence works correctly
        """

        chrom, pos = '1', 1000
        info = Info('CQ=missense_variant;HGNC=TEST')
        alts = ('C', )

        # check that in the absence of any known conserved final exon positions,
        # the consequence is unchanged.
        self.assertEqual(info.get_consequences(chrom, pos, alts, []),
            [['missense_variant']])

        info = Info('CQ=missense_variant|stop_gained;HGNC=TEST|TEST2')
        self.assertEqual(info.get_consequences(chrom, pos, alts, []),
            [['missense_variant', 'stop_gained']])

    def test_get_consequence_last_base(self):
        '''check get_consequence() works with last base of exon changes
        '''

        chrom, pos = '1', 1000
        alts = ('C', )
        info = Info('CQ=missense_variant;HGNC=TEST')
        info.set_genes_and_consequence(chrom, pos, alts, [])

        # Now check that if the variant is at a position where it is a final
        # base in an exon with a conserved base, the consequence gets converted.
        info.last_base = {("1", 1000)}
        self.assertEqual(info.get_consequences(chrom, pos, alts, []),
            [["conserved_exon_terminus_variant"]])

        # If we have a variant in multiple genes, check that it only alters the
        # missense/splice_region variants, and doesn't alter synonymous variants
        # (since these will be in transcripts where the variant is distant from
        # an exon boundary.)
        info = Info('CQ=missense_variant|synonymous_variant;HGNC=TEST|TEST1')
        info.set_genes_and_consequence(chrom, pos, alts, [])
        info.last_base = {("1", 1000)}
        self.assertEqual(info.get_consequences(chrom, pos, alts, []),
            [["conserved_exon_terminus_variant", "synonymous_variant"]])

    def test_get_consequence_multiallelic(self):
        ''' test that get_consequence works correctly with multiple alleles
        '''

        chrom, pos = '1', 1000
        info = Info('CQ=missense_variant,synonymous_variant')
        alts = ('C', 'G')

        self.assertEqual(info.get_consequences(chrom, pos, alts, []),
            [['missense_variant'], ['synonymous_variant']])

    def test_get_consequence_multiallelic_with_masked(self):
        ''' test that get_consequence drops consequences for masked alleles
        '''

        chrom, pos = '1', 1000
        info = Info('CQ=missense_variant,synonymous_variant')
        alts = ('C', 'G')

        self.assertEqual(info.get_consequences(chrom, pos, alts, ['G']),
            [['missense_variant']])

    def test_parse_gene_symbols(self):
        """ test that parse_gene_symbols() works correctly
        """

        alts = ('C', )

        # check for when a HGNC key exists
        self.info["HGNC_ID"] = "A"
        genes = self.info.parse_gene_symbols(alts, [])
        self.assertEqual(genes, [Symbols(info={'HGNC_ID': 'A'}, idx=0)])

        # check for when a HGNC key doesn't exist
        del self.info["HGNC_ID"]
        genes = self.info.parse_gene_symbols(alts, [])
        self.assertEqual(genes, [Symbols(info={}, idx=0)])

        # check for multiple gene symbols
        self.info["HGNC_ID"] = "A|B|C"
        genes = self.info.parse_gene_symbols(alts, [])
        self.assertEqual(genes, [Symbols(info={'HGNC_ID': 'A|B|C'}, idx=0)])

        # check for multiple gene symbols, when some are missing
        self.info["HGNC_ID"] = "|.|C"
        genes = self.info.parse_gene_symbols(alts, [])
        self.assertEqual(genes, [Symbols(info={'HGNC_ID': '||C'}, idx=0)])

        # check for multiple gene symbols, when some missing symbols have
        # alternates in other symbol fields.
        self.info["HGNC_ID"] = ".|.|C"
        self.info["HGNC"] = "Z|.|C"
        genes = self.info.parse_gene_symbols(alts, [])
        self.assertEqual(genes,
            [Symbols(info={'HGNC_ID': '||C', 'HGNC': 'Z||C'}, idx=0)])

        # Check that including alternate symbols has the correct precedence
        # order. Note that doing this properly would require checking all of the
        # possible order combinations.
        self.info["HGNC_ID"] = ".|.|C"
        self.info["HGNC"] = "Z|.|C"
        self.info["SYMBOL"] = "A|.|C"
        genes = self.info.parse_gene_symbols(alts, [])
        self.assertEqual(genes, [Symbols(info={'HGNC_ID': '||C',
            'HGNC': 'Z||C', "SYMBOL": "A||C"}, idx=0)])

    def test_parse_gene_symbols_multi_alts(self):
        ''' check parse_gene_symbols() when we have multiple alleles
        '''

        fields = ['HGNC_ID', 'HGNC', 'SYMBOL', 'ENSG', 'ENST', 'ENSP', 'ENSR']
        info = Info(
            'HGNC_ID=D,E;HGNC=D,E;SYMBOL=D,E;ENSG=D,E;ENST=D,E;ENSP=D,E;ENSR=D,E')
        alts = ('G', 'C')

        # every symbol field holds the same value for a given allele
        self.assertEqual(info.parse_gene_symbols(alts, []), [
            Symbols(info=dict.fromkeys(fields, 'D'), idx=0),
            Symbols(info=dict.fromkeys(fields, 'E'), idx=0)])

        # if we have more alleles than the available symbols, we get an error
        # NOTE: this doesn't check if we have fewer alleles than symbols
        # NOTE: this uses the default self.info from setUp (a single HGNC_ID
        # value), rather than the multi-allelic info defined above.
        alts = ('G', 'T', 'C')
        with self.assertRaises(IndexError):
            self.info.parse_gene_symbols(alts, [])

    def test_parse_gene_symbols_multi_alts_multi_symbols(self):
        ''' check parse_gene_symbols() when we have multiple symbols per allele
        '''

        fields = ['HGNC_ID', 'HGNC', 'SYMBOL', 'ENSG', 'ENST', 'ENSP', 'ENSR']
        info = Info(
            'HGNC_ID=D|X,E|Y;HGNC=D|X,E|Y;SYMBOL=D|X,E|Y;ENSG=D|X,E|Y;'
            'ENST=D|X,E|Y;ENSP=D|X,E|Y;ENSR=D|X,E|Y')
        alts = ('G', 'C')

        self.assertEqual(info.parse_gene_symbols(alts, []), [
            Symbols(info=dict.fromkeys(fields, 'D|X'), idx=0),
            Symbols(info=dict.fromkeys(fields, 'E|Y'), idx=0)])

    def test_parse_gene_symbols_multi_alts_masked_alt(self):
        ''' check parse_gene_symbols() when we mask alt alleles
        '''

        fields = ['HGNC_ID', 'HGNC', 'SYMBOL', 'ENSG', 'ENST', 'ENSP', 'ENSR']
        info = Info(
            'HGNC_ID=D|X,E|Y;HGNC=D|X,E|Y;SYMBOL=D|X,E|Y;ENSG=D|X,E|Y;'
            'ENST=D|X,E|Y;ENSP=D|X,E|Y;ENSR=D|X,E|Y')
        alts = ('G', 'C')

        # mask one allele
        self.assertEqual(info.parse_gene_symbols(alts, ['C']),
            [Symbols(info=dict.fromkeys(fields, 'D|X'), idx=0)])

        # mask both alleles
        self.assertEqual(info.parse_gene_symbols(alts, ['C', 'G']), [])

    def test_parse_gene_symbols_missing_gene(self):
        ''' check parse_gene_symbols() when the INFO lacks any gene symbols
        '''

        # remove the only possibly source of the gene symbol
        info = Info('')
        alts = ('C', )

        genes = info.parse_gene_symbols(alts, [])
        self.assertEqual(genes, [Symbols(info={}, idx=0)])

    def test_is_lof(self):
        """ test that is_lof() works correctly
        """

        # NOTE: alts is always passed as a tuple. ('G') without the trailing
        # comma is just the string 'G'.
        alts = ('G', )

        # check that known LOF consequence return True
        info = Info('CQ=stop_gained;HGNC=TEST')
        info.set_genes_and_consequence('1', 100, alts, [])
        self.assertTrue(info.is_lof())

        # check that known non-LOF consequence returns False
        info = Info('CQ=missense_variant;HGNC=TEST')
        info.set_genes_and_consequence('1', 100, alts, [])
        self.assertFalse(info.is_lof())

        # check that null values return False
        info = Info('HGNC=TEST')
        info.set_genes_and_consequence('1', 100, alts, [])
        self.assertFalse(info.is_lof())

        # check when the variant overlaps multiple genes (so has multiple
        # gene symbols and consequences).
        info = Info('CQ=stop_gained|missense_variant;HGNC=ATRX|TTN')
        info.set_genes_and_consequence('1', 100, alts, [])
        self.assertTrue(info.is_lof())
        self.assertTrue(info.is_lof("ATRX"))
        self.assertFalse(info.is_lof("TTN"))

        # check that when we have a MNV, we can lose or gain a LOF annotation
        info.mnv_code = 'masked_stop_gain_mnv'
        self.assertFalse(info.is_lof("ATRX"))

        info.mnv_code = 'modified_stop_gained_mnv'
        self.assertTrue(info.is_lof("TTN"))

    def test_is_missense(self):
        """ test that is_missense() works correctly
        """

        # NOTE: alts is always passed as a tuple. ('G') without the trailing
        # comma is just the string 'G'.
        alts = ('G', )

        # check that known missense equivalent consequence return True
        info = Info('CQ=missense_variant;HGNC=TEST')
        info.set_genes_and_consequence('1', 100, alts, [])
        self.assertTrue(info.is_missense(is_cnv=False))

        # check that known LoF equivalent consequence returns False
        info = Info('CQ=stop_gained;HGNC=TEST')
        info.set_genes_and_consequence('1', 100, alts, [])
        self.assertFalse(info.is_missense(is_cnv=False))

        # check that null values return False
        info = Info('HGNC=TEST')
        info.set_genes_and_consequence('1', 100, alts, [])
        self.assertFalse(info.is_missense(is_cnv=False))

        # check when the variant overlaps multiple genes (so has multiple
        # gene symbols and consequences).
        info = Info('CQ=missense_variant|synonymous_variant;HGNC=ATRX|TTN')
        info.set_genes_and_consequence('1', 100, alts, [])
        self.assertTrue(info.is_missense(is_cnv=False))
        self.assertTrue(info.is_missense(False, "ATRX"))
        self.assertFalse(info.is_missense(False, "TTN"))

        # check that when we have a MNV, we can lose or gain a missense
        # annotation
        info.mnv_code = 'modified_synonymous_mnv'
        self.assertFalse(info.is_missense(False, "ATRX"))

        info.mnv_code = 'modified_protein_altering_mnv'
        self.assertTrue(info.is_missense(False, "TTN"))

        # check that masked stop gained MNVs are converted to a missense
        info = Info('CQ=stop_gained;HGNC=ATRX')
        info.set_genes_and_consequence('1', 100, alts, [])
        info.mnv_code = 'masked_stop_gain_mnv'
        self.assertTrue(info.is_missense(False))

    def test_is_missense_cnv(self):
        ''' test that is_missense() works correctly for CNVs
        '''

        chrom, pos, alts, = '1', '15000000', ('G', )
        info = Info('HGNC=ATRX;CQ=coding_sequence_variant;random_tag')
        info.set_genes_and_consequence(chrom, pos, alts, [])

        # the same consequence only counts as missense when flagged as a CNV
        self.assertTrue(info.is_missense(is_cnv=True))
        self.assertFalse(info.is_missense(is_cnv=False))

    def test_get_per_gene_consequence(self):
        """ test that get_per_gene_consequence works correctly
        """

        self.info.symbols = [Symbols(info={'HGNC': 'ATRX'}, idx=0)]
        self.info.consequence = [["missense_variant"]]

        self.assertEqual(self.info.get_per_gene_consequence(None),
            ["missense_variant"])
        self.assertEqual(self.info.get_per_gene_consequence("ATRX"),
            ["missense_variant"])
        self.assertEqual(self.info.get_per_gene_consequence("TEST"), [])

        # check a variant with consequences in multiple genes, that we only
        # pull out the consequences for a single gene
        self.info.symbols = [Symbols(info={'HGNC': 'ATRX|TTN'}, idx=0)]
        self.info.consequence = [["missense_variant", "synonymous_variant"]]
        self.assertEqual(self.info.get_per_gene_consequence("ATRX"),
            ["missense_variant"])
        self.assertEqual(self.info.get_per_gene_consequence("TTN"),
            ["synonymous_variant"])

        # check a symbol where two symbols match, we only use the first
        # consequence
        self.info.symbols = [Symbols(info={'HGNC': 'TEMP|ATRX|TEMP'}, idx=0)]
        self.info.consequence = [["splice_acceptor_variant",
            "missense_variant", "synonymous_variant"]]
        self.assertEqual(self.info.get_per_gene_consequence("TEMP"),
            ["splice_acceptor_variant"])

        # check a symbol with some None gene symbols
        self.info.symbols = [Symbols(info={'HGNC': '|ATRX|'}, idx=0)]
        self.info.consequence = [["splice_acceptor_variant",
            "missense_variant", "synonymous_variant"]]
        self.assertEqual(self.info.get_per_gene_consequence("ATRX"),
            ["missense_variant"])

    def test_get_allele_frequency(self):
        """ tests that number conversion works as expected
        """

        # single number returns that number
        self.assertEqual(self.info.get_allele_frequency("1"), 1)

        # two numbers return one number
        self.assertEqual(self.info.get_allele_frequency("1,1"), 1)

        # two numbers return the highest number
        self.assertEqual(self.info.get_allele_frequency("1,2"), 2)

        # number and string return the number
        self.assertEqual(self.info.get_allele_frequency("a,1"), 1)

        # single string value returns None
        self.assertEqual(self.info.get_allele_frequency("a"), None)

        # multiple string values return None
        self.assertEqual(self.info.get_allele_frequency("a,b"), None)

        # a None value returns None
        self.assertEqual(self.info.get_allele_frequency(None), None)

    def test_is_number(self):
        """ tests that we can check if a value represents a number
        """

        self.assertEqual(self.info.is_number(None), False)
        self.assertEqual(self.info.is_number("5"), True)
        self.assertEqual(self.info.is_number("a"), False)

    def test_find_max_allele_frequency(self):
        """ test if the MAF finder operates correctly
        """

        # check for var without recorded MAF
        self.assertIsNone(self.info.find_max_allele_frequency())

        # check for single population
        self.info["MAX_AF"] = "0.005"
        self.assertEqual(self.info.find_max_allele_frequency(), 0.005)

        # check for two populations
        self.info["AFR_AF"] = "0.01"
        self.assertEqual(self.info.find_max_allele_frequency(), 0.01)

        # check for all populations
        pops = {"AFR_AF", "AMR_AF", "ASN_AF", "DDD_AF", "EAS_AF",
            "ESP_AF", "EUR_AF", "MAX_AF", "SAS_AF", "UK10K_cohort_AF"}
        for pop in pops:
            self.info[pop] = "0.05"
        self.assertEqual(self.info.find_max_allele_frequency(), 0.05)

        # make sure we can handle having None values
        self.info["AFR_AF"] = None
        self.assertEqual(self.info.find_max_allele_frequency(), 0.05)

    def test_find_max_allele_frequency_without_populations(self):
        ''' test if the MAF finder operates correctly when we haven't set any
        populations to check
        '''

        self.info["MAX_AF"] = "0.005"

        # this is a regression test for a problem that only occurs if the unit
        # tests are run in an order such that the populations might not have
        # been set in previous commits.
        Info.set_populations([])
        self.assertEqual(self.info.find_max_allele_frequency(), None)

        # reset the populations, so that other unit tests can also rely on the
        # populations being set
        Info.set_populations(self.pops)