Exemple #1
0
    def setUp(self):
        """ build the SNV fixture shared by the tests

        The population tags are registered with Info before the variant is
        constructed. The variant is stored as self.var, next to the raw FORMAT
        keys (self.keys) and sample values (self.values) used to build it.
        """

        self.pops = [
            "AFR_AF", "AMR_AF", "ASN_AF", "DDD_AF", "EAS_AF",
            "ESP_AF", "EUR_AF", "MAX_AF", "SAS_AF", "UK10K_cohort_AF",
        ]
        Info.set_populations(self.pops)

        self.keys = "GT:DP:AD"
        self.values = "0/1:50:10,10"

        # positional fields, in order: chrom, pos, id, ref, alt, qual, filter
        site = ("1", "15000000", ".", "A", "G", "1000", "PASS")
        self.var = SNV(*site,
                       info="HGNC_ID=1001;CQ=missense_variant;random_tag",
                       format=self.keys,
                       sample=self.values)
    def test_get_low_depth_alleles(self):
        ''' test that get_low_depth_alleles() works correctly
        '''

        # each case holds: the INFO string, a replacement for self.var.format
        # (None leaves the current format untouched), the alt alleles, and the
        # alleles we expect to be reported
        cases = [
            # single allele where the count is non-zero
            ('AC=1', None, ('C', ), []),
            # single allele with a zero count
            ('AC=0', None, ('C', ), ['C']),
            # multiallelic, where both counts are non-zero
            ('AC=1,1', {'AD': '5,10,10'}, ('C', 'G'), []),
            # multiallelic, where one allele has a zero count
            ('AC=1,0', {'AD': '5,10,10'}, ('C', 'G'), ['G']),
        ]

        for info_string, depths, alts, expected in cases:
            self.var.info = Info(info_string)
            if depths is not None:
                self.var.format = depths
            observed = self.var.get_low_depth_alleles('G', alts)
            self.assertEqual(observed, expected)
    def test_parse_gene_symbols_multi_alts_multi_symbols(self):
        ''' check parse_gene_symbols() when we have multiple symbols per allele
        '''

        fields = ('HGNC_ID', 'HGNC', 'SYMBOL', 'ENSG', 'ENST', 'ENSP', 'ENSR')
        info = Info('HGNC_ID=D|X,E|Y;HGNC=D|X,E|Y;SYMBOL=D|X,E|Y;ENSG=D|X,E|Y;'
                    'ENST=D|X,E|Y;ENSP=D|X,E|Y;ENSR=D|X,E|Y')

        observed = info.parse_gene_symbols(('G', 'C'), [])

        # one Symbols entry per allele, each keeping its pipe-separated value
        # for every symbol field
        expected = [
            Symbols(info=dict.fromkeys(fields, 'D|X'), idx=0),
            Symbols(info=dict.fromkeys(fields, 'E|Y'), idx=0),
        ]
        self.assertEqual(observed, expected)
    def setUp(self):
        """ build the family, a chrX trio variant and a Report for the tests
        """

        # family with a female child; both parents get the status code "1"
        proband_sex = "F"
        self.trio = self.create_family(proband_sex, "1", "1")

        # all three members are genotyped at the same chrX site; only the
        # child carries the alt allele and the extra INFO fields
        site = {'chrom': 'X', 'pos': 150}
        child = create_snv(proband_sex, "0/1",
                           extra_info='HGNC=TEST;MAX_AF=0.0005', **site)
        mom = create_snv("F", "0/0", **site)
        dad = create_snv("M", "0/0", **site)

        trio_variant = TrioGenotypes('X', '150', child, mom, dad)
        self.variants = [trio_variant]

        self.report = Report(None, None, None)

        populations = [
            "AFR_AF", "AMR_AF", "ASN_AF", "DDD_AF", "EAS_AF",
            "ESP_AF", "EUR_AF", "MAX_AF", "SAS_AF", "UK10K_cohort_AF",
        ]
        Info.set_populations(populations)
Exemple #5
0
    def __init__(self,
                 chrom,
                 position,
                 id,
                 ref,
                 alts,
                 qual,
                 filter,
                 info=None,
                 format=None,
                 sample=None,
                 gender=None,
                 sum_x_lr2=None,
                 parents=None,
                 mnv_code=None):
        """ initialise the variant from its definition values

        Args:
            chrom: chromosome, stored unchanged.
            position: position on the chromosome, converted with int().
            id: variant ID, stored as variant_id and passed to
                set_mutation_id().
            ref: reference allele.
            alts: comma-separated alternate alleles, split into a tuple.
            qual: quality value, stored unchanged.
            filter: filter value, stored unchanged.
            info: raw INFO value handed to the Info constructor.
            format: FORMAT keys; only used when sample is also given.
            sample: sample values matching the FORMAT keys.
            gender: if not None, passed to _set_gender().
            sum_x_lr2: stored unchanged on self.sum_x_lr2.
            parents: stored unchanged on self.has_parents.
            mnv_code: stored on self.mnv_code and passed to Info.
        """

        self.chrom = chrom
        self.position = int(position)

        # mutation_id starts as "NA" before set_mutation_id() is given the ID
        self.variant_id = id
        self.mutation_id = "NA"
        self.set_mutation_id(self.variant_id)

        self.ref_allele = ref
        self.alt_alleles = tuple(alts.split(','))

        self.mnv_code = mnv_code
        self.qual = qual
        self.filter = filter

        self.sum_x_lr2 = sum_x_lr2

        self.has_parents = parents

        # initialise variables that will be set later
        self.inheritance_type = None

        self.gender = None
        if gender is not None:
            self._set_gender(gender)

        self.vcf_line = None

        # the format is only populated when both the keys and values are given
        self.format = None
        if format is not None and sample is not None:
            self.add_format(format, sample)

        # self.format and self.info are both assigned before the low depth
        # alleles are looked up; those alleles are then passed on as masked
        # when setting the genes and consequences
        self.info = Info(info, self.mnv_code)
        masked = self.get_low_depth_alleles(self.ref_allele, self.alt_alleles)
        self.info.set_genes_and_consequence(self.get_chrom(),
                                            self.get_position(),
                                            self.alt_alleles, masked)

        # a genotype is only set when both the format and gender are available
        self.genotype = None
        if self.format is not None and self._get_gender() is not None:
            self.set_genotype()
 def test_parse_gene_symbols_missing_gene(self):
     ''' check parse_gene_symbols() on an INFO string without symbol fields
     '''

     # an empty INFO string leaves no possible source for a gene symbol
     empty = Info('')

     observed = empty.parse_gene_symbols(('C', ), [])

     # we still expect one entry for the allele, holding no symbols
     expected = [Symbols(info={}, idx=0)]
     self.assertEqual(observed, expected)
 def setUp(self):
     """ register the population tags and build a default Info object
     """

     populations = [
         "AFR_AF", "AMR_AF", "ASN_AF", "DDD_AF", "EAS_AF",
         "ESP_AF", "EUR_AF", "MAX_AF", "SAS_AF", "UK10K_cohort_AF",
     ]
     self.pops = populations
     Info.set_populations(populations)

     # the Info object that the tests read from and modify
     default_fields = "HGNC_ID=1001;CQ=missense_variant;random_tag"
     self.info = Info(default_fields)
    def test_parse_gene_symbols_missing_gene(self):
        ''' check parse_gene_symbols() on an INFO string without symbol fields
        '''

        # an empty INFO string leaves no possible source for a gene symbol
        empty = Info('')

        observed = empty.parse_gene_symbols(('C', ), [])

        # we still expect one entry for the allele, holding no symbols
        expected = [Symbols(info={}, idx=0)]
        self.assertEqual(observed, expected)
 def test_get_consequence_multiallelic_with_masked(self):
     ''' test get_consequences() with two alleles, one of which is masked
     '''

     alleles = ('C', 'G')
     info = Info('CQ=missense_variant,synonymous_variant')

     # 'G' is passed in as masked, so only one consequence list is expected
     observed = info.get_consequences('1', 1000, alleles, ['G'])
     self.assertEqual(observed, [['missense_variant']])
    def test_get_consequence_multiallelic(self):
        ''' test get_consequences() with two unmasked alleles
        '''

        alleles = ('C', 'G')
        info = Info('CQ=missense_variant,synonymous_variant')

        # nothing is masked, so we expect one consequence list per allele
        observed = info.get_consequences('1', 1000, alleles, [])
        expected = [['missense_variant'], ['synonymous_variant']]
        self.assertEqual(observed, expected)
 def test_is_missense_cnv(self):
     ''' test that is_missense() depends on the is_cnv flag
     '''

     info = Info('HGNC=ATRX;CQ=coding_sequence_variant;random_tag')
     info.set_genes_and_consequence('1', '15000000', ('G',), [])

     # the same consequence is expected to count only when flagged as a CNV
     as_cnv = info.is_missense(is_cnv=True)
     self.assertTrue(as_cnv)

     as_non_cnv = info.is_missense(is_cnv=False)
     self.assertFalse(as_non_cnv)
    def setUp(self):
        """ register the population tags and build a default Info object
        """

        populations = [
            "AFR_AF", "AMR_AF", "ASN_AF", "DDD_AF", "EAS_AF",
            "ESP_AF", "EUR_AF", "MAX_AF", "SAS_AF", "UK10K_cohort_AF",
        ]
        self.pops = populations
        Info.set_populations(populations)

        # the Info object that the tests read from and modify
        default_fields = "HGNC_ID=1001;CQ=missense_variant;random_tag"
        self.info = Info(default_fields)
 def test_parse_gene_symbols_multi_alts_multi_symbols(self):
     ''' check parse_gene_symbols() when we have multiple symbols per allele
     '''

     fields = ('HGNC_ID', 'HGNC', 'SYMBOL', 'ENSG', 'ENST', 'ENSP', 'ENSR')
     info = Info('HGNC_ID=D|X,E|Y;HGNC=D|X,E|Y;SYMBOL=D|X,E|Y;ENSG=D|X,E|Y;'
                 'ENST=D|X,E|Y;ENSP=D|X,E|Y;ENSR=D|X,E|Y')

     observed = info.parse_gene_symbols(('G', 'C'), [])

     # one Symbols entry per allele, each keeping its pipe-separated value
     # for every symbol field
     expected = [
         Symbols(info=dict.fromkeys(fields, 'D|X'), idx=0),
         Symbols(info=dict.fromkeys(fields, 'E|Y'), idx=0),
     ]
     self.assertEqual(observed, expected)
    def test_find_max_allele_frequency_without_populations(self):
        ''' test if the MAF finder operates correctly when we haven't set any
        populations to check
        '''

        self.info["MAX_AF"] = "0.005"

        # this is a regression test for a problem that only occurs if the unit
        # tests are run in an order such that the populations might not have
        # been set beforehand.
        Info.set_populations([])
        try:
            self.assertEqual(self.info.find_max_allele_frequency(), None)
        finally:
            # reset the populations even if the assertion fails, so that a
            # failure here does not leave the class-level populations empty
            # for other unit tests that rely on them being set
            Info.set_populations(self.pops)
 def test_get_consequence(self):
     """ test that get_consequences() reports the CQ values unchanged
     """

     chrom, pos, alts = '1', 1000, ('C',)

     # with no conserved final exon positions in play, the consequences
     # should come back unchanged, whether there is one gene or several
     cases = [
         ('CQ=missense_variant;HGNC=TEST',
             [['missense_variant']]),
         ('CQ=missense_variant|stop_gained;HGNC=TEST|TEST2',
             [['missense_variant', 'stop_gained']]),
     ]

     for info_string, expected in cases:
         info = Info(info_string)
         observed = info.get_consequences(chrom, pos, alts, [])
         self.assertEqual(observed, expected)
 def test_find_max_allele_frequency_without_populations(self):
     ''' test if the MAF finder operates correctly when we haven't set any
     populations to check
     '''

     self.info["MAX_AF"] = "0.005"

     # this is a regression test for a problem that only occurs if the unit
     # tests are run in an order such that the populations might not have
     # been set beforehand.
     Info.set_populations([])
     try:
         self.assertEqual(self.info.find_max_allele_frequency(), None)
     finally:
         # reset the populations even if the assertion fails, so that a
         # failure here does not leave the class-level populations empty
         # for other unit tests that rely on them being set
         Info.set_populations(self.pops)
 def test_parse_gene_symbols_multi_alts_masked_alt(self):
     ''' check parse_gene_symbols() when alt alleles are passed as masked
     '''

     fields = ('HGNC_ID', 'HGNC', 'SYMBOL', 'ENSG', 'ENST', 'ENSP', 'ENSR')
     alleles = ('G', 'C')
     info = Info('HGNC_ID=D|X,E|Y;HGNC=D|X,E|Y;SYMBOL=D|X,E|Y;ENSG=D|X,E|Y;'
                 'ENST=D|X,E|Y;ENSP=D|X,E|Y;ENSR=D|X,E|Y')

     # with 'C' masked, only the entry for the first allele should remain
     one_masked = info.parse_gene_symbols(alleles, ['C'])
     remaining = [Symbols(info=dict.fromkeys(fields, 'D|X'), idx=0)]
     self.assertEqual(one_masked, remaining)

     # with both alleles masked, nothing should be returned
     both_masked = info.parse_gene_symbols(alleles, ['C', 'G'])
     self.assertEqual(both_masked, [])
 def test_parse_gene_symbols_multi_alts(self):
     ''' check parse_gene_symbols() when we have multiple alleles
     '''

     fields = ('HGNC_ID', 'HGNC', 'SYMBOL', 'ENSG', 'ENST', 'ENSP', 'ENSR')
     info = Info('HGNC_ID=D,E;HGNC=D,E;SYMBOL=D,E;ENSG=D,E;ENST=D,E;ENSP=D,E;ENSR=D,E')

     observed = info.parse_gene_symbols(('G', 'C'), [])

     # each allele should pick up its own comma-separated value per field
     expected = [
         Symbols(info=dict.fromkeys(fields, 'D'), idx=0),
         Symbols(info=dict.fromkeys(fields, 'E'), idx=0),
     ]
     self.assertEqual(observed, expected)

     # if we have more alleles than the available symbols, we get an error
     # NOTE: this doesn't check if we have fewer alleles than symbols
     # NOTE(review): this call uses self.info rather than the local info
     # defined above - confirm that is intended
     too_many_alts = ('G', 'T', 'C')
     with self.assertRaises(IndexError):
         self.info.parse_gene_symbols(too_many_alts, [])
Exemple #19
0
def load_variants(family,
                  pp_filter,
                  pops,
                  known_genes,
                  last_base,
                  sum_x_lr2,
                  debug_chrom=None,
                  debug_pos=None):
    """ loads the variants for a trio or singleton
    
    Args:
        family: Family object containing data for an affected proband
        pp_filter: float between 0 and 1, being the threshold for the PP_DNM filter
        pops: list of populations who have minor allele frequencies in INFO
        known_genes: genes known to be involved with genetic disorders.
        last_base: set of sites in genome at conserved last base of exons,
            where we upgrade the severity of variants to loss-of-function.
        sum_x_lr2: mapping of person ID to the sum of mean l2r on x
            chromosomes, for all probands. Probands without an entry are
            given a value of 0.
        debug_chrom: chromosome string, to give more information about why
            a variant fails to pass the filters.
        debug_pos: chromosome position, to give more information about why
            a variant fails to pass the filters.
    
    Returns:
        list of filtered variants for a trio, as TrioGenotypes objects
    """

    # define several parameters of the variant classes, before initialisation
    for Var in [SNV, CNV]:
        Var.set_known_genes(known_genes)
        Var.set_debug(debug_chrom, debug_pos)

    Info.set_last_base_sites(last_base)
    Info.set_populations(pops)

    # get sum of mean l2r for the proband, defaulting to 0 when the proband
    # has no entry
    sum_x_lr2_proband = sum_x_lr2.get(family.child.person_id, 0)

    variants = load_trio(family, sum_x_lr2_proband)

    return filter_de_novos(variants, pp_filter)
    def test_get_low_depth_alleles_bad_indel(self):
        ''' test that get_low_depth_alleles() works for indels with depth=1
        '''

        # both cases share the same allele counts and depths; the last alt
        # has a depth of 1
        self.var.info = Info('AC=1,1')
        self.var.format = {'AD': '5,10,1'}

        cases = [
            # multiallelic, where all should pass, since none are indels
            (('C', 'G'), []),
            # an indel alt whose corresponding depth is bad should fail
            (('C', 'GG'), ['GG']),
        ]

        for alts, expected in cases:
            observed = self.var.get_low_depth_alleles('G', alts)
            self.assertEqual(observed, expected)
 def setUp(self):
     """ build the SNV fixture shared by the tests

     The population tags are registered with Info before the variant is
     constructed. The variant is stored as self.var, next to the raw FORMAT
     keys (self.keys) and sample values (self.values) used to build it.
     """

     self.pops = [
         "AFR_AF", "AMR_AF", "ASN_AF", "DDD_AF", "EAS_AF",
         "ESP_AF", "EUR_AF", "MAX_AF", "SAS_AF", "UK10K_cohort_AF",
     ]
     Info.set_populations(self.pops)

     self.keys = "GT:DP:AD"
     self.values = "0/1:50:10,10"

     # positional fields, in order: chrom, pos, id, ref, alt, qual, filter
     site = ("1", "15000000", ".", "A", "G", "1000", "PASS")
     self.var = SNV(*site,
                    info="HGNC_ID=1001;CQ=missense_variant;random_tag",
                    format=self.keys,
                    sample=self.values)
def load_variants(family, pp_filter, pops, known_genes, last_base, sum_x_lr2,
        debug_chrom=None, debug_pos=None):
    """ loads the variants for a trio or singleton
    
    Args:
        family: Family object containing data for an affected proband
        pp_filter: float between 0 and 1, being the threshold for the PP_DNM filter
        pops: list of populations who have minor allele frequencies in INFO
        known_genes: genes known to be involved with genetic disorders.
        last_base: set of sites in genome at conserved last base of exons,
            where we upgrade the severity of variants to loss-of-function.
        sum_x_lr2: mapping of person ID to the sum of mean l2r on x
            chromosomes, for all probands. Probands without an entry are
            given a value of 0.
        debug_chrom: chromosome string, to give more information about why
            a variant fails to pass the filters.
        debug_pos: chromosome position, to give more information about why
            a variant fails to pass the filters.
    
    Returns:
        list of filtered variants for a trio, as TrioGenotypes objects
    """
    
    # define several parameters of the variant classes, before initialisation
    for Var in [SNV, CNV]:
        Var.set_known_genes(known_genes)
        Var.set_debug(debug_chrom, debug_pos)
    
    Info.set_last_base_sites(last_base)
    Info.set_populations(pops)
    
    # get sum of mean l2r for the proband, defaulting to 0 when the proband
    # has no entry
    sum_x_lr2_proband = sum_x_lr2.get(family.child.person_id, 0)
    
    variants = load_trio(family, sum_x_lr2_proband)
    
    return filter_de_novos(variants, pp_filter)
 def setUp(self):
     """ build the family, a chrX trio variant and a Report for the tests
     """

     # family with a female child; both parents get the status code "1"
     proband_sex = "F"
     self.trio = self.create_family(proband_sex, "1", "1")

     # all three members are genotyped at the same chrX site; only the
     # child carries the alt allele and the extra INFO fields
     site = {'chrom': 'X', 'pos': 150}
     child = create_snv(proband_sex, "0/1",
         extra_info='HGNC=TEST;MAX_AF=0.0005', **site)
     mom = create_snv("F", "0/0", **site)
     dad = create_snv("M", "0/0", **site)

     trio_variant = TrioGenotypes('X', '150', child, mom, dad)
     self.variants = [trio_variant]

     self.report = Report(None, None, None)

     populations = [
         "AFR_AF", "AMR_AF", "ASN_AF", "DDD_AF", "EAS_AF",
         "ESP_AF", "EUR_AF", "MAX_AF", "SAS_AF", "UK10K_cohort_AF",
     ]
     Info.set_populations(populations)
    def test_parse_gene_symbols_multi_alts(self):
        ''' check parse_gene_symbols() when we have multiple alleles
        '''

        fields = ('HGNC_ID', 'HGNC', 'SYMBOL', 'ENSG', 'ENST', 'ENSP', 'ENSR')
        info = Info('HGNC_ID=D,E;HGNC=D,E;SYMBOL=D,E;ENSG=D,E;ENST=D,E;ENSP=D,E;ENSR=D,E')

        observed = info.parse_gene_symbols(('G', 'C'), [])

        # each allele should pick up its own comma-separated value per field
        expected = [
            Symbols(info=dict.fromkeys(fields, 'D'), idx=0),
            Symbols(info=dict.fromkeys(fields, 'E'), idx=0),
        ]
        self.assertEqual(observed, expected)

        # if we have more alleles than the available symbols, we get an error
        # NOTE: this doesn't check if we have fewer alleles than symbols
        # NOTE(review): this call uses self.info rather than the local info
        # defined above - confirm that is intended
        too_many_alts = ('G', 'T', 'C')
        with self.assertRaises(IndexError):
            self.info.parse_gene_symbols(too_many_alts, [])
Exemple #25
0
    def test_is_compound_pair_unknown_gene(self):
        """check that is_compound_pair() excludes pairs for unknown genes
        """

        # two nearby variants on chrom 1, which we alter below
        first = self.create_variant(chrom="1", position="150", sex="F",
                                    cq="stop_gained")
        second = self.create_variant(chrom="1", position="160", sex="F",
                                     cq="stop_gained")

        first = self.set_compound_het_var(first, "110")
        second = self.set_compound_het_var(second, "101")
        pair = (first, second)

        # swap the child INFO of both variants for one that only has a
        # missense consequence, without any gene symbol field
        for var in pair:
            var.child.info = Info('CQ=missense_variant')
        for var in pair:
            var.child.info.set_genes_and_consequence('1', 100, ('G', ), [])

        # the pair should now be excluded
        is_pair = self.inh.is_compound_pair(first, second)
        self.assertFalse(is_pair)
    def test_parse_gene_symbols_multi_alts_masked_alt(self):
        ''' check parse_gene_symbols() when alt alleles are passed as masked
        '''

        fields = ('HGNC_ID', 'HGNC', 'SYMBOL', 'ENSG', 'ENST', 'ENSP', 'ENSR')
        alleles = ('G', 'C')
        info = Info('HGNC_ID=D|X,E|Y;HGNC=D|X,E|Y;SYMBOL=D|X,E|Y;ENSG=D|X,E|Y;'
                    'ENST=D|X,E|Y;ENSP=D|X,E|Y;ENSR=D|X,E|Y')

        # with 'C' masked, only the entry for the first allele should remain
        one_masked = info.parse_gene_symbols(alleles, ['C'])
        remaining = [Symbols(info=dict.fromkeys(fields, 'D|X'), idx=0)]
        self.assertEqual(one_masked, remaining)

        # with both alleles masked, nothing should be returned
        both_masked = info.parse_gene_symbols(alleles, ['C', 'G'])
        self.assertEqual(both_masked, [])
 def test_get_consequence_last_base(self):
     '''check get_consequences() works with last base of exon changes
     '''

     chrom, pos, alts = '1', 1000, ('C',)
     exon_end_sites = {("1", 1000)}

     # If the variant sits at a site listed in last_base, the missense
     # consequence should be converted.
     single = Info('CQ=missense_variant;HGNC=TEST')
     single.set_genes_and_consequence(chrom, pos, alts, [])
     single.last_base = exon_end_sites
     observed = single.get_consequences(chrom, pos, alts, [])
     self.assertEqual(observed, [["conserved_exon_terminus_variant"]])

     # If we have a variant in multiple genes, check that it only alters the
     # missense/splice_region variants, and doesn't alter synonymous variants
     # (since these will be in transcripts where the variant is distant from
     # an exon boundary.)
     multi = Info('CQ=missense_variant|synonymous_variant;HGNC=TEST|TEST1')
     multi.set_genes_and_consequence(chrom, pos, alts, [])
     multi.last_base = {("1", 1000)}
     observed = multi.get_consequences(chrom, pos, alts, [])
     self.assertEqual(observed,
         [["conserved_exon_terminus_variant", "synonymous_variant"]])
    def test_is_missense_cnv(self):
        ''' test that is_missense() depends on the is_cnv flag
        '''

        info = Info('HGNC=ATRX;CQ=coding_sequence_variant;random_tag')
        info.set_genes_and_consequence('1', '15000000', ('G', ), [])

        # the same consequence is expected to count only when flagged as a CNV
        as_cnv = info.is_missense(is_cnv=True)
        self.assertTrue(as_cnv)

        as_non_cnv = info.is_missense(is_cnv=False)
        self.assertFalse(as_non_cnv)
    def __init__(self, chrom, position, id, ref, alts, qual, filter, info=None,
            format=None, sample=None, gender=None, sum_x_lr2=None, parents=None, mnv_code=None):
        """ initialise the variant from its definition values
        
        Args:
            chrom: chromosome, stored unchanged.
            position: position on the chromosome, converted with int().
            id: variant ID, stored as variant_id and passed to
                set_mutation_id().
            ref: reference allele.
            alts: comma-separated alternate alleles, split into a tuple.
            qual: quality value, stored unchanged.
            filter: filter value, stored unchanged.
            info: raw INFO value handed to the Info constructor.
            format: FORMAT keys; only used when sample is also given.
            sample: sample values matching the FORMAT keys.
            gender: if not None, passed to _set_gender().
            sum_x_lr2: stored unchanged on self.sum_x_lr2.
            parents: stored unchanged on self.has_parents.
            mnv_code: stored on self.mnv_code and passed to Info.
        """
        
        self.chrom = chrom
        self.position = int(position)
        
        # mutation_id starts as "NA" before set_mutation_id() is given the ID
        self.variant_id = id
        self.mutation_id = "NA"
        self.set_mutation_id(self.variant_id)
        
        self.ref_allele = ref
        self.alt_alleles = tuple(alts.split(','))
        
        self.mnv_code = mnv_code
        self.qual = qual
        self.filter = filter

        self.sum_x_lr2 = sum_x_lr2

        self.has_parents = parents
         
        # initialise variables that will be set later
        self.inheritance_type = None
        
        self.gender = None
        if gender is not None:
            self._set_gender(gender)
        
        self.vcf_line = None
        
        # the format is only populated when both the keys and values are given
        self.format = None
        if format is not None and sample is not None:
            self.add_format(format, sample)
        
        # self.format and self.info are both assigned before the low depth
        # alleles are looked up; those alleles are then passed on as masked
        # when setting the genes and consequences
        self.info = Info(info, self.mnv_code)
        masked = self.get_low_depth_alleles(self.ref_allele, self.alt_alleles)
        self.info.set_genes_and_consequence(self.get_chrom(),
            self.get_position(), self.alt_alleles, masked)
        
        # a genotype is only set when both the format and gender are available
        self.genotype = None
        if self.format is not None and self._get_gender() is not None:
            self.set_genotype()
    def test_get_consequence(self):
        """ test that get_consequences() reports the CQ values unchanged
        """

        chrom, pos, alts = '1', 1000, ('C', )

        # with no conserved final exon positions in play, the consequences
        # should come back unchanged, whether there is one gene or several
        cases = [
            ('CQ=missense_variant;HGNC=TEST',
             [['missense_variant']]),
            ('CQ=missense_variant|stop_gained;HGNC=TEST|TEST2',
             [['missense_variant', 'stop_gained']]),
        ]

        for info_string, expected in cases:
            info = Info(info_string)
            observed = info.get_consequences(chrom, pos, alts, [])
            self.assertEqual(observed, expected)
Exemple #31
0
 def tearDown(self):
     """ clear the class-level state that the tests may have set
     """
     # drop the known genes from the SNV class, and empty the populations
     # registered on the Info class
     SNV.known_genes = None
     Info.set_populations([])
class TestVariantInfoPy(unittest.TestCase):
    """  unit testing of the Info class
    """
    
    def setUp(self):
        """ register the population tags and build a default Info object
        """

        populations = [
            "AFR_AF", "AMR_AF", "ASN_AF", "DDD_AF", "EAS_AF",
            "ESP_AF", "EUR_AF", "MAX_AF", "SAS_AF", "UK10K_cohort_AF",
        ]
        self.pops = populations
        Info.set_populations(populations)

        # the Info object that the tests read from and modify
        default_fields = "HGNC_ID=1001;CQ=missense_variant;random_tag"
        self.info = Info(default_fields)
    
    def tearDown(self):
        """ empty the populations registered on the Info class
        """
        # setUp registers the populations on the class before every test, so
        # this undoes that once the test has finished
        Info.set_populations([])
    
    def test_get_consequence(self):
        """ test that get_consequences() reports the CQ values unchanged
        """

        chrom, pos, alts = '1', 1000, ('C',)

        # with no conserved final exon positions in play, the consequences
        # should come back unchanged, whether there is one gene or several
        cases = [
            ('CQ=missense_variant;HGNC=TEST',
                [['missense_variant']]),
            ('CQ=missense_variant|stop_gained;HGNC=TEST|TEST2',
                [['missense_variant', 'stop_gained']]),
        ]

        for info_string, expected in cases:
            info = Info(info_string)
            observed = info.get_consequences(chrom, pos, alts, [])
            self.assertEqual(observed, expected)
        
    def test_get_consequence_last_base(self):
        '''check get_consequences() works with last base of exon changes
        '''

        chrom, pos, alts = '1', 1000, ('C',)
        exon_end_sites = {("1", 1000)}

        # If the variant sits at a site listed in last_base, the missense
        # consequence should be converted.
        single = Info('CQ=missense_variant;HGNC=TEST')
        single.set_genes_and_consequence(chrom, pos, alts, [])
        single.last_base = exon_end_sites
        observed = single.get_consequences(chrom, pos, alts, [])
        self.assertEqual(observed, [["conserved_exon_terminus_variant"]])

        # If we have a variant in multiple genes, check that it only alters the
        # missense/splice_region variants, and doesn't alter synonymous variants
        # (since these will be in transcripts where the variant is distant from
        # an exon boundary.)
        multi = Info('CQ=missense_variant|synonymous_variant;HGNC=TEST|TEST1')
        multi.set_genes_and_consequence(chrom, pos, alts, [])
        multi.last_base = {("1", 1000)}
        observed = multi.get_consequences(chrom, pos, alts, [])
        self.assertEqual(observed,
            [["conserved_exon_terminus_variant", "synonymous_variant"]])
    
    def test_get_consequence_multiallelic(self):
        ''' test get_consequences() with two unmasked alleles
        '''

        alleles = ('C', 'G')
        info = Info('CQ=missense_variant,synonymous_variant')

        # nothing is masked, so we expect one consequence list per allele
        observed = info.get_consequences('1', 1000, alleles, [])
        expected = [['missense_variant'], ['synonymous_variant']]
        self.assertEqual(observed, expected)
        
    def test_get_consequence_multiallelic_with_masked(self):
        ''' test get_consequences() with two alleles, one of which is masked
        '''

        alleles = ('C', 'G')
        info = Info('CQ=missense_variant,synonymous_variant')

        # 'G' is passed in as masked, so only one consequence list is expected
        observed = info.get_consequences('1', 1000, alleles, ['G'])
        self.assertEqual(observed, [['missense_variant']])
    
    def test_parse_gene_symbols(self):
        """ test that parse_gene_symbols() works correctly
        """

        single_alt = ('C',)

        def parsed():
            # re-parse self.info in whatever state the test has left it
            return self.info.parse_gene_symbols(single_alt, [])

        # a single HGNC_ID value is reported as is
        self.info["HGNC_ID"] = "A"
        self.assertEqual(parsed(), [Symbols(info={'HGNC_ID': 'A'}, idx=0)])

        # without a HGNC_ID key we get an entry holding no symbols
        del self.info["HGNC_ID"]
        self.assertEqual(parsed(), [Symbols(info={}, idx=0)])

        # multiple pipe-separated gene symbols are kept together
        self.info["HGNC_ID"] = "A|B|C"
        self.assertEqual(parsed(),
            [Symbols(info={'HGNC_ID': 'A|B|C'}, idx=0)])

        # multiple gene symbols, when some are missing: the "." placeholder
        # comes back as an empty entry
        self.info["HGNC_ID"] = "|.|C"
        self.assertEqual(parsed(), [Symbols(info={'HGNC_ID': '||C'}, idx=0)])

        # multiple gene symbols, when some missing symbols have alternates
        # in other symbol fields
        self.info["HGNC_ID"] = ".|.|C"
        self.info["HGNC"] = "Z|.|C"
        self.assertEqual(parsed(),
            [Symbols(info={'HGNC_ID': '||C', 'HGNC': 'Z||C'}, idx=0)])

        # Check that including alternate symbols has the correct precedence
        # order. Note that doing this properly would require checking all of
        # the possible order combinations.
        self.info["HGNC_ID"] = ".|.|C"
        self.info["HGNC"] = "Z|.|C"
        self.info["SYMBOL"] = "A|.|C"
        with_symbol = {'HGNC_ID': '||C', 'HGNC': 'Z||C', "SYMBOL": "A||C"}
        self.assertEqual(parsed(), [Symbols(info=with_symbol, idx=0)])
    
    def test_parse_gene_symbols_multi_alts(self):
        ''' check parse_gene_symbols() when we have multiple alleles
        '''

        fields = ('HGNC_ID', 'HGNC', 'SYMBOL', 'ENSG', 'ENST', 'ENSP', 'ENSR')
        info = Info('HGNC_ID=D,E;HGNC=D,E;SYMBOL=D,E;ENSG=D,E;ENST=D,E;ENSP=D,E;ENSR=D,E')

        observed = info.parse_gene_symbols(('G', 'C'), [])

        # each allele should pick up its own comma-separated value per field
        expected = [
            Symbols(info=dict.fromkeys(fields, 'D'), idx=0),
            Symbols(info=dict.fromkeys(fields, 'E'), idx=0),
        ]
        self.assertEqual(observed, expected)

        # if we have more alleles than the available symbols, we get an error
        # NOTE: this doesn't check if we have fewer alleles than symbols
        # NOTE(review): this call uses self.info rather than the local info
        # defined above - confirm that is intended
        too_many_alts = ('G', 'T', 'C')
        with self.assertRaises(IndexError):
            self.info.parse_gene_symbols(too_many_alts, [])
        
    def test_parse_gene_symbols_multi_alts_multi_symbols(self):
        ''' check parse_gene_symbols() when we have multiple symbols per allele
        '''

        fields = ('HGNC_ID', 'HGNC', 'SYMBOL', 'ENSG', 'ENST', 'ENSP', 'ENSR')
        info = Info('HGNC_ID=D|X,E|Y;HGNC=D|X,E|Y;SYMBOL=D|X,E|Y;ENSG=D|X,E|Y;'
                    'ENST=D|X,E|Y;ENSP=D|X,E|Y;ENSR=D|X,E|Y')

        observed = info.parse_gene_symbols(('G', 'C'), [])

        # one Symbols entry per allele, each keeping its pipe-separated value
        # for every symbol field
        expected = [
            Symbols(info=dict.fromkeys(fields, 'D|X'), idx=0),
            Symbols(info=dict.fromkeys(fields, 'E|Y'), idx=0),
        ]
        self.assertEqual(observed, expected)
        
    def test_parse_gene_symbols_multi_alts_masked_alt(self):
        ''' check parse_gene_symbols() when alt alleles are passed as masked
        '''

        fields = ('HGNC_ID', 'HGNC', 'SYMBOL', 'ENSG', 'ENST', 'ENSP', 'ENSR')
        alleles = ('G', 'C')
        info = Info('HGNC_ID=D|X,E|Y;HGNC=D|X,E|Y;SYMBOL=D|X,E|Y;ENSG=D|X,E|Y;'
                    'ENST=D|X,E|Y;ENSP=D|X,E|Y;ENSR=D|X,E|Y')

        # with 'C' masked, only the entry for the first allele should remain
        one_masked = info.parse_gene_symbols(alleles, ['C'])
        remaining = [Symbols(info=dict.fromkeys(fields, 'D|X'), idx=0)]
        self.assertEqual(one_masked, remaining)

        # with both alleles masked, nothing should be returned
        both_masked = info.parse_gene_symbols(alleles, ['C', 'G'])
        self.assertEqual(both_masked, [])
    
    def test_parse_gene_symbols_missing_gene(self):
        ''' check parse_gene_symbols() on an INFO string without symbol fields
        '''

        # an empty INFO string leaves no possible source for a gene symbol
        empty = Info('')

        observed = empty.parse_gene_symbols(('C', ), [])

        # we still expect one entry for the allele, holding no symbols
        expected = [Symbols(info={}, idx=0)]
        self.assertEqual(observed, expected)
    
    def test_is_lof(self):
        """ test that is_lof() works correctly
        """
        
        # the alt alleles have to be given as a tuple. The trailing comma
        # matters, since ('G') is just the string 'G'.
        alts = ('G', )
        
        # check that known LOF consequences return True
        info = Info('CQ=stop_gained;HGNC=TEST')
        info.set_genes_and_consequence('1', 100, alts, [])
        self.assertTrue(info.is_lof())
        
        # check that known non-LOF consequences return False
        info = Info('CQ=missense_variant;HGNC=TEST')
        info.set_genes_and_consequence('1', 100, alts, [])
        self.assertFalse(info.is_lof())
        
        # check that null values return False
        info = Info('HGNC=TEST')
        info.set_genes_and_consequence('1', 100, alts, [])
        self.assertFalse(info.is_lof())
        
        # check when the variant overlaps multiple genes (so has multiple
        # gene symbols and consequences).
        info = Info('CQ=stop_gained|missense_variant;HGNC=ATRX|TTN')
        info.set_genes_and_consequence('1', 100, alts, [])
        
        self.assertTrue(info.is_lof())
        self.assertTrue(info.is_lof("ATRX"))
        self.assertFalse(info.is_lof("TTN"))
        
        # check that when we have a MNV, we can lose or gain a LOF annotation
        info.mnv_code = 'masked_stop_gain_mnv'
        self.assertFalse(info.is_lof("ATRX"))
        
        info.mnv_code = 'modified_stop_gained_mnv'
        self.assertTrue(info.is_lof("TTN"))
    
    def test_is_missense(self):
        """ test that is_missense() works correctly
        """
        
        # the alt alleles have to be given as a tuple. The trailing comma
        # matters, since ('G') is just the string 'G'.
        alts = ('G', )
        
        # check that known missense equivalent consequences return True
        info = Info('CQ=missense_variant;HGNC=TEST')
        info.set_genes_and_consequence('1', 100, alts, [])
        self.assertTrue(info.is_missense(is_cnv=False))
        
        # check that known LoF equivalent consequences return False
        info = Info('CQ=stop_gained;HGNC=TEST')
        info.set_genes_and_consequence('1', 100, alts, [])
        self.assertFalse(info.is_missense(is_cnv=False))
        
        # check that null values return False
        info = Info('HGNC=TEST')
        info.set_genes_and_consequence('1', 100, alts, [])
        self.assertFalse(info.is_missense(is_cnv=False))
        
        # check when the variant overlaps multiple genes (so has multiple
        # gene symbols and consequences).
        info = Info('CQ=missense_variant|synonymous_variant;HGNC=ATRX|TTN')
        info.set_genes_and_consequence('1', 100, alts, [])
        self.assertTrue(info.is_missense(is_cnv=False))
        self.assertTrue(info.is_missense(False, "ATRX"))
        self.assertFalse(info.is_missense(False, "TTN"))
        
        # check that when we have a MNV, we can lose or gain a missense
        # annotation
        info.mnv_code = 'modified_synonymous_mnv'
        self.assertFalse(info.is_missense(False, "ATRX"))
        
        info.mnv_code = 'modified_protein_altering_mnv'
        self.assertTrue(info.is_missense(False, "TTN"))
        
        # check that masked stop gained MNVs are converted to a missense
        info = Info('CQ=stop_gained;HGNC=ATRX')
        info.set_genes_and_consequence('1', 100, alts, [])
        info.mnv_code = 'masked_stop_gain_mnv'
        self.assertTrue(info.is_missense(False))
    
    def test_is_missense_cnv(self):
        ''' test that is_missense() depends on whether the variant is a CNV
        
        The same coding_sequence_variant consequence is expected to count as
        missense when is_cnv is True, but not when is_cnv is False.
        '''
        
        chrom = '1'
        pos = '15000000'
        alts = ('G',)
        
        info = Info('HGNC=ATRX;CQ=coding_sequence_variant;random_tag')
        info.set_genes_and_consequence(chrom, pos, alts, [])
        
        as_cnv = info.is_missense(is_cnv=True)
        self.assertTrue(as_cnv)
        
        as_non_cnv = info.is_missense(is_cnv=False)
        self.assertFalse(as_non_cnv)
    
    def test_get_per_gene_consequence(self):
        """ test that get_per_gene_consequence() picks consequences by gene
        
        Each scenario puts the HGNC symbols and the matching consequences on
        the Info object, then checks what is returned for each gene symbol.
        """
        
        scenarios = [
            # a single gene: None and the gene symbol both give the
            # consequence, but a symbol which isn't present gives nothing
            ('ATRX', ["missense_variant"],
                [(None, ["missense_variant"]),
                 ("ATRX", ["missense_variant"]),
                 ("TEST", [])]),
            # a variant with consequences in multiple genes, where we only
            # pull out the consequences for a single gene
            ('ATRX|TTN', ["missense_variant", "synonymous_variant"],
                [("ATRX", ["missense_variant"]),
                 ("TTN", ["synonymous_variant"])]),
            # where two symbols match, we only use the first consequence
            ('TEMP|ATRX|TEMP', ["splice_acceptor_variant", "missense_variant",
                "synonymous_variant"],
                [("TEMP", ["splice_acceptor_variant"])]),
            # some of the gene symbols are empty
            ('|ATRX|', ["splice_acceptor_variant", "missense_variant",
                "synonymous_variant"],
                [("ATRX", ["missense_variant"])]),
        ]
        
        for hgnc, consequences, checks in scenarios:
            self.info.symbols = [Symbols(info={'HGNC': hgnc}, idx=0)]
            self.info.consequence = [consequences]
            for gene, expected in checks:
                observed = self.info.get_per_gene_consequence(gene)
                self.assertEqual(observed, expected)
    
    def test_get_allele_frequency(self):
        """ tests that allele frequency text is converted as expected
        """
        
        cases = [
            # single number returns that number
            ("1", 1),
            # two equal numbers return one number
            ("1,1", 1),
            # two numbers return the highest number
            ("1,2", 2),
            # number and string return the number
            ("a,1", 1),
            # single string value returns None
            ("a", None),
            # multiple string values return None
            ("a,b", None),
            # a None value returns None
            (None, None),
        ]
        
        for value, expected in cases:
            observed = self.info.get_allele_frequency(value)
            self.assertEqual(observed, expected)
    
    def test_is_number(self):
        """ tests that is_number() tells numeric text apart from other values
        """
        
        cases = [(None, False), ("5", True), ("a", False)]
        for value, expected in cases:
            self.assertEqual(self.info.is_number(value), expected)
    
    def test_find_max_allele_frequency(self):
        """ test that the highest population allele frequency is found
        """
        
        # a variant without any recorded frequency gives None
        self.assertIsNone(self.info.find_max_allele_frequency())
        
        # a single population, then a second population with a higher value
        steps = [("MAX_AF", "0.005", 0.005), ("AFR_AF", "0.01", 0.01)]
        for pop, value, expected in steps:
            self.info[pop] = value
            self.assertEqual(self.info.find_max_allele_frequency(), expected)
        
        # fill in every population, checking after each one is added
        pops = {"AFR_AF", "AMR_AF", "ASN_AF", "DDD_AF", "EAS_AF",
            "ESP_AF", "EUR_AF", "MAX_AF", "SAS_AF", "UK10K_cohort_AF"}
        for pop in pops:
            self.info[pop] = "0.05"
            self.assertEqual(self.info.find_max_allele_frequency(), 0.05)
        
        # a None value for one population must not break the search
        self.info["AFR_AF"] = None
        self.assertEqual(self.info.find_max_allele_frequency(), 0.05)
    
    def test_find_max_allele_frequency_without_populations(self):
        ''' test if the MAF finder operates correctly when we haven't set any
        populations to check
        '''
        
        self.info["MAX_AF"] = "0.005"
        
        # this is a regression test for a problem that only occurs if the unit
        # tests are run in an order such that the populations might not have
        # been set by an earlier test.
        Info.set_populations([])
        try:
            self.assertIsNone(self.info.find_max_allele_frequency())
        finally:
            # reset the populations even if the check above fails, so that
            # other unit tests can also rely on the populations being set
            Info.set_populations(self.pops)
    def test_is_lof(self):
        """ test that is_lof() works correctly
        """

        # the alt alleles have to be given as a tuple. The trailing comma
        # matters, since ('G') is just the string 'G'.
        alts = ('G', )

        # check that known LOF consequences return True
        info = Info('CQ=stop_gained;HGNC=TEST')
        info.set_genes_and_consequence('1', 100, alts, [])
        self.assertTrue(info.is_lof())

        # check that known non-LOF consequences return False
        info = Info('CQ=missense_variant;HGNC=TEST')
        info.set_genes_and_consequence('1', 100, alts, [])
        self.assertFalse(info.is_lof())

        # check that null values return False
        info = Info('HGNC=TEST')
        info.set_genes_and_consequence('1', 100, alts, [])
        self.assertFalse(info.is_lof())

        # check when the variant overlaps multiple genes (so has multiple
        # gene symbols and consequences).
        info = Info('CQ=stop_gained|missense_variant;HGNC=ATRX|TTN')
        info.set_genes_and_consequence('1', 100, alts, [])

        self.assertTrue(info.is_lof())
        self.assertTrue(info.is_lof("ATRX"))
        self.assertFalse(info.is_lof("TTN"))

        # check that when we have a MNV, we can lose or gain a LOF annotation
        info.mnv_code = 'masked_stop_gain_mnv'
        self.assertFalse(info.is_lof("ATRX"))

        info.mnv_code = 'modified_stop_gained_mnv'
        self.assertTrue(info.is_lof("TTN"))
 def test_is_lof(self):
     """ test that is_lof() works correctly
     """
     
     # the alt alleles have to be given as a tuple. The trailing comma
     # matters, since ('G') is just the string 'G'.
     alts = ('G', )
     
     # check that known LOF consequences return True
     info = Info('CQ=stop_gained;HGNC=TEST')
     info.set_genes_and_consequence('1', 100, alts, [])
     self.assertTrue(info.is_lof())
     
     # check that known non-LOF consequences return False
     info = Info('CQ=missense_variant;HGNC=TEST')
     info.set_genes_and_consequence('1', 100, alts, [])
     self.assertFalse(info.is_lof())
     
     # check that null values return False
     info = Info('HGNC=TEST')
     info.set_genes_and_consequence('1', 100, alts, [])
     self.assertFalse(info.is_lof())
     
     # check when the variant overlaps multiple genes (so has multiple
     # gene symbols and consequences).
     info = Info('CQ=stop_gained|missense_variant;HGNC=ATRX|TTN')
     info.set_genes_and_consequence('1', 100, alts, [])
     
     self.assertTrue(info.is_lof())
     self.assertTrue(info.is_lof("ATRX"))
     self.assertFalse(info.is_lof("TTN"))
     
     # check that when we have a MNV, we can lose or gain a LOF annotation
     info.mnv_code = 'masked_stop_gain_mnv'
     self.assertFalse(info.is_lof("ATRX"))
     
     info.mnv_code = 'modified_stop_gained_mnv'
     self.assertTrue(info.is_lof("TTN"))
    def test_get_consequence_last_base(self):
        '''check get_consequences() for changes at the last base of an exon
        '''

        chrom = '1'
        pos = 1000
        alts = ('C', )

        # A missense variant at a site listed in last_base (a final base in an
        # exon with a conserved base) gets its consequence converted.
        single_gene = Info('CQ=missense_variant;HGNC=TEST')
        single_gene.set_genes_and_consequence(chrom, pos, alts, [])
        single_gene.last_base = {("1", 1000)}
        observed = single_gene.get_consequences(chrom, pos, alts, [])
        self.assertEqual(observed, [["conserved_exon_terminus_variant"]])

        # With a variant in multiple genes, only the missense/splice_region
        # consequences are altered. Synonymous consequences are left alone
        # (since these will be in transcripts where the variant is distant
        # from an exon boundary.)
        multi_gene = Info(
            'CQ=missense_variant|synonymous_variant;HGNC=TEST|TEST1')
        multi_gene.set_genes_and_consequence(chrom, pos, alts, [])
        multi_gene.last_base = {("1", 1000)}
        observed = multi_gene.get_consequences(chrom, pos, alts, [])
        self.assertEqual(
            observed,
            [["conserved_exon_terminus_variant", "synonymous_variant"]])
 def tearDown(self):
     """ empty the populations held by the Info class after each test
     """
     
     no_populations = []
     Info.set_populations(no_populations)
    def test_is_missense(self):
        """ test that is_missense() works correctly
        """

        # the alt alleles have to be given as a tuple. The trailing comma
        # matters, since ('G') is just the string 'G'.
        alts = ('G', )

        # check that known missense equivalent consequences return True
        info = Info('CQ=missense_variant;HGNC=TEST')
        info.set_genes_and_consequence('1', 100, alts, [])
        self.assertTrue(info.is_missense(is_cnv=False))

        # check that known LoF equivalent consequences return False
        info = Info('CQ=stop_gained;HGNC=TEST')
        info.set_genes_and_consequence('1', 100, alts, [])
        self.assertFalse(info.is_missense(is_cnv=False))

        # check that null values return False
        info = Info('HGNC=TEST')
        info.set_genes_and_consequence('1', 100, alts, [])
        self.assertFalse(info.is_missense(is_cnv=False))

        # check when the variant overlaps multiple genes (so has multiple
        # gene symbols and consequences).
        info = Info('CQ=missense_variant|synonymous_variant;HGNC=ATRX|TTN')
        info.set_genes_and_consequence('1', 100, alts, [])
        self.assertTrue(info.is_missense(is_cnv=False))
        self.assertTrue(info.is_missense(False, "ATRX"))
        self.assertFalse(info.is_missense(False, "TTN"))

        # check that when we have a MNV, we can lose or gain a missense
        # annotation
        info.mnv_code = 'modified_synonymous_mnv'
        self.assertFalse(info.is_missense(False, "ATRX"))

        info.mnv_code = 'modified_protein_altering_mnv'
        self.assertTrue(info.is_missense(False, "TTN"))

        # check that masked stop gained MNVs are converted to a missense
        info = Info('CQ=stop_gained;HGNC=ATRX')
        info.set_genes_and_consequence('1', 100, alts, [])
        info.mnv_code = 'masked_stop_gain_mnv'
        self.assertTrue(info.is_missense(False))
class Variant(object):
    """ generic functions for variants
    
    Holds the fields of one VCF record for one sample (chromosome, position,
    ID, alleles, QUAL, FILTER, INFO and the per-sample FORMAT values), along
    with the sex of the individual and the chromosomal inheritance type which
    follows from it.
    
    The constructor calls self.set_genotype() once both the FORMAT values and
    the gender are available. That method is not defined in this class, so it
    is presumably provided by subclasses (the __repr__ docstring mentions SNV
    and CNV) - TODO confirm.
    """
    
    # define some codes used in ped files to identify male and female sexes
    male_codes = set(["1", "m", "M", "male"])
    female_codes = set(["2", "f", "F", "female"])
    
    # (start, end) pairs for the pseudoautosomal regions. Positions strictly
    # between a start and an end are given autosomal inheritance.
    x_pseudoautosomal_regions = [(60001, 2699520), (154930290, 155260560), \
        (88456802, 92375509)]
    y_pseudoautosomal_regions = [(10001, 2649520), (59034050, 59363566)]
    
    # shared by all instances, assigned with set_known_genes()
    known_genes = None
    
    @classmethod
    def set_known_genes(cls_obj, known_genes):
        """ store the known genes on the class, shared by all of its instances
        
        Args:
            known_genes: object to keep as the class-level known_genes
        """
        cls_obj.known_genes = known_genes
    
    def __init__(self, chrom, position, id, ref, alts, qual, filter, info=None,
            format=None, sample=None, gender=None, sum_x_lr2=None, parents=None, mnv_code=None):
        """ initialise the object with the definition values
        
        Args:
            chrom: chromosome of the variant, stored as given
            position: position of the variant, converted with int()
            id: text of the VCF ID field, also used to set the mutation ID
            ref: reference allele
            alts: comma-separated text of the alternate alleles
            qual: value of the VCF QUAL field, stored as given
            filter: value of the VCF FILTER field, stored as given
            info: INFO value handed to Info(), defaults to None
            format: ":"-separated FORMAT keys, defaults to None
            sample: ":"-separated sample values matching the format keys,
                defaults to None. The FORMAT values are only parsed when both
                format and sample are given.
            gender: sex code for the individual (one of male_codes or
                female_codes), defaults to None
            sum_x_lr2: value returned by get_sum_x_lr2(), defaults to None
            parents: value returned by get_has_parents(), defaults to None
            mnv_code: stored as given, and also handed to Info(). Defaults to
                None.
        
        Raises:
            ValueError: if gender is given, but is not a known sex code
        """
        
        self.chrom = chrom
        self.position = int(position)
        
        self.variant_id = id
        self.mutation_id = "NA"
        self.set_mutation_id(self.variant_id)
        
        self.ref_allele = ref
        self.alt_alleles = tuple(alts.split(','))
        
        self.mnv_code = mnv_code
        self.qual = qual
        self.filter = filter
        
        self.sum_x_lr2 = sum_x_lr2
        
        self.has_parents = parents
        
        # initialise variables that will be set later
        self.inheritance_type = None
        
        # setting the gender also sets the inheritance type, which needs the
        # chromosome and position from above
        self.gender = None
        if gender is not None:
            self._set_gender(gender)
        
        self.vcf_line = None
        
        self.format = None
        if format is not None and sample is not None:
            self.add_format(format, sample)
        
        # the low depth alleles are masked out of the genes and consequences.
        # Finding them requires self.info and self.format to be in place.
        self.info = Info(info, self.mnv_code)
        masked = self.get_low_depth_alleles(self.ref_allele, self.alt_alleles)
        self.info.set_genes_and_consequence(self.get_chrom(),
            self.get_position(), self.alt_alleles, masked)
        
        self.genotype = None
        if self.format is not None and self._get_gender() is not None:
            self.set_genotype()
    
    def is_lof(self, gene_symbol=None):
        """ return self.info.is_lof() for the (optional) gene symbol
        """
        return self.info.is_lof(gene_symbol)
    
    def is_missense(self, is_cnv, gene_symbol=None):
        """ return self.info.is_missense() for the (optional) gene symbol
        """
        return self.info.is_missense(is_cnv, gene_symbol)
    
    def is_synoymous(self, gene_symbol=None):
        """ return self.info.is_synoymous() for the (optional) gene symbol
        """
        return self.info.is_synoymous(gene_symbol)
    
    def __repr__(self):
        ''' repr function for Variant objects. SNV(...) and CNV(...) also work
        
        The class name is taken from the type of the instance. Optional values
        which are None are shown as a bare None, other text values are wrapped
        in double quotes.
        '''
        
        def quote(value):
            if value is not None:
                value = '"{}"'.format(value)
            return value
        
        # reprocess the format dictionary back to text strings. The keys are
        # sorted, so the original order of the FORMAT keys is not retained.
        keys, sample = None, None
        if self.format is not None:
            keys = quote(':'.join(sorted(self.format)))
            sample = quote(':'.join([ self.format[x] for x in sorted(self.format) ]))
        
        info = quote(self.info)
        gender = quote(self.gender)
        mnv_code = quote(self.mnv_code)
        
        return '{}(chrom="{}", position={}, id="{}", ref="{}", alts="{}", ' \
            'qual="{}", filter="{}", info={}, format={}, sample={}, gender={}, ' \
            'mnv_code={})'.format(type(self).__name__, self.chrom,
            self.position, self.variant_id, self.ref_allele,
            ','.join(self.alt_alleles), self.qual, self.filter, info, keys, sample,
            gender, mnv_code)
    
    def __hash__(self):
        """ hash the variant by its text representation
        """
        return hash(str(self))
    
    def __eq__(self, other):
        """ treat two objects as equal when their hashes are equal
        """
        return hash(self) == hash(other)
    
    def _set_gender(self, gender):
        """ sets the gender of the individual for the variant
        
        The gender is stored as "male" or "female", and the inheritance type
        is then set from the chromosome, position and gender.
        
        Args:
            gender: sex code, one of male_codes or female_codes
        
        Raises:
            ValueError: if the code is in neither set of sex codes
        """
        
        if gender in self.male_codes:
            self.gender = "male"
        elif gender in self.female_codes:
            self.gender = "female"
        else:
            raise ValueError("unknown gender code")
        
        self.set_inheritance_type(self.get_position(), self.is_male())
    
    def _get_gender(self):
        """ returns the stored gender ("male", "female", or None if unset)
        """
        return self.gender
    
    def set_mutation_id(self, variant_id):
        """ sets the mutation ID based on the VCF ID field
        
        The variant ID can be either "." for null value, an rsID, a HGMD ID,
        a COSMIC ID, or any combination of those (including multiple HGMD IDs
        for a single variant).
        
        The mutation ID is left untouched if the ID field is ".", or only
        contains rsIDs.
        
        Args:
            variant_id: string from the VCF ID field, can be rsID, or a list of
                &-separated IDs, which can include COSMIC and HGMD IDs.
        """
        
        if variant_id == ".":
            return
        
        # include everything that isn't an rsID
        ids = [ x for x in variant_id.split("&") if not x.startswith("rs") ]
        if len(ids) > 0:
            self.mutation_id = ",".join(ids)
    
    def get_mutation_id(self):
        """ return the mutation ID ("NA" unless set from the VCF ID field)
        """
        return self.mutation_id
    
    def is_male(self):
        """ returns True/False for whether the person is male
        """
        
        return self._get_gender() in self.male_codes
    
    def add_format(self, keys, values):
        """Parses the FORMAT column from VCF files.
        
        The keys and values are paired up in order, and stored as a dictionary
        in self.format.
        
        Args:
            keys: FORMAT text from a line in a VCF file
            values: the values for the format keys
        """
        
        self.format = dict(zip(keys.split(":"), values.split(":")))
    
    def get_low_depth_alleles(self, ref, alts):
        ''' get a list of alleles with zero counts, or indels with 1 read
        
        Some variants have multiple alts, so we need to select the alt with
        the most severe consequence. However, in at least one version of the
        VCFs, one of the alts could have zero counts, which I believe resulted
        from the population based multi-sample calling. We need to drop the
        consequences recorded for zero-count alternate alleles before finding
        the most severe.
        
        We also want to avoid indels with only one read, because these are
        universally bad calls.
        
        The allele counts come from the AC field in self.info, and the allele
        depths from the AD field in self.format. Where a field is missing (or
        the variant has no FORMAT values at all), every allele is treated as
        having sufficient counts or depth.
        
        Args:
            ref: reference allele
            alts: tuple of alt alleles
        
        Returns:
            list of alleles with sufficiently low depth
        '''
        
        is_indel = lambda x, y: len(x) > 1 or len(y) > 1
        
        allele_counts = ['1'] * len(alts)
        if 'AC' in self.info:
            allele_counts = self.info['AC'].split(',')
        
        # the FORMAT values are optional, self.format stays as None when the
        # variant is made without them, so we can only check for AD if present
        allele_depths = ['10'] * len(alts)
        if self.format is not None and 'AD' in self.format:
            # the first AD value is for the reference allele
            allele_depths = self.format['AD'].split(',')[1:]
        
        counts = list(zip(allele_counts, allele_depths))
        
        assert len(counts) == len(alts)
        
        # return the alleles where the allele count is zero, or indels with
        # 1 alt read, so we can mask them out
        return [ alts[i] for i, (count, depth) in enumerate(counts)
            if count == '0' or (depth == '1' and is_indel(ref, alts[i])) ]
    
    def add_vcf_line(self, vcf_line):
        """ store the VCF line for the variant
        """
        self.vcf_line = vcf_line
    
    def get_vcf_line(self):
        """ return the stored VCF line (None if one hasn't been added)
        """
        return self.vcf_line
    
    def set_inheritance_type(self, pos, is_male):
        """ sets the chromosome type (eg autosomal, or X chromosome type).
        
        provides the chromosome type for a chromosome (eg Autosomal, or
        X-chrom male etc). This only does simple string matching. The sex
        chromosomes are recognised as "X", "chrX" or "ChrX" (and likewise for
        Y), but not as "23" or "24". Every other chromosome name is treated as
        autosomal, as are positions strictly inside a pseudoautosomal region
        of a sex chromosome.
        
        Args:
            pos: position on the chromosome
            is_male: True/False for whether the individual is male
        """
        
        if self.get_chrom() not in ["chrX", "ChrX", "X", "chrY", "ChrY", "Y"]:
            self.inheritance_type = "autosomal"
        elif self.get_chrom() in ["chrX", "ChrX", "X"]:
            # check if the gene lies within a pseudoautosomal region
            for start, end in self.x_pseudoautosomal_regions:
                if start < pos < end:
                    self.inheritance_type = "autosomal"
                    return
            
            if is_male:
                self.inheritance_type =  "XChrMale"
            else:
                self.inheritance_type = "XChrFemale"
        elif self.get_chrom() in ["chrY", "ChrY", "Y"]:
            # check if the gene lies within a pseudoautosomal region
            for start, end in self.y_pseudoautosomal_regions:
                if start < pos < end:
                    self.inheritance_type = "autosomal"
                    return
            if is_male:
                self.inheritance_type =  "YChrMale"
            else:
                self.inheritance_type = "YChrFemale"
    
    def get_inheritance_type(self):
        """ return the variant chromosomal inheritance type
        """
        
        return self.inheritance_type
    
    def get_chrom(self):
        """ return the variant chromosome
        """
        
        return self.chrom
    
    def get_position(self):
        """ return the variant chromosomal position
        """
        
        return self.position
    
    def get_genotype(self):
        """ return the genotype value
        """
        
        return self.genotype
    
    def get_sum_x_lr2(self):
        """ return the sum_x_lr2 value given to the constructor
        
        Described in the original code as the sum of mean l2r on the X
        chromosome.
        """
        
        return self.sum_x_lr2
    
    def get_has_parents(self):
        """ return the parents value given to the constructor
        
        Described in the original code as false for singletons, true for trios.
        """
        
        return self.has_parents
Exemple #39
0
class Variant(object):
    """ generic functions for variants
    """

    # define some codes used in ped files to identify male and female sexes
    male_codes = set(["1", "m", "M", "male"])
    female_codes = set(["2", "f", "F", "female"])

    x_pseudoautosomal_regions = [(60001, 2699520), (154930290, 155260560), \
        (88456802, 92375509)]
    y_pseudoautosomal_regions = [(10001, 2649520), (59034050, 59363566)]
    known_genes = None

    @classmethod
    def set_known_genes(cls_obj, known_genes):
        cls_obj.known_genes = known_genes

    def __init__(self,
                 chrom,
                 position,
                 id,
                 ref,
                 alts,
                 qual,
                 filter,
                 info=None,
                 format=None,
                 sample=None,
                 gender=None,
                 sum_x_lr2=None,
                 parents=None,
                 mnv_code=None):
        """ initialise the object with the definition values

        Args:
            chrom: chromosome of the variant, stored as given
            position: position of the variant, converted with int()
            id: text of the VCF ID field, also used to set the mutation ID
            ref: reference allele
            alts: comma-separated text of the alternate alleles
            qual: value of the VCF QUAL field, stored as given
            filter: value of the VCF FILTER field, stored as given
            info: INFO value handed to Info(), defaults to None
            format: ":"-separated FORMAT keys, defaults to None
            sample: ":"-separated sample values matching the format keys,
                defaults to None. The FORMAT values are only parsed when both
                format and sample are given.
            gender: sex code for the individual (one of male_codes or
                female_codes), defaults to None
            sum_x_lr2: stored as given, defaults to None
            parents: stored as has_parents, defaults to None
            mnv_code: stored as given, and also handed to Info(). Defaults to
                None.

        Raises:
            ValueError: if gender is given, but is not a known sex code
        """

        self.chrom = chrom
        self.position = int(position)

        # the mutation ID stays as "NA" unless the ID field has non-rsID values
        self.variant_id = id
        self.mutation_id = "NA"
        self.set_mutation_id(self.variant_id)

        self.ref_allele = ref
        self.alt_alleles = tuple(alts.split(','))

        self.mnv_code = mnv_code
        self.qual = qual
        self.filter = filter

        self.sum_x_lr2 = sum_x_lr2

        self.has_parents = parents

        # intialise variables that will be set later
        self.inheritance_type = None

        # setting the gender also sets the inheritance type, which needs the
        # chromosome and position from above
        self.gender = None
        if gender is not None:
            self._set_gender(gender)

        self.vcf_line = None

        self.format = None
        if format is not None and sample is not None:
            self.add_format(format, sample)

        # the low depth alleles are masked out of the genes and consequences.
        # Finding them reads self.info and self.format, so both are set first.
        self.info = Info(info, self.mnv_code)
        masked = self.get_low_depth_alleles(self.ref_allele, self.alt_alleles)
        self.info.set_genes_and_consequence(self.get_chrom(),
                                            self.get_position(),
                                            self.alt_alleles, masked)

        # set_genotype() is not defined among the methods shown here, it is
        # presumably provided by subclasses - TODO confirm
        self.genotype = None
        if self.format is not None and self._get_gender() is not None:
            self.set_genotype()

    def is_lof(self, gene_symbol=None):
        return self.info.is_lof(gene_symbol)

    def is_missense(self, is_cnv, gene_symbol=None):
        return self.info.is_missense(is_cnv, gene_symbol)

    def is_synoymous(self, gene_symbol=None):
        return self.info.is_synoymous(gene_symbol)

    def __repr__(self):
        ''' repr function for Variant objects. SNV(...) and CNV(...) also work
        '''
        def quote(value):
            if value is not None:
                value = '"{}"'.format(value)
            return value

        # reprocess the format dictionary back to the original text strings
        keys, sample = None, None
        if self.format is not None:
            keys = quote(':'.join(sorted(self.format)))
            sample = quote(':'.join(
                [self.format[x] for x in sorted(self.format)]))

        info = quote(self.info)
        gender = quote(self.gender)
        mnv_code = quote(self.mnv_code)

        return '{}(chrom="{}", position={}, id="{}", ref="{}", alts="{}", ' \
            'qual="{}", filter="{}", info={}, format={}, sample={}, gender={}, ' \
            'mnv_code={})'.format(type(self).__name__, self.chrom,
            self.position, self.variant_id, self.ref_allele,
            ','.join(self.alt_alleles), self.qual, self.filter, info, keys, sample,
            gender, mnv_code)

    def __hash__(self):
        return hash(str(self))

    def __eq__(self, other):
        return hash(self) == hash(other)

    def _set_gender(self, gender):
        """ sets the gender of the individual for the variant
        """

        if gender in self.male_codes:
            self.gender = "male"
        elif gender in self.female_codes:
            self.gender = "female"
        else:
            raise ValueError("unknown gender code")

        self.set_inheritance_type(self.get_position(), self.is_male())

    def _get_gender(self):
        """returns the gender for a person (1, M = male, 2, F = female).
        """
        return self.gender

    def set_mutation_id(self, variant_id):
        """ sets the mutation ID based on the VCF ID field
        
        The variant ID can be either "." for null value, an rsID, a HGMD ID,
        a COSMIC ID, or any combination of those (including multiple HGMD IDs
        for a single variant).
        
        Args:
            variant_id: string from the VCF ID field, can be rsID, or a list of
                &-separated IDs, which can include COSMIC and HGMD IDs.
        """

        if variant_id != ".":
            variant_id = variant_id.split("&")
            ids = []
            for value in variant_id:
                # include everything that isn't an rsID
                if not value.startswith("rs"):
                    ids.append(value)

            if len(ids) > 0:
                self.mutation_id = ",".join(ids)

    def get_mutation_id(self):
        return self.mutation_id

    def is_male(self):
        """ returns True/False for whether the person is male
        """

        return self._get_gender() in self.male_codes

    def add_format(self, keys, values):
        """Parses the FORMAT column from VCF files.
        
        Args:
            keys: FORMAT text from a line in a VCF file
            values: the values for the format keys
        """

        self.format = dict(zip(keys.split(":"), values.split(":")))

    def get_low_depth_alleles(self, ref, alts):
        ''' get a list of alleles with zero counts, or indels with 1 read
        
        Some variants have multiple alts, so we need to select the alt with
        the most severe consequence. However, in at least one version of the
        VCFs, one of the alts could have zero counts, which I believe resulted
        from the population based multi-sample calling. We need to drop the
        consequences recorded for zero-count alternate alleles before finding
        the most severe.
        
        We also want to avoid indels with only one read, because these are
        universally bad calls.
        
        Args:
            ref: reference allele
            alts: tuple of alt alleles
        
        Returns:
            list of alleles with sufficiently low depth
        '''

        is_indel = lambda x, y: len(x) > 1 or len(y) > 1

        allele_counts = ['1'] * len(alts)
        if 'AC' in self.info:
            allele_counts = self.info['AC'].split(',')

        allele_depths = ['10'] * len(alts)
        if 'AD' in self.format:
            allele_depths = self.format['AD'].split(',')[1:]

        counts = list(zip(allele_counts, allele_depths))

        assert len(counts) == len(alts)

        # find the positions of alleles where the allele count is zero,
        # or indels with 1 alt read
        pos = set()
        for i, (count, depth) in enumerate(counts):
            if count == '0':
                pos.add(i)
            elif depth == '1' and is_indel(ref, alts[i]):
                pos.add(i)

        # return the alleles with zero-count ,so we can mask them out
        return [alts[i] for i in sorted(pos)]

    def add_vcf_line(self, vcf_line):
        self.vcf_line = vcf_line

    def get_vcf_line(self):
        return self.vcf_line

    def set_inheritance_type(self, pos, is_male):
        """ set the chromosomal inheritance type of the variant

        The type is chosen by plain string matching on self.get_chrom():
            - "X", "chrX" or "ChrX" gives "XChrMale" or "XChrFemale"
            - "Y", "chrY" or "ChrY" gives "YChrMale" or "YChrFemale"
            - any other chromosome string gives "autosomal"

        Positions on X or Y which lie strictly inside one of the (start, end)
        pairs in self.x_pseudoautosomal_regions, or
        self.y_pseudoautosomal_regions respectively, are also "autosomal".
        Sex chromosomes named in any other way (e.g. "23", "24", "chrx") are
        not recognised, and so are treated as autosomal.

        The result is stored as self.inheritance_type; nothing is returned.

        Args:
            pos: position on the chromosome
            is_male: True/False for whether the individual is male
        """

        chrom = self.get_chrom()
        on_x = chrom in ("chrX", "ChrX", "X")
        on_y = chrom in ("chrY", "ChrY", "Y")

        # everything that isn't a recognised sex chromosome is autosomal
        if not on_x and not on_y:
            self.inheritance_type = "autosomal"
            return

        # pick the regions and labels for the sex chromosome we are on
        if on_x:
            par_regions = self.x_pseudoautosomal_regions
            male_type, female_type = "XChrMale", "XChrFemale"
        else:
            par_regions = self.y_pseudoautosomal_regions
            male_type, female_type = "YChrMale", "YChrFemale"

        # pseudoautosomal positions are inherited as if they were autosomal.
        # The region boundaries themselves are excluded.
        if any(start < pos < end for start, end in par_regions):
            self.inheritance_type = "autosomal"
            return

        self.inheritance_type = male_type if is_male else female_type

    def get_inheritance_type(self):
        """ return the chromosomal inheritance type of the variant

        Returns:
            the value of self.inheritance_type, as assigned by
            set_inheritance_type() (e.g. "autosomal" or "XChrMale").
        """

        return self.inheritance_type

    def get_chrom(self):
        """ return the chromosome that the variant is on

        Returns:
            the value of self.chrom, unmodified.
        """

        return self.chrom

    def get_position(self):
        """ return the position of the variant on its chromosome

        Returns:
            the value of self.position, unmodified.
        """

        return self.position

    def get_genotype(self):
        """ return the genotype recorded for the variant

        Returns:
            the value of self.genotype, unmodified.
        """

        return self.genotype

    def get_sum_x_lr2(self):
        """ return the sum of the mean l2r on the X chromosome

        Returns:
            the value of self.sum_x_lr2, unmodified.
        """

        return self.sum_x_lr2

    def get_has_parents(self):
        """ return whether the individual has parents available

        Returns:
            the value of self.has_parents, which is false for singletons and
            true for trios.
        """

        return self.has_parents
 def test_is_missense(self):
     """ test that is_missense() works correctly

     Each case builds an Info from a raw INFO string, populates its genes and
     consequences for a single 'G' alt allele, and then checks the result of
     is_missense(), optionally restricted to one gene symbol.
     """

     # The alt alleles are passed as a tuple. The trailing comma matters,
     # since ('G') is just the string 'G' rather than a one-item tuple.
     alts = ('G', )

     # check that known missense equivalent consequence return True
     info = Info('CQ=missense_variant;HGNC=TEST')
     info.set_genes_and_consequence('1', 100, alts, [])
     self.assertTrue(info.is_missense(is_cnv=False))

     # check that known LoF equivalent consequence returns False
     info = Info('CQ=stop_gained;HGNC=TEST')
     info.set_genes_and_consequence('1', 100, alts, [])
     self.assertFalse(info.is_missense(is_cnv=False))

     # check that null values return False
     info = Info('HGNC=TEST')
     info.set_genes_and_consequence('1', 100, alts, [])
     self.assertFalse(info.is_missense(is_cnv=False))

     # check when the variant overlaps multiple genes (so has multiple
     # gene symbols and consequences).
     info = Info('CQ=missense_variant|synonymous_variant;HGNC=ATRX|TTN')
     info.set_genes_and_consequence('1', 100, alts, [])
     self.assertTrue(info.is_missense(is_cnv=False))
     self.assertTrue(info.is_missense(False, "ATRX"))
     self.assertFalse(info.is_missense(False, "TTN"))

     # check that when we have a MNV, we can lose or gain a missense
     # annotation
     info.mnv_code = 'modified_synonymous_mnv'
     self.assertFalse(info.is_missense(False, "ATRX"))

     info.mnv_code = 'modified_protein_altering_mnv'
     self.assertTrue(info.is_missense(False, "TTN"))

     # check that masked stop gained MNVs are converted to a missense
     info = Info('CQ=stop_gained;HGNC=ATRX')
     info.set_genes_and_consequence('1', 100, alts, [])
     info.mnv_code = 'masked_stop_gain_mnv'
     self.assertTrue(info.is_missense(False))
 def tearDown(self):
     """ clear the populations registered on Info, once a test finishes
     """

     Info.set_populations([])
class TestVariantInfoPy(unittest.TestCase):
    """  unit testing of the Info class
    """
    def setUp(self):
        """ register the population tags, and build a default Info object
        """

        # the INFO tags to check for population allele frequencies
        self.pops = [
            "AFR_AF",
            "AMR_AF",
            "ASN_AF",
            "DDD_AF",
            "EAS_AF",
            "ESP_AF",
            "EUR_AF",
            "MAX_AF",
            "SAS_AF",
            "UK10K_cohort_AF",
        ]
        Info.set_populations(self.pops)

        # the default Info object for the tests, built straight from a raw
        # INFO string
        raw_info = "HGNC_ID=1001;CQ=missense_variant;random_tag"
        self.info = Info(raw_info)

    def tearDown(self):
        """ clear the populations that setUp() registered on Info
        """

        Info.set_populations([])

    def test_get_consequence(self):
        """ check get_consequences() with a single alt allele
        """

        chrom, pos, alts = '1', 1000, ('C', )

        # without any known conserved final exon positions, or masked alleles,
        # the consequence comes back unchanged (one list per alt allele)
        single_gene = Info('CQ=missense_variant;HGNC=TEST')
        observed = single_gene.get_consequences(chrom, pos, alts, [])
        self.assertEqual(observed, [['missense_variant']])

        # consequences for multiple genes are '|' separated, and get split
        # into separate entries for the allele
        two_genes = Info('CQ=missense_variant|stop_gained;HGNC=TEST|TEST2')
        observed = two_genes.get_consequences(chrom, pos, alts, [])
        self.assertEqual(observed, [['missense_variant', 'stop_gained']])

    def test_get_consequence_last_base(self):
        """ check get_consequences() for changes at the last base of an exon
        """

        chrom, pos, alts = '1', 1000, ('C', )

        # the site is flagged as a conserved final base of an exon
        conserved_sites = {("1", 1000)}

        # a missense variant at a flagged site has its consequence converted
        one_gene = Info('CQ=missense_variant;HGNC=TEST')
        one_gene.set_genes_and_consequence(chrom, pos, alts, [])
        one_gene.last_base = conserved_sites
        observed = one_gene.get_consequences(chrom, pos, alts, [])
        self.assertEqual(observed, [["conserved_exon_terminus_variant"]])

        # When the variant is in multiple genes, only the missense (or
        # splice_region) consequences are converted. Synonymous consequences
        # are left alone, since those come from transcripts where the variant
        # is distant from an exon boundary.
        two_genes = Info(
            'CQ=missense_variant|synonymous_variant;HGNC=TEST|TEST1')
        two_genes.set_genes_and_consequence(chrom, pos, alts, [])
        two_genes.last_base = {("1", 1000)}
        observed = two_genes.get_consequences(chrom, pos, alts, [])
        expected = [["conserved_exon_terminus_variant", "synonymous_variant"]]
        self.assertEqual(observed, expected)

    def test_get_consequence_multiallelic(self):
        """ check get_consequences() gives one entry per alt allele
        """

        chrom, pos, alts = '1', 1000, ('C', 'G')

        # the ',' separated consequences line up with the alt alleles
        info = Info('CQ=missense_variant,synonymous_variant')
        observed = info.get_consequences(chrom, pos, alts, [])

        expected = [['missense_variant'], ['synonymous_variant']]
        self.assertEqual(observed, expected)

    def test_get_consequence_multiallelic_with_masked(self):
        """ check get_consequences() drops the entries for masked alt alleles
        """

        chrom, pos, alts = '1', 1000, ('C', 'G')
        masked = ['G']

        # only the consequence for the unmasked 'C' allele should remain
        info = Info('CQ=missense_variant,synonymous_variant')
        observed = info.get_consequences(chrom, pos, alts, masked)

        self.assertEqual(observed, [['missense_variant']])

    def test_parse_gene_symbols(self):
        """ check parse_gene_symbols() with a single alt allele

        The cases are cumulative: each one edits the symbol fields of the
        shared self.info object before parsing it again.
        """

        alts = ('C', )
        unmasked = []

        # a lone HGNC_ID entry is passed through
        self.info["HGNC_ID"] = "A"
        expected = [Symbols(info={'HGNC_ID': 'A'}, idx=0)]
        self.assertEqual(self.info.parse_gene_symbols(alts, unmasked),
                         expected)

        # without any symbol fields, we get a Symbols without any info
        del self.info["HGNC_ID"]
        expected = [Symbols(info={}, idx=0)]
        self.assertEqual(self.info.parse_gene_symbols(alts, unmasked),
                         expected)

        # multiple '|' separated symbols are kept together
        self.info["HGNC_ID"] = "A|B|C"
        expected = [Symbols(info={'HGNC_ID': 'A|B|C'}, idx=0)]
        self.assertEqual(self.info.parse_gene_symbols(alts, unmasked),
                         expected)

        # missing symbols (given as '.', or empty) end up as empty entries
        self.info["HGNC_ID"] = "|.|C"
        expected = [Symbols(info={'HGNC_ID': '||C'}, idx=0)]
        self.assertEqual(self.info.parse_gene_symbols(alts, unmasked),
                         expected)

        # symbols missing from one field can have alternates in another field
        self.info["HGNC_ID"] = ".|.|C"
        self.info["HGNC"] = "Z|.|C"
        expected = [Symbols(info={'HGNC_ID': '||C', 'HGNC': 'Z||C'}, idx=0)]
        self.assertEqual(self.info.parse_gene_symbols(alts, unmasked),
                         expected)

        # A third symbol field is included alongside the others. Note that
        # checking the precedence order properly would need every possible
        # ordering of the fields to be tested.
        self.info["HGNC_ID"] = ".|.|C"
        self.info["HGNC"] = "Z|.|C"
        self.info["SYMBOL"] = "A|.|C"
        expected = [
            Symbols(
                info={'HGNC_ID': '||C', 'HGNC': 'Z||C', "SYMBOL": "A||C"},
                idx=0),
        ]
        self.assertEqual(self.info.parse_gene_symbols(alts, unmasked),
                         expected)

    def test_parse_gene_symbols_multi_alts(self):
        """ check parse_gene_symbols() with multiple alt alleles
        """

        fields = ('HGNC_ID', 'HGNC', 'SYMBOL', 'ENSG', 'ENST', 'ENSP', 'ENSR')

        # every symbol field has 'D' for the first allele, 'E' for the second
        info = Info(
            'HGNC_ID=D,E;HGNC=D,E;SYMBOL=D,E;ENSG=D,E;ENST=D,E;ENSP=D,E;ENSR=D,E'
        )

        # we expect one Symbols per allele, each holding only the values for
        # its own allele
        expected = [
            Symbols(info=dict.fromkeys(fields, symbol), idx=0)
            for symbol in ('D', 'E')
        ]
        self.assertEqual(info.parse_gene_symbols(('G', 'C'), []), expected)

        # Having more alleles than symbols is an error. This uses the default
        # Info from setUp(), which only has one HGNC_ID value.
        # NOTE: this doesn't check if we have fewer alleles than symbols
        too_many_alts = ('G', 'T', 'C')
        with self.assertRaises(IndexError):
            self.info.parse_gene_symbols(too_many_alts, [])

    def test_parse_gene_symbols_multi_alts_multi_symbols(self):
        """ check parse_gene_symbols() with several symbols for each allele
        """

        fields = ('HGNC_ID', 'HGNC', 'SYMBOL', 'ENSG', 'ENST', 'ENSP', 'ENSR')

        # alleles are ',' separated, and the symbols in an allele '|' separated
        info = Info(
            'HGNC_ID=D|X,E|Y;HGNC=D|X,E|Y;SYMBOL=D|X,E|Y;ENSG=D|X,E|Y;'
            'ENST=D|X,E|Y;ENSP=D|X,E|Y;ENSR=D|X,E|Y')
        alts = ('G', 'C')

        # each allele keeps its own '|' separated group of symbols
        expected = [
            Symbols(info=dict.fromkeys(fields, symbols), idx=0)
            for symbols in ('D|X', 'E|Y')
        ]
        self.assertEqual(info.parse_gene_symbols(alts, []), expected)

    def test_parse_gene_symbols_multi_alts_masked_alt(self):
        """ check parse_gene_symbols() leaves out masked alt alleles
        """

        fields = ('HGNC_ID', 'HGNC', 'SYMBOL', 'ENSG', 'ENST', 'ENSP', 'ENSR')

        info = Info(
            'HGNC_ID=D|X,E|Y;HGNC=D|X,E|Y;SYMBOL=D|X,E|Y;ENSG=D|X,E|Y;'
            'ENST=D|X,E|Y;ENSP=D|X,E|Y;ENSR=D|X,E|Y')
        alts = ('G', 'C')

        # masking the second allele leaves only the symbols for the first
        first_only = [Symbols(info=dict.fromkeys(fields, 'D|X'), idx=0)]
        self.assertEqual(info.parse_gene_symbols(alts, ['C']), first_only)

        # masking both alleles leaves nothing
        self.assertEqual(info.parse_gene_symbols(alts, ['C', 'G']), [])

    def test_parse_gene_symbols_missing_gene(self):
        """ check parse_gene_symbols() when there are no symbol fields at all
        """

        # an empty INFO string has no possible source for a gene symbol
        empty = Info('')

        # the single allele still gets a Symbols entry, built without any info
        observed = empty.parse_gene_symbols(('C', ), [])
        self.assertEqual(observed, [Symbols(info={}, idx=0)])

    def test_is_lof(self):
        """ test that is_lof() works correctly

        Each case builds an Info from a raw INFO string, populates its genes
        and consequences for a single 'G' alt allele, and then checks the
        result of is_lof(), optionally restricted to one gene symbol.
        """

        # The alt alleles are passed as a tuple. The trailing comma matters,
        # since ('G') is just the string 'G' rather than a one-item tuple.
        alts = ('G', )

        # check that a known LOF consequence returns True
        info = Info('CQ=stop_gained;HGNC=TEST')
        info.set_genes_and_consequence('1', 100, alts, [])
        self.assertTrue(info.is_lof())

        # check that a known non-LOF consequence returns False
        info = Info('CQ=missense_variant;HGNC=TEST')
        info.set_genes_and_consequence('1', 100, alts, [])
        self.assertFalse(info.is_lof())

        # check that null values return False
        info = Info('HGNC=TEST')
        info.set_genes_and_consequence('1', 100, alts, [])
        self.assertFalse(info.is_lof())

        # check when the variant overlaps multiple genes (so has multiple
        # gene symbols and consequences).
        info = Info('CQ=stop_gained|missense_variant;HGNC=ATRX|TTN')
        info.set_genes_and_consequence('1', 100, alts, [])

        self.assertTrue(info.is_lof())
        self.assertTrue(info.is_lof("ATRX"))
        self.assertFalse(info.is_lof("TTN"))

        # check that when we have a MNV, we can lose or gain a LOF annotation
        info.mnv_code = 'masked_stop_gain_mnv'
        self.assertFalse(info.is_lof("ATRX"))

        info.mnv_code = 'modified_stop_gained_mnv'
        self.assertTrue(info.is_lof("TTN"))

    def test_is_missense(self):
        """ test that is_missense() works correctly

        Each case builds an Info from a raw INFO string, populates its genes
        and consequences for a single 'G' alt allele, and then checks the
        result of is_missense(), optionally restricted to one gene symbol.
        """

        # The alt alleles are passed as a tuple. The trailing comma matters,
        # since ('G') is just the string 'G' rather than a one-item tuple.
        alts = ('G', )

        # check that known missense equivalent consequence return True
        info = Info('CQ=missense_variant;HGNC=TEST')
        info.set_genes_and_consequence('1', 100, alts, [])
        self.assertTrue(info.is_missense(is_cnv=False))

        # check that known LoF equivalent consequence returns False
        info = Info('CQ=stop_gained;HGNC=TEST')
        info.set_genes_and_consequence('1', 100, alts, [])
        self.assertFalse(info.is_missense(is_cnv=False))

        # check that null values return False
        info = Info('HGNC=TEST')
        info.set_genes_and_consequence('1', 100, alts, [])
        self.assertFalse(info.is_missense(is_cnv=False))

        # check when the variant overlaps multiple genes (so has multiple
        # gene symbols and consequences).
        info = Info('CQ=missense_variant|synonymous_variant;HGNC=ATRX|TTN')
        info.set_genes_and_consequence('1', 100, alts, [])
        self.assertTrue(info.is_missense(is_cnv=False))
        self.assertTrue(info.is_missense(False, "ATRX"))
        self.assertFalse(info.is_missense(False, "TTN"))

        # check that when we have a MNV, we can lose or gain a missense
        # annotation
        info.mnv_code = 'modified_synonymous_mnv'
        self.assertFalse(info.is_missense(False, "ATRX"))

        info.mnv_code = 'modified_protein_altering_mnv'
        self.assertTrue(info.is_missense(False, "TTN"))

        # check that masked stop gained MNVs are converted to a missense
        info = Info('CQ=stop_gained;HGNC=ATRX')
        info.set_genes_and_consequence('1', 100, alts, [])
        info.mnv_code = 'masked_stop_gain_mnv'
        self.assertTrue(info.is_missense(False))

    def test_is_missense_cnv(self):
        """ check that is_missense() depends on whether the variant is a CNV
        """

        chrom = '1'
        pos = '15000000'
        alts = ('G', )

        info = Info('HGNC=ATRX;CQ=coding_sequence_variant;random_tag')
        info.set_genes_and_consequence(chrom, pos, alts, [])

        # coding_sequence_variant only counts as missense for a CNV
        as_cnv = info.is_missense(is_cnv=True)
        self.assertTrue(as_cnv)

        as_non_cnv = info.is_missense(is_cnv=False)
        self.assertFalse(as_non_cnv)

    def test_get_per_gene_consequence(self):
        """ check get_per_gene_consequence() picks the right consequences

        The symbols and consequence attributes of the default Info object are
        assigned directly for each scenario.
        """

        # one gene: we get the consequence back when not selecting a gene, or
        # when selecting the matching gene, but not for an unknown gene
        self.info.symbols = [Symbols(info={'HGNC': 'ATRX'}, idx=0)]
        self.info.consequence = [["missense_variant"]]
        single_gene_cases = (
            (None, ["missense_variant"]),
            ("ATRX", ["missense_variant"]),
            ("TEST", []),
        )
        for symbol, expected in single_gene_cases:
            self.assertEqual(self.info.get_per_gene_consequence(symbol),
                             expected)

        # two genes: we only get the consequence for the requested gene
        self.info.symbols = [Symbols(info={'HGNC': 'ATRX|TTN'}, idx=0)]
        self.info.consequence = [["missense_variant", "synonymous_variant"]]
        two_gene_cases = (
            ("ATRX", ["missense_variant"]),
            ("TTN", ["synonymous_variant"]),
        )
        for symbol, expected in two_gene_cases:
            self.assertEqual(self.info.get_per_gene_consequence(symbol),
                             expected)

        three_consequences = [
            "splice_acceptor_variant", "missense_variant", "synonymous_variant"
        ]

        # if a symbol occurs twice, only its first consequence is used
        self.info.symbols = [Symbols(info={'HGNC': 'TEMP|ATRX|TEMP'}, idx=0)]
        self.info.consequence = [list(three_consequences)]
        observed = self.info.get_per_gene_consequence("TEMP")
        self.assertEqual(observed, ["splice_acceptor_variant"])

        # empty gene symbols either side don't shift the matching consequence
        self.info.symbols = [Symbols(info={'HGNC': '|ATRX|'}, idx=0)]
        self.info.consequence = [list(three_consequences)]
        observed = self.info.get_per_gene_consequence("ATRX")
        self.assertEqual(observed, ["missense_variant"])

    def test_get_allele_frequency(self):
        """ check that get_allele_frequency() converts values as expected
        """

        # pairs of (raw value, expected frequency), checked in order
        cases = [
            ("1", 1),       # single number returns that number
            ("1,1", 1),     # two identical numbers return one number
            ("1,2", 2),     # two numbers return the highest number
            ("a,1", 1),     # number and string return the number
            ("a", None),    # single string value returns None
            ("a,b", None),  # multiple string values return None
            (None, None),   # None returns None
        ]

        for raw, expected in cases:
            self.assertEqual(self.info.get_allele_frequency(raw), expected)

    def test_is_number(self):
        """ check that is_number() identifies values which represent numbers
        """

        # pairs of (value, whether it should count as a number)
        cases = ((None, False), ("5", True), ("a", False))

        for value, expected in cases:
            self.assertEqual(self.info.is_number(value), expected)

    def test_find_max_allele_frequency(self):
        """ check find_max_allele_frequency() as population tags are added

        The frequencies are added cumulatively to the default Info object.
        """

        # nothing is found until a frequency has been recorded
        observed = self.info.find_max_allele_frequency()
        self.assertIsNone(observed)

        # a single population gives the frequency for that population
        self.info["MAX_AF"] = "0.005"
        observed = self.info.find_max_allele_frequency()
        self.assertEqual(observed, 0.005)

        # with two populations, we get the higher frequency
        self.info["AFR_AF"] = "0.01"
        observed = self.info.find_max_allele_frequency()
        self.assertEqual(observed, 0.01)

        # set every population in turn, checking the maximum after each one
        pops = {
            "AFR_AF", "AMR_AF", "ASN_AF", "DDD_AF", "EAS_AF",
            "ESP_AF", "EUR_AF", "MAX_AF", "SAS_AF", "UK10K_cohort_AF",
        }
        for pop in pops:
            self.info[pop] = "0.05"
            observed = self.info.find_max_allele_frequency()
            self.assertEqual(observed, 0.05)

        # a None value for one population must not upset the others
        self.info["AFR_AF"] = None
        observed = self.info.find_max_allele_frequency()
        self.assertEqual(observed, 0.05)

    def test_find_max_allele_frequency_without_populations(self):
        """ check find_max_allele_frequency() when no populations are set
        """

        # record a frequency which would be found, if populations were set
        self.info["MAX_AF"] = "0.005"

        # This is a regression test for a problem that only shows up when the
        # unit tests run in an order where the populations might not have
        # been set beforehand. With no populations to check, we get None.
        Info.set_populations([])
        observed = self.info.find_max_allele_frequency()
        self.assertEqual(observed, None)

        # put the populations back, so that other unit tests can also rely on
        # the populations being set
        Info.set_populations(self.pops)
 def tearDown(self):
     """ reset the class-level state which the tests may have changed
     """

     # clear the known genes held on the SNV class
     SNV.known_genes = None

     # clear the populations registered on the Info class
     Info.set_populations([])