コード例 #1
0
    def test_is_lof(self):
        """ check that is_lof() flags loss-of-function consequences
        """

        def annotated(text):
            # every case annotates the same dummy site: chrom '1', position
            # 100, alt 'G', with no masked alleles.
            # NOTE(review): ('G') is a plain string rather than a 1-tuple -
            # confirm that is what set_genes_and_consequence() should receive.
            built = Info(text)
            built.set_genes_and_consequence('1', 100, ('G'), [])
            return built

        # expect a known LOF consequence to return True
        self.assertTrue(annotated('CQ=stop_gained;HGNC=TEST').is_lof())

        # expect a known non-LOF consequence to return False
        self.assertFalse(annotated('CQ=missense_variant;HGNC=TEST').is_lof())

        # expect a missing consequence annotation to return False
        self.assertFalse(annotated('HGNC=TEST').is_lof())

        # a variant overlapping multiple genes has one gene symbol and one
        # consequence per gene, so the answer can depend on the gene asked for
        multi = annotated('CQ=stop_gained|missense_variant;HGNC=ATRX|TTN')
        self.assertTrue(multi.is_lof())
        self.assertTrue(multi.is_lof("ATRX"))
        self.assertFalse(multi.is_lof("TTN"))

        # with a MNV code set, a LOF annotation can be lost ...
        multi.mnv_code = 'masked_stop_gain_mnv'
        self.assertFalse(multi.is_lof("ATRX"))

        # ... or gained
        multi.mnv_code = 'modified_stop_gained_mnv'
        self.assertTrue(multi.is_lof("TTN"))
コード例 #2
0
 def test_is_lof(self):
     """ exercise is_lof() for single-gene, multi-gene and MNV cases
     """

     # NOTE(review): ('G') below is a plain string, not a 1-tuple - confirm.

     # single gene with a known LOF consequence: expect True
     lof = Info('CQ=stop_gained;HGNC=TEST')
     lof.set_genes_and_consequence('1', 100, ('G'), [])
     self.assertTrue(lof.is_lof())

     # single gene with a known non-LOF consequence: expect False
     missense = Info('CQ=missense_variant;HGNC=TEST')
     missense.set_genes_and_consequence('1', 100, ('G'), [])
     self.assertFalse(missense.is_lof())

     # no consequence annotation at all: expect False
     unannotated = Info('HGNC=TEST')
     unannotated.set_genes_and_consequence('1', 100, ('G'), [])
     self.assertFalse(unannotated.is_lof())

     # variant overlapping two genes, so it has two gene symbols and two
     # consequences: only the stop_gained gene should count as LOF
     two_genes = Info('CQ=stop_gained|missense_variant;HGNC=ATRX|TTN')
     two_genes.set_genes_and_consequence('1', 100, ('G'), [])

     self.assertTrue(two_genes.is_lof())
     self.assertTrue(two_genes.is_lof("ATRX"))
     self.assertFalse(two_genes.is_lof("TTN"))

     # a MNV code on the same object can remove a LOF annotation
     two_genes.mnv_code = 'masked_stop_gain_mnv'
     self.assertFalse(two_genes.is_lof("ATRX"))

     # and a different MNV code can add one
     two_genes.mnv_code = 'modified_stop_gained_mnv'
     self.assertTrue(two_genes.is_lof("TTN"))
コード例 #3
0
class Variant(object):
    """ generic functions for variants

    Holds the fields of one VCF record for one individual (location, alleles,
    quality, INFO and FORMAT data) plus the individual's gender, from which
    the chromosomal inheritance type is derived.

    set_genotype() is called by __init__ but is not defined on this class, so
    it has to be supplied by a subclass (presumably SNV/CNV - see __repr__).
    """

    # define some codes used in ped files to identify male and female sexes
    male_codes = {"1", "m", "M", "male"}
    female_codes = {"2", "f", "F", "female"}

    # (start, end) positions of the pseudoautosomal regions on chrX and chrY
    # NOTE(review): the genome build these coordinates refer to is not stated
    # here - confirm before using with other builds.
    x_pseudoautosomal_regions = [(60001, 2699520), (154930290, 155260560),
        (88456802, 92375509)]
    y_pseudoautosomal_regions = [(10001, 2649520), (59034050, 59363566)]
    known_genes = None

    @classmethod
    def set_known_genes(cls_obj, known_genes):
        """ store the known genes on the class, shared by all instances
        """
        cls_obj.known_genes = known_genes

    def __init__(self, chrom, position, id, ref, alts, qual, filter, info=None,
            format=None, sample=None, gender=None, sum_x_lr2=None, parents=None, mnv_code=None):
        """ initialise the object with the definition values

        Args:
            chrom: chromosome name (e.g. "1", "X" or "chrX")
            position: chromosomal position, converted with int()
            id: VCF ID field, "." or &-separated IDs (see set_mutation_id)
            ref: reference allele
            alts: comma-separated alternate alleles
            qual: VCF QUAL value, stored unchanged
            filter: VCF FILTER value, stored unchanged
            info: INFO text, handed to Info() together with mnv_code
            format: colon-separated FORMAT keys, or None
            sample: colon-separated sample values matching format, or None
            gender: gender code from male_codes/female_codes, or None
            sum_x_lr2: stored unchanged, returned by get_sum_x_lr2()
            parents: stored as has_parents, returned by get_has_parents()
            mnv_code: multinucleotide variant code passed on to Info, or None

        Raises:
            ValueError: if gender is given but is not a known gender code
        """

        self.chrom = chrom
        self.position = int(position)

        self.variant_id = id
        self.mutation_id = "NA"
        self.set_mutation_id(self.variant_id)

        self.ref_allele = ref
        self.alt_alleles = tuple(alts.split(','))

        self.mnv_code = mnv_code
        self.qual = qual
        self.filter = filter

        self.sum_x_lr2 = sum_x_lr2

        self.has_parents = parents

        # intialise variables that will be set later
        self.inheritance_type = None

        # setting the gender also sets the inheritance type, which needs the
        # chrom and position assigned above
        self.gender = None
        if gender is not None:
            self._set_gender(gender)

        self.vcf_line = None

        self.format = None
        if format is not None and sample is not None:
            self.add_format(format, sample)

        # the low depth alleles depend on both self.info and self.format, so
        # this has to come after the format has been parsed
        self.info = Info(info, self.mnv_code)
        masked = self.get_low_depth_alleles(self.ref_allele, self.alt_alleles)
        self.info.set_genes_and_consequence(self.get_chrom(),
            self.get_position(), self.alt_alleles, masked)

        self.genotype = None
        if self.format is not None and self._get_gender() is not None:
            self.set_genotype()

    def is_lof(self, gene_symbol=None):
        """ delegates to Info.is_lof() for the given gene symbol
        """
        return self.info.is_lof(gene_symbol)

    def is_missense(self, is_cnv, gene_symbol=None):
        """ delegates to Info.is_missense() for the given gene symbol
        """
        return self.info.is_missense(is_cnv, gene_symbol)

    def is_synoymous(self, gene_symbol=None):
        """ delegates to Info.is_synoymous() for the given gene symbol

        The spelling of the method name is kept as is, since callers use it.
        """
        return self.info.is_synoymous(gene_symbol)

    def __repr__(self):
        """ repr function for Variant objects. SNV(...) and CNV(...) also work

        The text uses type(self).__name__, so subclasses show their own name.
        """

        def quote(value):
            # wrap real values in double quotes, but leave None bare
            if value is not None:
                value = '"{}"'.format(value)
            return value

        # reprocess the format dictionary back to the original text strings
        # (keys come out sorted, which need not match the original order)
        keys, sample = None, None
        if self.format is not None:
            ordered = sorted(self.format)
            keys = quote(':'.join(ordered))
            sample = quote(':'.join([self.format[x] for x in ordered]))

        info = quote(self.info)
        gender = quote(self.gender)
        mnv_code = quote(self.mnv_code)

        return '{}(chrom="{}", position={}, id="{}", ref="{}", alts="{}", ' \
            'qual="{}", filter="{}", info={}, format={}, sample={}, gender={}, ' \
            'mnv_code={})'.format(type(self).__name__, self.chrom,
            self.position, self.variant_id, self.ref_allele,
            ','.join(self.alt_alleles), self.qual, self.filter, info, keys, sample,
            gender, mnv_code)

    def __hash__(self):
        """ hash of the text representation of the variant
        """
        return hash(str(self))

    def __eq__(self, other):
        """ variants are equal when their text representations are equal

        The text is compared directly rather than via the hashes, so that a
        hash collision cannot make two different variants compare equal, and
        so that comparing against an unhashable object does not raise.
        """
        if not isinstance(other, Variant):
            return NotImplemented

        return str(self) == str(other)

    def _set_gender(self, gender):
        """ sets the gender of the individual for the variant

        Also sets the inheritance type, since that depends on the gender.

        Args:
            gender: gender code, one of male_codes or female_codes

        Raises:
            ValueError: if the code is in neither set of gender codes
        """

        if gender in self.male_codes:
            self.gender = "male"
        elif gender in self.female_codes:
            self.gender = "female"
        else:
            raise ValueError("unknown gender code")

        self.set_inheritance_type(self.get_position(), self.is_male())

    def _get_gender(self):
        """ returns the normalised gender ("male" or "female"), or None if unset
        """
        return self.gender

    def set_mutation_id(self, variant_id):
        """ sets the mutation ID based on the VCF ID field

        The variant ID can be either "." for null value, an rsID, a HGMD ID,
        a COSMIC ID, or any combination of those (including multiple HGMD IDs
        for a single variant). The mutation ID is left unchanged if the
        variant ID is "." or only contains rsIDs.

        Args:
            variant_id: string from the VCF ID field, can be rsID, or a list of
                &-separated IDs, which can include COSMIC and HGMD IDs.
        """

        if variant_id == ".":
            return

        # include everything that isn't an rsID
        ids = [x for x in variant_id.split("&") if not x.startswith("rs")]
        if ids:
            self.mutation_id = ",".join(ids)

    def get_mutation_id(self):
        """ return the mutation ID ("NA" unless set_mutation_id found IDs)
        """
        return self.mutation_id

    def is_male(self):
        """ returns True/False for whether the person is male
        """

        # the normalised "male" value is itself one of the male codes
        return self._get_gender() in self.male_codes

    def add_format(self, keys, values):
        """Parses the FORMAT column from VCF files.

        Args:
            keys: FORMAT text from a line in a VCF file
            values: the values for the format keys
        """

        self.format = dict(zip(keys.split(":"), values.split(":")))

    def get_low_depth_alleles(self, ref, alts):
        """ get a list of alleles with zero counts, or indels with 1 read

        Some variants have multiple alts, so we need to select the alt with
        the most severe consequence. However, in at least one version of the
        VCFs, one of the alts could have zero counts, which I believe resulted
        from the population based multi-sample calling. We need to drop the
        consequences recorded for zero-count alternate alleles before finding
        the most severe.

        We also want to avoid indels with only one read, because these are
        universally bad calls.

        Without an AC entry every alt is given a count of '1', and without
        FORMAT data (or an AD entry) every alt is given a depth of '10', so
        that nothing is masked by default.

        Args:
            ref: reference allele
            alts: tuple of alt alleles

        Returns:
            list of alleles with sufficiently low depth, in the order of alts
        """

        allele_counts = ['1'] * len(alts)
        if 'AC' in self.info:
            allele_counts = self.info['AC'].split(',')

        allele_depths = ['10'] * len(alts)
        # self.format stays None when the variant was built without
        # FORMAT/sample data, so guard the lookup rather than raise TypeError
        if self.format is not None and 'AD' in self.format:
            # drop the first value so the rest line up with the alts
            # (presumably the first value is the depth of the ref allele)
            allele_depths = self.format['AD'].split(',')[1:]

        counts = list(zip(allele_counts, allele_depths))

        assert len(counts) == len(alts)

        # collect the alleles where the allele count is zero, or indels with
        # 1 alt read, so we can mask them out
        masked = []
        for alt, (count, depth) in zip(alts, counts):
            is_indel = len(ref) > 1 or len(alt) > 1
            if count == '0' or (depth == '1' and is_indel):
                masked.append(alt)

        return masked

    def add_vcf_line(self, vcf_line):
        """ store the VCF line for the variant
        """
        self.vcf_line = vcf_line

    def get_vcf_line(self):
        """ return the stored VCF line (None if none was added)
        """
        return self.vcf_line

    def set_inheritance_type(self, pos, is_male):
        """ sets the chromosome type (eg autosomal, or X chromosome type).

        This only does simple string matching on the chromosome name. The sex
        chromosomes are recognised as "X", "chrX", "ChrX", "Y", "chrY" or
        "ChrY" (not as "23" or "24"); every other name is treated as
        autosomal. Positions strictly inside a pseudoautosomal region of a sex
        chromosome are also treated as autosomal.

        Args:
            pos: position on the chromosome
            is_male: True/False for whether the individual is male
        """

        chrom = self.get_chrom()
        if chrom in ("chrX", "ChrX", "X"):
            regions = self.x_pseudoautosomal_regions
            male_type, female_type = "XChrMale", "XChrFemale"
        elif chrom in ("chrY", "ChrY", "Y"):
            regions = self.y_pseudoautosomal_regions
            male_type, female_type = "YChrMale", "YChrFemale"
        else:
            self.inheritance_type = "autosomal"
            return

        # check if the position lies within a pseudoautosomal region
        if any(start < pos < end for start, end in regions):
            self.inheritance_type = "autosomal"
        elif is_male:
            self.inheritance_type = male_type
        else:
            self.inheritance_type = female_type

    def get_inheritance_type(self):
        """ return the variant chromosomal inheritance type
        """

        return self.inheritance_type

    def get_chrom(self):
        """ return the variant chromosome
        """

        return self.chrom

    def get_position(self):
        """ return the variant chromosomal position
        """

        return self.position

    def get_genotype(self):
        """ return the genotype value
        """

        return self.genotype

    def get_sum_x_lr2(self):
        """ return the sum of mean l2r on x chromsome
        """

        return self.sum_x_lr2

    def get_has_parents(self):
        """returns false for singletons, true for trios
        """

        return self.has_parents
コード例 #4
0
ファイル: variant.py プロジェクト: dowing/clinical-filter
class Variant(object):
    """ generic functions for variants

    Holds the fields of one VCF record for one individual (location, alleles,
    quality, INFO and FORMAT data) plus the individual's gender, from which
    the chromosomal inheritance type is derived.

    set_genotype() is called by __init__ but is not defined on this class, so
    it has to be supplied by a subclass (presumably SNV/CNV - see __repr__).
    """

    # define some codes used in ped files to identify male and female sexes
    male_codes = {"1", "m", "M", "male"}
    female_codes = {"2", "f", "F", "female"}

    # (start, end) positions of the pseudoautosomal regions on chrX and chrY
    # NOTE(review): the genome build these coordinates refer to is not stated
    # here - confirm before using with other builds.
    x_pseudoautosomal_regions = [(60001, 2699520), (154930290, 155260560),
        (88456802, 92375509)]
    y_pseudoautosomal_regions = [(10001, 2649520), (59034050, 59363566)]
    known_genes = None

    @classmethod
    def set_known_genes(cls_obj, known_genes):
        """ store the known genes on the class, shared by all instances
        """
        cls_obj.known_genes = known_genes

    def __init__(self,
                 chrom,
                 position,
                 id,
                 ref,
                 alts,
                 qual,
                 filter,
                 info=None,
                 format=None,
                 sample=None,
                 gender=None,
                 sum_x_lr2=None,
                 parents=None,
                 mnv_code=None):
        """ initialise the object with the definition values

        Args:
            chrom: chromosome name (e.g. "1", "X" or "chrX")
            position: chromosomal position, converted with int()
            id: VCF ID field, "." or &-separated IDs (see set_mutation_id)
            ref: reference allele
            alts: comma-separated alternate alleles
            qual: VCF QUAL value, stored unchanged
            filter: VCF FILTER value, stored unchanged
            info: INFO text, handed to Info() together with mnv_code
            format: colon-separated FORMAT keys, or None
            sample: colon-separated sample values matching format, or None
            gender: gender code from male_codes/female_codes, or None
            sum_x_lr2: stored unchanged, returned by get_sum_x_lr2()
            parents: stored as has_parents, returned by get_has_parents()
            mnv_code: multinucleotide variant code passed on to Info, or None

        Raises:
            ValueError: if gender is given but is not a known gender code
        """

        self.chrom = chrom
        self.position = int(position)

        self.variant_id = id
        self.mutation_id = "NA"
        self.set_mutation_id(self.variant_id)

        self.ref_allele = ref
        self.alt_alleles = tuple(alts.split(','))

        self.mnv_code = mnv_code
        self.qual = qual
        self.filter = filter

        self.sum_x_lr2 = sum_x_lr2

        self.has_parents = parents

        # intialise variables that will be set later
        self.inheritance_type = None

        # setting the gender also sets the inheritance type, which needs the
        # chrom and position assigned above
        self.gender = None
        if gender is not None:
            self._set_gender(gender)

        self.vcf_line = None

        self.format = None
        if format is not None and sample is not None:
            self.add_format(format, sample)

        # the low depth alleles depend on both self.info and self.format, so
        # this has to come after the format has been parsed
        self.info = Info(info, self.mnv_code)
        masked = self.get_low_depth_alleles(self.ref_allele, self.alt_alleles)
        self.info.set_genes_and_consequence(self.get_chrom(),
                                            self.get_position(),
                                            self.alt_alleles, masked)

        self.genotype = None
        if self.format is not None and self._get_gender() is not None:
            self.set_genotype()

    def is_lof(self, gene_symbol=None):
        """ delegates to Info.is_lof() for the given gene symbol
        """
        return self.info.is_lof(gene_symbol)

    def is_missense(self, is_cnv, gene_symbol=None):
        """ delegates to Info.is_missense() for the given gene symbol
        """
        return self.info.is_missense(is_cnv, gene_symbol)

    def is_synoymous(self, gene_symbol=None):
        """ delegates to Info.is_synoymous() for the given gene symbol

        The spelling of the method name is kept as is, since callers use it.
        """
        return self.info.is_synoymous(gene_symbol)

    def __repr__(self):
        """ repr function for Variant objects. SNV(...) and CNV(...) also work

        The text uses type(self).__name__, so subclasses show their own name.
        """
        def quote(value):
            # wrap real values in double quotes, but leave None bare
            if value is not None:
                value = '"{}"'.format(value)
            return value

        # reprocess the format dictionary back to the original text strings
        # (keys come out sorted, which need not match the original order)
        keys, sample = None, None
        if self.format is not None:
            ordered = sorted(self.format)
            keys = quote(':'.join(ordered))
            sample = quote(':'.join([self.format[x] for x in ordered]))

        info = quote(self.info)
        gender = quote(self.gender)
        mnv_code = quote(self.mnv_code)

        return '{}(chrom="{}", position={}, id="{}", ref="{}", alts="{}", ' \
            'qual="{}", filter="{}", info={}, format={}, sample={}, gender={}, ' \
            'mnv_code={})'.format(type(self).__name__, self.chrom,
            self.position, self.variant_id, self.ref_allele,
            ','.join(self.alt_alleles), self.qual, self.filter, info, keys, sample,
            gender, mnv_code)

    def __hash__(self):
        """ hash of the text representation of the variant
        """
        return hash(str(self))

    def __eq__(self, other):
        """ variants are equal when their text representations are equal

        The text is compared directly rather than via the hashes, so that a
        hash collision cannot make two different variants compare equal, and
        so that comparing against an unhashable object does not raise.
        """
        if not isinstance(other, Variant):
            return NotImplemented

        return str(self) == str(other)

    def _set_gender(self, gender):
        """ sets the gender of the individual for the variant

        Also sets the inheritance type, since that depends on the gender.

        Args:
            gender: gender code, one of male_codes or female_codes

        Raises:
            ValueError: if the code is in neither set of gender codes
        """

        if gender in self.male_codes:
            self.gender = "male"
        elif gender in self.female_codes:
            self.gender = "female"
        else:
            raise ValueError("unknown gender code")

        self.set_inheritance_type(self.get_position(), self.is_male())

    def _get_gender(self):
        """ returns the normalised gender ("male" or "female"), or None if unset
        """
        return self.gender

    def set_mutation_id(self, variant_id):
        """ sets the mutation ID based on the VCF ID field

        The variant ID can be either "." for null value, an rsID, a HGMD ID,
        a COSMIC ID, or any combination of those (including multiple HGMD IDs
        for a single variant). The mutation ID is left unchanged if the
        variant ID is "." or only contains rsIDs.

        Args:
            variant_id: string from the VCF ID field, can be rsID, or a list of
                &-separated IDs, which can include COSMIC and HGMD IDs.
        """

        if variant_id == ".":
            return

        # include everything that isn't an rsID
        ids = [x for x in variant_id.split("&") if not x.startswith("rs")]
        if ids:
            self.mutation_id = ",".join(ids)

    def get_mutation_id(self):
        """ return the mutation ID ("NA" unless set_mutation_id found IDs)
        """
        return self.mutation_id

    def is_male(self):
        """ returns True/False for whether the person is male
        """

        # the normalised "male" value is itself one of the male codes
        return self._get_gender() in self.male_codes

    def add_format(self, keys, values):
        """Parses the FORMAT column from VCF files.

        Args:
            keys: FORMAT text from a line in a VCF file
            values: the values for the format keys
        """

        self.format = dict(zip(keys.split(":"), values.split(":")))

    def get_low_depth_alleles(self, ref, alts):
        """ get a list of alleles with zero counts, or indels with 1 read

        Some variants have multiple alts, so we need to select the alt with
        the most severe consequence. However, in at least one version of the
        VCFs, one of the alts could have zero counts, which I believe resulted
        from the population based multi-sample calling. We need to drop the
        consequences recorded for zero-count alternate alleles before finding
        the most severe.

        We also want to avoid indels with only one read, because these are
        universally bad calls.

        Without an AC entry every alt is given a count of '1', and without
        FORMAT data (or an AD entry) every alt is given a depth of '10', so
        that nothing is masked by default.

        Args:
            ref: reference allele
            alts: tuple of alt alleles

        Returns:
            list of alleles with sufficiently low depth, in the order of alts
        """

        allele_counts = ['1'] * len(alts)
        if 'AC' in self.info:
            allele_counts = self.info['AC'].split(',')

        allele_depths = ['10'] * len(alts)
        # self.format stays None when the variant was built without
        # FORMAT/sample data, so guard the lookup rather than raise TypeError
        if self.format is not None and 'AD' in self.format:
            # drop the first value so the rest line up with the alts
            # (presumably the first value is the depth of the ref allele)
            allele_depths = self.format['AD'].split(',')[1:]

        counts = list(zip(allele_counts, allele_depths))

        assert len(counts) == len(alts)

        # collect the alleles where the allele count is zero, or indels with
        # 1 alt read, so we can mask them out
        masked = []
        for alt, (count, depth) in zip(alts, counts):
            is_indel = len(ref) > 1 or len(alt) > 1
            if count == '0' or (depth == '1' and is_indel):
                masked.append(alt)

        return masked

    def add_vcf_line(self, vcf_line):
        """ store the VCF line for the variant
        """
        self.vcf_line = vcf_line

    def get_vcf_line(self):
        """ return the stored VCF line (None if none was added)
        """
        return self.vcf_line

    def set_inheritance_type(self, pos, is_male):
        """ sets the chromosome type (eg autosomal, or X chromosome type).

        This only does simple string matching on the chromosome name. The sex
        chromosomes are recognised as "X", "chrX", "ChrX", "Y", "chrY" or
        "ChrY" (not as "23" or "24"); every other name is treated as
        autosomal. Positions strictly inside a pseudoautosomal region of a sex
        chromosome are also treated as autosomal.

        Args:
            pos: position on the chromosome
            is_male: True/False for whether the individual is male
        """

        chrom = self.get_chrom()
        if chrom in ("chrX", "ChrX", "X"):
            regions = self.x_pseudoautosomal_regions
            male_type, female_type = "XChrMale", "XChrFemale"
        elif chrom in ("chrY", "ChrY", "Y"):
            regions = self.y_pseudoautosomal_regions
            male_type, female_type = "YChrMale", "YChrFemale"
        else:
            self.inheritance_type = "autosomal"
            return

        # check if the position lies within a pseudoautosomal region
        if any(start < pos < end for start, end in regions):
            self.inheritance_type = "autosomal"
        elif is_male:
            self.inheritance_type = male_type
        else:
            self.inheritance_type = female_type

    def get_inheritance_type(self):
        """ return the variant chromosomal inheritance type
        """

        return self.inheritance_type

    def get_chrom(self):
        """ return the variant chromosome
        """

        return self.chrom

    def get_position(self):
        """ return the variant chromosomal position
        """

        return self.position

    def get_genotype(self):
        """ return the genotype value
        """

        return self.genotype

    def get_sum_x_lr2(self):
        """ return the sum of mean l2r on x chromsome
        """

        return self.sum_x_lr2

    def get_has_parents(self):
        """returns false for singletons, true for trios
        """

        return self.has_parents