def __init__(self, opts):
        """ set up the instance from the options, then create the reporter

        Args:
            opts: options object, handed straight to self.set_definitions().
        """

        self.set_definitions(opts)

        # the reporter is built from instance attributes, which are presumably
        # populated by set_definitions() - TODO confirm
        report_args = (self.output_path, self.export_vcf, self.ID_mapper,
                       self.known_genes_date)
        self.report = Report(*report_args)
Example #2
0
    def test__save_tabular(self):
        ''' check that _save_tabular() writes the expected header and row
        '''

        # only the path of the temporary file is needed, so release the handle
        # straight away instead of leaking it. delete=False keeps the file on
        # disk, so that it can be written to and re-opened by name below.
        temp = tempfile.NamedTemporaryFile(suffix='.txt',
                                           dir=self.temp_dir,
                                           delete=False)
        temp.close()

        # NOTE(review): the Report instance is not used by this test, the call
        # is only kept in case constructing it touches the output path.
        Report(temp.name, None, None)

        # a single candidate: (variant, [check types], [inheritances], [genes])
        var = (self.variants[0], ["single_variant"], ["Monoallelic"], ["TEST"])
        var[0].child.format['GQ'] = 40
        _save_tabular(temp.name, [var], self.trio)

        with open(temp.name, 'r') as handle:
            lines = handle.readlines()

        # one header line, followed by one line for the variant
        expected = [
            'proband\tsex\tchrom\tposition\tgene\t'
            'mutation_ID\ttranscript\tconsequence\tref/alt_alleles\tMAX_MAF\t'
            'inheritance\ttrio_genotype\tmom_aff\tdad_aff\tresult\tpp_dnm\t'
            'exac_allele_count\tGQ\thas_parents\tcnv_length\n',
            'child\tF\tX\t150\tTEST\tNA\tNA\t'
            'missense_variant\tA/G\t0.0005\tMonoallelic\t1/0/0\t1\t1\t'
            'single_variant\t0.99\tNA\t40\tTrue\tNA\n'
        ]

        self.assertEqual(lines, expected)
Example #3
0
    def setUp(self):
        """ build the family, the test variant and the Report instance that
        the tests share, and register the population tags
        """

        # female proband, with both parents given the status code "1"
        proband_sex = "F"
        self.trio = self.create_family(proband_sex, "1", "1")

        # heterozygous child at X:150, with both parents homozygous reference
        child = create_snv(proband_sex, "0/1", chrom='X', pos=150,
                           extra_info='HGNC=TEST;MAX_AF=0.0005')
        mom = create_snv("F", "0/0", chrom='X', pos=150)
        dad = create_snv("M", "0/0", chrom='X', pos=150)
        trio_genotypes = TrioGenotypes('X', '150', child, mom, dad)
        self.variants = [trio_genotypes]

        self.report = Report(None, None, None)

        # population tags are set on the Info class itself, not on an instance
        population_tags = [
            "AFR_AF", "AMR_AF", "ASN_AF", "DDD_AF", "EAS_AF", "ESP_AF",
            "EUR_AF", "MAX_AF", "SAS_AF", "UK10K_cohort_AF",
        ]
        Info.set_populations(population_tags)
Example #4
0
    def __init__(self, population_tags=None, count=0, known_genes=None, date=None,
            regions=None, lof_sites=None, pp_filter=0.0, sum_x_lr2_file=None,
            output_path=None, export_vcf=None, debug_chrom=None, debug_pos=None):
        """ store the filtering settings and open the reference datasets

        Args:
            population_tags: list of population ID tags that could exist within
                the INFO field, or None.
            count: number of probands to analyse, used to track progress in
                the output logs.
            known_genes: path to a table of genes known to be associated with
                genetic disorders, or None.
            date: date of the known_genes file, or None if not using/unknown.
            regions: path to a table of regions for DECIPHER CNV syndromes.
            lof_sites: path to a json file of [chrom, position] coordinates in
                the genome, for modifying to a loss-of-function consequence if
                required. Can be None if unneeded.
            pp_filter: threshold from 0 to 1 for the pp_dnm value, candidate
                DNMs which fall below this value are filtered out.
            sum_x_lr2_file: file containing the sum of l2r values on the X
                chromosome for each person.
            output_path: path to write the output tab-separated file to.
            export_vcf: path to a file or folder to write VCFs to.
            debug_chrom: chromosome for debugging purposes.
            debug_pos: position to debug variant filtering at.
        """

        # settings used while loading and filtering variants
        self.populations = population_tags
        self.pp_filter = pp_filter
        self.debug_chrom, self.debug_pos = debug_chrom, debug_pos

        # progress tracking, the running count starts from zero
        self.total, self.count = count, 0

        # reference datasets. According to the original notes, the first three
        # loaders return None for a None path, and the X chromosome l2r loader
        # returns an empty dict for a None path.
        self.known_genes = open_known_genes(known_genes)
        self.cnv_regions = open_cnv_regions(regions)
        self.last_base = open_last_base_sites(lof_sites)
        self.sum_x_lr2 = open_x_lr2_file(sum_x_lr2_file)

        self.reporter = Report(output_path, export_vcf, date)
    def setUp(self):
        """ build the family, the test variant and the Report instance that
        the tests share
        """

        # female proband, with both parents given the status code "1"
        proband_sex = "F"
        self.trio = self.create_family(proband_sex, "1", "1")

        # one SNV each for child, mother and father: heterozygous in the
        # child, homozygous reference in both of the parents
        members = [(proband_sex, "0/1"), ("F", "0/0"), ("M", "0/0")]
        child_snv, mother_snv, father_snv = [
            self.create_snv(sex, genotype) for sex, genotype in members
        ]

        trio_variant = TrioGenotypes(child_snv)
        trio_variant.add_mother_variant(mother_snv)
        trio_variant.add_father_variant(father_snv)
        self.variants = [trio_variant]

        # the report needs the family attached before it is used in the tests
        self.report = Report(None, None, None, None)
        self.report.family = self.trio
 def setUp(self):
     """ build the family, the test variant and the Report instance that
     the tests share
     """

     # female proband, with both parents given the status code "1"
     proband_sex = "F"
     self.trio = self.create_family(proband_sex, "1", "1")

     # one SNV each for child, mother and father: heterozygous in the
     # child, homozygous reference in both of the parents
     members = [(proband_sex, "0/1"), ("F", "0/0"), ("M", "0/0")]
     child_snv, mother_snv, father_snv = [
         self.create_snv(sex, genotype) for sex, genotype in members
     ]

     trio_variant = TrioGenotypes(child_snv)
     trio_variant.add_mother_variant(mother_snv)
     trio_variant.add_father_variant(father_snv)
     self.variants = [trio_variant]

     # the report needs the family attached before it is used in the tests
     self.report = Report(None, None, None, None)
     self.report.family = self.trio
class ClinicalFilter(LoadOptions):
    """ filters trios for candidate variants that might contribute to a
    proband's disorder.
    """

    def __init__(self, opts):
        """ initialise the class from the options, and create the reporter

        Args:
            opts: options object, handed straight to self.set_definitions().
        """

        self.set_definitions(opts)
        self.report = Report(self.output_path, self.export_vcf, self.ID_mapper,
                             self.known_genes_date)

    def filter_trios(self):
        """ loads trio variants, and screens for candidate variants

        Families are visited in sorted ID order, and every affected child in a
        family is analysed in turn. Note that this method ends by calling
        sys.exit(0), so it does not return to the caller.
        """

        self.vcf_loader = LoadVCFs(len(self.families), self.known_genes,
                                   self.debug_chrom, self.debug_pos)

        # load the trio paths into the current path setup
        for family_ID in sorted(self.families):
            self.family = self.families[family_ID]

            # some families have more than one child in the family, so run
            # through each child.
            self.family.set_child()
            while self.family.child is not None:
                if self.family.child.is_affected():
                    variants = self.vcf_loader.get_trio_variants(
                        self.family, self.pp_filter)
                    self.vcf_provenance = self.vcf_loader.get_trio_provenance()
                    self.analyse_trio(variants)

                self.family.set_child_examined()

        sys.exit(0)

    def analyse_trio(self, variants):
        """identify candidate variants in exome data for a single trio.
        
        takes variants that passed the initial filtering from VCF loading, and
        splits the variants into groups for each gene with variants. Then
        analyses variants in a single gene (so we can utilise the appropriate
        inheritance mechanisms for that gene), before running some
        post-inheritance filters, and exporting the data (if required).
        
        Args:
            variants: list of TrioGenotypes objects
        """

        # organise variants by gene, then find variants that fit
        # different inheritance models
        genes_dict = self.create_gene_dict(variants)
        found_vars = []
        for gene, gene_vars in genes_dict.items():
            found_vars.extend(self.find_variants(gene_vars, gene))

        # remove any duplicate variants (which might occur due to CNVs being
        # checked against all the genes that they encompass)
        found_vars = self.exclude_duplicates(found_vars)

        # apply some final filters to the flagged variants
        post_filter = PostInheritanceFilter(found_vars, self.family,
                                            self.debug_chrom, self.debug_pos)
        found_vars = post_filter.filter_variants()

        # export the results to either tab-separated table or VCF format
        self.report.export_data(found_vars, self.family,
                                self.vcf_loader.child_header,
                                self.vcf_provenance)

    def create_gene_dict(self, variants):
        """creates dictionary of variants indexed by gene
        
        Args:
            variants: list of TrioGenotypes objects
        
        Returns:
            dictionary of variants indexed by HGNC symbols
        """

        # organise the variants into entries for each gene
        genes = {}
        for var in variants:
            # variants (particularly CNVs) can span multiple genes, so we need
            # to check each gene separately, and then collapse duplicates later
            for gene in var.get_genes():
                genes.setdefault(gene, []).append(var)

        return genes

    def find_variants(self, variants, gene):
        """ finds variants that fit inheritance models
        
        Args:
            variants: list of TrioGenotype objects
            gene: gene ID as string, or None for intergenic variants
        
        Returns:
            list of (variant, [check types], [inheritances], [gene]) tuples for
            the variants that pass inheritance checks
        
        Raises:
            ValueError: if the first variant reports an inheritance type which
                is neither autosomal nor allosomal.
        """

        # get the inheritance for the gene (monoallelic, biallelic, hemizygous
        # etc), but allow for times when we haven't specified a list of genes
        # to use
        gene_inh = None
        if self.known_genes is not None:
            if gene in self.known_genes:
                gene_inh = self.known_genes[gene]["inh"]
            else:
                # If we are looking for variants in a set of known genes, and
                # the gene isn't part of that set, then we don't want to
                # examine the variant for that gene, UNLESS the variant is a
                # CNV, since CNVs can be included purely from size thresholds,
                # regardless of which gene they overlap.
                variants = [x for x in variants if x.is_cnv()]

        # ignore intergenic variants
        if gene is None:
            for var in variants:
                if (var.get_chrom() == self.debug_chrom
                        and var.get_position() == self.debug_pos):
                    print(var, "lacks HGNC/gene symbol")
            return []

        # Now that we are examining a single gene, check that the consequences
        # for the gene are in the required functional categories.
        variants = [
            var for var in variants
            if var.child.is_lof(gene) or var.child.is_missense(gene)
        ]
        if not variants:
            return []

        logging.debug("{} {} {} {}".format(self.family.child.get_id(), gene,
                                           variants, gene_inh))
        chrom_inheritance = variants[0].get_inheritance_type()

        # pick the inheritance checker for the chromosome type. Fail with a
        # clear message for an unrecognised type, rather than tripping over an
        # unbound local further down.
        if chrom_inheritance == "autosomal":
            finder_class = Autosomal
        elif chrom_inheritance in ("XChrMale", "XChrFemale", "YChrMale"):
            finder_class = Allosomal
        else:
            raise ValueError("unsupported inheritance type {!r} for gene {}".format(
                chrom_inheritance, gene))

        finder = finder_class(variants, self.family, self.known_genes, gene,
                              self.cnv_regions)

        # tag every candidate with the gene that it was identified under
        return [(x[0], list(x[1]), list(x[2]), [gene])
                for x in finder.get_candidate_variants()]

    def exclude_duplicates(self, variants):
        """ rejig variants included under multiple inheritance mechanisms
        
        Args:
            variants: list of (variant, check_types, inheritances, genes)
                candidates, where the last three entries are lists
        
        Returns:
            list of (variant, check_types, inheritances, genes) tuples, with
            duplicates excluded, and originals modified to show all of the
            mechanisms and genes that the variant was found under
        """

        unique_vars = {}
        for variant in variants:
            key = variant[0].child.get_key()
            if key not in unique_vars:
                unique_vars[key] = list(variant)
                continue

            # extend the check types, inheritance types and gene symbols of
            # the first instance of the variant with any values that are new
            merged = unique_vars[key]
            for index in (1, 2, 3):
                merged[index] += [
                    x for x in variant[index] if x not in merged[index]
                ]

        return [tuple(entry) for entry in unique_vars.values()]
Example #8
0
class Filter(object):
    """ filters trios for candidate variants that might contribute to a
    proband's disorder.
    """
    
    def __init__(self, population_tags=None, count=0, known_genes=None, date=None,
            regions=None, lof_sites=None, pp_filter=0.0, sum_x_lr2_file=None,
            output_path=None, export_vcf=None, debug_chrom=None, debug_pos=None):
        """ initialise the class object
        
        Args:
            population_tags: list of population ID tags, that could exist within
                the INFO field, or None.
            count: number of probands to analyse, helpful for tracking progress
                in output logs.
            known_genes: path to table of genes known to be associated
                with genetic disorders, or None.
            date: date of the known_genes file, or None if not using/unknown.
            regions: path to a table of regions for DECIPHER CNV syndromes.
            lof_sites: path to json file of [chrom, position] coordinates in
                genome, for modifying to a loss-of-function consequence if
                required. Can be None if unneeded.
            pp_filter: threshold from 0 to 1 for pp_dnm value to filter out
                candidate DNMs which fall below this value
            sum_x_lr2_file: File containing sum of l2r values on x chromosome 
                for each person
            output_path: path to write output tab-separated file to
            export_vcf: path to file or folder to write VCFs to.
            debug_chrom: chromosome for debugging purposes.
            debug_pos: position for debugging variant filtering at.
        """
        
        self.pp_filter = pp_filter
        self.total = count
        self.count = 0
        
        self.populations = population_tags
        self.debug_chrom = debug_chrom
        self.debug_pos = debug_pos
        
        # open reference datasets, these return None if the paths are None
        self.known_genes = open_known_genes(known_genes)
        self.cnv_regions = open_cnv_regions(regions)
        self.last_base = open_last_base_sites(lof_sites)
        
        # open file containing sum of mean log 2 ratios on X, returns an empty
        # dict if path is None
        self.sum_x_lr2 = open_x_lr2_file(sum_x_lr2_file)
        
        self.reporter = Report(output_path, export_vcf, date)
    
    def filter_trio(self, family):
        """ loads trio variants, and screens for candidate variants
        
        Args:
            family: Family object, each affected child of the family is
                analysed and exported in turn.
        """
        
        # some families have more than one child in the family, so run
        # through each child.
        family.set_child()
        while family.child is not None:
            if family.child.is_affected():
                self.count += 1
                logging.info("opening trio {} of {}".format(self.count, self.total))
                
                found_vars = self.analyse_trio(family)
                # export the results to either tab-separated table or VCF format
                self.reporter.export_data(found_vars, family)
            
            family.set_child_examined()
    
    def analyse_trio(self, family):
        """identify candidate variants in exome data for a single trio.
        
        loads the variants for the trio, and splits the variants into groups
        for each gene with variants. Then analyses variants in a single gene
        (so we can utilise the appropriate inheritance mechanisms for that
        gene), before running some post-inheritance filters.
        
        Args:
            family: Family object
        
        Returns:
            list of (TrioGenotype, [genes], [inheritances], [type]) tuples for
            variants that pass inheritance and post-inheritance checks.
        """
        
        variants = load_variants(family, self.pp_filter, self.populations,
            self.known_genes, self.last_base, self.sum_x_lr2,
            self.debug_chrom, self.debug_pos)
        
        # organise variants by gene, then find variants that fit different
        # inheritance models, gathering the candidates into a single flat list
        genes = self.create_gene_dict(variants)
        candidates = []
        for gene, gene_vars in genes.items():
            candidates.extend(self.find_variants(gene_vars, gene, family))
        
        # remove any duplicate variants (which might occur due to CNVs being
        # checked against all the genes that they encompass)
        candidates = self.exclude_duplicates(candidates)
        
        # apply some final filters to the flagged variants
        post_filter = PostInheritanceFilter(family, self.debug_chrom, self.debug_pos)
        
        return post_filter.filter_variants(candidates)
    
    def create_gene_dict(self, variants):
        """creates dictionary of variants indexed by gene
        
        Args:
            variants: list of TrioGenotypes objects, where get_genes() gives a
                list of gene lists.
        
        Returns:
            dictionary of variants indexed by HGNC ID. A variant is listed once
            for every time that a gene occurs within its gene lists.
        """
        
        # organise the variants into entries for each gene
        genes = {}
        for var in variants:
            # variants (particularly CNVs) can span multiple genes, so we need
            # to check each gene separately, and then collapse duplicates later
            for gene_list in var.get_genes():
                for gene in gene_list:
                    genes.setdefault(gene, []).append(var)
        
        return genes
        
    def find_variants(self, variants, gene, family):
        """ finds variants that fit inheritance models
        
        Args:
            variants: non-empty list of TrioGenotype objects
            gene: gene ID as string, or None for intergenic variants
            family: Family object for the trio being examined
        
        Returns:
            list of variants that pass inheritance checks
        
        Raises:
            ValueError: if the first variant reports an inheritance type which
                is neither autosomal nor allosomal.
        """
        
        # get the inheritance for the gene (monoallelic, biallelic, hemizygous
        # etc), but allow for times when we haven't specified a list of genes
        # to use
        known_gene = None
        gene_inh = None
        if self.known_genes is not None and gene in self.known_genes:
            known_gene = self.known_genes[gene]
            gene_inh = known_gene['inh']
        
        # the chromosome type is taken from the unfiltered variant list
        chrom_inheritance = variants[0].get_inheritance_type()
        
        # If we are looking for variants in a set of known genes, and the gene
        # isn't part of that set, then we don't want to examine the variant for
        # that gene, UNLESS the variant is a CNV, since CNVs can be included
        # purely from size thresholds, regardless of which gene they overlap.
        if self.known_genes is not None and gene not in self.known_genes:
            variants = [x for x in variants if x.is_cnv()]
        
        # ignore intergenic variants
        if gene is None:
            for var in variants:
                if (var.get_chrom() == self.debug_chrom
                        and var.get_position() == self.debug_pos):
                    print(var, "lacks HGNC/gene symbol")
            return []
        
        # Now that we are examining a single gene, check that the consequences
        # for the gene are in the required functional categories.
        variants = [
            var for var in variants
            if var.child.is_lof(gene)
            or var.child.is_missense(var.child.is_cnv(), gene)
        ]
        if not variants:
            return []
        
        # find a readable symbol for the log message. Fall back to the gene ID
        # when none of the symbol sets know the gene, so that logging cannot
        # fail on an unbound name.
        symbol = gene
        for entry in variants[0].child.info.symbols:
            try:
                symbol = entry.get(gene, ['HGNC', 'SYMBOL', 'ENSG'])
            except KeyError:
                continue
            break
        logging.info("{}\t{}\tvariants: {}\trequired_mode: {}".format(
            family.child.get_id(), symbol, [str(x) for x in variants], gene_inh))
        
        # pick the inheritance checker for the chromosome type. Fail with a
        # clear message for an unrecognised type, rather than tripping over an
        # unbound local further down.
        if chrom_inheritance == "autosomal":
            finder_class = Autosomal
        elif chrom_inheritance in ("XChrMale", "XChrFemale", "YChrMale"):
            finder_class = Allosomal
        else:
            raise ValueError("unsupported inheritance type {!r} for gene {}".format(
                chrom_inheritance, gene))
        
        finder = finder_class(variants, family, known_gene, gene, self.cnv_regions)
        
        return finder.get_candidate_variants()
    
    def exclude_duplicates(self, variants):
        """ rejig variants included under multiple inheritance mechanisms
        
        Args:
            variants: list of (variant, check_types, inheritances, genes)
                candidates, where the last three entries are lists
        
        Returns:
            list of (variant, check_types, inheritances, genes) tuples, with
            duplicates excluded, and originals modified to show all of the
            mechanisms and genes that the variant was found under. The check
            types and inheritances of merged variants are sorted.
        """
        
        unique_vars = {}
        for variant in variants:
            key = variant[0].child.get_key()
            if key not in unique_vars:
                unique_vars[key] = list(variant)
                continue
            
            merged = unique_vars[key]
            
            # append the check type and inheritance type to the first
            # instance of the variant
            merged[1] += [x for x in variant[1] if x not in merged[1]]
            merged[2] += [x for x in variant[2] if x not in merged[2]]
            
            merged[1] = sorted(merged[1])
            merged[2] = sorted(merged[2])
            
            # add the gene IDs that are unique to the current variant
            # to the merged variant
            merged[3] += [x for x in variant[3] if x not in merged[3]]
        
        return [tuple(entry) for entry in unique_vars.values()]
class TestReportPy(unittest.TestCase):
    """ test the Report class
    """
    
    def setUp(self):
        """ define a family and variant, and start the Report class
        """
        
        # generate a test family
        child_gender = "F"
        mom_aff = "1"
        dad_aff = "1"
        
        self.trio = self.create_family(child_gender, mom_aff, dad_aff)
        
        # generate a test variant, which is heterozygous in the child, and
        # homozygous for the reference allele in both of the parents
        child_var = self.create_snv(child_gender, "0/1")
        mom_var = self.create_snv("F", "0/0")
        dad_var = self.create_snv("M", "0/0")
        
        var = TrioGenotypes(child_var)
        var.add_mother_variant(mom_var)
        var.add_father_variant(dad_var)
        self.variants = [var]
        
        self.report = Report(None, None, None, None)
        self.report.family = self.trio
    
    def create_snv(self, gender, genotype):
        """ create a default variant
        
        Args:
            gender: gender code for the individual, passed to set_gender()
            genotype: genotype string for the sample, such as "0/1"
        
        Returns:
            SNV object at X:15000000 with a missense consequence
        """
        
        chrom = "X"
        pos = "15000000"
        snp_id = "."
        ref = "A"
        alt = "G"
        qual = "50"
        filt = "PASS"
        
        # set up a SNV object, since SNV inherits VcfInfo
        var = SNV(chrom, pos, snp_id, ref, alt, filt)
        
        info = "HGNC=TEST;CQ=missense_variant;random_tag;EUR_AF=0.0005"
        format_keys = "GT:DP"
        sample_values = genotype + ":50"
        
        var.vcf_line = [chrom, pos, snp_id, ref, alt, qual, filt, info, format_keys, sample_values]
        
        var.add_info(info)
        var.add_format(format_keys, sample_values)
        var.set_gender(gender)
        var.set_genotype()
        
        return var
    
    def create_family(self, child_gender, mom_aff, dad_aff):
        """ create a default family, with optional gender and parental statuses
        
        Args:
            child_gender: gender code for the child
            mom_aff: affected status code for the mother
            dad_aff: affected status code for the father
        
        Returns:
            Family object with the child set as the current child
        """
        
        fam = Family("test")
        fam.add_child("child", "child_vcf", "2", child_gender)
        fam.add_mother("mother", "mother_vcf", mom_aff, "2")
        fam.add_father("father", "father_vcf", dad_aff, "1")
        fam.set_child()
        
        return fam
    
    def test__get_provenance(self):
        """ check that _get_provenance() works correctly
        """
        
        prov = ["checksum", "sample.calls.date.vcf.gz", "2014-01-01"]
        member = "proband"
        
        self.assertEqual(self.report._get_provenance(prov, member),
            ["##UberVCF_proband_Id=sample\n",
            "##UberVCF_proband_Checksum=checksum\n",
            "##UberVCF_proband_Basename=sample.calls.date.vcf.gz\n",
            "##UberVCF_proband_Date=2014-01-01\n"])
    
    def test__get_vcf_export_path(self):
        """ check that _get_vcf_export_path() works correctly
        """
        
        # use a folder to place the VCF file in, which means we join the
        # proband ID to get a full path
        self.report.export_vcf = os.getcwd()
        self.assertEqual(self.report._get_vcf_export_path(), os.path.join(os.getcwd(), "child.vcf.gz"))
        
        # define an unusable directory, to raise an error
        self.report.export_vcf = os.getcwd() + "asjhfgasjhfg"
        self.assertRaises(ValueError, self.report._get_vcf_export_path)
        
        # define a specific path for a VCF file, which is returned directly
        self.report.export_vcf = os.path.join(os.getcwd(), "sample_id.vcf.gz")
        self.assertEqual(self.report._get_vcf_export_path(), self.report.export_vcf)
    
    def test__make_vcf_header(self):
        """ check that _make_vcf_header() works correctly
        """
        
        # define the initial header lines.
        # NOTE(review): the column line ends in a literal backslash followed by
        # "sample_id", which was probably meant to be a tab. The backslash is
        # now escaped explicitly, so that the string is unchanged, but no
        # longer relies on an invalid escape sequence.
        header = ["####fileformat=VCFv4.1\n",
            "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n",
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\\sample_id\n"]
        
        # define the VCF provenances
        provenance = [("checksum", "proband.calls.date.vcf.gz", "2014-01-01"),
            ("checksum", "mother.calls.date.vcf.gz", "2014-01-02"),
            ("checksum", "father.calls.date.vcf.gz", "2014-01-03")]
        
        processed_header = ["####fileformat=VCFv4.1\n",
           "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n",
           '##INFO=<ID=ClinicalFilterType,Number=.,Type=String,Description="The type of clinical filter that passed this variant.">\n',
           '##INFO=<ID=ClinicalFilterGeneInheritance,Number=.,Type=String,Description="The inheritance mode (Monoallelic, Biallelic etc) under which the variant was found.">\n',
           '##INFO=<ID=ClinicalFilterReportableHGNC,Number=.,Type=String,Description="The HGNC symbol which the variant was identified as being reportable for.">\n',
           '##FORMAT=<ID=INHERITANCE_GENOTYPE,Number=.,Type=String,Description="The 012 coded genotypes for a trio (child, mother, father).">\n',
           '##FORMAT=<ID=INHERITANCE,Number=.,Type=String,Description="The inheritance of the variant in the trio (biparental, paternal, maternal, deNovo).">\n',
           "##ClinicalFilterRunDate={0}\n".format(datetime.date.today()),
           "##ClinicalFilterVersion=XXX\n",
           "##ClinicalFilterHistory=single_variant,compound_het\n",
           "##UberVCF_proband_Id=proband\n",
           "##UberVCF_proband_Checksum=checksum\n",
           "##UberVCF_proband_Basename=proband.calls.date.vcf.gz\n",
           "##UberVCF_proband_Date=2014-01-01\n",
           "##UberVCF_maternal_Id=mother\n",
           "##UberVCF_maternal_Checksum=checksum\n",
           "##UberVCF_maternal_Basename=mother.calls.date.vcf.gz\n",
           "##UberVCF_maternal_Date=2014-01-02\n",
           "##UberVCF_paternal_Id=father\n",
           "##UberVCF_paternal_Checksum=checksum\n",
           "##UberVCF_paternal_Basename=father.calls.date.vcf.gz\n",
           "##UberVCF_paternal_Date=2014-01-03\n",
           "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\\sample_id\n"]
        
        # check that the standard function returns the expected value. Note that
        # I haven't checked the output if self.known_genes_date is not None, nor
        # have I checked if the _clinicalFilterVersion is available
        self.assertEqual(self.report._make_vcf_header(header, provenance),
           processed_header)
    
    def test__get_parental_inheritance(self):
        """ check that _get_parental_inheritance() works correctly
        """
        
        var = self.variants[0]
        
        # check for the default genotypes
        self.assertEqual(self.report._get_parental_inheritance(var), "deNovo")
        
        # check when only the mother is non-ref
        var.mother.genotype = 1
        self.assertEqual(self.report._get_parental_inheritance(var), "maternal")
        
        # check when both parents are non-ref
        var.father.genotype = 1
        self.assertEqual(self.report._get_parental_inheritance(var), "biparental")
        
        # check when only the father is non-ref
        var.mother.genotype = 0
        self.assertEqual(self.report._get_parental_inheritance(var), "paternal")
        
        # check when the proband lacks parental information
        self.report.family.father = None
        self.report.family.mother = None
        self.assertEqual(self.report._get_parental_inheritance(var), "unknown")
    
    def test__get_vcf_lines(self):
        """ check that _get_vcf_lines() works correctly
        """
        
        # define the initial header lines.
        # NOTE(review): the column line ends in a literal backslash followed by
        # "sample_id", which was probably meant to be a tab. The backslash is
        # now escaped explicitly, so that the string is unchanged, but no
        # longer relies on an invalid escape sequence.
        header = ["####fileformat=VCFv4.1\n",
            "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n",
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\\sample_id\n"]
        
        # define the VCF provenances
        provenance = [("checksum", "proband.calls.date.vcf.gz", "2014-01-01"),
            ("checksum", "mother.calls.date.vcf.gz", "2014-01-02"),
            ("checksum", "father.calls.date.vcf.gz", "2014-01-03")]
        
        # define what the header will become
        vcf_lines = ["####fileformat=VCFv4.1\n",
           "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n",
           '##INFO=<ID=ClinicalFilterType,Number=.,Type=String,Description="The type of clinical filter that passed this variant.">\n',
           '##INFO=<ID=ClinicalFilterGeneInheritance,Number=.,Type=String,Description="The inheritance mode (Monoallelic, Biallelic etc) under which the variant was found.">\n',
           '##INFO=<ID=ClinicalFilterReportableHGNC,Number=.,Type=String,Description="The HGNC symbol which the variant was identified as being reportable for.">\n',
           '##FORMAT=<ID=INHERITANCE_GENOTYPE,Number=.,Type=String,Description="The 012 coded genotypes for a trio (child, mother, father).">\n',
           '##FORMAT=<ID=INHERITANCE,Number=.,Type=String,Description="The inheritance of the variant in the trio (biparental, paternal, maternal, deNovo).">\n',
           "##ClinicalFilterRunDate={0}\n".format(datetime.date.today()),
           "##ClinicalFilterVersion=XXX\n",
           "##ClinicalFilterHistory=single_variant,compound_het\n",
           "##UberVCF_proband_Id=proband\n",
           "##UberVCF_proband_Checksum=checksum\n",
           "##UberVCF_proband_Basename=proband.calls.date.vcf.gz\n",
           "##UberVCF_proband_Date=2014-01-01\n",
           "##UberVCF_maternal_Id=mother\n",
           "##UberVCF_maternal_Checksum=checksum\n",
           "##UberVCF_maternal_Basename=mother.calls.date.vcf.gz\n",
           "##UberVCF_maternal_Date=2014-01-02\n",
           "##UberVCF_paternal_Id=father\n",
           "##UberVCF_paternal_Checksum=checksum\n",
           "##UberVCF_paternal_Basename=father.calls.date.vcf.gz\n",
           "##UberVCF_paternal_Date=2014-01-03\n",
           "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\\sample_id\n"]
        
        # define what the default variant vcf line will become
        line = ["X\t15000000\t.\tA\tG\t50\tPASS\tHGNC=TEST;CQ=missense_variant;random_tag;EUR_AF=0.0005;ClinicalFilterGeneInheritance=Monoallelic;ClinicalFilterType=single_variant;ClinicalFilterReportableHGNC=TEST\tGT:DP:INHERITANCE:INHERITANCE_GENOTYPE\t0/1:50:deNovo:1,0,0\n"]
        
        # check that a list of one variant produces the correct VCF output. Note
        # that we haven't checked against CNVs, which can change the
        # INHERITANCE_GENOTYPE flag, nor have we tested a larger list of variants
        var = (self.variants[0], ["single_variant"], ["Monoallelic"], ["TEST"])
        self.assertEqual(self.report._get_vcf_lines([var], header, provenance), vcf_lines + line)
    
    def test__get_output_line(self):
        """ check that _get_output_line() works correctly
        """
        
        var = (self.variants[0], ["single_variant"], ["Monoallelic"], ["TEST"])
        dad_aff = "0"
        mom_aff = "1"
        alt_id = "test_id"
        
        # check the output for the default variant
        expected = "child\ttest_id\tF\tX\t15000000\tTEST\tNA\tNA\tmissense_variant\tA/G\t0.0005\tMonoallelic\t1/0/0\t1\t0\tsingle_variant\tNA\tNA\n"
        self.assertEqual(self.report._get_output_line(var, dad_aff, mom_aff, alt_id), expected)
        
        # introduce additional info for the output line parsing, check the line
        # that is returned is expected
        var[0].child.info["PolyPhen"] = "probably_damaging(0.99)"
        var[0].child.info["SIFT"] = "deleterious(0)"
        var[0].child.info["ENST"] = "ENST00X"
        expected = "child\ttest_id\tF\tX\t15000000\tTEST\tNA\tENST00X\tmissense_variant,PolyPhen=probably_damaging(0.99),SIFT=deleterious(0)\tA/G\t0.0005\tMonoallelic\t1/0/0\t1\t0\tsingle_variant\tNA\tNA\n"
        self.assertEqual(self.report._get_output_line(var, dad_aff, mom_aff, alt_id), expected)
    def __init__(self, opts):
        """ set up the instance from the options, then create the reporter

        Args:
            opts: options object, handed straight to self.set_definitions().
        """

        self.set_definitions(opts)

        # the reporter is built from instance attributes, which are presumably
        # populated by set_definitions() - TODO confirm
        report_args = (self.output_path, self.export_vcf, self.ID_mapper,
                       self.known_genes_date)
        self.report = Report(*report_args)
class ClinicalFilter(LoadOptions):
    """ filters trios for candidate variants that might contribute to a
    proband's disorder.
    """

    def __init__(self, opts):
        """ initialise the class with some definitions

        Args:
            opts: options handed unchanged to set_definitions()
        """

        # NOTE(review): the attributes read below are not assigned anywhere in
        # this class, so they are presumably set by set_definitions() - confirm
        self.set_definitions(opts)
        self.report = Report(self.output_path, self.export_vcf, self.ID_mapper, self.known_genes_date)

    def filter_trios(self):
        """ loads trio variants, and screens for candidate variants

        Families are visited in sorted ID order, and every affected child in
        each family is analysed. Note that this method finishes by calling
        sys.exit(0), so it does not return to the caller.
        """

        self.vcf_loader = LoadVCFs(len(self.families), self.known_genes, self.debug_chrom, self.debug_pos)

        # load the trio paths into the current path setup
        for family_ID in sorted(self.families):
            self.family = self.families[family_ID]

            # some families have more than one child in the family, so run
            # through each child.
            # NOTE(review): the loop relies on set_child_examined() moving
            # family.child on (eventually to None) - confirm in Family
            self.family.set_child()
            while self.family.child is not None:
                if self.family.child.is_affected():
                    variants = self.vcf_loader.get_trio_variants(self.family, self.pp_filter)
                    self.vcf_provenance = self.vcf_loader.get_trio_provenance()
                    self.analyse_trio(variants)

                self.family.set_child_examined()

        sys.exit(0)

    def analyse_trio(self, variants):
        """identify candidate variants in exome data for a single trio.

        takes variants that passed the initial filtering from VCF loading, and
        splits the variants into groups for each gene with variants. Then
        analyses variants in a single gene (so we can utilise the appropriate
        inheritance mechanisms for that gene), before running some
        post-inheritance filters, and exporting the data (if required).

        Args:
            variants: list of TrioGenotypes objects
        """

        # organise variants by gene, then find variants that fit
        # different inheritance models
        genes_dict = self.create_gene_dict(variants)
        found_vars = []
        for gene, gene_vars in genes_dict.items():
            found_vars += self.find_variants(gene_vars, gene)

        # remove any duplicate variants (which might occur due to CNVs being
        # checked against all the genes that they encompass)
        found_vars = self.exclude_duplicates(found_vars)

        # apply some final filters to the flagged variants
        post_filter = PostInheritanceFilter(found_vars, self.family, self.debug_chrom, self.debug_pos)
        found_vars = post_filter.filter_variants()

        # export the results to either tab-separated table or VCF format
        self.report.export_data(found_vars, self.family, self.vcf_loader.child_header, self.vcf_provenance)

    def create_gene_dict(self, variants):
        """creates dictionary of variants indexed by gene

        Args:
            variants: list of TrioGenotypes objects

        Returns:
            dictionary of variants indexed by HGNC symbols
        """

        # organise the variants into entries for each gene
        genes = {}
        for var in variants:
            # variants (particularly CNVs) can span multiple genes, so we need
            # to check each gene separately, and then collapse duplicates later
            for gene in var.get_genes():
                genes.setdefault(gene, []).append(var)

        return genes

    def find_variants(self, variants, gene):
        """ finds variants that fit inheritance models

        Args:
            variants: list of TrioGenotype objects
            gene: gene ID as string (None for intergenic variants)

        Returns:
            list of (variant, check_types, inheritances, [gene]) tuples for the
            variants that pass inheritance checks

        Raises:
            ValueError: if the first variant reports an inheritance type that
                has no matching inheritance checker
        """

        # get the inheritance for the gene (monoallelic, biallelic, hemizygous
        # etc), but allow for times when we haven't specified a list of genes
        # to use
        gene_inh = None
        if self.known_genes is not None:
            if gene in self.known_genes:
                gene_inh = self.known_genes[gene]["inh"]
            else:
                # If we are looking for variants in a set of known genes, and
                # the gene isn't part of that set, then we don't want to
                # examine the variant for that gene, UNLESS the variant is a
                # CNV, since CNVs can be included purely from size thresholds,
                # regardless of which gene they overlap.
                variants = [x for x in variants if x.is_cnv()]

        # ignore intergenic variants
        if gene is None:
            for var in variants:
                if var.get_chrom() == self.debug_chrom and var.get_position() == self.debug_pos:
                    print(var, "lacks HGNC/gene symbol")
            return []

        # Now that we are examining a single gene, check that the consequences
        # for the gene are in the required functional categories.
        variants = [var for var in variants if var.child.is_lof(gene) or var.child.is_missense(gene)]
        if not variants:
            return []

        logging.debug("%s %s %s %s", self.family.child.get_id(), gene, variants, gene_inh)
        chrom_inheritance = variants[0].get_inheritance_type()

        if chrom_inheritance == "autosomal":
            finder = Autosomal(variants, self.family, self.known_genes, gene, self.cnv_regions)
        elif chrom_inheritance in ["XChrMale", "XChrFemale", "YChrMale"]:
            finder = Allosomal(variants, self.family, self.known_genes, gene, self.cnv_regions)
        else:
            # without this branch an unexpected type would only surface as an
            # UnboundLocalError on the next line
            raise ValueError("unknown inheritance type '{}' for gene {}".format(chrom_inheritance, gene))

        candidates = finder.get_candidate_variants()

        # give each candidate its own lists, and record the gene it was found in
        return [(x[0], list(x[1]), list(x[2]), [gene]) for x in candidates]

    def exclude_duplicates(self, variants):
        """ rejig variants included under multiple inheritance mechanisms

        The lists inside the input tuples are copied before merging, so the
        input is left unmodified.

        Args:
            variants: list of candidate variants, as (variant, check_types,
                inheritances, hgnc_symbols) tuples

        Returns:
            list of (variant, check_type, inheritance, hgnc) tuples, with
            duplicates excluded, and originals modified to show both mechanisms
        """

        unique_vars = {}
        for variant in variants:
            key = variant[0].child.get_key()
            if key not in unique_vars:
                # first sighting of this variant: copy the result, inheritance
                # and HGNC lists so later merging cannot alter the input
                entry = list(variant)
                entry[1:4] = [list(x) for x in entry[1:4]]
                unique_vars[key] = entry
                continue

            # append the check types, inheritance types and HGNC symbols that
            # are unique to the current variant to the first instance
            merged = unique_vars[key]
            for index in (1, 2, 3):
                merged[index] += [x for x in variant[index] if x not in merged[index]]

        return [tuple(entry) for entry in unique_vars.values()]
Example #12
0
class TestReportPy(unittest.TestCase):
    """ test the Report class
    """
    def setUp(self):
        """ build the family, the trio variant and the Report used by the tests
        """

        proband_sex = "F"

        # the test family, with the status code "1" for both parents
        self.trio = self.create_family(proband_sex, "1", "1")

        # the test variant: genotype 0/1 in the child, 0/0 in both parents
        calls = [
            self.create_snv(sex, genotype)
            for sex, genotype in ((proband_sex, "0/1"), ("F", "0/0"), ("M", "0/0"))
        ]
        child_call, mother_call, father_call = calls

        trio_variant = TrioGenotypes(child_call)
        trio_variant.add_mother_variant(mother_call)
        trio_variant.add_father_variant(father_call)
        self.variants = [trio_variant]

        # the Report under test, attached to the test family
        self.report = Report(None, None, None, None)
        self.report.family = self.trio

    def create_snv(self, gender, genotype):
        """ create the default SNV used throughout these tests

        Args:
            gender: value passed to the variant's set_gender()
            genotype: genotype string (e.g. "0/1"), stored as the GT value of
                the sample column

        Returns:
            SNV object with its info, format, gender and genotype set
        """

        # the fixed VCF columns
        chrom, pos, snp_id = "X", "15000000", "."
        ref, alt = "A", "G"
        qual, filt = "50", "PASS"

        info = "HGNC=TEST;CQ=missense_variant;random_tag;EUR_AF=0.0005"
        format_keys = "GT:DP"
        sample_values = genotype + ":50"

        # set up a SNV object, since SNV inherits VcfInfo
        snv = SNV(chrom, pos, snp_id, ref, alt, filt)
        snv.vcf_line = [chrom, pos, snp_id, ref, alt, qual, filt, info, format_keys, sample_values]

        snv.add_info(info)
        snv.add_format(format_keys, sample_values)
        snv.set_gender(gender)
        snv.set_genotype()

        return snv

    def create_family(self, child_gender, mom_aff, dad_aff):
        """ create the default family used throughout these tests

        Args:
            child_gender: gender given to the child
            mom_aff: status given to the mother
            dad_aff: status given to the father

        Returns:
            Family object holding a child, mother and father, after
            set_child() has been called
        """

        # NOTE(review): the literal "2" for the child and the trailing "2"/"1"
        # for the parents presumably encode affected status and sex - confirm
        # against the Family.add_* signatures
        family = Family("test")
        family.add_child("child", "child_vcf", "2", child_gender)
        family.add_mother("mother", "mother_vcf", mom_aff, "2")
        family.add_father("father", "father_vcf", dad_aff, "1")

        family.set_child()

        return family

    def test__get_provenance(self):
        """ check the header lines that _get_provenance() builds for a member
        """

        provenance = ["checksum", "sample.calls.date.vcf.gz", "2014-01-01"]

        # one line each for the ID, checksum, basename and date
        expected = [
            "##UberVCF_proband_Id=sample\n",
            "##UberVCF_proband_Checksum=checksum\n",
            "##UberVCF_proband_Basename=sample.calls.date.vcf.gz\n",
            "##UberVCF_proband_Date=2014-01-01\n",
        ]

        observed = self.report._get_provenance(provenance, "proband")
        self.assertEqual(observed, expected)

    def test__get_vcf_export_path(self):
        """ check the paths that _get_vcf_export_path() returns or rejects
        """

        working_dir = os.getcwd()

        # an existing folder: the proband ID is joined on to give a full path
        self.report.export_vcf = working_dir
        expected = os.path.join(working_dir, "child.vcf.gz")
        self.assertEqual(self.report._get_vcf_export_path(), expected)

        # an unusable directory should raise an error
        self.report.export_vcf = working_dir + "asjhfgasjhfg"
        with self.assertRaises(ValueError):
            self.report._get_vcf_export_path()

        # a specific path for a VCF file is returned directly
        vcf_path = os.path.join(working_dir, "sample_id.vcf.gz")
        self.report.export_vcf = vcf_path
        self.assertEqual(self.report._get_vcf_export_path(), self.report.export_vcf)

    def test__make_vcf_header(self):
        """ check that _make_vcf_header() works correctly

        The processed header is expected to keep the initial lines, with the
        ClinicalFilter INFO/FORMAT definitions, run details and the provenance
        of each trio member placed ahead of the final column line.
        """

        # the column line must be tab-separated throughout. The sample column
        # was previously written with a backslash-s, which is an invalid escape
        # sequence rather than a tab. The same line is used for the input and
        # the expected output, so the two cannot drift apart.
        column_line = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample_id\n"

        # define the initial header lines
        header = [
            "####fileformat=VCFv4.1\n",
            "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n",
            column_line,
        ]

        # define the VCF provenances
        provenance = [("checksum", "proband.calls.date.vcf.gz", "2014-01-01"),
                      ("checksum", "mother.calls.date.vcf.gz", "2014-01-02"),
                      ("checksum", "father.calls.date.vcf.gz", "2014-01-03")]

        processed_header = [
            "####fileformat=VCFv4.1\n",
            "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n",
            '##INFO=<ID=ClinicalFilterType,Number=.,Type=String,Description="The type of clinical filter that passed this variant.">\n',
            '##INFO=<ID=ClinicalFilterGeneInheritance,Number=.,Type=String,Description="The inheritance mode (Monoallelic, Biallelic etc) under which the variant was found.">\n',
            '##FORMAT=<ID=INHERITANCE_GENOTYPE,Number=.,Type=String,Description="The 012 coded genotypes for a trio (child, mother, father).">\n',
            '##FORMAT=<ID=INHERITANCE,Number=.,Type=String,Description="The inheritance of the variant in the trio (biparental, paternal, maternal, deNovo).">\n',
            "##ClinicalFilterRunDate={0}\n".format(datetime.date.today()),
            "##ClinicalFilterVersion=XXX\n",
            "##ClinicalFilterHistory=single_variant,compound_het\n",
            "##UberVCF_proband_Id=proband\n",
            "##UberVCF_proband_Checksum=checksum\n",
            "##UberVCF_proband_Basename=proband.calls.date.vcf.gz\n",
            "##UberVCF_proband_Date=2014-01-01\n",
            "##UberVCF_maternal_Id=mother\n",
            "##UberVCF_maternal_Checksum=checksum\n",
            "##UberVCF_maternal_Basename=mother.calls.date.vcf.gz\n",
            "##UberVCF_maternal_Date=2014-01-02\n",
            "##UberVCF_paternal_Id=father\n",
            "##UberVCF_paternal_Checksum=checksum\n",
            "##UberVCF_paternal_Basename=father.calls.date.vcf.gz\n",
            "##UberVCF_paternal_Date=2014-01-03\n",
            column_line,
        ]

        # check that the standard function returns the expected value. Note that
        # I haven't checked the output if self.known_genes_date is not None, nor
        # have I checked if the _clinicalFilterVersion is available
        self.assertEqual(self.report._make_vcf_header(header, provenance),
                         processed_header)

    def test__get_parental_inheritance(self):
        """ check the label _get_parental_inheritance() gives as genotypes change
        """

        trio_variant = self.variants[0]

        # with the default genotypes neither parent is non-ref
        self.assertEqual(self.report._get_parental_inheritance(trio_variant), "deNovo")

        # each step changes one parent's genotype, building on the step before:
        # mother only non-ref, then both non-ref, then father only non-ref
        steps = (
            ("mother", 1, "maternal"),
            ("father", 1, "biparental"),
            ("mother", 0, "paternal"),
        )
        for parent, genotype, label in steps:
            getattr(trio_variant, parent).genotype = genotype
            self.assertEqual(self.report._get_parental_inheritance(trio_variant), label)

        # finally, remove the parents from the family altogether
        self.report.family.father = None
        self.report.family.mother = None
        self.assertEqual(self.report._get_parental_inheritance(trio_variant), "unknown")

    def test__get_vcf_lines(self):
        """ check that _get_vcf_lines() works correctly

        The output is expected to be the processed header followed by one line
        for the variant, with the ClinicalFilter INFO entries and the
        INHERITANCE/INHERITANCE_GENOTYPE format fields added.
        """

        # the column line must be tab-separated throughout. The sample column
        # was previously written with a backslash-s, which is an invalid escape
        # sequence rather than a tab. The same line is used for the input and
        # the expected output, so the two cannot drift apart.
        column_line = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample_id\n"

        # define the initial header lines
        header = [
            "####fileformat=VCFv4.1\n",
            "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n",
            column_line,
        ]

        # define the VCF provenances
        provenance = [("checksum", "proband.calls.date.vcf.gz", "2014-01-01"),
                      ("checksum", "mother.calls.date.vcf.gz", "2014-01-02"),
                      ("checksum", "father.calls.date.vcf.gz", "2014-01-03")]

        # define what the header will become
        vcf_lines = [
            "####fileformat=VCFv4.1\n",
            "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n",
            '##INFO=<ID=ClinicalFilterType,Number=.,Type=String,Description="The type of clinical filter that passed this variant.">\n',
            '##INFO=<ID=ClinicalFilterGeneInheritance,Number=.,Type=String,Description="The inheritance mode (Monoallelic, Biallelic etc) under which the variant was found.">\n',
            '##FORMAT=<ID=INHERITANCE_GENOTYPE,Number=.,Type=String,Description="The 012 coded genotypes for a trio (child, mother, father).">\n',
            '##FORMAT=<ID=INHERITANCE,Number=.,Type=String,Description="The inheritance of the variant in the trio (biparental, paternal, maternal, deNovo).">\n',
            "##ClinicalFilterRunDate={0}\n".format(datetime.date.today()),
            "##ClinicalFilterVersion=XXX\n",
            "##ClinicalFilterHistory=single_variant,compound_het\n",
            "##UberVCF_proband_Id=proband\n",
            "##UberVCF_proband_Checksum=checksum\n",
            "##UberVCF_proband_Basename=proband.calls.date.vcf.gz\n",
            "##UberVCF_proband_Date=2014-01-01\n",
            "##UberVCF_maternal_Id=mother\n",
            "##UberVCF_maternal_Checksum=checksum\n",
            "##UberVCF_maternal_Basename=mother.calls.date.vcf.gz\n",
            "##UberVCF_maternal_Date=2014-01-02\n",
            "##UberVCF_paternal_Id=father\n",
            "##UberVCF_paternal_Checksum=checksum\n",
            "##UberVCF_paternal_Basename=father.calls.date.vcf.gz\n",
            "##UberVCF_paternal_Date=2014-01-03\n",
            column_line,
        ]

        # define what the default variant vcf line will become
        line = [
            "X\t15000000\t.\tA\tG\t50\tPASS\tHGNC=TEST;CQ=missense_variant;random_tag;EUR_AF=0.0005;ClinicalFilterGeneInheritance=Monoallelic;ClinicalFilterType=single_variant\tGT:DP:INHERITANCE:INHERITANCE_GENOTYPE\t0/1:50:deNovo:1,0,0\n"
        ]

        # check that a list of one variant produces the correct VCF output. Note
        # that we haven't checked against CNVs, which can change the
        # INHERITANCE_GENOTYPE flag, nor have we tested a larger list of variants
        var = (self.variants[0], "single_variant", "Monoallelic")
        self.assertEqual(self.report._get_vcf_lines([var], header, provenance),
                         vcf_lines + line)

    def test__get_output_line(self):
        """ check the tab-separated line produced by _get_output_line()

        The line is generated twice for the same flagged variant: first with
        the default INFO fields, then again after PolyPhen, SIFT and ENST
        entries have been added to the child's info.
        """

        flagged = (self.variants[0], "single_variant", "Monoallelic")
        father_status, mother_status, alternate_id = "0", "1", "test_id"

        # the untouched variant should give the plain line
        observed = self.report._get_output_line(flagged, father_status, mother_status, alternate_id)
        self.assertEqual(observed, "child\ttest_id\tF\tX\t15000000\tTEST\tNA\tNA\tmissense_variant\tA/G\t0.0005\tMonoallelic\t1/0/0\t1\t0\tsingle_variant\n")

        # extend the child's info, in the same order as the fields are listed,
        # and check that the extra values show up in the line
        additions = (
            ("PolyPhen", "probably_damaging(0.99)"),
            ("SIFT", "deleterious(0)"),
            ("ENST", "ENST00X"),
        )
        for tag, value in additions:
            flagged[0].child.info[tag] = value

        observed = self.report._get_output_line(flagged, father_status, mother_status, alternate_id)
        self.assertEqual(observed, "child\ttest_id\tF\tX\t15000000\tTEST\tNA\tENST00X\tmissense_variant,PolyPhen=probably_damaging(0.99),SIFT=deleterious(0)\tA/G\t0.0005\tMonoallelic\t1/0/0\t1\t0\tsingle_variant\n")