def test_debug_option(self):
        """ test whether we can set up the class with the debug option

        Builds a LoadVCFs object with a debug chromosome ("1") and position
        ("10000"), then checks that SNV.passes_filters now compares equal to
        SNV.passes_filters_with_debug.
        """

        total_trios = 1
        known_genes = {}

        # NOTE(review): the assertion below implies that constructing LoadVCFs
        # with debug arguments changes SNV.passes_filters at class level, which
        # would persist into later tests - confirm whether a reset is needed.
        self.vcf_loader = LoadVCFs(total_trios, known_genes, "1", "10000")

        # check that the debug filter function got set correctly
        self.assertEqual(SNV.passes_filters, SNV.passes_filters_with_debug)
    def setUp(self):
        """ build a LoadVCFs fixture plus a scratch directory for each test
        """

        # a single known gene (ATRX), flagged as hemizygous loss of function
        atrx = {
            "inheritance": {"Hemizygous": {"Loss of function"}},
            "start": 1,
            "chrom": "1",
            "confirmed_status": {"Confirmed DD Gene"},
            "end": 20000000,
        }
        gene_table = {"ATRX": atrx}
        trio_count = 1

        # the final two arguments are left as None for the default fixture
        self.vcf_loader = LoadVCFs(trio_count, gene_table, None, None)

        # scratch directory that holds any files written during a test
        self.temp_dir = tempfile.mkdtemp()
Beispiel #3
0
 def test_debug_option(self):
     """ check that the debug arguments switch the SNV filter function
     """
     
     trio_count = 1
     genes = {}
     tags = None
     
     # each entry pairs the (chrom, position) debug arguments with the
     # assertion expected to hold after LoadVCFs has been constructed:
     #  - no debug info: the SNV object doesn't use the debug filter function
     #  - debug info passed in: the debug filter function is in use
     expectations = [
         ((None, None), self.assertNotEqual),
         (("1", "10000"), self.assertEqual),
     ]
     
     for (chrom, pos), check in expectations:
         self.vcf_loader = LoadVCFs(trio_count, tags, genes, set(), chrom, pos)
         check(SNV.passes_filters, SNV.passes_filters_with_debug)
Beispiel #4
0
 def setUp(self):
     """ build the default LoadVCFs object used by each test
     """
     
     # INFO tags passed to the loader as its maf_tags argument
     frequency_tags = [
         "AFR_AF",
         "AMR_AF",
         "ASN_AF",
         "DDD_AF",
         "EAS_AF",
         "ESP_AF",
         "EUR_AF",
         "MAX_AF",
         "SAS_AF",
         "UK10K_cohort_AF",
     ]
     
     # a single known gene (ATRX), flagged as hemizygous loss of function
     atrx = {
         "inheritance": {"Hemizygous": {"Loss of function"}},
         "start": 1,
         "chrom": "1",
         "confirmed_status": {"confirmed dd gene"},
         "end": 20000000,
     }
     self.known_genes = {"ATRX": atrx}
     
     trio_count = 1
     self.vcf_loader = LoadVCFs(trio_count, frequency_tags, self.known_genes,
         set(), None, None)
    def filter_trios(self):
        """ load variants for every trio and screen them for candidates

        Walks the families in sorted ID order, and for each affected child
        loads the trio variants, records the VCF provenance and runs
        analyse_trio(). Finishes by calling sys.exit(0), so this method does
        not return to its caller.
        """

        self.vcf_loader = LoadVCFs(
            len(self.families),
            self.known_genes,
            self.debug_chrom,
            self.debug_pos,
        )

        for fam_key in sorted(self.families):
            self.family = self.families[fam_key]

            # families can hold more than one child, so step through them
            # until the family reports no current child
            self.family.set_child()
            while self.family.child is not None:
                if not self.family.child.is_affected():
                    # unaffected children are skipped, but still marked as
                    # examined so that the loop moves on
                    self.family.set_child_examined()
                    continue

                trio_vars = self.vcf_loader.get_trio_variants(
                    self.family, self.pp_filter)
                self.vcf_provenance = self.vcf_loader.get_trio_provenance()
                self.analyse_trio(trio_vars)

                self.family.set_child_examined()

        sys.exit(0)
    def filter_trios(self):
        """ load variants for every trio and screen them for candidates

        Walks the families in sorted ID order, and for each affected child
        loads the trio variants, records the VCF provenance and runs
        analyse_trio(). Finishes by calling sys.exit(0), so this method does
        not return to its caller.
        """

        trio_count = len(self.families)
        self.vcf_loader = LoadVCFs(trio_count, self.known_genes,
            self.debug_chrom, self.debug_pos)

        for fam_key in sorted(self.families):
            self.family = self.families[fam_key]

            # families can hold more than one child, so step through them
            # until the family reports no current child
            self.family.set_child()
            while self.family.child is not None:
                if not self.family.child.is_affected():
                    # unaffected children are skipped, but still marked as
                    # examined so that the loop moves on
                    self.family.set_child_examined()
                    continue

                trio_vars = self.vcf_loader.get_trio_variants(
                    self.family, self.pp_filter)
                self.vcf_provenance = self.vcf_loader.get_trio_provenance()
                self.analyse_trio(trio_vars)

                self.family.set_child_examined()

        sys.exit(0)
class ClinicalFilter(LoadOptions):
    """ filters trios for candidate variants that might contribute to a
    proband's disorder.
    """
    def __init__(self, opts):
        """ initialise the class with some definitions
        
        Args:
            opts: options, handed unchanged to self.set_definitions()
        """

        self.set_definitions(opts)
        self.report = Report(self.output_path, self.export_vcf, self.ID_mapper,
                             self.known_genes_date)

    def filter_trios(self):
        """ loads trio variants, and screens for candidate variants
        
        Processes the families in sorted ID order. Calls sys.exit(0) once
        every family has been examined, so this method does not return.
        """

        self.vcf_loader = LoadVCFs(len(self.families), self.known_genes,
                                   self.debug_chrom, self.debug_pos)

        # load the trio paths into the current path setup
        for family_ID in sorted(self.families):
            self.family = self.families[family_ID]

            # some families have more than one child in the family, so run
            # through each child.
            self.family.set_child()
            while self.family.child is not None:
                if self.family.child.is_affected():
                    variants = self.vcf_loader.get_trio_variants(
                        self.family, self.pp_filter)
                    self.vcf_provenance = self.vcf_loader.get_trio_provenance()
                    self.analyse_trio(variants)

                self.family.set_child_examined()

        sys.exit(0)

    def analyse_trio(self, variants):
        """ identify candidate variants in exome data for a single trio.
        
        Takes variants that passed the initial filtering from VCF loading, and
        splits the variants into groups for each gene with variants. Then
        analyses variants in a single gene (so we can utilise the appropriate
        inheritance mechanisms for that gene), before running some
        post-inheritance filters, and exporting the data (if required).
        
        Args:
            variants: list of TrioGenotypes objects
        """

        # organise variants by gene, then find variants that fit
        # different inheritance models
        genes_dict = self.create_gene_dict(variants)
        found_vars = []
        for gene, gene_vars in genes_dict.items():
            found_vars += self.find_variants(gene_vars, gene)

        # remove any duplicate variants (which might occur due to CNVs being
        # checked against all the genes that they encompass)
        found_vars = self.exclude_duplicates(found_vars)

        # apply some final filters to the flagged variants
        post_filter = PostInheritanceFilter(found_vars, self.family,
                                            self.debug_chrom, self.debug_pos)
        found_vars = post_filter.filter_variants()

        # export the results to either tab-separated table or VCF format
        self.report.export_data(found_vars, self.family,
                                self.vcf_loader.child_header,
                                self.vcf_provenance)

    def create_gene_dict(self, variants):
        """ creates dictionary of variants indexed by gene
        
        Args:
            variants: list of TrioGenotypes objects
        
        Returns:
            dictionary mapping each value from var.get_genes() to the list of
            variants that reported it
        """

        # organise the variants into entries for each gene
        genes = {}
        for var in variants:
            # variants (particularly CNVs) can span multiple genes, so we need
            # to check each gene separately, and then collapse duplicates later
            for gene in var.get_genes():
                genes.setdefault(gene, []).append(var)

        return genes

    def find_variants(self, variants, gene):
        """ finds variants that fit inheritance models
        
        Args:
            variants: list of TrioGenotype objects
            gene: gene ID as string (None for intergenic variants)
        
        Returns:
            list of (variant, check types, inheritance types, [gene]) tuples
            for the variants that pass inheritance checks
        
        Raises:
            ValueError: if the first variant reports an inheritance type that
                is neither autosomal nor one of the allosomal types.
        """

        # get the inheritance for the gene (monoallelic, biallelic, hemizygous
        # etc), but allow for times when we haven't specified a list of genes
        # to use
        gene_inh = None
        if self.known_genes is not None and gene in self.known_genes:
            gene_inh = self.known_genes[gene]["inh"]

        # If we are looking for variants in a set of known genes, and the gene
        # isn't part of that set, then we don't want to examine the variant for
        # that gene, UNLESS the variant is a CNV, since CNVs can be included
        # purely from size thresholds, regardless of which gene they overlap.
        if self.known_genes is not None and gene not in self.known_genes:
            variants = [x for x in variants if x.is_cnv()]

        # ignore intergenic variants
        if gene is None:
            for var in variants:
                if (var.get_chrom() == self.debug_chrom
                        and var.get_position() == self.debug_pos):
                    print(var, "lacks HGNC/gene symbol")
            return []

        # Now that we are examining a single gene, check that the consequences
        # for the gene are in the required functional categories.
        variants = [
            var for var in variants
            if var.child.is_lof(gene) or var.child.is_missense(gene)
        ]
        if not variants:
            return []

        logging.debug("{} {} {} {}".format(self.family.child.get_id(), gene,
                                           variants, gene_inh))
        chrom_inheritance = variants[0].get_inheritance_type()

        if chrom_inheritance == "autosomal":
            finder = Autosomal(variants, self.family, self.known_genes, gene,
                               self.cnv_regions)
        elif chrom_inheritance in ["XChrMale", "XChrFemale", "YChrMale"]:
            finder = Allosomal(variants, self.family, self.known_genes, gene,
                               self.cnv_regions)
        else:
            # without this branch an unexpected type left `finder` unbound and
            # surfaced as an opaque UnboundLocalError on the next line
            raise ValueError("unknown inheritance type '{}' for gene {}".format(
                chrom_inheritance, gene))

        variants = finder.get_candidate_variants()
        variants = [(x[0], list(x[1]), list(x[2]), [gene]) for x in variants]

        return variants

    def exclude_duplicates(self, variants):
        """ rejig variants included under multiple inheritance mechanisms
        
        Args:
            variants: list of candidate variants, as returned by
                find_variants()
        
        Returns:
            list of (variant, check_type, inheritance, genes) tuples, with
            duplicates excluded, and the first instance of each variant
            modified to show every mechanism and gene it was found under
        """

        unique_vars = {}
        for variant in variants:
            key = variant[0].child.get_key()
            if key not in unique_vars:
                unique_vars[key] = list(variant)
                continue

            merged = unique_vars[key]
            result = variant[1]
            inh = variant[2]
            hgnc = variant[3]

            # append the check type and inheritance type to the first
            # instance of the variant
            merged[1] += [x for x in result if x not in merged[1]]
            merged[2] += [x for x in inh if x not in merged[2]]

            # add the HGNC symbols that are unique to the current variant
            # to the merged variant
            merged[3] += [x for x in hgnc if x not in merged[3]]

        return [tuple(x) for x in unique_vars.values()]
class ClinicalFilter(LoadOptions):
    """ filters trios for candidate variants that might contribute to a
    proband's disorder.
    """

    def __init__(self, opts):
        """ initialise the class with some definitions
        
        Args:
            opts: options, handed unchanged to self.set_definitions()
        """

        self.set_definitions(opts)
        self.report = Report(self.output_path, self.export_vcf, self.ID_mapper, self.known_genes_date)

    def filter_trios(self):
        """ loads trio variants, and screens for candidate variants
        
        Processes the families in sorted ID order. Calls sys.exit(0) once
        every family has been examined, so this method does not return.
        """

        self.vcf_loader = LoadVCFs(len(self.families), self.known_genes, self.debug_chrom, self.debug_pos)

        # load the trio paths into the current path setup
        for family_ID in sorted(self.families):
            self.family = self.families[family_ID]

            # some families have more than one child in the family, so run
            # through each child.
            self.family.set_child()
            while self.family.child is not None:
                if self.family.child.is_affected():
                    variants = self.vcf_loader.get_trio_variants(self.family, self.pp_filter)
                    self.vcf_provenance = self.vcf_loader.get_trio_provenance()
                    self.analyse_trio(variants)

                self.family.set_child_examined()

        sys.exit(0)

    def analyse_trio(self, variants):
        """ identify candidate variants in exome data for a single trio.
        
        Takes variants that passed the initial filtering from VCF loading, and
        splits the variants into groups for each gene with variants. Then
        analyses variants in a single gene (so we can utilise the appropriate
        inheritance mechanisms for that gene), before running some
        post-inheritance filters, and exporting the data (if required).
        
        Args:
            variants: list of TrioGenotypes objects
        """

        # organise variants by gene, then find variants that fit
        # different inheritance models
        genes_dict = self.create_gene_dict(variants)
        found_vars = []
        for gene, gene_vars in genes_dict.items():
            found_vars += self.find_variants(gene_vars, gene)

        # remove any duplicate variants (which might occur due to CNVs being
        # checked against all the genes that they encompass)
        found_vars = self.exclude_duplicates(found_vars)

        # apply some final filters to the flagged variants
        post_filter = PostInheritanceFilter(found_vars, self.family, self.debug_chrom, self.debug_pos)
        found_vars = post_filter.filter_variants()

        # export the results to either tab-separated table or VCF format
        self.report.export_data(found_vars, self.family, self.vcf_loader.child_header, self.vcf_provenance)

    def create_gene_dict(self, variants):
        """ creates dictionary of variants indexed by gene
        
        Args:
            variants: list of TrioGenotypes objects
        
        Returns:
            dictionary mapping each value from var.get_genes() to the list of
            variants that reported it
        """

        # organise the variants into entries for each gene
        genes = {}
        for var in variants:
            # variants (particularly CNVs) can span multiple genes, so we need
            # to check each gene separately, and then collapse duplicates later
            for gene in var.get_genes():
                genes.setdefault(gene, []).append(var)

        return genes

    def find_variants(self, variants, gene):
        """ finds variants that fit inheritance models
        
        Args:
            variants: list of TrioGenotype objects
            gene: gene ID as string (None for intergenic variants)
        
        Returns:
            list of (variant, check types, inheritance types, [gene]) tuples
            for the variants that pass inheritance checks
        
        Raises:
            ValueError: if the first variant reports an inheritance type that
                is neither autosomal nor one of the allosomal types.
        """

        # get the inheritance for the gene (monoallelic, biallelic, hemizygous
        # etc), but allow for times when we haven't specified a list of genes
        # to use
        gene_inh = None
        if self.known_genes is not None and gene in self.known_genes:
            gene_inh = self.known_genes[gene]["inh"]

        # If we are looking for variants in a set of known genes, and the gene
        # isn't part of that set, then we don't want to examine the variant for
        # that gene, UNLESS the variant is a CNV, since CNVs can be included
        # purely from size thresholds, regardless of which gene they overlap.
        if self.known_genes is not None and gene not in self.known_genes:
            variants = [x for x in variants if x.is_cnv()]

        # ignore intergenic variants
        if gene is None:
            for var in variants:
                if var.get_chrom() == self.debug_chrom and var.get_position() == self.debug_pos:
                    print(var, "lacks HGNC/gene symbol")
            return []

        # Now that we are examining a single gene, check that the consequences
        # for the gene are in the required functional categories.
        variants = [var for var in variants if var.child.is_lof(gene) or var.child.is_missense(gene)]
        if not variants:
            return []

        logging.debug("{} {} {} {}".format(self.family.child.get_id(), gene, variants, gene_inh))
        chrom_inheritance = variants[0].get_inheritance_type()

        if chrom_inheritance == "autosomal":
            finder = Autosomal(variants, self.family, self.known_genes, gene, self.cnv_regions)
        elif chrom_inheritance in ["XChrMale", "XChrFemale", "YChrMale"]:
            finder = Allosomal(variants, self.family, self.known_genes, gene, self.cnv_regions)
        else:
            # without this branch an unexpected type left `finder` unbound and
            # surfaced as an opaque UnboundLocalError on the next line
            raise ValueError("unknown inheritance type '{}' for gene {}".format(chrom_inheritance, gene))

        variants = finder.get_candidate_variants()
        variants = [(x[0], list(x[1]), list(x[2]), [gene]) for x in variants]

        return variants

    def exclude_duplicates(self, variants):
        """ rejig variants included under multiple inheritance mechanisms
        
        Args:
            variants: list of candidate variants, as returned by
                find_variants()
        
        Returns:
            list of (variant, check_type, inheritance, genes) tuples, with
            duplicates excluded, and the first instance of each variant
            modified to show every mechanism and gene it was found under
        """

        unique_vars = {}
        for variant in variants:
            key = variant[0].child.get_key()
            if key not in unique_vars:
                unique_vars[key] = list(variant)
                continue

            merged = unique_vars[key]
            result = variant[1]
            inh = variant[2]
            hgnc = variant[3]

            # append the check type and inheritance type to the first
            # instance of the variant
            merged[1] += [x for x in result if x not in merged[1]]
            merged[2] += [x for x in inh if x not in merged[2]]

            # add the HGNC symbols that are unique to the current variant
            # to the merged variant
            merged[3] += [x for x in hgnc if x not in merged[3]]

        return [tuple(x) for x in unique_vars.values()]
class TestLoadVCFsPy(unittest.TestCase):
    """ unit tests for the LoadVCFs class
    """
    def setUp(self):
        """ define a default LoadVCFs object
        """

        total_trios = 1
        known_genes = {"ATRX": {"inheritance": {"Hemizygous": \
            {"Loss of function"}}, "start": 1, "chrom": "1", \
            "confirmed_status": {"Confirmed DD Gene"}, "end": 20000000}}

        self.vcf_loader = LoadVCFs(total_trios, known_genes, None, None)

        # make a temp directory for the cache file
        self.temp_dir = tempfile.mkdtemp()

    def tearDown(self):
        """ remove the temp directory once a test completes
        """

        shutil.rmtree(self.temp_dir)

    def make_minimal_vcf(self):
        """ construct the bare minimum of lines for a VCF file

        Returns:
            list of newline-terminated strings: four header lines followed by
            two variant lines.
        """

        header = [
            "##fileformat=VCFv4.1\n",
            "##fileDate=2014-01-01\n",
            "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n",
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample_id\n",
        ]

        variants = [
            "1\t100\t.\tT\tA\t1000\tPASS\t.\tGT\t0/1\n",
            "1\t200\t.\tT\tA\t1000\tPASS\t.\tGT\t0/1\n",
        ]

        return header + variants

    def write_temp_vcf(self, filename, vcf_data):
        """ writes data to a file, and returns the full path to the file

        Args:
            filename: basename for the file, created inside self.temp_dir
            vcf_data: iterable of strings, joined without any separator
        """

        full_path = os.path.join(self.temp_dir, filename)

        # the context manager closes the handle even if the write fails
        with open(full_path, "w") as output:
            output.write("".join(vcf_data))

        return full_path

    def write_gzipped_vcf(self, filename, vcf_data):
        """ writes data to a gzip file, and returns the full path to the file

        Args:
            filename: basename for the file, created inside self.temp_dir
            vcf_data: iterable of strings, joined without any separator
        """

        full_path = os.path.join(self.temp_dir, filename)

        # python 2 writes str through a binary handle, whereas python 3 needs
        # the handle opened in text mode to accept str
        mode = "wb" if IS_PYTHON2 else "wt"
        with gzip.open(full_path, mode) as handle:
            handle.write("".join(vcf_data))

        return full_path

    def test_open_vcf_file(self):
        """ test obtaining a file handle for the VCF
        """

        vcf = self.make_minimal_vcf()
        path = self.write_temp_vcf("temp.vcf", vcf)

        # check that plain VCF files can be loaded
        handle = self.vcf_loader.open_vcf_file(path)
        self.assertEqual(type(handle), io.TextIOWrapper)
        handle.close()

        # check that gzipped vcf files are handled correctly
        path = self.write_gzipped_vcf("temp.vcf.gz", vcf)

        handle = self.vcf_loader.open_vcf_file(path)
        if IS_PYTHON2:
            self.assertEqual(type(handle), gzip.GzipFile)
        elif IS_PYTHON3:
            self.assertEqual(type(handle), io.TextIOWrapper)
        handle.close()

        # make sure files that don't exist raise an error
        path = os.path.join(self.temp_dir, "zzz.txt")
        with self.assertRaises(OSError):
            self.vcf_loader.open_vcf_file(path)

        # check that files with unknown extensions raise errors
        path = self.write_temp_vcf("temp.zzz", vcf)
        with self.assertRaises(OSError):
            self.vcf_loader.open_vcf_file(path)

    def test_get_vcf_header(self):
        """ test that get_vcf_header() works correctly
        """

        vcf = self.make_minimal_vcf()
        path = self.write_temp_vcf("temp.vcf", vcf)

        header = self.vcf_loader.get_vcf_header(path)

        # check that the header is returned correctly
        self.assertEqual(header, vcf[:4])

    def test_exclude_header(self):
        """ test that exclude_header() works correctly
        """

        vcf = self.make_minimal_vcf()

        # make sure we drop the header, and only the header from the file
        # check this by reading the file, and making sure the first line
        # is the line we expect from the VCF
        path = self.write_temp_vcf("temp.vcf", vcf)
        with open(path, "r") as handler:
            self.vcf_loader.exclude_header(handler)
            self.assertEqual(handler.readline(), vcf[4])

        # also check for gzipped VCF files.
        path = self.write_gzipped_vcf("temp.vcf.gz", vcf)
        mode = "r" if IS_PYTHON2 else "rt"
        with gzip.open(path, mode) as handler:
            self.vcf_loader.exclude_header(handler)
            self.assertEqual(handler.readline(), vcf[4])

    def test_add_single_variant(self):
        """ test that add_single_variant() works correctly
        """

        # the sub-functions are all tested elsewhere, this test merely checks
        # that valid variants are added to the variants list, and invalid
        # variants are passed over without being added to the variants list

        # set up an autosomal variant
        line = ["1", "100", ".", "T", "G", "1000", "PASS", ".", "GT", "0/1"]
        gender = "M"
        variant = SNV(*line[:6])

        # check that the variant is added to the variant list
        variants = []
        self.vcf_loader.add_single_variant(variants, variant, gender, line)
        self.assertEqual(variants, [variant])

        # set up an X-chrom male het
        line = ["X", "100", ".", "T", "G", "1000", "PASS", ".", "GT", "0/1"]
        variant = SNV(*line[:6])

        # check that the X-chrom male het is not added to the variant list
        variants = []
        self.vcf_loader.add_single_variant(variants, variant, gender, line)
        self.assertEqual(variants, [])

    def test_get_vcf_provenance(self):
        """ test that get_vcf_provenance() works correctly
        """

        vcf = self.make_minimal_vcf()
        vcf_string = "".join(vcf)
        if IS_PYTHON3:
            # hashlib needs bytes under python 3
            vcf_string = vcf_string.encode("utf-8")
        ungzipped_hash = hashlib.sha1(vcf_string).hexdigest()

        path = self.write_temp_vcf("temp.vcf", vcf)

        # check that the file defs return correctly
        (checksum, basename, date) = self.vcf_loader.get_vcf_provenance(path)

        self.assertEqual(checksum, ungzipped_hash)
        self.assertEqual(basename, "temp.vcf")
        self.assertEqual(date, "2014-01-01")

        # now write a gzip file, and check that we get the correct hash
        path = self.write_gzipped_vcf("test.vcf.gz", vcf)
        with open(path, "rb") as handle:
            gzipped_hash = hashlib.sha1(handle.read()).hexdigest()

        (checksum, basename, date) = self.vcf_loader.get_vcf_provenance(path)
        self.assertEqual(checksum, gzipped_hash)

        # check that when a fileDate isn't available in the VCF, we can pick
        # the date from the path
        vcf.pop(1)
        path = self.write_temp_vcf("temp.file_process.2014-02-20.vcf", vcf)
        (checksum, basename, date) = self.vcf_loader.get_vcf_provenance(path)
        self.assertEqual(date, "2014-02-20")

    def test_construct_variant(self):
        """ test that construct_variant() works correctly
        """

        # check that construct variant works for SNVs
        line = ["1", "100", ".", "T", "G", "1000", "PASS", ".", "GT", "0/1"]
        gender = "M"
        test_var = SNV(*line[:6])

        variant = self.vcf_loader.construct_variant(line, gender)

        self.assertEqual(variant.get_key(), test_var.get_key())
        # initially constructing a SNV shouldn't affect the format variable
        self.assertEqual(variant.format, None)

        # check that construct variant works for CNVs
        line = [
            "1", "100", ".", "T", "<DEL>", "1000", "PASS", "END=200", "GT",
            "0/1"
        ]
        test_var = CNV(*line[:6])
        test_var.add_info(line[7])

        variant = self.vcf_loader.construct_variant(line, gender)

        self.assertEqual(variant.get_key(), test_var.get_key())
        self.assertNotEqual(variant.format, None)

        # TODO: add checks for when HGNC is in the filters

    def test_include_variant(self):
        """ check that include_variant() works correctly
        """

        child_variants = False
        gender = "M"
        # make a child var which passes the filters
        line = [
            "1", "100", ".", "T", "A", "1000", "PASS",
            "CQ=missense_variant;HGNC=ATRX", "GT", "0/1"
        ]
        self.assertTrue(
            self.vcf_loader.include_variant(line, child_variants, gender))

        # make a child var that fails the filters, which should return False
        line = [
            "1", "100", ".", "T", "A", "1000", "FAIL",
            "CQ=missense_variant;HGNC=ATRX", "GT", "0/1"
        ]
        self.assertFalse(
            self.vcf_loader.include_variant(line, child_variants, gender))

        # now check for parents variants
        child_variants = True
        # check a parents var, where we have a matching child var
        self.vcf_loader.child_keys = {("1", 100), ("X", 200)}
        line = [
            "1", "100", ".", "T", "A", "1000", "FAIL",
            "CQ=missense_variant;HGNC=ATRX", "GT", "0/1"
        ]
        self.assertTrue(
            self.vcf_loader.include_variant(line, child_variants, gender))

        # check a parents var, where we don't have a matching child var
        line = [
            "1", "200", ".", "T", "A", "1000", "FAIL",
            "CQ=missense_variant;HGNC=ATRX", "GT", "0/1"
        ]
        self.assertFalse(
            self.vcf_loader.include_variant(line, child_variants, gender))

        # and check parental CNVs
        line = [
            "1", "100", ".", "T", "<DEL>", "1000", "PASS", "END=200", "GT",
            "0/1"
        ]
        test_var = CNV(*line[:6])
        test_var.add_info(line[7])

        # in this function we look for overlap in CNVs. Set up a child CNV
        # that the parents CNV must match.
        self.vcf_loader.cnv_matcher = MatchCNVs([test_var])
        self.assertTrue(
            self.vcf_loader.include_variant(line, child_variants, gender))

        # check that a parental CNV without any overlap to any childs CNVs,
        # fails to pass
        line = [
            "1", "300", ".", "T", "<DEL>", "1000", "PASS", "END=400", "GT",
            "0/1"
        ]
        self.assertFalse(
            self.vcf_loader.include_variant(line, child_variants, gender))

    def test_filter_de_novos(self):
        """ check that filter_de_novos() works correctly
        """

        # make a family without parents
        family = Family("fam_id")
        child_gender = "female"
        family.add_child("child_id", "child_vcf_path", "2", child_gender)
        self.vcf_loader.family = family

        # set up an autosomal variant
        line = ["1", "100", ".", "T", "G", "1000", "PASS", ".", "GT", "0/1"]
        child_var = SNV(*line[:6])
        child_var.add_info(line[7])
        child_var.add_format(line[8], line[9])
        child_var.set_gender(child_gender)
        child_var.set_genotype()

        # combine the variant into a list of TrioGenotypes
        child_vars = [child_var]
        mother_vars = []
        father_vars = []
        trio_variants = self.vcf_loader.combine_trio_variants(
            child_vars, mother_vars, father_vars)

        # check that vars without parents get passed through automatically
        self.assertEqual(self.vcf_loader.filter_de_novos(trio_variants, 0.9),
                         trio_variants)

        # now add parents to the family
        family.add_mother("mother_id", "mother_vcf_path", "1", "female")
        family.add_father("father_id", "father_vcf_path", "1", "male")

        # re-generate the variants list now that parents have been included
        trio_variants = self.vcf_loader.combine_trio_variants(
            child_vars, mother_vars, father_vars)

        # check that vars with parents, and that appear to be de novo are
        # filtered out
        self.assertEqual(self.vcf_loader.filter_de_novos(trio_variants, 0.9),
                         [])

        # check that vars with parents, but which are not de novo, are retained
        mother_vars = child_vars
        trio_variants = self.vcf_loader.combine_trio_variants(
            child_vars, mother_vars, father_vars)

        self.assertEqual(self.vcf_loader.filter_de_novos(trio_variants, 0.9),
                         trio_variants)

    def test_debug_option(self):
        """ test whether we can set up the class with the debug option
        """

        total_trios = 1
        known_genes = {}

        # NOTE(review): the assertion below implies that constructing LoadVCFs
        # with debug arguments changes SNV.passes_filters at class level, which
        # would persist into later tests - confirm whether a reset is needed.
        self.vcf_loader = LoadVCFs(total_trios, known_genes, "1", "10000")

        # check that the debug filter function got set correctly
        self.assertEqual(SNV.passes_filters, SNV.passes_filters_with_debug)
Beispiel #10
0
class TestLoadVCFsPy(unittest.TestCase):
    """ test that the LoadVCFs methods work as expected
    """
    
    @classmethod
    def setUpClass(cls):
        """ create one temporary directory, stored on the class as temp_dir
        """
        
        scratch_dir = tempfile.mkdtemp()
        cls.temp_dir = scratch_dir
    
    @classmethod
    def tearDownClass(cls):
        """ delete the class-level temporary directory and all of its contents
        """
        
        scratch_dir = cls.temp_dir
        shutil.rmtree(scratch_dir)
    
    def setUp(self):
        """ build the known genes table and LoadVCFs object used by each test
        """
        
        n_trios = 1
        population_tags = [
            "AFR_AF", "AMR_AF", "ASN_AF", "DDD_AF", "EAS_AF",
            "ESP_AF", "EUR_AF", "MAX_AF", "SAS_AF", "UK10K_cohort_AF",
        ]
        
        # a single gene entry, kept on the instance so that tests can pass the
        # same table to other functions
        atrx = {
            "inheritance": {"Hemizygous": {"Loss of function"}},
            "start": 1,
            "chrom": "1",
            "confirmed_status": {"confirmed dd gene"},
            "end": 20000000,
        }
        self.known_genes = {"ATRX": atrx}
        
        # the final two arguments are the debug chromosome and position, which
        # are left unset here
        self.vcf_loader = LoadVCFs(n_trios, population_tags, self.known_genes,
            set(), None, None)
    
    def write_temp_vcf(self, path, vcf_data):
        """ save the given lines, unmodified, to a text file at path
        """
        
        with open(path, 'w') as output:
            for line in vcf_data:
                output.write(line)
    
    def write_gzipped_vcf(self, path, lines):
        ''' bgzip the given VCF lines into path, then index the result with tabix
        '''
        
        with tempfile.NamedTemporaryFile(dir=self.temp_dir) as plain:
            # the lines are encoded before being written to the temporary file
            plain.writelines(line.encode('utf8') for line in lines)
            plain.flush()
            
            # assume bgzip and tabix binaries are available, this should be
            # handled by travis-ci setup.
            compress_cmd = ['bgzip', '-c', plain.name]
            with open(path, 'w') as compressed:
                subprocess.call(compress_cmd, stdout=compressed)
            
            index_cmd = ['tabix', '-f', '-p', 'vcf', path]
            subprocess.call(index_cmd)
    
    def test_open_vcf(self):
        """ test obtaining a file handle for the VCF
        
        Covers plain text and gzipped VCFs, as well as the error raised for
        missing files and for files with an unknown extension.
        """
        
        vcf = make_minimal_vcf()
        path = os.path.join(self.temp_dir, "temp.vcf")
        self.write_temp_vcf(path, vcf)
        
        # check that plain VCF files can be loaded. The handle is closed in a
        # finally clause, so that a failed assertion cannot leak the open file.
        handle = open_vcf(path)
        try:
            self.assertEqual(type(handle), io.TextIOWrapper)
        finally:
            handle.close()
        
        # check that gzipped vcf files are handled correctly
        path = os.path.join(self.temp_dir, "temp.vcf.gz")
        self.write_gzipped_vcf(path, vcf)
        
        # the handle type expected for gzipped files depends on the python
        # version
        expected_type = io.TextIOWrapper if IS_PYTHON3 else gzip.GzipFile
        handle = open_vcf(path)
        try:
            self.assertEqual(type(handle), expected_type)
        finally:
            handle.close()
        
        # make sure files that don't exist raise an error
        path = os.path.join(self.temp_dir, "zzz.txt")
        with self.assertRaises(OSError):
            open_vcf(path)
        
        # check that files with unknown extensions raise errors
        path = os.path.join(self.temp_dir, "temp.zzz")
        self.write_temp_vcf(path, vcf)
        with self.assertRaises(OSError):
            open_vcf(path)
    
    def test_get_vcf_header(self):
        """ check that get_vcf_header() returns the header lines of a VCF
        """
        
        lines = make_minimal_vcf()
        vcf_path = os.path.join(self.temp_dir, "temp.vcf")
        self.write_temp_vcf(vcf_path, lines)
        
        found = get_vcf_header(vcf_path)
        
        # the first four lines of the minimal VCF are expected back
        expected = lines[:4]
        self.assertEqual(found, expected)
    
    def test_exclude_header(self):
        """ test that exclude_header() works correctly
        
        After exclude_header() has run on an open handle, the next line read
        from that handle should be the fifth line of the VCF, for both plain
        text and gzipped files.
        """
        
        vcf = make_minimal_vcf()
        
        # make sure we drop the header, and only the header from the file
        # check this by reading the file, and making sure the first line
        # is the line we expect from the VCF
        path = os.path.join(self.temp_dir, "temp.vcf")
        self.write_temp_vcf(path, vcf)
        
        # use a context manager, so that the file is closed even when the
        # assertion fails
        with open(path, "r") as handler:
            exclude_header(handler)
            self.assertEqual(handler.readline(), vcf[4])
        
        # also check for gzipped VCF files.
        path = os.path.join(self.temp_dir, "temp.vcf.gz")
        self.write_gzipped_vcf(path, vcf)
        
        # python 3 needs text mode requested explicitly, so that readline()
        # gives a string to compare against the VCF line
        mode = 'rt' if IS_PYTHON3 else 'r'
        
        with gzip.open(path, mode) as handler:
            exclude_header(handler)
            self.assertEqual(handler.readline(), vcf[4])
    
    def test_add_single_variant(self):
        """ check which variants add_single_variant() appends to a list
        """
        
        # the sub-functions are all tested elsewhere, so this only checks that
        # valid variants end up in the list, and that invalid variants are
        # skipped without being added
        sex = "M"
        
        # an autosomal variant should be appended to the list
        autosomal = ["1", "100", ".", "T", "G", "1000", "PASS", ".", "GT", "0/1"]
        autosomal_var = SNV(*autosomal[:6])
        
        collected = []
        self.vcf_loader.add_single_variant(collected, autosomal_var, sex, autosomal)
        self.assertEqual(collected, [autosomal_var])
        
        # a heterozygous X-chrom variant in a male should not be appended
        chrx = ["X", "100", ".", "T", "G", "1000", "PASS", ".", "GT", "0/1"]
        chrx_var = SNV(*chrx[:6])
        
        collected = []
        self.vcf_loader.add_single_variant(collected, chrx_var, sex, chrx)
        self.assertEqual(collected, [])
    
    def test_get_vcf_provenance(self):
        """ test that get_vcf_provenance() works correctly
        
        Checks the (checksum, basename, date) values returned for a plain VCF,
        for a gzipped VCF, for a VCF where the date has to come from the path,
        and for a missing family member.
        """
        
        path = os.path.join(self.temp_dir, "temp.vcf")
        gz_path = os.path.join(self.temp_dir, "temp.vcf.gz")
        date_path = os.path.join(self.temp_dir, "temp.process.2014-02-20.vcf")
        
        family = Family('famid')
        family.add_child('child_id', 'mother', 'father', 'f', '2', path)
        family.add_mother('mom_id', '0', '0', 'female', '1', gz_path)
        family.add_father('dad_id', '0', '0', 'male', '1', date_path)
        family.set_child()
        
        vcf = make_minimal_vcf()
        vcf_string = "".join(vcf)
        if IS_PYTHON3:
            # hashlib requires bytes rather than text under python 3
            vcf_string = vcf_string.encode("utf-8")
        ungzipped_hash = hashlib.sha1(vcf_string).hexdigest()
        
        self.write_temp_vcf(path, vcf)
        
        # check that the file defs return correctly
        (checksum, basename, date) = get_vcf_provenance(family.child)
        
        self.assertEqual(checksum, ungzipped_hash)
        self.assertEqual(basename, "temp.vcf")
        self.assertEqual(date, "2014-01-01")
        
        # now write a gzip file, and check that we get the correct hash. The
        # expected hash is taken from the compressed bytes on disk.
        self.write_gzipped_vcf(gz_path, vcf)
        with open(gz_path, "rb") as handle:
            gzipped_hash = hashlib.sha1(handle.read()).hexdigest()
        
        (checksum, basename, date) = get_vcf_provenance(family.mother)
        self.assertEqual(checksum, gzipped_hash)
        
        # check that when a fileDate isn't available in the VCF, we can pick
        # the date from the path
        vcf.pop(1)
        self.write_temp_vcf(date_path, vcf)
        (checksum, basename, date) = get_vcf_provenance(family.father)
        self.assertEqual(date, "2014-02-20")
        
        # and check we get null values if the family member is not present
        family.father = None
        provenance = get_vcf_provenance(family.father)
        self.assertEqual(provenance, ('NA', 'NA', 'NA'))
    
    def test_construct_variant(self):
        """ check construct_variant() for both SNV and CNV input lines
        """
        
        sex = "M"
        
        # a SNV line should give a variant with the same key as a SNV built
        # directly from the first six fields of the line
        snv_line = ["1", "100", ".", "T", "G", "1000", "PASS", ".", "GT", "0/1"]
        expected = SNV(*snv_line[:6])
        
        built = construct_variant(snv_line, sex, self.known_genes)
        
        self.assertEqual(built.get_key(), expected.get_key())
        # initially constructing a SNV shouldn't affect the format variable
        self.assertEqual(built.format, None)
        
        # a line with a <DEL> alt allele should match a CNV built from the same
        # fields, and this time the format should have been set
        cnv_line = ["1", "100", ".", "T", "<DEL>", "1000", "PASS", "END=200", "GT", "0/1"]
        expected = CNV(*cnv_line[:6])
        expected.add_info(cnv_line[7])
        
        built = construct_variant(cnv_line, sex, self.known_genes)
        
        self.assertEqual(built.get_key(), expected.get_key())
        self.assertNotEqual(built.format, None)
        
        # TODO: add checks for when HGNC is in the filters
    
    def test_include_variant(self):
        """ check which VCF lines include_variant() accepts
        """
        
        info = "CQ=missense_variant;HGNC=ATRX"
        
        def snv_line(pos, status):
            # SNV line on chrom 1 with the given position and FILTER value
            return ["1", pos, ".", "T", "A", "1000", status, info, "GT", "0/1"]
        
        def cnv_line(pos, end):
            # deletion line on chrom 1 with the given position and END info
            return ["1", pos, ".", "T", "<DEL>", "1000", "PASS", end, "GT", "0/1"]
        
        mnvs = {}
        gender = "M"
        
        # first check lines without requiring a matching child variant
        child_variants = False
        
        # a var which passes the filters should be included
        line = snv_line("100", "PASS")
        self.assertTrue(self.vcf_loader.include_variant(line, child_variants, gender, mnvs))
        
        # a var that fails the filters should return False
        line = snv_line("100", "FAIL")
        self.assertFalse(self.vcf_loader.include_variant(line, child_variants, gender, mnvs))
        
        # now check for parents variants, which are matched against the keys
        # of the child's variants
        child_variants = True
        self.vcf_loader.child_keys = {("1", 100), ("X", 200)}
        
        # a parental var at the site of a child var is included
        line = snv_line("100", "FAIL")
        self.assertTrue(self.vcf_loader.include_variant(line, child_variants, gender, mnvs))
        
        # a parental var without a matching child var is not included
        line = snv_line("200", "FAIL")
        self.assertFalse(self.vcf_loader.include_variant(line, child_variants, gender, mnvs))
        
        # and check parental CNVs
        line = cnv_line("100", "END=200")
        # NOTE(review): this CNV is built, but is never handed to the loader or
        # compared against anything below - confirm whether it is still needed
        test_var = CNV(*line[:6])
        test_var.add_info(line[7])
        
        # a parental CNV at position 100 is expected to be included
        self.assertTrue(self.vcf_loader.include_variant(line, child_variants, gender, mnvs))
        
        # check that a parental CNV without any overlap to any childs CNVs,
        # fails to pass
        line = cnv_line("300", "END=400")
        self.assertFalse(self.vcf_loader.include_variant(line, child_variants, gender, mnvs))
    
    def test_open_individual(self):
        ''' check the variants that open_individual() loads for a person
        '''
        
        # a missing individual gives an empty list
        self.assertEqual(self.vcf_loader.open_individual(None), [])
        
        # write a VCF with two variants, which differ by position and gene
        vcf = make_vcf_header()
        vcf.extend([
            make_vcf_line(pos=1, extra='HGNC=TEST;MAX_AF=0.0001'),
            make_vcf_line(pos=2, extra='HGNC=ATRX;MAX_AF=0.0001'),
        ])
        
        path = os.path.join(self.temp_dir, "temp.vcf")
        self.write_temp_vcf(path, vcf)
        
        person = Person('fam_id', 'sample', 'dad', 'mom', 'F', '2', path)
        
        # the values that the two expected variants have in common
        shared = {'chrom': "1", 'id': ".", 'ref': "G", 'alts': "T",
            'filter': "PASS", 'format': "DP:GT", 'sample': "50:0/1",
            'gender': "female", 'mnv_code': None}
        test_gene_var = SNV(position=1,
            info="CQ=missense_variant;HGNC=TEST;MAX_AF=0.0001", **shared)
        atrx_var = SNV(position=2,
            info="CQ=missense_variant;HGNC=ATRX;MAX_AF=0.0001", **shared)
        
        # by default only the ATRX variant is expected back
        self.assertEqual(self.vcf_loader.open_individual(person), [atrx_var])
        
        # define a set of variants to automatically pass, and check that these
        # variants pass.
        self.vcf_loader.child_keys = {('1', 1), ('1', 2)}
        loaded = self.vcf_loader.open_individual(person, child_variants=True)
        self.assertEqual(loaded, [test_gene_var, atrx_var])
    
    def test_open_individual_with_mnvs(self):
        ''' check how MNV codes change the variants open_individual() returns
        '''
        
        # two adjacent variants in the same gene, with different consequences
        vcf = make_vcf_header()
        vcf.extend([
            make_vcf_line(pos=1, cq='splice_region_variant',
                extra='HGNC=ATRX;MAX_AF=0.0001'),
            make_vcf_line(pos=2, cq='missense_variant',
                extra='HGNC=ATRX;MAX_AF=0.0001'),
        ])
        
        path = os.path.join(self.temp_dir, "temp.vcf.gz")
        self.write_gzipped_vcf(path, vcf)
        
        person = Person('fam_id', 'sample', 'dad', 'mom', 'F', '2', path)
        
        # the values that the two expected variants have in common
        shared = {'chrom': "1", 'id': ".", 'ref': "G", 'alts': "T",
            'filter': "PASS", 'format': "DP:GT", 'sample': "50:0/1",
            'gender': "female"}
        splice_var = SNV(position=1,
            info="CQ=splice_region_variant;HGNC=ATRX;MAX_AF=0.0001",
            mnv_code='modified_protein_altering_mnv', **shared)
        missense_var = SNV(position=2,
            info="CQ=missense_variant;HGNC=ATRX;MAX_AF=0.0001",
            mnv_code=None, **shared)
        
        # by default only one variant passes
        self.assertEqual(self.vcf_loader.open_individual(person), [missense_var])
        
        # if we include MNVs, then the passing variants swap
        mnv_codes = {
            ('1', 1): 'modified_protein_altering_mnv',
            ('1', 2): 'modified_synonymous_mnv',
        }
        loaded = self.vcf_loader.open_individual(person, mnvs=mnv_codes)
        self.assertEqual(loaded, [splice_var])
    
    def test_load_trio(self):
        ''' check the TrioGenotypes that load_trio() builds for a full trio
        '''
        
        def write_member_vcf(member):
            # every family member gets the same two variants, in a gzipped VCF
            # named after the member
            lines = make_vcf_header()
            lines.append(make_vcf_line(pos=1, extra='HGNC=TEST;MAX_AF=0.0001'))
            lines.append(make_vcf_line(pos=2, extra='HGNC=ATRX;MAX_AF=0.0001'))
            
            vcf_path = os.path.join(self.temp_dir, "{}.vcf.gz".format(member))
            self.write_gzipped_vcf(vcf_path, lines)
            return vcf_path
        
        child_path = write_member_vcf('child')
        mother_path = write_member_vcf('mother')
        father_path = write_member_vcf('father')
        
        family = Family('fam_id')
        family.add_child('sample', 'mother_id', 'father_id', 'female', '2', child_path)
        family.add_mother('mother_id', '0', '0', 'female', '1', mother_path)
        family.add_father('father_id', '0', '0', 'male', '1', father_path)
        family.set_child()
        
        # the child and mother are expected to have identical variants, while
        # the father only differs by gender
        female_args = {'chrom': "1", 'position': 2, 'id': ".", 'ref': "G",
            'alts': "T", 'filter': "PASS",
            'info': "CQ=missense_variant;HGNC=ATRX;MAX_AF=0.0001",
            'format': "DP:GT", 'sample': "50:0/1", 'gender': "female",
            'mnv_code': None}
        male_args = dict(female_args, gender='male')
        
        loaded = self.vcf_loader.load_trio(family)
        
        # only the variant at position 2 is expected in the loaded trio
        expected = TrioGenotypes(chrom="1", pos=2, child=SNV(**female_args),
            mother=SNV(**female_args), father=SNV(**male_args))
        self.assertEqual(loaded, [expected])
    
    def test_get_parental_var_snv(self):
        ''' check the SNV that get_parental_var() returns for a mother
        '''
        
        sex = 'F'
        child_var = create_snv(sex, '0/1')
        mother = Person('fam_id', 'mom', '0', '0', 'F', '1', '/PATH')
        
        # without any parental variants, we expect a default variant for the
        # mother, with the child's info and a 0/0 genotype
        found = self.vcf_loader.get_parental_var(child_var, [], mother)
        placeholder = SNV(chrom="1", position=150, id=".", ref="A", alts="G",
            filter="PASS", info=child_var.get_info_as_string(), format="GT",
            sample="0/0", gender="female", mnv_code=None)
        self.assertEqual(found, placeholder)
        
        # when the mother does have a variant, that variant should be returned
        mother_var = create_snv(sex, '0/0')
        found = self.vcf_loader.get_parental_var(child_var, [mother_var], mother)
        self.assertEqual(found, mother_var)
    
    def test_get_parental_var_cnv(self):
        ''' check the CNV that get_parental_var() returns for a mother
        '''
        
        sex = 'F'
        child_var = create_cnv(sex, 'deNovo')
        mother = Person('fam_id', 'mom', '0', '0', 'F', '1', '/PATH')
        
        def reference_cnv():
            # the CNV expected for the mother in both of the checks below
            return CNV(chrom="1", position=150, id=".", ref="A",
                alts="<REF>", filter="PASS", info=child_var.get_info_as_string(),
                format='INHERITANCE', sample='uncertain', gender="female",
                mnv_code=None)
        
        # first try without any parental variants
        found = self.vcf_loader.get_parental_var(child_var, [], mother)
        self.assertEqual(found, reference_cnv())
        
        # check that even if a CNV exists in the parent at a matching site, we
        # still create a new CNV object for the parent
        mother_var = create_cnv(sex, 'uncertain')
        found = self.vcf_loader.get_parental_var(child_var, [mother_var], mother)
        self.assertEqual(found, reference_cnv())
    
    def test_get_parental_var_cnv_maternally_inherited(self):
        ''' check get_parental_var() for a child CNV created as 'maternal'
        '''
        
        sex = 'F'
        mother = Person('fam_id', 'mom', '0', '0', 'F', '1', '/PATH')
        child_var = create_cnv(sex, 'maternal')
        
        # no parental variants are passed in, and the CNV expected for the
        # mother has a <DUP> allele
        found = self.vcf_loader.get_parental_var(child_var, [], mother)
        expected = CNV(chrom="1", position=150, id=".", ref="A",
            alts="<DUP>", filter="PASS", info=child_var.get_info_as_string(),
            format='INHERITANCE', sample='uncertain', gender="female",
            mnv_code=None)
        self.assertEqual(found, expected)
    
    def test_filter_de_novos(self):
        """ check filter_de_novos() for trios with and without parents
        """
        
        loader = self.vcf_loader
        
        # start with a family that only contains the child
        family = Family("fam_id")
        child_gender = "female"
        family.add_child('child_id', 'mother_id', 'father_id', child_gender, '2', 'child_path')
        loader.family = family
        
        # a single autosomal variant for the child
        # NOTE(review): the variant is built with gender "M", while the child
        # was added to the family as "female" - confirm this is intended
        gender = "M"
        child_var = SNV("1", "100", ".", "T", "G", "PASS", ".", "GT", "0/1", gender)
        
        child_vars = [child_var]
        mother_vars = []
        father_vars = []
        
        # vars without parents get passed through automatically
        trio_variants = loader.combine_trio_variants(family, child_vars, mother_vars, father_vars)
        kept = loader.filter_de_novos(trio_variants, 0.9)
        self.assertEqual(kept, trio_variants)
        
        # now add parents to the family
        family.add_mother("mother_id", '0', '0', 'female', '1', "mother_vcf_path")
        family.add_father("father_id", '0', '0', 'male', '1', "father_vcf_path")
        loader.family = family
        
        # with parents included, but no parental variants, the child's var
        # appears to be de novo, and is filtered out
        trio_variants = loader.combine_trio_variants(family, child_vars, mother_vars, father_vars)
        kept = loader.filter_de_novos(trio_variants, 0.9)
        self.assertEqual(kept, [])
        
        # vars with parents, but which are not de novo, are retained
        mother_vars = child_vars
        trio_variants = loader.combine_trio_variants(family, child_vars, mother_vars, father_vars)
        kept = loader.filter_de_novos(trio_variants, 0.9)
        self.assertEqual(kept, trio_variants)
    
    def test_debug_option(self):
        """ test whether we can set up the class with the debug option
        
        Builds LoadVCFs objects without and with a debug chromosome and
        position, and checks whether SNV.passes_filters compares equal to
        SNV.passes_filters_with_debug afterwards.
        """
        
        # the assertions below inspect an attribute of the SNV class itself, so
        # put back whatever was there once the test finishes. Otherwise the
        # debug filter stays in place for every test that runs afterwards, and
        # the first assertion fails if this test is run a second time.
        self.addCleanup(setattr, SNV, "passes_filters", SNV.passes_filters)
        
        total_trios = 1
        known_genes = {}
        maf_tags = None
        
        # if the debug info isn't available, then the SNV object doesn't use the
        # debug filter function
        self.vcf_loader = LoadVCFs(total_trios, maf_tags, known_genes, set(), None, None)
        self.assertNotEqual(SNV.passes_filters, SNV.passes_filters_with_debug)
        
        # if the debug info is passed in, check that the debug filter function
        # got set correctly
        self.vcf_loader = LoadVCFs(total_trios, maf_tags, known_genes, set(), "1", "10000")
        self.assertEqual(SNV.passes_filters, SNV.passes_filters_with_debug)