Example #1
0
 def test_open_individual_with_mnvs(self):
     ''' check that open_individual() takes MNV annotations into account

     Two neighbouring SNVs are written to a gzipped VCF, and the assertions
     pin which one is returned with and without an mnvs mapping.
     '''

     vcf = make_vcf_header()
     for pos, consequence in ((1, 'splice_region_variant'),
             (2, 'missense_variant')):
         vcf.append(make_vcf_line(pos=pos, cq=consequence,
             extra='HGNC=ATRX;MAX_AF=0.0001'))

     path = os.path.join(self.temp_dir, "temp.vcf.gz")
     self.write_gzipped_vcf(path, vcf)

     person = Person('fam_id', 'sample', 'dad', 'mom', 'F', '2', path)

     # fields shared by both expected variants
     shared = {'chrom': "1", 'id': ".", 'ref': "G", 'alts': "T",
         'filter': "PASS", 'format': "DP:GT", 'sample': "50:0/1",
         'gender': "female"}
     mnv_var = SNV(position=1, mnv_code='modified_protein_altering_mnv',
         info="CQ=splice_region_variant;HGNC=ATRX;MAX_AF=0.0001", **shared)
     plain_var = SNV(position=2, mnv_code=None,
         info="CQ=missense_variant;HGNC=ATRX;MAX_AF=0.0001", **shared)

     # without an mnvs mapping, only the variant at position 2 is returned
     self.assertEqual(self.vcf_loader.open_individual(person), [plain_var])

     # with both sites given MNV codes, only the variant at position 1 is
     # returned
     mnvs = {('1', 1): 'modified_protein_altering_mnv',
         ('1', 2): 'modified_synonymous_mnv'}
     self.assertEqual(self.vcf_loader.open_individual(person, mnvs=mnvs),
         [mnv_var])
Example #2
0
 def test_open_individual_with_mnvs(self):
     ''' check that open_individual() takes MNV annotations into account

     Two neighbouring SNVs are written to a gzipped VCF, and the assertions
     pin which one is returned with and without an mnvs mapping.
     '''

     vcf = make_vcf_header()
     for pos, consequence in ((1, 'splice_region_variant'),
             (2, 'missense_variant')):
         vcf.append(make_vcf_line(pos=pos, cq=consequence,
             extra='HGNC=ATRX;MAX_AF=0.0001'))

     path = os.path.join(self.temp_dir, "temp.vcf.gz")
     write_gzipped_vcf(path, vcf)

     person = Person('fam_id', 'sample', 'dad', 'mom', 'F', '2', path)

     # fields shared by both expected variants
     shared = {'chrom': "1", 'id': ".", 'ref': "G", 'alts': "T",
         'filter': "PASS", 'format': "DP:GT", 'sample': "50:0/1",
         'gender': "female", 'qual': '1000'}
     mnv_var = SNV(position=1, mnv_code='modified_protein_altering_mnv',
         info="CQ=splice_region_variant;HGNC=ATRX;MAX_AF=0.0001", **shared)
     plain_var = SNV(position=2, mnv_code=None,
         info="CQ=missense_variant;HGNC=ATRX;MAX_AF=0.0001", **shared)

     # without an mnvs mapping, only the variant at position 2 is returned
     self.assertEqual(open_individual(person), [plain_var])

     # with both sites given MNV codes, only the variant at position 1 is
     # returned
     mnvs = {('1', 1): 'modified_protein_altering_mnv',
         ('1', 2): 'modified_synonymous_mnv'}
     self.assertEqual(open_individual(person, mnvs=mnvs), [mnv_var])
Example #3
0
 def test_open_individual(self):
     ''' check which variants open_individual() returns for a person
     '''

     # a missing individual gives an empty list
     self.assertEqual(open_individual(None), [])

     vcf = make_vcf_header()
     for pos, extra in ((1, 'HGNC=TEST;MAX_AF=0.0001'),
             (2, 'HGNC=ATRX;MAX_AF=0.0001')):
         vcf.append(make_vcf_line(pos=pos, extra=extra))

     path = os.path.join(self.temp_dir, "temp.vcf")
     write_temp_vcf(path, vcf)

     person = Person('fam_id', 'sample', 'dad', 'mom', 'F', '2', path)

     def expected_snv(position, info):
         # the expected variants only differ by position and INFO field
         return SNV(chrom="1", position=position, id=".", ref="G", alts="T",
             qual='1000', filter="PASS", info=info, format="DP:GT",
             sample="50:0/1", gender="female", mnv_code=None)

     test_var = expected_snv(1, "CQ=missense_variant;HGNC=TEST;MAX_AF=0.0001")
     atrx_var = expected_snv(2, "CQ=missense_variant;HGNC=ATRX;MAX_AF=0.0001")

     # by default only the ATRX variant is returned
     self.assertEqual(open_individual(person), [atrx_var])

     # sites passed in as child_variants are returned as well
     child_keys = {('1', 1), ('1', 2)}
     self.assertEqual(open_individual(person, child_variants=child_keys),
         [test_var, atrx_var])
Example #4
0
 def test_open_individual(self):
     ''' check which variants the loader's open_individual() returns
     '''

     # a missing individual gives an empty list
     self.assertEqual(self.vcf_loader.open_individual(None), [])

     vcf = make_vcf_header()
     for pos, extra in ((1, 'HGNC=TEST;MAX_AF=0.0001'),
             (2, 'HGNC=ATRX;MAX_AF=0.0001')):
         vcf.append(make_vcf_line(pos=pos, extra=extra))

     path = os.path.join(self.temp_dir, "temp.vcf")
     self.write_temp_vcf(path, vcf)

     person = Person('fam_id', 'sample', 'dad', 'mom', 'F', '2', path)

     def expected_snv(position, info):
         # the expected variants only differ by position and INFO field
         return SNV(chrom="1", position=position, id=".", ref="G", alts="T",
             filter="PASS", info=info, format="DP:GT", sample="50:0/1",
             gender="female", mnv_code=None)

     test_var = expected_snv(1, "CQ=missense_variant;HGNC=TEST;MAX_AF=0.0001")
     atrx_var = expected_snv(2, "CQ=missense_variant;HGNC=ATRX;MAX_AF=0.0001")

     # by default only the ATRX variant is returned
     self.assertEqual(self.vcf_loader.open_individual(person), [atrx_var])

     # once the loader holds child keys for both sites, and child_variants is
     # enabled, both variants are returned
     self.vcf_loader.child_keys = {('1', 1), ('1', 2)}
     self.assertEqual(
         self.vcf_loader.open_individual(person, child_variants=True),
         [test_var, atrx_var])
Example #5
0
 def make_vcf(person):
     # write a gzipped two-variant VCF named after the person, and return its
     # path. One of the lines would pass the default filtering.
     # NOTE(review): `self` is not a parameter, so it presumably comes from an
     # enclosing test method - confirm.
     lines = make_vcf_header()
     for pos, extra in ((1, 'HGNC=TEST;MAX_AF=0.0001'),
             (2, 'HGNC=ATRX;MAX_AF=0.0001')):
         lines.append(make_vcf_line(pos=pos, extra=extra))

     filename = "{}.vcf.gz".format(person)
     path = os.path.join(self.temp_dir, filename)
     write_gzipped_vcf(path, lines)
     return path
Example #6
0
 def make_vcf(person):
     # write a gzipped two-variant VCF named after the person, and return its
     # path. One of the lines would pass the default filtering.
     # NOTE(review): `self` is not a parameter, so it presumably comes from an
     # enclosing test method - confirm.
     lines = make_vcf_header()
     for pos, extra in ((1, 'HGNC=TEST;MAX_AF=0.0001'),
             (2, 'HGNC=ATRX;MAX_AF=0.0001')):
         lines.append(make_vcf_line(pos=pos, extra=extra))

     filename = "{}.vcf.gz".format(person)
     path = os.path.join(self.temp_dir, filename)
     self.write_gzipped_vcf(path, lines)
     return path
Example #7
0
 def test_get_mnv_candidates_catch_assertion_error(self):
     ''' check that get_mnv_candidates finds no candidates in this VCF

     The two adjacent variants are annotated with different protein
     positions, and the expected result is an empty dict.
     '''

     annotations = (
         (1, 'Protein_position=1;Codons=aaT/aaG'),
         (2, 'Protein_position=2;Codons=Att/Ctt'),
     )
     lines = make_vcf_header()
     for pos, extra in annotations:
         lines.append(make_vcf_line(chrom='1', pos=pos, extra=extra))
     self.write_vcf(lines)

     candidates = get_mnv_candidates(self.path)
     self.assertEqual(candidates, {})
Example #8
0
 def test_get_mnv_candidates(self):
     ''' check that get_mnv_candidates codes a pair of variants as an MNV

     Both variants are annotated with the same protein position, and both
     sites are expected to be coded as 'alternate_residue_mnv'.
     '''

     annotations = (
         (1, 'Protein_position=1;Codons=aaT/aaG'),
         (2, 'Protein_position=1;Codons=Aat/Cat'),
     )
     lines = make_vcf_header()
     for pos, extra in annotations:
         lines.append(make_vcf_line(chrom='1', pos=pos, extra=extra))
     self.write_vcf(lines)

     candidates = get_mnv_candidates(self.path)
     expected = {
         ('1', 1): 'alternate_residue_mnv',
         ('1', 2): 'alternate_residue_mnv',
     }
     self.assertEqual(candidates, expected)
Example #9
0
 def test_find_nearby_variants_separated(self):
     ''' check that find_nearby_variants() skips variants that are far apart
     '''

     # the two variants sit at positions 1 and 4
     lines = make_vcf_header()
     for pos in (1, 4):
         lines.append(make_vcf_line(pos=pos))
     self.write_vcf(lines)

     vcf = open_vcf(self.path)
     exclude_header(vcf)
     nearby = find_nearby_variants(vcf)
     self.assertEqual(nearby, [])
Example #10
0
 def test_find_nearby_variants(self):
     ''' check that find_nearby_variants() pairs up adjacent variants
     '''

     # the two variants sit at positions 1 and 2
     lines = make_vcf_header()
     for pos in (1, 2):
         lines.append(make_vcf_line(pos=pos))
     self.write_vcf(lines)

     vcf = open_vcf(self.path)
     exclude_header(vcf)
     nearby = find_nearby_variants(vcf)
     self.assertEqual(nearby, [[('1', 1), ('1', 2)]])
Example #11
0
 def test_find_nearby_variants_different_chroms(self):
     ''' test that find_nearby_variants() works correctly with successive
     variants on different chroms, but at the same position.
     '''

     # two variants at position 1, one on chrom 1 and one on chrom 2
     lines = make_vcf_header()
     lines.append(make_vcf_line(chrom='1', pos=1))
     lines.append(make_vcf_line(chrom='2', pos=1))
     # write the lines out, otherwise the VCF opened below would not contain
     # the variants defined for this test
     self.write_vcf(lines)

     vcf = open_vcf(self.path)
     exclude_header(vcf)
     self.assertEqual(find_nearby_variants(vcf), [])
Example #12
0
 def test_same_aa_different_positions(self):
     ''' check that same_aa() drops a pair annotated with different residues
     '''

     annotations = (
         (5, 'Protein_position=2'),
         (7, 'Protein_position=3'),
         (8, 'Protein_position=4'),
     )
     lines = make_vcf_header()
     for pos, extra in annotations:
         lines.append(make_vcf_line(pos=pos, extra=extra))
     self.write_vcf(lines)

     vcf = tabix.open(self.path)
     # the pair covers positions 7 and 8, annotated as protein positions 3
     # and 4
     pairs = [[('1', 7), ('1', 8)]]

     matched = same_aa(vcf, pairs)
     self.assertEqual(matched, [])
Example #13
0
 def test_find_nearby_variants_different_threshold(self):
     ''' test that find_nearby_variants() works correctly when we change the threshold distance.
     '''

     # get the default two variants
     lines = make_vcf_header()
     lines.append(make_vcf_line(pos=1))
     lines.append(make_vcf_line(pos=2))
     # write the lines out, otherwise the VCF opened below would not contain
     # the variants defined for this test
     self.write_vcf(lines)

     vcf = open_vcf(self.path)
     exclude_header(vcf)

     # using a lower threshold shouldn't allow any of the variants to pass
     self.assertEqual(find_nearby_variants(vcf, threshold=0), [])
Example #14
0
 def test_same_aa(self):
     ''' check that same_aa() keeps a pair annotated with the same residue
     '''

     # both variants are annotated with protein position 1
     lines = make_vcf_header()
     for pos in (2, 4):
         lines.append(make_vcf_line(pos=pos, extra='Protein_position=1'))
     self.write_vcf(lines)

     vcf = tabix.open(self.path)
     pairs = [[('1', 2), ('1', 4)]]

     matched = same_aa(vcf, pairs)
     self.assertEqual(matched, [[('1', 2), ('1', 4)]])
Example #15
0
 def test_find_nearby_variants_duplicate_position(self):
     ''' check find_nearby_variants() when a position occurs twice
     '''

     # variants at positions 1 and 2, plus a third variant which repeats
     # position 2
     lines = make_vcf_header()
     for pos in (1, 2, 2):
         lines.append(make_vcf_line(pos=pos))
     self.write_vcf(lines)

     vcf = open_vcf(self.path)
     exclude_header(vcf)
     nearby = find_nearby_variants(vcf)
     self.assertEqual(nearby, [[('1', 1), ('1', 2)]])
Example #16
0
 def test_same_aa_missing_protein_positions(self):
     ''' check same_aa() when a variant in the pair lacks a protein position
     '''

     # a variant without a listed protein position (i.e. residue number)
     # could be affecting the splice site, so a pair containing one can't be
     # used.
     lines = make_vcf_header()
     lines.extend([
         make_vcf_line(pos=5),
         make_vcf_line(pos=7),
         make_vcf_line(pos=8, extra='Protein_position=4'),
     ])
     self.write_vcf(lines)

     vcf = tabix.open(self.path)
     pairs = [[('1', 7), ('1', 8)]]

     matched = same_aa(vcf, pairs)
     self.assertEqual(matched, [])
Example #17
0
 def test_screen_pairs_nonstandard_pair(self):
     ''' check that screen_pairs() drops a 'pair' holding three variants
     '''

     lines = make_vcf_header()
     for pos in (2, 4, 5, 7, 8):
         lines.append(make_vcf_line(pos=pos))
     self.write_vcf(lines)

     vcf = tabix.open(self.path)
     # the first 'pair' has three variants in it, and 'pairs' where n != 2
     # are excluded, which leaves only the second pair.
     triple = [('1', 2), ('1', 4), ('1', 5)]
     true_pair = [('1', 7), ('1', 8)]
     pairs = [triple, true_pair]
     self.assertEqual(screen_pairs(vcf, pairs, is_not_indel),
         [[('1', 7), ('1', 8)]])
 def test_open_individual_male_het_chrx(self):
     """ check that open_individual() skips a het chrX call in a male
     """

     # the sub-functions are tested elsewhere; this test only checks that an
     # invalid variant is passed over rather than added to the variants list

     het_line = make_vcf_line(chrom='X', pos=1, genotype='0/1',
         extra='HGNC=TEST;MAX_AF=0.0001')
     vcf = make_vcf_header() + [het_line]

     path = os.path.join(self.temp_dir, "temp.vcf")
     write_temp_vcf(path, vcf)

     person = Person('fam_id', 'sample', 'dad', 'mom', 'M', '2', path)

     variants = open_individual(person)
     self.assertEqual(variants, [])
Example #19
0
 def test_open_individual_male_het_chrx(self):
     """ check that open_individual() skips a het chrX call in a male
     """

     # the sub-functions are tested elsewhere; this test only checks that an
     # invalid variant is passed over rather than added to the variants list

     het_line = make_vcf_line(chrom='X', pos=1, genotype='0/1',
         extra='HGNC=TEST;MAX_AF=0.0001')
     vcf = make_vcf_header() + [het_line]

     path = os.path.join(self.temp_dir, "temp.vcf")
     write_temp_vcf(path, vcf)

     person = Person('fam_id', 'sample', 'dad', 'mom', 'M', '2', path)

     variants = open_individual(person)
     self.assertEqual(variants, [])
 def test__write_vcf(self):
     ''' check that _write_vcf() works correctly

     The lines are written with _write_vcf(), read back through gzip, and
     compared with the input lines.
     '''

     # only the temporary file's name is needed, so close the handle straight
     # away rather than leaving it open while _write_vcf() writes to the same
     # path. delete=False keeps the file on disk after the handle closes.
     with tempfile.NamedTemporaryFile(suffix='.vcf.gz', dir=self.temp_dir,
             delete=False) as temp:
         path = temp.name

     lines = make_vcf_header() +  ['X\t150\t.\tA\tG\t50\tPASS\tHGNC=TEST;'
         'CQ=missense_variant;random_tag;EUR_AF=0.0005;'
         'ClinicalFilterGeneInheritance=Monoallelic;'
         'ClinicalFilterType=single_variant;'
         'ClinicalFilterReportableHGNC=TEST\tGT:DP:INHERITANCE:'
         'INHERITANCE_GENOTYPE\t0/1:50:deNovo:1,0,0\n']

     _write_vcf(path, lines)

     # gzip yields bytes in this mode, so decode before comparing
     with gzip.open(path, 'r') as handle:
         vcf = [ x.decode() for x in handle ]

     self.assertEqual(lines, vcf)
Example #21
0
 def test_analyse_trio(self):
     ''' test that analyse_trio() works correctly

     A single-variant VCF is written for each trio member, where only the
     child carries the alt allele and the de novo annotations. The family
     is then analysed, and the result compared with the expected entry.
     '''

     # construct the VCFs for the trio members
     paths = {}
     for member in ['child', 'mom', 'dad']:
         vcf = make_vcf_header()

         geno, pp_dnm = '0/0', ''
         if member == 'child':
             geno, pp_dnm = '0/1', ';DENOVO-SNP;PP_DNM=1'

         vcf.append(make_vcf_line(genotype=geno, extra='HGNC=ARID1B' + pp_dnm))

         # write the VCF data to a file. Leaving the with block closes (and
         # so flushes) the handle, while delete=False keeps the file on disk
         # for the analysis below.
         with tempfile.NamedTemporaryFile(dir=self.temp_dir, delete=False,
                 suffix='.vcf') as handle:
             handle.writelines(x.encode('utf8') for x in vcf)

         paths[member] = handle.name

     # create a Family object, so we can load the data from the trio's VCFs
     fam_id = 'fam01'
     child = Person(fam_id, 'child', 'dad', 'mom', 'female', '2', paths['child'])
     mom = Person(fam_id, 'mom', '0', '0', 'female', '1', paths['mom'])
     dad = Person(fam_id, 'dad', '0', '0', 'male', '1', paths['dad'])
     family = Family(fam_id, [child], mom, dad)

     self.assertEqual(self.finder.analyse_trio(family),
         [(TrioGenotypes(chrom="1", pos=1,
             child=SNV(chrom="1", position=1, id=".", ref="G", alts="T",
                 filter="PASS",
                 info="CQ=missense_variant;DENOVO-SNP;HGNC=ARID1B;PP_DNM=1",
                 format="DP:GT", sample="50:0/1", gender="female", mnv_code=None),
             mother=SNV(chrom="1", position=1, id=".", ref="G", alts="T",
                 filter="PASS", info="CQ=missense_variant;HGNC=ARID1B",
                 format="DP:GT", sample="50:0/0", gender="female", mnv_code=None),
             father=SNV(chrom="1", position=1, id=".", ref="G", alts="T",
                 filter="PASS", info="CQ=missense_variant;HGNC=ARID1B",
                 format="DP:GT", sample="50:0/0", gender="male", mnv_code=None)),
         ['single_variant'], ['Monoallelic'], ['ARID1B'])])
Example #22
0
 def test_get_matches(self):
     ''' check that get_matches gives the parsed variants for a pair
     '''

     lines = make_vcf_header()
     for pos in (1, 2, 4, 5):
         lines.append(make_vcf_line(pos=pos))
     self.write_vcf(lines)

     vcf = tabix.open(self.path)
     pair = [('1', 2), ('1', 4)]

     # the expected variants are parsed from VCF lines at the pair's positions
     expected = []
     for pos in (2, 4):
         fields = make_vcf_line(pos=pos).split('\t')
         expected.append(parse_vcf_line(fields, self.Variant))

     matches = list(get_matches(vcf, pair))
     self.assertEqual(matches, expected)
Example #23
0
    def test__write_vcf(self):
        ''' check that _write_vcf() works correctly

        The lines are written with _write_vcf(), read back through gzip, and
        compared with the input lines.
        '''

        # only the temporary file's name is needed, so close the handle
        # straight away rather than leaving it open while _write_vcf() writes
        # to the same path. delete=False keeps the file on disk afterwards.
        with tempfile.NamedTemporaryFile(suffix='.vcf.gz',
                                         dir=self.temp_dir,
                                         delete=False) as temp:
            path = temp.name

        lines = make_vcf_header() + [
            'X\t150\t.\tA\tG\t50\tPASS\tHGNC=TEST;'
            'CQ=missense_variant;random_tag;EUR_AF=0.0005;'
            'ClinicalFilterGeneInheritance=Monoallelic;'
            'ClinicalFilterType=single_variant;'
            'ClinicalFilterReportableHGNC=TEST\tGT:DP:INHERITANCE:'
            'INHERITANCE_GENOTYPE\t0/1:50:deNovo:1,0,0\n'
        ]

        _write_vcf(path, lines)

        # gzip yields bytes in this mode, so decode before comparing
        with gzip.open(path, 'r') as handle:
            vcf = [x.decode() for x in handle]

        self.assertEqual(lines, vcf)
Example #24
0
 def test_screen_pairs(self):
     ''' check that screen_pairs() keeps pairs which pass the filter function
     '''

     lines = make_vcf_header()
     for pos in (1, 2, 4, 5, 7, 8):
         lines.append(make_vcf_line(pos=pos))
     self.write_vcf(lines)

     vcf = tabix.open(self.path)
     pairs = [[('1', 2), ('1', 4)], [('1', 7), ('1', 8)]]

     # both pairs are expected back from the indel screen
     screened = screen_pairs(vcf, pairs, is_not_indel)
     self.assertEqual(screened, pairs)

     # and likewise from the coding screen
     screened = screen_pairs(vcf, pairs, is_coding)
     self.assertEqual(screened, pairs)
Example #25
0
    def test__get_vcf_lines(self):
        """ check that _get_vcf_lines() gives the expected header and variant
        """

        # write VCFs for the trio members, so that the VCF provenance
        # information can be picked up
        family = self.trio
        for member in (family.child, family.mother, family.father):
            header = make_vcf_header()
            with open(member.get_path(), 'w') as handle:
                handle.writelines(header)

        # the header lines which describe the added INFO and FORMAT fields
        field_lines = [
            "##fileformat=VCFv4.1\n",
            '##fileDate=2014-01-01\n',
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n',
            '##INFO=<ID=ClinicalFilterType,Number=.,Type=String,Description="The '
            'type of clinical filter that passed this variant.">\n',
            '##INFO=<ID=ClinicalFilterGeneInheritance,Number=.,Type=String,'
            'Description="The inheritance mode (Monoallelic, Biallelic etc) '
            'under which the variant was found.">\n',
            '##INFO=<ID=ClinicalFilterReportableHGNC,Number=.,Type=String,'
            'Description="The HGNC symbol which the variant was identified '
            'as being reportable for.">\n',
            '##INFO=<ID=CANDIDATE_MNV,Number=.,Type=String,'
            'Description="Code for candidate multinucleotide variants. '
            'Field is only included if the translated MNV differs from '
            'both of the SNV translations. There are five possibilities: '
            'alternate_residue_mnv=MNV translates to a residue not in SNVs, '
            'masked_stop_gain_mnv=MNV masks a stop gain, '
            'modified_stop_gained_mnv=MNV introduces a stop gain, '
            'modified_synonymous_mnv=MNV reverts to synonymous, '
            'modified_protein_altering_mnv=synonymous SNVs but missense '
            'MNV.">\n',
            '##FORMAT=<ID=INHERITANCE_GENOTYPE,Number=.,Type=String,'
            'Description="The 012 coded genotypes for a trio (child, '
            'mother, father).">\n',
            '##FORMAT=<ID=INHERITANCE,Number=.,Type=String,Description="'
            'The inheritance of the variant in the trio (biparental, '
            'paternal, maternal, deNovo).">\n',
        ]

        # the header lines which describe the filtering run
        run_lines = [
            "##ClinicalFilterRunDate={0}\n".format(datetime.date.today()),
            "##ClinicalFilterVersion={}\n".format(clinicalfilter.__version__),
            "##ClinicalFilterHistory=single_variant,compound_het\n",
        ]

        # the provenance lines for each trio member, then the column names
        provenance_lines = [
            "##UberVCF_proband_Id=child\n",
            "##UberVCF_proband_Checksum=0a35d4e98a0f07153584f255ead8a2014ebacad0\n",
            "##UberVCF_proband_Basename=child.vcf\n",
            "##UberVCF_proband_Date=2014-01-01\n",
            "##UberVCF_maternal_Id=mother\n",
            "##UberVCF_maternal_Checksum=0a35d4e98a0f07153584f255ead8a2014ebacad0\n",
            "##UberVCF_maternal_Basename=mother.vcf\n",
            "##UberVCF_maternal_Date=2014-01-01\n",
            "##UberVCF_paternal_Id=father\n",
            "##UberVCF_paternal_Checksum=0a35d4e98a0f07153584f255ead8a2014ebacad0\n",
            "##UberVCF_paternal_Basename=father.vcf\n",
            "##UberVCF_paternal_Date=2014-01-01\n",
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample\n",
        ]

        expected_header = field_lines + run_lines + provenance_lines

        # define what the default variant vcf line will become
        expected_variant = [
            'X\t150\t.\tA\tG\t50\tPASS\tCQ=missense_variant;'
            'ClinicalFilterGeneInheritance=Monoallelic;'
            'ClinicalFilterReportableHGNC=TEST;ClinicalFilterType=single_variant;'
            'DENOVO-SNP;HGNC=TEST;HGNC_ID=1001;MAX_AF=0.0005\tGT:DP:INHERITANCE:'
            'INHERITANCE_GENOTYPE\t0/1:50:deNovo:1,0,0\n'
        ]

        # check that a list of one variant produces the correct VCF output.
        # NOTE: this doesn't check against CNVs, which can change the
        # INHERITANCE_GENOTYPE flag, nor does it test a larger list of variants
        trio_variant = self.variants[0]
        candidate = (trio_variant, ["single_variant"], ["Monoallelic"], ["TEST"])
        vcf_fields = [
            'X', '150', '.', 'A', 'G', '50', 'PASS',
            'HGNC=TEST;HGNC_ID=1001;CQ=missense_variant;EUR_AF=0.0005',
            'GT:DP', '0/1:50',
        ]
        trio_variant.child.add_vcf_line(vcf_fields)

        observed = list(_get_vcf_lines([candidate], family))
        self.assertEqual(observed, expected_header + expected_variant)
Example #26
0
    def test__make_vcf_header(self):
        """ check that _make_vcf_header() extends the header as expected
        """

        # define the initial header lines
        header = make_vcf_header()

        # define the VCF provenances, as (checksum, basename, date) for the
        # proband, mother and father
        provenance = [
            ("checksum", "proband.calls.date.vcf.gz", "2014-01-01"),
            ("checksum", "mother.calls.date.vcf.gz", "2014-01-02"),
            ("checksum", "father.calls.date.vcf.gz", "2014-01-03"),
        ]

        # the header lines which describe the added INFO and FORMAT fields
        field_lines = [
            "##fileformat=VCFv4.1\n",
            '##fileDate=2014-01-01\n',
            "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n",
            '##INFO=<ID=ClinicalFilterType,Number=.,Type=String,'
            'Description="The type of clinical filter that passed this '
            'variant.">\n',
            '##INFO=<ID=ClinicalFilterGeneInheritance,Number=.,Type=String,'
            'Description="The inheritance mode (Monoallelic, Biallelic '
            'etc) under which the variant was found.">\n',
            '##INFO=<ID=ClinicalFilterReportableHGNC,Number=.,Type=String,'
            'Description="The HGNC symbol which the variant was identified '
            'as being reportable for.">\n',
            '##INFO=<ID=CANDIDATE_MNV,Number=.,Type=String,'
            'Description="Code for candidate multinucleotide variants. '
            'Field is only included if the translated MNV differs from '
            'both of the SNV translations. There are five possibilities: '
            'alternate_residue_mnv=MNV translates to a residue not in SNVs, '
            'masked_stop_gain_mnv=MNV masks a stop gain, '
            'modified_stop_gained_mnv=MNV introduces a stop gain, '
            'modified_synonymous_mnv=MNV reverts to synonymous, '
            'modified_protein_altering_mnv=synonymous SNVs but missense '
            'MNV.">\n',
            '##FORMAT=<ID=INHERITANCE_GENOTYPE,Number=.,Type=String,'
            'Description="The 012 coded genotypes for a trio (child, '
            'mother, father).">\n',
            '##FORMAT=<ID=INHERITANCE,Number=.,Type=String,Description="The '
            'inheritance of the variant in the trio (biparental, paternal, '
            'maternal, deNovo).">\n',
        ]

        # the header lines which describe the filtering run
        run_lines = [
            "##ClinicalFilterRunDate={0}\n".format(datetime.date.today()),
            "##ClinicalFilterVersion={}\n".format(clinicalfilter.__version__),
            "##ClinicalFilterHistory=single_variant,compound_het\n",
        ]

        # the provenance lines for each trio member, then the column names
        provenance_lines = [
            "##UberVCF_proband_Id=proband\n",
            "##UberVCF_proband_Checksum=checksum\n",
            "##UberVCF_proband_Basename=proband.calls.date.vcf.gz\n",
            "##UberVCF_proband_Date=2014-01-01\n",
            "##UberVCF_maternal_Id=mother\n",
            "##UberVCF_maternal_Checksum=checksum\n",
            "##UberVCF_maternal_Basename=mother.calls.date.vcf.gz\n",
            "##UberVCF_maternal_Date=2014-01-02\n",
            "##UberVCF_paternal_Id=father\n",
            "##UberVCF_paternal_Checksum=checksum\n",
            "##UberVCF_paternal_Basename=father.calls.date.vcf.gz\n",
            "##UberVCF_paternal_Date=2014-01-03\n",
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample\n",
        ]

        processed_header = field_lines + run_lines + provenance_lines

        # check that the standard function returns the expected value.
        # NOTE: the output isn't checked for when self.known_genes_date is not
        # None, nor for whether the _clinicalFilterVersion is available
        observed = _make_vcf_header(header, provenance)
        self.assertEqual(observed, processed_header)
 def test__make_vcf_header(self):
     """ check that _make_vcf_header() extends the header as expected
     """

     # define the initial header lines
     header = make_vcf_header()

     # define the VCF provenances, as (checksum, basename, date) for the
     # proband, mother and father
     provenance = [
         ("checksum", "proband.calls.date.vcf.gz", "2014-01-01"),
         ("checksum", "mother.calls.date.vcf.gz", "2014-01-02"),
         ("checksum", "father.calls.date.vcf.gz", "2014-01-03"),
     ]

     # the header lines which describe the added INFO and FORMAT fields
     field_lines = [
         "##fileformat=VCFv4.1\n",
         '##fileDate=2014-01-01\n',
         "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n",
         '##INFO=<ID=ClinicalFilterType,Number=.,Type=String,'
             'Description="The type of clinical filter that passed this '
             'variant.">\n',
         '##INFO=<ID=ClinicalFilterGeneInheritance,Number=.,Type=String,'
             'Description="The inheritance mode (Monoallelic, Biallelic '
             'etc) under which the variant was found.">\n',
         '##INFO=<ID=ClinicalFilterReportableHGNC,Number=.,Type=String,'
             'Description="The HGNC symbol which the variant was identified '
             'as being reportable for.">\n',
         '##INFO=<ID=CANDIDATE_MNV,Number=.,Type=String,'
             'Description="Code for candidate multinucleotide variants. '
             'Field is only included if the translated MNV differs from '
             'both of the SNV translations. There are five possibilities: '
             'alternate_residue_mnv=MNV translates to a residue not in SNVs, '
             'masked_stop_gain_mnv=MNV masks a stop gain, '
             'modified_stop_gained_mnv=MNV introduces a stop gain, '
             'modified_synonymous_mnv=MNV reverts to synonymous, '
             'modified_protein_altering_mnv=synonymous SNVs but missense '
             'MNV.">\n',
         '##FORMAT=<ID=INHERITANCE_GENOTYPE,Number=.,Type=String,'
             'Description="The 012 coded genotypes for a trio (child, '
             'mother, father).">\n',
         '##FORMAT=<ID=INHERITANCE,Number=.,Type=String,Description="The '
             'inheritance of the variant in the trio (biparental, paternal, '
             'maternal, deNovo).">\n',
     ]

     # the header lines which describe the filtering run
     run_lines = [
         "##ClinicalFilterRunDate={0}\n".format(datetime.date.today()),
         "##ClinicalFilterVersion={}\n".format(clinicalfilter.__version__),
         "##ClinicalFilterHistory=single_variant,compound_het\n",
     ]

     # the provenance lines for each trio member, then the column names
     provenance_lines = [
         "##UberVCF_proband_Id=proband\n",
         "##UberVCF_proband_Checksum=checksum\n",
         "##UberVCF_proband_Basename=proband.calls.date.vcf.gz\n",
         "##UberVCF_proband_Date=2014-01-01\n",
         "##UberVCF_maternal_Id=mother\n",
         "##UberVCF_maternal_Checksum=checksum\n",
         "##UberVCF_maternal_Basename=mother.calls.date.vcf.gz\n",
         "##UberVCF_maternal_Date=2014-01-02\n",
         "##UberVCF_paternal_Id=father\n",
         "##UberVCF_paternal_Checksum=checksum\n",
         "##UberVCF_paternal_Basename=father.calls.date.vcf.gz\n",
         "##UberVCF_paternal_Date=2014-01-03\n",
         "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample\n",
     ]

     processed_header = field_lines + run_lines + provenance_lines

     # check that the standard function returns the expected value.
     # NOTE: the output isn't checked for when self.known_genes_date is not
     # None, nor for whether the _clinicalFilterVersion is available
     observed = _make_vcf_header(header, provenance)
     self.assertEqual(observed, processed_header)
    def test_analyse_trio(self):
        ''' test that analyse_trio() works correctly

        A single-variant VCF is written for each trio member, where only the
        child carries the alt allele and the de novo annotations. The family
        is then analysed, and the result compared with the expected entry.
        '''

        # construct the VCFs for the trio members
        paths = {}
        for member in ['child', 'mom', 'dad']:
            vcf = make_vcf_header()

            geno, pp_dnm = '0/0', ''
            if member == 'child':
                geno, pp_dnm = '0/1', ';DENOVO-SNP;PP_DNM=1'

            vcf.append(
                make_vcf_line(genotype=geno, extra='HGNC=ARID1B' + pp_dnm))

            # write the VCF data to a file. Leaving the with block closes
            # (and so flushes) the handle, while delete=False keeps the file
            # on disk for the analysis below.
            with tempfile.NamedTemporaryFile(dir=self.temp_dir,
                                             delete=False,
                                             suffix='.vcf') as handle:
                handle.writelines(x.encode('utf8') for x in vcf)

            paths[member] = handle.name

        # create a Family object, so we can load the data from the trio's VCFs
        fam_id = 'fam01'
        child = Person(fam_id, 'child', 'dad', 'mom', 'female', '2',
                       paths['child'])
        mom = Person(fam_id, 'mom', '0', '0', 'female', '1', paths['mom'])
        dad = Person(fam_id, 'dad', '0', '0', 'male', '1', paths['dad'])
        family = Family(fam_id, [child], mom, dad)

        self.assertEqual(self.finder.analyse_trio(family), [(TrioGenotypes(
            chrom="1",
            pos=1,
            child=SNV(
                chrom="1",
                position=1,
                id=".",
                ref="G",
                alts="T",
                qual='1000',
                filter="PASS",
                info="CQ=missense_variant;DENOVO-SNP;HGNC=ARID1B;PP_DNM=1",
                format="DP:GT",
                sample="50:0/1",
                gender="female",
                mnv_code=None),
            mother=SNV(chrom="1",
                       position=1,
                       id=".",
                       ref="G",
                       alts="T",
                       qual='1000',
                       filter="PASS",
                       info="CQ=missense_variant;HGNC=ARID1B",
                       format="DP:GT",
                       sample="50:0/0",
                       gender="female",
                       mnv_code=None),
            father=SNV(chrom="1",
                       position=1,
                       id=".",
                       ref="G",
                       alts="T",
                       qual='1000',
                       filter="PASS",
                       info="CQ=missense_variant;HGNC=ARID1B",
                       format="DP:GT",
                       sample="50:0/0",
                       gender="male",
                       mnv_code=None)), ['single_variant'], [
                           'Monoallelic', 'Mosaic'
                       ], ['ARID1B'])])
 def test__get_vcf_lines(self):
     """ _get_vcf_lines() yields the full VCF header followed by the variant

     One flagged variant is passed in for the trio, and the lines produced
     are compared against a hand-built header plus a single variant line.
     """

     trio = self.trio

     # Each trio member needs a VCF on disk (header only), so that the VCF
     # provenance information can be picked up for the output header.
     for person in [trio.child, trio.mother, trio.father]:
         header_lines = make_vcf_header()
         with open(person.get_path(), 'w') as output:
             output.writelines(header_lines)

     # --- expected header, assembled group by group in output order ---
     preamble = [
         "##fileformat=VCFv4.1\n",
         '##fileDate=2014-01-01\n',
         '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n',
     ]

     info_fields = [
         '##INFO=<ID=ClinicalFilterType,Number=.,Type=String,Description="The '
         'type of clinical filter that passed this variant.">\n',
         '##INFO=<ID=ClinicalFilterGeneInheritance,Number=.,Type=String,'
         'Description="The inheritance mode (Monoallelic, Biallelic etc) '
         'under which the variant was found.">\n',
         '##INFO=<ID=ClinicalFilterReportableHGNC,Number=.,Type=String,'
         'Description="The HGNC symbol which the variant was identified '
         'as being reportable for.">\n',
         '##INFO=<ID=CANDIDATE_MNV,Number=.,Type=String,'
         'Description="Code for candidate multinucleotide variants. '
         'Field is only included if the translated MNV differs from '
         'both of the SNV translations. There are five possibilities: '
         'alternate_residue_mnv=MNV translates to a residue not in SNVs, '
         'masked_stop_gain_mnv=MNV masks a stop gain, '
         'modified_stop_gained_mnv=MNV introduces a stop gain, '
         'modified_synonymous_mnv=MNV reverts to synonymous, '
         'modified_protein_altering_mnv=synonymous SNVs but missense '
         'MNV.">\n',
     ]

     format_fields = [
         '##FORMAT=<ID=INHERITANCE_GENOTYPE,Number=.,Type=String,'
         'Description="The 012 coded genotypes for a trio (child, '
         'mother, father).">\n',
         '##FORMAT=<ID=INHERITANCE,Number=.,Type=String,Description="'
         'The inheritance of the variant in the trio (biparental, '
         'paternal, maternal, deNovo).">\n',
     ]

     run_details = [
         "##ClinicalFilterRunDate={0}\n".format(datetime.date.today()),
         "##ClinicalFilterVersion={}\n".format(clinicalfilter.__version__),
         "##ClinicalFilterHistory=single_variant,compound_het\n",
     ]

     # the same four provenance lines are expected for each trio member;
     # all three header-only VCFs share one expected checksum value
     checksum = "0a35d4e98a0f07153584f255ead8a2014ebacad0"
     provenance = []
     for role, sample_id in (("proband", "child"), ("maternal", "mother"),
             ("paternal", "father")):
         provenance.extend([
             "##UberVCF_{0}_Id={1}\n".format(role, sample_id),
             "##UberVCF_{0}_Checksum={1}\n".format(role, checksum),
             "##UberVCF_{0}_Basename={1}.vcf\n".format(role, sample_id),
             "##UberVCF_{0}_Date=2014-01-01\n".format(role),
         ])

     column_names = [
         "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample\n"]

     expected_header = (preamble + info_fields + format_fields +
         run_details + provenance + column_names)

     # --- expected line for the default variant ---
     expected_info = ';'.join([
         'CQ=missense_variant',
         'ClinicalFilterGeneInheritance=Monoallelic',
         'ClinicalFilterReportableHGNC=TEST',
         'ClinicalFilterType=single_variant',
         'DENOVO-SNP',
         'HGNC=TEST',
         'HGNC_ID=1001',
         'MAX_AF=0.0005',
     ])
     expected_variant = '\t'.join([
         'X', '150', '.', 'A', 'G', '50', 'PASS', expected_info,
         'GT:DP:INHERITANCE:INHERITANCE_GENOTYPE', '0/1:50:deNovo:1,0,0',
     ]) + '\n'

     # Only a single-variant list is exercised here. CNVs (which can change
     # the INHERITANCE_GENOTYPE flag) and longer variant lists are not
     # covered by this test.
     trio_variant = self.variants[0]
     flagged = (trio_variant, ["single_variant"], ["Monoallelic"], ["TEST"])
     flagged[0].child.add_vcf_line(['X', '150', '.', 'A', 'G', '50',
         'PASS', 'HGNC=TEST;HGNC_ID=1001;CQ=missense_variant;EUR_AF=0.0005',
         'GT:DP', '0/1:50'])

     observed = list(_get_vcf_lines([flagged], trio))
     self.assertEqual(observed, expected_header + [expected_variant])