def test_open_individual_with_mnvs(self):
    ''' test that open_individual works with MNVs
    
    Writes two adjacent variants to a gzipped VCF, then checks which one is
    returned with and without a dict of MNV codes.
    '''
    
    extra = 'HGNC=ATRX;MAX_AF=0.0001'
    lines = make_vcf_header()
    lines.append(make_vcf_line(pos=1, cq='splice_region_variant', extra=extra))
    lines.append(make_vcf_line(pos=2, cq='missense_variant', extra=extra))
    
    vcf_path = os.path.join(self.temp_dir, "temp.vcf.gz")
    self.write_gzipped_vcf(vcf_path, lines)
    
    person = Person('fam_id', 'sample', 'dad', 'mom', 'F', '2', vcf_path)
    
    # fields shared by both of the expected variants
    shared = {'chrom': "1", 'id': ".", 'ref': "G", 'alts': "T",
        'filter': "PASS", 'format': "DP:GT", 'sample': "50:0/1",
        'gender': "female"}
    
    var1 = SNV(position=1, mnv_code='modified_protein_altering_mnv',
        info="CQ=splice_region_variant;HGNC=ATRX;MAX_AF=0.0001", **shared)
    var2 = SNV(position=2, mnv_code=None,
        info="CQ=missense_variant;HGNC=ATRX;MAX_AF=0.0001", **shared)
    
    # by default only one variant passes
    observed = self.vcf_loader.open_individual(person)
    self.assertEqual(observed, [var2])
    
    # if we include MNVs, then the passing variants swap
    mnv_codes = {('1', 1): 'modified_protein_altering_mnv',
        ('1', 2): 'modified_synonymous_mnv'}
    observed = self.vcf_loader.open_individual(person, mnvs=mnv_codes)
    self.assertEqual(observed, [var1])
def test_open_individual_with_mnvs(self):
    ''' test that open_individual works with MNVs
    
    Writes two adjacent variants to a gzipped VCF, then checks which one is
    returned with and without a dict of MNV codes.
    '''
    
    extra = 'HGNC=ATRX;MAX_AF=0.0001'
    lines = make_vcf_header()
    lines.append(make_vcf_line(pos=1, cq='splice_region_variant', extra=extra))
    lines.append(make_vcf_line(pos=2, cq='missense_variant', extra=extra))
    
    vcf_path = os.path.join(self.temp_dir, "temp.vcf.gz")
    write_gzipped_vcf(vcf_path, lines)
    
    person = Person('fam_id', 'sample', 'dad', 'mom', 'F', '2', vcf_path)
    
    # fields shared by both of the expected variants
    shared = {'chrom': "1", 'id': ".", 'ref': "G", 'alts': "T",
        'filter': "PASS", 'format': "DP:GT", 'sample': "50:0/1",
        'gender': "female", 'qual': '1000'}
    
    var1 = SNV(position=1, mnv_code='modified_protein_altering_mnv',
        info="CQ=splice_region_variant;HGNC=ATRX;MAX_AF=0.0001", **shared)
    var2 = SNV(position=2, mnv_code=None,
        info="CQ=missense_variant;HGNC=ATRX;MAX_AF=0.0001", **shared)
    
    # by default only one variant passes
    self.assertEqual(open_individual(person), [var2])
    
    # if we include MNVs, then the passing variants swap
    mnv_codes = {('1', 1): 'modified_protein_altering_mnv',
        ('1', 2): 'modified_synonymous_mnv'}
    self.assertEqual(open_individual(person, mnvs=mnv_codes), [var1])
def test_open_individual(self):
    ''' test that open_individual() works correctly
    
    Covers a missing person, the default filtering, and a set of variant
    keys passed in as child_variants.
    '''
    
    # missing individual returns empty list
    self.assertEqual(open_individual(None), [])
    
    lines = make_vcf_header()
    lines.extend([
        make_vcf_line(pos=1, extra='HGNC=TEST;MAX_AF=0.0001'),
        make_vcf_line(pos=2, extra='HGNC=ATRX;MAX_AF=0.0001'),
    ])
    
    vcf_path = os.path.join(self.temp_dir, "temp.vcf")
    write_temp_vcf(vcf_path, lines)
    
    person = Person('fam_id', 'sample', 'dad', 'mom', 'F', '2', vcf_path)
    
    # fields shared by both of the expected variants
    shared = {'chrom': "1", 'id': ".", 'ref': "G", 'alts': "T",
        'qual': '1000', 'filter': "PASS", 'format': "DP:GT",
        'sample': "50:0/1", 'gender': "female", 'mnv_code': None}
    
    var1 = SNV(position=1,
        info="CQ=missense_variant;HGNC=TEST;MAX_AF=0.0001", **shared)
    var2 = SNV(position=2,
        info="CQ=missense_variant;HGNC=ATRX;MAX_AF=0.0001", **shared)
    
    # only the second variant is expected under the default filtering
    self.assertEqual(open_individual(person), [var2])
    
    # define a set of variants to automatically pass, and check that these
    # variants pass.
    child_keys = {('1', 1), ('1', 2)}
    observed = open_individual(person, child_variants=child_keys)
    self.assertEqual(observed, [var1, var2])
def test_open_individual(self):
    ''' test that open_individual() works correctly
    
    Covers a missing person, the default filtering, and the loader's
    child_keys being used via child_variants=True.
    '''
    
    loader = self.vcf_loader
    
    # missing individual returns empty list
    self.assertEqual(loader.open_individual(None), [])
    
    lines = make_vcf_header()
    lines.extend([
        make_vcf_line(pos=1, extra='HGNC=TEST;MAX_AF=0.0001'),
        make_vcf_line(pos=2, extra='HGNC=ATRX;MAX_AF=0.0001'),
    ])
    
    vcf_path = os.path.join(self.temp_dir, "temp.vcf")
    self.write_temp_vcf(vcf_path, lines)
    
    person = Person('fam_id', 'sample', 'dad', 'mom', 'F', '2', vcf_path)
    
    # fields shared by both of the expected variants
    shared = {'chrom': "1", 'id': ".", 'ref': "G", 'alts': "T",
        'filter': "PASS", 'format': "DP:GT", 'sample': "50:0/1",
        'gender': "female", 'mnv_code': None}
    
    var1 = SNV(position=1,
        info="CQ=missense_variant;HGNC=TEST;MAX_AF=0.0001", **shared)
    var2 = SNV(position=2,
        info="CQ=missense_variant;HGNC=ATRX;MAX_AF=0.0001", **shared)
    
    # only the second variant is expected under the default filtering
    self.assertEqual(loader.open_individual(person), [var2])
    
    # define a set of variants to automatically pass, and check that these
    # variants pass.
    loader.child_keys = {('1', 1), ('1', 2)}
    observed = loader.open_individual(person, child_variants=True)
    self.assertEqual(observed, [var1, var2])
def make_vcf(person):
    ''' write a two-variant gzipped VCF named after person, return its path
    '''
    
    # NOTE: relies on `self` from the enclosing scope for the temp directory
    vcf_path = os.path.join(self.temp_dir, "{}.vcf.gz".format(person))
    
    # make a VCF, where one line would pass the default filtering
    contents = make_vcf_header()
    contents.extend([
        make_vcf_line(pos=1, extra='HGNC=TEST;MAX_AF=0.0001'),
        make_vcf_line(pos=2, extra='HGNC=ATRX;MAX_AF=0.0001'),
    ])
    
    write_gzipped_vcf(vcf_path, contents)
    
    return vcf_path
def make_vcf(person):
    ''' write a two-variant gzipped VCF named after person, return its path
    '''
    
    # NOTE: relies on `self` from the enclosing scope, both for the temp
    # directory and for the gzip writer
    vcf_path = os.path.join(self.temp_dir, "{}.vcf.gz".format(person))
    
    # make a VCF, where one line would pass the default filtering
    contents = make_vcf_header()
    contents.extend([
        make_vcf_line(pos=1, extra='HGNC=TEST;MAX_AF=0.0001'),
        make_vcf_line(pos=2, extra='HGNC=ATRX;MAX_AF=0.0001'),
    ])
    
    self.write_gzipped_vcf(vcf_path, contents)
    
    return vcf_path
def test_get_mnv_candidates_catch_assertion_error(self):
    ''' check that get_mnv_candidates gives no candidates for this pair
    
    The two adjacent variants are annotated with different protein positions
    (1 and 2), and the expected result is an empty dict.
    NOTE(review): the test name suggests an AssertionError is caught inside
    get_mnv_candidates - confirm against the implementation.
    '''
    
    variants = [
        make_vcf_line(chrom='1', pos=1,
            extra='Protein_position=1;Codons=aaT/aaG'),
        make_vcf_line(chrom='1', pos=2,
            extra='Protein_position=2;Codons=Att/Ctt'),
    ]
    self.write_vcf(make_vcf_header() + variants) if False else None
    
    lines = make_vcf_header()
    lines.extend(variants)
    self.write_vcf(lines)
    
    self.assertEqual(get_mnv_candidates(self.path), {})
def test_get_mnv_candidates(self):
    ''' check that get_mnv_candidates works correctly
    
    Two adjacent variants annotated with the same protein position should
    both be reported with an MNV code.
    '''
    
    lines = make_vcf_header()
    for pos, codons in [(1, 'aaT/aaG'), (2, 'Aat/Cat')]:
        extra = 'Protein_position=1;Codons=' + codons
        lines.append(make_vcf_line(chrom='1', pos=pos, extra=extra))
    self.write_vcf(lines)
    
    expected = {
        ('1', 1): 'alternate_residue_mnv',
        ('1', 2): 'alternate_residue_mnv',
    }
    
    self.assertEqual(get_mnv_candidates(self.path), expected)
def test_find_nearby_variants_separated(self):
    ''' test that find_nearby_variants() doesn't include vars far apart
    '''
    
    # two variants at positions 1 and 4, which should not be paired
    lines = make_vcf_header()
    for pos in (1, 4):
        lines.append(make_vcf_line(pos=pos))
    self.write_vcf(lines)
    
    handle = open_vcf(self.path)
    exclude_header(handle)
    
    nearby = find_nearby_variants(handle)
    self.assertEqual(nearby, [])
def test_find_nearby_variants(self):
    ''' test that find_nearby_variants() works correctly
    '''
    
    # two variants at adjacent positions, which should form a single pair
    lines = make_vcf_header()
    for pos in (1, 2):
        lines.append(make_vcf_line(pos=pos))
    self.write_vcf(lines)
    
    handle = open_vcf(self.path)
    exclude_header(handle)
    
    expected = [[('1', 1), ('1', 2)]]
    self.assertEqual(find_nearby_variants(handle), expected)
def test_find_nearby_variants_different_chroms(self):
    ''' test that find_nearby_variants() works correctly with successive
    variants on different chroms, but at the same position.
    '''
    
    # two variants at the same position, but on different chromosomes
    lines = make_vcf_header()
    lines.append(make_vcf_line(chrom='1', pos=1))
    lines.append(make_vcf_line(chrom='2', pos=1))
    
    # the lines have to be written to disk, otherwise the check below runs
    # against whatever self.path already holds rather than these variants
    self.write_vcf(lines)
    
    vcf = open_vcf(self.path)
    exclude_header(vcf)
    
    self.assertEqual(find_nearby_variants(vcf), [])
def test_same_aa_different_positions(self):
    ''' check that same_aa() works correctly for different amino acids
    '''
    
    # each variant is annotated with a different protein position
    lines = make_vcf_header()
    for pos, residue in [(5, 2), (7, 3), (8, 4)]:
        extra = 'Protein_position={}'.format(residue)
        lines.append(make_vcf_line(pos=pos, extra=extra))
    self.write_vcf(lines)
    
    vcf = tabix.open(self.path)
    
    # the pair spans protein positions 3 and 4, so it should be dropped
    candidates = [[('1', 7), ('1', 8)]]
    self.assertEqual(same_aa(vcf, candidates), [])
def test_find_nearby_variants_different_threshold(self):
    ''' test that find_nearby_variants() works correctly when we change the
    threshold distance.
    '''
    
    # get the default two variants
    lines = make_vcf_header()
    lines.append(make_vcf_line(pos=1))
    lines.append(make_vcf_line(pos=2))
    
    # the lines have to be written to disk, otherwise the check below runs
    # against whatever self.path already holds rather than these variants
    self.write_vcf(lines)
    
    vcf = open_vcf(self.path)
    exclude_header(vcf)
    
    # using a lower threshold shouldn't allow any of the variants to pass
    self.assertEqual(find_nearby_variants(vcf, threshold=0), [])
def test_same_aa(self):
    ''' check that same_aa() works correctly
    '''
    
    # both variants are annotated with the same protein position
    lines = make_vcf_header()
    for pos in (2, 4):
        lines.append(make_vcf_line(pos=pos, extra='Protein_position=1'))
    self.write_vcf(lines)
    
    vcf = tabix.open(self.path)
    
    # the pair should be returned unchanged
    candidates = [[('1', 2), ('1', 4)]]
    expected = [[('1', 2), ('1', 4)]]
    self.assertEqual(same_aa(vcf, candidates), expected)
def test_find_nearby_variants_duplicate_position(self):
    ''' test that find_nearby_variants() works correctly with a duplicate var
    '''
    
    # the default two variants, plus a third variant at the same position
    # as the second
    lines = make_vcf_header()
    for pos in (1, 2, 2):
        lines.append(make_vcf_line(pos=pos))
    self.write_vcf(lines)
    
    handle = open_vcf(self.path)
    exclude_header(handle)
    
    # the duplicated position should not give rise to an extra pair
    expected = [[('1', 1), ('1', 2)]]
    self.assertEqual(find_nearby_variants(handle), expected)
def test_same_aa_missing_protein_positions(self):
    ''' check that same_aa() works correctly when the vars aren't in the CDS
    '''
    
    # if one of the variants in the pair does not have a protein position
    # listed (i.e. residue number), that indicates the variant could be
    # affecting the splice site, so we can't use the pair.
    variants = [
        make_vcf_line(pos=5),
        make_vcf_line(pos=7),
        make_vcf_line(pos=8, extra='Protein_position=4'),
    ]
    lines = make_vcf_header()
    lines.extend(variants)
    self.write_vcf(lines)
    
    vcf = tabix.open(self.path)
    
    candidates = [[('1', 7), ('1', 8)]]
    self.assertEqual(same_aa(vcf, candidates), [])
def test_screen_pairs_nonstandard_pair(self):
    ''' test that screen_pairs() works correctly
    
    Checks that a group of three variants is excluded, while a true pair of
    two variants is retained.
    '''
    
    # get the VCF lines
    lines = make_vcf_header()
    for pos in (2, 4, 5, 7, 8):
        lines.append(make_vcf_line(pos=pos))
    self.write_vcf(lines)
    
    vcf = tabix.open(self.path)
    
    # set up a list of 'pairs', where one 'pair' has three variants in it.
    # we exclude 'pairs' where n != 2.
    triple = [('1', 2), ('1', 4), ('1', 5)]
    double = [('1', 7), ('1', 8)]
    
    screened = screen_pairs(vcf, [triple, double], is_not_indel)
    self.assertEqual(screened, [[('1', 7), ('1', 8)]])
def test_open_individual_male_het_chrx(self):
    """ test that open_individual() passes over hets in males on chrX
    """
    
    # the sub-functions are all tested elsewhere, this test merely checks
    # that valid variants are added to the variants list, and invalid
    # variants are passed over without being added to the variants list
    het_on_x = make_vcf_line(chrom='X', pos=1, genotype='0/1',
        extra='HGNC=TEST;MAX_AF=0.0001')
    
    # NOTE: the header is built after the variant line here; both helpers
    # are only used to produce text for the VCF
    lines = make_vcf_header()
    lines.append(het_on_x)
    
    vcf_path = os.path.join(self.temp_dir, "temp.vcf")
    write_temp_vcf(vcf_path, lines)
    
    male = Person('fam_id', 'sample', 'dad', 'mom', 'M', '2', vcf_path)
    
    self.assertEqual(open_individual(male), [])
def test__write_vcf(self):
    ''' check that _write_vcf() works correctly
    
    Writes a header plus one variant line with _write_vcf(), then reads the
    gzipped file back and checks the lines round-trip unchanged.
    '''
    
    # reserve a path for the output. The handle is closed straight away (the
    # file is kept, since delete=False), so that we don't leak an open file
    # descriptor, and so _write_vcf() isn't writing to a path that is still
    # open elsewhere.
    with tempfile.NamedTemporaryFile(suffix='.vcf.gz', dir=self.temp_dir,
            delete=False) as handle:
        path = handle.name
    
    lines = make_vcf_header() + ['X\t150\t.\tA\tG\t50\tPASS\tHGNC=TEST;'
        'CQ=missense_variant;random_tag;EUR_AF=0.0005;'
        'ClinicalFilterGeneInheritance=Monoallelic;'
        'ClinicalFilterType=single_variant;'
        'ClinicalFilterReportableHGNC=TEST\tGT:DP:INHERITANCE:'
        'INHERITANCE_GENOTYPE\t0/1:50:deNovo:1,0,0\n']
    
    _write_vcf(path, lines)
    
    # the file is read back as bytes, so decode each line before comparing
    with gzip.open(path, 'r') as handle:
        vcf = [x.decode() for x in handle]
    
    self.assertEqual(lines, vcf)
def test_analyse_trio(self):
    ''' test that analyse_trio() works correctly
    
    Builds one VCF per trio member, where only the child carries the
    variant (flagged as de novo), then checks the reported variant.
    '''
    
    # construct the VCFs for the trio members
    paths = {}
    for member in ['child', 'mom', 'dad']:
        vcf = make_vcf_header()
        
        geno, pp_dnm = '0/0', ''
        if member == 'child':
            geno, pp_dnm = '0/1', ';DENOVO-SNP;PP_DNM=1'
        
        vcf.append(make_vcf_line(genotype=geno, extra='HGNC=ARID1B' + pp_dnm))
        
        # write the VCF data to a file. Leaving the with block flushes and
        # closes the handle (the file is kept, since delete=False), so we
        # don't leak three open file descriptors per test run.
        with tempfile.NamedTemporaryFile(dir=self.temp_dir, delete=False,
                suffix='.vcf') as handle:
            for x in vcf:
                handle.write(x.encode('utf8'))
        
        paths[member] = handle.name
    
    # create a Family object, so we can load the data from the trio's VCFs
    fam_id = 'fam01'
    child = Person(fam_id, 'child', 'dad', 'mom', 'female', '2', paths['child'])
    mom = Person(fam_id, 'mom', '0', '0', 'female', '1', paths['mom'])
    dad = Person(fam_id, 'dad', '0', '0', 'male', '1', paths['dad'])
    family = Family(fam_id, [child], mom, dad)
    
    expected = [(TrioGenotypes(chrom="1", pos=1,
        child=SNV(chrom="1", position=1, id=".", ref="G", alts="T",
            filter="PASS",
            info="CQ=missense_variant;DENOVO-SNP;HGNC=ARID1B;PP_DNM=1",
            format="DP:GT", sample="50:0/1", gender="female", mnv_code=None),
        mother=SNV(chrom="1", position=1, id=".", ref="G", alts="T",
            filter="PASS", info="CQ=missense_variant;HGNC=ARID1B",
            format="DP:GT", sample="50:0/0", gender="female", mnv_code=None),
        father=SNV(chrom="1", position=1, id=".", ref="G", alts="T",
            filter="PASS", info="CQ=missense_variant;HGNC=ARID1B",
            format="DP:GT", sample="50:0/0", gender="male", mnv_code=None)),
        ['single_variant'], ['Monoallelic'], ['ARID1B'])]
    
    self.assertEqual(self.finder.analyse_trio(family), expected)
def test_get_matches(self):
    ''' check that get_matches works correctly
    
    Only the variants at the positions named in the pair should be
    returned, not their neighbours.
    '''
    
    # get the VCF lines
    lines = make_vcf_header()
    for pos in (1, 2, 4, 5):
        lines.append(make_vcf_line(pos=pos))
    self.write_vcf(lines)
    
    vcf = tabix.open(self.path)
    pair = [('1', 2), ('1', 4)]
    
    # define the expected lines
    expected = [
        parse_vcf_line(make_vcf_line(pos=2).split('\t'), self.Variant),
        parse_vcf_line(make_vcf_line(pos=4).split('\t'), self.Variant),
    ]
    
    matches = list(get_matches(vcf, pair))
    self.assertEqual(matches, expected)
def test__write_vcf(self):
    ''' check that _write_vcf() works correctly
    
    Writes a header plus one variant line with _write_vcf(), then reads the
    gzipped file back and checks the lines round-trip unchanged.
    '''
    
    # reserve a path for the output. The handle is closed straight away (the
    # file is kept, since delete=False), so that we don't leak an open file
    # descriptor, and so _write_vcf() isn't writing to a path that is still
    # open elsewhere.
    with tempfile.NamedTemporaryFile(suffix='.vcf.gz', dir=self.temp_dir,
            delete=False) as handle:
        path = handle.name
    
    lines = make_vcf_header() + [
        'X\t150\t.\tA\tG\t50\tPASS\tHGNC=TEST;'
        'CQ=missense_variant;random_tag;EUR_AF=0.0005;'
        'ClinicalFilterGeneInheritance=Monoallelic;'
        'ClinicalFilterType=single_variant;'
        'ClinicalFilterReportableHGNC=TEST\tGT:DP:INHERITANCE:'
        'INHERITANCE_GENOTYPE\t0/1:50:deNovo:1,0,0\n'
    ]
    
    _write_vcf(path, lines)
    
    # the file is read back as bytes, so decode each line before comparing
    with gzip.open(path, 'r') as handle:
        vcf = [x.decode() for x in handle]
    
    self.assertEqual(lines, vcf)
def test_screen_pairs(self):
    ''' test that screen_pairs() works correctly
    
    Both pairs are expected to survive screening with either of the filter
    functions.
    '''
    
    # get the VCF lines
    lines = make_vcf_header()
    for pos in (1, 2, 4, 5, 7, 8):
        lines.append(make_vcf_line(pos=pos))
    self.write_vcf(lines)
    
    vcf = tabix.open(self.path)
    pairs = [[('1', 2), ('1', 4)], [('1', 7), ('1', 8)]]
    
    screened = screen_pairs(vcf, pairs, is_not_indel)
    self.assertEqual(screened, pairs)
    
    # check that the other filter function also works cleanly
    screened = screen_pairs(vcf, pairs, is_coding)
    self.assertEqual(screened, pairs)
def test__get_vcf_lines(self):
    """ check that _get_vcf_lines() works correctly
    
    Compares the generated lines for a single variant against the full
    expected header followed by the expected variant line.
    """
    
    # write VCFs for the trio members, in order to be able to pick up the
    # VCF provenance information
    family = self.trio
    for member in (family.child, family.mother, family.father):
        with open(member.get_path(), 'w') as handle:
            handle.writelines(make_vcf_header())
    
    # the initial header lines, plus the added INFO and FORMAT definitions
    definitions = [
        "##fileformat=VCFv4.1\n",
        '##fileDate=2014-01-01\n',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n',
        '##INFO=<ID=ClinicalFilterType,Number=.,Type=String,Description="The '
        'type of clinical filter that passed this variant.">\n',
        '##INFO=<ID=ClinicalFilterGeneInheritance,Number=.,Type=String,'
        'Description="The inheritance mode (Monoallelic, Biallelic etc) '
        'under which the variant was found.">\n',
        '##INFO=<ID=ClinicalFilterReportableHGNC,Number=.,Type=String,'
        'Description="The HGNC symbol which the variant was identified '
        'as being reportable for.">\n',
        '##INFO=<ID=CANDIDATE_MNV,Number=.,Type=String,'
        'Description="Code for candidate multinucleotide variants. '
        'Field is only included if the translated MNV differs from '
        'both of the SNV translations. There are five possibilities: '
        'alternate_residue_mnv=MNV translates to a residue not in SNVs, '
        'masked_stop_gain_mnv=MNV masks a stop gain, '
        'modified_stop_gained_mnv=MNV introduces a stop gain, '
        'modified_synonymous_mnv=MNV reverts to synonymous, '
        'modified_protein_altering_mnv=synonymous SNVs but missense '
        'MNV.">\n',
        '##FORMAT=<ID=INHERITANCE_GENOTYPE,Number=.,Type=String,'
        'Description="The 012 coded genotypes for a trio (child, '
        'mother, father).">\n',
        '##FORMAT=<ID=INHERITANCE,Number=.,Type=String,Description="'
        'The inheritance of the variant in the trio (biparental, '
        'paternal, maternal, deNovo).">\n',
    ]
    
    # lines describing the filtering run itself
    run_info = [
        "##ClinicalFilterRunDate={0}\n".format(datetime.date.today()),
        "##ClinicalFilterVersion={}\n".format(clinicalfilter.__version__),
        "##ClinicalFilterHistory=single_variant,compound_het\n",
    ]
    
    # provenance lines for each of the trio members' VCFs
    provenance = [
        "##UberVCF_proband_Id=child\n",
        "##UberVCF_proband_Checksum=0a35d4e98a0f07153584f255ead8a2014ebacad0\n",
        "##UberVCF_proband_Basename=child.vcf\n",
        "##UberVCF_proband_Date=2014-01-01\n",
        "##UberVCF_maternal_Id=mother\n",
        "##UberVCF_maternal_Checksum=0a35d4e98a0f07153584f255ead8a2014ebacad0\n",
        "##UberVCF_maternal_Basename=mother.vcf\n",
        "##UberVCF_maternal_Date=2014-01-01\n",
        "##UberVCF_paternal_Id=father\n",
        "##UberVCF_paternal_Checksum=0a35d4e98a0f07153584f255ead8a2014ebacad0\n",
        "##UberVCF_paternal_Basename=father.vcf\n",
        "##UberVCF_paternal_Date=2014-01-01\n",
    ]
    
    columns = ["#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample\n"]
    
    # define what the header will become
    expected_header = definitions + run_info + provenance + columns
    
    # define what the default variant vcf line will become
    expected_variant = [
        'X\t150\t.\tA\tG\t50\tPASS\tCQ=missense_variant;'
        'ClinicalFilterGeneInheritance=Monoallelic;'
        'ClinicalFilterReportableHGNC=TEST;ClinicalFilterType=single_variant;'
        'DENOVO-SNP;HGNC=TEST;HGNC_ID=1001;MAX_AF=0.0005\tGT:DP:INHERITANCE:'
        'INHERITANCE_GENOTYPE\t0/1:50:deNovo:1,0,0\n'
    ]
    
    # check that a list of one variant produces the correct VCF output. Note
    # that we haven't checked against CNVs, which can change the
    # INHERITANCE_GENOTYPE flag, nor have we tested a larger list of variants
    var = (self.variants[0], ["single_variant"], ["Monoallelic"], ["TEST"])
    original_fields = [
        'X', '150', '.', 'A', 'G', '50', 'PASS',
        'HGNC=TEST;HGNC_ID=1001;CQ=missense_variant;EUR_AF=0.0005',
        'GT:DP', '0/1:50'
    ]
    var[0].child.add_vcf_line(original_fields)
    
    observed = list(_get_vcf_lines([var], family))
    self.assertEqual(observed, expected_header + expected_variant)
def test__make_vcf_header(self):
    """ check that _make_vcf_header() works correctly
    
    Passes the initial header lines and the provenance of three VCFs, then
    compares the result against the full expected header.
    """
    
    # define the initial header lines
    header = make_vcf_header()
    
    # define the VCF provenances, as (checksum, basename, date) tuples
    provenance = [
        ("checksum", "proband.calls.date.vcf.gz", "2014-01-01"),
        ("checksum", "mother.calls.date.vcf.gz", "2014-01-02"),
        ("checksum", "father.calls.date.vcf.gz", "2014-01-03"),
    ]
    
    # the initial header lines, plus the added INFO and FORMAT definitions
    definitions = [
        "##fileformat=VCFv4.1\n",
        '##fileDate=2014-01-01\n',
        "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n",
        '##INFO=<ID=ClinicalFilterType,Number=.,Type=String,'
        'Description="The type of clinical filter that passed this '
        'variant.">\n',
        '##INFO=<ID=ClinicalFilterGeneInheritance,Number=.,Type=String,'
        'Description="The inheritance mode (Monoallelic, Biallelic '
        'etc) under which the variant was found.">\n',
        '##INFO=<ID=ClinicalFilterReportableHGNC,Number=.,Type=String,'
        'Description="The HGNC symbol which the variant was identified '
        'as being reportable for.">\n',
        '##INFO=<ID=CANDIDATE_MNV,Number=.,Type=String,'
        'Description="Code for candidate multinucleotide variants. '
        'Field is only included if the translated MNV differs from '
        'both of the SNV translations. There are five possibilities: '
        'alternate_residue_mnv=MNV translates to a residue not in SNVs, '
        'masked_stop_gain_mnv=MNV masks a stop gain, '
        'modified_stop_gained_mnv=MNV introduces a stop gain, '
        'modified_synonymous_mnv=MNV reverts to synonymous, '
        'modified_protein_altering_mnv=synonymous SNVs but missense '
        'MNV.">\n',
        '##FORMAT=<ID=INHERITANCE_GENOTYPE,Number=.,Type=String,'
        'Description="The 012 coded genotypes for a trio (child, '
        'mother, father).">\n',
        '##FORMAT=<ID=INHERITANCE,Number=.,Type=String,Description="The '
        'inheritance of the variant in the trio (biparental, paternal, '
        'maternal, deNovo).">\n',
    ]
    
    # lines describing the filtering run itself
    run_info = [
        "##ClinicalFilterRunDate={0}\n".format(datetime.date.today()),
        "##ClinicalFilterVersion={}\n".format(clinicalfilter.__version__),
        "##ClinicalFilterHistory=single_variant,compound_het\n",
    ]
    
    # provenance lines for each of the trio members' VCFs
    provenance_lines = [
        "##UberVCF_proband_Id=proband\n",
        "##UberVCF_proband_Checksum=checksum\n",
        "##UberVCF_proband_Basename=proband.calls.date.vcf.gz\n",
        "##UberVCF_proband_Date=2014-01-01\n",
        "##UberVCF_maternal_Id=mother\n",
        "##UberVCF_maternal_Checksum=checksum\n",
        "##UberVCF_maternal_Basename=mother.calls.date.vcf.gz\n",
        "##UberVCF_maternal_Date=2014-01-02\n",
        "##UberVCF_paternal_Id=father\n",
        "##UberVCF_paternal_Checksum=checksum\n",
        "##UberVCF_paternal_Basename=father.calls.date.vcf.gz\n",
        "##UberVCF_paternal_Date=2014-01-03\n",
    ]
    
    columns = ["#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample\n"]
    
    expected = definitions + run_info + provenance_lines + columns
    
    # check that the standard function returns the expected value. Note that
    # I haven't checked the output if self.known_genes_date is not None, nor
    # have I checked if the _clinicalFilterVersion is available
    self.assertEqual(_make_vcf_header(header, provenance), expected)
def test__make_vcf_header(self):
    """ check that _make_vcf_header() works correctly
    
    Passes the initial header lines and the provenance of three VCFs, then
    compares the result against the full expected header.
    """
    
    # define the initial header lines
    header = make_vcf_header()
    
    # define the VCF provenances, as (checksum, basename, date) tuples
    provenance = [
        ("checksum", "proband.calls.date.vcf.gz", "2014-01-01"),
        ("checksum", "mother.calls.date.vcf.gz", "2014-01-02"),
        ("checksum", "father.calls.date.vcf.gz", "2014-01-03"),
    ]
    
    # the initial header lines, plus the added INFO and FORMAT definitions
    definitions = [
        "##fileformat=VCFv4.1\n",
        '##fileDate=2014-01-01\n',
        "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n",
        '##INFO=<ID=ClinicalFilterType,Number=.,Type=String,'
        'Description="The type of clinical filter that passed this '
        'variant.">\n',
        '##INFO=<ID=ClinicalFilterGeneInheritance,Number=.,Type=String,'
        'Description="The inheritance mode (Monoallelic, Biallelic '
        'etc) under which the variant was found.">\n',
        '##INFO=<ID=ClinicalFilterReportableHGNC,Number=.,Type=String,'
        'Description="The HGNC symbol which the variant was identified '
        'as being reportable for.">\n',
        '##INFO=<ID=CANDIDATE_MNV,Number=.,Type=String,'
        'Description="Code for candidate multinucleotide variants. '
        'Field is only included if the translated MNV differs from '
        'both of the SNV translations. There are five possibilities: '
        'alternate_residue_mnv=MNV translates to a residue not in SNVs, '
        'masked_stop_gain_mnv=MNV masks a stop gain, '
        'modified_stop_gained_mnv=MNV introduces a stop gain, '
        'modified_synonymous_mnv=MNV reverts to synonymous, '
        'modified_protein_altering_mnv=synonymous SNVs but missense '
        'MNV.">\n',
        '##FORMAT=<ID=INHERITANCE_GENOTYPE,Number=.,Type=String,'
        'Description="The 012 coded genotypes for a trio (child, '
        'mother, father).">\n',
        '##FORMAT=<ID=INHERITANCE,Number=.,Type=String,Description="The '
        'inheritance of the variant in the trio (biparental, paternal, '
        'maternal, deNovo).">\n',
    ]
    
    # lines describing the filtering run itself
    run_info = [
        "##ClinicalFilterRunDate={0}\n".format(datetime.date.today()),
        "##ClinicalFilterVersion={}\n".format(clinicalfilter.__version__),
        "##ClinicalFilterHistory=single_variant,compound_het\n",
    ]
    
    # provenance lines for each of the trio members' VCFs
    provenance_lines = [
        "##UberVCF_proband_Id=proband\n",
        "##UberVCF_proband_Checksum=checksum\n",
        "##UberVCF_proband_Basename=proband.calls.date.vcf.gz\n",
        "##UberVCF_proband_Date=2014-01-01\n",
        "##UberVCF_maternal_Id=mother\n",
        "##UberVCF_maternal_Checksum=checksum\n",
        "##UberVCF_maternal_Basename=mother.calls.date.vcf.gz\n",
        "##UberVCF_maternal_Date=2014-01-02\n",
        "##UberVCF_paternal_Id=father\n",
        "##UberVCF_paternal_Checksum=checksum\n",
        "##UberVCF_paternal_Basename=father.calls.date.vcf.gz\n",
        "##UberVCF_paternal_Date=2014-01-03\n",
    ]
    
    columns = ["#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample\n"]
    
    expected = definitions + run_info + provenance_lines + columns
    
    # check that the standard function returns the expected value. Note that
    # I haven't checked the output if self.known_genes_date is not None, nor
    # have I checked if the _clinicalFilterVersion is available
    self.assertEqual(_make_vcf_header(header, provenance), expected)
def test_analyse_trio(self):
    ''' test that analyse_trio() works correctly
    
    Builds one VCF per trio member, where only the child carries the
    variant (flagged as de novo), then checks the reported variant.
    '''
    
    # construct the VCFs for the trio members
    paths = {}
    for member in ['child', 'mom', 'dad']:
        vcf = make_vcf_header()
        
        geno, pp_dnm = '0/0', ''
        if member == 'child':
            geno, pp_dnm = '0/1', ';DENOVO-SNP;PP_DNM=1'
        
        vcf.append(
            make_vcf_line(genotype=geno, extra='HGNC=ARID1B' + pp_dnm))
        
        # write the VCF data to a file. Leaving the with block flushes and
        # closes the handle (the file is kept, since delete=False), so we
        # don't leak three open file descriptors per test run.
        with tempfile.NamedTemporaryFile(dir=self.temp_dir, delete=False,
                suffix='.vcf') as handle:
            for x in vcf:
                handle.write(x.encode('utf8'))
        
        paths[member] = handle.name
    
    # create a Family object, so we can load the data from the trio's VCFs
    fam_id = 'fam01'
    child = Person(fam_id, 'child', 'dad', 'mom', 'female', '2', paths['child'])
    mom = Person(fam_id, 'mom', '0', '0', 'female', '1', paths['mom'])
    dad = Person(fam_id, 'dad', '0', '0', 'male', '1', paths['dad'])
    family = Family(fam_id, [child], mom, dad)
    
    expected = [(TrioGenotypes(chrom="1", pos=1,
        child=SNV(chrom="1", position=1, id=".", ref="G", alts="T",
            qual='1000', filter="PASS",
            info="CQ=missense_variant;DENOVO-SNP;HGNC=ARID1B;PP_DNM=1",
            format="DP:GT", sample="50:0/1", gender="female", mnv_code=None),
        mother=SNV(chrom="1", position=1, id=".", ref="G", alts="T",
            qual='1000', filter="PASS",
            info="CQ=missense_variant;HGNC=ARID1B",
            format="DP:GT", sample="50:0/0", gender="female", mnv_code=None),
        father=SNV(chrom="1", position=1, id=".", ref="G", alts="T",
            qual='1000', filter="PASS",
            info="CQ=missense_variant;HGNC=ARID1B",
            format="DP:GT", sample="50:0/0", gender="male", mnv_code=None)),
        ['single_variant'], ['Monoallelic', 'Mosaic'], ['ARID1B'])]
    
    self.assertEqual(self.finder.analyse_trio(family), expected)
def test__get_vcf_lines(self):
    """ check that _get_vcf_lines() works correctly
    
    Compares the generated lines for a single variant against the full
    expected header followed by the expected variant line.
    """
    
    # write VCFs for the trio members, in order to be able to pick up the
    # VCF provenance information
    family = self.trio
    for member in (family.child, family.mother, family.father):
        with open(member.get_path(), 'w') as handle:
            handle.writelines(make_vcf_header())
    
    # the initial header lines, plus the added INFO and FORMAT definitions
    definitions = [
        "##fileformat=VCFv4.1\n",
        '##fileDate=2014-01-01\n',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n',
        '##INFO=<ID=ClinicalFilterType,Number=.,Type=String,Description="The '
        'type of clinical filter that passed this variant.">\n',
        '##INFO=<ID=ClinicalFilterGeneInheritance,Number=.,Type=String,'
        'Description="The inheritance mode (Monoallelic, Biallelic etc) '
        'under which the variant was found.">\n',
        '##INFO=<ID=ClinicalFilterReportableHGNC,Number=.,Type=String,'
        'Description="The HGNC symbol which the variant was identified '
        'as being reportable for.">\n',
        '##INFO=<ID=CANDIDATE_MNV,Number=.,Type=String,'
        'Description="Code for candidate multinucleotide variants. '
        'Field is only included if the translated MNV differs from '
        'both of the SNV translations. There are five possibilities: '
        'alternate_residue_mnv=MNV translates to a residue not in SNVs, '
        'masked_stop_gain_mnv=MNV masks a stop gain, '
        'modified_stop_gained_mnv=MNV introduces a stop gain, '
        'modified_synonymous_mnv=MNV reverts to synonymous, '
        'modified_protein_altering_mnv=synonymous SNVs but missense '
        'MNV.">\n',
        '##FORMAT=<ID=INHERITANCE_GENOTYPE,Number=.,Type=String,'
        'Description="The 012 coded genotypes for a trio (child, '
        'mother, father).">\n',
        '##FORMAT=<ID=INHERITANCE,Number=.,Type=String,Description="'
        'The inheritance of the variant in the trio (biparental, '
        'paternal, maternal, deNovo).">\n',
    ]
    
    # lines describing the filtering run itself
    run_info = [
        "##ClinicalFilterRunDate={0}\n".format(datetime.date.today()),
        "##ClinicalFilterVersion={}\n".format(clinicalfilter.__version__),
        "##ClinicalFilterHistory=single_variant,compound_het\n",
    ]
    
    # provenance lines for each of the trio members' VCFs
    provenance = [
        "##UberVCF_proband_Id=child\n",
        "##UberVCF_proband_Checksum=0a35d4e98a0f07153584f255ead8a2014ebacad0\n",
        "##UberVCF_proband_Basename=child.vcf\n",
        "##UberVCF_proband_Date=2014-01-01\n",
        "##UberVCF_maternal_Id=mother\n",
        "##UberVCF_maternal_Checksum=0a35d4e98a0f07153584f255ead8a2014ebacad0\n",
        "##UberVCF_maternal_Basename=mother.vcf\n",
        "##UberVCF_maternal_Date=2014-01-01\n",
        "##UberVCF_paternal_Id=father\n",
        "##UberVCF_paternal_Checksum=0a35d4e98a0f07153584f255ead8a2014ebacad0\n",
        "##UberVCF_paternal_Basename=father.vcf\n",
        "##UberVCF_paternal_Date=2014-01-01\n",
    ]
    
    columns = ["#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample\n"]
    
    # define what the header will become
    expected_header = definitions + run_info + provenance + columns
    
    # define what the default variant vcf line will become
    expected_variant = [
        'X\t150\t.\tA\tG\t50\tPASS\tCQ=missense_variant;'
        'ClinicalFilterGeneInheritance=Monoallelic;'
        'ClinicalFilterReportableHGNC=TEST;ClinicalFilterType=single_variant;'
        'DENOVO-SNP;HGNC=TEST;HGNC_ID=1001;MAX_AF=0.0005\tGT:DP:INHERITANCE:'
        'INHERITANCE_GENOTYPE\t0/1:50:deNovo:1,0,0\n'
    ]
    
    # check that a list of one variant produces the correct VCF output. Note
    # that we haven't checked against CNVs, which can change the
    # INHERITANCE_GENOTYPE flag, nor have we tested a larger list of variants
    var = (self.variants[0], ["single_variant"], ["Monoallelic"], ["TEST"])
    original_fields = [
        'X', '150', '.', 'A', 'G', '50', 'PASS',
        'HGNC=TEST;HGNC_ID=1001;CQ=missense_variant;EUR_AF=0.0005',
        'GT:DP', '0/1:50'
    ]
    var[0].child.add_vcf_line(original_fields)
    
    observed = list(_get_vcf_lines([var], family))
    self.assertEqual(observed, expected_header + expected_variant)