def __iter__(self):
    """
    Yield the variants of this parser one at a time.

    When a vcf source is attached (self.vcf), the line held in
    self.next_line is handled on the first pass and after that every
    line of self.vcf. Lines that start with '#' or that have fewer than
    8 tab separated columns are skipped. A line with more than one
    alternative is expanded with split_variants() if self.split_variants
    is set. Without a vcf source the variants in self.variants are
    yielded instead.

    Raises:
        SyntaxError: If the metadata has no fileformat.
    """
    if not self.metadata.fileformat:
        raise SyntaxError("Vcf must have fileformat defined")

    if not self.vcf:
        # Nothing to read from, serve the variants kept on the parser
        for stored_variant in self.variants:
            yield stored_variant
        return

    def expand(raw_line):
        """Format one vcf line and return the variant(s) found in it."""
        parsed = format_variant(
            line=raw_line,
            header_parser=self.metadata,
            check_info=self.check_info
        )
        # If there are multiple alternatives and self.split_variants
        # there can be more than one variant in one line
        if self.split_variants and len(parsed['ALT'].split(',')) > 1:
            return list(split_variants(
                variant_dict=parsed,
                header_parser=self.metadata,
                allele_symbol=self.allele_symbol
            ))
        return [parsed]

    # The first line is kept in self.next_line, so it is treated separately
    if self.beginning:
        first_batch = expand(self.next_line) if self.next_line else []
        for variant in first_batch:
            yield variant
        self.beginning = False

    for line in self.vcf:
        line = line.rstrip()
        if line.startswith('#') or len(line.split('\t')) < 8:
            continue
        for variant in expand(line):
            yield variant
def add_variant(self, chrom, pos, rs_id, ref, alt, qual, filt, info,
                form=None, genotypes=None):
    """
    Add a variant to the parser.

    This function is for building a vcf. It takes the relevant parameters
    and makes a vcf variant in the proper format. The result is stored in
    self.variants; a variant with several alternatives is stored as one
    entry per alternative when self.split_variants is set.

    Args:
        chrom, pos, rs_id, ref, alt, qual, filt, info: The first eight
            columns of the variant line, in this order. They are joined
            with tabs, so they must be strings.
        form: Optional ninth column, only added when it is truthy.
        genotypes: Optional iterable with one string per individual,
            added after the other columns. Defaults to no individuals.
    """
    variant_info = [chrom, pos, rs_id, ref, alt, qual, filt, info]
    if form:
        variant_info.append(form)
    # None is used as default instead of a list shared between calls
    if genotypes is not None:
        variant_info.extend(genotypes)

    variant = format_variant(
        line='\t'.join(variant_info),
        header_parser=self.metadata,
        check_info=self.check_info
    )

    if self.split_variants and len(variant['ALT'].split(',')) > 1:
        # If multiple alternative and split_variants we must split the variant
        for splitted_variant in split_variants(
                variant_dict=variant,
                header_parser=self.metadata,
                allele_symbol=self.allele_symbol):
            self.variants.append(splitted_variant)
    else:
        self.variants.append(variant)
def test_simple_variant():
    """
    Test how the format_variant behaves

    A line with one alternative and three individuals is formatted and
    every column, the info dictionary and the genotype type are checked.
    """
    header_parser = get_header()
    variant_line = (
        "1\t11900\t.\tA\tT\t100\tPASS\tMQ=1\tGT:GQ\t0/1:60\t"
        "0/1:60\t1/1:60"
    )

    variant = format_variant(
        line=variant_line,
        header_parser=header_parser,
        check_info=True
    )

    expected_info = OrderedDict([('MQ', ['1'])])

    # The eight mandatory columns and the format column
    assert variant['CHROM'] == "1"
    assert variant['POS'] == "11900"
    assert variant['ID'] == "."
    assert variant['REF'] == "A"
    assert variant['ALT'] == "T"
    assert variant['QUAL'] == "100"
    assert variant['FILTER'] == "PASS"
    assert variant['INFO'] == "MQ=1"
    assert variant['FORMAT'] == "GT:GQ"

    # One raw genotype call per individual
    assert variant['father'] == "0/1:60"
    assert variant['mother'] == "0/1:60"
    assert variant['proband'] == "1/1:60"

    assert variant['info_dict'] == expected_info
    # isinstance avoids building a throwaway Genotype just to compare types
    assert isinstance(variant['genotypes']['mother'], Genotype)
def test_simple_variant():
    """
    Test how the format_variant behaves

    Formats one line with a single alternative and three individuals,
    then checks the columns, the info dictionary and the genotype type.
    """
    header_parser = get_header()
    variant_line = (
        "1\t11900\t.\tA\tT\t100\tPASS\tMQ=1\tGT:GQ\t0/1:60\t"
        "0/1:60\t1/1:60"
    )

    variant = format_variant(
        line=variant_line,
        header_parser=header_parser,
        check_info=True
    )

    info_dict = OrderedDict()
    info_dict['MQ'] = ['1']

    # Columns of the vcf line
    assert variant['CHROM'] == "1"
    assert variant['POS'] == "11900"
    assert variant['ID'] == "."
    assert variant['REF'] == "A"
    assert variant['ALT'] == "T"
    assert variant['QUAL'] == "100"
    assert variant['FILTER'] == "PASS"
    assert variant['INFO'] == "MQ=1"
    assert variant['FORMAT'] == "GT:GQ"

    # Raw genotype calls, one per individual
    assert variant['father'] == "0/1:60"
    assert variant['mother'] == "0/1:60"
    assert variant['proband'] == "1/1:60"

    assert variant['info_dict'] == info_dict
    # Check the type directly instead of comparing with a new Genotype()
    assert isinstance(variant['genotypes']['mother'], Genotype)
def test_split_minimal():
    """
    Test to split a vcf line without genotypes

    The header only has the eight mandatory columns and the line has two
    alternatives, so the split should give two variants.
    """
    header_lines = [
        '##fileformat=VCFv4.2',
        '##FILTER=<ID=LowQual,Description="Low quality">',
        '##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">',
        '##contig=<ID=1,length=249250621,assembly=b37>',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO'
    ]
    header_parser = get_header(header_lines)

    variant_line = "3\t947379\t.\tA\tT,C\t100\tPASS\tMQ=1"
    unsplit_variant = format_variant(
        line=variant_line,
        header_parser=header_parser,
        check_info=True
    )

    splitted_variants = list(split_variants(unsplit_variant, header_parser))

    assert len(splitted_variants) == 2
def test_split_no_info():
    """
    Test how variants are split when the INFO column is '.'

    Both resulting variants should keep '.' as INFO and get an info
    dictionary with '.' as the only key.
    """
    header_parser = get_header()

    # Two alternatives and no info annotations
    variant_line = (
        "3\t947379\t.\tA\tT,C\t100\tPASS\t."
        "\tGT:GQ:AD:DP\t1/1:60:0,7,0:12\t0/2:60:7,0,10:17"
        "\t1/2:60:0,7,8:16"
    )

    # The info check is skipped for this line
    unsplit_variant = format_variant(
        line=variant_line,
        header_parser=header_parser,
        check_info=False
    )

    splitted_variants = list(split_variants(unsplit_variant, header_parser))

    assert len(splitted_variants) == 2

    first_variant, second_variant = splitted_variants[0], splitted_variants[1]

    assert first_variant['info_dict'] == {'.':[]}
    assert second_variant['info_dict'] == {'.':[]}

    assert first_variant['INFO'] == '.'
    assert second_variant['INFO'] == '.'
def test_wrong_number_of_R_entrys():
    """
    Test how variants are split when DP_HIST has too few entrys

    Both resulting variants are expected to keep the unsplit DP_HIST
    annotation.
    """
    header_parser = get_header()

    # DP_HIST only has two entrys here although the line has a reference
    # and two alternatives.
    # NOTE(review): presumably DP_HIST is declared with Number=R in the
    # header from get_header() - confirm
    variant_line = (
        "3\t947379\t.\tA\tT,C\t100\tPASS\tMQ=1;CNT=5,8;"
        "DP_HIST=12,43\tGT:GQ:AD:DP\t1/1:60:0,7,0:12\t0/2:60:7,0,10:17"
        "\t1/2:60:0,7,8:16"
    )

    # The info check is skipped when formatting this line
    unsplit_variant = format_variant(
        line=variant_line,
        header_parser=header_parser,
        check_info=False
    )

    splitted_variants = list(split_variants(unsplit_variant, header_parser))

    assert len(splitted_variants) == 2

    first_variant, second_variant = splitted_variants[0], splitted_variants[1]

    # The whole annotation should be used for both alleles
    assert first_variant['info_dict']['DP_HIST'] == ['12','43']
    assert second_variant['info_dict']['DP_HIST'] == ['12','43']
def test_simple_split():
    """
    Test how split genotypes behave when a simple split

    A line with two alternatives and three individuals is split and the
    reference, alternative, info annotations and genotype calls of both
    resulting variants are checked.
    """
    header_parser = get_header()
    variant_line = (
        "3\t947379\t.\tA\tT,C\t100\tPASS\tMQ=1;CNT=5,8;"
        "DP_HIST=12,43,22\tGT:GQ:AD:DP\t1/1:60:0,7,0:12\t0/2:60:7,0,10:17"
        "\t1/2:60:0,7,8:16"
    )

    variant = format_variant(
        line=variant_line,
        header_parser=header_parser,
        check_info=True
    )

    # Collect the result without reusing the name of the unsplit variant
    splitted_variants = list(split_variants(variant, header_parser))

    assert len(splitted_variants) == 2

    first_variant = splitted_variants[0]
    second_variant = splitted_variants[1]

    # Test if the splitted variants still have the same reference
    assert first_variant['REF'] == 'A'
    assert second_variant['REF'] == 'A'
    # Test if the alternative was splitted properly
    assert first_variant['ALT'] == 'T'
    assert second_variant['ALT'] == 'C'

    # Test if simple info field is handled correct
    assert first_variant['info_dict']['MQ'] == ['1']
    assert second_variant['info_dict']['MQ'] == ['1']

    # Test if info field with Number='A' is handled correct
    assert first_variant['info_dict']['CNT'] == ['5']
    assert second_variant['info_dict']['CNT'] == ['8']

    # Test if info field with Number='R' is handled correct
    assert first_variant['info_dict']['DP_HIST'] == ['12', '43']
    assert second_variant['info_dict']['DP_HIST'] == ['12', '22']

    # Test if the genotypes are on the correct format
    assert first_variant['father'] == "1/1:60:0,7:12"
    assert second_variant['father'] == "0/0:60:0,0:12"

    assert first_variant['mother'] == "0/0:60:7,0:17"
    assert second_variant['mother'] == "0/1:60:7,10:17"

    assert first_variant['proband'] == "0/1:60:0,7:16"
    assert second_variant['proband'] == "0/1:60:0,8:16"
def test_malformed_line():
    """
    Test if proper behaviour with malformed vcf line

    format_variant should raise a SyntaxError for the line.
    """
    header_parser = get_header()

    # The position column is left out
    variant_line = (
        "1\t.\tA\tT\t100\tPASS\tMQ=1\tGT:GQ\t0/1:60\t"
        "0/1:60\t1/1:60"
    )

    with pytest.raises(SyntaxError):
        format_variant(
            line=variant_line,
            header_parser=header_parser,
            check_info=True
        )
def test_wrong_number_annotation_genotype():
    """
    Test if proper behaviour with wrong number of annotations

    format_variant should raise a SyntaxError for the line when the info
    check is on.
    """
    header_parser = get_header()

    # NOTE(review): which annotation has the wrong number of entrys
    # depends on the header from get_header() - confirm
    variant_line = (
        "3\t947379\t.\tA\tT,C\t100\tPASS\tMQ=1;CNT=5,6;"
        "SQ=1,2\tGT:GQ:AD:DP\t1/1:60:0,7,0:12\t0/2:60:7,0,10:17"
        "\t1/2:60:0,7,8:16"
    )

    with pytest.raises(SyntaxError):
        format_variant(
            line=variant_line,
            header_parser=header_parser,
            check_info=True
        )
def test_malformed_line():
    """
    Test if proper behaviour with malformed vcf line

    The line lacks a column and format_variant should answer with a
    SyntaxError.
    """
    header_parser = get_header()

    # There is no position in this line
    malformed_line = (
        "1\t.\tA\tT\t100\tPASS\tMQ=1\tGT:GQ\t0/1:60\t"
        "0/1:60\t1/1:60"
    )

    with pytest.raises(SyntaxError):
        format_variant(
            line=malformed_line,
            header_parser=header_parser,
            check_info=True
        )
def test_wrong_number_annotation_genotype():
    """
    Test if proper behaviour with wrong number of annotations

    With the info check on, format_variant should raise a SyntaxError
    for the line.
    """
    header_parser = get_header()

    # NOTE(review): the annotation with the wrong number of entrys is
    # decided by the header from get_header() - confirm which one it is
    annotated_line = (
        "3\t947379\t.\tA\tT,C\t100\tPASS\tMQ=1;CNT=5,6;"
        "SQ=1,2\tGT:GQ:AD:DP\t1/1:60:0,7,0:12\t0/2:60:7,0,10:17"
        "\t1/2:60:0,7,8:16"
    )

    with pytest.raises(SyntaxError):
        format_variant(
            line=annotated_line,
            header_parser=header_parser,
            check_info=True
        )
def test_no_genotypes():
    """
    Test if proper behaviour with minimal vcf

    The header has no individuals and the line only has the eight
    mandatory columns. The test passes if format_variant does not raise.
    """
    header_lines = [
        '##fileformat=VCFv4.2',
        '##FILTER=<ID=LowQual,Description="Low quality">',
        '##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">',
        '##contig=<ID=1,length=249250621,assembly=b37>',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO'
    ]
    header_parser = get_header(header_lines)

    # A variant line without format and genotype columns
    variant_line = "1\t11900\t.\tA\tT\t100\tPASS\tMQ=1"

    variant = format_variant(
        line=variant_line,
        header_parser=header_parser,
        check_info=True
    )
def test_no_genotypes():
    """
    Test if proper behaviour with minimal vcf

    Formats a line with only the eight mandatory columns against a
    header without individuals. Nothing is asserted, the call just has
    to succeed.
    """
    minimal_header = [
        '##fileformat=VCFv4.2',
        '##FILTER=<ID=LowQual,Description="Low quality">',
        '##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">',
        '##contig=<ID=1,length=249250621,assembly=b37>',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO'
    ]
    header_parser = get_header(minimal_header)

    # No format column and no genotypes in this line
    variant_line = "1\t11900\t.\tA\tT\t100\tPASS\tMQ=1"

    variant = format_variant(
        line=variant_line,
        header_parser=header_parser,
        check_info=True
    )
def test_csq_split():
    """
    Test works when splitting CSQ fields

    The line has one CSQ entry per alternative. After the split each
    variant should only have the entry and the vep info of its own
    allele.
    """
    header_parser = get_header()

    variant_line = (
        "3\t947379\t.\tA\tT,C\t100\tPASS\tCSQ=T|148398|NM_152486.2,"
        "C|148398|NM_152486.2\tGT:GQ:AD:DP\t1/1:60:0,7,0:12\t0/2:60:7,0,10:17"
        "\t1/2:60:0,7,8:16"
    )

    unsplit_variant = format_variant(
        line=variant_line,
        header_parser=header_parser,
        check_info=True
    )

    splitted_variants = list(split_variants(unsplit_variant, header_parser))

    assert len(splitted_variants) == 2

    first_variant, second_variant = splitted_variants[0], splitted_variants[1]

    # The raw CSQ annotation follows the allele
    assert first_variant['info_dict']['CSQ'] == ['T|148398|NM_152486.2']
    assert second_variant['info_dict']['CSQ'] == ['C|148398|NM_152486.2']

    # The vep info is keyed on the allele
    assert list(first_variant['vep_info'].keys()) == ['T']
    assert list(second_variant['vep_info'].keys()) == ['C']

    expected_vep_entry = {
        'Allele':'T',
        'Gene':'148398',
        'Feature':'NM_152486.2'
    }
    assert first_variant['vep_info']['T'] == [expected_vep_entry]
def test_csq_split_missing_allele():
    """
    Test works when splitting CSQ fields where one allele is missing

    The line has two alternatives but only a CSQ entry for 'T'. The
    variant for 'C' should have no CSQ key in its info dictionary and an
    empty list as vep info.
    """
    header_parser = get_header()

    variant_line = (
        "3\t947379\t.\tA\tT,C\t100\tPASS\tCSQ=T|148398|NM_152486.2"
        "\tGT:GQ:AD:DP\t1/1:60:0,7,0:12\t0/2:60:7,0,10:17"
        "\t1/2:60:0,7,8:16"
    )

    variant = format_variant(
        line=variant_line,
        header_parser=header_parser,
        check_info=True
    )

    splitted_variants = list(split_variants(variant, header_parser))

    assert len(splitted_variants) == 2

    first_variant = splitted_variants[0]
    second_variant = splitted_variants[1]

    assert first_variant['info_dict']['CSQ'] == ['T|148398|NM_152486.2']

    # Only the lookup can raise the KeyError, so no comparison is made here
    with pytest.raises(KeyError):
        second_variant['info_dict']['CSQ']

    assert list(first_variant['vep_info'].keys()) == ['T']
    assert list(second_variant['vep_info'].keys()) == ['C']

    assert second_variant['vep_info']['C'] == []
def __iter__(self):
    """
    Iterate over the variants of this parser.

    When a vcf source is attached (self.vcf), the line held in
    self.next_line is handled on the first pass and after that every
    line of self.vcf. Lines that start with '#' or that have fewer than
    8 tab separated columns are skipped. A line with more than one
    alternative gives one variant per alternative if self.split_variants
    is set. Without a vcf source the variants in self.variants are
    yielded instead.

    Yields:
        The variants, as returned by format_variant() or split_variants().

    Raises:
        SyntaxError: If the metadata has no fileformat.
    """
    if not self.metadata.fileformat:
        raise SyntaxError("Vcf must have fileformat defined")

    if self.vcf:
        # We need to treat the first case as an exception
        if self.beginning:
            variants = []
            # Only format the first line if there is one.
            # NOTE(review): presumably next_line is empty when the vcf
            # has no variant lines - confirm against the constructor
            if self.next_line:
                first_variant = format_variant(
                    line=self.next_line,
                    header_parser=self.metadata,
                    check_info=self.check_info
                )

                if not (self.split_variants and len(first_variant['ALT'].split(',')) > 1):
                    variants.append(first_variant)
                else:
                    for splitted_variant in split_variants(
                            variant_dict=first_variant,
                            header_parser=self.metadata,
                            allele_symbol=self.allele_symbol):
                        variants.append(splitted_variant)

            for variant in variants:
                yield variant

            # Set after the first variants are yielded, as before
            self.beginning = False

        for line in self.vcf:
            line = line.rstrip()
            # These are the variant(s) found in one line of the vcf
            # If there are multiple alternatives and self.split_variants
            # There can be more than one variant in one line
            variants = []

            if not line.startswith('#') and len(line.split('\t')) >= 8:
                variant = format_variant(
                    line=line,
                    header_parser=self.metadata,
                    check_info=self.check_info
                )

                if not (self.split_variants and len(variant['ALT'].split(',')) > 1):
                    variants.append(variant)
                else:
                    for splitted_variant in split_variants(
                            variant_dict=variant,
                            header_parser=self.metadata,
                            allele_symbol=self.allele_symbol):
                        variants.append(splitted_variant)

            for variant in variants:
                yield variant

    else:
        # No vcf source, use the variants kept on the parser
        for variant in self.variants:
            yield variant