Example #1
0
    def __iter__(self):
        """
        Iterate over the variants held by this parser.

        When ``self.vcf`` is set, variants are produced from
        ``self.next_line`` (as long as ``self.beginning`` is still set) and
        then from every remaining line of ``self.vcf``. Otherwise the
        variants stored in ``self.variants`` are yielded.

        If ``self.split_variants`` is set, a line with several comma
        separated alternatives is expanded with ``split_variants`` and each
        resulting variant is yielded on its own.

        Raises:
            SyntaxError: if the header has no fileformat defined.
        """
        if not self.metadata.fileformat:
            raise SyntaxError("Vcf must have fileformat defined")

        if not self.vcf:
            # No vcf source: serve the variants already stored on the parser
            for stored_variant in self.variants:
                yield stored_variant
            return

        # self.next_line is handled apart from the lines left in self.vcf
        if self.beginning and self.next_line:
            first_variant = format_variant(
                line=self.next_line,
                header_parser=self.metadata,
                check_info=self.check_info
            )

            if (self.split_variants
                    and len(first_variant['ALT'].split(',')) > 1):
                # Collect every split variant before anything is yielded
                first_batch = list(split_variants(
                    variant_dict=first_variant,
                    header_parser=self.metadata,
                    allele_symbol=self.allele_symbol
                ))
            else:
                first_batch = [first_variant]

            for parsed_variant in first_batch:
                yield parsed_variant

            self.beginning = False

        for raw_line in self.vcf:
            stripped_line = raw_line.rstrip()

            # Skip header lines and lines with fewer than 8 tab separated
            # columns
            if (stripped_line.startswith('#')
                    or len(stripped_line.split('\t')) < 8):
                continue

            parsed = format_variant(
                line=stripped_line,
                header_parser=self.metadata,
                check_info=self.check_info
            )

            # One vcf line gives several variants when it has multiple
            # alternatives and self.split_variants is set
            if (self.split_variants
                    and len(parsed['ALT'].split(',')) > 1):
                line_variants = list(split_variants(
                    variant_dict=parsed,
                    header_parser=self.metadata,
                    allele_symbol=self.allele_symbol
                ))
            else:
                line_variants = [parsed]

            for parsed_variant in line_variants:
                yield parsed_variant
Example #2
0
 def add_variant(self, chrom, pos, rs_id, ref, alt, qual, filt, info, form=None, genotypes=None):
     """
     Add a variant to the parser.

     This function is for building a vcf. It takes the relevant parameters,
     joins them into a tab separated vcf line, formats that line with
     ``format_variant`` and stores the result in ``self.variants``.

     If ``self.split_variants`` is set and ``alt`` holds several comma
     separated alternatives, the variant is split with ``split_variants``
     and every resulting variant is stored instead.

     Arguments:
         chrom, pos, rs_id, ref, alt, qual, filt, info: The first eight
             columns of the line. They are joined with ``'\\t'.join`` so
             each one has to be a string.
         form: Optional ninth column; only appended when truthy.
         genotypes: Optional iterable with one string per individual,
             appended after the other columns. ``None`` (the default)
             means no genotype columns.
     """
     variant_info = [chrom, pos, rs_id, ref, alt, qual, filt, info]
     if form:
         variant_info.append(form)
     # None stands in for "no genotypes" so that no mutable default
     # object is shared between calls
     if genotypes is not None:
         variant_info.extend(genotypes)

     variant_line = '\t'.join(variant_info)
     variant = format_variant(
         line=variant_line,
         header_parser=self.metadata,
         check_info=self.check_info
     )

     if not (self.split_variants and len(variant['ALT'].split(',')) > 1):
         self.variants.append(variant)

     # If multiple alternatives and split_variants we must split the variant
     else:
         for splitted_variant in split_variants(
                 variant_dict=variant,
                 header_parser=self.metadata,
                 allele_symbol=self.allele_symbol):
             self.variants.append(splitted_variant)
Example #3
0
def test_simple_variant():
    """
    Check what format_variant returns for a line with one alternative.

    Every column has to be reachable by name, the info field has to be
    parsed into 'info_dict' and the genotypes have to be Genotype objects.
    """
    header_parser = get_header()

    variant_line = ("1\t11900\t.\tA\tT\t100\tPASS\tMQ=1\tGT:GQ\t0/1:60\t"
                    "0/1:60\t1/1:60")

    variant = format_variant(
        line=variant_line,
        header_parser=header_parser,
        check_info=True
    )

    expected_info = OrderedDict([('MQ', ['1'])])

    # Column name and expected raw value, in the order they are checked
    expected_columns = [
        ('CHROM', "1"),
        ('POS', "11900"),
        ('ID', "."),
        ('REF', "A"),
        ('ALT', "T"),
        ('QUAL', "100"),
        ('FILTER', "PASS"),
        ('INFO', "MQ=1"),
        ('FORMAT', "GT:GQ"),
        ('father', "0/1:60"),
        ('mother', "0/1:60"),
        ('proband', "1/1:60"),
    ]
    for column, expected_value in expected_columns:
        assert variant[column] == expected_value

    assert variant['info_dict'] == expected_info
    assert type(variant['genotypes']['mother']) == type(Genotype())
def test_simple_variant():
    """
    Check the dictionary format_variant builds from a plain variant line.

    The raw columns are compared as one mapping, then the parsed info
    dictionary and the type of a genotype entry are checked.
    """
    header_parser = get_header()

    variant_line = ("1\t11900\t.\tA\tT\t100\tPASS\tMQ=1\tGT:GQ\t0/1:60\t"
                    "0/1:60\t1/1:60")

    variant = format_variant(
        line=variant_line,
        header_parser=header_parser,
        check_info=True
    )

    # Raw value expected under each column name
    expected = {
        'CHROM': "1",
        'POS': "11900",
        'ID': ".",
        'REF': "A",
        'ALT': "T",
        'QUAL': "100",
        'FILTER': "PASS",
        'INFO': "MQ=1",
        'FORMAT': "GT:GQ",
        'father': "0/1:60",
        'mother': "0/1:60",
        'proband': "1/1:60",
    }
    observed = {column: variant[column] for column in expected}
    assert observed == expected

    parsed_info = OrderedDict()
    parsed_info['MQ'] = ['1']
    assert variant['info_dict'] == parsed_info

    mother_genotype = variant['genotypes']['mother']
    assert type(mother_genotype) == type(Genotype())
def test_split_minimal():
    """
    Split a vcf line that has two alternatives and no genotype columns.

    The header only declares the eight mandatory columns, and the split
    has to give one variant per alternative.
    """
    header_parser = get_header([
        '##fileformat=VCFv4.2',
        '##FILTER=<ID=LowQual,Description="Low quality">',
        '##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">',
        '##contig=<ID=1,length=249250621,assembly=b37>',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO'
    ])

    raw_variant = format_variant(
        line="3\t947379\t.\tA\tT,C\t100\tPASS\tMQ=1",
        header_parser=header_parser,
        check_info=True
    )

    splitted_variants = list(split_variants(raw_variant, header_parser))

    assert len(splitted_variants) == 2
def test_split_no_info():
    """
    Split a variant whose INFO column is the placeholder '.'.

    Both resulting variants have to keep '.' as INFO and get
    {'.': []} as their info dictionary.
    """
    header_parser = get_header()

    variant_line = ("3\t947379\t.\tA\tT,C\t100\tPASS\t."
                    "\tGT:GQ:AD:DP\t1/1:60:0,7,0:12\t0/2:60:7,0,10:17"
                    "\t1/2:60:0,7,8:16")

    # The info check is switched off for this line
    raw_variant = format_variant(
        line=variant_line,
        header_parser=header_parser,
        check_info=False
    )

    splitted_variants = list(split_variants(raw_variant, header_parser))

    assert len(splitted_variants) == 2

    for split_variant in splitted_variants:
        assert split_variant['info_dict'] == {'.': []}

    for split_variant in splitted_variants:
        assert split_variant['INFO'] == '.'
def test_wrong_number_of_R_entrys():
    """
    Split a variant where DP_HIST has fewer entries than expected.

    DP_HIST holds two values while the line has two alternatives.
    NOTE(review): presumably DP_HIST is declared with Number=R in the
    default header, which would call for three values - confirm.
    """
    header_parser = get_header()

    variant_line = ("3\t947379\t.\tA\tT,C\t100\tPASS\tMQ=1;CNT=5,8;"
                    "DP_HIST=12,43\tGT:GQ:AD:DP\t1/1:60:0,7,0:12\t0/2:60:7,0,10:17"
                    "\t1/2:60:0,7,8:16")

    # The info check has to be skipped to get this line through
    raw_variant = format_variant(
        line=variant_line,
        header_parser=header_parser,
        check_info=False
    )

    splitted_variants = list(split_variants(raw_variant, header_parser))

    assert len(splitted_variants) == 2

    # Both alleles are expected to keep the annotation as it was given
    for split_variant in splitted_variants:
        assert split_variant['info_dict']['DP_HIST'] == ['12', '43']
def test_simple_split():
    """
    Test how split_variants behaves on a simple split.

    A line with two alternatives is split in two variants. The reference,
    the alternatives, the info annotations and the genotype calls of each
    individual are checked for both resulting variants.
    """

    header_parser = get_header()

    variant_line = "3\t947379\t.\tA\tT,C\t100\tPASS\tMQ=1;CNT=5,8;"\
    "DP_HIST=12,43,22\tGT:GQ:AD:DP\t1/1:60:0,7,0:12\t0/2:60:7,0,10:17"\
    "\t1/2:60:0,7,8:16"

    variant = format_variant(
        line=variant_line,
        header_parser=header_parser,
        check_info=True
    )

    splitted_variants = []

    for variant in split_variants(variant, header_parser):
        splitted_variants.append(variant)

    assert len(splitted_variants) == 2

    first_variant = splitted_variants[0]
    second_variant = splitted_variants[1]

    # Test if the splitted variants still have the same reference
    assert first_variant['REF'] == 'A'
    assert second_variant['REF'] == 'A'
    # Test if the alternative was splitted properly
    assert first_variant['ALT'] == 'T'
    assert second_variant['ALT'] == 'C'
    # Test if a simple info field is handled correctly
    assert first_variant['info_dict']['MQ'] == ['1']
    assert second_variant['info_dict']['MQ'] == ['1']
    # Test if an info field with Number='A' is handled correctly
    assert first_variant['info_dict']['CNT'] == ['5']
    assert second_variant['info_dict']['CNT'] == ['8']
    # Test if an info field with Number='R' is handled correctly
    assert first_variant['info_dict']['DP_HIST'] == ['12', '43']
    assert second_variant['info_dict']['DP_HIST'] == ['12', '22']

    # Test if the genotypes are in the correct format
    assert first_variant['father'] == "1/1:60:0,7:12"
    assert second_variant['father'] == "0/0:60:0,0:12"

    assert first_variant['mother'] == "0/0:60:7,0:17"
    assert second_variant['mother'] == "0/1:60:7,10:17"

    assert first_variant['proband'] == "0/1:60:0,7:16"
    assert second_variant['proband'] == "0/1:60:0,8:16"
Example #9
0
def test_malformed_line():
    """
    A vcf line that lacks the position column has to be rejected.

    format_variant is expected to raise SyntaxError for such a line.
    """
    header_parser = get_header()

    # The position column is left out of this line
    line_without_position = ("1\t.\tA\tT\t100\tPASS\tMQ=1\tGT:GQ\t0/1:60\t"
                             "0/1:60\t1/1:60")

    with pytest.raises(SyntaxError):
        format_variant(
            line=line_without_position,
            header_parser=header_parser,
            check_info=True
        )
Example #10
0
def test_wrong_number_annotation_genotype():
    """
    A line whose annotations do not pass the info check has to be rejected.

    format_variant is expected to raise SyntaxError when check_info is on.
    NOTE(review): presumably it is the SQ entry that has the wrong number
    of values for its header declaration - confirm against the header.
    """
    header_parser = get_header()

    annotated_line = ("3\t947379\t.\tA\tT,C\t100\tPASS\tMQ=1;CNT=5,6;"
                      "SQ=1,2\tGT:GQ:AD:DP\t1/1:60:0,7,0:12\t0/2:60:7,0,10:17"
                      "\t1/2:60:0,7,8:16")

    with pytest.raises(SyntaxError):
        format_variant(
            line=annotated_line,
            header_parser=header_parser,
            check_info=True
        )
Example #11
0
def test_malformed_line():
    """
    Make sure a line without its position column raises SyntaxError.
    """
    parser = get_header()

    # Chromosome is followed directly by the id, the position is missing
    malformed = ("1\t.\tA\tT\t100\tPASS\tMQ=1\tGT:GQ\t0/1:60\t"
                 "0/1:60\t1/1:60")

    format_arguments = {
        'line': malformed,
        'header_parser': parser,
        'check_info': True,
    }
    with pytest.raises(SyntaxError):
        format_variant(**format_arguments)
Example #12
0
def test_wrong_number_annotation_genotype():
    """
    Make sure the info check raises SyntaxError for this annotated line.

    NOTE(review): the line looks well formed column wise; presumably one
    of the info entries (likely SQ) has a wrong number of values - confirm.
    """
    parser = get_header()

    info_column = "MQ=1;CNT=5,6;SQ=1,2"
    checked_line = ("3\t947379\t.\tA\tT,C\t100\tPASS\tMQ=1;CNT=5,6;"
                    "SQ=1,2\tGT:GQ:AD:DP\t1/1:60:0,7,0:12\t0/2:60:7,0,10:17"
                    "\t1/2:60:0,7,8:16")
    # Sanity anchor for the reader: the info column under test
    assert info_column in checked_line

    format_arguments = {
        'line': checked_line,
        'header_parser': parser,
        'check_info': True,
    }
    with pytest.raises(SyntaxError):
        format_variant(**format_arguments)
Example #13
0
def test_no_genotypes():
    """
    Format a variant from a minimal vcf that has no genotype columns.

    The test passes as long as format_variant does not raise.
    """
    header_parser = get_header([
        '##fileformat=VCFv4.2',
        '##FILTER=<ID=LowQual,Description="Low quality">',
        '##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">',
        '##contig=<ID=1,length=249250621,assembly=b37>',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO'
    ])

    # Only the eight mandatory columns are present
    minimal_line = "1\t11900\t.\tA\tT\t100\tPASS\tMQ=1"

    format_variant(
        line=minimal_line,
        header_parser=header_parser,
        check_info=True
    )
Example #14
0
def test_no_genotypes():
    """
    Make sure a line with only the eight mandatory columns can be formatted.

    Nothing is asserted on the result; the call just has to succeed.
    """
    minimal_header = [
        '##fileformat=VCFv4.2',
        '##FILTER=<ID=LowQual,Description="Low quality">',
        '##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">',
        '##contig=<ID=1,length=249250621,assembly=b37>',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO'
    ]
    parser = get_header(minimal_header)

    # No FORMAT column and no individuals on this line
    format_arguments = {
        'line': "1\t11900\t.\tA\tT\t100\tPASS\tMQ=1",
        'header_parser': parser,
        'check_info': True,
    }
    format_variant(**format_arguments)
Example #15
0
def test_csq_split():
    """
    Split a variant that carries one CSQ entry per alternative.

    Each resulting variant has to keep only the CSQ entry of its own
    allele, both in 'info_dict' and in 'vep_info'.
    """
    header_parser = get_header()

    variant_line = ("3\t947379\t.\tA\tT,C\t100\tPASS\tCSQ=T|148398|NM_152486.2,"
                    "C|148398|NM_152486.2\tGT:GQ:AD:DP\t1/1:60:0,7,0:12\t0/2:60:7,0,10:17"
                    "\t1/2:60:0,7,8:16")

    raw_variant = format_variant(
        line=variant_line,
        header_parser=header_parser,
        check_info=True
    )

    splitted_variants = list(split_variants(raw_variant, header_parser))

    assert len(splitted_variants) == 2

    first_variant, second_variant = splitted_variants

    # The CSQ annotation follows the allele it belongs to
    assert first_variant['info_dict']['CSQ'] == ['T|148398|NM_152486.2']
    assert second_variant['info_dict']['CSQ'] == ['C|148398|NM_152486.2']

    # vep_info is keyed on the allele of the variant
    assert list(first_variant['vep_info'].keys()) == ['T']
    assert list(second_variant['vep_info'].keys()) == ['C']

    expected_t_annotation = {
        'Allele': 'T',
        'Gene': '148398',
        'Feature': 'NM_152486.2'
    }
    assert first_variant['vep_info']['T'] == [expected_t_annotation]
Example #16
0
def test_csq_split_missing_allele():
    """
    Split a variant where only one of the two alleles has a CSQ entry.

    The variant for the annotated allele keeps its CSQ entry. The other
    variant has no 'CSQ' key in 'info_dict' and an empty list for its
    allele in 'vep_info'.
    """
    header_parser = get_header()

    variant_line = ("3\t947379\t.\tA\tT,C\t100\tPASS\tCSQ=T|148398|NM_152486.2"
                    "\tGT:GQ:AD:DP\t1/1:60:0,7,0:12\t0/2:60:7,0,10:17"
                    "\t1/2:60:0,7,8:16")

    raw_variant = format_variant(
        line=variant_line,
        header_parser=header_parser,
        check_info=True
    )

    splitted_variants = list(split_variants(raw_variant, header_parser))

    assert len(splitted_variants) == 2

    first_variant, second_variant = splitted_variants

    assert first_variant['info_dict']['CSQ'] == ['T|148398|NM_152486.2']
    # The lookup itself is expected to fail for the allele without CSQ
    with pytest.raises(KeyError):
        assert second_variant['info_dict']['CSQ'] == ['']

    assert list(first_variant['vep_info'].keys()) == ['T']

    assert list(second_variant['vep_info'].keys()) == ['C']

    assert second_variant['vep_info']['C'] == []
Example #17
0
    def __iter__(self):
        """
        Iterate over the variants held by this parser.

        When ``self.vcf`` is set, variants are produced from
        ``self.next_line`` (as long as ``self.beginning`` is still set) and
        then from every remaining line of ``self.vcf``. Otherwise the
        variants stored in ``self.variants`` are yielded.

        If ``self.split_variants`` is set, a line with several comma
        separated alternatives is expanded with ``split_variants`` and each
        resulting variant is yielded on its own.

        Raises:
            SyntaxError: if the header has no fileformat defined.
        """
        if not self.metadata.fileformat:
            raise SyntaxError("Vcf must have fileformat defined")

        if self.vcf:

            # We need to treat the first case as an exception.
            # format_variant is only called when next_line holds something.
            # NOTE(review): presumably next_line is empty when the vcf has
            # no variant lines - confirm against the constructor.
            if self.beginning and self.next_line:
                first_variant = format_variant(
                    line=self.next_line,
                    header_parser=self.metadata,
                    check_info=self.check_info
                )

                if not (self.split_variants
                        and len(first_variant['ALT'].split(',')) > 1):
                    variants = [first_variant]
                else:
                    variants = list(split_variants(
                        variant_dict=first_variant,
                        header_parser=self.metadata,
                        allele_symbol=self.allele_symbol
                    ))

                for variant in variants:
                    yield variant

                self.beginning = False

            for line in self.vcf:
                line = line.rstrip()

                # Skip header lines and lines with fewer than 8 tab
                # separated columns
                if line.startswith('#') or len(line.split('\t')) < 8:
                    continue

                variant = format_variant(
                    line=line,
                    header_parser=self.metadata,
                    check_info=self.check_info
                )

                # These are the variant(s) found in one line of the vcf.
                # If there are multiple alternatives and self.split_variants
                # there can be more than one variant in one line
                if not (self.split_variants
                        and len(variant['ALT'].split(',')) > 1):
                    variants = [variant]
                else:
                    variants = list(split_variants(
                        variant_dict=variant,
                        header_parser=self.metadata,
                        allele_symbol=self.allele_symbol
                    ))

                for variant in variants:
                    yield variant

        else:
            for variant in self.variants:
                yield variant