def get_header(header_lines = None):
    """Build a HeaderParser from vcf header lines and return it.

    Args:
        header_lines: Iterable of header strings. When falsy (``None`` or
            empty) a built-in example header is used instead.

    Returns:
        The HeaderParser after every line starting with ``##`` has been
        passed to ``parse_meta_data`` and every remaining line starting
        with ``#`` to ``parse_header_line``. Other lines are ignored.
    """
    header_parser = HeaderParser()

    if not header_lines:
        header_lines = [
            '##fileformat=VCFv4.2',
            '##FILTER=<ID=LowQual,Description="Low quality">',
            '##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">',
            '##INFO=<ID=SQ,Number=G,Type=Float,Description="Just for test">',
            '##INFO=<ID=CNT,Number=A,Type=Integer,Description="Number of times '\
            'this allele was found in external db">',
            '##contig=<ID=1,length=249250621,assembly=b37>',
            '##INFO=<ID=DP_HIST,Number=R,Type=String,Description="Histogram for '\
            'DP; Mids: 2.5|7.5|12.5|17.5|22.5|27.5|32.5|37.5|42.5|47.5|52.5|57.5|'\
            '62.5|67.5|72.5|77.5|82.5|87.5|92.5|97.5">',
            '##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for'\
            ' the ref and alt alleles in the order listed">',
            '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">',
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
            # The trailing comma matters: without it this literal is silently
            # concatenated with the '##reference' line into one header line.
            '##FORMAT=<ID=GQ,Number=1,Type=String,Description="GenotypeQuality">',
            '##reference=file:///human_g1k_v37.fasta',
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tfather\tmother\tproband',
        ]

    for line in header_lines:
        if line.startswith('##'):
            header_parser.parse_meta_data(line)
        elif line.startswith('#'):
            header_parser.parse_header_line(line)

    return header_parser
def test_vep_columns():
    """
    Check that the field names listed after "Format:" in a CSQ info line
    are exposed as vep_columns
    """
    expected_columns = [
        'Allele', 'Gene', 'Feature', 'Feature_type', 'Consequence'
    ]
    csq_line = (
        '##INFO=<ID=CSQ,Number=.,Type=String,Description="Consequence'
        ' type as predicted by VEP. Format: Allele|Gene|Feature|Feature_type|Consequence">'
    )

    parser = HeaderParser()
    parser.parse_meta_data(csq_line)

    assert parser.vep_columns == expected_columns
def test_parse_vcf_lines():
    """
    Test how the header parser behaves with simple vcf lines

    Feeds a small, well formed header to a HeaderParser and checks the
    fileformat, individuals, vep columns, info/filter/contig entries and
    the column header that the parser exposes afterwards.
    """

    header_parser = HeaderParser()

    header_lines = [
        '##fileformat=VCFv4.2',
        '##FILTER=<ID=LowQual,Description="Low quality">',
        '##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">',
        '##INFO=<ID=CNT,Number=A,Type=Integer,Description="Number of times '\
        'this allele was found in external db">',
        '##contig=<ID=1,length=249250621,assembly=b37>',
        '##INFO=<ID=DP_HIST,Number=R,Type=String,Description="Histogram for '\
        'DP; Mids: 2.5|7.5|12.5|17.5|22.5|27.5|32.5|37.5|42.5|47.5|52.5|57.5|'\
        '62.5|67.5|72.5|77.5|82.5|87.5|92.5|97.5">',
        '##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for'\
        ' the ref and alt alleles in the order listed">',
        '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
        # The trailing comma matters: without it this literal is silently
        # concatenated with the '##reference' line into one header line.
        '##FORMAT=<ID=GQ,Number=1,Type=String,Description="GenotypeQuality">',
        '##reference=file:///human_g1k_v37.fasta',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tfather\tmother\tproband',
    ]
    for line in header_lines:
        if line.startswith('##'):
            header_parser.parse_meta_data(line)
        elif line.startswith('#'):
            header_parser.parse_header_line(line)

    assert header_parser.fileformat == "VCFv4.2"
    assert header_parser.individuals == ['father','mother','proband']

    # No CSQ line was given so there should be no vep columns
    assert header_parser.vep_columns == []

    assert "MQ" in header_parser.extra_info
    assert header_parser.extra_info["MQ"]['Description'] == "RMS Mapping Quality"
    assert "CNT" in header_parser.extra_info
    assert header_parser.extra_info["CNT"]['Number'] == "A"
    assert header_parser.extra_info["CNT"]['Type'] == "Integer"
    assert "DP_HIST" in header_parser.extra_info

    assert "LowQual" in header_parser.filter_dict
    assert "1" in header_parser.contig_dict

    assert header_parser.header == [
        'CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT',
        'father','mother','proband'
    ]
def test_vep_columns():
    """
    Parse a single CSQ info line and verify the resulting vep columns
    """
    csq_description = (
        '##INFO=<ID=CSQ,Number=.,Type=String,Description="Consequence'
        ' type as predicted by VEP. Format: Allele|Gene|Feature|Feature_type|Consequence">'
    )
    wanted = ['Allele', 'Gene', 'Feature', 'Feature_type', 'Consequence']

    parser = HeaderParser()
    parser.parse_meta_data(csq_description)

    assert parser.vep_columns == wanted
    
    


    
Beispiel #5
0
def get_header(header_lines=None):
    """Initiate a HeaderParser, feed it header lines and return it.

    Args:
        header_lines: Iterable of header strings. A built-in example
            header is used when this is ``None`` or empty.

    Returns:
        The HeaderParser. Lines starting with ``##`` are handed to
        ``parse_meta_data``, other lines starting with ``#`` to
        ``parse_header_line``; anything else is skipped.
    """
    header_parser = HeaderParser()

    if not header_lines:
        header_lines = [
            '##fileformat=VCFv4.2',
            '##FILTER=<ID=LowQual,Description="Low quality">',
            '##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">',
            '##INFO=<ID=SQ,Number=G,Type=Float,Description="Just for test">',
            '##INFO=<ID=CNT,Number=A,Type=Integer,Description="Number of times '\
            'this allele was found in external db">',
            '##contig=<ID=1,length=249250621,assembly=b37>',
            '##INFO=<ID=DP_HIST,Number=R,Type=String,Description="Histogram for '\
            'DP; Mids: 2.5|7.5|12.5|17.5|22.5|27.5|32.5|37.5|42.5|47.5|52.5|57.5|'\
            '62.5|67.5|72.5|77.5|82.5|87.5|92.5|97.5">',
            '##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for'\
            ' the ref and alt alleles in the order listed">',
            '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">',
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
            # Keep the comma below: adjacent string literals are joined, which
            # would glue the GQ and reference lines into a single entry.
            '##FORMAT=<ID=GQ,Number=1,Type=String,Description="GenotypeQuality">',
            '##reference=file:///human_g1k_v37.fasta',
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tfather\tmother\tproband',
        ]

    for line in header_lines:
        if line.startswith('##'):
            header_parser.parse_meta_data(line)
        elif line.startswith('#'):
            header_parser.parse_header_line(line)

    return header_parser
def test_malformed_lines():
    """
    Test that malformed metadata lines make the header parser raise

    Each of the lines below is expected to make parse_meta_data raise
    a SyntaxError.
    """
    parser = HeaderParser()

    bad_lines = (
        # fileformat line without a value
        '##fileformat',
        # info line without a Type entry
        '##INFO=<ID=MQ,Number=1,Description="RMS Mapping Quality">',
        # contig line without a length entry
        '##contig=<ID=1,assembly=b37>',
    )

    for bad_line in bad_lines:
        with pytest.raises(SyntaxError):
            parser.parse_meta_data(bad_line)
def test_parse_vcf_lines():
    """
    Test how the header parser behaves with simple vcf lines

    A small, well formed header is parsed line by line and the attributes
    that the HeaderParser exposes afterwards are checked.
    """

    header_parser = HeaderParser()

    header_lines = [
        '##fileformat=VCFv4.2',
        '##FILTER=<ID=LowQual,Description="Low quality">',
        '##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">',
        '##INFO=<ID=CNT,Number=A,Type=Integer,Description="Number of times '\
        'this allele was found in external db">',
        '##contig=<ID=1,length=249250621,assembly=b37>',
        '##INFO=<ID=DP_HIST,Number=R,Type=String,Description="Histogram for '\
        'DP; Mids: 2.5|7.5|12.5|17.5|22.5|27.5|32.5|37.5|42.5|47.5|52.5|57.5|'\
        '62.5|67.5|72.5|77.5|82.5|87.5|92.5|97.5">',
        '##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for'\
        ' the ref and alt alleles in the order listed">',
        '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
        # Keep the comma below: adjacent string literals are joined, which
        # would glue the GQ and reference lines into a single entry.
        '##FORMAT=<ID=GQ,Number=1,Type=String,Description="GenotypeQuality">',
        '##reference=file:///human_g1k_v37.fasta',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tfather\tmother\tproband',
    ]
    for line in header_lines:
        if line.startswith('##'):
            header_parser.parse_meta_data(line)
        elif line.startswith('#'):
            header_parser.parse_header_line(line)

    assert header_parser.fileformat == "VCFv4.2"
    assert header_parser.individuals == ['father', 'mother', 'proband']

    # No CSQ line was given so there should be no vep columns
    assert header_parser.vep_columns == []

    assert "MQ" in header_parser.extra_info
    assert header_parser.extra_info["MQ"][
        'Description'] == "RMS Mapping Quality"
    assert "CNT" in header_parser.extra_info
    assert header_parser.extra_info["CNT"]['Number'] == "A"
    assert header_parser.extra_info["CNT"]['Type'] == "Integer"
    assert "DP_HIST" in header_parser.extra_info

    assert "LowQual" in header_parser.filter_dict
    assert "1" in header_parser.contig_dict

    assert header_parser.header == [
        'CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT',
        'father', 'mother', 'proband'
    ]
def test_malformed_lines():
    """
    Test that the header parser rejects malformed metadata lines

    Every line is passed to parse_meta_data on the same parser and must
    raise a SyntaxError.
    """
    parser = HeaderParser()

    fileformat_without_value = '##fileformat'
    info_without_type = '##INFO=<ID=MQ,Number=1,Description="RMS Mapping Quality">'
    contig_without_length = '##contig=<ID=1,assembly=b37>'

    for broken_line in (fileformat_without_value,
                        info_without_type,
                        contig_without_length):
        with pytest.raises(SyntaxError):
            parser.parse_meta_data(broken_line)
Beispiel #9
0
    def __init__(self,
                 infile=None,
                 fsock=None,
                 split_variants=False,
                 check_info=False,
                 allele_symbol='0',
                 fileformat=None):
        """Set up the parser and, when a source is given, read its header.

        Args:
            infile: Path to a vcf file. A ``.gz`` extension is read through
                gzip with a utf-8 reader, ``.vcf`` is opened as utf-8 text.
            fsock: An already opened stream to read the vcf from. It is
                used whenever it is given, also if ``infile`` is set.
            split_variants: Stored on the instance as ``split_variants``.
            check_info: Stored on the instance as ``check_info``.
            allele_symbol: Stored on the instance as ``allele_symbol``.
            fileformat: File format that is set on the header parser when
                neither ``infile`` nor ``fsock`` is given.

        Raises:
            IOError: If ``infile`` has an unsupported extension, if the
                first line of the source does not start with ``#`` or if
                there is no source and no ``fileformat``.
        """
        super(VCFParser, self).__init__()
        self.logger = logging.getLogger(__name__)

        self.vcf = None
        self.logger.debug("Set self.vcf to:{0}".format(self.vcf))
        self.beginning = True
        self.infile = infile
        self.fsock = fsock
        self.split_variants = split_variants
        self.logger.info("Split variants = {0}".format(self.split_variants))
        self.fileformat = fileformat

        self.check_info = check_info
        self.logger.info("check info = {0}".format(self.check_info))

        self.allele_symbol = allele_symbol
        self.logger.info("Allele symbol = {0}".format(self.allele_symbol))

        self.logger.info("Initializing HeaderParser")
        self.metadata = HeaderParser()
        # These are the individuals described in the header
        self.individuals = []
        # This is the header line of the vcf
        self.header = []

        # If there are no file or stream the user can add variants manually.
        # These will be added to self.variants
        self.variants = []

        if (fsock or infile):

            if fsock:
                # Always read from the stream that was handed in. Previously
                # python 3 read sys.stdin regardless of fsock, and a stream
                # without a 'name' (or given together with infile) left
                # self.vcf as None which crashed on the first readline().
                self.logger.info("Reading vcf form stdin")
                if sys.version_info < (3, 0):
                    # Kept as before for python 2: decode the stream as
                    # utf-8 and expose it through sys.stdin
                    self.logger.info("Using codecs to read stdin")
                    sys.stdin = getreader('utf-8')(fsock)
                    self.vcf = sys.stdin
                else:
                    self.vcf = fsock

            else:
                self.logger.info("Reading vcf form file {0}".format(infile))
                file_name, file_extension = os.path.splitext(infile)
                if file_extension == '.gz':
                    self.logger.debug("Vcf is zipped")
                    self.vcf = getreader('utf-8')(gzip.open(infile),
                                                  errors='replace')
                elif file_extension == '.vcf':
                    self.vcf = open(infile,
                                    mode='r',
                                    encoding='utf-8',
                                    errors='replace')
                else:
                    raise IOError("File is not in a supported format!\n"
                                  " Or use correct ending(.vcf or .vcf.gz)")

            self.logger.debug("Reading first line.")
            self.next_line = self.vcf.readline().rstrip()
            self.current_line = self.next_line

            # First line is always a metadata line
            if not self.next_line.startswith('#'):
                raise IOError(
                    "VCF files allways have to start with a metadata line.")

            # Parse the metadata lines. The first line is handled by this
            # loop as well, so it is not parsed separately beforehand
            # (that made the parser see the first line twice).
            while self.next_line.startswith('#'):
                if self.next_line.startswith('##'):
                    self.metadata.parse_meta_data(self.next_line)
                else:
                    self.metadata.parse_header_line(self.next_line)
                self.next_line = self.vcf.readline().rstrip()

            # self.next_line now holds the first non header line, or an
            # empty string if the source ended after the header
            self.individuals = self.metadata.individuals
            self.logger.info("Setting self.individuals to {0}".format(
                self.individuals))
            self.header = self.metadata.header
            self.vep_header = self.metadata.vep_columns

        else:
            if not self.fileformat:
                raise IOError("Please initialize with a fileformat.")
            else:
                self.metadata.fileformat = self.fileformat
Beispiel #10
0
class VCFParser(object):
    """Iterate over the variants of a vcf file, a stream or a manual list.

    When ``infile`` or ``fsock`` is given the header is parsed on
    construction and iterating the parser yields one formatted variant
    per vcf line (or several when ``split_variants`` is set and the line
    has multiple alternatives). Without a source, variants can be added
    with ``add_variant`` and are yielded from ``self.variants``.
    """
    def __init__(self,
                 infile=None,
                 fsock=None,
                 split_variants=False,
                 check_info=False,
                 allele_symbol='0',
                 fileformat=None):
        """Set up the parser and, when a source is given, read its header.

        Args:
            infile: Path to a vcf file. A ``.gz`` extension is read through
                gzip with a utf-8 reader, ``.vcf`` is opened as utf-8 text.
            fsock: An already opened stream to read the vcf from. It is
                used whenever it is given, also if ``infile`` is set.
            split_variants: If true, variants with more than one
                alternative are split with ``split_variants``.
            check_info: Forwarded to ``format_variant``.
            allele_symbol: Forwarded to ``split_variants``.
            fileformat: File format that is set on the header parser when
                neither ``infile`` nor ``fsock`` is given.

        Raises:
            IOError: If ``infile`` has an unsupported extension, if the
                first line of the source does not start with ``#`` or if
                there is no source and no ``fileformat``.
        """
        super(VCFParser, self).__init__()
        self.logger = logging.getLogger(__name__)

        self.vcf = None
        self.logger.debug("Set self.vcf to:{0}".format(self.vcf))
        self.beginning = True
        self.infile = infile
        self.fsock = fsock
        self.split_variants = split_variants
        self.logger.info("Split variants = {0}".format(self.split_variants))
        self.fileformat = fileformat

        self.check_info = check_info
        self.logger.info("check info = {0}".format(self.check_info))

        self.allele_symbol = allele_symbol
        self.logger.info("Allele symbol = {0}".format(self.allele_symbol))

        self.logger.info("Initializing HeaderParser")
        self.metadata = HeaderParser()
        # These are the individuals described in the header
        self.individuals = []
        # This is the header line of the vcf
        self.header = []

        # If there are no file or stream the user can add variants manually.
        # These will be added to self.variants
        self.variants = []

        if (fsock or infile):

            if fsock:
                # Always read from the stream that was handed in. Previously
                # python 3 read sys.stdin regardless of fsock, and a stream
                # without a 'name' (or given together with infile) left
                # self.vcf as None which crashed on the first readline().
                self.logger.info("Reading vcf form stdin")
                if sys.version_info < (3, 0):
                    # Kept as before for python 2: decode the stream as
                    # utf-8 and expose it through sys.stdin
                    self.logger.info("Using codecs to read stdin")
                    sys.stdin = getreader('utf-8')(fsock)
                    self.vcf = sys.stdin
                else:
                    self.vcf = fsock

            else:
                self.logger.info("Reading vcf form file {0}".format(infile))
                file_name, file_extension = os.path.splitext(infile)
                if file_extension == '.gz':
                    self.logger.debug("Vcf is zipped")
                    self.vcf = getreader('utf-8')(gzip.open(infile),
                                                  errors='replace')
                elif file_extension == '.vcf':
                    self.vcf = open(infile,
                                    mode='r',
                                    encoding='utf-8',
                                    errors='replace')
                else:
                    raise IOError("File is not in a supported format!\n"
                                  " Or use correct ending(.vcf or .vcf.gz)")

            self.logger.debug("Reading first line.")
            self.next_line = self.vcf.readline().rstrip()
            self.current_line = self.next_line

            # First line is always a metadata line
            if not self.next_line.startswith('#'):
                raise IOError(
                    "VCF files allways have to start with a metadata line.")

            # Parse the metadata lines. The first line is handled by this
            # loop as well, so it is not parsed separately beforehand
            # (that made the parser see the first line twice).
            while self.next_line.startswith('#'):
                if self.next_line.startswith('##'):
                    self.metadata.parse_meta_data(self.next_line)
                else:
                    self.metadata.parse_header_line(self.next_line)
                self.next_line = self.vcf.readline().rstrip()

            # self.next_line now holds the first variant line, or an
            # empty string if the source ended after the header
            self.individuals = self.metadata.individuals
            self.logger.info("Setting self.individuals to {0}".format(
                self.individuals))
            self.header = self.metadata.header
            self.vep_header = self.metadata.vep_columns

        else:
            if not self.fileformat:
                raise IOError("Please initialize with a fileformat.")
            else:
                self.metadata.fileformat = self.fileformat

    def _split_if_needed(self, variant):
        """Return the list of variants that ``variant`` is reported as.

        The variant is split only when ``self.split_variants`` is set and
        its ``ALT`` entry holds more than one comma separated alternative;
        otherwise the list contains just the variant itself.
        """
        if self.split_variants and len(variant['ALT'].split(',')) > 1:
            return list(
                split_variants(variant_dict=variant,
                               header_parser=self.metadata,
                               allele_symbol=self.allele_symbol))
        return [variant]

    def add_variant(self,
                    chrom,
                    pos,
                    rs_id,
                    ref,
                    alt,
                    qual,
                    filt,
                    info,
                    form=None,
                    genotypes=None):
        """
        Add a variant to the parser.

        This function is for building a vcf. It takes the relevant parameters
        and make a vcf variant in the proper format.

        The values are joined with tabs into one vcf line, so they all have
        to be strings. ``form`` is only included when it is truthy and
        ``genotypes`` is an optional sequence with one entry per individual.
        The formatted variant(s) are appended to ``self.variants``.
        """
        variant_info = [chrom, pos, rs_id, ref, alt, qual, filt, info]
        if form:
            variant_info.append(form)
        # None instead of a shared mutable list as default argument
        variant_info.extend(genotypes or [])

        variant_line = '\t'.join(variant_info)
        variant = format_variant(line=variant_line,
                                 header_parser=self.metadata,
                                 check_info=self.check_info)

        # If multiple alternative and split_variants we must split the variant
        self.variants.extend(self._split_if_needed(variant))

    def __iter__(self):
        """Yield the variants of the source or the manually added ones.

        Raises:
            SyntaxError: If no fileformat is known to the header parser.
        """
        if not self.metadata.fileformat:
            raise SyntaxError("Vcf must have fileformat defined")

        if not self.vcf:
            # No file or stream, use the variants that were added manually
            for variant in self.variants:
                yield variant
            return

        # We need to treat the first case as an exception since that line
        # was already read from the source when the header was parsed
        if self.beginning:
            # next_line is empty when there are no variants after the header
            if self.next_line:
                first_variant = format_variant(line=self.next_line,
                                               header_parser=self.metadata,
                                               check_info=self.check_info)
                for variant in self._split_if_needed(first_variant):
                    yield variant

            self.beginning = False

        for line in self.vcf:
            line = line.rstrip()
            # Skip comment lines and lines with less than the 8 mandatory
            # vcf columns
            if line.startswith('#') or len(line.split('\t')) < 8:
                continue

            variant = format_variant(line=line,
                                     header_parser=self.metadata,
                                     check_info=self.check_info)

            # If there are multiple alternatives and self.split_variants
            # there can be more than one variant in one line
            for splitted_variant in self._split_if_needed(variant):
                yield splitted_variant

    def __repr__(self):
        return "Parser(infile={0},fsock={1},split_variants={2})".format(
            self.infile, self.fsock, self.split_variants)
Beispiel #11
0
    def __init__(self, infile=None, fsock=None, split_variants=False,
                check_info=False, allele_symbol='0', fileformat = None):
        """Initialize the parser and parse the header of the source, if any.

        Args:
            infile: Path to a vcf file. Files ending with ``.gz`` are read
                through gzip with a utf-8 reader, files ending with
                ``.vcf`` are opened as utf-8 text.
            fsock: An open stream with vcf content. When given it is always
                the source that is read, also if ``infile`` is set.
            split_variants: Stored on the instance as ``split_variants``.
            check_info: Stored on the instance as ``check_info``.
            allele_symbol: Stored on the instance as ``allele_symbol``.
            fileformat: Set on the header parser when there is neither an
                ``infile`` nor a ``fsock``.

        Raises:
            IOError: For an unsupported file extension, when the first line
                of the source does not start with ``#`` and when neither a
                source nor a ``fileformat`` is given.
        """
        super(VCFParser, self).__init__()
        self.logger = logging.getLogger(__name__)

        self.vcf = None
        self.logger.debug("Set self.vcf to:{0}".format(self.vcf))
        self.beginning = True
        self.infile = infile
        self.fsock = fsock
        self.split_variants = split_variants
        self.logger.info("Split variants = {0}".format(self.split_variants))
        self.fileformat = fileformat

        self.check_info = check_info
        self.logger.info("check info = {0}".format(self.check_info))

        self.allele_symbol = allele_symbol
        self.logger.info("Allele symbol = {0}".format(self.allele_symbol))

        self.logger.info("Initializing HeaderParser")
        self.metadata = HeaderParser()
        # These are the individuals described in the header
        self.individuals = []
        # This is the header line of the vcf
        self.header = []

        # If there are no file or stream the user can add variants manually.
        # These will be added to self.variants
        self.variants = []

        if (fsock or infile):

            if fsock:
                # Read from the given stream itself. Before, python 3 read
                # sys.stdin no matter what fsock was, and self.vcf stayed
                # None (crashing on readline) when fsock had no 'name'
                # attribute or was combined with infile.
                self.logger.info("Reading vcf form stdin")
                if sys.version_info < (3, 0):
                    # Unchanged for python 2: wrap the stream in a utf-8
                    # reader and expose it through sys.stdin
                    self.logger.info("Using codecs to read stdin")
                    sys.stdin = getreader('utf-8')(fsock)
                    self.vcf = sys.stdin
                else:
                    self.vcf = fsock

            else:
                self.logger.info("Reading vcf form file {0}".format(infile))
                file_name, file_extension = os.path.splitext(infile)
                if file_extension == '.gz':
                    self.logger.debug("Vcf is zipped")
                    self.vcf = getreader('utf-8')(gzip.open(infile), errors='replace')
                elif file_extension == '.vcf':
                    self.vcf = open(infile, mode='r', encoding='utf-8', errors='replace')
                else:
                    raise IOError("File is not in a supported format!\n"
                                        " Or use correct ending(.vcf or .vcf.gz)")

            self.logger.debug("Reading first line.")
            self.next_line = self.vcf.readline().rstrip()
            self.current_line = self.next_line

            # First line is always a metadata line
            if not self.next_line.startswith('#'):
                raise IOError("VCF files allways have to start with a metadata line.")

            # Parse the metadata lines, starting with the first line. It used
            # to be parsed once more before this loop, which made the header
            # parser see it twice.
            while self.next_line.startswith('#'):
                if self.next_line.startswith('##'):
                    self.metadata.parse_meta_data(self.next_line)
                else:
                    self.metadata.parse_header_line(self.next_line)
                self.next_line = self.vcf.readline().rstrip()

            # self.next_line is now the first line after the header, or an
            # empty string if the source ended there
            self.individuals = self.metadata.individuals
            self.logger.info("Setting self.individuals to {0}".format(
                self.individuals
            ))
            self.header = self.metadata.header
            self.vep_header = self.metadata.vep_columns

        else:
            if not self.fileformat:
                raise IOError("Please initialize with a fileformat.")
            else:
                self.metadata.fileformat = self.fileformat
Beispiel #12
0
class VCFParser(object):
    """Parser that yields the variants of a vcf file, stream or manual list.

    With ``infile`` or ``fsock`` the header is parsed when the parser is
    created and iteration yields the formatted variants of the source.
    Without a source a ``fileformat`` has to be given; variants are then
    added with ``add_variant`` and yielded from ``self.variants``.
    """
    def __init__(self, infile=None, fsock=None, split_variants=False,
                check_info=False, allele_symbol='0', fileformat = None):
        """Initialize the parser and parse the header of the source, if any.

        Args:
            infile: Path to a vcf file. Files ending with ``.gz`` are read
                through gzip with a utf-8 reader, files ending with
                ``.vcf`` are opened as utf-8 text.
            fsock: An open stream with vcf content. When given it is always
                the source that is read, also if ``infile`` is set.
            split_variants: If true, variants with several alternatives are
                split with ``split_variants``.
            check_info: Passed on to ``format_variant``.
            allele_symbol: Passed on to ``split_variants``.
            fileformat: Set on the header parser when there is neither an
                ``infile`` nor a ``fsock``.

        Raises:
            IOError: For an unsupported file extension, when the first line
                of the source does not start with ``#`` and when neither a
                source nor a ``fileformat`` is given.
        """
        super(VCFParser, self).__init__()
        self.logger = logging.getLogger(__name__)

        self.vcf = None
        self.logger.debug("Set self.vcf to:{0}".format(self.vcf))
        self.beginning = True
        self.infile = infile
        self.fsock = fsock
        self.split_variants = split_variants
        self.logger.info("Split variants = {0}".format(self.split_variants))
        self.fileformat = fileformat

        self.check_info = check_info
        self.logger.info("check info = {0}".format(self.check_info))

        self.allele_symbol = allele_symbol
        self.logger.info("Allele symbol = {0}".format(self.allele_symbol))

        self.logger.info("Initializing HeaderParser")
        self.metadata = HeaderParser()
        # These are the individuals described in the header
        self.individuals = []
        # This is the header line of the vcf
        self.header = []

        # If there are no file or stream the user can add variants manually.
        # These will be added to self.variants
        self.variants = []

        if (fsock or infile):

            if fsock:
                # Read from the given stream itself. Before, python 3 read
                # sys.stdin no matter what fsock was, and self.vcf stayed
                # None (crashing on readline) when fsock had no 'name'
                # attribute or was combined with infile.
                self.logger.info("Reading vcf form stdin")
                if sys.version_info < (3, 0):
                    # Unchanged for python 2: wrap the stream in a utf-8
                    # reader and expose it through sys.stdin
                    self.logger.info("Using codecs to read stdin")
                    sys.stdin = getreader('utf-8')(fsock)
                    self.vcf = sys.stdin
                else:
                    self.vcf = fsock

            else:
                self.logger.info("Reading vcf form file {0}".format(infile))
                file_name, file_extension = os.path.splitext(infile)
                if file_extension == '.gz':
                    self.logger.debug("Vcf is zipped")
                    self.vcf = getreader('utf-8')(gzip.open(infile), errors='replace')
                elif file_extension == '.vcf':
                    self.vcf = open(infile, mode='r', encoding='utf-8', errors='replace')
                else:
                    raise IOError("File is not in a supported format!\n"
                                        " Or use correct ending(.vcf or .vcf.gz)")

            self.logger.debug("Reading first line.")
            self.next_line = self.vcf.readline().rstrip()
            self.current_line = self.next_line

            # First line is always a metadata line
            if not self.next_line.startswith('#'):
                raise IOError("VCF files allways have to start with a metadata line.")

            # Parse the metadata lines, starting with the first line. It used
            # to be parsed once more before this loop, which made the header
            # parser see it twice.
            while self.next_line.startswith('#'):
                if self.next_line.startswith('##'):
                    self.metadata.parse_meta_data(self.next_line)
                else:
                    self.metadata.parse_header_line(self.next_line)
                self.next_line = self.vcf.readline().rstrip()

            # self.next_line is now the first variant line, or an empty
            # string if the source ended after the header
            self.individuals = self.metadata.individuals
            self.logger.info("Setting self.individuals to {0}".format(
                self.individuals
            ))
            self.header = self.metadata.header
            self.vep_header = self.metadata.vep_columns

        else:
            if not self.fileformat:
                raise IOError("Please initialize with a fileformat.")
            else:
                self.metadata.fileformat = self.fileformat

    def _split_if_needed(self, variant):
        """Return ``variant`` as a list, split per alternative if requested.

        Splitting only happens when ``self.split_variants`` is set and the
        ``ALT`` entry of the variant has more than one comma separated
        alternative.
        """
        if self.split_variants and len(variant['ALT'].split(',')) > 1:
            return list(split_variants(
                variant_dict=variant,
                header_parser=self.metadata,
                allele_symbol=self.allele_symbol
            ))
        return [variant]

    def add_variant(self, chrom, pos, rs_id, ref, alt, qual, filt, info, form=None, genotypes=None):
        """
        Add a variant to the parser.

        This function is for building a vcf. It takes the relevant parameters
        and make a vcf variant in the proper format.

        All values are joined with tabs into a vcf line and therefore have
        to be strings. ``form`` is skipped when it is falsy, ``genotypes``
        is an optional sequence with one entry per individual. The resulting
        variant(s) are appended to ``self.variants``.
        """
        variant_info = [chrom, pos, rs_id, ref, alt, qual, filt, info]
        if form:
            variant_info.append(form)
        # The default is None to avoid a shared mutable default argument
        variant_info.extend(genotypes or [])

        variant_line = '\t'.join(variant_info)
        variant = format_variant(
            line = variant_line,
            header_parser = self.metadata,
            check_info = self.check_info
        )

        # If multiple alternative and split_variants we must split the variant
        self.variants.extend(self._split_if_needed(variant))

    def __iter__(self):
        """Yield the variants of the source or the manually added ones.

        Raises:
            SyntaxError: If the header parser has no fileformat.
        """
        if not self.metadata.fileformat:
            raise SyntaxError("Vcf must have fileformat defined")

        if not self.vcf:
            # No file or stream, use the variants that were added manually
            for variant in self.variants:
                yield variant
            return

        # We need to treat the first case as an exception since that line
        # was consumed from the source while the header was parsed
        if self.beginning:
            # next_line is an empty string when the source has a header but
            # no variants; formatting it would fail, so skip it
            if self.next_line:
                first_variant = format_variant(
                    line = self.next_line,
                    header_parser = self.metadata,
                    check_info = self.check_info
                )
                for variant in self._split_if_needed(first_variant):
                    yield variant

            self.beginning = False

        for line in self.vcf:
            line = line.rstrip()
            # Skip comment lines and lines with less than the 8 mandatory
            # vcf columns
            if line.startswith('#') or len(line.split('\t')) < 8:
                continue

            variant = format_variant(
                line = line,
                header_parser = self.metadata,
                check_info = self.check_info
            )

            # If there are multiple alternatives and self.split_variants
            # there can be more than one variant in one line
            for splitted_variant in self._split_if_needed(variant):
                yield splitted_variant

    def __repr__(self):
        return "Parser(infile={0},fsock={1},split_variants={2})".format(
            self.infile, self.fsock, self.split_variants
        )