def get_header(header_lines=None):
    """Build a HeaderParser and feed it a set of VCF header lines.

    Args:
        header_lines: Iterable of raw header strings. When omitted (or
            empty) a built-in VCFv4.2 example header with the samples
            father, mother and proband is used.

    Returns:
        The HeaderParser instance after every line has been handed to it.
    """
    header_parser = HeaderParser()

    if not header_lines:
        # NOTE: every entry must end with a comma. Adjacent string literals
        # are concatenated silently, which used to glue the GQ FORMAT line
        # and the reference line together into one bogus header line.
        header_lines = [
            '##fileformat=VCFv4.2',
            '##FILTER=<ID=LowQual,Description="Low quality">',
            '##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">',
            '##INFO=<ID=SQ,Number=G,Type=Float,Description="Just for test">',
            '##INFO=<ID=CNT,Number=A,Type=Integer,Description="Number of times '\
            'this allele was found in external db">',
            '##contig=<ID=1,length=249250621,assembly=b37>',
            '##INFO=<ID=DP_HIST,Number=R,Type=String,Description="Histogram for '\
            'DP; Mids: 2.5|7.5|12.5|17.5|22.5|27.5|32.5|37.5|42.5|47.5|52.5|57.5|'\
            '62.5|67.5|72.5|77.5|82.5|87.5|92.5|97.5">',
            '##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for'\
            ' the ref and alt alleles in the order listed">',
            '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">',
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
            '##FORMAT=<ID=GQ,Number=1,Type=String,Description="GenotypeQuality">',
            '##reference=file:///human_g1k_v37.fasta',
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tfather\tmother\tproband'
        ]

    for line in header_lines:
        # '##' marks a meta-data line, a single '#' the column header line.
        # Lines without a leading '#' are ignored.
        if line.startswith('##'):
            header_parser.parse_meta_data(line)
        elif line.startswith('#'):
            header_parser.parse_header_line(line)

    return header_parser
def test_vep_columns():
    """
    Check that the VEP column names are picked up from a CSQ info line
    """
    # The column names are listed, pipe separated, after "Format: " in the
    # description of the CSQ line below.
    expected_columns = 'Allele|Gene|Feature|Feature_type|Consequence'.split('|')
    csq_line = (
        '##INFO=<ID=CSQ,Number=.,Type=String,Description="Consequence'
        ' type as predicted by VEP. Format: Allele|Gene|Feature|Feature_type|Consequence">'
    )

    parser = HeaderParser()
    parser.parse_meta_data(csq_line)

    assert parser.vep_columns == expected_columns
def test_parse_vcf_lines():
    """
    Test how the header parser behaves with simple vcf lines

    A small VCFv4.2 header is parsed line by line and the attributes that
    the parser exposes afterwards are checked.
    """
    header_parser = HeaderParser()

    # NOTE: every entry must end with a comma. Adjacent string literals are
    # concatenated silently, which used to merge the GQ FORMAT line and the
    # reference line into a single bogus header line.
    header_lines = [
        '##fileformat=VCFv4.2',
        '##FILTER=<ID=LowQual,Description="Low quality">',
        '##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">',
        '##INFO=<ID=CNT,Number=A,Type=Integer,Description="Number of times '\
        'this allele was found in external db">',
        '##contig=<ID=1,length=249250621,assembly=b37>',
        '##INFO=<ID=DP_HIST,Number=R,Type=String,Description="Histogram for '\
        'DP; Mids: 2.5|7.5|12.5|17.5|22.5|27.5|32.5|37.5|42.5|47.5|52.5|57.5|'\
        '62.5|67.5|72.5|77.5|82.5|87.5|92.5|97.5">',
        '##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for'\
        ' the ref and alt alleles in the order listed">',
        '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
        '##FORMAT=<ID=GQ,Number=1,Type=String,Description="GenotypeQuality">',
        '##reference=file:///human_g1k_v37.fasta',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tfather\tmother\tproband'
    ]

    for line in header_lines:
        # '##' marks a meta-data line, a single '#' the column header line
        if line.startswith('##'):
            header_parser.parse_meta_data(line)
        elif line.startswith('#'):
            header_parser.parse_header_line(line)

    assert header_parser.fileformat == "VCFv4.2"
    assert header_parser.individuals == ['father', 'mother', 'proband']
    # No CSQ line in this header, so no vep columns are expected
    assert header_parser.vep_columns == []

    assert "MQ" in header_parser.extra_info
    assert header_parser.extra_info["MQ"]['Description'] == "RMS Mapping Quality"
    assert header_parser.extra_info["CNT"]['Number'] == "A"
    assert header_parser.extra_info["CNT"]['Type'] == "Integer"
    assert "CNT" in header_parser.extra_info
    assert "DP_HIST" in header_parser.extra_info

    assert "LowQual" in header_parser.filter_dict
    assert "1" in header_parser.contig_dict

    assert header_parser.header == [
        'CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT',
        'father', 'mother', 'proband'
    ]
def test_vep_columns():
    """
    Verify that parsing a CSQ info line fills in the vep columns
    """
    parser = HeaderParser()

    # The description of the CSQ line carries the pipe separated column
    # names after "Format: ".
    description_head = '##INFO=<ID=CSQ,Number=.,Type=String,Description="Consequence'
    description_tail = ' type as predicted by VEP. Format: Allele|Gene|Feature|Feature_type|Consequence">'
    parser.parse_meta_data(description_head + description_tail)

    wanted = ['Allele', 'Gene', 'Feature', 'Feature_type', 'Consequence']
    assert parser.vep_columns == wanted
def get_header(header_lines=None):
    """Initiate a HeaderParser, parse header lines with it and return it.

    Args:
        header_lines: Iterable of raw header strings. If it is missing or
            empty, a default VCFv4.2 example header (samples father,
            mother and proband) is parsed instead.

    Returns:
        The HeaderParser instance after every line has been handed to it.
    """
    header_parser = HeaderParser()

    if not header_lines:
        # NOTE: keep a comma after every entry. Two adjacent string literals
        # are joined silently; a missing comma previously fused the GQ
        # FORMAT line with the reference line.
        header_lines = [
            '##fileformat=VCFv4.2',
            '##FILTER=<ID=LowQual,Description="Low quality">',
            '##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">',
            '##INFO=<ID=SQ,Number=G,Type=Float,Description="Just for test">',
            '##INFO=<ID=CNT,Number=A,Type=Integer,Description="Number of times '\
            'this allele was found in external db">',
            '##contig=<ID=1,length=249250621,assembly=b37>',
            '##INFO=<ID=DP_HIST,Number=R,Type=String,Description="Histogram for '\
            'DP; Mids: 2.5|7.5|12.5|17.5|22.5|27.5|32.5|37.5|42.5|47.5|52.5|57.5|'\
            '62.5|67.5|72.5|77.5|82.5|87.5|92.5|97.5">',
            '##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for'\
            ' the ref and alt alleles in the order listed">',
            '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">',
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
            '##FORMAT=<ID=GQ,Number=1,Type=String,Description="GenotypeQuality">',
            '##reference=file:///human_g1k_v37.fasta',
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tfather\tmother\tproband'
        ]

    for line in header_lines:
        # '##' marks a meta-data line, a single '#' the column header line.
        # Lines without a leading '#' are ignored.
        if line.startswith('##'):
            header_parser.parse_meta_data(line)
        elif line.startswith('#'):
            header_parser.parse_header_line(line)

    return header_parser
def test_malformed_lines():
    """
    Check that malformed meta data lines make the parser raise SyntaxError
    """
    parser = HeaderParser()

    bad_lines = (
        # fileformat key without any value
        '##fileformat',
        # INFO line that lacks the Type field
        '##INFO=<ID=MQ,Number=1,Description="RMS Mapping Quality">',
        # contig line that lacks the length field
        '##contig=<ID=1,assembly=b37>',
    )

    # The same parser instance is reused for all lines, in this order
    for bad_line in bad_lines:
        with pytest.raises(SyntaxError):
            parser.parse_meta_data(bad_line)
def test_parse_vcf_lines():
    """
    Test how the header parser behaves with simple vcf lines

    A small VCFv4.2 header is parsed line by line and the attributes that
    the parser exposes afterwards are checked.
    """
    header_parser = HeaderParser()

    # NOTE: keep a comma after every entry. Two adjacent string literals are
    # joined silently; a missing comma previously fused the GQ FORMAT line
    # with the reference line.
    header_lines = [
        '##fileformat=VCFv4.2',
        '##FILTER=<ID=LowQual,Description="Low quality">',
        '##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">',
        '##INFO=<ID=CNT,Number=A,Type=Integer,Description="Number of times '\
        'this allele was found in external db">',
        '##contig=<ID=1,length=249250621,assembly=b37>',
        '##INFO=<ID=DP_HIST,Number=R,Type=String,Description="Histogram for '\
        'DP; Mids: 2.5|7.5|12.5|17.5|22.5|27.5|32.5|37.5|42.5|47.5|52.5|57.5|'\
        '62.5|67.5|72.5|77.5|82.5|87.5|92.5|97.5">',
        '##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for'\
        ' the ref and alt alleles in the order listed">',
        '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
        '##FORMAT=<ID=GQ,Number=1,Type=String,Description="GenotypeQuality">',
        '##reference=file:///human_g1k_v37.fasta',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tfather\tmother\tproband'
    ]

    for line in header_lines:
        # '##' marks a meta-data line, a single '#' the column header line
        if line.startswith('##'):
            header_parser.parse_meta_data(line)
        elif line.startswith('#'):
            header_parser.parse_header_line(line)

    assert header_parser.fileformat == "VCFv4.2"
    assert header_parser.individuals == ['father', 'mother', 'proband']
    # No CSQ line in this header, so no vep columns are expected
    assert header_parser.vep_columns == []

    assert "MQ" in header_parser.extra_info
    assert header_parser.extra_info["MQ"][
        'Description'] == "RMS Mapping Quality"
    assert header_parser.extra_info["CNT"]['Number'] == "A"
    assert header_parser.extra_info["CNT"]['Type'] == "Integer"
    assert "CNT" in header_parser.extra_info
    assert "DP_HIST" in header_parser.extra_info

    assert "LowQual" in header_parser.filter_dict
    assert "1" in header_parser.contig_dict

    assert header_parser.header == [
        'CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT',
        'father', 'mother', 'proband'
    ]
def __init__(self, infile=None, fsock=None, split_variants=False,
             check_info=False, allele_symbol='0', fileformat=None):
    """Set up the parser and, if input is given, read the header.

    Args:
        infile: Path to a ``.vcf`` or ``.gz`` file.
        fsock: Open stream to read from; used in preference to ``infile``.
        split_variants: Stored on the instance; tells the parser to split
            lines with several comma separated ALT alleles.
        check_info: Stored on the instance for later variant formatting.
        allele_symbol: Stored on the instance for later variant splitting.
        fileformat: File format string; required when neither ``infile``
            nor ``fsock`` is given.

    Raises:
        IOError: If ``infile`` has an unsupported extension, if the first
            line of the input does not start with '#', or if there is
            neither input nor ``fileformat``.
    """
    super(VCFParser, self).__init__()
    self.logger = logging.getLogger(__name__)

    self.vcf = None
    self.logger.debug("Set self.vcf to:{0}".format(self.vcf))
    self.beginning = True
    self.infile = infile
    self.fsock = fsock
    self.split_variants = split_variants
    self.logger.info("Split variants = {0}".format(self.split_variants))
    self.fileformat = fileformat
    self.check_info = check_info
    self.logger.info("check info = {0}".format(self.check_info))
    self.allele_symbol = allele_symbol
    self.logger.info("Allele symbol = {0}".format(self.allele_symbol))

    self.logger.info("Initializing HeaderParser")
    self.metadata = HeaderParser()
    # These are the individuals described in the header
    self.individuals = []
    # This is the header line of the vcf
    self.header = []
    # Always defined, so the attribute also exists on parsers that are
    # created without any input; overwritten once a header has been read.
    self.vep_header = []

    # If there are no file or stream the user can add variants manually.
    # These will be added to self.variants
    self.variants = []

    if fsock or infile:
        if fsock:
            # Read from the stream that was handed in. Previously
            # sys.stdin was used unconditionally, so any other stream
            # passed as fsock was ignored.
            self.vcf = fsock
            if not infile and hasattr(fsock, 'name'):
                self.logger.info("Reading vcf form stdin")
                if sys.version_info < (3, 0):
                    # On Python 2 the stream is wrapped in a utf-8
                    # reader, which also replaces sys.stdin.
                    self.logger.info("Using codecs to read stdin")
                    sys.stdin = getreader('utf-8')(fsock)
                    self.vcf = sys.stdin
        else:
            self.logger.info("Reading vcf form file {0}".format(infile))
            file_name, file_extension = os.path.splitext(infile)
            if file_extension == '.gz':
                self.logger.debug("Vcf is zipped")
                self.vcf = getreader('utf-8')(gzip.open(infile),
                                              errors='replace')
            elif file_extension == '.vcf':
                self.vcf = open(infile, mode='r', encoding='utf-8',
                                errors='replace')
            else:
                raise IOError("File is not in a supported format!\n"
                              " Or use correct ending(.vcf or .vcf.gz)")

        self.logger.debug("Reading first line.")
        self.next_line = self.vcf.readline().rstrip()
        self.current_line = self.next_line

        # First line is allways a metadata line
        if not self.next_line.startswith('#'):
            raise IOError(
                "VCF files allways have to start with a metadata line.")
        # NOTE(review): the loop below parses this first line a second
        # time; kept as is so the set of raised errors does not change.
        self.metadata.parse_meta_data(self.next_line)

        # Parse the metadata lines. When the loop ends, self.next_line
        # holds the first non-header line (empty at end of input).
        while self.next_line.startswith('#'):
            if self.next_line.startswith('##'):
                self.metadata.parse_meta_data(self.next_line)
            elif self.next_line.startswith('#'):
                self.metadata.parse_header_line(self.next_line)
            self.next_line = self.vcf.readline().rstrip()

        self.individuals = self.metadata.individuals
        self.logger.info("Setting self.individuals to {0}".format(
            self.individuals))
        self.header = self.metadata.header
        self.vep_header = self.metadata.vep_columns
    else:
        if not self.fileformat:
            raise IOError("Please initialize with a fileformat.")
        self.metadata.fileformat = self.fileformat
class VCFParser(object):
    """Read variants from a VCF file or stream, or collect them in memory.

    With ``infile`` or ``fsock`` the header lines are consumed on
    construction and iterating the parser yields the variants of the
    remaining lines. Without any input the parser starts empty; variants
    are added with ``add_variant`` and iteration yields those instead.
    """

    def __init__(self, infile=None, fsock=None, split_variants=False,
                 check_info=False, allele_symbol='0', fileformat=None):
        """Set up the parser and, if input is given, read the header.

        Args:
            infile: Path to a ``.vcf`` or ``.gz`` file.
            fsock: Open stream to read from; used in preference to
                ``infile``.
            split_variants: If true, lines with several comma separated
                ALT alleles are passed through ``split_variants``.
            check_info: Forwarded to ``format_variant``.
            allele_symbol: Forwarded to ``split_variants``.
            fileformat: File format string; required when neither
                ``infile`` nor ``fsock`` is given.

        Raises:
            IOError: If ``infile`` has an unsupported extension, if the
                first line of the input does not start with '#', or if
                there is neither input nor ``fileformat``.
        """
        super(VCFParser, self).__init__()
        self.logger = logging.getLogger(__name__)

        self.vcf = None
        self.logger.debug("Set self.vcf to:{0}".format(self.vcf))
        self.beginning = True
        self.infile = infile
        self.fsock = fsock
        self.split_variants = split_variants
        self.logger.info("Split variants = {0}".format(self.split_variants))
        self.fileformat = fileformat
        self.check_info = check_info
        self.logger.info("check info = {0}".format(self.check_info))
        self.allele_symbol = allele_symbol
        self.logger.info("Allele symbol = {0}".format(self.allele_symbol))

        self.logger.info("Initializing HeaderParser")
        self.metadata = HeaderParser()
        # These are the individuals described in the header
        self.individuals = []
        # This is the header line of the vcf
        self.header = []
        # Always defined, so the attribute also exists on parsers that are
        # created without any input; overwritten once a header is read.
        self.vep_header = []

        # If there are no file or stream the user can add variants manually.
        # These will be added to self.variants
        self.variants = []

        if fsock or infile:
            if fsock:
                # Read from the stream that was handed in. Previously
                # sys.stdin was used unconditionally, so any other stream
                # passed as fsock was ignored.
                self.vcf = fsock
                if not infile and hasattr(fsock, 'name'):
                    self.logger.info("Reading vcf form stdin")
                    if sys.version_info < (3, 0):
                        # On Python 2 the stream is wrapped in a utf-8
                        # reader, which also replaces sys.stdin.
                        self.logger.info("Using codecs to read stdin")
                        sys.stdin = getreader('utf-8')(fsock)
                        self.vcf = sys.stdin
            else:
                self.logger.info("Reading vcf form file {0}".format(infile))
                file_name, file_extension = os.path.splitext(infile)
                if file_extension == '.gz':
                    self.logger.debug("Vcf is zipped")
                    self.vcf = getreader('utf-8')(gzip.open(infile),
                                                  errors='replace')
                elif file_extension == '.vcf':
                    self.vcf = open(infile, mode='r', encoding='utf-8',
                                    errors='replace')
                else:
                    raise IOError("File is not in a supported format!\n"
                                  " Or use correct ending(.vcf or .vcf.gz)")

            self.logger.debug("Reading first line.")
            self.next_line = self.vcf.readline().rstrip()
            self.current_line = self.next_line

            # First line is allways a metadata line
            if not self.next_line.startswith('#'):
                raise IOError(
                    "VCF files allways have to start with a metadata line.")
            # NOTE(review): the loop below parses this first line a second
            # time; kept as is so the set of raised errors does not change.
            self.metadata.parse_meta_data(self.next_line)

            # Parse the metadata lines. When the loop ends, self.next_line
            # holds the first non-header line (empty at end of input).
            while self.next_line.startswith('#'):
                if self.next_line.startswith('##'):
                    self.metadata.parse_meta_data(self.next_line)
                elif self.next_line.startswith('#'):
                    self.metadata.parse_header_line(self.next_line)
                self.next_line = self.vcf.readline().rstrip()

            self.individuals = self.metadata.individuals
            self.logger.info("Setting self.individuals to {0}".format(
                self.individuals))
            self.header = self.metadata.header
            self.vep_header = self.metadata.vep_columns
        else:
            if not self.fileformat:
                raise IOError("Please initialize with a fileformat.")
            self.metadata.fileformat = self.fileformat

    def _variants_from_line(self, line):
        """Format one raw variant line and return the variant(s) as a list.

        The list has more than one entry only when ``self.split_variants``
        is set and the ALT field holds several comma separated alleles.
        """
        variant = format_variant(
            line=line,
            header_parser=self.metadata,
            check_info=self.check_info
        )
        if not (self.split_variants and len(variant['ALT'].split(',')) > 1):
            return [variant]
        # If multiple alternative and split_variants we must split the variant
        return list(split_variants(
            variant_dict=variant,
            header_parser=self.metadata,
            allele_symbol=self.allele_symbol
        ))

    def add_variant(self, chrom, pos, rs_id, ref, alt, qual, filt, info,
                    form=None, genotypes=None):
        """
        Add a variant to the parser.

        This function is for building a vcf. It takes the relevant
        parameters and make a vcf variant in the proper format.

        All fields are joined with tabs, so they have to be strings.

        Args:
            chrom, pos, rs_id, ref, alt, qual, filt, info: The eight
                leading columns of the variant line.
            form: Optional FORMAT column; left out when empty.
            genotypes: Optional iterable with one genotype string per
                individual, appended after the other columns.
        """
        variant_info = [chrom, pos, rs_id, ref, alt, qual, filt, info]
        if form:
            variant_info.append(form)
        # None replaces the former mutable default argument ([])
        variant_info.extend(genotypes or [])

        variant_line = '\t'.join(variant_info)
        self.variants.extend(self._variants_from_line(variant_line))

    def __iter__(self):
        """Yield the variants of the input, or the manually added ones.

        Raises:
            SyntaxError: If no fileformat is known to the header parser.
        """
        if not self.metadata.fileformat:
            raise SyntaxError("Vcf must have fileformat defined")

        if self.vcf:
            # The first variant line was already read while parsing the
            # header, so it has to be handled before the rest of the stream.
            if self.beginning:
                # Empty when the input holds nothing but header lines
                if self.next_line:
                    for variant in self._variants_from_line(self.next_line):
                        yield variant
                self.beginning = False

            for line in self.vcf:
                line = line.rstrip()
                # Skip comment lines and lines with fewer than the eight
                # mandatory columns.
                if not line.startswith('#') and len(line.split('\t')) >= 8:
                    # If there are multiple alternatives and
                    # self.split_variants there can be more than one
                    # variant in one line
                    for variant in self._variants_from_line(line):
                        yield variant
        else:
            for variant in self.variants:
                yield variant

    def __repr__(self):
        return "Parser(infile={0},fsock={1},split_variants={2})".format(
            self.infile, self.fsock, self.split_variants)
def __init__(self, infile=None, fsock=None, split_variants=False,
             check_info=False, allele_symbol='0', fileformat=None):
    """Initialise the parser; with input given, consume the header lines.

    Args:
        infile: Path to a ``.vcf`` or ``.gz`` file.
        fsock: Open stream to read from; used in preference to ``infile``.
        split_variants: Stored on the instance; tells the parser to split
            lines with several comma separated ALT alleles.
        check_info: Stored on the instance for later variant formatting.
        allele_symbol: Stored on the instance for later variant splitting.
        fileformat: File format string; required when neither ``infile``
            nor ``fsock`` is given.

    Raises:
        IOError: If ``infile`` has an unsupported extension, if the first
            line of the input does not start with '#', or if there is
            neither input nor ``fileformat``.
    """
    super(VCFParser, self).__init__()
    self.logger = logging.getLogger(__name__)

    self.vcf = None
    self.logger.debug("Set self.vcf to:{0}".format(self.vcf))
    self.beginning = True
    self.infile = infile
    self.fsock = fsock
    self.split_variants = split_variants
    self.logger.info("Split variants = {0}".format(self.split_variants))
    self.fileformat = fileformat
    self.check_info = check_info
    self.logger.info("check info = {0}".format(self.check_info))
    self.allele_symbol = allele_symbol
    self.logger.info("Allele symbol = {0}".format(self.allele_symbol))

    self.logger.info("Initializing HeaderParser")
    self.metadata = HeaderParser()
    # These are the individuals described in the header
    self.individuals = []
    # This is the header line of the vcf
    self.header = []
    # Always defined, so the attribute also exists on parsers that are
    # created without any input; overwritten once a header has been read.
    self.vep_header = []

    # If there are no file or stream the user can add variants manually.
    # These will be added to self.variants
    self.variants = []

    if fsock or infile:
        if fsock:
            # Use the stream the caller supplied. The former code always
            # took sys.stdin, ignoring any other stream given as fsock.
            self.vcf = fsock
            if not infile and hasattr(fsock, 'name'):
                self.logger.info("Reading vcf form stdin")
                if sys.version_info < (3, 0):
                    # On Python 2 the stream is wrapped in a utf-8
                    # reader, which also replaces sys.stdin.
                    self.logger.info("Using codecs to read stdin")
                    sys.stdin = getreader('utf-8')(fsock)
                    self.vcf = sys.stdin
        else:
            self.logger.info("Reading vcf form file {0}".format(infile))
            file_name, file_extension = os.path.splitext(infile)
            if file_extension == '.gz':
                self.logger.debug("Vcf is zipped")
                self.vcf = getreader('utf-8')(gzip.open(infile),
                                              errors='replace')
            elif file_extension == '.vcf':
                self.vcf = open(infile, mode='r', encoding='utf-8',
                                errors='replace')
            else:
                raise IOError("File is not in a supported format!\n"
                              " Or use correct ending(.vcf or .vcf.gz)")

        self.logger.debug("Reading first line.")
        self.next_line = self.vcf.readline().rstrip()
        self.current_line = self.next_line

        # First line is allways a metadata line
        if not self.next_line.startswith('#'):
            raise IOError("VCF files allways have to start with a metadata line.")
        # NOTE(review): the loop below parses this first line a second
        # time; kept as is so the set of raised errors does not change.
        self.metadata.parse_meta_data(self.next_line)

        # Parse the metadata lines. When the loop ends, self.next_line
        # holds the first non-header line (empty at end of input).
        while self.next_line.startswith('#'):
            if self.next_line.startswith('##'):
                self.metadata.parse_meta_data(self.next_line)
            elif self.next_line.startswith('#'):
                self.metadata.parse_header_line(self.next_line)
            self.next_line = self.vcf.readline().rstrip()

        self.individuals = self.metadata.individuals
        self.logger.info("Setting self.individuals to {0}".format(
            self.individuals
        ))
        self.header = self.metadata.header
        self.vep_header = self.metadata.vep_columns
    else:
        if not self.fileformat:
            raise IOError("Please initialize with a fileformat.")
        self.metadata.fileformat = self.fileformat
class VCFParser(object):
    """Read variants from a VCF file or stream, or collect them in memory.

    With ``infile`` or ``fsock`` the header lines are consumed on
    construction and iterating the parser yields the variants of the
    remaining lines. Without any input the parser starts empty; variants
    are added with ``add_variant`` and iteration yields those instead.
    """

    def __init__(self, infile=None, fsock=None, split_variants=False,
                 check_info=False, allele_symbol='0', fileformat=None):
        """Initialise the parser; with input given, consume the header.

        Args:
            infile: Path to a ``.vcf`` or ``.gz`` file.
            fsock: Open stream to read from; used in preference to
                ``infile``.
            split_variants: If true, lines with several comma separated
                ALT alleles are passed through ``split_variants``.
            check_info: Forwarded to ``format_variant``.
            allele_symbol: Forwarded to ``split_variants``.
            fileformat: File format string; required when neither
                ``infile`` nor ``fsock`` is given.

        Raises:
            IOError: If ``infile`` has an unsupported extension, if the
                first line of the input does not start with '#', or if
                there is neither input nor ``fileformat``.
        """
        super(VCFParser, self).__init__()
        self.logger = logging.getLogger(__name__)

        self.vcf = None
        self.logger.debug("Set self.vcf to:{0}".format(self.vcf))
        self.beginning = True
        self.infile = infile
        self.fsock = fsock
        self.split_variants = split_variants
        self.logger.info("Split variants = {0}".format(self.split_variants))
        self.fileformat = fileformat
        self.check_info = check_info
        self.logger.info("check info = {0}".format(self.check_info))
        self.allele_symbol = allele_symbol
        self.logger.info("Allele symbol = {0}".format(self.allele_symbol))

        self.logger.info("Initializing HeaderParser")
        self.metadata = HeaderParser()
        # These are the individuals described in the header
        self.individuals = []
        # This is the header line of the vcf
        self.header = []
        # Always defined, so the attribute also exists on parsers that are
        # created without any input; overwritten once a header is read.
        self.vep_header = []

        # If there are no file or stream the user can add variants manually.
        # These will be added to self.variants
        self.variants = []

        if fsock or infile:
            if fsock:
                # Use the stream the caller supplied. The former code
                # always took sys.stdin, ignoring any other stream given
                # as fsock.
                self.vcf = fsock
                if not infile and hasattr(fsock, 'name'):
                    self.logger.info("Reading vcf form stdin")
                    if sys.version_info < (3, 0):
                        # On Python 2 the stream is wrapped in a utf-8
                        # reader, which also replaces sys.stdin.
                        self.logger.info("Using codecs to read stdin")
                        sys.stdin = getreader('utf-8')(fsock)
                        self.vcf = sys.stdin
            else:
                self.logger.info("Reading vcf form file {0}".format(infile))
                file_name, file_extension = os.path.splitext(infile)
                if file_extension == '.gz':
                    self.logger.debug("Vcf is zipped")
                    self.vcf = getreader('utf-8')(gzip.open(infile),
                                                  errors='replace')
                elif file_extension == '.vcf':
                    self.vcf = open(infile, mode='r', encoding='utf-8',
                                    errors='replace')
                else:
                    raise IOError("File is not in a supported format!\n"
                                  " Or use correct ending(.vcf or .vcf.gz)")

            self.logger.debug("Reading first line.")
            self.next_line = self.vcf.readline().rstrip()
            self.current_line = self.next_line

            # First line is allways a metadata line
            if not self.next_line.startswith('#'):
                raise IOError("VCF files allways have to start with a metadata line.")
            # NOTE(review): the loop below parses this first line a second
            # time; kept as is so the set of raised errors does not change.
            self.metadata.parse_meta_data(self.next_line)

            # Parse the metadata lines. When the loop ends, self.next_line
            # holds the first non-header line (empty at end of input).
            while self.next_line.startswith('#'):
                if self.next_line.startswith('##'):
                    self.metadata.parse_meta_data(self.next_line)
                elif self.next_line.startswith('#'):
                    self.metadata.parse_header_line(self.next_line)
                self.next_line = self.vcf.readline().rstrip()

            self.individuals = self.metadata.individuals
            self.logger.info("Setting self.individuals to {0}".format(
                self.individuals
            ))
            self.header = self.metadata.header
            self.vep_header = self.metadata.vep_columns
        else:
            if not self.fileformat:
                raise IOError("Please initialize with a fileformat.")
            self.metadata.fileformat = self.fileformat

    def _variants_from_line(self, line):
        """Format one raw variant line and return the variant(s) as a list.

        The list has more than one entry only when ``self.split_variants``
        is set and the ALT field holds several comma separated alleles.
        """
        variant = format_variant(
            line=line,
            header_parser=self.metadata,
            check_info=self.check_info
        )
        if not (self.split_variants and len(variant['ALT'].split(',')) > 1):
            return [variant]
        # If multiple alternative and split_variants we must split the variant
        return list(split_variants(
            variant_dict=variant,
            header_parser=self.metadata,
            allele_symbol=self.allele_symbol
        ))

    def add_variant(self, chrom, pos, rs_id, ref, alt, qual, filt, info,
                    form=None, genotypes=None):
        """
        Add a variant to the parser.

        This function is for building a vcf. It takes the relevant
        parameters and make a vcf variant in the proper format.

        All fields are joined with tabs, so they have to be strings.

        Args:
            chrom, pos, rs_id, ref, alt, qual, filt, info: The eight
                leading columns of the variant line.
            form: Optional FORMAT column; left out when empty.
            genotypes: Optional iterable with one genotype string per
                individual, appended after the other columns.
        """
        variant_info = [chrom, pos, rs_id, ref, alt, qual, filt, info]
        if form:
            variant_info.append(form)
        # None replaces the former mutable default argument ([])
        variant_info.extend(genotypes or [])

        variant_line = '\t'.join(variant_info)
        self.variants.extend(self._variants_from_line(variant_line))

    def __iter__(self):
        """Yield the variants of the input, or the manually added ones.

        Raises:
            SyntaxError: If no fileformat is known to the header parser.
        """
        if not self.metadata.fileformat:
            raise SyntaxError("Vcf must have fileformat defined")

        if self.vcf:
            # The first variant line was already read while parsing the
            # header, so it has to be handled before the rest of the stream.
            if self.beginning:
                # self.next_line is empty when the input holds nothing but
                # header lines; formatting it unguarded was a bug.
                if self.next_line:
                    for variant in self._variants_from_line(self.next_line):
                        yield variant
                self.beginning = False

            for line in self.vcf:
                line = line.rstrip()
                # Skip comment lines and lines with fewer than the eight
                # mandatory columns.
                if not line.startswith('#') and len(line.split('\t')) >= 8:
                    # If there are multiple alternatives and
                    # self.split_variants there can be more than one
                    # variant in one line
                    for variant in self._variants_from_line(line):
                        yield variant
        else:
            for variant in self.variants:
                yield variant

    def __repr__(self):
        return "Parser(infile={0},fsock={1},split_variants={2})".format(
            self.infile,
            self.fsock,
            self.split_variants
        )