def get_header(vcf_file_path):
    """Parse the header of a vcf file and return a header object

    Only the leading lines that start with '#' are read; parsing stops at
    the first line that does not start with '#'.

    Args:
        vcf_file_path(str): Path to vcf

    Returns:
        head: A HeaderParser object
    """
    logger.info("Parsing header of file {0}".format(vcf_file_path))
    head = HeaderParser()
    handle = get_vcf_handle(infile=vcf_file_path)
    # Make sure the handle is closed even if a header line fails to parse
    try:
        for line in handle:
            line = line.rstrip()
            if not line.startswith('#'):
                # First non-header line: the header section is over
                break
            if line.startswith('##'):
                head.parse_meta_data(line)
            else:
                head.parse_header_line(line)
    finally:
        handle.close()
    return head
def export(ctx, outfile):
    """Export the variants of a loqus db

    The variants are exported to a vcf file. A header with the number of
    cases, the Obs/Hom/Hem INFO definitions and one contig line per
    chromosome is printed first, followed by one line per variant,
    chromosome by chromosome.

    Args:
        ctx: Context object; the database adapter is read from
            ``ctx.obj['adapter']``
        outfile: Passed on unchanged to print_headers and print_variant
    """
    adapter = ctx.obj['adapter']
    logger.info("Export the variants from {0}".format(adapter))

    # Known chromosomes first, in the order given by CHROMOSOME_ORDER
    remaining_chromosomes = set(adapter.get_chromosomes())
    ordered_chromosomes = []
    for chrom in CHROMOSOME_ORDER:
        if chrom in remaining_chromosomes:
            ordered_chromosomes.append(chrom)
            remaining_chromosomes.remove(chrom)
    # Whatever is left is not in CHROMOSOME_ORDER. Sort it so the output does
    # not depend on set iteration order (key=str keeps mixed types sortable).
    ordered_chromosomes.extend(sorted(remaining_chromosomes, key=str))

    nr_cases = adapter.cases().count()
    logger.info("Found {0} cases in database".format(nr_cases))

    head = HeaderParser()
    head.add_fileformat("VCFv4.3")
    head.add_meta_line("NrCases", nr_cases)
    head.add_info("Obs", '1', 'Integer', "The number of observations for the variant")
    head.add_info("Hom", '1', 'Integer', "The number of observed homozygotes")
    head.add_info("Hem", '1', 'Integer', "The number of observed hemizygotes")
    head.add_version_tracking("loqusdb", __version__, datetime.now().strftime("%Y-%m-%d %H:%M"))

    for chrom in ordered_chromosomes:
        length = adapter.get_max_position(chrom)
        head.add_contig(contig_id=chrom, length=str(length))

    print_headers(head, outfile=outfile)

    for chrom in ordered_chromosomes:
        for variant in adapter.get_variants(chromosome=chrom):
            # Use a separate name so the outer loop variable is not rebound
            variant_chrom = variant['chrom']
            pos = variant['start']
            ref = variant['ref']
            alt = variant['alt']
            observations = variant['observations']
            homozygotes = variant['homozygote']
            hemizygotes = variant['hemizygote']

            # Hom and Hem are only written when they are truthy (non-zero)
            info = "Obs={0}".format(observations)
            if homozygotes:
                info += ";Hom={0}".format(homozygotes)
            if hemizygotes:
                info += ";Hem={0}".format(hemizygotes)

            variant_line = "{0}\t{1}\t.\t{2}\t{3}\t.\t.\t{4}\n".format(
                variant_chrom, pos, ref, alt, info)
            print_variant(variant_line=variant_line, outfile=outfile)
def get_header(vcf_lines):
    """Build a header object from an iterable of vcf lines

    Lines starting with '##' are handed to the meta data parser and lines
    with a single leading '#' to the header line parser. All other lines
    are skipped.

    Args:
        vcf_lines(iterable(str)): Lines of a vcf

    Returns:
        head: A HeaderParser object
    """
    head = HeaderParser()
    for entry in vcf_lines:
        if not entry.startswith('#'):
            continue
        if entry.startswith('##'):
            head.parse_meta_data(entry)
        else:
            head.parse_header_line(entry)
    return head
def export(ctx, outfile):
    """Export the variants of a loqus db

    The variants are exported to a vcf file. All variant lines are first
    written to a temporary file, which is then passed through sort_variants
    before the lines are printed.

    Args:
        ctx: Context object; the database adapter is read from
            ``ctx.obj['adapter']``
        outfile: Passed on unchanged to print_headers and print_variant
    """
    adapter = ctx.obj['adapter']
    logger.info("Export the variants from {0}".format(adapter))

    # Count the cases by walking the iterable once
    nr_cases = sum(1 for _ in adapter.cases())
    logger.info("Found {0} cases in database".format(nr_cases))

    head = HeaderParser()
    # Pass the bare version string: a parsed header exposes fileformat as e.g.
    # "VCFv4.2", so the "##fileformat=" prefix does not belong in the value.
    head.add_fileformat("VCFv4.1")
    head.add_meta_line("NrCases", nr_cases)
    head.add_info("Obs", '1', 'Integer', "The number of observations for the variant")
    head.add_info("Hom", '1', 'Integer', "The number of observed homozygotes")
    head.add_version_tracking("loqusdb", __version__, datetime.now().strftime("%Y-%m-%d %H:%M"))

    logger.debug("Create tempfile to print variants from database")
    # NOTE(review): TemporaryFile() opens in binary mode by default; writing
    # text lines to it relies on Python 2 str semantics - confirm before
    # running this on Python 3.
    # The context manager closes the file even if header printing fails.
    with tempfile.TemporaryFile() as variants:
        logger.debug("Printing headers")
        print_headers(head, outfile=outfile)

        for variant in adapter.get_variants():
            # The variant id is expected to look like chrom_pos_ref_alt
            variant_id = variant['_id'].split('_')
            chrom = variant_id[0]
            pos = variant_id[1]
            ref = variant_id[2]
            alt = variant_id[3]

            observations = variant['observations']
            homozygotes = variant['homozygote']
            info = "Obs={0};Hom={1}".format(observations, homozygotes)

            variant_line = "{0}\t{1}\t.\t{2}\t{3}\t.\t.\t{4}\n".format(
                chrom, pos, ref, alt, info)
            variants.write(variant_line)

        # Rewind so that sort_variants reads the file from the beginning
        variants.seek(0)
        for line in sort_variants(variants):
            print_variant(variant_line=line, outfile=outfile)
def test_parse_vcf_lines():
    """
    Test how the header parser behaves with simple vcf lines

    Meta data lines ('##') and the column header line ('#') are fed to a
    fresh HeaderParser, after which the parsed attributes are checked.
    """
    header_parser = HeaderParser()

    header_lines = [
        '##fileformat=VCFv4.2',
        '##FILTER=<ID=LowQual,Description="Low quality">',
        '##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">',
        '##INFO=<ID=CNT,Number=A,Type=Integer,Description="Number of times '
        'this allele was found in external db">',
        '##contig=<ID=1,length=249250621,assembly=b37>',
        '##INFO=<ID=DP_HIST,Number=R,Type=String,Description="Histogram for '
        'DP; Mids: 2.5|7.5|12.5|17.5|22.5|27.5|32.5|37.5|42.5|47.5|52.5|57.5|'
        '62.5|67.5|72.5|77.5|82.5|87.5|92.5|97.5">',
        '##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for'
        ' the ref and alt alleles in the order listed">',
        '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
        # The trailing comma matters: without it this line is silently
        # concatenated with the ##reference line below.
        '##FORMAT=<ID=GQ,Number=1,Type=String,Description="GenotypeQuality">',
        '##reference=file:///human_g1k_v37.fasta',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tfather\tmother\tproband',
    ]

    for line in header_lines:
        if line.startswith('##'):
            header_parser.parse_meta_data(line)
        elif line.startswith('#'):
            header_parser.parse_header_line(line)

    assert header_parser.fileformat == "VCFv4.2"
    assert header_parser.individuals == ['father', 'mother', 'proband']
    assert header_parser.vep_columns == []

    assert "MQ" in header_parser.extra_info
    assert header_parser.extra_info["MQ"]['Description'] == "RMS Mapping Quality"
    assert header_parser.extra_info["CNT"]['Number'] == "A"
    assert header_parser.extra_info["CNT"]['Type'] == "Integer"
    assert "CNT" in header_parser.extra_info
    assert "DP_HIST" in header_parser.extra_info

    assert "LowQual" in header_parser.filter_dict
    assert "1" in header_parser.contig_dict

    assert header_parser.header == [
        'CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO',
        'FORMAT', 'father', 'mother', 'proband'
    ]
def test_vep_columns():
    """
    Check that the column names after "Format: " in a CSQ INFO line
    end up in vep_columns
    """
    parser = HeaderParser()
    csq_line = (
        '##INFO=<ID=CSQ,Number=.,Type=String,Description="Consequence'
        ' type as predicted by VEP. Format: Allele|Gene|Feature|Feature_type|Consequence">'
    )
    expected_columns = [
        'Allele',
        'Gene',
        'Feature',
        'Feature_type',
        'Consequence',
    ]

    parser.parse_meta_data(csq_line)

    assert parser.vep_columns == expected_columns
def test_malformed_lines():
    """
    Check that malformed meta data lines make the header parser raise
    SyntaxError
    """
    parser = HeaderParser()

    bad_lines = (
        # fileformat without a value
        '##fileformat',
        # INFO line without a Type field
        '##INFO=<ID=MQ,Number=1,Description="RMS Mapping Quality">',
        # contig line without an ID field
        '##contig=<assembly=b37>',
    )

    for bad_line in bad_lines:
        with pytest.raises(SyntaxError):
            parser.parse_meta_data(bad_line)
def export(ctx, outfile, variant_type):
    """Export the variants of a loqus db

    The variants are exported to a vcf file. A header with the number of
    cases, the INFO definitions and one contig line per chromosome is
    printed first, followed by one line per variant, chromosome by
    chromosome.

    Args:
        ctx: Context object; the database adapter and the version string
            are read from ``ctx.obj['adapter']`` and ``ctx.obj['version']``
        outfile: Passed on unchanged to print_headers and print_variant
        variant_type(str): 'sv' adds the END/SVTYPE/SVLEN header lines and
            asks the adapter for SV chromosomes; 'snv' fetches variants with
            get_variants, any other value fetches them with get_sv_variants
    """
    adapter = ctx.obj['adapter']
    version = ctx.obj['version']
    LOG.info("Export the variants from {0}".format(adapter))

    is_sv = variant_type == 'sv'

    # Known chromosomes first, in the order given by CHROMOSOME_ORDER
    remaining_chromosomes = set(adapter.get_chromosomes(sv=is_sv))
    ordered_chromosomes = []
    for chrom in CHROMOSOME_ORDER:
        if chrom in remaining_chromosomes:
            ordered_chromosomes.append(chrom)
            remaining_chromosomes.remove(chrom)
    # Whatever is left is not in CHROMOSOME_ORDER. Sort it so the output does
    # not depend on set iteration order (key=str keeps mixed types sortable).
    ordered_chromosomes.extend(sorted(remaining_chromosomes, key=str))

    nr_cases = adapter.cases().count()
    LOG.info("Found {0} cases in database".format(nr_cases))

    head = HeaderParser()
    head.add_fileformat("VCFv4.3")
    head.add_meta_line("NrCases", nr_cases)
    head.add_info("Obs", '1', 'Integer', "The number of observations for the variant")
    head.add_info("Hom", '1', 'Integer', "The number of observed homozygotes")
    head.add_info("Hem", '1', 'Integer', "The number of observed hemizygotes")
    head.add_version_tracking("loqusdb", version, datetime.now().strftime("%Y-%m-%d %H:%M"))

    if is_sv:
        head.add_info("END", '1', 'Integer', "End position of the variant")
        head.add_info("SVTYPE", '1', 'String', "Type of structural variant")
        head.add_info("SVLEN", '1', 'Integer', "Length of structural variant")

    for chrom in ordered_chromosomes:
        length = adapter.get_max_position(chrom)
        head.add_contig(contig_id=chrom, length=str(length))

    print_headers(head, outfile=outfile)

    for chrom in ordered_chromosomes:
        if variant_type == 'snv':
            LOG.info("Collecting all SNV variants")
            variants = adapter.get_variants(chromosome=chrom)
        else:
            LOG.info("Collecting all SV variants")
            variants = adapter.get_sv_variants(chromosome=chrom)
        LOG.info("{} variants found".format(variants.count()))

        for variant in variants:
            # Line formatting is delegated to format_variant
            variant_line = format_variant(variant, variant_type=variant_type)
            print_variant(variant_line=variant_line, outfile=outfile)