コード例 #1
0
def get_header(vcf_file_path):
    """Parse the header of a vcf file and return a header object

        Lines are read from the handle returned by ``get_vcf_handle`` until
        the first line that does not start with '#'. Lines starting with
        '##' are handed to ``HeaderParser.parse_meta_data``, the remaining
        '#' lines to ``HeaderParser.parse_header_line``.

        Args:
            vcf_file_path(str): Path to vcf

        Returns:
            head: A HeaderParser object
    """
    logger.info("Parsing header of file {0}".format(vcf_file_path))
    head = HeaderParser()
    handle = get_vcf_handle(infile=vcf_file_path)
    try:
        for line in handle:
            line = line.rstrip()
            if not line.startswith('#'):
                # First non-header line reached, the header is complete
                break
            if line.startswith('##'):
                head.parse_meta_data(line)
            else:
                head.parse_header_line(line)
    finally:
        # Release the handle even when one of the parse calls raises
        handle.close()

    return head
コード例 #2
0
def export(ctx, outfile):
    """Export the variants of a loqus db

        The variants are exported to a vcf file. A header describing the
        number of cases, the INFO fields (Obs, Hom, Hem), the program version
        and one contig line per chromosome is printed first, followed by one
        line per variant, chromosome by chromosome.

        Args:
            ctx: Context object, ``ctx.obj['adapter']`` is the database adapter
            outfile: Destination, passed on to print_headers and print_variant
    """
    adapter = ctx.obj['adapter']

    logger.info("Export the variants from {0}".format(adapter))

    existing_chromosomes = set(adapter.get_chromosomes())

    # Chromosomes known to CHROMOSOME_ORDER come first, in that order
    ordered_chromosomes = []
    for chrom in CHROMOSOME_ORDER:
        if chrom in existing_chromosomes:
            ordered_chromosomes.append(chrom)
            existing_chromosomes.remove(chrom)
    # Whatever is left is appended in the iteration order of the set
    ordered_chromosomes.extend(existing_chromosomes)

    nr_cases = adapter.cases().count()
    logger.info("Found {0} cases in database".format(nr_cases))

    head = HeaderParser()
    head.add_fileformat("VCFv4.3")
    head.add_meta_line("NrCases", nr_cases)
    head.add_info("Obs", '1', 'Integer',
                  "The number of observations for the variant")
    head.add_info("Hom", '1', 'Integer', "The number of observed homozygotes")
    head.add_info("Hem", '1', 'Integer', "The number of observed hemizygotes")
    head.add_version_tracking("loqusdb", __version__,
                              datetime.now().strftime("%Y-%m-%d %H:%M"))
    for chrom in ordered_chromosomes:
        # The largest position returned for the chromosome is used as length
        length = adapter.get_max_position(chrom)
        head.add_contig(contig_id=chrom, length=str(length))

    print_headers(head, outfile=outfile)

    for chrom in ordered_chromosomes:
        for variant in adapter.get_variants(chromosome=chrom):
            # Separate name so the outer loop variable is not rebound
            variant_chrom = variant['chrom']
            pos = variant['start']
            ref = variant['ref']
            alt = variant['alt']
            observations = variant['observations']
            homozygotes = variant['homozygote']
            hemizygotes = variant['hemizygote']

            # Hom and Hem are only written when they are truthy (non zero)
            info = "Obs={0}".format(observations)
            if homozygotes:
                info += ";Hom={0}".format(homozygotes)
            if hemizygotes:
                info += ";Hem={0}".format(hemizygotes)

            variant_line = "{0}\t{1}\t.\t{2}\t{3}\t.\t.\t{4}\n".format(
                variant_chrom, pos, ref, alt, info)
            print_variant(variant_line=variant_line, outfile=outfile)
コード例 #3
0
def get_header(vcf_lines):
    """Build a header object from the header lines found in vcf_lines

        Every line is inspected. Lines starting with '##' go to
        ``parse_meta_data``, other lines starting with '#' go to
        ``parse_header_line`` and all remaining lines are skipped.

        Args:
            vcf_lines(iterable(str)): Lines of a vcf

        Returns:
            header_parser: A HeaderParser object
    """
    header_parser = HeaderParser()

    for vcf_line in vcf_lines:
        if not vcf_line.startswith('#'):
            # Not a header line
            continue
        if vcf_line.startswith('##'):
            header_parser.parse_meta_data(vcf_line)
        else:
            header_parser.parse_header_line(vcf_line)

    return header_parser
コード例 #4
0
def export(ctx, outfile):
    """Export the variants of a loqus db

        The variants are exported to a vcf file. All variant lines are first
        written to a temporary file, which is then handed to sort_variants,
        and the lines it yields are printed after the header.

        Args:
            ctx: Context object, ``ctx.obj['adapter']`` is the database adapter
            outfile: Destination, passed on to print_headers and print_variant
    """
    adapter = ctx.obj['adapter']

    logger.info("Export the variants from {0}".format(adapter))

    # Count the cases by walking through everything adapter.cases() yields
    nr_cases = sum(1 for _ in adapter.cases())
    logger.info("Found {0} cases in database".format(nr_cases))

    head = HeaderParser()
    # Pass the bare version only; the '##fileformat=' prefix is presumably
    # added by HeaderParser when the header is printed - confirm
    head.add_fileformat("VCFv4.1")
    head.add_meta_line("NrCases", nr_cases)
    head.add_info("Obs", '1', 'Integer',
                  "The number of observations for the variant")
    head.add_info("Hom", '1', 'Integer', "The number of observed homozygotes")
    head.add_version_tracking("loqusdb", __version__,
                              datetime.now().strftime("%Y-%m-%d %H:%M"))

    logger.debug("Create tempfile to print variants from database")
    # NOTE(review): TemporaryFile() opens in binary mode ('w+b') by default,
    # so writing str lines below only works on python 2 - confirm the
    # interpreter that is targeted before changing the mode
    variants = tempfile.TemporaryFile()

    # Everything after the tempfile is created sits inside the try, so the
    # file is closed even when printing the headers fails
    try:
        logger.debug("Printing headers")
        print_headers(head, outfile=outfile)

        for variant in adapter.get_variants():
            # The variant id is split on '_' into chrom, pos, ref and alt
            variant_id = variant['_id'].split('_')
            chrom = variant_id[0]
            pos = variant_id[1]
            ref = variant_id[2]
            alt = variant_id[3]

            observations = variant['observations']
            homozygotes = variant['homozygote']

            info = "Obs={0};Hom={1}".format(observations, homozygotes)

            variant_line = "{0}\t{1}\t.\t{2}\t{3}\t.\t.\t{4}\n".format(
                chrom, pos, ref, alt, info)

            variants.write(variant_line)

        # Rewind so that sort_variants reads the file from the beginning
        variants.seek(0)
        for line in sort_variants(variants):
            print_variant(variant_line=line, outfile=outfile)
    finally:
        variants.close()
コード例 #5
0
def test_parse_vcf_lines():
    """
    Test how the header parser behaves with simple vcf lines

    A small but complete header (fileformat, FILTER, INFO, contig, FORMAT,
    reference and the '#CHROM' line) is parsed line by line and the
    attributes of the parser are checked afterwards.
    """

    header_parser = HeaderParser()

    header_lines = [
        '##fileformat=VCFv4.2',
        '##FILTER=<ID=LowQual,Description="Low quality">',
        '##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">',
        '##INFO=<ID=CNT,Number=A,Type=Integer,Description="Number of times '\
        'this allele was found in external db">',
        '##contig=<ID=1,length=249250621,assembly=b37>',
        '##INFO=<ID=DP_HIST,Number=R,Type=String,Description="Histogram for '\
        'DP; Mids: 2.5|7.5|12.5|17.5|22.5|27.5|32.5|37.5|42.5|47.5|52.5|57.5|'\
        '62.5|67.5|72.5|77.5|82.5|87.5|92.5|97.5">',
        '##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for'\
        ' the ref and alt alleles in the order listed">',
        '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
        # The trailing comma matters: without it this line and the reference
        # line below are joined into one string by implicit concatenation
        '##FORMAT=<ID=GQ,Number=1,Type=String,Description="GenotypeQuality">',
        '##reference=file:///human_g1k_v37.fasta',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tfather\tmother\tproband'
    ]
    for line in header_lines:
        if line.startswith('##'):
            header_parser.parse_meta_data(line)
        elif line.startswith('#'):
            header_parser.parse_header_line(line)

    assert header_parser.fileformat == "VCFv4.2"
    assert header_parser.individuals == ['father', 'mother', 'proband']

    # No CSQ info line was given so there are no vep columns
    assert header_parser.vep_columns == []

    assert "MQ" in header_parser.extra_info
    assert header_parser.extra_info["MQ"][
        'Description'] == "RMS Mapping Quality"
    assert header_parser.extra_info["CNT"]['Number'] == "A"
    assert header_parser.extra_info["CNT"]['Type'] == "Integer"
    assert "CNT" in header_parser.extra_info
    assert "DP_HIST" in header_parser.extra_info

    assert "LowQual" in header_parser.filter_dict
    assert "1" in header_parser.contig_dict

    assert header_parser.header == [
        'CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT',
        'father', 'mother', 'proband'
    ]
コード例 #6
0
def test_vep_columns():
    """
    Test that the vep columns are picked up from a CSQ info line

    The column names are the '|' separated names that follow 'Format: ' in
    the description of the info line.
    """
    csq_info_line = (
        '##INFO=<ID=CSQ,Number=.,Type=String,Description="Consequence'
        ' type as predicted by VEP. Format: Allele|Gene|Feature|Feature_type|Consequence">'
    )
    expected_columns = [
        'Allele', 'Gene', 'Feature', 'Feature_type', 'Consequence'
    ]

    parser = HeaderParser()
    parser.parse_meta_data(csq_info_line)

    assert parser.vep_columns == expected_columns
コード例 #7
0
def test_malformed_lines():
    """
    Test that the header parser rejects malformed meta data lines

    Each of the lines below is expected to make parse_meta_data raise a
    SyntaxError.
    """

    header_parser = HeaderParser()

    malformed_lines = (
        # fileformat without a value
        '##fileformat',
        # info line without a Type entry
        '##INFO=<ID=MQ,Number=1,Description="RMS Mapping Quality">',
        # contig line without an ID entry
        '##contig=<assembly=b37>',
    )

    for malformed_line in malformed_lines:
        with pytest.raises(SyntaxError):
            header_parser.parse_meta_data(malformed_line)
コード例 #8
0
ファイル: export.py プロジェクト: ousamg/loqusdb
def export(ctx, outfile, variant_type):
    """Export the variants of a loqus db

        The variants are exported to a vcf file. The header is printed first
        (with the extra INFO fields END, SVTYPE and SVLEN when variant_type
        is 'sv'), followed by the variants, chromosome by chromosome.

        Args:
            ctx: Context object, ``ctx.obj`` holds 'adapter' and 'version'
            outfile: Destination, passed on to print_headers and print_variant
            variant_type(str): 'snv' fetches variants with get_variants, any
                               other value fetches them with get_sv_variants
    """
    adapter = ctx.obj['adapter']
    version = ctx.obj['version']

    LOG.info("Export the variants from {0}".format(adapter))

    is_sv = variant_type == 'sv'
    existing_chromosomes = set(adapter.get_chromosomes(sv=is_sv))

    # Chromosomes known to CHROMOSOME_ORDER come first, in that order
    ordered_chromosomes = []
    for chrom in CHROMOSOME_ORDER:
        if chrom in existing_chromosomes:
            ordered_chromosomes.append(chrom)
            existing_chromosomes.remove(chrom)
    # Whatever is left is appended in the iteration order of the set
    ordered_chromosomes.extend(existing_chromosomes)

    nr_cases = adapter.cases().count()
    LOG.info("Found {0} cases in database".format(nr_cases))

    head = HeaderParser()
    head.add_fileformat("VCFv4.3")
    head.add_meta_line("NrCases", nr_cases)
    head.add_info("Obs", '1', 'Integer', "The number of observations for the variant")
    head.add_info("Hom", '1', 'Integer', "The number of observed homozygotes")
    head.add_info("Hem", '1', 'Integer', "The number of observed hemizygotes")
    head.add_version_tracking("loqusdb", version, datetime.now().strftime("%Y-%m-%d %H:%M"))

    if is_sv:
        head.add_info("END", '1', 'Integer', "End position of the variant")
        head.add_info("SVTYPE", '1', 'String', "Type of structural variant")
        head.add_info("SVLEN", '1', 'Integer', "Length of structural variant")

    for chrom in ordered_chromosomes:
        # The largest position returned for the chromosome is used as length
        length = adapter.get_max_position(chrom)
        head.add_contig(contig_id=chrom, length=str(length))

    print_headers(head, outfile=outfile)

    for chrom in ordered_chromosomes:
        # Everything that is not 'snv' is fetched as structural variants
        if variant_type == 'snv':
            LOG.info("Collecting all SNV variants")
            variants = adapter.get_variants(chromosome=chrom)
        else:
            LOG.info("Collecting all SV variants")
            variants = adapter.get_sv_variants(chromosome=chrom)
        LOG.info("{} variants found".format(variants.count()))
        for variant in variants:
            variant_line = format_variant(variant, variant_type=variant_type)
            print_variant(variant_line=variant_line, outfile=outfile)