Example #1
0
 def __init__(self, filename):
     """Open ``filename`` with vcfpy and register the TFX_HGVSBC_* INFO lines.

     The reader is stored on ``self.vcf_reader``. Every INFO header line
     added here is declared with Number '.' and Type 'String'.
     """
     self.vcf_reader = vcfpy.Reader.from_path(filename)

     # (INFO ID, Description) pairs, registered in this order.
     hgvs_fields = (
         ('TFX_HGVSBC_G',
          'HGVS genomic reference, as produced by pyhgvs.'),
         ('TFX_HGVSBC_C',
          'HGVS coding reference, as produced by pyhgvs.'),
         ('TFX_HGVSBC_P1',
          'HGVS 1-letter protein reference, as produced by pyhgvs.'),
         ('TFX_HGVSBC_P3',
          'HGVS 3-letter protein reference, as produced by pyhgvs.'),
     )
     for field_id, description in hgvs_fields:
         info_fields = vcfpy.OrderedDict([
             ('ID', field_id),
             ('Number', '.'),
             ('Type', 'String'),
             ('Description', description),
         ])
         self.vcf_reader.header.add_info_line(info_fields)
Example #2
0
def test_header_pedigree_header_line():
    """Check equality, str/repr, value, serialization and hashing of PedigreeHeaderLine."""

    def make(pairs):
        # Build a PedigreeHeaderLine from a list of (key, value) pairs.
        return header.PedigreeHeaderLine.from_mapping(vcfpy.OrderedDict(pairs))

    line1 = make([("ID", "child"), ("Father", "father")])
    line2 = make([("ID", "child"), ("Father", "father")])
    line3 = make([("ID", "father")])
    assert line1 == line2
    assert line1 != line3

    # The mapping part of the textual form depends on the interpreter version.
    if sys.version_info < (3, 6):
        mapping_text = "OrderedDict([('ID', 'child'), ('Father', 'father')])"
    else:
        mapping_text = "{'ID': 'child', 'Father': 'father'}"
    expected_text = (
        "PedigreeHeaderLine('PEDIGREE', '<ID=child,Father=father>', "
        + mapping_text + ")")
    assert str(line1) == expected_text
    assert repr(line1) == expected_text

    assert line1.value == "<ID=child,Father=father>"
    assert line1.serialize() == "##PEDIGREE=<ID=child,Father=father>"
    # Header lines are expected to be unhashable.
    with pytest.raises(TypeError):
        hash(line1)
Example #3
0
def test_header_filter_header_line():
    """Check equality, str/repr, value, serialization and hashing of FilterHeaderLine."""

    def make(filter_id, description):
        # Build a FilterHeaderLine from its ID and Description.
        return header.FilterHeaderLine.from_mapping(
            vcfpy.OrderedDict([("ID", filter_id),
                               ("Description", description)]))

    line1 = make("PASS", "All filters passed")
    line2 = make("PASS", "All filters passed")
    line3 = make("q30", "Phred score <30")
    assert line1 == line2
    assert line1 != line3

    # The mapping part of the textual form depends on the interpreter version.
    if sys.version_info < (3, 6):
        mapping_text = (
            "OrderedDict([('ID', 'PASS'), ('Description', 'All filters passed')])"
        )
    else:
        mapping_text = "{'ID': 'PASS', 'Description': 'All filters passed'}"
    expected_text = (
        "FilterHeaderLine('FILTER', '<ID=PASS,Description=\"All filters passed\">', "
        + mapping_text + ")")
    assert str(line1) == expected_text
    assert repr(line1) == expected_text

    assert line1.value == '<ID=PASS,Description="All filters passed">'
    serialized = line1.serialize()
    assert serialized == '##FILTER=<ID=PASS,Description="All filters passed">'
    # Header lines are expected to be unhashable.
    with pytest.raises(TypeError):
        hash(line1)
Example #4
0
def test_header_sample_header_line():
    """Check equality, str/repr, value, serialization and hashing of SampleHeaderLine."""

    def make(sample_id):
        # Build a SampleHeaderLine that only carries an ID.
        return header.SampleHeaderLine.from_mapping(
            vcfpy.OrderedDict([("ID", sample_id)]))

    line1 = make("sample1")
    line2 = make("sample1")
    line3 = make("sample2")
    assert line1 == line2
    assert line1 != line3

    # The mapping part of the textual form depends on the interpreter version.
    if sys.version_info < (3, 6):
        mapping_text = "OrderedDict([('ID', 'sample1')])"
    else:
        mapping_text = "{'ID': 'sample1'}"
    expected_text = (
        "SampleHeaderLine('SAMPLE', '<ID=sample1>', " + mapping_text + ")")
    assert str(line1) == expected_text
    assert repr(line1) == expected_text

    assert line1.value == "<ID=sample1>"
    assert line1.serialize() == "##SAMPLE=<ID=sample1>"
    # Header lines are expected to be unhashable.
    with pytest.raises(TypeError):
        hash(line1)
Example #5
0
def test_header_info_header_line():
    """Check equality, str/repr, value, serialization and hashing of InfoHeaderLine."""

    def make(info_id, number, type_):
        # Build an InfoHeaderLine from ID, Number and Type.
        return header.InfoHeaderLine.from_mapping(
            vcfpy.OrderedDict([("ID", info_id), ("Number", number),
                               ("Type", type_)]))

    line1 = make("SVTYPE", 1, "String")
    line2 = make("SVTYPE", 1, "String")
    line3 = make("END", 1, "Integer")
    assert line1 == line2
    assert line1 != line3

    # The mapping part of the textual form depends on the interpreter version.
    if sys.version_info < (3, 6):
        mapping_text = (
            "OrderedDict([('ID', 'SVTYPE'), ('Number', 1), ('Type', 'String')])"
        )
    else:
        mapping_text = "{'ID': 'SVTYPE', 'Number': 1, 'Type': 'String'}"
    expected_text = (
        "InfoHeaderLine('INFO', '<ID=SVTYPE,Number=1,Type=String>', "
        + mapping_text + ")")
    assert str(line1) == expected_text
    assert repr(line1) == expected_text

    assert line1.value == "<ID=SVTYPE,Number=1,Type=String>"
    assert line1.serialize() == "##INFO=<ID=SVTYPE,Number=1,Type=String>"
    # Header lines are expected to be unhashable.
    with pytest.raises(TypeError):
        hash(line1)
Example #6
0
def test_header_format_header_line():
    """Check equality, str/repr, value, serialization and hashing of FormatHeaderLine."""

    def make(format_id, number, type_):
        # Build a FormatHeaderLine from ID, Number and Type.
        return header.FormatHeaderLine.from_mapping(
            vcfpy.OrderedDict([("ID", format_id), ("Number", number),
                               ("Type", type_)]))

    line1 = make("AD", "R", "Integer")
    line2 = make("AD", "R", "Integer")
    line3 = make("DP", 1, "Integer")
    assert line1 == line2
    assert line1 != line3

    # The mapping part of the textual form depends on the interpreter version.
    if sys.version_info < (3, 6):
        mapping_text = (
            "OrderedDict([('ID', 'AD'), ('Number', 'R'), ('Type', 'Integer')])"
        )
    else:
        mapping_text = "{'ID': 'AD', 'Number': 'R', 'Type': 'Integer'}"
    expected_text = (
        "FormatHeaderLine('FORMAT', '<ID=AD,Number=R,Type=Integer>', "
        + mapping_text + ")")
    assert str(line1) == expected_text
    assert repr(line1) == expected_text

    assert line1.value == "<ID=AD,Number=R,Type=Integer>"
    assert line1.serialize() == "##FORMAT=<ID=AD,Number=R,Type=Integer>"
    # Header lines are expected to be unhashable.
    with pytest.raises(TypeError):
        hash(line1)
Example #7
0
def test_header_alt_allele_header_line():
    """Check equality, str/repr, value, serialization and hashing of AltAlleleHeaderLine."""

    def make(alt_id, description):
        # Build an AltAlleleHeaderLine from its ID and Description.
        return header.AltAlleleHeaderLine.from_mapping(
            vcfpy.OrderedDict([("ID", alt_id), ("Description", description)]))

    line1 = make("DEL", "deletion")
    line2 = make("DEL", "deletion")
    line3 = make("DUP", "duplication")
    assert line1 == line2
    assert line1 != line3

    # The mapping part of the textual form depends on the interpreter version.
    if sys.version_info < (3, 6):
        mapping_text = "OrderedDict([('ID', 'DEL'), ('Description', 'deletion')])"
    else:
        mapping_text = "{'ID': 'DEL', 'Description': 'deletion'}"
    expected_text = (
        "AltAlleleHeaderLine('ALT', '<ID=DEL,Description=\"deletion\">', "
        + mapping_text + ")")
    assert str(line1) == expected_text
    assert repr(line1) == expected_text

    assert line1.value == '<ID=DEL,Description="deletion">'
    assert line1.serialize() == '##ALT=<ID=DEL,Description="deletion">'
    # Header lines are expected to be unhashable.
    with pytest.raises(TypeError):
        hash(line1)
Example #8
0
def test_header_contig_header_line():
    """Check equality, str/repr, value, serialization and hashing of ContigHeaderLine."""

    def make(contig_id, length):
        # Build a ContigHeaderLine from its ID and length.
        return header.ContigHeaderLine.from_mapping(
            vcfpy.OrderedDict([("ID", contig_id), ("length", length)]))

    line1 = make("1", 234)
    line2 = make("1", 234)
    line3 = make("2", 123)
    assert line1 == line2
    assert line1 != line3

    # The mapping part of the textual form depends on the interpreter version.
    if sys.version_info < (3, 6):
        mapping_text = "OrderedDict([('ID', '1'), ('length', 234)])"
    else:
        mapping_text = "{'ID': '1', 'length': 234}"
    expected_text = (
        "ContigHeaderLine('contig', '<ID=1,length=234>', " + mapping_text + ")")
    assert str(line1) == expected_text
    assert repr(line1) == expected_text

    assert line1.value == "<ID=1,length=234>"
    assert line1.serialize() == "##contig=<ID=1,length=234>"
    # Header lines are expected to be unhashable.
    with pytest.raises(TypeError):
        hash(line1)
Example #9
0
def run(vi, vo, mmc, mmf, mmfs, mff):
    """
    Filter the records of a VCF file on their INFO/DP4 values.

    A record is written when DP4[2] and DP4[3] are each at least ``mmc`` and
    at least ``mmfs`` times (DP4[2] + DP4[3]), and DP4[2] + DP4[3] is at
    least ``mmf`` times the sum of all DP4 entries. When DP4[2] + DP4[3]
    additionally reaches ``mff`` times the DP4 sum, DP4[0] and DP4[1] are
    set to 0 and INFO/AF is set to 1.0 before the record is written.

    :param vi: path of the input VCF file
    :param vo: path of the output VCF file; '/dev/stdout' is used when falsy
    :param mmc: minimum count required of DP4[2] and of DP4[3]
    :param mmf: minimum fraction of the DP4 sum required of DP4[2] + DP4[3]
    :param mmfs: minimum fraction of DP4[2] + DP4[3] required of each of them
    :param mff: currently ignored, see the note below
    :return: None; the process exits with a message when the input path is
        missing/invalid or when reading or writing fails
    """
    # NOTE(review): the caller-supplied ``mff`` is overwritten here, exactly
    # as the original code did; confirm whether the argument should be used.
    mff = 0.99
    if not vi:
        exit("VCF file path not given. Exiting . . . . . .")
    if not path.isfile(vi):
        exit(
            "Given file path doesn't exist or it is not a file. Exiting . . . . ."
        )
    try:
        # Context managers make sure both files are closed (and the output
        # flushed) even when a record fails half-way through.
        with vcfpy.Reader.from_path(vi) as reader:
            reader.header.add_filter_line(
                vcfpy.OrderedDict([('ID', 'min_alt_count'),
                                   ('Description',
                                    f'Minimum {mmc} from each atrand')]))
            reader.header.add_filter_line(
                vcfpy.OrderedDict([('ID', 'min_alt_frac'),
                                   ('Description',
                                    f'Minimum {mmf} of total DP4')]))
            reader.header.add_filter_line(
                vcfpy.OrderedDict([('ID', 'min_alt_fication_frac'),
                                   ('Description',
                                    f'Minimum {mff} of total DP4')]))

            out_path = vo if vo else '/dev/stdout'
            with vcfpy.Writer.from_path(out_path, reader.header) as writer:
                for record in reader:
                    # presumably DP4 = [ref fwd, ref rev, alt fwd, alt rev]
                    # (samtools convention) -- TODO confirm
                    dp4 = record.INFO["DP4"]
                    base_sum = sum(dp4)
                    alt_sum = dp4[2] + dp4[3]
                    keep = (dp4[2] >= mmc
                            and dp4[2] >= mmfs * alt_sum
                            and dp4[3] >= mmc
                            and dp4[3] >= mmfs * alt_sum
                            and alt_sum >= mmf * base_sum)
                    if not keep:
                        continue
                    if alt_sum >= mff * base_sum:
                        # dp4 is the list stored in record.INFO, so this
                        # edits the record in place.
                        dp4[0] = 0
                        dp4[1] = 0
                        record.INFO["AF"] = 1.0
                    writer.write_record(record)
    except Exception:
        # Deliberately not a bare ``except``: Ctrl-C and SystemExit must not
        # be reported as a malformed input file.
        exit(
            "Please check given file. Is it in proper format??. Exiting . . . . ."
        )
Example #10
0
def test_header_without_lines():
    """header_without_lines is expected to drop the two listed lines and keep the samples."""
    samples = header.SamplesInfos(["one", "two", "three"])
    hdr = header.Header(
        [header.HeaderLine("foo", "bar"),
         header.HeaderLine("foo2", "bar2")],
        samples,
    )
    for filter_id in ("PASS", "q30"):
        hdr.add_filter_line(vcfpy.OrderedDict([("ID", filter_id)]))
    assert len(hdr.lines) == 4

    to_remove = [("foo", "bar"), ("FILTER", "q30")]
    hdr2 = header.header_without_lines(hdr, to_remove)
    assert len(hdr2.lines) == 2
    assert hdr2.samples == hdr.samples
Example #11
0
def test_add_format_line(vcf_header):
    """add_line with a FormatHeaderLine should append the line and index it by ID."""
    # State before adding.
    assert len(vcf_header.lines) == 18

    # Build and add the new FORMAT line.
    expected_value = '<ID=GTa,Number=1,Type=String,Description="Genotype">'
    expected_mapping = [("ID", "GTa"), ("Number", 1), ("Type", "String"),
                        ("Description", "Genotype")]
    new_line = header.FormatHeaderLine("FORMAT", expected_value,
                                       vcfpy.OrderedDict(expected_mapping))
    vcf_header.add_line(new_line)

    # State after adding.
    assert len(vcf_header.lines) == 19
    format_index = vcf_header._indices["FORMAT"]
    assert "GTa" in format_index
    assert format_index["GTa"] is vcf_header.lines[-1]

    # Content of the appended line.
    added = vcf_header.lines[-1]
    assert added.key == "FORMAT"
    assert added.value == expected_value
    assert len(added.mapping) == 4
    for field, expected in expected_mapping:
        assert added.mapping[field] == expected
Example #12
0
def test_add_contig_line_shortcut(vcf_header):
    """add_contig_line should append a contig line and index it by ID."""
    # State before adding.
    assert len(vcf_header.lines) == 18
    assert "20a" not in vcf_header._indices["contig"]

    # Add the contig line through the shortcut.
    expected_mapping = [
        ("ID", "20a"),
        ("length", 62435964),
        ("assembly", "B36"),
        ("md5", "f126cdf8a6e0c7f379d618ff66beb2da"),
        ("species", "H**o sapiens"),
        ("taxonomy", "x"),
    ]
    vcf_header.add_contig_line(vcfpy.OrderedDict(expected_mapping))

    # State after adding.
    assert len(vcf_header.lines) == 19
    contig_index = vcf_header._indices["contig"]
    assert "20a" in contig_index
    assert contig_index["20a"] is vcf_header.lines[-1]

    # Content of the appended line.
    added = vcf_header.lines[-1]
    assert added.key == "contig"
    expected_value = "".join([
        "<ID=20a,length=62435964,assembly=B36,",
        "md5=f126cdf8a6e0c7f379d618ff66beb2da,",
        'species="H**o sapiens",taxonomy=x>',
    ])
    assert added.value == expected_value
    assert len(added.mapping) == 6
    for field, expected in expected_mapping:
        assert added.mapping[field] == expected
Example #13
0
def test_add_info_line_shortcut(vcf_header):
    """add_info_line should append an INFO line and index it by ID."""
    # State before adding.
    assert len(vcf_header.lines) == 18

    # Add the INFO line through the shortcut.
    expected_value = '<ID=DPa,Number=1,Type=Integer,Description="Total Depth">'
    expected_mapping = [("ID", "DPa"), ("Number", 1), ("Type", "Integer"),
                        ("Description", "Total Depth")]
    vcf_header.add_info_line(vcfpy.OrderedDict(expected_mapping))
    assert len(vcf_header.lines) == 19

    # State after adding.
    assert len(vcf_header.lines) == 19
    info_index = vcf_header._indices["INFO"]
    assert "DPa" in info_index
    assert info_index["DPa"] is vcf_header.lines[-1]

    # Content of the appended line.
    added = vcf_header.lines[-1]
    assert added.key == "INFO"
    assert added.value == expected_value
    assert len(added.mapping) == 4
    for field, expected in expected_mapping:
        assert added.mapping[field] == expected
Example #14
0
 def merge_contig_header(self):
     """Copy the ID and length of each contig line of the first reader into ``merge_header``."""
     first_header = self.readers[0].header
     for contig_line in first_header.get_lines('contig'):
         contig_fields = vcfpy.OrderedDict()
         contig_fields['ID'] = contig_line.id
         contig_fields['length'] = contig_line.length
         self.merge_header.add_contig_line(contig_fields)
Example #15
0
 def add_caller_filter_header(self):
     """
     Add a FILTER line named after each caller to the header of its paired reader
     """
     for paired_reader, caller_name in zip(self.readers, self.callers):
         caller_filter = vcfpy.OrderedDict([
             ('ID', caller_name),
             ('Description', 'Variant caller label.'),
         ])
         paired_reader.header.add_filter_line(caller_filter)
Example #16
0
 def merge_filter_header(self):
     """Merge the FILTER lines of all readers into ``merge_header``.

     Each filter ID is handled once, at its first occurrence. 'PASS' and
     the caller names are copied unchanged; any other filter gets its ID
     prefixed with ``<caller>_`` and its description with ``<caller> ``.
     """
     passthrough_ids = ['PASS'] + list(self.callers)
     handled_ids = set()
     for reader, caller in zip(self.readers, self.callers):
         for filter_line in reader.header.get_lines('FILTER'):
             if filter_line.id in handled_ids:
                 continue
             handled_ids.add(filter_line.id)
             if filter_line.id in passthrough_ids:
                 merged_id = filter_line.id
                 merged_description = filter_line.description
             else:
                 merged_id = '{}_{}'.format(caller, filter_line.id)
                 merged_description = '{} {}'.format(
                     caller, filter_line.description)
             self.merge_header.add_filter_line(
                 vcfpy.OrderedDict([('ID', merged_id),
                                    ('Description', merged_description)]))
Example #17
0
 def merge_info_header(self):
     """Add every reader's INFO lines to ``merge_header``, prefixed with the caller name.

     The ID becomes ``<caller>_<id>`` and the description
     ``<caller> <description>``; Number and Type are copied as they are.
     """
     for reader, caller in zip(self.readers, self.callers):
         for info_line in reader.header.get_lines('INFO'):
             prefixed = vcfpy.OrderedDict()
             prefixed['ID'] = '{}_{}'.format(caller, info_line.id)
             prefixed['Number'] = info_line.number
             prefixed['Type'] = info_line.type
             prefixed['Description'] = '{} {}'.format(
                 caller, info_line.description)
             self.merge_header.add_info_line(prefixed)
Example #18
0
 def merge_format_header(self):
     """Add every reader's FORMAT lines to ``merge_header``, prefixed with the caller name.

     The ID becomes ``<caller>_<id>`` and the description
     ``<caller> <description>``; Number and Type are copied as they are.
     """
     for reader, caller in zip(self.readers, self.callers):
         for format_line in reader.header.get_lines('FORMAT'):
             prefixed = vcfpy.OrderedDict()
             prefixed['ID'] = '{}_{}'.format(caller, format_line.id)
             prefixed['Number'] = format_line.number
             prefixed['Type'] = format_line.type
             prefixed['Description'] = '{} {}'.format(
                 caller, format_line.description)
             self.merge_header.add_format_line(prefixed)
Example #19
0
def test_header_has_header_line_positive():
    """has_header_line should report each (key, ID) pair present in the header."""
    format_line = header.FormatHeaderLine.from_mapping(
        vcfpy.OrderedDict([("ID", "DP"), ("Number", "R"),
                           ("Type", "Integer")]))
    info_line = header.InfoHeaderLine.from_mapping(
        vcfpy.OrderedDict([("ID", "AD"), ("Number", "R"),
                           ("Type", "Integer")]))
    filter_line = header.FilterHeaderLine.from_mapping(
        vcfpy.OrderedDict([("ID", "PASS"),
                           ("Description", "All filters passed")]))
    contig_line = header.ContigHeaderLine.from_mapping(
        vcfpy.OrderedDict([("ID", "1"), ("length", 234)]))

    samples = header.SamplesInfos(["one", "two", "three"])
    hdr = header.Header(
        [format_line, info_line, filter_line, contig_line], samples)

    expected_present = (
        ("FORMAT", "DP"),
        ("INFO", "AD"),
        ("FILTER", "PASS"),
        ("contig", "1"),
    )
    for key, line_id in expected_present:
        assert hdr.has_header_line(key, line_id)
def collect_all_vcf(
    dirs: str,
    vcf_filename: str = "phased.partial.vcf",
    output: str = "IsoSeq_IsoPhase.vcf",
) -> None:
    """Collect the VCF files found in several directories into one sorted VCF.

    For every directory in ``dirs`` the file ``<dir>/<vcf_filename>`` is read.
    The calls of each record are collapsed into a single call named "SAMPLE"
    whose GT joins the observed genotypes with "|" and whose HQ joins their
    summed counts with ",". Records are then written to ``output`` grouped by
    chromosome (sorted keys) and ordered by position. Nothing is written when
    no input file was found.

    Args:
        dirs: iterable of directory paths to scan (the loop iterates it, even
            though the signature annotates it as ``str``).
        vcf_filename: name of the VCF file expected in each directory.
        output: path of the merged VCF to write.
    """
    # A directory without a VCF is only worth a log line when it also lacks
    # the "<stem>.NO_SNPS_FOUND" marker.
    no_snp_found_filename = f"{Path(vcf_filename).stem}.NO_SNPS_FOUND"
    snps_by_chrom = defaultdict(list)

    reader = None

    for d in dirs:
        filename = Path(d, vcf_filename)
        if not filename.exists():
            # Look for the marker next to where the VCF should have been,
            # not in the current working directory.
            if not Path(d, no_snp_found_filename).exists():
                logger.info("VCF file %s does not exist. Skipping.", filename)
            continue
        with open(filename) as rf:
            reader = vcfpy.Reader(rf)

            for r in reader:
                c = Counter()  # genotype -> count
                # NOTE(review): attribute access on ``x.data`` (GT / HQ)
                # follows the PyVCF call style; confirm it matches the call
                # objects produced by the vcfpy version in use.
                for x in r.samples:
                    if x.data.GT.count("|") == 0:
                        c[x.data.GT] += x.data.HQ
                    else:
                        for i, gt in enumerate(x.data.GT.split("|")):
                            c[gt] += x.data.HQ[i]
                c_keys = c.keys()
                genotype = "|".join(str(k) for k in c_keys)
                counts = ",".join(str(c[k]) for k in c_keys)
                r.samples = [
                    vcfpy.Call(
                        r,
                        "SAMPLE",
                        vcfpy.OrderedDict([("GT", genotype), ("HQ", counts)]),
                    )
                ]
                snps_by_chrom[r.CHROM].append((r.POS, r))

    if reader is not None:
        reader.samples = ["SAMPLE"]
        with open(output, "w") as handle:
            # NOTE(review): the last reader is passed as the writer template,
            # as in the original; confirm vcfpy.Writer accepts a reader here
            # rather than ``reader.header``.
            writer = vcfpy.Writer(handle, reader)
            for chrom in sorted(snps_by_chrom):
                # Order by position only; records themselves are never
                # compared.
                for _, rec in sorted(snps_by_chrom[chrom],
                                     key=lambda item: item[0]):
                    writer.write_record(rec)
        print("Output written to:", output)
Example #21
0
def main():
    """Copy a VCF from /dev/stdin to /dev/stdout, adding an 'nhomalt' INFO value to each record."""
    # Input comes from stdin.
    reader = vcfpy.Reader.from_path('/dev/stdin')

    # Declare the new INFO field in the header before the writer copies it.
    nhomalt_fields = vcfpy.OrderedDict()
    nhomalt_fields['ID'] = 'nhomalt'
    nhomalt_fields['Number'] = 'A'
    nhomalt_fields['Type'] = 'Integer'
    nhomalt_fields['Description'] = (
        'The number of individuals that are called homozygous for the alternate allele.'
    )
    reader.header.add_info_line(nhomalt_fields)

    # Output goes to stdout, using the modified header.
    with vcfpy.Writer.from_path('/dev/stdout', reader.header) as writer:
        for record in reader:
            homalt_count = nhomalt(record)
            record.INFO['nhomalt'] = [homalt_count]
            writer.write_record(record)
Example #22
0
def build_rec(calls=None, format_extras=None):
    """Build a Record at "2":100 with REF "C" and the SNV alternatives "T" and "A".

    ``calls`` becomes the record's calls and ``format_extras`` is appended
    to the FORMAT list after "GT"; a falsy value for either means an empty
    list.
    """
    if not calls:
        calls = []
    if not format_extras:
        format_extras = []
    alts = [record.Substitution(vcfpy.SNV, base) for base in ("T", "A")]
    format_keys = ["GT"] + format_extras
    return record.Record("2", 100, [], "C", alts, None, [],
                         vcfpy.OrderedDict(), format_keys, calls)
Example #23
0
def generate_sv_record(records, comparison_result, sample_names):
    """
    Build the single SV record that represents a group of input records.
    :param records: the input records involved in the SV call; CHROM and REF are taken from the first one
    :param comparison_result: supplies initial_position, final_position, svtype and insseq
    :param sample_names: the samples to emit a call for, in output order
    :return: a vcfpy.Record with FILTER PASS and FORMAT GT, TRANCHE2, VAF
    """

    # Index the records by sample name; a sample may own several records.
    records_by_sample = group_by(records, get_sample_name)

    # One call per requested sample, None being passed for samples without records.
    calls = []
    for sample_name in sample_names:
        sample_records = records_by_sample.get(sample_name, None)
        calls.append(get_sample_call(sample_name, sample_records))

    # The original comments state that all grouped records share CHROM, POS
    # and REF by construction, so the first record is used as the template.
    template = records[0]
    chrom = template.CHROM
    record_id = generate_id(chrom, comparison_result.initial_position)

    svtype = comparison_result.svtype
    info = vcfpy.OrderedDict([
        ("SVTYPE", svtype),
        ("END", comparison_result.final_position),
    ])
    insseq = comparison_result.insseq
    if insseq is not None:
        info["INSSEQ"] = insseq

    symbolic_alt = vcfpy.Substitution(type_=svtype,
                                      value='<{}>'.format(svtype))
    return vcfpy.Record(
        CHROM=chrom,
        POS=comparison_result.initial_position,
        ID=[record_id],
        REF=template.REF,
        ALT=[symbolic_alt],
        QUAL=maximum_qual(records),
        FILTER=["PASS"],
        INFO=info,
        FORMAT=["GT", "TRANCHE2", "VAF"],
        calls=calls,
    )
Example #24
0
def test_add_filter_line_shortcut(vcf_header):
    """add_filter_line should append a FILTER line and index it by ID."""
    # State before adding.
    assert len(vcf_header.lines) == 18

    # Add the FILTER line through the shortcut.
    expected_mapping = [("ID", "q10a"), ("Description", "Quality below 10")]
    vcf_header.add_filter_line(vcfpy.OrderedDict(expected_mapping))

    # State after adding.
    assert len(vcf_header.lines) == 19
    filter_index = vcf_header._indices["FILTER"]
    assert "q10a" in filter_index
    assert filter_index["q10a"] is vcf_header.lines[-1]

    # Content of the appended line.
    added = vcf_header.lines[-1]
    assert added.key == "FILTER"
    expected_value = '<ID=q10a,Description="Quality below 10">'
    assert added.value == expected_value
    assert len(added.mapping) == 2
    for field, expected in expected_mapping:
        assert added.mapping[field] == expected
Example #25
0
 def select_info_header(self):
     """Copy the selected INFO fields into ``write_header`` without their caller prefix.

     For each name in ``info_fields`` the callers in ``caller_priority`` are
     tried in order; the first ``<caller>_<field>`` found in the reader
     header is added with the ID stripped of its first ``_``-separated part
     and the description stripped of its first word. A message is printed
     for every caller that lacks the field.

     Raises:
         Exception: when some of ``info_fields`` are not among the INFO IDs
             of ``write_header`` afterwards.
     """
     for info_field in self.info_fields:
         for caller in self.caller_priority:
             prefixed_id = '{}_{}'.format(caller, info_field)
             if prefixed_id not in self.reader.header.info_ids():
                 print('{} not found for {} in INFO column.'.format(
                     info_field, caller))
                 continue
             info = self.reader.header.get_info_field_info(prefixed_id)
             selected = vcfpy.OrderedDict()
             selected['ID'] = info.id.split('_', 1)[1]
             selected['Number'] = info.number
             selected['Type'] = info.type
             selected['Description'] = info.description.split(' ', 1)[1]
             self.write_header.add_info_line(selected)
             # The highest-priority caller wins; skip the remaining ones.
             break
     missing = list(
         set(self.info_fields) - set(self.write_header.info_ids()))
     if missing:
         raise Exception(', '.join(missing) +
                         ' INFO field(s) not found in VCF')
Example #26
0
def modify_outheader(outheader):
    """Make sure ``outheader`` declares the CALLER/SS/OFS INFO and GT/AD/AAF FORMAT fields.

    A missing field is added; for an existing one the mapping of its header
    line is updated with the definition used here. Returns ``outheader``.
    """

    def _definition(field_id, number, type_, description):
        # Header-line mapping in ID, Number, Type, Description order.
        return vcfpy.OrderedDict([('ID', field_id), ('Number', number),
                                  ('Type', type_),
                                  ('Description', description)])

    info_definitions = (
        _definition('CALLER', '1', 'String', 'Variant Call method'),
        _definition('SS', '1', 'String',
                    'Somatic Status from respective Call Method'),
        _definition('OFS', '.', 'String', 'Original FILTER State'),
    )
    format_definitions = (
        _definition('GT', '1', 'String', 'Genotype'),
        _definition('AD', 'R', 'Integer', 'Alt Allele Depth'),
        _definition('AAF', '1', 'Float', 'Alt Allele Frequency'),
    )

    for definition in info_definitions:
        info_id = definition['ID']
        if info_id in outheader.info_ids():
            outheader.get_info_field_info(info_id).mapping.update(definition)
        else:
            outheader.add_info_line(definition)

    for definition in format_definitions:
        format_id = definition['ID']
        if format_id in outheader.format_ids():
            outheader.get_format_field_info(format_id).mapping.update(
                definition)
        else:
            outheader.add_format_line(definition)

    return outheader
Example #27
0
def main():
    """Register the TFX_* INFO fields, gather the Annovar metrics and write the annotated VCF."""
    args = supply_args()
    myvcf = VcfReader(args.infile)
    vrnts = myvcf.get_vrnts()
    coll = CollectMetrics()

    # INFO header lines to register, as (ID, Description). All of them are
    # declared with Number '.' and Type 'String'.
    tfx_info_fields = (
        ('TFX_AAP', 'Amino acid start position.'),
        ('TFX_BASEP', 'Coding sequence start position.'),
        ('TFX_EXON', 'Exon number associated with given transcript.'),
        ('TFX_HGNC', 'HGNC gene symbol.'),
        ('TFX_HGVSC', 'HGVS cdot nomenclature.'),
        ('TFX_HGVSP1', 'HGVS pdot nomenclature, single letter amino acids.'),
        ('TFX_HGVSP3', 'HGVS pdot nomenclature, three letter amino acids.'),
        ('TFX_SOURCE', 'Annotation source.'),
        ('TFX_SPLICE', 'Splice site annotation.'),
        ('TFX_TXC', 'Transcript identifier.'),
        ('TFX_VFX', 'Variant effect annotation.'),
        ('TFX_PVT', 'Variant type or location annotation.'),
    )
    for field_id, description in tfx_info_fields:
        myvcf.vcf_reader.header.add_info_line(
            vcfpy.OrderedDict([('ID', field_id), ('Number', '.'),
                               ('Type', 'String'),
                               ('Description', description)]))

    # Push the metrics of every Annovar input given on the command line,
    # each paired with the parser class that reads it.
    annovar_inputs = (
        (args.evf, AnnovarExonVrntFunc),
        (args.vf, AnnovarVrntFunc),
        (args.ccds_evf, AnnovarExonVrntFunc),
        (args.ccds_vf, AnnovarVrntFunc),
    )
    for annovar_path, parser_cls in annovar_inputs:
        if annovar_path:
            coll.metrics_push(parser_cls(annovar_path).read_annovar())

    # Per coordinate, merge the per-transcript dicts into field -> list of values.
    to_write = {}
    for coord, txs in coll.metrics.items():
        comb = defaultdict(list)
        for tx_fields in txs.values():
            for field, value in tx_fields.items():
                comb[field].append(value)
        to_write[coord] = comb
        # Every TFX field must have collected the same number of values.
        distinct_lengths = {len(values) for values in comb.values()}
        assert len(distinct_lengths) <= 1, "%s" % comb

    writer = VcfWriter(args.outfile, myvcf.vcf_reader)
    writer.write_metrics(vrnts, to_write)
Example #28
0
def get_header(sample_name_to_header, chromosome_set):
    """
    Builds the header of the output VCF file.

    The result holds a VCFv4.2 fileformat line, the contig lines copied from
    the first header found in ``sample_name_to_header``, the INFO and FORMAT
    declarations listed below, and the sample names in sorted order.

    :param sample_name_to_header: a dictionary from the sample names to the headers
    :param chromosome_set: the set of chromosomes selected for analysis; when
        None, every contig line is kept
    :return: a vcfpy.Header
    """
    # Descriptions shared verbatim between the INFO and FORMAT declarations
    insseq_description = (
        "Insertion sequence of structural variant, not including sequence marked as duplication"
    )
    tranche2_description = (
        "Quality category of GRIDSS structural variant calls determined using FILTER,SRQ,AS,RAS. Values are LOW INTERMEDIATE HIGH"
    )

    # (ID, Type, Description) triples, in the order they are declared.
    # NOTE(review): the INFO and FORMAT descriptions of BNDVAF name different
    # denominators (REF vs REFPAIR) -- confirm which one is intended.
    info_fields = (
        ("END", "Integer", "Stop position of the interval"),
        ("SVTYPE", "String", "Type of structural variant"),
        ("INSSEQ", "String", insseq_description),
        ("TRANCHE2", "String", tranche2_description),
        ("BNDVAF", "Float",
         "VAF of this gridss-called BND calculated as (SR+RP+IC+AS)/(REF+SR+RP+IC+AS)"),
    )
    format_fields = (
        ("GT", "String", "Genotype"),
        ("TRANCHE2", "String", tranche2_description),
        ("BNDVAF", "Float",
         "VAF of this gridss-called BND calculated as (SR+RP+IC+AS)/(REFPAIR+SR+RP+IC+AS)"),
        ("VAF", "Float",
         "VAF of this SV call, derived from BNDVAF values of BND calls used to call this SV"),
        ("INSSEQ", "String", insseq_description),
    )

    def as_mapping(field_id, value_type, description):
        # Every field declared by this function uses Number=1
        return vcfpy.OrderedDict(ID=field_id,
                                 Number=1,
                                 Type=value_type,
                                 Description=description)

    output_header = vcfpy.Header()
    output_header.add_line(vcfpy.HeaderLine(key="fileformat", value="VCFv4.2"))

    # CONTIG headers, taken from whichever input header the mapping yields first
    reference_header = next(iter(sample_name_to_header.values()))
    for line in reference_header.lines:
        if not isinstance(line, vcfpy.ContigHeaderLine):
            continue
        if chromosome_set is not None and line.mapping["ID"] not in chromosome_set:
            continue
        output_header.add_line(line)

    # INFO fields
    for spec in info_fields:
        output_header.add_info_line(as_mapping(*spec))

    # FORMAT fields
    for spec in format_fields:
        output_header.add_format_line(as_mapping(*spec))

    # Samples, sorted to ensure determinism
    output_header.samples = vcfpy.SamplesInfos(sorted(sample_name_to_header))

    return output_header
Example #29
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import vcfpy

# Open input, add FILTER header, and open output file
reader = vcfpy.Reader.from_path("input.vcf")
reader.header.add_filter_line(
    vcfpy.OrderedDict([("ID", "DP10"), ("Description", "total DP < 10")]))
writer = vcfpy.Writer.from_path("/dev/stdout", reader.header)

# Add "DP10" filter to records whose DP, summed over all calls, is below 10
for record in reader:
    # `.get("DP", 0)` alone only covers a missing key; a DP entry that is
    # present but None (e.g. a "." placeholder -- confirm against the vcfpy
    # parser) would make sum() raise TypeError, so count it as 0 as well.
    total_dp = sum(c.data.get("DP") or 0 for c in record.calls)
    if total_dp < 10:
        record.add_filter("DP10")
    writer.write_record(record)
Example #30
0
        # Variant calling score (QUAL) field
        dico_vcf[var_id]["features"][caller_name]["QUAL"] = record.QUAL
        # Filter field
        dico_vcf[var_id]["features"][caller_name]["FILTER"] = record.FILTER
        # Genotype (GT) field
        dico_vcf[var_id]["features"][caller_name]["GT"] = record.calls[0].data.get('GT').replace("|","/")
        # Read Depth (DP) field
        dico_vcf[var_id]["features"][caller_name]["DP"] = record.calls[0].data.get('DP')
        # Allele Frequency (AF) field
        if caller_name in ["strelka","deepvariant"]: dico_vcf[var_id]["features"][caller_name]["AF"] = round((float(record.calls[0].data['AD'][1])/(float(record.calls[0].data['AD'][0])+float(record.calls[0].data['AD'][1]))),2) # for strelka and deepvariant AD for ref and alt is in FORMAT
        else: dico_vcf[var_id]["features"][caller_name]["AF"] = round(float(record.INFO['AF'][0]),2)
#***** CREATE new vcf header *****#
# Header line listing the caller names, joined with "|"
new_header.add_line(
    vcfpy.HeaderLine("Nk_calls", "|".join(lst_caller_name))
)

# FILTER lines: one per entry of dico_filter_line, then the "FILTER" entry
for filter_id, filter_description in dico_filter_line.items():
    new_header.add_filter_line(
        vcfpy.OrderedDict([
            ('ID', filter_id),
            ('Description', filter_description),
        ])
    )
new_header.add_filter_line(
    vcfpy.OrderedDict([
        ('ID', "FILTER"),
        ('Description', "All callers filtered"),
    ])
)

# INFO declarations (built first, then registered in the same order)
dico_line_info_CALLNB = collections.OrderedDict([
    ("ID", "CALLNB"),
    ("Number", "A"),
    ("Type", "Integer"),
    ("Description", "Number of PASS calls"),
])
dico_line_info_CALLAF = collections.OrderedDict([
    ("ID", "CALLAF"),
    ("Number", "A"),
    ("Type", "String"),
    ("Description", "Allele frequency per caller"),
])
dico_line_info_CALLFILTER = collections.OrderedDict([
    ("ID", "CALLFILTER"),
    ("Number", "A"),
    ("Type", "String"),
    ("Description", "Filters per caller"),
])
dico_line_info_CALLQUAL = collections.OrderedDict([
    ("ID", "CALLQUAL"),
    ("Number", "A"),
    ("Type", "String"),
    ("Description", "Variant quality per caller"),
])
for info_line in (
    dico_line_info_CALLNB,
    dico_line_info_CALLAF,
    dico_line_info_CALLFILTER,
    dico_line_info_CALLQUAL,
):
    new_header.add_info_line(info_line)

# FORMAT declarations (variable names keep their historical "info" prefix)
dico_line_info_GT = collections.OrderedDict([
    ("ID", "GT"),
    ("Number", "1"),
    ("Type", "String"),
    ("Description", "Genotype"),
])
dico_line_info_DP = collections.OrderedDict([
    ("ID", "DP"),
    ("Number", "1"),
    ("Type", "Integer"),
    ("Description", "Read depth (median if multiple calls)"),
])
for format_line in (dico_line_info_GT, dico_line_info_DP):
    new_header.add_format_line(format_line)