Example #1
    def test_run_with_small_var_vcf_chunking_total_splits(self):
        '''test run with chunking small variant VCF file using total_splits option'''
        input_tsv = 'tmp.multi_sample_pipeline.run.in.tsv'
        ref_fasta = os.path.join(data_dir, 'run.ref.0.fa')
        with open(input_tsv, 'w') as f:
            for i in '1', '2':
                reads = os.path.join(data_dir, 'run.reads.' + i + '.sorted.bam')
                vcf = os.path.join(data_dir, 'run.calls.' + i + '.vcf')
                print(vcf, reads, sep='\t', file=f)

        outdir = 'tmp.multi_sample_pipeline.run.out'
        if os.path.exists(outdir):
            shutil.rmtree(outdir)

        pipeline = multi_sample_pipeline.MultiSamplePipeline(
            ref_fasta,
            input_tsv,
            outdir,
            total_splits=3,
            min_large_ref_length=10,
            testing=True,
            clean=False,
        )
        pipeline.run()

        expected_vcf = os.path.join(data_dir, 'run.out.vcf')
        expected_header, expected_lines = vcf_file_read.vcf_file_to_list(expected_vcf)
        got_vcf = os.path.join(outdir, 'combined_calls.vcf')
        self.assertTrue(os.path.exists(got_vcf))
        got_header, got_lines = vcf_file_read.vcf_file_to_list(got_vcf)
        # the file date, minos version, and bcftools versions might not match
        expected_header = [x for x in expected_header if not (x.startswith('##fileDate') or x.startswith('##source=minos') or x.startswith('##bcftools_mergeVersion'))]
        got_header = [x for x in got_header if not (x.startswith('##fileDate') or x.startswith('##source=minos') or x.startswith('##bcftools_mergeVersion'))]
        self.assertEqual(expected_header, got_header)
        self.assertEqual(expected_lines, got_lines)

        shutil.rmtree(outdir)
        os.unlink(input_tsv)
Example #2
def check_vcfs(expected_vcf, got_vcf):
    expected_header, expected_vcf_records = vcf_file_read.vcf_file_to_list(
        expected_vcf)
    got_header, got_vcf_records = vcf_file_read.vcf_file_to_list(got_vcf)
    for i in range(len(expected_header)):
        if expected_header[i].startswith("##fileDate="):
            expected_header[i] = "##fileDate=" + str(datetime.date.today())
        elif expected_header[i].startswith("##source=cluster_vcf_records"):
            expected_header[i] = ("##source=cluster_vcf_records, version " +
                                  cluster_vcf_records_version)

    return expected_header == got_header and expected_vcf_records == got_vcf_records
Example #3
    def _test_run_with_small_var_vcf_chunking_vars_per_split(self):
        """test run with chunking small variant VCF file using variants_per_split option"""
        input_tsv = "tmp.multi_sample_pipeline.run.in.tsv"
        ref_fasta = os.path.join(data_dir, "run.ref.0.fa")
        with open(input_tsv, "w") as f:
            for i in "1", "2":
                reads1 = os.path.join(data_dir,
                                      "run.reads." + i + ".sorted.bam")
                reads2 = os.path.join(data_dir,
                                      "run.reads." + i + ".sorted.bam")
                vcf = os.path.join(data_dir, "run.calls." + i + ".vcf")
                print(vcf, reads1, reads2, sep="\t", file=f)

        outdir = "tmp.multi_sample_pipeline.run.out"
        if os.path.exists(outdir):
            shutil.rmtree(outdir)

        pipeline = multi_sample_pipeline.MultiSamplePipeline(
            ref_fasta,
            input_tsv,
            outdir,
            variants_per_split=3,
            min_large_ref_length=10,
            testing=True,
            clean=False,
        )
        pipeline.run()

        expected_vcf = os.path.join(data_dir, "run.out.vcf")
        expected_header, expected_lines = vcf_file_read.vcf_file_to_list(
            expected_vcf)
        got_vcf = os.path.join(outdir, "combined_calls.vcf")
        self.assertTrue(os.path.exists(got_vcf))
        got_header, got_lines = vcf_file_read.vcf_file_to_list(got_vcf)
        # the file date, minos version, and bcftools versions might not match
        expected_header = [
            x for x in expected_header
            if not (x.startswith("##fileDate") or x.startswith(
                "##source=minos") or x.startswith("##bcftools_mergeVersion"))
        ]
        got_header = [
            x for x in got_header
            if not (x.startswith("##fileDate") or x.startswith(
                "##source=minos") or x.startswith("##bcftools_mergeVersion"))
        ]
        self.assertEqual(expected_header, got_header)
        self.assertEqual(expected_lines, got_lines)

        shutil.rmtree(outdir)
        os.unlink(input_tsv)
Example #4
        def check_vcfs(expected_vcf, got_vcf):
            expected_header, expected_vcf_records = vcf_file_read.vcf_file_to_list(
                expected_vcf)
            got_header, got_vcf_records = vcf_file_read.vcf_file_to_list(
                got_vcf)
            for i in range(len(expected_header)):
                if expected_header[i].startswith("##fileDate="):
                    expected_header[i] = "##fileDate=" + str(
                        datetime.date.today())
                elif expected_header[i].startswith("##source=minos"):
                    expected_header[
                        i] = "##source=minos, version " + minos_version

            self.assertEqual(expected_header, got_header)
            self.assertEqual(expected_vcf_records, got_vcf_records)
Example #5
    def test_load_gramtools_vcf_and_allele_coverage_files(self):
        """test load_gramtools_vcf_and_allele_coverage_files"""
        vcf_file = os.path.join(data_dir,
                                "load_gramtools_vcf_and_allele_coverage.vcf")
        quasimap_dir = os.path.join(
            data_dir, "load_gramtools_vcf_and_allele_coverage_files.quasimap")
        got_mean_depth, got_depth_variance, got_vcf_header, got_vcf_records, got_allele_coverage, got_allele_groups = gramtools.load_gramtools_vcf_and_allele_coverage_files(
            vcf_file, quasimap_dir)

        expected_header, expected_vcf_records = vcf_file_read.vcf_file_to_list(
            vcf_file)
        self.assertEqual(expected_header, got_vcf_header)
        self.assertEqual(expected_vcf_records, got_vcf_records)
        self.assertEqual(10.500, got_mean_depth)
        self.assertEqual(0.5, got_depth_variance)

        # now test bad files cause error to be raised
        vcf_file = os.path.join(
            data_dir, "load_gramtools_vcf_and_allele_coverage.short.vcf")
        with self.assertRaises(Exception):
            gramtools.load_gramtools_vcf_and_allele_coverage_files(
                vcf_file, quasimap_dir)

        vcf_file = os.path.join(
            data_dir, "load_gramtools_vcf_and_allele_coverage.long.vcf")
        with self.assertRaises(Exception):
            gramtools.load_gramtools_vcf_and_allele_coverage_files(
                vcf_file, quasimap_dir)

        vcf_file = os.path.join(
            data_dir,
            "load_gramtools_vcf_and_allele_coverage.bad_allele_count.vcf")
        with self.assertRaises(Exception):
            gramtools.load_gramtools_vcf_and_allele_coverage_files(
                vcf_file, quasimap_dir)
Example #6
    def test_vcf_file_to_list(self):
        """test vcf_file_to_list"""
        expected_header = ["# header1", "# header2"]
        lines = [
            "ref_42\t12\tid_foo\tC\tG\t42.43\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,53:39.81",
            "ref_42\t11\tid_foo\tA\tG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80",
            "ref_43\t42\tid_foo\tT\tG\t43.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,54:39.82",
        ]
        expected_records = [vcf_record.VcfRecord(x) for x in lines]
        infile = os.path.join(data_dir, "vcf_file_to_list.vcf")
        got_header, got_records = vcf_file_read.vcf_file_to_list(infile)
        self.assertEqual(expected_header, got_header)
        self.assertEqual(expected_records, got_records)

        infile = os.path.join(data_dir, "vcf_file_to_list.vcf.gz")
        got_header, got_records = vcf_file_read.vcf_file_to_list(infile)
        self.assertEqual(expected_header, got_header)
        self.assertEqual(expected_records, got_records)
Example #7
    def test_vcf_file_to_list(self):
        '''test vcf_file_to_list'''
        expected_header = ['# header1', '# header2']
        lines = [
            'ref_42\t12\tid_foo\tC\tG\t42.43\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,53:39.81',
            'ref_42\t11\tid_foo\tA\tG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80',
            'ref_43\t42\tid_foo\tT\tG\t43.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,54:39.82',
        ]
        expected_records = [vcf_record.VcfRecord(x) for x in lines]
        infile = os.path.join(data_dir, 'vcf_file_to_list.vcf')
        got_header, got_records = vcf_file_read.vcf_file_to_list(infile)
        self.assertEqual(expected_header, got_header)
        self.assertEqual(expected_records, got_records)

        infile = os.path.join(data_dir, 'vcf_file_to_list.vcf.gz')
        got_header, got_records = vcf_file_read.vcf_file_to_list(infile)
        self.assertEqual(expected_header, got_header)
        self.assertEqual(expected_records, got_records)
Example #8
def per_record_stats_from_vcf_file(infile):
    """Gathers stats for each record in a VCF file.
    Returns a list of dictionaries of stats. One dict per VCF line.
    List is sorted by ref seq name (CHROM), then position (POS)"""
    stats = []
    wanted_keys = [
        "DP",
        "DPF",
        "FRS",
        "GT_CONF",
        "GT_CONF_PERCENTILE",
        "VFR_IN_MASK",
        "VFR_ED_RA",
        "VFR_ED_TR",
        "VFR_ED_TA",
        "VFR_FILTER",
        "VFR_ALLELE_LEN",
        "VFR_ALLELE_MATCH_COUNT",
        "VFR_ALLELE_MATCH_FRAC",
        "VFR_RESULT",
    ]
    key_types = {
        "DP": int,
        "DPF": float,
        "GT_CONF": float,
        "GT_CONF_PERCENTILE": float,
        "FRS": float,
        "VFR_IN_MASK": int,
        "VFR_ED_RA": int,
        "VFR_ED_TR": int,
        "VFR_ED_TA": int,
        "VFR_ALLELE_MATCH_FRAC": float,
        "VFR_ALLELE_LEN": int,
        "VFR_ALLELE_MATCH_COUNT": int,
    }
    header_lines, vcf_records = vcf_file_read.vcf_file_to_list(infile)
    for record in vcf_records:
        record_stats = {x: record.FORMAT.get(x, "NA") for x in wanted_keys}
        record_stats["FRS"] = _frs_from_vcf_record(record)
        record_stats["CHROM"] = record.CHROM
        record_stats["POS"] = record.POS + 1
        for key, key_type in key_types.items():
            try:
                record_stats[key] = key_type(record_stats[key])
            except:
                pass

        stats.append(record_stats)

    stats.sort(key=itemgetter("CHROM", "POS"))
    return stats
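
For reference, a hypothetical caller might dump the returned per-record stats to a tab-separated file. The sketch below is illustrative only and is not part of the minos code above; the file names are made up.

# Illustrative only: write the per-record stats from the function above to a TSV.
import csv

stats = per_record_stats_from_vcf_file("calls.vcf")  # made-up input path
if stats:
    with open("per_record_stats.tsv", "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=sorted(stats[0]), delimiter="\t")
        writer.writeheader()
        writer.writerows(stats)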
Example #9
def test_normalise_vcf():
    infile = os.path.join(data_dir, "normalise_vcf.in.vcf")
    ref_fa = os.path.join(data_dir, "normalise_vcf.in.fa")
    expect = os.path.join(data_dir, "normalise_vcf.out.vcf")
    tmp_out = "tmp.normalise_vcf.vcf"
    utils.rm_rf(tmp_out)
    utils.normalise_vcf(infile, ref_fa, tmp_out)
    expected_header, expected_vcf_records = vcf_file_read.vcf_file_to_list(
        expect)
    got_header, got_vcf_records = vcf_file_read.vcf_file_to_list(tmp_out)
    # The normalizing commands add lots of lines to the header.
    # We don't care about those, so just check the actual records.
    assert got_vcf_records == expected_vcf_records
    os.unlink(tmp_out)

    # test again but without breaking alleles into separate records
    utils.normalise_vcf(infile, ref_fa, tmp_out, break_alleles=False)
    expect = os.path.join(data_dir, "normalise_vcf.out.no_break_alleles.vcf")
    expected_header, expected_vcf_records = vcf_file_read.vcf_file_to_list(
        expect)
    got_header, got_vcf_records = vcf_file_read.vcf_file_to_list(tmp_out)
    assert got_vcf_records == expected_vcf_records
    os.unlink(tmp_out)
Example #10
def get_probes_and_vcf_records(
    vcf_file, ref_seqs, flank_length, use_fail_conflict=False
):
    """Input vcf_file is assumed to have been made by vcf_qc_annotate.add_qc_to_vcf(),
    so that each record has the FORMAT tag VFR_FILTER.
    For each line of the input VCF file, yields a
    tuple (vcf_record, alt probe sequence).
    vcf_file = name of VCF file.
    ref_seqs = dictionary of sequence name -> sequence.
    flank_length = number of nucleotides to add either side of variant sequence."""
    header_lines, vcf_records = vcf_file_read.vcf_file_to_list(vcf_file)
    yield header_lines
    wanted_format = _get_wanted_format(use_fail_conflict)

    for record in vcf_records:
        if record.FORMAT["VFR_FILTER"] not in wanted_format:
            yield record, None, None
            continue

        flank_start = max(0, record.POS - flank_length)
        ref_seq = ref_seqs[record.CHROM]
        if ref_seq[record.POS : record.POS + len(record.REF)] != record.REF:
            record.set_format_key_value("VFR_FILTER", "REF_STRING_MISMATCH")
            yield record, None, None
            continue

        flank_end = min(len(ref_seq) - 1, record.ref_end_pos() + flank_length)
        probe_allele_start = record.POS - flank_start

        alt_index = int(record.FORMAT["GT"].split("/")[0])
        alt_allele = record.REF if alt_index == 0 else record.ALT[alt_index - 1]
        alt_probe_allele_end = probe_allele_start + len(alt_allele) - 1
        alt_probe_seq = (
            ref_seq[flank_start : record.POS]
            + alt_allele
            + ref_seq[record.ref_end_pos() + 1 : flank_end + 1]
        )
        alt_probe = probe.Probe(alt_probe_seq, probe_allele_start, alt_probe_allele_end)
        assert alt_probe.allele_seq() == alt_allele

        ref_probe_allele_end = probe_allele_start + len(record.REF) - 1
        ref_probe_seq = (
            ref_seq[flank_start : record.POS]
            + record.REF
            + ref_seq[record.ref_end_pos() + 1 : flank_end + 1]
        )
        ref_probe = probe.Probe(ref_probe_seq, probe_allele_start, ref_probe_allele_end)
        assert ref_probe.allele_seq() == record.REF

        yield record, ref_probe, alt_probe
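
Because this generator yields the header list first and (vcf_record, ref_probe, alt_probe) tuples afterwards, a consumer has to pull that first item separately. A minimal sketch, assuming ref_seqs is built with utils.file_to_dict_of_seqs() as in Example #21; the file names and flank length are illustrative.

# Sketch of consuming the generator above; paths and flank_length are made up.
ref_seqs = utils.file_to_dict_of_seqs("ref.fasta")
probe_gen = get_probes_and_vcf_records("calls.vcf", ref_seqs, flank_length=100)
header_lines = next(probe_gen)  # the first yielded item is the list of VCF header lines
for record, ref_probe, alt_probe in probe_gen:
    if ref_probe is None:  # record failed VFR_FILTER or mismatched the reference
        continue
    # ... use ref_probe and alt_probe for probe mapping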
Example #11
def _vcf_file_to_dict(vcf_file):
    """Loads VCF file. Returns a dictionary of sequence name -> sorted list
    by position of variants"""
    records = {}

    header_lines, vcf_records = vcf_file_read.vcf_file_to_list(vcf_file)
    for record in vcf_records:
        if record.CHROM not in records:
            records[record.CHROM] = []
        records[record.CHROM].append(record)

    for l in records.values():
        l.sort(key=operator.attrgetter("POS"))

    return records
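
A short illustrative look at the shape of the mapping this returns (sequence name -> records sorted by POS); the input path is made up.

# Illustrative: iterate over the chrom -> sorted-records mapping.
records_by_chrom = _vcf_file_to_dict("calls.vcf")
for chrom, chrom_records in sorted(records_by_chrom.items()):
    print(chrom, len(chrom_records), chrom_records[0].POS, chrom_records[-1].POS)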
Example #12
    def run(self):
        header_lines, vcf_records = vcf_file_read.vcf_file_to_list(self.infile)

        with open(self.outfile_small_vars,
                  "w") as f_small, open(self.outfile_long_deletions,
                                        "w") as f_big:
            print(*header_lines, sep="\n", file=f_small)
            print(*header_lines, sep="\n", file=f_big)

            for original_record in vcf_records:
                split_records = original_record.split_into_snps()
                for record in split_records:
                    if len(record.REF) < self.min_large_ref_length:
                        print(record, file=f_small)
                    else:
                        print(record, file=f_big)
Example #13
def get_probes_and_vcf_records(vcf_file,
                               ref_seqs,
                               flank_length,
                               use_fail_conflict=False):
    """For each line of the input VCF file, yields a
    tuple (vcf_record, alt probe sequence).
    vcf_file = name of VCF file.
    ref_seqs = dictionary of sequence name -> sequence.
    flank_length = number of nucleotides to add either side of variant sequence."""
    header_lines, vcf_records = vcf_file_read.vcf_file_to_list(vcf_file)
    yield header_lines

    for i, vcf_record in enumerate(vcf_records):
        ref_probe, alt_probe = make_probes(ref_seqs, vcf_records, i,
                                           flank_length)
        yield vcf_record, ref_probe, alt_probe
Example #14
def load_gramtools_vcf_and_allele_coverage_files(vcf_file, quasimap_dir):
    """Loads the perl_generated_vcf file and allele_coverage files.
    Sanity checks that they agree: 1) same number of lines (excluding header
    lines in vcf) and 2) number of alts agree on each line.
    Raises an error the first time something wrong is found.
    Returns a tuple: (mean depth, depth variance, VCF header, VCF records,
    allele coverage, allele groups)"""
    allele_base_counts_file = os.path.join(quasimap_dir, "quasimap_outputs",
                                           "allele_base_coverage.json")
    grouped_allele_counts_file = os.path.join(
        quasimap_dir, "quasimap_outputs",
        "grouped_allele_counts_coverage.json")
    all_allele_coverage, allele_groups = load_allele_files(
        allele_base_counts_file, grouped_allele_counts_file)
    vcf_header, vcf_lines = vcf_file_read.vcf_file_to_list(vcf_file)
    coverages = []

    if len(all_allele_coverage) != len(vcf_lines):
        raise Exception("Number of records in VCF (" + str(len(vcf_lines)) +
                        ") does not match number output from gramtools.(" +
                        str(len(all_allele_coverage)) + "). Cannot continue")

    for i, (allele_combi_coverage,
            allele_per_base_coverage) in enumerate(all_allele_coverage):
        if len(allele_per_base_coverage) != 1 + len(vcf_lines[i].ALT):
            raise Exception(
                "Mismatch in number of alleles for this VCF record:\n" +
                str(vcf_lines[i]) + "\nLine number is " + str(i + 1))

        coverages.append(sum(allele_combi_coverage.values()))

    assert len(coverages) > 0
    # An edge case that is unlikely on real data is when coverages has length 1.
    # It happens when running test_run in adjudicator_test with a split VCF,
    # where one of the splits only has 1 record.
    if len(coverages) == 1:
        variance = 1.000
    else:
        variance = round(statistics.variance(coverages), 3)

    return (
        round(statistics.mean(coverages), 3),
        variance,
        vcf_header,
        vcf_lines,
        all_allele_coverage,
        allele_groups,
    )
Example #15
def minos_vcf_to_plot_data(infile, outfile):
    header, records = vcf_file_read.vcf_file_to_list(infile)
    data = []
    tp_or_fp_types = set()
    output_cols = ["DP", "GT_CONF"]

    for record in records:
        if record.FORMAT is None:
            continue

        check_geno = record.FORMAT.get("MINOS_CHECK_GENOTYPE", None)
        dp = record.FORMAT.get("DP", None)
        gt_conf = record.FORMAT.get("GT_CONF", None)
        if dp is not None and gt_conf is not None:
            to_append = [dp, gt_conf]

            if check_geno is not None:
                if check_geno == "0":
                    tp_or_fp_type = "FP"
                elif check_geno == "1":
                    tp_or_fp_type = "TP"
                else:
                    tp_or_fp_type = "Unknown"

                tp_or_fp_types.add(tp_or_fp_type)
                to_append.append(tp_or_fp_type)

            data.append(to_append)

    if len(data) == 0:
        logging.warning("No DP and GT_CONF data found in VCF file " + infile +
                        " therefore no plots will be made")
        return None

    if "TP" in tp_or_fp_types or "FP" in tp_or_fp_types:
        output_cols.append("TP_OR_FP")

    with open(outfile, "w") as f:
        print(*output_cols, sep="\t", file=f)
        for l in data:
            if len(l) < len(output_cols):
                l.append("Unknown")
            print(*l[:len(output_cols)], sep="\t", file=f)

    return tp_or_fp_types
Example #16
def minos_vcf_to_plot_data(infile, outfile):
    header, records = vcf_file_read.vcf_file_to_list(infile)
    data = []
    tp_or_fp_types = set()
    output_cols = ['DP', 'GT_CONF']

    for record in records:
        if record.FORMAT is None:
            continue

        check_geno = record.FORMAT.get('MINOS_CHECK_GENOTYPE', None)
        dp = record.FORMAT.get('DP', None)
        gt_conf = record.FORMAT.get('GT_CONF', None)
        if dp is not None and gt_conf is not None:
            to_append = [dp, gt_conf]

            if check_geno is not None:
                if check_geno == '0':
                    tp_or_fp_type = 'FP'
                elif check_geno == '1':
                    tp_or_fp_type = 'TP'
                else:
                    tp_or_fp_type = 'Unknown'

                tp_or_fp_types.add(tp_or_fp_type)
                to_append.append(tp_or_fp_type)

            data.append(to_append)

    if len(data) == 0:
        logging.warning('No DP and GT_CONF data found in VCF file ' + infile + ' therefore no plots will be made')
        return None

    if 'TP' in tp_or_fp_types or 'FP' in tp_or_fp_types:
        output_cols.append('TP_OR_FP')

    with open(outfile, 'w') as f:
        print(*output_cols, sep='\t', file=f)
        for l in data:
            if len(l) < len(output_cols):
                l.append('Unknown')
            print(*l[:len(output_cols)], sep='\t', file=f)

    return tp_or_fp_types
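
A caller can only tell from the return value whether truth-status labels were present in the VCF; a hypothetical caller (file names are made up) might branch on it like this.

# Hypothetical caller of the function above; file names are illustrative.
tp_or_fp_types = minos_vcf_to_plot_data("calls.vcf", "plot_data.tsv")
if tp_or_fp_types is None:
    pass  # no usable DP/GT_CONF data, so no plot data file was written
elif "TP" in tp_or_fp_types or "FP" in tp_or_fp_types:
    pass  # plot_data.tsv has a TP_OR_FP column as well as DP and GT_CONF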
Example #17
def _filter_fps_and_long_vars_from_probe_mapped_vcf(vcf_in, vcf_out, max_ref_len, detailed_VCF=False):
    """vcf_in should be file made by _merge_vcf_files_for_probe_mapping, and
    then annotated using probe_mapping.annotate_vcf_with_probe_mapping().
    Outputs a new VCF file that only contains the TPs, based on probe mapping"""
    header_lines, vcf_records = vcf_file_read.vcf_file_to_list(vcf_in)
    with open(vcf_out, "w") as f:
        for line in header_lines:
            if (
                line.startswith("#CHROM")
                or line.startswith("##fileformat")
                or line.startswith("##contig")
                or line.startswith("##FORMAT=<ID=GT")
            ):
                print(line, file=f)

        i = 0
        while i < len(vcf_records):
            records = [vcf_records[i]]
            i += 1
            while i < len(vcf_records) and vcf_records[i].ID == records[0].ID:
                records.append(vcf_records[i])
                i += 1

            records = [
                x
                for x in records
                if x.FORMAT.get("VFR_RESULT", "FP") == "TP"
                and x.FORMAT.get("VFR_IN_MASK", "0") != "1"
            ]
            if max_ref_len is not None:
                records = [x for x in records if len(x.REF) <= max_ref_len]

            if len(records) > 1:
                logging.warning(
                    "Skipping the following VCF lines. They conflict, but probe mapping thinks they are all true-positives:"
                )
                for record in records:
                    logging.warning(f"  {record}")
            elif len(records) == 1:
                if not detailed_VCF:
                    records[0].FORMAT = {"GT": "1/1"}
                print(records[0], file=f)
Example #18
def load_gramtools_vcf_and_allele_coverage_files(vcf_file, quasimap_dir):
    """Loads the perl_generated_vcf file and allele_coverage files.
    Sanity checks that they agree: 1) same number of lines (excluding header
    lines in vcf) and 2) number of alts agree on each line.
    Raises an error the first time something wrong is found.
    Returns a tuple: (mean depth, depth variance, VCF header, VCF records,
    allele coverage, allele groups)"""
    vcf_header, vcf_lines = vcf_file_read.vcf_file_to_list(vcf_file)
    all_allele_coverage, allele_groups = _load_quasimap_json_files(quasimap_dir)
    coverages = _coverage_list_from_allele_coverage(
        all_allele_coverage, vcf_lines=vcf_lines
    )
    coverages = [x for x in coverages if x is not None]

    # An unlikely edge case is when there were no SNPs in the input, or when
    # the few SNPs we do get coverage for all have zero coverage.
    # By default, we only counted read depth at SNPs, so in this edge case we
    # get no read depths. Do the coverage estimate again, but use indels to
    # estimate. This is likely to be a little less accurate, but we have no choice.
    if len(coverages) == 0 or max(coverages) == 0:
        coverages = _coverage_list_from_allele_coverage(
            all_allele_coverage, vcf_lines=vcf_lines, use_indels=True
        )
        coverages = [x for x in coverages if x is not None]

    assert len(coverages) > 0
    # An edge case that is unlikely on real data is when coverages has length 1.
    # It happens when running test_run in adjudicator_test with a split VCF,
    # where one of the splits only has 1 record.
    if len(coverages) == 1:
        variance = 1.000
    else:
        variance = round(statistics.variance(coverages), 3)

    return (
        round(statistics.mean(coverages), 3),
        variance,
        vcf_header,
        vcf_lines,
        all_allele_coverage,
        allele_groups,
    )
Example #19
def _vcf_file_to_dict(vcf_file, pass_only=True):
    """Loads VCF file. Returns a dictionary of sequence name -> sorted list
    by position of variants"""
    records = {}
    wanted_format = {"PASS"}
    if not pass_only:
        wanted_format.add("FAIL_BUT_TEST")

    header_lines, vcf_records = vcf_file_read.vcf_file_to_list(vcf_file)
    for record in vcf_records:
        if record.FORMAT["VFR_FILTER"] not in wanted_format:
            continue

        if record.CHROM not in records:
            records[record.CHROM] = []
        records[record.CHROM].append(record)

    for l in records.values():
        l.sort(key=operator.attrgetter("POS"))

    return records
Example #20
    def _add_gt_conf_percentile_to_vcf_file(cls, vcf_file, mean_depth,
                                            depth_variance, error_rate,
                                            iterations):
        '''Overwrites vcf_file with a new version that has GT_CONF_PERCENTILE added'''
        simulations = genotype_confidence_simulator.GenotypeConfidenceSimulator(
            mean_depth,
            depth_variance,
            error_rate,
            allele_length=1,
            iterations=iterations)
        simulations.run_simulations()
        vcf_header, vcf_lines = vcf_file_read.vcf_file_to_list(vcf_file)
        for i, line in enumerate(vcf_header):
            if line.startswith('##FORMAT=<ID=GT_CONF'):
                break
        else:
            raise Exception(
                f'No GT_CONF description found in header of VCF file {vcf_file}. Cannot continue'
            )

        vcf_header.insert(
            i + 1,
            r'''##FORMAT=<ID=GT_CONF_PERCENTILE,Number=1,Type=Float,Description="Percentile of GT_CONF">'''
        )

        with open(vcf_file, 'w') as f:
            print(*vcf_header, sep='\n', file=f)

            for vcf_record in vcf_lines:
                if 'GT_CONF' in vcf_record.FORMAT:
                    conf = int(round(float(vcf_record.FORMAT['GT_CONF'])))
                    if 'GT' in vcf_record.FORMAT and '.' not in vcf_record.FORMAT[
                            'GT']:
                        vcf_record.set_format_key_value(
                            'GT_CONF_PERCENTILE',
                            str(simulations.get_percentile(conf)))

                print(vcf_record, file=f)
Example #21
def _merge_vcf_files_for_probe_mapping(list_of_vcf_files, ref_fasta, vcf_out):
    ref_seqs = utils.file_to_dict_of_seqs(ref_fasta)
    # This makes a merged file, where two different ALTs at the same place
    # result in one record with a list of ALTs. For probe mapping, we want
    # a separate record for each allele. Also need genotype to be "1/1"
    vcf_merge.merge_vcf_files(list_of_vcf_files, ref_seqs, vcf_out)
    header_lines, vcf_records = vcf_file_read.vcf_file_to_list(vcf_out)
    with open(vcf_out, "w") as f:
        print("##fileformat=VCFv4.2", file=f)
        for seq in ref_seqs.values():
            print(f"##contig=<ID={seq.id},length={len(seq)}>", file=f)
        print('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">', file=f)
        print("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample", file=f)

        for i, record in enumerate(vcf_records):
            for alt in record.ALT:
                new_record = copy.copy(record)
                new_record.ID = str(i)
                new_record.ALT = [alt]
                new_record.INFO = {}
                new_record.FILTER = set(["PASS"])
                new_record.FORMAT = {"GT": "1/1", "VFR_FILTER": "PASS"}
                print(new_record, file=f)
Example #22
    def _add_gt_conf_percentile_and_filters_to_vcf_file(
        cls,
        vcf_file,
        mean_depth,
        depth_variance,
        error_rate,
        iterations,
        min_dp=5,
        min_gcp=5,
    ):
        """Overwrites vcf_file, with new version that has GT_CONF_PERCENTILE added,
        and filter for DP and GT_CONF_PERCENTILE"""
        if mean_depth > 0:
            simulations = genotype_confidence_simulator.GenotypeConfidenceSimulator(
                mean_depth,
                depth_variance,
                error_rate,
                allele_length=1,
                iterations=iterations,
            )
            simulations.run_simulations()
        vcf_header, vcf_lines = vcf_file_read.vcf_file_to_list(vcf_file)
        for i, line in enumerate(vcf_header):
            if line.startswith("##FORMAT=<ID=GT_CONF"):
                break
        else:
            raise Exception(
                f"No GT_CONF description found in header of VCF file {vcf_file}. Cannot continue"
            )

        vcf_header.insert(
            i + 1,
            r"""##FORMAT=<ID=GT_CONF_PERCENTILE,Number=1,Type=Float,Description="Percentile of GT_CONF">""",
        )
        vcf_header.insert(
            i + 1,
            f'##FILTER=<ID=MIN_DP,Description="Minimum DP of {min_dp}">')
        vcf_header.insert(
            i + 1,
            f'##FILTER=<ID=MIN_GCP,Description="Minimum GT_CONF_PERCENTILE of {min_gcp}">',
        )

        with open(vcf_file, "w") as f:
            print(*vcf_header, sep="\n", file=f)

            for vcf_record in vcf_lines:
                vcf_record.FILTER = set()

                if "GT" in vcf_record.FORMAT and "GT_CONF" in vcf_record.FORMAT:
                    if "." not in vcf_record.FORMAT["GT"]:
                        conf = int(round(float(vcf_record.FORMAT["GT_CONF"])))
                        vcf_record.set_format_key_value(
                            "GT_CONF_PERCENTILE",
                            str(simulations.get_percentile(conf)))
                        if ("DP" in vcf_record.FORMAT
                                and float(vcf_record.FORMAT["DP"]) < min_dp):
                            vcf_record.FILTER.add("MIN_DP")
                        if float(vcf_record.FORMAT["GT_CONF_PERCENTILE"]
                                 ) < min_gcp:
                            vcf_record.FILTER.add("MIN_GCP")
                        if len(vcf_record.FILTER) == 0:
                            vcf_record.FILTER.add("PASS")
                    else:
                        # Add a default null percentile
                        vcf_record.set_format_key_value(
                            "GT_CONF_PERCENTILE", "0.0")

                print(vcf_record, file=f)
Example #23
def vcf_records_are_the_same(file1, file2):
    """Returns True if records in the two VCF files are the same.
    Ignores header lines in the files. Returns False if any lines are different"""
    _, expect_records = vcf_file_read.vcf_file_to_list(file1)
    _, got_records = vcf_file_read.vcf_file_to_list(file2)
    return got_records == expect_records
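
For example, a pytest-style test could compare a freshly written VCF against an expected file with a single assertion; the paths below are illustrative.

# Illustrative use; file names are made up.
assert vcf_records_are_the_same("tmp.out.vcf", os.path.join(data_dir, "expected.vcf"))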
Example #24
    def _add_gt_conf_percentile_and_filters_to_vcf_file(
        cls,
        vcf_file,
        geno_simulations,
        min_dp=0,
        min_gcp=5,
        min_frs=0.9,
        conf_scores_file=None,
    ):
        """Overwrites vcf_file, with new version that has GT_CONF_PERCENTILE added,
        and filter for DP, GT_CONF_PERCENTILE, and FRS"""
        if conf_scores_file is not None:
            real_conf_scores = []

        vcf_header, vcf_lines = vcf_file_read.vcf_file_to_list(vcf_file)
        for i, line in enumerate(vcf_header):
            if line.startswith("##FORMAT=<ID=GT_CONF"):
                break
        else:
            raise Exception(
                f"No GT_CONF description found in header of VCF file {vcf_file}. Cannot continue"
            )

        vcf_header[i + 1:i + 1] = [
            '##FORMAT=<ID=GT_CONF_PERCENTILE,Number=1,Type=Float,Description="Percentile of GT_CONF">',
            f'##FILTER=<ID=MIN_FRS,Description="Minimum FRS of {min_frs}">',
            f'##FILTER=<ID=MIN_DP,Description="Minimum DP of {min_dp}">',
            f'##FILTER=<ID=MIN_GCP,Description="Minimum GT_CONF_PERCENTILE of {min_gcp}">',
        ]

        with open(vcf_file, "w") as f:
            print(*vcf_header, sep="\n", file=f)

            for vcf_record in vcf_lines:
                vcf_record.FILTER = set()

                if "GT" in vcf_record.FORMAT and "GT_CONF" in vcf_record.FORMAT:
                    if "." not in vcf_record.FORMAT["GT"]:
                        conf = int(round(float(vcf_record.FORMAT["GT_CONF"])))
                        vcf_record.set_format_key_value(
                            "GT_CONF_PERCENTILE",
                            str(geno_simulations.get_percentile(conf)),
                        )
                        if ("DP" in vcf_record.FORMAT
                                and float(vcf_record.FORMAT["DP"]) < min_dp):
                            vcf_record.FILTER.add("MIN_DP")
                        if float(vcf_record.FORMAT["GT_CONF_PERCENTILE"]
                                 ) < min_gcp:
                            vcf_record.FILTER.add("MIN_GCP")
                        if ("FRS" in vcf_record.FORMAT
                                and float(vcf_record.FORMAT["FRS"]) < min_frs):
                            vcf_record.FILTER.add("MIN_FRS")
                        if len(vcf_record.FILTER) == 0:
                            vcf_record.FILTER.add("PASS")

                        if conf_scores_file is not None:
                            real_conf_scores.append(conf)
                    else:
                        # Add a default null percentile
                        vcf_record.set_format_key_value(
                            "GT_CONF_PERCENTILE", "0.0")

                print(vcf_record, file=f)

        if conf_scores_file is not None:
            with open(conf_scores_file, "w") as f:
                print(*real_conf_scores, sep="\n", file=f)
Example #25
    def _add_gt_conf_percentile_and_filters_to_vcf_file(
            cls,
            vcf_file,
            mean_depth,
            depth_variance,
            error_rate,
            iterations,
            min_dp=2,
            min_gt_conf_percentile=2.5):
        '''Overwrites vcf_file with a new version that has GT_CONF_PERCENTILE added,
        and filters for DP and GT_CONF_PERCENTILE'''
        simulations = genotype_confidence_simulator.GenotypeConfidenceSimulator(
            mean_depth,
            depth_variance,
            error_rate,
            allele_length=1,
            iterations=iterations)
        simulations.run_simulations()
        vcf_header, vcf_lines = vcf_file_read.vcf_file_to_list(vcf_file)
        for i, line in enumerate(vcf_header):
            if line.startswith('##FORMAT=<ID=GT_CONF'):
                break
        else:
            raise Exception(
                f'No GT_CONF description found in header of VCF file {vcf_file}. Cannot continue'
            )

        vcf_header.insert(
            i + 1,
            r'''##FORMAT=<ID=GT_CONF_PERCENTILE,Number=1,Type=Float,Description="Percentile of GT_CONF">'''
        )
        vcf_header.insert(
            i + 1,
            f'##FILTER=<ID=MIN_DP,Description="Minimum DP of {min_dp}">')
        vcf_header.insert(
            i + 1,
            f'##FILTER=<ID=MIN_GCP,Description="Minimum GT_CONF_PERCENTILE of {min_gt_conf_percentile}">'
        )

        with open(vcf_file, 'w') as f:
            print(*vcf_header, sep='\n', file=f)

            for vcf_record in vcf_lines:
                vcf_record.FILTER = set()

                if 'GT_CONF' in vcf_record.FORMAT:
                    conf = int(round(float(vcf_record.FORMAT['GT_CONF'])))
                    if 'GT' in vcf_record.FORMAT and '.' not in vcf_record.FORMAT[
                            'GT']:
                        vcf_record.set_format_key_value(
                            'GT_CONF_PERCENTILE',
                            str(simulations.get_percentile(conf)))
                        if 'DP' in vcf_record.FORMAT and float(
                                vcf_record.FORMAT['DP']) < min_dp:
                            vcf_record.FILTER.add('MIN_DP')
                        if float(vcf_record.FORMAT['GT_CONF_PERCENTILE']
                                 ) < min_gt_conf_percentile:
                            vcf_record.FILTER.add('MIN_GCP')
                        if len(vcf_record.FILTER) == 0:
                            vcf_record.FILTER.add('PASS')

                print(vcf_record, file=f)