# Example 1
def test_Variants():
    """Exercise Variants: add/lookup, sorted iteration, and file round-trip."""
    tracked = variant_tracking.Variants()
    assert len(tracked) == 0

    # Adding a variant makes it queryable and returns a stable id.
    v_late = variant_tracking.Variant(1, 3, "A", "T")
    assert v_late not in tracked
    v_late_id = tracked.add(v_late)
    assert v_late in tracked
    assert tracked[v_late] == v_late_id

    # Add two more, deliberately out of coordinate order.
    v_mid = variant_tracking.Variant(1, 2, "A", "T")
    v_early = variant_tracking.Variant(0, 1, "A", "T")
    tracked.add(v_mid)
    tracked.add(v_early)

    # sorted_iter() must yield (variant, id) pairs in coordinate order.
    want = [(v, tracked[v]) for v in (v_early, v_mid, v_late)]
    assert list(tracked.sorted_iter()) == want

    # Saving then loading must reproduce an equal Variants object.
    outfile = "tmp.variants.save_to_file"
    utils.rm_rf(outfile)
    tracked.save_to_file(outfile)
    reloaded = variant_tracking.Variants()
    reloaded.load_from_file(outfile)
    assert tracked == reloaded
    os.unlink(outfile)
def test_simplify_vcf():
    """simplify_vcf output must match expectation for plain input, gzipped
    input, and when keep_ref_calls=True."""
    vcf_in = os.path.join(data_dir, "simplify_vcf.in.vcf")
    ref_fasta = os.path.join(data_dir, "simplify_vcf.ref.fa")
    seqs = {}
    pyfastaq.tasks.file_to_dict(ref_fasta, seqs)
    out_vcf = "tmp.simplify_vcf.out.vcf"
    utils.rm_rf(out_vcf)

    # Plain (uncompressed) VCF input.
    utils.simplify_vcf(vcf_in, out_vcf, ref_seqs=seqs)
    expected = os.path.join(data_dir, "simplify_vcf.expect.vcf")
    assert filecmp.cmp(out_vcf, expected, shallow=False)
    os.unlink(out_vcf)

    # The gzipped copy of the same input must give identical output.
    utils.simplify_vcf(vcf_in + ".gz", out_vcf, ref_seqs=seqs)
    assert filecmp.cmp(out_vcf, expected, shallow=False)

    # keep_ref_calls=True retains REF calls, hence a different expected file.
    utils.simplify_vcf(vcf_in, out_vcf, keep_ref_calls=True, ref_seqs=seqs)
    expected = os.path.join(data_dir, "simplify_vcf.expect_keep_ref_calls.vcf")
    assert filecmp.cmp(out_vcf, expected, shallow=False)
    os.unlink(out_vcf)
def test_normalise_vcf():
    """normalise_vcf record output must match expectation, both with and
    without breaking alleles into separate records."""
    vcf_in = os.path.join(data_dir, "normalise_vcf.in.vcf")
    ref_fasta = os.path.join(data_dir, "normalise_vcf.in.fa")
    expected_file = os.path.join(data_dir, "normalise_vcf.out.vcf")
    out_vcf = "tmp.normalise_vcf.vcf"
    utils.rm_rf(out_vcf)

    utils.normalise_vcf(vcf_in, ref_fasta, out_vcf)
    _, want_records = vcf_file_read.vcf_file_to_list(expected_file)
    _, got_records = vcf_file_read.vcf_file_to_list(out_vcf)
    # The normalizing commands add lots of lines to the header.
    # We don't care about those, so just check the actual records.
    assert got_records == want_records
    os.unlink(out_vcf)

    # Run again, this time without breaking alleles into separate records.
    utils.normalise_vcf(vcf_in, ref_fasta, out_vcf, break_alleles=False)
    expected_file = os.path.join(data_dir,
                                 "normalise_vcf.out.no_break_alleles.vcf")
    _, want_records = vcf_file_read.vcf_file_to_list(expected_file)
    _, got_records = vcf_file_read.vcf_file_to_list(out_vcf)
    assert got_records == want_records
    os.unlink(out_vcf)
# Example 4
def test_load_one_vcf_file():
    """_load_one_vcf_file must report the VCF's sample name and the list of
    normalised Variant objects, in file order."""
    vcf_file = os.path.join(data_dir, "load_one_vcf_file.vcf")
    ref_fasta = os.path.join(data_dir, "load_one_vcf_file.fa")
    ref_seqs, ref_names, ref_seq_to_id = (
        variant_tracking.VariantTracker.load_ref_seq_data(ref_fasta)
    )
    tmp_dir = "tmp.load_one_vcf_file"
    utils.rm_rf(tmp_dir)
    os.mkdir(tmp_dir)
    sample, variants = variant_tracking._load_one_vcf_file(
        vcf_file, ref_seqs, ref_seq_to_id, ref_fasta, tmp_dir, True
    )
    assert sample == "sample_42"
    # Expected variants as (seq_id, pos, ref, alt) tuples.
    spec = [
        (0, 1, "T", "TCGC"),
        (0, 2, "C", "G"),
        (0, 6, "A", "T"),
        (0, 8, "T", "G"),
        (1, 1, "G", "C"),
        (1, 1, "G", "A"),
    ]
    expected = [
        variant_tracking.Variant(seq_id=s, pos=p, ref=r, alt=a)
        for s, p, r, a in spec
    ]
    assert variants == expected
    os.rmdir(tmp_dir)
# Example 5
def test_VariantBlock():
    """Exercise VariantBlock end to end: growing the sample/variant matrix,
    getting/setting bits, writing to a bgzipped+tabixed file, and loading
    slices and variant patterns back (including across two block files)."""
    # Construct and add a variant and sample
    block = variant_tracking.VariantBlock()
    assert block.number_of_samples() == 0
    assert block.number_of_variants() == 0
    block.add_variants(1)
    assert block.number_of_samples() == 0
    assert block.number_of_variants() == 1
    block.add_samples(1)
    assert block.number_of_samples() == 1
    assert block.number_of_variants() == 1

    # Getting and setting variant (args are variant index, sample index)
    assert not block.has_variant(0, 0)
    block.set_variant(0, 0)
    assert block.has_variant(0, 0)

    # Add more samples and variants, check first variant and sample not changed
    # and that every other (variant, sample) cell starts unset.
    block.add_variants(3)
    block.add_samples(2)
    assert block.number_of_samples() == 3
    assert block.number_of_variants() == 4
    assert block.has_variant(0, 0)
    assert not block.has_variant(0, 1)
    assert not block.has_variant(0, 2)
    assert not block.has_variant(1, 0)
    assert not block.has_variant(1, 1)
    assert not block.has_variant(1, 2)
    assert not block.has_variant(2, 0)
    assert not block.has_variant(2, 1)
    assert not block.has_variant(2, 2)
    assert not block.has_variant(3, 0)
    assert not block.has_variant(3, 1)
    assert not block.has_variant(3, 2)
    block.set_variant(2, 2)
    block.set_variant(3, 0)
    block.set_variant(3, 1)

    # Save to file. The Variants object supplies the coordinates used for
    # tabix indexing of each block row.
    variants = variant_tracking.Variants()
    variants.add(variant_tracking.Variant(0, 0, "A", "G"))
    variants.add(variant_tracking.Variant(0, 2, "G", "T"))
    variants.add(variant_tracking.Variant(0, 2, "G", "C"))
    variants.add(variant_tracking.Variant(1, 42, "G", "C"))
    tmp_file = "tmp.variant_tracking.block.tsv.gz"
    utils.rm_rf(tmp_file)
    utils.rm_rf(tmp_file + ".tbi")
    block.write_to_bgzip_file_and_tab_index(tmp_file, variants)
    wanted_ids = set([v for k, v in variants.sorted_iter()])

    # Load slices from file. Note that none of the variants had variants[1], so
    # should not be in the file
    assert variant_tracking.load_slice_of_block(tmp_file, wanted_ids, 1, 0,
                                                0) == {}
    assert variant_tracking.load_slice_of_block(tmp_file, wanted_ids, 1, 41,
                                                41) == {}
    assert variant_tracking.load_slice_of_block(tmp_file, wanted_ids, 1, 43,
                                                43) == {}
    assert variant_tracking.load_slice_of_block(tmp_file, wanted_ids, 0, 1,
                                                1) == {}
    # Slices covering seq 0 position 0 should return variant 0's bitarray.
    expect_vars = {0: block.bitarrays[0]}
    assert (variant_tracking.load_slice_of_block(tmp_file, wanted_ids, 0, 0,
                                                 0) == expect_vars)
    # Restricting wanted ids to {1} filters out variant 0.
    assert variant_tracking.load_slice_of_block(tmp_file, {1}, 0, 0, 0) == {}
    assert (variant_tracking.load_slice_of_block(tmp_file, wanted_ids, 0, 0,
                                                 1) == expect_vars)
    # Widening the slice to position 2 picks up variant 2 as well.
    expect_vars[2] = bitarray(block.bitarrays[2])
    assert (variant_tracking.load_slice_of_block(tmp_file, wanted_ids, 0, 0,
                                                 2) == expect_vars)
    assert (variant_tracking.load_slice_of_block(tmp_file, wanted_ids, 0, 0,
                                                 3) == expect_vars)

    # Load variant patterns from slices of block. Make another block file to test
    # getting from >1 file
    block.clear_samples()
    variants.add(variant_tracking.Variant(0, 1, "C", "G"))
    variants.add(variant_tracking.Variant(0, 10, "T", "A"))
    block.add_variants(2)
    block.add_samples(2)
    block.set_variant(0, 1)
    block.set_variant(1, 0)
    block.set_variant(1, 1)
    block.set_variant(4, 0)
    block.set_variant(5, 1)
    tmp_file2 = "tmp.variant_tracking.block.2.tsv.gz"
    utils.rm_rf(tmp_file2)
    utils.rm_rf(tmp_file2 + ".tbi")
    block.write_to_bgzip_file_and_tab_index(tmp_file2, variants)
    wanted_ids = set([v for k, v in variants.sorted_iter()])
    # Each pattern is a tuple of variant ids co-occurring in one sample.
    got_patterns = variant_tracking.var_patterns_from_block_slices(
        [tmp_file, tmp_file2], wanted_ids, 1, 0, 41)
    assert got_patterns == set()
    got_patterns = variant_tracking.var_patterns_from_block_slices(
        [tmp_file, tmp_file2], wanted_ids, 1, 0, 42)
    assert got_patterns == {(3, )}
    got_patterns = variant_tracking.var_patterns_from_block_slices(
        [tmp_file, tmp_file2], wanted_ids, 1, 42, 42)
    assert got_patterns == {(3, )}
    got_patterns = variant_tracking.var_patterns_from_block_slices(
        [tmp_file, tmp_file2], wanted_ids, 1, 42, 43)
    assert got_patterns == {(3, )}
    got_patterns = variant_tracking.var_patterns_from_block_slices(
        [tmp_file, tmp_file2], wanted_ids, 1, 43, 43)
    assert got_patterns == set()
    got_patterns = variant_tracking.var_patterns_from_block_slices(
        [tmp_file, tmp_file2], wanted_ids, 0, 0, 9)
    expect_patterns = {(0, 1), (2, ), (0, ), (1, 4)}
    assert got_patterns == expect_patterns
    got_patterns = variant_tracking.var_patterns_from_block_slices(
        [tmp_file, tmp_file2], wanted_ids, 0, 0, 10)
    expect_patterns = {(0, 1, 5), (2, ), (0, ), (1, 4)}
    assert got_patterns == expect_patterns

    os.unlink(tmp_file)
    os.unlink(tmp_file + ".tbi")
    os.unlink(tmp_file2)
    os.unlink(tmp_file2 + ".tbi")
# Example 6
def test_VariantTracker_make_from_vcf_then_save_then_load_then_cluster():
    """End-to-end VariantTracker test: merge VCFs into a new directory, verify
    the files written, reload from that directory, then run clustering with
    three different ref-length/allele limits.

    Fix: removed a redundant second assignment of ``root_dir`` to the same
    string (it was set identically a few lines earlier).
    """
    # Create from VCF files and save to directory
    vcf_files = [
        os.path.join(data_dir, f"construct_from_vcf_files.{i}.vcf")
        for i in (1, 2, 3)
    ]
    ref_fasta = os.path.join(data_dir, "construct_from_vcf_files.ref.fa")
    root_dir = "tmp.construct_from_vcf_files"
    utils.rm_rf(root_dir)
    tmp_dir = f"{root_dir}.tmp"
    utils.rm_rf(tmp_dir)
    os.mkdir(tmp_dir)
    tracker = variant_tracking.VariantTracker(root_dir, ref_fasta)
    # sample_limit=2 forces the three samples to be split across two blocks
    tracker.merge_vcf_files(vcf_files,
                            temp_dir=tmp_dir,
                            cpus=2,
                            sample_limit=2)

    # Check variables ok and files made ok
    expect_samples = [["sample_1", "sample_2"], ["sample_3"]]
    expect_block_files = ["block.0.tsv.gz", "block.1.tsv.gz"]
    assert tracker.samples == expect_samples
    assert tracker.var_block_files == expect_block_files
    expect_dir = os.path.join(data_dir, "construct_from_vcf_files.expect")
    expect_variants = variant_tracking.Variants()
    expect_variants.load_from_file(os.path.join(expect_dir, "variants.tsv.gz"))
    assert tracker.variants == expect_variants
    # Compare each written block file line-by-line with the expected copy,
    # and check it got tabix-indexed.
    for block_file in expect_block_files:
        expect = os.path.join(expect_dir, block_file)
        got = os.path.join(root_dir, block_file)
        with vcf_file_read.open_vcf_file_for_reading(expect) as f:
            expect_lines = [x.rstrip() for x in f]
        with vcf_file_read.open_vcf_file_for_reading(got) as f:
            got_lines = [x.rstrip() for x in f]
        assert expect_lines == got_lines
        assert os.path.exists(f"{got}.tbi")

    # Now the root_dir exists, constructor should load data from directory
    tracker = variant_tracking.VariantTracker(root_dir, ref_fasta)
    assert tracker.samples == expect_samples
    assert tracker.var_block_files == expect_block_files
    assert tracker.variants == expect_variants

    # Run the clustering. max_ref_len 8 is length of longest REF string in
    # the test data, so a limit of 10 excludes nothing.
    cluster_out = "tmp.variant_tracker.cluster"
    vcf_out = f"{cluster_out}.vcf"
    excluded_out = f"{cluster_out}.excluded.tsv"
    tracker.cluster(cluster_out, 10)
    expect_vcf = os.path.join(data_dir, "cluster.max_ref_8.vcf")
    expect_excluded = os.path.join(data_dir, "cluster.max_ref_8.excluded.tsv")
    assert filecmp.cmp(vcf_out, expect_vcf, shallow=False)
    assert filecmp.cmp(excluded_out, expect_excluded, shallow=False)
    os.unlink(vcf_out)
    os.unlink(excluded_out)

    # Set ref length limit to exclude the ref length 8 variant
    tracker.cluster(cluster_out, 7)
    expect_vcf = os.path.join(data_dir, "cluster.max_ref_7.vcf")
    expect_excluded = os.path.join(data_dir, "cluster.max_ref_7.excluded.tsv")
    assert filecmp.cmp(vcf_out, expect_vcf, shallow=False)
    assert filecmp.cmp(excluded_out, expect_excluded, shallow=False)
    os.unlink(vcf_out)
    os.unlink(excluded_out)

    # Set allele limit to trigger on the length 8 variant, so we only
    # get the alt alleles for each sample
    tracker.cluster(cluster_out, 8, max_alleles=5)
    expect_vcf = os.path.join(data_dir, "cluster.max_alleles_5.vcf")
    expect_excluded = os.path.join(data_dir,
                                   "cluster.max_alleles_5.excluded.tsv")
    assert filecmp.cmp(vcf_out, expect_vcf, shallow=False)
    assert filecmp.cmp(excluded_out, expect_excluded, shallow=False)
    os.unlink(vcf_out)
    os.unlink(excluded_out)

    utils.rm_rf(tmp_dir)
    utils.rm_rf(root_dir)
def _load_one_vcf_file(
    vcf_file, ref_seqs, ref_seq_to_id, ref_fasta, temp_dir, break_alleles
):
    """Simplify and normalise one VCF file, then extract its ALT variants.

    Returns a (sample_name, variants) tuple, where variants is a list of
    Variant objects — one per non-REF genotype allele in each usable record.
    Records are skipped with a warning when POS is negative, CHROM is not in
    ref_seqs, the REF string mismatches the reference, or GT is absent;
    records with a missing ("." ) or all-REF genotype are skipped silently.

    Raises:
        Exception: if the sample name cannot be read from the VCF header.
    """
    sample = vcf_file_read.get_sample_name_from_vcf_file(vcf_file)
    if sample is None:
        raise Exception(f"Error getting sample name from vcf file {vcf_file}")

    # Work in a private temp dir; the simplified intermediate is deleted as
    # soon as the normalised file exists.
    work_dir = tempfile.mkdtemp(prefix="normalize_vcf.", dir=temp_dir)
    simplified_vcf = os.path.join(work_dir, "simplified.vcf")
    normalized_vcf = os.path.join(work_dir, "normalized.vcf")
    utils.simplify_vcf(vcf_file, simplified_vcf, ref_seqs=ref_seqs)
    utils.normalise_vcf(
        simplified_vcf, ref_fasta, normalized_vcf, break_alleles=break_alleles
    )
    os.unlink(simplified_vcf)

    variants = []
    with vcf_file_read.open_vcf_file_for_reading(normalized_vcf) as f:
        for line in f:
            # Header lines carry no variant data.
            if line.startswith("#"):
                continue
            record = vcf_record.VcfRecord(line)

            # Guard clauses: drop (and warn about) records we cannot use.
            if record.POS < 0:
                logging.warning(
                    f"VCF record with negative POS in file {vcf_file}. Ignoring: {record}"
                )
                continue
            if record.CHROM not in ref_seqs:
                logging.warning(
                    f"CHROM not recognised in VCF record in file {vcf_file}. Ignoring: {record}"
                )
                continue
            if not record.ref_string_matches_ref_sequence(ref_seqs[record.CHROM]):
                logging.warning(
                    f"REF string does not match reference seq in file {vcf_file}. Ignoring: {record}"
                )
                continue
            if "GT" not in record.FORMAT:
                logging.warning(
                    f"No GT in VCF record in file {vcf_file}. Ignoring: {record}"
                )
                continue

            # GT may be "/"- or "|"-separated; skip missing or all-REF calls.
            allele_strs = re.split("[/|]", record.FORMAT["GT"])
            if "." in allele_strs:
                continue
            allele_indexes = {int(x) for x in allele_strs}
            if allele_indexes == {0}:
                continue

            # One Variant per non-REF allele (ALT index is 1-based in GT).
            variants.extend(
                Variant(
                    ref_seq_to_id[record.CHROM],
                    record.POS,
                    record.REF,
                    record.ALT[i - 1],
                )
                for i in allele_indexes
                if i > 0
            )

    utils.rm_rf(work_dir)
    return sample, variants
    def merge_vcf_files(
        self,
        infiles,
        temp_dir=None,
        cpus=1,
        mem_limit=2,
        force=False,
        sample_limit=None,
        break_alleles=True,
    ):
        """Merge the given VCF files into this tracker's root directory.

        Loads the VCFs in parallel batches of `cpus` processes, deduplicates
        their variants into self.variants, records per-sample presence in
        block files, then writes the metadata and variants files.

        Parameters:
            infiles: list of VCF file paths to merge.
            temp_dir: working directory for per-file temp output; defaults to
                <root_dir>/tmp. Removed at the end only if created here.
            cpus: number of VCF files loaded concurrently per batch.
            mem_limit: limit passed to self._block_will_break_limits.
            force: if True, delete any existing root_dir before starting.
            sample_limit: max samples per block (via _block_will_break_limits).
            break_alleles: forwarded to _load_one_vcf_file.

        Raises:
            FileExistsError: from os.mkdir if root_dir exists and force is
                False — presumably deliberate, to avoid clobbering an
                existing run (TODO confirm).
        """
        if force:
            utils.rm_rf(self.root_dir)
        os.mkdir(self.root_dir)

        if temp_dir is None:
            temp_dir = os.path.join(self.root_dir, "tmp")
        if os.path.exists(temp_dir):
            made_temp_dir = False
        else:
            os.mkdir(temp_dir)
            made_temp_dir = True

        # Start from a clean state: one empty block file registered, and an
        # empty in-memory variant set and block matrix.
        self.var_block_files = []
        self.var_block_tabixes = []
        self.samples = []
        self._add_var_block_file()
        self.variants = Variants()
        var_block = VariantBlock()

        # Process input files in batches of `cpus`, one worker per file.
        for i in range(0, len(infiles), cpus):
            with multiprocessing.Pool(cpus) as pool:
                new_variants_lists = pool.starmap(
                    _load_one_vcf_file,
                    zip(
                        infiles[i : i + cpus],
                        itertools.repeat(self.ref_seqs),
                        itertools.repeat(self.ref_seq_to_id),
                        itertools.repeat(self.ref_fasta),
                        itertools.repeat(temp_dir),
                        itertools.repeat(break_alleles),
                    ),
                )

            for sample, new_variants in new_variants_lists:
                # Samples that yielded no usable variants are dropped.
                if len(new_variants) == 0:
                    continue

                # Flush the current block and start a fresh one when adding
                # another sample would exceed the memory or sample limits.
                if self._block_will_break_limits(var_block, mem_limit, sample_limit):
                    self._write_last_var_block_file(var_block)
                    self._add_var_block_file()
                    var_block.clear_samples()

                self.samples[-1].append(sample)
                # A brand-new (empty) block gets its first variant slot here,
                # seeded with this sample's first variant.
                if var_block.number_of_samples() == 0 == var_block.number_of_variants():
                    var_block.add_variants(1)
                    self.variants.add(new_variants[0])

                var_block.add_samples(1)

                for variant in new_variants:
                    # Grow the block only for variants not seen before;
                    # Variants.add returns the (new or existing) id.
                    if variant not in self.variants:
                        var_block.add_variants(1)
                    var_id = self.variants.add(variant)
                    # Sample index -1: presumably the sample column just
                    # added by add_samples(1) above — TODO confirm.
                    var_block.set_variant(var_id, -1)

            # NOTE(review): i advances by `cpus`, so this only fires when the
            # batch start index happens to be a multiple of 100.
            if i % 100 == 0:
                logging.info(f"Loaded {i+cpus} files out of {len(infiles)}")

        if made_temp_dir:
            utils.rm_rf(temp_dir)

        logging.info(f"Loaded all {len(infiles)} VCF files")
        self._write_last_var_block_file(var_block)

        logging.info(f"Saving metadata to file {self.metadata_file}")
        self._save_metadata_file()

        logging.info(f"Saving variants to file {self.variants_file}")
        self.variants.save_to_file(self.variants_file)
        logging.info("Finished merging VCF files")