def test_Variants():
    """Exercise the Variants container: length, membership, id lookup,
    sorted iteration, and a save/load round trip through a file."""
    make_variant = variant_tracking.Variant

    variants = variant_tracking.Variants()
    assert len(variants) == 0

    # Adding a variant makes it a member and returns the id it is stored under.
    variant1 = make_variant(1, 3, "A", "T")
    assert variant1 not in variants
    var_id = variants.add(variant1)
    assert variant1 in variants
    assert variants[variant1] == var_id

    variant2 = make_variant(1, 2, "A", "T")
    variant3 = make_variant(0, 1, "A", "T")
    for extra in (variant2, variant3):
        variants.add(extra)

    # sorted_iter is expected to yield (variant, id) pairs in this order,
    # regardless of the order in which the variants were added.
    expect = [(v, variants[v]) for v in (variant3, variant2, variant1)]
    assert list(variants.sorted_iter()) == expect

    # Round trip: what is loaded back must equal what was saved.
    tmp_file = "tmp.variants.save_to_file"
    utils.rm_rf(tmp_file)
    variants.save_to_file(tmp_file)
    new_variants = variant_tracking.Variants()
    new_variants.load_from_file(tmp_file)
    assert variants == new_variants
    os.unlink(tmp_file)
def test_any_vars_overlap():
    """any_vars_overlap should report no overlap for the first two variants
    below, and an overlap once the third one is appended."""
    make_variant = variant_tracking.Variant
    overlap = allele_combinations.any_vars_overlap

    variants = [make_variant(0, 4, "T", "G")]
    assert not overlap(variants)

    variants += [make_variant(0, 5, "TA", "G")]
    assert not overlap(variants)

    variants += [make_variant(0, 6, "A", "G")]
    assert overlap(variants)
def test_var_pattern_to_alleles():
    """Check var_pattern_to_alleles for a series of variant-id patterns and
    coordinate ranges against one reference sequence."""
    ref_seq = pyfastaq.sequences.Fasta("ref", "GTAGCTGACTGCTAGTGTA")
    #                                          0123456789012345678
    make_variant = variant_tracking.Variant
    variants = {
        0: make_variant(0, 3, "G", "T"),
        1: make_variant(0, 3, "G", "A"),
        2: make_variant(0, 4, "C", "A"),
        3: make_variant(0, 6, "GA", "G"),
        4: make_variant(0, 8, "C", "CAAA"),
        5: make_variant(0, 9, "T", "CG"),
        6: make_variant(0, 10, "G", "C"),
        7: make_variant(0, 10, "GC", "G"),
    }

    # Each row: (variant ids in the pattern, start, end, expected alleles)
    cases = [
        ({0}, 2, 3, {"AT"}),
        ({0}, 2, 4, {"ATC"}),
        ({0}, 3, 3, {"T"}),
        ({0}, 3, 4, {"TC"}),
        ({0, 1}, 3, 4, {"TC", "AC"}),
        ({0, 2}, 3, 4, {"TA"}),
        ({0, 2, 3}, 3, 7, {"TATG"}),
        ({0, 2, 3}, 3, 8, {"TATGC"}),
        ({0, 2, 3}, 3, 9, {"TATGCT"}),
        ({0, 2, 3, 4}, 3, 9, {"TATGCAAAT"}),
        ({0, 2, 3, 4, 5}, 3, 9, {"TATGCAAACG"}),
        ({0, 2, 3, 4, 5}, 3, 10, {"TATGCAAACGG"}),
        ({0, 2, 3, 4, 5, 6}, 3, 10, {"TATGCAAACGC"}),
        ({0, 2, 3, 4, 5, 6}, 3, 11, {"TATGCAAACGCC"}),
        ({0, 2, 3, 4, 5, 7}, 3, 11, {"TATGCAAACGG"}),
    ]
    for pattern, start, end, expected in cases:
        got = variant_tracking.var_pattern_to_alleles(
            variants, pattern, ref_seq, start, end
        )
        assert got == expected
def test_var_cluster_to_coords_and_snps_and_non_snps():
    """Check the (start, end, snps, non_snps) tuple for a lone SNP, and again
    after adding a second SNP at the same position plus a deletion."""
    make_variant = variant_tracking.Variant
    split_cluster = allele_combinations.var_cluster_to_coords_and_snps_and_non_snps

    # A single SNP: start == end, and both alleles recorded at its position.
    cluster = [make_variant(0, 10, "A", "G")]
    expected_snps = {10: {"A", "G"}}
    assert split_cluster(cluster) == (10, 10, expected_snps, [])

    # The deletion AT -> A is expected back with the shared leading "A"
    # removed, i.e. as T -> "" one position further along.
    deletion = make_variant(0, 10, "AT", "A")
    trimmed_deletion = make_variant(0, 11, "T", "")
    cluster += [make_variant(0, 10, "A", "T"), deletion]
    expected_snps[10] |= {"T"}
    assert split_cluster(cluster) == (10, 11, expected_snps, [trimmed_deletion])
def var_cluster_to_coords_and_snps_and_non_snps(variants):
    """Summarise a non-empty cluster of variants.

    Each variant must expose ``seq_id``, ``pos``, ``ref`` and ``alt``.

    Returns a tuple ``(start, end, snps, non_snps)``:

    * ``start`` -- smallest ``pos`` in the cluster.
    * ``end`` -- largest ``pos + len(ref) - 1`` in the cluster (never below -1).
    * ``snps`` -- dict of position => set holding the ref nucleotide and every
      alt nucleotide seen at that position. Only variants whose ref and alt
      are both a single A/C/G/T go here.
    * ``non_snps`` -- list with one new ``variant_tracking.Variant`` for each
      remaining variant, with any shared leading bases of ref and alt removed
      (at least one ref base is always kept) and ``pos`` shifted to match.
    """
    assert len(variants) > 0
    single_bases = frozenset("ACGT")

    first_pos = min(v.pos for v in variants)
    last_pos = max(-1, max(v.pos + len(v.ref) - 1 for v in variants))

    snp_alleles = {}
    trimmed_non_snps = []
    for v in variants:
        if v.ref in single_bases and v.alt in single_bases:
            # First SNP seen at a position seeds the set with the ref base.
            snp_alleles.setdefault(v.pos, {v.ref}).add(v.alt)
            continue

        # Count the leading bases shared by ref and alt, never consuming
        # the final base of ref.
        max_trim = min(len(v.ref) - 1, len(v.alt))
        shared = 0
        for k in range(max_trim):
            if v.ref[k] != v.alt[k]:
                break
            shared += 1
        trimmed_non_snps.append(
            variant_tracking.Variant(
                v.seq_id, v.pos + shared, v.ref[shared:], v.alt[shared:]
            )
        )

    return first_pos, last_pos, snp_alleles, trimmed_non_snps
def test_variants_overlap():
    """Check variants_overlap on pairs of variants, in both argument orders."""
    make_variant = variant_tracking.Variant
    v1 = make_variant(0, 10, "A", "T")
    v2 = make_variant(0, 10, "AG", "T")
    v3 = make_variant(1, 10, "A", "T")
    v4 = make_variant(0, 11, "A", "T")
    v5 = make_variant(0, 12, "A", "T")

    # Each row: (first variant, second variant, whether they should overlap)
    cases = [
        (v1, v2, True),
        (v1, v3, False),
        (v1, v4, False),
        (v1, v5, False),
        (v2, v4, True),
        (v2, v5, False),
    ]
    for left, right, should_overlap in cases:
        assert bool(variant_tracking.variants_overlap(left, right)) == should_overlap
        assert bool(variant_tracking.variants_overlap(right, left)) == should_overlap
def test_load_one_vcf_file():
    """Load one VCF file from the test data directory and check the sample
    name and the list of variants that come back."""
    vcf_file = os.path.join(data_dir, "load_one_vcf_file.vcf")
    ref_fasta = os.path.join(data_dir, "load_one_vcf_file.fa")
    ref_data = variant_tracking.VariantTracker.load_ref_seq_data(ref_fasta)
    ref_seqs, ref_names, ref_seq_to_id = ref_data

    # Start from a fresh, empty working directory.
    tmp_dir = "tmp.load_one_vcf_file"
    utils.rm_rf(tmp_dir)
    os.mkdir(tmp_dir)

    got_sample, got_variants = variant_tracking._load_one_vcf_file(
        vcf_file, ref_seqs, ref_seq_to_id, ref_fasta, tmp_dir, True
    )
    assert got_sample == "sample_42"

    # Each row: (seq_id, pos, ref, alt)
    expected_fields = [
        (0, 1, "T", "TCGC"),
        (0, 2, "C", "G"),
        (0, 6, "A", "T"),
        (0, 8, "T", "G"),
        (1, 1, "G", "C"),
        (1, 1, "G", "A"),
    ]
    expect_variants = [
        variant_tracking.Variant(seq_id=seq_id, pos=pos, ref=ref, alt=alt)
        for seq_id, pos, ref, alt in expected_fields
    ]
    assert got_variants == expect_variants
    os.rmdir(tmp_dir)
def test_VariantBlock():
    """Exercise VariantBlock: growing it, setting and querying variants,
    writing it to an indexed file, then loading slices and variant patterns
    back from one and from two block files."""
    make_variant = variant_tracking.Variant
    load_slice = variant_tracking.load_slice_of_block
    load_patterns = variant_tracking.var_patterns_from_block_slices

    # Construct and add a variant and sample
    block = variant_tracking.VariantBlock()
    assert block.number_of_samples() == 0
    assert block.number_of_variants() == 0
    block.add_variants(1)
    assert block.number_of_samples() == 0
    assert block.number_of_variants() == 1
    block.add_samples(1)
    assert block.number_of_samples() == 1
    assert block.number_of_variants() == 1

    # Getting and setting variant
    assert not block.has_variant(0, 0)
    block.set_variant(0, 0)
    assert block.has_variant(0, 0)

    # Add more samples and variant, check first variant and sample not changed
    block.add_variants(3)
    block.add_samples(2)
    assert block.number_of_samples() == 3
    assert block.number_of_variants() == 4
    assert block.has_variant(0, 0)
    for var_index in range(4):
        for sample_index in range(3):
            if (var_index, sample_index) != (0, 0):
                assert not block.has_variant(var_index, sample_index)
    for var_index, sample_index in ((2, 2), (3, 0), (3, 1)):
        block.set_variant(var_index, sample_index)

    # Save to file
    variants = variant_tracking.Variants()
    for fields in ((0, 0, "A", "G"), (0, 2, "G", "T"), (0, 2, "G", "C"), (1, 42, "G", "C")):
        variants.add(make_variant(*fields))
    tmp_file = "tmp.variant_tracking.block.tsv.gz"
    utils.rm_rf(tmp_file)
    utils.rm_rf(tmp_file + ".tbi")
    block.write_to_bgzip_file_and_tab_index(tmp_file, variants)
    wanted_ids = {var_id for _, var_id in variants.sorted_iter()}

    # Load slices from file. Note that none of the variants had variants[1], so
    # should not be in the file
    for seq_id, start, end in ((1, 0, 0), (1, 41, 41), (1, 43, 43), (0, 1, 1)):
        assert load_slice(tmp_file, wanted_ids, seq_id, start, end) == {}
    expect_vars = {0: block.bitarrays[0]}
    assert load_slice(tmp_file, wanted_ids, 0, 0, 0) == expect_vars
    assert load_slice(tmp_file, {1}, 0, 0, 0) == {}
    assert load_slice(tmp_file, wanted_ids, 0, 0, 1) == expect_vars
    expect_vars[2] = bitarray(block.bitarrays[2])
    assert load_slice(tmp_file, wanted_ids, 0, 0, 2) == expect_vars
    assert load_slice(tmp_file, wanted_ids, 0, 0, 3) == expect_vars

    # Load variant patterns from slices of block. Make another block file to test
    # getting from >1 file
    block.clear_samples()
    variants.add(make_variant(0, 1, "C", "G"))
    variants.add(make_variant(0, 10, "T", "A"))
    block.add_variants(2)
    block.add_samples(2)
    for var_index, sample_index in ((0, 1), (1, 0), (1, 1), (4, 0), (5, 1)):
        block.set_variant(var_index, sample_index)
    tmp_file2 = "tmp.variant_tracking.block.2.tsv.gz"
    utils.rm_rf(tmp_file2)
    utils.rm_rf(tmp_file2 + ".tbi")
    block.write_to_bgzip_file_and_tab_index(tmp_file2, variants)
    wanted_ids = {var_id for _, var_id in variants.sorted_iter()}

    # Each row: (seq_id, start, end, expected set of variant-id patterns)
    pattern_cases = [
        (1, 0, 41, set()),
        (1, 0, 42, {(3, )}),
        (1, 42, 42, {(3, )}),
        (1, 42, 43, {(3, )}),
        (1, 43, 43, set()),
        (0, 0, 9, {(0, 1), (2, ), (0, ), (1, 4)}),
        (0, 0, 10, {(0, 1, 5), (2, ), (0, ), (1, 4)}),
    ]
    for seq_id, start, end, expect_patterns in pattern_cases:
        got_patterns = load_patterns(
            [tmp_file, tmp_file2], wanted_ids, seq_id, start, end
        )
        assert got_patterns == expect_patterns

    for leftover in (tmp_file, tmp_file + ".tbi", tmp_file2, tmp_file2 + ".tbi"):
        os.unlink(leftover)
def test_var_cluster_to_coords_and_alts():
    """Check the (start, end, alts) tuple from var_cluster_to_coords_and_alts
    for clusters of SNPs, deletions, and mixtures of the two."""
    ref_seq = pyfastaq.sequences.Fasta("ref", "CTAGTCGATGCACTGATAGTA")
    #                                          012345678901234567890
    make_variant = variant_tracking.Variant
    coords_and_alts = allele_combinations.var_cluster_to_coords_and_alts

    # One SNP, then a second SNP at the same position.
    cluster = [make_variant(0, 4, "T", "G")]
    assert coords_and_alts(cluster, ref_seq) == (4, 4, {"G"})
    cluster.append(make_variant(0, 4, "T", "A"))
    assert coords_and_alts(cluster, ref_seq) == (4, 4, {"A", "G"})

    # Two SNPs at adjacent positions.
    cluster = [
        make_variant(0, 4, "T", "G"),
        make_variant(0, 5, "C", "A"),
    ]
    assert coords_and_alts(cluster, ref_seq) == (4, 5, {"TA", "GC", "GA"})

    # A deletion together with a SNP inside the deleted region.
    cluster = [
        make_variant(0, 4, "TC", "T"),
        make_variant(0, 5, "C", "A"),
    ]
    assert coords_and_alts(cluster, ref_seq) == (4, 5, {"TA", "T"})

    # Two deletions starting at the same position, plus the SNP.
    cluster = [
        make_variant(0, 4, "TC", "T"),
        make_variant(0, 4, "TCG", "T"),
        make_variant(0, 5, "C", "A"),
    ]
    assert coords_and_alts(cluster, ref_seq) == (4, 6, {"TG", "T", "TAG"})

    # Annoying edge case. The two deletions appear to overlap, because
    # we're using the VCF convention of using the nucleotide before the deletion
    # in the ref and alt. Check that we get the result of both deletions
    # being applied
    cluster = [
        make_variant(0, 4, "TCG", "T"),
        make_variant(0, 6, "GAT", "G"),
    ]
    assert coords_and_alts(cluster, ref_seq) == (4, 8, {"TAT", "TCG", "T"})
    cluster.append(make_variant(0, 5, "C", "A"))
    assert coords_and_alts(cluster, ref_seq) == (
        4,
        8,
        {"TAT", "TCG", "T", "TAG", "TAGAT"},
    )