def test_double_deletion_with_snp_inside_first_deletion():
    """Smoke test: add dummy nodes for two deletions on a small graph.

    Nothing is asserted; the test only checks that the dummy-node graph can
    be built and printed without raising.
    """
    node_sequences = {1: "ACTG", 2: "A", 3: "C", 4: "T", 5: "AAA", 6: "G"}
    node_edges = {1: [2, 5, 6], 2: [3, 4], 3: [5, 6], 4: [5, 6], 5: [6]}
    linear_ref_nodes = [1, 2, 4, 6]
    graph = Graph.from_dicts(node_sequences, node_edges, linear_ref_nodes)

    deletions = [
        VcfVariant(1, 4, "GAT", "G", type="DELETION"),
        VcfVariant(1, 6, "TAAA", "T", type="DELETION"),
    ]
    variants = VcfVariants(deletions)

    adder = DummyNodeAdder(graph, variants)
    graph_with_dummies = adder.create_new_graph_with_dummy_nodes()
    print(graph_with_dummies)
def test_overlapping_deletions():
    """Check edges and variant nodes after adding dummy nodes for two deletions and a SNP."""
    graph = Graph.from_dicts(
        {1: "AA", 2: "TCTG", 3: "TCT", 4: "G", 5: "A", 6: "GG"},
        {1: [2, 3], 2: [3, 6], 3: [4, 5], 4: [6], 5: [6]},
        [1, 2, 3, 5, 6],
    )
    variants = VcfVariants(
        [
            VcfVariant(1, 2, "ATCTG", "A", type="DELETION"),
            VcfVariant(1, 6, "GTCTA", "T", type="DELETION"),
            VcfVariant(1, 10, "A", "G", type="SNP"),
        ]
    )

    adder = DummyNodeAdder(graph, variants)
    new_graph = adder.create_new_graph_with_dummy_nodes()

    # Nodes 8 and 9 are the nodes the test expects the adder to have created.
    expected_edges = {1: [2, 8], 8: [3, 9], 2: [3, 9], 9: [6]}
    for node, successors in expected_edges.items():
        assert list(new_graph.get_edges(node)) == successors

    ref_node, var_node = new_graph.get_variant_nodes(variants[1])
    assert ref_node == 3
    assert var_node == 9
    print(new_graph)
def test_insertion_with_identical_false_path():
    """Check dummy-node placement for an insertion whose sequence equals the following node's."""
    node_sequences = {1: "AA", 2: "TCTG", 3: "TCTG", 4: "GG"}
    node_edges = {1: [2, 3], 2: [3], 3: [4]}
    graph = Graph.from_dicts(node_sequences, node_edges, [1, 3, 4])

    insertion = VcfVariant(1, 2, "A", "ATCTG", type="INSERTION")
    variants = VcfVariants([insertion])

    new_graph = DummyNodeAdder(graph, variants).create_new_graph_with_dummy_nodes()
    print(new_graph)

    # Node 6 is the node the test expects the adder to have created.
    for node, successors in ((1, [2, 6]), (6, [3]), (2, [3])):
        assert list(new_graph.get_edges(node)) == successors

    ref_node, var_node = new_graph.get_variant_nodes(variants[0])
    assert ref_node == 6
    assert var_node == 2
def make_unique_index(args):
    """Build a UniqueKmerIndex from on-disk inputs and write it to disk.

    Reads the graph from ``args.graph``, the reverse kmer index from
    ``args.reverse`` and the flat kmers from ``args.flat_index``; the
    resulting index is written to ``args.out_file_name``.
    """
    graph = Graph.from_file(args.graph)
    reverse_index = ReverseKmerIndex.from_file(args.reverse)
    flat_kmers = FlatKmers.from_file(args.flat_index)

    unique_index = UniqueKmerIndex.from_flat_kmers_and_snps_graph(
        flat_kmers, graph, reverse_index
    )
    unique_index.to_file(args.out_file_name)
def make_unique_variant_kmers(args):
    """Find unique variant kmers in parallel and write the merged result to file.

    The kmer index, the variant-to-nodes mapping and the graph are loaded and
    published through ``to_shared_memory`` under fixed names. The VCF is then
    read in chunks of ``args.chunk_size`` variants, and each chunk is handed
    to ``make_unique_variant_kmers_single_thread`` in a pool of
    ``args.n_threads`` workers. The per-chunk flat kmers are merged and
    written to ``args.out_file_name``.
    """
    logging.info("Reading kmer index")
    kmer_index = CollisionFreeKmerIndex.from_file(args.kmer_index)
    to_shared_memory(kmer_index, "kmer_index_shared")

    logging.info("Reading variant to nodes")
    variant_to_nodes = VariantToNodes.from_file(args.variant_to_nodes)
    to_shared_memory(variant_to_nodes, "variant_to_nodes_shared")

    logging.info("REading graph")
    graph = Graph.from_file(args.graph)
    to_shared_memory(graph, "graph_shared")

    logging.info("Reading all variants")
    variants = VcfVariants.from_vcf(args.vcf, skip_index=True, make_generator=True)
    variant_chunks = variants.get_chunks(chunk_size=args.chunk_size)

    # The context manager tears the worker processes down once starmap has
    # returned (or raised), so the pool is no longer leaked.
    with Pool(args.n_threads) as pool:
        all_flat_kmers = pool.starmap(
            make_unique_variant_kmers_single_thread,
            zip(variant_chunks, repeat(args)),
        )

    logging.info("Merge all flat kmers")
    merged_flat = FlatKmers.from_multiple_flat_kmers(all_flat_kmers)
    merged_flat.to_file(args.out_file_name)
    logging.info("Wrote to file %s", args.out_file_name)
def test_double_deletion_with_snp_inside_first_deletiod_and_false_deletion_path():
    """Check dummy-node edges for two deletions when nodes 5 and 6 hold the same sequence."""
    repeated_sequence = "AGGTCCCAGGTCCATCT"
    node_sequences = {
        1: "TTTT",
        2: "AGGTCC",
        3: "C",
        4: "A",
        5: repeated_sequence,
        6: repeated_sequence,
    }
    node_edges = {1: [2, 5, 6], 2: [3, 4], 3: [5, 6], 4: [5, 6], 5: [6]}
    graph = Graph.from_dicts(node_sequences, node_edges, [1, 2, 3, 5, 6])

    variants = VcfVariants(
        [
            VcfVariant(1, 4, "TAGGTCCC", "T", type="DELETION"),
            VcfVariant(1, 11, "CAGGTCCCAGGTCCATCT", "C", type="DELETION"),
        ]
    )

    adder = DummyNodeAdder(graph, variants)
    new_graph = adder.create_new_graph_with_dummy_nodes()
    print(new_graph)

    # Nodes 8 and 9 are the nodes the test expects the adder to have created.
    expected_edges = {
        1: [2, 8],
        2: [3, 4],
        3: [5, 9],
        4: [5, 9],
        9: [6],
        8: [5, 9],
    }
    for node, successors in expected_edges.items():
        assert list(new_graph.get_edges(node)) == successors
def test_simple():
    """Merge two four-node graphs and check edges, reference offsets and start nodes.

    Also writes the merged graph to ``merged_graph.npz`` in the current
    directory and reads it back as a smoke test.
    """
    first = Graph.from_dicts(
        {1: "ACTG", 2: "A", 3: "C", 4: "ACT"},
        {1: [2, 3], 2: [4], 3: [4]},
        [1, 2, 4],
    )
    second = Graph.from_dicts(
        {1: "AAAA", 2: "A", 3: "C", 4: "ACT"},
        {1: [2, 3], 2: [4], 3: [4]},
        [1, 2, 4],
    )

    merged_graph = merge_graphs([first, second])
    print(merged_graph)

    assert list(merged_graph.get_edges(1)) == [2, 3]
    assert merged_graph.get_node_at_ref_offset(0) == 1

    # The first graph's reference path is 8 bases long, so offset 8 should
    # land on the first node of the second graph.
    node_at_offset_8 = merged_graph.get_node_at_ref_offset(8)
    assert merged_graph.get_node_sequence(node_at_offset_8) == "AAAA"
    node_at_offset_4 = merged_graph.get_node_at_ref_offset(4)
    assert merged_graph.get_node_sequence(node_at_offset_4) == "A"

    for ref_offset in (8, 11):
        node = merged_graph.get_node_at_ref_offset(ref_offset)
        assert len(merged_graph.get_edges(node)) == 2

    assert merged_graph.get_ref_offset_at_node(6) == 8
    assert 7 in merged_graph.linear_ref_nodes()
    assert merged_graph.get_ref_offset_at_node(7) == 12
    assert list(merged_graph.chromosome_start_nodes) == [1, 6]

    # Round-trip through disk; only checks that saving and loading do not raise.
    merged_graph.to_file("merged_graph.npz")
    merged_graph2 = Graph.from_file("merged_graph.npz")
def test_indel_graph2():
    """Smoke test: run SnpKmerFinder with k=31 on a graph with an empty node (id 100).

    Nothing is asserted; the kmers found are only printed.
    """
    node_sequences = {
        1: "gggggaggcttgtggttagcagagagtgggtggaagacagaggtttgag",
        2: "ga",
        3: "gagagagacccaggggagaaaaccagctgcagaggcaggaggggtccagggcagcccgaggccagagatgggcgtcttccttacagccacctgtggtccc",
        100: "",
    }
    node_edges = {1: [2, 100], 2: [3], 100: [3]}
    graph = Graph.from_dicts(node_sequences, node_edges, [1, 2, 3])

    kmer_finder = SnpKmerFinder(graph, k=31)
    flat_kmers = kmer_finder.find_kmers()
    print(kmer_finder.kmers_found)
def test_find_insertion_nodes():
    """Check that get_variant_nodes returns empty node 4 as ref node and node 2 as variant node."""
    node_sequences = {1: "CTACCA", 2: "AA", 3: "TAAATAA", 4: ""}
    node_edges = {1: [2, 4], 2: [3], 4: [3]}
    g = Graph.from_dicts(node_sequences, node_edges, [1, 3])

    insertion = VcfVariant(1, 6, "A", "AAA", "", "INSERTION")
    ref_node, variant_node = g.get_variant_nodes(insertion)

    assert ref_node == 4
    assert variant_node == 2
def test_simple_insertion():
    """Check the edges around the node (5) expected to be added for a one-base insertion."""
    graph = Graph.from_dicts(
        {1: "ACTG", 2: "C", 3: "AAAA"},
        {1: [2, 3], 2: [3]},
        [1, 3],
    )
    insertion = VcfVariant(1, 4, "G", "GC", type="INSERTION")
    variants = VcfVariants([insertion])

    adder = DummyNodeAdder(graph, variants)
    new_graph = adder.create_new_graph_with_dummy_nodes()

    expected = [(5, [3]), (1, [2, 5]), (2, [3])]
    for node, successors in expected:
        assert new_graph.node_has_edges(node, successors)
def simple_test():
    """Manually run UniqueVariantKmersFinder on a tiny graph with one deletion.

    Not collected by pytest (the name lacks the ``test_`` prefix) and nothing
    is asserted; it only exercises the finder end to end.
    """
    g = Graph.from_dicts(
        {1: "CTACCA", 2: "AA", 3: "TAAATAA", 4: ""},
        {1: [2, 4], 2: [3], 4: [3]},
        [1, 2, 3],
    )
    print(g.ref_offset_to_node)
    print(g.get_node_size(3))

    k = 4
    # Every other VcfVariant call in this file passes the chromosome first
    # (chromosome, position, ref, alt, ...). The original call here omitted
    # it, which shifted every positional argument by one.
    # NOTE(review): chromosome 1 is assumed, matching the other tests - confirm.
    variants = VcfVariants([VcfVariant(1, 6, "AAA", "A", "", "DELETION")])

    # The sequence is the concatenation of nodes 1, 2 and 3 of the graph above.
    reference_kmers = ReferenceKmerIndex.from_sequence("CTACCAAATAAATAA", k)
    finder = UniqueVariantKmersFinder(g, reference_kmers, variants, k)
    finder.find_unique_kmers()
def create_index(args):
    """Create a flat kmer index and write it to ``args.out_file_name``.

    If ``args.graph_file_name`` is set, the graph is loaded and published via
    ``to_shared_memory`` under the name ``"graph_shared"``.

    With ``args.threads == 1`` the index is built by a single call to
    ``create_index_single_thread(args)``. Otherwise the genome
    (``args.genome_size``) is cut into ``args.threads`` equally sized
    intervals, each aligned to ``args.spacing``. One worker process handles
    each interval, and the per-interval results are concatenated into one
    ``FlatKmers``.
    """
    if args.graph_file_name is not None:
        graph = Graph.from_file(args.graph_file_name)
        to_shared_memory(graph, "graph_shared")

    if args.threads == 1:
        kmers = create_index_single_thread(args)
        kmers.to_file(args.out_file_name)
        return

    logging.info("Making pool with %d workers", args.threads)

    genome_size = args.genome_size
    n_total_start_positions = genome_size // args.spacing
    n_positions_each_process = n_total_start_positions // args.threads
    logging.info(
        "Using genome size %d. Will process %d genome positions in each process.",
        genome_size,
        n_positions_each_process,
    )

    # NOTE(review): because of the two integer divisions above, the last
    # interval can end before genome_size, leaving the tail of the genome
    # unprocessed. Confirm whether that is intended.
    intervals = []
    for i in range(args.threads):
        start_position = n_positions_each_process * i * args.spacing
        end_position = n_positions_each_process * (i + 1) * args.spacing
        intervals.append((start_position, end_position))
        logging.info(
            "Creating interval for genome segment %d-%d", start_position, end_position
        )

    # The context manager tears the worker processes down once starmap has
    # returned (or raised), so the pool is no longer leaked.
    with Pool(args.threads) as pool:
        partial_indexes = pool.starmap(
            create_index_single_thread, zip(repeat(args), intervals)
        )

    logging.info("Making full index from all indexes")
    full_index = FlatKmers(
        np.concatenate([part._hashes for part in partial_indexes]),
        np.concatenate([part._nodes for part in partial_indexes]),
        np.concatenate([part._ref_offsets for part in partial_indexes]),
        np.concatenate([part._allele_frequencies for part in partial_indexes]),
    )

    logging.info("Saving full index")
    full_index.to_file(args.out_file_name)
def test_from_dicts():
    """Check node sizes, edges and sequences of a graph built with Graph.from_dicts."""
    node_sequences = {1: "ACTG", 2: "A", 3: "G", 4: "AAA"}
    node_edges = {1: [2, 3], 2: [4], 3: [4]}
    g = Graph.from_dicts(node_sequences, node_edges, [1, 2, 4])

    expected_sizes = {1: 4, 2: 1, 3: 1, 4: 3}
    for node, size in expected_sizes.items():
        assert g.get_node_size(node) == size

    assert list(g.get_edges(1)) == [2, 3]
    assert g.get_node_sequence(2) == "A"
def test_indel_graph():
    """Check that kmers spanning an indel bubble are indexed with the right nodes.

    Node 3 is empty, so "GTA" should be found on the path through nodes
    1, 3, 4 and "GAT" on the path through nodes 1, 2, 4.
    """
    graph = Graph.from_dicts(
        {1: "ACTG", 2: "A", 3: "", 4: "TAAT"},
        {1: [2, 3], 2: [4], 3: [4]},
        [1, 2, 4],
    )
    kmer_finder = SnpKmerFinder(graph, k=3)
    flat_kmers = kmer_finder.find_kmers()
    print(kmer_finder.kmers_found)

    index = KmerIndex.from_flat_kmers(flat_kmers)

    # The original wrote `assert list(hits[1] == [...])`, which only tested
    # the truthiness of a list of comparison results and so could not fail
    # on wrong nodes. The parenthesis now closes before the comparison.
    # NOTE(review): presumably hits[1] holds the node ids of the hits - confirm.
    hits = index.get(sequence_to_kmer_hash("GTA"))
    assert list(hits[1]) == [1, 3, 4]
    print(hits)

    hits = index.get(sequence_to_kmer_hash("GAT"))
    assert list(hits[1]) == [1, 2, 4]
    print(hits)
def test_tricky_case_nested_deletions():
    """Check dummy-node edges for two deletions and a SNP on a nine-node graph."""
    node_sequences = {
        1: "TATAT",
        2: "AT",
        3: "A",
        4: "T",
        5: "A",
        6: "A",
        7: "T",
        8: "A",
        9: "GG",
    }
    node_edges = {
        1: [2, 6],
        2: [3, 6],
        3: [4, 5],
        4: [6],
        5: [6],
        6: [7, 8],
        7: [9],
        8: [9],
    }
    graph = Graph.from_dicts(node_sequences, node_edges, [1, 2, 3, 5, 6, 8, 9])

    variants = VcfVariants(
        [
            VcfVariant(1, 5, "TATAA", "T", type="DELETION"),
            VcfVariant(1, 7, "TAA", "T", type="DELETION"),
            VcfVariant(1, 5, "A", "T", type="SNP"),
        ]
    )

    adder = DummyNodeAdder(graph, variants)
    new_graph = adder.create_new_graph_with_dummy_nodes()
    print(new_graph)

    # Nodes 11 and 12 are the nodes the test expects the adder to have created.
    expected_edges = {1: [2, 11], 2: [3, 12], 11: [6], 12: [6]}
    for node, successors in expected_edges.items():
        assert list(new_graph.get_edges(node)) == successors
def test_simple_snp_graph():
    """Check that SnpKmerFinder (k=3) reports the expected kmers and node sets on a SNP graph."""
    graph = Graph.from_dicts(
        {1: "ACTG", 2: "A", 3: "G", 4: "AAAT"},
        {1: [2, 3], 2: [4], 3: [4]},
        [1, 2, 4],
    )
    kmer_finder = SnpKmerFinder(graph, k=3)
    flat_kmers = kmer_finder.find_kmers()

    # Debug output, printed in the same order as before.
    print(kmer_finder.kmers_found)
    for column in (flat_kmers._ref_offsets, flat_kmers._nodes, flat_kmers._hashes):
        print(column)

    expected_kmers = [
        ("ACT", {1}),
        ("GAA", {1, 2, 4}),
        ("GGA", {1, 3, 4}),
        ("AAT", {4}),
    ]
    for kmer, nodes in expected_kmers:
        assert kmer_finder.has_kmer(kmer, nodes)
def test_insertion_with_multiple_paths():
    """Check the node (8) expected to be added for an insertion ahead of a SNP bubble."""
    node_sequences = {1: "AAAG", 2: "GAGT", 3: "GA", 4: "C", 5: "G", 6: "T"}
    node_edges = {1: [2, 3], 2: [3], 3: [4, 5], 4: [6], 5: [6]}
    graph = Graph.from_dicts(node_sequences, node_edges, [1, 3, 5, 6])

    insertion = VcfVariant(1, 4, "G", "GGAGT", type="INSERTION")
    variants = VcfVariants([insertion])

    adder = DummyNodeAdder(graph, variants)
    new_graph = adder.create_new_graph_with_dummy_nodes()

    for node, successors in ((1, [2, 8]), (8, [3])):
        assert list(new_graph.get_edges(node)) == successors
    print(new_graph)