Ejemplo n.º 1
0
def test_double_deletion_with_snp_inside_first_deletion():
    """Run ``DummyNodeAdder`` on a six-node graph with two deletion variants.

    Nothing is asserted: the test passes as long as the new graph can be
    built, and the result is printed for manual inspection.
    """
    node_sequences = {
        1: "ACTG",
        2: "A",
        3: "C",
        4: "T",
        5: "AAA",
        6: "G",
    }
    node_edges = {
        1: [2, 5, 6],
        2: [3, 4],
        3: [5, 6],
        4: [5, 6],
        5: [6],
    }
    graph = Graph.from_dicts(node_sequences, node_edges, [1, 2, 4, 6])

    first_deletion = VcfVariant(1, 4, "GAT", "G", type="DELETION")
    second_deletion = VcfVariant(1, 6, "TAAA", "T", type="DELETION")
    variants = VcfVariants([first_deletion, second_deletion])

    adder = DummyNodeAdder(graph, variants)
    graph_with_dummies = adder.create_new_graph_with_dummy_nodes()
    print(graph_with_dummies)
Ejemplo n.º 2
0
def test_overlapping_deletions():
    """Check the graph ``DummyNodeAdder`` builds for two deletions and a SNP.

    Verifies the outgoing edges of nodes 1, 8, 2 and 9 in the new graph and
    that the second variant resolves to reference node 3 / variant node 9.
    """
    node_sequences = {
        1: "AA",
        2: "TCTG",
        3: "TCT",
        4: "G",
        5: "A",
        6: "GG",
    }
    node_edges = {
        1: [2, 3],
        2: [3, 6],
        3: [4, 5],
        4: [6],
        5: [6],
    }
    graph = Graph.from_dicts(node_sequences, node_edges, [1, 2, 3, 5, 6])

    variants = VcfVariants([
        VcfVariant(1, 2, "ATCTG", "A", type="DELETION"),
        VcfVariant(1, 6, "GTCTA", "T", type="DELETION"),
        VcfVariant(1, 10, "A", "G", type="SNP"),
    ])
    adder = DummyNodeAdder(graph, variants)
    new_graph = adder.create_new_graph_with_dummy_nodes()

    # Checked in this exact order so the first failing node is reported.
    expected_edges = ((1, [2, 8]), (8, [3, 9]), (2, [3, 9]), (9, [6]))
    for node, successors in expected_edges:
        assert list(new_graph.get_edges(node)) == successors

    ref_node, var_node = new_graph.get_variant_nodes(variants[1])
    assert ref_node == 3
    assert var_node == 9
    print(new_graph)
Ejemplo n.º 3
0
def test_insertion_with_identical_false_path():
    """Check dummy-node placement for an insertion on a four-node graph.

    Nodes 2 and 3 carry the same sequence ("TCTG"). The test asserts the
    edges of nodes 1, 6 and 2 in the new graph and that the variant maps to
    reference node 6 / variant node 2.
    """
    node_sequences = {1: "AA", 2: "TCTG", 3: "TCTG", 4: "GG"}
    node_edges = {1: [2, 3], 2: [3], 3: [4]}
    graph = Graph.from_dicts(node_sequences, node_edges, [1, 3, 4])

    insertion = VcfVariant(1, 2, "A", "ATCTG", type="INSERTION")
    variants = VcfVariants([insertion])
    adder = DummyNodeAdder(graph, variants)
    new_graph = adder.create_new_graph_with_dummy_nodes()
    print(new_graph)

    for node, successors in ((1, [2, 6]), (6, [3]), (2, [3])):
        assert list(new_graph.get_edges(node)) == successors

    ref_node, var_node = new_graph.get_variant_nodes(variants[0])
    assert ref_node == 6
    assert var_node == 2
def make_unique_index(args):
    """Build a ``UniqueKmerIndex`` and write it to ``args.out_file_name``.

    Args:
        args: Namespace-like object providing ``graph``, ``reverse`` and
            ``flat_index`` (input file names) and ``out_file_name``.
    """
    graph = Graph.from_file(args.graph)
    reverse = ReverseKmerIndex.from_file(args.reverse)
    flat = FlatKmers.from_file(args.flat_index)
    unique = UniqueKmerIndex.from_flat_kmers_and_snps_graph(
        flat, graph, reverse)
    unique.to_file(args.out_file_name)


# Defined at module level: nested inside make_unique_index it was created on
# every call but never invoked, so it could not be reached by any caller.
def make_unique_variant_kmers(args):
    """Compute flat k-mers for all variants in parallel and write the merge.

    The k-mer index, the variant-to-nodes mapping and the graph are loaded
    and published through ``to_shared_memory`` before the worker pool is
    created. Variants are read from ``args.vcf`` in chunks of
    ``args.chunk_size`` and each chunk is handed to
    ``make_unique_variant_kmers_single_thread`` together with ``args``.

    Args:
        args: Namespace-like object providing ``kmer_index``,
            ``variant_to_nodes``, ``graph``, ``vcf``, ``chunk_size``,
            ``n_threads`` and ``out_file_name``.
    """
    logging.info("Reading kmer index")
    kmer_index = CollisionFreeKmerIndex.from_file(args.kmer_index)
    to_shared_memory(kmer_index, "kmer_index_shared")
    logging.info("Reading variant to nodes")
    variant_to_nodes = VariantToNodes.from_file(args.variant_to_nodes)
    to_shared_memory(variant_to_nodes, "variant_to_nodes_shared")
    logging.info("REading graph")
    graph = Graph.from_file(args.graph)
    to_shared_memory(graph, "graph_shared")
    logging.info("Reading all variants")
    variants = VcfVariants.from_vcf(args.vcf,
                                    skip_index=True,
                                    make_generator=True)
    variants = variants.get_chunks(chunk_size=args.chunk_size)

    # starmap blocks until every chunk is processed, so the workers can be
    # released as soon as the with-block exits.
    with Pool(args.n_threads) as pool:
        all_flat_kmers = list(
            pool.starmap(make_unique_variant_kmers_single_thread,
                         zip(variants, repeat(args))))

    logging.info("Merge all flat kmers")
    merged_flat = FlatKmers.from_multiple_flat_kmers(all_flat_kmers)
    merged_flat.to_file(args.out_file_name)
    logging.info("Wrote to file %s", args.out_file_name)
Ejemplo n.º 6
0
def test_double_deletion_with_snp_inside_first_deletiod_and_false_deletion_path():
    """Check dummy-node edges for two deletions when nodes 5 and 6 are equal.

    Nodes 5 and 6 share the same sequence. The test asserts the outgoing
    edges of nodes 1, 2, 3, 4, 9 and 8 in the graph produced by
    ``DummyNodeAdder``.
    """
    repeated_sequence = "AGGTCCCAGGTCCATCT"
    node_sequences = {
        1: "TTTT",
        2: "AGGTCC",
        3: "C",
        4: "A",
        5: repeated_sequence,
        6: repeated_sequence,
    }
    node_edges = {
        1: [2, 5, 6],
        2: [3, 4],
        3: [5, 6],
        4: [5, 6],
        5: [6],
    }
    graph = Graph.from_dicts(node_sequences, node_edges, [1, 2, 3, 5, 6])

    variants = VcfVariants([
        VcfVariant(1, 4, "TAGGTCCC", "T", type="DELETION"),
        VcfVariant(1, 11, "CAGGTCCCAGGTCCATCT", "C", type="DELETION"),
    ])
    adder = DummyNodeAdder(graph, variants)
    new_graph = adder.create_new_graph_with_dummy_nodes()

    print(new_graph)
    expected_edges = (
        (1, [2, 8]),
        (2, [3, 4]),
        (3, [5, 9]),
        (4, [5, 9]),
        (9, [6]),
        (8, [5, 9]),
    )
    for node, successors in expected_edges:
        assert list(new_graph.get_edges(node)) == successors
Ejemplo n.º 7
0
def test_simple():
    """Merge two four-node graphs and check offsets, edges and start nodes.

    The two inputs differ only in the sequence of node 1. After the
    assertions the merged graph is written to ``merged_graph.npz`` and read
    back once.
    """

    def build_graph(first_node_sequence):
        # Fresh dict/list objects for every graph so nothing is shared.
        return Graph.from_dicts(
            {1: first_node_sequence, 2: "A", 3: "C", 4: "ACT"},
            {1: [2, 3], 2: [4], 3: [4]},
            [1, 2, 4],
        )

    graph1 = build_graph("ACTG")
    graph2 = build_graph("AAAA")

    merged_graph = merge_graphs([graph1, graph2])

    print(merged_graph)
    assert list(merged_graph.get_edges(1)) == [2, 3]

    assert merged_graph.get_node_at_ref_offset(0) == 1

    node_at_8 = merged_graph.get_node_at_ref_offset(8)
    assert merged_graph.get_node_sequence(node_at_8) == "AAAA"
    node_at_4 = merged_graph.get_node_at_ref_offset(4)
    assert merged_graph.get_node_sequence(node_at_4) == "A"

    for ref_offset in (8, 11):
        node = merged_graph.get_node_at_ref_offset(ref_offset)
        assert len(merged_graph.get_edges(node)) == 2

    assert merged_graph.get_ref_offset_at_node(6) == 8
    assert 7 in merged_graph.linear_ref_nodes()
    assert merged_graph.get_ref_offset_at_node(7) == 12

    assert list(merged_graph.chromosome_start_nodes) == [1, 6]

    # Round trip through disk; only checks that save and load do not raise.
    out_path = "merged_graph.npz"
    merged_graph.to_file(out_path)
    reloaded_graph = Graph.from_file(out_path)
Ejemplo n.º 8
0
def test_indel_graph2():
    """Run ``SnpKmerFinder`` with k=31 on a graph containing an empty node.

    Node 100 has an empty sequence and is an alternative to node 2. Nothing
    is asserted; the k-mers found are printed.
    """
    node_sequences = {
        1: "gggggaggcttgtggttagcagagagtgggtggaagacagaggtttgag",
        2: "ga",
        3: "gagagagacccaggggagaaaaccagctgcagaggcaggaggggtccagggcagcccgaggccagagatgggcgtcttccttacagccacctgtggtccc",
        100: "",
    }
    node_edges = {
        1: [2, 100],
        2: [3],
        100: [3],
    }
    graph = Graph.from_dicts(node_sequences, node_edges, [1, 2, 3])

    finder = SnpKmerFinder(graph, k=31)
    # The return value is not inspected here; the call populates kmers_found.
    found_flat_kmers = finder.find_kmers()
    print(finder.kmers_found)
Ejemplo n.º 9
0
def test_find_insertion_nodes():
    """Check ``Graph.get_variant_nodes`` for an insertion variant.

    The graph offers node 2 ("AA") and the empty node 4 as alternatives
    between nodes 1 and 3; the insertion must resolve to reference node 4
    and variant node 2.
    """
    node_sequences = {1: "CTACCA", 2: "AA", 3: "TAAATAA", 4: ""}
    node_edges = {1: [2, 4], 2: [3], 4: [3]}
    g = Graph.from_dicts(node_sequences, node_edges, [1, 3])

    insertion = VcfVariant(1, 6, "A", "AAA", "", "INSERTION")

    ref_node, variant_node = g.get_variant_nodes(insertion)
    assert ref_node == 4
    assert variant_node == 2
Ejemplo n.º 10
0
def test_simple_insertion():
    """Check that one insertion yields dummy node 5 between nodes 1 and 3.

    Asserts, via ``node_has_edges``, that node 5 leads to 3, node 1 leads
    to 2 and 5, and node 2 leads to 3.
    """
    node_sequences = {1: "ACTG", 2: "C", 3: "AAAA"}
    node_edges = {1: [2, 3], 2: [3]}
    graph = Graph.from_dicts(node_sequences, node_edges, [1, 3])

    insertion = VcfVariant(1, 4, "G", "GC", type="INSERTION")
    adder = DummyNodeAdder(graph, VcfVariants([insertion]))
    new_graph = adder.create_new_graph_with_dummy_nodes()

    for node, successors in ((5, [3]), (1, [2, 5]), (2, [3])):
        assert new_graph.node_has_edges(node, successors)
def simple_test():
    """Run ``UniqueVariantKmersFinder`` on a tiny graph with one deletion.

    Smoke test only: it prints ``ref_offset_to_node`` and the size of node 3
    and then runs ``find_unique_kmers`` with k=4, asserting nothing. The
    reference k-mer index is built from the concatenation of the sequences
    of nodes 1, 2 and 3.
    """
    g = Graph.from_dicts({
        1: "CTACCA",
        2: "AA",
        3: "TAAATAA",
        4: ""
    }, {
        1: [2, 4],
        2: [3],
        4: [3]
    }, [1, 2, 3])
    print(g.ref_offset_to_node)
    print(g.get_node_size(3))
    k = 4
    # Chromosome goes first, as in the other six-positional VcfVariant calls;
    # without it "AAA" would be taken as the position.
    # NOTE(review): assumes the signature is
    # (chromosome, position, ref, alt, genotype, type) - confirm.
    variants = VcfVariants([VcfVariant(1, 6, "AAA", "A", "", "DELETION")])
    reference_kmers = ReferenceKmerIndex.from_sequence("CTACCAAATAAATAA", k)
    finder = UniqueVariantKmersFinder(g, reference_kmers, variants, k)
    finder.find_unique_kmers()
def create_index(args):
    """Create a flat k-mer index and write it to ``args.out_file_name``.

    If ``args.graph_file_name`` is set, the graph is loaded and published
    through ``to_shared_memory`` under the name ``"graph_shared"`` first.

    With ``args.threads == 1`` the index is built by a single call to
    ``create_index_single_thread(args)``. Otherwise the genome is cut into
    ``args.threads`` equally sized intervals whose bounds are multiples of
    ``args.spacing``, each interval is processed in a worker process, and
    the per-interval results are concatenated into one ``FlatKmers``.

    Args:
        args: Namespace-like object providing ``graph_file_name``,
            ``threads``, ``genome_size``, ``spacing`` and ``out_file_name``
            (plus whatever ``create_index_single_thread`` reads).
    """
    if args.graph_file_name is not None:
        graph = Graph.from_file(args.graph_file_name)
        to_shared_memory(graph, "graph_shared")

    if args.threads == 1:
        kmers = create_index_single_thread(args)
        kmers.to_file(args.out_file_name)
        return

    logging.info("Making pool with %d workers", args.threads)
    genome_size = args.genome_size
    n_total_start_positions = genome_size // args.spacing
    n_positions_each_process = n_total_start_positions // args.threads
    logging.info(
        "Using genome size %d. Will process %d genome positions in each process.",
        genome_size, n_positions_each_process)

    # NOTE(review): both divisions above round down, so positions at or past
    # n_positions_each_process * threads * spacing fall in no interval.
    # Confirm that dropping this tail is intended.
    interval_length = n_positions_each_process * args.spacing
    intervals = []
    for i in range(args.threads):
        start_position = interval_length * i
        end_position = interval_length * (i + 1)
        intervals.append((start_position, end_position))
        logging.info("Creating interval for genome segment %d-%d",
                     start_position, end_position)

    # starmap blocks until all intervals are done, so the worker processes
    # can be released as soon as the with-block exits.
    with Pool(args.threads) as pool:
        partial_indexes = pool.starmap(create_index_single_thread,
                                       zip(repeat(args), intervals))

    logging.info("Making full index from all indexes")
    full_index = FlatKmers(
        np.concatenate([part._hashes for part in partial_indexes]),
        np.concatenate([part._nodes for part in partial_indexes]),
        np.concatenate([part._ref_offsets for part in partial_indexes]),
        np.concatenate([part._allele_frequencies
                        for part in partial_indexes]))

    logging.info("Saving full index")
    full_index.to_file(args.out_file_name)
Ejemplo n.º 13
0
def test_from_dicts():
    """Check node sizes, edges and sequences of a graph made by ``from_dicts``."""
    node_sequences = {1: "ACTG", 2: "A", 3: "G", 4: "AAA"}
    node_edges = {1: [2, 3], 2: [4], 3: [4]}
    g = Graph.from_dicts(node_sequences, node_edges, [1, 2, 4])

    # Expected sizes are spelled out rather than derived from the sequences.
    for node, expected_size in ((1, 4), (2, 1), (3, 1), (4, 3)):
        assert g.get_node_size(node) == expected_size

    assert list(g.get_edges(1)) == [2, 3]

    assert g.get_node_sequence(2) == "A"
Ejemplo n.º 14
0
def test_indel_graph():
    """Check k-mer index hits across an indel (an empty alternative node).

    Node 3 is empty and is an alternative to node 2 ("A"). With k=3 the
    k-mer "GTA" is expected on nodes 1, 3, 4 and "GAT" on nodes 1, 2, 4.
    """
    graph = Graph.from_dicts({
        1: "ACTG",
        2: "A",
        3: "",
        4: "TAAT"
    }, {
        1: [2, 3],
        2: [4],
        3: [4]
    }, [1, 2, 4])
    kmer_finder = SnpKmerFinder(graph, k=3)
    flat_kmers = kmer_finder.find_kmers()
    print(kmer_finder.kmers_found)

    index = KmerIndex.from_flat_kmers(flat_kmers)
    hits = index.get(sequence_to_kmer_hash("GTA"))
    # list() must wrap only hits[1]: wrapping the whole comparison produces
    # a non-empty list, which is truthy no matter what the hits are.
    # NOTE(review): presumably hits[1] holds the node ids - confirm.
    assert list(hits[1]) == [1, 3, 4]
    print(hits)
    hits = index.get(sequence_to_kmer_hash("GAT"))
    assert list(hits[1]) == [1, 2, 4]
    print(hits)
Ejemplo n.º 15
0
def test_tricky_case_nested_deletions():
    """Check dummy nodes for two deletions and a SNP on a nine-node graph.

    Asserts that nodes 1 and 2 each gain an edge to a new node (11 and 12
    respectively) and that both new nodes lead to node 6.
    """
    node_sequences = {
        1: "TATAT",
        2: "AT",
        3: "A",
        4: "T",
        5: "A",
        6: "A",
        7: "T",
        8: "A",
        9: "GG",
    }
    node_edges = {
        1: [2, 6],
        2: [3, 6],
        3: [4, 5],
        4: [6],
        5: [6],
        6: [7, 8],
        7: [9],
        8: [9],
    }
    graph = Graph.from_dicts(node_sequences, node_edges,
                             [1, 2, 3, 5, 6, 8, 9])

    variants = VcfVariants([
        VcfVariant(1, 5, "TATAA", "T", type="DELETION"),
        VcfVariant(1, 7, "TAA", "T", type="DELETION"),
        VcfVariant(1, 5, "A", "T", type="SNP"),
    ])

    adder = DummyNodeAdder(graph, variants)
    new_graph = adder.create_new_graph_with_dummy_nodes()
    print(new_graph)

    expected_edges = ((1, [2, 11]), (2, [3, 12]), (11, [6]), (12, [6]))
    for node, successors in expected_edges:
        assert list(new_graph.get_edges(node)) == successors
Ejemplo n.º 16
0
def test_simple_snp_graph():
    """Check which k-mers ``SnpKmerFinder`` reports on a single-SNP graph.

    Nodes 2 ("A") and 3 ("G") are alternatives between nodes 1 and 4. With
    k=3 the finder must report "ACT", "GAA", "GGA" and "AAT" together with
    the node sets given below.
    """
    node_sequences = {1: "ACTG", 2: "A", 3: "G", 4: "AAAT"}
    node_edges = {1: [2, 3], 2: [4], 3: [4]}
    graph = Graph.from_dicts(node_sequences, node_edges, [1, 2, 4])

    finder = SnpKmerFinder(graph, k=3)
    flat_kmers = finder.find_kmers()

    # Debug output, kept in the same order.
    print(finder.kmers_found)
    print(flat_kmers._ref_offsets)
    print(flat_kmers._nodes)
    print(flat_kmers._hashes)

    expected_kmers = (
        ("ACT", {1}),
        ("GAA", {1, 2, 4}),
        ("GGA", {1, 3, 4}),
        ("AAT", {4}),
    )
    for kmer, nodes in expected_kmers:
        assert finder.has_kmer(kmer, nodes)
Ejemplo n.º 17
0
def test_insertion_with_multiple_paths():
    """Check the dummy node created for one insertion on a six-node graph.

    Asserts that node 1 leads to nodes 2 and 8 and that node 8 leads only
    to node 3, then prints the new graph.
    """
    node_sequences = {
        1: "AAAG",
        2: "GAGT",
        3: "GA",
        4: "C",
        5: "G",
        6: "T",
    }
    node_edges = {
        1: [2, 3],
        2: [3],
        3: [4, 5],
        4: [6],
        5: [6],
    }
    graph = Graph.from_dicts(node_sequences, node_edges, [1, 3, 5, 6])

    insertion = VcfVariant(1, 4, "G", "GGAGT", type="INSERTION")
    adder = DummyNodeAdder(graph, VcfVariants([insertion]))
    new_graph = adder.create_new_graph_with_dummy_nodes()

    for node, successors in ((1, [2, 8]), (8, [3])):
        assert list(new_graph.get_edges(node)) == successors
    print(new_graph)