예제 #1
0
def test_double_deletion_with_snp_inside_first_deletiod_and_false_deletion_path(
):
    """Check dummy-node edges for two deletions next to a repeated sequence.

    The hand-built graph has two nodes (5 and 6) that carry the same
    sequence. After dummy nodes are added for the two deletions, the
    outgoing edges of selected nodes are compared with fixed expectations.
    """
    repeat_unit = "AGGTCCCAGGTCCATCT"
    node_sequences = {
        1: "TTTT",
        2: "AGGTCC",
        3: "C",
        4: "A",
        5: repeat_unit,
        6: repeat_unit,
    }
    node_edges = {
        1: [2, 5, 6],
        2: [3, 4],
        3: [5, 6],
        4: [5, 6],
        5: [6],
    }
    # Third from_dicts argument; presumably the linear reference path
    # (confirm against Graph.from_dicts).
    ref_nodes = [1, 2, 3, 5, 6]
    graph = Graph.from_dicts(node_sequences, node_edges, ref_nodes)

    first_deletion = VcfVariant(1, 4, "TAGGTCCC", "T", type="DELETION")
    second_deletion = VcfVariant(
        1, 11, "CAGGTCCCAGGTCCATCT", "C", type="DELETION")
    variants = VcfVariants([first_deletion, second_deletion])
    adder = DummyNodeAdder(graph, variants)
    new_graph = adder.create_new_graph_with_dummy_nodes()

    print(new_graph)
    # (node, expected outgoing edges), checked in this order.
    expected_edges = (
        (1, [2, 8]),
        (2, [3, 4]),
        (3, [5, 9]),
        (4, [5, 9]),
        (9, [6]),
        (8, [5, 9]),
    )
    for node, targets in expected_edges:
        assert list(new_graph.get_edges(node)) == targets
    def make_unique_variant_kmers(args):
        """Find unique variant kmers in parallel and write them to file.

        Loads the kmer index, the variant-to-nodes mapping and the graph,
        publishes each through ``to_shared_memory`` under a fixed name, then
        processes the VCF in chunks with a pool of ``args.n_threads``
        workers. The per-chunk results are merged and written to
        ``args.out_file_name``.

        Args:
            args: Namespace providing ``kmer_index``, ``variant_to_nodes``,
                ``graph``, ``vcf``, ``chunk_size``, ``n_threads`` and
                ``out_file_name``.
        """
        logging.info("Reading kmer index")
        kmer_index = CollisionFreeKmerIndex.from_file(args.kmer_index)
        to_shared_memory(kmer_index, "kmer_index_shared")
        logging.info("Reading variant to nodes")
        variant_to_nodes = VariantToNodes.from_file(args.variant_to_nodes)
        to_shared_memory(variant_to_nodes, "variant_to_nodes_shared")
        logging.info("REading graph")
        graph = Graph.from_file(args.graph)
        to_shared_memory(graph, "graph_shared")
        logging.info("Reading all variants")
        variants = VcfVariants.from_vcf(args.vcf,
                                        skip_index=True,
                                        make_generator=True)
        variants = variants.get_chunks(chunk_size=args.chunk_size)

        # starmap blocks until every chunk is done and already returns a
        # list, so no manual accumulation is needed. The with-block makes
        # sure the worker processes are released instead of being leaked.
        # NOTE(review): assumes Pool is multiprocessing.Pool - confirm.
        with Pool(args.n_threads) as pool:
            all_flat_kmers = pool.starmap(
                make_unique_variant_kmers_single_thread,
                zip(variants, repeat(args)))

        logging.info("Merge all flat kmers")
        merged_flat = FlatKmers.from_multiple_flat_kmers(all_flat_kmers)
        merged_flat.to_file(args.out_file_name)
        logging.info("Wrote to file %s" % args.out_file_name)
예제 #3
0
def test_messy_graph():
    """Check variant nodes and edges for four closely packed variants.

    Two deletions, a SNP and an insertion are placed within a few bases of
    each other on a short reference. The graph is built, printed (with and
    without dummy nodes) and then queried for the reference/variant node
    of each variant and for the outgoing edges of three nodes.
    """
    reference = "GCATATTTT"
    variants = VcfVariants([
        VcfVariant(1, 2, "CAT", "C", type="DELETION"),
        VcfVariant(1, 3, "A", "G", type="SNP"),
        VcfVariant(1, 4, "TA", "T", type="DELETION"),
        VcfVariant(1, 5, "A", "AT", type="INSERTION"),
    ])

    constructor = GraphConstructor(reference, variants)
    graph = constructor.get_graph()
    print(graph)
    print(constructor.get_graph_with_dummy_nodes())

    # (variant index, expected reference node, expected variant node),
    # listed in the order the checks are performed.
    expected_variant_nodes = (
        (2, 5, 10),
        (0, 3, 9),
        (1, 3, 2),
        (3, 11, 6),
    )
    for variant_index, expected_ref, expected_var in expected_variant_nodes:
        ref, var = graph.get_variant_nodes(variants[variant_index])
        assert ref == expected_ref
        assert var == expected_var

    for node, targets in ((10, [6, 11]), (11, [7]), (9, [5, 10])):
        assert list(graph.get_edges(node)) == targets
예제 #4
0
def test_double_deletion_with_snp_inside_first_deletion():
    """Smoke-test dummy-node creation for two deletions on a manual graph.

    Makes no assertions; the graph with dummy nodes is printed so it can
    be inspected by eye.
    """
    node_sequences = {1: "ACTG", 2: "A", 3: "C", 4: "T", 5: "AAA", 6: "G"}
    node_edges = {
        1: [2, 5, 6],
        2: [3, 4],
        3: [5, 6],
        4: [5, 6],
        5: [6],
    }
    graph = Graph.from_dicts(node_sequences, node_edges, [1, 2, 4, 6])

    deletions = [
        VcfVariant(1, 4, "GAT", "G", type="DELETION"),
        VcfVariant(1, 6, "TAAA", "T", type="DELETION"),
    ]
    adder = DummyNodeAdder(graph, VcfVariants(deletions))
    print(adder.create_new_graph_with_dummy_nodes())
예제 #5
0
def test_overlapping_deletions():
    """Check dummy nodes for two deletions followed by a SNP.

    After dummy nodes are added, the outgoing edges of nodes 1, 8, 2 and 9
    are compared with fixed expectations, and the second variant must map
    to reference node 3 and variant node 9.
    """
    node_sequences = {
        1: "AA",
        2: "TCTG",
        3: "TCT",
        4: "G",
        5: "A",
        6: "GG",
    }
    node_edges = {
        1: [2, 3],
        2: [3, 6],
        3: [4, 5],
        4: [6],
        5: [6],
    }
    graph = Graph.from_dicts(node_sequences, node_edges, [1, 2, 3, 5, 6])

    variants = VcfVariants([
        VcfVariant(1, 2, "ATCTG", "A", type="DELETION"),
        VcfVariant(1, 6, "GTCTA", "T", type="DELETION"),
        VcfVariant(1, 10, "A", "G", type="SNP"),
    ])
    adder = DummyNodeAdder(graph, variants)
    new_graph = adder.create_new_graph_with_dummy_nodes()

    for node, targets in ((1, [2, 8]), (8, [3, 9]), (2, [3, 9]), (9, [6])):
        assert list(new_graph.get_edges(node)) == targets

    second_deletion = variants[1]
    ref_node, var_node = new_graph.get_variant_nodes(second_deletion)
    assert ref_node == 3
    assert var_node == 9
    print(new_graph)
예제 #6
0
def test_insertion_with_identical_false_path():
    """Check dummy-node placement for an insertion equal to the next node.

    Node 2 (the inserted sequence) and node 3 carry the same sequence. The
    test checks the outgoing edges of nodes 1, 6 and 2 in the new graph and
    that the insertion maps to reference node 6 and variant node 2.
    """
    node_sequences = {1: "AA", 2: "TCTG", 3: "TCTG", 4: "GG"}
    node_edges = {1: [2, 3], 2: [3], 3: [4]}
    graph = Graph.from_dicts(node_sequences, node_edges, [1, 3, 4])

    insertion = VcfVariant(1, 2, "A", "ATCTG", type="INSERTION")
    variants = VcfVariants([insertion])
    adder = DummyNodeAdder(graph, variants)
    new_graph = adder.create_new_graph_with_dummy_nodes()
    print(new_graph)

    for node, targets in ((1, [2, 6]), (6, [3]), (2, [3])):
        assert list(new_graph.get_edges(node)) == targets

    ref_node, var_node = new_graph.get_variant_nodes(variants[0])
    assert ref_node == 6
    assert var_node == 2
예제 #7
0
def make(args):
    """Create a graph and save it to ``args.out_file_name``.

    Without ``args.vcf`` the graph is loaded from ``args.vg_json_files``.
    With it, the sequence of ``args.chromosome`` is read from
    ``args.reference_fasta_file`` and combined with the VCF variants by
    ``GraphConstructor``, producing a graph with dummy nodes. The VCF is
    queried with "23"/"24" in place of the chromosome names "X"/"Y".

    Args:
        args: Namespace providing ``vcf``, ``out_file_name`` and either
            ``vg_json_files`` or ``reference_fasta_file`` + ``chromosome``.
    """
    if args.vcf is None:
        logging.info("Will create from files %s" % args.vg_json_files)
        graph = Graph.from_vg_json_files(args.vg_json_files)
        graph.to_file(args.out_file_name)
        return

    logging.info("Will create from vcf file")
    reference = Fasta(args.reference_fasta_file)

    chromosome = args.chromosome
    # Any other chromosome name is passed through unchanged.
    numeric_chromosome = {"X": "23", "Y": "24"}.get(chromosome, chromosome)

    variants = VcfVariants.from_vcf(args.vcf,
                                    limit_to_chromosome=numeric_chromosome)
    ref_sequence = str(reference[args.chromosome])
    sequence_length = len(ref_sequence)
    logging.info("Extracted sequence for chromosome %s. Length is: %d" %
                 (chromosome, sequence_length))
    logging.info("There are %d variants in chromosome" % len(variants))

    graph = GraphConstructor(ref_sequence,
                             variants).get_graph_with_dummy_nodes()
    graph.to_file(args.out_file_name)
예제 #8
0
 def make_variant_to_nodes(args):
     """Build a VariantToNodes mapping and write it to a file.

     Reads the graph from ``args.graph`` and the variants from
     ``args.vcf``, then saves the mapping to ``args.out_file_name``.
     """
     from .variant_to_nodes import VariantToNodes

     mapping = VariantToNodes.from_graph_and_variants(
         Graph.from_file(args.graph), VcfVariants.from_vcf(args.vcf))
     out_path = args.out_file_name
     mapping.to_file(out_path)
     logging.info("Wrote to file %s" % out_path)
예제 #9
0
def make_haplotype_to_nodes(args):
    """Build a HaplotypeToNodes object and write it to a file.

    Reads the graph from ``args.graph_file_name`` and the variants from
    ``args.vcf_file_name``, builds the mapping for ``args.n_haplotypes``
    and saves it to ``args.out_file_name``.
    """
    mapping = HaplotypeToNodes.from_graph_and_variants(
        Graph.from_file(args.graph_file_name),
        VcfVariants.from_vcf(args.vcf_file_name),
        args.n_haplotypes,
    )
    logging.info("Saving to file")
    out_path = args.out_file_name
    mapping.to_file(out_path)
    logging.info("Wrote to file %s" % out_path)
예제 #10
0
    def validate_graph(args):
        """Look up the graph nodes of every variant in a VCF.

        Reads the variants from ``args.vcf`` and the graph from
        ``args.graph``, then asks the graph for the nodes of each variant,
        logging progress every 1000 variants. Nothing is returned; any
        failure surfaces as an exception from the lookup or the unpacking.
        """
        variants = VcfVariants.from_vcf(args.vcf)
        graph = Graph.from_file(args.graph)

        for n_processed, variant in enumerate(variants):
            if not n_processed % 1000:
                logging.info("%d variants processed" % n_processed)

            # The unpacking requires the lookup to yield exactly two values.
            ref_node, var_node = graph.get_variant_nodes(variant)
예제 #11
0
def test_deletion_with_snp_at_end():
    """Smoke-test graph construction for a deletion with a SNP at its end.

    Makes no assertions; the constructed graph is printed.
    """
    deletion = VcfVariant(1, 4, "GTCTA", "G", type="DELETION")
    snp = VcfVariant(1, 8, "A", "T", type="SNP")
    constructor = GraphConstructor("TCTGTCTAGG",
                                   VcfVariants([deletion, snp]))

    print(constructor.get_graph())
예제 #12
0
def test_insertion_with_snp_right_before_and_right_after():
    """Smoke-test an insertion with SNPs at its position and the next one.

    Makes no assertions; the constructed graph is printed.
    """
    snp_before = VcfVariant(1, 2, "A", "T", type="SNP")
    insertion = VcfVariant(1, 2, "A", "AC", type="INSERTION")
    snp_after = VcfVariant(1, 3, "A", "C", type="SNP")
    variants = VcfVariants([snp_before, insertion, snp_after])

    print(GraphConstructor("AAAAAA", variants).get_graph())
예제 #13
0
def test_deletion_with_snp_right_before_and_right_after():
    """Smoke-test a deletion with SNPs at its position and the next one.

    Makes no assertions; both the plain graph and the graph with dummy
    nodes are built first and then printed, in that order.
    """
    snp_before = VcfVariant(1, 2, "A", "T", type="SNP")
    deletion = VcfVariant(1, 2, "AA", "A", type="DELETION")
    snp_after = VcfVariant(1, 3, "A", "C", type="SNP")
    variants = VcfVariants([snp_before, deletion, snp_after])

    constructor = GraphConstructor("AAAAAA", variants)
    plain_graph = constructor.get_graph()
    dummy_graph = constructor.get_graph_with_dummy_nodes()
    for built in (plain_graph, dummy_graph):
        print(built)
예제 #14
0
def test_single_insertion():
    """Check the edges of a graph built from one insertion.

    The graph with dummy nodes is printed, then the outgoing edges of
    nodes 1, 5 and 3 are compared with fixed expectations.
    """
    insertion = VcfVariant(1, 2, "A", "AAA", type="INSERTION")
    constructor = GraphConstructor("AATTGG", VcfVariants([insertion]))
    graph = constructor.get_graph_with_dummy_nodes()
    print(graph)

    for node, targets in ((1, [2, 5]), (5, [3]), (3, [])):
        assert list(graph.get_edges(node)) == targets
예제 #15
0
def add_allele_frequencies(args):
    """Set allele frequencies on a graph and overwrite its file.

    Reads the graph from ``args.graph_file_name`` and the variants of
    ``args.chromosome`` from ``args.vcf_file_name``, sets the allele
    frequencies from those variants and writes the graph back to the file
    it was read from.
    """
    logging.info("Reading graph")
    graph_path = args.graph_file_name
    graph = Graph.from_file(graph_path)
    variants = VcfVariants.from_vcf(
        args.vcf_file_name,
        limit_to_chromosome=args.chromosome,
        skip_index=True,
    )
    # Use chromosome 1 because we always assume this is a
    # single-chromosome graph.
    graph.set_allele_frequencies_from_variants(variants, use_chromosome=1)
    graph.to_file(graph_path)
    logging.info("Wrote modified graph to the same file %s" % graph_path)
예제 #16
0
def test_double_deletion_with_snp_inside_first_deletion():
    """Smoke-test construction with two deletions and a SNP between them.

    Makes no assertions; prints the nodes at reference offsets 0 and 1 of
    the plain graph, followed by the graph with dummy nodes.
    """
    variants = VcfVariants([
        VcfVariant(1, 4, "GAT", "G", type="DELETION"),
        VcfVariant(1, 6, "T", "C", type="SNP"),
        VcfVariant(1, 6, "TAAA", "T", type="DELETION"),
    ])
    constructor = GraphConstructor("ACTGATAAAG", variants)
    graph = constructor.get_graph()
    for ref_offset in (0, 1):
        print(graph.get_node_at_ref_offset(ref_offset))
    print(constructor.get_graph_with_dummy_nodes())
예제 #17
0
 def make_random_haplotypes(args):
     """Create haplotype nodes from random haplotypes and write them out.

     Reads the graph from ``args.graph`` and the variants from
     ``args.vcf_file_name``, builds ``args.n_haplotypes`` random
     haplotypes (weighted by allele frequency unless
     ``args.no_frequency_weighting`` is set), derives a new object by
     traversing the graph and saves it to ``args.out_file_name``.
     """
     graph = Graph.from_file(args.graph)
     variants = VcfVariants.from_vcf(args.vcf_file_name, skip_index=True)
     n_haplotypes = args.n_haplotypes
     use_frequency_weights = not args.no_frequency_weighting
     sampled = HaplotypeToNodes.make_from_n_random_haplotypes(
         graph,
         variants,
         n_haplotypes=n_haplotypes,
         weight_by_allele_frequency=use_frequency_weights)
     logging.info(
         "Making new haplotypenodes by traversing full graph for each haplotype"
     )
     traversed = sampled.get_new_by_traversing_graph(graph, n_haplotypes)
     out_path = args.out_file_name
     traversed.to_file(out_path)
     logging.info("Wrote haplotypenodes to %s" % out_path)
예제 #18
0
def test_simple_insertion():
    """Check the edges after adding a dummy node for one insertion.

    The new graph must report the expected outgoing edges for nodes 5, 1
    and 2 through ``node_has_edges``.
    """
    node_sequences = {1: "ACTG", 2: "C", 3: "AAAA"}
    node_edges = {1: [2, 3], 2: [3]}
    graph = Graph.from_dicts(node_sequences, node_edges, [1, 3])

    insertion = VcfVariant(1, 4, "G", "GC", type="INSERTION")
    adder = DummyNodeAdder(graph, VcfVariants([insertion]))
    new_graph = adder.create_new_graph_with_dummy_nodes()

    for node, targets in ((5, [3]), (1, [2, 5]), (2, [3])):
        assert new_graph.node_has_edges(node, targets)
def simple_test():
    """Run the unique-variant-kmer finder on a small deletion graph.

    Builds a four-node graph in which node 4 is an empty node parallel to
    node 2 ("AA"), prints two graph properties, and runs
    ``UniqueVariantKmersFinder`` with k=4 for a single deletion. Makes no
    assertions.
    """
    g = Graph.from_dicts({
        1: "CTACCA",
        2: "AA",
        3: "TAAATAA",
        4: ""
    }, {
        1: [2, 4],
        2: [3],
        4: [3]
    }, [1, 2, 3])
    print(g.ref_offset_to_node)
    print(g.get_node_size(3))
    k = 4
    # Deletion of the "AA" in node 2, anchored at reference position 6.
    # The arguments follow the (chromosome, position, ref, alt, type=...)
    # order used by every other VcfVariant construction in this file; the
    # earlier positional call put the position in the chromosome slot and
    # the ref allele in the position slot.
    # NOTE(review): confirm against the VcfVariant signature.
    variants = VcfVariants([VcfVariant(1, 6, "AAA", "A", type="DELETION")])
    reference_kmers = ReferenceKmerIndex.from_sequence("CTACCAAATAAATAA", k)
    finder = UniqueVariantKmersFinder(g, reference_kmers, variants, k)
    finder.find_unique_kmers()
예제 #20
0
def test_single_deletion():
    """Check edges, node sequences and reference nodes for one deletion.

    The deletion removes "TT" from the reference "AATTGG"; the test
    expects an empty node 5 parallel to node 2 and the linear reference
    nodes 1, 2 and 3.
    """
    logging.info("\n\nTest single deletion")
    deletion = VcfVariant(1, 2, "ATT", "A", type="DELETION")
    constructor = GraphConstructor("AATTGG", VcfVariants([deletion]))
    graph = constructor.get_graph()
    print(graph)

    for node, targets in ((1, [2, 5]), (2, [3])):
        assert list(graph.get_edges(node)) == targets

    for node, sequence in ((5, ""), (2, "TT"), (1, "AA"), (3, "GG")):
        assert graph.get_node_sequence(node) == sequence

    assert graph.linear_ref_nodes() == {1, 2, 3}
예제 #21
0
def test_insertion_with_snp_right_before():
    """Check a graph with a SNP and an insertion at the same position.

    Verifies the outgoing edges of six nodes, the sequences of four nodes
    and the set of linear reference nodes in the graph with dummy nodes.
    """
    snp = VcfVariant(1, 2, "A", "T", type="SNP")
    insertion = VcfVariant(1, 2, "A", "AC", type="INSERTION")
    constructor = GraphConstructor("AAAAAA", VcfVariants([snp, insertion]))
    graph = constructor.get_graph_with_dummy_nodes()

    expected_edges = (
        (1, [2, 3]),
        (2, [4, 7]),
        (3, [4, 7]),
        (4, [5]),
        (5, []),
        (7, [5]),
    )
    for node, targets in expected_edges:
        assert list(graph.get_edges(node)) == targets

    for node, sequence in ((5, "AAAA"), (7, ""), (4, "C"), (2, "T")):
        assert graph.get_node_sequence(node) == sequence

    assert graph.linear_ref_nodes() == {1, 3, 5}
예제 #22
0
    def make_genotype_matrix(args):
        """Build a GenotypeMatrix and write it to ``args.out_file_name``.

        The variants are read from ``args.vcf_file_name`` as a generator.
        If ``args.node_to_haplotypes`` is given, the matrix is built from
        that file together with the graph in ``args.graph``; otherwise it
        is built directly from the variants using ``args.n_threads`` and
        ``args.chunk_size``.
        """
        from .genotype_matrix import GenotypeMatrix

        variants = VcfVariants.from_vcf(
            args.vcf_file_name,
            skip_index=True,
            limit_to_n_lines=None,
            make_generator=True,
        )

        if args.node_to_haplotypes is None:
            logging.info("Making genotype matrix directly from vcf")
            matrix = GenotypeMatrix.from_variants(
                variants,
                args.n_individuals,
                args.n_variants,
                n_threads=args.n_threads,
                chunk_size=args.chunk_size,
            )
        else:
            graph = Graph.from_file(args.graph)
            nodes_to_haplotypes = NodeToHaplotypes.from_file(
                args.node_to_haplotypes)
            matrix = GenotypeMatrix.from_nodes_to_haplotypes_and_variants(
                nodes_to_haplotypes, variants, graph, args.n_individuals)

        matrix.to_file(args.out_file_name)
예제 #23
0
def test_tricky_case_nested_deletions():
    """Check dummy nodes for two deletions that start at different nodes.

    A nine-node graph is built by hand. After dummy nodes are added for
    two deletions (a SNP is also listed), the outgoing edges of nodes 1,
    2, 11 and 12 are compared with fixed expectations.
    """
    node_sequences = {
        1: "TATAT",
        2: "AT",
        3: "A",
        4: "T",
        5: "A",
        6: "A",
        7: "T",
        8: "A",
        9: "GG",
    }
    node_edges = {
        1: [2, 6],
        2: [3, 6],
        3: [4, 5],
        4: [6],
        5: [6],
        6: [7, 8],
        7: [9],
        8: [9],
    }
    graph = Graph.from_dicts(node_sequences, node_edges,
                             [1, 2, 3, 5, 6, 8, 9])

    variants = VcfVariants([
        VcfVariant(1, 5, "TATAA", "T", type="DELETION"),
        VcfVariant(1, 7, "TAA", "T", type="DELETION"),
        VcfVariant(1, 5, "A", "T", type="SNP"),
    ])

    adder = DummyNodeAdder(graph, variants)
    new_graph = adder.create_new_graph_with_dummy_nodes()
    print(new_graph)

    expected_edges = ((1, [2, 11]), (2, [3, 12]), (11, [6]), (12, [6]))
    for node, targets in expected_edges:
        assert list(new_graph.get_edges(node)) == targets
예제 #24
0
def test_insertion_with_multiple_paths():
    """Check the dummy node for an insertion in a graph with a later bubble.

    After the dummy node is added, node 1 must lead to nodes 2 and 8, and
    node 8 must lead to node 3. The new graph is printed afterwards.
    """
    node_sequences = {
        1: "AAAG",
        2: "GAGT",
        3: "GA",
        4: "C",
        5: "G",
        6: "T",
    }
    node_edges = {
        1: [2, 3],
        2: [3],
        3: [4, 5],
        4: [6],
        5: [6],
    }
    graph = Graph.from_dicts(node_sequences, node_edges, [1, 3, 5, 6])

    insertion = VcfVariant(1, 4, "G", "GGAGT", type="INSERTION")
    adder = DummyNodeAdder(graph, VcfVariants([insertion]))
    new_graph = adder.create_new_graph_with_dummy_nodes()
    for node, targets in ((1, [2, 8]), (8, [3])):
        assert list(new_graph.get_edges(node)) == targets
    print(new_graph)
예제 #25
0
def test_single_snp():
    """Check edges, sizes, sequences and reference nodes for one SNP.

    All edge checks run first, then all size checks, then all sequence
    checks, and finally the set of linear reference nodes is compared.
    """
    snp = VcfVariant(1, 2, "C", "T", type="SNP")
    constructor = GraphConstructor("ACTGGG", VcfVariants([snp]))
    graph = constructor.get_graph()

    expected_edges = {1: [2, 3], 2: [4], 3: [4], 4: []}
    for node, targets in expected_edges.items():
        assert list(graph.get_edges(node)) == targets, graph.get_edges(node)

    expected_sizes = {1: 1, 2: 1, 3: 1, 4: 4}
    for node, size in expected_sizes.items():
        assert graph.get_node_size(node) == size

    expected_sequences = {1: "A", 2: "T", 3: "C", 4: "TGGG"}
    for node, sequence in expected_sequences.items():
        assert graph.get_node_sequence(node) == sequence

    assert graph.linear_ref_nodes() == {1, 3, 4}
예제 #26
0
def add_indel_nodes2(args):
    """Add dummy nodes to a stored graph and write the result to a file.

    Reads the variants from ``args.vcf_file_name`` and the graph from
    ``args.graph_file_name``, creates a new graph with dummy nodes through
    ``DummyNodeAdder`` and saves it to ``args.out_file_name``.
    """
    vcf_variants = VcfVariants.from_vcf(args.vcf_file_name)
    original_graph = Graph.from_file(args.graph_file_name)
    graph_with_dummies = DummyNodeAdder(
        original_graph, vcf_variants).create_new_graph_with_dummy_nodes()
    graph_with_dummies.to_file(args.out_file_name)