def test_double_deletion_with_snp_inside_first_deletiod_and_false_deletion_path():
    """Check dummy-node edges for two deletions in a graph with a repeated sequence.

    Nodes 5 and 6 of the input graph carry the same sequence.  After
    DummyNodeAdder has processed the two deletion variants, the asserts pin
    the outgoing edges of nodes 1, 2, 3, 4 and of the new nodes 8 and 9.
    """
    repeat_seq = "AGGTCCCAGGTCCATCT"
    node_sequences = {
        1: "TTTT",
        2: "AGGTCC",
        3: "C",
        4: "A",
        5: repeat_seq,
        6: repeat_seq,
    }
    node_edges = {1: [2, 5, 6], 2: [3, 4], 3: [5, 6], 4: [5, 6], 5: [6]}
    linear_ref_nodes = [1, 2, 3, 5, 6]
    graph = Graph.from_dicts(node_sequences, node_edges, linear_ref_nodes)

    first_deletion = VcfVariant(1, 4, "TAGGTCCC", "T", type="DELETION")
    second_deletion = VcfVariant(
        1, 11, "CAGGTCCCAGGTCCATCT", "C", type="DELETION")
    variants = VcfVariants([first_deletion, second_deletion])

    new_graph = DummyNodeAdder(
        graph, variants).create_new_graph_with_dummy_nodes()
    print(new_graph)

    # (node, expected outgoing edges), checked in this order.
    expected_edges = [
        (1, [2, 8]),
        (2, [3, 4]),
        (3, [5, 9]),
        (4, [5, 9]),
        (9, [6]),
        (8, [5, 9]),
    ]
    for node, successors in expected_edges:
        assert list(new_graph.get_edges(node)) == successors
def make_unique_variant_kmers(args):
    """Find unique variant kmers in parallel and write the merged result to file.

    Loads the kmer index, the variant-to-nodes mapping and the graph from the
    files named in ``args`` and publishes each one through
    ``to_shared_memory`` under a fixed name (presumably so the worker
    function can look them up by name -- verify against
    ``make_unique_variant_kmers_single_thread``).  The variants from
    ``args.vcf`` are read as a generator, split into chunks of
    ``args.chunk_size`` and each chunk is handed to a worker together with
    ``args``.  The per-chunk results are merged and written to
    ``args.out_file_name``.

    Args:
        args: Parsed command-line arguments.  Reads ``kmer_index``,
            ``variant_to_nodes``, ``graph``, ``vcf``, ``chunk_size``,
            ``n_threads`` and ``out_file_name``.
    """
    logging.info("Reading kmer index")
    kmer_index = CollisionFreeKmerIndex.from_file(args.kmer_index)
    to_shared_memory(kmer_index, "kmer_index_shared")

    logging.info("Reading variant to nodes")
    variant_to_nodes = VariantToNodes.from_file(args.variant_to_nodes)
    to_shared_memory(variant_to_nodes, "variant_to_nodes_shared")

    logging.info("REading graph")
    graph = Graph.from_file(args.graph)
    to_shared_memory(graph, "graph_shared")

    logging.info("Reading all variants")
    variants = VcfVariants.from_vcf(args.vcf,
                                    skip_index=True,
                                    make_generator=True)
    variant_chunks = variants.get_chunks(chunk_size=args.chunk_size)

    # The context manager guarantees the worker processes are shut down, also
    # when a worker raises; previously the pool was never closed.  starmap
    # blocks until every chunk is done, so all results are available before
    # the pool is torn down.
    with Pool(args.n_threads) as pool:
        all_flat_kmers = list(
            pool.starmap(make_unique_variant_kmers_single_thread,
                         zip(variant_chunks, repeat(args))))

    logging.info("Merge all flat kmers")
    merged_flat = FlatKmers.from_multiple_flat_kmers(all_flat_kmers)
    merged_flat.to_file(args.out_file_name)
    logging.info("Wrote to file %s" % args.out_file_name)
def test_messy_graph():
    """Check variant nodes and edges for four closely spaced variants.

    The graph is built by GraphConstructor from a nine-base reference with
    two deletions, one SNP and one insertion at adjacent positions.  Both
    the plain graph and the graph with dummy nodes are printed before the
    asserts run against ``graph``.
    """
    reference = "GCATATTTT"
    variants = VcfVariants([
        VcfVariant(1, 2, "CAT", "C", type="DELETION"),
        VcfVariant(1, 3, "A", "G", type="SNP"),
        VcfVariant(1, 4, "TA", "T", type="DELETION"),
        VcfVariant(1, 5, "A", "AT", type="INSERTION"),
    ])
    constructor = GraphConstructor(reference, variants)
    graph = constructor.get_graph()
    print(graph)
    print(constructor.get_graph_with_dummy_nodes())

    # (index into variants, expected reference node, expected variant node)
    expected_variant_nodes = [
        (2, 5, 10),
        (0, 3, 9),
        (1, 3, 2),
        (3, 11, 6),
    ]
    for variant_index, expected_ref, expected_var in expected_variant_nodes:
        ref_node, var_node = graph.get_variant_nodes(variants[variant_index])
        assert ref_node == expected_ref
        assert var_node == expected_var

    # (node, expected outgoing edges)
    for node, successors in [(10, [6, 11]), (11, [7]), (9, [5, 10])]:
        assert list(graph.get_edges(node)) == successors
def test_double_deletion_with_snp_inside_first_deletion():
    """Run DummyNodeAdder on a six-node graph with two deletion variants.

    This test contains no asserts: it only verifies that creating the new
    graph does not raise, and prints the result for inspection.
    """
    node_sequences = {1: "ACTG", 2: "A", 3: "C", 4: "T", 5: "AAA", 6: "G"}
    node_edges = {1: [2, 5, 6], 2: [3, 4], 3: [5, 6], 4: [5, 6], 5: [6]}
    linear_ref_nodes = [1, 2, 4, 6]
    graph = Graph.from_dicts(node_sequences, node_edges, linear_ref_nodes)

    deletions = [
        VcfVariant(1, 4, "GAT", "G", type="DELETION"),
        VcfVariant(1, 6, "TAAA", "T", type="DELETION"),
    ]
    variants = VcfVariants(deletions)

    adder = DummyNodeAdder(graph, variants)
    print(adder.create_new_graph_with_dummy_nodes())
def test_overlapping_deletions():
    """Check dummy-node edges and variant nodes for two deletions and a SNP.

    The asserts pin the outgoing edges of nodes 1, 8, 2 and 9 in the new
    graph, and that the second variant maps to reference node 3 and variant
    node 9.  The new graph is printed once all asserts have passed.
    """
    node_sequences = {1: "AA", 2: "TCTG", 3: "TCT", 4: "G", 5: "A", 6: "GG"}
    node_edges = {1: [2, 3], 2: [3, 6], 3: [4, 5], 4: [6], 5: [6]}
    linear_ref_nodes = [1, 2, 3, 5, 6]
    graph = Graph.from_dicts(node_sequences, node_edges, linear_ref_nodes)

    variants = VcfVariants([
        VcfVariant(1, 2, "ATCTG", "A", type="DELETION"),
        VcfVariant(1, 6, "GTCTA", "T", type="DELETION"),
        VcfVariant(1, 10, "A", "G", type="SNP"),
    ])

    adder = DummyNodeAdder(graph, variants)
    new_graph = adder.create_new_graph_with_dummy_nodes()

    # (node, expected outgoing edges)
    expected_edges = [(1, [2, 8]), (8, [3, 9]), (2, [3, 9]), (9, [6])]
    for node, successors in expected_edges:
        assert list(new_graph.get_edges(node)) == successors

    second_deletion = variants[1]
    ref_node, var_node = new_graph.get_variant_nodes(second_deletion)
    assert ref_node == 3
    assert var_node == 9

    print(new_graph)
def test_insertion_with_identical_false_path():
    """Check the dummy node added for an insertion whose sequence equals the next node.

    Nodes 2 and 3 of the input graph carry the same sequence "TCTG".  The
    asserts pin that node 1 gets edges to 2 and the new node 6, and that the
    insertion variant maps to reference node 6 and variant node 2.
    """
    node_sequences = {1: "AA", 2: "TCTG", 3: "TCTG", 4: "GG"}
    node_edges = {
        1: [2, 3],
        2: [3],
        3: [4],
    }
    graph = Graph.from_dicts(node_sequences, node_edges, [1, 3, 4])

    insertion = VcfVariant(1, 2, "A", "ATCTG", type="INSERTION")
    variants = VcfVariants([
        insertion,
    ])

    new_graph = DummyNodeAdder(
        graph, variants).create_new_graph_with_dummy_nodes()
    print(new_graph)

    # (node, expected outgoing edges)
    for node, successors in [(1, [2, 6]), (6, [3]), (2, [3])]:
        assert list(new_graph.get_edges(node)) == successors

    ref_node, var_node = new_graph.get_variant_nodes(variants[0])
    assert ref_node == 6
    assert var_node == 2
def make(args):
    """Create a graph and write it to ``args.out_file_name``.

    When ``args.vcf`` is None the graph is read from the vg json files in
    ``args.vg_json_files``.  Otherwise it is constructed, with dummy nodes,
    from the sequence of ``args.chromosome`` in ``args.reference_fasta_file``
    and the variants of that chromosome in ``args.vcf``.

    Args:
        args: Parsed command-line arguments.
    """
    if args.vcf is None:
        logging.info("Will create from files %s" % args.vg_json_files)
        Graph.from_vg_json_files(args.vg_json_files).to_file(
            args.out_file_name)
        return

    logging.info("Will create from vcf file")
    reference = Fasta(args.reference_fasta_file)
    chromosome = args.chromosome

    # The vcf is filtered on "23"/"24" for X/Y, while the fasta lookup below
    # still uses the original chromosome name.
    sex_chromosome_numbers = {"X": "23", "Y": "24"}
    numeric_chromosome = sex_chromosome_numbers.get(chromosome, chromosome)

    variants = VcfVariants.from_vcf(args.vcf,
                                    limit_to_chromosome=numeric_chromosome)
    ref_sequence = str(reference[args.chromosome])

    sequence_message = "Extracted sequence for chromosome %s. Length is: %d" % (
        chromosome, len(ref_sequence))
    logging.info(sequence_message)
    logging.info("There are %d variants in chromosome" % len(variants))

    graph = GraphConstructor(ref_sequence,
                             variants).get_graph_with_dummy_nodes()
    graph.to_file(args.out_file_name)
def make_variant_to_nodes(args):
    """Build a VariantToNodes mapping from a graph and a vcf and save it.

    Args:
        args: Parsed command-line arguments.  Reads ``graph``, ``vcf`` and
            ``out_file_name``.
    """
    from .variant_to_nodes import VariantToNodes

    graph = Graph.from_file(args.graph)
    variants = VcfVariants.from_vcf(args.vcf)

    VariantToNodes.from_graph_and_variants(graph, variants).to_file(
        args.out_file_name)

    written_message = "Wrote to file %s" % args.out_file_name
    logging.info(written_message)
def make_haplotype_to_nodes(args):
    """Build a HaplotypeToNodes object from a graph and a vcf and save it.

    Args:
        args: Parsed command-line arguments.  Reads ``graph_file_name``,
            ``vcf_file_name``, ``n_haplotypes`` and ``out_file_name``.
    """
    graph = Graph.from_file(args.graph_file_name)
    variants = VcfVariants.from_vcf(args.vcf_file_name)

    result = HaplotypeToNodes.from_graph_and_variants(graph, variants,
                                                      args.n_haplotypes)

    logging.info("Saving to file")
    out_file_name = args.out_file_name
    result.to_file(out_file_name)
    logging.info("Wrote to file %s" % out_file_name)
def validate_graph(args):
    """Look up the graph nodes of every variant in a vcf.

    Nothing is returned or compared; the lookup is simply performed for
    each variant, with progress logged every 1000 variants.  Presumably a
    failed lookup raises inside ``get_variant_nodes`` -- TODO confirm.

    Args:
        args: Parsed command-line arguments.  Reads ``vcf`` and ``graph``.
    """
    variants = VcfVariants.from_vcf(args.vcf)
    graph = Graph.from_file(args.graph)

    for n_seen, variant in enumerate(variants):
        if not n_seen % 1000:
            logging.info("%d variants processed" % n_seen)

        # Only the lookup itself matters here; both nodes are discarded.
        _ref_node, _var_node = graph.get_variant_nodes(variant)
def test_deletion_with_snp_at_end():
    """Construct a graph for a deletion plus a SNP on its last deleted base.

    This test contains no asserts: it only verifies that graph construction
    does not raise, and prints the graph for inspection.
    """
    deletion = VcfVariant(1, 4, "GTCTA", "G", type="DELETION")
    snp = VcfVariant(1, 8, "A", "T", type="SNP")
    variants = VcfVariants([deletion, snp])

    constructor = GraphConstructor("TCTGTCTAGG", variants)
    print(constructor.get_graph())
def test_insertion_with_snp_right_before_and_right_after():
    """Construct a graph for an insertion with SNPs at positions 2 and 3.

    This test contains no asserts: it only verifies that graph construction
    does not raise, and prints the graph for inspection.
    """
    snp_before = VcfVariant(1, 2, "A", "T", type="SNP")
    insertion = VcfVariant(1, 2, "A", "AC", type="INSERTION")
    snp_after = VcfVariant(1, 3, "A", "C", type="SNP")
    variants = VcfVariants([snp_before, insertion, snp_after])

    constructor = GraphConstructor("AAAAAA", variants)
    print(constructor.get_graph())
def test_deletion_with_snp_right_before_and_right_after():
    """Construct graphs for a deletion with SNPs at positions 2 and 3.

    Both the plain graph and the graph with dummy nodes are built before
    either is printed.  This test contains no asserts: it only verifies
    that construction does not raise.
    """
    snp_before = VcfVariant(1, 2, "A", "T", type="SNP")
    deletion = VcfVariant(1, 2, "AA", "A", type="DELETION")
    snp_after = VcfVariant(1, 3, "A", "C", type="SNP")
    variants = VcfVariants([snp_before, deletion, snp_after])

    constructor = GraphConstructor("AAAAAA", variants)
    plain_graph = constructor.get_graph()
    dummy_graph = constructor.get_graph_with_dummy_nodes()

    for built_graph in (plain_graph, dummy_graph):
        print(built_graph)
def test_single_insertion():
    """Check the edges of a graph with dummy nodes for one insertion.

    The asserts pin that node 1 leads to nodes 2 and 5, node 5 leads to
    node 3, and node 3 has no outgoing edges.
    """
    insertion = VcfVariant(1, 2, "A", "AAA", type="INSERTION")
    constructor = GraphConstructor("AATTGG", VcfVariants([insertion]))
    graph = constructor.get_graph_with_dummy_nodes()
    print(graph)

    # (node, expected outgoing edges)
    for node, successors in [(1, [2, 5]), (5, [3]), (3, [])]:
        assert list(graph.get_edges(node)) == successors
def add_allele_frequencies(args):
    """Set allele frequencies on a graph from a vcf and overwrite the graph file.

    The modified graph is written back to ``args.graph_file_name``, i.e. the
    same file it was read from.

    Args:
        args: Parsed command-line arguments.  Reads ``graph_file_name``,
            ``vcf_file_name`` and ``chromosome``.
    """
    graph_file_name = args.graph_file_name

    logging.info("Reading graph")
    graph = Graph.from_file(graph_file_name)

    variants = VcfVariants.from_vcf(
        args.vcf_file_name,
        limit_to_chromosome=args.chromosome,
        skip_index=True,
    )

    # Chromosome 1 is used because the graph is always assumed to be a
    # single-chromosome graph.
    graph.set_allele_frequencies_from_variants(variants, use_chromosome=1)

    graph.to_file(graph_file_name)
    logging.info("Wrote modified graph to the same file %s" % graph_file_name)
def test_double_deletion_with_snp_inside_first_deletion():
    """Construct graphs for two deletions with a SNP at the second deletion's position.

    Prints the nodes found at reference offsets 0 and 1 of the plain graph,
    then the graph with dummy nodes.  This test contains no asserts: it only
    verifies that construction and the lookups do not raise.
    """
    variants = VcfVariants([
        VcfVariant(1, 4, "GAT", "G", type="DELETION"),
        VcfVariant(1, 6, "T", "C", type="SNP"),
        VcfVariant(1, 6, "TAAA", "T", type="DELETION"),
    ])
    constructor = GraphConstructor("ACTGATAAAG", variants)

    graph = constructor.get_graph()
    for ref_offset in (0, 1):
        print(graph.get_node_at_ref_offset(ref_offset))

    print(constructor.get_graph_with_dummy_nodes())
def make_random_haplotypes(args):
    """Create random haplotypes, re-derive them by graph traversal, and save them.

    HaplotypeToNodes is first made from ``args.n_haplotypes`` random
    haplotypes; a new object is then obtained from it by traversing the
    graph, and that new object is what gets written to
    ``args.out_file_name``.

    Args:
        args: Parsed command-line arguments.  Reads ``graph``,
            ``vcf_file_name``, ``n_haplotypes``, ``no_frequency_weighting``
            and ``out_file_name``.
    """
    graph = Graph.from_file(args.graph)
    variants = VcfVariants.from_vcf(args.vcf_file_name, skip_index=True)

    # The command-line flag is negative, the keyword argument is positive.
    use_frequency_weights = not args.no_frequency_weighting
    random_haplotypes = HaplotypeToNodes.make_from_n_random_haplotypes(
        graph,
        variants,
        n_haplotypes=args.n_haplotypes,
        weight_by_allele_frequency=use_frequency_weights,
    )

    logging.info(
        "Making new haplotypenodes by traversing full graph for each haplotype"
    )
    traversed = random_haplotypes.get_new_by_traversing_graph(
        graph, args.n_haplotypes)

    traversed.to_file(args.out_file_name)
    logging.info("Wrote haplotypenodes to %s" % args.out_file_name)
def test_simple_insertion():
    """Check the dummy node added for a single insertion in a three-node graph.

    The asserts pin that the new node 5 leads to node 3, node 1 leads to
    nodes 2 and 5, and node 2 still leads to node 3.
    """
    node_sequences = {1: "ACTG", 2: "C", 3: "AAAA"}
    node_edges = {1: [2, 3], 2: [3]}
    graph = Graph.from_dicts(node_sequences, node_edges, [1, 3])

    insertion = VcfVariant(1, 4, "G", "GC", type="INSERTION")
    variants = VcfVariants([insertion])

    new_graph = DummyNodeAdder(
        graph, variants).create_new_graph_with_dummy_nodes()

    # (node, expected outgoing edges)
    for node, successors in [(5, [3]), (1, [2, 5]), (2, [3])]:
        assert new_graph.node_has_edges(node, successors)
def simple_test():
    """Run UniqueVariantKmersFinder on a small graph with an empty node.

    Prints the graph's ``ref_offset_to_node`` and the size of node 3, then
    searches for unique kmers with k=4.  There are no asserts; the function
    only exercises the code path.
    """
    kmer_size = 4

    node_sequences = {1: "CTACCA", 2: "AA", 3: "TAAATAA", 4: ""}
    node_edges = {1: [2, 4], 2: [3], 4: [3]}
    graph = Graph.from_dicts(node_sequences, node_edges, [1, 2, 3])
    print(graph.ref_offset_to_node)
    print(graph.get_node_size(3))

    # NOTE(review): all five VcfVariant arguments are positional here --
    # confirm the order against the constructor signature.
    deletion = VcfVariant(6, "AAA", "A", "", "DELETION")
    variants = VcfVariants([deletion])

    reference_kmers = ReferenceKmerIndex.from_sequence(
        "CTACCAAATAAATAA", kmer_size)
    UniqueVariantKmersFinder(graph, reference_kmers, variants,
                             kmer_size).find_unique_kmers()
def test_single_deletion():
    """Check edges, sequences and linear reference for a single deletion.

    The asserts pin that node 1 leads to nodes 2 and 5, node 2 leads to
    node 3, node 5 has an empty sequence, and the linear reference consists
    of nodes 1, 2 and 3.
    """
    logging.info("\n\nTest single deletion")

    deletion = VcfVariant(1, 2, "ATT", "A", type="DELETION")
    constructor = GraphConstructor("AATTGG", VcfVariants([deletion]))
    graph = constructor.get_graph()
    print(graph)

    # (node, expected outgoing edges)
    for node, successors in [(1, [2, 5]), (2, [3])]:
        assert list(graph.get_edges(node)) == successors

    # (node, expected sequence)
    for node, sequence in [(5, ""), (2, "TT"), (1, "AA"), (3, "GG")]:
        assert graph.get_node_sequence(node) == sequence

    assert graph.linear_ref_nodes() == {1, 2, 3}
def test_insertion_with_snp_right_before():
    """Check the dummy-node graph for a SNP and an insertion at the same position.

    The asserts pin the outgoing edges of nodes 1, 2, 3, 4, 5 and 7, the
    sequences of nodes 5, 7, 4 and 2 (node 7 being empty), and that the
    linear reference consists of nodes 1, 3 and 5.
    """
    snp = VcfVariant(1, 2, "A", "T", type="SNP")
    insertion = VcfVariant(1, 2, "A", "AC", type="INSERTION")
    constructor = GraphConstructor("AAAAAA", VcfVariants([snp, insertion]))
    graph = constructor.get_graph_with_dummy_nodes()

    # (node, expected outgoing edges)
    expected_edges = [
        (1, [2, 3]),
        (2, [4, 7]),
        (3, [4, 7]),
        (4, [5]),
        (5, []),
        (7, [5]),
    ]
    for node, successors in expected_edges:
        assert list(graph.get_edges(node)) == successors

    # (node, expected sequence)
    expected_sequences = [(5, "AAAA"), (7, ""), (4, "C"), (2, "T")]
    for node, sequence in expected_sequences:
        assert graph.get_node_sequence(node) == sequence

    assert graph.linear_ref_nodes() == {1, 3, 5}
def make_genotype_matrix(args):
    """Create a GenotypeMatrix and write it to ``args.out_file_name``.

    When ``args.node_to_haplotypes`` is None the matrix is made directly
    from the vcf variants; otherwise it is made from a NodeToHaplotypes file
    together with the graph and the variants.

    Args:
        args: Parsed command-line arguments.
    """
    from .genotype_matrix import GenotypeMatrix

    variants = VcfVariants.from_vcf(
        args.vcf_file_name,
        skip_index=True,
        limit_to_n_lines=None,
        make_generator=True,
    )

    if args.node_to_haplotypes is None:
        logging.info("Making genotype matrix directly from vcf")
        matrix = GenotypeMatrix.from_variants(
            variants,
            args.n_individuals,
            args.n_variants,
            n_threads=args.n_threads,
            chunk_size=args.chunk_size,
        )
    else:
        graph = Graph.from_file(args.graph)
        nodes_to_haplotypes = NodeToHaplotypes.from_file(
            args.node_to_haplotypes)
        matrix = GenotypeMatrix.from_nodes_to_haplotypes_and_variants(
            nodes_to_haplotypes, variants, graph, args.n_individuals)

    matrix.to_file(args.out_file_name)
def test_tricky_case_nested_deletions():
    """Check dummy-node edges for two deletions and a SNP in a nine-node graph.

    The asserts pin that nodes 1 and 2 each gain an edge to a new node
    (11 and 12 respectively) and that both new nodes lead to node 6.
    """
    node_sequences = {
        1: "TATAT",
        2: "AT",
        3: "A",
        4: "T",
        5: "A",
        6: "A",
        7: "T",
        8: "A",
        9: "GG",
    }
    node_edges = {
        1: [2, 6],
        2: [3, 6],
        3: [4, 5],
        4: [6],
        5: [6],
        6: [7, 8],
        7: [9],
        8: [9],
    }
    linear_ref_nodes = [1, 2, 3, 5, 6, 8, 9]
    graph = Graph.from_dicts(node_sequences, node_edges, linear_ref_nodes)

    variants = VcfVariants([
        VcfVariant(1, 5, "TATAA", "T", type="DELETION"),
        VcfVariant(1, 7, "TAA", "T", type="DELETION"),
        VcfVariant(1, 5, "A", "T", type="SNP"),
    ])

    new_graph = DummyNodeAdder(
        graph, variants).create_new_graph_with_dummy_nodes()
    print(new_graph)

    # (node, expected outgoing edges)
    expected_edges = [(1, [2, 11]), (2, [3, 12]), (11, [6]), (12, [6])]
    for node, successors in expected_edges:
        assert list(new_graph.get_edges(node)) == successors
def test_insertion_with_multiple_paths():
    """Check the dummy node added for one insertion in a six-node graph.

    The asserts pin that node 1 leads to node 2 and the new node 8, and that
    node 8 leads to node 3.  The new graph is printed once the asserts have
    passed.
    """
    node_sequences = {1: "AAAG", 2: "GAGT", 3: "GA", 4: "C", 5: "G", 6: "T"}
    node_edges = {1: [2, 3], 2: [3], 3: [4, 5], 4: [6], 5: [6]}
    graph = Graph.from_dicts(node_sequences, node_edges, [1, 3, 5, 6])

    insertion = VcfVariant(1, 4, "G", "GGAGT", type="INSERTION")
    variants = VcfVariants([insertion])

    adder = DummyNodeAdder(graph, variants)
    new_graph = adder.create_new_graph_with_dummy_nodes()

    # (node, expected outgoing edges)
    for node, successors in [(1, [2, 8]), (8, [3])]:
        assert list(new_graph.get_edges(node)) == successors

    print(new_graph)
def test_single_snp():
    """Check edges, node sizes, sequences and linear reference for one SNP.

    The asserts pin a four-node graph: node 1 branches to nodes 2 and 3,
    which both lead to node 4, and the linear reference consists of nodes
    1, 3 and 4.
    """
    snp = VcfVariant(1, 2, "C", "T", type="SNP")
    graph = GraphConstructor("ACTGGG", VcfVariants([snp])).get_graph()

    # (node, expected outgoing edges); the actual edges are the assert message.
    expected_edges = [(1, [2, 3]), (2, [4]), (3, [4]), (4, [])]
    for node, successors in expected_edges:
        assert list(graph.get_edges(node)) == successors, graph.get_edges(node)

    # (node, expected size)
    for node, size in [(1, 1), (2, 1), (3, 1), (4, 4)]:
        assert graph.get_node_size(node) == size

    # (node, expected sequence)
    for node, sequence in [(1, "A"), (2, "T"), (3, "C"), (4, "TGGG")]:
        assert graph.get_node_sequence(node) == sequence

    assert graph.linear_ref_nodes() == {1, 3, 4}
def add_indel_nodes2(args):
    """Add dummy nodes to a graph with DummyNodeAdder and save the new graph.

    Args:
        args: Parsed command-line arguments.  Reads ``vcf_file_name``,
            ``graph_file_name`` and ``out_file_name``.
    """
    variants = VcfVariants.from_vcf(args.vcf_file_name)
    graph = Graph.from_file(args.graph_file_name)

    graph_with_dummy_nodes = DummyNodeAdder(
        graph, variants).create_new_graph_with_dummy_nodes()
    graph_with_dummy_nodes.to_file(args.out_file_name)