Example #1
0
    def simple_test():
        """Find minimizers on a four-node bubble graph and check three of them.

        NOTE(review): the linear reference below follows nodes [1, 2, 3]
        although the edges only connect 2 -> 4 and 3 -> 4 -- confirm that
        this path is what the test intends.
        """
        blocks = {1: Block(10), 2: Block(1), 3: Block(1), 4: Block(10)}
        edges = {1: [2, 3], 2: [4], 3: [4]}
        graph = Graph(blocks, edges)
        graph.convert_to_numpy_backend()

        sequence_graph = SequenceGraph.create_empty_from_ob_graph(graph)
        node_sequences = (
            (1, "GGGTTTATAC"),
            (2, "A"),
            (3, "C"),
            (4, "GTACATTGTA"),
        )
        for node_id, bases in node_sequences:
            sequence_graph.set_sequence(node_id, bases)

        reference_interval = Interval(0, 10, [1, 2, 3], graph)
        linear_ref = reference_interval.to_numpy_indexed_interval()

        critical_nodes = {4}

        finder = MinimizerFinder(
            graph, sequence_graph, critical_nodes, linear_ref, k=3, w=3)
        minimizers = finder.find_minimizers()

        assert minimizers.has_minimizer(2, 0)
        assert minimizers.has_minimizer(3, 0)
        assert minimizers.has_minimizer(4, 4)
    def setUp(self):
        """Create a linear three-node graph with full-coverage scores, plus a merging graph."""
        linear_blocks = {node: Block(5) for node in range(1, 4)}
        linear_edges = {node: [node + 1] for node in range(1, 3)}
        self.linear_graph = Graph(linear_blocks, linear_edges)

        # One interval covering each node of the linear graph completely.
        whole_node_intervals = [Interval(0, 5, [node]) for node in range(1, 4)]
        self.scores = DensePileup.from_intervals(self.linear_graph,
                                                 whole_node_intervals)

        # NOTE(review): edge 3 -> 4 points at a node that is not among the
        # blocks (1..3) -- confirm whether a fourth block was intended.
        merge_blocks = {node: Block(5) for node in range(1, 4)}
        merge_edges = {1: [3], 2: [3], 3: [4]}
        self.graph = Graph(merge_blocks, merge_edges)
Example #3
0
    def test_find_max_path_on_start_and_end_node(self):
        """A peak covering all of nodes 2 and 4 should give a max path over [2, 4]."""
        blocks = {node: Block(10) for node in (1, 2, 3, 4)}
        edges = {1: [2, 3], 2: [4], 3: [4]}
        graph = Graph(blocks, edges)

        areas = ConnectedAreas(graph, {2: [0, 10], 4: [0, 10]})
        binary_peak = BinaryContinousAreas.from_old_areas(areas)

        covered = [Interval(7, 2, [1, 2, 4])]
        qvalues = DensePileup.from_intervals(graph, covered)

        scored_peak = ScoredPeak.from_peak_and_pileup(binary_peak, qvalues)
        found = scored_peak.get_max_path()

        self.assertEqual(found, Interval(0, 10, [2, 4]))
Example #4
0
    def test_find_max_path_through_subgraph_multiple_paths(self):
        """With two possible branches, the max path should follow the one with q-values."""
        blocks = {node: Block(10) for node in (1, 2, 3, 4)}
        edges = {1: [2, 3], 2: [4], 3: [4]}
        graph = Graph(blocks, edges)

        peak_areas = {
            2: [0, 10],
            3: [0, 10],
            1: [5, 10],
            4: [0, 3],
        }
        binary_peak = BinaryContinousAreas.from_old_areas(
            ConnectedAreas(graph, peak_areas))

        # Only the path through node 3 is covered, giving it the higher q-value.
        favoured_path = Interval(7, 2, [1, 3, 4])
        qvalues = DensePileup.from_intervals(graph, [favoured_path])
        print(qvalues)

        scored_peak = ScoredPeak.from_peak_and_pileup(binary_peak, qvalues)
        print(scored_peak)

        found = scored_peak.get_max_path()
        self.assertEqual(found, Interval(5, 3, [1, 3, 4]))
    def test_create_from_nongraphpeakcollection(self):
        """Linear peaks should convert to graph intervals, with and without a linear region."""
        graph = Graph({node: Block(10) for node in (1, 2, 3)},
                      {1: [2], 2: [3]})
        graph.convert_to_numpy_backend()
        path_interval = Interval(0, 10, [1, 2, 3], graph)
        linear_path = path_interval.to_numpy_indexed_interval()

        linear_peaks = [
            NonGraphPeak("chr1", 3, 10, 5),
            NonGraphPeak("chr1", 13, 15, 7),
        ]
        nongraph_peaks = NonGraphPeakCollection(linear_peaks)

        # First conversion: no linear region is supplied.
        without_region = PeakCollection.create_from_nongraph_peak_collection(
            graph, nongraph_peaks, linear_path, None)
        self.assertEqual(without_region.intervals[0], Interval(3, 10, [1]))
        self.assertEqual(without_region.intervals[1], Interval(3, 5, [2]))

        # Second conversion: a region starting at 3 is supplied; the expected
        # intervals correspond to the peak coordinates shifted down by 3.
        region = LinearRegion("chr1", 3, 20)
        with_region = PeakCollection.create_from_nongraph_peak_collection(
            graph, nongraph_peaks, linear_path, region)
        self.assertEqual(with_region.intervals[0], Interval(0, 7, [1]))
        self.assertEqual(with_region.intervals[1], Interval(0, 2, [2]))
    def test_three_nodes_in(self):
        """Three nodes merging into node 4 should form one connected subgraph."""
        blocks = {node: Block(5) for node in range(1, 5)}
        edges = {1: [4], 2: [4], 3: [4]}
        graph = Graph(blocks, edges)

        # The last three bases of nodes 1-3 and the first three of node 4.
        covered = [Interval(2, 5, [node]) for node in (1, 2, 3)]
        covered.append(Interval(0, 3, [4]))
        pileup = DensePileup.from_intervals(graph, covered)

        subgraphs = SubgraphCollectionPartiallyOrderedGraph.create_from_pileup(
            graph, pileup)
        print(subgraphs)

        correct1 = BinaryContinousAreas(graph)
        for node in (-1, -2, -3, 4):
            correct1.add_start(node, 3)

        self.assertTrue(correct1 in subgraphs)
    def test_simple3(self):
        """One of the two expected max paths should be found among the scored subgraphs."""
        blocks = {node: Block(5) for node in range(1, 6)}
        edges = {1: [3], 2: [3], 3: [4, 5]}
        graph = Graph(blocks, edges)

        # Scores: every node is covered completely by one interval.
        full_cover = [Interval(0, 5, [node]) for node in range(1, 6)]
        scores = DensePileup.from_intervals(graph, full_cover)

        # Pileup: nodes 1, 3 and 4 entirely, and the first three bases of node 5.
        peak_cover = [Interval(0, 5, [node]) for node in (1, 3, 4)]
        peak_cover.append(Interval(0, 3, [5]))
        pileup = DensePileup.from_intervals(graph, peak_cover)

        subgraphs = SubgraphCollectionPartiallyOrderedGraph.create_from_pileup(
            graph, pileup)
        max_paths = [
            ScoredPeak.from_peak_and_pileup(subgraph, scores).get_max_path()
            for subgraph in subgraphs
        ]

        self.assertTrue(
            Interval(0, 5, [1, 3, 4]) in max_paths
            or Interval(0, 3, [1, 3, 5]) in max_paths)
Example #8
0
 def __init__(self, tf_experiment_dir, data_dir):
     """Load alignments, linear path and graph from the two directories, then check peaks.

     tf_experiment_dir: directory holding "linear_alignments.bam" and
         "5_alignments.pickle".
     data_dir: directory holding "5_linear_pathv2.interval" and "5.nobg".
     """
     self.experiment_dir = tf_experiment_dir
     self.data_dir = data_dir

     bam_path = self.experiment_dir + "/linear_alignments.bam"
     self.bam_file = pysam.AlignmentFile(bam_path, "rb")

     linear_path_file = self.data_dir + "/5_linear_pathv2.interval"
     self.linear_path = NumpyIndexedInterval.from_file(linear_path_file)

     graph_file = self.data_dir + "/5.nobg"
     self.graph = Graph.from_file(graph_file)

     alignments_file = self.experiment_dir + "/5_alignments.pickle"
     self.alignment_collection = AlignmentCollection.from_file(
         alignments_file, self.graph)

     self.check_peaks()
Example #9
0
def test_reverse():
    """A reverse-strand alignment over nodes -3, -1 should project to (5, 16, "-")."""
    blocks = {1: Block(10), 2: Block(5), 3: Block(10), 4: Block(5)}
    edges = {1: [2, 3], 2: [4], 3: [4]}
    graph = Graph(blocks, edges)
    graph.convert_to_numpy_backend()

    reference = Interval(0, 10, [1, 2, 4], graph)
    linear_path = NumpyIndexedInterval.from_interval(reference)

    reverse_alignment = Interval(4, 5, [-3, -1], graph)
    projected = list(project_alignments([reverse_alignment], linear_path))

    assert projected[0] == (5, 16, "-")
 def set_graph(self):
     """Set ``self.graph`` to a linear graph 1 -> 2 -> 3 with blocks of length 5."""
     blocks = {node: Block(5) for node in (1, 2, 3)}
     edges = {1: [2], 2: [3]}
     self.graph = Graph(blocks, edges)
Example #11
0
def test_simple():
    """Two forward alignments through node 3 should project to consecutive linear ranges."""
    blocks = {1: Block(10), 2: Block(5), 3: Block(10), 4: Block(5)}
    edges = {1: [2, 3], 2: [4], 3: [4]}
    graph = Graph(blocks, edges)
    graph.convert_to_numpy_backend()

    reference = Interval(0, 10, [1, 2, 4], graph)
    linear_path = NumpyIndexedInterval.from_interval(reference)

    alignments = [
        Interval(5, 5, [1, 3], graph),
        Interval(5, 5, [3, 4], graph),
    ]
    projected = list(project_alignments(alignments, linear_path))

    assert projected[0] == (5, 15, "+")
    assert projected[1] == (15, 25, "+")
    def setUp(self):
        """Create a six-node linear graph (blocks of length 3) and two peaks on it."""
        blocks = {node: Block(3) for node in range(1, 7)}
        edges = {node: [node + 1] for node in range(1, 6)}
        self.graph = Graph(blocks, edges)

        first_peak = Peak(3, 3, [1, 2, 3, 4], self.graph)
        second_peak = Peak(3, 3, [5, 6], self.graph)
        self.peaks = PeakCollection([first_peak, second_peak])
Example #13
0
    def test_many_nodes():
        """Run the minimizer finder on a graph with three consecutive bubbles.

        Only prints the number of minimizers found; nothing is asserted.
        """
        # Single-base nodes 2..9 between two ten-base end nodes.
        nodes = {node_id: Block(1) for node_id in range(2, 10)}
        nodes[1] = Block(10)
        nodes[10] = Block(10)

        edges = {
            1: [2, 3],
            2: [4],
            3: [4],
            4: [5, 6],
            5: [7],
            6: [7],
            7: [8, 9],
            8: [10],
            9: [10],
        }
        graph = Graph(nodes, edges)
        graph.convert_to_numpy_backend()

        sequence_graph = SequenceGraph.create_empty_from_ob_graph(graph)
        node_sequences = (
            (1, "ACTGACTGAC"),
            (10, "ACTGACTGAC"),
            (2, "A"),
            (3, "C"),
            (4, "A"),
            (5, "G"),
            (6, "C"),
            (7, "T"),
            (8, "A"),
            (9, "A"),
        )
        for node_id, bases in node_sequences:
            sequence_graph.set_sequence(node_id, bases)

        reference = Interval(0, 10, [1, 2, 4, 6, 7, 8, 10], graph)
        linear_ref = reference.to_numpy_indexed_interval()
        critical_nodes = {1, 4, 7, 10}

        finder = MinimizerFinder(
            graph, sequence_graph, critical_nodes, linear_ref, k=3, w=3)
        minimizers = finder.find_minimizers()
        print(len(minimizers.minimizers))
Example #14
0
    def setUp(self):
        """Create an eight-node graph with two bubbles and the snarls describing it."""
        blocks = {node: Block(3) for node in range(1, 9)}
        edges = {
            1: [2, 3],
            2: [4],
            3: [4],
            4: [5],
            5: [6, 7],
            6: [8],
            7: [8],
        }
        self.simple_graph = Graph(blocks, edges)
        print(self.simple_graph.get_first_blocks())
        print(self.simple_graph.reverse_adj_list)

        # Snarls keyed by their own id: (start node, end node, id).
        snarls = (
            SimpleSnarl(1, 4, 20),
            SimpleSnarl(5, 8, 21),
            SimpleSnarl(4, 5, 22),
        )
        self.simple_snarls = dict(zip((20, 21, 22), snarls))
 def set_graph(self):
     """Set ``self.graph`` to a single-bubble graph: 1 -> {2, 3} -> 4, blocks of length 5."""
     blocks = {node: Block(5) for node in (1, 2, 3, 4)}
     edges = {1: [2, 3], 2: [4], 3: [4]}
     self.graph = Graph(blocks, edges)
Example #16
0
def macs_to_graph_peaks(folder):
    """Convert per-chromosome MACS summit fasta files in ``folder`` to graph peak files.

    For chromosomes "1" to "5", reads
    ``<folder>/macs_sequences_chr<N>_summits_unique.fasta`` against the graph
    under /data/bioinf/tair2/ and writes
    ``<folder>/<N>_macs_unique_graph_summits.intervalcollection``.
    """
    data_prefix = "/data/bioinf/tair2/"
    for chrom in ["1", "2", "3", "4", "5"]:
        # NOTE(review): the linear path is loaded but never used below.
        path = NumpyIndexedInterval.from_file(
            data_prefix + chrom + "_linear_pathv2.interval")
        graph = Graph.from_file(data_prefix + chrom + ".nobg")

        fasta_name = folder + "/macs_sequences_chr%s_summits_unique.fasta" % chrom
        macs_peaks = PeakCollection.from_fasta_file(fasta_name, graph)

        out_name = folder + "/%s_macs_unique_graph_summits.intervalcollection" % chrom
        macs_peaks.to_file(out_name, True)
    def test_overlapping_alt_loci(self):
        """Connecting overlapping alt loci should give the expected nine-node structure."""
        chrom_file = "data/chrom.sizes.test"
        alt_loci = "data/alt_loci_test"

        graph = create_initial_grch38_graph(chrom_file)
        numeric_graph, name_translation = convert_to_numeric_graph(graph)

        # The initial graph is expected to have three blocks and no edges.
        self.assertEqual(len(graph.blocks), 3)
        nodes_with_edges = [
            node for node, targets in graph.adj_list.items() if targets
        ]
        self.assertEqual(len(nodes_with_edges), 0)

        new_numeric_graph, numeric_translation = connect_without_flanks(
            numeric_graph, alt_loci, name_translation)

        expected_blocks = {node: Block(1) for node in range(1, 10)}
        expected_edges = {
            1: [2, 8],
            2: [3, 9],
            3: [4],
            4: [5],
            5: [6],
            6: [7],
            9: [5],
            8: [6],
        }
        correct_graph_structure = Graph(expected_blocks, expected_edges)

        self.assertTrue(
            correct_graph_structure.has_identical_structure(new_numeric_graph))
def run_predict_path(args):
    """Predict a path for every chromosome in parallel, then merge and index the output.

    Steps:
      1. Start one process per chromosome running
         ``run_predict_path_single_chromosome`` and wait for all of them.
      2. Concatenate ``<out_file_name>_<chromosome>.fasta`` into
         ``<out_file_name>.fa``.
      3. For each chromosome, read
         ``<out_file_name>_<chromosome>.intervalcollection`` (which must hold
         exactly one interval) and write it as an indexed interval to the
         same name with ``.indexed`` appended.
      4. Run ``run_bwa_index`` on the merged fasta unless
         ``args.skip_bwa_index`` is set.

    ``args`` is expected to provide: ``chromosomes`` (comma-separated string),
    ``alignments``, ``data_dir``, ``linear_ref_bonus``, ``out_file_name``,
    ``max_nodes_to_traverse`` and ``skip_bwa_index``.

    Exits the interpreter with status 1 if ``args.alignments`` is not an
    existing file.
    """
    chromosomes = args.chromosomes.split(",")
    processes = []
    if not os.path.isfile(args.alignments):
        logging.error("Input alignments file %s does not exist" %
                      args.alignments)
        # Non-zero status so that calling scripts can detect the failure.
        sys.exit(1)

    for chromosome in chromosomes:
        logging.info("Starting process for chromosome %s " % chromosome)
        process = Process(target=run_predict_path_single_chromosome,
                          args=(args.alignments, chromosome, args.data_dir,
                                args.linear_ref_bonus, args.out_file_name,
                                args.max_nodes_to_traverse))
        process.start()
        processes.append(process)

    for process in processes:
        process.join()

    # Merge all fasta files that were produced. The context manager makes
    # sure the merged file is flushed and closed before bwa index reads it.
    merged_fasta_name = args.out_file_name + ".fa"
    logging.info("Merging fasta files")
    with open(merged_fasta_name, "w") as out_fasta:
        for chromosome in tqdm(chromosomes):
            with open(args.out_file_name + "_" + chromosome + ".fasta") as f:
                out_fasta.write(f.read())

    logging.info("Wrote resulting linear reference to %s" % merged_fasta_name)

    # Create indexed intervals for each interval file that was produced
    logging.info("Creating indexed interval for all chromosomes")
    for chromosome in chromosomes:
        file_name = args.out_file_name + "_" + chromosome + ".intervalcollection"
        graph = Graph.from_file(args.data_dir + chromosome + ".nobg")
        intervals = IntervalCollection.from_file(file_name,
                                                 text_file=True,
                                                 graph=graph)
        intervals = list(intervals.intervals)
        assert len(
            intervals) == 1, "Only a single interval in file is supported"
        interval = intervals[0]
        indexed = interval.to_numpy_indexed_interval()
        indexed.to_file(file_name + ".indexed")
        logging.info("Wrote indexed interval to file %s" % file_name +
                     ".indexed")

    if not args.skip_bwa_index:
        logging.info("Running bwa index")
        run_bwa_index(merged_fasta_name)
    else:
        logging.info("Not creating bwa index")
Example #19
0
def test_many_nodes():
    """Align a read through three bubbles and check the node path that is chosen."""
    # Single-base nodes 2..9 between two ten-base end nodes.
    nodes = {node_id: Block(1) for node_id in range(2, 10)}
    nodes[1] = Block(10)
    nodes[10] = Block(10)

    edges = {
        1: [2, 3],
        2: [4],
        3: [4],
        4: [5, 6],
        5: [7],
        6: [7],
        7: [8, 9],
        8: [10],
        9: [10],
    }
    graph = Graph(nodes, edges)
    graph.convert_to_numpy_backend()

    sequence_graph = SequenceGraph.create_empty_from_ob_graph(graph)
    node_sequences = (
        (1, "ACTGACTGAC"),
        (10, "ACTGACTGAC"),
        (2, "A"),
        (3, "C"),
        (4, "A"),
        (5, "G"),
        (6, "C"),
        (7, "T"),
        (8, "T"),
        (9, "A"),
    )
    for node_id, bases in node_sequences:
        sequence_graph.set_sequence(node_id, bases)

    linear_ref_nodes = {1, 2, 4, 6, 7, 8, 10}
    read_sequence = "ACTGACCAGTAACTGAC"
    start_node, start_offset = 1, 4

    aligner = LocalGraphAligner(graph, sequence_graph, read_sequence,
                                linear_ref_nodes, start_node, start_offset)
    alignment, score = aligner.align()

    assert alignment == [1, 3, 4, 5, 7, 9, 10]
Example #20
0
def visualize_alt_locus(args, skip_wrapping=False, quiet=False):
    """Build an HTML visualization of the subgraph around ``args.alt_locus``.

    ``args.translation_file_name`` may be a ``Translation`` instance or a
    file name to load one from. At most 40 genes are shown; genes on
    multiple alt loci or with a transcription region of length 100 or less
    are left out.

    Prints the visualization (wrapped HTML unless ``skip_wrapping`` is set).
    With ``quiet`` set, nothing is printed. Always returns None.
    """
    from offsetbasedgraph.graphutils import GeneList, \
        create_gene_dicts, create_subgraph_around_alt_locus

    if isinstance(args.translation_file_name, Translation):
        trans = args.translation_file_name
    else:
        trans = Translation.from_file(args.translation_file_name)

    graph = trans.graph2
    orig_trans = trans.copy()

    # Find all genes on this graph
    all_genes = GeneList(get_gene_objects_as_intervals(args.genes)).gene_list
    alt_loci_genes, gene_name_dict, main_genes = create_gene_dicts(
        all_genes, alt_loci_fn=args.alt_locations_file_name)
    locus_genes = main_genes[args.alt_locus] + alt_loci_genes[args.alt_locus]
    translated_genes = [gene.translate(trans) for gene in locus_genes]

    subgraph, trans, start_position = create_subgraph_around_alt_locus(
        graph, trans, args.alt_locus, 200000,
        alt_loci_fn=args.alt_locations_file_name)
    start_position = orig_trans.translate_position(start_position, True)[0]

    genes = [
        gene for gene in translated_genes
        if not gene.multiple_alt_loci()
        and gene.transcription_region.length() > 100
    ]
    # Show at most 40 genes.
    genes = genes[:40]

    levels = Graph.level_dict(subgraph.blocks)

    # Find start block by choosing a block having no edges in
    start = next(
        (block for block in subgraph.blocks
         if len(subgraph.reverse_adj_list[block]) == 0),
        None)
    assert start is not None

    from visualizehtml import VisualizeHtml
    subgraph.start_block = start
    max_offset = sum(subgraph.blocks[block].length()
                     for block in subgraph.blocks)
    v = VisualizeHtml(subgraph, 0, max_offset, 0, levels, "", 800, genes,
                      start_position)

    if quiet:
        return

    html = str(v) if skip_wrapping else v.get_wrapped_html()
    print(html)
    def setUp(self):
        """Create a three-node linear graph, a matching distance index and an extender."""
        blocks = {node: Block(10) for node in range(1, 4)}
        edges = {node: [node + 1] for node in range(1, 3)}
        self.graph = Graph(blocks, edges)

        # node -> [(other node, number), ...] for both directions; negative
        # ids are the reverse direction.
        index_entries = {
            1: [(2, 10), (3, 20)],
            2: [(3, 10)],
            3: [],
            -1: [],
            -2: [(-1, 10)],
            -3: [(-2, 10), (-1, 20)],
        }
        self.index = GraphIndex(index_entries)
        self.extender = GraphExtender(self.index)
Example #22
0
 def setUp(self):
     """Create a twelve-node branching graph (blocks of length 3) on the numpy backend."""
     blocks = {node: Block(3) for node in range(1, 13)}
     edges = {
         1: [2, 3],
         2: [7, 8],
         3: [4, 5],
         4: [6],
         5: [6],
         6: [10],
         7: [9],
         8: [9],
         9: [10],
         10: [12],
     }
     self.complex_graph = Graph(blocks, edges)
     self.complex_graph.convert_to_numpy_backend()
Example #23
0
    def test_find_max_path_through_subgraph_two_node_graph(self):
        """On a two-node graph with uniform q-values the max path is the whole peak."""
        graph = Graph({node: Block(10) for node in (1, 2)}, {1: [2]})

        areas = ConnectedAreas(graph, {2: [0, 4], 1: [5, 10]})
        binary_peak = BinaryContinousAreas.from_old_areas(areas)

        qvalues = DensePileup.from_base_value(graph, 10)
        print("q values")
        print(qvalues)
        print(qvalues.data._values)

        scored_peak = ScoredPeak.from_peak_and_pileup(binary_peak, qvalues)
        print(scored_peak)

        found = scored_peak.get_max_path()
        self.assertEqual(found, Interval(5, 4, [1, 2]))
def run_predict_path_single_chromosome(alignment_file_name, chromosome,
                                       graph_dir, linear_ref_bonus,
                                       out_file_base_name,
                                       max_nodes_to_traverse):
    """Load the graph data of one chromosome and construct a ``PathPredicter`` for it.

    Reads ``<graph_dir><chromosome>.nobg.sequences``,
    ``<graph_dir><chromosome>.nobg`` and
    ``<graph_dir>/<chromosome>_linear_pathv2.interval``. The created
    ``PathPredicter`` is not returned.
    """
    graph_file = graph_dir + chromosome + ".nobg"
    sequence_graph = SequenceGraph.from_file(graph_file + ".sequences")
    graph = Graph.from_file(graph_file)

    linear_path_file = graph_dir + "/%s_linear_pathv2.interval" % chromosome
    linear_path = NumpyIndexedInterval.from_file(linear_path_file)

    PathPredicter(
        alignment_file_name,
        graph,
        sequence_graph,
        chromosome,
        linear_path,
        out_file_base_name,
        linear_ref_bonus=linear_ref_bonus,
        max_nodes_to_traverse=max_nodes_to_traverse,
    )
Example #25
0
def read_graphs(graph_dir, chromosomes):
    """Read the graph and sequence graph of every chromosome in ``chromosomes``.

    Files are read from ``<graph_dir><chromosome>.nobg`` and
    ``<graph_dir><chromosome>.nobg.sequencesv2``. Results are keyed by
    chromosome name, except that chromosome "X" is stored under "23".

    Returns a tuple ``(graphs, sequence_graphs, linear_ref_nodes)`` of dicts;
    every value in ``linear_ref_nodes`` is None.
    """
    logging.info("Reading graphs")
    graphs, sequence_graphs, linear_ref_nodes = {}, {}, {}

    for chromosome in chromosomes:
        key = "23" if chromosome == "X" else chromosome
        logging.info("Reading graphs for chromosome %s" % chromosome)

        graph_file = graph_dir + chromosome + ".nobg"
        graphs[key] = Graph.from_file(graph_file)
        sequence_graphs[key] = SequenceGraph.from_file(
            graph_file + ".sequencesv2")
        # Linear reference nodes are not loaded here (the lookup through
        # NumpyIndexedInterval was disabled in the original code).
        linear_ref_nodes[key] = None

    return graphs, sequence_graphs, linear_ref_nodes
    def setUp(self):
        """Create a single-bubble graph, a matching distance index and an extender."""
        blocks = {node: Block(10) for node in range(1, 5)}
        edges = {1: [2, 3], 2: [4], 3: [4]}
        self.graph = Graph(blocks, edges)

        # node -> [(other node, number), ...] for both directions; negative
        # ids are the reverse direction.
        index_entries = {
            1: [(2, 10), (3, 10), (4, 20)],
            2: [(4, 10)],
            3: [(4, 10)],
            4: [],
            -1: [],
            -2: [(-1, 10)],
            -3: [(-1, 10)],
            -4: [(-2, 10), (-3, 10), (-1, 20)],
        }
        self.index = GraphIndex(index_entries)
        self.extender = GraphExtender(self.index)
    def _coordinate(self, rp):
        """
        Returns the hierarchical and sequential coordinates of a region path
        as a tuple of strings:
        (region path, "0", hierarchical id, hierarchical offset, length).

        For region paths whose origin is "main" or "merged", the hierarchical
        coordinate is taken relative to ``self.start_position``; otherwise it
        is the region path itself at offset 0.
        """
        length = self.graph.blocks[rp].length()
        # Translate rp back to get GRCh38 hier. coordinates
        from offsetbasedgraph import Interval, Graph

        if Graph.block_origin(rp) in ("main", "merged"):
            dist_back = self._distance_to_start(rp)
            hier_id = self.start_position.region_path_id
            hier_of = dist_back + self.start_position.offset
        else:
            hier_id = str(rp)
            hier_of = 0

        return (str(rp), "0", str(hier_id), str(hier_of), str(length))
Example #28
0
    def test_find_max_path_through_subgraph_with_illegal_paths(self):
        """The max path must not use the 3 => 4 step, which is not a forward edge."""
        blocks = {node: Block(10) for node in (1, 2, 3, 4)}
        edges = {
            1: [2, 3],
            2: [4],
            -4: [-3],  # Making 3=>4 not allowed path
        }
        graph = Graph(blocks, edges)

        peak_areas = {
            2: [0, 10],
            3: [0, 10],
            1: [5, 10],
            4: [0, 8],
        }
        binary_peak = BinaryContinousAreas.from_old_areas(
            ConnectedAreas(graph, peak_areas))

        qvalue_intervals = []
        # Nodes 3, 4 and 1 are each covered twice: higher value on 3 than on
        # 2, highest value if ending on 4 and if including 1.
        for node in (3, 4, 1):
            qvalue_intervals.append(Interval(0, 10, [node]))
            qvalue_intervals.append(Interval(0, 10, [node]))
        qvalue_intervals.append(Interval(0, 10, [1, 2, 4]))
        qvalues = DensePileup.from_intervals(graph, qvalue_intervals)

        scored_peak = ScoredPeak.from_peak_and_pileup(binary_peak, qvalues)

        found = scored_peak.get_max_path()
        print(found)

        self.assertEqual(found, Interval(5, 8, [1, 2, 4]))
Example #29
0
    def _create_data(self):
        """Create a three-node linear graph per chromosome and write its data files.

        For every chromosome in ``self.chromosomes`` this writes
        ``linear_map_<chromosome>.npz``, ``<chromosome>.nobg.sequences`` and
        ``<chromosome>.nobg``, appends to ``self.linear_maps`` and
        ``self.sequence_retrievers``, and calls ``self._create_reads``.
        Node ids continue across chromosomes (1-3, 4-6, ...).
        """
        nodes_per_chromosome = 3
        for chrom_number, chromosome in enumerate(self.chromosomes):
            first_node = 1 + nodes_per_chromosome * chrom_number
            node_ids = range(first_node, first_node + nodes_per_chromosome)

            blocks = {node: Block(10) for node in node_ids}
            edges = {node: [node + 1] for node in node_ids[:-1]}
            graph = Graph(blocks, edges)

            linear_map_file_name = "linear_map_%s.npz" % chromosome
            linear_map = LinearMap.from_graph(graph)
            linear_map.to_file(linear_map_file_name)
            self.linear_maps.append(linear_map_file_name)

            node_sequences = {node: "A" * 10 for node in node_ids}
            self.sequence_retrievers.append(SequenceRetriever(node_sequences))

            self._create_reads(chrom_number, chromosome, graph)

            graph.convert_to_numpy_backend()
            empty_sequences = SequenceGraph.create_empty_from_ob_graph(graph)
            empty_sequences.to_file(chromosome + ".nobg.sequences")
            graph.to_file(chromosome + ".nobg")
    def test_convert_to_approx_linear_peaks(self):
        """Graph peaks should convert to approximate peaks on the linear interval.

        NOTE(review): the edge list contains a self-loop 9 -> 9 (node 8 has
        no outgoing edge) and the linear interval [2, 4, 8, 9] does not
        follow the edges -- confirm that this fixture is as intended.
        """
        blocks = {node: Block(3) for node in range(1, 10)}
        edges = {
            1: [2],
            2: [3],
            3: [4],
            4: [5],
            5: [6],
            6: [7, 8],
            7: [9],
            9: [9],
        }
        graph = Graph(blocks, edges)
        graph.convert_to_numpy_backend()

        reference = Interval(0, 3, [2, 4, 8, 9], graph)
        linear_interval = reference.to_numpy_indexed_interval()

        graph_peaks = [Peak(2, 2, [2, 3, 4]), Peak(1, 1, [3, 4, 5])]
        peaks = PeakCollection(graph_peaks)

        converted = peaks.to_approx_linear_peaks(linear_interval, "chr4")
        linear_peaks = converted.peaks
        print(linear_peaks)

        self.assertEqual(linear_peaks[0], NonGraphPeak("chr4", 2, 5))
        self.assertEqual(linear_peaks[1], NonGraphPeak("chr4", 3, 3))
Example #31
0
    def test_simple(self):
        """A single interval over nodes 1 and 3 should give the max interval 1 -> 3 -> 4."""
        blocks = {node: Block(3) for node in range(1, 5)}
        edges = {1: [2, 3], 2: [4], 3: [4]}
        graph = Graph(blocks, edges)
        graph.convert_to_numpy_backend()

        observed = Interval(0, 3, [1, 3])
        intervals = IntervalCollection([observed])

        haplotyper = HaploTyper(graph, intervals)
        haplotyper.build()
        max_interval = haplotyper.get_maximum_interval_through_graph()

        expected = Interval(0, 3, [1, 3, 4])
        self.assertEqual(max_interval, expected)
Example #32
0
def make_haplotype_paths(graph_file_name, linear_ref_path_file_name,
                         haplotype0_file_name, haplotype1_file_name,
                         out_base_name, chromosome):
    """Write linear-reference and haplotype paths through a graph as fasta/interval files.

    Reads the graph from ``graph_file_name`` and its sequences from
    ``graph_file_name + ".sequences"``. The linear reference is the first
    interval of the text interval collection ``linear_ref_path_file_name``.

    Output files:
      * ``linear_ref_<chromosome>.fasta`` (in the working directory): the
        sequence of the linear reference.
      * ``<out_base_name>_<h>.intervalcollection`` and
        ``<out_base_name>_<h>.fasta`` for h in (0, 1): the traversed path and
        its sequence.

    The graph must have exactly one first block (asserted). Returns None.

    NOTE(review): the nodes collected from the two haplotype files are only
    used for logging. The traversal uses a node set derived from the linear
    reference instead (see the comment in the traversal loop), so both
    haplotype outputs describe the same path -- confirm this is intended.
    """
    # Make a linear reference fasta and interval and haplotypes fasta and intervals

    chrom = chromosome
    graph = Graph.from_file(graph_file_name)
    sequence_graph = SequenceGraph.from_file(graph_file_name + ".sequences")

    # Only the first interval in the file is used as the linear reference.
    linear_ref = IntervalCollection.from_file(linear_ref_path_file_name,
                                              text_file=True)
    linear_ref = list(linear_ref.intervals)[0]
    linear_ref_nodes = set(linear_ref.region_paths)

    # Write linear ref fasta to file
    linear_ref_seq = sequence_graph.get_interval_sequence(linear_ref)
    out_file = open("linear_ref_" + chrom + ".fasta", "w")
    out_file.writelines([">%s\n" % chrom])
    out_file.writelines([linear_ref_seq + "\n"])
    out_file.close()
    logging.info("Wrote linear ref sequence. N nodes in linear ref: %d" %
                 len(linear_ref_nodes))

    # Collect every node touched by the intervals of each haplotype file.
    haplotype_nodes = [set(), set()]  # For haplotype 0 and 1
    for haplotype in [0, 1]:
        haplotype_file_name = haplotype0_file_name
        if haplotype == 1:
            haplotype_file_name = haplotype1_file_name

        # presumably parses vg JSON alignments into intervals -- verify
        # against vg_json_file_to_intervals
        intervals = vg_json_file_to_intervals(haplotype_file_name, graph)

        for interval in intervals:
            for node in interval.region_paths:
                haplotype_nodes[haplotype].add(node)

    logging.info("N nodes in haplotype 0: %d" % len(haplotype_nodes[0]))
    logging.info("N nodes in haplotype 0 that are also in linear ref: %d" %
                 len(haplotype_nodes[0].intersection(linear_ref_nodes)))
    logging.info("N nodes in haplotype 1: %d" % len(haplotype_nodes[1]))

    # Traverse graph to get full correct haplotype intervals
    first_nodes = graph.get_first_blocks()
    assert len(first_nodes) == 1
    logging.info("N nodes in graph: %d" % len(graph.blocks))

    for haplotype in [0, 1]:
        logging.info("Traversing haplotype %d" % haplotype)

        nodes = []
        node = first_nodes[0]
        nodes_in_haplotype = haplotype_nodes[haplotype]
        # NOTE(review): the assignment above is immediately overwritten. The
        # set used from here on is every id in [0, max linear ref node) that
        # is NOT on the linear reference, independent of the haplotype files.
        nodes_in_haplotype = set(range(
            0, max(linear_ref_nodes))).difference(linear_ref_nodes)
        logging.info("There are %d haplotype nodes" % len(nodes_in_haplotype))

        assert len(
            nodes_in_haplotype
        ) > 0, "There are no haplotype nodes. Check that haplotype json files are not empty"

        n_haplotype_nodes = 0
        i = 0
        # Walk from the first block until a node without outgoing edges.
        while True:

            nodes.append(node)
            if i % 50000 == 0:
                logging.info("#%d nodes traversed. On node %d" % (i, node))
            i += 1

            next_nodes = set(graph.adj_list[node])

            if len(next_nodes) == 0:
                logging.info("Reached end node %d with 0 edges" % node)
                break

            next_on_haplotype = next_nodes.intersection(nodes_in_haplotype)
            if len(next_on_haplotype) == 1:
                # Exactly one candidate in the haplotype set: follow it.
                n_haplotype_nodes += 1
                next_node = list(next_on_haplotype)[0]
                assert next_node != node
                node = next_node
            elif len(next_on_haplotype) == 0:
                logging.debug(
                    "No new haplotype node from %d. Will follow reference" %
                    node)
                # Choose reference with lowest id to avoid deletion
                # (min() raises ValueError if no next node is on the reference).
                node = min(list(next_nodes.intersection(linear_ref_nodes)))
            else:
                # logging.warning("There is a deletion from node %d. Choosing lowest node id as next to avoid deletion." % node)
                # This means more than one next node is on haplotype. Choose the one with lowest id to avoid taking deletion
                node = min(list(next_on_haplotype))

        logging.info("Found %d nodes. %d on haplotype" %
                     (len(nodes), n_haplotype_nodes))
        # The path runs from offset 0 of the first node to the end of the last node.
        haplotype_interval = Interval(0, graph.blocks[nodes[-1]].length(),
                                      nodes, graph)
        print("Path length: %d" % haplotype_interval.length())

        file_base_name = out_base_name + "_" + str(haplotype)
        IntervalCollection([haplotype_interval
                            ]).to_file(file_base_name + ".intervalcollection",
                                       text_file=True)

        sequence = sequence_graph.get_interval_sequence(haplotype_interval)
        out_file = open(file_base_name + ".fasta", "w")
        out_file.writelines([">%s\n" % chrom])
        out_file.writelines([sequence + "\n"])
        out_file.close()
        logging.info("Wrote fasta sequence to %s" % file_base_name + ".fasta")
Example #33
0
def count_variants_in_graph(graph, linear_path):
    """Count the extra outgoing edges of the graph nodes that lie on ``linear_path``.

    A node on the linear path with k outgoing edges contributes
    ``max(0, k - 1)``. Progress is printed for every millionth node and the
    total is printed before it is returned.
    """
    on_reference = linear_path.nodes_in_interval()
    variant_count = 0

    for index, node in enumerate(graph.blocks):
        if index % 1000000 == 0:
            print("Node #%d" % index)
        if node in on_reference:
            variant_count += max(0, len(graph.adj_list[node]) - 1)

    print("Variants: %d" % variant_count)
    return variant_count


if __name__ == "__main__":
    # Usage: <script> DATA_DIR CHROMOSOME[,CHROMOSOME...]
    # Sums the variant counts over all the chromosomes that are given.
    n_variants = 0
    for chromosome in sys.argv[2].split(","):
        print("Chromosome %s" % chromosome)
        chromosome_prefix = sys.argv[1] + "/" + chromosome
        graph = Graph.from_file(chromosome_prefix + "_pruned.nobg")
        linear_path = NumpyIndexedInterval.from_file(
            chromosome_prefix + "_linear_pathv2.interval")
        n_variants += count_variants_in_graph(graph, linear_path)

    print("Total: %d" % n_variants)