def setUp(self):
        """Build the reference graphs and delete files left by earlier runs.

        Creates the expected four-node ``GraphWithReversals`` (converted to
        the numpy backend) and a ``SequenceGraph`` whose sequences are read
        from ``tests/vg_test_graph.json``, then removes every listed output
        file that currently exists.
        """
        self.correct_ob_graph = GraphWithReversals(
            {
                1: Block(7),
                2: Block(4),
                3: Block(7),
                4: Block(4)
            }, {
                1: [2, 3],
                2: [4],
                3: [4]
            })
        self.correct_ob_graph.convert_to_numpy_backend()

        self.correct_sequence_graph = SequenceGraph.create_empty_from_ob_graph(
            self.correct_ob_graph)
        self.correct_sequence_graph.set_sequences_using_vg_json_graph(
            "tests/vg_test_graph.json")

        # Every entry needs its own comma: adjacent string literals are
        # concatenated by Python, which previously merged the five alignment
        # files into one non-existent path that was never removed.
        remove_files = [
            "tests/testgraph.obg",
            "tests/test_linear_map_starts.pickle",
            "tests/test_linear_map_ends.pickle",
            "tests/test_linear_map.length",
            "tests/sample.intervalcollection",
            "tests/testintervals.intervalcollection",
            "tests/testsequences.fasta",
            "tests/node_range_test_data/vg_alignments_1.json",
            "tests/node_range_test_data/vg_alignments_2.json",
            "tests/node_range_test_data/vg_alignments_3.json",
            "tests/node_range_test_data/vg_alignments_4.json",
            "tests/node_range_test_data/vg_alignments_5.json",
        ]
        for file_name in remove_files:
            if os.path.isfile(file_name):
                os.remove(file_name)
    def test_create_from_nongraphpeakcollection(self):
        """Convert linear peaks to graph peaks, with and without a region.

        Two linear peaks on ``chr1`` are mapped onto a three-node linear
        path. With a ``LinearRegion`` starting at 3 the expected graph
        intervals are shifted three positions to the left.
        """
        graph = Graph({node_id: Block(10) for node_id in (1, 2, 3)},
                      {1: [2], 2: [3]})
        graph.convert_to_numpy_backend()
        linear_path = Interval(0, 10, [1, 2, 3],
                               graph).to_numpy_indexed_interval()

        nongraph_peaks = NonGraphPeakCollection([
            NonGraphPeak("chr1", 3, 10, 5),
            NonGraphPeak("chr1", 13, 15, 7),
        ])

        # Case 1: no linear region given.
        unshifted = PeakCollection.create_from_nongraph_peak_collection(
            graph, nongraph_peaks, linear_path, None)
        self.assertEqual(unshifted.intervals[0], Interval(3, 10, [1]))
        self.assertEqual(unshifted.intervals[1], Interval(3, 5, [2]))

        # Case 2: region starting at linear position 3.
        region = LinearRegion("chr1", 3, 20)
        shifted = PeakCollection.create_from_nongraph_peak_collection(
            graph, nongraph_peaks, linear_path, region)
        self.assertEqual(shifted.intervals[0], Interval(0, 7, [1]))
        self.assertEqual(shifted.intervals[1], Interval(0, 2, [2]))
Example #3
0
class TestHaplotyper(unittest.TestCase):
    """Checks the maximum interval that HaploTyper reports for a graph."""

    def setUp(self):
        """Create a 12-node branching graph on the numpy backend."""
        blocks = {node_id: Block(3) for node_id in range(1, 13)}
        edges = {
            1: [2, 3],
            2: [7, 8],
            3: [4, 5],
            4: [6],
            5: [6],
            6: [10],
            7: [9],
            8: [9],
            9: [10],
            10: [12]
        }
        self.complex_graph = Graph(blocks, edges)
        self.complex_graph.convert_to_numpy_backend()

    def _find_max_interval(self, graph, intervals):
        """Build a HaploTyper for the inputs and return its maximum interval."""
        haplotyper = HaploTyper(graph, intervals)
        haplotyper.build()
        return haplotyper.get_maximum_interval_through_graph()

    def test_simple(self):
        """A single interval through node 3 of a diamond graph."""
        blocks = {node_id: Block(3) for node_id in range(1, 5)}
        edges = {1: [2, 3], 2: [4], 3: [4]}
        graph = Graph(blocks, edges)
        graph.convert_to_numpy_backend()

        intervals = IntervalCollection([Interval(0, 3, [1, 3])])
        found = self._find_max_interval(graph, intervals)

        self.assertEqual(found, Interval(0, 3, [1, 3, 4]))

    def test_complex_graph(self):
        """Several intervals on the complex graph; expects the path via node 2."""
        intervals = IntervalCollection([
            Interval(0, 3, [1, 3, 4, 6, 10]),
            Interval(1, 2, [2]),
            Interval(2, 3, [2]),
            Interval(0, 3, [7, 9])
        ])
        found = self._find_max_interval(self.complex_graph, intervals)

        self.assertEqual(found, Interval(0, 3, [1, 2, 7, 9, 10, 12]))
 def test_create_ob_graph(self):
     """Run ``create_ob_graph`` and compare the written graph to the fixture."""
     output_path = "tests/testgraph.obg"
     arguments = [
         "create_ob_graph", "-o", output_path, "tests/vg_test_graph.json"
     ]
     run_argument_parser(arguments)
     written_graph = GraphWithReversals.from_numpy_file(output_path)
     self.assertEqual(written_graph, self.correct_ob_graph)
    def test_finds_correct_max_path_among_many_paths(self):
        """Expect the path through node 4 when nodes 2 and 3 score worse.

        Node 2 has slightly higher values in places but contains two
        zero-valued holes; node 3 has a uniformly lower value.
        """

        def uniform(value):
            # Node of length 10 with one constant value and no change points.
            return ValuedIndexes([], [], value, 10)

        blocks = {node_id: Block(10) for node_id in range(1, 6)}
        edges = {
            1: [2, 3, 4],
            2: [5],
            4: [5],
            3: [5]
        }
        graph = GraphWithReversals(blocks, edges)

        pileup = SparsePileup(graph)
        pileup.data = {
            1: uniform(2),
            # Higher qval, but two holes with low
            2: ValuedIndexes([1, 2, 7, 8], [0, 2.001, 0, 2.001], 2, 10),
            3: uniform(1.5),
            4: uniform(2),
            5: uniform(2)
        }

        expected_paths = [Interval(0, 10, [1, 4, 5])]
        self._assert_finds_max_paths(expected_paths, graph, pileup)
Example #6
0
 def create_linear_graph(self):
     """Create a chain graph, its snarl graph and linear map, saving graph and map.

     Nodes are numbered ``1..n_nodes`` and each links to its successor. One
     ``SimpleSnarl`` spanning node 1 to node ``n_nodes`` is registered under
     id ``n_nodes + 2``.
     """
     last_node = self.n_nodes
     nodes = {
         node_id: Block(self.node_size)
         for node_id in range(1, last_node + 1)
     }
     adj_list = {node_id: [node_id + 1] for node_id in range(1, last_node)}
     self.graph = GraphWithReversals(nodes, adj_list)
     self.graph.to_file(self.GRAPH_NAME)

     snarl_id = last_node + 2
     snarls = {snarl_id: SimpleSnarl(1, last_node, id=snarl_id)}
     builder = SnarlGraphBuilder(self.graph,
                                 snarls=snarls,
                                 id_counter=last_node + 3)
     self.snarlgraph = builder.build_snarl_graphs()

     self.linear_map = LinearSnarlMap.from_snarl_graph(
         self.snarlgraph, self.graph)
     self.linear_map.to_json_files(self.MAP_NAME)
    def set_graph(self):
        """Set read parameters, build a four-node diamond graph, save its linear map."""
        self.fragment_length = 5
        self.read_length = 1

        blocks = {node_id: Block(15) for node_id in range(1, 5)}
        edges = {
            1: [2, 3],
            2: [4],
            3: [4]
        }
        self.graph = GraphWithReversals(blocks, edges)

        linear_map = LinearMap.from_graph(self.graph)
        linear_map.to_file("test_linear_map.npz")
    def _create_graph_with_linear_blocks(self):
        """Build a graph of parallel blocks joined by dummy start/end blocks.

        Creates ``self.n_paths`` blocks with ids starting at 100, each of
        length ``self.n_basepairs_length``, links block 1 to every one of them
        and every one of them to block 2, and stores the graph in
        ``self.graph``. ``self.translation`` is set to a ``Translation`` built
        from two empty dicts and that graph.
        """
        blocks = {
            i: Block(self.n_basepairs_length)
            for i in range(100, 100 + self.n_paths)
        }
        graph = GraphWithReversals(blocks, {})

        # Add dummy blocks at start and end
        start = Block(1)
        end = Block(1)
        # Blocks 1 and 2 are not yet in graph.blocks, so this loop visits only
        # the parallel blocks created above.
        # NOTE(review): appending to adj_list[1] / reverse_adj_list[2] without
        # a prior assignment presumably relies on these being defaultdict-like
        # -- confirm against GraphWithReversals.
        for block in graph.blocks:
            graph.adj_list[1].append(block)
            graph.reverse_adj_list[block].append(1)
            graph.adj_list[block].append(2)
            graph.reverse_adj_list[2].append(block)

        # Register the dummy blocks only after the wiring loop has finished.
        graph.blocks[1] = start
        graph.blocks[2] = end

        self.graph = graph
        self.translation = Translation({}, {}, graph)
Example #9
0
    def test_simple(self):
        """A single interval through node 3 of a diamond graph yields path 1-3-4."""
        blocks = {node_id: Block(3) for node_id in range(1, 5)}
        edges = {
            1: [2, 3],
            2: [4],
            3: [4]
        }
        graph = Graph(blocks, edges)
        graph.convert_to_numpy_backend()

        observed = IntervalCollection([Interval(0, 3, [1, 3])])

        haplotyper = HaploTyper(graph, observed)
        haplotyper.build()
        found = haplotyper.get_maximum_interval_through_graph()

        expected = Interval(0, 3, [1, 3, 4])
        self.assertEqual(found, expected)
    def test_convert_to_approx_linear_peaks(self):
        """Project two graph peaks onto a linear interval and check the result.

        Builds a nine-node graph of length-3 blocks, a linear interval over
        nodes 2, 4, 8 and 9, and converts two peaks to linear peaks on
        ``chr4``.
        """
        # NOTE(review): node 9 links to itself and node 8 has no outgoing
        # edge, although the linear interval below passes 8 -> 9; possibly
        # ``8: [9]`` was intended -- confirm.
        graph = Graph({i: Block(3)
                       for i in range(1, 10)}, {
                           1: [2],
                           2: [3],
                           3: [4],
                           4: [5],
                           5: [6],
                           6: [7, 8],
                           7: [9],
                           9: [9]
                       })
        graph.convert_to_numpy_backend()
        linear_interval = Interval(0, 3, [2, 4, 8, 9], graph)
        linear_interval = linear_interval.to_numpy_indexed_interval()

        peaks = PeakCollection([Peak(2, 2, [2, 3, 4]), Peak(1, 1, [3, 4, 5])])
        linear_peaks = peaks.to_approx_linear_peaks(linear_interval, "chr4")
        linear_peaks = linear_peaks.peaks
        # Debug output of the converted peaks.
        print(linear_peaks)

        self.assertEqual(linear_peaks[0], NonGraphPeak("chr4", 2, 5))
        self.assertEqual(linear_peaks[1], NonGraphPeak("chr4", 3, 3))
Example #11
0
    def _create_data(self):
        """Create graph, linear map, sequences and reads for each chromosome.

        Every chromosome gets a three-node chain of length-10 blocks whose
        node ids continue where the previous chromosome stopped. The linear
        map, the sequence graph and the graph itself are written to files
        named after the chromosome.
        """
        node_offset = 1
        for chrom_number, chromosome in enumerate(self.chromosomes):
            node_ids = [node_offset + step for step in range(0, 3)]
            blocks = {node_id: Block(10) for node_id in node_ids}
            edges = {node_id: [node_id + 1] for node_id in node_ids[:-1]}
            graph = Graph(blocks, edges)

            linear_map_file_name = "linear_map_%s.npz" % chromosome
            linear_map = LinearMap.from_graph(graph)
            linear_map.to_file(linear_map_file_name)
            self.linear_maps.append(linear_map_file_name)

            sequences = {node_id: "A" * 10 for node_id in node_ids}
            self.sequence_retrievers.append(SequenceRetriever(sequences))

            self._create_reads(chrom_number, chromosome, graph)
            node_offset += 3

            # Reads are created before the graph is converted and saved.
            graph.convert_to_numpy_backend()
            sequence_graph = SequenceGraph.create_empty_from_ob_graph(graph)
            sequence_graph.to_file(chromosome + ".nobg.sequences")
            graph.to_file(chromosome + ".nobg")
    def setUp(self):
        """Create small forward/reversed graphs and ConnectedAreas fixtures."""

        def three_blocks():
            # Fresh block dict per graph so no Block object is shared.
            return {
                1: Block(3),
                2: Block(3),
                3: Block(3)
            }

        self.simple_graph = GraphWithReversals(three_blocks(), {
            1: [2],
            2: [3]
        })
        self.reversed_simple_graph = GraphWithReversals(three_blocks(), {
            -2: [-1],
            -3: [-2]
        })
        self.simple_graphs = [self.simple_graph, self.reversed_simple_graph]

        self.graph2 = Graph({1: Block(3), 2: Block(3)}, {-2: [1]})
        self.graph3 = Graph({1: Block(3), 2: Block(3)}, {2: [-1]})

        # Areas on node 2 of the simple graph.
        whole_node = {2: np.array([0, 3])}
        inner_part = {2: np.array([1, 2])}
        left_part = {2: np.array([0, 2])}
        self.middle_areas = ConnectedAreas(self.simple_graph, whole_node)
        self.middle_closed_area = ConnectedAreas(self.simple_graph,
                                                 inner_part)
        self.middle_left_area = ConnectedAreas(self.simple_graph, left_part)
Example #13
0
 def setUp(self):
     """Create a 12-node branching graph on the numpy backend."""
     blocks = {node_id: Block(3) for node_id in range(1, 13)}
     edges = {
         1: [2, 3],
         2: [7, 8],
         3: [4, 5],
         4: [6],
         5: [6],
         6: [10],
         7: [9],
         8: [9],
         9: [10],
         10: [12]
     }
     self.complex_graph = Graph(blocks, edges)
     self.complex_graph.convert_to_numpy_backend()
Example #14
0
    def setUp(self):
        """Build an 11-node branching graph and write its linear map to disk."""
        blocks = {node_id: Block(3) for node_id in range(1, 12)}
        edges = {
            1: [2, 3],
            2: [7, 8],
            3: [4, 5],
            4: [6],
            5: [6],
            6: [10],
            7: [9],
            8: [9],
            9: [10],
            10: [11]
        }
        self.graph = GraphWithReversals(blocks, edges)

        self.linear_length = 18
        linear_map = LinearMap.from_graph(self.graph)
        linear_map.to_file("test_linear_map.npz")
    def set_graph(self):
        """Set read parameters and build a branching graph with a long last node.

        Nodes 1-10 have length 3 and node 11 has length 1000. The linear map
        of the graph is written to ``test_linear_map.npz``.
        """
        self.fragment_length = 6
        self.read_length = 2

        blocks = {node_id: Block(3) for node_id in range(1, 11)}
        blocks.update({11: Block(1000)})
        edges = {
            1: [2, 3],
            2: [7, 8],
            3: [4, 5],
            4: [6],
            5: [6],
            6: [10],
            7: [9],
            8: [9],
            9: [10],
            10: [11]
        }
        self.graph = GraphWithReversals(blocks, edges)

        linear_map = LinearMap.from_graph(self.graph)
        linear_map.to_file("test_linear_map.npz")
Example #16
0
def find_linear_path_through_chromosome(chromosome, chromend, fasta_file_name,
                                        ob_graph_file_name,
                                        vg_graph_file_name):
    """Trace a chromosome's linear sequence through the graph and save the path.

    Reads the chromosome sequence from ``fasta_file_name``, loads the graph
    from ``ob_graph_file_name`` and a sequence retriever from
    ``vg_graph_file_name``, searches from the single start node and writes
    the interval found to ``22_path.intervalcollection`` as a text file.

    Raises:
        AssertionError: if the graph does not have exactly one start node.
    """
    genome = Fasta(fasta_file_name)
    # NOTE(review): the end coordinate is hard-coded and ``chromend`` is not
    # used; left unchanged so existing results stay identical -- confirm
    # whether ``chromend`` was meant to be used here.
    seq = str(genome[chromosome][0:50818468]).lower()

    logging.info("Creating sequence retriever")
    sequence_retriever = SequenceRetriever.from_vg_json_graph(
        vg_graph_file_name)

    graph = GraphWithReversals.from_numpy_file(ob_graph_file_name)

    start_nodes = graph.get_first_blocks()
    # Format the count, not the collection: ``%d`` applied to the collection
    # itself raised TypeError and hid the intended assertion message.
    assert len(start_nodes) == 1, \
        "Found %d start nodes" % len(start_nodes)
    start_node = start_nodes[0]

    traverser = GraphTraverserUsingSequence(graph, seq, sequence_retriever)
    traverser.search_from_node(start_node)
    path = traverser.get_interval_found()
    path = IntervalCollection(path)
    path.to_file("22_path.intervalcollection", text_file=True)
    logging.info("Done")
Example #17
0
 def _init_caller(self):
     """Load the graph from ``GRAPH_NAME`` and create ``self.caller`` from it."""
     graph = GraphWithReversals.from_file(self.GRAPH_NAME)
     self.caller = CallPeaks(graph, "")
Example #18
0
class MACSTests(object):
    GRAPH_NAME = "lin_graph.tmp"
    MAP_NAME = "lin_map.tmp"
    INTERVALS_NAME = "graph_intervals.tmp"
    CONTROL_NAME = "graph_control.tmp"

    def __init__(self,
                 node_size,
                 n_nodes,
                 n_intervals,
                 read_length=15,
                 fragment_length=50,
                 with_control=False):
        """Store the simulation parameters and run :meth:`setup` immediately.

        ``genome_size`` is derived as ``node_size * n_nodes`` and
        ``peak_depth`` is fixed at 10.
        """
        # Graph geometry
        self.node_size = node_size
        self.n_nodes = n_nodes
        self.genome_size = node_size * n_nodes

        # Read simulation parameters
        self.n_intervals = n_intervals
        self.read_length = read_length
        self.fragment_length = fragment_length
        self.with_control = with_control
        self.peak_depth = 10

        self.setup()

    def setup(self):
        """Create graph, intervals and experiment info, then build the caller.

        Runs ``create_linear_graph``, ``create_intervals`` and
        ``write_intervals`` in that order, fills ``self.info`` with read
        counts, selects the control file name and interval collections, and
        finally calls ``_init_caller``.
        """
        print("######## SETUP ########")
        self.create_linear_graph()
        self.create_intervals()
        self.write_intervals()
        self.info = ExperimentInfo(self.genome_size, self.fragment_length,
                                   self.read_length)
        self.info.n_control_reads = self.n_intervals_control
        self.info.n_sample_reads = self.n_intervals
        logging.info("N_control %s, N_sample: %s", self.info.n_control_reads,
                     self.info.n_sample_reads)
        # Without a separate control, the sample intervals file doubles as
        # the control file.
        self.control_file_name = self.INTERVALS_NAME
        if self.with_control:
            self.control_file_name = self.CONTROL_NAME
        # The string literal below is a disabled code block; it is evaluated
        # as a no-op expression statement.
        """
        self.caller = CallPeaks(self.GRAPH_NAME, self.INTERVALS_NAME,
                                control_file_name,
                                has_control=self.with_control,
                                experiment_info=self.info,
                                verbose=True,
                                linear_map=self.MAP_NAME)
        """
        self.sample_intervals = IntervalCollection(self.graph_intervals)

        if self.with_control:
            self.control_intervals = IntervalCollection(
                self.graph_intervals_control)
        else:
            # No control data: reuse the sample intervals as control.
            self.control_intervals = IntervalCollection(self.graph_intervals)

        self._init_caller()

    def _init_caller(self):
        """Load the graph from ``GRAPH_NAME`` and create ``self.caller`` from it."""
        graph = GraphWithReversals.from_file(self.GRAPH_NAME)
        self.caller = CallPeaks(graph, "")
        # The caller's create_graph() step is intentionally not run here.

    # Tests
    def test_filter_dup(self):
        """Compare the caller's duplicate filtering with ``macs2 filterdup``."""
        macs_command = "macs2 filterdup -i %s --keep-dup=1 -o %s" % (
            "lin_intervals.bed", "lin_intervals_dup.bed")
        subprocess.check_output(macs_command.split())

        filtered_name = self.caller.filter_duplicates(
            "graph_intervals", write_to_file="graph_intervals_filtered.tmp")
        self.dup_file_name = filtered_name
        self.assertEqualIntervalFiles(self.dup_file_name,
                                      "lin_intervals_dup.bed")

    def test_sample_pileup(self):
        """Compare the caller's sample pileup with the macs2 pileup file."""
        caller = self.caller
        caller.sample_intervals = self.graph_intervals
        caller.create_sample_pileup(True)

        # Produce the linear reference pileup with macs2.
        self._create_sample_pileup()

        graph_track = caller._sample_track
        self.assertPileupFilesEqual(graph_track, "lin_sample_pileup.bdg")

    def test_control_pileup(self):
        """Compare the caller's control track with the macs2 control pileup."""
        caller = self.caller
        caller.control_intervals = self.graph_intervals
        caller.create_control(True)

        # Produce the linear reference control; this also sets self.background.
        self._create_control()

        graph_track = caller._control_track
        assert isinstance(graph_track, str)
        self.assertPileupFilesEqual(graph_track,
                                    "lin_control_pileup.bdg",
                                    min_value=self.background)

    def test_call_peaks(self):
        """Check tracks, scores and final peaks against the macs2 output files.

        First compares the existing control and sample track files with the
        macs2 files, then loads those tracks into the caller, scores them,
        and compares the q-values and the final peaks with the files macs2
        produces through ``_get_scores`` and ``_call_peaks``.
        """
        print("###############################################")
        # Debug output of the adjacency lists of node 222.
        print(self.graph.adj_list[222])
        print(self.graph.reverse_adj_list[222])
        self.assertPileupFilesEqual("control_track.bdg",
                                    "macstest_control_lambda.bdg")
        self.assertPileupFilesEqual("sample_track.bdg",
                                    "macstest_treat_pileup.bdg")
        # Load the pileups from the track files compared above.
        self.caller._control_pileup = SparsePileup.from_bed_graph(
            self.graph, "control_track.bdg")
        self.caller._sample_pileup = SparsePileup.from_bed_graph(
            self.graph, "sample_track.bdg")
        self.caller.get_score()
        # Writes lin_scores.bdg with macs2 for the comparison below.
        self._get_scores("qpois")
        self.assertPileupFilesEqual("q_values.bdg", "lin_scores.bdg")
        # Writes lin_peaks.bed with macs2 for the comparison below.
        self._call_peaks()
        self.caller.call_peaks()
        self.assertEqualBedFiles("final_peaks.bed", "lin_peaks.bed")

    def neg_linear_to_graph_interval(self, lin_interval):
        """Convert a linear interval to a DirectedInterval over negated node ids.

        The interval's end determines the first region path and its start
        the last one; offsets are measured from the far end of those nodes.
        """
        node_size = self.node_size
        first_node = (lin_interval.end - 1) // node_size + 1
        last_node = lin_interval.start // node_size + 1

        start_offset = first_node * node_size - lin_interval.end
        end_offset = last_node * node_size - lin_interval.start

        # Negated ids, running from -first_node up to and including -last_node.
        region_paths = list(range(-first_node, 1 - last_node))
        return DirectedInterval(start_offset,
                                end_offset,
                                region_paths,
                                graph=self.graph)

    def linear_to_graph_interval(self, lin_interval, is_control=None):
        """Convert a linear interval to a DirectedInterval on the chain graph.

        Intervals with direction -1 are delegated to
        ``neg_linear_to_graph_interval``. ``is_control`` is accepted but not
        used.
        """
        if lin_interval.direction == -1:
            return self.neg_linear_to_graph_interval(lin_interval)

        node_size = self.node_size
        start, end = lin_interval.start, lin_interval.end
        first_node = start // node_size + 1
        last_node = (end - 1) // node_size + 1

        start_pos = Position(first_node, start % node_size)
        # The end offset is exclusive, so it ranges over 1..node_size.
        end_pos = Position(last_node, (end - 1) % node_size + 1)
        region_paths = list(range(first_node, last_node + 1))
        return DirectedInterval(start_pos,
                                end_pos,
                                region_paths,
                                direction=lin_interval.direction,
                                graph=self.graph)

    def _convert_valued_interval(self, interval):
        true_id = abs(interval.node_id) - 1
        interval.start += self.node_size * true_id
        interval.end += self.node_size * true_id

    def graph_to_linear_pos(self, pos):
        return pos.region_path_id * self.node_size + pos.offset

    def graph_to_linear_interval(self, graph_interval):
        """Convert a graph interval to a SimpleInterval with the same direction."""
        to_linear = self.graph_to_linear_pos
        linear_start = to_linear(graph_interval.start_position)
        linear_end = to_linear(graph_interval.end_position)
        return SimpleInterval(linear_start, linear_end,
                              graph_interval.direction)

    def assertEqualIntervals(self, linear_intervals, graph_intervals):
        graph_intervals = [
            self.graph_to_linear_interval(g_interval)
            for g_interval in graph_intervals
        ]
        #assert len(graph_intervals) == len(linear_intervals), \
        #    "%d != %d" % (len(graph_intervals), len(linear_intervals))
        for interval in graph_intervals:
            assert interval in linear_intervals

    def assertEqualIntervalFiles(self, graph_file, linear_file):
        """Assert the intervals in ``graph_file`` all occur in ``linear_file``.

        ``graph_file`` is read as an ``IntervalCollection`` and each line of
        ``linear_file`` is parsed with ``SimpleInterval.from_file_line``.
        """
        graph_intervals = IntervalCollection.from_file(graph_file)
        # Use a context manager so the file handle is closed deterministically.
        with open(linear_file) as linear_lines:
            linear_intervals = [
                SimpleInterval.from_file_line(line) for line in linear_lines
            ]
        self.assertEqualIntervals(linear_intervals, graph_intervals)

    def _create_binary_track(self, intervals):
        pileup = np.zeros(self.genome_size, dtype="bool")
        for interval in intervals:
            if interval is None:
                continue
            pileup[interval.start:interval.end] = True
        return pileup

    def assertEqualBedFiles(self, graph_file, linear_file):
        """Assert two BED files cover exactly the same linear positions.

        Both files are parsed line by line with
        ``SimpleInterval.from_file_line``. The graph intervals are shifted to
        linear coordinates with ``_convert_valued_interval`` before both sets
        are turned into boolean coverage tracks and compared. Differing
        indices are logged before the assertion fails.
        """
        # Context managers close both files even if parsing raises.
        with open(graph_file) as graph_lines:
            graph_intervals = [
                SimpleInterval.from_file_line(line) for line in graph_lines
            ]
        with open(linear_file) as linear_lines:
            linear_intervals = [
                SimpleInterval.from_file_line(line) for line in linear_lines
            ]

        for graph_interval in graph_intervals:
            self._convert_valued_interval(graph_interval)
        pileup1 = self._create_binary_track(linear_intervals)
        pileup2 = self._create_binary_track(graph_intervals)
        indices = np.where(pileup1 != pileup2)[0]

        tracks_match = np.allclose(pileup1, pileup2)
        if not tracks_match:
            logging.error(indices)
            # First differing index, the indices just before each gap in the
            # run of differences, and the last differing index.
            logging.error("%s %s %s", indices[0],
                          indices[np.where(np.diff(indices) > 1)], indices[-1])
        assert tracks_match

    def _create_pileup(self,
                       pileup_file,
                       convert=False,
                       limit=False,
                       min_value=None):
        """Read a valued-interval file into a dense array of ``genome_size``.

        Each line is parsed with ``ValuedInterval.from_file_line``; lines that
        parse to ``None`` are skipped. Overlapping intervals keep the maximum
        value per position.

        Args:
            pileup_file: path of the file to read.
            convert: if true, shift each interval with
                ``_convert_valued_interval`` before it is applied.
            limit: accepted but not used.
            min_value: if not None, lower bound applied to the whole array.

        Returns:
            numpy array with one value per position.
        """
        pileup = np.zeros(self.genome_size)
        # The context manager closes the file once all lines are consumed.
        with open(pileup_file) as pileup_lines:
            for line in pileup_lines:
                interval = ValuedInterval.from_file_line(line)
                if interval is None:
                    continue
                if convert:
                    self._convert_valued_interval(interval)
                pileup[interval.start:interval.end] = np.maximum(
                    pileup[interval.start:interval.end], interval.value)

        if min_value is not None:
            pileup = np.maximum(pileup, min_value)
        return pileup

    def assertPileupFilesEqual(self, graph_file, linear_file, min_value=None):
        assert isinstance(graph_file, str)
        assert isinstance(linear_file, str)

        linear_pileup = self._create_pileup(linear_file, min_value=min_value)
        graph_pileup = self._create_pileup(graph_file, convert=True)
        assert sum(graph_pileup) > 0
        rtol = 0.001
        rtol = 0.05

        if not np.allclose(linear_pileup, graph_pileup, rtol=rtol):
            different = np.abs(linear_pileup - graph_pileup) > rtol
            logging.error(different)
            logging.error(np.where(different))
            logging.error("Number of indices different")
            logging.error(len(np.where(different)[0]))
            if not len(np.where(different)[0]):
                return
            logging.error("Differences:")


        assert np.allclose(linear_pileup, graph_pileup, rtol=rtol), \
            "Pileup in %s != pileup in %s" % (linear_file, graph_file)

    def _create_sample_pileup(self):
        """Run ``macs2 pileup`` on the linear reads, writing lin_sample_pileup.bdg."""
        extension = self.fragment_length - 1
        command = "macs2 pileup -i %s -o %s --extsize %s -f BED" % (
            "lin_intervals.bed", "lin_sample_pileup.bdg", extension)
        logging.info(command)
        arguments = command.split()
        subprocess.check_output(arguments)

    def _get_scores(self, t="qpois"):
        """Run ``macs2 bdgcmp`` with method ``t``, writing lin_scores.bdg."""
        command = "macs2 bdgcmp -t macstest_treat_pileup.bdg -c macstest_control_lambda.bdg -m %s -o lin_scores.bdg" % t
        logging.info(command)
        arguments = command.split()
        subprocess.check_output(arguments)

    def _call_peaks(self):
        """Run ``macs2 bdgpeakcall`` on lin_scores.bdg, writing lin_peaks.bed.

        The cutoff is ``-log10(0.05)``; ``-l`` gets the fragment length from
        ``self.info`` and ``-g`` gets ``self.read_length``.
        """
        cutoff = -np.log10(0.05)
        command = "macs2 bdgpeakcall -i lin_scores.bdg -c %s -l %s -g %s -o lin_peaks.bed" % (
            cutoff, self.info.fragment_length, self.read_length)
        logging.info(command)
        arguments = command.split()
        subprocess.check_output(arguments)

    def _create_control(self):
        """Build the linear control pileup with macs2 and set ``self.background``.

        For the extension size 2500, runs ``macs2 pileup`` followed by
        ``macs2 bdgopt -m multiply``; then computes the background level and
        runs ``macs2 bdgopt -m max`` with it to write lin_control_pileup.bdg.
        """
        for ext in [2500]:
            # "-f BED" is embedded in the output-name format string, so after
            # split() it becomes two separate arguments following the name.
            command = "macs2 pileup -i %s -o %s -B --extsize %s" % (
                "lin_intervals.bed", "lin_control_pileup%s.bdg -f BED" % ext,
                ext)
            subprocess.check_output(command.split())
            # Scale the extended pileup by (fragment_length - 1) / (2 * ext),
            # overwriting the file in place.
            command = "macs2 bdgopt -i lin_control_pileup%s.bdg -m multiply -p %s -o lin_control_pileup%s.bdg" % (
                ext, (self.fragment_length - 1) / (ext * 2), ext)
            subprocess.check_output(command.split())
        # command = "macs2 bdgcmp -m max -t lin_control_pileup2500.bdg -c lin_control_pileup5000.bdg -o lin_control_pileup.bdg"

        # subprocess.check_output(command.split())

        # Background level: n_intervals * fragment_length / genome_size.
        self.background = self.n_intervals * self.info.fragment_length / self.genome_size
        logging.info(self.background)
        command = "macs2 bdgopt -i lin_control_pileup2500.bdg -m max -p %s -o lin_control_pileup.bdg" % self.background
        logging.info(command)
        subprocess.check_output(command.split())

    def write_intervals(self):
        """Write the linear and graph intervals (and controls) to files.

        Linear intervals go to ``lin_intervals.bed`` and graph intervals to
        ``INTERVALS_NAME``. With ``with_control`` set, the control intervals
        go to ``lin_intervals_control.bed``, ``CONTROL_NAME`` and
        ``CONTROL_NAME + ".tmp"``.
        """
        # Context managers guarantee the files are closed if writing raises.
        with open("lin_intervals.bed", "w") as bed_file:
            bed_file.writelines(interval.to_file_line()
                                for interval in self.linear_intervals)
        logging.info("Wrote to lin_intervals.bed")
        graph_intervals = IntervalCollection(self.graph_intervals)
        graph_intervals.to_file(self.INTERVALS_NAME, True)

        if self.with_control:
            with open("lin_intervals_control.bed", "w") as control_file:
                control_file.writelines(
                    interval.to_file_line()
                    for interval in self.linear_intervals_control)
            graph_intervals = IntervalCollection(self.graph_intervals_control)
            graph_intervals.to_file(self.CONTROL_NAME, True)
            graph_intervals.to_file(self.CONTROL_NAME + ".tmp", True)

        print("Wrote to graph_intervals")

    def create_linear_graph(self):
        """Create a chain graph, its snarl graph and linear map, saving graph and map.

        Nodes are numbered ``1..n_nodes`` and each links to its successor.
        One ``SimpleSnarl`` spanning node 1 to node ``n_nodes`` is registered
        under id ``n_nodes + 2``.
        """
        last_node = self.n_nodes
        nodes = {
            node_id: Block(self.node_size)
            for node_id in range(1, last_node + 1)
        }
        adj_list = {node_id: [node_id + 1] for node_id in range(1, last_node)}
        self.graph = GraphWithReversals(nodes, adj_list)
        self.graph.to_file(self.GRAPH_NAME)

        snarl_id = last_node + 2
        snarls = {snarl_id: SimpleSnarl(1, last_node, id=snarl_id)}
        builder = SnarlGraphBuilder(self.graph,
                                    snarls=snarls,
                                    id_counter=last_node + 3)
        self.snarlgraph = builder.build_snarl_graphs()

        self.linear_map = LinearSnarlMap.from_snarl_graph(
            self.snarlgraph, self.graph)
        self.linear_map.to_json_files(self.MAP_NAME)

    def _get_graph_interval(self, tmp_start, tmp_end, direction):
        """Build a DirectedInterval from linear start/end coordinates.

        For direction -1 the coordinates are negated and swapped before the
        node indices and offsets are computed.
        """
        if direction == -1:
            start, end = -tmp_end, -tmp_start
        else:
            start, end = tmp_start, tmp_end

        node_size = self.node_size
        first_node = start // node_size
        last_node = (end + 1) // node_size

        # The region paths run up to, but do not include, last_node.
        region_paths = list(range(first_node, last_node))
        start_pos = Position(first_node, start % node_size)
        end_pos = Position(last_node, (end % node_size) + 1)
        return DirectedInterval(start_pos,
                                end_pos,
                                region_paths,
                                direction=direction)

    def create_pairs_around_point(self, point, n=1):
        intervals = []
        for _ in range(n):
            offset = random.randint(-n, n)
            point = point + offset
            pos_start = point - self.fragment_length // 2
            pos_end = pos_start + self.read_length
            if pos_start > 0 and pos_end < self.genome_size:
                intervals.append(SimpleInterval(pos_start, pos_end, 1))
                assert pos_start >= 0 and pos_end >= 0
            neg_end = point + self.fragment_length // 2
            neg_start = neg_end - self.read_length
            if neg_end < self.genome_size and neg_start >= 0:
                intervals.append(SimpleInterval(neg_start, neg_end, -1))
                assert neg_start >= 0 and neg_end >= 0

        return intervals

    def create_random_linear_reads(self, n_reads, include_pairs=False):
        logging.info("Creating %d linear reads" % n_reads)
        reads = []
        for i in range(n_reads // self.peak_depth + 1):
            logging.debug("Creating read %d" % i)
            point = random.randint(0, self.genome_size)
            reads.extend(
                self.create_pairs_around_point(point, n=self.peak_depth))

        return reads

    def _find_graph_size(self, intervals):
        max_point = [
            interval.end if interval.direction == -1 else interval.start +
            self.fragment_length for interval in intervals
        ]
        return max(max_point)

    def create_intervals(self):
        """Generate random sample (and optional control) reads in both forms.

        Fills ``linear_intervals`` and ``graph_intervals`` (both sorted),
        updates ``n_intervals`` to the number of reads actually created, and
        sets ``graph._size``. With ``with_control`` set, control reads are
        generated the same way; otherwise ``n_intervals_control`` equals
        ``n_intervals``.
        """
        self.linear_intervals = self.create_random_linear_reads(
            self.n_intervals, include_pairs=True)
        # The graph size is computed before the dummy end read is appended.
        self.graph._size = self._find_graph_size(self.linear_intervals)
        # Reverse read ending exactly at the end of the genome.
        dummy_end = SimpleInterval(self.genome_size - self.read_length,
                                   self.genome_size, -1)
        self.linear_intervals.append(dummy_end)
        self.graph_intervals = [
            self.linear_to_graph_interval(i) for i in self.linear_intervals
        ]
        logging.debug(len(self.graph_intervals))
        # n_intervals now reflects the reads created, including the dummy.
        self.n_intervals = len(self.linear_intervals)
        self.linear_intervals = sorted(self.linear_intervals,
                                       key=lambda x: (x.node_id, x.start))
        self.graph_intervals = sorted(
            self.graph_intervals,
            key=lambda x: (x.region_paths[0], x.start_position.offset))
        logging.info("Created %d intervals ", self.n_intervals)
        if self.with_control:
            self.linear_intervals_control = self.create_random_linear_reads(
                self.n_intervals, include_pairs=False)
            # The same dummy end read object is shared with the sample reads.
            self.linear_intervals_control.append(dummy_end)
            self.graph_intervals_control = [
                self.linear_to_graph_interval(i, is_control=True)
                for i in self.linear_intervals_control
            ]
            self.n_intervals_control = len(self.linear_intervals_control)
            logging.info("Created %d control intervals ",
                         self.n_intervals_control)
        else:
            self.n_intervals_control = self.n_intervals

    def test_shift_estimation(self):
        """Compare graph-based read/fragment length estimates with ``macs2 predictd``."""
        self.setup()

        # Graph-based estimate
        filtered_name = "graph_intervals_filtered.tmp"
        caller = CallPeaks("lin_graph.tmp",
                           filtered_name,
                           filtered_name,
                           has_control=False)
        caller.create_graph()
        info = ExperimentInfo.find_info(caller.ob_graph,
                                        caller.sample_file_name,
                                        caller.control_file_name)
        read_length_graph = info.read_length
        fragment_length_graph = info.fragment_length

        # Macs
        command = [
            "macs2", "predictd", "-i", "lin_intervals_dup.bed", "-g",
            str(self.genome_size), "-m", "5", "50"
        ]
        logging.info(' '.join(command))
        raw_output = subprocess.check_output(command,
                                             stderr=subprocess.STDOUT)
        output = raw_output.decode("utf-8")
        logging.debug(output)

        # Parse the values macs2 reports in its output.
        tag_match = re.search("tag size = ([0-9]+)", output)
        tag_size = int(tag_match.groups()[0])
        fragment_match = re.search("fragment length is ([0-9]+) bp", output)
        fragment_length = int(fragment_match.groups()[0])

        assert read_length_graph == tag_size, \
            "Read length from graph % d != %d (macs reads length)" % (read_length_graph, tag_size)
        assert fragment_length_graph == fragment_length

    def profile(self):
        """Call ``run()`` on ``self.caller``; its return value is discarded."""
        caller = self.caller
        caller.run()

    def _run_whole_macs(self):
        """Run ``macs2 callpeak`` on ``lin_intervals.bed``.

        The genome size comes from ``self.genome_size`` and the extension
        size from ``self.info.fragment_length``. When ``self.with_control``
        is true, ``lin_intervals_control.bed`` is passed as control. Output
        files are prefixed ``macstest`` (``-n macstest``). The combined
        stdout/stderr of macs2 is logged at debug level.
        """
        pieces = [
            "macs2 callpeak -t lin_intervals.bed -f BED -g ",
            str(self.genome_size),
            " --nomodel --extsize ",
            str(self.info.fragment_length),
            " -n macstest -B -q 0.05 --keep-dup all",
        ]
        if self.with_control:
            pieces.append(" --slocal=1000 -c lin_intervals_control.bed")
        command = "".join(pieces)

        logging.info("Macs command used: %s", command)
        # Whitespace-splitting yields the argv list; no shell is involved.
        raw_output = subprocess.check_output(command.split(),
                                             stderr=subprocess.STDOUT)
        logging.debug(raw_output.decode("utf-8"))

    def assertPeakSetsEqual(self, linear_peaks_file, graph_peaks_file):
        """Assert that two peak files hold the same peaks.

        A ``PeaksComparer`` is built from both files, using a linear path
        over nodes ``1..self.n_nodes`` of ``self.graph``. The check passes
        when both peak sets have the same size and the number of peaks
        found at the same position equals that size.
        """
        # Path from offset 0 in node 1 to offset ``node_size`` in the last
        # node, visiting every node in order.
        all_nodes = list(range(1, self.n_nodes + 1))
        linear_path = DirectedInterval(
            0, self.node_size, all_nodes, graph=self.graph)
        comparer = PeaksComparer.create_from_graph_peaks_and_linear_peaks(
            linear_peaks_file,
            graph_peaks_file,
            self.graph,
            linear_path,
            graph_region=None)

        n_first = len(comparer.peaks1.intervals)
        n_second = len(comparer.peaks2.intervals)
        assert n_first == n_second

        matches = comparer.get_peaks_at_same_position()
        assert len(matches) == len(comparer.peaks1.intervals)

    def test_whole_pipeline(self):
        """Run macs2 and the graph caller step by step, then compare peaks.

        After the pre-callpeaks stage the p-values, the p-to-q mapping and
        the q-values are computed, peaks are called from the q-values, and
        the resulting ``max_paths.intervalcollection`` is compared with
        ``macstest_peaks.narrowPeak``.
        """
        self._run_whole_macs()

        caller = self.caller
        # The graph is not rebuilt here (``create_graph`` is not called);
        # the pre-made intervals are handed to the caller directly.
        caller.sample_intervals = self.sample_intervals
        caller.control_intervals = self.control_intervals

        config = Configuration(save_tmp_results_to_file=True,
                               skip_filter_duplicates=True,
                               p_val_cutoff=0.05)
        caller.run_pre_callpeaks(has_control=self.with_control,
                                 experiment_info=self.info,
                                 linear_map=self.MAP_NAME,
                                 configuration=config)
        # The sample/control pileup comparisons against the macs bedgraphs
        # (sample_track.bdg vs macstest_treat_pileup.bdg, control_track.bdg
        # vs macstest_control_lambda.bdg) are not asserted here.

        logging.info("################### GETTING SCORE")
        caller.get_p_values()
        caller.get_p_to_q_values_mapping()
        caller.get_q_values()

        logging.info("################### CALLING PEAKS")
        caller.call_peaks_from_q_values(experiment_info=self.info,
                                        config=config)

        # Cannot compare bedgraphs anymore, as graph pileup is not trimmed
        # before maxpaths; so final_peaks.bed is not diffed against the
        # narrowPeak file and only the peak sets are compared.
        self.assertPeakSetsEqual("macstest_peaks.narrowPeak",
                                 "max_paths.intervalcollection")

        print("Success. All assertions passed")

    def test_final_tracks(self):
        """Run macs2 and ``self.caller`` fully and compare the peak files.

        ``final_peaks.bed`` (presumably written by ``self.caller.run()``;
        verify) must equal ``macstest_peaks.narrowPeak`` according to
        ``assertEqualBedFiles``.
        """
        self._run_whole_macs()
        self.caller.run()

        caller_peaks_file = "final_peaks.bed"
        macs_peaks_file = "macstest_peaks.narrowPeak"
        self.assertEqualBedFiles(caller_peaks_file, macs_peaks_file)
    def setUp(self):
        """Create the graphs and q-value pileups shared by the tests.

        Every graph uses blocks of length 10, except ``single_block_graph``
        (one block of length 20) and ``junction_graph`` (blocks of length
        5). Also sets ``fragment_length`` to 6 and ``read_length`` to 2.
        """
        def blocks_of_ten(last_node):
            # Fresh dict of nodes 1..last_node, each a Block of length 10.
            return {node: Block(10) for node in range(1, last_node + 1)}

        # Linear chain 1 -> 2 -> 3 -> 4.
        self.linear_graph = GraphWithReversals(
            blocks_of_ten(4), {node: [node + 1] for node in range(1, 4)})

        def linear_pileup(indexes, values):
            # Pileup on the linear graph; only the node-1 entry varies
            # between fixtures, the node-2 entry is always the same.
            pileup = SparsePileup(self.linear_graph)
            pileup.data = {
                1: ValuedIndexes(indexes, values, 0, 10),
                2: ValuedIndexes([3], [0], 2, 10),
            }
            return pileup

        self.one_peak_q_values = linear_pileup([5], [2])
        self.one_peak_with_hole = linear_pileup([5, 8], [2, 0])
        self.one_peak_with_big_hole = linear_pileup([5, 7], [2, 0])

        # Node 1 splits into 2 and 3, which both lead to 4.
        self.split_graph = GraphWithReversals(
            blocks_of_ten(4), {1: [2, 3], 2: [4], 3: [4]})

        # Seven nodes and no edges at all.
        self.split_graph_with_path_around = GraphWithReversals(
            blocks_of_ten(7), {})

        self.graph_with_reversal = GraphWithReversals(
            blocks_of_ten(3), {1: [2], -3: [-2]})

        self.single_block_graph = GraphWithReversals({1: Block(20)}, {})

        # Two start nodes (1, 2) feed node 3, which leads to 4 and 5.
        self.multi_start_end_graph = GraphWithReversals(
            blocks_of_ten(5), {1: [3], 2: [3], 3: [4, 5]})

        # Nodes 10..14 all lead to 15, which fans out to 16..19.
        junction_edges = {node: [15] for node in range(10, 15)}
        junction_edges[15] = [16, 17, 18, 19]
        self.junction_graph = GraphWithReversals(
            {node: Block(5) for node in range(10, 20)}, junction_edges)

        self.fragment_length = 6
        self.read_length = 2
# Example #20
# 0
    def setUp(self):
        """Build a flat graph, the matching nested snarl graph and their map.

        Sets ``self.graph`` (nodes 1..12), ``self.linear_length`` and
        ``self.snarlgraph``, then writes the ``LinearSnarlMap`` built from
        the two graphs to files named after ``test_linear_map.tmp``.
        """
        # Every node has length 3 except node 3, which has length 6 to match
        # the Block(6) used for node 3 in the snarl graph below. The override
        # is applied to the block dict before the graph is constructed; the
        # earlier ``self.blocks[3] = Block(6)`` wrote to an attribute this
        # method never creates.
        blocks = {i: Block(3) for i in range(1, 13)}
        blocks[3] = Block(6)
        self.graph = GraphWithReversals(
            blocks,
            {
                11: [1],
                1: [2, 3],
                2: [7, 8],
                3: [6],
                6: [10],
                7: [9],
                8: [9],
                9: [10],
                10: [12]
            })

        self.linear_length = 21

        # Snarl 22: the bubble 2 -> {7, 8} -> 9.
        inner_snarl = SnarlGraph(
            {
                7: Block(3),
                8: Block(3)
            }, {
                2: [7, 8],
                7: [9],
                8: [9]
            },
            start_node=2,
            end_node=9)

        # Snarl 20: everything between nodes 1 and 10, i.e. the branch
        # 3 -> 6 and the branch 2 -> (snarl 22) -> 9.
        outer_snarl = SnarlGraph(
            {
                3: Block(6),
                22: inner_snarl,
                2: Block(3),
                6: Block(3),
                9: Block(3),
            }, {
                3: [6],
                2: [22],
                22: [9],
                1: [2, 3],
                6: [10],
                9: [10]
            },
            start_node=1,
            end_node=10)

        # Top level: 11 -> 1 -> (snarl 20) -> 10 -> 12, wrapped between the
        # dummy start node 13 and the dummy end node 14.
        self.snarlgraph = SnarlGraph(
            {
                11: Block(3),
                12: Block(3),
                1: Block(3),
                10: Block(3),
                20: outer_snarl
            },
            {
                11: [1],
                1: [20],
                20: [10],
                10: [12],
                13: [11],  # Dummy
                12: [14],  # Dummy
            },
            start_node=13,
            end_node=14)

        LinearSnarlMap.from_snarl_graph(
            self.snarlgraph, self.graph).to_json_files("test_linear_map.tmp")
# Example #21
# 0
from offsetbasedgraph import GraphWithReversals, Block

def create_test_data(file_name="simple_graph.json"):
    """Write a small three-node JSON graph to ``file_name``.

    The document has a ``node`` list (ids 1-3 with their sequences) and an
    ``edge`` list (1 -> 2, and 2 -> 3 with ``to_end`` set); the layout looks
    like vg's JSON graph format. Newlines are replaced by spaces, so the
    file holds a single line.

    Args:
        file_name: Path of the file to create or overwrite. Defaults to
            ``simple_graph.json`` in the current working directory.
    """
    simple_graph = """
    {
    "node": [
        {"id": 1, "sequence": "TTTCCCC"},
        {"id": 2, "sequence": "TTTT"},
        {"id": 3, "sequence": "CCCCTTT"}
    ],
    "edge": [
        {"from": 1, "to": 2},
        {"from": 2, "to": 3, "to_end": true}
    ]
    }
    """

    # The context manager closes the file even if the write fails.
    with open(file_name, "w") as f:
        f.write(simple_graph.replace("\n", " "))

# Three blocks of length 7, 4 and 7, the same lengths as the sequences in the
# JSON written by create_test_data; node 2 connects to node 3 reversed (-3).
simple_graph = GraphWithReversals(
    {node_id: Block(length) for node_id, length in ((1, 7), (2, 4), (3, 7))},
    {1: [2], 2: [-3]})