def setUp(self):
    """Build the reference graphs and delete files the tests write.

    Creates ``self.correct_ob_graph`` (four nodes, converted to the numpy
    backend) and ``self.correct_sequence_graph`` (sequences loaded from
    ``tests/vg_test_graph.json``), then removes each listed file if it
    exists so that a test does not start with it already present.
    """
    self.correct_ob_graph = GraphWithReversals(
        {
            1: Block(7),
            2: Block(4),
            3: Block(7),
            4: Block(4),
        },
        {
            1: [2, 3],
            2: [4],
            3: [4],
        })
    self.correct_ob_graph.convert_to_numpy_backend()

    self.correct_sequence_graph = SequenceGraph.create_empty_from_ob_graph(
        self.correct_ob_graph)
    self.correct_sequence_graph.set_sequences_using_vg_json_graph(
        "tests/vg_test_graph.json")

    # Every entry needs its own trailing comma: adjacent string literals
    # without a comma are silently joined into a single (nonexistent) path.
    remove_files = [
        "tests/testgraph.obg",
        "tests/test_linear_map_starts.pickle",
        "tests/test_linear_map_ends.pickle",
        "tests/test_linear_map.length",
        "tests/sample.intervalcollection",
        "tests/testintervals.intervalcollection",
        "tests/testsequences.fasta",
        "tests/node_range_test_data/vg_alignments_1.json",
        "tests/node_range_test_data/vg_alignments_2.json",
        "tests/node_range_test_data/vg_alignments_3.json",
        "tests/node_range_test_data/vg_alignments_4.json",
        "tests/node_range_test_data/vg_alignments_5.json",
    ]
    for file in remove_files:
        if os.path.isfile(file):
            os.remove(file)
def test_create_from_nongraphpeakcollection(self):
    """Check conversion of linear (non-graph) peaks into graph peaks.

    The conversion is run twice on a three-node linear graph: once with
    no region (``None``) and once with ``LinearRegion("chr1", 3, 20)``.
    The expected intervals of the second run are those of the first run
    shifted three positions to the left.
    """
    node_blocks = {1: Block(10), 2: Block(10), 3: Block(10)}
    graph = Graph(node_blocks, {1: [2], 2: [3]})
    graph.convert_to_numpy_backend()

    linear_path = Interval(
        0, 10, [1, 2, 3], graph).to_numpy_indexed_interval()

    nongraph_peaks = NonGraphPeakCollection([
        NonGraphPeak("chr1", 3, 10, 5),
        NonGraphPeak("chr1", 13, 15, 7),
    ])

    # Without a region.
    unrestricted = PeakCollection.create_from_nongraph_peak_collection(
        graph, nongraph_peaks, linear_path, None)
    expected_unrestricted = [Interval(3, 10, [1]), Interval(3, 5, [2])]
    for index, expected in enumerate(expected_unrestricted):
        self.assertEqual(unrestricted.intervals[index], expected)

    # With a linear region starting at position 3.
    region = LinearRegion("chr1", 3, 20)
    restricted = PeakCollection.create_from_nongraph_peak_collection(
        graph, nongraph_peaks, linear_path, region)
    expected_restricted = [Interval(0, 7, [1]), Interval(0, 2, [2])]
    for index, expected in enumerate(expected_restricted):
        self.assertEqual(restricted.intervals[index], expected)
class TestHaplotyper(unittest.TestCase):
    """Tests for the maximum interval that HaploTyper finds in a graph."""

    def setUp(self):
        """Create a branching graph of 12 nodes, each of length 3."""
        node_blocks = {node_id: Block(3) for node_id in range(1, 13)}
        edges = {
            1: [2, 3],
            2: [7, 8],
            3: [4, 5],
            4: [6],
            5: [6],
            6: [10],
            7: [9],
            8: [9],
            9: [10],
            10: [12],
        }
        self.complex_graph = Graph(node_blocks, edges)
        self.complex_graph.convert_to_numpy_backend()

    def test_simple(self):
        """One interval over nodes 1 and 3 should give the path 1-3-4."""
        node_blocks = {node_id: Block(3) for node_id in range(1, 5)}
        graph = Graph(node_blocks, {1: [2, 3], 2: [4], 3: [4]})
        graph.convert_to_numpy_backend()

        supporting = IntervalCollection([Interval(0, 3, [1, 3])])
        haplotyper = HaploTyper(graph, supporting)
        haplotyper.build()

        found = haplotyper.get_maximum_interval_through_graph()
        expected = Interval(0, 3, [1, 3, 4])
        self.assertEqual(found, expected)

    def test_complex_graph(self):
        """Check the expected maximum path through the complex graph."""
        supporting = IntervalCollection([
            Interval(0, 3, [1, 3, 4, 6, 10]),
            Interval(1, 2, [2]),
            Interval(2, 3, [2]),
            Interval(0, 3, [7, 9]),
        ])
        haplotyper = HaploTyper(self.complex_graph, supporting)
        haplotyper.build()

        found = haplotyper.get_maximum_interval_through_graph()
        expected = Interval(0, 3, [1, 2, 7, 9, 10, 12])
        self.assertEqual(found, expected)
def test_create_ob_graph(self):
    """Run the ``create_ob_graph`` command and compare the written graph.

    The graph read back from the output file must equal
    ``self.correct_ob_graph``.
    """
    arguments = [
        "create_ob_graph",
        "-o",
        "tests/testgraph.obg",
        "tests/vg_test_graph.json",
    ]
    run_argument_parser(arguments)

    written_graph = GraphWithReversals.from_numpy_file("tests/testgraph.obg")
    self.assertEqual(written_graph, self.correct_ob_graph)
def test_finds_correct_max_path_among_many_paths(self):
    """Expect the path 1-4-5 when three parallel middle nodes compete.

    Node 1 fans out to nodes 2, 3 and 4, which all join in node 5.
    """
    node_blocks = {node_id: Block(10) for node_id in range(1, 6)}
    edges = {1: [2, 3, 4], 2: [5], 4: [5], 3: [5]}
    graph = GraphWithReversals(node_blocks, edges)

    node_values = {}
    node_values[1] = ValuedIndexes([], [], 2, 10)
    # Higher qval, but two holes with low
    node_values[2] = ValuedIndexes(
        [1, 2, 7, 8], [0, 2.001, 0, 2.001], 2, 10)
    node_values[3] = ValuedIndexes([], [], 1.5, 10)
    node_values[4] = ValuedIndexes([], [], 2, 10)
    node_values[5] = ValuedIndexes([], [], 2, 10)

    pileup = SparsePileup(graph)
    pileup.data = node_values

    expected_paths = [Interval(0, 10, [1, 4, 5])]
    self._assert_finds_max_paths(expected_paths, graph, pileup)
def create_linear_graph(self):
    """Create a chain graph plus its snarl graph and linear map.

    Builds ``self.n_nodes`` nodes of length ``self.node_size`` connected
    in order 1 -> 2 -> ... and writes the graph to ``self.GRAPH_NAME``.
    A single snarl spanning node 1 to the last node is then used to build
    ``self.snarlgraph`` and ``self.linear_map``; the map is written to
    ``self.MAP_NAME``.
    """
    last_node = self.n_nodes

    node_blocks = {}
    for node_id in range(1, last_node + 1):
        node_blocks[node_id] = Block(self.node_size)
    edges = {node_id: [node_id + 1] for node_id in range(1, last_node)}

    self.graph = GraphWithReversals(node_blocks, edges)
    self.graph.to_file(self.GRAPH_NAME)

    # Ids for the snarl and the builder's counter start past the node ids.
    snarl_id = last_node + 2
    spanning_snarl = SimpleSnarl(1, last_node, id=snarl_id)
    builder = SnarlGraphBuilder(
        self.graph,
        snarls={snarl_id: spanning_snarl},
        id_counter=last_node + 3)
    self.snarlgraph = builder.build_snarl_graphs()

    self.linear_map = LinearSnarlMap.from_snarl_graph(
        self.snarlgraph, self.graph)
    self.linear_map.to_json_files(self.MAP_NAME)
def set_graph(self):
    """Set read/fragment lengths and a four-node diamond graph.

    Every node has length 15.  The linear map of the graph is written to
    ``test_linear_map.npz``.
    """
    self.fragment_length = 5
    self.read_length = 1

    node_blocks = {node_id: Block(15) for node_id in range(1, 5)}
    edges = {1: [2, 3], 2: [4], 3: [4]}
    self.graph = GraphWithReversals(node_blocks, edges)

    linear_map = LinearMap.from_graph(self.graph)
    linear_map.to_file("test_linear_map.npz")
def _create_graph_with_linear_blocks(self):
    """Create ``self.n_paths`` parallel blocks between two dummy nodes.

    The parallel blocks get ids 100, 101, ... and length
    ``self.n_basepairs_length``.  Dummy node 1 is linked to every block
    and every block is linked to dummy node 2; both dummies have length 1.
    Stores the result in ``self.graph`` together with an empty
    ``self.translation``.
    """
    first_id = 100
    path_blocks = {}
    for node_id in range(first_id, first_id + self.n_paths):
        path_blocks[node_id] = Block(self.n_basepairs_length)
    graph = GraphWithReversals(path_blocks, {})

    # Add dummy blocks at start and end
    dummy_start = Block(1)
    dummy_end = Block(1)
    source_id, sink_id = 1, 2

    # Edges are added while only the parallel blocks are in graph.blocks;
    # the dummy blocks themselves are registered after the loop.
    # NOTE(review): presumably adj_list/reverse_adj_list create missing
    # keys on access -- confirm against GraphWithReversals.
    for node_id in graph.blocks:
        graph.adj_list[source_id].append(node_id)
        graph.reverse_adj_list[node_id].append(source_id)
        graph.adj_list[node_id].append(sink_id)
        graph.reverse_adj_list[sink_id].append(node_id)

    graph.blocks[source_id] = dummy_start
    graph.blocks[sink_id] = dummy_end

    self.graph = graph
    self.translation = Translation({}, {}, graph)
def test_simple(self):
    """One interval over nodes 1 and 3 should give the path 1-3-4."""
    node_blocks = {node_id: Block(3) for node_id in range(1, 5)}
    graph = Graph(node_blocks, {1: [2, 3], 2: [4], 3: [4]})
    graph.convert_to_numpy_backend()

    supporting = IntervalCollection([Interval(0, 3, [1, 3])])
    haplotyper = HaploTyper(graph, supporting)
    haplotyper.build()

    found = haplotyper.get_maximum_interval_through_graph()
    expected = Interval(0, 3, [1, 3, 4])
    self.assertEqual(found, expected)
def test_convert_to_approx_linear_peaks(self):
    """Convert two graph peaks to linear peaks on ``chr4`` and check them.

    The linear reference is the interval over nodes 2, 4, 8 and 9.
    """
    node_blocks = {node_id: Block(3) for node_id in range(1, 10)}
    # NOTE(review): 9: [9] is a self-loop and node 8 has no outgoing
    # edge -- possibly meant to be 8: [9]; kept as in the original.
    edges = {
        1: [2],
        2: [3],
        3: [4],
        4: [5],
        5: [6],
        6: [7, 8],
        7: [9],
        9: [9],
    }
    graph = Graph(node_blocks, edges)
    graph.convert_to_numpy_backend()

    linear_interval = Interval(
        0, 3, [2, 4, 8, 9], graph).to_numpy_indexed_interval()

    graph_peaks = PeakCollection([
        Peak(2, 2, [2, 3, 4]),
        Peak(1, 1, [3, 4, 5]),
    ])
    converted = graph_peaks.to_approx_linear_peaks(linear_interval, "chr4")
    linear_peaks = converted.peaks
    print(linear_peaks)

    first_expected = NonGraphPeak("chr4", 2, 5)
    second_expected = NonGraphPeak("chr4", 3, 3)
    self.assertEqual(linear_peaks[0], first_expected)
    self.assertEqual(linear_peaks[1], second_expected)
def _create_data(self):
    """Create graph, linear map, sequences and reads for each chromosome.

    Each chromosome in ``self.chromosomes`` gets a three-node chain graph
    (node length 10) whose node ids continue where the previous
    chromosome stopped: 1-3, 4-6, ...  For every chromosome this writes
    ``linear_map_<chromosome>.npz``, ``<chromosome>.nobg.sequences`` and
    ``<chromosome>.nobg``, and appends to ``self.linear_maps`` and
    ``self.sequence_retrievers``.
    """
    nodes_per_chromosome = 3
    for chrom_number, chromosome in enumerate(self.chromosomes):
        first_node = 1 + nodes_per_chromosome * chrom_number
        node_ids = list(range(first_node, first_node + nodes_per_chromosome))

        node_blocks = {node_id: Block(10) for node_id in node_ids}
        edges = {node_id: [node_id + 1] for node_id in node_ids[:-1]}
        graph = Graph(node_blocks, edges)

        linear_map_file_name = "linear_map_%s.npz" % chromosome
        LinearMap.from_graph(graph).to_file(linear_map_file_name)
        self.linear_maps.append(linear_map_file_name)

        sequences = {node_id: "A" * 10 for node_id in node_ids}
        self.sequence_retrievers.append(SequenceRetriever(sequences))

        # Reads are created before the graph is converted to the numpy
        # backend, as in the original statement order.
        self._create_reads(chrom_number, chromosome, graph)

        graph.convert_to_numpy_backend()
        sequence_graph = SequenceGraph.create_empty_from_ob_graph(graph)
        sequence_graph.to_file(chromosome + ".nobg.sequences")
        graph.to_file(chromosome + ".nobg")
def setUp(self):
    """Create small graphs and ConnectedAreas fixtures on node 2.

    ``simple_graph`` is the chain 1 -> 2 -> 3 and ``reversed_simple_graph``
    has the same nodes with edges given on negative node ids.  ``graph2``
    and ``graph3`` are two-node graphs with one edge that involves a
    negative node id.
    """
    forward_edges = {1: [2], 2: [3]}
    reverse_edges = {-2: [-1], -3: [-2]}

    self.simple_graph = GraphWithReversals(
        {node_id: Block(3) for node_id in (1, 2, 3)}, forward_edges)
    self.reversed_simple_graph = GraphWithReversals(
        {node_id: Block(3) for node_id in (1, 2, 3)}, reverse_edges)
    self.simple_graphs = [self.simple_graph, self.reversed_simple_graph]

    self.graph2 = Graph({1: Block(3), 2: Block(3)}, {-2: [1]})
    self.graph3 = Graph({1: Block(3), 2: Block(3)}, {2: [-1]})

    # All areas lie on node 2 of the simple graph.
    whole_node = {2: np.array([0, 3])}
    inner_part = {2: np.array([1, 2])}
    left_part = {2: np.array([0, 2])}
    self.middle_areas = ConnectedAreas(self.simple_graph, whole_node)
    self.middle_closed_area = ConnectedAreas(self.simple_graph, inner_part)
    self.middle_left_area = ConnectedAreas(self.simple_graph, left_part)
def setUp(self):
    """Create a branching graph of 12 nodes, each of length 3."""
    node_blocks = {node_id: Block(3) for node_id in range(1, 13)}
    edges = {
        1: [2, 3],
        2: [7, 8],
        3: [4, 5],
        4: [6],
        5: [6],
        6: [10],
        7: [9],
        8: [9],
        9: [10],
        10: [12],
    }
    self.complex_graph = Graph(node_blocks, edges)
    self.complex_graph.convert_to_numpy_backend()
def setUp(self):
    """Create an 11-node branching graph and write its linear map.

    All nodes have length 3.  ``self.linear_length`` is set to 18 and the
    linear map is written to ``test_linear_map.npz``.
    """
    node_blocks = {node_id: Block(3) for node_id in range(1, 12)}
    edges = {
        1: [2, 3],
        2: [7, 8],
        3: [4, 5],
        4: [6],
        5: [6],
        6: [10],
        7: [9],
        8: [9],
        9: [10],
        10: [11],
    }
    self.graph = GraphWithReversals(node_blocks, edges)
    self.linear_length = 18

    linear_map = LinearMap.from_graph(self.graph)
    linear_map.to_file("test_linear_map.npz")
def set_graph(self):
    """Set read/fragment lengths and a branching graph with a long tail.

    Nodes 1-10 have length 3 and node 11 has length 1000.  The linear map
    of the graph is written to ``test_linear_map.npz``.
    """
    self.fragment_length = 6
    self.read_length = 2

    node_blocks = {}
    for node_id in range(1, 11):
        node_blocks[node_id] = Block(3)
    node_blocks[11] = Block(1000)

    edges = {
        1: [2, 3],
        2: [7, 8],
        3: [4, 5],
        4: [6],
        5: [6],
        6: [10],
        7: [9],
        8: [9],
        9: [10],
        10: [11],
    }
    self.graph = GraphWithReversals(node_blocks, edges)

    linear_map = LinearMap.from_graph(self.graph)
    linear_map.to_file("test_linear_map.npz")
def find_linear_path_through_chromosome(chromosome, chromend,
                                        fasta_file_name, ob_graph_file_name,
                                        vg_graph_file_name):
    """Find the graph path matching a chromosome's linear sequence.

    Reads the chromosome sequence from the fasta file, traverses the
    graph from its single start node following that sequence, and writes
    the interval found to ``22_path.intervalcollection`` as a text file.

    Args:
        chromosome: Key of the chromosome in the fasta file.
        chromend: Not used by this function.
            NOTE(review): the sequence is cut at the hard-coded position
            50818468 below; presumably ``chromend`` was meant to be used
            there -- confirm with callers before changing.
        fasta_file_name: Path to the fasta file.
        ob_graph_file_name: Path to the graph in numpy file format.
        vg_graph_file_name: Path to the vg json graph providing the node
            sequences.

    Raises:
        AssertionError: If the graph does not have exactly one start node.
    """
    genome = Fasta(fasta_file_name)
    seq = str(genome[chromosome][0:50818468]).lower()

    logging.info("Creating sequence retriever")
    sequence_retriever = SequenceRetriever.from_vg_json_graph(
        vg_graph_file_name)
    graph = GraphWithReversals.from_numpy_file(ob_graph_file_name)

    start_nodes = graph.get_first_blocks()
    # Format the number of start nodes, not the list itself: "%d" applied
    # to a list raises TypeError and would hide the real failure.
    assert len(start_nodes) == 1, \
        "Found %d start nodes" % len(start_nodes)
    start_node = start_nodes[0]

    traverser = GraphTraverserUsingSequence(graph, seq, sequence_retriever)
    traverser.search_from_node(start_node)
    path = traverser.get_interval_found()
    path = IntervalCollection(path)
    path.to_file("22_path.intervalcollection", text_file=True)
    logging.info("Done")
def _init_caller(self):
    """Create ``self.caller`` from the graph stored at ``self.GRAPH_NAME``."""
    stored_graph = GraphWithReversals.from_file(self.GRAPH_NAME)
    self.caller = CallPeaks(stored_graph, "")
class MACSTests(object):
    """Compare graph-based peak calling on a linear graph with macs2.

    Random reads are generated on a linear coordinate system and mirrored
    as intervals on a chain graph.  The ``test_*`` methods run ``macs2``
    sub-commands through ``subprocess`` and compare the files they write
    with the files written by ``self.caller``.
    """

    # File names used for the graph, the linear map and the interval files.
    GRAPH_NAME = "lin_graph.tmp"
    MAP_NAME = "lin_map.tmp"
    INTERVALS_NAME = "graph_intervals.tmp"
    CONTROL_NAME = "graph_control.tmp"

    def __init__(self, node_size, n_nodes, n_intervals, read_length=15,
                 fragment_length=50, with_control=False):
        """Store the parameters and run ``setup()``.

        The genome size is ``node_size * n_nodes``.
        """
        self.node_size = node_size
        self.n_nodes = n_nodes
        self.with_control = with_control
        self.n_intervals = n_intervals
        self.read_length = read_length
        self.genome_size = node_size * n_nodes
        self.fragment_length = fragment_length
        self.peak_depth = 10
        self.setup()

    def setup(self):
        """Create graph, intervals, experiment info and the caller."""
        print("######## SETUP ########")
        self.create_linear_graph()
        self.create_intervals()
        self.write_intervals()
        self.info = ExperimentInfo(self.genome_size, self.fragment_length,
                                   self.read_length)
        self.info.n_control_reads = self.n_intervals_control
        self.info.n_sample_reads = self.n_intervals
        logging.info("N_control %s, N_sample: %s", self.info.n_control_reads,
                     self.info.n_sample_reads)
        # Without a separate control, the sample file doubles as control.
        self.control_file_name = self.INTERVALS_NAME
        if self.with_control:
            self.control_file_name = self.CONTROL_NAME
        """ self.caller = CallPeaks(self.GRAPH_NAME, self.INTERVALS_NAME, control_file_name, has_control=self.with_control, experiment_info=self.info, verbose=True, linear_map=self.MAP_NAME) """
        self.sample_intervals = IntervalCollection(self.graph_intervals)
        if self.with_control:
            self.control_intervals = IntervalCollection(
                self.graph_intervals_control)
        else:
            self.control_intervals = IntervalCollection(self.graph_intervals)
        self._init_caller()

    def _init_caller(self):
        """Create ``self.caller`` from the graph stored at ``GRAPH_NAME``."""
        self.caller = CallPeaks(GraphWithReversals.from_file(self.GRAPH_NAME),
                                "")
        #self.caller.create_graph()

    # Tests
    def test_filter_dup(self):
        """Compare duplicate filtering with ``macs2 filterdup``."""
        command = "macs2 filterdup -i %s --keep-dup=1 -o %s" % (
            "lin_intervals.bed", "lin_intervals_dup.bed")
        command = command.split()
        subprocess.check_output(command)
        self.dup_file_name = self.caller.filter_duplicates(
            "graph_intervals", write_to_file="graph_intervals_filtered.tmp")
        self.assertEqualIntervalFiles(self.dup_file_name,
                                      "lin_intervals_dup.bed")

    def test_sample_pileup(self):
        """Compare the sample pileup with ``macs2 pileup``."""
        self.caller.sample_intervals = self.graph_intervals
        self.caller.create_sample_pileup(True)
        self._create_sample_pileup()
        self.assertPileupFilesEqual(self.caller._sample_track,
                                    "lin_sample_pileup.bdg")

    def test_control_pileup(self):
        """Compare the control pileup with the macs2 control pileup."""
        self.caller.control_intervals = self.graph_intervals
        self.caller.create_control(True)
        # _create_control() also sets self.background, used below.
        self._create_control()
        assert isinstance(self.caller._control_track, str)
        self.assertPileupFilesEqual(self.caller._control_track,
                                    "lin_control_pileup.bdg",
                                    min_value=self.background)

    def test_call_peaks(self):
        """Compare tracks, scores and final peaks with macs2 output files."""
        print("###############################################")
        print(self.graph.adj_list[222])
        print(self.graph.reverse_adj_list[222])
        self.assertPileupFilesEqual("control_track.bdg",
                                    "macstest_control_lambda.bdg")
        self.assertPileupFilesEqual("sample_track.bdg",
                                    "macstest_treat_pileup.bdg")
        self.caller._control_pileup = SparsePileup.from_bed_graph(
            self.graph, "control_track.bdg")
        self.caller._sample_pileup = SparsePileup.from_bed_graph(
            self.graph, "sample_track.bdg")
        self.caller.get_score()
        self._get_scores("qpois")
        self.assertPileupFilesEqual("q_values.bdg", "lin_scores.bdg")
        self._call_peaks()
        self.caller.call_peaks()
        self.assertEqualBedFiles("final_peaks.bed", "lin_peaks.bed")

    def neg_linear_to_graph_interval(self, lin_interval):
        """Convert a linear interval to a graph interval on negative nodes.

        The region paths are the negated node ids, listed from the node
        containing the linear end to the node containing the linear start.
        Offsets are measured from the end of each node.
        """
        start_rp = ((lin_interval.end - 1) // self.node_size + 1)
        end_rp = (lin_interval.start // self.node_size + 1)
        start_offset = start_rp * self.node_size - lin_interval.end
        end_offset = end_rp * self.node_size - lin_interval.start
        # start_offset = (-lin_interval.end) % self.node_size
        # end_offset = (-lin_interval.start) % self.node_size
        # start_rp = (lin_interval.end) // self.node_size + 1
        # end_rp = (-lin_interval.start) // self.node_size + 1
        rps = list(range(start_rp * -1, end_rp * -1 + 1))
        interval = DirectedInterval(start_offset, end_offset, rps,
                                    graph=self.graph)
        return interval

    def linear_to_graph_interval(self, lin_interval, is_control=None):
        """Convert a linear interval to a DirectedInterval on the graph.

        Intervals with direction -1 are delegated to
        ``neg_linear_to_graph_interval``.  ``is_control`` is not used.
        """
        if lin_interval.direction == -1:
            return self.neg_linear_to_graph_interval(lin_interval)
        start = lin_interval.start
        end = lin_interval.end
        # Node ids are 1-based; ``end`` is exclusive, hence ``end - 1``.
        start_rp = start // self.node_size + 1
        end_rp = (end - 1) // self.node_size + 1
        start_pos = Position(start_rp, start % self.node_size)
        end_pos = Position(end_rp, ((end - 1) % self.node_size) + 1)
        region_paths = list(range(start_rp, end_rp + 1))
        interval = DirectedInterval(start_pos, end_pos, region_paths,
                                    direction=lin_interval.direction,
                                    graph=self.graph)
        return interval

    def _convert_valued_interval(self, interval):
        """Shift a node-local interval to linear coordinates, in place."""
        true_id = abs(interval.node_id) - 1
        interval.start += self.node_size * true_id
        interval.end += self.node_size * true_id

    def graph_to_linear_pos(self, pos):
        """Return ``region_path_id * node_size + offset`` for a position."""
        return pos.region_path_id * self.node_size + pos.offset

    def graph_to_linear_interval(self, graph_interval):
        """Convert a graph interval to a SimpleInterval."""
        start = self.graph_to_linear_pos(graph_interval.start_position)
        end = self.graph_to_linear_pos(graph_interval.end_position)
        return SimpleInterval(start, end, graph_interval.direction)

    def assertEqualIntervals(self, linear_intervals, graph_intervals):
        """Assert that every converted graph interval is a linear interval.

        Only this one direction is checked; the length comparison is
        commented out.
        """
        graph_intervals = [
            self.graph_to_linear_interval(g_interval)
            for g_interval in graph_intervals
        ]
        #assert len(graph_intervals) == len(linear_intervals), \
        #    "%d != %d" % (len(graph_intervals), len(linear_intervals))
        for interval in graph_intervals:
            assert interval in linear_intervals

    def assertEqualIntervalFiles(self, graph_file, linear_file):
        """Read both interval files and compare their intervals."""
        graph_intervals = IntervalCollection.from_file(graph_file)
        linear_intervals = (SimpleInterval.from_file_line(line)
                            for line in open(linear_file).readlines())
        self.assertEqualIntervals(list(linear_intervals), graph_intervals)

    def _create_binary_track(self, intervals):
        """Return a boolean array that is True where an interval covers.

        ``None`` entries in ``intervals`` are skipped.
        """
        pileup = np.zeros(self.genome_size, dtype="bool")
        for interval in intervals:
            if interval is None:
                continue
            pileup[interval.start:interval.end] = True
        return pileup

    def assertEqualBedFiles(self, graph_file, linear_file):
        """Assert that two bed files cover exactly the same positions.

        Intervals of the graph file are shifted to linear coordinates
        before the comparison.
        """
        graph_intervals = [
            SimpleInterval.from_file_line(line)
            for line in open(graph_file).readlines()
        ]
        linear_intervals = [
            SimpleInterval.from_file_line(line)
            for line in open(linear_file).readlines()
        ]
        for graph_interval in graph_intervals:
            self._convert_valued_interval(graph_interval)
        pileup1 = self._create_binary_track(linear_intervals)
        pileup2 = self._create_binary_track(graph_intervals)
        indices = np.where(pileup1 != pileup2)[0]
        if not np.allclose(pileup1, pileup2):
            # Log where the tracks differ before the assert fails.
            logging.error(indices)
            logging.error("%s %s %s", indices[0],
                          indices[np.where(np.diff(indices) > 1)],
                          indices[-1])
        assert np.allclose(pileup1, pileup2)

    def _create_pileup(self, pileup_file, convert=False, limit=False,
                       min_value=None):
        """Read a valued-interval file into an array over the genome.

        Where intervals overlap the maximum value is kept.  With
        ``convert`` the intervals are first shifted to linear coordinates;
        with ``min_value`` the result is raised to at least that value.
        ``limit`` is not used.
        """
        pileup = np.zeros(self.genome_size)
        valued_intervals = (ValuedInterval.from_file_line(line)
                            for line in open(pileup_file).readlines())
        for interval in valued_intervals:
            if interval is None:
                continue
            if convert:
                self._convert_valued_interval(interval)
            pileup[interval.start:interval.end] = np.maximum(
                pileup[interval.start:interval.end], interval.value)
        if min_value is not None:
            pileup = np.maximum(pileup, min_value)
        return pileup

    def assertPileupFilesEqual(self, graph_file, linear_file, min_value=None):
        """Assert that two pileup files agree within a relative tolerance.

        ``min_value`` is applied to the linear pileup only.
        """
        assert isinstance(graph_file, str)
        assert isinstance(linear_file, str)
        linear_pileup = self._create_pileup(linear_file, min_value=min_value)
        graph_pileup = self._create_pileup(graph_file, convert=True)
        assert sum(graph_pileup) > 0
        # The second assignment wins; 0.05 is the tolerance actually used.
        rtol = 0.001
        rtol = 0.05
        if not np.allclose(linear_pileup, graph_pileup, rtol=rtol):
            different = np.abs(linear_pileup - graph_pileup) > rtol
            logging.error(different)
            logging.error(np.where(different))
            logging.error("Number of indices different")
            logging.error(len(np.where(different)[0]))
            # No absolute difference above rtol: accept and skip the assert.
            if not len(np.where(different)[0]):
                return
            logging.error("Differences:")
        assert np.allclose(linear_pileup, graph_pileup, rtol=rtol), \
            "Pileup in %s != pileup in %s" % (linear_file, graph_file)

    def _create_sample_pileup(self):
        """Run ``macs2 pileup`` to write ``lin_sample_pileup.bdg``."""
        command = "macs2 pileup -i %s -o %s --extsize %s -f BED" % (
            "lin_intervals.bed", "lin_sample_pileup.bdg",
            self.fragment_length - 1)
        logging.info(command)
        subprocess.check_output(command.split())

    def _get_scores(self, t="qpois"):
        """Run ``macs2 bdgcmp`` with method ``t`` to write ``lin_scores.bdg``."""
        command = "macs2 bdgcmp -t macstest_treat_pileup.bdg -c macstest_control_lambda.bdg -m %s -o lin_scores.bdg" % t
        # command = "macs2 bdgcmp -t macstest_treat_pileup.bdg -c macstest_control_lambda.bdg -m %s -o lin_scores.bdg" % t
        logging.info(command)
        subprocess.check_output(command.split())

    def _call_peaks(self):
        """Run ``macs2 bdgpeakcall`` to write ``lin_peaks.bed``."""
        threshold = -np.log10(0.05)
        command = "macs2 bdgpeakcall -i lin_scores.bdg -c %s -l %s -g %s -o lin_peaks.bed" % (
            threshold, self.info.fragment_length, self.read_length)
        logging.info(command)
        subprocess.check_output(command.split())

    def _create_control(self):
        """Build ``lin_control_pileup.bdg`` with macs2; set ``self.background``."""
        for ext in [2500]:
            # "-f BED" rides along inside the output-name argument; it
            # becomes separate arguments when the command is split.
            command = "macs2 pileup -i %s -o %s -B --extsize %s" % (
                "lin_intervals.bed", "lin_control_pileup%s.bdg -f BED" % ext,
                ext)
            subprocess.check_output(command.split())
            command = "macs2 bdgopt -i lin_control_pileup%s.bdg -m multiply -p %s -o lin_control_pileup%s.bdg" % (
                ext, (self.fragment_length - 1) / (ext * 2), ext)
            subprocess.check_output(command.split())
        # command = "macs2 bdgcmp -m max -t lin_control_pileup2500.bdg -c lin_control_pileup5000.bdg -o lin_control_pileup.bdg"
        # subprocess.check_output(command.split())
        self.background = self.n_intervals * self.info.fragment_length / self.genome_size
        logging.info(self.background)
        command = "macs2 bdgopt -i lin_control_pileup2500.bdg -m max -p %s -o lin_control_pileup.bdg" % self.background
        logging.info(command)
        subprocess.check_output(command.split())

    def write_intervals(self):
        """Write the linear intervals as bed and the graph intervals to file.

        Control intervals are written as well when ``self.with_control``
        is set.
        """
        f = open("lin_intervals.bed", "w")
        f.writelines(interval.to_file_line()
                     for interval in self.linear_intervals)
        f.close()
        logging.info("Wrote to lin_intervals.bed")
        graph_intervals = IntervalCollection(self.graph_intervals)
        graph_intervals.to_file(self.INTERVALS_NAME, True)
        if self.with_control:
            f = open("lin_intervals_control.bed", "w")
            f.writelines(interval.to_file_line()
                         for interval in self.linear_intervals_control)
            f.close()
            graph_intervals = IntervalCollection(self.graph_intervals_control)
            graph_intervals.to_file(self.CONTROL_NAME, True)
            graph_intervals.to_file(self.CONTROL_NAME + ".tmp", True)
        print("Wrote to graph_intervals")

    def create_linear_graph(self):
        """Create the chain graph, its snarl graph and its linear map.

        The graph is written to ``GRAPH_NAME`` and the map to ``MAP_NAME``.
        """
        nodes = {i + 1: Block(self.node_size) for i in range(0, self.n_nodes)}
        adj_list = {i: [i + 1] for i in range(1, self.n_nodes)}
        self.graph = GraphWithReversals(nodes, adj_list)
        self.graph.to_file(self.GRAPH_NAME)
        # One snarl spanning the first to the last node.
        snarlbuilder = SnarlGraphBuilder(self.graph,
                                         snarls={
                                             self.n_nodes + 2:
                                             SimpleSnarl(1,
                                                         self.n_nodes,
                                                         id=self.n_nodes + 2)
                                         },
                                         id_counter=self.n_nodes + 3)
        self.snarlgraph = snarlbuilder.build_snarl_graphs()
        self.linear_map = LinearSnarlMap.from_snarl_graph(
            self.snarlgraph, self.graph)
        self.linear_map.to_json_files(self.MAP_NAME)

    def _get_graph_interval(self, tmp_start, tmp_end, direction):
        """Build a DirectedInterval from linear start/end and a direction.

        For direction -1 the coordinates are negated and swapped first.
        """
        start = tmp_start
        end = tmp_end
        if direction == -1:
            start = -tmp_end
            end = -tmp_start
        start_rp = start // self.node_size
        end_rp = (end + 1) // self.node_size
        region_paths = list(range(start_rp, end_rp))
        start_pos = Position(start_rp, start % self.node_size)
        end_pos = Position(end_rp, (end % self.node_size) + 1)
        return DirectedInterval(start_pos, end_pos, region_paths,
                                direction=direction)

    def create_pairs_around_point(self, point, n=1):
        """Create up to ``n`` forward/reverse read pairs near ``point``.

        ``point`` is moved by a random offset in ``[-n, n]`` on every
        iteration and the moves accumulate.  Reads that would fall outside
        the genome are left out.
        """
        intervals = []
        for _ in range(n):
            offset = random.randint(-n, n)
            point = point + offset
            pos_start = point - self.fragment_length // 2
            pos_end = pos_start + self.read_length
            if pos_start > 0 and pos_end < self.genome_size:
                intervals.append(SimpleInterval(pos_start, pos_end, 1))
                assert pos_start >= 0 and pos_end >= 0
            neg_end = point + self.fragment_length // 2
            neg_start = neg_end - self.read_length
            if neg_end < self.genome_size and neg_start >= 0:
                intervals.append(SimpleInterval(neg_start, neg_end, -1))
                assert neg_start >= 0 and neg_end >= 0
        return intervals

    def create_random_linear_reads(self, n_reads, include_pairs=False):
        """Create reads clustered around random points of the genome.

        ``n_reads // self.peak_depth + 1`` points are drawn and
        ``self.peak_depth`` pairs are requested around each.
        ``include_pairs`` is not used.
        """
        logging.info("Creating %d linear reads" % n_reads)
        reads = []
        for i in range(n_reads // self.peak_depth + 1):
            logging.debug("Creating read %d" % i)
            point = random.randint(0, self.genome_size)
            reads.extend(
                self.create_pairs_around_point(point, n=self.peak_depth))
        return reads

    def _find_graph_size(self, intervals):
        """Return the largest linear position reached by any interval.

        That position is the interval end for direction -1 and
        ``start + fragment_length`` otherwise.
        """
        max_point = [
            interval.end if interval.direction == -1 else interval.start +
            self.fragment_length for interval in intervals
        ]
        return max(max_point)

    def create_intervals(self):
        """Create sample (and optionally control) linear and graph intervals.

        Updates ``self.n_intervals`` to the number of intervals actually
        created and sets ``self.n_intervals_control``.
        """
        self.linear_intervals = self.create_random_linear_reads(
            self.n_intervals, include_pairs=True)
        self.graph._size = self._find_graph_size(self.linear_intervals)
        # A reverse read ending exactly at the end of the genome.
        dummy_end = SimpleInterval(self.genome_size - self.read_length,
                                   self.genome_size, -1)
        self.linear_intervals.append(dummy_end)
        self.graph_intervals = [
            self.linear_to_graph_interval(i) for i in self.linear_intervals
        ]
        logging.debug(len(self.graph_intervals))
        self.n_intervals = len(self.linear_intervals)
        self.linear_intervals = sorted(self.linear_intervals,
                                       key=lambda x: (x.node_id, x.start))
        self.graph_intervals = sorted(
            self.graph_intervals,
            key=lambda x: (x.region_paths[0], x.start_position.offset))
        logging.info("Created %d intervals ", self.n_intervals)
        if self.with_control:
            self.linear_intervals_control = self.create_random_linear_reads(
                self.n_intervals, include_pairs=False)
            self.linear_intervals_control.append(dummy_end)
            self.graph_intervals_control = [
                self.linear_to_graph_interval(i, is_control=True)
                for i in self.linear_intervals_control
            ]
            self.n_intervals_control = len(self.linear_intervals_control)
            logging.info("Created %d control intervals ",
                         self.n_intervals_control)
        else:
            self.n_intervals_control = self.n_intervals

    def test_shift_estimation(self):
        """Compare estimated read/fragment length with ``macs2 predictd``.

        NOTE(review): CallPeaks is called here with file names and
        ``has_control``, unlike the call in ``_init_caller`` -- confirm
        that this test still matches the current CallPeaks interface.
        """
        self.setup()
        caller = CallPeaks("lin_graph.tmp",
                           "graph_intervals_filtered.tmp",
                           "graph_intervals_filtered.tmp",
                           has_control=False)
        caller.create_graph()
        info = ExperimentInfo.find_info(caller.ob_graph,
                                        caller.sample_file_name,
                                        caller.control_file_name)
        read_length_graph = info.read_length
        fragment_length_graph = info.fragment_length
        # Macs
        command = [
            "macs2", "predictd", "-i", "lin_intervals_dup.bed", "-g",
            str(self.genome_size), "-m", "5", "50"
        ]
        string_commmand = ' '.join(command)
        logging.info(string_commmand)
        output = subprocess.check_output(command, stderr=subprocess.STDOUT)
        output = output.decode("utf-8")
        logging.debug(output)
        # Parse tag size and fragment length from the macs2 output text.
        tag_size = re.search("tag size = ([0-9]+)", output).groups()[0]
        tag_size = int(tag_size)
        fragment_length = re.search("fragment length is ([0-9]+) bp",
                                    output).groups()[0]
        fragment_length = int(fragment_length)
        assert read_length_graph == tag_size, \
            "Read length from graph % d != %d (macs reads length)" % (read_length_graph, tag_size)
        assert fragment_length_graph == fragment_length

    def profile(self):
        """Run the whole caller (entry point for profiling)."""
        self.caller.run()

    def _run_whole_macs(self):
        """Run ``macs2 callpeak`` with output name prefix ``macstest``."""
        command = "macs2 callpeak -t lin_intervals.bed -f BED -g " + str(
            self.genome_size) + " --nomodel --extsize " + str(
                self.info.fragment_length
            ) + " -n macstest -B -q 0.05 --keep-dup all"
        if self.with_control:
            command += " --slocal=1000 -c lin_intervals_control.bed"
        logging.info("Macs command used: %s", command)
        command = command.split()
        output = subprocess.check_output(command, stderr=subprocess.STDOUT)
        output = output.decode("utf-8")
        logging.debug(output)

    def assertPeakSetsEqual(self, linear_peaks_file, graph_peaks_file):
        """Assert both peak files hold the same number of matching peaks."""
        # The linear path runs through every node of the chain graph.
        linear_path = DirectedInterval(0,
                                       self.node_size,
                                       list(range(1, self.n_nodes + 1)),
                                       graph=self.graph)
        comparer = PeaksComparer.create_from_graph_peaks_and_linear_peaks(
            linear_peaks_file,
            graph_peaks_file,
            self.graph,
            linear_path,
            graph_region=None)
        # for i, j in zip(sorted(comparer.peaks1.intervals, key=lambda x: x.region_paths[0]),
        #                 sorted(comparer.peaks2.intervals, key=lambda x: x.region_paths[0])):
        #     print(i, j)
        assert len(comparer.peaks1.intervals) == len(comparer.peaks2.intervals)
        matches = comparer.get_peaks_at_same_position()
        # for m in matches:
        #     print(m)
        assert len(matches) == len(comparer.peaks1.intervals)

    def test_whole_pipeline(self):
        """Run macs2 and the caller step by step; compare the peak sets."""
        self._run_whole_macs()
        # self.caller.create_graph()
        self.caller.sample_intervals = self.sample_intervals
        self.caller.control_intervals = self.control_intervals
        config = Configuration(save_tmp_results_to_file=True,
                               skip_filter_duplicates=True,
                               p_val_cutoff=0.05)
        self.caller.run_pre_callpeaks(has_control=self.with_control,
                                      experiment_info=self.info,
                                      linear_map=self.MAP_NAME,
                                      configuration=config)
        # self.assertPileupFilesEqual("sample_track.bdg",
        #                             "macstest_treat_pileup.bdg")
        # self.assertPileupFilesEqual("control_track.bdg",
        #                             "macstest_control_lambda.bdg")
        logging.info("################### GETTING SCORE")
        self.caller.get_p_values()
        self.caller.get_p_to_q_values_mapping()
        self.caller.get_q_values()
        logging.info("################### CALLING PEAKS")
        self.caller.call_peaks_from_q_values(experiment_info=self.info,
                                             config=config)
        # Cannot compare bedgraphs anymore, as graph pileup is not trimmed before maxpaths
        #self.assertEqualBedFiles("final_peaks.bed",
        #                         "macstest_peaks.narrowPeak")
        self.assertPeakSetsEqual("macstest_peaks.narrowPeak",
                                 "max_paths.intervalcollection")
        print("Success. All assertions passed")

    def test_final_tracks(self):
        """Run macs2 and the caller; compare the final peak bed files."""
        self._run_whole_macs()
        self.caller.run()
        self.assertEqualBedFiles("final_peaks.bed",
                                 "macstest_peaks.narrowPeak")
def setUp(self):
    """Create q-value pileups on a linear graph and assorted test graphs.

    The three pileups share node 2 and differ only in the indexes and
    values on node 1.  Also sets ``fragment_length`` to 6 and
    ``read_length`` to 2.
    """
    self.linear_graph = GraphWithReversals(
        {node_id: Block(10) for node_id in range(1, 5)},
        {node_id: [node_id + 1] for node_id in range(1, 4)})

    def pileup_on_linear_graph(indexes, values):
        # Node 2 is identical in all pileups; only node 1 varies.
        pileup = SparsePileup(self.linear_graph)
        pileup.data = {
            1: ValuedIndexes(indexes, values, 0, 10),
            2: ValuedIndexes([3], [0], 2, 10),
        }
        return pileup

    self.one_peak_q_values = pileup_on_linear_graph([5], [2])
    self.one_peak_with_hole = pileup_on_linear_graph([5, 8], [2, 0])
    self.one_peak_with_big_hole = pileup_on_linear_graph([5, 7], [2, 0])

    self.split_graph = GraphWithReversals(
        {node_id: Block(10) for node_id in range(1, 5)},
        {1: [2, 3], 2: [4], 3: [4]})

    # Seven nodes and no edges.
    self.split_graph_with_path_around = GraphWithReversals(
        {node_id: Block(10) for node_id in range(1, 8)}, {})

    self.graph_with_reversal = GraphWithReversals(
        {node_id: Block(10) for node_id in range(1, 4)},
        {1: [2], -3: [-2]})

    self.single_block_graph = GraphWithReversals({1: Block(20)}, {})

    self.multi_start_end_graph = GraphWithReversals(
        {node_id: Block(10) for node_id in range(1, 6)},
        {1: [3], 2: [3], 3: [4, 5]})

    # Nodes 10-14 all lead into node 15, which fans out to 16-19.
    junction_edges = {
        10: [15],
        11: [15],
        12: [15],
        13: [15],
        14: [15],
        15: [16, 17, 18, 19],
    }
    self.junction_graph = GraphWithReversals(
        {node_id: Block(5) for node_id in range(10, 20)}, junction_edges)

    self.fragment_length = 6
    self.read_length = 2
def setUp(self):
    """Create a graph, its hand-built snarl graph and a linear map file.

    The graph has 12 nodes of length 3, except node 3 which is given
    length 6 to match ``3: Block(6)`` in the snarl graph.  The linear
    snarl map is written to ``test_linear_map.tmp``.
    """
    self.graph = GraphWithReversals(
        {i: Block(3) for i in range(1, 13)},
        {
            11: [1],
            1: [2, 3],
            2: [7, 8],
            3: [6],
            6: [10],
            7: [9],
            8: [9],
            9: [10],
            10: [12],
        })
    # The block has to be replaced on the graph: this method never sets a
    # ``self.blocks`` attribute, so ``self.blocks[3]`` would fail here.
    self.graph.blocks[3] = Block(6)
    self.linear_length = 21

    # Innermost snarl: nodes 7 and 8 as alternatives between 2 and 9.
    inner_snarl = SnarlGraph(
        {
            7: Block(3),
            8: Block(3),
        },
        {
            2: [7, 8],
            7: [9],
            8: [9],
        },
        start_node=2,
        end_node=9)

    # Middle snarl between nodes 1 and 10, containing the inner snarl (22).
    middle_snarl = SnarlGraph(
        {
            3: Block(6),
            22: inner_snarl,
            2: Block(3),
            6: Block(3),
            9: Block(3),
        },
        {
            3: [6],
            2: [22],
            22: [9],
            1: [2, 3],
            6: [10],
            9: [10],
        },
        start_node=1,
        end_node=10)

    self.snarlgraph = SnarlGraph(
        {
            11: Block(3),
            12: Block(3),
            1: Block(3),
            10: Block(3),
            20: middle_snarl,
        },
        {
            11: [1],
            1: [20],
            20: [10],
            10: [12],
            13: [11],  # Dummy
            12: [14],  # Dummy
        },
        start_node=13,
        end_node=14)

    LinearSnarlMap.from_snarl_graph(
        self.snarlgraph, self.graph).to_json_files("test_linear_map.tmp")
from offsetbasedgraph import GraphWithReversals, Block def create_test_data(): simple_graph = """ { "node": [ {"id": 1, "sequence": "TTTCCCC"}, {"id": 2, "sequence": "TTTT"}, {"id": 3, "sequence": "CCCCTTT"} ], "edge": [ {"from": 1, "to": 2}, {"from": 2, "to": 3, "to_end": true} ] } """ f = open("simple_graph.json", "w") f.write(simple_graph.replace("\n", " ")) f.close() simple_graph = GraphWithReversals({ 1: Block(7), 2: Block(4), 3: Block(7) }, { 1: [2], 2: [-3] })