def check_unique(peak_file_name, motif_file_name, set_file_name, filter_unique):
    """Count 'successes' in a set file.

    A set-file entry is a success when its first count is at least its second
    (for "_dip." set files: when the first two counts combined are at least the
    third). If filter_unique is True, only entries whose motif shares a
    unique_id with a peak in peak_file_name are considered.
    Returns (successes, N).
    """
    is_dip = "_dip." in set_file_name
    peaks = PeakCollection.from_file(peak_file_name, True)
    ids = {peak.unique_id for peak in peaks}
    motifs = PeakCollection.from_file(motif_file_name, True)
    unique_motifs = {i for i, motif in enumerate(motifs)
                     if motif.unique_id in ids}
    parser = parse_dip_set_file if is_dip else parse_set_file
    counts = [count for i, count in enumerate(parser(set_file_name))
              if (not filter_unique) or (i in unique_motifs)]
    N = len(counts)
    print(N, motif_file_name)
    if is_dip:
        successes = len([count for count in counts
                         if (count[0] + count[1]) >= count[2]])
    else:
        successes = [count for count in counts if count[0] >= count[1]]
        print(successes)
        successes = len(successes)
    return successes, N
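# Hypothetical usage sketch for check_unique. The file names below are
# placeholders, not confirmed outputs of the pipeline:
#
#   hits, total = check_unique("chr1_peaks.intervalcollection",
#                              "chr1_motif_peaks.intervalcollection",
#                              "chr1_sets.txt",
#                              filter_unique=True)
#   print("%d of %d entries were successes" % (hits, total))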
@classmethod
def from_graph_peaks_in_fasta(cls, graph, vg_graph_json_file_name, chromosome,
                              fasta_file_name, regions_bed_file,
                              true_peaks_file):
    reads = PeakCollection.from_fasta_file(fasta_file_name, graph=graph)
    vg_graph = pyvg.vg.Graph.create_from_file(
        vg_graph_json_file_name, limit_to_chromosomes=chromosome)

    logging.info("Finding linear path")
    linear_path_file = "linear_path_%s.intervalcollection" % chromosome
    try:
        linear_path = obg.IntervalCollection.from_file(
            linear_path_file, text_file=True).intervals[0]
        linear_path = linear_path.to_indexed_interval()
    except FileNotFoundError:
        linear_path = create_linear_path(graph, vg_graph,
                                         path_name=chromosome,
                                         write_to_file=linear_path_file)
    linear_path.graph = graph

    filtered_reads = []

    # Convert BED regions to intervals in the graph
    logging.info("Converting regions to regions in graph")
    graph_regions = []
    bed_file = open(regions_bed_file)
    for line in bed_file:
        line = line.split()
        chrom = line[0]
        start = int(line[1])
        end = int(line[2])
        if chrom != "chr%s" % chromosome:
            logging.info("Skipping %s, %d, %d" % (chrom, start, end))
            continue
        graph_interval = linear_path.get_subinterval(start, end)
        graph_regions.append(graph_interval)

    assert len(graph_regions) > 0, \
        "Found no graph regions for chr %s" % chromosome
    graph_regions = PeakCollection(graph_regions)

    # Filter out reads not overlapping with the linear regions
    for read in reads:
        n_overlapping = graph_regions.get_overlapping_intervals(
            read, minimum_overlap=1)
        if n_overlapping:
            filtered_reads.append(read)

    logging.info("Found %d reads in graph regions" % len(filtered_reads))
    return cls(chromosome, reads, true_peaks_file)
def create_linear_peaks_from_bed(linear_sequence_fasta_file, peaks_bed_file,
                                 obg_graph_file_name, vg_graph_file_name,
                                 start_node, region):
    ob_graph = obg.GraphWithReversals.from_file(obg_graph_file_name)
    search_sequence = open(linear_sequence_fasta_file).read()
    sequence_retriever = SequenceRetriever.from_vg_graph(vg_graph_file_name)
    traverser = GraphTraverserUsingSequence(ob_graph, search_sequence,
                                            sequence_retriever)
    traverser.search_from_node(start_node)
    linear_path_interval = traverser.get_interval_found()
    IntervalCollection([linear_path_interval]).to_file(
        "linear_path.intervalcollection", text_file=True)
    print("Length")
    print(linear_path_interval.length())
    print(linear_path_interval.region_paths[0])
    print(linear_path_interval.start_position)
    print(linear_path_interval.end_position)

    linear_peaks = PeakCollection.create_from_linear_intervals_in_bed_file(
        obg_graph_file_name, linear_path_interval, peaks_bed_file,
        region.start, region.end)
    linear_peaks.to_file("linear_peaks.intervalcollection", text_file=True)
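# Hypothetical usage sketch for create_linear_peaks_from_bed. The file names,
# start node and LinearRegion coordinates are placeholders, not values from
# this repository:
#
#   region = LinearRegion("chr1", 0, 30000000)
#   create_linear_peaks_from_bed("chr1.fasta", "macs_peaks_chr1.bed",
#                                "chr1.obg", "chr1.vg",
#                                start_node=1, region=region)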
def __init__(self, graph, subgraphs_file_name, peaks_file_name):
    self.graph = graph
    IntervalCollection.interval_class = DirectedInterval
    self.subgraphs = SubgraphCollection.from_pickle(subgraphs_file_name,
                                                    graph=graph)
    self.peaks = PeakCollection.create_list_from_file(peaks_file_name,
                                                      graph=graph)
def compare_with_correct_peaks(self):
    correct_peaks = PeakCollection(self.correct_peaks)
    # for peak in correct_peaks:
    #     print(peak)
    found_peaks = PeakCollection.create_list_from_file(
        "max_paths.intervalcollection", graph=self.graph)
    # for i in found_peaks:
    #     print(i)
    matched = correct_peaks.get_identical_intervals(found_peaks)
    subgraphs = self.caller.q_value_peak_caller.peaks_as_subgraphs
    print("%d subgraphs" % len(subgraphs.subgraphs))
    print("%d correct peaks identically found, %.3f %%" %
          (len(matched), 100 * len(matched) / len(correct_peaks.intervals)))
def test_intervals_to_fasta_from_fasta(self):
    run_argument_parser([
        "create_ob_graph", "-o", "tests/testgraph.obg",
        "tests/vg_test_graph.json"
    ])
    PeakCollection([Peak(0, 2, [1, 2], score=3)]).to_file(
        "tests/testintervals.intervalcollection", text_file=True)
    run_argument_parser([
        "peaks_to_fasta", "tests/testgraph.obg.sequences",
        "tests/testintervals.intervalcollection", "tests/testsequences.fasta"
    ])
    collection = PeakCollection.from_fasta_file("tests/testsequences.fasta")
    self.assertEqual(len(collection.intervals), 1)
    self.assertEqual(collection.intervals[0].sequence.lower(), "tttcccctt")
def macs_to_graph_peaks(folder):
    for chrom in ["1", "2", "3", "4", "5"]:
        path = NumpyIndexedInterval.from_file(
            "/data/bioinf/tair2/" + chrom + "_linear_pathv2.interval")
        graph = Graph.from_file("/data/bioinf/tair2/" + chrom + ".nobg")
        macs_peaks = PeakCollection.from_fasta_file(
            folder + "/macs_sequences_chr%s_summits_unique.fasta" % chrom,
            graph)
        macs_peaks.to_file(
            folder + "/%s_macs_unique_graph_summits.intervalcollection" % chrom,
            True)
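# Hypothetical usage sketch (the argument is a placeholder for whichever
# directory holds the MACS2 summit FASTA files):
#
#   macs_to_graph_peaks("macs_output")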
def test_get_summits(self):
    qvalues = SparseValues(np.array([0]), np.array([3]))
    qvalues.track_size = 22
    qvalues.to_sparse_files("tests/test_qvalues")
    run_argument_parser([
        "create_ob_graph", "-o", "tests/testgraph.obg",
        "tests/vg_test_graph.json"
    ])
    max_paths = PeakCollection([Peak(0, 2, [1, 2], score=3)])
    PeakFasta(self.correct_sequence_graph).write_max_path_sequences(
        "tests/test_max_paths.fasta", max_paths)
    run_argument_parser([
        "get_summits", "-g", "tests/testgraph.obg",
        "tests/test_max_paths.fasta", "tests/test_qvalues", "2"
    ])
    result = PeakCollection.from_fasta_file("tests/test_max_paths_summits.fasta")
    self.assertEqual(result.intervals[0], Peak(2, 6, [1]))
    self.assertEqual(result.intervals[0].sequence.lower(), "tccc")
def check_peaks(self):
    peaks = PeakCollection.from_file(
        self.experiment_dir + "/not_matching_set1.intervals", self.graph)
    i = 0
    for peak in peaks:
        peak.graph = self.graph
        alignments = set(
            a.strip() for a in
            self.alignment_collection.get_alignments_on_interval(peak).keys())
        linear_peak = peak.to_linear_offsets2(self.linear_path)
        linear_alignments = set(
            a.qname for a in
            self.bam_file.fetch("5", linear_peak[0], linear_peak[1]))
        i += 1
        print(" ==== Peak %d === " % i)
        print(peak)
        print("Linear: %s" % str(linear_peak))
        print("%d alignments" % len(alignments))
        print("%d linear alignments" % len(linear_alignments))
        print("%d alignments in common" %
              len(alignments.intersection(linear_alignments)))
        print("%d linear not in graph" %
              len(linear_alignments.difference(alignments)))
        print("%d graph not in linear" %
              len(alignments.difference(linear_alignments)))
        print("Not aligned by linear:")
        print(alignments.difference(linear_alignments))
class TestPeakCollection(unittest.TestCase):
    def setUp(self):
        self.graph = Graph({i: Block(3) for i in range(1, 7)},
                           {i: [i + 1] for i in range(1, 6)})
        self.peaks = PeakCollection([
            Peak(3, 3, [1, 2, 3, 4], self.graph),
            Peak(3, 3, [5, 6], self.graph)
        ])

    def test_contains_interval(self):
        self.assertTrue(self.peaks.contains_interval(Peak(3, 3, [1, 2, 3, 4])))
        self.assertFalse(self.peaks.contains_interval(Peak(2, 3, [1, 2, 3, 4])))

    def test_get_similar_intervals(self):
        similar = self.peaks.get_similar_intervals(
            Peak(2, 3, [1, 2, 3, 4], self.graph), 1)
        self.assertTrue(len(similar) == 1)
        self.assertEqual(similar[0], self.peaks.intervals[0])

    def test_get_identical_intervals(self):
        identical = self.peaks.get_identical_intervals(
            PeakCollection([Peak(2, 3, [1, 2, 3, 4], self.graph)]))
        self.assertEqual(len(identical), 0)
        identical = self.peaks.get_identical_intervals(
            PeakCollection([Peak(3, 3, [1, 2, 3, 4], self.graph)]))
        self.assertEqual(len(identical), 1)

    def test_get_overlapping_intervals(self):
        overlapping = self.peaks.get_overlapping_intervals(
            Peak(3, 3, [1, 2], self.graph))
        self.assertTrue(len(overlapping), 1)
        overlapping = self.peaks.get_overlapping_intervals(
            Peak(3, 3, [1, 6], self.graph))
        self.assertTrue(len(overlapping), 2)

    def test_approx_contains(self):
        peaks = PeakCollection(
            [Peak(3, 3, [1, 2, 3, 4]), Peak(3, 3, [-10, 11])])
        peaks.create_node_index()
        self.assertTrue(
            peaks.approx_contains_part_of_interval(Peak(1, 2, [1])))
        self.assertTrue(
            peaks.approx_contains_part_of_interval(Peak(1, 2, [10])))
        self.assertFalse(
            peaks.approx_contains_part_of_interval(Peak(1, 2, [100])))

    def test_create_from_nongraphpeakcollection(self):
        graph = Graph({1: Block(10), 2: Block(10), 3: Block(10)},
                      {1: [2], 2: [3]})
        graph.convert_to_numpy_backend()
        linear_path = Interval(0, 10, [1, 2, 3], graph)
        linear_path = linear_path.to_numpy_indexed_interval()
        nongraph_peaks = NonGraphPeakCollection([
            NonGraphPeak("chr1", 3, 10, 5),
            NonGraphPeak("chr1", 13, 15, 7),
        ])
        peaks = PeakCollection.create_from_nongraph_peak_collection(
            graph, nongraph_peaks, linear_path, None)
        self.assertEqual(peaks.intervals[0], Interval(3, 10, [1]))
        self.assertEqual(peaks.intervals[1], Interval(3, 5, [2]))

        peaks = PeakCollection.create_from_nongraph_peak_collection(
            graph, nongraph_peaks, linear_path, LinearRegion("chr1", 3, 20))
        self.assertEqual(peaks.intervals[0], Interval(0, 7, [1]))
        self.assertEqual(peaks.intervals[1], Interval(0, 2, [2]))

    def test_convert_to_approx_linear_peaks(self):
        graph = Graph({i: Block(3) for i in range(1, 10)},
                      {1: [2], 2: [3], 3: [4], 4: [5],
                       5: [6], 6: [7, 8], 7: [9], 9: [9]})
        graph.convert_to_numpy_backend()
        linear_interval = Interval(0, 3, [2, 4, 8, 9], graph)
        linear_interval = linear_interval.to_numpy_indexed_interval()
        peaks = PeakCollection([Peak(2, 2, [2, 3, 4]), Peak(1, 1, [3, 4, 5])])
        linear_peaks = peaks.to_approx_linear_peaks(linear_interval, "chr4")
        linear_peaks = linear_peaks.peaks
        print(linear_peaks)
        self.assertEqual(linear_peaks[0], NonGraphPeak("chr4", 2, 5))
        self.assertEqual(linear_peaks[1], NonGraphPeak("chr4", 3, 3))
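# Standard unittest entry point so the test class above can be run directly
# (assumes unittest is imported at the top of this test module):
if __name__ == "__main__":
    unittest.main()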
import sys
import numpy as np
from offsetbasedgraph import Graph, NumpyIndexedInterval
from graph_peak_caller.peakcollection import PeakCollection
# SparseValues, SparseMaxPaths and load_variant_maps are also needed here;
# their module paths within graph_peak_caller are not shown in this snippet.

chrom = sys.argv[1]
fragment_length = int(sys.argv[2])

ref = NumpyIndexedInterval.from_file(
    "/data/bioinf/tair2/" + chrom + "_linear_pathv2.interval")
graph = Graph.from_file("/data/bioinf/tair2/" + chrom + ".nobg")
direct = SparseValues.from_sparse_files(chrom + "_direct_pileup")
filtered_peaks = SparseValues.from_sparse_files(chrom + "_hole_cleaned")
# q-value pileup (assumed file-name pattern, matching the sparse files above)
q_values = SparseValues.from_sparse_files(chrom + "_qvalues")
variant_map = load_variant_maps(chrom, "/data/bioinf/tair2/")

max_paths, sub_graphs = SparseMaxPaths(
    filtered_peaks, graph, direct, ref, variant_map).run()

long_max_paths = [path for path in max_paths
                  if path.length() >= fragment_length]

for max_path in long_max_paths:
    assert max_path.length() > 0, \
        "Max path %s has non-positive length" % max_path
    score = np.max(q_values.get_interval_values(max_path))
    max_path.set_score(score)
    assert not np.isnan(score), "Score %s is nan" % score

PeakCollection(long_max_paths).to_file(
    chrom + "_max_paths.intervalcollection", text_file=True)

from graph_peak_caller.peakfasta import PeakFasta
from offsetbasedgraph import SequenceGraph

seqgraph = SequenceGraph.from_file(
    "/data/bioinf/tair2/" + chrom + ".nobg.sequences")
PeakFasta(seqgraph).write_max_path_sequences(
    chrom + "_sequences.fasta", long_max_paths)
        if touching:
            visited.add(touching[0].unique_id)
            mapping[touching[0].unique_id] = peak.unique_id
    return mapping


out_file = open("motif_summary_graph_matching_macs.tsv", "w")
for chrom in ["1", "2", "3", "4", "5"]:
    logging.info("Chromosome %s" % chrom)
    path = NumpyIndexedInterval.from_file(
        "/data/bioinf/tair2/" + chrom + "_linear_pathv2.interval")
    graph = Graph.from_file("/data/bioinf/tair2/" + chrom + ".nobg")

    macs_peaks = NonGraphPeakCollection.from_fasta(
        "macs_sequences_chr" + chrom + "_summits.fasta")
    macs_peaks = PeakCollection.create_from_nongraph_peak_collection(
        graph, macs_peaks, path)
    macs_peaks.create_node_index()

    graph_peaks = PeakCollection.from_fasta_file(
        chrom + "_sequences_summits.fasta")
    graph_peaks.create_node_index()

    macs_motif_matches = set(
        line.split("\t")[2]
        for line in open("fimo_macs_chr" + chrom + "/fimo.txt")
        if not line.startswith("#"))
    graph_motif_matches = set(
        line.split("\t")[2]
        for line in open("fimo_graph_chr" + chrom + "/fimo.txt")
        if not line.startswith("#"))
import sys
from offsetbasedgraph import Graph, NumpyIndexedInterval
from graph_peak_caller.analysis.nongraphpeaks import NonGraphPeakCollection
from graph_peak_caller.peakcollection import PeakCollection

for chrom in sys.argv[1].split(","):
    linear_path = NumpyIndexedInterval.from_file(
        "/data/bioinf/tair2/" + chrom + "_linear_pathv2.interval")
    graph = Graph.from_file("/data/bioinf/tair2/" + chrom + ".nobg")
    print("Chrom " + chrom)

    peaks = NonGraphPeakCollection.from_bed_file(
        "macs_peaks_chr" + chrom + ".bed", 60)
    print(len(peaks.peaks))

    graph_peaks = PeakCollection.create_from_nongraph_peak_collection(
        graph, peaks, linear_path)
    graph_peaks.to_file(chrom + "_macs_all_summits.intervalcollection",
                        text_file=True)
    print("Wrote to " + chrom + "_macs_all_summits.intervalcollection")
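# Example invocation (the script name is a placeholder for this file):
#
#   python macs_peaks_to_graph.py 1,2,3,4,5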