def check_unique(peak_file_name, motif_file_name, set_file_name,
                 filter_unique):
    """Count the entries of a set file that qualify as successes.

    The peaks and the motifs are both loaded with
    ``PeakCollection.from_file(<name>, True)``. The position of every motif
    whose ``unique_id`` also occurs among the peaks is remembered. When
    ``filter_unique`` is truthy, only the set-file entries at those positions
    are kept; otherwise all entries are kept.

    The set file is read with ``parse_dip_set_file`` when its name contains
    ``"_dip."`` and with ``parse_set_file`` otherwise. An entry is a success
    when ``count[0] + count[1] >= count[2]`` (dip files) or
    ``count[0] >= count[1]`` (other files).

    Prints the number of kept entries together with the motif file name, and
    for non-dip files also the list of successful entries.

    Returns:
        Tuple ``(successes, N)``: number of successes and number of kept
        entries.
    """
    is_dip = "_dip." in set_file_name

    peak_ids = set()
    for peak in PeakCollection.from_file(peak_file_name, True):
        peak_ids.add(peak.unique_id)

    unique_motifs = set()
    motifs = PeakCollection.from_file(motif_file_name, True)
    for index, motif in enumerate(motifs):
        if motif.unique_id in peak_ids:
            unique_motifs.add(index)

    if is_dip:
        parser = parse_dip_set_file
    else:
        parser = parse_set_file

    counts = []
    for index, count in enumerate(parser(set_file_name)):
        # Skip only when filtering is requested and the motif is not shared.
        if filter_unique and index not in unique_motifs:
            continue
        counts.append(count)

    N = len(counts)
    print(N, motif_file_name)

    if is_dip:
        successes = sum(
            1 for count in counts if (count[0] + count[1]) >= count[2])
    else:
        hits = [count for count in counts if count[0] >= count[1]]
        print(hits)
        successes = len(hits)
    return successes, N
    def test_create_from_nongraphpeakcollection(self):
        """Convert linear peaks to graph peaks, with and without a region.

        Two linear peaks on "chr1" are converted with
        ``PeakCollection.create_from_nongraph_peak_collection``, first with
        ``None`` as the last argument and then with
        ``LinearRegion("chr1", 3, 20)``. The first two resulting intervals
        are compared with the expected graph intervals.
        """
        blocks = {node_id: Block(10) for node_id in (1, 2, 3)}
        edges = {1: [2], 2: [3]}
        graph = Graph(blocks, edges)
        graph.convert_to_numpy_backend()
        linear_path = Interval(0, 10, [1, 2, 3], graph)
        linear_path = linear_path.to_numpy_indexed_interval()

        linear_peaks = [
            NonGraphPeak("chr1", 3, 10, 5),
            NonGraphPeak("chr1", 13, 15, 7),
        ]
        nongraph_peaks = NonGraphPeakCollection(linear_peaks)

        # Each case: (last argument of the conversion, expected intervals).
        cases = (
            (None, (Interval(3, 10, [1]), Interval(3, 5, [2]))),
            (LinearRegion("chr1", 3, 20),
             (Interval(0, 7, [1]), Interval(0, 2, [2]))),
        )
        for region, expected in cases:
            peaks = PeakCollection.create_from_nongraph_peak_collection(
                graph, nongraph_peaks, linear_path, region)
            for index, interval in enumerate(expected):
                self.assertEqual(peaks.intervals[index], interval)
    def test_get_identical_intervals(self):
        """Check ``get_identical_intervals`` against two one-peak collections.

        A peak that differs from the fixture peak only in its start offset
        must give no identical intervals; an exact copy must give one.
        """
        shifted = PeakCollection([Peak(2, 3, [1, 2, 3, 4], self.graph)])
        self.assertEqual(len(self.peaks.get_identical_intervals(shifted)), 0)

        exact_copy = PeakCollection([Peak(3, 3, [1, 2, 3, 4], self.graph)])
        self.assertEqual(
            len(self.peaks.get_identical_intervals(exact_copy)), 1)
Beispiel #4
0
    def from_graph_peaks_in_fasta(cls, graph, vg_graph_json_file_name,
                                  chromosome, fasta_file_name,
                                  regions_bed_file, true_peaks_file):
        """Create an instance from graph peaks stored in a fasta file.

        Steps:
          1. Read the peaks ("reads") from ``fasta_file_name``.
          2. Load the linear path for ``chromosome`` from
             ``linear_path_<chromosome>.intervalcollection``. When that file
             does not exist, build the path with ``create_linear_path``,
             which is asked to write it to the same file name.
          3. Turn every line of ``regions_bed_file`` whose first column is
             ``chr<chromosome>`` into a subinterval of the linear path.
          4. Count (and log) the reads overlapping at least one region.

        Args:
            graph: graph the reads and the linear path refer to.
            vg_graph_json_file_name: file passed to
                ``pyvg.vg.Graph.create_from_file``.
            chromosome: chromosome name without the ``chr`` prefix.
            fasta_file_name: fasta file holding the reads.
            regions_bed_file: whitespace-separated file with chromosome,
                start and end in its first three columns.
            true_peaks_file: forwarded unchanged to the constructor.

        Returns:
            ``cls(chromosome, reads, true_peaks_file)``.

        Raises:
            AssertionError: if no line of the bed file matches the
                chromosome.
        """
        reads = PeakCollection.from_fasta_file(fasta_file_name, graph=graph)
        vg_graph = pyvg.vg.Graph.create_from_file(
            vg_graph_json_file_name, limit_to_chromosomes=chromosome)
        logging.info("Finding linear path")

        linear_path_file = "linear_path_%s.intervalcollection" % chromosome
        try:
            linear_path = obg.IntervalCollection.from_file(
                linear_path_file, text_file=True).intervals[0]
            linear_path = linear_path.to_indexed_interval()
        except FileNotFoundError:
            linear_path = create_linear_path(graph,
                                             vg_graph,
                                             path_name=chromosome,
                                             write_to_file=linear_path_file)

        linear_path.graph = graph

        # Convert regions to intervals in graph
        logging.info("Converting regions to regions in graph")
        graph_regions = []
        # The context manager closes the bed file even if a line fails to
        # parse (the handle used to be left open).
        with open(regions_bed_file) as bed_file:
            for line in bed_file:
                print(line)
                fields = line.split()
                chrom = fields[0]
                start = int(fields[1])
                end = int(fields[2])

                if chrom != "chr%s" % chromosome:
                    logging.info("Skipping %s, %d, %d", chrom, start, end)
                    continue

                graph_regions.append(linear_path.get_subinterval(start, end))

        # `%s` instead of `%d`: chromosome is formatted as a string
        # everywhere else, and `%d` raised TypeError for string values
        # instead of reporting the failed assertion.
        assert len(graph_regions) > 0, \
            " Found no graph regions for chr %s" % chromosome
        graph_regions = PeakCollection(graph_regions)

        # Filter out reads not overlapping with linear regions
        filtered_reads = [
            read for read in reads
            if graph_regions.get_overlapping_intervals(read,
                                                       minimum_overlap=1)
        ]

        logging.info("Found %d reads in graph regions" % len(filtered_reads))

        # NOTE(review): the unfiltered `reads` are passed on, so
        # `filtered_reads` only affects the log line above. Kept as in the
        # original; confirm whether `filtered_reads` was intended here.
        return cls(chromosome, reads, true_peaks_file)
    def setUp(self):
        """Build the shared fixture: a six-node chain graph and two peaks.

        Nodes 1..6 are blocks of length 3 and every node i < 6 has an edge
        to node i + 1.
        """
        blocks = {}
        edges = {}
        for node_id in range(1, 7):
            blocks[node_id] = Block(3)
            if node_id < 6:
                edges[node_id] = [node_id + 1]
        self.graph = Graph(blocks, edges)

        first_peak = Peak(3, 3, [1, 2, 3, 4], self.graph)
        second_peak = Peak(3, 3, [5, 6], self.graph)
        self.peaks = PeakCollection([first_peak, second_peak])
def create_linear_peaks_from_bed(linear_sequence_fasta_file, peaks_bed_file,
                                 obg_graph_file_name, vg_graph_file_name,
                                 start_node, region):
    """Find a linear path in the graph and write bed peaks placed on it.

    The whole content of ``linear_sequence_fasta_file`` is used as the search
    sequence for a ``GraphTraverserUsingSequence`` started at ``start_node``.
    The interval it finds is written to ``linear_path.intervalcollection``
    and a few of its properties are printed. The peaks in ``peaks_bed_file``
    are then converted with
    ``PeakCollection.create_from_linear_intervals_in_bed_file`` and written
    to ``linear_peaks.intervalcollection``.

    Args:
        linear_sequence_fasta_file: file whose full text is the sequence to
            search for.
        peaks_bed_file: bed file with the linear peaks.
        obg_graph_file_name: file read by ``obg.GraphWithReversals.from_file``.
        vg_graph_file_name: file read by ``SequenceRetriever.from_vg_graph``.
        start_node: node the sequence search starts from.
        region: object whose ``start`` and ``end`` attributes are forwarded
            to the bed conversion.

    Returns:
        None. The results are written to the two files named above.
    """
    ob_graph = obg.GraphWithReversals.from_file(obg_graph_file_name)
    # Read inside a context manager so the file handle is closed right away
    # (a bare open(...).read() left it open).
    with open(linear_sequence_fasta_file) as sequence_file:
        search_sequence = sequence_file.read()
    sequence_retriever = SequenceRetriever.from_vg_graph(vg_graph_file_name)
    traverser = GraphTraverserUsingSequence(ob_graph, search_sequence,
                                            sequence_retriever)
    traverser.search_from_node(start_node)
    linear_path_interval = traverser.get_interval_found()
    IntervalCollection([linear_path_interval
                        ]).to_file("linear_path.intervalcollection",
                                   text_file=True)
    print("Length")
    print(linear_path_interval.length())
    print(linear_path_interval.region_paths[0])
    print(linear_path_interval.start_position)
    print(linear_path_interval.end_position)

    # The graph is passed by file name here, not as the loaded `ob_graph`.
    linear_peaks = PeakCollection.create_from_linear_intervals_in_bed_file(
        obg_graph_file_name, linear_path_interval, peaks_bed_file,
        region.start, region.end)

    linear_peaks.to_file("linear_peaks.intervalcollection", text_file=True)
 def __init__(self, graph, subgraphs_file_name, peaks_file_name):
     """Load the subgraphs and the peaks that belong to ``graph``.

     Subgraphs come from ``SubgraphCollection.from_pickle`` and peaks from
     ``PeakCollection.create_list_from_file``; both receive ``graph``.
     """
     # This assigns a class attribute, so it affects IntervalCollection
     # everywhere, not only this instance. It is done before the loads below.
     IntervalCollection.interval_class = DirectedInterval
     self.graph = graph

     loaded_subgraphs = SubgraphCollection.from_pickle(
         subgraphs_file_name, graph=graph)
     self.subgraphs = loaded_subgraphs

     loaded_peaks = PeakCollection.create_list_from_file(
         peaks_file_name, graph=graph)
     self.peaks = loaded_peaks
Beispiel #8
0
    def compare_with_correct_peaks(self):
        """Print how many of the expected peaks were found identically.

        The expected peaks are ``self.correct_peaks``; the found peaks are
        read from ``max_paths.intervalcollection``. Also prints the number
        of subgraphs held by the q-value peak caller.
        """
        expected = PeakCollection(self.correct_peaks)
        found = PeakCollection.create_list_from_file(
            "max_paths.intervalcollection", graph=self.graph)
        identical = expected.get_identical_intervals(found)

        peak_subgraphs = self.caller.q_value_peak_caller.peaks_as_subgraphs
        print("%d subgraphs" % len(peak_subgraphs.subgraphs))

        n_identical = len(identical)
        percent = 100 * n_identical / len(expected.intervals)
        print("%d correct peaks identically found, %3.f %% " %
              (n_identical, percent))
    def test_intervals_to_fasta_from_fasta(self):
        """Write one peak, convert it to fasta via the CLI and read it back.

        Runs the ``create_ob_graph`` and ``peaks_to_fasta`` commands through
        ``run_argument_parser`` and checks that the fasta file yields exactly
        one interval with the expected sequence.
        """
        obg_file = "tests/testgraph.obg"
        intervals_file = "tests/testintervals.intervalcollection"
        fasta_file = "tests/testsequences.fasta"

        run_argument_parser(
            ["create_ob_graph", "-o", obg_file, "tests/vg_test_graph.json"])

        written = PeakCollection([Peak(0, 2, [1, 2], score=3)])
        written.to_file(intervals_file, text_file=True)

        run_argument_parser([
            "peaks_to_fasta", "tests/testgraph.obg.sequences",
            intervals_file, fasta_file
        ])

        read_back = PeakCollection.from_fasta_file(fasta_file).intervals
        self.assertEqual(len(read_back), 1)
        self.assertEqual(read_back[0].sequence.lower(), "tttcccctt")
Beispiel #10
0
def macs_to_graph_peaks(folder):
    """Convert the MACS summit fasta files in ``folder`` to graph peaks.

    For each chromosome "1" to "5", the graph is loaded from
    ``/data/bioinf/tair2/``, the peaks are read from
    ``<folder>/macs_sequences_chr<chrom>_summits_unique.fasta`` and written
    to ``<folder>/<chrom>_macs_unique_graph_summits.intervalcollection``.
    """
    for chrom in ("1", "2", "3", "4", "5"):
        # The linear path is still loaded as before, but its value is not
        # used by anything in this function.
        NumpyIndexedInterval.from_file("/data/bioinf/tair2/" + chrom +
                                       "_linear_pathv2.interval")
        graph = Graph.from_file("/data/bioinf/tair2/" + chrom + ".nobg")

        fasta_name = "/macs_sequences_chr%s_summits_unique.fasta" % chrom
        macs_peaks = PeakCollection.from_fasta_file(folder + fasta_name,
                                                    graph)

        out_name = "/%s_macs_unique_graph_summits.intervalcollection" % chrom
        macs_peaks.to_file(folder + out_name, True)
    def test_get_summits(self):
        """Run the ``get_summits`` command on one max path.

        Writes sparse q-values and a max-path fasta file, invokes
        ``create_ob_graph`` and ``get_summits`` through
        ``run_argument_parser`` and checks the first interval read from
        ``tests/test_max_paths_summits.fasta``.
        """
        qvalues_base = "tests/test_qvalues"
        obg_file = "tests/testgraph.obg"
        max_paths_fasta = "tests/test_max_paths.fasta"

        qvalues = SparseValues(np.array([0]), np.array([3]))
        qvalues.track_size = 22
        qvalues.to_sparse_files(qvalues_base)

        run_argument_parser(
            ["create_ob_graph", "-o", obg_file, "tests/vg_test_graph.json"])

        max_paths = PeakCollection([Peak(0, 2, [1, 2], score=3)])
        fasta_writer = PeakFasta(self.correct_sequence_graph)
        fasta_writer.write_max_path_sequences(max_paths_fasta, max_paths)

        run_argument_parser(
            ["get_summits", "-g", obg_file, max_paths_fasta, qvalues_base,
             "2"])

        summits = PeakCollection.from_fasta_file(
            "tests/test_max_paths_summits.fasta")
        first_summit = summits.intervals[0]
        self.assertEqual(first_summit, Peak(2, 6, [1]))
        self.assertEqual(first_summit.sequence.lower(), "tccc")
    def test_convert_to_approx_linear_peaks(self):
        """Convert two graph peaks to linear peaks along a linear path.

        The linear path runs through nodes 2, 4, 8 and 9. The peaks are
        converted with ``to_approx_linear_peaks`` for chromosome "chr4" and
        compared with the expected ``NonGraphPeak`` objects.
        """
        blocks = {node_id: Block(3) for node_id in range(1, 10)}
        edges = {node_id: [node_id + 1] for node_id in range(1, 6)}
        # NOTE(review): node 9 points to itself and node 8 has no outgoing
        # edge; possibly `8: [9]` was meant -- confirm before changing.
        edges.update({6: [7, 8], 7: [9], 9: [9]})
        graph = Graph(blocks, edges)
        graph.convert_to_numpy_backend()

        linear_interval = Interval(0, 3, [2, 4, 8, 9], graph)
        linear_interval = linear_interval.to_numpy_indexed_interval()

        graph_peaks = [Peak(2, 2, [2, 3, 4]), Peak(1, 1, [3, 4, 5])]
        peaks = PeakCollection(graph_peaks)
        linear_peaks = peaks.to_approx_linear_peaks(linear_interval,
                                                    "chr4").peaks
        print(linear_peaks)

        expected = (NonGraphPeak("chr4", 2, 5), NonGraphPeak("chr4", 3, 3))
        for index, expected_peak in enumerate(expected):
            self.assertEqual(linear_peaks[index], expected_peak)
    def test_approx_contains(self):
        """Check ``approx_contains_part_of_interval`` after indexing nodes.

        Queries on node 1 and node 10 must be reported as contained (the
        collection holds node -10, so presumably direction is ignored); a
        query on node 100 must not.
        """
        collection = PeakCollection([
            Peak(3, 3, [1, 2, 3, 4]),
            Peak(3, 3, [-10, 11]),
        ])
        collection.create_node_index()
        contains_part = collection.approx_contains_part_of_interval

        self.assertTrue(contains_part(Peak(1, 2, [1])))
        self.assertTrue(contains_part(Peak(1, 2, [10])))
        self.assertFalse(contains_part(Peak(1, 2, [100])))
Beispiel #14
0
 def check_peaks(self):
     """Print, per peak, how graph alignments compare with linear ones.

     Peaks are read from ``<experiment_dir>/not_matching_set1.intervals``.
     For each peak the alignment names found on the graph interval are
     compared with the query names fetched from ``self.bam_file`` on
     reference "5" over the peak's linear offsets.
     """
     peaks = PeakCollection.from_file(
         self.experiment_dir + "/not_matching_set1.intervals", self.graph)

     for number, peak in enumerate(peaks, start=1):
         peak.graph = self.graph
         on_graph = self.alignment_collection.get_alignments_on_interval(peak)
         graph_names = {name.strip() for name in on_graph.keys()}

         linear_peak = peak.to_linear_offsets2(self.linear_path)
         fetched = self.bam_file.fetch("5", linear_peak[0], linear_peak[1])
         linear_names = {alignment.qname for alignment in fetched}

         only_on_graph = graph_names - linear_names
         print(" ==== Peak %d === " % number)
         print(peak)
         print("Linear: %s" % str(linear_peak))
         print("%d alignments" % len(graph_names))
         print("%d linear alignments" % len(linear_names))
         print("%d alignments in common" % len(graph_names & linear_names))
         print("%d linear not in graph" % len(linear_names - graph_names))
         print("%d graph not in linear" % len(only_on_graph))
         print("Not aligned by linear:")
         print(only_on_graph)
class TestPeakCollection(unittest.TestCase):
    """Unit tests for ``PeakCollection`` queries and conversions."""

    def setUp(self):
        """Create a six-node chain graph (blocks of length 3) and two peaks."""
        self.graph = Graph({i: Block(3)
                            for i in range(1, 7)},
                           {i: [i + 1]
                            for i in range(1, 6)})
        self.peaks = PeakCollection([
            Peak(3, 3, [1, 2, 3, 4], self.graph),
            Peak(3, 3, [5, 6], self.graph)
        ])

    def test_contains_interval(self):
        """An exact copy is contained; a peak with another start is not."""
        self.assertTrue(self.peaks.contains_interval(Peak(3, 3, [1, 2, 3, 4])))
        self.assertFalse(self.peaks.contains_interval(Peak(2, 3,
                                                           [1, 2, 3, 4])))

    def test_get_similar_intervals(self):
        """A peak whose start differs by one is similar with allowance 1."""
        similar = self.peaks.get_similar_intervals(
            Peak(2, 3, [1, 2, 3, 4], self.graph), 1)
        # assertEqual reports both values on failure, unlike assertTrue(==).
        self.assertEqual(len(similar), 1)
        self.assertEqual(similar[0], self.peaks.intervals[0])

    def test_get_identical_intervals(self):
        """A shifted peak gives no identical intervals; a copy gives one."""
        identical = self.peaks.get_identical_intervals(
            PeakCollection([Peak(2, 3, [1, 2, 3, 4], self.graph)]))
        self.assertEqual(len(identical), 0)

        identical = self.peaks.get_identical_intervals(
            PeakCollection([Peak(3, 3, [1, 2, 3, 4], self.graph)]))
        self.assertEqual(len(identical), 1)

    def test_get_overlapping_intervals(self):
        """Check the number of fixture peaks overlapping a query peak."""
        # These assertions used assertTrue(len(overlapping), N). assertTrue
        # treats its second argument as the failure message, so the expected
        # count was never compared. assertEqual performs the comparison.
        overlapping = self.peaks.get_overlapping_intervals(
            Peak(3, 3, [1, 2], self.graph))
        self.assertEqual(len(overlapping), 1)

        overlapping = self.peaks.get_overlapping_intervals(
            Peak(3, 3, [1, 6], self.graph))
        self.assertEqual(len(overlapping), 2)

    def test_approx_contains(self):
        """Queries on nodes 1 and 10 are contained; node 100 is not."""
        peaks = PeakCollection(
            [Peak(3, 3, [1, 2, 3, 4]),
             Peak(3, 3, [-10, 11])])
        peaks.create_node_index()

        self.assertTrue(peaks.approx_contains_part_of_interval(Peak(1, 2,
                                                                    [1])))

        self.assertTrue(
            peaks.approx_contains_part_of_interval(Peak(1, 2, [10])))

        self.assertFalse(
            peaks.approx_contains_part_of_interval(Peak(1, 2, [100])))

    def test_create_from_nongraphpeakcollection(self):
        """Convert linear peaks to graph peaks, with and without a region."""
        graph = Graph({
            1: Block(10),
            2: Block(10),
            3: Block(10)
        }, {
            1: [2],
            2: [3]
        })
        graph.convert_to_numpy_backend()
        linear_path = Interval(0, 10, [1, 2, 3], graph)
        linear_path = linear_path.to_numpy_indexed_interval()

        nongraph_peaks = NonGraphPeakCollection([
            NonGraphPeak("chr1", 3, 10, 5),
            NonGraphPeak("chr1", 13, 15, 7),
        ])

        # Without a region (None as last argument).
        peaks = PeakCollection.create_from_nongraph_peak_collection(
            graph, nongraph_peaks, linear_path, None)

        self.assertEqual(peaks.intervals[0], Interval(3, 10, [1]))
        self.assertEqual(peaks.intervals[1], Interval(3, 5, [2]))

        # With a linear region starting at 3: expected offsets are 3 lower.
        peaks = PeakCollection.create_from_nongraph_peak_collection(
            graph, nongraph_peaks, linear_path, LinearRegion("chr1", 3, 20))
        self.assertEqual(peaks.intervals[0], Interval(0, 7, [1]))
        self.assertEqual(peaks.intervals[1], Interval(0, 2, [2]))

    def test_convert_to_approx_linear_peaks(self):
        """Convert two graph peaks to linear peaks along nodes 2, 4, 8, 9."""
        graph = Graph({i: Block(3)
                       for i in range(1, 10)}, {
                           1: [2],
                           2: [3],
                           3: [4],
                           4: [5],
                           5: [6],
                           6: [7, 8],
                           7: [9],
                           9: [9]
                       })
        graph.convert_to_numpy_backend()
        linear_interval = Interval(0, 3, [2, 4, 8, 9], graph)
        linear_interval = linear_interval.to_numpy_indexed_interval()

        peaks = PeakCollection([Peak(2, 2, [2, 3, 4]), Peak(1, 1, [3, 4, 5])])
        linear_peaks = peaks.to_approx_linear_peaks(linear_interval, "chr4")
        linear_peaks = linear_peaks.peaks
        print(linear_peaks)

        self.assertEqual(linear_peaks[0], NonGraphPeak("chr4", 2, 5))
        self.assertEqual(linear_peaks[1], NonGraphPeak("chr4", 3, 3))
from graph_peak_caller.peakcollection import PeakCollection

# Script: compute max paths for one chromosome, keep those that are at least
# `fragment_length` long, score them, and write them both as an interval
# collection and as fasta sequences.
# Arguments (read from sys.argv below): <chrom> <fragment_length>
chrom = sys.argv[1]
fragment_length = int(sys.argv[2])

ref = NumpyIndexedInterval.from_file("/data/bioinf/tair2/" + chrom + "_linear_pathv2.interval")


graph = Graph.from_file("/data/bioinf/tair2/" + chrom + ".nobg")
direct = SparseValues.from_sparse_files(chrom + "_direct_pileup")
filtered_peaks = SparseValues.from_sparse_files(chrom + "_hole_cleaned")
variant_map = load_variant_maps(chrom, "/data/bioinf/tair2/")

max_paths, sub_graphs = SparseMaxPaths(filtered_peaks, graph, direct, ref, variant_map).run()
long_maxpaths = [path for path in max_paths if path.length() >= fragment_length]

# Fixed: this loop iterated over `long_max_paths`, a name that is never
# defined (the list above is `long_maxpaths`), so it raised NameError.
for max_path in long_maxpaths:
    assert max_path.length() > 0, "Max path %s has negative length" % max_path
    # NOTE(review): `self` is not defined in the visible part of this script,
    # so this line fails as soon as there is at least one long max path. The
    # q-values object has to come from somewhere (e.g. loaded from sparse
    # files) -- TODO confirm the source and replace `self.q_values`.
    score = np.max(self.q_values.get_interval_values(max_path))
    max_path.set_score(score)
    assert not np.isnan(score), "Score %s is nan" % score


PeakCollection(long_maxpaths).to_file(chrom + "_max_paths.intervalcollection", text_file=True)

from graph_peak_caller.peakfasta import PeakFasta
from offsetbasedgraph import SequenceGraph
seqgraph = SequenceGraph.from_file("/data/bioinf/tair2/" + chrom + ".nobg.sequences")
PeakFasta(seqgraph).write_max_path_sequences(chrom + "_sequences.fasta", long_maxpaths)

        if touching:
            visited.add(touching[0].unique_id)
            mapping[touching[0].unique_id] = peak.unique_id

    return mapping


# Per chromosome: load the MACS peaks (converted to graph peaks) and the
# graph peaks, index both by node, and collect the third tab-separated column
# of every non-comment line of the two FIMO result files.
# NOTE(review): nothing in the visible code writes to or closes `out_file`;
# it is left open here on the assumption that later code uses it -- confirm.
out_file = open("motif_summary_graph_matching_macs.tsv", "w")
for chrom in ["1", "2", "3", "4", "5"]:
    logging.info("Chromosome %s" % chrom)
    path = NumpyIndexedInterval.from_file("/data/bioinf/tair2/" + chrom +
                                          "_linear_pathv2.interval")
    graph = Graph.from_file("/data/bioinf/tair2/" + chrom + ".nobg")
    macs_peaks = NonGraphPeakCollection.from_fasta("macs_sequences_chr" +
                                                   chrom + "_summits.fasta")
    macs_peaks = PeakCollection.create_from_nongraph_peak_collection(
        graph, macs_peaks, path)
    macs_peaks.create_node_index()
    graph_peaks = PeakCollection.from_fasta_file(chrom +
                                                 "_sequences_summits.fasta")
    graph_peaks.create_node_index()
    # The FIMO files are read inside context managers so each handle is
    # closed after use; they used to be opened inline and never closed.
    with open("fimo_macs_chr" + chrom + "/fimo.txt") as fimo_file:
        macs_motif_matches = {
            line.split("\t")[2]
            for line in fimo_file
            if not line.startswith("#")
        }
    with open("fimo_graph_chr" + chrom + "/fimo.txt") as fimo_file:
        graph_motif_matches = {
            line.split("\t")[2]
            for line in fimo_file
            if not line.startswith("#")
        }
from graph_peak_caller.analysis.nongraphpeaks import NonGraphPeakCollection
from graph_peak_caller.peakcollection import PeakCollection

import sys
from offsetbasedgraph import Graph, NumpyIndexedInterval


# For every chromosome in the comma-separated first argument: read the MACS
# peaks from the bed file, convert them to graph peaks along the linear path
# and write them to "<chrom>_macs_all_summits.intervalcollection".
for chrom in sys.argv[1].split(","):
    data_prefix = "/data/bioinf/tair2/" + chrom
    linear_path = NumpyIndexedInterval.from_file(
        data_prefix + "_linear_pathv2.interval")
    graph = Graph.from_file(data_prefix + ".nobg")
    print("Chrom " + chrom)

    bed_name = "macs_peaks_chr" + chrom + ".bed"
    peaks = NonGraphPeakCollection.from_bed_file(bed_name, 60)
    print(len(peaks.peaks))

    graph_peaks = PeakCollection.create_from_nongraph_peak_collection(
        graph, peaks, linear_path)
    out_name = chrom + "_macs_all_summits.intervalcollection"
    graph_peaks.to_file(out_name, text_file=True)
    print("Wrote to " + out_name)