Esempio n. 1
0
    def fill_small_wholes(self,
                          max_size,
                          write_holes_to_file=None,
                          touched_nodes=None):
        """Fill the holes reported by ``HolesCleaner`` and sanitize afterwards.

        :param max_size: passed to ``HolesCleaner``; every filled hole is
            asserted to span at most this many positions.
        :param write_holes_to_file: if not None, the filled holes are written
            to this file as a text ``IntervalCollection``.
        :param touched_nodes: passed to ``HolesCleaner``; when not None, holes
            on nodes outside this container are skipped.
        """
        areas = HolesCleaner(self, max_size, touched_nodes=touched_nodes).run()
        restrict_to_touched = touched_nodes is not None

        filled_holes = []
        for node_id in areas.areas:
            if restrict_to_touched and node_id not in touched_nodes:
                continue

            hole_bounds = zip(areas.get_starts(node_id),
                              areas.get_ends(node_id))
            for start, end in hole_bounds:
                # Mark the hole as covered on this node.
                self.data[node_id].set_interval_value(start, end, True)
                logging.debug("Filling hole %s, %d, %d" %
                              (node_id, start, end))
                assert end - start <= max_size
                filled_holes.append(Interval(start, end, [node_id]))

        logging.info("Filled %d small holes (splitted into holes per node)" %
                     len(filled_holes))

        if write_holes_to_file is not None:
            IntervalCollection(filled_holes).to_file(write_holes_to_file,
                                                     text_file=True)

        self.sanitize()
Esempio n. 2
0
def get_intersecting_intervals(args):
    """Write the intervals of file1 that intersect any interval of file2.

    Reads ``args.file1`` and ``args.file2`` as text interval collections on
    ``args.graph`` and writes every interval from the first file that
    intersects at least one interval from the second file to
    ``args.out_file_name``. Each matching interval is written once.
    """
    from offsetbasedgraph import IntervalCollection
    intervals1 = IntervalCollection.from_file(args.file1, text_file=True, graph=args.graph)
    # The second collection is scanned once per interval of the first one, so
    # it must be re-iterable. Other callers wrap ``from_file(...).intervals``
    # in list(), which suggests it may be a one-shot iterable.
    intervals2 = list(
        IntervalCollection.from_file(args.file2, text_file=True, graph=args.graph).intervals)

    out = []
    for interval1 in intervals1.intervals:
        for interval2 in intervals2:
            if interval1.intersects(interval2):
                out.append(interval1)
                logging.info("Found match between %s and %s" % (interval1, interval2))
                # One match is enough; stop so interval1 is not added again.
                break

    IntervalCollection(out).to_file(args.out_file_name, text_file=True)
    logging.info("Wrote intersecting intervals to %s" % args.out_file_name)
Esempio n. 3
0
def create_linear_peaks_from_bed(linear_sequence_fasta_file, peaks_bed_file,
                                 obg_graph_file_name, vg_graph_file_name,
                                 start_node, region):
    """Locate a linear path in the graph and convert BED peaks onto it.

    The full content of ``linear_sequence_fasta_file`` is used as the search
    sequence for a ``GraphTraverserUsingSequence`` started at ``start_node``.
    The interval found is written to "linear_path.intervalcollection", and
    the peaks created from ``peaks_bed_file`` (restricted by ``region.start``
    and ``region.end``) are written to "linear_peaks.intervalcollection".
    """

    ob_graph = obg.GraphWithReversals.from_file(obg_graph_file_name)
    # Read through a context manager so the file handle is always closed.
    with open(linear_sequence_fasta_file) as sequence_file:
        search_sequence = sequence_file.read()
    sequence_retriever = SequenceRetriever.from_vg_graph(vg_graph_file_name)
    traverser = GraphTraverserUsingSequence(ob_graph, search_sequence,
                                            sequence_retriever)
    traverser.search_from_node(start_node)
    linear_path_interval = traverser.get_interval_found()
    IntervalCollection([linear_path_interval
                        ]).to_file("linear_path.intervalcollection",
                                   text_file=True)
    print("Length")
    print(linear_path_interval.length())
    print(linear_path_interval.region_paths[0])
    print(linear_path_interval.start_position)
    print(linear_path_interval.end_position)

    linear_peaks = PeakCollection.create_from_linear_intervals_in_bed_file(
        obg_graph_file_name, linear_path_interval, peaks_bed_file,
        region.start, region.end)

    linear_peaks.to_file("linear_peaks.intervalcollection", text_file=True)
Esempio n. 4
0
def vg_path_to_obg_interval(path_file_name, out_file_name):
    """Convert every vg path in a JSON-lines file to an interval file.

    For each alignment read from ``path_file_name`` this writes:

    * "chr<name>_start_node.txt" holding the node id of the first mapping
    * one text interval collection holding the path as a single interval,
      named after ``out_file_name`` with "_<name>" inserted before the
      file extension.
    """
    # Local import: only the standard library is needed and the file's
    # top-level imports are not visible from here.
    import os

    json_objects = get_json_lines(path_file_name)

    alignments = (Alignment.from_json(json_object)
                  for json_object in json_objects)

    # splitext only splits on the extension of the last path component, so
    # dots in directory names ("./out.txt") or a missing extension no longer
    # produce a mangled output name.
    base_name, extension = os.path.splitext(out_file_name)

    for alignment in alignments:
        path = alignment.path
        interval = path.to_obg()
        chrom = path.name
        start_node = path.mappings[0].node_id()

        logging.info("Processing chromosome %s with start node %d" %
                     (chrom, start_node))

        with open("chr%s_start_node.txt" % chrom, "w") as f:
            f.write(str(start_node))

        file_name = base_name + "_" + chrom + extension
        IntervalCollection([interval]).to_file(file_name, text_file=True)
        logging.info("Number of files in interval for chrom %s: %d" %
                     (chrom, len(interval.region_paths)))
        logging.info("Wrote path as obg interval to %s" % file_name)
Esempio n. 5
0
    def call_peaks(self):
        """Run the peak-calling steps on the simulated data.

        Builds the experiment info and a linear snarl map (stored in
        "simulated_snarl_map.tmp"), creates a ``CallPeaks`` instance that
        uses the precomputed sample pileup, runs its steps in order, and
        keeps the caller on ``self.caller``.
        """
        snarl_map_file = "simulated_snarl_map.tmp"

        # Genome size is the total length of all blocks in the graph.
        genome_size = 0
        for block in self.graph.blocks.values():
            genome_size += block.length()

        info = ExperimentInfo(genome_size, 50, 20)
        info.n_sample_reads = self.n_sample_reads
        info.n_control_reads = self.n_control_reads

        builder = SnarlGraphBuilder(
            self.graph.copy(),
            self.snarls,
            id_counter=self.graph.max_block_id() + 1)
        snarl_graph = builder.build_snarl_graphs()
        LinearSnarlMap(snarl_graph, self.graph).to_file(snarl_map_file)

        control = IntervalCollection(self.control_reads)
        peak_caller = CallPeaks(self.graph,
                                sample_intervals="dummy",
                                control_intervals=control,
                                experiment_info=info,
                                has_control=self.with_control,
                                linear_map=snarl_map_file)

        # The sample pileup is injected directly; sample_intervals above is
        # only a placeholder value.
        peak_caller._sample_pileup = self.sample_pileup
        self.sample_pileup.to_bed_graph("sample.bdg")

        peak_caller.create_control(True)
        peak_caller.scale_tracks(True)
        peak_caller.get_score()
        peak_caller.call_peaks()

        retriever = DummySequenceRetriever()
        peak_caller.save_max_path_sequences_to_fasta_file(
            "simulated_peak_sequences.fasta", retriever)
        self.caller = peak_caller
 def assert_final_peaks_equals_input_peaks(self):
     """Check that the written max paths are exactly the input peaks.

     Every peak in ``self.peaks`` must be present in
     "test_max_paths.intervalcollection", and both must have the same size.
     """
     written = IntervalCollection.create_list_from_file(
         "test_max_paths.intervalcollection")
     for expected_peak in self.peaks:
         is_present = expected_peak in written.intervals
         failure_message = "Peak %s not in final peaks. Final peaks: \n%s" % (
             expected_peak, written.intervals)
         self.assertTrue(is_present, failure_message)
     self.assertEqual(len(self.peaks), len(written.intervals))
Esempio n. 7
0
    def from_file(cls, file_name, graph):
        """Build an instance from a pickled node dict and its interval file.

        :param file_name: path of the pickle file; the intervals are read
            from ``file_name + ".intervals"``.
        :param graph: graph passed to the interval reader and the constructor.
        :return: ``cls(node_dict, graph, intervals_as_list)``
        """
        logging.info("Reading from file")
        logging.info("Reading dict structure")
        # NOTE: unpickling can execute arbitrary code; only use trusted files.
        with open(file_name, "rb") as pickled:
            node_dict = pickle.load(pickled)

        logging.info("Reading intervals")
        interval_file_name = file_name + ".intervals"
        collection = IntervalCollection.from_file(interval_file_name,
                                                  graph=graph)
        return cls(node_dict, graph, list(collection))
Esempio n. 8
0
 def _read_alignments(self):
     """Set ``self.alignments`` based on the alignment file's extension.

     * ".json": intervals converted from a vg JSON file
     * ".graphnodes": one ``Interval(0, 1, nodes)`` per line, where the
       nodes are the comma-separated integers in the second
       whitespace-separated field
     * ".graphalignments": one interval per line, parsed from the second
       tab-separated field; lines where that field is "." are skipped
     * anything else: intervals read with ``IntervalCollection.from_file``

     For the two line-based formats the result is a lazy generator.
     """
     file_name = self.alignment_file_name

     if file_name.endswith(".json"):
         self.alignments = vg_json_file_to_interval_collection(file_name).intervals
         return

     if file_name.endswith(".graphnodes"):
         self.alignments = (
             Interval(0, 1, [int(node) for node in line.strip().split()[1].split(",")])
             for line in open(file_name))
         return

     if file_name.endswith(".graphalignments"):
         # The inner generator extracts the alignment field once per line.
         self.alignments = (
             Interval.from_file_line(field)
             for field in (line.strip().split("\t")[1] for line in open(file_name))
             if field != ".")
         return

     self.alignments = IntervalCollection.from_file(file_name).intervals
Esempio n. 9
0
 def test_count_unique_reads(self):
     """Four reads in one collection are expected to count as three unique."""
     read_intervals = [
         Interval(4, 10, [1, 2, 3]),
         Interval(4, 5, [1]),
         Interval(5, 5, [1]),
         Interval(6, 2, [-3, -2, -1]),
     ]
     collections = [IntervalCollection(read_intervals)]
     n_unique = MultipleGraphsCallpeaks.count_number_of_unique_reads(
         collections)
     self.assertEqual(n_unique, 3)
def run_predict_path(args):
    """Predict a path per chromosome in parallel and merge the results.

    Steps:

    1. Exit with a non-zero status if ``args.alignments`` is not a file.
    2. Start one process per chromosome in the comma-separated
       ``args.chromosomes`` running ``run_predict_path_single_chromosome``,
       and wait for all of them.
    3. Concatenate the per-chromosome "<out_file_name>_<chrom>.fasta" files
       into "<out_file_name>.fa".
    4. For each chromosome, read its single interval from
       "<out_file_name>_<chrom>.intervalcollection" and write it as a numpy
       indexed interval with the suffix ".indexed".
    5. Run ``run_bwa_index`` on the merged fasta unless
       ``args.skip_bwa_index`` is set.
    """
    chromosomes = args.chromosomes.split(",")
    processes = []
    if not os.path.isfile(args.alignments):
        logging.error("Input alignments file %s does not exist" %
                      args.alignments)
        # Signal the failure to the calling shell; a bare sys.exit() would
        # report success.
        sys.exit(1)

    for chromosome in chromosomes:
        logging.info("Starting process for chromosome %s " % chromosome)
        process = Process(target=run_predict_path_single_chromosome,
                          args=(args.alignments, chromosome, args.data_dir,
                                args.linear_ref_bonus, args.out_file_name,
                                args.max_nodes_to_traverse))
        process.start()
        processes.append(process)

    for process in processes:
        process.join()

    # Merge all fasta files that were produced. The merged file is closed
    # (and so flushed) before bwa index reads it further down.
    merged_fasta_name = args.out_file_name + ".fa"
    with open(merged_fasta_name, "w") as out_fasta:
        logging.info("Merging fasta files")
        for chromosome in tqdm(chromosomes):
            with open(args.out_file_name + "_" + chromosome + ".fasta") as f:
                out_fasta.write(f.read())

    logging.info("Wrote resulting linear reference to %s" %
                 merged_fasta_name)

    # Create indexed intervals for each interval file that was produced
    logging.info("Creating indexed interval for all chromosomes")
    for chromosome in chromosomes:
        file_name = args.out_file_name + "_" + chromosome + ".intervalcollection"
        graph = Graph.from_file(args.data_dir + chromosome + ".nobg")
        intervals = IntervalCollection.from_file(file_name,
                                                 text_file=True,
                                                 graph=graph)
        intervals = list(intervals.intervals)
        assert len(
            intervals) == 1, "Only a single interval in file is supported"
        interval = intervals[0]
        indexed = interval.to_numpy_indexed_interval()
        indexed.to_file(file_name + ".indexed")
        logging.info("Wrote indexed interval to file %s" %
                     (file_name + ".indexed"))

    if not args.skip_bwa_index:
        logging.info("Running bwa index")
        run_bwa_index(merged_fasta_name)
    else:
        logging.info("Not creating bwa index")
Esempio n. 11
0
    def test_filter_duplicates(self):
        """``UniqueIntervals`` drops the repeated interval and keeps order."""
        first = Interval(0, 10, [1, 2, 3])
        second = Interval(1, 10, [1, 2, 3])
        duplicate_of_first = Interval(0, 10, [1, 2, 3])
        intervals = [first, second, duplicate_of_first]

        unique = list(UniqueIntervals(IntervalCollection(intervals)))

        self.assertEqual(len(unique), len(intervals) - 1)
        self.assertEqual(unique[0], intervals[0])
        self.assertEqual(unique[1], intervals[1])
Esempio n. 12
0
    def check_similarity(self, analyse_first_n_peaks=10000000):
        """Match the peaks of set 1 against set 2 and record the outcome.

        The peaks of ``self.peaks1`` are visited by descending score (at most
        ``analyse_first_n_peaks`` of them). Each is matched against
        ``self.peaks2`` via ``approx_contains_part_of_interval``, and the
        first touching peak is marked as visited. Matched and unmatched peaks
        are appended to the ``peaks*_in_*`` / ``peaks*_not_in_*`` lists, the
        counts are stored on ``self.results``, and the unmatched peaks of
        both sets are written to text interval files.
        """
        print("Number of peaks in main set: %d" % len(self.peaks1.intervals))
        self.results.tot_peaks1 = len(self.peaks1.intervals)
        self.results.tot_peaks2 = len(self.peaks2.intervals)

        matched_ids = set()
        by_score = sorted(self.peaks1, key=lambda p: p.score, reverse=True)
        n_checked = 0
        for peak in by_score[0:analyse_first_n_peaks]:
            assert peak.unique_id is not None
            n_checked += 1
            if n_checked % 500 == 0:
                logging.info("Checked %d peaks" % n_checked)

            touching = self.peaks2.approx_contains_part_of_interval(
                peak, matched_ids)
            if not touching:
                self.peaks1_not_in_peaks2.append(peak)
                continue

            partner = touching[0]
            matched_ids.add(partner.unique_id)
            self.peaks2_in_peaks1.append(partner)
            self.peaks1_in_peaks2.append(peak)

        # Every peak in set 2 that was never chosen as a partner is unmatched.
        for peak in self.peaks2:
            if peak.unique_id not in matched_ids:
                self.peaks2_not_in_peaks1.append(peak)

        results = self.results
        results.peaks1_in_peaks2 = len(self.peaks1_in_peaks2)
        results.peaks2_in_peaks1 = len(self.peaks2_in_peaks1)
        results.peaks1_not_in_peaks2 = len(self.peaks1_not_in_peaks2)
        results.peaks2_not_in_peaks1 = len(self.peaks2_not_in_peaks1)

        chromosome = "unknown" if self.chromosome is None else self.chromosome

        IntervalCollection(self.peaks1_not_in_peaks2).to_file(
            "gpc_not_matching_macs_chr%s.intervals" % chromosome,
            text_file=True)
        logging.info(
            "Wrote peaks not matching to file gpc_not_matching_macs_chr%s.intervals"
            % chromosome)

        IntervalCollection(self.peaks2_not_in_peaks1).to_file(
            "macs_not_matching_gpc_chr%s.intervals" % chromosome,
            text_file=True)
        logging.info(
            "Wrote peaks not matching to file macs_not_matching_gpc_chr%s.intervals"
            % chromosome)
Esempio n. 13
0
 def __init__(self, graph, sequence_retriever, linear_path_file_name,
              peaks1_file_name, peaks2_file_name):
     """Load the two peak sets and, optionally, the linear path.

     :param graph: graph used when reading the peak and path files.
     :param sequence_retriever: stored as ``self.sequence_retriever``.
     :param linear_path_file_name: file whose first interval becomes
         ``self.linear_path``; may be None, in which case
         ``self.linear_path`` is None.
     :param peaks1_file_name: file with the first peak set.
     :param peaks2_file_name: file with the second peak set.
     """
     self.graph = graph
     self.sequence_retriever = sequence_retriever
     self.peaks1 = PeakCollection.create_list_from_file(peaks1_file_name,
                                                        graph=graph)
     self.peaks2 = PeakCollection.create_list_from_file(peaks2_file_name,
                                                        graph=graph)
     print("Number of intervals in set 1/2: %d / %d" %
           (len(self.peaks1.intervals), len(self.peaks2.intervals)))
     # Always define the attribute so later reads do not raise
     # AttributeError when no linear path file was given.
     self.linear_path = None
     if linear_path_file_name is not None:
         self.linear_path = IntervalCollection.create_list_from_file(
             linear_path_file_name, self.graph).intervals[0]
Esempio n. 14
0
def find_linear_path_through_chromosome(chromosome, chromend, fasta_file_name,
                                        ob_graph_file_name,
                                        vg_graph_file_name):
    """Find the path through the graph that spells a chromosome's sequence.

    The lower-cased sequence of ``chromosome`` from ``fasta_file_name`` is
    searched for from the graph's single start node, and the interval found
    is written to "22_path.intervalcollection" as a text interval collection.

    NOTE(review): ``chromend`` is not used; the sequence end (50818468) and
    the output file name are hard-coded. Confirm whether ``chromend`` was
    meant to replace the constant.
    """
    genome = Fasta(fasta_file_name)
    seq = str(genome[chromosome][0:50818468]).lower()

    logging.info("Creating sequence retriever")
    sequence_retriever = SequenceRetriever.from_vg_json_graph(
        vg_graph_file_name)

    graph = GraphWithReversals.from_numpy_file(ob_graph_file_name)

    start_nodes = graph.get_first_blocks()
    # Format the count, not the list itself: "%d" % <list> raises TypeError
    # and would hide the assertion message.
    assert len(start_nodes) == 1, "Found %d start nodes" % len(start_nodes)
    start_node = start_nodes[0]

    traverser = GraphTraverserUsingSequence(graph, seq, sequence_retriever)
    traverser.search_from_node(start_node)
    path = traverser.get_interval_found()
    # IntervalCollection is given a list of intervals everywhere else in this
    # file, so wrap the single interval.
    path = IntervalCollection([path])
    path.to_file("22_path.intervalcollection", text_file=True)
    logging.info("Done")
Esempio n. 15
0
    def test_complex_graph(self):
        """The maximum interval through ``self.complex_graph`` is as expected."""
        reads = [
            Interval(0, 3, [1, 3, 4, 6, 10]),
            Interval(1, 2, [2]),
            Interval(2, 3, [2]),
            Interval(0, 3, [7, 9]),
        ]
        typer = HaploTyper(self.complex_graph, IntervalCollection(reads))
        typer.build()

        expected = Interval(0, 3, [1, 2, 7, 9, 10, 12])
        actual = typer.get_maximum_interval_through_graph()
        self.assertEqual(actual, expected)
    def test_all_steps(self):
        """Run graph creation, linear map creation and peak calling in turn."""
        graph_file = "tests/testgraph.obg"
        sample_file = "tests/sample.intervalcollection"

        create_graph_command = [
            "create_ob_graph", "-o", graph_file, "tests/vg_test_graph.json"
        ]
        run_argument_parser(create_graph_command)

        create_map_command = ['create_linear_map', "--graph", graph_file]
        run_argument_parser(create_map_command)

        # A single sample read is enough input for the callpeaks command.
        sample_reads = IntervalCollection([Interval(1, 1, [1, 2])])
        sample_reads.to_file(sample_file)

        callpeaks_command = [
            "callpeaks", "--graph", graph_file, "-s", sample_file,
            "-n", "tests/test_experiment_", "-f", "10", "-r", "7"
        ]
        run_argument_parser(callpeaks_command)
Esempio n. 17
0
def analyse_pileups_on_peaks(ob_graph, pileups_file_names,
                             peak_intervals_file_name):
    """Print, for every peak, the summed value of each pileup on its nodes.

    :param ob_graph: graph the pileups are read onto.
    :param pileups_file_names: dict mapping a display name to a bed graph
        file name.
    :param peak_intervals_file_name: text interval collection with the peaks.
    """
    print("Analysing peaks")
    pileups = {
        name: SparsePileup.from_bed_graph(ob_graph, pileup)
        for name, pileup in pileups_file_names.items()
    }
    peaks = IntervalCollection.from_file(peak_intervals_file_name,
                                         text_file=True)

    for peak in peaks:
        print()
        print("Peak %s" % peak)
        # The unused lookup of the first region path was removed: it raised
        # IndexError for a peak without region paths.
        for name, pileup in pileups.items():
            pileup_sum = sum(pileup.data[rp].sum() for rp in peak.region_paths)
            print("Pileup %s: %d" % (name, pileup_sum))
Esempio n. 18
0
def make_haplotype_fasta(chromosome, haplotype, data_dir):
    """Write the sequence of a haplotype interval to a fasta file.

    Reads the sequence graph "giab_chr<chromosome>.nobg.sequences" and the
    first interval of "haplotype_<chromosome>__<haplotype>.intervalcollection"
    from ``data_dir``, then writes the interval's sequence under the header
    ">seq" to "giab_chr<chromosome>_haplotype<haplotype>.fasta" in the same
    directory. ``data_dir`` is prepended as-is, so it must end with a path
    separator.
    """

    s = SequenceGraph.from_file(data_dir + "giab_chr" + chromosome +
                                ".nobg.sequences")
    print("Getting interval")
    interval = list(
        IntervalCollection.from_file(data_dir + "haplotype_" + chromosome +
                                     "__" + haplotype + ".intervalcollection",
                                     text_file=True).intervals)[0]

    print("Getting sequence")
    sequence = s.get_interval_sequence(interval)
    print("Writing to file")
    fasta_file_name = (data_dir + "giab_chr" + chromosome + "_haplotype" +
                       haplotype + ".fasta")
    # The context manager closes the file exactly once, also on error.
    with open(fasta_file_name, "w") as f:
        f.write(">seq\n%s\n" % sequence)
Esempio n. 19
0
    def test_simple(self):
        """A single read through node 3 gives the max interval 1 -> 3 -> 4."""
        blocks = {}
        for node_id in range(1, 5):
            blocks[node_id] = Block(3)
        edges = {1: [2, 3], 2: [4], 3: [4]}

        graph = Graph(blocks, edges)
        graph.convert_to_numpy_backend()

        reads = IntervalCollection([Interval(0, 3, [1, 3])])
        typer = HaploTyper(graph, reads)
        typer.build()

        expected = Interval(0, 3, [1, 3, 4])
        actual = typer.get_maximum_interval_through_graph()
        self.assertEqual(actual, expected)
Esempio n. 20
0
def vg_alignments_to_linear():
    """Project graph reads onto the linear path and write them as BED.

    Reads intervals from "graph_reads_on_linear2.intervals", reverses reads
    whose first region path is negative, translates start and end positions
    to offsets on the indexed linear path (shifted by ``MHC_REGION.start``),
    and writes one "chr6" line per read to "graph_reads_on_linear.bed".
    """
    ob_graph = obg.GraphWithReversals.from_file("haplo1kg50-mhc.obg")
    vg_graph = pyvg.vg.Graph.create_from_file("haplo1kg50-mhc.json")
    path = create_linear_path(ob_graph, vg_graph)
    # The analyser is not used below but is still constructed, in case its
    # constructor has side effects. Presumably the intervals file read below
    # was produced earlier from analyser.to_linear_alignments() - TODO confirm.
    analyser = AlignmentsAnalyser(
        vg_graph, "ENCFF001HNI_haplo1kg50-mhc_filtered_q50.gam", ob_graph,
        path)  # sample reads

    linear = IntervalCollection.from_file(
        "graph_reads_on_linear2.intervals").intervals
    # The context manager closes the BED file even if a read fails.
    with open("graph_reads_on_linear.bed", "w") as f:
        path = path.to_indexed_interval()
        for read in linear:
            read.graph = ob_graph
            # A read must lie entirely on forward or entirely on reverse nodes.
            assert np.all(np.array(read.region_paths) > 0) or np.all(
                np.array(read.region_paths) < 0)

            strand = "+"
            if read.region_paths[0] < 0:
                strand = "-"
                read = read.get_reverse()

            graph_start = read.start_position
            graph_end = read.end_position

            linear_start = MHC_REGION.start + path.get_offset_at_position(
                graph_start)
            linear_end = MHC_REGION.start + path.get_offset_at_position(
                graph_end)

            f.write("chr6\t%d\t%d\t.\t0\t%s\n" %
                    (linear_start, linear_end, strand))
Esempio n. 21
0
    def to_file(self, file_name):
        """Pickle the node dict to ``file_name`` and write the intervals.

        The intervals go to ``file_name + ".intervals"`` as an
        ``IntervalCollection``.
        """
        logging.info("Writing to file")
        with open(file_name, "wb") as pickle_file:
            pickle.dump(self._node_dict, pickle_file)

        interval_file_name = file_name + ".intervals"
        collection = IntervalCollection(self.intervals)
        collection.to_file(interval_file_name)
Esempio n. 22
0
def make_haplotype_paths(graph_file_name, linear_ref_path_file_name,
                         haplotype0_file_name, haplotype1_file_name,
                         out_base_name, chromosome):
    """Write linear reference and haplotype paths as fasta and interval files.

    Outputs:

    * "linear_ref_<chromosome>.fasta" with the sequence of the first interval
      in ``linear_ref_path_file_name``
    * for each haplotype h in (0, 1): "<out_base_name>_<h>.intervalcollection"
      and "<out_base_name>_<h>.fasta" with the path found by walking the
      graph from its single first block to a node without outgoing edges.

    At every step the walk prefers a next node from the haplotype node set.
    With several candidates it takes the lowest id, and with none it follows
    the lowest-id next node on the linear reference.
    """
    # Make a linear reference fasta and interval and haplotypes fasta and intervals

    chrom = chromosome
    graph = Graph.from_file(graph_file_name)
    sequence_graph = SequenceGraph.from_file(graph_file_name + ".sequences")

    linear_ref = IntervalCollection.from_file(linear_ref_path_file_name,
                                              text_file=True)
    linear_ref = list(linear_ref.intervals)[0]
    linear_ref_nodes = set(linear_ref.region_paths)

    # Write linear ref fasta to file (closed even if a write fails)
    linear_ref_seq = sequence_graph.get_interval_sequence(linear_ref)
    with open("linear_ref_" + chrom + ".fasta", "w") as out_file:
        out_file.write(">%s\n" % chrom)
        out_file.write(linear_ref_seq + "\n")
    logging.info("Wrote linear ref sequence. N nodes in linear ref: %d" %
                 len(linear_ref_nodes))

    haplotype_nodes = [set(), set()]  # For haplotype 0 and 1
    for haplotype in [0, 1]:
        haplotype_file_name = haplotype0_file_name
        if haplotype == 1:
            haplotype_file_name = haplotype1_file_name

        intervals = vg_json_file_to_intervals(haplotype_file_name, graph)

        for interval in intervals:
            for node in interval.region_paths:
                haplotype_nodes[haplotype].add(node)

    logging.info("N nodes in haplotype 0: %d" % len(haplotype_nodes[0]))
    logging.info("N nodes in haplotype 0 that are also in linear ref: %d" %
                 len(haplotype_nodes[0].intersection(linear_ref_nodes)))
    logging.info("N nodes in haplotype 1: %d" % len(haplotype_nodes[1]))

    # Traverse graph to get full correct haplotype intervals
    first_nodes = graph.get_first_blocks()
    assert len(first_nodes) == 1
    logging.info("N nodes in graph: %d" % len(graph.blocks))

    for haplotype in [0, 1]:
        logging.info("Traversing haplotype %d" % haplotype)

        nodes = []
        node = first_nodes[0]
        nodes_in_haplotype = haplotype_nodes[haplotype]
        # NOTE(review): the haplotype node set read from the json files is
        # immediately replaced by "all ids below the highest linear ref id
        # that are not on the linear ref", so both haplotypes are traversed
        # with the same node set. This is kept as-is to preserve behaviour;
        # confirm whether the override is intentional.
        nodes_in_haplotype = set(range(
            0, max(linear_ref_nodes))).difference(linear_ref_nodes)
        logging.info("There are %d haplotype nodes" % len(nodes_in_haplotype))

        assert len(
            nodes_in_haplotype
        ) > 0, "There are no haplotype nodes. Check that haplotype json files are not empty"

        n_haplotype_nodes = 0
        i = 0
        while True:

            nodes.append(node)
            if i % 50000 == 0:
                logging.info("#%d nodes traversed. On node %d" % (i, node))
            i += 1

            next_nodes = set(graph.adj_list[node])

            if len(next_nodes) == 0:
                logging.info("Reached end node %d with 0 edges" % node)
                break

            next_on_haplotype = next_nodes.intersection(nodes_in_haplotype)
            if len(next_on_haplotype) == 1:
                n_haplotype_nodes += 1
                next_node = list(next_on_haplotype)[0]
                assert next_node != node
                node = next_node
            elif len(next_on_haplotype) == 0:
                logging.debug(
                    "No new haplotype node from %d. Will follow reference" %
                    node)
                # Choose reference with lowest id to avoid deletion
                node = min(next_nodes.intersection(linear_ref_nodes))
            else:
                # More than one next node is on haplotype. Choose the one
                # with lowest id to avoid taking deletion
                node = min(next_on_haplotype)

        logging.info("Found %d nodes. %d on haplotype" %
                     (len(nodes), n_haplotype_nodes))
        haplotype_interval = Interval(0, graph.blocks[nodes[-1]].length(),
                                      nodes, graph)
        print("Path length: %d" % haplotype_interval.length())

        file_base_name = out_base_name + "_" + str(haplotype)
        IntervalCollection([haplotype_interval
                            ]).to_file(file_base_name + ".intervalcollection",
                                       text_file=True)

        sequence = sequence_graph.get_interval_sequence(haplotype_interval)
        with open(file_base_name + ".fasta", "w") as out_file:
            out_file.write(">%s\n" % chrom)
            out_file.write(sequence + "\n")
        logging.info("Wrote fasta sequence to %s" %
                     (file_base_name + ".fasta"))
Esempio n. 23
0
    def check_similarity_old(self, analyse_first_n_peaks=10000000):
        """Compare set 1 against set 2 and then set 2 against set 1.

        In each pass the peaks of the main set are visited by descending
        score (at most ``analyse_first_n_peaks``) and matched against the
        other set with ``approx_contains_part_of_interval``. Matching and
        non-matching peaks are collected on ``self``, counted on
        ``self.results``, and written to "matching_set<i>.intervals" and
        "not_matching_set<i>.intervals".
        """
        passes = [(self.peaks1, self.peaks2), (self.peaks2, self.peaks1)]
        for i, (peaks1, peaks2) in enumerate(passes, start=1):
            is_first_pass = i == 1
            print("\n-- Comparing set %d against set %d ---" % (i, i % 2 + 1))
            print("Number of peaks in main set: %d" % len(peaks1.intervals))
            if is_first_pass:
                self.results.tot_peaks1 = len(peaks1.intervals)
            else:
                self.results.tot_peaks2 = len(peaks1.intervals)

            unmatched = []
            matched = []
            n_checked = 0
            visited = set()
            ranked = sorted(peaks1, key=lambda p: p.score, reverse=True)
            for peak in ranked[0:analyse_first_n_peaks]:
                assert peak.unique_id is not None
                n_checked += 1
                if n_checked % 500 == 0:
                    logging.info("Checked %d peaks" % n_checked)

                touching = peaks2.approx_contains_part_of_interval(
                    peak, visited)
                if touching:
                    visited.add(touching[0].unique_id)
                    if is_first_pass:
                        self.peaks1_in_peaks2.append(peak)
                    else:
                        self.peaks2_in_peaks1.append(peak)
                    matched.append(peak)
                else:
                    unmatched.append(peak)
                    if is_first_pass:
                        self.peaks1_not_in_peaks2.append(peak)
                    else:
                        self.peaks2_not_in_peaks1.append(peak)

            results = self.results
            results.peaks1_in_peaks2 = len(self.peaks1_in_peaks2)
            results.peaks2_in_peaks1 = len(self.peaks2_in_peaks1)
            results.peaks1_not_in_peaks2 = len(self.peaks1_not_in_peaks2)
            results.peaks2_not_in_peaks1 = len(self.peaks2_not_in_peaks1)

            unmatched_collection = IntervalCollection(unmatched)
            unmatched_collection.to_file("not_matching_set%d.intervals" % i,
                                         text_file=True)
            logging.info(
                "Wrote peaks not matching to file not_matching_set%d.intervals"
                % i)
            IntervalCollection(matched).to_file(
                "matching_set%d.intervals" % i, text_file=True)

            # Every checked peak is either matched or unmatched.
            logging.info("Total peaks in main set: %d" % n_checked)
            logging.info("N similar to peak in other set: %d " % len(matched))
            logging.info("N not matching other set: %d " %
                         len(unmatched_collection.intervals))
Esempio n. 24
0
 def do_asserts(self):
     """Assert that every expected peak is in the chromosome's max paths.

     The max paths of chromosome number ``i`` are read from
     "multigraphs_<chromosome>_max_paths.intervalcollection" and compared
     with ``self.peaks[i]``.
     """
     for index, chromosome in enumerate(self.chromosomes):
         max_paths_file = ("multigraphs_" + chromosome +
                           "_max_paths.intervalcollection")
         found_peaks = IntervalCollection.create_list_from_file(
             max_paths_file)
         expected_peaks = self.peaks[index]
         for expected in expected_peaks:
             assert expected in found_peaks
Esempio n. 25
0
 def _create_reads(self, *args):
     """Create the reads via the parent class and write them per chromosome.

     Each entry of ``self.sample_reads`` is written as a text interval
     collection named "test_sample_<chrom>.intervalcollection".
     """
     parent = super(TestMultipleGraphsCallPeaksCommandLine, self)
     parent._create_reads(*args)

     for reads, chromosome in zip(self.sample_reads, self.chromosomes):
         collection = IntervalCollection(reads._intervals)
         out_name = "test_sample_" + chromosome + ".intervalcollection"
         collection.to_file(out_name, text_file=True)
Esempio n. 26
0
    def predict_path(self):
        """Walk the graph from its first block and write the chosen path.

        At a node with several outgoing edges, each candidate's score is its
        read count from ``self.edge_counts`` (keyed "<from>-<to>") plus
        ``self.linear_ref_bonus`` when the candidate is on the linear
        reference. The highest score wins. A candidate on the linear
        reference also wins a tie, except that once a linear reference
        candidate has been picked, a tying candidate with a higher node id
        does not replace it.

        The walk ends at a node without outgoing edges, or after
        ``self.max_nodes_to_traverse`` steps when that is set. The path is
        written to "<out_file_base_name>_<chromosome>.intervalcollection" and
        its sequence to "<out_file_base_name>_<chromosome>.fasta".

        :raises Exception: if, with no supporting reads, a node outside the
            linear reference was chosen although a linear reference
            candidate existed.
        """
        logging.info("Using linear bonus %d on chromosome %s" % (self.linear_ref_bonus, self.chromosome))

        logging.info("Using linear out base name %s" % self.out_file_base_name)
        # The fasta file is still opened (and truncated) before traversal,
        # but through a context manager so the handle is closed on error too.
        with open("%s_%s.fasta" % (self.out_file_base_name, self.chromosome), "w") as out_file:

            # Traverse
            first_nodes = self.graph.get_first_blocks()
            assert len(first_nodes) == 1

            logging.info("N nodes in graph: %d" % len(self.graph.blocks))

            node = first_nodes[0]
            assert node in self.linear_path_nodes, "Start node should be in linear ref"

            path = []
            n_ambigious = 0
            edges_chosen = set()
            i = 0
            n_special_case = 0
            while True:
                if i % 1000000 == 0:
                    logging.info("%d nodes in graph traversed on chrom %s" % (i, self.chromosome))
                i += 1

                if self.max_nodes_to_traverse is not None and i > self.max_nodes_to_traverse:
                    logging.warning("Stopped traversing before end because max node to traverse was set")
                    break

                path.append(node)

                next_nodes = self.graph.adj_list[node]
                if len(next_nodes) == 0:
                    logging.info("Done on node %d" % node)
                    break
                elif len(next_nodes) == 1:
                    node = next_nodes[0]
                else:
                    most_reads = 0
                    most_reads_node = next_nodes[0]
                    has_found_candidate_on_linear_ref = False

                    for next_node in next_nodes:
                        n_reads = self.edge_counts["%s-%s" % (node, next_node)]
                        if next_node in self.linear_path_nodes:
                            n_reads += self.linear_ref_bonus

                        if n_reads > most_reads or (n_reads >= most_reads and next_node in self.linear_path_nodes):
                            if node not in self.linear_path_nodes:
                                n_special_case += 1

                            # If already found something on linear ref, and this does not have more reads or lower id (not insertion), ignore
                            if has_found_candidate_on_linear_ref and n_reads == most_reads and next_node > most_reads_node:
                                continue  # Ignore this alternative

                            most_reads_node = next_node
                            most_reads = n_reads

                            if next_node in self.linear_path_nodes:
                                has_found_candidate_on_linear_ref = True

                    if most_reads == 0:
                        n_ambigious += 1

                    assert most_reads_node is not None

                    edges_chosen.add("%d-%d" % (node, most_reads_node))
                    node = most_reads_node

                    if most_reads == 0:
                        # Assert we have taken linear ref path if exists
                        if any([n in self.linear_path_nodes for n in next_nodes]):
                            if node not in self.linear_path_nodes:
                                logging.error("Chose node %d as next, but it is not in linear ref." % node)
                                logging.error("Next nodes are: %s" % next_nodes)

                                for next_node in next_nodes:
                                    if next_node in self.linear_path_nodes:
                                        logging.error("    Node %d is in linear ref" % next_node)
                                    else:
                                        logging.error("    Node %d is not in linear ref" % next_node)

                                raise Exception("Could not traverse correctly")

            # Find statistics of chosen nodes
            nodes_chosen = set(path)
            n_on_linear = len(nodes_chosen.intersection(self.linear_path_nodes))
            n_not_on_linear = len(nodes_chosen) - n_on_linear

            linear_ref_interval = Interval(0, self.graph.blocks[path[-1]].length(), path, self.graph)
            IntervalCollection([linear_ref_interval]).to_file("%s_%s.intervalcollection" % (self.out_file_base_name, self.chromosome),
                                                              text_file=True)

            logging.info("=== STATS FOR CHROMOSOME %s ===" % self.chromosome)
            logging.info("N ambigious choices: %d" % n_ambigious)
            logging.info("Total nodes in linear ref: %d" % len(self.linear_path_nodes))
            logging.info("N nodes chosen that are not in linear ref: %d " % n_not_on_linear)
            logging.info("N nodes chosen that are in linear ref: %d " % n_on_linear)
            logging.info("N special case: %d" % n_special_case)
            logging.info("N nodes in path: %d" % len(path))
            logging.info("Linear path length: %d" % linear_ref_interval.length())

            sequence = self.sequence_graph.get_interval_sequence(linear_ref_interval)

            out_file.write(">%s\n" % self.chromosome)
            out_file.write(sequence + "\n")