def __init__(self, graph_name="graph.obg"):
    """Run the whole simulated peak-calling experiment on construction.

    Loads the graph ``self.data_folder + graph_name``, builds a read set
    with ``self.generate_read_set(135, 36)``, adds 2000 simulated noise
    intervals, maps the reads via ``self.map()``, calls peaks on
    ``mapped_reads.gam`` and finally prints how many of the called peaks
    pass ``self.check_peak``.

    ``data_folder``, ``summits``, ``generate_read_set``, ``map`` and
    ``check_peak`` are provided elsewhere on the class.
    """
    self.graph = obg.GraphWithReversals.from_file(
        self.data_folder + graph_name)

    # Read set from the class's own generator, padded with random noise.
    read_collection = obg.IntervalCollection(
        list(self.generate_read_set(135, 36)))
    simulator = IntervalSimulator(self.graph, 36)
    read_collection.intervals += [
        simulator.generate_interval() for _ in range(2000)]
    print(len(read_collection.intervals))
    for read in read_collection:
        read.graph = self.graph

    info = ExperimentInfo(None, 135, 36)
    self.intervals = read_collection.intervals
    # NOTE(review): presumably self.map() is what produces
    # "mapped_reads.gam", which is read back below -- confirm.
    self.map()

    def load_mapped_reads():
        # The GAM file is parsed once per use (sample and control), so
        # each argument gets its own collection object.
        return vg_gam_file_to_interval_collection(
            None, "mapped_reads.gam", self.graph)

    sample_reads = load_mapped_reads()
    control_reads = load_mapped_reads()
    callpeaks = CallPeaks(
        self.graph,
        sample_reads,
        control_reads,
        info,
        has_control=False,
        out_file_base_name="simulated_",
        linear_map="../tests/haplo1kg50-mhc.lm")
    callpeaks.run()

    peaks = callpeaks.q_value_peak_caller.max_paths
    for summit in self.summits:
        print(summit)
    n_accepted = sum(1 for peak in peaks if self.check_peak(peak))
    print("%s/%s" % (n_accepted, len(peaks)))
def from_vg_json_reads_and_graph(cls, json_file_name, graph_file_name):
    """Build an instance from a vg JSON read file and a numpy graph file.

    The graph is loaded from ``graph_file_name``; the reads in
    ``json_file_name`` are fed to a ``HaploTyper`` to obtain the maximum
    interval through the graph. The reads are then parsed again to
    extract their start positions.

    Returns ``cls(positions, indexed_interval)`` where ``positions`` is a
    lazy generator over ``interval.start_position``.
    """
    logging.info("Reading graph %s" % graph_file_name)
    graph = obg.GraphWithReversals.from_numpy_file(graph_file_name)

    logging.info("Getting indexed interval through graph")
    typer = HaploTyper(
        graph,
        obg.IntervalCollection(
            vg_json_file_to_intervals(json_file_name, graph)))
    typer.build()
    indexed_interval = typer.get_maximum_interval_through_graph()

    # The JSON file is parsed a second time on purpose: the first
    # iterable was handed to the HaploTyper above.
    # NOTE(review): presumably it is a one-shot generator -- confirm.
    reads = vg_json_file_to_intervals(json_file_name, graph)
    start_positions = (read.start_position for read in reads)
    return cls(start_positions, indexed_interval)
def write_sequence_and_intervals(self, n=100):
    """Simulate ``n`` intervals and write them and their sequences to disk.

    Side effects:
      * ``self.intervals`` is set to the list of simulated intervals.
      * ``self.retriever`` is set to a ``SequenceRetriever`` built from
        ``../tests/haplo1kg50-mhc.vg``.
      * The intervals are written to ``simulated_intervals.py``.
      * The sequences are written as FASTQ records named ``@sim<i>`` to
        ``simulated_sequences.fq``.

    Every base gets the quality character ``~``. The quality line is as
    long as the sequence it belongs to, as FASTQ requires, rather than a
    fixed 36 characters.
    """
    logging.info("Reading graph")
    logging.info("Simulating intervals")
    simulator = IntervalSimulator(self.graph, 36)
    self.intervals = [simulator.generate_interval() for _ in range(n)]
    obg.IntervalCollection(self.intervals).to_file(
        "simulated_intervals.py")

    logging.info("Getting sequences")
    self.retriever = SequenceRetriever.from_vg_graph(
        "../tests/haplo1kg50-mhc.vg")
    # Fetch every sequence before opening the output file, so a failed
    # lookup does not leave a truncated FASTQ file behind.
    sequences = [self.retriever.get_interval_sequence(interval)
                 for interval in self.intervals]

    with open("simulated_sequences.fq", "w") as fastq:
        for index, seq in enumerate(sequences):
            fastq.write("@sim" + str(index) + "\n")
            fastq.write(seq + "\n")
            fastq.write("+\n")
            # Quality string must have exactly one symbol per base.
            fastq.write("~" * len(seq) + "\n")
def get_linear_paths_in_graph(ob_graph, vg_graph, write_to_file_name=None):
    """Convert every path of ``vg_graph`` to an interval on ``ob_graph``.

    Each path is converted with ``path.to_obg(ob_graph=ob_graph)``. Paths
    whose conversion result is falsy are logged and skipped; the others
    get the path's name and are collected in a dict keyed by that name.

    If ``write_to_file_name`` is given, the collected intervals are also
    written to that file as a text-format ``obg.IntervalCollection``.

    Returns the dict mapping path name to interval.
    """
    assert ob_graph is not None

    linear_paths = {}
    for vg_path in vg_graph.paths:
        converted = vg_path.to_obg(ob_graph=ob_graph)
        if converted:
            converted.name = vg_path.name
            print("Path name: %s" % vg_path.name)
            linear_paths[converted.name] = converted
        else:
            logging.info("OBG interval for path " + vg_path.name
                         + " is False. Skipping.")

    if write_to_file_name is None:
        return linear_paths

    logging.info("Writing linear path to %s" % write_to_file_name)
    obg.IntervalCollection(linear_paths.values()).to_file(
        write_to_file_name, text_file=True)
    return linear_paths
def vg_json_file_to_interval_collection(vg_mapping_file_name,
                                        offset_based_graph=None):
    """Wrap the intervals parsed from a vg JSON mapping file.

    Passes both arguments straight to ``vg_json_file_to_intervals`` and
    returns the result wrapped in an ``obg.IntervalCollection``.
    """
    parsed_intervals = vg_json_file_to_intervals(
        vg_mapping_file_name, offset_based_graph)
    return obg.IntervalCollection(parsed_intervals)
def test_simulations(self, n=5):
    """Simulate ``n`` reads, map them with vg and compare with the truth.

    Writes the simulated intervals and sequences via
    ``self.write_sequence_and_intervals(n)``, maps
    ``simulated_sequences.fq`` against the haplo1kg50-mhc xg/gcsa
    indexes into ``mapped_reads.gam``, reads the mapped intervals back
    on ``../tests/graph.obg`` and passes both lists to
    ``self.compare_intervals``.
    """
    # Imported locally and aliased so the builtin ``map`` is not shadowed.
    from pyvg.mapping import map as map_reads
    from pyvg.util import vg_gam_file_to_interval_collection

    self.write_sequence_and_intervals(n)
    graph = obg.GraphWithReversals.from_file("../tests/graph.obg")
    gam_file_name = "mapped_reads.gam"
    map_reads("simulated_sequences.fq",
              "../tests/vgdata/haplo1kg50-mhc.xg",
              "../tests/vgdata/haplo1kg50-mhc.gcsa",
              gam_file_name)
    reads_intervals = vg_gam_file_to_interval_collection(
        None, gam_file_name, graph)
    self.compare_intervals(list(self.intervals), list(reads_intervals))


if __name__ == "__main__":
    # Fixed seed so the simulated data is reproducible between runs.
    random.seed(2000)
    EvaluateSimulations()
    # Terminate the script once the evaluation has run.
    exit()