def run(self, manager):
    """Run MGRA on the current breakpoint graph and store its guidance graph.

    The MGRA executable path is looked up in ``manager.configuration`` via
    ``get_from_dict_with_path`` (key ``"executable_path"``, path ``["mgra"]``).
    When it is ``None`` the task logs a message and returns without doing
    anything else.

    Otherwise the method:

    1. creates ``<gos-asm output dir>/tmp_mgra`` if needed,
    2. writes ``manager.data["gos-asm"]["bg"]`` there as ``blocks.txt``
       and the result of ``self.create_mgra_config`` as JSON in
       ``config.cfg``,
    3. invokes MGRA with ``-c <config>`` and ``-o <output dir>``,
    4. reads every ``*.gen`` file from ``<output dir>/genomes`` into a single
       ``BreakpointGraph`` and stores it under
       ``manager.data["mgra"]["guidance_graph"]``.

    :param manager: object exposing ``configuration``, ``data`` and ``logger``.
    """
    mgra_ex_path = get_from_dict_with_path(manager.configuration,
                                           key="executable_path",
                                           path=["mgra"])
    manager.logger.info("=" * 80)
    if mgra_ex_path is None:
        manager.logger.info(
            "MGRA executable path is not supplied, skipping the MGRA based tasks")
        return

    manager.logger.info(
        "Preparing data to communicate with MGRA and obtain guidance graph")
    temp_dir = os.path.join(
        manager.configuration["gos-asm"]["output"]["dir"], "tmp_mgra")
    # exist_ok avoids the check-then-create race and also creates missing parents.
    os.makedirs(temp_dir, exist_ok=True)

    blocks_file_name = os.path.join(temp_dir, "blocks.txt")
    config_file_name = os.path.join(temp_dir, "config.cfg")
    mgra_output_dir_name = os.path.join(temp_dir, "output/")

    manager.logger.debug(
        "Writing blocks orders in GRIMM format to {file_name}".format(
            file_name=blocks_file_name))
    GRIMMWriter.print_genomes_as_grimm_blocks_orders(
        bg=manager.data["gos-asm"]["bg"], file_name=blocks_file_name)

    manager.logger.debug(
        "Writing configuration file for MGRA run to {file_name}".format(
            file_name=config_file_name))
    config = self.create_mgra_config(blocks_file_name=blocks_file_name,
                                     manager=manager)
    with open(config_file_name, "wt") as destination:
        json.dump(obj=config, fp=destination)

    manager.logger.info("Running MGRA on prepared configuration")
    # NOTE(review): the command is still a shell string, so paths containing
    # whitespace or shell metacharacters will be misinterpreted. It is kept
    # that way because existing configurations may rely on shell expansion
    # of the executable path.
    command = "{mgra_ex_path} -c {config_file_path} -o {output_dir_path}".format(
        mgra_ex_path=mgra_ex_path,
        config_file_path=config_file_name,
        output_dir_path=mgra_output_dir_name)
    exit_status = os.system(command)
    if exit_status != 0:
        # Do not claim success when MGRA failed. Execution continues as it
        # did before, so reading the output below may fail or pick up stale
        # results from an earlier run.
        manager.logger.error(
            "MGRA finished with a non-zero exit status {status}".format(
                status=exit_status))
    else:
        manager.logger.debug("MGRA has successfully finished")

    manager.logger.info("Reading MGRA produced guidance graph")
    genomes_dir = os.path.join(mgra_output_dir_name, "genomes")
    full_genomes_paths = [
        os.path.join(genomes_dir, name)
        for name in os.listdir(genomes_dir)
        if name.endswith(".gen")
    ]
    guidance_bg = BreakpointGraph()
    for file_name in full_genomes_paths:
        with open(file_name, "rt") as source:
            guidance_bg.update(
                breakpoint_graph=GRIMMReader.get_breakpoint_graph(
                    stream=source, merge_edges=False),
                merge_edges=False)

    if "mgra" not in manager.data:
        manager.data["mgra"] = {}
    manager.data["mgra"]["guidance_graph"] = guidance_bg
    manager.logger.info("Obtained MGRA produced guidance graph")
def get_characters(grimm_file, genomes, logger):
    """Collect edge-based characters from the breakpoint graph of a GRIMM file.

    The file is parsed into a breakpoint graph, which is then processed one
    connected component at a time. Components with exactly two nodes are
    skipped. For every edge of the remaining components
    ``get_character_by_edge`` is called; the edge is kept only when
    ``white_proportion`` of the resulting genome colors is at least 0.5.

    :param grimm_file: path to the GRIMM file to read.
    :param genomes: passed through to ``get_character_by_edge``.
    :param logger: logger used for progress messages.
    :return: list of ``(v1, v2, genome_colors, labels)`` tuples, where
        ``v1 <= v2`` are the names of the edge's vertices and ``labels`` is a
        list of human-readable state labels.
    """
    # Close the input file deterministically instead of leaking the handle.
    # NOTE(review): assumes the reader consumes the stream eagerly — confirm.
    with open(grimm_file) as source:
        bg = GRIMMReader.get_breakpoint_graph(source)
    logger.info('Breakpoint graph parsed')
    logger.info(f'Edges in breakpoint graph: {len(list(bg.edges()))}')

    characters = []
    # consistency_checker = TreeConsistencyChecker(tree_file)
    for component_bg in bg.connected_components_subgraphs():
        nodes_len = len(list(component_bg.nodes()))
        if nodes_len == 2:
            continue

        logger.info(
            f'Getting characters from breakpoint graph component, size={len(component_bg.bg)}'
        )
        neighbour_index = construct_vertex_genome_index(component_bg)

        for edge in component_bg.edges():
            v1, v2 = edge.vertex1.name, edge.vertex2.name
            # Normalize the pair so the smaller name always comes first.
            if v1 > v2:
                v1, v2 = v2, v1

            genome_colors, neighbour_edges = get_character_by_edge(
                component_bg, edge, genomes, neighbour_index)
            if white_proportion(genome_colors.values()) < 0.5:
                continue

            labels = ['edge exists', 'parallel edge doesn\'t exist'] + [
                f'inversion {v1}-{v2}-{v1n}-{v2n}'
                for (v1n, v2n) in neighbour_edges
            ]
            characters.append((v1, v2, genome_colors, labels))
    return characters
def run(self, manager):
    """Load the input data for the gos-asm pipeline into ``manager.data``.

    Reads, in this order, and stores under ``manager.data["gos-asm"]``:

    * ``"bg"`` -- a breakpoint graph built from every file listed in the
      ``block_orders_file_paths`` input setting;
    * ``"phylogenetic_tree"`` -- a ``BGTree`` built from the configured
      newick string;
    * ``"target_multicolor"`` -- a multicolor of all target organisms;
    * ``"target_multicolors"`` -- the de-duplicated target multicolors,
      ordered by decreasing length of their hashable representation;
    * ``"repeats_guidance"`` -- the result of
      ``get_repeats_bridges_guidance`` for the configured repeats file.

    :param manager: object exposing ``configuration``, ``data`` and ``logger``.
    """

    def representation_size(multicolor):
        # Sort key: size of the multicolor's hashable representation.
        return len(multicolor.hashable_representation)

    log = manager.logger

    log.info("Reading blocks orders data")
    input_config = manager.configuration["gos-asm"]["input"]
    bg = BreakpointGraph()
    for block_orders_path in input_config["block_orders_file_paths"]:
        with open(block_orders_path, "rt") as source:
            parsed_bg = GRIMMReader.get_breakpoint_graph(stream=source, merge_edges=False)
            bg.update(breakpoint_graph=parsed_bg, merge_edges=False)
    manager.data["gos-asm"]["bg"] = bg

    log.info("Reading phylogenetic tree information")
    tree = BGTree(newick=input_config["phylogenetic_tree"])
    manager.data["gos-asm"]["phylogenetic_tree"] = tree

    target_genomes = [BGGenome(organism) for organism in input_config["target_organisms"]]
    full_tmc = Multicolor(*target_genomes)
    manager.data["gos-asm"]["target_multicolor"] = full_tmc

    vtree_consistent_target_multicolors = Multicolor.split_colors(
        full_tmc,
        guidance=tree.vtree_consistent_multicolors,
        account_for_color_multiplicity_in_guidance=False)

    # Walk a snapshot of the list, since the list itself grows inside the loop.
    for target_multicolor in list(vtree_consistent_target_multicolors):
        for candidate in deepcopy(tree.vtree_consistent_multicolors):
            if not (candidate <= target_multicolor):
                continue
            if candidate in vtree_consistent_target_multicolors:
                continue
            if not len(candidate.colors) > 0:
                continue
            vtree_consistent_target_multicolors.append(candidate)

    vtree_consistent_target_multicolors = sorted(
        vtree_consistent_target_multicolors, key=representation_size, reverse=True)
    all_target_multicolors = vtree_consistent_target_multicolors[:]

    # for i in range(2, len(vtree_consistent_target_multicolors) + 1):
    #     for comb in itertools.combinations(vtree_consistent_target_multicolors[:], i):
    #         comb = list(comb)
    #         for mc1, mc2 in itertools.combinations(comb, 2):
    #             if len(mc1.intersect(mc2).colors) > 0:
    #                 break
    #         else:
    #             new_mc = Multicolor()
    #             for mc in comb:
    #                 new_mc += mc
    #             all_target_multicolors.append(new_mc)

    # De-duplicate through the hashable representation, then rebuild and re-sort.
    unique_representations = {mc.hashable_representation for mc in all_target_multicolors}
    all_target_multicolors = sorted(
        [Multicolor(*representation) for representation in unique_representations],
        key=representation_size,
        reverse=True)
    manager.data["gos-asm"]["target_multicolors"] = all_target_multicolors
    # log_bg_stats(bg=bg, logger=manager.logger)

    log.info("Reading repeats-bridges information")
    repeats_guidance = get_repeats_bridges_guidance(
        file_name=input_config["repeats_bridges_file"],
        data=manager.data)
    manager.data["gos-asm"]["repeats_guidance"] = repeats_guidance
def test_get_breakpoint_from_file_with_comment_data_string(self):
    """Parse GRIMM input whose ``# data :: ...`` comments carry fragment names.

    Checks the overall graph shape (components, edges, nodes), the edge
    multicolors, the number of infinity edges, and that every edge carries
    ``data["fragment"]["name"]`` and the expected ``forward_orientation``.
    """
    data = [
        "",
        "\t",
        "#comment1",
        ">genome_name_1",
        "      #comment1",
        "# data :: fragment : name = chromosome_X",
        "a b $",
        "   #comment1   ",
        "\t>genome_name_2",
        "#data::fragment:name=scaffold111",
        "a $",
        "",
        "\n\t"
    ]
    file_like = io.StringIO("\n".join(data))
    result_bg = GRIMMReader.get_breakpoint_graph(file_like, merge_edges=False)

    self.assertIsInstance(result_bg, BreakpointGraph)
    self.assertEqual(len(list(result_bg.connected_components_subgraphs())), 3)
    self.assertEqual(len(list(result_bg.edges())), 5)
    self.assertEqual(len(list(result_bg.nodes())), 7)

    multicolors = [
        Multicolor(BGGenome("genome_name_1")),
        Multicolor(BGGenome("genome_name_2"))
    ]
    condensed_multicolors = [
        Multicolor(BGGenome("genome_name_1")),
        Multicolor(BGGenome("genome_name_2")),
        Multicolor(BGGenome("genome_name_1"), BGGenome("genome_name_2"))
    ]
    for bgedge in result_bg.edges():
        self.assertIn(bgedge.multicolor, multicolors)
    for bgedge in result_bg.edges():
        condensed_edge = result_bg.get_condensed_edge(
            vertex1=bgedge.vertex1, vertex2=bgedge.vertex2)
        self.assertIn(condensed_edge.multicolor, condensed_multicolors)

    infinity_edges = [
        bgedge for bgedge in result_bg.edges() if bgedge.is_infinity_edge
    ]
    self.assertEqual(len(infinity_edges), 4)

    for bgedge in result_bg.edges():
        # Separate name so the input ``data`` list above is not shadowed.
        edge_data = bgedge.data
        self.assertIn("fragment", edge_data)
        self.assertIsInstance(edge_data["fragment"], dict)
        self.assertIn("name", edge_data["fragment"])
        self.assertIn(edge_data["fragment"]["name"],
                      {"chromosome_X", "scaffold111"})

    ah = result_bg.get_vertex_by_name("ah")
    bt = result_bg.get_vertex_by_name("bt")
    ahi = result_bg.get_vertex_by_name("ah__infinity")
    edge = result_bg.get_edge_by_two_vertices(vertex1=ah, vertex2=bt)
    self.assertTupleEqual(edge.data["fragment"]["forward_orientation"],
                          (ah, bt))
    iedge = result_bg.get_edge_by_two_vertices(vertex1=ah, vertex2=ahi)
    self.assertTupleEqual(iedge.data["fragment"]["forward_orientation"],
                          (ah, ahi))
def test_output_genomes_as_grimm(self):
    """Round-trip the four-genome graph through ``GRIMMWriter`` and ``GRIMMReader``.

    The graph is written as GRIMM block orders, read back with
    ``merge_edges=True``, and compared with the original on node count,
    edge count, node set and overall set of colors.
    """
    # Local import: the module-level imports are outside this block.
    import tempfile

    self._populate_four_genomes_bg(merge_edges=True)
    # A private temporary directory keeps the test from touching (or
    # clobbering) files in the current working directory, and is removed even
    # when the writer itself fails part-way through.
    with tempfile.TemporaryDirectory() as tmp_dir:
        file_name = os.path.join(tmp_dir, "file_name.txt")
        GRIMMWriter.print_genomes_as_grimm_blocks_orders(
            bg=self.four_genome_bg, file_name=file_name)
        with open(file_name, "rt") as source:
            new_bg = GRIMMReader.get_breakpoint_graph(stream=source,
                                                      merge_edges=True)
            self.assertEqual(len(list(new_bg.nodes())),
                             len(list(self.four_genome_bg.nodes())))
            self.assertEqual(len(list(new_bg.edges())),
                             len(list(self.four_genome_bg.edges())))
            self.assertSetEqual(set(new_bg.nodes()),
                                set(self.four_genome_bg.nodes()))
            self.assertSetEqual(
                new_bg.get_overall_set_of_colors(),
                self.four_genome_bg.get_overall_set_of_colors())
def _populate_bg(data, merge_edges=False):
    """Build a breakpoint graph from an iterable of GRIMM-formatted lines.

    The strings in ``data`` are joined with newlines and handed to
    ``GRIMMReader.get_breakpoint_graph`` as an in-memory text stream;
    ``merge_edges`` is forwarded unchanged.
    """
    grimm_text = "\n".join(data)
    return GRIMMReader.get_breakpoint_graph(io.StringIO(grimm_text),
                                            merge_edges=merge_edges)
def test_get_breakpoint_from_file(self):
    """Parse two dummy GRIMM inputs and check the resulting graph shape.

    For each input the test checks the number of connected components, edges
    and nodes, that every edge multicolor is one of the expected ones, and the
    number of infinity edges. The second input additionally checks the
    multicolors of the infinity edges.
    """
    # full workflow testing with dummy data
    # correct cases are assumed with all kind of crazy indentation and rubbish data mixed in, but still correct
    data = [
        "",
        "\t",
        "#comment1",
        ">genome_name_1",
        "      #comment1",
        "a b $",
        "\tc -a @\t",
        "   #comment1   ",
        "\t>genome_name_2",
        "a $",
        "",
        "\n\t"
    ]
    file_like = io.StringIO("\n".join(data))
    result_bg = GRIMMReader.get_breakpoint_graph(file_like)
    self.assertIsInstance(result_bg, BreakpointGraph)
    self.assertEqual(len(list(result_bg.connected_components_subgraphs())), 3)
    self.assertEqual(len(list(result_bg.edges())), 6)
    self.assertEqual(len(list(result_bg.nodes())), 9)
    multicolors = [
        Multicolor(BGGenome("genome_name_1"), BGGenome("genome_name_2")),
        Multicolor(BGGenome("genome_name_1")),
        Multicolor(BGGenome("genome_name_2"))
    ]
    for bgedge in result_bg.edges():
        self.assertIn(bgedge.multicolor, multicolors)
    infinity_edges = [
        bgedge for bgedge in result_bg.edges() if bgedge.is_infinity_edge
    ]
    self.assertEqual(len(infinity_edges), 3)

    data = [
        ">genome_1",
        "a $",
        ">genome_2",
        "a b $",
        "# this is a bad genome",
        ">genome_3",
        "a b c $",
        ">genome_4",
        "   # chromosome 1",
        "b c $",
        ">genome_5",
        "c $"
    ]
    file_like = io.StringIO("\n".join(data))
    result_bg = GRIMMReader.get_breakpoint_graph(file_like)
    self.assertIsInstance(result_bg, BreakpointGraph)
    self.assertEqual(len(list(result_bg.connected_components_subgraphs())), 4)
    self.assertEqual(len(list(result_bg.edges())), 8)
    self.assertEqual(len(list(result_bg.nodes())), 12)
    genome1, genome2, genome3 = (BGGenome("genome_1"), BGGenome("genome_2"),
                                 BGGenome("genome_3"))
    genome4, genome5 = BGGenome("genome_4"), BGGenome("genome_5")
    multicolors = [
        Multicolor(genome1, genome2, genome3),
        Multicolor(genome1),
        Multicolor(genome2, genome3),
        Multicolor(genome2),
        Multicolor(genome3, genome4),
        Multicolor(genome3, genome4, genome5),
        Multicolor(genome4),
        Multicolor(genome5)
    ]
    for bgedge in result_bg.edges():
        self.assertIn(bgedge.multicolor, multicolors)
    infinity_edges = [
        bgedge for bgedge in result_bg.edges() if bgedge.is_infinity_edge
    ]
    self.assertEqual(len(infinity_edges), 6)
    # Expected multicolors whose ``multicolors`` collection does not have
    # exactly two entries.
    infinity_multicolors = [
        multicolor for multicolor in multicolors
        if len(multicolor.multicolors) != 2
    ]
    for bgedge in infinity_edges:
        self.assertIn(bgedge.multicolor, infinity_multicolors)