Exemple #1
0
    def run(self, manager):
        """Run MGRA on the current breakpoint graph and store its guidance graph.

        The task is skipped (with an info log message) when no MGRA
        executable path is present under ``manager.configuration["mgra"]``.
        Otherwise the breakpoint graph stored in
        ``manager.data["gos-asm"]["bg"]`` is written in GRIMM format, an MGRA
        configuration file is written as JSON, MGRA is invoked, and every
        ``*.gen`` file found in MGRA's ``genomes`` output directory is read
        into a single ``BreakpointGraph`` that is stored under
        ``manager.data["mgra"]["guidance_graph"]``.

        :param manager: object exposing ``configuration``, ``data`` and
            ``logger`` attributes.
        :raises OSError: if the MGRA executable cannot be started, or if the
            expected output directory cannot be listed afterwards.
        """
        # Local import: the file-level import block is not part of this block.
        import subprocess

        mgra_ex_path = get_from_dict_with_path(manager.configuration,
                                               key="executable_path",
                                               path=["mgra"])
        manager.logger.info("=" * 80)
        if mgra_ex_path is None:
            manager.logger.info(
                "MGRA executable path is not supplied, skipping the MGRA based tasks"
            )
            return
        manager.logger.info(
            "Preparing data to communicate with MGRA and ontain guidance graph"
        )
        temp_dir = os.path.join(
            manager.configuration["gos-asm"]["output"]["dir"], "tmp_mgra")
        # exist_ok avoids the check-then-create race and also creates any
        # missing parent directories.
        os.makedirs(temp_dir, exist_ok=True)
        blocks_file_name = os.path.join(temp_dir, "blocks.txt")
        config_file_name = os.path.join(temp_dir, "config.cfg")
        mgra_output_dir_name = os.path.join(temp_dir, "output/")

        manager.logger.debug(
            "Writing blocks orders in GRIMM format to {file_name}".format(
                file_name=blocks_file_name))
        GRIMMWriter.print_genomes_as_grimm_blocks_orders(
            bg=manager.data["gos-asm"]["bg"], file_name=blocks_file_name)

        manager.logger.debug(
            "Writing configuration file for MGRA run to {file_name}".format(
                file_name=config_file_name))
        config = self.create_mgra_config(blocks_file_name=blocks_file_name,
                                         manager=manager)
        with open(config_file_name, "wt") as destination:
            json.dump(obj=config, fp=destination)
        manager.logger.info("Running MGRA on prepared configuration")
        # An argument list (no shell) keeps paths containing spaces or shell
        # metacharacters intact. expanduser preserves the "~" expansion the
        # shell used to perform on the executable path.
        completed = subprocess.run([
            os.path.expanduser(mgra_ex_path),
            "-c", config_file_name,
            "-o", mgra_output_dir_name,
        ])
        if completed.returncode == 0:
            manager.logger.debug("MGRA has successfully finished")
        else:
            # Report the failure instead of claiming success; the flow still
            # continues as before and fails below if no output was produced.
            manager.logger.error(
                "MGRA exited with non-zero status {code}".format(
                    code=completed.returncode))
        manager.logger.info("Reading MGRA produced guidance graph")

        genomes_dir = os.path.join(mgra_output_dir_name, "genomes")
        full_genomes_paths = [
            os.path.join(genomes_dir, name)
            for name in os.listdir(genomes_dir) if name.endswith(".gen")
        ]
        guidance_bg = BreakpointGraph()
        for file_name in full_genomes_paths:
            with open(file_name, "rt") as source:
                guidance_bg.update(
                    breakpoint_graph=GRIMMReader.get_breakpoint_graph(
                        stream=source, merge_edges=False),
                    merge_edges=False)
        if "mgra" not in manager.data:
            manager.data["mgra"] = {}
        manager.data["mgra"]["guidance_graph"] = guidance_bg
        manager.logger.info("Obtained MGRA produced guidance graph")
Exemple #2
0
def get_characters(grimm_file, genomes, logger):
    """Extract characters from the breakpoint graph stored in a GRIMM file.

    The file is parsed into a breakpoint graph, which is processed one
    connected component at a time. Components with exactly two nodes are
    skipped. For every remaining edge a character is computed with
    ``get_character_by_edge``; edges for which ``white_proportion`` of the
    genome colors is below 0.5 are dropped.

    :param grimm_file: path of the GRIMM file to read.
    :param genomes: passed through to ``get_character_by_edge``.
    :param logger: logger used for progress messages.
    :return: list of ``(v1, v2, genome_colors, labels)`` tuples, where
        ``v1 <= v2`` are the names of the edge's vertices and ``labels``
        is a list of state descriptions.
    """
    # Close the file deterministically instead of leaking the handle.
    with open(grimm_file) as source:
        bg = GRIMMReader.get_breakpoint_graph(source)
    logger.info('Breakpoint graph parsed')

    logger.info(f'Edges in breakpoint graph: {len(list(bg.edges()))}')

    characters = []

    # consistency_checker = TreeConsistencyChecker(tree_file)
    for component_bg in bg.connected_components_subgraphs():
        nodes_len = len(list(component_bg.nodes()))
        if nodes_len == 2:
            continue

        logger.info(
            f'Getting characters from breakpoint graph component, size={len(component_bg.bg)}'
        )

        neighbour_index = construct_vertex_genome_index(component_bg)

        for edge in component_bg.edges():
            v1, v2 = edge.vertex1.name, edge.vertex2.name
            # Normalize the vertex order so that v1 <= v2.
            if v1 > v2:
                v1, v2 = v2, v1

            genome_colors, neighbour_edges = get_character_by_edge(
                component_bg, edge, genomes, neighbour_index)
            if white_proportion(genome_colors.values()) < 0.5:
                continue

            labels = ['edge exists', 'parallel edge doesn\'t exist'] + [
                f'inversion {v1}-{v2}-{v1n}-{v2n}'
                for (v1n, v2n) in neighbour_edges
            ]

            characters.append((v1, v2, genome_colors, labels))

    return characters
Exemple #3
0
    def run(self, manager):
        """Load the gos-asm inputs and publish them in ``manager.data["gos-asm"]``.

        Fills the following keys: ``bg`` (breakpoint graph accumulated from
        all block-order files), ``phylogenetic_tree``, ``target_multicolor``,
        ``target_multicolors`` and ``repeats_guidance``.

        :param manager: object exposing ``configuration``, ``data`` and
            ``logger`` attributes.
        """

        def color_count(multicolor):
            # Sort key: size of the multicolor's hashable representation.
            return len(multicolor.hashable_representation)

        manager.logger.info("Reading blocks orders data")
        input_cfg = manager.configuration["gos-asm"]["input"]
        block_order_paths = input_cfg["block_orders_file_paths"]
        bg = BreakpointGraph()
        for block_order_path in block_order_paths:
            with open(block_order_path, "rt") as source:
                partial_bg = GRIMMReader.get_breakpoint_graph(stream=source, merge_edges=False)
                bg.update(breakpoint_graph=partial_bg, merge_edges=False)
        manager.data["gos-asm"]["bg"] = bg

        manager.logger.info("Reading phylogenetic tree information")
        tree = BGTree(newick=input_cfg["phylogenetic_tree"])
        manager.data["gos-asm"]["phylogenetic_tree"] = tree

        target_genomes = [BGGenome(organism) for organism in input_cfg["target_organisms"]]
        full_tmc = Multicolor(*target_genomes)
        manager.data["gos-asm"]["target_multicolor"] = full_tmc
        consistent_targets = Multicolor.split_colors(
            full_tmc,
            guidance=tree.vtree_consistent_multicolors,
            account_for_color_multiplicity_in_guidance=False)

        # Iterate over a snapshot of the targets, because the list itself
        # grows with every non-empty tree-consistent multicolor contained in
        # one of them.
        for target in list(consistent_targets):
            for candidate in deepcopy(tree.vtree_consistent_multicolors):
                if not (candidate <= target):
                    continue
                if candidate in consistent_targets:
                    continue
                if len(candidate.colors) > 0:
                    consistent_targets.append(candidate)

        consistent_targets = sorted(consistent_targets, key=color_count, reverse=True)

        # NOTE: an earlier, disabled experiment additionally combined
        # pairwise-disjoint multicolors into extra targets at this point.

        # De-duplicate through the hashable representation, then rebuild
        # the multicolors and order them from largest to smallest.
        unique_representations = {mc.hashable_representation for mc in consistent_targets}
        all_target_multicolors = sorted(
            (Multicolor(*representation) for representation in unique_representations),
            key=color_count,
            reverse=True)
        manager.data["gos-asm"]["target_multicolors"] = all_target_multicolors

        manager.logger.info("Reading repeats-bridges information")
        repeats_guidance = get_repeats_bridges_guidance(
            file_name=input_cfg["repeats_bridges_file"], data=manager.data)
        manager.data["gos-asm"]["repeats_guidance"] = repeats_guidance
Exemple #4
0
 def test_get_breakpoint_from_file_with_comment_data_string(self):
     """Check that ``# data :: fragment`` comments end up in the edges' data."""
     grimm_lines = [
         "",
         "\t",
         "#comment1",
         ">genome_name_1",
         "      #comment1",
         "# data :: fragment : name = chromosome_X",
         "a b $",
         "   #comment1   ",
         "\t>genome_name_2",
         "#data::fragment:name=scaffold111",
         "a $",
         "",
         "\n\t",
     ]
     stream = io.StringIO("\n".join(grimm_lines))
     graph = GRIMMReader.get_breakpoint_graph(stream, merge_edges=False)

     # Overall shape of the parsed graph.
     self.assertIsInstance(graph, BreakpointGraph)
     components = list(graph.connected_components_subgraphs())
     self.assertEqual(len(components), 3)
     self.assertEqual(len(list(graph.edges())), 5)
     self.assertEqual(len(list(graph.nodes())), 7)

     # Without merging, every edge carries exactly one genome's color.
     single_colors = [
         Multicolor(BGGenome("genome_name_1")),
         Multicolor(BGGenome("genome_name_2")),
     ]
     for graph_edge in graph.edges():
         self.assertIn(graph_edge.multicolor, single_colors)

     # Condensed edges may additionally combine both genomes.
     condensed_colors = [
         Multicolor(BGGenome("genome_name_1")),
         Multicolor(BGGenome("genome_name_2")),
         Multicolor(BGGenome("genome_name_1"), BGGenome("genome_name_2")),
     ]
     for graph_edge in graph.edges():
         condensed = graph.get_condensed_edge(vertex1=graph_edge.vertex1,
                                              vertex2=graph_edge.vertex2)
         self.assertIn(condensed.multicolor, condensed_colors)

     infinity_edges = [e for e in graph.edges() if e.is_infinity_edge]
     self.assertEqual(len(infinity_edges), 4)

     # Every edge must carry the fragment name taken from the data comment.
     expected_names = {"chromosome_X", "scaffold111"}
     for graph_edge in graph.edges():
         edge_data = graph_edge.data
         self.assertIn("fragment", edge_data)
         fragment = edge_data["fragment"]
         self.assertIsInstance(fragment, dict)
         self.assertIn("name", fragment)
         self.assertIn(fragment["name"], expected_names)

     # The forward orientation is recorded as a pair of vertices.
     ah = graph.get_vertex_by_name("ah")
     bt = graph.get_vertex_by_name("bt")
     ahi = graph.get_vertex_by_name("ah__infinity")
     regular_edge = graph.get_edge_by_two_vertices(vertex1=ah, vertex2=bt)
     self.assertTupleEqual(
         regular_edge.data["fragment"]["forward_orientation"], (ah, bt))
     infinity_edge = graph.get_edge_by_two_vertices(vertex1=ah, vertex2=ahi)
     self.assertTupleEqual(
         infinity_edge.data["fragment"]["forward_orientation"], (ah, ahi))
Exemple #5
0
    def test_output_genomes_as_grimm(self):
        """Round-trip the four-genome graph through GRIMMWriter and GRIMMReader.

        The graph is written to a file in a private temporary directory and
        read back; node count, edge count, node set and overall color set of
        the re-read graph must match the original.
        """
        # Local import: the file-level import block is not part of this block.
        import tempfile

        self._populate_four_genomes_bg(merge_edges=True)
        # A temporary directory avoids clobbering a "file_name.txt" in the
        # working directory and is removed even when writing or parsing fails.
        with tempfile.TemporaryDirectory() as tmp_dir:
            file_name = os.path.join(tmp_dir, "file_name.txt")
            GRIMMWriter.print_genomes_as_grimm_blocks_orders(
                bg=self.four_genome_bg, file_name=file_name)
            with open(file_name, "rt") as source:
                new_bg = GRIMMReader.get_breakpoint_graph(stream=source,
                                                          merge_edges=True)

        self.assertEqual(len(list(new_bg.nodes())),
                         len(list(self.four_genome_bg.nodes())))
        self.assertEqual(len(list(new_bg.edges())),
                         len(list(self.four_genome_bg.edges())))

        self.assertSetEqual(set(new_bg.nodes()),
                            set(self.four_genome_bg.nodes()))
        self.assertSetEqual(
            new_bg.get_overall_set_of_colors(),
            self.four_genome_bg.get_overall_set_of_colors())
Exemple #6
0
 def _populate_bg(data, merge_edges=False):
     """Build a breakpoint graph from an iterable of GRIMM-formatted lines.

     The lines are joined with newlines and parsed through an in-memory
     stream; ``merge_edges`` is forwarded to the reader unchanged.
     """
     return GRIMMReader.get_breakpoint_graph(io.StringIO("\n".join(data)),
                                             merge_edges=merge_edges)
Exemple #7
0
    def test_get_breakpoint_from_file(self):
        """Parse two dummy GRIMM inputs end to end and check the resulting graphs."""
        # Scenario 1: valid input mixed with odd indentation, blank lines
        # and comments, all of which the reader is expected to tolerate.
        messy_lines = [
            "",
            "\t",
            "#comment1",
            ">genome_name_1",
            "      #comment1",
            "a b $",
            "\tc -a @\t",
            "   #comment1   ",
            "\t>genome_name_2",
            "a $",
            "",
            "\n\t",
        ]
        graph = GRIMMReader.get_breakpoint_graph(
            io.StringIO("\n".join(messy_lines)))
        self.assertIsInstance(graph, BreakpointGraph)
        self.assertEqual(
            len(list(graph.connected_components_subgraphs())), 3)
        self.assertEqual(len(list(graph.edges())), 6)
        self.assertEqual(len(list(graph.nodes())), 9)
        allowed_colors = [
            Multicolor(BGGenome("genome_name_1"), BGGenome("genome_name_2")),
            Multicolor(BGGenome("genome_name_1")),
            Multicolor(BGGenome("genome_name_2")),
        ]
        for graph_edge in graph.edges():
            self.assertIn(graph_edge.multicolor, allowed_colors)
        infinity_edges = [e for e in graph.edges() if e.is_infinity_edge]
        self.assertEqual(len(infinity_edges), 3)

        # Scenario 2: five genomes with overlapping block content.
        five_genome_lines = [
            ">genome_1",
            "a $",
            ">genome_2",
            "a b $",
            "# this is a bad genome",
            ">genome_3",
            "a b c $",
            ">genome_4",
            "   # chromosome 1",
            "b c $",
            ">genome_5",
            "c $",
        ]
        graph = GRIMMReader.get_breakpoint_graph(
            io.StringIO("\n".join(five_genome_lines)))
        self.assertIsInstance(graph, BreakpointGraph)
        self.assertEqual(
            len(list(graph.connected_components_subgraphs())), 4)
        self.assertEqual(len(list(graph.edges())), 8)
        self.assertEqual(len(list(graph.nodes())), 12)
        genome1 = BGGenome("genome_1")
        genome2 = BGGenome("genome_2")
        genome3 = BGGenome("genome_3")
        genome4 = BGGenome("genome_4")
        genome5 = BGGenome("genome_5")
        allowed_colors = [
            Multicolor(genome1, genome2, genome3),
            Multicolor(genome1),
            Multicolor(genome2, genome3),
            Multicolor(genome2),
            Multicolor(genome3, genome4),
            Multicolor(genome3, genome4, genome5),
            Multicolor(genome4),
            Multicolor(genome5),
        ]
        for graph_edge in graph.edges():
            self.assertIn(graph_edge.multicolor, allowed_colors)
        infinity_edges = [e for e in graph.edges() if e.is_infinity_edge]
        self.assertEqual(len(infinity_edges), 6)
        # Infinity edges may carry any allowed multicolor except those
        # whose ``multicolors`` container has exactly two entries.
        infinity_colors = [
            candidate for candidate in allowed_colors
            if len(candidate.multicolors) != 2
        ]
        for graph_edge in infinity_edges:
            self.assertIn(graph_edge.multicolor, infinity_colors)