Ejemplo n.º 1
0
    def run(self, manager):
        """Find assembly points with the Phylogeny Assembly strategy and apply them.

        For every connected component of the breakpoint graph stored in
        ``manager.data["gos-asm"]["bg"]`` this method:

        1. collects the regular vertices accepted by
           ``suitable_for_assembly_fragment_ends_vertex``;
        2. links every pair that passes ``assembly_is_allowed`` and
           ``has_support_edge`` and whose minimal phylogenetic score exceeds
           ``phylo_threshold``; the link is weighted with the maximal score;
        3. computes ``nx.max_weight_matching`` on the resulting graph and, for
           each matched pair and each color of the target multicolor, appends an
           ``AssemblyPoint`` to ``manager.data["gos-asm"]["assembly_points"]``.

        K-breaks derived from the recorded assembly points are applied to the
        breakpoint graph only after all components have been examined.

        :param manager: object exposing ``logger`` and a ``data`` dictionary
            holding ``["gos-asm"]["bg"]``, ``["gos-asm"]["target_multicolor"]``
            and ``["gos-asm"]["assembly_points"]``.
        :raises Exception: when ``get_repeat_guidance`` raises
            ``NetworkXNoPath`` for a pair of irregular edges.
        """
        logger = manager.logger
        logger.info("=" * 80)
        logger.info("Assembling with Phylogeny Assembly strategy")
        bg = manager.data["gos-asm"]["bg"]

        log_bg_stats(bg=bg, logger=logger)
        target_multicolor = manager.data["gos-asm"]["target_multicolor"]
        ap_header_printed = False
        kbreaks = []

        for cc_cnt, cc in enumerate(
                bg.connected_components_subgraphs(copy=False)):
            possible_assemblies_graph = Graph()
            for vertex in cc.nodes():
                if not vertex.is_regular_vertex:
                    continue
                if suitable_for_assembly_fragment_ends_vertex(
                        graph=bg,
                        reg_vertex=vertex,
                        target_multicolor=target_multicolor):
                    possible_assemblies_graph.add_node(vertex)
            candidates = list(possible_assemblies_graph.nodes())
            if len(candidates) > 1:
                logger.debug("suitable vertices {}".format(len(candidates)))

            for v1, v2 in itertools.combinations(candidates, 2):
                if not assembly_is_allowed(graph=bg,
                                           vertex1=v1,
                                           vertex2=v2,
                                           target_multicolor=target_multicolor,
                                           data=manager.data):
                    continue
                if not has_support_edge(graph=bg, v1=v1, v2=v2):
                    continue
                s_edge = bg.get_condensed_edge(vertex1=v1, vertex2=v2)
                scores = self._score_evolutionary_scenarios(
                    manager=manager,
                    bg=bg,
                    v1=v1,
                    v2=v2,
                    sedge_multicolor=s_edge.multicolor,
                    target_multicolor=target_multicolor)
                _, maximal_score = get_maximal_phylogenetic_score(scores)
                _, minimal_score = get_minimal_phylogenetic_score(scores)
                if minimal_score > phylo_threshold:
                    # Pass the weight as a plain keyword attribute. networkx
                    # 2.0 removed the ``attr_dict`` parameter of ``add_edge``,
                    # so ``attr_dict={...}`` would be stored under a key named
                    # "attr_dict" and the matching below would fall back to the
                    # default weight of 1 for every edge. The keyword form is
                    # accepted by both old and new networkx releases.
                    possible_assemblies_graph.add_edge(
                        v1, v2, weight=maximal_score)
                elif minimal_score == 1:
                    # This candidate is only logged; it is neither stored nor
                    # turned into a k-break.
                    # NOTE(review): assumes ``colors`` yields a fresh set on
                    # every access so that ``pop()`` does not mutate the target
                    # multicolor -- confirm against the Multicolor class.
                    color = target_multicolor.colors.pop()
                    ap = self._build_assembly_point(
                        manager=manager,
                        bg=bg,
                        v1=v1,
                        v2=v2,
                        color=color,
                        target_multicolor=target_multicolor,
                        target_color=target_multicolor,
                        cc_id=cc_cnt)
                    logger.debug(
                        "Identified TEMP assembly point :: {ap}".format(
                            ap=ap.as_logger_entry(), ))

            matching = nx.max_weight_matching(G=possible_assemblies_graph)
            # Older networkx releases return a {vertex: mate} dictionary that
            # lists every pair in both directions; newer ones return a set of
            # 2-tuples. Normalise to an iterable of pairs and let the
            # ``processed_vertices`` guard drop the mirrored duplicates.
            if isinstance(matching, dict):
                matched_pairs = matching.items()
            else:
                matched_pairs = matching
            processed_vertices = set()
            for v1, v2 in matched_pairs:
                if v1 in processed_vertices or v2 in processed_vertices:
                    continue
                processed_vertices.update((v1, v2))
                for color in target_multicolor.colors:
                    ap = self._build_assembly_point(
                        manager=manager,
                        bg=bg,
                        v1=v1,
                        v2=v2,
                        color=color,
                        target_multicolor=target_multicolor,
                        target_color=Multicolor(color),
                        cc_id=cc_cnt)
                    if not ap_header_printed:
                        # Emit the table header once, before the first entry.
                        ap_header_printed = True
                        header = AssemblyPoint.logger_file_header_string()
                        rule = "-" * 32 + "-" * len(header)
                        logger.debug(rule)
                        logger.debug(" " * 32 + header)
                        logger.debug(rule)
                    logger.debug(
                        "Identified an assembly point :: {ap}".format(
                            ap=ap.as_logger_entry()))
                    manager.data["gos-asm"]["assembly_points"].append(ap)
                    kbreaks.append(
                        create_k_break_from_assembly_point(assembly_point=ap))

        logger.debug("-" * 32 + "-" *
                     len(AssemblyPoint.logger_file_header_string()))
        # The graph is modified only now, after all components were inspected.
        for k_break in kbreaks:
            bg.apply_kbreak(kbreak=k_break, merge=False)
        logger.info(
            "Identified and performed {gluing_cnt} assemblies with Phylogeny Assembly strategy"
            "".format(gluing_cnt=len(kbreaks)))
        log_bg_stats(bg=bg, logger=logger)

    def _score_evolutionary_scenarios(self, manager, bg, v1, v2,
                                      sedge_multicolor, target_multicolor):
        """Return ``{scenario name: assembly score}`` for gluing ``v1`` to ``v2``.

        One score is computed with ``get_assembly_score`` for each entry of
        ``non_conflicting_evolutionary_scenarios``.
        """
        full_ie1_multicolor = get_full_irregular_multicolor(
            vertex=v1, data=manager.data, graph=bg)
        full_ie2_multicolor = get_full_irregular_multicolor(
            vertex=v2, data=manager.data, graph=bg)
        return {
            e_scenario_name: get_assembly_score(
                full_ie1_multicolor=full_ie1_multicolor,
                full_ie2_multicolor=full_ie2_multicolor,
                sedge_multicolor=sedge_multicolor,
                target_multicolor=target_multicolor,
                evolutionary_scenario=e_scenario,
                data=manager.data)
            for e_scenario_name, e_scenario in non_conflicting_evolutionary_scenarios
        }

    def _build_assembly_point(self, manager, bg, v1, v2, color,
                              target_multicolor, target_color, cc_id):
        """Create the ``AssemblyPoint`` joining ``v1`` and ``v2`` in ``color``.

        :param color: color used to look up the irregular edges at both
            vertices and passed as ``genome`` to ``get_repeat_guidance``.
        :param target_color: value stored as ``target_color`` in the
            ``AssemblyPointInfo``.
        :param cc_id: index of the connected component, stored as ``ap.cc_id``.
        :raises Exception: when ``get_repeat_guidance`` raises
            ``NetworkXNoPath``.
        """
        iedge1 = get_irregular_edge_by_vertex_color(graph=bg,
                                                    vertex=v1,
                                                    color=color)
        iedge2 = get_irregular_edge_by_vertex_color(graph=bg,
                                                    vertex=v2,
                                                    color=color)
        s_edge = bg.get_condensed_edge(vertex1=v1, vertex2=v2)
        r1_name, r1_dir = get_repeat_info(iedge1)
        r2_name, r2_dir = get_repeat_info(iedge2)
        r1_entry = get_repeat_entry(repeat_name=r1_name,
                                    repeat_direction=r1_dir)
        r2_entry = get_repeat_entry(repeat_name=r2_name,
                                    repeat_direction=r2_dir)
        repeat_info = {
            "repeat_name_1": r1_name,
            "repeat_name_2": r2_name,
            "repeat_dir_1": r1_dir,
            "repeat_dir_2": r2_dir
        }
        try:
            repeat_guidance = get_repeat_guidance(genome=color,
                                                  repeat1_entry=r1_entry,
                                                  repeat2_entry=r2_entry,
                                                  data=manager.data)
        except nx.NetworkXNoPath:
            # ``nx.NetworkXNoPath`` is the public top-level name of the
            # exception class.
            raise Exception(
                "Pair of edges must suitable for assembly with repeats guidance"
            )
        repeat_info["repeat_guidance"] = repeat_guidance
        # Without a support edge the scores are computed against an empty
        # multicolor.
        if s_edge is not None:
            sedge_multicolor = s_edge.multicolor
        else:
            sedge_multicolor = Multicolor()
        evolutionary_scenarios = self._score_evolutionary_scenarios(
            manager=manager,
            bg=bg,
            v1=v1,
            v2=v2,
            sedge_multicolor=sedge_multicolor,
            target_multicolor=target_multicolor)
        api = AssemblyPointInfo(
            support_edge=s_edge,
            iedge1=iedge1,
            iedge2=iedge2,
            evolutionary_scenarios=evolutionary_scenarios,
            allowed=True,
            repeat_info=repeat_info,
            target_multicolor=target_multicolor,
            target_color=target_color)
        ap = AssemblyPoint(vertex1=v1,
                           vertex2=v2,
                           additional_information=api)
        ap.cc_id = cc_id
        return ap
Ejemplo n.º 2
0
    def run(self, manager):
        """Find assembly points inside guidance-graph components and apply them.

        The guidance graph from ``manager.data["mgra"]["guidance_graph"]`` is
        passed through ``get_balance_graph``. For each of its connected
        components this method:

        1. collects regular vertices accepted by
           ``suitable_for_assembly_fragment_ends_vertex`` (checked against the
           breakpoint graph ``manager.data["gos-asm"]["bg"]``);
        2. skips the component when more than 1000 such vertices were found;
        3. links every pair accepted by ``assembly_is_allowed`` and keeps only
           pairs that form a connected component of exactly two vertices;
        4. for each kept pair and each color of the target multicolor, appends
           an ``AssemblyPoint`` to
           ``manager.data["gos-asm"]["assembly_points"]``. Pairs with equal
           ``block_name`` and colors where both irregular edges carry the same
           fragment name are skipped.

        K-breaks derived from the recorded assembly points are applied to the
        breakpoint graph only after all components have been examined.

        :param manager: object exposing ``logger`` and a ``data`` dictionary
            with the keys listed above.
        :raises Exception: when ``get_repeat_guidance`` raises
            ``NetworkXNoPath`` for a pair of irregular edges.
        """
        # Components with more candidate vertices than this are skipped
        # entirely (the pairwise check below is quadratic in their number).
        max_candidates = 1000

        logger = manager.logger
        logger.info("=" * 80)
        logger.info("Assembling with Connected Components Assembly Strategy Using MGRA guidance graph")
        logger.info("Strategy version 2")
        bg = manager.data["gos-asm"]["bg"]
        guidance_graph = get_balance_graph(breakpoint_graph=manager.data["mgra"]["guidance_graph"])

        log_bg_stats(bg=bg, logger=logger)
        log_bg_stats(bg=guidance_graph, logger=logger)
        target_multicolor = manager.data["gos-asm"]["target_multicolor"]
        ap_header_printed = False

        kbreaks = []
        for cc_cnt, cc in enumerate(guidance_graph.connected_components_subgraphs(copy=False)):
            possible_assemblies_graph = Graph()
            for vertex in cc.nodes():
                if not vertex.is_regular_vertex:
                    continue
                if suitable_for_assembly_fragment_ends_vertex(graph=bg, reg_vertex=vertex, target_multicolor=target_multicolor):
                    possible_assemblies_graph.add_node(vertex)
            candidates = list(possible_assemblies_graph.nodes())
            if len(candidates) > max_candidates:
                continue
            for v1, v2 in itertools.combinations(candidates, 2):
                if assembly_is_allowed(graph=bg, vertex1=v1, vertex2=v2, target_multicolor=target_multicolor, data=manager.data):
                    possible_assemblies_graph.add_edge(v1, v2)

            # Only the vertex sets of the components are needed, so use
            # ``nx.connected_components``. ``nx.connected_component_subgraphs``
            # was deprecated and later removed from networkx.
            reg_vertices_for_assembly = set()
            for component in nx.connected_components(possible_assemblies_graph):
                # A vertex is assembled only when it has exactly one partner.
                if len(component) == 2:
                    reg_vertices_for_assembly.add(tuple(component))

            for v1, v2 in reg_vertices_for_assembly:
                if v1.block_name == v2.block_name:
                    continue
                for color in target_multicolor.colors:
                    iedge1 = get_irregular_edge_by_vertex_color(graph=bg, vertex=v1, color=color)
                    iedge2 = get_irregular_edge_by_vertex_color(graph=bg, vertex=v2, color=color)
                    fr1_name = get_from_dict_with_path(iedge1.data, key="name", path=["fragment"])
                    fr2_name = get_from_dict_with_path(iedge2.data, key='name', path=["fragment"])
                    if fr1_name == fr2_name:
                        # Both ends carry the same fragment name: skip.
                        continue
                    if bg.has_edge(vertex1=v1, vertex2=v2):
                        s_edge = bg.get_condensed_edge(vertex1=v1, vertex2=v2)
                    else:
                        s_edge = None
                    r1_name, r1_dir = get_repeat_info(iedge1)
                    r2_name, r2_dir = get_repeat_info(iedge2)
                    r1_entry = get_repeat_entry(repeat_name=r1_name, repeat_direction=r1_dir)
                    r2_entry = get_repeat_entry(repeat_name=r2_name, repeat_direction=r2_dir)
                    repeat_info = {
                        "repeat_name_1": r1_name,
                        "repeat_name_2": r2_name,
                        "repeat_dir_1": r1_dir,
                        "repeat_dir_2": r2_dir
                    }
                    try:
                        repeat_guidance = get_repeat_guidance(genome=color, repeat1_entry=r1_entry, repeat2_entry=r2_entry, data=manager.data)
                    except nx.NetworkXNoPath:
                        # ``nx.NetworkXNoPath`` is the public top-level name of
                        # the exception class.
                        raise Exception("Pair of edges must suitable for assembly with repeats guidance")
                    repeat_info["repeat_guidance"] = repeat_guidance
                    full_ie1_multicolor = get_full_irregular_multicolor(vertex=v1, data=manager.data, graph=bg)
                    full_ie2_multicolor = get_full_irregular_multicolor(vertex=v2, data=manager.data, graph=bg)
                    # Without a support edge the scores are computed against an
                    # empty multicolor.
                    sedge_multicolor = s_edge.multicolor if s_edge is not None else Multicolor()
                    evolutionary_scenarios = {
                        e_scenario_name: get_assembly_score(full_ie1_multicolor=full_ie1_multicolor,
                                                            full_ie2_multicolor=full_ie2_multicolor,
                                                            sedge_multicolor=sedge_multicolor,
                                                            target_multicolor=target_multicolor,
                                                            evolutionary_scenario=e_scenario, data=manager.data)
                        for e_scenario_name, e_scenario in non_conflicting_evolutionary_scenarios
                    }

                    api = AssemblyPointInfo(support_edge=s_edge, iedge1=iedge1, iedge2=iedge2,
                                            evolutionary_scenarios=evolutionary_scenarios,
                                            allowed=True, repeat_info=repeat_info, target_multicolor=target_multicolor,
                                            target_color=Multicolor(color))
                    ap = AssemblyPoint(vertex1=v1, vertex2=v2, additional_information=api)
                    ap.cc_id = cc_cnt
                    if not ap_header_printed:
                        # Emit the table header once, before the first entry.
                        ap_header_printed = True
                        header = AssemblyPoint.logger_file_header_string()
                        rule = "-"*32 + "-"*len(header)
                        logger.debug(rule)
                        logger.debug(" "*32 + header)
                        logger.debug(rule)
                    logger.debug("Identified an assembly point :: {ap}".format(ap=ap.as_logger_entry()))
                    manager.data["gos-asm"]["assembly_points"].append(ap)
                    kbreaks.append(create_k_break_from_assembly_point(assembly_point=ap))
        logger.debug("-"*32 + "-"*len(AssemblyPoint.logger_file_header_string()))
        # The graph is modified only now, after all components were inspected.
        for k_break in kbreaks:
            bg.apply_kbreak(kbreak=k_break, merge=False)
        logger.info("Identified and performed {gluing_cnt} assemblies with Connected Components Assembly strategy"
                    "".format(gluing_cnt=len(kbreaks)))
        log_bg_stats(bg=bg, logger=logger)