Ejemplo n.º 1
0
def compute_evolutionary_score(multicolor, scenario, data):
    """
    Count the pieces a multicolor falls into when split along the phylogenetic tree.

    For ``EvolutionaryScenario.existed`` the supplied ``multicolor`` itself is split. For any other scenario
    the difference ``full_multicolor - multicolor`` is split instead, where the full multicolor is obtained
    from ``get_full_multicolor`` and memoized under ``data["gos-asm"]["cache"]["full_multicolor"]``.

    :param multicolor: multicolor to be scored
    :param scenario: scenario value, compared against ``EvolutionaryScenario.existed``
    :param data: shared data dict; ``data["gos-asm"]`` must provide "phylogenetic_tree" and "cache"
    :return: length of the ``Multicolor.split_colors`` result, guided by the tree's consistent multicolors
    """
    tree = data["gos-asm"]["phylogenetic_tree"]

    def count_pieces(color):
        pieces = Multicolor.split_colors(multicolor=color,
                                         guidance=tree.consistent_multicolors,
                                         account_for_color_multiplicity_in_guidance=False)
        return len(pieces)

    if scenario == EvolutionaryScenario.existed:
        return count_pieces(multicolor)

    # lazily compute the full multicolor once and keep it in the shared cache
    if "full_multicolor" not in data["gos-asm"]["cache"]:
        computed = get_full_multicolor(data=data)
        data["gos-asm"]["cache"]["full_multicolor"] = computed
    complement = data["gos-asm"]["cache"]["full_multicolor"] - multicolor
    return count_pieces(complement)
Ejemplo n.º 2
0
    def run(self, manager):
        """
        Populate ``manager.data["gos-asm"]`` with the input data described in the configuration.

        Reads, in order: the block orders files (combined into a single ``BreakpointGraph`` with
        ``merge_edges=False``), the phylogenetic tree (Newick string), the target multicolor(s) derived from the
        configured target organisms, and the repeats-bridges guidance.

        Keys written under ``manager.data["gos-asm"]``: "bg", "phylogenetic_tree", "target_multicolor",
        "target_multicolors" and "repeats_guidance".

        :param manager: object exposing ``logger``, ``configuration`` and ``data``
        """
        manager.logger.info("Reading blocks orders data")
        input_config = manager.configuration["gos-asm"]["input"]
        block_orders_paths = input_config["block_orders_file_paths"]
        breakpoint_graph = BreakpointGraph()
        for path in block_orders_paths:
            with open(path, "rt") as stream:
                partial_graph = GRIMMReader.get_breakpoint_graph(stream=stream, merge_edges=False)
                breakpoint_graph.update(breakpoint_graph=partial_graph, merge_edges=False)
        manager.data["gos-asm"]["bg"] = breakpoint_graph

        manager.logger.info("Reading phylogenetic tree information")
        phylo_tree = NewickReader.from_string(data_string=input_config["phylogenetic_tree"])
        manager.data["gos-asm"]["phylogenetic_tree"] = phylo_tree

        target_genomes = [BGGenome(name) for name in input_config["target_organisms"]]
        whole_target = Multicolor(*target_genomes)
        manager.data["gos-asm"]["target_multicolor"] = whole_target
        consistent_targets = Multicolor.split_colors(whole_target,
                                                     guidance=phylo_tree.consistent_multicolors,
                                                     account_for_color_multiplicity_in_guidance=False)

        # extend the list with every non-empty tree consistent sub-color of its members;
        # the list grows while we go, hence the iteration over a snapshot
        for target in consistent_targets[:]:
            for candidate in deepcopy(phylo_tree.consistent_multicolors):
                if not (candidate <= target):
                    continue
                if candidate in consistent_targets:
                    continue
                if len(candidate.colors) > 0:
                    consistent_targets.append(candidate)

        def representation_size(mc):
            return len(mc.hashable_representation)

        consistent_targets = sorted(consistent_targets, key=representation_size, reverse=True)

        # add the union of every group (size >= 2) of pairwise non-intersecting multicolors
        combined_targets = consistent_targets[:]
        for group_size in range(2, len(consistent_targets) + 1):
            for group in itertools.combinations(consistent_targets[:], group_size):
                group = list(group)
                has_overlap = any(len(left.intersect(right).colors) > 0
                                  for left, right in itertools.combinations(group, 2))
                if has_overlap:
                    continue
                union = Multicolor()
                for member in group:
                    union += member
                combined_targets.append(union)

        # drop duplicates by going through the hashable representation and back
        unique_representations = {mc.hashable_representation for mc in combined_targets}
        combined_targets = [Multicolor(*representation) for representation in unique_representations]
        combined_targets = sorted(combined_targets, key=representation_size, reverse=True)
        manager.data["gos-asm"]["target_multicolors"] = combined_targets

        manager.logger.info("Reading repeats-bridges information")
        repeats_guidance = get_repeats_bridges_guidance(file_name=input_config["repeats_bridges_file"],
                                                        data=manager.data)
        manager.data["gos-asm"]["repeats_guidance"] = repeats_guidance
def assemble_scaffolds(graph, bgtree, target_organisms, exclude=None, verbose=False, verbose_destination=None):
    """
    Identify and apply assembly points for every target multicolor derived from the supplied organisms.

    The target organisms are stacked into one multicolor, which is split into tree consistent chunks. That list
    is extended with every non-empty tree consistent sub-color of its members and then with the union of every
    group of pairwise non-intersecting members. The resulting multicolors are processed in decreasing order of
    the length of their hashable representation: assembly points are identified and then handed to
    ``assemble_points`` together with ``graph``.

    :param graph: breakpoint graph passed to ``identify_assembly_points`` and ``assemble_points``
    :param bgtree: phylogenetic tree object exposing ``consistent_multicolors``
    :param target_organisms: colors (genomes) targeted for scaffolding
    :param exclude: forwarded unchanged to ``identify_assembly_points``
    :param verbose: when true, progress information is printed here and in ``identify_assembly_points``
    :param verbose_destination: ``file`` argument for the verbose ``print`` calls (``None`` means ``sys.stdout``)
    :return: list of ``(v1, v2, weight, repeat_name, multicolor)`` tuples, one per identified assembly point
    """
    overall_assembling_result = []

    # all genomes stacked up together
    overall_target_multicolor = Multicolor(*target_organisms)

    # all of them combined might not be a tree consistent set, so we separate it into smallest number
    #   of tree consistent chunks
    tree_consistent_target_multicolors = Multicolor.split_colors(overall_target_multicolor,
                                                                 guidance=bgtree.consistent_multicolors,
                                                                 account_for_color_multiplicity_in_guidance=False)
    if verbose:
        print("Supplied set of targeted for scaffolding genomes has been split into",
              len(tree_consistent_target_multicolors), "T-consistent sets:", file=verbose_destination)
        for multicolor in tree_consistent_target_multicolors:
            print("\t", [color.name for color in multicolor.multicolors.elements()], file=verbose_destination)
        # this message used to ignore verbose_destination and always went to stdout
        print("Expanding target multicolors to include all T-consistent subcolors", file=verbose_destination)

    # now we need to expand that list into a larger list to include every possible tree consistent sub-color,
    #   of whatever is already in the list
    #
    # we will change it as we go, so better iterate over a copy
    for target_multicolor in tree_consistent_target_multicolors[:]:
        for tree_c_multicolor in deepcopy(bgtree.consistent_multicolors):
            if tree_c_multicolor <= target_multicolor \
                    and tree_c_multicolor not in tree_consistent_target_multicolors \
                    and len(tree_c_multicolor.colors) > 0:
                tree_consistent_target_multicolors.append(tree_c_multicolor)

    tree_consistent_target_multicolors = sorted(tree_consistent_target_multicolors,
                                                key=lambda mc: len(mc.hashable_representation),
                                                reverse=True)

    # add the union of every group (size >= 2) of pairwise non-intersecting multicolors
    all_target_multicolors = tree_consistent_target_multicolors[:]
    for i in range(2, len(tree_consistent_target_multicolors) + 1):
        for comb in itertools.combinations(tree_consistent_target_multicolors[:], i):
            comb = list(comb)
            for mc1, mc2 in itertools.combinations(comb, 2):
                if len(mc1.intersect(mc2).colors) > 0:
                    break
            else:
                new_mc = Multicolor()
                for mc in comb:
                    new_mc += mc
                all_target_multicolors.append(new_mc)

    # remove duplicates by a round trip through the hashable representation
    hashed_vertex_tree_consistent_multicolors = {mc.hashable_representation for mc in all_target_multicolors}
    all_target_multicolors = [Multicolor(*hashed_multicolor) for hashed_multicolor in
                              hashed_vertex_tree_consistent_multicolors]
    all_target_multicolors = sorted(all_target_multicolors,
                                    key=lambda mc: len(mc.hashable_representation),
                                    reverse=True)
    if verbose:
        # report the length of the list that is actually printed below
        print("Determined full list of targeted for scaffolding multicolors of length",
              len(all_target_multicolors), file=verbose_destination)
        for multicolor in all_target_multicolors:
            print("\t", [color.name for color in multicolor.multicolors.elements()], file=verbose_destination)

    for multicolor in all_target_multicolors:
        # verbose is forwarded together with its destination; previously only the destination was passed,
        # so the callee's diagnostics could never be enabled from here
        assembly_points = identify_assembly_points(graph, bgtree, target_multicolor=multicolor, exclude=exclude,
                                                   verbose=verbose, verbose_destination=verbose_destination)
        for v1, v2, weight, repeat_name in assembly_points:
            overall_assembling_result.append((v1, v2, weight, repeat_name, multicolor))
        assemble_points(graph, assemblies=assembly_points, multicolor=multicolor)
    return overall_assembling_result
def identify_assembly_points(graph, bgtree, target_multicolor, exclude=None, verbose=False, verbose_destination=None):
    """
    The main granular assembling function, that IDENTIFIES assembly points, but does not perform the assembly on its own.
    It is meant to leave the supplied breakpoint graph unchanged: connected components are requested as copies.

    For every connected component the irregular subnets are obtained from ``get_irregular_subnets``. For each
    repeat supported on both the "h" and the "t" side within a subnet, candidate vertex pairs joined by a
    non-irregular edge are scored with ``get_assembly_score``; a candidate is kept when
    ``before - after - offset >= threshold``. Kept candidates are then taken greedily in decreasing order of
    weight, each (vertex, direction) end being used at most once per repeat.

    :param graph: breakpoint graph to inspect
    :param bgtree: phylogenetic tree object exposing ``consistent_multicolors``
    :param target_multicolor: multicolor for which assembly points are searched
    :param exclude: container forwarded to ``get_irregular_subnets``; defaults to an empty list
    :param verbose: when true, a one-line progress message is printed
    :param verbose_destination: ``file`` argument for the verbose ``print`` call
    :return: list of ``(vertex_t, vertex_h, weight, repeat)`` tuples, where ``vertex_t`` is the end chosen with
        direction "t" and ``vertex_h`` the end chosen with direction "h"
    """
    if verbose:
        print(">>Identifying assemblies for target multicolor:",
              [e.name for e in target_multicolor.multicolors.elements()], file=verbose_destination)
    guidance = bgtree.consistent_multicolors[:]
    # offset: number of pieces the target multicolor splits into, minus one
    offset = len(Multicolor.split_colors(target_multicolor, guidance=guidance,
                                         account_for_color_multiplicity_in_guidance=False)) - 1
    threshold = 1 if offset == 0 else 2

    assemblies = []  # the overall result
    if exclude is None:
        exclude = []  # a container with single colors of genomes, that are to be considered fully assembled
    # caches shared by all get_assembly_score calls made during this invocation
    multicolor_scores = {}
    surroundings = {}
    full_irregular_multicolors = {}
    for cc in graph.connected_components_subgraphs(copy=True):
        # the second and fifth returned values are not used here
        subnets, _, h_support, t_support, _ = get_irregular_subnets(cc, target_multicolor, exclude)
        for subnet in subnets:
            vertices = [vertex for vertex in subnet.nodes() if not vertex.is_irregular_vertex]
            repeats_h = {repeat for vertex in vertices for repeat in h_support[vertex]}
            repeats_t = {repeat for vertex in vertices for repeat in t_support[vertex]}
            repeats = repeats_h.intersection(repeats_t)
            for repeat in repeats:
                # nodes of g are (vertex, direction) pairs, edges are the acceptable assemblies for this repeat
                g = nx.Graph()
                for edge in subnet.edges():
                    if edge.is_irregular_edge:
                        continue
                    v1, v2 = sorted((edge.vertex1, edge.vertex2), key=lambda vertex: vertex.name)
                    if v1 in h_support and repeat in h_support[v1] and v2 in t_support and repeat in t_support[v2]:
                        before, after = get_assembly_score(graph, v1, v2, target_multicolor, bgtree,
                                                           multicolor_scores, surroundings, full_irregular_multicolors)
                        gain = before - after - offset
                        if gain >= threshold:
                            g.add_edge((v1, "h"), (v2, "t"), weight=gain)
                    if v1 in t_support and repeat in t_support[v1] and v2 in h_support and repeat in h_support[v2]:
                        before, after = get_assembly_score(graph, v1, v2, target_multicolor, bgtree,
                                                           multicolor_scores, surroundings, full_irregular_multicolors)
                        gain = before - after - offset
                        if gain >= threshold:
                            g.add_edge((v1, "t"), (v2, "h"), weight=gain)
                edges = g.edges(data=True)
                if len(edges) == 0:
                    continue
                # normalize the endpoint order inside every edge so that sorting below is deterministic
                new_edges = []
                for first, second, attributes in edges:
                    end1, end2 = sorted((first, second), key=lambda item: item[0].name)
                    new_edges.append((end1, end2, attributes))
                # heaviest first; ties are broken by both endpoints' (name, direction)
                # (the last component used to repeat e[0][1] instead of the second endpoint's direction)
                edges = sorted(new_edges, reverse=True,
                               key=lambda e: (e[2]["weight"], e[0][0].name, e[0][1], e[1][0].name, e[1][1]))
                visited = set()
                for end1, end2, data in edges:
                    weight = data["weight"]
                    if end1 not in visited and end2 not in visited:
                        visited.add(end1)
                        visited.add(end2)
                        v1, dir1 = end1
                        v2, dir2 = end2
                        assert (dir1 == "h" and dir2 == "t") or (dir1 == "t" and dir2 == "h")
                        if dir1 == "h":
                            assemblies.append((v2, v1, weight, repeat))
                        else:
                            assemblies.append((v1, v2, weight, repeat))
    return assemblies
def get_assembly_score(graph, v1, v2, target_multicolor, tree, multicolors_scores, surroundings, full_irregular_multicolors):
    """
    Compute the "before" and "after" scores used to judge a potential assembly between ``v1`` and ``v2``.

    A score of a multicolor is the number of pieces ``Multicolor.split_colors`` produces for it under the tree's
    consistent multicolors. With ``im1``/``im2`` being the full irregular multicolors of ``v1``/``v2``,
    ``s`` the multicolor of the edge between them (reduced by ``target_multicolor`` when it contains it),
    and ``c``, ``c_a``, ``c_b`` derived from the colors missing around each vertex:

    * ``before = score(im1 + c_a) + score(im2 + c_b) + score(s + c)``
    * ``after  = score(im1 - target + c_a) + score(im2 - target + c_b) + score(s + target + c)``

    The three dict arguments are memoization caches that are filled in place; ``None`` is replaced by a fresh
    dict local to the call.

    :param graph: breakpoint graph the vertices belong to
    :param v1: first vertex
    :param v2: second vertex
    :param target_multicolor: multicolor being assembled
    :param tree: phylogenetic tree object exposing ``consistent_multicolors``
    :param multicolors_scores: cache mapping a hashable multicolor representation to its score
    :param surroundings: cache mapping a vertex to its surrounding multicolor
    :param full_irregular_multicolors: cache mapping a vertex to its full irregular multicolor
    :return: ``(before, after)`` tuple of integer scores
    """
    if surroundings is None:
        surroundings = {}
    if multicolors_scores is None:
        multicolors_scores = {}
    if full_irregular_multicolors is None:
        full_irregular_multicolors = {}

    guidance = tree.consistent_multicolors
    # the largest tree consistent multicolor is taken as the full set of colors
    largest = max(guidance, key=lambda multicolor: len(list(multicolor.multicolors.elements())))
    full_multicolor = Multicolor(*largest.colors)

    def score(multicolor):
        # memoized number of pieces the multicolor splits into under the tree guidance
        key = multicolor.hashable_representation
        if key not in multicolors_scores:
            multicolors_scores[key] = len(Multicolor.split_colors(multicolor, guidance=guidance,
                                                                  account_for_color_multiplicity_in_guidance=False))
        return multicolors_scores[key]

    def vertex_multicolors(vertex):
        # memoized (surrounding multicolor, full irregular multicolor) of a vertex
        if vertex not in surroundings:
            surroundings[vertex] = Multicolor(*get_vertex_surrounding_multicolor(graph, vertex).colors)
        if vertex not in full_irregular_multicolors:
            full_irregular_multicolors[vertex] = Multicolor(*get_full_irregular_multicolor(graph, vertex).colors)
        return surroundings[vertex], full_irregular_multicolors[vertex]

    surrounding1, im1 = vertex_multicolors(v1)
    # im2 must come from v2; it used to be read from the v1 cache entry
    surrounding2, im2 = vertex_multicolors(v2)

    c_1_multicolor = full_multicolor - surrounding1
    c_2_multicolor = full_multicolor - surrounding2

    c = c_1_multicolor.intersect(c_2_multicolor)

    c_a = c_1_multicolor - c
    c_b = c_2_multicolor - c
    sedge = graph.get_edge_by_two_vertices(v1, v2)
    # work on a fresh Multicolor so that the edge's own multicolor object is not the one modified below
    smulticolor = Multicolor(*sedge.multicolor.colors) if sedge is not None else Multicolor()
    if target_multicolor <= smulticolor:
        smulticolor -= target_multicolor

    ie1_score = score(im1 + c_a)
    ie2_score = score(im2 + c_b)
    se_score = score(smulticolor + c)
    before = ie1_score + ie2_score + se_score

    new_ie1_score = score(im1 - target_multicolor + c_a)
    new_ie2_score = score(im2 - target_multicolor + c_b)
    new_se_score = score(smulticolor + target_multicolor + c)
    after = new_ie1_score + new_ie2_score + new_se_score

    return before, after
                               grimm_formatted_genomes[genome][scaffold_name]),
                      "$",
                      file=target)
        # print()

    bg_graphs = dict()
    for file_name in os.listdir(target_directory):
        file_name = os.path.join(target_directory, file_name)
        with open(file_name, "r") as source:
            bg = GRIMMReader.get_breakpoint_graph(source)
            bg_graphs[file_name.split(".")[0]] = bg
    bg = BreakpointGraph()
    for br_gr in bg_graphs.values():
        bg.update(br_gr, merge_edges=True)

    target_multicolor = Multicolor("Anguilla_japonica")

    print("Breakpoint graph stats:")
    print(
        "\t", "non-infinity nodes count:",
        len(
            list(node for node in bg.nodes()
                 if not BGVertex.is_infinity_vertex(node))))
    normal_edges, infinity_edges = [], []
    for edge in bg.edges():
        if edge.is_infinity_edge:
            infinity_edges.append(edge)
        else:
            normal_edges.append(edge)
    print("\t", "non-infinity edges count:", len(normal_edges))
    print("\t", "infinity edges count:", len(infinity_edges))