def compute_evolutionary_score(multicolor, scenario, data):
    """Return how many parts ``Multicolor.split_colors`` yields for the scenario.

    For ``EvolutionaryScenario.existed`` the supplied ``multicolor`` itself is
    split.  For any other scenario the split is performed on
    ``full_multicolor - multicolor``, where the full multicolor is taken from
    ``data["gos-asm"]["cache"]["full_multicolor"]`` (and computed with
    ``get_full_multicolor`` and stored there on first use).

    :param multicolor: multicolor to be scored
    :param scenario: value compared against ``EvolutionaryScenario.existed``
    :param data: mapping with a ``"gos-asm"`` entry holding
        ``"phylogenetic_tree"`` and ``"cache"``
    :return: length of the list returned by ``Multicolor.split_colors`` when
        guided by the tree's ``consistent_multicolors``
    """
    gos_asm_data = data["gos-asm"]
    tree = gos_asm_data["phylogenetic_tree"]

    if scenario == EvolutionaryScenario.existed:
        color_to_split = multicolor
    else:
        cache = gos_asm_data["cache"]
        # The full multicolor is computed lazily and memoised in the shared cache.
        if "full_multicolor" not in cache:
            cache["full_multicolor"] = get_full_multicolor(data=data)
        color_to_split = cache["full_multicolor"] - multicolor

    parts = Multicolor.split_colors(
        multicolor=color_to_split,
        guidance=tree.consistent_multicolors,
        account_for_color_multiplicity_in_guidance=False,
    )
    return len(parts)
def run(self, manager):
    """Read the gos-asm inputs and store the derived objects in ``manager.data``.

    The following entries of ``manager.data["gos-asm"]`` are written:

    * ``"bg"`` -- a ``BreakpointGraph`` updated with every file listed in the
      ``block_orders_file_paths`` configuration value;
    * ``"phylogenetic_tree"`` -- the tree parsed by ``NewickReader``;
    * ``"target_multicolor"`` -- one ``Multicolor`` built from all target organisms;
    * ``"target_multicolors"`` -- de-duplicated list of target multicolors, sorted
      by descending length of their hashable representation;
    * ``"repeats_guidance"`` -- result of ``get_repeats_bridges_guidance``.

    :param manager: object exposing ``logger``, ``configuration`` and ``data``
    """

    def representation_size(mc):
        return len(mc.hashable_representation)

    manager.logger.info("Reading blocks orders data")
    input_config = manager.configuration["gos-asm"]["input"]
    file_paths = input_config["block_orders_file_paths"]
    bg = BreakpointGraph()
    for file_path in file_paths:
        with open(file_path, "rt") as source:
            partial_graph = GRIMMReader.get_breakpoint_graph(stream=source, merge_edges=False)
            bg.update(breakpoint_graph=partial_graph, merge_edges=False)
    manager.data["gos-asm"]["bg"] = bg

    manager.logger.info("Reading phylogenetic tree information")
    tree = NewickReader.from_string(data_string=input_config["phylogenetic_tree"])
    manager.data["gos-asm"]["phylogenetic_tree"] = tree

    target_genomes = [BGGenome(genome_name) for genome_name in input_config["target_organisms"]]
    full_tmc = Multicolor(*target_genomes)
    manager.data["gos-asm"]["target_multicolor"] = full_tmc

    consistent_targets = Multicolor.split_colors(
        full_tmc,
        guidance=tree.consistent_multicolors,
        account_for_color_multiplicity_in_guidance=False,
    )
    # The list grows while we scan it, so the outer loop walks a snapshot.
    for target in list(consistent_targets):
        for candidate in deepcopy(tree.consistent_multicolors):
            if (
                candidate <= target
                and candidate not in consistent_targets
                and len(candidate.colors) > 0
            ):
                consistent_targets.append(candidate)
    consistent_targets = sorted(consistent_targets, key=representation_size, reverse=True)

    # Add the sum of every group (size >= 2) of pairwise non-intersecting multicolors.
    combined_targets = list(consistent_targets)
    for group_size in range(2, len(consistent_targets) + 1):
        for group in itertools.combinations(consistent_targets, group_size):
            has_overlap = any(
                len(left.intersect(right).colors) > 0
                for left, right in itertools.combinations(group, 2)
            )
            if has_overlap:
                continue
            merged = Multicolor()
            for member in group:
                merged += member
            combined_targets.append(merged)

    # De-duplicate through the hashable representation, then rebuild and sort.
    unique_representations = {mc.hashable_representation for mc in combined_targets}
    combined_targets = sorted(
        [Multicolor(*representation) for representation in unique_representations],
        key=representation_size,
        reverse=True,
    )
    manager.data["gos-asm"]["target_multicolors"] = combined_targets
    # log_bg_stats(bg=bg, logger=manager.logger)

    manager.logger.info("Reading repeats-bridges information")
    manager.data["gos-asm"]["repeats_guidance"] = get_repeats_bridges_guidance(
        file_name=input_config["repeats_bridges_file"],
        data=manager.data,
    )
def assemble_scaffolds(graph, bgtree, target_organisms, exclude=None, verbose=False, verbose_destination=None):
    """Identify and apply assembly points for every targeted multicolor.

    The target organisms are combined into one multicolor, split into
    tree-consistent pieces, expanded with every non-empty tree-consistent
    sub-multicolor of those pieces, and finally extended with the sums of all
    groups of pairwise non-intersecting pieces.  For each resulting multicolor
    (processed in descending order of hashable-representation length) assembly
    points are identified with ``identify_assembly_points`` and then passed to
    ``assemble_points`` together with ``graph``.

    :param graph: breakpoint graph handed to ``identify_assembly_points`` and
        ``assemble_points``
    :param bgtree: tree object providing ``consistent_multicolors``
    :param target_organisms: iterable of colors unpacked into ``Multicolor``
    :param exclude: forwarded unchanged to ``identify_assembly_points``
    :param verbose: when true, progress information is printed
    :param verbose_destination: ``file`` argument for all verbose ``print`` calls
    :return: list of ``(v1, v2, weight, repeat_name, multicolor)`` tuples, one
        per identified assembly point
    """

    def representation_size(mc):
        return len(mc.hashable_representation)

    overall_assembling_result = []

    # all genomes stacked up together
    overall_target_multicolor = Multicolor(*target_organisms)

    # all of them combined might not be a tree consistent set, so we separate it into smallest number
    # of tree consistent chunks
    tree_consistent_target_multicolors = Multicolor.split_colors(
        overall_target_multicolor,
        guidance=bgtree.consistent_multicolors,
        account_for_color_multiplicity_in_guidance=False,
    )

    if verbose:
        print("Supplied set of targeted for scaffolding genomes has been split into",
              len(tree_consistent_target_multicolors),
              "T-consistent sets:", file=verbose_destination)
        for multicolor in tree_consistent_target_multicolors:
            print("\t", [color.name for color in multicolor.multicolors.elements()], file=verbose_destination)
        # Sent to the same destination as every other verbose message.
        print("Expanding target multicolors to include all T-consistent subcolors", file=verbose_destination)

    # now we need to expand that list into a larger list to include every possible tree consistent sub-color,
    # of whatever is already in the list
    #
    # we will change it as we go, so better iterate over a copy
    for target_multicolor in tree_consistent_target_multicolors[:]:
        for tree_c_multicolor in deepcopy(bgtree.consistent_multicolors):
            if (
                tree_c_multicolor <= target_multicolor
                and tree_c_multicolor not in tree_consistent_target_multicolors
                and len(tree_c_multicolor.colors) > 0
            ):
                tree_consistent_target_multicolors.append(tree_c_multicolor)

    tree_consistent_target_multicolors = sorted(
        tree_consistent_target_multicolors, key=representation_size, reverse=True
    )

    # Add the sum of every group (size >= 2) of pairwise non-intersecting multicolors.
    all_target_multicolors = tree_consistent_target_multicolors[:]
    for group_size in range(2, len(tree_consistent_target_multicolors) + 1):
        for comb in itertools.combinations(tree_consistent_target_multicolors, group_size):
            if any(len(mc1.intersect(mc2).colors) > 0 for mc1, mc2 in itertools.combinations(comb, 2)):
                continue
            new_mc = Multicolor()
            for mc in comb:
                new_mc += mc
            all_target_multicolors.append(new_mc)

    # De-duplicate through the hashable representation, then rebuild and sort.
    unique_representations = {mc.hashable_representation for mc in all_target_multicolors}
    all_target_multicolors = sorted(
        [Multicolor(*representation) for representation in unique_representations],
        key=representation_size,
        reverse=True,
    )

    if verbose:
        # Report the size of the list that is actually printed and processed below.
        print("Determined full list of targeted for scaffolding multicolors of length",
              len(all_target_multicolors), file=verbose_destination)
        for multicolor in all_target_multicolors:
            print("\t", [color.name for color in multicolor.multicolors.elements()], file=verbose_destination)

    for multicolor in all_target_multicolors:
        # ``verbose`` is forwarded so that ``verbose_destination`` has an effect downstream.
        assembly_points = identify_assembly_points(graph, bgtree, target_multicolor=multicolor, exclude=exclude,
                                                   verbose=verbose, verbose_destination=verbose_destination)
        overall_assembling_result.extend(
            (v1, v2, weight, repeat_name, multicolor) for v1, v2, weight, repeat_name in assembly_points
        )
        assemble_points(graph, assemblies=assembly_points, multicolor=multicolor)
    return overall_assembling_result
def identify_assembly_points(graph, bgtree, target_multicolor, exclude=None, verbose=False, verbose_destination=None):
    """
    The main granular assembling function, that IDENTIFIES assembly points, but does not perform the assembly on its own
    It DOES NOT change the supplied breakpoint graph in any way!!!

    For every connected component of ``graph`` the irregular subnets are
    obtained from ``get_irregular_subnets``.  For every repeat supported on
    both the "h" and "t" side within a subnet, candidate edges between regular
    vertices are scored with ``get_assembly_score``; candidates whose gain
    (``before - after - offset``) reaches the threshold are kept, and a greedy
    selection by descending gain picks edges so that no ``(vertex, end)`` pair
    is used twice for the same repeat.

    :param graph: breakpoint graph to inspect
    :param bgtree: tree object providing ``consistent_multicolors``
    :param target_multicolor: multicolor for which assemblies are sought
    :param exclude: forwarded to ``get_irregular_subnets``; defaults to an empty list
    :param verbose: when true, a header line is printed
    :param verbose_destination: ``file`` argument for the verbose ``print`` call
    :return: list of ``(vertex_t, vertex_h, weight, repeat)`` tuples, where the
        first vertex is the one paired with the "t" end
    """
    if verbose:
        print(">>Identifying assemblies for target multicolor:",
              [e.name for e in target_multicolor.multicolors.elements()], file=verbose_destination)
    guidance = bgtree.consistent_multicolors[:]
    offset = len(Multicolor.split_colors(target_multicolor, guidance=guidance,
                                         account_for_color_multiplicity_in_guidance=False)) - 1
    threshold = 1 if offset == 0 else 2
    assemblies = []  # the overall result
    if exclude is None:
        exclude = []  # a container with single colors of genomes, that are to be considered fully assembled

    # Caches shared by all ``get_assembly_score`` calls made during this invocation.
    multicolor_scores = {}
    surroundings = {}
    full_irregular_multicolors = {}

    for cc in graph.connected_components_subgraphs(copy=True):
        subnets, _possible_assemblies, h_support, t_support, _ = get_irregular_subnets(cc, target_multicolor, exclude)
        supports = {"h": h_support, "t": t_support}
        for subnet in subnets:
            vertices = [vertex for vertex in subnet.nodes() if not vertex.is_irregular_vertex]
            repeats_h = {repeat for vertex in vertices for repeat in h_support[vertex]}
            repeats_t = {repeat for vertex in vertices for repeat in t_support[vertex]}
            for repeat in repeats_h.intersection(repeats_t):
                g = nx.Graph()
                for edge in subnet.edges():
                    if edge.is_irregular_edge:
                        continue
                    v1, v2 = sorted((edge.vertex1, edge.vertex2), key=lambda vertex: vertex.name)
                    # Try both orientations: (v1 head, v2 tail) and (v1 tail, v2 head).
                    for end1, end2 in (("h", "t"), ("t", "h")):
                        support1, support2 = supports[end1], supports[end2]
                        if not (v1 in support1 and repeat in support1[v1]
                                and v2 in support2 and repeat in support2[v2]):
                            continue
                        before, after = get_assembly_score(graph, v1, v2, target_multicolor, bgtree,
                                                           multicolor_scores, surroundings,
                                                           full_irregular_multicolors)
                        gain = before - after - offset
                        if gain >= threshold:
                            g.add_edge((v1, end1), (v2, end2), weight=gain)

                # Normalise endpoint order by vertex name so the sort below is well defined.
                candidates = []
                for first, second, attributes in g.edges(data=True):
                    left, right = sorted((first, second), key=lambda item: item[0].name)
                    candidates.append((left, right, attributes))
                if not candidates:
                    continue

                # The last key component now refers to the second endpoint's end
                # (it used to repeat the first endpoint's end).
                candidates.sort(
                    reverse=True,
                    key=lambda e: (e[2]["weight"], e[0][0].name, e[0][1], e[1][0].name, e[1][1]),
                )

                # Greedy selection: best gain first, each (vertex, end) used at most once.
                visited = set()
                for left, right, attributes in candidates:
                    if left in visited or right in visited:
                        continue
                    visited.add(left)
                    visited.add(right)
                    v1, dir1 = left
                    v2, dir2 = right
                    assert (dir1 == "h" and dir2 == "t") or (dir1 == "t" and dir2 == "h")
                    weight = attributes["weight"]
                    if dir1 == "h":
                        assemblies.append((v2, v1, weight, repeat))
                    else:
                        assemblies.append((v1, v2, weight, repeat))
    return assemblies
def get_assembly_score(graph, v1, v2, target_multicolor, tree, multicolors_scores, surroundings,
                       full_irregular_multicolors):
    """Return the ``(before, after)`` split-count sums for gluing ``v1`` and ``v2``.

    Each score is the number of parts returned by ``Multicolor.split_colors``
    (guided by ``tree.consistent_multicolors``) and is memoised in
    ``multicolors_scores`` under the multicolor's ``hashable_representation``.

    ``before`` sums the scores of ``im1 + c_a``, ``im2 + c_b`` and
    ``smulticolor + c``; ``after`` sums the scores of the same multicolors with
    ``target_multicolor`` removed from the two irregular multicolors and added
    to the ``v1``--``v2`` edge multicolor.

    :param graph: breakpoint graph queried for vertex surroundings, irregular
        multicolors and the edge between ``v1`` and ``v2``
    :param v1: first vertex
    :param v2: second vertex
    :param target_multicolor: multicolor being assembled
    :param tree: tree object providing ``consistent_multicolors``
    :param multicolors_scores: cache of split counts (mutated); ``None`` means
        a fresh local cache
    :param surroundings: per-vertex cache of surrounding multicolors (mutated);
        ``None`` means a fresh local cache
    :param full_irregular_multicolors: per-vertex cache of full irregular
        multicolors (mutated); ``None`` means a fresh local cache
    :return: tuple ``(before, after)``
    """
    if surroundings is None:
        surroundings = {}
    if multicolors_scores is None:
        multicolors_scores = {}
    if full_irregular_multicolors is None:
        full_irregular_multicolors = {}

    guidance = tree.consistent_multicolors
    largest_consistent = max(guidance, key=lambda multicolor: len(list(multicolor.multicolors.elements())))
    full_multicolor = Multicolor(*largest_consistent.colors)

    def surrounding_of(vertex):
        # Memoised copy of the multicolor surrounding ``vertex``.
        if vertex not in surroundings:
            surroundings[vertex] = Multicolor(*get_vertex_surrounding_multicolor(graph, vertex).colors)
        return surroundings[vertex]

    def irregular_of(vertex):
        # Memoised copy of the full irregular multicolor of ``vertex``.
        if vertex not in full_irregular_multicolors:
            full_irregular_multicolors[vertex] = Multicolor(*get_full_irregular_multicolor(graph, vertex).colors)
        return full_irregular_multicolors[vertex]

    def split_score(multicolor):
        # Memoised number of parts ``multicolor`` splits into under the tree guidance.
        key = multicolor.hashable_representation
        if key not in multicolors_scores:
            multicolors_scores[key] = len(Multicolor.split_colors(
                multicolor, guidance=guidance, account_for_color_multiplicity_in_guidance=False))
        return multicolors_scores[key]

    surrounding1 = surrounding_of(v1)
    im1 = irregular_of(v1)
    surrounding2 = surrounding_of(v2)
    # Use v2's own irregular multicolor (v1's was previously read here by mistake).
    im2 = irregular_of(v2)

    c_1_multicolor = full_multicolor - surrounding1
    c_2_multicolor = full_multicolor - surrounding2
    c = c_1_multicolor.intersect(c_2_multicolor)
    c_a = c_1_multicolor - c
    c_b = c_2_multicolor - c

    sedge = graph.get_edge_by_two_vertices(v1, v2)
    # Work on a copy so the in-place subtraction below never touches the graph's edge.
    smulticolor = Multicolor(*sedge.multicolor.colors) if sedge is not None else Multicolor()
    if target_multicolor <= smulticolor:
        smulticolor -= target_multicolor

    before = split_score(im1 + c_a) + split_score(im2 + c_b) + split_score(smulticolor + c)
    after = (split_score(im1 - target_multicolor + c_a)
             + split_score(im2 - target_multicolor + c_b)
             + split_score(smulticolor + target_multicolor + c))
    return before, after
grimm_formatted_genomes[genome][scaffold_name]), "$", file=target) # print() bg_graphs = dict() for file_name in os.listdir(target_directory): file_name = os.path.join(target_directory, file_name) with open(file_name, "r") as source: bg = GRIMMReader.get_breakpoint_graph(source) bg_graphs[file_name.split(".")[0]] = bg bg = BreakpointGraph() for br_gr in bg_graphs.values(): bg.update(br_gr, merge_edges=True) target_multicolor = Multicolor("Anguilla_japonica") print("Breakpoint graph stats:") print( "\t", "non-infinity nodes count:", len( list(node for node in bg.nodes() if not BGVertex.is_infinity_vertex(node)))) normal_edges, infinity_edges = [], [] for edge in bg.edges(): if edge.is_infinity_edge: infinity_edges.append(edge) else: normal_edges.append(edge) print("\t", "non-infinity edges count:", len(normal_edges)) print("\t", "infinity edges count:", len(infinity_edges))