Esempio n. 1
0
    def run(self, manager):
        """Read all gos-asm inputs and store them in ``manager.data["gos-asm"]``.

        Everything is configured under ``manager.configuration["gos-asm"]["input"]``.
        The following entries of ``manager.data["gos-asm"]`` are written, in order:

        * ``"bg"`` -- one breakpoint graph updated (with ``merge_edges=False``)
          from every file listed in ``block_orders_file_paths``;
        * ``"phylogenetic_tree"`` -- the tree parsed from the ``phylogenetic_tree``
          string;
        * ``"target_multicolor"`` -- a single multicolor built from all
          ``target_organisms``;
        * ``"target_multicolors"`` -- the deduplicated list of candidate target
          multicolors, largest ``hashable_representation`` first;
        * ``"repeats_guidance"`` -- the result of ``get_repeats_bridges_guidance``
          for ``repeats_bridges_file``.

        :param manager: object exposing ``logger``, ``configuration`` and ``data``.
        """
        def representation_size(multicolor):
            # Shared sort key: multicolors are ordered by the length of their
            # hashable representation.
            return len(multicolor.hashable_representation)

        manager.logger.info("Reading blocks orders data")
        input_config = manager.configuration["gos-asm"]["input"]
        block_order_paths = input_config["block_orders_file_paths"]
        merged_graph = BreakpointGraph()
        for block_order_path in block_order_paths:
            with open(block_order_path, "rt") as source:
                partial_graph = GRIMMReader.get_breakpoint_graph(stream=source, merge_edges=False)
                merged_graph.update(breakpoint_graph=partial_graph, merge_edges=False)
        manager.data["gos-asm"]["bg"] = merged_graph

        manager.logger.info("Reading phylogenetic tree information")
        tree = NewickReader.from_string(data_string=input_config["phylogenetic_tree"])
        manager.data["gos-asm"]["phylogenetic_tree"] = tree

        target_genomes = [BGGenome(genome_name) for genome_name in input_config["target_organisms"]]
        full_target_multicolor = Multicolor(*target_genomes)
        manager.data["gos-asm"]["target_multicolor"] = full_target_multicolor
        consistent_targets = Multicolor.split_colors(
            full_target_multicolor,
            guidance=tree.consistent_multicolors,
            account_for_color_multiplicity_in_guidance=False)

        # Extend the split with every non-empty tree-consistent multicolor that is
        # contained in one of the initial pieces. The outer loop walks a snapshot,
        # while the membership test looks at the list that is being grown.
        for initial_piece in list(consistent_targets):
            for candidate in deepcopy(tree.consistent_multicolors):
                if not candidate <= initial_piece:
                    continue
                if candidate in consistent_targets:
                    continue
                if len(candidate.colors) > 0:
                    consistent_targets.append(candidate)

        consistent_targets = sorted(consistent_targets, key=representation_size, reverse=True)

        # Besides the multicolors themselves, add the sum of every group of two or
        # more multicolors in which no two members share a color.
        all_target_multicolors = list(consistent_targets)
        for group_size in range(2, len(consistent_targets) + 1):
            for group in itertools.combinations(consistent_targets, group_size):
                has_overlap = any(len(first.intersect(second).colors) > 0
                                  for first, second in itertools.combinations(group, 2))
                if has_overlap:
                    continue
                combined = Multicolor()
                for member in group:
                    combined += member
                all_target_multicolors.append(combined)

        # Deduplicate through the hashable representation, then rebuild and order
        # the multicolors.
        unique_representations = {multicolor.hashable_representation
                                  for multicolor in all_target_multicolors}
        all_target_multicolors = sorted(
            [Multicolor(*representation) for representation in unique_representations],
            key=representation_size,
            reverse=True)
        manager.data["gos-asm"]["target_multicolors"] = all_target_multicolors

        manager.logger.info("Reading repeats-bridges information")
        repeats_guidance = get_repeats_bridges_guidance(
            file_name=input_config["repeats_bridges_file"], data=manager.data)
        manager.data["gos-asm"]["repeats_guidance"] = repeats_guidance
def validate_datasets(root_directory):
    """Check the DCJ distances of every block file found under *root_directory*.

    The expected distances are taken from the name of the directory two levels
    above each file. That name must consist of exactly three ``_``-separated
    integers: the first is ignored, the second is the expected distance
    between ``'Left'`` and ``'Right'``, and the third is the expected distance
    for every genome pair in ``PAIRS_TO_CHECK``. Each mismatch is reported
    with ``print``; nothing is returned.

    :param root_directory: directory that is walked recursively.
    """
    leaf_message = 'Leaf - inner node distance differs in file {0}, expected={1}, real={2}'
    inner_message = 'Inner node distance differs in file {0}, expected={1}, real={2}'

    top_directory = path.abspath(root_directory)
    for current_directory, _subdirectories, file_names in walk(top_directory):
        for file_name in file_names:
            block_file_path = path.join(current_directory, file_name)
            # The expectations are encoded in the grandparent directory's name.
            grandparent_directory = path.dirname(path.dirname(block_file_path))
            name_parts = path.basename(grandparent_directory).split('_')
            _, expected_inner, expected_leaf = map(int, name_parts)
            with open(block_file_path) as block_file:
                graph = GRIMMReader.get_breakpoint_graph(block_file)
                for genome_pair in PAIRS_TO_CHECK:
                    actual_leaf = get_dcj_distance_two_genomes(graph, genome_pair)
                    if actual_leaf != expected_leaf:
                        print(leaf_message.format(block_file_path, expected_leaf, actual_leaf))
                actual_inner = get_dcj_distance_two_genomes(graph, ('Left', 'Right'))
                if actual_inner != expected_inner:
                    print(inner_message.format(block_file_path, expected_inner, actual_inner))
Esempio n. 3
0
    def run(self, manager):
        """Run MGRA on the current breakpoint graph and load its guidance graph.

        If no ``executable_path`` is configured under ``"mgra"``, the task is
        skipped. Otherwise the graph stored in ``manager.data["gos-asm"]["bg"]``
        and a JSON configuration are written to ``<output dir>/tmp_mgra``, MGRA
        is executed, and every ``*.gen`` file in ``<tmp_mgra>/output/genomes``
        is read into one breakpoint graph that is stored in
        ``manager.data["mgra"]["guidance_graph"]``.

        MGRA is started directly with an argument list rather than through a
        shell command string, so paths containing spaces or shell
        metacharacters are passed through unchanged. A non-zero exit status is
        logged as an error instead of being reported as a success.

        :param manager: object exposing ``logger``, ``configuration`` and ``data``.
        :raises OSError: if the MGRA executable cannot be started, or if the
            expected output directory cannot be listed afterwards.
        """
        # Function-scope import: the method does not rely on the module's imports.
        import subprocess

        mgra_ex_path = get_from_dict_with_path(manager.configuration, key="executable_path", path=["mgra"])
        manager.logger.info("=" * 80)
        if mgra_ex_path is None:
            manager.logger.info("MGRA executable path is not supplied, skipping the MGRA based tasks")
            return
        manager.logger.info("Preparing data to communicate with MGRA and ontain guidance graph")
        temp_dir = os.path.join(manager.configuration["gos-asm"]["output"]["dir"], "tmp_mgra")
        if not os.path.exists(temp_dir):
            # makedirs also creates the configured output directory when it is
            # missing, where a plain mkdir would fail.
            os.makedirs(temp_dir)
        blocks_file_name = os.path.join(temp_dir, "blocks.txt")
        config_file_name = os.path.join(temp_dir, "config.cfg")
        mgra_output_dir_name = os.path.join(temp_dir, "output/")

        manager.logger.debug("Writing blocks orders in GRIMM format to {file_name}".format(file_name=blocks_file_name))
        GRIMMWriter.print_genomes_as_grimm_blocks_orders(bg=manager.data["gos-asm"]["bg"],
                                                         file_name=blocks_file_name)

        manager.logger.debug("Writing configuration file for MGRA run to {file_name}".format(file_name=config_file_name))
        config = self.create_mgra_config(blocks_file_name=blocks_file_name, manager=manager)
        with open(config_file_name, "wt") as destination:
            json.dump(obj=config, fp=destination)
        manager.logger.info("Running MGRA on prepared configuration")
        # No shell is involved any more, so "~" has to be expanded explicitly.
        mgra_command = [os.path.expanduser(mgra_ex_path),
                        "-c", config_file_name,
                        "-o", mgra_output_dir_name]
        exit_code = subprocess.call(mgra_command)
        if exit_code == 0:
            manager.logger.debug("MGRA has successfully finished")
        else:
            manager.logger.error("MGRA exited with non-zero status {exit_code}".format(exit_code=exit_code))
        manager.logger.info("Reading MGRA produced guidance graph")

        genomes_dir = os.path.join(mgra_output_dir_name, "genomes")
        genome_files = [name for name in os.listdir(genomes_dir) if name.endswith(".gen")]
        full_genomes_paths = [os.path.join(genomes_dir, name) for name in genome_files]
        guidance_bg = BreakpointGraph()
        for file_name in full_genomes_paths:
            with open(file_name, "rt") as source:
                guidance_bg.update(breakpoint_graph=GRIMMReader.get_breakpoint_graph(stream=source, merge_edges=False), merge_edges=False)
        if "mgra" not in manager.data:
            manager.data["mgra"] = {}
        manager.data["mgra"]["guidance_graph"] = guidance_bg
        manager.logger.info("Obtained MGRA produced guidance graph")
def run_metrics_on_block_file(block_path, full_correct_tree_file_name):
    """Compare the metrics of one block file against the correct tree.

    The correct tree is loaded with ``read_correct_tree`` first; the block
    file is then parsed into a breakpoint graph and both are handed to
    ``compare_metric_results``.

    :param block_path: path of the block file to parse.
    :param full_correct_tree_file_name: file name passed to ``read_correct_tree``.
    :return: whatever ``compare_metric_results`` returns.
    """
    reference_tree = read_correct_tree(full_correct_tree_file_name)
    with open(block_path) as blocks_source:
        graph = GRIMMReader.get_breakpoint_graph(blocks_source)
        comparison = compare_metric_results(graph, reference_tree)
    return comparison