Esempio n. 1
0
    def test_get_list_of_edges_no_repeat_blocks(self):
        # The adjacencies derived from a fragment depend on the fragment type:
        #   * circular ("@"): one extra adjacency joins the two outermost vertices
        #   * linear ("$"): an infinity vertex is attached to each end of the vertices list
        def check(fragment, expected_edges):
            # edges are compared as multisets, so their order does not matter
            produced_edges = GRIMMReader.get_edges_from_parsed_data(fragment)
            self.assertDictEqual(Counter(produced_edges),
                                 Counter(expected_edges))

        # circular fragment with three blocks
        check(("@", [("+", "a"), ("-", "b"), ("-", "a")]),
              [(TaggedBlockVertex("at"), TaggedBlockVertex("at")),
               (TaggedBlockVertex("ah"), TaggedBlockVertex("bh")),
               (TaggedBlockVertex("bt"), TaggedBlockVertex("ah"))])

        # the same blocks as a linear fragment
        check(("$", [("+", "a"), ("-", "b"), ("-", "a")]),
              [(TaggedInfinityVertex("at"), TaggedBlockVertex("at")),
               (TaggedBlockVertex("ah"), TaggedBlockVertex("bh")),
               (TaggedBlockVertex("bt"), TaggedBlockVertex("ah")),
               (TaggedBlockVertex("at"), TaggedInfinityVertex("at"))])

        # circular fragment made of a single block
        check(("@", [("+", "a")]),
              [(TaggedBlockVertex("ah"), TaggedBlockVertex("at"))])

        # linear fragment with the same block repeated twice in negative orientation
        check(("$", [("-", "a"), ("-", "a")]),
              [(TaggedInfinityVertex("ah"), TaggedBlockVertex("ah")),
               (TaggedBlockVertex("at"), TaggedBlockVertex("ah")),
               (TaggedBlockVertex("at"), TaggedInfinityVertex("at"))])
def grimm_filter_unique_gene(in_file, out_file):
    """Rewrite a GRIMM file keeping only the blocks accepted by ``Unique_Filter``.

    The file is processed in two passes over the same (declaration, data)
    line pairs: the first pass feeds every parsed data line to
    ``Unique_Filter.update_allowed_blocks``, the second pass writes each
    genome declaration followed by its data line filtered through
    ``Unique_Filter.filter_unique``.

    Only the line directly after a genome declaration is treated as data;
    every other line of the input is ignored and not copied to the output.
    The fragment type parsed from the input is discarded: every output data
    line is terminated with "@".

    :param in_file: path of the GRIMM file to read
    :param out_file: path of the filtered GRIMM file to write
    :return: ``flt.allowed_blocks`` converted to a list of ints
    :raises IndexError: if a genome declaration is the last line of the input
    """
    # Read everything up front inside a context manager so the input handle
    # is closed deterministically.
    with open(in_file) as source:
        lines = source.read().split('\n')

    # Locate the declaration lines once; both passes below reuse the result.
    # A line consumed as data (i + 1) is never itself tested as a declaration.
    declaration_indices = []
    i = 0
    while i < len(lines):
        if GRIMMReader.is_genome_declaration_string(lines[i]):
            declaration_indices.append(i)
            i += 2
        else:
            i += 1

    # make unique blocks list
    flt = Unique_Filter()
    for idx in declaration_indices:
        flt.update_allowed_blocks(
            GRIMMReader.parse_data_string(lines[idx + 1])[1])

    # write allowed blocks
    with open(out_file, 'w') as f:
        for idx in declaration_indices:
            parsed = flt.filter_unique(
                GRIMMReader.parse_data_string(lines[idx + 1])[1])

            print(lines[idx], file=f)
            print(' '.join(p[0] + p[1] for p in parsed), '@', file=f)

    return [int(block) for block in flt.allowed_blocks]
Esempio n. 3
0
    def test_get_list_of_edges_repeat_blocks_at_extremities(self):
        # A repeat block is written as "repeat_block_name__repeat".
        # Linear fragment: a repeat block flanking the fragment is not kept as a block;
        #   instead one of its extremity names ("ah" in the cases below) is stored under
        #   the "repeat" tag of the infinity vertex created for that end of the fragment.
        # Circular fragment: repeat blocks stay regular blocks whose vertices carry
        #   the "repeat" tag with the value None.
        def tagged(vertex, tag, value):
            # attach a tag and hand the vertex back so it can be used inline
            vertex.add_tag(tag, value)
            return vertex

        def check(fragment, expected_edges):
            # edges are compared as multisets, so their order does not matter
            produced_edges = GRIMMReader.get_edges_from_parsed_data(fragment)
            self.assertDictEqual(Counter(produced_edges),
                                 Counter(expected_edges))

        # single repeat on the left extremity of linear fragment
        left_iv = tagged(TaggedInfinityVertex("bh"), "repeat", "ah")
        ch_vertex = tagged(TaggedBlockVertex("ch"), "tag", "1")
        ct_vertex = tagged(TaggedBlockVertex("ct"), "tag", "1")
        check(("$", [("+", "a__repeat"), ("-", "b"), ("+", "c__tag:1")]),
              [(left_iv, TaggedBlockVertex("bh")),
               (TaggedBlockVertex("bt"), ct_vertex),
               (ch_vertex, TaggedInfinityVertex("ch"))])

        # both extremities are "flanked" by repeats
        left_iv = tagged(TaggedInfinityVertex("bh"), "repeat", "ah")
        right_iv = tagged(TaggedInfinityVertex("ch"), "repeat", "ah")
        ch_vertex = tagged(TaggedBlockVertex("ch"), "tag", "1:2")
        ct_vertex = tagged(TaggedBlockVertex("ct"), "tag", "1:2")
        check(("$", [("+", "a__repeat"), ("-", "b"), ("+", "c__tag:1:2"),
                     ("-", "a__repeat")]),
              [(left_iv, TaggedBlockVertex("bh")),
               (TaggedBlockVertex("bt"), ct_vertex),
               (ch_vertex, right_iv)])

        # fragment is specified as circular: all repeats are treated as normal blocks
        # whose "repeat" tag has no value
        ch_vertex = tagged(TaggedBlockVertex("ch"), "tag", "1")
        ct_vertex = tagged(TaggedBlockVertex("ct"), "tag", "1")
        ah_vertex = tagged(TaggedBlockVertex("ah"), "repeat", None)
        at_vertex = tagged(TaggedBlockVertex("at"), "repeat", None)
        check(("@", [("+", "a__repeat"), ("-", "b"), ("+", "c__tag:1"),
                     ("-", "a__repeat")]),
              [(ah_vertex, TaggedBlockVertex("bh")),
               (TaggedBlockVertex("bt"), ct_vertex),
               (ch_vertex, ah_vertex),
               (at_vertex, at_vertex)])
Esempio n. 4
0
def get_block_neighbours(grimm_file):
    """Collect, for every block occurrence, its neighbouring extremities per genome.

    Only the line directly after a genome declaration is parsed as data, and
    the fragment type returned by the parser is discarded: neighbours are
    always looked up with wrap-around (modulo) indexing.

    Consecutive occurrences of the same block are collapsed into one record.
    Each record is a tuple ``(left, right, tandem_copies, orientations)``:

    * ``left`` / ``right`` -- names of the neighbouring extremities, i.e. the
      neighbour block name followed by "h" or "t"
    * ``tandem_copies`` -- number of consecutive copies in the run
    * ``orientations`` -- tuple with the sign of every copy in the run

    When the first copy of a run is "-", the record is mirrored: neighbours
    are swapped and the orientations are reversed and flipped.

    :param grimm_file: path of the GRIMM file to read
    :return: ``defaultdict`` mapping ``int(block)`` -> genome name -> list of records
    :raises IndexError: if a genome declaration is the last line of the file
    """
    block_neighbours = defaultdict(lambda: defaultdict(list))
    with open(grimm_file) as f:
        ls = f.readlines()

    i = 0
    while i < len(ls):
        line = ls[i]
        if not GRIMMReader.is_genome_declaration_string(line):
            i += 1
            continue

        genome = GRIMMReader.parse_genome_declaration_string(line)
        data_line = ls[i + 1]
        bs = GRIMMReader.parse_data_string(data_line)[1]

        n = len(bs)
        j = 0

        # j points at the block *before* the one being described
        while j < n:
            tandem_copies = 1
            prev_or, prev_block = bs[j % n]
            _, curr_block = bs[(j + 1) % n]
            next_or, next_block = bs[(j + 2) % n]

            # current block continues a run of its predecessor: nothing to record
            if curr_block == prev_block:
                j += 1
                continue

            # extend the run while the following block is another copy
            while curr_block == next_block:
                j += 1
                tandem_copies += 1
                next_or, next_block = bs[(j + 2) % n]

            neighbours = (prev_block + ('h' if prev_or == '+' else 't'),
                          next_block + ('t' if next_or == '+' else 'h'))

            orientations = tuple(bs[(k + 1) % n][0]
                                 for k in range(j - tandem_copies + 1, j + 1))

            if orientations[0] == '-':
                # mirror the record: swap the sides and read the run backwards
                # with every sign flipped (the original expression produced
                # '+' in both branches, so "+" copies were never flipped)
                neighbours = (neighbours[1], neighbours[0])
                orientations = tuple('+' if or_ == '-' else '-'
                                     for or_ in orientations[::-1])

            block_neighbours[int(curr_block)][genome.name].append(
                (*neighbours, tandem_copies, orientations))

            j += 1

        i += 2

    return block_neighbours
Esempio n. 5
0
 def test_parse_comment_data_string_top_level(self):
     # a "key=value" entry given right after "data ::" has an empty path,
     # whatever the spacing around the separators is
     for comment_data_string in ("# data :: key=value", "#data::key = value"):
         path, (key, value) = GRIMMReader.parse_comment_data_string(
             comment_data_string=comment_data_string)
         self.assertListEqual(path, [])
         self.assertEqual(key, "key")
         self.assertEqual(value, "value")
Esempio n. 6
0
 def test_parse_comment_data_string_no_value(self):
     # nothing after "=" is parsed as an empty string value; path and key are still extracted
     cases = [
         ("#data:: entry1 : entry2: key = ", ["entry1", "entry2"]),
         ("#data:: key = ", []),
     ]
     for comment_data_string, expected_path in cases:
         path, (key, value) = GRIMMReader.parse_comment_data_string(
             comment_data_string=comment_data_string)
         self.assertListEqual(path, expected_path)
         self.assertEqual(key, "key")
         self.assertEqual(value, "")
Esempio n. 7
0
 def test_parse_comment_data_string_no_key_value(self):
     # with neither key nor value present, both come back as empty strings and the path is empty
     for comment_data_string in ("#data:: ", "#data:: = "):
         path, (key, value) = GRIMMReader.parse_comment_data_string(
             comment_data_string=comment_data_string)
         self.assertListEqual(path, [])
         self.assertEqual(key, "")
         self.assertEqual(value, "")
Esempio n. 8
0
    def test_parse_data_string_correct(self):
        # a data string is parsed into the fragment type and the ordered list of
        # (orientation, gene) pairs; processing is purely string based
        # a gene without an explicit sign gets the positive orientation
        for terminator in ("$", "@"):
            result = GRIMMReader.parse_data_string("a " + terminator)
            self.assertEqual(result[0], terminator)
            self.assertEqual(result[1][0][0], "+")
            self.assertEqual(result[1][0][1], "a")
            self.assertEqual(len(result[0]), 1)
            self.assertEqual(len(result[1]), 1)

        # everything after the first fragment terminator is ignored,
        # as is the surrounding whitespace
        reference_genes = ["a", "b", "c", "d"]
        reference_signs = ["+", "-", "+", "-"]
        multi_gene_cases = [
            ("     a -b c -d @ e f     ", "@"),
            ("     a -b +c -d $ e f     ", "$"),
            ("     a -b c -d @ e f $ g -h    ", "@"),
        ]
        for data_string, expected_type in multi_gene_cases:
            result = GRIMMReader.parse_data_string(data_string)
            self.assertEqual(result[0], expected_type)
            self.assertListEqual([gene[1] for gene in result[1]],
                                 reference_genes)
            self.assertListEqual([gene[0] for gene in result[1]],
                                 reference_signs)
Esempio n. 9
0
def get_characters(grimm_file, genomes, logger):
    """Build the list of characters from the breakpoint graph of a GRIMM file.

    Connected components with exactly two nodes are skipped. For every edge
    of the remaining components ``get_character_by_edge`` is evaluated, and
    the edge is dropped when ``white_proportion`` of its genome colors is
    below 0.5.

    :param grimm_file: path of the GRIMM file to read
    :param genomes: forwarded unchanged to ``get_character_by_edge``
    :param logger: logger used for progress messages
    :return: list of ``(v1, v2, genome_colors, labels)`` tuples, where
        ``v1 <= v2`` are the names of the edge's vertices
    """
    # Parse inside a context manager so the file handle is closed as soon as
    # the graph is built (the original never closed it).
    with open(grimm_file) as source:
        bg = GRIMMReader.get_breakpoint_graph(source)
    logger.info('Breakpoint graph parsed')

    logger.info(f'Edges in breakpoint graph: {len(list(bg.edges()))}')

    characters = []

    for component_bg in bg.connected_components_subgraphs():
        nodes_len = len(list(component_bg.nodes()))
        if nodes_len == 2:
            continue

        logger.info(
            f'Getting characters from breakpoint graph component, size={len(component_bg.bg)}'
        )

        neighbour_index = construct_vertex_genome_index(component_bg)

        for edge in component_bg.edges():
            # order the vertex names so that the pair is reported canonically
            v1, v2 = edge.vertex1.name, edge.vertex2.name
            if v1 > v2:
                v1, v2 = v2, v1

            genome_colors, neighbour_edges = get_character_by_edge(
                component_bg, edge, genomes, neighbour_index)
            if white_proportion(genome_colors.values()) < 0.5:
                continue

            labels = ['edge exists', 'parallel edge doesn\'t exist'] + [
                f'inversion {v1}-{v2}-{v1n}-{v2n}'
                for (v1n, v2n) in neighbour_edges
            ]

            characters.append((v1, v2, genome_colors, labels))

    return characters
def get_genomes_contain_blocks_grimm(grimm_file):
    """Summarise which blocks occur in which genomes of a GRIMM file.

    The file is read as strict pairs of lines: every even-indexed line is
    parsed as a genome declaration and the line after it as its data string.

    :param grimm_file: path of the GRIMM file to read
    :return: ``(genomes, blocks, block_genome_count)`` -- the sorted genome
        names, the sorted block ids (as ints), and a mapping
        block id -> ``Counter`` of occurrences per genome name
    """
    with open(grimm_file) as grimm_stream:
        grimm_lines = grimm_stream.readlines()

    genome_names = set()
    block_ids = set()
    block_genome_count = defaultdict(Counter)

    for declaration_idx in range(0, len(grimm_lines), 2):
        genome_name = GRIMMReader.parse_genome_declaration_string(
            grimm_lines[declaration_idx]).name
        fragment_blocks = GRIMMReader.parse_data_string(
            grimm_lines[declaration_idx + 1])[1]
        genome_names.add(genome_name)
        for _, block in fragment_blocks:
            block_id = int(block)
            block_ids.add(block_id)
            block_genome_count[block_id][genome_name] += 1

    return sorted(genome_names), sorted(block_ids), block_genome_count
Esempio n. 11
0
    def run(self, manager):
        """Run MGRA on the current breakpoint graph and load its guidance graph.

        Steps: write the graph from ``manager.data["gos-asm"]["bg"]`` as GRIMM
        block orders and a JSON configuration into a ``tmp_mgra`` directory
        under the configured output dir, invoke the MGRA executable, then read
        every ``*.gen`` file from MGRA's ``genomes`` output directory into one
        ``BreakpointGraph`` stored at ``manager.data["mgra"]["guidance_graph"]``.

        Nothing is done when no MGRA executable path is configured.

        :param manager: object exposing ``configuration``, ``data`` and ``logger``
        """
        mgra_ex_path = get_from_dict_with_path(manager.configuration,
                                               key="executable_path",
                                               path=["mgra"])
        manager.logger.info("=" * 80)
        if mgra_ex_path is None:
            manager.logger.info(
                "MGRA executable path is not supplied, skipping the MGRA based tasks"
            )
            return
        manager.logger.info(
            "Preparing data to communicate with MGRA and ontain guidance graph"
        )
        temp_dir = os.path.join(
            manager.configuration["gos-asm"]["output"]["dir"], "tmp_mgra")
        # exist_ok avoids the check-then-create race of exists() + mkdir()
        os.makedirs(temp_dir, exist_ok=True)
        blocks_file_name = os.path.join(temp_dir, "blocks.txt")
        config_file_name = os.path.join(temp_dir, "config.cfg")
        mgra_output_dir_name = os.path.join(temp_dir, "output/")

        manager.logger.debug(
            "Writing blocks orders in GRIMM format to {file_name}".format(
                file_name=blocks_file_name))
        GRIMMWriter.print_genomes_as_grimm_blocks_orders(
            bg=manager.data["gos-asm"]["bg"], file_name=blocks_file_name)

        manager.logger.debug(
            "Writing configuration file for MGRA run to {file_name}".format(
                file_name=config_file_name))
        config = self.create_mgra_config(blocks_file_name=blocks_file_name,
                                         manager=manager)
        with open(config_file_name, "wt") as destination:
            json.dump(obj=config, fp=destination)
        manager.logger.info("Running MGRA on prepared configuration")
        # NOTE(review): the command is a shell string built from configuration
        # values; paths containing spaces or shell metacharacters will break it.
        exit_status = os.system(
            "{mgra_ex_path} -c {config_file_path} -o {output_dir_path}"
            "".format(mgra_ex_path=mgra_ex_path,
                      config_file_path=config_file_name,
                      output_dir_path=mgra_output_dir_name))
        # Only report success when the command actually succeeded; processing
        # still continues afterwards, exactly as before.
        if exit_status == 0:
            manager.logger.debug("MGRA has successfully finished")
        else:
            manager.logger.error(
                "MGRA exited with a non-zero status {status}".format(
                    status=exit_status))
        manager.logger.info("Reading MGRA produced guidance graph")

        genomes_dir = os.path.join(mgra_output_dir_name, "genomes")
        full_genomes_paths = [
            os.path.join(genomes_dir, name)
            for name in os.listdir(genomes_dir) if name.endswith(".gen")
        ]
        guidance_bg = BreakpointGraph()
        for file_name in full_genomes_paths:
            with open(file_name, "rt") as source:
                guidance_bg.update(
                    breakpoint_graph=GRIMMReader.get_breakpoint_graph(
                        stream=source, merge_edges=False),
                    merge_edges=False)
        if "mgra" not in manager.data:
            manager.data["mgra"] = {}
        manager.data["mgra"]["guidance_graph"] = guidance_bg
        manager.logger.info("Obtained MGRA produced guidance graph")
Esempio n. 12
0
 def test_parse_genome_declaration_string(self):
     # the genome name is whatever follows the first ">" once the surrounding
     # whitespace is stripped; later ">" and punctuation characters belong to the name
     cases = [
         (">genome", "genome"),
         ("  >genome  ", "genome"),
         (">genome__genome", "genome__genome"),
         (">genome>genome", "genome>genome"),
         (">genome.!/.#4", "genome.!/.#4"),
     ]
     for declaration, expected_name in cases:
         self.assertEqual(
             GRIMMReader.parse_genome_declaration_string(declaration),
             BGGenome(expected_name))
Esempio n. 13
0
 def test_parse_comment_data_string(self):
     # entries between "data ::" and the key form the path, in order;
     # the last ":"-separated entry is the "key = value" pair
     cases = [
         ("# data :: fragment : name = scaffold1",
          ["fragment"], "scaffold1"),
         ("# data :: fragment : origin: name = ALLPATHS-LG",
          ["fragment", "origin"], "ALLPATHS-LG"),
         ("# data :: genome : origin: name = ALLPATHS-LG",
          ["genome", "origin"], "ALLPATHS-LG"),
     ]
     for data_string, expected_path, expected_value in cases:
         path, (key, value) = GRIMMReader.parse_comment_data_string(
             comment_data_string=data_string)
         self.assertEqual(path, expected_path)
         self.assertEqual(key, "name")
         self.assertEqual(value, expected_value)
Esempio n. 14
0
 def test_is_comment_string(self):
     # a string is considered a comment if its first non-whitespace character is "#"
     comment_strings = [
         "#", "     #", "#    ", "     #    ", "#  aaa  ", "    ##  ",
     ]
     for candidate in comment_strings:
         self.assertTrue(GRIMMReader.is_comment_string(candidate))
     self.assertFalse(GRIMMReader.is_comment_string("a# "))
Esempio n. 15
0
 def test_parse_data_string_error(self):
     # a data string must contain a fragment termination symbol ($ or @)
     # preceded by space separated gene order information; anything else is rejected
     malformed_data_strings = [
         "   a b c d e    ",
         "",
         " a -b -c d -e ",
         "$",
         "@",
         "@ a d s d",
         "$a d s d",
         "$-a d s d",
         "@+a d s d",
         "a b - -c d e $",
     ]
     for data_string in malformed_data_strings:
         with self.assertRaises(ValueError):
             GRIMMReader.parse_data_string(data_string)
Esempio n. 16
0
    def run(self, manager):
        """Load the gos-asm inputs and store them under ``manager.data["gos-asm"]``.

        Populates the keys ``bg`` (breakpoint graph merged from all block
        order files), ``phylogenetic_tree``, ``target_multicolor``,
        ``target_multicolors`` and ``repeats_guidance``. All inputs are taken
        from ``manager.configuration["gos-asm"]["input"]``.

        :param manager: object exposing ``configuration``, ``data`` and ``logger``
        """
        manager.logger.info("Reading blocks orders data")
        file_paths = manager.configuration["gos-asm"]["input"]["block_orders_file_paths"]
        # every file is parsed separately and accumulated into one graph, without merging edges
        bg = BreakpointGraph()
        for file_path in file_paths:
            with open(file_path, "rt") as source:
                bg.update(breakpoint_graph=GRIMMReader.get_breakpoint_graph(stream=source, merge_edges=False), merge_edges=False)
        manager.data["gos-asm"]["bg"] = bg

        manager.logger.info("Reading phylogenetic tree information")
        tree = BGTree(newick=manager.configuration["gos-asm"]["input"]["phylogenetic_tree"])
        manager.data["gos-asm"]["phylogenetic_tree"] = tree

        # multicolor made of all target organisms, then split using the tree's vtree-consistent multicolors as guidance
        full_tmc = Multicolor(*[BGGenome(genome_name) for genome_name in manager.configuration["gos-asm"]["input"]["target_organisms"]])
        manager.data["gos-asm"]["target_multicolor"] = full_tmc
        vtree_consistent_target_multicolors = Multicolor.split_colors(full_tmc,
                                                                      guidance=tree.vtree_consistent_multicolors,
                                                                      account_for_color_multiplicity_in_guidance=False)

        # iterate over a snapshot ([:]) because the list itself is extended inside the loop:
        # every non-empty tree multicolor that is "<=" a target multicolor and not yet listed is added
        for target_multicolor in vtree_consistent_target_multicolors[:]:
            for vtree_c_multicolor in deepcopy(tree.vtree_consistent_multicolors):
                if vtree_c_multicolor <= target_multicolor \
                        and vtree_c_multicolor not in vtree_consistent_target_multicolors \
                        and len(vtree_c_multicolor.colors) > 0:
                    vtree_consistent_target_multicolors.append(vtree_c_multicolor)

        # longest hashable representation first
        vtree_consistent_target_multicolors = sorted(vtree_consistent_target_multicolors,
                                                     key=lambda mc: len(mc.hashable_representation),
                                                     reverse=True)

        all_target_multicolors = vtree_consistent_target_multicolors[:]
        # for i in range(2, len(vtree_consistent_target_multicolors) + 1):
        #     for comb in itertools.combinations(vtree_consistent_target_multicolors[:], i):
        #         comb = list(comb)
        #         for mc1, mc2 in itertools.combinations(comb, 2):
        #             if len(mc1.intersect(mc2).colors) > 0:
        #                 break
        #         else:
        #             new_mc = Multicolor()
        #             for mc in comb:
        #                 new_mc += mc
        #             all_target_multicolors.append(new_mc)
        # deduplicate through the set of hashable representations, rebuild the multicolors
        # and sort again, since iterating a set does not keep the previous order
        hashed_vertex_tree_consistent_multicolors = {mc.hashable_representation for mc in all_target_multicolors}
        all_target_multicolors = [Multicolor(*hashed_multicolor) for hashed_multicolor in
                                  hashed_vertex_tree_consistent_multicolors]
        all_target_multicolors = sorted(all_target_multicolors,
                                        key=lambda mc: len(mc.hashable_representation),
                                        reverse=True)
        manager.data["gos-asm"]["target_multicolors"] = all_target_multicolors
        # log_bg_stats(bg=bg, logger=manager.logger)

        manager.logger.info("Reading repeats-bridges information")
        manager.data["gos-asm"]["repeats_guidance"] = get_repeats_bridges_guidance(
            file_name=manager.configuration["gos-asm"]["input"]["repeats_bridges_file"], data=manager.data)
Esempio n. 17
0
 def test_get_breakpoint_from_file_with_comment_data_string(self):
     # "# data :: ..." comment lines attach fragment information to the edges
     # created from the data strings of the same genome
     grimm_lines = [
         "", "\t", "#comment1", ">genome_name_1", "      #comment1",
         "# data :: fragment : name = chromosome_X", "a b $",
         "   #comment1   ", "\t>genome_name_2",
         "#data::fragment:name=scaffold111", "a $", "", "\n\t"
     ]
     result_bg = GRIMMReader.get_breakpoint_graph(
         io.StringIO("\n".join(grimm_lines)), merge_edges=False)

     # overall shape of the graph
     self.assertIsInstance(result_bg, BreakpointGraph)
     self.assertEqual(
         len(list(result_bg.connected_components_subgraphs())), 3)
     self.assertEqual(len(list(result_bg.edges())), 5)
     self.assertEqual(len(list(result_bg.nodes())), 7)

     # with unmerged edges every edge carries a single genome color,
     # while a condensed edge may also carry both
     genome_1, genome_2 = BGGenome("genome_name_1"), BGGenome("genome_name_2")
     multicolors = [Multicolor(genome_1), Multicolor(genome_2)]
     condensed_multicolors = [
         Multicolor(BGGenome("genome_name_1")),
         Multicolor(BGGenome("genome_name_2")),
         Multicolor(BGGenome("genome_name_1"), BGGenome("genome_name_2"))
     ]
     for bgedge in result_bg.edges():
         self.assertIn(bgedge.multicolor, multicolors)
     for bgedge in result_bg.edges():
         condensed_edge = result_bg.get_condensed_edge(
             vertex1=bgedge.vertex1, vertex2=bgedge.vertex2)
         self.assertIn(condensed_edge.multicolor, condensed_multicolors)

     infinity_edge_count = sum(
         1 for bgedge in result_bg.edges() if bgedge.is_infinity_edge)
     self.assertEqual(infinity_edge_count, 4)

     # every edge knows the name of the fragment it came from
     for bgedge in result_bg.edges():
         edge_data = bgedge.data
         self.assertIn("fragment", edge_data)
         self.assertIsInstance(edge_data["fragment"], dict)
         self.assertIn("name", edge_data["fragment"])
         self.assertIn(edge_data["fragment"]["name"],
                       {"chromosome_X", "scaffold111"})

     # the forward orientation is recorded for regular and infinity edges alike
     ah = result_bg.get_vertex_by_name("ah")
     bt = result_bg.get_vertex_by_name("bt")
     ahi = result_bg.get_vertex_by_name("ah__infinity")
     regular_edge = result_bg.get_edge_by_two_vertices(vertex1=ah,
                                                       vertex2=bt)
     self.assertTupleEqual(
         regular_edge.data["fragment"]["forward_orientation"], (ah, bt))
     infinity_edge = result_bg.get_edge_by_two_vertices(vertex1=ah,
                                                        vertex2=ahi)
     self.assertTupleEqual(
         infinity_edge.data["fragment"]["forward_orientation"], (ah, ahi))
Esempio n. 18
0
    def test_output_genomes_as_grimm(self):
        # round trip: write the four-genome graph as GRIMM block orders, read it back
        # and compare nodes, edge count and the overall set of colors
        import tempfile

        self._populate_four_genomes_bg(merge_edges=True)
        # A private temporary directory replaces the fixed "file_name.txt" in the
        # current directory: the test cannot clobber an existing file, and the output
        # is removed even when the writer itself fails (the write used to happen
        # before the try/finally that did the cleanup).
        with tempfile.TemporaryDirectory() as temp_dir:
            file_name = os.path.join(temp_dir, "file_name.txt")
            GRIMMWriter.print_genomes_as_grimm_blocks_orders(
                bg=self.four_genome_bg, file_name=file_name)
            with open(file_name, "rt") as source:
                new_bg = GRIMMReader.get_breakpoint_graph(stream=source,
                                                          merge_edges=True)
                self.assertEqual(len(list(new_bg.nodes())),
                                 len(list(self.four_genome_bg.nodes())))
                self.assertEqual(len(list(new_bg.edges())),
                                 len(list(self.four_genome_bg.edges())))

                self.assertSetEqual(set(new_bg.nodes()),
                                    set(self.four_genome_bg.nodes()))
                self.assertSetEqual(
                    new_bg.get_overall_set_of_colors(),
                    self.four_genome_bg.get_overall_set_of_colors())
Esempio n. 19
0
    ch.setLevel(args.c_logging_level)
    logger.setLevel(args.c_logging_level)
    logger.addHandler(ch)
    logger.info(full_description)
    logger.info(parser.format_values())
    ch.setFormatter(logging.Formatter(args.c_logging_formatter_entry))
    logger.info("Starting the converting process")

    genomes = defaultdict(list)
    for file_name in args.grimm:
        logger.info("Processing file \"{file_name}\"".format(file_name=file_name))
        with open(file_name, "rt") as source:
            current_genome = None
            for line in source:
                line = line.strip()
                if len(line) == 0 or GRIMMReader.is_comment_string(data_string=line):
                    continue
                if GRIMMReader.is_genome_declaration_string(data_string=line):
                    current_genome = GRIMMReader.parse_genome_declaration_string(data_string=line).name
                    if args.trim_names:
                        current_genome = current_genome.split(args.trimmer_char, 1)[0]
                elif current_genome is not None:
                    current_chromosome = []
                    chr_type, blocks = GRIMMReader.parse_data_string(data_string=line)
                    genomes[current_genome].append((chr_type, blocks))
    if args.good_genomes != "":
        good_genomes = args.good_genomes.split(",")
        if args.trim_names:
            good_genomes = [genome_name.split(args.trimmer_char, 1)[0] for genome_name in good_genomes]
        for genome_name in list(genomes.keys()):
            if genome_name not in good_genomes:
Esempio n. 20
0
 def test_is_genome_declaration_string(self):
     # a string is a genome declaration if its first non-whitespace character is ">"
     # and a genome name follows it; an empty genome name is forbidden
     declarations = [
         ">genome",
         "    >genome",
         "  \t  >genome",
         ">genome   \t",
         ">genome   ",
         "   >genome   ",
     ]
     non_declarations = [
         "\tt   >genome",
         "  t\t>genome",
         "  t>genome",
         "genome",
         ">",
         "     >   ",
         "     >",
         ">   ",
     ]
     for data_string in declarations:
         self.assertTrue(
             GRIMMReader.is_genome_declaration_string(data_string))
     for data_string in non_declarations:
         self.assertFalse(
             GRIMMReader.is_genome_declaration_string(data_string))
Esempio n. 21
0
 def _populate_bg(data, merge_edges=False):
     # join the given lines into an in-memory GRIMM stream and parse it into a graph
     grimm_stream = io.StringIO("\n".join(data))
     return GRIMMReader.get_breakpoint_graph(grimm_stream,
                                             merge_edges=merge_edges)
Esempio n. 22
0
    def test_get_breakpoint_from_file(self):
        # full workflow test on dummy data: the input is valid GRIMM, but mixed with
        # odd indentation, comments and blank lines
        def parse(grimm_lines):
            return GRIMMReader.get_breakpoint_graph(
                io.StringIO("\n".join(grimm_lines)))

        def infinity_edges_of(graph):
            return [
                bgedge for bgedge in graph.edges() if bgedge.is_infinity_edge
            ]

        # two genomes, one of them with a linear and a circular fragment
        result_bg = parse([
            "", "\t", "#comment1", ">genome_name_1", "      #comment1",
            "a b $", "\tc -a @\t", "   #comment1   ", "\t>genome_name_2",
            "a $", "", "\n\t"
        ])
        self.assertIsInstance(result_bg, BreakpointGraph)
        self.assertEqual(
            len(list(result_bg.connected_components_subgraphs())), 3)
        self.assertEqual(len(list(result_bg.edges())), 6)
        self.assertEqual(len(list(result_bg.nodes())), 9)
        multicolors = [
            Multicolor(BGGenome("genome_name_1"), BGGenome("genome_name_2")),
            Multicolor(BGGenome("genome_name_1")),
            Multicolor(BGGenome("genome_name_2"))
        ]
        for bgedge in result_bg.edges():
            self.assertIn(bgedge.multicolor, multicolors)
        self.assertEqual(len(infinity_edges_of(result_bg)), 3)

        # five genomes sharing blocks, with comments between and inside genomes
        result_bg = parse([
            ">genome_1", "a $", ">genome_2", "a b $", "# this is a bad genome",
            ">genome_3", "a b c $", ">genome_4", "   # chromosome 1", "b c $",
            ">genome_5", "c $"
        ])
        self.assertIsInstance(result_bg, BreakpointGraph)
        self.assertEqual(
            len(list(result_bg.connected_components_subgraphs())), 4)
        self.assertEqual(len(list(result_bg.edges())), 8)
        self.assertEqual(len(list(result_bg.nodes())), 12)
        genome1 = BGGenome("genome_1")
        genome2 = BGGenome("genome_2")
        genome3 = BGGenome("genome_3")
        genome4 = BGGenome("genome_4")
        genome5 = BGGenome("genome_5")
        multicolors = [
            Multicolor(genome1, genome2, genome3),
            Multicolor(genome1),
            Multicolor(genome2, genome3),
            Multicolor(genome2),
            Multicolor(genome3, genome4),
            Multicolor(genome3, genome4, genome5),
            Multicolor(genome4),
            Multicolor(genome5)
        ]
        for bgedge in result_bg.edges():
            self.assertIn(bgedge.multicolor, multicolors)
        infinity_edges = infinity_edges_of(result_bg)
        self.assertEqual(len(infinity_edges), 6)
        # infinity edges never carry one of the multicolors whose "multicolors" has length 2
        infinity_multicolors = [
            multicolor for multicolor in multicolors
            if len(multicolor.multicolors) != 2
        ]
        for bgedge in infinity_edges:
            self.assertIn(bgedge.multicolor, infinity_multicolors)
Esempio n. 23
0
    logger.setLevel(args.c_logging_level)
    logger.addHandler(ch)
    logger.info(full_description)
    logger.info(parser.format_values())
    ch.setFormatter(logging.Formatter(args.c_logging_formatter_entry))
    logger.info("Starting the converting process")

    genomes = defaultdict(list)
    for file_name in args.grimm:
        logger.info(
            "Processing file \"{file_name}\"".format(file_name=file_name))
        with open(file_name, "rt") as source:
            current_genome = None
            for line in source:
                line = line.strip()
                if len(line) == 0 or GRIMMReader.is_comment_string(
                        data_string=line):
                    continue
                if GRIMMReader.is_genome_declaration_string(data_string=line):
                    current_genome = GRIMMReader.parse_genome_declaration_string(
                        data_string=line).name
                    if args.trim_names:
                        current_genome = current_genome.split(
                            args.trimmer_char, 1)[0]
                elif current_genome is not None:
                    current_chromosome = []
                    chr_type, blocks = GRIMMReader.parse_data_string(
                        data_string=line)
                    genomes[current_genome].append((chr_type, blocks))
    if args.good_genomes != "":
        good_genomes = args.good_genomes.split(",")
        if args.trim_names:
Esempio n. 24
0
    def test_is_comment_data_string(self):
        """Check which strings ``GRIMMReader.is_comment_data_string`` accepts.

        The strings in ``accepted`` must be reported as data comments and
        the strings in ``rejected`` must not; each group is checked in the
        order listed.
        """
        accepted = (
            "# data :: ",
            "#data:: ",
            "   #data:: ",
            "   #data  :: ",
            "   #data  :: LALA",
            "   #data  :: LALA : LULU",
            "   #data  :: LALA : LULU=LILI",
        )
        for candidate in accepted:
            self.assertTrue(GRIMMReader.is_comment_data_string(candidate))

        rejected = ("# data", "# data:", "# ldata:", "# datal:")
        for candidate in rejected:
            self.assertFalse(GRIMMReader.is_comment_data_string(candidate))