def test_get_list_of_edges_no_repeat_blocks(self):
    # The adjacencies produced for a fragment depend on the fragment type:
    #   * circular ("@"): an additional adjacency is added between the two
    #     outermost vertices
    #   * linear ("$"): two extremity (infinity) vertices are attached to the
    #     start and to the end of the vertices list
    block, infinity = TaggedBlockVertex, TaggedInfinityVertex
    cases = [
        # circular fragment with several blocks
        (("@", [("+", "a"), ("-", "b"), ("-", "a")]),
         [(block("at"), block("at")),
          (block("ah"), block("bh")),
          (block("bt"), block("ah"))]),
        # linear fragment with several blocks
        (("$", [("+", "a"), ("-", "b"), ("-", "a")]),
         [(infinity("at"), block("at")),
          (block("ah"), block("bh")),
          (block("bt"), block("ah")),
          (block("at"), infinity("at"))]),
        # circular fragment made of a single block
        (("@", [("+", "a")]),
         [(block("ah"), block("at"))]),
        # linear fragment made of two copies of the same reversed block
        (("$", [("-", "a"), ("-", "a")]),
         [(infinity("ah"), block("ah")),
          (block("at"), block("ah")),
          (block("at"), infinity("at"))]),
    ]
    for parsed_data, expected_edges in cases:
        observed_edges = GRIMMReader.get_edges_from_parsed_data(parsed_data)
        # edges are compared as multisets: order is irrelevant, multiplicity is not
        self.assertDictEqual(Counter(observed_edges), Counter(expected_edges))
def _iter_grimm_genome_records(lines):
    """Yield ``(declaration_line, parsed_blocks)`` for each genome declaration in *lines*.

    The line that directly follows a genome declaration is taken as its data
    line and is consumed together with the declaration; every other line is
    skipped.  A declaration on the very last line (no data line after it)
    raises ``IndexError``.
    """
    i = 0
    while i < len(lines):
        line = lines[i]
        if GRIMMReader.is_genome_declaration_string(line):
            yield line, GRIMMReader.parse_data_string(lines[i + 1])[1]
            i += 2
        else:
            i += 1


def grimm_filter_unique_gene(in_file, out_file):
    """Write a filtered copy of the GRIMM file *in_file* to *out_file*.

    Two passes are made over the input: the first feeds every parsed block
    list to a ``Unique_Filter`` (``update_allowed_blocks``), the second writes
    each genome declaration followed by the blocks kept by ``filter_unique``.
    Only declaration/data line pairs are written; all other lines are dropped.

    :param in_file: path of the GRIMM file to read
    :param out_file: path of the filtered GRIMM file to write
    :return: the filter's ``allowed_blocks`` converted to a list of ints
    """
    # read everything up front and close the handle right away
    with open(in_file) as source:
        lines = source.read().split('\n')

    # pass 1: make unique blocks list
    flt = Unique_Filter()
    for _, blocks in _iter_grimm_genome_records(lines):
        flt.update_allowed_blocks(blocks)

    # pass 2: write allowed blocks
    with open(out_file, 'w') as f:
        for declaration, blocks in _iter_grimm_genome_records(lines):
            kept = flt.filter_unique(blocks)
            print(declaration, file=f)
            # NOTE: every fragment is written with the '@' terminator,
            # whatever terminator the input data line carried
            print(' '.join(p[0] + p[1] for p in kept), '@', file=f)

    return list(map(int, flt.allowed_blocks))
def test_get_list_of_edges_repeat_blocks_at_extremities(self):
    # A fragment may start and / or end with a repeat block, written as
    #   repeat_block_name__repeat
    # * if the block is preserved as a block (i.e. located inside the
    #   fragment), it becomes a block "repeat_block_name" carrying an empty
    #   tag: key "repeat", value None
    # * if the block is not preserved (flanking repeat), it is dismissed; its
    #   outermost extremity gives the name (i.e. "repeat_block_name"h) that is
    #   added as a tag to the infinity vertex created for the linear fragment
    # Such information is recorded in the respective infinity vertex when the
    # fragment is linear, and in a normal tagged vertex when it is circular.

    def with_tag(vertex, key, value):
        vertex.add_tag(key, value)
        return vertex

    # single repeat on the left extremity of linear fragment
    observed = GRIMMReader.get_edges_from_parsed_data(
        ("$", [("+", "a__repeat"), ("-", "b"), ("+", "c__tag:1")]))
    left_infinity = with_tag(TaggedInfinityVertex("bh"), "repeat", "ah")
    c_head = with_tag(TaggedBlockVertex("ch"), "tag", "1")
    c_tail = with_tag(TaggedBlockVertex("ct"), "tag", "1")
    expected = [
        (left_infinity, TaggedBlockVertex("bh")),
        (TaggedBlockVertex("bt"), c_tail),
        (c_head, TaggedInfinityVertex("ch")),
    ]
    self.assertDictEqual(Counter(observed), Counter(expected))

    # both extremities are "flanked" by repeats
    observed = GRIMMReader.get_edges_from_parsed_data(
        ("$", [("+", "a__repeat"), ("-", "b"), ("+", "c__tag:1:2"), ("-", "a__repeat")]))
    left_infinity = with_tag(TaggedInfinityVertex("bh"), "repeat", "ah")
    right_infinity = with_tag(TaggedInfinityVertex("ch"), "repeat", "ah")
    c_head = with_tag(TaggedBlockVertex("ch"), "tag", "1:2")
    c_tail = with_tag(TaggedBlockVertex("ct"), "tag", "1:2")
    expected = [
        (left_infinity, TaggedBlockVertex("bh")),
        (TaggedBlockVertex("bt"), c_tail),
        (c_head, right_infinity),
    ]
    self.assertDictEqual(Counter(observed), Counter(expected))

    # fragment is specified as circular, all repeats shall be treated as
    # normal blocks with half empty tags
    observed = GRIMMReader.get_edges_from_parsed_data(
        ("@", [("+", "a__repeat"), ("-", "b"), ("+", "c__tag:1"), ("-", "a__repeat")]))
    c_head = with_tag(TaggedBlockVertex("ch"), "tag", "1")
    c_tail = with_tag(TaggedBlockVertex("ct"), "tag", "1")
    a_head = with_tag(TaggedBlockVertex("ah"), "repeat", None)
    a_tail = with_tag(TaggedBlockVertex("at"), "repeat", None)
    expected = [
        (a_head, TaggedBlockVertex("bh")),
        (TaggedBlockVertex("bt"), c_tail),
        (c_head, a_head),
        (a_tail, a_tail),
    ]
    self.assertDictEqual(Counter(observed), Counter(expected))
def get_block_neighbours(grimm_file):
    """Collect, for every block, its flanking neighbours in each genome.

    Each genome declaration line of *grimm_file* is expected to be directly
    followed by its data line.  The parsed block list is walked with indices
    taken modulo its length, i.e. as if it were cyclic.  Consecutive copies
    of the same block are collapsed into a single tandem run.

    :param grimm_file: path of the GRIMM file to read
    :return: ``defaultdict`` mapping ``int(block)`` -> genome name -> list of
        ``(left_neighbour, right_neighbour, tandem_copies, orientations)``.
        Neighbours are block names suffixed with ``'h'`` / ``'t'``;
        ``orientations`` holds one sign per tandem copy.  When the run starts
        with a ``'-'`` copy it is reported as seen from the opposite strand:
        neighbours are swapped, the signs are reversed and each is flipped.
    """
    block_neighbours = defaultdict(lambda: defaultdict(list))

    with open(grimm_file) as f:
        ls = f.readlines()

    i = 0
    while i < len(ls):
        declaration = ls[i]
        if not GRIMMReader.is_genome_declaration_string(declaration):
            i += 1
            continue

        genome = GRIMMReader.parse_genome_declaration_string(declaration)
        bs = GRIMMReader.parse_data_string(ls[i + 1])[1]
        n = len(bs)

        j = 0
        while j < n:
            tandem_copies = 1
            prev_or, prev_block = bs[j % n]
            _, curr_block = bs[(j + 1) % n]
            next_or, next_block = bs[(j + 2) % n]

            # we are inside a tandem run; it is reported once, from the
            # position where the preceding block differs
            if curr_block == prev_block:
                j += 1
                continue

            # extend the run to the right while the same block repeats
            while curr_block == next_block:
                j += 1
                tandem_copies += 1
                next_or, next_block = bs[(j + 2) % n]

            neighbours = (prev_block + ('h' if prev_or == '+' else 't'),
                          next_block + ('t' if next_or == '+' else 'h'))
            orientations = tuple(bs[(k + 1) % n][0]
                                 for k in range(j - tandem_copies + 1, j + 1))

            if orientations[0] == '-':
                neighbours = (neighbours[1], neighbours[0])
                # reverse the run and flip every sign; the previous expression
                # had identical branches and always produced '+'
                orientations = tuple('+' if or_ == '-' else '-'
                                     for or_ in orientations[::-1])

            block_neighbours[int(curr_block)][genome.name].append(
                (*neighbours, tandem_copies, orientations))
            j += 1

        # the data line was consumed together with the declaration
        i += 2

    return block_neighbours
def test_parse_comment_data_string_top_level(self):
    # a key=value pair given right after the "data ::" marker has an empty
    # path; whitespace around the marker and around "=" does not matter
    for raw_string in ("# data :: key=value", "#data::key = value"):
        path, (key, value) = GRIMMReader.parse_comment_data_string(
            comment_data_string=raw_string)
        self.assertListEqual(path, [])
        self.assertEqual(key, "key")
        self.assertEqual(value, "value")
def test_parse_comment_data_string_no_value(self):
    # nothing after "=" yields an empty string value, with or without a path
    cases = (
        ("#data:: entry1 : entry2: key = ", ["entry1", "entry2"]),
        ("#data:: key = ", []),
    )
    for raw_string, expected_path in cases:
        path, (key, value) = GRIMMReader.parse_comment_data_string(
            comment_data_string=raw_string)
        self.assertListEqual(path, expected_path)
        self.assertEqual(key, "key")
        self.assertEqual(value, "")
def test_parse_comment_data_string_no_key_value(self):
    # with neither key nor value present both come back as empty strings
    # and the path is empty
    for raw_string in ("#data:: ", "#data:: = "):
        path, (key, value) = GRIMMReader.parse_comment_data_string(
            comment_data_string=raw_string)
        self.assertListEqual(path, [])
        self.assertEqual(key, "")
        self.assertEqual(value, "")
def test_parse_data_string_correct(self):
    # A data string is parsed into the fragment type plus the order of genes
    # (blocks), each with its individual orientation; processing is string
    # based.  A block without an explicit orientation is assumed positive.
    parse = GRIMMReader.parse_data_string

    # single block, linear and circular
    for data_string, fragment_type in (("a $", "$"), ("a @", "@")):
        parsed = parse(data_string)
        self.assertEqual(parsed[0], fragment_type)
        self.assertEqual(parsed[1][0][0], "+")
        self.assertEqual(parsed[1][0][1], "a")
        self.assertEqual(len(parsed[0]), 1)
        self.assertEqual(len(parsed[1]), 1)

    # several blocks; anything after the first termination symbol is ignored
    expected_genes = ["a", "b", "c", "d"]
    expected_signs = ["+", "-", "+", "-"]
    multi_block_cases = (
        (" a -b c -d @ e f ", "@"),
        (" a -b +c -d $ e f ", "$"),
        (" a -b c -d @ e f $ g -h ", "@"),
    )
    for data_string, fragment_type in multi_block_cases:
        parsed = parse(data_string)
        self.assertEqual(parsed[0], fragment_type)
        self.assertListEqual([block[1] for block in parsed[1]], expected_genes)
        self.assertListEqual([block[0] for block in parsed[1]], expected_signs)
def get_characters(grimm_file, genomes, logger):
    """Build a list of characters from the breakpoint graph of *grimm_file*.

    The breakpoint graph is split into connected components; components with
    exactly two nodes are skipped.  For every edge of the remaining components
    ``get_character_by_edge`` supplies the per-genome colors and neighbouring
    edges; the edge is dropped when ``white_proportion`` of its colors is
    below 0.5.

    :param grimm_file: path of the GRIMM file to read
    :param genomes: passed through to ``get_character_by_edge``
    :param logger: object with an ``info`` method used for progress messages
    :return: list of ``(v1, v2, genome_colors, labels)`` tuples, where
        ``v1 <= v2`` are the names of the edge's vertices
    """
    # the graph is fully built inside the ``with`` so the handle can be closed
    with open(grimm_file) as source:
        bg = GRIMMReader.get_breakpoint_graph(source)
    logger.info('Breakpoint graph parsed')
    logger.info(f'Edges in breakpoint graph: {len(list(bg.edges()))}')

    characters = []
    for component_bg in bg.connected_components_subgraphs():
        if len(list(component_bg.nodes())) == 2:
            continue

        logger.info(
            f'Getting characters from breakpoint graph component, size={len(component_bg.bg)}'
        )
        neighbour_index = construct_vertex_genome_index(component_bg)

        for edge in component_bg.edges():
            # vertex names are reported in sorted order
            v1, v2 = edge.vertex1.name, edge.vertex2.name
            if v1 > v2:
                v1, v2 = v2, v1

            genome_colors, neighbour_edges = get_character_by_edge(
                component_bg, edge, genomes, neighbour_index)
            if white_proportion(genome_colors.values()) < 0.5:
                continue

            labels = ['edge exists', 'parallel edge doesn\'t exist'] + [
                f'inversion {v1}-{v2}-{v1n}-{v2n}'
                for (v1n, v2n) in neighbour_edges
            ]
            characters.append((v1, v2, genome_colors, labels))

    return characters
def get_genomes_contain_blocks_grimm(grimm_file):
    """Summarise which blocks occur in which genomes of a GRIMM file.

    The file is read as strictly alternating lines: a genome declaration at
    every even index, its data line right after it.

    :param grimm_file: path of the GRIMM file to read
    :return: ``(genomes, blocks, block_genome_count)`` - the sorted genome
        names, the sorted block ids (as ints) and a mapping
        ``block id -> Counter(genome name -> number of occurrences)``
    """
    with open(grimm_file) as source:
        lines = source.readlines()

    genome_names = set()
    block_ids = set()
    block_genome_count = defaultdict(Counter)

    for offset in range(0, len(lines), 2):
        genome_name = GRIMMReader.parse_genome_declaration_string(lines[offset]).name
        parsed_blocks = GRIMMReader.parse_data_string(lines[offset + 1])[1]
        genome_names.add(genome_name)
        for _, block in parsed_blocks:
            block_id = int(block)
            block_ids.add(block_id)
            block_genome_count[block_id][genome_name] += 1

    return sorted(genome_names), sorted(block_ids), block_genome_count
def run(self, manager):
    """Run MGRA on the current breakpoint graph and load its guidance graph.

    Does nothing (besides logging) when the configuration has no
    ``mgra -> executable_path`` entry.  Otherwise the block orders and an
    MGRA configuration are written into ``<output dir>/tmp_mgra``, MGRA is
    executed, and every ``*.gen`` file found in its ``genomes`` output
    directory is merged (without merging edges) into one breakpoint graph,
    stored as ``manager.data["mgra"]["guidance_graph"]``.

    :param manager: provides ``configuration``, ``data`` and ``logger``
    """
    mgra_ex_path = get_from_dict_with_path(manager.configuration,
                                           key="executable_path",
                                           path=["mgra"])
    manager.logger.info("=" * 80)
    if mgra_ex_path is None:
        manager.logger.info(
            "MGRA executable path is not supplied, skipping the MGRA based tasks"
        )
        return

    manager.logger.info(
        "Preparing data to communicate with MGRA and obtain guidance graph"
    )
    temp_dir = os.path.join(
        manager.configuration["gos-asm"]["output"]["dir"], "tmp_mgra")
    # exist_ok avoids the check-then-create race of exists() + mkdir()
    os.makedirs(temp_dir, exist_ok=True)
    blocks_file_name = os.path.join(temp_dir, "blocks.txt")
    config_file_name = os.path.join(temp_dir, "config.cfg")
    mgra_output_dir_name = os.path.join(temp_dir, "output/")

    manager.logger.debug(
        "Writing blocks orders in GRIMM format to {file_name}".format(
            file_name=blocks_file_name))
    GRIMMWriter.print_genomes_as_grimm_blocks_orders(
        bg=manager.data["gos-asm"]["bg"], file_name=blocks_file_name)

    manager.logger.debug(
        "Writing configuration file for MGRA run to {file_name}".format(
            file_name=config_file_name))
    config = self.create_mgra_config(blocks_file_name=blocks_file_name,
                                     manager=manager)
    with open(config_file_name, "wt") as destination:
        json.dump(obj=config, fp=destination)

    manager.logger.info("Running MGRA on prepared configuration")
    # NOTE(review): the command is a shell string built from configured
    # paths; paths with spaces or shell metacharacters will break it
    exit_status = os.system(
        "{mgra_ex_path} -c {config_file_path} -o {output_dir_path}"
        "".format(mgra_ex_path=mgra_ex_path,
                  config_file_path=config_file_name,
                  output_dir_path=mgra_output_dir_name))
    if exit_status == 0:
        manager.logger.debug("MGRA has successfully finished")
    else:
        # do not claim success; reading the output below is still attempted
        manager.logger.warning(
            "MGRA exited with non-zero status {status}".format(
                status=exit_status))

    manager.logger.info("Reading MGRA produced guidance graph")
    genomes_dir = os.path.join(mgra_output_dir_name, "genomes")
    full_genomes_paths = [
        os.path.join(genomes_dir, name)
        for name in os.listdir(genomes_dir)
        if name.endswith(".gen")
    ]
    guidance_bg = BreakpointGraph()
    for file_name in full_genomes_paths:
        with open(file_name, "rt") as source:
            guidance_bg.update(
                breakpoint_graph=GRIMMReader.get_breakpoint_graph(
                    stream=source, merge_edges=False),
                merge_edges=False)

    if "mgra" not in manager.data:
        manager.data["mgra"] = {}
    manager.data["mgra"]["guidance_graph"] = guidance_bg
    manager.logger.info("Obtained MGRA produced guidance graph")
def test_parse_genome_declaration_string(self):
    # a genome declaration string is parsed by stripping it and taking
    # everything after the first ">" character as the genome name
    cases = (
        (">genome", "genome"),
        (" >genome ", "genome"),
        (">genome__genome", "genome__genome"),
        (">genome>genome", "genome>genome"),
        (">genome.!/.#4", "genome.!/.#4"),
    )
    for declaration, expected_name in cases:
        self.assertEqual(
            GRIMMReader.parse_genome_declaration_string(declaration),
            BGGenome(expected_name))
def test_parse_comment_data_string(self):
    # entries between "data ::" and the key form the path; the last entry is
    # split into key and value at "="
    cases = (
        ("# data :: fragment : name = scaffold1",
         ["fragment"], "scaffold1"),
        ("# data :: fragment : origin: name = ALLPATHS-LG",
         ["fragment", "origin"], "ALLPATHS-LG"),
        ("# data :: genome : origin: name = ALLPATHS-LG",
         ["genome", "origin"], "ALLPATHS-LG"),
    )
    for raw_string, expected_path, expected_value in cases:
        path, (key, value) = GRIMMReader.parse_comment_data_string(
            comment_data_string=raw_string)
        self.assertEqual(path, expected_path)
        self.assertEqual(key, "name")
        self.assertEqual(value, expected_value)
def test_is_comment_string(self):
    # a string is considered a comment if its first non-empty char is "#"
    cases = (
        ("#", True),
        (" #", True),
        ("# ", True),
        (" # ", True),
        ("# aaa ", True),
        ("a# ", False),
        (" ## ", True),
    )
    for candidate, is_comment in cases:
        verdict = GRIMMReader.is_comment_string(candidate)
        if is_comment:
            self.assertTrue(verdict)
        else:
            self.assertFalse(verdict)
def test_parse_data_string_error(self):
    # a data string must contain a fragment termination symbol ($ or @) and
    # space separated gene order information before that symbol
    malformed_strings = (
        " a b c d e ",
        "",
        " a -b -c d -e ",
        "$",
        "@",
        "@ a d s d",
        "$a d s d",
        "$-a d s d",
        "@+a d s d",
        "a b - -c d e $",
    )
    for malformed in malformed_strings:
        with self.assertRaises(ValueError):
            GRIMMReader.parse_data_string(malformed)
def run(self, manager):
    """Load the input data of the "gos-asm" task into ``manager.data``.

    Reads, in this order, from ``manager.configuration["gos-asm"]["input"]``:
    the block orders files (merged into one breakpoint graph without merging
    edges), the phylogenetic tree, the target organisms (turned into target
    multicolors) and the repeats-bridges file.  Results are stored under
    ``manager.data["gos-asm"]`` with the keys ``bg``, ``phylogenetic_tree``,
    ``target_multicolor``, ``target_multicolors`` and ``repeats_guidance``.

    :param manager: provides ``configuration``, ``data`` and ``logger``
    """
    def multicolor_size(multicolor):
        return len(multicolor.hashable_representation)

    manager.logger.info("Reading blocks orders data")
    input_config = manager.configuration["gos-asm"]["input"]
    block_orders_paths = input_config["block_orders_file_paths"]
    bg = BreakpointGraph()
    for block_orders_path in block_orders_paths:
        with open(block_orders_path, "rt") as source:
            partial_bg = GRIMMReader.get_breakpoint_graph(stream=source,
                                                          merge_edges=False)
            bg.update(breakpoint_graph=partial_bg, merge_edges=False)
    asm_data = manager.data["gos-asm"]
    asm_data["bg"] = bg

    manager.logger.info("Reading phylogenetic tree information")
    tree = BGTree(newick=input_config["phylogenetic_tree"])
    asm_data["phylogenetic_tree"] = tree

    target_genomes = [BGGenome(genome_name)
                      for genome_name in input_config["target_organisms"]]
    full_tmc = Multicolor(*target_genomes)
    asm_data["target_multicolor"] = full_tmc

    consistent_targets = Multicolor.split_colors(
        full_tmc,
        guidance=tree.vtree_consistent_multicolors,
        account_for_color_multiplicity_in_guidance=False)
    # add every non-empty tree-consistent multicolor contained in one of the
    # initial targets and not present yet; iterate over a snapshot because
    # the list grows while we go
    for target_multicolor in consistent_targets[:]:
        for candidate in deepcopy(tree.vtree_consistent_multicolors):
            if not candidate <= target_multicolor:
                continue
            if candidate in consistent_targets:
                continue
            if len(candidate.colors) > 0:
                consistent_targets.append(candidate)
    consistent_targets = sorted(consistent_targets,
                                key=multicolor_size,
                                reverse=True)

    all_target_multicolors = consistent_targets[:]
    # disabled: extension with unions of pairwise disjoint multicolors
    # for i in range(2, len(vtree_consistent_target_multicolors) + 1):
    #     for comb in itertools.combinations(vtree_consistent_target_multicolors[:], i):
    #         comb = list(comb)
    #         for mc1, mc2 in itertools.combinations(comb, 2):
    #             if len(mc1.intersect(mc2).colors) > 0:
    #                 break
    #         else:
    #             new_mc = Multicolor()
    #             for mc in comb:
    #                 new_mc += mc
    #             all_target_multicolors.append(new_mc)

    # de-duplicate through the hashable representation, largest first
    unique_representations = {mc.hashable_representation
                              for mc in all_target_multicolors}
    all_target_multicolors = sorted(
        (Multicolor(*representation)
         for representation in unique_representations),
        key=multicolor_size,
        reverse=True)
    asm_data["target_multicolors"] = all_target_multicolors
    # log_bg_stats(bg=bg, logger=manager.logger)

    manager.logger.info("Reading repeats-bridges information")
    asm_data["repeats_guidance"] = get_repeats_bridges_guidance(
        file_name=input_config["repeats_bridges_file"],
        data=manager.data)
def test_get_breakpoint_from_file_with_comment_data_string(self):
    # full workflow on dummy input in which "data" comments attach a fragment
    # name to the fragments that follow them
    grimm_lines = [
        "",
        "\t",
        "#comment1",
        ">genome_name_1",
        " #comment1",
        "# data :: fragment : name = chromosome_X",
        "a b $",
        " #comment1 ",
        "\t>genome_name_2",
        "#data::fragment:name=scaffold111",
        "a $",
        "",
        "\n\t",
    ]
    stream = io.StringIO("\n".join(grimm_lines))
    result_bg = GRIMMReader.get_breakpoint_graph(stream, merge_edges=False)

    # overall shape of the graph
    self.assertTrue(isinstance(result_bg, BreakpointGraph))
    self.assertEqual(len(list(result_bg.connected_components_subgraphs())), 3)
    self.assertEqual(len(list(result_bg.edges())), 5)
    self.assertEqual(len(list(result_bg.nodes())), 7)

    genome_1, genome_2 = BGGenome("genome_name_1"), BGGenome("genome_name_2")
    single_multicolors = [Multicolor(genome_1), Multicolor(genome_2)]
    condensed_multicolors = [
        Multicolor(genome_1),
        Multicolor(genome_2),
        Multicolor(genome_1, genome_2),
    ]
    # edges were not merged, so each carries exactly one genome
    for bgedge in result_bg.edges():
        self.assertTrue(bgedge.multicolor in single_multicolors)
    for bgedge in result_bg.edges():
        condensed = result_bg.get_condensed_edge(vertex1=bgedge.vertex1,
                                                 vertex2=bgedge.vertex2)
        self.assertTrue(condensed.multicolor in condensed_multicolors)

    infinity_edge_count = sum(
        1 for bgedge in result_bg.edges() if bgedge.is_infinity_edge)
    self.assertEqual(infinity_edge_count, 4)

    # every edge carries the fragment name taken from the data comment
    for bgedge in result_bg.edges():
        edge_data = bgedge.data
        self.assertIn("fragment", edge_data)
        self.assertIsInstance(edge_data["fragment"], dict)
        self.assertIn("name", edge_data["fragment"])
        self.assertIn(edge_data["fragment"]["name"],
                      {"chromosome_X", "scaffold111"})

    # forward orientation is recorded for regular and for infinity edges
    a_head = result_bg.get_vertex_by_name("ah")
    b_tail = result_bg.get_vertex_by_name("bt")
    a_head_infinity = result_bg.get_vertex_by_name("ah__infinity")
    regular_edge = result_bg.get_edge_by_two_vertices(vertex1=a_head,
                                                      vertex2=b_tail)
    self.assertTupleEqual(
        regular_edge.data["fragment"]["forward_orientation"],
        (a_head, b_tail))
    infinity_edge = result_bg.get_edge_by_two_vertices(vertex1=a_head,
                                                       vertex2=a_head_infinity)
    self.assertTupleEqual(
        infinity_edge.data["fragment"]["forward_orientation"],
        (a_head, a_head_infinity))
def test_output_genomes_as_grimm(self):
    # round trip: write the four-genome graph as GRIMM block orders, read it
    # back and compare nodes, edge count and overall set of colors
    self._populate_four_genomes_bg(merge_edges=True)
    file_name = "file_name.txt"
    try:
        # the write happens inside the try so that a partially written file
        # is removed as well if the writer fails
        GRIMMWriter.print_genomes_as_grimm_blocks_orders(
            bg=self.four_genome_bg, file_name=file_name)
        with open(file_name, "rt") as source:
            new_bg = GRIMMReader.get_breakpoint_graph(stream=source,
                                                      merge_edges=True)
            self.assertEqual(len(list(new_bg.nodes())),
                             len(list(self.four_genome_bg.nodes())))
            self.assertEqual(len(list(new_bg.edges())),
                             len(list(self.four_genome_bg.edges())))
            self.assertSetEqual(set(new_bg.nodes()),
                                set(self.four_genome_bg.nodes()))
            self.assertSetEqual(
                new_bg.get_overall_set_of_colors(),
                self.four_genome_bg.get_overall_set_of_colors())
    finally:
        if os.path.exists(file_name):
            os.remove(file_name)
ch.setLevel(args.c_logging_level) logger.setLevel(args.c_logging_level) logger.addHandler(ch) logger.info(full_description) logger.info(parser.format_values()) ch.setFormatter(logging.Formatter(args.c_logging_formatter_entry)) logger.info("Starting the converting process") genomes = defaultdict(list) for file_name in args.grimm: logger.info("Processing file \"{file_name}\"".format(file_name=file_name)) with open(file_name, "rt") as source: current_genome = None for line in source: line = line.strip() if len(line) == 0 or GRIMMReader.is_comment_string(data_string=line): continue if GRIMMReader.is_genome_declaration_string(data_string=line): current_genome = GRIMMReader.parse_genome_declaration_string(data_string=line).name if args.trim_names: current_genome = current_genome.split(args.trimmer_char, 1)[0] elif current_genome is not None: current_chromosome = [] chr_type, blocks = GRIMMReader.parse_data_string(data_string=line) genomes[current_genome].append((chr_type, blocks)) if args.good_genomes != "": good_genomes = args.good_genomes.split(",") if args.trim_names: good_genomes = [genome_name.split(args.trimmer_char, 1)[0] for genome_name in good_genomes] for genome_name in list(genomes.keys()): if genome_name not in good_genomes:
def test_is_genome_declaration_string(self):
    # a string is a genome declaration string if its first non-empty element
    # is ">"; the genome name has to follow the ">" char, an empty genome
    # name is forbidden
    declarations = (
        ">genome",
        " >genome",
        " \t >genome",
        ">genome \t",
        ">genome ",
        " >genome ",
    )
    non_declarations = (
        "\tt >genome",
        " t\t>genome",
        " t>genome",
        "genome",
        ">",
        " > ",
        " >",
        "> ",
    )
    for candidate in declarations:
        self.assertTrue(GRIMMReader.is_genome_declaration_string(candidate))
    for candidate in non_declarations:
        self.assertFalse(GRIMMReader.is_genome_declaration_string(candidate))
def _populate_bg(data, merge_edges=False):
    """Build a breakpoint graph from GRIMM formatted lines.

    :param data: iterable of strings, joined with newlines into one stream
    :param merge_edges: forwarded to ``GRIMMReader.get_breakpoint_graph``
    :return: whatever ``GRIMMReader.get_breakpoint_graph`` returns
    """
    grimm_stream = io.StringIO("\n".join(data))
    return GRIMMReader.get_breakpoint_graph(grimm_stream,
                                            merge_edges=merge_edges)
def test_get_breakpoint_from_file(self):
    # full workflow testing with dummy data; the input is correct but mixed
    # with all kinds of odd indentation, comments and empty lines

    def read_graph(grimm_lines):
        return GRIMMReader.get_breakpoint_graph(
            io.StringIO("\n".join(grimm_lines)))

    # --- two genomes, noisy formatting -------------------------------------
    result_bg = read_graph([
        "",
        "\t",
        "#comment1",
        ">genome_name_1",
        " #comment1",
        "a b $",
        "\tc -a @\t",
        " #comment1 ",
        "\t>genome_name_2",
        "a $",
        "",
        "\n\t",
    ])
    self.assertTrue(isinstance(result_bg, BreakpointGraph))
    self.assertEqual(len(list(result_bg.connected_components_subgraphs())), 3)
    self.assertEqual(len(list(result_bg.edges())), 6)
    self.assertEqual(len(list(result_bg.nodes())), 9)
    name_1, name_2 = BGGenome("genome_name_1"), BGGenome("genome_name_2")
    allowed_multicolors = [
        Multicolor(name_1, name_2),
        Multicolor(name_1),
        Multicolor(name_2),
    ]
    for bgedge in result_bg.edges():
        self.assertTrue(bgedge.multicolor in allowed_multicolors)
    infinity_edges = [
        bgedge for bgedge in result_bg.edges() if bgedge.is_infinity_edge
    ]
    self.assertEqual(len(infinity_edges), 3)

    # --- five genomes sharing blocks ---------------------------------------
    result_bg = read_graph([
        ">genome_1",
        "a $",
        ">genome_2",
        "a b $",
        "# this is a bad genome",
        ">genome_3",
        "a b c $",
        ">genome_4",
        " # chromosome 1",
        "b c $",
        ">genome_5",
        "c $",
    ])
    self.assertTrue(isinstance(result_bg, BreakpointGraph))
    self.assertEqual(len(list(result_bg.connected_components_subgraphs())), 4)
    self.assertEqual(len(list(result_bg.edges())), 8)
    self.assertEqual(len(list(result_bg.nodes())), 12)
    genome1 = BGGenome("genome_1")
    genome2 = BGGenome("genome_2")
    genome3 = BGGenome("genome_3")
    genome4 = BGGenome("genome_4")
    genome5 = BGGenome("genome_5")
    allowed_multicolors = [
        Multicolor(genome1, genome2, genome3),
        Multicolor(genome1),
        Multicolor(genome2, genome3),
        Multicolor(genome2),
        Multicolor(genome3, genome4),
        Multicolor(genome3, genome4, genome5),
        Multicolor(genome4),
        Multicolor(genome5),
    ]
    for bgedge in result_bg.edges():
        self.assertTrue(bgedge.multicolor in allowed_multicolors)
    infinity_edges = [
        bgedge for bgedge in result_bg.edges() if bgedge.is_infinity_edge
    ]
    self.assertEqual(len(infinity_edges), 6)
    # infinity edges may carry any of the allowed multicolors except those
    # whose ``multicolors`` attribute has length 2
    infinity_multicolors = [
        multicolor for multicolor in allowed_multicolors
        if len(multicolor.multicolors) != 2
    ]
    for bgedge in infinity_edges:
        self.assertTrue(bgedge.multicolor in infinity_multicolors)
logger.setLevel(args.c_logging_level) logger.addHandler(ch) logger.info(full_description) logger.info(parser.format_values()) ch.setFormatter(logging.Formatter(args.c_logging_formatter_entry)) logger.info("Starting the converting process") genomes = defaultdict(list) for file_name in args.grimm: logger.info( "Processing file \"{file_name}\"".format(file_name=file_name)) with open(file_name, "rt") as source: current_genome = None for line in source: line = line.strip() if len(line) == 0 or GRIMMReader.is_comment_string( data_string=line): continue if GRIMMReader.is_genome_declaration_string(data_string=line): current_genome = GRIMMReader.parse_genome_declaration_string( data_string=line).name if args.trim_names: current_genome = current_genome.split( args.trimmer_char, 1)[0] elif current_genome is not None: current_chromosome = [] chr_type, blocks = GRIMMReader.parse_data_string( data_string=line) genomes[current_genome].append((chr_type, blocks)) if args.good_genomes != "": good_genomes = args.good_genomes.split(",") if args.trim_names:
def test_is_comment_data_string(self):
    # a comment qualifies as a data string only when "data" is followed by
    # the "::" marker; whitespace around the pieces is tolerated
    data_strings = (
        "# data :: ",
        "#data:: ",
        " #data:: ",
        " #data :: ",
        " #data :: LALA",
        " #data :: LALA : LULU",
        " #data :: LALA : LULU=LILI",
    )
    plain_comments = (
        "# data",
        "# data:",
        "# ldata:",
        "# datal:",
    )
    for candidate in data_strings:
        self.assertTrue(GRIMMReader.is_comment_data_string(candidate))
    for candidate in plain_comments:
        self.assertFalse(GRIMMReader.is_comment_data_string(candidate))