def test_regular_ra_parser(self, num_kmers):
    # given
    kmer_size = 11
    graph_builder = builder.Graph() \
        .with_kmer_size(kmer_size) \
        .with_num_colors(1)

    seen_kmers = set()
    while len(seen_kmers) < num_kmers:
        candidate = lexlo(''.join(
            [random.choice('ACGT') for _ in range(kmer_size)]))
        if candidate in seen_kmers:
            continue
        seen_kmers.add(candidate)
        graph_builder.with_kmer(candidate)

    fh = graph_builder.build()
    ra = parser.RandomAccess(fh, kmer_cache_size=None)
    # touch every kmer once up front
    for k_string in list(ra):
        ra[k_string]

    with mock.patch.object(fh, 'read', wraps=fh.read) as mocked_read:
        # when
        for seen_kmer in sorted(seen_kmers):
            ra[seen_kmer]

        # then: with no cache, every lookup costs two reads
        assert 2 * num_kmers == mocked_read.call_count
def get_incoming_kmers(self, kmer_string):
    """Return the lexlo (canonical) forms of kmers with an edge into *kmer_string*.

    *kmer_string* itself must already be in lexlo form.
    """
    assert lexlo(kmer_string) == kmer_string
    incoming = self.get_incoming_kmer_strings(kmer_string, is_lexlo=True)
    return [lexlo(neighbour) for neighbour in incoming]
def get_canonical_edge(first, second):
    """Get canonical edge.

    Canonical edges are between lexlo kmers and are ordered lexicographically.

    Return canonical edge, plus a flag saying whether the two lexlo nodes
    had to be swapped to achieve that ordering.
    """
    node_a = lexlo(first)
    node_b = lexlo(second)
    is_flipped = node_b < node_a
    if is_flipped:
        node_a, node_b = node_b, node_a
    return node_a, node_b, is_flipped
def make_graph_nodes_consistent(self, seed_kmer_strings=None):
    """ Take a Cortex graph and make all nodes have kmer_strings that are consistent with each
    other. If a seed kmer string is provided, then start with that seed kmer. """
    # Nothing to do if orientations already agree everywhere.
    if self.graph.is_consistent():
        return self
    if seed_kmer_strings is None:
        seed_kmer_strings = []
    graph = CortexDiGraph(self.graph)
    # The new graph shares the underlying node/kmer storage of the old one.
    new_graph = ConsistentCortexDiGraph(graph=self.graph.graph)
    # Iterator yields (seed, lexlo_seed) pairs; seeds provided by the caller
    # are preferred before falling back to remaining unseen kmers.
    seeds = SeedKmerStringIterator.from_all_kmer_strings_and_seeds(
        self.graph.nodes(), seed_kmer_strings)
    for seed, lexlo_seed in seeds:
        new_graph.add_node(seed, kmer=self.graph.node[lexlo_seed])
        # Mark the seed as consumed so the iterator does not re-emit it.
        seeds.remove(lexlo_seed)
        # Walk every edge reachable from this seed, ignoring edge direction,
        # and orient each newly-reached kmer to match its reference neighbour.
        for source, sink, key, direction in nx.edge_dfs(graph, lexlo_seed, 'ignore'):
            if direction == 'forward':
                rc_after_ref_kmer = True
                ref, target = source, sink
            elif direction == 'reverse':
                # Edge was traversed against its stored direction, so the
                # reference/target roles are swapped.
                ref, target = sink, source
                rc_after_ref_kmer = False
            else:
                raise Exception("unknown direction: {}".format(direction))
            if ref not in new_graph.node:
                # The reference was stored in the opposite orientation;
                # flip it (and the relative orientation flag) to match.
                ref = revcomp(ref)
                rc_after_ref_kmer = not rc_after_ref_kmer
            matched_target, _ = revcomp_target_to_match_ref(target, ref,
                                                            rc_after_ref_kmer)
            new_graph.add_node(matched_target, kmer=graph.node[matched_target])
            seeds.remove(lexlo(matched_target))
    self.graph = new_graph
    return self
def __setitem__(self, key, value):
    """Store *value* under the lexlo form of *key*, clearing any exclusion."""
    canonical = lexlo(key)
    # a re-added kmer is no longer excluded
    self._exclusion_set.discard(canonical)
    # first in-memory overwrite of a disk-backed kmer counts as a duplicate
    if canonical not in self._new_kmers and canonical in self.ra_parser:
        self._n_duplicates += 1
    self._new_kmers[canonical] = value
def __getitem__(self, key):
    """Return the kmer stored under the lexlo form of *key*.

    In-memory kmers shadow kmers from the underlying random-access parser.

    Raises:
        KeyError: if the kmer has been excluded (deleted) or is unknown.
    """
    lexlo_key = lexlo(key)
    if lexlo_key in self._exclusion_set:
        # Fix: include the offending key in the exception instead of a bare
        # `raise KeyError`, so callers can see what was missing.
        raise KeyError(key)
    if lexlo_key in self._new_kmers:
        return self._new_kmers[lexlo_key]
    return self.ra_parser[lexlo_key]
def build_or_get(self, kmer_string):
    """Build empty kmer or return a cached kmer for a kmer string.

    Kmers are cached under their lexlo (canonical) form, so two kmer
    strings that are reverse complements of each other share one kmer.
    """
    check_kmer_string(kmer_string)
    lexlo_string = lexlo(kmer_string)
    try:
        # Single dict lookup (EAFP) instead of `in self._seen_kmers.keys()`
        # followed by a second subscript lookup.
        return self._seen_kmers[lexlo_string]
    except KeyError:
        kmer = self._build_from_lexlo(lexlo_string)
        self._seen_kmers[lexlo_string] = kmer
        return kmer
def test_slurping_ra_parser(self, num_kmers):
    # given
    kmer_size = 11
    graph_builder = builder.Graph() \
        .with_kmer_size(kmer_size) \
        .with_num_colors(1)

    seen_kmers = set()
    while len(seen_kmers) < num_kmers:
        candidate = lexlo(''.join(
            [random.choice('ACGT') for _ in range(kmer_size)]))
        if candidate in seen_kmers:
            continue
        seen_kmers.add(candidate)
        graph_builder.with_kmer(candidate)

    fh = graph_builder.build()
    fh.seek(0)

    with mock.patch.object(fh, 'seek', wraps=fh.seek) as mocked_seek:
        # when
        ra = parser.SlurpedRandomAccess.from_handle(fh)

        # then: slurping never seeks, not even for lookups
        assert 0 == mocked_seek.call_count
        for seen_kmer in sorted(seen_kmers):
            ra[seen_kmer]
        assert 0 == mocked_seek.call_count

    fh.seek(0)
    num_header_reads = 10
    num_eof_reads = 1
    with mock.patch.object(fh, 'read', wraps=fh.read) as mocked_read:
        # when
        ra = parser.SlurpedRandomAccess.from_handle(fh)

        # then: all reads happen up front; lookups cost no extra reads
        expected_reads = num_kmers + num_eof_reads + num_header_reads
        assert expected_reads == mocked_read.call_count
        for seen_kmer in sorted(seen_kmers):
            ra[seen_kmer]
        assert expected_reads == mocked_read.call_count
def set_unitig_cycle(self, unitig):
    """Flag *unitig* as a cycle iff its right node links back to its left node
    in every unitig edge color."""
    unitig.is_cycle = False
    try:
        flipped_string, _ = revcomp_target_to_match_ref(
            unitig.left_node, unitig.right_node,
            rc_is_after_reference_kmer=True)
    except ValueError:
        # left node cannot be oriented against the right node: no cycle
        return
    last_letter = flipped_string[-1]
    if lexlo(unitig.right_node) == unitig.right_node:
        edge_letter = last_letter.upper()
    else:
        edge_letter = lexlo(last_letter).lower()
    colors = unitig.unitig_edge_colors
    if not colors:
        # no colors to check means no evidence of a cycle
        return
    right_kmer = self.graph.node[unitig.right_node]['kmer']
    unitig.is_cycle = all(
        right_kmer.edges[color].is_edge(edge_letter) for color in colors)
def __delitem__(self, item):
    """Delete the kmer stored under the lexlo form of *item*.

    A kmer that exists only in the backing parser cannot be physically
    removed, so it is masked via the exclusion set instead.

    Raises:
        KeyError: if the kmer was already deleted (excluded).
    """
    lexlo_string = lexlo(item)
    if lexlo_string in self._exclusion_set:
        # Fix: include the key in the exception instead of a bare
        # `raise KeyError`, so callers can see what was missing.
        raise KeyError(item)
    in_new_kmers = lexlo_string in self._new_kmers
    in_ra_parser = lexlo_string in self.ra_parser
    if in_new_kmers:
        del self._new_kmers[lexlo_string]
    if in_ra_parser:
        self._exclusion_set.add(lexlo_string)
    if in_new_kmers and in_ra_parser:
        # the in-memory kmer no longer shadows its disk-backed copy
        self._n_duplicates -= 1
    # NOTE(review): deleting a kmer present in neither store is a silent
    # no-op rather than a KeyError -- confirm this is intentional.
def load_kmer(self, kmer):
    """Load the link group for a kmer in the orientation of the kmer."""
    canonical = lexlo(kmer)
    kmer_is_lexlo = canonical == kmer
    try:
        link_group = self.links.body[canonical]
        logger.debug('Loaded link group for kmer %s: %s', kmer, link_group)
    except KeyError:
        # no links recorded for this kmer; nothing to load
        pass
    else:
        oriented_juncs = link_group.get_link_junctions_in_kmer_orientation(
            kmer_is_lexlo)
        for junc in oriented_juncs:
            self.junctions[junc[0]].append(junc)
    return self
def __next__(self):
    """Return the next (kmer_string, lexlo_kmer_string) pair.

    Caller-supplied seeds are drained first; remaining unseen canonical
    kmers follow in insertion order.
    """
    while self.seed_kmer_strings:
        candidate = self.seed_kmer_strings.pop()
        canonical = lexlo(candidate)
        if canonical in self._unseen_lexlo_kmer_strings:
            self._seen_lexlo_kmer_strings.add(canonical)
            return candidate, canonical
    while self._unseen_lexlo_kmer_strings:
        kmer_string, _ = self._unseen_lexlo_kmer_strings.popitem(last=False)
        if kmer_string not in self._seen_lexlo_kmer_strings:
            self._seen_lexlo_kmer_strings.add(kmer_string)
            return kmer_string, kmer_string
    raise StopIteration
def __iter__(self):
    """Iterate over the distinct neighbour kmers across all requested colors."""
    is_lexlo_query = self.kmer.kmer == self.query
    neighbours = set()
    for color in self.colors:
        color_edges = self.kmer.edges[color]
        if self.orientation == EdgeTraversalOrientation.original:
            candidates = color_edges.get_outgoing_kmer_strings(
                self.query, is_lexlo=is_lexlo_query)
        else:
            candidates = color_edges.get_incoming_kmer_strings(
                self.query, is_lexlo=is_lexlo_query)
        for candidate in candidates:
            neighbours.add(
                lexlo(candidate) if self.return_lexlo_kmers else candidate)
    return iter(neighbours)
def test_revcomps_many_kmers(data, num_kmers, kmer_size):
    # given: map of lexlo form -> drawn form; revcomp duplicates collapse
    kmers = {}
    for _ in range(num_kmers):
        drawn = data.draw(kmer_strings(min_size=kmer_size, max_size=kmer_size))
        kmers[lexlo(drawn)] = drawn

    b = get_cortex_builder()
    for canonical in kmers.keys():
        b.with_kmer('{} 1 ........'.format(canonical))
    cdb = b.build()

    # when
    consistent_graph = Interactor(cdb) \
        .make_graph_nodes_consistent(set(kmers.values())) \
        .graph
    expect = KmerGraphExpectation(consistent_graph)

    # then: every node keeps its drawn (possibly non-lexlo) orientation
    for kmer_string in kmers.values():
        expect.has_node(kmer_string)
    expect.has_n_nodes(len(kmers))
def annotate_kmer_graph_edges(graph):
    """Adds nodes to graph for kmer_strings that only exist as edges in a node's kmer."""
    colors = graph.graph['colors']
    kmer_builder = EmptyKmerBuilder(num_colors=len(colors), default_coverage=1)

    def ensure_node(kmer_string):
        # materialise a missing neighbour as an empty kmer node
        if kmer_string not in graph.nodes:
            graph.add_node(kmer_string,
                           kmer=kmer_builder.build_or_get(kmer_string))

    for kmer_string, kmer in list(graph.nodes(data='kmer')):
        is_lexlo = kmer_string == lexlo(kmer_string)
        for color in colors:
            color_edges = kmer.edges[color]
            for neighbour in color_edges.get_outgoing_kmer_strings(
                    kmer_string, is_lexlo=is_lexlo):
                ensure_node(neighbour)
                graph.add_edge(kmer_string, neighbour, key=color)
            for neighbour in color_edges.get_incoming_kmer_strings(
                    kmer_string, is_lexlo=is_lexlo):
                ensure_node(neighbour)
                graph.add_edge(neighbour, kmer_string, key=color)
    return graph
def get_kmers(self, contig):
    """Return (kmer, kmer_string) pairs for every kmer in *contig* and
    connect each consecutive pair of kmers."""
    kmer_size = self.graph_parser.kmer_size
    assert len(contig) >= kmer_size
    kmers = []
    for start in range(len(contig) - kmer_size + 1):
        kmer_string = contig[start:start + kmer_size]
        canonical = lexlo(kmer_string)
        if canonical in self.seen_kmer_strings:
            kmer = self.seen_kmer_strings[canonical]
        else:
            try:
                kmer = self.graph_parser.get_kmer_for_string(kmer_string)
            except KeyError:
                # not present in the graph: create an empty placeholder
                kmer = self.empty_kmer_builder.build(kmer_string)
            self.seen_kmer_strings[canonical] = kmer
        kmer.increment_color_coverage(self.num_colors - 1)
        kmers.append((kmer, kmer_string))
    # link consecutive kmers of the contig
    for (this_kmer, _), (next_kmer, _) in zip(kmers, kmers[1:]):
        connect_kmers(this_kmer, next_kmer, self.contig_color,
                      identical_kmer_check=False)
    return kmers
def remove(self, lexlo_kmer_string):
    """Mark an already-canonical kmer string as seen so it is not re-emitted."""
    assert lexlo(lexlo_kmer_string) == lexlo_kmer_string
    self._seen_lexlo_kmer_strings.add(lexlo_kmer_string)
def from_all_kmer_strings_and_seeds(cls, all_kmers, seeds):
    """Build an iterator over *all_kmers* (canonicalised, order-preserving),
    preferring the supplied *seeds*."""
    canonical_kmers = OrderedDict.fromkeys(
        lexlo(kmer) for kmer in all_kmers)
    return cls(list(seeds), canonical_kmers)
def num_neighbor(self, kmer_string):
    """Return the neighbour count for *kmer_string*.

    A non-canonical kmer string is answered by the opposite orientation
    queried with the canonical (lexlo) form.
    """
    lexlo_kmer_string = lexlo(kmer_string)
    if lexlo_kmer_string != kmer_string:
        # Fix: reuse the already-computed canonical form instead of
        # redundantly calling lexlo(kmer_string) a second time.
        return self.other_orientation().num_neighbor(lexlo_kmer_string)
    return self._num_neighbor()
def get_kmer_for_string(self, string):
    """Will compute the revcomp of kmer string before getting a kmer"""
    canonical = lexlo(string)
    return self[canonical]
def get_incoming_kmer_strings(self, kmer_string, is_lexlo=None):
    """Return kmer strings that have an edge into *kmer_string*."""
    if is_lexlo is None:
        # determine orientation from the string itself
        is_lexlo = kmer_string == lexlo(kmer_string)
    return self._get_kmer_strings(kmer_string[:-1], True, is_lexlo)
def build(self, kmer_string):
    """Build empty kmer from a kmer string"""
    check_kmer_string(kmer_string)
    return self._build_from_lexlo(lexlo(kmer_string))
def get_outgoing_kmer_strings(self, kmer_string, is_lexlo=None):
    """Return kmer strings reachable by an edge out of *kmer_string*."""
    if is_lexlo is None:
        # determine orientation from the string itself
        is_lexlo = kmer_string == lexlo(kmer_string)
    return self._get_kmer_strings(kmer_string[1:], False, is_lexlo)
def with_link_for_kmer(self, link, kmer):
    """Attach *link* to *kmer*; the kmer must already be canonical (lexlo).

    link: <F|R> <num_juncs> <counts0,counts1,...> <junctions>
    """
    assert kmer == lexlo(kmer)
    self.links[kmer].append(link)
    return self