class AutomatonCache(object):

    def __init__(self):
        self.latest = None
        self.matches = {}

    def generate(self):
        with lock:
            self._generate()

    def _generate(self):
        latest = Entity.latest()
        if self.latest is not None and self.latest >= latest:
            return
        self.latest = latest

        matches = {}
        q = Entity.all()
        q = q.filter(Entity.state == Entity.STATE_ACTIVE)
        for entity in q:
            for term in entity.regex_terms:
                if term in matches:
                    matches[term].append(entity.id)
                else:
                    matches[term] = [entity.id]

        if not len(matches):
            self.automaton = None
            return

        self.automaton = Automaton()
        for term, entities in matches.items():
            self.automaton.add_word(term.encode('utf-8'), entities)
        self.automaton.make_automaton()
        log.info('Generated automaton with %s terms', len(matches))
from collections import defaultdict
from datetime import datetime
from itertools import groupby
from logging import info  # assumption: the bare info() calls are logging.info


def build_automata(vocab):
    # Build Aho-Corasick matching automata for vocabulary items
    # grouped by length.
    from ahocorasick import Automaton
    start_time = datetime.now()
    info('start building automata at {}'.format(
        start_time.strftime("%H:%M:%S")))
    strings = list(vocab)
    max_len = max(len(s) for s in strings)
    strings.sort(key=lambda s: len(s))
    strings_by_len = defaultdict(list)
    for k, g in groupby(strings, lambda s: len(s)):
        strings_by_len[k] = list(g)
    automata_by_len = {}
    for i in range(1, max_len + 1):
        if i not in strings_by_len:
            continue
        a = Automaton()
        for s in strings_by_len[i]:
            a.add_word(s, i)
        a.make_automaton()
        automata_by_len[i] = a
    end_time = datetime.now()
    info('finish building automata at {} (delta {})'.format(
        end_time.strftime("%H:%M:%S"), end_time - start_time))
    return automata_by_len
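A minimal usage sketch for the function above (the vocabulary and token are invented): each per-length automaton stores the pattern length as its value, so match spans can be recovered from the end index that iter() reports.

automata = build_automata({"cat", "cart", "art"})
token = "cartoon"
for length, automaton in automata.items():
    for end, _ in automaton.iter(token):
        start = end - length + 1
        print(token[start:end + 1], (start, end))  # e.g. ('cart', (0, 3)) and ('art', (1, 3))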
class AutomatonCache(object):

    def __init__(self):
        self.latest = None
        self.matches = {}
        self.regexes = []

    def generate(self):
        with lock:
            self._generate()

    def _generate(self):
        latest = Entity.latest()
        if self.latest is not None and self.latest >= latest:
            return
        self.latest = latest

        matches = defaultdict(set)
        q = Entity.all()
        q = q.options(joinedload('other_names'))
        q = q.filter(Entity.state == Entity.STATE_ACTIVE)
        for entity in q:
            for term in entity.regex_terms:
                matches[term].add(entity.id)

        if not len(matches):
            self.automaton = None
            return

        self.automaton = Automaton()
        for term, entities in matches.items():
            self.automaton.add_word(term.encode('utf-8'), entities)
        self.automaton.make_automaton()
        log.info('Generated automaton with %s terms', len(matches))
def build_automaton(self):
    q = Entity.all()
    q = q.filter(Entity.schema.in_(self.TYPES.keys()))
    matches = {}
    for entity in q:
        tag = self.TYPES.get(entity.schema)
        if tag is None:
            continue
        for name in entity.names:
            if name is None or len(name) > 120:
                continue
            match = self.match_form(name)
            if match is None:
                continue
            if match in matches:
                matches[match].append((name, tag))
            else:
                matches[match] = [(name, tag)]

    if not len(matches):
        return

    automaton = Automaton()
    for term, entities in matches.items():
        automaton.add_word(term, entities)
    automaton.make_automaton()
    return automaton
from collections import defaultdict
from datetime import datetime
from itertools import groupby
from logging import info  # assumption: the bare info() calls are logging.info


def build_automata(vocab):
    # Build Aho-Corasick matching automata for vocabulary items
    # grouped by length. The wordpiece convention is inverted for
    # matching: continuations are unmarked (instead of "##") and
    # string start is marked by "^^", e.g. "##ing" -> "ing" and
    # "run" -> "^^run".
    from ahocorasick import Automaton
    start_time = datetime.now()
    info('start building automata at {}'.format(
        start_time.strftime("%H:%M:%S")))
    strings = [v[2:] if v.startswith('##') else '^^' + v for v in vocab]
    max_len = max(len(s) for s in strings)
    strings.sort(key=lambda s: len(s))
    strings_by_len = defaultdict(list)
    for k, g in groupby(strings, lambda s: len(s)):
        strings_by_len[k] = list(g)
    automata_by_len = {}
    for i in range(1, max_len + 1):
        if i not in strings_by_len:
            continue
        a = Automaton()
        for s in strings_by_len[i]:
            a.add_word(s, i)
        a.make_automaton()
        automata_by_len[i] = a
    end_time = datetime.now()
    info('finish building automata at {} (delta {})'.format(
        end_time.strftime("%H:%M:%S"), end_time - start_time))
    return automata_by_len
def make_wordlist(filepath):
    with open(filepath, 'r') as f:
        wordlist = Automaton()
        for idx, word in enumerate(set(Base().encode(t)
                                       for t in f.read().split())):
            wordlist.add_word(word, (idx, word))
        wordlist.make_automaton()
    return wordlist
def _get_keyword_processor(self, custom_vocab: List[str]):
    keyword_processor = Automaton()
    for i, keyword in enumerate(custom_vocab):
        if len(keyword) > 1:
            keyword_processor.add_word(keyword, (i, keyword))
    keyword_processor.make_automaton()
    return keyword_processor
def _make_kwtree(keywords):
    if keywords:
        kwtree = Automaton()
        for keyword in keywords:
            kwtree.add_word(keyword, keyword)
        kwtree.make_automaton()
    else:
        kwtree = None
    return kwtree
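A quick, self-contained sketch of how the resulting tree is typically scanned (keywords invented here): each value is the keyword itself, so the start offset falls out of the end index.

kwtree = _make_kwtree(["he", "she", "hers"])
for end, keyword in kwtree.iter("ushers"):
    start = end - len(keyword) + 1
    print(keyword, (start, end))  # she (1, 3), he (2, 3), hers (2, 5)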
import pandas as pd
from ahocorasick import Automaton


def initialize_ac_automaton(kmers: pd.DataFrame):
    A = Automaton()
    for idx, kmer in enumerate(set(kmers['kmer'])):
        A.add_word(kmer, (idx, kmer))
    A.make_automaton()
    return A
def benchmark_pyahocorasick(LINE):
    from ahocorasick import Automaton, STORE_INTS

    automaton = Automaton()
    for i, key in enumerate(KEYS):
        automaton.add_word(key, key)
    automaton.make_automaton()

    print(list(automaton.iter(LINE)))
    benchmark("list(automaton.iter(LINE))", locals())
class AutomatonCache(object):

    TYPES = {
        'Person': DocumentTag.TYPE_PERSON,
        'Organization': DocumentTag.TYPE_ORGANIZATION,
        'Company': DocumentTag.TYPE_ORGANIZATION,
        'LegalEntity': DocumentTag.TYPE_PERSON,
    }

    def __init__(self):
        self.latest = None
        self.automaton = Automaton()
        self.matches = {}

    def generate(self):
        with lock:
            self._generate()

    def _generate(self):
        latest = Entity.latest()
        if latest is None:
            return
        if self.latest is not None and self.latest >= latest:
            return
        self.latest = latest

        matches = {}
        q = Entity.all()
        for entity in q:
            tag = self.TYPES.get(entity.schema)
            if tag is None:
                continue
            for name in entity.names:
                if name is None or len(name) > 120:
                    continue
                match = match_form(name)
                # TODO: this is a weird heuristic, but to avoid overly
                # aggressive matching it may make sense:
                if match is None or ' ' not in match:
                    continue
                if match in matches:
                    matches[match].append((name, tag))
                else:
                    matches[match] = [(name, tag)]

        if not len(matches):
            return

        for term, entities in matches.items():
            self.automaton.add_word(term.encode('utf-8'), entities)
        self.automaton.make_automaton()
        log.info('Generated automaton with %s terms', len(matches))
from datetime import datetime
from logging import info  # assumption: the bare info() calls are logging.info


def build_automaton(vocab):
    # Build Aho-Corasick matching automaton for vocabulary items
    from ahocorasick import Automaton
    start_time = datetime.now()
    info('start building automaton at {}'.format(
        start_time.strftime("%H:%M:%S")))
    a = Automaton()
    for v in vocab:
        a.add_word(v, len(v))
    a.make_automaton()
    end_time = datetime.now()
    info('finish building automaton at {} (delta {})'.format(
        end_time.strftime("%H:%M:%S"), end_time - start_time))
    return a
def init_automaton(scheme_fasta): """Initialize Aho-Corasick Automaton with kmers from SNV scheme fasta Args: scheme_fasta: SNV scheme fasta file path Returns: Aho-Corasick Automaton with kmers loaded """ A = Automaton() for header, sequence in parse_fasta(scheme_fasta): A.add_word(sequence, (header, sequence, False)) A.add_word(revcomp(sequence), (header, sequence, True)) A.make_automaton() return A
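A hedged usage sketch for init_automaton (the scheme path and contig are placeholders): each hit carries the kmer's header, the original kmer, and a flag telling whether the reverse complement matched.

A = init_automaton("scheme.fasta")  # hypothetical scheme path
contig = "ACGTACGT"                 # placeholder sequence
for end, (header, kmer, is_revcomp) in A.iter(contig):
    start = end - len(kmer) + 1
    print(header, start, end, "revcomp" if is_revcomp else "forward")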
def _get_automaton(normalizer):
    with compiler_lock:
        if normalizer in AUTOMATA:
            return AUTOMATA.get(normalizer)
        aho = Automaton()
        count = 0
        for place in iter_places():
            name = place.get('name')
            norm = normalizer(name)
            value = (place.get('code'), place.get('country'))
            aho.add_word(norm, value)
            count += 1
        log.debug("Country automaton: %d places", count)
        aho.make_automaton()
        AUTOMATA[normalizer] = aho
        return aho
class AutomatonCache(object):

    TYPES = {
        'Person': DocumentTag.TYPE_PERSON,
        'Organization': DocumentTag.TYPE_ORGANIZATION,
        'Company': DocumentTag.TYPE_ORGANIZATION,
        'LegalEntity': DocumentTag.TYPE_PERSON,
    }

    def __init__(self):
        self.latest = None
        self.automaton = Automaton()
        self.matches = {}

    def generate(self):
        with lock:
            self._generate()

    def _generate(self):
        latest = Entity.latest()
        if latest is None:
            return
        if self.latest is not None and self.latest >= latest:
            return
        self.latest = latest

        matches = {}
        q = Entity.all()
        q = q.filter(Entity.state == Entity.STATE_ACTIVE)
        for entity in q:
            for term in entity.regex_terms:
                type_ = self.TYPES.get(entity.type)
                if type_ is None:
                    continue
                if term in matches:
                    matches[term].append((entity.name, type_))
                else:
                    matches[term] = [(entity.name, type_)]

        if not len(matches):
            return

        for term, entities in matches.items():
            self.automaton.add_word(term.encode('utf-8'), entities)
        self.automaton.make_automaton()
        log.info('Generated automaton with %s terms', len(matches))
def init_automaton(scheme_fasta):
    """Initialize Aho-Corasick Automaton with kmers from SNV scheme fasta

    Args:
        scheme_fasta: SNV scheme fasta file path

    Returns:
        Aho-Corasick Automaton with kmers loaded
    """
    A = Automaton()
    for header, sequence in parse_fasta(scheme_fasta):
        kmer_list = expand_degenerate_bases(sequence)
        for seq in kmer_list:
            A.add_word(seq, (header, seq, False))
            A.add_word(revcomp(seq), (header, seq, True))
    A.make_automaton()
    return A
class AhoCorasickPathGenerator:

    def __init__(self, identifier_mapper, identifiers):
        self.identifier_mapper = identifier_mapper
        self.identifiers = identifiers
        self.automaton = Automaton()
        for identifier in identifiers:
            mapped = identifier_mapper(identifier)
            self.automaton.add_word(identifier, (len(identifier), mapped))
        self.automaton.make_automaton()
        self.dest_dirs = set()

    def blind_path(self, path):
        out = ''
        idx = 0
        for end_position, (length, mapped) in self.automaton.iter(path):
            end_idx = end_position + 1
            start_idx = end_idx - length
            out += path[idx:start_idx] + mapped
            idx = end_idx
        out += path[idx:]
        return out

    def __call__(self, input_dir, output_dir):
        for root, dirs, files in os.walk(input_dir):
            for name in files:
                source_file_name = os.path.join(root, name)
                relpath = os.path.relpath(
                    source_file_name,
                    start=input_dir,
                )
                dest_file_name = output_dir / self.blind_path(relpath)
                self.dest_dirs.add(abspath(dest_file_name.parent))
                yield (
                    abspath(source_file_name),
                    abspath(dest_file_name),
                )

    @property
    def init_lines(self):
        return "\n".join(f'mkdir -p "{dest_dir}"'
                         for dest_dir in self.dest_dirs) + "\n"
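For illustration, a small run of blind_path under an invented mapper: every known identifier found in a relative path is replaced by its mapped form, with offsets reconstructed from the stored lengths.

gen = AhoCorasickPathGenerator(lambda s: "x" * len(s), ["alice", "bob"])
print(gen.blind_path("data/alice/bob.txt"))  # -> data/xxxxx/xxx.txt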
def __create_automaton(self):
    paths = [
        ('Brand', os.path.join(Path.dictionary, 'Brand.txt')),
        ('Car', os.path.join(Path.dictionary, 'Car.txt')),
        ('Train', os.path.join(Path.dictionary, 'Train.txt')),
        ('Predicate', os.path.join(Path.dictionary, 'config.txt'))
    ]
    automaton = Automaton()
    for tag, path in paths:
        with open(path, 'r') as r_f:
            for line in r_f:
                line = line.rstrip('\n')
                _, *words = line.split('\t')
                for word in words:
                    word = re.sub(r'\(.*?\)', '', word.lower())
                    _, tag_set = automaton.get(word, (word, set()))
                    tag_set.add(tag)
                    automaton.add_word(word, (word, tag_set))
    automaton.make_automaton()
    return automaton
def create_automaton(dict_dir, vocab_suffix, min_word_len=3):
    assert isinstance(min_word_len, int) or isinstance(min_word_len, dict)
    automaton = Automaton()
    if os.path.isdir(dict_dir):
        dicts_path = [
            os.path.join(dict_dir, i) for i in os.listdir(dict_dir)
            if i.endswith(vocab_suffix)
        ]
    else:
        dicts_path = [dict_dir]
    for path in dicts_path:
        tag = os.path.split(path)[-1].strip(vocab_suffix)
        vocab = set(readfile(path, deal_func=lambda x: x.strip()))
        tag_min_word_len = min_word_len if isinstance(
            min_word_len, int) else min_word_len[tag]
        for word in vocab:
            word_len = len(word)
            if word_len >= tag_min_word_len:
                automaton.add_word(word, (word_len, word, tag))
    automaton.make_automaton()
    return automaton
async def _update_links_automaton(self):
    """
    Fetch the latest version of the links from the table, build an automaton.
    """
    logger.info(
        "_update_links_automaton: fetching links from table %s",
        self._links_table,
    )
    try:
        links = await self._api.run_db_interaction(
            "Fetch links from the table",
            _db_fetch_links,
            self._links_table,
        )
        logger.info("_update_links_automaton: we received %d links", len(links))
        new_link_automaton = Automaton(ahocorasick.STORE_LENGTH)
        for link in links:
            new_link_automaton.add_word(link)
        await make_deferred_yieldable(
            deferToThread(new_link_automaton.make_automaton))
        self._link_automaton = new_link_automaton
    except Exception as e:
        logger.exception("_update_links_automaton: could not update")
        raise e
class Gazetteer:

    def __init__(self, gaze_file=data_path):
        self.locations = {}
        self.vocab_to_location = {}
        self.automaton = Automaton()
        with open(gaze_file) as cin:
            self.load_gazes(cin)
        self.automaton.make_automaton()

    def load_gazes(self, cin):
        for line in cin:
            line = line.split('\t')
            line[-1] = line[-1].rstrip()
            self.locations[line[0]] = tuple(line)
            for vocab in line[3:]:
                if vocab in self.vocab_to_location:
                    self.vocab_to_location[vocab].append(line[0])
                else:
                    self.vocab_to_location[vocab] = [line[0]]
        for vocab, value in self.vocab_to_location.items():
            self.automaton.add_word(vocab, tuple(value))

    def match(self, string):
        ret = {}
        for end_index, value in self.automaton.iter(string):
            for lid in value:
                if lid in ret:
                    ret[lid] = (ret[lid][0], ret[lid][1] + 1)
                else:
                    ret[lid] = (self.locations[lid], 1)
        return ret
from ahocorasick import Automaton
from pickle import load, dump

auto = Automaton()
auto.add_word('abc', 'abc')
auto.add_word('def', 'def')

with open('automaton-wee.pickle', 'wb') as dest:
    dump(auto, dest)
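The matching load side, as a sketch (assuming the pickle written above): pyahocorasick automata survive pickling, but since the dump happens before make_automaton(), the loader has to finalize the automaton itself before scanning.

from pickle import load

with open('automaton-wee.pickle', 'rb') as src:
    auto = load(src)
auto.make_automaton()
print(list(auto.iter('abcdef')))  # [(2, 'abc'), (5, 'def')]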
import json
import random

from ahocorasick import Automaton

# python dictionary with trie patterns
file_words = json.load(open("../static_ioc_sample_30k.txt", "r"))
words_to_search = list()
trie_words = list()
total_words_to_search = 1000
total_words_added = 0
t = list()
patterns = dict()
total_initial_words = 0
total_iterations = 10  # CHANGE the number of iterations to perform: +/- 30k patterns per iteration.

A = Automaton()
for x in range(0, total_iterations):
    print("In iteration ", x)
    for key in file_words:
        for value in file_words[key]:
            value_random = value + str(random.randint(10000, 500000))
            if total_words_to_search != total_words_added:
                words_to_search.append(value)
                total_words_added += 1
            if x == 0:
                total_initial_words += 1
            A.add_word(value_random, value)

print(f"Initial words {total_initial_words}")
print(f"Total patterns on AC trie: {total_initial_words*total_iterations+1}")
A.make_automaton()
input()  # stop program to measure memory of the built AC trie
from ahocorasick import Automaton

auto = Automaton()
auto.add_word('wounded', 'wounded')
auto.make_automaton()

for item in auto.iter('Winning \U0001F629 so gutted, can\'t do anything for 4 weeks... Myth. #wounded'):
    print(item)

for item in auto.iter('Winning so gutted, can\'t do anything for 4 weeks... Myth. #wounded'):
    print(item)
class TrieTree:
    '''
    Prefix-tree (trie) class for dictionary matching.

    Parameters
    ----------
    paths: one or more dictionary file names (str or list). Each file uses
        tab-separated columns: column 1 is the word, column 2 the information
        (tag) attached to the word, column 3 an optional numeric value for
        that information (defaults to True). For example:
            中国 LOC 0.8
            美国 国家
    tp: match type, one of "c, m, mc" (default "mc"), meaning:
        c:  "BIES + _ + word"
        m:  "BIES + _"
        mc: "BIES + _" and "BIES + _ + word"

    Return
    ------
    defaultdict(dict, {idx_0: {feature: value}, idx_1: ...})
    A collection of feature dicts keyed by character index.

    Examples
    --------
    >>> trietree_c = TrieTree(paths=your_vocab_files, tp='c')
    >>> trietree_c("中国是一个国家")
    defaultdict(dict, {0: {'B_LOC': True}, 1: {'E_LOC': True}})

    >>> trietree_m = TrieTree(paths=your_vocab_files, tp='m')
    >>> trietree_m("中国是一个国家")
    defaultdict(dict, {0: {'B': True}, 1: {'E': True}})

    >>> trietree_mc = TrieTree(paths=your_vocab_files, tp='mc')
    >>> trietree_mc("中国是一个国家")
    defaultdict(dict, {0: {'B': True, 'B_LOC': True}, 1: {'E': True, 'E_LOC': True}})
    '''

    def __init__(self, vocab_paths, vocab_match_type='mc', drop_vocab_pro=0,
                 vocab_name_space=False, separator='\t'):
        self.match_cnt = Counter()
        self.user_automaton = {}
        self.keep_vocab_pro = 1 - drop_vocab_pro
        self.vocab_name_space = vocab_name_space
        self.vmp = vocab_match_type
        self.load_vocab(vocab_paths, separator=separator)
        self.cnt = Counter()
        print('trietree:\ntp: %s\nvocab path: %s' % (self.vmp, str(vocab_paths)))
        if self.keep_vocab_pro < 1:
            print('drop vocab pro', self.keep_vocab_pro)

    def __call__(self, *args, **kwargs):
        vocab_feature = self._vocab_feature(*args, **kwargs)
        return vocab_feature

    def load_vocab(self, paths, add=False, separator='\t'):
        if add and hasattr(self, 'automaton'):
            pass
        else:
            self.automaton = Automaton()
        vocab = defaultdict(list)
        tags = set()
        if isinstance(paths, str):
            paths = [paths]
        for path in paths:
            name_space = os.path.split(path)[-1]
            print('read %s' % path)
            output = os.popen('wc -l ' + path)
            total = int(output.readline().split()[0])
            with open(path, 'r') as r_f:
                print('vocab file examples:')
                for n, line in enumerate(r_f):
                    print(line.strip())
                    if n >= 10:
                        break
                r_f.seek(0)
                for line in tqdm(r_f, desc='read file', total=total):
                    if random.random() > self.keep_vocab_pro:
                        continue
                    splits = line.strip().split(separator)
                    try:
                        if len(splits) == 2:
                            word, tag = splits
                            value = True
                        elif len(splits) == 3:
                            word, tag, value = splits
                            value = char2num(value)
                        elif len(splits) == 1:
                            word = splits[0]
                            value = True
                            tag = 'WORD'
                        else:
                            continue
                        if self.vocab_name_space:
                            tag = name_space + '_' + tag
                        vocab[word].append((tag, value))
                        if tag not in tags:
                            tags.add(tag)
                    except Exception as e:
                        print('vocab error: path-%s, line %s' % (path, line), e)
                        continue
        self.tags = tags if not hasattr(self, 'tags') else self.tags | tags
        for word, value in tqdm(vocab.items(), desc='add words'):
            self.automaton.add_word(word, (len(word), word, value))
        print('%s words in total' % len(vocab))
        self.automaton.make_automaton()

    def _vocab_feature(self, sentence):
        vocab_feature = defaultdict(dict)
        self.match(sentence, vocab_feature)
        if self.user_automaton:
            self.match(sentence, vocab_feature, base_or_user='user')
        return vocab_feature

    def match(self, sentence, vocab_feature, base_or_user='base'):
        if base_or_user == 'base':
            result = self.automaton.iter(sentence)
        else:
            result = self.user_automaton.iter(sentence)
        for end_idx, (word_len, _, tag_value) in list(result):
            start_idx = end_idx - word_len + 1
            for tag, value in tag_value:
                self.match_cnt[tag] += 1
                if self.vmp == 'c':
                    tagss = [create_tag(word_len, tag)]
                elif self.vmp == 'm':
                    tagss = [create_tag(word_len, '')]
                elif self.vmp == 'mc':
                    tagss = [
                        create_tag(word_len, tag),
                        create_tag(word_len, '')
                    ]
                else:
                    tagss = []
                for tags in tagss:
                    for idx, tag in zip(range(start_idx, end_idx + 1), tags):
                        vocab_feature[idx][tag] = value

    def init_user_automaton(self):
        self.user_automaton = Automaton()
        self.user_automaton.make_automaton()

    def add_word(self, word, tag, value, update=True):
        '''
        Parameters
        ----------
        word: the word to match
        tag: the information attached to the word
        value: the value attached to the information

        Examples
        --------
        >>> trietree.add_word('中国', '国家', True)
        >>> trietree.user_automaton.get('中国')
        (2, '中国', [('国家', True)])
        '''
        have_add = ''
        if self.user_automaton == {}:
            self.init_user_automaton()
        wl, w, tag_values = self.user_automaton.get(word, (len(word), word, []))
        for i, (t, v) in enumerate(tag_values):
            if t == tag:
                tag_values[i] = (tag, value)
                break
        else:
            tag_values.append((tag, value))
        self.user_automaton.add_word(w, (wl, w, tag_values))
        if update:
            self.user_automaton.make_automaton()

    def add_words(self, word_tag_values):
        '''
        do:
            for word, tag, value in word_tag_values:
                self.add_word(word, tag, value, update=False)

        Examples
        --------
        >>> word_tag_values = [('中国', '面积', 9666), ('中国', '人口', 8888)]
        >>> trietree.add_words(word_tag_values)
        >>> trietree.user_automaton.get('中国')
        (2, '中国', [('面积', 9666), ('人口', 8888)])
        '''
        for word, tag, value in word_tag_values:
            self.add_word(word, tag, value, update=False)
        self.user_automaton.make_automaton()

    def get(self, key, default=None, vocab='all'):
        '''
        Same as dict.get.

        Parameters
        ----------
        vocab: which dictionary to look up: base (built-in), user
            (user-defined) or all (both). Defaults to all.
        '''
        if vocab == 'base':
            value = self.automaton.get(key, default)
        elif vocab == 'user':
            value = self.user_automaton.get(key, default)
        else:
            value = {
                'base': self.automaton.get(key, default),
                'user': self.user_automaton.get(key, default)
            }
        return value
from spacy.lang.en import English
from ahocorasick import Automaton

A = Automaton()
A.add_word("roadster", ('BRAND', 'roadster'))
A.add_word("top", ('PRODUCT', 'top'))
A.add_word("boot", ('PRODUCT', 'boot'))
A.add_word("sandal", ('PRODUCT', 'sandal'))
A.add_word("dress", ('PRODUCT', 'dress'))
A.add_word("glasses", ('PRODUCT', 'glasses'))
A.add_word("saucepan", ('PRODUCT', 'saucepan'))
A.add_word("earphone", ('PRODUCT', 'earphone'))
A.add_word("nylon", ('INFO', 'nylon'))
A.add_word("cotton", ('INFO', 'cotton'))
A.add_word("small", ('INFO', 'small'))
A.add_word("medium", ('INFO', 'medium'))
A.add_word("large", ('INFO', 'large'))
A.add_word("cord", ('PRODUCT', 'cord'))
A.add_word("cover", ('PRODUCT', 'cover'))
A.add_word("case", ('PRODUCT', 'case'))
A.add_word("phone", ('PRODUCT', 'phone'))
A.add_word("lg2000", ('BRAND', 'lg2000'))
A.add_word("shoe", ('PRODUCT', 'shoe'))
A.add_word("warmer", ('PRODUCT', 'warmer'))
A.add_word("shirt", ('PRODUCT', 'shirt'))
A.add_word("xl", ('INFO', 'xl'))
A.add_word("xxl", ('INFO', 'xxl'))
A.add_word("lunch box", ('PRODUCT', 'lunch box'))
A.add_word("battery", ('PRODUCT', 'battery'))
A.add_word("batteries", ('PRODUCT', 'batteries'))
A.add_word("nokia", ('BRAND', 'nokia'))
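The snippet above only populates the gazetteer; a hedged continuation (the query string is invented) would finalize it and scan product text. Note that hits are raw substring matches, so for instance "xl" would also fire inside "xxl".

A.make_automaton()
for end, (label, word) in A.iter("nokia phone cover, cotton cord, size xl"):
    print(label, word, end)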
from ahocorasick import Automaton


def build_trie(company_dict: dict) -> Automaton:
    trie = Automaton()
    for name, idx in company_dict.items():
        trie.add_word(name, (idx, name))
    return trie
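Note that build_trie never calls make_automaton(), so the returned object works as a plain trie for exact-key lookups; full-text scanning with iter() would require finalizing it first. A small sketch with an invented company_dict:

trie = build_trie({"acme corp": 1, "globex": 2})
print(trie.get("globex"))         # (2, 'globex')
print(trie.get("initech", None))  # None
trie.make_automaton()             # required before trie.iter(...) full-text search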
class ReadTagger:

    def __init__(
        self, bc_to_id: Dict[str, str],
        len_linker: int, len_primer: int,
        *, max_mm: int = 1, use_stats: bool = True
    ):
        self.bc_to_id = bc_to_id
        self.len_linker = len_linker
        self.len_primer = len_primer
        self.stats = None if not use_stats else dict(
            n_only_primer=0,
            n_multiple_bcs=0,
            n_no_barcode=0,
            n_regular=0,
            n_barcode_mismatch=0,
            n_junk=0,
        )

        self.automaton = Automaton()
        all_barcodes, self.blacklist = get_all_barcodes(bc_to_id.keys(), max_mm=max_mm)
        for pattern, barcode in all_barcodes.items():
            self.automaton.add_word(pattern, barcode)
        self.automaton.make_automaton()

    def search_barcode(self, read: str) -> Tuple[int, int, str]:
        for end, barcode in self.automaton.iter(read):
            start = end - len(barcode) + 1
            yield start, end + 1, barcode

    def tag_read(self, header: str, seq_read: str, seq_qual: str) -> TaggedRead:
        # as ordered set
        matches = OrderedDict((match, None) for match in self.search_barcode(seq_read))
        match_iter: Iterator[Tuple[int, int, str]] = iter(matches)

        bc_start, bc_end, barcode = next(match_iter, (None, None, None))
        bc_id = self.bc_to_id.get(barcode)
        other_barcodes = frozenset(set(self.bc_to_id[bc] for _, _, bc in match_iter) - {bc_id})

        if barcode is not None:
            linker_end = bc_end + self.len_linker if bc_end else None
            junk = seq_read[:bc_start] or None
            linker = seq_read[bc_end:linker_end]
            amplicon = seq_read[linker_end:]
            barcode_mismatch = seq_read[bc_start:bc_end] != barcode
        else:
            junk = None
            linker = None
            amplicon = seq_read
            barcode_mismatch = False

        read = TaggedRead(
            header, seq_qual, self.len_primer,
            junk, bc_id, linker, amplicon,
            other_barcodes, barcode_mismatch,
        )

        if self.stats is not None:
            for name, pred in PREDS.items():
                if pred(read):
                    self.stats[name] += 1

        return read

    def get_barcode_table(self, plain=False):
        cell_templates = {
            (True, True): '{}',
            (True, False): '<span class="b">{}</span>',
            (False, True): '<span class="a">{}</span>',
            (False, False): '<span class="both">{}</span>',
        }
        patterns = sorted({bc for bc_pairs in self.blacklist.values()
                           for pair in bc_pairs for bc in pair})
        sprs = pd.DataFrame(index=patterns, columns=patterns, dtype=str)
        for pattern, bc_pairs in self.blacklist.items():
            for bc1, bc2 in bc_pairs:
                sprs.loc[bc1, bc2] = ''.join(
                    cell_templates[bc1[i] == base, bc2[i] == base].format(base)
                    for i, base in enumerate(pattern)
                )
        with pd.option_context('display.max_colwidth', -1):
            html = sprs.to_html(escape=False, na_rep='')
        if plain:
            return html
        return HTML_INTRO + html
import discord
import requests
from discord.ext import commands
from ahocorasick import Automaton
import re

from constants import colors, paths, channels
from utils import make_embed, load_json, save_json, show_error

d = requests.get(
    "https://gist.githubusercontent.com/Vexs/629488c4bb4126ad2a9909309ed6bd71/raw/416403f7080d1b353d8517dfef5acec9aafda6c3/emoji_map.json"
).json()
unicode = Automaton()
for emoji in d.values():
    unicode.add_word(emoji, emoji)
unicode.make_automaton()

custom = re.compile("<a?:[a-zA-Z0-9_]{2,32}:[0-9]{18,22}>")
role = re.compile(
    r'<@&([0-9]{18,22})>|`(.*?)`|"(.*?)"|\((.*?)\)|\*(.*?)\*|-\s*(.*?)$')


def get_emoji(s):
    emoji = []
    emoji.extend(unicode.iter(s))
    emoji.extend((m.end(), m.group(0)) for m in custom.finditer(s))
    emoji.sort(key=lambda x: x[0])
    out = []
    for end_pos, text in emoji:
        if m := role.search(s, end_pos):
            ...
import json
import random
from time import process_time

from ahocorasick import Automaton

# Setup assumed from the companion memory-measurement snippet above:
file_words = json.load(open("../static_ioc_sample_30k.txt", "r"))
words_to_search = list()
total_words_to_search = 1000
total_words_added = 0

t = list()
patterns = dict()
total_initial_words = 0
total_iterations = 10  # CHANGE the number of iterations to perform: +/- 30k patterns per iteration.

A = Automaton()
for x in range(0, total_iterations):
    print("In iteration ", x)
    for key in file_words:
        for value in file_words[key]:
            value = value + str(random.randint(10000, 500000))
            if total_words_to_search != total_words_added:
                words_to_search.append(value)
                total_words_added += 1
            if x == 0:
                total_initial_words += 1
            A.add_word(value, value)

print(f"Initial words {total_initial_words}")
print(f"Total patterns on AC trie: {total_initial_words*total_iterations+1}")
A.make_automaton()

start1 = process_time()
for word_to_search in words_to_search:
    start = process_time()
    end = 0
    for match in A.iter(word_to_search):
        pass
end1 = process_time()
print(
    f"Took {end1-start1}sec to match {len(words_to_search)} patterns on a AC automaton with {total_initial_words*total_iterations}"
)