Example 1
class AutomatonCache(object):

    def __init__(self):
        self.latest = None
        self.matches = {}

    def generate(self):
        with lock:
            self._generate()

    def _generate(self):
        latest = Entity.latest()
        if self.latest is not None and self.latest >= latest:
            return
        self.latest = latest

        matches = {}
        q = Entity.all()
        q = q.filter(Entity.state == Entity.STATE_ACTIVE)
        for entity in q:
            for term in entity.regex_terms:
                if term in matches:
                    matches[term].append(entity.id)
                else:
                    matches[term] = [entity.id]

        if not len(matches):
            self.automaton = None
            return

        self.automaton = Automaton()
        for term, entities in matches.items():
            self.automaton.add_word(term.encode('utf-8'), entities)
        self.automaton.make_automaton()
        log.info('Generated automaton with %s terms', len(matches))
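A minimal usage sketch (an assumption, not part of the original snippet; text is a placeholder). The terms above are added as UTF-8 bytes, which requires a bytes build of pyahocorasick, so the haystack must be encoded the same way:

cache = AutomatonCache()
cache.generate()
if cache.automaton is not None:
    for end_index, entity_ids in cache.automaton.iter(text.encode('utf-8')):
        print(end_index, entity_ids)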
Example 2
 def build_automata(vocab):
     # Build Aho-Corasick matching automata for vocabulary items
     # grouped by length.
     from ahocorasick import Automaton
     start_time = datetime.now()
     info('start building automata at {}'.format(
         start_time.strftime("%H:%M:%S")))
     strings = list(vocab)
     max_len = max(len(s) for s in strings)
     strings.sort(key=lambda s: len(s))
     strings_by_len = defaultdict(list)
     for k, g in groupby(strings, lambda s: len(s)):
         strings_by_len[k] = list(g)
     automata_by_len = {}
     for i in range(1, max_len + 1):
         if i not in strings_by_len:
             continue
         a = Automaton()
         for s in strings_by_len[i]:
             a.add_word(s, i)
         a.make_automaton()
         automata_by_len[i] = a
     end_time = datetime.now()
     info('finish building automata at {} (delta {})'.format(
         end_time.strftime("%H:%M:%S"), end_time - start_time))
     return automata_by_len
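One way these per-length automata might be consumed (a sketch, not from the original source): since the stored value is the key length, every hit converts directly into a (start, end) span:

def find_spans(text, automata_by_len):
    # each automaton stores the key length as its value
    spans = []
    for automaton in automata_by_len.values():
        for end_index, length in automaton.iter(text):
            spans.append((end_index - length + 1, end_index + 1))
    return sorted(spans)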
Example 3
class AutomatonCache(object):
    def __init__(self):
        self.latest = None
        self.matches = {}
        self.regexes = []

    def generate(self):
        with lock:
            self._generate()

    def _generate(self):
        latest = Entity.latest()
        if self.latest is not None and self.latest >= latest:
            return
        self.latest = latest

        matches = defaultdict(set)
        q = Entity.all()
        q = q.options(joinedload('other_names'))
        q = q.filter(Entity.state == Entity.STATE_ACTIVE)
        for entity in q:
            for term in entity.regex_terms:
                matches[term].add(entity.id)

        if not len(matches):
            self.automaton = None
            return

        self.automaton = Automaton()
        for term, entities in matches.items():
            self.automaton.add_word(term.encode('utf-8'), entities)
        self.automaton.make_automaton()
        log.info('Generated automaton with %s terms', len(matches))
Example 4
    def build_automaton(self):
        q = Entity.all()
        q = q.filter(Entity.schema.in_(self.TYPES.keys()))

        matches = {}
        for entity in q:
            tag = self.TYPES.get(entity.schema)
            if tag is None:
                continue
            for name in entity.names:
                if name is None or len(name) > 120:
                    continue
                match = self.match_form(name)
                if match is None:
                    continue
                if match in matches:
                    matches[match].append((name, tag))
                else:
                    matches[match] = [(name, tag)]

        if not len(matches):
            return

        automaton = Automaton()
        for term, entities in matches.items():
            automaton.add_word(term, entities)
        automaton.make_automaton()
        return automaton
Example 5
 def build_automata(vocab):
     # Build Aho-Corasick matching automata for vocabulary items
     # grouped by length. The wordpiece convention is inverted for
     # matching: continuations are unmarked (instead of "##") and
     # string start is marked by "^^".
     from ahocorasick import Automaton
     start_time = datetime.now()
     info('start building automata at {}'.format(
         start_time.strftime("%H:%M:%S")))
     strings = [v[2:] if v.startswith('##') else '^^' + v for v in vocab]
     max_len = max(len(s) for s in strings)
     strings.sort(key=lambda s: len(s))
     strings_by_len = defaultdict(list)
     for k, g in groupby(strings, lambda s: len(s)):
         strings_by_len[k] = list(g)
     automata_by_len = {}
     for i in range(1, max_len + 1):
         if i not in strings_by_len:
             continue
         a = Automaton()
         for s in strings_by_len[i]:
             a.add_word(s, i)
         a.make_automaton()
         automata_by_len[i] = a
     end_time = datetime.now()
     info('finish building automata at {} (delta {})'.format(
         end_time.strftime("%H:%M:%S"), end_time - start_time))
     return automata_by_len
Example 6
def make_wordlist(filepath):
    with open(filepath, 'r') as f:
        wordlist = Automaton()
        for idx, word in enumerate(set(Base().encode(t) for t in f.read().split())):
            wordlist.add_word(word, (idx, word))
    wordlist.make_automaton()  # finalize once, after all words are added
    return wordlist
Example 7
class AutomatonCache(object):

    def __init__(self):
        self.latest = None
        self.matches = {}

    def generate(self):
        with lock:
            self._generate()

    def _generate(self):
        latest = Entity.latest()
        if self.latest is not None and self.latest >= latest:
            return
        self.latest = latest

        matches = defaultdict(set)
        q = Entity.all()
        q = q.options(joinedload('other_names'))
        q = q.filter(Entity.state == Entity.STATE_ACTIVE)
        for entity in q:
            for term in entity.regex_terms:
                matches[term].add(entity.id)

        if not len(matches):
            self.automaton = None
            return

        self.automaton = Automaton()
        for term, entities in matches.items():
            self.automaton.add_word(term.encode('utf-8'), entities)
        self.automaton.make_automaton()
        log.info('Generated automaton with %s terms', len(matches))
Example 8
    def _get_keyword_processor(self, custom_vocab: List[str]):
        keyword_processor = Automaton()

        for i, keyword in enumerate(custom_vocab):
            if len(keyword) > 1:
                keyword_processor.add_word(keyword, (i, keyword))

        keyword_processor.make_automaton()
        return keyword_processor
Example 9
 def _make_kwtree(keywords):
     if keywords:
         kwtree = Automaton()
         for keyword in keywords:
             kwtree.add_word(keyword, keyword)
         kwtree.make_automaton()
     else:
         kwtree = None
     return kwtree
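Hedged usage note: because _make_kwtree returns None for an empty keyword list, callers need a guard before searching (the keywords below are illustrative):

kwtree = _make_kwtree(['he', 'she', 'his'])
if kwtree is not None:
    hits = [(end - len(word) + 1, word) for end, word in kwtree.iter('ushers')]
    # hits == [(1, 'she'), (2, 'he')]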
Example 10
def initialize_ac_automaton(kmers: pd.DataFrame):

    A = Automaton()

    for idx, kmer in enumerate(set(kmers['kmer'])):
        A.add_word(kmer, (idx, kmer))

    A.make_automaton()

    return A
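A possible follow-up (a sketch; sequence is a placeholder): scan a string for the k-mers loaded above, recovering start offsets from the stored k-mer:

A = initialize_ac_automaton(kmers)
for end, (idx, kmer) in A.iter(sequence):
    start = end - len(kmer) + 1
    print(kmer, start, end)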
Example 11
def benchmark_pyahocorasick(LINE):
    from ahocorasick import Automaton, STORE_INTS

    automaton = Automaton()
    for i, key in enumerate(KEYS):
        automaton.add_word(key, key)
    automaton.make_automaton()

    print(list(automaton.iter(LINE)))

    benchmark("list(automaton.iter(LINE))", locals())
Example 12
class AutomatonCache(object):

    TYPES = {
        'Person': DocumentTag.TYPE_PERSON,
        'Organization': DocumentTag.TYPE_ORGANIZATION,
        'Company': DocumentTag.TYPE_ORGANIZATION,
        'LegalEntity': DocumentTag.TYPE_PERSON,
    }

    def __init__(self):
        self.latest = None
        self.automaton = Automaton()
        self.matches = {}

    def generate(self):
        with lock:
            self._generate()

    def _generate(self):
        latest = Entity.latest()
        if latest is None:
            return
        if self.latest is not None and self.latest >= latest:
            return
        self.latest = latest

        matches = {}
        q = Entity.all()
        for entity in q:
            tag = self.TYPES.get(entity.schema)
            if tag is None:
                continue
            for name in entity.names:
                if name is None or len(name) > 120:
                    continue
                match = match_form(name)
                # TODO: this is a weird heuristic, but to avoid overly
                # aggressive matching it may make sense:
                if match is None or ' ' not in match:
                    continue
                if match in matches:
                    matches[match].append((name, tag))
                else:
                    matches[match] = [(name, tag)]

        if not len(matches):
            return

        for term, entities in matches.items():
            self.automaton.add_word(term.encode('utf-8'), entities)
        self.automaton.make_automaton()
        log.info('Generated automaton with %s terms', len(matches))
Example 13
 def build_automaton(vocab):
     # Build Aho-Corasick matching automaton for vocabulary items
     from ahocorasick import Automaton
     start_time = datetime.now()
     info('start building automaton at {}'.format(
         start_time.strftime("%H:%M:%S")))
     a = Automaton()
     for v in vocab:
         a.add_word(v, len(v))
     a.make_automaton()
     end_time = datetime.now()
      info('finish building automaton at {} (delta {})'.format(
         end_time.strftime("%H:%M:%S"), end_time - start_time))
     return a
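Since the stored value is each word's length, matches convert directly into spans (a sketch, with vocab and text as placeholders):

a = build_automaton(vocab)
spans = [(end - length + 1, end + 1) for end, length in a.iter(text)]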
Example 14
def init_automaton(scheme_fasta):
    """Initialize Aho-Corasick Automaton with kmers from SNV scheme fasta

    Args:
        scheme_fasta: SNV scheme fasta file path

    Returns:
         Aho-Corasick Automaton with kmers loaded
    """
    A = Automaton()
    for header, sequence in parse_fasta(scheme_fasta):
        A.add_word(sequence, (header, sequence, False))
        A.add_word(revcomp(sequence), (header, sequence, True))
    A.make_automaton()
    return A
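A hedged sketch of querying such an automaton (contig_sequence is a placeholder): each hit reports the scheme header, the matched k-mer, and whether the reverse complement matched:

A = init_automaton(scheme_fasta)
for end, (header, kmer, is_revcomp) in A.iter(contig_sequence):
    start = end - len(kmer) + 1
    print(header, start, end, 'revcomp' if is_revcomp else 'forward')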
Example 15
def _get_automaton(normalizer):
    with compiler_lock:
        if normalizer in AUTOMATA:
            return AUTOMATA.get(normalizer)
        aho = Automaton()
        count = 0
        for place in iter_places():
            name = place.get('name')
            norm = normalizer(name)
            value = (place.get('code'), place.get('country'))
            aho.add_word(norm, value)
            count += 1
        log.debug("Country automaton: %d places", count)
        aho.make_automaton()
        AUTOMATA[normalizer] = aho
        return aho
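Hedged usage sketch (text is a placeholder): the stored value is a (code, country) tuple, so matches read back as:

aho = _get_automaton(normalizer)
for end_index, (code, country) in aho.iter(normalizer(text)):
    print(code, country)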
Example 16
class AutomatonCache(object):

    TYPES = {
        'Person': DocumentTag.TYPE_PERSON,
        'Organization': DocumentTag.TYPE_ORGANIZATION,
        'Company': DocumentTag.TYPE_ORGANIZATION,
        'LegalEntity': DocumentTag.TYPE_PERSON,
    }

    def __init__(self):
        self.latest = None
        self.automaton = Automaton()
        self.matches = {}

    def generate(self):
        with lock:
            self._generate()

    def _generate(self):
        latest = Entity.latest()
        if latest is None:
            return
        if self.latest is not None and self.latest >= latest:
            return
        self.latest = latest

        matches = {}
        q = Entity.all()
        q = q.filter(Entity.state == Entity.STATE_ACTIVE)
        for entity in q:
            for term in entity.regex_terms:
                type_ = self.TYPES.get(entity.type)
                if type_ is None:
                    continue
                if term in matches:
                    matches[term].append((entity.name, type_))
                else:
                    matches[term] = [(entity.name, type_)]

        if not len(matches):
            return

        for term, entities in matches.items():
            self.automaton.add_word(term.encode('utf-8'), entities)
        self.automaton.make_automaton()
        log.info('Generated automaton with %s terms', len(matches))
Example 17
def init_automaton(scheme_fasta):
    """Initialize Aho-Corasick Automaton with kmers from SNV scheme fasta

    Args:
        scheme_fasta: SNV scheme fasta file path

    Returns:
         Aho-Corasick Automaton with kmers loaded
    """
    A = Automaton()
    for header, sequence in parse_fasta(scheme_fasta):
        kmer_list = expand_degenerate_bases(sequence)
        for seq in kmer_list:
            A.add_word(seq, (header, seq, False))
            A.add_word(revcomp(seq), (header, seq, True))
    A.make_automaton()
    return A
Example 18
class AhoCorasickPathGenerator:
    def __init__(self, identifier_mapper, identifiers):
        self.identifier_mapper = identifier_mapper
        self.identifiers = identifiers
        self.automaton = Automaton()
        for identifier in identifiers:
            mapped = identifier_mapper(identifier)
            self.automaton.add_word(identifier, (len(identifier), mapped))
        self.automaton.make_automaton()
        self.dest_dirs = set()

    def blind_path(self, path):
        out = ''
        idx = 0
        for end_position, (length, mapped) in self.automaton.iter(path):
            end_idx = end_position + 1
            start_idx = end_idx - length
            out += path[idx:start_idx] + mapped
            idx = end_idx
        out += path[idx:]
        return out

    def __call__(self, input_dir, output_dir):
        for root, dirs, files in os.walk(input_dir):
            for name in files:
                source_file_name = os.path.join(root, name)
                relpath = os.path.relpath(
                    source_file_name,
                    start=input_dir,
                )
                dest_file_name = output_dir / self.blind_path(relpath)
                self.dest_dirs.add(abspath(dest_file_name.parent))
                yield (
                    abspath(source_file_name),
                    abspath(dest_file_name),
                )

    @property
    def init_lines(self):
        return "\n".join(f'mkdir -p "{dest_dir}"'
                         for dest_dir in self.dest_dirs) + "\n"
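A small illustration of blind_path (the mapper and identifier are hypothetical, for exposition only):

gen = AhoCorasickPathGenerator(lambda s: 'ID', ['abc123'])
print(gen.blind_path('logs/abc123/run.txt'))  # -> logs/ID/run.txt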
Example 19
    def __create_automaton(self):
        paths = [
            ('Brand', os.path.join(Path.dictionary, 'Brand.txt')),
            ('Car', os.path.join(Path.dictionary, 'Car.txt')),
            ('Train', os.path.join(Path.dictionary, 'Train.txt')),
            ('Predicate', os.path.join(Path.dictionary, 'config.txt'))
        ]
        automaton = Automaton()
        for tag, path in paths:
            with open(path, 'r') as r_f:
                for line in r_f:
                    line = line.rstrip('\n')
                    _, *words = line.split('\t')
                    for word in words:
                        word = re.sub(r'\(.*?\)', '', word.lower())
                        _, tag_set = automaton.get(word, (word, set()))
                        tag_set.add(tag)
                        automaton.add_word(word, (word, tag_set))

        automaton.make_automaton()
        return automaton
Example 20
 def create_automaton(dict_dir, vocab_suffix, min_word_len=3):
     assert isinstance(min_word_len, int) or isinstance(min_word_len, dict)
     automaton = Automaton()
     if os.path.isdir(dict_dir):
         dicts_path = [
             os.path.join(dict_dir, i) for i in os.listdir(dict_dir)
             if i.endswith(vocab_suffix)
         ]
     else:
         dicts_path = [dict_dir]
     for path in dicts_path:
         # str.strip() removes a set of characters, not a suffix
         tag = os.path.split(path)[-1]
         if tag.endswith(vocab_suffix):
             tag = tag[:-len(vocab_suffix)]
         vocab = set(readfile(path, deal_func=lambda x: x.strip()))
         tag_min_word_len = min_word_len if isinstance(
             min_word_len, int) else min_word_len[tag]
         for word in vocab:
             word_len = len(word)
             if word_len >= tag_min_word_len:
                 automaton.add_word(word, (word_len, word, tag))
     automaton.make_automaton()
     return automaton
Example 21
 async def _update_links_automaton(self):
     """
     Fetch the latest version of the links from the table, build an automaton.
     """
     logger.info(
         "_update_links_automaton: fetching links from table %s",
         self._links_table,
     )
     try:
         links = await self._api.run_db_interaction(
             "Fetch links from the table", _db_fetch_links,
             self._links_table)
         logger.info("_update_links_automaton: we received %d links",
                     len(links))
         new_link_automaton = Automaton(ahocorasick.STORE_LENGTH)
         for link in links:
             new_link_automaton.add_word(link)
         await make_deferred_yieldable(
             deferToThread(new_link_automaton.make_automaton))
         self._link_automaton = new_link_automaton
     except Exception as e:
         logger.exception("_update_links_automaton: could not update")
         raise e
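Hedged sketch of the matching side (names are assumptions): a STORE_LENGTH automaton yields (end_index, word_length) pairs, so links can be located without storing values:

for end_index, length in self._link_automaton.iter(message_text):
    start = end_index - length + 1
    link = message_text[start:end_index + 1]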
Example 22
class Gazetteer:
    def __init__(self, gaze_file=data_path):
        self.locations = {}
        self.vocab_to_location = {}
        self.automaton = Automaton()

        with open(gaze_file) as cin:
            self.load_gazes(cin)

        self.automaton.make_automaton()

    def load_gazes(self, cin):
        for line in cin:
            line = line.split('\t')
            line[-1] = line[-1].rstrip()
            self.locations[line[0]] = tuple(line)

            for vocab in line[3:]:
                if vocab in self.vocab_to_location:
                    self.vocab_to_location[vocab].append(line[0])
                else:
                    self.vocab_to_location[vocab] = [line[0]]

        for vocab, value in self.vocab_to_location.items():
            self.automaton.add_word(vocab, tuple(value))

    def match(self, string):
        ret = {}

        for end_index, value in self.automaton.iter(string):
            for lid in value:
                if lid in ret:
                    ret[lid] = (ret[lid][0], ret[lid][1] + 1)
                else:
                    ret[lid] = (self.locations[lid], 1)

        return ret
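Hedged usage sketch (the input text is illustrative): match() returns a dict keyed by location id, with each value holding the location tuple and a hit count:

gaz = Gazetteer()
for lid, (location, count) in gaz.match('some input text').items():
    print(lid, location, count)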
Example 23
from ahocorasick import Automaton
from pickle import load, dump

auto = Automaton()
auto.add_word('abc', 'abc')
auto.add_word('def', 'def')

with open('automaton-wee.pickle', 'wb') as dest:
    dump(auto, dest)
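Note that the automaton above is pickled in trie state, before make_automaton(). A hedged sketch of the read side (presumably why load is imported above):

with open('automaton-wee.pickle', 'rb') as src:
    auto = load(src)
auto.make_automaton()  # finalize before iter() becomes available
print(list(auto.iter('abc def')))  # [(2, 'abc'), (6, 'def')]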
Example 24
import json
import random

from ahocorasick import Automaton

# python dictionary with trie patterns
file_words = json.load(open("../static_ioc_sample_30k.txt", "r"))
words_to_search = list()
trie_words = list()

total_words_to_search = 1000
total_words_added = 0

t = list()
patterns = dict()
total_initial_words = 0
total_iterations = 10  # CHANGE the number of iterations to perform: +/- 30k patterns per iteration.
A = Automaton()
for x in range(0, total_iterations):
    print("In iteration ", x)
    for key in file_words:
        for value in file_words[key]:
            value_random = value + str(random.randint(10000, 500000))
            if total_words_to_search != total_words_added:
                words_to_search.append(value)
                total_words_added += 1
            if x == 0:
                total_initial_words += 1
            A.add_word(value_random, value)

print(f"Initial words {total_initial_words}")
print(f"Total patterns on AC trie: {total_initial_words*total_iterations+1}")
A.make_automaton()
input()  #stop program to measure memory of the built AC trie
Example 25
from ahocorasick import Automaton
auto = Automaton()
auto.add_word('wounded', 'wounded')

auto.make_automaton()

for item in auto.iter('Winning \U0001F629 so gutted, can\'t do anything for 4 weeks... Myth. #wounded'):
    print(item)

for item in auto.iter('Winning so gutted, can\'t do anything for 4 weeks... Myth. #wounded'):
    print(item)
Example 26
class TrieTree:
    '''
    Prefix-tree (trie) class for matching text against dictionaries.

    Parameters
    ----------
    paths: one dictionary file name or a list of them (str or list). Each
        line must contain tab-separated columns:
        column 1: the word,
        column 2: the information attached to the word,
        column 3: a numeric value attached to that information (defaults
        to True when absent),
        e.g.:
        中国 LOC 0.8
        美国 国家

    tp: the match type, one of "c, m, mc" (default "mc"), meaning:
        c:  "BIES + _ + tag"
        m:  "BIES + _"
        mc: both "BIES + _" and "BIES + _ + tag"

    Return
    ------
    defaultdict(int, {idx_0: {feature: value}, idx_1: ...})
    A feature collection mapping each character index to its feature dict.


    Examples
    --------
    >>> trietree_c = TrieTree(paths=your_vocab_files, tp='c')
    >>> trietree_c("中国是一个国家")
    defaultdict(int, {0: {'B_LOC': True}, 1: {'E_LOC': True}})

    >>> trietree_m = TrieTree(paths=your_vocab_files, tp='m')
    >>> trietree_m("中国是一个国家")
    defaultdict(int, {0: {'B': True}, 1: {'E': True}})

    >>> trietree_mc = TrieTree(paths=your_vocab_files, tp='mc')
    >>> trietree_mc("中国是一个国家")
    defaultdict(int,
            {0: {'B': True, 'B_LOC': True}, 1: {'E': True, 'E_LOC': True}})

    '''
    def __init__(self,
                 vocab_paths,
                 vocab_match_type='mc',
                 drop_vocab_pro=0,
                 vocab_name_space=False,
                 separator='\t'):
        self.match_cnt = Counter()
        self.user_automaton = {}
        self.keep_vocab_pro = 1 - drop_vocab_pro
        self.vocab_name_space = vocab_name_space
        self.vmp = vocab_match_type
        self.load_vocab(vocab_paths, separator=separator)
        self.cnt = Counter()

        print('trietree:\ntp: %s\nvocab path: %s' %
              (self.vmp, str(vocab_paths)))
        if self.keep_vocab_pro < 1:
            print('drop vocab pro', self.keep_vocab_pro)

    def __call__(self, *args, **kwargs):
        vocab_feature = self._vocab_feature(*args, **kwargs)
        return vocab_feature

    def load_vocab(self, paths, add=False, separator='\t'):
        if add and hasattr(self, 'automaton'):
            pass
        else:
            self.automaton = Automaton()

        vocab = defaultdict(list)
        tags = set()
        if isinstance(paths, str):
            paths = [paths]
        for path in paths:
            name_space = os.path.split(path)[-1]
            print('read %s' % path)
            output = os.popen('wc -l ' + path)
            total = int(output.readline().split()[0])
            with open(path, 'r') as r_f:
                print('vocab file Examples:')
                for n, line in enumerate(r_f):
                    print(line.strip())
                    if n >= 10:
                        break
                r_f.seek(0)
                for line in tqdm(r_f, desc='read file', total=total):
                    if random.random() > self.keep_vocab_pro:
                        continue
                    splits = line.strip().split(separator)
                    try:
                        if len(splits) == 2:
                            word, tag = splits
                            value = True
                        elif len(splits) == 3:
                            word, tag, value = splits
                            value = char2num(value)

                        elif len(splits) == 1:
                            word = splits[0]
                            value = True
                            tag = 'WORD'

                        else:
                            continue

                        if self.vocab_name_space:
                            tag = name_space + '_' + tag
                        vocab[word].append((tag, value))
                        if tag not in tags:
                            tags.add(tag)

                    except Exception as e:
                        print('vocab error: path-%s, line %s' % (path, line),
                              e)
                        continue

        self.tags = tags if not hasattr(self, 'tags') else self.tags | tags

        for word, value in tqdm(vocab.items(), desc='add words'):
            self.automaton.add_word(word, (len(word), word, value))

        print('%s words in total' % len(vocab))
        self.automaton.make_automaton()

    def _vocab_feature(self, sentence):
        vocab_feature = defaultdict(dict)
        self.match(sentence, vocab_feature)
        if self.user_automaton:
            self.match(sentence, vocab_feature, base_or_user='user')

        return vocab_feature

    def match(self, sentence, vocab_feature, base_or_user='base'):

        if base_or_user == 'base':
            result = self.automaton.iter(sentence)
        else:
            result = self.user_automaton.iter(sentence)

        for end_idx, (word_len, _, tag_value) in list(result):

            start_idx = end_idx - word_len + 1
            for tag, value in tag_value:
                self.match_cnt[tag] += 1
                if self.vmp == 'c':
                    tagss = [create_tag(word_len, tag)]
                elif self.vmp == 'm':
                    tagss = [create_tag(word_len, '')]
                elif self.vmp == 'mc':
                    tagss = [
                        create_tag(word_len, tag),
                        create_tag(word_len, '')
                    ]
                else:
                    tagss = []
                for tags in tagss:
                    for idx, tag in zip(range(start_idx, end_idx + 1), tags):
                        vocab_feature[idx][tag] = value

    def init_user_automaton(self):
        self.user_automaton = Automaton()
        self.user_automaton.make_automaton()

    def add_word(self, word, tag, value, update=True):
        '''
        Parameters
        ----------
        word:  the word to match
        tag:   the information attached to the word
        value: the numeric value attached to the information

        Examples
        --------
        >>> trietree.add_word('中国', '国家', True)
        >>> trietree.user_automaton.get('中国')
        (2, '中国', [('国家', True)])
        '''
        have_add = ''
        if self.user_automaton == {}:
            self.init_user_automaton()
        wl, w, tag_values = self.user_automaton.get(word,
                                                    (len(word), word, []))
        for i, (t, v) in enumerate(tag_values):
            if t == tag:
                tag_values[i] = (tag, value)
                break
        else:
            tag_values.append((tag, value))
        self.user_automaton.add_word(w, (wl, w, tag_values))
        if update:
            self.user_automaton.make_automaton()

    def add_words(self, word_tag_values):
        '''
        Equivalent to:

        for word, tag, value in word_tag_values:
            self.add_word(word, tag, value, update=False)

        Examples
        --------
        >>> word_tag_values = [('中国', '面积', 9666), ('中国', '人口', 8888)]
        >>> trietree.add_words(word_tag_values)
        >>> trietree.user_automaton.get('中国')
        (2, '中国', [('面积', 9666), ('人口', 8888)])

        '''
        for word, tag, value in word_tag_values:
            self.add_word(word, tag, value, update=False)
        self.user_automaton.make_automaton()

    def get(self, key, default=None, vocab='all'):
        '''
        Same semantics as dict.get.

        Parameters
        ----------
        vocab:  which dictionary to query: 'base' (built-in), 'user'
            (user-defined) or 'all' (both). Defaults to 'all'.
        '''
        if vocab == 'base':
            value = self.automaton.get(key, default)
        elif vocab == 'user':
            value = self.user_automaton.get(key, default)
        else:
            value = {
                'base': self.automaton.get(key, default),
                'user': self.user_automaton.get(key, default)
            }
        return value
Example 27
from spacy.lang.en import English
from ahocorasick import Automaton

A = Automaton()
A.add_word("roadster", ('BRAND', 'roadster'))
A.add_word("top", ('PRODUCT', 'top'))
A.add_word("boot", ('PRODUCT', 'boot'))
A.add_word("sandal", ('PRODUCT', 'sandal'))
A.add_word("dress", ('PRODUCT', 'dress'))
A.add_word("glasses", ('PRODUCT', 'glasses'))
A.add_word("saucepan", ('PRODUCT', 'saucepan'))
A.add_word("earphone", ('PRODUCT', 'earphone'))
A.add_word("nylon", ('INFO', 'nylon'))
A.add_word("cotton", ('INFO', 'cotton'))
A.add_word("small", ('INFO', 'small'))
A.add_word("medium", ('INFO', 'medium'))
A.add_word("large", ('INFO', 'large'))
A.add_word("cord", ('PRODUCT', 'cord'))
A.add_word("cover", ('PRODUCT', 'cover'))
A.add_word("case", ('PRODUCT', 'case'))
A.add_word("phone", ('PRODUCT', 'phone'))
A.add_word("lg2000", ('BRAND', 'lg2000'))
A.add_word("shoe", ('PRODUCT', 'shoe'))
A.add_word("warmer", ('PRODUCT', 'warmer'))
A.add_word("shirt", ('PRODUCT', 'shirt'))
A.add_word("xl", ('INFO', 'xl'))
A.add_word("xxl", ('INFO', 'xxl'))
A.add_word("lunch box", ('PRODUCT', 'lunch box'))
A.add_word("battery", ('PRODUCT', 'battery'))
A.add_word("batteries", ('PRODUCT', 'batteries'))
A.add_word("nokia", ('BRAND', 'nokia'))
def build_trie(company_dict: dict) -> Automaton:
  trie = Automaton()
  for name, idx in company_dict.items():
    trie.add_word(name, (idx, name))
  return trie
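Hedged usage note: build_trie never calls make_automaton(), so the returned object is still in trie mode; a caller would finalize it before searching (the dictionary below is illustrative):

trie = build_trie({'acme corp': 0})
trie.make_automaton()
print(list(trie.iter('sued acme corp today')))  # [(13, (0, 'acme corp'))]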
Example 29
class ReadTagger:
	def __init__(
		self,
		bc_to_id: Dict[str, str],
		len_linker: int,
		len_primer: int,
		*,
		max_mm: int = 1,
		use_stats: bool = True
	):
		self.bc_to_id = bc_to_id
		self.len_linker = len_linker
		self.len_primer = len_primer
		self.stats = None if not use_stats else dict(
			n_only_primer=0,
			n_multiple_bcs=0,
			n_no_barcode=0,
			n_regular=0,
			n_barcode_mismatch=0,
			n_junk=0,
		)
		
		self.automaton = Automaton()
		all_barcodes, self.blacklist = get_all_barcodes(bc_to_id.keys(), max_mm=max_mm)
		for pattern, barcode in all_barcodes.items():
			self.automaton.add_word(pattern, barcode)
		self.automaton.make_automaton()
	
	def search_barcode(self, read: str) -> Iterator[Tuple[int, int, str]]:
		for end, barcode in self.automaton.iter(read):
			start = end - len(barcode) + 1
			yield start, end + 1, barcode
	
	def tag_read(self, header: str, seq_read: str, seq_qual: str) -> TaggedRead:
		# as ordered set
		matches = OrderedDict((match, None) for match in self.search_barcode(seq_read))
		
		match_iter: Iterator[Tuple[int, int, str]] = iter(matches)
		bc_start, bc_end, barcode = next(match_iter, (None, None, None))
		
		bc_id = self.bc_to_id.get(barcode)
		other_barcodes = frozenset(set(self.bc_to_id[bc] for _, _, bc in match_iter) - {bc_id})
		
		if barcode is not None:
			linker_end = bc_end + self.len_linker if bc_end else None
			
			junk = seq_read[:bc_start] or None
			linker = seq_read[bc_end:linker_end]
			amplicon = seq_read[linker_end:]
			barcode_mismatch = seq_read[bc_start:bc_end] != barcode
		else:
			junk = None
			linker = None
			amplicon = seq_read
			barcode_mismatch = False
		
		read = TaggedRead(
			header, seq_qual, self.len_primer, junk, bc_id,
			linker, amplicon, other_barcodes, barcode_mismatch,
		)
		
		if self.stats is not None:
			for name, pred in PREDS.items():
				if pred(read):
					self.stats[name] += 1
		
		return read
	
	def get_barcode_table(self, plain=False):
		cell_templates = {
			(True, True): '{}',
			(True, False): '<span class="b">{}</span>',
			(False, True): '<span class="a">{}</span>',
			(False, False): '<span class="both">{}</span>',
		}
		
		patterns = sorted({bc for bc_pairs in self.blacklist.values() for pair in bc_pairs for bc in pair})
		sprs = pd.DataFrame(index=patterns, columns=patterns, dtype=str)
		for pattern, bc_pairs in self.blacklist.items():
			for bc1, bc2 in bc_pairs:
				sprs.loc[bc1, bc2] = ''.join(
					cell_templates[bc1[i] == base, bc2[i] == base].format(base)
					for i, base in enumerate(pattern)
				)
		
		with pd.option_context('display.max_colwidth', -1):
			html = sprs.to_html(escape=False, na_rep='')
		
		if plain:
			return html
		return HTML_INTRO + html
Example 30
import discord
import requests
from discord.ext import commands
from ahocorasick import Automaton

import re
from constants import colors, paths, channels
from utils import make_embed, load_json, save_json, show_error

d = requests.get(
    "https://gist.githubusercontent.com/Vexs/629488c4bb4126ad2a9909309ed6bd71/raw/416403f7080d1b353d8517dfef5acec9aafda6c3/emoji_map.json"
).json()
unicode = Automaton()
for emoji in d.values():
    unicode.add_word(emoji, emoji)
unicode.make_automaton()

custom = re.compile("<a?:[a-zA-Z0-9_]{2,32}:[0-9]{18,22}>")
role = re.compile(
    r'<@&([0-9]{18,22})>|`(.*?)`|"(.*?)"|\((.*?)\)|\*(.*?)\*|-\s*(.*?)$')


def get_emoji(s):
    emoji = []
    emoji.extend(unicode.iter(s))
    emoji.extend((m.end(), m.group(0)) for m in custom.finditer(s))
    emoji.sort(key=lambda x: x[0])

    out = []
    for end_pos, text in emoji:
        if m := role.search(s, end_pos):
Example 31
import json
import random
from time import process_time

from ahocorasick import Automaton

# Setup shared with Example 24:
file_words = json.load(open("../static_ioc_sample_30k.txt", "r"))
words_to_search = list()
total_words_to_search = 1000
total_words_added = 0

t = list()
patterns = dict()
total_initial_words = 0
total_iterations = 10  # CHANGE the number of iterations to perform: +/- 30k patterns per iteration.
A = Automaton()
for x in range(0, total_iterations):
    print("In iteration ", x)
    for key in file_words:
        for value in file_words[key]:
            value = value + str(random.randint(10000, 500000))
            if total_words_to_search != total_words_added:
                words_to_search.append(value)
                total_words_added += 1
            if x == 0:
                total_initial_words += 1
            A.add_word(value, value)

print(f"Initial words {total_initial_words}")
print(f"Total patterns on AC trie: {total_initial_words*total_iterations+1}")
A.make_automaton()

start1 = process_time()
for word_to_search in words_to_search:
    start = process_time()
    end = 0
    for match in A.iter(word_to_search):
        pass
end1 = process_time()
print(
    f"Took {end1-start1}sec to match {len(words_to_search)} patterns on a AC automaton with {total_initial_words*total_iterations}"
)