Code Example #1
File: wordlists.py Project: TeHikuMedia/reo-toolkit
def make_wordlist(filepath):
    with open(filepath, 'r') as f:
        wordlist = Automaton()
        for idx, word in enumerate(set(Base().encode(t) for t in f.read().split())):
            wordlist.add_word(word, (idx, word))
        # Finalize the automaton once, after all words have been added.
        wordlist.make_automaton()
    return wordlist
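A minimal usage sketch, not part of the original snippet: pyahocorasick's Automaton.iter() scans a string and yields (end_index, value) pairs, where value here is the (idx, word) tuple stored above. The file path and applying Base().encode to the query text are assumptions for illustration.

# Usage sketch (assumptions: 'words.txt' is a hypothetical word list, and the
# query text is normalised with the same Base().encode used when building).
wordlist = make_wordlist('words.txt')
for end_index, (idx, word) in wordlist.iter(Base().encode('some input text')):
    print(end_index, idx, word)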
Code Example #2
def build_automata(vocab):
    # Build Aho-Corasick matching automata for vocabulary items
    # grouped by length.
    from ahocorasick import Automaton
    start_time = datetime.now()
    info('start building automata at {}'.format(
        start_time.strftime("%H:%M:%S")))
    strings = list(vocab)
    max_len = max(len(s) for s in strings)
    strings.sort(key=lambda s: len(s))
    strings_by_len = defaultdict(list)
    for k, g in groupby(strings, lambda s: len(s)):
        strings_by_len[k] = list(g)
    automata_by_len = {}
    for i in range(1, max_len + 1):
        if i not in strings_by_len:
            continue
        a = Automaton()
        for s in strings_by_len[i]:
            a.add_word(s, i)
        a.make_automaton()
        automata_by_len[i] = a
    end_time = datetime.now()
    info('finish building automata at {} (delta {})'.format(
        end_time.strftime("%H:%M:%S"), end_time - start_time))
    return automata_by_len
Code Example #3
class AutomatonCache(object):

    def __init__(self):
        self.latest = None
        self.matches = {}

    def generate(self):
        with lock:
            self._generate()

    def _generate(self):
        latest = Entity.latest()
        if self.latest is not None and self.latest >= latest:
            return
        self.latest = latest

        matches = {}
        q = Entity.all()
        q = q.filter(Entity.state == Entity.STATE_ACTIVE)
        for entity in q:
            for term in entity.regex_terms:
                if term in matches:
                    matches[term].append(entity.id)
                else:
                    matches[term] = [entity.id]

        if not len(matches):
            self.automaton = None
            return

        self.automaton = Automaton()
        for term, entities in matches.items():
            self.automaton.add_word(term.encode('utf-8'), entities)
        self.automaton.make_automaton()
        log.info('Generated automaton with %s terms', len(matches))
Code Example #4
File: regex_entity.py Project: tomjie/aleph
class AutomatonCache(object):
    def __init__(self):
        self.latest = None
        self.matches = {}
        self.regexes = []

    def generate(self):
        with lock:
            self._generate()

    def _generate(self):
        latest = Entity.latest()
        if self.latest is not None and self.latest >= latest:
            return
        self.latest = latest

        matches = defaultdict(set)
        q = Entity.all()
        q = q.options(joinedload('other_names'))
        q = q.filter(Entity.state == Entity.STATE_ACTIVE)
        for entity in q:
            for term in entity.regex_terms:
                matches[term].add(entity.id)

        if not len(matches):
            self.automaton = None
            return

        self.automaton = Automaton()
        for term, entities in matches.items():
            self.automaton.add_word(term.encode('utf-8'), entities)
        self.automaton.make_automaton()
        log.info('Generated automaton with %s terms', len(matches))
Code Example #5
File: regex_entity.py Project: CodeForAfrica/aleph
class AutomatonCache(object):

    def __init__(self):
        self.latest = None
        self.matches = {}

    def generate(self):
        with lock:
            self._generate()

    def _generate(self):
        latest = Entity.latest()
        if self.latest is not None and self.latest >= latest:
            return
        self.latest = latest

        matches = defaultdict(set)
        q = Entity.all()
        q = q.options(joinedload('other_names'))
        q = q.filter(Entity.state == Entity.STATE_ACTIVE)
        for entity in q:
            for term in entity.regex_terms:
                matches[term].add(entity.id)

        if not len(matches):
            self.automaton = None
            return

        self.automaton = Automaton()
        for term, entities in matches.items():
            self.automaton.add_word(term.encode('utf-8'), entities)
        self.automaton.make_automaton()
        log.info('Generated automaton with %s terms', len(matches))
Code Example #6
def build_automata(vocab):
    # Build Aho-Corasick matching automata for vocabulary items
    # grouped by length. The wordpiece convention is inverted for
    # matching: continuations are unmarked (instead of "##") and
    # string start is marked by "^^".
    from ahocorasick import Automaton
    start_time = datetime.now()
    info('start building automata at {}'.format(
        start_time.strftime("%H:%M:%S")))
    strings = [v[2:] if v.startswith('##') else '^^' + v for v in vocab]
    max_len = max(len(s) for s in strings)
    strings.sort(key=lambda s: len(s))
    strings_by_len = defaultdict(list)
    for k, g in groupby(strings, lambda s: len(s)):
        strings_by_len[k] = list(g)
    automata_by_len = {}
    for i in range(1, max_len + 1):
        if i not in strings_by_len:
            continue
        a = Automaton()
        for s in strings_by_len[i]:
            a.add_word(s, i)
        a.make_automaton()
        automata_by_len[i] = a
    end_time = datetime.now()
    info('finish building automata at {} (delta {})'.format(
        end_time.strftime("%H:%M:%S"), end_time - start_time))
    return automata_by_len
Code Example #7
def test_add_concepts():
    data_path = prepare_data(
        path.join(path.dirname(__file__), "..", "data", "raw",
                  "vocabularies-tiny.zip"))

    dataframe = pd.read_csv(path.join(data_path, "CONCEPT.csv"),
                            sep="\t").dropna(subset=["concept_name"])
    automaton = Automaton()
    automaton = add_concepts(
        automaton, zip(dataframe["concept_name"], dataframe["concept_id"]))

    automaton.make_automaton()

    assert len(tuple(automaton.keys())) == 15791

    first_keys = sorted(automaton.keys())[:10]
    assert first_keys == [
        '% REF', '(1-6)-alpha-glucomannan', '1 alpha-hydroxyergocalciferol',
        "1,1',1'',1'''-(ethylenedinitrilo)tetra-2-propanol",
        '1,1,1-trichloro-2,2,2-trifluoroethane', '1,1-difluoroethane',
        '1,10-decanediol', '1,10-phenanthroline', '1,2,6-hexanetriol',
        '1,2-Dipalmitoylphosphatidylcholine'
    ]

    first_concept_id, first_concept_name = automaton.get(first_keys[0])

    assert (first_concept_id, first_concept_name) == (8514, '% REF')
Code Example #8
File: add_vocabulary.py Project: syzer/sse
def update_automaton(dataframe,
                     automaton_filename=path.join(PROCESSED_DATA_PATH,
                                                  "vocabulary_automaton.pkl")):
    # Assert we have the same number of concept names and ids.
    assert len(dataframe["concept_name"]) == len(dataframe["concept_id"])

    try:
        with open(automaton_filename, "rb") as automaton_file:
            automaton = pickle.load(automaton_file)

        logging.info("Loaded previous automaton from path '{}'.".format(
            automaton_filename))
    except FileNotFoundError:
        logging.info("Created new automaton.")
        automaton = Automaton()

    automaton = add_concepts(
        automaton, zip(dataframe["concept_name"], dataframe["concept_id"]))

    automaton.make_automaton()

    with open(automaton_filename, "wb") as automaton_file:
        pickle.dump(automaton, automaton_file)

    logging.info(
        "Updated automaton under path '{}'.".format(automaton_filename))
    return automaton
Code Example #9
File: corasick_entity.py Project: KarrieK/aleph
    def build_automaton(self):
        q = Entity.all()
        q = q.filter(Entity.schema.in_(self.TYPES.keys()))

        matches = {}
        for entity in q:
            tag = self.TYPES.get(entity.schema)
            if tag is None:
                continue
            for name in entity.names:
                if name is None or len(name) > 120:
                    continue
                match = self.match_form(name)
                if match is None:
                    continue
                if match in matches:
                    matches[match].append((name, tag))
                else:
                    matches[match] = [(name, tag)]

        if not len(matches):
            return

        automaton = Automaton()
        for term, entities in matches.items():
            automaton.add_word(term, entities)
        automaton.make_automaton()
        return automaton
Code Example #10
    def _get_keyword_processor(self, custom_vocab: List[str]):
        keyword_processor = Automaton()

        for i, keyword in enumerate(custom_vocab):
            if len(keyword) > 1:
                keyword_processor.add_word(keyword, (i, keyword))

        keyword_processor.make_automaton()
        return keyword_processor
Code Example #11
def _make_kwtree(keywords):
    if keywords:
        kwtree = Automaton()
        for keyword in keywords:
            kwtree.add_word(keyword, keyword)
        kwtree.make_automaton()
    else:
        kwtree = None
    return kwtree
Code Example #12
File: acm.py Project: dorbarker/kmer-mlst
def initialize_ac_automaton(kmers: pd.DataFrame):

    A = Automaton()

    for idx, kmer in enumerate(set(kmers['kmer'])):
        A.add_word(kmer, (idx, kmer))

    A.make_automaton()

    return A
Code Example #13
def benchmark_pyahocorasick(LINE):
    from ahocorasick import Automaton, STORE_INTS

    automaton = Automaton()
    for i, key in enumerate(KEYS):
        automaton.add_word(key, key)
    automaton.make_automaton()

    print(list(automaton.iter(LINE)))

    benchmark("list(automaton.iter(LINE))", locals())
Code Example #14
File: corasick_entity.py Project: DtorrX/aleph
class AutomatonCache(object):

    TYPES = {
        'Person': DocumentTag.TYPE_PERSON,
        'Organization': DocumentTag.TYPE_ORGANIZATION,
        'Company': DocumentTag.TYPE_ORGANIZATION,
        'LegalEntity': DocumentTag.TYPE_PERSON,
    }

    def __init__(self):
        self.latest = None
        self.automaton = Automaton()
        self.matches = {}

    def generate(self):
        with lock:
            self._generate()

    def _generate(self):
        latest = Entity.latest()
        if latest is None:
            return
        if self.latest is not None and self.latest >= latest:
            return
        self.latest = latest

        matches = {}
        q = Entity.all()
        for entity in q:
            tag = self.TYPES.get(entity.schema)
            if tag is None:
                continue
            for name in entity.names:
                if name is None or len(name) > 120:
                    continue
                match = match_form(name)
                # TODO: this is a weird heuristic, but to avoid overly
                # aggressive matching it may make sense:
                if match is None or ' ' not in match:
                    continue
                if match in matches:
                    matches[match].append((name, tag))
                else:
                    matches[match] = [(name, tag)]

        if not len(matches):
            return

        for term, entities in matches.items():
            self.automaton.add_word(term.encode('utf-8'), entities)
        self.automaton.make_automaton()
        log.info('Generated automaton with %s terms', len(matches))
Code Example #15
def build_automaton(vocab):
    # Build Aho-Corasick matching automaton for vocabulary items
    from ahocorasick import Automaton
    start_time = datetime.now()
    info('start building automaton at {}'.format(
        start_time.strftime("%H:%M:%S")))
    a = Automaton()
    for v in vocab:
        a.add_word(v, len(v))
    a.make_automaton()
    end_time = datetime.now()
    info('finish building automata at {} (delta {})'.format(
        end_time.strftime("%H:%M:%S"), end_time - start_time))
    return a
Code Example #16
def init_automaton(scheme_fasta):
    """Initialize Aho-Corasick Automaton with kmers from SNV scheme fasta

    Args:
        scheme_fasta: SNV scheme fasta file path

    Returns:
         Aho-Corasick Automaton with kmers loaded
    """
    A = Automaton()
    for header, sequence in parse_fasta(scheme_fasta):
        A.add_word(sequence, (header, sequence, False))
        A.add_word(revcomp(sequence), (header, sequence, True))
    A.make_automaton()
    return A
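A hedged usage sketch of the returned automaton (the fasta path below is hypothetical): scanning a read with Automaton.iter() yields (end_index, (header, kmer, is_revcomp)) tuples, and the match start can be recovered from the stored kmer's length.

# Usage sketch (assumption: 'scheme.fasta' is a hypothetical SNV scheme file).
A = init_automaton('scheme.fasta')
read = 'ACGTACGTACGT'
for end_index, (header, kmer, is_revcomp) in A.iter(read):
    start_index = end_index - len(kmer) + 1
    print(header, start_index, end_index, is_revcomp)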
Code Example #17
def _get_automaton(normalizer):
    with compiler_lock:
        if normalizer in AUTOMATA:
            return AUTOMATA.get(normalizer)
        aho = Automaton()
        count = 0
        for place in iter_places():
            name = place.get('name')
            norm = normalizer(name)
            value = (place.get('code'), place.get('country'))
            aho.add_word(norm, value)
            count += 1
        log.debug("Country automaton: %d places", count)
        aho.make_automaton()
        AUTOMATA[normalizer] = aho
        return aho
Code Example #18
class AutomatonCache(object):

    TYPES = {
        'Person': DocumentTag.TYPE_PERSON,
        'Organization': DocumentTag.TYPE_ORGANIZATION,
        'Company': DocumentTag.TYPE_ORGANIZATION,
        'LegalEntity': DocumentTag.TYPE_PERSON,
    }

    def __init__(self):
        self.latest = None
        self.automaton = Automaton()
        self.matches = {}

    def generate(self):
        with lock:
            self._generate()

    def _generate(self):
        latest = Entity.latest()
        if latest is None:
            return
        if self.latest is not None and self.latest >= latest:
            return
        self.latest = latest

        matches = {}
        q = Entity.all()
        q = q.filter(Entity.state == Entity.STATE_ACTIVE)
        for entity in q:
            for term in entity.regex_terms:
                type_ = self.TYPES.get(entity.type)
                if type_ is None:
                    continue
                if term in matches:
                    matches[term].append((entity.name, type_))
                else:
                    matches[term] = [(entity.name, type_)]

        if not len(matches):
            return

        for term, entities in matches.items():
            self.automaton.add_word(term.encode('utf-8'), entities)
        self.automaton.make_automaton()
        log.info('Generated automaton with %s terms', len(matches))
Code Example #19
def init_automaton(scheme_fasta):
    """Initialize Aho-Corasick Automaton with kmers from SNV scheme fasta

    Args:
        scheme_fasta: SNV scheme fasta file path

    Returns:
         Aho-Corasick Automaton with kmers loaded
    """
    A = Automaton()
    for header, sequence in parse_fasta(scheme_fasta):
        kmer_list = expand_degenerate_bases(sequence)
        for seq in kmer_list:
            A.add_word(seq, (header, seq, False))
            A.add_word(revcomp(seq), (header, seq, True))
    A.make_automaton()
    return A
Code Example #20
def test_match_text():
    data_path = prepare_data(
        path.join(path.dirname(__file__), "..", "data", "raw",
                  "vocabularies-tiny.zip"))

    dataframe = pd.read_csv(path.join(data_path, "CONCEPT.csv"),
                            sep="\t").dropna(subset=["concept_name"])
    automaton = Automaton()
    automaton = add_concepts(
        automaton, zip(dataframe["concept_name"], dataframe["concept_id"]))

    automaton.make_automaton()

    matches = list(generate_matches(automaton=automaton, text=dummy_abstract))
    match_soll_values = [(54, (46257025, 'ethyl acetate')),
                         (653, (45616149, 'formic acid')),
                         (785, (8512, 'day'))]
    assert matches == match_soll_values
Code Example #21
class AhoCorasickPathGenerator:
    def __init__(self, identifier_mapper, identifiers):
        self.identifier_mapper = identifier_mapper
        self.identifiers = identifiers
        self.automaton = Automaton()
        for identifier in identifiers:
            mapped = identifier_mapper(identifier)
            self.automaton.add_word(identifier, (len(identifier), mapped))
        self.automaton.make_automaton()
        self.dest_dirs = set()

    def blind_path(self, path):
        out = ''
        idx = 0
        for end_position, (length, mapped) in self.automaton.iter(path):
            end_idx = end_position + 1
            start_idx = end_idx - length
            out += path[idx:start_idx] + mapped
            idx = end_idx
        out += path[idx:]
        return out

    def __call__(self, input_dir, output_dir):
        for root, dirs, files in os.walk(input_dir):
            for name in files:
                source_file_name = os.path.join(root, name)
                relpath = os.path.relpath(
                    source_file_name,
                    start=input_dir,
                )
                dest_file_name = output_dir / self.blind_path(relpath)
                self.dest_dirs.add(abspath(dest_file_name.parent))
                yield (
                    abspath(source_file_name),
                    abspath(dest_file_name),
                )

    @property
    def init_lines(self):
        return "\n".join(f'mkdir -p "{dest_dir}"'
                         for dest_dir in self.dest_dirs) + "\n"
Code Example #22
def create_automaton(dict_dir, vocab_suffix, min_word_len=3):
    assert isinstance(min_word_len, int) or isinstance(min_word_len, dict)
    automaton = Automaton()
    if os.path.isdir(dict_dir):
        dicts_path = [
            os.path.join(dict_dir, i) for i in os.listdir(dict_dir)
            if i.endswith(vocab_suffix)
        ]
    else:
        dicts_path = [dict_dir]
    for path in dicts_path:
        tag = os.path.split(path)[-1].strip(vocab_suffix)
        vocab = set(readfile(path, deal_func=lambda x: x.strip()))
        tag_min_word_len = min_word_len if isinstance(
            min_word_len, int) else min_word_len[tag]
        for word in vocab:
            word_len = len(word)
            if word_len >= tag_min_word_len:
                automaton.add_word(word, (word_len, word, tag))
    automaton.make_automaton()
    return automaton
Code Example #23
File: dictionary_match.py Project: skywindy/carqabot
    def __create_automaton(self):
        paths = [
            ('Brand', os.path.join(Path.dictionary, 'Brand.txt')),
            ('Car', os.path.join(Path.dictionary, 'Car.txt')),
            ('Train', os.path.join(Path.dictionary, 'Train.txt')),
            ('Predicate', os.path.join(Path.dictionary, 'config.txt'))
        ]
        automaton = Automaton()
        for tag, path in paths:
            with open(path, 'r') as r_f:
                for line in r_f:
                    line = line.rstrip('\n')
                    _, *words = line.split('\t')
                    for word in words:
                        word = re.sub(r'\(.*?\)', '', word.lower())
                        _, tag_set = automaton.get(word, (word, set()))
                        tag_set.add(tag)
                        automaton.add_word(word, (word, tag_set))

        automaton.make_automaton()
        return automaton
Code Example #24
class Gazetteer:
    def __init__(self, gaze_file=data_path):
        self.locations = {}
        self.vocab_to_location = {}
        self.automaton = Automaton()

        with open(gaze_file) as cin:
            self.load_gazes(cin)

        self.automaton.make_automaton()

    def load_gazes(self, cin):
        for line in cin:
            line = line.split('\t')
            line[-1] = line[-1].rstrip()
            self.locations[line[0]] = tuple(line)

            for vocab in line[3:]:
                if vocab in self.vocab_to_location:
                    self.vocab_to_location[vocab].append(line[0])
                else:
                    self.vocab_to_location[vocab] = [line[0]]

        for vocab, value in self.vocab_to_location.items():
            self.automaton.add_word(vocab, tuple(value))

    def match(self, string):
        ret = {}

        for end_index, value in self.automaton.iter(string):
            for lid in value:
                if lid in ret:
                    ret[lid] = (ret[lid][0], ret[lid][1] + 1)
                else:
                    ret[lid] = (self.locations[lid], 1)

        return ret
Code Example #25
def build_trie(company_dict: dict) -> Automaton:
    trie = Automaton()
    for name, idx in company_dict.items():
        trie.add_word(name, (idx, name))
    trie.make_automaton()
    return trie
Code Example #26
# python dictionary with trie patterns
file_words = json.load(open("../static_ioc_sample_30k.txt", "r"))
words_to_search = list()
trie_words = list()

total_words_to_search = 1000
total_words_added = 0

t = list()
patterns = dict()
total_initial_words = 0
total_iterations = 10  # CHANGE the number of iterations to perform: +/- 30k patterns per iteration.
A = Automaton()
for x in range(0, total_iterations):
    print("In iteration ", x)
    for key in file_words:
        for value in file_words[key]:
            value_random = value + str(random.randint(10000, 500000))
            if total_words_to_search != total_words_added:
                words_to_search.append(value)
                total_words_added += 1
            if x == 0:
                total_initial_words += 1
            A.add_word(value_random, value)

print(f"Initial words {total_initial_words}")
print(f"Total patterns on AC trie: {total_initial_words*total_iterations+1}")
A.make_automaton()
input()  #stop program to measure memory of the built AC trie
Code Example #27
class ReadTagger:
	def __init__(
		self,
		bc_to_id: Dict[str, str],
		len_linker: int,
		len_primer: int,
		*,
		max_mm: int = 1,
		use_stats: bool = True
	):
		self.bc_to_id = bc_to_id
		self.len_linker = len_linker
		self.len_primer = len_primer
		self.stats = None if not use_stats else dict(
			n_only_primer=0,
			n_multiple_bcs=0,
			n_no_barcode=0,
			n_regular=0,
			n_barcode_mismatch=0,
			n_junk=0,
		)
		
		self.automaton = Automaton()
		all_barcodes, self.blacklist = get_all_barcodes(bc_to_id.keys(), max_mm=max_mm)
		for pattern, barcode in all_barcodes.items():
			self.automaton.add_word(pattern, barcode)
		self.automaton.make_automaton()
	
	def search_barcode(self, read: str) -> Tuple[int, int, str]:
		for end, barcode in self.automaton.iter(read):
			start = end - len(barcode) + 1
			yield start, end + 1, barcode
	
	def tag_read(self, header: str, seq_read: str, seq_qual: str) -> TaggedRead:
		# as ordered set
		matches = OrderedDict((match, None) for match in self.search_barcode(seq_read))
		
		match_iter: Iterator[Tuple[int, int, str]] = iter(matches)
		bc_start, bc_end, barcode = next(match_iter, (None, None, None))
		
		bc_id = self.bc_to_id.get(barcode)
		other_barcodes = frozenset(set(self.bc_to_id[bc] for _, _, bc in match_iter) - {bc_id})
		
		if barcode is not None:
			linker_end = bc_end + self.len_linker if bc_end else None
			
			junk = seq_read[:bc_start] or None
			linker = seq_read[bc_end:linker_end]
			amplicon = seq_read[linker_end:]
			barcode_mismatch = seq_read[bc_start:bc_end] != barcode
		else:
			junk = None
			linker = None
			amplicon = seq_read
			barcode_mismatch = False
		
		read = TaggedRead(
			header, seq_qual, self.len_primer, junk, bc_id,
			linker, amplicon, other_barcodes, barcode_mismatch,
		)
		
		if self.stats is not None:
			for name, pred in PREDS.items():
				if pred(read):
					self.stats[name] += 1
		
		return read
	
	def get_barcode_table(self, plain=False):
		cell_templates = {
			(True, True): '{}',
			(True, False): '<span class="b">{}</span>',
			(False, True): '<span class="a">{}</span>',
			(False, False): '<span class="both">{}</span>',
		}
		
		patterns = sorted({bc for bc_pairs in self.blacklist.values() for pair in bc_pairs for bc in pair})
		sprs = pd.DataFrame(index=patterns, columns=patterns, dtype=str)
		for pattern, bc_pairs in self.blacklist.items():
			for bc1, bc2 in bc_pairs:
				sprs.loc[bc1, bc2] = ''.join(
					cell_templates[bc1[i] == base, bc2[i] == base].format(base)
					for i, base in enumerate(pattern)
				)
		
		with pd.option_context('display.max_colwidth', -1):
			html = sprs.to_html(escape=False, na_rep='')
		
		if plain:
			return html
		return HTML_INTRO + html
Code Example #28
File: issue_53.py Project: zhu/pyahocorasick
from ahocorasick import Automaton
auto = Automaton()
auto.add_word('wounded', 'wounded')

auto.make_automaton()

for item in auto.iter('Winning \U0001F629 so gutted, can\'t do anything for 4 weeks... Myth. #wounded'):
    print(item)

for item in auto.iter('Winning so gutted, can\'t do anything for 4 weeks... Myth. #wounded'):
    print(item)
Code Example #29
class TrieTree:
    '''
    Prefix-tree (trie) class for matching against dictionaries.

    Parameters
    ----------
    paths: one dictionary file name or a list of them (str or list). Each file has
        tab-separated columns:
        column 1: the word,
        column 2: the information (tag) associated with the word,
        column 3: a numeric value attached to that information (defaults to True if absent)
        e.g.:
        中国 LOC 0.8
        美国 国家

    tp: match type, one of "c, m, mc" (default "mc"), corresponding to:
        c:  "BIES + _ + tag"
        m:  "BIES + _"
        mc: both "BIES + _" and "BIES + _ + tag"

    Return
    ------
    defaultdict(int, {idx_0: {feature: value}, idx_1: ...})
    A feature collection mapping each token index to its feature dict.


    Examples
    --------
    >>> trietree_c = TrieTree(paths=your_vocab_files, tp='c')
    >>> trietree_c("中国是一个国家")
    defaultdict(int, {0: {'B_LOC': True}, 1: {'E_LOC': True}})

    >>> trietree_m = TrieTree(paths=your_vocab_files, tp='m')
    >>> trietree_m("中国是一个国家")
    defaultdict(int, {0: {'B': True}, 1: {'E': True}})

    >>> trietree_mc = TrieTree(paths=your_vocab_files, tp='mc')
    >>> trietree_mc("中国是一个国家")
    defaultdict(int,
            {0: {'B': True, 'B_LOC': True}, 1: {'E': True, 'E_LOC': True}})

    '''
    def __init__(self,
                 vocab_paths,
                 vocab_match_type='mc',
                 drop_vocab_pro=0,
                 vocab_name_space=False,
                 separator='\t'):
        self.match_cnt = Counter()
        self.user_automaton = {}
        self.keep_vocab_pro = 1 - drop_vocab_pro
        self.vocab_name_space = vocab_name_space
        self.vmp = vocab_match_type
        self.load_vocab(vocab_paths, separator=separator)
        self.cnt = Counter()

        print('trietree:\ntp: %s\n, vocab path:%s' %
              (self.vmp, str(vocab_paths)))
        if self.keep_vocab_pro < 1:
            print('drop vocab pro', self.keep_vocab_pro)

    def __call__(self, *args, **kwargs):
        vocab_feature = self._vocab_feature(*args, **kwargs)
        return vocab_feature

    def load_vocab(self, paths, add=False, separator='\t'):
        if add and hasattr(self, 'automaton'):
            pass
        else:
            self.automaton = Automaton()

        vocab = defaultdict(list)
        tags = set()
        if isinstance(paths, str):
            paths = [paths]
        for path in paths:
            name_space = os.path.split(path)[-1]
            print('read %s' % path)
            output = os.popen('wc -l ' + path)
            total = int(output.readline().split()[0])
            with open(path, 'r') as r_f:
                print('vocab file Examples:')
                for n, line in enumerate(r_f):
                    print(line.strip())
                    if n >= 10:
                        break
                r_f.seek(0)
                for line in tqdm(r_f, desc='read file', total=total):
                    if random.random() > self.keep_vocab_pro:
                        continue
                    splits = line.strip().split(separator)
                    try:
                        if len(splits) == 2:
                            word, tag = splits
                            value = True
                        elif len(splits) == 3:
                            word, tag, value = splits
                            value = char2num(value)

                        elif len(splits) == 1:
                            word = splits[0]
                            value = True
                            tag = 'WORD'

                        else:
                            continue

                        if self.vocab_name_space:
                            tag = name_space + '_' + tag
                        vocab[word].append((tag, value))
                        if tag not in tags:
                            tags.add(tag)

                    except Exception as e:
                        print('vocab error: path-%s, line %s' % (path, line),
                              e)
                        continue

        self.tags = tags if not hasattr(self, 'tags') else self.tags | tags

        for word, value in tqdm(vocab.items(), desc='add words'):
            self.automaton.add_word(word, (len(word), word, value))

        print('%s words in total' % len(vocab))
        self.automaton.make_automaton()

    def _vocab_feature(self, sentence):
        vocab_feature = defaultdict(dict)
        self.match(sentence, vocab_feature)
        if self.user_automaton:
            self.match(sentence, vocab_feature, base_or_user='user')

        return vocab_feature

    def match(self, sentence, vocab_feature, base_or_user='base'):

        if base_or_user == 'base':
            result = self.automaton.iter(sentence)
        else:
            result = self.user_automaton.iter(sentence)

        for end_idx, (word_len, _, tag_value) in list(result):

            start_idx = end_idx - word_len + 1
            for tag, value in tag_value:
                self.match_cnt[tag] += 1
                if self.vmp == 'c':
                    tagss = [create_tag(word_len, tag)]
                elif self.vmp == 'm':
                    tagss = [create_tag(word_len, '')]
                elif self.vmp == 'mc':
                    tagss = [
                        create_tag(word_len, tag),
                        create_tag(word_len, '')
                    ]
                else:
                    tagss = []
                for tags in tagss:
                    for idx, tag in zip(range(start_idx, end_idx + 1), tags):
                        vocab_feature[idx][tag] = value

    def init_user_automaton(self):
        self.user_automaton = Automaton()
        self.user_automaton.make_automaton()

    def add_word(self, word, tag, value, update=True):
        '''
        Parameters
        ----------
        word:  the word to match
        tag:   the information (tag) associated with the word
        value: the numeric value attached to that information

        Examples
        --------
        >>> trietree.add_word('中国', '国家', True)
        >>> trietree.user_automaton.get('中国')
        (2, '中国', [('LOC', True)])
        '''
        have_add = ''
        if self.user_automaton == {}:
            self.init_user_automaton()
        wl, w, tag_values = self.user_automaton.get(word,
                                                    (len(word), word, []))
        for i, (t, v) in enumerate(tag_values):
            if t == tag:
                tag_values[i] = (tag, value)
                break
        else:
            tag_values.append((tag, value))
        self.user_automaton.add_word(w, (wl, w, tag_values))
        if update:
            self.user_automaton.make_automaton()

    def add_words(self, word_tag_values):
        '''
        do:

        for word, tag, value in word_tag_values:
            self.add_word(word, tag, value, update=False)



        Examples
        --------
        word_tag_values = [('中国', '面积', 9666), ('中国', '人口', 8888)]
        >>> trietree.add_word('中国', '国家', True)
        >>> trietree.user_automaton.get('中国')
        (2, '中国', [('面积', 9666), ('人口', 8888)])

        '''
        for word, tag, value in word_tag_values:
            self.add_word(word, tag, value, update=False)
        self.user_automaton.make_automaton()

    def get(self, key, default=None, vocab='all'):
        '''
        Same as the dict.get method.

        Parameters
        ----------
        vocab:  selects the dictionary to query: 'base' (built-in), 'user' (user-defined) or 'all' (both); defaults to 'all'
        '''
        if vocab == 'base':
            value = self.automaton.get(key, default)
        elif vocab == 'user':
            value = self.user_automaton.get(key, default)
        else:
            value = {
                'base': self.automaton.get(key, default),
                'user': self.user_automaton.get(key, default)
            }
        return value
Code Example #30
import discord
import requests
from discord.ext import commands
from ahocorasick import Automaton

import re
from constants import colors, paths, channels
from utils import make_embed, load_json, save_json, show_error

d = requests.get(
    "https://gist.githubusercontent.com/Vexs/629488c4bb4126ad2a9909309ed6bd71/raw/416403f7080d1b353d8517dfef5acec9aafda6c3/emoji_map.json"
).json()
unicode = Automaton()
for emoji in d.values():
    unicode.add_word(emoji, emoji)
unicode.make_automaton()

custom = re.compile("<a?:[a-zA-Z0-9_]{2,32}:[0-9]{18,22}>")
role = re.compile(
    r'<@&([0-9]{18,22})>|`(.*?)`|"(.*?)"|\((.*?)\)|\*(.*?)\*|-\s*(.*?)$')


def get_emoji(s):
    emoji = []
    emoji.extend(unicode.iter(s))
    emoji.extend((m.end(), m.group(0)) for m in custom.finditer(s))
    emoji.sort(key=lambda x: x[0])

    out = []
    for end_pos, text in emoji:
        if m := role.search(s, end_pos):