class OrdinalSuffixTrie(PhraseFilter):
    def __init__(self, ordinal_rules):
        # Keys are stored reversed so that a suffix lookup on a reversed
        # token becomes an ordinary prefix lookup in the trie.
        self.trie = BytesTrie([(safe_decode(k)[::-1], safe_decode('|').join(v).encode('utf-8'))
                               for k, v in six.iteritems(ordinal_rules)])
        self.configured = True

    def search_substring(self, s):
        if len(s) == 0:
            return None, 0

        # Walk forward until the prefix no longer matches any key, then
        # back off one character to the longest matching prefix.
        for i in xrange(len(s) + 1):
            if not self.trie.has_keys_with_prefix(s[:i]):
                i -= 1
                break

        if i > 0:
            return (self.trie.get(s[:i]), i)
        else:
            return None, 0

    def search_suffix(self, token):
        suffix_search, suffix_len = self.search_substring(safe_decode(token[::-1]))
        if suffix_search:
            return suffix_search[0].split('|')
        else:
            return None
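# A self-contained sketch of the reversed-key trick above, using plain
# marisa_trie and no libpostal helpers; the suffix rules here are made up.
from marisa_trie import BytesTrie

rules = {u'st': [u'street', u'saint'], u'rd': [u'road']}
trie = BytesTrie([(k[::-1], u'|'.join(v).encode('utf-8'))
                  for k, v in rules.items()])

def search_suffix(token):
    s = token[::-1]
    i = 0
    # Longest matching prefix of the reversed token == longest suffix of the token.
    for i in range(len(s) + 1):
        if not trie.has_keys_with_prefix(s[:i]):
            i -= 1
            break
    match = trie.get(s[:i]) if i > 0 else None
    return match[0].split(b'|') if match else None

print(search_suffix(u'1st'))  # [b'street', b'saint']
print(search_suffix(u'oak'))  # None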
def configure(self, base_dir=DICTIONARIES_DIR):
    kvs = defaultdict(OrderedDict)
    for lang in os.listdir(DICTIONARIES_DIR):
        for filename in self.dictionaries:
            is_suffix_dictionary = 'suffixes' in filename
            is_prefix_dictionary = 'prefixes' in filename

            dictionary_name = filename.split('.', 1)[0]

            path = os.path.join(DICTIONARIES_DIR, lang, filename)
            if not os.path.exists(path):
                continue

            for line in open(path):
                line = line.strip()
                if not line:
                    continue

                phrases = safe_decode(line).split(u'|')
                if not phrases:
                    continue

                canonical = phrases[0]
                canonical_normalized = normalize_string(canonical)

                self.canonicals[(canonical, lang, dictionary_name)] = phrases[1:]

                for i, phrase in enumerate(phrases):
                    if phrase in POSSIBLE_ROMAN_NUMERALS:
                        continue

                    is_canonical = normalize_string(phrase) == canonical_normalized

                    if is_suffix_dictionary:
                        phrase = SUFFIX_KEY + phrase[::-1]
                    elif is_prefix_dictionary:
                        phrase = PREFIX_KEY + phrase

                    kvs[phrase][(lang, dictionary_name, canonical)] = is_canonical

    kvs = [(k, '|'.join([l, d, str(int(i)), safe_encode(c)]))
           for k, vals in kvs.iteritems()
           for (l, d, c), i in vals.iteritems()]

    self.trie = BytesTrie(kvs)
    self.configured = True
def __init__(self, *dictionaries):
    self.dictionaries = dictionaries
    self.canonicals = {}

    kvs = defaultdict(OrderedDict)
    for language in address_phrase_dictionaries.languages:
        for dictionary_name in self.dictionaries:
            is_suffix_dictionary = 'suffixes' in dictionary_name
            is_prefix_dictionary = 'prefixes' in dictionary_name

            for phrases in address_phrase_dictionaries.phrases.get((language, dictionary_name), []):
                canonical = phrases[0]
                canonical_normalized = normalize_string(canonical)

                self.canonicals[(canonical, language, dictionary_name)] = phrases[1:]

                for i, phrase in enumerate(phrases):
                    if phrase in POSSIBLE_ROMAN_NUMERALS:
                        continue

                    is_canonical = normalize_string(phrase) == canonical_normalized

                    if is_suffix_dictionary:
                        phrase = SUFFIX_KEY + phrase[::-1]
                    elif is_prefix_dictionary:
                        phrase = PREFIX_KEY + phrase

                    kvs[phrase][(language, dictionary_name, canonical)] = is_canonical

    kvs = [(k, '|'.join([l, d, str(int(i)), safe_encode(c)]))
           for k, vals in kvs.iteritems()
           for (l, d, c), i in vals.iteritems()]

    self.trie = BytesTrie(kvs)
def configure(self, base_dir=DICTIONARIES_DIR):
    kvs = defaultdict(OrderedDict)
    for lang in os.listdir(DICTIONARIES_DIR):
        for filename in self.dictionaries:
            is_suffix_dictionary = 'suffixes' in filename
            is_prefix_dictionary = 'prefixes' in filename
            is_street_types_dictionary = 'street_types' in filename
            is_stopword_dictionary = 'stopwords' in filename

            path = os.path.join(DICTIONARIES_DIR, lang, filename)
            if not os.path.exists(path):
                continue

            for line in open(path):
                line = line.strip()
                if not line:
                    continue

                phrases = safe_decode(line).split(u'|')
                if not phrases:
                    continue

                canonical = strip_accents(phrases[0])

                for phrase in phrases:
                    if phrase in POSSIBLE_ROMAN_NUMERALS:
                        continue

                    is_canonical = strip_accents(phrase) == canonical

                    if is_suffix_dictionary:
                        phrase = SUFFIX_KEY + phrase[::-1]
                    elif is_prefix_dictionary:
                        phrase = PREFIX_KEY + phrase

                    if is_canonical or is_street_types_dictionary or is_prefix_dictionary or is_suffix_dictionary:
                        kvs[phrase][lang] = (is_canonical, is_stopword_dictionary)

    kvs = [(k, '|'.join([v, str(int(c)), str(int(s))]))
           for k, vals in kvs.iteritems()
           for v, (c, s) in vals.iteritems()]

    self.trie = BytesTrie(kvs)
    self.configured = True
import json
import sys

from marisa_trie import BytesTrie

if __name__ == "__main__":
    lang = sys.argv[1]

    print("load mention_stat")
    with open("./mention_stat_{}.json".format(lang)) as f:
        data = json.load(f)

    print("mention_stat to trie")
    trie = BytesTrie([(k, bytes(json.dumps(v), "utf-8")) for k, v in data.items()])

    print("saving...")
    trie.save("mention_stat_{}.marisa".format(lang))
    print("Done!")
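# Not part of the original script: a minimal sketch of reading the trie back
# and querying one mention, assuming the script above was run with lang "ja".
# The key u"Tokyo" is a made-up example; BytesTrie is a multimap, so get()
# returns a list of bytes payloads (or None if the key is absent).
import json

from marisa_trie import BytesTrie

trie = BytesTrie()
trie.load("mention_stat_ja.marisa")  # file produced by the build script above

values = trie.get(u"Tokyo")
if values:
    stats = json.loads(values[0].decode("utf-8"))
    print(stats)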
class DictionaryPhraseFilter(PhraseFilter):
    serialize = safe_encode
    deserialize = safe_decode

    def __init__(self, *dictionaries):
        self.dictionaries = dictionaries
        self.canonicals = {}

        kvs = defaultdict(OrderedDict)
        for language in address_phrase_dictionaries.languages:
            for dictionary_name in self.dictionaries:
                is_suffix_dictionary = 'suffixes' in dictionary_name
                is_prefix_dictionary = 'prefixes' in dictionary_name

                for phrases in address_phrase_dictionaries.phrases.get((language, dictionary_name), []):
                    canonical = phrases[0]
                    canonical_normalized = normalize_string(canonical)

                    self.canonicals[(canonical, language, dictionary_name)] = phrases[1:]

                    for i, phrase in enumerate(phrases):
                        if phrase in POSSIBLE_ROMAN_NUMERALS:
                            continue

                        is_canonical = normalize_string(phrase) == canonical_normalized

                        if is_suffix_dictionary:
                            phrase = SUFFIX_KEY + phrase[::-1]
                        elif is_prefix_dictionary:
                            phrase = PREFIX_KEY + phrase

                        kvs[phrase][(language, dictionary_name, canonical)] = is_canonical

        kvs = [(k, '|'.join([l, d, str(int(i)), safe_encode(c)]))
               for k, vals in kvs.iteritems()
               for (l, d, c), i in vals.iteritems()]

        self.trie = BytesTrie(kvs)

    def serialize(self, s):
        return s

    def deserialize(self, s):
        return s

    def search_substring(self, s):
        if len(s) == 0:
            return None, 0

        for i in xrange(len(s) + 1):
            if not self.trie.has_keys_with_prefix(s[:i]):
                i -= 1
                break

        if i > 0:
            return (self.trie.get(s[:i]), i)
        else:
            return None, 0

    def search_suffix(self, token):
        suffix_search, suffix_len = self.search_substring(SUFFIX_KEY + token[::-1])
        if suffix_len > 0:
            suffix_len -= len(SUFFIX_KEY)
        return suffix_search, suffix_len

    def search_prefix(self, token):
        prefix_search, prefix_len = self.search_substring(PREFIX_KEY + token)
        if prefix_len > 0:
            prefix_len -= len(PREFIX_KEY)
        return prefix_search, prefix_len

    def basic_filter(self, tokens):
        return super(DictionaryPhraseFilter, self).filter(tokens)

    def filter(self, tokens):
        for p, t, data in self.basic_filter(tokens):
            if not p:
                t, c = t
                token = t
                token_len = len(token)

                suffix_search, suffix_len = self.search_suffix(token)
                if suffix_search and self.trie.get(token[(token_len - suffix_len):].rstrip('.')):
                    yield ([(t, c)], token_types.PHRASE, suffix_len, map(safe_decode, suffix_search))
                    continue

                prefix_search, prefix_len = self.search_prefix(token)
                if prefix_search and self.trie.get(token[:prefix_len]):
                    yield ([(t, c)], token_types.PHRASE, prefix_len, map(safe_decode, prefix_search))
                    continue
            else:
                c = token_types.PHRASE
            yield t, c, len(t), map(safe_decode, data)

    def gen_phrases(self, s, canonical_only=False, languages=None):
        tokens = tokenize(s)
        norm_tokens = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t, c) for t, c in tokens]

        if not languages:
            languages = None
        elif not hasattr(languages, '__iter__'):
            languages = [languages]

        # languages may still be None here; only coerce real iterables
        # (e.g. generators) that lack membership testing into a set
        if languages is not None and not hasattr(languages, '__contains__'):
            languages = set(languages)

        for t, c, length, data in self.filter(norm_tokens):
            if c == token_types.PHRASE:
                if not canonical_only and languages is None:
                    yield six.u(' ').join([t_i for t_i, c_i in t])
                else:
                    phrase = None
                    for d in data:
                        lang, dictionary, is_canonical, canonical = d.split(six.b('|'))
                        if (bool(int(is_canonical)) or not canonical_only) and (languages is None or lang in languages or lang == 'all'):
                            phrase = phrase if phrase is not None else six.u(' ').join([t_i for t_i, c_i in t])
                            yield phrase

    def string_contains_phrases(self, s, canonical_only=False, languages=None):
        phrases = self.gen_phrases(s, canonical_only=canonical_only, languages=languages)
        try:
            phrases.next()
            return True
        except StopIteration:
            return False

    def extract_phrases(self, s, canonical_only=False, languages=None):
        return set(self.gen_phrases(s, canonical_only=canonical_only, languages=languages))
import json


def mention2encands(mention, wbegin, wend, mention_stat, id2title, llfile):
    # cand2en() is defined elsewhere in the original module.
    try:
        cands = json.loads(mention_stat[mention][0])
        out = []
        total = 0
        for cand, count in cands.items():
            target = cand2en(cand, id2title, llfile)
            if target is not None and "(disambiguation)" not in target:
                out.append((target, count))
                total += count
        for i, (target, count) in enumerate(out):
            out[i] = (target, float(count) / float(total), (wbegin, wend))
        return out
    except KeyError:
        return None


if __name__ == "__main__":
    import sys

    from marisa_trie import BytesTrie

    mention = sys.argv[1]

    id2title = BytesTrie()
    id2title.load("../data/id2title.marisa")

    mention_stat = BytesTrie()
    mention_stat.load("../data/mention_stat_ja.marisa")

    dbpath = "../data/enwiki_page.db"
    # wbegin/wend are the mention's word offsets; pass placeholders here.
    print(mention2encands(mention, 0, 0, mention_stat, id2title, dbpath))
def _load_prefix(cls, resource_path: str) -> BytesTrie:
    prefixes = read_word_set(resource_filename(resource_path, 'prefix.txt'))
    prefixes = [e.split() for e in prefixes]
    return BytesTrie([(p[0], p[1].encode('utf-8')) for p in prefixes] +
                     [(p[0] + '-', p[1].encode('utf-8')) for p in prefixes])
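# A usage sketch under assumptions: prefix.txt presumably holds one
# "<prefix> <tag>" pair per line, which the loader above indexes both with and
# without a trailing hyphen. The pairs below are hypothetical.
from marisa_trie import BytesTrie

pairs = [u"anti negative", u"pre before"]
prefixes = [e.split() for e in pairs]
trie = BytesTrie([(p[0], p[1].encode('utf-8')) for p in prefixes] +
                 [(p[0] + '-', p[1].encode('utf-8')) for p in prefixes])

# prefixes() lists every stored key that is a prefix of the query string.
print(trie.prefixes(u"anti-war"))  # [u'anti', u'anti-']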
def __init__(self, ordinal_rules):
    self.trie = BytesTrie([(safe_decode(k)[::-1], safe_decode('|').join(v).encode('utf-8'))
                           for k, v in six.iteritems(ordinal_rules)])
    self.configured = True
class DictionaryPhraseFilter(PhraseFilter):
    def __init__(self, *dictionaries):
        self.dictionaries = dictionaries

    def serialize(self, s):
        return s

    def deserialize(self, s):
        return s

    def configure(self, base_dir=DICTIONARIES_DIR):
        kvs = defaultdict(OrderedDict)
        for lang in os.listdir(DICTIONARIES_DIR):
            for filename in self.dictionaries:
                is_suffix_dictionary = 'suffixes' in filename
                is_prefix_dictionary = 'prefixes' in filename
                is_street_types_dictionary = 'street_types' in filename
                is_stopword_dictionary = 'stopwords' in filename

                path = os.path.join(DICTIONARIES_DIR, lang, filename)
                if not os.path.exists(path):
                    continue

                for line in open(path):
                    line = line.strip()
                    if not line:
                        continue

                    phrases = safe_decode(line).split(u'|')
                    if not phrases:
                        continue

                    canonical = strip_accents(phrases[0])

                    for phrase in phrases:
                        if phrase in POSSIBLE_ROMAN_NUMERALS:
                            continue

                        is_canonical = strip_accents(phrase) == canonical

                        if is_suffix_dictionary:
                            phrase = SUFFIX_KEY + phrase[::-1]
                        elif is_prefix_dictionary:
                            phrase = PREFIX_KEY + phrase

                        if is_canonical or is_street_types_dictionary or is_prefix_dictionary or is_suffix_dictionary:
                            kvs[phrase][lang] = (is_canonical, is_stopword_dictionary)

        kvs = [(k, '|'.join([v, str(int(c)), str(int(s))]))
               for k, vals in kvs.iteritems()
               for v, (c, s) in vals.iteritems()]

        self.trie = BytesTrie(kvs)
        self.configured = True

    def search_substring(self, s):
        if len(s) == 0:
            return None, 0

        for i in xrange(len(s) + 1):
            if not self.trie.has_keys_with_prefix(s[:i]):
                i -= 1
                break

        if i > 0:
            return (self.trie.get(s[:i]), i)
        else:
            return None, 0

    def search_suffix(self, token):
        suffix_search, suffix_len = self.search_substring(SUFFIX_KEY + token[::-1])
        if suffix_len > 0:
            suffix_len -= len(SUFFIX_KEY)
        return suffix_search, suffix_len

    def search_prefix(self, token):
        prefix_search, prefix_len = self.search_substring(PREFIX_KEY + token)
        if prefix_len > 0:
            prefix_len -= len(PREFIX_KEY)
        return prefix_search, prefix_len

    def basic_filter(self, tokens):
        return super(DictionaryPhraseFilter, self).filter(tokens)

    def filter(self, tokens):
        for c, t, data in self.basic_filter(tokens):
            if c is not token_types.PHRASE:
                token = t[1]
                token_len = len(token)

                suffix_search, suffix_len = self.search_suffix(token)
                if suffix_search and self.trie.get(token[(token_len - suffix_len):].rstrip('.')):
                    yield (token_types.PHRASE, [(c,) + t], suffix_search)
                    continue

                prefix_search, prefix_len = self.search_prefix(token)
                if prefix_search and self.trie.get(token[:prefix_len]):
                    yield (token_types.PHRASE, [(c,) + t], prefix_search)
                    continue

            yield c, t, data
def __init__(self, phrases):
    if hasattr(phrases, 'items'):
        phrases = six.iteritems(phrases)

    vals = [(safe_decode(key), self.serialize(val)) for key, val in phrases]
    self.trie = BytesTrie(vals)
def load(datafile="../data/mention_stat.marisa"):
    trie = BytesTrie()
    trie.load(datafile)
    return trie
class DictionaryPhraseFilter(PhraseFilter):
    def __init__(self, *dictionaries):
        self.dictionaries = dictionaries
        self.canonicals = {}

    def serialize(self, s):
        return s

    def deserialize(self, s):
        return s

    def configure(self, base_dir=DICTIONARIES_DIR):
        kvs = defaultdict(OrderedDict)
        for lang in os.listdir(DICTIONARIES_DIR):
            for filename in self.dictionaries:
                is_suffix_dictionary = 'suffixes' in filename
                is_prefix_dictionary = 'prefixes' in filename

                dictionary_name = filename.split('.', 1)[0]

                path = os.path.join(DICTIONARIES_DIR, lang, filename)
                if not os.path.exists(path):
                    continue

                for line in open(path):
                    line = line.strip()
                    if not line:
                        continue

                    phrases = safe_decode(line).split(u'|')
                    if not phrases:
                        continue

                    canonical = phrases[0]
                    canonical_normalized = normalize_string(canonical)

                    self.canonicals[(canonical, lang, dictionary_name)] = phrases[1:]

                    for i, phrase in enumerate(phrases):
                        if phrase in POSSIBLE_ROMAN_NUMERALS:
                            continue

                        is_canonical = normalize_string(phrase) == canonical_normalized

                        if is_suffix_dictionary:
                            phrase = SUFFIX_KEY + phrase[::-1]
                        elif is_prefix_dictionary:
                            phrase = PREFIX_KEY + phrase

                        kvs[phrase][(lang, dictionary_name, canonical)] = is_canonical

        kvs = [(k, '|'.join([l, d, str(int(i)), safe_encode(c)]))
               for k, vals in kvs.iteritems()
               for (l, d, c), i in vals.iteritems()]

        self.trie = BytesTrie(kvs)
        self.configured = True

    def search_substring(self, s):
        if len(s) == 0:
            return None, 0

        for i in xrange(len(s) + 1):
            if not self.trie.has_keys_with_prefix(s[:i]):
                i -= 1
                break

        if i > 0:
            return (self.trie.get(s[:i]), i)
        else:
            return None, 0

    def search_suffix(self, token):
        suffix_search, suffix_len = self.search_substring(SUFFIX_KEY + token[::-1])
        if suffix_len > 0:
            suffix_len -= len(SUFFIX_KEY)
        return suffix_search, suffix_len

    def search_prefix(self, token):
        prefix_search, prefix_len = self.search_substring(PREFIX_KEY + token)
        if prefix_len > 0:
            prefix_len -= len(PREFIX_KEY)
        return prefix_search, prefix_len

    def basic_filter(self, tokens):
        return super(DictionaryPhraseFilter, self).filter(tokens)

    def filter(self, tokens):
        for p, t, data in self.basic_filter(tokens):
            if not p:
                t, c = t
                token = t
                token_len = len(token)

                suffix_search, suffix_len = self.search_suffix(token)
                if suffix_search and self.trie.get(token[(token_len - suffix_len):].rstrip('.')):
                    yield ([(t, c)], PHRASE, suffix_len, map(safe_decode, suffix_search))
                    continue

                prefix_search, prefix_len = self.search_prefix(token)
                if prefix_search and self.trie.get(token[:prefix_len]):
                    yield ([(t, c)], PHRASE, prefix_len, map(safe_decode, prefix_search))
                    continue
            else:
                c = PHRASE
            yield t, c, len(t), map(safe_decode, data)
def test_bytestrie():
    btrie = BytesTrie()
    btrie.mmap('tests/simple.bytestrie')
    print 'start bytes trie-------'
    print btrie.get('foo')
    print 'end bytes trie-------'
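# Not from the test suite: one plausible way the tests/simple.bytestrie fixture
# could be generated, assuming it maps 'foo' to a couple of payloads. BytesTrie
# allows duplicate keys, so get('foo') in the test would return both values.
from marisa_trie import BytesTrie

fixture = BytesTrie([(u'foo', b'bar'), (u'foo', b'baz')])
fixture.save('tests/simple.bytestrie')  # the test then mmaps this file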
from __future__ import print_function

import string
import sys

import requests
from lxml import html
from marisa_trie import BytesTrie


def iter_noslang():
    # `resource` and the `fucking_shit` replacement map are defined elsewhere
    # in the original script.
    for ch in "1" + string.lowercase:  # '1' for #
        url = "http://www.noslang.com/{}/{}".format(resource, ch)
        print("Processing " + url)
        r = requests.get(url)
        if not r.ok:
            print("Skipping {} (status code {})".format(ch, r.status_code), file=sys.stderr)
            continue
        page = html.fromstring(r.text)
        for abbr in page.cssselect("abbr"):
            a = abbr.getprevious()
            definition = abbr.attrib["title"].lower()
            if definition in fucking_shit:
                definition = fucking_shit[definition]
            else:
                for stars, replacement in fucking_shit.iteritems():
                    definition = definition.replace(stars, replacement)
            yield a.attrib["name"].decode("utf-8"), definition


if __name__ == "__main__":
    try:
        [path] = sys.argv[1:]
    except ValueError:
        print("Usage: [prog] path/to/trie", file=sys.stderr)
        sys.exit(1)

    abbr = BytesTrie(iter_noslang())
    abbr.save(path)