def load(in_file: str, mmap_mode="r"):
    """Load an ``InterwikiDB`` from a joblib file.

    :param in_file: path passed to ``joblib.load``
    :param mmap_mode: forwarded to ``joblib.load`` as ``mmap_mode``
    :return: ``InterwikiDB`` built from the loaded mapping, with the
        ``"title_trie"`` entry replaced by a trie restored from its bytes
    """
    payload = joblib.load(in_file, mmap_mode=mmap_mode)
    # The stored entry holds serialized bytes; keep whatever frombytes()
    # returns, as the original code does.
    payload["title_trie"] = Trie().frombytes(payload["title_trie"])
    return InterwikiDB(**payload)
def dict_trie(dict_source: Union[str, Iterable[str], Trie]) -> Trie:
    """
    Create a dict trie which will be used for word_tokenize() function.
    For more information on the trie data structure,
    see: https://marisa-trie.readthedocs.io/en/latest/index.html

    :param string/list dict_source: a list of vocabularies, a path to a
        source file (one entry per line, UTF-8), or an existing trie
    :return: a trie created from a dictionary input; an existing trie is
        returned unchanged
    :raises TypeError: if ``dict_source`` is none of the supported types
    """
    # A Trie is itself iterable, so it has to be recognised before the
    # generic Iterable check; otherwise it would be rebuilt needlessly.
    if isinstance(dict_source, Trie):
        return dict_source

    if isinstance(dict_source, str):
        # Receive a file path of the dict to read
        with open(dict_source, "r", encoding="utf8") as f:
            _vocabs = f.read().splitlines()
        return Trie(_vocabs)

    if isinstance(dict_source, Iterable):
        # Received a sequence type object of vocabs
        return Trie(dict_source)

    raise TypeError(
        "Type of dict_source must be marisa_trie.Trie, or Iterable[str], or str (path to source file)"
    )
def __init__(self, word_file):
    """Build ``self.trie`` from a file containing one word per line.

    Each line is lower-cased and stripped of surrounding whitespace
    before being added.
    """
    # TODO: Check input file exists, is readable, valid, etc
    with open(word_file) as source:
        normalized = [line.lower().strip() for line in source]
    self.trie = Trie(normalized)
def train(corpus_file, out_file, mode, dim_size, window, min_count,
          negative, epoch, pool_size, chunk_size):
    """Train a Word2Vec model on a bz2 corpus and dump embeddings with joblib.

    Vocabulary items starting with ``MARKER`` are treated as entities (the
    marker is removed and underscores become spaces); everything else is a
    word. The dumped dict has keys ``word_embedding``, ``entity_embedding``
    and ``vocab``.

    ``mode == 'sg'`` selects ``sg=1``; any other value gives ``sg=0``.
    ``chunk_size`` is accepted but not used by this function.
    """
    use_sg = int(mode == 'sg')
    with bz2.BZ2File(corpus_file) as corpus:
        model = Word2Vec(LineSentence(corpus), size=dim_size, window=window,
                         min_count=min_count, workers=pool_size, iter=epoch,
                         negative=negative, sg=use_sg)

    # Split the model vocabulary into plain words and entity titles.
    words = []
    entities = []
    for (token, _) in model.vocab.iteritems():
        if not token.startswith(MARKER):
            words.append(token)
            continue
        entities.append(token[len(MARKER):].replace(u'_', u' '))

    vocab = Vocab(Trie(words), Trie(entities))

    word_embedding = np.zeros((len(words), dim_size), dtype=np.float32)
    entity_embedding = np.zeros((len(entities), dim_size), dtype=np.float32)

    # Rows are placed at the index the vocabulary assigns to each item.
    for token in words:
        word_embedding[vocab.get_word_index(token)] = model[token]
    for title in entities:
        model_key = MARKER + title.replace(u' ', u'_')
        entity_embedding[vocab.get_entity_index(title)] = model[model_key]

    joblib.dump(
        dict(
            word_embedding=word_embedding,
            entity_embedding=entity_embedding,
            vocab=vocab,
        ),
        out_file,
        compress=False,
    )
def __init__(self, dic, start_index=0):
    """Store the dictionary trie and the starting index.

    ``dic`` is used as-is when it is already a ``Trie``; otherwise a new
    ``Trie`` is built from it.
    """
    self._dic = dic if isinstance(dic, Trie) else Trie(dic)
    self._start_index = start_index
def create_custom_dict_trie(custom_dict_source):
    """The function is used to create a custom dict trie which will be used
    for word_tokenize() function

    Arguments:
        custom_dict_source {string or list} -- a list of vocabularies or a
            path to source file (read as UTF-8, one entry per line)

    Raises:
        TypeError -- Invalid custom_dict_source's object type

    Returns:
        Trie -- A trie created from custom dict input
    """
    if isinstance(custom_dict_source, str):
        # Receive a file path of the custom dict to read
        with codecs.open(custom_dict_source, 'r', encoding='utf8') as f:
            _vocabs = f.read().splitlines()
        return Trie(_vocabs)

    if isinstance(custom_dict_source, (list, tuple, set)):
        # Received a sequence type object of vocabs
        return Trie(custom_dict_source)

    raise TypeError(
        'Type of custom_dict_source must be either str (path to source file) or collections'
    )
def create_vocabulary(ngram=1, test=False):
    """
    Creates the vocabulary trie for the given ngram level and saves it.

    :param ngram: ngram level; also used in the output file name
    :param test: forwarded to get_all_tokens_in_docs (per the original
        docstring: if true, only the first 10000 documents are used)
    :return: None

    Steps:
    - Get a set of all tokens
    - Retain only the valid ones
    - Save the resulting trie under PATH_TOKENIZED
    """
    add_valid_words()

    # get a set of all tokens of the ngram level
    print("here")
    token_set = get_all_tokens_in_docs(ngram, test)
    print("Total tokens before merging: ", len(token_set))

    vocabulary_trie = Trie(valid_ngram_iterator(token_set, ngram))
    trie_path = PATH_TOKENIZED + 'tries/full_vocabulary_{}_grams.trie'.format(ngram)
    vocabulary_trie.save(trie_path)
    print("Total tokens after merging", len(vocabulary_trie))
def recurse_generate(words: list, trie: Trie, square_size: int, chosen_words_length=0) -> list:
    """Depth-first search for a word square, one row at a time.

    :param words: rows chosen so far
    :param trie: trie of candidate words
    :param square_size: side length of the square
    :param chosen_words_length: number of rows already chosen
    :return: the completed list of rows, or None when this branch fails
    """
    depth = chosen_words_length
    if depth >= square_size or square_size <= 1:
        return words if check_solution_is_valid(words, square_size) else None

    # Build up the solution letter by letter: each remaining column, read
    # top to bottom, must still be the prefix of some key in the trie;
    # otherwise the current permutation cannot be a solution.
    for column in range(depth, square_size):
        column_prefix = "".join(row[column] for row in words)
        # using the soon to be deprecated function because it runs ~30% faster
        if not trie.has_keys_with_prefix(column_prefix):
            return None

    # The next row has to start with the letters already fixed in its column.
    row_prefix = "".join(row[depth] for row in words)
    for candidate in trie.iterkeys(row_prefix):
        solution = recurse_generate(words + [candidate], trie, square_size, depth + 1)
        if solution:
            return solution
    return None
def load(in_file, mmap_mode='r'):
    """Load an ``EntityDB`` from a joblib file.

    The loaded object must provide ``'title_dict'`` and ``'redirect_dict'``
    (serialized trie bytes) and ``'inlink_arr'``.
    """
    stored = joblib.load(in_file, mmap_mode=mmap_mode)

    titles = Trie()
    redirects = RecordTrie('<I')
    titles.frombytes(stored['title_dict'])
    redirects.frombytes(stored['redirect_dict'])

    return EntityDB(titles, redirects, stored['inlink_arr'])
def load(in_file, mmap=True):
    """Load an ``EntityDB`` from files sharing the prefix ``in_file``.

    Reads ``<in_file>_title.trie`` and ``<in_file>_redirect.trie`` via
    ``mmap`` and ``<in_file>_prior.npy`` via ``np.load(mmap_mode='r')``.
    The ``mmap`` argument is accepted but not consulted here.
    """
    titles = Trie()
    redirects = RecordTrie('<I')

    titles.mmap(in_file + '_title.trie')
    redirects.mmap(in_file + '_redirect.trie')
    inlinks = np.load(in_file + '_prior.npy', mmap_mode='r')

    return EntityDB(titles, redirects, inlinks)
def load(input):
    """Build a ``WordVocab`` from a dict or from a joblib file.

    ``input`` is used directly when it is a dict; otherwise it is passed to
    ``joblib.load``. The object must provide ``'dic'`` (trie bytes) and
    ``'lowercase'``; ``'start_index'`` defaults to 0 when absent.
    """
    obj = input if isinstance(input, dict) else joblib.load(input)

    dic = Trie()
    dic.frombytes(obj['dic'])

    start_index = obj.get('start_index', 0)
    return WordVocab(dic, obj['lowercase'], start_index)
def loadTrie(fname):
    """Load the global ``trie`` from ``<fname>_trie.hny``, building it if needed.

    If loading raises ``IOError``, the word list is read from the bz2 file at
    ``dir_path + sys.argv[1]`` (one word per line, stripped), a new trie is
    built from it and saved to ``<fname>_trie.hny``.

    NOTE(review): the fallback source comes from ``sys.argv[1]``, not from
    ``fname`` -- confirm this is intended.
    """
    global trie
    fname = fname + "_trie.hny"
    try:
        trie.load(fname)
    except IOError:
        # Close the compressed source deterministically instead of leaking
        # the handle until garbage collection.
        with bz2.BZ2File(dir_path + sys.argv[1]) as f:
            words = [w.strip() for w in f.readlines()]
        trie = Trie(words)
        trie.save(fname)
def __init__(self, args):
    """Collect titles and redirects and index them in tries.

    ``entity_dict`` is a trie over all titles; ``redirect_dict`` maps each
    redirect title to the ``entity_dict`` id of its destination. Redirects
    whose destination is not in ``entity_dict`` are dropped.
    """
    self.args = args
    self.all_titles = self._all_titles_collector()
    self.redirects = _extract_pages(self.args.path_for_raw_xml)
    self.nlp = nlp_returner(args=self.args)
    self.entity_dict = Trie(self.all_titles)

    redirect_records = []
    for (title, dest_title) in self.redirects:
        if dest_title in self.entity_dict:
            redirect_records.append((title, (self.entity_dict[dest_title], )))
    self.redirect_dict = RecordTrie('<I', redirect_records)
def _multicut(text: str, custom_dict: Trie = None):
    """
    Yield LatticeString objects for ``text``, one chunk at a time.

    :param text: text to segment
    :param custom_dict: trie of dictionary words; DEFAULT_DICT_TRIE is used
        when this is None or otherwise falsy
    :return: generator of LatticeString; chunks not found in the dictionary
        are yielded with ``in_dict=False``
    """
    if not custom_dict:
        custom_dict = DEFAULT_DICT_TRIE

    len_text = len(text)
    words_at = defaultdict(list)  # main data structure

    def serialize(p, p2):  # helper function
        # Yield every way to cover text[p:p2] with the words recorded in
        # words_at, joined by "/".
        for w in words_at[p]:
            p_ = p + len(w)
            if p_ == p2:
                yield w
            elif p_ < p2:
                for path in serialize(p_, p2):
                    yield w + "/" + path

    q = {0}
    last_p = 0  # last position for yield
    while min(q) < len_text:
        p = min(q)
        q -= {p}  # q.pop, but for set

        # Record every dictionary word starting at p and queue its end position.
        for w in custom_dict.prefixes(text[p:]):
            words_at[p].append(w)
            q.add(p + len(w))

        # A single pending position means all paths meet there, so the span
        # since the last yield can be emitted.
        if len(q) == 1:
            q0 = min(q)
            yield LatticeString(text[last_p:q0], serialize(last_p, q0))
            last_p = q0

        # Case len(q) == 0: nothing in the dictionary matches at p
        if len(q) == 0:
            m = _PAT_ENG.match(text[p:])
            if m:  # English, digits, whitespace
                i = p + m.span()[1]
            else:  # skip as little as possible
                for i in range(p, len_text):
                    ww = custom_dict.prefixes(text[i:])
                    m = _PAT_ENG.match(text[i:])
                    if ww or m:
                        break
                else:
                    # No later position matches: take the rest of the text.
                    i = len_text
            w = text[p:i]
            words_at[p].append(w)
            yield LatticeString(w, in_dict=False)
            last_p = i
            q.add(i)
def __init__(self):
    """Ensure the prefix file and cache shelve exist, then load URI prefixes.

    Creates an empty ``URI_PREFIXES_FN`` file and an empty shelve at
    ``CACHE_SHELVE_FN`` when they are missing, then builds
    ``self._uriPrefixes`` as a trie over the stripped lines of the prefix
    file.
    """
    if not os.path.exists(self.URI_PREFIXES_FN):
        ensurePathExists(self.URI_PREFIXES_FN)
        # Create the empty file and close the handle right away.
        with open(self.URI_PREFIXES_FN, 'w'):
            pass

    if not os.path.exists(self.CACHE_SHELVE_FN):
        ensurePathExists(self.CACHE_SHELVE_FN)
        cache = self._openShelve('c')
        cache.close()

    # Read through a context manager so the file is closed deterministically.
    with open(self.URI_PREFIXES_FN, 'r') as prefixFile:
        prefixList = [line.strip() for line in prefixFile]
    self._uriPrefixes = Trie(prefixList)
def __init__(self, *args, **kwargs):
    """Initialise the tries and run preprocessing.

    Keyword arguments:
        save -- path of a saved trie to load (takes precedence over ``words``)
        words -- path of a UTF-8 word list, one key per line; required when
            ``save`` is not given
        alphabet -- passed to ``self.preprocess``; required

    Raises ``KeyError`` when a required keyword is missing.
    """
    self.secondary_trie = set()
    self.book_trie = Trie()

    if 'save' in kwargs:
        self.trie = Trie()
        self.trie.load(kwargs['save'])
    else:
        with open(kwargs['words'], encoding='utf-8') as fp:
            keys = fp.read().splitlines()
        self.trie = Trie(keys)

    self.preprocess(kwargs['alphabet'])
def onecut(text, data=['']):
    """Yield the segments of ``text`` one at a time.

    :param text: text to segment
    :param data: word list for a custom trie; the default ``['']`` means
        "use THAI_WORDS". NOTE(review): the default is a mutable list used
        only as a sentinel and is never mutated here.
    :return: generator of substrings of ``text``
    """
    if (data != ['']):
        trie = Trie(data)
    else:
        trie = THAI_WORDS
    graph = defaultdict(list)  # main data structure
    allow_pos = tcc_pos(text)  # cut positions must coincide with TCC boundaries

    q = [0]  # min-heap queue
    last_p = 0  # last position for yield
    while q[0] < len(text):
        p = heappop(q)

        for w in trie.prefixes(text[p:]):
            p_ = p + len(w)
            if p_ in allow_pos:  # keep only ends consistent with TCC
                graph[p].append(p_)
                if p_ not in q:
                    heappush(q, p_)

        # Case length 1: no ambiguity left, earlier results can be emitted
        if len(q) == 1:
            pp = next(bfs_paths_graph(graph, last_p, q[0]))
            # initially last_p == pp[0]
            for p in pp[1:]:
                yield text[last_p:p]
                last_p = p
            # at the end last_p == q[0]

        # Case length 0: nothing in the dictionary matches
        if len(q) == 0:
            m = pat_eng.match(text[p:])
            if m:  # English, digits, whitespace
                i = p + m.end()
            else:  # skip as little as possible
                for i in range(p + 1, len(text)):
                    if i in allow_pos:  # honour TCC here as well
                        ww = [
                            w for w in trie.prefixes(text[i:])
                            if (i + len(w) in allow_pos)
                        ]
                        m = pat_eng.match(text[i:])
                        if ww or m:
                            break
                else:
                    # No later position matches: take the rest of the text.
                    i = len(text)
            w = text[p:i]
            graph[p].append(i)
            yield w
            last_p = i
            heappush(q, i)
def add_terms():
    """Add ADDED_TOKENS to the 1-gram and 2-gram vocabulary tries.

    For each ngram level the existing trie is loaded, rebuilt with the extra
    tokens, saved over the stored file, and ``full_db_to_tokens`` is run
    with the new terms. The elapsed time is printed per level.
    """
    for ngram in range(1, 3):
        # update vocabulary trie
        # this messes up the ids but I don't use them anymore because I
        # don't use the doc-term matrices anymore
        started_at = time.time()

        existing = load_vocabulary_trie(ngram)
        merged_keys = existing.keys() + ADDED_TOKENS[ngram]
        rebuilt = Trie(merged_keys)
        target = PATH_TOKENIZED + 'tries/full_vocabulary_{}_grams.trie'.format(ngram)
        rebuilt.save(target)

        full_db_to_tokens(ngram, add_new_terms=set(ADDED_TOKENS[ngram]))

        print("adding new tokens for {}-gram took {}.".format(ngram, time.time() - started_at))
def _onecut(text: str, custom_dict: Trie): graph = defaultdict(list) # main data structure allow_pos = tcc_pos(text) # separating position should aligned with TCC q = [0] # min-heap queue last_p = 0 # last position for yield while q[0] < len(text): p = heappop(q) for w in custom_dict.prefixes(text[p:]): p_ = p + len(w) if p_ in allow_pos: # เลือกที่สอดคล้อง tcc graph[p].append(p_) if p_ not in q: heappush(q, p_) # กรณี length 1 คือ ไม่กำกวมแล้ว ส่งผลลัพธ์ก่อนนี้คืนได้ if len(q) == 1: pp = next(_bfs_paths_graph(graph, last_p, q[0])) # เริ่มต้น last_p = pp[0] เอง for p in pp[1:]: yield text[last_p:p] last_p = p # สุดท้าย last_p == q[0] เอง # กรณี length 0 คือ ไม่มีใน dict if len(q) == 0: m = _PAT_ENG.match(text[p:]) if m: # อังกฤษ, เลข, ว่าง i = p + m.end() else: # skip น้อยที่สุด ที่เป็นไปได้ for i in range(p + 1, len(text)): if i in allow_pos: # ใช้ tcc ด้วย ww = [ w for w in custom_dict.prefixes(text[i:]) if (i + len(w) in allow_pos) ] ww = [w for w in ww if not _PAT_TWOCHARS.match(w)] m = _PAT_ENG.match(text[i:]) if ww or m: break else: i = len(text) w = text[p:i] graph[p].append(i) yield w last_p = i heappush(q, i)
def load(in_file, mmap=True):
    """Load a ``Vocab`` from ``<in_file>_word.trie`` and ``<in_file>_entity.trie``.

    Both tries are opened via ``mmap``. The ``mmap`` argument is accepted
    but not consulted here.
    """
    words = Trie()
    entities = Trie()

    words.mmap(in_file + '_word.trie')
    entities.mmap(in_file + '_entity.trie')

    return Vocab(words, entities)
def rebuild_database() -> None:
    """Rebuild the search database.

    Clears ``word_to_ids``, refills it with a casefolded word ->
    ``(item id, subtype index)`` mapping taken from every item's tags,
    replaces the global ``database`` trie with one over those words, and
    finally calls ``_type_cback``.
    """
    global database
    LOGGER.info('Updating search database...')

    # Clear and reset.
    word_to_ids.clear()

    for item in UI.item_list.values():
        for subtype_ind in item.visual_subtypes:
            tag_words = (
                word
                for tag in item.get_tags(subtype_ind)
                for word in tag.split()
            )
            for word in tag_words:
                word_to_ids[word.casefold()].add((item.id, subtype_ind))

    database = Trie(word_to_ids.keys())
    LOGGER.debug('Tags: {}', database.keys())
    _type_cback()
def build(description_db, entity_db, white_list, start_index, min_inlink_count, target_vocab=None):
    """Build an ``EntityVocab`` from titles that occur often enough.

    Titles listed for each description-db entry are counted; entries whose
    own title is outside ``target_vocab`` (when given) are skipped. A title
    is kept when its count reaches ``min_inlink_count``, or when it is a
    redirect-resolved ``white_list`` title that is also an entry title seen
    in the description db.
    """
    counter = Counter()
    db_titles = set()

    for (title, _, titles) in description_db.iterator():
        if target_vocab is None or title in target_vocab:
            counter.update(titles)
            db_titles.add(title)

    frequent = [t for (t, c) in counter.iteritems() if c >= min_inlink_count]

    resolved = [entity_db.resolve_redirect(t) for t in white_list]
    allowed = [t for t in resolved if t in db_titles]

    return EntityVocab(Trie(set(frequent + allowed)), start_index)
class WordList(object):
    """Word list backed by a trie, supporting word and prefix queries."""

    def __init__(self, word_file):
        """Build the trie from ``word_file`` (one word per line).

        Each line is lower-cased and stripped before being added.
        """
        # TODO: Check input file exists, is readable, valid, etc
        with open(word_file) as input_file:
            words = [word.lower().strip() for word in input_file]
        self.trie = Trie(words)

    def contains_word(self, word):
        """
        Check whether a word exists in the list.

        :param word: An ASCII, lowercase string to check for.
        :return: True if the word is in the word list, false if it is not.
        """
        # TODO: Raise errors if the word is None, isn't ASCII or lowercase, etc
        return word in self.trie

    def contains_prefix(self, prefix):
        """
        Check list for words that begin with the supplied prefix

        :param prefix: An ASCII, lowercase string to check as a prefix
        :return: True if this key is a prefix for some other word or words in the list. Note that this method will
        return False if the word is in the list but is not a prefix of any other word.
        """
        # TODO: Raise errors if prefix is None, isn't ASCII or lowercase, etc
        # Look for at least one key other than the prefix itself. Counting
        # matches (> 1) wrongly rejected a non-word prefix that has exactly
        # one extension, e.g. "appl" when only "apple" is listed.
        return any(key != prefix for key in self.trie.iterkeys(prefix))
def build(dump_file, pool_size, chunk_size):
    """Build an ``EntityDB`` from a Wikipedia dump.

    :param dump_file: dump passed to ``WikiDumpReader``
    :param pool_size: number of worker processes
    :param chunk_size: ``chunksize`` for ``Pool.imap_unordered``
    :return: ``EntityDB(title_dict, redirect_dict, inlink_arr)`` where
        ``inlink_arr[i]`` is the link count of the title with trie id ``i``;
        counts of redirect titles are folded into their destination
    """
    dump_reader = WikiDumpReader(dump_file)

    # The extractor is stored in a module-level global before the pool is
    # created (presumably so that _process_page can reach it in the workers).
    global _extractor
    _extractor = WikiExtractor()

    titles = []
    redirects = {}
    title_counter = Counter()

    with closing(Pool(pool_size)) as pool:
        for (page, links) in pool.imap_unordered(_process_page, dump_reader, chunksize=chunk_size):
            titles.append(normalize(page.title))
            if page.is_redirect:
                redirects[normalize(page.title)] = page.redirect

            for link_obj in links:
                title_counter[normalize(link_obj.title)] += 1

    title_dict = Trie(titles)

    # Keep only redirects whose destination is a known title; the stored
    # value is the destination's id in title_dict.
    redirect_items = []
    for (title, dest_title) in redirects.items():
        if dest_title in title_dict:
            redirect_items.append((title, (title_dict[dest_title], )))

    redirect_dict = RecordTrie('<I', redirect_items)

    # Iterate over a snapshot of the keys because entries are added and
    # removed while the counts are folded into redirect destinations.
    for title in list(title_counter.keys()):
        count = title_counter[title]
        dest_obj = redirect_dict.get(title)
        if dest_obj is not None:
            title_counter[title_dict.restore_key(dest_obj[0][0])] += count
            del title_counter[title]

    # The ``np.int`` alias no longer exists in current NumPy; the builtin
    # ``int`` is the same dtype.
    inlink_arr = np.zeros(len(title_dict), dtype=int)
    for (title, count) in title_counter.items():
        title_index = title_dict.get(title)
        if title_index is not None:
            inlink_arr[title_index] = count

    return EntityDB(title_dict, redirect_dict, inlink_arr)
def generate_word_square(n: int, letters: str) -> list:
    """Search for a word square of side ``n`` using ``letters``.

    Candidate words come from ``get_anagrams(n, letters)`` and are indexed
    in a trie for the recursive search. The result (a list of words, or
    None when no square is found) is printed and returned.
    """
    assert n > 0, "Invalid square"

    # Trie - https://en.wikipedia.org/wiki/Trie
    candidates = Trie(get_anagrams(n, letters))

    result = recurse_generate([], candidates, n, 0)
    print(result)
    return result
def __init__(self, custom_dict=None):
    """
    Initialize tokenizer object

    :param str custom_dict: a file path or a collection (list, tuple, set)
        of vocabularies to be used to create a trie; when omitted or empty,
        the trie is built from ``word_dict()``
    :raises TypeError: if ``custom_dict`` is non-empty but neither a path
        nor a supported collection

    Sets ``self.trie_dict``, the trie used by the tokenizing engines.
    """
    if not custom_dict:
        self.trie_dict = Trie(word_dict())
    elif isinstance(custom_dict, str):
        with codecs.open(custom_dict, "r", encoding="utf8") as f:
            vocabs = f.read().splitlines()
        self.trie_dict = Trie(vocabs)
    elif isinstance(custom_dict, (list, tuple, set, frozenset)):
        self.trie_dict = Trie(custom_dict)
    else:
        # Previously this case left ``trie_dict`` unset, which only
        # surfaced later as an AttributeError during tokenization.
        raise TypeError(
            "custom_dict must be a str (path to source file) or a list/tuple/set of words"
        )
def tcut(text):
    """Yield LatticeString objects for ``text``, one chunk at a time.

    The dictionary trie is built from ``get_data()`` on every call. Chunks
    not found in the dictionary are yielded with ``in_dict=False`` and have
    their space characters removed.
    """
    #global last_p, i, q, ww  # for debug
    trie = Trie(get_data())
    words_at = defaultdict(list)  # main data structure

    def serialize(p, p2):  # helper function
        # Yield every way to cover text[p:p2] with the words recorded in
        # words_at, joined by '/'.
        for w in words_at[p]:
            p_ = p + len(w)
            if p_ == p2:
                yield w
            elif p_ < p2:
                for path in serialize(p_, p2):
                    yield w + '/' + path

    q = {0}
    last_p = 0  # last position for yield
    while min(q) < len(text):
        p = min(q)
        q -= {p}  # q.pop, but for set

        # Record every dictionary word starting at p and queue its end position.
        for w in trie.prefixes(text[p:]):
            words_at[p].append(w)
            q.add(p + len(w))

        # A single pending position means all paths meet there, so the span
        # since the last yield can be emitted.
        if len(q) == 1:
            q0 = min(q)
            yield LatticeString(text[last_p:q0], serialize(last_p, q0))
            last_p = q0

        # Case len(q) == 0: nothing in the dictionary matches at p
        if len(q) == 0:
            # skip as little as possible
            for i in range(p, len(text)):
                ww = trie.prefixes(text[i:])
                if ww:
                    break
            else:
                # No later position matches: take the rest of the text.
                i = len(text)
            w = text[p:i]
            w = w.replace(' ', '')  # remove spaces
            words_at[p].append(w)
            yield LatticeString(w, in_dict=False)
            last_p = i
            q.add(i)
def __init__(self, custom_dict=None):
    """
    Initialize tokenizer object

    Keyword arguments:
    custom_dict -- a file path or a collection (list, tuple, set) of
                   vocabularies to be used to create a trie; when omitted
                   or empty, the trie is built from get_dict()

    Object variables:
    trie_dict -- a trie to use in tokenizing engines

    Raises:
    TypeError -- custom_dict is non-empty but neither a path nor a
                 supported collection
    """
    if not custom_dict:
        self.trie_dict = Trie(get_dict())
    elif isinstance(custom_dict, str):
        with codecs.open(custom_dict, 'r', encoding='utf8') as f:
            vocabs = f.read().splitlines()
        self.trie_dict = Trie(vocabs)
    elif isinstance(custom_dict, (list, tuple, set, frozenset)):
        self.trie_dict = Trie(custom_dict)
    else:
        # Previously this case left ``trie_dict`` unset, which only
        # surfaced later as an AttributeError during tokenization.
        raise TypeError(
            'custom_dict must be a str (path to source file) or a list/tuple/set of words'
        )
def build(dump_file, pool_size, chunk_size):
    """Build an ``EntityDB`` from a Wikipedia dump.

    :param dump_file: dump passed to ``WikiDumpReader``
    :param pool_size: number of worker processes
    :param chunk_size: ``chunksize`` for ``Pool.imap_unordered``
    :return: ``EntityDB(title_dict, redirect_dict, inlink_arr)`` where
        ``inlink_arr[i]`` is the link count of the title with trie id ``i``;
        counts of redirect titles are folded into their destination
    """
    dump_reader = WikiDumpReader(dump_file)

    # The extractor is stored in a module-level global before the pool is
    # created (presumably so that _process_page can reach it in the workers).
    global _extractor
    _extractor = WikiExtractor()

    titles = []
    redirects = {}
    title_counter = Counter()

    with closing(Pool(pool_size)) as pool:
        for (page, links) in pool.imap_unordered(
            _process_page, dump_reader, chunksize=chunk_size
        ):
            titles.append(page.title)
            if page.is_redirect:
                redirects[page.title] = page.redirect

            for link_obj in links:
                title_counter[link_obj.title] += 1

    title_dict = Trie(titles)

    # Keep only redirects whose destination is a known title; the stored
    # value is the destination's id in title_dict. ``items()`` works on
    # both Python 2 and 3, unlike ``iteritems()``.
    redirect_items = []
    for (title, dest_title) in redirects.items():
        if dest_title in title_dict:
            redirect_items.append((title, (title_dict[dest_title],)))

    redirect_dict = RecordTrie('<I', redirect_items)

    # Iterate over an explicit snapshot: entries are added and deleted in
    # the loop, which is only safe on Python 2 where items() is a list.
    for (title, count) in list(title_counter.items()):
        dest_obj = redirect_dict.get(title)
        if dest_obj is not None:
            title_counter[title_dict.restore_key(dest_obj[0][0])] += count
            del title_counter[title]

    # The builtin ``int`` is the same dtype as the ``np.int`` alias, which
    # no longer exists in current NumPy.
    inlink_arr = np.zeros(len(title_dict), dtype=int)
    for (title, count) in title_counter.items():
        title_index = title_dict.get(title)
        if title_index is not None:
            inlink_arr[title_index] = count

    return EntityDB(title_dict, redirect_dict, inlink_arr)
def create_custom_dict_trie(custom_dict_source):
    """Create the custom dict trie used by the word_tokenize() function.

    For more information on the trie data structure,
    see: https://marisa-trie.readthedocs.io/en/latest/index.html

    :param string/list custom_dict_source: a list, tuple or set of
        vocabularies, or a path to a UTF-8 source file with one entry per line
    :return: A trie created from custom dict input
    :raises TypeError: if the source is neither a str nor a list/tuple/set
    """
    if type(custom_dict_source) is str:
        # Receive a file path of the custom dict to read
        with codecs.open(custom_dict_source, "r", encoding="utf8") as source:
            return Trie(source.read().splitlines())

    if isinstance(custom_dict_source, (list, tuple, set)):
        # Received a sequence type object of vocabs
        return Trie(custom_dict_source)

    raise TypeError(
        "Type of custom_dict_source must be either str (path to source file) or collections"
    )
def __init__(self, max_len, min_freq, min_pmi, min_entropy, cut=True, tokenizer='jieba', norm_pmi=False):
    """Store thresholds and load the dictionaries named in ``config['DEFAULT']``.

    ``self.dict`` is a trie built from ``jieba_dict_path`` and
    ``user_dict_path``, and ``self.blacklist`` is read from
    ``blacklist_path``; each path is used only when it is configured and
    points to an existing file. When ``cut`` is true, ``self.tokenizer`` is
    created; the only supported ``tokenizer`` value is ``'jieba'``.

    Raises ValueError for an unknown tokenizer name (only when ``cut`` is true).
    """
    self.max_len = max_len
    self.min_freq = min_freq
    self.min_pmi = min_pmi
    self.min_entropy = min_entropy
    self.cut = cut
    self.norm_pmi = norm_pmi

    # Initialize dictionary to build trie
    self.trie = defaultdict(int)
    self.rev_trie = defaultdict(int)
    self.len = 0

    defaults = config['DEFAULT']

    def configured_file(option):
        # Path stored under ``option`` if it is set and is an existing file.
        if option in defaults and os.path.isfile(defaults[option]):
            return defaults[option]
        return None

    # Build existing dictionary based on trie structure
    sistring = set()
    jieba_dict = configured_file('jieba_dict_path')
    if jieba_dict is not None:
        sistring = get_sistring(jieba_dict)
    user_dict = configured_file('user_dict_path')
    if user_dict is not None:
        sistring = get_sistring(user_dict, sistring)
    self.dict = Trie(sistring)

    # Get blacklist
    self.blacklist = set()
    blacklist_file = configured_file('blacklist_path')
    if blacklist_file is not None:
        self.blacklist = get_dict(blacklist_file)

    if cut:
        if tokenizer != 'jieba':
            raise ValueError(f'Unknown tokenizer {tokenizer}')
        self.tokenizer = Jieba()
def test():
    """Exercise a trie of number words against the module-level TEXT.

    Runs 1000 rounds; each round scans the span S1 for longest matches only
    and the span S2 for all matches, summing the values of the latter.
    Raises RuntimeError if that sum is not 142 or if 'nine' is missing from
    the trie's keys.
    """
    # 1. build a trie
    d = dict(zero=0, one=1, two=2, three=3, four=4, five=5, six=6, seven=7,
             eight=8, nine=9, ten=10, eleven=11, twelve=12, thirteen=13,
             fourteen=10, fifteen=15, sixteen=16, seventeen=17, eighteen=18,
             nineteen=19, twenty=20, thirty=30, fourty=40, fifty=50,
             sixty=60, seventy=70, eighty=80, ninety=90, hundred=100)
    t = Trie(list(d.keys()))

    # 2. scan 2000 "sentences" with it
    for _ in range(1000):
        # scanning for the longest matches only in sentence 1
        pos = S1[0]
        while pos < S1[1]:
            found = list(t.prefixes(TEXT[pos:S1[1]]))
            # jump past the last match returned, or advance one character
            pos += len(found[-1]) if found else 1

        # scanning for all matches in sentence 2
        pos = S2[0]
        s = 0
        while pos < S2[1]:
            s += sum(d[k] for k in t.prefixes(TEXT[pos:S2[1]]))
            pos += 1
        if s != 142:
            raise RuntimeError(str(s))

    # 3. make a real list of all keys in the trie
    if 'nine' not in list(t.iterkeys()):
        raise RuntimeError(str(list(t.iterkeys())))
def train(corpus_file, mode, dim_size, window, min_count, negative, epoch, workers):
    """Train a Word2Vec model on a bz2 corpus and wrap it in an EmbeddingReader.

    Vocabulary items starting with ``MARKER`` are treated as entities (the
    marker is removed and underscores become spaces); everything else is a
    word. ``mode == 'sg'`` selects ``sg=1``; any other value gives ``sg=0``.

    Words for which ``word_vocab.get_index`` returns None keep a zero row.
    """
    use_sg = int(mode == 'sg')
    with bz2.BZ2File(corpus_file) as corpus:
        model = Word2Vec(LineSentence(corpus), size=dim_size, window=window,
                         min_count=min_count, workers=workers, iter=epoch,
                         negative=negative, sg=use_sg)

    # Split the model vocabulary into plain words and entity titles.
    words = []
    entities = []
    for (token, _) in model.vocab.iteritems():
        if not token.startswith(MARKER):
            words.append(token)
            continue
        entities.append(token[len(MARKER):].replace(u'_', u' '))

    word_vocab = WordVocab(Trie(words), lowercase=True)
    entity_vocab = EntityVocab(Trie(entities))

    word_embedding = np.zeros((len(words), dim_size), dtype=np.float32)
    entity_embedding = np.zeros((len(entities), dim_size), dtype=np.float32)

    for token in words:
        row = word_vocab.get_index(token)
        if row is None:
            continue
        word_embedding[row] = model[token]

    for title in entities:
        model_key = MARKER + title.replace(u' ', u'_')
        entity_embedding[entity_vocab.get_index(title)] = model[model_key]

    return EmbeddingReader(word_embedding, entity_embedding, word_vocab, entity_vocab)
class TrieNameDB(NameDB):
    """Name database with case-insensitive prefix lookup backed by a trie."""

    def __init__(self, pair_gen):
        """Group the ``(key, value)`` pairs and index the lower-cased keys."""
        self._dic = self._construct_dic(pair_gen)
        self._index = Trie(self._dic.keys())

    def _construct_dic(self, pair_gen):
        """Return a mapping: lower-cased key -> list of ``(key, value)`` pairs."""
        grouped = collections.defaultdict(list)
        for name, value in pair_gen:
            grouped[name.lower()].append((name, value))
        return grouped

    def find_by_prefix(self, str, limit=50):
        """Return up to ``limit`` pairs whose lower-cased key starts with ``str``.

        The prefix is lower-cased before the lookup; collection stops as
        soon as at least ``limit`` pairs have been gathered.
        """
        matches = []
        for key in self._index.iterkeys(str.lower()):
            matches += self._dic[key]
            if len(matches) >= limit:
                break
        return matches[:limit]
def __init__(self, pair_gen):
    """Build the lookup dict from ``pair_gen`` and a trie over its keys."""
    grouped = self._construct_dic(pair_gen)
    self._dic = grouped
    self._index = Trie(grouped.keys())
from os.path import join, exists
from marisa_trie import Trie

# Usage: <script> SOURCE_DIR
# Converts every file in SOURCE_DIR into a marisa trie stored as
# SOURCE_DIR_marisa/<filename>.marisa (one key per non-empty line); the
# file named 'mappings' is copied unchanged.
if __name__ == '__main__':
    import shutil  # local import: only the script entry point needs it

    assert len(sys.argv) == 2
    source_dir = sys.argv[1]
    if source_dir.endswith("/"):
        source_dir = source_dir[:-1]
    assert exists(source_dir)

    target_dir = source_dir + "_marisa"
    if exists(target_dir):
        # os.rmdir only removes empty directories, so a re-run over a
        # populated target would fail; remove the whole tree instead.
        shutil.rmtree(target_dir)
    makedirs(target_dir)

    for filename in listdir(source_dir):
        print(filename)
        with open(join(source_dir, filename), 'r') as input_file:
            contents = input_file.read()

        if filename == 'mappings':
            with open(join(target_dir, 'mappings'), 'w') as output_file:
                # copy source to destination
                output_file.write(contents)
        else:
            # The serialized trie is binary data, so open the target in
            # binary mode.
            with open(join(target_dir, filename + ".marisa"), 'wb') as output_file:
                lines = contents.split("\n")
                d = Trie(l for l in lines if len(l) > 0)
                d.write(output_file)