def dict_trie(dict_source: Union[str, Iterable[str], Trie]) -> Trie:
    """
    Create a dict trie which will be used for the word_tokenize() function.

    For more information on the trie data structure, see:
    https://marisa-trie.readthedocs.io/en/latest/index.html

    :param string/list dict_source: a list of vocabularies or a path to a source file
    :return: a trie created from the dictionary input
    """
    trie = None

    if type(dict_source) is str:
        # Received a path to a dictionary file to read
        with open(dict_source, "r", encoding="utf8") as f:
            _vocabs = f.read().splitlines()
            trie = Trie(_vocabs)
    elif isinstance(dict_source, Trie):
        # Received a prebuilt trie; check this before Iterable,
        # since a Trie is itself iterable
        trie = dict_source
    elif isinstance(dict_source, Iterable):
        # Received a sequence of vocabulary items
        trie = Trie(dict_source)
    else:
        raise TypeError(
            "Type of dict_source must be marisa_trie.Trie, or Iterable[str], "
            "or str (path to source file)"
        )

    return trie
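# Usage sketch for dict_trie above: build a trie from an in-memory word list or from a
# newline-delimited dictionary file ("my_dict.txt" is a hypothetical path), then hand it
# to a tokenizer. The commented word_tokenize call assumes a pythainlp-style API.
trie = dict_trie(["ขอ", "ขอบ", "ขอบคุณ"])
trie_from_file = dict_trie("my_dict.txt")
# tokens = word_tokenize("ขอบคุณครับ", custom_dict=trie)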
def train(corpus_file, out_file, mode, dim_size, window, min_count, negative, epoch,
          pool_size, chunk_size):
    with bz2.BZ2File(corpus_file) as f:
        sentences = LineSentence(f)
        sg = int(mode == 'sg')

        model = Word2Vec(sentences, size=dim_size, window=window, min_count=min_count,
                         workers=pool_size, iter=epoch, negative=negative, sg=sg)

    words = []
    entities = []
    for (w, _) in model.vocab.iteritems():
        if w.startswith(MARKER):
            entities.append(w[len(MARKER):].replace(u'_', u' '))
        else:
            words.append(w)

    vocab = Vocab(Trie(words), Trie(entities))

    word_embedding = np.zeros((len(words), dim_size), dtype=np.float32)
    entity_embedding = np.zeros((len(entities), dim_size), dtype=np.float32)

    for word in words:
        word_embedding[vocab.get_word_index(word)] = model[word]

    for entity in entities:
        entity_embedding[vocab.get_entity_index(entity)] = model[MARKER + entity.replace(u' ', u'_')]

    ret = dict(
        word_embedding=word_embedding,
        entity_embedding=entity_embedding,
        vocab=vocab,
    )
    joblib.dump(ret, out_file, compress=False)
def create_custom_dict_trie(custom_dict_source):
    """The function is used to create a custom dict trie which will be used for
    the word_tokenize() function.

    Arguments:
        custom_dict_source {string or list} -- a list of vocabularies or a path
            to a source file

    Raises:
        TypeError -- Invalid custom_dict_source's object type

    Returns:
        Trie -- A trie created from the custom dict input
    """
    if type(custom_dict_source) is str:
        # Received a path to the custom dictionary file to read
        with codecs.open(custom_dict_source, 'r', encoding='utf8') as f:
            _vocabs = f.read().splitlines()
            return Trie(_vocabs)
    elif isinstance(custom_dict_source, (list, tuple, set)):
        # Received a sequence of vocabulary items
        return Trie(custom_dict_source)
    else:
        raise TypeError(
            'Type of custom_dict_source must be either str (path to source file) '
            'or collections'
        )
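# Minimal usage sketch for create_custom_dict_trie above ("user_dict.txt" is a
# hypothetical one-word-per-line file).
custom_trie = create_custom_dict_trie(["แมว", "หมา", "นก"])
custom_trie_from_file = create_custom_dict_trie("user_dict.txt")
assert "แมว" in custom_trie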
def load(in_file, mmap=True):
    word_dict = Trie()
    entity_dict = Trie()

    word_dict.mmap(in_file + '_word.trie')
    entity_dict.mmap(in_file + '_entity.trie')

    return Vocab(word_dict, entity_dict)
def __init__(self, *args, **kwargs):
    self.trie = Trie()
    self.secondary_trie = set()
    self.book_trie = Trie()
    path = os.path.dirname(os.path.realpath(__file__)) + "/"
    if 'save' in kwargs:
        self.trie.load(kwargs['save'])
    else:
        with open(kwargs['words'], encoding='utf-8') as fp:
            keys = fp.read().splitlines()
        self.trie = Trie(keys)
    self.preprocess(kwargs['alphabet'])
def create_vocabulary(ngram=1, test=False):
    """
    Creates the vocabulary for the ngram level.

    :param ngram:
    :param test: If true, only runs through the first 10000 documents.
    :return:

    Steps:
    - Get a set of all tokens
    - Retain only the valid ones
    """
    add_valid_words()

    # get a set of all tokens of the ngram level
    print("here")
    token_set = get_all_tokens_in_docs(ngram, test)
    print("Total tokens before merging: ", len(token_set))

    valid_iterator = valid_ngram_iterator(token_set, ngram)
    vocabulary_trie = Trie(valid_iterator)
    vocabulary_trie.save(PATH_TOKENIZED + 'tries/full_vocabulary_{}_grams.trie'.format(ngram))
    print("Total tokens after merging", len(vocabulary_trie))
def __init__(self, word_file):
    # TODO: Check input file exists, is readable, valid, etc
    words = []
    with open(word_file) as input_file:
        for word in input_file:
            words.append(word.lower().strip())

    self.trie = Trie(words)
def load(in_file: str, mmap_mode="r"):
    data = joblib.load(in_file, mmap_mode=mmap_mode)

    title_trie = Trie()
    title_trie = title_trie.frombytes(data["title_trie"])
    data["title_trie"] = title_trie

    return InterwikiDB(**data)
def __init__(self, dic, start_index=0):
    if isinstance(dic, Trie):
        self._dic = dic
    else:
        self._dic = Trie(dic)

    self._start_index = start_index
def build(description_db, entity_db, white_list, start_index, min_inlink_count,
          target_vocab=None):
    counter = Counter()
    db_titles = set()
    for (title, _, titles) in description_db.iterator():
        if target_vocab is not None and title not in target_vocab:
            continue
        counter.update(titles)
        db_titles.add(title)

    title_list = [t for (t, c) in counter.iteritems() if c >= min_inlink_count]

    white_list = [entity_db.resolve_redirect(t) for t in white_list]
    white_list = [t for t in white_list if t in db_titles]

    title_list = set(title_list + white_list)

    return EntityVocab(Trie(title_list), start_index)
def generate_word_square(n: int, letters: str) -> list:
    assert n > 0, "Invalid square"
    words = get_anagrams(n, letters)
    # Trie - https://en.wikipedia.org/wiki/Trie
    t = Trie(words)
    result = recurse_generate([], t, n, 0)
    print(result)
    return result
def __init__(self, custom_dict=None): """ Initialize tokenizer object :param str custom_dict: a file path or a list of vocaburaies to be used to create a trie (default - original lexitron) :return: trie_dict - a dictionary in the form of trie data for tokenizing engines """ if custom_dict: if type(custom_dict) is list: self.trie_dict = Trie(custom_dict) elif type(custom_dict) is str: with codecs.open(custom_dict, "r", encoding="utf8") as f: vocabs = f.read().splitlines() self.trie_dict = Trie(vocabs) else: self.trie_dict = Trie(word_dict())
def load(target, device, mmap=True):
    word_dict = Trie()
    entity_dict = Trie()
    redirect_dict = RecordTrie("<I")

    if not isinstance(target, dict):
        if mmap:
            target = joblib.load(target, mmap_mode="r")
        else:
            target = joblib.load(target)

    word_dict.frombytes(target["word_dict"])
    entity_dict.frombytes(target["entity_dict"])
    redirect_dict.frombytes(target["redirect_dict"])

    word_stats = target["word_stats"]
    entity_stats = target["entity_stats"]
    if not isinstance(word_stats, np.ndarray):
        word_stats = np.frombuffer(
            word_stats,
            dtype=np.int32,
        ).reshape(-1, 2)
        word_stats = torch.tensor(
            word_stats,
            device=device,
            requires_grad=False,
        )
        entity_stats = np.frombuffer(
            entity_stats,
            dtype=np.int32,
        ).reshape(-1, 2)
        entity_stats = torch.tensor(
            entity_stats,
            device=device,
            requires_grad=False,
        )

    return Wikipedia2VecDict(
        word_dict,
        entity_dict,
        redirect_dict,
        word_stats,
        entity_stats,
        **target["meta"],
    )
def load(input):
    if isinstance(input, dict):
        obj = input
    else:
        obj = joblib.load(input)

    dic = Trie()
    dic.frombytes(obj['dic'])

    return WordVocab(dic, obj['lowercase'], obj.get('start_index', 0))
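# Sketch of the matching save side that load() above implies: marisa-trie objects can
# be packed with tobytes() and restored with frombytes(). The attribute names on
# `vocab` (_dic, _lowercase, _start_index) are assumptions about the WordVocab class,
# and save_word_vocab itself is a hypothetical helper.
def save_word_vocab(vocab, out_file):
    joblib.dump(dict(dic=vocab._dic.tobytes(),
                     lowercase=vocab._lowercase,
                     start_index=vocab._start_index), out_file)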
def load(in_file, mmap_mode='r'):
    obj = joblib.load(in_file, mmap_mode=mmap_mode)

    title_dict = Trie()
    redirect_dict = RecordTrie('<I')

    title_dict.frombytes(obj['title_dict'])
    redirect_dict.frombytes(obj['redirect_dict'])

    return EntityDB(title_dict, redirect_dict, obj['inlink_arr'])
def load(in_file, mmap=True):
    title_dict = Trie()
    redirect_dict = RecordTrie('<I')

    title_dict.mmap(in_file + '_title.trie')
    redirect_dict.mmap(in_file + '_redirect.trie')

    inlink_arr = np.load(in_file + '_prior.npy', mmap_mode='r')

    return EntityDB(title_dict, redirect_dict, inlink_arr)
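# Sketch of the save side this loader appears to expect: marisa-trie tries support
# save()/mmap(), so the three files read above could be produced like this. The
# attribute names on `entity_db` are assumptions, and save_entity_db is hypothetical.
def save_entity_db(entity_db, out_file):
    entity_db.title_dict.save(out_file + '_title.trie')
    entity_db.redirect_dict.save(out_file + '_redirect.trie')
    np.save(out_file + '_prior.npy', entity_db.inlink_arr)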
def __init__(self, custom_dict=None): """ Initialize tokenizer object Keyword arguments: custom_dict -- a file path or a list of vocaburaies to be used to create a trie (default - original lexitron) Object variables: trie_dict -- a trie to use in tokenizing engines """ if custom_dict: if type(custom_dict) is list: self.trie_dict = Trie(custom_dict) elif type(custom_dict) is str: with codecs.open(custom_dict, 'r',encoding='utf8') as f: vocabs = f.read().splitlines() self.trie_dict = Trie(vocabs) else: self.trie_dict = Trie(get_dict())
def create_custom_dict_trie(custom_dict_source):
    """The function is used to create a custom dict trie which will be used for
    the word_tokenize() function.

    For more information on the trie data structure, see:
    https://marisa-trie.readthedocs.io/en/latest/index.html

    :param string/list custom_dict_source: a list of vocabularies or a path to a source file
    :return: A trie created from the custom dict input
    """
    if type(custom_dict_source) is str:
        # Received a path to the custom dictionary file to read
        with codecs.open(custom_dict_source, "r", encoding="utf8") as f:
            _vocabs = f.read().splitlines()
            return Trie(_vocabs)
    elif isinstance(custom_dict_source, (list, tuple, set)):
        # Received a sequence of vocabulary items
        return Trie(custom_dict_source)
    else:
        raise TypeError(
            "Type of custom_dict_source must be either str (path to source file) "
            "or collections"
        )
def __init__(self, args):
    self.args = args
    self.all_titles = self._all_titles_collector()
    self.redirects = _extract_pages(self.args.path_for_raw_xml)
    self.nlp = nlp_returner(args=self.args)
    self.entity_dict = Trie(self.all_titles)
    self.redirect_dict = RecordTrie(
        '<I',
        [(title, (self.entity_dict[dest_title],))
         for (title, dest_title) in self.redirects
         if dest_title in self.entity_dict])
def __init__(self):
    if not os.path.exists(self.URI_PREFIXES_FN):
        ensurePathExists(self.URI_PREFIXES_FN)
        open(self.URI_PREFIXES_FN, 'w').close()  # create an empty prefix file
    if not os.path.exists(self.CACHE_SHELVE_FN):
        ensurePathExists(self.CACHE_SHELVE_FN)
        cache = self._openShelve('c')
        cache.close()
    with open(self.URI_PREFIXES_FN, 'r') as f:
        prefixList = [line.strip() for line in f]
    self._uriPrefixes = Trie(prefixList)
def train(corpus_file, mode, dim_size, window, min_count, negative, epoch, workers):
    with bz2.BZ2File(corpus_file) as f:
        sentences = LineSentence(f)
        sg = int(mode == 'sg')

        model = Word2Vec(sentences, size=dim_size, window=window, min_count=min_count,
                         workers=workers, iter=epoch, negative=negative, sg=sg)

    words = []
    entities = []
    for (w, _) in model.vocab.iteritems():
        if w.startswith(MARKER):
            entities.append(w[len(MARKER):].replace(u'_', u' '))
        else:
            words.append(w)

    word_vocab = WordVocab(Trie(words), lowercase=True)
    entity_vocab = EntityVocab(Trie(entities))

    word_embedding = np.zeros((len(words), dim_size), dtype=np.float32)
    entity_embedding = np.zeros((len(entities), dim_size), dtype=np.float32)

    for word in words:
        ind = word_vocab.get_index(word)
        if ind is not None:
            word_embedding[ind] = model[word]

    for entity in entities:
        entity_embedding[entity_vocab.get_index(entity)] = model[
            MARKER + entity.replace(u' ', u'_')]

    return EmbeddingReader(word_embedding, entity_embedding, word_vocab, entity_vocab)
def build(db, entity_db, min_word_count, min_entity_count, white_list, pool_size,
          chunk_size):
    word_counter = Counter()
    entity_counter = Counter()

    if white_list is not None:
        white_list = json.load(open(white_list, 'r'))
    else:
        white_list = None

    tokenizer = RegexpTokenizer()

    with click.progressbar(db.keys()) as bar:
        for title in bar:
            obj = db[title]
            text = obj['text']
            tokens = tokenizer.tokenize(text)
            word_counter.update(t.text.lower() for t in tokens)

            for (_, title, _) in obj['links']:
                title = entity_db.resolve_redirect(title)
                entity_counter[title] += 1

    word_dict = Trie([w.lower() for (w, c) in word_counter.items()
                      if c >= min_word_count])
    if white_list is None:
        entity_dict = Trie([e.lower() for (e, c) in entity_counter.items()
                            if c >= min_entity_count])
    else:
        entity_dict = Trie([e.lower() for (e, c) in entity_counter.items()
                            if c >= min_entity_count] + white_list)

    entities = []
    entities_dict = Trie(entities + entity_dict.keys())

    return Vocab(word_dict, entities_dict)
def onecut(text, data=['']):
    if data != ['']:
        trie = Trie(data)
    else:
        trie = THAI_WORDS
    graph = defaultdict(list)  # main data structure
    allow_pos = tcc_pos(text)  # only cut at positions that match TCC boundaries

    q = [0]  # min-heap queue
    last_p = 0  # last position already yielded

    while q[0] < len(text):
        p = heappop(q)

        for w in trie.prefixes(text[p:]):
            p_ = p + len(w)
            if p_ in allow_pos:  # keep only positions consistent with TCC
                graph[p].append(p_)
                if p_ not in q:
                    heappush(q, p_)

        # len(q) == 1 means no ambiguity is left, so everything before this point can be yielded
        if len(q) == 1:
            pp = next(bfs_paths_graph(graph, last_p, q[0]))
            # pp[0] is last_p itself
            for p in pp[1:]:
                yield text[last_p:p]
                last_p = p
            # in the end, last_p == q[0]

        # len(q) == 0 means the text is not in the dictionary
        if len(q) == 0:
            m = pat_eng.match(text[p:])
            if m:  # English, digits, or whitespace
                i = p + m.end()
            else:  # skip ahead as little as possible
                for i in range(p + 1, len(text)):
                    if i in allow_pos:  # also respect TCC boundaries
                        ww = [
                            w for w in trie.prefixes(text[i:])
                            if (i + len(w) in allow_pos)
                        ]
                        m = pat_eng.match(text[i:])
                        if ww or m:
                            break
                else:
                    i = len(text)

            w = text[p:i]
            graph[p].append(i)
            yield w
            last_p = i
            heappush(q, i)
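# Usage sketch for onecut above: it is a generator over surface segments whose
# concatenation reconstructs the input; pass `data` to tokenize against a custom
# word list instead of the default THAI_WORDS trie (the example words are hypothetical).
print(list(onecut("ตัดคำภาษาไทย")))
print(list(onecut("ตัดคำ", data=["ตัด", "คำ"])))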
def load_password_blacklist():
    global password_blackList

    if conf.password_blackList == 'NOBLACKLIST':
        LOGGER.warning('No password blacklist file defined.')
        password_blackList = Trie()
        return

    if os.path.isfile('compiledPwdBlacklist.bin'):
        LOGGER.info('Loading pre-compiled password blacklist...')
        password_blackList = Trie()
        password_blackList.load('compiledPwdBlacklist.bin')
    else:
        try:
            LOGGER.info('Compiling password blacklist...')
            with open(conf.password_blackList, encoding="utf-8") as f:
                pwds = f.read().splitlines()
            password_blackList = Trie(pwds)
            password_blackList.save('compiledPwdBlacklist.bin')
        except FileNotFoundError:
            LOGGER.error('File ' + conf.password_blackList + ' not found. Aborting.')
            exit(-1)
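# Sketch of how the loaded blacklist would typically be consulted: marisa-trie
# supports membership tests, so a candidate password can be rejected with `in`.
# is_password_allowed is a hypothetical helper, not part of the code above.
def is_password_allowed(candidate):
    return candidate not in password_blackList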
def add_terms():
    for ngram in range(1, 3):
        # update vocabulary trie
        # this messes up the ids, but I don't use them anymore because I don't use
        # the doc-term matrices anymore
        start = time.time()
        vocabulary = load_vocabulary_trie(ngram)
        keys = vocabulary.keys() + ADDED_TOKENS[ngram]
        vocabulary_new = Trie(keys)
        vocabulary_new.save(PATH_TOKENIZED + 'tries/full_vocabulary_{}_grams.trie'.format(ngram))

        full_db_to_tokens(ngram, add_new_terms=set(ADDED_TOKENS[ngram]))
        print("adding new tokens for {}-gram took {}.".format(ngram, time.time() - start))
def rebuild_database() -> None:
    """Rebuild the search database."""
    global database
    LOGGER.info('Updating search database...')
    # Clear and reset.
    word_to_ids.clear()
    for item in UI.item_list.values():
        for subtype_ind in item.visual_subtypes:
            for tag in item.get_tags(subtype_ind):
                for word in tag.split():
                    word_to_ids[word.casefold()].add((item.id, subtype_ind))

    database = Trie(word_to_ids.keys())
    LOGGER.debug('Tags: {}', database.keys())

    _type_cback()
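# Sketch of how the rebuilt trie could be queried: marisa-trie's keys() accepts a
# prefix, so a partial search term expands to all matching tag words and their item
# ids. match_tag_prefix is a hypothetical helper, not part of the code above.
def match_tag_prefix(prefix):
    ids = set()
    for word in database.keys(prefix.casefold()):
        ids |= word_to_ids[word]
    return ids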
def build(db, entity_db, min_word_count, min_entity_count):
    word_counter = Counter()
    entity_counter = Counter()

    tokenizer = RegexpTokenizer()

    with click.progressbar(db.keys()) as bar:
        for title in bar:
            obj = db[title]
            text = obj['text']
            tokens = tokenizer.tokenize(text)
            word_counter.update(t.text.lower() for t in tokens)

            for (_, title, _) in obj['links']:
                title = entity_db.resolve_redirect(title)
                entity_counter[title] += 1

    word_dict = Trie(
        [w for (w, c) in word_counter.iteritems() if c >= min_word_count])
    entity_dict = Trie([
        e for (e, c) in entity_counter.iteritems() if c >= min_entity_count
    ])

    return Vocab(word_dict, entity_dict)
def build(dump_file, pool_size, chunk_size):
    dump_reader = WikiDumpReader(dump_file)

    global _extractor
    _extractor = WikiExtractor()

    titles = []
    redirects = {}
    title_counter = Counter()

    with closing(Pool(pool_size)) as pool:
        for (page, links) in pool.imap_unordered(_process_page, dump_reader,
                                                 chunksize=chunk_size):
            titles.append(normalize(page.title))
            if page.is_redirect:
                redirects[normalize(page.title)] = page.redirect

            for link_obj in links:
                title_counter[normalize(link_obj.title)] += 1

    title_dict = Trie(titles)

    redirect_items = []
    for (title, dest_title) in redirects.items():
        if dest_title in title_dict:
            redirect_items.append((title, (title_dict[dest_title],)))

    redirect_dict = RecordTrie('<I', redirect_items)

    delete_keys = []
    keys = list(title_counter.keys())
    for key in keys:
        title = key
        count = title_counter[key]
        dest_obj = redirect_dict.get(title)
        if dest_obj is not None:
            title_counter[title_dict.restore_key(dest_obj[0][0])] += count
            del title_counter[title]

    # np.int64 instead of the removed np.int alias
    inlink_arr = np.zeros(len(title_dict), dtype=np.int64)
    for (title, count) in title_counter.items():
        title_index = title_dict.get(title)
        if title_index is not None:
            inlink_arr[title_index] = count

    return EntityDB(title_dict, redirect_dict, inlink_arr)
def build(wiki_data_file: str, target_languages: List[str] = None):
    data = []
    indptr = [0]
    titles = []
    title_indices = []

    with bz2.BZ2File(wiki_data_file) as f:
        for (n, line) in enumerate(f):
            if n % 1000 == 0 and n != 0:
                logger.info("Processed %d lines", n)

            line = line.rstrip().decode("utf-8")
            if line in ("[", "]"):
                continue

            if line[-1] == ",":
                line = line[:-1]
            obj = ujson.loads(line)
            if obj["type"] != "item":
                continue

            for link_obj in obj["sitelinks"].values():
                site = link_obj["site"]
                if not site.endswith("wiki"):
                    continue
                lang = site[:-4]
                if target_languages and lang not in target_languages:
                    continue

                title_indices.append(len(indptr) - 1)
                data.append(len(titles))
                title = "%s:%s" % (link_obj["title"], lang)
                titles.append(title)

            indptr.append(len(data))

    title_trie = Trie(titles)
    # np.int64 instead of the removed np.int alias
    data = np.fromiter((title_trie[titles[n]] for n in data), dtype=np.int64)
    indptr = np.array(indptr, dtype=np.int64)
    new_title_indices = np.empty(len(titles), dtype=np.int64)
    for (title, index) in zip(titles, title_indices):
        new_title_indices[title_trie[title]] = index

    return InterwikiDB(title_trie, data, indptr, new_title_indices)
def tcut(text):
    # global last_p, i, q, ww  # for debug
    trie = Trie(get_data())
    words_at = defaultdict(list)  # main data structure

    def serialize(p, p2):  # helper function
        for w in words_at[p]:
            p_ = p + len(w)
            if p_ == p2:
                yield w
            elif p_ < p2:
                for path in serialize(p_, p2):
                    yield w + '/' + path

    q = {0}
    last_p = 0  # last position already yielded

    while min(q) < len(text):
        p = min(q)
        q -= {p}  # q.pop, but for a set

        for w in trie.prefixes(text[p:]):
            words_at[p].append(w)
            q.add(p + len(w))

        if len(q) == 1:
            q0 = min(q)
            yield LatticeString(text[last_p:q0], serialize(last_p, q0))
            last_p = q0

        # len(q) == 0 means the text is not in the dictionary
        if len(q) == 0:
            # skip ahead as little as possible
            for i in range(p, len(text)):
                ww = trie.prefixes(text[i:])
                if ww:
                    break
            else:
                i = len(text)
            w = text[p:i]
            w = w.replace(' ', '')  # drop whitespace
            words_at[p].append(w)
            yield LatticeString(w, in_dict=False)
            last_p = i
            q.add(i)