Example #1
def dict_trie(dict_source: Union[str, Iterable[str], Trie]) -> Trie:
    """
    Create a dictionary trie to be used by the word_tokenize() function.
    For more information on the trie data structure,
    see: https://marisa-trie.readthedocs.io/en/latest/index.html

    :param str/Iterable/Trie dict_source: a list of vocabulary words, a path to a source file, or a pre-built trie
    :return: a trie created from a dictionary input
    """
    trie = None

    if isinstance(dict_source, Trie):
        # Received a pre-built trie; use it as-is
        trie = dict_source
    elif isinstance(dict_source, str):
        # Received a file path of the dictionary to read, one word per line
        with open(dict_source, "r", encoding="utf8") as f:
            _vocabs = f.read().splitlines()
            trie = Trie(_vocabs)
    elif isinstance(dict_source, Iterable):
        # Received a sequence of vocabulary words
        trie = Trie(dict_source)
    else:
        raise TypeError(
            "Type of dict_source must be marisa_trie.Trie, or Iterable[str], or str (path to source file)"
        )

    return trie
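For reference, a minimal usage sketch of dict_trie(), assuming marisa_trie.Trie is the Trie in scope (as the error message above suggests); the word list and file name are purely illustrative:

from marisa_trie import Trie  # assumed import

words = ["foo", "foobar", "bar"]   # hypothetical vocabulary
trie = dict_trie(words)            # build a trie from an in-memory word list
print("foobar" in trie)            # True: membership lookup on the trie

# A path to a UTF-8 text file with one word per line also works:
# trie = dict_trie("custom_words.txt")  # hypothetical file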
Example #2
def train(corpus_file, out_file, mode, dim_size, window, min_count,
          negative, epoch, pool_size, chunk_size):
    with bz2.BZ2File(corpus_file) as f:
        sentences = LineSentence(f)
        sg = int(mode == 'sg')

        model = Word2Vec(sentences, size=dim_size, window=window, min_count=min_count,
                         workers=pool_size, iter=epoch, negative=negative, sg=sg)

    words = []
    entities = []
    for (w, _) in model.vocab.iteritems():
        if w.startswith(MARKER):
            entities.append(w[len(MARKER):].replace(u'_', u' '))
        else:
            words.append(w)

    vocab = Vocab(Trie(words), Trie(entities))

    word_embedding = np.zeros((len(words), dim_size), dtype=np.float32)
    entity_embedding = np.zeros((len(entities), dim_size), dtype=np.float32)
    for word in words:
        word_embedding[vocab.get_word_index(word)] = model[word]
    for entity in entities:
        entity_embedding[vocab.get_entity_index(entity)] = model[MARKER + entity.replace(u' ', u'_')]

    ret = dict(
        word_embedding=word_embedding,
        entity_embedding=entity_embedding,
        vocab=vocab,
    )
    joblib.dump(ret, out_file, compress=False)
Example #3
def create_custom_dict_trie(custom_dict_source):
	"""The function is used to create a custom dict trie which will be
	used by the word_tokenize() function
	
	Arguments:
		custom_dict_source {string or list} -- a list of vocabulary words or a path to a source file
	
	Raises:
		TypeError -- Invalid custom_dict_source object type
	
	Returns:
		Trie -- A trie created from custom dict input
	"""

	if type(custom_dict_source) is str:
		# Receive a file path of the custom dict to read
		with codecs.open(custom_dict_source, 'r',encoding='utf8') as f:
			_vocabs = f.read().splitlines()
			return Trie(_vocabs)
	elif isinstance(custom_dict_source, (list, tuple, set)):
		# Received a sequence type object of vocabs
		return Trie(custom_dict_source)
	else:
		raise TypeError(
			'Type of custom_dict_source must be either str (path to source file) or a collection of strings (list, tuple, or set)'
		)
Example #4
    def load(in_file, mmap=True):
        word_dict = Trie()
        entity_dict = Trie()

        word_dict.mmap(in_file + '_word.trie')
        entity_dict.mmap(in_file + '_entity.trie')

        return Vocab(word_dict, entity_dict)
Example #5
    def __init__(self, *args, **kwargs):
        self.trie = Trie()
        self.secondary_trie = set()
        self.book_trie = Trie()
        path = os.path.dirname(os.path.realpath(__file__)) + "/"
        if 'save' in kwargs:
            self.trie.load(kwargs['save'])
        else:
            with open(kwargs['words'], encoding='utf-8') as fp:
                keys = fp.read().splitlines()
                self.trie = Trie(keys)

        self.preprocess(kwargs['alphabet'])
Example #6
def create_vocabulary(ngram=1, test=False):
    """ Creates  the vocabulary for ngram level

    :param ngram:
    :param test: If true, only runs through the first 10000 documents.
    :return:

    Steps:
    - Get a set of all tokens
    - Retain only the valid ones
    """

    add_valid_words()
    # get a set of all tokens of the ngram level

    print("here")

    token_set = get_all_tokens_in_docs(ngram, test)
    print("Total tokens before merging: ", len(token_set))

    valid_iterator = valid_ngram_iterator(token_set, ngram)

    vocabulary_trie = Trie(valid_iterator)
    vocabulary_trie.save(PATH_TOKENIZED + 'tries/full_vocabulary_{}_grams.trie'.format(ngram))
    print("Total tokens after merging", len(vocabulary_trie))
Example #7
    def __init__(self, word_file):
        # TODO: Check input file exists, is readable, valid, etc
        words = []
        with open(word_file) as input_file:
            for word in input_file:
                words.append(word.lower().strip())
        self.trie = Trie(words)
Example #8
    def load(in_file: str, mmap_mode="r"):
        data = joblib.load(in_file, mmap_mode=mmap_mode)
        title_trie = Trie()
        title_trie = title_trie.frombytes(data["title_trie"])
        data["title_trie"] = title_trie

        return InterwikiDB(**data)
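The loader above expects the trie to have been serialized with Trie.tobytes() before being stored with joblib. A minimal sketch of that save side, assuming marisa_trie and showing only the title_trie field; the titles and file name are made up:

import joblib
from marisa_trie import Trie

title_trie = Trie(["Tokyo:en", "Kyoto:ja"])    # hypothetical interwiki titles
data = dict(title_trie=title_trie.tobytes())   # serialize the trie to raw bytes
# ... the other InterwikiDB fields would also go into `data` here ...
joblib.dump(data, "interwiki.joblib")          # hypothetical output file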
Example #9
    def __init__(self, dic, start_index=0):
        if isinstance(dic, Trie):
            self._dic = dic
        else:
            self._dic = Trie(dic)

        self._start_index = start_index
Example #10
    def build(description_db,
              entity_db,
              white_list,
              start_index,
              min_inlink_count,
              target_vocab=None):
        counter = Counter()
        db_titles = set()

        for (title, _, titles) in description_db.iterator():
            if target_vocab is not None and title not in target_vocab:
                continue

            counter.update(titles)
            db_titles.add(title)

        title_list = [
            t for (t, c) in counter.iteritems() if c >= min_inlink_count
        ]

        white_list = [entity_db.resolve_redirect(t) for t in white_list]
        white_list = [t for t in white_list if t in db_titles]

        title_list = set(title_list + white_list)

        return EntityVocab(Trie(title_list), start_index)
Example #11
def generate_word_square(n: int, letters: str) -> list:
    assert n > 0, "Invalid square"
    words = get_anagrams(n, letters)
    # Trie - https://en.wikipedia.org/wiki/Trie
    t = Trie(words)
    result = recurse_generate([], t, n, 0)
    print(result)
    return result
Example #12
    def __init__(self, custom_dict=None):
        """
        Initialize tokenizer object

        :param str custom_dict: a file path or a list of vocabulary words to be used to create a trie (default - original lexitron)

        :return: trie_dict - a dictionary in the form of trie data for tokenizing engines
        """
        if custom_dict:
            if type(custom_dict) is list:
                self.trie_dict = Trie(custom_dict)
            elif type(custom_dict) is str:
                with codecs.open(custom_dict, "r", encoding="utf8") as f:
                    vocabs = f.read().splitlines()
                self.trie_dict = Trie(vocabs)
        else:
            self.trie_dict = Trie(word_dict())
Example #13
    def load(target, device, mmap=True):
        word_dict = Trie()
        entity_dict = Trie()
        redirect_dict = RecordTrie("<I")

        if not isinstance(target, dict):
            if mmap:
                target = joblib.load(target, mmap_mode="r")
            else:
                target = joblib.load(target)

        word_dict.frombytes(target["word_dict"])
        entity_dict.frombytes(target["entity_dict"])
        redirect_dict.frombytes(target["redirect_dict"])

        word_stats = target["word_stats"]
        entity_stats = target["entity_stats"]
        if not isinstance(word_stats, np.ndarray):
            word_stats = np.frombuffer(
                word_stats,
                dtype=np.int32,
            ).reshape(-1, 2)
            word_stats = torch.tensor(
                word_stats,
                device=device,
                requires_grad=False,
            )
            entity_stats = np.frombuffer(
                entity_stats,
                dtype=np.int32,
            ).reshape(-1, 2)
            entity_stats = torch.tensor(
                entity_stats,
                device=device,
                requires_grad=False,
            )

        return Wikipedia2VecDict(
            word_dict,
            entity_dict,
            redirect_dict,
            word_stats,
            entity_stats,
            **target["meta"],
        )
Example #14
    def load(input):
        if isinstance(input, dict):
            obj = input
        else:
            obj = joblib.load(input)

        dic = Trie()
        dic.frombytes(obj['dic'])
        return WordVocab(dic, obj['lowercase'], obj.get('start_index', 0))
Example #15
    def load(in_file, mmap_mode='r'):
        obj = joblib.load(in_file, mmap_mode=mmap_mode)

        title_dict = Trie()
        redirect_dict = RecordTrie('<I')
        title_dict.frombytes(obj['title_dict'])
        redirect_dict.frombytes(obj['redirect_dict'])

        return EntityDB(title_dict, redirect_dict, obj['inlink_arr'])
Example #16
    def load(in_file, mmap=True):
        title_dict = Trie()
        redirect_dict = RecordTrie('<I')

        title_dict.mmap(in_file + '_title.trie')
        redirect_dict.mmap(in_file + '_redirect.trie')
        inlink_arr = np.load(in_file + '_prior.npy', mmap_mode='r')

        return EntityDB(title_dict, redirect_dict, inlink_arr)
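Examples #4 and #16 memory-map tries that must already exist on disk. A minimal sketch of the corresponding save step, assuming marisa_trie's Trie/RecordTrie and using made-up titles with an in_file prefix of "entity_db":

from marisa_trie import Trie, RecordTrie

title_dict = Trie(["Tokyo", "Kyoto"])                                # hypothetical titles
redirect_dict = RecordTrie('<I', [("Edo", (title_dict["Tokyo"],))])  # redirect -> title id

title_dict.save("entity_db_title.trie")        # later loaded via Trie().mmap(...)
redirect_dict.save("entity_db_redirect.trie")  # later loaded via RecordTrie('<I').mmap(...)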
Example #17
	def __init__(self, custom_dict=None):
		"""
		Initialize tokenizer object
		
		Keyword arguments:
		custom_dict -- a file path or a list of vocabulary words to be used to create a trie (default - original lexitron)

		Object variables:
		trie_dict -- a trie to use in tokenizing engines
		"""
		if custom_dict:
			if type(custom_dict) is list:
				self.trie_dict = Trie(custom_dict)
			elif type(custom_dict) is str:
				with codecs.open(custom_dict, 'r',encoding='utf8') as f:
					vocabs = f.read().splitlines()
				self.trie_dict = Trie(vocabs)
		else:
			self.trie_dict = Trie(get_dict())
Example #18
def create_custom_dict_trie(custom_dict_source):
    """The function is used to create a custom dict trie which will be used for word_tokenize() function. For more information on the trie data structure, see: https://marisa-trie.readthedocs.io/en/latest/index.html

    :param string/list custom_dict_source:  a list of vocaburaries or a path to source file

    :return: A trie created from custom dict input
    """

    if type(custom_dict_source) is str:
        # Receive a file path of the custom dict to read
        with codecs.open(custom_dict_source, "r", encoding="utf8") as f:
            _vocabs = f.read().splitlines()
            return Trie(_vocabs)
    elif isinstance(custom_dict_source, (list, tuple, set)):
        # Received a sequence type object of vocabs
        return Trie(custom_dict_source)
    else:
        raise TypeError(
            "Type of custom_dict_source must be either str (path to source file) or collections"
        )
Example #19
    def __init__(self, args):
        self.args = args
        self.all_titles = self._all_titles_collector()
        self.redirects = _extract_pages(self.args.path_for_raw_xml)
        self.nlp = nlp_returner(args=self.args)

        self.entity_dict = Trie(self.all_titles)

        self.redirect_dict = RecordTrie(
            '<I', [(title, (self.entity_dict[dest_title], ))
                   for (title, dest_title) in self.redirects
                   if dest_title in self.entity_dict])
Example #20
    def __init__(self):
        if not os.path.exists(self.URI_PREFIXES_FN):
            ensurePathExists(self.URI_PREFIXES_FN)
            open(self.URI_PREFIXES_FN, 'w').close()  # create the empty prefixes file and close it

        if not os.path.exists(self.CACHE_SHELVE_FN):
            ensurePathExists(self.CACHE_SHELVE_FN)

        cache = self._openShelve('c')
        cache.close()

        with open(self.URI_PREFIXES_FN, 'r') as f:
            prefixList = [line.strip() for line in f]
        self._uriPrefixes = Trie(prefixList)
Example #21
def train(corpus_file, mode, dim_size, window, min_count, negative, epoch,
          workers):
    with bz2.BZ2File(corpus_file) as f:
        sentences = LineSentence(f)
        sg = int(mode == 'sg')

        model = Word2Vec(sentences,
                         size=dim_size,
                         window=window,
                         min_count=min_count,
                         workers=workers,
                         iter=epoch,
                         negative=negative,
                         sg=sg)

    words = []
    entities = []
    for (w, _) in model.vocab.iteritems():
        if w.startswith(MARKER):
            entities.append(w[len(MARKER):].replace(u'_', u' '))
        else:
            words.append(w)

    word_vocab = WordVocab(Trie(words), lowercase=True)
    entity_vocab = EntityVocab(Trie(entities))

    word_embedding = np.zeros((len(words), dim_size), dtype=np.float32)
    entity_embedding = np.zeros((len(entities), dim_size), dtype=np.float32)
    for word in words:
        ind = word_vocab.get_index(word)
        if ind is not None:
            word_embedding[ind] = model[word]

    for entity in entities:
        entity_embedding[entity_vocab.get_index(entity)] = model[
            MARKER + entity.replace(u' ', u'_')]

    return EmbeddingReader(word_embedding, entity_embedding, word_vocab,
                           entity_vocab)
Example #22
    def build(db, entity_db, min_word_count, min_entity_count, white_list, pool_size, chunk_size):
        word_counter = Counter()
        entity_counter = Counter()

        if white_list is not None:
            with open(white_list, 'r') as f:
                white_list = json.load(f)

        tokenizer = RegexpTokenizer()

        with click.progressbar(db.keys()) as bar:
            for title in bar:
                obj = db[title]
                text = obj['text']
                tokens = tokenizer.tokenize(text)

                word_counter.update(t.text.lower() for t in tokens)

                for (_, title, _) in obj['links']:
                    title = entity_db.resolve_redirect(title)
                    entity_counter[title] += 1

        word_dict = Trie([w.lower() for (w, c) in word_counter.items()
                          if c >= min_word_count])

        if white_list is None:
            entity_dict = Trie([e.lower() for (e, c) in entity_counter.items()
                            if c >= min_entity_count])
        else:
            entity_dict = Trie([e.lower() for (e, c) in entity_counter.items()
                            if c >= min_entity_count] + white_list)

        entities = []
        entities_dict = Trie(entities + entity_dict.keys())

        return Vocab(word_dict, entities_dict)
Example #23
def onecut(text, data=['']):
    if (data != ['']):
        trie = Trie(data)
    else:
        trie = THAI_WORDS
    graph = defaultdict(list)  # main data structure
    allow_pos = tcc_pos(text)  # cut positions must align with TCC cluster boundaries

    q = [0]  # min-heap queue
    last_p = 0  # last position for yield
    while q[0] < len(text):
        p = heappop(q)

        for w in trie.prefixes(text[p:]):
            p_ = p + len(w)
            if p_ in allow_pos:  # keep only cuts consistent with TCC
                graph[p].append(p_)
                if p_ not in q:
                    heappush(q, p_)

        # when the queue has length 1, the segmentation so far is unambiguous,
        # so the results accumulated up to this point can be yielded
        if len(q) == 1:
            pp = next(bfs_paths_graph(graph, last_p, q[0]))
            # initially last_p == pp[0]
            for p in pp[1:]:
                yield text[last_p:p]
                last_p = p
            # at the end, last_p == q[0]

        # when the queue has length 0, the next chunk is not in the dictionary
        if len(q) == 0:
            m = pat_eng.match(text[p:])
            if m:  # English letters, digits, or whitespace
                i = p + m.end()
            else:  # skip ahead by as little as possible
                for i in range(p + 1, len(text)):
                    if i in allow_pos:  # also respect TCC boundaries
                        ww = [
                            w for w in trie.prefixes(text[i:])
                            if (i + len(w) in allow_pos)
                        ]
                        m = pat_eng.match(text[i:])
                        if ww or m:
                            break
                else:
                    i = len(text)
            w = text[p:i]
            graph[p].append(i)
            yield w
            last_p = i
            heappush(q, i)
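Both of the tokenizers in this section (onecut() above and tcut() in Example #30) hinge on Trie.prefixes(), which returns every dictionary entry that is a prefix of the given string. A small self-contained illustration, assuming marisa_trie's Trie and a made-up dictionary:

from marisa_trie import Trie

toy = Trie(["ta", "tak", "take"])   # hypothetical dictionary
print(toy.prefixes("takeover"))     # all dictionary words that are prefixes of "takeover":
                                    # ['ta', 'tak', 'take'] (order may vary)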
Example #24
def load_password_blacklist():
    global password_blackList
    if conf.password_blackList == 'NOBLACKLIST':
        LOGGER.warning('No password blacklist file defined.')
        password_blackList = Trie()
        return

    if os.path.isfile('compiledPwdBlacklist.bin'):
        LOGGER.info('Loading pre-compiled password blacklist...')
        password_blackList = Trie()
        password_blackList.load('compiledPwdBlacklist.bin')

    else:
        try:
            LOGGER.info('Compiling password blacklist...')
            with open(conf.password_blackList, encoding="utf-8") as f:
                pwds = f.read().splitlines()
                password_blackList = Trie(pwds)
            password_blackList.save('compiledPwdBlacklist.bin')
        except FileNotFoundError:
            LOGGER.error('File ' + conf.password_blackList +
                         ' not found. Aborting.')
            exit(-1)
Example #25
def add_terms():

    for ngram in range(1, 3):
        # update vocabulary trie
        # this invalidates the old ids, but they are no longer needed since the doc-term matrices are no longer used
        start = time.time()
        vocabulary = load_vocabulary_trie(ngram)
        keys = vocabulary.keys() + ADDED_TOKENS[ngram]
        vocabulary_new = Trie(keys)
        vocabulary_new.save(PATH_TOKENIZED + 'tries/full_vocabulary_{}_grams.trie'.format(ngram))

        full_db_to_tokens(ngram, add_new_terms=set(ADDED_TOKENS[ngram]))
        print("adding new tokens for {}-gram took {}.".format(ngram, time.time() - start))
Example #26
def rebuild_database() -> None:
    """Rebuild the search database."""
    global database
    LOGGER.info('Updating search database...')
    # Clear and reset.
    word_to_ids.clear()

    for item in UI.item_list.values():
        for subtype_ind in item.visual_subtypes:
            for tag in item.get_tags(subtype_ind):
                for word in tag.split():
                    word_to_ids[word.casefold()].add((item.id, subtype_ind))
    database = Trie(word_to_ids.keys())
    LOGGER.debug('Tags: {}', database.keys())
    _type_cback()
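The trie rebuilt above is presumably queried by prefix as the user types; marisa_trie supports this directly through Trie.keys(prefix). A minimal sketch, assuming marisa_trie is the Trie in use and with made-up tags:

from marisa_trie import Trie

database = Trie(["portal", "portable", "paint"])   # hypothetical search tags
print(database.keys("por"))   # all tags starting with "por": ['portal', 'portable']
                              # (order of the returned keys may vary)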
Example #27
    def build(db, entity_db, min_word_count, min_entity_count):
        word_counter = Counter()
        entity_counter = Counter()

        tokenizer = RegexpTokenizer()
        with click.progressbar(db.keys()) as bar:
            for title in bar:
                obj = db[title]
                text = obj['text']
                tokens = tokenizer.tokenize(text)

                word_counter.update(t.text.lower() for t in tokens)

                for (_, title, _) in obj['links']:
                    title = entity_db.resolve_redirect(title)
                    entity_counter[title] += 1

        word_dict = Trie(
            [w for (w, c) in word_counter.iteritems() if c >= min_word_count])
        entity_dict = Trie([
            e for (e, c) in entity_counter.iteritems() if c >= min_entity_count
        ])

        return Vocab(word_dict, entity_dict)
Example #28
    def build(dump_file, pool_size, chunk_size):
        dump_reader = WikiDumpReader(dump_file)

        global _extractor
        _extractor = WikiExtractor()

        titles = []
        redirects = {}
        title_counter = Counter()

        with closing(Pool(pool_size)) as pool:
            for (page, links) in pool.imap_unordered(_process_page,
                                                     dump_reader,
                                                     chunksize=chunk_size):
                titles.append(normalize(page.title))
                if page.is_redirect:
                    redirects[normalize(page.title)] = page.redirect

                for link_obj in links:
                    title_counter[normalize(link_obj.title)] += 1

        title_dict = Trie(titles)

        redirect_items = []
        for (title, dest_title) in redirects.items():
            if dest_title in title_dict:
                redirect_items.append((title, (title_dict[dest_title], )))

        redirect_dict = RecordTrie('<I', redirect_items)

        # Fold inlink counts of redirect pages into their destination titles
        for title in list(title_counter.keys()):
            count = title_counter[title]
            dest_obj = redirect_dict.get(title)
            if dest_obj is not None:
                title_counter[title_dict.restore_key(dest_obj[0][0])] += count
                del title_counter[title]

        inlink_arr = np.zeros(len(title_dict), dtype=np.int64)
        for (title, count) in title_counter.items():
            title_index = title_dict.get(title)
            if title_index is not None:
                inlink_arr[title_index] = count

        return EntityDB(title_dict, redirect_dict, inlink_arr)
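Example #28 (and several loaders above) packs each redirect as a record with the struct format '<I', i.e. one little-endian unsigned 32-bit integer holding the destination title's id in title_dict. A minimal sketch of how such records are written and read back, with made-up titles:

from marisa_trie import Trie, RecordTrie

title_dict = Trie(["Tokyo"])                              # hypothetical title trie
redirect_dict = RecordTrie('<I', [("Edo", (title_dict["Tokyo"],))])

records = redirect_dict.get("Edo")                        # [(id,)] - a list of unpacked records
print(title_dict.restore_key(records[0][0]))              # 'Tokyo'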
Example #29
    def build(wiki_data_file: str, target_languages: List[str] = None):
        data = []
        indptr = [0]
        titles = []
        title_indices = []

        with bz2.BZ2File(wiki_data_file) as f:
            for (n, line) in enumerate(f):
                if n % 1000 == 0 and n != 0:
                    logger.info("Processed %d lines", n)

                line = line.rstrip().decode("utf-8")
                if line in ("[", "]"):
                    continue

                if line[-1] == ",":
                    line = line[:-1]
                obj = ujson.loads(line)
                if obj["type"] != "item":
                    continue

                for link_obj in obj["sitelinks"].values():
                    site = link_obj["site"]
                    if not site.endswith("wiki"):
                        continue
                    lang = site[:-4]
                    if target_languages and lang not in target_languages:
                        continue

                    title_indices.append(len(indptr) - 1)
                    data.append(len(titles))

                    title = "%s:%s" % (link_obj["title"], lang)
                    titles.append(title)

                indptr.append(len(data))

        title_trie = Trie(titles)
        data = np.fromiter((title_trie[titles[n]] for n in data), dtype=np.int64)
        indptr = np.array(indptr, dtype=np.int64)
        new_title_indices = np.empty(len(titles), dtype=np.int64)
        for (title, index) in zip(titles, title_indices):
            new_title_indices[title_trie[title]] = index

        return InterwikiDB(title_trie, data, indptr, new_title_indices)
Example #30
def tcut(text):
    #global last_p, i, q, ww   # for debug
    trie = Trie(get_data())
    words_at = defaultdict(list)  # main data structure

    def serialize(p, p2):  # helper function
        for w in words_at[p]:
            p_ = p + len(w)
            if p_ == p2:
                yield w
            elif p_ < p2:
                for path in serialize(p_, p2):
                    yield w + '/' + path

    q = {0}
    last_p = 0  # last position for yield
    while min(q) < len(text):
        p = min(q)
        q -= {p}  # q.pop, but for set

        for w in trie.prefixes(text[p:]):
            words_at[p].append(w)
            q.add(p + len(w))

        if len(q) == 1:
            q0 = min(q)
            yield LatticeString(text[last_p:q0], serialize(last_p, q0))
            last_p = q0

        # when len(q) == 0, the next chunk is not in the dictionary
        if len(q) == 0:
            # skip ahead by as little as possible
            for i in range(p, len(text)):
                ww = trie.prefixes(text[i:])
                if ww:
                    break
            else:
                i = len(text)
            w = text[p:i]
            w = w.replace(' ', '')  # strip spaces
            words_at[p].append(w)
            yield LatticeString(w, in_dict=False)
            last_p = i
            q.add(i)