Example no. 1
    def load(in_file: str, mmap_mode="r"):
        data = joblib.load(in_file, mmap_mode=mmap_mode)
        title_trie = Trie()
        title_trie = title_trie.frombytes(data["title_trie"])
        data["title_trie"] = title_trie

        return InterwikiDB(**data)
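
For reference, here is a minimal round-trip sketch of the tobytes()/frombytes() serialization pattern this loader relies on (standalone and not InterwikiDB-specific; the sample words are made up):

from marisa_trie import Trie

trie = Trie(["alpha", "beta", "gamma"])
blob = trie.tobytes()        # raw bytes, suitable for storing inside a joblib/pickled dict

restored = Trie()
restored.frombytes(blob)     # loads in place; the example above also uses its return value
assert "beta" in restored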
Example no. 2
def dict_trie(dict_source: Union[str, Iterable[str], Trie]) -> Trie:
    """
    Create a dict trie which will be used for word_tokenize() function.
    For more information on the trie data structure,
    see: https://marisa-trie.readthedocs.io/en/latest/index.html

    :param string/list dict_source: a list of vocabularies or a path to the source file
    :return: a trie created from a dictionary input
    """
    trie = None

    # Check for a Trie before the generic Iterable check so an existing Trie is reused as-is
    if isinstance(dict_source, Trie):
        trie = dict_source
    elif isinstance(dict_source, str):
        # Received a file path of the dict to read
        with open(dict_source, "r", encoding="utf8") as f:
            _vocabs = f.read().splitlines()
            trie = Trie(_vocabs)
    elif isinstance(dict_source, Iterable):
        # Received a sequence type object of vocabs
        trie = Trie(dict_source)
    else:
        raise TypeError(
            "Type of dict_source must be marisa_trie.Trie, or Iterable[str], or str (path to source file)"
        )

    return trie
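
A hypothetical usage of the dict_trie() helper above, assuming the imports it needs (marisa_trie.Trie and typing's Union/Iterable) are in scope; the words and the file path are made up:

trie = dict_trie(["ม้า", "ม้าน้ำ", "น้ำ"])    # from an iterable of words
print("ม้าน้ำ" in trie)                        # True

same_trie = dict_trie(trie)                    # an existing Trie is returned as-is
# trie_from_file = dict_trie("wordlist.txt")   # or from a one-word-per-line file (hypothetical path)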
Example no. 3
 def __init__(self, word_file):
     # TODO: Check input file exists, is readable, valid, etc
     words = []
     with open(word_file) as input_file:
         for word in input_file:
             words.append(word.lower().strip())
     self.trie = Trie(words)
Example no. 4
def train(corpus_file, out_file, mode, dim_size, window, min_count,
          negative, epoch, pool_size, chunk_size):
    with bz2.BZ2File(corpus_file) as f:
        sentences = LineSentence(f)
        sg = int(mode == 'sg')

        model = Word2Vec(sentences, size=dim_size, window=window, min_count=min_count,
                         workers=pool_size, iter=epoch, negative=negative, sg=sg)

    words = []
    entities = []
    for (w, _) in model.vocab.iteritems():
        if w.startswith(MARKER):
            entities.append(w[len(MARKER):].replace(u'_', u' '))
        else:
            words.append(w)

    vocab = Vocab(Trie(words), Trie(entities))

    word_embedding = np.zeros((len(words), dim_size), dtype=np.float32)
    entity_embedding = np.zeros((len(entities), dim_size), dtype=np.float32)
    for word in words:
        word_embedding[vocab.get_word_index(word)] = model[word]
    for entity in entities:
        entity_embedding[vocab.get_entity_index(entity)] = model[MARKER + entity.replace(u' ', u'_')]

    ret = dict(
        word_embedding=word_embedding,
        entity_embedding=entity_embedding,
        vocab=vocab,
    )
    joblib.dump(ret, out_file, compress=False)
Example no. 5
    def __init__(self, dic, start_index=0):
        if isinstance(dic, Trie):
            self._dic = dic
        else:
            self._dic = Trie(dic)

        self._start_index = start_index
Example no. 6
def create_custom_dict_trie(custom_dict_source):
	"""The function is used to create a custom dict trie which will be
	used for word_tokenize() function
	
	Arguments:
		custom_dict_source {string or list} -- a list of vocabularies or a path to the source file
	
	Raises:
		TypeError -- invalid custom_dict_source object type
	
	Returns:
		Trie -- A trie created from custom dict input
	"""

	if type(custom_dict_source) is str:
		# Receive a file path of the custom dict to read
		with codecs.open(custom_dict_source, 'r',encoding='utf8') as f:
			_vocabs = f.read().splitlines()
			return Trie(_vocabs)
	elif isinstance(custom_dict_source, (list, tuple, set)):
		# Received a sequence type object of vocabs
		return Trie(custom_dict_source)
	else:
		raise TypeError(
			'Type of custom_dict_source must be either str (path to source file) or collections'
		)
Example no. 7
def create_vocabulary(ngram=1, test=False):
    """ Creates  the vocabulary for ngram level

    :param ngram:
    :param test: If true, only runs through the first 10000 documents.
    :return:

    Steps:
    - Get a set of all tokens
    - Retain only the valid ones
    """

    add_valid_words()
    # get a set of all tokens of the ngram level

    print("here")

    token_set = get_all_tokens_in_docs(ngram, test)
    print("Total tokens before merging: ", len(token_set))

    valid_iterator = valid_ngram_iterator(token_set, ngram)

    vocabulary_trie = Trie(valid_iterator)
    vocabulary_trie.save(PATH_TOKENIZED + 'tries/full_vocabulary_{}_grams.trie'.format(ngram))
    print("Total tokens after merging", len(vocabulary_trie))
Example no. 8
def recurse_generate(words: list,
                     trie: Trie,
                     square_size: int,
                     chosen_words_length=0) -> list:
    if chosen_words_length >= square_size or square_size <= 1:
        if check_solution_is_valid(words, square_size):
            return words
        return None

    # build up the solution letter by letter
    # on each iteration we check if the substring is a key inside the Trie
    # if not a key then we know the current permutation is not a solution so return None
    # loop through the characters

    for i in range(chosen_words_length, square_size):
        prefix = "".join(word[i] for word in words)
        # using the soon to be deprecated function because it runs ~30% faster
        if not trie.has_keys_with_prefix(prefix):
            return None

    prefix = "".join(word[chosen_words_length] for word in words)
    # we use a prefix to dictate which key to start going over
    for word in trie.iterkeys(prefix):
        new_list = words + [word]
        res = recurse_generate(new_list, trie, square_size,
                               chosen_words_length + 1)
        if res:
            return res

    return None
Example no. 9
    def load(in_file, mmap_mode='r'):
        obj = joblib.load(in_file, mmap_mode=mmap_mode)

        title_dict = Trie()
        redirect_dict = RecordTrie('<I')
        title_dict.frombytes(obj['title_dict'])
        redirect_dict.frombytes(obj['redirect_dict'])

        return EntityDB(title_dict, redirect_dict, obj['inlink_arr'])
Example no. 10
    def load(in_file, mmap=True):
        title_dict = Trie()
        redirect_dict = RecordTrie('<I')

        title_dict.mmap(in_file + '_title.trie')
        redirect_dict.mmap(in_file + '_redirect.trie')
        inlink_arr = np.load(in_file + '_prior.npy', mmap_mode='r')

        return EntityDB(title_dict, redirect_dict, inlink_arr)
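
A minimal sketch of the save()/mmap() pattern that this loader depends on, assuming only the marisa-trie package; the file name is arbitrary:

from marisa_trie import Trie

Trie(["foo", "bar"]).save("example_title.trie")   # persist the trie to disk

t = Trie()
t.mmap("example_title.trie")   # memory-map the file instead of reading it into RAM
assert "foo" in t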
Example no. 11
    def load(in_file, mmap=True):
        title_dict = Trie()
        redirect_dict = RecordTrie('<I')

        title_dict.mmap(in_file + '_title.trie')
        redirect_dict.mmap(in_file + '_redirect.trie')
        inlink_arr = np.load(in_file + '_prior.npy', mmap_mode='r')

        return EntityDB(title_dict, redirect_dict, inlink_arr)
Example no. 12
    def load(input):
        if isinstance(input, dict):
            obj = input
        else:
            obj = joblib.load(input)

        dic = Trie()
        dic.frombytes(obj['dic'])
        return WordVocab(dic, obj['lowercase'], obj.get('start_index', 0))
Example no. 13
def loadTrie(fname):
    global trie
    try:
        fname = fname + "_trie.hny"
        trie.load(fname)
    except IOError:
        f = bz2.BZ2File(dir_path + sys.argv[1])
        words = [w.strip() for w in f.readlines()]
        trie = Trie(words)
        trie.save(fname)
Example no. 14
    def __init__(self, args):
        self.args = args
        self.all_titles = self._all_titles_collector()
        self.redirects = _extract_pages(self.args.path_for_raw_xml)
        self.nlp = nlp_returner(args=self.args)

        self.entity_dict = Trie(self.all_titles)

        self.redirect_dict = RecordTrie(
            '<I', [(title, (self.entity_dict[dest_title], ))
                   for (title, dest_title) in self.redirects
                   if dest_title in self.entity_dict])
Example no. 15
def _multicut(text: str, custom_dict: Trie = None):
    """
    Return LatticeString objects, one chunk at a time.
    """
    if not custom_dict:
        custom_dict = DEFAULT_DICT_TRIE

    len_text = len(text)
    words_at = defaultdict(list)  # main data structure

    def serialize(p, p2):  # helper function
        for w in words_at[p]:
            p_ = p + len(w)
            if p_ == p2:
                yield w
            elif p_ < p2:
                for path in serialize(p_, p2):
                    yield w + "/" + path

    q = {0}
    last_p = 0  # last position for yield
    while min(q) < len_text:
        p = min(q)
        q -= {p}  # q.pop, but for set

        for w in custom_dict.prefixes(text[p:]):
            words_at[p].append(w)
            q.add(p + len(w))

        if len(q) == 1:
            q0 = min(q)
            yield LatticeString(text[last_p:q0], serialize(last_p, q0))
            last_p = q0

        # the case len(q) == 0 means the remaining text is not in the dict
        if len(q) == 0:
            m = _PAT_ENG.match(text[p:])
            if m:  # English text, digits, whitespace
                i = p + m.span()[1]
            else:  # skip as few characters as possible
                for i in range(p, len_text):
                    ww = custom_dict.prefixes(text[i:])
                    m = _PAT_ENG.match(text[i:])
                    if ww or m:
                        break
                else:
                    i = len_text
            w = text[p:i]
            words_at[p].append(w)
            yield LatticeString(w, in_dict=False)
            last_p = i
            q.add(i)
Example no. 16
def _multicut(text: str, custom_dict: Trie = None):
    """
    Return LatticeString objects, one chunk at a time.
    """
    if not custom_dict:
        custom_dict = DEFAULT_DICT_TRIE

    len_text = len(text)
    words_at = defaultdict(list)  # main data structure

    def serialize(p, p2):  # helper function
        for w in words_at[p]:
            p_ = p + len(w)
            if p_ == p2:
                yield w
            elif p_ < p2:
                for path in serialize(p_, p2):
                    yield w + "/" + path

    q = {0}
    last_p = 0  # last position for yield
    while min(q) < len_text:
        p = min(q)
        q -= {p}  # q.pop, but for set

        for w in custom_dict.prefixes(text[p:]):
            words_at[p].append(w)
            q.add(p + len(w))

        if len(q) == 1:
            q0 = min(q)
            yield LatticeString(text[last_p:q0], serialize(last_p, q0))
            last_p = q0

        # the case len(q) == 0 means the remaining text is not in the dict
        if len(q) == 0:
            m = _PAT_ENG.match(text[p:])
            if m:  # English text, digits, whitespace
                i = p + m.span()[1]
            else:  # skip as few characters as possible
                for i in range(p, len_text):
                    ww = custom_dict.prefixes(text[i:])
                    m = _PAT_ENG.match(text[i:])
                    if ww or m:
                        break
                else:
                    i = len_text
            w = text[p:i]
            words_at[p].append(w)
            yield LatticeString(w, in_dict=False)
            last_p = i
            q.add(i)
Example no. 17
    def __init__(self):
        if not os.path.exists(self.URI_PREFIXES_FN):
            ensurePathExists(self.URI_PREFIXES_FN)
            open(self.URI_PREFIXES_FN, 'w').close()  # create an empty prefixes file

        if not os.path.exists(self.CACHE_SHELVE_FN):
            ensurePathExists(self.CACHE_SHELVE_FN)

        cache = self._openShelve('c')
        cache.close()

        prefixList = [line.strip() for line in open(self.URI_PREFIXES_FN, 'r')]
        self._uriPrefixes = Trie(prefixList)
Example no. 18
    def __init__(self, *args, **kwargs):
        self.trie = Trie()
        self.secondary_trie = set()
        self.book_trie = Trie()
        path = os.path.dirname(os.path.realpath(__file__)) + "/"
        if 'save' in kwargs:
            self.trie.load(kwargs['save'])
        else:
            with open(kwargs['words'], encoding='utf-8') as fp:
                keys = fp.read().splitlines()
                self.trie = Trie(keys)

        self.preprocess(kwargs['alphabet'])
Example no. 19
def onecut(text, data=['']):
    if (data != ['']):
        trie = Trie(data)
    else:
        trie = THAI_WORDS
    graph = defaultdict(list)  # main data structure
    allow_pos = tcc_pos(text)  # cut positions must align with TCC boundaries

    q = [0]  # min-heap queue
    last_p = 0  # last position for yield
    while q[0] < len(text):
        p = heappop(q)

        for w in trie.prefixes(text[p:]):
            p_ = p + len(w)
            if p_ in allow_pos:  # keep only positions consistent with TCC
                graph[p].append(p_)
                if p_ not in q:
                    heappush(q, p_)

        # the case length 1 means no ambiguity remains; earlier results can be yielded
        if len(q) == 1:
            pp = next(bfs_paths_graph(graph, last_p, q[0]))
            # initially last_p == pp[0]
            for p in pp[1:]:
                yield text[last_p:p]
                last_p = p
            # finally last_p == q[0]

        # the case length 0 means the remaining text is not in the dict
        if len(q) == 0:
            m = pat_eng.match(text[p:])
            if m:  # English text, digits, whitespace
                i = p + m.end()
            else:  # skip as few characters as possible
                for i in range(p + 1, len(text)):
                    if i in allow_pos:  # also respect TCC
                        ww = [
                            w for w in trie.prefixes(text[i:])
                            if (i + len(w) in allow_pos)
                        ]
                        m = pat_eng.match(text[i:])
                        if ww or m:
                            break
                else:
                    i = len(text)
            w = text[p:i]
            graph[p].append(i)
            yield w
            last_p = i
            heappush(q, i)
Example no. 20
def add_terms():


    for ngram in range(1,3):
        # update vocabulary trie
        # this messes up the ids but I don't use them anymore because I don't use the doc-term matrices anymore
        start = time.time()
        vocabulary = load_vocabulary_trie(ngram)
        keys = vocabulary.keys() + ADDED_TOKENS[ngram]
        vocabulary_new = Trie(keys)
        vocabulary_new.save(PATH_TOKENIZED + 'tries/full_vocabulary_{}_grams.trie'.format(ngram))

        full_db_to_tokens(ngram, add_new_terms=set(ADDED_TOKENS[ngram]))
        print("adding new tokens for {}-gram took {}.".format(ngram, time.time() - start))
Example no. 21
def _onecut(text: str, custom_dict: Trie):
    graph = defaultdict(list)  # main data structure
    allow_pos = tcc_pos(text)  # cut positions must be aligned with TCC boundaries

    q = [0]  # min-heap queue
    last_p = 0  # last position for yield
    while q[0] < len(text):
        p = heappop(q)

        for w in custom_dict.prefixes(text[p:]):
            p_ = p + len(w)
            if p_ in allow_pos:  # keep only positions consistent with TCC
                graph[p].append(p_)
                if p_ not in q:
                    heappush(q, p_)

        # the case length 1 means no ambiguity remains; earlier results can be yielded
        if len(q) == 1:
            pp = next(_bfs_paths_graph(graph, last_p, q[0]))
            # initially last_p == pp[0]
            for p in pp[1:]:
                yield text[last_p:p]
                last_p = p
            # finally last_p == q[0]

        # the case length 0 means the remaining text is not in the dict
        if len(q) == 0:
            m = _PAT_ENG.match(text[p:])
            if m:  # English text, digits, whitespace
                i = p + m.end()
            else:  # skip as few characters as possible
                for i in range(p + 1, len(text)):
                    if i in allow_pos:  # also respect TCC
                        ww = [
                            w
                            for w in custom_dict.prefixes(text[i:])
                            if (i + len(w) in allow_pos)
                        ]
                        ww = [w for w in ww if not _PAT_TWOCHARS.match(w)]
                        m = _PAT_ENG.match(text[i:])
                        if ww or m:
                            break
                else:
                    i = len(text)
            w = text[p:i]
            graph[p].append(i)
            yield w
            last_p = i
            heappush(q, i)
Example no. 22
    def load(in_file, mmap=True):
        word_dict = Trie()
        entity_dict = Trie()

        word_dict.mmap(in_file + '_word.trie')
        entity_dict.mmap(in_file + '_entity.trie')

        return Vocab(word_dict, entity_dict)
Example no. 23
def rebuild_database() -> None:
    """Rebuild the search database."""
    global database
    LOGGER.info('Updating search database...')
    # Clear and reset.
    word_to_ids.clear()

    for item in UI.item_list.values():
        for subtype_ind in item.visual_subtypes:
            for tag in item.get_tags(subtype_ind):
                for word in tag.split():
                    word_to_ids[word.casefold()].add((item.id, subtype_ind))
    database = Trie(word_to_ids.keys())
    LOGGER.debug('Tags: {}', database.keys())
    _type_cback()
Example no. 24
    def build(description_db,
              entity_db,
              white_list,
              start_index,
              min_inlink_count,
              target_vocab=None):
        counter = Counter()
        db_titles = set()

        for (title, _, titles) in description_db.iterator():
            if target_vocab is not None and title not in target_vocab:
                continue

            counter.update(titles)
            db_titles.add(title)

        title_list = [
            t for (t, c) in counter.items() if c >= min_inlink_count
        ]

        white_list = [entity_db.resolve_redirect(t) for t in white_list]
        white_list = [t for t in white_list if t in db_titles]

        title_list = set(title_list + white_list)

        return EntityVocab(Trie(title_list), start_index)
Example no. 25
class WordList(object):
    def __init__(self, word_file):
        # TODO: Check input file exists, is readable, valid, etc
        words = []
        with open(word_file) as input_file:
            for word in input_file:
                words.append(word.lower().strip())
        self.trie = Trie(words)

    def contains_word(self, word):
        """
        Check whether a word exists in the list.
        
        :param word: An ASCII, lowercase string to check for.
        :return: True if the word is in the word list, false if it is not.
        """
        # TODO: Raise errors if the word is None, isn't ASCII or lowercase, etc
        return word in self.trie

    def contains_prefix(self, prefix):
        """
        Check list for words that begin with the supplied prefix
        
        :param prefix: An ASCII, lowercase string to check as a prefix
        :return: True if this key is a prefix for some other word or words in 
        the list. Note that this method will return False if the word is in the
        list but is not a prefix of any other word.
        """
        # TODO: Raise errors if prefix is None, isn't ASCII or lowercase, etc
        return len(self.trie.keys(prefix)) > 1
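
A hypothetical usage of WordList, writing a tiny word file first so the sketch is self-contained (the file name and words are made up):

with open("words.txt", "w") as f:
    f.write("Apple\nApplet\nBanana\n")

wl = WordList("words.txt")
print(wl.contains_word("apple"))    # True: words are lower-cased and stripped on load
print(wl.contains_prefix("app"))    # True: "app" prefixes more than one key in the trie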
Example no. 26
    def build(dump_file, pool_size, chunk_size):
        dump_reader = WikiDumpReader(dump_file)

        global _extractor
        _extractor = WikiExtractor()

        titles = []
        redirects = {}
        title_counter = Counter()

        with closing(Pool(pool_size)) as pool:
            for (page, links) in pool.imap_unordered(_process_page,
                                                     dump_reader,
                                                     chunksize=chunk_size):
                titles.append(normalize(page.title))
                if page.is_redirect:
                    redirects[normalize(page.title)] = page.redirect

                for link_obj in links:
                    title_counter[normalize(link_obj.title)] += 1

        title_dict = Trie(titles)

        redirect_items = []
        for (title, dest_title) in redirects.items():
            if dest_title in title_dict:
                redirect_items.append((title, (title_dict[dest_title], )))

        redirect_dict = RecordTrie('<I', redirect_items)

        # iterate over a snapshot of the keys because entries are deleted inside the loop
        for title in list(title_counter.keys()):
            count = title_counter[title]
            dest_obj = redirect_dict.get(title)
            if dest_obj is not None:
                title_counter[title_dict.restore_key(dest_obj[0][0])] += count
                del title_counter[title]

        inlink_arr = np.zeros(len(title_dict), dtype=np.int64)  # np.int was removed in NumPy 1.24
        for (title, count) in title_counter.items():
            title_index = title_dict.get(title)
            if title_index is not None:
                inlink_arr[title_index] = count

        return EntityDB(title_dict, redirect_dict, inlink_arr)
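
A standalone sketch of the Trie + RecordTrie('<I', ...) pattern used above, where the record trie maps a redirect title to the index of its destination title in the main Trie (the titles are made up):

from marisa_trie import Trie, RecordTrie

title_dict = Trie(["Python (programming language)", "Monty Python"])
redirects = {"Python": "Python (programming language)"}

redirect_dict = RecordTrie(
    '<I',
    [(src, (title_dict[dst],)) for (src, dst) in redirects.items() if dst in title_dict],
)

record = redirect_dict.get("Python")          # a list of packed records, e.g. [(index,)]
print(title_dict.restore_key(record[0][0]))   # -> Python (programming language)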
Example no. 27
def generate_word_square(n: int, letters: str) -> list:
    assert n > 0, "Invalid square"
    words = get_anagrams(n, letters)
    # Trie - https://en.wikipedia.org/wiki/Trie
    t = Trie(words)
    result = recurse_generate([], t, n, 0)
    print(result)
    return result
Example no. 28
    def __init__(self, custom_dict=None):
        """
        Initialize tokenizer object

        :param custom_dict: a file path or a list of vocabularies used to create a trie (default - original lexitron)

        :return: trie_dict - a trie to be used by the tokenizing engines
        """
        if custom_dict:
            if type(custom_dict) is list:
                self.trie_dict = Trie(custom_dict)
            elif type(custom_dict) is str:
                with codecs.open(custom_dict, "r", encoding="utf8") as f:
                    vocabs = f.read().splitlines()
                self.trie_dict = Trie(vocabs)
        else:
            self.trie_dict = Trie(word_dict())
Example no. 29
def tcut(text):
    #global last_p, i, q, ww   # for debug
    trie = Trie(get_data())
    words_at = defaultdict(list)  # main data structure

    def serialize(p, p2):  # helper function
        for w in words_at[p]:
            p_ = p + len(w)
            if p_ == p2:
                yield w
            elif p_ < p2:
                for path in serialize(p_, p2):
                    yield w + '/' + path

    q = {0}
    last_p = 0  # last position for yield
    while min(q) < len(text):
        p = min(q)
        q -= {p}  # q.pop, but for set

        for w in trie.prefixes(text[p:]):
            words_at[p].append(w)
            q.add(p + len(w))

        if len(q) == 1:
            q0 = min(q)
            yield LatticeString(text[last_p:q0], serialize(last_p, q0))
            last_p = q0

        # the case len(q) == 0 means the remaining text is not in the dict
        if len(q) == 0:
            # skip as few characters as possible
            for i in range(p, len(text)):
                ww = trie.prefixes(text[i:])
                if ww:
                    break
            else:
                i = len(text)
            w = text[p:i]
            w = w.replace(' ', '')  # remove blank spaces
            words_at[p].append(w)
            yield LatticeString(w, in_dict=False)
            last_p = i
            q.add(i)
Example no. 30
	def __init__(self, custom_dict=None):
		"""
		Initialize tokenizer object
		
		Keyword arguments:
		custom_dict -- a file path or a list of vocabularies to be used to create a trie (default - original lexitron)

		Object variables:
		trie_dict -- a trie to use in tokenizing engines
		"""
		if custom_dict:
			if type(custom_dict) is list:
				self.trie_dict = Trie(custom_dict)
			elif type(custom_dict) is str:
				with codecs.open(custom_dict, 'r',encoding='utf8') as f:
					vocabs = f.read().splitlines()
				self.trie_dict = Trie(vocabs)
		else:
			self.trie_dict = Trie(get_dict())
Example no. 31
    def build(dump_file, pool_size, chunk_size):
        dump_reader = WikiDumpReader(dump_file)

        global _extractor
        _extractor = WikiExtractor()

        titles = []
        redirects = {}
        title_counter = Counter()

        with closing(Pool(pool_size)) as pool:
            for (page, links) in pool.imap_unordered(
                _process_page, dump_reader, chunksize=chunk_size
            ):
                titles.append(page.title)
                if page.is_redirect:
                    redirects[page.title] = page.redirect

                for link_obj in links:
                    title_counter[link_obj.title] += 1

        title_dict = Trie(titles)

        redirect_items = []
        for (title, dest_title) in redirects.items():
            if dest_title in title_dict:
                redirect_items.append((title, (title_dict[dest_title],)))

        redirect_dict = RecordTrie('<I', redirect_items)

        # iterate over a snapshot because entries are deleted inside the loop
        for (title, count) in list(title_counter.items()):
            dest_obj = redirect_dict.get(title)
            if dest_obj is not None:
                title_counter[title_dict.restore_key(dest_obj[0][0])] += count
                del title_counter[title]

        inlink_arr = np.zeros(len(title_dict), dtype=np.int64)  # np.int was removed in NumPy 1.24
        for (title, count) in title_counter.items():
            title_index = title_dict.get(title)
            if title_index is not None:
                inlink_arr[title_index] = count

        return EntityDB(title_dict, redirect_dict, inlink_arr)
Example no. 32
def create_custom_dict_trie(custom_dict_source):
    """The function is used to create a custom dict trie which will be used for word_tokenize() function. For more information on the trie data structure, see: https://marisa-trie.readthedocs.io/en/latest/index.html

    :param string/list custom_dict_source: a list of vocabularies or a path to the source file

    :return: A trie created from custom dict input
    """

    if type(custom_dict_source) is str:
        # Receive a file path of the custom dict to read
        with codecs.open(custom_dict_source, "r", encoding="utf8") as f:
            _vocabs = f.read().splitlines()
            return Trie(_vocabs)
    elif isinstance(custom_dict_source, (list, tuple, set)):
        # Received a sequence type object of vocabs
        return Trie(custom_dict_source)
    else:
        raise TypeError(
            "Type of custom_dict_source must be either str (path to source file) or collections"
        )
Example no. 33
    def __init__(self,
                 max_len,
                 min_freq,
                 min_pmi,
                 min_entropy,
                 cut=True,
                 tokenizer='jieba',
                 norm_pmi=False):
        self.max_len = max_len
        self.min_freq = min_freq
        self.min_pmi = min_pmi
        self.min_entropy = min_entropy
        self.cut = cut
        self.norm_pmi = norm_pmi

        # Initialize dictionary to build trie
        self.trie = defaultdict(int)
        self.rev_trie = defaultdict(int)
        self.len = 0

        # Build existing dictionary based on trie structure
        sistring = set()
        if 'jieba_dict_path' in config['DEFAULT'] and os.path.isfile(
                config['DEFAULT']['jieba_dict_path']):
            sistring = get_sistring(config['DEFAULT']['jieba_dict_path'])
        if 'user_dict_path' in config['DEFAULT'] and os.path.isfile(
                config['DEFAULT']['user_dict_path']):
            sistring = get_sistring(config['DEFAULT']['user_dict_path'],
                                    sistring)
        self.dict = Trie(sistring)
        # Get blacklist
        self.blacklist = set()
        if 'blacklist_path' in config['DEFAULT'] and os.path.isfile(
                config['DEFAULT']['blacklist_path']):
            self.blacklist = get_dict(config['DEFAULT']['blacklist_path'])

        if cut:
            if tokenizer == 'jieba':
                self.tokenizer = Jieba()
            else:
                raise ValueError(f'Unknown tokenizer {tokenizer}')
Example no. 34
def test():
    # 1. build a trie
    d = dict(zero=0, one=1, two=2, three=3, four=4, five=5, six=6, seven=7,
             eight=8, nine=9, ten=10, eleven=11, twelve=12, thirteen=13,
             fourteen=10, fifteen=15, sixteen=16, seventeen=17, eighteen=18,
             nineteen=19, twenty=20, thirty=30, fourty=40, fifty=50,
             sixty=60, seventy=70, eighty=80, ninety=90,
             hundred=100)
    t = Trie(list(d.keys()))

    # 2. scan 2000 "sentences" with it
    for _ in range(1000):
        # scanning for the longest matches only in sentence 1
        i = S1[0]
        #print(TEXT[i:S1[1]])
        while i < S1[1]:
            pfx = list(t.prefixes(TEXT[i:S1[1]]))
            if pfx:
                k = pfx[-1]
                #print(d[k])
                i += len(k)
            else:
                i += 1

        # scanning for all matches in sentence 2
        i = S2[0]
        #print(TEXT[i:S2[1]])
        s = 0
        while i < S2[1]:
            for k in t.prefixes(TEXT[i:S2[1]]):
                #print(k)
                s += d[k]
            i += 1
        if s != 142:
            raise RuntimeError(str(s))

    # 3. make a real list of all keys in the trie
    if 'nine' not in list(t.iterkeys()):
        raise RuntimeError(str(list(t.iterkeys())))
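
A standalone sketch of the longest-prefix scanning idea exercised above (TEXT, S1 and S2 in the example are globals defined elsewhere); like the benchmark, it assumes prefixes() returns matches ordered from shortest to longest, so [-1] is the longest one:

from marisa_trie import Trie

t = Trie(["one", "two", "twenty", "three"])
text = "twentythreeone"

i, tokens = 0, []
while i < len(text):
    matches = t.prefixes(text[i:])   # every key that is a prefix of the remaining text
    if matches:
        tokens.append(matches[-1])   # take the longest match
        i += len(matches[-1])
    else:
        i += 1

print(tokens)   # ['twenty', 'three', 'one']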
Example no. 35
def train(corpus_file, mode, dim_size, window, min_count, negative, epoch,
          workers):
    with bz2.BZ2File(corpus_file) as f:
        sentences = LineSentence(f)
        sg = int(mode == 'sg')

        model = Word2Vec(sentences,
                         size=dim_size,
                         window=window,
                         min_count=min_count,
                         workers=workers,
                         iter=epoch,
                         negative=negative,
                         sg=sg)

    words = []
    entities = []
    for (w, _) in model.vocab.iteritems():
        if w.startswith(MARKER):
            entities.append(w[len(MARKER):].replace(u'_', u' '))
        else:
            words.append(w)

    word_vocab = WordVocab(Trie(words), lowercase=True)
    entity_vocab = EntityVocab(Trie(entities))

    word_embedding = np.zeros((len(words), dim_size), dtype=np.float32)
    entity_embedding = np.zeros((len(entities), dim_size), dtype=np.float32)
    for word in words:
        ind = word_vocab.get_index(word)
        if ind is not None:
            word_embedding[ind] = model[word]

    for entity in entities:
        entity_embedding[entity_vocab.get_index(entity)] = model[
            MARKER + entity.replace(u' ', u'_')]

    return EmbeddingReader(word_embedding, entity_embedding, word_vocab,
                           entity_vocab)
Example no. 36
    def load(in_file, mmap=True):
        word_dict = Trie()
        entity_dict = Trie()

        word_dict.mmap(in_file + '_word.trie')
        entity_dict.mmap(in_file + '_entity.trie')

        return Vocab(word_dict, entity_dict)
Example no. 37
class TrieNameDB(NameDB):

  def __init__(self, pair_gen):
    self._dic = self._construct_dic(pair_gen)
    self._index = Trie(self._dic.keys())

  def _construct_dic(self, pair_gen):
    dic = collections.defaultdict(list)
    for k, v in pair_gen:
      dic[k.lower()].append((k, v))
    return dic

  def find_by_prefix(self, str, limit=50):
    result = []
    for key in self._index.iterkeys(str.lower()):
      result.extend(self._dic[key])
      if limit <= len(result):
        break
    return result[:limit]
Example no. 38
 def __init__(self, pair_gen):
   self._dic = self._construct_dic(pair_gen)
   self._index = Trie(self._dic.keys())
Example no. 39
import os
import sys
from os import listdir, makedirs
from os.path import join, exists
from marisa_trie import Trie

if __name__ == '__main__':
	assert len(sys.argv) == 2
	source_dir = sys.argv[1]
	if source_dir.endswith("/"):
		source_dir = source_dir[:-1]
	assert exists(source_dir)
	target_dir = source_dir + "_marisa"
	if exists(target_dir):
		os.rmdir(target_dir)
	makedirs(target_dir)
	source_files = listdir(source_dir)


	for filename in source_files:
		print(filename)
		with open(join(source_dir, filename), 'r') as input_file:
			contents = input_file.read()
			
			if filename == 'mappings':
				with open(join(target_dir, 'mappings'), 'w') as output_file:
					# copy source to destination 
					output_file.write(contents)
			else:
				# Trie.write() expects a binary file object
				with open(join(target_dir, filename + ".marisa"), 'wb') as output_file:
					lines = contents.split("\n")
					d = Trie(l for l in lines if len(l) > 0)
					d.write(output_file)