Beispiel #1
0
    def __init__(self, words_path=None):
        """Create the instance and fill its trie.

        Args:
            words_path: Optional path forwarded to ``load_word_tree``. When
                it is falsy, ``build_word_tree`` is called instead.
        """
        self.tree = TrieTree()

        if not words_path:
            self.build_word_tree()
            return

        self.load_word_tree(words_path)
Beispiel #2
0
    def _prepare(self):
        """Load the phrase and character pinyin tables and index them in a trie."""
        self.pinyin_phrase = pinyin_phrase_loader()
        self.pinyin_char = pinyin_char_loader()

        # Build the trie: phrases first, then single characters, each
        # registered under its own type label.
        self.trie_tree_obj = TrieTree()
        for table, label in ((self.pinyin_phrase, 'phrase'),
                             (self.pinyin_char, 'char')):
            self.trie_tree_obj.build_trie_tree(table, label)
Beispiel #3
0
def build_trie_with_routes(routes_and_prices):
    """Add every route and its price to a new ``TrieTree``.

    Args:
        routes_and_prices: Iterable of indexable pairs. Item 0 is the route
            and item 1 is the price, which is converted with ``float()``.

    Returns:
        The ``TrieTree`` after all pairs have been added.
    """
    trie = TrieTree()
    for entry in routes_and_prices:
        trie.add(entry[0], float(entry[1]))
    return trie
Beispiel #4
0
class Segment(object):
    """Greedy longest-prefix word segmenter backed by a ``TrieTree``."""

    def __init__(self, words_path=None):
        """Create the segmenter and fill its dictionary trie.

        Args:
            words_path: Optional path forwarded to ``load_word_tree``. When
                it is falsy, ``build_word_tree`` is called instead.
        """
        self.tree = TrieTree()

        if words_path:
            self.load_word_tree(words_path)
        else:
            self.build_word_tree()

    def load_word_tree(self, path):
        """Load the dictionary trie from ``path`` via ``TrieTree.load``."""
        self.tree.load(path)

    def build_word_tree(self):
        """Hook for building a default dictionary; does nothing here."""
        pass

    def segment(self, sentence, seg_len=4):
        """Yield the words of ``sentence`` one at a time.

        Text matching ``PUNCTUATION`` is removed first. Every ``EN`` match is
        replaced by a ``'*'`` placeholder so that it is emitted as a single
        token, and the original match is restored when the placeholder is
        reached.

        Args:
            sentence: Text to segment.
            seg_len: Maximum number of characters considered for one word.

        Yields:
            The next segmented word. Nothing is yielded when no text is left
            after punctuation removal.

        Raises:
            ValueError: If ``seg_len`` is smaller than 1. Because this is a
                generator, the error surfaces on the first iteration.
        """
        if seg_len < 1:
            # A window of zero characters can never consume any input.
            raise ValueError('seg_len must be at least 1')

        sentence = re.sub(PUNCTUATION, '', sentence)
        en = re.findall(EN, sentence)
        if en:
            sentence = re.sub(EN, '*', sentence)

        # NOTE(review): a literal '*' that survives PUNCTUATION removal is
        # indistinguishable from a placeholder - confirm PUNCTUATION strips it.
        # Looping on the remaining text (instead of `while True`) means an
        # empty or punctuation-only sentence simply yields nothing.
        while sentence:
            seg_words, position = self.part_segment(sentence[:seg_len], en)
            if seg_words == '*':
                # Placeholders appear in the same order as the EN matches.
                seg_words = en.pop(0)
            yield seg_words

            sentence = sentence[position:]

    def part_segment(self, part_sentence, en):
        """Return the longest dictionary prefix of ``part_sentence``.

        The candidate is shortened from the right, one character at a time,
        until the trie reports it as a word or at most one character is left.
        A single character (including the ``'*'`` placeholder) is therefore
        always returned unchanged.

        Args:
            part_sentence: Candidate text, normally at most ``seg_len`` long.
            en: Unused here; kept so existing callers keep working.

        Returns:
            A ``(word, length)`` tuple where ``length == len(word)``.
        """
        position = len(part_sentence)
        # `<= 1` rather than `== 1`: an empty candidate must stop the loop
        # too, otherwise ''[:-1] stays '' and the loop never ends.
        while position > 1 and not self.tree.is_has_word(part_sentence):
            position -= 1
            part_sentence = part_sentence[:position]

        return part_sentence, position
Beispiel #5
0
class TagMake(object):
    """Keeps tags in a ``TrieTree`` and finds them inside a line of text."""

    def __init__(self):
        self.tree = TrieTree()

    def add_tag(self, tag):
        """Insert a single tag into the trie."""
        self.tree.insert(tag)

    def add_tag_file(self, filename, func=lambda x: x):
        """Insert one tag per non-blank line of ``filename``.

        Args:
            filename: Path of the file to read.
            func: Applied to each stripped line before it is inserted; the
                default leaves the line unchanged.
        """
        with open(filename) as handle:
            for raw in handle:
                stripped = raw.strip()
                if not stripped:
                    continue
                self.tree.insert(func(stripped))

    def make(self, line):
        """Find a tag match at every offset of ``line``.

        Returns:
            A list of ``(matched_text, offset, length)`` tuples, one for each
            offset where ``findMax`` reports a positive length.
        """
        found = []
        for offset, _ in enumerate(line):
            suffix = line[offset:]
            matched = self.tree.findMax(suffix)
            if matched > 0:
                found.append((suffix[:matched], offset, matched))
        return found
Beispiel #6
0
class TagMake(object):
    """Tag finder: tags are stored in a ``TrieTree`` and matched against text."""

    def __init__(self):
        self.tree = TrieTree()

    def add_tag(self, tag):
        """Register ``tag`` in the trie."""
        self.tree.insert(tag)

    def add_tag_file(self, filename, func=lambda x: x):
        """Register every non-blank line of ``filename`` as a tag.

        Each line is stripped, and ``func`` is applied to the stripped text
        before it is inserted.
        """
        with open(filename) as source:
            entries = (row.strip() for row in source)
            for entry in filter(None, entries):
                self.tree.insert(func(entry))

    def make(self, line):
        """Return ``(text, start, length)`` for each offset with a match.

        A match exists at an offset when ``findMax`` returns a positive length
        for the part of ``line`` starting there.
        """
        suffixes = ((pos, line[pos:]) for pos, _ in enumerate(line))
        spans = ((pos, rest, self.tree.findMax(rest)) for pos, rest in suffixes)
        return [(rest[:size], pos, size) for pos, rest, size in spans if size > 0]
Beispiel #7
0
class Pinyin(object):
    """Annotate Chinese characters with their pronunciation."""

    def __init__(self):
        # The trie is built lazily by ``_prepare`` on the first call.
        self.trie_tree_obj = None

    def _prepare(self):
        """Load the phrase and character pinyin tables and index them in a trie."""
        self.pinyin_phrase = pinyin_phrase_loader()
        self.pinyin_char = pinyin_char_loader()

        # Build the trie: phrases first, then single characters, each
        # registered under its own type label.
        self.trie_tree_obj = TrieTree()
        for table, label in ((self.pinyin_phrase, 'phrase'),
                             (self.pinyin_char, 'char')):
            self.trie_tree_obj.build_trie_tree(table, label)

    def __call__(self, text, mode=''):
        """Convert ``text`` to a list of pinyin readings.

        Positions for which the trie reports neither a phrase nor a character
        match are marked with ``'<unk>'``.

        Args:
            text: The text to annotate.
            mode: Accepted but not consulted by this code.

        Returns:
            A list of readings; it is asserted to be as long as ``text``.
        """
        if self.trie_tree_obj is None:
            self._prepare()

        readings = []  # final output
        total = len(text)
        cursor = 0
        while cursor < total:
            # Look ahead at most ``depth`` characters from the cursor.
            window = text[cursor:cursor + self.trie_tree_obj.depth]
            step, typing = self.trie_tree_obj.search(window)
            if typing == 'phrase':
                # A phrase entry supplies several readings at once.
                readings.extend(self.pinyin_phrase[window[:step]])
            elif typing == 'char':
                readings.append(self.pinyin_char[window[:step]])
            else:
                print(step, typing, window[0])
                readings.append('<unk>')
            cursor += step

        assert len(readings) == len(text)
        return readings
Beispiel #8
0
 def __init__(self):
     """Create the instance with a new ``TrieTree`` stored on ``self.tree``."""
     self.tree = TrieTree()
Beispiel #9
0
 def __init__(self):
     """Create the instance with a new ``TrieTree`` stored on ``self.tree``."""
     self.tree = TrieTree()
Beispiel #10
0
def tt_fixture():
    """Return a newly constructed ``TrieTree``.

    The import is done inside the function, so ``trie_tree`` is only loaded
    when the function is called.
    """
    from trie_tree import TrieTree

    return TrieTree()