def __init__(self, words_path=None):
    """Create the word trie and fill it.

    Args:
        words_path: Optional path handed to ``load_word_tree``. When it is
            falsy (``None`` or empty), ``build_word_tree`` is called instead.
    """
    self.tree = TrieTree()
    if not words_path:
        self.build_word_tree()
    else:
        self.load_word_tree(words_path)
def _prepare(self):
    """Load the pinyin tables and index both of them in a new trie.

    Sets ``self.pinyin_phrase``, ``self.pinyin_char`` and
    ``self.trie_tree_obj``.
    """
    self.pinyin_phrase = pinyin_phrase_loader()
    self.pinyin_char = pinyin_char_loader()

    # Load the trie tree: phrases are registered first, then single chars.
    trie = self.trie_tree_obj = TrieTree()
    for table, kind in ((self.pinyin_phrase, 'phrase'),
                        (self.pinyin_char, 'char')):
        trie.build_trie_tree(table, kind)
def build_trie_with_routes(routes_and_prices):
    """Build a trie from an iterable of route/price pairs.

    Args:
        routes_and_prices: Iterable of indexable items where index 0 is the
            route and index 1 is the price (anything ``float`` accepts).

    Returns:
        A ``TrieTree`` on which ``add(route, price)`` was called once per
        item, with the price converted to ``float``.
    """
    trie = TrieTree()
    for entry in routes_and_prices:
        trie.add(entry[0], float(entry[1]))
    return trie
class Segment(object):
    """Greedy longest-match word segmenter backed by a ``TrieTree``."""

    def __init__(self, words_path=None):
        """Create the word trie.

        Args:
            words_path: Optional path passed to ``load_word_tree``. When it
                is falsy, ``build_word_tree`` is called instead.
        """
        self.tree = TrieTree()
        if words_path:
            self.load_word_tree(words_path)
        else:
            self.build_word_tree()

    def load_word_tree(self, path):
        """Populate the trie by delegating to ``self.tree.load(path)``."""
        self.tree.load(path)

    def build_word_tree(self):
        """Placeholder: currently does nothing and leaves the trie empty."""
        pass

    def segment(self, sentence, seg_len=4):
        """Yield the segments of ``sentence`` from left to right.

        Text matching ``PUNCTUATION`` is removed first. Every match of ``EN``
        is replaced by a single ``'*'`` placeholder so it is treated as one
        unit; the original matched text is yielded when the placeholder is
        reached.

        Args:
            sentence: Text to segment.
            seg_len: Maximum number of characters tried for one segment.

        Yields:
            One segment at a time. Nothing is yielded when the sentence is
            empty after punctuation removal.

        Raises:
            ValueError: If ``seg_len`` is smaller than 1 (raised when the
                generator is first advanced).
        """
        # A window shorter than one character can never consume input, so the
        # loop below would never terminate.
        if seg_len < 1:
            raise ValueError('seg_len must be at least 1')

        sentence = re.sub(PUNCTUATION, '', sentence)
        en = re.findall(EN, sentence)
        if en:
            sentence = re.sub(EN, '*', sentence)

        # Looping on the remaining text (rather than ``while True``) makes an
        # empty sentence yield nothing instead of spinning forever.
        while sentence:
            seg_words, position = self.part_segment(sentence[:seg_len], en)
            # Only swap the placeholder back while tokens remain; a literal
            # '*' with no pending token is emitted unchanged.
            if seg_words == '*' and en:
                seg_words = en.pop(0)
            yield seg_words
            sentence = sentence[position:]

    def part_segment(self, part_sentence, en):
        """Return the longest usable prefix of ``part_sentence``.

        The candidate is shortened from the right until the trie reports it
        as a word, it is the ``'*'`` placeholder, or at most one character
        is left.

        Args:
            part_sentence: Window of text to match.
            en: Unused; kept so existing callers keep working.

        Returns:
            Tuple ``(prefix, length_of_prefix)``. An empty input returns
            ``('', 0)``.
        """
        position = len(part_sentence)
        # ``position > 1`` also covers the empty string, which previously
        # drove ``position`` negative and never left the loop.
        while (position > 1
               and part_sentence != '*'
               and not self.tree.is_has_word(part_sentence)):
            position -= 1
            part_sentence = part_sentence[:position]
        return part_sentence, position
class TagMake(object):
    """Stores tags in a ``TrieTree`` and locates them inside text."""

    def __init__(self):
        """Start with an empty tag trie."""
        self.tree = TrieTree()

    def add_tag(self, tag):
        """Insert a single tag into the trie."""
        self.tree.insert(tag)

    def add_tag_file(self, filename, func=lambda x: x):
        """Insert one tag per non-blank line of ``filename``.

        Args:
            filename: Path of the text file to read.
            func: Callable applied to each stripped line before insertion;
                defaults to the identity function.
        """
        with open(filename) as handle:
            for raw in handle:
                tag = raw.strip()
                if tag:
                    self.tree.insert(func(tag))

    def make(self, line):
        """Scan ``line`` and report a match for every start offset.

        For each offset the remaining text is passed to ``tree.findMax``
        (presumably the length of the longest stored tag that prefixes the
        text -- confirm against ``TrieTree``).

        Returns:
            List of ``(matched_text, start_index, length)`` tuples, one per
            offset where the reported length is positive.
        """
        matches = []
        for start, _ in enumerate(line):
            tail = line[start:]
            size = self.tree.findMax(tail)
            if size <= 0:
                continue
            matches.append((tail[:size], start, size))
        return matches
class TagMake(object):
    """Keeps a trie of known tags and finds their occurrences in a line."""

    def __init__(self):
        """Create the (initially empty) trie that holds the tags."""
        self.tree = TrieTree()

    def add_tag(self, tag):
        """Register ``tag`` by inserting it into the trie."""
        self.tree.insert(tag)

    def add_tag_file(self, filename, func=lambda x:x):
        """Register every non-blank line of a file as a tag.

        Args:
            filename: Path of the file to read, one tag per line.
            func: Transformation applied to the stripped line before it is
                inserted; the default leaves it unchanged.
        """
        with open(filename) as source:
            for entry in source:
                cleaned = entry.strip()
                if not cleaned:
                    # Blank or whitespace-only lines carry no tag.
                    continue
                self.tree.insert(func(cleaned))

    def make(self, line):
        """Collect tag matches starting at each position of ``line``.

        Returns:
            List of ``(text, index, length)`` tuples for every position
            where ``tree.findMax`` returns a positive length for the
            remainder of the line.
        """
        found = []
        for offset, _ in enumerate(line):
            rest = line[offset:]
            span = self.tree.findMax(rest)
            if span > 0:
                hit = (rest[:span], offset, span)
                found.append(hit)
        return found
class Pinyin(object):
    """Annotate Chinese characters with their pronunciation (pinyin)."""

    def __init__(self):
        # The trie is built lazily on the first call; None means "not yet".
        self.trie_tree_obj = None

    def _prepare(self):
        """Load the phrase and char pinyin tables and index them in a trie."""
        self.pinyin_phrase = pinyin_phrase_loader()
        self.pinyin_char = pinyin_char_loader()

        # Load the trie tree: phrases are registered first, then single chars.
        trie = self.trie_tree_obj = TrieTree()
        for table, kind in ((self.pinyin_phrase, 'phrase'),
                            (self.pinyin_char, 'char')):
            trie.build_trie_tree(table, kind)

    def __call__(self, text, mode=''):
        """Convert Chinese text to pinyin.

        Characters for which no pinyin is found are marked with ``<unk>``.

        Args:
            text: The text to annotate.
            mode: Intended to select an alternative pinyin display scheme;
                it is not read by this method.

        Returns:
            A list with one entry per character of ``text``.
        """
        if self.trie_tree_obj is None:
            self._prepare()

        result = []  # final output
        pos = 0
        total = len(text)
        while pos < total:
            # Look ahead by at most the trie depth from the current position.
            window = text[pos:self.trie_tree_obj.depth + pos]
            step, typing = self.trie_tree_obj.search(window)

            if typing == 'phrase':
                result.extend(self.pinyin_phrase[window[:step]])
            elif typing == 'char':
                result.append(self.pinyin_char[window[:step]])
            else:
                print(step, typing, window[0])
                result.append('<unk>')

            pos += step

        assert len(result) == len(text)
        return result
def __init__(self):
    """Attach a newly constructed ``TrieTree`` as ``self.tree``."""
    self.tree = TrieTree()
def tt_fixture():
    """Return a newly constructed ``TrieTree``.

    The import is done inside the function, so ``trie_tree`` is only loaded
    when the function is actually called.
    """
    from trie_tree import TrieTree

    return TrieTree()