def __contains__(self, word):
    p = self._root
    word = strdecode(word)
    for c in word:
        if c not in p.children:
            return False
        p = p.children[c]
    return p.is_word
def get_attr(self, word):
    """Return the attribute dict of the given word, or None if the word is not found.

    The word should be utf-8 encoded.
    """
    p = self._root
    word = strdecode(word)
    for c in word:
        if c not in p.children:
            return None
        p = p.children[c]
    return p.attr
def __init__(self, word_bank_path, tags):
    if not isinstance(tags, list):
        raise ValueError("'tags' must be a list!")
    self._trie = Trie()
    with open(word_bank_path, 'r') as word_bank:
        for line in word_bank:
            item = strdecode(line).strip().split(' ')
            attr = {}
            for index in range(len(item) - 1):
                attr[tags[index]] = item[index + 1]
            self._trie.add_new_word(item[0], attr)
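# A sketch of the word-bank format the loader above expects (the file name, tag
# names and sample entry here are assumptions, not taken from the project):
# each line is "<word> <value1> <value2> ..." separated by single spaces, and
# the values are mapped positionally onto `tags`.
#
#     # Logic_words.txt might contain a line such as:  不要 NOT
#     thesaurus = Thesaurus("Logic_words.txt", ["Logic"])
#     thesaurus.get_attr(u"不要")   # -> {"Logic": "NOT"}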
def add_new_word(self, new_word, attributes):
    """Add a new word to the trie with the given attribute dict."""
    p = self._root
    new_word = strdecode(new_word)
    for c in new_word:
        if c not in p.children:
            p.children[c] = _Node()
        p = p.children[c]
    p.attr = attributes
    p.is_word = True
    self._length += 1
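# A minimal usage sketch of the trie API defined above (assuming the class is
# named Trie, as the loader suggests; the example word and attributes are
# hypothetical):
#
#     trie = Trie()
#     trie.add_new_word(u"洋葱", {"POS": "n"})
#     u"洋葱" in trie           # -> True, __contains__ walks the child nodes
#     trie.get_attr(u"洋葱")    # -> {"POS": "n"}
#     trie.get_attr(u"大蒜")    # -> None, the word was never added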
def decode(self, m_part):
    # Keep only the shares whose signatures verify against the public key.
    m_ver = []
    for i in m_part:
        if verify(str(i[0][1]), i[1][0], i[1][1], i[1][2],
                  self.bitlen, self.generator, self.prime, self.pub_key):
            m_ver.append(i[0])
        else:
            print("removing from consideration:", i)
    # Fit a degree-(k-1) polynomial through the verified points and round its
    # coefficients to recover the encoded values.
    x = [i for i, j in m_ver]
    y = [j for i, j in m_ver]
    bits = [round(i) for i in polyfit(x, y, self.k - 1)]
    print(bits)
    return strdecode(bits)
def delete_word(self, word):
    """Delete the given word. Return True on success, False if the word is not found."""
    p = self._root
    word = strdecode(word)
    flag = None
    for c in word:
        if c not in p.children:
            return False
        # Remember the deepest node that must be kept: it either ends another
        # word or branches towards other words.
        if p.is_word or len(p.children) > 1:
            flag = p
            character = c
        p = p.children[c]
    if p.children:
        # The word is a prefix of longer entries: just unmark the terminal node.
        p.is_word = False
    elif flag:
        # Prune the suffix that no other word shares.
        flag.delete(character)
    self._length -= 1
    return True
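# Continuing the hypothetical sketch above: deleting a word unmarks the terminal
# node when the word is a prefix of longer entries, and otherwise prunes the
# unshared suffix.
#
#     trie.delete_word(u"洋葱")   # -> True
#     u"洋葱" in trie             # -> False
#     trie.delete_word(u"大蒜")   # -> False, the word is not in the trie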
def segment(opts):
    test_file = opts.test
    model_file = opts.model
    output_file = opts.output
    beam_size = opts.beam_size
    assert test_file != ""
    assert model_file != ""
    assert output_file != ""
    assert beam_size >= 1

    with open(model_file, 'rb') as model:
        params = pickle.load(model)

    output = []
    # decode
    with codecs.open(test_file, encoding='utf-8') as infile:
        for line in infile:
            line = strdecode(line)
            raw_sentence = line.strip()
            words = beam_search(False, raw_sentence, beam_size, params)
            output.append(words)

    # save segmented sentences to output_file
    with codecs.open(output_file, 'w', encoding='utf-8') as outfile:
        for words in output:
            line = ' '.join(words)
            line += '\n'
            outfile.write(line)
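# segment() only relies on an `opts` object exposing .test, .model, .output and
# .beam_size. A hedged sketch of wiring it up with argparse (the flag names and
# defaults here are assumptions, not the project's CLI):
#
#     parser = argparse.ArgumentParser()
#     parser.add_argument('--test', default='')
#     parser.add_argument('--model', default='')
#     parser.add_argument('--output', default='')
#     parser.add_argument('--beam_size', type=int, default=5)
#     segment(parser.parse_args())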
def logic_translate(self, content):
    """Translate the content into a logic expression for the Baidu search engine."""
    content = strdecode(content)
    result_list = []
    token_list = jieba.tokenize(content)
    has_logic = False
    for token in token_list:
        if token[0] in self._logic_thesaurus:
            has_logic = True
            result_list.append({"Type": self._logic_thesaurus.get_attr(token[0])["Logic"],
                                "Content": token[0]})
        else:
            result_list.append({"Type": "Common", "Content": token[0]})
    if not has_logic:
        return {"Type": "general", "Content": content}

    translate_finish = False
    or_list = []
    not_list = []
    and_list = []
    while not translate_finish and len(result_list) > 0:
        for index in range(len(result_list)):
            if result_list[index]["Type"] == "NOT":
                # NOT operator
                if index < len(result_list) - 1 and result_list[index + 1]["Type"] == "Common":
                    # The following statement can be merged, so merge it.
                    not_list.append("-(" + result_list[index + 1]["Content"] + ")")
                    del result_list[index + 1]
                    del result_list[index]
                    break
                else:
                    # Cannot merge; treat the operator as a common word.
                    result_list[index]["Type"] = "Common"
            if result_list[index]["Type"] == "AND":
                # AND operator
                if 0 < index < len(result_list) - 1 and result_list[index + 1]["Type"] == "Common" and \
                        result_list[index - 1]["Type"] == "Common":
                    and_list.append("(" + result_list[index - 1]["Content"] + " " +
                                    result_list[index + 1]["Content"] + ")")
                    del result_list[index + 1]
                    del result_list[index]
                    del result_list[index - 1]
                    break
                else:
                    # Cannot merge; treat the operator as a common word.
                    result_list[index]["Type"] = "Common"
            if result_list[index]["Type"] == "OR":
                # OR operator
                if 0 < index < len(result_list) - 1 and result_list[index + 1]["Type"] == "Common" and \
                        result_list[index - 1]["Type"] == "Common":
                    or_list.append("(" + result_list[index - 1]["Content"] + " | " +
                                   result_list[index + 1]["Content"] + ")")
                    del result_list[index + 1]
                    del result_list[index]
                    del result_list[index - 1]
                    break
                else:
                    # Cannot merge; treat the operator as a common word.
                    result_list[index]["Type"] = "Common"
            if index >= len(result_list) - 1:
                # All logic words have been handled.
                translate_finish = True

    result_content = " ".join([item["Content"] for item in result_list]) + " " + \
                     " ".join(and_list) + " " + " ".join(or_list) + " " + " ".join(not_list)
    if result_content.strip() == "":
        return {"Type": "general", "Content": content}
    return {"Type": "logic", "Content": result_content}
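# Illustrative behaviour of logic_translate (a sketch; the exact logic output
# depends on jieba's segmentation and on the entries in the logic word bank):
#
#     logic_translate(u"太阳是什么")
#     # -> {"Type": "general", "Content": u"太阳是什么"}   no logic word found
#     logic_translate(u"不要辣椒和大蒜或者洋葱的川菜")
#     # -> {"Type": "logic", "Content": "..."}   remaining common words, followed
#     #    by the "(a b)" AND groups, "(a | b)" OR groups and "-(a)" NOT groups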
if __name__ == "__main__":
    my_tokenizer = TokenUtil("dict.txt.big")
    my_tokenizer.init_classification('construction_dict.txt')
    print(len(my_tokenizer._customized_thesaurus))

    content = "普通墙与异形墙分别是什么"  # "What are ordinary walls and special-shaped walls?"
    answer = my_tokenizer.get_keyword(content)
    print(answer['Type'])
    for atoken in answer['Token']:
        print(strdecode(atoken))

    content = "太阳是什么"  # "What is the sun?"
    answer = my_tokenizer.get_keyword(content)
    print(answer['Type'])
    print(answer['Token'])

    my_tokenizer.init_logic_translator("Logic_words.txt")
    content = "不要辣椒和大蒜或者洋葱的川菜"  # "Sichuan dishes without chili and garlic, or onion"
    result = my_tokenizer.logic_translate(content)
    print(result["Type"])
    print(result["Content"])

    content = "太阳是什么"  # "What is the sun?"
    result = my_tokenizer.logic_translate(content)
    print(result["Type"])
    print(result["Content"])

    content = "太阳或许是假的"  # "The sun might be fake"
    result = my_tokenizer.logic_translate(content)