Example #1
0
 def __contains__(self, word):
     """Report whether ``word`` is stored in the trie as a complete word.

     ``word`` is normalised with ``strdecode`` and then followed character
     by character from the root. A missing edge means the word is absent;
     otherwise the answer is the ``is_word`` flag of the node reached.
     """
     node = self._root
     for ch in strdecode(word):
         if ch in node.children:
             node = node.children[ch]
         else:
             return False
     return node.is_word
Example #2
0
 def get_attr(self, word):
     """Return the attributes stored for ``word``, or None if it is not found.

     ``word`` is normalised with ``strdecode`` and then followed character
     by character from the root.

     Returns:
         The ``attr`` value of the node that terminates ``word``, or None
         when the path does not exist or ends on a node that is not marked
         as a word.
     """
     node = self._root
     for char in strdecode(word):
         if char not in node.children:
             return None
         node = node.children[char]
     # Reaching a node is not enough: it may be only a prefix of a longer
     # word, or a word that was unmarked without clearing its attributes.
     # Use the same ``is_word`` test as ``__contains__`` so both agree on
     # what "found" means.
     if not node.is_word:
         return None
     return node.attr
Example #3
0
 def __init__(self, word_bank_path, tags):
     """Build the internal trie from a word bank file.

     Each non-blank line of the file is stripped and split on single
     spaces. The first field is the word; every following field is stored
     as an attribute whose key is the entry of ``tags`` at the same
     position.

     Args:
         word_bank_path: Path of the word bank text file.
         tags: List of attribute names, one per attribute column.

     Raises:
         ValueError: If ``tags`` is not a list.
         IndexError: If a line has more attribute fields than ``tags``.
     """
     if not isinstance(tags, list):
         # The message now names the argument and type actually checked.
         raise ValueError("'tags' must be a list!")
     self._trie = Trie()
     # The context manager closes the file even if a line fails to parse.
     with open(word_bank_path, 'r') as word_bank:
         for line in word_bank:
             item = strdecode(line).strip().split(' ')
             if not item[0]:
                 # Blank line: skip it rather than registering the empty
                 # string as a word.
                 continue
             attr = {}
             for index, value in enumerate(item[1:]):
                 attr[tags[index]] = value
             self._trie.add_new_word(item[0], attr)
Example #4
0
 def add_new_word(self, new_word, attributes):
     """Insert ``new_word`` with ``attributes``, or update an existing word.

     ``new_word`` is normalised with ``strdecode``; missing nodes along its
     path are created. If the word is already stored, its attributes are
     replaced and the word count is left unchanged.

     Args:
         new_word: The word to store.
         attributes: Value saved on the terminal node as ``attr``.
     """
     node = self._root
     for char in strdecode(new_word):
         if char not in node.children:
             node.children[char] = _Node()
         node = node.children[char]
     node.attr = attributes
     # Count each distinct word once: re-adding a stored word only updates
     # its attributes and must not inflate the length.
     if not node.is_word:
         node.is_word = True
         self._length += 1
Example #5
0
    def decode(self, m_part):
        """Rebuild a message from the entries of ``m_part`` that verify.

        Each entry is indexed as ``entry[0]`` (a two-item pair) and
        ``entry[1]`` (three values handed to ``verify`` together with this
        object's ``bitlen``, ``generator``, ``prime`` and ``pub_key``).
        Pairs from verified entries are fitted with ``polyfit`` at degree
        ``self.k - 1``; the rounded coefficients are printed and passed to
        ``strdecode``, whose result is returned.
        """
        verified = []
        for entry in m_part:
            accepted = verify(str(entry[0][1]), entry[1][0], entry[1][1], entry[1][2],
                              self.bitlen, self.generator, self.prime, self.pub_key)
            if not accepted:
                print("removing from consideration:")
                continue
            verified.append(entry[0])

        xs = [first for first, second in verified]
        ys = [second for first, second in verified]

        bits = [round(coefficient) for coefficient in polyfit(xs, ys, self.k - 1)]
        print(bits)
        return strdecode(bits)
Example #6
0
 def delete_word(self, word):
     """Delete ``word`` from the trie.

     ``word`` is normalised with ``strdecode`` before lookup.

     Returns:
         True if the word was stored and has been removed. False if the
         path does not exist or ends on a node that is not marked as a
         word; in that case nothing is modified.
     """
     p = self._root
     word = strdecode(word)
     # ``flag`` is the deepest node on the path that must survive the
     # deletion, and ``character`` is the edge below it to cut. The first
     # iteration always records the root, so a word on an unbranched chain
     # from the root can still be unlinked.
     flag = None
     character = None
     for c in word:
         if c not in p.children:
             return False
         if flag is None or p.is_word or len(p.children) > 1:
             flag = p
             character = c
         p = p.children[c]
     if not p.is_word:
         # The path exists only as a prefix of longer words: not stored.
         return False
     p.is_word = False
     # Drop the attributes so a later lookup cannot see stale data.
     p.attr = None
     if not p.children and flag is not None:
         # No longer word runs through this node, so prune the dead branch.
         # Presumably ``delete`` removes the child keyed by ``character``
         # - confirm against _Node.
         flag.delete(character)
     self._length -= 1
     return True
def segment(opts):
    """Segment every line of a test file and write the results to a file.

    Reads ``opts.test`` as UTF-8, runs ``beam_search`` on each stripped
    line with the parameters unpickled from ``opts.model``, and writes one
    space-joined line of words per input line to ``opts.output``.

    Args:
        opts: Object with ``test``, ``model`` and ``output`` path
            attributes and an integer ``beam_size`` attribute.

    Raises:
        AssertionError: If a path is empty or ``beam_size`` is below 1.
    """
    test_file = opts.test
    model_file = opts.model
    output_file = opts.output
    beam_size = opts.beam_size
    assert (test_file != "")
    assert (model_file != "")
    assert (output_file != "")
    assert (beam_size >= 1)
    # NOTE(security): unpickling can execute arbitrary code; only load
    # model files from a trusted source.
    # Pickle data is binary, so read it in 'rb' mode, and close the handle.
    with open(model_file, 'rb') as model:
        params = pickle.load(model)
    output = []
    # decode
    with codecs.open(test_file, encoding='utf-8') as infile:
        for line in infile:
            raw_sentence = strdecode(line).strip()
            output.append(beam_search(False, raw_sentence, beam_size, params))
    # Save segmented sentences to output_file. The file must be opened for
    # writing (the default mode is read-only), and UTF-8 matches the input
    # encoding. Assumes beam_search returns text strings - TODO confirm.
    with codecs.open(output_file, 'w', encoding='utf-8') as outfile:
        for words in output:
            outfile.write(' '.join(words) + '\n')
Example #8
0
 def logic_translate(self, content):
     """Translate the content to logic expression for Baidu Search Engine.

     The content is normalised with ``strdecode`` and tokenised with
     ``jieba.tokenize``. A token whose text is in ``self._logic_thesaurus``
     takes the thesaurus "Logic" attribute as its type (the code below
     handles "NOT", "AND" and "OR"); every other token is "Common".

     Logic words are then merged with their neighbouring "Common" tokens:
     NOT + next becomes ``-(next)``, prev AND next becomes ``(prev next)``,
     prev OR next becomes ``(prev | next)``. A logic word that cannot be
     merged is downgraded to a "Common" token.

     Returns:
         ``{"Type": "general", "Content": content}`` when no logic word is
         present, otherwise ``{"Type": "logic", "Content": ...}`` holding
         the remaining tokens followed by the AND, OR and NOT groups,
         joined by spaces.
     """
     content = strdecode(content)
     result_list = []
     token_list = jieba.tokenize(content)
     has_logic = False
     # Classify every token as a logic word or a common word.
     for token in token_list:
         if token[0] in self._logic_thesaurus:
             has_logic = True
             result_list.append({"Type": self._logic_thesaurus.get_attr(token[0])["Logic"],
                                 "Content": token[0]})
         else:
             result_list.append({"Type": "Common",
                                 "Content": token[0]})
     if not has_logic:
         return {"Type": "general",
                 "Content": content}
     translate_finish = False
     or_list = []
     not_list = []
     and_list = []
     # Each merge deletes entries from result_list and breaks out of the
     # for loop, so the scan restarts from the beginning with fresh indices.
     while(not translate_finish and len(result_list) > 0):
         for index in range(len(result_list)):
             if result_list[index]["Type"] == "NOT":
                 # NOT logic word
                 if index < len(result_list) - 1 and result_list[index + 1]["Type"] == "Common":
                     # It can be merged with the following term, so merge them
                     not_list.append("-(" + result_list[index + 1]["Content"] + ")")
                     del result_list[index + 1]
                     del result_list[index]
                     break
                 else:
                     # It cannot be merged, so treat the logic word as an ordinary word
                     result_list[index]["Type"] = "Common"
             if result_list[index]["Type"] == "AND":
                 # AND logic word
                 if 0 < index < len(result_list) - 1 and result_list[index + 1]["Type"] == "Common" and\
                     result_list[index - 1]["Type"] == "Common":
                     and_list.append("(" + result_list[index - 1]["Content"] + " " +
                                     result_list[index + 1]["Content"] + ")")
                     # Delete from the highest index down so the lower indices stay valid
                     del result_list[index + 1]
                     del result_list[index]
                     del result_list[index - 1]
                     break
                 else:
                     # It cannot be merged, so treat the logic word as an ordinary word
                     result_list[index]["Type"] = "Common"
             if result_list[index]["Type"] == "OR":
                 # OR connective
                 if 0 < index < len(result_list) - 1 and result_list[index + 1]["Type"] == "Common" and\
                     result_list[index - 1]["Type"] == "Common":
                     or_list.append("(" + result_list[index - 1]["Content"] + " | " +
                                     result_list[index + 1]["Content"] + ")")
                     # Delete from the highest index down so the lower indices stay valid
                     del result_list[index + 1]
                     del result_list[index]
                     del result_list[index - 1]
                     break
                 else:
                     # It cannot be merged, so treat the logic word as an ordinary word
                     result_list[index]["Type"] = "Common"
             if index >= len(result_list) - 1:
                 # All logic words have been processed
                 translate_finish = True
     # Output order: leftover tokens, AND groups, OR groups, NOT groups.
     result_content = " ".join([item["Content"] for item in result_list]) + " " + " ".join(and_list) + " " +\
                      " ".join(or_list) + " " + " ".join(not_list)
     # NOTE(review): result_content always contains the three separator
     # spaces added above, so this comparison with "" looks unreachable.
     if(result_content == ""):
         return {"Type": "general",
                 "Content": content}
     return {"Type": "logic",
             "Content": result_content}
Example #9
0
        if(result_content == ""):
            return {"Type": "general",
                    "Content": content}
        return {"Type": "logic",
                "Content": result_content}


if __name__ == "__main__":
    my_tokenizer = TokenUtil("dict.txt.big")
    my_tokenizer.init_classification('construction_dict.txt')
    print len(my_tokenizer._customized_thesaurus)
    content = "普通墙与异形墙分别是什么"
    answer = my_tokenizer.get_keyword(content)
    print answer['Type']
    for atoken in answer['Token']:
        print strdecode(atoken)
    content = "太阳是什么"
    answer = my_tokenizer.get_keyword(content)
    print answer['Type']
    print answer['Token']
    my_tokenizer.init_logic_translator("Logic_words.txt")
    content = "不要辣椒和大蒜或者洋葱的川菜"
    result = my_tokenizer.logic_translate(content)
    print result["Type"]
    print result["Content"]
    content = "太阳是什么"
    result = my_tokenizer.logic_translate(content)
    print result["Type"]
    print result["Content"]
    content = "太阳或许是假的"
    result = my_tokenizer.logic_translate(content)