Example #1
0
 def __init__(self, word_list, window):
     """Set up the word searcher, the window value and the block-char table.

     Args:
         word_list: iterable of words; each one is added to a new
             DictionarySearcher with "0" as the second addKey argument.
         window: stored unchanged on self.window.
     """
     self.dic_search = DictionarySearcher()
     for word in word_list:
         self.dic_search.addKey(word, "0")
     self.window = window

     # Keys are the literals below decoded from UTF-8; every value is 1.
     # NOTE(review): "?" is listed twice, so the second entry is a no-op --
     # possibly a different (mis-encoded) character was intended; confirm.
     self.block_chars_dict = {}
     for raw_char in ("。", "?", "\n", "\r", "?"):
         self.block_chars_dict[raw_char.decode("utf-8")] = 1
Example #2
0
 def initEntityInfo(self, file_name):
     """Load the entity list from file_name and index it for searching.

     Resets self.search_tagmap, self.entity_map and self.dict_search. When
     file_name equals INVALID_FILE they are left empty. Otherwise the result
     of self.ReadKeyValue(file_name) is stored in self.entity_map, every key
     is added to self.dict_search, and its value is stored in
     self.search_tagmap under key + TAG_MAP_DIVIDER + UNI_TAG_POS.

     Args:
         file_name: entity file to read, or INVALID_FILE to skip loading.
     """
     self.search_tagmap = {}
     self.entity_map = {}
     self.dict_search = DictionarySearcher()
     if file_name != INVALID_FILE:
         # Read the entity list.
         self.entity_map = self.ReadKeyValue(file_name)
         entities = self.entity_map
         for entity in entities:
             entity_tag = entities[entity]
             self.dict_search.addKey(entity)
             tagmap_key = entity + TAG_MAP_DIVIDER + UNI_TAG_POS
             self.search_tagmap[tagmap_key] = entity_tag
Example #3
0
 def transTagstringToDict(self, content, tag_string):
     """Parse a tag string and turn it into a dict feature for content.

     tag_string is a space-separated list of entries of the form
     "word:pos_tag,pos_tag,...". Every (word, pos) pair is stored in a tag
     map under word + TAG_MAP_DIVIDER + pos, and every such word is added to
     a fresh DictionarySearcher.

     Empty entries (an empty tag_string or repeated spaces) and empty
     occurrence fields (an entry such as "word:" with nothing after the
     colon) are skipped instead of raising IndexError. Other malformed
     entries (no ":" or no "_") still raise IndexError.

     Args:
         content: text passed through to self.generateDictFeature.
         tag_string: the serialised tags described above.

     Returns:
         Whatever self.generateDictFeature(content, searcher, tag_map, True)
         returns.
     """
     tag_map = {}
     all_search = DictionarySearcher()
     for entry in tag_string.split(" "):
         if not entry:
             # "".split(" ") yields [""], and repeated spaces yield empty
             # tokens; there is nothing to parse in either case.
             continue
         entry_fields = entry.split(":")
         word = entry_fields[0]
         for occurrence in entry_fields[1].split(","):
             if not occurrence:
                 # "word:" carries no labelled occurrence for this word.
                 continue
             occurrence_fields = occurrence.split("_")
             pos = occurrence_fields[0]
             # NOTE(review): only the text between the first and second "_"
             # is kept, so a tag that itself contains "_" is truncated --
             # confirm whether that is intended.
             tag = occurrence_fields[1]
             tag_map[word + TAG_MAP_DIVIDER + pos] = tag
             all_search.addKey(word)
     return self.generateDictFeature(content, all_search, tag_map, True)
Example #4
0
 def transDictToTagstring(self, content, tag_dict):
     """Serialise the labelled chunks of content into a tag string.

     The chunks that self.getChunkFromTag flags are searched for in content.
     Each matched word yields one entry "word:N_tag,N_tag,..." where N is the
     1-based index of the occurrence among all matches of that word, and tag
     is the part after "-" in tag_dict[str(start)] (empty when that value
     does not split into exactly two parts). Entries are joined by single
     spaces. A word with no valid occurrence still yields "word:".

     Args:
         content: text to search.
         tag_dict: tags indexed by str(start offset).

     Returns:
         The tag string; empty when nothing matched.
     """
     # Get the labelled word list.
     labelled_chunks = self.getChunkFromTag(content, tag_dict)
     chunk_searcher = DictionarySearcher()
     for chunk, is_labelled, _chunk_tag in labelled_chunks:
         if is_labelled:
             chunk_searcher.addKey(chunk)

     # Check each occurrence.
     matches, _text_len = chunk_searcher.searchAll(content, "utf-8", True,
                                                   False)
     word_entries = []
     for matched_word in matches:
         matched_len = len(matched_word)
         entry_prefix = "%s:" % (matched_word.encode("utf-8"))
         occurrence_fields = []
         # The occurrence number advances for every match, including the
         # ones that turn out not to be valid chunks.
         for occurrence_no, start in enumerate(matches[matched_word], 1):
             if not self.isValidChunk(tag_dict, int(start), matched_len):
                 continue
             tag_parts = tag_dict[str(start)].split("-")
             tag = tag_parts[1] if len(tag_parts) == 2 else ""
             occurrence_fields.append(str(occurrence_no) + "_" + tag)
         word_entries.append(entry_prefix + ",".join(occurrence_fields))
     return " ".join(word_entries)
Example #5
0
 def initPatternInfo(self, pattern_file, filter_file):
     """Load the pattern list and build one filter searcher per tag.

     self.pattern_list is the result of self.ReadPairWord(pattern_file), or
     an empty list when pattern_file equals INVALID_FILE.

     self.filter_search maps each tag found by self.ReadKeyValue(filter_file)
     (other than self.common_tag) to a DictionarySearcher holding that tag's
     words. Words tagged with self.common_tag are added to every searcher.
     It stays empty when filter_file equals INVALID_FILE.

     Args:
         pattern_file: pattern file to read, or INVALID_FILE.
         filter_file: filter file to read, or INVALID_FILE.

     Returns:
         The tuple (self.pattern_list, self.filter_search).
     """
     # Prepare the pattern list.
     self.pattern_list = []
     if pattern_file != INVALID_FILE:
         self.pattern_list = self.ReadPairWord(pattern_file)

     # Prepare the filter searchers.
     self.filter_search = {}
     searchers = self.filter_search
     tag_by_word = (self.ReadKeyValue(filter_file)
                    if filter_file != INVALID_FILE else {})

     # First pass: create every searcher, so that common-tag words can be
     # added to all of them in the second pass.
     for filter_word in tag_by_word:
         word_tag = tag_by_word[filter_word]
         if word_tag == self.common_tag or word_tag in searchers:
             continue
         searchers[word_tag] = DictionarySearcher()

     # Second pass: add the words.
     for filter_word in tag_by_word:
         word_tag = tag_by_word[filter_word]
         if word_tag != self.common_tag:
             searchers[word_tag].addKey(filter_word)
             continue
         for searcher_tag in searchers:
             searchers[searcher_tag].addKey(filter_word)
     return (self.pattern_list, self.filter_search)