Example #1
0
 def transDictToTagstring(self, content, tag_dict):
     """Serialize the labeled chunks found in ``content`` into a tag string.

     Every chunk that ``getChunkFromTag`` flags as labeled is registered as
     a search key, and ``content`` is then searched for all of those keys.
     Each matched key contributes one entry of the form::

         <word>:<n>_<tag>,<n>_<tag>,...

     where ``<n>`` is the 1-based index of the occurrence among all matches
     of that key (occurrences rejected by ``isValidChunk`` still consume an
     index but are not listed), and ``<tag>`` is the part after the ``-`` in
     ``tag_dict[str(start)]`` when that value splits into exactly two
     pieces, otherwise the empty string.  Entries are joined with a single
     space.  A key with no valid occurrence still yields ``<word>:``.

     :param content: text to search; passed unchanged to
         ``getChunkFromTag`` and ``DictionarySearcher.searchAll``.
     :param tag_dict: mapping indexed by ``str(start_offset)`` whose values
         are ``-``-separated tag strings.
     :return: the space-separated tag string, ``""`` when nothing matched.
     """
     # Only chunks flagged as labeled become search keys; the tag element of
     # each triple is not needed at this stage.
     all_search = DictionarySearcher()
     for chunk, flag, _tag in self.getChunkFromTag(content, tag_dict):
         if flag:
             all_search.addKey(chunk)

     # NOTE(review): the meaning of the "utf-8", True, False arguments and of
     # the second returned value is defined by DictionarySearcher, which is
     # not visible here -- confirm against its implementation.
     matched_item, _len_txt = all_search.searchAll(content, "utf-8", True,
                                                   False)

     word_entries = []
     for item in matched_item:
         word_len = len(item)
         # NOTE(review): formatting the encoded value with %s looks like
         # Python 2 code (under Python 3 this renders the bytes repr) --
         # confirm the target interpreter before changing it.
         word = item.encode("utf-8")
         occurrences = []
         # Number every match, valid or not, so the index reflects the
         # position among all occurrences of this key.
         for word_cnt, m_start in enumerate(matched_item[item], 1):
             if not self.isValidChunk(tag_dict, int(m_start), word_len):
                 continue
             tmp_list = tag_dict[str(m_start)].split("-")
             tag = tmp_list[1] if len(tmp_list) == 2 else ""
             occurrences.append(str(word_cnt) + "_" + tag)
         word_entries.append("%s:" % (word) + ",".join(occurrences))
     return " ".join(word_entries)
Example #2
0
 def transDictToTagstring(self, content, tag_dict):
     """Build the tag string for all labeled chunks occurring in ``content``.

     The labeled chunks (those whose flag from ``getChunkFromTag`` is
     truthy) are used as keys for a ``DictionarySearcher`` run over
     ``content``.  For each matched key the result contains
     ``<word>:`` followed by a comma-separated list of ``<n>_<tag>`` items:

     * ``<n>`` is the 1-based position of the occurrence among all matches
       of the key; occurrences that fail ``isValidChunk`` are skipped in the
       output but still advance the counter.
     * ``<tag>`` is the second piece of ``tag_dict[str(start)]`` split on
       ``-`` when the split yields exactly two pieces, else ``""``.

     The per-key entries are separated by one space.

     :param content: text handed to ``getChunkFromTag`` and ``searchAll``.
     :param tag_dict: mapping from ``str(start_offset)`` to a tag string.
     :return: the assembled tag string (empty when there are no matches).
     """
     # Collect the search keys: only chunks marked as labeled are relevant.
     all_search = DictionarySearcher()
     for chunk, flag, _tag in self.getChunkFromTag(content, tag_dict):
         if flag:
             all_search.addKey(chunk)

     # NOTE(review): argument semantics ("utf-8", True, False) and the unused
     # second return value belong to DictionarySearcher -- verify there.
     matched_item, _len_txt = all_search.searchAll(
         content, "utf-8", True, False)

     entries = []
     for item in matched_item:
         word_len = len(item)
         # NOTE(review): presumably Python 2, where the encoded value is a
         # plain str; under Python 3 %s would show the bytes repr -- confirm.
         word = item.encode("utf-8")
         tagged = []
         # enumerate keeps counting across rejected matches on purpose: the
         # index identifies the occurrence among all matches of the key.
         for word_cnt, m_start in enumerate(matched_item[item], 1):
             if self.isValidChunk(tag_dict, int(m_start), word_len):
                 tmp_list = tag_dict[str(m_start)].split("-")
                 tag = tmp_list[1] if len(tmp_list) == 2 else ""
                 tagged.append(str(word_cnt) + "_" + tag)
         entries.append("%s:" % (word) + ",".join(tagged))
     return " ".join(entries)