# -*- coding: utf-8 -*-
# Python 2 module: it relies on str/unicode decode(), xrange, and the print
# statement. DictionarySearcher, LtpHelper, is_chinese, and the CONF_*,
# FEATURE_*, TAG_*, and LTP* constants are assumed to be provided by the
# surrounding package.
import logging
import os
import re


class WindowExtractor:

    def __init__(self, word_list, window):
        self.dic_search = DictionarySearcher()
        for item in word_list:
            self.dic_search.addKey(item, "0")
        self.window = window
        # sentence-boundary characters that stop window expansion
        self.block_chars_dict = {}
        self.block_chars_dict["。".decode("utf-8")] = 1
        self.block_chars_dict["?".decode("utf-8")] = 1
        self.block_chars_dict["\n".decode("utf-8")] = 1
        self.block_chars_dict["\r".decode("utf-8")] = 1

    def setBlockChar(self, char):
        self.block_chars_dict[char.decode("utf-8")] = 1

    def extractPattern(self, s):
        '''Centered on each matched word, expand up to self.window Chinese
        characters to the left and right, then emit every sub-span whose
        two endpoints are both Chinese characters.'''
        pattern_result = []
        s_unicode = s.decode("utf-8")
        content_length = len(s_unicode)
        (result, len_txt) = self.dic_search.maxSearch(s, "utf-8")
        for item in result:
            match_length = len(item)
            for index in result[item][1:]:
                index = int(index)
                (start, end) = self.getWinow(index, content_length,
                                             match_length, s_unicode)
                # generate pattern candidates inside the window
                result_range = {}
                self.generatePattern(index, match_length, start, end,
                                     result_range, s_unicode)
                for span_key in result_range:
                    if result_range[span_key] == 1:
                        (start, end) = span_key.split("-")
                        start = int(start)
                        end = int(end)
                        pattern_result.append(s_unicode[start:end + 1])
        return pattern_result

    def generatePattern(self, index, match_length, start, end, result_dict, content):
        '''Recursive helper: enumerate all sub-window combinations.'''
        # stop once the span has shrunk inside the matched word itself
        if start >= index and end <= index + match_length - 1:
            return
        # emit the current span
        key = "%d-%d" % (start, end)
        if key in result_dict:
            return
        try:
            if is_chinese(content[start]) and is_chinese(content[end]):
                result_dict[key] = 1
            else:
                result_dict[key] = 0
            # recurse leftwards: shrink start to the next Chinese character
            while start < index:
                if is_chinese(content[start + 1]):
                    self.generatePattern(index, match_length, start + 1, end,
                                         result_dict, content)
                    break
                start = start + 1
            # recurse rightwards: shrink end past each Chinese character
            while end > index + match_length - 1:
                if is_chinese(content[end - 1]):
                    self.generatePattern(index, match_length, start, end - 1,
                                         result_dict, content)
                end -= 1
        except IndexError:
            return

    def getWinow(self, index, length, match_length, content):
        # expand leftwards up to self.window Chinese characters,
        # stopping at any block character
        start = index
        window = self.window
        for i in xrange(index):
            new_index = index - i - 1
            if new_index < 0:
                break
            if content[new_index] in self.block_chars_dict:
                break
            if is_chinese(content[new_index]):
                window -= 1
            start = new_index
            if window == 0:
                break
        # expand rightwards symmetrically
        end = index + match_length - 1
        window = self.window
        for i in xrange(index + match_length, length):
            new_index = i
            if new_index < 0:
                break
            if content[new_index] in self.block_chars_dict:
                break
            if is_chinese(content[new_index]):
                window -= 1
            end = new_index
            if window == 0:
                break
        return (start, end)
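# Illustrative usage sketch (not part of the original module). It assumes a
# working DictionarySearcher; the seed word, window size, and sample sentence
# are hypothetical. extractPattern() returns unicode sub-spans centered on
# each dictionary hit, trimmed so both endpoints are Chinese characters.
def _demo_window_extractor():
    extractor = WindowExtractor(["感冒"], 2)  # expand 2 Chinese chars each way
    extractor.setBlockChar(",")              # also stop expansion at commas
    for pattern in extractor.extractPattern("昨天得了感冒很难受。"):
        print pattern.encode("utf-8")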
class GenerateTag:

    def __init__(self, conf_map=None):
        # default conf
        if conf_map is None:
            conf_map = {}
            conf_map[CONF_FEATURE_STRING] = "char|dict"
            conf_map[CONF_FEATURE_REPTYPE] = "BIO"
            conf_map[CONF_DICT_FILE] = INVALID_FILE
            conf_map[CONF_PATTERN_FILE] = INVALID_FILE
            conf_map[CONF_FILTERWORD_FILE] = INVALID_FILE
        # ltp option
        self.ltp_ip = LTPSERVER[LTP_ENV]["Host"]
        self.ltp_port = int(LTPSERVER[LTP_ENV]["Port"])
        self.ltpClient = LtpHelper(self.ltp_ip, self.ltp_port, None)
        # tag option
        self.BEGIN_TAG = "B"
        self.INTER_TAG = "I"
        self.END_TAG = "E"
        self.SINGLE_TAG = "S"
        self.OTHER_TAG = "O"
        self.TAG_DIV = "-"
        self.has_end_tag = False
        self.has_single_tag = False
        # feature option
        self.FEA_DIV = "|"
        self.FEA_SUBDIV = "_"
        # filter word option
        self.common_tag = "COMMON"
        # initialize per-document state (data_map, charword_map, content)
        # so mkTag() is safe on first use
        self.resetTagger()
        # prepare data info
        self.initTagger(conf_map)

    def __del__(self):
        pass

    '''#############getter and setter#################'''

    def getFeatureDict(self, feature_type):
        if feature_type in self.data_map:
            return self.data_map[feature_type]
        else:
            return None

    def getEntityMap(self):
        return self.entity_map

    def setFeatureDict(self, feature_type, tag_dict):
        self.data_map[feature_type] = tag_dict

    '''#############main process#################'''

    # reset tagger state; call before tagging a new document
    def resetTagger(self):
        logging.info("start resetTagger()")
        self.data_map = {}
        self.charword_map = {}
        self.content = ""

    # generate features
    def mkTag(self, content, external_segment=None):
        logging.info("start mkTag()")
        # run ltp first
        segment_list = []
        if self.segment_based:
            if external_segment is None:
                segment_list = self.getLtpResult(self.ltpClient, content)
            else:
                segment_list = external_segment
            tmp_string = ""
            for word_item in segment_list:
                (word, pos_tag) = word_item
                tmp_string += word
            # LTP may drop some characters, so rebuild content from the segments
            content = tmp_string
        self.content = content
        # generate feature
        for tmp_list in self.feature_list:
            for item in tmp_list:
                if item in self.data_map:
                    logging.error("error: add same feature twice!!")
                    continue
                if item == FEATURE_CHAR:
                    self.data_map[item] = self.generateCharFeature(content)
                elif item == FEATURE_WORD:
                    (tag_dict, charword_map) = self.generateWordFeature(segment_list)
                    self.data_map[item] = tag_dict
                    self.charword_map = charword_map
                elif item == FEATURE_SEGMENT:
                    self.data_map[item] = self.generateSegmentFeature(segment_list)
                elif item == FEATURE_POS:
                    self.data_map[item] = self.generatePosFeature(segment_list)
                elif item == FEATURE_DICT:
                    self.data_map[item] = self.generateDictFeature(
                        content, self.dict_search, self.search_tagmap)
                elif item == FEATURE_PATTERN:
                    self.data_map[item] = self.generatePatternFeature(content)

    # generate char feature: one entry per unicode character
    @staticmethod
    def generateCharFeature(content):
        logging.info("start generateCharFeature()")
        content_uni = content.decode("utf-8")
        cnt = 0
        ret_map = {}
        while cnt < len(content_uni):
            ret_map[str(cnt)] = content_uni[cnt].encode("utf-8")
            cnt += 1
        return ret_map

    # calculate position for each word
    @staticmethod
    def generateWordFeature(segment_list):
        logging.info("start generateWordFeature()")
        cnt = 0
        ret_map = {}
        total_pos = 0
        charword_map = {}
        for (word, tag) in segment_list:
            word_len = len(word.decode("utf-8"))
            ret_map[str(cnt)] = word
            charword_map[str(total_pos)] = (cnt, word_len)
            total_pos += word_len
            cnt += 1
        return (ret_map, charword_map)

    # mark segmentation boundaries for each word
    def generateSegmentFeature(self, segment_list):
        logging.info("start generateSegmentFeature()")
        tag_dict = {}
        total_pos = 0
        for (word, tag) in segment_list:
            word_uni = word.decode("utf-8")
            word_len = len(word_uni)
            self.markTag(tag_dict, total_pos, word_len)
            total_pos += word_len
        return tag_dict

    # mark the POS tag for each word
    def generatePosFeature(self, segment_list):
        logging.info("start generatePosFeature()")
        tag_dict = {}
        total_pos = 0
        for (word, tag) in segment_list:
            word_uni = word.decode("utf-8")
            word_len = len(word_uni)
            self.markTag(tag_dict, total_pos, word_len, tag)
            total_pos += word_len
        return tag_dict

    # Get tag by search with word list
    def generateDictFeature(self, string, searcher, tagmap, all_search=False):
        logging.info("start generateDictFeature()")
        # perform max search first
        tag_dict = {}
        # all search
        if all_search:
            (matched_item, len_txt) = searcher.searchAll(string, "utf-8", True, False)
        else:
            (matched_item, len_txt) = searcher.maxSearchEx(string, "utf-8")
        for item in matched_item:
            word_len = len(item)
            word = item.encode("utf-8")
            pos = 1
            for start in matched_item[item]:
                # mark tag; different occurrences may carry different tags
                uni_tag_code = word + TAG_MAP_DIVIDER + UNI_TAG_POS
                pos_tag_code = word + TAG_MAP_DIVIDER + str(pos)
                if pos_tag_code in tagmap:
                    self.markTag(tag_dict, int(start), word_len, tagmap[pos_tag_code])
                elif uni_tag_code in tagmap:
                    self.markTag(tag_dict, int(start), word_len, tagmap[uni_tag_code])
                pos += 1
        return tag_dict

    # Get tag by search with pattern list
    def generatePatternFeature(self, string):
        logging.info("start generatePatternFeature()")
        tag_dict = {}
        # try every pattern
        for (pattern_des, tag) in self.pattern_list:
            try:
                pattern = re.compile(pattern_des, flags=re.IGNORECASE)
            except re.error:
                continue
            for m in pattern.finditer(string):
                (m_start, m_end) = m.span()
                # skip matches that hit the filter-word list
                if self.filterByWord(string[m_start:m_end], tag):
                    continue
                # convert byte offsets to unicode offsets
                if m_start == 0:
                    uni_start = 0
                else:
                    uni_start = len(string[0:m_start].decode("utf-8"))
                uni_end = len(string[0:m_end].decode("utf-8"))
                #logging.debug("%s\t%s" % (string[m_start:m_end], pattern_des))
                self.markTag(tag_dict, uni_start, uni_end - uni_start, tag)
        return tag_dict

    '''#############format transformer function#################'''

    # transform position dict to string
    def transDictToTagstring(self, content, tag_dict):
        # get labeled word list
        chunk_list = self.getChunkFromTag(content, tag_dict)
        all_search = DictionarySearcher()
        for (chunk, flag, tag) in chunk_list:
            if flag:
                all_search.addKey(chunk)
        # check each occurrence
        bFirst1 = True
        total_str = ""
        (matched_item, len_txt) = all_search.searchAll(content, "utf-8", True, False)
        for item in matched_item:
            word_len = len(item)
            word = item.encode("utf-8")
            tmp_str = "%s:" % (word)
            word_cnt = 1
            bFirst2 = True
            for m_start in matched_item[item]:
                if self.isValidChunk(tag_dict, int(m_start), word_len):
                    tmp_list = tag_dict[str(m_start)].split("-")
                    if len(tmp_list) == 2:
                        tag = tmp_list[1]
                    else:
                        tag = ""
                    if bFirst2:
                        bFirst2 = False
                        tmp_str += str(word_cnt) + "_" + tag
                    else:
                        tmp_str += ("," + str(word_cnt) + "_" + tag)
                word_cnt += 1
            if bFirst1:
                total_str += tmp_str
                bFirst1 = False
            else:
                total_str += (" " + tmp_str)
        return total_str

    # transform tagstring to dict
    def transTagstringToDict(self, content, tag_string):
        tag_map = {}
        all_search = DictionarySearcher()
        tmp_list2 = tag_string.split(" ")
        for item2 in tmp_list2:
            tmp_list3 = item2.split(":")
            word = tmp_list3[0]
            tmp_list4 = tmp_list3[1].split(",")
            for item4 in tmp_list4:
                tmp_list5 = item4.split("_")
                pos = tmp_list5[0]
                tag = tmp_list5[1]
                tag_map[word + TAG_MAP_DIVIDER + pos] = tag
            all_search.addKey(word)
        return self.generateDictFeature(content, all_search, tag_map, True)

    # word based transform if needed
    def wordBasedTransform(self):
        logging.info("start wordBasedTransform()")
        if not self.word_based:
            return False
        for key in self.data_map:
            if key == FEATURE_CHAR or key == FEATURE_WORD:
                continue
            old_dict = self.data_map[key]
            new_dict = {}
            chunk_list = self.getChunkFromTag(self.content, old_dict)
            chunk_pos = 0
            for (chunk, flag, tag) in chunk_list:
                word_len = len(chunk.decode("utf-8"))
                if not flag:
                    chunk_pos += word_len
                    continue
                adj_index = self.charIndexToWordIndex(chunk_pos, word_len)
                if adj_index is None:
                    chunk_pos += word_len
                    continue
                (adj_start, adj_len) = adj_index
                if key == FEATURE_POS:
                    new_dict[str(adj_start)] = tag
                else:
                    self.markTag(new_dict, adj_start, adj_len, tag)
                chunk_pos += word_len
            self.data_map[key] = new_dict
        return True

    # output tag info
    def outputByCrfFormat(self):
        logging.info("start outputByCrfFormat()")
        # merge feature tags in the same column
        merged_tag = []
        for tmp_list in self.feature_list:
            # single feature: no merge needed
            if len(tmp_list) == 1:
                merged_tag.append(self.data_map[tmp_list[0]])
                continue
            # merge
            tmp_tag = {}
            for item in tmp_list:
                # error check
                if item not in self.data_map:
                    print "no result for tag_type : %s" % item
                    return ""
                tmp_tag = self.mergeTagDictEx(self.content, tmp_tag,
                                              self.data_map[item])
            merged_tag.append(tmp_tag)
        # calculate line num
        line_num = len(self.content.decode("utf-8"))
        if FEATURE_CHAR in self.data_map:
            line_num = len(self.data_map[FEATURE_CHAR])
        elif FEATURE_WORD in self.data_map:
            line_num = len(self.data_map[FEATURE_WORD])
        # output feature
        inx = 0
        ret_value = ""
        while inx < line_num:
            inx_str = str(inx)
            column_inx = 1
            for tmp_map in merged_tag:
                if inx_str in tmp_map:
                    tag = tmp_map[inx_str]
                else:
                    tag = self.OTHER_TAG
                if column_inx == 1:
                    ret_value += "%s" % (tag)
                else:
                    ret_value += "\t%s" % (tag)
                column_inx += 1
            ret_value += "\n"
            inx += 1
        ret_value += "\n"
        return ret_value

    '''#############init function#################'''

    # init tagger
    def initTagger(self, conf_map):
        self.initEntityInfo(conf_map[CONF_DICT_FILE])
        self.initPatternInfo(conf_map[CONF_PATTERN_FILE],
                             conf_map[CONF_FILTERWORD_FILE])
        self.initFeatureInfo(conf_map[CONF_FEATURE_STRING])
        self.initRepTypeInfo(conf_map[CONF_FEATURE_REPTYPE])

    # load entity info
    def initEntityInfo(self, file_name):
        self.search_tagmap = {}
        self.entity_map = {}
        self.dict_search = DictionarySearcher()
        if file_name == INVALID_FILE:
            return
        # read entity list
        self.entity_map = self.ReadKeyValue(file_name)
        for word in self.entity_map:
            tag = self.entity_map[word]
            self.dict_search.addKey(word)
            self.search_tagmap[word + TAG_MAP_DIVIDER + UNI_TAG_POS] = tag

    # load pattern and filter-word info
    def initPatternInfo(self, pattern_file, filter_file):
        # prepare pattern list
        self.pattern_list = []
        if pattern_file != INVALID_FILE:
            self.pattern_list = self.ReadPairWord(pattern_file)
        # prepare filter search
        self.filter_search = {}
        filter_item = {}
        if filter_file != INVALID_FILE:
            filter_item = self.ReadKeyValue(filter_file)
        # create searchers first
        for word in filter_item:
            tag = filter_item[word]
            if tag != self.common_tag and (tag not in self.filter_search):
                self.filter_search[tag] = DictionarySearcher()
        # add words
        for word in filter_item:
            tag = filter_item[word]
            if tag == self.common_tag:
                for key in self.filter_search:
                    self.filter_search[key].addKey(word)
            else:
                self.filter_search[tag].addKey(word)
        return (self.pattern_list, self.filter_search)

    # init feature info
    def initFeatureInfo(self, feature_string):
        # char based or word based
        if feature_string.find(FEATURE_WORD) != -1 \
                and feature_string.find(FEATURE_CHAR) != -1:
            logging.error("error: mixing char-based and word-based features is forbidden.")
            return False
        self.word_based = False
        if feature_string.find(FEATURE_WORD) != -1:
            self.word_based = True
        # segment based
        self.segment_based = False
        if feature_string.find(FEATURE_SEGMENT) != -1 \
                or feature_string.find(FEATURE_POS) != -1 \
                or feature_string.find(FEATURE_WORD) != -1:
            self.segment_based = True
        # feature list
        self.feature_list = []
        tmp_list = feature_string.split(self.FEA_DIV)
        for item in tmp_list:
            if item.find(self.FEA_SUBDIV) != -1:
                self.feature_list.append(item.split(self.FEA_SUBDIV))
            else:
                self.feature_list.append([item])
        return True

    # init feature representation type
    def initRepTypeInfo(self, type_string):
        if type_string.find(self.BEGIN_TAG) == -1 or \
                type_string.find(self.INTER_TAG) == -1 or \
                type_string.find(self.OTHER_TAG) == -1:
            logging.error("error: must have B,I,O at least.")
            return False
        # check end tag
        if type_string.find(self.END_TAG) != -1:
            self.has_end_tag = True
        # check single tag
        if type_string.find(self.SINGLE_TAG) != -1:
            self.has_single_tag = True
        # set current end tag
        if self.has_end_tag:
            self.CUR_END_TAG = self.END_TAG
        else:
            self.CUR_END_TAG = self.INTER_TAG
        return True

    # read entity word list
    @staticmethod
    def ReadKeyValue(file_name):
        entity_hash = {}
        if not os.path.exists(file_name):
            return entity_hash
        fhandle = open(file_name)
        for line in fhandle:
            line = line.rstrip()
            if line == "":
                continue
            tmp_list = line.split("\t")
            if len(tmp_list) < 2:
                continue
            entity = tmp_list[0]
            tag = tmp_list[1]
            if entity in entity_hash:
                continue
            entity_hash[entity] = tag
        fhandle.close()
        return entity_hash

    # read pattern/tag pair list
    @staticmethod
    def ReadPairWord(file_name):
        ret_list = []
        if not os.path.exists(file_name):
            return ret_list
        fhandle = open(file_name)
        for line in fhandle:
            line = line.rstrip()
            if line == "":
                continue
            tmp_list = line.split("\t")
            if len(tmp_list) < 2:
                continue
            pattern = tmp_list[0]
            tag = tmp_list[1]
            ret_list.append((pattern, tag))
        fhandle.close()
        return ret_list

    '''#############util function#################'''

    # merge dict2's labeled chunks into a copy of dict1
    def mergeTagDictEx(self, content, dict1, dict2, all_merge=False):
        ret_dict = {}
        for item in dict1:
            ret_dict[item] = dict1[item]
        # get labeled word list
        chunk_list = self.getChunkFromTag(content, dict2)
        chunk_pos = 0
        for (chunk, flag, tag) in chunk_list:
            word_len = len(chunk.decode("utf-8"))
            if all_merge or flag:
                self.markTag(ret_dict, chunk_pos, word_len, tag)
            chunk_pos += word_len
        return ret_dict

    # mark the corresponding positions in tag_dict
    def markTag(self, tag_dict, begin, word_len, appendix=None):
        # search for conflicts: skip the whole span if any position is taken
        inx = 0
        while inx < word_len:
            index = str(begin + inx)
            inx += 1
            if index in tag_dict:
                return
        # make tag
        if appendix is not None:
            app = self.TAG_DIV + appendix
        else:
            app = ""
        inx = 0
        while inx < word_len:
            index = str(begin + inx)
            if index in tag_dict:
                inx += 1
                continue
            if inx == 0:
                if self.has_single_tag and word_len == 1:
                    tag_dict[index] = self.SINGLE_TAG + app
                else:
                    tag_dict[index] = self.BEGIN_TAG + app
            elif inx == word_len - 1:
                tag_dict[index] = self.CUR_END_TAG + app
            else:
                tag_dict[index] = self.INTER_TAG + app
            inx += 1

    # filter out wrongly matched words
    def filterByWord(self, string, tag):
        if tag in self.filter_search:
            (filter_result, tmp_none) = \
                self.filter_search[tag].maxSearchEx(string, "utf-8")
            if len(filter_result) > 0:
                return True
        return False

    # restore chunks from tag dict
    def getChunkFromTag(self, content, entity_tag):
        content_uni = content.decode("utf-8")
        inx = 0
        curFlag = False
        bWord = "".decode("utf-8")
        chunk_list = []
        while inx < len(content_uni):
            un_char = content_uni[inx]
            str_inx = str(inx)
            if str_inx in entity_tag:
                next_flag = True
            else:
                next_flag = False
            if str(inx - 1) in entity_tag:
                tmp_list = entity_tag[str(inx - 1)].split(self.TAG_DIV)
                if len(tmp_list) == 2:
                    oldTag = tmp_list[1]
                else:
                    oldTag = "UNKNOWN"
            else:
                oldTag = "UNKNOWN"
            # found a B- (or S-): flush the previous chunk
            if (str_inx in entity_tag) and \
                    (self.checkTagPrefix(entity_tag[str_inx], self.BEGIN_TAG)
                     or self.checkTagPrefix(entity_tag[str_inx], self.SINGLE_TAG)):
                if len(bWord) > 0:
                    chunk_list.append((bWord.encode("utf-8"), curFlag, oldTag))
                curFlag = next_flag
                bWord = "".decode("utf-8")
            elif curFlag != next_flag:
                # flag changed: flush the previous chunk
                if len(bWord) > 0:
                    chunk_list.append((bWord.encode("utf-8"), curFlag, oldTag))
                curFlag = next_flag
                bWord = "".decode("utf-8")
            bWord += un_char
            inx += 1
        if curFlag:
            tmp_list = entity_tag[str(len(content_uni) - 1)].split(self.TAG_DIV)
            if len(tmp_list) == 2:
                oldTag = tmp_list[1]
            else:
                oldTag = "UNKNOWN"
        else:
            oldTag = "UNKNOWN"
        if len(bWord) > 0:
            chunk_list.append((bWord.encode("utf-8"), curFlag, oldTag))
        return chunk_list

    def getTagFromCrfResult(self, line_list, column_id):
        tag_dict = {}
        cnt = 0
        for line in line_list:
            cur_cnt = cnt
            cnt += 1
            tmp_list = line.split("\t")
            if len(tmp_list) < column_id + 1:
                continue
            if tmp_list[column_id] != self.OTHER_TAG:
                tag_dict[str(cur_cnt)] = tmp_list[column_id]
        return tag_dict

    @staticmethod
    def getContentFromCrfResult(line_list, column_id=0):
        content = ""
        for line in line_list:
            tmp_list = line.split("\t")
            if len(tmp_list) < column_id + 1:
                break
            content += tmp_list[column_id]
        return content

    # check prefix
    def checkTagPrefix(self, tag_string, prefix):
        if tag_string == prefix or tag_string.find(prefix + self.TAG_DIV) != -1:
            return True
        return False

    # is valid chunk
    def isValidChunk(self, tag_dict, begin, word_len):
        if word_len < 1:
            return False
        if str(begin) not in tag_dict:
            return False
        # must begin with B- or S-
        if (not self.checkTagPrefix(tag_dict[str(begin)], self.BEGIN_TAG)) and \
                (not self.checkTagPrefix(tag_dict[str(begin)], self.SINGLE_TAG)):
            return False
        # check inter tags
        inx = 1
        while inx < (word_len - 1):
            inter = str(begin + inx)
            if inter not in tag_dict:
                #logging.debug("b=%d,inx=%d,word_len=%d" % (begin, inx, word_len))
                return False
            if not self.checkTagPrefix(tag_dict[inter], self.INTER_TAG):
                return False
            inx += 1
        # check end tag; skip when word_len == 1
        end_str = str(begin + word_len - 1)
        if word_len > 1:
            if (end_str not in tag_dict) or \
                    (not self.checkTagPrefix(tag_dict[end_str], self.CUR_END_TAG)):
                return False
        # the next position must start another chunk (or be untagged)
        next_str = str(begin + word_len)
        if (next_str not in tag_dict) or \
                self.checkTagPrefix(tag_dict[next_str], self.BEGIN_TAG) or \
                self.checkTagPrefix(tag_dict[next_str], self.SINGLE_TAG):
            return True
        else:
            return False

    # char index to word index
    def charIndexToWordIndex(self, char_pos, char_len):
        if str(char_pos) not in self.charword_map:
            return None
        (adj_start, tmp) = self.charword_map[str(char_pos)]
        pos = char_pos
        end = char_pos + char_len
        adj_len = 0
        while pos < end:
            if str(pos) not in self.charword_map:
                return None
            (tmp, word_len) = self.charword_map[str(pos)]
            adj_len += 1
            pos += word_len
        return (adj_start, adj_len)

    # strip the B/I/E/S prefix from a tag, if present
    def normalizeTag(self, old_tag):
        if old_tag.find(self.TAG_DIV) == -1:
            return old_tag
        tmp_list = old_tag.split(self.TAG_DIV)
        return tmp_list[1]

    # get ltp result
    @staticmethod
    def getLtpResult(ltpClient, content):
        (a, b, flag) = ltpClient.getSegment(content, 1, 2, 0)
        segment_list = []
        if flag:
            tmp_list = ltpClient.getSegmentInfo(b)
            for word_item in tmp_list:
                (word, begin, str_len, pos_tag) = word_item
                segment_list.append((word, pos_tag))
        return segment_list
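# Illustrative usage sketch (not part of the original module): the typical
# GenerateTag lifecycle. The conf values below simply restate the defaults
# from __init__; "char|dict" keeps segment_based False, so no LTP server is
# contacted (constructing LtpHelper is assumed to be connection-free).
# outputByCrfFormat() emits one tab-separated column per feature_list entry
# and one line per character, ready for CRF++-style input.
def _demo_generate_tag():
    conf_map = {}
    conf_map[CONF_FEATURE_STRING] = "char|dict"
    conf_map[CONF_FEATURE_REPTYPE] = "BIO"
    conf_map[CONF_DICT_FILE] = INVALID_FILE        # no entity dictionary
    conf_map[CONF_PATTERN_FILE] = INVALID_FILE     # no regex patterns
    conf_map[CONF_FILTERWORD_FILE] = INVALID_FILE  # no filter words
    tagger = GenerateTag(conf_map)
    tagger.resetTagger()        # reset per-document state before each run
    tagger.mkTag("北京是中国的首都")
    print tagger.outputByCrfFormat()
    return tagger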
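# Illustrative sketch (not part of the original module) of the tag-string
# wire format used by transDictToTagstring()/transTagstringToDict():
# "word:1_TAG,2_TAG other:1_TAG", i.e. occurrence index, "_", tag, with words
# separated by spaces. Exact round-tripping relies on the assumed semantics
# of DictionarySearcher.searchAll(); the sample content and tag are
# hypothetical. Pass an initialized tagger, e.g. from _demo_generate_tag().
def _demo_tagstring_roundtrip(tagger):
    content = "昨天感冒了"
    # the 1st (and only) occurrence of "感冒" carries tag DISEASE
    tag_dict = tagger.transTagstringToDict(content, "感冒:1_DISEASE")
    # tag_dict now maps str(char_index) -> "B-DISEASE" / "I-DISEASE"
    print tagger.transDictToTagstring(content, tag_dict)  # 感冒:1_DISEASE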