class WindowExtractor:
    """Extract text windows around dictionary matches.

    Every word of ``word_list`` is registered in a ``DictionarySearcher``.
    ``extractPattern`` then looks the words up in a text and, for each hit,
    returns the substrings that contain the hit plus up to ``window``
    neighbouring characters accepted by ``is_chinese`` on either side.

    Attributes:
        dic_search: the ``DictionarySearcher`` holding the registered words.
        window: how many ``is_chinese`` characters to take on each side.
        block_chars_dict: characters a window never crosses (the values are
            always 1; only the keys matter).
    """

    # NOTE(review): this file defines WindowExtractor twice with the same
    # logic; the later definition replaces the earlier one at import time.

    def __init__(self, word_list, window):
        """Register ``word_list`` and install the default block characters.

        Args:
            word_list: iterable of words handed to
                ``DictionarySearcher.addKey``.
            window: number of ``is_chinese`` characters taken per side.
        """
        self.dic_search = DictionarySearcher()
        for item in word_list:
            self.dic_search.addKey(item, "0")
        self.window = window
        # Written as escapes so the table does not depend on the encoding of
        # this source file: ideographic full stop, "?", newline and carriage
        # return.
        # NOTE(review): the original registered "?" twice; one of the two may
        # have been a different punctuation mark lost to a bad re-encoding --
        # confirm.
        self.block_chars_dict = dict.fromkeys(
            (u"\u3002", u"?", u"\n", u"\r"), 1)

    @staticmethod
    def _to_text(value):
        """Return ``value`` as text, decoding UTF-8 byte strings."""
        if isinstance(value, bytes):
            return value.decode("utf-8")
        return value

    def setBlockChar(self, char):
        """Add ``char`` (UTF-8 bytes or text) to the block characters."""
        self.block_chars_dict[self._to_text(char)] = 1

    def extractPattern(self, s):
        """Return the window substrings around every dictionary match in ``s``.

        Centered on each matched word, the window extends up to
        ``self.window`` ``is_chinese`` characters to the left and right.

        Args:
            s: the text to scan, as UTF-8 bytes or as text. It is passed
                unchanged to ``DictionarySearcher.maxSearch``.

        Returns:
            A list of substrings of the decoded text, one for every range
            that ``generatePattern`` flagged with 1.
        """
        pattern_result = []
        s_unicode = self._to_text(s)
        content_length = len(s_unicode)
        result, _len_txt = self.dic_search.maxSearch(s, "utf-8")
        for word in result:
            # NOTE(review): assumes len(word) is the match length in
            # characters of the decoded text -- confirm against maxSearch.
            match_length = len(word)
            # The first element of each entry is skipped; presumably it is
            # the value stored by addKey and the rest are match offsets --
            # verify against DictionarySearcher.
            for index in result[word][1:]:
                index = int(index)
                start, end = self.getWinow(
                    index, content_length, match_length, s_unicode)
                result_range = {}
                self.generatePattern(
                    index, match_length, start, end, result_range, s_unicode)
                for key, flag in result_range.items():
                    if flag == 1:
                        left, right = key.split("-")
                        pattern_result.append(
                            s_unicode[int(left):int(right) + 1])
        return pattern_result

    def generatePattern(self, index, match_length, start, end, result_dict,
                        content):
        """Recursively enumerate the sub-ranges of a window around a match.

        Args:
            index: offset of the first character of the match in ``content``.
            match_length: number of characters in the match.
            start: first offset of the range to record (inclusive).
            end: last offset of the range to record (inclusive).
            result_dict: filled in place; maps ``"start-end"`` to 1 when both
                boundary characters satisfy ``is_chinese`` and to 0
                otherwise. It doubles as the visited set of the recursion.
            content: the decoded text.

        Returns:
            None; results are reported through ``result_dict``.
        """
        # Stop once the range has shrunk to the match itself (or less).
        if start >= index and end <= index + match_length - 1:
            return
        # Record the current range.
        key = "%d-%d" % (start, end)
        if key in result_dict:
            return
        try:
            if is_chinese(content[start]) and is_chinese(content[end]):
                result_dict[key] = 1
            else:
                result_dict[key] = 0
            # Left recursion: move the start to the next is_chinese character.
            # NOTE(review): ``start`` is advanced in place here, so the
            # right-hand recursion below runs from the advanced position
            # rather than from the original ``start``; kept as in the
            # original -- confirm this is intended.
            while start < index:
                if is_chinese(content[start + 1]):
                    self.generatePattern(index, match_length, start + 1, end,
                                         result_dict, content)
                    break
                start = start + 1
            # Right recursion: try every is_chinese character as the new end.
            while end > index + match_length - 1:
                if is_chinese(content[end - 1]):
                    self.generatePattern(index, match_length, start, end - 1,
                                         result_dict, content)
                end -= 1
        except Exception:
            # Best effort, as in the original: a failure while scanning (for
            # example an offset outside ``content``) abandons this branch.
            # Unlike a bare ``except`` this lets KeyboardInterrupt and
            # SystemExit propagate.
            return

    def getWinow(self, index, length, match_length, content):
        """Compute the window boundaries around a match.

        Args:
            index: offset of the first character of the match.
            length: number of characters in ``content``.
            match_length: number of characters in the match.
            content: the decoded text.

        Returns:
            A ``(start, end)`` tuple of inclusive offsets. Each side stops at
            a block character, at the edge of the text, or once
            ``self.window`` ``is_chinese`` characters have been taken.
        """
        # NOTE(review): the source had lost its indentation; ``start`` and
        # ``end`` are assumed to advance on every scanned character, with
        # only the window counter restricted to is_chinese characters --
        # confirm against the original layout.
        start = index
        window = self.window
        for new_index in range(index - 1, -1, -1):
            char = content[new_index]
            if char in self.block_chars_dict:
                break
            if is_chinese(char):
                window -= 1
            start = new_index
            if window == 0:
                break

        end = index + match_length - 1
        window = self.window
        for new_index in range(index + match_length, length):
            # Kept from the original: never index from the end of the text
            # when the computed offset is negative.
            if new_index < 0:
                break
            char = content[new_index]
            if char in self.block_chars_dict:
                break
            if is_chinese(char):
                window -= 1
            end = new_index
            if window == 0:
                break
        return (start, end)
class WindowExtractor:
    """Extract text windows around dictionary matches.

    Every word of ``word_list`` is registered in a ``DictionarySearcher``.
    ``extractPattern`` then looks the words up in a text and, for each hit,
    returns the substrings that contain the hit plus up to ``window``
    neighbouring characters accepted by ``is_chinese`` on either side.

    Attributes:
        dic_search: the ``DictionarySearcher`` holding the registered words.
        window: how many ``is_chinese`` characters to take on each side.
        block_chars_dict: characters a window never crosses (the values are
            always 1; only the keys matter).
    """

    # NOTE(review): this file defines WindowExtractor twice with the same
    # logic; the later definition replaces the earlier one at import time.

    def __init__(self, word_list, window):
        """Register ``word_list`` and install the default block characters.

        Args:
            word_list: iterable of words handed to
                ``DictionarySearcher.addKey``.
            window: number of ``is_chinese`` characters taken per side.
        """
        self.dic_search = DictionarySearcher()
        for item in word_list:
            self.dic_search.addKey(item, "0")
        self.window = window
        # Written as escapes so the table does not depend on the encoding of
        # this source file: ideographic full stop, "?", newline and carriage
        # return.
        # NOTE(review): the original registered "?" twice; one of the two may
        # have been a different punctuation mark lost to a bad re-encoding --
        # confirm.
        self.block_chars_dict = dict.fromkeys(
            (u"\u3002", u"?", u"\n", u"\r"), 1)

    @staticmethod
    def _to_text(value):
        """Return ``value`` as text, decoding UTF-8 byte strings."""
        if isinstance(value, bytes):
            return value.decode("utf-8")
        return value

    def setBlockChar(self, char):
        """Add ``char`` (UTF-8 bytes or text) to the block characters."""
        self.block_chars_dict[self._to_text(char)] = 1

    def extractPattern(self, s):
        """Return the window substrings around every dictionary match in ``s``.

        Centered on each matched word, the window extends up to
        ``self.window`` ``is_chinese`` characters to the left and right.

        Args:
            s: the text to scan, as UTF-8 bytes or as text. It is passed
                unchanged to ``DictionarySearcher.maxSearch``.

        Returns:
            A list of substrings of the decoded text, one for every range
            that ``generatePattern`` flagged with 1.
        """
        pattern_result = []
        s_unicode = self._to_text(s)
        content_length = len(s_unicode)
        result, _len_txt = self.dic_search.maxSearch(s, "utf-8")
        for word in result:
            # NOTE(review): assumes len(word) is the match length in
            # characters of the decoded text -- confirm against maxSearch.
            match_length = len(word)
            # The first element of each entry is skipped; presumably it is
            # the value stored by addKey and the rest are match offsets --
            # verify against DictionarySearcher.
            for index in result[word][1:]:
                index = int(index)
                start, end = self.getWinow(
                    index, content_length, match_length, s_unicode)
                result_range = {}
                self.generatePattern(
                    index, match_length, start, end, result_range, s_unicode)
                for key, flag in result_range.items():
                    if flag == 1:
                        left, right = key.split("-")
                        pattern_result.append(
                            s_unicode[int(left):int(right) + 1])
        return pattern_result

    def generatePattern(self, index, match_length, start, end, result_dict,
                        content):
        """Recursively enumerate the sub-ranges of a window around a match.

        Args:
            index: offset of the first character of the match in ``content``.
            match_length: number of characters in the match.
            start: first offset of the range to record (inclusive).
            end: last offset of the range to record (inclusive).
            result_dict: filled in place; maps ``"start-end"`` to 1 when both
                boundary characters satisfy ``is_chinese`` and to 0
                otherwise. It doubles as the visited set of the recursion.
            content: the decoded text.

        Returns:
            None; results are reported through ``result_dict``.
        """
        # Stop once the range has shrunk to the match itself (or less).
        if start >= index and end <= index + match_length - 1:
            return
        # Record the current range.
        key = "%d-%d" % (start, end)
        if key in result_dict:
            return
        try:
            if is_chinese(content[start]) and is_chinese(content[end]):
                result_dict[key] = 1
            else:
                result_dict[key] = 0
            # Left recursion: move the start to the next is_chinese character.
            # NOTE(review): ``start`` is advanced in place here, so the
            # right-hand recursion below runs from the advanced position
            # rather than from the original ``start``; kept as in the
            # original -- confirm this is intended.
            while start < index:
                if is_chinese(content[start + 1]):
                    self.generatePattern(index, match_length, start + 1, end,
                                         result_dict, content)
                    break
                start = start + 1
            # Right recursion: try every is_chinese character as the new end.
            while end > index + match_length - 1:
                if is_chinese(content[end - 1]):
                    self.generatePattern(index, match_length, start, end - 1,
                                         result_dict, content)
                end -= 1
        except Exception:
            # Best effort, as in the original: a failure while scanning (for
            # example an offset outside ``content``) abandons this branch.
            # Unlike a bare ``except`` this lets KeyboardInterrupt and
            # SystemExit propagate.
            return

    def getWinow(self, index, length, match_length, content):
        """Compute the window boundaries around a match.

        Args:
            index: offset of the first character of the match.
            length: number of characters in ``content``.
            match_length: number of characters in the match.
            content: the decoded text.

        Returns:
            A ``(start, end)`` tuple of inclusive offsets. Each side stops at
            a block character, at the edge of the text, or once
            ``self.window`` ``is_chinese`` characters have been taken.
        """
        # NOTE(review): the source had lost its indentation; ``start`` and
        # ``end`` are assumed to advance on every scanned character, with
        # only the window counter restricted to is_chinese characters --
        # confirm against the original layout.
        start = index
        window = self.window
        for new_index in range(index - 1, -1, -1):
            char = content[new_index]
            if char in self.block_chars_dict:
                break
            if is_chinese(char):
                window -= 1
            start = new_index
            if window == 0:
                break

        end = index + match_length - 1
        window = self.window
        for new_index in range(index + match_length, length):
            # Kept from the original: never index from the end of the text
            # when the computed offset is negative.
            if new_index < 0:
                break
            char = content[new_index]
            if char in self.block_chars_dict:
                break
            if is_chinese(char):
                window -= 1
            end = new_index
            if window == 0:
                break
        return (start, end)