Example #1
0
class WindowExtractor:
    """Collect text snippets surrounding dictionary words found in a text.

    The words are registered in a ``DictionarySearcher``.  For each match,
    a window of up to ``window`` Chinese characters on either side is
    computed, and every sub-span of that window that still reaches outside
    the match and both starts and ends on a Chinese character is returned.
    """

    def __init__(self, word_list, window):
        """Register the words to search for and the window size.

        word_list: iterable of words, each handed to
            ``DictionarySearcher.addKey`` with the value ``"0"``.
        window: number of Chinese characters to include on each side of a
            match.  NOTE: the scan in ``getWinow`` stops only when its
            counter reaches exactly 0, so a non-positive value does not
            mean "no context".
        """
        self.dic_search = DictionarySearcher()
        for item in word_list:
            self.dic_search.addKey(item, "0")
        self.window = window
        # Characters a window never crosses (sentence / line boundaries).
        # NOTE(review): "?" used to be registered twice here; the second
        # entry may have been meant as a different terminator that was
        # lost in an encoding conversion -- confirm.
        self.block_chars_dict = {}
        for char in ("。", "?", "\n", "\r"):
            self.block_chars_dict[self._to_unicode(char)] = 1

    @staticmethod
    def _to_unicode(value):
        """Return ``value`` as text, decoding UTF-8 only for byte strings.

        Calling ``.decode`` on text that is already decoded fails for
        non-ASCII input on Python 2 and does not exist on Python 3.
        """
        if isinstance(value, bytes):
            return value.decode("utf-8")
        return value

    def setBlockChar(self, char):
        """Register an additional character that windows must not cross.

        char: a single character, as a UTF-8 byte string or as text.
        """
        self.block_chars_dict[self._to_unicode(char)] = 1

    def extractPattern(self, s):
        """Return the context snippets around every dictionary match in s.

        Centered on each matched word, the window extends up to
        ``self.window`` Chinese characters to the left and to the right.

        s: the text to scan, as a UTF-8 byte string.  It is forwarded
            unchanged to ``DictionarySearcher.maxSearch``; text that is
            already decoded is used as is for the windowing.
        Returns a list of substrings of the decoded text.
        """
        pattern_result = []
        s_unicode = self._to_unicode(s)
        content_length = len(s_unicode)
        # The second element returned by maxSearch is not needed here.
        result, _ = self.dic_search.maxSearch(s, "utf-8")
        for word in result:
            # assumes the keys returned by maxSearch are decoded text, so
            # len() counts characters -- TODO confirm
            match_length = len(word)
            # The first element of each entry is skipped; the remaining
            # ones are used as match positions (presumably the first is
            # the value stored with the key -- verify against maxSearch).
            for index in result[word][1:]:
                index = int(index)
                start, end = self.getWinow(index, content_length,
                                           match_length, s_unicode)
                # generate pattern
                result_range = {}
                self.generatePattern(index, match_length, start, end,
                                     result_range, s_unicode)
                for span_key in result_range:
                    if result_range[span_key] == 1:
                        first, last = span_key.split("-")
                        pattern_result.append(
                            s_unicode[int(first):int(last) + 1])
        return pattern_result

    def generatePattern(self, index, match_length, start, end, result_dict,
                        content):
        """Recursively enumerate the sub-spans of ``[start, end]``.

        Every visited span is stored in ``result_dict`` under the key
        ``"start-end"`` with value 1 when both of its boundary characters
        satisfy ``is_chinese`` and 0 otherwise.  A span lying entirely
        inside the match ``[index, index + match_length - 1]`` is not
        recorded and ends the recursion.

        index: position of the first character of the match in content.
        match_length: number of characters in the match.
        start, end: inclusive bounds of the span to record and shrink.
        result_dict: dict filled in place; it doubles as the memo that
            stops a span from being expanded twice.
        content: the decoded text being scanned.
        """
        if start >= index and end <= index + match_length - 1:
            return
        # Record the span itself.
        key = "%d-%d" % (start, end)
        if key in result_dict:
            return
        try:
            if is_chinese(content[start]) and is_chinese(content[end]):
                result_dict[key] = 1
            else:
                result_dict[key] = 0
            # Left recursion: advance start to just before the next
            # Chinese character and recurse once from that character.
            while start < index:
                if is_chinese(content[start + 1]):
                    self.generatePattern(index, match_length, start + 1, end,
                                         result_dict, content)
                    break
                start = start + 1
            # Right recursion: unlike the left side there is no break, so
            # every smaller end that is a Chinese character is tried; the
            # memo check above makes repeated spans return immediately.
            # start may have been advanced by the loop above.
            while end > index + match_length - 1:
                if is_chinese(content[end - 1]):
                    self.generatePattern(index, match_length, start, end - 1,
                                         result_dict, content)
                end -= 1
        except Exception:
            # Best effort: abandon this span on any error (e.g. a position
            # outside content).  Catching Exception instead of everything
            # lets KeyboardInterrupt and SystemExit propagate.
            return

    def getWinow(self, index, length, match_length, content):
        """Compute the inclusive window bounds around one match.

        index: position of the first character of the match in content.
        length: number of characters in content.
        match_length: number of characters in the match.
        content: the decoded text being scanned.
        Returns ``(start, end)``.  Each side grows one character at a
        time until a block character or the edge of the text is reached,
        or until ``self.window`` characters satisfying ``is_chinese``
        have been taken; other characters are included without being
        counted.
        """
        # Walk left from the character just before the match.
        start = index
        window = self.window
        pos = index - 1
        while pos >= 0:
            char = content[pos]
            if char in self.block_chars_dict:
                break
            if is_chinese(char):
                window -= 1
            start = pos
            if window == 0:
                break
            pos -= 1

        # Walk right from the character just after the match.
        end = index + match_length - 1
        window = self.window
        pos = index + match_length
        while pos < length:
            char = content[pos]
            if char in self.block_chars_dict:
                break
            if is_chinese(char):
                window -= 1
            end = pos
            if window == 0:
                break
            pos += 1
        return (start, end)
Example #2
0
class WindowExtractor:
    """Collect text snippets surrounding dictionary words found in a text.

    The words are registered in a ``DictionarySearcher``.  For each match,
    a window of up to ``window`` Chinese characters on either side is
    computed, and every sub-span of that window that still reaches outside
    the match and both starts and ends on a Chinese character is returned.
    """

    def __init__(self, word_list, window):
        """Register the words to search for and the window size.

        word_list: iterable of words, each handed to
            ``DictionarySearcher.addKey`` with the value ``"0"``.
        window: number of Chinese characters to include on each side of a
            match.  NOTE: the scan in ``getWinow`` stops only when its
            counter reaches exactly 0, so a non-positive value does not
            mean "no context".
        """
        self.dic_search = DictionarySearcher()
        for item in word_list:
            self.dic_search.addKey(item, "0")
        self.window = window
        # Characters a window never crosses (sentence / line boundaries).
        # NOTE(review): "?" used to be registered twice here; the second
        # entry may have been meant as a different terminator that was
        # lost in an encoding conversion -- confirm.
        self.block_chars_dict = {}
        for char in ("。", "?", "\n", "\r"):
            self.block_chars_dict[self._to_unicode(char)] = 1

    @staticmethod
    def _to_unicode(value):
        """Return ``value`` as text, decoding UTF-8 only for byte strings.

        Calling ``.decode`` on text that is already decoded fails for
        non-ASCII input on Python 2 and does not exist on Python 3.
        """
        if isinstance(value, bytes):
            return value.decode("utf-8")
        return value

    def setBlockChar(self, char):
        """Register an additional character that windows must not cross.

        char: a single character, as a UTF-8 byte string or as text.
        """
        self.block_chars_dict[self._to_unicode(char)] = 1

    def extractPattern(self, s):
        """Return the context snippets around every dictionary match in s.

        Centered on each matched word, the window extends up to
        ``self.window`` Chinese characters to the left and to the right.

        s: the text to scan, as a UTF-8 byte string.  It is forwarded
            unchanged to ``DictionarySearcher.maxSearch``; text that is
            already decoded is used as is for the windowing.
        Returns a list of substrings of the decoded text.
        """
        pattern_result = []
        s_unicode = self._to_unicode(s)
        content_length = len(s_unicode)
        # The second element returned by maxSearch is not needed here.
        result, _ = self.dic_search.maxSearch(s, "utf-8")
        for word in result:
            # assumes the keys returned by maxSearch are decoded text, so
            # len() counts characters -- TODO confirm
            match_length = len(word)
            # The first element of each entry is skipped; the remaining
            # ones are used as match positions (presumably the first is
            # the value stored with the key -- verify against maxSearch).
            for index in result[word][1:]:
                index = int(index)
                start, end = self.getWinow(index, content_length,
                                           match_length, s_unicode)
                # generate pattern
                result_range = {}
                self.generatePattern(index, match_length, start, end,
                                     result_range, s_unicode)
                for span_key in result_range:
                    if result_range[span_key] == 1:
                        first, last = span_key.split("-")
                        pattern_result.append(
                            s_unicode[int(first):int(last) + 1])
        return pattern_result

    def generatePattern(self, index, match_length, start, end, result_dict,
                        content):
        """Recursively enumerate the sub-spans of ``[start, end]``.

        Every visited span is stored in ``result_dict`` under the key
        ``"start-end"`` with value 1 when both of its boundary characters
        satisfy ``is_chinese`` and 0 otherwise.  A span lying entirely
        inside the match ``[index, index + match_length - 1]`` is not
        recorded and ends the recursion.

        index: position of the first character of the match in content.
        match_length: number of characters in the match.
        start, end: inclusive bounds of the span to record and shrink.
        result_dict: dict filled in place; it doubles as the memo that
            stops a span from being expanded twice.
        content: the decoded text being scanned.
        """
        if start >= index and end <= index + match_length - 1:
            return
        # Record the span itself.
        key = "%d-%d" % (start, end)
        if key in result_dict:
            return
        try:
            if is_chinese(content[start]) and is_chinese(content[end]):
                result_dict[key] = 1
            else:
                result_dict[key] = 0
            # Left recursion: advance start to just before the next
            # Chinese character and recurse once from that character.
            while start < index:
                if is_chinese(content[start + 1]):
                    self.generatePattern(index, match_length, start + 1, end,
                                         result_dict, content)
                    break
                start = start + 1
            # Right recursion: unlike the left side there is no break, so
            # every smaller end that is a Chinese character is tried; the
            # memo check above makes repeated spans return immediately.
            # start may have been advanced by the loop above.
            while end > index + match_length - 1:
                if is_chinese(content[end - 1]):
                    self.generatePattern(index, match_length, start, end - 1,
                                         result_dict, content)
                end -= 1
        except Exception:
            # Best effort: abandon this span on any error (e.g. a position
            # outside content).  Catching Exception instead of everything
            # lets KeyboardInterrupt and SystemExit propagate.
            return

    def getWinow(self, index, length, match_length, content):
        """Compute the inclusive window bounds around one match.

        index: position of the first character of the match in content.
        length: number of characters in content.
        match_length: number of characters in the match.
        content: the decoded text being scanned.
        Returns ``(start, end)``.  Each side grows one character at a
        time until a block character or the edge of the text is reached,
        or until ``self.window`` characters satisfying ``is_chinese``
        have been taken; other characters are included without being
        counted.
        """
        # Walk left from the character just before the match.
        start = index
        window = self.window
        pos = index - 1
        while pos >= 0:
            char = content[pos]
            if char in self.block_chars_dict:
                break
            if is_chinese(char):
                window -= 1
            start = pos
            if window == 0:
                break
            pos -= 1

        # Walk right from the character just after the match.
        end = index + match_length - 1
        window = self.window
        pos = index + match_length
        while pos < length:
            char = content[pos]
            if char in self.block_chars_dict:
                break
            if is_chinese(char):
                window -= 1
            end = pos
            if window == 0:
                break
            pos += 1
        return (start, end)