Example #1
 def initialize_detector(self):
     t1 = time.time()
     self.lm = kenlm.Model(self.language_model_path)
     t2 = time.time()
     default_logger.debug('Loaded language model: %s, spend: %s s' %
                          (self.language_model_path, str(t2 - t1)))
     # word -> frequency dict
     self.word_freq = self.load_word_freq_dict(self.word_freq_path)
     t3 = time.time()
     default_logger.debug(
         'Loaded word freq file: %s, size: %d, spend: %s s' %
         (self.word_freq_path, len(self.word_freq), str(t3 - t2)))
     # custom confusion set
     self.custom_confusion = self._get_custom_confusion_dict(
         self.custom_confusion_path)
     t4 = time.time()
     default_logger.debug(
         'Loaded confusion file: %s, size: %d, spend: %s s' %
         (self.custom_confusion_path, len(
             self.custom_confusion), str(t4 - t3)))
     # custom word-segmentation dictionary
     self.custom_word_dict = self.load_word_freq_dict(self.custom_word_path)
     # merge the segmentation dictionary with the custom dictionary
     self.word_freq.update(self.custom_word_dict)
     t5 = time.time()
     default_logger.debug(
         'Loaded custom word file: %s, size: %d, spend: %s s' %
         (self.custom_word_path, len(self.custom_word_dict), str(t5 - t4)))
     self.tokenizer = Tokenizer(dict_path=self.word_freq_path,
                                custom_word_freq_dict=self.custom_word_dict,
                                custom_confusion_dict=self.custom_confusion)
     t6 = time.time()
     default_logger.info('Loaded dict ok, spend: %s s' % str(t6 - t1))
     self.initialized_detector = True
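
Note: initialize_detector is not called from __init__; the detector is built lazily on first use. A minimal sketch of the guard that drives it (shown in full in Example #5):

    def check_detector_initialized(self):
        # build the LM, dictionaries and tokenizer only once, on first use
        if not self.initialized_detector:
            self.initialize_detector()
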
Example #2
    def initialize_detector(self):
        t1 = time.time()
        if self.enable_rnnlm:
            self.lm = LM(self.rnnlm_model_dir, self.rnnlm_vocab_path)
            logger.debug('Loaded language model: %s, spend: %s s' %
                         (self.rnnlm_model_dir, str(time.time() - t1)))
        else:
            try:
                import kenlm
            except ImportError:
                raise ImportError(
                    'pycorrector dependencies are not fully installed; '
                    'they are required for the statistical language model. '
                    'Please run "pip install kenlm" to install it '
                    '(kenlm does not support Windows). On Windows, install '
                    'tensorflow and set enable_rnnlm=True instead.'
                )

            self.lm = kenlm.Model(self.language_model_path)
            logger.debug('Loaded language model: %s, spend: %s s' %
                         (self.language_model_path, str(time.time() - t1)))

        # word -> frequency dict
        t2 = time.time()
        self.word_freq = self.load_word_freq_dict(self.word_freq_path)
        t3 = time.time()
        logger.debug('Loaded word freq file: %s, size: %d, spend: %s s' %
                     (self.word_freq_path, len(self.word_freq), str(t3 - t2)))
        # custom confusion set
        self.custom_confusion = self._get_custom_confusion_dict(
            self.custom_confusion_path)
        t4 = time.time()
        logger.debug('Loaded confusion file: %s, size: %d, spend: %s s' %
                     (self.custom_confusion_path, len(
                         self.custom_confusion), str(t4 - t3)))
        # custom word-segmentation dictionary
        self.custom_word_freq = self.load_word_freq_dict(
            self.custom_word_freq_path)
        self.person_names = self.load_word_freq_dict(self.person_name_path)
        self.place_names = self.load_word_freq_dict(self.place_name_path)
        self.stopwords = self.load_word_freq_dict(self.stopwords_path)
        # merge the segmentation dictionary with the custom dictionaries
        self.custom_word_freq.update(self.person_names)
        self.custom_word_freq.update(self.place_names)
        self.custom_word_freq.update(self.stopwords)

        self.word_freq.update(self.custom_word_freq)
        t5 = time.time()
        logger.debug('Loaded custom word file: %s, size: %d, spend: %s s' %
                     (self.custom_word_freq_path, len(self.custom_word_freq), str(t5 - t4)))
        self.tokenizer = Tokenizer(dict_path=self.word_freq_path,
                                   custom_word_freq_dict=self.custom_word_freq,
                                   custom_confusion_dict=self.custom_confusion)
        t6 = time.time()
        logger.info('Loaded dict ok, spend: %s s' % str(t6 - t1))
        self.initialized_detector = True
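
This variant selects the language-model backend from the enable_rnnlm flag. A hypothetical construction-time choice (the config paths are the defaults shown in Example #5):

    detector = Detector(enable_rnnlm=False)  # kenlm n-gram LM (kenlm does not build on Windows)
    detector = Detector(enable_rnnlm=True,   # RNN LM via tensorflow, usable on Windows
                        rnnlm_vocab_path=config.rnnlm_vocab_path,
                        rnnlm_model_dir=config.rnnlm_model_dir)
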
Example #3
 def set_custom_word(self, path):
     self.check_detector_initialized()
     word_freqs = self.load_word_freq_dict(path)
     # merge the dictionaries
     self.custom_word_freq.update(word_freqs)
     # merge the segmentation dictionary with the custom dictionary
     self.word_freq.update(self.custom_word_freq)
     self.tokenizer = Tokenizer(dict_path=self.word_freq_path, custom_word_freq_dict=self.custom_word_freq,
                                custom_confusion_dict=self.custom_confusion)
     for k, v in word_freqs.items():
         self.set_word_frequency(k, v)
     logger.info('Loaded custom word path: %s, size: %d' % (path, len(word_freqs)))
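
set_custom_word expects the same file format as load_word_freq_dict: one whitespace-separated "word [freq]" entry per line, with freq defaulting to 1 and '#' lines skipped. A hypothetical call (my_words.txt is a made-up path):

    # my_words.txt contains, for example:
    #   机器学习 100
    #   知识图谱
    detector.set_custom_word('my_words.txt')  # merges the entries and rebuilds the tokenizer
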
Example #4
 def __init__(self,
              word_freq_path=config.word_freq_path,
              name_sort_path=ccm_conf.name_sort_path,
              leader_job_path=ccm_conf.leader_job_path,
              leader_job_freq_dict_path=ccm_conf.leader_job_freq_dict_path):
     self.leader_job_freq_dict = Detector.load_word_freq_dict(
         leader_job_freq_dict_path)
     self.word_freq_path = word_freq_path
     print(self.leader_job_freq_dict)
     self.tokenizer = Tokenizer(
         dict_path=self.word_freq_path,
         custom_word_freq_dict=self.leader_job_freq_dict)
     self.name_sort_path = name_sort_path
     self.leader_job_path = leader_job_path
Example #5
class Detector(object):
    def __init__(self, language_model_path=config.language_model_path,
                 word_freq_path=config.word_freq_path,
                 custom_word_freq_path=config.custom_word_freq_path,
                 custom_confusion_path=config.custom_confusion_path,
                 person_name_path=config.person_name_path,
                 place_name_path=config.place_name_path,
                 stopwords_path=config.stopwords_path,
                 enable_rnnlm=False,
                 rnnlm_vocab_path=config.rnnlm_vocab_path,
                 rnnlm_model_dir=config.rnnlm_model_dir):
        self.name = 'detector'
        self.language_model_path = language_model_path
        self.word_freq_path = word_freq_path
        self.custom_word_freq_path = custom_word_freq_path
        self.custom_confusion_path = custom_confusion_path
        self.person_name_path = person_name_path
        self.place_name_path = place_name_path
        self.stopwords_path = stopwords_path
        self.is_char_error_detect = True
        self.is_word_error_detect = True
        self.initialized_detector = False
        self.enable_rnnlm = enable_rnnlm
        self.rnnlm_vocab_path = rnnlm_vocab_path
        self.rnnlm_model_dir = rnnlm_model_dir

    def initialize_detector(self):
        t1 = time.time()
        if self.enable_rnnlm:
            self.lm = LM(self.rnnlm_model_dir, self.rnnlm_vocab_path)
            logger.debug('Loaded language model: %s, spend: %s s' % (self.rnnlm_model_dir, str(time.time() - t1)))
        else:
            try:
                import kenlm
            except ImportError:
                raise ImportError('pycorrector dependencies are not fully installed; '
                                  'they are required for the statistical language model. '
                                  'Please run "pip install kenlm" to install it '
                                  '(kenlm does not support Windows). On Windows, install '
                                  'tensorflow and set enable_rnnlm=True instead.')

            self.lm = kenlm.Model(self.language_model_path)
            logger.debug('Loaded language model: %s, spend: %s s' % (self.language_model_path, str(time.time() - t1)))

        # word -> frequency dict
        t2 = time.time()
        self.word_freq = self.load_word_freq_dict(self.word_freq_path)
        t3 = time.time()
        logger.debug('Loaded word freq file: %s, size: %d, spend: %s s' %
                     (self.word_freq_path, len(self.word_freq), str(t3 - t2)))
        # custom confusion set
        self.custom_confusion = self._get_custom_confusion_dict(self.custom_confusion_path)
        t4 = time.time()
        logger.debug('Loaded confusion file: %s, size: %d, spend: %s s' %
                     (self.custom_confusion_path, len(self.custom_confusion), str(t4 - t3)))
        # custom word-segmentation dictionary
        self.custom_word_freq = self.load_word_freq_dict(self.custom_word_freq_path)
        self.person_names = self.load_word_freq_dict(self.person_name_path)
        self.place_names = self.load_word_freq_dict(self.place_name_path)
        self.stopwords = self.load_word_freq_dict(self.stopwords_path)
        # merge the segmentation dictionary with the custom dictionaries
        self.custom_word_freq.update(self.person_names)
        self.custom_word_freq.update(self.place_names)
        self.custom_word_freq.update(self.stopwords)

        self.word_freq.update(self.custom_word_freq)
        t5 = time.time()
        logger.debug('Loaded custom word file: %s, size: %d, spend: %s s' %
                     (self.custom_word_freq_path, len(self.custom_word_freq), str(t5 - t4)))
        self.tokenizer = Tokenizer(dict_path=self.word_freq_path, custom_word_freq_dict=self.custom_word_freq,
                                   custom_confusion_dict=self.custom_confusion)
        t6 = time.time()
        logger.info('Loaded dict ok, spend: %s s' % str(t6 - t1))
        self.initialized_detector = True

    def check_detector_initialized(self):
        if not self.initialized_detector:
            self.initialize_detector()

    @staticmethod
    def load_word_freq_dict(path):
        """
        加载切词词典
        :param path:
        :return:
        """
        word_freq = {}
        with codecs.open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line.startswith('#'):
                    continue
                info = line.split()
                if len(info) < 1:
                    continue
                word = info[0]
                # frequency; defaults to 1
                freq = int(info[1]) if len(info) > 1 else 1
                word_freq[word] = freq
        return word_freq

    def _get_custom_confusion_dict(self, path):
        """
        取自定义困惑集
        :param path:
        :return: dict, {variant: origin}, eg: {"交通先行": "交通限行"}
        """
        confusion = {}
        with codecs.open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line.startswith('#'):
                    continue
                info = line.split()
                if len(info) < 2:
                    continue
                variant = info[0]
                origin = info[1]
                freq = int(info[2]) if len(info) > 2 else 1
                self.word_freq[origin] = freq
                confusion[variant] = origin
        return confusion

    def set_language_model_path(self, path):
        self.check_detector_initialized()
        import kenlm
        self.lm = kenlm.Model(path)
        logger.info('Loaded language model: %s' % path)

    def set_custom_confusion_dict(self, path):
        self.check_detector_initialized()
        custom_confusion = self._get_custom_confusion_dict(path)
        self.custom_confusion.update(custom_confusion)
        logger.info('Loaded confusion path: %s, size: %d' % (path, len(custom_confusion)))

    def set_custom_word(self, path):
        self.check_detector_initialized()
        word_freqs = self.load_word_freq_dict(path)
        # merge the dictionaries
        self.custom_word_freq.update(word_freqs)
        # merge the segmentation dictionary with the custom dictionary
        self.word_freq.update(self.custom_word_freq)
        self.tokenizer = Tokenizer(dict_path=self.word_freq_path, custom_word_freq_dict=self.custom_word_freq,
                                   custom_confusion_dict=self.custom_confusion)
        for k, v in word_freqs.items():
            self.set_word_frequency(k, v)
        logger.info('Loaded custom word path: %s, size: %d' % (path, len(word_freqs)))

    def enable_char_error(self, enable=True):
        """
        is open char error detect
        :param enable:
        :return:
        """
        self.is_char_error_detect = enable

    def enable_word_error(self, enable=True):
        """
        is open word error detect
        :param enable:
        :return:
        """
        self.is_word_error_detect = enable

    def ngram_score(self, chars):
        """
        取n元文法得分
        :param chars: list, 以词或字切分
        :return:
        """
        self.check_detector_initialized()
        return self.lm.score(' '.join(chars), bos=False, eos=False)

    def char_scores(self, chars):
        """
        取RNN语言模型各字的得分
        :param chars: list, 以字切分
        :return: scores, list
        """
        self.check_detector_initialized()
        return self.lm.char_scores(chars)

    def ppl_score(self, words):
        """
        取语言模型困惑度得分,越小句子越通顺
        :param words: list, 以词或字切分
        :return:
        """
        self.check_detector_initialized()
        return self.lm.perplexity(' '.join(words))

    def word_frequency(self, word):
        """
        取词在样本中的词频
        :param word:
        :return:
        """
        self.check_detector_initialized()
        return self.word_freq.get(word, 0)

    def set_word_frequency(self, word, num):
        """
        更新在样本中的词频
        """
        self.check_detector_initialized()
        self.word_freq[word] = num
        return self.word_freq

    @staticmethod
    def _check_contain_error(maybe_err, maybe_errors):
        """
        检测错误集合(maybe_errors)是否已经包含该错误位置(maybe_err)
        :param maybe_err: [error_word, begin_pos, end_pos, error_type]
        :param maybe_errors:
        :return:
        """
        error_word_idx = 0
        begin_idx = 1
        end_idx = 2
        for err in maybe_errors:
            if maybe_err[error_word_idx] in err[error_word_idx] \
                    and maybe_err[begin_idx] >= err[begin_idx] \
                    and maybe_err[end_idx] <= err[end_idx]:
                return True
        return False

    def _add_maybe_error_item(self, maybe_err, maybe_errors):
        """
        新增错误
        :param maybe_err:
        :param maybe_errors:
        :return:
        """
        if maybe_err not in maybe_errors and not self._check_contain_error(maybe_err, maybe_errors):
            maybe_errors.append(maybe_err)

    @staticmethod
    def _get_maybe_error_index(scores, ratio=0.6745, threshold=1.4):
        """
        取疑似错字的位置,通过平均绝对离差(MAD)
        :param scores: np.array
        :param threshold: 阈值越小,得到疑似错别字越多
        :return: 全部疑似错误字的index: list
        """
        result = []
        scores = np.array(scores)
        if len(scores.shape) == 1:
            scores = scores[:, None]
        median = np.median(scores, axis=0)  # get median of all scores
        margin_median = np.sqrt(np.sum((scores - median) ** 2, axis=-1))  # deviation from the median
        # median absolute deviation (MAD)
        med_abs_deviation = np.median(margin_median)
        if med_abs_deviation == 0:
            return result
        y_score = ratio * margin_median / med_abs_deviation
        # flatten
        scores = scores.flatten()
        maybe_error_indices = np.where((y_score > threshold) & (scores < median))
        # collect the indices of all suspected wrong characters
        result = list(maybe_error_indices[0])
        return result

    @staticmethod
    def _get_maybe_error_index_by_rnnlm(scores, n=3):
        """
        取疑似错字的位置,通过平均值上下三倍标准差之间属于正常点
        :param scores: list, float
        :param threshold: 阈值越小,得到疑似错别字越多
        :return: 全部疑似错误字的index: list
        """
        std = np.std(scores, ddof=1)
        mean = np.mean(scores)
        down_limit = mean - n * std
        upper_limit = mean + n * std
        maybe_error_indices = np.where((scores > upper_limit) | (scores < down_limit))
        # collect the indices of all suspected wrong characters
        result = list(maybe_error_indices[0])
        return result

    @staticmethod
    def is_filter_token(token):
        result = False
        # pass blank
        if not token.strip():
            result = True
        # pass punctuation
        if token in PUNCTUATION_LIST:
            result = True
        # pass num
        if token.isdigit():
            result = True
        # pass alpha
        if is_alphabet_string(token.lower()):
            result = True
        return result

    def detect(self, sentence):
        """
        检测句子中的疑似错误信息,包括[词、位置、错误类型]
        :param sentence:
        :return: list[list], [error_word, begin_pos, end_pos, error_type]
        """
        maybe_errors = []
        if not sentence.strip():
            return maybe_errors
        # lazy initialization
        self.check_detector_initialized()
        # normalize the text
        sentence = uniform(sentence)
        # tokenize
        tokens = self.tokenizer.tokenize(sentence)
        # add custom confusion-set hits to the suspected errors
        for confuse in self.custom_confusion:
            idx = sentence.find(confuse)
            if idx > -1:
                maybe_err = [confuse, idx, idx + len(confuse), ErrorType.confusion]
                self._add_maybe_error_item(maybe_err, maybe_errors)

        if self.is_word_error_detect:
            # treat out-of-vocabulary words as suspected errors
            for word, begin_idx, end_idx in tokens:
                # pass filter word
                if self.is_filter_token(word):
                    continue
                # pass in dict
                if word in self.word_freq:
                    continue
                maybe_err = [word, begin_idx, end_idx, ErrorType.word]
                self._add_maybe_error_item(maybe_err, maybe_errors)

        if self.is_char_error_detect:
            # use the language model to detect suspected wrong characters
            if self.enable_rnnlm:
                scores = self.char_scores(sentence)
                # collect suspected wrong-character info
                for i in self._get_maybe_error_index_by_rnnlm(scores):
                    token = sentence[i]
                    # pass filter word
                    if self.is_filter_token(token):
                        continue
                    maybe_err = [token, i, i + 1, ErrorType.char]  # token, begin_idx, end_idx, error_type
                    self._add_maybe_error_item(maybe_err, maybe_errors)
            else:
                try:
                    ngram_avg_scores = []
                    for n in [2, 3]:
                        scores = []
                        for i in range(len(sentence) - n + 1):
                            word = sentence[i:i + n]
                            score = self.ngram_score(list(word))
                            scores.append(score)
                        if not scores:
                            continue
                        # pad both ends so the sliding-window scores cover every char
                        for _ in range(n - 1):
                            scores.insert(0, scores[0])
                            scores.append(scores[-1])
                        avg_scores = [sum(scores[i:i + n]) / len(scores[i:i + n]) for i in range(len(sentence))]
                        ngram_avg_scores.append(avg_scores)

                    # average the concatenated n-gram scores
                    sent_scores = list(np.average(np.array(ngram_avg_scores), axis=0))
                    # collect suspected wrong-character info
                    for i in self._get_maybe_error_index(sent_scores):
                        token = sentence[i]
                        # pass filter word
                        if self.is_filter_token(token):
                            continue
                        maybe_err = [token, i, i + 1, ErrorType.char]  # token, begin_idx, end_idx, error_type
                        self._add_maybe_error_item(maybe_err, maybe_errors)
                except IndexError as ie:
                    logger.warning('index error, sentence: %s, %s' % (sentence, str(ie)))
                except Exception as e:
                    logger.warning('detect error, sentence: %s, %s' % (sentence, str(e)))
        return sorted(maybe_errors, key=lambda k: k[1], reverse=False)
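
A minimal usage sketch for this class, assuming the default config paths resolve to real model and dictionary files (the exact hits depend on the loaded LM and dictionaries):

    d = Detector()
    errors = d.detect('少先队员因该为老人让座')
    # each item is [error_word, begin_pos, end_pos, error_type], sorted by begin_pos,
    # e.g. [['因该', 4, 6, ErrorType.word]]
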
Example #6
class Detector(object):
    def __init__(self,
                 language_model_path='',
                 word_freq_path='',
                 custom_word_freq_path='',
                 custom_confusion_path='',
                 person_name_path='',
                 place_name_path='',
                 stopwords_path=''):
        self.name = 'detector'
        self.language_model_path = os.path.join(pwd_path, language_model_path)
        self.word_freq_path = os.path.join(pwd_path, word_freq_path)
        self.custom_word_freq_path = os.path.join(pwd_path,
                                                  custom_word_freq_path)
        self.custom_confusion_path = os.path.join(pwd_path,
                                                  custom_confusion_path)
        self.person_name_path = os.path.join(pwd_path, person_name_path)
        self.place_name_path = os.path.join(pwd_path, place_name_path)
        self.stopwords_path = os.path.join(pwd_path, stopwords_path)
        self.is_char_error_detect = True
        self.is_word_error_detect = True
        self.initialized_detector = False

    def initialize_detector(self):
        t1 = time.time()
        self.lm = kenlm.Model(self.language_model_path)
        t2 = time.time()
        logger.debug('Loaded language model: %s, spend: %s s' %
                     (self.language_model_path, str(t2 - t1)))
        # word -> frequency dict
        self.word_freq = self.load_word_freq_dict(self.word_freq_path)
        t3 = time.time()
        logger.debug('Loaded word freq file: %s, size: %d, spend: %s s' %
                     (self.word_freq_path, len(self.word_freq), str(t3 - t2)))
        # custom confusion set
        self.custom_confusion = self._get_custom_confusion_dict(
            self.custom_confusion_path)
        t4 = time.time()
        logger.debug('Loaded confusion file: %s, size: %d, spend: %s s' %
                     (self.custom_confusion_path, len(
                         self.custom_confusion), str(t4 - t3)))
        # custom word-segmentation dictionary
        self.custom_word_freq = self.load_word_freq_dict(
            self.custom_word_freq_path)
        self.person_names = self.load_word_freq_dict(self.person_name_path)
        self.place_names = self.load_word_freq_dict(self.place_name_path)
        self.stopwords = self.load_word_freq_dict(self.stopwords_path)
        # merge the segmentation dictionary with the custom dictionaries
        self.custom_word_freq.update(self.person_names)
        self.custom_word_freq.update(self.place_names)
        self.custom_word_freq.update(self.stopwords)

        self.word_freq.update(self.custom_word_freq)
        t5 = time.time()
        logger.debug('Loaded custom word file: %s, size: %d, spend: %s s' %
                     (self.custom_word_freq_path, len(self.custom_word_freq), str(t5 - t4)))
        logger.debug('Loaded all word freq file done, size: %d' %
                     len(self.word_freq))
        self.tokenizer = Tokenizer(dict_path=self.word_freq_path,
                                   custom_word_freq_dict=self.custom_word_freq,
                                   custom_confusion_dict=self.custom_confusion)
        t6 = time.time()
        logger.info('Loaded dict ok, spend: %s s' % str(t6 - t1))
        self.initialized_detector = True

    def check_detector_initialized(self):
        if not self.initialized_detector:
            self.initialize_detector()

    @staticmethod
    def load_word_freq_dict(path):
        """
        加载切词词典
        :param path:
        :return:
        """
        word_freq = {}
        with codecs.open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line.startswith('#'):
                    continue
                info = line.split()
                if len(info) < 1:
                    continue
                word = info[0]
                # frequency; defaults to 1
                freq = int(info[1]) if len(info) > 1 else 1
                word_freq[word] = freq
        return word_freq

    def _get_custom_confusion_dict(self, path):
        """
        取自定义困惑集
        :param path:
        :return: dict, {variant: origin}, eg: {"交通先行": "交通限行"}
        """
        confusion = {}
        with codecs.open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line.startswith('#'):
                    continue
                info = line.split()
                if len(info) < 2:
                    continue
                variant = info[0]
                origin = info[1]
                freq = int(info[2]) if len(info) > 2 else 1
                self.word_freq[origin] = freq
                confusion[variant] = origin
        return confusion

    def set_language_model_path(self, path):
        self.check_detector_initialized()
        self.lm = kenlm.Model(path)
        logger.info('Loaded language model: %s' % path)

    def set_custom_confusion_dict(self, path):
        self.check_detector_initialized()
        custom_confusion = self._get_custom_confusion_dict(path)
        self.custom_confusion.update(custom_confusion)
        logger.info('Loaded confusion path: %s, size: %d' %
                    (path, len(custom_confusion)))

    def set_custom_word(self, path):
        self.check_detector_initialized()
        word_freqs = self.load_word_freq_dict(path)
        # merge the dictionaries
        self.custom_word_freq.update(word_freqs)
        # merge the segmentation dictionary with the custom dictionary
        self.word_freq.update(self.custom_word_freq)
        self.tokenizer = Tokenizer(dict_path=self.word_freq_path,
                                   custom_word_freq_dict=self.custom_word_freq,
                                   custom_confusion_dict=self.custom_confusion)
        for k, v in word_freqs.items():
            self.set_word_frequency(k, v)
        logger.info('Loaded custom word path: %s, size: %d' %
                    (path, len(word_freqs)))

    def enable_char_error(self, enable=True):
        """
        is open char error detect
        :param enable:
        :return:
        """
        self.is_char_error_detect = enable

    def enable_word_error(self, enable=True):
        """
        is open word error detect
        :param enable:
        :return:
        """
        self.is_word_error_detect = enable

    def ngram_score(self, chars):
        """
        取n元文法得分
        :param chars: list, 以词或字切分
        :return:
        """
        self.check_detector_initialized()
        return self.lm.score(' '.join(chars), bos=False, eos=False)

    def ppl_score(self, words):
        """
        取语言模型困惑度得分,越小句子越通顺
        :param words: list, 以词或字切分
        :return:
        """
        self.check_detector_initialized()
        return self.lm.perplexity(' '.join(words))

    def word_frequency(self, word):
        """
        取词在样本中的词频
        :param word:
        :return:
        """
        self.check_detector_initialized()
        return self.word_freq.get(word, 0)

    def set_word_frequency(self, word, num):
        """
        更新在样本中的词频
        """
        self.check_detector_initialized()
        self.word_freq[word] = num
        return self.word_freq

    @staticmethod
    def _check_contain_error(maybe_err, maybe_errors):
        """
        检测错误集合(maybe_errors)是否已经包含该错误位置(maybe_err)
        :param maybe_err: [error_word, begin_pos, end_pos, error_type]
        :param maybe_errors:
        :return:
        """
        error_word_idx = 0
        begin_idx = 1
        end_idx = 2
        for err in maybe_errors:
            if maybe_err[error_word_idx] in err[error_word_idx] \
                    and maybe_err[begin_idx] >= err[begin_idx] \
                    and maybe_err[end_idx] <= err[end_idx]:
                return True
        return False

    def _add_maybe_error_item(self, maybe_err, maybe_errors):
        """
        新增错误
        :param maybe_err:
        :param maybe_errors:
        :return:
        """
        if maybe_err not in maybe_errors and not self._check_contain_error(
                maybe_err, maybe_errors):
            maybe_errors.append(maybe_err)

    @staticmethod
    def _get_maybe_error_index(scores, ratio=0.6745, threshold=1.4):
        """
        取疑似错字的位置,通过平均绝对离差(MAD)
        :param scores: np.array
        :param threshold: 阈值越小,得到疑似错别字越多
        :return:
        """
        scores = np.array(scores)
        if len(scores.shape) == 1:
            scores = scores[:, None]
        median = np.median(scores, axis=0)  # get median of all scores
        margin_median = np.sqrt(np.sum((scores - median) ** 2, axis=-1))  # deviation from the median
        # median absolute deviation (MAD)
        med_abs_deviation = np.median(margin_median)
        if med_abs_deviation == 0:
            return []
        y_score = ratio * margin_median / med_abs_deviation
        # flatten
        scores = scores.flatten()
        maybe_error_indices = np.where((y_score > threshold) & (scores < median))
        # collect the indices of all suspected wrong characters
        return list(maybe_error_indices[0])

    def detect(self, sentence):
        """
        检测句子中的疑似错误信息,包括[词、位置、错误类型]
        :param sentence:
        :return: [error_word, begin_pos, end_pos, error_type]
        """
        maybe_errors = []
        if not sentence.strip():
            return maybe_errors
        self.check_detector_initialized()
        # normalize the text
        sentence = uniform(sentence)
        # tokenize
        tokens = self.tokenizer.tokenize(sentence)
        # add custom confusion-set hits to the suspected errors
        for confuse in self.custom_confusion:
            idx = sentence.find(confuse)
            if idx > -1:
                maybe_err = [
                    confuse, idx, idx + len(confuse), error_type["confusion"]
                ]
                self._add_maybe_error_item(maybe_err, maybe_errors)

        if self.is_word_error_detect:
            # treat out-of-vocabulary words as suspected errors
            for word, begin_idx, end_idx in tokens:
                # pass blank
                if not word.strip():
                    continue
                # punctuation
                if word in PUNCTUATION_LIST:
                    continue
                # pass num
                if word.isdigit():
                    continue
                # pass alpha
                if is_alphabet_string(word.lower()):
                    continue
                # in dict
                if word in self.word_freq:
                    continue
                maybe_err = [word, begin_idx, end_idx, error_type["word"]]
                self._add_maybe_error_item(maybe_err, maybe_errors)

        if self.is_char_error_detect:
            # use the language model to detect suspected wrong characters
            ngram_avg_scores = []
            try:
                for n in [2, 3]:
                    scores = []
                    for i in range(len(sentence) - n + 1):
                        word = sentence[i:i + n]
                        score = self.ngram_score(list(word))
                        scores.append(score)
                    if not scores:
                        continue
                    # pad both ends so the sliding-window scores cover every char
                    for _ in range(n - 1):
                        scores.insert(0, scores[0])
                        scores.append(scores[-1])
                    avg_scores = [
                        sum(scores[i:i + n]) / len(scores[i:i + n])
                        for i in range(len(sentence))
                    ]
                    ngram_avg_scores.append(avg_scores)

                # average the concatenated n-gram scores
                sent_scores = list(
                    np.average(np.array(ngram_avg_scores), axis=0))
                # collect suspected wrong-character info
                for i in self._get_maybe_error_index(sent_scores):
                    maybe_err = [sentence[i], i, i + 1, error_type["char"]]
                    self._add_maybe_error_item(maybe_err, maybe_errors)
            except IndexError as ie:
                logger.warning('index error, sentence: %s, %s' % (sentence, str(ie)))
            except Exception as e:
                logger.warning('detect error, sentence: %s, %s' % (sentence, str(e)))
        return sorted(maybe_errors, key=lambda k: k[1], reverse=False)
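
To make the MAD heuristic in _get_maybe_error_index concrete, a small sketch with made-up scores (one conspicuously low value):

    scores = [-3.1, -3.0, -3.2, -9.5, -3.1, -3.0]
    print(Detector._get_maybe_error_index(scores))
    # median = -3.1, MAD = 0.1; index 3 gets y_score = 0.6745 * 6.4 / 0.1 ≈ 43 > 1.4
    # and its score is below the median, so only it is flagged: [3]
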
Example #7
class NameSort(object):
    def __init__(self,
                 word_freq_path=config.word_freq_path,
                 name_sort_path=ccm_conf.name_sort_path,
                 leader_job_path=ccm_conf.leader_job_path,
                 leader_job_freq_dict_path=ccm_conf.leader_job_freq_dict_path):
        self.leader_job_freq_dict = Detector.load_word_freq_dict(
            leader_job_freq_dict_path)
        self.word_freq_path = word_freq_path
        print(self.leader_job_freq_dict)
        self.tokenizer = Tokenizer(
            dict_path=self.word_freq_path,
            custom_word_freq_dict=self.leader_job_freq_dict)
        self.name_sort_path = name_sort_path
        self.leader_job_path = leader_job_path

    def is_filter_token(self, token):
        result = False
        # pass blank
        if not token.strip():
            result = True
        # pass punctuation
        if token in PUNCTUATION_LIST:
            result = True
        # pass num
        if token.isdigit():
            result = True
        # pass alpha
        if is_alphabet_string(token.lower()):
            result = True
        return result

    def load_ccm_word_freq_dict(self, path):
        """
        加载切词词典
        :param path:
        :return:
        """
        word_freq = {}
        with codecs.open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith('#'):
                    continue
                info = line.split('+')
                word = info[0]  # word is a person's name
                # rank, default 1: when a second field exists, freq = info[1] gives the protocol order
                freq = int(info[1]) if len(info) > 1 else 1
                word_freq[word] = freq
        return word_freq

    def load_ccm_job_freq_dict(self, path):
        """
        加载切词词典
        :param path:
        :return:
        """
        word_freq = {}
        with codecs.open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line.startswith('#') or not line:
                    continue
                info = line.split(':')
                if len(info) < 2:
                    continue
                word = info[0]  # the name, e.g. 习近平
                job = info[1]  # jobs + titles
                s1 = job.split('?')
                if len(s1) > 1:
                    s2 = s1[0].split('、')  # jobs, separated by '、'
                    s3 = s1[1].split('、')  # titles, separated by '、'
                    b = {'1': s2, '2': s3}
                else:
                    s2 = s1[0].split('、')  # jobs, separated by '、'
                    b = {'1': s2}
                word_freq[word] = b
        return word_freq

    def ccm_sort(self, sentence):
        """
        Check leader-name ordering in a sentence against the rank dictionary,
        swapping names that appear out of protocol order.
        """
        # load the name-ordering dictionary
        name_model = self.load_ccm_word_freq_dict(self.name_sort_path)
        maybe_errors = []
        if not sentence.strip():
            return maybe_errors
        # normalize the text
        sentence = uniform(sentence)
        # tokenize
        tokens = self.tokenizer.tokenize(sentence)
        print(tokens)
        temp = None
        error_list = []
        correct_list = []
        new = []
        i = -1
        for word, begin_idx, end_idx in tokens:
            new.append(word)
            i += 1
            if word in LINK_WORD:
                temp = None
            if name_model.get(word):
                if not temp:
                    temp = name_model.get(word)
                    continue
                else:
                    if temp > name_model.get(word):
                        p = tokens[i]
                        tokens[i] = tokens[i - 2]
                        tokens[i - 2] = p
                        print(tokens[i][0])
                        print(tokens[i - 2][0])
                        correct_list.append((tokens[i][0], i))
                        correct_list.append((tokens[i - 2][0], i - 2))
                        error_list.append((tokens[i][0], i))
                    else:
                        pass
        for word, p in correct_list:
            new[p] = word
        print(new)
        print("ls:" + str(correct_list))
        correct = ''.join(new)
        print("correct:" + correct)

        return sorted(maybe_errors, key=lambda k: k[1], reverse=False)

    def name_job(self, sentence):
        """
        Check that the jobs appearing before each leader's name match the
        name-to-jobs dictionary; mismatches are collected into error_list.
        """
        # load the name-to-jobs dictionary
        job_model = self.load_ccm_job_freq_dict(self.leader_job_path)
        print(job_model)
        maybe_errors = []
        if not sentence.strip():
            return maybe_errors
        # normalize the text
        sentence = uniform(sentence)
        # tokenize
        tokens = self.tokenizer.tokenize(sentence)
        print(tokens)
        error_list = []
        correct_list = []
        new = []
        i = 0
        j = 0
        for word, begin_idx, end_idx in tokens:
            if job_model.get(word):
                print(i)  # a name was found; i is this name's token index
                a = job_model.get(word)
                front = a.get('1')
                temp_list = []
                for x in range(j, i):  # j is the start index, i the end index
                    if self.leader_job_freq_dict.get(tokens[x][0]):
                        if tokens[x][0] not in front:
                            temp_list.append(tokens[x][0])
                if temp_list:
                    error_list.append({word: temp_list})
                else:
                    pass
                j = i + 1  # move the start to just past this name's index
            i += 1
        print(error_list)
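
The two loaders above imply these (hypothetical) input layouts:

    # name_sort_path, split on '+': a name, then its protocol-order rank (default 1)
    #   张三+1
    #   李四+2
    # leader_job_path, split on ':', then '?', then '、': name, jobs, optional titles
    #   张三:委员长、书记?同志
    ns = NameSort()
    ns.ccm_sort('李四 张三 出席会议')  # made-up sentence; swaps names found out of rank order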