Example #1
 def initialize_detector(self):
     t1 = time.time()
     self.lm = kenlm.Model(self.language_model_path)
     t2 = time.time()
     default_logger.debug('Loaded language model: %s, spend: %s s' %
                          (self.language_model_path, str(t2 - t1)))
     # word -> frequency dict
     self.word_freq = self.load_word_freq_dict(self.word_freq_path)
     t3 = time.time()
     default_logger.debug(
         'Loaded word freq file: %s, size: %d, spend: %s s' %
         (self.word_freq_path, len(self.word_freq), str(t3 - t2)))
     # custom confusion set
     self.custom_confusion = self._get_custom_confusion_dict(
         self.custom_confusion_path)
     t4 = time.time()
     default_logger.debug(
         'Loaded confusion file: %s, size: %d, spend: %s s' %
         (self.custom_confusion_path, len(
             self.custom_confusion), str(t4 - t3)))
     # custom word-segmentation dictionary
     self.custom_word_dict = self.load_word_freq_dict(self.custom_word_path)
     # merge the segmentation dictionary with the custom dictionary
     self.word_freq.update(self.custom_word_dict)
     t5 = time.time()
     default_logger.debug(
         'Loaded custom word file: %s, size: %d, spend: %s s' %
         (self.custom_word_path, len(self.custom_word_dict), str(t5 - t4)))
     self.tokenizer = Tokenizer(dict_path=self.word_freq_path,
                                custom_word_freq_dict=self.custom_word_dict,
                                custom_confusion_dict=self.custom_confusion)
     t6 = time.time()
     default_logger.info('Loaded dict ok, spend: %s s' % str(t6 - t1))
     self.initialized_detector = True
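
Note: initialize_detector is not called from __init__; the detector is built lazily on first use. A minimal sketch of the guard that drives it (shown in full in Example #5):

    def check_detector_initialized(self):
        # build the LM, dictionaries and tokenizer only once, on first use
        if not self.initialized_detector:
            self.initialize_detector()
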
Example #2
    def initialize_detector(self):
        t1 = time.time()
        if self.enable_rnnlm:
            self.lm = LM(self.rnnlm_model_dir, self.rnnlm_vocab_path)
            logger.debug('Loaded language model: %s, spend: %s s' %
                         (self.rnnlm_model_dir, str(time.time() - t1)))
        else:
            try:
                import kenlm
            except ImportError:
                raise ImportError(
                    'pycorrector dependencies are not fully installed; '
                    'they are required for the statistical language model. '
                    'Please run "pip install kenlm" to install it '
                    '(kenlm does not support Windows). On Windows, install '
                    'tensorflow and set enable_rnnlm=True instead.'
                )

            self.lm = kenlm.Model(self.language_model_path)
            logger.debug('Loaded language model: %s, spend: %s s' %
                         (self.language_model_path, str(time.time() - t1)))

        # word -> frequency dict
        t2 = time.time()
        self.word_freq = self.load_word_freq_dict(self.word_freq_path)
        t3 = time.time()
        logger.debug('Loaded word freq file: %s, size: %d, spend: %s s' %
                     (self.word_freq_path, len(self.word_freq), str(t3 - t2)))
        # custom confusion set
        self.custom_confusion = self._get_custom_confusion_dict(
            self.custom_confusion_path)
        t4 = time.time()
        logger.debug('Loaded confusion file: %s, size: %d, spend: %s s' %
                     (self.custom_confusion_path, len(
                         self.custom_confusion), str(t4 - t3)))
        # custom word-segmentation dictionary
        self.custom_word_freq = self.load_word_freq_dict(
            self.custom_word_freq_path)
        self.person_names = self.load_word_freq_dict(self.person_name_path)
        self.place_names = self.load_word_freq_dict(self.place_name_path)
        self.stopwords = self.load_word_freq_dict(self.stopwords_path)
        # merge the segmentation dictionary with the custom dictionaries
        self.custom_word_freq.update(self.person_names)
        self.custom_word_freq.update(self.place_names)
        self.custom_word_freq.update(self.stopwords)

        self.word_freq.update(self.custom_word_freq)
        t5 = time.time()
        logger.debug('Loaded custom word file: %s, size: %d, spend: %s s' %
                     (self.custom_word_freq_path, len(self.custom_word_freq), str(t5 - t4)))
        self.tokenizer = Tokenizer(dict_path=self.word_freq_path,
                                   custom_word_freq_dict=self.custom_word_freq,
                                   custom_confusion_dict=self.custom_confusion)
        t6 = time.time()
        logger.info('Loaded dict ok, spend: %s s' % str(t6 - t1))
        self.initialized_detector = True
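
This variant selects the language-model backend from the enable_rnnlm flag. A hypothetical construction-time choice (the config paths are the defaults shown in Example #5):

    detector = Detector(enable_rnnlm=False)  # kenlm n-gram LM (kenlm does not build on Windows)
    detector = Detector(enable_rnnlm=True,   # RNN LM via tensorflow, usable on Windows
                        rnnlm_vocab_path=config.rnnlm_vocab_path,
                        rnnlm_model_dir=config.rnnlm_model_dir)
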
Example #3
 def set_custom_word(self, path):
     self.check_detector_initialized()
     word_freqs = self.load_word_freq_dict(path)
     # merge the dictionaries
     self.custom_word_freq.update(word_freqs)
     # merge the segmentation dictionary with the custom dictionary
     self.word_freq.update(self.custom_word_freq)
     self.tokenizer = Tokenizer(dict_path=self.word_freq_path, custom_word_freq_dict=self.custom_word_freq,
                                custom_confusion_dict=self.custom_confusion)
     for k, v in word_freqs.items():
         self.set_word_frequency(k, v)
     logger.info('Loaded custom word path: %s, size: %d' % (path, len(word_freqs)))
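
set_custom_word expects the same file format as load_word_freq_dict: one whitespace-separated "word [freq]" entry per line, with freq defaulting to 1 and '#' lines skipped. A hypothetical call (my_words.txt is a made-up path):

    # my_words.txt contains, for example:
    #   机器学习 100
    #   知识图谱
    detector.set_custom_word('my_words.txt')  # merges the entries and rebuilds the tokenizer
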
Example #4
 def __init__(self,
              word_freq_path=config.word_freq_path,
              name_sort_path=ccm_conf.name_sort_path,
              leader_job_path=ccm_conf.leader_job_path,
              leader_job_freq_dict_path=ccm_conf.leader_job_freq_dict_path):
     self.leader_job_freq_dict = Detector.load_word_freq_dict(
         leader_job_freq_dict_path)
     self.word_freq_path = word_freq_path
     print(self.leader_job_freq_dict)
     self.tokenizer = Tokenizer(
         dict_path=self.word_freq_path,
         custom_word_freq_dict=self.leader_job_freq_dict)
     self.name_sort_path = name_sort_path
     self.leader_job_path = leader_job_path
Example #5
class Detector(object):
    def __init__(self, language_model_path=config.language_model_path,
                 word_freq_path=config.word_freq_path,
                 custom_word_freq_path=config.custom_word_freq_path,
                 custom_confusion_path=config.custom_confusion_path,
                 person_name_path=config.person_name_path,
                 place_name_path=config.place_name_path,
                 stopwords_path=config.stopwords_path,
                 enable_rnnlm=False,
                 rnnlm_vocab_path=config.rnnlm_vocab_path,
                 rnnlm_model_dir=config.rnnlm_model_dir):
        self.name = 'detector'
        self.language_model_path = language_model_path
        self.word_freq_path = word_freq_path
        self.custom_word_freq_path = custom_word_freq_path
        self.custom_confusion_path = custom_confusion_path
        self.person_name_path = person_name_path
        self.place_name_path = place_name_path
        self.stopwords_path = stopwords_path
        self.is_char_error_detect = True
        self.is_word_error_detect = True
        self.initialized_detector = False
        self.enable_rnnlm = enable_rnnlm
        self.rnnlm_vocab_path = rnnlm_vocab_path
        self.rnnlm_model_dir = rnnlm_model_dir

    def initialize_detector(self):
        t1 = time.time()
        if self.enable_rnnlm:
            self.lm = LM(self.rnnlm_model_dir, self.rnnlm_vocab_path)
            logger.debug('Loaded language model: %s, spend: %s s' % (self.rnnlm_model_dir, str(time.time() - t1)))
        else:
            try:
                import kenlm
            except ImportError:
                raise ImportError('pycorrector dependencies are not fully installed; '
                                  'they are required for the statistical language model. '
                                  'Please run "pip install kenlm" to install it '
                                  '(kenlm does not support Windows). On Windows, install '
                                  'tensorflow and set enable_rnnlm=True instead.')

            self.lm = kenlm.Model(self.language_model_path)
            logger.debug('Loaded language model: %s, spend: %s s' % (self.language_model_path, str(time.time() - t1)))

        # word -> frequency dict
        t2 = time.time()
        self.word_freq = self.load_word_freq_dict(self.word_freq_path)
        t3 = time.time()
        logger.debug('Loaded word freq file: %s, size: %d, spend: %s s' %
                     (self.word_freq_path, len(self.word_freq), str(t3 - t2)))
        # custom confusion set
        self.custom_confusion = self._get_custom_confusion_dict(self.custom_confusion_path)
        t4 = time.time()
        logger.debug('Loaded confusion file: %s, size: %d, spend: %s s' %
                     (self.custom_confusion_path, len(self.custom_confusion), str(t4 - t3)))
        # custom word-segmentation dictionary
        self.custom_word_freq = self.load_word_freq_dict(self.custom_word_freq_path)
        self.person_names = self.load_word_freq_dict(self.person_name_path)
        self.place_names = self.load_word_freq_dict(self.place_name_path)
        self.stopwords = self.load_word_freq_dict(self.stopwords_path)
        # merge the segmentation dictionary with the custom dictionaries
        self.custom_word_freq.update(self.person_names)
        self.custom_word_freq.update(self.place_names)
        self.custom_word_freq.update(self.stopwords)

        self.word_freq.update(self.custom_word_freq)
        t5 = time.time()
        logger.debug('Loaded custom word file: %s, size: %d, spend: %s s' %
                     (self.custom_word_freq_path, len(self.custom_word_freq), str(t5 - t4)))
        self.tokenizer = Tokenizer(dict_path=self.word_freq_path, custom_word_freq_dict=self.custom_word_freq,
                                   custom_confusion_dict=self.custom_confusion)
        t6 = time.time()
        logger.info('Loaded dict ok, spend: %s s' % str(t6 - t1))
        self.initialized_detector = True

    def check_detector_initialized(self):
        if not self.initialized_detector:
            self.initialize_detector()

    @staticmethod
    def load_word_freq_dict(path):
        """
        加载切词词典
        :param path:
        :return:
        """
        word_freq = {}
        with codecs.open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line.startswith('#'):
                    continue
                info = line.split()
                if len(info) < 1:
                    continue
                word = info[0]
                # frequency; defaults to 1
                freq = int(info[1]) if len(info) > 1 else 1
                word_freq[word] = freq
        return word_freq

    def _get_custom_confusion_dict(self, path):
        """
        取自定义困惑集
        :param path:
        :return: dict, {variant: origin}, eg: {"交通先行": "交通限行"}
        """
        confusion = {}
        with codecs.open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line.startswith('#'):
                    continue
                info = line.split()
                if len(info) < 2:
                    continue
                variant = info[0]
                origin = info[1]
                freq = int(info[2]) if len(info) > 2 else 1
                self.word_freq[origin] = freq
                confusion[variant] = origin
        return confusion

    def set_language_model_path(self, path):
        self.check_detector_initialized()
        import kenlm
        self.lm = kenlm.Model(path)
        logger.info('Loaded language model: %s' % path)

    def set_custom_confusion_dict(self, path):
        self.check_detector_initialized()
        custom_confusion = self._get_custom_confusion_dict(path)
        self.custom_confusion.update(custom_confusion)
        logger.info('Loaded confusion path: %s, size: %d' % (path, len(custom_confusion)))

    def set_custom_word(self, path):
        self.check_detector_initialized()
        word_freqs = self.load_word_freq_dict(path)
        # merge the dictionaries
        self.custom_word_freq.update(word_freqs)
        # merge the segmentation dictionary with the custom dictionary
        self.word_freq.update(self.custom_word_freq)
        self.tokenizer = Tokenizer(dict_path=self.word_freq_path, custom_word_freq_dict=self.custom_word_freq,
                                   custom_confusion_dict=self.custom_confusion)
        for k, v in word_freqs.items():
            self.set_word_frequency(k, v)
        logger.info('Loaded custom word path: %s, size: %d' % (path, len(word_freqs)))

    def enable_char_error(self, enable=True):
        """
        is open char error detect
        :param enable:
        :return:
        """
        self.is_char_error_detect = enable

    def enable_word_error(self, enable=True):
        """
        is open word error detect
        :param enable:
        :return:
        """
        self.is_word_error_detect = enable

    def ngram_score(self, chars):
        """
        取n元文法得分
        :param chars: list, 以词或字切分
        :return:
        """
        self.check_detector_initialized()
        return self.lm.score(' '.join(chars), bos=False, eos=False)

    def char_scores(self, chars):
        """
        取RNN语言模型各字的得分
        :param chars: list, 以字切分
        :return: scores, list
        """
        self.check_detector_initialized()
        return self.lm.char_scores(chars)

    def ppl_score(self, words):
        """
        取语言模型困惑度得分,越小句子越通顺
        :param words: list, 以词或字切分
        :return:
        """
        self.check_detector_initialized()
        return self.lm.perplexity(' '.join(words))

    def word_frequency(self, word):
        """
        取词在样本中的词频
        :param word:
        :return:
        """
        self.check_detector_initialized()
        return self.word_freq.get(word, 0)

    def set_word_frequency(self, word, num):
        """
        更新在样本中的词频
        """
        self.check_detector_initialized()
        self.word_freq[word] = num
        return self.word_freq

    @staticmethod
    def _check_contain_error(maybe_err, maybe_errors):
        """
        检测错误集合(maybe_errors)是否已经包含该错误位置(maybe_err)
        :param maybe_err: [error_word, begin_pos, end_pos, error_type]
        :param maybe_errors:
        :return:
        """
        error_word_idx = 0
        begin_idx = 1
        end_idx = 2
        for err in maybe_errors:
            if maybe_err[error_word_idx] in err[error_word_idx] \
                    and maybe_err[begin_idx] >= err[begin_idx] \
                    and maybe_err[end_idx] <= err[end_idx]:
                return True
        return False

    def _add_maybe_error_item(self, maybe_err, maybe_errors):
        """
        新增错误
        :param maybe_err:
        :param maybe_errors:
        :return:
        """
        if maybe_err not in maybe_errors and not self._check_contain_error(maybe_err, maybe_errors):
            maybe_errors.append(maybe_err)

    @staticmethod
    def _get_maybe_error_index(scores, ratio=0.6745, threshold=1.4):
        """
        取疑似错字的位置,通过平均绝对离差(MAD)
        :param scores: np.array
        :param threshold: 阈值越小,得到疑似错别字越多
        :return: 全部疑似错误字的index: list
        """
        result = []
        scores = np.array(scores)
        if len(scores.shape) == 1:
            scores = scores[:, None]
        median = np.median(scores, axis=0)  # get median of all scores
        margin_median = np.sqrt(np.sum((scores - median) ** 2, axis=-1))  # deviation from the median
        # median absolute deviation (MAD)
        med_abs_deviation = np.median(margin_median)
        if med_abs_deviation == 0:
            return result
        y_score = ratio * margin_median / med_abs_deviation
        # flatten
        scores = scores.flatten()
        maybe_error_indices = np.where((y_score > threshold) & (scores < median))
        # collect the indices of all suspected wrong characters
        result = list(maybe_error_indices[0])
        return result

    @staticmethod
    def _get_maybe_error_index_by_rnnlm(scores, n=3):
        """
        取疑似错字的位置,通过平均值上下三倍标准差之间属于正常点
        :param scores: list, float
        :param threshold: 阈值越小,得到疑似错别字越多
        :return: 全部疑似错误字的index: list
        """
        std = np.std(scores, ddof=1)
        mean = np.mean(scores)
        down_limit = mean - n * std
        upper_limit = mean + n * std
        maybe_error_indices = np.where((scores > upper_limit) | (scores < down_limit))
        # collect the indices of all suspected wrong characters
        result = list(maybe_error_indices[0])
        return result

    @staticmethod
    def is_filter_token(token):
        result = False
        # pass blank
        if not token.strip():
            result = True
        # pass punctuation
        if token in PUNCTUATION_LIST:
            result = True
        # pass num
        if token.isdigit():
            result = True
        # pass alpha
        if is_alphabet_string(token.lower()):
            result = True
        return result

    def detect(self, sentence):
        """
        检测句子中的疑似错误信息,包括[词、位置、错误类型]
        :param sentence:
        :return: list[list], [error_word, begin_pos, end_pos, error_type]
        """
        maybe_errors = []
        if not sentence.strip():
            return maybe_errors
        # lazy initialization
        self.check_detector_initialized()
        # normalize the text
        sentence = uniform(sentence)
        # tokenize
        tokens = self.tokenizer.tokenize(sentence)
        # add custom confusion-set hits to the suspected errors
        for confuse in self.custom_confusion:
            idx = sentence.find(confuse)
            if idx > -1:
                maybe_err = [confuse, idx, idx + len(confuse), ErrorType.confusion]
                self._add_maybe_error_item(maybe_err, maybe_errors)

        if self.is_word_error_detect:
            # treat out-of-vocabulary words as suspected errors
            for word, begin_idx, end_idx in tokens:
                # pass filter word
                if self.is_filter_token(word):
                    continue
                # pass in dict
                if word in self.word_freq:
                    continue
                maybe_err = [word, begin_idx, end_idx, ErrorType.word]
                self._add_maybe_error_item(maybe_err, maybe_errors)

        if self.is_char_error_detect:
            # use the language model to detect suspected wrong characters
            if self.enable_rnnlm:
                scores = self.char_scores(sentence)
                # collect suspected wrong-character info
                for i in self._get_maybe_error_index_by_rnnlm(scores):
                    token = sentence[i]
                    # pass filter word
                    if self.is_filter_token(token):
                        continue
                    maybe_err = [token, i, i + 1, ErrorType.char]  # token, begin_idx, end_idx, error_type
                    self._add_maybe_error_item(maybe_err, maybe_errors)
            else:
                try:
                    ngram_avg_scores = []
                    for n in [2, 3]:
                        scores = []
                        for i in range(len(sentence) - n + 1):
                            word = sentence[i:i + n]
                            score = self.ngram_score(list(word))
                            scores.append(score)
                        if not scores:
                            continue
                        # pad both ends so the sliding-window scores cover every char
                        for _ in range(n - 1):
                            scores.insert(0, scores[0])
                            scores.append(scores[-1])
                        avg_scores = [sum(scores[i:i + n]) / len(scores[i:i + n]) for i in range(len(sentence))]
                        ngram_avg_scores.append(avg_scores)

                    # average the concatenated n-gram scores
                    sent_scores = list(np.average(np.array(ngram_avg_scores), axis=0))
                    # collect suspected wrong-character info
                    for i in self._get_maybe_error_index(sent_scores):
                        token = sentence[i]
                        # pass filter word
                        if self.is_filter_token(token):
                            continue
                        maybe_err = [token, i, i + 1, ErrorType.char]  # token, begin_idx, end_idx, error_type
                        self._add_maybe_error_item(maybe_err, maybe_errors)
                except IndexError as ie:
                    logger.warning('index error, sentence: %s, %s' % (sentence, str(ie)))
                except Exception as e:
                    logger.warning('detect error, sentence: %s, %s' % (sentence, str(e)))
        return sorted(maybe_errors, key=lambda k: k[1], reverse=False)
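
A minimal usage sketch for this class, assuming the default config paths resolve to real model and dictionary files (the exact hits depend on the loaded LM and dictionaries):

    d = Detector()
    errors = d.detect('少先队员因该为老人让座')
    # each item is [error_word, begin_pos, end_pos, error_type], sorted by begin_pos,
    # e.g. [['因该', 4, 6, ErrorType.word]]
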
Example #6
class Detector(object):
    def __init__(self,
                 language_model_path='',
                 word_freq_path='',
                 custom_word_freq_path='',
                 custom_confusion_path='',
                 person_name_path='',
                 place_name_path='',
                 stopwords_path=''):
        self.name = 'detector'
        self.language_model_path = os.path.join(pwd_path, language_model_path)
        self.word_freq_path = os.path.join(pwd_path, word_freq_path)
        self.custom_word_freq_path = os.path.join(pwd_path,
                                                  custom_word_freq_path)
        self.custom_confusion_path = os.path.join(pwd_path,
                                                  custom_confusion_path)
        self.person_name_path = os.path.join(pwd_path, person_name_path)
        self.place_name_path = os.path.join(pwd_path, place_name_path)
        self.stopwords_path = os.path.join(pwd_path, stopwords_path)
        self.is_char_error_detect = True
        self.is_word_error_detect = True
        self.initialized_detector = False

    def initialize_detector(self):
        t1 = time.time()
        self.lm = kenlm.Model(self.language_model_path)
        t2 = time.time()
        logger.debug('Loaded language model: %s, spend: %s s' %
                     (self.language_model_path, str(t2 - t1)))
        # word -> frequency dict
        self.word_freq = self.load_word_freq_dict(self.word_freq_path)
        t3 = time.time()
        logger.debug('Loaded word freq file: %s, size: %d, spend: %s s' %
                     (self.word_freq_path, len(self.word_freq), str(t3 - t2)))
        # custom confusion set
        self.custom_confusion = self._get_custom_confusion_dict(
            self.custom_confusion_path)
        t4 = time.time()
        logger.debug('Loaded confusion file: %s, size: %d, spend: %s s' %
                     (self.custom_confusion_path, len(
                         self.custom_confusion), str(t4 - t3)))
        # custom word-segmentation dictionary
        self.custom_word_freq = self.load_word_freq_dict(
            self.custom_word_freq_path)
        self.person_names = self.load_word_freq_dict(self.person_name_path)
        self.place_names = self.load_word_freq_dict(self.place_name_path)
        self.stopwords = self.load_word_freq_dict(self.stopwords_path)
        # merge the segmentation dictionary with the custom dictionaries
        self.custom_word_freq.update(self.person_names)
        self.custom_word_freq.update(self.place_names)
        self.custom_word_freq.update(self.stopwords)

        self.word_freq.update(self.custom_word_freq)
        t5 = time.time()
        logger.debug('Loaded custom word file: %s, size: %d, spend: %s s' %
                     (self.custom_word_freq_path, len(self.custom_word_freq), str(t5 - t4)))
        logger.debug('Loaded all word freq file done, size: %d' %
                     len(self.word_freq))
        self.tokenizer = Tokenizer(dict_path=self.word_freq_path,
                                   custom_word_freq_dict=self.custom_word_freq,
                                   custom_confusion_dict=self.custom_confusion)
        t6 = time.time()
        logger.info('Loaded dict ok, spend: %s s' % str(t6 - t1))
        self.initialized_detector = True

    def check_detector_initialized(self):
        if not self.initialized_detector:
            self.initialize_detector()

    @staticmethod
    def load_word_freq_dict(path):
        """
        加载切词词典
        :param path:
        :return:
        """
        word_freq = {}
        with codecs.open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line.startswith('#'):
                    continue
                info = line.split()
                if len(info) < 1:
                    continue
                word = info[0]
                # frequency; defaults to 1
                freq = int(info[1]) if len(info) > 1 else 1
                word_freq[word] = freq
        return word_freq

    def _get_custom_confusion_dict(self, path):
        """
        取自定义困惑集
        :param path:
        :return: dict, {variant: origin}, eg: {"交通先行": "交通限行"}
        """
        confusion = {}
        with codecs.open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line.startswith('#'):
                    continue
                info = line.split()
                if len(info) < 2:
                    continue
                variant = info[0]
                origin = info[1]
                freq = int(info[2]) if len(info) > 2 else 1
                self.word_freq[origin] = freq
                confusion[variant] = origin
        return confusion

    def set_language_model_path(self, path):
        self.check_detector_initialized()
        self.lm = kenlm.Model(path)
        logger.info('Loaded language model: %s' % path)

    def set_custom_confusion_dict(self, path):
        self.check_detector_initialized()
        custom_confusion = self._get_custom_confusion_dict(path)
        self.custom_confusion.update(custom_confusion)
        logger.info('Loaded confusion path: %s, size: %d' %
                    (path, len(custom_confusion)))

    def set_custom_word(self, path):
        self.check_detector_initialized()
        word_freqs = self.load_word_freq_dict(path)
        # merge the dictionaries
        self.custom_word_freq.update(word_freqs)
        # merge the segmentation dictionary with the custom dictionary
        self.word_freq.update(self.custom_word_freq)
        self.tokenizer = Tokenizer(dict_path=self.word_freq_path,
                                   custom_word_freq_dict=self.custom_word_freq,
                                   custom_confusion_dict=self.custom_confusion)
        for k, v in word_freqs.items():
            self.set_word_frequency(k, v)
        logger.info('Loaded custom word path: %s, size: %d' %
                    (path, len(word_freqs)))

    def enable_char_error(self, enable=True):
        """
        is open char error detect
        :param enable:
        :return:
        """
        self.is_char_error_detect = enable

    def enable_word_error(self, enable=True):
        """
        is open word error detect
        :param enable:
        :return:
        """
        self.is_word_error_detect = enable

    def ngram_score(self, chars):
        """
        取n元文法得分
        :param chars: list, 以词或字切分
        :return:
        """
        self.check_detector_initialized()
        return self.lm.score(' '.join(chars), bos=False, eos=False)

    def ppl_score(self, words):
        """
        取语言模型困惑度得分,越小句子越通顺
        :param words: list, 以词或字切分
        :return:
        """
        self.check_detector_initialized()
        return self.lm.perplexity(' '.join(words))

    def word_frequency(self, word):
        """
        取词在样本中的词频
        :param word:
        :return:
        """
        self.check_detector_initialized()
        return self.word_freq.get(word, 0)

    def set_word_frequency(self, word, num):
        """
        更新在样本中的词频
        """
        self.check_detector_initialized()
        self.word_freq[word] = num
        return self.word_freq

    @staticmethod
    def _check_contain_error(maybe_err, maybe_errors):
        """
        检测错误集合(maybe_errors)是否已经包含该错误位置(maybe_err)
        :param maybe_err: [error_word, begin_pos, end_pos, error_type]
        :param maybe_errors:
        :return:
        """
        error_word_idx = 0
        begin_idx = 1
        end_idx = 2
        for err in maybe_errors:
            if maybe_err[error_word_idx] in err[error_word_idx] \
                    and maybe_err[begin_idx] >= err[begin_idx] \
                    and maybe_err[end_idx] <= err[end_idx]:
                return True
        return False

    def _add_maybe_error_item(self, maybe_err, maybe_errors):
        """
        新增错误
        :param maybe_err:
        :param maybe_errors:
        :return:
        """
        if maybe_err not in maybe_errors and not self._check_contain_error(
                maybe_err, maybe_errors):
            maybe_errors.append(maybe_err)

    @staticmethod
    def _get_maybe_error_index(scores, ratio=0.6745, threshold=1.4):
        """
        取疑似错字的位置,通过平均绝对离差(MAD)
        :param scores: np.array
        :param threshold: 阈值越小,得到疑似错别字越多
        :return:
        """
        scores = np.array(scores)
        if len(scores.shape) == 1:
            scores = scores[:, None]
        median = np.median(scores, axis=0)  # get median of all scores
        margin_median = np.sqrt(np.sum((scores - median) ** 2, axis=-1))  # deviation from the median
        # median absolute deviation (MAD)
        med_abs_deviation = np.median(margin_median)
        if med_abs_deviation == 0:
            return []
        y_score = ratio * margin_median / med_abs_deviation
        # flatten
        scores = scores.flatten()
        maybe_error_indices = np.where((y_score > threshold) & (scores < median))
        # collect the indices of all suspected wrong characters
        return list(maybe_error_indices[0])

    def detect(self, sentence):
        """
        检测句子中的疑似错误信息,包括[词、位置、错误类型]
        :param sentence:
        :return: [error_word, begin_pos, end_pos, error_type]
        """
        maybe_errors = []
        if not sentence.strip():
            return maybe_errors
        self.check_detector_initialized()
        # normalize the text
        sentence = uniform(sentence)
        # tokenize
        tokens = self.tokenizer.tokenize(sentence)
        # add custom confusion-set hits to the suspected errors
        for confuse in self.custom_confusion:
            idx = sentence.find(confuse)
            if idx > -1:
                maybe_err = [
                    confuse, idx, idx + len(confuse), error_type["confusion"]
                ]
                self._add_maybe_error_item(maybe_err, maybe_errors)

        if self.is_word_error_detect:
            # treat out-of-vocabulary words as suspected errors
            for word, begin_idx, end_idx in tokens:
                # pass blank
                if not word.strip():
                    continue
                # punctuation
                if word in PUNCTUATION_LIST:
                    continue
                # pass num
                if word.isdigit():
                    continue
                # pass alpha
                if is_alphabet_string(word.lower()):
                    continue
                # in dict
                if word in self.word_freq:
                    continue
                maybe_err = [word, begin_idx, end_idx, error_type["word"]]
                self._add_maybe_error_item(maybe_err, maybe_errors)

        if self.is_char_error_detect:
            # use the language model to detect suspected wrong characters
            ngram_avg_scores = []
            try:
                for n in [2, 3]:
                    scores = []
                    for i in range(len(sentence) - n + 1):
                        word = sentence[i:i + n]
                        score = self.ngram_score(list(word))
                        scores.append(score)
                    if not scores:
                        continue
                    # pad both ends so the sliding-window scores cover every char
                    for _ in range(n - 1):
                        scores.insert(0, scores[0])
                        scores.append(scores[-1])
                    avg_scores = [
                        sum(scores[i:i + n]) / len(scores[i:i + n])
                        for i in range(len(sentence))
                    ]
                    ngram_avg_scores.append(avg_scores)

                # average the concatenated n-gram scores
                sent_scores = list(
                    np.average(np.array(ngram_avg_scores), axis=0))
                # collect suspected wrong-character info
                for i in self._get_maybe_error_index(sent_scores):
                    maybe_err = [sentence[i], i, i + 1, error_type["char"]]
                    self._add_maybe_error_item(maybe_err, maybe_errors)
            except IndexError as ie:
                logger.warning('index error, sentence: %s, %s' % (sentence, str(ie)))
            except Exception as e:
                logger.warning('detect error, sentence: %s, %s' % (sentence, str(e)))
        return sorted(maybe_errors, key=lambda k: k[1], reverse=False)
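
To make the MAD heuristic in _get_maybe_error_index concrete, a small sketch with made-up scores (one conspicuously low value):

    scores = [-3.1, -3.0, -3.2, -9.5, -3.1, -3.0]
    print(Detector._get_maybe_error_index(scores))
    # median = -3.1, MAD = 0.1; index 3 gets y_score = 0.6745 * 6.4 / 0.1 ≈ 43 > 1.4
    # and its score is below the median, so only it is flagged: [3]
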
Example #7
class NameSort(object):
    def __init__(self,
                 word_freq_path=config.word_freq_path,
                 name_sort_path=ccm_conf.name_sort_path,
                 leader_job_path=ccm_conf.leader_job_path,
                 leader_job_freq_dict_path=ccm_conf.leader_job_freq_dict_path):
        self.leader_job_freq_dict = Detector.load_word_freq_dict(
            leader_job_freq_dict_path)
        self.word_freq_path = word_freq_path
        print(self.leader_job_freq_dict)
        self.tokenizer = Tokenizer(
            dict_path=self.word_freq_path,
            custom_word_freq_dict=self.leader_job_freq_dict)
        self.name_sort_path = name_sort_path
        self.leader_job_path = leader_job_path

    def is_filter_token(self, token):
        result = False
        # pass blank
        if not token.strip():
            result = True
        # pass punctuation
        if token in PUNCTUATION_LIST:
            result = True
        # pass num
        if token.isdigit():
            result = True
        # pass alpha
        if is_alphabet_string(token.lower()):
            result = True
        return result

    def load_ccm_word_freq_dict(self, path):
        """
        加载切词词典
        :param path:
        :return:
        """
        word_freq = {}
        with codecs.open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith('#'):
                    continue
                info = line.split('+')
                word = info[0]  # word is a person's name
                # rank, default 1: when a second field exists, freq = info[1] gives the protocol order
                freq = int(info[1]) if len(info) > 1 else 1
                word_freq[word] = freq
        return word_freq

    def load_ccm_job_freq_dict(self, path):
        """
        加载切词词典
        :param path:
        :return:
        """
        word_freq = {}
        with codecs.open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line.startswith('#') or not line:
                    continue
                info = line.split(':')
                if len(info) < 2:
                    continue
                word = info[0]  # the name, e.g. 习近平
                job = info[1]  # jobs + titles
                s1 = job.split('?')
                if len(s1) > 1:
                    s2 = s1[0].split('、')  # jobs, separated by '、'
                    s3 = s1[1].split('、')  # titles, separated by '、'
                    b = {'1': s2, '2': s3}
                else:
                    s2 = s1[0].split('、')  # jobs, separated by '、'
                    b = {'1': s2}
                word_freq[word] = b
        return word_freq

    def ccm_sort(self, sentence):
        """
        Check leader-name ordering in a sentence against the rank dictionary,
        swapping names that appear out of protocol order.
        """
        # load the name-ordering dictionary
        name_model = self.load_ccm_word_freq_dict(self.name_sort_path)
        maybe_errors = []
        if not sentence.strip():
            return maybe_errors
        # normalize the text
        sentence = uniform(sentence)
        # tokenize
        tokens = self.tokenizer.tokenize(sentence)
        print(tokens)
        temp = None
        error_list = []
        correct_list = []
        new = []
        i = -1
        for word, begin_idx, end_idx in tokens:
            new.append(word)
            i += 1
            if word in LINK_WORD:
                temp = None
            if name_model.get(word):
                if not temp:
                    temp = name_model.get(word)
                    continue
                else:
                    if temp > name_model.get(word):
                        p = tokens[i]
                        tokens[i] = tokens[i - 2]
                        tokens[i - 2] = p
                        print(tokens[i][0])
                        print(tokens[i - 2][0])
                        correct_list.append((tokens[i][0], i))
                        correct_list.append((tokens[i - 2][0], i - 2))
                        error_list.append((tokens[i][0], i))
                    else:
                        pass
        for word, p in correct_list:
            new[p] = word
        print(new)
        print("ls:" + str(correct_list))
        correct = ''.join(new)
        print("correct:" + correct)

        return sorted(maybe_errors, key=lambda k: k[1], reverse=False)

    def name_job(self, sentence):
        """
        Check that the jobs appearing before each leader's name match the
        name-to-jobs dictionary; mismatches are collected into error_list.
        """
        # load the name-to-jobs dictionary
        job_model = self.load_ccm_job_freq_dict(self.leader_job_path)
        print(job_model)
        maybe_errors = []
        if not sentence.strip():
            return maybe_errors
        # normalize the text
        sentence = uniform(sentence)
        # tokenize
        tokens = self.tokenizer.tokenize(sentence)
        print(tokens)
        error_list = []
        correct_list = []
        new = []
        i = 0
        j = 0
        for word, begin_idx, end_idx in tokens:
            if job_model.get(word):
                print(i)  # a name was found; i is this name's token index
                a = job_model.get(word)
                front = a.get('1')
                temp_list = []
                for x in range(j, i):  # j is the start index, i the end index
                    if self.leader_job_freq_dict.get(tokens[x][0]):
                        if tokens[x][0] not in front:
                            temp_list.append(tokens[x][0])
                if temp_list:
                    error_list.append({word: temp_list})
                else:
                    pass
                j = i + 1  # move the start to just past this name's index
            i += 1
        print(error_list)
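
The two loaders above imply these (hypothetical) input layouts:

    # name_sort_path, split on '+': a name, then its protocol-order rank (default 1)
    #   张三+1
    #   李四+2
    # leader_job_path, split on ':', then '?', then '、': name, jobs, optional titles
    #   张三:委员长、书记?同志
    ns = NameSort()
    ns.ccm_sort('李四 张三 出席会议')  # made-up sentence; swaps names found out of rank order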