Example #1
0
def detect(sentence):
    """Return sorted character indices in *sentence* that look erroneous.

    Combines two detectors:
      1. out-of-vocabulary words from the tokenizer (every character of a
         token that is neither punctuation nor in ``word_freq``), and
      2. low-scoring positions under the character n-gram language model.
    """
    maybe_error_indices = set()
    # Normalize the text (e.g. full-width -> half-width characters).
    sentence = uniform(sentence)
    # Segment into (word, begin_idx, end_idx) tokens.
    tokens = tokenize(sentence)
    # Unknown words: flag every character position of the token.
    for word, begin_idx, end_idx in tokens:
        # membership is tested on the dict directly -- no .keys() needed
        if word not in PUNCTUATION_LIST and word not in word_freq:
            maybe_error_indices.update(range(begin_idx, end_idx))
    # Language-model detection of suspicious characters.
    ngram_avg_scores = []
    for n in (2, 3):
        scores = [
            get_ngram_score(list(sentence[i:i + n]), mode=trigram_char)
            for i in range(len(sentence) - n + 1)
        ]
        if not scores:
            # FIX: sentence shorter than n -- original raised IndexError
            # on scores[0] below (cf. the guarded variants of this function)
            continue
        # Replicate the edge scores so the moving-average window below
        # fits at every character position.
        for _ in range(n - 1):
            scores.insert(0, scores[0])
            scores.append(scores[-1])
        avg_scores = [sum(scores[i:i + n]) / len(scores[i:i + n])
                      for i in range(len(sentence))]
        ngram_avg_scores.append(avg_scores)
    # Position-wise average of the 2-gram and 3-gram score curves.
    sent_scores = list(np.average(np.array(ngram_avg_scores), axis=0))
    maybe_error_char_indices = _get_maybe_error_index(sent_scores)
    # Merge word-level and char-level suspicions.
    maybe_error_indices |= set(maybe_error_char_indices)
    return sorted(maybe_error_indices)
Example #2
0
def detect(sentence):
    """Return sorted character indices in *sentence* that look erroneous.

    Flags characters of out-of-vocabulary tokens, then adds positions
    scored as unlikely by the character n-gram language model.
    """
    maybe_error_indices = set()
    # Normalize the text (e.g. full-width -> half-width characters).
    sentence = uniform(sentence)
    # Segment into (word, begin_idx, end_idx) tokens.
    tokens = tokenize(sentence)
    # Unknown words: flag every character position of the token.
    for word, begin_idx, end_idx in tokens:
        # fixed: pass num alpha
        if word.isalnum():
            continue
        # punctuation
        if word in PUNCTUATION_LIST:
            continue
        # in dict (membership on the dict directly -- no .keys() needed)
        if word in word_freq:
            continue
        maybe_error_indices.update(range(begin_idx, end_idx))
    # Language-model detection of suspicious characters.
    ngram_avg_scores = []
    try:
        for n in (2, 3):
            scores = []
            for i in range(len(sentence) - n + 1):
                word = sentence[i:i + n]
                scores.append(get_ngram_score(list(word), mode=trigram_char))
            if not scores:
                continue
            # Replicate the edge scores so the moving-average window
            # below fits at every character position.
            for _ in range(n - 1):
                scores.insert(0, scores[0])
                scores.append(scores[-1])
            avg_scores = [
                sum(scores[i:i + n]) / len(scores[i:i + n])
                for i in range(len(sentence))
            ]
            ngram_avg_scores.append(avg_scores)

        # Position-wise average of the 2-gram and 3-gram score curves.
        sent_scores = list(np.average(np.array(ngram_avg_scores), axis=0))
        maybe_error_char_indices = _get_maybe_error_index(sent_scores)
        # Merge word-level and char-level suspicions.
        maybe_error_indices |= set(maybe_error_char_indices)
    except IndexError as ie:
        # BUG FIX: str() the exception -- "str + Exception" itself raised
        # TypeError inside the handler, masking the original error.
        default_logger.warn("index error, sentence:" + sentence + str(ie))
    except Exception as e:
        default_logger.warn("detect error, sentence:" + sentence + str(e))
    return sorted(maybe_error_indices)
Example #3
0
def detect(sentence):
    """Collect and return (sorted) indices of likely errors in *sentence*.

    Word-level suspicions come from out-of-vocabulary tokens; char-level
    suspicions come from 2- and 3-gram language-model scores.
    """
    suspects = set()
    # normalize, then segment into (token, start, stop) triples
    sentence = uniform(sentence)
    for token, start, stop in tokenize(sentence):
        # a token that is neither punctuation nor in the frequency dict
        # marks every one of its character positions as suspect
        if token in PUNCTUATION_LIST:
            continue
        if token in word_freq:
            continue
        suspects.update(range(start, stop))
    # score every position with the 2- and 3-gram language models
    per_n_curves = []
    try:
        for n in (2, 3):
            window_scores = [
                get_ngram_score(list(sentence[pos:pos + n]), mode=trigram_char)
                for pos in range(len(sentence) - n + 1)
            ]
            # replicate the edge scores so the averaging window fits
            # at every character position
            for _ in range(n - 1):
                window_scores.insert(0, window_scores[0])
                window_scores.append(window_scores[-1])
            per_n_curves.append([
                sum(window_scores[pos:pos + n]) / len(window_scores[pos:pos + n])
                for pos in range(len(sentence))
            ])

        # position-wise mean over the n-gram score curves
        merged_scores = list(np.average(np.array(per_n_curves), axis=0))
        # fold char-level suspicions into the word-level ones
        suspects |= set(_get_maybe_error_index(merged_scores))
    except IndexError as ie:
        print("index error, sentence:", sentence, ie)
    except Exception as e:
        print("detect error, sentence:", sentence, e)
    return sorted(suspects)
Example #4
0
from pypinyin import lazy_pinyin

from pycorrector.utils.text_utils import traditional2simplified, simplified2traditional
from pycorrector.utils.text_utils import tokenize, get_homophones_by_char, get_homophones_by_pinyin

# NOTE(review): 'enchant' is used below but never imported in this file --
# add 'import enchant' (pyenchant) to the imports at the top.

# Traditional -> simplified Chinese conversion demo.
traditional_sentence = '憂郁的臺灣烏龜'
simplified_sentence = traditional2simplified(traditional_sentence)
print(simplified_sentence)

# Simplified -> traditional Chinese conversion demo.
simplified_sentence = '忧郁的台湾乌龟'
traditional_sentence = simplified2traditional(simplified_sentence)
print(traditional_sentence)

print(lazy_pinyin('中心'))  # pinyin without tone marks

print(tokenize('小姑娘蹦蹦跳跳的去了她外公家'))

# Decide whether a string is English: every word must pass the spell checker.
en_dict = enchant.Dict("en_US")
print(en_dict.check("hello"))
print(en_dict.check("hello boy what is your name"))
strs = "hello boy what is your name"
# BUG FIX: iterate words, not characters -- 'for word in strs' walked the
# string one character at a time, so the spell check never saw real words.
flag = all(en_dict.check(word) for word in strs.split())
print(flag)
print(en_dict.check("zhangsan"))
Example #5
0
    def detect(self, sentence):
        """Detect likely errors in *sentence*.

        Returns a list of ``[word_or_char, begin_idx, end_idx]`` items,
        sorted by begin index, collected from three detectors: custom
        confusion-set entries, out-of-vocabulary words, and low-scoring
        positions under the n-gram language model.
        """
        maybe_errors = []
        if not sentence.strip():
            # empty / whitespace-only input: nothing to detect
            return maybe_errors
        self.check_detector_initialized()
        # Normalize the text (e.g. full-width -> half-width characters).
        sentence = uniform(sentence)
        # Segment into (word, begin_idx, end_idx) tokens.
        tokens = tokenize(sentence)
        # Add custom confusion-set entries found in the sentence.
        # NOTE(review): only the first occurrence of each entry is flagged
        # (str.find) -- confirm repeated occurrences are intentionally skipped.
        for confuse in self.custom_confusion:
            idx = sentence.find(confuse)
            if idx > -1:
                maybe_err = [confuse, idx, idx + len(confuse)]
                self._add_maybe_error_item(maybe_err, maybe_errors)

        # Add out-of-vocabulary words as suspected errors.
        for word, begin_idx, end_idx in tokens:
            # pass blank
            if not word.strip():
                continue
            # punctuation
            if word in PUNCTUATION_LIST:
                continue
            # pass num
            if word.isdigit():
                continue
            # pass alpha
            if is_alphabet_string(word.lower()):
                continue
            # in dict
            if word in self.word_freq:
                continue
            maybe_err = [word, begin_idx, end_idx]
            self._add_maybe_error_item(maybe_err, maybe_errors)

        # Language-model detection of suspicious characters.
        ngram_avg_scores = []
        try:
            for n in [2, 3]:
                scores = []
                for i in range(len(sentence) - n + 1):
                    word = sentence[i:i + n]
                    score = self.ngram_score(list(word))
                    scores.append(score)
                if not scores:
                    # sentence shorter than n: nothing to score
                    continue
                # Replicate the edge scores so the moving-average window
                # below fits at every character position.
                for _ in range(n - 1):
                    scores.insert(0, scores[0])
                    scores.append(scores[-1])
                avg_scores = [
                    sum(scores[i:i + n]) / len(scores[i:i + n])
                    for i in range(len(sentence))
                ]
                ngram_avg_scores.append(avg_scores)

            # Position-wise average of the 2-gram and 3-gram score curves.
            sent_scores = list(np.average(np.array(ngram_avg_scores), axis=0))
            # Turn low-scoring positions into single-character error items.
            for i in self._get_maybe_error_index(sent_scores):
                maybe_err = [sentence[i], i, i + 1]
                self._add_maybe_error_item(maybe_err, maybe_errors)
        except IndexError as ie:
            default_logger.warn("index error, sentence:" + sentence + str(ie))
        except Exception as e:
            default_logger.warn("detect error, sentence:" + sentence + str(e))
        return sorted(maybe_errors, key=lambda k: k[1], reverse=False)
Example #6
0
def detect(sentence):
    """Return sorted character indices in *sentence* suspected to be errors.

    Combines OOV-word detection with 1/2/3-gram language-model scoring,
    then removes positions that fall inside proper nouns (POS tags
    nz/nr/nt/ns) so names and places are not flagged as typos.
    """
    maybe_error_indices = set()

    sentence = uniform(sentence)

    tokens = tokenize(sentence)

    # unknown chars: flag every character of an out-of-vocabulary token
    for word, begin_idx, end_idx in tokens:
        if word not in PUNCTUATION_LIST and word not in word_freq.keys():
            for i in range(begin_idx, end_idx):
                maybe_error_indices.add(i)

    ngram_avg_scores = []
    try:
        for n in [1, 2, 3]:
            scores = []
            for i in range(len(sentence) - n + 1):
                word = sentence[i:i + n]
                score = get_ngram_score(list(word), mode=trigram_char)
                scores.append(score)

            # replicate the edge scores so the moving-average window
            # below fits at every character position
            for _ in range(n - 1):
                scores.insert(0, scores[0])
                scores.append(scores[-1])

            avg_scores = [
                sum(scores[i:i + n]) / len(scores[i:i + n])
                for i in range(len(sentence))
            ]
            ngram_avg_scores.append(avg_scores)

        # position-wise average over the n-gram score curves
        sent_scores = list(np.average(np.array(ngram_avg_scores), axis=0))
        maybe_error_char_indices = _get_maybe_error_index(sent_scores)

        maybe_error_indices |= set(maybe_error_char_indices)
    except IndexError as ie:
        print("index error, sentence:", sentence, ie)
        pass
    except Exception as e:
        print("detect error, sentence:", sentence, e)

    # # to get rid of special nouns like name
    seg = pseg.lcut(sentence)
    # # in the form of list of pair(w.word, w.flag)
    word = [w.word for w in seg]
    tag = [w.flag for w in seg]

    for i in range(len(tag)):
        if tag[i] in {'nz', 'nr', 'nt', 'ns'}:
            # NOTE(review): a "name" right after an adverb ('d') is kept as
            # a suspect -- presumably treated as a misparse; confirm intent
            if i > 0 and tag[i - 1] == 'd':
                continue

            # len(''.join(word[:i])) is the character offset of word i in
            # the sentence; un-flag that word's whole character span
            if len(word[i]) > 1:
                maybe_error_indices -= set(range(len(''.join(word[:i])), \
                                                 len(''.join(word[:i + 1]))))
            # a single-char name followed by another name-tagged word:
            # un-flag the two-word span together
            elif i + 1 < len(tag) and tag[i + 1] in {'nz', 'nr', 'nt', 'ns'}:
                maybe_error_indices -= set(range(len(''.join(word[:i])), \
                                                 len(''.join(word[:i + 2]))))
        # if tag[i] == 'j' and len(word[i]) > 1:
        #     maybe_error_indices -= set(range(len(''.join(word[:i])), \
        #                                      len(''.join(word[:i + 1]))))
    return sorted(maybe_error_indices)