Example #1
0
def main():
    """Filter and normalize a Chinese text corpus file.

    Reads ``args.input_file`` line by line, optionally keeps only
    well-formed Chinese sentences (``args.effect``), converts traditional
    characters to simplified, and writes the result to ``args.output_file``.
    """
    args = parse()
    pwd_path = os.path.abspath(os.path.dirname(__file__))
    # Bug fix: `os.path.join(pwd_path, "/" + name)` discards pwd_path because
    # a second component starting with "/" is treated as absolute.
    file_path = os.path.join(pwd_path, args.input_file)

    if not os.path.exists(file_path):
        default_logger = get_logger(__file__)
        # Bug fix: use lazy %-style formatting; the old call passed file_path
        # as an extra positional arg the log record could not format.
        default_logger.debug("file not exists: %s", file_path)

    # Openers a filtered line may legally start with.
    valid_start = {'“', '‘', '{', '[', '【', '(', '<', '《'}
    # Terminators/closers a line may legally end with. Bug fix: the original
    # two-character '……' can never equal a single character, so the single
    # ellipsis '…' is added ('……' kept for clarity of intent).
    valid_end = {'。', '?', '”', '!', '…', '……', '’', ')'}

    # Context managers close both files even on error (the input handle was
    # previously never closed).
    with codecs.open(args.input_file, 'rb', encoding='utf-8') as file_in, \
            codecs.open(args.output_file, 'w', encoding='utf-8') as file_ou:
        if args.effect:
            PUNCTUATION_LIST = "。,,、?:;{}[]【】“‘’”《》/!%……()<>@#$~^¥%&*\"\'=+-"
            for line in tqdm(file_in.readlines()):
                line = line.strip()
                if not line:
                    # Guard: an all-whitespace line would raise IndexError below.
                    continue
                if not is_chinese(line[0]) and line[0] not in valid_start:
                    continue
                if line[-1] not in valid_end:
                    if is_chinese(line[-1]):
                        line += '。'
                    else:
                        continue
                if len(line) < 5:
                    continue
                # Keep only lines made entirely of Chinese chars / punctuation.
                if all(char in PUNCTUATION_LIST or is_chinese(char) for char in line):
                    line = traditional2simplified(line)
                    file_ou.write(line + '\n')
        else:
            for line in tqdm(file_in.readlines()):
                line = line.strip()
                line = traditional2simplified(line)
                file_ou.write(line + '\n')
Example #2
0
# -*- coding: utf-8 -*-
# Author: XuMing <*****@*****.**>
# Brief: Train seq2seq model for text grammar error correction

import numpy as np

from pycorrector.seq2seq import cged_config as config
from pycorrector.seq2seq.corpus_reader import CGEDReader
from pycorrector.seq2seq.seq2seq_model import create_model, callback, eval
from pycorrector.utils.io_utils import get_logger

# Module-level logger named after this module (project convention).
logger = get_logger(__name__)


def train(train_path=None,
          save_model_path=None,
          batch_size=64,
          epochs=10,
          rnn_hidden_dim=200):
    """Prepare data and train the seq2seq grammar-correction model.

    NOTE(review): this excerpt is truncated — only the data-preparation
    prologue is visible here; model creation presumably follows.

    :param train_path: path to the CGED training corpus.
    :param save_model_path: destination for the trained model (used beyond
        this excerpt).
    :param batch_size: mini-batch size (used beyond this excerpt).
    :param epochs: number of training epochs (used beyond this excerpt).
    :param rnn_hidden_dim: RNN hidden dimension (used beyond this excerpt).
    """
    print('Training model...')
    data_reader = CGEDReader(train_path)
    input_texts, target_texts = data_reader.build_dataset(train_path)
    # NOTE(review): assumes a non-empty dataset; IndexError on an empty corpus.
    print('input_texts:', input_texts[0])
    print('target_texts:', target_texts[0])

    # Character vocabularies for the encoder (source) and decoder (target).
    input_characters = data_reader.read_vocab(input_texts)
    target_characters = data_reader.read_vocab(target_texts)
    num_encoder_tokens = len(input_characters)
    num_decoder_tokens = len(target_characters)
    # Longest sequences bound the padded tensor shapes built later.
    max_encoder_seq_len = max([len(text) for text in input_texts])
    max_decoder_seq_len = max([len(text) for text in target_texts])
# -*- coding: utf-8 -*-
# Author: XuMing <*****@*****.**>
# Brief: corrector with spell and stroke
import codecs
import operator
import os
import time

from pypinyin import lazy_pinyin

from pycorrector.detector import Detector, error_type
from pycorrector.utils.io_utils import get_logger
from pycorrector.utils.math_utils import edit_distance_word
from pycorrector.utils.text_utils import is_chinese_string

# Module-level logger keyed on this file's path (project convention).
default_logger = get_logger(__file__)
# Absolute directory of this source file; used to resolve bundled data paths.
pwd_path = os.path.abspath(os.path.dirname(__file__))


def load_char_set(path):
    """Read a set of characters/words from *path*.

    :param path: UTF-8 text file, one entry per line.
    :return: set of stripped line contents.
    """
    with codecs.open(path, 'r', encoding='utf-8') as reader:
        return {entry.strip() for entry in reader}


def load_same_pinyin(path, sep='\t'):
    """
    Load the same-pinyin (homophone) character table.
    :param path:
# -*- coding: utf-8 -*-
# Author: XuMing <*****@*****.**>
# Brief: error word detector
import codecs
import kenlm
import os
import time

import numpy as np

from pycorrector.tokenizer import Tokenizer
from pycorrector.utils.io_utils import get_logger
from pycorrector.utils.text_utils import uniform, is_alphabet_string

# Module-level logger keyed on this file's path (project convention).
logger = get_logger(__file__)
# Punctuation characters accepted as valid within Chinese text
# (mix of full-width and half-width forms).
PUNCTUATION_LIST = "。,,、?:;{}[]【】“‘’”《》/!!%……()<>@#$~^¥%&*\"\'=+-"
# Absolute directory of this source file; used to resolve bundled data paths.
pwd_path = os.path.abspath(os.path.dirname(__file__))
# Numeric codes for the three detected error categories.
error_type = {"confusion": 1, "word": 2, "char": 3}


class Detector(object):
    """Chinese spelling-error detector.

    NOTE(review): this excerpt is truncated — only the start of ``__init__``
    is visible. Presumably combines a kenlm language model (imported above)
    with word-frequency and confusion dictionaries; confirm against the full
    source.
    """

    def __init__(self,
                 language_model_path='',
                 word_freq_path='',
                 custom_word_freq_path='',
                 custom_confusion_path='',
                 person_name_path='',
                 place_name_path='',
                 stopwords_path=''):
        # All resource paths are resolved relative to this source file's dir.
        self.name = 'detector'
        self.language_model_path = os.path.join(pwd_path, language_model_path)
Example #5
0
# -*- coding: utf-8 -*-
# Author: XuMing <*****@*****.**>
# Brief: Train seq2seq model for text grammar error correction

import numpy as np

from pycorrector.seq2seq import cged_config as config
from pycorrector.seq2seq.corpus_reader import CGEDReader
from pycorrector.seq2seq.seq2seq_model import create_model, callback, eval
from pycorrector.utils.io_utils import get_logger

# Module-level logger named after this module (project convention).
logger = get_logger(__name__)


def train(train_path=None,
          save_model_path=None,
          batch_size=64,
          epochs=10,
          rnn_hidden_dim=200):
    """Prepare data and train the seq2seq grammar-correction model.

    NOTE(review): this excerpt is truncated — only the data-preparation
    prologue is visible here; model creation presumably follows.

    :param train_path: path to the CGED training corpus.
    :param save_model_path: destination for the trained model (used beyond
        this excerpt).
    :param batch_size: mini-batch size (used beyond this excerpt).
    :param epochs: number of training epochs (used beyond this excerpt).
    :param rnn_hidden_dim: RNN hidden dimension (used beyond this excerpt).
    """
    print('Training model...')
    data_reader = CGEDReader(train_path)
    input_texts, target_texts = data_reader.build_dataset(train_path)
    # NOTE(review): assumes a non-empty dataset; IndexError on an empty corpus.
    print('input_texts:', input_texts[0])
    print('target_texts:', target_texts[0])

    # Character vocabularies for the encoder (source) and decoder (target).
    input_characters = data_reader.read_vocab(input_texts)
    target_characters = data_reader.read_vocab(target_texts)
    num_encoder_tokens = len(input_characters)
    num_decoder_tokens = len(target_characters)
    # Longest sequences bound the padded tensor shapes built later.
    max_encoder_seq_len = max([len(text) for text in input_texts])
    max_decoder_seq_len = max([len(text) for text in target_texts])
Example #6
0
from pycorrector import config
from pycorrector.detector import detect
from pycorrector.detector import get_frequency
from pycorrector.detector import get_ppl_score
from pycorrector.detector import trigram_char
from pycorrector.detector import word_freq
from pycorrector.utils.io_utils import dump_pkl
from pycorrector.utils.io_utils import get_logger
from pycorrector.utils.io_utils import load_pkl
from pycorrector.utils.text_utils import is_chinese_string
from pycorrector.utils.text_utils import traditional2simplified

# Absolute directory of this source file; used to resolve bundled data paths.
pwd_path = os.path.abspath(os.path.dirname(__file__))
# Path of the character-set file, taken from the project-wide config module.
char_file_path = os.path.join(pwd_path, config.char_file_path)

# Module-level logger keyed on this file's path (project convention).
default_logger = get_logger(__file__)


def load_word_dict(path):
    """Load a dictionary file and concatenate all entries into one string.

    :param path: UTF-8 text file, one entry per line.
    :return: single string of all stripped lines joined end-to-end.
    """
    with codecs.open(path, 'r', encoding='utf-8') as f:
        # ''.join is O(n); the previous per-line `+=` rebuild was quadratic.
        return ''.join(w.strip() for w in f)


def load_same_pinyin(path, sep='\t'):
    """
    加载同音字
    :param path:
    :return: