def main():
    """Filter/normalize a text corpus line by line.

    Reads ``args.input_file``, optionally keeps only well-formed Chinese
    sentences (``args.effect``), converts traditional characters to
    simplified, and writes the result to ``args.output_file``.
    """
    args = parse()
    pwd_path = os.path.abspath(os.path.dirname(__file__))
    # BUG FIX: joining with a leading "/" made os.path.join discard
    # pwd_path entirely (an absolute component resets the join).
    file_path = os.path.join(pwd_path, args.input_file)
    if not os.path.exists(file_path):
        default_logger = get_logger(__file__)
        # BUG FIX: logging methods take printf-style format args,
        # not print-style extra positionals.
        default_logger.debug("file not exists: %s", file_path)
    # Context managers close both handles even if an exception is raised.
    with codecs.open(args.input_file, 'rb', encoding='utf-8') as fin, \
            codecs.open(args.output_file, 'w', encoding='utf-8') as file_ou:
        file_in = fin.readlines()
        if args.effect:
            PUNCTUATION_LIST = "。,,、?:;{}[]【】“‘’”《》/!%……()<>@#$~^¥%&*\"\'=+-"
            for line in tqdm(file_in):
                line = line.strip()
                # BUG FIX: guard empty lines before indexing line[0]/line[-1].
                if not line:
                    continue
                # Keep only lines starting with a Chinese char or an opening mark.
                if not is_chinese(line[0]) and line[0] not in {'“', '‘', '{', '[', '【', '(', '<', '《'}:
                    continue
                # NOTE(review): '……' is two characters and can never equal the
                # single char line[-1]; kept for parity with the original set.
                if line[-1] not in {'。', '?', '”', '!', '……', '’', ')'}:
                    if is_chinese(line[-1]):
                        # Terminate a Chinese sentence that lacks end punctuation.
                        line += '。'
                    else:
                        continue
                if len(line) < 5:
                    continue
                # Keep only lines made entirely of Chinese chars / known punctuation
                # (idiom fix: all(...) instead of `False not in [...]`).
                if all(char in PUNCTUATION_LIST or is_chinese(char) for char in line):
                    line = traditional2simplified(line)
                    file_ou.write(line + '\n')
        else:
            for line in tqdm(file_in):
                line = line.strip()
                line = traditional2simplified(line)
                file_ou.write(line + '\n')
# -*- coding: utf-8 -*- # Author: XuMing <*****@*****.**> # Brief: Train seq2seq model for text grammar error correction import numpy as np from pycorrector.seq2seq import cged_config as config from pycorrector.seq2seq.corpus_reader import CGEDReader from pycorrector.seq2seq.seq2seq_model import create_model, callback, eval from pycorrector.utils.io_utils import get_logger logger = get_logger(__name__) def train(train_path=None, save_model_path=None, batch_size=64, epochs=10, rnn_hidden_dim=200): print('Training model...') data_reader = CGEDReader(train_path) input_texts, target_texts = data_reader.build_dataset(train_path) print('input_texts:', input_texts[0]) print('target_texts:', target_texts[0]) input_characters = data_reader.read_vocab(input_texts) target_characters = data_reader.read_vocab(target_texts) num_encoder_tokens = len(input_characters) num_decoder_tokens = len(target_characters) max_encoder_seq_len = max([len(text) for text in input_texts]) max_decoder_seq_len = max([len(text) for text in target_texts])
# -*- coding: utf-8 -*- # Author: XuMing <*****@*****.**> # Brief: corrector with spell and stroke import codecs import operator import os import time from pypinyin import lazy_pinyin from pycorrector.detector import Detector, error_type from pycorrector.utils.io_utils import get_logger from pycorrector.utils.math_utils import edit_distance_word from pycorrector.utils.text_utils import is_chinese_string default_logger = get_logger(__file__) pwd_path = os.path.abspath(os.path.dirname(__file__)) def load_char_set(path): words = set() with codecs.open(path, 'r', encoding='utf-8') as f: for w in f: words.add(w.strip()) return words def load_same_pinyin(path, sep='\t'): """ 加载同音字 :param path:
# -*- coding: utf-8 -*- # Author: XuMing <*****@*****.**> # Brief: error word detector import codecs import kenlm import os import time import numpy as np from pycorrector.tokenizer import Tokenizer from pycorrector.utils.io_utils import get_logger from pycorrector.utils.text_utils import uniform, is_alphabet_string logger = get_logger(__file__) PUNCTUATION_LIST = "。,,、?:;{}[]【】“‘’”《》/!!%……()<>@#$~^¥%&*\"\'=+-" pwd_path = os.path.abspath(os.path.dirname(__file__)) error_type = {"confusion": 1, "word": 2, "char": 3} class Detector(object): def __init__(self, language_model_path='', word_freq_path='', custom_word_freq_path='', custom_confusion_path='', person_name_path='', place_name_path='', stopwords_path=''): self.name = 'detector' self.language_model_path = os.path.join(pwd_path, language_model_path)
# -*- coding: utf-8 -*- # Author: XuMing <*****@*****.**> # Brief: Train seq2seq model for text grammar error correction import numpy as np from pycorrector.seq2seq import cged_config as config from pycorrector.seq2seq.corpus_reader import CGEDReader from pycorrector.seq2seq.seq2seq_model import create_model, callback, eval from pycorrector.utils.io_utils import get_logger logger = get_logger(__name__) def train(train_path=None, save_model_path=None, batch_size=64, epochs=10, rnn_hidden_dim=200): print('Training model...') data_reader = CGEDReader(train_path) input_texts, target_texts = data_reader.build_dataset(train_path) print('input_texts:', input_texts[0]) print('target_texts:', target_texts[0]) input_characters = data_reader.read_vocab(input_texts) target_characters = data_reader.read_vocab(target_texts) num_encoder_tokens = len(input_characters) num_decoder_tokens = len(target_characters) max_encoder_seq_len = max([len(text) for text in input_texts]) max_decoder_seq_len = max([len(text) for text in target_texts])
from pycorrector import config
from pycorrector.detector import detect
from pycorrector.detector import get_frequency
from pycorrector.detector import get_ppl_score
from pycorrector.detector import trigram_char
from pycorrector.detector import word_freq
from pycorrector.utils.io_utils import dump_pkl
from pycorrector.utils.io_utils import get_logger
from pycorrector.utils.io_utils import load_pkl
from pycorrector.utils.text_utils import is_chinese_string
from pycorrector.utils.text_utils import traditional2simplified

# NOTE(review): `os` and `codecs` are used below but not imported in this
# visible chunk -- presumably imported earlier in the file; confirm.
pwd_path = os.path.abspath(os.path.dirname(__file__))
char_file_path = os.path.join(pwd_path, config.char_file_path)
default_logger = get_logger(__file__)


def load_word_dict(path):
    """Concatenate all stripped lines of *path* into a single string.

    NOTE(review): despite the name, this returns a str of all entries
    joined together, not a dict.

    :param path: UTF-8 text file, one entry per line
    :return: str of concatenated stripped lines
    """
    word_dict = ''
    with codecs.open(path, 'r', encoding='utf-8') as f:
        for w in f:
            word_dict += w.strip()
    return word_dict


def load_same_pinyin(path, sep='\t'):
    """Load the homophone (same-pinyin) character table.

    :param path: path to the homophone table file
    :return: (truncated in this chunk; the docstring is closed here so
        the visible part parses)
    """