Example #1
from lxml import etree  # proc_item relies on lxml's XPath support


def proc_item(item):
    """
    处理训练数据集
    Args:
        item:
    Returns:
        list
    """
    root = etree.XML(item)
    passages = dict()
    mistakes = []
    for passage in root.xpath('/ESSAY/TEXT/PASSAGE'):
        passages[passage.get('id')] = traditional2simplified(passage.text)
    for mistake in root.xpath('/ESSAY/MISTAKE'):
        mistakes.append({'id': mistake.get('id'),
                         'location': int(mistake.get('location')) - 1,
                         'wrong': traditional2simplified(mistake.xpath('./WRONG/text()')[0].strip()),
                         'correction': traditional2simplified(mistake.xpath('./CORRECTION/text()')[0].strip())})

    rst_items = dict()

    def get_passages_by_id(pgs, _id):
        p = pgs.get(_id)
        if p:
            return p
        _id = _id[:-1] + str(int(_id[-1]) + 1)
        p = pgs.get(_id)
        if p:
            return p
        raise ValueError(f'passage not found by {_id}')

    for mistake in mistakes:
        if mistake['id'] not in rst_items.keys():
            rst_items[mistake['id']] = {'original_text': get_passages_by_id(passages, mistake['id']),
                                        'wrong_ids': [],
                                        'correct_text': get_passages_by_id(passages, mistake['id'])}

        # TODO: verify that traditional-to-simplified conversion does not change character counts or positions
        ori_text = rst_items[mistake['id']]['original_text']
        cor_text = rst_items[mistake['id']]['correct_text']
        if len(ori_text) == len(cor_text):
            if ori_text[mistake['location']] in mistake['wrong']:
                rst_items[mistake['id']]['wrong_ids'].append(mistake['location'])
                wrong_char_idx = mistake['wrong'].index(ori_text[mistake['location']])
                start = mistake['location'] - wrong_char_idx
                end = start + len(mistake['wrong'])
                rst_items[mistake['id']][
                    'correct_text'] = f'{cor_text[:start]}{mistake["correction"]}{cor_text[end:]}'
        else:
            print(f'error line:\n{mistake["id"]}\n{ori_text}\n{cor_text}')
    rst = []
    for k in rst_items.keys():
        if len(rst_items[k]['correct_text']) == len(rst_items[k]['original_text']):
            rst.append({'id': k, **rst_items[k]})
        else:
            text = rst_items[k]['correct_text']
            rst.append({'id': k, 'correct_text': text, 'original_text': text, 'wrong_ids': []})
    return rst
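
A minimal sketch of how proc_item might be exercised in isolation is shown below. The XML only mirrors the tags queried by the XPath expressions; the id, location, and text values are invented, and traditional2simplified is taken from pycorrector's top-level API as in Example #3 (inside the original module it may be a local helper with the same signature).

from pycorrector import traditional2simplified  # assumption: the package-level helper shown in Example #3

# One made-up <ESSAY> block; location is 1-based in the data, hence the -1 in proc_item.
sample = (
    '<ESSAY title="demo">'
    '<TEXT><PASSAGE id="A2-0001-1">我今天吃平果。</PASSAGE></TEXT>'
    '<MISTAKE id="A2-0001-1" location="5">'
    '<WRONG>平果</WRONG><CORRECTION>苹果</CORRECTION>'
    '</MISTAKE>'
    '</ESSAY>'
)

print(proc_item(sample))
# Expected shape: [{'id': 'A2-0001-1', 'original_text': '我今天吃平果。',
#                   'wrong_ids': [4], 'correct_text': '我今天吃苹果。'}]
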
Example #2
import os  # used below for os.path.join


def proc_test_set(fp):
    """
    生成sighan15的测试集
    Args:
        fp:
    Returns:
    """
    inputs = dict()
    with open(os.path.join(fp, 'SIGHAN15_CSC_TestInput.txt'),
              'r',
              encoding='utf-8') as f:
        for line in f:
            pid = line[5:14]
            text = line[16:].strip()
            inputs[pid] = text

    rst = []
    with open(os.path.join(fp, 'SIGHAN15_CSC_TestTruth.txt'),
              'r',
              encoding='utf-8') as f:
        for line in f:
            pid = line[0:9]
            mistakes = line[11:].strip().split(', ')
            if len(mistakes) <= 1:
                text = traditional2simplified(inputs[pid])
                rst.append({
                    'id': pid,
                    'original_text': text,
                    'wrong_ids': [],
                    'correct_text': text
                })
            else:
                wrong_ids = []
                original_text = inputs[pid]
                cor_text = inputs[pid]
                for i in range(len(mistakes) // 2):
                    idx = int(mistakes[2 * i]) - 1
                    cor_char = mistakes[2 * i + 1]
                    wrong_ids.append(idx)
                    cor_text = f'{cor_text[:idx]}{cor_char}{cor_text[idx + 1:]}'
                original_text = traditional2simplified(original_text)
                cor_text = traditional2simplified(cor_text)
                if len(original_text) != len(cor_text):
                    print('error line:\n', pid)
                    print(original_text)
                    print(cor_text)
                    continue
                rst.append({
                    'id': pid,
                    'original_text': original_text,
                    'wrong_ids': wrong_ids,
                    'correct_text': cor_text
                })

    return rst
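
If you need the result on disk, a small driver like the one below would do. The directory and output filename are placeholders, and traditional2simplified must be importable as in Example #3.

import json

test_dir = 'data/sighan15'  # assumption: holds SIGHAN15_CSC_TestInput.txt and SIGHAN15_CSC_TestTruth.txt
records = proc_test_set(test_dir)

with open('test_sighan15.json', 'w', encoding='utf-8') as f:
    json.dump(records, f, ensure_ascii=False, indent=2)
print(f'wrote {len(records)} test sentences')
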
Example #3
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
@description: demo of Traditional/Simplified Chinese conversion
"""

import sys

sys.path.append("..")

import pycorrector

if __name__ == '__main__':
    traditional_sentence = '憂郁的臺灣烏龜'
    simplified_sentence = pycorrector.traditional2simplified(
        traditional_sentence)
    print(traditional_sentence, '=>', simplified_sentence)

    simplified_sentence = '忧郁的台湾乌龟'
    traditional_sentence = pycorrector.simplified2traditional(
        simplified_sentence)
    print(simplified_sentence, '=>', traditional_sentence)
Example #4
 def handler(self):
     self.initialize()  # initialize state
     while True:
         retry_times = 100  # number of retries after a failed lock attempt
         for i in range(retry_times):
             try:
                 fh = open(f'{self.scan_path}/share_temp_file.log',
                           'a+',
                           encoding='utf-8')  # open the shared file and load the queued file names
                 portalocker.lock(fh, portalocker.LOCK_EX)  # acquire an exclusive lock on the shared file
                 fh.seek(0)
                 filenames = list(
                     filter(lambda x: x != '',
                            fh.read().split('\n')))
                 fh.truncate(0)  # clear the shared file once its contents are loaded
                 fh.flush()
                 os.fsync(fh.fileno())
                 portalocker.unlock(fh)  # release the lock
                 fh.close()
                 with open(f'{self.scan_path}/temp_file_queue.log',
                           'a+',
                           encoding='utf-8'
                           ) as f:  # open the filename queue maintained only by this process
                     if len(filenames) != 0:  # write only when the shared file held file names
                         f.write('\n'.join(filenames) +
                                 '\n')  # append the newly loaded file names
                     f.seek(0)
                     filenames = list(
                         filter(lambda x: x != '',
                                f.read().split('\n')))
                     if len(filenames
                            ) != 0:  # proceed only when the filename queue is non-empty
                         tra_text = ''
                         word_list, keywords_list = list(), list()
                         filename = filenames[0]  # process only the first file per pass
                         pattern = re.match(
                             r'^(.+)_(\d{4}-\d{2}-\d{2})_(\d{2}-\d{2}-\d{2})\.png$',
                             filename)
                         username = pattern.group(1)
                         snapshot_date = pattern.group(2)
                         snapshot_time = pattern.group(3)
                         try:
                             pil_image = self.image_preprocessing(
                                 f'{self.scan_path}/{snapshot_date}/{username}/{filename}'
                             )
                             rcn_result = image_to_string(
                                 pil_image,
                                 lang='ase_chi_tra_3',
                                 config=r'--oem 3 --psm 6')  # run OCR
                             sim_text = traditional2simplified(
                                 rcn_result.replace(' ',
                                                    '').replace('\n',
                                                                ''))  # traditional to simplified
                             corrected_sent, _ = correct(sim_text)  # correct typos
                             tra_text = simplified2traditional(
                                 corrected_sent)  # simplified back to traditional
                             word_list = self.get_wordlist(tra_text)
                             keywords_list = self.get_keywordlist(word_list)
                         except Exception as e:
                             print(f'[Recognition Fail] Message:{e}')
                             continue
                         self.save_to_json({
                             'computer_id': username,
                             'snapshot_date': snapshot_date,
                             'snapshot_time': snapshot_time,
                             'address':
                             f'{snapshot_date}/{username}/{filename}',
                             'keywords': keywords_list,
                             'wordlist': word_list,
                             'rawtext': tra_text
                         })
                         f.truncate(0)  # clear the queue only after the save succeeded
                         f.write('\n'.join(filenames[1:]) +
                                 '\n')  # drop the first (finished) file and rewrite the queue
                     f.close()
             except Exception as e:
                 print(
                     f'[Get Lock Fail] To read temp_file.log (Message:{e})')
                 time.sleep(1)
                 continue
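
handler() only drains share_temp_file.log; the producer that fills it is not part of this example. A hypothetical writer side could look like the sketch below, using portalocker.Lock as a context manager so the lock is acquired on entry and released on exit; the path layout and function name are assumptions.

import portalocker


def publish_snapshot(scan_path, filename):
    # Append one snapshot filename to the shared file that handler() consumes.
    with portalocker.Lock(f'{scan_path}/share_temp_file.log', mode='a',
                          timeout=60, encoding='utf-8') as fh:
        fh.write(filename + '\n')

# e.g. publish_snapshot('/srv/snapshots', 'alice_2023-01-31_12-00-00.png')
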
Example #5
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
@description: convert the Chinese side of an English-Chinese parallel file to Simplified Chinese
"""

import pycorrector

with open('eng_chi.txt', encoding='utf-8') as f1, open('a.txt',
                                                       'w',
                                                       encoding='utf-8') as f2:
    for line in f1:
        line = line.strip()
        parts = line.split('\t')
        eng = parts[0]
        chi = parts[1]
        f2.write('src: ' + eng + "\n")
        f2.write('dst: ' + pycorrector.traditional2simplified(chi) + '\n')