def _set_align(rsc_src: Tuple[Aligner, dict, Dict[str, int]], entries: List[Entry]):
    """
    Align the raw syllables of each entry with its morphological analyses.

    For every entry that is not flagged ``is_sharp``, the left and right sides are
    aligned against ``entry.raw`` and stored in ``entry.left_align`` and
    ``entry.right_align``. The right side is only attempted when the left side
    aligned successfully. An entry whose alignment comes back empty gets an
    ``err_msg`` and is handed to ``print_errors`` together with the other failures.

    Args:
        rsc_src:  (Aligner, restore dic, vocab out) resource triple
        entries:  list of entries (modified in place)
    """
    rejected = []
    for item in entries:
        if item.is_sharp:
            continue
        failure = None
        item.left_align = align_patch(rsc_src, item.raw, Morph.to_str(item.left))
        if item.left_align:
            item.right_align = align_patch(rsc_src, item.raw, Morph.to_str(item.right))
            if item.right_align:
                # both sides are aligned against the same raw text,
                # so the two alignments must have the same length
                assert len(item.left_align) == len(item.right_align)
            else:
                failure = 'fail to align right'
        else:
            failure = 'fail to align left'
        if failure is not None:
            item.err_msg = failure
            rejected.append(item)
    print_errors(rejected)
def _load_entries(args: Namespace) -> List[Entry]:
    """
    Load patch entries from files.

    Every file matching ``<args.rsc_src>/<args.model_size>.errpatch.*`` is read
    line by line. Empty lines are skipped and each remaining line is parsed into
    an ``Entry`` together with its file path and 1-based line number. Entries
    whose ``err_msg`` is set after construction are passed to ``print_errors``
    instead of being returned.

    Args:
        args:  program arguments (``rsc_src`` and ``model_size`` are read)
    Returns:
        list of entries that were parsed without an error message
    """
    good_entries = []
    bad_entries = []
    pattern = '{}/{}.errpatch.*'.format(args.rsc_src, args.model_size)
    for file_path in glob.glob(pattern):
        logging.info(os.path.basename(file_path))
        # use a context manager so each patch file is closed as soon as it has
        # been read, instead of leaking the handle until garbage collection
        with open(file_path, 'r', encoding='UTF-8') as fin:
            for line_num, line in enumerate(fin, start=1):
                line = line.rstrip('\r\n')
                if not line:
                    continue
                entry = Entry(file_path, line_num, line)
                if entry.err_msg:
                    bad_entries.append(entry)
                else:
                    good_entries.append(entry)
    print_errors(bad_entries)
    return good_entries
def _check_dup(entries: List[Entry]):
    """
    Check that there are no duplicated entries.

    Entries are compared by ``key_str()``. The first entry seen for a key is
    kept as the reference; every later entry with the same key gets an
    ``err_msg`` naming that first entry and is passed to ``print_errors``.

    Args:
        entries:  list of entries
    """
    bad_entries = []
    key_dic = {}
    for entry in entries:
        # compute the key once and do a single dictionary lookup per entry
        key = entry.key_str()
        dup_entry = key_dic.get(key)
        if dup_entry is None:
            key_dic[key] = entry
            continue
        entry.err_msg = '[DUPLICATED] with "{}"'.format(dup_entry)
        bad_entries.append(entry)
    print_errors(bad_entries)