def run():
    """
    Program entry point: copy stdin to stdout, repairing unparsable word lines.

    Lines that do not match WORD_ID_PTN are echoed untouched. Matching lines
    are parsed only as a validity check; when parsing fails with a
    'raw-morph mismatch' ParseError the line is replaced by the result of
    _recover(), and any other ParseError is propagated to the caller.
    """
    source_name = os.path.basename(sys.stdin.name)
    for lineno, raw_line in enumerate(sys.stdin, start=1):
        text = raw_line.rstrip('\r\n')
        if WORD_ID_PTN.match(text):
            try:
                # the parsed object is discarded; only the failure mode matters here
                Word.parse(text, source_name, lineno)
            except ParseError as par_err:
                if 'raw-morph mismatch' not in str(par_err):
                    raise par_err
                text = _recover(text)
        print(text)
def run():
    """
    Program entry point: copy stdin to stdout, patching word lines on the way.

    Lines that do not match WORD_ID_PTN are echoed untouched. Matching lines
    are parsed into a Word, handed to _attach_missing_symbol(), and the
    resulting Word is printed in place of the input line.
    """
    source_name = os.path.basename(sys.stdin.name)
    for lineno, raw_line in enumerate(sys.stdin, start=1):
        text = raw_line.rstrip('\r\n')
        if WORD_ID_PTN.match(text):
            parsed = Word.parse(text, source_name, lineno)
            _attach_missing_symbol(parsed)
            print(parsed)
        else:
            print(text)
def align_patch(rsc_src: Tuple[Aligner, Dict, Dict[str, int]], raw: str, morph_str: str) \
        -> List[int]:
    """
    Align the raw text of a patch with its analysis result, word by word.

    Args:
        rsc_src:  (Aligner, restore dic, vocab out) resource triple
        raw:  raw text
        morph_str:  morphological analysis result (in patch description format)

    Returns:
        output tag numbers derived from the alignment. An empty list is
        returned when the alignment fails or when aligning would require new
        restore-dictionary / output-vocabulary entries.
    """
    aligner, restore_dic, vocab_out = rsc_src
    raw_words = raw.strip().split()
    morphs = morph_str.split(' + ')

    # a patch may begin and/or end with a word or sentence boundary marker
    delims = [WORD_DELIM_STR, SENT_DELIM_STR]
    has_head_delim = morphs[0] in delims
    has_tail_delim = morphs[-1] in delims

    inner_morphs = morphs[1:] if has_head_delim else morphs
    if has_tail_delim:
        inner_morphs = inner_morphs[:-1]
    morph_words = _split_list(inner_morphs, WORD_DELIM_STR)

    # these collect entries missing from the existing resources during alignment
    restore_new = defaultdict(dict)
    vocab_new = defaultdict(list)

    tag_nums = []
    for raw_word, morph_word in zip(raw_words, morph_words):
        word_line = '\t'.join(['', raw_word, ' + '.join(morph_word)])
        word = Word.parse(word_line, '', 0)
        try:
            word_align = aligner.align(word)
            _, word_tag_nums = align_to_tag(raw_word, word_align,
                                            (restore_dic, restore_new),
                                            (vocab_out, vocab_new))
            if restore_new or vocab_new:
                logging.debug('needs dic update: %s', word)
                return []
        except AlignError as algn_err:
            logging.debug('alignment error: %s', word)
            logging.debug(str(algn_err))
            return []
        # separate consecutive words once something has already been emitted
        if tag_nums:
            tag_nums.append(WORD_DELIM_NUM)
        tag_nums.extend(word_tag_nums)

    # put back the boundary markers that were stripped before alignment
    if has_head_delim:
        if morphs[0] == WORD_DELIM_STR:
            head_num = WORD_DELIM_NUM
        else:
            head_num = SENT_DELIM_NUM
        tag_nums.insert(0, head_num)
    if has_tail_delim:
        if morphs[-1] == WORD_DELIM_STR:
            tail_num = WORD_DELIM_NUM
        else:
            tail_num = SENT_DELIM_NUM
        tag_nums.append(tail_num)
    return tag_nums
def run():
    """
    Program entry point: copy stdin to stdout, recovering word lines on the way.

    Lines that do not match WORD_ID_PTN are echoed untouched. Matching lines
    are parsed into a Word and passed to _recover(). An IndexError raised by
    _recover() is logged together with the file name and line number, and the
    Word is printed regardless.
    """
    source_name = os.path.basename(sys.stdin.name)
    for lineno, raw_line in enumerate(sys.stdin, start=1):
        text = raw_line.rstrip('\r\n')
        if WORD_ID_PTN.match(text) is None:
            print(text)
        else:
            parsed = Word.parse(text, source_name, lineno)
            try:
                _recover(parsed)
            except IndexError as idx_err:
                # best effort: report the failure but still emit the word
                logging.error('%s(%d): %s: %s', source_name, lineno, idx_err, parsed)
            print(parsed)