def _recover(word: Word):
    """
    Restore the quotation characters of the raw text in the word's quotation morphemes.

    Quotation characters are the keys of ``_QUOT_NORM``. Morphemes tagged 'SS' whose
    ``lex`` is such a key are paired, in order, with the quotation characters found in
    ``word.raw``. When every pair agrees after normalization through ``_QUOT_NORM``, each
    morpheme's ``lex`` is overwritten with the exact character used in the raw text.

    Nothing is modified when the pairing fails; the problem is reported through
    ``logging.error`` instead.

    Args:
        word:  Word object; the ``lex`` of its quotation morphemes may be changed in place
    """
    raw_quots = [char for char in word.raw if char in _QUOT_NORM]

    quot_morphs = []
    for morph in word.morphs:
        if morph.tag != 'SS' or morph.lex not in _QUOT_NORM:
            continue
        quot_idx = len(quot_morphs)
        quot_morphs.append(morph)
        # the n-th quotation morpheme must normalize to the same value as the n-th
        # quotation character of the raw text
        if (quot_idx >= len(raw_quots)
                or _QUOT_NORM[raw_quots[quot_idx]] != _QUOT_NORM[morph.lex]):
            logging.error('%d-th quots are different: %s', quot_idx + 1, word)
            return

    if len(raw_quots) != len(quot_morphs):
        # There is no one-to-one pairing to rewrite. Report it only when the quotation
        # characters of the whole analysis string disagree with the raw text, and stop in
        # either case: the character list below must never reach the rewrite loop, which
        # expects morpheme objects.
        analyzed_quots = [char for char in word.morph_str() if char in _QUOT_NORM]
        if raw_quots != analyzed_quots:
            logging.error('number of quots are different: %s', word)
        return

    for raw_quot, morph in zip(raw_quots, quot_morphs):
        if morph.lex != raw_quot:
            morph.lex = raw_quot
def run():
    """
    Program entry point: copy stdin to stdout, repairing raw-morph mismatches.

    Lines that do not match ``WORD_ID_PTN`` are echoed unchanged. Matching lines are
    parsed with ``Word.parse``; when parsing fails with a ``ParseError`` whose message
    contains 'raw-morph mismatch', the line is replaced by the result of ``_recover``
    before being printed. Any other ``ParseError`` is re-raised.
    """
    source_name = os.path.basename(sys.stdin.name)
    for line_no, raw_line in enumerate(sys.stdin, start=1):
        text = raw_line.rstrip('\r\n')
        if WORD_ID_PTN.match(text):
            try:
                # parsed only for validation; the line itself is what gets printed
                Word.parse(text, source_name, line_no)
            except ParseError as par_err:
                if 'raw-morph mismatch' not in str(par_err):
                    raise par_err
                text = _recover(text)
        print(text)
def run():
    """
    Program entry point: copy stdin to stdout, passing each word through
    ``_attach_missing_symbol``.

    Lines that do not match ``WORD_ID_PTN`` are echoed unchanged. Matching lines are
    parsed into a ``Word``, handed to ``_attach_missing_symbol`` and then printed.
    """
    source_name = os.path.basename(sys.stdin.name)
    for line_no, raw_line in enumerate(sys.stdin, start=1):
        text = raw_line.rstrip('\r\n')
        if WORD_ID_PTN.match(text):
            word = Word.parse(text, source_name, line_no)
            _attach_missing_symbol(word)
            print(word)
        else:
            print(text)
def align_patch(
        rsc_src: Tuple[Aligner, Dict, Dict[str, int]],
        raw: str,
        morph_str: str,
) -> List[int]:
    """
    Align the raw text of a patch with its analysis result, word by word.

    Args:
        rsc_src:  (Aligner, restore dic, vocab out) resource triple
        raw:  raw text
        morph_str:  morphological analysis result (in patch description format)

    Returns:
        output tag numbers based on the alignment. An empty list is returned when a word
        cannot be aligned (``AlignError``) or when ``align_to_tag`` left anything in the
        "new" restore dictionary / vocabulary collectors.
    """
    def _delim_num(morph):
        """Map a delimiter morpheme to its tag number."""
        return WORD_DELIM_NUM if morph == WORD_DELIM_STR else SENT_DELIM_NUM

    aligner, restore_dic, vocab_out = rsc_src
    raw_words = raw.strip().split()
    morphs = morph_str.split(' + ')

    delims = [WORD_DELIM_STR, SENT_DELIM_STR]
    first_morph, last_morph = morphs[0], morphs[-1]
    starts_with_delim = first_morph in delims
    ends_with_delim = last_morph in delims

    # leading/trailing delimiters are not part of any word; they are re-attached as tag
    # numbers once the words in between have been aligned
    inner_morphs = morphs[1:] if starts_with_delim else morphs
    if ends_with_delim:
        inner_morphs = inner_morphs[:-1]
    morph_words = _split_list(inner_morphs, WORD_DELIM_STR)

    # empty collectors handed to align_to_tag; presumably it records here the entries
    # that are missing from the given dictionaries (see the 'needs dic update' log)
    restore_new = defaultdict(dict)
    vocab_new = defaultdict(list)

    tag_nums = []
    for raw_word, morph_word in zip(raw_words, morph_words):
        word_line = '\t'.join(['', raw_word, ' + '.join(morph_word)])
        word = Word.parse(word_line, '', 0)
        try:
            word_align = aligner.align(word)
            _, word_tag_nums = align_to_tag(raw_word, word_align,
                                            (restore_dic, restore_new),
                                            (vocab_out, vocab_new))
        except AlignError as algn_err:
            logging.debug('alignment error: %s', word)
            logging.debug(str(algn_err))
            return []
        else:
            if restore_new or vocab_new:
                logging.debug('needs dic update: %s', word)
                return []
        if tag_nums:
            # separate this word's tags from those collected so far
            tag_nums.append(WORD_DELIM_NUM)
        tag_nums.extend(word_tag_nums)

    if starts_with_delim:
        tag_nums.insert(0, _delim_num(first_morph))
    if ends_with_delim:
        tag_nums.append(_delim_num(last_morph))
    return tag_nums
def run():
    """
    Program entry point: copy stdin to stdout, passing each word through ``_recover``.

    Lines that do not match ``WORD_ID_PTN`` are echoed unchanged. Matching lines are
    parsed into a ``Word`` and handed to ``_recover``. An ``IndexError`` raised by
    ``_recover`` is logged together with the file name and line number, and the word is
    printed in whatever state it is in.
    """
    source_name = os.path.basename(sys.stdin.name)
    for line_no, raw_line in enumerate(sys.stdin, start=1):
        text = raw_line.rstrip('\r\n')
        if WORD_ID_PTN.match(text):
            word = Word.parse(text, source_name, line_no)
            try:
                _recover(word)
            except IndexError as idx_err:
                logging.error('%s(%d): %s: %s', source_name, line_no, idx_err, word)
            print(word)
        else:
            print(text)
def _recover(word: Word):
    """
    Restore the ASCII letters of the morphemes from the letters of the raw word.

    The ASCII letters ([a-zA-Z]) of ``word.raw`` and those found in the morphemes' ``lex``
    are paired in order; every morpheme letter that differs from its raw counterpart is
    replaced by the raw one. The work is done on a deep copy, and ``word.morphs`` is only
    replaced (and the change logged) when at least one letter was changed.

    Args:
        word:  Word object

    Raises:
        IndexError: when the morphemes contain more ASCII letters than the raw text;
            ``word`` is left unmodified in that case
    """
    is_letter = re.compile(r'[a-zA-Z]').match
    raw_letters = [char for char in word.raw if is_letter(char)]

    fixed_word = copy.deepcopy(word)
    next_letter = 0    # position in raw_letters of the next letter to compare
    changed = False
    for morph in fixed_word.morphs:
        lex_chars = list(morph.lex)
        lex_changed = False
        for pos, char in enumerate(morph.lex):
            if not is_letter(char):
                continue
            raw_letter = raw_letters[next_letter]
            next_letter += 1
            if raw_letter != char:
                lex_chars[pos] = raw_letter
                lex_changed = True
        if lex_changed:
            morph.lex = ''.join(lex_chars)
            changed = True

    if changed:
        logging.info('%s => %s', str(word), fixed_word.morph_str())
        word.morphs = fixed_word.morphs