def _recover(word: Word):
    """
    Put the quotation characters found in the raw word back into the
    quotation morphemes, in place.

    A morpheme counts as a quotation when its tag is 'SS' and its lex is a
    key of _QUOT_NORM. The n-th quotation morpheme is paired with the n-th
    quotation character of word.raw; the pair must map to the same value in
    _QUOT_NORM, otherwise an error is logged and nothing is modified.
    Args:
        word:  Word object
    """
    raw_quots = [char for char in word.raw if char in _QUOT_NORM]
    quot_morphs = []
    for morph in word.morphs:
        is_quot = morph.tag == 'SS' and morph.lex in _QUOT_NORM
        if not is_quot:
            continue
        pos = len(quot_morphs)
        quot_morphs.append(morph)
        # the raw word must have a quotation at this ordinal position and it
        # must normalize to the same value as the morpheme's quotation
        if pos >= len(raw_quots) or \
                _QUOT_NORM[raw_quots[pos]] != _QUOT_NORM[morph.lex]:
            logging.error('%d-th quots are different: %s', pos + 1, word)
            return

    if len(raw_quots) != len(quot_morphs):
        # counts disagree: only report it when the quotation characters of
        # the whole morpheme string also differ from those of the raw word
        str_quots = [char for char in word.morph_str() if char in _QUOT_NORM]
        if raw_quots != str_quots:
            logging.error('number of quots are different: %s', word)
        return

    for raw_quot, morph in zip(raw_quots, quot_morphs):
        if raw_quot != morph.lex:
            morph.lex = raw_quot
Beispiel #2
0
def run():
    """
    Entry point: copy standard input to standard output, trying to recover
    word lines that fail to parse with a 'raw-morph mismatch' error.

    Lines that do not match WORD_ID_PTN are printed unchanged. Any other
    ParseError is re-raised.
    """
    file_name = os.path.basename(sys.stdin.name)
    for line_no, raw_line in enumerate(sys.stdin, start=1):
        line = raw_line.rstrip('\r\n')
        if WORD_ID_PTN.match(line):
            try:
                # parsed only for validation; the result is not used
                Word.parse(line, file_name, line_no)
            except ParseError as par_err:
                if 'raw-morph mismatch' not in str(par_err):
                    raise par_err
                # NOTE(review): _recover receives the line string here and its
                # return value replaces the line -- confirm that the _recover
                # in scope takes a str and returns the recovered line
                line = _recover(line)
        print(line)
def run():
    """
    Entry point: read standard input line by line, pass every word line
    through _attach_missing_symbol and print the result.

    Lines that do not match WORD_ID_PTN are printed unchanged.
    """
    file_name = os.path.basename(sys.stdin.name)
    for line_no, raw_line in enumerate(sys.stdin, start=1):
        line = raw_line.rstrip('\r\n')
        if WORD_ID_PTN.match(line):
            word = Word.parse(line, file_name, line_no)
            _attach_missing_symbol(word)
            print(word)
        else:
            print(line)
Beispiel #4
0
def align_patch(rsc_src: Tuple[Aligner, Dict, Dict[str, int]], raw: str, morph_str: str) \
        -> List[int]:
    """
    Align the raw text of a patch with its analysis result, word by word,
    and collect the output tag numbers.
    Args:
        rsc_src:  (Aligner, restore dic, vocab out) resource triple
        raw:  raw text
        morph_str:  morphological analysis result (patch description format)
    Returns:
        output tag numbers based on the alignment. An empty list is returned
        when a word fails to align or when the alignment would require new
        entries in the restore dictionary or the output vocabulary.
    """
    aligner, restore_dic, vocab_out = rsc_src
    delims = [WORD_DELIM_STR, SENT_DELIM_STR]

    raw_words = raw.strip().split()
    morphs = morph_str.split(' + ')

    # drop a leading and/or trailing delimiter before splitting into words
    inner_morphs = morphs[1:] if morphs[0] in delims else morphs
    if morphs[-1] in delims:
        inner_morphs = inner_morphs[:-1]
    morph_words = _split_list(inner_morphs, WORD_DELIM_STR)

    # align_to_tag receives these next to the existing resources; anything
    # left in them afterwards is treated as "dictionary needs an update"
    restore_new = defaultdict(dict)
    vocab_new = defaultdict(list)

    tag_nums = []
    for raw_word, morph_word in zip(raw_words, morph_words):
        word_line = '\t'.join(['', raw_word, ' + '.join(morph_word)])
        word = Word.parse(word_line, '', 0)
        try:
            word_align = aligner.align(word)
            _, word_tag_nums = align_to_tag(raw_word, word_align,
                                            (restore_dic, restore_new),
                                            (vocab_out, vocab_new))
        except AlignError as algn_err:
            logging.debug('alignment error: %s', word)
            logging.debug(str(algn_err))
            return []
        if restore_new or vocab_new:
            logging.debug('needs dic update: %s', word)
            return []
        # separate consecutive words with a word delimiter tag
        if tag_nums:
            tag_nums.append(WORD_DELIM_NUM)
        tag_nums.extend(word_tag_nums)

    # put back the tags of the delimiters stripped from both ends
    first, last = morphs[0], morphs[-1]
    if first in delims:
        head_num = WORD_DELIM_NUM if first == WORD_DELIM_STR else SENT_DELIM_NUM
        tag_nums.insert(0, head_num)
    if last in delims:
        tail_num = WORD_DELIM_NUM if last == WORD_DELIM_STR else SENT_DELIM_NUM
        tag_nums.append(tail_num)
    return tag_nums
def run():
    """
    Entry point: read standard input line by line, apply _recover to every
    word line and print the result.

    Lines that do not match WORD_ID_PTN are printed unchanged. An IndexError
    raised by _recover is logged with the file name and line number, and the
    word is still printed.
    """
    file_name = os.path.basename(sys.stdin.name)
    for line_no, raw_line in enumerate(sys.stdin, start=1):
        line = raw_line.rstrip('\r\n')
        if not WORD_ID_PTN.match(line):
            print(line)
        else:
            word = Word.parse(line, file_name, line_no)
            try:
                _recover(word)
            except IndexError as idx_err:
                logging.error('%s(%d): %s: %s', file_name, line_no, idx_err, word)
            print(word)
def _recover(word: Word):
    """
    Overwrite the ASCII letters of the morphemes with the ASCII letters of
    the raw word.

    The n-th ASCII letter found across the morpheme lexes is compared with
    the n-th ASCII letter of word.raw and replaced by it when they differ.
    The work is done on a deep copy; word.morphs is replaced only if at
    least one letter was changed.
    Args:
        word:  Word object
    Raises:
        IndexError:  when the morphemes contain more ASCII letters than the
                     raw word
    """
    raw_letters = [char for char in word.raw if re.match(r'[a-zA-Z]', char)]
    fixed = copy.deepcopy(word)
    changed = False
    cursor = 0    # position of the next unused letter in raw_letters
    for morph in fixed.morphs:
        # enumerate() walks the lex as it was on entry; each replacement below
        # keeps the length, so positions stay valid
        for pos, char in enumerate(morph.lex):
            if re.match(r'[a-zA-Z]', char) is None:
                continue
            raw_letter = raw_letters[cursor]
            cursor += 1
            if raw_letter != char:
                morph.lex = morph.lex[:pos] + raw_letter + morph.lex[pos + 1:]
                changed = True
    if not changed:
        return
    logging.info('%s  =>  %s', str(word), fixed.morph_str())
    word.morphs = fixed.morphs