Ejemplo n.º 1
0
def run(args: Namespace):
    """
    Align every word of every input sentence and print the results.

    Sentences come from ``sent_iter(args)``. When ``args.sample`` lies strictly
    between 0 and 1, a sentence is skipped whenever ``random.random()`` is not
    below that value. Words that fail to align get an empty mapping list, and
    if ``args.unmapped`` is set the alignment error is written to that file.

    Args:
        args:  program arguments
    """
    aligner = Aligner(args.rsc_src)
    funmap = open(args.unmapped, 'w',
                  encoding='UTF-8') if args.unmapped else None

    # try/finally guarantees the unmapped-word file is flushed and closed even
    # when iteration or alignment raises something other than AlignError
    try:
        for sent in sent_iter(args):
            if 0.0 < args.sample < 1.0 and random.random() >= args.sample:
                continue
            word_per_maps = []
            for word in sent.words:
                try:
                    maps = aligner.align(word)
                except AlignError as algn_err:
                    if funmap:
                        algn_err.add_msg(str(word))
                        print(algn_err, file=funmap)
                    # keep one (empty) slot per word so indexes stay in step
                    maps = []
                word_per_maps.append(maps)
            _print_sent(sent, word_per_maps)
    finally:
        if funmap is not None:
            funmap.close()

    _print_restore_dic(args)
    aligner.print_middle_cnt()
Ejemplo n.º 2
0
def run(args: Namespace):
    """
    Filter the patch lines read from standard input, echoing only valid ones.

    Every non-empty line that does not start with ``#`` is expected to hold
    three tab-separated fields: ``raw``, ``left`` and ``right``. A line is
    printed back to standard output only when

    * it has exactly three fields,
    * ``align_patch`` yields a non-empty result for both ``left`` and
      ``right`` and the two results have the same length, and
    * the counts accumulated by ``_cnt_pos_neg`` over the sentences of
      ``_sent_iter(args)`` contain at least one positive and no negative.

    Every rejected line is reported through ``logging.info`` and skipped.
    The program exits with status 1 when the restore dictionary is empty.

    Args:
        args:  program arguments
    """
    aligner = Aligner(args.rsc_src)
    restore_dic = load_restore_dic('{}/restore.dic'.format(args.rsc_src))
    if not restore_dic:
        sys.exit(1)
    vocab_out = load_vocab_out(args.rsc_src)
    # the same resource triple is passed to every helper below
    rsc = (aligner, restore_dic, vocab_out)

    # NOTE(review): the option string presumably turns off already-installed
    # error patches while validating new ones - confirm against the KhaiiiApi docs
    khaiii_api = KhaiiiApi(args.lib_path, args.rsc_dir, '{"errpatch": false}')

    for line_num, line in enumerate(sys.stdin, start=1):
        line = line.rstrip('\r\n')
        if not line or line[0] == '#':
            continue
        try:
            raw, left, right = line.split('\t')
        except ValueError:
            # wrong number of columns: report it like any other invalid line
            # instead of aborting the whole run with a traceback
            logging.info('invalid %d-th line: column count: %s', line_num, line)
            continue
        left_align = align_patch(rsc, raw, left)
        if not left_align:
            logging.info('invalid %d-th line: left align: %s', line_num, line)
            continue
        right_align = align_patch(rsc, raw, right)
        if not right_align:
            logging.info('invalid %d-th line: right align: %s', line_num, line)
            continue
        if len(left_align) != len(right_align):
            logging.info('invalid %d-th line: left/right diff: %s', line_num,
                         line)
            continue
        pos_cnt = 0
        neg_cnt = 0
        for sent in _sent_iter(args):
            pos_cnt_sent, neg_cnt_sent = _cnt_pos_neg(
                khaiii_api, raw, (left_align, right_align), rsc, sent)
            pos_cnt += pos_cnt_sent
            neg_cnt += neg_cnt_sent
            if neg_cnt > 0:
                # a single negative already invalidates the line
                break
        if neg_cnt > 0 or pos_cnt == 0:
            logging.info('invalid %d-th line: +%d, -%d: %s', line_num, pos_cnt,
                         neg_cnt, line)
            continue
        print('{}\t{}\t{}'.format(raw, left, right))
Ejemplo n.º 3
0
def run(args: Namespace):
    """
    Program entry point: load resources and entries, then save them as a trie.

    Exits with status 1 when the restore dictionary is empty and with status 2
    when no entry was loaded. Otherwise the entries are checked for duplicates,
    the ones flagged ``is_sharp`` are dropped, the rest are aligned and handed
    to ``_save_trie``.

    Args:
        args:  program arguments
    """
    aligner = Aligner(args.rsc_src)
    restore_dic_path = '{}/restore.dic'.format(args.rsc_src)
    restore_dic = load_restore_dic(restore_dic_path)
    if not restore_dic:
        sys.exit(1)
    vocab_out = load_vocab_out(args.rsc_src)

    loaded = _load_entries(args)
    if not loaded:
        logging.error('no entry to compile')
        sys.exit(2)
    _check_dup(loaded)

    # entries that were commented out (is_sharp) are excluded
    active = []
    for entry in loaded:
        if entry.is_sharp:
            continue
        active.append(entry)

    resources = (aligner, restore_dic, vocab_out)
    _set_align(resources, active)
    _save_trie(args.rsc_dir, active)
Ejemplo n.º 4
0
def run(args: Namespace):
    """
    Program entry point: align the entries, set their output tags and save them.

    Exits with status 1 when the parsed restore dictionary is empty. Restore
    and vocabulary items collected by ``_set_tag_out`` are passed to
    ``append_new_entries`` before the entries are handed to ``_save_trie``.

    Args:
        args:  program arguments
    """
    aligner = Aligner(args.rsc_src)
    restore_path = '{}/restore.dic'.format(args.rsc_src)
    restore_dic = parse_restore_dic(restore_path)
    if not restore_dic:
        sys.exit(1)
    vocab_out = load_vocab_out(args.rsc_src)

    # containers filled in by _set_tag_out with newly required items
    restore_new = defaultdict(dict)
    vocab_new = dict()

    loaded = _load_entries(args)
    _check_dup(loaded)

    # entries that were commented out (is_sharp) are excluded
    active = []
    for entry in loaded:
        if entry.is_sharp:
            continue
        active.append(entry)

    _set_align(aligner, sejong_corpus.Word, active)
    _set_tag_out(restore_dic, restore_new, vocab_out, vocab_new, active)

    append_new_entries(args.rsc_src, restore_new, vocab_new)
    _save_trie(args.rsc_dir, active)
Ejemplo n.º 5
0
def _set_align(aligner: Aligner, Word: type, entries: List[Entry]):  # pylint: disable=invalid-name
    """
    Align the syllables of each entry with its morphological analysis result.

    Entries flagged ``is_sharp`` are skipped. On success the alignment is
    stored in ``entry.align``; on failure ``entry.err_msg`` is set, the error
    is logged, and the entry is reported through ``print_errors``.

    Args:
        aligner:  Aligner object
        Word:  Word type
        entries:  list of entries
    """
    failed = []
    for item in entries:
        if not item.is_sharp:
            # build a tab-separated line (empty first field, word, morphemes)
            # and parse it into a Word
            fields = ('', item.word, Morph.to_str(item.morphs))
            parsed = Word.parse('\t'.join(fields), '', 0)
            try:
                item.align = aligner.align(parsed)
            except AlignError as align_exc:
                item.err_msg = 'fail to align'
                logging.error(align_exc)
                failed.append(item)
    print_errors(failed)