Example 1
def test_parser_is_nonproj_tree(
    proj_tree, nonproj_tree, partial_tree, multirooted_tree
):
    assert is_nonproj_tree(proj_tree) is False
    assert is_nonproj_tree(nonproj_tree) is True
    assert is_nonproj_tree(partial_tree) is False
    assert is_nonproj_tree(multirooted_tree) is True
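Example 1 takes its trees as pytest fixtures, whose definitions are not shown on this page. Examples 2 and 3 below inline the same head arrays, so a conftest.py along these lines would reproduce them (a sketch, not necessarily the project's actual fixtures):

import pytest


@pytest.fixture
def proj_tree():
    return [1, 2, 2, 4, 5, 2, 7, 5, 2]


@pytest.fixture
def nonproj_tree():
    return [1, 2, 2, 4, 5, 2, 7, 4, 2]


@pytest.fixture
def partial_tree():
    return [1, 2, 2, 4, 5, None, 7, 4, 2]


@pytest.fixture
def multirooted_tree():
    return [3, 2, 0, 3, 3, 7, 7, 3, 7, 10, 7, 10, 11, 12, 18, 16, 18, 17, 12, 3]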
Example 2
def test_is_nonproj_tree():
    # is_nonproj_tree comes from spaCy (spacy.pipeline._parser_internals.nonproj
    # in v3, spacy.syntax.nonproj in v2); each list maps token index -> head index.
    proj_tree = [1, 2, 2, 4, 5, 2, 7, 5, 2]
    nonproj_tree = [1, 2, 2, 4, 5, 2, 7, 4, 2]
    partial_tree = [1, 2, 2, 4, 5, None, 7, 4, 2]
    multirooted_tree = [3, 2, 0, 3, 3, 7, 7, 3, 7, 10, 7, 10, 11, 12, 18, 16, 18, 17, 12, 3]
    assert is_nonproj_tree(proj_tree) is False
    assert is_nonproj_tree(nonproj_tree) is True
    assert is_nonproj_tree(partial_tree) is False
    assert is_nonproj_tree(multirooted_tree) is True
Example 3
def test_is_nonproj_tree():
    proj_tree = [1, 2, 2, 4, 5, 2, 7, 5, 2]
    nonproj_tree = [1, 2, 2, 4, 5, 2, 7, 4, 2]
    partial_tree = [1, 2, 2, 4, 5, None, 7, 4, 2]
    multirooted_tree = [
        3, 2, 0, 3, 3, 7, 7, 3, 7, 10, 7, 10, 11, 12, 18, 16, 18, 17, 12, 3
    ]
    assert is_nonproj_tree(proj_tree) is False
    assert is_nonproj_tree(nonproj_tree) is True
    assert is_nonproj_tree(partial_tree) is False
    assert is_nonproj_tree(multirooted_tree) is True
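In these head arrays, heads[i] is the index of token i's head; a self-loop (heads[i] == i) marks a root and None an unattached token. spaCy's real implementation is Cython; the following minimal pure-Python sketch of the same Havelka-style definition is illustrative only, but it reproduces the four asserts above:

def _has_head_as_ancestor(tokenid, head, heads):
    # Walk the head chain upward; unattached (None) heads count as reachable,
    # and the loop is bounded so roots and cycles terminate.
    ancestor = tokenid
    for _ in range(len(heads)):
        if heads[ancestor] == head or heads[ancestor] is None:
            return True
        ancestor = heads[ancestor]
    return False


def is_nonproj_arc(tokenid, heads):
    # Havelka (2007): the arc h -> d is non-projective if some token k
    # strictly between h and d does not have h among its ancestors.
    head = heads[tokenid]
    if head is None or head == tokenid:  # unattached tokens and roots are projective
        return False
    start, end = (head + 1, tokenid) if head < tokenid else (tokenid + 1, head)
    return any(not _has_head_as_ancestor(k, head, heads) for k in range(start, end))


def is_nonproj_tree(heads):
    return any(is_nonproj_arc(i, heads) for i in range(len(heads)))

The multirooted tree is flagged because its second root (token 17) sits strictly inside another arc's span and no head chain from it ever reaches that arc's head.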
Example 4
def convert_lines(path, lines, tokenizer, paragraph_id_regex, n_sents):
    paragraphs = []
    raw = ''
    sentences = []
    paragraph_id = None
    sentence_id = None
    sentence = ''
    tokens = []
    ents = []
    ent_start_char = None
    ent_label = None
    offset = 0
    state = 'sid'

    def error_line(_state, _path, _line_index, _sentence_id, _sentence, _line):
        # Always raises, so the defensive `return []` after each call site is unreachable.
        print('Illegal format: state={}, file={} ({}), sent_id={}, {}'.format(
            _state, _path, _line_index + 1, _sentence_id, _sentence
        ), file=sys.stderr)
        print(_line, file=sys.stderr)
        raise ValueError

    for line_index, line in enumerate(lines):
        line = line.rstrip()

        if state == 'sid':
            m = SID_PATTERN.match(line)
            if m is None:
                error_line(state, path, line_index, sentence_id, sentence, line)
                return []

            sentence_id = m.group(1)
            m = re.match(paragraph_id_regex, sentence_id)
            if m:
                new_paragraph_id = m.group(1)
            else:
                new_paragraph_id = ''
            if paragraph_id is None or paragraph_id != new_paragraph_id:
                paragraph_id = new_paragraph_id
                if sentences:
                    paragraphs.append({
                        'raw': raw,
                        'sentences': sentences,
                    })
                    raw = ''
                    sentences = []

            state = 'text'

        elif state == 'text':
            m = TEXT_PATTERN.match(line)
            if m is None:
                error_line(state, path, line_index, sentence_id, sentence, line)
                return []

            sentence = m.group(1)
            raw += sentence
            state = 'ios'

        elif state == 'ios' and line != '':
            m = TOKEN_PATTERN.match(line)
            if m is None:
                error_line(state, path, line_index, sentence_id, sentence, line)
                return []

            token_id = int(m.group(1)) - 1
            orth = m.group(2)
            lemma = m.group(3)
            pos = m.group(4)
            tag = m.group(5)
            head_id = int(m.group(7)) - 1
            if head_id < 0:
                head_id = token_id
            dep = m.group(8)
            options = m.group(10)
            whitespace = options.find('SpaceAfter=No') < 0  # trailing space unless SpaceAfter=No
            tokens.append({
                'id': token_id,
                'orth': orth,
                'lemma': lemma,
                'pos': pos,
                'tag': tag,
                'dep': dep,
                'head': head_id - token_id,
                'whitespace': whitespace,
            })
            m = re.search(r'NE=([^|]+)', options)
            if m:
                label = m.group(1)
                if label.startswith('B-'):
                    if ent_label:
                        ents.append({
                            'start': ent_start_char,
                            'end': offset,
                            'label': ent_label,
                        })
                    ent_start_char = offset
                    ent_label = label[2:]
                elif not label.startswith('I-') or not ent_label:
                    raise Exception('Bad NE label: ' + line)
            elif ent_label:
                ents.append({
                    'start': ent_start_char,
                    'end': offset,
                    'label': ent_label,
                })
                ent_start_char = None
                ent_label = None
            offset += len(orth)
            if whitespace:
                offset += 1

        elif state == 'ios' and line == '':
            if len(tokens) == 0:
                error_line(state, path, line_index, sentence_id, sentence, line)
                return []
            if ent_label:
                ents.append({
                    'start': ent_start_char,
                    'end': offset,
                    'label': ent_label,
                })

            heads = [t['id'] + t['head'] for t in tokens]
            if is_nonproj_tree(heads):
                print(file=sys.stderr)
                print('skip(non-projective):', path, sentence_id, file=sys.stderr)
            elif contains_cycle(heads):
                print(file=sys.stderr)
                print('skip(cyclic)', path, sentence_id, file=sys.stderr)
            else:
                if tokenizer:
                    retokenize(tokens, tokenizer(
                        ''.join([t['orth'] + (' ' if t['whitespace'] else '') for t in tokens])
                    ))
                offset = 0
                ent_label = None
                ent_end = 0
                ent_queue = []
                for t in tokens:
                    end = offset + len(t['orth'])
                    if t['whitespace']:
                        end += 1
                    if ent_end > 0:
                        if offset < ent_end:
                            ent_queue.append(t)
                            offset = end
                            continue
                        if end >= ent_end:
                            if len(ent_queue) == 1:
                                ent_queue[0]['ner'] = 'U-' + ent_label
                            else:
                                ent_queue[0]['ner'] = 'B-' + ent_label
                                for et in ent_queue[1:-1]:
                                    et['ner'] = 'I-' + ent_label
                                ent_queue[-1]['ner'] = 'L-' + ent_label
                            ent_label = None
                            ent_end = 0
                            ent_queue.clear()
                    for ent in ents:
                        if ent['start'] < end and offset < ent['end']:
                            ent_label = ent['label']
                            ent_end = ent['end']
                            ent_queue.append(t)
                            break
                    offset = end
                if ent_end > 0:
                    if len(ent_queue) == 1:
                        ent_queue[0]['ner'] = 'U-' + ent_label
                    else:
                        ent_queue[0]['ner'] = 'B-' + ent_label
                        for et in ent_queue[1:-1]:
                            et['ner'] = 'I-' + ent_label
                        ent_queue[-1]['ner'] = 'L-' + ent_label
                for t in tokens:
                    if 'ner' not in t:
                        t['ner'] = 'O'

                sentences.append({'tokens': tokens})
                if len(sentences) >= n_sents:
                    paragraphs.append({
                        'raw': raw,
                        'sentences': sentences,
                    })
                    raw = ''
                    sentences = []

            sentence_id = None
            sentence = ""
            tokens = []
            ents = []
            ent_start_char = None
            ent_label = None
            offset = 0
            state = 'sid'

        else:
            error_line(state, path, line_index, sentence_id, sentence, line)
            return []

    if state != 'sid':
        error_line(state, path, len(lines), sentence_id, sentence, '<END OF FILE>')
        return []

    if sentences:
        paragraphs.append({
            'raw': raw,
            'sentences': sentences,
        })

    return paragraphs
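Note the head encoding used throughout: each token stores the relative offset 'head': head_id - token_id, and the absolute index is recovered as t['id'] + t['head'] before the projectivity and cycle checks. A tiny self-contained illustration (values hypothetical):

tokens = [
    {'id': 0, 'head': 1},   # token 0 attaches to token 1
    {'id': 1, 'head': 0},   # offset 0 = self = sentence root
    {'id': 2, 'head': -1},  # token 2 attaches back to token 1
]
heads = [t['id'] + t['head'] for t in tokens]
assert heads == [1, 1, 1]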
Example 5
def retokenize(gold_tokens, doc, debug=False):
    if debug:
        print(doc.text)
        print([g['orth'] + (' ' if g['whitespace'] else '') for g in gold_tokens])
        print([t.orth_ + t.whitespace_ for t in doc])
    length = len(doc.text)
    index_g = 0
    g_offset = 0
    index_t = 0
    t_offset = 0
    align_from_g = None
    align_from_t = None
    last_aligned_g = 0
    last_aligned_t = 0
    while g_offset < length and t_offset < length:
        g = gold_tokens[index_g]
        g_end = g_offset + len(g['orth'])
        if g['whitespace']:
            g_end += 1
        t = doc[index_t]
        t_end = t_offset + len(t.orth_)
        if t.whitespace_:
            t_end += 1
        if debug:
            print(index_g, g_offset, g_end, g['orth'], align_from_g, index_t, t_offset, t_end, t.orth_, align_from_t)
        if g_end == t_end:
            if align_from_t is not None:
                if debug:
                    _print('>', gold_tokens[index_g:index_g + 1], doc[align_from_t:index_t + 1])
                rewrite_with_tokens(gold_tokens, index_g, doc[align_from_t:index_t + 1])
                index_g += index_t - align_from_t
                align_from_t = None
            elif align_from_g is not None:
                if debug:
                    _print('<', gold_tokens[align_from_g:index_g + 1], doc[index_t:index_t + 1])
                if unify_range(gold_tokens, align_from_g, index_g + 1, doc[index_t]):
                    index_g = align_from_g
                align_from_g = None
            elif g_offset == t_offset:
                if debug:
                    tag = g['tag'] == t.tag_
                    _print(
                        '==' if tag else '=',
                        gold_tokens[index_g:index_g + 1],
                        doc[index_t:index_t + 1]
                    )
                rewrite_with_tokens(gold_tokens, index_g, doc[index_t:index_t + 1])
            else:
                if debug:
                    _print('!', gold_tokens[last_aligned_g:index_g + 1], doc[last_aligned_t:index_t + 1])
            index_g += 1
            g_offset = g_end
            last_aligned_g = index_g
            index_t += 1
            t_offset = t_end
            last_aligned_t = index_t
        elif g_end > t_end:
            if g_offset == t_offset:
                align_from_t = index_t
            if align_from_g is not None:
                align_from_g = None
            index_t += 1
            t_offset = t_end
        else:
            if g_offset == t_offset:
                align_from_g = index_g
            if align_from_t is not None:
                align_from_t = None
            index_g += 1
            g_offset = g_end
    if last_aligned_g != len(gold_tokens) or g_offset != length or t_offset != length:
        raise Exception(
            'Unexpected state: len(gold_tokens)={}, last_aligned_g={}, length={}, g_offset={}, t_offset={}'.format(
                len(gold_tokens),
                last_aligned_g,
                length,
                g_offset,
                t_offset,
            )
        )
    for g in gold_tokens:
        if g['head'] != 0 and g['tag'].endswith('可能') and g['dep'].find('as_') == -1:
            g['dep'] = '{}_as_{}'.format(g['dep'], g['pos'])
    heads = [g['id'] + g['head'] for g in gold_tokens]
    if is_nonproj_tree(heads):
        print(list(enumerate(heads)), file=sys.stderr)
        for t in gold_tokens:
            print(t, file=sys.stderr)
        raise Exception('non-projective')
    elif contains_cycle(heads):
        print(list(enumerate(heads)), file=sys.stderr)
        for t in gold_tokens:
            print(t, file=sys.stderr)
        raise Exception('cyclic')
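retokenize() sweeps the gold tokens and the tokenizer's Doc in parallel by character offset: when both current tokens end at the same character the pending group is aligned, otherwise the side that ends earlier keeps advancing, which yields the '>' (merge) and '<' (split) cases handled above. A self-contained sketch of that two-pointer sweep on plain strings (align is a hypothetical helper, not part of the code above; whitespace is assumed folded into the token strings):

def align(gold, sys):
    # Both tokenizations cover the same text; advance whichever side
    # ends earlier and emit an aligned group whenever the offsets meet.
    pairs = []
    ig = it = 0    # token indices
    go = so = 0    # character offsets
    bg = bs = 0    # start of the current unaligned group on each side
    while ig < len(gold) and it < len(sys):
        g_end = go + len(gold[ig])
        s_end = so + len(sys[it])
        if g_end == s_end:      # offsets meet: close the group
            pairs.append((gold[bg:ig + 1], sys[bs:it + 1]))
            ig, it = ig + 1, it + 1
            go, so = g_end, s_end
            bg, bs = ig, it
        elif g_end < s_end:     # gold token ends first: keep scanning gold
            ig, go = ig + 1, g_end
        else:                   # system token ends first: keep scanning sys
            it, so = it + 1, s_end
    return pairs

# '京都府' vs '京都' + '府' is a 1:2 split, the '<' branch above:
assert align(['京都府', 'に'], ['京都', '府', 'に']) == [
    (['京都府'], ['京都', '府']),
    (['に'], ['に']),
]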
Example 6
def test_parser_is_nonproj_tree(proj_tree, nonproj_tree, partial_tree,
                                multirooted_tree):
    assert is_nonproj_tree(proj_tree) is False
    assert is_nonproj_tree(nonproj_tree) is True
    assert is_nonproj_tree(partial_tree) is False
    assert is_nonproj_tree(multirooted_tree) is True
Example 7
def convert_lines(
        path,
        lines,
        tokenizer,
        paragraph_id_regex,
        n_sents,
        extend_dep_labels,
        ensure_end_period,
        luw_ent,
        _print_bunsetu_dep=False,
):
    paragraphs = []
    raw = ''
    sentences = []
    paragraph_id = None
    sentence_id = None
    sentence = ''
    tokens = []
    bunsetu_head_deps = {}
    bunsetu_all_deps = {}
    bunsetu_begin = None
    bunsetu_root = None
    bunsetu_head = None
    bunsetu_heads = None
    bunsetu_dep = None
    ent_target = False
    ents = []
    ent_start_char = None
    ent_end_char = None
    ent_label = None
    skip = False
    offset = 0
    state = 'sid'

    def error_line(_state, _path, _line_index, _sentence_id, _sentence, _line):
        print('Illegal format: state={}, file={} ({}), sent_id={}, {}'.format(
            _state, _path, _line_index + 1, _sentence_id, _sentence
        ), file=sys.stderr)
        print(_line, file=sys.stderr)
        raise ValueError

    for line_index, line in enumerate(lines):
        line = line.rstrip()

        if state == 'sid':
            m = SID_PATTERN.match(line)
            if m is None:
                m = NEW_DOC_ID_PATTERN.match(line)
                if m is not None:
                    continue
                error_line(state, path, line_index, sentence_id, sentence, line)
                return []

            sentence_id = m.group(1)
            m = re.match(paragraph_id_regex, sentence_id)
            if m:
                new_paragraph_id = m.group(1)
            else:
                new_paragraph_id = ''
            if paragraph_id is None or paragraph_id != new_paragraph_id:
                paragraph_id = new_paragraph_id
                if sentences:
                    paragraphs.append({
                        'raw': raw,
                        'sentences': sentences,
                    })
                    raw = ''
                    sentences = []

            state = 'text'

        elif state == 'text':
            m = TEXT_PATTERN.match(line)
            if m is None:
                error_line(state, path, line_index, sentence_id, sentence, line)
                return []

            sentence = m.group(1)
            state = 'ios'

        elif state == 'ios' and line != '':
            m = TOKEN_PATTERN.match(line)
            if m is None:
                m = TEXT_EN_PATTERN.match(line)
                if m is not None:
                    continue
                error_line(state, path, line_index, sentence_id, sentence, line)
                return []

            token_id = int(m.group(1)) - 1
            orth = m.group(2)
            lemma = m.group(3)
            pos = m.group(4)
            tag = m.group(5)
            head_id = int(m.group(7)) - 1
            if head_id < 0:
                head_id = token_id
            dep = m.group(8)
            options = m.group(10)
            whitespace = options.find('SpaceAfter=No') < 0
            tokens.append({
                'id': token_id,
                'orth': orth,
                'lemma': lemma,
                'pos': pos,
                'tag': tag,
                'dep': dep,
                'head': head_id - token_id,
                'whitespace': whitespace,
                'ner': 'O',
            })

            m = BUNSETU_PATTERN.search(options)
            if m.group(1) == "B":
                if bunsetu_dep:
                    for h, d in bunsetu_heads:
                        assert bunsetu_begin <= h < token_id or h == bunsetu_head, str(bunsetu_heads) + line
                    if extend_dep_labels and bunsetu_dep.lower() != 'root':
                        tokens[bunsetu_root]['dep'] += '_bunsetu'
                    if bunsetu_dep not in bunsetu_head_deps:
                        bunsetu_head_deps[bunsetu_dep] = 0
                    bunsetu_head_deps[bunsetu_dep] += 1
                bunsetu_begin = token_id
                bunsetu_root = token_id
                bunsetu_head = head_id
                bunsetu_heads = []
                bunsetu_dep = dep
                bunsetu_heads.append((head_id, dep))
            elif head_id < bunsetu_begin or token_id <= bunsetu_head < head_id or dep.lower() == "root":
                bunsetu_root = token_id
                bunsetu_head = head_id
                bunsetu_dep = dep
                bunsetu_heads.append((head_id, dep))

            if bunsetu_dep not in bunsetu_all_deps:
                bunsetu_all_deps[bunsetu_dep] = 0
            bunsetu_all_deps[bunsetu_dep] += 1

            if luw_ent:
                m = LUW_PATTERN.search(options)
            else:
                m = NE_PATTERN.search(options)
            if m:
                ent_target = True
                if luw_ent:
                    label = m.group(1) + "-" + m.group(2)
                else:
                    label = m.group(1)
                if label[0] == "U":
                    label = "B" + label[1:]
                elif label[0] == "L":
                    label = "I" + label[1:]

                if label.startswith('B'):
                    if ent_label:
                        ents.append({
                            'start': ent_start_char,
                            'end': ent_end_char,
                            'label': ent_label,
                        })
                    ent_start_char = offset
                    ent_end_char = offset + len(orth)
                    ent_label = label[2:]
                elif label.startswith('I'):
                    if not ent_label or ent_label != label[2:]:
                        print('inconsistent ENT label: ' + str(ent_label) + ', ' + line, file=sys.stderr)
                        skip = True
                    else:
                        ent_end_char = offset + len(orth)
                elif not luw_ent and label == "O":
                    if ent_label:
                        ents.append({
                            'start': ent_start_char,
                            'end': ent_end_char,
                            'label': ent_label,
                        })
                        ent_start_char = None
                        ent_end_char = None
                        ent_label = None
                else:
                    print('bad ENT label: ' + line, file=sys.stderr)
                    skip = True
                    ent_start_char = None
                    ent_end_char = None
                    ent_label = None
            elif luw_ent:
                print('missing LUW label: ' + line, file=sys.stderr)
                skip = True
            elif ent_label:
                ents.append({
                    'start': ent_start_char,
                    'end': ent_end_char,
                    'label': ent_label,
                })
                ent_start_char = None
                ent_end_char = None
                ent_label = None
            offset += len(orth)
            if whitespace:
                offset += 1

        elif state == 'ios' and line == '':
            if len(tokens) == 0:
                error_line(state, path, line_index, sentence_id, sentence, line)
                return []
            if ent_label:
                ents.append({
                    'start': ent_start_char,
                    'end': ent_end_char,
                    'label': ent_label,
                })
            if bunsetu_dep:
                if extend_dep_labels and bunsetu_dep.lower() != 'root':
                    tokens[bunsetu_root]['dep'] += '_bunsetu'
                if bunsetu_dep not in bunsetu_head_deps:
                    bunsetu_head_deps[bunsetu_dep] = 0
                bunsetu_head_deps[bunsetu_dep] += 1

            heads = [t['id'] + t['head'] for t in tokens]
            if is_nonproj_tree(heads):
                print(file=sys.stderr)
                print('skip(non-projective):', path, sentence_id, file=sys.stderr)
            elif contains_cycle(heads):
                print(file=sys.stderr)
                print('skip(cyclic)', path, sentence_id, file=sys.stderr)
            elif skip:
                print(file=sys.stderr)
                print('skip(bad-luw-label)', path, sentence_id, file=sys.stderr)
            else:
                if tokenizer:
                    retokenize_gold(
                        tokens,
                        tokenizer(
                            ''.join([t['orth'] + (' ' if t['whitespace'] else '') for t in tokens])
                        ),
                    )
                if ent_target:
                    offset = 0
                    ent_label = None
                    ent_end = 0
                    ent_queue = []
                    for t in tokens:
                        end = offset + len(t['orth'])
                        if t['whitespace']:
                            end += 1
                        if ent_end > 0:
                            if offset < ent_end:
                                ent_queue.append(t)
                                offset = end
                                continue
                            if end >= ent_end:
                                if len(ent_queue) == 1:
                                    ent_queue[0]['ner'] = 'U-' + ent_label
                                else:
                                    ent_queue[0]['ner'] = 'B-' + ent_label
                                    for et in ent_queue[1:-1]:
                                        et['ner'] = 'I-' + ent_label
                                    ent_queue[-1]['ner'] = 'L-' + ent_label
                                ent_label = None
                                ent_end = 0
                                ent_queue.clear()
                        for ent in ents:
                            if ent['start'] < end and offset < ent['end']:
                                ent_label = ent['label']
                                ent_end = ent['end']
                                ent_queue.append(t)
                                break
                        offset = end
                    if ent_end > 0:
                        if len(ent_queue) == 1:
                            ent_queue[0]['ner'] = 'U-' + ent_label
                        else:
                            ent_queue[0]['ner'] = 'B-' + ent_label
                            for et in ent_queue[1:-1]:
                                et['ner'] = 'I-' + ent_label
                            ent_queue[-1]['ner'] = 'L-' + ent_label

                raw += sentence
                sentences.append({'tokens': tokens})
                if len(sentences) >= n_sents and (not ensure_end_period or tokens[-1]['orth'] == '。'):
                    paragraphs.append({
                        'raw': raw,
                        'sentences': sentences,
                    })
                    raw = ''
                    sentences = []

            sentence_id = None
            sentence = ""
            tokens = []
            bunsetu_begin = None
            bunsetu_head = None
            bunsetu_dep = None
            ent_target = False
            ents = []
            ent_start_char = None
            ent_end_char = None
            ent_label = None
            skip = False
            offset = 0
            state = 'sid'

        else:
            error_line(state, path, line_index, sentence_id, sentence, line)
            return []

    if state != 'sid':
        error_line(state, path, len(lines), sentence_id, sentence, '<END OF FILE>')
        return []

    if sentences:
        if not ensure_end_period or sentences[-1]['tokens'][-1]['orth'] == '。':
            paragraphs.append({
                'raw': raw,
                'sentences': sentences,
            })
        else:
            # Trailing sentences lack the closing '。'; fold them into the previous
            # paragraph in document order (assumes one was already flushed).
            paragraph = paragraphs[-1]
            paragraphs[-1] = {
                'raw': paragraph['raw'] + raw,
                'sentences': paragraph['sentences'] + sentences,
            }

    if _print_bunsetu_dep:
        for dep, count in sorted(bunsetu_head_deps.items()):
            print("bunsetu_dep:", dep, count, bunsetu_all_deps[dep], sep='\t')

    return paragraphs
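One detail worth calling out: the NE/LUW labels read from the token options are BILUO-encoded, and the branch above folds U into B and L into I so that spans can be accumulated in BIO form; per-token BILUO tags are then re-derived in the sentence-flush block. The remapping in isolation (helper name hypothetical):

def biluo_to_bio(label):
    if label.startswith('U'):
        return 'B' + label[1:]
    if label.startswith('L'):
        return 'I' + label[1:]
    return label

assert [biluo_to_bio(l) for l in ('U-PERSON', 'B-ORG', 'I-ORG', 'L-ORG', 'O')] == \
    ['B-PERSON', 'B-ORG', 'I-ORG', 'I-ORG', 'O']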