コード例 #1
0
ファイル: evaluate_ucca_snacs_2.py プロジェクト: jakpra/ucca
def get_snacs_refined_ucca(passage):
    """Split the tags of every candidate edge into a 'p.' view and a non-'p.' view.

    Two passages are built with ``convert.join_passages([passage])`` and their
    candidate edges are walked in lockstep.  The categories of both edges are
    cleared; then, only when at least one ':'-separated tag part starts with
    'p.', the 'p.' parts are added to the edge of the first passage and all
    other parts to the edge of the second passage.

    :param passage: passage whose edge tags may be ':'-joined compounds
    :return: tuple ``(p_snacs, p_refined)``
    """
    p_snacs = convert.join_passages([passage])
    p_refined = convert.join_passages([passage])
    snacs_edges = (cand.edge for group in extract_candidates(p_snacs).values() for cand in group)
    refined_edges = (cand.edge for group in extract_candidates(p_refined).values() for cand in group)
    for e_snacs, e_refined in zip(snacs_edges, refined_edges):
        # Both passages come from the same input, so the edges must line up.
        assert e_snacs.parent.ID == e_refined.parent.ID and e_snacs.child.ID == e_refined.child.ID
        # Read the tags before the categories are wiped.
        previous_tags = e_snacs.tags
        e_snacs.categories = []
        e_refined.categories = []
        parts = [part for tag in previous_tags for part in tag.split(':')]
        if not any(part.startswith('p.') for part in parts):
            # No 'p.' part: both edges stay without categories.
            continue
        for part in parts:
            if part.startswith('p.'):
                e_snacs.add(part)
            elif part[0] in '?`':
                # Parts starting with '?' or '`' are not expected here.
                assert False, (part, str(e_snacs.parent), str(e_snacs.child), parts)
            else:
                e_refined.add(part)
    return p_snacs, p_refined
コード例 #2
0
ファイル: evaluate_ucca_snacs_2.py プロジェクト: jakpra/ucca
def remove_preterminals(passage):
    """Remove preterminal units, attaching their terminal to the unit above.

    Operates on the passage returned by ``convert.join_passages([passage])``
    and visits every edge that ``extract_candidates`` yields for the
    'preterminals' construction.

    For each such edge:

    * if it also carries categories whose tag does not start with
      'Preterminal', only the 'Preterminal' categories are dropped and a
      warning is printed;
    * otherwise the edge's child is destroyed and its terminal child (at most
      one, enforced by an assertion) is attached directly to the edge's parent,
      carrying the old terminal edge's categories plus any 'p.' tag part found
      on the preterminal category.

    :param passage: input passage
    :return: the processed passage
    """
    _p = convert.join_passages([passage])
    for edge in (c.edge for _c in extract_candidates(_p, constructions=('preterminals',)).values() for c in _c):
        # pss: 'p.'-prefixed parts found inside 'Preterminal...' tags;
        # non_preterminal_cats: tags of all remaining categories on the edge.
        non_preterminal_cats, pss = [], []
        for c in edge.categories:
            if c.tag.startswith('Preterminal'):
                tags = c.tag.split(':')
                for t in tags:
                    if t.startswith('p.'):
                        pss.append(t)
            else:
                non_preterminal_cats.append(c.tag)
        # At most one 'p.' part is expected per preterminal edge.
        assert len(pss) <= 1, (str(edge.parent), pss)
        prepreterminal = edge.parent
        # Collect the terminal children BEFORE the child node is destroyed below.
        outgoing = [(e.categories, e.child) for e in edge.child.outgoing if isinstance(e.child, layer0.Terminal)]
        assert len(outgoing) <= 1, (prepreterminal, [([c.tag for c in _cats], str(n)) for _cats, n in outgoing])
        if non_preterminal_cats:
            # Mixed edge: keep the node, only strip the 'Preterminal' categories.
            edge.categories = [c for c in edge.categories if not c.tag.startswith('Preterminal')]
            print('WARNING: preterminals and non-preterminals', prepreterminal, outgoing)
        else:
            # Pure preterminal: drop the node and re-attach its terminal one level up.
            edge.child.destroy()
            for _cats, n in outgoing:
                # NOTE(review): the tuples passed to add_multiple mix 4-tuples and
                # 1-tuples; presumably the remaining fields default -- confirm
                # against add_multiple's signature.
                new_edge = prepreterminal.add_multiple([(c.tag, '', c.layer, '') for c in _cats] + [(t,) for t in pss] , n)
                if pss:
                    assert n.text
                    new_edge.refinement = pss[0]

    return _p
コード例 #3
0
ファイル: evaluate_ucca_snacs.py プロジェクト: jakpra/ucca
def get_snacs_ucca(passage):
    """Keep, on every candidate edge, only the tag parts after the first ':'.

    Works on the passage returned by ``convert.join_passages([passage])``.
    Each edge's categories are cleared and, for every previous tag, all
    ':'-separated parts except the first are added back as separate tags.

    :param passage: passage whose edge tags may be ':'-joined compounds
    :return: the processed passage
    """
    _p = convert.join_passages([passage])
    for group in extract_candidates(_p).values():
        for cand in group:
            edge = cand.edge
            # Read the tags before the categories are wiped.
            previous_tags = edge.tags
            edge.categories = []
            for tag in previous_tags:
                # The slice is empty when the tag has no ':' at all.
                for part in tag.split(':')[1:]:
                    edge.add(part)
    return _p
コード例 #4
0
ファイル: evaluate_ucca_snacs.py プロジェクト: jakpra/ucca
def get_refined_ucca(passage):
    """Keep the leading tag part only on tags that carry a 'p.' part.

    Works on the passage returned by ``convert.join_passages([passage])``.
    Each candidate edge has its categories cleared; for every previous tag
    whose parts after the first ':' include one starting with 'p.', the first
    part is added back.

    :param passage: passage whose edge tags may be ':'-joined compounds
    :return: the processed passage
    """
    _p = convert.join_passages([passage])
    for group in extract_candidates(_p).values():
        for cand in group:
            edge = cand.edge
            # Read the tags before the categories are wiped.
            previous_tags = edge.tags
            edge.categories = []
            for tag in previous_tags:
                base, *rest = tag.split(':')
                if any(part.startswith('p.') for part in rest):
                    edge.add(base)
    return _p
コード例 #5
0
ファイル: evaluate_ucca_snacs.py プロジェクト: jakpra/ucca
def get_vanilla_ucca(passage):
    """Reduce every candidate edge's tags to the part before the first ':'.

    Works on the passage returned by ``convert.join_passages([passage])``.
    Each edge's categories are cleared and the leading part of every previous
    tag is added back.

    :param passage: passage whose edge tags may be ':'-joined compounds
    :return: the processed passage
    """
    _p = convert.join_passages([passage])
    for group in extract_candidates(_p).values():
        for cand in group:
            edge = cand.edge
            # Read the tags before the categories are wiped.
            previous_tags = edge.tags
            edge.categories = []
            for tag in previous_tags:
                # Text before the first ':' (the whole tag when there is none).
                edge.add(tag.partition(':')[0])
    return _p
コード例 #6
0
ファイル: evaluate_ucca_snacs_2.py プロジェクト: jakpra/ucca
def get_snacs_ucca(passage):
    """Keep only 'p.' tag parts on every candidate edge and report the edges touched.

    Works on the passage returned by ``convert.join_passages([passage])``.
    Each candidate edge has its categories cleared; if any ':'-separated part
    of its previous tags starts with 'p.', those parts are added back as tags
    and the edge is recorded once, together with the sorted tuple of all its
    previous tag parts.

    :param passage: passage whose edge tags may be ':'-joined compounds
    :return: tuple ``(passage, edges)`` where ``edges`` is a set of
             ``(edge, sorted_old_tag_parts)`` pairs
    """
    _p = convert.join_passages([passage])
    edges = set()
    # `edges` holds (edge, tags) tuples, so testing a bare edge against it can
    # never match; edges already recorded are tracked separately.
    recorded = set()
    for edge in (c.edge for _c in extract_candidates(_p).values() for c in _c):
        old_tags, edge.categories = edge.tags, []
        all_old_tags = [t for tag in old_tags for t in tag.split(':')]
        snacs_tags = [t for t in all_old_tags if t.startswith('p.')]
        if not snacs_tags:
            continue
        if edge not in recorded:
            recorded.add(edge)
            edges.add((edge, tuple(sorted(all_old_tags))))
        for t in snacs_tags:
            edge.add(t)
    return _p, edges
コード例 #7
0
ファイル: evaluate_ucca_snacs_2.py プロジェクト: jakpra/ucca
def get_full_ucca(passage):
    """Replace every candidate edge's tags with one canonical combined tag.

    Works on the passage returned by ``convert.join_passages([passage])``.
    Each edge's categories are cleared and a single tag is added: the sorted,
    de-duplicated ':'-separated parts of all previous tags, joined by ':'.
    An edge that had no tags receives the empty string as its tag.

    :param passage: passage whose edge tags may be ':'-joined compounds
    :return: the processed passage
    """
    _p = convert.join_passages([passage])
    for edge in (c.edge for _c in extract_candidates(_p).values() for c in _c):
        # Read the tags before the categories are wiped.
        old_tags, edge.categories = edge.tags, []
        unique_parts = {t for tag in old_tags for t in tag.split(':')}
        edge.add(':'.join(sorted(unique_parts)))
    return _p
コード例 #8
0
def main(args):
    """Write one XML passage per preposition unit, with edges tagged 'True'/'False'.

    For every ``(doc, passage, term2tok)`` yielded by ``get_passages``, the
    passage is split at ``doc['ends']``.  Within each resulting part, every
    layer-0 terminal whose ``extra['ss']`` is a string starting with 'p'
    triggers one output file: all terminals are annotated with lexical
    information, every candidate edge's categories are replaced by
    ``str(edge in refined)`` (``refined`` coming from ``find_refined``), and
    the passage is written to ``{outpath}/{ID}.xml``.

    :param args: sequence of at least three items, in order: the STREUSLE
                 file, the UCCA path and the output directory
    """
    streusle_file = args[0]
    ucca_path = args[1]
    outpath = args[2]

    for doc, passage, term2tok in get_passages(streusle_file,
                                               ucca_path,
                                               annotate=True,
                                               target='prep'):

        # Built as a list, not a one-shot `map` iterator: the same object is
        # handed to split_passage AND to zip, and an iterator consumed by the
        # former would leave zip with nothing to pair.
        sent_ids = [''.join(x['sent_id'].split('-')[-2:])
                    for x in doc['sents']]

        sent_passage = zip(sent_ids,
                           uconv.split_passage(passage, doc['ends'], sent_ids))

        for sent, psg in sent_passage:

            p = uconv.join_passages([psg])
            l0 = p.layer(ul0.LAYER_ID)
            l1 = p.layer(ul1.LAYER_ID)

            # NOTE(review): `p` is created once per sentence part but mutated
            # for every matching terminal below (edge categories are
            # overwritten), so later terminals see the already relabelled
            # passage -- confirm this is intended.
            for pos, terminal in l0.pairs:

                # Only terminals carrying a string supersense that starts with 'p'.
                if 'ss' not in terminal.extra or not isinstance(
                        terminal.extra['ss'],
                        str) or terminal.extra['ss'][0] != 'p':
                    continue

                unit = doc["exprs"][tuple(
                    map(int, terminal.extra["toknums"].split()))]

                ID = f'{doc["id"]}_{unit["sent_offs"]}_{unit["local_toknums"][0]}-{unit["local_toknums"][-1]}'

                refined, error = find_refined(terminal,
                                              dict(l0.pairs),
                                              local=True)

                # Annotate every terminal relative to the current target terminal.
                for _, term in p.layer(ul0.LAYER_ID).pairs:
                    _pt = term.incoming[0].parent
                    toks = [t.text for t in _pt.get_terminals()]
                    term.extra['lexlemma'] = ' '.join(toks)
                    term.extra['lexcat'] = _pt.ftag
                    term.extra['is_part_of_mwe'] = len(toks) > 1
                    term.extra['identified_for_pss'] = str(
                        term.ID == terminal.ID)

                # Materialise the edges first, then relabel each one.
                edges = [
                    c.edge for cs in uconst.extract_candidates(p).values()
                    for c in cs
                ]
                for edge in edges:
                    edge.categories = []
                    edge.add(str(edge in refined))

                uconv.passage2file(p, f'{outpath}/{ID}.xml')
コード例 #9
0
def test_split_join_paragraphs(create):
    """Splitting into paragraphs and joining again must give an equal passage."""
    original = create()
    rejoined = convert.join_passages(
        convert.split2paragraphs(original, remarks=True))
    # Run the diff before the assertion, as the equality check alone is silent.
    diffutil.diff_passages(original, rejoined)
    assert original.equals(rejoined)
コード例 #10
0
ファイル: test_ucca.py プロジェクト: macleginn/ucca
 def test_split_join_paragraphs(self):
     """Splitting into paragraphs and joining again must give an equal passage."""
     original = TestUtil.create_multi_passage()
     paragraphs = convert.split2paragraphs(original, remarks=True)
     rejoined = convert.join_passages(paragraphs)
     # Run the diff before the assertion, as the equality check alone is silent.
     diffutil.diff_passages(original, rejoined)
     self.assertTrue(original.equals(rejoined))
コード例 #11
0
ファイル: test_ucca.py プロジェクト: borgr/ucca
 def test_split_join_paragraphs(self):
     """Round-trip check: split2paragraphs followed by join_passages is lossless."""
     source_passage = TestUtil.create_multi_passage()
     pieces = convert.split2paragraphs(source_passage, remarks=True)
     joined = convert.join_passages(pieces)
     # Run the diff first, then assert equality of the two passages.
     diffutil.diff_passages(source_passage, joined)
     self.assertTrue(source_passage.equals(joined))