def get_snacs_refined_ucca(passage):
    """Split the tags of `passage` into a SNACS-only and a refined-UCCA passage.

    Two passages are built with `convert.join_passages([passage])` and their
    candidate edges are walked in lockstep.  Every edge is first stripped of
    its categories in both passages.  If any ':'-separated piece of the old
    tags starts with 'p.', those pieces are added to the edge of the first
    passage and all remaining pieces to the edge of the second one; otherwise
    the edge stays without categories in both passages.

    :param passage: passage whose edge tags may have the form 'X:p.Y'
    :return: tuple (passage with 'p.' tags, passage with the other tags)
    """
    def _candidate_edges(p):
        # Flatten {construction: [candidate, ...]} into a stream of edges.
        return (cand.edge for group in extract_candidates(p).values() for cand in group)

    p_snacs = convert.join_passages([passage])
    p_refined = convert.join_passages([passage])

    for e_snacs, e_refined in zip(_candidate_edges(p_snacs), _candidate_edges(p_refined)):
        # Both passages come from the same input, so edges must pair up.
        assert e_snacs.parent.ID == e_refined.parent.ID and e_snacs.child.ID == e_refined.child.ID

        old_tags = e_snacs.tags
        e_snacs.categories = []
        e_refined.categories = []

        all_old_tags = [piece for tag in old_tags for piece in tag.split(':')]
        if not any(piece.startswith('p.') for piece in all_old_tags):
            continue

        for t in all_old_tags:
            if t.startswith('p.'):
                e_snacs.add(t)
            elif t[0] in '?`':
                # Pieces starting with '?' or '`' are not expected on an edge
                # that also carries a 'p.' piece.
                assert False, (t, str(e_snacs.parent), str(e_snacs.child), all_old_tags)
            else:
                e_refined.add(t)

    return p_snacs, p_refined
def remove_preterminals(passage):
    """Attach terminals directly to the parent of their preterminal node.

    Works on the passage returned by `convert.join_passages([passage])` and
    visits every edge reported by `extract_candidates` for the
    'preterminals' construction.

    NOTE(review): the source of this function was collapsed onto one line;
    the nesting below (in particular the final `for` loop sitting at edge
    level rather than inside the `else` branch) is a reconstruction - confirm
    against the original file.

    :param passage: passage to process
    :return: the processed passage built by `convert.join_passages`
    """
    _p = convert.join_passages([passage])
    for edge in (c.edge for _c in extract_candidates(_p, constructions=('preterminals',)).values() for c in _c):
        # Separate 'p.'-prefixed pieces found on 'Preterminal...' categories
        # from the tags of all other categories on this edge.
        non_preterminal_cats, pss = [], []
        for c in edge.categories:
            if c.tag.startswith('Preterminal'):
                tags = c.tag.split(':')
                for t in tags:
                    if t.startswith('p.'):
                        pss.append(t)
            else:
                non_preterminal_cats.append(c.tag)
        # At most one 'p.' piece is expected per preterminal edge.
        assert len(pss) <= 1, (str(edge.parent), pss)
        prepreterminal = edge.parent
        # Collect the terminal children before the preterminal may be destroyed.
        outgoing = [(e.categories, e.child) for e in edge.child.outgoing if isinstance(e.child, layer0.Terminal)]
        assert len(outgoing) <= 1, (prepreterminal, [([c.tag for c in _cats], str(n)) for _cats, n in outgoing])
        if non_preterminal_cats:
            # The edge also carries other categories: keep the node and only
            # drop the 'Preterminal...' categories from the edge.
            edge.categories = [c for c in edge.categories if not c.tag.startswith('Preterminal')]
            print('WARNING: preterminals and non-preterminals', prepreterminal, outgoing)
        else:
            edge.child.destroy()
        # Re-attach each collected terminal under the preterminal's parent,
        # carrying over the old terminal-edge categories plus the 'p.' piece.
        for _cats, n in outgoing:
            new_edge = prepreterminal.add_multiple([(c.tag, '', c.layer, '') for c in _cats] + [(t,) for t in pss] , n)
            if pss:
                assert n.text
                new_edge.refinement = pss[0]
    return _p
def get_snacs_ucca(passage):
    """Keep only the refinement pieces of every candidate edge's tags.

    Operates on the passage returned by `convert.join_passages([passage])`.
    Each edge's categories are cleared; for every old tag of the form
    'A:b:c' the pieces after the first ':' ('b', 'c') are added back.  Tags
    without ':' contribute nothing.

    :param passage: passage to convert
    :return: the converted passage
    """
    result = convert.join_passages([passage])
    for candidates in extract_candidates(result).values():
        for candidate in candidates:
            edge = candidate.edge
            previous_tags = edge.tags
            edge.categories = []
            for tag in previous_tags:
                # An un-refined tag yields an empty slice, so nothing is added.
                for refinement in tag.split(':')[1:]:
                    edge.add(refinement)
    return result
def get_refined_ucca(passage):
    """Keep the base tag of every edge tag that has a 'p.' refinement.

    Operates on the passage returned by `convert.join_passages([passage])`.
    Each candidate edge's categories are cleared; for every old tag
    'A:b:...' the first piece 'A' is added back only if at least one of the
    following pieces starts with 'p.'.

    :param passage: passage to convert
    :return: the converted passage
    """
    result = convert.join_passages([passage])
    for candidates in extract_candidates(result).values():
        for candidate in candidates:
            edge = candidate.edge
            previous_tags = edge.tags
            edge.categories = []
            for tag in previous_tags:
                base, *refinements = tag.split(':')
                # `any` over an empty list is False, covering un-refined tags.
                if any(piece.startswith('p.') for piece in refinements):
                    edge.add(base)
    return result
def get_vanilla_ucca(passage):
    """Strip everything after the first ':' from every candidate edge tag.

    Operates on the passage returned by `convert.join_passages([passage])`.
    Each candidate edge's categories are cleared and, for every old tag, the
    part before the first ':' is added back.

    :param passage: passage to convert
    :return: the converted passage
    """
    result = convert.join_passages([passage])
    for candidates in extract_candidates(result).values():
        for candidate in candidates:
            edge = candidate.edge
            previous_tags = edge.tags
            edge.categories = []
            for tag in previous_tags:
                base, _, _ = tag.partition(':')
                edge.add(base)
    return result
def get_snacs_ucca(passage):
    """Keep only the 'p.' tag pieces on every candidate edge and report them.

    Operates on the passage returned by `convert.join_passages([passage])`.
    Each candidate edge's categories are cleared.  If any ':'-separated piece
    of its old tags starts with 'p.', those pieces are added back and the
    edge is recorded together with all of its old pieces.

    :param passage: passage to convert
    :return: tuple (converted passage,
             set of (edge, sorted tuple of all old tag pieces) for every
             edge that had at least one 'p.' piece)
    """
    _p = convert.join_passages([passage])
    edges = set()
    for edge in (c.edge for _c in extract_candidates(_p).values() for c in _c):
        old_tags, edge.categories = edge.tags, []
        all_old_tags = [t for tag in old_tags for t in tag.split(':')]
        snacs_tags = [t for t in all_old_tags if t.startswith('p.')]
        if not snacs_tags:
            continue
        # The set holds (edge, tags) tuples, so the former per-tag test
        # `edge not in edges` was always true; record the edge once instead.
        edges.add((edge, tuple(sorted(all_old_tags))))
        for t in snacs_tags:
            edge.add(t)
    return _p, edges
def get_full_ucca(passage):
    """Merge all tag pieces of every candidate edge into one combined tag.

    Operates on the passage returned by `convert.join_passages([passage])`.
    Each candidate edge's categories are cleared and replaced by a single
    tag made of the distinct ':'-separated pieces of all its old tags,
    sorted and joined with ':' again.

    NOTE(review): the combined tag is added once per edge, after all old
    tags were collected - reconstructed from a collapsed source line.

    :param passage: passage to convert
    :return: the converted passage
    """
    _p = convert.join_passages([passage])
    for edge in (c.edge for _c in extract_candidates(_p).values() for c in _c):
        old_tags, edge.categories = edge.tags, []
        # A set removes pieces shared between several old tags of the edge.
        pieces = {t for tag in old_tags for t in tag.split(':')}
        edge.add(':'.join(sorted(pieces)))
    return _p
def main(args):
    """Write one relabelled sentence passage per preposition-supersense token.

    For every document yielded by `get_passages`, the passage is split into
    sentences.  For each terminal of a sentence whose `extra['ss']` is a
    string starting with 'p', the sentence passage is annotated (lexical
    information on every terminal, 'True'/'False' on every candidate edge
    depending on membership in the result of `find_refined`) and written to
    `<outpath>/<ID>.xml`.

    :param args: sequence with the STREUSLE file (args[0]), the UCCA path
                 (args[1]) and the output directory (args[2])
    """
    streusle_file = args[0]
    ucca_path = args[1]
    outpath = args[2]

    for doc, passage, _term2tok in get_passages(streusle_file, ucca_path, annotate=True, target='prep'):
        # Build a real list: the ids are handed to split_passage *and* to
        # zip, and a one-shot `map` iterator cannot safely serve both.
        sent_ids = [''.join(sent['sent_id'].split('-')[-2:]) for sent in doc['sents']]
        sentences = uconv.split_passage(passage, doc['ends'], sent_ids)

        for _sent_id, psg in zip(sent_ids, sentences):
            p = uconv.join_passages([psg])
            l0 = p.layer(ul0.LAYER_ID)

            for _pos, terminal in l0.pairs:
                # Only terminals carrying a supersense string starting with
                # 'p' are targets; startswith also tolerates an empty string.
                ss = terminal.extra.get('ss')
                if not isinstance(ss, str) or not ss.startswith('p'):
                    continue

                unit = doc["exprs"][tuple(map(int, terminal.extra["toknums"].split()))]
                ID = f'{doc["id"]}_{unit["sent_offs"]}_{unit["local_toknums"][0]}-{unit["local_toknums"][-1]}'

                refined, _error = find_refined(terminal, dict(l0.pairs), local=True)

                # NOTE(review): `p` is relabelled in place below for every
                # target terminal, so for a second target in the same
                # sentence `_pt.ftag` may already be 'True'/'False' - confirm.
                for _, term in l0.pairs:
                    _pt = term.incoming[0].parent
                    toks = [t.text for t in _pt.get_terminals()]
                    term.extra['lexlemma'] = ' '.join(toks)
                    term.extra['lexcat'] = _pt.ftag
                    term.extra['is_part_of_mwe'] = len(toks) > 1
                    term.extra['identified_for_pss'] = str(term.ID == terminal.ID)

                # Materialise the candidate edges before relabelling them.
                edges = [c.edge for cs in uconst.extract_candidates(p).values() for c in cs]
                for edge in edges:
                    edge.categories = []
                    edge.add(str(edge in refined))

                uconv.passage2file(p, f'{outpath}/{ID}.xml')
def test_split_join_paragraphs(create):
    """Splitting into paragraphs and joining again must give an equal passage."""
    original = create()
    paragraphs = convert.split2paragraphs(original, remarks=True)
    rejoined = convert.join_passages(paragraphs)
    # Called before the assertion, as in the original order of statements.
    diffutil.diff_passages(original, rejoined)
    assert original.equals(rejoined)
def test_split_join_paragraphs(self):
    """Splitting into paragraphs and joining again must give an equal passage."""
    original = TestUtil.create_multi_passage()
    paragraphs = convert.split2paragraphs(original, remarks=True)
    rejoined = convert.join_passages(paragraphs)
    # Called before the assertion, as in the original order of statements.
    diffutil.diff_passages(original, rejoined)
    self.assertTrue(original.equals(rejoined))
def test_split_join_paragraphs(self):
    """Round-trip a multi-paragraph passage through split2paragraphs/join_passages."""
    source = TestUtil.create_multi_passage()
    round_trip = convert.join_passages(convert.split2paragraphs(source, remarks=True))
    # The diff call precedes the equality check, as in the original.
    diffutil.diff_passages(source, round_trip)
    self.assertTrue(source.equals(round_trip))