def test_equals(self):
    """Check ``equals`` while two passages diverge and are brought back in sync.

    Every step modifies one passage and asserts (symmetrically) whether the
    two passages still compare equal.
    """
    first = core.Passage("1")
    second = core.Passage("2")
    first_l0 = layer0.Layer0(first)
    second_l0 = layer0.Layer0(second)
    first_l1 = layer1.Layer1(first)
    second_l1 = layer1.Layer1(second)
    tags = layer1.EdgeTags

    def expect_same():
        self.assertTrue(first.equals(second) and second.equals(first))

    def expect_different():
        self.assertFalse(first.equals(second) or second.equals(first))

    expect_same()

    # Basic passage equality, then attrib/tag/length differences in layer 0
    for terminal_layer in (first_l0, second_l0):
        for text in ("0", "1", "2"):
            terminal_layer.add_terminal(text, False)
    expect_same()

    second_punct = second_l0.add_terminal("3", True)
    expect_different()
    mismatched = first_l0.add_terminal("3", False)  # same text, not punctuation
    expect_different()
    mismatched.destroy()
    first_punct = first_l0.add_terminal("3", True)
    expect_same()

    # Edge and node equality in layer 1
    scene1 = first_l1.add_fnode(None, tags.ParallelScene)
    expect_different()
    scene2 = second_l1.add_fnode(None, tags.ParallelScene)
    expect_same()

    first_l1.add_fnode(scene1, tags.Participant)
    expect_different()
    self.assertTrue(scene1.equals(scene2, recursive=False))
    second_l1.add_fnode(scene2, tags.Process)
    expect_different()
    second_l1.add_fnode(scene2, tags.Participant)
    expect_different()
    first_l1.add_fnode(scene1, tags.Process)
    expect_same()
    # The children were added in a different order on each side
    self.assertFalse(first.equals(second, ordered=True)
                     or second.equals(first, ordered=True))

    first_l1.add_fnode(scene1, tags.Adverbial, implicit=True)
    adverbial2 = second_l1.add_fnode(scene2, tags.Adverbial)
    expect_different()
    adverbial2.attrib["implicit"] = True
    expect_same()

    scene2[2].attrib["remote"] = True
    expect_different()
    scene1[2].attrib["remote"] = True
    expect_same()

    first_l1.add_punct(None, first_punct)
    expect_different()
    second_l1.add_punct(None, second_punct)
    expect_same()

    # An extra layer on one passage only
    core.Layer("2", first)
    expect_different()
def crossing():
    """Build a fixture passage with two paragraphs and crossing edges.

    Layout: [1 2 [3 P(remote)] H] . [[3 P] . 4 . H]

    Returns:
        the constructed passage (ID "1") with layer 0 and layer 1 attached.
    """
    passage = core.Passage("1")
    terminal_layer = layer0.Layer0(passage)
    annotation = layer1.Layer1(passage)
    tags = layer1.EdgeTags

    # (text, punct, extra keyword arguments) for each terminal, in order
    specs = (
        ("1", False, {}),
        ("2", False, {}),
        (".", True, {}),
        ("3", False, {"paragraph": 2}),
        (".", True, {"paragraph": 2}),
        ("4", False, {"paragraph": 2}),
        (".", True, {"paragraph": 2}),
    )
    one, two, stop1, three, stop2, four, stop3 = [
        terminal_layer.add_terminal(text, punct, **extra)
        for text, punct, extra in specs
    ]

    scene1 = annotation.add_fnode(None, tags.ParallelScene)
    scene2 = annotation.add_fnode(None, tags.ParallelScene)
    process = annotation.add_fnode(scene2, tags.Process)
    # The process of the second scene is also a remote child of the first
    annotation.add_remote(scene1, tags.Process, process)

    scene1.add(tags.Terminal, one)
    scene1.add(tags.Terminal, two)
    annotation.add_punct(None, stop1)
    process.add(tags.Terminal, three)
    annotation.add_punct(scene2, stop2)
    scene2.add(tags.Terminal, four)
    annotation.add_punct(scene2, stop3)
    return passage
def function1():
    """Build a five-terminal fixture passage (ID "1") with a top-level Function unit.

    Returns:
        the constructed passage.
    """
    passage = core.Passage("1")
    terminal_layer = layer0.Layer0(passage)
    annotation = layer1.Layer1(passage)
    tags = layer1.EdgeTags

    # Terminals "1".."5"; only "5" is punctuation
    terms = []
    for number in range(1, 6):
        terms.append(terminal_layer.add_terminal(text=str(number), punct=(number == 5)))

    # Scene #1: [H [P 1] [A 2]]
    scene1 = annotation.add_fnode(None, tags.ParallelScene)
    process = annotation.add_fnode(scene1, tags.Process)
    participant = annotation.add_fnode(scene1, tags.Participant)
    process.add(tags.Terminal, terms[0])
    participant.add(tags.Terminal, terms[1])

    # Function unit over terminal 3, attached at the top level
    # (its location should not affect evaluation)
    function = annotation.add_fnode(None, tags.Function)
    function.add(tags.Terminal, terms[2])

    # Scene #2: [H [A* 2] [S 4]]
    scene2 = annotation.add_fnode(None, tags.ParallelScene)
    state = annotation.add_fnode(scene2, tags.State)
    state.add(tags.Terminal, terms[3])
    # Implicit participant (should not affect evaluation)
    annotation.add_fnode(scene2, tags.Participant, implicit=True)

    # Punctuation terminal 5, attached under scene #2
    # (punctuation should not affect evaluation)
    annotation.add_punct(scene2, terms[4])

    # Scene #2 takes the participant of scene #1 as a remote argument
    annotation.add_remote(scene2, tags.Participant, participant)
    return passage
def _from_site_terminals(elem, passage, elem2node):
    """Extract the Terminals from the site XML format.

    Part of each terminal's metadata lives in a wrapper unit that
    encapsulates the terminal element, so both elements are used when
    creating the :class:layer0.Terminal objects.

    Args:
        elem: root element of the XML hierarchy
        passage: passage to add the Terminals to; a Layer0 object is
            created for it here
        elem2node: dictionary whose keys are site IDs and values are the
            created UCCA Nodes which are equivalent. It is updated (through
            SiteUtil.set_node) with an entry per word wrapper.

    """
    layer0.Layer0(passage)
    paragraphs = elem.iterfind(SiteCfg.Paths.Paragraphs)
    # Paragraph numbers start at 1, hence start=1
    for paragraph_number, paragraph in enumerate(paragraphs, start=1):
        words = list(paragraph.iter(SiteCfg.Tags.Terminal))
        # For every word, collect the unit elements that directly contain it
        # (expected to be exactly one, since XML is hierarchical)
        wrappers = [
            unit
            for word in words
            for unit in paragraph.iter(SiteCfg.Tags.Unit)
            if word in list(unit)
        ]
        for word, wrapper in zip(words, wrappers):
            is_punct = wrapper.get(SiteCfg.Attr.ElemTag) == SiteCfg.Types.Punct
            unescaped = SiteUtil.unescape(word.text)
            terminal = passage.layer(layer0.LAYER_ID).add_terminal(
                unescaped, is_punct, paragraph_number)
            SiteUtil.set_id(word, terminal.ID)
            SiteUtil.set_node(wrapper, terminal, elem2node)
def discontiguous():
    """Build a fixture passage whose units are highly discontiguous.

    Returns:
        the constructed passage (ID "1") with 20 terminals and three
        top-level parallel scenes.
    """
    passage = core.Passage("1")
    terminal_layer = layer0.Layer0(passage)
    annotation = layer1.Layer1(passage)
    tags = layer1.EdgeTags

    # Terminals "1".."20"; "10" and "20" are punctuation
    terms = []
    for number in range(1, 21):
        terms.append(terminal_layer.add_terminal(text=str(number), punct=(number % 10 == 0)))

    # First parallel scene, over terms[0..9] (0-based indices below).
    # Dashed tags (e.g. -C, C-) mark the parts of a discontiguous unit:
    # [PS [D [E 0] [C- 1] [E 2] [-C 3]]
    #     [A- 4] [P- 5 6] [-A 7] [F 8] [-P [U 9]]]
    # In addition, D takes P as a remote G
    scene1 = annotation.add_fnode(None, tags.ParallelScene)
    adverbial = annotation.add_fnode(scene1, tags.Adverbial)
    elaborator1 = annotation.add_fnode(adverbial, tags.Elaborator)
    center = annotation.add_fnode(adverbial, tags.Center)
    elaborator2 = annotation.add_fnode(adverbial, tags.Elaborator)
    participant1 = annotation.add_fnode(scene1, tags.Participant)
    process1 = annotation.add_fnode(scene1, tags.Process)
    function1 = annotation.add_fnode(scene1, tags.Function)
    annotation.add_remote(adverbial, tags.Ground, process1)

    # Parent of terms[0], terms[1], ..., terms[8], in that order
    owners = (elaborator1, center, elaborator2, center, participant1,
              process1, process1, participant1, function1)
    for owner, term in zip(owners, terms):
        owner.add(tags.Terminal, term)
    annotation.add_punct(process1, terms[9])

    # Second parallel scene, over terms[10..13] and terms[17..19]:
    # [PS- [D IMPLICIT] [G IMPLICIT] [P 10 11 12 13]]
    # [-PS [A 17 18 [U 19]]]
    scene2 = annotation.add_fnode(None, tags.ParallelScene)
    annotation.add_fnode(scene2, tags.Adverbial, implicit=True)
    annotation.add_fnode(scene2, tags.Ground, implicit=True)
    process2 = annotation.add_fnode(scene2, tags.Process)
    participant2 = annotation.add_fnode(scene2, tags.Participant)
    for index in (10, 11, 12, 13):
        process2.add(tags.Terminal, terms[index])
    for index in (17, 18):
        participant2.add(tags.Terminal, terms[index])
    annotation.add_punct(participant2, terms[19])

    # Third parallel scene, over terms[14..16]:
    # [PS [P IMPLICIT] 14 [A 15 16]]
    scene3 = annotation.add_fnode(None, tags.ParallelScene)
    scene3.add(tags.Terminal, terms[14])
    annotation.add_fnode(scene3, tags.Process, implicit=True)
    participant3 = annotation.add_fnode(scene3, tags.Participant)
    for index in (15, 16):
        participant3.add(tags.Terminal, terms[index])
    return passage
def function2():
    """Build a five-terminal fixture passage (ID "2") with a Function unit inside a scene.

    Returns:
        the constructed passage.
    """
    passage = core.Passage("2")
    terminal_layer = layer0.Layer0(passage)
    annotation = layer1.Layer1(passage)
    tags = layer1.EdgeTags

    # Terminals "1".."5"; only "5" is punctuation
    terms = []
    for number in range(1, 6):
        terms.append(terminal_layer.add_terminal(text=str(number), punct=(number == 5)))

    # Scene #1: state over terminal 1, adverbial over terminal 2,
    # function over terminal 3
    scene1 = annotation.add_fnode(None, tags.ParallelScene)
    state1 = annotation.add_fnode(scene1, tags.State)
    adverbial = annotation.add_fnode(scene1, tags.Adverbial)
    state1.add(tags.Terminal, terms[0])
    adverbial.add(tags.Terminal, terms[1])
    function = annotation.add_fnode(scene1, tags.Function)
    function.add(tags.Terminal, terms[2])

    # Scene #2: state over terminal 4
    scene2 = annotation.add_fnode(None, tags.ParallelScene)
    state2 = annotation.add_fnode(scene2, tags.State)
    state2.add(tags.Terminal, terms[3])

    # Punctuation terminal 5 - not under a scene
    annotation.add_punct(None, terms[4])

    # Scene #2 takes the adverbial of scene #1 as a remote argument
    annotation.add_remote(scene2, tags.Adverbial, adverbial)
    return passage
def test_terminals(self):
    """Tests :class:layer0.Terminal new and inherited functionality."""

    def make_terminal(root, ID, tag, text, paragraph, para_pos):
        # Construct a Terminal directly, bypassing Layer0.add_terminal
        return layer0.Terminal(ID=ID, root=root, tag=tag,
                               attrib={"text": text,
                                       "paragraph": paragraph,
                                       "paragraph_position": para_pos})

    passage = core.Passage("1")
    layer0.Layer0(passage)
    terms = [
        make_terminal(passage, "0.1", layer0.NodeTags.Word, "1", 1, 1),
        make_terminal(passage, "0.2", layer0.NodeTags.Word, "2", 2, 1),
        make_terminal(passage, "0.3", layer0.NodeTags.Punct, ".", 2, 2),
    ]

    other_passage = core.Passage("2")
    layer0.Layer0(other_passage)
    equal_term = make_terminal(other_passage, "0.1", layer0.NodeTags.Word, "1", 1, 1)
    unequal_term = make_terminal(other_passage, "0.2", layer0.NodeTags.Word, "two", 2, 1)

    # Attribute name -> expected value for each of the three terminals
    expectations = (
        ("punct", [False, False, True]),
        ("text", ["1", "2", "."]),
        ("position", [1, 2, 3]),
        ("paragraph", [1, 2, 2]),
        ("para_pos", [1, 1, 2]),
    )
    for attribute, expected in expectations:
        self.assertSequenceEqual([getattr(term, attribute) for term in terms], expected)

    # Distinct terminals of the same passage are never ==
    for left, right in ((0, 1), (0, 2), (1, 2)):
        self.assertFalse(terms[left] == terms[right])
    self.assertTrue(terms[0] == terms[0])

    # equals() compares across passages
    self.assertTrue(terms[0].equals(equal_term))
    self.assertFalse(terms[1].equals(unequal_term))
def test_layer0():
    """Check the ``pairs``, ``all`` and ``words`` views of a three-terminal Layer0."""
    passage = core.Passage("1")
    terminal_layer = layer0.Layer0(passage)
    first_word = terminal_layer.add_terminal(text="1", punct=False)
    terminal_layer.add_terminal(text="2", punct=True, paragraph=2)
    second_word = terminal_layer.add_terminal(text="3", punct=False, paragraph=2)

    leading_items = [entry[0] for entry in terminal_layer.pairs]
    assert leading_items == [1, 2, 3]

    paragraph_positions = [terminal.para_pos for terminal in terminal_layer.all]
    assert paragraph_positions == [1, 1, 2]

    # The punctuation terminal is not among the words
    assert terminal_layer.words == (first_word, second_word)
def create_passage(num_terms=3, *punct):
    """Create a passage (ID "1") holding ``num_terms`` terminals and an empty layer 1.

    Args:
        num_terms: number of terminals to create; their texts are "1".."num_terms"
        *punct: 1-based numbers of the terminals to create as punctuation

    Returns:
        a tuple ``(passage, layer1, terminals)``
    """
    passage = core.Passage("1")
    terminal_layer = layer0.Layer0(passage)
    annotation_layer = layer1.Layer1(passage)
    terminals = []
    for number in range(1, num_terms + 1):
        is_punct = number in punct
        terminals.append(terminal_layer.add_terminal(text=str(number), punct=is_punct))
    return passage, annotation_layer, terminals
def _build_passage(self):
    """Build a Passage from the node, remote, linkage and terminal records on ``self``.

    Reads ``self.pending_nodes`` (triples of parent ID, edge tag, node ID),
    ``self.node_by_id``, ``self.node_ids_with_children``, ``self.remotes``,
    ``self.linkages`` and ``self.terminals``. ``self.pending_nodes`` is
    consumed and ``self.node_by_id`` is extended with the created nodes.

    Returns:
        the created Passage, whose ID is ``self.sentence_id`` if that is
        truthy and ``self.passage_id`` otherwise.

    Raises:
        ValueError: if some pending node refers to a parent that never
            becomes available, or a terminal refers to an unknown parent.
    """
    p = core.Passage(self.sentence_id or self.passage_id)
    l0 = layer0.Layer0(p)
    l1 = layer1.Layer1(p)
    paragraph = 1

    # add normal nodes
    while self.pending_nodes:
        created_any = False
        # Walk backwards so deleting entry i does not shift the entries
        # that are still to be visited
        for i in reversed(range(len(self.pending_nodes))):
            parent_id, edge_tag, node_id = self.pending_nodes[i]
            # -1 marks "parent not created yet" (None is a valid stored value)
            parent = self.node_by_id.get(parent_id, -1)
            if parent != -1:
                del self.pending_nodes[i]
                implicit = node_id not in self.node_ids_with_children
                node = l1.add_fnode(parent, edge_tag, implicit=implicit)
                if edge_tag == EdgeTags.Punctuation:
                    node.tag = layer1.NodeTags.Punctuation
                self.node_by_id[node_id] = node
                created_any = True
        if not created_any:
            # A full pass that creates nothing leaves the state unchanged,
            # so every later pass would be identical: fail instead of
            # looping forever.
            raise ValueError(
                "Nodes with unknown parents in passage %s: %s"
                % (p.ID, ", ".join("%s (parent %s)" % (node_id, parent_id)
                                   for parent_id, _, node_id in self.pending_nodes)))

    # add remotes
    for parent_id, edge_tag, node_id in self.remotes:
        l1.add_remote(self.node_by_id[parent_id], edge_tag, self.node_by_id[node_id])

    # add linkages
    for node_id, children in self.linkages.items():
        link_relation = next(self.node_by_id[i] for i, t in children
                             if t == EdgeTags.LinkRelation)
        link_arguments = [self.node_by_id[i] for i, t in children
                          if t == EdgeTags.LinkArgument]
        l1.add_linkage(link_relation, *link_arguments)

    # add terminals
    for text, tag, edge_tag, parent_id in self.terminals:
        punctuation = (tag == layer0.NodeTags.Punct)
        terminal = l0.add_terminal(text=text, punct=punctuation, paragraph=paragraph)
        try:
            parent = self.node_by_id[parent_id]
        except KeyError as e:
            raise ValueError(
                "Terminal ('%s') with bad parent (%s) in passage %s"
                % (text, parent_id, p.ID)) from e
        if parent is None:
            # Warn, then insert an intermediate node to hold the terminal
            print("Terminal is a child of the root: '%s'" % text, file=sys.stderr)
            parent = l1.add_fnode(parent, edge_tag)
        if edge_tag != EdgeTags.Terminal:
            print("Terminal with incoming %s edge: '%s'" % (edge_tag, text),
                  file=sys.stderr)
        parent.add(EdgeTags.Terminal, terminal)

    return p
def test_layer0(self):
    """Check the ``pairs``, ``all`` and ``words`` views of a three-terminal Layer0."""
    passage = core.Passage("1")
    terminal_layer = layer0.Layer0(passage)
    first_word = terminal_layer.add_terminal(text="1", punct=False)
    terminal_layer.add_terminal(text="2", punct=True, paragraph=2)
    second_word = terminal_layer.add_terminal(text="3", punct=False, paragraph=2)

    leading_items = [entry[0] for entry in terminal_layer.pairs]
    self.assertSequenceEqual(leading_items, [1, 2, 3])

    paragraph_positions = [terminal.para_pos for terminal in terminal_layer.all]
    self.assertSequenceEqual(paragraph_positions, [1, 1, 2])

    # The punctuation terminal is not among the words
    self.assertSequenceEqual(terminal_layer.words, (first_word, second_word))
def build_passage(self, graph, terminals_only=False):
    """Build a Passage from ``graph``.

    :param graph: source graph; its ``id`` becomes the passage ID and its
                  ``format`` is compared against ``self.format``
    :param terminals_only: if true, only layer 0 (terminals) is created
    :return: the created core.Passage
    """
    passage = core.Passage(graph.id)
    # Remember whether the source graph declares itself as UCCA
    self.is_ucca = (graph.format == "ucca")
    # Record this converter's format when the graph has none or the same one
    if graph.format is None or graph.format == self.format:
        passage.extra["format"] = self.format
    self.create_terminals(graph, layer0.Layer0(passage))
    if not terminals_only:
        self.create_non_terminals(graph, layer1.Layer1(passage))
        # NOTE(review): nesting reconstructed from a whitespace-mangled source;
        # presumably pre-terminal linking is only needed once non-terminals
        # exist - confirm against the original file
        graph.link_pre_terminals()
    return passage
def main(args):
    """Write one small UCCA passage file per expression unit of a STREUSLE file.

    Each passage has a root scene with three children: a 'gov' unit, a unit
    tagged with the expression's ``ss`` value, and an 'obj' unit.

    :param args: sequence whose first item is the STREUSLE file and whose
                 second item is the output directory
    """

    def attach_related_token(sent, unit, l0, preterminal, role):
        # Add the token that `unit` heuristically relates to as `role`
        # ('gov' or 'obj') as a terminal below `preterminal`.
        rel = sent['toks'][unit['heuristic_relation']['local_' + role] - 1]
        rel_unit = sent['swes'].get(str(rel['#']))
        if rel_unit is None:
            # Not a single-word expression: look it up among the strong MWEs
            rel_unit = sent['smwes'].get(str(rel.get('smwe', [-1, -1])[0]), None)
        term = create_terminal(rel, rel_unit, l0, False)
        preterminal.add(ul1.EdgeTags.Terminal, term)

    streusle_file = args[0]
    outpath = args[1]

    for doc_id, doc in get_streusle_docs(streusle_file).items():
        for unit in list(doc['exprs'].values()):
            ID = f'{doc_id}_{unit["sent_offs"]}_{unit["local_toknums"][0]}-{unit["local_toknums"][-1]}'
            sent = doc['sents'][int(unit['sent_offs']) - 1]

            passage = ucore.Passage(ID)
            l0 = ul0.Layer0(passage)
            l1 = ul1.Layer1(passage)
            root = l1.add_fnode(l1._head_fnode, ul1.EdgeTags.ParallelScene)

            # gov
            gov_node = l1.add_fnode(root, 'gov')
            if unit['heuristic_relation']['gov'] is not None:
                attach_related_token(sent, unit, l0, gov_node, 'gov')

            # the expression itself, tagged with its `ss` value
            unit_node = l1.add_fnode(root, unit['ss'])
            for toknum in unit["toknums"]:
                tok = doc['toks'][toknum - 1]
                unit_node.add(ul1.EdgeTags.Terminal, create_terminal(tok, unit, l0, True))

            # obj (skipped for units whose lexcat is 'PP')
            obj_node = l1.add_fnode(root, 'obj')
            if unit['heuristic_relation']['obj'] is not None and unit['lexcat'] != 'PP':
                attach_related_token(sent, unit, l0, obj_node, 'obj')

            uconv.passage2file(passage, f'{outpath}/{ID}.xml')
def main(args):
    """Turn every input line into a terminals-only passage and write it out.

    :param args: object providing ``filenames``, ``format`` (a %-template
                 filled with the 1-based line number to form the passage ID),
                 ``out_dir`` and ``binary``
    """
    lines = tqdm(gen_lines(args.filenames), unit=" lines", desc="Creating passages")
    for line_number, line in enumerate(lines, start=1):
        passage = core.Passage(args.format % line_number)
        terminal_layer = layer0.Layer0(passage)
        layer1.Layer1(passage)  # created but left without any units
        for token in line.split():
            # A token counts as punctuation when all of its characters
            # are in PUNCTUATION
            is_punct = PUNCTUATION.issuperset(token)
            terminal_layer.add_terminal(text=token, punct=is_punct)
        write_passage(passage, outdir=args.out_dir, binary=args.binary, verbose=False)
def _build_passage(self, stream):
    """Parse ``stream`` and build a Passage from the records it leaves on ``self``.

    ``self.parse(stream)`` is advanced by one step, after which
    ``self.pending_nodes`` (triples of parent ID, edge tag, node ID),
    ``self.node_by_id``, ``self.node_ids_with_children`` and
    ``self.terminals`` are read. ``self.pending_nodes`` is consumed and
    ``self.node_by_id`` is extended with the created nodes.

    Args:
        stream: input handed to ``self.parse``

    Returns:
        the created Passage, whose ID is ``self.passage_id``.

    Raises:
        ValueError: if some pending node refers to a parent that never
            becomes available, or a terminal refers to an unknown parent.
    """
    p = core.Passage(self.passage_id)
    l0 = layer0.Layer0(p)
    l1 = layer1.Layer1(p)
    paragraph = 1

    next(self.parse(stream))

    # add normal nodes
    self.pending_nodes = list(reversed(self.pending_nodes))
    while self.pending_nodes:
        created_any = False
        # Walk backwards so deleting entry i does not shift the entries
        # that are still to be visited
        for i in reversed(range(len(self.pending_nodes))):
            parent_id, edge_tag, node_id = self.pending_nodes[i]
            # -1 marks "parent not created yet" (None is a valid stored value)
            parent = self.node_by_id.get(parent_id, -1)
            if parent != -1:
                del self.pending_nodes[i]
                implicit = node_id not in self.node_ids_with_children
                node = l1.add_fnode(parent, edge_tag, implicit=implicit)
                if edge_tag == EdgeTags.Punctuation:
                    node.tag = layer1.NodeTags.Punctuation
                self.node_by_id[node_id] = node
                created_any = True
        if not created_any:
            # A full pass that creates nothing leaves the state unchanged,
            # so every later pass would be identical: fail instead of
            # looping forever.
            raise ValueError(
                "Nodes with unknown parents in passage %s: %s"
                % (p.ID, ", ".join("%s (parent %s)" % (node_id, parent_id)
                                   for parent_id, _, node_id in self.pending_nodes)))

    # add terminals
    for text, tag, edge_tag, parent_id in self.terminals:
        punctuation = (tag == layer0.NodeTags.Punct)
        terminal = l0.add_terminal(text=text, punct=punctuation, paragraph=paragraph)
        try:
            parent = self.node_by_id[parent_id]
        except KeyError as e:
            raise ValueError(
                "Terminal ('%s') with bad parent (%s) in passage %s"
                % (text, parent_id, p.ID)) from e
        if parent is None:
            # Warn, then insert an intermediate node to hold the terminal
            print("Terminal is a child of the root: '%s'" % text, file=sys.stderr)
            parent = l1.add_fnode(parent, edge_tag)
        if edge_tag != EdgeTags.Terminal:
            print("Terminal with incoming %s edge: '%s'" % (edge_tag, text),
                  file=sys.stderr)
        parent.add(EdgeTags.Terminal, terminal)

    return p
def multi_sent_with_quotes():
    """Build a fixture passage with several sentences, two paragraphs and quotes.

    Layout: [1 2 [" U] [3 P] H] . [" U] [[5 6 . P] H] [[8 P] . 10 . H]

    Returns:
        the constructed passage (ID "1").
    """
    passage = core.Passage("1")
    terminal_layer = layer0.Layer0(passage)
    annotation = layer1.Layer1(passage)
    tags = layer1.EdgeTags

    # (text, punct, extra keyword arguments) for each terminal, in order
    specs = (
        ("1", False, {}),
        ("2", False, {}),
        ('"', True, {}),
        ("3", False, {}),
        (".", True, {}),
        ('"', True, {}),
        ("5", False, {}),
        ("6", False, {}),
        (".", True, {}),
        ("8", False, {"paragraph": 2}),
        (".", True, {"paragraph": 2}),
        ("10", False, {"paragraph": 2}),
        (".", True, {"paragraph": 2}),
    )
    terms = [terminal_layer.add_terminal(text, punct, **extra)
             for text, punct, extra in specs]

    # Three top-level scenes, then one process under each of them
    scenes = [annotation.add_fnode(None, tags.ParallelScene) for _ in range(3)]
    processes = [annotation.add_fnode(scene, tags.Process) for scene in scenes]
    scene1, _, scene3 = scenes
    process1, process2, process3 = processes

    scene1.add(tags.Terminal, terms[0])
    scene1.add(tags.Terminal, terms[1])
    annotation.add_punct(None, terms[2])
    process1.add(tags.Terminal, terms[3])
    annotation.add_punct(None, terms[4])
    annotation.add_punct(None, terms[5])
    process2.add(tags.Terminal, terms[6])
    process2.add(tags.Terminal, terms[7])
    annotation.add_punct(process2, terms[8])
    process3.add(tags.Terminal, terms[9])
    annotation.add_punct(scene3, terms[10])
    scene3.add(tags.Terminal, terms[11])
    annotation.add_punct(scene3, terms[12])
    return passage
def graph2passage(graph, input):
    """Convert ``graph`` into a Passage whose terminals are slices of ``input``.

    :param graph: source graph; its ``id``, ``nodes`` and ``find_node`` are
                  used, and each node's ``anchors``, ``outgoing_edges``,
                  ``is_top`` and ``id``
    :param input: the text the anchors index into (sliced as ``input[i:j]``)
    :return: the created core.Passage
    """
    passage = core.Passage(graph.id)
    l0 = layer0.Layer0(passage)
    # Distinct (from, to, is_punct) triples over all anchored nodes
    anchors = {(anchor["from"], anchor["to"], is_punct(node)) for node in graph.nodes for anchor in node.anchors or ()}
    # One terminal per anchor, created in sorted span order; keyed by (from, to)
    terminals = {(i, j): l0.add_terminal(text=input[i:j], punct=punct) for i, j, punct in sorted(anchors)}
    l1 = layer1.Layer1(passage)
    # Start from the primary roots: a top node maps to no unit (None),
    # any other root gets a new foundational node
    queue = [(node, None if node.is_top else layer1.FoundationalNode(
        root=l1.root, tag=layer1.NodeTags.Foundational, ID=l1.next_id()))
        for node in graph.nodes if is_primary_root(node)]
    id_to_unit = {node.id: unit for (node, unit) in queue}
    remotes = []  # (parent unit, labels, target id), added once all units exist
    while queue:
        # First-in first-out processing of (graph node, unit) pairs
        parent, parent_unit = queue.pop(0)
        # Group parallel edges to the same target so they become one unit with several labels
        for tgt, edges in groupby(sorted(parent.outgoing_edges, key=attrgetter("tgt")), key=attrgetter("tgt")):
            edges = list(edges)
            labels = [edge.lab for edge in edges]
            # Only the first edge of the group decides whether it is remote
            if is_remote(edges[0]):
                remotes.append((parent_unit, labels, tgt))
            else:
                child = graph.find_node(tgt)
                child_unit = id_to_unit[tgt] = l1.add_fnode_multiple(
                    parent_unit, labels, implicit=is_implicit(child))
                queue.append((child, child_unit))
        for anchor in parent.anchors or ():
            if parent_unit is None:  # Terminal children of the root are not valid in UCCA, so warn but be faithful
                print(
                    "graph2passage(): anchors of the root node converted to Terminal children in ‘{}’."
                    "".format(graph.id), file=sys.stderr)
                # From here on the remaining anchors go to the first head, without another warning
                parent_unit = l1.heads[0]
            parent_unit.add(layer1.EdgeTags.Terminal, terminals[anchor["from"], anchor["to"]])
    for parent, labels, tgt in remotes:
        l1.add_remote_multiple(parent, labels, id_to_unit[tgt])
    return passage
def from_text(text, passage_id='1'):
    """Converts from tokenized strings to a Passage object.

    Args:
        text: a sequence of strings, where each one will be a new paragraph.
            Each string is split on whitespace to obtain its tokens.
        passage_id: ID given to the created Passage.

    Returns:
        a Passage object with only Terminals units. A token is marked as
        punctuation when it consists solely of ``string.punctuation``
        characters.

    """
    p = core.Passage(passage_id)
    l0 = layer0.Layer0(p)
    # string.punctuation contains "[", "\\", "]" and "^", which are special
    # inside a character class. Unescaped, "\]" is read as an escaped bracket
    # and the backslash itself is never matched, so escape the whole set.
    punct = re.compile('^[{}]+$'.format(re.escape(string.punctuation)))

    # Paragraph numbers start at 1
    for paragraph, par in enumerate(text, start=1):
        for token in par.split():
            # Pass a real boolean rather than a Match object / None
            l0.add_terminal(text=token, punct=bool(punct.match(token)),
                            paragraph=paragraph)
    return p
def create_multi_passage():
    """Build a fixture passage with several sentences and two paragraphs.

    Layout: [1 2 [3 P] H] . [[5 6 . P] H] [[8 P] . 10 . H]

    Returns:
        the constructed passage (ID '1').
    """
    passage = core.Passage('1')
    terminal_layer = layer0.Layer0(passage)
    annotation = layer1.Layer1(passage)
    tags = layer1.EdgeTags

    # (text, punct, extra keyword arguments) for each terminal, in order
    specs = (
        ('1', False, {}),
        ('2', False, {}),
        ('3', False, {}),
        ('.', True, {}),
        ('5', False, {}),
        ('6', False, {}),
        ('.', True, {}),
        ('8', False, {'paragraph': 2}),
        ('.', True, {'paragraph': 2}),
        ('10', False, {'paragraph': 2}),
        ('.', True, {'paragraph': 2}),
    )
    terms = [terminal_layer.add_terminal(text, punct, **extra)
             for text, punct, extra in specs]

    # Three top-level scenes, then one process under each of them
    scenes = [annotation.add_fnode(None, tags.ParallelScene) for _ in range(3)]
    processes = [annotation.add_fnode(scene, tags.Process) for scene in scenes]
    scene1, _, scene3 = scenes
    process1, process2, process3 = processes

    scene1.add(tags.Terminal, terms[0])
    scene1.add(tags.Terminal, terms[1])
    process1.add(tags.Terminal, terms[2])
    annotation.add_punct(None, terms[3])
    process2.add(tags.Terminal, terms[4])
    process2.add(tags.Terminal, terms[5])
    annotation.add_punct(process2, terms[6])
    process3.add(tags.Terminal, terms[7])
    annotation.add_punct(scene3, terms[8])
    scene3.add(tags.Terminal, terms[9])
    annotation.add_punct(scene3, terms[10])
    return passage
def empty():
    """Return a passage (ID "1") that has layer 0 and layer 1 but no units added."""
    passage = core.Passage(ID="1")
    # Both layers are created only for their effect on the passage
    layer0.Layer0(passage)
    layer1.Layer1(passage)
    return passage
def l1_passage():
    """Build a 20-terminal fixture passage for exercising layer1 objects.

    Layout (terminal numbers are 1-based; 10 and 20 are punctuation):
        1:     Linker #1, linked with parallel scene #1
        2-10:  Parallel scene #1: 2-5 Process, 6-9 Participant,
               10 punctuation; takes the Adverbial of scene #2 as a
               remote Participant
        11-15: Parallel scene #2: 11-14 Participant, 15 Adverbial;
               takes the Process of scene #1 as a remote Process
        16:    Linker #2, links parallel scenes #2 and #3
        17-19: Parallel scene #3: 17-18 State, 19 Participant, plus an
               implicit Participant
        20:    Punctuation, not under a scene

    Returns:
        the constructed passage (ID "1").
    """
    passage = core.Passage("1")
    terminal_layer = layer0.Layer0(passage)
    annotation = layer1.Layer1(passage)
    tags = layer1.EdgeTags

    terms = []
    for number in range(1, 21):
        terms.append(terminal_layer.add_terminal(text=str(number), punct=(number % 10 == 0)))

    def attach(unit, first, last):
        # Add terms[first..last] (0-based, inclusive) as terminals of `unit`
        for term in terms[first:last + 1]:
            unit.add(tags.Terminal, term)

    # Linker #1
    linker1 = annotation.add_fnode(None, tags.Linker)
    attach(linker1, 0, 0)

    # Scene #1
    scene1 = annotation.add_fnode(None, tags.ParallelScene)
    process1 = annotation.add_fnode(scene1, tags.Process)
    participant1 = annotation.add_fnode(scene1, tags.Participant)
    attach(process1, 1, 4)
    attach(participant1, 5, 8)
    annotation.add_punct(scene1, terms[9])

    # Scene #2
    scene2 = annotation.add_fnode(None, tags.ParallelScene)
    participant2 = annotation.add_fnode(scene2, tags.Participant)
    attach(participant2, 10, 13)
    adverbial2 = annotation.add_fnode(scene2, tags.Adverbial)
    attach(adverbial2, 14, 14)

    # Linker #2
    linker2 = annotation.add_fnode(None, tags.Linker)
    attach(linker2, 15, 15)

    # Scene #3
    scene3 = annotation.add_fnode(None, tags.ParallelScene)
    state3 = annotation.add_fnode(scene3, tags.State)
    attach(state3, 16, 17)
    participant3 = annotation.add_fnode(scene3, tags.Participant)
    attach(participant3, 18, 18)
    annotation.add_fnode(scene3, tags.Participant, implicit=True)

    # Punctuation #20 - not under a scene
    annotation.add_punct(None, terms[19])

    # Remote edges, then linkages L1->H1 and H2<-L2->H3
    annotation.add_remote(scene1, tags.Participant, adverbial2)
    annotation.add_remote(scene2, tags.Process, process1)
    annotation.add_linkage(linker1, scene1)
    annotation.add_linkage(linker2, scene2, scene3)
    return passage
def create_passage(self, verify=True):
    """
    Create final passage from temporary representation
    :param verify: fail if this results in an improper passage; when false,
                   remote edges and linkages that fail their assertions are
                   skipped instead of raising
    :return: core.Passage created from self.nodes
    """
    passage = core.Passage(self.passage.ID)
    l0 = layer0.Layer0(passage)
    # Re-create every terminal, copying text, punctuation status and paragraph
    terminals = [l0.add_terminal(text=terminal.text, punct=terminal.tag == layer0.NodeTags.Punct,
                                 paragraph=terminal.paragraph) for terminal in self.terminals]
    l1 = layer1.Layer1(passage)
    # The temporary root corresponds to the first head of the new layer 1
    self.root.node = l1.heads[0]
    self.root.set_node_label()
    if self.labeled:  # We have a reference passage
        self.root.set_node_id()
        self.fix_terminal_tags(terminals)
    remotes = []  # To be handled after all nodes are created
    linkages = []  # To be handled after all non-linkage nodes are created
    self.topological_sort()  # Sort self.nodes
    for node in self.nodes:
        if self.labeled and verify:
            assert node.text or node.outgoing or node.implicit, "Non-terminal leaf node: %s" % node
        if node.is_linkage:
            linkages.append(node)
        else:
            for edge in node.outgoing:
                if edge.remote:
                    remotes.append((node, edge))
                else:
                    # Primary edge: create the child in layer 1 right away
                    edge.child.add_to_l1(l1, node, edge.tag, terminals, self.labeled)

    for node, edge in remotes:  # Add remote edges
        try:
            assert node.node is not None, "Remote edge from nonexistent node"
            assert edge.child.node is not None, "Remote edge to nonexistent node"
            l1.add_remote(node.node, edge.tag, edge.child.node)
        except AssertionError:
            # Without verification a broken remote edge is silently dropped
            if verify:
                raise

    for node in linkages:  # Add linkage nodes and edges
        try:
            link_relation = None
            link_args = []
            for edge in node.outgoing:
                assert edge.child.node is not None, "Linkage edge to nonexistent node"
                if edge.tag == EdgeTags.LinkRelation:
                    assert link_relation is None, \
                        "Multiple link relations: %s, %s" % (link_relation, edge.child.node)
                    link_relation = edge.child.node
                elif edge.tag == EdgeTags.LinkArgument:
                    link_args.append(edge.child.node)
            assert link_relation is not None, "No link relations: %s" % node
            # Linkages with fewer than two arguments are accepted as they are
            node.node = l1.add_linkage(link_relation, *link_args)
            if node.node_id:  # We are in training and we have a gold passage
                node.node.extra["remarks"] = node.node_id  # For reference
        except AssertionError:
            # Without verification a broken linkage is silently dropped
            if verify:
                raise

    return passage
def test_terminals(self):
    """Tests :class:layer0.Terminal new and inherited functionality."""

    def make_terminal(root, ID, tag, text, paragraph, para_pos):
        # Construct a Terminal directly, bypassing Layer0.add_terminal
        return layer0.Terminal(ID=ID, root=root, tag=tag,
                               attrib={'text': text,
                                       'paragraph': paragraph,
                                       'paragraph_position': para_pos})

    passage = core.Passage('1')
    layer0.Layer0(passage)
    terms = [
        make_terminal(passage, '0.1', layer0.NodeTags.Word, '1', 1, 1),
        make_terminal(passage, '0.2', layer0.NodeTags.Word, '2', 2, 1),
        make_terminal(passage, '0.3', layer0.NodeTags.Punct, '.', 2, 2),
    ]

    other_passage = core.Passage('2')
    layer0.Layer0(other_passage)
    equal_term = make_terminal(other_passage, '0.1', layer0.NodeTags.Word, '1', 1, 1)
    unequal_term = make_terminal(other_passage, '0.2', layer0.NodeTags.Word, 'two', 2, 1)

    # Attribute name -> expected value for each of the three terminals
    expectations = (
        ('punct', [False, False, True]),
        ('text', ['1', '2', '.']),
        ('position', [1, 2, 3]),
        ('paragraph', [1, 2, 2]),
        ('para_pos', [1, 1, 2]),
    )
    for attribute, expected in expectations:
        self.assertSequenceEqual([getattr(term, attribute) for term in terms], expected)

    # Distinct terminals of the same passage are never ==
    for left, right in ((0, 1), (0, 2), (1, 2)):
        self.assertFalse(terms[left] == terms[right])
    self.assertTrue(terms[0] == terms[0])

    # equals() compares across passages
    self.assertTrue(terms[0].equals(equal_term))
    self.assertFalse(terms[1].equals(unequal_term))
def n_evaluate(sent_tensor, model, attn, ori_sent, dev_passage, pos, pos_tensor):
    """
    predict a passage

    Tokens are consumed left to right. Each token (or merged run of tokens)
    gets a layer-0 terminal and a layer-1 node; the attention output then
    decides whether the most recent nodes are grouped under a new node.

    :param sent_tensor: passed to ``model`` together with ``pos_tensor``
    :param model: callable returning ``(output, hidden)``; ``output`` is
                  indexed by token position
    :param attn: callable applied to an output vector; the index of its
                 largest value (``torch.topk(..., 1)``) is the attended position
    :param ori_sent: sequence of token strings
    :param dev_passage: only its ``ID`` is read, for the new passage
    :param pos: sequence of POS tags, indexed like ``ori_sent``
    :param pos_tensor: passed to ``model`` together with ``sent_tensor``
    :return: the predicted core.Passage
    """
    create_by_leftmost = True  # not read anywhere in this function
    max_recur = 5  # upper bound on regrouping rounds per token
    i = 0  # index of the current token
    k = 0  # running counter, stringified and used as the edge tag
    l1_node_list = []  # stack of layer-1 nodes that have no parent yet
    l0_node_list = []  # created terminals, in token order

    output, hidden = model(sent_tensor, pos_tensor)

    # initialize passage
    passageID = dev_passage.ID
    passage = core.Passage(passageID)
    l0 = layer0.Layer0(root=passage)
    l1 = layer1.Layer1(passage)

    while i < len(ori_sent):
        terminal_token = ori_sent[i]
        pos_tag = pos[i]

        # proper nouns: merge only when PROPN is followed by PROPN/NUM,
        # or DET is followed by PROPN
        if pos_tag == "PROPN" and i + 1 < len(ori_sent) and (pos[i + 1] == "PROPN" or pos[i + 1] == "NUM") \
                or (pos_tag == "DET" and i + 1 < len(ori_sent) and pos[i + 1] == "PROPN"):
            left_most_idx = i
            output_i = output[i]
            combine_list = []  # terminals to be merged into one layer-1 node

            # For cases like "April(PROPN) 30(NUM) ,(PUNCT) 2008(NUM)"
            if i + 3 < len(ori_sent) and pos[i + 1] == "NUM" and pos[
                    i + 2] == "PUNCT" and pos[i + 3] == "NUM":
                for _ in range(4):
                    # create terminal node in l0
                    terminal_token = ori_sent[i]
                    is_punc = terminal_token in punc
                    terminal_node = l0.add_terminal(terminal_token, is_punc)
                    l0_node_list.append(terminal_node)
                    combine_list.append(terminal_node)
                    i += 1
            # including cases like "The Bahamas"
            else:
                while True:
                    # create terminal node in l0
                    terminal_token = ori_sent[i]
                    is_punc = terminal_token in punc
                    terminal_node = l0.add_terminal(terminal_token, is_punc)
                    l0_node_list.append(terminal_node)
                    combine_list.append(terminal_node)
                    i += 1
                    if i >= len(ori_sent):
                        break
                    # for cases like "Lara Croft: Tomb Raider"
                    if ori_sent[i] == ":" and i + 1 < len(pos) and pos[
                            i + 1] == "PROPN":
                        continue
                    elif pos[i] != "PROPN":
                        break

            # combine the nodes in combine_list to one node in l1
            l1_position = len(l1._all) + 1
            ID = "{}{}{}".format("1", core.Node.ID_SEPARATOR, l1_position)
            terminal_node_in_l1 = FoundationalNode(
                ID, passage, tag=layer1.NodeTags.Foundational)
            for terminal_node in combine_list:
                terminal_node_in_l1.add(terminal_tag, terminal_node)
            l1_node_list.append(terminal_node_in_l1)
            # step back onto the last merged token
            i -= 1
        else:
            # create terminal node in l0
            is_punc = terminal_token in punc
            terminal_node = l0.add_terminal(terminal_token, is_punc)
            l0_node_list.append(terminal_node)

            l1_position = len(l1._all) + 1
            ID = "{}{}{}".format("1", core.Node.ID_SEPARATOR, l1_position)
            terminal_node_in_l1 = FoundationalNode(
                ID, passage,
                tag=layer1.NodeTags.Punctuation if is_punc else layer1.NodeTags.Foundational)
            terminal_node_in_l1.add(terminal_tag, terminal_node)
            l1_node_list.append(terminal_node_in_l1)

        output_i = output[i]
        attn_i = attn(output_i)
        top_k_value, top_k_ind = torch.topk(attn_i, 1)

        # for debugging
        tki = top_k_ind.data[0][0]

        # attend to the current terminal itself
        if top_k_ind.data[0] >= i:
            i += 1
            continue
        else:
            # Group everything from the attended terminal's parent up to the
            # current node under a new node
            top_k_node = l0_node_list[top_k_ind]
            parent_node = get_parent_node(top_k_node)
            new_node_position = len(l1._all) + 1
            new_node_ID = "{}{}{}".format("1", core.Node.ID_SEPARATOR, new_node_position)
            new_node = FoundationalNode(new_node_ID, passage,
                                        tag=layer1.NodeTags.Foundational)
            children = []
            # Pop until the attended parent is reached.
            # NOTE(review): pop() raises IndexError if parent_node is not on
            # the stack - confirm that cannot happen
            while True:
                item_node = l1_node_list.pop()
                itemid = item_node.ID
                pid = parent_node.ID
                children.append(item_node)
                if item_node.ID == parent_node.ID:
                    # NOTE(review): nesting of `k += 1` reconstructed from a
                    # whitespace-mangled source; assumed to be per child - confirm
                    for child in children:
                        new_node.add(str(k), child)
                        k += 1
                    l1_node_list.append(new_node)
                    break

            left_most_idx = get_left_most_id(new_node)

            # recursive call to see if need to create new node
            for r in range(1, max_recur + 1):
                # Represent the new node by the difference of the outputs at
                # the current position and at its left-most position
                new_node_output = output_i - output[left_most_idx]
                new_node_attn_weight = attn(new_node_output)
                r_top_k_value, r_top_k_ind = torch.topk(new_node_attn_weight, 1)

                #predict out of boundary
                if r_top_k_ind > i:
                    break
                # attend to the new node itself
                elif left_most_idx <= r_top_k_ind <= i:
                    break
                # create new node
                else:
                    r_top_k_node = l0_node_list[r_top_k_ind]
                    r_parent_node = get_parent_node(r_top_k_node)
                    new_node_position = len(l1._all) + 1
                    new_node_ID = "{}{}{}".format("1", core.Node.ID_SEPARATOR, new_node_position)
                    new_node = FoundationalNode(new_node_ID, passage,
                                                tag=layer1.NodeTags.Foundational)
                    children = []
                    while True:
                        item_node = l1_node_list.pop()
                        children.append(item_node)
                        if item_node.ID == r_parent_node.ID:
                            for child in children:
                                new_node.add(str(k), child)
                                k += 1
                            l1_node_list.append(new_node)
                            break

                    left_most_idx = get_left_most_id(new_node)

        i += 1

    # check if Node(1.1) is empty; if so, hang all remaining nodes under it
    head_node = l1.heads[0]
    if len(head_node.get_terminals()) == 0:
        for node in l1_node_list:
            head_node.add(str(k), node)
            k += 1

    return passage
def passage2():
    """Build and return the fixture passage with ID "2".

    The passage holds 20 terminals whose texts are "1".."20"; terminals 10
    and 20 are created as punctuation.  Units are created in this order:

        1      Linker #1 (no parent)
        2-10   scene ps1 (no parent): 2-5 Process p1, 6-9 Participant a1,
               10 punctuation
        11-14  unit a2 (tagged ParallelScene) under ps2, which is under ps23
        15     Elaborator d2, attached under ps1
        16     Linker #2 under ps23
        17-19  scene ps3 under ps23: 17-18 Process p3, 19 Participant a3,
               plus one implicit Participant
        20     punctuation with no parent

    Remote edges: ps1 -Participant-> d2, ps1 -Participant-> a3,
    ps2 -State-> p1.  Linkages: link1 with ps1; link2 with ps2 and ps3.

    The trailing ``# true`` / ``# false`` markers are carried over from the
    original annotation; presumably they flag whether the unit agrees with a
    reference passage -- TODO confirm.
    """
    passage = core.Passage("2")
    terminal_layer = layer0.Layer0(passage)
    unit_layer = layer1.Layer1(passage)
    tags = layer1.EdgeTags

    # 20 terminals (texts 1-20); every tenth one is punctuation
    terms = []
    for number in range(1, 21):
        terms.append(terminal_layer.add_terminal(text=str(number),
                                                 punct=(number % 10 == 0)))

    def attach(unit, start, stop):
        """Add terms[start:stop] under ``unit`` as Terminal edges, in order."""
        for term in terms[start:stop]:
            unit.add(tags.Terminal, term)

    # Linker #1 holding terminal 1
    link1 = unit_layer.add_fnode(None, tags.Linker)  # true
    attach(link1, 0, 1)

    # Scene #1: Process over 2-5, Participant over 6-9, punctuation 10
    ps1 = unit_layer.add_fnode(None, tags.ParallelScene)  # true
    p1 = unit_layer.add_fnode(ps1, tags.Process)  # true
    a1 = unit_layer.add_fnode(ps1, tags.Participant)  # true
    attach(p1, 1, 5)
    attach(a1, 5, 9)
    unit_layer.add_punct(ps1, terms[9])

    # Grouping unit #23 with scene #2 nested inside it
    ps23 = unit_layer.add_fnode(None, tags.ParallelScene)  # true
    ps2 = unit_layer.add_fnode(ps23, tags.ParallelScene)  # true
    a2 = unit_layer.add_fnode(ps2, tags.ParallelScene)  # false
    attach(a2, 10, 14)
    # Terminal 15 goes into an Elaborator that hangs off scene #1
    d2 = unit_layer.add_fnode(ps1, tags.Elaborator)  # false
    attach(d2, 14, 15)

    # Linker #2 holding terminal 16
    link2 = unit_layer.add_fnode(ps23, tags.Linker)  # true
    attach(link2, 15, 16)

    # Scene #3: Process over 17-18, Participant over 19, implicit Participant
    ps3 = unit_layer.add_fnode(ps23, tags.ParallelScene)  # true
    p3 = unit_layer.add_fnode(ps3, tags.Process)  # false
    attach(p3, 16, 18)
    a3 = unit_layer.add_fnode(ps3, tags.Participant)  # true
    attach(a3, 18, 19)
    unit_layer.add_fnode(ps3, tags.Participant, implicit=True)

    # Terminal 20: punctuation that is not placed under any scene
    unit_layer.add_punct(None, terms[19])

    # Remote edges, added in the same order as the original fixture
    for parent, tag, child in ((ps1, tags.Participant, d2),
                               (ps1, tags.Participant, a3),
                               (ps2, tags.State, p1)):
        unit_layer.add_remote(parent, tag, child)

    # Linkages: link1 -> ps1, and link2 joining ps2 with ps3
    unit_layer.add_linkage(link1, ps1)
    unit_layer.add_linkage(link2, ps2, ps3)

    return passage
def evaluate_with_label(sent_tensor, model, a_model, label_model, s_model, rm_model, rm_lstm_model, ori_sent,
                        dev_passage, pos, pos_tensor, labels, label2index, ent, ent_tensor, case_tensor, unroll):
    """
    Greedily build a predicted passage for one sentence, left to right.

    For every token a layer-0 terminal and a wrapping layer-1 node are
    created.  The attention model then picks an earlier position; when it
    does, the layer-1 nodes from that position's parent up to the current
    token are either merged into one multi-terminal node or grouped under a
    new parent whose edges are labelled by ``label_model``.  Grouping is
    retried up to ``max_recur`` (7) times per token.

    Relies on module-level names that are not defined in this function:
    ``predict_l1``, ``punc``, ``terminal_tag``, ``FoundationalNode``,
    ``get_parent_node``, ``get_left_most_id``, ``get_right_most_id``,
    ``get_primary_parent``, ``defaultdict``, ``torch``, ``core``, ``layer0``
    and ``layer1``.

    :param sent_tensor: forwarded unchanged to ``model`` and ``rm_lstm_model``
    :param model: called as ``model(sent_tensor, pos_tensor, ent_tensor,
        case_tensor, unroll)``; returns ``(output, hidden)`` where ``output``
        is indexed by token position (presumably one encoding per token --
        TODO confirm shape)
    :param a_model: called as ``a_model(encoding, output_2d, i)``; the top-1
        index of its result is used as the attended token position
    :param label_model: called as ``label_model(parent_enc, child_enc)``; the
        top-1 index of its result is looked up in ``labels``
    :param s_model: span encoder applied to slices of ``output``; a ``str``
        value sets ``using_s_model`` to False
    :param rm_model: remote-edge scorer; a ``str`` value disables remote-edge
        prediction
    :param rm_lstm_model: called with the same arguments as ``model``, only
        when ``rm_model`` is not a ``str``
    :param ori_sent: sequence of tokens; its length bounds the main loop
    :param dev_passage: only its ``ID`` is read, to name the new passage
    :param pos: indexed per token; the value read is not used afterwards
    :param pos_tensor: forwarded unchanged to the models
    :param labels: indexable collection mapping a predicted index to an edge
        label
    :param label2index: not referenced by the live code (only by
        commented-out code)
    :param ent: indexed per token; the value read is not used afterwards
    :param ent_tensor: forwarded unchanged to the models
    :param case_tensor: forwarded unchanged to the models
    :param unroll: forwarded to the models; when truthy the recursion step
        also passes ``inp_hidden=hidden[left - 1]`` to ``s_model``
    :return: the newly built ``core.Passage``
    """

    # print("original sent")
    # print(ori_sent)

    # NOTE(review): not read anywhere else in this function.
    create_by_leftmost = True

    # A str passed in place of a model acts as the "disabled" marker.
    using_s_model = False
    if not isinstance(s_model, str):
        using_s_model = True

    using_rm_model = False
    if not isinstance(rm_model, str):
        using_rm_model = True
        # Separate encoder pass whose output feeds the remote-edge scorer.
        output_rm, hidden_rm = rm_lstm_model(sent_tensor, pos_tensor, ent_tensor, case_tensor, unroll)
        output_2d_rm = output_rm.squeeze(1)

    # Upper bound on how many parents may be stacked at one token.
    max_recur = 7
    i = 0
    sent_length = len(ori_sent)

    # l1_node_list is used as a stack of layer-1 nodes not yet given a parent;
    # l0_node_list holds the terminals by token index.
    l1_node_list = []
    l0_node_list = []
    # node -> encoding used as the "child" input of label_model
    node_encoding = {}
    # node -> [left, right] token indices; written but only read into an
    # unused local below
    ck_node_encoding = {}

    output, hidden = model(sent_tensor, pos_tensor, ent_tensor, case_tensor, unroll)
    output_2d = output.squeeze(1)

    # initialize passage
    passageID = dev_passage.ID
    passage = core.Passage(passageID)
    l0 = layer0.Layer0(root=passage)
    l1 = layer1.Layer1(passage)

    # NOTE(review): only referenced by commented-out code.
    predicted_scene = False
    # token indices that already belong to a merged multi-terminal node
    already_in_propn = []
    # NOTE(review): filled below but never consumed by the live code; the
    # consumers are commented out.
    rm_to_add = defaultdict(list)

    while i < sent_length:
        terminal_token = ori_sent[i]
        pos_tag = pos[i]
        ent_type = ent[i]

        if not predict_l1:
            # moved to l0_l1_rule.py
            pass
        # predict l0 to l1
        else:
            # create terminal node in l0
            is_punc = terminal_token in punc
            terminal_node = l0.add_terminal(terminal_token, is_punc)
            l0_node_list.append(terminal_node)

            # Wrap the terminal in its own layer-1 node; the ID is built by
            # hand from the current size of the layer.
            l1_position = len(l1._all) + 1
            ID = "{}{}{}".format("1", core.Node.ID_SEPARATOR, l1_position)
            terminal_node_in_l1 = FoundationalNode(ID, passage,
                                                   tag=layer1.NodeTags.Punctuation if is_punc else layer1.NodeTags.Foundational)
            terminal_node_in_l1.add(terminal_tag, terminal_node)
            l1_node_list.append(terminal_node_in_l1)

            node_encoding[terminal_node_in_l1] = output[i]
            ck_node_encoding[terminal_node_in_l1] = [i, i]

        output_i = output[i]
        attn_i = a_model(output_i, output_2d, i)
        top_k_value, top_k_ind = torch.topk(attn_i, 1)

        # for debugging
        tki = top_k_ind.data[0][0]

        # attend to the current terminal itself (or to a position to its
        # right): nothing to group yet, move on to the next token
        if top_k_ind.data[0] >= i:
            # # remote node to a node to the right of the parent
            # if i in rm_to_add:
            #     for remote_pred in rm_to_add[i]:
            #         rm_parent, rm_label = remote_pred
            #         rm_parent.add(rm_label, terminal_node_in_l1, edge_attrib={'remote': True})

            i += 1
            continue
        else:
            top_k_node = l0_node_list[top_k_ind]
            parent_node = get_parent_node(top_k_node)
            # new_node_position = len(l1._all) + 1
            # new_node_ID = "{}{}{}".format("1", core.Node.ID_SEPARATOR, new_node_position)
            # new_node = FoundationalNode(new_node_ID, passage, tag=layer1.NodeTags.Foundational)

            """TODO: check this. not sure if it should be the left most child or top_k_ind"""
            debug_left_most_id = get_left_most_id(parent_node)
            # debug_left_most_id = top_k_ind

            # if using_s_model:
            #     output_boundary = output[debug_left_most_id: i + 1]
            #     if unroll and debug_left_most_id > 0:
            #         new_node_enc, combine_l0 = s_model(output_boundary, inp_hidden=hidden[debug_left_most_id - 1],
            #                                            layer0=True)
            #     else:
            # NOTE(review): s_model is called here regardless of using_s_model,
            # so a str s_model would fail at this point -- confirm intended.
            output_boundary = output[debug_left_most_id: i + 1]
            new_node_enc, combine_l0, is_dis = s_model(output_boundary, layer0=True, dis=True)

            if using_rm_model:
                output_boundary_rm = output_rm[debug_left_most_id: i + 1]
                new_node_enc_rm, _ = s_model(output_boundary_rm)
            # else:
            #     new_node_enc = output[i] - output[debug_left_most_id]

            # Top-1 class of the "combine in layer 0" and "is_dis" outputs.
            propn_topk_value, propn_topk_ind = torch.topk(combine_l0, 1)
            dis_topk_value, dis_topk_ind = torch.topk(is_dis, 1)

            # need to combine nodes in l0
            if dis_topk_ind.data[0] == 1 and propn_topk_ind.data[0] == 1:
                # Move the attended terminal under the current token's
                # layer-1 node; the old edge is dropped by clearing the
                # private edge lists directly.
                dis_left_node_l0 = l0_node_list[top_k_ind]
                dis_left_node_l1 = dis_left_node_l0.parents[0]
                dis_left_node_l0._incoming = []
                dis_left_node_l1._outgoing = []
                terminal_node_in_l1.add(terminal_tag, dis_left_node_l0)

                # i += 1
                # continue

            combined = False
            if propn_topk_ind.data[0] == 1 and dis_topk_ind.data[0] == 0 and \
                    debug_left_most_id not in already_in_propn:
                # check if within the left and right boundary if there is already a node in propn
                valid_attention = True
                for j in range(debug_left_most_id, i + 1):
                    if j in already_in_propn:
                        valid_attention = False

                if valid_attention:
                    # Pop stack entries until the one starting at the left
                    # boundary has been taken.
                    combine_list = []
                    while True:
                        item_node = l1_node_list.pop()
                        l1_node_to_l0_idx = get_left_most_id(item_node)
                        itemid = item_node.ID
                        pid = parent_node.ID
                        combine_list.append(item_node)
                        if l1_node_to_l0_idx == debug_left_most_id:
                            break

                    # make sure not to attend to a node with parents
                    for ck_node in combine_list:
                        # ck_node can be a combined node
                        ck_node_l0 = l0_node_list[get_left_most_id(ck_node)]
                        ck_node_l1 = ck_node_l0.parents[0]
                        if len(ck_node_l1.parents) > 0:
                            valid_attention = False
                            break

                    # push back without change
                    if not valid_attention:
                        combined = False
                        # to be consistent with popping, we loop in the reverse order
                        for ck_node in reversed(combine_list):
                            l1_node_list.append(ck_node)
                    else:
                        combined = True
                        l1_position = len(l1._all) + 1
                        ID = "{}{}{}".format("1", core.Node.ID_SEPARATOR, l1_position)
                        # From here on terminal_node_in_l1 names the merged node.
                        terminal_node_in_l1 = FoundationalNode(ID, passage, tag=layer1.NodeTags.Foundational)
                        for l1_node in combine_list:
                            assert len(l1_node.children) == 1, "l1_node has more than 1 children"
                            terminal_node = l1_node.children[0]
                            # remove node_in_l1
                            # cannot use "remove" function
                            # l1_node.remove(terminal_node)
                            terminal_node._incoming = []
                            l1_node._outgoing = []
                            # if remove node from l1 then ID will be a problem
                            # try:
                            #     l1._remove_node(l1_node)
                            # except:
                            #     pass

                            # combine nodes
                            terminal_node_in_l1.add(terminal_tag, terminal_node)
                            already_in_propn.append(get_left_most_id(terminal_node))

                        l1_node_list.append(terminal_node_in_l1)
                        left_most_idx = get_left_most_id(terminal_node_in_l1)
                        node_encoding[terminal_node_in_l1] = new_node_enc
                        ck_node_encoding[terminal_node_in_l1] = [debug_left_most_id, i]

                        # # remote node to a node to the right of the parent
                        # if i in rm_to_add:
                        #     for remote_pred in rm_to_add[i]:
                        #         rm_parent, rm_label = remote_pred
                        #         rm_parent.add(rm_label, terminal_node_in_l1,
                        #                       edge_attrib={'remote': True})

            if not combined:
                # No merge happened: create a parent over everything from
                # parent_node to the top of the stack.
                children = []
                new_node_position = len(l1._all) + 1
                new_node_ID = "{}{}{}".format("1", core.Node.ID_SEPARATOR, new_node_position)
                new_node = FoundationalNode(new_node_ID, passage, tag=layer1.NodeTags.Foundational)

                # Pops until parent_node is found; raises IndexError from
                # list.pop() if parent_node is not on the stack.
                while True:
                    item_node = l1_node_list.pop()
                    itemid = item_node.ID
                    pid = parent_node.ID
                    children.append(item_node)
                    if item_node.ID == parent_node.ID:
                        for child in children:
                            child_enc = node_encoding[child]
                            ck_child_enc = ck_node_encoding[child]
                            label_weight = label_model(new_node_enc, child_enc)
                            # restrict predicting "H" label
                            label_top_k_value, label_top_k_ind = torch.topk(label_weight, 1)
                            # label_top_k_values, label_top_k_inds = torch.topk(label_weight, 2)
                            # label_top_k_ind = label_top_k_inds[0][0]
                            # if label_top_k_ind == label2index["H"]:
                            #     if not (debug_left_most_id == 0 and i == len(ori_sent) - 1):
                            #         label_top_k_ind = label_top_k_inds[0][1]
                            #     else:
                            #         predicted_scene = True
                            pred_label = labels[label_top_k_ind]
                            new_node.add(pred_label, child)

                        # predict remote edge
                        if using_rm_model:
                            rm_weight = rm_model(new_node_enc_rm, output_2d_rm, sent_length)
                            rm_top_k_value, rm_top_k_ind = torch.topk(rm_weight, 1)
                            # Target left of the new node: add the remote
                            # edge now.  Target to the right: only record it.
                            if rm_top_k_ind < get_left_most_id(new_node):
                                rm_pred_label = "A"
                                new_node.add(rm_pred_label, get_primary_parent(l0_node_list[rm_top_k_ind]),
                                             edge_attrib={'remote': True})
                            elif rm_top_k_ind > get_right_most_id(new_node):
                                rm_pred_label = "A"
                                # new_node.add(rm_pred_label, get_primary_parent(l0_node_list[rm_top_k_ind]),
                                #              edge_attrib={'remote': True})
                                rm_to_add[rm_top_k_ind.data.cpu().numpy()[0][0]].append((new_node, rm_pred_label))

                        l1_node_list.append(new_node)
                        node_encoding[new_node] = new_node_enc
                        ck_node_encoding[new_node] = [debug_left_most_id, i]
                        break

                left_most_idx = get_left_most_id(new_node)

            # Never let the span start to the right of the attended position.
            if left_most_idx > top_k_ind:
                left_most_idx = top_k_ind

            # recursive call to see if need to create new node
            for r in range(1, max_recur + 1):
                if using_s_model:
                    output_boundary = output[left_most_idx: i + 1]
                    # Diagnostic output only: an empty slice is reported but
                    # execution continues.
                    if left_most_idx >= i + 1:
                        print("ERROR:")
                        print("Combined?")
                        print(combined)
                        print("left_most_idx")
                        print(left_most_idx)
                        print("i")
                        print(i)
                    if unroll and left_most_idx > 0:
                        new_node_output, combine_l0 = s_model(output_boundary, inp_hidden=hidden[left_most_idx - 1])
                    else:
                        new_node_output, combine_l0 = s_model(output_boundary)
                else:
                    new_node_output = output[i] - output[left_most_idx]

                new_node_attn_weight = a_model(new_node_output, output_2d, i)
                r_top_k_value, r_top_k_ind = torch.topk(new_node_attn_weight, 1)

                # predict out of boundary
                if r_top_k_ind > i:
                    break
                # attend to the new node itself
                elif left_most_idx <= r_top_k_ind <= i:
                    break
                # create new node
                else:
                    r_top_k_node = l0_node_list[r_top_k_ind]
                    r_parent_node = get_parent_node(r_top_k_node)
                    new_node_position = len(l1._all) + 1
                    new_node_ID = "{}{}{}".format("1", core.Node.ID_SEPARATOR, new_node_position)
                    new_node = FoundationalNode(new_node_ID, passage, tag=layer1.NodeTags.Foundational)

                    """TODO: same as before. check this. not sure if it should be the left most child or top_k_ind"""
                    debug_left_most_id = get_left_most_id(r_parent_node)
                    if debug_left_most_id > r_top_k_ind:
                        debug_left_most_id = r_top_k_ind

                    if using_s_model:
                        output_boundary = output[debug_left_most_id: i + 1]
                        if unroll and debug_left_most_id > 0:
                            r_new_node_enc, combine_l0 = s_model(output_boundary,
                                                                 inp_hidden=hidden[debug_left_most_id - 1])
                        else:
                            r_new_node_enc, combine_l0 = s_model(output_boundary)
                        if using_rm_model:
                            output_boundary_rm = output_rm[debug_left_most_id: i + 1]
                            r_new_node_enc_rm, _ = s_model(output_boundary_rm)
                    else:
                        r_new_node_enc = output[i] - output[debug_left_most_id]
                        # r_new_node_enc = output[i] - output[get_left_most_id(r_parent_node)]

                    # Same pop-until-parent grouping as above, using the
                    # encodings computed for this recursion step.
                    children = []
                    while True:
                        item_node = l1_node_list.pop()
                        children.append(item_node)
                        if item_node.ID == r_parent_node.ID:
                            for child in children:
                                child_enc = node_encoding[child]
                                ck_child_enc = ck_node_encoding[child]
                                label_weight = label_model(r_new_node_enc, child_enc)
                                # restrict predicting "H" label
                                label_top_k_value, label_top_k_ind = torch.topk(label_weight, 1)
                                # label_top_k_values, label_top_k_inds = torch.topk(label_weight, 2)
                                # label_top_k_ind = label_top_k_inds[0][0]
                                # if label_top_k_ind == label2index["H"]:
                                #     if not (debug_left_most_id == 0 and i == len(ori_sent) - 1):
                                #         label_top_k_ind = label_top_k_inds[0][1]
                                #     else:
                                #         predicted_scene = True
                                pred_label = labels[label_top_k_ind]
                                new_node.add(pred_label, child)

                            # predict remote edge
                            if using_rm_model:
                                rm_weight = rm_model(r_new_node_enc_rm, output_2d_rm, sent_length)
                                rm_top_k_value, rm_top_k_ind = torch.topk(rm_weight, 1)
                                if rm_top_k_ind < get_left_most_id(new_node):
                                    rm_pred_label = "A"
                                    new_node.add(rm_pred_label, get_primary_parent(l0_node_list[rm_top_k_ind]),
                                                 edge_attrib={'remote': True})
                                elif rm_top_k_ind > get_right_most_id(new_node):
                                    rm_pred_label = "A"
                                    # new_node.add(rm_pred_label, get_primary_parent(l0_node_list[rm_top_k_ind]),
                                    #              edge_attrib={'remote': True})
                                    rm_to_add[rm_top_k_ind.data.cpu().numpy()[0][0]].append((new_node, rm_pred_label))

                            l1_node_list.append(new_node)
                            """WARNING: seems this is wrong. changed"""
                            # node_encoding[new_node] = output[i] - r_new_node_enc
                            node_encoding[new_node] = r_new_node_enc
                            ck_node_encoding[new_node] = [debug_left_most_id, i]
                            break

                    # The next recursion step starts from the node just built.
                    left_most_idx = get_left_most_id(new_node)

        i += 1

    # # check if Node(1.1) is empty
    # if not predicted_scene:
    #     head_node = l1.heads[0]
    #     head_node_enc = output[-1] - output[0]
    #     for node in l1_node_list:
    #         # print(node.get_terminals())
    #         current_node_encoding = node_encoding[node]
    #         label_weight = label_model(head_node_enc, current_node_encoding)
    #         label_top_k_value, label_top_k_ind = torch.topk(label_weight, 1)
    #         pred_label = labels[label_top_k_ind]
    #         head_node.add(pred_label, node)

    # passage = clean_nodes(passage)

    # print(passage.ID)
    # ioutil.write_passage(passage, outdir="pred_test/")

    return passage
def test_terminals():
    """Exercise :class:`layer0.Terminal` properties, ``==`` and ``equals``.

    Three terminals (two words and one punctuation mark) are created by hand
    in passage "1"; two more are created in passage "2", one with attributes
    matching terminal 0.1 and one whose text differs from terminal 0.2.
    """
    word_tag = layer0.NodeTags.Word
    punct_tag = layer0.NodeTags.Punct

    def make_terminal(root, node_id, tag, text, paragraph, position_in_paragraph):
        """Create a Terminal in ``root`` with the three standard attributes."""
        return layer0.Terminal(ID=node_id, root=root, tag=tag,
                               attrib={"text": text,
                                       "paragraph": paragraph,
                                       "paragraph_position": position_in_paragraph})

    passage = core.Passage("1")
    layer0.Layer0(passage)
    specs = (
        ("0.1", word_tag, "1", 1, 1),
        ("0.2", word_tag, "2", 2, 1),
        ("0.3", punct_tag, ".", 2, 2),
    )
    terms = [make_terminal(passage, *spec) for spec in specs]

    other_passage = core.Passage("2")
    layer0.Layer0(other_passage)
    equal_term = make_terminal(other_passage, "0.1", word_tag, "1", 1, 1)
    unequal_term = make_terminal(other_passage, "0.2", word_tag, "two", 2, 1)

    # Properties derived from tag, attributes and creation order
    assert [term.punct for term in terms] == [False, False, True]
    assert [term.text for term in terms] == ["1", "2", "."]
    assert [term.position for term in terms] == [1, 2, 3]
    assert [term.paragraph for term in terms] == [1, 2, 2]
    assert [term.para_pos for term in terms] == [1, 1, 2]

    # `==` distinguishes every pair of distinct terminals but is reflexive
    for left, right in ((0, 1), (0, 2), (1, 2)):
        assert not (terms[left] == terms[right])
    assert terms[0] == terms[0]

    # equals() compares across passages
    assert terms[0].equals(equal_term)
    assert not (terms[1].equals(unequal_term))

    # Copying only layer 0 yields a passage equal to its source
    for source in (passage, other_passage):
        assert source.copy(layer0.LAYER_ID).equals(source)