def phrases(ccg):
    """Yield a ``Readible.Phrase`` for each brace-delimited span that is a phrase.

    ``ccg`` is walked with ``paren_utils.paren_iter`` using ``{`` / ``}`` as
    the delimiters and ``bottom_up=True``.  Spans rejected by
    ``Readible.Phrase_RE.match`` are skipped; the rest are wrapped in
    ``Readible.Phrase`` and yielded in iteration order.
    """
    spans = paren_utils.paren_iter(
        ccg, bottom_up=True, lparen='{', rparen='}')
    for span in spans:
        if not Readible.Phrase_RE.match(span):
            continue
        yield Readible.Phrase(span)
def clean_parens(tag):
    """Return ``tag`` with its parentheses rewritten by three ``CCG_Tag`` patterns.

    The tag is first converted to a depth-marked form by
    ``paren_utils.mark_depth`` (presumably ``(`` / ``)`` at depth *d* become
    ``<d>`` / ``</d>`` -- inferred from the replacements below; confirm in
    ``paren_utils``).  Every span of ``(tag)`` is then visited bottom-up and
    at most one rewrite is applied to the marked tag per span:

    * ``_Modifier_RE`` matches and its ``a`` / ``b`` groups are equal once
      ``CCG_Tag.remove_features`` is applied: both marked groups become
      literal ``(a)`` and ``(b)`` around the slash.
    * otherwise ``_Left_RE`` is found: the marked group ``a`` loses its
      markers entirely.
    * otherwise ``_Right_RE`` is found: the marked group ``a`` becomes a
      literal ``(a)``.

    Each rewrite replaces only the first occurrence in the marked tag.  The
    result is passed through ``paren_utils.unmark_depth`` and returned.
    """
    marked_tag = paren_utils.mark_depth(tag)
    for t in paren_utils.paren_iter(f'({tag})', bottom_up=True):
        # Depth of this span inside the full tag, shifted by one so that it
        # lines up with the marker numbers used in ``marked_tag``.
        depth = paren_utils.depth_at(tag, tag.index(t)) + 1
        t = paren_utils.mark_depth(t)
        # Turn every marker numbered 2 and above back into a literal
        # parenthesis; only the ``<1>`` markers of the span stay marked.
        j = 2
        while f'<{j}>' in t:
            t = t.replace(f'<{j}>', '(').replace(f'</{j}>', ')')
            j += 1

        mod = CCG_Tag._Modifier_RE.match(t)
        if mod and CCG_Tag.remove_features(
                mod.group('a')) == CCG_Tag.remove_features(mod.group('b')):
            a = mod.group('a')
            b = mod.group('b')
            slash = mod.group('slash')
            marked_tag = marked_tag.replace(
                f'<{depth}>{a}</{depth}>{slash}<{depth}>{b}</{depth}>',
                f'({a}){slash}({b})', 1)
            continue

        # search() both tests and extracts, so a hit that starts past
        # position 0 can never leave us holding a ``None`` match object.
        left = CCG_Tag._Left_RE.search(t)
        if left:
            a = left.group('a')
            marked_tag = marked_tag.replace(f'<{depth}>{a}</{depth}>', a, 1)
            continue

        right = CCG_Tag._Right_RE.search(t)
        if right:
            a = right.group('a')
            marked_tag = marked_tag.replace(
                f'<{depth}>{a}</{depth}>', f'({a})', 1)
    return paren_utils.unmark_depth(marked_tag)
def edge_triples(self):
    """Return the ``(source, relation, target)`` triples of this graph.

    The list is computed once, stored in ``self.edge_triples_memoized`` and
    flagged through ``self.edges_are_memoized``; later calls return the
    stored list.

    For every match of ``self.EDGE_RE`` in ``str(self)``, the spans produced
    by ``paren_utils.paren_iter`` are scanned for the first one that holds
    the relation at depth 0.  The source id is the text before ``/`` in that
    span's leading node; the target id is the ``id`` group of the node that
    follows the relation.  If no target node follows, a message is printed
    and ``'?'`` is recorded as the target.
    """
    if self.edges_are_memoized:
        return self.edge_triples_memoized
    memo = []
    text = str(self)
    idx = [(e.start(), e.group()) for e in self.EDGE_RE.finditer(text)]
    for rel_position, rel in idx:
        # The relation is matched literally, followed by an optional "("
        # and the target node.
        target_re = re.compile(
            rf'{re.escape(rel)}\s*[(]?\s*{self.NODE_RE.pattern}')
        for t in paren_utils.paren_iter(text):
            # Offset of the relation relative to the start of this span.
            pos = rel_position - text.index(t)
            if pos < 0:
                # The span starts after the relation.  A negative start
                # would make str.startswith count from the end of ``t``
                # and could report a spurious hit.
                continue
            if t.startswith(rel, pos) and paren_utils.depth_at(t, pos) == 0:
                root = self.NODE_RE.match(t)
                source = root.group().split('/')[0].strip()
                x = target_re.match(t, pos=pos)
                if x:
                    target = x.group('id')
                    memo.append((source, rel, target))
                else:
                    print('Missing target node! ', rel,
                          re.sub(r'\s+', ' ', t))
                    memo.append((source, rel, '?'))
                # Only the first span that owns the relation is used.
                break
    self.edge_triples_memoized = memo
    self.edges_are_memoized = True
    return memo
def latex(text):
    """Return TikZ source that draws the AMR given in ``text``.

    ``text`` is parsed with ``AMR`` and re-serialised with ``str``.  One
    ``\\node`` is emitted per span yielded by ``paren_utils.paren_iter``,
    positioned with ``AMR_Latex.get_x`` / ``get_y`` and coloured with
    ``AMR_Latex.get_color``.  One ``\\draw`` is emitted per
    ``(edge, edge id)`` pair taken from ``amr.edges()`` and
    ``amr.edge_ids()``.

    The returned string is a complete ``tikzpicture`` environment, including
    the five colour styles the nodes may reference.

    Raises:
        KeyError: if an edge id names a source or target that was not seen
            as a node.
    """
    amr = AMR(text)
    text = str(amr)
    # Give every "xN / concept" node its own pair of parentheses.
    for x in re.findall(r'x[0-9]+ ?/ ?[^()\s]+', text):
        text = text.replace(x, '(' + x + ')')
    edges = list(zip(amr.edges(), amr.edge_ids()))
    elems = []
    max_depth = paren_utils.max_depth(text)
    depth = 0
    i = 0
    node_depth = {}
    for t in paren_utils.paren_iter(text):
        node = amr.NODE_RE.match(t).group()
        node_id = node.split('/')[0].strip()
        # clean node
        if re.match('x[0-9]+/', node):
            node = node.split('/')[1]
        # First two double quotes become LaTeX opening/closing quotes.
        node = node.replace('"', '``', 1).replace('"', "''", 1)
        prev_depth = depth
        depth = paren_utils.depth_at(text, text.index(t))
        if depth > prev_depth:
            # Restart the running index whenever the depth increases.
            i = 0
        node_depth[node_id] = depth
        num_nodes = paren_utils.mark_depth(text).count(f'<{depth}>')
        x = AMR_Latex.get_x(i, num_nodes)
        y = AMR_Latex.get_y(depth, max_depth)
        color = AMR_Latex.get_color(i)
        elems.append(
            f'\t\\node[{color}]({node_id}) at ({x},{y}) {{{node}}};')
        i += 1
    for edge, edge_id in edges:
        # Edge ids are split on "_"; fields 0 and 2 are the endpoints.
        parts = edge_id.split('_')
        source = parts[0]
        target = parts[2]
        # Choose the anchors from the relative depth of the two endpoints.
        if node_depth[source] > node_depth[target]:
            dir1, dir2 = 'north', 'south'
        elif node_depth[source] == node_depth[target]:
            dir1, dir2 = 'north', 'north'
        else:
            dir1, dir2 = 'south', 'north'
        elems.append(
            f'\t\\draw[->, thick] ({source}.{dir1}) -- ({target}.{dir2}) node[midway, above, sloped] {{{edge}}};'
        )
    pieces = ['\n\\begin{tikzpicture}[\n']
    for style in ('red', 'blue', 'green', 'purple', 'orange'):
        pieces.append(
            f'{style}/.style={{rectangle, draw={style}!60, fill={style}!5, very thick, minimum size=7mm}},\n'
        )
    pieces.append(']\n')
    pieces.append('\n'.join(elems))
    pieces.append('\n\\end{tikzpicture}\n')
    return ''.join(pieces)
def named_entities(self):
    """Yield a small ``AMR`` for every span that carries a ``:name`` relation.

    Each span from ``paren_utils.paren_iter(str(self))`` is depth-marked
    with ``paren_utils.mark_depth``.  It is then matched against a pattern
    made of the span's leading node (``self.NODE_RE``), any text, and
    ``:name`` followed by a ``<1>...</1>`` group.  On a match, a new ``AMR``
    is built from the root node and the content of that group, formatted as
    ``(root :name (name) )``.
    """
    ne_re = re.compile(
        rf'(?P<root>{self.NODE_RE.pattern}).*:name\s+<1>(?P<name>.*?)</1>',
        re.DOTALL)
    for t in paren_utils.paren_iter(str(self)):
        marked = paren_utils.mark_depth(t)
        found = ne_re.match(marked)
        if found:
            root = found.group('root')
            name = found.group('name')
            yield AMR(f'({root} :name ({name}) )')
def phrases(ccg):
    """Yield a ``CCGBank.Phrase`` for each span of ``ccg`` that is a phrase.

    Before iterating, the string is rewritten in three steps:

    1. literal ``{`` / ``}`` become ``-LBR-`` / ``-RBR-``;
    2. every ``(`` / ``)`` becomes ``{`` / ``}``;
    3. for each ``<L ...>`` or ``<T ...>`` element, the braces inside its tag
       field are turned back into parentheses wherever that tag occurs
       surrounded by single spaces.

    Spans from ``paren_utils.paren_iter(ccg, bottom_up=True)`` that satisfy
    ``CCGBank.Phrase_RE.match`` are then wrapped in ``CCGBank.Phrase``.
    """
    ccg = ccg.replace('{', '-LBR-').replace('}', '-RBR-')
    ccg = ccg.replace('(', '{').replace(')', '}')
    # finditer scans the string bound to ``ccg`` at this point; rebinding
    # ``ccg`` inside the loop does not affect the scan.
    for t in re.finditer(r'<[LT] (?P<tag>[^\s>]+) .*?>', ccg):
        tag = t.group('tag')
        restored = tag.replace('{', '(').replace('}', ')')
        ccg = ccg.replace(f' {tag} ', ' ' + restored + ' ')
    # NOTE(review): paren_iter runs with its default delimiters although the
    # structural parentheses were rewritten to braces above -- confirm that
    # the defaults are what is intended here.
    for s in paren_utils.paren_iter(ccg, bottom_up=True):
        if CCGBank.Phrase_RE.match(s):
            yield CCGBank.Phrase(s)
def phrases_and_indices(ccg):
    """Yield ``(phrase, 'first-last')`` pairs for the phrases in ``ccg``.

    A copy of ``ccg`` is made in which the first occurrence of each word's
    text (words come from ``Readible.words``) is replaced by a ``{*i*}``
    placeholder, ``i`` being the word's position.  The copy is walked
    bottom-up over ``{`` / ``}`` spans.  For every span accepted by
    ``Readible.Phrase_RE.match``, the next item of ``Readible.phrases(ccg)``
    is paired with the first and last placeholder numbers found in the
    span, joined by ``-``.
    """
    phrase_stream = Readible.phrases(ccg)
    placeholder_re = re.compile('[*](?P<n>[0-9]+)[*]')
    indexed = ccg
    for position, token in enumerate(Readible.words(ccg)):
        indexed = indexed.replace(token.text, f'{{*{position}*}}', 1)
    spans = paren_utils.paren_iter(
        indexed, bottom_up=True, lparen='{', rparen='}')
    for span in spans:
        if not Readible.Phrase_RE.match(span):
            continue
        numbers = [hit.group('n') for hit in placeholder_re.finditer(span)]
        yield (next(phrase_stream), f'{numbers[0]}-{numbers[-1]}')
def sub_amrs(self):
    """Yield an ``AMR`` for each span of ``self.text``.

    Spans come from ``paren_utils.paren_iter``; each one is wrapped in a
    pair of parentheses before being handed to ``AMR``.
    """
    yield from (
        AMR(f'({span})') for span in paren_utils.paren_iter(self.text)
    )