def clean_parens(tag):
    """Normalise the parenthesisation of a CCG tag.

    The tag is converted to depth-marked form with
    ``paren_utils.mark_depth``.  Every sub-expression yielded by
    ``paren_utils.paren_iter`` for ``(tag)`` (bottom-up) is then tested
    against three class-level patterns, in this order:

    * ``CCG_Tag._Modifier_RE`` whose ``a`` and ``b`` groups are equal once
      ``CCG_Tag.remove_features`` has been applied: both halves get literal
      parentheses;
    * ``CCG_Tag._Left_RE``: the depth markers around group ``a`` are dropped;
    * ``CCG_Tag._Right_RE``: group ``a`` gets literal parentheses.

    Each rewrite is applied to the first matching occurrence in the marked
    tag only.  The marked tag is finally passed through
    ``paren_utils.unmark_depth`` and returned.
    """
    marked = paren_utils.mark_depth(tag)

    for sub in paren_utils.paren_iter(f'({tag})', bottom_up=True):
        level = paren_utils.depth_at(tag, tag.index(sub)) + 1
        opener = f'<{level}>'
        closer = f'</{level}>'

        # Re-mark the sub-expression on its own and keep only its <1>
        # markers; markers from depth 2 upwards become plain parentheses.
        sub = paren_utils.mark_depth(sub)
        inner = 2
        while f'<{inner}>' in sub:
            sub = sub.replace(f'<{inner}>', '(').replace(f'</{inner}>', ')')
            inner += 1

        modifier = CCG_Tag._Modifier_RE.match(sub)
        same_category = bool(modifier) and (
            CCG_Tag.remove_features(modifier.group('a'))
            == CCG_Tag.remove_features(modifier.group('b')))
        if same_category:
            left_half = modifier.group('a')
            right_half = modifier.group('b')
            slash = modifier.group('slash')
            marked = marked.replace(
                f'{opener}{left_half}{closer}{slash}{opener}{right_half}{closer}',
                f'({left_half}){slash}({right_half})',
                1)
            continue

        if CCG_Tag._Left_RE.search(sub):
            # NOTE(review): the guard uses search() but the groups are read
            # from match(); if _Left_RE is not anchored at the start these
            # can disagree and match() would return None -- confirm.
            left = CCG_Tag._Left_RE.match(sub)
            inner_text = left.group('a')
            marked = marked.replace(f'{opener}{inner_text}{closer}',
                                    inner_text, 1)
            continue

        right = CCG_Tag._Right_RE.search(sub)
        if right:
            inner_text = right.group('a')
            marked = marked.replace(f'{opener}{inner_text}{closer}',
                                    f'({inner_text})', 1)

    return paren_utils.unmark_depth(marked)
def __str__(self):
    """Return ``self.text``, pretty-printed when it is CCGBank-formatted.

    Text rejected by ``CCGBank.test`` is returned unchanged.  Otherwise the
    tree structure is re-expressed with braces, each opening brace is moved
    to its own line indented by one space per nesting level below the
    first, and the parentheses inside category tags and words are kept as
    parentheses.
    """
    if not CCGBank.test(self.text):
        return self.text

    # Protect literal braces, then let braces stand for tree structure.
    out = self.text.replace('{', '-LBR-').replace('}', '-RBR-')
    out = out.replace('(', '{').replace(')', '}')

    # Inside phrase and word nodes the braces are really category
    # parentheses: put them back.
    for phrase in CCGBank.Phrase_RE.finditer(out):
        phrase_tag = phrase.group('tag').replace('{', '(').replace('}', ')')
        out = out.replace(phrase.group(), phrase_tag)
    for leaf in CCGBank.Word_RE.finditer(out):
        leaf_tag = leaf.group('tag').replace('{', '(').replace('}', ')')
        leaf_word = leaf.group('word').replace('{', '(').replace('}', ')')
        out = out.replace(leaf.group(), leaf_tag + ' ' + leaf_word)

    deepest = paren_utils.max_depth(out, lparen='{', rparen='}')
    out = paren_utils.mark_depth(out, lparen='{', rparen='}')
    for level in range(1, deepest + 1):
        indent = ' ' * (level - 1)
        out = out.replace(f'<{level}>', '\n' + indent + '{')
    out = re.sub(r'</[0-9]+>', '}', out)

    # Restore the braces that were literal in the original text.
    return out.replace('-LBR-', '{').replace('-RBR-', '}')
def latex(text):
    """Render an AMR as the source of a TikZ picture.

    Args:
        text: AMR source; it is parsed with ``AMR(text)`` and re-serialised
            with ``str`` before layout.

    Returns:
        A string holding a ``tikzpicture`` environment with one node
        command per AMR node followed by one draw command per edge.

    Raises:
        KeyError: if an edge id names a source or target that was not seen
            as a node.
    """
    amr = AMR(text)
    text = str(amr)
    # Give every "xN / concept" node its own pair of parentheses so that it
    # is visited by paren_iter below.  Raw string: "\s" is a regex escape,
    # not a Python string escape.
    for node_text in re.findall(r'x[0-9]+ ?/ ?[^()\s]+', text):
        text = text.replace(node_text, '(' + node_text + ')')

    edges = list(zip(amr.edges(), amr.edge_ids()))
    elems = []
    max_depth = paren_utils.max_depth(text)
    # text no longer changes, so its depth-marked form is computed once
    # instead of once per node.
    marked_text = paren_utils.mark_depth(text)

    depth = 0
    column = 0
    node_depth = {}
    for sub in paren_utils.paren_iter(text):
        node = amr.NODE_RE.match(sub).group()
        node_id = node.split('/')[0].strip()
        # clean node
        if re.match('x[0-9]+/', node):
            node = node.split('/')[1]
        # First double quote becomes an opening quote, the next a closing one.
        node = node.replace('"', '``', 1).replace('"', "''", 1)

        prev_depth = depth
        depth = paren_utils.depth_at(text, text.index(sub))
        if depth > prev_depth:
            # Going deeper restarts the horizontal position.
            column = 0
        node_depth[node_id] = depth

        num_nodes = marked_text.count(f'<{depth}>')
        x_pos = AMR_Latex.get_x(column, num_nodes)
        y_pos = AMR_Latex.get_y(depth, max_depth)
        color = AMR_Latex.get_color(column)
        elems.append(
            f'\t\\node[{color}]({node_id}) at ({x_pos},{y_pos}) {{{node}}};')
        column += 1

    for edge, edge_id in edges:
        parts = edge_id.split('_')
        source = parts[0]
        target = parts[2]
        # Pick anchors from the relative depth of the two end points.
        if node_depth[source] > node_depth[target]:
            dir1, dir2 = 'north', 'south'
        elif node_depth[source] == node_depth[target]:
            dir1, dir2 = 'north', 'north'
        else:
            dir1, dir2 = 'south', 'north'
        # "\\draw" is spelled with an escaped backslash: "\d" is not a valid
        # string escape and triggers a SyntaxWarning on Python 3.12+.
        elems.append(
            f'\t\\draw[->, thick] ({source}.{dir1}) -- ({target}.{dir2}) node[midway, above, sloped] {{{edge}}};'
        )

    picture = '\n\\begin{tikzpicture}[\n'
    picture += 'red/.style={rectangle, draw=red!60, fill=red!5, very thick, minimum size=7mm},\n'
    picture += 'blue/.style={rectangle, draw=blue!60, fill=blue!5, very thick, minimum size=7mm},\n'
    picture += 'green/.style={rectangle, draw=green!60, fill=green!5, very thick, minimum size=7mm},\n'
    picture += 'purple/.style={rectangle, draw=purple!60, fill=purple!5, very thick, minimum size=7mm},\n'
    picture += 'orange/.style={rectangle, draw=orange!60, fill=orange!5, very thick, minimum size=7mm},\n'
    picture += ']\n'
    picture += '\n'.join(elems)
    # Same reasoning as for "\\draw": "\e" is not a valid string escape.
    picture += '\n\\end{tikzpicture}\n'
    return picture
def named_entities(self):
    """Yield an ``AMR`` for each sub-expression that has a ``:name`` child.

    Every sub-expression produced by ``paren_utils.paren_iter`` over
    ``str(self)`` is depth-marked; when it starts with a node (as defined by
    ``self.NODE_RE``) and contains ``:name`` followed by a depth-1 group, a
    new ``AMR`` is built from that node and the contents of the group.

    Yields:
        ``AMR`` instances built from ``(<root> :name (<name>) )``.
    """
    # Raw f-string so that "\s" reaches the regex engine untouched; in a
    # plain f-string it is an invalid escape sequence (SyntaxWarning on
    # Python 3.12+).  The resulting pattern text is unchanged.
    ne_re = re.compile(
        rf'(?P<root>{self.NODE_RE.pattern}).*:name\s+<1>(?P<name>.*?)</1>',
        re.DOTALL)
    for sub in paren_utils.paren_iter(str(self)):
        found = ne_re.match(paren_utils.mark_depth(sub))
        if found:
            root = found.group('root')
            name = found.group('name')
            yield AMR(f'({root} :name ({name}) )')
def to_html(tag):
    """Return an HTML rendering of a CCG tag with its argument count.

    The tag is first passed through ``CCG_Tag.add_indices``.  If it has any
    top-level slashes, ``<args> : N</args>`` is appended, where N is the
    number of those slashes; the bare tag ``conj`` gets a count of 2.
    Square brackets are finally turned into ``<sub>``/``</sub>``.
    """
    indexed = CCG_Tag.add_indices(tag)

    # Collapse every depth-1 group to a placeholder so that only top-level
    # slashes remain.  One substitution pass is enough: the replacement 'X'
    # cannot give rise to a new '<1>...</1>' match.
    flattened = re.sub('<1>.*?</1>', 'X', paren_utils.mark_depth(indexed))
    slash_total = flattened.count('/') + flattened.count('\\')

    if slash_total > 0:
        html = f'{indexed}<args> : {slash_total}</args>'
    elif indexed == 'conj':
        html = 'conj<args> : 2</args>'
    else:
        html = indexed
    return html.replace('[', '<sub>').replace(']', '</sub>')
def children(self):
    """Return the direct children of this phrase as a list.

    The number of children is read from the ``children`` group of
    ``self._match``.  Each child found by ``self.Children_RE`` in the
    depth-marked phrase becomes a ``CCGBank.Phrase`` when it matches
    ``CCGBank.Phrase_RE`` and a ``CCGBank.Word`` otherwise.  Any count other
    than 1 or 2 gives an empty list.
    """
    def as_node(text):
        # Phrase when the text looks like one, Word otherwise.
        if CCGBank.Phrase_RE.match(text):
            return CCGBank.Phrase(text)
        return CCGBank.Word(text)

    count = int(self._match.group('children'))
    marked = paren_utils.mark_depth(self.phrase)
    found = self.Children_RE.search(marked)
    first = paren_utils.unmark_depth(found.group('a'))

    if count == 1:
        return [as_node(first)]
    if count != 2:
        return []

    second = found.group('b')
    if not second:
        # Diagnostic output for a phrase whose second child is missing.
        print(self.phrase, found.group())
    second = paren_utils.unmark_depth(second)
    return [as_node(first), as_node(second)]
def children(self):
    """Return the direct children of this brace-delimited phrase as a list.

    The number of children is the number of depth-1 groups in the phrase.
    Each child found by ``self.Children_RE`` becomes a ``Readible.Phrase``
    when it still contains a brace, and otherwise a ``Readible.Word`` built
    from the child re-wrapped in braces.  Any count other than 1 or 2 gives
    an empty list.
    """
    def as_node(text):
        # Nested braces mean a sub-phrase; otherwise it is a leaf.
        if '{' in text:
            return Readible.Phrase(text)
        return Readible.Word('{' + text + '}')

    marked = paren_utils.mark_depth(self.phrase, lparen='{', rparen='}')
    count = marked.count('<1>')
    found = self.Children_RE.search(marked)
    first = paren_utils.unmark_depth(found.group('a'), lparen='{', rparen='}')

    if count == 1:
        return [as_node(first)]
    if count != 2:
        return []

    first_node = as_node(first)
    second = found.group('b')
    if not second:
        # Diagnostic output for a phrase whose second child is missing.
        print(self.phrase, found.group())
    second = paren_utils.unmark_depth(second, lparen='{', rparen='}')
    return [first_node, as_node(second)]
def add_indices(tag):
    """Attach numeric indices (``NP.1``, ``NP.2``, ...) to the NPs of a CCG tag.

    Steps, in order:

    1. Depth-mark the tag and hide ``NP[expl]`` / ``NP[thr]`` behind
       placeholders so the ``startswith('NP')`` checks skip them.
    2. Record the spans of every ``<j>a</j>/<j>b</j>`` (or backslash)
       pattern whose two halves are equal after
       ``CCG_Tag.remove_features`` and contain ``NP``.
    3. Tokenise the tag and number every token starting with ``NP``.
    4. Inside each recorded span, copy the indices of the ``a`` half onto
       the NPs of the ``b`` half, then renumber so indices stay contiguous.
    5. Fall back to the original tag when the result mentions ``NP`` fewer
       than twice.
    6. Rewrite remaining ``<j>a</j>/<j>b</j>`` patterns with differing
       halves as ``a/(b)``, unmark depth, patch the index patterns matched
       by the two ``obj_ctrl`` / ``obj_raise`` regexes, and restore the
       placeholders.

    Returns the rewritten tag string.
    """
    old_tag = tag
    tag = paren_utils.mark_depth(tag)
    # NOTE(review): max is computed from the already depth-marked string,
    # not from the original tag. If max_depth counts literal parentheses and
    # mark_depth replaces them with <n> markers, this is 0 and neither
    # `while j <= max` loop below runs -- confirm against paren_utils.
    max = paren_utils.max_depth(tag)
    # Placeholders do not start with 'NP', so these categories get no index.
    tag = tag.replace('NP[expl]', '*EXPL*')
    tag = tag.replace('NP[thr]', '*THR*')
    # get spans for each modifier pattern
    modifier_spans = []
    j = 1
    while j <= max:
        Modifier_RE = re.compile(
            fr'<{j}>(?P<a>.*?)</{j}>(?P<slash>[/\\])<{j}>(?P<b>.*?)</{j}>')
        for mod in Modifier_RE.finditer(tag):
            a = CCG_Tag.remove_features(mod.group('a'))
            b = CCG_Tag.remove_features(mod.group('b'))
            if a == b and 'NP' in a:
                modifier_spans.append((mod.start('a'), mod.end('a'),
                                       mod.start('b'), mod.end('b')))
        j += 1
    # Tokenise into category names, depth markers and single characters.
    # cat_indices[k] is the offset of cats[k] in the marked tag, so tokens
    # can be compared against the spans collected above.
    Cat_RE = re.compile(r'([^<>()/\\]+|</?[0-9]+>|.)')
    cats = [c.group() for c in Cat_RE.finditer(tag)]
    cat_indices = [c.start() for c in Cat_RE.finditer(tag)]
    # CATS keeps the un-indexed tokens; cats is edited in place.
    CATS = cats.copy()
    # First pass: number every NP token in order of appearance.
    i = 1
    for j, c in enumerate(CATS):
        if c.startswith('NP'):
            cats[j] = f'{c}.{i}'
            i += 1
    # A tag that starts with NP/NP or NP\NP gets the same index on both NPs.
    if re.match(r'^NP[/\\]NP[/\\]?', tag):
        cats[0] = 'NP.1'
        cats[2] = 'NP.1'
    # handle matching indices within a modifier
    # Indices seen in the `a` half are queued and handed out, first in first
    # out, to the NPs of the `b` half.
    # NOTE(review): the queue is shared across all spans and pop(0) raises
    # IndexError when it is empty.
    modifier_memo = []
    for a_start, a_end, b_start, b_end in modifier_spans:
        for j, c in enumerate(CATS):
            if c.startswith('NP'):
                me = cat_indices[j]
                if a_start <= me < a_end:
                    x = re.match('.*[.](?P<n>[0-9]+)$', cats[j])
                    if x:
                        modifier_memo.append(int(x.group('n')))
                elif b_start <= me < b_end:
                    m = modifier_memo.pop(0)
                    cats[j] = f'{c}.{m}'
    # Renumber: i is the next unused index. An index above i is lowered to
    # i, an index below i is left alone, and in both cases i does not
    # advance; i advances only when the index equals i (or is missing).
    i = 1
    for j, c in enumerate(CATS):
        if c.startswith('NP'):
            x = re.match('.*[.](?P<n>[0-9]+)$', cats[j])
            if x and int(x.group('n')) > i:
                cats[j] = f'{c}.{i}'
                continue
            elif x and int(x.group('n')) < i:
                continue
            i += 1
    tag = ''.join(cats)
    # With fewer than two 'NP' occurrences the indices are discarded; note
    # that old_tag is the original, un-marked input.
    if tag.count('NP') < 2:
        tag = old_tag
    # fix parens for "want", "should", etc.
    # If nodes are the same but features are different,
    # remove parentheses around first half of expression.
    # This is important for getting number of args!
    j = 1
    while j <= max:
        Modifier_RE = re.compile(
            fr'<{j}>(?P<a>.*?)</{j}>(?P<slash>[/\\])<{j}>(?P<b>.*?)</{j}>')
        # finditer scans the tag as it was when the loop started; the
        # replace below rebinds `tag` without affecting that scan.
        for mod in Modifier_RE.finditer(tag):
            a = mod.group('a')
            b = mod.group('b')
            slash = mod.group('slash')
            if a != b:
                tag = tag.replace(mod.group(), f'{a}{slash}({b})')
        j += 1
    tag = paren_utils.unmark_depth(tag)
    # Both patterns are tested against the same tag before either fix is
    # applied; each fix rewrites only the first occurrence.
    obj_ctrl = re.match(
        r'[(]?S\[.*?\]\\NP.1[)]?/[(]S\[.*?\]\\NP.1[)]/NP.2', tag)
    obj_raise = re.match(
        r'[(]?S\[.*?\]\\NP.1[)]?/[(]S\[.*?\]\\NP.2/NP.3[)]', tag)
    if obj_ctrl:
        tag = tag.replace('NP.1)/NP.2', 'NP.2)/NP.2', 1)
    if obj_raise:
        tag = tag.replace('NP.2/NP.3)', 'NP.2/NP.1)', 1)
    # The bare string below is an expression statement with no effect;
    # presumably a leftover example of a tag shape -- TODO confirm.
    r'S[adj]\NP.1/(S[to]\NP.2/NP.1)'
    # Restore the categories hidden at the top of the function.
    tag = tag.replace('*EXPL*', 'NP[expl]')
    tag = tag.replace('*THR*', 'NP[thr]')
    return tag