def connect_graph_if_not_connected(graph):
    try:
        # If the graph already encodes cleanly, it is connected
        encoded = penman.encode(graph, model=amr_model)
        return graph, ParsedStatus.OK
    except Exception:
        pass
    # Build an undirected graph over the variables to find connected components
    nxgraph = nx.MultiGraph()
    variables = graph.variables()
    for v1, _, v2 in graph.triples:
        if v1 in variables and v2 in variables:
            nxgraph.add_edge(v1, v2)
        elif v1 in variables:
            nxgraph.add_edge(v1, v1)
    # Join the components under a new 'and' node with one :op edge per component
    triples = graph.triples.copy()
    new_triples = []
    addition = f'a{len(variables) + 1}'
    triples.append(penman.Triple(addition, ':instance', 'and'))
    for i, conn_set in enumerate(nx.connected_components(nxgraph), start=1):
        edge = f':op{i}'
        conn_set = sorted(conn_set, key=lambda x: int(x[1:]))
        conn_set = [c for c in conn_set if c in variables]
        node = conn_set[0]
        new_triples.append(penman.Triple(addition, edge, node))
    triples = new_triples + triples
    metadata = graph.metadata
    graph = penman.Graph(triples)
    graph.metadata.update(metadata)
    penman.encode(graph, model=amr_model)
    return graph, ParsedStatus.FIXED
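
# Usage sketch for connect_graph_if_not_connected. It assumes the same penman,
# networkx, amr_model, and ParsedStatus objects used above; the triples are
# illustrative. Two disconnected components get joined under a new 'and' node.
import penman
triples = [
    ('a0', ':instance', 'want-01'),
    ('a0', ':ARG0', 'a1'),
    ('a1', ':instance', 'boy'),
    ('a2', ':instance', 'go-02'),   # second, disconnected component
]
fixed, status = connect_graph_if_not_connected(penman.Graph(triples))
print(status)  # expected: ParsedStatus.FIXED
print(penman.encode(fixed, model=amr_model))
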
def __str__(self):
    if self.penman:
        return penman.encode(self.penman)
    else:
        return legacy_graph_printer(self.get_metadata(), self.nodes, self.root,
                                    self.edges)
def pgraph_to_gstring(pgraph):
    pgraph = deepcopy(pgraph)
    pgraph.metadata = {}
    gstring = penman.encode(pgraph, indent=0)
    gstring = gstring.replace('\n', ' ')
    gstring = re.sub(' +', ' ', gstring)
    return gstring
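
# Minimal usage sketch for pgraph_to_gstring (deepcopy/re/penman imports as
# above; the graph string is illustrative).
import penman
g = penman.decode('(w / want-01 :ARG0 (b / boy))')
print(pgraph_to_gstring(g))  # -> (w / want-01 :ARG0 (b / boy))
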
def test_for_decode_encode_issue(gold):
    graph = penman.decode(gold, model=NoOpModel())
    test = penman.encode(graph, indent=6, compact=True, model=NoOpModel())
    gold = to_graph_line(gold)
    test = to_graph_line(test)
    is_good = test == gold
    return graph, is_good
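
# Round-trip sketch matching the check above. to_graph_line is assumed to
# normalize whitespace; here we compare single-line encodings directly instead.
import penman
from penman.models.noop import NoOpModel
gold = '(w / want-01 :ARG0 (b / boy))'
graph = penman.decode(gold, model=NoOpModel())
test = penman.encode(graph, indent=None, model=NoOpModel())
print(test == gold)  # True when the decode/encode round trip is lossless
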
def _get_nodes_and_backreferences(self, graph):
    graph_ = copy.deepcopy(graph)
    graph_.metadata = {}
    linearized = penman.encode(graph_)
    linearized_nodes = self._tokenize_encoded_graph(linearized)
    if self.use_pointer_tokens:
        remap = {}
        for i in range(1, len(linearized_nodes)):
            nxt = linearized_nodes[i]
            lst = linearized_nodes[i - 1]
            if nxt == '/':
                remap[lst] = f'<pointer:{len(remap)}>'
        i = 1
        linearized_nodes_ = [linearized_nodes[0]]
        while i < (len(linearized_nodes)):
            nxt = linearized_nodes[i]
            lst = linearized_nodes_[-1]
            if nxt in remap:
                if lst == '(' and linearized_nodes[i + 1] == '/':
                    nxt = remap[nxt]
                    i += 1
                elif lst.startswith(':'):
                    nxt = remap[nxt]
            linearized_nodes_.append(nxt)
            i += 1
        linearized_nodes = linearized_nodes_
    if self.remove_pars:
        linearized_nodes = [n for n in linearized_nodes if n != '(']
    backreferences = list(range(len(linearized_nodes)))
    return linearized_nodes, backreferences
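
# Standalone sketch of the pointer-token remapping above: a variable introduced
# as '( z0 / concept' and any later re-use after a role label both become the
# same '<pointer:N>' token (the token list is illustrative).
tokens = ['(', 'z0', '/', 'want-01', ':ARG0', '(', 'z1', '/', 'boy', ')',
          ':ARG1', 'z1', ')']
remap = {}
for prev, tok in zip(tokens, tokens[1:]):
    if tok == '/':
        remap.setdefault(prev, f'<pointer:{len(remap)}>')
out, i = [tokens[0]], 1
while i < len(tokens):
    tok = tokens[i]
    if tok in remap:
        if out[-1] == '(' and tokens[i + 1] == '/':
            tok, i = remap[tok], i + 1   # also skip the '/'
        elif out[-1].startswith(':'):
            tok = remap[tok]
    out.append(tok)
    i += 1
print(out)
# ['(', '<pointer:0>', 'want-01', ':ARG0', '(', '<pointer:1>', 'boy', ')',
#  ':ARG1', '<pointer:1>', ')']
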
def to_format(self, passage, metadata=True, wikification=True, verbose=False,
              use_original=True, default_label=None, **kwargs):
    self.wikification = wikification
    if use_original:
        original = passage.extra.get("original")
        if original:
            return original
    textutil.annotate(passage, as_array=True)
    if self.wikification:
        if verbose:
            print("Wikifying passage...")
        WIKIFIER.wikify_passage(passage)
    if verbose:
        print("Expanding names...")
    self._expand_names(passage.layer(layer1.LAYER_ID))
    triples = list(self._to_triples(passage, default_label=default_label)) or \
        [("y", INSTANCE, "yes")]
    return (self.header(passage, **kwargs) if metadata else []) + \
        penman.encode(penman.Graph(triples)).split("\n")
def __str__(self):
    if self.penman:
        return ' '.join(self.tokens) + '\n\n' + penman.encode(self.penman)
    else:
        return legacy_graph_printer(self.get_metadata(), self.nodes, self.root,
                                    self.edges)
def toJAMRString(self):
    """
    FIXME: Just modifies ::node line with respect to the original
    """
    output = penman.encode(self.penman)
    # Try first to just modify the existing JAMR annotation
    new_lines = []
    modified = False
    for line in output.split('\n'):
        if line.startswith('# ::node'):
            modified = True
            items = line.split('\t')
            node_id = items[1]
            if node_id in self.alignments:
                start = min(self.alignments[node_id])
                dend = max(self.alignments[node_id]) + 1
                if len(items) == 4:
                    items[-1] = f'{start}-{dend}'
                elif len(items) == 3:
                    items.append(f'{start}-{dend}')
                else:
                    raise Exception()
                line = '\t'.join(items)
        new_lines.append(line)
    # Writing the annotation from scratch is unimplemented; if there was no JAMR
    # annotation to modify, this drops into the debugger
    if not modified:
        from ipdb import set_trace
        set_trace(context=30)
        print()
    return ('\n'.join(new_lines)) + '\n'
def to_format(self, passage, metadata=True, wikification=True):
    textutil.annotate(passage)
    lines = ["# ::id " + passage.ID,
             "# ::tok " + " ".join(t.text for t in
                                   passage.layer(layer0.LAYER_ID).all)] \
        if metadata else []
    if wikification:
        WIKIFIER.wikify_passage(passage)
    self._expand_names(passage.layer(layer1.LAYER_ID))
    return lines + [penman.encode(penman.Graph(list(self._to_triples(passage))))
                    or "(y / yes)"]
def build(self, concepts, relations):
    self.concepts = concepts
    self.relations = relations
    triples = self.build_instance_triples()  # add self.names
    triples += self.build_edge_attrib_triples()
    graph = penman.graph.Graph(triples)
    string = penman.encode(graph, indent=6)
    # Strip the uniqueness post-tag (i.e. 2007@attr1@ -> 2007)
    string = re.sub(r'@attr\d+@', '', string)
    return string
def get_penman(self, return_type='object', indent=None):
    if return_type == 'object':
        return pm.decode(self.penman)
    elif return_type == 'str':
        if indent is None:
            return self.penman
        else:
            return pm.encode(pm.decode(self.penman), top=self.top, indent=indent)
def main(args):
    pattern = re.compile(r'''[\s()":/,\\'#]+''')
    with open(args.input, encoding='utf-8') as f, \
            open(args.output, mode='w', encoding='utf-8') as out:
        for amr_data in f.readlines():
            if amr_data == '' or amr_data is None:
                break
            amr_data = json.loads(amr_data)
            amr_nodes = amr_data.pop('nodes')
            amr_edges = amr_data.pop('edges', [])
            triples = []
            concepts = []
            for node in amr_nodes:
                short_name = f'c{node["id"]}'
                concept = node["label"]
                if pattern.search(concept):
                    concept = f'"{concept}"'
                concepts.append(concept)
                triples.append((short_name, 'instance', concept))
                for attr, value in zip(node.get('properties', []),
                                       node.get('values', [])):
                    if pattern.search(value):
                        value = f'"{value}"'
                    triples.append((short_name, attr, value))
            for edge in amr_edges:
                src = f'c{edge["source"]}'
                target = f'c{edge["target"]}'
                label = edge["label"]
                target = f'"{target}"' if pattern.search(target) else target
                triples.append((src, label, target))
            top = amr_data.pop('tops')[0]
            # Serialize the remaining metadata fields (id, input, token, ...)
            for key, value in amr_data.items():
                amr_data[key] = json.dumps(value, ensure_ascii=False)
            graph = Graph(triples, top=f"c{top}", metadata=amr_data)
            graph_en = pp.encode(graph)
            graph_de = pp.decode(graph_en)  # round-trip sanity check
            out.write(graph_en + '\n\n')
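
# Quoting sketch: the pattern above wraps any label containing whitespace or
# AMR punctuation in double quotes so penman treats it as a string constant.
import re
pattern = re.compile(r'''[\s()":/,\\'#]+''')
for label in ['boy', 'New York', 'a/b']:
    print(f'"{label}"' if pattern.search(label) else label)
# -> boy, "New York", "a/b"
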
def link_to_graph(number, snt, string):
    amr = get_amr_from_snt(snt=snt, amr_list=amr_list)
    if amr is not None:
        # Encode only after confirming a match; encoding first would raise
        # AttributeError whenever no AMR is found for the sentence
        html_penman = penman.encode(amr.penman, indent=4).replace('"', "'")
        return string.replace(
            snt,
            f'<a id="link_to_{str(number)}" value="{str(number)}">'
            f'<input type="hidden" id="amr_hidden_link_hidden_to_{str(number)}" '
            f'value="{html_penman}"/>'
            f'{snt}'
            f'</a>')
def build(self, concepts, relations):
    self.concepts = concepts
    self.relations = relations
    # Keep track of edge names already seen (key is source_id)
    self.used_arcs = defaultdict(set)
    triples = self.build_instance_triples()  # add self.names
    triples += self.build_edge_attrib_triples()
    graph = penman.graph.Graph(triples)
    string = penman.encode(graph, indent=6)
    # Strip the uniqueness post-tag (i.e. 2007@attr1@ -> 2007)
    string = re.sub(r'@attr\d+@', '', string)
    return string
def tokenize_amr(self, graph):
    if self.raw_graph:
        # Linearize the graph (without metadata) onto one line and BPE-tokenize it
        graph_ = copy.deepcopy(graph)
        graph_.metadata = {}
        linearized = penman.encode(graph_)
        linearized = re.sub(r"\s+", ' ', linearized)
        bpe_tokens = [self.bos_token] + self._tokenize(linearized)[:1022]
        bpe_token_ids = [self.encoder.get(b, self.unk_token_id) for b in bpe_tokens]
        bpe_backreferences = list(range(len(bpe_token_ids)))
        return bpe_tokens, bpe_token_ids, bpe_backreferences
    else:
        return super().tokenize_amr(graph)
def parse_sents(self, sents, add_metadata=True, return_penman=False,
                disable_progress=True, pbar_desc=None):
    assert isinstance(sents, list)
    # Loop through batches
    gen_graphs = []
    dataloader = torch.utils.data.DataLoader(sents, batch_size=self.batch_size,
                                             shuffle=False)
    pbar = tqdm(total=len(dataloader.dataset), disable=disable_progress,
                ncols=100, desc=pbar_desc)
    for batch in dataloader:
        # Ignore the potential for clipped sentences. If return_overflowing_tokens=True
        # is passed into the lower-level call to batch_encode_plus(), these could be
        # recovered if needed. Bart supports up to 1024 input tokens, so overflow would
        # require paragraphs of text concatenated into a single "sent", which isn't
        # valid for AMR anyway.
        x, _ = self.tokenizer.batch_encode_sentences(batch, device=self.device)
        # model.config.max_length=20 for the base model. Set this much higher
        # for generating AMR graphs.
        with torch.no_grad():
            model_out = self.model.generate(**x, max_length=512,
                                            num_beams=self.num_beams)
        # Re-encode the model output
        assert len(model_out) == len(batch)
        for tokk, sent in zip(model_out, batch):
            graph, status, _ = self.tokenizer.decode_amr(
                tokk.tolist(), restore_name_ops=self.restore_name_ops)
            # Handle status errors (status may also be ParsedStatus.FIXED for
            # fixed disconnected graphs)
            if status == ParsedStatus.BACKOFF:
                graph = self.invalid_graph
            # In penman 1.2.0, metadata does not impact penman.Graph.__eq__(), so
            # code checking for Inference.invalid_graph still works even if the
            # 'snt' metadata differs.
            if add_metadata:
                graph.metadata['snt'] = sent
            gen_graphs.append(graph)
        pbar.update(len(batch))
    pbar.close()
    # Return the penman graphs
    if return_penman:
        return gen_graphs
    # The required behavior across all parse models is to return graphs as
    # strings by default
    gstrings = [penman.encode(g, indent=6, model=amr_model) for g in gen_graphs]
    return gstrings
def parse_spans(self, spans, add_metadata=True):
    sio_f = io.StringIO()
    for i, span in enumerate(spans):
        sent = span.text
        tokens = list(span)
        entry = '# ::snt %s\n' % sent
        entry += '(d / dummy)\n'  # not used, but required for a proper AMR file format
        pen_graph = annotate_graph(entry, tokens)
        amr_string = penman.encode(pen_graph)
        sio_f.write(amr_string + '\n')
        if i != len(spans) - 1:
            sio_f.write('\n')
    sio_f.seek(0)
    return self.parse_file_handle(sio_f, add_metadata)
def to_format(self, passage, metadata=True, wikification=True, verbose=False,
              use_original=True):
    if use_original:
        original = passage.extra.get("original")
        if original:
            return original
    textutil.annotate(passage, as_array=True)
    lines = self.header(passage) if metadata else []
    if wikification:
        if verbose:
            print("Wikifying passage...")
        WIKIFIER.wikify_passage(passage)
    if verbose:
        print("Expanding names...")
    self._expand_names(passage.layer(layer1.LAYER_ID))
    return lines + (penman.encode(penman.Graph(list(self._to_triples(passage))))
                    .split("\n") or ["(y / yes)"])
def get_amr_line(input_f):
    """
    Read the file containing AMRs. AMRs are separated by a blank line.
    Each call of get_amr_line() returns the next available AMR (in one-line form).
    Note: this function does not verify that the AMR is valid.
    """
    key, value = [], []
    regex1 = r'# ::tok (.+)'
    regex = r'# ::snt (.+)'
    cur_amr = []
    has_content = False
    sentence = ''
    tokens = []  # initialized so the return works even without a ::tok line
    for line in input_f:
        line = line.strip()
        if line == "":
            if not has_content:
                # empty lines before current AMR
                continue
            else:
                # end of current AMR
                break
        if line.startswith("# ::snt "):
            sentence = re.match(regex, line).group(1)
        elif line.startswith('# ::tok '):
            tokens = re.match(regex1, line).group(1).lower().split()
        elif line.startswith('# ::node'):
            token = line.split('\t')
            key.append(token[2].rstrip())
            value.append(token[1])
        elif line.startswith('#'):
            # ignore other comment lines (starting with "#") in the AMR file
            continue
        else:
            has_content = True
            cur_amr.append(line)
    if cur_amr:
        g = penman.decode(' '.join(cur_amr))
        amr_penman = penman.encode(g)
        return "".join(cur_amr), sentence, key, value, amr_penman, tokens
    else:
        return '', '', '', '', '', ''
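
# Usage sketch for get_amr_line with an in-memory file (content illustrative).
import io
amr_text = '''# ::snt The boy wants to go.
# ::tok the boy wants to go .
(w / want-01
      :ARG0 (b / boy))
'''
amr, sent, key, value, amr_penman, tokens = get_amr_line(io.StringIO(amr_text))
print(sent)    # The boy wants to go.
print(tokens)  # ['the', 'boy', 'wants', 'to', 'go', '.']
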
def split_multi_sentence(pgraph):
    # Get the graph string and variable-to-concept dictionary
    pgraph = deepcopy(pgraph)
    gid = pgraph.metadata.get('id', 'none')  # for logging
    pgraph.metadata = {}
    var2concept = {t.source: t.target for t in pgraph.instances()}
    gstring = penman.encode(pgraph, indent=0)
    # Delete the multi-sentence line and any modifiers (:li, :mode, ...)
    glines = gstring.split('\n')
    assert glines[0].startswith('(m / multi-sentence')
    glines = glines[1:]
    while glines:
        if glines[0].startswith(':') and not glines[0].startswith(':snt'):
            glines = glines[1:]
        else:
            break
    # Rejoin the lines, remove extra spaces, and remove the ending paren
    gstring = ' '.join(glines)
    gstring = re.sub(r' +', ' ', gstring).strip()
    assert gstring.endswith(')')
    gstring = gstring[:-1]
    # Split on the :snt lines and separate each sentence into its own graph
    gs_list = [gs.strip() for gs in re.split(r':snt\d+', gstring)]
    gs_list = [gs for gs in gs_list if gs]
    # Convert the separated graphs to penman objects
    pgraphs = []
    for gidx, gstring in enumerate(gs_list):
        try:
            pgraph = penman.decode(gstring)
        except penman.DecodeError:
            logger.error('Error decoding %s %d\n%s' % (gid, gidx, gstring))
            continue
        # If a variable is not in this portion of the graph then penman will
        # treat it like an attribute. In this case we need to add an instance
        # for it. The way penman 1.1.0 works, this will fix the graph.
        missing_set = set(t.target for t in pgraph.attributes()
                          if re_attrib.match(t.target))
        if missing_set:
            logger.info('%s %d missing variables: %s' % (gid, gidx, str(missing_set)))
            # Add the variables and re-decode the graph
            for var in missing_set:
                concept = var2concept.get(var, None)
                if concept is not None:
                    pgraph.triples.append((var, ':instance', concept))
        pgraphs.append(pgraph)
    return pgraphs
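
# Usage sketch for split_multi_sentence. It assumes the logger and re_attrib
# (a regex matching bare variable names) defined alongside the function; the
# multi-sentence graph below is illustrative.
import penman
gstring = '''(m / multi-sentence
    :snt1 (g / go-02 :ARG0 (b / boy))
    :snt2 (w / want-01 :ARG0 b))'''
for sub in split_multi_sentence(penman.decode(gstring)):
    print(penman.encode(sub))
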
def parse_sents(self, sents, add_metadata=True):
    assert isinstance(sents, list)
    # Annotate each entry and compile the results in a StringIO file-type object.
    # For simplicity, convert the sentences into an in-memory AMR text file.
    # This could be simplified, but the DataLoader is set up to load from an AMR
    # file, so this method creates an in-memory file-type object in the AMR format.
    sio_f = io.StringIO()
    for i, sent in enumerate(sents):
        entry = '# ::snt %s\n' % sent
        entry += '(d / dummy)\n'  # not used, but required for a proper AMR file format
        pen_graph = annotate_graph(entry)
        amr_string = penman.encode(pen_graph)
        sio_f.write(amr_string + '\n')
        if i != len(sents) - 1:
            sio_f.write('\n')
    sio_f.seek(0)
    return self.parse_file_handle(sio_f, add_metadata)
def __init__(self, **kwargs):
    """
    :param kwargs:
        - string
        - triples
        - top
        - object
        - copy
    """
    if kwargs:
        if len(kwargs) == 3 and all(k in kwargs for k in ('string', 'triples', 'top')):
            self.penman = kwargs['string']
            self.list_triples = kwargs['triples']
            self.top = kwargs['top']
        elif len(kwargs) == 2 and all(k in kwargs for k in ('triples', 'top')):
            self.top = kwargs['top']
            self.list_triples = kwargs['triples']
            self.penman = triple_model_list_to_penman(self.list_triples,
                                                      top_id=self.top)
            self.list_triples = self.delete_wiki()
        elif len(kwargs) == 1 and 'object' in kwargs:
            self.penman = pm.encode(kwargs['object'], top=kwargs['object'].top)
            self.list_triples, self.top = penman_to_model(kwargs['object'])
            self.list_triples = self.delete_wiki()
        elif len(kwargs) == 1 and 'copy' in kwargs:
            _source_amr: AMRModel = kwargs['copy']
            self.penman = _source_amr.get_penman(return_type='str')
            self.list_triples = InstrumentedList(
                [Triple(copy=t) for t in _source_amr.get_triples()])
            self.list_triples = self.delete_wiki()
            self.top = _source_amr.get_top()
            self.penman = triple_model_list_to_penman(self.list_triples,
                                                      top_id=self.top)
def get_amr_line(input_f):
    """
    Read the file containing AMRs. AMRs are separated by a blank line.
    Each call of get_amr_line() returns the next available AMR (in one-line form).
    Note: this function does not verify that the AMR is valid.
    """
    regex = r'# ::snt (.+)'
    sentence = ''
    cur_amr = []
    has_content = False
    for line in input_f:
        line = line.strip()
        if line == "":
            if not has_content:
                # empty lines before current AMR
                continue
            else:
                # end of current AMR
                break
        if line.startswith('# ::snt'):
            sentence = re.match(regex, line).group(1)
        if line.startswith("#"):
            # ignore comment lines (starting with "#") in the AMR file
            continue
        else:
            has_content = True
            cur_amr.append(line)
    if cur_amr:
        g = penman.decode(' '.join(cur_amr))
        amr_penman = penman.encode(g)
        c = PENMANCodec()
        t = c.parse(amr_penman)
        l = layout.interpret(t)
        value, key = t.positions(l, 0)
        return "".join(cur_amr), sentence, key, value, amr_penman
    else:
        return '', '', '', '', ''
def encode(d, properties=True, lnk=True, indent=False):
    """
    Serialize a DMRS object to a PENMAN string.

    Args:
        d: a DMRS object
        properties (bool): if `False`, suppress variable properties
        lnk: if `False`, suppress surface alignments and strings
        indent (bool, int): if `True` or an integer value, add newlines
            and indentation
    Returns:
        a PENMAN-serialization of the DMRS object
    """
    if indent is True:
        indent = -1
    elif indent is False:
        indent = None
    triples = to_triples(d, properties=properties, lnk=lnk)
    g = penman.Graph(triples)
    try:
        return penman.encode(g, indent=indent)
    except penman.PenmanError as exc:
        raise PyDelphinException('could not encode with Penman') from exc
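
# Indent-mapping sketch: penman.encode takes indent=None for a single line or an
# integer column width; -1 selects penman's adaptive default, matching the
# True/False normalization above (graph string illustrative).
import penman
g = penman.decode('(e / _rain_v_1)')
print(penman.encode(g, indent=None))  # single line
print(penman.encode(g, indent=-1))    # default indentation
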
def __init__(self, graph, force_annotate=False):
    # Convert or copy the input graph to penman format
    if isinstance(graph, str):
        pgraph = penman.decode(graph, model=NoOpModel())
    elif isinstance(graph, penman.graph.Graph):
        pgraph = deepcopy(graph)
    else:
        raise ValueError('Code requires either a string or a penman graph')
    # Annotate if needed (aligner/tagging require annotation)
    is_annotated = all(key in pgraph.metadata
                       for key in ('tokens', 'lemmas', 'pos_tags'))
    if not is_annotated or force_annotate:
        # Sanity check the required tag; throws KeyError if missing
        sentence = pgraph.metadata['snt']
        pgraph = annotate_penman(pgraph)
        self.annotation_performed = True  # for unit-testing and debug
    else:
        self.annotation_performed = False
    # Align the graph. For simplicity, always do this.
    # If there are existing alignments they need to be removed.
    # See https://penman.readthedocs.io/en/latest/api/penman.surface.html
    if penman.surface.alignments(pgraph) or penman.surface.role_alignments(pgraph):
        for key, items in pgraph.epidata.items():
            pgraph.epidata[key] = [
                x for x in items
                if not isinstance(x, penman.surface.AlignmentMarker)]
    pgraph = RBWAligner.from_penman_w_json(pgraph).get_penman_graph()
    # Get the graph string and POS tags for the tagger
    self.metadata = pgraph.metadata.copy()
    pos_tags = json.loads(self.metadata['pos_tags'])
    pgraph.metadata = {}
    gstring = penman.encode(pgraph, model=NoOpModel(), indent=6)
    # Tag the graph string
    self.gstring_tagged = self.tag(gstring, pos_tags)
def toJAMRString(self):
    """
    FIXME: Just modifies ::node line with respect to the original
    """
    output = penman.encode(self.penman)
    new_lines = []
    modified = False
    for line in output.split('\n'):
        if line.startswith('# ::node'):
            modified = True
            items = line.split('\t')
            node_id = items[1]
            start = min(self.alignments[node_id])
            dend = max(self.alignments[node_id]) + 1
            if len(items) == 4:
                items[-1] = f'{start}-{dend}'
            elif len(items) == 3:
                items.append(f'{start}-{dend}')
            else:
                raise Exception()
            line = '\t'.join(items)
        new_lines.append(line)
    assert modified
    return ('\n'.join(new_lines)) + '\n'
        exit()

# Print and generate
gnum = int(gnum)
print('Original:', sents[gnum])
print()
# Get the original graph as a penman object and add back in the sentence to the
# metadata (stripped during loading)
pgraph = penman.decode(graphs[gnum])
pgraph.metadata['snt'] = sents[gnum]
# Loop through all variables and select appropriate candidates for the new top
# variable
candidate_tops = pgraph.variables()
candidate_tops.remove(pgraph.top)
# (optional) Remove nodes with incoming edges - significantly reduces the number
# of candidates
candidate_tops = [v for v in candidate_tops if incoming_edge_count(v, pgraph) == 0]
# Create the list to try, keeping the original top first
new_tops = [pgraph.top] + candidate_tops
new_graphs = [penman.encode(pgraph, top=t) for t in new_tops]
# Get the mapping from top variables to the concept for debug
var2concept = {t.source: t.target for t in pgraph.instances()}
# Generate
print('Generated (first is original top variable):')
gen_sents, _ = gtos.generate(new_graphs, disable_progress=True)
for sent, top in zip(gen_sents, new_tops):
    print('top: (%s / %s)' % (top, var2concept[top]))
    print('   ', sent)
    print()
print('-' * 40)
print()
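
# incoming_edge_count is not shown above; a minimal sketch consistent with its
# use (count edge triples whose target is the given variable):
def incoming_edge_count(var, pgraph):
    return sum(1 for t in pgraph.edges() if t.target == var)
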
def get_graph_string(self):
    return penman.encode(self.graph, model=NoOpModel(), indent=6)
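
# For reference, the NoOpModel used above comes from penman.models.noop; it
# keeps roles exactly as written instead of normalizing inverted ones (sketch):
from penman.models.noop import NoOpModel
import penman
g = penman.decode('(b / boy :ARG0-of (w / want-01))', model=NoOpModel())
print(penman.encode(g, model=NoOpModel(), indent=6))  # keeps :ARG0-of as written
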
def __str__(self):
    return penman.encode(self)
def triple_model_list_to_penman(triple_model_list, top_id):
    return pm.encode(
        pm.Graph(data=[(triple.source, triple.relation, triple.target)
                       for triple in triple_model_list]),
        top=top_id)
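
# Usage sketch for triple_model_list_to_penman with a minimal stand-in for the
# triple model objects (hypothetical namedtuple; also assumes the pre-1.0
# penman this module targets, where Graph takes a data= keyword).
from collections import namedtuple
T = namedtuple('T', 'source relation target')
triples = [T('w', 'instance', 'want-01'),
           T('w', 'ARG0', 'b'),
           T('b', 'instance', 'boy')]
print(triple_model_list_to_penman(triples, top_id='w'))
# -> (w / want-01 :ARG0 (b / boy))
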