def get_penman(self, return_type='object', indent=None):
    if return_type == 'object':
        return pm.decode(self.penman)
    elif return_type == 'str':
        if indent is None:
            return self.penman
        else:
            return pm.encode(pm.decode(self.penman), top=self.top, indent=indent)
def __init__(self, gstring):
    self.graph = penman.decode(gstring, model=NoOpModel())
    # Run the serialization
    self.elements = []    # clear elements list
    self.nodes = set()    # nodes visited (to prevent recursion)
    self.serialize(self.graph.top)
    self.tokens = self.elements_to_tokens(self.elements)
def add_lemmas(entry, snt_key, verify_tok_key=None):
    global spacy_nlp
    load_spacy()
    graph = penman.decode(entry, model=NoOpModel())    # do not de-invert graphs
    doc = spacy_nlp(graph.metadata[snt_key])
    nlp_tokens = [t.text for t in doc]
    graph.metadata['tokens'] = json.dumps(nlp_tokens)
    # Create lemmas
    # SpaCy's lemmatizer returns -PRON- for pronouns so strip these.
    # Don't try to lemmatize any named-entities or proper nouns.  Lower-case any other words.
    lemmas = []
    for t in doc:
        if t.lemma_ == '-PRON-':
            lemma = t.text.lower()
        elif t.tag_.startswith('NNP') or t.ent_type_ not in ('', 'O'):
            lemma = t.text
        else:
            lemma = t.lemma_.lower()
        lemmas.append(lemma)
    graph.metadata['lemmas'] = json.dumps(lemmas)
    # If verify_tok_key is not None, verify that the new tokenization is the same as the
    # existing one, and only return the graph if the tokenized length is the same.
    if verify_tok_key is not None:
        isi_tokens = graph.metadata[verify_tok_key].split()
        if len(isi_tokens) == len(lemmas) == len(nlp_tokens):
            return graph
        else:
            return None
    else:
        return graph
def test_for_decode_encode_issue(gold):
    graph = penman.decode(gold, model=NoOpModel())
    test = penman.encode(graph, indent=6, compact=True, model=NoOpModel())
    gold = to_graph_line(gold)
    test = to_graph_line(test)
    is_good = test == gold
    return graph, is_good
def _process_entry(entry, tokens=None):
    pen = penman.decode(entry)    # standard de-inverting penman loading process
    # Filter out old tags and add the tags from the SpaCy parse
    global keep_tags
    if keep_tags is not None:
        pen.metadata = {k: v for k, v in pen.metadata.items() if k in keep_tags}    # filter extra tags
    # If tokens aren't supplied then annotate the graph
    if not tokens:
        global spacy_nlp
        assert spacy_nlp is not None
        tokens = spacy_nlp(pen.metadata['snt'])
    pen.metadata['tokens'] = json.dumps([t.text for t in tokens])
    ner_tags = [t.ent_type_ if t.ent_type_ else 'O' for t in tokens]    # replace empty with 'O'
    pen.metadata['ner_tags'] = json.dumps(ner_tags)
    pen.metadata['ner_iob'] = json.dumps([t.ent_iob_ for t in tokens])
    pen.metadata['pos_tags'] = json.dumps([t.tag_ for t in tokens])
    # Create lemmas
    # SpaCy's lemmatizer returns -PRON- for pronouns so strip these.
    # Don't try to lemmatize any named-entities or proper nouns.  Lower-case any other words.
    lemmas = []
    for t in tokens:
        if t.lemma_ == '-PRON-':
            lemma = t.text.lower()
        elif t.tag_.startswith('NNP') or t.ent_type_ not in ('', 'O'):
            lemma = t.text
        else:
            lemma = t.lemma_.lower()
        lemmas.append(lemma)
    pen.metadata['lemmas'] = json.dumps(lemmas)
    return pen
def from_penman(cls, penman_text, tokenize=False):
    """
    Read AMR from penman notation (will ignore graph data in metadata)
    """
    graph = penman.decode(penman_text)
    nodes, edges = get_simple_graph(graph)
    if tokenize:
        assert 'snt' in graph.metadata, "AMR must contain field ::snt"
        tokens, _ = protected_tokenizer(graph.metadata['snt'])
    else:
        assert 'tok' in graph.metadata, "AMR must contain field ::tok " \
            "(or call this with tokenize=True)"
        tokens = graph.metadata['tok'].split()
    graph_id = None
    if 'id' in graph.metadata:
        graph_id = graph.metadata['id']
    return cls(tokens, nodes, edges, graph.top, penman=graph, clean=True,
               connect=False, id=graph_id)
def parse_AMR_line(line):
    g = penman.decode(line)
    instances = g.instances()
    node_name_list, node_value_list = zip(*[(instance.source, instance.target)
                                            for instance in instances])
    node_name_list = list(node_name_list)
    node_value_list = list(node_value_list)
    positions = {concept: idx for idx, concept in enumerate(node_name_list)}
    relation_list = [[] for _ in node_name_list]
    attribute_list = [[] for _ in node_name_list]
    for src, label, tgt in g.edges():
        relation_list[positions[src]].append([label[1:], tgt])
    for src, label, tgt in g.attributes():
        if label[1:] == "mod":
            attribute_list[positions[src]].append(["domain", tgt.strip('\"\'')])
        else:
            attribute_list[positions[src]].append([label[1:], tgt.strip('\"\'')])
    attribute_list[positions[g.top]].append(['TOP', node_value_list[positions[g.top]]])
    result_amr = AMR(node_name_list, node_value_list, relation_list, attribute_list)
    return result_amr
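# A minimal, hypothetical smoke test for parse_AMR_line; the AMR class it
# builds is defined elsewhere in this codebase (smatch-style) and is assumed here.
_amr = parse_AMR_line('(w / want-01 :ARG0 (b / boy) :ARG1 (g / go-02 :ARG0 b))')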
def test_reentrancies(self, x1, x2):
    g = penman.Graph(x1[1])
    assert g.reentrancies() == {'x1': 2}
    g = penman.Graph(x2[1])
    assert g.reentrancies() == {10001: 1}
    # top has an implicit entrancy
    g = penman.decode('(b / bark :ARG1 (d / dog) :ARG1-of (w / wild))')
    assert g.reentrancies() == {'b': 1}
def get_concept(self, variable):
    itsamr = self.amrs[self.focus]
    some_amr = penman.decode(itsamr)
    e = [x.target for x in some_amr.triples()
         if x.relation == 'instance' and x.source == variable]
    return e
def main(args):
    with open(args.input, encoding='utf-8') as f, \
         open(args.extra, encoding='utf-8') as e, \
         open(args.output, mode='w', encoding='utf-8') as out:
        amr_pair: List[Tuple[Graph, Graph]]
        amr_pair = []
        for sent_num, (cur_amr1, cur_amr2) in enumerate(generate_amr_lines(f, e), start=1):
            amr_pair.append((pp.decode(cur_amr1), pp.decode(cur_amr2)))
        amrs = []
        for idx, (amr, extra) in enumerate(amr_pair):
            amr.metadata.update(extra.metadata)
            amrs.append(amr)
        out.write(pp.dumps(amrs))
def from_string_w_json(cls, graph, token_key='tokens', lemma_key='lemmas', **kwargs):
    assert isinstance(graph, str)
    graph = penman.decode(graph, model=NoOpModel())
    return cls.from_penman_w_json(graph, token_key, lemma_key, **kwargs)
def load_glove(args, sentences, data_div='', dataset='ptb'):
    data_path = './tmp/glove_' + args.task + data_div + '.npz'
    if os.path.exists(data_path):
        return np.load(data_path)
    savez_dict = {}
    embeddings_dict = {}
    with open('./tmp/glove/glove.42B.300d.txt', 'r') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], "float32")
            embeddings_dict[word] = vector
    if dataset == 'ptb':
        for s in range(len(sentences)):
            word_emb = []
            for w in sentences[s]:
                if w.lower() in embeddings_dict:
                    word_emb.append(np.expand_dims(embeddings_dict[w.lower()], axis=0))
                else:
                    word_emb.append(np.expand_dims(embeddings_dict[','], axis=0))
            savez_dict['s' + str(s)] = np.concatenate(word_emb)
    else:
        for s in range(len(sentences)):
            word_emb = []
            # parse
            penman_g = penman.decode(sentences[s])
            sen = penman_g.metadata.get('tok').split(' ')
            wid = []
            var = []
            # k=word id; v=variable
            for k, v in penman_g.epidata.items():
                if k[1] == ':instance':
                    if len(v):
                        if type(v[0]) == penman.surface.Alignment:
                            wid.append(v[0].indices[0])
                            var.append(k[0])
            c_s = []
            for w in sen:
                c_w = clean_string(w)
                if len(c_w) == 0:
                    c_w = ','
                c_s.append(c_w)
            for w in c_s:
                if w.lower() in embeddings_dict:
                    word_emb.append(np.expand_dims(embeddings_dict[w.lower()], axis=0))
                else:
                    word_emb.append(np.expand_dims(embeddings_dict[','], axis=0))
            if len(wid) == 0:
                wid = [0]
            savez_dict['s' + str(s)] = np.concatenate([word_emb[i] for i in wid])
    np.savez(data_path, **savez_dict)
    return np.load(data_path)
def read(file_path):
    with open(file_path, encoding='utf-8') as f:
        while f.readable():
            graph_line = AMR.get_amr_line(f)
            if graph_line == "":
                break
            graph = pp.decode(graph_line)
            amr = AMR.from_graph(graph)
            yield graph, amr
def main(args):
    with open(args.input, encoding='utf-8') as f, \
         open(args.output, mode='w', encoding='utf-8') as out:
        amr_pair: List[Graph]
        amr_pair = []
        for sent_num, cur_amr1 in enumerate(generate_amr_lines(f), start=1):
            amr_pair.append(pp.decode(cur_amr1))
        for idx, amr in enumerate(amr_pair):
            metadata = amr.metadata
            nodes = []
            node_map = {}
            for index, (src, role, tgt) in enumerate(amr.instances()):
                tgt: str
                node_map[src] = index
                nodes.append({'id': index, "label": tgt})
            edges = []
            for src, role, tgt in amr.edges():
                label: str = role[1:]
                source = node_map[src]
                target = node_map[tgt]
                edges.append({"source": source, "target": target, "label": label})
            for src, role, tgt in amr.attributes():
                source = node_map[src]
                label: str = role[1:]
                nodes[source].setdefault('properties', [])
                nodes[source].setdefault('values', [])
                nodes[source]['properties'].append(label)
                nodes[source]['values'].append(tgt.lower().strip("\""))
            top = node_map[amr.top]
            out.write(json.dumps({
                "id": metadata['id'],
                "flavor": 2,
                "framework": "amr",
                "language": 'eng',
                "version": 1.1,
                "tops": [top],
                "input": json.loads(metadata['snt']),
                "time": "2020-06-22",
                "nodes": nodes,
                "edges": edges}) + '\n')
def build_from_graph(self, entry, debug=False, allow_deinvert=False):
    # Parse the AMR text
    if allow_deinvert:
        penman_graph = penman.decode(entry)
    else:
        model = NoOpModel()    # does not de-invert edges
        penman_graph = penman.decode(entry, model=model)
    # Build g.instances() => concept relations (these are nodes)
    for t in penman_graph.instances():
        self._add_instance(t)
        if debug:
            print(t)
    # Build g.edges() => relations between nodes
    for t in penman_graph.edges():
        self._add_edge(t)
        if debug:
            print(t)
    # Build g.attributes() => relations between nodes and a constant
    for t in penman_graph.attributes():
        self._add_attribute(t)
        if debug:
            print(t)
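# Illustration (not part of the original code) of what allow_deinvert toggles:
# penman's default model de-inverts "-of" roles, while NoOpModel preserves the
# graph exactly as written.
import penman
from penman.models.noop import NoOpModel

_s = '(d / dog :ARG0-of (b / bark-01))'
print(penman.decode(_s).edges())                      # role becomes ':ARG0' (de-inverted)
print(penman.decode(_s, model=NoOpModel()).triples)   # ':ARG0-of' is kept as written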
def decode(s):
    """
    Deserialize a DMRS object from a PENMAN string.
    """
    try:
        g = penman.decode(s)
    except penman.PenmanError as exc:
        raise PyDelphinException('could not decode with Penman') from exc
    return from_triples(g.triples)
def main(args):
    pattern = re.compile(r'''[\s()":/,\\'#]+''')
    with open(args.input, encoding='utf-8') as f, \
         open(args.output, mode='w', encoding='utf-8') as out:
        for amr_data in f.readlines():
            if amr_data == '' or amr_data is None:
                break
            amr_data = json.loads(amr_data)
            amr_nodes = amr_data.pop('nodes')
            amr_edges = amr_data.pop('edges', [])
            triples = []
            concepts = []
            for node in amr_nodes:
                short_name = f'c{node["id"]}'
                concept = node["label"]
                if pattern.search(concept):
                    concept = f"\"{concept}\""
                concepts.append(concept)
                triples.append((short_name, 'instance', concept))
                for attr, value in zip(node.get('properties', []), node.get('values', [])):
                    if pattern.search(value):
                        value = f"\"{value}\""
                    triples.append((short_name, attr, value))
            for edge in amr_edges:
                src = f'c{edge["source"]}'
                target = f'c{edge["target"]}'
                label = edge["label"]
                target = f"\"{target}\"" if pattern.search(target) else target
                triples.append((src, label, target))
            top = amr_data.pop('tops')[0]
            # id = amr_data['id']
            # snt = json.dumps(amr_data['input'])
            # token = json.dumps(amr_data['token'])
            # lemma = json.dumps(amr_data['lemma'])
            # upos = json.dumps(amr_data['upos'])
            # xpos = json.dumps(amr_data['xpos'])
            # ner = json.dumps(amr_data['ner'])
            for key, value in amr_data.items():
                amr_data[key] = json.dumps(value, ensure_ascii=False)
            graph = Graph(triples, top=f"c{top}", metadata=amr_data)
            graph_en = pp.encode(graph)
            graph_de = pp.decode(graph_en)    # round-trip check
            out.write(graph_en + '\n\n')
def get_amrs(inp):
    dats = inp.read().split("\n\n")
    dats = [" ".join([z for z in x.split("\n") if not z.startswith("#")])
            for x in dats if x]
    dat = []
    for d in dats:
        dat.append(penman.decode(d))
    datg = [nx.DiGraph() for x in dat]
    for i, g in enumerate(dat):
        for t in g.triples:
            datg[i].add_edge(t[0], t[2], label=t[1])
    return datg
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', '-i', required=True, help='Input file', type=str)
    parser.add_argument('--output', '-o', required=True, help='Output file', type=str)
    args = parser.parse_args()
    # input JSON is expected to have tuples of the following type:
    # (id, amr, (word1, word2))
    with io.open(args.input, 'r', encoding='utf-8') as f:
        data = json.load(f)
    output_paths = []
    for id, amr, words in tqdm(data):
        try:
            amr_parsed = amr
            while amr_parsed != re.sub(r'"([^~"]*)"+([^~]*)"~', '"\\1\\2"~', amr_parsed):
                amr_parsed = re.sub(r'"([^~"]*)"+([^~]*)"~', '"\\1\\2"~', amr_parsed)
            amr_parsed = re.sub(r"\~e\.[0-9,]+", "", amr_parsed)    # Removing alignment tags
            # amr_parsed = re.sub(r'"{2,}', '""', amr_parsed)       # Removing """"" this kind of things
            graph = penman.decode(amr_parsed)
        except:
            print(amr)
            print(amr_parsed)
            raise Exception("AMR cannot be parsed by PenMan")
        paths = paths_for_words(graph, words[0], words[1])
        if paths:
            for path in paths:
                sentences = [
                    [(word_from_node(graph, nodes_from_word(graph, words[0])[0]), None)],
                    [(word_from_node(graph, nodes_from_word(graph, words[1])[0]), None)],
                ]
                sentences[0] += sentence_from_path(graph, path[0])
                sentences[1] += sentence_from_path(graph, path[1])
                output_paths.append((id, sentences))
        else:
            output_paths.append((id, [[], []]))
    with io.open(args.output, 'w', encoding='utf-8') as f:
        json.dump(output_paths, f, indent=True)
def _process_entry(entry):
    pen = penman.decode(entry)
    # Remove :wiki from the graphs since we want to ignore these
    triples = [t for t in pen.attributes() if t.role == ':wiki']
    for t in triples:
        try:
            pen.triples.remove(t)
            del pen.epidata[t]
        except:
            logger.error('Unable to remove triple: %s' % (t))
    return pen
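# A small illustrative check (graph string invented for this sketch; relies on
# the module-level logger only in the error path):
_pen = _process_entry('(p / person :wiki "Q76" :name (n / name :op1 "Obama"))')
assert not any(t.role == ':wiki' for t in _pen.attributes())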
def from_penman(cls, penman_text, tokenize=False):
    """
    Read AMR from penman notation (will ignore graph data in metadata)
    """
    graph = penman.decode(penman_text)
    nodes, edges = get_simple_graph(graph)
    if tokenize:
        assert 'snt' in graph.metadata, "AMR must contain field ::snt"
        tokens, _ = protected_tokenizer(graph.metadata['snt'])
    else:
        assert 'tok' in graph.metadata, "AMR must contain field ::tok"
        tokens = graph.metadata['tok'].split()
    return cls(tokens, nodes, edges, graph.top, penman=graph)
def penman_to_dot(graph_fragment, lex_label, lemma, form, replacement, pos, prefix="n"):
    """
    Converts a supertag to a little dot graph.
    """
    import penman
    if isinstance(graph_fragment, str):
        g = penman.decode(graph_fragment)
    else:
        g = graph_fragment
    name2name = dict()
    accounted_for = set()
    counter = 0
    r = ""
    # Assign a fresh dot node name to every variable in the fragment
    for f, rel, to in g.triples:
        if f not in name2name:
            new_name = prefix + str(counter)
            counter += 1
            name2name[f] = new_name
        if rel != ":instance" and to not in name2name:
            new_name = prefix + str(counter)
            counter += 1
            name2name[to] = new_name
    # Emit dot nodes for :instance triples and dot edges for everything else
    for f, rel, to in g.triples:
        if rel == ":instance":
            is_root = f == g.top
            if to is None:
                source = f.split("<")[1][:-1]
                if is_root:
                    r += name2name[f] + ' [label="' + source + '", fontcolor="red", style="bold"];\n'
                else:
                    r += name2name[f] + ' [label="' + source + '", fontcolor="red"];\n'
            else:
                label = relex(to, lex_label, lemma, form, replacement, pos)
                if is_root:
                    r += name2name[f] + ' [style="bold", label="' + label + '"];\n'
                else:
                    r += name2name[f] + ' [label="' + label + '"];\n'
            accounted_for.add(name2name[f])
        else:
            r += name2name[f] + " -> " + name2name[to] + ' [label="' + rel[1:] + '"];\n'
    assert set(accounted_for) == set(name2name.values())
    return r, name2name[g.top]
def _get_graph(block):
    amr_str = ' '.join([line for line in block if not line.startswith('#')])
    graph = penman.decode(amr_str)
    named_concepts = set()
    attributes = set()
    for t in graph.triples():
        if t.relation == 'instance':
            named_concepts.add(t.source)
    for t in graph.triples():
        if t.relation != 'instance' and t.target not in named_concepts:
            attributes.add(t.target)
    return graph, attributes
def get_amr_line(input_f):
    """
    Read the file containing AMRs.  AMRs are separated by a blank line.
    Each call of get_amr_line() returns the next available AMR (in one-line form).
    Note: this function does not verify if the AMR is valid.
    """
    key, value = [], []
    tokens = []
    regex1 = r'# ::tok (.+)'
    regex = r'# ::snt (.+)'
    cur_amr = []
    has_content = False
    sentence = ''
    for line in input_f:
        line = line.strip()
        if line == "":
            if not has_content:
                # empty lines before current AMR
                continue
            else:
                # end of current AMR
                break
        if line.startswith("# ::snt "):
            sentence = re.match(regex, line).group(1)
        elif line.startswith('# ::tok '):
            tokens = re.match(regex1, line).group(1).lower().split()
        elif line.startswith('# ::node'):
            token = line.split('\t')
            key.append(token[2].rstrip())
            value.append(token[1])
        elif line.startswith('#'):
            # ignore any other comment line (starting with "#") in the AMR file
            continue
        else:
            has_content = True
            cur_amr.append(line)
    if cur_amr:
        g = penman.decode(' '.join(cur_amr))
        amr_penman = penman.encode(g)
        return "".join(cur_amr), sentence, key, value, amr_penman, tokens
    else:
        return '', '', '', '', '', ''
def split_multi_sentence(pgraph):
    # Get the graph string and variable to concept dictionary
    pgraph = deepcopy(pgraph)
    gid = pgraph.metadata.get('id', 'none')    # for logging
    pgraph.metadata = {}
    var2concept = {t.source: t.target for t in pgraph.instances()}
    gstring = penman.encode(pgraph, indent=0)
    # Delete the multi-sentence line and any modifiers like (:li, :mode)
    glines = gstring.split('\n')
    assert glines[0].startswith('(m / multi-sentence')
    glines = glines[1:]
    while glines:
        if glines[0].startswith(':') and not glines[0].startswith(':snt'):
            glines = glines[1:]
        else:
            break
    # Rejoin the lines, remove extra spaces and remove the ending paren
    gstring = ' '.join(glines)
    gstring = re.sub(r' +', ' ', gstring).strip()
    assert gstring.endswith(')')
    gstring = gstring[:-1]
    # Split on the :snt lines and separate each sentence into its own graph
    gs_list = [gs.strip() for gs in re.split(r':snt\d+', gstring)]
    gs_list = [gs for gs in gs_list if gs]
    # Convert the separated graphs to penman objects
    pgraphs = []
    for gidx, gstring in enumerate(gs_list):
        try:
            pgraph = penman.decode(gstring)
        except penman.DecodeError:
            logger.error('Error decoding %s %d\n%s' % (gid, gidx, gstring))
            continue
        # If a variable is not in this portion of the graph then penman will treat it
        # like an attribute.  In this case we need to add an instance for it.  The way
        # penman 1.1.0 works, this will fix the graph.
        missing_set = set(t.target for t in pgraph.attributes() if re_attrib.match(t.target))
        if missing_set:
            logger.info('%s %d missing variables: %s' % (gid, gidx, str(missing_set)))
            # Add the variables and re-decode the graph
            for var in missing_set:
                concept = var2concept.get(var, None)
                if concept is not None:
                    pgraph.triples.append((var, ':instance', concept))
        pgraphs.append(pgraph)
    return pgraphs
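# Toy illustration (graph string invented for this sketch; relies on the
# module-level re_attrib / logger that split_multi_sentence uses):
_mg = penman.decode('(m / multi-sentence :snt1 (a / alpha) :snt2 (b / beta))')
assert [g.top for g in split_multi_sentence(_mg)] == ['a', 'b']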
def get_amr_line(input_f):
    """
    Read the file containing AMRs.  AMRs are separated by a blank line.
    Each call of get_amr_line() returns the next available AMR (in one-line form).
    Note: this function does not verify if the AMR is valid.
    """
    regex = r'# ::snt (.+)'
    sentence = ''
    cur_amr = []
    has_content = False
    for line in input_f:
        line = line.strip()
        if line == "":
            if not has_content:
                # empty lines before current AMR
                continue
            else:
                # end of current AMR
                break
        if line.startswith('# ::snt'):
            sentence = re.match(regex, line).group(1)
        if line.startswith("#"):
            # ignore the comment line (starting with "#") in the AMR file
            continue
        else:
            has_content = True
            cur_amr.append(line)
    if cur_amr:
        g = penman.decode(' '.join(cur_amr))
        amr_penman = penman.encode(g)
        c = PENMANCodec()
        t = c.parse(amr_penman)
        l = layout.interpret(t)
        value, key = t.positions(l, 0)
        return "".join(cur_amr), sentence, key, value, amr_penman
    else:
        return '', '', '', '', ''
def __init__(self, graph, force_annotate=False):
    # Convert or copy the input graph to penman format
    if isinstance(graph, str):
        pgraph = penman.decode(graph, model=NoOpModel())
    elif isinstance(graph, penman.graph.Graph):
        pgraph = deepcopy(graph)
    else:
        raise ValueError('Code requires either a string or a penman graph')
    # Annotate if needed (aligner/tagging require annotation)
    is_annotated = all(key in pgraph.metadata for key in ('tokens', 'lemmas', 'pos_tags'))
    if not is_annotated or force_annotate:
        # Sanity check required tag.  Throws KeyError if missing.
        sentence = pgraph.metadata['snt']
        pgraph = annotate_penman(pgraph)
        self.annotation_performed = True    # for unit-testing and debug
    else:
        self.annotation_performed = False
    # Align the graph.  For simplicity, always do this.
    # If there are existing alignments they need to be removed.
    # See https://penman.readthedocs.io/en/latest/api/penman.surface.html
    if penman.surface.alignments(pgraph) or penman.surface.role_alignments(pgraph):
        for key, items in pgraph.epidata.items():
            pgraph.epidata[key] = [x for x in items
                                   if not isinstance(x, penman.surface.AlignmentMarker)]
    pgraph = RBWAligner.from_penman_w_json(pgraph).get_penman_graph()
    # Get the graph string and pos tags for the tagger
    self.metadata = pgraph.metadata.copy()
    pos_tags = json.loads(self.metadata['pos_tags'])
    pgraph.metadata = {}
    gstring = penman.encode(pgraph, model=NoOpModel(), indent=6)
    # Tag the graph string
    self.gstring_tagged = self.tag(gstring, pos_tags)
def get_edge_idx_amr(s):
    # parse
    penman_g = penman.decode(s)
    s = penman_g.metadata.get('tok').split(' ')
    wid = []
    var = []
    # k=word id; v=variable
    for k, v in penman_g.epidata.items():
        if k[1] == ':instance':
            if len(v):
                if type(v[0]) == penman.surface.Alignment:
                    wid.append(v[0].indices[0])
                    var.append(k[0])
    # graph construction
    g = nx.Graph()
    for v in penman_g.variables():
        g.add_node(v)
    for e in penman_g.edges():
        g.add_edge(e.source, e.target)
    edge_space = []
    for i in range(len(var)):
        for j in range(len(var)):
            edge_space.append((i, j))
    # random.shuffle(edge_space)
    src_idx = [i for (i, j) in edge_space]
    dst_idx = [j for (i, j) in edge_space]
    edge_labels = []
    for e in edge_space:
        if (var[e[0]], var[e[1]]) in g.edges():
            edge_labels.append(1)
        elif (var[e[1]], var[e[0]]) in g.edges():
            edge_labels.append(1)
        else:
            edge_labels.append(0)
    return src_idx, dst_idx, np.array(edge_labels)
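# How the ':instance' epidata lookup above behaves, shown on a toy graph with
# ISI-style ~e.N alignments (example string invented for this sketch):
_g = penman.decode('(w / want-01~e.1 :ARG0 (b / boy~e.0))')
for _k, _v in _g.epidata.items():
    if _k[1] == ':instance' and _v and isinstance(_v[0], penman.surface.Alignment):
        print(_k[0], '-> token', _v[0].indices[0])    # w -> token 1, b -> token 0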
def _fix_and_make_graph(self, nodes):
    # Drop special tokens (e.g. <s>, </s>) but keep pointer tokens
    nodes_ = []
    for n in nodes:
        if isinstance(n, str):
            if n.startswith('<') and n.endswith('>') and (not n.startswith('<pointer:')):
                pass
            else:
                nodes_.append(n)
        else:
            nodes_.append(n)
    nodes = nodes_
    if not nodes:
        return penman.Graph()
    if self.use_pointer_tokens:
        # Split pointer tokens that were fused with trailing text
        i = 0
        nodes_ = []
        while i < len(nodes):
            nxt = nodes[i]
            pst = None
            if isinstance(nxt, str) and nxt.startswith('<pointer:'):
                e = nxt.find('>')
                if e != len(nxt) - 1:
                    pst = nxt[e + 1:]
                    nxt = nxt[:e + 1]
                nodes_.append(nxt)
                if pst is not None:
                    nodes_.append(pst)
            else:
                nodes_.append(nxt)
            i += 1
        nodes = nodes_
        # Turn pointer tokens into variables, adding '(' and '/' where needed
        i = 1
        nodes_ = [nodes[0]]
        while i < len(nodes):
            nxt = nodes[i]
            if isinstance(nxt, str) and nxt.startswith('<pointer:') and i + 1 < len(nodes):
                nxt = 'z' + nxt[9:-1]
                fol = nodes[i + 1]
                # is not expansion
                if isinstance(fol, str) and (fol.startswith(':') or (fol == ')')):
                    nodes_.append(nxt)
                else:
                    if self.remove_pars:
                        nodes_.append('(')
                    else:
                        if nodes_[-1] != '(':
                            nodes_.append('(')
                    nodes_.append(nxt)
                    nodes_.append('/')
            else:
                nodes_.append(nxt)
            i += 1
        nodes = nodes_
    # Merge a stray ':' with the following role name
    i = 0
    last = True
    nodes_ = []
    while i < (len(nodes) - 1):
        if nodes[i] == ':':
            nodes_.append(nodes[i] + nodes[i + 1])
            i += 2
            last = False
        else:
            nodes_.append(nodes[i])
            i += 1
            last = True
    if last:
        nodes_.append(nodes[-1])
    nodes = nodes_
    # Remove duplicated '/' tokens
    i = 0
    nodes_ = []
    while i < (len(nodes)):
        if i < 2:
            nodes_.append(nodes[i])
            i += 1
        elif nodes_[-2] == '/' and nodes[i] == '/':
            i += 2
        else:
            nodes_.append(nodes[i])
            i += 1
    nodes = nodes_
    # Rename variables that were (re)defined more than once
    i = 0
    newvars = 0
    variables = set()
    remap = {}
    nodes_ = []
    while i < (len(nodes)):
        next = nodes[i]
        if next == '/':
            last = nodes_[-1]
            if last in variables:
                last_remap = f"z{newvars + 1000}"
                newvars += 1
                nodes_[-1] = last_remap
                remap[last] = last_remap
            variables.add(last)
            nodes_.append(next)
        elif self._classify(next) == 'VAR' and next in remap and (i < len(nodes) - 1) and nodes[i + 1] != '/':
            next = remap[next]
            nodes_.append(next)
        else:
            nodes_.append(next)
        i += 1
    nodes = nodes_
    # Balance parentheses, truncating at the first point where they match
    pieces_ = []
    open_cnt = 0
    closed_cnt = 0
    if nodes[0] != '(':
        pieces_.append('(')
        open_cnt += 1
    for p in nodes:
        if p == '(':
            open_cnt += 1
        elif p == ')':
            closed_cnt += 1
        pieces_.append(p)
        if open_cnt == closed_cnt:
            break
    nodes = pieces_ + [')'] * (open_cnt - closed_cnt)
    # Token-bigram filter: only keep a piece if it is valid after the
    # class of the previous piece (VAR, INST, EDGE, CONST, ...)
    pieces = []
    for piece in nodes:
        if not pieces:
            pieces.append('(')
        else:
            piece = str(piece)
            if piece.startswith('"') or '"' in piece.strip('"'):
                piece = '"' + piece.replace('"', '') + '"'
            prev = self._classify(pieces[-1])
            next = self._classify(piece)
            if next == 'CONST':
                quote = False
                for char in (',', ':', '/', '(', ')', '.', '!', '?', '\\', '_', '='):
                    if char in piece:
                        quote = True
                        break
                if quote:
                    piece = '"' + piece.strip('"') + '"'
            if prev == '(':
                if next in ('VAR', 'I'):
                    pieces.append(piece)
            elif prev == ')':
                if next in (')', 'EDGE', 'MODE'):
                    pieces.append(piece)
            elif prev == 'VAR':
                if next in ('/', 'EDGE', 'MODE', ')'):
                    pieces.append(piece)
            elif prev == '/':
                if next in ('INST', 'I'):
                    pieces.append(piece)
            elif prev == 'INST':
                if next in (')', 'EDGE', 'MODE'):
                    pieces.append(piece)
            elif prev == 'I':
                if next in ('/', ')', 'EDGE', 'MODE'):
                    pieces.append(piece)
            elif prev == 'EDGE':
                if next in ('(', 'VAR', 'CONST', 'I'):
                    pieces.append(piece)
                elif next == ')':
                    pieces[-1] = piece
                elif next in ('EDGE', 'MODE'):
                    pieces[-1] = piece
            elif prev == 'MODE':
                if next == 'INST':
                    pieces.append(piece)
            elif prev == 'CONST':
                if next in (')', 'EDGE', 'MODE'):
                    pieces.append(piece)
    # Balance parentheses again after filtering
    pieces_ = []
    open_cnt = 0
    closed_cnt = 0
    if pieces[0] != '(':
        pieces_.append('(')
        open_cnt += 1
    for p in pieces:
        if p == '(':
            open_cnt += 1
        elif p == ')':
            closed_cnt += 1
        pieces_.append(p)
        if open_cnt == closed_cnt:
            break
    pieces = pieces_ + [')'] * (open_cnt - closed_cnt)
    linearized = re.sub(r'\s+', ' ', ' '.join(pieces)).strip()
    """
    line = linearized
    # make sure parentheses match
    # copied from https://github.com/RikVN/AMR/blob/master/restoreAMR/restore_amr.py
    open_count = 0
    close_count = 0
    for i, c in enumerate(line):
        if c == '(':
            open_count += 1
        elif c == ')':
            close_count += 1
        if open_count == close_count and open_count > 0:
            line = line[:i].strip()
            break
    old_line = line
    while True:
        open_count = len(re.findall(r'\(', line))
        close_count = len(re.findall(r'\)', line))
        if open_count > close_count:
            line += ')' * (open_count - close_count)
        elif close_count > open_count:
            for i in range(close_count - open_count):
                line = line.rstrip(')')
                line = line.rstrip(' ')
        if old_line == line:
            break
        old_line = line
    """
    # Decode, then replace missing concepts/targets with the dummy concept 'thing'
    graph = penman.decode(linearized + ' ')
    triples = []
    newvars = 2000
    for triple in graph.triples:
        x, rel, y = triple
        if x is None:
            pass
        elif rel == ':instance' and y is None:
            triples.append(penman.Triple(x, rel, 'thing'))
        elif y is None:
            var = f'z{newvars}'
            newvars += 1
            triples.append(penman.Triple(x, rel, var))
            triples.append(penman.Triple(var, ':instance', 'thing'))
        else:
            triples.append(triple)
    graph = penman.Graph(triples)
    linearized = encode(graph)

    def fix_text(linearized=linearized):
        n = 0

        def _repl1(match):
            nonlocal n
            out = match.group(1) + match.group(2) + str(3000 + n) + ' / ' + match.group(2) + match.group(3)
            n += 1
            return out

        linearized = re.sub(r'(\(\s?)([a-z])([^\/:\)]+[:\)])', _repl1, linearized,
                            flags=re.IGNORECASE | re.MULTILINE)

        def _repl2(match):
            return match.group(1)

        linearized = re.sub(r'(\(\s*[a-z][\d+]\s*\/\s*[^\s\)\(:\/]+\s*)((?:/\s*[^\s\)\(:\/]+\s*)+)',
                            _repl2, linearized, flags=re.IGNORECASE | re.MULTILINE)
        # adds a ':' to args w/o it
        linearized = re.sub(r'([^:])(ARG)', r'\1 :\2', linearized)
        # removes edges with no node
        # linearized = re.sub(r':[^\s\)\(:\/]+?\s*\)', ')', linearized, flags=re.MULTILINE)
        return linearized

    linearized = fix_text(linearized)
    g = penman.decode(linearized)
    return g
def parse_penman(graph_fragment):
    import penman
    return penman.decode(graph_fragment)
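# Quick sanity check (toy fragment invented for this sketch):
_g = parse_penman('(h / hello-01 :ARG0 (w / world))')
assert _g.top == 'h'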