Example 1
 def get_penman(self, return_type='object', indent=None):
     if return_type == 'object':
         return pm.decode(self.penman)
     elif return_type == 'str':
         if indent is None:
             return self.penman
         else:
             return pm.encode(pm.decode(self.penman),
                              top=self.top,
                              indent=indent)
Example 2
 def __init__(self, gstring):
     self.graph    = penman.decode(gstring, model=NoOpModel())
     # Run the serialization
     self.elements = []              # clear elements list
     self.nodes    = set()           # nodes visited (to prevent recursion)
     self.serialize(self.graph.top)
     self.tokens   = self.elements_to_tokens(self.elements)
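NoOpModel is what keeps inverted roles from being normalized while decoding; a small sketch of the difference, assuming penman 1.x import paths:

import penman
from penman.models.noop import NoOpModel

s = '(d / dog :ARG0-of (b / bark-01))'
print(penman.decode(s).triples)                     # default model de-inverts: ('b', ':ARG0', 'd')
print(penman.decode(s, model=NoOpModel()).triples)  # NoOpModel keeps ('d', ':ARG0-of', 'b')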
Example 3
def add_lemmas(entry, snt_key, verify_tok_key=None):
    global spacy_nlp
    load_spacy()
    graph      = penman.decode(entry, model=NoOpModel())   # do not de-invert graphs
    doc        = spacy_nlp(graph.metadata[snt_key])
    nlp_tokens = [t.text for t in doc]
    graph.metadata['tokens'] = json.dumps(nlp_tokens)
    # Create lemmas
    # SpaCy's lemmatizer returns -PRON- for pronouns so strip these
    # Don't try to lemmatize any named-entities or proper nouns.  Lower-case any other words.
    lemmas = []
    for t in doc:
        if t.lemma_ == '-PRON-':
            lemma = t.text.lower()
        elif t.tag_.startswith('NNP') or t.ent_type_ not in ('', 'O'):
            lemma = t.text
        else:
            lemma = t.lemma_.lower()
        lemmas.append(lemma)
    graph.metadata['lemmas'] = json.dumps(lemmas)
    # If verify_tok_key is not None, verify that the new tokenization is the same as the existing
    # and only return the graph if the tokenized length is the same
    if verify_tok_key is not None:
        isi_tokens = graph.metadata[verify_tok_key].split()
        if len(isi_tokens) == len(lemmas) == len(nlp_tokens):
            return graph
        else:
            return None
    else:
        return graph
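Because the annotations are stored as JSON strings in the metadata, reading them back is symmetric. A usage sketch, assuming the module's load_spacy/spacy_nlp globals are available:

import json

entry = '''# ::snt The dog barked.
(b / bark-01 :ARG0 (d / dog))'''
graph  = add_lemmas(entry, snt_key='snt')
tokens = json.loads(graph.metadata['tokens'])
lemmas = json.loads(graph.metadata['lemmas'])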
Example 4
def test_for_decode_encode_issue(gold):
    graph = penman.decode(gold, model=NoOpModel())
    test = penman.encode(graph, indent=6, compact=True, model=NoOpModel())
    gold = to_graph_line(gold)
    test = to_graph_line(test)
    is_good = test == gold
    return graph, is_good
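to_graph_line is not shown in this listing; a plausible stand-in that drops metadata lines and collapses whitespace, so the comparison ignores formatting differences only:

def to_graph_line(gstring):
    # hypothetical helper: strip '#' comment lines and normalize whitespace
    lines = [ln for ln in gstring.splitlines() if not ln.startswith('#')]
    return ' '.join(' '.join(lines).split())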
Example 5
def _process_entry(entry, tokens=None):
    pen = penman.decode(entry)      # standard de-inverting penman loading process
    # Filter out old tags and add the tags from SpaCy parse
    global keep_tags
    if keep_tags is not None:
        pen.metadata = {k:v for k,v in pen.metadata.items() if k in keep_tags}  # filter extra tags
    # If tokens aren't supplied then annotate the graph
    if not tokens:
        global spacy_nlp
        assert spacy_nlp is not None
        tokens = spacy_nlp(pen.metadata['snt'])
    pen.metadata['tokens']   = json.dumps([t.text      for t in tokens])
    ner_tags = [t.ent_type_ if t.ent_type_ else 'O' for t in tokens]    # replace empty with 'O'
    pen.metadata['ner_tags'] = json.dumps(ner_tags)
    pen.metadata['ner_iob']  = json.dumps([t.ent_iob_  for t in tokens])
    pen.metadata['pos_tags'] = json.dumps([t.tag_      for t in tokens])
    # Create lemmas
    # SpaCy's lemmatizer returns -PRON- for pronouns so strip these
    # Don't try to lemmatize any named-entities or proper nouns.  Lower-case any other words.
    lemmas = []
    for t in tokens:
        if t.lemma_ == '-PRON-':
            lemma = t.text.lower()
        elif t.tag_.startswith('NNP') or t.ent_type_ not in ('', 'O'):
            lemma = t.text
        else:
            lemma = t.lemma_.lower()
        lemmas.append(lemma)
    pen.metadata['lemmas'] = json.dumps(lemmas)
    return pen
Example 6
    def from_penman(cls, penman_text, tokenize=False):
        """
        Read AMR from penman notation (will ignore graph data in metadata)
        """
        graph = penman.decode(penman_text)
        nodes, edges = get_simple_graph(graph)
        if tokenize:
            assert 'snt' in graph.metadata, "AMR must contain field ::snt"
            tokens, _ = protected_tokenizer(graph.metadata['snt'])
        else:
            assert 'tok' in graph.metadata, "AMR must contain field ::tok " \
                "(or call this with tokenize=True)"
            tokens = graph.metadata['tok'].split()

        graph_id = None
        if 'id' in graph.metadata:
            graph_id = graph.metadata['id']

        return cls(tokens,
                   nodes,
                   edges,
                   graph.top,
                   penman=graph,
                   clean=True,
                   connect=False,
                   id=graph_id)
Example 7
File: amr.py  Project: AlongWY/AMR
    def parse_AMR_line(line):
        g = penman.decode(line)
        instances = g.instances()
        node_name_list, node_value_list = zip(*[(instance.source,
                                                 instance.target)
                                                for instance in instances])
        node_name_list = list(node_name_list)
        node_value_list = list(node_value_list)
        positions = {name: idx for idx, name in enumerate(node_name_list)}
        relation_list = [[] for _ in node_name_list]
        attribute_list = [[] for _ in node_name_list]

        for src, label, tgt in g.edges():
            relation_list[positions[src]].append([label[1:], tgt])

        for src, label, tgt in g.attributes():
            if label[1:] == "mod":
                attribute_list[positions[src]].append(
                    ["domain", tgt.strip('\"\'')])
            else:
                attribute_list[positions[src]].append(
                    [label[1:], tgt.strip('\"\'')])

        attribute_list[positions[g.top]].append(
            ['TOP', node_value_list[positions[g.top]]])

        result_amr = AMR(node_name_list, node_value_list, relation_list,
                         attribute_list)
        return result_amr
Example 8
 def test_reentrancies(self, x1, x2):
     g = penman.Graph(x1[1])
     assert g.reentrancies() == {'x1': 2}
     g = penman.Graph(x2[1])
     assert g.reentrancies() == {10001: 1}
     # top has an implicit entrancy
     g = penman.decode('(b / bark :ARG1 (d / dog) :ARG1-of (w / wild))')
     assert g.reentrancies() == {'b': 1}
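reentrancies() maps each variable to the number of references beyond its first; a quick sketch with a second example (b is the target of two :ARG0 edges):

import penman

g = penman.decode('(w / want-01 :ARG0 (b / boy) :ARG1 (g / go-02 :ARG0 b))')
assert g.reentrancies() == {'b': 1}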
Example 9
 def get_concept(self, variable):
     itsamr = self.amrs[self.focus]
     some_amr = penman.decode(itsamr)
     # penman<1.0 API: triples() is a method and triples expose .relation;
     # with penman>=1.0 this would be some_amr.triples and x.role
     concepts = [
         x.target for x in some_amr.triples()
         if x.relation == 'instance' and x.source == variable
     ]
     return concepts
Example 10
def main(args):
    with open(args.input, encoding='utf-8') as f, \
            open(args.extra, encoding='utf-8') as e, \
            open(args.output, mode='w', encoding='utf-8') as out:
        amr_pair: List[Tuple[Graph, Graph]]
        amr_pair = []
        for sent_num, (cur_amr1,
                       cur_amr2) in enumerate(generate_amr_lines(f, e),
                                              start=1):
            amr_pair.append((pp.decode(cur_amr1), pp.decode(cur_amr2)))

        amrs = []
        for idx, (amr, extra) in enumerate(amr_pair):
            amr.metadata.update(extra.metadata)
            amrs.append(amr)

        out.write(pp.dumps(amrs))
Example 11
 def from_string_w_json(cls,
                        graph,
                        token_key='tokens',
                        lemma_key='lemmas',
                        **kwargs):
     assert isinstance(graph, str)
     graph = penman.decode(graph, model=NoOpModel())
     return cls.from_penman_w_json(graph, token_key, lemma_key, **kwargs)
Example 12
def load_glove(args, sentences, data_div='', dataset='ptb'):
    data_path = './tmp/glove_' + args.task + data_div + '.npz'
    if os.path.exists(data_path):
        return np.load(data_path)

    savez_dict = {}
    embeddings_dict = {}
    with open('./tmp/glove/glove.42B.300d.txt', 'r') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], "float32")
            embeddings_dict[word] = vector

    if dataset == 'ptb':
        for s in range(len(sentences)):
            word_emb = []
            for w in sentences[s]:
                if w.lower() in embeddings_dict:
                    word_emb.append(
                        np.expand_dims(embeddings_dict[w.lower()], axis=0))
                else:
                    word_emb.append(
                        np.expand_dims(embeddings_dict[','], axis=0))
            savez_dict['s' + str(s)] = np.concatenate(word_emb)
    else:
        for s in range(len(sentences)):
            word_emb = []
            # parse
            penman_g = penman.decode(sentences[s])
            sen = penman_g.metadata.get('tok').split(' ')
            wid = []   # word indices that carry an alignment
            var = []   # variables aligned to those words
            for k, v in penman_g.epidata.items():
                if k[1] == ':instance' and v and isinstance(v[0], penman.surface.Alignment):
                    wid.append(v[0].indices[0])
                    var.append(k[0])
            c_s = []
            for w in sen:
                c_w = clean_string(w)
                if len(c_w) == 0: c_w = ','
                c_s.append(c_w)
            for w in c_s:
                if w.lower() in embeddings_dict:
                    word_emb.append(
                        np.expand_dims(embeddings_dict[w.lower()], axis=0))
                else:
                    word_emb.append(
                        np.expand_dims(embeddings_dict[','], axis=0))
            if len(wid) == 0: wid = [0]
            savez_dict['s' + str(s)] = np.concatenate(
                [word_emb[i] for i in wid])
    np.savez(data_path, **savez_dict)

    return np.load(data_path)
Example 13
 def read(file_path):
     with open(file_path, encoding='utf-8') as f:
         while True:   # loop ends when get_amr_line() returns an empty string
             graph_line = AMR.get_amr_line(f)
             if graph_line == "":
                 break
             graph = pp.decode(graph_line)
             amr = AMR.from_graph(graph)
             yield graph, amr
Example 14
def main(args):
    with open(args.input, encoding='utf-8') as f, \
            open(args.output, mode='w', encoding='utf-8') as out:
        amrs: List[Graph]
        amrs = []
        for sent_num, cur_amr1 in enumerate(generate_amr_lines(f), start=1):
            amrs.append(pp.decode(cur_amr1))

        for idx, amr in enumerate(amrs):
            metadata = amr.metadata
            nodes = []
            node_map = {}

            for index, (src, role, tgt) in enumerate(amr.instances()):
                tgt: str
                node_map[src] = index

                nodes.append({
                    'id': index,
                    "label": tgt
                })

            edges = []
            for src, role, tgt in amr.edges():
                label: str = role[1:]
                source = node_map[src]
                target = node_map[tgt]
                edges.append({
                    "source": source,
                    "target": target,
                    "label": label
                })

            for src, role, tgt in amr.attributes():
                source = node_map[src]
                label: str = role[1:]

                nodes[source].setdefault('properties', [])
                nodes[source].setdefault('values', [])

                nodes[source]['properties'].append(label)
                nodes[source]['values'].append(tgt.lower().strip("\""))

            top = node_map[amr.top]

            out.write(json.dumps({
                "id": metadata['id'],
                "flavor": 2,
                "framework": "amr",
                "language": 'eng',
                "version": 1.1,
                "tops": [top],
                "input": json.loads(metadata['snt']),
                "time": "2020-06-22",
                "nodes": nodes,
                "edges": edges}) + '\n')
Example 15
 def build_from_graph(self, entry, debug=False, allow_deinvert=False):
     # Parse the AMR text
     if allow_deinvert:
         penman_graph = penman.decode(entry)
     else:
         model = NoOpModel()  # does not de-invert edges
         penman_graph = penman.decode(entry, model=model)
     # Build g.instances() => concept relations  (these are nodes)
     for t in penman_graph.instances():
         self._add_instance(t)
         if debug: print(t)
     # Build g.edges() => relations between nodes
     for t in penman_graph.edges():
         self._add_edge(t)
         if debug: print(t)
     # Build g.attributes  => relations between nodes and a constant
     for t in penman_graph.attributes():
         self._add_attribute(t)
         if debug: print(t)
Example 16
def decode(s):
    """
    Deserialize a DMRS object from a PENMAN string.
    """
    try:
        g = penman.decode(s)
    except penman.PenmanError as exc:
        raise PyDelphinException('could not decode with Penman') from exc

    return from_triples(g.triples)
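penman.DecodeError is a subclass of penman.PenmanError, so malformed input is converted into PyDelphin's own exception type. A usage sketch, assuming decode and PyDelphinException from the surrounding module:

try:
    d = decode('(e2 / _eat_v_1 :ARG1')   # deliberately truncated
except PyDelphinException as exc:
    print('could not parse:', exc)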
Example 17
def main(args):
    pattern = re.compile(r'''[\s()":/,\\'#]+''')
    with open(args.input,
              encoding='utf-8') as f, open(args.output,
                                           mode='w',
                                           encoding='utf-8') as out:
        for amr_data in f.readlines():
            if amr_data == '' or amr_data is None:
                break
            amr_data = json.loads(amr_data)
            amr_nodes = amr_data.pop('nodes')
            amr_edges = amr_data.pop('edges', [])

            triples = []
            concepts = []

            for node in amr_nodes:
                short_name = f'c{node["id"]}'
                concept = node["label"]
                if pattern.search(concept):
                    concept = f"\"{concept}\""
                concepts.append(concept)
                triples.append((short_name, 'instance', concept))
                for attr, value in zip(node.get('properties', []),
                                       node.get('values', [])):
                    if pattern.search(value):
                        value = f"\"{value}\""
                    triples.append((short_name, attr, value))

            for edge in amr_edges:
                src = f'c{edge["source"]}'
                target = f'c{edge["target"]}'
                label = edge["label"]
                target = f"\"{target}\"" if pattern.search(target) else target
                triples.append((src, label, target))

            top = amr_data.pop('tops')[0]

            # id = amr_data['id']
            # snt = json.dumps(amr_data['input'])
            # token = json.dumps(amr_data['token'])
            # lemma = json.dumps(amr_data['lemma'])
            # upos = json.dumps(amr_data['upos'])
            # xpos = json.dumps(amr_data['xpos'])
            # ner = json.dumps(amr_data['ner'])

            for key, value in amr_data.items():
                amr_data[key] = json.dumps(value, ensure_ascii=False)

            graph = Graph(triples, top=f"c{top}", metadata=amr_data)

            graph_en = pp.encode(graph)
            graph_de = pp.decode(graph_en)

            out.write(graph_en + '\n\n')
Example 18
def get_amrs(inp):
    dats = inp.read().split("\n\n")
    dats = [" ".join(z for z in x.split("\n") if not z.startswith("#"))
            for x in dats if x]
    dat = []
    for d in dats:
        dat.append(penman.decode(d))
    datg = [nx.DiGraph() for _ in dat]
    for i, g in enumerate(dat):
        for t in g.triples:
            datg[i].add_edge(t[0], t[2], label=t[1])
    return datg
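A usage sketch: get_amrs takes a file-like object holding blank-line-separated AMR blocks (comment lines are dropped) and returns one networkx DiGraph per AMR:

import io

blocks = '(d / dog :ARG0-of (b / bark-01))\n\n(c / cat)'
graphs = get_amrs(io.StringIO(blocks))
# :instance triples become edges too, e.g. ('d', 'dog') with label ':instance'
print(graphs[0].edges(data=True))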
Example 19
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        '-i',
                        required=True,
                        help='Input file',
                        type=str)
    parser.add_argument('--output',
                        '-o',
                        required=True,
                        help='Output file',
                        type=str)
    args = parser.parse_args()

    # input JSON is expected to have tuples of following type:
    #             (id, amr, (word1, word2))

    with io.open(args.input, 'r', encoding='utf-8') as f:
        data = json.load(f)
    output_paths = []
    l = []
    for id, amr, words in tqdm(data):
        try:
            amr_parsed = amr
            while amr_parsed != re.sub(r'"([^~"]*)"+([^~]*)"~', '"\\1\\2"~',
                                       amr_parsed):
                amr_parsed = re.sub(r'"([^~"]*)"+([^~]*)"~', '"\\1\\2"~',
                                    amr_parsed)
            amr_parsed = re.sub(r"\~e\.[0-9,]+", "",
                                amr_parsed)  # Removing alignment tags
            # amr_parsed = re.sub(r'"{2,}', '""', amr_parsed)  # Removing """"" this kind of things
            graph = penman.decode(amr_parsed)
        except Exception as exc:
            print(amr)
            print(amr_parsed)
            raise Exception("AMR cannot be parsed by penman") from exc
        paths = paths_for_words(graph, words[0], words[1])
        if paths:
            for path in paths:
                sentences = [[
                    (word_from_node(graph,
                                    nodes_from_word(graph, words[0])[0]), None)
                ],
                             [(word_from_node(
                                 graph,
                                 nodes_from_word(graph, words[1])[0]), None)]]
                sentences[0] += sentence_from_path(graph, path[0])
                sentences[1] += sentence_from_path(graph, path[1])
                output_paths.append((id, sentences))
        else:
            output_paths.append((id, [[], []]))

    with io.open(args.output, 'w', encoding='utf-8') as f:
        json.dump(output_paths, f, indent=True)
Example 20
def _process_entry(entry):
    pen = penman.decode(entry)
    # Remove :wiki from the graphs since we want to ignore these
    triples = [t for t in pen.attributes() if t.role == ':wiki']
    for t in triples:
        try:
            pen.triples.remove(t)
            del pen.epidata[t]
        except (ValueError, KeyError):
            logger.error('Unable to remove triple: %s', t)
    return pen
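A usage sketch (the :wiki value is illustrative): after processing, re-encoding shows the graph without its :wiki attribute:

import penman

entry = '(c / city :wiki "Q90" :name (n / name :op1 "Paris"))'
pen = _process_entry(entry)
print(penman.encode(pen))   # no :wiki "Q90" in the output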
Example 21
 def from_penman(cls, penman_text, tokenize=False):
     """
     Read AMR from penman notation (will ignore graph data in metadata)
     """
     graph = penman.decode(penman_text)
     nodes, edges = get_simple_graph(graph)
     if tokenize:
          assert 'snt' in graph.metadata, "AMR must contain field ::snt"
         tokens, _ = protected_tokenizer(graph.metadata['snt'])
     else:
         assert 'tok' in graph.metadata, "AMR must contain field ::tok"
         tokens = graph.metadata['tok'].split()
     return cls(tokens, nodes, edges, graph.top, penman=graph)
Example 22
def penman_to_dot(graph_fragment, lex_label, lemma, form, replacement, pos, prefix="n"):
    """
    Converts a supertag to a little dot graph.
    """

    import penman
    if isinstance(graph_fragment, str):
        g = penman.decode(graph_fragment)
    else:
        g = graph_fragment
    name2name = dict()
    accounted_for = set()
    counter = 0
    r = ""

    for f, rel, to in g.triples:

        if f not in name2name:
            new_name = prefix+str(counter)
            counter += 1
            name2name[f] = new_name

        if rel != ":instance" and to not in name2name:
            new_name = prefix+str(counter)
            counter += 1
            name2name[to] = new_name

    for f, rel, to in g.triples:

        if rel == ":instance":
            is_root = f == g.top
            if to is None:
                source = f.split("<")[1][:-1]
                if is_root:
                    r += name2name[f] + ' [label="' + source + '", fontcolor="red", style="bold"];\n'
                else:
                    r += name2name[f] + ' [label="' + source + '", fontcolor="red"];\n'
            else:
                label = relex(to, lex_label, lemma, form, replacement, pos)
                if is_root:
                    r += name2name[f] + ' [style="bold", label="' + label + '"];\n'
                else:
                    r += name2name[f] + ' [label="' + label + '"];\n'
            accounted_for.add(name2name[f])
        else:

            r += name2name[f] + " -> " + name2name[to] + ' [label="' + rel[1:] + '"];\n'

    assert set(accounted_for) == set(name2name.values())

    return r, name2name[g.top]
Example 23
    def _get_graph(block):
        amr_str = ' '.join(
            [line for line in block if not line.startswith('#')])
        graph = penman.decode(amr_str)

        named_concepts = set()
        attributes = set()
        # penman<1.0 API: triples() is a method and triples expose .relation/.source/.target
        for t in graph.triples():
            if t.relation == 'instance':
                named_concepts.add(t.source)
        for t in graph.triples():
            if t.relation != 'instance' and t.target not in named_concepts:
                attributes.add(t.target)
        return graph, attributes
Example 24
    def get_amr_line(input_f):
        """
        Read the file containing AMRs. AMRs are separated by a blank line.
        Each call of get_amr_line() returns the next available AMR (in one-line form).
        Note: this function does not verify if the AMR is valid

        """
        key, value = [], []
        # read_amr = ''
        regex1 = r'# ::tok (.+)'
        regex = r'# ::snt (.+)'
        cur_amr = []
        has_content = False
        sentence = ''
        tokens = []   # stays empty if the block has no '# ::tok' line
        for line in input_f:
            line = line.strip()
            if line == "":
                if not has_content:
                    # empty lines before current AMR
                    continue
                else:
                    # end of current AMR
                    break
            if line.strip().startswith("# ::snt "):
                sentence = re.match(regex, line.strip()).group(1)
                # updated
                # ignore the comment line (starting with "#") in the AMR file
                # continue
            elif line.strip().startswith('# ::tok '):
                tokens = re.match(regex1,
                                  line.strip()).group(1).lower().split()
            elif line.strip().startswith('# ::node'):
                token = line.split('\t')
                key.append(token[2].rstrip())
                value.append(token[1])
                # level[token[2].rstrip()] = token[1]
            elif line.strip().startswith('#'):
                continue
            else:
                has_content = True
                # read_amr += line
                cur_amr.append(line.strip())
        if cur_amr:
            g = penman.decode(' '.join(cur_amr))
            amr_penman = penman.encode(g)
            return "".join(cur_amr), sentence, key, value, amr_penman, tokens
        else:
            return '', '', '', '', '', ''
Example 25
def split_multi_sentence(pgraph):
    # Get the graph string and variable to concept dictionary
    pgraph = deepcopy(pgraph)
    gid = pgraph.metadata.get('id', 'none')  # for logging
    pgraph.metadata = {}
    var2concept = {t.source: t.target for t in pgraph.instances()}
    gstring = penman.encode(pgraph, indent=0)
    # delete the multi-sentence line and any modifiers like (:li, :mode)
    glines = gstring.split('\n')
    assert glines[0].startswith('(m / multi-sentence')
    glines = glines[1:]
    while glines:
        if glines[0].startswith(':') and not glines[0].startswith(':snt'):
            glines = glines[1:]
        else:
            break
    # rejoin the lines remove extra spaces and remove ending paren
    gstring = ' '.join(glines)
    gstring = re.sub(r' +', ' ', gstring).strip()
    assert gstring.endswith(')')
    gstring = gstring[:-1]
    # Split on the :snt lines and separate each sentence to its own graph
    gs_list = [gs.strip() for gs in re.split(r':snt\d+', gstring)]
    gs_list = [gs for gs in gs_list if gs]
    # Convert the separated graphs to penman objects
    pgraphs = []
    for gidx, gstring in enumerate(gs_list):
        try:
            pgraph = penman.decode(gstring)
        except penman.DecodeError:
            logger.error('Error decoding %s %d\n%s' % (gid, gidx, gstring))
            continue
        # If a variable is not in this portion of the graph then penman will treat it like an
        # attribute.  In this case we need to add an instance for it.  The way penman 1.1.0
        # works, this will fix the graph.
        missing_set = set(t.target for t in pgraph.attributes()
                          if re_attrib.match(t.target))
        if missing_set:
            logger.info('%s %d missing variables: %s' %
                        (gid, gidx, str(missing_set)))
        # Add the variables and re-decode the graph
        for var in missing_set:
            concept = var2concept.get(var, None)
            if concept is not None:
                pgraph.triples.append((var, ':instance', concept))
        pgraphs.append(pgraph)
    return pgraphs
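A usage sketch, assuming the module-level logger and re_attrib pattern this function relies on:

import penman

msg = ('(m / multi-sentence '
       ':snt1 (d / dog :ARG0-of (b / bark-01)) '
       ':snt2 (s / sleep-01 :ARG0 d))')
for g in split_multi_sentence(penman.decode(msg)):
    print(penman.encode(g))
# the reused variable d gets its (d / dog) instance re-added in the second graph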
Example 26
    def get_amr_line(input_f):
        """
        Read the file containing AMRs. AMRs are separated by a blank line.
        Each call of get_amr_line() returns the next available AMR (in one-line form).
        Note: this function does not verify if the AMR is valid

        """
        regex = r'# ::snt (.+)'
        sentence = ''
        cur_amr = []
        has_content = False
        for line in input_f:
            line = line.strip()
            if line == "":
                if not has_content:
                    # empty lines before current AMR
                    continue
                else:
                    # end of current AMR
                    break
            if line.startswith('# ::snt'):
                sentence = re.match(regex, line).group(1)
            if line.startswith("#"):
                # comment lines (including ::snt) are not part of the AMR body
                continue
            has_content = True
            cur_amr.append(line)
        if cur_amr:
            g = penman.decode(' '.join(cur_amr))
            amr_penman = penman.encode(g)

            c = PENMANCodec()
            t = c.parse(amr_penman)
            l = layout.interpret(t)
            value, key = t.positions(l, 0)

            return "".join(cur_amr), sentence, key, value, amr_penman
        else:
            return '', '', '', '', ''
Example 27
 def __init__(self, graph, force_annotate=False):
     # Convert or copy the input graph to penman format
     if isinstance(graph, str):
         pgraph = penman.decode(graph, model=NoOpModel())
     elif isinstance(graph, penman.graph.Graph):
         pgraph = deepcopy(graph)
     else:
         raise ValueError('Code requires either a string or a penman Graph')
     # Annotate if needed (aligner/tagging require annotation)
     is_annotated = all([
         key in pgraph.metadata for key in ('tokens', 'lemmas', 'pos_tags')
     ])
     if not is_annotated or force_annotate:
         # Sanity check for the required ::snt tag; raises KeyError if missing
         sentence = pgraph.metadata['snt']
         pgraph = annotate_penman(pgraph)
         self.annotation_performed = True  # for unit-testing and debug
     else:
         self.annotation_performed = False
     # Align the graph.  For simplicity, always do this.
     # If there are existing alignments they need to be removed.
     # See https://penman.readthedocs.io/en/latest/api/penman.surface.html
     if penman.surface.alignments(pgraph) or penman.surface.role_alignments(
             pgraph):
         for key, items in pgraph.epidata.items():
             pgraph.epidata[key] = [
                 x for x in items
                 if not isinstance(x, penman.surface.AlignmentMarker)
             ]
     pgraph = RBWAligner.from_penman_w_json(pgraph).get_penman_graph()
     # get the graph string and pos tags for the tagger
     self.metadata = pgraph.metadata.copy()
     pos_tags = json.loads(self.metadata['pos_tags'])
     pgraph.metadata = {}
     gstring = penman.encode(pgraph, model=NoOpModel(), indent=6)
     # Tag the graph string
     self.gstring_tagged = self.tag(gstring, pos_tags)
Example 28
def get_edge_idx_amr(s):
    # parse
    penman_g = penman.decode(s)
    s = penman_g.metadata.get('tok').split(' ')
    wid = []   # word indices that carry an alignment
    var = []   # variables aligned to those words
    for k, v in penman_g.epidata.items():
        if k[1] == ':instance' and v and isinstance(v[0], penman.surface.Alignment):
            wid.append(v[0].indices[0])
            var.append(k[0])
    # graph construction
    g = nx.Graph()
    for v in penman_g.variables():
        g.add_node(v)
    for e in penman_g.edges():
        g.add_edge(e.source, e.target)

    edge_space = []
    for i in range(len(var)):
        for j in range(len(var)):
            edge_space.append((i, j))
    # random.shuffle(edge_space)
    src_idx = [i for (i, j) in edge_space]
    dst_idx = [j for (i, j) in edge_space]

    edge_labels = []
    for e in edge_space:
        if (var[e[0]], var[e[1]]) in g.edges():
            edge_labels.append(1)
        elif (var[e[1]], var[e[0]]) in g.edges():
            edge_labels.append(1)
        else:
            edge_labels.append(0)

    return src_idx, dst_idx, np.array(edge_labels)
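The epidata loop above pulls ISI-style alignments (~e.N) by hand; penman also exposes them through penman.surface, a sketch:

import penman
from penman import surface

g = penman.decode('(d / dog~e.1 :ARG0-of (b / bark-01~e.2))')
for triple, alignment in surface.alignments(g).items():
    print(triple, alignment.indices)   # e.g. ('d', ':instance', 'dog') (1,)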
Example 29
    def _fix_and_make_graph(self, nodes):

        nodes_ = []
        for n in nodes:
            if isinstance(n, str):
                if n.startswith('<') and n.endswith('>') and (not n.startswith('<pointer:')):
                    pass
                else:
                    nodes_.append(n)
            else:
                nodes_.append(n)
        nodes = nodes_
        if not nodes:
            return penman.Graph()

        if self.use_pointer_tokens:

            i = 0
            nodes_ = []
            while i < len(nodes):
                nxt = nodes[i]
                pst = None
                if isinstance(nxt, str) and nxt.startswith('<pointer:'):
                    e = nxt.find('>')
                    if e != len(nxt) - 1:
                        pst = nxt[e + 1:]
                        nxt = nxt[:e + 1]
                    nodes_.append(nxt)
                    if pst is not None:
                        nodes_.append(pst)
                else:
                    nodes_.append(nxt)
                i += 1
            nodes = nodes_

            i = 1
            nodes_ = [nodes[0]]
            while i < len(nodes):
                nxt = nodes[i]
                if isinstance(nxt, str) and nxt.startswith('<pointer:') and i + 1 < len(nodes):
                    nxt = 'z' + nxt[9:-1]
                    fol = nodes[i + 1]
                    # is not expansion
                    if isinstance(fol, str) and (fol.startswith(':') or (fol == ')')):
                        nodes_.append(nxt)
                    else:
                        if self.remove_pars:
                            nodes_.append('(')
                        else:
                            if nodes_[-1] != '(':
                                nodes_.append('(')
                                # pass
                        nodes_.append(nxt)
                        nodes_.append('/')
                else:
                    nodes_.append(nxt)
                i += 1
            nodes = nodes_

        i = 0
        nodes_ = []
        last = True   # guard: nothing to merge if the loop below never runs
        while i < (len(nodes) - 1):
            if nodes[i] == ':':
                nodes_.append(nodes[i] + nodes[i + 1])
                i += 2
                last = False
            else:
                nodes_.append(nodes[i])
                i += 1
                last = True
        if last:
            nodes_.append(nodes[-1])
        nodes = nodes_

        i = 0
        nodes_ = []
        while i < (len(nodes)):
            if i < 2:
                nodes_.append(nodes[i])
                i += 1
            elif nodes_[-2] == '/' and nodes[i] == '/':
                i += 2
            else:
                nodes_.append(nodes[i])
                i += 1
        nodes = nodes_

        i = 0
        newvars = 0
        variables = set()
        remap = {}
        nodes_ = []
        while i < (len(nodes)):

            next = nodes[i]

            if next == '/':
                last = nodes_[-1]
                if last in variables:
                    last_remap = f"z{newvars + 1000}"
                    newvars += 1
                    nodes_[-1] = last_remap
                    remap[last] = last_remap
                variables.add(last)
                nodes_.append(next)

            elif self._classify(next) == 'VAR' and next in remap and (i < len(nodes) - 1) and nodes[i + 1] != '/':
                next = remap[next]
                nodes_.append(next)

            else:
                nodes_.append(next)

            i += 1

        nodes = nodes_
        pieces_ = []
        open_cnt = 0
        closed_cnt = 0
        if nodes[0] != '(':
            pieces_.append('(')
            open_cnt += 1
        for p in nodes:
            if p == '(':
                open_cnt += 1
            elif p == ')':
                closed_cnt += 1
            pieces_.append(p)
            if open_cnt == closed_cnt:
                break
        nodes = pieces_ + [')'] * (open_cnt - closed_cnt)

        pieces = []
        for piece in nodes:
            if not pieces:
                pieces.append('(')
            else:
                piece = str(piece)
                if piece.startswith('"') or '"' in piece.strip('"'):
                    piece = '"' + piece.replace('"', '') + '"'

                prev = self._classify(pieces[-1])
                next = self._classify(piece)

                if next == 'CONST':
                    quote = False
                    for char in (',', ':', '/', '(', ')', '.', '!', '?', '\\', '_', '='):
                        if char in piece:
                            quote = True
                            break
                    if quote:
                        piece = '"' + piece.strip('"') + '"'

                if prev == '(':
                    if next in ('VAR', 'I'):
                        pieces.append(piece)
                elif prev == ')':
                    if next in (')', 'EDGE', 'MODE'):
                        pieces.append(piece)
                elif prev == 'VAR':
                    if next in ('/', 'EDGE', 'MODE', ')'):
                        pieces.append(piece)
                elif prev == '/':
                    if next in ('INST', 'I'):
                        pieces.append(piece)
                elif prev == 'INST':
                    if next in (')', 'EDGE', 'MODE'):
                        pieces.append(piece)
                elif prev == 'I':
                    if next in ('/', ')', 'EDGE', 'MODE'):
                        pieces.append(piece)
                elif prev == 'EDGE':
                    if next in ('(', 'VAR', 'CONST', 'I'):
                        pieces.append(piece)
                    elif next == ')':
                        pieces[-1] = piece
                    elif next in ('EDGE', 'MODE'):
                        pieces[-1] = piece
                elif prev == 'MODE':
                    if next == 'INST':
                        pieces.append(piece)
                elif prev == 'CONST':
                    if next in (')', 'EDGE', 'MODE'):
                        pieces.append(piece)

        pieces_ = []
        open_cnt = 0
        closed_cnt = 0
        if pieces[0] != '(':
            pieces_.append('(')
            open_cnt += 1
        for p in pieces:
            if p == '(':
                open_cnt += 1
            elif p == ')':
                closed_cnt += 1
            pieces_.append(p)
            if open_cnt == closed_cnt:
                break
        pieces = pieces_ + [')'] * (open_cnt - closed_cnt)

        linearized = re.sub(r'\s+', ' ', ' '.join(pieces)).strip()

        """
        line = linearized
        # make sure parentheses match
        # copied from https://github.com/RikVN/AMR/blob/master/restoreAMR/restore_amr.py
        open_count = 0
        close_count = 0
        for i, c in enumerate(line):
            if c == '(':
                open_count += 1
            elif c == ')':
                close_count += 1
            if open_count == close_count and open_count > 0:
                line = line[:i].strip()
                break
        old_line = line
        while True:
            open_count = len(re.findall(r'\(', line))
            close_count = len(re.findall(r'\)', line))
            if open_count > close_count:
                line += ')' * (open_count - close_count)
            elif close_count > open_count:
                for i in range(close_count - open_count):
                    line = line.rstrip(')')
                    line = line.rstrip(' ')
            if old_line == line:
                break
            old_line = line
        """

        graph = penman.decode(linearized + ' ')
        triples = []
        newvars = 2000
        for triple in graph.triples:
            x, rel, y = triple
            if x is None:
                pass
            elif rel == ':instance' and y is None:
                triples.append(penman.Triple(x, rel, 'thing'))
            elif y is None:
                var = f'z{newvars}'
                newvars += 1
                triples.append(penman.Triple(x, rel, var))
                triples.append(penman.Triple(var, ':instance', 'thing'))
            else:
                triples.append(triple)
        graph = penman.Graph(triples)
        linearized = encode(graph)

        def fix_text(linearized=linearized):
            n = 0

            def _repl1(match):
                nonlocal n
                out = match.group(1) + match.group(2) + str(3000 + n) + ' / ' + match.group(2) + match.group(3)
                n += 1
                return out

            linearized = re.sub(r'(\(\s?)([a-z])([^\/:\)]+[:\)])', _repl1, linearized,
                                flags=re.IGNORECASE | re.MULTILINE)

            def _repl2(match):
                return match.group(1)

            linearized = re.sub(r'(\(\s*[a-z][\d+]\s*\/\s*[^\s\)\(:\/]+\s*)((?:/\s*[^\s\)\(:\/]+\s*)+)', _repl2,
                                linearized,
                                flags=re.IGNORECASE | re.MULTILINE)

            # adds a ':' to args w/o it
            linearized = re.sub(r'([^:])(ARG)', r'\1 :\2', linearized)

            # removes edges with no node
            # linearized = re.sub(r':[^\s\)\(:\/]+?\s*\)', ')', linearized, flags=re.MULTILINE)

            return linearized

        linearized = fix_text(linearized)

        g = penman.decode(linearized)
        return g
Example 30
def parse_penman(graph_fragment):
    import penman
    return penman.decode(graph_fragment)