Example #1
0
def _decode(lines):
    """Yield MRS objects decoded from lines of ACE output.

    Handles both regular ACE output (``SENT:`` / ``[ ... ]`` lines) and
    the S-expression records produced with ``--tsdb-stdout``.
    """
    surface = None
    saw_blank = False
    for line in lines:
        if line.startswith('SENT: '):
            # remember the surface string for subsequent MRS lines
            surface = line[6:].rstrip()
        elif line.startswith('['):
            # regular ACE output: the MRS precedes any ' ;  '-separated extras
            text = line.partition(' ;  ')[0].strip()
            result = simplemrs.decode(text)
            result.surface = surface
            yield result
        elif line.startswith('('):
            # with --tsdb-stdout: consume S-expressions until the line is empty
            while line:
                data, rest = SExpr.parse(line)
                line = rest.lstrip()
                if len(data) == 2 and data[0] == ':results':
                    for result in data[1]:
                        for key, val in result:
                            if key == ':mrs':
                                yield simplemrs.decode(val)
        elif line == '\n':
            # two consecutive blank lines clear the cached surface string
            if saw_blank:
                surface = None
                saw_blank = False
            else:
                saw_blank = True
Example #2
0
def compare(testsuite, gold, select='i-id i-input mrs'):
    """
    Compare the MRS results of two [incr tsdb()] profiles.

    Args:
        testsuite (str, TestSuite): path to the test [incr tsdb()]
            testsuite or a :class:`TestSuite` object
        gold (str, TestSuite): path to the gold [incr tsdb()]
            testsuite or a :class:`TestSuite` object
        select: TSQL query to select (id, input, mrs) triples
            (default: `i-id i-input mrs`)
    Yields:
        dict: one comparison record per matched item, shaped as::

            {"id": "item identifier",
             "input": "input sentence",
             "test": number_of_unique_results_in_test,
             "shared": number_of_shared_results,
             "gold": number_of_unique_results_in_gold}

    """
    from delphin import mrs
    from delphin.codecs import simplemrs

    # accept either paths or already-loaded test suites
    if not isinstance(testsuite, itsdb.TestSuite):
        testsuite = itsdb.TestSuite(_validate_tsdb(testsuite))
    if not isinstance(gold, itsdb.TestSuite):
        gold = itsdb.TestSuite(_validate_tsdb(gold))

    projection = tsql.inspect_query('select ' + select)['projection']
    if len(projection) != 3:
        raise CommandError('select does not return 3 fields: ' + select)

    # map item ids to their input sentences for the yielded records
    id_field, input_field = projection[0], projection[1]
    i_inputs = dict(tsql.select('{} {}'.format(id_field, input_field),
                                testsuite))

    # align rows of both suites on the first selected field (the id)
    matched = itsdb.match_rows(tsql.select(select, testsuite),
                               tsql.select(select, gold),
                               0)

    for key, testrows, goldrows in matched:
        test_bag = [simplemrs.decode(r[2]) for r in testrows]
        gold_bag = [simplemrs.decode(r[2]) for r in goldrows]
        test_unique, shared, gold_unique = mrs.compare_bags(test_bag, gold_bag)
        yield {'id': key,
               'input': i_inputs.get(key),
               'test': test_unique,
               'shared': shared,
               'gold': gold_unique}
def __cli_parse__(args):
    """Convert the MRSs of a profile to DMRS-based RDF and serialize it.

    Reads ``i-id i-input mrs`` rows from the profile at ``args.profile``,
    converts each MRS to a DMRS, accumulates the RDF into one graph, and
    serializes it to ``args.output`` in ``args.format``.
    """
    # TODO(review): validate IRI prefix; handle decode/profile/output
    # exceptions (carried over from the original notes).
    ts = itsdb.TestSuite(args.profile)
    prefix = args.prefix.strip("/")
    graph = Graph()

    for row in tsql.select('i-id i-input mrs', ts):
        item_id = row[0]  # renamed from `id` to avoid shadowing the builtin
        text = row[1]
        if args.verbosity > 0:
            print("Parsing sentence {}".format(item_id))
        # parse the MRS string stored in the profile row
        m = simplemrs.decode(row[2])
        # transform to DMRS (the original comment said EDS, which was wrong)
        d = dmrs.from_mrs(m)
        graph = p.dmrs_to_rdf(d=d,
                              prefix=prefix,
                              identifier=item_id,
                              graph=graph,
                              text=text)
    # serialize the accumulated graph
    graph.serialize(destination=args.output, format=args.format)
Example #4
0
    def mrs(self):
        """
        Interpret and return an MRS object.

        If :mod:`delphin.codecs.simplemrs` is available and the value
        of the `mrs` key in the result is a valid SimpleMRS string, or
        if :mod:`delphin.codecs.mrsjson` is available and the value is
        a dictionary, return the interpreted MRS object. If there is
        no `mrs` key in the result, return `None`.

        Raises:
            InterfaceError: when the value is an unsupported type or
                the corresponding module is unavailable
        """
        value = self.get('mrs')
        if value is None:
            return None
        try:
            if isinstance(value, dict):
                from delphin.codecs import mrsjson
                return mrsjson.from_dict(value)
            if isinstance(value, str):
                from delphin.codecs import simplemrs
                return simplemrs.decode(value)
            # any other type is unsupported
            raise TypeError(value.__class__.__name__)
        except (ImportError, TypeError) as exc:
            raise InterfaceError('can not get MRS object') from exc
Example #5
0
def __cli_parse__(args):
    """Read an MRS (from stdin or a testsuite directory) and print its RDF.

    Input is taken from a pipe when stdin is not a TTY; otherwise
    ``args.path`` must be a directory containing ``result.mrs``.
    Only the ``simplemrs`` format is implemented so far.
    """
    print(args)

    fmt = args.fmt
    path = args.path
    pref = args.prefix
    idtf = args.identifier

    # Read the input text: piped stdin wins; otherwise expect a testsuite dir.
    if not sys.stdin.isatty():
        input_text = sys.stdin.read()
    elif os.path.isdir(path):
        # os.path.join keeps absolute paths intact (the previous
        # path.strip("/") turned "/abs/dir" into a relative path).
        with open(os.path.join(path, "result.mrs"), "r") as f:
            input_text = f.read()
    else:
        # previously input_text was left unbound here, crashing later
        raise ValueError("No input on stdin and path is not a directory: "
                         + str(path))

    if fmt == "simplemrs":
        m = simplemrs.decode(input_text)
        g = p.mrs_to_rdf(m, pref, idtf)
    elif fmt in ("eds", "simpledmrs"):
        # not implemented yet; fail here instead of NameError on `g` below
        raise NotImplementedError("format not yet supported: " + fmt)
    # dmrsjson, dmrspenman, dmrx, edsjson, edspenman
    # indexedmrs, mrsjson, mrsprolog, mrx
    else:
        raise ValueError("Not a valid format given")

    print(g.serialize(args.output, None))
Example #6
0
def m1b():
    """Decode and return the MRS for a simple "it rains" analysis."""
    mrs_text = '''
    [ LTOP: h0
      INDEX: e2 [ e SF: prop TENSE: pres MOOD: indicative PROG: - PERF: - ]
      RELS: < [ "_rain_v_1_rel"<0:6> LBL: h1 ARG0: e2 ] >
      HCONS: < h0 qeq h1 > ]
    '''
    return simplemrs.decode(mrs_text)
Example #7
0
def pathological2():
    """Decode and return a large, deliberately pathological MRS fixture.

    The structure is a deeply coordinated sentence ("dogs and dogs chase
    dogs and dogs and chase dogs and dogs") with many udef quantifiers
    and qeq constraints, useful for stress-testing scope resolution.
    """
    return simplemrs.decode('''
    [ LTOP: h0 INDEX: e2 [ e SF: prop TENSE: pres MOOD: indicative PROG: - PERF: - ]
      RELS: < [ udef_q_rel<0:13> LBL: h4 ARG0: x3 [ x PERS: 3 NUM: pl ] RSTR: h5 BODY: h6 ]
              [ udef_q_rel<0:4> LBL: h7 ARG0: x8 [ x PERS: 3 NUM: pl IND: + ] RSTR: h9 BODY: h10 ]
              [ "_dog_n_1_rel"<0:4> LBL: h11 ARG0: x8 ]
              [ _and_c_rel<5:8> LBL: h12 ARG0: x3 L-INDEX: x8 R-INDEX: x13 [ x PERS: 3 NUM: pl IND: + ] ]
              [ udef_q_rel<9:13> LBL: h14 ARG0: x13 RSTR: h15 BODY: h16 ]
              [ "_dog_n_1_rel"<9:13> LBL: h17 ARG0: x13 ]
              [ "_chase_v_1_rel"<14:19> LBL: h18 ARG0: e19 [ e SF: prop TENSE: pres MOOD: indicative PROG: - PERF: - ] ARG1: x3 ARG2: x20 [ x PERS: 3 NUM: pl ] ]
              [ udef_q_rel<20:33> LBL: h21 ARG0: x20 RSTR: h22 BODY: h23 ]
              [ udef_q_rel<20:24> LBL: h24 ARG0: x25 [ x PERS: 3 NUM: pl IND: + ] RSTR: h26 BODY: h27 ]
              [ "_dog_n_1_rel"<20:24> LBL: h28 ARG0: x25 ]
              [ _and_c_rel<25:28> LBL: h29 ARG0: x20 L-INDEX: x25 R-INDEX: x30 [ x PERS: 3 NUM: sg IND: + ] ]
              [ udef_q_rel<29:33> LBL: h31 ARG0: x30 RSTR: h32 BODY: h33 ]
              [ "_dog_n_1_rel"<29:33> LBL: h34 ARG0: x30 ]
              [ _and_c_rel<34:37> LBL: h1 ARG0: e2 L-INDEX: e19 R-INDEX: e35 [ e SF: prop TENSE: pres MOOD: indicative PROG: - PERF: - ] L-HNDL: h18 R-HNDL: h36 ]
              [ "_chase_v_1_rel"<38:43> LBL: h36 ARG0: e35 ARG1: x3 ARG2: x37 [ x PERS: 3 NUM: pl ] ]
              [ udef_q_rel<44:58> LBL: h38 ARG0: x37 RSTR: h39 BODY: h40 ]
              [ udef_q_rel<44:48> LBL: h41 ARG0: x42 [ x PERS: 3 NUM: pl IND: + ] RSTR: h43 BODY: h44 ]
              [ "_dog_n_1_rel"<44:48> LBL: h45 ARG0: x42 ]
              [ _and_c_rel<49:52> LBL: h46 ARG0: x37 L-INDEX: x42 R-INDEX: x47 [ x PERS: 3 NUM: pl IND: + ] ]
              [ udef_q_rel<53:58> LBL: h48 ARG0: x47 RSTR: h49 BODY: h50 ]
              [ "_dog_n_1_rel"<53:58> LBL: h51 ARG0: x47 ] >
      HCONS: < h0 qeq h1 h5 qeq h12 h9 qeq h11 h15 qeq h17 h22 qeq h29 h26 qeq h28 h32 qeq h34 h39 qeq h46 h43 qeq h45 h49 qeq h51 > ]
    ''')
Example #8
0
def m2b():
    """Decode and return the MRS for "The dog chased the dogs."

    The leading quoted sentence in the SimpleMRS string is the surface
    form; TOP/INDEX/RELS/HCONS encode the scoped semantics.
    """
    return simplemrs.decode('''
    [ "The dog chased the dogs."
      TOP: h0
      INDEX: e2 [ e SF: prop TENSE: past MOOD: indicative PROG: - PERF: - ]
      RELS: < [ _the_q<0:3> LBL: h4 ARG0: x3 [ x PERS: 3 NUM: sg IND: + ] RSTR: h5 BODY: h6 ]
              [ _dog_n_1<4:7> LBL: h7 ARG0: x3 ]
              [ _chase_v_1<8:14> LBL: h1 ARG0: e2 ARG1: x3 ARG2: x8 [ x PERS: 3 NUM: pl IND: + ] ]
              [ _the_q<15:18> LBL: h9 ARG0: x8 RSTR: h10 BODY: h11 ]
              [ _dog_n_1<19:23> LBL: h12 ARG0: x8 ] >
      HCONS: < h0 qeq h1 h5 qeq h7 h10 qeq h12 > ]
    ''')
def __cli_parse__(args):
    """Convert the MRSs of a profile to RDF and serialize the graph.

    Reads ``i-id i-input mrs`` rows from the profile at ``args.profile``
    and serializes the accumulated graph to ``args.output``.
    """
    # TODO(review): validate IRI prefix; handle decode/profile/output
    # exceptions (carried over from the original notes).
    ts = itsdb.TestSuite(args.profile)
    prefix = args.prefix.strip("/")
    graph = Graph()

    for row in tsql.select('i-id i-input mrs', ts):
        item_id = row[0]  # renamed from `id` to avoid shadowing the builtin
        text = row[1]
        # parse the MRS string stored in the profile row
        m = simplemrs.decode(row[2])
        p.mrs_to_rdf(m, prefix, item_id, graph, text=text)
    # serialize output
    graph.serialize(destination=args.output, format=args.format)
def test_Result():
    """Exercise Result's lazy mrs/dmrs/eds/derivation accessor methods.

    Each representation is supplied both as its string encoding and as a
    JSON-style dict, and the accessor must decode either form to the
    same object.  Commented-out sections cover invalid-input behavior
    that is not currently asserted.
    """
    # an empty Result yields None from every accessor
    r = Result()
    assert len(r) == 0
    assert r.mrs() is None
    assert r.dmrs() is None
    assert r.eds() is None
    assert r.derivation() is None

    mrs_s = ('[ TOP: h0'
             '  RELS: < ["_rain_v_1_rel" LBL: h1 ARG0: e2 ] >'
             '  HCONS: < h0 qeq h1 > ]')
    mrs_d = {
        'top':
        'h0',
        'relations': [{
            'predicate': '_rain_v_1',
            'label': 'h1',
            'arguments': {
                'ARG0': 'e2'
            }
        }],
        'constraints': [{
            'relation': 'qeq',
            'high': 'h0',
            'low': 'h1'
        }]
    }
    mrs = simplemrs.decode(mrs_s)

    # string-encoded MRS decodes to the reference object
    r = Result(mrs=mrs_s)
    assert len(r) == 1
    assert r['mrs'] == mrs_s
    assert r.mrs() == mrs

    # dict-encoded MRS decodes to the same object
    r = Result(mrs=mrs_d)
    assert len(r) == 1
    assert r['mrs'] == mrs_d
    assert r.mrs() == mrs

    r = Result(mrs=mrs_d)
    assert len(r) == 1
    assert r['mrs'] == mrs_d
    assert r.mrs() == mrs

    # r = Result(mrs='nonsense')
    # assert r['mrs'] == 'nonsense'
    # with pytest.raises(PyDelphinSyntaxError):
    #     r.mrs()

    dmrs_d = {
        'nodes': [{
            'nodeid': 10000,
            'predicate': '_rain_v_1',
            'sortinfo': {
                'cvarsort': 'e'
            }
        }],
        'links': [{
            'from': 0,
            'to': 10000,
            'rargname': None,
            'post': 'H'
        }]
    }
    dmrs = dmrsjson.from_dict(dmrs_d)

    r = Result(dmrs=dmrs_d)
    assert len(r) == 1
    assert r['dmrs'] == dmrs_d
    assert r.dmrs() == dmrs

    # r = Result(dmrs='nonsense')
    # assert len(r) == 1
    # assert r['dmrs'] == 'nonsense'
    # with pytest.raises(PyDelphinSyntaxError):
    #     r.dmrs()

    eds_d = {
        'top': 'e2',
        'nodes': {
            'e2': {
                'label': '_rain_v_1',
                'lnk': {
                    'from': 3,
                    'to': 9
                },
                'edges': {}
            }
        }
    }
    eds_s = '{e2: e2:_rain_v_1<3:9>[]}'
    eds = edsjson.from_dict(eds_d)

    r = Result(eds=eds_s)
    assert len(r) == 1
    assert r['eds'] == eds_s
    assert r.eds() == eds

    r = Result(eds=eds_d)
    assert len(r) == 1
    assert r['eds'] == eds_d
    assert r.eds() == eds

    # r = Result(eds='nonsense')
    # assert len(r) == 1
    # assert r['eds'] == 'nonsense'
    # with pytest.raises(PyDelphinSyntaxError):
    #     r.eds()

    # several changes were made to the below for compatibility:
    #  - removed head annotation (on W_PERIOD_PLR)
    #  - removed type info
    #  - removed from/to info
    #  - added start/end
    #  - escaped quotes
    #  - capitalized entity names

    deriv_s = '(189 SB-HD_MC_C 0.228699 0 2 (37 it 0.401245 0 1 ("it" 34 "token [ +FORM \\"it\\" +FROM #1=\\"0\\" +TO \\"2\\" ]")) (188 W_PERIOD_PLR -0.113641 1 2 (187 V_PST_OLR 0 1 2 (56 rain_v1 0 1 2 ("rained." 32 "token [ +FORM \\"rained.\\" +FROM #1=\\"3\\" +TO \\"10\\" ]")))))'
    deriv_d = {
        "id":
        189,
        "entity":
        "SB-HD_MC_C",
        "label":
        "S",
        "score":
        0.228699,
        "start":
        0,
        "end":
        2,
        "daughters": [  # , "type": "subjh_mc_rule"
            {
                "id":
                37,
                "entity":
                "it",
                "score":
                0.401245,
                "start":
                0,
                "end":
                1,
                "form":
                "it",
                "tokens": [  # , "type": "n_-_pr-it-x_le" , "from": 0, "to": 2
                    {
                        "id":
                        34,
                        "tfs":
                        "token [ +FORM \\\"it\\\" +FROM #1=\\\"0\\\" +TO \\\"2\\\" ]"
                    }
                ]
            },  # , "from": 0, "to": 2
            {
                "id":
                188,
                "entity":
                "W_PERIOD_PLR",
                "score":
                -0.113641,
                "start":
                1,
                "end":
                2,
                "daughters": [  # , "type": "punctuation_period_rule"
                    {
                        "id":
                        187,
                        "entity":
                        "V_PST_OLR",
                        "score":
                        0,
                        "start":
                        1,
                        "end":
                        2,
                        "daughters": [  # , "type": "v_pst_inflrule"
                            {
                                "id":
                                56,
                                "entity":
                                "rain_v1",
                                "score":
                                0,
                                "start":
                                1,
                                "end":
                                2,
                                "form":
                                "rained.",
                                "tokens":
                                [  # , "type": "v_-_it_le", "from": 3, "to": 10
                                    {
                                        "id":
                                        32,
                                        "tfs":
                                        "token [ +FORM \\\"rained.\\\" +FROM #1=\\\"3\\\" +TO \\\"10\\\" ]"
                                    }
                                ]
                            }
                        ]
                    }
                ]
            }
        ]  # , "from": 3, "to": 10
    }
    deriv = derivation.from_dict(deriv_d)

    # string-encoded derivation decodes to the reference object
    r = Result(derivation=deriv_s)
    assert len(r) == 1
    assert r['derivation'] == deriv_s
    assert r.derivation() == deriv

    # dict-encoded derivation decodes to the same object
    r = Result(derivation=deriv_d)
    assert len(r) == 1
    assert r['derivation'] == deriv_d
    assert r.derivation() == deriv
Example #11
0
def _transform_mrs(s):
    """Decode a SimpleMRS string and re-encode it as a JSON-style dict."""
    decoded = simplemrs.decode(s)
    return mrsjson.to_dict(decoded)
Example #12
0
from delphin import ace
from delphin import itsdb
from delphin import tsql
from delphin import dmrs, eds
from delphin.codecs import eds as edsnative
from delphin.codecs import simplemrs
from delphin.codecs import dmrx

# import parser as p
from delphin.rdf import parser as p
from rdflib import Graph
import argparse

# Build the CLI, load the requested profile, and convert every MRS row
# into the shared RDF graph before serializing it as Turtle.
parser = argparse.ArgumentParser()
parser.add_argument("profile", help="profile path")
cli_args = parser.parse_args()

ts = itsdb.TestSuite(cli_args.profile)
graph = Graph()
for row in tsql.select('i-id mrs', ts):
    decoded = simplemrs.decode(row[1])
    p.mrs_to_rdf(decoded, "http://example.com/example", row[0], graph)

graph.serialize(destination="test.ttl", format="turtle")
Example #13
0
def __cli_parse__(args):
    """Convert the analyses of a [incr tsdb()] profile to RDF.

    Validates the profile directory and URI prefix, decodes each stored
    MRS, converts it to the requested semantic representation
    (``args.semrep``), and serializes the accumulated store to
    ``args.output`` in ``args.format``.  Errors are logged, not raised.
    """
    # remove the not well formed sentences? add option?
    # print MRS or parse to DMRS format?

    path = args.profile
    prefix = args.prefix.strip("/")
    semrep = args.semrep.lower()
    parser = None
    # Map verbosity to stdlib logging levels (20 == INFO, 10 == DEBUG).
    if args.verbosity == 1:
        logger.setLevel(20)
    elif args.verbosity >= 2:
        logger.setLevel(10)

    try:
        # validate path
        if not isdir(path):
            raise NotADirectoryError(f"Path is not a directory: {path}")
        # validate profile
        if not is_database_directory(path):
            raise TSDBError(f'Invalid test suite directory: {path}')
        # validate URI prefix (ValueError instead of a bare Exception;
        # it is still caught by the handler below)
        if not _is_valid_uri(prefix):
            raise ValueError(f'Invalid URI: {prefix}')
        # validate format and get converter
        to_rdf, from_mrs = _get_converters(semrep)

        # open the test suite and start conversion
        ts = itsdb.TestSuite(path)
        logger.info(
            f"Converting {len(ts['result'])} analysis of {len(ts['item'])} sentences from {args.profile}"
        )

        # create the store and the default graph
        store = plugin.get("IOMemory", Store)()
        defaultGraph = Graph(store, identifier=BNode())
        PROFILE = URIRef(f"{prefix}")  # review later
        defaultGraph.add((PROFILE, RDF.type, DELPH.Profile))
        semrepURI, prof_semrep_relation = _get_RDF_semrep(semrep, store)
        store.bind("erg", ERG)
        store.bind("delph", DELPH)
        store.bind("pos", POS)
        # store.bind("upref", prefix) # may be useful

        # the tsql query takes some time to be processed:
        logger.info("Loading the profile")
        profile_data = tsql.select('parse-id result-id i-input mrs', ts)
        logger.info("Converting the profile")
        # iterate over the results:
        for (parse_id, result_id, text, mrs_string) in profile_data:
            logger.debug(
                f"Converting the result {result_id} of sentence {parse_id}")
            m = simplemrs.decode(mrs_string)

            # warn (but do not skip) when "m" is not well formed
            if not is_well_formed(m):
                logger.warning(
                    f"Result {result_id} of sentence {parse_id} is not well formed"
                )
                # continue

            # convert the MRS object to the target representation
            obj = from_mrs(m)

            # URIs for the relevant resources
            ITEM = URIRef(
                f"{prefix}/{parse_id}"
            )  # The item part may be redundant, maybe iterate before the itens
            RESULT = URIRef(f"{prefix}/{parse_id}/{result_id}")
            SEMREPI = URIRef(f"{prefix}/{parse_id}/{result_id}/{semrep}")

            # adding types:
            defaultGraph.add((ITEM, RDF.type, DELPH.Item))
            defaultGraph.add((RESULT, RDF.type, DELPH.Result))
            defaultGraph.add((SEMREPI, RDF.type, semrepURI))

            # associating text to item:
            defaultGraph.add((ITEM, DELPH.hasText, Literal(text)))

            # linking those nodes:
            defaultGraph.add((PROFILE, DELPH.hasItem, ITEM))
            defaultGraph.add((ITEM, DELPH.hasResult, RESULT))
            defaultGraph.add((RESULT, prof_semrep_relation, SEMREPI))

            to_rdf(obj, SEMREPI, store, defaultGraph)

        # serialize results
        logger.info(f"Serializing results to {args.output}")
        ConjunctiveGraph(store).serialize(destination=args.output,
                                          format=args.format)
        logger.info("DONE")

    # CLI boundary: log and swallow so the command exits cleanly.
    # TODO(review): narrow to PyDelphinSyntaxError/ImportError/TSDBError
    # as the original commented-out handlers intended.
    except Exception as e:
        logger.error(e)
Example #14
0
def read_profile(input_dir, output_dir, profile_name, mrp_eds, lexicon, args):
    """Read a [incr tsdb()] profile and extract syntax/semantics strings.

    For each item in the profile: builds the token lattice and derivation
    tree, obtains a DMRS either from an externally supplied EDS
    (``mrp_eds``, keyed by item id) or by decoding the profile's MRS, and
    wraps everything in a SemanticRepresentation.  Depending on
    ``args.extract_syntax`` / ``args.extract_semantics``, writes
    derivation-tree strings, supertag strings, and DMRS-JSON strings to
    files named ``output_dir`` + ".tree"/".tags"/".dmrs".
    Items whose EDS/MRS fails to parse are skipped.
    """
    ts = d_itsdb.TestSuite(input_dir)

    derivation_strs = []
    supertag_strs = []
    dmrs_json_strs = []

    for iid, sentence, parse_tokens, result_derivation, result_mrs in d_tsql.select('i-id i-input p-tokens derivation mrs', ts):
        tokens_rep = d_tokens.YYTokenLattice.from_string(parse_tokens)
        token_dict = {tok.id : tok for tok in tokens_rep.tokens}
        derivation_rep = d_derivation.from_string(result_derivation)
        # profiles store a single root wrapper; descend to the real tree
        assert len(derivation_rep.daughters) == 1
        derivation_rep = derivation_rep.daughters[0]

        if mrp_eds:
            # semantics comes from the supplied EDS strings, keyed by item id
            if iid in mrp_eds:
                try:
                    eds_rep = dc_eds.decode(mrp_eds[iid])
                    dmrs_rep = eds_to_dmrs(eds_rep)
                except d_eds._exceptions.EDSSyntaxError:
                    #print("Skipping: EDS syntax error", mrp_eds[iid])
                    continue
            else:
                    #print("Unmatched:", iid)
                    continue
        else:
            # otherwise decode the MRS stored in the profile itself
            try:
                mrs_rep = dc_simplemrs.decode(result_mrs)
            except d_mrs._exceptions.MRSSyntaxError:
                #print("Skipping: MRS syntax error", result_mrs)
                continue

            dmrs_rep = d_dmrs.from_mrs(mrs_rep)

        mr = semantics.SemanticRepresentation(profile_name + ":" + iid, sentence, token_dict, derivation_rep, lexicon) # read derivation tree

        if args.convert_semantics:
            mr.map_dmrs(dmrs_rep)
            mr.process_semantic_tree(mr.root_node_id, dmrs_rep)

        # NOTE(review): runs unconditionally — presumably debug output; confirm
        mr.print_mrs()

        if args.extract_syntax:
            derivation_strs.append(mr.derivation_tree_str(mr.root_node_id, newline=False).lstrip())
            supertag_strs.append(mr.supertag_str(mr.root_node_id).strip())

        if args.extract_semantics:
            dmrs_json_strs.append(mr.dmrs_json_str(dmrs_rep))

    if args.extract_syntax:
        with open(output_dir + ".tree", 'w') as dt_out:
            for s in derivation_strs:
                dt_out.write(s + "\n")
        with open(output_dir + ".tags", 'w') as st_out:
            for s in supertag_strs:
                st_out.write(s + "\n")

    if args.extract_semantics:
        with open(output_dir + ".dmrs", 'w') as d_out:
            for s in dmrs_json_strs:
                if s != "":
                    d_out.write(s + "\n")