def _decode(lines):
    """Yield MRS objects decoded from an iterator of ACE output lines.

    Recognizes three kinds of lines: ``SENT: ...`` (remembered as the
    surface string attached to subsequent MRSs), lines starting with
    ``[`` (SimpleMRS in regular ACE output), and lines starting with
    ``(`` (S-expressions from ACE's ``--tsdb-stdout`` mode).

    Args:
        lines: iterable of output lines from an ACE process.
    Yields:
        Decoded MRS objects.
    """
    surface = None
    newline = False
    for line in lines:
        if line.startswith('SENT: '):
            # remember the surface form for the MRSs that follow
            surface = line[6:].rstrip()
        # regular ACE output
        elif line.startswith('['):
            # anything after ' ; ' (e.g. scores) is discarded
            m = line.partition(' ; ')[0].strip()
            m = simplemrs.decode(m)
            m.surface = surface
            yield m
        # with --tsdb-stdout
        elif line.startswith('('):
            # a line may hold several s-expressions; consume them all
            while line:
                data, remainder = SExpr.parse(line)
                line = remainder.lstrip()
                if len(data) == 2 and data[0] == ':results':
                    for result in data[1]:
                        for key, val in result:
                            if key == ':mrs':
                                yield simplemrs.decode(val)
        elif line == '\n':
            # two consecutive blank lines end the current item, so the
            # remembered surface string is discarded
            if newline:
                surface = None
                newline = False
            else:
                newline = True
        else:
            # unrecognized lines are ignored
            pass
def compare(testsuite, gold, select='i-id i-input mrs'):
    """
    Compare two [incr tsdb()] profiles.

    Args:
        testsuite (str, TestSuite): path to the test [incr tsdb()]
            testsuite or a :class:`TestSuite` object
        gold (str, TestSuite): path to the gold [incr tsdb()]
            testsuite or a :class:`TestSuite` object
        select: TSQL query to select (id, input, mrs) triples
            (default: `i-id i-input mrs`)
    Yields:
        dict: Comparison results as::

            {"id": "item identifier",
             "input": "input sentence",
             "test": number_of_unique_results_in_test,
             "shared": number_of_shared_results,
             "gold": number_of_unique_results_in_gold}

    Raises:
        CommandError: if *select* does not project exactly three fields.
    """
    from delphin import mrs
    from delphin.codecs import simplemrs

    # accept either paths or already-opened TestSuite objects
    if not isinstance(testsuite, itsdb.TestSuite):
        testsuite = itsdb.TestSuite(_validate_tsdb(testsuite))
    if not isinstance(gold, itsdb.TestSuite):
        gold = itsdb.TestSuite(_validate_tsdb(gold))

    queryobj = tsql.inspect_query('select ' + select)
    if len(queryobj['projection']) != 3:
        raise CommandError('select does not return 3 fields: ' + select)

    # map item ids to their input sentences (first two projected fields)
    input_select = '{} {}'.format(queryobj['projection'][0],
                                  queryobj['projection'][1])
    i_inputs = dict(tsql.select(input_select, testsuite))

    # pair up test and gold rows on the first field (the item id)
    matched_rows = itsdb.match_rows(
        tsql.select(select, testsuite),
        tsql.select(select, gold),
        0)

    for (key, testrows, goldrows) in matched_rows:
        # compare the MRS bags result-by-result for this item
        (test_unique, shared, gold_unique) = mrs.compare_bags(
            [simplemrs.decode(row[2]) for row in testrows],
            [simplemrs.decode(row[2]) for row in goldrows])
        yield {'id': key,
               'input': i_inputs.get(key),
               'test': test_unique,
               'shared': shared,
               'gold': gold_unique}
def __cli_parse__(args):
    """CLI entry point: convert each MRS of a [incr tsdb()] profile to
    DMRS and serialize the accumulated RDF graph to ``args.output``.

    Expects ``args`` to provide: profile, prefix, verbosity, output, format.
    """
    # validate IRI prefix
    # handle exceptions
    # handle invalid profile
    # handle output exceptions
    ts = itsdb.TestSuite(args.profile)
    prefix = args.prefix.strip("/")
    graph = Graph()
    for row in tsql.select('i-id i-input mrs', ts):
        # `item_id` avoids shadowing the `id` builtin
        item_id = row[0]
        text = row[1]
        if args.verbosity > 0:
            print("Parsing sentence {}".format(item_id))
        # parse mrs from profile
        m = simplemrs.decode(row[2])
        # transform to dmrs (the old comment said "eds", but the code
        # builds a DMRS):
        d = dmrs.from_mrs(m)
        # accumulate triples into the same graph across iterations
        graph = p.dmrs_to_rdf(
            d=d,
            prefix=prefix,
            identifier=item_id,
            graph=graph,
            text=text)
    # serializes output
    graph.serialize(destination=args.output, format=args.format)
def mrs(self):
    """
    Interpret and return an MRS object.

    If :mod:`delphin.codecs.simplemrs` is available and the value of
    the `mrs` key in the result is a valid SimpleMRS string, or if
    :mod:`delphin.codecs.mrsjson` is available and the value is a
    dictionary, return the interpreted MRS object. If there is no
    `mrs` key in the result, return `None`.

    Raises:
        InterfaceError: when the value is an unsupported type or the
            corresponding module is unavailable
    """
    value = self.get('mrs')
    if value is None:
        return None
    try:
        if isinstance(value, dict):
            from delphin.codecs import mrsjson
            return mrsjson.from_dict(value)
        if isinstance(value, str):
            from delphin.codecs import simplemrs
            return simplemrs.decode(value)
        # any other type is unsupported; report its class name
        raise TypeError(value.__class__.__name__)
    except (ImportError, TypeError) as exc:
        raise InterfaceError('can not get MRS object') from exc
def __cli_parse__(args):
    """CLI entry point: read an MRS (from stdin or a testsuite directory)
    and print its RDF serialization.

    Expects ``args`` to provide: fmt, path, prefix, identifier, output.

    Raises:
        ValueError: if no input is available or *fmt* is unknown.
        NotImplementedError: for recognized but unsupported formats.
    """
    print(args)
    fmt = args.fmt
    path = args.path
    pref = args.prefix
    idtf = args.identifier
    # Prefer piped stdin; otherwise read result.mrs from a testsuite dir.
    if not sys.stdin.isatty():
        input_text = sys.stdin.read()
    # if a testsuite dir
    elif os.path.isdir(path):
        path = path.strip("/") + "/result.mrs"
        # use a context manager so the handle is not leaked
        with open(path, "r") as f:
            input_text = f.read()
    else:
        # previously `input_text` was left unbound here, causing a
        # confusing NameError later; fail early with a clear message
        raise ValueError("No input: pipe text on stdin or give a directory path")
    if fmt == "simplemrs":
        m = simplemrs.decode(input_text)
        g = p.mrs_to_rdf(m, pref, idtf)
    elif fmt in ("eds", "simpledmrs"):
        # dmrsjson, dmrspenman, dmrx, edsjson, edspenman
        # indexedmrs, mrsjson, mrsprolog, mrx
        # previously a bare `pass` led to NameError on `g` below
        raise NotImplementedError(f"Format not yet supported: {fmt}")
    else:
        raise ValueError("Not a valid format given")
    print(g.serialize(args.output, None))
def m1b(): return simplemrs.decode(''' [ LTOP: h0 INDEX: e2 [ e SF: prop TENSE: pres MOOD: indicative PROG: - PERF: - ] RELS: < [ "_rain_v_1_rel"<0:6> LBL: h1 ARG0: e2 ] > HCONS: < h0 qeq h1 > ] ''')
def pathological2(): return simplemrs.decode(''' [ LTOP: h0 INDEX: e2 [ e SF: prop TENSE: pres MOOD: indicative PROG: - PERF: - ] RELS: < [ udef_q_rel<0:13> LBL: h4 ARG0: x3 [ x PERS: 3 NUM: pl ] RSTR: h5 BODY: h6 ] [ udef_q_rel<0:4> LBL: h7 ARG0: x8 [ x PERS: 3 NUM: pl IND: + ] RSTR: h9 BODY: h10 ] [ "_dog_n_1_rel"<0:4> LBL: h11 ARG0: x8 ] [ _and_c_rel<5:8> LBL: h12 ARG0: x3 L-INDEX: x8 R-INDEX: x13 [ x PERS: 3 NUM: pl IND: + ] ] [ udef_q_rel<9:13> LBL: h14 ARG0: x13 RSTR: h15 BODY: h16 ] [ "_dog_n_1_rel"<9:13> LBL: h17 ARG0: x13 ] [ "_chase_v_1_rel"<14:19> LBL: h18 ARG0: e19 [ e SF: prop TENSE: pres MOOD: indicative PROG: - PERF: - ] ARG1: x3 ARG2: x20 [ x PERS: 3 NUM: pl ] ] [ udef_q_rel<20:33> LBL: h21 ARG0: x20 RSTR: h22 BODY: h23 ] [ udef_q_rel<20:24> LBL: h24 ARG0: x25 [ x PERS: 3 NUM: pl IND: + ] RSTR: h26 BODY: h27 ] [ "_dog_n_1_rel"<20:24> LBL: h28 ARG0: x25 ] [ _and_c_rel<25:28> LBL: h29 ARG0: x20 L-INDEX: x25 R-INDEX: x30 [ x PERS: 3 NUM: sg IND: + ] ] [ udef_q_rel<29:33> LBL: h31 ARG0: x30 RSTR: h32 BODY: h33 ] [ "_dog_n_1_rel"<29:33> LBL: h34 ARG0: x30 ] [ _and_c_rel<34:37> LBL: h1 ARG0: e2 L-INDEX: e19 R-INDEX: e35 [ e SF: prop TENSE: pres MOOD: indicative PROG: - PERF: - ] L-HNDL: h18 R-HNDL: h36 ] [ "_chase_v_1_rel"<38:43> LBL: h36 ARG0: e35 ARG1: x3 ARG2: x37 [ x PERS: 3 NUM: pl ] ] [ udef_q_rel<44:58> LBL: h38 ARG0: x37 RSTR: h39 BODY: h40 ] [ udef_q_rel<44:48> LBL: h41 ARG0: x42 [ x PERS: 3 NUM: pl IND: + ] RSTR: h43 BODY: h44 ] [ "_dog_n_1_rel"<44:48> LBL: h45 ARG0: x42 ] [ _and_c_rel<49:52> LBL: h46 ARG0: x37 L-INDEX: x42 R-INDEX: x47 [ x PERS: 3 NUM: pl IND: + ] ] [ udef_q_rel<53:58> LBL: h48 ARG0: x47 RSTR: h49 BODY: h50 ] [ "_dog_n_1_rel"<53:58> LBL: h51 ARG0: x47 ] > HCONS: < h0 qeq h1 h5 qeq h12 h9 qeq h11 h15 qeq h17 h22 qeq h29 h26 qeq h28 h32 qeq h34 h39 qeq h46 h43 qeq h45 h49 qeq h51 > ] ''')
def m2b(): return simplemrs.decode(''' [ "The dog chased the dogs." TOP: h0 INDEX: e2 [ e SF: prop TENSE: past MOOD: indicative PROG: - PERF: - ] RELS: < [ _the_q<0:3> LBL: h4 ARG0: x3 [ x PERS: 3 NUM: sg IND: + ] RSTR: h5 BODY: h6 ] [ _dog_n_1<4:7> LBL: h7 ARG0: x3 ] [ _chase_v_1<8:14> LBL: h1 ARG0: e2 ARG1: x3 ARG2: x8 [ x PERS: 3 NUM: pl IND: + ] ] [ _the_q<15:18> LBL: h9 ARG0: x8 RSTR: h10 BODY: h11 ] [ _dog_n_1<19:23> LBL: h12 ARG0: x8 ] > HCONS: < h0 qeq h1 h5 qeq h7 h10 qeq h12 > ] ''')
def __cli_parse__(args):
    """CLI entry point: convert each MRS of a [incr tsdb()] profile to RDF
    and serialize the accumulated graph to ``args.output``.

    Expects ``args`` to provide: profile, prefix, output, format.
    """
    # validate IRI prefix
    # handle exceptions
    # handle invalid profile
    # handle output exceptions
    ts = itsdb.TestSuite(args.profile)
    prefix = args.prefix.strip("/")
    graph = Graph()
    for row in tsql.select('i-id i-input mrs', ts):
        # `item_id` avoids shadowing the `id` builtin
        item_id = row[0]
        text = row[1]
        # parse mrs from profile
        m = simplemrs.decode(row[2])
        # the converter mutates `graph` in place
        p.mrs_to_rdf(m, prefix, item_id, graph, text=text)
    # serializes output
    graph.serialize(destination=args.output, format=args.format)
def test_Result():
    """Unit tests for Result: empty access returns None, and the mrs(),
    dmrs(), eds(), and derivation() accessors interpret both string and
    dict-valued entries into the corresponding objects."""
    # an empty Result has no keys and all accessors return None
    r = Result()
    assert len(r) == 0
    assert r.mrs() is None
    assert r.dmrs() is None
    assert r.eds() is None
    assert r.derivation() is None

    # string and dict forms of the same single-EP "_rain_v_1" MRS
    mrs_s = ('[ TOP: h0'
             ' RELS: < ["_rain_v_1_rel" LBL: h1 ARG0: e2 ] >'
             ' HCONS: < h0 qeq h1 > ]')
    mrs_d = {
        'top': 'h0',
        'relations': [{
            'predicate': '_rain_v_1',
            'label': 'h1',
            'arguments': {
                'ARG0': 'e2'
            }
        }],
        'constraints': [{
            'relation': 'qeq',
            'high': 'h0',
            'low': 'h1'
        }]
    }
    mrs = simplemrs.decode(mrs_s)
    # raw value is kept as given; mrs() interprets it
    r = Result(mrs=mrs_s)
    assert len(r) == 1
    assert r['mrs'] == mrs_s
    assert r.mrs() == mrs
    r = Result(mrs=mrs_d)
    assert len(r) == 1
    assert r['mrs'] == mrs_d
    assert r.mrs() == mrs
    r = Result(mrs=mrs_d)
    assert len(r) == 1
    assert r['mrs'] == mrs_d
    assert r.mrs() == mrs
    # r = Result(mrs='nonsense')
    # assert r['mrs'] == 'nonsense'
    # with pytest.raises(PyDelphinSyntaxError):
    #     r.mrs()

    # dict form of the equivalent DMRS
    dmrs_d = {
        'nodes': [{
            'nodeid': 10000,
            'predicate': '_rain_v_1',
            'sortinfo': {
                'cvarsort': 'e'
            }
        }],
        'links': [{
            'from': 0,
            'to': 10000,
            'rargname': None,
            'post': 'H'
        }]
    }
    dmrs = dmrsjson.from_dict(dmrs_d)
    r = Result(dmrs=dmrs_d)
    assert len(r) == 1
    assert r['dmrs'] == dmrs_d
    assert r.dmrs() == dmrs
    # r = Result(dmrs='nonsense')
    # assert len(r) == 1
    # assert r['dmrs'] == 'nonsense'
    # with pytest.raises(PyDelphinSyntaxError):
    #     r.dmrs()

    # string and dict forms of the equivalent EDS
    eds_d = {
        'top': 'e2',
        'nodes': {
            'e2': {
                'label': '_rain_v_1',
                'lnk': {
                    'from': 3,
                    'to': 9
                },
                'edges': {}
            }
        }
    }
    eds_s = '{e2: e2:_rain_v_1<3:9>[]}'
    eds = edsjson.from_dict(eds_d)
    r = Result(eds=eds_s)
    assert len(r) == 1
    assert r['eds'] == eds_s
    assert r.eds() == eds
    r = Result(eds=eds_d)
    assert len(r) == 1
    assert r['eds'] == eds_d
    assert r.eds() == eds
    # r = Result(eds='nonsense')
    # assert len(r) == 1
    # assert r['eds'] == 'nonsense'
    # with pytest.raises(PyDelphinSyntaxError):
    #     r.eds()

    # several changes were made to the below for compatibility:
    # - removed head annotation (on W_PERIOD_PLR)
    # - removed type info
    # - removed from/to info
    # - added start/end
    # - escaped quotes
    # - capitalized entity names
    deriv_s = '(189 SB-HD_MC_C 0.228699 0 2 (37 it 0.401245 0 1 ("it" 34 "token [ +FORM \\"it\\" +FROM #1=\\"0\\" +TO \\"2\\" ]")) (188 W_PERIOD_PLR -0.113641 1 2 (187 V_PST_OLR 0 1 2 (56 rain_v1 0 1 2 ("rained." 32 "token [ +FORM \\"rained.\\" +FROM #1=\\"3\\" +TO \\"10\\" ]")))))'
    deriv_d = {
        "id": 189,
        "entity": "SB-HD_MC_C",
        "score": 0.228699,
        "start": 0,
        "end": 2,
        "daughters": [  # , "type": "subjh_mc_rule"
            {
                "id": 37,
                "entity": "it",
                "score": 0.401245,
                "start": 0,
                "end": 1,
                "form": "it",
                "tokens": [  # , "type": "n_-_pr-it-x_le" , "from": 0, "to": 2
                    {
                        "id": 34,
                        "tfs": "token [ +FORM \\\"it\\\" +FROM #1=\\\"0\\\" +TO \\\"2\\\" ]"
                    }
                ]
            },  # , "from": 0, "to": 2
            {
                "id": 188,
                "entity": "W_PERIOD_PLR",
                "score": -0.113641,
                "start": 1,
                "end": 2,
                "daughters": [  # , "type": "punctuation_period_rule"
                    {
                        "id": 187,
                        "entity": "V_PST_OLR",
                        "score": 0,
                        "start": 1,
                        "end": 2,
                        "daughters": [  # , "type": "v_pst_inflrule"
                            {
                                "id": 56,
                                "entity": "rain_v1",
                                "score": 0,
                                "start": 1,
                                "end": 2,
                                "form": "rained.",
                                "tokens": [  # , "type": "v_-_it_le", "from": 3, "to": 10
                                    {
                                        "id": 32,
                                        "tfs": "token [ +FORM \\\"rained.\\\" +FROM #1=\\\"3\\\" +TO \\\"10\\\" ]"
                                    }
                                ]
                            }
                        ]
                    }
                ]
            }
        ]  # , "from": 3, "to": 10
    }
    deriv = derivation.from_dict(deriv_d)
    r = Result(derivation=deriv_s)
    assert len(r) == 1
    assert r['derivation'] == deriv_s
    assert r.derivation() == deriv
    r = Result(derivation=deriv_d)
    assert len(r) == 1
    assert r['derivation'] == deriv_d
    assert r.derivation() == deriv
def _transform_mrs(s):
    """Decode a SimpleMRS string and return its JSON-dict representation."""
    decoded = simplemrs.decode(s)
    return mrsjson.to_dict(decoded)
from delphin import ace
from delphin import itsdb
from delphin import tsql
from delphin import dmrs, eds
from delphin.codecs import eds as edsnative
from delphin.codecs import simplemrs
from delphin.codecs import dmrx
# import parser as p
from delphin.rdf import parser as p
from rdflib import Graph

import argparse


def main():
    """Convert every MRS of the given profile to RDF and write test.ttl.

    The conversion logic previously ran at module import time; wrapping
    it in main() behind a __main__ guard keeps the module importable
    without side effects.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("profile", help="profile path")
    ts = itsdb.TestSuite(parser.parse_args().profile)
    graph = Graph()
    for row in tsql.select('i-id mrs', ts):
        m = simplemrs.decode(row[1])
        # triples accumulate into `graph` across items
        p.mrs_to_rdf(m, "http://example.com/example", row[0], graph)
    graph.serialize(destination="test.ttl", format="turtle")


if __name__ == "__main__":
    main()
def __cli_parse__(args):
    """CLI entry point: convert a [incr tsdb()] profile to RDF in the
    semantic representation named by ``args.semrep``, serializing the
    resulting store to ``args.output``.

    Expects ``args`` to provide: profile, prefix, semrep, verbosity,
    output, format. All failures are logged and swallowed by the broad
    ``except`` at the bottom (see the commented-out narrower handlers).
    """
    # remove the not well formed sentences? add option?
    # print MRS or parse to DMRS format?
    path = args.profile
    prefix = args.prefix.strip("/")
    semrep = args.semrep.lower()
    parser = None
    # Setting verbosity; need to figure a better solution.
    # (20 = logging.INFO, 10 = logging.DEBUG)
    if args.verbosity == 1:
        logger.setLevel(20)
    elif args.verbosity >= 2:
        logger.setLevel(10)
    try:
        # validates path
        if not isdir(path):
            raise NotADirectoryError(f"Path is not a directory: {path}")
        # validates profile
        if not is_database_directory(path):
            raise TSDBError(f'Invalid test suite directory: {path}')
        # validates URI prefix
        if not _is_valid_uri(prefix):
            raise Exception(f'Invalid URI: {prefix}')
        # validate format and get converter
        to_rdf, from_mrs = _get_converters(semrep)
        # open Test Suite and start conversion
        ts = itsdb.TestSuite(path)
        # logger.info(f"Converting {len(ts['result'])} analysis of {len(ts['item'])} sentences from {args.profile}")
        logger.info(
            f"Converting {len(ts['result'])} analysis of {len(ts['item'])} sentences from {args.profile}"
        )
        # Creating the store and the default graph
        store = plugin.get("IOMemory", Store)()
        defaultGraph = Graph(store, identifier=BNode())
        PROFILE = URIRef(f"{prefix}")  # review later
        defaultGraph.add((PROFILE, RDF.type, DELPH.Profile))
        semrepURI, prof_semrep_relation = _get_RDF_semrep(semrep, store)
        store.bind("erg", ERG)
        store.bind("delph", DELPH)
        store.bind("pos", POS)
        # store.bind("upref", prefix) # may be useful
        # The tsql takes some time to be processed:
        # logger.info(f"Loading the profile")
        logger.info(f"Loading the profile")
        profile_data = tsql.select('parse-id result-id i-input mrs', ts)
        logger.info(f"Converting the profile")
        # Iterating over the results:
        for (parse_id, result_id, text, mrs_string) in profile_data:
            logger.debug(
                f"Converting the result {result_id} of sentence {parse_id}")
            m = simplemrs.decode(mrs_string)
            # making sure of the well formedness of "m"
            if not is_well_formed(m):
                # NOTE(review): only warns — the ill-formed result is still
                # converted (the `continue` below is deliberately disabled)
                logger.warning(
                    f"Result {result_id} of sentence {parse_id} is not well formed"
                )
                # continue
            # converting the MRS object to the representation intended to be converted
            obj = from_mrs(m)
            # logger.debug(f"Result {result_id} of item {parse_id}: \n\t{text}\n\t{obj}\n\t{mrs_string}")
            # Creating URIs for relevant resources.
            ITEM = URIRef(
                f"{prefix}/{parse_id}"
            )  # The item part may be redundant, maybe iterate before the itens
            RESULT = URIRef(f"{prefix}/{parse_id}/{result_id}")
            SEMREPI = URIRef(f"{prefix}/{parse_id}/{result_id}/{semrep}")
            # adding types:
            defaultGraph.add((ITEM, RDF.type, DELPH.Item))
            defaultGraph.add((RESULT, RDF.type, DELPH.Result))
            defaultGraph.add((SEMREPI, RDF.type, semrepURI))
            # Associating text to item:
            defaultGraph.add((ITEM, DELPH.hasText, Literal(text)))
            # Linking those nodes:
            defaultGraph.add((PROFILE, DELPH.hasItem, ITEM))
            defaultGraph.add((ITEM, DELPH.hasResult, RESULT))
            defaultGraph.add((RESULT, prof_semrep_relation, SEMREPI))
            to_rdf(obj, SEMREPI, store, defaultGraph)

        # serializes results
        logger.info(f"Serializing results to {args.output}")
        ConjunctiveGraph(store).serialize(destination=args.output,
                                          format=args.format)
        logger.info(f"DONE")
    # except PyDelphinSyntaxError as e:
    #     logger.exception(e)
    # except ImportError as e:
    #     logger.exception(e)
    # except TSDBError as e:
    #     logger.exception(e)
    except Exception as e:
        logger.error(e)
def read_profile(input_dir, output_dir, profile_name, mrp_eds, lexicon, args):
    """Read a [incr tsdb()] profile and write derivation-tree, supertag,
    and DMRS-JSON extractions.

    Args:
        input_dir: path to the [incr tsdb()] profile to read.
        output_dir: output path *prefix*; ".tree", ".tags", and ".dmrs"
            files are written from it (despite the name, it is used as a
            filename prefix, not a directory).
        profile_name: prefixed to each item id for the semantic representation.
        mrp_eds: mapping of item id -> EDS string (or falsy); when given,
            DMRS is derived from these EDSs instead of the profile's MRSs,
            and items missing from the mapping are skipped.
        lexicon: lexicon passed to SemanticRepresentation — TODO confirm type.
        args: namespace with boolean flags `convert_semantics`,
            `extract_syntax`, and `extract_semantics`.
    """
    ts = d_itsdb.TestSuite(input_dir)

    derivation_strs = []
    supertag_strs = []
    dmrs_json_strs = []

    for iid, sentence, parse_tokens, result_derivation, result_mrs in d_tsql.select('i-id i-input p-tokens derivation mrs', ts):
        tokens_rep = d_tokens.YYTokenLattice.from_string(parse_tokens)
        token_dict = {tok.id: tok for tok in tokens_rep.tokens}
        derivation_rep = d_derivation.from_string(result_derivation)
        # the derivation root is expected to wrap a single daughter
        assert len(derivation_rep.daughters) == 1
        derivation_rep = derivation_rep.daughters[0]

        if mrp_eds:
            # DMRS comes from the externally supplied EDS strings
            if iid in mrp_eds:
                try:
                    eds_rep = dc_eds.decode(mrp_eds[iid])
                    dmrs_rep = eds_to_dmrs(eds_rep)
                except d_eds._exceptions.EDSSyntaxError:
                    #print("Skipping: EDS syntax error", mrp_eds[iid])
                    continue
            else:
                #print("Unmatched:", iid)
                continue
        else:
            # DMRS comes from the profile's own MRS result
            try:
                mrs_rep = dc_simplemrs.decode(result_mrs)
            except d_mrs._exceptions.MRSSyntaxError:
                #print("Skipping: MRS syntax error", result_mrs)
                continue
            dmrs_rep = d_dmrs.from_mrs(mrs_rep)

        mr = semantics.SemanticRepresentation(profile_name + ":" + iid, sentence, token_dict, derivation_rep, lexicon)  # read derivation tree

        if args.convert_semantics:
            mr.map_dmrs(dmrs_rep)
            mr.process_semantic_tree(mr.root_node_id, dmrs_rep)

        mr.print_mrs()

        if args.extract_syntax:
            derivation_strs.append(mr.derivation_tree_str(mr.root_node_id, newline=False).lstrip())
            supertag_strs.append(mr.supertag_str(mr.root_node_id).strip())

        if args.extract_semantics:
            dmrs_json_strs.append(mr.dmrs_json_str(dmrs_rep))

    # write the accumulated extractions, one per line
    if args.extract_syntax:
        with open(output_dir + ".tree", 'w') as dt_out:
            for s in derivation_strs:
                dt_out.write(s + "\n")
        with open(output_dir + ".tags", 'w') as st_out:
            for s in supertag_strs:
                st_out.write(s + "\n")

    if args.extract_semantics:
        with open(output_dir + ".dmrs", 'w') as d_out:
            for s in dmrs_json_strs:
                # empty conversions are dropped rather than written as blank lines
                if s != "":
                    d_out.write(s + "\n")