def test_write(self, single_item_profile, tmpdir):
    """Check ``TestSuite.write()`` for each argument form used below.

    The forms covered are: no arguments, a single table name, a list of
    table names, a mapping of table name to records, and an alternate
    ``path``/``relations`` destination.
    """
    suite = itsdb.TestSuite(single_item_profile)

    def first_input():
        # Always look the row up through the suite so that the state
        # after reload() is what gets inspected.
        return suite['item'][0]['i-input']

    def set_first_input(text):
        suite['item'][0]['i-input'] = text

    assert first_input() == 'The dog barks.'
    set_first_input('The dog sleeps.')
    assert first_input() == 'The dog sleeps.'

    # No arguments: the edit is expected to survive a reload.
    suite.write()
    suite.reload()
    assert first_input() == 'The dog sleeps.'

    # Only 'parse' is written, so the edit to 'item' is expected to be
    # gone after the reload.
    set_first_input('The cat sleeps.')
    suite.write('parse')
    suite.reload()
    assert first_input() == 'The dog sleeps.'

    # A list of table names (checked without reloading).
    set_first_input('The cat sleeps.')
    suite.write(['item', 'parse'])
    assert first_input() == 'The cat sleeps.'

    # A mapping from table name to the records to write.
    replacement = itsdb.Record.from_dict(
        suite.relations['item'],
        {'i-id': 0, 'i-input': 'The cat meows.'},
    )
    suite.write({'item': [replacement]})
    suite.reload()
    assert first_input() == 'The cat meows.'

    # Writing to another directory with a different relations definition.
    alt_dir = tmpdir.mkdir('alt')
    alt_relations = itsdb.Relations.from_string(_alt_relations)
    suite.write(path=str(alt_dir), relations=alt_relations)
    assert alt_dir.join('relations').read_text('utf-8') == _alt_relations
    written_files = sorted(entry.basename for entry in alt_dir.listdir())
    assert written_files == ['item', 'parse', 'relations', 'result']
    reopened = itsdb.TestSuite(str(alt_dir))
    assert 'i-date' in reopened['item'].fields
def test_init(self, single_item_profile):
    """Check table sizes for a relations-only suite and a loaded profile."""
    table_names = ('item', 'parse', 'result')

    # Built from relations alone: every table is expected to be empty.
    relations = itsdb.Relations.from_string(_simple_relations)
    empty_suite = itsdb.TestSuite(relations=relations)
    for table_name in table_names:
        assert len(empty_suite[table_name]) == 0

    # Built from the single-item profile: one row per table.
    loaded_suite = itsdb.TestSuite(single_item_profile)
    for table_name in table_names:
        assert len(loaded_suite[table_name]) == 1
def compare(testsuite, gold, select='i-id i-input mrs'):
    """
    Compare the MRSs of two [incr tsdb()] profiles item by item.

    Args:
        testsuite (str, TestSuite): the test profile, given either as
            a path or as a :class:`TestSuite` object
        gold (str, TestSuite): the gold profile, given either as a
            path or as a :class:`TestSuite` object
        select: TSQL query selecting (id, input, mrs) triples
            (default: `i-id i-input mrs`)
    Yields:
        dict: one comparison result per matched key, as::

            {"id": "item identifier",
             "input": "input sentence",
             "test": number_of_unique_results_in_test,
             "shared": number_of_shared_results,
             "gold": number_of_unique_results_in_gold}
    Raises:
        ValueError: if *select* does not project exactly 3 fields
    """
    from delphin.mrs import simplemrs, compare as mrs_compare

    def _as_testsuite(obj):
        # TestSuite objects pass through; ItsdbProfile objects are
        # reopened from their root; anything else goes straight to the
        # TestSuite constructor.
        if isinstance(obj, itsdb.TestSuite):
            return obj
        if isinstance(obj, itsdb.ItsdbProfile):
            obj = obj.root
        return itsdb.TestSuite(obj)

    testsuite = _as_testsuite(testsuite)
    gold = _as_testsuite(gold)

    projection = tsql.inspect_query('select ' + select)['projection']
    if len(projection) != 3:
        raise ValueError('select does not return 3 fields: ' + select)

    # Map each key to its input string using the first two projected
    # fields, read from the test profile only.
    id_field, input_field = projection[0], projection[1]
    inputs_by_key = dict(
        tsql.select('{} {}'.format(id_field, input_field), testsuite))

    test_selection = tsql.select(select, testsuite)
    gold_selection = tsql.select(select, gold)
    for key, from_test, from_gold in itsdb.match_rows(
            test_selection, gold_selection, 0):
        test_mrss = [simplemrs.loads_one(row[2]) for row in from_test]
        gold_mrss = [simplemrs.loads_one(row[2]) for row in from_gold]
        only_test, common, only_gold = mrs_compare.compare_bags(
            test_mrss, gold_mrss)
        yield {
            'id': key,
            'input': inputs_by_key[key],
            'test': only_test,
            'shared': common,
            'gold': only_gold
        }
def test_init(self, single_item_profile):
    """Check construction with no arguments, a schema, and a profile."""
    table_names = ('item', 'parse', 'result')

    # Constructing with no arguments at all is expected to be rejected.
    with pytest.raises(itsdb.ITSDBError):
        itsdb.TestSuite()

    # Schema only: the tables are expected to be empty.
    relations_file = pathlib.Path(single_item_profile, 'relations')
    schema_only = itsdb.TestSuite(schema=relations_file)
    for table_name in table_names:
        assert len(schema_only[table_name]) == 0

    # Full profile: one row per table.
    loaded = itsdb.TestSuite(single_item_profile)
    for table_name in table_names:
        assert len(loaded[table_name]) == 1
def test_select(mini_testsuite):
    """Check ``tsql.select`` projections and ``from`` clauses."""
    ts = itsdb.TestSuite(mini_testsuite)

    def run(query, **kwargs):
        return list(tsql.select(query, ts, **kwargs))

    every_input = [('It rained.', ), ('Rained.', ), ('It snowed.', )]
    inputs_with_results = [('It rained.', ), ('It snowed.', )]

    assert run('i-input') == every_input
    assert run('i-input from item') == every_input
    assert run('i-input from item item') == every_input
    assert run('i-input from result') == inputs_with_results
    assert run('i-input from item result') == inputs_with_results

    # Values come back as strings by default; with record_class=itsdb.Row
    # the ids compare equal to integers.
    assert run('i-id i-input') == [
        ('10', 'It rained.'),
        ('20', 'Rained.'),
        ('30', 'It snowed.'),
    ]
    assert run('i-id i-input', record_class=itsdb.Row) == [
        (10, 'It rained.'),
        (20, 'Rained.'),
        (30, 'It snowed.'),
    ]

    results = ts['result']
    assert run('i-id mrs') == [
        ('10', results[0]['mrs']),
        ('30', results[1]['mrs']),
    ]

    # A bare '*' projection is expected to be a syntax error.
    with pytest.raises(tsql.TSQLSyntaxError):
        tsql.select('*', ts)
def __cli_parse__(args):
    """Convert the MRSs of a profile to DMRS and serialize them as RDF.

    Args:
        args: parsed command-line namespace; the attributes read here
            are ``profile``, ``prefix``, ``verbosity``, ``output`` and
            ``format``.
    """
    # Outstanding work noted by the original author:
    #   - validate IRI prefix
    #   - handle exceptions
    #   - handle invalid profile
    #   - handle output exceptions
    suite = itsdb.TestSuite(args.profile)
    iri_prefix = args.prefix.strip("/")
    rdf_graph = Graph()

    for row in tsql.select('i-id i-input mrs', suite):
        item_id = row[0]
        sentence = row[1]
        if args.verbosity > 0:
            print("Parsing sentence {}".format(item_id))
        # Decode the stored MRS, convert it to DMRS, and add it to the
        # graph, which dmrs_to_rdf hands back.
        dmrs_repr = dmrs.from_mrs(simplemrs.decode(row[2]))
        rdf_graph = p.dmrs_to_rdf(d=dmrs_repr,
                                  prefix=iri_prefix,
                                  identifier=item_id,
                                  graph=rdf_graph,
                                  text=sentence)

    # Write the accumulated graph to the requested destination.
    rdf_graph.serialize(destination=args.output, format=args.format)
def _iter_representations(path: Path, fmt: str, p: int):
    """Yield one semantic representation per item found at *path*.

    Args:
        path: a TSDB database directory or a file readable by the
            codec named *fmt*
        fmt: codec name, used only when *path* is a file
        p: result index requested from each processed item, used only
            when *path* is a TSDB database
    Yields:
        For a database, the result of ``from_mrs`` for the *p*-th result
        of each processed item, or ``None`` when requesting that result
        raises ``IndexError``. For a file, MRS entries passed through
        ``from_mrs`` and DMRS/EDS entries as loaded.
    Raises:
        ValueError: if the codec's representation is unsupported or
            *path* is neither a file nor a TSDB database
    """
    if tsdb.is_database_directory(path):
        suite = itsdb.TestSuite(path)
        for response in suite.processed_items():
            try:
                result = response.result(p)
            except IndexError:
                yield None
                continue
            yield from_mrs(result.mrs(), predicate_modifiers=True)
        return

    if not path.is_file():
        raise ValueError(f'not a file or TSDB database: {path}')

    codec = util.import_codec(fmt)
    rep = codec.CODEC_INFO.get('representation', '').lower()
    if rep == 'mrs':
        for mrs in codec.load(path):
            yield from_mrs(mrs, predicate_modifiers=True)
    elif rep in ('dmrs', 'eds'):
        yield from codec.load(path)
    else:
        raise ValueError(f'unsupported representation: {rep}')
def test_reload(self, single_item_profile):
    """Check that ``reload()`` discards an unwritten row replacement."""
    suite = itsdb.TestSuite(single_item_profile)
    original_text = 'The dog barks.'
    edited_text = 'The dog sleeps.'

    assert suite['item'][0]['i-input'] == original_text

    # Replace the whole first row in memory only.
    suite['item'][0] = (0, edited_text)
    assert suite['item'][0]['i-input'] == edited_text

    # After reloading, the original text is expected again.
    suite.reload()
    assert suite['item'][0]['i-input'] == original_text
def test_Row(empty_alt_testsuite):
    """Check construction, access, formatting and equality of ``Row``."""
    ts = itsdb.TestSuite(str(empty_alt_testsuite))
    fields = ts['item'].fields
    when = datetime(2009, 9, 7)

    def make_row(*values):
        return itsdb.Row(fields, list(values))

    row = make_row(0, 'sentence', when)

    # Field metadata and size.
    assert row.fields == fields
    assert row.keys() == ['i-id', 'i-input', 'i-date']
    assert len(row) == 3

    # Access by field name and by position gives the same values.
    assert row['i-id'] == row[0] == 0
    assert row['i-input'] == row[1] == 'sentence'
    assert row['i-date'] == row[2] == datetime(2009, 9, 7)

    # String form and raw data.
    assert str(row) == '0@sentence@7-sep-2009'
    assert row == (0, 'sentence', datetime(2009, 9, 7))
    assert row.data == ('0', 'sentence', '7-sep-2009')

    # Equality is sensitive to each column.
    assert row == make_row(0, 'sentence', datetime(2009, 9, 7))
    assert row != make_row(1, 'sentence', datetime(2009, 9, 7))
    assert row != make_row(0, 'string', datetime(2009, 9, 7))
    assert row != make_row(0, 'sentence', datetime(2009, 7, 9))

    # incorrect number of fields
    with pytest.raises(itsdb.ITSDBError):
        itsdb.Row(fields, [0])

    # None values get set to default, and
    # non-string values are left as-is
    sparse = make_row(0, None, None)
    assert sparse['i-id'] == 0
    assert sparse['i-input'] is None
    assert sparse['i-date'] is None
def main(args):
    """Report variable-property counts per profile and over all profiles.

    For each profile in ``args.PROFILE``, every MRS in the ``result``
    table is loaded and its variable properties are tallied by variable
    sort, upper-cased property name and lower-cased value. A report is
    printed per profile, followed by one for the combined totals.
    """
    grand_totals = _make_counters()
    grand_record_count = 0

    for profile in args.PROFILE:
        suite = itsdb.TestSuite(profile)
        profile_totals = _make_counters()
        profile_record_count = 0

        for record in suite['result']:
            profile_record_count += 1
            grand_record_count += 1
            mrs = simplemrs.loads_one(record['mrs'])
            for variable in mrs.variables():
                sort = var_sort(variable)
                properties = mrs.properties(variable)
                for name, value in properties.items():
                    profile_totals[sort][name.upper()][value.lower()] += 1

        print('{} ({} MRSs):'.format(profile, profile_record_count))
        report(profile_totals)

        # Fold this profile's tallies into the running grand totals.
        for sort, by_property in profile_totals.items():
            for name, by_value in by_property.items():
                for value, count in by_value.items():
                    grand_totals[sort][name][value] += count

    print('TOTAL ({} MRSs):'.format(grand_record_count))
    report(grand_totals)
def test_select_where_types_issue_261(mini_testsuite):
    """Check that type-mismatched ``where`` conditions raise TSQLError.

    See https://github.com/delph-in/pydelphin/issues/261
    """
    ts = itsdb.TestSuite(mini_testsuite)
    mismatched_queries = (
        'i-id where i-id ~ "regex"',
        'i-id where i-input < 1',
        'i-id where i-input = 1',
    )
    for query in mismatched_queries:
        with pytest.raises(tsql.TSQLError):
            tsql.select(query, ts)
def test_processed_items(self, mini_testsuite):
    """Check the responses produced by ``TestSuite.processed_items()``."""
    suite = itsdb.TestSuite(mini_testsuite)
    responses = list(suite.processed_items())
    assert len(responses) == 3

    first, second, third = responses
    expected_mrs = (
        '[ TOP: h0 INDEX: e2 [ e TENSE: past ]'
        ' RELS: < [ _rain_v_1<3:9> LBL: h1 ARG0: e2 ] >'
        ' HCONS: < h0 qeq h1 > ]')

    # The first item has one result with the expected MRS string.
    assert first['i-input'] == 'It rained.'
    assert len(first.results()) == 1
    assert first.result(0)['mrs'] == expected_mrs

    # The second item has no results; the third has one.
    assert len(second.results()) == 0
    assert len(third.results()) == 1
def test_select_where(ts0):
    """Check ``where`` conditions: regex, boolean ``or``, dates, integers."""
    ts = itsdb.TestSuite(str(ts0))

    def run(query):
        return list(tsql.select(query, ts))

    every_input = [['It rained.'], ['Rained.'], ['It snowed.']]
    starting_with_it = [['It rained.'], ['It snowed.']]

    assert run('i-input where i-input ~ "It"') == starting_with_it
    assert run('i-input where i-input ~ "It" or i-id = 20') == every_input
    assert run('i-input where i-date >= 2018-02-01') == every_input
    assert run('i-input where readings > 0') == starting_with_it
def select(dataspec, testsuite, mode='list', cast=True):
    """
    Select data from [incr tsdb()] profiles.

    Args:
        dataspec (str): TSQL select query (e.g., `'i-id i-input mrs'`
            or `'* from item where readings > 0'`)
        testsuite (str, TestSuite): testsuite or path to testsuite
            containing data to select
        mode (str): see :func:`delphin.itsdb.select_rows` for a
            description of the *mode* parameter (default: `list`)
        cast (bool): if `True`, cast column values to their datatype
            according to the relations file (default: `True`)
    Returns:
        the result of :func:`tsql.select` for the query
    """
    if isinstance(testsuite, itsdb.ItsdbProfile):
        # ItsdbProfile objects are reopened as a TestSuite from their root.
        suite = itsdb.TestSuite(testsuite.root)
    elif isinstance(testsuite, itsdb.TestSuite):
        suite = testsuite
    else:
        # Anything else is handed to the TestSuite constructor.
        suite = itsdb.TestSuite(testsuite)
    return tsql.select(dataspec, suite, mode=mode, cast=cast)
def test_select_where(mini_testsuite):
    """Check ``where`` conditions: regex, boolean ``or``, dates, integers."""
    ts = itsdb.TestSuite(mini_testsuite)

    def run(query):
        return list(tsql.select(query, ts))

    every_input = [('It rained.', ), ('Rained.', ), ('It snowed.', )]
    starting_with_it = [('It rained.', ), ('It snowed.', )]

    assert run('i-input where i-input ~ "It"') == starting_with_it
    assert run('i-input where i-input ~ "It" or i-id = 20') == every_input
    assert run('i-input where i-date >= 2018-02-01') == every_input
    assert run('i-input where readings > 0') == starting_with_it
def on_get_name(self, req, resp, name):
    """Respond with a mapping from table name to that table's URI.

    Args:
        req: incoming request; its ``uri`` is the base for table links
        resp: response whose ``media`` and ``status`` are set here
        name: key of the test suite in ``self.index``
    Raises:
        falcon.HTTPNotFound: if *name* is not in ``self.index``
    """
    try:
        entry = self.index[name]
    except KeyError:
        raise falcon.HTTPNotFound()

    suite = itsdb.TestSuite(entry['path'])
    base_uri = req.uri

    # One link per table in the schema, with the table name URL-quoted.
    links = {}
    for table_name in suite.schema:
        links[table_name] = '/'.join(
            [base_uri, urllib.parse.quote(table_name)])

    resp.media = links
    resp.status = falcon.HTTP_OK
def test_bad_date_issue_279b(tmp_path, empty_alt_testsuite):
    """Check how an unparseable date in ``i-date`` is reported.

    The raw value is expected to be selectable with ``cast=False``, and
    reading the field through the row is expected to emit a
    ``TSDBWarning``.
    """
    bad_date = 'September 8, 1999'

    # Build a fresh database that reuses the alt schema and holds a
    # single item row with the bad date.
    db_dir = tmp_path.joinpath('test_bad_date_issue_279b')
    db_dir.mkdir()
    schema = tsdb.read_schema(empty_alt_testsuite)
    item_fields = schema['item']
    tsdb.write_schema(db_dir, schema)
    tsdb.write(db_dir, 'item', [(0, 'The cat meows.', bad_date)],
               item_fields)

    suite = itsdb.TestSuite(db_dir)
    raw_dates = list(suite['item'].select('i-date', cast=False))
    assert raw_dates == [('September 8, 1999', )]
    with pytest.warns(tsdb.TSDBWarning):
        suite['item'][0]['i-date']
def test_process(self, parser_cpu, single_item_skeleton):
    """Check that ``process()`` fills the parse and result tables."""
    suite = itsdb.TestSuite(single_item_skeleton)

    # A skeleton starts with no parse or result rows.
    assert len(suite['parse']) == 0
    assert len(suite['result']) == 0

    suite.process(parser_cpu)

    # One parse row and two result rows are expected afterwards.
    assert len(suite['parse']) == 1
    assert len(suite['result']) == 2

    parse_row = suite['parse'][0]
    assert parse_row['parse-id'] == 0
    assert parse_row['run-id'] == 0

    # Both results belong to parse 0 and are numbered in order.
    for position, expected_result_id in enumerate((0, 1)):
        result_row = suite['result'][position]
        assert result_row['parse-id'] == 0
        assert result_row['result-id'] == expected_result_id
def test_commit(self, single_item_profile, empty_alt_testsuite):
    """Check that only committed changes survive ``reload()``."""
    suite = itsdb.TestSuite(single_item_profile)
    items = suite['item']
    original_text = 'The dog barks.'
    edited_row = (0, 'The dog sleeps.')

    # uncommitted changes do not persist
    assert items[0]['i-input'] == original_text
    items[0] = edited_row
    assert items[0]['i-input'] == 'The dog sleeps.'
    assert suite.in_transaction
    suite.reload()
    assert items[0]['i-input'] == original_text
    assert not suite.in_transaction

    # committing them makes them persist
    items[0] = edited_row
    assert suite.in_transaction
    suite.commit()
    assert not suite.in_transaction
    suite.reload()
    assert items[0]['i-input'] == 'The dog sleeps.'
def test_in_transaction(self, empty_testsuite):
    """Check ``in_transaction`` after each kind of table modification."""
    suite = itsdb.TestSuite(empty_testsuite)
    items = suite['item']
    new_row = (10, 'Dogs bark.')

    assert not suite.in_transaction

    # Appending a row opens a transaction; committing closes it.
    items.append(new_row)
    assert suite.in_transaction
    suite.commit()
    assert not suite.in_transaction

    # Updating a row does the same.
    items.update(-1, {'i-input': 'Cats meow.'})
    assert suite.in_transaction
    suite.commit()
    assert not suite.in_transaction

    # Deleting through slice assignment opens a transaction.
    items[-1:] = []
    assert suite.in_transaction

    # Clearing the table after a commit opens one as well.
    items.append(new_row)
    suite.commit()
    items.clear()
    assert suite.in_transaction
def __cli_parse__(args):
    """Add every MRS of a profile to an RDF graph and serialize it.

    Args:
        args: parsed command-line namespace; the attributes read here
            are ``profile``, ``prefix``, ``output`` and ``format``.
    """
    # Outstanding work noted by the original author:
    #   - validate IRI prefix
    #   - handle exceptions
    #   - handle invalid profile
    #   - handle output exceptions
    suite = itsdb.TestSuite(args.profile)
    iri_prefix = args.prefix.strip("/")
    rdf_graph = Graph()

    for row in tsql.select('i-id i-input mrs', suite):
        item_id = row[0]
        sentence = row[1]
        # Decode the MRS stored in the profile and add it to the graph.
        mrs_repr = simplemrs.decode(row[2])
        p.mrs_to_rdf(mrs_repr, iri_prefix, item_id, rdf_graph,
                     text=sentence)

    # Write the accumulated graph to the requested destination.
    rdf_graph.serialize(destination=args.output, format=args.format)
def tsdb_min(path_to_profile):
    """
    Collect the basic item data of a profile, keyed by i-id.

    The argument path_to_profile is joined onto ROOT and should be,
    for example, '/delphin/erg2018/tsdb/mrs'.

    Both skeletons and filled/parsed profiles can be inspected, which
    is why tsql.select is done in multiple queries: only the 'item'
    file is assumed to exist; anything else is checked first.

    This function returns a dictionary based on i-ids of that profile:

    data[1]['i-wf'] = 1
    data[1]['i-input'] = "This is an example sentence."
    data[1]['i-comment'] = "The comment left inside the items-file."

    (likewise for 'i-length', 'i-origin' and 'i-translation').
    When the profile has a 'parse' file it also includes:

    data[1]['readings'] = 23  # value of the 'readings' field
    """
    profile_dir = path.join(ROOT, path_to_profile)
    ts = itsdb.TestSuite(profile_dir)

    data = dd(lambda: dd())

    item_fields = ('i-wf', 'i-input', 'i-comment',
                   'i-length', 'i-origin', 'i-translation')
    item_query = 'i-id ' + ' '.join(item_fields)
    for row in tsql.select(item_query, ts):
        i_id = row[0]
        # Column 0 is the i-id; the rest follow item_fields in order.
        for index, field in enumerate(item_fields, 1):
            data[i_id][field] = row[index]

    #######################################################################
    # If we don't check if the file 'parse' exists, then pydelphin creates
    # an empty 'parse' file. This is undesirable, especially for skeletons
    #######################################################################
    # The file name is joined as its own path component: appending
    # 'parse' directly to the profile path only matched when the caller
    # passed a trailing separator.
    if path.isfile(path.join(profile_dir, 'parse')):
        for row in tsql.select('i-id readings', ts):
            data[row[0]]['readings'] = row[1]

    return data
def test_select(ts0):
    """Check ``tsql.select`` projections and ``from`` clauses."""
    ts = itsdb.TestSuite(str(ts0))

    def run(query, **kwargs):
        return list(tsql.select(query, ts, **kwargs))

    every_input = [['It rained.'], ['Rained.'], ['It snowed.']]
    inputs_with_results = [['It rained.'], ['It snowed.']]

    assert run('i-input') == every_input
    assert run('i-input from item') == every_input
    assert run('i-input from item item') == every_input
    assert run('i-input from result') == inputs_with_results
    assert run('i-input from item result') == inputs_with_results

    assert run('i-id i-input') == [
        [10, 'It rained.'],
        [20, 'Rained.'],
        [30, 'It snowed.'],
    ]

    results = ts['result']
    assert run('i-id mrs') == [
        [10, results[0]['mrs']],
        [30, results[1]['mrs']],
    ]

    # A bare '*' projection is expected to be a syntax error, while
    # '*' with an explicit table matches the rows of that table.
    with pytest.raises(tsql.TSQLSyntaxError):
        tsql.select('*', ts)
    assert run('* from item', cast=True) == list(ts['item'])
def test_join(single_item_profile):
    """Check names, sizes and field access of tables from ``itsdb.join``."""
    profile = itsdb.TestSuite(single_item_profile)

    # parse + result: the joined table has one field fewer than the sum
    # of its inputs.
    parse_result = itsdb.join(profile['parse'], profile['result'])
    assert parse_result.name == 'parse+result'
    assert len(parse_result) == 1
    expected_width = (len(profile['parse'].fields)
                      + len(profile['result'].fields) - 1)
    assert len(parse_result.fields) == expected_width

    # Qualified and unqualified field names give the same values.
    row = parse_result[0]
    assert row['parse:run-id'] == row['run-id']
    assert row['result:mrs'] == row['mrs']
    assert row['parse:parse-id'] == row['result:parse-id'] == row['parse-id']

    # item + (parse + result)
    item_parse_result = itsdb.join(profile['item'], parse_result)
    assert item_parse_result.name == 'item+parse+result'
    assert len(item_parse_result) == 1
    expected_width = (len(parse_result.fields)
                      + len(profile['item'].fields) - 1)
    assert len(item_parse_result.fields) == expected_width

    row = item_parse_result[0]
    assert row['item:i-input'] == row['i-input']
    assert row['item:i-id'] == row['parse:i-id']

    # The joined name follows the argument order.
    reversed_join = itsdb.join(parse_result, profile['item'])
    assert reversed_join.name == 'parse+result+item'
def on_get_table(self, req, resp, name, table):
    """Respond with one page of rows from a table of a test suite.

    Args:
        req: incoming request; the integer query parameters ``limit``
            (default: the table's length) and ``page`` (default: 1)
            select the slice of rows returned
        resp: response whose ``media`` and ``status`` are set here
        name: key of the test suite in ``self.index``
        table: name of the table to read
    Raises:
        falcon.HTTPNotFound: if *name* is not in ``self.index``
    """
    try:
        entry = self.index[name]
    except KeyError:
        raise falcon.HTTPNotFound()

    suite = itsdb.TestSuite(entry['path'])
    tbl = suite[table]

    per_page = req.get_param_as_int('limit', default=len(tbl))
    page_number = req.get_param_as_int('page', default=1)
    start = (page_number - 1) * per_page
    stop = page_number * per_page

    # Resolve configured column names to indices once, before the loop.
    column_transforms = []
    for column_name, func in self.transforms.get(table, []):
        column_transforms.append((tbl.column_index(column_name), func))

    def convert(row):
        # Copy the row into a list and apply each transform to its column.
        cells = list(row)
        for index, apply_transform in column_transforms:
            cells[index] = apply_transform(cells[index])
        return cells

    resp.media = [convert(row) for row in tbl[start:stop]]
    resp.status = falcon.HTTP_OK
def test_mkprof_issue_273(mini_testsuite, tmp_path):
    """Check that a ``where`` match on a joined table yields one item row.

    See https://github.com/delph-in/pydelphin/issues/273
    """
    from delphin import itsdb

    dest_dir = tmp_path.joinpath('ts1')
    dest_dir.mkdir()
    dest = str(dest_dir)
    source = mini_testsuite
    condition = 'mrs ~ "_snow_v_1"'
    expected_item_text = ('30@It snowed.@1@2018-2-1 (15:00:00)\n')

    def rebuild_and_check():
        mkprof(dest, source=source, full=True, where=condition)
        item_file = pathlib.Path(dest, 'item')
        assert item_file.read_text() == expected_item_text

    # this is when the condition occurs on a single row
    rebuild_and_check()

    # this is when the condition occurs on multiple rows
    source_suite = itsdb.TestSuite(source)
    source_suite['parse'].update(2, {'readings': 2})
    source_suite['result'].append(
        (30, 1, '[ TOP: h0 INDEX e2 [ e TENSE: past ]'
         ' RELS: < [ pronoun_q<0:2> LBL h3 ARG0: x4 RSTR: h5 BODY: h6 ]'
         ' [ pron<0:2> LBL: h7 ARG0: x4 ]'
         ' [ _snow_v_1<3:9> LBL: h1 ARG0: e2 ARG1: x4 ] >'
         ' HCONS: < h0 qeq h1 h5 qeq h7 > ]'))
    source_suite.commit()
    rebuild_and_check()
from delphin import ace
from delphin import itsdb
from delphin import tsql
from delphin import dmrs, eds
from delphin.codecs import eds as edsnative
from delphin.codecs import simplemrs
from delphin.codecs import dmrx

# import parser as p
from delphin.rdf import parser as p
from rdflib import Graph
import argparse

# Script: read every MRS of the profile named on the command line, add
# each to a single RDF graph, and write the graph out as Turtle.

parser = argparse.ArgumentParser()
parser.add_argument("profile", help="profile path")
cli_args = parser.parse_args()

ts = itsdb.TestSuite(cli_args.profile)
graph = Graph()

for row in tsql.select('i-id mrs', ts):
    # Column 0 is the item id, column 1 the serialized MRS.
    m = simplemrs.decode(row[1])
    p.mrs_to_rdf(m, "http://example.com/example", row[0], graph)

graph.serialize(destination="test.ttl", format="turtle")
from delphin import ace
from delphin import tsdb
from delphin import itsdb

# Script: process the 'sample-200-py' profile with an ACE parser that is
# started in full-forest mode with generalization disabled.

ts = itsdb.TestSuite('sample-200-py')

parser_options = ['--disable-generalization']
with ace.ACEParser('terg-mac.dat',
                   cmdargs=parser_options,
                   full_forest=True) as cpu:
    # Results are written into the same test suite.
    ts.process(cpu)
def process(grammar, testsuite, source=None, select=None,
            generate=False, transfer=False, full_forest=False,
            options=None, all_items=False, result_id=None, gzip=False,
            stderr=None):
    """
    Process (e.g., parse) a [incr tsdb()] profile.

    Results are written to directly to *testsuite*.

    If *select* is `None`, the defaults depend on the task:

        ==========  =========================
        Task        Default value of *select*
        ==========  =========================
        Parsing     `item.i-input`
        Transfer    `result.mrs`
        Generation  `result.mrs`
        ==========  =========================

    Args:
        grammar (str): path to a compiled grammar image
        testsuite (str): path to a [incr tsdb()] testsuite where data
            will be read from (see *source*) and written to
        source (str): path to a [incr tsdb()] testsuite; if `None`,
            *testsuite* is used as the source of data
        select (str): TSQL query for selecting processor inputs
            (default depends on the processor type)
        generate (bool): if `True`, generate instead of parse
            (default: `False`)
        transfer (bool): if `True`, transfer instead of parse
            (default: `False`)
        full_forest (bool): if `True`, pass `full_forest=True` to the
            ACE parser; mutually exclusive with *generate* and
            *transfer* (default: `False`)
        options (list): list of ACE command-line options to use when
            invoking the ACE subprocess; unsupported options will
            give an error message
        all_items (bool): if `True`, don't exclude ignored items
            (those with `i-wf==2`) when parsing
        result_id (int): if given, only keep items with the specified
            `result-id`
        gzip (bool): if `True`, non-empty tables will be compressed
            with gzip
        stderr (file): stream for ACE's stderr
    Raises:
        CommandError: if *grammar* is not a file, if more than one of
            *generate*, *transfer* and *full_forest* is set, or if
            *testsuite* exists but is not a TSDB database while a
            separate *source* is given
    """
    from delphin import ace

    grammar = Path(grammar).expanduser()
    testsuite = Path(testsuite).expanduser()

    if not grammar.is_file():
        raise CommandError(f'{grammar} is not a file')

    kwargs = {'stderr': stderr}

    if sum(1 for mode in (generate, transfer, full_forest) if mode) > 1:
        raise CommandError("'generate', 'transfer', and 'full-forest' "
                           "are mutually exclusive")

    if source is None:
        source = _validate_tsdb(testsuite)
    else:
        source = _validate_tsdb(source)
        # With a separate source, the destination may not exist yet;
        # in that case it is created from the source. If it is already
        # a TSDB database, it is used as-is.
        if not tsdb.is_database_directory(testsuite):
            if testsuite.exists():
                raise CommandError(
                    f'{testsuite} exists and is not a TSDB database; '
                    'remove it or select a different destination path')
            mkprof(testsuite, source=source, full=False, quiet=True)

    if select is None:
        select = 'result.mrs' if (generate or transfer) else 'item.i-input'

    # Collect the implicit filters and attach them with one 'where'
    # clause. Appending a separate ' where ...' per filter produced a
    # query with two 'where' keywords when both filters applied.
    conditions = []
    if generate:
        processor = ace.ACEGenerator
    elif transfer:
        processor = ace.ACETransferer
    else:
        if full_forest:
            kwargs['full_forest'] = True
        if not all_items:
            conditions.append('i-wf != 2')
        processor = ace.ACEParser
    if result_id is not None:
        conditions.append(f'result-id == {result_id}')
    if conditions:
        select += ' where ' + ' and '.join(conditions)

    target = itsdb.TestSuite(testsuite)
    column, relation, condition = _interpret_selection(select, source)

    with tempfile.TemporaryDirectory() as tmpdir:
        # use a temporary test suite directory for filtered inputs
        mkprof(tmpdir, source=source, where=condition,
               full=True, gzip=True, quiet=True)
        tmp = itsdb.TestSuite(tmpdir)

        with processor(grammar, cmdargs=options, **kwargs) as cpu:
            target.process(cpu,
                           selector=(relation, column),
                           source=tmp,
                           gzip=gzip)
from delphin import itsdb
from delphin import ace
from delphin import commands

# Script: create the target profile 'p' from the 'golden' profile, then
# run an ACE generator over the golden data and store results in 'p'.

src_path = 'golden'
tgt_path = 'p'

# The target profile must exist before it can be opened below.
commands.mkprof(tgt_path, source=src_path)

src_ts = itsdb.TestSuite(src_path)
tgt_ts = itsdb.TestSuite(tgt_path)

generator = ace.ACEGenerator('erg.dat')
with generator as cpu:
    # Inputs are read from the source suite; outputs go to the target.
    tgt_ts.process(cpu, source=src_ts)