def test_ItsdbProfile(empty_profile, single_item_skeleton, single_item_profile):
    """Smoke-test ItsdbProfile construction from each fixture flavor."""
    for profile_dir in (empty_profile,
                        single_item_skeleton,
                        single_item_profile):
        itsdb.ItsdbProfile(profile_dir)
def do(args):
    """Print hypothesis/reference pairs for every ITEM profile.

    The g-result table and item:i-translation are used by default; with
    --rephrasing the r-result table is compared against item:i-input.
    """
    if args['--rephrasing']:
        join_table = 'r-result'
        hyp_spec = 'r-result:surface'
        ref_spec = 'item:i-input'
    else:
        join_table = 'g-result'
        hyp_spec = 'g-result:surface'
        ref_spec = 'item:i-translation'
    # optional whitespace tokenization applied to both sides
    if args['--tokenize']:
        def transform(s):
            return ' '.join(_tokenize(s))
    else:
        def transform(s):
            return s
    make_hyp = make_ref = transform
    select = select_oracle if args['--oracle-bleu'] else select_first
    for itemdir in args['ITEM']:
        itemdir = os.path.normpath(itemdir)
        p = itsdb.ItsdbProfile(itemdir)
        if args['--item-id']:
            rows = select(p, join_table, hyp_spec, ref_spec, with_id=True)
            for i_id, hyp, ref in rows:
                print('{}\t{}\t{}'.format(i_id, make_hyp(hyp), make_ref(ref)))
        else:
            for hyp, ref in select(p, join_table, hyp_spec, ref_spec):
                print('{}\t{}'.format(make_hyp(hyp), make_ref(ref)))
def _parse_tables(item):
    """Collect p-info and p-result rows from a [incr tsdb()] profile.

    Returns a (info, results) pair of row-dict lists; raises ValueError
    when *item* is not a profile directory.
    """
    if not os.path.isdir(item):
        raise ValueError('Only profiles allowed with --full: ' + str(item))
    info = []
    results = []
    p = itsdb.ItsdbProfile(item)
    parse_fn = os.path.join(p.root, 'parse')
    if os.path.isfile(parse_fn) or os.path.isfile(parse_fn + '.gz'):
        for row in p.read_table('parse'):
            info.append({
                'i-id': row['i-id'],
                'time': row['total'],
                'memory': row['others'],
            })
    result_fn = os.path.join(p.root, 'result')
    if os.path.isfile(result_fn) or os.path.isfile(result_fn + '.gz'):
        for row in p.join('parse', 'result'):
            results.append({
                'i-id': row['parse:i-id'],
                'p-id': row['result:result-id'],
                'derivation': row['result:derivation'],
                'mrs': row['result:mrs'],
                'score': '1.0',  # for now
            })
    return info, results
def item_rows(item, reverse=False):
    """Build item-table rows from a profile directory or a tab-separated file.

    Args:
        item: path to an [incr tsdb()] profile directory, or to a text
            file whose lines are ``source<TAB>target``
        reverse: if True, swap source and target in the returned rows
    Returns:
        list of dicts with 'i-id', 'i-input', 'i-length', 'i-translation'
    Raises:
        ValueError: if *item* is neither a directory nor a file
    """
    data = []
    if os.path.isdir(item):
        p = itsdb.ItsdbProfile(item)
        output_fn = os.path.join(p.root, 'output')
        # prefer the output table for targets when it exists and is
        # non-empty; otherwise fall back to item:i-translation
        if ((os.path.isfile(output_fn) or os.path.isfile(output_fn + '.gz'))
                and len(list(p.read_table('output'))) > 0):
            for row in p.join('item', 'output'):
                data.append((row['item:i-id'],
                             row['item:i-input'],
                             row['output:o-surface']))
        else:
            data.extend(p.select('item', ['i-id', 'i-input', 'i-translation']))
    elif os.path.isfile(item):
        # fix: use a context manager so the file handle is always closed
        with open(item) as fh:
            for i, line in enumerate(fh):
                src, tgt = line.split('\t', 1)
                # i-ids are spaced by 10, following [incr tsdb()] convention
                data.append(((i + 1) * 10, src.rstrip(), tgt.rstrip()))
    else:
        raise ValueError('Invalid item: ' + str(item))
    rows = []
    for i_id, src, tgt in data:
        if reverse:
            src, tgt = tgt, src
        rows.append({
            'i-id': i_id,
            'i-input': src,
            'i-length': len(src.split()),
            'i-translation': tgt
        })
    return rows
def read_profile(f, args):
    """Yield (i-id, input-string, [Mrs, ...]) groups from profile *f*.

    Reads from the p-result table when present, otherwise from a join of
    the parse and result tables.  Rows are assumed to arrive grouped by
    item id, so a change of id closes the current group.

    fix: removed the dead local assignment ``mrs_dataspec = args``, which
    was never read.
    """
    p = itsdb.ItsdbProfile(f)
    inputs = dict((r['i-id'], r['i-input']) for r in p.read_table('item'))
    cur_id, mrss = None, []
    if p.exists('p-result'):
        rows = p.read_table('p-result')
        id_spec = 'i-id'
        mrs_spec = 'mrs'
    else:
        rows = p.join('parse', 'result')
        id_spec = 'parse:i-id'
        mrs_spec = 'result:mrs'
    for row in rows:
        mrs = simplemrs.loads_one(row[mrs_spec])
        if cur_id is None:
            cur_id = row[id_spec]
        if cur_id == row[id_spec]:
            mrss.append(mrs)
        else:
            yield (cur_id, inputs[cur_id], mrss)
            cur_id, mrss = row[id_spec], [mrs]
    # flush the final group, if any rows were seen
    if mrss:
        yield (cur_id, inputs[cur_id], mrss)
def mkprof(args):
    """
    Create [incr tsdb()] profiles or skeletons.
    """
    outdir = args['DEST']
    # --in-place rewrites DEST over itself; incompatible with --skeleton
    if args['--in-place']:
        if args['--skeleton']:
            sys.exit('Creating a skeleton with --in-place is not allowed.')
        args['--source'] = args['DEST']
    if args['--source']:  # input is profile
        p = _prepare_input_profile(args['--source'],
                                   filters=args['--filter'],
                                   applicators=args['--apply'])
        relations = args['--relations'] or os.path.join(p.root, 'relations')
    else:  # input is stdin or txt file
        p = None
        # NOTE(review): if --relations was not given here, relations is
        # None and os.path.isfile(None) below raises TypeError — confirm
        # the CLI makes --relations mandatory in this mode
        relations = args['--relations']
    if not os.path.isfile(relations):
        sys.exit('Invalid relations file: {}'.format(relations))
    if args['--full']:
        # copy the whole source profile (requires a profile input)
        _prepare_output_directory(outdir)
        p.write_profile(outdir,
                        relations_filename=relations,
                        key_filter=True,
                        gzip=args['--gzip'])
    else:
        # build a skeleton from the item rows of whichever input we have
        if p is not None:
            rows = p.read_table('item')
        elif args['--input'] is not None:
            rows = _lines_to_rows(open(args['--input']))
        else:
            rows = _lines_to_rows(sys.stdin)
        p = itsdb.make_skeleton(outdir, relations, rows, gzip=args['--gzip'])
    # unless a skeleton was requested, make empty files for other tables
    if not args['--skeleton']:
        for tbl in p.relations:
            p.write_table(tbl, [])
    # summarize what was done
    if sys.stdout.isatty():
        # highlight gzipped files in red on interactive terminals
        _red = lambda s: '\x1b[1;31m{}\x1b[0m'.format(s)
    else:
        _red = lambda s: s
    fmt = '{:>8} bytes\t{}'
    prof = itsdb.ItsdbProfile(outdir, index=False)
    relations = prof.relations
    # NOTE(review): sorted(enumerate(...)) sorts by the original index,
    # so this is equivalent to list(relations) — presumably intentional
    # to preserve relations-file order
    tables = [tbl for i, tbl in sorted(enumerate(relations))]
    for filename in ['relations'] + tables:
        f = os.path.join(outdir, filename)
        if os.path.isfile(f):
            stat = os.stat(f)
            print(fmt.format(stat.st_size, filename))
        elif os.path.isfile(f + '.gz'):
            stat = os.stat(f + '.gz')
            print(fmt.format(stat.st_size, _red(filename + '.gz')))
def _prepare_input_profile(path, filters=None, applicators=None):
    """Open *path* as an ItsdbProfile with parsed filter/applicator actions.

    Each filter/applicator spec is a ``table:cols=expression`` string; the
    part before the first '=' and the expression are handed to
    _make_itsdb_action.
    """
    filter_specs = filters or []
    applicator_specs = applicators or []
    flts = [_make_itsdb_action(*spec.split('=', 1))
            for spec in filter_specs]
    apls = [_make_itsdb_action(*spec.split('=', 1))
            for spec in applicator_specs]
    # indexing is only needed when filters may drop rows
    return itsdb.ItsdbProfile(path,
                              filters=flts,
                              applicators=apls,
                              index=len(flts) > 0)
def test_select_rows(single_item_profile):
    """select_rows picks named columns from plain and joined tables."""
    p = itsdb.ItsdbProfile(single_item_profile)
    item_cols = list(
        itsdb.select_rows(['i-id', 'i-input'], p.read_table('item')))
    assert item_cols == [['0', 'The dog barks.']]
    joined_cols = list(
        itsdb.select_rows(['item:i-id', 'parse:parse-id'],
                          p.join('item', 'parse')))
    assert joined_cols == [['0', '0']]
def show_cov(profile):
    """Print parse coverage for *profile*: items with >0 readings / total.

    Args:
        profile: path to an [incr tsdb()] profile directory
    """
    prof = itsdb.ItsdbProfile(profile, index=False)
    total = 0
    parsed = 0
    for row in prof.read_table('parse'):
        readings = int(row.get('readings'))
        total += 1
        if readings > 0:
            parsed += 1
    # fix: avoid ZeroDivisionError when the parse table is empty
    pct = (100.0 * parsed / total) if total else 0.0
    print("Coverage for {}:\n\t {}/{} ({}%)".format(
        profile, parsed, total, pct))
def main():
    """Compute and print the BLEU score (x100) for PROFILE."""
    args = docopt.docopt(USAGE)
    p = itsdb.ItsdbProfile(args['PROFILE'])
    # rephrasing is scored against the original input, generation
    # against the gold translation
    ref_field = 'i-input' if args['--rephrases'] else 'i-translation'
    refs = dict(p.select('item', ('i-id', ref_field)))
    if args['--oracle']:
        pairs = oracle_pairs(p, refs)
    else:
        pairs = single_pairs(p, refs)
    print('{:4.2f}'.format(bleu(pairs) * 100))
def check_mrs(profile):
    """Report items whose stored MRS raises an XmrsWarning when parsed.

    Args:
        profile: path to an [incr tsdb()] profile directory
    """
    prof = itsdb.ItsdbProfile(profile, index=False)
    # escalate XmrsWarning to an exception so bad MRSs are caught per row
    warnings.simplefilter('error', XmrsWarning)
    inp = dict()
    warned = set()
    for row in prof.read_table('item'):
        inp[row.get('i-id')] = row.get('i-input')
    for row in prof.read_table('result'):
        # NOTE(review): inp is keyed by i-id but indexed with parse-id
        # below — this only works when the two ids coincide; confirm
        pid, mrs = row.get('parse-id'), row.get('mrs')
        try:
            simplemrs.loads(mrs, single=True)
        except XmrsWarning as w:
            if pid not in warned:
                print('\nItem: {}\t{}'.format(pid, inp[pid]))
                warned.add(pid)
            print('Warning: {}!'.format(w))
    # fix: avoid ZeroDivisionError when the item table is empty
    total = len(inp)
    pct = (100.0 * len(warned) / total) if total else 0.0
    print("Bad MRS for {}:\n\t {}/{} ({}%)".format(
        profile, len(warned), total, pct))
def prof_entries(prof, typemap, lexmap, table='result',
                 cols=('derivation', 'mrs')):
    """Yield (lename, supertype, orth, pred, None) lexical-entry tuples
    for each unseen (type, orthography) pair found in *prof* derivations.

    Args:
        prof: path to an [incr tsdb()] profile
        typemap: maps lexical types to their supertype list
        lexmap: fallback mapping from lexical entity to type
        table: table to select derivations from
        cols: column names for (derivation, mrs)

    fix: removed a leftover debug print() that duplicated every yielded
    tuple on stdout.
    """
    p = itsdb.ItsdbProfile(prof)
    seen = set()
    for derivation, mrs in p.select(table, cols):
        d = Derivation.from_string(derivation)
        for entity, typ, form in _derivation_les(d):
            if typ is None:
                typ = lexmap.get(entity)
            orth = ', '.join('"{}"'.format(part) for part in form)
            if (typ, orth) not in seen and typ in typemap:
                supertype = typemap[typ][0]  # more than 1?
                lename = '+'.join(form) + '-' + supertype
                pred = None  # pred is currently never populated
                yield (lename, supertype, orth, pred, None)
                seen.add((typ, orth))
def do(args):
    """Evaluate each ITEM profile and print per-item and summary stats."""
    if args['--all']:
        args['--coverage'] = True
        args['--bleu'] = True
        args['--oracle-bleu'] = True
        # args['--meteor'] = True
    numitems = len(args['ITEM'])
    width = len(str(numitems))
    stats = {}
    for idx, itemdir in enumerate(args['ITEM'], start=1):
        itemdir = os.path.normpath(itemdir)
        logging.info('Evaluate {0:{1}d}/{2} {3}'.format(
            idx, width, numitems, itemdir))
        p = itsdb.ItsdbProfile(itemdir)
        p_stats = {}
        if args['--coverage']:
            update_stats(p_stats, coverage(p, args['--ignore']))
        # BLEU-style metrics run once over realizations (g-result) and
        # once over rephrases (r-result), when those tables have rows
        for flag, metric in (('--bleu', bleu),
                             ('--oracle-bleu', oracle_bleu)):
            if args[flag] and p.size('g-result') > 0:
                update_stats(p_stats, metric(p, 'realizations'))
            if args[flag] and p.size('r-result') > 0:
                update_stats(p_stats, metric(p, 'rephrases'))
        # if args['--meteor']:
        #     update_stats(p_stats, meteor(p))
        if not args['--summary-only']:
            print(format_eval(itemdir, p_stats, args))
        update_stats(stats, p_stats)
    if numitems > 1 or args['--summary-only']:
        print(format_eval('Summary', stats, args))
def read_profile(f, args):
    """Load MRS from tsdb. (Copied from mrs_to_penman.py)"""
    p = itsdb.ItsdbProfile(f)
    # i-id -> i-input lookup for the yielded tuples
    inputs = dict((r['i-id'], r['i-input']) for r in p.read_table('item'))
    cur_id, mrss = None, []
    # rows are assumed grouped by parse:i-id; a change of id closes the
    # current group and yields it
    for row in p.join('parse', 'result'):
        try:
            mrs = simplemrs.loads_one(row['result:mrs'])
            if cur_id is None:
                cur_id = row['parse:i-id']
            if cur_id == row['parse:i-id']:
                mrss.append(mrs)
            else:
                yield (cur_id, inputs[cur_id], mrss)
                cur_id, mrss = row['parse:i-id'], [mrs]
        except Exception as ex:
            # NOTE(review): after this branch mrss stays None, so the
            # .append above raises AttributeError on every following row
            # of the same item and lands back here; also `ex` is unused
            # and the message does not include it — confirm intended
            print('Could not read profile from file {}, row: {}\n'.format(
                f, row))
            mrss = None  # error case, must be handled by caller
    # final group (or None after an error); callers must check for None
    yield (cur_id, inputs[cur_id], mrss)
def init(args):
    """Initialize workspace DIR: write default.conf and one profile per ITEM."""
    d = args['DIR']
    prepare_workspace_dir(d)
    config = ConfigParser()
    # read any existing config, then overlay the built-in defaults and
    # the command-line arguments
    config.read(os.path.join(d, 'default.conf'))
    config['DEFAULT'] = dict(default_config['DEFAULT'])
    util._update_config(config['DEFAULT'], args, None)
    # per-task sections; task-specific CLI strings are re-parsed with the
    # shared OPTS_USAGE grammar
    for task in ('parse', 'transfer', 'generate', 'rephrase'):
        config.setdefault(task, default_config.get(task, {}))
        if args['--' + task]:
            argv = shlex.split(args['--' + task])
            taskargs = docopt(OPTS_USAGE, argv=argv)
            util._update_config(config[task], taskargs, task)
    # default rephrase grammar to parse grammar
    if 'grammar' not in config['rephrase'] and 'grammar' in config['parse']:
        config['rephrase']['grammar'] = config['parse']['grammar']
    # build one profile directory per ITEM
    for item in args['ITEM']:
        item = os.path.normpath(item)
        rows = item_rows(item, args['--reverse'])
        itemdir = _unique_pathname(d, os.path.basename(item))
        os.makedirs(itemdir)
        with open(os.path.join(itemdir, 'relations'), 'w') as fh:
            print(relations_string, file=fh)
        p = itsdb.ItsdbProfile(itemdir)
        p.write_table('item', rows, gzip=True)
        # --full also imports existing parse results from the source item
        if args['--full']:
            info, results = _parse_tables(item)
            p.write_table('p-info', info, gzip=True)
            p.write_table('p-result', results, gzip=True)
    with open(os.path.join(d, 'default.conf'), 'w') as fh:
        config.write(fh)
def do(taskname, args):
    """Run the named task (parse/transfer/generate/rephrase) over each ITEM
    profile with ACE, buffering result rows and appending them to the
    task's -info and -result tables.
    """
    task = tasks[taskname]
    infotbl = task.prefix + '-info'
    rslttbl = task.prefix + '-result'
    numitems = len(args['ITEM'])
    width = len(str(numitems))
    for i, itemdir in enumerate(args['ITEM']):
        itemdir = os.path.normpath(itemdir)
        logging.info('{0} {1:{2}d}/{3} {4}'.format(taskname.title(), i + 1,
                                                   width, numitems, itemdir))
        # snapshot the effective config next to the profile for
        # reproducibility
        config = _item_config(taskname, itemdir, args)
        with open(os.path.join(itemdir, 'run.conf'), 'w') as fh:
            config.write(fh)
        task_conf = config[taskname]
        n = task_conf.getint('num-results', -1)  # -1 = keep all results
        bufsize = task_conf.getint('result-buffer-size', fallback=500)
        p = itsdb.ItsdbProfile(itemdir)
        # clear previous files
        _clear_itsdb_file(p.root, infotbl, True)
        _clear_itsdb_file(p.root, rslttbl, True)
        with task.processor(os.path.expanduser(task_conf['grammar']),
                            executable=task_conf['ace-bin'],
                            cmdargs=task.cmdargs + _get_cmdargs(task_conf),
                            tsdbinfo=task.tsdbinfo) as ap:
            inforows = []
            resultrows = []
            for row in p.read_table(task.in_table):
                logging.debug('Process: {}\t{}'.format(
                    '|'.join(row[f] for f in task.id_fields),
                    row[task.in_field]))
                response = ap.interact(row[task.in_field])
                logging.debug(' {} results'.format(len(response['results'])))
                source_ids = [(f, row[f]) for f in task.id_fields]
                # per-input timing/memory info; -1 when ACE omits a value
                inforows.append(
                    dict(source_ids +
                         [('time', int(response.get('tcpu', -1))),
                          ('memory', int(response.get('others', -1)))]))
                # NOTE(review): this inner `i` shadows the outer item
                # counter; harmless here (outer i is re-bound each
                # iteration) but worth renaming
                for i, result in enumerate(response.results()[:n]):
                    score = -1.0  # default when no :probability flag
                    for attr, val in result.get('flags', []):
                        if attr == ':probability':
                            score = float(val)
                    resultrows.append(
                        dict(source_ids +
                             [(f, result[f]) for f in task.out_fields] +
                             [(task.prefix + '-id', i), ('score', score)]))
                # flush buffered rows to disk to bound memory use
                if len(resultrows) >= bufsize:
                    logging.debug('Writing intermediate results to disk.')
                    p.write_table(infotbl, inforows, append=True, gzip=True)
                    p.write_table(rslttbl, resultrows, append=True, gzip=True)
                    inforows = []
                    resultrows = []
            # write remaining data; also gzip at this time
            p.write_table(infotbl, inforows, append=True, gzip=True)
            p.write_table(rslttbl,
                          resultrows, append=True, gzip=True)
golddir = '%s/tsdb/gold' % grmdir typefreq = dd(int) # typefreq[type] = freq lexfreq = dd(lambda: dd(int)) # lexfreq[lexid][surf] = freq lxidfreq = dd(lambda: dd(int)) # lxidfreq[typ][lexid] = freq typind = dd(lambda: dd(set)) # typind[type][sid]((frm, to), ...) sent = dd(list) # sent[sid][(surf, lexid)] pname = dict() # pname[sid]=profile roots = dd(lambda: 'rootless') allroots = set() for root, dirs, files in os.walk(golddir): ### find valid profiles if 'result' in files or 'result.gz' in files: # if 'mrs' not in root: ## debug # continue print("Processing %s" % root, file=sys.stderr) profile = itsdb.ItsdbProfile(root) head, profname = os.path.split(root) for row in profile.read_table('result'): pid = row['parse-id'] deriv = row['derivation'] ##print(pid, '\t', deriv) ##print('\n\n') ### Leaves (store as both type and token) ### lexemes, lexical types m = re.findall(mlex, deriv) lexids = set() if m: #print('leaves') #print(m) wid = 0 for (lexid, surf) in m:
def convert(args):
    """
    Convert between various MRS codecs or to export formats.
    """
    from delphin.mrs import (simplemrs, mrx, dmrx, eds, simpledmrs, penman)
    from delphin.extra import latex
    # codec name -> (loads, dumps); None marks an unsupported direction
    codecs = dict([
        ('simplemrs', (simplemrs.loads, simplemrs.dumps)),
        ('mrx', (mrx.loads, mrx.dumps)),
        ('dmrx', (dmrx.loads, dmrx.dumps)),
        ('eds', (eds.loads, eds.dumps)),
        ('mrs-json', (_mrs_json.loads, _mrs_json.dumps)),
        ('dmrs-json', (_dmrs_json.loads, _dmrs_json.dumps)),
        ('eds-json', (_eds_json.loads, _eds_json.dumps)),
        ('dmrs-penman', (partial(penman.loads, model=xmrs.Dmrs),
                         partial(penman.dumps, model=xmrs.Dmrs))),
        ('eds-penman', (partial(penman.loads, model=eds.Eds),
                        partial(penman.dumps, model=eds.Eds))),
        ('simpledmrs', (None, simpledmrs.dumps)),
        ('dmrs-tikz', (None, latex.dmrs_tikz_dependency))
    ])
    decoders = set(k for k, cd in codecs.items() if cd[0])
    encoders = set(k for k, cd in codecs.items() if cd[1])
    # arg validation
    if args['--from'] not in decoders:
        sys.exit('Source format must be one of: {}'.format(', '.join(
            sorted(decoders))))
    if args['--to'] not in encoders:
        # fix: this message previously said "Source format" (copy-paste)
        sys.exit('Target format must be one of: {}'.format(', '.join(
            sorted(encoders))))
    if args['--from'].startswith('eds') and not args['--to'].startswith('eds'):
        sys.exit('Conversion from EDS to non-EDS currently not supported.')
    args['--color'] = (args['--color'] == 'always'
                       or (args['--color'] == 'auto' and sys.stdout.isatty()))
    if args['--indent']:
        args['--pretty-print'] = True
        if args['--indent'].isdigit():
            args['--indent'] = int(args['--indent'])
    # read
    loads = codecs[args['--from']][0]
    if args['PATH'] is not None:
        if os.path.isdir(args['PATH']):
            # a profile directory: take the first Xmrs of each result row
            p = itsdb.ItsdbProfile(args['PATH'])
            xs = [
                next(iter(loads(r[0])), None)
                for r in p.select('result', ['mrs'])
            ]
        else:
            # fix: close the input file deterministically
            with open(args['PATH'], 'r') as fh:
                xs = loads(fh.read())
    else:
        xs = loads(sys.stdin.read())
    # write
    dumps = codecs[args['--to']][1]
    kwargs = {}
    if args['--color']:
        kwargs['color'] = args['--color']
    if args['--pretty-print']:
        kwargs['pretty_print'] = args['--pretty-print']
    if args['--indent']:
        kwargs['indent'] = args['--indent']
    try:
        print(dumps(xs, **kwargs))
    except TypeError:
        # the chosen encoder does not accept one of the styling kwargs
        sys.exit('One or more parameters to {} are not supported: {}'.format(
            args['--to'], ', '.join(kwargs)))
# skip omitted preds if normpred == 'nominalization' or normpred.endswith('unknown'): continue # combine named with CARG value if normpred == 'named': pred = 'nmd_"{}"'.format(str(ep.carg or '')) # normalize verbs if ep.pred.pos == 'v': # mark if nominalized if ep.lnk in nmz_locs: pred = 'nmz_' + pred # add argument info pred += '@' + valency[ep.nodeid] preds.append(pred) yield (int(i_id), ' '.join(preds)) with open(args.source_preds, 'w') as src, \ open(args.target_preds, 'w') as tgt: source_profile = itsdb.ItsdbProfile(args.source_profile) target_profile = itsdb.ItsdbProfile(args.target_profile) sourcemrs = dict(pred_strings(source_profile)) targetmrs = dict(pred_strings(target_profile)) # only print pred strings where they exist in both sides for id_ in set(sourcemrs).intersection(targetmrs): print(sourcemrs[id_], file=src) print(targetmrs[id_], file=tgt)