Example #1
def test_ItsdbProfile(empty_profile, single_item_skeleton,
                      single_item_profile):
    p = itsdb.ItsdbProfile(empty_profile)
    # tests
    p = itsdb.ItsdbProfile(single_item_skeleton)
    # tests
    p = itsdb.ItsdbProfile(single_item_profile)
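The fixtures here come from the test suite; the constructor call is the same in every example on this page. For orientation, a minimal sketch (with a hypothetical profile path) of the read methods these examples rely on:

from delphin import itsdb

p = itsdb.ItsdbProfile('path/to/profile')  # hypothetical path

# whole rows from a single table
for row in p.read_table('item'):
    print(row['i-id'], row['i-input'])

# selected columns only
for i_id, i_input in p.select('item', ['i-id', 'i-input']):
    print(i_id, i_input)

# joined tables expose 'table:column' keys
for row in p.join('parse', 'result'):
    print(row['parse:i-id'], row['result:mrs'])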
Example #2
def do(args):
    join_table = 'g-result'
    hyp_spec = 'g-result:surface'
    ref_spec = 'item:i-translation'
    if args['--rephrasing']:
        join_table = 'r-result'
        hyp_spec = 'r-result:surface'
        ref_spec = 'item:i-input'
    make_hyp = make_ref = lambda s: s
    if args['--tokenize']:
        make_hyp = make_ref = lambda s: ' '.join(_tokenize(s))

    select = select_oracle if args['--oracle-bleu'] else select_first

    for i, itemdir in enumerate(args['ITEM']):
        itemdir = os.path.normpath(itemdir)
        p = itsdb.ItsdbProfile(itemdir)
        if args['--item-id']:
            for i_id, hyp, ref in select(p,
                                         join_table,
                                         hyp_spec,
                                         ref_spec,
                                         with_id=True):
                print('{}\t{}\t{}'.format(i_id, make_hyp(hyp), make_ref(ref)))
        else:
            for hyp, ref in select(p, join_table, hyp_spec, ref_spec):
                print('{}\t{}'.format(make_hyp(hyp), make_ref(ref)))
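select_first and select_oracle are defined elsewhere in this project; a rough, hypothetical sketch of the interface the loop above expects (first result per item, with the column specs indexing the joined row) might look like:

def select_first(p, join_table, hyp_spec, ref_spec, with_id=False):
    # hypothetical reconstruction: one (hyp, ref) pair per item,
    # taking the first row of the result table for each i-id
    seen = set()
    for row in p.join('item', join_table):
        i_id = row['item:i-id']
        if i_id in seen:
            continue
        seen.add(i_id)
        hyp, ref = row[hyp_spec], row[ref_spec]
        yield (i_id, hyp, ref) if with_id else (hyp, ref)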
Example #3
File: main.py Project: goodmami/xmt
def _parse_tables(item):
    info, results = [], []
    makeinfo = lambda a, b, c: {'i-id': a, 'time': b, 'memory': c}
    makeresult = lambda a, b, c, d, e: {
        'i-id': a,
        'p-id': b,
        'derivation': c,
        'mrs': d,
        'score': e
    }
    if os.path.isdir(item):
        p = itsdb.ItsdbProfile(item)
        fn = os.path.join(p.root, 'parse')
        if os.path.isfile(fn) or os.path.isfile(fn + '.gz'):
            for row in p.read_table('parse'):
                info.append(makeinfo(row['i-id'], row['total'], row['others']))
        fn = os.path.join(p.root, 'result')
        if os.path.isfile(fn) or os.path.isfile(fn + '.gz'):
            for row in p.join('parse', 'result'):
                results.append(
                    makeresult(
                        row['parse:i-id'],
                        row['result:result-id'],
                        row['result:derivation'],
                        row['result:mrs'],
                        '1.0'  # for now
                    ))
    else:
        raise ValueError('Only profiles allowed with --full: ' + str(item))
    return info, results
Example #4
File: main.py Project: goodmami/xmt
def item_rows(item, reverse=False):
    data = []
    if os.path.isdir(item):
        p = itsdb.ItsdbProfile(item)
        output_fn = os.path.join(p.root, 'output')
        if ((os.path.isfile(output_fn) or os.path.isfile(output_fn + '.gz'))
                and len(list(p.read_table('output'))) > 0):
            for row in p.join('item', 'output'):
                data.append((row['item:i-id'], row['item:i-input'],
                             row['output:o-surface']))
        else:
            data.extend(p.select('item', ['i-id', 'i-input', 'i-translation']))
    elif os.path.isfile(item):
        for i, line in enumerate(open(item)):
            src, tgt = line.split('\t', 1)
            data.append(((i + 1) * 10, src.rstrip(), tgt.rstrip()))
    else:
        raise ValueError('Invalid item: ' + str(item))

    rows = []
    for i_id, src, tgt in data:
        if reverse:
            src, tgt = tgt, src
        rows.append({
            'i-id': i_id,
            'i-input': src,
            'i-length': len(src.split()),
            'i-translation': tgt
        })
    return rows
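When item is a plain text file rather than a profile directory, the expected format is one tab-separated source/target pair per line, and i-ids are assigned in steps of 10. A hypothetical call:

# items.txt (hypothetical), one tab-separated pair per line:
#   The dog barks.<TAB>Der Hund bellt.
rows = item_rows('items.txt')
# -> [{'i-id': 10, 'i-input': 'The dog barks.', 'i-length': 3,
#      'i-translation': 'Der Hund bellt.'}]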
Example #5
def read_profile(f, args):
    p = itsdb.ItsdbProfile(f)
    mrs_dataspec = args
    inputs = dict((r['i-id'], r['i-input']) for r in p.read_table('item'))
    cur_id, mrss = None, []

    if p.exists('p-result'):
        rows = p.read_table('p-result')
        id_spec = 'i-id'
        mrs_spec = 'mrs'
    else:
        rows = p.join('parse', 'result')
        id_spec = 'parse:i-id'
        mrs_spec = 'result:mrs'

    for row in rows:
        mrs = simplemrs.loads_one(row[mrs_spec])

        if cur_id is None:
            cur_id = row[id_spec]

        if cur_id == row[id_spec]:
            mrss.append(mrs)
        else:
            yield (cur_id, inputs[cur_id], mrss)
            cur_id, mrss = row[id_spec], [mrs]

    if mrss:
        yield (cur_id, inputs[cur_id], mrss)
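Note that the args parameter is stored but not otherwise used in this version. As a generator, read_profile yields one triple per item; a hypothetical driver loop:

for i_id, i_input, mrss in read_profile('path/to/profile', None):
    # the i-id, the item's input string, and all of its MRSs
    print(i_id, i_input, len(mrss))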
Example #6
def mkprof(args):
    """
    Create [incr tsdb()] profiles or skeletons.
    """
    outdir = args['DEST']
    if args['--in-place']:
        if args['--skeleton']:
            sys.exit('Creating a skeleton with --in-place is not allowed.')
        args['--source'] = args['DEST']
    if args['--source']:  # input is profile
        p = _prepare_input_profile(args['--source'],
                                   filters=args['--filter'],
                                   applicators=args['--apply'])
        relations = args['--relations'] or os.path.join(p.root, 'relations')
    else:  # input is stdin or txt file
        p = None
        relations = args['--relations']

    if not os.path.isfile(relations):
        sys.exit('Invalid relations file: {}'.format(relations))

    if args['--full']:
        _prepare_output_directory(outdir)
        p.write_profile(outdir,
                        relations_filename=relations,
                        key_filter=True,
                        gzip=args['--gzip'])
    else:
        if p is not None:
            rows = p.read_table('item')
        elif args['--input'] is not None:
            rows = _lines_to_rows(open(args['--input']))
        else:
            rows = _lines_to_rows(sys.stdin)
        p = itsdb.make_skeleton(outdir, relations, rows, gzip=args['--gzip'])
        # unless a skeleton was requested, make empty files for other tables
        if not args['--skeleton']:
            for tbl in p.relations:
                p.write_table(tbl, [])

    # summarize what was done
    if sys.stdout.isatty():
        _red = lambda s: '\x1b[1;31m{}\x1b[0m'.format(s)
    else:
        _red = lambda s: s
    fmt = '{:>8} bytes\t{}'
    prof = itsdb.ItsdbProfile(outdir, index=False)
    relations = prof.relations
    tables = [tbl for i, tbl in sorted(enumerate(relations))]
    for filename in ['relations'] + tables:
        f = os.path.join(outdir, filename)
        if os.path.isfile(f):
            stat = os.stat(f)
            print(fmt.format(stat.st_size, filename))
        elif os.path.isfile(f + '.gz'):
            stat = os.stat(f + '.gz')
            print(fmt.format(stat.st_size, _red(filename + '.gz')))
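Judging from the docopt keys above, two typical option combinations would be --relations FILE --input FILE DEST (build a skeleton from plain sentences) and --source PROFILE --full DEST (copy a full profile, optionally with --gzip); the exact command-line syntax depends on the surrounding CLI wrapper, which is not shown here.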
Example #7
def _prepare_input_profile(path, filters=None, applicators=None):
    flts = [_make_itsdb_action(*f.split('=', 1)) for f in (filters or [])]
    apls = [_make_itsdb_action(*f.split('=', 1)) for f in (applicators or [])]
    index = len(flts) > 0
    prof = itsdb.ItsdbProfile(path,
                              filters=flts,
                              applicators=apls,
                              index=index)
    return prof
Example #8
def test_select_rows(single_item_profile):
    p = itsdb.ItsdbProfile(single_item_profile)
    # assert list(itsdb.select_rows(None, p.read_table('item'))) == [['0', 'The dog barks.']]
    assert list(
        itsdb.select_rows(['i-id', 'i-input'],
                          p.read_table('item'))) == [['0', 'The dog barks.']]
    assert list(
        itsdb.select_rows(['item:i-id', 'parse:parse-id'],
                          p.join('item', 'parse'))) == [['0', '0']]
Example #9
File: check.py Project: jomof/jacy
def show_cov(profile):
    newprof = itsdb.ItsdbProfile(profile, index=False)
    newtotal = 0
    newparsed = 0
    for row in newprof.read_table('parse'):
        readings = int(row.get('readings'))
        newtotal += 1
        if readings > 0:
            newparsed += 1

    print("Coverage for {}:\n\t {}/{} ({}%)".format(profile,
                                                    newparsed, newtotal,
                                                    100.0 * newparsed / newtotal))
Example #10
def main():
    args = docopt.docopt(USAGE)

    p = itsdb.ItsdbProfile(args['PROFILE'])

    if args['--rephrases']:
        refs = dict(p.select('item', ('i-id', 'i-input')))
    else:
        refs = dict(p.select('item', ('i-id', 'i-translation')))

    if args['--oracle']:
        pairs = oracle_pairs(p, refs)
    else:
        pairs = single_pairs(p, refs)

    print('{:4.2f}'.format(bleu(pairs) * 100))
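oracle_pairs and single_pairs are not shown; what bleu() evidently consumes is an iterable of (hypothesis, reference) string pairs, and its return value lies in [0, 1], since it is scaled by 100 before printing. A minimal hypothetical example of that data shape:

pairs = [('the dog barks .', 'the dog barks .'),
         ('a cat sleeps .', 'the cat is sleeping .')]
print('{:4.2f}'.format(bleu(pairs) * 100))  # same formatting as above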
Example #11
File: check.py Project: jomof/jacy
def check_mrs(profile):
    prof = itsdb.ItsdbProfile(profile, index=False)
    warnings.simplefilter('error', XmrsWarning)
    inp = dict()
    warned = set()
    for row in prof.read_table('item'):
        inp[row.get('i-id')] = row.get('i-input')
    for row in prof.read_table('result'):
        pid, mrs = row.get('parse-id'), row.get('mrs')
        try:
            simplemrs.loads(mrs, single=True)
        except XmrsWarning as w:
            if pid not in warned:
                print('\nItem: {}\t{}'.format(pid, inp[pid]))
                warned.add(pid)
            print('Warning: {}!'.format(w))
    print("Bad MRS for {}:\n\t {}/{} ({}%)".format(profile,
                                                   len(warned), len(inp),
                                                   100.0 * len(warned) / len(inp)))
Example #12
def prof_entries(prof,
                 typemap,
                 lexmap,
                 table='result',
                 cols=('derivation', 'mrs')):
    p = itsdb.ItsdbProfile(prof)
    seen = set()
    for derivation, mrs in p.select(table, cols):
        d = Derivation.from_string(derivation)
        for entity, typ, form in _derivation_les(d):
            if typ is None:
                typ = lexmap.get(entity)
            orth = ', '.join('"{}"'.format(part) for part in form)
            if (typ, orth) not in seen and typ in typemap:
                supertype = typemap[typ][0]  # more than 1?
                lename = '+'.join(form) + '-' + supertype
                pred = None
                print(lename, supertype, orth, pred, None)
                yield (lename, supertype, orth, pred, None)
                seen.add((typ, orth))
Example #13
def do(args):
    if args['--all']:
        args['--coverage'] = True
        args['--bleu'] = True
        args['--oracle-bleu'] = True
        # args['--meteor'] = True

    numitems = len(args['ITEM'])
    width = len(str(numitems))
    stats = {}

    for i, itemdir in enumerate(args['ITEM']):
        itemdir = os.path.normpath(itemdir)
        logging.info('Evaluate {0:{1}d}/{2} {3}'.format(
            i + 1, width, numitems, itemdir))
        p = itsdb.ItsdbProfile(itemdir)
        p_stats = {}
        if args['--coverage']:
            update_stats(p_stats, coverage(p, args['--ignore']))

        if args['--bleu'] and p.size('g-result') > 0:
            update_stats(p_stats, bleu(p, 'realizations'))
        if args['--bleu'] and p.size('r-result') > 0:
            update_stats(p_stats, bleu(p, 'rephrases'))

        if args['--oracle-bleu'] and p.size('g-result') > 0:
            update_stats(p_stats, oracle_bleu(p, 'realizations'))
        if args['--oracle-bleu'] and p.size('r-result') > 0:
            update_stats(p_stats, oracle_bleu(p, 'rephrases'))

        # if args['--meteor']:
        #     update_stats(p_stats, meteor(p))

        if not args['--summary-only']:
            print(format_eval(itemdir, p_stats, args))

        update_stats(stats, p_stats)

    if numitems > 1 or args['--summary-only']:
        print(format_eval('Summary', stats, args))
Example #14
def read_profile(f, args):
    """Load MRS from tsdb. (Copied from mrs_to_penman.py)"""
    p = itsdb.ItsdbProfile(f)
    inputs = dict((r['i-id'], r['i-input']) for r in p.read_table('item'))
    cur_id, mrss = None, []
    for row in p.join('parse', 'result'):
        try:
            mrs = simplemrs.loads_one(row['result:mrs'])

            if cur_id is None:
                cur_id = row['parse:i-id']

            if cur_id == row['parse:i-id']:
                mrss.append(mrs)
            else:
                yield (cur_id, inputs[cur_id], mrss)
                cur_id, mrss = row['parse:i-id'], [mrs]
        except Exception as ex:
            print('Could not read profile from file {}, row: {}\n'.format(
                f, row))
            mrss = None  # error case, must be handled by caller

    yield (cur_id, inputs[cur_id], mrss)
Example #15
File: main.py Project: goodmami/xmt
def init(args):
    d = args['DIR']

    prepare_workspace_dir(d)
    config = ConfigParser()
    config.read(os.path.join(d, 'default.conf'))

    config['DEFAULT'] = dict(default_config['DEFAULT'])
    util._update_config(config['DEFAULT'], args, None)

    for task in ('parse', 'transfer', 'generate', 'rephrase'):
        config.setdefault(task, default_config.get(task, {}))
        if args['--' + task]:
            argv = shlex.split(args['--' + task])
            taskargs = docopt(OPTS_USAGE, argv=argv)
            util._update_config(config[task], taskargs, task)

    # default rephrase grammar to parse grammar
    if 'grammar' not in config['rephrase'] and 'grammar' in config['parse']:
        config['rephrase']['grammar'] = config['parse']['grammar']

    for item in args['ITEM']:
        item = os.path.normpath(item)
        rows = item_rows(item, args['--reverse'])
        itemdir = _unique_pathname(d, os.path.basename(item))
        os.makedirs(itemdir)
        with open(os.path.join(itemdir, 'relations'), 'w') as fh:
            print(relations_string, file=fh)
        p = itsdb.ItsdbProfile(itemdir)
        p.write_table('item', rows, gzip=True)
        if args['--full']:
            info, results = _parse_tables(item)
            p.write_table('p-info', info, gzip=True)
            p.write_table('p-result', results, gzip=True)

    with open(os.path.join(d, 'default.conf'), 'w') as fh:
        config.write(fh)
Example #16
File: task.py Project: goodmami/xmt
def do(taskname, args):
    task = tasks[taskname]
    infotbl = task.prefix + '-info'
    rslttbl = task.prefix + '-result'
    numitems = len(args['ITEM'])
    width = len(str(numitems))

    for i, itemdir in enumerate(args['ITEM']):
        itemdir = os.path.normpath(itemdir)
        logging.info('{0} {1:{2}d}/{3} {4}'.format(taskname.title(), i + 1,
                                                   width, numitems, itemdir))
        config = _item_config(taskname, itemdir, args)
        with open(os.path.join(itemdir, 'run.conf'), 'w') as fh:
            config.write(fh)
        task_conf = config[taskname]
        n = task_conf.getint('num-results', -1)
        bufsize = task_conf.getint('result-buffer-size', fallback=500)

        p = itsdb.ItsdbProfile(itemdir)
        # clear previous files
        _clear_itsdb_file(p.root, infotbl, True)
        _clear_itsdb_file(p.root, rslttbl, True)

        with task.processor(os.path.expanduser(task_conf['grammar']),
                            executable=task_conf['ace-bin'],
                            cmdargs=task.cmdargs + _get_cmdargs(task_conf),
                            tsdbinfo=task.tsdbinfo) as ap:

            inforows = []
            resultrows = []
            for row in p.read_table(task.in_table):
                logging.debug('Process: {}\t{}'.format(
                    '|'.join(row[f] for f in task.id_fields),
                    row[task.in_field]))

                response = ap.interact(row[task.in_field])
                logging.debug('  {} results'.format(len(response['results'])))

                source_ids = [(f, row[f]) for f in task.id_fields]

                inforows.append(
                    dict(source_ids +
                         [('time', int(response.get('tcpu', -1))),
                          ('memory', int(response.get('others', -1)))]))

                for i, result in enumerate(response.results()[:n]):
                    score = -1.0
                    for attr, val in result.get('flags', []):
                        if attr == ':probability':
                            score = float(val)
                    resultrows.append(
                        dict(source_ids + [(f, result[f])
                                           for f in task.out_fields] +
                             [(task.prefix + '-id', i), ('score', score)]))

                if len(resultrows) >= bufsize:
                    logging.debug('Writing intermediate results to disk.')
                    p.write_table(infotbl, inforows, append=True, gzip=True)
                    p.write_table(rslttbl, resultrows, append=True, gzip=True)
                    inforows = []
                    resultrows = []

            # write remaining data; also gzip at this time
            p.write_table(infotbl, inforows, append=True, gzip=True)
            p.write_table(rslttbl, resultrows, append=True, gzip=True)
Example #17
golddir = '%s/tsdb/gold' % grmdir
typefreq = dd(int)  # typefreq[type] = freq
lexfreq = dd(lambda: dd(int))  # lexfreq[lexid][surf] = freq
lxidfreq = dd(lambda: dd(int))  # lxidfreq[typ][lexid] = freq
typind = dd(lambda: dd(set))  # typind[type][sid]((frm, to), ...)
sent = dd(list)  # sent[sid][(surf, lexid)]
pname = dict()  # pname[sid]=profile
roots = dd(lambda: 'rootless')
allroots = set()
for root, dirs, files in os.walk(golddir):
    ### find valid profiles
    if 'result' in files or 'result.gz' in files:
        # if 'mrs' not in root: ## debug
        #     continue
        print("Processing %s" % root, file=sys.stderr)
        profile = itsdb.ItsdbProfile(root)
        head, profname = os.path.split(root)
        for row in profile.read_table('result'):
            pid = row['parse-id']
            deriv = row['derivation']
            ##print(pid, '\t', deriv)
            ##print('\n\n')
            ### Leaves (store as both type and token)
            ### lexemes, lexical types
            m = re.findall(mlex, deriv)
            lexids = set()
            if m:
                #print('leaves')
                #print(m)
                wid = 0
                for (lexid, surf) in m:
Example #18
def convert(args):
    """
    Convert between various MRS codecs or to export formats.
    """
    from delphin.mrs import (simplemrs, mrx, dmrx, eds, simpledmrs, penman)
    from delphin.extra import latex
    codecs = dict([('simplemrs', (simplemrs.loads, simplemrs.dumps)),
                   ('mrx', (mrx.loads, mrx.dumps)),
                   ('dmrx', (dmrx.loads, dmrx.dumps)),
                   ('eds', (eds.loads, eds.dumps)),
                   ('mrs-json', (_mrs_json.loads, _mrs_json.dumps)),
                   ('dmrs-json', (_dmrs_json.loads, _dmrs_json.dumps)),
                   ('eds-json', (_eds_json.loads, _eds_json.dumps)),
                   ('dmrs-penman', (partial(penman.loads, model=xmrs.Dmrs),
                                    partial(penman.dumps, model=xmrs.Dmrs))),
                   ('eds-penman', (partial(penman.loads, model=eds.Eds),
                                   partial(penman.dumps, model=eds.Eds))),
                   ('simpledmrs', (None, simpledmrs.dumps)),
                   ('dmrs-tikz', (None, latex.dmrs_tikz_dependency))])
    decoders = set(k for k, cd in codecs.items() if cd[0])
    encoders = set(k for k, cd in codecs.items() if cd[1])

    # arg validation
    if args['--from'] not in decoders:
        sys.exit('Source format must be one of: {}'.format(', '.join(
            sorted(decoders))))
    if args['--to'] not in encoders:
        sys.exit('Target format must be one of: {}'.format(', '.join(
            sorted(encoders))))
    if args['--from'].startswith('eds') and not args['--to'].startswith('eds'):
        sys.exit('Conversion from EDS to non-EDS currently not supported.')
    args['--color'] = (args['--color'] == 'always'
                       or (args['--color'] == 'auto' and sys.stdout.isatty()))
    if args['--indent']:
        args['--pretty-print'] = True
        if args['--indent'].isdigit():
            args['--indent'] = int(args['--indent'])

    # read
    loads = codecs[args['--from']][0]
    if args['PATH'] is not None:
        if os.path.isdir(args['PATH']):
            p = itsdb.ItsdbProfile(args['PATH'])
            xs = [
                next(iter(loads(r[0])), None)
                for r in p.select('result', ['mrs'])
            ]
        else:
            xs = loads(open(args['PATH'], 'r').read())
    else:
        xs = loads(sys.stdin.read())

    # write
    dumps = codecs[args['--to']][1]
    kwargs = {}
    if args['--color']: kwargs['color'] = args['--color']
    if args['--pretty-print']: kwargs['pretty_print'] = args['--pretty-print']
    if args['--indent']: kwargs['indent'] = args['--indent']
    try:
        print(dumps(xs, **kwargs))
    except TypeError:
        sys.exit('One or more parameters to {} are not supported: {}'.format(
            args['--to'], ', '.join(kwargs)))
Example #19
            # skip omitted preds
            if normpred == 'nominalization' or normpred.endswith('unknown'):
                continue
            # combine named with CARG value
            if normpred == 'named':
                pred = 'nmd_"{}"'.format(str(ep.carg or ''))
            # normalize verbs
            if ep.pred.pos == 'v':
                # mark if nominalized
                if ep.lnk in nmz_locs:
                    pred = 'nmz_' + pred
                # add argument info
                pred += '@' + valency[ep.nodeid]
            preds.append(pred)

        yield (int(i_id), ' '.join(preds))


with open(args.source_preds, 'w') as src, \
     open(args.target_preds, 'w') as tgt:
    source_profile = itsdb.ItsdbProfile(args.source_profile)
    target_profile = itsdb.ItsdbProfile(args.target_profile)

    sourcemrs = dict(pred_strings(source_profile))
    targetmrs = dict(pred_strings(target_profile))

    # only print pred strings where they exist in both sides
    for id_ in set(sourcemrs).intersection(targetmrs):
        print(sourcemrs[id_], file=src)
        print(targetmrs[id_], file=tgt)
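For reference, the attribute names used above (args.source_profile, args.target_profile, args.source_preds, args.target_preds) suggest an argument parser roughly like this hypothetical sketch:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('source_profile')   # [incr tsdb()] profile, source side
parser.add_argument('target_profile')   # [incr tsdb()] profile, target side
parser.add_argument('source_preds')     # output file for source pred strings
parser.add_argument('target_preds')     # output file for target pred strings
args = parser.parse_args()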