Example #1
0
def featurize_transitions(data,
                          mappings,
                          invmappings,
                          feat_file,
                          transsys,
                          log=None):
    """Convert (sentence, transition-sequence) examples into feature/label lists.

    If ``feat_file`` exists, the cached featurization is unpickled from it
    instead of being recomputed; otherwise the result is computed and, when
    ``feat_file`` is given, pickled back to it for reuse.

    Args:
        data: iterable of tuples whose first two items are (sent, trans).
        mappings: label -> id mapping dicts (must contain key 'rel').
        invmappings: id -> label inverse mappings.
        feat_file: optional pickle-cache path, or None to skip caching.
        transsys: transition-system *class*; instantiated here with the
            mappings before use.
        log: logger.  NOTE(review): despite the None default, a logger is
            required — log.info/log.debug are called unconditionally below.

    Returns:
        (res, max_feat_size) where res is a list of
        [feats, featsizes, labels] triples, one per input example.
    """
    if feat_file is not None and op.exists(feat_file):
        with smart_open(feat_file, 'rb') as f:
            res = pickle.load(f)
            max_feat_size = pickle.load(f)

        log.info('Read %d featurized examples from saved feature file "%s".' %
                 (len(res), feat_file))

        return res, max_feat_size

    log.info('Featurizing %d examples...' % (len(data)))
    max_feat_size = -1
    res = []

    count = 0
    transsys = transsys(mappings, invmappings)

    for example in data:
        sent, trans = example[:2]
        state = ParserState(sent, transsys=transsys)

        feats = []
        labels = []
        featsizes = []
        # BUGFIX: the inner loop previously reused the outer loop variable
        # name ``t``, shadowing the current example; renamed for safety.
        for tr in trans:
            feat, label = featurize_state(state, mappings, tr)
            transsys.advance(state, label)

            max_feat_size = max(max_feat_size, len(feat))
            feats.append(feat)
            labels.append(label)
            featsizes.append(len(feat))

        assert len(feats) == len(labels)
        res.append([feats, featsizes, labels])

        count += 1
        if count % 100 == 0:
            log.debug("Featurized %d examples..." % (count))

    assert len(res) == len(data)
    log.info("%d examples featurized, maximum feature size=%d" %
             (len(res), (max_feat_size - 1) * len(mappings['rel']) + 1))

    if feat_file is not None:
        log.info('Saving %d featurized examples to feature file "%s"...' %
                 (len(res), feat_file))
        with smart_open(feat_file, 'wb') as f:
            pickle.dump(res, f, pickle.HIGHEST_PROTOCOL)
            pickle.dump(max_feat_size, f, pickle.HIGHEST_PROTOCOL)

        log.info('Done.')

    return res, max_feat_size
Example #2
0
    def processlines(lines):
        """Build a [ParserState, pos, fpos] record from CoNLL token lines.

        Uses the enclosing scope's ``transsys`` for relation-label mapping.
        Index 0 of every per-token list is reserved for the <ROOT> token.

        Args:
            lines: list of CoNLL token rows; columns used are 3 (POS),
                4 (fine POS), 6 (head index), 7 (relation label).

        Returns:
            [state, pos, fpos] where ``state`` is the initial ParserState
            with gold arcs attached.
        """
        # arcs[parent][child] = relation id
        arcs = [dict() for i in range(len(lines) + 1)]

        pos = ["" for i in xrange(len(lines) + 1)]
        fpos = ["" for i in xrange(len(lines) + 1)]

        for i, line in enumerate(lines):
            pos[i + 1] = line[3]  # fine-grained
            fpos[i + 1] = line[4]
            parent = int(line[6])
            relation = line[7]
            arcs[parent][i + 1] = transsys.mappings['rel'][relation]

        res = [ParserState(["<ROOT>"] + lines, transsys=transsys, goldrels=arcs), pos]
        if fpos:
            res += [fpos]
        else:
            # BUGFIX: was `res == [None]`, a no-op comparison; append None so
            # the record always has three elements.  NOTE(review): `fpos` is
            # a non-empty list here, so this branch is currently unreachable.
            res += [None]
        return res
Example #3
0
def eval(args):
    """Beam-search evaluation loop for saved arc-based parser checkpoints.

    For every saved epoch checkpoint found in args.model_dir (iterated from
    the newest epoch backwards), restores the model, decodes every sentence
    of the evaluation data with beam search, and writes two files per epoch:
    predicted POS tags (outf2) and predicted head/relation pairs (outf).

    NOTE(review): Python 2 code (xrange, izip, integer division in the batch
    count) — porting to Python 3 needs more than renames.
    """
    # Map the CLI transition-system code to its implementing class.
    transsys_lookup = {"ASw": ArcSwift,
                       "AER" : ArcEagerReduce,
                       "AES": ArcEagerShift,
                       "ASd"  : ArcStandard,
                       "AH"  : ArcHybrid,}
    transsys = transsys_lookup[args.transsys]

    # Load vocabulary/embeddings, label mappings and the evaluation data.
    vocab, vecs, pretrained = read_vocab(conll_file=args.conll_file, wordvec_file=args.wordvec_file, vocab_file=args.vocab_file, wordvec_dim=args.wordvec_dim, min_count=args.min_count, log=log)
    mappings, invmappings = read_mappings(args.mappings_file, transsys, log=log)
    data, sent_length, trans_length = read_data(conll_file=args.conll_file, seq_file=args.seq_file, vocab=vocab, mappings=mappings, transsys=transsys, fpos=args.fpos, log=log)

    # ASw pads variable-length feature vectors to sent_length; the other
    # systems use a fixed 5-slot feature vector.
    feat_shape = [5] if args.transsys != 'ASw' else [sent_length, 5]

    # Instantiate the transition-system class selected above.
    transsys = transsys(mappings, invmappings)

    parser = Parser(args, vecs, pretrained, mappings, invmappings, sent_length, trans_length, -1, log, train=False)

    trans_predictors = parser.trans_predictors

    log.info('Computational graph successfully built.')
    log.info('Setting up tensorflow session...')

    saver = tf.train.Saver(max_to_keep=10000)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    sess = tf.Session(config=config)
    # Evaluate each saved checkpoint, newest epoch first.
    for epoch in reversed(xrange(int(args.epochs * args.epoch_multiplier))):
        #with tf.Session(config=config) as sess:
        savedpath = '%s/model_epoch%d' % (args.model_dir, epoch)
        if not op.exists(savedpath + '.meta'):
            continue
        log.info('Evaluating Epoch %3d...' % (epoch))
        saver.restore(sess, savedpath)

        # One beam per sentence; each beam entry is (cumulative score, state).
        states = [[(0, ParserState(datum[0], transsys=transsys))] for datum in data]
        with smart_open('%s/%s_pos_eval_beam_%d_output_epoch%d.txt' % (args.model_dir, args.eval_dataset, args.beam_size, epoch), 'w') as outf2:
            with smart_open('%s/%s_eval_beam_%d_output_epoch%d.txt' % (args.model_dir, args.eval_dataset, args.beam_size, epoch), 'w') as outf:
                # Integer division (Python 2) gives the ceil-divided batch count.
                for batch in xrange((len(data)+args.batch_size-1) / args.batch_size):
                    idx = range(batch * args.batch_size, min((batch+1) * args.batch_size, len(data)))

                    batch_size = len(idx)

                    batch_data = [data[i] for i in idx]
                    batch_states = [states[i] for i in idx]

                    # prepare data in tensor shape (rows beyond batch_size are
                    # padding for the final, possibly short, batch)
                    batch_sent_lengths = np.array([len(datum[0]) for datum in batch_data] + [sent_length] * (args.batch_size - batch_size), dtype=np.int32)
                    batch_words = np.zeros((args.batch_size, sent_length), dtype=np.int32)
                    batch_words2 = np.zeros((args.batch_size, sent_length), dtype=np.int32)
                    batch_gold_pos = np.zeros((args.batch_size, sent_length), dtype=np.int32)
                    for i in xrange(batch_size):
                        batch_words[i, :batch_sent_lengths[i]] = batch_data[i][0]
                        batch_words2[i, :batch_sent_lengths[i]] = batch_data[i][0]
                        batch_gold_pos[i, :batch_sent_lengths[i]] = batch_data[i][2]

                    # Flat (batch * beam) feature buffers reused every beam step.
                    batch_trans_feat_ids = np.zeros(tuple([args.batch_size * args.beam_size] + feat_shape), dtype=np.int32)
                    batch_trans_feat_sizes = np.zeros((args.batch_size * args.beam_size), dtype=np.int32)

                    # Fetch list order must match the unpacking order below.
                    preds_list = [parser.combined_head, parser.combined_dep, parser.pos_preds]
                    if args.transsys == 'ASw':
                        preds_list += [parser.transition_logit]
                    if args.fpos:
                        preds_list += [parser.fpos_preds]

                    preds = sess.run(preds_list,
                               feed_dict={parser.words: batch_words,
                                          parser.words2: batch_words2,
                                          parser.sent_lengths: batch_sent_lengths,
                                          parser.gold_pos: batch_gold_pos,})
                    # unpack predictions
                    batch_combined_head, batch_combined_dep, pos_preds = preds[:3]
                    preds = preds[3:]
                    if args.transsys == 'ASw':
                        batch_trans_logit = preds[0]
                        preds = preds[1:]
                    if args.fpos:
                        fpos_preds = preds[0]
                        preds = preds[1:]

                    # Write POS (and, when enabled, fine POS) predictions;
                    # length-1 skips the <ROOT> slot.
                    if args.fpos:
                        for i in xrange(batch_size):
                            for j in xrange(batch_sent_lengths[i]-1):
                                outf2.write("%s\t%s\n" % (invmappings['pos'][pos_preds[i][j]], invmappings['fpos'][fpos_preds[i][j]]))
                            outf2.write("\n")
                    else:
                        for i in xrange(batch_size):
                            for j in xrange(batch_sent_lengths[i]-1):
                                outf2.write("%s\t_\n" % invmappings['pos'][pos_preds[i][j]])
                            outf2.write("\n")

                    # j counts beam-search steps — NOTE(review): appears
                    # otherwise unused after this point.
                    j = 0
                    # Sentences whose beams changed in the last step.
                    updated = range(batch_size)
                    # Min-heaps of finished (score, state) entries per sentence.
                    batch_finished = [[] for _ in range(batch_size)]

                    feat_lengths = [[] for _ in range(batch_size)]

                    # Beam search: expand live beam entries until every
                    # sentence has only finished states.
                    while True:
                        batch_feats = [[featurize_state(batch_states[i][k][1], mappings) for k in range(len(batch_states[i]))] for i in updated]
                        for i, beam_feats in zip(updated, batch_feats):
                            feats = beam_feats[0]
                            if len(feats) > 0:
                                if args.transsys == 'ASw':
                                    feat_lengths[i] += [len(feats)]
                                else:
                                    feat_lengths[i] += [len(batch_states[i][0][1].transitionset())]

                        preds = []
                        predsid = []
                        for i, beam_feats in zip(updated, batch_feats):
                            for k, feats in enumerate(beam_feats):
                                if len(feats) <= 0:
                                    # Empty features mark a terminal state;
                                    # keep at most beam_size finished states
                                    # (heappushpop evicts the worst).
                                    if len(batch_finished[i]) < args.beam_size:
                                        heappush(batch_finished[i], batch_states[i][k])
                                    else:
                                        heappushpop(batch_finished[i], batch_states[i][k])

                                    continue

                                beamidx = i * args.beam_size + k
                                if args.transsys == 'ASw':
                                    batch_trans_feat_ids[beamidx, :len(feats)] = feats
                                else:
                                    batch_trans_feat_ids[beamidx] = feats

                                batch_trans_feat_sizes[beamidx] = len(feats)

                                assert(batch_trans_feat_sizes[beamidx] > 0)

                                predsid.append((i, k))
                                preds.append(trans_predictors[i][k])

                        # All beams finished => done with this batch.
                        if len(predsid) <= 0:
                            break

                        # ASw additionally feeds the transition logits.
                        if args.transsys == 'ASw':
                            p = sess.run(preds, feed_dict={parser.combined_head_placeholder: batch_combined_head,
                                                       parser.combined_dep_placeholder: batch_combined_dep,
                                                       parser.trans_logit_placeholder:batch_trans_logit,
                                                       parser.trans_feat_ids: batch_trans_feat_ids,
                                                       parser.trans_feat_sizes: batch_trans_feat_sizes})
                        else:
                            p = sess.run(preds, feed_dict={parser.combined_head_placeholder: batch_combined_head,
                                                       parser.combined_dep_placeholder: batch_combined_dep,
                                                       parser.trans_feat_ids: batch_trans_feat_ids,
                                                       parser.trans_feat_sizes: batch_trans_feat_sizes})

                        next_batchstates = [[] for _ in xrange(batch_size)]
                        updated = set()
                        for ik, pred in izip(predsid, p):
                            i, k = ik

                            updated.add(i)

                            if len(batch_states[i][k][1].transitionset()) > 0:
                                # model outputs NLLs so the lower the better
                                sort = sorted(enumerate(pred), key=lambda x: x[1])
                                expanded_beams = 0
                                for choice, score in sort:
                                    # Cumulative score accumulates negated NLLs.
                                    newscore = batch_states[i][k][0] - score

                                    # Only expand transitions legal in this state.
                                    if transsys.tuple_trans_from_int(batch_states[i][k][1].transitionset(), choice)[0] in batch_states[i][k][1].transitionset():
                                        candidate = (newscore, batch_states[i][k][1], choice)
                                        if len(next_batchstates[i]) < args.beam_size:
                                            heappush(next_batchstates[i], candidate)
                                        elif newscore > next_batchstates[i][0][0]:
                                            # Better than the current worst
                                            # (heap min) — replace it.
                                            heappushpop(next_batchstates[i], candidate)

                                        expanded_beams += 1
                                        if expanded_beams >= args.beam_size:
                                            break

                        # Apply each surviving candidate's transition to a
                        # clone so sibling beam entries stay independent.
                        for i in updated:
                            next_batchstates[i] = nlargest(args.beam_size, next_batchstates[i], key=lambda x:x[0])
                            for k, t in enumerate(next_batchstates[i]):
                                score, state, choice = t
                                state = state.clone()
                                transsys.advance(state, choice)
                                next_batchstates[i][k] = (score, state)

                        batch_states = next_batchstates

                        j += 1

                    # Emit the highest-scoring finished parse per sentence.
                    for i in xrange(batch_size):
                        # NOTE(review): this assert is loop-invariant (does
                        # not depend on i) — likely meant as a sanity check.
                        assert len(batch_finished) == batch_size
                        assert len(batch_finished[i]) > 0, "nothing finished: %d" % (i)
                        assert len(batch_finished[i][0]) > 1, "%s" % (batch_finished[i][0])
                        state_pred = nlargest(1, batch_finished[i], key=lambda x:x[0])[0][1]
                        for t in state_pred.head[1:]:
                            outf.write("%d\t%s\n" % (t[0], invmappings['rel'][t[1]]))
                        outf.write("\n")

                    log.info('Epoch %3d batch %4d' % (epoch, batch))
    sess.close()
def eval(args):
    """Beam-search evaluation loop for saved Covington parser checkpoints.

    Mirrors the arc-based eval() but for the Covington transition systems.
    Restores the most recent saved checkpoint in args.model_dir, decodes
    every sentence of the evaluation data with beam search, and writes two
    files: predicted POS tags (outf2) and predicted head/relation pairs
    (outf).  Unlike the arc-based eval(), this one deliberately stops after
    the first (latest) checkpoint it finds.

    NOTE(review): Python 2 code (xrange, izip, integer division in the
    batch count).
    """
    transsys_lookup = {
        "Cov": Covington,
        "NCov": NewCovington,
        "Cov2": Covington2,
        "Cov3": Covington3,
    }
    transsys = transsys_lookup[args.transsys]

    # Hoisted loop-invariant flag: these three variants feed a transition
    # logit into beam scoring (the same comparison was repeated five times).
    uses_trans_logit = args.transsys in ('NCov', 'Cov2', 'Cov3')

    # Load vocabulary/embeddings, label mappings and the evaluation data.
    vocab, vecs, pretrained = read_vocab(conll_file=args.conll_file,
                                         wordvec_file=args.wordvec_file,
                                         vocab_file=args.vocab_file,
                                         wordvec_dim=args.wordvec_dim,
                                         min_count=args.min_count,
                                         log=log)
    mappings, invmappings = read_mappings(args.mappings_file,
                                          transsys,
                                          log=log)
    data, sent_length, trans_length = read_data(conll_file=args.conll_file,
                                                seq_file=args.seq_file,
                                                vocab=vocab,
                                                mappings=mappings,
                                                transsys=transsys,
                                                fpos=args.fpos,
                                                log=log)

    if args.transsys == 'NCov':
        # Magic constant inherited from the training setup — TODO confirm.
        sent_length = 70

    # Plain Covington uses a fixed 5-slot feature vector; the other variants
    # pad a variable-length feature vector out to sent_length.
    feat_shape = [5] if args.transsys == 'Cov' else [sent_length, 5]

    # Instantiate the transition-system class selected above.
    transsys = transsys(mappings, invmappings)

    parser = Parser(args, vecs, pretrained, mappings, invmappings,
                    sent_length, trans_length, -1, log, train=False)

    trans_predictors = parser.trans_predictors

    log.info('Computational graph successfully built.')
    log.info('Setting up tensorflow session...')

    saver = tf.train.Saver(max_to_keep=10000)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    sess = tf.Session(config=config)
    # Scan epochs newest-first; only the first restorable one is evaluated.
    for epoch in reversed(xrange(int(args.epochs * args.epoch_multiplier))):
        savedpath = '%s/model_epoch%d' % (args.model_dir, epoch)
        if not op.exists(savedpath + '.meta'):
            continue
        log.info('Evaluating Epoch %3d...' % (epoch))
        saver.restore(sess, savedpath)

        # One beam per sentence; each beam entry is (cumulative score, state).
        states = [[(0, ParserState(datum[0], transsys=transsys))]
                  for datum in data]

        with smart_open('%s/%s_pos_eval_beam_%d_output_epoch%d.txt' %
                        (args.model_dir, args.eval_dataset, args.beam_size, epoch),
                        'w') as outf2:
            with smart_open('%s/%s_eval_beam_%d_output_epoch%d.txt' %
                            (args.model_dir, args.eval_dataset, args.beam_size, epoch),
                            'w') as outf:
                # Integer division (Python 2) gives the ceil-divided batch count.
                for batch in xrange((len(data) + args.batch_size - 1) / args.batch_size):
                    idx = range(batch * args.batch_size,
                                min((batch + 1) * args.batch_size, len(data)))

                    batch_size = len(idx)

                    batch_data = [data[i] for i in idx]
                    batch_states = [states[i] for i in idx]

                    # Pack the batch into fixed-shape tensors; rows beyond
                    # batch_size are padding for the final short batch.
                    batch_sent_lengths = np.array(
                        [len(datum[0]) for datum in batch_data] +
                        [sent_length] * (args.batch_size - batch_size),
                        dtype=np.int32)
                    batch_words = np.zeros((args.batch_size, sent_length), dtype=np.int32)
                    batch_words2 = np.zeros((args.batch_size, sent_length), dtype=np.int32)
                    batch_gold_pos = np.zeros((args.batch_size, sent_length), dtype=np.int32)
                    for i in xrange(batch_size):
                        batch_words[i, :batch_sent_lengths[i]] = batch_data[i][0]
                        batch_words2[i, :batch_sent_lengths[i]] = batch_data[i][0]
                        batch_gold_pos[i, :batch_sent_lengths[i]] = batch_data[i][2]

                    # Flat (batch * beam) feature buffers reused every beam step.
                    batch_trans_feat_ids = np.zeros(
                        tuple([args.batch_size * args.beam_size] + feat_shape),
                        dtype=np.int32)
                    batch_trans_feat_sizes = np.zeros(
                        (args.batch_size * args.beam_size), dtype=np.int32)

                    # Fetch list order must match the unpacking order below.
                    preds_list = [parser.combined_head, parser.combined_dep,
                                  parser.pos_preds]
                    if uses_trans_logit:
                        preds_list += [parser.transition_logit]
                    if args.fpos:
                        preds_list += [parser.fpos_preds]

                    preds = sess.run(preds_list,
                                     feed_dict={parser.words: batch_words,
                                                parser.words2: batch_words2,
                                                parser.sent_lengths: batch_sent_lengths,
                                                parser.gold_pos: batch_gold_pos,})
                    # Unpack predictions in the order they were requested.
                    batch_combined_head, batch_combined_dep, pos_preds = preds[:3]
                    preds = preds[3:]
                    if uses_trans_logit:
                        batch_trans_logit = preds[0]
                        preds = preds[1:]
                    if args.fpos:
                        fpos_preds = preds[0]
                        preds = preds[1:]

                    # Write POS (and optionally fine POS) predictions;
                    # length-1 skips the <ROOT> slot.
                    if args.fpos:
                        for i in xrange(batch_size):
                            for j in xrange(batch_sent_lengths[i] - 1):
                                outf2.write("%s\t%s\n" %
                                            (invmappings['pos'][pos_preds[i][j]],
                                             invmappings['fpos'][fpos_preds[i][j]]))
                            outf2.write("\n")
                    else:
                        for i in xrange(batch_size):
                            for j in xrange(batch_sent_lengths[i] - 1):
                                outf2.write("%s\t_\n" %
                                            invmappings['pos'][pos_preds[i][j]])
                            outf2.write("\n")

                    # j counts beam-search steps (otherwise unused).
                    j = 0
                    # Sentences whose beams changed in the last step.
                    updated = range(batch_size)
                    # Min-heaps of finished (score, state) entries per sentence.
                    batch_finished = [[] for _ in range(batch_size)]

                    feat_lengths = [[] for _ in range(batch_size)]

                    # Beam search: expand live beam entries until every
                    # sentence has only finished states.
                    while True:
                        batch_feats = [[featurize_state(batch_states[i][k][1], mappings)
                                        for k in range(len(batch_states[i]))]
                                       for i in updated]

                        for i, beam_feats in zip(updated, batch_feats):
                            feats = beam_feats[0]
                            if len(feats) > 0:
                                if uses_trans_logit:
                                    feat_lengths[i] += [len(feats)]
                                else:
                                    feat_lengths[i] += [len(batch_states[i][0][1].transitionset())]

                        preds = []
                        predsid = []
                        for i, beam_feats in zip(updated, batch_feats):
                            for k, feats in enumerate(beam_feats):
                                if len(feats) <= 0:
                                    # Empty features mark a terminal state;
                                    # keep at most beam_size finished states
                                    # (heappushpop evicts the worst).
                                    if len(batch_finished[i]) < args.beam_size:
                                        heappush(batch_finished[i], batch_states[i][k])
                                    else:
                                        heappushpop(batch_finished[i], batch_states[i][k])

                                    continue

                                beamidx = i * args.beam_size + k
                                if uses_trans_logit:
                                    batch_trans_feat_ids[beamidx, :len(feats)] = feats
                                else:
                                    batch_trans_feat_ids[beamidx] = feats

                                batch_trans_feat_sizes[beamidx] = len(feats)

                                assert (batch_trans_feat_sizes[beamidx] > 0)

                                predsid.append((i, k))
                                preds.append(trans_predictors[i][k])

                        # All beams finished => done with this batch.
                        if len(predsid) <= 0:
                            break

                        # The logit-carrying variants feed an extra tensor.
                        if uses_trans_logit:
                            p = sess.run(preds,
                                         feed_dict={parser.combined_head_placeholder: batch_combined_head,
                                                    parser.combined_dep_placeholder: batch_combined_dep,
                                                    parser.trans_logit_placeholder: batch_trans_logit,
                                                    parser.trans_feat_ids: batch_trans_feat_ids,
                                                    parser.trans_feat_sizes: batch_trans_feat_sizes})
                        else:
                            p = sess.run(preds,
                                         feed_dict={parser.combined_head_placeholder: batch_combined_head,
                                                    parser.combined_dep_placeholder: batch_combined_dep,
                                                    parser.trans_feat_ids: batch_trans_feat_ids,
                                                    parser.trans_feat_sizes: batch_trans_feat_sizes})

                        next_batchstates = [[] for _ in xrange(batch_size)]
                        updated = set()

                        for ik, pred in izip(predsid, p):
                            i, k = ik

                            updated.add(i)

                            if len(batch_states[i][k][1].transitionset()) > 0:
                                # Model outputs NLLs, so lower is better;
                                # cumulative scores accumulate negated NLLs.
                                sort = sorted(enumerate(pred), key=lambda x: x[1])
                                expanded_beams = 0
                                for choice, score in sort:
                                    newscore = batch_states[i][k][0] - score

                                    # Only expand transitions legal in this state.
                                    if transsys.tuple_trans_from_int(
                                            batch_states[i][k][1].transitionset(),
                                            choice)[0] in batch_states[i][k][1].transitionset():
                                        candidate = (newscore, batch_states[i][k][1], choice)
                                        if len(next_batchstates[i]) < args.beam_size:
                                            heappush(next_batchstates[i], candidate)
                                        elif newscore > next_batchstates[i][0][0]:
                                            # Better than the current worst
                                            # (heap min) — replace it.
                                            heappushpop(next_batchstates[i], candidate)

                                        expanded_beams += 1
                                        if expanded_beams >= args.beam_size:
                                            break

                        # Apply each surviving candidate's transition to a
                        # clone so sibling beam entries stay independent.
                        for i in updated:
                            next_batchstates[i] = nlargest(args.beam_size,
                                                           next_batchstates[i],
                                                           key=lambda x: x[0])
                            for k, t in enumerate(next_batchstates[i]):
                                score, state, choice = t
                                state = state.clone()
                                transsys.advance(state, choice)
                                next_batchstates[i][k] = (score, state)

                        batch_states = next_batchstates

                        j += 1

                    # Emit the highest-scoring finished parse per sentence.
                    for i in xrange(batch_size):
                        assert len(batch_finished) == batch_size
                        assert len(batch_finished[i]) > 0, "nothing finished: %d" % (i)
                        assert len(batch_finished[i][0]) > 1, "%s" % (batch_finished[i][0])
                        state_pred = nlargest(1, batch_finished[i],
                                              key=lambda x: x[0])[0][1]
                        for t in state_pred.head[1:]:
                            outf.write("%d\t%s\n" % (t[0], invmappings['rel'][t[1]]))
                        outf.write("\n")

                    log.info('Epoch %3d batch %4d' % (epoch, batch))
        log.info('Use exclusively the model of last epoch'
                 )  # only the most recent checkpoint is evaluated
        break
    sess.close()