import os.path as op
import pickle
from heapq import heappush, heappushpop, nlargest
from itertools import izip

import numpy as np
import tensorflow as tf

# ParserState, Parser, featurize_state, smart_open, read_vocab, read_mappings,
# read_data, the transition-system classes and the module-level `log` are
# assumed to be provided by this repo's own modules.


def featurize_transitions(data, mappings, invmappings, feat_file, transsys, log=None):
    # Reuse a previously saved feature file if one exists.
    if feat_file is not None and op.exists(feat_file):
        with smart_open(feat_file, 'rb') as f:
            res = pickle.load(f)
            max_feat_size = pickle.load(f)
        log.info('Read %d featurized examples from saved feature file "%s".' % (len(res), feat_file))
        return res, max_feat_size

    log.info('Featurizing %d examples...' % (len(data)))

    max_feat_size = -1
    res = []
    count = 0
    transsys = transsys(mappings, invmappings)
    for datum in data:
        sent, trans = datum[:2]
        state = ParserState(sent, transsys=transsys)
        feats = []
        labels = []
        featsizes = []
        for t in trans:
            feat, label = featurize_state(state, mappings, t)
            transsys.advance(state, label)
            max_feat_size = max(max_feat_size, len(feat))
            feats += [feat]
            labels += [label]
            featsizes += [len(feat)]
        assert len(feats) == len(labels)
        res += [[feats, featsizes, labels]]

        count += 1
        if count % 100 == 0:
            log.debug("Featurized %d examples..." % (count))

    assert len(res) == len(data)
    log.info("%d examples featurized, maximum feature size=%d"
             % (len(res), (max_feat_size - 1) * len(mappings['rel']) + 1))

    if feat_file is not None:
        log.info('Saving %d featurized examples to feature file "%s"...' % (len(res), feat_file))
        with smart_open(feat_file, 'wb') as f:
            pickle.dump(res, f, pickle.HIGHEST_PROTOCOL)
            pickle.dump(max_feat_size, f, pickle.HIGHEST_PROTOCOL)
        log.info('Done.')

    return res, max_feat_size
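
# A minimal standalone sketch of the caching pattern used by
# featurize_transitions above: two objects pickled back-to-back into one file
# are recovered by sequential pickle.load calls, in the order they were
# dumped. The file name and values here are hypothetical.
def _demo_feature_cache():
    examples, max_feat_size = [[1, 2], [3]], 7
    with open('features.pkl', 'wb') as f:
        pickle.dump(examples, f, pickle.HIGHEST_PROTOCOL)
        pickle.dump(max_feat_size, f, pickle.HIGHEST_PROTOCOL)
    with open('features.pkl', 'rb') as f:
        assert pickle.load(f) == examples        # first object dumped, first loaded
        assert pickle.load(f) == max_feat_size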
def processlines(lines):
    # `transsys` is taken from the enclosing scope. Index 0 is reserved for
    # the <ROOT> token, hence the +1 offsets below.
    arcs = [dict() for _ in xrange(len(lines) + 1)]
    pos = ["" for _ in xrange(len(lines) + 1)]
    fpos = ["" for _ in xrange(len(lines) + 1)]

    for i, line in enumerate(lines):
        pos[i + 1] = line[3]   # coarse POS
        fpos[i + 1] = line[4]  # fine-grained POS
        parent = int(line[6])
        relation = line[7]
        arcs[parent][i + 1] = transsys.mappings['rel'][relation]

    res = [ParserState(["<ROOT>"] + lines, transsys=transsys, goldrels=arcs), pos]
    # Only include fine-grained tags when at least one is present.
    if any(fpos):
        res += [fpos]
    else:
        res += [None]
    return res
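
# A minimal standalone sketch of the column layout processlines expects,
# following CoNLL-X: line[3] coarse POS, line[4] fine POS, line[6] head index,
# line[7] relation. The toy sentence below is hypothetical and no repo classes
# are involved.
def _demo_conll_columns():
    #         ID   FORM    LEMMA   CPOS    POS    FEATS HEAD DEPREL
    sample = [["1", "Dogs", "dog",  "NOUN", "NNS", "_",  "2", "nsubj"],
              ["2", "bark", "bark", "VERB", "VBP", "_",  "0", "root"]]
    arcs = [dict() for _ in range(len(sample) + 1)]   # slot 0 is <ROOT>
    for i, line in enumerate(sample):
        arcs[int(line[6])][i + 1] = line[7]           # head -> {dependent: relation}
    assert arcs == [{2: 'root'}, {}, {1: 'nsubj'}]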
def eval(args):
    transsys_lookup = {"ASw": ArcSwift,
                       "AER": ArcEagerReduce,
                       "AES": ArcEagerShift,
                       "ASd": ArcStandard,
                       "AH": ArcHybrid, }
    transsys = transsys_lookup[args.transsys]

    vocab, vecs, pretrained = read_vocab(conll_file=args.conll_file, wordvec_file=args.wordvec_file,
                                         vocab_file=args.vocab_file, wordvec_dim=args.wordvec_dim,
                                         min_count=args.min_count, log=log)
    mappings, invmappings = read_mappings(args.mappings_file, transsys, log=log)
    data, sent_length, trans_length = read_data(conll_file=args.conll_file, seq_file=args.seq_file,
                                                vocab=vocab, mappings=mappings, transsys=transsys,
                                                fpos=args.fpos, log=log)

    # Arc-swift feature ids span the whole sentence; the other systems use a
    # fixed window of 5.
    feat_shape = [5] if args.transsys != 'ASw' else [sent_length, 5]

    transsys = transsys(mappings, invmappings)

    parser = Parser(args, vecs, pretrained, mappings, invmappings, sent_length, trans_length, -1, log, train=False)
    trans_predictors = parser.trans_predictors

    log.info('Computational graph successfully built.')

    log.info('Setting up tensorflow session...')
    saver = tf.train.Saver(max_to_keep=10000)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    for epoch in reversed(xrange(int(args.epochs * args.epoch_multiplier))):
        savedpath = '%s/model_epoch%d' % (args.model_dir, epoch)
        if not op.exists(savedpath + '.meta'):
            continue

        log.info('Evaluating Epoch %3d...' % (epoch))
        saver.restore(sess, savedpath)

        # One beam per sentence, seeded with the (score, state) pair for the empty parse.
        states = [[(0, ParserState(datum[0], transsys=transsys))] for datum in data]

        with smart_open('%s/%s_pos_eval_beam_%d_output_epoch%d.txt' % (args.model_dir, args.eval_dataset, args.beam_size, epoch), 'w') as outf2:
            with smart_open('%s/%s_eval_beam_%d_output_epoch%d.txt' % (args.model_dir, args.eval_dataset, args.beam_size, epoch), 'w') as outf:
                for batch in xrange((len(data) + args.batch_size - 1) / args.batch_size):
                    idx = range(batch * args.batch_size, min((batch + 1) * args.batch_size, len(data)))
                    batch_size = len(idx)
                    batch_data = [data[i] for i in idx]
                    batch_states = [states[i] for i in idx]

                    # prepare data in tensor shape
                    batch_sent_lengths = np.array([len(datum[0]) for datum in batch_data]
                                                  + [sent_length] * (args.batch_size - batch_size), dtype=np.int32)
                    batch_words = np.zeros((args.batch_size, sent_length), dtype=np.int32)
                    batch_words2 = np.zeros((args.batch_size, sent_length), dtype=np.int32)
                    batch_gold_pos = np.zeros((args.batch_size, sent_length), dtype=np.int32)
                    for i in xrange(batch_size):
                        batch_words[i, :batch_sent_lengths[i]] = batch_data[i][0]
                        batch_words2[i, :batch_sent_lengths[i]] = batch_data[i][0]
                        batch_gold_pos[i, :batch_sent_lengths[i]] = batch_data[i][2]

                    batch_trans_feat_ids = np.zeros(tuple([args.batch_size * args.beam_size] + feat_shape), dtype=np.int32)
                    batch_trans_feat_sizes = np.zeros((args.batch_size * args.beam_size), dtype=np.int32)

                    preds_list = [parser.combined_head, parser.combined_dep, parser.pos_preds]
                    if args.transsys == 'ASw':
                        preds_list += [parser.transition_logit]
                    if args.fpos:
                        preds_list += [parser.fpos_preds]

                    preds = sess.run(preds_list,
                                     feed_dict={parser.words: batch_words,
                                                parser.words2: batch_words2,
                                                parser.sent_lengths: batch_sent_lengths,
                                                parser.gold_pos: batch_gold_pos, })

                    # unpack predictions
                    batch_combined_head, batch_combined_dep, pos_preds = preds[:3]
                    preds = preds[3:]
                    if args.transsys == 'ASw':
                        batch_trans_logit = preds[0]
                        preds = preds[1:]
                    if args.fpos:
                        fpos_preds = preds[0]
                        preds = preds[1:]

                    # write POS (and optionally fine-grained POS) predictions,
                    # one token per line, blank line between sentences
                    if args.fpos:
                        for i in xrange(batch_size):
                            for j in xrange(batch_sent_lengths[i] - 1):
                                outf2.write("%s\t%s\n" % (invmappings['pos'][pos_preds[i][j]],
                                                          invmappings['fpos'][fpos_preds[i][j]]))
                            outf2.write("\n")
                    else:
                        for i in xrange(batch_size):
                            for j in xrange(batch_sent_lengths[i] - 1):
                                outf2.write("%s\t_\n" % invmappings['pos'][pos_preds[i][j]])
                            outf2.write("\n")

                    j = 0
                    updated = range(batch_size)
                    batch_finished = [[] for _ in range(batch_size)]
                    feat_lengths = [[] for _ in range(batch_size)]
                    while True:
                        batch_feats = [[featurize_state(batch_states[i][k][1], mappings)
                                        for k in range(len(batch_states[i]))] for i in updated]

                        for i, beam_feats in zip(updated, batch_feats):
                            feats = beam_feats[0]
                            if len(feats) > 0:
                                if args.transsys == 'ASw':
                                    feat_lengths[i] += [len(feats)]
                                else:
                                    feat_lengths[i] += [len(batch_states[i][0][1].transitionset())]

                        preds = []
                        predsid = []
                        for i, beam_feats in zip(updated, batch_feats):
                            for k, feats in enumerate(beam_feats):
                                if len(feats) <= 0:
                                    # parse is complete; keep the best beam_size finished states
                                    if len(batch_finished[i]) < args.beam_size:
                                        heappush(batch_finished[i], batch_states[i][k])
                                    else:
                                        heappushpop(batch_finished[i], batch_states[i][k])
                                    continue

                                beamidx = i * args.beam_size + k
                                if args.transsys == 'ASw':
                                    batch_trans_feat_ids[beamidx, :len(feats)] = feats
                                else:
                                    batch_trans_feat_ids[beamidx] = feats
                                batch_trans_feat_sizes[beamidx] = len(feats)
                                assert batch_trans_feat_sizes[beamidx] > 0

                                predsid.append((i, k))
                                preds.append(trans_predictors[i][k])

                        if len(predsid) <= 0:
                            break

                        if args.transsys == 'ASw':
                            p = sess.run(preds, feed_dict={parser.combined_head_placeholder: batch_combined_head,
                                                           parser.combined_dep_placeholder: batch_combined_dep,
                                                           parser.trans_logit_placeholder: batch_trans_logit,
                                                           parser.trans_feat_ids: batch_trans_feat_ids,
                                                           parser.trans_feat_sizes: batch_trans_feat_sizes})
                        else:
                            p = sess.run(preds, feed_dict={parser.combined_head_placeholder: batch_combined_head,
                                                           parser.combined_dep_placeholder: batch_combined_dep,
                                                           parser.trans_feat_ids: batch_trans_feat_ids,
                                                           parser.trans_feat_sizes: batch_trans_feat_sizes})

                        next_batchstates = [[] for _ in xrange(batch_size)]
                        updated = set()
                        for ik, pred in izip(predsid, p):
                            i, k = ik
                            updated.add(i)
                            if len(batch_states[i][k][1].transitionset()) > 0:
                                # model outputs NLLs so the lower the better
                                sort = sorted(enumerate(pred), key=lambda x: x[1])
                                expanded_beams = 0
                                for choice, score in sort:
                                    newscore = batch_states[i][k][0] - score
                                    if transsys.tuple_trans_from_int(batch_states[i][k][1].transitionset(), choice)[0] in batch_states[i][k][1].transitionset():
                                        candidate = (newscore, batch_states[i][k][1], choice)
                                        if len(next_batchstates[i]) < args.beam_size:
                                            heappush(next_batchstates[i], candidate)
                                        elif newscore > next_batchstates[i][0][0]:
                                            heappushpop(next_batchstates[i], candidate)
                                        expanded_beams += 1
                                        if expanded_beams >= args.beam_size:
                                            break

                        for i in updated:
                            next_batchstates[i] = nlargest(args.beam_size, next_batchstates[i], key=lambda x: x[0])
                            for k, t in enumerate(next_batchstates[i]):
                                score, state, choice = t
                                state = state.clone()
                                transsys.advance(state, choice)
                                next_batchstates[i][k] = (score, state)
                        batch_states = next_batchstates
                        j += 1

                    # write out the highest-scoring finished parse per sentence
                    for i in xrange(batch_size):
                        assert len(batch_finished) == batch_size
                        assert len(batch_finished[i]) > 0, "nothing finished: %d" % (i)
                        assert len(batch_finished[i][0]) > 1, "%s" % (batch_finished[i][0])
                        state_pred = nlargest(1, batch_finished[i], key=lambda x: x[0])[0][1]
                        for t in state_pred.head[1:]:
                            outf.write("%d\t%s\n" % (t[0], invmappings['rel'][t[1]]))
                        outf.write("\n")
                    log.info('Epoch %3d batch %4d' % (epoch, batch))
    sess.close()
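
# A minimal standalone sketch of the beam update used in eval() above: the
# standard top-K pattern with a min-heap of size beam_size, whose worst
# survivor sits at index 0 so a new candidate only ever displaces the current
# worst. Scores here are toy negated NLLs (higher is better); no parser state
# is involved.
def _demo_beam_prune(beam_size=3):
    candidates = [(-0.9, 'a'), (-0.1, 'b'), (-2.3, 'c'), (-0.5, 'd'), (-1.7, 'e')]
    beam = []
    for cand in candidates:
        if len(beam) < beam_size:
            heappush(beam, cand)
        elif cand[0] > beam[0][0]:                # beats the worst survivor
            heappushpop(beam, cand)
    # best-first: [(-0.1, 'b'), (-0.5, 'd'), (-0.9, 'a')]
    return nlargest(beam_size, beam, key=lambda x: x[0])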
# Variant of eval() for the Covington-family transition systems.
def eval(args):
    transsys_lookup = {"Cov": Covington,
                       "NCov": NewCovington,
                       "Cov2": Covington2,
                       "Cov3": Covington3, }
    transsys = transsys_lookup[args.transsys]

    vocab, vecs, pretrained = read_vocab(conll_file=args.conll_file, wordvec_file=args.wordvec_file,
                                         vocab_file=args.vocab_file, wordvec_dim=args.wordvec_dim,
                                         min_count=args.min_count, log=log)
    mappings, invmappings = read_mappings(args.mappings_file, transsys, log=log)
    data, sent_length, trans_length = read_data(conll_file=args.conll_file, seq_file=args.seq_file,
                                                vocab=vocab, mappings=mappings, transsys=transsys,
                                                fpos=args.fpos, log=log)

    if args.transsys == 'NCov':
        sent_length = 70

    # Plain Covington uses a fixed window of 5 features; the other variants
    # span the whole sentence.
    feat_shape = [5] if args.transsys == 'Cov' else [sent_length, 5]

    transsys = transsys(mappings, invmappings)

    parser = Parser(args, vecs, pretrained, mappings, invmappings, sent_length, trans_length, -1, log, train=False)
    trans_predictors = parser.trans_predictors

    log.info('Computational graph successfully built.')

    log.info('Setting up tensorflow session...')
    saver = tf.train.Saver(max_to_keep=10000)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    for epoch in reversed(xrange(int(args.epochs * args.epoch_multiplier))):
        savedpath = '%s/model_epoch%d' % (args.model_dir, epoch)
        if not op.exists(savedpath + '.meta'):
            continue

        log.info('Evaluating Epoch %3d...' % (epoch))
        saver.restore(sess, savedpath)

        # One beam per sentence, seeded with the (score, state) pair for the empty parse.
        states = [[(0, ParserState(datum[0], transsys=transsys))] for datum in data]

        with smart_open('%s/%s_pos_eval_beam_%d_output_epoch%d.txt' % (args.model_dir, args.eval_dataset, args.beam_size, epoch), 'w') as outf2:
            with smart_open('%s/%s_eval_beam_%d_output_epoch%d.txt' % (args.model_dir, args.eval_dataset, args.beam_size, epoch), 'w') as outf:
                for batch in xrange((len(data) + args.batch_size - 1) / args.batch_size):
                    idx = range(batch * args.batch_size, min((batch + 1) * args.batch_size, len(data)))
                    batch_size = len(idx)
                    batch_data = [data[i] for i in idx]
                    batch_states = [states[i] for i in idx]

                    # prepare data in tensor shape
                    batch_sent_lengths = np.array([len(datum[0]) for datum in batch_data]
                                                  + [sent_length] * (args.batch_size - batch_size), dtype=np.int32)
                    batch_words = np.zeros((args.batch_size, sent_length), dtype=np.int32)
                    batch_words2 = np.zeros((args.batch_size, sent_length), dtype=np.int32)
                    batch_gold_pos = np.zeros((args.batch_size, sent_length), dtype=np.int32)
                    for i in xrange(batch_size):
                        batch_words[i, :batch_sent_lengths[i]] = batch_data[i][0]
                        batch_words2[i, :batch_sent_lengths[i]] = batch_data[i][0]
                        batch_gold_pos[i, :batch_sent_lengths[i]] = batch_data[i][2]

                    batch_trans_feat_ids = np.zeros(tuple([args.batch_size * args.beam_size] + feat_shape), dtype=np.int32)
                    batch_trans_feat_sizes = np.zeros((args.batch_size * args.beam_size), dtype=np.int32)

                    preds_list = [parser.combined_head, parser.combined_dep, parser.pos_preds]
                    if args.transsys in ('NCov', 'Cov2', 'Cov3'):
                        preds_list += [parser.transition_logit]
                    if args.fpos:
                        preds_list += [parser.fpos_preds]

                    preds = sess.run(preds_list,
                                     feed_dict={parser.words: batch_words,
                                                parser.words2: batch_words2,
                                                parser.sent_lengths: batch_sent_lengths,
                                                parser.gold_pos: batch_gold_pos, })

                    # unpack predictions
                    batch_combined_head, batch_combined_dep, pos_preds = preds[:3]
                    preds = preds[3:]
                    if args.transsys in ('NCov', 'Cov2', 'Cov3'):
                        batch_trans_logit = preds[0]
                        preds = preds[1:]
                    if args.fpos:
                        fpos_preds = preds[0]
                        preds = preds[1:]

                    # write POS (and optionally fine-grained POS) predictions,
                    # one token per line, blank line between sentences
                    if args.fpos:
                        for i in xrange(batch_size):
                            for j in xrange(batch_sent_lengths[i] - 1):
                                outf2.write("%s\t%s\n" % (invmappings['pos'][pos_preds[i][j]],
                                                          invmappings['fpos'][fpos_preds[i][j]]))
                            outf2.write("\n")
                    else:
                        for i in xrange(batch_size):
                            for j in xrange(batch_sent_lengths[i] - 1):
                                outf2.write("%s\t_\n" % invmappings['pos'][pos_preds[i][j]])
                            outf2.write("\n")

                    j = 0
                    updated = range(batch_size)
                    batch_finished = [[] for _ in range(batch_size)]
                    feat_lengths = [[] for _ in range(batch_size)]
                    while True:
                        batch_feats = [[featurize_state(batch_states[i][k][1], mappings)
                                        for k in range(len(batch_states[i]))] for i in updated]

                        for i, beam_feats in zip(updated, batch_feats):
                            feats = beam_feats[0]
                            if len(feats) > 0:
                                if args.transsys in ('NCov', 'Cov2', 'Cov3'):
                                    feat_lengths[i] += [len(feats)]
                                else:
                                    feat_lengths[i] += [len(batch_states[i][0][1].transitionset())]

                        preds = []
                        predsid = []
                        for i, beam_feats in zip(updated, batch_feats):
                            for k, feats in enumerate(beam_feats):
                                if len(feats) <= 0:
                                    # parse is complete; keep the best beam_size finished states
                                    if len(batch_finished[i]) < args.beam_size:
                                        heappush(batch_finished[i], batch_states[i][k])
                                    else:
                                        heappushpop(batch_finished[i], batch_states[i][k])
                                    continue

                                beamidx = i * args.beam_size + k
                                if args.transsys in ('NCov', 'Cov2', 'Cov3'):
                                    batch_trans_feat_ids[beamidx, :len(feats)] = feats
                                else:
                                    batch_trans_feat_ids[beamidx] = feats
                                batch_trans_feat_sizes[beamidx] = len(feats)
                                assert batch_trans_feat_sizes[beamidx] > 0

                                predsid.append((i, k))
                                preds.append(trans_predictors[i][k])

                        if len(predsid) <= 0:
                            break

                        if args.transsys in ('NCov', 'Cov2', 'Cov3'):
                            p = sess.run(preds, feed_dict={parser.combined_head_placeholder: batch_combined_head,
                                                           parser.combined_dep_placeholder: batch_combined_dep,
                                                           parser.trans_logit_placeholder: batch_trans_logit,
                                                           parser.trans_feat_ids: batch_trans_feat_ids,
                                                           parser.trans_feat_sizes: batch_trans_feat_sizes})
                        else:
                            p = sess.run(preds, feed_dict={parser.combined_head_placeholder: batch_combined_head,
                                                           parser.combined_dep_placeholder: batch_combined_dep,
                                                           parser.trans_feat_ids: batch_trans_feat_ids,
                                                           parser.trans_feat_sizes: batch_trans_feat_sizes})

                        next_batchstates = [[] for _ in xrange(batch_size)]
                        updated = set()
                        for ik, pred in izip(predsid, p):
                            i, k = ik
                            updated.add(i)
                            if len(batch_states[i][k][1].transitionset()) > 0:
                                # model outputs NLLs so the lower the better
                                sort = sorted(enumerate(pred), key=lambda x: x[1])
                                expanded_beams = 0
                                for choice, score in sort:
                                    newscore = batch_states[i][k][0] - score
                                    if transsys.tuple_trans_from_int(batch_states[i][k][1].transitionset(), choice)[0] in batch_states[i][k][1].transitionset():
                                        candidate = (newscore, batch_states[i][k][1], choice)
                                        if len(next_batchstates[i]) < args.beam_size:
                                            heappush(next_batchstates[i], candidate)
                                        elif newscore > next_batchstates[i][0][0]:
                                            heappushpop(next_batchstates[i], candidate)
                                        expanded_beams += 1
                                        if expanded_beams >= args.beam_size:
                                            break

                        for i in updated:
                            next_batchstates[i] = nlargest(args.beam_size, next_batchstates[i], key=lambda x: x[0])
                            for k, t in enumerate(next_batchstates[i]):
                                score, state, choice = t
                                state = state.clone()
                                transsys.advance(state, choice)
                                next_batchstates[i][k] = (score, state)
                        batch_states = next_batchstates
                        j += 1

                    # write out the highest-scoring finished parse per sentence
                    for i in xrange(batch_size):
                        assert len(batch_finished) == batch_size
                        assert len(batch_finished[i]) > 0, "nothing finished: %d" % (i)
                        assert len(batch_finished[i][0]) > 1, "%s" % (batch_finished[i][0])
                        state_pred = nlargest(1, batch_finished[i], key=lambda x: x[0])[0][1]
                        for t in state_pred.head[1:]:
                            outf.write("%d\t%s\n" % (t[0], invmappings['rel'][t[1]]))
                        outf.write("\n")
                    log.info('Epoch %3d batch %4d' % (epoch, batch))

        log.info('Using only the model from the last epoch')  # evaluate just the most recent checkpoint
        break
    sess.close()
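
# A minimal standalone sketch of the batch preparation both eval() variants
# perform: variable-length word-ID sequences are zero-padded into fixed-shape
# int32 tensors before being fed to the graph. Shapes and IDs below are toy
# values.
def _demo_pad_batch(batch_capacity=4, sent_length=6):
    sentences = [[3, 7, 2], [5, 1, 1, 9, 4]]
    lengths = np.array([len(s) for s in sentences]
                       + [sent_length] * (batch_capacity - len(sentences)),
                       dtype=np.int32)
    words = np.zeros((batch_capacity, sent_length), dtype=np.int32)
    for i, s in enumerate(sentences):
        words[i, :lengths[i]] = s
    # words[0] == [3, 7, 2, 0, 0, 0]; unused rows stay all-zero
    return words, lengths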