# Seed NumPy's RNG so batch shuffling is reproducible across runs.
np.random.seed(conf.shuffle_seed)

# Alternative batch orderings. Replace train_idxs with one of these to order
# the batches by sentence length instead of shuffling.
# NOTE(review): both arrays are computed but not consumed in this chunk —
# confirm they are used further down or drop the computation.

# Number of token columns per sentence. The structured 'i4' view below needs
# exactly one field per column, so derive it from the data instead of the
# previously hard-coded 32 (which silently required shape[1] == 32).
seq_len = c.train_src_idxs.shape[1]

# Sort sentence indices by descending count of non-padding (non-zero) tokens.
ordered_batch_idxs = np.argsort(
    np.array([np.count_nonzero(s) for s in c.train_src_idxs]) * -1)

# Lexicographic sort over all token columns: groups similar sentences together.
xv = c.train_src_idxs.view([('w%d' % i, 'i4') for i in range(seq_len)])
similar_batch_idxs = np.argsort(
    xv, axis=0, order=[('w%d' % i) for i in range(seq_len)]).flatten()

# Process validation data
if conf.do_validate:
    no_val_improvement = 0      # validations since the last BLEU improvement
    early_stopped = False
    # Reference translations as strings, for BLEU scoring.
    y_val_strings = [c.trg_idx_to_sent(s) for s in c.valid_trg_idxs]
    # Single batch covering the whole validation set.
    X_val, y_val = next(cstm_model.batch_iterator(
        c.valid_src_idxs, c.valid_trg_idxs,
        c.valid_src_idxs.shape[0], len(c.trg_vocab)))
    logging.info("Will validate on (%s) %d sentences." %
                 (conf.valid_prefix, c.valid_src_idxs.shape[0]))
    logging.info("Training will stop after %d validations without improvement."
                 % conf.max_patience)
    scorer = MultiBleuScorer()
    decoder = Decoder(c.trg_vocab["</s>"], c.trg_vocab["<unk>"],
                      conf.beam_size, generate_unk=False)
    best_val_bleu = BLEUScore()
    # Validation prediction placeholder (uninitialized; filled in each
    # validation pass). np.empty allocates once with the right dtype instead
    # of the former ndarray(...).astype(...) allocate-then-copy.
    y_pred_val = np.empty(
        (X_val.shape[0], c.train_src_idxs.shape[1], len(c.trg_vocab)),
        dtype=np.float32)

# Process test data
if conf.do_test:
    # Reference translations as strings, for BLEU scoring.
    y_test_strings = [c.trg_idx_to_sent(s) for s in c.test_trg_idxs]
    # Single batch covering the whole test set.
    X_test, y_test = next(cstm_model.batch_iterator(
        c.test_src_idxs, c.test_trg_idxs,
        c.test_src_idxs.shape[0], len(c.trg_vocab)))
    # Test prediction placeholder (uninitialized; filled at test time).
    y_pred_test = np.empty(
        (X_test.shape[0], c.train_src_idxs.shape[1], len(c.trg_vocab)),
        dtype=np.float32)
    logging.info("Will test on (%s) %d sentences." %
                 (conf.test_prefix, c.test_src_idxs.shape[0]))

# Create the model
# Load vocabulary files and invert them print "Loading vocabulary files..." src_vocab = load_object(args.source_vocab) trg_vocab = load_object(args.target_vocab) corp.set_src_vocab(src_vocab) corp.set_trg_vocab(trg_vocab) print "Completed." # Map source sentences mapped_sents = corp.map_sentences(src_sents, corp.src_vocab, conf.max_seq_len, skip_unk=args.remove_unk) print "%d source sentences mapped." % len(mapped_sents) # Create decoder instance decoder = Decoder(trg_vocab["</s>"], trg_vocab["<unk>"], args.beamsize, args.generate_unk) # Validation prediction placeholder y_pred = np.ndarray((mapped_sents.shape[0], mapped_sents.shape[1], len(trg_vocab))).astype(np.float32) # Feed-forward pass y_pred_dict = model.predict({'input': mapped_sents}) for i in range(y_pred.shape[1]): y_pred[:, i, :] = y_pred_dict['output_%d' % i] scorer = MultiBleuScorer() d = decoder.process(y_pred) # Generate 1-best sentences hyps = [corp.trg_idx_to_sent(hyp[0]) for hyp in d[0]] bleu = scorer.score_sentences(ref_trans_file, hyps) print bleu