    def _predict(sess, examples: List[InputExample]):
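        """Runs beam-search prediction on `examples` using the given session.

        Each `InputExample` is converted to features, fed through the model
        one at a time, and the decoded ids (with EOS stripped) are written,
        together with the source inputs, to 'predict-inputs.txt' and
        'predict-predictions.txt' under `model_dir`.
        """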
        hypotheses, inputs = [], []

        features = []
        for example in examples:
            feature = convert_single_example(
                ex_index=0,
                example=example,
                max_seq_length=config_data.max_seq_length,
                tokenizer=tokenizer)
            features.append(feature)

        for feature in features:
            feed_dict = {
                src_input_ids: [feature.src_input_ids],
                src_segment_ids: [feature.src_segment_ids],
                tx.global_mode(): tf.estimator.ModeKeys.PREDICT,
            }

            fetches = {
                'beam_search_ids': beam_search_ids,
                'src_input_ids': src_input_ids
            }

            fetches_ = sess.run(fetches, feed_dict=feed_dict)

            hypotheses.extend(h.tolist() for h in fetches_['beam_search_ids'])
            inputs.extend(h.tolist() for h in fetches_['src_input_ids'])
            hypotheses = utils.list_strip_eos(hypotheses, eos_token_id)

        write_token_id_arrays_to_text_file(
            inputs, os.path.join(model_dir, 'predict-inputs.txt'), tokenizer)
        write_token_id_arrays_to_text_file(
            hypotheses, os.path.join(model_dir, 'predict-predictions.txt'),
            tokenizer)
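    # Hypothetical usage sketch (not part of the original example): assumes a
    # tf.train.Saver `saver`, a checkpoint under `model_dir`, and a prepared
    # list `predict_examples` are defined in the enclosing scope.
    #
    #     with tf.Session() as sess:
    #         sess.run(tf.global_variables_initializer())
    #         saver.restore(sess, tf.train.latest_checkpoint(model_dir))
    #         _predict(sess, examples=predict_examples)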
    def _eval_epoch(sess, epoch, mode):
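        """Runs one evaluation pass in 'eval' or 'test' mode.

        Batches `dev_data` or `test_data`, decodes with the model, and strips
        EOS ids. In 'eval' mode a token-id BLEU is computed as a surrogate
        metric and the best checkpoint is saved; in 'test' mode ids are
        mapped back to words and the output is written to 'test.output'.
        """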
        if mode == 'eval':
            eval_data = dev_data
        elif mode == 'test':
            eval_data = test_data
        else:
            raise ValueError('`mode` should be either "eval" or "test".')

        references, hypotheses = [], []
        bsize = config_data.test_batch_size
        for i in range(0, len(eval_data), bsize):
            #print("eval {}/{}".format(i, len(eval_data)))
            sources, targets = zip(*eval_data[i:i + bsize])
            x_block = data_utils.source_pad_concat_convert(sources)
            feed_dict = {
                encoder_input: x_block,
                tx.global_mode(): tf.estimator.ModeKeys.EVAL,
            }
            fetches = {
                'inferred_ids': inferred_ids,
            }
            fetches_ = sess.run(fetches, feed_dict=feed_dict)

            hypotheses.extend(h.tolist() for h in fetches_['inferred_ids'])
            references.extend(r.tolist() for r in targets)
            hypotheses = utils.list_strip_eos(hypotheses, eos_token_id)
            references = utils.list_strip_eos(references, eos_token_id)

        if mode == 'eval':
            # Writes results to files to evaluate BLEU
            # For 'eval' mode, the BLEU is based on token ids (rather than
            # text tokens) and serves only as a surrogate metric to monitor
            # the training process
            fname = os.path.join(FLAGS.model_dir, 'tmp.eval')
            hypotheses = tx.utils.str_join(hypotheses)
            references = tx.utils.str_join(references)
            hyp_fn, ref_fn = tx.utils.write_paired_text(hypotheses,
                                                        references,
                                                        fname,
                                                        mode='s')
            eval_bleu = bleu_wrapper(ref_fn, hyp_fn, case_sensitive=True)
            eval_bleu = 100. * eval_bleu
            logger.info('epoch: %d, eval_bleu %.4f', epoch, eval_bleu)
            print('epoch: %d, eval_bleu %.4f' % (epoch, eval_bleu))

            if eval_bleu > best_results['score']:
                logger.info('epoch: %d, best bleu: %.4f', epoch, eval_bleu)
                best_results['score'] = eval_bleu
                best_results['epoch'] = epoch
                model_path = os.path.join(FLAGS.model_dir, 'best-model.ckpt')
                logger.info('saving model to %s', model_path)
                print('saving model to %s' % model_path)
                saver.save(sess, model_path)

        elif mode == 'test':
            # For 'test' mode, together with the commands in README.md, BLEU
            # is evaluated based on text tokens, which is the standard metric.
            fname = os.path.join(FLAGS.model_dir, 'test.output')
            hwords, rwords = [], []
            for hyp, ref in zip(hypotheses, references):
                hwords.append([id2w[y] for y in hyp])
                rwords.append([id2w[y] for y in ref])
            hwords = tx.utils.str_join(hwords)
            rwords = tx.utils.str_join(rwords)
            hyp_fn, ref_fn = tx.utils.write_paired_text(hwords,
                                                        rwords,
                                                        fname,
                                                        mode='s')
            logger.info('Test output written to file: %s', hyp_fn)
            print('Test output written to file: %s' % hyp_fn)
    def _eval_epoch(epoch, mode, print_fn=None):
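        """Evaluates the model on the 'valid' or 'test' split.

        Iterates `datasets[mode]` with beam-search decoding and strips EOS
        ids. For 'valid', a token-id BLEU is computed and the best model,
        optimizer, and scheduler states are checkpointed; for 'test', the
        ids are mapped back to tokens and written to 'test.output'.
        """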
        if print_fn is None:
            print_fn = print
            tqdm_leave = True
        else:
            tqdm_leave = False
        model.eval()
        eval_data = datasets[mode]
        eval_iter = tx.data.DataIterator(eval_data)
        references, hypotheses = [], []
        for batch in tqdm.tqdm(eval_iter,
                               ncols=80,
                               leave=tqdm_leave,
                               desc=f"Eval on {mode} set"):
            predictions = model(
                encoder_input=batch.source,
                beam_width=beam_width,
            )
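            # With beam_width == 1 the model decodes greedily and returns a
            # (decoder_output, lengths) tuple whose first element carries
            # `sample_id`; with beam search it returns a dict whose
            # 'sample_id' has shape (batch, time, beam), and the top beam
            # (index 0) is kept as the hypothesis.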
            if beam_width == 1:
                decoded_ids = predictions[0].sample_id
            else:
                decoded_ids = predictions["sample_id"][:, :, 0]

            hypotheses.extend(h.tolist() for h in decoded_ids)
            references.extend(r.tolist() for r in batch.target_output)
        hypotheses = utils.list_strip_eos(hypotheses, vocab.eos_token_id)
        references = utils.list_strip_eos(references, vocab.eos_token_id)

        if mode == "valid":
            # Writes results to files to evaluate BLEU
            # For the 'valid' mode, the BLEU is based on token ids (rather than
            # text tokens) and serves only as a surrogate metric to monitor
            # the training process
            fname = os.path.join(args.output_dir, "tmp.eval")
            hwords, rwords = [], []
            for hyp, ref in zip(hypotheses, references):
                hwords.append([str(y) for y in hyp])
                rwords.append([str(y) for y in ref])
            hwords = tx.utils.str_join(hwords)
            rwords = tx.utils.str_join(rwords)
            hyp_file, ref_file = tx.utils.write_paired_text(
                hwords,
                rwords,
                fname,
                mode="s",
                src_fname_suffix="hyp",
                tgt_fname_suffix="ref",
            )
            eval_bleu = tx.evals.file_bleu(ref_file,
                                           hyp_file,
                                           case_sensitive=True)
            logger.info("epoch: %d, eval_bleu %.4f", epoch, eval_bleu)
            print_fn(f"epoch: {epoch:d}, eval_bleu {eval_bleu:.4f}")

            if eval_bleu > best_results["score"]:
                logger.info("epoch: %d, best bleu: %.4f", epoch, eval_bleu)
                best_results["score"] = eval_bleu
                best_results["epoch"] = epoch
                model_path = os.path.join(args.output_dir,
                                          args.output_filename)
                logger.info("Saving model to %s", model_path)
                print_fn(f"Saving model to {model_path}")

                states = {
                    "model": model.state_dict(),
                    "optimizer": optim.state_dict(),
                    "scheduler": scheduler.state_dict(),
                }
                torch.save(states, model_path)

        elif mode == "test":
            # For 'test' mode, together with the commands in README.md, BLEU
            # is evaluated based on text tokens, which is the standard metric.
            fname = os.path.join(args.output_dir, "test.output")
            hwords, rwords = [], []
            for hyp, ref in zip(hypotheses, references):
                hwords.append(vocab.map_ids_to_tokens_py(hyp))
                rwords.append(vocab.map_ids_to_tokens_py(ref))
            hwords = tx.utils.str_join(hwords)
            rwords = tx.utils.str_join(rwords)
            hyp_file, ref_file = tx.utils.write_paired_text(
                hwords,
                rwords,
                fname,
                mode="s",
                src_fname_suffix="hyp",
                tgt_fname_suffix="ref",
            )
            logger.info("Test output written to file: %s", hyp_file)
            print_fn(f"Test output written to file: {hyp_file}")
    def _eval_epoch(epoch, mode):
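        """Evaluates on `dev_data` ('eval') or `test_data` ('test').

        Decodes batches under torch.no_grad() and strips EOS ids. In 'eval'
        mode a token-id BLEU is computed and the model/optimizer/scheduler
        states are saved when the score improves; in 'test' mode word-level
        output is written to 'test.output'.
        """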
        torch.cuda.empty_cache()
        if mode == 'eval':
            eval_data = dev_data
        elif mode == 'test':
            eval_data = test_data
        else:
            raise ValueError("`mode` should be either \"eval\" or \"test\".")

        references, hypotheses = [], []
        bsize = config_data.test_batch_size
        for i in tqdm(range(0, len(eval_data), bsize)):
            sources, targets = zip(*eval_data[i:i + bsize])
            with torch.no_grad():
                x_block = data_utils.source_pad_concat_convert(
                    sources, device=device)
                predictions = model(
                    encoder_input=x_block,
                    is_train_mode=False,
                    beam_width=beam_width)
                if beam_width == 1:
                    decoded_ids = predictions[0].sample_id
                else:
                    decoded_ids = predictions["sample_id"][:, :, 0]

                hypotheses.extend(h.tolist() for h in decoded_ids)
                references.extend(r.tolist() for r in targets)
                hypotheses = utils.list_strip_eos(hypotheses, eos_token_id)
                references = utils.list_strip_eos(references, eos_token_id)

        if mode == 'eval':
            # Writes results to files to evaluate BLEU
            # For 'eval' mode, the BLEU is based on token ids (rather than
            # text tokens) and serves only as a surrogate metric to monitor
            # the training process
            # TODO: Use texar.evals.bleu
            fname = os.path.join(args.model_dir, 'tmp.eval')
            hwords, rwords = [], []
            for hyp, ref in zip(hypotheses, references):
                hwords.append([str(y) for y in hyp])
                rwords.append([str(y) for y in ref])
            hwords = tx.utils.str_join(hwords)
            rwords = tx.utils.str_join(rwords)
            hyp_fn, ref_fn = tx.utils.write_paired_text(
                hwords, rwords, fname, mode='s',
                src_fname_suffix='hyp', tgt_fname_suffix='ref')
            eval_bleu = bleu_wrapper(ref_fn, hyp_fn, case_sensitive=True)
            eval_bleu = 100. * eval_bleu
            logger.info("epoch: %d, eval_bleu %.4f", epoch, eval_bleu)
            print(f"epoch: {epoch:d}, eval_bleu {eval_bleu:.4f}")

            if eval_bleu > best_results['score']:
                logger.info("epoch: %d, best bleu: %.4f", epoch, eval_bleu)
                best_results['score'] = eval_bleu
                best_results['epoch'] = epoch
                model_path = os.path.join(args.model_dir, args.model_fn)
                logger.info("Saving model to %s", model_path)
                print(f"Saving model to {model_path}")

                states = {
                    'model': model.state_dict(),
                    'optimizer': optim.state_dict(),
                    'scheduler': scheduler.state_dict(),
                }
                torch.save(states, model_path)

        elif mode == 'test':
            # For 'test' mode, together with the commands in README.md, BLEU
            # is evaluated based on text tokens, which is the standard metric.
            fname = os.path.join(args.model_dir, 'test.output')
            hwords, rwords = [], []
            for hyp, ref in zip(hypotheses, references):
                hwords.append([id2w[y] for y in hyp])
                rwords.append([id2w[y] for y in ref])
            hwords = tx.utils.str_join(hwords)
            rwords = tx.utils.str_join(rwords)
            hyp_fn, ref_fn = tx.utils.write_paired_text(
                hwords, rwords, fname, mode='s',
                src_fname_suffix='hyp', tgt_fname_suffix='ref')
            logger.info("Test output written to file: %s", hyp_fn)
            print(f"Test output written to file: {hyp_fn}")
    def _eval_epoch(sess, epoch, mode):
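        """Evaluates one epoch over the 'eval' or 'test' dataset.

        Consumes the dataset through `data_iterator` until exhaustion,
        collecting beam-search ids, target labels, and source ids, then
        reports ROUGE and BLEU. In 'eval' mode the best checkpoint plus the
        corresponding inputs/predictions/targets are saved; in 'test' mode
        the test-* text files are written.
        """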
        print('Starting %s' % mode)

        if mode not in ('eval', 'test'):
            raise ValueError('`mode` should be either "eval" or "test".')

        dataset_name = 'eval' if mode == 'eval' else 'test'

        data_iterator.restart_dataset(sess, dataset_name)
        references, hypotheses, inputs = [], [], []

        while True:
            try:
                feed_dict = {
                    data_iterator.handle:
                    data_iterator.get_handle(sess, dataset_name),
                    tx.global_mode():
                    tf.estimator.ModeKeys.EVAL,
                }
                fetches = {
                    'beam_search_ids': beam_search_ids,
                    'tgt_labels': tgt_labels,
                    # src_input_ids is not necessary for calculating the
                    # metric, but allows us to write it to a file.
                    'src_input_ids': src_input_ids
                }
                fetches_ = sess.run(fetches, feed_dict=feed_dict)

                hypotheses.extend(h.tolist()
                                  for h in fetches_['beam_search_ids'])
                references.extend(r.tolist() for r in fetches_['tgt_labels'])
                inputs.extend(h.tolist() for h in fetches_['src_input_ids'])
                hypotheses = utils.list_strip_eos(hypotheses, eos_token_id)
                references = utils.list_strip_eos(references, eos_token_id)
            except tf.errors.OutOfRangeError:
                break

        def calculate_scores():
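            """Writes hypotheses and references to files, then returns the
            averaged ROUGE scores (via FilesRouge) and the BLEU score."""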
            hyp_fn, ref_fn = 'tmp.%s.src' % mode, 'tmp.%s.tgt' % mode
            write_token_id_arrays_to_text_file(hypotheses,
                                               os.path.join(model_dir, hyp_fn),
                                               tokenizer)
            write_token_id_arrays_to_text_file(references,
                                               os.path.join(model_dir, ref_fn),
                                               tokenizer)

            hyp_fn, ref_fn = os.path.join(model_dir, hyp_fn), os.path.join(
                model_dir, ref_fn)

            files_rouge = FilesRouge(hyp_fn, ref_fn)
            rouge_scores = files_rouge.get_scores(avg=True)

            bleu_score = bleu_wrapper(ref_fn, hyp_fn, case_sensitive=True)

            return rouge_scores, bleu_score

        if mode == 'eval':
            try:
                rouge_scores, bleu_score = calculate_scores()
            except ValueError:
                print("Failed to calculate rouge scores!")
                return

            print_rouge_scores(rouge_scores)
            print('epoch: %d, bleu_score %.4f' % (epoch, bleu_score))

            if bleu_score > best_results['score']:
                best_results['score'] = bleu_score
                best_results['epoch'] = epoch
                model_path = os.path.join(model_dir, 'best-model.ckpt')
                print('saving model to %s' % model_path)

                # Also save the best results in a text file for manual evaluation
                write_token_id_arrays_to_text_file(
                    inputs, os.path.join(model_dir, 'eval-inputs.txt'),
                    tokenizer)
                write_token_id_arrays_to_text_file(
                    hypotheses,
                    os.path.join(model_dir, 'eval-predictions.txt'), tokenizer)
                write_token_id_arrays_to_text_file(
                    references, os.path.join(model_dir, 'eval-targets.txt'),
                    tokenizer)

                saver.save(sess, model_path)

        elif mode == 'test':
            rouge_scores, bleu_score = calculate_scores()

            print_rouge_scores(rouge_scores)
            print('bleu_score %.4f' % bleu_score)

            # Also save the results in a text file for manual evaluation
            write_token_id_arrays_to_text_file(
                inputs, os.path.join(model_dir, 'test-inputs.txt'), tokenizer)
            write_token_id_arrays_to_text_file(
                hypotheses, os.path.join(model_dir, 'test-predictions.txt'),
                tokenizer)
            write_token_id_arrays_to_text_file(
                references, os.path.join(model_dir, 'test-targets.txt'),
                tokenizer)