Example #1
def test_open_read(self):
    text = "우체국\n"
    self.assertEqual(unicodedata.normalize("NFC", text), text)
    with tempfile.TemporaryDirectory() as tempdir:
        tmp_tsv = os.path.join(tempdir, "tmp.tsv")
        with open(tmp_tsv, "w") as w:
            w.write(text)
        with utils.OpenNormalize(tmp_tsv, normalize=False, mode="r") as f:
            unnormalized_text = list(f)[0]
        self.assertEqual(4, len(unnormalized_text))
        self.assertEqual(text, unnormalized_text)
Example #2
def test_open_write(self):
    text = "우체국\n"
    normalized_text = unicodedata.normalize("NFD", text)
    self.assertNotEqual(text, normalized_text)
    with tempfile.TemporaryDirectory() as tempdir:
        tmp_tsv = os.path.join(tempdir, "tmp.tsv")
        with utils.OpenNormalize(tmp_tsv, normalize=False, mode="w") as w:
            w.write(normalized_text)
        with open(tmp_tsv, "r") as f:
            written_text = f.read()
        self.assertEqual(8, len(written_text))
        self.assertEqual(normalized_text, written_text)
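
Both tests exercise utils.OpenNormalize, a context manager that reads or writes a file and optionally applies Unicode normalization. Below is a minimal sketch of such a wrapper, assuming that normalize=True means NFD decomposition on read and NFC recomposition on write; the project's actual implementation may differ.

import unicodedata


class OpenNormalize:
    """Sketch of a file wrapper that optionally Unicode-normalizes text.

    Assumption: normalize=True applies NFD when reading and recomposes to
    NFC when writing; normalize=False passes text through unchanged.
    """

    def __init__(self, path: str, normalize: bool, mode: str = "r"):
        self.file = open(path, mode, encoding="utf-8")
        self.normalize = normalize

    def __enter__(self):
        return self

    def __exit__(self, *exc):
        self.file.close()

    def __iter__(self):
        # Yield lines, optionally decomposing them to NFD.
        for line in self.file:
            yield unicodedata.normalize("NFD", line) if self.normalize else line

    def write(self, text: str) -> None:
        # Optionally recompose to NFC before writing.
        if self.normalize:
            text = unicodedata.normalize("NFC", text)
        self.file.write(text)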
Example #3
def main(args: argparse.Namespace):

    dargs = args.__dict__
    for key, value in dargs.items():
        logging.info("%s: %s", str(key).ljust(15), value)

    os.makedirs(args.output)

    if args.nfd:
        logging.info("Will perform training on NFD-normalized data.")
    else:
        logging.info("Will perform training on unnormalized data.")

    vocabulary_ = vocabulary.Vocabularies()

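    # Read the training data, building the character and edit-action vocabularies as we go.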
    training_data = []
    with utils.OpenNormalize(args.train, args.nfd) as f:
        for line in f:
            input_, target = line.rstrip().split("\t", 1)
            encoded_input = vocabulary_.encode_input(input_)
            vocabulary_.encode_actions(target)
            sample = utils.Sample(input_, target, encoded_input)
            training_data.append(sample)

    logging.info("%d actions: %s", len(vocabulary_.actions),
                 vocabulary_.actions)
    logging.info("%d chars: %s", len(vocabulary_.characters),
                 vocabulary_.characters)
    vocabulary_path = os.path.join(args.output, "vocabulary.pkl")
    vocabulary_.persist(vocabulary_path)
    logging.info("Wrote vocabulary to %s.", vocabulary_path)

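    # Read the development data, encoding inputs against the already-built vocabulary.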
    development_data = []
    with utils.OpenNormalize(args.dev, args.nfd) as f:
        for line in f:
            input_, target = line.rstrip().split("\t", 1)
            encoded_input = vocabulary_.encode_unseen_input(input_)
            sample = utils.Sample(input_, target, encoded_input)
            development_data.append(sample)

    if args.test is not None:
        test_data = []
        with utils.OpenNormalize(args.test, args.nfd) as f:
            for line in f:
                input_, *optional_target = line.rstrip().split("\t", 1)
                target = optional_target[0] if optional_target else None
                encoded_input = vocabulary_.encode_unseen_input(input_)
                sample = utils.Sample(input_, target, encoded_input)
                test_data.append(sample)

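    # Fit a stochastic edit distance model with EM and wrap it as the expert policy.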
    sed_parameters_path = os.path.join(args.output, "sed.pkl")
    sed_aligner = sed.StochasticEditDistance.fit_from_data(
        training_data,
        em_iterations=args.sed_em_iterations,
        output_path=sed_parameters_path,
    )
    expert = optimal_expert_substitutions.OptimalSubstitutionExpert(
        sed_aligner)

    model = dy.Model()
    transducer_ = transducer.Transducer(model, vocabulary_, expert, **dargs)

    widgets = [progressbar.Bar(">"), " ", progressbar.ETA()]
    train_progress_bar = progressbar.ProgressBar(widgets=widgets,
                                                 maxval=args.epochs).start()

    train_log_path = os.path.join(args.output, "train.log")
    best_model_path = os.path.join(args.output, "best.model")

    with open(train_log_path, "w") as w:
        w.write("epoch\tavg_loss\ttrain_accuracy\tdev_accuracy\n")

    trainer = dy.AdadeltaTrainer(model)
    train_subset = training_data[:100]
    rollin_schedule = inverse_sigmoid_schedule(args.k)
    max_patience = args.patience
    batch_size = args.batch_size

    logging.info(
        "Training for a maximum of %d epochs with a maximum patience of %d.",
        args.epochs,
        max_patience,
    )
    logging.info(
        "Number of train batches: %d.",
        math.ceil(len(training_data) / batch_size),
    )

    best_train_accuracy = 0
    best_dev_accuracy = 0
    best_epoch = 0
    patience = 0

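    # Main training loop: scheduled-sampling rollin, minibatched updates, early stopping on dev accuracy.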
    for epoch in range(args.epochs):

        logging.info("Training...")
        with utils.Timer():
            train_loss = 0.0
            random.shuffle(training_data)
            batches = [
                training_data[i:i + batch_size]
                for i in range(0, len(training_data), batch_size)
            ]
            rollin = rollin_schedule(epoch)
            j = 0
            for j, batch in enumerate(batches):
                losses = []
                dy.renew_cg()
                for sample in batch:
                    output = transducer_.transduce(
                        input_=sample.input,
                        encoded_input=sample.encoded_input,
                        target=sample.target,
                        rollin=rollin,
                        external_cg=True,
                    )
                    losses.extend(output.losses)
                batch_loss = -dy.average(losses)
                train_loss += batch_loss.scalar_value()
                batch_loss.backward()
                trainer.update()
                if j > 0 and j % 100 == 0:
                    logging.info("\t\t...%d batches", j)
            logging.info("\t\t...%d batches", j + 1)

        avg_loss = train_loss / len(batches)
        logging.info("Average train loss: %.4f.", avg_loss)

        logging.info("Evaluating on training data subset...")
        with utils.Timer():
            train_accuracy = decode(transducer_, train_subset).accuracy

        if train_accuracy > best_train_accuracy:
            best_train_accuracy = train_accuracy

        patience += 1

        logging.info("Evaluating on development data...")
        with utils.Timer():
            decoding_output = decode(transducer_, development_data)
            dev_accuracy = decoding_output.accuracy
            avg_dev_loss = decoding_output.loss

        if dev_accuracy > best_dev_accuracy:
            best_dev_accuracy = dev_accuracy
            best_epoch = epoch
            patience = 0
            logging.info("Found best dev accuracy %.4f.", best_dev_accuracy)
            model.save(best_model_path)
            logging.info("Saved new best model to %s.", best_model_path)

        logging.info(
            f"Epoch {epoch} / {args.epochs - 1}: train loss: {avg_loss:.4f} "
            f"dev loss: {avg_dev_loss:.4f} train acc: {train_accuracy:.4f} "
            f"dev acc: {dev_accuracy:.4f} best train acc: {best_train_accuracy:.4f} "
            f"best dev acc: {best_dev_accuracy:.4f} best epoch: {best_epoch} "
            f"patience: {patience} / {max_patience - 1}")

        log_line = f"{epoch}\t{avg_loss:.4f}\t{train_accuracy:.4f}\t{dev_accuracy:.4f}\n"
        with open(train_log_path, "a") as a:
            a.write(log_line)

        if patience == max_patience:
            logging.info("Out of patience after %d epochs.", epoch + 1)
            train_progress_bar.finish()
            break

        train_progress_bar.update(epoch)

    logging.info("Finished training.")

    if not os.path.exists(best_model_path):
        sys.exit(0)

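    # Reload the best checkpoint into a fresh model for final evaluation.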
    model = dy.Model()
    transducer_ = transducer.Transducer(model, vocabulary_, expert, **dargs)
    model.populate(best_model_path)

    evaluations = [(development_data, "dev")]
    if args.test is not None:
        evaluations.append((test_data, "test"))
    for data, dataset_name in evaluations:

        logging.info(
            "Evaluating best model on %s data using greedy decoding "
            "and beam search (beam width %d)...",
            dataset_name,
            args.beam_width,
        )
        with utils.Timer():
            greedy_decoding = decode(transducer_, data)
        utils.write_results(
            greedy_decoding.accuracy,
            greedy_decoding.predictions,
            args.output,
            args.nfd,
            dataset_name,
            dargs=dargs,
        )
        with utils.Timer():
            beam_decoding = decode(transducer_, data, args.beam_width)
        utils.write_results(
            beam_decoding.accuracy,
            beam_decoding.predictions,
            args.output,
            args.nfd,
            dataset_name,
            args.beam_width,
            dargs=dargs,
        )
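
The rollin probability used in the training loop comes from inverse_sigmoid_schedule(args.k), which is not shown in this example. A plausible sketch follows, assuming the standard inverse-sigmoid decay used for scheduled sampling, where the probability of rolling in with the model's own predictions grows with the epoch; the project's actual schedule may differ.

import math


def inverse_sigmoid_schedule(k: float):
    """Sketch: map an epoch number to a rollin probability.

    Assumption: the teacher-forcing probability k / (k + exp(epoch / k))
    decays over epochs, so the model increasingly conditions on its own
    predictions rather than the expert's actions.
    """
    return lambda epoch: 1.0 - k / (k + math.exp(epoch / k))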