def setUpClass(cls) -> None:
    cls.vocabulary = vocabulary.Vocabulary()
    for c in "foo":
        cls.vocabulary.encode(c)
    cls.vocabularies = vocabulary.Vocabularies()
    cls.vocabularies.encode_input("foo")
    cls.vocabularies.encode_actions("baa")
def setUpClass(cls) -> None:
    model = dy.Model()
    vocabulary_ = vocabulary.Vocabularies()
    vocabulary_.encode_input("foo")
    vocabulary_.encode_actions("bar")
    expert = optimal_expert.OptimalExpert()
    # Tiny dummy hyperparameters keep the test model small.
    cls.transducer = transducer.Transducer(
        model, vocabulary_, expert, 3, 3, 3, 1, 3, 1
    )
def test_vocabularies_encode_actions(self):
    vocabulary1 = vocabulary.Vocabularies()
    vocabulary1.encode_actions("baa")
    expected_i2w = [
        BeginOfSequence(),
        EndOfSequence(),
        ConditionalDel(),
        ConditionalCopy(),
        ConditionalSub("b"),
        ConditionalIns("b"),
        ConditionalSub("a"),
        ConditionalIns("a"),
    ]
    self.assertListEqual(expected_i2w, vocabulary1.actions.i2w)
def test_vocabularies_encode_input(self):
    vocabulary1 = vocabulary.Vocabularies()
    encoded_foo = vocabulary1.encode_input("foo")
    # The encoding wraps the string in BOS (0) and EOS (1);
    # "f" and "o" receive the indices 3 and 4.
    self.assertListEqual([0, 3, 4, 4, 1], encoded_foo)
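# `main` below calls `inverse_sigmoid_schedule(args.k)`, whose definition is
# not part of this excerpt. A minimal sketch of a standard inverse-sigmoid
# decay for the roll-in probability (an assumption in the spirit of scheduled
# sampling, Bengio et al., 2015; the module's actual definition may differ):
import math


def inverse_sigmoid_schedule(k: int):
    """Maps an epoch to the probability of rolling in with model actions."""
    return lambda epoch: 1 - k / (k + math.exp(epoch / k))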
def main(args: argparse.Namespace):
    dargs = args.__dict__
    for key, value in dargs.items():
        logging.info("%s: %s", str(key).ljust(15), value)
    os.makedirs(args.output)
    if args.nfd:
        logging.info("Will perform training on NFD-normalized data.")
    else:
        logging.info("Will perform training on unnormalized data.")

    vocabulary_ = vocabulary.Vocabularies()

    # Build the character and action vocabularies from the training data.
    training_data = []
    with utils.OpenNormalize(args.train, args.nfd) as f:
        for line in f:
            input_, target = line.rstrip().split("\t", 1)
            encoded_input = vocabulary_.encode_input(input_)
            vocabulary_.encode_actions(target)
            sample = utils.Sample(input_, target, encoded_input)
            training_data.append(sample)

    logging.info("%d actions: %s", len(vocabulary_.actions),
                 vocabulary_.actions)
    logging.info("%d chars: %s", len(vocabulary_.characters),
                 vocabulary_.characters)
    vocabulary_path = os.path.join(args.output, "vocabulary.pkl")
    vocabulary_.persist(vocabulary_path)
    logging.info("Wrote vocabulary to %s.", vocabulary_path)

    # Development and test data may contain unseen characters, so they are
    # encoded without extending the vocabulary.
    development_data = []
    with utils.OpenNormalize(args.dev, args.nfd) as f:
        for line in f:
            input_, target = line.rstrip().split("\t", 1)
            encoded_input = vocabulary_.encode_unseen_input(input_)
            sample = utils.Sample(input_, target, encoded_input)
            development_data.append(sample)

    if args.test is not None:
        test_data = []
        with utils.OpenNormalize(args.test, args.nfd) as f:
            for line in f:
                # The test file may omit targets.
                input_, *optional_target = line.rstrip().split("\t", 1)
                target = optional_target[0] if optional_target else None
                encoded_input = vocabulary_.encode_unseen_input(input_)
                sample = utils.Sample(input_, target, encoded_input)
                test_data.append(sample)

    # Fit the stochastic edit distance aligner that drives the expert policy.
    sed_parameters_path = os.path.join(args.output, "sed.pkl")
    sed_aligner = sed.StochasticEditDistance.fit_from_data(
        training_data,
        em_iterations=args.sed_em_iterations,
        output_path=sed_parameters_path,
    )
    expert = optimal_expert_substitutions.OptimalSubstitutionExpert(
        sed_aligner)

    model = dy.Model()
    transducer_ = transducer.Transducer(model, vocabulary_, expert, **dargs)

    widgets = [progressbar.Bar(">"), " ", progressbar.ETA()]
    train_progress_bar = progressbar.ProgressBar(
        widgets=widgets, maxval=args.epochs
    ).start()

    train_log_path = os.path.join(args.output, "train.log")
    best_model_path = os.path.join(args.output, "best.model")
    with open(train_log_path, "w") as w:
        w.write("epoch\tavg_loss\ttrain_accuracy\tdev_accuracy\n")

    trainer = dy.AdadeltaTrainer(model)
    train_subset = training_data[:100]
    rollin_schedule = inverse_sigmoid_schedule(args.k)
    max_patience = args.patience
    batch_size = args.batch_size

    logging.info(
        "Training for a maximum of %d epochs with a maximum patience of %d.",
        args.epochs,
        max_patience,
    )
    logging.info(
        "Number of train batches: %d.",
        math.ceil(len(training_data) / batch_size),
    )

    best_train_accuracy = 0
    best_dev_accuracy = 0
    best_epoch = 0
    patience = 0

    for epoch in range(args.epochs):
        logging.info("Training...")
        with utils.Timer():
            train_loss = 0.0
            random.shuffle(training_data)
            batches = [
                training_data[i:i + batch_size]
                for i in range(0, len(training_data), batch_size)
            ]
            rollin = rollin_schedule(epoch)
            j = 0
            for j, batch in enumerate(batches):
                losses = []
                dy.renew_cg()
                for sample in batch:
                    output = transducer_.transduce(
                        input_=sample.input,
                        encoded_input=sample.encoded_input,
                        target=sample.target,
                        rollin=rollin,
                        external_cg=True,
                    )
                    losses.extend(output.losses)
                # The transducer returns log-likelihoods; negate for the loss.
                batch_loss = -dy.average(losses)
                train_loss += batch_loss.scalar_value()
                batch_loss.backward()
                trainer.update()
                if j > 0 and j % 100 == 0:
                    logging.info("\t\t...%d batches", j)
            logging.info("\t\t...%d batches", j + 1)
        avg_loss = train_loss / len(batches)
logging.info("Average train loss: %.4f.", avg_loss) logging.info("Evaluating on training data subset...") with utils.Timer(): train_accuracy = decode(transducer_, train_subset).accuracy if train_accuracy > best_train_accuracy: best_train_accuracy = train_accuracy patience += 1 logging.info("Evaluating on development data...") with utils.Timer(): decoding_output = decode(transducer_, development_data) dev_accuracy = decoding_output.accuracy avg_dev_loss = decoding_output.loss if dev_accuracy > best_dev_accuracy: best_dev_accuracy = dev_accuracy best_epoch = epoch patience = 0 logging.info("Found best dev accuracy %.4f.", best_dev_accuracy) model.save(best_model_path) logging.info("Saved new best model to %s.", best_model_path) logging.info( f"Epoch {epoch} / {args.epochs - 1}: train loss: {avg_loss:.4f} " f"dev loss: {avg_dev_loss:.4f} train acc: {train_accuracy:.4f} " f"dev acc: {dev_accuracy:.4f} best train acc: {best_train_accuracy:.4f} " f"best dev acc: {best_dev_accuracy:.4f} best epoch: {best_epoch} " f"patience: {patience} / {max_patience - 1}") log_line = f"{epoch}\t{avg_loss:.4f}\t{train_accuracy:.4f}\t{dev_accuracy:.4f}\n" with open(train_log_path, "a") as a: a.write(log_line) if patience == max_patience: logging.info("Out of patience after %d epochs.", epoch + 1) train_progress_bar.finish() break train_progress_bar.update(epoch) logging.info("Finished training.") if not os.path.exists(best_model_path): sys.exit(0) model = dy.Model() transducer_ = transducer.Transducer(model, vocabulary_, expert, **dargs) model.populate(best_model_path) evaluations = [(development_data, "dev")] if args.test is not None: evaluations.append((test_data, "test")) for data, dataset_name in evaluations: logging.info( "Evaluating best model on %s data using beam search " "(beam width %d)...", dataset_name, args.beam_width, ) with utils.Timer(): greedy_decoding = decode(transducer_, data) utils.write_results( greedy_decoding.accuracy, greedy_decoding.predictions, args.output, args.nfd, dataset_name, dargs=dargs, ) with utils.Timer(): beam_decoding = decode(transducer_, data, args.beam_width) utils.write_results( beam_decoding.accuracy, beam_decoding.predictions, args.output, args.nfd, dataset_name, args.beam_width, dargs=dargs, )