def run(ModelClass, output_dir, pipeline_options, model_options):
    """
    Implements the main logic of the training module.

    Instantiates the datasets and the model class, sets their attributes
    according to the received pipeline options, then loads or creates a
    trainer and runs it.

    Args:
        ModelClass (Model): class of the model to train
        output_dir: directory in which to save trained models
        pipeline_options (Namespace): generic training options
            load_model: load a pre-trained predictor model
            resume: load the trainer state and resume training
            gpu_id: set to a non-negative integer to train on that GPU
            train_batch_size: batch size for training
            valid_batch_size: batch size for validation

        model_options (Namespace): model-specific options

    Returns:
        The trainer object
    """
    model_name = getattr(ModelClass, "title", ModelClass.__name__)
    logger.info("Training the {} model".format(model_name))
    # FIXME: make sure all places use output_dir
    # del pipeline_options.output_dir
    pipeline_options.output_dir = None

    # Data step
    fieldset = ModelClass.fieldset(
        wmt18_format=model_options.__dict__.get("wmt18_format"))

    datasets = retrieve_datasets(fieldset, pipeline_options, model_options,
                                 output_dir)
    save_vocabularies_from_datasets(output_dir, *datasets)
    if pipeline_options.save_data:
        save_training_datasets(pipeline_options.save_data, *datasets)

    # Trainer step
    device_id = None
    if pipeline_options.gpu_id is not None and pipeline_options.gpu_id >= 0:
        device_id = pipeline_options.gpu_id

    vocabs = utils.fields_to_vocabs(datasets[0].fields)

    trainer = retrieve_trainer(
        ModelClass,
        pipeline_options,
        model_options,
        vocabs,
        output_dir,
        device_id,
    )

    logger.info(str(trainer.model))
    logger.info("{} parameters".format(trainer.model.num_parameters()))

    # Dataset iterators
    train_iter = build_bucket_iterator(
        datasets[0],
        batch_size=pipeline_options.train_batch_size,
        is_train=True,
        device=device_id,
    )
    valid_iter = build_bucket_iterator(
        datasets[1],
        batch_size=pipeline_options.valid_batch_size,
        is_train=False,
        device=device_id,
    )

    trainer.run(train_iter, valid_iter, epochs=pipeline_options.epochs)

    return trainer
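
A minimal invocation sketch (not part of the original project): MyModel, the
output directory and every option value below are hypothetical placeholders,
and helper functions such as retrieve_datasets and retrieve_trainer may read
additional fields that are not shown here.

from argparse import Namespace

# Fields documented in run()'s docstring plus the ones it reads directly.
pipeline_options = Namespace(
    load_model=None,        # optional path to a pre-trained predictor model
    resume=False,           # resume training from a saved trainer state
    gpu_id=0,               # non-negative integer selects a GPU; None for CPU
    train_batch_size=64,
    valid_batch_size=64,
    save_data=None,         # optionally persist the processed datasets
    epochs=10,
)
model_options = Namespace(wmt18_format=False)

# MyModel stands in for a concrete Model subclass from the project.
trainer = run(MyModel, 'runs/example', pipeline_options, model_options)
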
Example #2
def run(ModelClass, output_dir, pipeline_options, model_options, splits):
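    """
    Jackknifes the training data with the given model.

    Splits the training set into the given number of folds. For every fold,
    trains a model on the remaining folds and uses it to predict the held-out
    fold, the dev set and, when available, the test set. The held-out
    predictions are concatenated so that every training example receives an
    out-of-sample prediction; dev and test predictions are averaged across
    folds. All predictions are saved under output_dir.

    Args:
        ModelClass (Model): class of the model to train
        output_dir: directory to save the per-fold runs and the predictions
        pipeline_options (Namespace): generic training options
        model_options (Namespace): model-specific options
        splits (int): number of jackknife folds

    Returns:
        A dict mapping each output name to its out-of-sample predictions
        over the full training set.
    """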
    model_name = getattr(ModelClass, 'title', ModelClass.__name__)
    logger.info('Jackknifing with the {} model'.format(model_name))

    # Data
    fieldset = ModelClass.fieldset(
        wmt18_format=model_options.__dict__.get('wmt18_format'))
    train_set, dev_set = train.retrieve_datasets(fieldset, pipeline_options,
                                                 model_options, output_dir)

    test_set = None
    try:
        test_set = build_test_dataset(fieldset, **vars(pipeline_options))
    except (ValueError, FileNotFoundError):
        # The test set is optional; skip it when it is not configured or its
        # files cannot be found.
        pass

    device_id = None
    if pipeline_options.gpu_id is not None and pipeline_options.gpu_id >= 0:
        device_id = pipeline_options.gpu_id

    parent_dir = output_dir
    train_predictions = defaultdict(list)
    dev_predictions = defaultdict(list)
    test_predictions = defaultdict(list)
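    # Jackknife loop: each round trains on all folds but one and predicts the
    # held-out fold, so every training example ends up with an out-of-sample
    # prediction; dev/test predictions are collected per fold for averaging.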
    split_datasets = cross_split_dataset(train_set, splits)
    for i, (train_fold, pred_fold) in enumerate(split_datasets):
        run_name = 'train_split_{}'.format(i)
        output_dir = Path(parent_dir, run_name)
        output_dir.mkdir(parents=True, exist_ok=True)
        # options.output_dir = str(options.output_dir)

        # Train
        vocabs = utils.fields_to_vocabs(train_fold.fields)

        tracking_run = tracking_logger.start_nested_run(run_name=run_name)
        with tracking_run:
            train.setup(
                output_dir=output_dir,
                seed=pipeline_options.seed,
                gpu_id=pipeline_options.gpu_id,
                debug=pipeline_options.debug,
                quiet=pipeline_options.quiet,
            )

            trainer = train.retrieve_trainer(
                ModelClass,
                pipeline_options,
                model_options,
                vocabs,
                output_dir,
                device_id,
            )

            # Dataset iterators
            train_iter = build_bucket_iterator(
                train_fold,
                batch_size=pipeline_options.train_batch_size,
                is_train=True,
                device=device_id,
            )
            valid_iter = build_bucket_iterator(
                pred_fold,
                batch_size=pipeline_options.valid_batch_size,
                is_train=False,
                device=device_id,
            )

            trainer.run(train_iter, valid_iter, epochs=pipeline_options.epochs)

        # Predict
        predictor = load_model(trainer.checkpointer.best_model_path())
        train_predictions_i = predictor.run(
            pred_fold, batch_size=pipeline_options.valid_batch_size)

        dev_predictions_i = predictor.run(
            dev_set, batch_size=pipeline_options.valid_batch_size)

        test_predictions_i = None
        if test_set:
            test_predictions_i = predictor.run(
                test_set, batch_size=pipeline_options.valid_batch_size)

        torch.cuda.empty_cache()

        for output_name in train_predictions_i:
            train_predictions[output_name] += train_predictions_i[output_name]
            dev_predictions[output_name].append(dev_predictions_i[output_name])
            if test_set:
                test_predictions[output_name].append(
                    test_predictions_i[output_name])

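    # The dev (and test) set was predicted once per fold; average those
    # per-fold predictions into a single ensemble prediction.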
    dev_predictions = average_all(dev_predictions)
    if test_set:
        test_predictions = average_all(test_predictions)

    save_predicted_probabilities(parent_dir,
                                 train_predictions,
                                 prefix=const.TRAIN)
    save_predicted_probabilities(parent_dir, dev_predictions, prefix=const.DEV)
    if test_set:
        save_predicted_probabilities(parent_dir,
                                     test_predictions,
                                     prefix=const.TEST)

    teardown(pipeline_options)

    return train_predictions
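
For intuition only, a hypothetical stand-in for average_all, assuming each
output name maps to one prediction list per fold and that averaging is
element-wise over the folds; the real implementation lives elsewhere in the
project and may handle nested, per-token predictions differently.

from collections import defaultdict

def average_folds(predictions):
    # Hypothetical sketch: average the per-fold prediction lists element-wise.
    averaged = {}
    for name, fold_lists in predictions.items():
        num_folds = len(fold_lists)
        averaged[name] = [sum(values) / num_folds for values in zip(*fold_lists)]
    return averaged

# Two folds, each predicting the same three dev examples.
preds = defaultdict(list)
preds['scores'].append([0.25, 0.75, 0.5])
preds['scores'].append([0.5, 0.25, 0.75])
print(average_folds(preds))  # {'scores': [0.375, 0.5, 0.625]}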