Example #1
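For reference, this example relies on the imports below; data_utils, model_utils and models are project-local modules from the repository this snippet comes from, assumed here to be importable as top-level modules.

import os

import numpy as np

import data_utils
import model_utils
import models
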
def run_training(settings, data, vocabulary_idx_to_word,
                 vocabulary_word_to_idx, logger, use_cuda):

    reproduction_command = 'python main.py ' + '-c ' + os.path.join(
        logger.log_dir, logger.run_name + '.ini')
    logger.shout(reproduction_command)
    logger.log('# ' + reproduction_command)
    logger.log(
        'epoch\titeration\tfold\ttrain_loss\ttrain_acc\ttrain_macro_f1\ttrain_macro_f1_main\ttrain_total\tval_loss\tval_acc\tval_macro_f1\tval_macro_f1_main\tval_total\tmodel'
    )

    input_vecs, targets = data_utils.create_input_vectors(
        data, vocabulary_idx_to_word, vocabulary_word_to_idx)

    # Compute the class weights if necessary
    if settings.training.class_weights:
        class_weights = np.bincount(targets[targets != -1],
                                    minlength=settings.model.num_entities)
        class_weights = 1.0 / (
            np.sqrt(class_weights) + 1e-6
        )  # 1e-6 for numerical stability (though the inf values wouldn't be used anyway)
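        # Hypothetical example: bincounts [100, 25, 4] give weights of roughly
        # [0.1, 0.2, 0.5], so frequent classes are down-weighted relative to rare ones.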
        settings.training.class_weights = class_weights
    else:
        settings.training.class_weights = None

    fold_indices = range(settings.data.folds)
    if settings.data.folds > 1:
        folds = data_utils.get_cv_folds(data, settings.data, logger)
    else:
        # No cross-validation:
        train_sequence_bounds = data_utils.get_sequence_bounds(
            data, settings.data.level)
        validation_sequence_bounds = []

    for fold_idx in fold_indices:
        # For bookkeeping (logging five folds in one file):
        logger.fold_idx = fold_idx

        # Select training and (if cross-validation) validation data:
        if settings.data.folds > 1:
            train_sequence_bounds = np.concatenate(
                tuple(folds[:fold_idx] + folds[fold_idx + 1:]))
            validation_sequence_bounds = folds[fold_idx]
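            # E.g. with settings.data.folds == 5 and fold_idx == 2, folds 0, 1, 3
            # and 4 are concatenated for training and fold 2 is held out.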

        # Initialise model
        model = models.LSTM_basic(settings.model,
                                  padding_idx=data_utils.DUMMY_ENTITY_IDX)
        if use_cuda:
            model.cuda()

        # Train the model
        last_model, best_model = model_utils.train(
            model, input_vecs, targets, train_sequence_bounds,
            validation_sequence_bounds, settings.training,
            settings.training.no_shuffle, logger)

        # Save the best model through the logger
        logger.save_model(best_model)
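
A minimal invocation sketch for run_training. This is only a sketch: load_settings, Logger and load_corpus below are hypothetical placeholders for the project's own configuration, logging and data-loading code, which is not shown in these examples.

settings = load_settings('configs/example.ini')        # hypothetical config loader
logger = Logger(log_dir='logs', run_name='example')     # hypothetical logger
data, vocabulary_idx_to_word, vocabulary_word_to_idx = load_corpus('data/train')  # hypothetical

run_training(settings, data, vocabulary_idx_to_word,
             vocabulary_word_to_idx, logger, use_cuda=False)
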
Example #2
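This example additionally needs torch for loading the saved checkpoints; as above, data_utils, model_utils and models are assumed to be importable project-local modules.

import torch

import data_utils
import model_utils
import models
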
def run_deploy(model_path, settings, data_path, vocabulary_idx_to_word,
               vocabulary_word_to_idx, entity_name_to_idx, answers_per_fold,
               no_cv, logger, use_cuda):

    data, keys_in_data = data_utils.load_data(data_path, entity_name_to_idx, logger=logger)
    input_vecs, targets = data_utils.create_input_vectors(
        data, vocabulary_idx_to_word, vocabulary_word_to_idx)

    # Load all models from model_path:
    model_list = []
    for path in model_path:
        model_fold = models.LSTM_basic(settings.model, padding_idx=data_utils.DUMMY_ENTITY_IDX)
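        # map_location returns each storage unchanged, i.e. the weights are loaded
        # onto the CPU even if the checkpoint was saved from a GPU run; the model
        # is only moved to the GPU below when use_cuda is set.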
        model_fold.load_state_dict(torch.load(path, map_location=lambda storage, loc: storage))
        if use_cuda:
            model_fold.cuda()
        model_list.append(model_fold)
        logger.whisper('Loaded model from ' + path)

    if no_cv:   # To deploy all models as ensemble to all the data
        test_sequence_bounds = data_utils.get_sequence_bounds(data, settings.data.level)

        collect_ensembles_preds = False     # TODO @Carina I vaguely recall this being used in the final rush before the deadline. Remove (also in model_utils)? See also the TODO in write_answers().

        predictions_zipped, _ = model_utils.get_indexed_predictions_with_targets(
            model_list, input_vecs, targets, test_sequence_bounds, use_cuda,
            collect_ensembles_preds=collect_ensembles_preds)

        # Write answers through logger
        answers_path = logger.write_answers_csv(data_path, predictions_zipped,
                                                model_suffix="--ensemble", config=settings.orig)

        # Optionally also per individual model (i.e., each model trained on one fold):
        if answers_per_fold:
            for i, model in enumerate(model_list):
                predictions_zipped, _ = model_utils.get_indexed_predictions_with_targets(
                    model, input_vecs, targets, test_sequence_bounds, use_cuda)
                logger.write_answers_csv(data_path, predictions_zipped, model_suffix='--fold'+str(i))

    else:   # To deploy per fold of the data
        results = []
        folds = data_utils.get_cv_folds(data, settings.data, logger)
        for fold_idx in range(settings.data.folds):

            # Obtain predictions for this fold:
            predictions_zipped, _ = model_utils.get_indexed_predictions_with_targets(
                model_list[fold_idx], input_vecs, targets, folds[fold_idx], use_cuda)

            # Optionally write answers for this one fold
            if answers_per_fold:
                logger.write_answers_csv(settings.data.dataset + "--fold" + str(fold_idx), predictions_zipped,
                                         model_suffix="--fold" + str(fold_idx), config=settings.orig)

            # But also store them, to be merged and sorted later, for writing merged answers
            results.extend(predictions_zipped)

        # Write answers merged over all folds through logger
        results.sort()
        answers_path = logger.write_answers_csv(settings.data.dataset, results,
                                                model_suffix="--cv", config=settings.orig)

    return answers_path, keys_in_data
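
A hedged sketch of the two deployment modes of run_deploy. The checkpoint paths, data paths and the settings/logger/vocabulary objects below are placeholders for project-specific setup not shown above, and the cross-validation mode assumes model_path lists the checkpoints in the same order as the folds.

model_paths = ['logs/run--fold0.pt', 'logs/run--fold1.pt']   # hypothetical checkpoint paths

# Ensemble mode: all loaded models predict jointly on all of the data.
run_deploy(model_paths, settings, 'data/test.json',
           vocabulary_idx_to_word, vocabulary_word_to_idx, entity_name_to_idx,
           answers_per_fold=False, no_cv=True, logger=logger, use_cuda=False)

# Cross-validation mode: model i predicts only on its held-out fold i, and the
# per-fold answers are merged, sorted and written as one CSV.
run_deploy(model_paths, settings, 'data/train.json',
           vocabulary_idx_to_word, vocabulary_word_to_idx, entity_name_to_idx,
           answers_per_fold=False, no_cv=False, logger=logger, use_cuda=False)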