Example #1
def local_learn_model(x_all, targets_all: Targets, config):

    model = None
    if config.multicubist or config.multirandomforest:
        y = targets_all.observations
        weights = targets_all.weights
        model = all_modelmaps[config.algorithm](**config.algorithm_args)
        apply_multiple_masked(model.fit, (x_all, y),
                              fields=targets_all.fields,
                              parallel=True,
                              sample_weight=weights,
                              lon_lat=targets_all.positions)
    else:
        if mpiops.chunk_index == 0:
            y = targets_all.observations
            weights = targets_all.weights
            model = all_modelmaps[config.algorithm](**config.algorithm_args)
            apply_multiple_masked(model.fit, (x_all, y),
                                  fields=targets_all.fields,
                                  sample_weight=weights,
                                  lon_lat=targets_all.positions)
    return model
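
For context, a minimal sketch of the rank-0 training pattern in the else branch above, assuming mpi4py (which the mpiops module appears to wrap); the broadcast at the end is an illustrative assumption, since the snippet above instead leaves non-root ranks holding None:

from mpi4py import MPI

comm = MPI.COMM_WORLD

def train_on_root(make_model, X, y):
    # Only rank 0 (mpiops.chunk_index == 0 above) builds and fits the model
    model = None
    if comm.rank == 0:
        model = make_model()
        model.fit(X, y)
    # Broadcast so every rank holds the trained model (an assumption here;
    # the snippet above does not do this)
    return comm.bcast(model, root=0)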
Example #2
def calculate_validation_scores(ys, yt, eys):
    """ Calculates the validation scores for a prediction
    Given the test and training data, as well as the outputs from every model,
    this function calculates all of the applicable metrics in the following
    list, and returns a dictionary with the following (possible) keys:
        + r2_score
        + expvar
        + smse
        + lins_ccc
        + mll
        + msll

    Parameters
    ----------
    ys: numpy.array
        The test data outputs
    yt: numpy.array
        The training data outputs (targets), used to standardise the msll
        score
    eys: numpy.array
        The predictions made by the trained model on test data

    Returns
    -------
    scores: dict
        A dictionary containing all of the evaluated scores.
    """

    probscores = ['msll', 'mll']

    scores = {}

    # cubist can predict nan when a categorical variable is not
    # present in the training data
    # TODO: can be removed for everything except cubist
    not_nan = ~np.isnan(eys[:, 0])
    ys = ys[not_nan]
    eys = eys[:, 0][not_nan]

    for m in metrics:

        if m not in probscores:
            score = apply_multiple_masked(score_first_dim(metrics[m]),
                                          (ys, eys))
        elif eys.ndim == 2:
            if m == 'mll' and eys.shape[1] > 1:
                score = apply_multiple_masked(mll, (ys, eys[:, 0], eys[:, 1]))
            elif m == 'msll' and eys.shape[1] > 1:
                score = apply_multiple_masked(msll, (ys, eys[:, 0], eys[:, 1]),
                                              (yt, ))
            else:
                continue
        else:
            continue

        scores[m] = score
    return scores
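
A hypothetical usage sketch, assuming this function's module-level dependencies (the metrics dict, apply_multiple_masked and score_first_dim) are in scope; the shapes are the real constraint here, with the second column of eys holding predictive variances:

import numpy as np

ys = np.random.randn(100)                                # held-out targets
yt = np.random.randn(300)                                # training targets, used by msll
eys = np.column_stack([ys + 0.1 * np.random.randn(100),  # predictions
                       np.full(100, 0.01)])              # predictive variances
scores = calculate_validation_scores(ys, yt, eys)
print(scores)  # e.g. {'r2_score': ..., 'mll': ..., 'msll': ...}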
Example #3
def test_apply_multiple_masked(masked_data):
    yt, Xt, ys, Xs = masked_data
    yt_masked = np.ma.masked_array(yt, mask=Xt.mask.flatten())

    def fit(X, y):
        assert np.allclose(X, Xt.data[~Xt.mask.flatten()])
        assert np.allclose(y, yt_masked.data[~yt_masked.mask.flatten()])
        return

    def predict(X, y):
        return y

    yr = apply_multiple_masked(predict, (Xt, yt_masked))
    assert np.ma.all(yt_masked == yr)
    assert apply_multiple_masked(fit, (Xt, yt_masked)) is None
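
The test above pins down the contract of apply_multiple_masked: union the masks of all inputs, drop those rows, and call the function on the clean data. A minimal stand-in capturing that behaviour (an illustrative sketch, not the library's implementation):

import numpy as np

def apply_masked_sketch(func, data, **kwargs):
    # A row is invalid if it is masked in any input; 2D covariate masks
    # are collapsed row-wise
    n = len(data[0])
    bad = np.zeros(n, dtype=bool)
    for a in data:
        mask = np.ma.getmaskarray(np.ma.asarray(a))
        bad |= mask.reshape(n, -1).any(axis=1)
    clean = [np.ma.getdata(np.ma.asarray(a))[~bad] for a in data]
    return func(*clean, **kwargs)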
Example #4
def permutation_importance(model, x_all, targets_all, config):
    _logger.info("Computing permutation importance!!")
    if config.algorithm not in transformed_modelmaps.keys():
        raise AttributeError("Only the following can be used for permutation "
                             "importance {}".format(
            list(transformed_modelmaps.keys())))

    y = targets_all.observations

    classification = hasattr(model, 'predict_proba')

    if not classification:
        for score in ['explained_variance',
                      'r2',
                      'neg_mean_absolute_error',
                      'neg_mean_squared_error']:
            pi_cv = apply_multiple_masked(
                PermutationImportance(model, scoring=score,
                                      cv='prefit', n_iter=10,
                                      refit=False).fit, data=(x_all, y)
            )
            feature_names = geoio.feature_names(config)
            df_picv = eli5.explain_weights_df(
                pi_cv, feature_names=feature_names, top=100)
            csv = Path(config.output_dir).joinpath(
                config.name + "_permutation_importance_{}.csv".format(
                    score)).as_posix()
            df_picv.to_csv(csv, index=False)
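
A self-contained sketch of the same eli5 calls with a plain scikit-learn model (assumes eli5 and scikit-learn are installed; the toy data and feature names are made up for illustration):

import numpy as np
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.linear_model import LinearRegression

X = np.random.randn(200, 3)
y = X @ np.array([1.0, 0.5, 0.0]) + 0.1 * np.random.randn(200)

model = LinearRegression().fit(X, y)
pi = PermutationImportance(model, scoring='r2', cv='prefit',
                           n_iter=10, refit=False).fit(X, y)
print(eli5.explain_weights_df(pi, feature_names=['a', 'b', 'c'], top=100))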
Example #5
def classification_validation_scores(ys, eys, pys):
    """ Calculates the validation scores for a regression prediction
    Given the test and training data, as well as the outputs from every model,
    this function calculates all of the applicable metrics in the following
    list, and returns a dictionary with the following (possible) keys:
        + accuracy
        + log_loss
        + f1

    Parameters
    ----------
    ys: numpy.array
        The test data outputs, one-hot representation
    eys: numpy.array
        The (hard) predictions made by the trained model on test data, one-hot
        representation
    pys: numpy.array
        The probabilistic predictions made by the trained model on test data

    Returns
    -------
    scores: dict
        A dictionary containing all of the evaluated scores.
    """
    scores = {}
    # clip in case we get hard 0/1 probabilities, which would make the log
    # in log_loss blow up
    pys = np.minimum(np.maximum(pys, MINPROB), 1. - MINPROB)

    for k, m in classification_metrics.items():
        scores[k] = apply_multiple_masked(m, (ys, eys, pys))

    return scores
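
The clipping line is what keeps log_loss finite when a model emits hard 0/1 probabilities. A standalone demo (the MINPROB value here is an assumption; the module defines its own constant):

import numpy as np
from sklearn.metrics import log_loss

MINPROB = 1e-5
y_true = np.array([0, 1, 1])
pys = np.array([[1.0, 0.0], [0.0, 1.0], [0.2, 0.8]])  # hard probabilities
pys = np.minimum(np.maximum(pys, MINPROB), 1. - MINPROB)
print(log_loss(y_true, pys))  # finite thanks to the clip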
Example #6
def regression_validation_scores(y, ey, ws, model):
    """ Calculates the validation scores for a regression prediction
    Given the test and training data, as well as the outputs from every model,
    this function calculates all of the applicable metrics in the following
    list, and returns a dictionary with the following (possible) keys:
        + r2_score
        + expvar
        + smse
        + lins_ccc
        + mll
        + msll

    Parameters
    ----------
    y: numpy.array
        The test data outputs
    ey: numpy.array
        The predictions made by the trained model on test data
    ws: numpy.array
        The weights of the test data

    Returns
    -------
    scores: dict
        A dictionary containing all of the evaluated scores.
    """
    scores = {}

    result_tags = model.get_predict_tags()

    if 'Variance' in result_tags:
        py, vy = ey[:, 0], ey[:, 1]
    else:
        py, vy = ey[:, 0], ey[:, 0]
        # don't calculate mll when variance is not available
        regression_metrics.pop('mll', None)
        transformed_regression_metrics.pop('mll_transformed', None)

    if hasattr(model, '_notransform_predict') and \
            not isinstance(model.target_transform, Identity):
        # this is a transformed model
        y_t = model.target_transform.transform(y)  # transformed targets
        py_t = model.target_transform.transform(py)  # transformed prediction

        regression_metrics.update(transformed_regression_metrics)

        if 'Variance' in result_tags:
            # transformed standard dev
            v_t = model.target_transform.transform(np.sqrt(vy))
            vy_t = np.square(v_t)  # transformed variances
        else:
            vy_t = py
    else:  # don't calculate if Transformed Prediction is not available
        y_t = y
        py_t = py
        vy_t = py

    for k, m in regression_metrics.items():
        scores[k] = apply_multiple_masked(m, (y, py, vy, ws, y_t, py_t, vy_t))

    return scores
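
Note the variance handling above: the target transform is applied to the standard deviation and then squared back into a variance, because the transform is defined on the target scale. A standalone sketch with np.log standing in for model.target_transform.transform (an assumption for illustration):

import numpy as np

vy = np.array([0.04, 0.09])   # predictive variances
sd_t = np.log(np.sqrt(vy))    # transform the standard deviation
vy_t = np.square(sd_t)        # square back to a (transformed) variance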
Example #7
def local_learn_model(x_all, targets_all, config):

    model = None
    if config.multicubist or config.multirandomforest:
        y = targets_all.observations
        model = all_modelmaps[config.algorithm](**config.algorithm_args)
        apply_multiple_masked(model.fit, (x_all, y),
                              kwargs={
                                  'fields': targets_all.fields,
                                  'parallel': True,
                                  'lon_lat': targets_all.positions
                              })
        if config.multirandomforest:
            rf_dicts = model._randomforests
            rf_dicts = mpiops.comm.gather(rf_dicts, root=0)
            mpiops.comm.barrier()
            if mpiops.chunk_index == 0:
                for rf in rf_dicts:
                    model._randomforests.update(rf)
    else:
        if mpiops.chunk_index == 0:
            y = targets_all.observations
            model = all_modelmaps[config.algorithm](**config.algorithm_args)
            apply_multiple_masked(model.fit, (x_all, y),
                                  kwargs={
                                      'fields': targets_all.fields,
                                      'lon_lat': targets_all.positions
                                  })

    # Save transformed targets for diagnostics
    if mpiops.chunk_index == 0 and hasattr(model, 'target_transform'):
        hdr = 'nontransformed,transformed'
        y = targets_all.observations
        y_t = model.target_transform.transform(y)
        np.savetxt(config.transformed_targets_file,
                   X=np.column_stack((y, y_t)),
                   delimiter=',',
                   header=hdr,
                   fmt='%.4e')

        if config.plot_target_scaling:
            diagnostics.plot_target_scaling(
                config.transformed_targets_file).savefig(
                    config.plot_target_scaling)

    return model
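
The multirandomforest branch gathers each worker's forests onto rank 0 and merges them into one dict. A minimal sketch of that pattern, assuming mpi4py:

from mpi4py import MPI

comm = MPI.COMM_WORLD
local_forests = {comm.rank: 'forest-{}'.format(comm.rank)}
gathered = comm.gather(local_forests, root=0)  # list of dicts on root, None elsewhere
comm.barrier()
if comm.rank == 0:
    merged = {}
    for d in gathered:
        merged.update(d)  # mirrors model._randomforests.update(rf)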
Example #8
def y_y_plot(y1,
             y2,
             y_label=None,
             y_exp_label=None,
             title=None,
             outfile=None,
             display=None):
    """ Makes a y-y plot from two corresponding vectors
    This function makes a y-y plot given two y vectors (y1, y2). This plot can
    be used to evaluate the performance of the machine learning models.

    Parameters
    ----------
    y1: numpy.array
        The first input vector
    y2: numpy.array
        The second input vector, of the same size as y1
    y_label: string
        The axis label for the first vector
    y_exp_label: string
        The axis label for the second vector
    title: string
        The plot title
    outfile: string
        The location to save an image of the plot
    display: boolean
        If True, the plot is shown in a matplotlib window; note that this
        pauses execution of the main program until the window is closed.
    """

    fig = pl.figure()
    maxy = max(y1.max(), get_first_dim(y2).max())
    miny = min(y1.min(), get_first_dim(y2).min())
    apply_multiple_masked(pl.plot, (y1, get_first_dim(y2)), ('k.', ))
    pl.plot([miny, maxy], [miny, maxy], 'r')
    pl.grid(True)
    pl.xlabel(y_label)
    pl.ylabel(y_exp_label)
    pl.title(title)
    if outfile is not None:
        fig.savefig(outfile + ".png")
    if display:
        pl.show()
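
A hypothetical usage with synthetic data (assumes this module's imports, in particular matplotlib.pyplot as pl, and writes yy.png to the working directory):

import numpy as np

y1 = np.random.randn(100)
y2 = y1 + 0.1 * np.random.randn(100)
y_y_plot(y1, y2, y_label='observed', y_exp_label='predicted',
         title='y-y plot', outfile='yy', display=False)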
Example #9
def local_crossval(x_all, targets_all, config):
    """ Performs K-fold cross validation to test the applicability of a model.
    Given a set of inputs and outputs, this function will evaluate the
    effectiveness of a model at predicting the targets, by splitting all of
    the known data. A model is trained on a subset of the total data, and then
    this model is used to predict all of the unseen targets, its performance
    can provide a benchmark to evaluate the effectiveness of a model.

    Parameters
    ----------
    x_all: numpy.array
        A 2D array containing all of the training inputs
    targets_all: Targets
        The targets object containing all of the training outputs and
        associated metadata
    config: Config
        The global config object, which is used to choose the model to train.

    Returns
    -------
    result: CrossvalInfo
        An object containing all of the cross validation metrics, evaluated
        on the unseen data subset.
    """
    parallel_model = config.multicubist or config.multirandomforest or config.bootstrap
    if config.bootstrap and config.parallel_validate:
        config.algorithm_args['parallel'] = False
    elif not config.bootstrap and not config.parallel_validate and mpiops.chunk_index != 0:
        return

    if config.multicubist or config.multirandomforest:
        config.algorithm_args['parallel'] = False

    _logger.info("Validating with {} folds".format(config.folds))
    model = modelmaps[config.algorithm](**config.algorithm_args)
    classification = hasattr(model, 'predict_proba')
    y = targets_all.observations
    lon_lat = targets_all.positions
    _, cv_indices = split_cfold(y.shape[0], config.folds, config.crossval_seed)

    # Split folds over workers
    fold_list = np.arange(config.folds)
    if config.parallel_validate:
        fold_node = \
            np.array_split(fold_list, mpiops.chunks)[mpiops.chunk_index]
    else:
        fold_node = fold_list

    y_pred = {}
    y_true = {}
    fold_scores = {}
    pos = {}

    # Train and score on each fold
    for fold in fold_node:
        _logger.info(":mpi:Training fold {} of {}".format(
            fold + 1, config.folds, mpiops.chunk_index))

        train_mask = cv_indices != fold
        test_mask = ~train_mask

        y_k_train = y[train_mask]
        if config.target_weight_property:
            y_k_weight = targets_all.fields[config.target_weight_property][train_mask]
        else:
            y_k_weight = None
        lon_lat_train = lon_lat[train_mask]
        lon_lat_test = lon_lat[test_mask]

        # Extra fields
        fields_train = {f: v[train_mask]
                        for f, v in targets_all.fields.items()}
        fields_pred = {f: v[test_mask] for f, v in targets_all.fields.items()}

        # Train on this fold
        x_train = x_all[train_mask]
        apply_multiple_masked(model.fit, data=(x_train, y_k_train),
                              fields=fields_train, lon_lat=lon_lat_train,
                              sample_weight=y_k_weight)

        # Testing: when validation is not parallelised, only rank 0 scores
        if not config.parallel_validate and mpiops.chunk_index != 0:
            continue

        y_k_pred = predict.predict(x_all[test_mask], model,
                                   fields=fields_pred,
                                   lon_lat=lon_lat_test)
        y_pred[fold] = y_k_pred
        n_covariates = x_all[test_mask].shape[1]

        # Regression
        if not classification:
            y_k_test = y[test_mask]
            fold_scores[fold] = regression_validation_scores(
                y_k_test, y_k_pred, n_covariates, model)
        # Classification
        else:
            y_k_test = model.le.transform(y[test_mask])
            y_k_hard, p_k = y_k_pred[:, 0], y_k_pred[:, 1:]
            fold_scores[fold] = classification_validation_scores(
                y_k_test, y_k_hard, p_k)

        y_true[fold] = y_k_test
        pos[fold] = lon_lat_test

    if config.parallel_validate:
        y_pred = _join_dicts(mpiops.comm.gather(y_pred, root=0))
        y_true = _join_dicts(mpiops.comm.gather(y_true, root=0))
        pos = _join_dicts(mpiops.comm.gather(pos, root=0))
        scores = _join_dicts(mpiops.comm.gather(fold_scores, root=0))
    else:
        scores = fold_scores

    result = None
    if mpiops.chunk_index == 0:
        y_true = np.concatenate([y_true[i] for i in range(config.folds)])
        y_pred = np.concatenate([y_pred[i] for i in range(config.folds)])
        pos = np.concatenate([pos[i] for i in range(config.folds)])
        valid_metrics = scores[0].keys()
        scores = {m: np.mean([d[m] for d in scores.values()], axis=0)
                  for m in valid_metrics}
        score_string = "Validation complete:\n"
        for metric, score in scores.items():
            score_string += "{}\t= {}\n".format(metric, score)
        _logger.info(score_string)

        result_tags = model.get_predict_tags()
        y_pred_dict = dict(zip(result_tags, y_pred.T))
        if hasattr(model, '_notransform_predict'):
            y_pred_dict['transformedpredict'] = \
                model.target_transform.transform(y_pred[:, 0])
        result = CrossvalInfo(scores, y_true, y_pred_dict, classification, pos)

    if parallel_model:
        config.algorithm_args['parallel'] = True

    return result
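
The fold-over-workers split relies on np.array_split handing each rank a contiguous slice of the fold indices. A standalone demo with 4 folds over 3 ranks:

import numpy as np

folds, chunks = 4, 3
for rank in range(chunks):
    print(rank, np.array_split(np.arange(folds), chunks)[rank])
# 0 [0 1]
# 1 [2]
# 2 [3]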
Example #10
def local_crossval(x_all, targets_all: targ.Targets, config: Config):
    """ Performs K-fold cross validation to test the applicability of a model.
    Given a set of inputs and outputs, this function will evaluate the
    effectiveness of a model at predicting the targets, by splitting all of
    the known data. A model is trained on a subset of the total data, and then
    this model is used to predict all of the unseen targets, its performance
    can provide a benchmark to evaluate the effectiveness of a model.

    Parameters
    ----------
    x_all: numpy.array
        A 2D array containing all of the training inputs
    targets_all: Targets
        The targets object containing all of the training outputs and
        associated metadata
    config: Config
        The global config object, which is used to choose the model to train.

    Returns
    -------
    result: CrossvalInfo
        An object containing all of the cross validation metrics, evaluated
        on the unseen data subset.
    """
    # run cross validation in parallel, but one thread for each fold
    if config.multicubist or config.multirandomforest:
        config.algorithm_args['parallel'] = False

    if (mpiops.chunk_index != 0) and (not config.parallel_validate):
        return

    log.info("Validating with {} folds".format(config.folds))
    model = modelmaps[config.algorithm](**config.algorithm_args)
    classification = hasattr(model, 'predict_proba')
    groups = targets_all.groups

    if config.group_targets and len(np.unique(groups)) < config.folds:
        raise ValueError(f"Cannot continue cross-validation with the chosen parameters: the number "
                         f"of groups ({len(np.unique(groups))}) in the data is less than the number "
                         f"of folds ({config.folds})")
    random_state = config.algorithm_args.get('random_state',
                                             np.random.randint(1000))
    x_all, y, lon_lat, groups, w, cv = setup_validation_data(x_all, targets_all, config.folds, random_state)
    _, cv_indices = split_gfold(groups, cv)

    # Split folds over workers
    fold_list = np.arange(config.folds)
    if config.parallel_validate:
        fold_node = np.array_split(fold_list, mpiops.chunks)[mpiops.chunk_index]
    else:
        fold_node = fold_list

    y_pred = {}
    y_true = {}
    weight = {}
    lon_lat_ = {}
    fold_scores = {}
    # Train and score on each fold
    for fold in fold_node:
        model = modelmaps[config.algorithm](**config.algorithm_args)

        print("Training fold {} of {} using process {}".format(
            fold + 1, config.folds, mpiops.chunk_index))
        train_mask = cv_indices != fold
        test_mask = ~ train_mask

        y_k_train = y[train_mask]
        w_k_train = w[train_mask]
        lon_lat_train = lon_lat[train_mask, :]
        lon_lat_test = lon_lat[test_mask, :]

        # Extra fields
        fields_train = {f: v[train_mask]
                        for f, v in targets_all.fields.items()}
        fields_pred = {f: v[test_mask] for f, v in targets_all.fields.items()}

        # Train on this fold
        x_train = x_all[train_mask]
        apply_multiple_masked(model.fit, data=(x_train, y_k_train),
                              fields=fields_train,
                              sample_weight=w_k_train,
                              lon_lat=lon_lat_train)

        # Testing
        y_k_pred = predict.predict(x_all[test_mask], model,
                                   fields=fields_pred,
                                   lon_lat=lon_lat_test)

        y_pred[fold] = y_k_pred

        # Regression
        if not classification:
            y_k_test = y[test_mask]
            y_true[fold] = y_k_test
            w_k_test = w[test_mask]
            weight[fold] = w_k_test
            lon_lat_[fold] = lon_lat_test
            fold_scores[fold] = regression_validation_scores(y_k_test, y_k_pred, w_k_test, model)

        # Classification
        else:
            y_k_test = model.le.transform(y[test_mask])
            y_true[fold] = y_k_test
            w_k_test = w[test_mask]
            weight[fold] = w_k_test
            lon_lat_[fold] = lon_lat_test
            y_k_hard, p_k = y_k_pred[:, 0], y_k_pred[:, 1:]
            fold_scores[fold] = classification_validation_scores(y_k_test, y_k_hard, w_k_test, p_k)

    if config.parallel_validate:
        y_pred = _join_dicts(mpiops.comm.gather(y_pred, root=0))
        lon_lat_ = _join_dicts(mpiops.comm.gather(lon_lat_, root=0))
        y_true = _join_dicts(mpiops.comm.gather(y_true, root=0))
        weight = _join_dicts(mpiops.comm.gather(weight, root=0))
        scores = _join_dicts(mpiops.comm.gather(fold_scores, root=0))
    else:
        scores = fold_scores

    result = None
    if mpiops.chunk_index == 0:
        y_true = np.concatenate([y_true[i] for i in range(config.folds)])
        weight = np.concatenate([weight[i] for i in range(config.folds)])
        lon_lat = np.concatenate([lon_lat_[i] for i in range(config.folds)])
        y_pred = np.concatenate([y_pred[i] for i in range(config.folds)])
        valid_metrics = scores[0].keys()
        scores = {m: np.mean([d[m] for d in scores.values()], axis=0)
                  for m in valid_metrics}
        score_string = "Validation complete:\n"
        for metric, score in scores.items():
            score_string += "{}\t= {}\n".format(metric, score)
        log.info(score_string)

        result_tags = model.get_predict_tags()
        y_pred_dict = dict(zip(result_tags, y_pred.T))
        if hasattr(model, '_notransform_predict'):
            y_pred_dict['transformedpredict'] = \
                model.target_transform.transform(y_pred[:, 0])
        result = CrossvalInfo(scores, y_true, y_pred_dict, weight, lon_lat, classification)

    # change back to parallel
    if config.multicubist or config.multirandomforest:
        config.algorithm_args['parallel'] = True

    return result
Example #11
def local_crossval(x_all, targets_all, config):
    """ Performs K-fold cross validation to test the applicability of a model.
    Given a set of inputs and outputs, this function will evaluate the
    effectiveness of a model at predicting the targets, by splitting all of
    the known data. A model is trained on a subset of the total data, and then
    this model is used to predict all of the unseen targets, its performance
    can provide a benchmark to evaluate the effectiveness of a model.

    Parameters
    ----------
    x_all: numpy.array
        A 2D array containing all of the training inputs
    targets_all: Targets
        The targets object containing all of the training outputs and
        associated metadata
    config: Config
        The global config object, which is used to choose the model to train.

    Returns
    -------
    result: CrossvalInfo
        An object containing all of the cross validation metrics, evaluated
        on the unseen data subset.
    """
    # run cross validation in parallel, but one thread for each fold
    if config.multicubist or config.multirandomforest:
        config.algorithm_args['parallel'] = False

    if (mpiops.chunk_index != 0) and (not config.parallel_validate):
        return

    log.info("Validating with {} folds".format(config.folds))
    model = modelmaps[config.algorithm](**config.algorithm_args)
    y = targets_all.observations
    lon_lat = targets_all.positions
    _, cv_indices = split_cfold(y.shape[0], config.folds, config.crossval_seed)

    # Split folds over workers
    fold_list = np.arange(config.folds)
    if config.parallel_validate:
        fold_node = np.array_split(fold_list,
                                   mpiops.chunks)[mpiops.chunk_index]
    else:
        fold_node = fold_list

    y_pred = {}
    y_true = {}
    fold_scores = {}

    # Train and score on each fold
    for fold in fold_node:

        print("Training fold {} of {} using process {}".format(
            fold + 1, config.folds, mpiops.chunk_index))
        train_mask = cv_indices != fold
        test_mask = ~train_mask

        y_k_train = y[train_mask]
        lon_lat_train = lon_lat[train_mask]
        lon_lat_test = lon_lat[test_mask]

        # Extra fields
        fields_train = {
            f: v[train_mask]
            for f, v in targets_all.fields.items()
        }
        fields_pred = {f: v[test_mask] for f, v in targets_all.fields.items()}

        # Train on this fold
        apply_multiple_masked(model.fit,
                              data=(x_all[train_mask], y_k_train),
                              kwargs={
                                  'fields': fields_train,
                                  'lon_lat': lon_lat_train
                              })

        # Testing
        y_k_pred = predict.predict(x_all[test_mask],
                                   model,
                                   fields=fields_pred,
                                   lon_lat=lon_lat_test)

        y_k_test = y[test_mask]
        y_pred[fold] = y_k_pred
        y_true[fold] = y_k_test

        fold_scores[fold] = calculate_validation_scores(
            y_k_test, y_k_train, y_k_pred)
    if config.parallel_validate:
        y_pred = _join_dicts(mpiops.comm.gather(y_pred, root=0))
        y_true = _join_dicts(mpiops.comm.gather(y_true, root=0))
        scores = _join_dicts(mpiops.comm.gather(fold_scores, root=0))
    else:
        scores = fold_scores

    result = None
    if mpiops.chunk_index == 0:
        y_true = np.concatenate([y_true[i] for i in range(config.folds)])
        y_pred = np.concatenate([y_pred[i] for i in range(config.folds)])
        valid_metrics = scores[0].keys()
        scores = {
            m: np.mean([d[m] for d in scores.values()])
            for m in valid_metrics
        }
        score_string = "Validation complete:\n"
        for metric, score in scores.items():
            score_string += "{}\t= {}\n".format(metric, score)
        log.info(score_string)

        result_tags = model.get_predict_tags()
        y_pred_dict = dict(zip(result_tags, y_pred.T))
        result = CrossvalInfo(scores, y_true, y_pred_dict)

    # change back to parallel
    if config.multicubist or config.multirandomforest:
        config.algorithm_args['parallel'] = True

    return result
Example #12
def local_learn_model(x_all, targets_all, config):
    """
    Trains a model. Handles special case of parallel models.

    Parameters
    ----------
    x_all : np.ndarray
        All covariate data, shape (n_samples, n_features), sorted using
        X, Y of target positions.
    targets_all : Targets
        All target data (observations, positions, fields), sorted using
        X, Y of target positions.
    config : :class:`~uncoverml.config.Config`
        Config object.

    Returns
    -------
    :class:`~uncoverml.model.Model`
        A trained Model.
    """
    mpiops.comm.barrier()
    model = None
    if config.target_weight_property:
        weights = targets_all.fields[config.target_weight_property]
    else:
        weights = None
    # Handle models that can be trained in parallel
    if config.multicubist or config.multirandomforest or config.bootstrap:
        y = targets_all.observations
        model = all_modelmaps[config.algorithm](**config.algorithm_args)
        apply_multiple_masked(model.fit, (x_all, y),
                              fields=targets_all.fields,
                              lon_lat=targets_all.positions,
                              sample_weight=weights)
        # Special case: for MRF we need to gather the forests from each
        # process and cache them in the model
        if config.multirandomforest:
            rf_dicts = model._randomforests
            rf_dicts = mpiops.comm.gather(rf_dicts, root=0)
            mpiops.comm.barrier()
            if mpiops.chunk_index == 0:
                for rf in rf_dicts:
                    model._randomforests.update(rf)
    # Single-threaded models
    else:
        if mpiops.chunk_index == 0:
            y = targets_all.observations
            model = all_modelmaps[config.algorithm](**config.algorithm_args)
            apply_multiple_masked(model.fit, (x_all, y),
                                  fields=targets_all.fields,
                                  lon_lat=targets_all.positions,
                                  sample_weight=weights)

    # Save transformed targets for diagnostics
    if mpiops.chunk_index == 0 and hasattr(model, 'target_transform'):
        hdr = 'nontransformed,transformed'
        y = targets_all.observations
        y_t = model.target_transform.transform(y)
        np.savetxt(config.transformed_targets_file,
                   X=np.column_stack((y, y_t)),
                   delimiter=',',
                   header=hdr,
                   fmt='%.4e')

        if config.plot_target_scaling:
            diagnostics.plot_target_scaling(
                config.transformed_targets_file).savefig(
                    config.plot_target_scaling)

    return model
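
A standalone sketch of the diagnostics dump at the end, with np.log standing in for model.target_transform.transform and a local file name (both assumptions for illustration):

import numpy as np

y = np.array([1.0, 2.0, 4.0])
y_t = np.log(y)  # hypothetical target transform
np.savetxt('transformed_targets.csv',
           X=np.column_stack((y, y_t)),
           delimiter=',',
           header='nontransformed,transformed',
           fmt='%.4e')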