Example 1
def out_of_sample_validation(model, targets, features, config):
    _logger.info(
        f"Performing out-of-sample validation with {targets.observations.shape[0]} targets...")
    mpiops.comm.barrier()
    # Non-root ranks reload the trained model from disk; the root's copy is
    # then broadcast so every rank predicts with an identical model.
    if mpiops.chunk_index != 0:
        with open(config.model_file, 'rb') as f:
            model, _, _ = pickle.load(f)
    model = mpiops.comm.bcast(model, root=0)
    classification = hasattr(model, 'predict_proba')
    # Split positions, extra target fields and features evenly across ranks.
    pos = np.array_split(targets.positions, mpiops.chunks)[mpiops.chunk_index]
    fields = {k: np.array_split(v, mpiops.chunks)[mpiops.chunk_index]
              for k, v in targets.fields.items()}
    features = np.array_split(features, mpiops.chunks)[mpiops.chunk_index]
    pred = predict.predict(features, model,
                           fields=fields,
                           lon_lat=pos)

    # Gather the per-rank prediction blocks on the root and score them there.
    pred = mpiops.comm.gather(pred, root=0)
    if mpiops.chunk_index == 0:
        pred = np.concatenate(pred)
        if classification:
            hard, p = pred[:, 0], pred[:, 1:]
            scores = classification_validation_scores(targets.observations, hard, p)
        else:
            scores = regression_validation_scores(targets.observations, pred, features.shape[1], model)

        _logger.info("Out of sample validation complete, scores:")
        for k, v in scores.items():
            _logger.info(f"{k}: {v}")

        result_tags = model.get_predict_tags()
        y_pred_dict = dict(zip(result_tags, pred.T))
        if hasattr(model, '_notransform_predict'):
            y_pred_dict['transformedpredict'] = \
                model.target_transform.transform(pred[:, 0])
        
        return OOSInfo(scores, targets.observations, y_pred_dict, classification, targets.positions)
    else:
        return None
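The body above is a plain scatter/predict/gather pattern: each MPI rank scores a contiguous slice of the targets and the root reassembles the blocks in rank order. Below is a minimal, self-contained sketch of that pattern, assuming mpi4py is available; fake_predict is a hypothetical stand-in for predict.predict.

import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank, size = comm.Get_rank(), comm.Get_size()

# Toy feature matrix known to all ranks; each rank takes its contiguous
# slice, exactly as np.array_split does in out_of_sample_validation.
features = np.arange(20, dtype=float).reshape(10, 2)
chunk = np.array_split(features, size)[rank]

def fake_predict(x):
    # Hypothetical stand-in for predict.predict: one prediction per row.
    return x.sum(axis=1, keepdims=True)

pred = fake_predict(chunk)

# The root gathers the per-rank blocks in rank order and stitches them back.
pred = comm.gather(pred, root=0)
if rank == 0:
    pred = np.concatenate(pred)
    assert pred.shape == (10, 1)  # one row per original sample

Run it with, for example, mpirun -n 4 python sketch.py. The gather preserves rank order, which is what makes the final concatenation line up with targets.observations.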
Example 2
def oos_validate(targets_all, x_all, model, config):
    lon_lat = targets_all.positions
    weights = targets_all.weights
    observations = targets_all.observations
    # Every rank computes predictions; config.quantiles sets the prediction interval.
    predictions = predict.predict(x_all, model, interval=config.quantiles, lon_lat=lon_lat)
    if mpiops.chunk_index == 0:
        tags = model.get_predict_tags()
        y_true = targets_all.observations
        # One column per prediction tag, then the true values and coordinates.
        to_text = [predictions, y_true[:, np.newaxis], lon_lat]

        true_vs_pred = Path(config.output_dir).joinpath(config.name + "_oos_validation.csv")
        cols = tags + ['y_true', 'lon', 'lat']
        np.savetxt(true_vs_pred, X=np.hstack(to_text), delimiter=',',
                   fmt='%.8e',
                   header=','.join(cols),
                   comments='')
        scores = regression_validation_scores(observations, predictions, weights, model)
        score_string = "OOS Validation Scores:\n"
        for metric, score in scores.items():
            score_string += "{}\t= {}\n".format(metric, score)

        geoio.output_json(scores, Path(config.output_dir).joinpath(config.name + "_oos_validation_scores.json"))
        log.info(score_string)
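The CSV written above has one column per prediction tag, followed by y_true, lon and lat. Here is a standalone sketch of the same np.hstack/np.savetxt assembly; the tag names are illustrative placeholders, not the actual output of model.get_predict_tags().

import numpy as np

rng = np.random.default_rng(0)
predictions = rng.random((5, 3))   # e.g. a mean column plus two quantile columns
y_true = rng.random(5)
lon_lat = rng.random((5, 2))

cols = ['Prediction', 'Lower_quantile', 'Upper_quantile', 'y_true', 'lon', 'lat']
table = np.hstack([predictions, y_true[:, np.newaxis], lon_lat])
# comments='' stops np.savetxt from prefixing the header row with '#'.
np.savetxt('oos_validation.csv', X=table, delimiter=',', fmt='%.8e',
           header=','.join(cols), comments='')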
Example 3
def local_crossval(x_all, targets_all, config):
    """ Performs K-fold cross validation to test the applicability of a model.
    Given a set of inputs and outputs, this function will evaluate the
    effectiveness of a model at predicting the targets, by splitting all of
    the known data. A model is trained on a subset of the total data, and then
    this model is used to predict all of the unseen targets, its performance
    can provide a benchmark to evaluate the effectiveness of a model.

    Parameters
    ----------
    x_all: numpy.array
        A 2D array containing all of the training inputs
    targets_all: numpy.array
        A 1D vector containing all of the training outputs
    config: dict
        The global config object, which is used to choose the model to train.

    Return
    ------
    result: dict
        A dictionary containing all of the cross validation metrics, evaluated
        on the unseen data subset.
    """
    parallel_model = config.multicubist or config.multirandomforest or config.bootstrap
    if config.bootstrap and config.parallel_validate:
        config.algorithm_args['parallel'] = False
    elif not config.bootstrap and not config.parallel_validate and mpiops.chunk_index != 0:
        return

    if config.multicubist or config.multirandomforest:
        config.algorithm_args['parallel'] = False

    _logger.info("Validating with {} folds".format(config.folds))
    model = modelmaps[config.algorithm](**config.algorithm_args)
    classification = hasattr(model, 'predict_proba')
    y = targets_all.observations
    lon_lat = targets_all.positions
    _, cv_indices = split_cfold(y.shape[0], config.folds, config.crossval_seed)

    # Split folds over workers
    fold_list = np.arange(config.folds)
    if config.parallel_validate:
        fold_node = \
            np.array_split(fold_list, mpiops.chunks)[mpiops.chunk_index]
    else:
        fold_node = fold_list

    y_pred = {}
    y_true = {}
    fold_scores = {}
    pos = {}

    # Train and score on each fold
    for fold in fold_node:
        _logger.info(":mpi:Training fold {} of {}".format(
            fold + 1, config.folds, mpiops.chunk_index))

        train_mask = cv_indices != fold
        test_mask = ~train_mask

        y_k_train = y[train_mask]
        if config.target_weight_property:
            y_k_weight = targets_all.fields[config.target_weight_property][train_mask]
        else:
            y_k_weight = None
        lon_lat_train = lon_lat[train_mask]
        lon_lat_test = lon_lat[test_mask]

        # Extra fields
        fields_train = {f: v[train_mask]
                        for f, v in targets_all.fields.items()}
        fields_pred = {f: v[test_mask] for f, v in targets_all.fields.items()}

        # Train on this fold
        x_train = x_all[train_mask]
        apply_multiple_masked(model.fit, data=(x_train, y_k_train), fields=fields_train,
                              lon_lat=lon_lat_train, 
                              sample_weight=y_k_weight)

        # Testing: without parallel validation, only the root rank scores folds.
        if not config.parallel_validate and mpiops.chunk_index != 0:
            continue
        else:
            y_k_pred = predict.predict(x_all[test_mask], model,
                                       fields=fields_pred,
                                       lon_lat=lon_lat_test)
            y_pred[fold] = y_k_pred
            n_covariates = x_all[test_mask].shape[1]

            # Regression
            if not classification:
                y_k_test = y[test_mask]
                fold_scores[fold] = regression_validation_scores(
                    y_k_test, y_k_pred, n_covariates, model)

            # Classification
            else:
                y_k_test = model.le.transform(y[test_mask])
                y_k_hard, p_k = y_k_pred[:, 0], y_k_pred[:, 1:]
                fold_scores[fold] = classification_validation_scores(
                    y_k_test, y_k_hard, p_k
                )
            
            y_true[fold] = y_k_test
            pos[fold] = lon_lat_test

    if config.parallel_validate:
        y_pred = _join_dicts(mpiops.comm.gather(y_pred, root=0))
        y_true = _join_dicts(mpiops.comm.gather(y_true, root=0))
        pos = _join_dicts(mpiops.comm.gather(pos, root=0))
        scores = _join_dicts(mpiops.comm.gather(fold_scores, root=0))
    else:
        scores = fold_scores

    result = None
    if mpiops.chunk_index == 0:
        y_true = np.concatenate([y_true[i] for i in range(config.folds)])
        y_pred = np.concatenate([y_pred[i] for i in range(config.folds)])
        pos = np.concatenate([pos[i] for i in range(config.folds)])
        valid_metrics = scores[0].keys()
        scores = {m: np.mean([d[m] for d in scores.values()], axis=0)
                  for m in valid_metrics}
        score_string = "Validation complete:\n"
        for metric, score in scores.items():
            score_string += "{}\t= {}\n".format(metric, score)
        _logger.info(score_string)

        result_tags = model.get_predict_tags()
        y_pred_dict = dict(zip(result_tags, y_pred.T))
        if hasattr(model, '_notransform_predict'):
            y_pred_dict['transformedpredict'] = \
                model.target_transform.transform(y_pred[:, 0])
        result = CrossvalInfo(scores, y_true, y_pred_dict, classification, pos)

    # Restore the model's own parallelism now that validation is done
    if parallel_model:
        config.algorithm_args['parallel'] = True

    return result
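The fold bookkeeping above hinges on split_cfold returning a per-sample fold label: cv_indices[i] names the fold in which sample i is held out, so cv_indices != fold is the training mask for that fold. A sketch of how such labels can be built, using a hypothetical stand-in rather than the project's split_cfold:

import numpy as np

def make_cv_indices(n_samples, folds, seed):
    # Hypothetical stand-in for split_cfold's per-sample fold labels.
    rng = np.random.RandomState(seed)
    shuffled = rng.permutation(n_samples)
    cv_indices = np.empty(n_samples, dtype=int)
    for fold, test_idx in enumerate(np.array_split(shuffled, folds)):
        cv_indices[test_idx] = fold
    return cv_indices

cv_indices = make_cv_indices(n_samples=10, folds=5, seed=1)
fold = 0
train_mask = cv_indices != fold   # samples used to fit the model
test_mask = ~train_mask           # exactly the samples held out in fold 0
assert train_mask.sum() + test_mask.sum() == 10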
Example 4
def local_crossval(x_all, targets_all: targ.Targets, config: Config):
    """ Performs K-fold cross validation to test the applicability of a model.
    Given a set of inputs and outputs, this function will evaluate the
    effectiveness of a model at predicting the targets, by splitting all of
    the known data. A model is trained on a subset of the total data, and then
    this model is used to predict all of the unseen targets, its performance
    can provide a benchmark to evaluate the effectiveness of a model.

    Parameters
    ----------
    x_all: numpy.array
        A 2D array containing all of the training inputs
    targets_all: numpy.array
        A 1D vector containing all of the training outputs
    config: dict
        The global config object, which is used to choose the model to train.

    Return
    ------
    result: dict
        A dictionary containing all of the cross validation metrics, evaluated
        on the unseen data subset.
    """
    # run cross validation in parallel, but one thread for each fold
    if config.multicubist or config.multirandomforest:
        config.algorithm_args['parallel'] = False

    if (mpiops.chunk_index != 0) and (not config.parallel_validate):
        return

    log.info("Validating with {} folds".format(config.folds))
    model = modelmaps[config.algorithm](**config.algorithm_args)
    classification = hasattr(model, 'predict_proba')
    groups = targets_all.groups

    if config.group_targets and len(np.unique(groups)) < config.folds:
        raise ValueError(f"Cannot continue cross-validation with the chosen params as the number "
                         f"of groups {len(np.unique(groups))} in the data is less than the number "
                         f"of folds {config.folds}")
    random_state = config.algorithm_args.get('random_state', np.random.randint(1000))
    x_all, y, lon_lat, groups, w, cv = setup_validation_data(x_all, targets_all, config.folds, random_state)
    _, cv_indices = split_gfold(groups, cv)

    # Split folds over workers
    fold_list = np.arange(config.folds)
    if config.parallel_validate:
        fold_node = np.array_split(fold_list, mpiops.chunks)[mpiops.chunk_index]
    else:
        fold_node = fold_list

    y_pred = {}
    y_true = {}
    weight = {}
    lon_lat_ = {}
    fold_scores = {}
    # Train and score on each fold
    for fold in fold_node:
        model = modelmaps[config.algorithm](**config.algorithm_args)

        print("Training fold {} of {} using process {}".format(
            fold + 1, config.folds, mpiops.chunk_index))
        train_mask = cv_indices != fold
        test_mask = ~train_mask

        y_k_train = y[train_mask]
        w_k_train = w[train_mask]
        lon_lat_train = lon_lat[train_mask, :]
        lon_lat_test = lon_lat[test_mask, :]

        # Extra fields
        fields_train = {f: v[train_mask]
                        for f, v in targets_all.fields.items()}
        fields_pred = {f: v[test_mask] for f, v in targets_all.fields.items()}

        # Train on this fold
        x_train = x_all[train_mask]
        apply_multiple_masked(model.fit, data=(x_train, y_k_train),
                              fields=fields_train,
                              sample_weight=w_k_train,
                              lon_lat=lon_lat_train)

        # Testing
        y_k_pred = predict.predict(x_all[test_mask], model,
                                   fields=fields_pred,
                                   lon_lat=lon_lat_test)

        y_pred[fold] = y_k_pred

        # Bookkeeping shared by both scoring paths
        w_k_test = w[test_mask]
        weight[fold] = w_k_test
        lon_lat_[fold] = lon_lat_test

        # Regression
        if not classification:
            y_k_test = y[test_mask]
            fold_scores[fold] = regression_validation_scores(y_k_test, y_k_pred, w_k_test, model)

        # Classification
        else:
            y_k_test = model.le.transform(y[test_mask])
            y_k_hard, p_k = y_k_pred[:, 0], y_k_pred[:, 1:]
            fold_scores[fold] = classification_validation_scores(y_k_test, y_k_hard, w_k_test, p_k)

        y_true[fold] = y_k_test

    if config.parallel_validate:
        y_pred = _join_dicts(mpiops.comm.gather(y_pred, root=0))
        lon_lat_ = _join_dicts(mpiops.comm.gather(lon_lat_, root=0))
        y_true = _join_dicts(mpiops.comm.gather(y_true, root=0))
        weight = _join_dicts(mpiops.comm.gather(weight, root=0))
        scores = _join_dicts(mpiops.comm.gather(fold_scores, root=0))
    else:
        scores = fold_scores

    result = None
    if mpiops.chunk_index == 0:
        y_true = np.concatenate([y_true[i] for i in range(config.folds)])
        weight = np.concatenate([weight[i] for i in range(config.folds)])
        lon_lat = np.concatenate([lon_lat_[i] for i in range(config.folds)])
        y_pred = np.concatenate([y_pred[i] for i in range(config.folds)])
        valid_metrics = scores[0].keys()
        scores = {m: np.mean([d[m] for d in scores.values()], axis=0)
                  for m in valid_metrics}
        score_string = "Validation complete:\n"
        for metric, score in scores.items():
            score_string += "{}\t= {}\n".format(metric, score)
        log.info(score_string)

        result_tags = model.get_predict_tags()
        y_pred_dict = dict(zip(result_tags, y_pred.T))
        if hasattr(model, '_notransform_predict'):
            y_pred_dict['transformedpredict'] = \
                model.target_transform.transform(y_pred[:, 0])
        result = CrossvalInfo(scores, y_true, y_pred_dict, weight, lon_lat, classification)

    # change back to parallel
    if config.multicubist or config.multirandomforest:
        config.algorithm_args['parallel'] = True

    return result
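This variant swaps split_cfold for split_gfold, which keeps every sample sharing a group id inside the same fold (hence the up-front check that there are at least as many groups as folds). The same mask construction can be sketched with scikit-learn's GroupKFold, shown here as an analogy rather than the project's own splitter:

import numpy as np
from sklearn.model_selection import GroupKFold

groups = np.array([0, 0, 1, 1, 2, 2, 3, 3])  # samples sharing an id stay together
X = np.zeros((len(groups), 1))               # features are irrelevant to the split

cv_indices = np.empty(len(groups), dtype=int)
for fold, (_, test_idx) in enumerate(GroupKFold(n_splits=4).split(X, groups=groups)):
    cv_indices[test_idx] = fold

# No group is ever split across folds:
for g in np.unique(groups):
    assert len(np.unique(cv_indices[groups == g])) == 1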
Example 5
def local_crossval(x_all, targets_all, config):
    """ Performs K-fold cross validation to test the applicability of a model.
    Given a set of inputs and outputs, this function will evaluate the
    effectiveness of a model at predicting the targets, by splitting all of
    the known data. A model is trained on a subset of the total data, and then
    this model is used to predict all of the unseen targets, its performance
    can provide a benchmark to evaluate the effectiveness of a model.

    Parameters
    ----------
    x_all: numpy.array
        A 2D array containing all of the training inputs
    targets_all: numpy.array
        A 1D vector containing all of the training outputs
    config: dict
        The global config object, which is used to choose the model to train.

    Return
    ------
    result: dict
        A dictionary containing all of the cross validation metrics, evaluated
        on the unseen data subset.
    """
    # run cross validation in parallel, but one thread for each fold
    if config.multicubist or config.multirandomforest:
        config.algorithm_args['parallel'] = False

    if (mpiops.chunk_index != 0) and (not config.parallel_validate):
        return

    log.info("Validating with {} folds".format(config.folds))
    model = modelmaps[config.algorithm](**config.algorithm_args)
    y = targets_all.observations
    lon_lat = targets_all.positions
    _, cv_indices = split_cfold(y.shape[0], config.folds, config.crossval_seed)

    # Split folds over workers
    fold_list = np.arange(config.folds)
    if config.parallel_validate:
        fold_node = np.array_split(fold_list,
                                   mpiops.chunks)[mpiops.chunk_index]
    else:
        fold_node = fold_list

    y_pred = {}
    y_true = {}
    fold_scores = {}

    # Train and score on each fold
    for fold in fold_node:

        print("Training fold {} of {} using process {}".format(
            fold + 1, config.folds, mpiops.chunk_index))
        train_mask = cv_indices != fold
        test_mask = ~train_mask

        y_k_train = y[train_mask]
        lon_lat_train = lon_lat[train_mask]
        lon_lat_test = lon_lat[test_mask]

        # Extra fields
        fields_train = {
            f: v[train_mask]
            for f, v in targets_all.fields.items()
        }
        fields_pred = {f: v[test_mask] for f, v in targets_all.fields.items()}

        # Train on this fold
        apply_multiple_masked(model.fit,
                              data=(x_all[train_mask], y_k_train),
                              kwargs={
                                  'fields': fields_train,
                                  'lon_lat': lon_lat_train
                              })

        # Testing
        y_k_pred = predict.predict(x_all[test_mask],
                                   model,
                                   fields=fields_pred,
                                   lon_lat=lon_lat_test)

        y_k_test = y[test_mask]
        y_pred[fold] = y_k_pred
        y_true[fold] = y_k_test

        fold_scores[fold] = calculate_validation_scores(
            y_k_test, y_k_train, y_k_pred)

    if config.parallel_validate:
        y_pred = _join_dicts(mpiops.comm.gather(y_pred, root=0))
        y_true = _join_dicts(mpiops.comm.gather(y_true, root=0))
        scores = _join_dicts(mpiops.comm.gather(fold_scores, root=0))
    else:
        scores = fold_scores

    result = None
    if mpiops.chunk_index == 0:
        y_true = np.concatenate([y_true[i] for i in range(config.folds)])
        y_pred = np.concatenate([y_pred[i] for i in range(config.folds)])
        valid_metrics = scores[0].keys()
        scores = {
            m: np.mean([d[m] for d in scores.values()])
            for m in valid_metrics
        }
        score_string = "Validation complete:\n"
        for metric, score in scores.items():
            score_string += "{}\t= {}\n".format(metric, score)
        log.info(score_string)

        result_tags = model.get_predict_tags()
        y_pred_dict = dict(zip(result_tags, y_pred.T))
        result = CrossvalInfo(scores, y_true, y_pred_dict)

    # change back to parallel
    if config.multicubist or config.multirandomforest:
        config.algorithm_args['parallel'] = True

    return result
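On the root rank, _join_dicts merges the {fold: value} dictionaries gathered from every rank, and the per-fold score dictionaries are then averaged metric by metric. A tiny sketch of that aggregation, with _join_dicts assumed to behave as a flattening merge (comm.gather returns None on non-root ranks, so that case is passed through):

import numpy as np

def _join_dicts(dicts):
    # Assumed behaviour: None on non-root ranks, otherwise flatten the
    # per-rank {fold: value} dicts into a single dict.
    if dicts is None:
        return None
    return {k: v for d in dicts for k, v in d.items()}

# As if two ranks each scored one fold and the root gathered the results.
gathered = [{0: {'r2': 0.8, 'mse': 1.2}}, {1: {'r2': 0.6, 'mse': 1.6}}]
scores = _join_dicts(gathered)

valid_metrics = scores[0].keys()
scores = {m: np.mean([d[m] for d in scores.values()]) for m in valid_metrics}
print(scores)  # approximately {'r2': 0.7, 'mse': 1.4}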