Example #1
0
def train(config,
          predictor_file,
          estimator_file=None,
          no_obs=False,
          no_models=False,
          test_size=0):
    """
    Build an estimator from the configuration, fit it on the training data, and pickle the fitted object to disk so
    that it can be loaded later for predictions.

    :param config: configuration mapping; config['Model'] is read for the optional 'Rain tuning' section, and
        config['MOSX_ROOT'] / config['station_id'] are read when no estimator_file is given
    :param predictor_file: str: full path to saved file of predictor data
    :param estimator_file: str: full path to output model file; if None, '<MOSX_ROOT>/<station_id>_mosx.pkl' is used
    :param no_obs: bool: forwarded to build_train_data (train without OBS data)
    :param no_models: bool: forwarded to build_train_data (train without BUFR data)
    :param test_size: int: if > 0, forwarded to build_train_data so that a test subset is held out
    :return: (p_test, t_test, r_test) when test_size > 0, otherwise None
    """
    estimator = build_estimator(config)
    rain_tuning = config['Model'].get('Rain tuning', None)

    # Only forward test_size when a hold-out set was requested; the helper then returns six items instead of three.
    hold_out = test_size > 0
    data_kwargs = {'no_obs': no_obs, 'no_models': no_models}
    if hold_out:
        data_kwargs['test_size'] = test_size
    train_data = build_train_data(config, predictor_file, **data_kwargs)
    if hold_out:
        p_train, t_train, r_train, p_test, t_test, r_test = train_data
    else:
        p_train, t_train, r_train = train_data

    print('train: training the estimator')
    # The raw rain array is handed to fit() only when the rain tuning section asks for it.
    fit_kwargs = {}
    if rain_tuning is not None and to_bool(
            rain_tuning.get('use_raw_rain', False)):
        fit_kwargs['rain_array'] = r_train
    estimator.fit(p_train, t_train, **fit_kwargs)

    if estimator_file is None:
        default_parts = (config['MOSX_ROOT'], config['station_id'])
        estimator_file = '%s/%s_mosx.pkl' % default_parts
    print('train: -> exporting to %s' % estimator_file)
    with open(estimator_file, 'wb') as out_handle:
        pickle.dump(estimator, out_handle, protocol=pickle.HIGHEST_PROTOCOL)

    return (p_test, t_test, r_test) if hold_out else None
Example #2
0
def predict_rain_proba(config, predictor_file):
    """
    Predict probabilistic rain forecasts for 'pop' or 'categorical' types.

    Every configured estimator file is loaded in turn and evaluated on the same predictor matrix (the 'BUFKIT' and
    'OBS' entries of the predictor data concatenated along axis 1).

    NOTE(review): the value returned is the result for the LAST station only; results for earlier stations are
    overwritten inside the loop. The return type is left unchanged here so existing callers keep working -- confirm
    whether multi-station callers actually need one result per station.

    :param config: configuration mapping; reads config['Model'], config['multi_stations'], config['station_id'] and
        config['verbose']
    :param predictor_file: str: file containing predictor data from mosx.model.format_predictors
    :return: output of the last estimator's predict_rain_proba method
    :raises TypeError: if the rain forecast type is not 'pop' or 'categorical', or if there is no 'Rain tuning' section
    :raises ValueError: if the numbers of estimator files and station IDs differ, or if no estimator file is configured
    """
    if config['Model']['rain_forecast_type'] not in ['pop', 'categorical']:
        raise TypeError(
            "'quantity' rain forecast is not probabilistic, cannot get probabilities"
        )
    rain_tuning = config['Model'].get('Rain tuning', None)
    if rain_tuning is None:
        raise TypeError(
            'Probabilistic rain forecasts are only possible with a RainTuningEstimator'
        )

    if config['multi_stations']:  # multiple stations
        station_ids = config['station_id']
        estimator_files = config['Model']['estimator_file']
        # There has to be exactly one estimator file per station ID
        if len(estimator_files) != len(station_ids):
            raise ValueError(
                "There must be the same number of estimator files as station IDs"
            )
    else:  # just one station
        estimator_files = [config['Model']['estimator_file']]

    # Fail early with a clear message instead of an UnboundLocalError at the return statement.
    if len(estimator_files) == 0:
        raise ValueError(
            'No estimator files are configured, cannot get probabilities')

    # Load the predictor data; the predictor matrix and the raw-rain switch are the same for every station, so they
    # are computed once rather than on each loop iteration.
    predictor_data = read_pkl(predictor_file)
    predictors = np.concatenate(
        (predictor_data['BUFKIT'], predictor_data['OBS']), axis=1)
    use_raw_rain = to_bool(rain_tuning.get('use_raw_rain', False))

    rain_proba = None
    for i, estimator_file in enumerate(estimator_files):
        if config['verbose']:
            print('predict: loading estimator %s' % estimator_file)
        estimator = read_pkl(estimator_file)

        if use_raw_rain:
            # The rain data is indexed by station position.
            rain_proba = estimator.predict_rain_proba(
                predictors, rain_array=predictor_data.rain[i])
        else:
            rain_proba = estimator.predict_rain_proba(predictors)

    return rain_proba
Example #3
0
def train(config, predictor_file, no_obs=False, no_models=False, test_size=0, overwrite=False):
    """
    Fit one estimator per configured station and pickle each of them to its estimator file. A station whose
    estimator file already exists is skipped unless overwrite is set.

    :param config: configuration mapping; reads config['multi_stations'], config['station_id'] and config['Model']
    :param predictor_file: str: full path to saved file of predictor data
    :param no_obs: bool: forwarded to build_train_data (train without OBS data)
    :param no_models: bool: forwarded to build_train_data (train without BUFR data)
    :param test_size: int: if > 0, forwarded to build_train_data so that a test subset is held out
    :param overwrite: bool: if True, retrains estimators and overwrites estimator files even if they already exist
    :return: (p_test, t_test, r_test) as produced by build_train_data when test_size > 0, otherwise None
    :raises ValueError: in multi-station mode, if the numbers of estimator files and station IDs differ
    """
    multi_stations = config['multi_stations']
    station_ids = config['station_id']
    estimator_files = config['Model']['estimator_file']
    if multi_stations:
        # One estimator file is required for every station ID
        if len(estimator_files) != len(station_ids):
            raise ValueError("There must be the same number of estimator files as station IDs")
    else:
        # A single station is handled as a one-element list
        station_ids = [station_ids]
        estimator_files = [estimator_files]

    # The training data is built once and then indexed per station below.
    hold_out = test_size > 0
    data_kwargs = dict(no_obs=no_obs, no_models=no_models)
    if hold_out:
        data_kwargs['test_size'] = test_size
    train_data = build_train_data(config, predictor_file, **data_kwargs)
    if hold_out:
        p_train, t_train, r_train, p_test, t_test, r_test = train_data
    else:
        p_train, t_train, r_train = train_data

    for idx, (station, out_path) in enumerate(zip(station_ids, estimator_files)):
        # Leave an existing estimator file alone unless overwriting was requested
        if not overwrite and os.path.exists(out_path):
            continue

        estimator = build_estimator(config)
        rain_tuning = config['Model'].get('Rain tuning', None)

        print('train: training the estimator for %s' % station)
        fit_kwargs = {}
        if rain_tuning is not None and to_bool(rain_tuning.get('use_raw_rain', False)):
            fit_kwargs['rain_array'] = r_train[idx]
        estimator.fit(p_train[idx], t_train[idx], **fit_kwargs)

        print('train: -> exporting to %s' % out_path)
        with open(out_path, 'wb') as out_handle:
            pickle.dump(estimator, out_handle, protocol=2)

    return (p_test, t_test, r_test) if hold_out else None
Example #4
0
def predict_rain_proba(config, predictor_file):
    """
    Return the rain probabilities from the pickled estimator named in config['Model']['estimator_file'].

    The predictor matrix is the 'BUFKIT' and 'OBS' entries of the unpickled predictor data concatenated along axis 1.

    :param config: configuration mapping; reads config['Model'] and config['verbose']
    :param predictor_file: str: file containing predictor data from mosx.model.format_predictors
    :return: output of the estimator's predict_rain_proba method
    :raises TypeError: if the rain forecast type is not 'pop' or 'categorical', or if there is no 'Rain tuning' section
    """
    model_config = config['Model']
    if model_config['rain_forecast_type'] not in ('pop', 'categorical'):
        raise TypeError(
            "'quantity' rain forecast is not probabilistic, cannot get probabilities"
        )
    rain_tuning = model_config.get('Rain tuning', None)
    if rain_tuning is None:
        raise TypeError(
            'Probabilistic rain forecasts are only possible with a RainTuningEstimator'
        )

    def _unpickle(path):
        # Read one pickled object from 'path'.
        with open(path, 'rb') as source:
            return pickle.load(source)

    # Predictor data first, then the estimator
    predictor_data = _unpickle(predictor_file)
    estimator_path = config['Model']['estimator_file']
    if config['verbose']:
        print('predict: loading estimator %s' % estimator_path)
    estimator = _unpickle(config['Model']['estimator_file'])

    feature_blocks = (predictor_data['BUFKIT'], predictor_data['OBS'])
    predictors = np.concatenate(feature_blocks, axis=1)

    # The raw rain array is passed along only when the rain tuning section asks for it.
    extra = {}
    if to_bool(rain_tuning.get('use_raw_rain', False)):
        extra['rain_array'] = predictor_data.rain
    return estimator.predict_rain_proba(predictors, **extra)
Example #5
0
def predict_all(config,
                predictor_file,
                ensemble=False,
                time_series_date=None,
                naive_rain_correction=False,
                round_result=False,
                **kwargs):
    """
    Predict forecasts from the estimator(s) in config. Also return per-tree predictions and time series.

    One estimator is loaded per station; a single station is handled as a one-element list, so every returned
    container is indexed by station position.

    :param config: configuration mapping; reads config['Model'], config['multi_stations'], config['station_id'],
        config['verbose'] and (for time series) config['time_series_interval']
    :param predictor_file: str: file containing predictor data from mosx.model.format_predictors
    :param ensemble: bool: if True, also return the predictions of each individual tree in the estimator
    :param time_series_date: datetime: day the time series forecast is for; if None and a time series is requested,
        tomorrow (UTC) is used and a warning is printed
    :param naive_rain_correction: bool: if True, applies manual tuning to the rain forecast
    :param round_result: bool: if True, rounds the predicted estimate
    :param kwargs: passed to the estimator's 'predict' (or 'predict_proba') method
    :return:
    predicted: list with one prediction array per station
    all_predicted: list with one num_samples x 4 x num_trees array per station if ensemble is True, otherwise None
    predicted_timeseries: list with one DataFrame per station built from the final sample; empty if
    config['Model']['predict_timeseries'] is false
    :raises ValueError: in multi-station mode, if the numbers of estimator files and station IDs differ
    """
    # Load the predictor data and estimator
    predictor_data = read_pkl(predictor_file)
    rain_tuning = config['Model'].get('Rain tuning', None)
    if config['multi_stations']:  # multiple stations
        station_ids = config['station_id']
        estimator_files = config['Model']['estimator_file']
        # There has to be the same number of estimator files as station IDs
        if len(estimator_files) != len(station_ids):
            raise ValueError(
                "There must be the same number of estimator files as station IDs"
            )
    else:
        station_ids = [config['station_id']]  # just one station
        estimator_files = [config['Model']['estimator_file']]

    # Shared by every station; must not be rebound inside the loop below.
    predictors = np.concatenate(
        (predictor_data['BUFKIT'], predictor_data['OBS']), axis=1)

    predicted = []
    all_predicted = []
    predicted_timeseries = []
    for i in range(len(station_ids)):
        estimator_file = estimator_files[i]
        estimator = read_pkl(estimator_file)
        if config['verbose']:
            print('predict: loading estimator %s' % estimator_file)
        if config['Model']['rain_forecast_type'] == 'pop' and getattr(
                estimator, 'is_classifier', False):
            predict_method = estimator.predict_proba
        else:
            predict_method = estimator.predict
        if rain_tuning is not None and to_bool(
                rain_tuning.get('use_raw_rain', False)):
            predicted_one = predict_method(predictors,
                                           rain_array=predictor_data.rain[i],
                                           **kwargs)
        else:
            predicted_one = predict_method(predictors, **kwargs)
        precip = predictor_data.rain[i]

        # Check for precipitation override (column 3 of the prediction is the one being corrected)
        if naive_rain_correction:
            for day in range(predicted_one.shape[0]):
                if sum(precip[day]) < 0.01:
                    if config['verbose']:
                        print(
                            'predict: warning: overriding MOS-X rain prediction of %0.2f on day %s with 0'
                            % (predicted_one[day, 3], day))
                    predicted_one[day, 3] = 0.
                elif predicted_one[day, 3] > max(
                        precip[day]) or predicted_one[day, 3] < min(
                            precip[day]):
                    if config['verbose']:
                        print(
                            'predict: warning: overriding MOS-X prediction of %0.2f on day %s with model mean'
                            % (predicted_one[day, 3], day))
                    # NOTE(review): 'precip[day] + [value]' appends when precip[day] is a list but adds
                    # element-wise when it is an ndarray -- confirm the stored type matches the intended mean.
                    predicted_one[day, 3] = max(
                        0., np.mean(precip[day] + [predicted_one[day, 3]]))
        else:
            # At least make sure we aren't predicting negative values...
            predicted_one[:, 3][predicted_one[:, 3] < 0] = 0.0

        # Round off daily values, if selected
        if round_result:
            predicted_one[:, :3] = np.round(predicted_one[:, :3])
            predicted_one[:, 3] = np.round(predicted_one[:, 3], 2)

        # If probabilities are requested and available, get the results from each tree
        if ensemble:
            num_samples = predictors.shape[0]
            if not hasattr(estimator, 'named_steps'):
                forest = estimator
                tree_predictors = predictors
            else:
                imputer = estimator.named_steps['imputer']
                forest = estimator.named_steps['regressor']
                # Keep the imputed matrix local to this station: rebinding 'predictors' here would feed
                # already-imputed data to every following station's estimator.
                tree_predictors = imputer.transform(predictors)
            # If we generated our own ensemble by bootstrapping, it must be treated as such
            if config['Model']['train_individual'] and config['Model'].get(
                    'Bootstrapping', None) is None:
                num_trees = len(forest.estimators_[0].estimators_)
                all_predicted_one = np.zeros((num_samples, 4, num_trees))
                for v in range(4):
                    for t in range(num_trees):
                        try:
                            all_predicted_one[:, v, t] = forest.estimators_[
                                v].estimators_[t].predict(tree_predictors)
                        except AttributeError:
                            # Work around the 2-D array of estimators for GBTrees
                            all_predicted_one[:, v, t] = forest.estimators_[
                                v].estimators_[t][0].predict(tree_predictors)
            else:
                num_trees = len(forest.estimators_)
                all_predicted_one = np.zeros((num_samples, 4, num_trees))
                for t in range(num_trees):
                    try:
                        all_predicted_one[:, :,
                                          t] = forest.estimators_[t].predict(
                                              tree_predictors)[:, :4]
                    except AttributeError:
                        # Work around the 2-D array of estimators for GBTrees
                        all_predicted_one[:, :, t] = forest.estimators_[t][
                            0].predict(tree_predictors)[:, :4]
            all_predicted.append(all_predicted_one)
        else:
            all_predicted = None

        if config['Model']['predict_timeseries']:
            if time_series_date is None:
                # Default to tomorrow at 00 UTC; once set, the same date is reused for the remaining stations.
                date_now = datetime.utcnow()
                time_series_date = datetime(date_now.year, date_now.month,
                                            date_now.day) + timedelta(days=1)
                print(
                    'predict: warning: set time series start date to %s (was unspecified)'
                    % time_series_date)
            num_hours = int(24 / config['time_series_interval']) + 1
            # Everything after the first four columns of the final sample is unpacked into four series of
            # num_hours values each, one column per variable.
            predicted_array = predicted_one[-1, 4:].reshape((4, num_hours)).T

            # Get dewpoint: column 1 is replaced by the dewpoint helper's output for columns 0 and 1
            predicted_array[:, 1] = dewpoint(predicted_array[:, 0],
                                             predicted_array[:, 1])
            times = pd.date_range(
                time_series_date.replace(hour=6),
                periods=num_hours,
                freq='%dH' %
                config['time_series_interval']).to_pydatetime().tolist()
            variables = ['temperature', 'dewpoint', 'windSpeed', 'rain']
            round_dict = {
                'temperature': 0,
                'dewpoint': 0,
                'windSpeed': 0,
                'rain': 2
            }
            predicted_timeseries_one = pd.DataFrame(predicted_array,
                                                    index=times,
                                                    columns=variables)
            predicted_timeseries_one = predicted_timeseries_one.round(
                round_dict)
            predicted_timeseries.append(predicted_timeseries_one)
        predicted.append(predicted_one)

    return predicted, all_predicted, predicted_timeseries
Example #6
0
def build_train_data(config,
                     predictor_file,
                     no_obs=False,
                     no_models=False,
                     test_size=0):
    """
    Build the per-station arrays of training (and optionally testing) data.

    The same predictor matrix is used for every station, while targets (and raw rain data, when the 'Rain tuning'
    section enables 'use_raw_rain') are taken per station. When test_size > 0, a single train/test split is made on
    the station-combined data so that every station shares the same sample partition.

    :param config: configuration mapping; reads config['multi_stations'], config['station_id'], config['verbose']
        and config['Model']
    :param predictor_file: file holding the predictor data, read with read_pkl
    :param no_obs: bool: if True, only the 'BUFKIT' predictors are used (ignored if no_models is also True)
    :param no_models: bool: if True, only the 'OBS' predictors are used (ignored if no_obs is also True)
    :param test_size: if > 0, passed to train_test_split as its test_size
    :return: if test_size > 0: (p_train, t_train, r_train, p_test, t_test, r_test); otherwise
        (predictors, targets, rain). The predictor and target items are lists with one entry per station. The rain
        items are None unless raw rain is in use; r_train/r_test are then per-station lists and 'rain' is data.rain.
    """
    from sklearn.model_selection import train_test_split
    if config['multi_stations']:  # multiple stations
        station_ids = config['station_id']
    else:
        station_ids = [config['station_id']]  # just one station
    num_stations = len(station_ids)
    if config['verbose']:
        print('build_train_data: reading predictor file')
    rain_tuning = config['Model'].get('Rain tuning', None)
    use_raw_rain = rain_tuning is not None and to_bool(
        rain_tuning.get('use_raw_rain', False))
    data = read_pkl(predictor_file)

    # Select data; asking to drop both sources is treated as dropping neither
    if no_obs and no_models:
        no_obs = False
        no_models = False
    if no_obs:
        if config['verbose']:
            print('build_train_data: not using observations to train')
        predictors = data['BUFKIT']
    elif no_models:
        if config['verbose']:
            print('build_train_data: not using models to train')
        predictors = data['OBS']
    else:
        predictors = np.concatenate((data['BUFKIT'], data['OBS']), axis=1)

    if test_size > 0:
        verif = data['VERIF']
        pred_len = len(predictors[0])
        targets_len = len(verif[0][0])
        num_days = len(verif[0])

        # Lay the stations' targets side by side for each day so that one split covers all stations
        targets_combined = np.array([
            np.concatenate([verif[s][day] for s in range(len(verif))])
            for day in range(num_days)
        ])

        split_input = predictors
        if use_raw_rain:
            # Carry the stations' rain values through the split as extra predictor columns
            rain_len = len(data.rain[0][0])
            rain_combined = np.array([
                np.concatenate([data.rain[s][day] for s in range(len(verif))])
                for day in range(num_days)
            ])
            split_input = np.concatenate((predictors, rain_combined), axis=1)

        p_train_raw, p_test_raw, t_train_raw, t_test_raw = train_test_split(
            split_input, targets_combined, test_size=test_size)

        p_train, t_train, p_test, t_test = [], [], [], []
        r_train = [] if use_raw_rain else None
        r_test = [] if use_raw_rain else None
        for i in range(num_stations):
            if use_raw_rain:
                # Columns [0, pred_len) are the predictors; station i's rain columns follow at a fixed offset
                rain_lo = pred_len + i * rain_len
                rain_hi = rain_lo + rain_len
                p_train.append(np.array([row[:pred_len] for row in p_train_raw]))
                p_test.append(np.array([row[:pred_len] for row in p_test_raw]))
                r_train.append(np.array([row[rain_lo:rain_hi] for row in p_train_raw]))
                r_test.append(np.array([row[rain_lo:rain_hi] for row in p_test_raw]))
            else:
                p_train.append(np.copy(p_train_raw))
                p_test.append(np.copy(p_test_raw))
            # Station i's targets occupy a contiguous slice of the combined target row
            target_lo = i * targets_len
            target_hi = target_lo + targets_len
            t_train.append(np.array([row[target_lo:target_hi] for row in t_train_raw]))
            t_test.append(np.array([row[target_lo:target_hi] for row in t_test_raw]))
        return p_train, t_train, r_train, p_test, t_test, r_test

    predictors_out = []
    targets = []
    for i in range(num_stations):
        targets.append(data['VERIF'][i])
        if use_raw_rain:
            # NOTE(review): the transpose yields one rain column per sample only if data.rain[i] is 1-D, whereas
            # the test_size > 0 path indexes data.rain[i][day] as a sequence -- TODO confirm the expected shape.
            rain_data_one = np.array([data.rain[i]]).T
            predictors_out.append(
                np.concatenate((predictors, rain_data_one), axis=1))
        else:
            predictors_out.append(np.copy(predictors))
    return predictors_out, targets, (data.rain if use_raw_rain else None)
Example #7
0
def build_train_data(config,
                     predictor_file,
                     no_obs=False,
                     no_models=False,
                     test_size=0):
    """
    Assemble the predictor and target arrays used for training, optionally split into train and test subsets.

    :param config: configuration mapping; reads config['verbose'] and config['Model']
    :param predictor_file: path of the pickled predictor data
    :param no_obs: bool: if True, only the 'BUFKIT' predictors are used (ignored if no_models is also True)
    :param no_models: bool: if True, only the 'OBS' predictors are used (ignored if no_obs is also True)
    :param test_size: if > 0, passed to train_test_split as its test_size
    :return: (p_train, t_train, r_train, p_test, t_test, r_test) if test_size > 0, otherwise
        (predictors, targets, rain); the rain items are None unless the 'Rain tuning' section enables 'use_raw_rain'
    """
    from sklearn.model_selection import train_test_split

    verbose = config['verbose']
    if verbose:
        print('build_train_data: reading predictor file')
    rain_tuning = config['Model'].get('Rain tuning', None)
    with open(predictor_file, 'rb') as source:
        data = pickle.load(source)

    # Select data; asking to drop both sources is treated as dropping neither
    models_only = no_obs and not no_models
    obs_only = no_models and not no_obs
    if models_only:
        if verbose:
            print('build_train_data: not using observations to train')
        predictors = data['BUFKIT']
    elif obs_only:
        if verbose:
            print('build_train_data: not using models to train')
        predictors = data['OBS']
    else:
        feature_blocks = (data['BUFKIT'], data['OBS'])
        predictors = np.concatenate(feature_blocks, axis=1)

    use_raw_rain = rain_tuning is not None and to_bool(
        rain_tuning.get('use_raw_rain', False))
    if use_raw_rain:
        # Rain values ride along as trailing predictor columns
        predictors = np.concatenate((predictors, data.rain), axis=1)
        rain_shape = data.rain.shape[-1]
    targets = data['VERIF']

    # Without a hold-out set, hand back the full arrays
    if not (test_size > 0):
        return predictors, targets, (data.rain if use_raw_rain else None)

    p_train, p_test, t_train, t_test = train_test_split(
        predictors, targets, test_size=test_size)
    r_train = r_test = None
    if use_raw_rain:
        # Peel the trailing rain columns back off each subset after the split
        r_train, p_train = p_train[:, -rain_shape:], p_train[:, :-rain_shape]
        r_test, p_test = p_test[:, -rain_shape:], p_test[:, :-rain_shape]
    return p_train, t_train, r_train, p_test, t_test, r_test