Example #1
def postprocess(data, new=False, ref_period=True):
    """
    Combine all the postprocessing functions in one routine.

    :param data: The xarray DataArray that should be processed.
    :param new: Compute the statistics again (default: False).
    :param ref_period: Value assigned to the module-level reference_period
    flag (default: True).
    """
    small_print_header(f"Process {data.name} from {data.dataset}")
    toProcessedDir(data, new)
    # TODO: pass the reference period explicitly instead of via this global
    global reference_period
    reference_period = ref_period

    saveAnomaly(data, new)
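
A minimal usage sketch (assuming the module's helpers small_print_header,
toProcessedDir and saveAnomaly are importable alongside postprocess; the
DataArray and its dataset attribute are made up for illustration):

import numpy as np
import pandas as pd
import xarray as xr

# Hypothetical input: a named DataArray with a custom `dataset` attribute,
# which the header message above reads via attribute access.
data = xr.DataArray(np.random.randn(24),
                    coords={'time': pd.date_range('1990-01-01', periods=24,
                                                  freq='MS')},
                    dims='time', name='sst')
data.attrs['dataset'] = 'ERSSTv5'

postprocess(data, new=True)  # recompute statistics and save the anomaly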
Example #2
def cross_training(model, pipeline, n_iter, **kwargs):
    """
    Train the model on different training sets in which each time a period\
    corresponding to a decade out of 1962-1971, 1972-1981, ..., 2012-last\
    observed date is spared as the test period.

    :param model: A model that follows the guidelines on how a model object\
    should be set up.

    :param pipeline: A function that takes the lead time as argument and\
    returns the corresponding features, labels, time and persistence.

    :param n_iter: The number of iterations for the randomized hyperparameter\
    search.

    :param **kwargs: Arguments that shall be passed to the .set_parameter()\
    method of the provided model.
    """

    for lead_time in lead_times:
        X, y, timey = pipeline(lead_time, return_persistance=False)

        print_header(f'Lead time: {lead_time} months')

        for j in range(n_decades-1):
            m = model(**kwargs)
            dir_name = f"{m.hyperparameters['name']}_decade{decades[j]}_lead{lead_time}"
            path = join(modeldir, dir_name)

            n_files = 0
            if exists(path):
                n_files = len(listdir(path))

            if not exists(path) or n_files == 0:
                small_print_header(
                    f'Test period: {decades[j]}-01-01 till {decades[j+1]-1}-12-01')

                test_indeces = (timey >= f'{decades[j]}-01-01') & (
                    timey <= f'{decades[j+1]-1}-12-01')
                train_indeces = np.invert(test_indeces)
                trainX, trainy, traintime = X[
                    train_indeces, :], y[train_indeces], timey[train_indeces]

                m.fit_RandomizedSearch(trainX, trainy, traintime, n_iter=n_iter)
                m.save(location=modeldir, dir_name=dir_name)

            else:
                print(f'{dir_name} already exists')
            del m
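
The decade split above builds boolean masks by comparing a DatetimeIndex
against date strings. A self-contained sketch of that masking step, with
made-up values for the module-level globals decades, X, y and timey:

import numpy as np
import pandas as pd

decades = [1962, 1972, 1982, 1992, 2002, 2012, 2018]  # assumed values
timey = pd.date_range('1962-01-01', '2017-12-01', freq='MS')
X = np.random.randn(len(timey), 3)
y = np.random.randn(len(timey))

j = 0  # spare the first decade, 1962-1971
test_indeces = (timey >= f'{decades[j]}-01-01') & (
    timey <= f'{decades[j+1]-1}-12-01')
train_indeces = np.invert(test_indeces)

trainX, trainy = X[train_indeces, :], y[train_indeces]
print(trainX.shape)  # 120 monthly rows fewer than X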
Example #3

import os
import time
# NOTE: the following two imports are assumed; the original snippet uses
# plt and K without importing them.
import matplotlib.pyplot as plt
from tensorflow.keras import backend as K

plt.close("all")
K.clear_session()

#%% =============================================================================
# Deep ensemble
# =============================================================================
decades = [60, 70, 80, 90, 100, 110]

for lead_time in [0, 3, 6, 9, 12, 15]:
    X, y, timey, yp = pipeline(lead_time, return_persistance=True)
    print_header(f'Lead time: {lead_time} months')

    for decade in decades:
        small_print_header(
            f'Test period: {1902+decade}-01-01 till {1911+decade}-12-01')

        # skip this loop iteration if the ensemble was already trained,
        # i.e. the model directory was modified after the cutoff date below
        ens_dir = f'ensemble_decade{decade}_lead{lead_time}'
        out_dir = os.path.join(modeldir, ens_dir)

        compare_time = time.strptime("21-7-2019 13:00 UTC",
                                     "%d-%m-%Y %H:%M %Z")

        # guard against a missing directory before calling getmtime
        if os.path.exists(out_dir):
            modified_time = time.gmtime(os.path.getmtime(out_dir))

            if modified_time > compare_time:
                print("Trained already!")
                continue

        test_indeces = (timey >= f'{1902+decade}-01-01') & (
            timey <= f'{1911+decade}-12-01')
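
The "already trained" guard works because time.struct_time instances compare
chronologically, field by field, like tuples. A standalone sketch of that
check with a hypothetical directory path:

import os
import time

out_dir = '.'  # hypothetical model directory
compare_time = time.strptime("21-7-2019 13:00 UTC", "%d-%m-%Y %H:%M %Z")

if os.path.exists(out_dir):
    modified_time = time.gmtime(os.path.getmtime(out_dir))
    # `>` means "modified after the cutoff", i.e. trained already
    print(modified_time > compare_time)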
Example #4
def cross_hindcast(model, pipeline, model_name):
    """
    Generate a hindcast from 1962 till today using the models that were
    trained by the .cross_training() method.

    :param model: The considered model.

    :param pipeline: The data pipeline that was already used in\
    .cross_training().

    :param model_name: The name prefix of the saved model directories.
    """

    first_lead_loop = True

    for i in range(n_lead):
        lead_time = lead_times[i]
        print_header(f'Lead time: {lead_time} months')

        X, y, timey, y_persistance = pipeline(lead_time,
                                              return_persistance=True)

        ytrue = np.array([])
        timeytrue = pd.DatetimeIndex([])

        first_dec_loop = True
        for j in range(n_decades - 1):
            small_print_header(
                f'Predict: {decades[j]}-01-01 till {decades[j+1]-1}-12-01')

            # test indices
            test_indeces = (timey >= f'{decades[j]}-01-01') & (
                timey <= f'{decades[j+1]-1}-12-01')
            testX, testy, testtimey = X[
                test_indeces, :], y[test_indeces], timey[test_indeces]

            m = model()
            m.load(location=modeldir,
                   dir_name=f'{model_name}_decade{decades[j]}_lead{lead_time}')

            # allocate arrays and variables for which the model must be loaded
            if first_dec_loop:
                n_outputs = m.n_outputs
                output_names = m.output_names
                pred_full = np.zeros((n_outputs, 0))
                first_dec_loop = False

            # make prediction
            pred = np.zeros((m.n_outputs, testX.shape[0]))
            pred[:, :] = m.predict(testX)

            # make the full time series
            pred_full = np.append(pred_full, pred, axis=1)
            ytrue = np.append(ytrue, testy)
            timeytrue = timeytrue.append(testtimey)
            del m

        if timeytrue[0] != pd.to_datetime('1963-01-01'):
            expected_first_date = '1963-01-01'
            got_first_date = timeytrue[0].isoformat()[:10]

            raise Exception(
                f"The first predicted date for lead time {lead_time} "
                f"is {got_first_date} but expected {expected_first_date}")

        # allocate arrays and variables for which the full length of the time
        # series must be known
        if first_lead_loop:
            n_time = len(timeytrue)
            pred_save = np.zeros((n_outputs, n_time, n_lead))
            first_lead_loop = False

        pred_save[:, :, i] = pred_full

    # Save data to a netcdf file
    save_dict = {}
    for i in range(n_outputs):
        save_dict[output_names[i]] = (['target_season',
                                       'lead'], pred_save[i, :, :])

    ds = xr.Dataset(save_dict,
                    coords={
                        'target_season': timeytrue,
                        'lead': lead_times
                    })
    ds.to_netcdf(join(processeddir, f'{model_name}_forecasts.nc'))
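
A self-contained sketch of the final save step: each output variable becomes
a (target_season, lead) array in an xarray Dataset. All data, names and the
output path are synthetic stand-ins:

import numpy as np
import pandas as pd
import xarray as xr

output_names = ['mean', 'std']       # hypothetical output names
lead_times = [0, 3, 6]
timeytrue = pd.date_range('1963-01-01', periods=24, freq='MS')
pred_save = np.random.randn(len(output_names), len(timeytrue),
                            len(lead_times))

save_dict = {name: (['target_season', 'lead'], pred_save[i])
             for i, name in enumerate(output_names)}

ds = xr.Dataset(save_dict,
                coords={'target_season': timeytrue, 'lead': lead_times})
ds.to_netcdf('toy_forecasts.nc')     # hypothetical output file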
Example #5
    def fit(self, trainX, trainy, valX=None, valy=None, use_pretrained=False):
        """
        Fit the model to training data
        """

        start_time = time.time()

        # clear memory
        K.clear_session()

        # allocate lists for the ensemble
        self.ensemble = []
        self.history = []
        self.val_loss = []

        self.segment_len = trainX.shape[0] // self.hyperparameters['n_segments']

        if self.hyperparameters['n_segments'] == 1 and (valX is not None
                                                        or valy is not None):
            warnings.warn(
                "Validation and test data set are the same if n_segments is 1!"
            )

        i = 0
        while i < self.hyperparameters['n_members_segment']:
            j = 0
            while j < self.hyperparameters['n_segments']:
                ensemble_member = self.build_model(trainX.shape[1])

                n_ens_sel = len(self.ensemble)
                small_print_header(
                    f"Train member Nr {n_ens_sel+1}/{self.hyperparameters['n_members']}"
                )

                if use_pretrained:
                    ensemble_member.load_weights(self.pretrained_weights)

                ensemble_member.compile(loss=self.loss,
                                        optimizer=self.optimizer,
                                        metrics=[self.loss])

                # validate on the spare segment
                if self.hyperparameters['n_segments'] != 1:
                    if valX is not None or valy is not None:
                        warnings.warn(
                            "Validation data set will be one of the segments. The provided validation data set is not used!"
                        )

                    start_ind = j * self.segment_len
                    end_ind = (j + 1) * self.segment_len

                    trainXens = np.delete(trainX,
                                          np.s_[start_ind:end_ind],
                                          axis=0)
                    trainyens = np.delete(trainy, np.s_[start_ind:end_ind])
                    valXens = trainX[start_ind:end_ind]
                    valyens = trainy[start_ind:end_ind]

                # validate on test data set
                elif self.hyperparameters['n_segments'] == 1:
                    if valX is None or valy is None:
                        raise MissingArgumentError(
                            "When the number of segments is 1, a validation "
                            "data set must be provided.")
                    trainXens = trainX
                    trainyens = trainy
                    valXens = valX
                    valyens = valy

                history = ensemble_member.fit(
                    trainXens,
                    trainyens,
                    epochs=self.hyperparameters['epochs'],
                    batch_size=self.hyperparameters['batch_size'],
                    verbose=self.hyperparameters['verbose'],
                    shuffle=True,
                    callbacks=[self.es],
                    validation_data=(valXens, valyens))

                self.history.append(history)
                self.val_loss.append(
                    ensemble_member.evaluate(valXens, valyens)[1])
                self.ensemble.append(ensemble_member)
                j += 1
            i += 1
        self.mean_val_loss = np.mean(self.val_loss)

        print(f'Mean loss: {self.mean_val_loss}')
        # print computation time
        end_time = time.time()
        passed_time = np.round(end_time - start_time, decimals=1)
        print(f'Computation time: {passed_time}s')
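
The segment-wise validation above carves one contiguous block out of the
training data with np.delete and an np.s_ slice. A minimal numpy-only sketch
of that split (toy shapes, not the model's real data):

import numpy as np

trainX = np.arange(20).reshape(10, 2)  # 10 timesteps, 2 features
trainy = np.arange(10)

segment_len = 10 // 5                  # n_segments = 5
j = 2                                  # spare the third segment
start_ind, end_ind = j * segment_len, (j + 1) * segment_len

trainXens = np.delete(trainX, np.s_[start_ind:end_ind], axis=0)
trainyens = np.delete(trainy, np.s_[start_ind:end_ind])
valXens, valyens = trainX[start_ind:end_ind], trainy[start_ind:end_ind]

print(trainXens.shape, valXens.shape)  # (8, 2) (2, 2)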
Example #6

    def fit_RandomizedSearch(self, trainX, trainy, n_iter=10, **kwargs):
        """
        Hyperparameter optimazation using random search.


        :type trainX: np.ndarray
        :param trainX: The training feature set. 2-D array with dimensions\
        (timesteps, features).

        :type trainy: np.ndarray
        :param trainy: The training label set. 2-D array with dimensions\
        (timesteps, labels).

        :param kwargs: Keyword arguments are passed to the .fit() method.
        """
        # check if hyperparameters were provided in lists for randomized search
        if len(self.hyperparameters_search) == 0:
            raise Exception("No variable indicated for hyperparameter search!")

        # iterate with randomized hyperparameters
        best_loss = np.inf
        for i in range(n_iter):
            print_header(f"Search iteration Nr {i+1}/{n_iter}")

            # random selection of hyperparameters
            for key in self.hyperparameters_search.keys():
                low = self.hyperparameters_search[key][0]
                high = self.hyperparameters_search[key][1]

                if type(low) is float and type(high) is float:
                    self.hyperparameters[key] = np.random.uniform(low, high)

                if type(low) is int and type(high) is int:
                    self.hyperparameters[key] = np.random.randint(low, high+1)

                if type(low) is tuple and type(high) is tuple:
                    hyp_list = []
                    for k in range(len(low)):
                        hyp_list.append(np.random.randint(low[k], high[k] + 1))
                    self.hyperparameters[key] = tuple(hyp_list)

            self.fit(trainX, trainy, **kwargs)

            # check if the validation score improved
            if self.mean_val_loss < best_loss:
                best_loss = self.mean_val_loss
                self.best_hyperparameters = self.hyperparameters.copy()

                small_print_header("New best hyperparameters")
                print(f"Mean loss: {best_loss}")
                print(self.best_hyperparameters)

        # refit the model with the optimized hyperparameters
        # AND to recover the weights of the DE for the best hyperparameters
        print_header("Refit the model with the best hyperparameters")

        self.hyperparameters = self.best_hyperparameters.copy()
        print(self.hyperparameters)
        self.fit(trainX, trainy, **kwargs)

        print(f"best loss search: {best_loss}")
        print(f"loss refitting : {self.mean_val_loss}")
Example #7

    def fit(self, trainX, trainy, valX=None, valy=None):
        """
        Fit the model. If n_segments is 1, then a validation data set needs to
        be supplied.

        :type trainX: np.ndarray
        :param trainX: The training feature set. 2-D array with dimensions\
        (timesteps, features)

        :type trainy: np.ndarray
        :param trainy: The training label set. 2-D array with dimensions\
        (timesteps, labels)

        :type valX: np.ndarray
        :param valX: The validation feature set. 2-D array with dimensions\
        (timesteps, features).

        :type valy:  np.ndarray
        :param valy: The validation label set. 2-D array with dimensions\
        (timesteps, labels).
        """

        # clear memory
        K.clear_session()

        # allocate lists for the ensemble
        self.ensemble = []
        self.history = []
        self.val_loss = []

        self.segment_len = trainX.shape[0] // self.n_segments

        if self.n_segments == 1 and (valX is not None or valy is not None):
            warnings.warn(
                "Validation and test data set are the same if n_segments is 1!")

        i = 0
        while i < self.n_members_segment:
            j = 0
            while j < self.n_segments:
                n_ens_sel = len(self.ensemble)
                small_print_header(
                    f"Train member Nr {n_ens_sel+1}/{self.n_members}")

                # build model
                member, member_encoder, member_decoder = self.build_model(trainX.shape[1], trainy.shape[1])

                # compile model
                member.compile(loss='mse', optimizer=self.optimizer, metrics=['mse'])

                # validate on the spare segment
                if self.n_segments != 1:
                    if valX is not None or valy is not None:
                        warnings.warn(
                            "Validation data set will be one of the segments. "
                            "The provided validation data set is not used!")

                    start_ind = j * self.segment_len
                    end_ind = (j + 1) * self.segment_len

                    trainXens = np.delete(trainX, np.s_[start_ind:end_ind],
                                          axis=0)
                    trainyens = np.delete(trainy, np.s_[start_ind:end_ind],
                                          axis=0)
                    valXens = trainX[start_ind:end_ind]
                    valyens = trainy[start_ind:end_ind]

                # validate on test data set
                elif self.n_segments == 1:
                    if valX is None or valy is None:
                        raise MissingArgumentError(
                            "When the number of segments is 1, a validation "
                            "data set must be provided.")
                    trainXens = trainX
                    trainyens = trainy
                    valXens = valX
                    valyens = valy

                history = member.fit(trainXens, trainyens,
                                     epochs=self.epochs,
                                     batch_size=self.hyperparameters['batch_size'],
                                     verbose=self.verbose,
                                     shuffle=True,
                                     callbacks=[self.es],
                                     validation_data=(valXens, valyens))

                self.history.append(history)
                self.val_loss.append(member.evaluate(valXens, valyens)[1])
                print(f"Loss: {self.val_loss[-1]}")
                self.ensemble.append(member)
                j += 1
            i += 1
        self.mean_val_loss = np.mean(self.val_loss)
        print(f"Mean loss: {self.mean_val_loss}")
Example #8
def cross_training(model, pipeline, n_iter, lead_times, **kwargs):
    """
    Train the model on different training sets in which each time a period\
    corresponding to a decade out of 1962-1971, 1972-1981, ..., 2012-last\
    observed date is spared as the test period.

    :param model: A model that follows the guidelines on how a model object\
    should be set up.

    :param pipeline: A function that takes the lead time as argument and\
    returns the corresponding features, labels, time and persistence.

    :param n_iter: The number of iterations for the randomized hyperparameter\
    search.

    :param lead_times: The lead times (in months) for which models are\
    trained.

    :param **kwargs: Arguments that shall be passed to the .set_parameter()\
    method of the provided model.
    """

    for lead_time in lead_times:
        X, y, timey = pipeline(lead_time, return_persistance=False)

        print_header(f'Lead time: {lead_time} months')

        for j in range(n_decades - 1):
            m = model(**kwargs)
            dir_name = f"{m.hyperparameters['name']}_decade{decades[j]}_lead{lead_time}"
            path = join(modeldir, dir_name)

            n_files = 0
            if exists(path):
                n_files = len(listdir(path))

            if not exists(path) or n_files == 0:
                small_print_header(
                    f'Test period: {decades[j]}-01-01 till {decades[j+1]-1}-12-01'
                )

                test_indeces = (timey >= f'{decades[j]}-01-01') & (
                    timey <= f'{decades[j+1]-1}-12-01')
                train_indeces = np.invert(test_indeces)
                trainX, trainy, traintime = X[
                    train_indeces, :], y[train_indeces], timey[train_indeces]

                m.fit_RandomizedSearch(trainX,
                                       trainy,
                                       traintime,
                                       n_iter=n_iter)
                m.save(location=modeldir, dir_name=dir_name)

            else:
                print(f'{dir_name} already exists')
            del m
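
cross_training only assumes that the pipeline returns a feature matrix, a
label vector and a DatetimeIndex for a given lead time. A hypothetical toy
pipeline illustrating that contract:

import numpy as np
import pandas as pd

def toy_pipeline(lead_time, return_persistance=False):
    """Hypothetical stand-in for the data pipeline cross_training expects."""
    timey = pd.date_range('1962-01-01', '2017-12-01', freq='MS')
    rng = np.random.default_rng(lead_time)
    X = rng.standard_normal((len(timey), 4))  # (timesteps, features)
    y = rng.standard_normal(len(timey))       # (timesteps,)
    if return_persistance:
        return X, y, timey, np.roll(y, lead_time)  # crude persistence proxy
    return X, y, timey

X, y, timey = toy_pipeline(3)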


# def cross_hindcast(model, pipeline, model_name, **kwargs):
#     """
#     Generate a hindcast from 1962 till today using the models which were
#     trained by the .cross_training() method.

#     :param model: The considered model.

#     :param pipeline: The data pipeline that already was used before in \
#     .cross_training().
#     """

#     first_lead_loop = True

#     for i in range(n_lead):
#         lead_time = lead_times[i]
#         print_header(f'Lead time: {lead_time} months')

#         X, y, timey, y_persistance = pipeline(lead_time, return_persistance=True)

#         ytrue = np.array([])
#         timeytrue = pd.DatetimeIndex([])

#         first_dec_loop = True
#         for j in range(n_decades-1):
#             small_print_header(f'Predict: {decades[j]}-01-01 till {decades[j+1]-1}-12-01')

#             # test indices
#             test_indeces = (timey>=f'{decades[j]}-01-01') & (timey<=f'{decades[j+1]-1}-12-01')
#             testX, testy, testtimey = X[test_indeces,:], y[test_indeces], timey[test_indeces]

#             m = model(**kwargs)
#             m.load(location=modeldir, dir_name=f'{model_name}_decade{decades[j]}_lead{lead_time}')

#             # allocate arrays and variables for which the model must be loaded
#             if first_dec_loop:
#                 n_outputs = m.n_outputs

#                 output_names = m.output_names
#                 pred_full = np.zeros((n_outputs, 0))
#                 first_dec_loop=False

#             # make prediction
#             pred = np.zeros((m.n_outputs, testX.shape[0]))
#             pred[:,:] = m.predict(testX)

#             # make the full time series
#             pred_full = np.append(pred_full, pred, axis=1)
#             ytrue = np.append(ytrue, testy)
#             timeytrue = timeytrue.append(testtimey)
#             del m

#         if timeytrue[0]!=pd.to_datetime('1963-01-01'):
#             expected_first_date = '1963-01-01'
#             got_first_date = timeytrue[0].isoformat()[:10]

#             raise Exception(f"The first predicted date for lead time {lead_time} \
#                             is {got_first_date} but expected {expected_first_date}")

#         # allocate arrays and variables for which the full length of the time
#         # series must be known
#         if first_lead_loop:
#             n_time = len(timeytrue)
#             pred_save =  np.zeros((n_outputs, n_time, n_lead))
#             first_lead_loop=False

#         pred_save[:,:,i] =  pred_full

#     # Save data to a netcdf file
#     save_dict = {}
#     for i in range(n_outputs):
#         save_dict[output_names[i]] = (['target_season', 'lead'],  pred_save[i,:,:])

#     ds = xr.Dataset(save_dict, coords={'target_season': timeytrue,
#                                        'lead': lead_times} )
#     ds.to_netcdf(join(processeddir, f'{model_name}_forecasts.nc'))

# def cross_hindcast_dem(model, pipeline, model_name):
#     """
#     Generate a hindcast from 1962 till today using the models which were
#     trained by the .cross_training() method. ONLY works for the DEM.
#     This routine returns an std estimate that is only based on the correlation
#     skill of the DEM predicted mean.

#     :param model: The considered model.

#     :param pipeline: The data pipeline that already was used before in \
#     .cross_training().
#     """
#     #cross_hindcast(model, pipeline, model_name)

#     std_estimate = xr.open_dataarray(join(processeddir, f'{model_name}_std_estimate.nc'))

#     first_lead_loop = True

#     for i in range(n_lead):
#         lead_time = lead_times[i]
#         print_header(f'Lead time: {lead_time} months')

#         X, y, timey, y_persistance = pipeline(lead_time, return_persistance=True)

#         ytrue = np.array([])
#         timeytrue = pd.DatetimeIndex([])

#         first_dec_loop = True
#         for j in range(n_decades-1):
#             small_print_header(f'Predict: {decades[j]}-01-01 till {decades[j+1]-1}-12-01')

#             # test indices
#             test_indeces = (timey>=f'{decades[j]}-01-01') & (timey<=f'{decades[j+1]-1}-12-01')
#             testX, testy, testtimey = X[test_indeces,:], y[test_indeces], timey[test_indeces]

#             m = model()
#             m.load(location=modeldir, dir_name=f'{model_name}_decade{decades[j]}_lead{lead_time}')

#             # allocate arrays and variables for which the model must be loaded
#             if first_dec_loop:
#                 n_outputs = m.n_outputs
#                 output_names = m.output_names
#                 pred_full = np.zeros((n_outputs+1, 0))
#                 first_dec_loop=False

#             # make prediction
#             pred = np.zeros((m.n_outputs+1, testX.shape[0]))
#             pred[:2,:] = m.predict(testX)

#             for k in range(len(testtimey)):
#                 month = testtimey[k].date().month
#                 pred[-1, k] = std_estimate[i, month-1]

#             # make the full time series
#             pred_full = np.append(pred_full, pred, axis=1)
#             ytrue = np.append(ytrue, testy)
#             timeytrue = timeytrue.append(testtimey)
#             del m

#         if timeytrue[0]!=pd.to_datetime('1963-01-01'):
#             expected_first_date = '1963-01-01'
#             got_first_date = timeytrue[0].isoformat()[:10]

#             raise Exception(f"The first predicted date for lead time {lead_time} \
#                             is {got_first_date} but expected {expected_first_date}")

#         # allocate arrays and variables for which the full length of the time
#         # series must be known
#         if first_lead_loop:
#             n_time = len(timeytrue)
#             pred_save =  np.zeros((n_outputs+1, n_time, n_lead))
#             first_lead_loop=False

#         pred_save[:,:,i] =  pred_full

#     # Save data to a netcdf file
#     save_dict = {}
#     for i in range(n_outputs + 1):
#         if i<n_outputs:
#             save_dict[output_names[i]] = (['target_season', 'lead'],  pred_save[i,:,:])
#         else:
#             save_dict['std_estimate'] = (['target_season', 'lead'],  pred_save[i,:,:])

#     ds = xr.Dataset(save_dict, coords={'target_season': timeytrue,
#                                        'lead': lead_times} )
#     ds.to_netcdf(join(processeddir, f'{model_name}_forecasts_with_std_estimated.nc'))
#     ds.close()