def postprocess(data, new=False, ref_period=True):
    """
    Combine all of the postprocessing functions in one routine.

    :param data: An xarray DataArray.

    :param new: Compute the statistics again (default: False).

    :param ref_period: Use the reference period for the anomaly computation\
    (default: True). The value is handed on through the module-level\
    reference_period variable.
    """
    small_print_header(f"Process {data.name} from {data.dataset}")
    toProcessedDir(data, new)

    # TODO: Do this better!
    global reference_period
    reference_period = ref_period

    saveAnomaly(data, new)
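
# A minimal usage sketch (illustrative, with toy values): postprocess() above
# expects an xarray DataArray that carries a `name` and a `dataset` attribute;
# xarray exposes entries of .attrs via attribute-style access, which is what
# `data.dataset` relies on.
import numpy as np
import pandas as pd
import xarray as xr

toy_time = pd.date_range('1962-01-01', periods=24, freq='MS')
toy = xr.DataArray(np.random.rand(24), coords={'time': toy_time},
                   dims='time', name='sst', attrs={'dataset': 'toy'})
# postprocess(toy, new=True)   # would write the processed file and the anomaly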
def cross_training(model, pipeline, n_iter, **kwargs):
    """
    Train the model on different training sets in which each time one decade\
    out of 1962-1971, 1972-1981, ..., 2012-last observed date is left out as\
    the test period.

    :param model: A model that follows the guidelines how a model object\
    should be set up.

    :param pipeline: A function that takes the lead time as argument and\
    returns the corresponding features, labels, time and persistence.

    :param n_iter: The number of iterations for the random search that is\
    passed on to the .fit_RandomizedSearch() method of the provided model.

    :param kwargs: Keyword arguments that shall be passed on when the\
    provided model is instantiated.
    """
    for lead_time in lead_times:
        X, y, timey = pipeline(lead_time, return_persistance=False)

        print_header(f'Lead time: {lead_time} months')

        for j in range(n_decades - 1):
            m = model(**kwargs)
            dir_name = f"{m.hyperparameters['name']}_decade{decades[j]}_lead{lead_time}"
            path = join(modeldir, dir_name)

            n_files = 0
            if exists(path):
                n_files = len(listdir(path))

            if not exists(path) or n_files == 0:
                small_print_header(
                    f'Test period: {decades[j]}-01-01 till {decades[j+1]-1}-12-01')

                test_indeces = (timey >= f'{decades[j]}-01-01') & (
                    timey <= f'{decades[j+1]-1}-12-01')
                train_indeces = np.invert(test_indeces)

                trainX, trainy, traintime = (X[train_indeces, :],
                                             y[train_indeces],
                                             timey[train_indeces])

                m.fit_RandomizedSearch(trainX, trainy, traintime, n_iter=n_iter)
                m.save(location=modeldir, dir_name=dir_name)
            else:
                print(f'{dir_name} already exists')
            del m
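
# A minimal sketch of the decade leave-out masking used above, on synthetic
# monthly data (numpy/pandas only; the decade edges mirror the docstring,
# the toy shapes are assumptions).
import numpy as np
import pandas as pd

toy_timey = pd.date_range('1962-01-01', '2017-12-01', freq='MS')
toy_X = np.random.rand(len(toy_timey), 4)
toy_y = np.random.rand(len(toy_timey))

toy_decades = [1962, 1972, 1982, 1992, 2002, 2012, 2018]
j = 0                                       # spare the 1962-1971 decade
test = (toy_timey >= f'{toy_decades[j]}-01-01') & \
       (toy_timey <= f'{toy_decades[j+1]-1}-12-01')
train = np.invert(test)
print(toy_X[train, :].shape, int(test.sum()))   # (552, 4) 120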
import os
import time

import matplotlib.pyplot as plt
from keras import backend as K

# (the surrounding script also provides pipeline, modeldir and the
# print_header/small_print_header helpers)

plt.close("all")
K.clear_session()

#%% =============================================================================
# Deep ensemble
# =============================================================================
decades = [60, 70, 80, 90, 100, 110]

for lead_time in [0, 3, 6, 9, 12, 15]:
    X, y, timey, yp = pipeline(lead_time, return_persistance=True)
    print_header(f'Lead time: {lead_time} months')

    for decade in decades:
        small_print_header(
            f'Test period: {1902+decade}-01-01 till {1911+decade}-12-01')

        # skip this loop iteration if the ensemble was already trained
        ens_dir = f'ensemble_decade{decade}_lead{lead_time}'
        out_dir = os.path.join(modeldir, ens_dir)

        # only compare modification times if the output directory exists;
        # otherwise the model still needs to be trained
        if os.path.exists(out_dir):
            modified_time = time.gmtime(os.path.getmtime(out_dir))
            compare_time = time.strptime("21-7-2019 13:00 UTC",
                                         "%d-%m-%Y %H:%M %Z")
            if modified_time > compare_time:
                print("Trained already!")
                continue

        test_indeces = (timey >= f'{1902+decade}-01-01') & (
            timey <= f'{1911+decade}-12-01')
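
# A sketch of the "skip if already trained" check above, using datetime for
# readability (the helper name and the cutoff value are illustrative, not
# from the original script).
import os
from datetime import datetime, timezone

def trained_after(path, cutoff):
    """Return True if `path` exists and was modified after `cutoff` (UTC)."""
    if not os.path.exists(path):
        return False
    mtime = datetime.fromtimestamp(os.path.getmtime(path), tz=timezone.utc)
    return mtime > cutoff

cutoff = datetime(2019, 7, 21, 13, 0, tzinfo=timezone.utc)
print(trained_after('.', cutoff))   # True for any recently modified directory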
def cross_hindcast(model, pipeline, model_name):
    """
    Generate a hindcast from 1962 till today using the models that were
    trained by the .cross_training() method.

    :param model: The considered model.

    :param pipeline: The data pipeline that was already used in\
    .cross_training().

    :param model_name: The name of the model, used to locate the saved\
    members and to name the output file.
    """
    first_lead_loop = True

    for i in range(n_lead):
        lead_time = lead_times[i]
        print_header(f'Lead time: {lead_time} months')

        X, y, timey, y_persistance = pipeline(lead_time, return_persistance=True)

        ytrue = np.array([])
        timeytrue = pd.DatetimeIndex([])

        first_dec_loop = True
        for j in range(n_decades - 1):
            small_print_header(
                f'Predict: {decades[j]}-01-01 till {decades[j+1]-1}-12-01')

            # test indices
            test_indeces = (timey >= f'{decades[j]}-01-01') & (
                timey <= f'{decades[j+1]-1}-12-01')
            testX, testy, testtimey = (X[test_indeces, :], y[test_indeces],
                                       timey[test_indeces])

            m = model()
            m.load(location=modeldir,
                   dir_name=f'{model_name}_decade{decades[j]}_lead{lead_time}')

            # allocate arrays and variables for which the model must be loaded
            if first_dec_loop:
                n_outputs = m.n_outputs
                output_names = m.output_names
                pred_full = np.zeros((n_outputs, 0))
                first_dec_loop = False

            # make the prediction
            pred = np.zeros((m.n_outputs, testX.shape[0]))
            pred[:, :] = m.predict(testX)

            # assemble the full time series
            pred_full = np.append(pred_full, pred, axis=1)
            ytrue = np.append(ytrue, testy)
            timeytrue = timeytrue.append(testtimey)
            del m

        if timeytrue[0] != pd.to_datetime('1963-01-01'):
            expected_first_date = '1963-01-01'
            got_first_date = timeytrue[0].isoformat()[:10]

            raise Exception(
                f"The first predicted date for lead time {lead_time} "
                f"is {got_first_date} but expected {expected_first_date}")

        # allocate arrays and variables for which the full length of the time
        # series must be known
        if first_lead_loop:
            n_time = len(timeytrue)
            pred_save = np.zeros((n_outputs, n_time, n_lead))
            first_lead_loop = False

        pred_save[:, :, i] = pred_full

    # save the data to a netcdf file
    save_dict = {}
    for i in range(n_outputs):
        save_dict[output_names[i]] = (['target_season', 'lead'],
                                      pred_save[i, :, :])

    ds = xr.Dataset(save_dict, coords={'target_season': timeytrue,
                                       'lead': lead_times})
    ds.to_netcdf(join(processeddir, f'{model_name}_forecasts.nc'))
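
# A minimal sketch of the netcdf assembly at the end of cross_hindcast():
# per-output forecasts are stacked into an xarray Dataset indexed by target
# season and lead time (toy shapes and names; the output path is hypothetical).
import numpy as np
import pandas as pd
import xarray as xr

toy_leads = [0, 3, 6]
toy_seasons = pd.date_range('1963-01-01', periods=24, freq='MS')
toy_outputs = ['mean', 'std']
toy_pred = np.random.rand(len(toy_outputs), len(toy_seasons), len(toy_leads))

toy_ds = xr.Dataset(
    {name: (['target_season', 'lead'], toy_pred[i])
     for i, name in enumerate(toy_outputs)},
    coords={'target_season': toy_seasons, 'lead': toy_leads})
toy_ds.to_netcdf('toy_forecasts.nc')   # hypothetical output file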
def fit(self, trainX, trainy, valX=None, valy=None, use_pretrained=False):
    """
    Fit the model to the training data. If n_segments is 1, a validation
    data set must be supplied via valX and valy.
    """
    start_time = time.time()

    # clear memory
    K.clear_session()

    # allocate lists for the ensemble
    self.ensemble = []
    self.history = []
    self.val_loss = []

    self.segment_len = trainX.shape[0] // self.hyperparameters['n_segments']

    if self.hyperparameters['n_segments'] == 1 and (valX is not None
                                                    or valy is not None):
        warnings.warn(
            "Validation and test data set are the same if n_segments is 1!")

    i = 0
    while i < self.hyperparameters['n_members_segment']:
        j = 0
        while j < self.hyperparameters['n_segments']:
            ensemble_member = self.build_model(trainX.shape[1])

            n_ens_sel = len(self.ensemble)
            small_print_header(
                f"Train member Nr {n_ens_sel+1}/{self.hyperparameters['n_members']}")

            if use_pretrained:
                ensemble_member.load_weights(self.pretrained_weights)

            ensemble_member.compile(loss=self.loss, optimizer=self.optimizer,
                                    metrics=[self.loss])

            # validate on the spared segment
            if self.hyperparameters['n_segments'] != 1:
                if valX is not None or valy is not None:
                    warnings.warn(
                        "Validation data set will be one of the segments. "
                        "The provided validation data set is not used!")

                start_ind = j * self.segment_len
                end_ind = (j + 1) * self.segment_len

                trainXens = np.delete(trainX, np.s_[start_ind:end_ind], axis=0)
                trainyens = np.delete(trainy, np.s_[start_ind:end_ind])
                valXens = trainX[start_ind:end_ind]
                valyens = trainy[start_ind:end_ind]

            # validate on the provided validation data set
            elif self.hyperparameters['n_segments'] == 1:
                if valX is None or valy is None:
                    raise MissingArgumentError(
                        "When n_segments is 1, a validation data set must be provided.")
                trainXens = trainX
                trainyens = trainy
                valXens = valX
                valyens = valy

            history = ensemble_member.fit(
                trainXens, trainyens,
                epochs=self.hyperparameters['epochs'],
                batch_size=self.hyperparameters['batch_size'],
                verbose=self.hyperparameters['verbose'],
                shuffle=True,
                callbacks=[self.es],
                validation_data=(valXens, valyens))

            self.history.append(history)
            self.val_loss.append(ensemble_member.evaluate(valXens, valyens)[1])
            self.ensemble.append(ensemble_member)
            j += 1
        i += 1

    self.mean_val_loss = np.mean(self.val_loss)
    print(f'Loss: {self.mean_val_loss}')

    # print the computation time
    end_time = time.time()
    passed_time = np.round(end_time - start_time, decimals=1)
    print(f'Computation time: {passed_time}s')
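
# A minimal sketch of the segment-wise validation split used in fit():
# segment j is held out for validation while the rest is used for training
# (numpy only; the shapes are toy values).
import numpy as np

toy_X = np.random.rand(600, 8)
toy_y = np.random.rand(600)
n_segments = 5
segment_len = toy_X.shape[0] // n_segments

j = 2                                     # hold out the third segment
start_ind, end_ind = j * segment_len, (j + 1) * segment_len
trainXens = np.delete(toy_X, np.s_[start_ind:end_ind], axis=0)
trainyens = np.delete(toy_y, np.s_[start_ind:end_ind])
valXens, valyens = toy_X[start_ind:end_ind], toy_y[start_ind:end_ind]
print(trainXens.shape, valXens.shape)     # (480, 8) (120, 8)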
def fit_RandomizedSearch(self, trainX, trainy, n_iter=10, **kwargs):
    """
    Hyperparameter optimization using random search.

    :type trainX: np.ndarray
    :param trainX: The training feature set. 2-D array with dimensions\
    (timesteps, features).

    :type trainy: np.ndarray
    :param trainy: The training label set. 2-D array with dimensions\
    (timesteps, labels).

    :param n_iter: The number of random search iterations (default: 10).

    :param kwargs: Keyword arguments are passed to the .fit() method.
    """
    # check if hyperparameters were provided in lists for the randomized search
    if len(self.hyperparameters_search) == 0:
        raise Exception("No variable indicated for hyperparameter search!")

    # iterate with randomized hyperparameters
    best_loss = np.inf
    for i in range(n_iter):
        print_header(f"Search iteration Nr {i+1}/{n_iter}")

        # random selection of hyperparameters
        for key in self.hyperparameters_search.keys():
            low = self.hyperparameters_search[key][0]
            high = self.hyperparameters_search[key][1]

            if type(low) is float and type(high) is float:
                self.hyperparameters[key] = np.random.uniform(low, high)

            if type(low) is int and type(high) is int:
                self.hyperparameters[key] = np.random.randint(low, high + 1)

            if type(low) is tuple and type(high) is tuple:
                hyp_list = []
                for k in range(len(low)):
                    hyp_list.append(np.random.randint(low[k], high[k] + 1))
                self.hyperparameters[key] = tuple(hyp_list)

        self.fit(trainX, trainy, **kwargs)

        # check if the validation score improved
        if self.mean_val_loss < best_loss:
            best_loss = self.mean_val_loss
            self.best_hyperparameters = self.hyperparameters.copy()

            small_print_header("New best hyperparameters")
            print(f"Mean loss: {best_loss}")
            print(self.best_hyperparameters)

    # refit the model with the optimized hyperparameters
    # AND to have the weights of the DE for the best hyperparameters again
    print_header("Refit the model with the best hyperparameters")
    self.hyperparameters = self.best_hyperparameters.copy()
    print(self.hyperparameters)
    self.fit(trainX, trainy, **kwargs)

    print(f"best loss search: {best_loss}")
    print(f"loss refitting : {self.mean_val_loss}")
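
# A minimal sketch of the random draw logic in fit_RandomizedSearch(): float
# ranges are sampled uniformly, int ranges inclusively, and tuple ranges
# element-wise (the search space below is a toy assumption).
import numpy as np

toy_search = {'lr': (0.0001, 0.01),           # float -> uniform
              'batch_size': (16, 128),        # int   -> randint, inclusive
              'layers': ((8, 8), (64, 64))}   # tuple -> element-wise randint

drawn = {}
for key, (low, high) in toy_search.items():
    if isinstance(low, float):
        drawn[key] = np.random.uniform(low, high)
    elif isinstance(low, int):
        drawn[key] = np.random.randint(low, high + 1)
    elif isinstance(low, tuple):
        drawn[key] = tuple(np.random.randint(l, h + 1)
                           for l, h in zip(low, high))
print(drawn)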
def fit(self, trainX, trainy, valX=None, valy=None):
    """
    Fit the model. If n_segments is 1, a validation data set needs to be
    supplied via valX and valy.

    :type trainX: np.ndarray
    :param trainX: The training feature set. 2-D array with dimensions\
    (timesteps, features).

    :type trainy: np.ndarray
    :param trainy: The training label set. 2-D array with dimensions\
    (timesteps, labels).

    :type valX: np.ndarray
    :param valX: The validation feature set. 2-D array with dimensions\
    (timesteps, features).

    :type valy: np.ndarray
    :param valy: The validation label set. 2-D array with dimensions\
    (timesteps, labels).
    """
    # clear memory
    K.clear_session()

    # allocate lists for the ensemble
    self.ensemble = []
    self.history = []
    self.val_loss = []

    self.segment_len = trainX.shape[0] // self.n_segments

    if self.n_segments == 1 and (valX is not None or valy is not None):
        warnings.warn(
            "Validation and test data set are the same if n_segments is 1!")

    i = 0
    while i < self.n_members_segment:
        j = 0
        while j < self.n_segments:
            n_ens_sel = len(self.ensemble)
            small_print_header(f"Train member Nr {n_ens_sel+1}/{self.n_members}")

            # build the model
            member, member_encoder, member_decoder = self.build_model(
                trainX.shape[1], trainy.shape[1])

            # compile the model
            member.compile(loss='mse', optimizer=self.optimizer, metrics=['mse'])

            # validate on the spared segment
            if self.n_segments != 1:
                if valX is not None or valy is not None:
                    warnings.warn(
                        "Validation data set will be one of the segments. "
                        "The provided validation data set is not used!")

                start_ind = j * self.segment_len
                end_ind = (j + 1) * self.segment_len

                trainXens = np.delete(trainX, np.s_[start_ind:end_ind], axis=0)
                trainyens = np.delete(trainy, np.s_[start_ind:end_ind], axis=0)
                valXens = trainX[start_ind:end_ind]
                valyens = trainy[start_ind:end_ind]

            # validate on the provided validation data set
            elif self.n_segments == 1:
                if valX is None or valy is None:
                    raise MissingArgumentError(
                        "When n_segments is 1, a validation data set must be provided.")
                trainXens = trainX
                trainyens = trainy
                valXens = valX
                valyens = valy

            history = member.fit(trainXens, trainyens,
                                 epochs=self.epochs,
                                 batch_size=self.hyperparameters['batch_size'],
                                 verbose=self.verbose,
                                 shuffle=True,
                                 callbacks=[self.es],
                                 validation_data=(valXens, valyens))

            self.history.append(history)
            self.val_loss.append(member.evaluate(valXens, valyens)[1])
            print(f"Loss: {self.val_loss[-1]}")
            self.ensemble.append(member)
            j += 1
        i += 1

    self.mean_val_loss = np.mean(self.val_loss)
    print(f"Mean loss: {self.mean_val_loss}")
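
# A sketch of the early-stopping callback that `self.es` presumably holds
# (standard Keras API; the monitor and patience values are assumptions, not
# taken from this code base).
from tensorflow.keras.callbacks import EarlyStopping

es = EarlyStopping(monitor='val_loss', patience=10,
                   restore_best_weights=True)
# passed to member.fit(..., callbacks=[es]) during training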
def cross_training(model, pipeline, n_iter, lead_times, **kwargs):
    """
    Train the model on different training sets in which each time one decade\
    out of 1962-1971, 1972-1981, ..., 2012-last observed date is left out as\
    the test period.

    :param model: A model that follows the guidelines how a model object\
    should be set up.

    :param pipeline: A function that takes the lead time as argument and\
    returns the corresponding features, labels, time and persistence.

    :param n_iter: The number of iterations for the random search that is\
    passed on to the .fit_RandomizedSearch() method of the provided model.

    :param lead_times: The lead times (in months) for which a model is\
    trained.

    :param kwargs: Keyword arguments that shall be passed on when the\
    provided model is instantiated.
    """
    for lead_time in lead_times:
        X, y, timey = pipeline(lead_time, return_persistance=False)

        print_header(f'Lead time: {lead_time} months')

        for j in range(n_decades - 1):
            m = model(**kwargs)
            dir_name = f"{m.hyperparameters['name']}_decade{decades[j]}_lead{lead_time}"
            path = join(modeldir, dir_name)

            n_files = 0
            if exists(path):
                n_files = len(listdir(path))

            if not exists(path) or n_files == 0:
                small_print_header(
                    f'Test period: {decades[j]}-01-01 till {decades[j+1]-1}-12-01')

                test_indeces = (timey >= f'{decades[j]}-01-01') & (
                    timey <= f'{decades[j+1]-1}-12-01')
                train_indeces = np.invert(test_indeces)

                trainX, trainy, traintime = (X[train_indeces, :],
                                             y[train_indeces],
                                             timey[train_indeces])

                m.fit_RandomizedSearch(trainX, trainy, traintime, n_iter=n_iter)
                m.save(location=modeldir, dir_name=dir_name)
            else:
                print(f'{dir_name} already exists')
            del m
# """ # first_lead_loop = True # for i in range(n_lead): # lead_time = lead_times[i] # print_header(f'Lead time: {lead_time} months') # X, y, timey, y_persistance = pipeline(lead_time, return_persistance=True) # ytrue = np.array([]) # timeytrue = pd.DatetimeIndex([]) # first_dec_loop = True # for j in range(n_decades-1): # small_print_header(f'Predict: {decades[j]}-01-01 till {decades[j+1]-1}-12-01') # # test indices # test_indeces = (timey>=f'{decades[j]}-01-01') & (timey<=f'{decades[j+1]-1}-12-01') # testX, testy, testtimey = X[test_indeces,:], y[test_indeces], timey[test_indeces] # m = model(**kwargs) # m.load(location=modeldir, dir_name=f'{model_name}_decade{decades[j]}_lead{lead_time}') # # allocate arrays and variables for which the model must be loaded # if first_dec_loop: # n_outputs = m.n_outputs # output_names = m.output_names # pred_full = np.zeros((n_outputs, 0)) # first_dec_loop=False # # make prediction # pred = np.zeros((m.n_outputs, testX.shape[0])) # pred[:,:] = m.predict(testX) # # make the full time series # pred_full = np.append(pred_full, pred, axis=1) # ytrue = np.append(ytrue, testy) # timeytrue = timeytrue.append(testtimey) # del m # if timeytrue[0]!=pd.to_datetime('1963-01-01'): # expected_first_date = '1963-01-01' # got_first_date = timeytrue[0].isoformat()[:10] # raise Exception(f"The first predicted date for lead time {lead_time} \ # is {got_first_date} but expected {expected_first_date}") # # allocate arrays and variables for which the full length of the time # # series must be known # if first_lead_loop: # n_time = len(timeytrue) # pred_save = np.zeros((n_outputs, n_time, n_lead)) # first_lead_loop=False # pred_save[:,:,i] = pred_full # # Save data to a netcdf file # save_dict = {} # for i in range(n_outputs): # save_dict[output_names[i]] = (['target_season', 'lead'], pred_save[i,:,:]) # ds = xr.Dataset(save_dict, coords={'target_season': timeytrue, # 'lead': lead_times} ) # ds.to_netcdf(join(processeddir, f'{model_name}_forecasts.nc')) # def cross_hindcast_dem(model, pipeline, model_name): # """ # Generate a hindcast from 1962 till today using the models which were # trained by the .cross_training() method. ONLY works for the DEM. # This routine returns an std estimate that is only based on the corrlation # skill of the DEM predicted mean. # :param model: The considered model. # :param pipeline: The data pipeline that already was used before in \ # .cross_training(). 
# """ # #cross_hindcast(model, pipeline, model_name) # std_estimate = xr.open_dataarray(join(processeddir, f'{model_name}_std_estimate.nc')) # first_lead_loop = True # for i in range(n_lead): # lead_time = lead_times[i] # print_header(f'Lead time: {lead_time} months') # X, y, timey, y_persistance = pipeline(lead_time, return_persistance=True) # ytrue = np.array([]) # timeytrue = pd.DatetimeIndex([]) # first_dec_loop = True # for j in range(n_decades-1): # small_print_header(f'Predict: {decades[j]}-01-01 till {decades[j+1]-1}-12-01') # # test indices # test_indeces = (timey>=f'{decades[j]}-01-01') & (timey<=f'{decades[j+1]-1}-12-01') # testX, testy, testtimey = X[test_indeces,:], y[test_indeces], timey[test_indeces] # m = model() # m.load(location=modeldir, dir_name=f'{model_name}_decade{decades[j]}_lead{lead_time}') # # allocate arrays and variables for which the model must be loaded # if first_dec_loop: # n_outputs = m.n_outputs # output_names = m.output_names # pred_full = np.zeros((n_outputs+1, 0)) # first_dec_loop=False # # make prediction # pred = np.zeros((m.n_outputs+1, testX.shape[0])) # pred[:2,:] = m.predict(testX) # for k in range(len(testtimey)): # month = testtimey[k].date().month # pred[-1, k] = std_estimate[i, month-1] # # make the full time series # pred_full = np.append(pred_full, pred, axis=1) # ytrue = np.append(ytrue, testy) # timeytrue = timeytrue.append(testtimey) # del m # if timeytrue[0]!=pd.to_datetime('1963-01-01'): # expected_first_date = '1963-01-01' # got_first_date = timeytrue[0].isoformat()[:10] # raise Exception(f"The first predicted date for lead time {lead_time} \ # is {got_first_date} but expected {expected_first_date}") # # allocate arrays and variables for which the full length of the time # # series must be known # if first_lead_loop: # n_time = len(timeytrue) # pred_save = np.zeros((n_outputs+1, n_time, n_lead)) # first_lead_loop=False # pred_save[:,:,i] = pred_full # # Save data to a netcdf file # save_dict = {} # for i in range(n_outputs + 1): # if i<n_outputs: # save_dict[output_names[i]] = (['target_season', 'lead'], pred_save[i,:,:]) # else: # save_dict['std_estimate'] = (['target_season', 'lead'], pred_save[i,:,:]) # ds = xr.Dataset(save_dict, coords={'target_season': timeytrue, # 'lead': lead_times} ) # ds.to_netcdf(join(processeddir, f'{model_name}_forecasts_with_std_estimated.nc')) # ds.close()