def setUpModelRun(options, G):
    """Prepare everything needed before a model run is started.

    Builds the run configuration via ``prepareConfig(options, G)``, fills in a
    default SGD optimizer configuration if none is present, loads the
    time-invariant per-station data, optionally loads all per-station data
    sets into memory and finally prepares the train/test folds and the data
    statistics.

    Args:
        options: forwarded unchanged to ``prepareConfig``.
        G: forwarded unchanged to ``prepareConfig``.

    Returns:
        Tuple ``(config, train_test_folds, data_dictionary, data_statistics)``.
        ``data_dictionary`` maps station id -> in-memory data set when
        ``config['preprocessing'] == 'station'`` and is ``None`` otherwise.
    """
    # count available CUDA devices
    if torch.cuda.is_available():
        n_cuda_dev = torch.cuda.device_count()
        print('Cuda is available with %s devices.' % n_cuda_dev)

    # prepare model run config
    config = prepareConfig(options, G)

    # set standard optimizer as default, if none is specified
    if 'optimizer' not in config:
        print(
            "No optimizer config found. Using standard sgd optimizer with learning rate 0.001 and momentum 0.9."
        )
        config['optimizer'] = {
            'algorithm': 'sgd',
            'learning_rate': 0.001,
            'momentum': 0.9
        }

    # load preprocessed time invariant data per station
    # NOTE: pickle must only be used on trusted, locally generated files
    time_invariant_path = "%s/%s/grid_size_%s/time_invariant_data_per_station.pkl" % (
        config['input_source'], config['preprocessing'], config['original_grid_size'])
    with open(time_invariant_path, "rb") as input_file:
        time_invarian_data = pkl.load(input_file)

    # load all station ids
    config['stations'] = time_invarian_data.station.data

    # if preprocessing "station" is used, load the complete data into memory,
    # accessible by a dictionary whose keys are the station ids
    data_dictionary = None
    config['inits'] = None
    if config['preprocessing'] == 'station':
        data_dictionary = {}
        last_station_data = None
        for station in config['stations']:
            station_path = config['input_source'] + "/station/grid_size_%s/station_%s_data.nc" % (
                config['original_grid_size'], station)
            # the context manager closes the file handle even if the copy fails
            with xr.open_dataset(station_path) as ds:
                last_station_data = ds.copy(deep=True)
            data_dictionary[station] = last_station_data

        # get all init times we have data for; they are read from the in-memory
        # copy of the last station instead of the already closed file handle,
        # and stay None if there was no station at all
        if last_station_data is not None:
            config['inits'] = last_station_data.coords['init'].data

    # load or generate training and test folds, this requires several
    # parameters, best described in the method itself
    train_test_folds = prepareTrainTestFolds(config)
    data_statistics = DataUtils.getDataStatistics(config=config)

    # the definitions of the grid time invariant parameters and the station
    # parameters are hard-coded
    config['grid_time_invariant_parameters'] = [
        'HH', 'HH_DIFF', 'FR_LAND', 'SOILTYP', 'LAT', 'LON', 'ABS_2D_DIST'
    ]
    config['station_parameters'] = ['height', 'lat', 'lon']

    return config, train_test_folds, data_dictionary, data_statistics
def prepareTrainTestFolds(config):
    """Return the train/test folds for ``config``, loading them from disk if present.

    The folds file name is derived from the run settings in ``config`` and is
    looked up below ``config['input_source']``. If the file exists it is
    unpickled; otherwise the folds are generated through ``DataUtils``.
    """
    series_length = config['time_serie_length'] if 'time_serie_length' in config else 0
    name_parts = (config['runs'], config['slice_size'], config['test_filter_window'],
                  config['test_fraction'], series_length, config['seed'])
    folds_path = config['input_source'] + (
        '/train_test_folds_r_%s_sl_%s_tfw_%s_tf_%s_series_%s_s_%s.pkl' % name_parts)

    # an existing filtered data split is simply loaded
    if os.path.exists(folds_path):
        with open(folds_path, 'rb') as f:
            loaded_folds = pkl.load(file=f)
        print('Loaded existing train/test folds.')
        sys.stdout.flush()
        return loaded_folds

    # otherwise generate the filtered data splits for each run
    generated_data_folds = DataUtils.getDataFolds(config=config)
    return DataUtils.getTrainTestFolds(config=config, data_folds=generated_data_folds)
def getStationNeighbors(stationId, gridHeightData, gridLatData, gridLonData, station_height, station_lat,
                        station_lon, height_weight=500):
    """Find the grid points closest to a station, horizontally and including height.

    Args:
        stationId: identifier that is passed through to the result unchanged.
        gridHeightData: grid heights; squeezed before the station height is subtracted.
        gridLatData: grid latitudes; its shape defines the shape of the distance grid.
        gridLonData: grid longitudes, same number of elements as ``gridLatData``.
        station_height: height of the station.
        station_lat: latitude of the station.
        station_lon: longitude of the station.
        height_weight: factor applied to the absolute height difference when it
            is added to the horizontal distance (default 500, the value that
            was previously hard-coded).

    Returns:
        ``(stationId, (closest_2d, closest_3d))`` where each ``closest_*`` is
        ``(grid_index, horizontal_distance, height_difference)`` at the selected
        grid point. ``closest_2d`` minimises the horizontal distance only,
        ``closest_3d`` minimises horizontal distance plus weighted height
        difference.
    """
    # calculate height difference between grid heights and station heights
    gridHeightDifference = gridHeightData.squeeze() - station_height

    # calculate horizontal distance (in meters according to the original comment;
    # the unit is determined by DataUtils.haversine)
    grid_lat_lon_zip = np.array(
        list(zip(gridLatData.ravel(), gridLonData.ravel())),
        dtype=('float32,float32')).reshape(gridLatData.shape)
    gridHorizontalDistance = np.vectorize(
        lambda lat_lon_zip: DataUtils.haversine(lat_lon_zip[0], lat_lon_zip[1], station_lat, station_lon)
    )(grid_lat_lon_zip)

    # unravel with the actual array shape instead of a hard-coded (674, 1058),
    # so that grids of any size yield correct indices
    closest2dId = np.unravel_index(gridHorizontalDistance.argmin(), gridHorizontalDistance.shape)

    combinedDistance = gridHorizontalDistance + height_weight * np.abs(gridHeightDifference)
    closest3dId = np.unravel_index(combinedDistance.argmin(), combinedDistance.shape)

    return (stationId,
            ((closest2dId, gridHorizontalDistance[closest2dId], gridHeightDifference[closest2dId]),
             (closest3dId, gridHorizontalDistance[closest3dId], gridHeightDifference[closest3dId])))
def __getitem__(self, item):
    """Return the data point at position ``item`` of ``self.files``.

    Returns:
        ``None`` if the station label is corrupted (such items are filtered out
        later by the custom ``collate_fn()``), otherwise the tuple
        ``(labels, grid_input, time_features, station_features, meta)`` where
        ``labels`` are the station temperatures at the prediction indices,
        ``grid_input`` is the scaled grid data concatenated with the
        time-invariant grid data along axis 0, and ``meta`` is
        ``(init, stationId, TEMP_RAW)``.

    Raises:
        IndexError: if ``item`` is not smaller than ``len(self)``.
    """
    # ">=" instead of ">": item == len(self) is out of range as well. IndexError
    # is still an Exception subclass, so existing handlers keep working, and it
    # is what the sequence iteration protocol expects.
    if item >= len(self):
        raise IndexError('Tried to get data point out of range.')

    stationId, init = self.files[item]
    stationId = int(stationId)
    self.loadData((stationId, init))

    Label = self.da.temp_station.data
    # if target is corrupted return None. This is later filtered out by the
    # custom "collate_fn()" method
    if min(Label) < -1e10 or np.isnan(Label).any():
        return None

    # if this lookup throws an AttributeError, we are in the first execution of
    # "__getitem__" and thus first have to define the indices of the desired
    # leads, parameters and grid bounds for a fast access of the data directly
    # by indices in the following calls.
    try:
        IP2d = self.da.cosmo_data.data[
            self.lead_idx,
            self.lower_grid_bound:self.upper_grid_bound,
            self.lower_grid_bound:self.upper_grid_bound][:, :, self.parameter_idx]
    except AttributeError:
        self.calculateLowerAndUpperGridBound()
        all_leads = list(self.da.coords['lead'].data)
        all_parameters = list(self.da.coords['feature'].data)
        self.lead_idx = all_leads.index(self.lead_time)
        self.prediction_idx = sorted(all_leads.index(pt) for pt in self.prediction_times)
        self.parameter_idx = sorted(all_parameters.index(p) for p in self.parameters)
        IP2d = self.da.cosmo_data.data[
            self.lead_idx,
            self.lower_grid_bound:self.upper_grid_bound,
            self.lower_grid_bound:self.upper_grid_bound][:, :, self.parameter_idx]

    # keep un-normalized temperature input from COSMO-1
    # NOTE(review): feature index 4 is used as the temperature here and below;
    # the -273.15 offset suggests Kelvin -> Celsius -- confirm against the
    # preprocessing.
    TEMP_RAW = np.copy(self.da.cosmo_data.data[self.prediction_idx,
                                               self.closest_point_index,
                                               self.closest_point_index, 4] - 273.15)

    # IP2d was selected with a list index and therefore is a copy, so scaling it
    # in place does not touch the loaded data set
    for p_idx in range(self.n_parameters):
        IP2d[:, :, p_idx] = self.featureScaling[p_idx](IP2d[:, :, p_idx])

    # add temperature of lead time 0 (initial time of the model run) regardless
    # of the lead time of the prediction
    TEMP_T0 = self.featureScaling[4](self.da.cosmo_data.data[
        0,
        self.lower_grid_bound:self.upper_grid_bound,
        self.lower_grid_bound:self.upper_grid_bound][:, :, [4]])
    IP2d = np.concatenate((IP2d, TEMP_T0), 2)

    # for CNN approach we need a structure like (batch_item, features, lat, lon)
    IP2d = np.rollaxis(IP2d, 2, 0)

    # copy before normalizing: indexing with a single integer returns a view, and
    # writing into it would modify the loaded data set itself (and normalize it
    # again on every further access if that data set is kept in memory)
    TimeFeatures = np.copy(self.da.time_data.data[self.lead_idx])
    # TODO at the moment preprocessed data with grid size 3 has normalization of
    # time features and grid size 1 has not
    if self.master_grid_size == 3:
        TimeFeatures[:-1] = DataUtils.normalizeTimeFeatures(TimeFeatures[:-1])

    # get time invariant data for station if already calculated once
    try:
        (TimeInvGrid, TimeInvStation) = self.station_time_invariant_grid_data[stationId]
    except KeyError:
        # calculate time invariant data for station the first time it is used
        station_data = self.time_invariant_data.sel(station=stationId)
        TimeInvStation = station_data.station_position.sel(
            positinal_attribute=['height', 'lat', 'lon']).data
        TimeInvGrid = np.rollaxis(
            station_data.grid_data.sel(feature=self.grid_time_invariant_parameters).data[
                self.lower_grid_bound:self.upper_grid_bound,
                self.lower_grid_bound:self.upper_grid_bound],
            2, 0)
        self.station_time_invariant_grid_data[stationId] = (TimeInvGrid, TimeInvStation)

    return (Label[self.prediction_idx],
            np.concatenate((IP2d, TimeInvGrid), 0),
            TimeFeatures,
            TimeInvStation,
            (init, stationId, TEMP_RAW))
def runModel(config, data_dictionary, data_statistics, train_test_folds):
    """Evaluate the stored models of a finished training run on all data points.

    For each cross-validation run the checkpoint below
    ``<output_path>/stored_models/run_<run>`` is loaded and the network is
    evaluated (no training) on every ``(station, init)`` combination. Per run,
    an array of shape ``(n_inits, n_stations, 5)`` holding prediction, COSMO
    temperature, target, difference and target temperature is stored in an
    ``xr.Dataset`` that is pickled to ``<output_path>/model_run_error.pkl``.

    Args:
        config: run configuration; it is modified in place (``model``,
            ``code_commit`` and possibly the parameter lists are overwritten).
        data_dictionary: passed to the data set as ``station_data_dict``.
        data_statistics: used to build the feature scaling functions.
        train_test_folds: per run a ``(train_fold, test_fold)`` pair whose
            entries carry the init at index 1.

    Raises:
        Exception: if the output folder of the training run does not exist or a
            batch has an unknown layout.
    """
    program_start_time = time()

    # assign all program arguments to local variables
    with open(config['model']['path']) as handle:
        ModelDict = json.loads(handle.read())

    # check if station and grid time invariant features should be used and set
    # the list of desired parameters
    if not ('grid_time_invariant' in ModelDict and ModelDict['grid_time_invariant']):
        config['grid_time_invariant_parameters'] = []
    if not ('station_time_invariant' in ModelDict and ModelDict['station_time_invariant']):
        config['station_parameters'] = []

    # update general static model information; experiment_info is the same
    # object as config, so config['model'] refers to ModelDict from here on
    experiment_info = config
    experiment_info['model'] = ModelDict
    experiment_info['code_commit'] = ModelUtils.get_git_revision_short_hash()

    # if needed, load time invariant features
    with open("%s/%s/grid_size_%s/time_invariant_data_per_station.pkl" % (
            config['input_source'], config['preprocessing'], config['original_grid_size']),
              "rb") as input_file:
        time_invarian_data = pkl.load(input_file)

    # initialize feature scaling function for each feature
    featureScaleFunctions = DataUtils.getFeatureScaleFunctions(ModelUtils.ParamNormalizationDict,
                                                               data_statistics)

    # get optimizer config
    optimizer_config = config['optimizer']

    # generate output path for experiment information
    setting_string = '%s_grid_%s_bs_%s_tf_%s_optim_%s_lr_%s_sl_%s' % (
        config['model']['name'], config['grid_size'], config['batch_size'], config['test_fraction'],
        optimizer_config['algorithm'], optimizer_config['learning_rate'], config['slice_size'])
    output_path = '%s/%s' % (config['experiment_path'], setting_string)
    if not os.path.exists(output_path):
        raise Exception('No folder of training run has been found for "%s"' % output_path)

    def as_vector(tensor):
        # squeeze() turns a batch of a single item into a 0-d tensor that cannot
        # be indexed per item; keep at least one dimension
        tensor = tensor.squeeze()
        return tensor.unsqueeze(0) if tensor.dim() == 0 else tensor

    ds = xr.Dataset()

    # cross validation
    for run in range(config['runs']):
        print('[Run %s] Cross-validation test fold %s' % (str(run + 1), str(run + 1)))

        stations = sorted(config['stations'])

        # take the right preprocessed train/test data set for the current run
        train_fold, test_fold = train_test_folds[run]

        # get all inits, the train and test inits and the inits that are in
        # neither of both (filtered)
        all_inits_set = set(config['inits'])
        train_inits_set = {t[1] for t in train_fold}
        test_inits_set = {t[1] for t in test_fold}
        filtere_inits = {init for init in all_inits_set
                         if init not in train_inits_set and init not in test_inits_set}

        # make sure, that all sets are distinct
        assert filtere_inits ^ train_inits_set ^ test_inits_set == all_inits_set

        init_type_mapping = {}
        init_type_mapping.update((init, 'train') for init in train_inits_set)
        init_type_mapping.update((init, 'test') for init in test_inits_set)
        init_type_mapping.update((init, 'filterd') for init in filtere_inits)

        all_inits = sorted(all_inits_set)
        all_data = [(station, init) for init in all_inits for station in stations]
        n_data_points = len(all_data)

        # keep mappings from init and station to index of result numpy array
        station_index_dict = {station: station_idx for station_idx, station in enumerate(stations)}
        init_index_dict = {init: init_idx for init_idx, init in enumerate(all_inits)}

        # initialize the data loader over the complete data set
        dataset = DataLoaders.ErrorPredictionCosmoData(
            config=config,
            station_data_dict=data_dictionary,
            files=all_data,
            featureScaling=featureScaleFunctions,
            time_invariant_data=time_invarian_data)
        dataloader = DataLoader(dataset, batch_size=config['batch_size'], shuffle=False,
                                num_workers=config['n_loaders'], collate_fn=DataLoaders.collate_fn)

        # initialize network and optimizer, then restore both from the checkpoint
        net = Baseline.model_factory(model_dict=ModelDict, params=dataset.n_parameters,
                                     time_invariant_params=dataset.n_grid_time_invariant_parameters,
                                     grid=config['grid_size'], prediction_times=config['prediction_times'])
        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)
        optimizer = optim.SGD(net.parameters(), lr=optimizer_config['learning_rate'],
                              momentum=optimizer_config['momentum'])
        net, optimizer, *_ = ModelUtils.load_checkpoint(output_path + '/stored_models/run_%s' % run,
                                                        model=net, optimizer=optimizer)

        if torch.cuda.is_available():
            net.cuda()

        # we do not train, but only output the evaluation of the network on
        # train and test data
        net.eval()

        # initialize result array of errors per init and station with NaN
        run_error_statistics = np.full((len(init_index_dict), len(station_index_dict), 5), np.nan)

        # loop over complete data set; no gradients are needed for evaluation
        with torch.no_grad():
            for i, data in enumerate(dataloader, 0):
                try:
                    # get batch, e.g. label, cosmo-1 output and time inv. features for station
                    DATA = data
                    # DATA has only length 4 if we do not use the station time invariant features
                    if len(DATA) == 4:
                        Blabel, Bip2d, BTimeData, init_station_temp = DATA
                        station_time_inv_input = None
                    elif len(DATA) == 5:
                        Blabel, Bip2d, BTimeData, StationTimeInv, init_station_temp = DATA
                        station_time_inv_input = ModelUtils.getVariable(StationTimeInv).float()
                    else:
                        raise Exception('Unknown data format for training...')
                    grid_input = ModelUtils.getVariable(Bip2d).float()
                    time_data = ModelUtils.getVariable(BTimeData).float()
                    target = ModelUtils.getVariable(Blabel).float()
                except TypeError:
                    # when the batch size is small, it could happen, that all labels have been
                    # corrupted and therefore collate_fn would not return a usable batch
                    print('Value error...')
                    continue

                out = as_vector(net(grid_input, time_data, station_time_inv_input))
                target = as_vector(target)
                diff = out - target

                for item in range(Blabel.shape[0]):
                    init = init_station_temp[0][item]
                    station = init_station_temp[1][item].item()
                    cosmo_temperature = init_station_temp[2][item].item()
                    target_temperature = init_station_temp[3][item].item()
                    station_idx = station_index_dict[station]
                    init_idx = init_index_dict[init]
                    run_error_statistics[init_idx, station_idx, :] = np.array(
                        (out[item].item(), cosmo_temperature, target[item].item(),
                         diff[item].item(), target_temperature))

                # report the progress roughly every percent of the data set
                processed_samples = (i + 1) * int(config['batch_size'])
                if (i + 1) % np.max((1, ((n_data_points // config['batch_size']) // 100))) == 0:
                    print("%s samples have been processed. [%2.1f%%]" % (
                        processed_samples, (processed_samples / n_data_points) * 100))
                    sys.stdout.flush()

        da = xr.DataArray(run_error_statistics,
                          dims=('init', 'station', 'data'),
                          coords=[all_inits, stations,
                                  ['prediction', 'cosmo', 'target', 'difference', 'target_temperature']])
        da = da.sortby(variables='init')
        da.attrs['init_type_mapping'] = sorted(init_type_mapping.items())

        ds['run_%s' % run] = da
        ds.attrs['config'] = config

        print('Error results of run %s have been processed.' % run)
        # flush output to see progress
        sys.stdout.flush()

    # the folder is checked again right before writing the result into it
    if not os.path.exists(output_path):
        raise Exception('No folder of training run has been found for "%s"' % output_path)

    # dump experiment statistic
    with open(output_path + '/model_run_error.pkl', 'wb') as handle:
        pkl.dump(ds, handle, protocol=pkl.HIGHEST_PROTOCOL)

    # print program execution time
    m, s = divmod(time() - program_start_time, 60)
    h, m = divmod(m, 60)
    print('Experiment has successfully finished in %dh %02dmin %02ds' % (h, m, s))
def _toNumpy(values):
    """Return the payload of a tensor or array-like as a numpy array on the CPU."""
    values = values.data
    if torch.is_tensor(values):
        return values.cpu().numpy()
    return np.asarray(values)


def _unpackBatch(batch):
    """Split a collated batch into labels, network inputs, target and meta data.

    Returns ``(labels, grid_input, time_data, station_input, target, meta)``;
    ``station_input`` is ``None`` for batches of length 4, i.e. when the station
    time invariant features are not used.
    """
    if len(batch) == 4:
        labels, grid_data, time_features, meta = batch
        station_input = None
    elif len(batch) == 5:
        labels, grid_data, time_features, station_features, meta = batch
        station_input = ModelUtils.getVariable(station_features).float()
    else:
        raise Exception('Unknown data format for training...')
    grid_input = ModelUtils.getVariable(grid_data).float()
    time_data = ModelUtils.getVariable(time_features).float()
    target = ModelUtils.getVariable(labels).float()
    return labels, grid_input, time_data, station_input, target, meta


def runModel(config, data_dictionary, data_statistics, train_test_folds):
    """Train and test the configured model for every cross-validation run.

    For each run a network is created, trained for ``config['epochs']`` epochs
    and tested after every epoch. Error, run time and sample statistics are
    collected over all runs and epochs and pickled to
    ``<output_path>/experiment_statistic.pkl`` (plus
    ``rmse_per_station.pkl`` if ``'per_station_rmse'`` is in ``config``). A
    checkpoint is written after every epoch.

    Args:
        config: run configuration; it is modified in place because it doubles
            as the experiment info that is attached to the statistics.
        data_dictionary: passed to the data sets as ``station_data_dict``.
        data_statistics: used to build the feature scaling functions.
        train_test_folds: per run a ``(train_fold, test_fold)`` pair.

    Returns:
        ``None``. The function returns early, without writing the statistics,
        if the training RMSE of an epoch contains NaN.
    """
    program_start_time = time()

    # assign all program arguments to local variables
    with open(config['model']['path']) as handle:
        ModelDict = json.loads(handle.read())

    # check if station and grid time invariant features should be used and set
    # the list of desired parameters
    if not ('grid_time_invariant' in ModelDict and ModelDict['grid_time_invariant']):
        config['grid_time_invariant_parameters'] = []
    if not ('station_time_invariant' in ModelDict and ModelDict['station_time_invariant']):
        config['station_parameters'] = []

    # update general static model information; experiment_info is the same
    # object as config, so config['model'] refers to ModelDict from here on
    experiment_info = config
    experiment_info['model'] = ModelDict
    experiment_info['code_commit'] = ModelUtils.get_git_revision_short_hash()

    # if needed, load time invariant features
    with open("%s/%s/grid_size_%s/time_invariant_data_per_station.pkl" % (
            config['input_source'], config['preprocessing'], config['original_grid_size']),
              "rb") as input_file:
        time_invarian_data = pkl.load(input_file)

    # initialize feature scaling function for each feature
    featureScaleFunctions = DataUtils.getFeatureScaleFunctions(ModelUtils.ParamNormalizationDict,
                                                               data_statistics)

    # get optimizer config
    optimizer_config = config['optimizer']

    # generate output path for experiment information
    setting_string = '%s_grid_%s_bs_%s_tf_%s_optim_%s_lr_%s_sl_%s' % (
        config['model']['name'], config['grid_size'], config['batch_size'], config['test_fraction'],
        optimizer_config['algorithm'], optimizer_config['learning_rate'], config['slice_size'])
    output_path = '%s/%s' % (config['experiment_path'], setting_string)
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    # time for the set up until first run
    experiment_info['set_up_time'] = time() - program_start_time
    print('[Time]: Set-up %s' % strftime("%H:%M:%S", gmtime(experiment_info['set_up_time'])))
    sys.stdout.flush()

    # initialize statistics
    error_statistics = None
    run_times = None
    skip_statistics = None
    if 'per_station_rmse' in config:
        error_per_station_statistics = None

    # keep used learning rates
    experiment_info['scheduled_learning_rates'] = []

    # cross validation
    for run in range(config['runs']):
        # logger for tensorboardX
        train_logger = Logger(output_path + '/logs/run_%s/train' % run)
        test_logger = Logger(output_path + '/logs/run_%s/test' % run)

        print('[Run %s] Cross-validation test fold %s' % (str(run + 1), str(run + 1)))

        # take the right preprocessed train/test data set for the current run
        train_fold, test_fold = train_test_folds[run]

        # initialize best epoch test error
        best_epoch_test_rmse = float("inf")

        # use a different data set class if we want to train a 3nn model approach
        if "knn" in ModelDict:
            dataset_class = DataLoaders.CosmoData3NNData
        else:
            dataset_class = DataLoaders.CosmoDataGridData

        # initialize train and test dataloaders
        trainset = dataset_class(
            config=config,
            station_data_dict=data_dictionary,
            files=train_fold,
            featureScaling=featureScaleFunctions,
            time_invariant_data=time_invarian_data)
        trainloader = DataLoader(trainset, batch_size=config['batch_size'], shuffle=True,
                                 num_workers=config['n_loaders'], collate_fn=DataLoaders.collate_fn)

        testset = dataset_class(
            config=config,
            station_data_dict=data_dictionary,
            files=test_fold,
            featureScaling=featureScaleFunctions,
            time_invariant_data=time_invarian_data)
        testloader = DataLoader(testset, batch_size=config['batch_size'], shuffle=True,
                                num_workers=config['n_loaders'], collate_fn=DataLoaders.collate_fn)

        # initialize network, optimizer and loss function
        net = Baseline.model_factory(ModelDict, trainset.n_parameters,
                                     trainset.n_grid_time_invariant_parameters,
                                     config['grid_size'], config['prediction_times'])
        # store class name
        experiment_info['model_class'] = net.__class__.__name__

        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)

        if torch.cuda.is_available():
            net.cuda()

        # load number of train and test samples
        n_train_samples, n_test_samples = len(train_fold), len(test_fold)

        optimizer, scheduler = ModelUtils.initializeOptimizer(optimizer_config, net)
        criterion = nn.MSELoss()

        # keep number of processed samples over all epochs for tensorboard
        processed_train_samples_global = 0
        processed_test_samples_global = 0

        # start learning
        for epoch in range(config['epochs']):
            epoch_train_time = np.zeros((5,))
            epoch_start_time = time()
            print('Epoch: ' + str(epoch + 1) + '\n------------------------------------------------------------')

            # adapt learning rate and store information in experiment attributes
            if scheduler is not None:
                scheduler.step()
                if run == 0:
                    experiment_info['scheduled_learning_rates'] += scheduler.get_lr()
                print('Using learning rate %s' % str(scheduler.get_lr()))

            # TRAINING
            # per-batch results are collected in lists and stacked once per epoch
            label_batches, model_batches, cosmo_batches = [], [], []
            processed_train_samples = 0
            net.train(True)

            train_start_time = time()
            # end of the previous batch; it has to exist before the first batch
            # to measure the time spent waiting on data
            time_end = train_start_time

            # loop over complete train set
            for i, data in enumerate(trainloader, 0):
                time_start = time()
                try:
                    # get training batch, e.g. label, cosmo-1 output and time inv. features for station
                    Blabel, grid_input, time_data, station_time_inv_input, target, init_station_temp = \
                        _unpackBatch(data)
                except TypeError:
                    # when the batch size is small, it could happen, that all labels have been
                    # corrupted and therefore collate_fn would not return a usable batch
                    print('Value error...')
                    continue
                time_after_data_preparation = time()

                processed_train_samples += len(Blabel)

                optimizer.zero_grad()
                out = net(grid_input, time_data, station_time_inv_input)
                time_after_forward_pass = time()
                loss = criterion(out, target)
                loss.backward()
                optimizer.step()
                time_after_backward_pass = time()

                label_batches.append(_toNumpy(Blabel))
                model_batches.append(_toNumpy(out))
                cosmo_batches.append(_toNumpy(init_station_temp[2]))
                time_after_label_stack = time()

                if (i + 1) % 64 == 0:
                    # loss.item() also works when the loss lives on the GPU
                    print('Sample: %s \t Loss: %s' % (processed_train_samples, float(np.sqrt(loss.item()))))

                    # ============ TensorBoard logging ============#
                    # (1) Log the scalar values
                    info = {
                        setting_string: np.sqrt(loss.item()),
                    }
                    for tag, value in info.items():
                        train_logger.scalar_summary(tag, value,
                                                    processed_train_samples_global + processed_train_samples)

                    # (2) Log values and gradients of the parameters (histogram)
                    for tag, value in net.named_parameters():
                        tag = tag.replace('.', '/')
                        train_logger.histo_summary(tag, ModelUtils.to_np(value), i + 1)
                        train_logger.histo_summary(tag + '/grad', ModelUtils.to_np(value.grad), i + 1)

                epoch_train_time += np.array((time_start - time_end,
                                              time_after_data_preparation - time_start,
                                              time_after_forward_pass - time_after_data_preparation,
                                              time_after_backward_pass - time_after_forward_pass,
                                              time_after_label_stack - time_after_backward_pass))
                time_end = time()

            # calculate error statistic of current epoch
            LABELS = np.vstack(label_batches)
            MODELoutputs = np.vstack(model_batches)
            COSMOoutputs = np.vstack(cosmo_batches)
            diff_model = MODELoutputs - LABELS
            diff_cosmo = COSMOoutputs - LABELS
            epoch_train_rmse_model = np.apply_along_axis(func1d=ModelUtils.rmse, arr=diff_model, axis=0)
            epoch_train_rmse_cosmo = np.apply_along_axis(func1d=ModelUtils.rmse, arr=diff_cosmo, axis=0)

            # update global processed samples
            processed_train_samples_global += processed_train_samples

            if np.isnan(epoch_train_rmse_model).any():
                print("Learning rate too large resulted in NaN-error while training. Stopped training...")
                return

            # print epoch training times
            # NOTE(review): the sums are divided by the number of timing
            # categories (5), not by the number of batches -- confirm intent
            print('Timing: Waiting on data=%s, Data Preparation=%s,'
                  'Forward Pass=%s, Backward Pass=%s, Data Stacking=%s' % tuple(
                      list(epoch_train_time / len(epoch_train_time))))

            # RMSE of epoch
            print('Train/test statistic for epoch: %s' % str(epoch + 1))
            print('Train RMSE COSMO: ', ", ".join(
                ["T=%s: %s" % (idx, value) for idx, value in enumerate(epoch_train_rmse_cosmo)]))
            print('Train RMSE Model: ', ", ".join(
                ["T=%s: %s" % (idx, value) for idx, value in enumerate(epoch_train_rmse_model)]))
            sys.stdout.flush()

            train_time = time() - train_start_time

            # TESTING
            test_start_time = time()

            label_batches, model_batches, cosmo_batches, station_batches = [], [], [], []
            processed_test_samples = 0
            net.eval()

            # no gradients are needed while testing
            with torch.no_grad():
                for i, data in enumerate(testloader, 0):
                    try:
                        # get test batch, e.g. label, cosmo-1 output and time inv. features for station
                        Blabel, grid_input, time_data, station_time_inv_input, target, init_station_temp = \
                            _unpackBatch(data)
                    except TypeError:
                        # when the batch size is small, it could happen, that all labels have been
                        # corrupted and therefore collate_fn would not return a usable batch
                        print('Value error...')
                        continue

                    processed_test_samples += len(Blabel)

                    out = net(grid_input, time_data, station_time_inv_input)
                    loss = criterion(out, target)

                    label_batches.append(_toNumpy(Blabel))
                    model_batches.append(_toNumpy(out))
                    cosmo_batches.append(_toNumpy(init_station_temp[2]))
                    station_batches.append(_toNumpy(init_station_temp[1]))

                    # log every 16th batch, analogous to the training loop (the former
                    # "if i % 16:" logged every batch except each 16th one)
                    if (i + 1) % 16 == 0:
                        # ============ TensorBoard logging ============#
                        # (1) Log the scalar values
                        info = {
                            setting_string: np.sqrt(loss.item()),
                        }
                        for tag, value in info.items():
                            test_logger.scalar_summary(tag, value,
                                                       processed_test_samples_global + processed_test_samples)

            # calculate error statistic of current epoch
            LABELS = np.vstack(label_batches)
            MODELoutputs = np.vstack(model_batches)
            COSMOoutputs = np.vstack(cosmo_batches)
            STATION = np.hstack(station_batches)
            diff_model = MODELoutputs - LABELS
            diff_cosmo = COSMOoutputs - LABELS

            # rmse
            epoch_test_rmse_model = np.apply_along_axis(func1d=ModelUtils.rmse, arr=diff_model, axis=0)
            epoch_test_rmse_cosmo = np.apply_along_axis(func1d=ModelUtils.rmse, arr=diff_cosmo, axis=0)
            overall_test_rmse_model = ModelUtils.rmse(diff_model)
            overall_test_rmse_cosmo = ModelUtils.rmse(diff_cosmo)

            # mae
            epoch_test_mae_model = np.apply_along_axis(func1d=ModelUtils.mae, arr=diff_model, axis=0)
            epoch_test_mae_cosmo = np.apply_along_axis(func1d=ModelUtils.mae, arr=diff_cosmo, axis=0)
            overall_test_mae_model = ModelUtils.mae(diff_model)
            overall_test_mae_cosmo = ModelUtils.mae(diff_cosmo)

            # calculate per station rmse if desired (especially for K-fold station
            # generalization experiment)
            if "per_station_rmse" in config:
                max_station_id = 1435

                squared_errors_per_epoch = np.array((np.square(diff_model), np.square(diff_cosmo))).squeeze()

                # the highest index of data is 1435, thus we expect at least 1435 entries,
                # which we can access by station id
                test_samples_per_station = np.bincount(STATION, minlength=max_station_id + 1)
                model_squared_error_per_station = np.bincount(
                    STATION, weights=squared_errors_per_epoch[0], minlength=max_station_id + 1)
                cosmo_squared_error_per_station = np.bincount(
                    STATION, weights=squared_errors_per_epoch[1], minlength=max_station_id + 1)

                # stations without test samples divide by zero; silence the warnings
                # only locally and restore the previous numpy error settings afterwards
                with np.errstate(divide='ignore', invalid='ignore'):
                    # calculate rmse per station
                    rmse_per_station = np.vstack(
                        (np.sqrt(np.divide(model_squared_error_per_station, test_samples_per_station)),
                         np.sqrt(np.divide(cosmo_squared_error_per_station, test_samples_per_station)))).T

            # update global processed samples
            processed_test_samples_global += processed_test_samples

            # RMSE of epoch
            print('Test RMSE COSMO: ', ", ".join(
                ["T=%s: %s" % (idx, value) for idx, value in enumerate(epoch_test_rmse_cosmo)]),
                  " (Overall: %s" % overall_test_rmse_cosmo)
            print('Test RMSE Model: ', ", ".join(
                ["T=%s: %s" % (idx, value) for idx, value in enumerate(epoch_test_rmse_model)]),
                  " (Overall: %s" % overall_test_rmse_model)
            # mae of epoch
            print('Test MAE COSMO: ', ", ".join(
                ["T=%s: %s" % (idx, value) for idx, value in enumerate(epoch_test_mae_cosmo)]),
                  " (Overall: %s" % overall_test_mae_cosmo)
            print('Test MAE Model: ', ", ".join(
                ["T=%s: %s" % (idx, value) for idx, value in enumerate(epoch_test_mae_model)]),
                  " (Overall: %s" % overall_test_mae_model)

            sys.stdout.flush()

            test_time = time() - test_start_time

            # time for epoch
            epoch_time = time() - epoch_start_time

            # update error statistics
            error_statistics = ModelUtils.updateErrorStatistic(
                error_statistics,
                np.array([epoch_train_rmse_model, epoch_test_rmse_model])[None, None, ...],
                run, epoch, config['prediction_times'])
            # update run times statistic
            run_times = ModelUtils.updateRuntimeStatistic(
                run_times,
                np.array([epoch_time, train_time, test_time])[None, None, ...],
                run, epoch)
            # update skip statistic
            skip_statistics = ModelUtils.updateSkipStatistic(
                skip_statistics,
                np.array([n_train_samples, processed_train_samples,
                          n_test_samples, processed_test_samples])[None, None, ...],
                run, epoch)

            # update per station rmse data array over runs if desired (especially for
            # K-fold station generalization experiment)
            if "per_station_rmse" in config:
                error_per_station_statistics = ModelUtils.updatePerStationErrorStatistic(
                    error_per_station_statistics, rmse_per_station, run, epoch,
                    np.arange(max_station_id + 1))

            # store model and mark it if it was the best yet
            is_best = overall_test_rmse_model <= best_epoch_test_rmse
            best_epoch_test_rmse = min(overall_test_rmse_model, best_epoch_test_rmse)
            # NOTE(review): 'lead_test_rmse' stores the overall RMSE, not the
            # per-lead values -- confirm whether that is intended
            ModelUtils.save_checkpoint({
                'epoch': epoch,
                'run': run,
                'arch': net.__class__.__name__,
                'state_dict': net.state_dict(),
                'overall_test_rmse': overall_test_rmse_model,
                'lead_test_rmse': overall_test_rmse_model,
                'best_epoch_test_rmse': best_epoch_test_rmse,
                'optimizer': optimizer.state_dict(),
            }, is_best, output_path + '/stored_models/run_%s' % run)

            # flush output to see progress
            sys.stdout.flush()

    # update statistics dict
    ModelUtils.get_model_details(experiment_info, net, optimizer, criterion)

    # complete program runtime
    experiment_info['program_runtime'] = time() - program_start_time

    # generate data set of all experiment statistics and additional information
    experiment_statistic = xr.Dataset({
        'error_statistic': error_statistics,
        'run_time_statistic': run_times,
        'samples_statistic': skip_statistics}).assign_attrs(experiment_info)

    # dump experiment statistic
    with open(output_path + '/experiment_statistic.pkl', 'wb') as handle:
        pkl.dump(experiment_statistic, handle, protocol=pkl.HIGHEST_PROTOCOL)

    if 'per_station_rmse' in config:
        # dump per station error statistic
        with open(output_path + '/rmse_per_station.pkl', 'wb') as handle:
            pkl.dump(error_per_station_statistics, handle, protocol=pkl.HIGHEST_PROTOCOL)

    # print program execution time
    m, s = divmod(experiment_info['program_runtime'], 60)
    h, m = divmod(m, 60)
    print('Experiment has successfully finished in %dh %02dmin %02ds' % (h, m, s))
def CreateDataByStationAndInit(GridSize, DateBegin, DateEnd, PredictionWindow,
                               ListParam, WithTopo, TopoListParam, isLocal,
                               n_parallel):
    """Preprocess COSMO-1 output into one file per station and initialization.

    The COSMO-1 folders returned by ``DataUtils.getFilesToProcess`` are
    restricted to the requested interval, split into roughly equal chunks and
    handed to ``GetData`` worker processes. A ``setup.json`` describing the
    run is written into the destination folder afterwards.

    Args:
        GridSize: Side length of the square sub-grid around each station; must
            be odd.
        DateBegin: Lower bound of the interval, compared against
            ``folder[:-4]`` of every candidate folder name.
        DateEnd: Upper bound of the interval, compared the same way.
        PredictionWindow: Lead times forwarded to ``GetData``.
        ListParam: COSMO-1 parameters forwarded to ``GetData``.
        WithTopo: Forwarded to the first worker only, because the time
            invariant features do not differ between splits.
        TopoListParam: Time invariant features forwarded to ``GetData``.
        isLocal: Selects the hard-coded local paths instead of the cluster
            paths.
        n_parallel: Number of worker processes / data splits. Values below 1
            are treated as a single worker.

    Raises:
        Exception: If ``DateEnd < DateBegin`` or no folder lies within the
            requested interval.
        AssertionError: If ``GridSize`` is even.
    """
    time_begin = time()

    if DateEnd < DateBegin:
        raise Exception('DateEnd is smaller than DateBegin.')

    assert GridSize % 2 == 1, 'Grid size must be an odd number.'

    # different paths, whether we run the script locally or on a cluster node
    if isLocal:
        ADDRESSdata = '/home/n1no/Documents/ethz/master_thesis/code/project/data/cosmo-1/data_subset'  # COSMO-1 outputs
        ADDRESStopo = '/home/n1no/Documents/ethz/master_thesis/code/project/data'  # base address of topo files
        ADDRESSobst = '/home/n1no/Documents/ethz/master_thesis/code/project/data/observations/'  # base address of obs files
        DESTINATION = '/home/n1no/Documents/ethz/master_thesis/code/project/data/preprocessed_data/station_init/grid_size_' + str(
            GridSize)  # target directory for all generated files
    else:
        ADDRESSdata = '/mnt/data/bhendj/full/cosmo-1'  # COSMO-1 outputs
        ADDRESStopo = '/mnt/ds3lab-scratch/ninow/topo'  # base address of topo files
        ADDRESSobst = '/mnt/ds3lab-scratch/ninow/observations'  # base address of obs files
        DESTINATION = '/mnt/ds3lab-scratch/ninow/preprocessed_data/station_init/grid_size_' + str(
            GridSize)  # target directory for all generated files

    # read the station ids; the observation file is closed even if reading fails
    OBS = xr.open_dataset(ADDRESSobst + '/meteoswiss_t2m_20151001-20180331.nc')
    try:
        station_ids = OBS['station_id'].data
    finally:
        OBS.close()

    # create an output folder for each station, based on the station ids
    # (exist_ok avoids the check-then-create race of os.path.exists)
    for S in station_ids:
        os.makedirs(DESTINATION + '/Station_' + str(S), exist_ok=True)

    # get all COSMO-1 files that are in the given time interval and have not
    # yet been processed and thus do not already exist in the output folder
    folders = DataUtils.getFilesToProcess(ADDRESSdata, DESTINATION,
                                          'StationAndInit', DateBegin, DateEnd)
    folders.sort()

    # calculate begin and end index of the array to exclude files that are not
    # in the specified time interval
    begin, end = -1, -1
    for idx, folder in enumerate(folders):
        if folder[:-4] >= DateBegin:
            begin = idx
            break

    for idx, folder in enumerate(folders):
        if folder[:-4] <= DateEnd:
            end = idx
        else:
            break

    if begin == -1 or end == -1:
        raise Exception('Could not find start or end in array.')

    folders = folders[begin:end + 1]
    print('%s files are left to be preprocessed.' % len(folders))

    # split the folders into K approx. equal splits
    if n_parallel <= 1:
        folder_splits = [folders]
    else:
        indices = np.linspace(0, len(folders), n_parallel + 1).astype(int)
        folder_splits = [
            folders[indices[i]:indices[i + 1]] for i in range(n_parallel)
        ]

    # empty splits are dropped in every case, because split[0] / split[-1]
    # below would fail on them
    folder_splits = [split for split in folder_splits if len(split) > 0]

    # take timestamp after set-up
    time_setup = time()

    # Pool refuses a process count below 1, so clamp it
    with Pool(processes=max(1, n_parallel)) as pool:
        # run preprocessing in parallel for all splits and keep the async
        # results in a list to sync them later
        process_results = []
        for idx_split, split in enumerate(folder_splits):
            print('Process %s with range [%s, %s] queued.' %
                  (idx_split, split[0], split[-1]))
            # only calculate topo data by the first process, since it is invariant
            isTopo = WithTopo if idx_split == 0 else 0
            process_results.append(
                pool.apply_async(
                    GetData,
                    (idx_split, ADDRESSdata, ADDRESStopo, ADDRESSobst,
                     DESTINATION, ListParam, TopoListParam, GridSize, isTopo,
                     split, PredictionWindow, isLocal)))

        # forces the parent process to wait on all child processes; get()
        # also re-raises any exception that occurred in a worker
        for ps_idx, ps_result in enumerate(process_results):
            ps_result.get()
            print('[Process %s] Synchronized after data creation.' % ps_idx)

    # take timestamp after completing all processes
    time_end = time()

    # dump preprocessing information in a descriptive JSON file
    preprocessing_information = {
        'grid_size': GridSize,
        'data_begin': DateBegin,
        'data_end': DateEnd,
        'parameters': ListParam,
        'future_hours': PredictionWindow,
        'n_processes': n_parallel,
        'time_setup': str(timedelta(seconds=(time_setup - time_begin))),
        'time_preprocessing': str(timedelta(seconds=(time_end - time_setup)))
    }
    with open(DESTINATION + '/setup.json', 'w') as f:
        f.write(json.dumps(preprocessing_information))

    print('Preprocessing sucessfully finished in %s.' %
          str(timedelta(seconds=(time_end - time_begin))))
def GetData(processId, ADDRESSdata, ADDRESStopo, ADDRESSobst, DESTINATION,
            ListParam, TopoListParam, GridSize, WithTopo, Files,
            PredictionWindow, isLocal):
    """Preprocess one split of COSMO-1 files into per-station pickle files.

    For every file in ``Files`` and every lead time in ``PredictionWindow``
    the sub-grid around each station is cut out of the COSMO-1 output, paired
    with the observed temperature and cyclic time features, and dumped to
    ``DESTINATION/Station_<id>/<file[:8]>.pkl``.

    Args:
        processId: (int) id of the process running this method.
        ADDRESSdata: (string) base path to COSMO-1 data.
        ADDRESStopo: (string) base path to all topology files.
        ADDRESSobst: (string) base path to all observation files.
        DESTINATION: (string) base path to target output folder.
        ListParam: (list of string) COSMO-1 variables extracted per grid point.
        TopoListParam: features passed to
            ``DataUtils.getTimeInvariantStationFeatures``.
        GridSize: (int) side length of square around each station.
        WithTopo: (bool) whether time invariant features should be generated
            for each station.
        Files: (list of string) all files ('yymmddHH') to be processed, e.g.
            ['15031203', '15031206', ...].
        PredictionWindow: (list of int) all future hours t's [t, t+1, ...]
            being processed.
        isLocal: (bool) selects the local path layout instead of the
            per-process copies used on the cluster.

    Returns:
        1 once the whole split has been processed.
    """
    # path to observation and topological data
    if isLocal:
        obs_path = ADDRESSobst + '/meteoswiss_t2m_20151001-20180331.nc'
        topo_path = ADDRESStopo + '/topodata.nc'
    else:
        # to fix parallelization errors, each process gets its own set of
        # TOPO and OBS files
        obs_path = ADDRESSobst + '/process_%s/meteoswiss_t2m_20151001-20180331.nc' % processId
        topo_path = ADDRESStopo + '/process_%s/topodata.nc' % processId

    # the context managers close both datasets on every exit path
    with xr.open_dataset(obs_path) as OBS, xr.open_dataset(topo_path) as TOPO:
        # load all station ids
        stationIds = OBS['station_id'].data

        # generate a view on temperature observation at each station
        TempObs = OBS['t2m'].sel(station_id=stationIds)

        # first observation time; earlier COSMO-1 data points are skipped
        first_obs_time = OBS['time'].data[0]

        # grid of (lat, lon) values, used to localize the stations
        GPSgrid = np.dstack((TOPO['lat'][:, :], TOPO['lon'][:, :]))
        # derive the grid shape from the data instead of hard-coding (674, 1058)
        grid_shape = GPSgrid.shape[:2]
        half_size = int(GridSize / 2)

        # a list with the index of the nearest grid point for each station
        closestGridPointPerStation = []
        # a dictionary with the sub-grid around each station
        stationSquaresDict = {}

        # generate sub-grids for each station of the closest GridSize**2 grid points
        for S in stationIds:
            # squared distance of every grid point to the station; the grid
            # point with the smallest distance becomes the reference
            dist = GPSgrid - np.array(
                [[OBS['lat'].sel(station_id=S), OBS['lon'].sel(station_id=S)]])
            dist *= dist
            Id = np.unravel_index(dist.sum(axis=2).argmin(), grid_shape)
            closestGridPointPerStation += [Id]

            # grid indices forming the GridSize x GridSize square centred on Id
            stationSquaresDict[S] = {
                'lat_idx': [x + Id[0] - half_size for x in range(GridSize)],
                'lon_idx': [x + Id[1] - half_size for x in range(GridSize)]
            }

        # pandas data frame with dictionary of the sub-grid for each station
        stationSquares = pd.DataFrame(data=stationSquaresDict)

        # extract time invariant features for each station and the
        # corresponding sub-grid
        if WithTopo:
            if not os.path.exists(DESTINATION + '/time_invariant_data_per_station.pkl'):
                ds = DataUtils.getTimeInvariantStationFeatures(
                    TOPO=TOPO,
                    OBS=OBS,
                    stationSquares=stationSquares,
                    stationIds=stationIds,
                    closestGridPointPerStation=closestGridPointPerStation,
                    GridSize=GridSize,
                    Features=TopoListParam)
                with open(DESTINATION + '/time_invariant_data_per_station.pkl',
                          'wb') as handle:
                    pkl.dump(ds, handle, protocol=pkl.HIGHEST_PROTOCOL)
                ds.close()
                print(
                    '[Process %s] Time invariant features have been processed and stored.'
                    % processId)
            else:
                print(
                    'Time invariant features have been found on disk and were therefore not created again.'
                )

        # values that are identical for every output file
        leads = PredictionWindow
        time_features = ['cos_hour', 'sin_hour', 'cos_day', 'sin_day', 'lead']
        dims = ('lead', 'lat', 'lon', 'feature')
        n_stations = len(stationIds)
        n_leads = len(PredictionWindow)

        # we now start the iteration through: each folder, each file, each
        # parameter, each station
        for file_name in Files:
            try:
                # mark start of preprocessing of n-th file
                print('[Process %s] Start processing %s' % (processId, file_name))

                # initialize data variables
                DATA = np.zeros(
                    (n_stations, n_leads, GridSize, GridSize, len(ListParam)))
                TempForecast = np.zeros((n_stations, n_leads))
                Target = np.zeros((n_stations, n_leads))
                TimeStamp = np.zeros((n_leads))
                TimeData = np.zeros((n_leads, 5))

                # loop over all future predictions, e.g. current hour + T
                for idx_T, T in enumerate(PredictionWindow):
                    if T < 10:
                        NAME = ADDRESSdata + '/' + file_name + '/c1ffsurf00' + str(T) + '.nc'
                    else:
                        NAME = ADDRESSdata + '/' + file_name + '/c1ffsurf0' + str(T) + '.nc'

                    # load the netCDF dataset; it is closed when the block is
                    # left, including via SkipException
                    with xr.open_dataset(NAME) as dataset:
                        # time of the COSMO-1 data point
                        t = dataset['time'].data

                        # check that we do not process a data point before
                        # the first observation
                        if t < first_obs_time:
                            print('[Process %s] Skipped %s' % (processId, file_name))
                            raise SkipException()

                        # transform day and hour into a cyclic datetime feature
                        days_rad = (DataUtils.passed_days_per_month_dict[int(
                            file_name[2:4])] + int(file_name[4:6])) / 365 * (2 * np.pi)
                        hours = (int(file_name[6:8]) + T) % 24
                        hour_rad = hours / 24 * (2 * np.pi)

                        TimeData[idx_T] = [
                            np.cos(hour_rad),
                            np.sin(hour_rad),
                            np.cos(days_rad),
                            np.sin(days_rad), T / 33
                        ]

                        # cut the sub-grid of every station out of each parameter map
                        for P, param in enumerate(ListParam):
                            MAP = dataset[param].data.squeeze()
                            for idx_S, S in enumerate(stationIds):
                                stationSquare = stationSquares[S]
                                DATA[idx_S, idx_T, :, :, P] = MAP[
                                    stationSquare.lat_idx][:, stationSquare.lon_idx]

                        # forecasted temperature at the closest grid point,
                        # to be compared with the actual observation
                        MAP = np.squeeze(dataset['T']).data
                        TempForecast[:, idx_T] = np.array(
                            [MAP[x] for x in closestGridPointPerStation])
                        TimeStamp[idx_T] = t[0]

                    try:
                        Target[:, idx_T] = TempObs.sel(time=t).data
                    except RuntimeError:
                        print('Error with time=%s.' % t)
                        raise

                # we write the data in a binary file, one per station
                for idx_S, S in enumerate(stationIds):
                    lats = stationSquares[S].lat_idx
                    lons = stationSquares[S].lon_idx

                    cosmo_data = xr.DataArray(
                        DATA[idx_S],
                        dims=dims,
                        coords=[leads, lats, lons, ListParam])
                    temp_forecast = xr.DataArray(TempForecast[idx_S],
                                                 dims=('lead'),
                                                 coords=[leads])
                    temp_station = xr.DataArray(Target[idx_S],
                                                dims=('lead'),
                                                coords=[leads])
                    time_data = xr.DataArray(TimeData,
                                             dims=('lead', 'time_feature'),
                                             coords=[leads, time_features])
                    time_data.attrs['time_stamp'] = TimeStamp

                    ds = xr.Dataset({
                        'cosmo_data': cosmo_data,
                        'temp_forecast': temp_forecast,
                        'temp_station': temp_station,
                        'time_data': time_data
                    })
                    ds.attrs['station_id'] = S

                    try:
                        with open(
                                DESTINATION + '/Station_%s/%s.pkl' %
                                (S, file_name[:8]), 'wb') as handle:
                            pkl.dump(ds, handle, protocol=pkl.HIGHEST_PROTOCOL)
                    except FileNotFoundError:
                        fileExists = os.path.exists(DESTINATION + '/Station_%s' % S)
                        print('Error that file does not exist, check says: %s' %
                              str(fileExists))
                        raise
                    ds.close()

                # print that processing of data point has been completed
                print('[Process %s] Finished %s' % (processId, file_name))
            except SkipException:
                # the whole file is dropped as soon as one lead time is skipped
                continue

    print('[Process %s] Data split successfully preprocessed!' % processId)
    return 1
def _collectNetworkReadyBatches(loader):
    """Run through ``loader`` once and stack all batches into flat arrays.

    Each batch is expected to be a 4- or 5-tuple as unpacked below; the
    5-tuple variant additionally carries station time invariant features.

    Returns:
        Tuple ``(data, stations, inits)``. ``data`` and ``stations`` are
        ``None`` when no batch could be collected.
    """
    data_chunks = []
    station_chunks = []
    inits = []

    for batch in loader:
        try:
            # the batch has only length 4 if we do not use the station time
            # invariant features
            if len(batch) == 4:
                Blabel, Bip2d, BTimeData, init_station_temp = batch
                station_time_inv_input = None
            elif len(batch) == 5:
                Blabel, Bip2d, BTimeData, StationTimeInv, init_station_temp = batch
                station_time_inv_input = ModelUtils.getVariable(
                    StationTimeInv).float()
            else:
                raise Exception('Unknown data format for training...')

            grid_input = ModelUtils.getVariable(Bip2d).float()
            time_data = ModelUtils.getVariable(BTimeData).float()
            target = ModelUtils.getVariable(Blabel).float()

            # choose the column layout explicitly instead of relying on a
            # bare except around a failing concatenate
            columns = [grid_input.squeeze()]
            if station_time_inv_input is not None:
                columns.append(station_time_inv_input)
            columns += [time_data, target, init_station_temp[2]]
            batch_data = np.concatenate(columns, axis=1)

            inits += init_station_temp[0]
            data_chunks.append(batch_data)
            station_chunks.append(init_station_temp[1])
        except TypeError:
            # when the batch size is small, it could happen that all labels
            # have been corrupted and therefore collate_fn would return an
            # empty list
            print('Value error...')
            continue

    if not data_chunks:
        return None, None, inits
    if len(data_chunks) == 1:
        return data_chunks[0], station_chunks[0], inits
    # stack once at the end instead of re-copying the array for every batch
    return np.vstack(data_chunks), np.hstack(station_chunks), inits


def CreateData(config, data_dictionary, data_statistics, train_test_folds):
    """Export the train and test data of the first fold as flat data frames.

    The data loaders are iterated once, every sample is flattened into one
    row, and the resulting frames are pickled to
    ``<input_source>/network_ready_data`` together with the updated config.
    ``config`` is modified in place: feature lists, code commit and data
    paths are written to it.

    Args:
        config: Run configuration dictionary.
        data_dictionary: Per-station data passed on to the data set class.
        data_statistics: Statistics used to build the feature scaling
            functions.
        train_test_folds: Sequence of ``(train_fold, test_fold)`` pairs; only
            the first pair is used.

    Raises:
        Exception: If no training batch could be collected.
    """
    # load the model definition
    with open(config['model']['path']) as handle:
        ModelDict = json.loads(handle.read())

    # check if station and grid time invariant features should be used and
    # set the list of desired parameters
    if not ModelDict.get('grid_time_invariant'):
        config['grid_time_invariant_parameters'] = []
    if not ModelDict.get('station_time_invariant'):
        config['station_parameters'] = []

    # load time invariant features
    with open(
            "%s/%s/grid_size_%s/time_invariant_data_per_station.pkl" %
            (config['input_source'], config['preprocessing'],
             config['original_grid_size']), "rb") as input_file:
        time_invarian_data = pkl.load(input_file)

    # initialize feature scaling function for each feature
    featureScaleFunctions = DataUtils.getFeatureScaleFunctions(
        ModelUtils.ParamNormalizationDict, data_statistics)

    # add revision short hash to the config
    config['code_commit'] = ModelUtils.get_git_revision_short_hash()

    # take the preprocessed train/test data set of the first run
    train_fold, test_fold = train_test_folds[0]

    # initialize train and test dataloaders
    trainset = DataLoaders.CosmoDataGridData(
        config=config,
        station_data_dict=data_dictionary,
        files=train_fold,
        featureScaling=featureScaleFunctions,
        time_invariant_data=time_invarian_data)
    trainloader = DataLoader(trainset,
                             batch_size=config['batch_size'],
                             shuffle=True,
                             num_workers=config['n_loaders'],
                             collate_fn=DataLoaders.collate_fn)

    testset = DataLoaders.CosmoDataGridData(
        config=config,
        station_data_dict=data_dictionary,
        files=test_fold,
        featureScaling=featureScaleFunctions,
        time_invariant_data=time_invarian_data)
    testloader = DataLoader(testset,
                            batch_size=config['batch_size'],
                            shuffle=True,
                            num_workers=config['n_loaders'],
                            collate_fn=DataLoaders.collate_fn)

    # loop over complete train set
    train_data, train_stations, train_inits = _collectNetworkReadyBatches(
        trainloader)
    if train_data is None:
        raise Exception('No valid training batch could be collected.')

    # define column names for data frame
    column_names = [
        'Pressure', 'Wind U-Comp.', 'Wind V-Comp.', 'Wind VMAX',
        '2m-Temperature', 'Temp. of Dew Point', 'Cloud Coverage (High)',
        'Cloud Coverage (Medium)', 'Cloud Coverage (Low)',
        'Tot. Precipitation', 'ALB_RAD', 'ASOB', 'ATHB', 'HPBL',
        '2m-Temperature (Lead=0)'
    ]
    column_names += [
        'Grid Height', 'Grid-Station Height Diff.', 'Fraction of Land',
        'Soiltype', 'Latitiude', 'Longitued', 'Grid-Station 2d Distance'
    ]
    # the station columns are only present in the wider layout
    if train_data.shape[1] >= 31:
        column_names += [
            'Station Height', 'Station Latitude', 'Station Longitude'
        ]
    column_names += [
        'Hour (Cosine)', 'Hour (Sine)', 'Month (Cosine)', 'Month (Sine)',
        'Lead-Time'
    ]
    column_names += ['Target 2m-Temp.']
    column_names += ['COSMO 2m-Temp.']

    train_keys = pd.DataFrame.from_dict({
        'Station': train_stations,
        'Init': train_inits
    })
    train_data = pd.DataFrame(data=train_data, columns=column_names)
    train_ds = pd.concat([train_keys, train_data], axis=1)

    # loop over complete test set
    test_data, test_stations, test_inits = _collectNetworkReadyBatches(
        testloader)

    test_keys = pd.DataFrame.from_dict({
        'Station': test_stations,
        'Init': test_inits
    })
    test_data = pd.DataFrame(data=test_data, columns=column_names)
    test_ds = pd.concat([test_keys, test_data], axis=1)

    network_ready_data_path = config['input_source'] + '/network_ready_data'
    os.makedirs(network_ready_data_path, exist_ok=True)

    network_ready_train_data_path = network_ready_data_path + '/train_data'
    network_ready_test_data_path = network_ready_data_path + '/test_data'

    train_ds.to_pickle(network_ready_train_data_path)
    test_ds.to_pickle(network_ready_test_data_path)

    # shap specific config entries for analysis in jupyter notebook
    config['train_data_path'] = network_ready_train_data_path
    config['test_data_path'] = network_ready_test_data_path

    # dump config
    with open(network_ready_data_path + '/config.pkl', 'wb') as handle:
        pkl.dump(config, handle, protocol=pkl.HIGHEST_PROTOCOL)

    print('Network ready data analysis successfully executed.')
def CreateBaselineData(DateBegin, DateEnd, PredictionWindow, isLocal,
                       n_parallel):
    """Preprocess COSMO-1 output into the per-station baseline data set.

    Steps: (1) compute or load the grid neighbors of every station,
    (2) restrict the COSMO-1 folders to the requested interval and process
    them in parallel with ``GetDataWrapper``, (3) aggregate the temporary
    per-station folders with ``aggregateProcessFiles`` and (4) write a
    ``setup.json`` describing the run.

    Args:
        DateBegin: Lower bound of the interval, compared against
            ``folder[:-4]`` of every candidate folder name.
        DateEnd: Upper bound of the interval, compared the same way.
        PredictionWindow: Lead times forwarded to ``GetDataWrapper``.
        isLocal: Selects the hard-coded local paths instead of the cluster
            paths.
        n_parallel: Number of worker processes / data splits. Values below 1
            are treated as a single worker.

    Raises:
        Exception: If ``DateEnd < DateBegin`` or no folder lies within the
            requested interval.
    """
    time_begin = time()

    if DateEnd < DateBegin:
        raise Exception('DateEnd is smaller than DateBegin.')

    # different paths, whether we run the script locally or on a cluster node
    if isLocal:
        ADDRESSdata = '/home/n1no/Documents/ethz/master_thesis/code/project/data/cosmo-1/data_subset'  # COSMO-1 outputs
        ADDRESStopo = '/home/n1no/Documents/ethz/master_thesis/code/project/data'  # base address of topo files
        ADDRESSobst = '/home/n1no/Documents/ethz/master_thesis/code/project/data/observations/'  # base address of obs files
        DESTINATION = '/home/n1no/Documents/ethz/master_thesis/code/project/data/preprocessed_data/baseline'  # target directory for all generated files
    else:
        ADDRESSdata = '/mnt/data/bhendj/full/cosmo-1'  # COSMO-1 outputs
        ADDRESStopo = '/mnt/ds3lab-scratch/ninow/topo'  # base address of topo files
        ADDRESSobst = '/mnt/ds3lab-scratch/ninow/observations'  # base address of obs files
        DESTINATION = '/mnt/ds3lab-scratch/ninow/preprocessed_data/baseline'  # target directory for all generated files

    os.makedirs(DESTINATION, exist_ok=True)

    # Pool refuses a process count below 1, so clamp it once
    n_workers = max(1, n_parallel)

    # the context managers close both datasets even if a worker fails
    with xr.open_dataset(ADDRESSobst + '/meteoswiss_t2m_20151001-20180331.nc') as OBS, \
            xr.open_dataset(ADDRESStopo + '/topodata.nc') as TOPO:
        station_ids = OBS['station_id'].data

        # compute the grid neighbors of each station unless they are cached on disk
        if not os.path.exists(DESTINATION + '/station_neighbors.pkl'):
            station_neighbors = {}

            # calculate for each station the neighbors on the grid in parallel
            with Pool(processes=n_workers) as pool:
                process_results = []

                gridHeightData = TOPO.HH.data
                gridLatData = TOPO.lat.data
                gridLonData = TOPO.lon.data

                # queue one task per station
                for S in station_ids:
                    station_height = OBS['height'].sel(station_id=S).data
                    station_lat = OBS['lat'].sel(station_id=S).data
                    station_lon = OBS['lon'].sel(station_id=S).data

                    print('Neighborhood calculation for staiton %s queued.' % S)
                    process_results.append(
                        pool.apply_async(
                            getStationNeighbors,
                            (S, gridHeightData, gridLatData, gridLonData,
                             station_height, station_lat, station_lon)))

                # aggregate results from all processes
                for ps_idx, ps_result in enumerate(process_results):
                    S, neighbor_data = ps_result.get()
                    station_neighbors[S] = neighbor_data
                    print('[Process %s] Synchronized after data creation.' % ps_idx)

            with open(DESTINATION + '/station_neighbors.pkl', 'wb') as handle:
                pkl.dump(station_neighbors, handle, protocol=pkl.HIGHEST_PROTOCOL)
            print(
                'Station time invariant features have been calculated and stored.')
        else:
            with open(DESTINATION + '/station_neighbors.pkl', 'rb') as handle:
                station_neighbors = pkl.load(handle)
            print(
                'Station time invariant features have been found on disk and were therefore not created again.'
            )

    # temporary output folder per station, filled by the workers below
    for S in station_ids:
        os.makedirs(DESTINATION + '/temp/station_%s' % S, exist_ok=True)

    # get all COSMO-1 files that are in the given time interval and have not
    # yet been processed and thus do not already exist in the output folder
    folders = DataUtils.getFilesToProcess(ADDRESSdata, DESTINATION, 'Station',
                                          DateBegin, DateEnd)
    folders.sort()

    # calculate begin and end index of the array to exclude files that are not
    # in the specified time interval
    begin, end = -1, -1
    for idx, folder in enumerate(folders):
        if folder[:-4] >= DateBegin:
            begin = idx
            break

    for idx, folder in enumerate(folders):
        if folder[:-4] <= DateEnd:
            end = idx
        else:
            break

    if begin == -1 or end == -1:
        raise Exception('Could not find start or end in array.')

    folders = folders[begin:end + 1]
    print('%s files are left to be preprocessed.' % len(folders))

    # split the folders into K approx. equal splits
    if n_parallel <= 1:
        folder_splits = [folders]
    else:
        indices = np.linspace(0, len(folders), n_parallel + 1).astype(int)
        folder_splits = [
            folders[indices[i]:indices[i + 1]] for i in range(n_parallel)
        ]

    # empty splits are dropped in every case, because split[0] / split[-1]
    # below would fail on them
    folder_splits = [split for split in folder_splits if len(split) > 0]

    # take timestamp after set-up
    time_setup = time()

    # both stages share one pool, so the aggregation must stay inside the
    # with-block: leaving it terminates the workers
    with Pool(processes=n_workers) as pool:
        # stage 1: preprocess every data split
        process_results = []
        for idx_split, split in enumerate(folder_splits):
            print('Process %s with range [%s, %s] queued.' %
                  (idx_split, split[0], split[-1]))
            process_results.append(
                pool.apply_async(
                    GetDataWrapper,
                    (idx_split, ADDRESSdata, ADDRESStopo, ADDRESSobst,
                     DESTINATION, split, station_neighbors, PredictionWindow,
                     isLocal)))

        # wait for all splits; get() re-raises worker exceptions
        for ps_idx, ps_result in enumerate(process_results):
            ps_result.get()
            print('[Process %s] Synchronized after data creation.' % ps_idx)

        # stage 2: aggregate the temporary files of every station folder
        station_folders_paths = [
            f for f in os.listdir(DESTINATION + '/temp')
            if re.match(r'^station_([0-9]+?)$', f)
        ]

        process_results = []
        for ps_idx, station_folder in enumerate(station_folders_paths):
            print('Process %s with station folder %s queued.' %
                  (ps_idx, station_folder))
            process_results.append(
                pool.apply_async(aggregateProcessFiles,
                                 (ps_idx, DESTINATION, station_folder)))

        # wait for all aggregations
        for ps_idx, ps_result in enumerate(process_results):
            ps_result.get()
            print('[Process %s] Synchronized after aggregation.' % ps_idx)

    # take timestamp after completing all processes
    time_end = time()

    # dump preprocessing information in a descriptive JSON file
    preprocessing_information = {
        'data_begin': DateBegin,
        'data_end': DateEnd,
        'future_hours': PredictionWindow,
        'n_processes': n_parallel,
        'time_setup': str(timedelta(seconds=(time_setup - time_begin))),
        'time_preprocessing': str(timedelta(seconds=(time_end - time_setup)))
    }
    with open(DESTINATION + '/setup.json', 'w') as f:
        f.write(json.dumps(preprocessing_information))

    print('Station baseline reprocessing sucessfully finished in %s.' %
          str(timedelta(seconds=(time_end - time_begin))))
% (config['distance_metric'], h, m, s)) # runs a generalization experiment on stations only used in prediction # this requires >=1 model config file in the "models" folder of an experiment and an "experiment_parameters.txt" file # sample model configs and "experiment_parameters.txt" files can be found under /results/runs/spatial_generalization # IMPORTANT: in the "experiment_parameters.txt" file one has to specify, how many test stations should be use (and therefore # left out for training) and what station is the first in consecutive order to defined the test stations elif options.script == 'spatialGeneralizationExperiment': experiment_start = time() config, train_test_folds, data_dictionary, data_statistics = ModelUtils.setUpModelRun( options=options, G=G) # we filter out in config specified test station from train set and filter all train # stations from test set for each run train_test_folds = DataUtils.filterUnseenTestStations( train_test_folds=train_test_folds, config=config) print('Starting to run %s' % options.script) print("Test Stations:", config['test_stations']) models = [ f[:-4] for f in os.listdir(config['experiment_path'] + '/models') ] n_models = len(models) print('%s models found to run.' % n_models) for m_idx, m in enumerate(models): config['model'][ 'path'] = config['experiment_path'] + '/models/%s.txt' % m config['model']['name'] = m ModelRun.runModel(config=config,
def _collectFeatureValues(loader, dataset):
    """Flatten every feature and label value delivered by ``loader``.

    ``dataset`` supplies the feature lists (``parameters``,
    ``grid_time_invariant_parameters``, ``station_parameters``) and
    ``n_parameters``, the channel offset of the time invariant grid features.

    Returns:
        Tuple ``(features, time_invariant_grid_features, station_features,
        labels)``; the first three are lists holding one flat value list per
        feature.
    """
    features = [[] for _ in dataset.parameters]
    time_invariant_grid_features = [
        [] for _ in dataset.grid_time_invariant_parameters
    ]
    station_features = [[] for _ in dataset.station_parameters]
    labels = []

    for batch in loader:
        try:
            # label, cosmo-1 output and external features of the batch
            Blabel, Bip2d, StationTimeInv = batch
        except ValueError:
            # when the batch size is small, it could happen that all labels
            # have been corrupted and therefore collate_fn would return an
            # empty list
            print('Value error...')
            continue

        labels.extend(Blabel.numpy().flatten())

        for feature_idx in range(len(features)):
            features[feature_idx].extend(
                Bip2d[:, feature_idx, :, :].numpy().flatten())

        # time invariant grid channels follow the regular feature channels
        for ti_feature_idx in range(len(time_invariant_grid_features)):
            time_invariant_grid_features[ti_feature_idx].extend(
                Bip2d[:, dataset.n_parameters +
                      ti_feature_idx, :, :].numpy().flatten())

        for station_feature_idx in range(len(station_features)):
            station_features[station_feature_idx].extend(
                StationTimeInv[:, station_feature_idx].numpy().flatten())

    return features, time_invariant_grid_features, station_features, labels


def runModel(config, data_dictionary, data_statistics, train_test_folds):
    """Plot the feature distributions of the train and test folds.

    For each run the train and test sets are iterated completely and the
    collected values are passed to ``PlotUtils.plotFeatureDistribution``.
    NOTE: ``config`` is modified in place; ``batch_size``, ``runs`` and
    ``grid_size`` are overwritten with fixed values.

    Args:
        config: Run configuration dictionary.
        data_dictionary: Per-station data passed on to the data set class.
        data_statistics: Statistics used to build the feature scaling
            functions.
        train_test_folds: Sequence of ``(train_fold, test_fold)`` pairs,
            indexed by run.
    """
    experiment_path = config['experiment_path']

    # fixed settings for this analysis
    config['batch_size'] = 1
    config['runs'] = 3
    config['grid_size'] = 9

    # load time invariant features
    with open(
            "%s/%s/grid_size_%s/time_invariant_data_per_station.pkl" %
            (config['input_source'], config['preprocessing'],
             config['original_grid_size']), "rb") as input_file:
        time_invarian_data = pkl.load(input_file)

    # initialize feature scaling function for each feature
    featureScaleFunctions = DataUtils.getFeatureScaleFunctions(
        ParamNormalizationDict, data_statistics)

    plot_config = {
        'features': config['input_parameters'],
        'time_invariant_features': config['grid_time_invariant_parameters'],
        'station_features': config['station_parameters']
    }

    # cross validation
    for run in range(config['runs']):
        print('[Run %s] Cross-validation test fold %s' %
              (str(run + 1), str(run + 1)))

        # take the right preprocessed train/test data set for the current run
        train_fold, test_fold = train_test_folds[run]

        # initialize train and test dataloaders
        trainset = DataLoaders.SinglePredictionCosmoData(
            config=config,
            station_data_dict=data_dictionary,
            files=train_fold,
            featureScaling=featureScaleFunctions,
            time_invariant_data=time_invarian_data)
        trainloader = DataLoader(trainset,
                                 batch_size=config['batch_size'],
                                 shuffle=True,
                                 num_workers=config['n_loaders'],
                                 collate_fn=DataLoaders.collate_fn)

        testset = DataLoaders.SinglePredictionCosmoData(
            config=config,
            station_data_dict=data_dictionary,
            files=test_fold,
            featureScaling=featureScaleFunctions,
            time_invariant_data=time_invarian_data)
        testloader = DataLoader(testset,
                                batch_size=config['batch_size'],
                                shuffle=True,
                                num_workers=config['n_loaders'],
                                collate_fn=DataLoaders.collate_fn)

        # loop over complete train set
        (train_features, train_time_invariant_grid_features,
         train_station_features,
         train_labels) = _collectFeatureValues(trainloader, trainset)

        # loop over complete test set
        (test_features, test_time_invariant_grid_features,
         test_station_features,
         test_labels) = _collectFeatureValues(testloader, testset)

        plot_config['run'] = run
        PlotUtils.plotFeatureDistribution(
            output_path=experiment_path,
            config=plot_config,
            train_features=train_features,
            train_time_invariant_grid_features=train_time_invariant_grid_features,
            train_station_features=train_station_features,
            train_labels=train_labels,
            test_features=test_features,
            test_time_invariant_grid_features=test_time_invariant_grid_features,
            test_station_features=test_station_features,
            test_labels=test_labels)
def plotPerStationPredictionRun(source_path, observation_path, n_parallel):
    """Plot the prediction run of every station for all models below ``source_path``.

    All ``model_run_error.pkl`` files found recursively below ``source_path``
    are loaded and grouped by data variable (one group per "run"). For each
    run, one plotting job per station is dispatched to a process pool, the
    per-model results returned by the workers are merged, and a result table
    is generated in ``<source_path>/plots/prediction_runs/<run>``.

    Args:
        source_path: Root folder that is searched for ``model_run_error.pkl``
            files. The name of the folder containing each file is used as
            the model name.
        observation_path: Path of the observation dataset, opened with
            ``xr.open_dataset`` and handed to ``get_station_dict``.
        n_parallel: Number of worker processes of the pool.

    Returns:
        None. If no result file is found, a notice is printed and nothing is
        plotted.
    """
    # gather all models in source folder, grouped by data variable (= run)
    error_data_per_run_dict = defaultdict(list)
    for path in glob.glob(source_path + '/**/model_run_error.pkl', recursive=True):
        model_name = path.split('/')[-2]
        # NOTE: unpickling executes arbitrary code, only point this at trusted result folders
        with open(path, 'rb') as file:
            ds = pkl.load(file)
        for data_var in ds.data_vars:
            error_data_per_run_dict[data_var].append((model_name, ds[data_var]))

    if not error_data_per_run_dict:
        # without any result there is neither a dataset to read the lead time from nor anything to plot
        print('No model_run_error.pkl results found in %s, nothing to plot.' % source_path)
        return

    # get the prediction lead time to adjust time labels (taken from the last loaded result file)
    prediciton_lead_time = ds.attrs['config']['prediction_times'][0] if 'config' in ds.attrs else 1

    # load observations; the context manager closes the file once all runs are processed
    with xr.open_dataset(observation_path) as OBS:
        for run, models in error_data_per_run_dict.items():
            # stations, inits and the sample type mapping are read from the first model of the run
            reference_data = models[0][1]
            stations = reference_data.station.data
            inits = reference_data.init.data
            init_type_mapping = np.array(reference_data.init_type_mapping)

            # second entry of each mapping row is the sample type, e.g. 'train' or 'test'
            sample_type_color_mapping = [mapping[1] for mapping in init_type_mapping]
            train_indices = [idx for idx, sample_type in enumerate(sample_type_color_mapping)
                             if sample_type == 'train']
            test_indices = [idx for idx, sample_type in enumerate(sample_type_color_mapping)
                            if sample_type == 'test']

            times = DataUtils.getTimeFromFileName(inits, prediciton_lead_time)
            time_labels = [str(t)[:-13] for t in times]
            station_name_dict = get_station_dict(OBS, stations)

            model_station_mean_errors = {}

            # plot for each station the prediction run results in parallel
            with Pool(processes=n_parallel) as pool:
                process_results = []
                for station in stations:
                    print('Plotting of prediction run for station %s queued.' % station)
                    process_results.append(
                        pool.apply_async(plotPerStationPredictionRunWorker,
                                         (models, station, train_indices, test_indices, station_name_dict,
                                          sample_type_color_mapping, time_labels, source_path, run)))

                # aggregate results from all processes, must happen before the pool is torn down
                for ps_idx, ps_result in enumerate(process_results):
                    # sync processes: get() blocks until the worker is done and re-raises its exceptions
                    model_station_mean_error = ps_result.get()
                    for experiment_title, station_data_list in model_station_mean_error.items():
                        model_station_mean_errors.setdefault(experiment_title, []).extend(station_data_list)
                    print('[Process %s] Synchronized after plotting station.' % ps_idx)

            run_path = source_path + '/plots/prediction_runs/%s' % run
            os.makedirs(run_path, exist_ok=True)
            generateStationPredictionResultTable(output_path=run_path, results=model_station_mean_errors)
def _getBiasRmseMae(errors):
    """Return ``(bias, rmse, mae)`` of the array ``errors``; NaN entries are ignored."""
    return (np.nanmean(errors),
            np.sqrt(np.nanmean(np.square(errors))),
            np.nanmean(np.absolute(errors)))


def plotAveragedPredictionRun(source_path):
    """Plot the station-averaged prediction run of all models below ``source_path``.

    All ``model_run_error.pkl`` files found recursively below ``source_path``
    are loaded and grouped by data variable (one group per "run"). For each
    run a figure with 10 stacked subplots is drawn, each covering one
    consecutive slice of the init times. Every model contributes one curve,
    plus one 'COSMO-1' and one 'Prediction' curve. The figure is saved as
    ``<source_path>/plots/prediction_runs/<run>/averaged_prediction.png`` and
    a table with bias, RMSE and MAE per sample type (train / test / filtered)
    is generated in the same folder.

    The prediction data is indexed as ``[init, station, field]``. Field 0 is
    plotted under the model name, field 1 is plotted as 'COSMO-1', field 2 as
    'Prediction' and field 3 is used as the model error; the COSMO-1 error is
    computed as field 1 minus field 2.

    Args:
        source_path: Root folder that is searched for ``model_run_error.pkl``
            files. The name of the folder containing each file is used as
            the model name.

    Returns:
        None. If no result file is found, a notice is printed and nothing is
        plotted.
    """
    # gather all models in source folder, grouped by data variable (= run)
    error_data_per_run_dict = {}
    for path in glob.glob(source_path + '/**/model_run_error.pkl', recursive=True):
        model_name = path.split('/')[-2]
        # NOTE: unpickling executes arbitrary code, only point this at trusted result folders
        with open(path, 'rb') as file:
            ds = pkl.load(file)
        for data_var in ds.data_vars:
            inits = ds[data_var].init.data
            sample_type_mapping = [mapping[1] for mapping in ds[data_var].init_type_mapping]
            prediction_data = ds[data_var].data
            error_data_per_run_dict.setdefault(data_var, []).append(
                (model_name, inits, prediction_data, sample_type_mapping))

    if not error_data_per_run_dict:
        # without any result there are neither init times for the labels nor anything to plot
        print('No model_run_error.pkl results found in %s, nothing to plot.' % source_path)
        return

    # get the prediction lead time to adjust time labels (taken from the last loaded result file)
    prediciton_lead_time = ds.attrs['config']['prediction_times'][0] if 'config' in ds.attrs else 1
    # the time labels are built from the inits of the last loaded data variable
    times = DataUtils.getTimeFromFileName(inits, prediciton_lead_time)
    time_labels = [str(t)[:-13] for t in times]

    n_subplots = 10
    for run, run_models in error_data_per_run_dict.items():
        model_mean_errors = {}
        fig, axes = plt.subplots(n_subplots, figsize=(60, 20), sharey=True)

        for model_idx, (experiment_title, model_inits, prediction_data, init_type_mapping) in enumerate(run_models):
            N = len(model_inits)
            # samples beyond n_subplots * split_length (the remainder of the division) are not plotted
            split_length = N // n_subplots
            ind = np.arange(N)  # the x locations for the groups

            train_indices = [idx for idx, item in enumerate(init_type_mapping) if item == 'train']
            test_indices = [idx for idx, item in enumerate(init_type_mapping) if item == 'test']
            # NOTE(review): 'filterd' (sic) has to match the label written by the producer of
            # init_type_mapping - confirm there before correcting the spelling here
            filtered_indices = [idx for idx, item in enumerate(init_type_mapping) if item == 'filterd']

            for i in range(n_subplots):
                # split indexes into slices for each subplot
                lower, upper = i * split_length, (i + 1) * split_length
                index_split = ind[lower:upper]
                if model_idx == 0:
                    # background coloring and x-limits are set up once per subplot by the first model
                    sampleTypeBackgroundColoring(axes[i], index_split, init_type_mapping[lower:upper])
                    axes[i].set_xlim([np.min(index_split), np.max(index_split)])
                # average over the station axis
                axes[i].plot(index_split, np.nanmean(prediction_data[lower:upper, :, 0], axis=1),
                             label=experiment_title, linewidth=0.15, alpha=0.8)

            # mean errors of the model: train, test and filtered statistics, in this order
            model_mean_errors[experiment_title] = (
                _getBiasRmseMae(prediction_data[train_indices][:, :, 3])
                + _getBiasRmseMae(prediction_data[test_indices][:, :, 3])
                + _getBiasRmseMae(prediction_data[filtered_indices][:, :, 3]))

        # add mean errors of cosmo output predictions; everything below reuses the data of the last
        # model of the run - presumably fields 1 and 2 are identical for all models, verify
        cosmo_mean_errors = ()
        for sample_indices in (train_indices, test_indices, filtered_indices):
            sample_data = prediction_data[sample_indices]
            cosmo_mean_errors += _getBiasRmseMae(sample_data[:, :, 1] - sample_data[:, :, 2])
        # add COSMO-1 output prediction error
        model_mean_errors['COSMO-1'] = cosmo_mean_errors

        tick_step_size = max(split_length // 30, 1)
        for i in range(n_subplots):
            lower, upper = i * split_length, (i + 1) * split_length
            index_split = ind[lower:upper]
            axes[i].plot(index_split, np.nanmean(prediction_data[lower:upper, :, 1], axis=1),
                         label='COSMO-1', linewidth=0.15, alpha=0.8, color='b', linestyle='-.')
            # field 2 averaged over all stations; the station axis must be kept (':'), otherwise
            # station 2 would be selected and the mean would be taken over the fields
            axes[i].plot(index_split, np.nanmean(prediction_data[lower:upper, :, 2], axis=1),
                         label='Prediction', linewidth=0.15, alpha=0.8, color='m', linestyle='--')

            # labeled major ticks every tick_step_size samples, unlabeled minor ticks on every sample
            axes[i].set_xticks(index_split[::tick_step_size])
            axes[i].set_xticklabels(time_labels[lower:upper][::tick_step_size])
            axes[i].set_xticks(index_split, minor=True)

            # grid on both tick levels, minor grid lines more transparent than major ones
            axes[i].grid(which='both')
            axes[i].grid(which='minor', alpha=0.2)
            axes[i].grid(which='major', alpha=0.5)

        handles, labels = axes[0].get_legend_handles_labels()
        axes[n_subplots - 1].set_xlabel('Time')
        axes[0].legend(handles, labels)
        plt.tight_layout()

        run_path = source_path + '/plots/prediction_runs/%s' % run
        os.makedirs(run_path, exist_ok=True)
        fig.savefig(run_path + '/averaged_prediction.png', dpi=300)
        # release the figure, otherwise one large figure per run stays alive in pyplot
        plt.close(fig)

        generatePredictionResultTable(output_path=run_path, results=model_mean_errors)