Example #1
def setUpModelRun(options, G):
    # count available CUDA devices
    if torch.cuda.is_available():
        n_cuda_dev = torch.cuda.device_count()
        print('CUDA is available with %s devices.' % n_cuda_dev)

    # prepare model run config
    config = prepareConfig(options, G)

    # set standard optimizer as default, if none is specified
    if 'optimizer' not in config:
        print(
            "No optimizer config found. Using standard sgd optimizer with learning rate 0.001 and momentum 0.9."
        )
        config['optimizer'] = {
            'algorithm': 'sgd',
            'learning_rate': 0.001,
            'momentum': 0.9
        }

    # load preprocessed time invariant data per station
    with open(
            "%s/%s/grid_size_%s/time_invariant_data_per_station.pkl" %
        (config['input_source'], config['preprocessing'],
         config['original_grid_size']), "rb") as input_file:
        time_invarian_data = pkl.load(input_file)

    # load all station ids
    config['stations'] = time_invarian_data.station.data

    # if preprocessing "station" is used, load complete data into memory accessible by a dictionary. Access keys are the station ids
    data_dictionary = None
    config['inits'] = None
    if config['preprocessing'] == 'station':
        # load data per station into dictionary
        data_dictionary = {}
        for station in config['stations']:
            ds = xr.open_dataset(config['input_source'] +
                                 "/station/grid_size_%s/station_%s_data.nc" %
                                 (config['original_grid_size'], station))
            data_dictionary[station] = ds.copy(deep=True)
            ds.close()

        # get all init times we have data for (all stations share the same init times)
        config['inits'] = next(iter(data_dictionary.values())).coords['init'].data

    # load or generate training and test folds; the required parameters are best described in the method itself
    train_test_folds = prepareTrainTestFolds(config)

    data_statistics = DataUtils.getDataStatistics(config=config)

    # the definitions of the grid time invariant parameters and the station parameters are hard-coded
    config['grid_time_invariant_parameters'] = [
        'HH', 'HH_DIFF', 'FR_LAND', 'SOILTYP', 'LAT', 'LON', 'ABS_2D_DIST'
    ]
    config['station_parameters'] = ['height', 'lat', 'lon']

    return config, train_test_folds, data_dictionary, data_statistics
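The dictionary returned above is keyed by station id and holds one deep-copied xarray Dataset per station (only when preprocessing is 'station'). A minimal usage sketch, assuming hypothetical `options` and `G` arguments prepared by the caller; the driver below is illustrative and not part of the original example:

# Illustrative driver (hypothetical; `options` and `G` come from the caller's argument parsing)
def run_setup(options, G):
    config, train_test_folds, data_dictionary, data_statistics = setUpModelRun(options, G)
    if data_dictionary is not None:
        # per-station datasets are accessed by station id
        first_station = config['stations'][0]
        print(data_dictionary[first_station])
    return config, train_test_folds, data_dictionary, data_statistics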
Example #2
def prepareTrainTestFolds(config):
    train_test_folds_file_name = '/train_test_folds_r_%s_sl_%s_tfw_%s_tf_%s_series_%s_s_%s.pkl' % (
        config['runs'], config['slice_size'], config['test_filter_window'],
        config['test_fraction'], config['time_serie_length']
        if 'time_serie_length' in config else 0, config['seed'])
    # if not already existing, generate filtered data splits for each run
    if not os.path.exists(config['input_source'] + train_test_folds_file_name):
        data_folds = DataUtils.getDataFolds(config=config)
        train_test_folds = DataUtils.getTrainTestFolds(config=config,
                                                       data_folds=data_folds)
    # else load the existing filtered data split
    else:
        with open(config['input_source'] + train_test_folds_file_name,
                  'rb') as f:
            train_test_folds = pkl.load(file=f)
            print('Loaded existing train/test folds.')
            sys.stdout.flush()

    return train_test_folds
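Note that the branch generating new folds does not write them back to the cache file checked above; presumably one of the DataUtils helpers persists them. A minimal sketch of how the freshly generated folds could be cached explicitly under the same file name; the helper name is made up:

# Sketch (assumption): persist freshly generated folds so the next run takes the 'else' branch above
import pickle as pkl

def cacheTrainTestFolds(config, train_test_folds, train_test_folds_file_name):
    with open(config['input_source'] + train_test_folds_file_name, 'wb') as f:
        pkl.dump(train_test_folds, f, protocol=pkl.HIGHEST_PROTOCOL)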
Example #3
def getStationNeighbors(stationId, gridHeightData, gridLatData, gridLonData,
                        station_height, station_lat, station_lon):
    # calculate height difference between grid heights and station heights
    gridHeightDifference = gridHeightData.squeeze() - station_height

    # calculate horizontal distance in meters
    grid_lat_lon_zip = np.array(list(zip(
        gridLatData.ravel(), gridLonData.ravel())), dtype=('float32,float32')) \
        .reshape(gridLatData.shape)
    gridHorizontalDistance = np.vectorize(
        lambda lat_lon_zip: DataUtils.haversine(lat_lon_zip[0], lat_lon_zip[
            1], station_lat, station_lon))(grid_lat_lon_zip)

    closest2dId = gridHorizontalDistance.argmin()
    closest2dId = np.unravel_index(closest2dId, (674, 1058))

    closest3dId = (gridHorizontalDistance +
                   500 * np.abs(gridHeightDifference)).argmin()
    closest3dId = np.unravel_index(closest3dId, (674, 1058))

    return (stationId, ((closest2dId, gridHorizontalDistance[closest2dId],
                         gridHeightDifference[closest2dId]),
                        (closest3dId, gridHorizontalDistance[closest3dId],
                         gridHeightDifference[closest3dId])))
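getStationNeighbors delegates the horizontal distance to DataUtils.haversine, which is not part of this listing. A minimal sketch of such a helper, assuming the (lat1, lon1, lat2, lon2) call signature implied above, inputs in degrees, and distances in meters:

# Sketch (assumption): a haversine helper compatible with the vectorized call above
import numpy as np

def haversine(lat1, lon1, lat2, lon2, earth_radius_m=6371000.0):
    # great-circle distance between two points given in degrees, returned in meters
    lat1, lon1, lat2, lon2 = map(np.radians, (lat1, lon1, lat2, lon2))
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    return 2.0 * earth_radius_m * np.arcsin(np.sqrt(a))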
Example #4
    def __getitem__(self, item):
        if item >= self.__len__():
            raise Exception('Tried to get data point out of range.')

        stationId, init = self.files[item]
        stationId = int(stationId)
        self.loadData((stationId, init))

        Label = self.da.temp_station.data
        # if target is corrupted return None. This is later filtered out by the custom "collate_fn()" method
        if min(Label) < -1e10 or np.isnan(Label).any():
            return None

        # if this line throws an exception, we are in the first execution of "__getitem__" and thus, we first have to
        # define the indices of the desired leads, parameters and grid bounds for a fast access of the data directly by
        # indices in the following calls.
        try:
            IP2d = self.da.cosmo_data.data[
                self.lead_idx,
                self.lower_grid_bound:self.upper_grid_bound,
                self.lower_grid_bound:self.upper_grid_bound][:, :, self.parameter_idx]
        except AttributeError:
            self.calculateLowerAndUpperGridBound()
            all_leads = list(self.da.coords['lead'].data)
            all_parameters = list(self.da.coords['feature'].data)
            self.lead_idx = all_leads.index(self.lead_time)
            self.prediction_idx = [
                all_leads.index(pt) for pt in self.prediction_times
            ]
            self.prediction_idx.sort()
            self.parameter_idx = [
                all_parameters.index(p) for p in self.parameters
            ]
            self.parameter_idx.sort()
            IP2d = self.da.cosmo_data.data[
                self.lead_idx,
                self.lower_grid_bound:self.upper_grid_bound,
                self.lower_grid_bound:self.upper_grid_bound][:, :, self.parameter_idx]

        # keep un-normalized temperature input from COSMO-1
        TEMP_RAW = np.copy(self.da.cosmo_data.data[self.prediction_idx,
                                                   self.closest_point_index,
                                                   self.closest_point_index,
                                                   4] - 273.15)

        for p_idx in range(self.n_parameters):
            IP2d[:, :, p_idx] = self.featureScaling[p_idx](IP2d[:, :, p_idx])

        # add temperature of lead time 0 (initial time of the model run) regardless of the lead time of the prediction
        TEMP_T0 = self.featureScaling[4](self.da.cosmo_data.data[
            0, self.lower_grid_bound:self.upper_grid_bound,
            self.lower_grid_bound:self.upper_grid_bound][:, :, [4]])
        IP2d = np.concatenate((IP2d, TEMP_T0), 2)

        # for the CNN approach we need a structure like (batch_item, features, lat, lon)
        IP2d = np.rollaxis(IP2d, 2, 0)

        TimeFeatures = self.da.time_data.data[self.lead_idx]
        # TODO at the moment preprocessed data with grid size 3 has normalized time features and grid size 1 does not
        if self.master_grid_size == 3:
            TimeFeatures[:-1] = DataUtils.normalizeTimeFeatures(
                TimeFeatures[:-1])

        # get time invariant data for station if already calculated once
        try:
            (TimeInvGrid,
             TimeInvStation) = self.station_time_invariant_grid_data[stationId]
        except KeyError:
            # calculate time invariant data for station for the first time it is used for this station
            station_data = self.time_invariant_data.sel(station=stationId)
            TimeInvStation = station_data.station_position.sel(
                positinal_attribute=['height', 'lat', 'lon']).data
            TimeInvGrid = np.rollaxis(
                station_data.grid_data.sel(
                    feature=self.grid_time_invariant_parameters).data[
                        self.lower_grid_bound:self.upper_grid_bound,
                        self.lower_grid_bound:self.upper_grid_bound], 2, 0)
            self.station_time_invariant_grid_data[stationId] = (TimeInvGrid,
                                                                TimeInvStation)

        return Label[self.prediction_idx], np.concatenate(
            (IP2d, TimeInvGrid),
            0), TimeFeatures, TimeInvStation, (init, stationId, TEMP_RAW)
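__getitem__ returns None for corrupted targets and relies on a custom collate_fn to drop them before batching. A minimal sketch of such a collate function, assuming default PyTorch batching for the surviving samples; returning an empty list for a fully corrupted batch is what the TypeError handlers in the training loops below guard against:

# Sketch (assumption): filter out None samples produced by __getitem__ above before batching
from torch.utils.data.dataloader import default_collate

def collate_fn(batch):
    batch = [sample for sample in batch if sample is not None]
    if len(batch) == 0:
        return []
    return default_collate(batch)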
Example #5
def runModel(config, data_dictionary, data_statistics, train_test_folds):
    program_start_time = time()

    # assign all program arguments to local variables
    with open(config['model']['path']) as handle:
        ModelDict = json.loads(handle.read())

    # check if station and grid time invariant features should be used and set the list of desired parameters
    if not ('grid_time_invariant' in ModelDict and ModelDict['grid_time_invariant']):
        config['grid_time_invariant_parameters'] = []
    if not ('station_time_invariant' in ModelDict and ModelDict['station_time_invariant']):
        config['station_parameters'] = []

    # update general static model information
    experiment_info = config
    experiment_info['model'] = ModelDict
    experiment_info['code_commit'] = ModelUtils.get_git_revision_short_hash()

    # if needed, load time invariant features
    with open("%s/%s/grid_size_%s/time_invariant_data_per_station.pkl" % (
    config['input_source'], config['preprocessing'], config['original_grid_size']), "rb") as input_file:
        time_invarian_data = pkl.load(input_file)

    # initialize feature scaling function for each feature
    featureScaleFunctions = DataUtils.getFeatureScaleFunctions(ModelUtils.ParamNormalizationDict, data_statistics)

    # get optimizer config
    optimizer_config = config['optimizer']

    # generate output path for experiment information
    setting_string = '%s_grid_%s_bs_%s_tf_%s_optim_%s_lr_%s_sl_%s' % (
        config['model']['name'], config['grid_size'], config['batch_size'], config['test_fraction'], optimizer_config['algorithm'], optimizer_config['learning_rate'], config['slice_size'])
    output_path = '%s/%s' % (config['experiment_path'], setting_string)
    if not os.path.exists(output_path):
        raise Exception('No folder of a training run has been found for "%s"' % output_path)

    ds = xr.Dataset()

    # cross validation
    for run in range(config['runs']):
        print('[Run %s] Cross-validation test fold %s' % (str(run + 1), str(run + 1)))

        stations = sorted(config['stations'])

        # take the right preprocessed train/test data set for the current run
        train_fold, test_fold = train_test_folds[run]

        # get all inits
        all_inits_set = set(config['inits'])

        # get train and test inits
        train_inits_set = set([t[1] for t in train_fold])
        test_inits_set = set([t[1] for t in test_fold])

        # get all filtered inits
        filtered_inits = set(
            [init for init in all_inits_set if init not in train_inits_set and init not in test_inits_set])

        # make sure that all sets are distinct
        assert filtered_inits ^ train_inits_set ^ test_inits_set == all_inits_set

        init_type_mapping = {}
        for init in train_inits_set: init_type_mapping[init] = 'train'
        for init in test_inits_set: init_type_mapping[init] = 'test'
        for init in filtered_inits: init_type_mapping[init] = 'filtered'

        all_inits = sorted(list(all_inits_set))
        all_data = [(station, init) for init in all_inits for station in stations]

        n_data_points = len(all_data)

        # keep mappings from init and station to index of result numpy array
        station_index_dict = {}
        for station_idx, station in enumerate(stations): station_index_dict[station] = station_idx
        init_index_dict = {}
        for init_idx, init in enumerate(all_inits): init_index_dict[init] = init_idx

        # initialize train and test dataloaders
        dataset = DataLoaders.ErrorPredictionCosmoData(
            config=config,
            station_data_dict=data_dictionary,
            files=all_data,
            featureScaling=featureScaleFunctions,
            time_invariant_data=time_invarian_data)
        dataloader = DataLoader(dataset, batch_size=config['batch_size'], shuffle=False,
                                num_workers=config['n_loaders'], collate_fn=DataLoaders.collate_fn)

        # initialize network, optimizer and loss function
        net = Baseline.model_factory(model_dict=ModelDict, params=dataset.n_parameters, time_invariant_params=dataset.n_grid_time_invariant_parameters,
                                     grid=config['grid_size'], prediction_times=config['prediction_times'])

        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)

        optimizer = optim.SGD(net.parameters(), lr=optimizer_config['learning_rate'], momentum=optimizer_config['momentum'])

        net, optimizer, *_ = ModelUtils.load_checkpoint(output_path + '/stored_models/run_%s' % run, model=net,
                                                        optimizer=optimizer)


        if torch.cuda.is_available():
            net.cuda()

        # we do not train, but only output the evaluation of the network on train and test data
        net.eval()

        # initialize result array of errors per init and station and initialize it with NaN
        run_error_statistics = np.empty((len(init_index_dict), len(station_index_dict), 5))
        run_error_statistics.fill(np.nan)

        # loop over complete data set
        for i, data in enumerate(dataloader, 0):
            try:
                # get training batch, e.g. label, cosmo-1 output and time inv. features for station
                DATA = data
                # DATA has only length 4 if we do not use the station time invariant features
                if len(DATA) == 4:
                    Blabel, Bip2d, BTimeData, init_station_temp = DATA
                    station_time_inv_input = None
                elif len(DATA) == 5:
                    Blabel, Bip2d, BTimeData, StationTimeInv, init_station_temp = DATA
                    station_time_inv_input = ModelUtils.getVariable(StationTimeInv).float()
                else:
                    raise Exception('Unknown data format for training...')
                input = ModelUtils.getVariable(Bip2d).float()
                time_data = ModelUtils.getVariable(BTimeData).float()
                target = ModelUtils.getVariable(Blabel).float()

            except TypeError:
                # when the batch size is small, it could happen that all labels have been corrupted and therefore
                # collate_fn would return an empty list
                print('Skipped an empty batch...')
                continue

            out = net(input, time_data, station_time_inv_input).squeeze()
            target = target.squeeze()
            diff = (out - target).squeeze()

            for item in range(Blabel.shape[0]):
                init = init_station_temp[0][item]
                station = init_station_temp[1][item].item()
                cosmo_temperature = init_station_temp[2][item].item()
                target_temperature = init_station_temp[3][item].item()
                station_idx = station_index_dict[station]
                init_idx = init_index_dict[init]
                run_error_statistics[init_idx, station_idx, :] = np.array(
                    (out[item].item(), cosmo_temperature, target[item].item(),
                     diff[item].item(), target_temperature))

            processed_samples = (i + 1)  * int(config['batch_size'])
            if (i+1) % np.max((1, ((n_data_points // config['batch_size']) // 100))) == 0:
                print("%s samples have been processed. [%2.1f%%]" % (processed_samples, (processed_samples / n_data_points) * 100))
                sys.stdout.flush()


        da = xr.DataArray(run_error_statistics, dims=('init', 'station', 'data'),
                          coords=[all_inits, stations, ['prediction', 'cosmo', 'target', 'difference', 'target_temperature']])
        da = da.sortby(variables='init')
        da.attrs['init_type_mapping'] = sorted(list(init_type_mapping.items()))

        ds['run_%s' % run] = da
        ds.attrs['config'] = config

        print('Error results of run %s have been processed.' % run)
        # flush output to see progress
        sys.stdout.flush()

    if not os.path.exists(output_path):
        raise Exception('No folder of a training run has been found for "%s"' % output_path)

    # dump experiment statistic
    with open(output_path + '/model_run_error.pkl', 'wb') as handle:
        pkl.dump(ds, handle, protocol=pkl.HIGHEST_PROTOCOL)

    # print program execution time
    m, s = divmod(time() - program_start_time, 60)
    h, m = divmod(m, 60)
    print('Experiment has successfully finished in %dh %02dmin %02ds' % (h, m, s))
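The per-init/per-station error statistics are pickled as an xarray Dataset at the end of the run. A small sketch of how that file could be read back and summarized, assuming the structure written above (one DataArray per run with a 'data' coordinate that contains 'difference') and an output_path pointing at the experiment folder constructed in the function:

# Sketch (assumption): read back the pickled error Dataset and summarize it per run
import pickle as pkl
import numpy as np

def summarize_model_run_error(output_path):
    with open(output_path + '/model_run_error.pkl', 'rb') as handle:
        error_ds = pkl.load(handle)
    for run_name, da in error_ds.data_vars.items():
        diff = da.sel(data='difference').data
        rmse = np.sqrt(np.nanmean(np.square(diff)))  # NaNs mark init/station pairs without a result
        print('%s: model RMSE %.3f' % (run_name, rmse))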
def runModel(config, data_dictionary, data_statistics, train_test_folds):
    program_start_time = time()

    # assign all program arguments to local variables
    with open(config['model']['path']) as handle:
        ModelDict = json.loads(handle.read())

    # check if station and grid time invariant features should be used and set the list of desired parameters
    if not ('grid_time_invariant' in ModelDict and ModelDict['grid_time_invariant']):
        config['grid_time_invariant_parameters'] = []
    if not ('station_time_invariant' in ModelDict and ModelDict['station_time_invariant']):
        config['station_parameters'] = []

    # update general static model information
    experiment_info = config
    experiment_info['model'] = ModelDict
    experiment_info['code_commit'] = ModelUtils.get_git_revision_short_hash()


    # if needed, load time invariant features
    with open("%s/%s/grid_size_%s/time_invariant_data_per_station.pkl" % (config['input_source'], config['preprocessing'], config['original_grid_size']), "rb") as input_file:
        time_invarian_data = pkl.load(input_file)


    # initialize feature scaling function for each feature
    featureScaleFunctions = DataUtils.getFeatureScaleFunctions(ModelUtils.ParamNormalizationDict, data_statistics)

    # get optimizer config
    optimizer_config = config['optimizer']

    # generate output path for experiment information
    setting_string = '%s_grid_%s_bs_%s_tf_%s_optim_%s_lr_%s_sl_%s' % (
        config['model']['name'], config['grid_size'], config['batch_size'], config['test_fraction'], optimizer_config['algorithm'], optimizer_config['learning_rate'], config['slice_size'])
    output_path = '%s/%s' % (config['experiment_path'], setting_string)
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    # time for the set up until first run
    experiment_info['set_up_time'] = time() - program_start_time
    print('[Time]: Set-up %s' % strftime("%H:%M:%S", gmtime(experiment_info['set_up_time'])))
    sys.stdout.flush()

    # initialize statistics
    error_statistics = None
    run_times = None
    skip_statistics = None
    if 'per_station_rmse' in config:
        error_per_station_statistics = None

    # keep used learning rates
    experiment_info['scheduled_learning_rates'] = []

    # cross validation
    for run in range(config['runs']):
        # logger  for tensorboardX
        train_logger = Logger(output_path + '/logs/run_%s/train' % run)
        test_logger = Logger(output_path + '/logs/run_%s/test' % run)

        print('[Run %s] Cross-validation test fold %s' % (str(run + 1), str(run + 1)))

        # take the right preprocessed train/test data set for the current run
        train_fold, test_fold = train_test_folds[run]

        # initialize best epoch test error
        best_epoch_test_rmse = float("inf")

        # use different data loader if we want to train a 3nn model approach
        if "knn" in ModelDict:
            # initialize train and test dataloaders
            trainset = DataLoaders.CosmoData3NNData(
                config=config,
                station_data_dict=data_dictionary,
                files=train_fold,
                featureScaling=featureScaleFunctions,
                time_invariant_data=time_invarian_data)
            trainloader = DataLoader(trainset, batch_size=config['batch_size'], shuffle=True,
                                     num_workers=config['n_loaders'], collate_fn=DataLoaders.collate_fn)

            testset = DataLoaders.CosmoData3NNData(
                config=config,
                station_data_dict=data_dictionary,
                files=test_fold,
                featureScaling=featureScaleFunctions,
                time_invariant_data=time_invarian_data)
            testloader = DataLoader(testset, batch_size=config['batch_size'], shuffle=True,
                                    num_workers=config['n_loaders'], collate_fn=DataLoaders.collate_fn)
        else:
            # initialize train and test dataloaders
            trainset = DataLoaders.CosmoDataGridData(
                config=config,
                station_data_dict=data_dictionary,
                files=train_fold,
                featureScaling=featureScaleFunctions,
                time_invariant_data=time_invarian_data)
            trainloader = DataLoader(trainset, batch_size=config['batch_size'], shuffle=True,
                                     num_workers=config['n_loaders'], collate_fn=DataLoaders.collate_fn)

            testset = DataLoaders.CosmoDataGridData(
                config=config,
                station_data_dict=data_dictionary,
                files=test_fold,
                featureScaling=featureScaleFunctions,
                time_invariant_data=time_invarian_data)
            testloader = DataLoader(testset, batch_size=config['batch_size'], shuffle=True,
                                    num_workers=config['n_loaders'], collate_fn=DataLoaders.collate_fn)

        # initialize network, optimizer and loss function
        net = Baseline.model_factory(ModelDict, trainset.n_parameters, trainset.n_grid_time_invariant_parameters,
                                     config['grid_size'], config['prediction_times'])
        # store class name
        experiment_info['model_class'] = net.__class__.__name__

        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)

        if torch.cuda.is_available():
            net.cuda()

        # load number of train and test samples
        n_train_samples, n_test_samples = len(train_fold), len(test_fold)

        optimizer, scheduler = ModelUtils.initializeOptimizer(optimizer_config, net)
        criterion = nn.MSELoss()

        # keep number of processed samples over all epochs for tensorboard
        processed_train_samples_global = 0
        processed_test_samples_global = 0

        # start learning
        for epoch in range(config['epochs']):
            epoch_train_time = np.zeros((5,))
            epoch_start_time = time()
            print('Epoch: ' + str(epoch + 1) + '\n------------------------------------------------------------')

            # adapt learning rate and store information in experiment attributes
            if scheduler is not None:
                scheduler.step()
                if run == 0: experiment_info['scheduled_learning_rates'] += scheduler.get_lr()
                print('Using learning rate %s' % str(scheduler.get_lr()))

            # TRAINING
            # initialize variables for epoch statistics
            LABELS, MODELoutputs, COSMOoutputs = None, None, None
            processed_train_samples = 0
            net.train(True)

            train_start_time = time()
            # loop over complete train set
            for i, data in enumerate(trainloader, 0):
                time_start = time()
                try:
                    # get training batch, e.g. label, cosmo-1 output and time inv. features for station
                    DATA = data
                    # DATA has only length 4 if we do not use the station time invariant features
                    if len(DATA) == 4:
                        Blabel, Bip2d, BTimeData, init_station_temp = DATA
                        station_time_inv_input = None
                    elif len(DATA) == 5:
                        Blabel, Bip2d, BTimeData, StationTimeInv, init_station_temp = DATA
                        station_time_inv_input = ModelUtils.getVariable(StationTimeInv).float()
                    else:
                        raise Exception('Unknown data format for training...')
                    input = ModelUtils.getVariable(Bip2d).float()
                    time_data = ModelUtils.getVariable(BTimeData).float()
                    target = ModelUtils.getVariable(Blabel).float()

                except TypeError:
                    # when the batch size is small, it could happen that all labels have been corrupted and therefore
                    # collate_fn would return an empty list
                    print('Skipped an empty batch...')
                    continue
                time_after_data_preparation = time()

                processed_train_samples += len(Blabel)

                optimizer.zero_grad()
                out = net(input, time_data, station_time_inv_input)
                time_after_forward_pass = time()
                loss = criterion(out, target)
                loss.backward()
                optimizer.step()
                time_after_backward_pass = time()

                if LABELS is None:
                    LABELS = Blabel.data
                    MODELoutputs = out.data
                    COSMOoutputs = init_station_temp[2].data
                else:
                    LABELS = np.vstack((LABELS, Blabel.data))
                    MODELoutputs = np.vstack((MODELoutputs, out.data))
                    COSMOoutputs = np.vstack((COSMOoutputs, init_station_temp[2].data))

                time_after_label_stack = time()

                if (i + 1) % 64 == 0:

                    print('Sample: %s \t Loss: %s' % (processed_train_samples, float(np.sqrt(loss.data))))

                    # ============ TensorBoard logging ============#
                    # (1) Log the scalar values
                    info = {
                        setting_string: np.sqrt(loss.item()),
                    }

                    for tag, value in info.items():
                        train_logger.scalar_summary(tag, value, processed_train_samples_global + processed_train_samples)

                    # (2) Log values and gradients of the parameters (histogram)
                    for tag, value in net.named_parameters():
                        tag = tag.replace('.', '/')
                        train_logger.histo_summary(tag, ModelUtils.to_np(value), i + 1)
                        train_logger.histo_summary(tag + '/grad', ModelUtils.to_np(value.grad), i + 1)

                    epoch_train_time += np.array((time_start - time_end,
                                                  time_after_data_preparation - time_start,
                                                  time_after_forward_pass - time_after_data_preparation,
                                                  time_after_backward_pass - time_after_forward_pass,
                                                  time_after_label_stack - time_after_backward_pass))

                time_end = time()

            # calculate error statistic of current epoch
            diff_model = MODELoutputs - LABELS
            diff_cosmo = COSMOoutputs - LABELS
            epoch_train_rmse_model = np.apply_along_axis(func1d=ModelUtils.rmse, arr=diff_model, axis=0)
            epoch_train_rmse_cosmo = np.apply_along_axis(func1d=ModelUtils.rmse, arr=diff_cosmo, axis=0)


            # update global processed samples
            processed_train_samples_global += processed_train_samples

            if np.isnan(epoch_train_rmse_model).any():
                print("Learning rate too large resulted in NaN-error while training. Stopped training...")
                return
            # print epoch training times
            print('Timing: Waiting on data=%s, Data Preparation=%s, '
                  'Forward Pass=%s, Backward Pass=%s, Data Stacking=%s' % tuple(list(epoch_train_time / len(epoch_train_time))))

            # RMSE of epoch
            print('Train/test statistic for epoch: %s' % str(epoch + 1))
            print('Train RMSE COSMO: ' , ", ".join(["T=%s: %s" % (idx, epoch_train_rmse_cosmo[idx]) for idx in range(len(epoch_train_rmse_cosmo))]))
            print('Train RMSE Model: ' , ", ".join(["T=%s: %s" % (idx, epoch_train_rmse_model[idx]) for idx in range(len(epoch_train_rmse_model))]))
            sys.stdout.flush()

            train_time = time() - train_start_time

            # TESTING
            test_start_time = time()

            LABELS, MODELoutputs, COSMOoutputs, STATION = None, None, None, None
            processed_test_samples = 0
            net.eval()
            for i, data in enumerate(testloader, 0):
                try:
                    # get training batch, e.g. label, cosmo-1 output and time inv. features for station
                    DATA = data
                    # DATA has only length 4 if we do not use the station time invariant features
                    if len(DATA) == 4:
                        Blabel, Bip2d, BTimeData, init_station_temp = DATA
                        station_time_inv_input = None
                    elif len(DATA) == 5:
                        Blabel, Bip2d, BTimeData, StationTimeInv, init_station_temp = DATA
                        station_time_inv_input = ModelUtils.getVariable(StationTimeInv).float()
                    else:
                        raise Exception('Unknown data format for training...')
                    input = ModelUtils.getVariable(Bip2d).float()
                    time_data = ModelUtils.getVariable(BTimeData).float()
                    target = ModelUtils.getVariable(Blabel).float()

                except TypeError:
                    # when the batch size is small, it could happen that all labels have been corrupted and therefore
                    # collate_fn would return an empty list
                    print('Skipped an empty batch...')
                    continue

                processed_test_samples += len(Blabel)

                out = net(input, time_data, station_time_inv_input)
                loss = criterion(out, target)

                if LABELS is None:
                    LABELS = Blabel.data
                    MODELoutputs = out.data
                    COSMOoutputs = init_station_temp[2].data
                    STATION = init_station_temp[1].data
                else:
                    LABELS = np.vstack((LABELS, Blabel.data))
                    MODELoutputs = np.vstack((MODELoutputs, out.data))
                    COSMOoutputs = np.vstack((COSMOoutputs, init_station_temp[2].data))
                    STATION = np.hstack((STATION, init_station_temp[1].data))

                if (i + 1) % 16 == 0:
                    # ============ TensorBoard logging ============#
                    # (1) Log the scalar values
                    info = {
                        setting_string: np.sqrt(loss.item()),
                    }

                    for tag, value in info.items():
                        test_logger.scalar_summary(tag, value, processed_test_samples_global + processed_test_samples)

            # calculate error statistic of current epoch
            diff_model = MODELoutputs - LABELS
            diff_cosmo = COSMOoutputs - LABELS

            # rmse
            epoch_test_rmse_model = np.apply_along_axis(func1d=ModelUtils.rmse, arr=diff_model, axis=0)
            epoch_test_rmse_cosmo = np.apply_along_axis(func1d=ModelUtils.rmse, arr=diff_cosmo, axis=0)
            overall_test_rmse_model = ModelUtils.rmse(diff_model)
            overall_test_rmse_cosmo = ModelUtils.rmse(diff_cosmo)

            # mae
            epoch_test_mae_model = np.apply_along_axis(func1d=ModelUtils.mae, arr=diff_model, axis=0)
            epoch_test_mae_cosmo = np.apply_along_axis(func1d=ModelUtils.mae, arr=diff_cosmo, axis=0)
            overall_test_mae_model = ModelUtils.mae(diff_model)
            overall_test_mae_cosmo = ModelUtils.mae(diff_cosmo)

            # calculate per station rmse if desired (especially for the K-fold station generalization experiment)
            if "per_station_rmse" in config:
                max_station_id = 1435

                squared_errors_per_epoch = np.array((np.square(diff_model), np.square(diff_cosmo))).squeeze()

                # the highest index of data is 1435, thus we expect at least 1435 entries, which we can access by
                # station id
                test_samples_per_station = np.bincount(STATION, minlength=max_station_id+1)
                model_squared_error_per_station = np.bincount(STATION, weights=squared_errors_per_epoch[0], minlength=max_station_id+1)
                cosmo_squared_error_per_station = np.bincount(STATION, weights=squared_errors_per_epoch[1], minlength=max_station_id+1)

                # set division by zero/NaN warning to 'ignore'
                np.seterr(divide='ignore', invalid='ignore')

                # calculate rmse per station
                rmse_per_station = np.vstack((np.sqrt(np.divide(model_squared_error_per_station, test_samples_per_station)),
                                              np.sqrt(np.divide(cosmo_squared_error_per_station, test_samples_per_station)))).T

                # set division by zero/NaN warning to 'warn'
                np.seterr(divide='warn', invalid='warn')






            # update global processed samples
            processed_test_samples_global += processed_test_samples

            # RMSE of epoch
            print('Test RMSE COSMO: ', ", ".join(
                ["T=%s: %s" % (idx, epoch_test_rmse_cosmo[idx]) for idx in range(len(epoch_test_rmse_cosmo))]),
                  " (Overall: %s)" % overall_test_rmse_cosmo)
            print('Test RMSE Model: ', ", ".join(
                ["T=%s: %s" % (idx, epoch_test_rmse_model[idx]) for idx in range(len(epoch_test_rmse_model))]),
                  " (Overall: %s)" % overall_test_rmse_model)
            # MAE of epoch
            print('Test MAE COSMO: ', ", ".join(
                ["T=%s: %s" % (idx, epoch_test_mae_cosmo[idx]) for idx in range(len(epoch_test_mae_cosmo))]),
                  " (Overall: %s)" % overall_test_mae_cosmo)
            print('Test MAE Model: ', ", ".join(
                ["T=%s: %s" % (idx, epoch_test_mae_model[idx]) for idx in range(len(epoch_test_mae_model))]),
                  " (Overall: %s)" % overall_test_mae_model)

            sys.stdout.flush()

            test_time = time() - test_start_time

            # time for epoch
            epoch_time = time() - epoch_start_time

            # update error statistics
            error_statistics = ModelUtils.updateErrorStatistic(error_statistics,
                                                               np.array([epoch_train_rmse_model, epoch_test_rmse_model])[None, None, ...],
                                                               run, epoch, config['prediction_times'])
            # update run times statistic
            run_times = ModelUtils.updateRuntimeStatistic(run_times, np.array([epoch_time, train_time, test_time])[None, None, ...],
                                                          run, epoch)
            # update skip statistic
            skip_statistics = ModelUtils.updateSkipStatistic(skip_statistics,
                                                             np.array([n_train_samples, processed_train_samples,
                                                                       n_test_samples, processed_test_samples])[None, None, ...],
                                                             run, epoch)

            # update per station rmse data array over runs if desired (especially for the K-fold station generalization experiment)
            if "per_station_rmse" in config:
                error_per_station_statistics = ModelUtils.updatePerStationErrorStatistic(error_per_station_statistics, rmse_per_station, run, epoch, np.arange(max_station_id+1))

            # store model if it was the best yet
            is_best = overall_test_rmse_model <= best_epoch_test_rmse
            best_epoch_test_rmse = min(overall_test_rmse_model, best_epoch_test_rmse)
            ModelUtils.save_checkpoint({
                'epoch': epoch,
                'run': run,
                'arch': net.__class__.__name__,
                'state_dict': net.state_dict(),
                'overall_test_rmse': overall_test_rmse_model,
                'lead_test_rmse' : overall_test_rmse_model,
                'best_epoch_test_rmse': best_epoch_test_rmse,
                'optimizer': optimizer.state_dict(),
            }, is_best, output_path + '/stored_models/run_%s' % run)

            # flush output to see progress
            sys.stdout.flush()

    # update statistics dict
    ModelUtils.get_model_details(experiment_info, net, optimizer, criterion)

    # complete program runtime
    experiment_info['program_runtime'] = time() - program_start_time

    # generate data set of all experiment statistics and additional information
    experiment_statistic = xr.Dataset({
        'error_statistic' : error_statistics,
        'run_time_statistic': run_times,
        'samples_statistic' : skip_statistics}).assign_attrs(experiment_info)

    # dump experiment statistic
    with open(output_path + '/experiment_statistic.pkl', 'wb') as handle:
        pkl.dump(experiment_statistic, handle, protocol=pkl.HIGHEST_PROTOCOL)

    if 'per_station_rmse' in config:
        # dump experiment statistic
        with open(output_path + '/rmse_per_station.pkl', 'wb') as handle:
            pkl.dump(error_per_station_statistics, handle, protocol=pkl.HIGHEST_PROTOCOL)

    # print program execution time
    m, s = divmod(experiment_info['program_runtime'], 60)
    h, m = divmod(m, 60)
    print('Experiment has successfully finished in %dh %02dmin %02ds' % (h, m, s))
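ModelUtils.save_checkpoint and load_checkpoint are not part of this listing. A minimal sketch of the save side, under the assumption that it follows the common "store latest, copy best" pattern matching the is_best flag used above:

# Sketch (assumption): not the actual ModelUtils implementation
import os
import shutil
import torch

def save_checkpoint(state, is_best, path, filename='checkpoint.pth.tar'):
    os.makedirs(path, exist_ok=True)
    checkpoint_file = os.path.join(path, filename)
    torch.save(state, checkpoint_file)  # always keep the latest epoch
    if is_best:
        # keep a separate copy of the best epoch seen so far
        shutil.copyfile(checkpoint_file, os.path.join(path, 'model_best.pth.tar'))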
def CreateDataByStationAndInit(GridSize, DateBegin, DateEnd, PredictionWindow,
                               ListParam, WithTopo, TopoListParam, isLocal,
                               n_parallel):
    time_begin = time()

    if DateEnd < DateBegin:
        raise Exception('DateEnd is smaller than DateBegin.')

    assert GridSize % 2 == 1, 'Grid size must be an odd number.'

    # different paths, whether we run the script locally or on a cluster node
    if isLocal:
        ADDRESSdata = '/home/n1no/Documents/ethz/master_thesis/code/project/data/cosmo-1/data_subset'  # COSMO-1 outputs
        ADDRESStopo = '/home/n1no/Documents/ethz/master_thesis/code/project/data'  # base address of topo files
        ADDRESSobst = '/home/n1no/Documents/ethz/master_thesis/code/project/data/observations/'  # base address of obs files
        DESTINATION = '/home/n1no/Documents/ethz/master_thesis/code/project/data/preprocessed_data/station_init/grid_size_' + str(
            GridSize)  # target directory for all generated files
    else:
        ADDRESSdata = '/mnt/data/bhendj/full/cosmo-1'  # COSMO-1 outputs
        ADDRESStopo = '/mnt/ds3lab-scratch/ninow/topo'  # base address of topo files
        ADDRESSobst = '/mnt/ds3lab-scratch/ninow/observations'  # base address of obs files
        DESTINATION = '/mnt/ds3lab-scratch/ninow/preprocessed_data/station_init/grid_size_' + str(
            GridSize)  # target directory for all generated files

    # create an output folder for each station, based on the station ids
    OBS = xr.open_dataset(ADDRESSobst + '/meteoswiss_t2m_20151001-20180331.nc')
    station_ids = OBS['station_id'].data
    OBS.close()

    station_paths = []
    for S in station_ids:
        # prepare output folders for each station
        station_paths += [DESTINATION + '/Station_' + str(S)]
        if not os.path.exists(station_paths[-1]):
            os.makedirs(station_paths[-1])

    # get all COSMO-1 files that are in the given time interval and have not yet been processed and thus do not
    # already exist in the output folder
    folders = DataUtils.getFilesToProcess(ADDRESSdata, DESTINATION,
                                          'StationAndInit', DateBegin, DateEnd)
    folders.sort()

    # calculate begin and end index of array to exclude files, that are not in the specified time interval
    begin, end = -1, -1
    for idx, folder in enumerate(folders):
        if folder[:-4] >= DateBegin:
            begin = idx
            break

    for idx, folder in enumerate(folders):
        if folder[:-4] <= DateEnd:
            end = idx
        else:
            break

    if begin == -1 or end == -1:
        raise Exception('Could not find start or end in array.')

    folders = folders[begin:end + 1]
    print('%s files are left to be preprocessed.' % len(folders))

    # split the folders into K approx. equal splits
    if n_parallel <= 1:
        folder_splits = [folders]
    else:
        n_folders = len(folders)
        indices = np.linspace(0, n_folders, n_parallel + 1).astype(int)
        folder_splits = [
            folders[indices[i]:indices[i + 1]] for i in range(n_parallel)
        ]

    folder_splits = [l for l in folder_splits if len(l) > 0]

    # take timestamp after set-up
    time_setup = time()

    with Pool(processes=n_parallel) as pool:
        # run preprocessing in parallel for all splits and keep the processes in a list to sync them later
        process_results = []
        for idx_split, split in enumerate(folder_splits):
            print('Process %s with range [%s, %s] queued.' %
                  (idx_split, split[0], split[-1]))
            # only calculate topo data by the first process, since it is invariant
            if idx_split == 0:
                isTopo = WithTopo
            else:
                isTopo = 0

            process_results.append(
                pool.apply_async(
                    GetData, (idx_split, ADDRESSdata, ADDRESStopo, ADDRESSobst,
                              DESTINATION, ListParam, TopoListParam, GridSize,
                              isTopo, split, PredictionWindow, isLocal)))

        # forces the parent process to wait on all forked children processes
        for ps_idx, ps_result in enumerate(process_results):
            # sync processes
            _ = ps_result.get()
            print('[Process %s] Synchronized after data creation.' % ps_idx)

    # take timestamp after completing all processes
    time_end = time()

    # dump preprocessing information in a descriptive JSON file
    preprocessing_information = {
        'grid_size': GridSize,
        'data_begin': DateBegin,
        'data_end': DateEnd,
        'parameters': ListParam,
        'future_hours': PredictionWindow,
        'n_processes': n_parallel,
        'time_setup': str(timedelta(seconds=(time_setup - time_begin))),
        'time_preprocessing': str(timedelta(seconds=(time_end - time_setup)))
    }

    preprocessing_information_json = json.dumps(preprocessing_information)
    f = open(DESTINATION + '/setup.json', 'w')
    f.write(preprocessing_information_json)
    f.close()

    print('Preprocessing successfully finished in %s.' %
          str(timedelta(seconds=(time_end - time_begin))))
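The np.linspace-based split above partitions the folder list into n_parallel contiguous, nearly equal chunks before handing them to the worker pool. A small standalone illustration of the same splitting technique on toy data:

# Standalone illustration of the chunking used above (toy folder names, not real data)
import numpy as np

folders = ['f%02d' % i for i in range(10)]  # toy stand-in for COSMO-1 folder names
n_parallel = 3
indices = np.linspace(0, len(folders), n_parallel + 1).astype(int)
folder_splits = [folders[indices[i]:indices[i + 1]] for i in range(n_parallel)]
print([len(split) for split in folder_splits])  # chunk sizes, e.g. [3, 3, 4]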
def GetData(processId, ADDRESSdata, ADDRESStopo, ADDRESSobst, DESTINATION,
            ListParam, TopoListParam, GridSize, WithTopo, Files,
            PredictionWindow, isLocal):
    # processId: (int) -> the id of the process running this method
    # ADDRESSdata: (string) -> base path to COSMO-1 data
    # ADDRESStopo: (string) -> base path to all topology files
    # ADDRESSobst: (string) -> base path to all observation files
    # DESTINATION: (string) -> base path to target output folder
    # GridSize: (int)-> side length of square around each station
    # WithTopo: (bool)-> whether we want to generate preprocessed time invariant features for each station
    # Files: (list(string)) -> list of all files ('yymmddHH') to be processed, e.g. ['15031203', '15031206', ...]
    # PredictionWindow: (list of int) -> all future hours t's [t,t+1,t+2,...] being processed, e.g. y_t, y_t+1,...
    # isLocal: (bool) -> for setting the right paths if the script is running on a local machine or in cloud, etc.

    # path to observation and topological data
    if isLocal:
        OBS = xr.open_dataset(ADDRESSobst +
                              '/meteoswiss_t2m_20151001-20180331.nc')
        TOPO = xr.open_dataset(ADDRESStopo + '/topodata.nc')
    else:
        # to fix parallelization errors, each process gets its own set of TOPO and OBS files
        OBS = xr.open_dataset(
            ADDRESSobst +
            '/process_%s/meteoswiss_t2m_20151001-20180331.nc' % processId)
        TOPO = xr.open_dataset(ADDRESStopo +
                               '/process_%s/topodata.nc' % processId)

    # load all station ids
    stationIds = OBS['station_id'].data

    # generate a view on temperature observation at each station
    TempObs = OBS['t2m'].sel(station_id=stationIds)

    # we need to localize the stations on the 1058*674 grid
    GPSgrid = np.dstack((
        TOPO['lat'][:, :],
        TOPO['lon'][:, :]))  # 1058*674*2 grid of lon lat values of each square

    # a list with the (lat,lon)-id of the nearest grid point for each station
    closestGridPointPerStation = []
    # a dictionary with the sub-grid around each station
    stationSquaresDict = {}
    # generate sub-grids for each station of the closest GridSize**2 grid points
    for S in stationIds:

        # we compute each grid square's distance to the station
        # and we take the one with the smallest distance to be our reference
        dist = GPSgrid - np.array(
            [[OBS['lat'].sel(station_id=S), OBS['lon'].sel(station_id=S)]])
        dist *= dist
        Id = (dist.sum(axis=2)).argmin()
        Id = np.unravel_index(Id, (674, 1058))

        closestGridPointPerStation += [
            Id
        ]  # Id=(x,y) coordinates of the station (approx. the closest grid point) on the 1058*674 grid

        SQUARE = {}
        # for each station, stationSquaresDict contains the coordinates of the squares which form an N*N grid around the station
        SQUARE['lat_idx'] = [
            x + Id[0] - int(GridSize / 2) for x in range(GridSize)
        ]
        SQUARE['lon_idx'] = [
            x + Id[1] - int(GridSize / 2) for x in range(GridSize)
        ]

        stationSquaresDict[S] = SQUARE

    # pandas data frame with dictionary of the sub-grid for each station
    stationSquares = pd.DataFrame(data=stationSquaresDict)

    # extract time invariant features for each station and the corresponding sub-grid
    if WithTopo:
        if not os.path.exists(DESTINATION +
                              '/time_invariant_data_per_station.pkl'):
            ds = DataUtils.getTimeInvariantStationFeatures(
                TOPO=TOPO,
                OBS=OBS,
                stationSquares=stationSquares,
                stationIds=stationIds,
                closestGridPointPerStation=closestGridPointPerStation,
                GridSize=GridSize,
                Features=TopoListParam)
            with open(DESTINATION + '/time_invariant_data_per_station.pkl',
                      'wb') as handle:
                pkl.dump(ds, handle, protocol=pkl.HIGHEST_PROTOCOL)
            ds.close()
            print(
                '[Process %s] Time invariant features have been processed and stored.'
                % processId)
        else:
            print(
                'Time invariant features have been found on disk and were therefore not created again.'
            )

    # we now start the iteration through: Each folder, each file, each parameter, each station
    for file in Files:  # loop over all  outputs of COSMO-1, e.g. for 3h interval every day
        try:
            # mark start of preprocessing of n-th file
            print('[Process %s] Start processing %s' % (processId, file))

            # initialize data variables
            DATA = np.zeros(
                (len(stationIds), len(PredictionWindow), GridSize, GridSize,
                 len(ListParam)))
            TempForecast = np.zeros((len(stationIds), len(PredictionWindow)))
            Target = np.zeros((len(stationIds), len(PredictionWindow)))
            TimeStamp = np.zeros((len(PredictionWindow)))
            TimeData = np.zeros((len(PredictionWindow), 5))

            for idx_T, T in enumerate(
                    PredictionWindow
            ):  # loop over all future predictions, e.g. current hour + T

                NAME = '%s/%s/c1ffsurf%03d.nc' % (ADDRESSdata, file, T)

                # load netCDF4 dataset
                dataset = xr.open_dataset(NAME)

                # get initialization time of COSMO-1 data point
                t = dataset['time'].data

                # check that we do not process a data point before the first observation
                if t < OBS['time'].data[0]:
                    print('[Process %s] Skipped %s' % (processId, file))
                    raise SkipException()

                # Transform day and hour into a cyclic datetime feature
                days_rad = (DataUtils.passed_days_per_month_dict[int(
                    file[2:4])] + int(file[4:6])) / 365 * (2 * np.pi)
                hours = (int(file[6:8]) + T) % 24
                hour_rad = hours / 24 * (2 * np.pi)

                TimeData[idx_T] = [
                    np.cos(hour_rad),
                    np.sin(hour_rad),
                    np.cos(days_rad),
                    np.sin(days_rad), T / 33
                ]
                # ______________________________________________________________________

                for P in range(len(ListParam)):
                    MAP = dataset[ListParam[P]].data.squeeze()
                    for idx_S, S in enumerate(stationIds):
                        stationSquare = stationSquares[S]
                        DATA[idx_S, idx_T, :, :, P] = MAP[
                            stationSquare.lat_idx][:, stationSquare.lon_idx]

                # We compare the forecasted temperature with the actual observation
                MAP = np.squeeze(dataset['T']).data
                TempForecast[:, idx_T] = np.array(
                    [MAP[x] for x in closestGridPointPerStation])

                TimeStamp[idx_T] = t[0]

                # this dataset is not used anymore and can be closed
                dataset.close()

                try:
                    Target[:, idx_T] = TempObs.sel(time=t).data
                except RuntimeError:
                    print('Error with time=%s.' % t)
                    raise

            # we write the data in a binary file
            for idx_S, S in enumerate(stationIds):
                leads = PredictionWindow
                time_features = [
                    'cos_hour', 'sin_hour', 'cos_day', 'sin_day', 'lead'
                ]
                lats = stationSquares[S].lat_idx
                lons = stationSquares[S].lon_idx
                dims = ('lead', 'lat', 'lon', 'feature')
                cosmo_data = xr.DataArray(
                    DATA[idx_S],
                    dims=dims,
                    coords=[leads, lats, lons, ListParam])
                temp_forecast = xr.DataArray(TempForecast[idx_S],
                                             dims=('lead'),
                                             coords=[leads])
                temp_station = xr.DataArray(Target[idx_S],
                                            dims=('lead'),
                                            coords=[leads])
                time_data = xr.DataArray(TimeData,
                                         dims=('lead', 'time_feature'),
                                         coords=[leads, time_features])
                time_data.attrs['time_stamp'] = TimeStamp
                ds = xr.Dataset({
                    'cosmo_data': cosmo_data,
                    'temp_forecast': temp_forecast,
                    'temp_station': temp_station,
                    'time_data': time_data
                })
                ds.attrs['station_id'] = S

                try:
                    # ds.to_netcdf(DESTINATION + '/Station_%s/%s.nc' % (S, file[:8]))
                    with open(
                            DESTINATION + '/Station_%s/%s.pkl' % (S, file[:8]),
                            'wb') as handle:
                        pkl.dump(ds, handle, protocol=pkl.HIGHEST_PROTOCOL)
                except FileNotFoundError:
                    fileExists = os.path.exists(DESTINATION +
                                                '/Station_%s' % S)
                    print('Error that file does not exist, check says: %s' %
                          str(fileExists))
                    raise
                ds.close()

            # print that processing of data point has been completed
            print('[Process %s] Finished %s' % (processId, file))
        except SkipException:
            continue

    OBS.close()
    TOPO.close()

    print('[Process %s] Data split successfully preprocessed!' % processId)

    return 1
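The TimeData block above encodes hour of day and day of year as points on the unit circle, so that e.g. hour 23 and hour 0 stay close to each other, and scales the lead time by the constant 33. A standalone sketch of the same transformation; max_lead=33 mirrors that constant and is an assumption of this sketch:

# Sketch of the cyclic time-feature encoding used above
import numpy as np

def cyclic_time_features(hour, day_of_year, lead, max_lead=33):
    # hour of day and day of year mapped onto the unit circle, lead time scaled to [0, 1]
    hour_rad = hour / 24 * (2 * np.pi)
    day_rad = day_of_year / 365 * (2 * np.pi)
    return [np.cos(hour_rad), np.sin(hour_rad),
            np.cos(day_rad), np.sin(day_rad), lead / max_lead]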
def CreateData(config, data_dictionary, data_statistics, train_test_folds):

    # assign all program arguments to local variables
    with open(config['model']['path']) as handle:
        ModelDict = json.loads(handle.read())

    # check if station and grid time invariant features should be used and set the list of desired parameters
    if not ('grid_time_invariant' in ModelDict
            and ModelDict['grid_time_invariant']):
        config['grid_time_invariant_parameters'] = []
    if not ('station_time_invariant' in ModelDict
            and ModelDict['station_time_invariant']):
        config['station_parameters'] = []

    # if needed, load time invariant features
    with open(
            "%s/%s/grid_size_%s/time_invariant_data_per_station.pkl" %
        (config['input_source'], config['preprocessing'],
         config['original_grid_size']), "rb") as input_file:
        time_invarian_data = pkl.load(input_file)

    # initialize feature scaling function for each feature
    featureScaleFunctions = DataUtils.getFeatureScaleFunctions(
        ModelUtils.ParamNormalizationDict, data_statistics)

    # add revision short hash to the config
    config['code_commit'] = ModelUtils.get_git_revision_short_hash()

    # take the right preprocessed train/test data set for the first run
    train_fold, test_fold = train_test_folds[0]

    # initialize train and test dataloaders
    trainset = DataLoaders.CosmoDataGridData(
        config=config,
        station_data_dict=data_dictionary,
        files=train_fold,
        featureScaling=featureScaleFunctions,
        time_invariant_data=time_invarian_data)
    trainloader = DataLoader(trainset,
                             batch_size=config['batch_size'],
                             shuffle=True,
                             num_workers=config['n_loaders'],
                             collate_fn=DataLoaders.collate_fn)

    testset = DataLoaders.CosmoDataGridData(
        config=config,
        station_data_dict=data_dictionary,
        files=test_fold,
        featureScaling=featureScaleFunctions,
        time_invariant_data=time_invarian_data)
    testloader = DataLoader(testset,
                            batch_size=config['batch_size'],
                            shuffle=True,
                            num_workers=config['n_loaders'],
                            collate_fn=DataLoaders.collate_fn)

    # loop over complete train set
    train_data = None
    train_inits = []
    train_stations = None
    for i, data in enumerate(trainloader, 0):
        try:
            # get training batch, e.g. label, cosmo-1 output and time inv. features for station
            DATA = data
            # DATA has only length 4 if we do not use the station time invariant features
            if len(DATA) == 4:
                Blabel, Bip2d, BTimeData, init_station_temp = DATA
                station_time_inv_input = None
            elif len(DATA) == 5:
                Blabel, Bip2d, BTimeData, StationTimeInv, init_station_temp = DATA
                station_time_inv_input = ModelUtils.getVariable(
                    StationTimeInv).float()
            else:
                raise Exception('Unknown data format for training...')
            input = ModelUtils.getVariable(Bip2d).float()
            time_data = ModelUtils.getVariable(BTimeData).float()
            target = ModelUtils.getVariable(Blabel).float()

            # station time invariant features are only present if they were requested
            if station_time_inv_input is not None:
                batch_data = np.concatenate(
                    (input.squeeze(), station_time_inv_input, time_data,
                     target, init_station_temp[2]),
                    axis=1)
            else:
                batch_data = np.concatenate(
                    (input.squeeze(), time_data, target, init_station_temp[2]),
                    axis=1)

            train_inits += init_station_temp[0]

            if train_data is None:
                train_data = batch_data
                train_stations = init_station_temp[1]
            else:
                train_data = np.vstack((train_data, batch_data))
                train_stations = np.hstack(
                    (train_stations, init_station_temp[1]))

        except TypeError:
            # if the batch size is small, it can happen that all labels are corrupted and
            # collate_fn returns an empty list, which fails to unpack above
            print('Skipping corrupted batch...')
            continue

    # define column names for data frame
    column_names = [
        'Pressure', 'Wind U-Comp.', 'Wind V-Comp.', 'Wind VMAX',
        '2m-Temperature', 'Temp. of Dew Point', 'Cloud Coverage (High)',
        'Cloud Coverage (Medium)', 'Cloud Coverage (Low)',
        'Tot. Precipitation', 'ALB_RAD', 'ASOB', 'ATHB', 'HPBL',
        '2m-Temperature (Lead=0)'
    ]
    column_names += [
        'Grid Height', 'Grid-Station Height Diff.', 'Fraction of Land',
        'Soiltype', 'Latitude', 'Longitude', 'Grid-Station 2d Distance'
    ]
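    # a width of at least 31 columns is assumed to indicate that the station time-invariant
    # features were included in batch_data above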
    if train_data.shape[1] >= 31:
        column_names += [
            'Station Height', 'Station Latitude', 'Station Longitude'
        ]
    column_names += [
        'Hour (Cosine)', 'Hour (Sine)', 'Month (Cosine)', 'Month (Sine)',
        'Lead-Time'
    ]
    column_names += ['Target 2m-Temp.']
    column_names += ['COSMO 2m-Temp.']

    train_keys = pd.DataFrame.from_dict({
        'Station': train_stations,
        'Init': train_inits
    })
    train_data = pd.DataFrame(data=train_data, columns=column_names)
    train_ds = pd.concat([train_keys, train_data], axis=1)

    test_data = None
    test_inits = []
    test_stations = None
    for i, data in enumerate(testloader, 0):
        try:
            # get test batch, i.e. label, COSMO-1 output and time invariant features per station
            DATA = data
            # DATA has only length 4 if we do not use the station time invariant features
            if len(DATA) == 4:
                Blabel, Bip2d, BTimeData, init_station_temp = DATA
                station_time_inv_input = None
            elif len(DATA) == 5:
                Blabel, Bip2d, BTimeData, StationTimeInv, init_station_temp = DATA
                station_time_inv_input = ModelUtils.getVariable(
                    StationTimeInv).float()
            else:
                raise Exception('Unknown data format for training...')
            input = ModelUtils.getVariable(Bip2d).float()
            time_data = ModelUtils.getVariable(BTimeData).float()
            target = ModelUtils.getVariable(Blabel).float()

            # station time invariant features are only present if they were requested
            if station_time_inv_input is not None:
                batch_data = np.concatenate(
                    (input.squeeze(), station_time_inv_input, time_data,
                     target, init_station_temp[2]),
                    axis=1)
            else:
                batch_data = np.concatenate(
                    (input.squeeze(), time_data, target, init_station_temp[2]),
                    axis=1)

            test_inits += init_station_temp[0]

            if test_data is None:
                test_data = batch_data
                test_stations = init_station_temp[1]
            else:
                test_data = np.vstack((test_data, batch_data))
                test_stations = np.hstack(
                    (test_stations, init_station_temp[1]))

        except TypeError:
            # if the batch size is small, it can happen that all labels are corrupted and
            # collate_fn returns an empty list, which fails to unpack above
            print('Skipping corrupted batch...')
            continue

    test_keys = pd.DataFrame.from_dict({
        'Station': test_stations,
        'Init': test_inits
    })
    test_data = pd.DataFrame(data=test_data, columns=column_names)
    test_ds = pd.concat([test_keys, test_data], axis=1)

    network_ready_data_path = config['input_source'] + '/network_ready_data'
    if not os.path.exists(network_ready_data_path):
        os.makedirs(network_ready_data_path)

    network_ready_train_data_path = network_ready_data_path + '/train_data'
    network_ready_test_data_path = network_ready_data_path + '/test_data'

    train_ds.to_pickle(network_ready_train_data_path)
    test_ds.to_pickle(network_ready_test_data_path)

    # SHAP-specific config entries for the analysis in a Jupyter notebook
    config['train_data_path'] = network_ready_data_path + '/train_data'
    config['test_data_path'] = network_ready_data_path + '/test_data'

    # dump config
    with open(network_ready_data_path + '/config.pkl', 'wb') as handle:
        pkl.dump(config, handle, protocol=pkl.HIGHEST_PROTOCOL)

    print('Network ready data analysis successfully executed.')
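# Usage sketch (paths are placeholders, not taken from the original script): the artifacts
# written above can be reloaded in a notebook for the SHAP analysis mentioned in the comments.
#
#   import pickle as pkl
#   import pandas as pd
#
#   with open('<input_source>/network_ready_data/config.pkl', 'rb') as handle:
#       config = pkl.load(handle)
#   train_ds = pd.read_pickle(config['train_data_path'])
#   test_ds = pd.read_pickle(config['test_data_path'])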
Beispiel #10
0
def CreateBaselineData(DateBegin, DateEnd, PredictionWindow, isLocal,
                       n_parallel):
    time_begin = time()

    if DateEnd < DateBegin:
        raise Exception('DateEnd is smaller than DateBegin.')

    # different paths, whether we run the script locally or on a cluster node
    if isLocal:
        ADDRESSdata = '/home/n1no/Documents/ethz/master_thesis/code/project/data/cosmo-1/data_subset'  # COSMO-1 outputs
        ADDRESStopo = '/home/n1no/Documents/ethz/master_thesis/code/project/data'  # base address of topo files
        ADDRESSobst = '/home/n1no/Documents/ethz/master_thesis/code/project/data/observations/'  # base address of obs files
        DESTINATION = '/home/n1no/Documents/ethz/master_thesis/code/project/data/preprocessed_data/baseline'  # target directory for all generated files
    else:
        ADDRESSdata = '/mnt/data/bhendj/full/cosmo-1'  # COSMO-1 outputs
        ADDRESStopo = '/mnt/ds3lab-scratch/ninow/topo'  # base address of topo files
        ADDRESSobst = '/mnt/ds3lab-scratch/ninow/observations'  # base address of obs files
        DESTINATION = '/mnt/ds3lab-scratch/ninow/preprocessed_data/baseline'  # target directory for all generated files

    if not os.path.exists(DESTINATION):
        os.makedirs(DESTINATION)

    # load observation and topography data and collect all station ids
    OBS = xr.open_dataset(ADDRESSobst + '/meteoswiss_t2m_20151001-20180331.nc')
    TOPO = xr.open_dataset(ADDRESStopo + '/topodata.nc')
    station_ids = OBS['station_id'].data

    # extract time invariant features for each station and the corresponding sub-grid
    if not os.path.exists(DESTINATION + '/station_neighbors.pkl'):

        station_neighbors = {}

        # calculate for each station the neighbors on the grid in parallel
        with Pool(processes=n_parallel) as pool:
            process_results = []

            gridHeightData = TOPO.HH.data
            gridLatData = TOPO.lat.data
            gridLonData = TOPO.lon.data

            # queue the neighborhood calculation for each station
            for idx_S, S in enumerate(station_ids):
                # look up height, latitude and longitude of the station
                station_height = OBS['height'].sel(station_id=S).data
                station_lat = OBS['lat'].sel(station_id=S).data
                station_lon = OBS['lon'].sel(station_id=S).data

                print('Neighborhood calculation for station %s queued.' % S)
                process_results.append(
                    pool.apply_async(
                        getStationNeighbors,
                        (S, gridHeightData, gridLatData, gridLonData,
                         station_height, station_lat, station_lon)))

            # aggregate results from all processes
            for ps_idx, ps_result in enumerate(process_results):
                # sync processes
                S, neighbor_data = ps_result.get()
                station_neighbors[S] = neighbor_data
                print('[Process %s] Synchronized after data creation.' %
                      ps_idx)

        with open(DESTINATION + '/station_neighbors.pkl', 'wb') as handle:
            pkl.dump(station_neighbors, handle, protocol=pkl.HIGHEST_PROTOCOL)
        print(
            'Station time invariant features have been calculated and stored.')
    else:
        with open(DESTINATION + '/station_neighbors.pkl', 'rb') as handle:
            station_neighbors = pkl.load(handle)
        print(
            'Station time invariant features have been found on disk and were therefore not created again.'
        )

    OBS.close()
    TOPO.close()

    for S in station_ids:
        temp_output_path = DESTINATION + '/temp/station_%s' % S
        if not os.path.exists(temp_output_path):
            os.makedirs(temp_output_path)

    # get all COSMO-1 files that are in the given time interval and have not yet been processed,
    # i.e. do not already exist in the output folder
    folders = DataUtils.getFilesToProcess(ADDRESSdata, DESTINATION, 'Station',
                                          DateBegin, DateEnd)
    folders.sort()

    # calculate begin and end indices to exclude files that are not in the specified time interval
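    # folder names are assumed to end with a fixed 4-character suffix (e.g. a file extension)
    # that [:-4] strips before the lexicographic comparison against DateBegin and DateEnd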
    begin, end = -1, -1
    for idx, folder in enumerate(folders):
        if folder[:-4] >= DateBegin:
            begin = idx
            break

    for idx, folder in enumerate(folders):
        if folder[:-4] <= DateEnd:
            end = idx
        else:
            break

    if begin == -1 or end == -1:
        raise Exception('Could not find start or end in array.')

    folders = folders[begin:end + 1]
    print('%s files are left to be preprocessed.' % len(folders))

    # split the folders into n_parallel approximately equal chunks
    if n_parallel <= 1:
        folder_splits = [folders]
    else:
        n_folders = len(folders)
        indices = np.linspace(0, n_folders, n_parallel + 1).astype(int)
        folder_splits = [
            folders[indices[i]:indices[i + 1]] for i in range(n_parallel)
        ]

    folder_splits = [l for l in folder_splits if len(l) > 0]
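    # worked example (illustrative only): with 10 folders and n_parallel = 3,
    # np.linspace(0, 10, 4).astype(int) yields [0, 3, 6, 10], i.e. chunks of size 3, 3 and 4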
    # take timestamp after set-up
    time_setup = time()

    # run preprocessing for all splits in parallel and keep the async results in a list to
    # synchronize them later
    with Pool(processes=n_parallel) as pool:
        process_results = []

        # start a new process with the work function for each data split
        for idx_split, split in enumerate(folder_splits):
            print('Process %s with range [%s, %s] queued.' %
                  (idx_split, split[0], split[-1]))
            process_results.append(
                pool.apply_async(
                    GetDataWrapper,
                    (idx_split, ADDRESSdata, ADDRESStopo, ADDRESSobst,
                     DESTINATION, split, station_neighbors, PredictionWindow,
                     isLocal)))

        # aggregate results from all processes
        for ps_idx, ps_result in enumerate(process_results):
            # sync processes
            result = ps_result.get()
            print('[Process %s] Synchronized after data creation.' % ps_idx)

        station_folders_paths = [
            f for f in os.listdir(DESTINATION + '/temp')
            if re.match(r'^station_([0-9]+?)$', f)
        ]

        process_results = []
        for ps_idx, station_folder in enumerate(station_folders_paths):
            print('Process %s with station folder %s queued.' %
                  (ps_idx, station_folder))
            process_results.append(
                pool.apply_async(aggregateProcessFiles,
                                 (ps_idx, DESTINATION, station_folder)))

        # aggregate results from all processes
        for ps_idx, ps_result in enumerate(process_results):
            # sync processes
            result = ps_result.get()
            print('[Process %s] Synchronized after aggregation.' % ps_idx)

    # take timestamp after completing all processes
    time_end = time()

    # dump preprocessing information in a descriptive JSON file
    preprocessing_information = {
        'data_begin': DateBegin,
        'data_end': DateEnd,
        'future_hours': PredictionWindow,
        'n_processes': n_parallel,
        'time_setup': str(timedelta(seconds=(time_setup - time_begin))),
        'time_preprocessing': str(timedelta(seconds=(time_end - time_setup)))
    }

    preprocessing_information_json = json.dumps(preprocessing_information)
    with open(DESTINATION + '/setup.json', 'w') as f:
        f.write(preprocessing_information_json)

    print('Station baseline preprocessing successfully finished in %s.' %
          str(timedelta(seconds=(time_end - time_begin))))
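# Usage sketch (all argument values below are hypothetical; the snippet does not show the
# actual call site or the expected date string format):
#
#   CreateBaselineData(DateBegin='15100100',
#                      DateEnd='15103123',
#                      PredictionWindow=[1, 3, 6, 12, 24],
#                      isLocal=False,
#                      n_parallel=8)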
Beispiel #11
0
        % (config['distance_metric'], h, m, s))

# runs a generalization experiment on stations only used in prediction
# this requires >=1 model config file in the "models" folder of an experiment and an "experiment_parameters.txt" file
# sample model configs and "experiment_parameters.txt" files can be found under /results/runs/spatial_generalization
# IMPORTANT: the "experiment_parameters.txt" file has to specify how many test stations should be used
# (and therefore left out for training) and which station is the first in consecutive order to define the test stations
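# A purely hypothetical illustration of such an "experiment_parameters.txt" (the real key
# names and format are not shown in this snippet):
#   test_stations: 10
#   first_test_station: 0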
elif options.script == 'spatialGeneralizationExperiment':
    experiment_start = time()

    config, train_test_folds, data_dictionary, data_statistics = ModelUtils.setUpModelRun(
        options=options, G=G)

    # for each run, remove the test stations specified in the config from the train set and
    # remove all train stations from the test set
    train_test_folds = DataUtils.filterUnseenTestStations(
        train_test_folds=train_test_folds, config=config)

    print('Starting to run %s' % options.script)
    print("Test Stations:", config['test_stations'])

    models = [
        f[:-4] for f in os.listdir(config['experiment_path'] + '/models')
    ]
    n_models = len(models)
    print('%s models found to run.' % n_models)
    for m_idx, m in enumerate(models):
        config['model'][
            'path'] = config['experiment_path'] + '/models/%s.txt' % m
        config['model']['name'] = m

        ModelRun.runModel(config=config,
Beispiel #12
0
def runModel(config, data_dictionary, data_statistics, train_test_folds):
    # read source and experiment paths from the config
    source_path = config['input_source']
    experiment_path = config['experiment_path']

    # override batch size, number of runs and grid size for this run
    config['batch_size'] = 1
    config['runs'] = 3
    config['grid_size'] = 9

    # load the preprocessed time invariant features per station
    with open(
            "%s/%s/grid_size_%s/time_invariant_data_per_station.pkl" %
        (config['input_source'], config['preprocessing'],
         config['original_grid_size']), "rb") as input_file:
        time_invarian_data = pkl.load(input_file)

    # initialize feature scaling function for each feature
    featureScaleFunctions = DataUtils.getFeatureScaleFunctions(
        ParamNormalizationDict, data_statistics)

    plot_config = {
        'features': config['input_parameters'],
        'time_invariant_features': config['grid_time_invariant_parameters'],
        'station_features': config['station_parameters']
    }

    # cross validation
    for run in range(config['runs']):
        print('[Run %s] Cross-validation test fold %s' %
              (str(run + 1), str(run + 1)))

        # take the right preprocessed train/test data set for the current run
        train_fold, test_fold = train_test_folds[run]

        # initialize train and test dataloaders
        trainset = DataLoaders.SinglePredictionCosmoData(
            config=config,
            station_data_dict=data_dictionary,
            files=train_fold,
            featureScaling=featureScaleFunctions,
            time_invariant_data=time_invarian_data)
        trainloader = DataLoader(trainset,
                                 batch_size=config['batch_size'],
                                 shuffle=True,
                                 num_workers=config['n_loaders'],
                                 collate_fn=DataLoaders.collate_fn)

        testset = DataLoaders.SinglePredictionCosmoData(
            config=config,
            station_data_dict=data_dictionary,
            files=test_fold,
            featureScaling=featureScaleFunctions,
            time_invariant_data=time_invarian_data)
        testloader = DataLoader(testset,
                                batch_size=config['batch_size'],
                                shuffle=True,
                                num_workers=config['n_loaders'],
                                collate_fn=DataLoaders.collate_fn)

        train_features = [[] for _ in trainset.parameters]
        train_time_invariant_grid_features = [
            [] for _ in trainset.grid_time_invariant_parameters
        ]
        train_station_features = [[] for _ in trainset.station_parameters]
        train_labels = []
        # loop over complete train set
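        # Bip2d is assumed to be laid out as [batch, channel, height, width], with the weather
        # parameters in the first trainset.n_parameters channels followed by the grid
        # time-invariant channels; this is inferred from the slicing below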
        for i, data in enumerate(trainloader, 0):
            try:
                # get training batch, i.e. label, COSMO-1 output and external features
                Blabel, Bip2d, StationTimeInv = data
            except ValueError:
                # if the batch size is small, it can happen that all labels are corrupted and
                # collate_fn returns an empty list
                print('Value error...')
                continue

            train_labels += list(Blabel.numpy().flatten())
            for feature_idx, _ in enumerate(trainset.parameters):
                train_features[feature_idx] += list(
                    Bip2d[:, feature_idx, :, :].numpy().flatten())
            for ti_feature_idx, _ in enumerate(
                    trainset.grid_time_invariant_parameters):
                train_time_invariant_grid_features[ti_feature_idx] += list(
                    Bip2d[:, trainset.n_parameters +
                          ti_feature_idx, :, :].numpy().flatten())
            for station_feature_idx, _ in enumerate(
                    trainset.station_parameters):
                train_station_features[station_feature_idx] += list(
                    StationTimeInv[:, station_feature_idx].numpy().flatten())

        test_features = [[] for _ in testset.parameters]
        test_time_invariant_grid_features = [
            [] for _ in testset.grid_time_invariant_parameters
        ]
        test_station_features = [[] for _ in testset.station_parameters]
        test_labels = []
        # loop over complete test set
        for i, data in enumerate(testloader, 0):
            try:
                # get test batch, i.e. label, COSMO-1 output and external features
                Blabel, Bip2d, StationTimeInv = data
            except ValueError:
                # if the batch size is small, it can happen that all labels are corrupted and
                # collate_fn returns an empty list
                print('Value error...')
                continue

            test_labels += list(Blabel.numpy().flatten())
            for feature_idx, _ in enumerate(testset.parameters):
                test_features[feature_idx] += list(
                    Bip2d[:, feature_idx, :, :].numpy().flatten())
            for ti_feature_idx, _ in enumerate(
                    testset.grid_time_invariant_parameters):
                test_time_invariant_grid_features[ti_feature_idx] += list(
                    Bip2d[:, testset.n_parameters +
                          ti_feature_idx, :, :].numpy().flatten())
            for station_feature_idx, _ in enumerate(
                    testset.station_parameters):
                test_station_features[station_feature_idx] += list(
                    StationTimeInv[:, station_feature_idx].numpy().flatten())

        plot_config['run'] = run
        PlotUtils.plotFeatureDistribution(
            output_path=experiment_path,
            config=plot_config,
            train_features=train_features,
            train_time_invariant_grid_features=
            train_time_invariant_grid_features,
            train_station_features=train_station_features,
            train_labels=train_labels,
            test_features=test_features,
            test_time_invariant_grid_features=test_time_invariant_grid_features,
            test_station_features=test_station_features,
            test_labels=test_labels)
Beispiel #13
0
def plotPerStationPredictionRun(source_path, observation_path, n_parallel):
    # gather all models in source folder
    error_data_per_run_dict = defaultdict()
    for path in glob.glob(source_path + '/**/model_run_error.pkl', recursive=True):
        model_name = path.split('/')[-2]
        with open(path, 'rb') as file:
            ds = pkl.load(file)
        for data_var in ds.data_vars:
            da = ds[data_var]
            try:
                error_data_per_run_dict[data_var] += [(model_name, da)]
            except KeyError:
                error_data_per_run_dict[data_var] = [(model_name, da)]

    # load observations
    OBS = xr.open_dataset(observation_path)
    # get the prediction lead time to adjust time labels
    prediciton_lead_time = ds.attrs['config']['prediction_times'][0] if 'config' in ds.attrs else 1

    for run, models in error_data_per_run_dict.items():
        # the first model's error DataArray provides the shared coordinates for this run
        reference_da = models[0][1]
        stations = reference_da.station.data
        inits = reference_da.init.data
        init_type_mapping = np.array(reference_da.init_type_mapping)
        train_indices = [idx for idx, item in enumerate(init_type_mapping) if item[1] == 'train']
        test_indices = [idx for idx, item in enumerate(init_type_mapping) if item[1] == 'test']
        sample_type_color_mapping = [mapping[1] for mapping in init_type_mapping]
        times = DataUtils.getTimeFromFileName(inits, prediciton_lead_time)
        time_labels = [str(t)[:-13] for t in times]
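        # str() of a nanosecond-precision numpy datetime64 looks like '2015-10-01T00:00:00.000000000';
        # dropping the last 13 characters is assumed to leave 'YYYY-MM-DDTHH:MM' for the labels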

        station_name_dict = get_station_dict(OBS, stations)

        model_station_mean_errors = {}
        # plot for each station the prediction run results in parallel
        with Pool(processes=n_parallel) as pool:
            process_results = []

            for station_idx, station in enumerate(stations):
                print('Plotting of prediction run for station %s queued.' % station)
                process_results.append(pool.apply_async(plotPerStationPredictionRunWorker,
                                                        (models, station, train_indices, test_indices,
                                                         station_name_dict,sample_type_color_mapping,
                                                         time_labels, source_path, run)))

            # aggregate results from all processes
            for ps_idx, ps_result in enumerate(process_results):
                # sync processes
                model_station_mean_error = ps_result.get()

                for experiment_title, station_data_list in model_station_mean_error.items():
                    try:
                        model_station_mean_errors[experiment_title] += station_data_list
                    except KeyError:
                        model_station_mean_errors[experiment_title] = station_data_list

                print('[Process %s] Synchronized after plotting station.' % ps_idx)

        run_path = source_path + '/plots/prediction_runs/%s' % run
        if not os.path.exists(run_path):
            os.makedirs(run_path)

        generateStationPredictionResultTable(output_path=run_path, results=model_station_mean_errors)
Beispiel #14
0
def plotAveragedPredictionRun(source_path):
    # gather all models in source folder
    error_data_per_run_dict = {}
    for path in glob.glob(source_path + '/**/model_run_error.pkl', recursive=True):
        model_name = path.split('/')[-2]
        with open(path, 'rb') as file:
            ds = pkl.load(file)
        for data_var in ds.data_vars:
            inits = ds[data_var].init.data
            sample_type_mapping = [mapping[1] for mapping in ds[data_var].init_type_mapping]
            prediction_data = ds[data_var].data
            try:
                error_data_per_run_dict[data_var] += [(model_name, inits, prediction_data, sample_type_mapping)]
            except KeyError:
                error_data_per_run_dict[data_var] = [(model_name, inits, prediction_data, sample_type_mapping)]


    # get the prediction lead time to adjust time labels
    prediciton_lead_time = ds.attrs['config']['prediction_times'][0] if 'config' in ds.attrs else 1

    times = DataUtils.getTimeFromFileName(inits, prediciton_lead_time)
    time_labels = [str(t)[:-13] for t in times]

    for run, run_models in error_data_per_run_dict.items():
        model_mean_errors = {}
        n_subplots = 10
        fig, axes = plt.subplots(n_subplots, figsize=(60, 20), sharey=True)
        for model_idx, model_error_data in enumerate(run_models):

            N = len(model_error_data[1])
            split_length = N // n_subplots
            ind = np.arange(N)  # the x locations for the groups
            experiment_title = model_error_data[0]
            prediction_data = model_error_data[2]
            init_type_mapping = model_error_data[3]
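            # the last axis of prediction_data is assumed to hold, in order: the model output
            # plotted per experiment (0), the COSMO-1 forecast (1), the observed reference (2)
            # and the model error (3); this is inferred from how the slices are used below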
            train_indices = [idx for idx, item in enumerate(init_type_mapping) if item == 'train']
            test_indices = [idx for idx, item in enumerate(init_type_mapping) if item == 'test']
            filtered_indices = [idx for idx, item in enumerate(init_type_mapping) if item == 'filterd']


            for i in range(n_subplots):
                # split indexes into slices for each subplot
                index_split = ind[i * split_length:(i + 1) * split_length]

                if model_idx == 0:
                    sampleTypeBackgroundColoring(axes[i], index_split,
                                                 init_type_mapping[i * split_length:(i + 1) * split_length])
                    axes[i].set_xlim([np.min(index_split), np.max(index_split)])

                axes[i].plot(index_split,
                             np.nanmean(prediction_data[i * split_length:(i + 1) * split_length,:, 0],axis=1),
                             label=experiment_title, linewidth=0.15, alpha=0.8)

            train_model_bias = np.nanmean(prediction_data[train_indices][:,:,3])
            train_model_rmse = np.sqrt(np.nanmean(np.square(prediction_data[train_indices][:,:,3])))
            train_model_mae = np.nanmean(np.absolute(prediction_data[train_indices][:,:,3]))

            test_model_bias = np.nanmean(prediction_data[test_indices][:,:,3])
            test_model_rmse = np.sqrt(np.nanmean(np.square(prediction_data[test_indices][:,:,3])))
            test_model_mae = np.nanmean(np.absolute(prediction_data[test_indices][:,:,3]))
            
            filtered_model_bias = np.nanmean(prediction_data[filtered_indices][:,:,3])
            filtered_model_rmse = np.sqrt(np.nanmean(np.square(prediction_data[filtered_indices][:,:,3])))
            filtered_model_mae = np.nanmean(np.absolute(prediction_data[filtered_indices][:,:,3]))


            model_mean_errors[experiment_title] = (train_model_bias, train_model_rmse, train_model_mae,
                                                   test_model_bias, test_model_rmse, test_model_mae,
                                                   filtered_model_bias, filtered_model_rmse, filtered_model_mae)


        # add mean errors of cosmo output predictions
        train_diff_cosmo = prediction_data[train_indices][:,:,1] - prediction_data[train_indices][:,:,2]
        train_cosmo_bias = np.nanmean(train_diff_cosmo)
        train_cosmo_rmse = np.sqrt(np.nanmean(np.square(train_diff_cosmo)))
        train_cosmo_mae = np.nanmean(np.absolute(train_diff_cosmo))

        test_diff_cosmo = prediction_data[test_indices][:,:,1] - prediction_data[test_indices][:,:,2]
        test_cosmo_bias = np.nanmean(test_diff_cosmo)
        test_cosmo_rmse = np.sqrt(np.nanmean(np.square(test_diff_cosmo)))
        test_cosmo_mae = np.nanmean(np.absolute(test_diff_cosmo))
        
        filtered_diff_cosmo = prediction_data[filtered_indices][:,:,1] - prediction_data[filtered_indices][:,:,2]
        filtered_cosmo_bias = np.nanmean(filtered_diff_cosmo)
        filtered_cosmo_rmse = np.sqrt(np.nanmean(np.square(filtered_diff_cosmo)))
        filtered_cosmo_mae = np.nanmean(np.absolute(filtered_diff_cosmo))
        
        # add COSMO-1 output prediction error
        model_mean_errors['COSMO-1'] = (train_cosmo_bias, train_cosmo_rmse, train_cosmo_mae,
                                        test_cosmo_bias, test_cosmo_rmse, test_cosmo_mae,
                                        filtered_cosmo_bias, filtered_cosmo_rmse, filtered_cosmo_mae)

        for i in range(n_subplots):
            axes[i].plot(ind[i * split_length:(i + 1) * split_length],
                         np.nanmean(prediction_data[i * split_length:(i + 1) * split_length,:, 1], axis=1), label='COSMO-1',
                         linewidth=0.15, alpha=0.8, color='b', linestyle='-.')
            axes[i].plot(ind[i * split_length:(i + 1) * split_length],
                         np.nanmean(prediction_data[i * split_length:(i + 1) * split_length, :, 2], axis=1), label='Prediction',
                         linewidth=0.15, alpha=0.8, color='m', linestyle='--')

            tick_step_size = np.maximum(split_length // 30, 1)
            axes[i].set_xticks(ind[i * split_length:(i + 1) * split_length][::tick_step_size])
            axes[i].set_xticklabels(time_labels[i * split_length:(i + 1) * split_length][::tick_step_size])
            axes[i].set_xticks(ind[i * split_length:(i + 1) * split_length], minor=True)
            # enable grid lines for both major and minor ticks
            axes[i].grid(which='both')
            # lighter minor grid, stronger major grid
            axes[i].grid(which='minor', alpha=0.2)
            axes[i].grid(which='major', alpha=0.5)

            handles, labels = axes[0].get_legend_handles_labels()

        axes[n_subplots - 1].set_xlabel('Time')
        axes[0].legend(handles, labels)
        plt.tight_layout()

        run_path = source_path + '/plots/prediction_runs/%s' % run
        if not os.path.exists(run_path):
            os.makedirs(run_path)

        fig.savefig(run_path + '/averaged_prediction.png', dpi=300)

        generatePredictionResultTable(output_path=run_path, results=model_mean_errors)