Python DataUtils Examples

Programming Language: Python

Namespace/Package Name: utils.data

Class/Type: DataUtils

Examples at hotexamples.com: 14

Python DataUtils - 14 examples found. These are the top rated real world Python examples of utils.data.DataUtils extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

getFeatureScaleFunctions(4)

getFilesToProcess(2)

getTimeFromFileName(2)

filterUnseenTestStations(1)

getDataFolds(1)

getDataStatistics(1)

getTimeInvariantStationFeatures(1)

getTrainTestFolds(1)

haversine(1)

normalizeTimeFeatures(1)

Example #1

Show file

def setUpModelRun(options, G):
    """Assemble everything a model run needs.

    Builds the run configuration via ``prepareConfig``, fills in a default
    optimizer, loads the time invariant station data and - for the "station"
    preprocessing - the complete per-station data sets, and finally obtains
    the train/test folds and the data statistics.

    Args:
        options: passed through unchanged to ``prepareConfig``.
        G: passed through unchanged to ``prepareConfig``.

    Returns:
        Tuple ``(config, train_test_folds, data_dictionary, data_statistics)``.
        ``data_dictionary`` maps station id to an in-memory copy of its data
        set, or is ``None`` when ``config['preprocessing']`` is not
        ``'station'``.

    Raises:
        ValueError: if the "station" preprocessing is selected but the list of
            stations is empty, so no init times can be determined.
    """
    # count available CUDA devices
    if torch.cuda.is_available():
        n_cuda_dev = torch.cuda.device_count()
        print('Cuda is available with %s devices.' % n_cuda_dev)

    # prepare model run config
    config = prepareConfig(options, G)

    # set standard optimizer as default, if none is specified
    if 'optimizer' not in config:
        print(
            "No optimizer config found. Using standard sgd optimizer with learning rate 0.001 and momentum 0.9."
        )
        config['optimizer'] = {
            'algorithm': 'sgd',
            'learning_rate': 0.001,
            'momentum': 0.9
        }

    # load preprocessed time invariant data per station
    # NOTE(review): pickle executes arbitrary code on load - only point
    # 'input_source' at trusted, self-generated files.
    time_invariant_path = (
        "%s/%s/grid_size_%s/time_invariant_data_per_station.pkl" %
        (config['input_source'], config['preprocessing'],
         config['original_grid_size']))
    with open(time_invariant_path, "rb") as input_file:
        time_invariant_data = pkl.load(input_file)

    # load all station ids
    config['stations'] = time_invariant_data.station.data

    # if preprocessing "station" is used, load the complete data into memory,
    # accessible by a dictionary whose keys are the station ids
    data_dictionary = None
    config['inits'] = None
    if config['preprocessing'] == 'station':
        data_dictionary = {}
        station_data = None
        for station in config['stations']:
            station_file = (config['input_source'] +
                            "/station/grid_size_%s/station_%s_data.nc" %
                            (config['original_grid_size'], station))
            # the context manager closes the file handle even if copying fails
            with xr.open_dataset(station_file) as ds:
                station_data = ds.copy(deep=True)
            data_dictionary[station] = station_data

        if station_data is None:
            raise ValueError(
                'No station data could be loaded, the list of stations is empty.'
            )

        # get all init times we have data for; read them from the in-memory
        # copy of the last station instead of the already closed file handle
        config['inits'] = station_data.coords['init'].data

    # load or generate training and test folds, this requires several
    # parameters, best described in the method itself
    train_test_folds = prepareTrainTestFolds(config)

    data_statistics = DataUtils.getDataStatistics(config=config)

    # the definitions of the grid time invariant parameters and the station
    # parameters are hard-coded
    config['grid_time_invariant_parameters'] = [
        'HH', 'HH_DIFF', 'FR_LAND', 'SOILTYP', 'LAT', 'LON', 'ABS_2D_DIST'
    ]
    config['station_parameters'] = ['height', 'lat', 'lon']

    return config, train_test_folds, data_dictionary, data_statistics

Example #2

Show file

def prepareTrainTestFolds(config):
    """Return the train/test folds that belong to the given configuration.

    The name of the fold file is derived from the run parameters in
    ``config`` (``time_serie_length`` counts as 0 when it is not configured).
    If that file exists below ``config['input_source']`` it is unpickled and
    returned; otherwise the folds are produced by
    ``DataUtils.getDataFolds`` and ``DataUtils.getTrainTestFolds``.
    """
    if 'time_serie_length' in config:
        series_length = config['time_serie_length']
    else:
        series_length = 0

    folds_path = config['input_source'] + (
        '/train_test_folds_r_%s_sl_%s_tfw_%s_tf_%s_series_%s_s_%s.pkl' %
        (config['runs'], config['slice_size'], config['test_filter_window'],
         config['test_fraction'], series_length, config['seed']))

    # an existing filtered data split is simply loaded
    if os.path.exists(folds_path):
        with open(folds_path, 'rb') as folds_file:
            loaded_folds = pkl.load(folds_file)
            print('Loaded existing train/test folds.')
            sys.stdout.flush()
        return loaded_folds

    # nothing stored yet: generate the filtered data splits for each run
    raw_folds = DataUtils.getDataFolds(config=config)
    return DataUtils.getTrainTestFolds(config=config, data_folds=raw_folds)

Example #3

Show file

def getStationNeighbors(stationId, gridHeightData, gridLatData, gridLonData,
                        station_height, station_lat, station_lon):
    """Find the grid points closest to a station, horizontally and in 3D.

    The horizontal distance of every grid point to the station is computed
    with ``DataUtils.haversine``. The "3D" neighbour minimizes the horizontal
    distance plus 500 times the absolute height difference.

    The grid shape is taken from the input arrays, so any 2D grid is
    supported (previously a 674 x 1058 grid was hard-coded).

    Returns:
        ``(stationId, (closest2d, closest3d))`` where each ``closestXd`` is a
        tuple ``(grid index, horizontal distance, height difference)`` of the
        selected grid point.
    """
    # height difference between grid heights and station height
    gridHeightDifference = gridHeightData.squeeze() - station_height

    # horizontal distance of each grid point to the station; the coordinates
    # are packed as float32 (lat, lon) records so that np.vectorize hands one
    # pair per grid point to the haversine function
    # NOTE(review): the original comment states the distance is in meters -
    # this depends on DataUtils.haversine, confirm there.
    grid_lat_lon_zip = np.array(list(zip(
        gridLatData.ravel(), gridLonData.ravel())), dtype=('float32,float32')) \
        .reshape(gridLatData.shape)
    gridHorizontalDistance = np.vectorize(
        lambda lat_lon_zip: DataUtils.haversine(lat_lon_zip[0], lat_lon_zip[
            1], station_lat, station_lon))(grid_lat_lon_zip)

    # closest grid point when only the horizontal distance counts
    closest2dId = np.unravel_index(gridHorizontalDistance.argmin(),
                                   gridHorizontalDistance.shape)

    # closest grid point when the height difference is penalized as well
    combinedDistance = (gridHorizontalDistance +
                        500 * np.abs(gridHeightDifference))
    closest3dId = np.unravel_index(combinedDistance.argmin(),
                                   combinedDistance.shape)

    return (stationId, ((closest2dId, gridHorizontalDistance[closest2dId],
                         gridHeightDifference[closest2dId]),
                        (closest3dId, gridHorizontalDistance[closest3dId],
                         gridHeightDifference[closest3dId])))

Example #4

Show file

File: DataLoaders.py Project: niwein/thesis_nwp_correction

    def __getitem__(self, item):
        """Return the data point with index ``item``.

        Loads the data of the ``(stationId, init)`` pair stored in
        ``self.files[item]``, scales the selected grid features and combines
        them with the time invariant grid features of the station.

        Returns:
            ``None`` if the station label is corrupted (a value below -1e10 or
            NaN); such items are expected to be filtered out later by the
            custom ``collate_fn``. Otherwise the tuple
            ``(label, grid_input, time_features, station_features,
            (init, stationId, raw_temperature))``.

        Raises:
            IndexError: if ``item`` is not smaller than the data set length.
        """
        # 'item == len(self)' is out of range as well
        if item >= len(self):
            raise IndexError('Tried to get data point out of range.')

        stationId, init = self.files[item]
        stationId = int(stationId)
        self.loadData((stationId, init))

        Label = self.da.temp_station.data
        # if target is corrupted return None. This is later filtered out by the custom "collate_fn()" method
        if np.min(Label) < -1e10 or np.isnan(Label).any():
            return None

        # if this lookup throws an AttributeError, we are in the first execution of "__getitem__" and thus, we first
        # have to define the indices of the desired leads, parameters and grid bounds for a fast access of the data
        # directly by indices in the following calls.
        try:
            IP2d = self.da.cosmo_data.data[
                self.lead_idx, self.lower_grid_bound:self.upper_grid_bound,
                self.lower_grid_bound:self.upper_grid_bound][:, :, self.
                                                             parameter_idx]
        except AttributeError:
            self.calculateLowerAndUpperGridBound()
            all_leads = list(self.da.coords['lead'].data)
            all_parameters = list(self.da.coords['feature'].data)
            self.lead_idx = all_leads.index(self.lead_time)
            self.prediction_idx = sorted(
                all_leads.index(pt) for pt in self.prediction_times)
            self.parameter_idx = sorted(
                all_parameters.index(p) for p in self.parameters)
            IP2d = self.da.cosmo_data.data[
                self.lead_idx, self.lower_grid_bound:self.upper_grid_bound,
                self.lower_grid_bound:self.upper_grid_bound][:, :, self.
                                                             parameter_idx]

        # keep un-normalized temperature input from COSMO-1
        # NOTE(review): feature index 4 is presumably the temperature and the
        # offset converts Kelvin to degree Celsius - confirm against the
        # feature coordinate of the data set.
        TEMP_RAW = np.copy(self.da.cosmo_data.data[self.prediction_idx,
                                                   self.closest_point_index,
                                                   self.closest_point_index,
                                                   4] - 273.15)

        # IP2d was selected with an index list and therefore is a copy, so
        # scaling it in place does not touch the loaded data
        for p_idx in range(self.n_parameters):
            IP2d[:, :, p_idx] = self.featureScaling[p_idx](IP2d[:, :, p_idx])

        # add temperature of lead time 0 (initial time of the model run) regardless of the lead time of the prediction
        TEMP_T0 = self.featureScaling[4](self.da.cosmo_data.data[
            0, self.lower_grid_bound:self.upper_grid_bound,
            self.lower_grid_bound:self.upper_grid_bound][:, :, [4]])
        IP2d = np.concatenate((IP2d, TEMP_T0), 2)

        # for the CNN approach we need a structure like (batch_item, features, lat, lon)
        IP2d = np.rollaxis(IP2d, 2, 0)

        # copy the time features: indexing with a single lead index yields a
        # view, and normalizing that view in place would alter the loaded
        # data, so a later access of the same data would be normalized again
        TimeFeatures = np.copy(self.da.time_data.data[self.lead_idx])
        # TODO at the moment preprocessed data with grid size 3 has normalization of time features and grid size 1 has not
        if self.master_grid_size == 3:
            TimeFeatures[:-1] = DataUtils.normalizeTimeFeatures(
                TimeFeatures[:-1])

        # get time invariant data for station if already calculated once
        try:
            (TimeInvGrid,
             TimeInvStation) = self.station_time_invariant_grid_data[stationId]
        except LookupError:
            # calculate time invariant data for station for the first time it is used for this station
            station_data = self.time_invariant_data.sel(station=stationId)
            TimeInvStation = station_data.station_position.sel(
                positinal_attribute=['height', 'lat', 'lon']).data
            TimeInvGrid = np.rollaxis(
                station_data.grid_data.sel(
                    feature=self.grid_time_invariant_parameters).data[
                        self.lower_grid_bound:self.upper_grid_bound,
                        self.lower_grid_bound:self.upper_grid_bound][..., ], 2,
                0)
            self.station_time_invariant_grid_data[stationId] = (TimeInvGrid,
                                                                TimeInvStation)

        return Label[self.prediction_idx], np.concatenate(
            (IP2d, TimeInvGrid),
            0), TimeFeatures, TimeInvStation, (init, stationId, TEMP_RAW)

Example #5

Show file

def runModel(config, data_dictionary, data_statistics, train_test_folds):
    """Evaluate the stored model of every cross-validation run on all data.

    For each run the checkpoint below ``<output_path>/stored_models/run_<run>``
    is loaded and the network is evaluated (no training) on every
    ``(station, init)`` combination. Per init and station five values are
    kept: prediction, cosmo, target, difference and target_temperature.
    The results of all runs are collected in one xarray data set and pickled
    to ``<output_path>/model_run_error.pkl``.

    Args:
        config: run configuration; it is modified in place (``'model'`` is
            replaced by the loaded model definition, ``'code_commit'`` is
            added and the time invariant parameter lists may be emptied).
        data_dictionary: passed to the data set as ``station_data_dict``.
        data_statistics: used to build the feature scaling functions.
        train_test_folds: per run a ``(train_fold, test_fold)`` pair; the
            second entry of each fold item is used as its init time.

    Raises:
        Exception: if the output folder of the training run does not exist or
            a batch has an unknown format.
    """
    program_start_time = time()

    # read the model definition
    with open(config['model']['path']) as handle:
        ModelDict = json.loads(handle.read())

    # check if station and grid time invariant features should be used and set the list of desired parameters
    if not ModelDict.get('grid_time_invariant'):
        config['grid_time_invariant_parameters'] = []
    if not ModelDict.get('station_time_invariant'):
        config['station_parameters'] = []

    # update general static model information
    # (experiment_info is the same object as config, so config is updated)
    experiment_info = config
    experiment_info['model'] = ModelDict
    experiment_info['code_commit'] = ModelUtils.get_git_revision_short_hash()

    # load time invariant features
    with open("%s/%s/grid_size_%s/time_invariant_data_per_station.pkl" % (
            config['input_source'], config['preprocessing'], config['original_grid_size']), "rb") as input_file:
        time_invariant_data = pkl.load(input_file)

    # initialize feature scaling function for each feature
    featureScaleFunctions = DataUtils.getFeatureScaleFunctions(ModelUtils.ParamNormalizationDict, data_statistics)

    # get optimizer config
    optimizer_config = config['optimizer']

    # the output path of the training run, whose stored models are evaluated
    setting_string = '%s_grid_%s_bs_%s_tf_%s_optim_%s_lr_%s_sl_%s' % (
        config['model']['name'], config['grid_size'], config['batch_size'], config['test_fraction'],
        optimizer_config['algorithm'], optimizer_config['learning_rate'], config['slice_size'])
    output_path = '%s/%s' % (config['experiment_path'], setting_string)
    if not os.path.exists(output_path):
        raise Exception('No folder of training run has been found for "%s"' % output_path)

    ds = xr.Dataset()

    # cross validation
    for run in range(config['runs']):
        print('[Run %s] Cross-validation test fold %s' % (str(run + 1), str(run + 1)))

        stations = sorted(config['stations'])

        # take the right preprocessed train/test data set for the current run
        train_fold, test_fold = train_test_folds[run]

        # get all inits
        all_inits_set = set(config['inits'])

        # get train and test inits
        train_inits_set = {t[1] for t in train_fold}
        test_inits_set = {t[1] for t in test_fold}

        # all inits that are neither used for training nor for testing
        filtered_inits = all_inits_set - train_inits_set - test_inits_set

        # make sure that all sets are distinct: the symmetric difference only
        # equals the set of all inits if train and test inits do not overlap
        # and both are part of all inits
        assert filtered_inits ^ train_inits_set ^ test_inits_set == all_inits_set

        init_type_mapping = dict.fromkeys(train_inits_set, 'train')
        init_type_mapping.update(dict.fromkeys(test_inits_set, 'test'))
        init_type_mapping.update(dict.fromkeys(filtered_inits, 'filterd'))

        all_inits = sorted(all_inits_set)
        all_data = [(station, init) for init in all_inits for station in stations]

        n_data_points = len(all_data)

        # keep mappings from init and station to index of result numpy array
        station_index_dict = {station: station_idx for station_idx, station in enumerate(stations)}
        init_index_dict = {init: init_idx for init_idx, init in enumerate(all_inits)}

        # initialize the data loader over the complete data set
        dataset = DataLoaders.ErrorPredictionCosmoData(
            config=config,
            station_data_dict=data_dictionary,
            files=all_data,
            featureScaling=featureScaleFunctions,
            time_invariant_data=time_invariant_data)
        dataloader = DataLoader(dataset, batch_size=config['batch_size'], shuffle=False,
                                num_workers=config['n_loaders'], collate_fn=DataLoaders.collate_fn)

        # initialize network and optimizer
        net = Baseline.model_factory(model_dict=ModelDict, params=dataset.n_parameters,
                                     time_invariant_params=dataset.n_grid_time_invariant_parameters,
                                     grid=config['grid_size'], prediction_times=config['prediction_times'])

        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)

        # NOTE(review): always SGD, the configured optimizer algorithm is not
        # considered here; the optimizer is only needed to load the checkpoint
        optimizer = optim.SGD(net.parameters(), lr=optimizer_config['learning_rate'],
                              momentum=optimizer_config['momentum'])

        net, optimizer, *_ = ModelUtils.load_checkpoint(output_path + '/stored_models/run_%s' % run, model=net,
                                                        optimizer=optimizer)

        if torch.cuda.is_available():
            net.cuda()

        # we do not train, but only output the evaluation of the network on train and test data
        net.eval()

        # initialize result array of errors per init and station with NaN
        run_error_statistics = np.full((len(init_index_dict), len(station_index_dict), 5), np.nan)

        # number of batches after which the progress is printed
        progress_step = max(1, (n_data_points // config['batch_size']) // 100)

        # loop over complete data set; gradients are not needed for evaluation
        with torch.no_grad():
            for i, data in enumerate(dataloader, 0):
                try:
                    # get batch, e.g. label, cosmo-1 output and time inv. features for station
                    DATA = data
                    # DATA has only length 4 if we do not use the station time invariant features
                    if len(DATA) == 4:
                        Blabel, Bip2d, BTimeData, init_station_temp = DATA
                        station_time_inv_input = None
                    elif len(DATA) == 5:
                        Blabel, Bip2d, BTimeData, StationTimeInv, init_station_temp = DATA
                        station_time_inv_input = ModelUtils.getVariable(StationTimeInv).float()
                    else:
                        raise Exception('Unknown data format for training...')
                    grid_input = ModelUtils.getVariable(Bip2d).float()
                    time_data = ModelUtils.getVariable(BTimeData).float()
                    target = ModelUtils.getVariable(Blabel).float()

                except TypeError:
                    # when the batch size is small, it could happen, that all labels have been corrupted and therefore
                    # collate_fn would return an empty list
                    print('Value error...')
                    continue

                # flatten to one value per sample; unlike squeeze() this keeps
                # the sample dimension for a batch with a single sample
                out = net(grid_input, time_data, station_time_inv_input).reshape(-1)
                target = target.reshape(-1)
                diff = out - target

                for item in range(Blabel.shape[0]):
                    init = init_station_temp[0][item]
                    station = init_station_temp[1][item].item()
                    cosmo_temperature = init_station_temp[2][item].item()
                    target_temperature = init_station_temp[3][item].item()
                    station_idx = station_index_dict[station]
                    init_idx = init_index_dict[init]
                    run_error_statistics[init_idx, station_idx, :] = np.array(
                        (out[item].item(), cosmo_temperature, target[item].item(), diff[item].item(),
                         target_temperature))

                processed_samples = (i + 1) * int(config['batch_size'])
                if (i + 1) % progress_step == 0:
                    print("%s samples have been processed. [%2.1f%%]" % (
                        processed_samples, (processed_samples / n_data_points) * 100))
                    sys.stdout.flush()

        da = xr.DataArray(run_error_statistics, dims=('init', 'station', 'data'),
                          coords=[all_inits, stations,
                                  ['prediction', 'cosmo', 'target', 'difference', 'target_temperature']])
        da = da.sortby(variables='init')
        da.attrs['init_type_mapping'] = sorted(init_type_mapping.items())

        ds['run_%s' % run] = da
        ds.attrs['config'] = config

        print('Error results of run %s have been processed.' % run)
        # flush output to see progress
        sys.stdout.flush()

    # the folder could have been removed while the runs were evaluated
    if not os.path.exists(output_path):
        raise Exception('No folder of training run has been found for "%s"' % output_path)

    # dump experiment statistic
    with open(output_path + '/model_run_error.pkl', 'wb') as handle:
        pkl.dump(ds, handle, protocol=pkl.HIGHEST_PROTOCOL)

    # print program execution time
    m, s = divmod(time() - program_start_time, 60)
    h, m = divmod(m, 60)
    print('Experiment has successfully finished in %dh %02dmin %02ds' % (h, m, s))

Example #6

Show file

File: ModelRun.py Project: niwein/thesis_nwp_correction

def runModel(config, data_dictionary, data_statistics, train_test_folds):
    program_start_time = time()

    # assign all program arguments to local variables
    with open(config['model']['path']) as handle:
        ModelDict = json.loads(handle.read())

    # check if station and grid time invariant features should be used and set the list of desired parameters
    if not ('grid_time_invariant' in ModelDict and ModelDict['grid_time_invariant']): config['grid_time_invariant_parameters'] =[]
    if not ('station_time_invariant' in ModelDict and ModelDict['station_time_invariant']): config['station_parameters'] = []

    # update general static model information
    experiment_info = config
    experiment_info['model'] = ModelDict
    experiment_info['code_commit'] = ModelUtils.get_git_revision_short_hash()


    # if needed, load time invariant features
    with open("%s/%s/grid_size_%s/time_invariant_data_per_station.pkl" % (config['input_source'], config['preprocessing'], config['original_grid_size']), "rb") as input_file:
        time_invarian_data = pkl.load(input_file)


    # initialize feature scaling function for each feature
    featureScaleFunctions = DataUtils.getFeatureScaleFunctions(ModelUtils.ParamNormalizationDict, data_statistics)

    # get optimizer config
    optimizer_config = config['optimizer']

    # generate output path for experiment information
    setting_string = '%s_grid_%s_bs_%s_tf_%s_optim_%s_lr_%s_sl_%s' % (
        config['model']['name'], config['grid_size'], config['batch_size'], config['test_fraction'], optimizer_config['algorithm'], optimizer_config['learning_rate'], config['slice_size'])
    output_path = '%s/%s' % (config['experiment_path'], setting_string)
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    # time for the set up until first run
    experiment_info['set_up_time'] = time() - program_start_time
    print('[Time]: Set-up %s' % strftime("%H:%M:%S", gmtime(experiment_info['set_up_time'])))
    sys.stdout.flush()

    # initialize statistics
    error_statistics = None
    run_times = None
    skip_statistics = None
    if 'per_station_rmse' in config:
        error_per_station_statistics = None

    # keep used learning rates
    experiment_info['scheduled_learning_rates'] = []

    # cross validation
    for run in range(config['runs']):
        # logger  for tensorboardX
        train_logger = Logger(output_path + '/logs/run_%s/train' % run)
        test_logger = Logger(output_path + '/logs/run_%s/test' % run)

        print('[Run %s] Cross-validation test fold %s' % (str(run + 1), str(run + 1)))

        # take the right preprocessed train/test data set for the current run
        train_fold, test_fold = train_test_folds[run]

        # initialize best epoch test error
        best_epoch_test_rmse = float("inf")

        # use different data loader if we want to train a 3nn model approach
        if "knn" in ModelDict:
            # initialize train and test dataloaders
            trainset = DataLoaders.CosmoData3NNData(
                config=config,
                station_data_dict=data_dictionary,
                files=train_fold,
                featureScaling=featureScaleFunctions,
                time_invariant_data=time_invarian_data)
            trainloader = DataLoader(trainset, batch_size=config['batch_size'], shuffle=True,
                                     num_workers=config['n_loaders'], collate_fn=DataLoaders.collate_fn)

            testset = DataLoaders.CosmoData3NNData(
                config=config,
                station_data_dict=data_dictionary,
                files=test_fold,
                featureScaling=featureScaleFunctions,
                time_invariant_data=time_invarian_data)
            testloader = DataLoader(testset, batch_size=config['batch_size'], shuffle=True,
                                    num_workers=config['n_loaders'], collate_fn=DataLoaders.collate_fn)
        else:
            # initialize train and test dataloaders
            trainset = DataLoaders.CosmoDataGridData(
                config=config,
                station_data_dict=data_dictionary,
                files=train_fold,
                featureScaling=featureScaleFunctions,
                time_invariant_data=time_invarian_data)
            trainloader = DataLoader(trainset, batch_size=config['batch_size'], shuffle=True,
                                     num_workers=config['n_loaders'], collate_fn=DataLoaders.collate_fn)

            testset = DataLoaders.CosmoDataGridData(
                config=config,
                station_data_dict=data_dictionary,
                files=test_fold,
                featureScaling=featureScaleFunctions,
                time_invariant_data=time_invarian_data)
            testloader = DataLoader(testset, batch_size=config['batch_size'], shuffle=True,
                                    num_workers=config['n_loaders'], collate_fn=DataLoaders.collate_fn)

        # initialize network, optimizer and loss function
        net = Baseline.model_factory(ModelDict, trainset.n_parameters, trainset.n_grid_time_invariant_parameters,
                                     config['grid_size'], config['prediction_times'])
        # store class name
        experiment_info['model_class'] = net.__class__.__name__

        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)

        if torch.cuda.is_available():
            net.cuda()

        # load number of train and test samples
        n_train_samples, n_test_samples = len(train_fold), len(test_fold)

        optimizer, scheduler = ModelUtils.initializeOptimizer(optimizer_config, net)
        criterion = nn.MSELoss()

        # keep number of processed smaples over all epochs for tensorboard
        processed_train_samples_global = 0
        processed_test_samples_global = 0

        # start learning
        for epoch in range(config['epochs']):
            epoch_train_time = np.zeros((5,))
            epoch_start_time = time()
            print('Epoch: ' + str(epoch + 1) + '\n------------------------------------------------------------')

            # adapt learning rate and store information in experiment attributes
            if scheduler is not None:
                scheduler.step()
                if run == 0: experiment_info['scheduled_learning_rates'] += scheduler.get_lr()
                print('Using learning rate %s' % str(scheduler.get_lr()))

            # TRAINING
            # initialize variables for epoch statistics
            LABELS, MODELoutputs, COSMOoutputs = None, None, None
            processed_train_samples = 0
            net.train(True)

            train_start_time = time()
            # loop over complete train set
            for i, data in enumerate(trainloader, 0):
                time_start = time()
                try:
                    # get training batch, e.g. label, cosmo-1 output and time inv. features for station
                    DATA = data
                    # DATA has only length 4 if we do not use the station time invariant features
                    if len(DATA) == 4:
                        Blabel, Bip2d, BTimeData, init_station_temp = DATA
                        station_time_inv_input = None
                    elif len(DATA) == 5:
                        Blabel, Bip2d, BTimeData, StationTimeInv, init_station_temp = DATA
                        station_time_inv_input = ModelUtils.getVariable(StationTimeInv).float()
                    else:
                        raise Exception('Unknown data format for training...')
                    input = ModelUtils.getVariable(Bip2d).float()
                    time_data = ModelUtils.getVariable(BTimeData).float()
                    target = ModelUtils.getVariable(Blabel).float()

                except TypeError:
                    # when the batch size is small, it could happen, that all labels have been corrupted and therefore
                    # collate_fn would return an empty list
                    print('Value error...')
                    continue
                time_after_data_preparation = time()

                processed_train_samples += len(Blabel)

                optimizer.zero_grad()
                out = net(input, time_data, station_time_inv_input)
                time_after_forward_pass = time()
                loss = criterion(out, target)
                loss.backward()
                optimizer.step()
                time_after_backward_pass = time()

                if LABELS is None:
                    LABELS = Blabel.data
                    MODELoutputs = out.data
                    COSMOoutputs = init_station_temp[2].data
                else:
                    LABELS = np.vstack((LABELS, Blabel.data))
                    MODELoutputs = np.vstack((MODELoutputs, out.data))
                    COSMOoutputs = np.vstack((COSMOoutputs, init_station_temp[2].data))

                time_after_label_stack = time()

                if (i + 1) % 64 == 0:

                    print('Sample: %s \t Loss: %s' % (processed_train_samples, float(np.sqrt(loss.data))))

                    # ============ TensorBoard logging ============#
                    # (1) Log the scalar values
                    info = {
                        setting_string: np.sqrt(loss.item()),
                    }

                    for tag, value in info.items():
                        train_logger.scalar_summary(tag, value, processed_train_samples_global + processed_train_samples)

                    # (2) Log values and gradients of the parameters (histogram)
                    for tag, value in net.named_parameters():
                        tag = tag.replace('.', '/')
                        train_logger.histo_summary(tag, ModelUtils.to_np(value), i + 1)
                        train_logger.histo_summary(tag + '/grad', ModelUtils.to_np(value.grad), i + 1)

                    epoch_train_time += np.array((time_start - time_end,
                                                  time_after_data_preparation - time_start,
                                                  time_after_forward_pass - time_after_data_preparation,
                                                  time_after_backward_pass - time_after_forward_pass,
                                                  time_after_label_stack - time_after_backward_pass))

                time_end = time()

            # calculate error statistic of current epoch
            diff_model = MODELoutputs - LABELS
            diff_cosmo = COSMOoutputs - LABELS
            epoch_train_rmse_model = np.apply_along_axis(func1d=ModelUtils.rmse, arr=diff_model, axis=0)
            epoch_train_rmse_cosmo = np.apply_along_axis(func1d=ModelUtils.rmse, arr=diff_cosmo, axis=0)


            # update global processed samples
            processed_train_samples_global += processed_train_samples

            if np.isnan(epoch_train_rmse_model).any():
                print("Learning rate too large resulted in NaN-error while training. Stopped training...")
                return
            # print epoch training times
            print('Timing: Waiting on data=%s, Data Preparation=%s,'
                  'Forward Pass=%s, Backward Pass=%s, Data Stacking=%s' % tuple(list(epoch_train_time / len(epoch_train_time))))

            # RMSE of epoch
            print('Train/test statistic for epoch: %s' % str(epoch + 1))
            print('Train RMSE COSMO: ' , ", ".join(["T=%s: %s" % (idx, epoch_train_rmse_cosmo[idx]) for idx in range(len(epoch_train_rmse_cosmo))]))
            print('Train RMSE Model: ' , ", ".join(["T=%s: %s" % (idx, epoch_train_rmse_model[idx]) for idx in range(len(epoch_train_rmse_model))]))
            sys.stdout.flush()

            train_time = time() - train_start_time

            # TESTING
            test_start_time = time()

            LABELS, MODELoutputs, COSMOoutputs, STATION = None, None, None, None
            processed_test_samples = 0
            net.eval()
            for i, data in enumerate(testloader, 0):
                try:
                    # get training batch, e.g. label, cosmo-1 output and time inv. features for station
                    DATA = data
                    # DATA has only length 4 if we do not use the station time invariant features
                    if len(DATA) == 4:
                        Blabel, Bip2d, BTimeData, init_station_temp = DATA
                        station_time_inv_input = None
                    elif len(DATA) == 5:
                        Blabel, Bip2d, BTimeData, StationTimeInv, init_station_temp = DATA
                        station_time_inv_input = ModelUtils.getVariable(StationTimeInv).float()
                    else:
                        raise Exception('Unknown data format for training...')
                    input = ModelUtils.getVariable(Bip2d).float()
                    time_data = ModelUtils.getVariable(BTimeData).float()
                    target = ModelUtils.getVariable(Blabel).float()

                except TypeError:
                    # when the batch size is small, it could happen, that all labels have been corrupted and therefore
                    # collate_fn would return an empty list
                    print('Value error...')
                    continue

                processed_test_samples += len(Blabel)

                out = net(input, time_data, station_time_inv_input)
                loss = criterion(out, target)

                if LABELS is None:
                    LABELS = Blabel.data
                    MODELoutputs = out.data
                    COSMOoutputs = init_station_temp[2].data
                    STATION = init_station_temp[1].data
                else:
                    LABELS = np.vstack((LABELS, Blabel.data))
                    MODELoutputs = np.vstack((MODELoutputs, out.data))
                    COSMOoutputs = np.vstack((COSMOoutputs, init_station_temp[2].data))
                    STATION = np.hstack((STATION, init_station_temp[1].data))

                if i % 16:
                    # ============ TensorBoard logging ============#
                    # (1) Log the scalar values
                    info = {
                        setting_string: np.sqrt(loss.item()),
                    }

                    for tag, value in info.items():
                        test_logger.scalar_summary(tag, value, processed_test_samples_global + processed_test_samples)

            # calculate error statistic of current epoch
            diff_model = MODELoutputs - LABELS
            diff_cosmo = COSMOoutputs - LABELS

            # rmse
            epoch_test_rmse_model = np.apply_along_axis(func1d=ModelUtils.rmse, arr=diff_model, axis=0)
            epoch_test_rmse_cosmo = np.apply_along_axis(func1d=ModelUtils.rmse, arr=diff_cosmo, axis=0)
            overall_test_rmse_model = ModelUtils.rmse(diff_model)
            overall_test_rmse_cosmo = ModelUtils.rmse(diff_cosmo)

            # mae
            epoch_test_mae_model = np.apply_along_axis(func1d=ModelUtils.mae, arr=diff_model, axis=0)
            epoch_test_mae_cosmo = np.apply_along_axis(func1d=ModelUtils.mae, arr=diff_cosmo, axis=0)
            overall_test_mae_model = ModelUtils.mae(diff_model)
            overall_test_mae_cosmo = ModelUtils.mae(diff_cosmo)

            # calculate per station rmse if desired (especially for K-fold station generalization experiment
            if "per_station_rmse" in config:
                max_station_id = 1435

                squared_errors_per_epoch = np.array((np.square(diff_model), np.square(diff_cosmo))).squeeze()

                # the highest index of data is 1435, thus we expect at least 1435 entries, which we can access by
                # station id
                test_samples_per_station = np.bincount(STATION, minlength=max_station_id+1)
                model_squared_error_per_station = np.bincount(STATION, weights=squared_errors_per_epoch[0], minlength=max_station_id+1)
                cosmo_squared_error_per_station = np.bincount(STATION, weights=squared_errors_per_epoch[1], minlength=max_station_id+1)

                # set division by zero/NaN warning to 'ignore'
                np.seterr(divide='ignore', invalid='ignore')

                # calculate rmse per station
                rmse_per_station = np.vstack((np.sqrt(np.divide(model_squared_error_per_station, test_samples_per_station)),
                                              np.sqrt(np.divide(cosmo_squared_error_per_station, test_samples_per_station)))).T

                # set division by zero/NaN warning to 'warn'
                np.seterr(divide='warn', invalid='warn')






            # update global processed samples
            processed_test_samples_global += processed_test_samples

            # RMSE of epoch
            print('Test RMSE COSMO: ', ", ".join(
                ["T=%s: %s" % (idx, epoch_test_rmse_cosmo[idx]) for idx in range(len(epoch_test_rmse_cosmo))]),
                  " (Overall: %s" % overall_test_rmse_cosmo)
            print('Test RMSE Model: ' , ", ".join(["T=%s: %s" % (idx, epoch_test_rmse_model[idx]) for idx in range(len(epoch_test_rmse_model))]),
                  " (Overall: %s" % overall_test_rmse_model)
            # mae of epoch
            print('Test MAE COSMO: ', ", ".join(
                ["T=%s: %s" % (idx, epoch_test_mae_cosmo[idx]) for idx in range(len(epoch_test_mae_cosmo))]),
                  " (Overall: %s" % overall_test_mae_cosmo)
            print('Test MAE Model: ' , ", ".join(["T=%s: %s" % (idx, epoch_test_mae_model[idx]) for idx in range(len(epoch_test_mae_model))]),
                  " (Overall: %s" % overall_test_mae_model)

            sys.stdout.flush()

            test_time = time() - test_start_time

            # time for epoch
            epoch_time = time() - epoch_start_time

            # update error statistics
            error_statistics = ModelUtils.updateErrorStatistic(error_statistics,
                                                               np.array([epoch_train_rmse_model, epoch_test_rmse_model])[None, None, ...],
                                                               run, epoch, config['prediction_times'])
            # update run times statistic
            run_times = ModelUtils.updateRuntimeStatistic(run_times, np.array([epoch_time, train_time, test_time])[None, None, ...],
                                                          run, epoch)
            # update skip statistic
            skip_statistics = ModelUtils.updateSkipStatistic(skip_statistics,
                                                             np.array([n_train_samples, processed_train_samples,
                                                                       n_test_samples, processed_test_samples])[None, None, ...],
                                                             run, epoch)

            # update per station rmse data array over runs if desired (especially for K-fold station generalization experiment
            if "per_station_rmse" in config:
                error_per_station_statistics = ModelUtils.updatePerStationErrorStatistic(error_per_station_statistics, rmse_per_station, run, epoch, np.arange(max_station_id+1))

            # store model if it was the best yes
            is_best = overall_test_rmse_model <= best_epoch_test_rmse
            best_epoch_test_rmse = min(overall_test_rmse_model, best_epoch_test_rmse)
            ModelUtils.save_checkpoint({
                'epoch': epoch,
                'run': run,
                'arch': net.__class__.__name__,
                'state_dict': net.state_dict(),
                'overall_test_rmse': overall_test_rmse_model,
                'lead_test_rmse' : overall_test_rmse_model,
                'best_epoch_test_rmse': best_epoch_test_rmse,
                'optimizer': optimizer.state_dict(),
            }, is_best, output_path + '/stored_models/run_%s' % run)

            # flush output to see progress
            sys.stdout.flush()

    # update statistics dict
    ModelUtils.get_model_details(experiment_info, net, optimizer, criterion)

    # complete program runtime
    experiment_info['program_runtime'] = time() - program_start_time

    # generate data set of all experiment statistics and additional information
    experiment_statistic = xr.Dataset({
        'error_statistic' : error_statistics,
        'run_time_statistic': run_times,
        'samples_statistic' : skip_statistics}).assign_attrs(experiment_info)

    # dump experiment statistic
    with open(output_path + '/experiment_statistic.pkl', 'wb') as handle:
        pkl.dump(experiment_statistic, handle, protocol=pkl.HIGHEST_PROTOCOL)

    if 'per_station_rmse' in config:
        # dump experiment statistic
        with open(output_path + '/rmse_per_station.pkl', 'wb') as handle:
            pkl.dump(error_per_station_statistics, handle, protocol=pkl.HIGHEST_PROTOCOL)

    # print program execution time
    m, s = divmod(experiment_info['program_runtime'], 60)
    h, m = divmod(m, 60)
    print('Experiment has successfully finished in %dh %02dmin %02ds' % (h, m, s))

Example #7

Show file

File: CreateDataByStationAndInit.py Project: niwein/thesis_nwp_correction

def CreateDataByStationAndInit(GridSize, DateBegin, DateEnd, PredictionWindow,
                               ListParam, WithTopo, TopoListParam, isLocal,
                               n_parallel):
    """Preprocess COSMO-1 outputs per station and initialization in parallel.

    Creates one output folder per station below the destination directory,
    determines the COSMO-1 folders that still have to be processed, splits
    them into ``n_parallel`` approximately equal parts and runs ``GetData`` on
    each part in a worker pool. Finally a descriptive ``setup.json`` is written
    into the destination directory.

    Args:
        GridSize: (int) side length of the square around each station; must be
            an odd number.
        DateBegin: lower bound of the processed interval; compared against
            ``folder[:-4]`` of each COSMO-1 folder name.
        DateEnd: upper bound of the processed interval; compared against
            ``folder[:-4]`` of each COSMO-1 folder name.
        PredictionWindow: lead times handed to ``GetData`` and stored as
            ``future_hours`` in ``setup.json``.
        ListParam: parameter list handed to ``GetData`` and stored as
            ``parameters`` in ``setup.json``.
        WithTopo: flag handed to the first worker only, since the time
            invariant data does not depend on the split.
        TopoListParam: topological parameter list handed to ``GetData``.
        isLocal: (bool) selects the local paths instead of the cluster paths.
        n_parallel: (int) number of splits / worker processes. Values <= 1
            result in a single split processed by one worker.

    Raises:
        Exception: if ``DateEnd < DateBegin`` or if no folder lies inside the
            requested interval.
        AssertionError: if ``GridSize`` is even.
    """
    time_begin = time()

    if DateEnd < DateBegin:
        raise Exception('DateEnd is smaller than DateBegin.')

    assert GridSize % 2 == 1, 'Grid size must be an odd number.'

    # different paths, whether we run the script locally or on a cluster node
    if isLocal:
        ADDRESSdata = '/home/n1no/Documents/ethz/master_thesis/code/project/data/cosmo-1/data_subset'  # COSMO-1 outputs
        ADDRESStopo = '/home/n1no/Documents/ethz/master_thesis/code/project/data'  # base address of topo files
        ADDRESSobst = '/home/n1no/Documents/ethz/master_thesis/code/project/data/observations/'  # base address of obs files
        DESTINATION = '/home/n1no/Documents/ethz/master_thesis/code/project/data/preprocessed_data/station_init/grid_size_' + str(
            GridSize)  # target directory for all generated files
    else:
        ADDRESSdata = '/mnt/data/bhendj/full/cosmo-1'  # COSMO-1 outputs
        ADDRESStopo = '/mnt/ds3lab-scratch/ninow/topo'  # base address of topo files
        ADDRESSobst = '/mnt/ds3lab-scratch/ninow/observations'  # base address of obs files
        DESTINATION = '/mnt/ds3lab-scratch/ninow/preprocessed_data/station_init/grid_size_' + str(
            GridSize)  # target directory for all generated files

    # the target directory is needed for setup.json even if there is no station
    os.makedirs(DESTINATION, exist_ok=True)

    # read the station ids; the context manager closes the file even on errors
    with xr.open_dataset(
            ADDRESSobst + '/meteoswiss_t2m_20151001-20180331.nc') as OBS:
        station_ids = OBS['station_id'].data

    # create an output folder for each station, based on the station ids
    station_paths = [DESTINATION + '/Station_' + str(S) for S in station_ids]
    for station_path in station_paths:
        # exist_ok avoids the race between an existence check and the creation
        os.makedirs(station_path, exist_ok=True)

    # get all COSMO-1 files that are in the given time interval and have not yet been processed and thus do not
    # already exists in the output folder
    folders = DataUtils.getFilesToProcess(ADDRESSdata, DESTINATION,
                                          'StationAndInit', DateBegin, DateEnd)
    folders.sort()

    # calculate begin and end index of array to exclude files, that are not in the specified time interval
    begin, end = -1, -1
    for idx, folder in enumerate(folders):
        if folder[:-4] >= DateBegin:
            begin = idx
            break

    for idx, folder in enumerate(folders):
        if folder[:-4] <= DateEnd:
            end = idx
        else:
            break

    if begin == -1 or end == -1:
        raise Exception('Could not find start or end in array.')

    folders = folders[begin:end + 1]
    print('%s files are left to be preprocessed.' % len(folders))

    # split the folders into K approx. equal splits
    if n_parallel <= 1:
        folder_splits = [folders]
    else:
        n_folders = len(folders)
        indices = np.linspace(0, n_folders, n_parallel + 1).astype(int)
        folder_splits = [
            folders[indices[i]:indices[i + 1]] for i in range(n_parallel)
        ]

    # drop empty splits, they would only spawn idle workers
    folder_splits = [split for split in folder_splits if len(split) > 0]

    # take timestamp after set-up
    time_setup = time()

    # Pool rejects a process count below 1, but n_parallel <= 1 is a valid
    # "single split" request here, so at least one worker is always started
    with Pool(processes=max(1, n_parallel)) as pool:
        # run preprocessing in parallel for all splits and keep the processes in a list to sync them later
        process_results = []
        for idx_split, split in enumerate(folder_splits):
            print('Process %s with range [%s, %s] queued.' %
                  (idx_split, split[0], split[-1]))
            # only calculate topo data by the first process, since it is invariant
            isTopo = WithTopo if idx_split == 0 else 0

            process_results.append(
                pool.apply_async(
                    GetData, (idx_split, ADDRESSdata, ADDRESStopo, ADDRESSobst,
                              DESTINATION, ListParam, TopoListParam, GridSize,
                              isTopo, split, PredictionWindow, isLocal)))

        # forces the parent process to wait on all forked children processes
        for ps_idx, ps_result in enumerate(process_results):
            # sync processes; get() also re-raises exceptions of the worker
            ps_result.get()
            print('[Process %s] Synchronized after data creation.' % ps_idx)

    # take timestamp after completing all processes
    time_end = time()

    # dump preprocessing information in a descriptive JSON file
    preprocessing_information = {
        'grid_size': GridSize,
        'data_begin': DateBegin,
        'data_end': DateEnd,
        'parameters': ListParam,
        'future_hours': PredictionWindow,
        'n_processes': n_parallel,
        'time_setup': str(timedelta(seconds=(time_setup - time_begin))),
        'time_preprocessing': str(timedelta(seconds=(time_end - time_setup)))
    }

    with open(DESTINATION + '/setup.json', 'w') as f:
        json.dump(preprocessing_information, f)

    print('Preprocessing sucessfully finished in %s.' %
          str(timedelta(seconds=(time_end - time_begin))))

Example #8

Show file

File: CreateDataByStationAndInit.py Project: niwein/thesis_nwp_correction

def GetData(processId, ADDRESSdata, ADDRESStopo, ADDRESSobst, DESTINATION,
            ListParam, TopoListParam, GridSize, WithTopo, Files,
            PredictionWindow, isLocal):
    """Preprocess one split of COSMO-1 initializations into per-station files.

    For every entry of ``Files`` and every lead time of ``PredictionWindow``
    the corresponding COSMO-1 file is opened, the ``GridSize`` x ``GridSize``
    sub-grid around each station is cut out for every parameter of
    ``ListParam`` and one pickled ``xr.Dataset`` per station is written to
    ``DESTINATION/Station_<station id>/<file[:8]>.pkl``.

    Args:
        processId: (int) the id of the process running this method.
        ADDRESSdata: (string) base path to COSMO-1 data.
        ADDRESStopo: (string) base path to all topology files.
        ADDRESSobst: (string) base path to all observation files.
        DESTINATION: (string) base path to target output folder.
        ListParam: names of the COSMO-1 variables that are read from each
            data set and stored as the ``feature`` dimension.
        TopoListParam: features handed to
            ``DataUtils.getTimeInvariantStationFeatures``.
        GridSize: (int) side length of square around each station.
        WithTopo: (bool) whether we want to generate preprocessed time
            invariant features for each station.
        Files: (list(string)) list of all files ('yymmddHH') to be processed,
            e.g. ['15031203', '15031206', ...].
        PredictionWindow: (list of int) all future hours t's [t,t+1,t+2,...]
            being processed, e.g. y_t, y_t+1,...
        isLocal: (bool) for setting the right paths if the script is running
            on a local machine or in cloud, etc.

    Returns:
        1 after all files of the split have been handled.
    """
    # path to observation and topological data
    if isLocal:
        obs_path = ADDRESSobst + '/meteoswiss_t2m_20151001-20180331.nc'
        topo_path = ADDRESStopo + '/topodata.nc'
    else:
        # to fix parallelization errors, each process gets its own set of TOPO and OBS files
        obs_path = (ADDRESSobst +
                    '/process_%s/meteoswiss_t2m_20151001-20180331.nc' % processId)
        topo_path = ADDRESStopo + '/process_%s/topodata.nc' % processId

    # the context managers close both data sets even if the preprocessing fails
    with xr.open_dataset(obs_path) as OBS, xr.open_dataset(topo_path) as TOPO:
        # load all station ids
        stationIds = OBS['station_id'].data

        # generate a view on temperature observation at each station
        TempObs = OBS['t2m'].sel(station_id=stationIds)

        # time of the first observation, earlier data points are skipped below
        first_obs_time = OBS['time'].data[0]

        # we need to localize the stations on the grid: stack lat and lon of
        # every grid point along a third axis
        GPSgrid = np.dstack((TOPO['lat'][:, :], TOPO['lon'][:, :]))
        # take the grid shape from the data instead of hard-coding 674 x 1058
        grid_shape = GPSgrid.shape[:2]
        half_size = GridSize // 2

        # a list with the (lat,lon)-id of the nearest grid point for each station
        closestGridPointPerStation = []
        # a dictionary with the sub-grid around each station
        stationSquaresDict = {}
        # generate sub-grids for each station of the closest GridSize**2 grid points
        for S in stationIds:
            # squared lat/lon difference of each grid point to the station,
            # the grid point with the smallest sum is our reference
            dist = GPSgrid - np.array(
                [[OBS['lat'].sel(station_id=S), OBS['lon'].sel(station_id=S)]])
            dist *= dist
            Id = np.unravel_index(dist.sum(axis=2).argmin(), grid_shape)

            # Id = indices of the grid point closest to the station
            closestGridPointPerStation.append(Id)

            # grid indices that form the N*N square centered on that grid point
            stationSquaresDict[S] = {
                'lat_idx': [x + Id[0] - half_size for x in range(GridSize)],
                'lon_idx': [x + Id[1] - half_size for x in range(GridSize)],
            }

        # pandas data frame with dictionary of the sub-grid for each station
        stationSquares = pd.DataFrame(data=stationSquaresDict)

        # extract time invariant features for each station and the corresponding sub-grid
        if WithTopo:
            time_invariant_path = (DESTINATION +
                                   '/time_invariant_data_per_station.pkl')
            if not os.path.exists(time_invariant_path):
                ds = DataUtils.getTimeInvariantStationFeatures(
                    TOPO=TOPO,
                    OBS=OBS,
                    stationSquares=stationSquares,
                    stationIds=stationIds,
                    closestGridPointPerStation=closestGridPointPerStation,
                    GridSize=GridSize,
                    Features=TopoListParam)
                with open(time_invariant_path, 'wb') as handle:
                    pkl.dump(ds, handle, protocol=pkl.HIGHEST_PROTOCOL)
                ds.close()
                print(
                    '[Process %s] Time invariant features have been processed and stored.'
                    % processId)
            else:
                print(
                    'Time invariant features have been found on disk and were therefore not created again.'
                )

        # we now start the iteration through: Each folder, each file, each parameter, each station
        for file in Files:  # loop over all  outputs of COSMO-1, e.g. for 3h interval every day
            try:
                # mark start of preprocessing of n-th file
                print('[Process %s] Start processing %s' % (processId, file))

                # initialize data variables
                DATA = np.zeros(
                    (len(stationIds), len(PredictionWindow), GridSize,
                     GridSize, len(ListParam)))
                TempForecast = np.zeros(
                    (len(stationIds), len(PredictionWindow)))
                Target = np.zeros((len(stationIds), len(PredictionWindow)))
                TimeStamp = np.zeros((len(PredictionWindow)))
                TimeData = np.zeros((len(PredictionWindow), 5))

                # loop over all future predictions, e.g. current hour + T
                for idx_T, T in enumerate(PredictionWindow):

                    if T < 10:
                        NAME = ADDRESSdata + '/' + file + '/c1ffsurf00' + str(
                            T) + '.nc'
                    else:
                        NAME = ADDRESSdata + '/' + file + '/c1ffsurf0' + str(
                            T) + '.nc'

                    # load netCDF4 dataset; it is closed on leaving the block,
                    # also when the file is skipped or an error occurs
                    with xr.open_dataset(NAME) as dataset:
                        # get time of COSMO-1 data point
                        t = dataset['time'].data

                        # check that we do not process a data point before the first observation
                        if t < first_obs_time:
                            print('[Process %s] Skipped %s' %
                                  (processId, file))
                            raise SkipException()

                        # Transform day and hour into a cyclic datetime feature
                        days_rad = (DataUtils.passed_days_per_month_dict[int(
                            file[2:4])] + int(file[4:6])) / 365 * (2 * np.pi)
                        hours = (int(file[6:8]) + T) % 24
                        hour_rad = hours / 24 * (2 * np.pi)

                        TimeData[idx_T] = [
                            np.cos(hour_rad),
                            np.sin(hour_rad),
                            np.cos(days_rad),
                            np.sin(days_rad), T / 33
                        ]

                        # cut the sub-grid of every station out of each parameter map
                        for idx_P, param in enumerate(ListParam):
                            MAP = dataset[param].data.squeeze()
                            for idx_S, S in enumerate(stationIds):
                                stationSquare = stationSquares[S]
                                DATA[idx_S, idx_T, :, :, idx_P] = MAP[
                                    stationSquare.lat_idx][:, stationSquare.
                                                           lon_idx]

                        # We compare the forecasted temperature with the actual observation
                        MAP = np.squeeze(dataset['T']).data
                        TempForecast[:, idx_T] = np.array(
                            [MAP[x] for x in closestGridPointPerStation])

                        TimeStamp[idx_T] = t[0]

                    try:
                        Target[:, idx_T] = TempObs.sel(time=t).data
                    except RuntimeError:
                        print('Error with time=%s.' % t)
                        raise

                # we write the data in a binary file
                for idx_S, S in enumerate(stationIds):
                    leads = PredictionWindow
                    time_features = [
                        'cos_hour', 'sin_hour', 'cos_day', 'sin_day', 'lead'
                    ]
                    lats = stationSquares[S].lat_idx
                    lons = stationSquares[S].lon_idx
                    cosmo_data = xr.DataArray(
                        DATA[idx_S],
                        dims=('lead', 'lat', 'lon', 'feature'),
                        coords=[leads, lats, lons, ListParam])
                    temp_forecast = xr.DataArray(TempForecast[idx_S],
                                                 dims=('lead',),
                                                 coords=[leads])
                    temp_station = xr.DataArray(Target[idx_S],
                                                dims=('lead',),
                                                coords=[leads])
                    time_data = xr.DataArray(TimeData,
                                             dims=('lead', 'time_feature'),
                                             coords=[leads, time_features])
                    time_data.attrs['time_stamp'] = TimeStamp
                    ds = xr.Dataset({
                        'cosmo_data': cosmo_data,
                        'temp_forecast': temp_forecast,
                        'temp_station': temp_station,
                        'time_data': time_data
                    })
                    ds.attrs['station_id'] = S

                    try:
                        with open(
                                DESTINATION + '/Station_%s/%s.pkl' %
                            (S, file[:8]), 'wb') as handle:
                            pkl.dump(ds, handle, protocol=pkl.HIGHEST_PROTOCOL)
                    except FileNotFoundError:
                        fileExists = os.path.exists(DESTINATION +
                                                    '/Station_%s' % S)
                        print(
                            'Error that file does not exist, check says: %s' %
                            str(fileExists))
                        raise
                    ds.close()

                # print that processing of data point has been completed
                print('[Process %s] Finished %s' % (processId, file))
            except SkipException:
                # the file was marked as skipped above, go on with the next one
                continue

    print('[Process %s] Data split successfully preprocessed!' % processId)

    return 1

Example #9

Show file

File: GenerateNetworkReadyData.py Project: niwein/thesis_nwp_correction

def _collect_loader_data(loader):
    """Flatten all batches of ``loader`` into one 2-d array plus its row keys.

    Args:
        loader: iterable yielding batches of length 4
            ``(label, grid input, time data, init_station_temp)`` or of
            length 5 (additionally the station time invariant input before
            ``init_station_temp``).

    Returns:
        Tuple ``(data, inits, stations)``. ``data`` holds the column-wise
        concatenation of the batch parts for all batches, ``inits`` the list
        collected from ``init_station_temp[0]`` and ``stations`` the stacked
        ``init_station_temp[1]`` entries. ``data`` and ``stations`` are
        ``None`` if no batch could be processed.

    Raises:
        Exception: if a batch has neither 4 nor 5 entries.
    """
    batch_rows, inits, stations = [], [], []
    for data in loader:
        try:
            # a batch has only length 4 if we do not use the station time invariant features
            if len(data) == 4:
                Blabel, Bip2d, BTimeData, init_station_temp = data
                station_time_inv_input = None
            elif len(data) == 5:
                Blabel, Bip2d, BTimeData, StationTimeInv, init_station_temp = data
                station_time_inv_input = ModelUtils.getVariable(
                    StationTimeInv).float()
            else:
                raise Exception('Unknown data format for training...')
            grid_input = ModelUtils.getVariable(Bip2d).float()
            time_data = ModelUtils.getVariable(BTimeData).float()
            target = ModelUtils.getVariable(Blabel).float()

            # the station features are only part of a row if the batch has them
            parts = [grid_input.squeeze()]
            if station_time_inv_input is not None:
                parts.append(station_time_inv_input)
            parts += [time_data, target, init_station_temp[2]]
            batch_data = np.concatenate(parts, axis=1)

            batch_inits = list(init_station_temp[0])
            batch_stations = init_station_temp[1]
        except TypeError:
            # when the batch size is small, it could happen, that all labels have been corrupted and therefore
            # collate_fn would return an empty list
            print('Value error...')
            continue

        # only extend the results once the whole batch is known to be valid,
        # so that data, inits and stations always stay aligned
        batch_rows.append(batch_data)
        inits.extend(batch_inits)
        stations.append(batch_stations)

    if not batch_rows:
        return None, inits, None

    # stack once at the end instead of re-copying the result for every batch
    return np.vstack(batch_rows), inits, np.hstack(stations)


def CreateData(config, data_dictionary, data_statistics, train_test_folds):
    """Dump the network-ready train and test data of the first fold to disk.

    Runs the train and test data loaders of the first train/test fold once,
    flattens all batches into pandas data frames keyed by station and
    initialization and pickles them together with the config into
    ``<config['input_source']>/network_ready_data``.

    Args:
        config: (dict) run configuration. Reads ``model.path``,
            ``input_source``, ``preprocessing``, ``original_grid_size``,
            ``batch_size`` and ``n_loaders``; is updated in place with
            ``code_commit``, ``train_data_path``, ``test_data_path`` and, if
            the model does not use them, empty
            ``grid_time_invariant_parameters`` / ``station_parameters``.
        data_dictionary: handed to the data sets as ``station_data_dict``.
        data_statistics: handed to ``DataUtils.getFeatureScaleFunctions``.
        train_test_folds: sequence of ``(train_fold, test_fold)`` pairs, only
            the first pair is used.
    """
    # load the model description
    with open(config['model']['path']) as handle:
        ModelDict = json.load(handle)

    # check if station and grid time invariant features should be used and set the list of desired parameters
    if not ModelDict.get('grid_time_invariant'):
        config['grid_time_invariant_parameters'] = []
    if not ModelDict.get('station_time_invariant'):
        config['station_parameters'] = []

    # load time invariant features
    with open(
            "%s/%s/grid_size_%s/time_invariant_data_per_station.pkl" %
        (config['input_source'], config['preprocessing'],
         config['original_grid_size']), "rb") as input_file:
        time_invarian_data = pkl.load(input_file)

    # initialize feature scaling function for each feature
    featureScaleFunctions = DataUtils.getFeatureScaleFunctions(
        ModelUtils.ParamNormalizationDict, data_statistics)

    # add revision short hash to the config
    config['code_commit'] = ModelUtils.get_git_revision_short_hash()

    # take the right preprocessed train/test data set for the first run
    train_fold, test_fold = train_test_folds[0]

    # initialize train and test dataloaders
    trainset = DataLoaders.CosmoDataGridData(
        config=config,
        station_data_dict=data_dictionary,
        files=train_fold,
        featureScaling=featureScaleFunctions,
        time_invariant_data=time_invarian_data)
    trainloader = DataLoader(trainset,
                             batch_size=config['batch_size'],
                             shuffle=True,
                             num_workers=config['n_loaders'],
                             collate_fn=DataLoaders.collate_fn)

    testset = DataLoaders.CosmoDataGridData(
        config=config,
        station_data_dict=data_dictionary,
        files=test_fold,
        featureScaling=featureScaleFunctions,
        time_invariant_data=time_invarian_data)
    testloader = DataLoader(testset,
                            batch_size=config['batch_size'],
                            shuffle=True,
                            num_workers=config['n_loaders'],
                            collate_fn=DataLoaders.collate_fn)

    # loop over complete train set
    train_data, train_inits, train_stations = _collect_loader_data(trainloader)

    # define column names for data frame
    column_names = [
        'Pressure', 'Wind U-Comp.', 'Wind V-Comp.', 'Wind VMAX',
        '2m-Temperature', 'Temp. of Dew Point', 'Cloud Coverage (High)',
        'Cloud Coverage (Medium)', 'Cloud Coverage (Low)',
        'Tot. Precipitation', 'ALB_RAD', 'ASOB', 'ATHB', 'HPBL',
        '2m-Temperature (Lead=0)'
    ]
    column_names += [
        'Grid Height', 'Grid-Station Height Diff.', 'Fraction of Land',
        'Soiltype', 'Latitiude', 'Longitued', 'Grid-Station 2d Distance'
    ]
    # the station columns are only present if the rows are wide enough
    if train_data.shape[1] >= 31:
        column_names += [
            'Station Height', 'Station Latitude', 'Station Longitude'
        ]
    column_names += [
        'Hour (Cosine)', 'Hour (Sine)', 'Month (Cosine)', 'Month (Sine)',
        'Lead-Time'
    ]
    column_names += ['Target 2m-Temp.']
    column_names += ['COSMO 2m-Temp.']

    train_keys = pd.DataFrame.from_dict({
        'Station': train_stations,
        'Init': train_inits
    })
    train_data = pd.DataFrame(data=train_data, columns=column_names)
    train_ds = pd.concat([train_keys, train_data], axis=1)

    # loop over complete test set
    test_data, test_inits, test_stations = _collect_loader_data(testloader)

    test_keys = pd.DataFrame.from_dict({
        'Station': test_stations,
        'Init': test_inits
    })
    test_data = pd.DataFrame(data=test_data, columns=column_names)
    test_ds = pd.concat([test_keys, test_data], axis=1)

    network_ready_data_path = config['input_source'] + '/network_ready_data'
    # exist_ok avoids the race between an existence check and the creation
    os.makedirs(network_ready_data_path, exist_ok=True)

    network_ready_train_data_path = network_ready_data_path + '/train_data'
    network_ready_test_data_path = network_ready_data_path + '/test_data'

    train_ds.to_pickle(network_ready_train_data_path)
    test_ds.to_pickle(network_ready_test_data_path)

    # shap specific config entries for analysis in jupyter notebook
    config['train_data_path'] = network_ready_train_data_path
    config['test_data_path'] = network_ready_test_data_path

    # dump config
    with open(network_ready_data_path + '/config.pkl', 'wb') as handle:
        pkl.dump(config, handle, protocol=pkl.HIGHEST_PROTOCOL)

    print('Network ready data analysis successfully executed.')

Example #10

Show file

def CreateBaselineData(DateBegin, DateEnd, PredictionWindow, isLocal,
                       n_parallel):
    """Create the per-station baseline data from COSMO-1 output folders.

    Steps performed:
      1. Resolve the input/output directories (local vs. cluster).
      2. Compute the grid neighbors of every observation station in parallel,
         or load them from ``station_neighbors.pkl`` if that file exists.
      3. Preprocess all not yet processed COSMO-1 folders inside the
         requested interval in parallel via ``GetDataWrapper``.
      4. Aggregate the temporary per-station files via
         ``aggregateProcessFiles``.
      5. Write a descriptive ``setup.json`` into the destination directory.

    Args:
        DateBegin: Lower bound of the interval. It is compared against
            ``folder[:-4]`` of every folder name (presumably a timestamp
            string in the same format -- TODO confirm against caller).
        DateEnd: Upper bound (inclusive) of the interval, compared the same
            way as ``DateBegin``.
        PredictionWindow: Forwarded to ``GetDataWrapper`` and stored as
            ``future_hours`` in ``setup.json``.
        isLocal: Selects the local paths instead of the cluster paths; also
            forwarded to ``GetDataWrapper``.
        n_parallel: Number of worker processes and upper bound for the number
            of folder splits.

    Raises:
        Exception: If ``DateEnd < DateBegin`` or if no folder falls into the
            requested interval.
    """
    time_begin = time()

    if DateEnd < DateBegin:
        raise Exception('DateEnd is smaller than DateBegin.')

    # different paths, whether we run the script locally or on a cluster node
    if isLocal:
        ADDRESSdata = '/home/n1no/Documents/ethz/master_thesis/code/project/data/cosmo-1/data_subset'  # COSMO-1 outputs
        ADDRESStopo = '/home/n1no/Documents/ethz/master_thesis/code/project/data'  # base address of topo files
        ADDRESSobst = '/home/n1no/Documents/ethz/master_thesis/code/project/data/observations/'  # base adress of obs files
        DESTINATION = '/home/n1no/Documents/ethz/master_thesis/code/project/data/preprocessed_data/baseline'  # target directory for all generated files
    else:
        ADDRESSdata = '/mnt/data/bhendj/full/cosmo-1'  # COSMO-1 outputs
        ADDRESStopo = '/mnt/ds3lab-scratch/ninow/topo'  # base address of topo files
        ADDRESSobst = '/mnt/ds3lab-scratch/ninow/observations'  # base adress of obs files
        DESTINATION = '/mnt/ds3lab-scratch/ninow/preprocessed_data/baseline'  # target directory for all generated files

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(DESTINATION, exist_ok=True)

    neighbors_path = DESTINATION + '/station_neighbors.pkl'

    # the context managers guarantee that both datasets are closed again,
    # even if the neighborhood calculation below raises
    with xr.open_dataset(ADDRESSobst + '/meteoswiss_t2m_20151001-20180331.nc') as OBS, \
            xr.open_dataset(ADDRESStopo + '/topodata.nc') as TOPO:
        station_ids = OBS['station_id'].data

        # extract time invariant features for each station and the corresponding sub-grid
        if not os.path.exists(neighbors_path):

            station_neighbors = {}

            # calculate for each station the neighbors on the grid in parallel
            with Pool(processes=n_parallel) as pool:
                process_results = []

                gridHeightData = TOPO.HH.data
                gridLatData = TOPO.lat.data
                gridLonData = TOPO.lon.data

                # queue one neighborhood calculation per station
                for S in station_ids:
                    station_height = OBS['height'].sel(station_id=S).data
                    station_lat = OBS['lat'].sel(station_id=S).data
                    station_lon = OBS['lon'].sel(station_id=S).data

                    print('Neighborhood calculation for staiton %s queued.' % S)
                    process_results.append(
                        pool.apply_async(
                            getStationNeighbors,
                            (S, gridHeightData, gridLatData, gridLonData,
                             station_height, station_lat, station_lon)))

                # aggregate results from all processes
                for ps_idx, ps_result in enumerate(process_results):
                    # sync processes
                    S, neighbor_data = ps_result.get()
                    station_neighbors[S] = neighbor_data
                    print('[Process %s] Synchronized after data creation.' %
                          ps_idx)

            with open(neighbors_path, 'wb') as handle:
                pkl.dump(station_neighbors, handle,
                         protocol=pkl.HIGHEST_PROTOCOL)
            print(
                'Station time invariant features have been calculated and stored.')
        else:
            # the cache file is produced by this function itself (see above)
            with open(neighbors_path, 'rb') as handle:
                station_neighbors = pkl.load(handle)
            print(
                'Station time invariant features have been found on disk and were therefore not created again.'
            )

    # create a temporary output folder for each station, based on the station ids
    for S in station_ids:
        os.makedirs(DESTINATION + '/temp/station_%s' % S, exist_ok=True)

    # get all COSMO-1 files that are in the given time interval and have not yet been processed and thus do not
    # already exists in the output folder
    folders = DataUtils.getFilesToProcess(ADDRESSdata, DESTINATION, 'Station',
                                          DateBegin, DateEnd)
    folders.sort()

    # calculate begin and end index of array to exclude files, that are not in the specified time interval
    begin, end = -1, -1
    for idx, folder in enumerate(folders):
        if folder[:-4] >= DateBegin:
            begin = idx
            break

    for idx, folder in enumerate(folders):
        if folder[:-4] <= DateEnd:
            end = idx
        else:
            break

    if begin == -1 or end == -1:
        raise Exception('Could not find start or end in array.')

    folders = folders[begin:end + 1]
    print('%s files are left to be preprocessed.' % len(folders))

    # split the folders into K approx. equal splits
    if n_parallel <= 1:
        folder_splits = [folders]
    else:
        n_folders = len(folders)
        indices = np.linspace(0, n_folders, n_parallel + 1).astype(int)
        folder_splits = [
            folders[indices[i]:indices[i + 1]] for i in range(n_parallel)
        ]

    # drop empty splits, the queue message below indexes split[0] / split[-1]
    folder_splits = [l for l in folder_splits if len(l) > 0]
    # take timestamp after set-up
    time_setup = time()

    # run preprocessing in parallel for all splits and keep the processes in a list to sync them later
    with Pool(processes=n_parallel) as pool:
        process_results = []

        # start a new process with the work function for each data split
        for idx_split, split in enumerate(folder_splits):
            print('Process %s with range [%s, %s] queued.' %
                  (idx_split, split[0], split[-1]))
            process_results.append(
                pool.apply_async(
                    GetDataWrapper,
                    (idx_split, ADDRESSdata, ADDRESStopo, ADDRESSobst,
                     DESTINATION, split, station_neighbors, PredictionWindow,
                     isLocal)))

        # wait for all preprocessing workers; get() also re-raises worker errors
        for ps_idx, ps_result in enumerate(process_results):
            ps_result.get()
            print('[Process %s] Synchronized after data creation.' % ps_idx)

        station_folders_paths = [
            f for f in os.listdir(DESTINATION + '/temp')
            if re.match(r'^station_([0-9]+?)$', f)
        ]

        # aggregate the temporary files of every station folder in parallel
        process_results = []
        for ps_idx, station_folder in enumerate(station_folders_paths):
            print('Process %s with station folder %s queued.' %
                  (ps_idx, station_folder))
            process_results.append(
                pool.apply_async(aggregateProcessFiles,
                                 (ps_idx, DESTINATION, station_folder)))

        # wait for all aggregation workers
        for ps_idx, ps_result in enumerate(process_results):
            ps_result.get()
            print('[Process %s] Synchronized after aggregation.' % ps_idx)

    # take timestamp after completing all processes
    time_end = time()

    # dump preprocessing information in a descriptive JSON file
    preprocessing_information = {
        'data_begin': DateBegin,
        'data_end': DateEnd,
        'future_hours': PredictionWindow,
        'n_processes': n_parallel,
        'time_setup': str(timedelta(seconds=(time_setup - time_begin))),
        'time_preprocessing': str(timedelta(seconds=(time_end - time_setup)))
    }

    # context manager closes the file even if the write fails
    with open(DESTINATION + '/setup.json', 'w') as f:
        f.write(json.dumps(preprocessing_information))

    print('Station baseline reprocessing sucessfully finished in %s.' %
          str(timedelta(seconds=(time_end - time_begin))))

Example #11

Show file

        % (config['distance_metric'], h, m, s))

# runs a generalization experiment on stations only used in prediction
# this requires >=1 model config file in the "models" folder of an experiment and an "experiment_parameters.txt" file
# sample model configs and "experiment_parameters.txt" files can be found under /results/runs/spatial_generalization
# IMPORTANT: the "experiment_parameters.txt" file has to specify how many test stations should be used (and are therefore
# left out of training) and which station is the first, in consecutive order, to define the test stations
elif options.script == 'spatialGeneralizationExperiment':
    experiment_start = time()

    config, train_test_folds, data_dictionary, data_statistics = ModelUtils.setUpModelRun(
        options=options, G=G)

    # we filter out in config specified test station from train set and filter all train
    # stations from test set for each run
    train_test_folds = DataUtils.filterUnseenTestStations(
        train_test_folds=train_test_folds, config=config)

    print('Starting to run %s' % options.script)
    print("Test Stations:", config['test_stations'])

    models = [
        f[:-4] for f in os.listdir(config['experiment_path'] + '/models')
    ]
    n_models = len(models)
    print('%s models found to run.' % n_models)
    for m_idx, m in enumerate(models):
        config['model'][
            'path'] = config['experiment_path'] + '/models/%s.txt' % m
        config['model']['name'] = m

        ModelRun.runModel(config=config,

Example #12

Show file

def runModel(config, data_dictionary, data_statistics, train_test_folds):
    """Plot the train/test feature distributions of every cross-validation run.

    For each run the train and test data sets are iterated completely, all
    feature values and labels are flattened into plain lists and handed to
    ``PlotUtils.plotFeatureDistribution``.

    Args:
        config: Run configuration dictionary. ``batch_size``, ``runs`` and
            ``grid_size`` are overwritten in place with fixed values.
        data_dictionary: Passed to the data sets as ``station_data_dict``.
        data_statistics: Passed to ``DataUtils.getFeatureScaleFunctions``.
        train_test_folds: Sequence indexed by run, each item being a
            ``(train_fold, test_fold)`` pair.
    """
    experiment_path = config['experiment_path']

    # fixed settings for this analysis; they override the values in config
    config['batch_size'] = 1
    config['runs'] = 3
    config['grid_size'] = 9

    # load time invariant features
    with open(
            "%s/%s/grid_size_%s/time_invariant_data_per_station.pkl" %
        (config['input_source'], config['preprocessing'],
         config['original_grid_size']), "rb") as input_file:
        time_invarian_data = pkl.load(input_file)

    # initialize feature scaling function for each feature
    featureScaleFunctions = DataUtils.getFeatureScaleFunctions(
        ParamNormalizationDict, data_statistics)

    plot_config = {
        'features': config['input_parameters'],
        'time_invariant_features': config['grid_time_invariant_parameters'],
        'station_features': config['station_parameters']
    }

    # cross validation
    for run in range(config['runs']):
        print('[Run %s] Cross-validation test fold %s' %
              (str(run + 1), str(run + 1)))

        # take the right preprocessed train/test data set for the current run
        train_fold, test_fold = train_test_folds[run]

        # initialize train and test dataloaders
        trainset = DataLoaders.SinglePredictionCosmoData(
            config=config,
            station_data_dict=data_dictionary,
            files=train_fold,
            featureScaling=featureScaleFunctions,
            time_invariant_data=time_invarian_data)
        trainloader = DataLoader(trainset,
                                 batch_size=config['batch_size'],
                                 shuffle=True,
                                 num_workers=config['n_loaders'],
                                 collate_fn=DataLoaders.collate_fn)

        testset = DataLoaders.SinglePredictionCosmoData(
            config=config,
            station_data_dict=data_dictionary,
            files=test_fold,
            featureScaling=featureScaleFunctions,
            time_invariant_data=time_invarian_data)
        testloader = DataLoader(testset,
                                batch_size=config['batch_size'],
                                shuffle=True,
                                num_workers=config['n_loaders'],
                                collate_fn=DataLoaders.collate_fn)

        # loop over complete train set
        (train_features, train_time_invariant_grid_features,
         train_station_features,
         train_labels) = _collectFeatureValues(trainset, trainloader)

        # loop over complete test set
        (test_features, test_time_invariant_grid_features,
         test_station_features,
         test_labels) = _collectFeatureValues(testset, testloader)

        plot_config['run'] = run
        PlotUtils.plotFeatureDistribution(
            output_path=experiment_path,
            config=plot_config,
            train_features=train_features,
            train_time_invariant_grid_features=
            train_time_invariant_grid_features,
            train_station_features=train_station_features,
            train_labels=train_labels,
            test_features=test_features,
            test_time_invariant_grid_features=test_time_invariant_grid_features,
            test_station_features=test_station_features,
            test_labels=test_labels)


def _collectFeatureValues(dataset, loader):
    """Flatten all batches of ``loader`` into per-feature value lists.

    Returns a tuple ``(features, time_invariant_grid_features,
    station_features, labels)``. The first three are lists with one value
    list per entry of ``dataset.parameters``,
    ``dataset.grid_time_invariant_parameters`` and
    ``dataset.station_parameters`` respectively; ``labels`` is a flat list.
    """
    features = [[] for _ in dataset.parameters]
    time_invariant_grid_features = [
        [] for _ in dataset.grid_time_invariant_parameters
    ]
    station_features = [[] for _ in dataset.station_parameters]
    labels = []

    for data in loader:
        try:
            # get batch, e.g. label, cosmo-1 output and external features
            Blabel, Bip2d, StationTimeInv = data
        except ValueError:
            # when the batch size is small, it could happen, that all labels have been corrupted and therefore
            # collate_fn would return an empty list
            print('Value error...')
            continue

        labels.extend(Blabel.numpy().flatten())
        for feature_idx, values in enumerate(features):
            values.extend(Bip2d[:, feature_idx, :, :].numpy().flatten())
        # the time invariant grid channels follow the regular parameters
        for ti_feature_idx, values in enumerate(time_invariant_grid_features):
            values.extend(Bip2d[:, dataset.n_parameters +
                                ti_feature_idx, :, :].numpy().flatten())
        for station_feature_idx, values in enumerate(station_features):
            values.extend(
                StationTimeInv[:, station_feature_idx].numpy().flatten())

    return features, time_invariant_grid_features, station_features, labels

Example #13

Show file

def plotPerStationPredictionRun(source_path, observation_path, n_parallel):
    """Plot the prediction run of every station for all models below a folder.

    All ``model_run_error.pkl`` files found recursively under ``source_path``
    are loaded and grouped by data variable (one group per run). For every
    run the per-station plots are created in parallel by
    ``plotPerStationPredictionRunWorker`` and the collected mean errors are
    passed to ``generateStationPredictionResultTable``.

    Args:
        source_path: Root folder that is searched for ``model_run_error.pkl``
            files; plots are written below ``<source_path>/plots``.
        observation_path: Path of the observation data set opened with
            ``xr.open_dataset``.
        n_parallel: Number of worker processes used for plotting.

    Returns:
        None. If no result file is found, a message is printed and the
        function returns without opening the observations.
    """
    # gather all models in source folder, grouped by run
    error_data_per_run_dict = defaultdict(list)
    ds = None
    for path in glob.glob(source_path + '/**/model_run_error.pkl', recursive=True):
        model_name = path.split('/')[-2]
        # NOTE: pickle must only be used on trusted, self-generated result files
        with open(path, 'rb') as file:
            ds = pkl.load(file)
        for data_var in ds.data_vars:
            error_data_per_run_dict[data_var].append((model_name, ds[data_var]))

    if ds is None:
        # nothing was loaded, so there is neither a lead time nor data to plot
        print('No model_run_error.pkl found in %s, nothing to plot.' % source_path)
        return

    # get the prediction lead time to adjust time labels (taken from the last loaded file)
    prediciton_lead_time = ds.attrs['config']['prediction_times'][0] if 'config' in ds.attrs else 1

    # load observations; closed again in the finally block below
    OBS = xr.open_dataset(observation_path)
    try:
        for run, models in error_data_per_run_dict.items():
            # the first model of a run defines stations, inits and sample types
            reference_data = models[0][1]
            stations = reference_data.station.data
            inits = reference_data.init.data
            init_type_mapping = np.array(reference_data.init_type_mapping)
            train_indices = [idx for idx, item in enumerate(init_type_mapping) if item[1] == 'train']
            test_indices = [idx for idx, item in enumerate(init_type_mapping) if item[1] == 'test']
            sample_type_color_mapping = [mapping[1] for mapping in init_type_mapping]
            times = DataUtils.getTimeFromFileName(inits, prediciton_lead_time)
            time_labels = [str(t)[:-13] for t in times]

            station_name_dict = get_station_dict(OBS, stations)

            model_station_mean_errors = {}
            # plot for each station the prediction run results in parallel
            with Pool(processes=n_parallel) as pool:
                process_results = []

                for station in stations:
                    print('Plotting of prediction run for station %s queued.' % station)
                    process_results.append(pool.apply_async(plotPerStationPredictionRunWorker,
                                                            (models, station, train_indices, test_indices,
                                                             station_name_dict, sample_type_color_mapping,
                                                             time_labels, source_path, run)))

                # aggregate results from all processes
                for ps_idx, ps_result in enumerate(process_results):
                    # sync processes
                    model_station_mean_error = ps_result.get()

                    for experiment_title, station_data_list in model_station_mean_error.items():
                        model_station_mean_errors.setdefault(experiment_title, []).extend(station_data_list)

                    print('[Process %s] Synchronized after plotting station.' % ps_idx)

            run_path = source_path + '/plots/prediction_runs/%s' % run
            os.makedirs(run_path, exist_ok=True)

            generateStationPredictionResultTable(output_path=run_path, results=model_station_mean_errors)
    finally:
        OBS.close()

Example #14

Show file

def plotAveragedPredictionRun(source_path):
    """Plot the station-averaged prediction run of all models below a folder.

    All ``model_run_error.pkl`` files found recursively under ``source_path``
    are loaded and grouped by data variable (one group per run). For every
    run a figure with ten stacked subplots is created, each covering one
    consecutive slice of the inits. Per model, the mean over axis 1 of
    ``prediction_data[:, :, 0]`` is plotted and bias/RMSE/MAE of
    ``prediction_data[:, :, 3]`` are computed for the train, test and
    filtered samples. The curves labelled 'COSMO-1' (index 1) and
    'Prediction' (index 2) and the COSMO-1 errors (index 1 minus index 2)
    are taken from the last model of the run.

    Args:
        source_path: Root folder that is searched for ``model_run_error.pkl``
            files; results are written below ``<source_path>/plots``.

    Returns:
        None. If no result data is found, a message is printed and the
        function returns without plotting.
    """
    # gather all models in source folder, grouped by run
    error_data_per_run_dict = {}
    for path in glob.glob(source_path + '/**/model_run_error.pkl', recursive=True):
        model_name = path.split('/')[-2]
        # NOTE: pickle must only be used on trusted, self-generated result files
        with open(path, 'rb') as file:
            ds = pkl.load(file)
        for data_var in ds.data_vars:
            inits = ds[data_var].init.data
            sample_type_mapping = [mapping[1] for mapping in ds[data_var].init_type_mapping]
            prediction_data = ds[data_var].data
            error_data_per_run_dict.setdefault(data_var, []).append(
                (model_name, inits, prediction_data, sample_type_mapping))

    if not error_data_per_run_dict:
        # neither `ds` nor `inits` would be usable below
        print('No model_run_error.pkl data found in %s, nothing to plot.' % source_path)
        return

    # get the prediction lead time to adjust time labels (taken from the last loaded file)
    prediciton_lead_time = ds.attrs['config']['prediction_times'][0] if 'config' in ds.attrs else 1

    times = DataUtils.getTimeFromFileName(inits, prediciton_lead_time)
    time_labels = [str(t)[:-13] for t in times]

    for run, model_entries in error_data_per_run_dict.items():
        model_mean_errors = {}
        n_subplots = 10
        fig, axes = plt.subplots(n_subplots, figsize=(60, 20), sharey=True)
        for model_idx, model_error_data in enumerate(model_entries):

            N = len(model_error_data[1])
            split_length = N // n_subplots
            ind = np.arange(N)  # the x locations for the groups
            experiment_title = model_error_data[0]
            prediction_data = model_error_data[2]
            init_type_mapping = model_error_data[3]
            train_indices = [idx for idx, item in enumerate(init_type_mapping) if item == 'train']
            test_indices = [idx for idx, item in enumerate(init_type_mapping) if item == 'test']
            # NOTE(review): 'filterd' looks like a typo of 'filtered' -- it is kept as-is because the
            # producer of init_type_mapping is not visible here; confirm which spelling is emitted
            filtered_indices = [idx for idx, item in enumerate(init_type_mapping) if item == 'filterd']

            for i in range(n_subplots):
                # split indexes into slices for each subplot
                window = slice(i * split_length, (i + 1) * split_length)
                index_split = ind[window]

                if model_idx == 0:
                    # background and x-limits only need to be set once per subplot
                    sampleTypeBackgroundColoring(axes[i], index_split, init_type_mapping[window])
                    axes[i].set_xlim([np.min(index_split), np.max(index_split)])

                axes[i].plot(index_split,
                             np.nanmean(prediction_data[window, :, 0], axis=1),
                             label=experiment_title, linewidth=0.15, alpha=0.8)

            train_errors = prediction_data[train_indices][:, :, 3]
            train_model_bias = np.nanmean(train_errors)
            train_model_rmse = np.sqrt(np.nanmean(np.square(train_errors)))
            train_model_mae = np.nanmean(np.absolute(train_errors))

            test_errors = prediction_data[test_indices][:, :, 3]
            test_model_bias = np.nanmean(test_errors)
            test_model_rmse = np.sqrt(np.nanmean(np.square(test_errors)))
            test_model_mae = np.nanmean(np.absolute(test_errors))

            filtered_errors = prediction_data[filtered_indices][:, :, 3]
            filtered_model_bias = np.nanmean(filtered_errors)
            filtered_model_rmse = np.sqrt(np.nanmean(np.square(filtered_errors)))
            filtered_model_mae = np.nanmean(np.absolute(filtered_errors))

            model_mean_errors[experiment_title] = (train_model_bias, train_model_rmse, train_model_mae,
                                                   test_model_bias, test_model_rmse, test_model_mae,
                                                   filtered_model_bias, filtered_model_rmse, filtered_model_mae)

        # add mean errors of cosmo output predictions; from here on `prediction_data`, the index lists,
        # `ind` and `split_length` are the ones of the last model of this run
        train_diff_cosmo = prediction_data[train_indices][:, :, 1] - prediction_data[train_indices][:, :, 2]
        train_cosmo_bias = np.nanmean(train_diff_cosmo)
        train_cosmo_rmse = np.sqrt(np.nanmean(np.square(train_diff_cosmo)))
        train_cosmo_mae = np.nanmean(np.absolute(train_diff_cosmo))

        test_diff_cosmo = prediction_data[test_indices][:, :, 1] - prediction_data[test_indices][:, :, 2]
        test_cosmo_bias = np.nanmean(test_diff_cosmo)
        test_cosmo_rmse = np.sqrt(np.nanmean(np.square(test_diff_cosmo)))
        test_cosmo_mae = np.nanmean(np.absolute(test_diff_cosmo))

        filtered_diff_cosmo = prediction_data[filtered_indices][:, :, 1] - prediction_data[filtered_indices][:, :, 2]
        filtered_cosmo_bias = np.nanmean(filtered_diff_cosmo)
        filtered_cosmo_rmse = np.sqrt(np.nanmean(np.square(filtered_diff_cosmo)))
        filtered_cosmo_mae = np.nanmean(np.absolute(filtered_diff_cosmo))

        # add COSMO-1 output prediction error
        model_mean_errors['COSMO-1'] = (train_cosmo_bias, train_cosmo_rmse, train_cosmo_mae,
                                        test_cosmo_bias, test_cosmo_rmse, test_cosmo_mae,
                                        filtered_cosmo_bias, filtered_cosmo_rmse, filtered_cosmo_mae)

        # identical for every subplot, so computed once
        tick_step_size = np.maximum(split_length // 30, 1)

        for i in range(n_subplots):
            window = slice(i * split_length, (i + 1) * split_length)
            axes[i].plot(ind[window],
                         np.nanmean(prediction_data[window, :, 1], axis=1), label='COSMO-1',
                         linewidth=0.15, alpha=0.8, color='b', linestyle='-.')
            # average over axis 1 like the other curves; the former index `[window, 2]` selected the
            # entry at position 2 of axis 1 and averaged over the last axis instead
            axes[i].plot(ind[window],
                         np.nanmean(prediction_data[window, :, 2], axis=1), label='Prediction',
                         linewidth=0.15, alpha=0.8, color='m', linestyle='--')

            axes[i].set_xticks(ind[window][::tick_step_size])
            axes[i].set_xticklabels(time_labels[window][::tick_step_size])
            axes[i].set_xticks(ind[window], minor=True)
            # And a corresponding grid
            axes[i].grid(which='both')

            # different settings for the minor and major grid
            axes[i].grid(which='minor', alpha=0.2)
            axes[i].grid(which='major', alpha=0.5)

        # the first subplot carries the legend for the whole figure
        handles, labels = axes[0].get_legend_handles_labels()

        axes[n_subplots - 1].set_xlabel('Time')
        axes[0].legend(handles, labels)
        plt.tight_layout()

        run_path = source_path + '/plots/prediction_runs/%s' % run
        os.makedirs(run_path, exist_ok=True)

        fig.savefig(run_path + '/averaged_prediction.png', dpi=300)
        # release the figure, otherwise one large figure per run stays in memory
        plt.close(fig)

        generatePredictionResultTable(output_path=run_path, results=model_mean_errors)