Beispiel #1
0
def initialize_GFS_date_keys_for_sequence(date_keys: [str],
                                          labels: pd.DataFrame,
                                          train_params: [dict],
                                          target_param: str,
                                          sequence_length: int):
    """Return only the date keys that start a complete forecast sequence.

    A date key is kept when, for every subsequent frame of the sequence,
    a GFS image exists for all training parameters AND a matching synop
    label row exists for the frame's date.
    """
    filtered_keys = []
    gfs_loader = GFSLoader()
    for date_key in sorted(date_keys):
        key_match = re.match(NETCDF_FILE_REGEX, date_key)
        # start one 3-hour step past the forecast offset encoded in the key
        offset = int(key_match.group(6)) + 3
        base_date = date_from_gfs_date_key(date_key)
        sequence_complete = True
        for _ in range(1, sequence_length):
            frame_date = base_date + timedelta(hours=offset)
            frame_key = GFSLoader.get_date_key(frame_date)
            # every train param must have a stored GFS image for this frame
            if any(
                    gfs_loader.get_gfs_image(frame_key, param, offset) is None
                    for param in train_params):
                sequence_complete = False
                break
            # the synop label for the frame date must exist as well
            if len(labels[labels["date"] == frame_date][target_param]) == 0:
                sequence_complete = False
                break

            offset += 3
        if sequence_complete:
            filtered_keys.append(date_key)

    return filtered_keys
Beispiel #2
0
def initialize_mean_and_std(date_keys,
                            train_parameters,
                            dim: (int, int),
                            prediction_offset: int,
                            subregion_coords=None):
    """Compute the per-parameter mean and std over all GFS images.

    Args:
        date_keys: GFS date keys identifying the images to aggregate.
        train_parameters: GFS parameters, one (mean, std) pair per entry.
        dim: (height, width) of a single image; used for the pixel count.
        prediction_offset: forecast offset passed to the GFS loader.
        subregion_coords: optional coords to crop each image to a subregion.

    Returns:
        (means, stds) lists aligned with ``train_parameters``.
    """
    log.info("Calculating std and mean for a dataset")
    means = []
    stds = []
    gfs_loader = GFSLoader()
    # loop-invariant: total number of pixels contributing per parameter
    pixel_count = len(date_keys) * dim[0] * dim[1]
    for param in tqdm(train_parameters):
        # renamed from `sum` to avoid shadowing the builtin
        total, sqr_total = 0.0, 0.0
        for date_key in tqdm(date_keys):
            values = gfs_loader.get_gfs_image(date_key, param,
                                              prediction_offset)
            if subregion_coords is not None:
                values = get_subregion_from_GFS_slice_for_coords(
                    values, subregion_coords)
            total += np.sum(values)
            sqr_total += np.sum(np.power(values, 2))

        mean = total / pixel_count
        # Clamp at 0: E[x^2] - mean^2 can come out marginally negative due
        # to floating-point rounding, which would make math.sqrt raise.
        variance = max(sqr_total / pixel_count - mean ** 2, 0.0)
        means.append(mean)
        stds.append(math.sqrt(variance))

    return means, stds
Beispiel #3
0
def get_gfs_values_and_targets_for_gfs_ids(gfs_date_keys, labels, target_param,
                                           lat: float, lon: float,
                                           offset: int):
    """Collect GFS point forecasts with their matching synop targets.

    For every date key whose date has a synop label row, append the label
    values and the GFS value at (lat, lon) for the target parameter.
    """
    point = Coords(lat, lat, lon, lon)
    gfs_loader = GFSLoader()
    gfs_param = target_param_to_gfs_name_level(target_param)[0]
    target_list = []
    value_list = []
    for date_key in tqdm(gfs_date_keys):
        label_rows = labels[labels["date"] == date_from_gfs_date_key(date_key)]
        if len(label_rows) == 0:
            # no synop observation for this forecast date -> skip sample
            continue
        target_list.append(label_rows[target_param].to_numpy())
        image = gfs_loader.get_gfs_image(date_key, gfs_param, offset)
        value_list.append(get_point_from_GFS_slice_for_coords(image, point))

    return np.array(value_list), np.array(target_list).squeeze()
Beispiel #4
0
    def __init__(self, config: Config, train_IDs, labels, normalize=True):
        """Initialize dataset state from the experiment config and, when
        requested, normalize the data right away."""
        experiment = config.experiment
        # one channel per GFS training parameter from the config file
        self.train_parameters = process_config(
            experiment.train_parameters_config_file)
        self.channels = len(self.train_parameters)
        self.target_param = experiment.target_parameter
        self.labels = labels
        self.dim = experiment.cnn_input_size
        self.prediction_offset = experiment.prediction_offset
        self.normalization_type = experiment.normalization_type
        self.sequence_length = experiment.sequence_length
        self.list_IDs = train_IDs

        # samples available for the configured prediction offset only
        self.data = self.list_IDs[str(self.prediction_offset)]
        self.mean, self.std = [], []
        self.normalize = normalize
        self.gfs_loader = GFSLoader()
        if normalize:
            self.normalize_data(experiment.normalization_type)
Beispiel #5
0
def match_gfs_with_synop_sequence(features: Union[list, np.ndarray],
                                  targets: list,
                                  lat: float,
                                  lon: float,
                                  prediction_offset: int,
                                  gfs_params: list,
                                  return_GFS=True):
    """Keep only synop samples for which all GFS forecasts are available.

    Args:
        features: per-sample feature rows, indexed in step with ``targets``.
        targets: list of (date, target_value) pairs.
        lat, lon: point at which GFS values are extracted.
        prediction_offset: hours ahead of the prediction date.
        gfs_params: GFS parameters that must all be present for a sample.
        return_GFS: when True, also return the extracted GFS point values.

    Returns:
        (features, gfs_values, targets, removed_indices) when return_GFS,
        otherwise (features, targets, removed_indices).
    """
    gfs_values = []
    new_targets = []
    new_features = []
    gfs_loader = GFSLoader()
    removed_indices = []
    # loop-invariant point; the original rebuilt it per param per sample
    coords = Coords(lat, lat, lon, lon)
    print("Matching GFS with synop data")

    for index, value in tqdm(enumerate(targets)):
        date = value[0]
        gfs_date, gfs_offset = get_forecast_date_and_offset_for_prediction_date(
            date, prediction_offset)
        gfs_date_key = gfs_loader.get_date_key(gfs_date)

        # Fetch each param's image exactly once; the original fetched every
        # image twice (availability check, then value extraction).
        images = [
            gfs_loader.get_gfs_image(gfs_date_key, param, gfs_offset)
            for param in gfs_params
        ]
        if any(image is None for image in images):
            # at least one forecast missing -> drop this sample
            removed_indices.append(index)
            continue
        if return_GFS:
            gfs_values.append([
                get_point_from_GFS_slice_for_coords(image, coords)
                for image in images
            ])
        new_targets.append(value[1])
        new_features.append(features[index])

    if return_GFS:
        return np.array(new_features), np.array(gfs_values), np.array(
            new_targets), removed_indices
    return np.array(new_features), np.array(new_targets), removed_indices
Beispiel #6
0
def get_GFS_values_for_sequence(date_key,
                                param,
                                sequence_length: int,
                                prediction_offset: int,
                                subregion_coords: Coords = None):
    """Fetch a sequence of GFS images, 3 hours apart, starting at date_key.

    Fix: the first frame previously had the subregion crop applied
    unconditionally — even when ``subregion_coords`` was None — while all
    later frames applied it only when coords were given. The crop is now
    applied consistently across every frame.
    """
    gfs_loader = GFSLoader()
    base_date = date_from_gfs_date_key(date_key)
    values = []
    for frame in range(sequence_length):
        if frame == 0:
            frame_key = date_key
        else:
            frame_key = GFSLoader.get_date_key(base_date +
                                               timedelta(hours=3 * frame))
        val = gfs_loader.get_gfs_image(frame_key, param,
                                       prediction_offset + 3 * frame)
        if subregion_coords is not None:
            val = get_subregion_from_GFS_slice_for_coords(
                val, subregion_coords)
        values.append(val)

    return values
Beispiel #7
0
def initialize_min_max(date_keys: [str],
                       train_parameters,
                       prediction_offset: int,
                       subregion_coords=None):
    """Compute the per-parameter min and max over all GFS images.

    Fixes two bugs in the original:
    - the running extrema were bound to the names ``min``/``max``, shadowing
      the builtins and raising TypeError on the second date key;
    - the running max was seeded with ``sys.float_info.min`` (the smallest
      *positive* float), which is wrong for all-negative data. Seeds are now
      +/- infinity.
    """
    log.info("Calculating min and max for a dataset")
    mins = []
    maxes = []
    gfs_loader = GFSLoader()
    for param in tqdm(train_parameters):
        param_min, param_max = float("inf"), float("-inf")
        for date_key in date_keys:
            values = gfs_loader.get_gfs_image(date_key, param,
                                              prediction_offset)
            if subregion_coords is not None:
                values = get_subregion_from_GFS_slice_for_coords(
                    values, subregion_coords)
            param_min = min(np.min(values), param_min)
            param_max = max(np.max(values), param_max)

        mins.append(param_min)
        maxes.append(param_max)

    return mins, maxes
Beispiel #8
0
def get_next_gfs_values(dates, prediction_offset, lat: float, lon: float,
                        gfs_params: list, future_dates):
    """Collect (possibly interpolated) GFS point values for every date.

    Returns one list of per-param values per date, or None as soon as any
    required GFS image is missing. When ``future_dates`` is truthy, the
    forecast offset grows with the hours elapsed since the first date.
    """
    point = Coords(lat, lat, lon, lon)
    gfs_loader = GFSLoader()
    first_date = dates.values[0]

    def point_value(gfs_date, gfs_offset, param):
        # fetch one forecast image and reduce it to the value at `point`;
        # None signals a missing image
        key = gfs_loader.get_date_key(gfs_date)
        image = gfs_loader.get_gfs_image(key, param, gfs_offset)
        if image is None:
            return None
        return get_point_from_GFS_slice_for_coords(image, point)

    next_gfs_values = []
    for date in dates:
        if future_dates:
            hours_elapsed = divmod((date - first_date).total_seconds(),
                                   3600)[0]
            offset = prediction_offset + int(hours_elapsed)
        else:
            offset = prediction_offset
        gfs_dates, gfs_offsets, mod_offset = get_forecast_dates_and_offsets_for_prediction_date(
            date, offset)

        per_param = []
        for param in gfs_params:
            if mod_offset == 0:
                value = point_value(gfs_dates[0], gfs_offsets[0], param)
                if value is None:
                    return None
                per_param.append(value)
            else:
                # linearly interpolate between the two surrounding forecasts
                before = point_value(gfs_dates[0], gfs_offsets[0], param)
                after = point_value(gfs_dates[1], gfs_offsets[1], param)
                if before is None or after is None:
                    return None
                per_param.append(before * (3 - mod_offset) / 3 +
                                 after * mod_offset / 3)

        next_gfs_values.append(per_param)

    return next_gfs_values
Beispiel #9
0
class MultiChannelSpatialDataset(torch.utils.data.Dataset):
    """PyTorch dataset of multi-channel GFS images with synop targets.

    Each sample is a (channels, H, W) image — or a (sequence, channels,
    H, W) stack when the configured sequence length is > 1 — paired with
    the synop label value(s) for the corresponding forecast date(s).
    """

    def __init__(self, config: Config, train_IDs, labels, normalize=True):
        # one channel per GFS training parameter from the config file
        self.train_parameters = process_config(
            config.experiment.train_parameters_config_file)
        self.target_param = config.experiment.target_parameter
        self.labels = labels
        # (H, W) of a single GFS image
        self.dim = config.experiment.cnn_input_size
        self.prediction_offset = config.experiment.prediction_offset
        self.channels = len(self.train_parameters)
        self.normalization_type = config.experiment.normalization_type
        self.sequence_length = config.experiment.sequence_length
        self.list_IDs = train_IDs

        # samples available for the configured prediction offset only
        # (assumes train_IDs maps str(offset) -> date keys — TODO confirm)
        self.data = self.list_IDs[str(self.prediction_offset)]
        self.mean, self.std = [], []
        self.normalize = normalize
        self.gfs_loader = GFSLoader()
        if normalize:
            self.normalize_data(config.experiment.normalization_type)

    def normalize_data(self, normalization_type: NormalizationType):
        """Precompute per-channel normalization statistics.

        STANDARD normalization fills self.mean/self.std; any other type
        fills self.min/self.max. Sequence datasets use the *_for_sequence
        variants so statistics cover every frame.
        """
        if normalization_type == NormalizationType.STANDARD:
            if self.sequence_length > 1:
                self.mean, self.std = initialize_mean_and_std_for_sequence(
                    self.list_IDs, self.train_parameters, self.dim,
                    self.sequence_length, self.prediction_offset)
            else:
                self.mean, self.std = initialize_mean_and_std(
                    self.list_IDs, self.train_parameters, self.dim,
                    self.prediction_offset)
        else:
            if self.sequence_length > 1:
                self.min, self.max = initialize_min_max_for_sequence(
                    self.list_IDs, self.train_parameters, self.sequence_length,
                    self.prediction_offset)
            else:
                self.min, self.max = initialize_min_max(
                    self.list_IDs, self.train_parameters,
                    self.prediction_offset)

    def __len__(self):
        """Denotes the total number of samples."""
        return len(self.data)

    def __getitem__(self, index):
        """Generates one sample of data."""
        # Select sample
        ID = self.data[index]

        X, y = self.__data_generation(ID)

        return X, y

    def __data_generation(self, ID):
        """Build the (x, y) pair for one date key ``ID``.

        Returns a (sequence, channels, H, W) array with per-frame labels
        when sequence_length > 1, otherwise a (channels, H, W) array with a
        single label. Applies the precomputed normalization when enabled.
        """
        # Initialization
        if self.sequence_length > 1:
            x = np.empty((self.sequence_length, self.channels, *self.dim))
            y = np.empty(self.sequence_length)

            # Generate data
            for j, param in enumerate(self.train_parameters):
                # Store sample: all frames of channel j at once
                x[:,
                  j, ] = get_GFS_values_for_sequence(ID, param,
                                                     self.sequence_length,
                                                     self.prediction_offset)
                if self.normalize:
                    if self.normalization_type == NormalizationType.STANDARD:
                        x[:, j, ] = (x[:, j, ] - self.mean[j]) / self.std[j]
                    else:
                        x[:, j, ] = (x[:, j, ] - self.min[j]) / (self.max[j] -
                                                                 self.min[j])

            # one synop label per frame, 3 hours apart from the first date
            # NOTE(review): .values[0] raises IndexError if a label row is
            # missing — sequences are presumably pre-filtered; confirm.
            first_forecast_date = date_from_gfs_date_key(ID)
            labels = [
                self.labels[self.labels["date"] == first_forecast_date +
                            timedelta(hours=offset *
                                      3)][self.target_param].values[0]
                for offset in range(0, self.sequence_length)
            ]
            y[:] = labels
        else:
            x = np.empty((self.channels, *self.dim))
            y = np.empty(1)

            # Generate data
            for j, param in enumerate(self.train_parameters):
                # Store sample: single image for channel j
                x[j, ] = self.gfs_loader.get_gfs_image(ID, param,
                                                       self.prediction_offset)
                if self.normalize:
                    if self.normalization_type == NormalizationType.STANDARD:
                        x[j, ] = (x[j, ] - self.mean[j]) / self.std[j]
                    else:
                        x[j, ] = (x[j, ] - self.min[j]) / (self.max[j] -
                                                           self.min[j])

            forecast_date = date_from_gfs_date_key(ID)
            label = self.labels[self.labels["date"] == forecast_date][
                self.target_param]

            y[0] = label.values[0]
        return x, y