def __init__(self, config: Config):
    super().__init__()
    self.config = config
    self.val_split = config.experiment.val_split
    self.batch_size = config.experiment.batch_size
    self.shuffle = config.experiment.shuffle
    self.dataset_train = ...
    self.dataset_val = ...
    self.dataset_test = ...
    self.train_parameters = process_config(
        config.experiment.train_parameters_config_file)
    self.prediction_offset = config.experiment.prediction_offset
    self.synop_file = config.experiment.synop_file
    self.target_param = config.experiment.target_parameter
    self.sequence_length = config.experiment.sequence_length

    # Load SYNOP observations for the target parameter, restricted to the
    # configured year range; normalization statistics of the labels are
    # returned alongside the data.
    self.labels, self.label_mean, self.label_std = prepare_synop_dataset(
        self.synop_file, [self.target_param],
        dataset_dir=SYNOP_DATASETS_DIRECTORY,
        from_year=config.experiment.synop_from_year,
        to_year=config.experiment.synop_to_year)

    # Keep only the GFS date keys for which a full forecast sequence and
    # matching labels exist.
    available_ids = get_available_gfs_date_keys(
        self.train_parameters, self.prediction_offset, self.sequence_length)
    self.IDs = initialize_GFS_date_keys_for_sequence(
        available_ids, self.labels, self.train_parameters,
        self.target_param, self.sequence_length)
def __init__(self, config: Config): """Initialization""" self.train_parameters = process_config( config.experiment.train_parameters_config_file) self.target_param = config.experiment.target_parameter self.synop_file = config.experiment.synop_file self.prediction_offset = config.experiment.prediction_offset self.target_coords = config.experiment.target_coords synop_data, synop_mean, synop_std = prepare_synop_dataset( self.synop_file, [self.target_param], dataset_dir=SYNOP_DATASETS_DIRECTORY, from_year=config.experiment.synop_from_year, to_year=config.experiment.synop_to_year) synop_data_dates = synop_data['date'] labels = pd.concat([synop_data_dates, synop_data[self.target_param]], axis=1).to_numpy().tolist() _, self.gfs_data, self.targets = match_gfs_with_synop_sequence( labels, labels, self.target_coords[0], self.target_coords[1], self.prediction_offset, self.train_parameters) self.targets = self.targets.reshape((len(self.targets), 1)) if config.experiment.normalization_type == NormalizationType.STANDARD: self.gfs_data = (self.gfs_data - np.mean( self.gfs_data, axis=0)) / np.std(self.gfs_data, axis=0) else: self.gfs_data = (self.gfs_data - np.min(self.gfs_data, axis=0)) / ( np.max(self.gfs_data, axis=0) - np.min(self.gfs_data, axis=0)) assert len(self.gfs_data) == len(self.targets) self.data = list(zip(self.gfs_data, self.targets)) print(synop_mean) print(synop_std)
def __init__(self, config: Config, list_IDs, train=True, normalize=True):
    """Initialization"""
    self.list_IDs = list_IDs
    self.train_parameters = process_config(
        config.experiment.train_parameters_config_file)
    self.target_param = config.experiment.target_parameter
    self.synop_file = config.experiment.synop_file
    self.labels, self.label_mean, self.label_std = prepare_synop_dataset(
        self.synop_file, [self.target_param],
        dataset_dir=SYNOP_DATASETS_DIRECTORY)
    self.dim = config.experiment.cnn_input_size
    self.channels = len(self.train_parameters)
    self.normalization_type = config.experiment.normalization_type

    # Deterministic 80/20 split of the IDs into training and test sets.
    length = len(self.list_IDs)
    training_data, test_data = (self.list_IDs[:int(length * 0.8)],
                                self.list_IDs[int(length * 0.8):])
    self.data = training_data if train else test_data

    # Normalization statistics are computed over all IDs, not just this split.
    self.mean, self.std = [], []
    self.normalize = normalize
    if normalize:
        if config.experiment.normalization_type == NormalizationType.STANDARD:
            self.mean, self.std = initialize_mean_and_std(
                self.list_IDs, self.train_parameters, self.dim)
        else:
            self.min, self.max = initialize_min_max(
                self.list_IDs, self.train_parameters)
def prepare_data(self, *args, **kwargs):
    self.synop_data = prepare_synop_dataset(
        self.synop_file, list(list(zip(*self.train_params))[1]),
        dataset_dir=SYNOP_DATASETS_DIRECTORY,
        from_year=self.synop_from_year,
        to_year=self.synop_to_year,
        norm=False)

    dates = get_correct_dates_for_sequence(self.synop_data,
                                           self.sequence_length,
                                           self.future_sequence_length,
                                           self.prediction_offset)

    self.synop_data = self.synop_data.reset_index()
    # Get the indices which correspond to 'dates' - the dates that start a
    # proper sequence without breaks.
    self.synop_data_indices = self.synop_data[
        self.synop_data["date"].isin(dates)].index

    # The data was not normalized yet, so take all frames that will be used,
    # compute their mean and std, and normalize.
    self.synop_data, synop_mean, synop_std = normalize_synop_data(
        self.synop_data, self.synop_data_indices, self.feature_names,
        self.sequence_length + self.prediction_offset
        + self.future_sequence_length,
        self.normalization_type)
    self.synop_mean = synop_mean[self.target_param_index]
    self.synop_std = synop_std[self.target_param_index]
    print(f"Synop mean: {self.synop_mean}")
    print(f"Synop std: {self.synop_std}")
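# Illustrative sketch, not from the source: how the "date".isin(...) lookup
# above yields the row indices that start valid sequences. Toy data only.
def _sequence_start_indices_demo():
    import pandas as pd
    df = pd.DataFrame(
        {"date": pd.to_datetime(["2020-01-01", "2020-01-02", "2020-01-03"])}
    ).reset_index()
    dates = [pd.Timestamp("2020-01-02")]
    # Same pattern as above: boolean mask on "date", then take the index.
    indices = df[df["date"].isin(dates)].index
    return list(indices)  # -> [1]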
def prepare_data(gfs_dir: str, synop_dir: str, start_seq: int, end_seq: int,
                 gfs_past: int, gfs_future: int, synop_length: int,
                 train_split: int, column_name_label: str):
    features = [
        consts.DIRECTION_COLUMN, consts.VELOCITY_COLUMN, consts.GUST_COLUMN,
        consts.TEMPERATURE, consts.PRESSURE, consts.CURRENT_WEATHER
    ]
    synop_dataset, _, _ = prepare_synop_dataset(
        synop_dir, list(list(zip(*features))[1]),
        dataset_dir=SYNOP_DATASETS_DIRECTORY)
    gfs_dataset, gfs_hour_diff = prepare_gfs_sequence_dataset(
        gfs_dir, start_seq, end_seq, gfs_past, gfs_future)

    # Clip both datasets to the date range they have in common.
    least_recent_date = get_oldest_gfs_date(gfs_dataset)
    recent_synop_date = synop_dataset["date"].max()
    synop_dataset = synop_dataset[
        synop_dataset["date"] >= pd.to_datetime(least_recent_date)
        - timedelta(hours=synop_length)]
    gfs_dataset = filter_gfs(gfs_dataset, recent_synop_date, gfs_hour_diff,
                             start_seq + end_seq)

    synop_dataset_input = []
    gfs_dataset_input = []
    dataset_label = []

    print("Creating data set")
    for key in tqdm.tqdm(gfs_dataset.keys()):
        gfs_creation_date = datetime.strptime(key, '%Y-%m-%d-%HZ')
        single_gfs = gfs_dataset[key]
        single_gfs = convert_wind(single_gfs, "U-wind, m/s", "V-wind, m/s")

        for hour in range(gfs_hour_diff):
            try:
                synop_index = synop_dataset.index[
                    synop_dataset['date'] == gfs_creation_date][0]
                synop_input_index = hour + synop_index - synop_length + 1
                synop_input = synop_dataset.loc[
                    synop_input_index:synop_input_index + synop_length]
                label = synop_dataset.loc[
                    hour + synop_index + start_seq:
                    hour + synop_index + end_seq]

                synop_input = synop_input.drop(
                    ['date', 'year', 'day', 'month'], axis=1).values
                gfs_input = normalize(single_gfs.drop(['date'], axis=1).values)
                label = label[column_name_label].values

                synop_dataset_input.append(synop_input)
                gfs_dataset_input.append(gfs_input)
                dataset_label.append(label)
            except (IndexError, KeyError):
                # Skip GFS runs whose creation date has no matching SYNOP row
                # or whose window falls outside the SYNOP index.
                continue

    return (np.array(synop_dataset_input), np.array(gfs_dataset_input),
            np.array(dataset_label))
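# Illustrative sketch, not from the source: the GFS dictionary keys consumed
# above follow the '%Y-%m-%d-%HZ' pattern; the sample key here is made up.
def _gfs_key_parsing_demo():
    from datetime import datetime
    key = "2017-03-05-06Z"  # hypothetical run key: date plus init hour
    run_init = datetime.strptime(key, '%Y-%m-%d-%HZ')
    return run_init  # -> datetime(2017, 3, 5, 6, 0)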
def explore_synop(localisation_name: str, code_fallback: int):
    relevant_features = [f for f in SYNOP_FEATURES
                         if f[1] not in ['year', 'month', 'day', 'hour']]
    synop_file = f"{localisation_name}_{code_fallback}_data.csv"

    # Fetch and preprocess the raw SYNOP CSV if it is not cached yet.
    if not os.path.exists(os.path.join(SYNOP_DATASETS_DIRECTORY, synop_file)):
        prepare_synop_csv(localisation_name, code_fallback, SYNOP_FEATURES)

    data = prepare_synop_dataset(synop_file,
                                 list(list(zip(*relevant_features))[1]),
                                 norm=False,
                                 dataset_dir=SYNOP_DATASETS_DIRECTORY)
    explore_synop_correlations(data, relevant_features, localisation_name)
    explore_synop_patterns(data, relevant_features, localisation_name)
def main(station: str, target_param: str, prediction_offset: int,
         sequence_length: int, from_year: int):
    synop_file = STATION_META[station]['synop_file']
    synop_data = prepare_synop_dataset(synop_file, [target_param],
                                       norm=False,
                                       dataset_dir=SYNOP_DATASETS_DIRECTORY,
                                       from_year=from_year)
    labels = synop_data[['date', target_param]]

    available_gfs_date_keys = get_available_gfs_date_keys(
        target_param_to_gfs_name_level(target_param), prediction_offset,
        sequence_length, from_year)
    compare_gfs_with_synop(labels, available_gfs_date_keys, target_param,
                           STATION_META[station]['lat'],
                           STATION_META[station]['lon'],
                           prediction_offset, sequence_length)
def prepare_data(past_len=12, future_offset=12, train_split_factor=0.75):
    dataset_train, _, _ = prepare_synop_dataset(
        "preprocess/synop_data/135_data.csv", "")
    gfs_dataset = prepare_gfs_dataset_for_single_point_time(
        "preprocess/wind_and_temp", 'date', future_offset)

    # Align the two datasets on the date axis: keep SYNOP rows that can form
    # a full past window and GFS rows that lie past the forecast offset.
    least_recent_date = gfs_dataset["date"].min()
    dataset_train = dataset_train[
        dataset_train["date"] >= least_recent_date - timedelta(hours=past_len)]
    gfs_dataset = gfs_dataset[
        gfs_dataset["date"] > least_recent_date
        + timedelta(hours=future_offset)]
    gfs_dataset = gfs_dataset.sort_values(by=["date"])

    train_synop_input, gfs_input, train_synop_label = create_sequence(
        dataset_train, gfs_dataset, past_len, future_offset)

    # Chronological train/validation split.
    train_size = int(train_synop_input.shape[0] * train_split_factor)
    x_train = [train_synop_input[:train_size], gfs_input[:train_size]]
    y_train = train_synop_label[:train_size]
    x_valid = [train_synop_input[train_size:], gfs_input[train_size:]]
    y_valid = train_synop_label[train_size:]
    return x_train, y_train, x_valid, y_valid
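# Illustrative sketch, not from the source: the timedelta-based date-window
# filtering used above, run on a toy DataFrame with the same column name.
def _date_window_demo(past_len=12, future_offset=12):
    import pandas as pd
    from datetime import timedelta
    df = pd.DataFrame(
        {"date": pd.date_range("2020-01-01", periods=48, freq="h")})
    oldest = df["date"].min()
    # keep rows that can supply a full `past_len`-hour history ...
    past_part = df[df["date"] >= oldest - timedelta(hours=past_len)]
    # ... and rows strictly beyond the forecast offset, chronologically sorted
    future_part = df[df["date"] > oldest + timedelta(hours=future_offset)]
    return past_part, future_part.sort_values(by=["date"])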
def __init__(self, config: Config):
    super().__init__()
    self.config = config
    self.val_split = config.experiment.val_split
    self.batch_size = config.experiment.batch_size
    self.shuffle = config.experiment.shuffle
    self.dataset_train = ...
    self.dataset_val = ...
    self.dataset_test = ...
    self.synop_file = config.experiment.synop_file
    self.train_params = config.experiment.synop_train_features
    self.target_param = config.experiment.target_parameter
    self.sequence_length = config.experiment.sequence_length

    # Load raw (unnormalized) SYNOP data for all training features.
    self.labels = prepare_synop_dataset(
        self.synop_file, list(list(zip(*self.train_params))[1]),
        dataset_dir=SYNOP_DATASETS_DIRECTORY,
        from_year=config.experiment.synop_from_year,
        to_year=config.experiment.synop_to_year,
        norm=False)

    # Keep only dates that start an unbroken sequence of the required length.
    self.dates = get_correct_dates_for_sequence(
        self.labels, self.sequence_length, 1,
        config.experiment.prediction_offset)
def prepare_target_attribute_dataset(synop_data_file, target_attribute,
                                     init_date, end_date):
    dataset, _, _ = prepare_synop_dataset(synop_data_file, [target_attribute],
                                          SYNOP_DATASETS_DIRECTORY)
    return filter_for_dates(dataset, init_date, end_date)