def get_mean_and_std_cmax(list_IDs: list[str], dim: tuple[int, int], sequence_length: int,
                          future_sequence_length: int = 0, prediction_offset: int = 0):
    # Bear in mind that date_keys are indices of the FIRST frame in a sequence.
    # Because of that, not every frame referenced below exists in date_keys itself.
    log.info("Calculating std and mean for the CMAX dataset")

    # Expand every sequence start into the full set of frame keys it covers.
    all_ids = {
        get_cmax_datekey_from_offset(id, offset)
        for id in list_IDs
        for offset in range(0, sequence_length + future_sequence_length + prediction_offset)
    }

    mean, sqr_mean = 0, 0
    denom = len(all_ids) * dim[0] * dim[1] / 4
    cmax_loader = CMAXLoader()
    for id in tqdm(all_ids):
        values = cmax_loader.get_cmax_image(id)
        # Accumulate E[x] and E[x^2] incrementally so the whole dataset never has to sit in memory.
        mean += np.sum(values) / denom
        sqr_mean += np.sum(np.power(values, 2)) / denom

    # std = sqrt(E[x^2] - E[x]^2)
    std = math.sqrt(sqr_mean - pow(mean, 2))
    return cmax_loader.get_all_loaded_cmax_images(), mean, std


def initialize_mean_and_std_for_sequence(date_keys: dict, train_parameters, dim: tuple[int, int],
                                         sequence_length: int, prediction_offset: int,
                                         subregion_coords: Coords = None):
    log.info("Calculating std and mean for a dataset")

    means = []
    stds = []
    for param in tqdm(train_parameters):
        sum_, sqr_sum = 0, 0
        for id in tqdm(date_keys[prediction_offset]):
            values = np.squeeze(
                get_GFS_values_for_sequence(id, param, sequence_length, prediction_offset,
                                            subregion_coords))
            sum_ += np.sum(values)
            sqr_sum += np.sum(np.power(values, 2))

        # Normalise by the number of values actually summed: every sequence for this offset,
        # each containing sequence_length frames of dim[0] x dim[1] pixels.
        denom = len(date_keys[prediction_offset]) * sequence_length * dim[0] * dim[1]
        mean = sum_ / denom
        means.append(mean)
        stds.append(math.sqrt(sqr_sum / denom - pow(mean, 2)))

    return means, stds


def initialize_mean_and_std(date_keys, train_parameters, dim: tuple[int, int],
                            prediction_offset: int, subregion_coords=None):
    log.info("Calculating std and mean for a dataset")

    means = []
    stds = []
    gfs_loader = GFSLoader()
    for param in tqdm(train_parameters):
        sum_, sqr_sum = 0, 0
        for date_key in tqdm(date_keys):
            values = gfs_loader.get_gfs_image(date_key, param, prediction_offset)
            if subregion_coords is not None:
                values = get_subregion_from_GFS_slice_for_coords(values, subregion_coords)
            sum_ += np.sum(values)
            sqr_sum += np.sum(np.power(values, 2))

        denom = len(date_keys) * dim[0] * dim[1]
        mean = sum_ / denom
        means.append(mean)
        stds.append(math.sqrt(sqr_sum / denom - pow(mean, 2)))

    return means, stds


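# The three functions above all rely on the same streaming identity, std = sqrt(E[x^2] - E[x]^2),
# accumulated with running sums so no dataset has to be materialised at once. The snippet below is
# only an illustrative sanity check of that identity on random data; it is not part of the pipeline,
# and the sizes and tolerance are arbitrary choices.
def _streaming_mean_std_check():
    rng = np.random.default_rng(0)
    frames = [rng.normal(loc=5.0, scale=2.0, size=(16, 16)) for _ in range(100)]

    denom = len(frames) * 16 * 16
    running_sum, running_sqr_sum = 0.0, 0.0
    for frame in frames:
        running_sum += np.sum(frame)
        running_sqr_sum += np.sum(np.power(frame, 2))

    mean = running_sum / denom
    std = math.sqrt(running_sqr_sum / denom - mean ** 2)

    # The streaming estimates agree with the exact statistics of the stacked array.
    stacked = np.stack(frames)
    assert abs(mean - stacked.mean()) < 1e-6
    assert abs(std - stacked.std()) < 1e-6

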
def get_min_max_cmax(list_IDs: list[str], sequence_length: int, future_sequence_length: int = 0,
                     prediction_offset: int = 0):
    # Expand every sequence start into the full set of frame keys it covers.
    all_ids = {
        get_cmax_datekey_from_offset(id, offset)
        for id in list_IDs
        for offset in range(0, sequence_length + future_sequence_length + prediction_offset)
    }

    cmax_loader = CMAXLoader()
    log.info("Loading CMAX files into the runtime.")
    for id in tqdm(all_ids):
        # Loading caches the image inside the loader; the returned values are not needed here.
        cmax_loader.get_cmax_image(id)

    # CMAX values have a known range, so return the constants instead of scanning the data.
    return cmax_loader.get_all_loaded_cmax_images(), CMAX_MIN, CMAX_MAX


def initialize_min_max_for_sequence(list_IDs: list[str], train_parameters, sequence_length: int,
                                    prediction_offset: int, subregion_coords=None):
    log.info("Calculating min and max for the GFS dataset")

    mins = []
    maxes = []
    for param in tqdm(train_parameters):
        min_value, max_value = sys.float_info.max, -sys.float_info.max
        for id in list_IDs:
            values = np.squeeze(
                get_GFS_values_for_sequence(id, param, sequence_length, prediction_offset,
                                            subregion_coords))
            min_value = min(np.min(values), min_value)
            max_value = max(np.max(values), max_value)
        mins.append(min_value)
        maxes.append(max_value)

    return mins, maxes


def initialize_min_max(date_keys: list[str], train_parameters, prediction_offset: int,
                       subregion_coords=None):
    log.info("Calculating min and max for a dataset")

    mins = []
    maxes = []
    gfs_loader = GFSLoader()
    for param in tqdm(train_parameters):
        min_value, max_value = sys.float_info.max, -sys.float_info.max
        for date_key in date_keys:
            values = gfs_loader.get_gfs_image(date_key, param, prediction_offset)
            if subregion_coords is not None:
                values = get_subregion_from_GFS_slice_for_coords(values, subregion_coords)
            min_value = min(np.min(values), min_value)
            max_value = max(np.max(values), max_value)
        mins.append(min_value)
        maxes.append(max_value)

    return mins, maxes


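# A minimal sketch of how the per-parameter statistics returned above could be applied.
# Both helpers are hypothetical (they do not exist elsewhere in this module) and assume `values`
# is a single 2D slice for the parameter at index `param_idx`.
def _normalize_with_mean_std(values: np.ndarray, means, stds, param_idx: int) -> np.ndarray:
    # Standard-score normalisation using the lists produced by the initialize_mean_and_std* helpers.
    return (values - means[param_idx]) / stds[param_idx]


def _normalize_with_min_max(values: np.ndarray, mins, maxes, param_idx: int) -> np.ndarray:
    # Scales values into [0, 1] using the lists produced by the initialize_min_max* helpers.
    return (values - mins[param_idx]) / (maxes[param_idx] - mins[param_idx])

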
def main(cfg: Config):
    cfg.experiment.train_parameters_config_file = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'config', 'train_parameters',
        cfg.experiment.train_parameters_config_file)
    log.info(f'\\[init] Loaded config:\n{OmegaConf.to_yaml(cfg, resolve=True)}')

    pl.seed_everything(cfg.experiment.seed)

    RUN_NAME = os.getenv('RUN_NAME')
    log.info(f'[bold yellow]\\[init] Run name --> {RUN_NAME}')
    run: Run = wandb_logger.experiment  # type: ignore

    # Setup logging & checkpointing
    tags = get_tags(cast(DictConfig, cfg))
    run.tags = tags
    run.notes = str(cfg.notes)
    wandb_logger.log_hyperparams(OmegaConf.to_container(cfg, resolve=True))  # type: ignore
    log.info(f'[bold yellow][{RUN_NAME} / {run.id}]: [bold white]{",".join(tags)}')

    setproctitle.setproctitle(f'{RUN_NAME} ({os.getenv("WANDB_PROJECT")})')  # type: ignore

    log.info('[bold white]Overriding cfg.lightning settings with derived values:')
    log.info(f'  >>> num_sanity_val_steps = {-1 if cfg.experiment.validate_before_training else 0}\n')

    # Create main system (system = models + training regime)
    system: LightningModule = instantiate(cfg.experiment.system, cfg)
    log.info('[bold yellow]\\[init] System architecture:')
    log.info(system)

    # Prepare data using datamodules
    datamodule: LightningDataModule = instantiate(cfg.experiment.datamodule, cfg)

    resume_path = get_resume_checkpoint(cfg, wandb_logger)
    if resume_path is not None:
        log.info(f'[bold yellow]\\[checkpoint] [bold white]{resume_path}')

    checkpointer = CustomCheckpointer(
        period=1,
        dirpath='checkpoints',
        filename='{epoch}',
    )

    trainer: pl.Trainer = instantiate(
        cfg.lightning,
        logger=wandb_logger,
        max_epochs=cfg.experiment.epochs,
        callbacks=[checkpointer],
        resume_from_checkpoint=resume_path,
        checkpoint_callback=bool(cfg.experiment.save_checkpoints),
        num_sanity_val_steps=-1 if cfg.experiment.validate_before_training else 0)

    trainer.fit(system, datamodule=datamodule)
    trainer.test(system, datamodule=datamodule)

    wandb_logger.log_metrics(
        {
            'target_mean': datamodule.dataset_test.mean,
            'target_std': datamodule.dataset_test.std
        },
        step=system.current_epoch)

    mean = datamodule.dataset_test.mean
    std = datamodule.dataset_test.std
    plot_results(system, cfg, mean, std)

    if trainer.interrupted:  # type: ignore
        log.info('[bold red]>>> Training interrupted.')
        run.finish(exit_code=255)
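
# Hypothetical entry point: `main` expects an already-composed config object, which in this
# codebase appears to come from Hydra (instantiate/OmegaConf are used above). The config path
# and name below are assumptions for illustration only; the project may wire this up differently.
#
# import hydra
#
# @hydra.main(config_path='config', config_name='default')
# def run_experiment(cfg: Config):
#     main(cfg)
#
# if __name__ == '__main__':
#     run_experiment()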