def load_model(filesystem: S3FS, config: PredictionConfig) -> ResUnetA:
    """ Copy the model locally if not existing and load it """
    # Cache the model checkpoints and config locally if they are not there yet
    if not os.path.exists(f'{config.temp_model_path}/{config.model_name}'):
        if not filesystem.exists(f'{config.model_path}/{config.model_name}/checkpoints/'):
            filesystem.makedirs(f'{config.model_path}/{config.model_name}/checkpoints/')
        copy_dir(filesystem,
                 f'{config.model_path}/{config.model_name}/checkpoints/',
                 f'{config.temp_model_path}/{config.model_name}',
                 'checkpoints')
        copy_file(filesystem,
                  f'{config.model_path}/{config.model_name}/model_cfg.json',
                  f'{config.temp_model_path}/{config.model_name}',
                  'model_cfg.json')

    input_shape = dict(features=[None, config.height, config.width, config.n_channels])

    with open(f'{config.temp_model_path}/{config.model_name}/model_cfg.json', 'r') as jfile:
        model_cfg = json.load(jfile)

    # initialise model from config, build, compile and load trained weights
    model = ResUnetA(model_cfg)
    model.build(input_shape)
    model.net.compile()
    model.net.load_weights(f'{config.temp_model_path}/{config.model_name}/checkpoints/model.ckpt')

    return model
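
# Usage sketch (illustrative, not taken from this module): assumes `PredictionConfig` can be
# constructed with just the fields referenced above, and that the bucket name, paths and model
# name below are replaced with real values.
#
#   filesystem = S3FS('my-field-delineation-bucket')          # placeholder bucket
#   config = PredictionConfig(model_path='models',            # model directory in the bucket
#                             temp_model_path='/tmp/models',  # local cache directory
#                             model_name='resunet-a',
#                             height=256, width=256, n_channels=4)
#   model = load_model(filesystem, config)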
def load_metadata(filesystem: S3FS, config: PredictionConfig) -> pd.DataFrame:
    """ Load DataFrame with info about normalisation factors """
    metadata_dir = os.path.dirname(config.metadata_path)
    if not filesystem.exists(metadata_dir):
        filesystem.makedirs(metadata_dir)

    df = pd.read_csv(filesystem.open(f'{config.metadata_path}'))

    # aggregate to one row of normalisation factors per month (max over that month's entries)
    normalisation_factors = df.groupby(pd.to_datetime(df.timestamp).dt.to_period("M")).max()
    normalisation_factors['month'] = pd.to_datetime(normalisation_factors.timestamp).dt.month

    return normalisation_factors
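
# Usage sketch (illustrative; `filesystem` and `config` as in the `load_model` sketch above).
# Only the `month` column used here is guaranteed by `load_metadata`; the remaining columns
# depend on the metadata csv.
#
#   normalisation_factors = load_metadata(filesystem, config)
#   timestamp = pd.Timestamp('2021-06-15')
#   month_factors = normalisation_factors[normalisation_factors.month == timestamp.month]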
def npz_dir_dataset(file_dir_or_list: Union[str, List[str]], features: dict, metadata_path: str,
                    fold: int = None, randomize: bool = True, num_parallel: int = 5,
                    shuffle_size: int = 500, filesystem: S3FS = None,
                    npz_from_s3: bool = False) -> tf.data.Dataset:
    """ Creates a tf.data.Dataset from a directory containing numpy .npz files.

    Files are loaded lazily when needed. `num_parallel` files are read in parallel and interleaved together.

    :param file_dir_or_list: directory containing .npz files or a list of paths to .npz files
    :param features: dict of (`field` -> `feature_name`) mappings, where `field` is the field in the .npz array
                     and `feature_name` is the name of the feature it is saved to.
    :param metadata_path: path to input csv file with patchlet information
    :param fold: in k-fold validation, fold to consider when querying the patchlet info dataframe
    :param randomize: whether to shuffle the samples of the dataset or not, defaults to `True`
    :param num_parallel: number of files to read in parallel and interleave, defaults to 5
    :param shuffle_size: buffer size for shuffling file order, defaults to 500
    :param filesystem: filesystem to access bucket, defaults to None
    :param npz_from_s3: if True, npz files are loaded from S3 bucket, otherwise from local disk
    :return: dataset containing examples merged from files
    """
    files = file_dir_or_list

    if npz_from_s3:
        assert filesystem is not None

    # If a directory is given, list the .npz files it contains
    if isinstance(file_dir_or_list, str):
        if filesystem and not filesystem.isdir(file_dir_or_list):
            filesystem.makedirs(file_dir_or_list)
        dir_list = os.listdir(file_dir_or_list) if not npz_from_s3 else filesystem.listdir(file_dir_or_list)
        files = [os.path.join(file_dir_or_list, f) for f in dir_list]

    fields = list(features.keys())

    # Read one file for shape info
    file = next(iter(files))
    data = np.load(file) if not npz_from_s3 else np.load(filesystem.openbin(file))
    np_arrays = [data[f] for f in fields]

    # Append normalisation arrays
    perc99, meanstd_mean, meanstd_median, meanstd_std = _construct_norm_arrays(file, metadata_path, fold, filesystem)

    np_arrays.append(perc99)
    np_arrays.append(meanstd_mean)
    np_arrays.append(meanstd_median)
    np_arrays.append(meanstd_std)

    # Read shape and type info
    # types = tuple(arr.dtype for arr in np_arrays)
    types = (tf.uint16, tf.float32, tf.float32, tf.float32, tf.float64, tf.float64, tf.float64, tf.float64)
    shapes = tuple(arr.shape[1:] for arr in np_arrays)

    # Create a lazily loaded dataset per file
    datasets = [_npz_file_lazy_dataset(file, fields, types, shapes, metadata_path, fold=fold,
                                       filesystem=filesystem, npz_from_s3=npz_from_s3) for file in files]
    ds = tf.data.Dataset.from_tensor_slices(datasets)

    # Shuffle files and interleave multiple files in parallel
    if randomize:
        ds = ds.shuffle(shuffle_size)

    ds = ds.interleave(lambda x: x, cycle_length=num_parallel)

    return ds
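
# Usage sketch (illustrative): build a training pipeline from a local directory of .npz files.
# The field -> feature-name mapping and the paths below are placeholders that depend on how the
# patchlets were sampled; they are not defined by this module.
#
#   features = {'X': 'features', 'y_extent': 'extent', 'y_boundary': 'boundary', 'y_distance': 'distance'}
#   dataset = npz_dir_dataset('/data/patchlets_npz', features,
#                             metadata_path='/data/patchlet-info.csv',
#                             fold=1, randomize=True)
#   dataset = dataset.batch(8).prefetch(tf.data.AUTOTUNE)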