Example #1
import os
import json

# S3FS (from the fs-s3fs package), PredictionConfig, ResUnetA, copy_dir and
# copy_file are provided by the surrounding project and assumed to be in scope.

def load_model(filesystem: S3FS, config: PredictionConfig) -> ResUnetA:
    """ Copy the model locally if it is not present yet and load it """
    if not os.path.exists(f'{config.temp_model_path}/{config.model_name}'):
        # create the local destination directory, then pull the checkpoints
        # and the model config from the bucket
        os.makedirs(f'{config.temp_model_path}/{config.model_name}')
        copy_dir(filesystem,
                 f'{config.model_path}/{config.model_name}/checkpoints/',
                 f'{config.temp_model_path}/{config.model_name}',
                 'checkpoints')
        copy_file(filesystem,
                  f'{config.model_path}/{config.model_name}/model_cfg.json',
                  f'{config.temp_model_path}/{config.model_name}',
                  'model_cfg.json')

    input_shape = dict(
        features=[None, config.height, config.width, config.n_channels])

    with open(f'{config.temp_model_path}/{config.model_name}/model_cfg.json',
              'r') as jfile:
        model_cfg = json.load(jfile)

    # initialise model from config, build, compile and load trained weights
    model = ResUnetA(model_cfg)
    model.build(input_shape)
    model.net.compile()
    model.net.load_weights(
        f'{config.temp_model_path}/{config.model_name}/checkpoints/model.ckpt')

    return model
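A hedged usage sketch follows; the bucket name and the PredictionConfig fields shown are illustrative assumptions, not values taken from the project:

# Minimal usage sketch for load_model; all concrete values are hypothetical.
filesystem = S3FS('my-project-bucket')  # needs valid AWS credentials
config = PredictionConfig(
    model_path='models',              # remote folder that holds trained models
    model_name='resunet-a',           # hypothetical model name
    temp_model_path='/tmp/models',    # local cache directory
    height=256, width=256, n_channels=4)

model = load_model(filesystem, config)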
Example #2
import os
import pandas as pd

def load_metadata(filesystem: S3FS, config: PredictionConfig) -> pd.DataFrame:
    """ Load a DataFrame with information about the normalisation factors """
    metadata_dir = os.path.dirname(config.metadata_path)
    if not filesystem.exists(metadata_dir):
        filesystem.makedirs(metadata_dir)

    df = pd.read_csv(filesystem.open(config.metadata_path))

    # keep, for every calendar month, the maximum of each normalisation factor
    normalisation_factors = df.groupby(
        pd.to_datetime(df.timestamp).dt.to_period("M")).max()

    normalisation_factors['month'] = pd.to_datetime(
        normalisation_factors.timestamp).dt.month

    return normalisation_factors
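To make the monthly aggregation concrete, here is a small self-contained sketch of the same groupby pattern on synthetic data; the column name norm_perc99 is an illustrative assumption:

import pandas as pd

# Synthetic metadata with a timestamp column and one normalisation factor.
df = pd.DataFrame({
    'timestamp': ['2021-03-02', '2021-03-20', '2021-04-05'],
    'norm_perc99': [1200.0, 1500.0, 900.0]})

# Group rows by calendar month and keep the per-month maximum of each column.
monthly = df.groupby(pd.to_datetime(df.timestamp).dt.to_period('M')).max()
monthly['month'] = pd.to_datetime(monthly.timestamp).dt.month

# monthly now has one row per month (2021-03 and 2021-04), holding the
# largest timestamp and normalisation factor seen in that month.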
Example #3
import os
from typing import List, Union

import numpy as np
import tensorflow as tf
from fs_s3fs import S3FS

# _construct_norm_arrays and _npz_file_lazy_dataset are private helpers
# defined elsewhere in the same module.

def npz_dir_dataset(file_dir_or_list: Union[str, List[str]],
                    features: dict,
                    metadata_path: str,
                    fold: int = None,
                    randomize: bool = True,
                    num_parallel: int = 5,
                    shuffle_size: int = 500,
                    filesystem: S3FS = None,
                    npz_from_s3: bool = False) -> tf.data.Dataset:
    """ Creates a tf.data.Dataset from a directory containing numpy .npz files.

    Files are loaded lazily when needed. `num_parallel` files are read in parallel and interleaved together.

    :param file_dir_or_list: directory containing .npz files or a list of paths to .npz files
    :param features: dict of (`field` -> `feature_name`) mappings, where `field` is the field in the .npz array
                   and `feature_name` is the name of the feature it is saved to.
    :param fold: in k-fold validation, fold to consider when querying the patchlet info dataframe
    :param randomize: whether to shuffle the samples of the dataset or not, defaults to `True`
    :param num_parallel: number of files to read in parallel and intereleave, defaults to 5
    :param shuffle_size: buffer size for shuffling file order, defaults to 500
    :param metadata_path: path to input csv files with patchlet information
    :param filesystem: filesystem to access bucket, defaults to None
    :param npz_from_s3: if True, npz files are loaded from S3 bucket, otherwise from local disk
    :return: dataset containing examples merged from files
    """

    files = file_dir_or_list

    if npz_from_s3:
        assert filesystem is not None, 'A filesystem is required to load npz files from S3'

    # If a directory is given, list the .npz files in it
    if isinstance(file_dir_or_list, str):
        if filesystem and not filesystem.isdir(file_dir_or_list):
            filesystem.makedirs(file_dir_or_list)
        if npz_from_s3:
            dir_list = filesystem.listdir(file_dir_or_list)
        else:
            dir_list = os.listdir(file_dir_or_list)
        files = [os.path.join(file_dir_or_list, f) for f in dir_list]

    fields = list(features.keys())

    # Read a single file to infer the array shapes
    file = next(iter(files))
    if npz_from_s3:
        data = np.load(filesystem.openbin(file))
    else:
        data = np.load(file)
    np_arrays = [data[f] for f in fields]

    # Append the normalisation arrays computed from the patchlet metadata
    perc99, meanstd_mean, meanstd_median, meanstd_std = _construct_norm_arrays(
        file, metadata_path, fold, filesystem)

    np_arrays.extend([perc99, meanstd_mean, meanstd_median, meanstd_std])

    # Element types are fixed to match the stored arrays; shapes (without the
    # sample dimension) are taken from the file read above
    types = (tf.uint16, tf.float32, tf.float32, tf.float32, tf.float64,
             tf.float64, tf.float64, tf.float64)
    shapes = tuple(arr.shape[1:] for arr in np_arrays)

    # Create datasets
    datasets = [
        _npz_file_lazy_dataset(file,
                               fields,
                               types,
                               shapes,
                               metadata_path,
                               fold=fold,
                               filesystem=filesystem,
                               npz_from_s3=npz_from_s3) for file in files
    ]
    # Build a dataset whose elements are the per-file datasets; interleaving
    # with an identity map then flattens it, reading `num_parallel` files at a time
    ds = tf.data.Dataset.from_tensor_slices(datasets)

    # Shuffle the file order before interleaving
    if randomize:
        ds = ds.shuffle(shuffle_size)

    ds = ds.interleave(lambda x: x, cycle_length=num_parallel)

    return ds
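A hedged usage sketch; the directory layout, feature mapping and csv path below are illustrative assumptions:

# Minimal usage sketch for npz_dir_dataset; all paths and names are hypothetical.
features = {'X': 'features', 'y_extent': 'labels'}  # npz field -> feature name
dataset = npz_dir_dataset('/data/patchlets/train',
                          features,
                          metadata_path='/data/patchlet_info.csv',
                          fold=1,
                          randomize=True,
                          num_parallel=5)

# Iterate over one batch to check the pipeline; each element follows the
# (types, shapes) structure assembled above.
for batch in dataset.batch(8).take(1):
    print(type(batch))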