def validate_configs(configs_folder):
    """Validate every JSON config under configs_folder against its schema.json."""
    schema = helpers.load_dict(os.path.join(configs_folder, 'schema.json'))
    for file_path in Path(configs_folder).glob('**/*.json'):
        if file_path.name == 'schema.json':
            continue
        try:
            jsonschema.validate(schema=schema,
                                instance=helpers.load_dict(str(file_path)))
        except jsonschema.exceptions.ValidationError as e:
            print(f"ValidationError for the file {file_path}")
            raise e
def validate_configs(configs_folder):
    """Validate every JSON config directly inside configs_folder against its schema.json."""
    schema = helpers.load_dict(os.path.join(configs_folder, 'schema.json'))
    for file_name in os.listdir(configs_folder):
        # Unlike the glob-based variant above, os.listdir returns every entry,
        # so skip the schema itself and anything that is not a JSON file.
        if file_name == 'schema.json' or not file_name.endswith('.json'):
            continue
        config_file_path = os.path.join(configs_folder, file_name)
        try:
            jsonschema.validate(schema=schema,
                                instance=helpers.load_dict(config_file_path))
        except jsonschema.exceptions.ValidationError as e:
            print(f"ValidationError for the file {config_file_path}")
            raise e
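# A minimal, self-contained sketch of the jsonschema.validate() call that both
# validate_configs variants above rely on; the schema and config literals below
# are made up purely for illustration.
import jsonschema

example_schema = {
    "type": "object",
    "properties": {"batch_size": {"type": "integer", "minimum": 1}},
    "required": ["batch_size"],
}

valid_config = {"batch_size": 32}
invalid_config = {"batch_size": "large"}

jsonschema.validate(instance=valid_config, schema=example_schema)  # passes silently
try:
    jsonschema.validate(instance=invalid_config, schema=example_schema)
except jsonschema.exceptions.ValidationError as e:
    print(f"ValidationError: {e.message}")  # 'large' is not of type 'integer'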
def main(training_config_path: typing.AnyStr,
         validation_config_path: typing.AnyStr,
         user_config_path: typing.AnyStr,
         tensorboard_tracking_folder: typing.AnyStr):
    """
    Train a model

    :param training_config_path: path to the JSON config file used to store training set parameters
    :param validation_config_path: path to the JSON config file used to store validation set parameters
    :param user_config_path: path to the JSON config file used to store user model, dataloader and trainer parameters
    :param tensorboard_tracking_folder: folder where TensorBoard data and the trained model are stored
    """
    training_config_dict = helpers.load_dict(training_config_path)
    validation_config_dict = helpers.load_dict(validation_config_path)
    user_config_dict = helpers.load_dict(user_config_path)

    helpers.validate_admin_config(training_config_dict)
    helpers.validate_admin_config(validation_config_dict)
    helpers.validate_user_config(user_config_dict)

    training_source = user_config_dict['data_loader']['hyper_params'][
        'preprocessed_data_source']['training']
    validation_source = user_config_dict['data_loader']['hyper_params'][
        'preprocessed_data_source']['validation']

    training_data_loader = helpers.get_online_data_loader(
        user_config_dict,
        training_config_dict,
        preprocessed_data_path=training_source)
    validation_data_loader = helpers.get_online_data_loader(
        user_config_dict,
        validation_config_dict,
        preprocessed_data_path=validation_source)

    print("Eager mode", tf.executing_eagerly())

    mirrored_strategy = helpers.get_mirrored_strategy()

    train_models(user_config_dict=user_config_dict,
                 training_config_dict=training_config_dict,
                 training_data_loader=training_data_loader,
                 validation_data_loader=validation_data_loader,
                 tensorboard_tracking_folder=tensorboard_tracking_folder,
                 mirrored_strategy=mirrored_strategy)
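# helpers.load_dict is not shown in these examples. Given how it is used above
# (a file path in, a dict out), a minimal sketch of what such a helper might
# look like, assuming the configs are plain JSON files -- an assumption, not
# the project's actual implementation.
import json


def load_dict(path):
    """Load a JSON file into a Python dict."""
    with open(path, 'r') as f:
        return json.load(f)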
def main(config_path: typing.AnyStr,
         tensorboard_tracking_folder: typing.AnyStr):
    """
    Train a model

    :param config_path: path to the JSON config file that follows configs/user/schema.json
    :param tensorboard_tracking_folder: folder where TensorBoard data and the trained model are stored
    """
    user_config_dict = helpers.load_dict(config_path)
    helpers.validate_user_config(user_config_dict)

    if tensorboard_tracking_folder is not None:
        tensorboard_tracking_folder = Path(tensorboard_tracking_folder)
        tensorboard_tracking_folder.mkdir(parents=True, exist_ok=True)

    train_models(config=user_config_dict,
                 tensorboard_tracking_folder=tensorboard_tracking_folder)
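# The examples above do not show how main() is invoked. A hypothetical argparse
# wrapper for the single-config variant, for illustration only; the actual
# projects may wire this up with click or another entry point.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Train a model from a JSON config.")
    parser.add_argument("config_path", help="path to the JSON config file")
    parser.add_argument("--tensorboard_tracking_folder", default=None,
                        help="where to store TensorBoard data and the trained model")
    args = parser.parse_args()
    main(config_path=args.config_path,
         tensorboard_tracking_folder=args.tensorboard_tracking_folder)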
def netcdf_preloader(crop_size=50,
                     tmp_array_size=200,
                     admin_config_file_path=None,
                     path_output='.',
                     dataframe=None,
                     target_datetimes=None,
                     stations=None):
    """
    Preprocess netcdf files which are related to target_datetimes and stations

    The resulting preprocessed netcdf files will be stored in path_output

    If admin_config_file_path is not specified, the following have to be specified:
        * dataframe
        * target_datetimes
        * stations
    If admin_config_file_path is specified, it overrides the parameters specified above.

    :param crop_size: The crop size around each station in pixels
    :param tmp_array_size: number of timestamps held in the temporary in-memory array before it is flushed to disk
    :param admin_config_file_path: The admin configuration file path
    :param path_output: The folder where the preprocessed netcdf files will be stored
    :param dataframe: a pandas dataframe that provides the netCDF file path (or HDF5 file path and offset) for all
            relevant timestamp values over the test period.
    :param target_datetimes: a list of timestamps that your data loader should use to provide imagery for your model.
            The ordering of this list is important, as each element corresponds to a sequence of GHI values
            to predict. By definition, the GHI values must be provided for the offsets given by ``target_time_offsets``
            which are added to each timestamp (T=0) in this datetimes list.
    :param stations: a map of station names of interest paired with their coordinates (latitude, longitude, elevation).
    """
    if admin_config_file_path:
        admin_config_dict = helpers.load_dict(admin_config_file_path)
        dataframe_path = admin_config_dict['dataframe_path']
        with open(dataframe_path, 'rb') as df_file_handler:
            dataframe = pickle.load(df_file_handler)
        target_datetimes = [
            datetime.datetime.strptime(date_time, '%Y-%m-%dT%H:%M:%S')
            for date_time in admin_config_dict['target_datetimes']
        ]
        stations = admin_config_dict['stations']

    # hard coded values for now
    n_channels = 5
    n_timestep = 5

    dc = crop_size // 2
    ddt = datetime.timedelta(minutes=15)

    # number of sample points
    n_sample = len(target_datetimes)

    # Generate all datetimes including prior timesteps from targets
    all_dt = []
    for dt0 in target_datetimes:
        for i in range(4, 0, -1):
            all_dt.append(dt0 - i * ddt)
        all_dt.append(dt0)

    chunksizes = min(256, n_sample)

    # Initialize output netcdf files (one for each station)
    nc_outs = {}
    os.makedirs(path_output, exist_ok=True)
    for station, coord in stations.items():
        nc_outs[station] = netCDF4.Dataset(
            os.path.join(path_output, f'{station}.nc'), 'w')
        nc_outs[station].createDimension('time', n_sample)
        nc_outs[station].createDimension('lat', crop_size)
        nc_outs[station].createDimension('lon', crop_size)
        nc_outs[station].createDimension('channel', n_channels)
        nc_outs[station].createDimension('timestep', n_timestep)
        time = nc_outs[station].createVariable('time', 'f8', ('time', ))
        time.calendar = 'standard'
        time.units = 'days since 1970-01-01 00:00:00'
        nc_outs[station].createVariable('lat', 'f4', ('lat', ))
        nc_outs[station].createVariable('lon', 'f4', ('lon', ))
        nc_outs[station].createVariable(
            'data',
            'f4', ('time', 'timestep', 'channel', 'lat', 'lon'),
            zlib=True,
            chunksizes=(chunksizes, n_timestep, n_channels, crop_size,
                        crop_size))
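
    # At this point each station's output file defines a 'data' variable of shape
    # (time=n_sample, timestep, channel, lat, lon), chunked along the time dimension.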

    # Initialize temporary arrays to store data to limit constantly writing to disk
    init = True
    tmp_arrays = {}
    coord_ij = {}
    for station, coord in stations.items():
        tmp_arrays[f"{station}"] = ma.masked_all(
            (tmp_array_size, n_timestep, n_channels, crop_size, crop_size))
    tmp_arrays["time"] = ma.masked_all((tmp_array_size, ))

    # Loop through all timesteps and load images. The sequence moves through
    # T-60min, T-45min, T-30min, T-15min, T0 and stores those in the timestep
    # dimension of the netcdf files. A temporary array is used in case the number
    # of timestamps to process is too big to fit in memory.
    for t, dt in enumerate(tqdm.tqdm(all_dt)):
        at_t0 = not ((t + 1) % n_timestep)        # True on the last (T0) step of each sample
        t_sample = t // n_timestep                # index of the sample in the output file
        t_sample_tmp = t_sample % tmp_array_size  # index of the sample in the temporary array
        timestep_id = t % n_timestep              # position within the T-60min ... T0 sequence
        k = dataframe.index.get_loc(dt)
        nc_path = dataframe['ncdf_path'][k]
        try:
            nc_loop = netCDF4.Dataset(nc_path, 'r')
        except OSError:
            # The source netCDF4 file is not available; the corresponding data is
            # filled with 0 further down. Note that `time` here is the netCDF time
            # variable created for the last station above (all stations share the
            # same units and calendar).
            if at_t0:
                tmp_arrays["time"][t_sample_tmp] = netCDF4.date2num(
                    dt, time.units, time.calendar)
        else:
            if init:
                lat_loop = nc_loop['lat'][:]
                lon_loop = nc_loop['lon'][:]
                for station, coord in stations.items():
                    lat_diff = np.abs(lat_loop - coord[0])
                    i = np.where(lat_diff == lat_diff.min())[0][0]
                    lon_diff = np.abs(lon_loop - coord[1])
                    j = np.where(lon_diff == lon_diff.min())[0][0]
                    nc_outs[station].variables['lat'][:] = lat_loop[i - dc:i + dc]
                    nc_outs[station].variables['lon'][:] = lon_loop[j - dc:j + dc]
                    coord_ij[station] = (i, j)
                init = False

            for d, c in enumerate([1, 2, 3, 4, 6]):  # satellite channels ch1-ch4 and ch6
                channel_data = nc_loop.variables[f'ch{c}'][0, :, :]
                for station, coord in stations.items():
                    i, j = coord_ij[station]
                    tmp_arrays[f"{station}"][t_sample_tmp, timestep_id, d, :, :] = \
                        channel_data[i - dc:i + dc, j - dc:j + dc]

            if at_t0:
                tmp_arrays["time"][t_sample_tmp] = nc_loop.variables['time'][0]

            nc_loop.close()

        if ((t_sample_tmp == (tmp_array_size - 1)) and
            (timestep_id == n_timestep - 1)) or (t == (len(all_dt) - 1)):
            t0 = t_sample - t_sample_tmp
            for station, coord in stations.items():
                # Here we fill missing values with 0
                nc_outs[station]['data'][t0:t_sample + 1, :, :, :, :] = \
                    ma.filled(tmp_arrays[f"{station}"][:t_sample_tmp + 1, :, :, :, :], 0)
                tmp_arrays[f"{station}"] = \
                    ma.masked_all((tmp_array_size, n_timestep, n_channels, crop_size, crop_size))
                nc_outs[station]['time'][t0:t_sample + 1] = \
                    tmp_arrays['time'][:t_sample_tmp + 1]
            tmp_arrays["time"] = ma.masked_all((tmp_array_size, ))

    for station, coord in stations.items():
        nc_outs[station].close()
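# The per-station crop in netcdf_preloader works by locating the grid indices
# closest to the station's latitude/longitude and slicing a crop_size window
# around them. An isolated numpy sketch of that logic; the grid and station
# coordinates below are made up for illustration only.
import numpy as np

lat_grid = np.linspace(60.0, 30.0, 650)      # descending latitudes, made-up grid
lon_grid = np.linspace(-120.0, -60.0, 1500)  # made-up longitudes
station_lat, station_lon = 45.5, -73.6       # hypothetical station
crop_size = 50
dc = crop_size // 2

i = int(np.abs(lat_grid - station_lat).argmin())  # closest row
j = int(np.abs(lon_grid - station_lon).argmin())  # closest column

lat_crop = lat_grid[i - dc:i + dc]  # crop_size values centred on the station
lon_crop = lon_grid[j - dc:j + dc]
assert lat_crop.shape == (crop_size,) and lon_crop.shape == (crop_size,)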
def generate_predictions(input_file_path: str, pred_file_path: str):
    """Generates predictions for the machine translation task (EN->FR).
    You are allowed to modify this function as needed, but once again, you cannot
    modify any other part of this file. We will be importing only this function
    in our final evaluation script. Since you will most definitely need to import
    modules for your code, you must import these inside the function itself.
    Args:
        input_file_path: the file path that contains the input data.
        pred_file_path: the file path where to store the predictions.
    Returns: None
    """

    ##### MODIFY BELOW #####
    import tensorflow as tf

    from libs import helpers
    from libs.data_loaders.abstract_dataloader import AbstractDataloader
    from libs.models import transformer

    import tqdm

    import logging
    from libs.data_loaders.abstract_dataloader import create_masks_fm
    from libs.data_loaders.dataloader_bilingual_huggingface import BilingualTranslationHFSubword
    from libs.data_loaders.dataloader_bilingual_tensorflow import BilingualTranslationTFSubword
    from libs.data_loaders.mass_subword import MassSubwordDataLoader
    from libs.models.transformer import Encoder, Decoder

    logger = tf.get_logger()
    logger.setLevel(logging.DEBUG)

    import numpy as np
    import random
    from libs.seeds import TENSOR_FLOW_SEED, NUMPY_SEED, RANDOM_SEED

    tf.random.set_seed(TENSOR_FLOW_SEED)
    np.random.seed(NUMPY_SEED)
    random.seed(RANDOM_SEED)

    best_config_file = '/project/cq-training-1/project2/teams/team03/models/transformer_mass_v1_translation_with_pretraining_eval.json'
    # best_config_file = 'configs/user/transformers-fm/TFM_TINY_BBPE_eval.json'
    logger.info(f"Using best config file: {best_config_file}")
    best_config = helpers.load_dict(best_config_file)
    helpers.validate_user_config(best_config)

    # TODO: Edit our AbstractDataloader to support a raw_english_test_set_file_path. Currently it only supports
    #   preprocessed data defined directly in best_config.
    data_loader: AbstractDataloader = helpers.get_online_data_loader(
        config=best_config, raw_english_test_set_file_path=input_file_path)

    if best_config["model"]["definition"][
            "module"] == 'libs.models.transformerv2':
        model = transformer.load_transformer(best_config)
    else:
        mirrored_strategy = helpers.get_mirrored_strategy()
        if mirrored_strategy is not None and mirrored_strategy.num_replicas_in_sync > 1:
            with mirrored_strategy.scope():
                model: tf.keras.Model = helpers.prepare_model(
                    config=best_config)
        else:
            model: tf.keras.Model = helpers.prepare_model(config=best_config)

    # batch_size = 32  # 32 is the max for 6 GB of GPU memory
    batch_size = 128
    data_loader.build(batch_size=batch_size)
    test_dataset = data_loader.test_dataset

    all_predictions = []
    if isinstance(data_loader, MassSubwordDataLoader):
        all_predictions = transformer.inference(data_loader.tokenizer, model,
                                                test_dataset)
    else:
        if isinstance(data_loader, (BilingualTranslationTFSubword,
                                    BilingualTranslationHFSubword)):
            sample_to_display = 10

            encoder: Encoder = model.get_layer("encoder")
            decoder: Decoder = model.get_layer("decoder")
            final_layer: tf.keras.layers.Dense = model.layers[-1]

            for inputs, mask in tqdm.tqdm(test_dataset,
                                          total=data_loader.test_steps):

                mini_batch_size = inputs.shape[0]
                dec_inp = tf.Variable(
                    tf.zeros(
                        (mini_batch_size, data_loader.get_seq_length() + 1),
                        dtype=tf.int32))

                bos_tensor = tf.convert_to_tensor(data_loader.bos)
                bos_tensor = tf.reshape(bos_tensor, [1, 1])
                bos_tensor = tf.tile(bos_tensor,
                                     multiples=[mini_batch_size, 1])

                dec_inp[:, 0].assign(bos_tensor[:, 0])  # BOS token

                # WARNING: IF THE MODEL USED WAS FROM A TF FILE, A LOT OF WARNINGS WILL APPEAR
                #  Workaround: Use the hdf5 format to load the final model
                # https://github.com/tensorflow/tensorflow/issues/35146
                def get_preds(encoder, decoder, final_layer, dec_inp, inputs,
                              mask, max_seq):
                    # Run the encoder once, then decode greedily token by token.
                    enc_output: tf.Tensor = encoder(inputs=inputs,
                                                    mask=mask,
                                                    training=False)

                    for timestep in range(max_seq):
                        _, combined_mask, dec_padding_mask = create_masks_fm(
                            inp=inputs, tar=dec_inp[:, :-1])

                        dec_output, attention_weights = decoder(
                            inputs=dec_inp[:, :-1],
                            enc_output=enc_output,
                            look_ahead_mask=combined_mask,
                            padding_mask=dec_padding_mask)

                        outputs = final_layer(
                            inputs=dec_output
                        )  # (batch_size, seq_length, vocab_size)
                        pred = tf.argmax(outputs[:, timestep, :], axis=-1)
                        pred = tf.cast(pred, dtype=tf.int32)
                        dec_inp[:, timestep + 1].assign(pred)
                    return dec_inp

                predictions = get_preds(
                    encoder=encoder,
                    decoder=decoder,
                    final_layer=final_layer,
                    dec_inp=dec_inp,
                    inputs=inputs,
                    mask=mask,
                    # TODO Decision to be made, 100 seq length doesn't seem to hurt perfs
                    max_seq=100)  # data_loader.get_seq_length())
                for prediction in predictions.numpy():
                    if sample_to_display > 0:
                        logger.info(
                            f"Example of generated translation: {data_loader.decode(prediction)}"
                        )
                        sample_to_display -= 1
                    all_predictions += [data_loader.decode(prediction)]

        else:
            raise NotImplementedError(
                f"No method to generate for class {data_loader.__class__.__name__}"
            )

    with open(pred_file_path, 'w+') as file_handler:
        for prediction in all_predictions:
            file_handler.write(f'{prediction}\n')
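# get_preds() above is a greedy autoregressive decoder: at every step it feeds
# the tokens generated so far, takes the argmax over the vocabulary at the
# current position and writes it into the next slot of dec_inp. A stripped-down
# sketch of the same loop with a stand-in scoring function instead of a real
# encoder/decoder, for illustration only; VOCAB_SIZE and BOS are made up.
import tensorflow as tf

VOCAB_SIZE = 8
BOS = 1


def dummy_logits(dec_inp):
    """Stand-in for encoder/decoder + final layer: (batch, seq) -> (batch, seq, vocab)."""
    batch, seq = dec_inp.shape[0], dec_inp.shape[1]
    return tf.random.uniform((batch, seq, VOCAB_SIZE))


def greedy_decode(batch_size=2, max_seq=5):
    dec_inp = tf.Variable(tf.zeros((batch_size, max_seq + 1), dtype=tf.int32))
    dec_inp[:, 0].assign(tf.fill((batch_size,), BOS))       # seed with the BOS token
    for timestep in range(max_seq):
        logits = dummy_logits(dec_inp[:, :-1])              # (batch, max_seq, vocab)
        pred = tf.argmax(logits[:, timestep, :], axis=-1)   # greedy pick at this position
        dec_inp[:, timestep + 1].assign(tf.cast(pred, tf.int32))
    return dec_inp


print(greedy_decode().numpy())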