def test_prepare_data_correct_dims():
    params = {
        'data': blueno.DataConfig(**{
            'data_dir': '/home/lzhu7/elvo-analysis/data/'
                        'processed-standard/arrays/',
            'labels_path': '/home/lzhu7/elvo-analysis/data/'
                           'processed-standard/labels.csv',
            'index_col': 'Anon ID',
            'label_col': 'occlusion_exists',
            'gcs_url': 'gs://elvos/processed/processed-standard',
        }),

        'val_split': 0.2,
        'seed': 0,
        'batch_size': 8,

        'generator': blueno.GeneratorConfig(
            generator_callable=lambda: None),

        'model': blueno.ModelConfig(**{
            # The callable must take in **kwargs as an argument
            'model_callable': small_model,
            'dropout_rate1': 0.8,
            'dropout_rate2': 0.7,
            'optimizer': keras.optimizers.Adam(lr=1e-4),
            'loss': keras.losses.categorical_crossentropy,
        }),
    }
    params = blueno.ParamConfig(**params)
    _, _, y_train, y_test, _, _ = preprocessing.prepare_data(
        params, train_test_val=False)
    assert y_train.ndim == 2
    assert y_test.ndim == 2
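# The test above passes `small_model` as the model callable. The real
# `small_model` is defined elsewhere in this repo; the following is a hedged
# sketch of the shape such a callable might take (name, layer sizes, and
# input shape are illustrative assumptions, not the actual implementation).
# ModelConfig entries are presumably forwarded as keyword arguments, which
# is why the callable must accept **kwargs.
def _example_model_callable(input_shape=(220, 220, 3), num_classes=2,
                            dropout_rate1=0.8, dropout_rate2=0.7,
                            **kwargs):
    model = keras.models.Sequential([
        keras.layers.Flatten(input_shape=input_shape),
        keras.layers.Dropout(dropout_rate1),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dropout(dropout_rate2),
        # Two-class softmax output, matching the ndim == 2 asserts above
        keras.layers.Dense(num_classes, activation='softmax'),
    ])
    return model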
Example #2
def __load_data(params):
    """
    Loads the data.

    :params params: The ParamConfig file in question
    :return: train_data, validation_data, test_data,
        train_labels, vaidation_labels, test_labels,
        train_ids, validation_ids, test_ids
    """
    return preprocessing.prepare_data(params)
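# Example unpacking of the 9-tuple documented above (with a
# train-test-validation split enabled):
# (x_train, x_valid, x_test,
#  y_train, y_valid, y_test,
#  id_train, id_valid, id_test) = __load_data(params)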
Example #3
def load_training_xy(data_root):
    # Minimal params to generate the correct training-validation data
    # split
    params = blueno.ParamConfig(
        data=blueno.DataConfig(
            data_dir='{}/arrays'.format(data_root),
            labels_path='{}/labels.csv'.format(data_root),
            index_col='Anon ID',
            label_col='occlusion_exists',
            gcs_url='gs://elvos/processed/processed-new-training-2'),
        generator=blueno.GeneratorConfig(
            generator_callable=standard_generators),
        model=blueno.ModelConfig(model_callable=None,
                                 optimizer=None,
                                 loss=categorical_crossentropy,
                                 dropout_rate1=None,
                                 dropout_rate2=None),
        batch_size=None,
        seed=0,
        val_split=0.1,
    )
    arrays = prepare_data(params, train_test_val=False)
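    # With train_test_val=False, prepare_data returns a 6-tuple of
    # (x_train, x_valid, y_train, y_valid, id_train, id_valid),
    # so indices 0 and 2 below are x_train and y_train.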
    return arrays[0], arrays[2]
Example #4
def evaluate_from_config(params):
    # Load the data
    logging.info('Preparing data and models')
    (x_train, _, x_test, y_train, _, y_test, id_train, _,
     id_test) = preprocessing.prepare_data(params,
                                           train_test_val=True,
                                           sort=False)

    metrics = [
        'acc', utils.sensitivity, utils.specificity, utils.true_positives,
        utils.false_negatives
    ]

    # Build the model from its callable, then load the trained weights.
    # (model_callable is a builder function, per the ModelConfig examples
    # above, so it must be called before load_weights can be used.)
    model = params.model.model_callable(**params.model.__dict__)
    model.load_weights(params.model_weights)
    model.compile(loss=params.model.loss,
                  optimizer=params.model.optimizer,
                  metrics=metrics)

    logging.info('Evaluating the model...')
    results = evaluate_model(x_test, y_test, model, x_train=x_train)
    logging.info('Results:')
    logging.info(results)
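# `evaluate_model` here is defined elsewhere in this module. Below is a
# minimal sketch of a compatible implementation, under the assumption that
# x_train is passed so the test set can be standardized with training-set
# statistics (the helper name and the normalization step are illustrative):
def _example_evaluate_model(x_test, y_test, model, x_train=None):
    if x_train is not None:
        # Standardize the test set using training-set mean and std
        mean = x_train.mean(axis=0)
        std = x_train.std(axis=0) + 1e-7
        x_test = (x_test - mean) / std
    # Returns [loss, *metrics], matching the metrics passed to compile()
    return model.evaluate(x_test, y_test, verbose=0)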
Example #5
def test_prepare_data_matching_indices():
    params = {
        'data': blueno.DataConfig(**{
            'data_dir': '/home/lzhu7/elvo-analysis/data/'
                        'processed-standard/arrays/',
            'labels_path': '/home/lzhu7/elvo-analysis/data/'
                           'processed-standard/labels.csv',
            'index_col': 'Anon ID',
            'label_col': 'occlusion_exists',
            'gcs_url': 'gs://elvos/processed/processed-standard'
        }),

        'val_split': 0.2,
        'seed': 0,
        'batch_size': 8,
        'max_epochs': 1,

        'generator': blueno.GeneratorConfig(
            generator_callable=lambda: None),

        'model': blueno.ModelConfig(**{
            # The callable must take in **kwargs as an argument
            'model_callable': small_model,
            'dropout_rate1': 0.8,
            'dropout_rate2': 0.7,
            'optimizer': keras.optimizers.Adam(lr=1e-4),
            'loss': keras.losses.categorical_crossentropy,
        }),
    }
    params = blueno.ParamConfig(**params)
    _, _, y_train, y_test, id_train, id_test = preprocessing.prepare_data(
        params, train_test_val=False)
    for i, id_ in enumerate(id_test):
        if id_ == '068WBWCQGW5JHBYV':
            assert y_test[i][0] == 1
        elif id_ == 'FBGMN3O08GW5GG91':
            assert y_test[i][1] == 1
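# The assertions above assume prepare_data one-hot encodes the binary
# occlusion_exists labels, so a negative row encodes as [1, 0] and a
# positive row as [0, 1], e.g.:
#
#     keras.utils.to_categorical([0, 1], num_classes=2)
#     # -> array([[1., 0.], [0., 1.]], dtype=float32)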
Example #6
def hyperoptimize(hyperparams: Union[blueno.ParamGrid,
                                     List[blueno.ParamConfig]],
                  username: str,
                  slack_token: str = None,
                  num_gpus=1,
                  gpu_offset=0,
                  log_dir: str = None) -> None:
    """
    Runs training jobs on input hyperparameter grid.

    :param hyperparams: a dictionary of parameters. See blueno/types for
    a specification
    :param username: your name
    :param slack_token: a slack token for uploading to GitHub
    :param num_gpus: the number of gpus you will use
    :param gpu_offset: your gpu offset
    :param log_dir: the directory you will too. This directory should already
    exist
    :return:
    """
    if isinstance(hyperparams, blueno.ParamGrid):
        param_list = model_selection.ParameterGrid(hyperparams.__dict__)
    else:
        param_list = hyperparams

    logging.info(
        'optimizing grid with {} configurations'.format(len(param_list)))

    gpu_index = 0
    processes = []
    for params in param_list:
        if isinstance(params, dict):
            params = blueno.ParamConfig(**params)

        check_data_in_sync(params)

        # This is where we'd run preprocessing. To run in a reasonable amount
        # of time, the raw data must be cached in-memory.
        arrays = preprocessing.prepare_data(params, train_test_val=False)
        x_train, x_valid, y_train, y_valid, id_train, id_valid = arrays

        # Start the model training job
        # Run in a separate process to avoid memory issues
        # Note how this depends on offset
        os.environ['CUDA_VISIBLE_DEVICES'] = f'{gpu_index + gpu_offset}'

        if params.job_fn is None:
            job_fn = start_job
        else:
            job_fn = params.job_fn

        logging.debug('using job fn {}'.format(job_fn))

        # Uses the parent of the data_dir to name the job,
        # which may not work for all data formats.
        if params.job_name:
            job_name = params.job_name
        else:
            job_name = str(pathlib.Path(params.data.data_dir).parent.name)
        job_name += f'_{y_train.shape[1]}-classes'

        process = multiprocessing.Process(target=job_fn,
                                          args=(x_train, y_train,
                                                x_valid, y_valid),
                                          kwargs={
                                              'params': params,
                                              'job_name': job_name,
                                              'username': username,
                                              'slack_token': slack_token,
                                              'log_dir': log_dir,
                                              'id_valid': id_valid,
                                          })
        gpu_index += 1
        gpu_index %= num_gpus

        logging.debug(f'gpu_index is now {gpu_index + gpu_offset}')
        process.start()
        processes.append(process)
        # Once every GPU has a running job, wait for the whole batch to
        # finish before launching the next round of processes.
        if gpu_index == 0:
            logging.info(f'all gpus used, calling join on processes:'
                         f' {processes}')
            p: multiprocessing.Process
            for p in processes:
                p.join()
            processes = []
            time.sleep(60)
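# A hedged usage sketch for hyperoptimize (values illustrative, not from
# this repo). Since the grid is expanded with
# sklearn.model_selection.ParameterGrid(hyperparams.__dict__), each
# ParamGrid field is expected to hold a list of candidate values:
#
# param_grid = blueno.ParamGrid(
#     data=[data_config],          # one or more blueno.DataConfig objects
#     generator=[generator_config],
#     model=[model_config_a, model_config_b],
#     batch_size=[8, 16],
#     seed=[0],
#     val_split=[0.1, 0.2],
# )
# hyperoptimize(param_grid, username='your-name', num_gpus=2)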
Example #7
def simple_ensemble(model_blob_names: List[str],
                    data_dir: str,
                    labels_path: str,
                    loss: Callable,
                    seed: int,
                    val_split: float,
                    train_test_val: bool,
                    sort: bool):
    """Creates an ensemble from the list of model_urls.

    DO NOT mix models in compat/ with those in sorted_models/.

    :param model_blob_names: a list of model blob names, like
        compat/
    :param data_dir: the path to the data used by ALL models
    :param labels_path: the path to the labels used by ALL models
    :param loss: a loss function like keras.losses.categorical_crossentropy,
        used by ALL models
    :param seed: the seed of ALL of the models
    :param val_split: the val split of ALL of the models
    :param train_test_val: True if train_test_val split was used on the models
    :param sort: set to True if you are loading from sorted_models/, False
        if loading from compat/
    :return:
    """

    # Set the params variable from the function arguments; we'll need it
    # to load the data as x_train, y_train, ...
    params = blueno.ParamConfig(
        data=blueno.DataConfig(
            # TODO: Generalize to work for all users
            data_dir=data_dir,
            labels_path=labels_path,
            index_col='Anon ID',
            label_col='occlusion_exists',
            gcs_url='',
        ),
        generator=None,
        model=blueno.ModelConfig(
            model_callable=None,
            optimizer=None,
            # TODO: Some may use a different loss
            loss=loss,
        ),
        batch_size=None,
        seed=seed,
        val_split=val_split
    )

    x_train, x_valid, y_train, y_valid, _, _ = prepare_data(
        params, train_test_val=train_test_val, sort=sort)
    datagen = ImageDataGenerator(featurewise_center=True,
                                 featurewise_std_normalization=True)
    datagen.fit(x_train)

    client = storage.Client(project='elvo-198322')
    bucket = storage.Bucket(client, name='elvos')

    # This is a copy of the ensemble_models function, using
    # model names instead.
    models = []
    time1 = time.time()
    for i, blob_name in enumerate(model_blob_names):
        # Here we load and evaluate each individual model
        # so we can be sure that our data, validation split, and seed
        # are correct
        blob = bucket.get_blob(blob_name)
        if blob is None:
            raise ValueError(f'Blob {blob_name} does not exist')
        model_filepath = f'{i}.hdf5'

        print(f'downloading model {blob_name}')
        time2 = time.time()
        blob.download_to_filename(model_filepath)
        time3 = time.time()
        print(f'seconds to download: {time3 - time2}')

        print(f'loading model {blob_name}')
        model: keras.Model
        model = load_model(model_filepath, compile=True)
        os.remove(model_filepath)
        time4 = time.time()
        print(f'seconds to load: {time4 - time3}')
        # Used to check the model
        evaluate_model(model, datagen, x_valid, y_valid)

        model.name = f'model_{i}'
        models.append(model)

    # Finally we ensemble and evaluate the models here
    print('using models {}'.format(models))
    model_input = layers.Input(shape=models[0].input_shape[1:])
    ensemble = ensemble_models(models, model_input)

    evaluate_model(ensemble, datagen, x_valid, y_valid)
    time7 = time.time()
    print(f'seconds per ensemble: {time7 - time1}', flush=True)
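# `ensemble_models` is defined elsewhere in this module. Below is a minimal
# sketch of the standard averaging approach it likely implements (an
# assumption): feed the shared input through every member model and average
# their softmax outputs.
def _example_ensemble_models(models, model_input):
    # Call each trained model on the shared input tensor
    outputs = [m(model_input) for m in models]
    # Average the per-model class probabilities
    avg_output = layers.Average()(outputs)
    return keras.Model(inputs=model_input, outputs=avg_output,
                       name='ensemble')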