def test_prepare_data_correct_dims():
    params = {
        'data': blueno.DataConfig(**{
            'data_dir': '/home/lzhu7/elvo-analysis/data/'
                        'processed-standard/arrays/',
            'labels_path': '/home/lzhu7/elvo-analysis/data/'
                           'processed-standard/labels.csv',
            'index_col': 'Anon ID',
            'label_col': 'occlusion_exists',
            'gcs_url': 'gs://elvos/processed/processed-standard',
        }),
        'val_split': 0.2,
        'seed': 0,
        'batch_size': 8,
        'generator': blueno.GeneratorConfig(
            generator_callable=lambda: None),
        'model': blueno.ModelConfig(**{
            # The callable must take in **kwargs as an argument
            'model_callable': small_model,
            'dropout_rate1': 0.8,
            'dropout_rate2': 0.7,
            'optimizer': keras.optimizers.Adam(lr=1e-4),
            'loss': keras.losses.categorical_crossentropy,
        }),
    }
    params = blueno.ParamConfig(**params)
    _, _, y_train, y_test, _, _ = preprocessing.prepare_data(
        params, train_test_val=False)
    assert y_train.ndim == 2
    assert y_test.ndim == 2


def __load_data(params):
    """
    Loads the data.

    :param params: the ParamConfig in question
    :return: train_data, validation_data, test_data, train_labels,
        validation_labels, test_labels, train_ids, validation_ids,
        test_ids
    """
    return preprocessing.prepare_data(params)


def load_training_xy(data_root):
    # Minimal params to generate the correct training-validation
    # data split
    params = blueno.ParamConfig(
        data=blueno.DataConfig(
            data_dir='{}/arrays'.format(data_root),
            labels_path='{}/labels.csv'.format(data_root),
            index_col='Anon ID',
            label_col='occlusion_exists',
            gcs_url='gs://elvos/processed/processed-new-training-2'),
        generator=blueno.GeneratorConfig(
            generator_callable=standard_generators),
        model=blueno.ModelConfig(model_callable=None,
                                 optimizer=None,
                                 loss=categorical_crossentropy,
                                 dropout_rate1=None,
                                 dropout_rate2=None),
        batch_size=None,
        seed=0,
        val_split=0.1,
    )
    arrays = prepare_data(params, train_test_val=False)
    return arrays[0], arrays[2]


def evaluate_from_config(params):
    # Load the data
    logging.info('Preparing data and models')
    (x_train, _, x_test,
     y_train, _, y_test,
     id_train, _, id_test) = preprocessing.prepare_data(
        params, train_test_val=True, sort=False)

    metrics = [
        'acc',
        utils.sensitivity,
        utils.specificity,
        utils.true_positives,
        utils.false_negatives,
    ]

    model = params.model.model_callable
    model.load_weights(params.model_weights)
    model.compile(loss=params.model.loss,
                  optimizer=params.model.optimizer,
                  metrics=metrics)

    logging.info('Evaluating the model...')
    results = evaluate_model(x_test, y_test, model, x_train=x_train)
    logging.info('Results:')
    logging.info(results)


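# A hedged usage sketch for evaluate_from_config(), assuming the same module
# context (blueno, keras, small_model, standard_generators) as the functions
# above. Since evaluate_from_config() calls load_weights() directly on
# params.model.model_callable, this sketch passes an already-built
# keras.Model rather than a factory function; the weights path is a
# hypothetical placeholder, and model_weights is assumed to be a ParamConfig
# constructor argument because the code above reads params.model_weights.
def example_evaluate_from_config_call():
    # small_model(**kwargs) is assumed here to return a keras.Model instance
    model = small_model(dropout_rate1=0.8, dropout_rate2=0.7)
    params = blueno.ParamConfig(
        data=blueno.DataConfig(
            data_dir='/path/to/processed-standard/arrays/',
            labels_path='/path/to/processed-standard/labels.csv',
            index_col='Anon ID',
            label_col='occlusion_exists',
            gcs_url='gs://elvos/processed/processed-standard'),
        generator=blueno.GeneratorConfig(
            generator_callable=standard_generators),
        model=blueno.ModelConfig(model_callable=model,
                                 optimizer=keras.optimizers.Adam(lr=1e-4),
                                 loss=keras.losses.categorical_crossentropy,
                                 dropout_rate1=0.8,
                                 dropout_rate2=0.7),
        batch_size=8,
        seed=0,
        val_split=0.2,
        model_weights='/path/to/saved-weights.hdf5',  # hypothetical path
    )
    evaluate_from_config(params)

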
def test_prepare_data_matching_indices():
    params = {
        'data': blueno.DataConfig(**{
            'data_dir': '/home/lzhu7/elvo-analysis/data/'
                        'processed-standard/arrays/',
            'labels_path': '/home/lzhu7/elvo-analysis/data/'
                           'processed-standard/labels.csv',
            'index_col': 'Anon ID',
            'label_col': 'occlusion_exists',
            'gcs_url': 'gs://elvos/processed/processed-standard'
        }),
        'val_split': 0.2,
        'seed': 0,
        'batch_size': 8,
        'max_epochs': 1,
        'generator': blueno.GeneratorConfig(
            generator_callable=lambda: None),
        'model': blueno.ModelConfig(**{
            # The callable must take in **kwargs as an argument
            'model_callable': small_model,
            'dropout_rate1': 0.8,
            'dropout_rate2': 0.7,
            'optimizer': keras.optimizers.Adam(lr=1e-4),
            'loss': keras.losses.categorical_crossentropy,
        }),
    }
    params = blueno.ParamConfig(**params)
    _, _, y_train, y_test, id_train, id_test = preprocessing.prepare_data(
        params, train_test_val=False)

    for i, id_ in enumerate(id_test):
        if id_ == '068WBWCQGW5JHBYV':
            assert y_test[i][0] == 1
        elif id_ == 'FBGMN3O08GW5GG91':
            assert y_test[i][1] == 1


def hyperoptimize(hyperparams: Union[blueno.ParamGrid,
                                     List[blueno.ParamConfig]],
                  username: str,
                  slack_token: str = None,
                  num_gpus=1,
                  gpu_offset=0,
                  log_dir: str = None) -> None:
    """
    Runs training jobs on the input hyperparameter grid.

    :param hyperparams: a dictionary of parameters; see blueno/types for a
        specification
    :param username: your name
    :param slack_token: a Slack token, used to upload results to Slack
    :param num_gpus: the number of GPUs you will use
    :param gpu_offset: your GPU offset
    :param log_dir: the directory you will log to; this directory should
        already exist
    :return:
    """
    if isinstance(hyperparams, blueno.ParamGrid):
        param_list = model_selection.ParameterGrid(hyperparams.__dict__)
    else:
        param_list = hyperparams

    logging.info(
        'optimizing grid with {} configurations'.format(len(param_list)))

    gpu_index = 0
    processes = []
    for params in param_list:
        if isinstance(params, dict):
            params = blueno.ParamConfig(**params)

        check_data_in_sync(params)

        # This is where we'd run preprocessing. To run in a reasonable
        # amount of time, the raw data must be cached in-memory.
        arrays = preprocessing.prepare_data(params, train_test_val=False)
        x_train, x_valid, y_train, y_valid, id_train, id_valid = arrays

        # Start the model training job.
        # Run in a separate process to avoid memory issues.
        # Note how this depends on the offset.
        os.environ['CUDA_VISIBLE_DEVICES'] = f'{gpu_index + gpu_offset}'

        if params.job_fn is None:
            job_fn = start_job
        else:
            job_fn = params.job_fn

        logging.debug('using job fn {}'.format(job_fn))

        # Uses the parent of the data_dir to name the job,
        # which may not work for all data formats.
        if params.job_name:
            job_name = params.job_name
        else:
            job_name = str(pathlib.Path(params.data.data_dir).parent.name)
            job_name += f'_{y_train.shape[1]}-classes'

        process = multiprocessing.Process(target=job_fn,
                                          args=(x_train, y_train,
                                                x_valid, y_valid),
                                          kwargs={
                                              'params': params,
                                              'job_name': job_name,
                                              'username': username,
                                              'slack_token': slack_token,
                                              'log_dir': log_dir,
                                              'id_valid': id_valid,
                                          })

        gpu_index += 1
        gpu_index %= num_gpus
        logging.debug(f'gpu_index is now {gpu_index + gpu_offset}')

        process.start()
        processes.append(process)

        if gpu_index == 0:
            logging.info(f'all gpus used, calling join on processes:'
                         f' {processes}')
            p: multiprocessing.Process
            for p in processes:
                p.join()
            processes = []
            time.sleep(60)


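# A minimal usage sketch for hyperoptimize(), assuming the same module
# context as above and that blueno.ParamGrid mirrors the ParamConfig fields,
# with each field given as a list of options for sklearn's ParameterGrid.
# All paths below are hypothetical placeholders.
def example_hyperoptimize_call():
    grid = blueno.ParamGrid(
        data=[blueno.DataConfig(
            data_dir='/path/to/processed-standard/arrays/',
            labels_path='/path/to/processed-standard/labels.csv',
            index_col='Anon ID',
            label_col='occlusion_exists',
            gcs_url='gs://elvos/processed/processed-standard')],
        generator=[blueno.GeneratorConfig(
            generator_callable=standard_generators)],
        model=[blueno.ModelConfig(
            model_callable=small_model,
            optimizer=keras.optimizers.Adam(lr=1e-4),
            loss=keras.losses.categorical_crossentropy,
            dropout_rate1=0.8,
            dropout_rate2=0.7)],
        batch_size=[8],
        seed=[0, 42],          # two seeds
        val_split=[0.1, 0.2],  # x two splits -> four configurations
    )
    hyperoptimize(grid,
                  username='example-user',
                  num_gpus=1,
                  log_dir='/path/to/logs/')  # must already exist

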
def simple_ensemble(model_blob_names: List[str],
                    data_dir: str,
                    labels_path: str,
                    loss: Callable,
                    seed: int,
                    val_split: float,
                    train_test_val: bool,
                    sort: bool):
    """Creates an ensemble from the list of model_blob_names.

    DO NOT mix models in compat/ with those in sorted_models/.

    :param model_blob_names: a list of model blob names, like compat/
    :param data_dir: the path to the data used by ALL models
    :param labels_path: the path to the labels used by ALL models
    :param loss: a loss function like keras.losses.categorical_crossentropy,
        used by ALL models
    :param seed: the seed of ALL of the models
    :param val_split: the val split of ALL of the models
    :param train_test_val: True if a train_test_val split was used on the
        models
    :param sort: set to True if you are loading from sorted_models/, False
        if loading from compat/
    :return:
    """
    # Set the params variable from the function arguments,
    # we'll need this to load the data as x_train, y_train, ...
    params = blueno.ParamConfig(
        data=blueno.DataConfig(
            # TODO: Generalize to work for all users
            data_dir=data_dir,
            labels_path=labels_path,
            index_col='Anon ID',
            label_col='occlusion_exists',
            gcs_url='',
        ),
        generator=None,
        model=blueno.ModelConfig(
            model_callable=None,
            optimizer=None,
            # TODO: Some may use a different loss
            loss=loss,
        ),
        batch_size=None,
        seed=seed,
        val_split=val_split,
    )

    x_train, x_valid, y_train, y_valid, _, _ = prepare_data(
        params, train_test_val=train_test_val, sort=sort)

    datagen = ImageDataGenerator(featurewise_center=True,
                                 featurewise_std_normalization=True)
    datagen.fit(x_train)

    client = storage.Client(project='elvo-198322')
    bucket = storage.Bucket(client, name='elvos')

    # This is a copy of the ensemble_models function, using
    # model names instead.
    models = []
    time1 = time.time()
    for i, blob_name in enumerate(model_blob_names):
        # Here we load and evaluate each individual model
        # so we can be sure that our data, validation split, and seed
        # are correct
        blob = bucket.get_blob(blob_name)
        if blob is None:
            raise ValueError(f'Blob {blob_name} does not exist')

        model_filepath = f'{i}.hdf5'
        print(f'downloading model {blob_name}')
        time2 = time.time()
        blob.download_to_filename(model_filepath)
        time3 = time.time()
        print(f'seconds to download: {time3 - time2}')

        print(f'loading model {blob_name}')
        model: keras.Model
        model = load_model(model_filepath, compile=True)
        os.remove(model_filepath)
        time4 = time.time()
        print(f'seconds to load: {time4 - time3}')

        # Used to check the model
        evaluate_model(model, datagen, x_valid, y_valid)
        model.name = f'model_{i}'
        models.append(model)

    # Finally we ensemble and evaluate the models here
    print('using models {}'.format(models))
    model_input = layers.Input(shape=models[0].input_shape[1:])
    ensemble = ensemble_models(models, model_input)
    evaluate_model(ensemble, datagen, x_valid, y_valid)
    time7 = time.time()
    print(f'seconds per ensemble: {time7 - time1}', flush=True)


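# A hedged usage sketch for simple_ensemble(), assuming the same module
# context as above. The blob names and local data paths are hypothetical
# placeholders; seed and val_split must match the values the listed models
# were actually trained with, and all blobs must come from the same folder
# (sorted_models/ here, hence sort=True).
def example_simple_ensemble_call():
    simple_ensemble(
        model_blob_names=[
            'sorted_models/example-model-a.hdf5',  # hypothetical blob name
            'sorted_models/example-model-b.hdf5',  # hypothetical blob name
        ],
        data_dir='/path/to/processed-standard/arrays/',
        labels_path='/path/to/processed-standard/labels.csv',
        loss=keras.losses.categorical_crossentropy,
        seed=0,
        val_split=0.1,
        train_test_val=True,
        sort=True,
    )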