def test_prepare_data_correct_dims():
    params = {
        'data': blueno.DataConfig(**{
            'data_dir': '/home/lzhu7/elvo-analysis/data/'
                        'processed-standard/arrays/',
            'labels_path': '/home/lzhu7/elvo-analysis/data/'
                           'processed-standard/labels.csv',
            'index_col': 'Anon ID',
            'label_col': 'occlusion_exists',
            'gcs_url': 'gs://elvos/processed/processed-standard',
        }),
        'val_split': 0.2,
        'seed': 0,
        'batch_size': 8,
        'generator': blueno.GeneratorConfig(
            generator_callable=lambda: None),
        'model': blueno.ModelConfig(**{
            # The callable must take in **kwargs as an argument
            'model_callable': small_model,
            'dropout_rate1': 0.8,
            'dropout_rate2': 0.7,
            'optimizer': keras.optimizers.Adam(lr=1e-4),
            'loss': keras.losses.categorical_crossentropy,
        }),
    }
    params = blueno.ParamConfig(**params)

    _, _, y_train, y_test, _, _ = preprocessing.prepare_data(
        params, train_test_val=False)
    assert y_train.ndim == 2
    assert y_test.ndim == 2
def test_check_data_in_sync_raises():
    with pytest.raises(ValueError):
        params = {
            'data': blueno.DataConfig(**{
                'data_dir': '/home/lzhu7/elvo-analysis/data/'
                            'processed-standard/arrays/',
                'labels_path': '/home/lzhu7/elvo-analysis/data/'
                               'processed-standard/labels.csv',
                'index_col': 'Anon ID',
                'label_col': 'occlusion_exists',
                'gcs_url': 'gs://elvos/processed/processed',
            }),
            'val_split': 0.2,
            'seed': 0,
            'batch_size': 8,
            'max_epochs': 1,
            'generator': blueno.GeneratorConfig(
                generator_callable=generators.luke.standard_generators),
            'model': blueno.ModelConfig(**{
                # The callable must take in **kwargs as an argument
                'model_callable': small_model,
                'dropout_rate1': 0.8,
                'dropout_rate2': 0.7,
                'optimizer': keras.optimizers.Adam(lr=1e-4),
                'loss': keras.losses.categorical_crossentropy,
            }),
        }
        params = blueno.ParamConfig(**params)
        bluenot.check_data_in_sync(params)
def test_start_job_log():
    x_train = np.random.uniform(0, 255, (100, 220, 220, 3))
    y_train = np.random.randint(0, 2, (100, 5))
    x_valid = np.random.uniform(0, 255, (20, 220, 220, 3))
    y_valid = np.random.randint(0, 2, (20, 5))

    params = {
        'data': blueno.DataConfig(**{
            'data_dir': '/home/lzhu7/elvo-analysis/data/'
                        'processed-standard/arrays/',
            'labels_path': '/home/lzhu7/elvo-analysis/data/'
                           'processed-standard/labels.csv',
            'index_col': 'Anon ID',
            'label_col': 'Location of occlusions on CTA (Matt verified)',
            'gcs_url': 'gs://elvos/processed/processed-standard',
        }),
        'val_split': 0.2,
        'seed': 0,
        'batch_size': 8,
        'max_epochs': 1,
        'generator': blueno.GeneratorConfig(
            generator_callable=generators.luke.standard_generators),
        'model': blueno.ModelConfig(**{
            # The callable must take in **kwargs as an argument
            'model_callable': small_model,
            'dropout_rate1': 0.8,
            'dropout_rate2': 0.7,
            'optimizer': keras.optimizers.Adam(lr=1e-4),
            'loss': keras.losses.categorical_crossentropy,
        }),
    }
    params = blueno.ParamConfig(**params)

    bluenot.start_job(x_train, y_train, x_valid, y_valid,
                      job_name='test_job',
                      username='******',
                      params=params,
                      log_dir='/tmp/')

    for filepath in glob.glob('/tmp/test_job*'):
        os.remove(filepath)
def run_web_gpu1708_job(data_name: str,
                        batch_size: int,
                        val_split: float,
                        max_epochs: int,
                        job_name: str,
                        author_name: str):
    blueno_home = pathlib.Path('/home/lzhu7/elvo-analysis')
    data_dir = blueno_home / 'data'
    log_dir = blueno_home / 'logs'

    param_config = blueno.ParamConfig(
        data=blueno.DataConfig(
            data_dir=str(data_dir / data_name / 'arrays'),
            labels_path=str(data_dir / data_name / 'labels.csv'),
            index_col='Anon ID',
            label_col='occlusion_exists',
            gcs_url=f'gs://elvos/processed/{data_name}',
        ),
        generator=blueno.GeneratorConfig(
            generator_callable=generators.luke.standard_generators,
        ),
        model=blueno.ModelConfig(
            model_callable=models.luke.resnet,
            optimizer=keras.optimizers.Adam(lr=1e-5),
            loss=keras.losses.categorical_crossentropy,
        ),
        batch_size=int(batch_size),
        seed=0,
        val_split=float(val_split),
        early_stopping=False,
        max_epochs=int(max_epochs),
        job_name=job_name,
    )

    logging.info('training web job {}'.format(param_config))
    bluenot.hyperoptimize(
        [param_config],
        author_name,
        num_gpus=1,
        gpu_offset=3,
        log_dir=str(log_dir),
    )
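# Usage sketch (not part of the original file): how one might invoke
# run_web_gpu1708_job from a script. The dataset name reuses
# 'processed-standard' from elsewhere in this repo; the job name and
# author name are hypothetical placeholders.
def example_run_web_gpu1708_job():
    run_web_gpu1708_job(data_name='processed-standard',
                        batch_size=8,
                        val_split=0.1,
                        max_epochs=1,
                        job_name='web-gpu1708-example',  # hypothetical
                        author_name='example-user')  # hypothetical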
def load_training_xy(data_root):
    # Minimal params to generate the correct training-validation data
    # split
    params = blueno.ParamConfig(
        data=blueno.DataConfig(
            data_dir='{}/arrays'.format(data_root),
            labels_path='{}/labels.csv'.format(data_root),
            index_col='Anon ID',
            label_col='occlusion_exists',
            gcs_url='gs://elvos/processed/processed-new-training-2'),
        generator=blueno.GeneratorConfig(
            generator_callable=standard_generators),
        model=blueno.ModelConfig(model_callable=None,
                                 optimizer=None,
                                 loss=categorical_crossentropy,
                                 dropout_rate1=None,
                                 dropout_rate2=None),
        batch_size=None,
        seed=0,
        val_split=0.1,
    )
    arrays = prepare_data(params, train_test_val=False)
    # prepare_data returns (x_train, x_valid, y_train, y_valid, ...);
    # keep only the training arrays
    return arrays[0], arrays[2]
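# Usage sketch (illustrative, not from the original file): fetch the
# training arrays and check their shapes. The data_root below mirrors the
# paths used elsewhere in this repo, but is still an assumption.
def example_load_training_xy():
    x_train, y_train = load_training_xy(
        '/home/lzhu7/elvo-analysis/data/processed-new-training-2')
    print(x_train.shape, y_train.shape)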
def test_prepare_and_job():
    params = {
        'data': blueno.DataConfig(**{
            'data_dir': '/home/lzhu7/elvo-analysis/data/'
                        'processed-standard/arrays/',
            'labels_path': '/home/lzhu7/elvo-analysis/data/'
                           'processed-standard/labels.csv',
            'index_col': 'Anon ID',
            'label_col': 'occlusion_exists',
            'gcs_url': 'gs://elvos/processed/processed-standard',
        }),
        'val_split': 0.2,
        'seed': 0,
        'batch_size': 8,
        'max_epochs': 1,
        'generator': blueno.GeneratorConfig(
            generator_callable=generators.luke.standard_generators),
        'model': blueno.ModelConfig(**{
            # The callable must take in **kwargs as an argument
            'model_callable': small_model,
            'dropout_rate1': 0.8,
            'dropout_rate2': 0.7,
            'optimizer': keras.optimizers.Adam(lr=1e-4),
            'loss': keras.losses.categorical_crossentropy,
        }),
    }
    params = blueno.ParamConfig(**params)

    x_train, x_valid, y_train, y_valid, _, _ = \
        bluenot.preprocessing.prepare_data(params, train_test_val=False)
    bluenot.start_job(x_train, y_train, x_valid, y_valid,
                      job_name='test_prepare_and_job',
                      username='******',
                      params=params)
def test_prepare_data_matching_indices():
    params = {
        'data': blueno.DataConfig(**{
            'data_dir': '/home/lzhu7/elvo-analysis/data/'
                        'processed-standard/arrays/',
            'labels_path': '/home/lzhu7/elvo-analysis/data/'
                           'processed-standard/labels.csv',
            'index_col': 'Anon ID',
            'label_col': 'occlusion_exists',
            'gcs_url': 'gs://elvos/processed/processed-standard',
        }),
        'val_split': 0.2,
        'seed': 0,
        'batch_size': 8,
        'max_epochs': 1,
        'generator': blueno.GeneratorConfig(
            generator_callable=lambda: None),
        'model': blueno.ModelConfig(**{
            # The callable must take in **kwargs as an argument
            'model_callable': small_model,
            'dropout_rate1': 0.8,
            'dropout_rate2': 0.7,
            'optimizer': keras.optimizers.Adam(lr=1e-4),
            'loss': keras.losses.categorical_crossentropy,
        }),
    }
    params = blueno.ParamConfig(**params)

    _, _, y_train, y_test, id_train, id_test = preprocessing.prepare_data(
        params, train_test_val=False)

    for i, id_ in enumerate(id_test):
        if id_ == '068WBWCQGW5JHBYV':
            assert y_test[i][0] == 1
        elif id_ == 'FBGMN3O08GW5GG91':
            assert y_test[i][1] == 1
def hyperoptimize(hyperparams: Union[blueno.ParamGrid,
                                     List[blueno.ParamConfig]],
                  username: str,
                  slack_token: str = None,
                  num_gpus=1,
                  gpu_offset=0,
                  log_dir: str = None) -> None:
    """
    Runs training jobs on the input hyperparameter grid.

    :param hyperparams: a dictionary of parameters. See blueno/types for
        a specification
    :param username: your name
    :param slack_token: a Slack token for uploading results to Slack
    :param num_gpus: the number of gpus you will use
    :param gpu_offset: your gpu offset
    :param log_dir: the directory you will log to. This directory should
        already exist
    :return:
    """
    if isinstance(hyperparams, blueno.ParamGrid):
        param_list = model_selection.ParameterGrid(hyperparams.__dict__)
    else:
        param_list = hyperparams

    logging.info(
        'optimizing grid with {} configurations'.format(len(param_list)))

    gpu_index = 0
    processes = []
    for params in param_list:
        if isinstance(params, dict):
            params = blueno.ParamConfig(**params)

        check_data_in_sync(params)

        # This is where we'd run preprocessing. To run in a reasonable
        # amount of time, the raw data must be cached in-memory.
        arrays = preprocessing.prepare_data(params, train_test_val=False)
        x_train, x_valid, y_train, y_valid, id_train, id_valid = arrays

        # Start the model training job.
        # Run in a separate process to avoid memory issues.
        # Note how this depends on the offset.
        os.environ['CUDA_VISIBLE_DEVICES'] = f'{gpu_index + gpu_offset}'

        if params.job_fn is None:
            job_fn = start_job
        else:
            job_fn = params.job_fn

        logging.debug('using job fn {}'.format(job_fn))

        if params.job_name:
            job_name = params.job_name
        else:
            # Uses the parent of the data_dir to name the job,
            # which may not work for all data formats.
            job_name = str(pathlib.Path(params.data.data_dir).parent.name)
            job_name += f'_{y_train.shape[1]}-classes'

        process = multiprocessing.Process(target=job_fn,
                                          args=(x_train, y_train,
                                                x_valid, y_valid),
                                          kwargs={
                                              'params': params,
                                              'job_name': job_name,
                                              'username': username,
                                              'slack_token': slack_token,
                                              'log_dir': log_dir,
                                              'id_valid': id_valid,
                                          })

        gpu_index += 1
        gpu_index %= num_gpus
        logging.debug(f'gpu_index is now {gpu_index + gpu_offset}')

        process.start()
        processes.append(process)
        if gpu_index == 0:
            logging.info(f'all gpus used, calling join on processes:'
                         f' {processes}')
            p: multiprocessing.Process
            for p in processes:
                p.join()
            processes = []
            time.sleep(60)
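# A minimal sketch of driving hyperoptimize directly (not in the original
# file). It assumes param_list is a list of blueno.ParamConfig objects like
# those built elsewhere in this repo; the username and log directory are
# hypothetical, and log_dir must already exist.
def example_hyperoptimize(param_list):
    hyperoptimize(param_list,
                  username='example-user',  # hypothetical
                  num_gpus=2,  # round-robins jobs across GPUs 0 and 1
                  gpu_offset=0,
                  log_dir='/tmp/logs/')  # must already exist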
def simple_ensemble(model_blob_names: List[str],
                    data_dir: str,
                    labels_path: str,
                    loss: Callable,
                    seed: int,
                    val_split: float,
                    train_test_val: bool,
                    sort: bool):
    """Creates an ensemble from the list of model blob names.

    DO NOT mix models in compat/ with those in sorted_models/.

    :param model_blob_names: a list of model blob names, like compat/
    :param data_dir: the path to the data used by ALL models
    :param labels_path: the path to the labels used by ALL models
    :param loss: a loss function like
        keras.losses.categorical_crossentropy, used by ALL models
    :param seed: the seed of ALL of the models
    :param val_split: the val split of ALL of the models
    :param train_test_val: True if a train_test_val split was used on
        the models
    :param sort: set to True if you are loading from sorted_models/,
        False if loading from compat/
    :return:
    """
    # Set the params variable from the function arguments;
    # we'll need this to load the data as x_train, y_train, ...
    params = blueno.ParamConfig(
        data=blueno.DataConfig(
            # TODO: Generalize to work for all users
            data_dir=data_dir,
            labels_path=labels_path,
            index_col='Anon ID',
            label_col='occlusion_exists',
            gcs_url='',
        ),
        generator=None,
        model=blueno.ModelConfig(
            model_callable=None,
            optimizer=None,
            # TODO: Some may use a different loss
            loss=loss,
        ),
        batch_size=None,
        seed=seed,
        val_split=val_split,
    )

    x_train, x_valid, y_train, y_valid, _, _ = prepare_data(
        params, train_test_val=train_test_val, sort=sort)

    datagen = ImageDataGenerator(featurewise_center=True,
                                 featurewise_std_normalization=True)
    datagen.fit(x_train)

    client = storage.Client(project='elvo-198322')
    bucket = storage.Bucket(client, name='elvos')

    # This is a copy of the ensemble_models function, using
    # model names instead.
    models = []
    time1 = time.time()
    for i, blob_name in enumerate(model_blob_names):
        # Here we load and evaluate each individual model
        # so we can be sure that our data, validation split, and seed
        # are correct
        blob = bucket.get_blob(blob_name)
        if blob is None:
            raise ValueError(f'Blob {blob_name} does not exist')

        model_filepath = f'{i}.hdf5'
        print(f'downloading model {blob_name}')
        time2 = time.time()
        blob.download_to_filename(model_filepath)
        time3 = time.time()
        print(f'seconds to download: {time3 - time2}')

        print(f'loading model {blob_name}')
        model: keras.Model
        model = load_model(model_filepath, compile=True)
        os.remove(model_filepath)
        time4 = time.time()
        print(f'seconds to load: {time4 - time3}')

        # Used to check the model
        evaluate_model(model, datagen, x_valid, y_valid)
        model.name = f'model_{i}'
        models.append(model)

    # Finally we ensemble and evaluate the models here
    print('using models {}'.format(models))
    model_input = layers.Input(shape=models[0].input_shape[1:])
    ensemble = ensemble_models(models, model_input)
    evaluate_model(ensemble, datagen, x_valid, y_valid)
    time7 = time.time()
    print(f'seconds per ensemble: {time7 - time1}', flush=True)
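# Usage sketch (illustrative): ensembling two models stored in GCS. The
# blob names are hypothetical; real ones live under compat/ or
# sorted_models/ in the elvos bucket, and every listed model must share
# the same data, loss, seed, and val_split.
def example_simple_ensemble():
    simple_ensemble(
        model_blob_names=['compat/model_a.hdf5',  # hypothetical
                          'compat/model_b.hdf5'],  # hypothetical
        data_dir='/home/lzhu7/elvo-analysis/data/'
                 'processed-standard/arrays/',
        labels_path='/home/lzhu7/elvo-analysis/data/'
                    'processed-standard/labels.csv',
        loss=keras.losses.categorical_crossentropy,
        seed=0,
        val_split=0.1,
        train_test_val=False,
        sort=False)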
        'freeze': [False],
    }))
model_list = [blueno.ModelConfig(**m) for m in model_list]

PARAM_GRID = model_selection.ParameterGrid({
    'data': [
        blueno.DataConfig(
            data_dir=str(
                pathlib.Path(DATA_DIR) /
                'processed-new-training-2/arrays/'),
            labels_path=str(
                pathlib.Path(DATA_DIR) /
                'processed-new-training-2/labels.csv'),
            index_col='Anon ID',
            label_col='occlusion_exists',
            gcs_url='gs://elvos/processed/processed-new-training-2')
    ],
    'generator': [
        blueno.GeneratorConfig(
            generator_callable=generators.luke.standard_generators,
            rotation_range=30)
    ],
    'model': model_list,
    'batch_size': [5],
    'seed': [0],
    'val_split': [0.1],
    # So we run the grid 16 times
})

PARAM_GRID = [blueno.ParamConfig(**p) for p in PARAM_GRID]
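# One way to run the grid above (a sketch, not in the original file): hand
# the expanded PARAM_GRID to bluenot.hyperoptimize. The username and log
# directory are hypothetical placeholders; left commented out so importing
# this module does not start training.
# bluenot.hyperoptimize(PARAM_GRID, 'example-user',
#                       num_gpus=1, gpu_offset=0, log_dir='/tmp/logs/')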