def get_deepspeech(input_dim,
                   output_dim,
                   context=9,
                   units=2048,
                   dropouts=(0.05, 0.05, 0.05, 0, 0.05),
                   tflite_version=False,
                   is_mixed_precision=False,
                   lstm_implementation=2,
                   random_state=1) -> keras.Model:
    """
    The `get_deepspeech` returns the graph definition of the DeepSpeech
    model. Default parameters are overwritten only where it is needed.

    Reference:
    "Deep Speech: Scaling up end-to-end speech recognition."
    (https://arxiv.org/abs/1412.5567)

    Parameters
    ----------
    input_dim: size of the feature axis of the input tensor
    output_dim: number of units of the last time-distributed dense layer
    context: number of zero frames padded on each side of the time axis;
        the first convolution spans `2 * context + 1` frames
    units: width of the convolution, the hidden dense layers and the LSTM
    dropouts: five dropout rates, applied after layers 1-3, after the
        LSTM and after the 4th dense layer respectively
    tflite_version: if set, the time axis is fixed to length 1 and the
        LSTM is unrolled
    is_mixed_precision: if set, the global Keras policy is switched to
        `mixed_float16` while the graph is built and set to `float32`
        afterwards
    lstm_implementation: forwarded to the `implementation` argument of
        the LSTM layer
    random_state: seed passed to both NumPy and TensorFlow

    Returns
    -------
    The (uncompiled) `keras.Model` named 'DeepSpeech'.
    """
    if is_mixed_precision:
        policy = mixed_precision.Policy('mixed_float16')
        mixed_precision.set_policy(policy)

    # The policy is process-wide state: make sure it is reverted even when
    # the graph construction below raises.
    try:
        if dropouts[3] != 0:
            logger.warning("Mozilla DeepSpeech doesn't use dropout "
                           "after LSTM(dropouts[3]). Be careful!")

        np.random.seed(random_state)
        tf.random.set_seed(random_state)

        max_seq_length = None
        if tflite_version:
            max_seq_length = 1

        with tf.device('/gpu:0'):
            input_tensor = layers.Input([max_seq_length, input_dim], name='X')

            # Add 4th dimension [batch, time, frequency, channel]
            x = layers.Lambda(keras.backend.expand_dims,
                              arguments=dict(axis=3))(input_tensor)
            # Fill zeros around time dimension
            x = layers.ZeroPadding2D(padding=(context, 0))(x)
            # Convolve signal in time dim
            receptive_field = (2 * context + 1, input_dim)
            x = layers.Conv2D(filters=units, kernel_size=receptive_field)(x)
            # Squeeze into 3rd dim array
            x = layers.Lambda(keras.backend.squeeze,
                              arguments=dict(axis=2))(x)
            x = layers.ReLU()(x)
            x = layers.Dropout(rate=dropouts[0])(x)

            x = layers.TimeDistributed(layers.Dense(units),
                                       name='td_dense_2')(x)
            x = layers.ReLU(max_value=20)(x)
            x = layers.Dropout(rate=dropouts[1])(x)

            x = layers.TimeDistributed(layers.Dense(units),
                                       name='td_dense_3')(x)
            x = layers.ReLU(max_value=20)(x)
            x = layers.Dropout(rate=dropouts[2])(x)

            x = layers.LSTM(units,
                            return_sequences=True,
                            name='lstm_1',
                            unroll=tflite_version,
                            implementation=lstm_implementation)(x)
            x = layers.Dropout(rate=dropouts[3])(x)

            x = layers.TimeDistributed(layers.Dense(units),
                                       name='td_dense_4')(x)
            x = layers.ReLU(max_value=20)(x)
            x = layers.Dropout(rate=dropouts[4])(x)

            x = layers.TimeDistributed(layers.Dense(output_dim),
                                       name='td_dense_5')(x)
            model = keras.Model(input_tensor, x, name='DeepSpeech')
    finally:
        if is_mixed_precision:
            # revert policy
            policy = mixed_precision.Policy('float32')
            mixed_precision.set_policy(policy)
    return model
'bbox': nms.nmsed_boxes, 'classes': nms.nmsed_classes, 'confidence': nms.nmsed_scores, } if __name__ == '__main__': from yolo.utils.run_utils import prep_gpu from yolo.configs import yolo as exp_cfg from yolo.tasks.yolo import YoloTask import yolo.utils.export.tensor_rt as trt prep_gpu() from tensorflow.keras.mixed_precision import experimental as mixed_precision mixed_precision.set_policy('float16') # init a fake webcam # ls /dev/video* # sudo modprobe -r v4l2loopback # sudo modprobe v4l2loopback devices=1 video_nr=20 card_label="v4l2loopback" exclusive_caps=1 # name = "saved_models/v4/regular" # new_name = f"{name}_tensorrt" # model = trt.TensorRT(saved_model=new_name, save_new_path=new_name, max_workspace_size_bytes=4000000000, max_batch_size=5)#, precision_mode="INT8", use_calibration=True) # model.compile() # model.summary() # model.set_postprocessor_fn(func) config = exp_cfg.YoloTask() task = YoloTask(config)
def train(strategy, cfg):
    """Build (or resume) a pose model and fit it under `strategy`.

    Parameters
    ----------
    strategy: distribution strategy; its `scope()` wraps optimizer and
        model creation and it distributes the datasets.
    cfg: configuration object. This function reads the `MODEL`, `DATASET`,
        `TRAIN`, `VAL` and `EVAL` sections and writes back
        `TRAIN.WARMUP_FACTOR` (when `TRAIN.SCALE_LR` is set),
        `DATASET.OUTPUT_SHAPE` and `DATASET.SIGMA`.

    Returns
    -------
    The fitted model.

    Raises
    ------
    ValueError: if `cfg.MODEL.TYPE` is not one of the supported types.
    """
    os.makedirs(cfg.MODEL.SAVE_DIR, exist_ok=True)

    if cfg.DATASET.BFLOAT16:
        policy = mixed_precision.Policy('mixed_bfloat16')
        mixed_precision.set_policy(policy)

    tf.random.set_seed(cfg.TRAIN.SEED)
    np.random.seed(cfg.TRAIN.SEED)

    # steps per epoch (rounded up) / validation steps (rounded down)
    spe = int(np.ceil(cfg.DATASET.TRAIN_SAMPLES / cfg.TRAIN.BATCH_SIZE))
    spv = cfg.DATASET.VAL_SAMPLES // cfg.VAL.BATCH_SIZE

    if cfg.TRAIN.SCALE_LR:
        # linear scaling of the base LR relative to a batch size of 32
        lr = cfg.TRAIN.BASE_LR * cfg.TRAIN.BATCH_SIZE / 32
        cfg.TRAIN.WARMUP_FACTOR = 32 / cfg.TRAIN.BATCH_SIZE
    else:
        lr = cfg.TRAIN.BASE_LR

    if cfg.TRAIN.LR_SCHEDULE == 'warmup_cosine_decay':
        lr_schedule = WarmupCosineDecay(
            initial_learning_rate=lr,
            decay_steps=cfg.TRAIN.EPOCHS * spe,
            warmup_steps=cfg.TRAIN.WARMUP_EPOCHS * spe,
            warmup_factor=cfg.TRAIN.WARMUP_FACTOR)
    elif cfg.TRAIN.LR_SCHEDULE == 'warmup_piecewise':
        lr_schedule = WarmupPiecewise(
            boundaries=[x * spe for x in cfg.TRAIN.DECAY_EPOCHS],
            values=[lr, lr / 10, lr / 10 ** 2],
            warmup_steps=spe * cfg.TRAIN.WARMUP_EPOCHS,
            warmup_factor=cfg.TRAIN.WARMUP_FACTOR)
    else:
        # any other value means a constant learning rate
        lr_schedule = lr

    with strategy.scope():
        optimizer = tf.keras.optimizers.Adam(lr_schedule)

        if cfg.TRAIN.WANDB_RUN_ID:
            # resume: fetch the best checkpoint of a previous W&B run
            api = wandb.Api()
            run = api.run(f"{cfg.EVAL.WANDB_RUNS}/{cfg.TRAIN.WANDB_RUN_ID}")
            run.file("model-best.h5").download(replace=True)
            model = tf.keras.models.load_model(
                'model-best.h5',
                custom_objects={
                    'relu6': tf.nn.relu6,
                    'WarmupCosineDecay': WarmupCosineDecay
                })
            model.compile(optimizer=model.optimizer, loss=mse)
        else:
            model_builders = {
                'simple_baseline': SimpleBaseline,
                'hrnet': HRNet,
                'evopose': EvoPose,
                'eflite': EfficientNetLite,
                'ef': EfficientNet,
            }
            try:
                build = model_builders[cfg.MODEL.TYPE]
            except KeyError:
                # previously this surfaced later as an unbound `model`
                raise ValueError(
                    'Unknown cfg.MODEL.TYPE: {!r} (expected one of {})'.format(
                        cfg.MODEL.TYPE, sorted(model_builders))) from None
            model = build(cfg)
            model.compile(optimizer=optimizer, loss=mse)

    cfg.DATASET.OUTPUT_SHAPE = model.output_shape[1:]
    cfg.DATASET.SIGMA = 2 * cfg.DATASET.OUTPUT_SHAPE[0] / 64

    wandb_config = setup_wandb(cfg, model)

    train_ds = load_tfds(cfg, 'train')
    train_ds = strategy.experimental_distribute_dataset(train_ds)

    # `val_ds` must exist even when validation is disabled, otherwise the
    # `fit` call below fails with a NameError.
    val_ds = None
    if cfg.TRAIN.VAL:
        val_ds = load_tfds(cfg, 'val')
        val_ds = strategy.experimental_distribute_dataset(val_ds)

    print('Training {} ({} / {}) on {} for {} epochs'
          .format(cfg.MODEL.NAME, wandb_config.parameters,
                  wandb_config.flops, cfg.TRAIN.ACCELERATOR, cfg.TRAIN.EPOCHS))

    initial_epoch = 0
    if cfg.TRAIN.WANDB_RUN_ID:
        initial_epoch = cfg.TRAIN.INITIAL_EPOCH

    model.fit(train_ds,
              initial_epoch=initial_epoch,
              epochs=cfg.TRAIN.EPOCHS,
              verbose=1,
              validation_data=val_ds,
              validation_steps=spv if val_ds is not None else None,
              steps_per_epoch=spe,
              callbacks=[WandbCallback()])
    return model
def train(datadir, var_dict, output_vars, filters, kernels, lr, batch_size,
          early_stopping_patience, epochs, exp_id, model_save_dir,
          pred_save_dir, train_years, valid_years, test_years, lead_time, gpu,
          norm_subsample, data_subsample, lr_step, lr_divide, network_type,
          restore_best_weights, bn_position, nt_in, dt_in, use_bias, l2, skip,
          dropout, reduce_lr_patience, reduce_lr_factor, min_lr_times, unres,
          loss, cmip, cmip_dir, pretrained_model, last_pretrained_layer,
          last_trainable_layer, min_es_delta, optimizer, activation, ext_mean,
          ext_std, cont_time, multi_dt, momentum, parametric, one_cycle,
          long_skip, train_tfr_files, valid_tfr_files, test_tfr_files,
          tfr_num_parallel_calls, tfr_buffer_size, tfr_prefetch, y_roll,
          X_roll, discard_first, min_lead_time, relu_idxs, tp_log,
          tfr_out_idxs, predict_difference, is_categorical, bin_min, bin_max,
          num_bins, quantile_bins, **kwargs):
    """Train a network end to end, save it and write test predictions.

    Steps, in order: restrict visible GPUs and create a MirroredStrategy,
    enable the `mixed_float16` policy, create train/valid/test generators
    with `load_data`, build (and optionally warm-start) the model, compile
    and fit it, save model / weights / history / normalisation files,
    create predictions for the test generator and, for non-CMIP runs,
    print the weighted RMSE against the 5.625deg validation data.

    All arguments are plain configuration values that are forwarded to the
    helpers called below; unknown extra options are accepted through
    `**kwargs` and ignored.

    Raises
    ------
    ValueError: if `network_type` is neither 'resnet' nor 'uresnet', or if
        `optimizer` is not one of 'adam', 'adadelta', 'sgd', 'rmsprop'.
    """
    print(type(var_dict))

    os.environ["CUDA_VISIBLE_DEVICES"] = ','.join([str(g) for g in gpu])
    # After masking, the visible devices are renumbered from 0.
    mirrored_strategy = tf.distribute.MirroredStrategy(
        devices=[f"/gpu:{i}" for i in range(len(gpu))])

    # Mixed precision policy
    policy = mixed_precision.Policy('mixed_float16')
    mixed_precision.set_policy(policy)

    def _load(cmip_source):
        # Single place for the `load_data` call; only the CMIP directory
        # differs between the three situations handled below.
        return load_data(
            var_dict, datadir, cmip, cmip_source, train_years, valid_years,
            test_years, lead_time, batch_size, output_vars, data_subsample,
            norm_subsample, nt_in, dt_in,
            ext_mean=ext_mean,
            ext_std=ext_std,
            cont_time=cont_time,
            multi_dt=multi_dt,
            train_tfr_files=train_tfr_files,
            valid_tfr_files=valid_tfr_files,
            test_tfr_files=test_tfr_files,
            tfr_num_parallel_calls=tfr_num_parallel_calls,
            tfr_buffer_size=tfr_buffer_size,
            tfr_prefetch=tfr_prefetch,
            y_roll=y_roll,
            X_roll=X_roll,
            discard_first=discard_first,
            min_lead_time=min_lead_time,
            tp_log=tp_log,
            tfr_out_idxs=tfr_out_idxs,
            predict_difference=predict_difference,
            is_categorical=is_categorical,
            bin_min=bin_min,
            bin_max=bin_max,
            num_bins=num_bins,
            quantile_bins=quantile_bins)

    # Open dataset and create data generators
    if cmip:
        if len(cmip_dir) > 1:
            dg_train, dg_valid, dg_test = [], [], []
            for cd in cmip_dir:
                dgtr, dgv, dgte = _load(cd)
                dg_train.append(dgtr)
                dg_valid.append(dgv)
                dg_test.append(dgte)
            dg_train, dg_valid, dg_test = [
                CombinedDataGenerator(dg, batch_size)
                for dg in [dg_train, dg_valid, dg_test]
            ]
        else:
            dg_train, dg_valid, dg_test = _load(cmip_dir[0])
    else:
        dg_train, dg_valid, dg_test = _load(cmip_dir)

    # Build model
    if pretrained_model is not None:
        # NOTE(review): both latitude losses are mapped to `mse` here; this
        # looks like a placeholder needed only for deserialisation since
        # just the weights are copied below -- confirm.
        pretrained_model = keras.models.load_model(
            pretrained_model,
            custom_objects={
                'PeriodicConv2D': PeriodicConv2D,
                'ChannelReLU2D': ChannelReLU2D,
                'lat_mse': keras.losses.mse,
                'lat_mae': keras.losses.mse
            })

    with mirrored_strategy.scope():
        if network_type == 'resnet':
            model = build_resnet(filters, kernels,
                                 input_shape=dg_train.shape,
                                 bn_position=bn_position,
                                 use_bias=use_bias,
                                 l2=l2,
                                 skip=skip,
                                 dropout=dropout,
                                 activation=activation,
                                 long_skip=long_skip,
                                 relu_idxs=relu_idxs,
                                 categorical=is_categorical,
                                 nvars=len(dg_train.output_idxs))
        elif network_type == 'uresnet':
            model = build_uresnet(filters, kernels, unres,
                                  input_shape=dg_train.shape,
                                  bn_position=bn_position,
                                  use_bias=use_bias,
                                  l2=l2,
                                  skip=skip,
                                  dropout=dropout,
                                  activation=activation)
        else:
            raise ValueError(f'Unknown network_type: {network_type!r}')

        if pretrained_model is not None:
            # Copy over weights, up to and including `last_pretrained_layer`
            for i, l in enumerate(pretrained_model.layers):
                model.layers[i].set_weights(l.get_weights())
                if l.name == last_pretrained_layer:
                    break
            # Set trainable to false, up to and including
            # `last_trainable_layer`
            if last_trainable_layer is not None:
                for l in model.layers:
                    l.trainable = False
                    if l.name == last_trainable_layer:
                        break

        if multi_dt > 1:
            model = create_multi_dt_model(model, multi_dt, dg_train)

        # Resolve named latitude-weighted losses; any other value is handed
        # to `compile` unchanged.
        if loss == 'lat_mse':
            loss = create_lat_mse(dg_train.data.lat)
        elif loss == 'lat_mae':
            loss = create_lat_mae(dg_train.data.lat)
        elif loss == 'lat_rmse':
            loss = create_lat_rmse(dg_train.data.lat)
        elif loss == 'lat_crps':
            loss = create_lat_crps(dg_train.data.lat,
                                   len(dg_train.output_idxs))
        elif loss == 'lat_crps_relu':
            loss = create_lat_crps(dg_train.data.lat,
                                   len(dg_train.output_idxs), relu=True)
        elif loss == 'lat_crps_mae':
            loss = create_lat_crps_mae(dg_train.data.lat,
                                       len(dg_train.output_idxs))
        elif loss == 'lat_crps_lcgev':
            loss = create_lat_crps_lcgev(dg_train.data.lat,
                                         len(dg_train.output_idxs))
        elif loss == 'lat_log_loss':
            loss = create_lat_log_loss(dg_train.data.lat,
                                       len(dg_train.output_idxs))
        elif loss == 'lat_categorical_crossentropy':
            loss = create_lat_categorical_loss(dg_train.data.lat,
                                               len(dg_train.output_idxs))

        if optimizer == 'adam':
            opt = keras.optimizers.Adam(lr)
        elif optimizer == 'adadelta':
            opt = keras.optimizers.Adadelta(lr)
        elif optimizer == 'sgd':
            opt = keras.optimizers.SGD(lr, momentum=momentum, nesterov=True)
        elif optimizer == 'rmsprop':
            opt = keras.optimizers.RMSprop(lr, momentum=momentum)
        else:
            raise ValueError(f'Unknown optimizer: {optimizer!r}')

        model.compile(opt, loss)
        # `summary()` prints by itself and returns None
        model.summary()

    # Learning rate settings
    callbacks = []
    if early_stopping_patience is not None:
        callbacks.append(
            tf.keras.callbacks.EarlyStopping(
                patience=early_stopping_patience,
                verbose=1,
                min_delta=min_es_delta,
                mode='auto',
                restore_best_weights=restore_best_weights))
    if reduce_lr_patience is not None:
        callbacks.append(
            tf.keras.callbacks.ReduceLROnPlateau(
                patience=reduce_lr_patience,
                factor=reduce_lr_factor,
                verbose=1,
                # i.e. allow at most `min_lr_times` reductions
                min_lr=reduce_lr_factor**min_lr_times * lr,
            ))
    if lr_step is not None:
        callbacks.append(
            keras.callbacks.LearningRateScheduler(
                LRUpdate(lr, lr_step, lr_divide)))
    if one_cycle:
        callbacks.append(
            OneCycleLR(
                lr,
                maximum_momentum=None if not optimizer == 'sgd' else 0.95,
                minimum_momentum=None if not optimizer == 'sgd' else 0.85,
                verbose=1))

    # Train model (prefer the TFRecord dataset when the generator has one)
    history = model.fit(dg_train.tfr_dataset or dg_train,
                        epochs=epochs,
                        validation_data=dg_valid.tfr_dataset or dg_valid,
                        callbacks=callbacks)

    print(f'Saving model: {model_save_dir}/{exp_id}.h5')
    model.save(f'{model_save_dir}/{exp_id}.h5')
    print(f'Saving model weights: {model_save_dir}/{exp_id}_weights.h5')
    model.save_weights(f'{model_save_dir}/{exp_id}_weights.h5')
    print(f'Saving training_history: {model_save_dir}/{exp_id}_history.pkl')
    to_pickle(history.history, f'{model_save_dir}/{exp_id}_history.pkl')
    print(
        f'Saving norm files: {model_save_dir}/{exp_id}_mean.nc and {model_save_dir}/{exp_id}_std.nc'
    )
    dg_train.mean.to_netcdf(f'{model_save_dir}/{exp_id}_mean.nc')
    dg_train.std.to_netcdf(f'{model_save_dir}/{exp_id}_std.nc')

    # Create predictions
    preds = create_predictions(model, dg_test,
                               parametric=parametric,
                               multi_dt=multi_dt > 1)
    if len(preds.lat) != 32:
        preds = regrid(preds, ddeg_out=5.625)
    print(f'Saving predictions: {pred_save_dir}/{exp_id}.nc')
    preds.to_netcdf(f'{pred_save_dir}/{exp_id}.nc')

    # Print score in real units
    if not cmip:
        if '5.625deg' in datadir:
            valdir = datadir
        else:
            valdir = '/'.join(datadir.split('/')[:-2] + ['5.625deg/'])
        test_period = slice(test_years[0], test_years[1])
        z500_valid = load_test_data(f'{valdir}geopotential_500', 'z',
                                    years=test_period).drop('level')
        t850_valid = load_test_data(f'{valdir}temperature_850', 't',
                                    years=test_period).drop('level')
        tp = xr.open_mfdataset(
            f'{valdir}/6hr_precipitation/*.nc',
            combine='by_coords').sel(time=test_period)
        t2m = xr.open_mfdataset(
            f'{valdir}/2m_temperature/*.nc',
            combine='by_coords').sel(time=test_period)
        valid = xr.merge([z500_valid, t850_valid, tp, t2m])
        print(compute_weighted_rmse(preds, valid).load())
def train_updnet(
        multicoil=True,
        brain=False,
        af=4,
        contrast=None,
        cuda_visible_devices='0123',
        n_samples=None,
        n_epochs=200,
        n_iter=10,
        use_mixed_precision=False,
        n_layers=3,
        base_n_filter=16,
        non_linearity='relu',
        channel_attention_kwargs=None,
        refine_smaps=False,
        loss='mae',
        original_run_id=None,
        fixed_masks=False,
        n_epochs_original=250,
        equidistant_fake=False,
        mask_type=None,
    ):
    """Train an UPDNet on fastMRI data and return the generated run id.

    The run id is assembled from the network flavour, every option that
    differs from its default and the current Unix time; it names both the
    checkpoint files and the TensorBoard log directory.  When
    `original_run_id` is given, the weights saved by that run after
    `n_epochs_original` epochs (1 if the `FASTMRI_DEBUG` environment
    variable is set) are loaded before fitting, with a learning rate of
    1e-7 instead of 1e-4.
    """
    n_volumes = brain_n_volumes_train if brain else n_volumes_train

    # data locations
    if not multicoil:
        train_path = f'{FASTMRI_DATA_DIR}singlecoil_train/singlecoil_train/'
        val_path = f'{FASTMRI_DATA_DIR}singlecoil_val/'
    elif brain:
        train_path = f'{FASTMRI_DATA_DIR}brain_multicoil_train/'
        val_path = f'{FASTMRI_DATA_DIR}brain_multicoil_val/'
    else:
        train_path = f'{FASTMRI_DATA_DIR}multicoil_train/'
        val_path = f'{FASTMRI_DATA_DIR}multicoil_val/'

    # one character of `cuda_visible_devices` per device index
    os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(cuda_visible_devices)
    af = int(af)

    # global precision policy
    policy_type = 'mixed_float16' if use_mixed_precision else 'float32'
    mixed_precision.set_policy(mixed_precision.Policy(policy_type))

    # dataset builder and its extra keyword arguments
    if multicoil:
        dataset = multicoil_dataset
        if mask_type is None:
            if not brain:
                mask_type = 'random'
            elif equidistant_fake:
                mask_type = 'equidistant_fake'
            else:
                mask_type = 'equidistant'
        kwargs = dict(
            parallel=False,
            output_shape_spec=brain,
            mask_type=mask_type,
        )
    else:
        dataset = singlecoil_dataset
        kwargs = dict()

    common_set_args = dict(
        AF=af,
        contrast=contrast,
        inner_slices=None,
        rand=True,
        scale_factor=1e6,
    )
    train_set = dataset(
        train_path,
        n_samples=n_samples,
        fixed_masks=fixed_masks,
        **common_set_args,
        **kwargs
    )
    val_set = dataset(val_path, **common_set_args, **kwargs)

    run_params = {
        'n_primal': 5,
        'n_dual': 1,
        'primal_only': True,
        'multicoil': multicoil,
        'n_layers': n_layers,
        'layers_n_channels': [base_n_filter * 2**i for i in range(n_layers)],
        'non_linearity': non_linearity,
        'n_iter': n_iter,
        'channel_attention_kwargs': channel_attention_kwargs,
        'refine_smaps': refine_smaps,
        'output_shape_spec': brain,
    }

    # run id: flavour + non-default options + timestamp
    if multicoil:
        updnet_type = 'updnet_sense_'
        if brain:
            updnet_type += 'brain_'
    else:
        updnet_type = 'updnet_singlecoil_'
    info_chunks = [f'af{af}']
    if contrast is not None:
        info_chunks.append(f'_{contrast}')
    if n_samples is not None:
        info_chunks.append(f'_{n_samples}')
    if n_iter != 10:
        info_chunks.append(f'_i{n_iter}')
    if non_linearity != 'relu':
        info_chunks.append(f'_{non_linearity}')
    if n_layers != 3:
        info_chunks.append(f'_l{n_layers}')
    if base_n_filter != 16:
        info_chunks.append(f'_bf{base_n_filter}')
    if loss != 'mae':
        info_chunks.append(f'_{loss}')
    if channel_attention_kwargs:
        info_chunks.append('_ca')
    if refine_smaps:
        info_chunks.append('_rf_sm')
    if fixed_masks:
        info_chunks.append('_fixed_masks')
    additional_info = ''.join(info_chunks)
    run_id = f'{updnet_type}_{additional_info}_{int(time.time())}'

    # callbacks
    chkpt_path = f'{CHECKPOINTS_DIR}checkpoints/{run_id}' + '-{epoch:02d}.hdf5'
    chkpt_cback = ModelCheckpoint(chkpt_path, period=n_epochs, save_weights_only=True)
    log_dir = op.join(f'{LOGS_DIR}logs', run_id)
    tboard_cback = TensorBoard(
        profile_batch=0,
        log_dir=log_dir,
        histogram_freq=0,
        write_graph=False,
        write_images=False,
    )
    tqdm_cback = TQDMProgressBar()

    model = UPDNet(**run_params)
    fine_tuning = original_run_id is not None
    if fine_tuning:
        lr = 1e-7
        n_steps = brain_volumes_per_contrast['train'].get(contrast, n_volumes//2)
    else:
        lr = 1e-4
        n_steps = n_volumes
    default_model_compile(model, lr=lr, loss=loss)
    print(run_id)
    if fine_tuning:
        if os.environ.get('FASTMRI_DEBUG'):
            n_epochs_original = 1
        model.load_weights(f'{CHECKPOINTS_DIR}checkpoints/{original_run_id}-{n_epochs_original:02d}.hdf5')

    model.fit(
        train_set,
        steps_per_epoch=n_steps,
        epochs=n_epochs,
        validation_data=val_set,
        validation_steps=2,
        verbose=0,
        callbacks=[tboard_cback, chkpt_cback, tqdm_cback],
    )
    return run_id
def get_deepspeech2(input_dim,
                    output_dim,
                    is_mixed_precision=True,
                    rnn_units=800,
                    random_state=1) -> keras.Model:
    """
    Return the graph definition of the DeepSpeech2 model: two strided 2D
    convolutions (each followed by batch normalisation and ReLU), five
    bidirectional GRU layers and two time-distributed dense layers.

    Parameters
    ----------
    input_dim: size of the feature axis of the input tensor
    output_dim: number of units of the last dense layer
    is_mixed_precision: if set, the global Keras policy is switched to
        `mixed_float16` (it is not switched back by this function)
    rnn_units: number of units of every GRU layer; the first dense layer
        has twice as many
    random_state: seed passed to both NumPy and TensorFlow
    """
    if is_mixed_precision:
        mixed_precision.set_policy(mixed_precision.Policy('mixed_float16'))

    np.random.seed(random_state)
    tf.random.set_seed(random_state)

    # (conv name, batch-norm name, relu name, kernel size, strides)
    conv_specs = (
        ('conv_1', 'conv_1_bn', 'conv_1_relu', [11, 41], [2, 2]),
        ('conv_2', 'conv_2_bn', 'conv_2_relu', [11, 21], [1, 2]),
    )
    n_recurrent_layers = 5

    # Create model under CPU scope and avoid OOM, errors during concatenation
    # a large distributed model.
    with tf.device('/cpu:0'):
        # Define input tensor [batch, time, features]
        input_tensor = layers.Input([None, input_dim], name='X')

        # Add 4th dimension [batch, time, frequency, channel]
        add_channel_axis = layers.Lambda(keras.backend.expand_dims,
                                         arguments=dict(axis=-1))
        x = add_channel_axis(input_tensor)

        for conv_name, bn_name, relu_name, kernel, strides in conv_specs:
            x = layers.Conv2D(filters=32,
                              kernel_size=kernel,
                              strides=strides,
                              padding='same',
                              use_bias=False,
                              name=conv_name)(x)
            x = layers.BatchNormalization(name=bn_name)(x)
            x = layers.ReLU(name=relu_name)(x)

        # We need to squeeze to 3D tensor. Thanks to the stride in frequency
        # domain, we reduce the number of features four times for each channel.
        x = layers.Reshape([-1, input_dim // 4 * 32])(x)

        for i in range(1, n_recurrent_layers + 1):
            recurrent = layers.GRU(units=rnn_units,
                                   activation='tanh',
                                   recurrent_activation='sigmoid',
                                   use_bias=True,
                                   return_sequences=True,
                                   reset_after=True,
                                   name=f'gru_{i}')
            x = layers.Bidirectional(recurrent,
                                     name=f'bidirectional_{i}',
                                     merge_mode='concat')(x)
            # Dropout only between the recurrent layers, not after the last
            if i < n_recurrent_layers:
                x = layers.Dropout(rate=0.5)(x)

        # Return at each time step logits along characters. Then CTC
        # computation is more stable, in contrast to the softmax.
        x = layers.TimeDistributed(layers.Dense(units=rnn_units * 2),
                                   name='dense_1')(x)
        x = layers.ReLU(name='dense_1_relu')(x)
        x = layers.Dropout(rate=0.5)(x)
        output_tensor = layers.TimeDistributed(layers.Dense(units=output_dim),
                                               name='dense_2')(x)

        model = keras.Model(input_tensor, output_tensor, name='DeepSpeech2')
    return model
def change_policy(policy):
    """Set `policy` as the global Keras mixed-precision policy.

    The value is forwarded unchanged to the experimental `set_policy`
    function; nothing is returned.
    """
    # Imported lazily so TensorFlow is only loaded when the policy is changed.
    from tensorflow.keras.mixed_precision import experimental as mp_experimental

    mp_experimental.set_policy(policy)
def train_dealiaser(
        model_fun,
        model_kwargs,
        run_id,
        n_scales=0,
        multicoil=False,
        af=4,
        contrast=None,
        cuda_visible_devices='0123',
        n_samples=None,
        n_epochs=200,
        use_mixed_precision=False,
        loss='mae',
        original_run_id=None,
        fixed_masks=False,
        n_steps_per_epoch=973,
        n_epochs_original=250,
    ):
    """Train a `MultiscaleComplex` de-aliasing model and return its run id.

    Parameters
    ----------
    model_fun, model_kwargs, n_scales: forwarded to `MultiscaleComplex`.
    run_id: prefix of the final run id; the acceleration factor, every
        option differing from its default and the current Unix time are
        appended to it.
    multicoil: selects the multi-coil or single-coil dataset and paths.
    af: acceleration factor (converted with `int`).
    contrast, n_samples, fixed_masks: forwarded to the dataset builder.
    cuda_visible_devices: one character per GPU index.
    n_epochs: number of epochs; also the checkpoint period.
    use_mixed_precision: selects the `mixed_float16` global policy,
        otherwise `float32` is set.
    loss: forwarded to `default_model_compile`.
    original_run_id: if given, weights of that run are loaded first and
        training uses a learning rate of 1e-7 and half the steps per epoch.
    n_steps_per_epoch: steps per epoch when training from scratch.
    n_epochs_original: epoch suffix of the checkpoint to load for
        `original_run_id` (previously fixed to 250, which stays the
        default); forced to 1 when the `FASTMRI_DEBUG` environment variable
        is set.

    Returns
    -------
    The full run id string.
    """
    # paths
    if multicoil:
        train_path = f'{FASTMRI_DATA_DIR}multicoil_train/'
        val_path = f'{FASTMRI_DATA_DIR}multicoil_val/'
    else:
        train_path = f'{FASTMRI_DATA_DIR}singlecoil_train/singlecoil_train/'
        val_path = f'{FASTMRI_DATA_DIR}singlecoil_val/'

    os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(cuda_visible_devices)
    af = int(af)

    # trying mixed precision
    if use_mixed_precision:
        policy_type = 'mixed_float16'
    else:
        policy_type = 'float32'
    policy = mixed_precision.Policy(policy_type)
    mixed_precision.set_policy(policy)

    # generators
    if multicoil:
        dataset = multicoil_dataset
        kwargs = {'parallel': False}
    else:
        dataset = singlecoil_dataset
        kwargs = {}
    train_set = dataset(
        train_path,
        AF=af,
        contrast=contrast,
        inner_slices=None,
        rand=True,
        scale_factor=1e6,
        n_samples=n_samples,
        fixed_masks=fixed_masks,
        **kwargs
    )
    val_set = dataset(
        val_path,
        AF=af,
        contrast=contrast,
        inner_slices=None,
        rand=True,
        scale_factor=1e6,
        **kwargs
    )

    additional_info = f'af{af}'
    if contrast is not None:
        additional_info += f'_{contrast}'
    if n_samples is not None:
        additional_info += f'_{n_samples}'
    if loss != 'mae':
        additional_info += f'_{loss}'
    if fixed_masks:
        additional_info += '_fixed_masks'

    run_id = f'{run_id}_{additional_info}_{int(time.time())}'
    chkpt_path = f'{CHECKPOINTS_DIR}checkpoints/{run_id}' + '-{epoch:02d}.hdf5'
    # with period == n_epochs a checkpoint is written at the last epoch
    chkpt_cback = ModelCheckpoint(chkpt_path, period=n_epochs, save_weights_only=True)
    log_dir = op.join(f'{LOGS_DIR}logs', run_id)
    tboard_cback = TensorBoard(
        profile_batch=0,
        log_dir=log_dir,
        histogram_freq=0,
        write_graph=False,
        write_images=False,
    )
    tqdm_cback = TQDMProgressBar()

    model = MultiscaleComplex(
        model_fun=model_fun,
        model_kwargs=model_kwargs,
        res=False,
        n_scales=n_scales,
        fastmri_format=True,
    )
    if original_run_id is not None:
        lr = 1e-7
        n_steps = n_steps_per_epoch//2
    else:
        lr = 1e-4
        n_steps = n_steps_per_epoch
    default_model_compile(model, lr=lr, loss=loss)
    print(run_id)
    if original_run_id is not None:
        if os.environ.get('FASTMRI_DEBUG'):
            n_epochs_original = 1
        # call the model once on a batch before loading the weights
        model(next(iter(train_set))[0])
        model.load_weights(f'{CHECKPOINTS_DIR}checkpoints/{original_run_id}-{n_epochs_original:02d}.hdf5')

    model.fit(
        train_set,
        steps_per_epoch=n_steps,
        epochs=n_epochs,
        validation_data=val_set,
        validation_steps=5,
        validation_freq=5,
        verbose=0,
        callbacks=[tboard_cback, chkpt_cback, tqdm_cback],
    )
    return run_id
def main(logdir, config):
    """Run the Dreamer train / evaluate loop.

    Normalises `config` in place (episode directories, step counts divided
    by `action_repeat`, activation resolved from `tf.nn`), prepares the
    episode stores and environments, pre-fills the training data with a
    random agent, then alternates evaluation and training until the
    agent's step counter reaches `config.steps`.  The agent is saved to
    `variables.pkl` inside `logdir` after every training phase, and all
    environments are closed at the end.
    """
    logdir = pathlib.Path(logdir).expanduser()
    config.traindir = config.traindir or logdir / 'train_eps'
    config.evaldir = config.evaldir or logdir / 'eval_eps'
    # Express every step-based setting in agent steps.
    config.steps //= config.action_repeat
    config.eval_every //= config.action_repeat
    config.log_every //= config.action_repeat
    config.time_limit //= config.action_repeat
    config.act = getattr(tf.nn, config.act)

    if config.debug:
        tf.config.experimental_run_functions_eagerly(True)
    if config.gpu_growth:
        message = 'No GPU found. To actually train on CPU remove this assert.'
        found_gpus = tf.config.experimental.list_physical_devices('GPU')
        assert found_gpus, message
        for gpu in found_gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    assert config.precision in (16, 32), config.precision
    if config.precision == 16:
        prec.set_policy(prec.Policy('mixed_float16'))

    print('Logdir', logdir)
    for folder in (logdir, config.traindir, config.evaldir):
        folder.mkdir(parents=True, exist_ok=True)
    step = count_steps(config.traindir)
    logger = tools.Logger(logdir, config.action_repeat * step)

    print('Create envs.')
    if config.offline_traindir:
        train_source = config.offline_traindir.format(**vars(config))
    else:
        train_source = config.traindir
    train_eps = tools.load_episodes(train_source, limit=config.dataset_size)
    if config.offline_evaldir:
        eval_source = config.offline_evaldir.format(**vars(config))
    else:
        eval_source = config.evaldir
    eval_eps = tools.load_episodes(eval_source, limit=1)

    def make(mode):
        return make_env(config, logger, mode, train_eps, eval_eps)

    train_envs = [make('train') for _ in range(config.envs)]
    eval_envs = [make('eval') for _ in range(config.envs)]
    acts = train_envs[0].action_space
    config.num_actions = acts.n if hasattr(acts, 'n') else acts.shape[0]

    prefill = max(0, config.prefill - count_steps(config.traindir))
    print(f'Prefill dataset ({prefill} steps).')

    def random_agent(o, d, s):
        # one sampled action per entry of `d`; the state is passed through
        return [acts.sample() for _ in d], s

    tools.simulate(random_agent, train_envs, prefill)
    tools.simulate(random_agent, eval_envs, episodes=1)
    logger.step = config.action_repeat * count_steps(config.traindir)

    print('Simulate agent.')
    train_dataset = make_dataset(train_eps, config)
    eval_dataset = iter(make_dataset(eval_eps, config))
    agent = Dreamer(config, logger, train_dataset)
    checkpoint = logdir / 'variables.pkl'
    if checkpoint.exists():
        agent.load(checkpoint)
        agent._should_pretrain._once = False

    state = None
    while agent._step.numpy().item() < config.steps:
        logger.write()
        print('Start evaluation.')
        video_pred = agent._wm.video_pred(next(eval_dataset))
        logger.video('eval_openl', video_pred)
        eval_policy = functools.partial(agent, training=False)
        tools.simulate(eval_policy, eval_envs, episodes=1)
        print('Start training.')
        state = tools.simulate(agent, train_envs, config.eval_every,
                               state=state)
        agent.save(logdir / 'variables.pkl')

    # Best-effort shutdown: a failing close must not mask a finished run.
    for env in train_envs + eval_envs:
        try:
            env.close()
        except Exception:
            pass
def train_model(data_path,
                batch_size,
                image_size,
                crop_size,
                lr_schedule_name,
                init_lr,
                max_lr,
                weight_decay,
                optimizer,
                model_type,
                embedding_size,
                num_epochs,
                checkpoint_path,
                margin=0.5,
                cache_path=None,
                range_test=False,
                use_tpu=False,
                tpu_name=None,
                use_mixed_precision=False,
                distributed=False,
                eager_execution=False,
                weights_path='',
                checkpoint_interval=5000,
                step_size=6000,
                recompile=False,
                steps_per_epoch=None,
                logist_scale=64):
    """Train the embedding network, or run a learning-rate range test.

    A TPU strategy is used when `use_tpu is True` (then `tpu_name` is
    required), a MirroredStrategy when `distributed is True` without TPU,
    and no strategy otherwise.  With `range_test is True` the model is
    fitted with a `RangeTestCallback` between `init_lr` and `max_lr` and
    nothing is saved; otherwise the model is fitted with the requested
    schedule, checkpointed every `checkpoint_interval` under
    `checkpoint_path` and finally saved below "./results".

    Remaining arguments are forwarded to `generate_training_dataset`,
    `get_learning_rate_schedule`, `get_optimizer` and
    `create_neural_network`.  Returns None.
    """
    from contextlib import nullcontext

    strategy = None
    if use_tpu is True:
        assert tpu_name is not None, '[ERROR] TPU name must be specified'
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            tpu=tpu_name)
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        strategy = tf.distribute.experimental.TPUStrategy(resolver)
        print("[INFO] TPUs: ", tf.config.list_logical_devices('TPU'))

    if use_mixed_precision is True:
        if use_tpu is True:
            policy = mixed_precision.Policy('mixed_bfloat16')
        else:
            policy = mixed_precision.Policy('mixed_float16')
        mixed_precision.set_policy(policy)
        print(
            "[INFO] Using mixed precision for training. This will reduce memory consumption\n"
        )

    use_mirrored = distributed is True and use_tpu is False
    if use_mirrored:
        strategy = tf.distribute.MirroredStrategy()
        print("[INFO] Using distributed training strategy on GPU")

    train_dataset, n_imgs, n_classes = generate_training_dataset(
        data_path=data_path,
        image_size=image_size,
        batch_size=batch_size,
        crop_size=crop_size,
        cache=cache_path,
        use_mixed_precision=use_mixed_precision,
        use_tpu=use_tpu,
        model_type=model_type)

    run_eagerly = eager_execution if eager_execution is not None else False

    log_dir = './logs/log_' + datetime.now().strftime("%Y%m%d_%H%M%S")
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir,
                                                          update_freq=100,
                                                          write_graph=False)
    stop_on_nan = tf.keras.callbacks.TerminateOnNaN()

    loss_fn = SoftmaxLoss()

    def _build_model(opt, lr_schedule, load_error):
        # Create the network (and compile it when it did not come back
        # compiled) inside the active strategy scope, if there is one.
        scope = strategy.scope() if strategy is not None else nullcontext()
        with scope:
            model, compiled = create_neural_network(
                model_type=model_type,
                embedding_size=embedding_size,
                weights_path=weights_path,
                n_classes=n_classes,
                recompile=recompile,
                input_shape=[crop_size, crop_size, 3],
                training=True,
                margin=margin,
                logist_scale=logist_scale)
            if use_mirrored:
                # Optimizer must be created within scope!
                opt = get_optimizer(optimizer_name=optimizer,
                                    lr_schedule=lr_schedule,
                                    weight_decay=weight_decay)
            assert model is not None, load_error
            if compiled is False:
                print(
                    '[INFO] Recompiling model using passed optimizer and loss arguments'
                )
                model.compile(optimizer=opt,
                              loss=loss_fn,
                              run_eagerly=run_eagerly)
        return model

    if range_test is True:
        range_finder = RangeTestCallback(start_lr=init_lr,
                                         end_lr=max_lr,
                                         n_imgs=n_imgs,
                                         batch_size=batch_size)
        # The callback drives the learning rate; start from a small value.
        opt = get_optimizer(optimizer_name=optimizer,
                            lr_schedule=1e-5,
                            weight_decay=weight_decay)
        model = _build_model(
            opt, 1e-5,
            '[ERROR] There was a problem while loading the pre-trained weights')

        callback_list = [range_finder, tensorboard_callback, stop_on_nan]
        model.fit(train_dataset,
                  epochs=num_epochs,
                  callbacks=callback_list)

        print(
            '\n[INFO] Training complete. Range test results can be found at "./range_test_result.png"'
        )
        return

    lr_schedule = get_learning_rate_schedule(
        schedule_name=lr_schedule_name,
        learning_rate=init_lr,
        max_lr=max_lr,
        image_count=n_imgs,
        batch_size=batch_size,
        step_size=step_size)
    opt = get_optimizer(optimizer_name=optimizer,
                        lr_schedule=lr_schedule,
                        weight_decay=weight_decay)

    # makedirs: works for nested paths and does not fail if the directory
    # appears between the check and the creation.
    os.makedirs(checkpoint_path, exist_ok=True)

    model_saver = tf.keras.callbacks.ModelCheckpoint(
        filepath=os.path.join(checkpoint_path, 'full_model'),
        save_weights_only=False,
        monitor='val_loss',
        mode='min',
        save_best_only=False,
        save_freq=checkpoint_interval)
    weights_saver = tf.keras.callbacks.ModelCheckpoint(
        filepath=os.path.join(checkpoint_path, 'model_weights'),
        save_weights_only=True,
        monitor='val_loss',
        mode='min',
        save_best_only=False,
        save_freq=checkpoint_interval)

    model = _build_model(
        opt, lr_schedule,
        '[ERROR] There was a problem in loading the pre-trained weights')

    callback_list = [
        model_saver, weights_saver, tensorboard_callback, stop_on_nan
    ]
    model.fit(
        train_dataset,
        epochs=num_epochs,
        callbacks=callback_list,
        # 0 means "let Keras infer the number of steps"
        steps_per_epoch=None if steps_per_epoch == 0 else steps_per_epoch)

    os.makedirs('./results', exist_ok=True)

    model_name = './results/model-' + datetime.now().strftime(
        "%Y%m%d-%H%M%S")
    model.save(model_name)
    print(
        '\n[INFO] Training complete. Saved model can be found in "./results"'
    )
    return
def get_quartznet(input_dim,
                  output_dim,
                  is_mixed_precision=False,
                  tflite_version=False,
                  num_b_block_repeats=3,
                  b_block_kernel_sizes=(33, 39, 51, 63, 75),
                  b_block_num_channels=(256, 256, 512, 512, 512),
                  num_small_blocks=5,
                  random_state=1) -> keras.Model:
    """
    Build and return the (uncompiled) QuartzNet Keras model.

    Parameters
    ----------
    input_dim: input feature length
    output_dim: output feature length
    is_mixed_precision: if mixed precision model is needed. The global Keras
        policy is switched to 'mixed_float16' while the graph is built and is
        set to 'float32' afterwards, also when construction fails.
    tflite_version: if export to tflite is needed (the time dimension of the
        input is then fixed to 5 instead of being left unknown)
    num_b_block_repeats: 1 is 5x5 quartznet, 2 is 10x5, 3 is 15x5
    b_block_kernel_sizes: iterable, kernel size of each b block
    b_block_num_channels: iterable, number of channels of each b block
    num_small_blocks: forwarded unchanged to every `B_block`
        (presumably the number of sub-blocks per B block -- confirm against
        `B_block`)
    random_state: seed given to both `np.random.seed` and `tf.random.set_seed`

    Returns
    -------
    keras.Model named 'QuartzNet' mapping the input 'X' to the output of the
    final 1x1 convolution.
    """
    assert len(b_block_kernel_sizes) == len(b_block_num_channels), \
        "Number of kernel sizes not equal the number of channel sizes"

    max_seq_length = None
    if tflite_version:
        max_seq_length = 5

    if is_mixed_precision:
        policy = mixed_precision.Policy('mixed_float16')
        mixed_precision.set_policy(policy)

    # The policy is process-wide state: restore it in `finally` so that a
    # failure while building the graph does not leave later models in
    # mixed precision by accident.
    try:
        np.random.seed(random_state)
        tf.random.set_seed(random_state)

        with tf.device('/cpu:0'):
            input_tensor = layers.Input([max_seq_length, input_dim], name='X')

            x = layers.Masking()(input_tensor)
            # First encoder layer
            x = layers.SeparableConv1D(256, 33, padding='same', strides=2,
                                       name='conv_1', use_bias=False)(x)
            x = layers.BatchNormalization(name='BN-1', momentum=0.9)(x)
            x = layers.ReLU(name='RELU-1')(x)

            # Each (kernel size, channels) pair is repeated
            # `num_b_block_repeats` times; every block gets a unique name.
            block_idx = 1
            for kernel_size, n_channels in zip(b_block_kernel_sizes,
                                               b_block_num_channels):
                for _ in range(num_b_block_repeats):
                    x = B_block(kernel_size, n_channels, num_small_blocks,
                                f'B-{block_idx}')(x)
                    block_idx += 1

            # First final layer
            x = layers.SeparableConv1D(512, 87, padding='same', name='conv_2',
                                       dilation_rate=2, use_bias=False)(x)
            x = layers.BatchNormalization(name='BN-2', momentum=0.9)(x)
            x = layers.ReLU(name='RELU-2')(x)

            # Second final layer
            x = layers.Conv1D(1024, 1, padding='same', name='conv_3',
                              use_bias=False)(x)
            x = layers.BatchNormalization(name='BN-3', momentum=0.9)(x)
            x = layers.ReLU(name='RELU-3')(x)

            # Third final layer
            x = layers.Conv1D(output_dim, 1, padding='same', dilation_rate=1,
                              name='conv_4')(x)

            model = keras.Model([input_tensor], x, name='QuartzNet')
    finally:
        if is_mixed_precision:
            # revert policy
            policy = mixed_precision.Policy('float32')
            mixed_precision.set_policy(policy)

    return model
def init_network(self):
    """
    Build the network selected by `self.args`, run one forward pass so that
    it is fully initialized, and return the elapsed wall-clock seconds.

    Raises a plain `Exception` when the requested network / data format /
    dimension combination is not available.
    """
    t_begin = time.time()

    # Precision settings that need a global Keras policy, with the policy
    # name each of them maps to.
    precision_policies = (
        ("mixed", 'mixed_float16'),
        ("bfloat16", 'mixed_bfloat16'),
    )
    for precision_name, policy_name in precision_policies:
        if self.args.run.precision == precision_name:
            from tensorflow.keras.mixed_precision import experimental as mixed_precision
            self.policy = mixed_precision.Policy(policy_name)
            mixed_precision.set_policy(self.policy)

    # Output shape the network has to produce:
    output_shape = self.larcv_fetcher.output_shape('primary')

    # Pick the network implementation by name (forward pass only):
    network_name = self.args.network.name
    if network_name == "resnet":
        if self.args.network.data_format == 'sparse':
            raise Exception("No sparse networks available in tensorflow")
        if not self.args.dataset.dimension == 2:
            raise Exception("No Resnet3d Implemented!")
        from src.networks.tensorflow import resnet
        self._net = resnet.ResNet(output_shape, self.args)
    elif network_name == "pointnet":
        if self.args.dataset.dimension == 2:
            from src.networks.tensorflow import pointnet
            self._net = pointnet.PointNet(output_shape, self.args)
        else:
            from src.networks.tensorflow import pointnet3d
            self._net = pointnet3d.PointNet(output_shape, self.args)
    elif network_name == "dgcnn":
        from src.networks.tensorflow import dgcnn
        self._net = dgcnn.DGCNN(output_shape, self.args)
    else:
        raise Exception(
            f"Couldn't identify network {self.args.network.name}")

    self._net.trainable = True

    # A forward pass on one batch is needed to properly initialize the net.
    first_batch = self.cast_input(
        self.larcv_fetcher.fetch_next_batch("primary", force_pop=False))
    self.forward_pass(first_batch['image'], training=False)

    return time.time() - t_begin
def main(args, yaml_path, config):
    """
    Entry point dispatching on `args.action`.

    * "data":  run `Dataset.process` and return.
    * "train": build the model, fit it and save it under the output dir.
    * "eval":  build the model, evaluate on the validation files and freeze.
    * "time":  time the forward pass on synthetic random inputs and write
               `synthetic_timing.json` to the output dir.

    Parameters
    ----------
    args: parsed command line; `args.action` and `args.weights` are read.
    yaml_path: path of the configuration file; its base name prefixes the
        generated model name.
    config: nested configuration mapping ('tensorflow', 'dataset', 'setup'
        and, for timing, 'timing' sections are read).

    Raises
    ------
    Exception: when no TFRecord file matches `dataset_def.processed_path`.
    SystemExit: when a fresh run's output directory already exists.
    """
    tf.config.run_functions_eagerly(config['tensorflow']['eager'])

    from tfmodel.data import Dataset
    cds = config["dataset"]

    dataset_def = Dataset(
        num_input_features=int(cds["num_input_features"]),
        num_output_features=int(cds["num_output_features"]),
        padded_num_elem_size=int(cds["padded_num_elem_size"]),
        raw_path=cds["raw_path"],
        processed_path=cds["processed_path"],
        validation_file_path=cds["validation_file_path"],
        schema=cds["schema"])

    if args.action == "data":
        dataset_def.process(config["dataset"]["num_files_per_chunk"])
        return

    global_batch_size = config['setup']['batch_size']

    model_name = os.path.splitext(os.path.basename(yaml_path))[0] + "-" + str(
        uuid.uuid4())[:8]
    print("model_name=", model_name)

    tfr_pattern = dataset_def.processed_path
    tfr_files = sorted(glob.glob(tfr_pattern))
    if len(tfr_files) == 0:
        # Report the pattern that was actually searched.
        raise Exception("Could not find any files in {}".format(tfr_pattern))

    dataset = tf.data.TFRecordDataset(tfr_files).map(
        dataset_def.parse_tfr_element,
        num_parallel_calls=tf.data.experimental.AUTOTUNE)

    # The dataset has no known length; count the events by iterating once.
    num_events = 0
    for _ in dataset:
        num_events += 1
    print("dataset loaded, len={}".format(num_events))

    n_train = config['setup']['num_events_train']
    n_test = config['setup']['num_events_test']
    n_epochs = config['setup']['num_epochs']
    weight_func = weight_functions[config['setup']['sample_weights']]
    assert (n_train + n_test <= num_events)

    # Padded shapes of the (X, y, w) triplet.
    ps = (tf.TensorShape(
        [dataset_def.padded_num_elem_size, dataset_def.num_input_features]),
          tf.TensorShape([
              dataset_def.padded_num_elem_size,
              dataset_def.num_output_features
          ]), tf.TensorShape([
              dataset_def.padded_num_elem_size,
          ]))

    ds_train = dataset.take(n_train).map(weight_func).padded_batch(
        global_batch_size, padded_shapes=ps)
    ds_test = dataset.skip(n_train).take(n_test).map(weight_func).padded_batch(
        global_batch_size, padded_shapes=ps)

    # small test dataset used in the callback for making monitoring plots
    X_test = ds_test.take(100).map(lambda x, y, w: x)
    y_test = np.concatenate(
        list(
            ds_test.take(100).map(
                lambda x, y, w: tf.concat(y, axis=-1)).as_numpy_iterator()))

    ds_train_r = ds_train.repeat(n_epochs)
    ds_test_r = ds_test.repeat(n_epochs)

    # Command-line weights override the configured ones.
    weights = config['setup']['weights']
    if args.weights:
        weights = args.weights
    if weights is None:
        outdir = 'experiments/{}'.format(model_name)
        if os.path.isdir(outdir):
            print("Output directory exists: {}".format(outdir),
                  file=sys.stderr)
            sys.exit(1)
    else:
        outdir = os.path.dirname(weights)

    try:
        num_gpus = len(os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(","))
        print("num_gpus=", num_gpus)
        if num_gpus > 1:
            strategy = tf.distribute.MirroredStrategy()
            global_batch_size = num_gpus * global_batch_size
        else:
            strategy = tf.distribute.OneDeviceStrategy("gpu:0")
    except Exception as e:
        print("fallback to CPU", e)
        strategy = tf.distribute.OneDeviceStrategy("cpu")
        num_gpus = 0

    # The learning rate is scaled with the global batch size.
    actual_lr = global_batch_size * float(config['setup']['lr'])

    Xs = []
    ygens = []
    ycands = []

    # for faster loading
    if args.action == "train":
        dataset_def.val_filelist = dataset_def.val_filelist[:1]

    for fi in dataset_def.val_filelist:
        X, ygen, ycand = dataset_def.prepare_data(fi)
        Xs.append(np.concatenate(X))
        ygens.append(np.concatenate(ygen))
        ycands.append(np.concatenate(ycand))

    X_val = np.concatenate(Xs)
    ygen_val = np.concatenate(ygens)
    ycand_val = np.concatenate(ycands)

    with strategy.scope():
        if config['setup']['dtype'] == 'float16':
            model_dtype = tf.dtypes.float16
            from tensorflow.keras.mixed_precision import experimental as mixed_precision
            policy = mixed_precision.Policy('mixed_float16')
            mixed_precision.set_policy(policy)
            opt = mixed_precision.LossScaleOptimizer(
                tf.keras.optimizers.Adam(learning_rate=actual_lr),
                loss_scale="dynamic")
        else:
            model_dtype = tf.dtypes.float32
            opt = tf.keras.optimizers.Adam(learning_rate=actual_lr)

        if args.action == "train" or args.action == "eval":
            model = make_model(config, model_dtype)

            loss_cls = PFNetLoss(
                num_input_classes=config["dataset"]["num_input_classes"],
                num_output_classes=config["dataset"]["num_output_classes"],
                momentum_loss_coefs=config["dataset"]["momentum_loss_coefs"])

            loss_fn = loss_cls.my_loss_full
            if config["setup"]["trainable"] == "cls":
                model.set_trainable_classification()
                loss_fn = loss_cls.my_loss_cls
            elif config["setup"]["trainable"] == "reg":
                model.set_trainable_regression()
                loss_fn = loss_cls.my_loss_reg

            # we use the "temporal" mode to have per-particle weights
            model.compile(loss=loss_fn,
                          optimizer=opt,
                          sample_weight_mode='temporal')

            # Evaluate model once to build the layers
            model(tf.cast(X_val[:1], model_dtype))
            model.summary()

            initial_epoch = 0
            if weights:
                model.load_weights(weights)
                # The epoch is the second '-'-separated field of the
                # weight file's base name.
                initial_epoch = int(weights.split("/")[-1].split("-")[1])

            if args.action == "train":
                file_writer_cm = tf.summary.create_file_writer(outdir +
                                                               '/val_extra')
                callbacks = prepare_callbacks(
                    X_test, y_test, loss_cls, model, outdir,
                    config["dataset"]["num_input_classes"],
                    config["dataset"]["num_output_classes"], file_writer_cm)

                model.fit(ds_train_r,
                          validation_data=ds_test_r,
                          epochs=initial_epoch + n_epochs,
                          callbacks=callbacks,
                          steps_per_epoch=n_train // global_batch_size,
                          validation_steps=n_test // global_batch_size,
                          initial_epoch=initial_epoch)

                model.save(outdir + "/model_full", save_format="tf")

            if args.action == "eval":
                eval_model(X_val, ygen_val, ycand_val, model, config, outdir,
                           global_batch_size)
                freeze_model(model, config, outdir)

        if args.action == "time":
            synthetic_timing_data = []
            for iteration in range(config["timing"]["num_iter"]):
                numev = config["timing"]["num_ev"]
                for evsize in [
                        128 * 10, 128 * 20, 128 * 30, 128 * 40, 128 * 50,
                        128 * 60, 128 * 70, 128 * 80, 128 * 90, 128 * 100
                ]:
                    for batch_size in [1, 2, 3, 4]:
                        x = np.random.randn(
                            batch_size, evsize,
                            config["dataset"]["num_input_features"]).astype(
                                np.float32)

                        # A fresh model per configuration; the first call
                        # builds it before the timed loop starts.
                        model = make_model(config, model_dtype)
                        model(x)

                        if weights:
                            model.load_weights(weights)

                        t0 = time.time()
                        for _ in range(numev // batch_size):
                            model(x)
                        t1 = time.time()
                        dt = t1 - t0

                        # milliseconds per event
                        time_per_event = 1000.0 * (dt / numev)
                        synthetic_timing_data.append([{
                            "iteration": iteration,
                            "batch_size": batch_size,
                            "event_size": evsize,
                            "time_per_event": time_per_event
                        }])
                        print(
                            "Synthetic random data: batch_size={} event_size={}, time={:.2f} ms/ev"
                            .format(batch_size, evsize, time_per_event))

            with open("{}/synthetic_timing.json".format(outdir), "w") as fi:
                json.dump(synthetic_timing_data, fi)
def train_ncnet(
        model,
        run_id=None,
        multicoil=False,
        three_d=False,
        acq_type='radial',
        scale_factor=1e6,
        dcomp=False,
        contrast=None,
        cuda_visible_devices='0123',
        n_samples=None,
        n_epochs=200,
        use_mixed_precision=False,
        loss='mae',
        original_run_id=None,
        checkpoint_epoch=0,
        save_state=False,
        lr=1e-4,
        **acq_kwargs,
    ):
    """Train `model` on the selected dataset and return the run id used.

    The dataset (multicoil, 3D or single coil) fixes the data paths, the
    dataset builder and the image size. With `checkpoint_epoch` different
    from 0 the weights and the pickled optimizer state of `original_run_id`
    are reloaded and training resumes from that epoch; with `save_state` the
    optimizer state is pickled next to the checkpoints after training.
    Extra keyword arguments are forwarded to the dataset builder.
    """
    # data selection: paths, dataset builder, image size, steps per epoch
    if multicoil:
        train_path = f'{FASTMRI_DATA_DIR}multicoil_train/'
        val_path = f'{FASTMRI_DATA_DIR}multicoil_val/'
        dataset = multicoil_dataset
        image_size = IM_SIZE
        n_volumes_train = n_volumes_train_fastmri
    elif three_d:
        train_path = f'{OASIS_DATA_DIR}/train/'
        val_path = f'{OASIS_DATA_DIR}/val/'
        dataset = three_d_dataset
        image_size = VOLUME_SIZE
        n_volumes_train = n_volumes_train_oasis
    else:
        train_path = f'{FASTMRI_DATA_DIR}singlecoil_train/singlecoil_train/'
        val_path = f'{FASTMRI_DATA_DIR}singlecoil_val/'
        dataset = singlecoil_dataset
        image_size = IM_SIZE
        n_volumes_train = n_volumes_train_fastmri

    # one device id per character of `cuda_visible_devices`
    os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(cuda_visible_devices)

    # trying mixed precision
    policy_type = 'mixed_float16' if use_mixed_precision else 'float32'
    mixed_precision.set_policy(mixed_precision.Policy(policy_type))

    # generators
    if three_d:
        add_kwargs = {}
    else:
        add_kwargs = {
            'contrast': contrast,
            'rand': True,
            'inner_slices': None,
        }
    add_kwargs.update(acq_kwargs)
    shared_kwargs = dict(
        acq_type=acq_type,
        compute_dcomp=dcomp,
        scale_factor=scale_factor,
    )
    train_set = dataset(
        train_path,
        image_size,
        n_samples=n_samples,
        **shared_kwargs,
        **add_kwargs,
    )
    val_set = dataset(val_path, image_size, **shared_kwargs, **add_kwargs)

    # run identification
    info_parts = [f'{acq_type}']
    if contrast is not None:
        info_parts.append(f'{contrast}')
    if n_samples is not None:
        info_parts.append(f'{n_samples}')
    if loss != 'mae':
        info_parts.append(f'{loss}')
    if dcomp:
        info_parts.append('dcomp')
    additional_info = '_'.join(info_parts)

    resuming = not checkpoint_epoch == 0
    if resuming:
        run_id = original_run_id
    else:
        run_id = f'{run_id}_{additional_info}_{int(time.time())}'
    final_epoch = checkpoint_epoch + n_epochs

    # callbacks
    chkpt_path = f'{CHECKPOINTS_DIR}checkpoints/{run_id}' + '-{epoch:02d}.hdf5'
    tboard_cback = TensorBoard(
        profile_batch=0,
        log_dir=op.join(f'{LOGS_DIR}logs', run_id),
        histogram_freq=0,
        write_graph=False,
        write_images=False,
    )
    tqdm_cback = TQDMProgressBar()
    n_steps = n_volumes_train
    chkpt_cback = ModelCheckpointWorkAround(
        chkpt_path,
        save_freq=int(n_epochs * n_steps),
        save_weights_only=True,
    )

    default_model_compile(model, lr=lr, loss=loss)
    # first run of the model to avoid the saving error
    # ValueError: as_list() is not defined on an unknown TensorShape.
    # it can also allow loading of weights
    first_batch = next(iter(train_set))
    model(first_batch[0])

    if resuming:
        model.load_weights(
            f'{CHECKPOINTS_DIR}checkpoints/{original_run_id}-{checkpoint_epoch:02d}.hdf5'
        )
        # apply zero gradients once before setting the optimizer weights
        # (presumably so the optimizer has created its slots -- confirm)
        trainable = model.trainable_weights
        model.optimizer.apply_gradients(
            zip([tf.zeros_like(w) for w in trainable], trainable))
        with open(
                f'{CHECKPOINTS_DIR}checkpoints/{original_run_id}-optimizer.pkl',
                'rb') as f:
            model.optimizer.set_weights(pickle.load(f))

    print(run_id)
    model.fit(
        train_set,
        steps_per_epoch=n_steps,
        initial_epoch=checkpoint_epoch,
        epochs=final_epoch,
        validation_data=val_set,
        validation_steps=2,
        verbose=0,
        callbacks=[tboard_cback, chkpt_cback, tqdm_cback],
    )

    if save_state:
        optimizer_state = K.batch_get_value(model.optimizer.weights)
        with open(f'{CHECKPOINTS_DIR}checkpoints/{run_id}-optimizer.pkl',
                  'wb') as f:
            pickle.dump(optimizer_state, f)
    return run_id
def main(argv):
    """
    Train YOLACT: build the model and data loaders, then run the training
    loop with periodic logging, checkpointing and validation.

    `argv` is accepted for the application runner and is not used here; all
    settings come from `FLAGS` and from `get_params(FLAGS.name)`.
    """
    # set fixed random seed, load config files
    tf.random.set_seed(RANDOM_SEED)

    # using mix precision or not
    if MIXPRECISION:
        policy = mixed_precision.Policy('mixed_float16')
        mixed_precision.set_policy(policy)

    # get params for model
    train_iter, input_size, num_cls, lrs_schedule_params, loss_params, parser_params, model_params = get_params(
        FLAGS.name)

    # -----------------------------------------------------------------
    # set up Grappler for graph optimization
    # Ref: https://www.tensorflow.org/guide/graph_optimization
    @contextlib.contextmanager
    def options(opts):
        # Temporarily apply `opts`; the previous options are always restored.
        old_opts = tf.config.optimizer.get_experimental_options()
        tf.config.optimizer.set_experimental_options(opts)
        try:
            yield
        finally:
            tf.config.optimizer.set_experimental_options(old_opts)

    # -----------------------------------------------------------------
    # Creating the instance of the model specified.
    logging.info("Creating the model instance of YOLACT")
    model = Yolact(**model_params)

    # add weight decay
    # NOTE: `layer` is bound as a default argument on purpose. A plain
    # `lambda: ...layer...` looks the variable up when the loss is evaluated,
    # i.e. after the loop has finished, so every regulariser would be
    # computed on the last layer only.
    for layer in model.layers:
        if isinstance(layer, tf.keras.layers.Conv2D) or isinstance(
                layer, tf.keras.layers.Dense):
            layer.add_loss(lambda layer=layer: tf.keras.regularizers.l2(
                FLAGS.weight_decay)(layer.kernel))
        if hasattr(layer, 'bias_regularizer') and layer.use_bias:
            layer.add_loss(lambda layer=layer: tf.keras.regularizers.l2(
                FLAGS.weight_decay)(layer.bias))

    # -----------------------------------------------------------------
    # Creating dataloaders for training and validation
    logging.info("Creating the dataloader from: %s..." % FLAGS.tfrecord_dir)
    dataset = ObjectDetectionDataset(dataset_name=FLAGS.name,
                                     tfrecord_dir=os.path.join(
                                         FLAGS.tfrecord_dir, FLAGS.name),
                                     anchor_instance=model.anchor_instance,
                                     **parser_params)
    train_dataset = dataset.get_dataloader(subset='train',
                                           batch_size=FLAGS.batch_size)
    valid_dataset = dataset.get_dataloader(subset='val', batch_size=1)

    # count number of valid data for progress bar
    # Todo any better way to do it?
    num_val = 0
    for _ in valid_dataset:
        num_val += 1

    # -----------------------------------------------------------------
    # Choose the optimizer, loss function, metrics and learning rate schedule
    lr_schedule = learning_rate_schedule.Yolact_LearningRateSchedule(
        **lrs_schedule_params)
    logging.info("Initiate the Optimizer and Loss function...")
    optimizer = tf.keras.optimizers.SGD(learning_rate=lr_schedule,
                                        momentum=FLAGS.momentum)
    criterion = loss_yolact.YOLACTLoss(**loss_params)
    train_loss = tf.keras.metrics.Mean('train_loss', dtype=tf.float32)
    loc = tf.keras.metrics.Mean('loc_loss', dtype=tf.float32)
    conf = tf.keras.metrics.Mean('conf_loss', dtype=tf.float32)
    mask = tf.keras.metrics.Mean('mask_loss', dtype=tf.float32)
    seg = tf.keras.metrics.Mean('seg_loss', dtype=tf.float32)

    # -----------------------------------------------------------------
    # Setup the TensorBoard for better visualization
    # Ref: https://www.tensorflow.org/tensorboard/get_started
    logging.info("Setup the TensorBoard...")
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    train_log_dir = './logs/gradient_tape/' + current_time + '/train'
    test_log_dir = './logs/gradient_tape/' + current_time + '/test'
    train_summary_writer = tf.summary.create_file_writer(train_log_dir)
    test_summary_writer = tf.summary.create_file_writer(test_log_dir)

    # -----------------------------------------------------------------
    # Start the Training and Validation Process
    logging.info("Start the training process...")

    # setup checkpoints manager
    checkpoint = tf.train.Checkpoint(step=tf.Variable(1),
                                     optimizer=optimizer,
                                     model=model)
    manager = tf.train.CheckpointManager(checkpoint,
                                         directory="./checkpoints",
                                         max_to_keep=5)
    # restore from latest checkpoint and iteration
    checkpoint.restore(manager.latest_checkpoint)
    if manager.latest_checkpoint:
        logging.info("Restored from {}".format(manager.latest_checkpoint))
    else:
        logging.info("Initializing from scratch.")

    best_masks_map = 0.
    iterations = checkpoint.step.numpy()

    for image, labels in train_dataset:
        # check iteration and change the learning rate
        if iterations > train_iter:
            break

        checkpoint.step.assign_add(1)
        iterations += 1

        with options({
                'constant_folding': True,
                'layout_optimize': True,
                'loop_optimization': True,
                'arithmetic_optimization': True,
                'remapping': True
        }):
            loc_loss, conf_loss, mask_loss, seg_loss = train_step(
                model, criterion, train_loss, optimizer, image, labels,
                num_cls)

        loc.update_state(loc_loss)
        conf.update_state(conf_loss)
        mask.update_state(mask_loss)
        seg.update_state(seg_loss)

        with train_summary_writer.as_default():
            tf.summary.scalar('Total loss',
                              train_loss.result(),
                              step=iterations)
            tf.summary.scalar('Loc loss', loc.result(), step=iterations)
            tf.summary.scalar('Conf loss', conf.result(), step=iterations)
            tf.summary.scalar('Mask loss', mask.result(), step=iterations)
            tf.summary.scalar('Seg loss', seg.result(), step=iterations)

        if iterations and iterations % FLAGS.print_interval == 0:
            tf.print(
                "Iteration {}, LR: {}, Total Loss: {}, B: {}, C: {}, M: {}, S:{} "
                .format(iterations,
                        optimizer._decayed_lr(var_dtype=tf.float32),
                        train_loss.result(), loc.result(), conf.result(),
                        mask.result(), seg.result()))

        if iterations and iterations % FLAGS.save_interval == 0:
            # save checkpoint
            save_path = manager.save()
            logging.info("Saved checkpoint for step {}: {}".format(
                int(checkpoint.step), save_path))

            # validation and print mAP table
            all_map = evaluate(model, valid_dataset, num_val, num_cls)
            box_map, mask_map = all_map['box']['all'], all_map['mask']['all']
            tf.print(f"box mAP:{box_map}, mask mAP:{mask_map}")

            with test_summary_writer.as_default():
                tf.summary.scalar('Box mAP', box_map, step=iterations)
                tf.summary.scalar('Mask mAP', mask_map, step=iterations)

            # Saving the weights whenever the mask mAP improves:
            if mask_map > best_masks_map:
                best_masks_map = mask_map
                model.save_weights(
                    f'{FLAGS.weights}/weights_{FLAGS.name}_{str(best_masks_map)}.h5'
                )

            # reset the metrics
            train_loss.reset_states()
            loc.reset_states()
            conf.reset_states()
            mask.reset_states()
            seg.reset_states()
def main():
    """
    Command line entry point: load the configuration, create replay buffers,
    environments and the agent, then alternate evaluation and training until
    `config.steps` is reached. The agent variables are saved to
    `<logdir>/variables.pkl` after every training phase.
    """
    configs = yaml.safe_load(
        (pathlib.Path(sys.argv[0]).parent / 'configs.yaml').read_text())
    parsed, remaining = common.Flags(configs=['defaults']).parse(
        known_only=True)
    config = common.Config(configs['defaults'])
    for name in parsed.configs:
        config = config.update(configs[name])
    config = common.Flags(config).parse(remaining)

    logdir = pathlib.Path(config.logdir).expanduser()
    logdir.mkdir(parents=True, exist_ok=True)
    config.save(logdir / 'config.yaml')
    print(config, '\n')
    print('Logdir', logdir)

    import tensorflow as tf
    tf.config.experimental_run_functions_eagerly(not config.jit)
    message = 'No GPU found. To actually train on CPU remove this assert.'
    assert tf.config.experimental.list_physical_devices('GPU'), message
    for gpu in tf.config.experimental.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(gpu, True)
    assert config.precision in (16, 32), config.precision
    if config.precision == 16:
        from tensorflow.keras.mixed_precision import experimental as prec
        prec.set_policy(prec.Policy('mixed_float16'))

    train_replay = common.Replay(logdir / 'train_episodes', **config.replay)
    eval_replay = common.Replay(
        logdir / 'eval_episodes',
        **dict(capacity=config.replay.capacity // 10,
               minlen=config.dataset.length,
               maxlen=config.dataset.length))
    step = common.Counter(train_replay.stats['total_steps'])
    outputs = [
        common.TerminalOutput(),
        common.JSONLOutput(logdir),
        common.TensorBoardOutput(logdir),
    ]
    logger = common.Logger(step, outputs, multiplier=config.action_repeat)
    metrics = collections.defaultdict(list)

    should_train = common.Every(config.train_every)
    should_log = common.Every(config.log_every)
    should_video_train = common.Every(config.eval_every)
    should_video_eval = common.Every(config.eval_every)
    should_expl = common.Until(config.expl_until // config.action_repeat)

    def make_env(mode):
        # Build one wrapped environment for `config.task` ("<suite>_<task>").
        suite, task = config.task.split('_', 1)
        if suite == 'dmc':
            env = common.DMC(task, config.action_repeat, config.render_size,
                             config.dmc_camera)
            env = common.NormalizeAction(env)
        elif suite == 'atari':
            env = common.Atari(task, config.action_repeat, config.render_size,
                               config.atari_grayscale)
            env = common.OneHotAction(env)
        elif suite == 'crafter':
            assert config.action_repeat == 1
            outdir = logdir / 'crafter' if mode == 'train' else None
            # `index` raises ValueError for any task other than the two
            # listed; evaluation always uses the reward.
            reward = bool(['noreward', 'reward'].index(task)) or mode == 'eval'
            env = common.Crafter(outdir, reward)
            env = common.OneHotAction(env)
        else:
            raise NotImplementedError(suite)
        env = common.TimeLimit(env, config.time_limit)
        return env

    def per_episode(ep, mode):
        # Log return, length and the configured statistics of one episode.
        length = len(ep['reward']) - 1
        score = float(ep['reward'].astype(np.float64).sum())
        print(
            f'{mode.title()} episode has {length} steps and return {score:.1f}.'
        )
        logger.scalar(f'{mode}_return', score)
        logger.scalar(f'{mode}_length', length)
        for key, value in ep.items():
            if re.match(config.log_keys_sum, key):
                logger.scalar(f'sum_{mode}_{key}', value.sum())
            if re.match(config.log_keys_mean, key):
                logger.scalar(f'mean_{mode}_{key}', value.mean())
            if re.match(config.log_keys_max, key):
                logger.scalar(f'max_{mode}_{key}', value.max(0).mean())
        should = {'train': should_video_train, 'eval': should_video_eval}[mode]
        if should(step):
            for key in config.log_keys_video:
                logger.video(f'{mode}_policy_{key}', ep[key])
        replay = dict(train=train_replay, eval=eval_replay)[mode]
        logger.add(replay.stats, prefix=mode)
        logger.write()

    print('Create envs.')
    num_eval_envs = min(config.envs, config.eval_eps)
    if config.envs_parallel == 'none':
        train_envs = [make_env('train') for _ in range(config.envs)]
        eval_envs = [make_env('eval') for _ in range(num_eval_envs)]
    else:
        make_async_env = lambda mode: common.Async(
            functools.partial(make_env, mode), config.envs_parallel)
        train_envs = [make_async_env('train') for _ in range(config.envs)]
        # The count must come from `num_eval_envs`: `eval_envs` is the list
        # being created here and is not bound yet.
        eval_envs = [make_async_env('eval') for _ in range(num_eval_envs)]
    act_space = train_envs[0].act_space
    obs_space = train_envs[0].obs_space
    train_driver = common.Driver(train_envs)
    train_driver.on_episode(lambda ep: per_episode(ep, mode='train'))
    train_driver.on_step(lambda tran, worker: step.increment())
    train_driver.on_step(train_replay.add_step)
    train_driver.on_reset(train_replay.add_step)
    eval_driver = common.Driver(eval_envs)
    eval_driver.on_episode(lambda ep: per_episode(ep, mode='eval'))
    eval_driver.on_episode(eval_replay.add_episode)

    # Collect random experience until the replay holds `config.prefill` steps.
    prefill = max(0, config.prefill - train_replay.stats['total_steps'])
    if prefill:
        print(f'Prefill dataset ({prefill} steps).')
        random_agent = common.RandomAgent(act_space)
        train_driver(random_agent, steps=prefill, episodes=1)
        eval_driver(random_agent, episodes=1)
        train_driver.reset()
        eval_driver.reset()

    print('Create agent.')
    train_dataset = iter(train_replay.dataset(**config.dataset))
    report_dataset = iter(train_replay.dataset(**config.dataset))
    eval_dataset = iter(eval_replay.dataset(**config.dataset))
    agnt = agent.Agent(config, obs_space, act_space, step)
    train_agent = common.CarryOverState(agnt.train)
    # One training call before loading (presumably to create the variables
    # that `load` restores -- confirm against `agent.Agent`).
    train_agent(next(train_dataset))
    if (logdir / 'variables.pkl').exists():
        agnt.load(logdir / 'variables.pkl')
    else:
        print('Pretrain agent.')
        for _ in range(config.pretrain):
            train_agent(next(train_dataset))
    train_policy = lambda *args: agnt.policy(
        *args, mode='explore' if should_expl(step) else 'train')
    eval_policy = lambda *args: agnt.policy(*args, mode='eval')

    def train_step(tran, worker):
        # Driver step callback: train and log on their configured schedules.
        if should_train(step):
            for _ in range(config.train_steps):
                mets = train_agent(next(train_dataset))
                for key, value in mets.items():
                    metrics[key].append(value)
        if should_log(step):
            for name, values in metrics.items():
                logger.scalar(name, np.array(values, np.float64).mean())
                metrics[name].clear()
            logger.add(agnt.report(next(report_dataset)), prefix='train')
            logger.write(fps=True)

    train_driver.on_step(train_step)

    while step < config.steps:
        logger.write()
        print('Start evaluation.')
        logger.add(agnt.report(next(eval_dataset)), prefix='eval')
        eval_driver(eval_policy, episodes=config.eval_eps)
        print('Start training.')
        train_driver(train_policy, steps=config.eval_every)
        agnt.save(logdir / 'variables.pkl')
    for env in train_envs + eval_envs:
        # Best-effort shutdown: a failing close must not prevent closing the
        # remaining environments.
        try:
            env.close()
        except Exception:
            pass
def _build_lr_schedule(cfg, lr, spe):
    # Return the schedule named by cfg.TRAIN.LR_SCHEDULE, or `lr` itself.
    if cfg.TRAIN.LR_SCHEDULE == 'warmup_cosine_decay':
        return WarmupCosineDecay(
            initial_learning_rate=lr,
            decay_steps=cfg.TRAIN.EPOCHS * spe,
            warmup_steps=cfg.TRAIN.WARMUP_EPOCHS * spe,
            warmup_factor=cfg.TRAIN.WARMUP_FACTOR)
    if cfg.TRAIN.LR_SCHEDULE == 'warmup_piecewise':
        return WarmupPiecewise(
            boundaries=[x * spe for x in cfg.TRAIN.DECAY_EPOCHS],
            values=[lr, lr / 10, lr / 10**2],
            warmup_steps=spe * cfg.TRAIN.WARMUP_EPOCHS,
            warmup_factor=cfg.TRAIN.WARMUP_FACTOR)
    return lr


def train(strategy, cfg):
    """
    Train the model described by `cfg` under the distribution `strategy`.

    Parameters
    ----------
    strategy: `tf.distribute` strategy used for variable creation, dataset
        distribution and the per-replica steps.
    cfg: configuration node. It is also written to: `TRAIN.WARMUP_FACTOR`
        (when `TRAIN.SCALE_LR` is set), `DATASET.OUTPUT_SHAPE` and
        `DATASET.SIGMA`.

    Returns
    -------
    (model, meta_data): the trained model -- holding the weights with the
    lowest validation loss when `cfg.VAL.SAVE_BEST` is set and validation
    ran -- and a dict with the loss histories, the config, parameter count,
    flops and total training time.

    Raises
    ------
    ValueError: if `cfg.MODEL.TYPE` is not a known model type.
    """
    os.makedirs(cfg.MODEL.SAVE_DIR, exist_ok=True)

    if cfg.DATASET.BFLOAT16:
        policy = mixed_precision.Policy('mixed_bfloat16')
        mixed_precision.set_policy(policy)

    tf.random.set_seed(cfg.TRAIN.SEED)
    np.random.seed(cfg.TRAIN.SEED)

    meta_data = {'train_loss': [], 'val_loss': [], 'config': cfg}

    # steps per epoch (training, rounded up) and per validation pass
    spe = int(np.ceil(cfg.DATASET.TRAIN_SAMPLES / cfg.TRAIN.BATCH_SIZE))
    spv = cfg.DATASET.VAL_SAMPLES // cfg.VAL.BATCH_SIZE

    if cfg.TRAIN.SCALE_LR:
        # scale the base rate linearly with the batch size (reference: 32)
        lr = cfg.TRAIN.BASE_LR * cfg.TRAIN.BATCH_SIZE / 32
        cfg.TRAIN.WARMUP_FACTOR = 32 / cfg.TRAIN.BATCH_SIZE
    else:
        lr = cfg.TRAIN.BASE_LR

    lr_schedule = _build_lr_schedule(cfg, lr, spe)

    with strategy.scope():
        optimizer = tf.keras.optimizers.Adam(lr_schedule)
        if cfg.MODEL.TYPE == 'simple_baseline':
            model = SimpleBaseline(cfg)
        elif cfg.MODEL.TYPE == 'hrnet':
            model = HRNet(cfg)
        elif cfg.MODEL.TYPE == 'evopose':
            model = EvoPose(cfg)
        else:
            # Fail here with a clear message instead of a NameError on the
            # first use of `model` below.
            raise ValueError(
                'Unknown model type: {}'.format(cfg.MODEL.TYPE))
        train_loss = tf.keras.metrics.Mean()
        val_loss = tf.keras.metrics.Mean()

    cfg.DATASET.OUTPUT_SHAPE = model.output_shape[1:]
    cfg.DATASET.SIGMA = 2 * cfg.DATASET.OUTPUT_SHAPE[0] / 64

    meta_data['parameters'] = model.count_params()
    meta_data['flops'] = get_flops(model)

    train_ds = load_tfds(cfg, 'train')
    train_ds = strategy.experimental_distribute_dataset(train_ds)
    train_iterator = iter(train_ds)

    if cfg.TRAIN.VAL:
        val_ds = load_tfds(cfg, 'val')
        val_ds = strategy.experimental_distribute_dataset(val_ds)

    @tf.function
    def train_step(train_iterator):
        def step_fn(inputs):
            imgs, targets, valid = inputs
            with tf.GradientTape() as tape:
                loss, l2_loss = mse_loss(model, imgs, targets, valid,
                                         training=True)
                # divide by the replica count before differentiating
                scaled_loss = (loss +
                               l2_loss) / strategy.num_replicas_in_sync
            grads = tape.gradient(scaled_loss, model.trainable_variables)
            optimizer.apply_gradients(
                list(zip(grads, model.trainable_variables)))
            train_loss.update_state(loss)

        strategy.run(step_fn, args=(next(train_iterator), ))

    @tf.function
    def val_step(dist_inputs):
        def step_fn(inputs):
            imgs, targets, valid = inputs
            loss, _ = mse_loss(model, imgs, targets, valid, training=False)
            val_loss.update_state(loss)

        strategy.run(step_fn, args=(dist_inputs, ))

    print('Training {} ({:.2f}M / {:.2f}G) on {} for {} epochs'.format(
        cfg.MODEL.NAME, meta_data['parameters'] / 1e6,
        meta_data['flops'] / 2 / 1e9, cfg.TRAIN.ACCELERATOR,
        cfg.TRAIN.EPOCHS))

    # Best weights seen so far; stays None until a validation pass has run.
    best_weights = None
    best_loss = None

    epoch = 1
    ts = time()
    while epoch <= cfg.TRAIN.EPOCHS:
        te = time()
        for i in range(spe):
            train_step(train_iterator)
            if cfg.TRAIN.DISP:
                print('epoch {} ({}/{}) | loss: {:.1f}'.format(
                    epoch, i + 1, spe, train_loss.result().numpy()))
        meta_data['train_loss'].append(train_loss.result().numpy())

        if cfg.TRAIN.VAL:
            for i, batch in enumerate(val_ds):
                val_step(batch)
                if cfg.TRAIN.DISP:
                    print('val {} ({}/{}) | loss: {:.1f}'.format(
                        epoch, i + 1, spv, val_loss.result().numpy()))
            epoch_val_loss = val_loss.result().numpy()
            meta_data['val_loss'].append(epoch_val_loss)

            if cfg.VAL.SAVE_BEST:
                # The first validated epoch is always cached; later epochs
                # only when they improve on it.
                if best_loss is None or epoch_val_loss < best_loss:
                    best_weights = model.get_weights()
                    best_loss = epoch_val_loss
                    if cfg.TRAIN.DISP:
                        print('Cached model weights')

        train_loss.reset_states()
        val_loss.reset_states()

        if cfg.TRAIN.SAVE_EPOCHS and epoch % cfg.TRAIN.SAVE_EPOCHS == 0:
            ckpt_path = osp.join(
                cfg.MODEL.SAVE_DIR,
                '{}_ckpt{:03d}.h5'.format(cfg.MODEL.NAME, epoch))
            model.save(ckpt_path, save_format='h5')
            print('Saved checkpoint to', ckpt_path)

        if cfg.TRAIN.SAVE_META:
            meta_path = osp.join(cfg.MODEL.SAVE_DIR,
                                 '{}_meta.pkl'.format(cfg.MODEL.NAME))
            # `with` closes the file even if pickling fails
            with open(meta_path, 'wb') as meta_file:
                pickle.dump(meta_data, meta_file)

        if epoch > 1 and cfg.TRAIN.DISP:
            est_time = (cfg.TRAIN.EPOCHS - epoch) * (time() - te) / 3600
            print('Estimated time remaining: {:.2f} hrs'.format(est_time))

        epoch += 1

    meta_data['training_time'] = time() - ts

    # Nothing to restore if no validation pass ever cached weights.
    if cfg.VAL.SAVE_BEST and best_weights is not None:
        model.set_weights(best_weights)

    return model, meta_data
def init_network(self):
    """Build the compute graph and report how long construction took.

    Creates the input placeholders, instantiates the network selected by
    ``self.args.conv_mode``, runs the forward pass, and stores the
    prediction, loss, accuracy and metric tensors on ``self``.

    Returns:
        float: seconds elapsed between entering and leaving this method.
    """
    t_begin = time.time()

    # Set a global Keras policy when mixed precision is requested.
    if self.args.precision == "mixed":
        from tensorflow.keras.mixed_precision import experimental as mixed_precision
        self.policy = mixed_precision.Policy('mixed_float16')
        mixed_precision.set_policy(self.policy)

    # Start from the fetcher's dimensions and overwrite the first entry
    # with the value of self.local_batch_size().
    input_shape = self.larcv_fetcher.batch_dims(1)
    input_shape[0] = self.local_batch_size()

    # Placeholders for the input objects (created in image/label/io order).
    image_ph = tf.compat.v1.placeholder(
        floating_point_format, input_shape, name="input_image")
    label_ph = tf.compat.v1.placeholder(
        integer_format, input_shape, name="input_label")
    io_time_ph = tf.compat.v1.placeholder(
        floating_point_format, (), name="io_fetch_time")
    self._input = {
        'image': image_ph,
        'label': label_ph,
        'io_time': io_time_ph,
    }

    # Network object, forward pass only.
    if self.args.conv_mode == '2D':
        network_cls = uresnet2D.UResNet
    else:
        network_cls = uresnet3D.UResNet3D
    self._net = network_cls(self.args)
    self._net.trainable = True

    self._logits = self._net(self._input['image'],
                             training=self.args.training)

    # The loss works on channels-last logits: permute when channels are first.
    if self._channels_dim == 1:
        to_channels_last = tf.keras.layers.Permute((2, 3, 1))
        self._loss_logits = [
            to_channels_last(plane_logits) for plane_logits in self._logits
        ]
    else:
        self._loss_logits = self._logits

    # Non-trainable copies used to accumulate gradients over iterations.
    with tf.compat.v1.variable_scope("gradient_accumulation"):
        self._accum_vars = [
            tf.Variable(var.initialized_value(), trainable=False)
            for var in tf.compat.v1.trainable_variables()
        ]

    if self.args.mode in ("train", "inference"):
        self._output = {}

        # One argmax prediction per entry of the logits list.
        with tf.compat.v1.variable_scope("prediction"):
            self._output['prediction'] = [
                tf.argmax(plane_logits, axis=self._channels_dim)
                for plane_logits in self._logits
            ]

        with tf.compat.v1.variable_scope("cross_entropy"):
            self.loss_calculator = LossCalculator.LossCalculator(
                self.args.loss_balance_scheme, self._channels_dim)

            # Labels and images are split three ways along the channel
            # axis and the resulting size-1 axis is squeezed away.
            self._input['split_labels'] = [
                tf.squeeze(piece, axis=self._channels_dim)
                for piece in tf.split(
                    self._input['label'], 3, self._channels_dim)
            ]
            self._input['split_images'] = [
                tf.squeeze(piece, axis=self._channels_dim)
                for piece in tf.split(
                    self._input['image'], 3, self._channels_dim)
            ]

            self._loss = self.loss_calculator(
                labels=self._input['split_labels'],
                logits=self._loss_logits)

    if self.args.mode == "inference":
        self._output['softmax'] = [
            tf.nn.softmax(plane_logits, axis=self._channels_dim)
            for plane_logits in self._logits
        ]

    self._accuracy_calc = AccuracyCalculator.AccuracyCalculator()
    self._accuracy = self._accuracy_calc(
        prediction=self._output['prediction'],
        labels=self._input['split_labels'])

    # (metric label, key in the accuracy dict), in the order they are logged.
    metric_table = (
        ("Total_Accuracy", "total_accuracy"),
        ("Non_Bkg_Accuracy", "non_bkg_accuracy"),
        ("Neutrino_IoU", "neut_iou"),
        ("Cosmic_IoU", "cosmic_iou"),
        ("mIoU", "miou"),
    )

    # Per-plane metrics, added by hand.
    self._metrics = {}
    for plane in range(3):
        for label, key in metric_table:
            self._metrics[f"plane{plane}/{label}"] = self._accuracy[key][plane]

    # Averages of each metric.
    with tf.compat.v1.variable_scope("accuracy"):
        for label, key in metric_table:
            self._metrics[f"Average/{label}"] = tf.reduce_mean(
                self._accuracy[key])

    self._metrics['loss'] = self._loss

    self._log_keys = ["loss", "Average/Non_Bkg_Accuracy", "Average/mIoU"]

    return time.time() - t_begin
def train_ncnet(
        model,
        run_id=None,
        multicoil=False,
        three_d=False,
        acq_type='radial',
        scale_factor=1e6,
        dcomp=False,
        contrast=None,
        cuda_visible_devices='0123',
        n_samples=None,
        n_epochs=200,
        use_mixed_precision=False,
        loss='mae',
        original_run_id=None,
        **acq_kwargs,
    ):
    """Compile and fit ``model`` on a non-Cartesian dataset; return the run id.

    Arguments:
        model: the Keras model to compile and fit.
        run_id (str or None): prefix of the generated run id.
        multicoil (bool): use the multicoil dataset and paths. Takes
            precedence over ``three_d``.
        three_d (bool): use the 3D dataset and paths.
        acq_type (str): forwarded to the dataset and added to the run id.
        scale_factor (float): forwarded to the dataset.
        dcomp (bool): forwarded to the dataset as ``compute_dcomp``; adds
            ``_dcomp`` to the run id.
        contrast (str or None): forwarded to the dataset when not 3D and
            added to the run id when not None.
        cuda_visible_devices (str): its characters are joined with commas
            and written to the ``CUDA_VISIBLE_DEVICES`` environment variable.
        n_samples (int or None): forwarded to the training dataset only.
        n_epochs (int): number of epochs, also the checkpoint period.
        use_mixed_precision (bool): select the ``mixed_float16`` policy
            instead of ``float32``.
        loss: forwarded to ``default_model_compile``.
        original_run_id (str or None): when given, weights of that run are
            loaded before fitting, the learning rate is 1e-7 instead of
            1e-4 and the number of steps per epoch is halved.
        **acq_kwargs: extra keyword arguments forwarded to both datasets.

    Returns:
        str: the full run id used for checkpoints and logs.
    """
    # A single priority order (multicoil, then 3D, then single coil) selects
    # the data paths, the dataset factory, the image size and the epoch size.
    n_volumes = n_volumes_train_fastmri
    if multicoil:
        train_path = f'{FASTMRI_DATA_DIR}multicoil_train/'
        val_path = f'{FASTMRI_DATA_DIR}multicoil_val/'
        dataset_fun, image_size = multicoil_dataset, IM_SIZE
    elif three_d:
        train_path = f'{OASIS_DATA_DIR}/train/'
        val_path = f'{OASIS_DATA_DIR}/val/'
        n_volumes = n_volumes_train_oasis
        dataset_fun, image_size = three_d_dataset, VOLUME_SIZE
    else:
        train_path = f'{FASTMRI_DATA_DIR}singlecoil_train/singlecoil_train/'
        val_path = f'{FASTMRI_DATA_DIR}singlecoil_val/'
        dataset_fun, image_size = singlecoil_dataset, IM_SIZE

    os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(cuda_visible_devices)

    # The policy is always set, so a float32 run resets an earlier mixed one.
    policy_name = 'mixed_float16' if use_mixed_precision else 'float32'
    mixed_precision.set_policy(mixed_precision.Policy(policy_name))

    # Dataset keyword arguments: the 2D datasets get three extra entries.
    if three_d:
        extra_kwargs = {}
    else:
        extra_kwargs = {
            'contrast': contrast,
            'rand': True,
            'inner_slices': None,
        }
    extra_kwargs.update(**acq_kwargs)

    shared_kwargs = dict(
        acq_type=acq_type,
        compute_dcomp=dcomp,
        scale_factor=scale_factor,
    )
    train_set = dataset_fun(
        train_path,
        image_size,
        n_samples=n_samples,
        **shared_kwargs,
        **extra_kwargs
    )
    val_set = dataset_fun(
        val_path,
        image_size,
        **shared_kwargs,
        **extra_kwargs
    )

    # Run id: prefix, underscore-separated description, timestamp.
    info_parts = [f'{acq_type}']
    if contrast is not None:
        info_parts.append(f'{contrast}')
    if n_samples is not None:
        info_parts.append(f'{n_samples}')
    if loss != 'mae':
        info_parts.append(f'{loss}')
    if dcomp:
        info_parts.append('dcomp')
    additional_info = '_'.join(info_parts)
    run_id = f'{run_id}_{additional_info}_{int(time.time())}'

    chkpt_path = f'{CHECKPOINTS_DIR}checkpoints/{run_id}' + '-{epoch:02d}.hdf5'
    chkpt_cback = ModelCheckpoint(
        chkpt_path, period=n_epochs, save_weights_only=True)
    tboard_cback = TensorBoard(
        profile_batch=0,
        log_dir=op.join(f'{LOGS_DIR}logs', run_id),
        histogram_freq=0,
        write_graph=False,
        write_images=False,
    )
    tqdm_cback = TQDMProgressBar()

    fine_tuning = original_run_id is not None
    if fine_tuning:
        lr, n_steps = 1e-7, n_volumes // 2
    else:
        lr, n_steps = 1e-4, n_volumes

    default_model_compile(model, lr=lr, loss=loss)
    print(run_id)
    if fine_tuning:
        # FASTMRI_DEBUG switches to the 1-epoch checkpoint of the earlier run.
        n_epochs_original = 1 if os.environ.get('FASTMRI_DEBUG') else 250
        model.load_weights(f'{CHECKPOINTS_DIR}checkpoints/{original_run_id}-{n_epochs_original:02d}.hdf5')

    model.fit(
        train_set,
        steps_per_epoch=n_steps,
        epochs=n_epochs,
        validation_data=val_set,
        validation_steps=2,
        verbose=0,
        callbacks=[tboard_cback, chkpt_cback, tqdm_cback],
    )
    return run_id
def __init__(
        self,
        seq_len,
        vocab_size,
        embedding_dim=20,
        hidden_dim=256,
        n_hidden=2,
        dff=512,
        n_epochs=1,
        batch_size=1000,
        inference_batch_size=1500,
        cache_dir='.',
        model_name='bilstm',
        seed=None,
        verbose=False
):
    """Build and compile the two-input LSTM model under a mirrored strategy.

    Two inputs of length ``seq_len - 1`` share one embedding and one stack
    of ``n_hidden`` LSTM layers; their final states are concatenated and
    projected to a softmax over ``vocab_size + 1`` classes. The global
    Keras policy is set to ``mixed_float16``; the softmax is kept in
    float32. All constructor arguments are stored on the instance with a
    trailing underscore.
    """
    super().__init__(seed=seed,)

    mixed_precision.set_policy(mixed_precision.Policy('mixed_float16'))

    strategy = tf.distribute.MirroredStrategy()
    print('Number of devices: {}'.format(strategy.num_replicas_in_sync))

    with strategy.scope():
        n_tokens = seq_len - 1
        input_pre = Input(shape=(n_tokens,))
        input_post = Input(shape=(n_tokens,))

        # One embedding layer shared by both inputs.
        shared_embedding = Embedding(
            vocab_size + 1, embedding_dim, input_length=n_tokens)
        branch_pre = shared_embedding(input_pre)
        branch_post = shared_embedding(input_post)

        # Every recurrent layer is shared too; all but the last one
        # return full sequences.
        for _ in range(n_hidden - 1):
            recurrent = LSTM(hidden_dim, return_sequences=True)
            branch_pre = recurrent(branch_pre)
            branch_post = recurrent(branch_post)
        last_recurrent = LSTM(hidden_dim)
        branch_pre = last_recurrent(branch_pre)
        branch_post = last_recurrent(branch_post)

        merged = concatenate([branch_pre, branch_post], name='embed_layer')
        logits = Dense(vocab_size + 1)(merged)
        # float32 softmax output under the mixed policy.
        probabilities = Activation('softmax', dtype='float32')(logits)

        self.model_ = Model(
            inputs=[input_pre, input_post], outputs=probabilities)

        adam = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999,
                    amsgrad=False)
        self.model_.compile(
            loss='sparse_categorical_crossentropy',
            optimizer=adam,
            metrics=['accuracy']
        )

    # Keep the hyper-parameters on the instance.
    self.seq_len_ = seq_len
    self.vocab_size_ = vocab_size
    self.embedding_dim_ = embedding_dim
    self.hidden_dim_ = hidden_dim
    self.n_hidden_ = n_hidden
    self.dff_ = dff
    self.n_epochs_ = n_epochs
    self.batch_size_ = batch_size
    self.inference_batch_size_ = inference_batch_size
    self.cache_dir_ = cache_dir
    self.model_name_ = model_name
    self.verbose_ = verbose
def run_training(
        encoder_f,
        box_f,
        lr_f,
        name,
        epochs,
        batch_size,
        steps_per_epoch,
        img,
        data,
        val_data,
        img_size,
        mixed_float=True,
        notebook=True,
):
    """Build a ``BoxModel``, fit it on a generated dataset and print the time.

    Arguments:
        encoder_f: forwarded to ``BoxModel`` as its second argument.
        box_f: forwarded to ``BoxModel`` as its third argument.
        lr_f: schedule function given to ``LearningRateScheduler``.
        name (str): run name; TensorBoard logs go to ``logs/fit/<name>`` and
            weights to ``savedmodels/<name>/<epoch>``.
        epochs (int): number of epochs passed to ``fit``.
        batch_size (int): forwarded to ``create_train_dataset``.
        steps_per_epoch (int): number of steps per epoch passed to ``fit``.
        img: forwarded to ``create_train_dataset``.
        data: forwarded to ``create_train_dataset``.
        val_data: ``(X_val, Y_val)`` tuple used as validation data.
        img_size: indexable with at least two entries; entries 0 and 1 are
            the first two dimensions of the 3-channel image input.
        mixed_float (bool): set the global ``mixed_float16`` policy. The
            policy is not reverted afterwards.
        notebook (bool): use the notebook tqdm callback instead of the
            plain one.
    """
    if mixed_float:
        policy = mixed_precision.Policy('mixed_float16')
        mixed_precision.set_policy(policy)

    st = time.time()

    inputs = {
        'image': keras.Input((img_size[0], img_size[1], 3)),
        # A shape must be a tuple: ``(2)`` is just the integer 2.
        'pos': keras.Input((2,)),
    }
    mymodel = BoxModel(inputs, encoder_f, box_f)
    loss = keras.losses.MeanSquaredError()
    mymodel.compile(
        optimizer='adam',
        loss=loss,
    )

    logdir = 'logs/fit/' + name
    tensorboard_callback = tf.keras.callbacks.TensorBoard(
        log_dir=logdir,
        histogram_freq=1,
        profile_batch='3,5',
        update_freq='epoch',
    )
    lr_callback = keras.callbacks.LearningRateScheduler(lr_f, verbose=1)

    savedir = 'savedmodels/' + name + '/{epoch}'
    save_callback = keras.callbacks.ModelCheckpoint(
        savedir,
        save_weights_only=True,
        verbose=1,
    )

    if notebook:
        tqdm_callback = TqdmNotebookCallback(
            metrics=['loss', 'binary_accuracy'],
            leave_inner=False,
        )
    else:
        tqdm_callback = TqdmCallback()

    train_ds = create_train_dataset(img, data, img_size, batch_size)
    mymodel.fit(
        x=train_ds,
        epochs=epochs,
        steps_per_epoch=steps_per_epoch,
        validation_data=val_data,
        callbacks=[
            tensorboard_callback,
            lr_callback,
            save_callback,
            tqdm_callback,
        ],
        # Progress is reported by the tqdm callback instead.
        verbose=0,
    )

    print('Took {} seconds'.format(time.time() - st))
def __init__(
        self,
        seq_len,
        vocab_size,
        embedding_dim=20,
        hidden_dim=256,
        n_hidden=2,
        n_heads=8,
        dff=2048,
        dropout_rate=0.1,
        n_epochs=1,
        batch_size=1000,
        cache_dir='.',
        model_name='attention',
        seed=None,
        verbose=False
):
    """Build and compile the encoder-based model under a mirrored strategy.

    An input of length ``seq_len - 1`` goes through an ``Encoder`` from
    ``transformer_layers``; its output is flattened to
    ``hidden_dim * (seq_len - 1)`` features and projected to a softmax over
    ``vocab_size + 1`` classes. The global Keras policy is set to
    ``mixed_float16``; the softmax is kept in float32. All constructor
    arguments are stored on the instance with a trailing underscore.
    """
    super().__init__(seed=seed,)

    mixed_precision.set_policy(mixed_precision.Policy('mixed_float16'))

    strategy = tf.distribute.MirroredStrategy()
    print('Number of devices: {}'.format(strategy.num_replicas_in_sync))

    with strategy.scope():
        n_tokens = seq_len - 1
        input_ = Input(shape=(n_tokens,))

        # Imported here, at the point of use, as in the rest of this method.
        from transformer_layers import Encoder
        self.encoder_ = Encoder(
            n_hidden,
            hidden_dim,
            n_heads,
            dff,
            vocab_size + 1,
            seq_len,
            dropout_rate,
            name='embed_layer',
        )
        encoded = self.encoder_(input_, None)

        flattened = Reshape((hidden_dim * n_tokens,))(encoded)
        logits = Dense(vocab_size + 1)(flattened)
        # float32 softmax output under the mixed policy.
        probabilities = Activation('softmax', dtype='float32')(logits)

        self.model_ = Model(inputs=input_, outputs=probabilities)

        adam = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999,
                    amsgrad=False)
        self.model_.compile(
            loss='sparse_categorical_crossentropy',
            optimizer=adam,
            metrics=['accuracy']
        )

    # Keep the hyper-parameters on the instance.
    self.seq_len_ = seq_len
    self.vocab_size_ = vocab_size
    self.embedding_dim_ = embedding_dim
    self.hidden_dim_ = hidden_dim
    self.n_hidden_ = n_hidden
    self.n_heads_ = n_heads
    self.dff_ = dff
    self.dropout_rate_ = dropout_rate
    self.n_epochs_ = n_epochs
    self.batch_size_ = batch_size
    self.cache_dir_ = cache_dir
    self.model_name_ = model_name
    self.verbose_ = verbose
def train_xpdnet_block(
        model_fun,
        model_kwargs,
        model_size=None,
        multicoil=True,
        brain=False,
        af=4,
        contrast=None,
        n_samples=None,
        batch_size=None,
        n_epochs=200,
        n_iter=10,
        res=True,
        n_scales=0,
        n_primal=5,
        use_mixed_precision=False,
        refine_smaps=False,
        refine_big=False,
        loss='mae',
        lr=1e-4,
        fixed_masks=False,
        equidistant_fake=False,
        multi_gpu=False,
        mask_type=None,
        primal_only=True,
        n_dual=1,
        n_dual_filters=16,
        multiscale_kspace_learning=False,
        block_size=10,
        block_overlap=0,
        epochs_per_block_step=None,
    ):
    r"""Train an XPDNet network on the fastMRI dataset, block by block.

    The unrolled iterations are trained in sliding windows ("blocks") of
    ``block_size`` iterations; consecutive windows start
    ``block_size - block_overlap`` iterations apart. For each window the
    model is recompiled with ``default_model_compile`` (using ``lr`` and
    ``loss``) and fitted for ``epochs_per_block_step`` epochs, until
    ``n_epochs`` epochs have been spent in total.
    The validation is performed every 5 epochs on 5 steps.
    A scale factor of 1e6 is applied to the data.

    Arguments:
        model_fun (function): the function initializing the image correction
            network of the XPDNet. Its ``__name__`` is used in the run id.
        model_kwargs (dict): the set of arguments used to initialize the
            image correction network.
        model_size (str or None): a string describing the size of the
            network used. This is used in the run id. Defaults to None.
        multicoil (bool): whether the input data is multicoil. Defaults to
            True.
        brain (bool): whether to consider brain data instead of knee. Only
            changes the data paths when ``multicoil`` is set. Defaults to
            False.
        af (int): the acceleration factor for the retrospective
            undersampling of the data. Defaults to 4.
        contrast (str or None): the contrast used for this specific
            training. If None, all contrasts are considered. Defaults to
            None.
        n_samples (int or None): the number of samples to consider for this
            training. If None, all samples are considered. Defaults to None.
        batch_size (int or None): forwarded to the training dataset. When
            not None, the number of steps per epoch is divided by it.
            Defaults to None.
        n_epochs (int): the total number of epochs for this training.
            Defaults to 200.
        n_iter (int): the number of iterations for the XPDNet. Defaults to
            10.
        res (bool): whether the XPDNet image correction networks should be
            residual. Defaults to True.
        n_scales (int): the number of scales used in the image correction
            network. Defaults to 0.
        n_primal (int): the size of the buffer in the image space. Defaults
            to 5.
        use_mixed_precision (bool): whether to use the ``mixed_float16``
            policy instead of ``float32``. Defaults to False.
        refine_smaps (bool): whether you want to refine the sensitivity maps
            with a neural network. Defaults to False.
        refine_big (bool): forwarded to the XPDNet; adds ``b`` to the run
            id when ``refine_smaps`` is set. Defaults to False.
        loss (tf.keras.losses.Loss or str): the loss function used for the
            training. It should be understandable by the tf.keras loss API,
            or be 'compound_mssim', in which case the compound L1 MSSIM
            loss inspired by [P2020] is used. Defaults to 'mae'.
        lr (float): the learning rate given to ``default_model_compile``.
            Defaults to 1e-4.
        fixed_masks (bool): whether fixed masks should be used for the
            retrospective undersampling. Defaults to False.
        equidistant_fake (bool): whether to use fake equidistant masks from
            fastMRI. Only read for multicoil brain data when ``mask_type``
            is None. Defaults to False.
        multi_gpu (bool): whether to use multiple GPUs for the XPDNet
            training. Defaults to False.
        mask_type (str or None): the mask type for multicoil data. If None,
            brain data uses 'equidistant' (or 'equidistant_fake') and knee
            data uses 'random'. Defaults to None.
        primal_only (bool): forwarded to the XPDNet. Defaults to True.
        n_dual (int): forwarded to the XPDNet. Defaults to 1.
        n_dual_filters (int): forwarded to the XPDNet. Defaults to 16.
        multiscale_kspace_learning (bool): forwarded to the XPDNet. Defaults
            to False.
        block_size (int): the number of consecutive iterations trained
            together. Defaults to 10.
        block_overlap (int): the number of iterations shared by two
            consecutive blocks; must be smaller than ``block_size``.
            Defaults to 0.
        epochs_per_block_step (int or None): the number of epochs spent on
            each block. If None, ``n_epochs`` is spread evenly (rounded up)
            over the block steps. Defaults to None.

    Returns:
        - str: the run id of the trained network.
    """
    n_volumes = brain_n_volumes_train if brain else n_volumes_train
    # paths
    if multicoil:
        if brain:
            train_path = f'{FASTMRI_DATA_DIR}brain_multicoil_train/'
            val_path = f'{FASTMRI_DATA_DIR}brain_multicoil_val/'
        else:
            train_path = f'{FASTMRI_DATA_DIR}multicoil_train/'
            val_path = f'{FASTMRI_DATA_DIR}multicoil_val/'
    else:
        train_path = f'{FASTMRI_DATA_DIR}singlecoil_train/singlecoil_train/'
        val_path = f'{FASTMRI_DATA_DIR}singlecoil_val/'

    af = int(af)

    # The policy is always set, so a float32 run resets an earlier mixed one.
    policy_type = 'mixed_float16' if use_mixed_precision else 'float32'
    policy = mixed_precision.Policy(policy_type)
    mixed_precision.set_policy(policy)

    # generators
    if multicoil:
        dataset = multicoil_dataset
        if mask_type is None:
            if brain:
                mask_type = 'equidistant_fake' if equidistant_fake else 'equidistant'
            else:
                mask_type = 'random'
        kwargs = {
            'parallel': False,
            'output_shape_spec': brain,
            'mask_type': mask_type,
        }
    else:
        dataset = singlecoil_dataset
        kwargs = {}
    train_set = dataset(
        train_path,
        AF=af,
        contrast=contrast,
        inner_slices=None,
        rand=True,
        scale_factor=1e6,
        n_samples=n_samples,
        fixed_masks=fixed_masks,
        batch_size=batch_size,
        target_image_size=IM_SIZE,
        **kwargs
    )
    val_set = dataset(
        val_path,
        AF=af,
        contrast=contrast,
        inner_slices=None,
        rand=True,
        scale_factor=1e6,
        **kwargs
    )

    run_params = {
        'n_primal': n_primal,
        'multicoil': multicoil,
        'n_scales': n_scales,
        'n_iter': n_iter,
        'refine_smaps': refine_smaps,
        'res': res,
        'output_shape_spec': brain,
        'multi_gpu': multi_gpu,
        'refine_big': refine_big,
        'primal_only': primal_only,
        'n_dual': n_dual,
        'n_dual_filters': n_dual_filters,
        'multiscale_kspace_learning': multiscale_kspace_learning,
    }

    # run id
    if multicoil:
        xpdnet_type = 'xpdnet_sense_'
        if brain:
            xpdnet_type += 'brain_'
    else:
        xpdnet_type = 'xpdnet_singlecoil_'
    additional_info = f'af{af}'
    if contrast is not None:
        additional_info += f'_{contrast}'
    if n_samples is not None:
        additional_info += f'_{n_samples}'
    if n_iter != 10:
        additional_info += f'_i{n_iter}'
    if loss != 'mae':
        additional_info += f'_{loss}'
    if refine_smaps:
        additional_info += '_rf_sm'
        if refine_big:
            additional_info += 'b'
    if fixed_masks:
        additional_info += '_fixed_masks'
    if block_overlap != 0:
        additional_info += f'_blkov{block_overlap}'
    submodel_info = model_fun.__name__
    if model_size is not None:
        submodel_info += model_size
    run_id = f'{xpdnet_type}_{additional_info}_bbb_{submodel_info}_{int(time.time())}'
    chkpt_path = f'{CHECKPOINTS_DIR}checkpoints/{run_id}' + '-{epoch:02d}'
    chkpt_path += '.hdf5'

    log_dir = op.join(f'{LOGS_DIR}logs', run_id)
    tboard_cback = TensorBoard(
        profile_batch=0,
        log_dir=log_dir,
        histogram_freq=0,
        write_graph=False,
        write_images=False,
    )
    tqdm_cback = TQDMProgressBar()

    model = XPDNet(model_fun, model_kwargs, **run_params)
    n_steps = n_volumes
    if batch_size is not None:
        n_steps //= batch_size

    # save_freq counts batches, so this saves once after all n_epochs epochs.
    chkpt_cback = ModelCheckpointWorkAround(
        chkpt_path,
        save_freq=int(n_epochs * n_steps),
        save_weights_only=True,
    )
    print(run_id)

    stride = block_size - block_overlap
    assert stride > 0
    n_block_steps = int(math.ceil((n_iter - block_size) / stride) + 1)

    ## epochs handling
    if epochs_per_block_step is None:
        # ``min(None, n_epochs)`` raises a TypeError, so a missing value is
        # replaced by an even split; rounding up keeps every epoch in use.
        epochs_per_block_step = max(
            1, math.ceil(n_epochs / max(1, n_block_steps)))
    remaining_epochs = n_epochs
    start_epoch = 0
    final_epoch = min(epochs_per_block_step, remaining_epochs)

    for i_step in range(n_block_steps):
        first_block_to_train = i_step * stride
        blocks = list(
            range(first_block_to_train, first_block_to_train + block_size))
        model.blocks_to_train = blocks
        # Recompile for every window, after blocks_to_train has changed.
        default_model_compile(model, lr=lr, loss=loss)
        model.fit(
            train_set,
            steps_per_epoch=n_steps,
            initial_epoch=start_epoch,
            epochs=final_epoch,
            validation_data=val_set,
            validation_steps=5,
            validation_freq=5,
            verbose=0,
            callbacks=[tboard_cback, chkpt_cback, tqdm_cback],
        )
        remaining_epochs -= final_epoch - start_epoch
        if remaining_epochs <= 0:
            break
        start_epoch = final_epoch
        final_epoch += min(epochs_per_block_step, remaining_epochs)
    return run_id
def training(meta_train_iterations, meta_batch_size, k_support, k_query,
             num_inner_updates, inner_update_lr, learn_inner_update_lr,
             meta_lr, job_dir):
    """Run the distributed MAML meta-training loop with periodic validation.

    Every iteration performs one distributed train step and logs the meta
    loss, mean IoU and accuracy. Every 150 iterations one validation batch
    is evaluated; the weights are saved whenever the validation mean IoU
    improves.

    Arguments:
        meta_train_iterations (int): number of elements taken from the
            training dataset, i.e. the number of iterations.
        meta_batch_size (int): forwarded to both data generators.
        k_support (int): forwarded to the training generator and to MAML.
        k_query (int): forwarded to the training generator and to MAML.
        num_inner_updates (int): forwarded to MAML.
        inner_update_lr (float): forwarded to MAML.
        learn_inner_update_lr (bool): forwarded to MAML.
        meta_lr (float): learning rate of the Adam meta-optimizer.
        job_dir (str): root directory for summaries and saved weights; also
            forwarded to the data generators.
    """
    policy = mixed_precision.Policy('mixed_float16')
    mixed_precision.set_policy(policy)
    mirrored_strategy = tf.distribute.MirroredStrategy()

    data_generator = DataGenerator(k_support, k_query, meta_batch_size,
                                   'meta_train', job_dir)
    data_generator_valid = DataGenerator(2, 32, meta_batch_size, 'meta_val',
                                         job_dir)

    itr = 0
    meta_loss_log_dir_2 = os.path.join(
        job_dir, 'summary_6_intrain_2_intest/meta_loss')
    meta_metric_log_dir_2 = os.path.join(
        job_dir, 'summary_6_intrain_2_intest/meta_metric')
    meta_loss_summary_writer = tf.summary.create_file_writer(
        meta_loss_log_dir_2)
    meta_metric_writer = tf.summary.create_file_writer(meta_metric_log_dir_2)

    with mirrored_strategy.scope():
        maml = MAML(k_support,
                    k_query,
                    num_inner_updates=num_inner_updates,
                    inner_update_lr=inner_update_lr,
                    learn_inner_update_lr=learn_inner_update_lr)
        optim = tf.keras.optimizers.Adam(learning_rate=meta_lr)
        optim = mixed_precision.LossScaleOptimizer(optim,
                                                   loss_scale='dynamic')

    storage_client = storage.Client()
    # One metric object serves training and validation; it is reset after
    # every read.
    acc_metric = tf.keras.metrics.CategoricalAccuracy('train_accuracy')

    dataset = data_generator.create_dataset().take(meta_train_iterations)
    dist_dataset = mirrored_strategy.experimental_distribute_dataset(dataset)
    num_replicas = mirrored_strategy.num_replicas_in_sync
    logging.info('mirrored_strategy.num_replicas_in_sync: %d', num_replicas)

    best_eval_miou = 0
    model_exp_str = (
        f'mbs_{meta_batch_size}'
        f'.k_support_{k_support}'
        f'.k_query_{k_query}'
        f'.inner_steps_{num_inner_updates}'
        f'.inner_lr_{inner_update_lr}'
        f'.learn_inner_update_lr_{learn_inner_update_lr}'
        f'.meta_lr_{meta_lr}'
    )
    model_file = os.path.join(job_dir, 'weights_inner_update_4',
                              model_exp_str)

    for (input_support_replica, input_query_replica, label_support_replica,
         label_query_replica, _ids_replica,
         _query_indices_replica) in dist_dataset:
        itr = itr + 1
        inp = (input_support_replica, input_query_replica,
               label_support_replica, label_query_replica)
        output_query_replicas, meta_loss = distributed_train_step(
            inp, maml, num_replicas, optim, mirrored_strategy)
        logging.info('Iteration %d: meta loss: %.5f ', itr, meta_loss)

        # With several replicas the per-replica values are concatenated
        # along the first axis.
        if num_replicas > 1:
            output_query = tf.concat(output_query_replicas.values, 0)
            label_query = tf.concat(label_query_replica.values, 0)
        else:
            output_query = output_query_replicas
            label_query = label_query_replica
        label_query = tf.cast(label_query, dtype=tf.float32)
        pred = tf.one_hot(tf.argmax(output_query, axis=-1),
                          depth=data_generator.LABEL_SIZE)

        # Index 0 of the last axis is left out of both metrics.
        with tf.device('/CPU:0'):
            mIoU = compute_mIoU(label_query[:, :, :, :, 1:],
                                pred[:, :, :, :, 1:],
                                data_generator.LABEL_SIZE - 1)
        logging.info('Iteration %d: mean IoU: %.5f ', itr, mIoU)
        with tf.device('/CPU:0'):
            acc_metric.update_state(
                label_query[:, :, :, :, 1:],
                tf.math.softmax(output_query)[:, :, :, :, 1:])
        acc = acc_metric.result()
        logging.info('Iteration %d: accuracy: %.5f ', itr, acc)
        acc_metric.reset_states()

        with meta_loss_summary_writer.as_default():
            tf.summary.scalar('train-meta-loss', meta_loss, step=itr)
        with meta_metric_writer.as_default():
            tf.summary.scalar('train mean IoU', mIoU, step=itr)
            tf.summary.scalar('train accuracy', acc, step=itr)

        # evaluation session
        if itr % 150 != 0:
            continue

        # only one batch, size of meta_batch_size
        valid_set = data_generator_valid.sample_batch()
        dist_valid_dataset_single_elem = (
            mirrored_strategy.experimental_distribute_dataset(valid_set))
        # only one elem in the dataset
        for (input_support_val_replica, input_query_val_replica,
             label_support_val_replica, label_query_val_replica,
             ids_val_replica,
             query_indices_val_replica) in dist_valid_dataset_single_elem:
            inp_valid = (input_support_val_replica, input_query_val_replica,
                         label_support_val_replica, label_query_val_replica)
            output_query_valid_replicas, meta_loss_valid = (
                distributed_valid_step(inp_valid, maml, num_replicas, optim,
                                       mirrored_strategy))
            logging.info('[VALIDATION] Iteration %d: meta loss: %.5f ',
                         itr, meta_loss_valid)

            if num_replicas > 1:
                output_query_valid = tf.concat(
                    output_query_valid_replicas.values, 0)
                label_query_valid = tf.concat(
                    label_query_val_replica.values, 0)
                ids_valid = tf.concat(ids_val_replica.values, 0)
                query_indices_valid = tf.concat(
                    query_indices_val_replica.values, 0)
            else:
                output_query_valid = output_query_valid_replicas
                label_query_valid = label_query_val_replica
                ids_valid = ids_val_replica
                query_indices_valid = query_indices_val_replica
            label_query_valid = tf.cast(label_query_valid, dtype=tf.float32)
            pred_valid = tf.one_hot(tf.argmax(output_query_valid, axis=-1),
                                    depth=data_generator.LABEL_SIZE)

            with tf.device('/CPU:0'):
                mIoU_valid = compute_mIoU(
                    label_query_valid[:, :, :, :, 1:],
                    pred_valid[:, :, :, :, 1:],
                    data_generator.LABEL_SIZE - 1)
            logging.info('[VALIDATION] Iteration %d: mean IoU: %.5f ',
                         itr, mIoU_valid)

            if mIoU_valid > best_eval_miou:
                best_eval_miou = mIoU_valid
                # The path needs a placeholder: without one, logging reports
                # a formatting error instead of the message.
                logging.info("saving to %s", model_file)
                maml.save_weights(model_file)

            with tf.device('/CPU:0'):
                acc_metric.update_state(
                    label_query_valid[:, :, :, :, 1:],
                    tf.math.softmax(output_query_valid)[:, :, :, :, 1:])
            acc_valid = acc_metric.result()
            logging.info('[VALIDATION] Iteration %d: accuracy: %.5f ',
                         itr, acc_valid)
            acc_metric.reset_states()

            with meta_metric_writer.as_default():
                tf.summary.scalar('eval mean IoU', mIoU_valid, step=itr)
                tf.summary.scalar('eval accuracy', acc_valid, step=itr)

            with tf.device('/CPU:0'):
                construct_predicted_label_batch(itr, ids_valid,
                                                query_indices_valid,
                                                label_query_valid, pred_valid,
                                                job_dir, storage_client)

            with meta_loss_summary_writer.as_default():
                tf.summary.scalar('eval-meta-loss', meta_loss_valid, step=itr)
def train_model(data_path,
                batch_size,
                image_size,
                crop_size,
                lr_schedule_name,
                init_lr,
                max_lr,
                weight_decay,
                optimizer,
                model_type,
                embedding_size,
                num_epochs,
                checkpoint_path,
                cache_path=None,
                margin=0.35,
                range_test=False,
                use_tpu=False,
                tpu_name=None,
                test_path='',
                use_mixed_precision=False,
                triplet_strategy='',
                images_per_person=35,
                people_per_sample=12,
                pretrained_model='',
                distance_metric="L2",
                soft=True,
                sigma=0.3,
                use_lfw=True):
    """Train an embedding network with a triplet loss, or run a range test.

    Arguments:
        data_path: location of the training data.
        batch_size (int): batch size of the training (and LFW) dataset.
        image_size: forwarded to the dataset builders.
        crop_size: forwarded to the dataset builders.
        lr_schedule_name (str): name given to ``get_learning_rate_schedule``.
        init_lr (float): initial learning rate of the schedule.
        max_lr (float): maximum learning rate of the schedule.
        weight_decay (float): forwarded to ``get_optimizer``.
        optimizer (str): optimizer name forwarded to ``get_optimizer``.
        model_type: forwarded to the dataset builders and to
            ``create_neural_network``.
        embedding_size (int): size of the embedding produced by the model.
        num_epochs (int): number of training epochs.
        checkpoint_path (str): directory for the periodic checkpoints;
            created when missing.
        cache_path: cache location of the training dataset.
        margin (float): margin of the triplet loss.
        range_test (bool): when True, run five epochs at a base learning
            rate of 1e-5, save the loss curve to
            ``./range_test_result.png`` and return without training.
        use_tpu (bool): build the model inside a TPU strategy scope.
        tpu_name (str or None): TPU name, required when ``use_tpu`` is True.
        test_path (str or None): location of the test data. Evaluation is
            skipped when it is None or shorter than two characters.
        use_mixed_precision (bool): set a mixed precision policy
            (``mixed_bfloat16`` on TPU, ``mixed_float16`` otherwise).
        triplet_strategy (str): ``'VANILLA'``, ``'BATCH_HARD'``,
            ``'BATCH_HARD_V2'`` or ``'ADAPTIVE'``; any other value selects
            the triplet focal loss.
        images_per_person (int): forwarded to the training dataset.
        people_per_sample (int): forwarded to the training dataset.
        pretrained_model (str): not used by this function.
        distance_metric (str): forwarded to the loss.
        soft (bool): forwarded to the loss.
        sigma (float): forwarded to the loss.
        use_lfw (bool): build the test set with ``get_LFW_dataset`` instead
            of ``get_test_dataset``.

    Returns:
        None.
    """
    if use_tpu is True:
        assert tpu_name is not None, '[ERROR] TPU name must be specified'
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            tpu=tpu_name)
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        strategy = tf.distribute.experimental.TPUStrategy(resolver)
        print("[INFO] TPUs: ", tf.config.list_logical_devices('TPU'))

    if use_mixed_precision is True:
        if use_tpu is True:
            policy = mixed_precision.Policy('mixed_bfloat16')
        else:
            policy = mixed_precision.Policy('mixed_float16')
        mixed_precision.set_policy(policy)
        print(
            "[INFO] Using mixed precision for training. This will reduce memory consumption\n"
        )

    train_dataset, n_imgs, n_classes = generate_training_dataset(
        data_path=data_path,
        image_size=image_size,
        batch_size=batch_size,
        crop_size=crop_size,
        cache=cache_path,
        use_mixed_precision=use_mixed_precision,
        images_per_person=images_per_person,
        people_per_sample=people_per_sample,
        use_tpu=use_tpu,
        model_type=model_type)

    test_dataset = None
    test_images = None
    if test_path is not None and len(test_path) > 1:
        if use_lfw is True:
            test_dataset, test_images, _ = get_LFW_dataset(
                data_path=test_path,
                image_size=image_size,
                batch_size=batch_size,
                crop_size=crop_size,
                cache='./lfw_dataset_cache.tfcache',
                use_mixed_precision=use_mixed_precision,
                use_tpu=use_tpu,
                train_classes=n_classes,
                model_type=model_type)
        else:
            test_dataset, test_images, _ = get_test_dataset(
                data_path=test_path,
                image_size=image_size,
                batch_size=30,
                crop_size=crop_size,
                cache='./test_dataset_cache.tfcache',
                use_mixed_precision=use_mixed_precision,
                use_tpu=use_tpu,
                train_classes=n_classes,
                model_type=model_type)

    if triplet_strategy == 'VANILLA':
        loss_fn = tfa.losses.TripletSemiHardLoss(margin=margin)
        print('[INFO] Using vanilla triplet loss')
    elif triplet_strategy == 'BATCH_HARD':
        loss_fn = TripletBatchHardLoss(margin=margin,
                                       soft=soft,
                                       distance_metric=distance_metric)
        print('[INFO] Using batch-hard strategy.')
    elif triplet_strategy == 'BATCH_HARD_V2':
        # margin1 must exist as a local before margin2 is derived from it.
        margin1 = -1.0 * margin
        loss_fn = TripletBatchHardV2Loss(margin1=margin1,
                                         margin2=(margin1 / 100.0),
                                         beta=0.002,
                                         distance_metric=distance_metric)
        print('[INFO] Using batch-hard V2 strategy')
    elif triplet_strategy == 'ADAPTIVE':
        loss_fn = AdaptiveTripletLoss(margin=margin,
                                      soft=soft,
                                      lambda_=sigma)
        print('[INFO] Using Adaptive Triplet Loss')
    else:
        loss_fn = TripletFocalLoss(margin=margin,
                                   sigma=sigma,
                                   soft=soft,
                                   distance_metric=distance_metric)
        print('[INFO] Using triplet focal loss.')

    # The metrics need the test images, so they only exist with a test set.
    triplet_loss_metrics = None
    if test_dataset is not None:
        triplet_loss_metrics = TripletLossMetrics(test_images,
                                                  embedding_size)

    def build_model():
        """Create the network, inside the TPU strategy scope when on TPU."""
        if use_tpu is True:
            with strategy.scope():
                net = create_neural_network(model_type=model_type,
                                            embedding_size=embedding_size)
        else:
            net = create_neural_network(model_type=model_type,
                                        embedding_size=embedding_size)
        assert net is not None, '[ERROR] There was a problem while loading the pre-trained weights'
        return net

    def two_pass_step(net, opt, x_batch, y_batch):
        """Run first_step then second_step on one batch; return the loss."""
        with tf.GradientTape() as tape:
            logits = net(x_batch, training=True)
            loss_value = loss_fn(y_batch, logits)
        grads = tape.gradient(loss_value, net.trainable_weights)
        perturbations = opt.first_step(grads, net)

        # The gradients are recomputed after first_step before the update.
        with tf.GradientTape() as tape:
            logits = net(x_batch, training=True)
            loss_value = loss_fn(y_batch, logits)
        grads = tape.gradient(loss_value, net.trainable_weights)
        opt.second_step(grads, net, perturbations)
        return loss_value

    def evaluate(net, as_numpy):
        """Print the triplet metrics on the test set, if there is one."""
        if test_dataset is None:
            return
        for x_batch_test, y_batch_test in test_dataset:
            val_logits = net(x_batch_test, training=False)
            triplet_loss_metrics.update_state(y_batch_test, val_logits)
        result = triplet_loss_metrics.result()
        print(str(result.numpy()) if as_numpy else str(result))
        triplet_loss_metrics.reset_states()

    def save_range_test_plot(lrs, losses):
        """Plot raw and smoothed loss against learning rate and save it."""
        plt.xscale('log')
        plt.plot(lrs, losses, color='blue')
        smooth_losses = savgol_filter(losses, 7, 3)
        plt.plot(lrs, smooth_losses, color='red')
        plt.xlabel('Log learning rate')
        plt.ylabel('Loss')
        plt.savefig('./range_test_result.png')

    if range_test is True:
        opt = get_optimizer(optimizer_name=optimizer,
                            lr_schedule=1e-5,
                            weight_decay=weight_decay)
        model = build_model()

        lrs = []
        losses = []
        for _ in range(5):
            for step, (x_batch_train,
                       y_batch_train) in enumerate(train_dataset):
                loss_value = two_pass_step(model, opt, x_batch_train,
                                           y_batch_train)
                losses.append(float(loss_value.numpy()))
                lrs.append(opt.base_optimizer.lr.numpy())
                if step % 200 == 0 and step > 0:
                    print("Step : %d :: Current loss : %f" %
                          (step, float(loss_value.numpy())))
            save_range_test_plot(lrs, losses)
            evaluate(model, as_numpy=True)

        save_range_test_plot(lrs, losses)
        print(
            '\n[INFO] Training complete. Range test results can be found at "./range_test_result.png"'
        )
        return

    lr_schedule = get_learning_rate_schedule(
        schedule_name=lr_schedule_name,
        learning_rate=init_lr,
        max_lr=max_lr,
        image_count=n_imgs,
        batch_size=batch_size)
    opt = get_optimizer(optimizer_name=optimizer,
                        lr_schedule=lr_schedule,
                        weight_decay=weight_decay)

    os.makedirs(checkpoint_path, exist_ok=True)
    checkpoint_name = checkpoint_path + '/' + 'cp-{epoch:03d}.ckpt'

    model = build_model()

    for epoch in range(num_epochs):
        for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
            loss_value = two_pass_step(model, opt, x_batch_train,
                                       y_batch_train)
            if step % 200 == 0:
                print("Step : %d :: Current loss : %f" %
                      (step, float(loss_value)))

        evaluate(model, as_numpy=False)

        if epoch % 5 == 0:
            # ``{epoch:03d}`` is a named field and needs a keyword argument.
            model.save(checkpoint_name.format(epoch=epoch))

    os.makedirs('./results', exist_ok=True)
    model_name = './results/model-' + datetime.now().strftime(
        "%Y%m%d-%H%M%S")
    model.save(model_name)
    print(
        '\n[INFO] Training complete. Saved model can be found in "./results"'
    )
    return
def set_keras_mixed_precision_policy(policy_name: str) -> None:
    """Install ``policy_name`` as the global tf.keras mixed-precision policy.

    Args:
        policy_name: Name handed to ``mixed_precision.Policy``.
    """
    mixed_precision.set_policy(mixed_precision.Policy(policy_name))
def main(config):
    """Train and periodically evaluate a Dreamer agent as described by ``config``.

    The function:

    1. Makes only ``gpus[config.device]`` visible to TensorFlow, optionally
       enables memory growth, and sets the ``mixed_float16`` policy when
       ``config.precision == 16``.
    2. Creates ``config.logdir`` and a default summary writer on it.
    3. Builds ``config.envs`` training and ``config.envs`` test environments.
    4. Prefills the episode directory with random-action episodes until
       ``config.prefill`` steps are available.
    5. Alternates evaluation and data collection until ``count_steps``
       reaches ``config.steps``, saving the agent to ``variables.pkl`` in the
       log directory after every collection phase.

    Every environment that was created is closed before returning, including
    when an exception interrupts prefill, training or evaluation.

    Args:
        config: Run configuration. Note that ``config.steps`` is overwritten
            in place with its ``int`` value.

    Raises:
        AssertionError: If ``config.precision`` is neither 16 nor 32.
    """
    gpus = tf.config.experimental.list_physical_devices('GPU')
    tf.config.experimental.set_visible_devices(devices=gpus[config.device],
                                               device_type='GPU')
    if config.gpu_growth:
        for gpu in tf.config.experimental.list_physical_devices('GPU'):
            tf.config.experimental.set_memory_growth(gpu, True)
    assert config.precision in (16, 32), config.precision
    if config.precision == 16:
        prec.set_policy(prec.Policy('mixed_float16'))
    config.steps = int(config.steps)
    config.logdir.mkdir(parents=True, exist_ok=True)
    print('Logdir', config.logdir)

    datadir = config.logdir / 'episodes'
    checkpoint = config.logdir / 'variables.pkl'
    writer = tf.summary.create_file_writer(str(config.logdir),
                                           max_queue=1000,
                                           flush_millis=20000)
    writer.set_as_default()

    # Environments are appended one at a time so that the `finally` clause can
    # close whatever was created even if construction fails part-way through.
    train_envs, test_envs = [], []
    try:
        # Create environments.
        for _ in range(config.envs):
            train_envs.append(
                wrappers.Async(
                    lambda: make_env(
                        config, writer, 'train', datadir, store=True),
                    config.parallel))
        for _ in range(config.envs):
            test_envs.append(
                wrappers.Async(
                    lambda: make_env(
                        config, writer, 'test', datadir, store=False),
                    config.parallel))
        actspace = train_envs[0].action_space

        # Prefill dataset with random episodes.
        step = count_steps(datadir, config)
        prefill = max(0, config.prefill - step)
        print(f'Prefill dataset with {prefill} steps.')
        random_agent = lambda o, d, _: ([actspace.sample() for _ in d], None)
        tools.simulate(random_agent, train_envs,
                       prefill / config.action_repeat)
        writer.flush()

        # Train and regularly evaluate the agent.
        step = count_steps(datadir, config)
        print(f'Simulating agent for {config.steps - step} steps.')
        agent = Dreamer(config, datadir, actspace, writer)
        if checkpoint.exists():
            print('Load checkpoint.')
            agent.load(checkpoint)
        state = None
        while step < config.steps:
            print('Start evaluation.')
            eval_policy = functools.partial(agent, training=False)
            # The explicit comparison is kept on purpose: a truthy value that
            # is not equal to True must still take the plain-simulation branch.
            if config.test_model == True:  # noqa: E712
                print('Start evaluate model.')
                tools.test_model(
                    eval_policy,
                    test_envs,
                    episodes=1,
                    dynamics=agent._dynamics,
                    model_metric_summaries=agent._model_metric_summaries,
                    value=agent._value,
                    decode=agent._decode,
                    test_len=config.test_len)
            else:
                tools.simulate(eval_policy, test_envs, episodes=1)
            writer.flush()
            print('Start collection.')
            steps = config.eval_every // config.action_repeat
            state = tools.simulate(agent, train_envs, steps, state=state)
            step = count_steps(datadir, config)
            agent.save(checkpoint)
    finally:
        # Release the environments on success and on failure alike.
        for env in train_envs + test_envs:
            env.close()
def init_network(self):
    """Build the network, run one forward pass, and return the time taken.

    When ``self.args.precision`` is ``"mixed"`` or ``"bfloat16"`` a global
    Keras mixed-precision policy is set and stored on ``self.policy``. The
    network stored on ``self._net`` is ``uresnet2D.UResNet`` when
    ``self.args.conv_mode`` is ``'2D'`` and ``uresnet3D.UResNet3D`` otherwise.
    The method also creates ``self.acc_calculator``, ``self.loss_calculator``
    and ``self._log_keys``.

    Returns:
        Elapsed wall-clock time of the construction, in seconds.
    """
    t0 = time.time()

    # Map the requested precision onto a Keras policy name, if any.
    precision = self.args.precision
    if precision == "mixed":
        policy_name = 'mixed_float16'
    elif precision == "bfloat16":
        policy_name = 'mixed_bfloat16'
    else:
        policy_name = None

    if policy_name is not None:
        from tensorflow.keras.mixed_precision import experimental as mixed_precision
        self.policy = mixed_precision.Policy(policy_name)
        mixed_precision.set_policy(self.policy)

    # NOTE(review): `dims` is not read again in this method; the two calls
    # are retained in case they have side effects - confirm before removing.
    dims = self.larcv_fetcher.batch_dims(1)
    dims[0] = self.local_batch_size()

    # Build the network object, forward pass only.
    if self.args.conv_mode == '2D':
        net_cls = uresnet2D.UResNet
    else:
        net_cls = uresnet3D.UResNet3D
    self._net = net_cls(self.args)
    self._net.trainable = True

    # A forward pass is needed to properly initialize the network.
    batch = self.larcv_fetcher.fetch_next_batch("train", force_pop=False)
    image, label = self.cast_input(batch['image'], batch['label'])
    self.forward_pass(image, label, training=False)

    self.acc_calculator = AccuracyCalculator.AccuracyCalculator()
    self.loss_calculator = LossCalculator.LossCalculator(
        self.args.loss_balance_scheme, self._channels_dim)

    self._log_keys = ["loss", "Average/Non_Bkg_Accuracy", "Average/mIoU"]

    return time.time() - t0
import tensorflow as tf from tensorflow.keras.preprocessing.image import ImageDataGenerator from tensorflow.keras.mixed_precision import experimental as mixed_precision import pandas as pd from tensorflow.keras.layers import Flatten, Dense, LeakyReLU, BatchNormalization, Dropout import keras.backend as K from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping import efficientnet.keras as efn import tensorflow_addons as tfa policy = mixed_precision.Policy('mixed_float16') mixed_precision.set_policy(policy) datagen = ImageDataGenerator(rescale=1. / 255, validation_split=0.2, horizontal_flip=True) train_csv = pd.read_csv(r"/content/train.csv") train_csv["label"] = train_csv["label"].astype(str) base_model = tf.keras.applications.ResNet50(weights='imagenet', input_shape=(512, 512, 3), include_top=True) base_model.trainable = True model = tf.keras.Sequential([ tf.keras.layers.Input((512, 512, 3)), tf.keras.layers.BatchNormalization(renorm=True), base_model, BatchNormalization(), tf.keras.layers.LeakyReLU(), tf.keras.layers.Flatten(), tf.keras.layers.Dense(512), BatchNormalization(),
def build(model_fn: Callable[[], Union[Model, List[Model]]],
          optimizer_fn: Union[str, Scheduler, Callable, List[str],
                              List[Callable], List[Scheduler], None],
          weights_path: Union[str, None, List[Union[str, None]]] = None,
          model_name: Union[str, List[str], None] = None,
          mixed_precision: bool = False) -> Union[Model, List[Model]]:
    """Build model instances and associate them with optimizers.

    This method can be used with TensorFlow models / optimizers:
    ```python
    model_def = fe.architecture.tensorflow.LeNet
    model = fe.build(model_fn = model_def, optimizer_fn="adam")
    model = fe.build(model_fn = model_def, optimizer_fn=lambda: tf.optimizers.Adam(lr=0.1))
    model = fe.build(model_fn = model_def, optimizer_fn="adam", weights_path="~/weights.h5")
    ```

    This method can be used with PyTorch models / optimizers:
    ```python
    model_def = fe.architecture.pytorch.LeNet
    model = fe.build(model_fn = model_def, optimizer_fn="adam")
    model = fe.build(model_fn = model_def, optimizer_fn=lambda x: torch.optim.Adam(params=x, lr=0.1))
    model = fe.build(model_fn = model_def, optimizer_fn="adam", weights_path="~/weights.pt")
    ```

    Args:
        model_fn: A function that define model(s).
        optimizer_fn: Optimizer string/definition or a list of optimizer
            instances/strings. The number of optimizers provided here should
            match the number of models generated by the `model_fn`. When
            empty or None, every model is built without an optimizer.
        weights_path: Path(s) from which to load model weights. If not None,
            then the number of weight paths provided should match the number
            of models generated by the `model_fn`.
        model_name: Name(s) of the model(s) that will be used for logging
            purpose. If None, a name will be automatically generated and
            assigned.
        mixed_precision: Whether to enable mixed precision network
            operations, only applies to tensorflow models.

    Returns:
        models: The model(s) built by FastEstimator. A single model is
            returned unwrapped; several models are returned as a list.

    Raises:
        ValueError: If the first model is neither a `tf.keras.Model` nor a
            `torch.nn.Module`.
        AssertionError: If the numbers of models, optimizers, weight paths
            and model names disagree.
    """
    def _generate_model_names(num_names):
        # `build.count` persists across calls so that auto-generated names
        # stay unique: "model", "model1", "model2", ...
        names = [
            "model" if i + build.count == 0 else "model{}".format(i +
                                                                  build.count)
            for i in range(num_names)
        ]
        build.count += num_names
        return names

    if not hasattr(build, "count"):
        build.count = 0

    # mix-precision handling: the policy is set globally on every call, so a
    # call with mixed_precision=False resets it back to float32.
    if mixed_precision:
        mixed_precision_tf.set_policy(
            mixed_precision_tf.Policy('mixed_float16'))
    else:
        mixed_precision_tf.set_policy(mixed_precision_tf.Policy('float32'))

    models, optimizer_fn = to_list(model_fn()), to_list(optimizer_fn)

    # fill optimizer: one `None` placeholder per model, mirroring how a
    # missing `weights_path` is expanded below. A single `[None]` would make
    # the length check fail whenever `model_fn` returns several models.
    if not optimizer_fn:
        optimizer_fn = [None] * len(models)

    # check framework
    if isinstance(models[0], tf.keras.Model):
        framework = "tf"
    elif isinstance(models[0], torch.nn.Module):
        framework = "torch"
    else:
        raise ValueError("unrecognized model format: {}".format(
            type(models[0])))

    # multi-gpu handling
    if torch.cuda.device_count() > 1:
        if framework == "tf" and not isinstance(
                tf.distribute.get_strategy(), tf.distribute.MirroredStrategy):
            tf.distribute.experimental_set_strategy(
                tf.distribute.MirroredStrategy())
            # Re-create the models after the mirrored strategy is installed.
            models = to_list(model_fn())
        if framework == "torch":
            models = [torch.nn.DataParallel(model) for model in models]

    # generate names
    if not model_name:
        model_name = _generate_model_names(len(models))
    model_name = to_list(model_name)

    # load weights
    if weights_path:
        weights_path = to_list(weights_path)
    else:
        weights_path = [None] * len(models)

    assert len(models) == len(optimizer_fn) == len(weights_path) == len(model_name), \
        "Found inconsistency in number of models, optimizers, model_name or weights"

    # create optimizer
    for idx, (model, optimizer_def, weight, name) in enumerate(
            zip(models, optimizer_fn, weights_path, model_name)):
        models[idx] = trace_model(_fe_compile(model, optimizer_def, weight,
                                              name, framework),
                                  model_idx=idx if len(models) > 1 else -1,
                                  model_fn=model_fn,
                                  optimizer_fn=optimizer_def,
                                  weights_path=weight)
    if len(models) == 1:
        models = models[0]
    return models