def get_deepspeech(input_dim,
                   output_dim,
                   context=9,
                   units=2048,
                   dropouts=(0.05, 0.05, 0.05, 0, 0.05),
                   tflite_version=False,
                   is_mixed_precision=False,
                   lstm_implementation=2,
                   random_state=1) -> keras.Model:
    """
    The `get_deepspeech` function returns the graph definition of the DeepSpeech
    model. Default parameters are overwritten only where needed.

    Reference:
    "Deep Speech: Scaling up end-to-end speech recognition."
    (https://arxiv.org/abs/1412.5567)
    """
    if is_mixed_precision:
        policy = mixed_precision.Policy('mixed_float16')
        mixed_precision.set_policy(policy)

    if dropouts[3] != 0:
        logger.warning("Mozilla DeepSpeech doesn't use dropout "
                       "after LSTM(dropouts[3]). Be careful!")
    np.random.seed(random_state)
    tf.random.set_seed(random_state)

    max_seq_length = None
    if tflite_version:
        max_seq_length = 1

    with tf.device('/gpu:0'):
        input_tensor = layers.Input([max_seq_length, input_dim], name='X')

        # Add 4th dimension [batch, time, frequency, channel]
        x = layers.Lambda(keras.backend.expand_dims,
                          arguments=dict(axis=3))(input_tensor)
        # Pad the time dimension with zeros
        x = layers.ZeroPadding2D(padding=(context, 0))(x)
        # Convolve signal in time dim
        receptive_field = (2 * context + 1, input_dim)
        x = layers.Conv2D(filters=units, kernel_size=receptive_field)(x)
        # Squeeze back to a 3D tensor [batch, time, units]
        x = layers.Lambda(keras.backend.squeeze, arguments=dict(axis=2))(x)

        x = layers.ReLU()(x)
        x = layers.Dropout(rate=dropouts[0])(x)

        x = layers.TimeDistributed(layers.Dense(units), name='td_dense_2')(x)

        x = layers.ReLU(max_value=20)(x)
        x = layers.Dropout(rate=dropouts[1])(x)

        x = layers.TimeDistributed(layers.Dense(units), name='td_dense_3')(x)

        x = layers.ReLU(max_value=20)(x)
        x = layers.Dropout(rate=dropouts[2])(x)

        x = layers.LSTM(units,
                        return_sequences=True,
                        name='lstm_1',
                        unroll=tflite_version,
                        implementation=lstm_implementation)(x)
        x = layers.Dropout(rate=dropouts[3])(x)

        x = layers.TimeDistributed(layers.Dense(units), name='td_dense_4')(x)
        x = layers.ReLU(max_value=20)(x)
        x = layers.Dropout(rate=dropouts[4])(x)

        x = layers.TimeDistributed(layers.Dense(output_dim),
                                   name='td_dense_5')(x)

        model = keras.Model(input_tensor, x, name='DeepSpeech')

    if is_mixed_precision:  # revert policy
        policy = mixed_precision.Policy('float32')
        mixed_precision.set_policy(policy)

    return model
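
# Hedged usage sketch (not part of the original example): builds the graph with an
# assumed 26-dimensional MFCC input and a 29-symbol output alphabet (28 characters
# plus the CTC blank). The model emits per-frame logits of shape [batch, time, 29].
def _demo_get_deepspeech():
    model = get_deepspeech(input_dim=26, output_dim=29)
    model.summary()
    return model
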
        'bbox': nms.nmsed_boxes,
        'classes': nms.nmsed_classes,
        'confidence': nms.nmsed_scores,
    }


if __name__ == '__main__':

    from yolo.utils.run_utils import prep_gpu
    from yolo.configs import yolo as exp_cfg
    from yolo.tasks.yolo import YoloTask
    import yolo.utils.export.tensor_rt as trt
    prep_gpu()

    from tensorflow.keras.mixed_precision import experimental as mixed_precision
    mixed_precision.set_policy('float16')

    # init a fake webcam
    # ls /dev/video*
    # sudo modprobe -r v4l2loopback
    # sudo modprobe v4l2loopback devices=1 video_nr=20 card_label="v4l2loopback" exclusive_caps=1

    # name = "saved_models/v4/regular"
    # new_name = f"{name}_tensorrt"
    # model = trt.TensorRT(saved_model=new_name, save_new_path=new_name, max_workspace_size_bytes=4000000000, max_batch_size=5)#, precision_mode="INT8", use_calibration=True)
    # model.compile()
    # model.summary()
    # model.set_postprocessor_fn(func)

    config = exp_cfg.YoloTask()
    task = YoloTask(config)
Example #3
def train(strategy, cfg):
    os.makedirs(cfg.MODEL.SAVE_DIR, exist_ok=True)

    if cfg.DATASET.BFLOAT16:
        policy = mixed_precision.Policy('mixed_bfloat16')
        mixed_precision.set_policy(policy)

    tf.random.set_seed(cfg.TRAIN.SEED)
    np.random.seed(cfg.TRAIN.SEED)

    spe = int(np.ceil(cfg.DATASET.TRAIN_SAMPLES / cfg.TRAIN.BATCH_SIZE))
    spv = cfg.DATASET.VAL_SAMPLES // cfg.VAL.BATCH_SIZE

    if cfg.TRAIN.SCALE_LR:
        lr = cfg.TRAIN.BASE_LR * cfg.TRAIN.BATCH_SIZE / 32
        cfg.TRAIN.WARMUP_FACTOR = 32 / cfg.TRAIN.BATCH_SIZE
    else:
        lr = cfg.TRAIN.BASE_LR

    if cfg.TRAIN.LR_SCHEDULE == 'warmup_cosine_decay':
        lr_schedule = WarmupCosineDecay(
            initial_learning_rate=lr,
            decay_steps=cfg.TRAIN.EPOCHS * spe,
            warmup_steps=cfg.TRAIN.WARMUP_EPOCHS * spe,
            warmup_factor=cfg.TRAIN.WARMUP_FACTOR)
    elif cfg.TRAIN.LR_SCHEDULE == 'warmup_piecewise':
        lr_schedule = WarmupPiecewise(
            boundaries=[x * spe for x in cfg.TRAIN.DECAY_EPOCHS],
            values=[lr, lr / 10, lr / 10 ** 2],
            warmup_steps=spe * cfg.TRAIN.WARMUP_EPOCHS,
            warmup_factor=cfg.TRAIN.WARMUP_FACTOR)
    else:
        lr_schedule = lr

    with strategy.scope():
        optimizer = tf.keras.optimizers.Adam(lr_schedule)
        if cfg.TRAIN.WANDB_RUN_ID:
            api = wandb.Api()
            run = api.run(f"{cfg.EVAL.WANDB_RUNS}/{cfg.TRAIN.WANDB_RUN_ID}")
            run.file("model-best.h5").download(replace=True)
            model = tf.keras.models.load_model('model-best.h5', 
                custom_objects={
                            'relu6': tf.nn.relu6,
                            'WarmupCosineDecay': WarmupCosineDecay
                })
            model.compile(optimizer=model.optimizer, loss=mse)
        else:
            if cfg.MODEL.TYPE == 'simple_baseline':
                model = SimpleBaseline(cfg)
            elif cfg.MODEL.TYPE == 'hrnet':
                model = HRNet(cfg)
            elif cfg.MODEL.TYPE == 'evopose':
                model = EvoPose(cfg)
            elif cfg.MODEL.TYPE == 'eflite':
                model = EfficientNetLite(cfg)
            elif cfg.MODEL.TYPE == 'ef':
                model = EfficientNet(cfg)

            model.compile(optimizer=optimizer, loss=mse)

    cfg.DATASET.OUTPUT_SHAPE = model.output_shape[1:]
    cfg.DATASET.SIGMA = 2 * cfg.DATASET.OUTPUT_SHAPE[0] / 64

    wandb_config = setup_wandb(cfg, model)

    train_ds = load_tfds(cfg, 'train')
    train_ds = strategy.experimental_distribute_dataset(train_ds)

    if cfg.TRAIN.VAL:
        val_ds = load_tfds(cfg, 'val')
        val_ds = strategy.experimental_distribute_dataset(val_ds)
    else:
        val_ds, spv = None, None

    print('Training {} ({} / {}) on {} for {} epochs'
          .format(cfg.MODEL.NAME, wandb_config.parameters,
                  wandb_config.flops, cfg.TRAIN.ACCELERATOR, cfg.TRAIN.EPOCHS))

    initial_epoch = 0
    if cfg.TRAIN.WANDB_RUN_ID:
        initial_epoch = cfg.TRAIN.INITIAL_EPOCH

    model.fit(train_ds, initial_epoch=initial_epoch, epochs=cfg.TRAIN.EPOCHS, verbose=1,
                        validation_data=val_ds, 
                        validation_steps=spv, 
                        steps_per_epoch=spe,
                        callbacks=[WandbCallback()])
    
    return model
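
# Hedged usage sketch (not part of the original snippet): `train` expects a
# tf.distribute strategy plus a config object exposing the attributes read above
# (cfg.MODEL, cfg.DATASET, cfg.TRAIN, cfg.VAL, cfg.EVAL).
def _example_pose_training(cfg):
    strategy = tf.distribute.MirroredStrategy()
    return train(strategy, cfg)
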
Example #4
def train(datadir, var_dict, output_vars, filters, kernels, lr, batch_size,
          early_stopping_patience, epochs, exp_id, model_save_dir,
          pred_save_dir, train_years, valid_years, test_years, lead_time, gpu,
          norm_subsample, data_subsample, lr_step, lr_divide, network_type,
          restore_best_weights, bn_position, nt_in, dt_in, use_bias, l2, skip,
          dropout, reduce_lr_patience, reduce_lr_factor, min_lr_times, unres,
          loss, cmip, cmip_dir, pretrained_model, last_pretrained_layer,
          last_trainable_layer, min_es_delta, optimizer, activation, ext_mean,
          ext_std, cont_time, multi_dt, momentum, parametric, one_cycle,
          long_skip, train_tfr_files, valid_tfr_files, test_tfr_files,
          tfr_num_parallel_calls, tfr_buffer_size, tfr_prefetch, y_roll,
          X_roll, discard_first, min_lead_time, relu_idxs, tp_log,
          tfr_out_idxs, predict_difference, is_categorical, bin_min, bin_max,
          num_bins, quantile_bins, **kwargs):
    print(type(var_dict))

    # os.environ["CUDA_VISIBLE_DEVICES"]=str(2)
    # # Limit TF memory usage
    # limit_mem()
    os.environ["CUDA_VISIBLE_DEVICES"] = ','.join([str(g) for g in gpu])
    mirrored_strategy = tf.distribute.MirroredStrategy(
        devices=[f"/gpu:{i}" for i, g in enumerate(gpu)])

    # Mixed precision policy
    policy = mixed_precision.Policy('mixed_float16')
    mixed_precision.set_policy(policy)

    # Open dataset and create data generators
    if cmip:
        if len(cmip_dir) > 1:
            dg_train, dg_valid, dg_test = [], [], []
            for cd in cmip_dir:
                dgtr, dgv, dgte = load_data(
                    var_dict,
                    datadir,
                    cmip,
                    cd,
                    train_years,
                    valid_years,
                    test_years,
                    lead_time,
                    batch_size,
                    output_vars,
                    data_subsample,
                    norm_subsample,
                    nt_in,
                    dt_in,
                    ext_mean=ext_mean,
                    ext_std=ext_std,
                    cont_time=cont_time,
                    multi_dt=multi_dt,
                    train_tfr_files=train_tfr_files,
                    valid_tfr_files=valid_tfr_files,
                    test_tfr_files=test_tfr_files,
                    tfr_num_parallel_calls=tfr_num_parallel_calls,
                    tfr_buffer_size=tfr_buffer_size,
                    tfr_prefetch=tfr_prefetch,
                    y_roll=y_roll,
                    X_roll=X_roll,
                    discard_first=discard_first,
                    min_lead_time=min_lead_time,
                    tp_log=tp_log,
                    tfr_out_idxs=tfr_out_idxs,
                    predict_difference=predict_difference,
                    is_categorical=is_categorical,
                    bin_min=bin_min,
                    bin_max=bin_max,
                    num_bins=num_bins,
                    quantile_bins=quantile_bins)
                dg_train.append(dgtr)
                dg_valid.append(dgv)
                dg_test.append(dgte)
            dg_train, dg_valid, dg_test = [
                CombinedDataGenerator(dg, batch_size)
                for dg in [dg_train, dg_valid, dg_test]
            ]
        else:
            dg_train, dg_valid, dg_test = load_data(
                var_dict,
                datadir,
                cmip,
                cmip_dir[0],
                train_years,
                valid_years,
                test_years,
                lead_time,
                batch_size,
                output_vars,
                data_subsample,
                norm_subsample,
                nt_in,
                dt_in,
                ext_mean=ext_mean,
                ext_std=ext_std,
                cont_time=cont_time,
                multi_dt=multi_dt,
                train_tfr_files=train_tfr_files,
                valid_tfr_files=valid_tfr_files,
                test_tfr_files=test_tfr_files,
                tfr_num_parallel_calls=tfr_num_parallel_calls,
                tfr_buffer_size=tfr_buffer_size,
                tfr_prefetch=tfr_prefetch,
                y_roll=y_roll,
                X_roll=X_roll,
                discard_first=discard_first,
                min_lead_time=min_lead_time,
                tp_log=tp_log,
                tfr_out_idxs=tfr_out_idxs,
                predict_difference=predict_difference,
                is_categorical=is_categorical,
                bin_min=bin_min,
                bin_max=bin_max,
                num_bins=num_bins,
                quantile_bins=quantile_bins)
    else:
        dg_train, dg_valid, dg_test = load_data(
            var_dict,
            datadir,
            cmip,
            cmip_dir,
            train_years,
            valid_years,
            test_years,
            lead_time,
            batch_size,
            output_vars,
            data_subsample,
            norm_subsample,
            nt_in,
            dt_in,
            ext_mean=ext_mean,
            ext_std=ext_std,
            cont_time=cont_time,
            multi_dt=multi_dt,
            train_tfr_files=train_tfr_files,
            valid_tfr_files=valid_tfr_files,
            test_tfr_files=test_tfr_files,
            tfr_num_parallel_calls=tfr_num_parallel_calls,
            tfr_buffer_size=tfr_buffer_size,
            tfr_prefetch=tfr_prefetch,
            y_roll=y_roll,
            X_roll=X_roll,
            discard_first=discard_first,
            min_lead_time=min_lead_time,
            tp_log=tp_log,
            tfr_out_idxs=tfr_out_idxs,
            predict_difference=predict_difference,
            is_categorical=is_categorical,
            bin_min=bin_min,
            bin_max=bin_max,
            num_bins=num_bins,
            quantile_bins=quantile_bins)

    # Build model
    if pretrained_model is not None:
        pretrained_model = keras.models.load_model(pretrained_model,
                                                   custom_objects={
                                                       'PeriodicConv2D':
                                                       PeriodicConv2D,
                                                       'ChannelReLU2D':
                                                       ChannelReLU2D,
                                                       'lat_mse':
                                                       keras.losses.mse,
                                                       'lat_mae':
                                                       keras.losses.mae
                                                   })

    with mirrored_strategy.scope():
        if network_type == 'resnet':
            model = build_resnet(filters,
                                 kernels,
                                 input_shape=dg_train.shape,
                                 bn_position=bn_position,
                                 use_bias=use_bias,
                                 l2=l2,
                                 skip=skip,
                                 dropout=dropout,
                                 activation=activation,
                                 long_skip=long_skip,
                                 relu_idxs=relu_idxs,
                                 categorical=is_categorical,
                                 nvars=len(dg_train.output_idxs))
        elif network_type == 'uresnet':
            model = build_uresnet(filters,
                                  kernels,
                                  unres,
                                  input_shape=dg_train.shape,
                                  bn_position=bn_position,
                                  use_bias=use_bias,
                                  l2=l2,
                                  skip=skip,
                                  dropout=dropout,
                                  activation=activation)

        if pretrained_model is not None:
            # Copy over weights
            for i, l in enumerate(pretrained_model.layers):
                model.layers[i].set_weights(l.get_weights())
                if l.name == last_pretrained_layer: break

            # Set trainable to false
            if last_trainable_layer is not None:
                for l in model.layers:
                    l.trainable = False
                    if l.name == last_trainable_layer: break

        if multi_dt > 1:
            model = create_multi_dt_model(model, multi_dt, dg_train)

        if loss == 'lat_mse':
            loss = create_lat_mse(dg_train.data.lat)
        if loss == 'lat_mae':
            loss = create_lat_mae(dg_train.data.lat)
        if loss == 'lat_rmse':
            loss = create_lat_rmse(dg_train.data.lat)
        if loss == 'lat_crps':
            loss = create_lat_crps(dg_train.data.lat,
                                   len(dg_train.output_idxs))
        if loss == 'lat_crps_relu':
            loss = create_lat_crps(dg_train.data.lat,
                                   len(dg_train.output_idxs),
                                   relu=True)
        if loss == 'lat_crps_mae':
            loss = create_lat_crps_mae(dg_train.data.lat,
                                       len(dg_train.output_idxs))
        if loss == 'lat_crps_lcgev':
            loss = create_lat_crps_lcgev(dg_train.data.lat,
                                         len(dg_train.output_idxs))
        if loss == 'lat_log_loss':
            loss = create_lat_log_loss(dg_train.data.lat,
                                       len(dg_train.output_idxs))
        if loss == 'lat_categorical_crossentropy':
            loss = create_lat_categorical_loss(dg_train.data.lat,
                                               len(dg_train.output_idxs))

        if optimizer == 'adam':
            opt = keras.optimizers.Adam(lr)
        elif optimizer == 'adadelta':
            opt = keras.optimizers.Adadelta(lr)
        elif optimizer == 'sgd':
            opt = keras.optimizers.SGD(lr, momentum=momentum, nesterov=True)
        elif optimizer == 'rmsprop':
            opt = keras.optimizers.RMSprop(lr, momentum=momentum)

        model.compile(opt, loss)
        model.summary()

    # Learning rate settings
    callbacks = []
    if early_stopping_patience is not None:
        callbacks.append(
            tf.keras.callbacks.EarlyStopping(
                patience=early_stopping_patience,
                verbose=1,
                min_delta=min_es_delta,
                mode='auto',
                restore_best_weights=restore_best_weights))
    if reduce_lr_patience is not None:
        callbacks.append(
            tf.keras.callbacks.ReduceLROnPlateau(
                patience=reduce_lr_patience,
                factor=reduce_lr_factor,
                verbose=1,
                min_lr=reduce_lr_factor**min_lr_times * lr,
            ))
    if lr_step is not None:
        callbacks.append(
            keras.callbacks.LearningRateScheduler(
                LRUpdate(lr, lr_step, lr_divide)))
    if one_cycle:
        callbacks.append(
            OneCycleLR(
                lr,
                maximum_momentum=0.95 if optimizer == 'sgd' else None,
                minimum_momentum=0.85 if optimizer == 'sgd' else None,
                verbose=1))

    # Train model
    history = model.fit(dg_train.tfr_dataset or dg_train,
                        epochs=epochs,
                        validation_data=dg_valid.tfr_dataset or dg_valid,
                        callbacks=callbacks)
    print(f'Saving model: {model_save_dir}/{exp_id}.h5')
    model.save(f'{model_save_dir}/{exp_id}.h5')
    print(f'Saving model weights: {model_save_dir}/{exp_id}_weights.h5')
    model.save_weights(f'{model_save_dir}/{exp_id}_weights.h5')
    print(f'Saving training_history: {model_save_dir}/{exp_id}_history.pkl')
    to_pickle(history.history, f'{model_save_dir}/{exp_id}_history.pkl')
    print(
        f'Saving norm files: {model_save_dir}/{exp_id}_mean.nc and {model_save_dir}/{exp_id}_std.nc'
    )
    dg_train.mean.to_netcdf(f'{model_save_dir}/{exp_id}_mean.nc')
    dg_train.std.to_netcdf(f'{model_save_dir}/{exp_id}_std.nc')

    # Create predictions
    preds = create_predictions(model,
                               dg_test,
                               parametric=parametric,
                               multi_dt=multi_dt > 1)
    if len(preds.lat) != 32:
        preds = regrid(preds, ddeg_out=5.625)
    print(f'Saving predictions: {pred_save_dir}/{exp_id}.nc')
    preds.to_netcdf(f'{pred_save_dir}/{exp_id}.nc')

    # Print score in real units

    if not cmip:
        if '5.625deg' in datadir:
            valdir = datadir
        else:
            valdir = '/'.join(datadir.split('/')[:-2] + ['5.625deg/'])

        z500_valid = load_test_data(f'{valdir}geopotential_500',
                                    'z',
                                    years=slice(test_years[0],
                                                test_years[1])).drop('level')
        t850_valid = load_test_data(f'{valdir}temperature_850',
                                    't',
                                    years=slice(test_years[0],
                                                test_years[1])).drop('level')
        tp = xr.open_mfdataset(
            f'{valdir}/6hr_precipitation/*.nc',
            combine='by_coords').sel(time=slice(test_years[0], test_years[1]))
        t2m = xr.open_mfdataset(
            f'{valdir}/2m_temperature/*.nc',
            combine='by_coords').sel(time=slice(test_years[0], test_years[1]))
        valid = xr.merge([z500_valid, t850_valid, tp, t2m])

        print(compute_weighted_rmse(preds, valid).load())
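
# Hedged sketch (an assumption, not the project's actual implementation) of the kind
# of loss factory used above: `create_lat_mse(lat)` presumably returns an MSE weighted
# by cos(latitude), with the weights normalised to mean 1 over the latitude dimension.
def _example_lat_weighted_mse(lat):
    # lat: 1-D array of latitudes in degrees
    weights = tf.math.cos(tf.constant(lat, dtype=tf.float32) * 3.14159265 / 180.0)
    weights = weights / tf.reduce_mean(weights)
    weights = weights[None, :, None, None]  # broadcast over [batch, lat, lon, channels]

    def loss(y_true, y_pred):
        return tf.reduce_mean(weights * tf.square(y_true - y_pred))

    return loss
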
Example #5
def train_updnet(
        multicoil=True,
        brain=False,
        af=4,
        contrast=None,
        cuda_visible_devices='0123',
        n_samples=None,
        n_epochs=200,
        n_iter=10,
        use_mixed_precision=False,
        n_layers=3,
        base_n_filter=16,
        non_linearity='relu',
        channel_attention_kwargs=None,
        refine_smaps=False,
        loss='mae',
        original_run_id=None,
        fixed_masks=False,
        n_epochs_original=250,
        equidistant_fake=False,
        mask_type=None,
    ):
    if brain:
        n_volumes = brain_n_volumes_train
    else:
        n_volumes = n_volumes_train

    # paths
    if multicoil:
        if brain:
            train_path = f'{FASTMRI_DATA_DIR}brain_multicoil_train/'
            val_path = f'{FASTMRI_DATA_DIR}brain_multicoil_val/'
        else:
            train_path = f'{FASTMRI_DATA_DIR}multicoil_train/'
            val_path = f'{FASTMRI_DATA_DIR}multicoil_val/'
    else:
        train_path = f'{FASTMRI_DATA_DIR}singlecoil_train/singlecoil_train/'
        val_path = f'{FASTMRI_DATA_DIR}singlecoil_val/'


    os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(cuda_visible_devices)
    af = int(af)

    # trying mixed precision
    if use_mixed_precision:
        policy_type = 'mixed_float16'
    else:
        policy_type = 'float32'
    policy = mixed_precision.Policy(policy_type)
    mixed_precision.set_policy(policy)
    # generators
    if multicoil:
        dataset = multicoil_dataset
        if mask_type is None:
            if brain:
                if equidistant_fake:
                    mask_type = 'equidistant_fake'
                else:
                    mask_type = 'equidistant'
            else:
                mask_type = 'random'
        kwargs = {
            'parallel': False,
            'output_shape_spec': brain,
            'mask_type': mask_type,
        }
    else:
        dataset = singlecoil_dataset
        kwargs = {}
    train_set = dataset(
        train_path,
        AF=af,
        contrast=contrast,
        inner_slices=None,
        rand=True,
        scale_factor=1e6,
        n_samples=n_samples,
        fixed_masks=fixed_masks,
        **kwargs
    )
    val_set = dataset(
        val_path,
        AF=af,
        contrast=contrast,
        inner_slices=None,
        rand=True,
        scale_factor=1e6,
        **kwargs
    )

    run_params = {
        'n_primal': 5,
        'n_dual': 1,
        'primal_only': True,
        'multicoil': multicoil,
        'n_layers': n_layers,
        'layers_n_channels': [base_n_filter * 2**i for i in range(n_layers)],
        'non_linearity': non_linearity,
        'n_iter': n_iter,
        'channel_attention_kwargs': channel_attention_kwargs,
        'refine_smaps': refine_smaps,
        'output_shape_spec': brain,
    }

    if multicoil:
        updnet_type = 'updnet_sense_'
        if brain:
            updnet_type += 'brain_'
    else:
        updnet_type = 'updnet_singlecoil_'
    additional_info = f'af{af}'
    if contrast is not None:
        additional_info += f'_{contrast}'
    if n_samples is not None:
        additional_info += f'_{n_samples}'
    if n_iter != 10:
        additional_info += f'_i{n_iter}'
    if non_linearity != 'relu':
        additional_info += f'_{non_linearity}'
    if n_layers != 3:
        additional_info += f'_l{n_layers}'
    if base_n_filter != 16:
        additional_info += f'_bf{base_n_filter}'
    if loss != 'mae':
        additional_info += f'_{loss}'
    if channel_attention_kwargs:
        additional_info += '_ca'
    if refine_smaps:
        additional_info += '_rf_sm'
    if fixed_masks:
        additional_info += '_fixed_masks'

    run_id = f'{updnet_type}_{additional_info}_{int(time.time())}'
    chkpt_path = f'{CHECKPOINTS_DIR}checkpoints/{run_id}' + '-{epoch:02d}.hdf5'

    chkpt_cback = ModelCheckpoint(chkpt_path, period=n_epochs, save_weights_only=True)
    log_dir = op.join(f'{LOGS_DIR}logs', run_id)
    tboard_cback = TensorBoard(
        profile_batch=0,
        log_dir=log_dir,
        histogram_freq=0,
        write_graph=False,
        write_images=False,
    )
    tqdm_cback = TQDMProgressBar()

    model = UPDNet(**run_params)
    if original_run_id is not None:
        lr = 1e-7
        n_steps = brain_volumes_per_contrast['train'].get(contrast, n_volumes//2)
    else:
        lr = 1e-4
        n_steps = n_volumes
    default_model_compile(model, lr=lr, loss=loss)
    print(run_id)
    if original_run_id is not None:
        if os.environ.get('FASTMRI_DEBUG'):
            n_epochs_original = 1
        model.load_weights(f'{CHECKPOINTS_DIR}checkpoints/{original_run_id}-{n_epochs_original:02d}.hdf5')

    model.fit(
        train_set,
        steps_per_epoch=n_steps,
        epochs=n_epochs,
        validation_data=val_set,
        validation_steps=2,
        verbose=0,
        callbacks=[tboard_cback, chkpt_cback, tqdm_cback],
    )
    return run_id
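
# Hedged usage sketch (hypothetical arguments, not from the original source): a short
# multicoil run at acceleration factor 4 with mixed precision enabled.
def _example_updnet_run():
    return train_updnet(
        multicoil=True,
        af=4,
        n_epochs=2,
        n_iter=10,
        use_mixed_precision=True,
    )
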
Example #6
def get_deepspeech2(input_dim,
                    output_dim,
                    is_mixed_precision=True,
                    rnn_units=800,
                    random_state=1) -> keras.Model:
    if is_mixed_precision:
        policy = mixed_precision.Policy('mixed_float16')
        mixed_precision.set_policy(policy)

    np.random.seed(random_state)
    tf.random.set_seed(random_state)

    # Create the model under the CPU scope to avoid OOM errors during the
    # concatenation of a large distributed model.
    with tf.device('/cpu:0'):
        # Define input tensor [batch, time, features]
        input_tensor = layers.Input([None, input_dim], name='X')

        # Add 4th dimension [batch, time, frequency, channel]
        x = layers.Lambda(keras.backend.expand_dims,
                          arguments=dict(axis=-1))(input_tensor)
        x = layers.Conv2D(filters=32,
                          kernel_size=[11, 41],
                          strides=[2, 2],
                          padding='same',
                          use_bias=False,
                          name='conv_1')(x)
        x = layers.BatchNormalization(name='conv_1_bn')(x)
        x = layers.ReLU(name='conv_1_relu')(x)

        x = layers.Conv2D(filters=32,
                          kernel_size=[11, 21],
                          strides=[1, 2],
                          padding='same',
                          use_bias=False,
                          name='conv_2')(x)
        x = layers.BatchNormalization(name='conv_2_bn')(x)
        x = layers.ReLU(name='conv_2_relu')(x)
        # We need to squeeze back to a 3D tensor. Thanks to the stride-2 convolutions
        # along the frequency axis, the number of features per channel is reduced four-fold.
        x = layers.Reshape([-1, input_dim // 4 * 32])(x)

        for i in [1, 2, 3, 4, 5]:
            recurrent = layers.GRU(units=rnn_units,
                                   activation='tanh',
                                   recurrent_activation='sigmoid',
                                   use_bias=True,
                                   return_sequences=True,
                                   reset_after=True,
                                   name=f'gru_{i}')
            x = layers.Bidirectional(recurrent,
                                     name=f'bidirectional_{i}',
                                     merge_mode='concat')(x)
            x = layers.Dropout(rate=0.5)(x) if i < 5 else x  # Only between recurrent layers

        # Return logits over the characters at each time step. The CTC computation
        # is then more stable than when applied to softmax outputs.
        x = layers.TimeDistributed(layers.Dense(units=rnn_units * 2),
                                   name='dense_1')(x)
        x = layers.ReLU(name='dense_1_relu')(x)
        x = layers.Dropout(rate=0.5)(x)
        output_tensor = layers.TimeDistributed(layers.Dense(units=output_dim),
                                               name='dense_2')(x)

        model = keras.Model(input_tensor, output_tensor, name='DeepSpeech2')
    return model
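
# Hedged usage sketch (not in the original example): with 80 mel features (an assumed
# value), the two stride-2 convolutions reduce the frequency axis to 80 // 4 = 20 bins,
# so the Reshape above flattens to 20 * 32 = 640 channels per time step.
def _demo_get_deepspeech2():
    model = get_deepspeech2(input_dim=80, output_dim=29, is_mixed_precision=False)
    model.summary()
    return model
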
Example #7
def change_policy(policy):
  from tensorflow.keras.mixed_precision import experimental as mixed_precision
  mixed_precision.set_policy(policy)
  return
def train_dealiaser(
        model_fun,
        model_kwargs,
        run_id,
        n_scales=0,
        multicoil=False,
        af=4,
        contrast=None,
        cuda_visible_devices='0123',
        n_samples=None,
        n_epochs=200,
        use_mixed_precision=False,
        loss='mae',
        original_run_id=None,
        fixed_masks=False,
        n_steps_per_epoch=973,
    ):
    # paths
    if multicoil:
        train_path = f'{FASTMRI_DATA_DIR}multicoil_train/'
        val_path = f'{FASTMRI_DATA_DIR}multicoil_val/'
    else:
        train_path = f'{FASTMRI_DATA_DIR}singlecoil_train/singlecoil_train/'
        val_path = f'{FASTMRI_DATA_DIR}singlecoil_val/'


    os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(cuda_visible_devices)
    af = int(af)

    # trying mixed precision
    if use_mixed_precision:
        policy_type = 'mixed_float16'
    else:
        policy_type = 'float32'
    policy = mixed_precision.Policy(policy_type)
    mixed_precision.set_policy(policy)
    # generators
    if multicoil:
        dataset = multicoil_dataset
        kwargs = {'parallel': False}
    else:
        dataset = singlecoil_dataset
        kwargs = {}
    train_set = dataset(
        train_path,
        AF=af,
        contrast=contrast,
        inner_slices=None,
        rand=True,
        scale_factor=1e6,
        n_samples=n_samples,
        fixed_masks=fixed_masks,
        **kwargs
    )
    val_set = dataset(
        val_path,
        AF=af,
        contrast=contrast,
        inner_slices=None,
        rand=True,
        scale_factor=1e6,
        **kwargs
    )

    additional_info = f'af{af}'
    if contrast is not None:
        additional_info += f'_{contrast}'
    if n_samples is not None:
        additional_info += f'_{n_samples}'
    if loss != 'mae':
        additional_info += f'_{loss}'
    if fixed_masks:
        additional_info += '_fixed_masks'

    run_id = f'{run_id}_{additional_info}_{int(time.time())}'
    chkpt_path = f'{CHECKPOINTS_DIR}checkpoints/{run_id}' + '-{epoch:02d}.hdf5'

    chkpt_cback = ModelCheckpoint(chkpt_path, period=n_epochs, save_weights_only=True)
    log_dir = op.join(f'{LOGS_DIR}logs', run_id)
    tboard_cback = TensorBoard(
        profile_batch=0,
        log_dir=log_dir,
        histogram_freq=0,
        write_graph=False,
        write_images=False,
    )
    tqdm_cback = TQDMProgressBar()

    model = MultiscaleComplex(
        model_fun=model_fun,
        model_kwargs=model_kwargs,
        res=False,
        n_scales=n_scales,
        fastmri_format=True,
    )
    if original_run_id is not None:
        lr = 1e-7
        n_steps = n_steps_per_epoch//2
    else:
        lr = 1e-4
        n_steps = n_steps_per_epoch
    default_model_compile(model, lr=lr, loss=loss)
    print(run_id)
    if original_run_id is not None:
        if os.environ.get('FASTMRI_DEBUG'):
            n_epochs_original = 1
        else:
            n_epochs_original = 250
        model(next(iter(train_set))[0])
        model.load_weights(f'{CHECKPOINTS_DIR}checkpoints/{original_run_id}-{n_epochs_original:02d}.hdf5')

    model.fit(
        train_set,
        steps_per_epoch=n_steps,
        epochs=n_epochs,
        validation_data=val_set,
        validation_steps=5,
        validation_freq=5,
        verbose=0,
        callbacks=[tboard_cback, chkpt_cback, tqdm_cback],
    )
    return run_id
Example #9
def main(logdir, config):
  logdir = pathlib.Path(logdir).expanduser()
  config.traindir = config.traindir or logdir / 'train_eps'
  config.evaldir = config.evaldir or logdir / 'eval_eps'
  config.steps //= config.action_repeat
  config.eval_every //= config.action_repeat
  config.log_every //= config.action_repeat
  config.time_limit //= config.action_repeat
  config.act = getattr(tf.nn, config.act)

  if config.debug:
    tf.config.experimental_run_functions_eagerly(True)
  if config.gpu_growth:
    message = 'No GPU found. To actually train on CPU remove this assert.'
    assert tf.config.experimental.list_physical_devices('GPU'), message
    for gpu in tf.config.experimental.list_physical_devices('GPU'):
      tf.config.experimental.set_memory_growth(gpu, True)
  assert config.precision in (16, 32), config.precision
  if config.precision == 16:
    prec.set_policy(prec.Policy('mixed_float16'))
  print('Logdir', logdir)
  logdir.mkdir(parents=True, exist_ok=True)
  config.traindir.mkdir(parents=True, exist_ok=True)
  config.evaldir.mkdir(parents=True, exist_ok=True)
  step = count_steps(config.traindir)
  logger = tools.Logger(logdir, config.action_repeat * step)

  print('Create envs.')
  if config.offline_traindir:
    directory = config.offline_traindir.format(**vars(config))
  else:
    directory = config.traindir
  train_eps = tools.load_episodes(directory, limit=config.dataset_size)
  if config.offline_evaldir:
    directory = config.offline_evaldir.format(**vars(config))
  else:
    directory = config.evaldir
  eval_eps = tools.load_episodes(directory, limit=1)
  make = lambda mode: make_env(config, logger, mode, train_eps, eval_eps)
  train_envs = [make('train') for _ in range(config.envs)]
  eval_envs = [make('eval') for _ in range(config.envs)]
  acts = train_envs[0].action_space
  config.num_actions = acts.n if hasattr(acts, 'n') else acts.shape[0]

  prefill = max(0, config.prefill - count_steps(config.traindir))
  print(f'Prefill dataset ({prefill} steps).')
  random_agent = lambda o, d, s: ([acts.sample() for _ in d], s)
  tools.simulate(random_agent, train_envs, prefill)
  tools.simulate(random_agent, eval_envs, episodes=1)
  logger.step = config.action_repeat * count_steps(config.traindir)

  print('Simulate agent.')
  train_dataset = make_dataset(train_eps, config)
  eval_dataset = iter(make_dataset(eval_eps, config))
  agent = Dreamer(config, logger, train_dataset)
  if (logdir / 'variables.pkl').exists():
    agent.load(logdir / 'variables.pkl')
    agent._should_pretrain._once = False

  state = None
  while agent._step.numpy().item() < config.steps:
    logger.write()
    print('Start evaluation.')
    video_pred = agent._wm.video_pred(next(eval_dataset))
    logger.video('eval_openl', video_pred)
    eval_policy = functools.partial(agent, training=False)
    tools.simulate(eval_policy, eval_envs, episodes=1)
    print('Start training.')
    state = tools.simulate(agent, train_envs, config.eval_every, state=state)
    agent.save(logdir / 'variables.pkl')
  for env in train_envs + eval_envs:
    try:
      env.close()
    except Exception:
      pass
def train_model(data_path,
                batch_size,
                image_size,
                crop_size,
                lr_schedule_name,
                init_lr,
                max_lr,
                weight_decay,
                optimizer,
                model_type,
                embedding_size,
                num_epochs,
                checkpoint_path,
                margin=0.5,
                cache_path=None,
                range_test=False,
                use_tpu=False,
                tpu_name=None,
                use_mixed_precision=False,
                distributed=False,
                eager_execution=False,
                weights_path='',
                checkpoint_interval=5000,
                step_size=6000,
                recompile=False,
                steps_per_epoch=None,
                logist_scale=64):

    if use_tpu is True:
        assert tpu_name is not None, '[ERROR] TPU name must be specified'
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            tpu=tpu_name)
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        strategy = tf.distribute.experimental.TPUStrategy(resolver)
        print("[INFO] TPUs: ", tf.config.list_logical_devices('TPU'))

    if use_mixed_precision is True:
        if use_tpu is True:
            policy = mixed_precision.Policy('mixed_bfloat16')
        else:
            policy = mixed_precision.Policy('mixed_float16')
        mixed_precision.set_policy(policy)
        print(
            "[INFO] Using mixed precision for training. This will reduce memory consumption\n"
        )

    if distributed is True and use_tpu is False:
        mirrored_strategy = tf.distribute.MirroredStrategy()
        print("[INFO] Using distributed training strategy on GPU")

    train_dataset, n_imgs, n_classes = generate_training_dataset(
        data_path=data_path,
        image_size=image_size,
        batch_size=batch_size,
        crop_size=crop_size,
        cache=cache_path,
        use_mixed_precision=use_mixed_precision,
        use_tpu=use_tpu,
        model_type=model_type)

    test_dataset = None

    run_eagerly = eager_execution if eager_execution is not None else False

    log_dir = './logs/log_' + datetime.now().strftime("%Y%m%d_%H%M%S")
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir,
                                                          update_freq=100,
                                                          write_graph=False)
    stop_on_nan = tf.keras.callbacks.TerminateOnNaN()

    loss_fn = SoftmaxLoss()

    if range_test is True:
        range_finder = RangeTestCallback(start_lr=init_lr,
                                         end_lr=max_lr,
                                         n_imgs=n_imgs,
                                         batch_size=batch_size)
        opt = get_optimizer(optimizer_name=optimizer,
                            lr_schedule=1e-5,
                            weight_decay=weight_decay)
        if use_tpu is True:
            with strategy.scope():
                model, compiled = create_neural_network(
                    model_type=model_type,
                    embedding_size=embedding_size,
                    weights_path=weights_path,
                    n_classes=n_classes,
                    recompile=recompile,
                    input_shape=[crop_size, crop_size, 3],
                    training=True,
                    margin=margin,
                    logist_scale=logist_scale)
                assert model is not None, '[ERROR] There was a problem while loading the pre-trained weights'
                if compiled is False:
                    print(
                        '[INFO] Recompiling model using passed optimizer and loss arguments'
                    )
                    model.compile(optimizer=opt,
                                  loss=loss_fn,
                                  run_eagerly=run_eagerly)
        elif distributed is True and use_tpu is False:
            with mirrored_strategy.scope():
                model, compiled = create_neural_network(
                    model_type=model_type,
                    embedding_size=embedding_size,
                    weights_path=weights_path,
                    n_classes=n_classes,
                    recompile=recompile,
                    input_shape=[crop_size, crop_size, 3],
                    training=True,
                    margin=margin,
                    logist_scale=logist_scale)
                opt = get_optimizer(
                    optimizer_name=optimizer,
                    lr_schedule=1e-5,
                    weight_decay=weight_decay
                )  # Optimizer must be created within scope!
                assert model is not None, '[ERROR] There was a problem while loading the pre-trained weights'
                if compiled is False:
                    print(
                        '[INFO] Recompiling model using passed optimizer and loss arguments'
                    )
                    model.compile(optimizer=opt,
                                  loss=loss_fn,
                                  run_eagerly=run_eagerly)
        else:
            model, compiled = create_neural_network(
                model_type=model_type,
                embedding_size=embedding_size,
                weights_path=weights_path,
                n_classes=n_classes,
                recompile=recompile,
                input_shape=[crop_size, crop_size, 3],
                training=True,
                margin=margin,
                logist_scale=logist_scale)
            assert model is not None, '[ERROR] There was a problem while loading the pre-trained weights'
            if compiled is False:
                print(
                    '[INFO] Recompiling model using passed optimizer and loss arguments'
                )
                model.compile(optimizer=opt,
                              loss=loss_fn,
                              run_eagerly=run_eagerly)

        callback_list = [range_finder, tensorboard_callback, stop_on_nan]

        train_history = model.fit(train_dataset,
                                  epochs=num_epochs,
                                  callbacks=callback_list)

        print(
            '\n[INFO] Training complete. Range test results can be found at "./range_test_result.png"'
        )

        return
    else:
        lr_schedule = get_learning_rate_schedule(
            schedule_name=lr_schedule_name,
            learning_rate=init_lr,
            max_lr=max_lr,
            image_count=n_imgs,
            batch_size=batch_size,
            step_size=step_size)
        opt = get_optimizer(optimizer_name=optimizer,
                            lr_schedule=lr_schedule,
                            weight_decay=weight_decay)

        if not os.path.exists(checkpoint_path):
            os.mkdir(checkpoint_path)

        #checkpoint_name = checkpoint_path + '/' + 'cp-{epoch:03d}.ckpt'
        model_saver = tf.keras.callbacks.ModelCheckpoint(
            filepath=os.path.join(checkpoint_path, 'full_model'),
            save_weights_only=False,
            monitor='val_loss',
            mode='min',
            save_best_only=False,
            save_freq=checkpoint_interval)
        weights_saver = tf.keras.callbacks.ModelCheckpoint(
            filepath=os.path.join(checkpoint_path, 'model_weights'),
            save_weights_only=True,
            monitor='val_loss',
            mode='min',
            save_best_only=False,
            save_freq=checkpoint_interval)
        if use_tpu is True:
            with strategy.scope():
                model, compiled = create_neural_network(
                    model_type=model_type,
                    embedding_size=embedding_size,
                    weights_path=weights_path,
                    n_classes=n_classes,
                    recompile=recompile,
                    input_shape=[crop_size, crop_size, 3],
                    training=True,
                    margin=margin,
                    logist_scale=logist_scale)
                assert model is not None, '[ERROR] There was a problem in loading the pre-trained weights'
                if compiled is False:
                    print(
                        '[INFO] Recompiling model using passed optimizer and loss arguments'
                    )
                    model.compile(optimizer=opt,
                                  loss=loss_fn,
                                  run_eagerly=run_eagerly)
        elif distributed is True and use_tpu is False:
            with mirrored_strategy.scope():
                model, compiled = create_neural_network(
                    model_type=model_type,
                    embedding_size=embedding_size,
                    weights_path=weights_path,
                    n_classes=n_classes,
                    recompile=recompile,
                    input_shape=[crop_size, crop_size, 3],
                    training=True,
                    margin=margin,
                    logist_scale=logist_scale)
                opt = get_optimizer(
                    optimizer_name=optimizer,
                    lr_schedule=lr_schedule,
                    weight_decay=weight_decay
                )  # Optimizer must be created within scope!
                assert model is not None, '[ERROR] There was a problem in loading the pre-trained weights'
                if compiled is False:
                    print(
                        '[INFO] Recompiling model using passed optimizer and loss arguments'
                    )
                    model.compile(optimizer=opt,
                                  loss=loss_fn,
                                  run_eagerly=run_eagerly)
        else:
            model, compiled = create_neural_network(
                model_type=model_type,
                embedding_size=embedding_size,
                weights_path=weights_path,
                n_classes=n_classes,
                recompile=recompile,
                input_shape=[crop_size, crop_size, 3],
                training=True,
                margin=margin,
                logist_scale=logist_scale)
            assert model is not None, '[ERROR] There was a problem in loading the pre-trained weights'
            if compiled is False:
                print(
                    '[INFO] Recompiling model using passed optimizer and loss arguments'
                )
                model.compile(optimizer=opt,
                              loss=loss_fn,
                              run_eagerly=run_eagerly)

        callback_list = [
            model_saver, weights_saver, tensorboard_callback, stop_on_nan
        ]

        train_history = model.fit(
            train_dataset,
            epochs=num_epochs,
            callbacks=callback_list,
            steps_per_epoch=None if steps_per_epoch == 0 else steps_per_epoch)

        if not os.path.exists('./results'):
            os.mkdir('./results')

        model_name = './results/model-' + datetime.now().strftime(
            "%Y%m%d-%H%M%S")
        model.save(model_name)
        print(
            '\n[INFO] Training complete. Saved model can be found in "./results"'
        )

        return
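
# Hedged usage sketch (hypothetical argument values, not from the original source):
def _example_train_model_call():
    train_model(data_path='./data/faces', batch_size=128, image_size=160, crop_size=140,
                lr_schedule_name='cosine', init_lr=1e-4, max_lr=1e-2, weight_decay=5e-4,
                optimizer='adam', model_type='resnet50', embedding_size=512,
                num_epochs=30, checkpoint_path='./checkpoints',
                use_mixed_precision=True)
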
Example #11
def get_quartznet(input_dim,
                  output_dim,
                  is_mixed_precision=False,
                  tflite_version=False,
                  num_b_block_repeats=3,
                  b_block_kernel_sizes=(33, 39, 51, 63, 75),
                  b_block_num_channels=(256, 256, 512, 512, 512),
                  num_small_blocks=5,
                  random_state=1) -> keras.Model:
    """
    Parameters
    ----------
    input_dim: input feature length
    output_dim: output feature length
    is_mixed_precision: if mixed precision model is needed
    tflite_version: if export to tflite is needed
    num_b_block_repeats: 1 is 5x5 quartznet, 2 is 10x5, 3 is 15x5
    b_block_kernel_sizes: iterable, kernel size of each b block
    b_block_num_channels: iterable, number of channels of each b block
    """
    assert len(b_block_kernel_sizes) == len(b_block_num_channels), \
        "Number of kernel sizes must equal the number of channel sizes"

    max_seq_length = None
    if tflite_version:
        max_seq_length = 5

    if is_mixed_precision:
        policy = mixed_precision.Policy('mixed_float16')
        mixed_precision.set_policy(policy)

    np.random.seed(random_state)
    tf.random.set_seed(random_state)

    with tf.device('/cpu:0'):
        input_tensor = layers.Input([max_seq_length, input_dim], name='X')

        x = layers.Masking()(input_tensor)
        # First encoder layer
        x = layers.SeparableConv1D(256,
                                   33,
                                   padding='same',
                                   strides=2,
                                   name='conv_1',
                                   use_bias=False)(x)
        x = layers.BatchNormalization(name='BN-1', momentum=0.9)(x)
        x = layers.ReLU(name='RELU-1')(x)

        block_idx = 1
        for kernel_size, n_channels in zip(b_block_kernel_sizes,
                                           b_block_num_channels):
            for bk in range(num_b_block_repeats):
                x = B_block(kernel_size, n_channels, num_small_blocks,
                            f'B-{block_idx}')(x)
                block_idx += 1

        # First final layer
        x = layers.SeparableConv1D(512,
                                   87,
                                   padding='same',
                                   name='conv_2',
                                   dilation_rate=2,
                                   use_bias=False)(x)
        x = layers.BatchNormalization(name='BN-2', momentum=0.9)(x)
        x = layers.ReLU(name='RELU-2')(x)

        # Second final layer
        x = layers.Conv1D(1024,
                          1,
                          padding='same',
                          name='conv_3',
                          use_bias=False)(x)
        x = layers.BatchNormalization(name='BN-3', momentum=0.9)(x)
        x = layers.ReLU(name='RELU-3')(x)

        # Third final layer
        x = layers.Conv1D(output_dim,
                          1,
                          padding='same',
                          dilation_rate=1,
                          name='conv_4')(x)
        model = keras.Model([input_tensor], x, name='QuartzNet')

    if is_mixed_precision:
        policy = mixed_precision.Policy('float32')
        mixed_precision.set_policy(policy)

    return model
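
# Hedged sketch (an assumption): `B_block` is not defined in this snippet. Following
# the QuartzNet paper, a B block is `num_small_blocks` repetitions of
# separable-conv -> batch-norm (-> ReLU), with a pointwise residual connection,
# roughly like this:
def _example_b_block(kernel_size, n_channels, num_small_blocks, name):
    def block(inputs):
        x = inputs
        for i in range(num_small_blocks):
            x = layers.SeparableConv1D(n_channels, kernel_size, padding='same',
                                       use_bias=False, name=f'{name}-conv-{i + 1}')(x)
            x = layers.BatchNormalization(name=f'{name}-BN-{i + 1}', momentum=0.9)(x)
            if i < num_small_blocks - 1:
                x = layers.ReLU(name=f'{name}-RELU-{i + 1}')(x)
        # Pointwise residual branch added before the final activation
        res = layers.Conv1D(n_channels, 1, use_bias=False, name=f'{name}-res')(inputs)
        res = layers.BatchNormalization(name=f'{name}-res-BN', momentum=0.9)(res)
        x = layers.Add(name=f'{name}-add')([x, res])
        return layers.ReLU(name=f'{name}-RELU-out')(x)
    return block
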
Example #12
    def init_network(self):

        # This function builds the compute graph.
        # Optionally, it can build a 'subset' graph if this mode is

        # Net construction:
        start = time.time()

        # Here, if using mixed precision, set a global policy:
        if self.args.run.precision == "mixed":
            from tensorflow.keras.mixed_precision import experimental as mixed_precision
            self.policy = mixed_precision.Policy('mixed_float16')
            mixed_precision.set_policy(self.policy)

        if self.args.run.precision == "bfloat16":
            from tensorflow.keras.mixed_precision import experimental as mixed_precision
            self.policy = mixed_precision.Policy('mixed_bfloat16')
            mixed_precision.set_policy(self.policy)

        #
        self._global_step = tf.Variable(0, dtype=tf.int64)

        # Add the dataformat for the network construction:

        # This sets up the necessary output shape:
        output_shape = self.larcv_fetcher.output_shape('primary')

        # Build the network object, forward pass only:
        # To initialize the network, we see what the name is
        # and act on that:
        if self.args.network.name == "resnet":
            if self.args.network.data_format == 'sparse':
                raise Exception("No sparse networks available in tensorflow")
            else:
                if self.args.dataset.dimension == 2:
                    from src.networks.tensorflow import resnet
                    self._net = resnet.ResNet(output_shape, self.args)
                else:
                    raise Exception("No Resnet3d Implemented!")
        elif self.args.network.name == "pointnet":
            if self.args.dataset.dimension == 2:
                from src.networks.tensorflow import pointnet
                self._net = pointnet.PointNet(output_shape, self.args)
            else:
                from src.networks.tensorflow import pointnet3d
                self._net = pointnet3d.PointNet(output_shape, self.args)
        elif self.args.network.name == "dgcnn":
            from src.networks.tensorflow import dgcnn
            self._net = dgcnn.DGCNN(output_shape, self.args)
        else:
            raise Exception(
                f"Couldn't identify network {self.args.network.name}")

        self._net.trainable = True

        # TO PROPERLY INITIALIZE THE NETWORK, NEED TO DO A FORWARD PASS
        minibatch_data = self.larcv_fetcher.fetch_next_batch("primary",
                                                             force_pop=False)
        minibatch_data = self.cast_input(minibatch_data)

        self.forward_pass(minibatch_data['image'], training=False)

        end = time.time()
        return end - start
Example #13
def main(args, yaml_path, config):
    tf.config.run_functions_eagerly(config['tensorflow']['eager'])

    from tfmodel.data import Dataset
    cds = config["dataset"]

    dataset_def = Dataset(num_input_features=int(cds["num_input_features"]),
                          num_output_features=int(cds["num_output_features"]),
                          padded_num_elem_size=int(
                              cds["padded_num_elem_size"]),
                          raw_path=cds["raw_path"],
                          processed_path=cds["processed_path"],
                          validation_file_path=cds["validation_file_path"],
                          schema=cds["schema"])

    if args.action == "data":
        dataset_def.process(config["dataset"]["num_files_per_chunk"])
        return

    global_batch_size = config['setup']['batch_size']

    model_name = os.path.splitext(os.path.basename(yaml_path))[0] + "-" + str(
        uuid.uuid4())[:8]
    print("model_name=", model_name)

    tfr_files = sorted(glob.glob(dataset_def.processed_path))
    if len(tfr_files) == 0:
        raise Exception("Could not find any files in {}".format(
            dataset_def.datapath))

    dataset = tf.data.TFRecordDataset(tfr_files).map(
        dataset_def.parse_tfr_element,
        num_parallel_calls=tf.data.experimental.AUTOTUNE)

    num_events = 0
    for i in dataset:
        num_events += 1
    print("dataset loaded, len={}".format(num_events))

    n_train = config['setup']['num_events_train']
    n_test = config['setup']['num_events_test']
    n_epochs = config['setup']['num_epochs']
    weight_func = weight_functions[config['setup']['sample_weights']]
    assert (n_train + n_test <= num_events)

    ps = (tf.TensorShape(
        [dataset_def.padded_num_elem_size, dataset_def.num_input_features]),
          tf.TensorShape([
              dataset_def.padded_num_elem_size, dataset_def.num_output_features
          ]), tf.TensorShape([
              dataset_def.padded_num_elem_size,
          ]))

    ds_train = dataset.take(n_train).map(weight_func).padded_batch(
        global_batch_size, padded_shapes=ps)
    ds_test = dataset.skip(n_train).take(n_test).map(weight_func).padded_batch(
        global_batch_size, padded_shapes=ps)

    #small test dataset used in the callback for making monitoring plots
    X_test = ds_test.take(100).map(lambda x, y, w: x)
    y_test = np.concatenate(
        list(
            ds_test.take(100).map(
                lambda x, y, w: tf.concat(y, axis=-1)).as_numpy_iterator()))

    ds_train_r = ds_train.repeat(n_epochs)
    ds_test_r = ds_test.repeat(n_epochs)

    weights = config['setup']['weights']
    if args.weights:
        weights = args.weights
    if weights is None:
        outdir = 'experiments/{}'.format(model_name)
        if os.path.isdir(outdir):
            print("Output directory exists: {}".format(outdir),
                  file=sys.stderr)
            sys.exit(1)
    else:
        outdir = os.path.dirname(weights)

    try:
        num_gpus = len(os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(","))
        print("num_gpus=", num_gpus)
        if num_gpus > 1:
            strategy = tf.distribute.MirroredStrategy()
            global_batch_size = num_gpus * global_batch_size
        else:
            strategy = tf.distribute.OneDeviceStrategy("gpu:0")
    except Exception as e:
        print("fallback to CPU", e)
        strategy = tf.distribute.OneDeviceStrategy("cpu")
        num_gpus = 0

    actual_lr = global_batch_size * float(config['setup']['lr'])

    Xs = []
    ygens = []
    ycands = []
    #for faster loading
    if args.action == "train":
        dataset_def.val_filelist = dataset_def.val_filelist[:1]

    for fi in dataset_def.val_filelist:
        X, ygen, ycand = dataset_def.prepare_data(fi)

        Xs.append(np.concatenate(X))
        ygens.append(np.concatenate(ygen))
        ycands.append(np.concatenate(ycand))

    X_val = np.concatenate(Xs)
    ygen_val = np.concatenate(ygens)
    ycand_val = np.concatenate(ycands)

    with strategy.scope():
        if config['setup']['dtype'] == 'float16':
            model_dtype = tf.dtypes.float16
            from tensorflow.keras.mixed_precision import experimental as mixed_precision
            policy = mixed_precision.Policy('mixed_float16')
            mixed_precision.set_policy(policy)

            opt = mixed_precision.LossScaleOptimizer(
                tf.keras.optimizers.Adam(learning_rate=actual_lr),
                loss_scale="dynamic")
        else:
            model_dtype = tf.dtypes.float32
            opt = tf.keras.optimizers.Adam(learning_rate=actual_lr)

        if args.action == "train" or args.action == "eval":
            model = make_model(config, model_dtype)

            loss_cls = PFNetLoss(
                num_input_classes=config["dataset"]["num_input_classes"],
                num_output_classes=config["dataset"]["num_output_classes"],
                momentum_loss_coefs=config["dataset"]["momentum_loss_coefs"])

            loss_fn = loss_cls.my_loss_full
            if config["setup"]["trainable"] == "cls":
                model.set_trainable_classification()
                loss_fn = loss_cls.my_loss_cls
            elif config["setup"]["trainable"] == "reg":
                model.set_trainable_regression()
                loss_fn = loss_cls.my_loss_reg

            #we use the "temporal" mode to have per-particle weights
            model.compile(loss=loss_fn,
                          optimizer=opt,
                          sample_weight_mode='temporal')

            #Evaluate model once to build the layers
            model(tf.cast(X_val[:1], model_dtype))
            model.summary()

            initial_epoch = 0
            if weights:
                model.load_weights(weights)
                initial_epoch = int(weights.split("/")[-1].split("-")[1])

            if args.action == "train":
                file_writer_cm = tf.summary.create_file_writer(outdir +
                                                               '/val_extra')
                callbacks = prepare_callbacks(
                    X_test, y_test, loss_cls, model, outdir,
                    config["dataset"]["num_input_classes"],
                    config["dataset"]["num_output_classes"], file_writer_cm)

                model.fit(ds_train_r,
                          validation_data=ds_test_r,
                          epochs=initial_epoch + n_epochs,
                          callbacks=callbacks,
                          steps_per_epoch=n_train // global_batch_size,
                          validation_steps=n_test // global_batch_size,
                          initial_epoch=initial_epoch)

                model.save(outdir + "/model_full", save_format="tf")

            if args.action == "eval":
                eval_model(X_val, ygen_val, ycand_val, model, config, outdir,
                           global_batch_size)
                freeze_model(model, config, outdir)

        if args.action == "time":
            synthetic_timing_data = []
            for iteration in range(config["timing"]["num_iter"]):
                numev = config["timing"]["num_ev"]
                for evsize in [
                        128 * 10, 128 * 20, 128 * 30, 128 * 40, 128 * 50,
                        128 * 60, 128 * 70, 128 * 80, 128 * 90, 128 * 100
                ]:
                    for batch_size in [1, 2, 3, 4]:
                        x = np.random.randn(
                            batch_size, evsize,
                            config["dataset"]["num_input_features"]).astype(
                                np.float32)

                        model = make_model(config, model_dtype)
                        model(x)

                        if weights:
                            model.load_weights(weights)

                        t0 = time.time()
                        for i in range(numev // batch_size):
                            model(x)
                        t1 = time.time()
                        dt = t1 - t0

                        time_per_event = 1000.0 * (dt / numev)
                        synthetic_timing_data.append([{
                            "iteration": iteration,
                            "batch_size": batch_size,
                            "event_size": evsize,
                            "time_per_event": time_per_event
                        }])
                        print(
                            "Synthetic random data: batch_size={} event_size={}, time={:.2f} ms/ev"
                            .format(batch_size, evsize, time_per_event))
            with open("{}/synthetic_timing.json".format(outdir), "w") as fi:
                json.dump(synthetic_timing_data, fi)
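The dtype branch above follows the usual recipe for the experimental mixed-precision API used in these examples: set a mixed_float16 policy before building the model and wrap the optimizer in a LossScaleOptimizer with dynamic loss scaling. A minimal self-contained sketch of that pattern, assuming TF 2.x with the experimental API (the toy model and random data are placeholders):

import tensorflow as tf
from tensorflow.keras.mixed_precision import experimental as mixed_precision

# Set the policy before building the model: layers then compute in float16
# while keeping float32 variables.
mixed_precision.set_policy(mixed_precision.Policy('mixed_float16'))

model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(16,)),
    # Keep the final layer in float32 so the loss is computed in full precision.
    tf.keras.layers.Dense(1, dtype='float32'),
])

# Dynamic loss scaling multiplies the loss before backprop and unscales the
# gradients afterwards, preventing float16 underflow.
opt = mixed_precision.LossScaleOptimizer(
    tf.keras.optimizers.Adam(learning_rate=1e-3), loss_scale='dynamic')

model.compile(optimizer=opt, loss='mse')
model.fit(tf.random.normal((256, 16)), tf.random.normal((256, 1)),
          batch_size=32, epochs=1, verbose=0)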
Example #14
def train_ncnet(
    model,
    run_id=None,
    multicoil=False,
    three_d=False,
    acq_type='radial',
    scale_factor=1e6,
    dcomp=False,
    contrast=None,
    cuda_visible_devices='0123',
    n_samples=None,
    n_epochs=200,
    use_mixed_precision=False,
    loss='mae',
    original_run_id=None,
    checkpoint_epoch=0,
    save_state=False,
    lr=1e-4,
    **acq_kwargs,
):
    # paths
    n_volumes_train = n_volumes_train_fastmri
    if multicoil:
        train_path = f'{FASTMRI_DATA_DIR}multicoil_train/'
        val_path = f'{FASTMRI_DATA_DIR}multicoil_val/'
    elif three_d:
        train_path = f'{OASIS_DATA_DIR}/train/'
        val_path = f'{OASIS_DATA_DIR}/val/'
        n_volumes_train = n_volumes_train_oasis
    else:
        train_path = f'{FASTMRI_DATA_DIR}singlecoil_train/singlecoil_train/'
        val_path = f'{FASTMRI_DATA_DIR}singlecoil_val/'

    os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(cuda_visible_devices)

    # trying mixed precision
    if use_mixed_precision:
        policy_type = 'mixed_float16'
    else:
        policy_type = 'float32'
    policy = mixed_precision.Policy(policy_type)
    mixed_precision.set_policy(policy)
    # generators
    if multicoil:
        dataset = multicoil_dataset
        image_size = IM_SIZE
    elif three_d:
        dataset = three_d_dataset
        image_size = VOLUME_SIZE
    else:
        dataset = singlecoil_dataset
        image_size = IM_SIZE
    if not three_d:
        add_kwargs = {
            'contrast': contrast,
            'rand': True,
            'inner_slices': None,
        }
    else:
        add_kwargs = {}
    add_kwargs.update(**acq_kwargs)
    train_set = dataset(train_path,
                        image_size,
                        acq_type=acq_type,
                        compute_dcomp=dcomp,
                        scale_factor=scale_factor,
                        n_samples=n_samples,
                        **add_kwargs)
    val_set = dataset(val_path,
                      image_size,
                      acq_type=acq_type,
                      compute_dcomp=dcomp,
                      scale_factor=scale_factor,
                      **add_kwargs)

    additional_info = f'{acq_type}'
    if contrast is not None:
        additional_info += f'_{contrast}'
    if n_samples is not None:
        additional_info += f'_{n_samples}'
    if loss != 'mae':
        additional_info += f'_{loss}'
    if dcomp:
        additional_info += '_dcomp'
    if checkpoint_epoch == 0:
        run_id = f'{run_id}_{additional_info}_{int(time.time())}'
    else:
        run_id = original_run_id
    final_epoch = checkpoint_epoch + n_epochs
    chkpt_path = f'{CHECKPOINTS_DIR}checkpoints/{run_id}' + '-{epoch:02d}.hdf5'

    log_dir = op.join(f'{LOGS_DIR}logs', run_id)
    tboard_cback = TensorBoard(
        profile_batch=0,
        log_dir=log_dir,
        histogram_freq=0,
        write_graph=False,
        write_images=False,
    )
    tqdm_cback = TQDMProgressBar()

    n_steps = n_volumes_train

    chkpt_cback = ModelCheckpointWorkAround(
        chkpt_path,
        save_freq=int(n_epochs * n_steps),
        save_weights_only=True,
    )
    default_model_compile(model, lr=lr, loss=loss)
    # first run of the model to avoid the saving error
    # ValueError: as_list() is not defined on an unknown TensorShape.
    # it can also allow loading of weights
    model(next(iter(train_set))[0])
    if not checkpoint_epoch == 0:
        model.load_weights(
            f'{CHECKPOINTS_DIR}checkpoints/{original_run_id}-{checkpoint_epoch:02d}.hdf5'
        )
        grad_vars = model.trainable_weights
        zero_grads = [tf.zeros_like(w) for w in grad_vars]
        model.optimizer.apply_gradients(zip(zero_grads, grad_vars))
        with open(
                f'{CHECKPOINTS_DIR}checkpoints/{original_run_id}-optimizer.pkl',
                'rb') as f:
            weight_values = pickle.load(f)
        model.optimizer.set_weights(weight_values)
    print(run_id)

    model.fit(
        train_set,
        steps_per_epoch=n_steps,
        initial_epoch=checkpoint_epoch,
        epochs=final_epoch,
        validation_data=val_set,
        validation_steps=2,
        verbose=0,
        callbacks=[tboard_cback, chkpt_cback, tqdm_cback],
    )
    if save_state:
        symbolic_weights = getattr(model.optimizer, 'weights')
        weight_values = K.batch_get_value(symbolic_weights)
        with open(f'{CHECKPOINTS_DIR}checkpoints/{run_id}-optimizer.pkl',
                  'wb') as f:
            pickle.dump(weight_values, f)
    return run_id
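Restoring the optimizer above relies on a common workaround: Keras optimizers create their slot variables lazily, so a dummy apply_gradients call with zero gradients is issued first, after which the pickled slot values can be loaded with set_weights. A minimal sketch of the same trick on a toy model (the 'optimizer.pkl' path is a placeholder):

import pickle
import tensorflow as tf
from tensorflow.keras import backend as K

def build_model():
    m = tf.keras.Sequential([tf.keras.layers.Dense(4, input_shape=(8,))])
    m.compile(optimizer=tf.keras.optimizers.Adam(1e-3), loss='mse')
    return m

# Saving side: after training, dump the optimizer slot values.
model = build_model()
model.fit(tf.zeros((16, 8)), tf.zeros((16, 4)), verbose=0)
with open('optimizer.pkl', 'wb') as f:
    pickle.dump(K.batch_get_value(model.optimizer.weights), f)

# Restoring side: slots are created lazily, so apply zero gradients once
# before calling set_weights, as in the checkpoint branch above.
restored = build_model()
restored(tf.zeros((1, 8)))
grad_vars = restored.trainable_weights
restored.optimizer.apply_gradients(
    zip([tf.zeros_like(w) for w in grad_vars], grad_vars))
with open('optimizer.pkl', 'rb') as f:
    restored.optimizer.set_weights(pickle.load(f))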
Example #15
def main(argv):
    # set fixed random seed, load config files
    tf.random.set_seed(RANDOM_SEED)

    # using mix precision or not
    if MIXPRECISION:
        policy = mixed_precision.Policy('mixed_float16')
        mixed_precision.set_policy(policy)

    # get params for model
    train_iter, input_size, num_cls, lrs_schedule_params, loss_params, parser_params, model_params = get_params(
        FLAGS.name)

    # -----------------------------------------------------------------
    # set up Grappler for graph optimization
    # Ref: https://www.tensorflow.org/guide/graph_optimization
    @contextlib.contextmanager
    def options(opts):
        old_opts = tf.config.optimizer.get_experimental_options()
        tf.config.optimizer.set_experimental_options(opts)
        try:
            yield
        finally:
            tf.config.optimizer.set_experimental_options(old_opts)

    # -----------------------------------------------------------------
    # Creating the instance of the model specified.
    logging.info("Creating the model instance of YOLACT")
    model = Yolact(**model_params)

    # add weight decay
    for layer in model.layers:
        if isinstance(layer, (tf.keras.layers.Conv2D, tf.keras.layers.Dense)):
            # Bind `layer` as a default argument so each closure keeps its
            # own layer instead of the last one visited by the loop.
            layer.add_loss(lambda layer=layer: tf.keras.regularizers.l2(
                FLAGS.weight_decay)(layer.kernel))
        if hasattr(layer, 'bias_regularizer') and layer.use_bias:
            layer.add_loss(lambda layer=layer: tf.keras.regularizers.l2(
                FLAGS.weight_decay)(layer.bias))

    # -----------------------------------------------------------------
    # Creating dataloaders for training and validation
    logging.info("Creating the dataloader from: %s..." % FLAGS.tfrecord_dir)
    dataset = ObjectDetectionDataset(dataset_name=FLAGS.name,
                                     tfrecord_dir=os.path.join(
                                         FLAGS.tfrecord_dir, FLAGS.name),
                                     anchor_instance=model.anchor_instance,
                                     **parser_params)
    train_dataset = dataset.get_dataloader(subset='train',
                                           batch_size=FLAGS.batch_size)
    valid_dataset = dataset.get_dataloader(subset='val', batch_size=1)
    # count the number of validation samples for the progress bar
    # TODO: any better way to do this?
    num_val = 0
    for _ in valid_dataset:
        num_val += 1
    # -----------------------------------------------------------------
    # Choose the optimizer, loss function, metrics, and learning rate schedule
    lr_schedule = learning_rate_schedule.Yolact_LearningRateSchedule(
        **lrs_schedule_params)
    logging.info("Initiate the Optimizer and Loss function...")
    optimizer = tf.keras.optimizers.SGD(learning_rate=lr_schedule,
                                        momentum=FLAGS.momentum)
    criterion = loss_yolact.YOLACTLoss(**loss_params)
    train_loss = tf.keras.metrics.Mean('train_loss', dtype=tf.float32)
    loc = tf.keras.metrics.Mean('loc_loss', dtype=tf.float32)
    conf = tf.keras.metrics.Mean('conf_loss', dtype=tf.float32)
    mask = tf.keras.metrics.Mean('mask_loss', dtype=tf.float32)
    seg = tf.keras.metrics.Mean('seg_loss', dtype=tf.float32)
    # -----------------------------------------------------------------

    # Setup the TensorBoard for better visualization
    # Ref: https://www.tensorflow.org/tensorboard/get_started
    logging.info("Setup the TensorBoard...")
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    train_log_dir = './logs/gradient_tape/' + current_time + '/train'
    test_log_dir = './logs/gradient_tape/' + current_time + '/test'
    train_summary_writer = tf.summary.create_file_writer(train_log_dir)
    test_summary_writer = tf.summary.create_file_writer(test_log_dir)

    # -----------------------------------------------------------------
    # Start the Training and Validation Process
    logging.info("Start the training process...")

    # setup checkpoints manager
    checkpoint = tf.train.Checkpoint(step=tf.Variable(1),
                                     optimizer=optimizer,
                                     model=model)
    manager = tf.train.CheckpointManager(checkpoint,
                                         directory="./checkpoints",
                                         max_to_keep=5)
    # restore from latest checkpoint and iteration
    status = checkpoint.restore(manager.latest_checkpoint)
    if manager.latest_checkpoint:
        logging.info("Restored from {}".format(manager.latest_checkpoint))
    else:
        logging.info("Initializing from scratch.")

    best_masks_map = 0.
    iterations = checkpoint.step.numpy()

    for image, labels in train_dataset:
        # check iteration and change the learning rate
        if iterations > train_iter:
            break

        checkpoint.step.assign_add(1)
        iterations += 1
        with options({
                'constant_folding': True,
                'layout_optimizer': True,
                'loop_optimization': True,
                'arithmetic_optimization': True,
                'remapping': True
        }):
            loc_loss, conf_loss, mask_loss, seg_loss = train_step(
                model, criterion, train_loss, optimizer, image, labels,
                num_cls)
        loc.update_state(loc_loss)
        conf.update_state(conf_loss)
        mask.update_state(mask_loss)
        seg.update_state(seg_loss)
        with train_summary_writer.as_default():
            tf.summary.scalar('Total loss',
                              train_loss.result(),
                              step=iterations)
            tf.summary.scalar('Loc loss', loc.result(), step=iterations)
            tf.summary.scalar('Conf loss', conf.result(), step=iterations)
            tf.summary.scalar('Mask loss', mask.result(), step=iterations)
            tf.summary.scalar('Seg loss', seg.result(), step=iterations)

        if iterations and iterations % FLAGS.print_interval == 0:
            tf.print(
                "Iteration {}, LR: {}, Total Loss: {}, B: {},  C: {}, M: {}, S:{} "
                .format(iterations,
                        optimizer._decayed_lr(var_dtype=tf.float32),
                        train_loss.result(), loc.result(), conf.result(),
                        mask.result(), seg.result()))

        if iterations and iterations % FLAGS.save_interval == 0:
            # save checkpoint
            save_path = manager.save()
            logging.info("Saved checkpoint for step {}: {}".format(
                int(checkpoint.step), save_path))

            # validation and print mAP table
            all_map = evaluate(model, valid_dataset, num_val, num_cls)
            box_map, mask_map = all_map['box']['all'], all_map['mask']['all']
            tf.print(f"box mAP:{box_map}, mask mAP:{mask_map}")

            with test_summary_writer.as_default():
                tf.summary.scalar('Box mAP', box_map, step=iterations)
                tf.summary.scalar('Mask mAP', mask_map, step=iterations)

            # Saving the weights:
            if mask_map > best_masks_map:
                best_masks_map = mask_map
                model.save_weights(
                    f'{FLAGS.weights}/weights_{FLAGS.name}_{str(best_masks_map)}.h5'
                )

            # reset the metrics
            train_loss.reset_states()
            loc.reset_states()
            conf.reset_states()
            mask.reset_states()
            seg.reset_states()
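The weight-decay loop above attaches an L2 penalty to each Conv2D/Dense layer through add_loss; because the penalty is a closure over the loop variable, the layer has to be bound as a default argument (as done above), otherwise every closure would end up pointing at the last layer. A minimal sketch of the pattern on a toy model (the weight_decay value is a placeholder):

import tensorflow as tf

weight_decay = 5e-4  # placeholder value

model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(8, 3, input_shape=(32, 32, 3)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(10),
])

for layer in model.layers:
    if isinstance(layer, (tf.keras.layers.Conv2D, tf.keras.layers.Dense)):
        # `layer=layer` freezes the current layer; a plain `lambda:` would be
        # evaluated later and always see the last layer of the loop.
        layer.add_loss(
            lambda layer=layer: tf.keras.regularizers.l2(weight_decay)
            (layer.kernel))

# The penalties appear in model.losses and are added to the training loss.
print(len(model.losses))  # one entry per Conv2D/Dense layer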
Example #16
def main():

  configs = yaml.safe_load((
      pathlib.Path(sys.argv[0]).parent / 'configs.yaml').read_text())
  parsed, remaining = common.Flags(configs=['defaults']).parse(known_only=True)
  config = common.Config(configs['defaults'])
  for name in parsed.configs:
    config = config.update(configs[name])
  config = common.Flags(config).parse(remaining)

  logdir = pathlib.Path(config.logdir).expanduser()
  logdir.mkdir(parents=True, exist_ok=True)
  config.save(logdir / 'config.yaml')
  print(config, '\n')
  print('Logdir', logdir)

  import tensorflow as tf
  tf.config.experimental_run_functions_eagerly(not config.jit)
  message = 'No GPU found. To actually train on CPU remove this assert.'
  assert tf.config.experimental.list_physical_devices('GPU'), message
  for gpu in tf.config.experimental.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu, True)
  assert config.precision in (16, 32), config.precision
  if config.precision == 16:
    from tensorflow.keras.mixed_precision import experimental as prec
    prec.set_policy(prec.Policy('mixed_float16'))

  train_replay = common.Replay(logdir / 'train_episodes', **config.replay)
  eval_replay = common.Replay(logdir / 'eval_episodes', **dict(
      capacity=config.replay.capacity // 10,
      minlen=config.dataset.length,
      maxlen=config.dataset.length))
  step = common.Counter(train_replay.stats['total_steps'])
  outputs = [
      common.TerminalOutput(),
      common.JSONLOutput(logdir),
      common.TensorBoardOutput(logdir),
  ]
  logger = common.Logger(step, outputs, multiplier=config.action_repeat)
  metrics = collections.defaultdict(list)

  should_train = common.Every(config.train_every)
  should_log = common.Every(config.log_every)
  should_video_train = common.Every(config.eval_every)
  should_video_eval = common.Every(config.eval_every)
  should_expl = common.Until(config.expl_until // config.action_repeat)

  def make_env(mode):
    suite, task = config.task.split('_', 1)
    if suite == 'dmc':
      env = common.DMC(
          task, config.action_repeat, config.render_size, config.dmc_camera)
      env = common.NormalizeAction(env)
    elif suite == 'atari':
      env = common.Atari(
          task, config.action_repeat, config.render_size,
          config.atari_grayscale)
      env = common.OneHotAction(env)
    elif suite == 'crafter':
      assert config.action_repeat == 1
      outdir = logdir / 'crafter' if mode == 'train' else None
      reward = bool(['noreward', 'reward'].index(task)) or mode == 'eval'
      env = common.Crafter(outdir, reward)
      env = common.OneHotAction(env)
    else:
      raise NotImplementedError(suite)
    env = common.TimeLimit(env, config.time_limit)
    return env

  def per_episode(ep, mode):
    length = len(ep['reward']) - 1
    score = float(ep['reward'].astype(np.float64).sum())
    print(f'{mode.title()} episode has {length} steps and return {score:.1f}.')
    logger.scalar(f'{mode}_return', score)
    logger.scalar(f'{mode}_length', length)
    for key, value in ep.items():
      if re.match(config.log_keys_sum, key):
        logger.scalar(f'sum_{mode}_{key}', ep[key].sum())
      if re.match(config.log_keys_mean, key):
        logger.scalar(f'mean_{mode}_{key}', ep[key].mean())
      if re.match(config.log_keys_max, key):
        logger.scalar(f'max_{mode}_{key}', ep[key].max(0).mean())
    should = {'train': should_video_train, 'eval': should_video_eval}[mode]
    if should(step):
      for key in config.log_keys_video:
        logger.video(f'{mode}_policy_{key}', ep[key])
    replay = dict(train=train_replay, eval=eval_replay)[mode]
    logger.add(replay.stats, prefix=mode)
    logger.write()

  print('Create envs.')
  num_eval_envs = min(config.envs, config.eval_eps)
  if config.envs_parallel == 'none':
    train_envs = [make_env('train') for _ in range(config.envs)]
    eval_envs = [make_env('eval') for _ in range(num_eval_envs)]
  else:
    make_async_env = lambda mode: common.Async(
        functools.partial(make_env, mode), config.envs_parallel)
    train_envs = [make_async_env('train') for _ in range(config.envs)]
    eval_envs = [make_async_env('eval') for _ in range(num_eval_envs)]
  act_space = train_envs[0].act_space
  obs_space = train_envs[0].obs_space
  train_driver = common.Driver(train_envs)
  train_driver.on_episode(lambda ep: per_episode(ep, mode='train'))
  train_driver.on_step(lambda tran, worker: step.increment())
  train_driver.on_step(train_replay.add_step)
  train_driver.on_reset(train_replay.add_step)
  eval_driver = common.Driver(eval_envs)
  eval_driver.on_episode(lambda ep: per_episode(ep, mode='eval'))
  eval_driver.on_episode(eval_replay.add_episode)

  prefill = max(0, config.prefill - train_replay.stats['total_steps'])
  if prefill:
    print(f'Prefill dataset ({prefill} steps).')
    random_agent = common.RandomAgent(act_space)
    train_driver(random_agent, steps=prefill, episodes=1)
    eval_driver(random_agent, episodes=1)
    train_driver.reset()
    eval_driver.reset()

  print('Create agent.')
  train_dataset = iter(train_replay.dataset(**config.dataset))
  report_dataset = iter(train_replay.dataset(**config.dataset))
  eval_dataset = iter(eval_replay.dataset(**config.dataset))
  agnt = agent.Agent(config, obs_space, act_space, step)
  train_agent = common.CarryOverState(agnt.train)
  train_agent(next(train_dataset))
  if (logdir / 'variables.pkl').exists():
    agnt.load(logdir / 'variables.pkl')
  else:
    print('Pretrain agent.')
    for _ in range(config.pretrain):
      train_agent(next(train_dataset))
  train_policy = lambda *args: agnt.policy(
      *args, mode='explore' if should_expl(step) else 'train')
  eval_policy = lambda *args: agnt.policy(*args, mode='eval')

  def train_step(tran, worker):
    if should_train(step):
      for _ in range(config.train_steps):
        mets = train_agent(next(train_dataset))
        [metrics[key].append(value) for key, value in mets.items()]
    if should_log(step):
      for name, values in metrics.items():
        logger.scalar(name, np.array(values, np.float64).mean())
        metrics[name].clear()
      logger.add(agnt.report(next(report_dataset)), prefix='train')
      logger.write(fps=True)
  train_driver.on_step(train_step)

  while step < config.steps:
    logger.write()
    print('Start evaluation.')
    logger.add(agnt.report(next(eval_dataset)), prefix='eval')
    eval_driver(eval_policy, episodes=config.eval_eps)
    print('Start training.')
    train_driver(train_policy, steps=config.eval_every)
    agnt.save(logdir / 'variables.pkl')
  for env in train_envs + eval_envs:
    try:
      env.close()
    except Exception:
      pass
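The loop above is paced by Every/Until helpers from the accompanying common module, whose implementation is not shown here. As an assumption about their behaviour, an Every-like helper that fires at most once per interval could look like this minimal sketch:

class Every:
    """Fire at most once every `every` steps (a simplified sketch)."""

    def __init__(self, every):
        self._every = every
        self._last = None

    def __call__(self, step):
        step = int(step)
        if not self._every:
            return False
        if self._last is None or step >= self._last + self._every:
            self._last = step
            return True
        return False

should_log = Every(5)
print([s for s in range(12) if should_log(s)])  # [0, 5, 10]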
Example #17
def train(strategy, cfg):
    os.makedirs(cfg.MODEL.SAVE_DIR, exist_ok=True)

    if cfg.DATASET.BFLOAT16:
        policy = mixed_precision.Policy('mixed_bfloat16')
        mixed_precision.set_policy(policy)

    tf.random.set_seed(cfg.TRAIN.SEED)
    np.random.seed(cfg.TRAIN.SEED)

    meta_data = {'train_loss': [], 'val_loss': [], 'config': cfg}

    spe = int(np.ceil(cfg.DATASET.TRAIN_SAMPLES / cfg.TRAIN.BATCH_SIZE))
    spv = cfg.DATASET.VAL_SAMPLES // cfg.VAL.BATCH_SIZE

    if cfg.TRAIN.SCALE_LR:
        lr = cfg.TRAIN.BASE_LR * cfg.TRAIN.BATCH_SIZE / 32
        cfg.TRAIN.WARMUP_FACTOR = 32 / cfg.TRAIN.BATCH_SIZE
    else:
        lr = cfg.TRAIN.BASE_LR

    if cfg.TRAIN.LR_SCHEDULE == 'warmup_cosine_decay':
        lr_schedule = WarmupCosineDecay(initial_learning_rate=lr,
                                        decay_steps=cfg.TRAIN.EPOCHS * spe,
                                        warmup_steps=cfg.TRAIN.WARMUP_EPOCHS *
                                        spe,
                                        warmup_factor=cfg.TRAIN.WARMUP_FACTOR)
    elif cfg.TRAIN.LR_SCHEDULE == 'warmup_piecewise':
        lr_schedule = WarmupPiecewise(
            boundaries=[x * spe for x in cfg.TRAIN.DECAY_EPOCHS],
            values=[lr, lr / 10, lr / 10**2],
            warmup_steps=spe * cfg.TRAIN.WARMUP_EPOCHS,
            warmup_factor=cfg.TRAIN.WARMUP_FACTOR)
    else:
        lr_schedule = lr

    with strategy.scope():
        optimizer = tf.keras.optimizers.Adam(lr_schedule)
        if cfg.MODEL.TYPE == 'simple_baseline':
            model = SimpleBaseline(cfg)
        elif cfg.MODEL.TYPE == 'hrnet':
            model = HRNet(cfg)
        elif cfg.MODEL.TYPE == 'evopose':
            model = EvoPose(cfg)
        train_loss = tf.keras.metrics.Mean()
        val_loss = tf.keras.metrics.Mean()

    cfg.DATASET.OUTPUT_SHAPE = model.output_shape[1:]
    cfg.DATASET.SIGMA = 2 * cfg.DATASET.OUTPUT_SHAPE[0] / 64

    meta_data['parameters'] = model.count_params()
    meta_data['flops'] = get_flops(model)

    train_ds = load_tfds(cfg, 'train')
    train_ds = strategy.experimental_distribute_dataset(train_ds)
    train_iterator = iter(train_ds)

    if cfg.TRAIN.VAL:
        val_ds = load_tfds(cfg, 'val')
        val_ds = strategy.experimental_distribute_dataset(val_ds)

    @tf.function
    def train_step(train_iterator):
        def step_fn(inputs):
            imgs, targets, valid = inputs
            with tf.GradientTape() as tape:
                loss, l2_loss = mse_loss(model,
                                         imgs,
                                         targets,
                                         valid,
                                         training=True)
                scaled_loss = (loss + l2_loss) / strategy.num_replicas_in_sync
            grads = tape.gradient(scaled_loss, model.trainable_variables)
            optimizer.apply_gradients(
                list(zip(grads, model.trainable_variables)))
            train_loss.update_state(loss)

        strategy.run(step_fn, args=(next(train_iterator), ))

    @tf.function
    def val_step(dist_inputs):
        def step_fn(inputs):
            imgs, targets, valid = inputs
            loss, _ = mse_loss(model, imgs, targets, valid, training=False)
            val_loss.update_state(loss)

        strategy.run(step_fn, args=(dist_inputs, ))

    print('Training {} ({:.2f}M / {:.2f}G) on {} for {} epochs'.format(
        cfg.MODEL.NAME, meta_data['parameters'] / 1e6,
        meta_data['flops'] / 2 / 1e9, cfg.TRAIN.ACCELERATOR, cfg.TRAIN.EPOCHS))

    epoch = 1
    ts = time()
    while epoch <= cfg.TRAIN.EPOCHS:
        te = time()
        for i in range(spe):
            train_step(train_iterator)
            if cfg.TRAIN.DISP:
                print('epoch {} ({}/{}) | loss: {:.1f}'.format(
                    epoch, i + 1, spe,
                    train_loss.result().numpy()))
        meta_data['train_loss'].append(train_loss.result().numpy())

        if cfg.TRAIN.VAL:
            for i, batch in enumerate(val_ds):
                val_step(batch)
                if cfg.TRAIN.DISP:
                    print('val {} ({}/{}) | loss: {:.1f}'.format(
                        epoch, i + 1, spv,
                        val_loss.result().numpy()))
            meta_data['val_loss'].append(val_loss.result().numpy())

            if cfg.VAL.SAVE_BEST:
                if epoch == 1:
                    best_weights = model.get_weights()
                    best_loss = val_loss.result().numpy()
                    if cfg.TRAIN.DISP:
                        print('Cached model weights')
                elif val_loss.result().numpy() < best_loss:
                    best_weights = model.get_weights()
                    best_loss = val_loss.result().numpy()
                    if cfg.TRAIN.DISP:
                        print('Cached model weights')

        train_loss.reset_states()
        val_loss.reset_states()

        if cfg.TRAIN.SAVE_EPOCHS and epoch % cfg.TRAIN.SAVE_EPOCHS == 0:
            model.save(osp.join(
                cfg.MODEL.SAVE_DIR,
                '{}_ckpt{:03d}.h5'.format(cfg.MODEL.NAME, epoch)),
                       save_format='h5')
            print(
                'Saved checkpoint to',
                osp.join(cfg.MODEL.SAVE_DIR,
                         '{}_ckpt{:03d}.h5'.format(cfg.MODEL.NAME, epoch)))

        if cfg.TRAIN.SAVE_META:
            pickle.dump(
                meta_data,
                open(
                    osp.join(cfg.MODEL.SAVE_DIR,
                             '{}_meta.pkl'.format(cfg.MODEL.NAME)), 'wb'))

        if epoch > 1 and cfg.TRAIN.DISP:
            est_time = (cfg.TRAIN.EPOCHS - epoch) * (time() - te) / 3600
            print('Estimated time remaining: {:.2f} hrs'.format(est_time))

        epoch += 1

    meta_data['training_time'] = time() - ts

    if cfg.VAL.SAVE_BEST:
        model.set_weights(best_weights)

    return model, meta_data
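In train_step above, the loss computed on each replica is divided by strategy.num_replicas_in_sync so that the gradients summed across replicas correspond to the mean over the global batch. A minimal sketch of that pattern with a toy regression model (the data is random placeholder input):

import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
    model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])
    optimizer = tf.keras.optimizers.Adam(1e-3)

dataset = tf.data.Dataset.from_tensor_slices(
    (tf.random.normal((64, 4)), tf.random.normal((64, 1)))).batch(8)
dist_dataset = strategy.experimental_distribute_dataset(dataset)

@tf.function
def train_step(dist_inputs):
    def step_fn(inputs):
        x, y = inputs
        with tf.GradientTape() as tape:
            loss = tf.reduce_mean(tf.square(model(x, training=True) - y))
            # Scale by the replica count so the summed gradients match the
            # mean loss over the global batch.
            scaled_loss = loss / strategy.num_replicas_in_sync
        grads = tape.gradient(scaled_loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

    strategy.run(step_fn, args=(dist_inputs,))

for batch in dist_dataset:
    train_step(batch)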
Example #18
    def init_network(self):

        # This function builds the compute graph.
        # Optionally, it can build a 'subset' graph if this mode is

        # Net construction:
        start = time.time()

        # Here, if using mixed precision, set a global policy:
        if self.args.precision == "mixed":
            from tensorflow.keras.mixed_precision import experimental as mixed_precision
            self.policy = mixed_precision.Policy('mixed_float16')
            mixed_precision.set_policy(self.policy)

        batch_dims = self.larcv_fetcher.batch_dims(1)

        # We compute the batch dimensions using the local batch size:
        batch_dims[0] = self.local_batch_size()

        # We have to make placeholders for input objects:

        self._input = {
            'image':
            tf.compat.v1.placeholder(floating_point_format,
                                     batch_dims,
                                     name="input_image"),
            'label':
            tf.compat.v1.placeholder(integer_format,
                                     batch_dims,
                                     name="input_label"),
            'io_time':
            tf.compat.v1.placeholder(floating_point_format, (),
                                     name="io_fetch_time")
        }

        # Build the network object, forward pass only:

        if self.args.conv_mode == '2D':
            self._net = uresnet2D.UResNet(self.args)
        else:
            self._net = uresnet3D.UResNet3D(self.args)

        self._net.trainable = True

        self._logits = self._net(self._input['image'],
                                 training=self.args.training)

        # If channels first, need to permute the logits:
        if self._channels_dim == 1:
            permutation = tf.keras.layers.Permute((2, 3, 1))
            self._loss_logits = [permutation(l) for l in self._logits]
        else:
            self._loss_logits = self._logits

        # Used to accumulate gradients over several iterations:
        with tf.compat.v1.variable_scope("gradient_accumulation"):
            self._accum_vars = [
                tf.Variable(tv.initialized_value(), trainable=False)
                for tv in tf.compat.v1.trainable_variables()
            ]

        if self.args.mode == "train" or self.args.mode == "inference":

            # Here, if the data format is channels_first, we have to reorder the logits tensors
            # To put channels last.  Otherwise it does not work with the softmax tensors.

            # Apply a softmax and argmax:
            self._output = dict()

            # Take the logits (which are one per plane) and create a softmax and prediction (one per plane)
            with tf.compat.v1.variable_scope("prediction"):
                self._output['prediction'] = [
                    tf.argmax(x, axis=self._channels_dim) for x in self._logits
                ]

            with tf.compat.v1.variable_scope("cross_entropy"):
                self.loss_calculator = LossCalculator.LossCalculator(
                    self.args.loss_balance_scheme, self._channels_dim)

                self._input['split_labels'] = [
                    tf.squeeze(l, axis=self._channels_dim) for l in tf.split(
                        self._input['label'], 3, self._channels_dim)
                ]
                self._input['split_images'] = [
                    tf.squeeze(l, axis=self._channels_dim) for l in tf.split(
                        self._input['image'], 3, self._channels_dim)
                ]

                self._loss = self.loss_calculator(
                    labels=self._input['split_labels'],
                    logits=self._loss_logits)

            if self.args.mode == "inference":
                self._output['softmax'] = [
                    tf.nn.softmax(x, axis=self._channels_dim)
                    for x in self._logits
                ]

            self._accuracy_calc = AccuracyCalculator.AccuracyCalculator()

            self._accuracy = self._accuracy_calc(
                prediction=self._output['prediction'],
                labels=self._input['split_labels'])

            # Add the metrics by hand:

            self._metrics = {}
            for p in [0, 1, 2]:
                self._metrics[f"plane{p}/Total_Accuracy"] = self._accuracy[
                    "total_accuracy"][p]
                self._metrics[f"plane{p}/Non_Bkg_Accuracy"] = self._accuracy[
                    "non_bkg_accuracy"][p]
                self._metrics[f"plane{p}/Neutrino_IoU"] = self._accuracy[
                    "neut_iou"][p]
                self._metrics[f"plane{p}/Cosmic_IoU"] = self._accuracy[
                    "cosmic_iou"][p]
                self._metrics[f"plane{p}/mIoU"] = self._accuracy["miou"][p]

            with tf.compat.v1.variable_scope("accuracy"):
                self._metrics["Average/Total_Accuracy"] = tf.reduce_mean(
                    self._accuracy["total_accuracy"])
                self._metrics["Average/Non_Bkg_Accuracy"] = tf.reduce_mean(
                    self._accuracy["non_bkg_accuracy"])
                self._metrics["Average/Neutrino_IoU"] = tf.reduce_mean(
                    self._accuracy["neut_iou"])
                self._metrics["Average/Cosmic_IoU"] = tf.reduce_mean(
                    self._accuracy["cosmic_iou"])
                self._metrics["Average/mIoU"] = tf.reduce_mean(
                    self._accuracy["miou"])

            self._metrics['loss'] = self._loss

        self._log_keys = ["loss", "Average/Non_Bkg_Accuracy", "Average/mIoU"]

        end = time.time()
        return end - start
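The gradient_accumulation scope above allocates one non-trainable copy of every trainable variable so that gradients can be summed over several iterations before a single update. The graph-mode plumbing that uses those accumulators is not part of this snippet; an eager-mode sketch of the same idea (an illustration, not this repository's code) would be:

import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(2, input_shape=(4,))])
optimizer = tf.keras.optimizers.SGD(0.1)
accum_steps = 4

# One accumulator per trainable variable, mirroring the
# "gradient_accumulation" variable scope above.
accum_vars = [tf.Variable(tf.zeros_like(v), trainable=False)
              for v in model.trainable_variables]

def accumulate(x, y):
    with tf.GradientTape() as tape:
        loss = tf.reduce_mean(tf.square(model(x) - y))
    grads = tape.gradient(loss, model.trainable_variables)
    for acc, g in zip(accum_vars, grads):
        acc.assign_add(g / accum_steps)

def apply_and_reset():
    optimizer.apply_gradients(zip(accum_vars, model.trainable_variables))
    for acc in accum_vars:
        acc.assign(tf.zeros_like(acc))

for step in range(8):
    accumulate(tf.random.normal((2, 4)), tf.random.normal((2, 2)))
    if (step + 1) % accum_steps == 0:
        apply_and_reset()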
Example #19
def train_ncnet(
        model,
        run_id=None,
        multicoil=False,
        three_d=False,
        acq_type='radial',
        scale_factor=1e6,
        dcomp=False,
        contrast=None,
        cuda_visible_devices='0123',
        n_samples=None,
        n_epochs=200,
        use_mixed_precision=False,
        loss='mae',
        original_run_id=None,
        **acq_kwargs,
    ):
    # paths
    n_volumes_train = n_volumes_train_fastmri
    if multicoil:
        train_path = f'{FASTMRI_DATA_DIR}multicoil_train/'
        val_path = f'{FASTMRI_DATA_DIR}multicoil_val/'
    elif three_d:
        train_path = f'{OASIS_DATA_DIR}/train/'
        val_path = f'{OASIS_DATA_DIR}/val/'
        n_volumes_train = n_volumes_train_oasis
    else:
        train_path = f'{FASTMRI_DATA_DIR}singlecoil_train/singlecoil_train/'
        val_path = f'{FASTMRI_DATA_DIR}singlecoil_val/'


    os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(cuda_visible_devices)

    # trying mixed precision
    if use_mixed_precision:
        policy_type = 'mixed_float16'
    else:
        policy_type = 'float32'
    policy = mixed_precision.Policy(policy_type)
    mixed_precision.set_policy(policy)
    # generators
    if multicoil:
        dataset = multicoil_dataset
        image_size = IM_SIZE
    elif three_d:
        dataset = three_d_dataset
        image_size = VOLUME_SIZE
    else:
        dataset = singlecoil_dataset
        image_size = IM_SIZE
    if not three_d:
        add_kwargs = {
            'contrast': contrast,
            'rand': True,
            'inner_slices': None,
        }
    else:
        add_kwargs = {}
    add_kwargs.update(**acq_kwargs)
    train_set = dataset(
        train_path,
        image_size,
        acq_type=acq_type,
        compute_dcomp=dcomp,
        scale_factor=scale_factor,
        n_samples=n_samples,
        **add_kwargs
    )
    val_set = dataset(
        val_path,
        image_size,
        acq_type=acq_type,
        compute_dcomp=dcomp,
        scale_factor=scale_factor,
        **add_kwargs
    )

    additional_info = f'{acq_type}'
    if contrast is not None:
        additional_info += f'_{contrast}'
    if n_samples is not None:
        additional_info += f'_{n_samples}'
    if loss != 'mae':
        additional_info += f'_{loss}'
    if dcomp:
        additional_info += '_dcomp'
    run_id = f'{run_id}_{additional_info}_{int(time.time())}'
    chkpt_path = f'{CHECKPOINTS_DIR}checkpoints/{run_id}' + '-{epoch:02d}.hdf5'

    chkpt_cback = ModelCheckpoint(chkpt_path, period=n_epochs, save_weights_only=True)
    log_dir = op.join(f'{LOGS_DIR}logs', run_id)
    tboard_cback = TensorBoard(
        profile_batch=0,
        log_dir=log_dir,
        histogram_freq=0,
        write_graph=False,
        write_images=False,
    )
    tqdm_cback = TQDMProgressBar()

    if original_run_id is not None:
        lr = 1e-7
        n_steps = n_volumes_train//2
    else:
        lr = 1e-4
        n_steps = n_volumes_train
    default_model_compile(model, lr=lr, loss=loss)
    print(run_id)
    if original_run_id is not None:
        if os.environ.get('FASTMRI_DEBUG'):
            n_epochs_original = 1
        else:
            n_epochs_original = 250
        model.load_weights(f'{CHECKPOINTS_DIR}checkpoints/{original_run_id}-{n_epochs_original:02d}.hdf5')

    model.fit(
        train_set,
        steps_per_epoch=n_steps,
        epochs=n_epochs,
        validation_data=val_set,
        validation_steps=2,
        verbose=0,
        callbacks=[tboard_cback, chkpt_cback, tqdm_cback],
    )
    return run_id
Example #20
    def __init__(
            self,
            seq_len,
            vocab_size,
            embedding_dim=20,
            hidden_dim=256,
            n_hidden=2,
            dff=512,
            n_epochs=1,
            batch_size=1000,
            inference_batch_size=1500,
            cache_dir='.',
            model_name='bilstm',
            seed=None,
            verbose=False
    ):
        super().__init__(seed=seed,)

        policy = mixed_precision.Policy('mixed_float16')
        mixed_precision.set_policy(policy)

        mirrored_strategy = tf.distribute.MirroredStrategy()
        print('Number of devices: {}'.format(mirrored_strategy.num_replicas_in_sync))
        with mirrored_strategy.scope():

            input_pre = Input(shape=(seq_len - 1,))
            input_post = Input(shape=(seq_len - 1,))

            embed = Embedding(vocab_size + 1, embedding_dim,
                              input_length=seq_len - 1)
            x_pre = embed(input_pre)
            x_post = embed(input_post)

            for _ in range(n_hidden - 1):
                lstm = LSTM(hidden_dim, return_sequences=True)
                x_pre = lstm(x_pre)
                x_post = lstm(x_post)
            lstm = LSTM(hidden_dim)
            x_pre = lstm(x_pre)
            x_post = lstm(x_post)

            x = concatenate([ x_pre, x_post ],
                            name='embed_layer')

            #x = Dense(dff, activation='relu')(x)
            x = Dense(vocab_size + 1)(x)
            output = Activation('softmax', dtype='float32')(x)

            self.model_ = Model(inputs=[ input_pre, input_post ],
                                outputs=output)
            opt = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999,
                       amsgrad=False)
            self.model_.compile(
                loss='sparse_categorical_crossentropy', optimizer=opt,
                metrics=['accuracy']
            )
        self.seq_len_ = seq_len
        self.vocab_size_ = vocab_size
        self.embedding_dim_ = embedding_dim
        self.hidden_dim_ = hidden_dim
        self.n_hidden_ = n_hidden
        self.dff_ = dff
        self.n_epochs_ = n_epochs
        self.batch_size_ = batch_size
        self.inference_batch_size_ = inference_batch_size
        self.cache_dir_ = cache_dir
        self.model_name_ = model_name
        self.verbose_ = verbose
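Note the final Activation('softmax', dtype='float32') above: under the mixed_float16 policy the hidden layers run in float16, but casting the output activation back to float32 keeps the softmax and the loss numerically stable. A minimal sketch of that pattern with the experimental API:

import tensorflow as tf
from tensorflow.keras.mixed_precision import experimental as mixed_precision
from tensorflow.keras.layers import Input, Dense, Activation
from tensorflow.keras.models import Model

mixed_precision.set_policy(mixed_precision.Policy('mixed_float16'))

inputs = Input(shape=(32,))
x = Dense(64, activation='relu')(inputs)  # computed in float16
logits = Dense(10)(x)                     # float16 logits
# Cast back to float32 for the softmax so the loss is computed in full
# precision, as in the classifiers above.
outputs = Activation('softmax', dtype='float32')(logits)

model = Model(inputs, outputs)
print(logits.dtype, outputs.dtype)  # float16 vs float32
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')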
Example #21
def run_training(
    encoder_f,
    box_f,
    lr_f,
    name,
    epochs,
    batch_size,
    steps_per_epoch,
    img,
    data,
    val_data,
    img_size,
    mixed_float=True,
    notebook=True,
):
    """
    val_data : (X_val, Y_val) tuple
    """
    if mixed_float:
        policy = mixed_precision.Policy('mixed_float16')
        mixed_precision.set_policy(policy)

    st = time.time()

    inputs = {
        'image': keras.Input((img_size[0], img_size[1], 3)),
        'pos': keras.Input((2))
    }
    mymodel = BoxModel(inputs, encoder_f, box_f)
    loss = keras.losses.MeanSquaredError()
    mymodel.compile(
        optimizer='adam',
        loss=loss,
        # metrics=[
        #     'mse',
        # ]
    )

    logdir = 'logs/fit/' + name
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir,
                                                          histogram_freq=1,
                                                          profile_batch='3,5',
                                                          update_freq='epoch')
    lr_callback = keras.callbacks.LearningRateScheduler(lr_f, verbose=1)

    savedir = 'savedmodels/' + name + '/{epoch}'
    save_callback = keras.callbacks.ModelCheckpoint(savedir,
                                                    save_weights_only=True,
                                                    verbose=1)

    if notebook:
        tqdm_callback = TqdmNotebookCallback(
            metrics=['loss', 'binary_accuracy'], leave_inner=False)
    else:
        tqdm_callback = TqdmCallback()

    # if augment:
    train_ds = create_train_dataset(img, data, img_size, batch_size)
    mymodel.fit(
        x=train_ds,
        epochs=epochs,
        steps_per_epoch=steps_per_epoch,
        validation_data=val_data,
        callbacks=[
            tensorboard_callback,
            lr_callback,
            save_callback,
            tqdm_callback,
        ],
        verbose=0,
        # validation_data=val_data,
    )

    # else:
    #     mymodel.fit(
    #         x=X_train,
    #         y=Y_train,
    #         epochs=epochs,
    #         batch_size=batch_size,
    #         callbacks=[
    #             tensorboard_callback,
    #             lr_callback,
    #             save_callback,
    #             tqdm_callback,
    #         ],
    #         verbose=0,
    #         validation_data=val_data
    #     )

    print('Took {} seconds'.format(time.time() - st))
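lr_f above is handed directly to keras.callbacks.LearningRateScheduler, which expects a function of the epoch index (and optionally the current learning rate) that returns the learning rate to use. A minimal sketch with a hypothetical step-decay schedule standing in for lr_f:

import tensorflow as tf
from tensorflow import keras

def lr_f(epoch, lr=None):
    # Hypothetical schedule: start at 1e-3 and halve every 10 epochs.
    return 1e-3 * (0.5 ** (epoch // 10))

lr_callback = keras.callbacks.LearningRateScheduler(lr_f, verbose=1)

model = keras.Sequential([keras.layers.Dense(1, input_shape=(4,))])
model.compile(optimizer='adam', loss='mse')
model.fit(tf.random.normal((64, 4)), tf.random.normal((64, 1)),
          epochs=3, callbacks=[lr_callback], verbose=0)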
Example #22
    def __init__(
            self,
            seq_len,
            vocab_size,
            embedding_dim=20,
            hidden_dim=256,
            n_hidden=2,
            n_heads=8,
            dff=2048,
            dropout_rate=0.1,
            n_epochs=1,
            batch_size=1000,
            cache_dir='.',
            model_name='attention',
            seed=None,
            verbose=False
    ):
        super().__init__(seed=seed,)

        policy = mixed_precision.Policy('mixed_float16')
        mixed_precision.set_policy(policy)

        mirrored_strategy = tf.distribute.MirroredStrategy()
        print('Number of devices: {}'.format(mirrored_strategy.num_replicas_in_sync))
        with mirrored_strategy.scope():
            input_ = Input(shape=(seq_len - 1,))

            from transformer_layers import Encoder
            self.encoder_ = Encoder(
                n_hidden, hidden_dim, n_heads, dff,
                vocab_size + 1, seq_len, dropout_rate,
                name='embed_layer',
            )
            x = self.encoder_(input_, None)

            x = Reshape((hidden_dim * (seq_len - 1),))(x)

            #x = Dense(dff, activation='relu')(x)
            x = Dense(vocab_size + 1)(x)
            output = Activation('softmax', dtype='float32')(x)

            self.model_ = Model(inputs=input_, outputs=output)
            opt = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999,
                       amsgrad=False)
            self.model_.compile(
                loss='sparse_categorical_crossentropy', optimizer=opt,
                metrics=['accuracy']
            )

        self.seq_len_ = seq_len
        self.vocab_size_ = vocab_size
        self.embedding_dim_ = embedding_dim
        self.hidden_dim_ = hidden_dim
        self.n_hidden_ = n_hidden
        self.n_heads_ = n_heads
        self.dff_ = dff
        self.dropout_rate_ = dropout_rate
        self.n_epochs_ = n_epochs
        self.batch_size_ = batch_size
        self.cache_dir_ = cache_dir
        self.model_name_ = model_name
        self.verbose_ = verbose
Example #23
def train_xpdnet_block(
    model_fun,
    model_kwargs,
    model_size=None,
    multicoil=True,
    brain=False,
    af=4,
    contrast=None,
    n_samples=None,
    batch_size=None,
    n_epochs=200,
    n_iter=10,
    res=True,
    n_scales=0,
    n_primal=5,
    use_mixed_precision=False,
    refine_smaps=False,
    refine_big=False,
    loss='mae',
    lr=1e-4,
    fixed_masks=False,
    equidistant_fake=False,
    multi_gpu=False,
    mask_type=None,
    primal_only=True,
    n_dual=1,
    n_dual_filters=16,
    multiscale_kspace_learning=False,
    block_size=10,
    block_overlap=0,
    epochs_per_block_step=None,
):
    r"""Train an XPDNet network on the fastMRI dataset.

    The training is done with a learning rate of 1e-4, using the RAdam optimizer.
    The validation is performed every 5 epochs on 5 volumes.
    A scale factor of 1e6 is applied to the data.

    Arguments:
        model_fun (function): the function initializing the image correction
            network of the XPDNet.
        model_kwargs (dict): the set of arguments used to initialize the image
            correction network.
        model_size (str or None): a string describing the size of the network
            used. This is used in the run id. Defaults to None.
        multicoil (bool): whether the input data is multicoil. Defaults to True.
        brain (bool): whether to consider brain data instead of knee. Defaults
            to False.
        af (int): the acceleration factor for the retrospective undersampling
            of the data. Defaults to 4.
        contrast (str or None): the contrast used for this specific training.
            If None, all contrasts are considered. Defaults to None.
        n_samples (int or None): the number of samples to consider for this
            training. If None, all samples are considered. Defaults to None.
        n_epochs (int): the number of epochs (i.e. one pass through all the
            volumes/samples) for this training. Defaults to 200.
        block_size (int): the number of consecutive unrolled iterations
            trained jointly at each block step. Defaults to 10.
        block_overlap (int): the number of iterations shared between two
            consecutive blocks. Defaults to 0.
        epochs_per_block_step (int or None): the number of epochs used to
            train each block before moving on to the next one. Defaults to
            None.
        n_iter (int): the number of iterations for the XPDNet.
        res (bool): whether the XPDNet image correction networks should be
            residual.
        n_scales (int): the number of scales used in the image correction
            network. Defaults to 0.
        n_primal (int): the size of the buffer in the image space. Defaults to
            5.
        use_mixed_precision (bool): whether to use the mixed precision API for
            training. Currently not working. Defaults to False.
        refine_smaps (bool): whether you want to refine the sensitivity maps
            with a neural network.
        loss (tf.keras.losses.Loss or str): the loss function used for the
            training. It should be understandable by the tf.keras loss API,
            or be 'compound_mssim', in which case the compound L1-MSSIM loss
            inspired by [P2020] is used. Defaults to 'mae'.
        fixed_masks (bool): whether fixed masks should be used for the
            retrospective undersampling. Defaults to False
        equidistant_fake (bool): whether to use fake equidistant masks from
            fastMRI. Defaults to False.
        multi_gpu (bool): whether to use multiple GPUs for the XPDNet training.
            Defaults to False.

    Returns:
        - str: the run id of the trained network.
    """
    if brain:
        n_volumes = brain_n_volumes_train
    else:
        n_volumes = n_volumes_train
    # paths
    if multicoil:
        if brain:
            train_path = f'{FASTMRI_DATA_DIR}brain_multicoil_train/'
            val_path = f'{FASTMRI_DATA_DIR}brain_multicoil_val/'
        else:
            train_path = f'{FASTMRI_DATA_DIR}multicoil_train/'
            val_path = f'{FASTMRI_DATA_DIR}multicoil_val/'
    else:
        train_path = f'{FASTMRI_DATA_DIR}singlecoil_train/singlecoil_train/'
        val_path = f'{FASTMRI_DATA_DIR}singlecoil_val/'

    af = int(af)

    # trying mixed precision
    if use_mixed_precision:
        policy_type = 'mixed_float16'
    else:
        policy_type = 'float32'
    policy = mixed_precision.Policy(policy_type)
    mixed_precision.set_policy(policy)
    # generators
    if multicoil:
        dataset = multicoil_dataset
        if mask_type is None:
            if brain:
                if equidistant_fake:
                    mask_type = 'equidistant_fake'
                else:
                    mask_type = 'equidistant'
            else:
                mask_type = 'random'
        kwargs = {
            'parallel': False,
            'output_shape_spec': brain,
            'mask_type': mask_type,
        }
    else:
        dataset = singlecoil_dataset
        kwargs = {}
    train_set = dataset(train_path,
                        AF=af,
                        contrast=contrast,
                        inner_slices=None,
                        rand=True,
                        scale_factor=1e6,
                        n_samples=n_samples,
                        fixed_masks=fixed_masks,
                        batch_size=batch_size,
                        target_image_size=IM_SIZE,
                        **kwargs)
    val_set = dataset(val_path,
                      AF=af,
                      contrast=contrast,
                      inner_slices=None,
                      rand=True,
                      scale_factor=1e6,
                      **kwargs)

    run_params = {
        'n_primal': n_primal,
        'multicoil': multicoil,
        'n_scales': n_scales,
        'n_iter': n_iter,
        'refine_smaps': refine_smaps,
        'res': res,
        'output_shape_spec': brain,
        'multi_gpu': multi_gpu,
        'refine_big': refine_big,
        'primal_only': primal_only,
        'n_dual': n_dual,
        'n_dual_filters': n_dual_filters,
        'multiscale_kspace_learning': multiscale_kspace_learning,
    }

    if multicoil:
        xpdnet_type = 'xpdnet_sense_'
        if brain:
            xpdnet_type += 'brain_'
    else:
        xpdnet_type = 'xpdnet_singlecoil_'
    additional_info = f'af{af}'
    if contrast is not None:
        additional_info += f'_{contrast}'
    if n_samples is not None:
        additional_info += f'_{n_samples}'
    if n_iter != 10:
        additional_info += f'_i{n_iter}'
    if loss != 'mae':
        additional_info += f'_{loss}'
    if refine_smaps:
        additional_info += '_rf_sm'
        if refine_big:
            additional_info += 'b'
    if fixed_masks:
        additional_info += '_fixed_masks'
    if block_overlap != 0:
        additional_info += f'_blkov{block_overlap}'

    submodel_info = model_fun.__name__
    if model_size is not None:
        submodel_info += model_size
    run_id = f'{xpdnet_type}_{additional_info}_bbb_{submodel_info}_{int(time.time())}'
    chkpt_path = f'{CHECKPOINTS_DIR}checkpoints/{run_id}' + '-{epoch:02d}'
    chkpt_path += '.hdf5'

    log_dir = op.join(f'{LOGS_DIR}logs', run_id)
    tboard_cback = TensorBoard(
        profile_batch=0,
        log_dir=log_dir,
        histogram_freq=0,
        write_graph=False,
        write_images=False,
    )
    tqdm_cback = TQDMProgressBar()

    model = XPDNet(model_fun, model_kwargs, **run_params)
    n_steps = n_volumes

    if batch_size is not None:
        n_steps //= batch_size

    chkpt_cback = ModelCheckpointWorkAround(
        chkpt_path,
        save_freq=int(n_epochs * n_steps),
        save_weights_only=True,
    )
    print(run_id)
    stride = block_size - block_overlap
    assert stride > 0
    n_block_steps = int(math.ceil((n_iter - block_size) / stride) + 1)
    ## epochs handling
    start_epoch = 0
    final_epoch = min(epochs_per_block_step, n_epochs)

    for i_step in range(n_block_steps):
        first_block_to_train = i_step * stride
        blocks = list(
            range(first_block_to_train, first_block_to_train + block_size))
        model.blocks_to_train = blocks
        default_model_compile(model, lr=lr, loss=loss)

        model.fit(
            train_set,
            steps_per_epoch=n_steps,
            initial_epoch=start_epoch,
            epochs=final_epoch,
            validation_data=val_set,
            validation_steps=5,
            validation_freq=5,
            verbose=0,
            callbacks=[tboard_cback, chkpt_cback, tqdm_cback],
        )
        n_epochs = n_epochs - (final_epoch - start_epoch)
        if n_epochs <= 0:
            break
        start_epoch = final_epoch
        final_epoch += min(epochs_per_block_step, n_epochs)
    return run_id
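The loop above trains the unrolled iterations of the XPDNet in overlapping blocks: the stride between block steps is block_size - block_overlap, and the number of block steps is chosen so that the final block reaches the last iteration. A small standalone sketch of just that scheduling arithmetic:

import math

def block_schedule(n_iter, block_size, block_overlap):
    """Return the list of block indices trained at each block step."""
    stride = block_size - block_overlap
    assert stride > 0
    n_block_steps = int(math.ceil((n_iter - block_size) / stride) + 1)
    return [list(range(i * stride, i * stride + block_size))
            for i in range(n_block_steps)]

# With 10 unrolled iterations, blocks of 4 and an overlap of 2:
for blocks in block_schedule(n_iter=10, block_size=4, block_overlap=2):
    print(blocks)
# [0, 1, 2, 3], [2, 3, 4, 5], [4, 5, 6, 7], [6, 7, 8, 9]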
Example #24
def training(meta_train_iterations, meta_batch_size, k_support, k_query,
             num_inner_updates, inner_update_lr, learn_inner_update_lr,
             meta_lr, job_dir):
    policy = mixed_precision.Policy('mixed_float16')
    mixed_precision.set_policy(policy)

    mirrored_strategy = tf.distribute.MirroredStrategy()
    data_generator = DataGenerator(k_support, k_query, meta_batch_size,
                                   'meta_train', job_dir)
    data_generator_valid = DataGenerator(2, 32, meta_batch_size, 'meta_val',
                                         job_dir)

    itr = 0
    meta_loss_log_dir_2 = os.path.join(job_dir,
                                       'summary_6_intrain_2_intest/meta_loss')
    meta_metric_log_dir_2 = os.path.join(
        job_dir, 'summary_6_intrain_2_intest/meta_metric')
    meta_loss_summary_writer = tf.summary.create_file_writer(
        meta_loss_log_dir_2)
    meta_metric_writer = tf.summary.create_file_writer(meta_metric_log_dir_2)

    with mirrored_strategy.scope():
        maml = MAML(k_support,
                    k_query,
                    num_inner_updates=num_inner_updates,
                    inner_update_lr=inner_update_lr,
                    learn_inner_update_lr=learn_inner_update_lr)
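        # Wrapping Adam in a loss-scale optimizer keeps fp16 gradients from
        # underflowing under the mixed_float16 policy set above.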
        optim = tf.keras.optimizers.Adam(learning_rate=meta_lr)
        optim = mixed_precision.LossScaleOptimizer(optim, loss_scale='dynamic')

    storage_client = storage.Client()
    acc_metric = tf.keras.metrics.CategoricalAccuracy('train_accuracy')

    dataset = data_generator.create_dataset().take(meta_train_iterations)
    dist_dataset = mirrored_strategy.experimental_distribute_dataset(dataset)
    num_replicas = mirrored_strategy.num_replicas_in_sync
    logging.info('mirrored_strategy.num_replicas_in_sync: %d' % (num_replicas))

    best_eval_mIoU = 0
    model_exp_str = (f'mbs_{meta_batch_size}.k_support_{k_support}'
                     f'.k_query_{k_query}.inner_steps_{num_inner_updates}'
                     f'.inner_lr_{inner_update_lr}'
                     f'.learn_inner_update_lr_{learn_inner_update_lr}'
                     f'.meta_lr_{meta_lr}')
    model_file = os.path.join(job_dir, 'weights_inner_update_4', model_exp_str)

    for input_support_replica, input_query_replica, label_support_replica, label_query_replica, ids_replica, query_indices_replica in dist_dataset:
        itr = itr + 1
        inp = (input_support_replica, input_query_replica,
               label_support_replica, label_query_replica)
        output_query_replicas, meta_loss = distributed_train_step(
            inp, maml, num_replicas, optim, mirrored_strategy)
        logging.info('Iteration %d: meta loss: %.5f ' % (itr, meta_loss))
        if itr % 1 == 0:
            if num_replicas > 1:
                output_query = output_query_replicas.values
                output_query = tf.concat(output_query, 0)

                label_query = label_query_replica.values
                label_query = tf.concat(label_query, 0)
            else:
                output_query = output_query_replicas
                label_query = label_query_replica
            label_query = tf.cast(label_query, dtype=tf.float32)
            pred = tf.one_hot(tf.argmax(output_query, axis=-1),
                              depth=data_generator.LABEL_SIZE)
            with tf.device('/CPU:0'):
                mIoU = compute_mIoU(label_query[:, :, :, :,
                                                1:], pred[:, :, :, :, 1:],
                                    data_generator.LABEL_SIZE - 1)
            logging.info('Iteration %d: mean IoU: %.5f ' % (itr, mIoU))

            with tf.device('/CPU:0'):
                acc_metric.update_state(
                    label_query[:, :, :, :, 1:],
                    tf.math.softmax(output_query)[:, :, :, :, 1:])
                acc = acc_metric.result()
            logging.info('Iteration %d: accuracy: %.5f ' % (itr, acc))
            acc_metric.reset_states()

            with meta_loss_summary_writer.as_default():
                tf.summary.scalar('train-meta-loss', meta_loss, step=itr)
            with meta_metric_writer.as_default():
                tf.summary.scalar('train mean IoU', mIoU, step=itr)
            with meta_metric_writer.as_default():
                tf.summary.scalar('train accuracy', acc, step=itr)

        # evaluation session
        if itr % 150 == 0:
            valid_set = data_generator_valid.sample_batch(
            )  # only one batch, size of meta_batch_size
            dist_valid_dataset_single_elem = mirrored_strategy.experimental_distribute_dataset(
                valid_set)
            for input_support_val_replica, input_query_val_replica, label_support_val_replica, label_query_val_replica, ids_val_replica, query_indices_val_replica in dist_valid_dataset_single_elem:  # only one elem in the dataset
                inp_valid = (input_support_val_replica,
                             input_query_val_replica,
                             label_support_val_replica,
                             label_query_val_replica)
                output_query_valid_replicas, meta_loss_valid = distributed_valid_step(
                    inp_valid, maml, num_replicas, optim, mirrored_strategy)
                logging.info('[VALIDATION] Iteration %d: meta loss: %.5f ' %
                             (itr, meta_loss_valid))
                if num_replicas > 1:
                    output_query_valid = output_query_valid_replicas.values
                    output_query_valid = tf.concat(output_query_valid, 0)

                    label_query_valid = label_query_val_replica.values
                    label_query_valid = tf.concat(label_query_valid, 0)

                    ids_valid = tf.concat(ids_val_replica.values, 0)
                    query_indices_valid = tf.concat(
                        query_indices_val_replica.values, 0)
                else:
                    output_query_valid = output_query_valid_replicas
                    label_query_valid = label_query_val_replica
                    ids_valid = ids_val_replica
                    query_indices_valid = query_indices_val_replica
                label_query_valid = tf.cast(label_query_valid,
                                            dtype=tf.float32)
                pred_valid = tf.one_hot(tf.argmax(output_query_valid, axis=-1),
                                        depth=data_generator.LABEL_SIZE)

                with tf.device('/CPU:0'):
                    mIoU_valid = compute_mIoU(
                        label_query_valid[:, :, :, :,
                                          1:], pred_valid[:, :, :, :, 1:],
                        data_generator.LABEL_SIZE - 1)
                logging.info('[VALIDATION] Iteration %d: mean IoU: %.5f ' %
                             (itr, mIoU_valid))

                if mIoU_valid > best_eval_mIoU:
                    best_eval_mIoU = mIoU_valid
                    logging.info("saving to %s", model_file)
                    maml.save_weights(model_file)

                with tf.device('/CPU:0'):
                    acc_metric.update_state(
                        label_query_valid[:, :, :, :, 1:],
                        tf.math.softmax(output_query_valid)[:, :, :, :, 1:])
                    acc_valid = acc_metric.result()
                logging.info('[VALIDATION] Iteration %d: accuracy: %.5f ' %
                             (itr, acc_valid))
                acc_metric.reset_states()

                with meta_metric_writer.as_default():
                    tf.summary.scalar('eval mean IoU', mIoU_valid, step=itr)
                with meta_metric_writer.as_default():
                    tf.summary.scalar('eval accuracy', acc_valid, step=itr)

                with tf.device('/CPU:0'):
                    construct_predicted_label_batch(itr, ids_valid,
                                                    query_indices_valid,
                                                    label_query_valid,
                                                    pred_valid, job_dir,
                                                    storage_client)

                with meta_loss_summary_writer.as_default():
                    tf.summary.scalar('eval-meta-loss',
                                      meta_loss_valid,
                                      step=itr)
Example #25
def train_model(data_path,
                batch_size,
                image_size,
                crop_size,
                lr_schedule_name,
                init_lr,
                max_lr,
                weight_decay,
                optimizer,
                model_type,
                embedding_size,
                num_epochs,
                checkpoint_path,
                cache_path=None,
                margin=0.35,
                range_test=False,
                use_tpu=False,
                tpu_name=None,
                test_path='',
                use_mixed_precision=False,
                triplet_strategy='',
                images_per_person=35,
                people_per_sample=12,
                pretrained_model='',
                distance_metric="L2",
                soft=True,
                sigma=0.3,
                use_lfw=True):

    if use_tpu is True:
        assert tpu_name is not None, '[ERROR] TPU name must be specified'
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            tpu=tpu_name)
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        strategy = tf.distribute.experimental.TPUStrategy(resolver)
        print("[INFO] TPUs: ", tf.config.list_logical_devices('TPU'))

    if use_mixed_precision is True:
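        # TPUs prefer bfloat16 (it keeps float32's exponent range, so no loss
        # scaling is needed), whereas GPUs use float16.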
        if use_tpu is True:
            policy = mixed_precision.Policy('mixed_bfloat16')
        else:
            policy = mixed_precision.Policy('mixed_float16')
        mixed_precision.set_policy(policy)
        print(
            "[INFO] Using mixed precision for training. This will reduce memory consumption\n"
        )

    train_dataset, n_imgs, n_classes = generate_training_dataset(
        data_path=data_path,
        image_size=image_size,
        batch_size=batch_size,
        crop_size=crop_size,
        cache=cache_path,
        use_mixed_precision=use_mixed_precision,
        images_per_person=images_per_person,
        people_per_sample=people_per_sample,
        use_tpu=use_tpu,
        model_type=model_type)

    if test_path is not None and len(test_path) > 1:
        if use_lfw is True:
            test_dataset, test_images, _ = get_LFW_dataset(
                data_path=test_path,
                image_size=image_size,
                batch_size=batch_size,
                crop_size=crop_size,
                cache='./lfw_dataset_cache.tfcache',
                use_mixed_precision=use_mixed_precision,
                use_tpu=use_tpu,
                train_classes=n_classes,
                model_type=model_type)
        else:
            test_dataset, test_images, _ = get_test_dataset(
                data_path=test_path,
                image_size=image_size,
                batch_size=30,
                crop_size=crop_size,
                cache='./test_dataset_cache.tfcache',
                use_mixed_precision=use_mixed_precision,
                use_tpu=use_tpu,
                train_classes=n_classes,
                model_type=model_type)
    else:
        test_dataset = None

    if triplet_strategy == 'VANILLA':
        loss_fn = tfa.losses.TripletSemiHardLoss(margin=margin)
        print('[INFO] Using vanilla triplet loss')
    elif triplet_strategy == 'BATCH_HARD':
        loss_fn = TripletBatchHardLoss(margin=margin,
                                       soft=soft,
                                       distance_metric=distance_metric)
        print('[INFO] Using batch-hard strategy.')
    elif triplet_strategy == 'BATCH_HARD_V2':
        loss_fn = TripletBatchHardV2Loss(margin1=(-1.0 * margin),
                                         margin2=((-1.0 * margin) / 100.0),
                                         beta=0.002,
                                         distance_metric=distance_metric)
        print('[INFO] Using batch-hard V2 strategy')
    elif triplet_strategy == 'ADAPTIVE':
        loss_fn = AdaptiveTripletLoss(margin=margin, soft=soft, lambda_=sigma)
        print('[INFO] Using Adaptive Triplet Loss')
    else:
        loss_fn = TripletFocalLoss(margin=margin,
                                   sigma=sigma,
                                   soft=soft,
                                   distance_metric=distance_metric)
        print('[INFO] Using triplet focal loss.')

    triplet_loss_metrics = TripletLossMetrics(test_images, embedding_size)

    if range_test is True:
        opt = get_optimizer(optimizer_name=optimizer,
                            lr_schedule=1e-5,
                            weight_decay=weight_decay)
        if use_tpu is True:
            with strategy.scope():
                model = create_neural_network(model_type=model_type,
                                              embedding_size=embedding_size)
                assert model is not None, '[ERROR] There was a problem while loading the pre-trained weights'

        else:
            model = create_neural_network(model_type=model_type,
                                          embedding_size=embedding_size)
            assert model is not None, '[ERROR] There was a problem while loading the pre-trained weights'

        lrs = []
        losses = []
        for epoch in range(5):
            for step, (x_batch_train,
                       y_batch_train) in enumerate(train_dataset):
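                # Two-pass update: the first tape's gradients let `first_step`
                # perturb the weights, the loss is re-evaluated at the perturbed
                # point, and `second_step` applies the final update.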
                with tf.GradientTape() as tape:
                    logits = model(x_batch_train, training=True)
                    loss_value = loss_fn(y_batch_train, logits)
                grads = tape.gradient(loss_value, model.trainable_weights)
                perturbations = opt.first_step(grads, model)
                with tf.GradientTape() as tape:
                    logits = model(x_batch_train, training=True)
                    loss_value = loss_fn(y_batch_train, logits)
                grads = tape.gradient(loss_value, model.trainable_weights)
                opt.second_step(grads, model, perturbations)
                losses.append(float(loss_value.numpy()))
                lrs.append(opt.base_optimizer.lr.numpy())
                if step % 200 == 0 and step > 0:
                    print("Step : %d :: Current loss : %f" %
                          (step, float(loss_value.numpy())))
                    plt.xscale('log')
                    plt.plot(lrs, losses, color='blue')
                    smooth_losses = savgol_filter(losses, 7, 3)
                    plt.plot(lrs, smooth_losses, color='red')
                    plt.xlabel('Log learning rate')
                    plt.ylabel('Loss')
                    plt.savefig('./range_test_result.png')
            for x_batch_test, y_batch_test in test_dataset:
                val_logits = model(x_batch_test, training=False)
                triplet_loss_metrics.update_state(y_batch_test, val_logits)
            result = triplet_loss_metrics.result()
            print(str(result.numpy()))
            triplet_loss_metrics.reset_states()

        plt.xscale('log')
        plt.plot(lrs, losses, color='blue')
        smooth_losses = savgol_filter(losses, 7, 3)
        plt.plot(lrs, smooth_losses, color='red')
        plt.xlabel('Log learning rate')
        plt.ylabel('Loss')
        plt.savefig('./range_test_result.png')
        print(
            '\n[INFO] Training complete. Range test results can be found at "./range_test_result.png"'
        )

        return
    else:
        lr_schedule = get_learning_rate_schedule(
            schedule_name=lr_schedule_name,
            learning_rate=init_lr,
            max_lr=max_lr,
            image_count=n_imgs,
            batch_size=batch_size)
        opt = get_optimizer(optimizer_name=optimizer,
                            lr_schedule=lr_schedule,
                            weight_decay=weight_decay)

        if not os.path.exists(checkpoint_path):
            os.mkdir(checkpoint_path)

        checkpoint_name = checkpoint_path + '/' + 'cp-{epoch:03d}.ckpt'

        if use_tpu is True:
            with strategy.scope():
                model = create_neural_network(model_type=model_type,
                                              embedding_size=embedding_size)
                assert model is not None, '[ERROR] There was a problem while loading the pre-trained weights'

        else:
            model = create_neural_network(model_type=model_type,
                                          embedding_size=embedding_size)
            assert model is not None, '[ERROR] There was a problem while loading the pre-trained weights'

        for epoch in range(num_epochs):
            for step, (x_batch_train,
                       y_batch_train) in enumerate(train_dataset):
                with tf.GradientTape() as tape:
                    logits = model(x_batch_train, training=True)
                    loss_value = loss_fn(y_batch_train, logits)
                grads = tape.gradient(loss_value, model.trainable_weights)
                perturbations = opt.first_step(grads, model)
                with tf.GradientTape() as tape:
                    logits = model(x_batch_train, training=True)
                    loss_value = loss_fn(y_batch_train, logits)
                grads = tape.gradient(loss_value, model.trainable_weights)
                opt.second_step(grads, model, perturbations)
                if step % 200 == 0:
                    print("Step : %d :: Current loss : %f" %
                          (step, float(loss_value)))
            for x_batch_test, y_batch_test in test_dataset:
                val_logits = model(x_batch_test, training=False)
                triplet_loss_metrics.update_state(y_batch_test, val_logits)
            result = triplet_loss_metrics.result()
            print(str(result))
            triplet_loss_metrics.reset_states()
            if epoch % 5 == 0:
                model.save(checkpoint_name.format(epoch=epoch))

        if not os.path.exists('./results'):
            os.mkdir('./results')

        model_name = './results/model-' + datetime.now().strftime(
            "%Y%m%d-%H%M%S")
        model.save(model_name)
        print(
            '\n[INFO] Training complete. Saved model can be found in "./results"'
        )

        return
Example #26
def set_keras_mixed_precision_policy(policy_name: str) -> None:
    """Set tf.keras mixed precision"""
    policy = mixed_precision.Policy(policy_name)
    mixed_precision.set_policy(policy)
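
# A minimal usage sketch (not part of the original snippet): pick 'mixed_float16'
# on GPUs or 'mixed_bfloat16' on TPUs, and set the policy before any layers are built.
set_keras_mixed_precision_policy('mixed_float16')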
Example #27
def main(config):
    gpus = tf.config.experimental.list_physical_devices('GPU')
    tf.config.experimental.set_visible_devices(devices=gpus[config.device],
                                               device_type='GPU')
    if config.gpu_growth:
        for gpu in tf.config.experimental.list_physical_devices('GPU'):
            tf.config.experimental.set_memory_growth(gpu, True)
    assert config.precision in (16, 32), config.precision
    if config.precision == 16:
        prec.set_policy(prec.Policy('mixed_float16'))
    config.steps = int(config.steps)
    config.logdir.mkdir(parents=True, exist_ok=True)
    print('Logdir', config.logdir)

    # Create environments.
    datadir = config.logdir / 'episodes'
    writer = tf.summary.create_file_writer(str(config.logdir),
                                           max_queue=1000,
                                           flush_millis=20000)
    writer.set_as_default()
    train_envs = [
        wrappers.Async(
            lambda: make_env(config, writer, 'train', datadir, store=True),
            config.parallel) for _ in range(config.envs)
    ]
    test_envs = [
        wrappers.Async(
            lambda: make_env(config, writer, 'test', datadir, store=False),
            config.parallel) for _ in range(config.envs)
    ]
    actspace = train_envs[0].action_space

    # Prefill dataset with random episodes.
    step = count_steps(datadir, config)
    prefill = max(0, config.prefill - step)
    print(f'Prefill dataset with {prefill} steps.')
    random_agent = lambda o, d, _: ([actspace.sample() for _ in d], None)
    tools.simulate(random_agent, train_envs, prefill / config.action_repeat)
    writer.flush()

    # Train and regularly evaluate the agent.
    step = count_steps(datadir, config)
    print(f'Simulating agent for {config.steps - step} steps.')
    agent = Dreamer(config, datadir, actspace, writer)
    if (config.logdir / 'variables.pkl').exists():
        print('Load checkpoint.')
        agent.load(config.logdir / 'variables.pkl')
    state = None
    while step < config.steps:
        print('Start evaluation.')
        if config.test_model:
            print('Start evaluate model.')
            tools.test_model(
                functools.partial(agent, training=False),
                test_envs,
                episodes=1,
                dynamics=agent._dynamics,
                model_metric_summaries=agent._model_metric_summaries,
                value=agent._value,
                decode=agent._decode,
                test_len=config.test_len)
        else:
            tools.simulate(functools.partial(agent, training=False),
                           test_envs,
                           episodes=1)
        writer.flush()
        print('Start collection.')
        steps = config.eval_every // config.action_repeat
        state = tools.simulate(agent, train_envs, steps, state=state)
        step = count_steps(datadir, config)
        agent.save(config.logdir / 'variables.pkl')
    for env in train_envs + test_envs:
        env.close()
Example #28
    def init_network(self):

        # This function builds the compute graph.
        # Optionally, it can build a 'subset' graph if this mode is

        # Net construction:
        start = time.time()

        # Here, if using mixed precision, set a global policy:
        if self.args.precision == "mixed":
            from tensorflow.keras.mixed_precision import experimental as mixed_precision
            self.policy = mixed_precision.Policy('mixed_float16')
            mixed_precision.set_policy(self.policy)

        if self.args.precision == "bfloat16":
            from tensorflow.keras.mixed_precision import experimental as mixed_precision
            self.policy = mixed_precision.Policy('mixed_bfloat16')
            mixed_precision.set_policy(self.policy)

        batch_dims = self.larcv_fetcher.batch_dims(1)

        # We compute the local batch size:
        batch_dims[0] = self.local_batch_size()

        #
        self._global_step = tf.Variable(0, dtype=tf.int64)

        # We have to make placeholders for input objects:
        #
        # self._input = {
        #     'image'   : tf.compat.v1.placeholder(floating_point_format, batch_dims, name="input_image"),
        #     'label'   : tf.compat.v1.placeholder(integer_format,        batch_dims, name="input_label"),
        #     'io_time' : tf.compat.v1.placeholder(floating_point_format, (), name="io_fetch_time")
        # }

        # Build the network object, forward pass only:

        if self.args.conv_mode == '2D':
            self._net = uresnet2D.UResNet(self.args)
        else:
            self._net = uresnet3D.UResNet3D(self.args)

        self._net.trainable = True

        # self._logits = self._net(self._input['image'], training=self.args.training)

        # # If channels first, need to permute the logits:
        # if self._channels_dim == 1:
        #     permutation = tf.keras.layers.Permute((2, 3, 1))
        #     self._loss_logits = [ permutation(l) for l in self._logits ]
        # else:
        #     self._loss_logits = self._logits

        # TO PROPERLY INITIALIZE THE NETWORK, NEED TO DO A FORWARD PASS
        minibatch_data = self.larcv_fetcher.fetch_next_batch("train",
                                                             force_pop=False)
        image, label = self.cast_input(minibatch_data['image'],
                                       minibatch_data['label'])

        self.forward_pass(image, label, training=False)

        # # Here, if the data format is channels_first, we have to reorder the logits tensors
        # # To put channels last.  Otherwise it does not work with the softmax tensors.
        #
        #
        # # Apply a softmax and argmax:
        # self._output = dict()
        #
        # # Take the logits (which are one per plane) and create a softmax and prediction (one per plane)
        # with tf.compat.v1.variable_scope("prediction"):
        #     self._output['prediction'] = [ tf.argmax(x, axis=self._channels_dim) for x in self._logits]
        #
        # with tf.compat.v1.variable_scope("cross_entropy"):
        #
        #     self._input['split_labels'] = [
        #         tf.squeeze(l, axis=self._channels_dim)
        #             for l in tf.split(self._input['label'], 3, self._channels_dim)
        #         ]
        #     self._input['split_images'] = [
        #         tf.squeeze(l, axis=self._channels_dim)
        #             for l in tf.split(self._input['image'], 3, self._channels_dim)
        #         ]
        #
        #     self._loss = self.loss_calculator(
        #             labels = self._input['split_labels'],
        #             logits = self._loss_logits)
        #
        #
        # if self.args.mode == "inference":
        #     self._output['softmax'] = [tf.nn.softmax(x, axis=self._channels_dim) for x in self._logits]

        self.acc_calculator = AccuracyCalculator.AccuracyCalculator()
        self.loss_calculator = LossCalculator.LossCalculator(
            self.args.loss_balance_scheme, self._channels_dim)

        self._log_keys = ["loss", "Average/Non_Bkg_Accuracy", "Average/mIoU"]

        end = time.time()
        return end - start
Example #29
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.mixed_precision import experimental as mixed_precision
import pandas as pd
from tensorflow.keras.layers import Flatten, Dense, LeakyReLU, BatchNormalization, Dropout
import keras.backend as K
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
import efficientnet.keras as efn
import tensorflow_addons as tfa

policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_policy(policy)

datagen = ImageDataGenerator(rescale=1. / 255, validation_split=0.2, horizontal_flip=True)
train_csv = pd.read_csv(r"/content/train.csv")
train_csv["label"] = train_csv["label"].astype(str)

base_model = tf.keras.applications.ResNet50(weights='imagenet', input_shape=(512, 512, 3), include_top=True)

base_model.trainable = True

model = tf.keras.Sequential([

	tf.keras.layers.Input((512, 512, 3)),
	tf.keras.layers.BatchNormalization(renorm=True),
	base_model,
	BatchNormalization(),
	tf.keras.layers.LeakyReLU(),
	tf.keras.layers.Flatten(),
	tf.keras.layers.Dense(512),
	BatchNormalization(),
Example #30
def build(model_fn: Callable[[], Union[Model, List[Model]]],
          optimizer_fn: Union[str, Scheduler, Callable, List[str],
                              List[Callable], List[Scheduler], None],
          weights_path: Union[str, None, List[Union[str, None]]] = None,
          model_name: Union[str, List[str], None] = None,
          mixed_precision: bool = False) -> Union[Model, List[Model]]:
    """Build model instances and associate them with optimizers.

    This method can be used with TensorFlow models / optimizers:
    ```python
    model_def = fe.architecture.tensorflow.LeNet
    model = fe.build(model_fn = model_def, optimizer_fn="adam")
    model = fe.build(model_fn = model_def, optimizer_fn=lambda: tf.optimizers.Adam(lr=0.1))
    model = fe.build(model_fn = model_def, optimizer_fn="adam", weights_path="~/weights.h5")
    ```

    This method can be used with PyTorch models / optimizers:
    ```python
    model_def = fe.architecture.pytorch.LeNet
    model = fe.build(model_fn = model_def, optimizer_fn="adam")
    model = fe.build(model_fn = model_def, optimizer_fn=lambda x: torch.optim.Adam(params=x, lr=0.1))
    model = fe.build(model_fn = model_def, optimizer_fn="adam", weights_path="~/weights.pt")
    ```

    Args:
        model_fn: A function that defines model(s).
        optimizer_fn: Optimizer string/definition or a list of optimizer instances/strings. The number of optimizers
            provided here should match the number of models generated by the `model_fn`.
        model_name: Name(s) of the model(s) that will be used for logging purpose. If None, a name will be
            automatically generated and assigned.
        weights_path: Path(s) from which to load model weights. If not None, then the number of weight paths provided
            should match the number of models generated by the `model_fn`.
        mixed_precision: Whether to enable mixed-precision network operations; this only applies to TensorFlow models.

    Returns:
        models: The model(s) built by FastEstimator.
    """
    def _generate_model_names(num_names):
        names = [
            "model" if i + build.count == 0 else "model{}".format(i +
                                                                  build.count)
            for i in range(num_names)
        ]
        build.count += num_names
        return names

    if not hasattr(build, "count"):
        build.count = 0
    # mixed-precision handling
    if mixed_precision:
        mixed_precision_tf.set_policy(
            mixed_precision_tf.Policy('mixed_float16'))
    else:
        mixed_precision_tf.set_policy(mixed_precision_tf.Policy('float32'))
    models, optimizer_fn = to_list(model_fn()), to_list(optimizer_fn)
    # fill optimizer
    if not optimizer_fn:
        optimizer_fn = [None]
    # check framework
    if isinstance(models[0], tf.keras.Model):
        framework = "tf"
    elif isinstance(models[0], torch.nn.Module):
        framework = "torch"
    else:
        raise ValueError("unrecognized model format: {}".format(type(
            models[0])))
    # multi-gpu handling
    if torch.cuda.device_count() > 1:
        if framework == "tf" and not isinstance(
                tf.distribute.get_strategy(), tf.distribute.MirroredStrategy):
            tf.distribute.experimental_set_strategy(
                tf.distribute.MirroredStrategy())
            models = to_list(model_fn())
        if framework == "torch":
            models = [torch.nn.DataParallel(model) for model in models]
    # generate names
    if not model_name:
        model_name = _generate_model_names(len(models))
    model_name = to_list(model_name)
    # load weights
    if weights_path:
        weights_path = to_list(weights_path)
    else:
        weights_path = [None] * len(models)
    assert len(models) == len(optimizer_fn) == len(weights_path) == len(model_name), \
        "Found inconsistency in number of models, optimizers, model_name or weights"
    # create optimizer
    for idx, (model, optimizer_def, weight, name) in enumerate(
            zip(models, optimizer_fn, weights_path, model_name)):
        models[idx] = trace_model(_fe_compile(model, optimizer_def, weight,
                                              name, framework),
                                  model_idx=idx if len(models) > 1 else -1,
                                  model_fn=model_fn,
                                  optimizer_fn=optimizer_def,
                                  weights_path=weight)
    if len(models) == 1:
        models = models[0]
    return models
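
# A minimal usage sketch (assuming the standard `import fastestimator as fe` alias
# used in the docstring examples): the `mixed_precision` flag builds the model
# under a 'mixed_float16' policy instead of the default 'float32'.
#
#     model = fe.build(model_fn=fe.architecture.tensorflow.LeNet,
#                      optimizer_fn="adam",
#                      mixed_precision=True)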