Example #1
File: utils.py Project: lmc00/TFG
def get_callbacks(CONF, use_lr_decay=True):
    """
    Get a callback list to feed fit_generator.
    #TODO Use_remote callback needs proper configuration
    #TODO Add ReduceLROnPlateau callback?

    Parameters
    ----------
    CONF: dict

    Returns
    -------
    List of callbacks
    """

    calls = []

    # Add mandatory callbacks
    calls.append(callbacks.TerminateOnNaN())
    calls.append(LRHistory())

    # Add optional callbacks
    if use_lr_decay:
        milestones = np.array(CONF['training']['lr_step_schedule']) * CONF['training']['epochs']
        milestones = milestones.astype(int)  # np.int was removed in NumPy 1.24; the builtin int is equivalent here
        calls.append(LR_scheduler(lr_decay=CONF['training']['lr_step_decay'],
                                  epoch_milestones=milestones.tolist()))

    if CONF['monitor']['use_tensorboard']:
        calls.append(callbacks.TensorBoard(log_dir=paths.get_logs_dir(), write_graph=False))

        # # Let the user launch Tensorboard
        # print('Monitor your training in Tensorboard by executing the following command on your console:')
        # print('    tensorboard --logdir={}'.format(paths.get_logs_dir()))
        # Run Tensorboard  on a separate Thread/Process on behalf of the user
        port = os.getenv('monitorPORT', 6006)
        port = int(port) if len(str(port)) >= 4 else 6006
        subprocess.run(['fuser', '-k', '{}/tcp'.format(port)]) # kill any previous process in that port
        p = Process(target=launch_tensorboard, args=(port,), daemon=True)
        p.start()

    if CONF['monitor']['use_remote']:
        calls.append(callbacks.RemoteMonitor())

    if CONF['training']['use_validation'] and CONF['training']['use_early_stopping']:
        calls.append(callbacks.EarlyStopping(patience=int(0.1 * CONF['training']['epochs'])))

    if CONF['training']['ckpt_freq'] is not None:
        calls.append(callbacks.ModelCheckpoint(
            os.path.join(paths.get_checkpoints_dir(), 'epoch-{epoch:02d}.hdf5'),
            verbose=1,
            period=max(1, int(CONF['training']['ckpt_freq'] * CONF['training']['epochs']))))

    if not calls:
        calls = None

    return calls
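The second TODO in the docstring above asks about ReduceLROnPlateau. A minimal sketch of how it could slot into get_callbacks before the final return; the CONF['monitor']['use_reduce_lr_on_plateau'] flag is a hypothetical addition, and it assumes use_validation is enabled so that 'val_loss' is actually reported:

    # Sketch only (not part of the original project): reduce the learning rate
    # when the validation loss stops improving.
    if CONF['monitor'].get('use_reduce_lr_on_plateau', False):
        calls.append(callbacks.ReduceLROnPlateau(monitor='val_loss',
                                                 factor=0.5,
                                                 patience=5,
                                                 verbose=1))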
Example #2
def binaryClassification(data, labels, hiddenLayers, lrate, nEpochs, kSplitt=10, rp=0.01, columns=None, plotName=None):
    if (columns is not None):
        data = data[:, columns]
    
    if kSplitt > 0:
        randomSeed = 0
        if randomSeed != 0:
            kfold = StratifiedKFold(n_splits=kSplitt, shuffle=True, random_state=randomSeed)
        else:
            kfold = StratifiedKFold(n_splits=kSplitt, shuffle=True)
    else:
        # kfold is used unconditionally below, so a non-positive kSplitt would otherwise raise a NameError
        raise ValueError("kSplitt must be a positive number of folds")

    i = 0
    cvscores = []
    # K-Fold analysis based on https://machinelearningmastery.com/evaluate-performance-deep-learning-models-keras/
    for train, test in kfold.split(data, labels):
        i = i+1
        ### Define Neuronal Network
        cbks = [callbacks.TerminateOnNaN()]
        layers = [keras.layers.Dense(n, activation=tf.nn.relu, kernel_regularizer=regularizers.l2(rp)) for n in hiddenLayers]
#         layers=[keras.layers.Dense(i, activation=tf.nn.relu, kernel_regularizer=regularizers.l2(rp)) for i in hiddenLayers]
#         layers=keras.layers.Dense(i, activation=tf.nn.relu, kernel_regularizer=regularizers.l2(rp))(layers)
        layers.append(keras.layers.Dense(1, activation=tf.nn.sigmoid))
        model = keras.Sequential(layers)
        
        # compile() has no `lr` argument; the learning rate belongs on the optimizer itself
        model.compile(optimizer=tf.train.AdamOptimizer(learning_rate=lrate),
                      loss='binary_crossentropy',
                      metrics=['accuracy'])
        
        ### Execute model
        history =  model.fit(data[train], labels[train], epochs=nEpochs, callbacks=cbks, verbose=0) #validation_data=[test_data,test_labels]) #--> Use this to grep & plot this per Epochs (last line)
        scores = model.evaluate(data[test], labels[test], verbose=0)
    
        if (np.isnan(history.history['loss']).any()):
            raise ValueError("Loss was not a number")
        
        # Needs to be refactored
        if (plotName is not None):
            plt.plot(history.history['acc'])     
            #plt.plot(history.history['val_acc'])
            plt.title('Model accuracy')
            plt.ylabel('Accuracy')
            plt.xlabel('Epoch')
            plt.legend(['Train', 'Test'], loc='upper left')
            plt.savefig("../data/" + plotName + str(i) + ".png")
        
        
        print("%s %s: %.2f%%" % (i, model.metrics_names[1], scores[1]*100))
        cvscores.append(scores[1] * 100)
    
    print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))
Example #3
def binaryClassification(train_data, train_labels, test_data, test_labels, nEpochs, lrate, layerSize, rp=0.01, columns=None):
    ### Read input data    ###
#     # Training data (80%)
#     train_data=np.load(folder + "train_data.npy")
#     train_labels=np.load(folder + "train_labels.npy")
#     # Evaluation data (10%)
#     test_data=np.load(folder + "test_data.npy")
#     test_labels=np.load(folder + "test_labels.npy")
    
    if (columns is not None):
        train_data = train_data[:, columns]
        test_data  = test_data[:, columns]    
    
    ### Define Neuronal Network
    cbks = [callbacks.TerminateOnNaN()]
    layers=[keras.layers.Dense(i, activation=tf.nn.relu, kernel_regularizer=regularizers.l2(rp)) for i in layerSize]
    layers.append(keras.layers.Dense(1, activation=tf.nn.sigmoid))
    model = keras.Sequential(layers)
    # compile() has no `lr` argument; the learning rate belongs on the optimizer itself
    model.compile(optimizer=tf.train.AdamOptimizer(learning_rate=lrate),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    ### Execute model
#     history =  model.fit(train_data, train_labels, epochs=nEpochs, verbose=1, validation_data=[test_data,test_labels]) #--> Use this to grep & plot this per Epochs (last line)
    history = model.fit(train_data, train_labels, callbacks=cbks, epochs=nEpochs, verbose=0)
    test_loss, test_acc = model.evaluate(test_data, test_labels, verbose=0)
    
#     if (math.isnan(history.history['loss'])):
    if (np.isnan(history.history['loss']).any()):
        raise ValueError("Loss was not a number")
    
    plt.plot(history.history['acc'])
#     plt.plot(history.history['val_acc'])
    plt.title('Model accuracy')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper left')
    #plt.show()
#     saveModel(model);
    plotWeights(model)
    return test_loss, test_acc
Example #4
def callback(model_out, patience, metrics):
    calls = [
        callbacks.ModelCheckpoint(model_out,
                                  save_best_only=True,
                                  monitor=metrics,
                                  verbose=1)
    ]
    calls += [
        callbacks.ReduceLROnPlateau(patience=3,
                                    factor=0.5,
                                    min_delta=1e-6,
                                    monitor=metrics,
                                    verbose=1)
    ]
    calls += [
        callbacks.EarlyStopping(patience=patience,
                                restore_best_weights=True,
                                min_delta=1e-5,
                                monitor=metrics,
                                verbose=1)
    ]
    return calls + [callbacks.TerminateOnNaN()]
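A hedged usage sketch of the factory above; the file name, metric, and the x/y variables are assumptions rather than values from the original project:

# Sketch: checkpoint, LR reduction, and early stopping all keyed to the validation loss.
cbks = callback('best_model.h5', patience=10, metrics='val_loss')
model.fit(x_train, y_train,
          validation_data=(x_val, y_val),
          epochs=100,
          callbacks=cbks)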
Example #5
    def train(self, epochs: int, lr: float, steps_per_epoch: int = 1):
        """
        This function is used to Train the model, it uses Adam Optimizer to train, and it saves the weights of every 
        epoch in 'model_weights' dir, training steps_per_epoch=1 and val_steps=5 by default.
        
        You can optionally set the following parameters:
        
        param: epochs (NO of epochs to train the model)
        param: lr (learning rate for the model)
        param: steps_per_epoch (it defines steps per epoch for training data)
        """

        if (self.modelType == 'tinyyolov4'):
            self.optimizer = optimizers.Adam(learning_rate=lr)
            self.model.compile(optimizer=self.optimizer,
                               loss_iou_type='ciou',
                               loss_verbose=0)

            def lr_scheduler(epoch, lr):
                return lr * tf.math.exp(-0.1)

            self.model.fit(self.train_dataset,
                           epochs=epochs,
                           callbacks=[
                               callbacks.LearningRateScheduler(lr_scheduler,
                                                               verbose=1),
                               callbacks.TerminateOnNaN(),
                               callbacks.TensorBoard(histogram_freq=1,
                                                     log_dir="./logs"),
                               SaveWeightsCallback(yolo=self.model,
                                                   dir_path="./model_weights",
                                                   weights_type="yolo",
                                                   epoch_per_save=1),
                           ],
                           validation_data=self.val_dataset,
                           validation_steps=self.val_steps,
                           steps_per_epoch=steps_per_epoch)
        else:
            raise RuntimeError("Invalid modelType: the only supported type is 'tinyyolov4'")
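For reference, the lr_scheduler above multiplies the learning rate by exp(-0.1) ≈ 0.905 once per epoch, so the rate roughly halves every 7 epochs. A minimal standalone sketch of the resulting closed-form decay (lr0 is an assumed initial rate):

import math

def lr_after(lr0: float, epoch: int) -> float:
    # Rate after `epoch` applications of the scheduler: lr0 * exp(-0.1 * epoch)
    return lr0 * math.exp(-0.1 * epoch)

print(lr_after(1e-3, 7))  # ~5.0e-4, roughly half of the initial rate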
Example #6
    lr_schedule_cb = keras_callbacks.LearningRateScheduler(
        schedule=(
            #         schedule := tomo2seg_schedule.get_schedule00()
            schedule := tomo2seg_schedule.LinSpaceSchedule(
                offset_epoch=0, wait=100, start=initial_lr, stop=initial_lr / 10, n_between=100
            )
        ),
        verbose=2,
    )

    # todo plot schedule

    logger.info(f"{lr_schedule_cb.schedule.range=}")

    callbacks = [
        keras_callbacks.TerminateOnNaN(),
        keras_callbacks.ModelCheckpoint(
            t2s_model.autosaved2_model_path_str,
            monitor="val_loss",
            verbose=1,
            save_best_only=True,
            mode="min",
        ),

        history_cb,
        history_plot_cb,
        lr_schedule_cb,
    ]

    try:
        early_stop_cb
Example #7
def train_model(learning_algorithm, dataset, hidden_layers, batch_dim,
                learning_rate, seed):
    """ function that trains a neural network with tf.keras with automatic differentiation.
    
    Keyword arguments:
    learning_algorithm -- either 'EBP' for error backpropagation (with softmax and cross-entropy loss) or 'BrainProp'
    dataset -- either 'MNIST', 'CIFAR10' or 'CIFAR100'
    hidden_layers -- list of layers for the network (accepts 'Dense(n)', 'Conv2D(n_filters, (ksize_x, ksize_y))' and any other layer with full input)
    batch_dim -- minibatch size
    learning_rate -- learning rate used for training
    seed -- integer, seed used for reproducible results
    """

    save_plots = True

    print("Experiment begins, training on {} with {}".format(
        dataset, learning_algorithm))

    np.random.seed(seed)
    tf.random.set_seed(seed)

    if dataset == 'MNIST':
        (train_images,
         train_labels), (test_images,
                         test_labels) = datasets.mnist.load_data()
        if len(np.shape(train_images)) < 4:
            train_images = tf.expand_dims(train_images, -1).numpy()
            test_images = tf.expand_dims(test_images, -1).numpy()
    elif dataset == 'CIFAR10':
        (train_images,
         train_labels), (test_images,
                         test_labels) = datasets.cifar10.load_data()
    elif dataset == 'CIFAR100':
        (train_images,
         train_labels), (test_images,
                         test_labels) = datasets.cifar100.load_data(
                             label_mode='fine')
    else:
        raise Exception(
            "Unknown dataset. Choose either \'MNIST\', \'CIFAR10\' or \'CIFAR100\'."
        )

    if tf.reduce_max(train_images) > 1:
        train_images = train_images / 255.0
    if tf.reduce_max(test_images) > 1:
        test_images = test_images / 255.0

    image_shape = np.shape(train_images)[1:]
    n_classes = tf.cast(tf.reduce_max(train_labels) + 1, dtype=tf.int32)
    n_batches = len(train_images) // batch_dim

    train_labels = tf.keras.utils.to_categorical(train_labels,
                                                 n_classes,
                                                 dtype='float32')
    test_labels = tf.keras.utils.to_categorical(test_labels,
                                                n_classes,
                                                dtype='float32')

    #preparing architecture and optimizer depending on the selected learning algorithm
    if learning_algorithm == 'EBP':
        output_activation_function = 'softmax'
        loss = 'categorical_crossentropy'
        metric = 'accuracy'
        output_layer = layers.Dense
    elif learning_algorithm == 'BrainProp':
        output_activation_function = 'linear'
        metric = 'accuracy'
        brainprop = import_from_path('brainprop', file_path="brainprop.py")
        loss = brainprop.BrainPropLoss(batch_size=batch_dim,
                                       n_classes=n_classes,
                                       replicas=1)
        output_layer = brainprop.BrainPropLayer
#         if os.path.exists('brainprop.py') != True:
#           ! wget https://github.com/isapome/BrainProp/raw/master/brainprop.py
#         from brainprop import BrainPropLayer, BrainPropLoss
#         loss = BrainPropLoss(batch_size=batch_dim, n_classes=n_classes, replicas=1)
#         output_layer = BrainPropLayer
    else:
        raise Exception(
            "Unknown learning algorithm. Choose between \'EBP\' and \'BrainProp\' "
        )

    optimizer = optimizers.SGD(learning_rate=learning_rate, momentum=0.)

    bias = False
    initializer = tf.random_normal_initializer(mean=0., stddev=0.01)
    regularizer = None
    pad = 'same'

    model = models.Sequential()
    model.add(Input(shape=image_shape))  #input_shape=image_shape

    flatten_layer = 0  #there needs to be a flatten layer between 4dim inputs and dense layers.

    for hidden_layer in hidden_layers:

        if hidden_layer.__class__.__name__ == 'Dense' and flatten_layer < 1:
            model.add(layers.Flatten())
            flatten_layer += 1

        if hidden_layer.__class__.__name__ == 'Conv2D' and flatten_layer > 0:
            raise Exception(
                "Please do not add convolutional layers after dense layers.")

        config = hidden_layer.get_config()
        layer = layers.deserialize({
            'class_name': hidden_layer.__class__.__name__,
            'config': config
        })
        layer.use_bias = bias
        layer.kernel_initializer = initializer
        layer.kernel_regularizer = regularizer
        if hidden_layer.__class__.__name__ == 'Conv2D':
            layer.padding = pad
        model.add(layer)

    last_layer = output_layer(n_classes,
                              activation=output_activation_function,
                              use_bias=bias,
                              kernel_regularizer=regularizer,
                              kernel_initializer=initializer)
    model.add(last_layer)
    model.summary()

    model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

    epochs = 500  #just as upper bound. Early stopping will act much earlier than this.

    lr_schedule = None
    terminate_on_NaN = callbacks.TerminateOnNaN()
    earlystopping = callbacks.EarlyStopping(monitor='val_accuracy',
                                            min_delta=0.001,
                                            patience=10,
                                            verbose=1,
                                            mode='max',
                                            baseline=None,
                                            restore_best_weights=False)
    callbacks_list = list(
        filter(None, [lr_schedule, terminate_on_NaN, earlystopping]))

    tic_training = datetime.datetime.now()
    history = model.fit(train_images,
                        train_labels,
                        batch_size=batch_dim,
                        epochs=epochs,
                        validation_data=(test_images, test_labels),
                        shuffle=True,
                        verbose=2,
                        callbacks=callbacks_list)

    toc_training = datetime.datetime.now()
    elapsed = (toc_training - tic_training).seconds // 60
    print("Training, elapsed: {} minute{}.".format(elapsed,
                                                   's' if elapsed > 1 else ''))

    if save_plots:  # save a plot of the accuracy as a function of the epochs
        filename_plot = get_filename('accuracy.png', dataset,
                                     learning_algorithm)

        n_epochs = len(history.history['accuracy'])

        plt.figure()
        plt.title("{} - {}".format(learning_algorithm, dataset), fontsize=16)
        plt.plot(history.history['accuracy'], label='accuracy', linewidth=2)
        plt.plot(history.history['val_accuracy'],
                 label='validation accuracy',
                 linewidth=2)
        maximum_val_accuracy = np.max(history.history['val_accuracy'])
        argmax_val_accuracy = np.argmax(history.history['val_accuracy'])
        plt.plot([argmax_val_accuracy, argmax_val_accuracy],
                 [-0.4, maximum_val_accuracy],
                 '--',
                 color='green',
                 linewidth=1)
        plt.plot(argmax_val_accuracy,
                 maximum_val_accuracy,
                 'ks',
                 markersize=7,
                 label='maximum = {:.5}'.format(maximum_val_accuracy))
        plt.xticks(list(plt.xticks()[0]) + [argmax_val_accuracy])
        plt.gca().get_xticklabels()[-1].set_color("white")
        plt.gca().get_xticklabels()[-1].set_fontweight('bold')
        plt.gca().get_xticklabels()[-1].set_bbox(
            dict(facecolor='green', edgecolor='white', alpha=0.8))
        plt.xlabel('Epoch', fontsize=14)
        plt.ylabel('Accuracy', fontsize=14)
        plt.xlim([-0.4, (n_epochs - .5)])
        plt.ylim([0.0, 1.05])
        plt.legend(loc='lower right', fontsize=12)
        print("Saving the accuracy plot as \'{}\'".format(filename_plot))
        plt.savefig(filename_plot, dpi=300, bbox_inches='tight')
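A hypothetical call to train_model; the layer list and hyperparameters are illustrative and not taken from the original script:

# Sketch: train a small CNN on MNIST with plain error backpropagation.
from tensorflow.keras import layers

train_model(learning_algorithm='EBP',
            dataset='MNIST',
            hidden_layers=[layers.Conv2D(32, (3, 3)), layers.Dense(128)],
            batch_dim=64,
            learning_rate=0.01,
            seed=42)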
Example #8
# evaluation (if the -l flag was used) / training
if args.load:
    saved_weights = args.load
    model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
    print("Loading weights {}".format(saved_weights))
    model.load_weights(saved_weights)
    
    history = model.evaluate(test_images, test_labels, batch_size=batch_dim, verbose=2)
    
else:
    model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
    
    epochs = 500
    
    lr_schedule = callbacks.LearningRateScheduler(lambda epoch: learning_rate * (0.5 ** (epoch // 100)), verbose=0)
    terminate_on_NaN = callbacks.TerminateOnNaN()
    earlystopping = callbacks.EarlyStopping(monitor='val_accuracy', min_delta=0.001, patience=45, verbose=1, mode='max', baseline=None, restore_best_weights=False)
    callbacks_list = list(filter(None, [lr_schedule, terminate_on_NaN, earlystopping]))
    
    tic_training = datetime.datetime.now()
    history = model.fit(train_images, train_labels, batch_size=batch_dim, epochs=epochs, validation_data=(test_images, test_labels), shuffle=True, verbose=2, callbacks=callbacks_list)

    toc_training = datetime.datetime.now()
    print("Training, elapsed: {} minutes.".format((toc_training - tic_training).seconds//60))


    def get_filename(type):
        """Computes the filename for the outputs of the training 
        (checks whether the file already exists, in that case adds a number to the filename 
        to avoid overriding it)
        
Example #9
                  metrics=['mean_absolute_error'])
    history = model.fit(
        train_loader,
        validation_data=val_loader,
        epochs=args.num_epochs,
        verbose=True,
        shuffle=False,
        callbacks=[
            LRLogger(),
            EpochTimeLogger(),
            cb.LearningRateScheduler(lr_schedule),
            cb.ModelCheckpoint(os.path.join(test_dir, 'best_model.h5'),
                               save_best_only=True),
            cb.EarlyStopping(patience=128, restore_best_weights=True),
            cb.CSVLogger(os.path.join(test_dir, 'train_log.csv')),
            cb.TerminateOnNaN()
        ])

    # Run on the validation set and assess statistics
    y_true = np.hstack([x[1].numpy()[:, 0] for x in iter(test_loader)])
    y_pred = np.squeeze(model.predict(test_loader))

    pd.DataFrame({
        'true': y_true,
        'pred': y_pred
    }).to_csv(os.path.join(test_dir, 'test_results.csv'), index=False)

    with open(os.path.join(test_dir, 'test_summary.json'), 'w') as fp:
        json.dump(
            {
                'r2_score': float(np.corrcoef(y_true, y_pred)[1, 0]**
Example #10
def train(
    train_data,
    val_data,
    test_data,
    model: keras.Model,
    save_dir: pathlib.Path,
    config: Config,
    category_taxonomy: Taxonomy,
    category_names: List[str],
):
    print("Starting training...")
    temporary_log_dir = pathlib.Path(tempfile.mkdtemp())
    print("Temporary log directory: {}".format(temporary_log_dir))

    X_train, y_train = train_data
    X_val, y_val = val_data
    X_test, y_test = test_data

    model.fit(
        X_train,
        y_train,
        batch_size=config.train_config.batch_size,
        epochs=config.train_config.epochs,
        validation_data=(X_val, y_val),
        callbacks=[
            callbacks.TerminateOnNaN(),
            callbacks.ModelCheckpoint(
                filepath=str(save_dir /
                             "weights.{epoch:02d}-{val_loss:.4f}.hdf5"),
                monitor="val_loss",
                save_best_only=True,
            ),
            callbacks.TensorBoard(log_dir=str(temporary_log_dir),
                                  histogram_freq=2),
            callbacks.EarlyStopping(monitor="val_loss", patience=4),
            callbacks.CSVLogger(str(save_dir / "training.csv")),
        ],
    )
    print("Training ended")

    log_dir = save_dir / "logs"
    print("Moving log directory from {} to {}".format(temporary_log_dir,
                                                      log_dir))
    shutil.move(str(temporary_log_dir), str(log_dir))

    model.save(str(save_dir / "last_checkpoint.hdf5"))

    last_checkpoint_path = sorted(save_dir.glob("weights.*.hdf5"))[-1]

    print("Restoring last checkpoint {}".format(last_checkpoint_path))
    model = keras.models.load_model(str(last_checkpoint_path))

    print("Evaluating on validation dataset")
    y_pred_val = model.predict(X_val)
    report, clf_report = evaluation_report(y_val,
                                           y_pred_val,
                                           taxonomy=category_taxonomy,
                                           category_names=category_names)

    save_json(report, save_dir / "metrics_val.json")
    save_json(clf_report, save_dir / "classification_report_val.json")

    y_pred_test = model.predict(X_test)
    report, clf_report = evaluation_report(y_test,
                                           y_pred_test,
                                           taxonomy=category_taxonomy,
                                           category_names=category_names)

    save_json(report, save_dir / "metrics_test.json")
    save_json(clf_report, save_dir / "classification_report_test.json")
Example #11
def train_test_model(args, hparams=None, reporter=None):

    logger.info("setting up devices")
    # allow growth to prevent memory errors
    setup_devices()

    logger.info("setting up callbacks")
    callbacks = []

    # setting up wandb
    if args.wandb_project:
        import wandb
        wandb_run = wandb.init(project=args.wandb_project,
                               config=args,
                               name=args.wandb_name,
                               sync_tensorboard=True)
        callbacks.append(wandb.keras.WandbCallback())

        if args.logdir is None:
            args.logdir = os.path.join(
                "logs", args.wandb_project,
                "%s-%s" % (get_now_timestamp(), str(wandb_run.id)))
            logger.info("Using logdir %s, because None was specified" %
                        args.logdir)

    if args.logdir is None:
        args.logdir = os.path.join("logs", "default", get_now_timestamp())

    logger.info("logdir: %s" % args.logdir)
    if args.delete_logdir and os.path.isdir(args.logdir):
        logger.warning("delting everything in logdir %s" % args.logdir)
        shutil.rmtree(args.logdir)

    os.makedirs(args.logdir, exist_ok=True)

    # write hyperparameters as text summary
    with tf.summary.create_file_writer(os.path.join(args.logdir,
                                                    'train')).as_default():
        hyperparameters = [
            tf.convert_to_tensor([k, str(v)]) for k, v in vars(args).items()
        ]
        tf.summary.text('hyperparameters', tf.stack(hyperparameters), step=0)

    if not args.no_tensorboard:
        callbacks.append(
            kcallbacks.TensorBoard(log_dir=args.logdir,
                                   histogram_freq=0,
                                   write_graph=True,
                                   profile_batch=0,
                                   write_images=False,
                                   write_grads=True,
                                   update_freq=args.tensorboard_update_freq))

    if not args.no_terminate_on_nan:
        callbacks.append(kcallbacks.TerminateOnNaN())

    if not args.no_model_checkpoint:
        callbacks.append(
            kcallbacks.ModelCheckpoint(
                os.path.join(args.logdir, "model-best.h5"),
                monitor=args.model_checkpoint_monitor,  # val_loss default
                verbose=1,
                save_best_only=not args.no_save_best_only,
                period=1))

    if not args.no_early_stopping:
        callbacks.append(
            kcallbacks.EarlyStopping(
                monitor=args.early_stopping_monitor,  # default: val_loss
                mode=args.early_stopping_mode,  # default: min
                min_delta=0,
                patience=args.early_stopping_patience,  # default: 20
                verbose=1))

    if args.reduce_lr_on_plateau:
        callbacks.append(
            kcallbacks.ReduceLROnPlateau(monitor=args.reduce_lr_monitor,
                                         factor=args.reduce_lr_factor,
                                         patience=args.reduce_lr_patience,
                                         min_lr=args.reduce_lr_min_lr,
                                         verbose=1,
                                         mode=args.reduce_lr_mode,
                                         min_delta=args.reduce_lr_min_delta))

    if hparams:
        from tensorboard.plugins.hparams import api as hp
        callbacks.append(hp.KerasCallback(args.logdir, hparams))

    if reporter:
        from ray.tune.integration.keras import TuneReporterCallback
        callbacks.append(TuneReporterCallback(reporter))

    if args.tpu_strategy:
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        strategy = tf.distribute.experimental.TPUStrategy(resolver)

    elif len(args.gpus) == 0:
        strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
    elif len(args.gpus) == 1:
        strategy = tf.distribute.OneDeviceStrategy(device="/gpu:%d" %
                                                   args.gpus[0])
    else:
        strategy = tf.distribute.MirroredStrategy(
            devices=['/gpu:%d' % gpu for gpu in args.gpus])

    global_batch_size = args.batch_size * (len(args.gpus)
                                           if len(args.gpus) > 0 else 1)

    # callbacks.append(kcallbacks.LambdaCallback(on_epoch_end=on_epoch_end))

    assert (args.record_dir is not None or args.dataset is not None
            or args.record_tag is not None or args.directory is not None)

    logger.info("setting up dataset")
    if args.dataset or args.directory:
        if args.dataset and type(args.dataset) == str:
            cache_dir = get_cache_dir(args.data_dir, args.dataset)
            ds = get_dataset_by_name(args.dataset, cache_dir)
        elif args.dataset:
            ds = args.dataset
        else:
            ds = DirectoryDataset(args.directory)
            cache_dir = args.directory

        assert (ds.num_classes > 0), "The dataset must have at least 1 class"
        logger.info("using dataset %s with %d classes" %
                    (ds.__class__.__name__, ds.num_classes))

        if not args.train_on_generator:
            logger.info("writing records")

            record_dir = os.path.join(cache_dir, 'records')
            logger.info("using record dir %s" % record_dir)

            writer = TFWriter(record_dir, options=args.record_options)
            writer.write(ds)
            writer.validate(ds)

        num_classes = ds.num_classes
    elif args.record_dir:
        if not os.path.exists(args.record_dir):
            raise Exception("cannot find record dir %s" % args.record_dir)
        record_dir = args.record_dir
        num_classes = TFReader(record_dir,
                               options=args.record_options).num_classes
    elif args.record_tag:
        record_tag = args.record_tag
        record_dir = os.path.join(args.data_dir, 'downloaded', record_tag)
        download_records(record_tag, record_dir)
        num_classes = TFReader(record_dir,
                               options=args.record_options).num_classes

    if args.size and args.color_mode != ColorMode.NONE:
        input_shape = (args.size[0], args.size[1],
                       3 if args.color_mode == ColorMode.RGB else 1)

    elif args.train_on_generator:
        raise Exception(
            "please specify the 'size' and 'color_mode' argument when training using the generator"
        )
    else:
        input_shape = TFReader(record_dir,
                               options=args.record_options).input_shape
        input_shape = (input_shape[0], input_shape[1],
                       3 if args.color_mode == ColorMode.RGB else 1)

    logger.info("input shape: %s" % str(input_shape))

    # set scale mask based on sigmoid activation
    scale_mask = args.final_activation == 'sigmoid'

    if num_classes != 2 and args.final_activation == 'sigmoid':
        logger.error(
            'do not choose sigmoid as the final activation when the dataset has more than 2 classes'
        )
        return {}

    if args.final_activation == 'sigmoid':
        logger.warning(
            'using only 1 output channel for sigmoid activation function to work'
        )
        num_classes = 1

    logger.info('strategy: %s' % str(strategy))

    # check valid model args
    if args.model in models_by_name:
        valid_model_args = list(
            inspect.signature(models_by_name[args.model]).parameters.keys())

        for key in args.model_args.keys():
            if key not in valid_model_args:
                raise Exception(
                    "invalid model args; cannot find key %s in %s for model of name %s"
                    % (key, str(valid_model_args), args.model))

    logger.info("creating model %s" % args.model)
    with strategy.scope():
        model_args = {'input_shape': input_shape, "num_classes": num_classes}
        model_args.update(args.model_args)

        if isinstance(args.model, str):
            model = get_model_by_name(args.model, model_args)
        elif isinstance(args.model, types.FunctionType):
            model = args.model(**model_args)
        else:
            logger.warning(
                "using own model, please make sure num_classes and input_shape is correct"
            )
            model = args.model

        if not args.no_save_model_weights:
            callbacks.append(
                custom_callbacks.SaveBestWeights(
                    model, os.path.join(args.logdir, 'best-weights.h5')))

        if args.model_weights:
            logger.info("restoring model weights from %s" % args.model_weights)
            model.load_weights(args.model_weights)

        model = Model(model.input,
                      Activation(args.final_activation)(model.output))
        logger.info("output shape: %s" % model.output.shape)
        logger.info("input shape: %s" % model.input.shape)

        # loss and metrics
        loss = get_loss_by_name(args.loss)
        metrics = [get_metric_by_name(name) for name in args.metrics]

        logger.info("metrics: %s" % str(metrics))
        logger.info("loss: %s" % str(loss))

        opt = get_optimizer_by_name(args.optimizer, args.learning_rate)
        model.compile(optimizer=opt, loss=loss,
                      metrics=metrics)  # metrics=losses

    if args.summary:
        model.summary()

    if args.train_on_generator:
        train_ds = convert2tfdataset(ds, DataType.TRAIN)
        val_ds = convert2tfdataset(ds, DataType.VAL)
    else:
        logger.info("using tfreader to read record dir %s" % record_dir)
        reader = TFReader(record_dir, options=args.record_options)
        train_ds = reader.get_dataset(DataType.TRAIN)
        val_ds = reader.get_dataset(DataType.VAL)

    logger.info("building input pipeline")
    # train preprocessing
    train_preprocess_fn = preprocessing_ds.get_preprocess_fn(
        args.size, args.color_mode, args.resize_method, scale_mask=scale_mask)
    train_ds = train_ds.map(train_preprocess_fn,
                            num_parallel_calls=tf.data.experimental.AUTOTUNE)

    augment_fn = None if len(
        args.augmentations) == 0 else preprocessing_ds.get_augment_fn(
            args.size, global_batch_size, methods=args.augmentations)
    train_ds = preprocessing_ds.prepare_dataset(train_ds,
                                                global_batch_size,
                                                buffer_size=args.buffer_size,
                                                augment_fn=augment_fn)

    # val preprocessing
    val_preprocess_fn = preprocessing_ds.get_preprocess_fn(
        args.size, args.color_mode, args.resize_method, scale_mask=scale_mask)
    val_ds = val_ds.map(val_preprocess_fn,
                        num_parallel_calls=tf.data.experimental.AUTOTUNE)
    val_ds = preprocessing_ds.prepare_dataset(val_ds,
                                              global_batch_size,
                                              buffer_size=args.val_buffer_size)

    # log images to tensorboard
    if not args.no_tensorboard:
        if args.tensorboard_train_images_update_batch_freq > 0:
            train_ds_images = convert2tfdataset(
                ds, DataType.TRAIN
            ) if args.train_on_generator else reader.get_dataset(
                DataType.TRAIN)
            train_ds_images = train_ds_images.map(val_preprocess_fn,
                                                  num_parallel_calls=1)
            train_ds_images = preprocessing_ds.prepare_dataset(
                train_ds_images,
                args.num_tensorboard_images,
                buffer_size=10,
                shuffle=True,
                prefetch=False)
            train_prediction_callback = custom_callbacks.BatchPredictionCallback(
                model,
                os.path.join(args.logdir, 'train'),
                train_ds_images,
                scaled_mask=scale_mask,
                binary_threshold=args.binary_threshold,
                update_freq=args.tensorboard_train_images_update_batch_freq)
            callbacks.append(train_prediction_callback)
            train_prediction_callback.on_batch_end(-1, {})

        if args.tensorboard_val_images:
            val_ds_images = convert2tfdataset(
                ds, DataType.VAL
            ) if args.train_on_generator else reader.get_dataset(DataType.VAL)
            val_ds_images = val_ds_images.map(val_preprocess_fn,
                                              num_parallel_calls=1)
            val_ds_images = preprocessing_ds.prepare_dataset(
                val_ds_images,
                args.num_tensorboard_images,
                buffer_size=1,
                shuffle=False,
                prefetch=False,
                take=args.num_tensorboard_images)
            val_prediction_callback = custom_callbacks.EpochPredictionCallback(
                model,
                os.path.join(args.logdir, 'validation'),
                val_ds_images,
                scaled_mask=scale_mask,
                binary_threshold=args.binary_threshold,
                update_freq=args.tensorboard_images_freq)
            callbacks.append(val_prediction_callback)
            val_prediction_callback.on_epoch_end(-1, {})

        if args.tensorboard_test_images:
            test_ds_images = convert2tfdataset(
                ds, DataType.TEST
            ) if args.train_on_generator else reader.get_dataset(DataType.TEST)
            test_ds_images = test_ds_images.map(val_preprocess_fn,
                                                num_parallel_calls=1)
            test_ds_images = preprocessing_ds.prepare_dataset(
                test_ds_images,
                args.num_tensorboard_images,
                buffer_size=1,
                shuffle=False,
                prefetch=False,
                take=args.num_tensorboard_images)
            test_prediction_callback = custom_callbacks.EpochPredictionCallback(
                model,
                os.path.join(args.logdir, 'test'),
                test_ds_images,  # pass the test images (the original mistakenly reused val_ds_images here)
                scaled_mask=scale_mask,
                binary_threshold=args.binary_threshold,
                update_freq=args.tensorboard_images_freq)
            callbacks.append(test_prediction_callback)
            test_prediction_callback.on_epoch_end(-1, {})

    if args.start_tensorboard:
        kill_start_tensorboard(args.logdir, port=args.tensorboard_port)

    if args.steps_per_epoch != -1:
        steps_per_epoch = args.steps_per_epoch
    elif args.train_on_generator:
        steps_per_epoch = ds.num_examples(DataType.TRAIN) // global_batch_size
    else:
        logger.warning(
            "Reading total number of input samples because no steps were specified. This may take a while."
        )
        steps_per_epoch = reader.num_examples(
            DataType.TRAIN) // global_batch_size

    if args.validation_steps != -1:
        validation_steps = args.validation_steps
    elif args.train_on_generator:
        validation_steps = ds.num_examples(DataType.VAL) // global_batch_size
    else:
        logger.warning(
            "Reading total number of input val samples because no val_steps were specified. This may take a while."
        )
        validation_steps = reader.num_examples(
            DataType.VAL) // global_batch_size

    model.fit(train_ds,
              steps_per_epoch=steps_per_epoch,
              validation_data=val_ds,
              validation_steps=validation_steps,
              callbacks=callbacks,
              epochs=args.epochs,
              validation_freq=args.validation_freq)

    results = model.evaluate(val_ds, steps=validation_steps)

    # saved model export
    saved_model_path = os.path.join(args.logdir, 'saved_model',
                                    str(args.saved_model_version))

    if os.path.exists(saved_model_path):
        shutil.rmtree(saved_model_path)

    if not args.no_export_saved_model:
        logger.info("exporting saved model to %s" % saved_model_path)
        model.save(saved_model_path, save_format='tf')

    return results, model
Example #12
def _train_model(model: tf.keras.Model, database: Dict[str, float], num_epochs: int, test_set: Optional[List[str]],
                 batch_size: int = 32, validation_split: float = 0.1, bootstrap: bool = False,
                 random_state: int = 1, learning_rate: float = 1e-3, patience: int = None,
                 timeout: float = None) -> Union[Tuple[List, dict], Tuple[List, dict, List[float]]]:
    """Train a model

    Args:
        model: Model to be trained
        database: Training dataset of molecule mapped to a property
        test_set: Hold-out set. If provided, this function will return predictions on this set
        num_epochs: Maximum number of epochs to run
        batch_size: Number of molecules per training batch
        validation_split: Fraction of molecules used for the training/validation split
        bootstrap: Whether to perform a bootstrap sample of the dataset
        random_state: Seed to the random number generator. Ensures entries do not move between train
            and validation set as the database becomes larger
        learning_rate: Learning rate for the Adam optimizer
        patience: Number of epochs without improvement before terminating training.
        timeout: Maximum training time in seconds
    Returns:
        weights: Updated model weights as numpy arrays
        history: Training history
        test_pred: Predictions on the hold-out set (only when ``test_set`` is provided)
    """
    # Compile the model with a new optimizer
    #  We find that it is best to reset the optimizer before updating
    model.compile(tf.keras.optimizers.Adam(learning_rate=learning_rate), 'mean_squared_error')  # `lr` is a deprecated alias for learning_rate

    # Separate the database into molecules and properties
    smiles, y = zip(*database.items())
    smiles = np.array(smiles)
    y = np.array(y)

    # Make the training and validation splits
    rng = np.random.RandomState(random_state)
    train_split = rng.rand(len(smiles)) > validation_split
    train_X = smiles[train_split]
    train_y = y[train_split]
    valid_X = smiles[~train_split]
    valid_y = y[~train_split]

    # Perform a bootstrap sample of the training data
    if bootstrap:
        sample = rng.choice(len(train_X), size=(len(train_X),), replace=True)
        train_X = train_X[sample]
        train_y = train_y[sample]

    # Make the training data loaders
    train_loader = GraphLoader(train_X, train_y, batch_size=batch_size, shuffle=True)
    val_loader = GraphLoader(valid_X, valid_y, batch_size=batch_size, shuffle=False)

    # Make the callbacks
    final_learn_rate = 1e-6
    init_learn_rate = learning_rate
    decay_rate = (final_learn_rate / init_learn_rate) ** (1. / (num_epochs - 1))

    def lr_schedule(epoch, lr):
        return lr * decay_rate

    if patience is None:
        patience = num_epochs // 8

    early_stopping = cb.EarlyStopping(patience=patience, restore_best_weights=True)
    my_callbacks = [
        LRLogger(),
        EpochTimeLogger(),
        cb.LearningRateScheduler(lr_schedule),
        early_stopping,
        cb.TerminateOnNaN(),
        train_loader  # So the shuffling gets called
    ]
    if timeout is not None:
        my_callbacks += [
            TimeLimitCallback(timeout)
        ]

    # Run the desired number of epochs
    history = model.fit(train_loader, epochs=num_epochs, validation_data=val_loader,
                        verbose=False, shuffle=False, callbacks=my_callbacks)

    # If a timeout is used, make sure we are using the best weights
    #  The training may have exited without storing the best weights
    if timeout is not None:
        model.set_weights(early_stopping.best_weights)

    # Check if there is a NaN loss
    if np.isnan(history.history['loss']).any():
        raise ValueError('Training failed due to a NaN loss.')

    # If provided, evaluate model on test set
    test_pred = None
    if test_set is not None:
        test_pred = evaluate_mpnn([model], test_set, batch_size, cache=False)

    # Convert weights to numpy arrays (avoids mmap issues)
    weights = []
    for v in model.get_weights():
        v = np.array(v)
        if np.isnan(v).any():
            raise ValueError('Found some NaN weights.')
        weights.append(v)

    # Once we are finished training call "clear_session"
    tf.keras.backend.clear_session()
    if test_pred is None:
        return weights, history.history
    else:
        return weights, history.history, test_pred[:, 0].tolist()
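The decay_rate used in _train_model is chosen so that one multiplicative decay per epoch lands exactly on final_learn_rate at the last epoch. A small self-contained check with assumed example values:

# Sketch: decay_rate = (final / init) ** (1 / (num_epochs - 1)) implies
# init * decay_rate ** (num_epochs - 1) == final (up to floating-point error).
init_lr, final_lr, num_epochs = 1e-3, 1e-6, 100
decay_rate = (final_lr / init_lr) ** (1.0 / (num_epochs - 1))
assert abs(init_lr * decay_rate ** (num_epochs - 1) - final_lr) < 1e-12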
Example #13
    model_out_path = os.path.join(
        config.data_settings.model_save_directory,
        'model_held_out{}'.format(config.data_settings.train_year))
    model_out_path += "{epoch:03d}"
    log_out_path = os.path.join(config.data_settings.model_save_directory,
                                'logs/')

    chpt = cbacks.ModelCheckpoint(model_out_path,
                                  save_best_only=False,
                                  verbose=True,
                                  monitor='val_f1',
                                  mode='max')

    tb = cbacks.TensorBoard(log_dir=log_out_path,
                            update_freq=config.data_settings.tb_update_freq)

    nanloss = cbacks.TerminateOnNaN()

    model.fit(
        train,
        steps_per_epoch=config.model_settings.training_steps_per_epoch,
        epochs=config.model_settings.epochs,
        # validation_data=validation,
        callbacks=[chpt, tb, nanloss],
        verbose=config.model_settings.train_verbosity)

    fully_trained_model_path = os.path.join(
        log_out_path, "{}_epochs".format(config.model_settings.epochs))
    model.save(fully_trained_model_path, save_format='tf')
Example #14
def train(args) -> None:
    """Start training based on args input"""
    # Check if GPU is available
    print("\nNum GPUs Available: %d\n"\
          % (len(tf.config.list_physical_devices('GPU'))))

    # Set tf.keras mixed precision to float16
    set_keras_mixed_precision_policy('mixed_float16')

    # Create dataset
    save_svs_file, save_train_file, save_val_file \
            = generate_dataset(args.data_dir_AD, args.data_dir_control,
                               args.patch_size, force_regenerate=False)

    if args.fold_num != 0:  # If using five-fold cross-validation
        save_svs_file, save_train_file, save_val_file \
                = generate_five_fold_dataset(args.data_dir_AD, args.data_dir_control,
                                             args.patch_size, args.fold_num)

    # Load dataset
    train_dataset, val_dataset, class_weight \
            = load_dataset(save_svs_file, save_train_file, save_val_file,
                           args.batch_size)

    # Create network model
    model = get_model(args.model)
    #model.summary(120)
    #print(keras.backend.floatx())

    class_names = ['Background', 'Gray Matter', 'White Matter']
    model.compile(optimizer=optimizers.Adam(),
                  loss=get_loss_func(args.loss_func, class_weight,
                                     gamma=args.focal_loss_gamma),
                  metrics=[metrics.SparseCategoricalAccuracy(),
                           SparseMeanIoU(num_classes=3, name='IoU/Mean'),
                           SparsePixelAccuracy(num_classes=3, name='PixelAcc'),
                           SparseMeanAccuracy(num_classes=3, name='MeanAcc'),
                           SparseFreqIoU(num_classes=3, name='IoU/Freq_weighted'),
                           SparseConfusionMatrix(num_classes=3, name='cm')] \
            + SparseIoU.get_iou_metrics(num_classes=3, class_names=class_names))

    # Create another checkpoint/log folder for model.name and timestamp
    args.ckpt_dir = os.path.join(args.ckpt_dir,
                                 model.name+'-'+args.file_suffix)
    args.log_dir = os.path.join(args.log_dir, 'fit',
                                model.name+'-'+args.file_suffix)
    if args.fold_num != 0:  # If using five-fold cross-validation
        args.ckpt_dir += f'_fold_{args.fold_num}'
        args.log_dir += f'_fold_{args.fold_num}'

    # Check if resume from training
    initial_epoch = 0
    if args.ckpt_filepath is not None:
        if args.ckpt_weights_only:
            if args.ckpt_filepath.endswith('.index'):   # Get rid of the suffix
                args.ckpt_filepath = args.ckpt_filepath.replace('.index', '')
            model.load_weights(args.ckpt_filepath).assert_existing_objects_matched()
            print('Model weights loaded')
        else:
            model = load_whole_model(args.ckpt_filepath)
            print('Whole model (weights + optimizer state) loaded')

        initial_epoch = int(args.ckpt_filepath.split('/')[-1]\
                .split('-')[1])
        # Save in same checkpoint_dir but different log_dir (add current time)
        args.ckpt_dir = os.path.abspath(
            os.path.dirname(args.ckpt_filepath))
        args.log_dir = args.ckpt_dir.replace(
            'checkpoints', 'tf_logs/fit') + f'-retrain_{args.file_suffix}'

    # Write configurations to log_dir
    log_configs(args.log_dir, save_svs_file, train_dataset, val_dataset, args)

    # Create checkpoint directory
    if not os.path.exists(args.ckpt_dir):
        os.makedirs(args.ckpt_dir)
    # Create log directory
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)

    # Create a callback that saves the model's weights every 1 epoch
    if val_dataset:
        ckpt_path = os.path.join(
            args.ckpt_dir, 'cp-{epoch:03d}-{val_IoU/Mean:.4f}.ckpt')
    else:
        ckpt_path = os.path.join(
            args.ckpt_dir, 'cp-{epoch:03d}-{IoU/Mean:.4f}.ckpt')
    cp_callback = callbacks.ModelCheckpoint(
        filepath=ckpt_path,
        verbose=1,
        save_weights_only=args.ckpt_weights_only,
        save_freq='epoch')

    # Create a TensorBoard callback
    tb_callback = callbacks.TensorBoard(
        log_dir=args.log_dir,
        histogram_freq=1,
        write_graph=True,
        write_images=False,
        update_freq='batch',
        profile_batch='100, 120')

    # Create a Lambda callback for plotting confusion matrix
    cm_callback = get_cm_callback(args.log_dir, class_names)

    # Create a TerminateOnNaN callback
    nan_callback = callbacks.TerminateOnNaN()

    # Create an EarlyStopping callback
    if val_dataset:
        es_callback = callbacks.EarlyStopping(monitor='val_IoU/Mean',
                                              min_delta=0.01,
                                              patience=3,
                                              verbose=1,
                                              mode='max')

    if val_dataset:
        model.fit(
            train_dataset,
            epochs=args.num_epochs,
            steps_per_epoch=len(train_dataset) \
                    if args.steps_per_epoch == -1 else args.steps_per_epoch,
            initial_epoch=initial_epoch,
            validation_data=val_dataset,
            validation_steps=len(val_dataset) // args.val_subsplits \
                    if args.val_steps == -1 else args.val_steps,
            callbacks=[cp_callback, tb_callback, nan_callback, cm_callback, es_callback])
    else:
        model.fit(
            train_dataset,
            epochs=args.num_epochs,
            steps_per_epoch=len(train_dataset) \
                    if args.steps_per_epoch == -1 else args.steps_per_epoch,
            initial_epoch=initial_epoch,
            callbacks=[cp_callback, tb_callback, nan_callback, cm_callback])
    # TODO: Switch to tf.data

    print('Training finished!')
Example #15
                 kernel_initializer='lecun_uniform',
                 name='dense_relu3')(x)
x = layers.Dropout(0.2)(x)
#
output = layers.Dense(1,
                      activation='linear',
                      kernel_initializer='lecun_uniform')(x)
model = models.Model(inputs=input1, outputs=output)

model.compile(optimizer=optimizers.Adam(), loss='mae')
model.summary()

my_callbacks = [
    callbacks.EarlyStopping(patience=10, verbose=1),
    #callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, verbose=1),
    callbacks.TerminateOnNaN()
]

# train
history = model.fit(X1_train,
                    Y_train,
                    epochs=500,
                    batch_size=128,
                    verbose=2,
                    validation_data=(X1_val, Y_val),
                    callbacks=my_callbacks)

nameModel = 'EMD_Conv2D_MAE'
#nameModel = 'EMD_Dense_MAPE'
#nameModel = 'EMD_Dense_MAE_AsymmetryLarge_%s' %sys.argv[1]