def get_callbacks(CONF, use_lr_decay=True):
    """
    Get a callback list to feed fit_generator.
    #TODO Use_remote callback needs proper configuration
    #TODO Add ReduceLROnPlateau callback?

    Parameters
    ----------
    CONF : dict
        Configuration dict; reads the 'training' and 'monitor' sections.
    use_lr_decay : bool, optional
        If True, append a step-decay learning-rate scheduler built from
        CONF['training']['lr_step_schedule'] / ['lr_step_decay'].

    Returns
    -------
    List of callbacks, or None when no callback was configured.
    """
    calls = []

    # Add mandatory callbacks
    calls.append(callbacks.TerminateOnNaN())
    calls.append(LRHistory())

    # Add optional callbacks
    if use_lr_decay:
        # Milestones are given as fractions of the total epoch count.
        milestones = np.array(CONF['training']['lr_step_schedule']) * CONF['training']['epochs']
        # FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24;
        # the builtin int is the documented replacement.
        milestones = milestones.astype(int)
        calls.append(LR_scheduler(lr_decay=CONF['training']['lr_step_decay'],
                                  epoch_milestones=milestones.tolist()))

    if CONF['monitor']['use_tensorboard']:
        calls.append(callbacks.TensorBoard(log_dir=paths.get_logs_dir(), write_graph=False))
        # Run Tensorboard on a separate Thread/Process on behalf of the user.
        # Fall back to 6006 when the env var is missing or too short to be a port.
        port = os.getenv('monitorPORT', 6006)
        port = int(port) if len(str(port)) >= 4 else 6006
        subprocess.run(['fuser', '-k', '{}/tcp'.format(port)])  # kill any previous process in that port
        p = Process(target=launch_tensorboard, args=(port,), daemon=True)
        p.start()

    if CONF['monitor']['use_remote']:
        calls.append(callbacks.RemoteMonitor())

    if CONF['training']['use_validation'] and CONF['training']['use_early_stopping']:
        # Patience is 10% of the total epochs.
        calls.append(callbacks.EarlyStopping(patience=int(0.1 * CONF['training']['epochs'])))

    if CONF['training']['ckpt_freq'] is not None:
        # ckpt_freq is a fraction of the run; period is at least one epoch.
        calls.append(callbacks.ModelCheckpoint(
            os.path.join(paths.get_checkpoints_dir(), 'epoch-{epoch:02d}.hdf5'),
            verbose=1,
            period=max(1, int(CONF['training']['ckpt_freq'] * CONF['training']['epochs']))))

    if not calls:
        calls = None
    return calls
def binaryClassification(data, labels, hiddenLayers, lrate, nEpochs, kSplitt=10, rp=0.01, columns=None, plotName=None):
    """Train and K-fold cross-validate a dense binary classifier.

    Parameters
    ----------
    data : ndarray
        Feature matrix (samples x features).
    labels : ndarray
        Binary target vector.
    hiddenLayers : sequence of int
        Units per hidden Dense layer (ReLU, L2-regularized).
    lrate : float
        Learning rate for the Adam optimizer.
    nEpochs : int
        Epochs per fold.
    kSplitt : int, optional
        Number of stratified folds.
    rp : float, optional
        L2 regularization factor.
    columns : sequence of int, optional
        Column subset of `data` to use.
    plotName : str, optional
        When given, save a per-fold accuracy plot under ../data/.

    Raises
    ------
    ValueError
        If the training loss became NaN in any fold.
    """
    if columns is not None:
        data = data[:, columns]
    if kSplitt > 0:
        randomSeed = 0
        if randomSeed != 0:
            kfold = StratifiedKFold(n_splits=kSplitt, shuffle=True, random_state=randomSeed)
        else:
            kfold = StratifiedKFold(n_splits=kSplitt, shuffle=True)
    i = 0
    cvscores = []
    # K-Fold analysis based on https://machinelearningmastery.com/evaluate-performance-deep-learning-models-keras/
    for train, test in kfold.split(data, labels):
        i = i + 1
        ### Define Neuronal Network
        cbks = [callbacks.TerminateOnNaN()]
        layers = [keras.layers.Dense(units, activation=tf.nn.relu,
                                     kernel_regularizer=regularizers.l2(rp))
                  for units in hiddenLayers]
        layers.append(keras.layers.Dense(1, activation=tf.nn.sigmoid))
        model = keras.Sequential(layers)
        # FIX: 'lr' is not a Model.compile() argument, so the requested
        # learning rate was never applied. Pass it to the optimizer instead.
        model.compile(optimizer=tf.train.AdamOptimizer(learning_rate=lrate),
                      loss='binary_crossentropy',
                      metrics=['accuracy'])
        ### Execute model
        history = model.fit(data[train], labels[train], epochs=nEpochs, callbacks=cbks, verbose=0)
        scores = model.evaluate(data[test], labels[test], verbose=0)
        if np.isnan(history.history['loss']).any():
            raise ValueError("Loss was not a number")
        # Needs to be refactored
        if plotName is not None:
            plt.plot(history.history['acc'])
            plt.title('Model accuracy')
            plt.ylabel('Accuracy')
            plt.xlabel('Epoch')
            plt.legend(['Train', 'Test'], loc='upper left')
            plt.savefig("../data/" + plotName + str(i) + ".png")
        print("%s %s: %.2f%%" % (i, model.metrics_names[1], scores[1]*100))
        cvscores.append(scores[1] * 100)
    print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))
def binaryClassification(train_data, train_labels, test_data, test_labels, nEpochs, lrate, layerSize, rp=0.01, columns=None):
    """Train a dense binary classifier on a fixed train/test split.

    Parameters
    ----------
    train_data, train_labels : ndarray
        Training features and binary targets.
    test_data, test_labels : ndarray
        Held-out features and binary targets.
    nEpochs : int
        Number of training epochs.
    lrate : float
        Learning rate for the Adam optimizer.
    layerSize : sequence of int
        Units per hidden Dense layer (ReLU, L2-regularized).
    rp : float, optional
        L2 regularization factor.
    columns : sequence of int, optional
        Column subset applied to both train and test data.

    Returns
    -------
    (test_loss, test_acc) from model.evaluate on the test split.

    Raises
    ------
    ValueError
        If the training loss became NaN.
    """
    if columns is not None:
        train_data = train_data[:, columns]
        test_data = test_data[:, columns]

    ### Define Neuronal Network
    cbks = [callbacks.TerminateOnNaN()]
    layers = [keras.layers.Dense(units, activation=tf.nn.relu,
                                 kernel_regularizer=regularizers.l2(rp))
              for units in layerSize]
    layers.append(keras.layers.Dense(1, activation=tf.nn.sigmoid))
    model = keras.Sequential(layers)
    # FIX: 'lr' is not a Model.compile() argument, so the requested learning
    # rate was never applied. Pass it to the optimizer instead.
    model.compile(optimizer=tf.train.AdamOptimizer(learning_rate=lrate),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    ### Execute model
    history = model.fit(train_data, train_labels, callbacks=cbks, epochs=nEpochs, verbose=0)
    test_loss, test_acc = model.evaluate(test_data, test_labels, verbose=0)
    if np.isnan(history.history['loss']).any():
        raise ValueError("Loss was not a number")

    plt.plot(history.history['acc'])
    plt.title('Model accuracy')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper left')
    return test_loss, test_acc
def callback(model_out, patience, metrics):
    """Assemble the standard training callback list.

    Parameters
    ----------
    model_out : str
        Path where the best checkpoint is written.
    patience : int
        Early-stopping patience (the LR-plateau patience stays fixed at 3).
    metrics : str
        Metric name monitored by all three monitoring callbacks.

    Returns
    -------
    list of keras callbacks: checkpoint, LR-on-plateau, early stopping,
    and a NaN terminator, in that order.
    """
    checkpoint = callbacks.ModelCheckpoint(model_out, save_best_only=True,
                                           monitor=metrics, verbose=1)
    lr_plateau = callbacks.ReduceLROnPlateau(patience=3, factor=0.5,
                                             min_delta=1e-6, monitor=metrics,
                                             verbose=1)
    early_stop = callbacks.EarlyStopping(patience=patience,
                                         restore_best_weights=True,
                                         min_delta=1e-5, monitor=metrics,
                                         verbose=1)
    return [checkpoint, lr_plateau, early_stop, callbacks.TerminateOnNaN()]
def train(self, epochs: int, lr: float, steps_per_epoch: int = 1):
    """
    This function is used to Train the model, it uses Adam Optimizer to
    train, and it saves the weights of every epoch in 'model_weights' dir,
    training steps_per_epoch=1 and val_steps=5 by default.

    You can optionally set the following parameters:
    param: epochs (NO of epochs to train the model)
    param: lr (learning rate for the model)
    param: steps_per_epoch (it defines steps per epoch for training data)
    """
    # Guard clause: only the tiny-YOLOv4 model type is trainable here.
    if self.modelType != 'tinyyolov4':
        raise RuntimeError('Invalid ModelType: Valid Type is YOLOv4')

    self.optimizer = optimizers.Adam(learning_rate=lr)
    self.model.compile(optimizer=self.optimizer,
                       loss_iou_type='ciou',
                       loss_verbose=0)

    def exponential_decay(epoch, current_lr):
        # Multiply the current LR by exp(-0.1) once per epoch.
        return current_lr * tf.math.exp(-0.1)

    training_callbacks = [
        callbacks.LearningRateScheduler(exponential_decay, verbose=1),
        callbacks.TerminateOnNaN(),
        callbacks.TensorBoard(histogram_freq=1, log_dir="./logs"),
        SaveWeightsCallback(yolo=self.model,
                            dir_path="./model_weights",
                            weights_type="yolo",
                            epoch_per_save=1),
    ]

    self.model.fit(self.train_dataset,
                   epochs=epochs,
                   callbacks=training_callbacks,
                   validation_data=self.val_dataset,
                   validation_steps=self.val_steps,
                   steps_per_epoch=steps_per_epoch)
lr_schedule_cb = keras_callbacks.LearningRateScheduler( schedule=( # schedule := tomo2seg_schedule.get_schedule00() schedule := tomo2seg_schedule.LinSpaceSchedule( offset_epoch=0, wait=100, start=initial_lr, stop=initial_lr / 10, n_between=100 ) ), verbose=2, ) # todo plot schedule logger.info(f"{lr_schedule_cb.schedule.range=}") callbacks = [ keras_callbacks.TerminateOnNaN(), keras_callbacks.ModelCheckpoint( t2s_model.autosaved2_model_path_str, monitor="val_loss", verbose=1, save_best_only=True, mode="min", ), history_cb, history_plot_cb, lr_schedule_cb, ] try: early_stop_cb
def train_model(learning_algorithm, dataset, hidden_layers, batch_dim, learning_rate, seed):
    """Train a neural network with tf.keras with automatic differentiation.

    Keyword arguments:
    learning_algorithm -- either 'EBP' for error backpropagation (with softmax
        and cross-entropy loss) or 'BrainProp'
    dataset -- either 'MNIST', 'CIFAR10' or 'CIFAR100'
    hidden_layers -- list of layers for the network (accepts 'Dense(n)',
        'Conv2D(n_filters, (ksize_x,ksize_y)' and any other layer with full input)
    batch_dim -- minibatch size
    learning_rate -- learning rate used for training
    seed -- integer, seed used for reproducible results
    """
    save_plots = True
    print("Experiment begins, training on {} with {}".format(
        dataset, learning_algorithm))
    # Seed both NumPy and TF for reproducible runs.
    np.random.seed(seed)
    tf.random.set_seed(seed)

    # ---- Load the requested dataset ----
    if dataset == 'MNIST':
        (train_images, train_labels), (test_images, test_labels) = datasets.mnist.load_data()
        # MNIST comes without a channel axis; add one so Conv2D layers work.
        if len(np.shape(train_images)) < 4:
            train_images = tf.expand_dims(train_images, -1).numpy()
            test_images = tf.expand_dims(test_images, -1).numpy()
    elif dataset == 'CIFAR10':
        (train_images, train_labels), (test_images, test_labels) = datasets.cifar10.load_data()
    elif dataset == 'CIFAR100':
        (train_images, train_labels), (test_images, test_labels) = datasets.cifar100.load_data(
            label_mode='fine')
    else:
        raise Exception(
            "Unknown dataset. Choose either \'MNIST\', \'CIFAR10\' or \'CIFAR100\'."
        )

    # Scale pixel values into [0, 1] only if they are still in byte range.
    if tf.reduce_max(train_images) > 1:
        train_images = train_images / 255.0
    if tf.reduce_max(test_images) > 1:
        test_images = test_images / 255.0

    image_shape = np.shape(train_images)[1:]
    # Infer class count from the largest label value.
    n_classes = tf.cast(tf.reduce_max(train_labels) + 1, dtype=tf.int32)
    n_batches = len(train_images) // batch_dim  # NOTE(review): computed but unused below
    # One-hot encode the integer labels.
    train_labels = tf.keras.utils.to_categorical(train_labels, n_classes, dtype='float32')
    test_labels = tf.keras.utils.to_categorical(test_labels, n_classes, dtype='float32')

    # preparing architecture and optimizer depending on the selected learning algorithm
    if learning_algorithm == 'EBP':
        output_activation_function = 'softmax'
        loss = 'categorical_crossentropy'
        metric = 'accuracy'
        output_layer = layers.Dense
    elif learning_algorithm == 'BrainProp':
        # BrainProp uses a linear output with its own loss and output layer,
        # loaded dynamically from the sibling brainprop.py module.
        output_activation_function = 'linear'
        metric = 'accuracy'
        brainprop = import_from_path('brainprop', file_path="brainprop.py")
        loss = brainprop.BrainPropLoss(batch_size=batch_dim, n_classes=n_classes, replicas=1)
        output_layer = brainprop.BrainPropLayer
    else:
        raise Exception(
            "Unknown learning algorithm. Choose between \'EBP\' and \'BrainProp\' "
        )

    optimizer = optimizers.SGD(learning_rate=learning_rate, momentum=0.)
    # Shared layer settings: no biases, small Gaussian init, no regularizer.
    bias = False
    initializer = tf.random_normal_initializer(mean=0., stddev=0.01)
    regularizer = None
    pad = 'same'

    model = models.Sequential()
    model.add(Input(shape=image_shape))  # input_shape=image_shape
    flatten_layer = 0  # there needs to be a flatten layer between 4dim inputs and dense layers.
    for hidden_layer in hidden_layers:
        # Insert a single Flatten before the first Dense layer.
        if hidden_layer.__class__.__name__ == 'Dense' and flatten_layer < 1:
            model.add(layers.Flatten())
            flatten_layer += 1
        if hidden_layer.__class__.__name__ == 'Conv2D' and flatten_layer > 0:
            raise Exception(
                "Please do not add convolutional layers after dense layers.")
        # Clone the user-supplied layer, then override init/bias/regularizer
        # so every layer uses the shared settings above.
        config = hidden_layer.get_config()
        layer = layers.deserialize({
            'class_name': hidden_layer.__class__.__name__,
            'config': config
        })
        layer.use_bias = bias
        layer.kernel_initializer = initializer
        layer.kernel_regularizer = regularizer
        if hidden_layer.__class__.__name__ == 'Conv2D':
            layer.padding = pad
        model.add(layer)

    last_layer = output_layer(n_classes,
                              activation=output_activation_function,
                              use_bias=bias,
                              kernel_regularizer=regularizer,
                              kernel_initializer=initializer)
    model.add(last_layer)
    model.summary()

    model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
    epochs = 500  # just as upper bound. Early stopping will act much earlier than this.
    lr_schedule = None
    terminate_on_NaN = callbacks.TerminateOnNaN()
    earlystopping = callbacks.EarlyStopping(monitor='val_accuracy',
                                            min_delta=0.001,
                                            patience=10,
                                            verbose=1,
                                            mode='max',
                                            baseline=None,
                                            restore_best_weights=False)
    # filter(None, ...) drops the unused lr_schedule placeholder.
    callbacks_list = list(
        filter(None, [lr_schedule, terminate_on_NaN, earlystopping]))

    tic_training = datetime.datetime.now()
    history = model.fit(train_images,
                        train_labels,
                        batch_size=batch_dim,
                        epochs=epochs,
                        validation_data=(test_images, test_labels),
                        shuffle=True,
                        verbose=2,
                        callbacks=callbacks_list)
    toc_training = datetime.datetime.now()
    # NOTE(review): .seconds ignores full days of a timedelta; fine for
    # runs under 24h, otherwise use .total_seconds().
    elapsed = (toc_training - tic_training).seconds // 60
    print("Training, elapsed: {} minute{}.".format(elapsed,
                                                   's' if elapsed > 1 else ''))

    if save_plots == True:
        # save a plot of the accuracy as a function of the epochs
        filename_plot = get_filename('accuracy.png', dataset, learning_algorithm)
        n_epochs = len(history.history['accuracy'])
        plt.figure()
        plt.title("{} - {}".format(learning_algorithm, dataset), fontsize=16)
        plt.plot(history.history['accuracy'],
                 label='accuracy',
                 linewidth=2)
        plt.plot(history.history['val_accuracy'],
                 label='validation accuracy',
                 linewidth=2)
        # Mark the best validation accuracy with a dashed drop-line and a square.
        maximum_val_accuracy = np.max(history.history['val_accuracy'])
        argmax_val_accuracy = np.argmax(history.history['val_accuracy'])
        plt.plot([argmax_val_accuracy, argmax_val_accuracy],
                 [-0.4, maximum_val_accuracy],
                 '--',
                 color='green',
                 linewidth=1)
        plt.plot(argmax_val_accuracy,
                 maximum_val_accuracy,
                 'ks',
                 markersize=7,
                 label='maximum = {:.5}'.format(maximum_val_accuracy))
        # Add an extra highlighted tick at the argmax epoch.
        plt.xticks(list(plt.xticks()[0]) + [argmax_val_accuracy])
        plt.gca().get_xticklabels()[-1].set_color("white")
        plt.gca().get_xticklabels()[-1].set_fontweight('bold')
        plt.gca().get_xticklabels()[-1].set_bbox(
            dict(facecolor='green', edgecolor='white', alpha=0.8))
        plt.xlabel('Epoch', fontsize=14)
        plt.ylabel('Accuracy', fontsize=14)
        plt.xlim([-0.4, (n_epochs - .5)])
        plt.ylim([0.0, 1.05])
        plt.legend(loc='lower right', fontsize=12)
        print("Saving the accuracy plot as \'{}\'".format(filename_plot))
        plt.savefig(filename_plot, dpi=300, bbox_inches='tight')
#evaluation (if the flag -l was used)/training if args.load: saved_weights = args.load model.compile(optimizer=optimizer, loss=loss, metrics=[metric]) print("Loading weights {}".format(saved_weights)) model.load_weights(saved_weights) history = model.evaluate(test_images, test_labels, batch_size=batch_dim, verbose=2) else: model.compile(optimizer=optimizer, loss=loss, metrics=[metric]) epochs = 500 lr_schedule = callbacks.LearningRateScheduler(lambda epoch: learning_rate * (0.5 ** (epoch // 100)), verbose=0) terminate_on_NaN = callbacks.TerminateOnNaN() earlystopping = callbacks.EarlyStopping(monitor='val_accuracy', min_delta=0.001, patience=45, verbose=1, mode='max', baseline=None, restore_best_weights=False) callbacks_list = list(filter(None, [lr_schedule, terminate_on_NaN, earlystopping])) tic_training = datetime.datetime.now() history = model.fit(train_images, train_labels, batch_size=batch_dim, epochs=epochs, validation_data=(test_images, test_labels), shuffle=True, verbose=2, callbacks=callbacks_list) toc_training = datetime.datetime.now() print("Training, elapsed: {} minutes.".format((toc_training - tic_training).seconds//60)) def get_filename(type): """Computes the filename for the outputs of the training (checks whether the file already exists, in that case adds a number to the filename to avoid overriding it)
metrics=['mean_absolute_error']) history = model.fit( train_loader, validation_data=val_loader, epochs=args.num_epochs, verbose=True, shuffle=False, callbacks=[ LRLogger(), EpochTimeLogger(), cb.LearningRateScheduler(lr_schedule), cb.ModelCheckpoint(os.path.join(test_dir, 'best_model.h5'), save_best_only=True), cb.EarlyStopping(patience=128, restore_best_weights=True), cb.CSVLogger(os.path.join(test_dir, 'train_log.csv')), cb.TerminateOnNaN() ]) # Run on the validation set and assess statistics y_true = np.hstack([x[1].numpy()[:, 0] for x in iter(test_loader)]) y_pred = np.squeeze(model.predict(test_loader)) pd.DataFrame({ 'true': y_true, 'pred': y_pred }).to_csv(os.path.join(test_dir, 'test_results.csv'), index=False) with open(os.path.join(test_dir, 'test_summary.json'), 'w') as fp: json.dump( { 'r2_score': float(np.corrcoef(y_true, y_pred)[1, 0]**
def train(
    train_data,
    val_data,
    test_data,
    model: keras.Model,
    save_dir: pathlib.Path,
    config: Config,
    category_taxonomy: Taxonomy,
    category_names: List[str],
):
    """Train `model`, checkpoint the best weights, and write evaluation reports.

    Each of train_data/val_data/test_data is an (X, y) pair. Checkpoints,
    training CSV log, TensorBoard logs and the metric/classification JSON
    reports are all written under `save_dir`.
    """
    print("Starting training...")
    # TensorBoard writes to a temp dir first; it is moved into save_dir after
    # training so a failed run does not leave partial logs behind.
    temporary_log_dir = pathlib.Path(tempfile.mkdtemp())
    print("Temporary log directory: {}".format(temporary_log_dir))

    X_train, y_train = train_data
    X_val, y_val = val_data
    X_test, y_test = test_data

    model.fit(
        X_train,
        y_train,
        batch_size=config.train_config.batch_size,
        epochs=config.train_config.epochs,
        validation_data=(X_val, y_val),
        callbacks=[
            callbacks.TerminateOnNaN(),
            # Keep only the best-val_loss checkpoint per epoch pattern.
            callbacks.ModelCheckpoint(
                filepath=str(save_dir / "weights.{epoch:02d}-{val_loss:.4f}.hdf5"),
                monitor="val_loss",
                save_best_only=True,
            ),
            callbacks.TensorBoard(log_dir=str(temporary_log_dir), histogram_freq=2),
            callbacks.EarlyStopping(monitor="val_loss", patience=4),
            callbacks.CSVLogger(str(save_dir / "training.csv")),
        ],
    )
    print("Training ended")

    log_dir = save_dir / "logs"
    print("Moving log directory from {} to {}".format(temporary_log_dir, log_dir))
    shutil.move(str(temporary_log_dir), str(log_dir))

    model.save(str(save_dir / "last_checkpoint.hdf5"))
    # Lexicographic sort of "weights.EE-LLLL.hdf5" picks the latest epoch.
    last_checkpoint_path = sorted(save_dir.glob("weights.*.hdf5"))[-1]
    print("Restoring last checkpoint {}".format(last_checkpoint_path))
    model = keras.models.load_model(str(last_checkpoint_path))

    print("Evaluating on validation dataset")
    y_pred_val = model.predict(X_val)
    report, clf_report = evaluation_report(y_val,
                                           y_pred_val,
                                           taxonomy=category_taxonomy,
                                           category_names=category_names)
    save_json(report, save_dir / "metrics_val.json")
    save_json(clf_report, save_dir / "classification_report_val.json")

    # Same report pair for the held-out test split.
    y_pred_test = model.predict(X_test)
    report, clf_report = evaluation_report(y_test,
                                           y_pred_test,
                                           taxonomy=category_taxonomy,
                                           category_names=category_names)
    save_json(report, save_dir / "metrics_test.json")
    save_json(clf_report, save_dir / "classification_report_test.json")
def train_test_model(args, hparams=None, reporter=None):
    """Full training pipeline driven by an `args` namespace.

    Sets up devices, callbacks (wandb/TensorBoard/checkpoint/early-stop/LR),
    a distribution strategy, the dataset/record input pipeline, builds and
    compiles the model, trains, evaluates on the validation set and optionally
    exports a SavedModel.

    Returns (results, model): evaluation results on val_ds and the trained model.
    NOTE(review): one early branch returns a bare {} instead of this pair —
    callers must handle both shapes.
    """
    logger.info("setting up devices")
    # allow growth to precent memory errors
    setup_devices()

    logger.info("setting up callbacks")
    callbacks = []
    # setting up wandb (optional experiment tracking)
    if args.wandb_project:
        import wandb
        wandb_run = wandb.init(project=args.wandb_project,
                               config=args,
                               name=args.wandb_name,
                               sync_tensorboard=True)
        callbacks.append(wandb.keras.WandbCallback())
        if args.logdir is None:
            # Derive a unique logdir from the timestamp and the wandb run id.
            args.logdir = os.path.join(
                "logs", args.wandb_project,
                "%s-%s" % (get_now_timestamp(), str(wandb_run.id)))
            logger.info("Using logdir %s, because None was specified" % args.logdir)

    if args.logdir is None:
        args.logdir = os.path.join("logs", "default", get_now_timestamp())

    logger.info("logdir: %s" % args.logdir)

    if args.delete_logdir and os.path.isdir(args.logdir):
        logger.warning("delting everything in logdir %s" % args.logdir)
        shutil.rmtree(args.logdir)

    os.makedirs(args.logdir, exist_ok=True)

    # write hyperparameters as text summary
    with tf.summary.create_file_writer(os.path.join(args.logdir, 'train')).as_default():
        hyperparameters = [
            tf.convert_to_tensor([k, str(v)]) for k, v in vars(args).items()
        ]
        tf.summary.text('hyperparameters', tf.stack(hyperparameters), step=0)

    # ---- Standard keras callbacks, each gated by a --no_* flag ----
    if not args.no_tensorboard:
        callbacks.append(
            kcallbacks.TensorBoard(log_dir=args.logdir,
                                   histogram_freq=0,
                                   write_graph=True,
                                   profile_batch=0,
                                   write_images=False,
                                   write_grads=True,
                                   update_freq=args.tensorboard_update_freq))

    if not args.no_terminate_on_nan:
        callbacks.append(kcallbacks.TerminateOnNaN())

    if not args.no_model_checkpoint:
        callbacks.append(
            kcallbacks.ModelCheckpoint(
                os.path.join(args.logdir, "model-best.h5"),
                monitor=args.model_checkpoint_monitor,  # val_loss default
                verbose=1,
                save_best_only=not args.no_save_best_only,
                period=1))

    if not args.no_early_stopping:
        callbacks.append(
            kcallbacks.EarlyStopping(
                monitor=args.early_stopping_monitor,  # default: val_loss
                mode=args.early_stopping_mode,  # default: min
                min_delta=0,
                patience=args.early_stopping_patience,  # default: 20
                verbose=1))

    if args.reduce_lr_on_plateau:
        callbacks.append(
            kcallbacks.ReduceLROnPlateau(monitor=args.reduce_lr_monitor,
                                         factor=args.reduce_lr_factor,
                                         patience=args.reduce_lr_patience,
                                         min_lr=args.reduce_lr_min_lr,
                                         verbose=1,
                                         mode=args.reduce_lr_mode,
                                         min_delta=args.reduce_lr_min_delta))

    # Optional hparams / ray-tune integrations (lazy imports).
    if hparams:
        from tensorboard.plugins.hparams import api as hp
        callbacks.append(hp.KerasCallback(args.logdir, hparams))

    if reporter:
        from ray.tune.integration.keras import TuneReporterCallback
        callbacks.append(TuneReporterCallback(reporter))

    # ---- Distribution strategy: TPU, single CPU/GPU, or mirrored multi-GPU ----
    if args.tpu_strategy:
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        strategy = tf.distribute.experimental.TPUStrategy(resolver)
    elif len(args.gpus) == 0:
        strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
    elif len(args.gpus) == 1:
        strategy = tf.distribute.OneDeviceStrategy(device="/gpu:%d" % args.gpus[0])
    else:
        strategy = tf.distribute.MirroredStrategy(
            devices=['/gpu:%d' % gpu for gpu in args.gpus])

    # Global batch scales with the number of GPUs.
    global_batch_size = args.batch_size * (len(args.gpus)
                                           if len(args.gpus) > 0 else 1)

    # One of these input sources must be provided.
    assert (args.record_dir is not None or args.dataset is not None
            or args.record_tag is not None or args.directory is not None)

    logger.info("setting up dataset")
    # ---- Resolve the dataset / record directory ----
    if args.dataset or args.directory:
        if args.dataset and type(args.dataset) == str:
            cache_dir = get_cache_dir(args.data_dir, args.dataset)
            ds = get_dataset_by_name(args.dataset, cache_dir)
        elif args.dataset:
            ds = args.dataset
        else:
            ds = DirectoryDataset(args.directory)
            cache_dir = args.directory

        assert (ds.num_classes > 0), "The dataset must have at least 1 class"
        logger.info("using dataset %s with %d classes" %
                    (ds.__class__.__name__, ds.num_classes))

        if not args.train_on_generator:
            # Materialize the dataset as TFRecords once, then train from them.
            logger.info("writing records")
            record_dir = os.path.join(cache_dir, 'records')
            logger.info("using record dir %s" % record_dir)
            writer = TFWriter(record_dir, options=args.record_options)
            writer.write(ds)
            writer.validate(ds)

        num_classes = ds.num_classes
    elif args.record_dir:
        if not os.path.exists(args.record_dir):
            raise Exception("cannot find record dir %s" % args.record_dir)
        record_dir = args.record_dir
        num_classes = TFReader(record_dir, options=args.record_options).num_classes
    elif args.record_tag:
        record_tag = args.record_tag
        record_dir = os.path.join(args.data_dir, 'downloaded', record_tag)
        download_records(record_tag, record_dir)
        num_classes = TFReader(record_dir, options=args.record_options).num_classes

    # ---- Determine the model input shape ----
    if args.size and args.color_mode != ColorMode.NONE:
        input_shape = (args.size[0], args.size[1],
                       3 if args.color_mode == ColorMode.RGB else 1)
    elif args.train_on_generator:
        raise Exception(
            "please specify the 'size' and 'color_mode' argument when training using the generator"
        )
    else:
        input_shape = TFReader(record_dir, options=args.record_options).input_shape
        input_shape = (input_shape[0], input_shape[1],
                       3 if args.color_mode == ColorMode.RGB else 1)

    logger.info("input shape: %s" % str(input_shape))

    # set scale mask based on sigmoid activation
    scale_mask = args.final_activation == 'sigmoid'

    if num_classes != 2 and args.final_activation == 'sigmoid':
        logger.error(
            'do not choose sigmoid as the final activation when the dataset has more than 2 classes'
        )
        return {}

    if args.final_activation == 'sigmoid':
        logger.warning(
            'using only 1 output channel for sigmoid activation function to work'
        )
        num_classes = 1

    logger.info('strategy: %s' % str(strategy))

    # check valid model args against the model factory's signature
    if args.model in models_by_name:
        valid_model_args = list(
            inspect.signature(models_by_name[args.model]).parameters.keys())
        for key in args.model_args.keys():
            if key not in valid_model_args:
                raise Exception(
                    "invalid model args; cannot find key %s in %s for model of name %s"
                    % (key, str(valid_model_args), args.model))

    logger.info("creating model %s" % args.model)
    with strategy.scope():
        model_args = {'input_shape': input_shape, "num_classes": num_classes}
        model_args.update(args.model_args)

        # args.model may be a registered name, a factory function, or a model.
        if isinstance(args.model, str):
            model = get_model_by_name(args.model, model_args)
        elif isinstance(args.model, types.FunctionType):
            model = args.model(**model_args)
        else:
            logger.warning(
                "using own model, please make sure num_classes and input_shape is correct"
            )
            model = args.model

        if not args.no_save_model_weights:
            callbacks.append(
                custom_callbacks.SaveBestWeights(
                    model, os.path.join(args.logdir, 'best-weights.h5')))

        if args.model_weights:
            logger.info("restoring model weights from %s" % args.model_weights)
            model.load_weights(args.model_weights)

        # Append the final activation on top of the raw model output.
        model = Model(model.input, Activation(args.final_activation)(model.output))
        logger.info("output shape: %s" % model.output.shape)
        logger.info("input shape: %s" % model.input.shape)

        # loss and metrics
        loss = get_loss_by_name(args.loss)
        metrics = [get_metric_by_name(name) for name in args.metrics]
        logger.info("metrics: %s" % str(metrics))
        logger.info("loss: %s" % str(loss))

        opt = get_optimizer_by_name(args.optimizer, args.learning_rate)
        model.compile(optimizer=opt, loss=loss, metrics=metrics)  # metrics=losses

    if args.summary:
        model.summary()

    # ---- Build train/val datasets either from the generator or the records ----
    if args.train_on_generator:
        train_ds = convert2tfdataset(ds, DataType.TRAIN)
        val_ds = convert2tfdataset(ds, DataType.VAL)
    else:
        logger.info("using tfreader to read record dir %s" % record_dir)
        reader = TFReader(record_dir, options=args.record_options)
        train_ds = reader.get_dataset(DataType.TRAIN)
        val_ds = reader.get_dataset(DataType.VAL)

    logger.info("building input pipeline")
    # train preprocessing
    train_preprocess_fn = preprocessing_ds.get_preprocess_fn(
        args.size, args.color_mode, args.resize_method, scale_mask=scale_mask)
    train_ds = train_ds.map(train_preprocess_fn,
                            num_parallel_calls=tf.data.experimental.AUTOTUNE)
    augment_fn = None if len(
        args.augmentations) == 0 else preprocessing_ds.get_augment_fn(
            args.size, global_batch_size, methods=args.augmentations)
    train_ds = preprocessing_ds.prepare_dataset(train_ds,
                                                global_batch_size,
                                                buffer_size=args.buffer_size,
                                                augment_fn=augment_fn)

    # val preprocessing (no augmentation)
    val_preprocess_fn = preprocessing_ds.get_preprocess_fn(
        args.size, args.color_mode, args.resize_method, scale_mask=scale_mask)
    val_ds = val_ds.map(val_preprocess_fn,
                        num_parallel_calls=tf.data.experimental.AUTOTUNE)
    val_ds = preprocessing_ds.prepare_dataset(val_ds,
                                              global_batch_size,
                                              buffer_size=args.val_buffer_size)

    # log images to tensorboard
    if not args.no_tensorboard:
        if args.tensorboard_train_images_update_batch_freq > 0:
            train_ds_images = convert2tfdataset(
                ds, DataType.TRAIN
            ) if args.train_on_generator else reader.get_dataset(DataType.TRAIN)
            train_ds_images = train_ds_images.map(val_preprocess_fn,
                                                  num_parallel_calls=1)
            train_ds_images = preprocessing_ds.prepare_dataset(
                train_ds_images,
                args.num_tensorboard_images,
                buffer_size=10,
                shuffle=True,
                prefetch=False)
            train_prediction_callback = custom_callbacks.BatchPredictionCallback(
                model,
                os.path.join(args.logdir, 'train'),
                train_ds_images,
                scaled_mask=scale_mask,
                binary_threshold=args.binary_threshold,
                update_freq=args.tensorboard_train_images_update_batch_freq)
            callbacks.append(train_prediction_callback)
            # Log one batch up front before training starts.
            train_prediction_callback.on_batch_end(-1, {})

        if args.tensorboard_val_images:
            val_ds_images = convert2tfdataset(
                ds, DataType.VAL
            ) if args.train_on_generator else reader.get_dataset(DataType.VAL)
            val_ds_images = val_ds_images.map(val_preprocess_fn,
                                              num_parallel_calls=1)
            val_ds_images = preprocessing_ds.prepare_dataset(
                val_ds_images,
                args.num_tensorboard_images,
                buffer_size=1,
                shuffle=False,
                prefetch=False,
                take=args.num_tensorboard_images)
            val_prediction_callback = custom_callbacks.EpochPredictionCallback(
                model,
                os.path.join(args.logdir, 'validation'),
                val_ds_images,
                scaled_mask=scale_mask,
                binary_threshold=args.binary_threshold,
                update_freq=args.tensorboard_images_freq)
            callbacks.append(val_prediction_callback)
            val_prediction_callback.on_epoch_end(-1, {})

        if args.tensorboard_test_images:
            test_ds_images = convert2tfdataset(
                ds, DataType.TEST
            ) if args.train_on_generator else reader.get_dataset(DataType.TEST)
            test_ds_images = test_ds_images.map(val_preprocess_fn,
                                                num_parallel_calls=1)
            test_ds_images = preprocessing_ds.prepare_dataset(
                test_ds_images,
                args.num_tensorboard_images,
                buffer_size=1,
                shuffle=False,
                prefetch=False,
                take=args.num_tensorboard_images)
            # NOTE(review): this callback is handed val_ds_images, not the
            # test_ds_images built just above — looks like a copy-paste slip;
            # confirm intent before changing.
            test_prediction_callback = custom_callbacks.EpochPredictionCallback(
                model,
                os.path.join(args.logdir, 'test'),
                val_ds_images,
                scaled_mask=scale_mask,
                binary_threshold=args.binary_threshold,
                update_freq=args.tensorboard_images_freq)
            callbacks.append(test_prediction_callback)
            test_prediction_callback.on_epoch_end(-1, {})

    if args.start_tensorboard:
        kill_start_tensorboard(args.logdir, port=args.tensorboard_port)

    # ---- Steps per epoch / validation steps: explicit, from ds, or counted ----
    if args.steps_per_epoch != -1:
        steps_per_epoch = args.steps_per_epoch
    elif args.train_on_generator:
        steps_per_epoch = ds.num_examples(DataType.TRAIN) // global_batch_size
    else:
        logger.warning(
            "Reading total number of input samples, cause no steps were specifed. This may take a while."
        )
        steps_per_epoch = reader.num_examples(
            DataType.TRAIN) // global_batch_size

    if args.validation_steps != -1:
        validation_steps = args.validation_steps
    elif args.train_on_generator:
        validation_steps = ds.num_examples(DataType.VAL) // global_batch_size
    else:
        logger.warning(
            "Reading total number of input val samples, cause no val_steps were specifed. This may take a while."
        )
        validation_steps = reader.num_examples(
            DataType.VAL) // global_batch_size

    model.fit(train_ds,
              steps_per_epoch=steps_per_epoch,
              validation_data=val_ds,
              validation_steps=validation_steps,
              callbacks=callbacks,
              epochs=args.epochs,
              validation_freq=args.validation_freq)

    results = model.evaluate(val_ds, steps=validation_steps)

    # saved model export
    saved_model_path = os.path.join(args.logdir, 'saved_model',
                                    str(args.saved_model_version))
    if os.path.exists(saved_model_path):
        shutil.rmtree(saved_model_path)

    if not args.no_export_saved_model:
        logger.info("exporting saved model to %s" % saved_model_path)
        model.save(saved_model_path, save_format='tf')

    return results, model
def _train_model(model: tf.keras.Model, database: Dict[str, float], num_epochs: int,
                 test_set: Optional[List[str]], batch_size: int = 32,
                 validation_split: float = 0.1, bootstrap: bool = False,
                 random_state: int = 1, learning_rate: float = 1e-3,
                 patience: Optional[int] = None,
                 timeout: Optional[float] = None) \
        -> Union[Tuple[List, dict], Tuple[List, dict, List[float]]]:
    """Train a model

    Args:
        model: Model to be trained
        database: Training dataset of molecule mapped to a property
        num_epochs: Maximum number of epochs to run
        test_set: Hold-out set. If provided, this function will return predictions on this set
        batch_size: Number of molecules per training batch
        validation_split: Fraction of molecules used for the training/validation split
        bootstrap: Whether to perform a bootstrap sample of the dataset
        random_state: Seed to the random number generator. Ensures entries do not move between
            train and validation set as the database becomes larger
        learning_rate: Learning rate for the Adam optimizer
        patience: Number of epochs without improvement before terminating training.
            Defaults to ``num_epochs // 8`` when not given.
        timeout: Maximum training time in seconds
    Returns:
        model: Updated weights (as plain numpy arrays)
        history: Training history
        test_pred: Predictions on ``test_set`` (only when ``test_set`` is provided)
    Raises:
        ValueError: If the training loss or any resulting weight is NaN
    """

    # Compile the model with a new optimizer
    #  We find that it is best to reset the optimizer before updating
    model.compile(tf.keras.optimizers.Adam(lr=learning_rate), 'mean_squared_error')

    # Separate the database into molecules and properties
    smiles, y = zip(*database.items())
    smiles = np.array(smiles)
    y = np.array(y)

    # Make the training and validation splits.
    # Seeding the RNG keeps each molecule on the same side of the split
    # between calls, even as the database grows.
    rng = np.random.RandomState(random_state)
    train_split = rng.rand(len(smiles)) > validation_split
    train_X = smiles[train_split]
    train_y = y[train_split]
    valid_X = smiles[~train_split]
    valid_y = y[~train_split]

    # Perform a bootstrap sample of the training data
    if bootstrap:
        sample = rng.choice(len(train_X), size=(len(train_X),), replace=True)
        train_X = train_X[sample]
        train_y = train_y[sample]

    # Make the training data loaders
    train_loader = GraphLoader(train_X, train_y, batch_size=batch_size, shuffle=True)
    val_loader = GraphLoader(valid_X, valid_y, batch_size=batch_size, shuffle=False)

    # Make the callbacks: exponential decay from the initial rate down to
    # final_learn_rate over the course of training
    final_learn_rate = 1e-6
    init_learn_rate = learning_rate
    if num_epochs > 1:
        decay_rate = (final_learn_rate / init_learn_rate) ** (1. / (num_epochs - 1))
    else:
        # BUGFIX: with a single epoch the original expression divided by zero;
        # there is nothing to decay, so keep the learning rate constant
        decay_rate = 1.0

    def lr_schedule(epoch, lr):
        return lr * decay_rate

    if patience is None:
        patience = num_epochs // 8

    early_stopping = cb.EarlyStopping(patience=patience, restore_best_weights=True)
    my_callbacks = [
        LRLogger(),
        EpochTimeLogger(),
        cb.LearningRateScheduler(lr_schedule),
        early_stopping,
        cb.TerminateOnNaN(),
        train_loader  # So the shuffling gets called
    ]
    if timeout is not None:
        my_callbacks += [
            TimeLimitCallback(timeout)
        ]

    # Run the desired number of epochs
    history = model.fit(train_loader, epochs=num_epochs, validation_data=val_loader,
                        verbose=False, shuffle=False, callbacks=my_callbacks)

    # If a timeout is used, make sure we are using the best weights
    #  The training may have exited without storing the best weights.
    # BUGFIX: best_weights is None if no monitored epoch finished before the
    # time limit; in that case keep the model's current weights rather than
    # crashing in set_weights.
    if timeout is not None and early_stopping.best_weights is not None:
        model.set_weights(early_stopping.best_weights)

    # Check if there is a NaN loss
    if np.isnan(history.history['loss']).any():
        raise ValueError('Training failed due to a NaN loss.')

    # If provided, evaluate model on test set
    test_pred = None
    if test_set is not None:
        test_pred = evaluate_mpnn([model], test_set, batch_size, cache=False)

    # Convert weights to numpy arrays (avoids mmap issues)
    weights = []
    for v in model.get_weights():
        v = np.array(v)
        if np.isnan(v).any():
            raise ValueError('Found some NaN weights.')
        weights.append(v)

    # Once we are finished training call "clear_session"
    tf.keras.backend.clear_session()

    if test_pred is None:
        return weights, history.history
    else:
        return weights, history.history, test_pred[:, 0].tolist()
# Build the checkpoint path for this held-out-year run; the literal
# "{epoch:03d}" suffix is expanded per-epoch by ModelCheckpoint.
model_out_path = os.path.join(
    config.data_settings.model_save_directory,
    'model_held_out{}'.format(config.data_settings.train_year))
model_out_path += "{epoch:03d}"
log_out_path = os.path.join(config.data_settings.model_save_directory, 'logs/')

# Save a checkpoint every epoch (save_best_only=False).
# NOTE(review): monitor='val_f1' is configured, but validation_data is
# commented out in the fit() call below, so no val_* metrics exist; with
# save_best_only=False the monitor is effectively unused — confirm intent.
chpt = cbacks.ModelCheckpoint(model_out_path,
                              save_best_only=False,
                              verbose=True,
                              monitor='val_f1',
                              mode='max')
tb = cbacks.TensorBoard(log_dir=log_out_path,
                        update_freq=config.data_settings.tb_update_freq)
# Abort training as soon as the loss becomes NaN.
nanloss = cbacks.TerminateOnNaN()

model.fit(
    train,
    steps_per_epoch=config.model_settings.training_steps_per_epoch,
    epochs=config.model_settings.epochs,
    # validation_data=validation,
    callbacks=[chpt, tb, nanloss],
    verbose=config.model_settings.train_verbosity)

# Export the fully trained model in TensorFlow SavedModel format
# (written under the logs directory, named by the epoch count).
fully_trained_model_path = os.path.join(
    log_out_path, "{}_epochs".format(config.model_settings.epochs))
model.save(fully_trained_model_path, save_format='tf')
def train(args) -> None:
    """Start training based on args input.

    Builds the dataset (optionally a five-fold CV split), compiles the model
    with sparse segmentation metrics, optionally resumes from a checkpoint,
    then runs model.fit with checkpoint / TensorBoard / confusion-matrix /
    NaN callbacks (plus early stopping when a validation set exists).
    """
    # Check if GPU is available
    print("\nNum GPUs Available: %d\n"\
        % (len(tf.config.list_physical_devices('GPU'))))

    # Set tf.keras mixed precision to float16
    set_keras_mixed_precision_policy('mixed_float16')

    # Create dataset
    save_svs_file, save_train_file, save_val_file \
        = generate_dataset(args.data_dir_AD, args.data_dir_control,
                           args.patch_size, force_regenerate=False)
    if args.fold_num != 0:    # If using five-fold cross-validation
        save_svs_file, save_train_file, save_val_file \
            = generate_five_fold_dataset(args.data_dir_AD,
                                         args.data_dir_control,
                                         args.patch_size, args.fold_num)

    # Load dataset
    train_dataset, val_dataset, class_weight \
        = load_dataset(save_svs_file, save_train_file, save_val_file,
                       args.batch_size)

    # Create network model
    model = get_model(args.model)
    #model.summary(120)
    #print(keras.backend.floatx())
    class_names = ['Background', 'Gray Matter', 'White Matter']
    # Compile with the configured loss plus a battery of 3-class sparse
    # segmentation metrics (mean/freq-weighted IoU, pixel/mean accuracy,
    # confusion matrix, and per-class IoU).
    model.compile(optimizer=optimizers.Adam(),
                  loss=get_loss_func(args.loss_func, class_weight,
                                     gamma=args.focal_loss_gamma),
                  metrics=[metrics.SparseCategoricalAccuracy(),
                           SparseMeanIoU(num_classes=3, name='IoU/Mean'),
                           SparsePixelAccuracy(num_classes=3, name='PixelAcc'),
                           SparseMeanAccuracy(num_classes=3, name='MeanAcc'),
                           SparseFreqIoU(num_classes=3,
                                         name='IoU/Freq_weighted'),
                           SparseConfusionMatrix(num_classes=3, name='cm')] \
                      + SparseIoU.get_iou_metrics(num_classes=3,
                                                  class_names=class_names))

    # Create another checkpoint/log folder for model.name and timestamp
    # (note: mutates args in place)
    args.ckpt_dir = os.path.join(args.ckpt_dir,
                                 model.name+'-'+args.file_suffix)
    args.log_dir = os.path.join(args.log_dir, 'fit',
                                model.name+'-'+args.file_suffix)
    if args.fold_num != 0:    # If using five-fold cross-validation
        args.ckpt_dir += f'_fold_{args.fold_num}'
        args.log_dir += f'_fold_{args.fold_num}'

    # Check if resume from training
    initial_epoch = 0
    if args.ckpt_filepath is not None:
        if args.ckpt_weights_only:
            if args.ckpt_filepath.endswith('.index'):   # Get rid of the suffix
                args.ckpt_filepath = args.ckpt_filepath.replace('.index', '')
            model.load_weights(args.ckpt_filepath)\
                .assert_existing_objects_matched()
            print('Model weights loaded')
        else:
            model = load_whole_model(args.ckpt_filepath)
            print('Whole model (weights + optimizer state) loaded')
        # Epoch number parsed from the checkpoint filename — assumes the
        # 'cp-{epoch:03d}-...' pattern produced below (segment after the
        # first '-'); TODO confirm for externally supplied checkpoints
        initial_epoch = int(args.ckpt_filepath.split('/')[-1]\
            .split('-')[1])
        # Save in same checkpoint_dir but different log_dir (add current time)
        args.ckpt_dir = os.path.abspath(
            os.path.dirname(args.ckpt_filepath))
        args.log_dir = args.ckpt_dir.replace(
            'checkpoints', 'tf_logs/fit') + f'-retrain_{args.file_suffix}'

    # Write configurations to log_dir
    log_configs(args.log_dir, save_svs_file, train_dataset, val_dataset, args)

    # Create checkpoint directory
    if not os.path.exists(args.ckpt_dir):
        os.makedirs(args.ckpt_dir)
    # Create log directory
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)

    # Create a callback that saves the model's weights every 1 epoch;
    # the filename embeds the (val_)IoU/Mean metric for that epoch
    if val_dataset:
        ckpt_path = os.path.join(
            args.ckpt_dir, 'cp-{epoch:03d}-{val_IoU/Mean:.4f}.ckpt')
    else:
        ckpt_path = os.path.join(
            args.ckpt_dir, 'cp-{epoch:03d}-{IoU/Mean:.4f}.ckpt')
    cp_callback = callbacks.ModelCheckpoint(
        filepath=ckpt_path,
        verbose=1,
        save_weights_only=args.ckpt_weights_only,
        save_freq='epoch')

    # Create a TensorBoard callback (profiles batches 100-120)
    tb_callback = callbacks.TensorBoard(
        log_dir=args.log_dir,
        histogram_freq=1,
        write_graph=True,
        write_images=False,
        update_freq='batch',
        profile_batch='100, 120')

    # Create a Lambda callback for plotting confusion matrix
    cm_callback = get_cm_callback(args.log_dir, class_names)

    # Create a TerminateOnNaN callback
    nan_callback = callbacks.TerminateOnNaN()

    # Create an EarlyStopping callback (only meaningful with validation data:
    # stop after 3 epochs without >=0.01 improvement of val_IoU/Mean)
    if val_dataset:
        es_callback = callbacks.EarlyStopping(monitor='val_IoU/Mean',
                                              min_delta=0.01,
                                              patience=3,
                                              verbose=1,
                                              mode='max')

    if val_dataset:
        model.fit(
            train_dataset,
            epochs=args.num_epochs,
            steps_per_epoch=len(train_dataset) \
                if args.steps_per_epoch == -1 else args.steps_per_epoch,
            initial_epoch=initial_epoch,
            validation_data=val_dataset,
            validation_steps=len(val_dataset) // args.val_subsplits \
                if args.val_steps == -1 else args.val_steps,
            callbacks=[cp_callback, tb_callback, nan_callback,
                       cm_callback, es_callback])
    else:
        model.fit(
            train_dataset,
            epochs=args.num_epochs,
            steps_per_epoch=len(train_dataset) \
                if args.steps_per_epoch == -1 else args.steps_per_epoch,
            initial_epoch=initial_epoch,
            callbacks=[cp_callback, tb_callback, nan_callback, cm_callback])

    # TODO: Switch to tf.data
    print('Training finished!')
# NOTE(review): the next line is the tail of a layers.Dense(...) call whose
# opening falls outside this chunk; tokens preserved verbatim.
kernel_initializer='lecun_uniform', name='dense_relu3')(x)
x = layers.Dropout(0.2)(x)   # dropout regularization after the dense stack
#
# Single linear output unit — this is a regression head (MAE loss below)
output = layers.Dense(1, activation='linear',
                      kernel_initializer='lecun_uniform')(x)
model = models.Model(inputs=input1, outputs=output)
model.compile(optimizer=optimizers.Adam(), loss='mae')
model.summary()

my_callbacks = [
    # Stop after 10 epochs without improvement (no monitor given, so the
    # Keras default is used)
    callbacks.EarlyStopping(patience=10, verbose=1),
    #callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, verbose=1),
    # Abort training if the loss becomes NaN
    callbacks.TerminateOnNaN()
]

# train
history = model.fit(X1_train, Y_train, epochs=500, batch_size=128, verbose=2,
                    validation_data=(X1_val, Y_val), callbacks=my_callbacks)

# Tag used for naming this run's saved artifacts; alternates kept commented
nameModel = 'EMD_Conv2D_MAE'
#nameModel = 'EMD_Dense_MAPE'
#nameModel = 'EMD_Dense_MAE_AsymmetryLarge_%s' %sys.argv[1]