def _timestamp():
    """Return the current local time formatted as ``YYYYmmddHHMMSS``."""
    return datetime.datetime.now().strftime("%Y%m%d%H%M%S")


def _append_log_row(path, fields):
    """Append *fields* to the file at *path* as one tab-separated line."""
    with open(path, "a") as log_file:
        log_file.write("\t".join(str(field) for field in fields) + "\n")


def main(argv):
    """Train the model for one job/product pair and log the results.

    The function is meant to be driven by a frontend through command-line
    arguments. On invalid arguments it prints an ``ERROR:`` message and
    returns without training.

    Args:
        argv: Command-line style list ``[script, job_id, product_id]``.
            ``job_id`` must consist of digits only and ``product_id`` of
            letters and digits only.

    Returns:
        None.
    """
    # Exactly three entries are required: argv[2] is read below, so a
    # two-element argv has to be rejected here instead of raising IndexError.
    if len(argv) != 3:
        print(
            "ERROR: Format error, refer to the usage: python test.py job_id product_id"
        )
        return
    job_id, product_id = argv[1], argv[2]
    if not job_id.isdigit():
        print("ERROR: Format error, job_id must be in int format")
        return
    if not product_id.isalnum():
        print(
            "ERROR: Format error, product_id must be consistent by character or number, without special character"
        )
        return

    print("INFO: Start training model " + _timestamp())

    # GPU settings
    for gpu in tf.config.list_physical_devices("GPU"):
        tf.config.experimental.set_memory_growth(gpu, True)

    # Folder generate for log file and model saving
    log_dir, save_model_dir = folder_preparation(job_id, product_id)
    # NOTE(review): the paths below are built by plain concatenation, so
    # log_dir presumably ends with a path separator -- confirm in
    # folder_preparation.
    step_log_path = log_dir + "training_result_step.log"
    epoch_log_path = log_dir + "training_result.log"

    # get the dataset
    (train_dataset, valid_dataset, test_dataset,
     train_count, valid_count, test_count) = generate_datasets()

    # create model
    model = get_model()
    print_model_summary(network=model)

    # Validation accuracy a checkpoint has to reach before it is saved.
    threshold = THRESHOLD

    # define loss calculation
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy()
    # Tried RMSprop for optimizer, the result is not so good, finetune the
    # optimizer to Adam or Momentum. `learning_rate` is the documented
    # keyword; `lr` is only a deprecated alias.
    optimizer = tf.keras.optimizers.Adam(learning_rate=GLOBAL_LEARNING_RATE,
                                         decay=WEIGHT_DECAY)

    # Define training KPI
    train_loss = tf.keras.metrics.Mean(name='train_loss')
    train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
        name='train_accuracy')

    # Define valid KPI
    valid_loss = tf.keras.metrics.Mean(name='valid_loss')
    valid_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
        name='valid_accuracy')

    # Both step functions run eagerly (no @tf.function) because they call
    # .numpy() on their results.
    def train(image_batch, label_batch):
        """Run one optimisation step; return (softmax output, predicted labels)."""
        with tf.GradientTape() as tape:
            predictions = model(image_batch, training=True)
            loss = loss_object(y_true=label_batch, y_pred=predictions)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(
            grads_and_vars=zip(gradients, model.trainable_variables))
        train_loss.update_state(values=loss)
        train_accuracy.update_state(y_true=label_batch, y_pred=predictions)
        return predictions.numpy(), tf.math.argmax(predictions, axis=1).numpy()

    def valid(image_batch, label_batch):
        """Score one validation batch; return the predicted labels."""
        # Inference mode: validation must not apply dropout or update
        # batch-normalisation statistics.
        predictions = model(image_batch, training=False)
        v_loss = loss_object(label_batch, predictions)
        valid_loss.update_state(values=v_loss)
        valid_accuracy.update_state(y_true=label_batch, y_pred=predictions)
        return tf.math.argmax(predictions, axis=1).numpy()

    steps_per_epoch = math.ceil(train_count / BATCH_SIZE)

    # start training
    for epoch in range(EPOCHS):
        for train_step, features in enumerate(train_dataset, start=1):
            images, labels = process_features(features,
                                              data_augmentation=False)
            predictions, predict_labels = train(images, labels)
            step_accuracy = train_accuracy.result().numpy()
            # Print the info on the screen for developer to monitor training detail
            print(
                "Epoch: {}/{}, step: {}/{}, loss: {:.5f}, accuracy: {:.5f}, softmax(logits):{}, "
                "predict_label:{}, target_label:{}".format(
                    epoch, EPOCHS, train_step, steps_per_epoch,
                    train_loss.result().numpy(), step_accuracy,
                    predictions, predict_labels, labels))
            # Record information into the log file
            _append_log_row(step_log_path, [
                "train", _timestamp(), epoch, train_step, step_accuracy,
                predict_labels, labels,
            ])

        for valid_step, features in enumerate(valid_dataset, start=1):
            valid_images, valid_labels = process_features(
                features, data_augmentation=False)
            predict_labels = valid(valid_images, valid_labels)
            _append_log_row(step_log_path, [
                "validation", _timestamp(), epoch, valid_step,
                valid_accuracy.result().numpy(), predict_labels, valid_labels,
            ])

        valid_accuracy_result = valid_accuracy.result().numpy()

        # Print the info on the screen for developer to monitor validation result
        print("Epoch: {}/{}, train loss: {:.5f}, train accuracy: {:.5f}, "
              "valid loss: {:.5f}, valid accuracy: {:.5f}".format(
                  epoch, EPOCHS,
                  train_loss.result().numpy(),
                  train_accuracy.result().numpy(),
                  valid_loss.result().numpy(),
                  valid_accuracy_result))

        # Per-epoch log in tab-separated text, easy for pandas to analyse and
        # for best model selection.
        _append_log_row(epoch_log_path,
                        [_timestamp(), epoch, valid_accuracy_result])

        # Metrics accumulate across calls, so clear them before the next epoch.
        train_loss.reset_states()
        train_accuracy.reset_states()
        valid_loss.reset_states()
        valid_accuracy.reset_states()

        # Save the weights for evaluation and prediction only on a save epoch
        # whose validation accuracy reaches the current threshold.
        if epoch % save_every_n_epoch == 0 and valid_accuracy_result >= threshold:
            model.save_weights(filepath=save_model_dir + str(epoch) + "/model",
                               save_format='tf')
            # Raise the bar so that later checkpoints are written only when
            # they match or beat the best accuracy saved so far.
            threshold = valid_accuracy_result
return images, labels if __name__ == '__main__': print(tf.__name__, ": ", tf.__version__, sep="") # GPU settings gpus = tf.config.list_physical_devices("GPU") if gpus: for gpu in gpus: tf.config.experimental.set_memory_growth(gpu, True) print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU'))) # get the dataset train_dataset, valid_dataset, test_dataset, train_count, valid_count, test_count = generate_datasets( ) model = get_model() checkpoint_save_path = "./saved_model/resnet_101/epoch-0" if os.path.exists(checkpoint_save_path + '.index'): print('-------------load the model-----------------') model.load_weights(checkpoint_save_path) # # model_save_path = "./saved_model/epoch-50.index" # if os.path.exists(model_save_path): # print('-------------load the model-----------------') # model.load_weights(filepath=model_save_path) print_model_summary(network=model)
def _timestamp():
    """Return the current local time formatted as ``YYYYmmddHHMMSS``."""
    return datetime.datetime.now().strftime("%Y%m%d%H%M%S")


def _append_log_row(path, fields):
    """Append *fields* to the file at *path* as one tab-separated line."""
    with open(path, "a") as log_file:
        log_file.write("\t".join(str(field) for field in fields) + "\n")


def main(argv):
    """Evaluate the saved model on the test set and log the results.

    The function is meant to be driven by a frontend through command-line
    arguments. On invalid arguments it prints an ``ERROR:`` message and
    returns without evaluating.

    Args:
        argv: Command-line style list ``[script, job_id, product_id]``.
            ``job_id`` must consist of digits only and ``product_id`` of
            letters and digits only.

    Returns:
        None.
    """
    # Exactly three entries are required: argv[2] is read below, so a
    # two-element argv has to be rejected here instead of raising IndexError.
    if len(argv) != 3:
        print(
            "ERROR: Format error, refer to the usage: python test.py job_id product_id"
        )
        return
    job_id, product_id = argv[1], argv[2]
    if not job_id.isdigit():
        print("ERROR: Format error, job_id must be in int format")
        return
    if not product_id.isalnum():
        print(
            "ERROR: Format error, product_id must be consistent by character or number, without special character"
        )
        return

    print("INFO: Start evaluating model " + _timestamp())

    # GPU settings
    for gpu in tf.config.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(gpu, True)

    # Folder generate for log file and model saving
    log_dir, save_model_dir = folder_preparation(job_id, product_id)
    # NOTE(review): the paths below are built by plain concatenation, so
    # log_dir presumably ends with a path separator -- confirm in
    # folder_preparation.
    step_log_path = log_dir + "test_result_step.log"
    summary_log_path = log_dir + "test_result.log"

    # get the original_dataset
    (train_dataset, valid_dataset, test_dataset,
     train_count, valid_count, test_count) = generate_datasets()

    # load the model
    model = get_model()
    model.load_weights(filepath=save_model_dir + "model")

    # Use the stateless loss here. The metric of the same name keeps a
    # running mean across calls, so feeding its output into test_loss would
    # average running means rather than per-batch losses.
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy()
    test_loss = tf.keras.metrics.Mean()
    test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

    # Runs eagerly (no @tf.function) because it calls .numpy() on its result.
    def test_step(images, labels):
        """Score one test batch; return the predicted labels."""
        predictions = model(images, training=False)
        t_loss = loss_object(labels, predictions)
        test_loss.update_state(t_loss)
        test_accuracy.update_state(labels, predictions)
        return tf.math.argmax(predictions, axis=1).numpy()

    for batch, features in enumerate(test_dataset, start=1):
        test_images, test_labels = process_features(
            features, data_augmentation=False)
        predict_labels = test_step(test_images, test_labels)
        running_accuracy = test_accuracy.result().numpy()
        print(
            "loss: {:.5f}, test accuracy: {:.5f}, predict_labels:{}, test_labels:{}"
            .format(test_loss.result().numpy(), running_accuracy,
                    predict_labels, test_labels))
        _append_log_row(step_log_path, [
            "test", _timestamp(), batch, running_accuracy,
            predict_labels, test_labels,
        ])

    final_accuracy = test_accuracy.result().numpy()
    print("The accuracy on test set is: {:.3f}%".format(final_accuracy * 100))
    # Write the plain number (not the tensor repr) so this row matches the
    # per-step log.
    _append_log_row(summary_log_path, [_timestamp(), final_accuracy])