import argparse
import json
import os
import pickle
import shutil
from configparser import ConfigParser

import numpy as np
import tensorflow as tf
from keras.applications.densenet import DenseNet121
from keras.backend.tensorflow_backend import set_session
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, TensorBoard
from keras.layers import Dense, Input
from keras.models import Model
from keras.optimizers import Adam
from keras.preprocessing.image import ImageDataGenerator
from keras.utils import multi_gpu_model

# Project-local helpers are assumed importable from this repo's own modules
# (paths not shown in this excerpt): get_class_names, get_sample_counts,
# get_class_weights, get_model, split_data, create_symlink,
# custom_image_generator, modify_last_layer, augmenter, TFWrapper,
# AugmentedLabelSequence_clf, AugmentedImageSequence, MultipleClassAUROC,
# MultiGPUModelCheckpoint, ModelFactory.


def train(model_dir, results_subdir, random_seed, resolution):
    np.random.seed(random_seed)
    tf.set_random_seed(np.random.randint(1 << 31))
    session_conf = tf.ConfigProto(intra_op_parallelism_threads=1,
                                  inter_op_parallelism_threads=1)
    session_conf.gpu_options.allow_growth = True
    sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
    set_session(sess)

    # parser config
    config_file = model_dir + "/config.ini"
    print("Config File Path:", config_file, flush=True)
    assert os.path.isfile(config_file)
    cp = ConfigParser()
    cp.read(config_file)

    # default config
    base_model_name = cp["DEFAULT"].get("base_model_name")

    # train config
    path_model_base_weights = cp["TRAIN"].get("path_model_base_weights")
    use_trained_model_weights = cp["TRAIN"].getboolean("use_trained_model_weights")
    use_best_weights = cp["TRAIN"].getboolean("use_best_weights")
    output_weights_name = cp["TRAIN"].get("output_weights_name")
    epochs = cp["TRAIN"].getint("epochs")
    batch_size = cp["TRAIN"].getint("batch_size")
    initial_learning_rate = cp["TRAIN"].getfloat("initial_learning_rate")
    image_dimension = cp["TRAIN"].getint("image_dimension")
    patience_reduce_lr = cp["TRAIN"].getint("patience_reduce_lr")
    min_lr = cp["TRAIN"].getfloat("min_lr")
    positive_weights_multiply = cp["TRAIN"].getfloat("positive_weights_multiply")
    patience = cp["TRAIN"].getint("patience")
    samples_per_epoch = cp["TRAIN"].getint("samples_per_epoch")
    reduce_lr = cp["TRAIN"].getfloat("reduce_lr")

    print("** DenseNet input resolution:", image_dimension, flush=True)
    print("** GAN image resolution:", resolution, flush=True)
    print("** Patience epochs:", patience, flush=True)
    print("** Samples per epoch:", samples_per_epoch, flush=True)

    log2_record = int(np.log2(resolution))
    record_file_ending = "*" + str(log2_record) + ".tfrecords"
    print("** Resolution", resolution, "corresponds to", record_file_ending,
          "TFRecord file.", flush=True)

    output_dir = os.path.join(
        results_subdir,
        "classification_results_res_" + str(2 ** log2_record) + "/train")
    print("Output Directory:", output_dir, flush=True)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    # if previously trained weights are used, never re-split
    if use_trained_model_weights:
        print("** use trained model weights **", flush=True)
        training_stats_file = os.path.join(output_dir, ".training_stats.json")
        if os.path.isfile(training_stats_file):
            # TODO: add loading previous learning rate?
            training_stats = json.load(open(training_stats_file))
        else:
            training_stats = {}
    else:
        # start over
        training_stats = {}

    show_model_summary = cp["TRAIN"].getboolean("show_model_summary")
    running_flag_file = os.path.join(output_dir, ".training.lock")
    if os.path.isfile(running_flag_file):
        raise RuntimeError("A process is already running in this directory!")
    else:
        open(running_flag_file, "a").close()

    try:
        print("backup config file to", output_dir, flush=True)
        shutil.copy(config_file,
                    os.path.join(output_dir, os.path.split(config_file)[1]))

        tfrecord_dir_tr = os.path.join(results_subdir[:-4], "train")
        tfrecord_dir_vl = os.path.join(results_subdir[:-4], "valid")
        shutil.copy(tfrecord_dir_tr + "/train.csv", output_dir)
        shutil.copy(tfrecord_dir_vl + "/valid.csv", output_dir)

        # get class names
        class_names = get_class_names(output_dir, "train")

        # get train/valid sample counts
        train_counts, train_pos_counts = get_sample_counts(
            output_dir, "train", class_names)
        valid_counts, _ = get_sample_counts(output_dir, "valid", class_names)
        print("Total Training Data:", train_counts, flush=True)
        print("Total Validation Data:", valid_counts, flush=True)

        train_steps = int(min(samples_per_epoch, train_counts) / batch_size)
        print("** train_steps:", train_steps, flush=True)
        validation_steps = int(np.floor(valid_counts / batch_size))
        print("** validation_steps:", validation_steps, flush=True)

        # compute class weights
        print("** compute class weights from training data **", flush=True)
        class_weights = get_class_weights(
            train_counts,
            train_pos_counts,
            multiply=positive_weights_multiply,
        )
        print("** class_weights **", flush=True)
        print(class_weights)

        print("** load model **", flush=True)
        if use_trained_model_weights:
            if use_best_weights:
                model_weights_file = os.path.join(
                    output_dir, "best_" + output_weights_name)
            else:
                model_weights_file = os.path.join(output_dir, output_weights_name)
        else:
            model_weights_file = None

        # use downloaded weights
        if os.path.isfile(path_model_base_weights):
            base_weights = path_model_base_weights
            print("** Base weights will be loaded.", flush=True)
        else:
            base_weights = None
            print("** No base weights.", flush=True)

        # build model
        # ------------------------------------
        input_shape = (image_dimension, image_dimension, 3)
        img_input = Input(shape=input_shape)
        base_model = DenseNet121(include_top=False,
                                 weights=base_weights,
                                 input_tensor=img_input,
                                 input_shape=input_shape,
                                 pooling="avg")
        x = base_model.output
        predictions = Dense(len(class_names), activation="sigmoid",
                            name="predictions")(x)
        model = Model(inputs=img_input, outputs=predictions)

        if use_trained_model_weights and model_weights_file is not None:
            print("** load model weights_path:", model_weights_file, flush=True)
            model.load_weights(model_weights_file)
        # ------------------------------------

        if show_model_summary:
            print(model.summary())

        print("** create image generators **", flush=True)
        train_seq = TFWrapper(tfrecord_dir=tfrecord_dir_tr,
                              record_file_endings=record_file_ending,
                              batch_size=batch_size,
                              model_target_size=(image_dimension, image_dimension),
                              steps=train_steps,
                              augment=True,
                              shuffle=True,
                              prefetch=True,
                              repeat=True)
        valid_seq = TFWrapper(tfrecord_dir=tfrecord_dir_vl,
                              record_file_endings=record_file_ending,
                              batch_size=batch_size,
                              model_target_size=(image_dimension, image_dimension),
                              steps=None,
                              augment=False,
                              shuffle=False,
                              prefetch=True,
                              repeat=True)

        # initialise train and valid iterators
        print("** initialise train and valid iterators **", flush=True)
        train_seq.initialise()
        valid_seq.initialise()

        output_weights_path = os.path.join(output_dir, output_weights_name)
        print("** set output weights path to:", output_weights_path, flush=True)

        print("** single-GPU model is used! **", flush=True)
        model_train = model
        checkpoint = ModelCheckpoint(
            output_weights_path,
            save_weights_only=True,
            save_best_only=False,
            verbose=1,
        )

        print("** compile model with class weights **", flush=True)
        optimizer = Adam(lr=initial_learning_rate)
        model_train.compile(optimizer=optimizer, loss="binary_crossentropy")
        auroc = MultipleClassAUROC(sequence=valid_seq,
                                   class_names=class_names,
                                   weights_path=output_weights_path,
                                   stats=training_stats,
                                   early_stop_p=patience,
                                   learn_rate_p=patience_reduce_lr,
                                   learn_rate_f=reduce_lr,
                                   min_lr=min_lr,
                                   workers=0)
        callbacks = [
            checkpoint,
            TensorBoard(log_dir=os.path.join(output_dir, "logs"),
                        batch_size=batch_size),
            auroc,
        ]

        print("** start training **", flush=True)
        history = model_train.fit_generator(
            generator=train_seq,
            steps_per_epoch=train_steps,
            epochs=epochs,
            validation_data=valid_seq,
            validation_steps=validation_steps,
            callbacks=callbacks,
            class_weight=class_weights,
            workers=0,
            shuffle=False,
        )

        # dump history
        print("** dump history **", flush=True)
        with open(os.path.join(output_dir, "history.pkl"), "wb") as f:
            pickle.dump({
                "history": history.history,
                "auroc": auroc.aurocs,
            }, f)
        print("** done! **", flush=True)
    finally:
        os.remove(running_flag_file)
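# get_class_weights() is a project helper whose source is not in this file.
# The sketch below is an assumption of the usual CheXNet-style weighting
# (positives up-weighted by their rarity, scaled by `multiply`); the repo's
# actual helper may differ. The underscore prefix marks it as illustrative,
# not part of the pipeline above.
def _sketch_class_weights(total_counts, class_positive_counts, multiply=1.0):
    """Return a list of {0: w_neg, 1: w_pos} dicts, one per class."""
    class_weights = []
    for pos in class_positive_counts.values():
        neg = total_counts - pos
        denominator = neg * multiply + pos
        # The rarer the positive label, the larger its weight.
        class_weights.append({0: pos / denominator,
                              1: neg * multiply / denominator})
    return class_weights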
def train_rsna_clf(train_data=None, validation_data=None, remove_running=True):
    # parser config
    config_file = "./config.ini"
    cp = ConfigParser()
    cp.read(config_file)

    # default config
    output_dir = cp["DEFAULT"].get("output_dir")
    image_source_dir = cp["DEFAULT"].get("image_source_dir")
    base_model_name = cp["DEFAULT"].get("base_model_name")
    class_names1 = cp["DEFAULT"].get("class_names1").split(",")
    class_names2 = cp["DEFAULT"].get("class_names2").split(",")

    # train config
    train_image_source_dir = cp["TRAIN"].get("train_image_source_dir")
    train_class_info = cp["TRAIN"].get("train_class_info")
    train_box_info = cp["TRAIN"].get("train_box_info")
    use_base_model_weights = cp["TRAIN"].getboolean("use_base_model_weights")
    use_trained_model_weights = cp["TRAIN"].getboolean("use_trained_model_weights")
    use_best_weights = cp["TRAIN"].getboolean("use_best_weights")
    input_weights_name = cp["TRAIN"].get("input_weights_name")
    output_weights_name = cp["TRAIN"].get("output_weights_name")
    epochs = cp["TRAIN"].getint("epochs")
    batch_size = cp["TRAIN"].getint("batch_size")
    initial_learning_rate = cp["TRAIN"].getfloat("initial_learning_rate")
    generator_workers = cp["TRAIN"].getint("generator_workers")
    image_dimension = cp["TRAIN"].getint("image_dimension")
    train_steps = cp["TRAIN"].get("train_steps")
    patience_reduce_lr = cp["TRAIN"].getint("patience_reduce_lr")
    min_lr = cp["TRAIN"].getfloat("min_lr")
    validation_steps = cp["TRAIN"].get("validation_steps")
    positive_weights_multiply = cp["TRAIN"].getfloat("positive_weights_multiply")
    dataset_csv_dir = cp["TRAIN"].get("dataset_csv_dir")

    # if previously trained weights are used, never re-split
    if use_trained_model_weights:
        # resuming mode
        print("** use trained model weights **")
        # load training status for resuming
        training_stats_file = os.path.join(output_dir, ".training_stats.json")
        if os.path.isfile(training_stats_file):
            # TODO: add loading previous learning rate?
            training_stats = json.load(open(training_stats_file))
        else:
            training_stats = {}
    else:
        # start over
        training_stats = {}

    show_model_summary = cp["TRAIN"].getboolean("show_model_summary")
    # end parser config

    # check output_dir, create it if it does not exist
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    running_flag_file = os.path.join(output_dir, ".training.lock")
    if os.path.isfile(running_flag_file):
        if remove_running:
            os.remove(running_flag_file)
            open(running_flag_file, "a").close()
        else:
            raise RuntimeError("A process is already running in this directory!")
    else:
        open(running_flag_file, "a").close()

    try:
        print(f"backup config file to {output_dir}")
        shutil.copy(config_file,
                    os.path.join(output_dir, os.path.split(config_file)[1]))

        # get train/dev sample counts
        train_counts, train_pos_counts = get_sample_counts(train_data.df,
                                                           class_names2)
        validation_counts, _ = get_sample_counts(validation_data.df,
                                                 class_names2)

        # compute steps
        if train_steps == "auto":
            train_steps = int(train_counts / batch_size)
        else:
            try:
                train_steps = int(train_steps)
            except ValueError:
                raise ValueError(f"train_steps: {train_steps} is invalid, "
                                 f"please use 'auto' or an integer.")
        print(f"** train_steps: {train_steps} **")

        if validation_steps == "auto":
            validation_steps = int(validation_counts / batch_size)
        else:
            try:
                validation_steps = int(validation_steps)
            except ValueError:
                raise ValueError(f"validation_steps: {validation_steps} is invalid, "
                                 f"please use 'auto' or an integer.")
        print(f"** validation_steps: {validation_steps} **")

        # compute class weights
        print("** compute class weights from training data **")
        class_weights = get_class_weights(
            train_counts,
            train_pos_counts,
            multiply=positive_weights_multiply,
        )
        print("** class_weights **")
        print(class_weights)

        print("** load model **")
        if use_trained_model_weights:
            if use_best_weights:
                model_weights_file = os.path.join(output_dir,
                                                  f"best_{input_weights_name}")
            else:
                model_weights_file = os.path.join(output_dir, input_weights_name)
        else:
            model_weights_file = None

        model_factory = ModelFactory()
        model = model_factory.get_model(
            class_names1,
            model_name=base_model_name,
            use_base_weights=use_base_model_weights,
            weights_path=model_weights_file,
            input_shape=(image_dimension, image_dimension, 3))
        model = modify_last_layer(model, class_names2)

        if show_model_summary:
            print(model.summary())

        train_sq = AugmentedLabelSequence_clf(
            train_data,
            batch_size=batch_size,
            target_size=(image_dimension, image_dimension),
            augmenter=augmenter,
            steps=train_steps,
        )
        validation_sq = AugmentedLabelSequence_clf(
            validation_data,
            batch_size=batch_size,
            target_size=(image_dimension, image_dimension),
            augmenter=augmenter,
            steps=validation_steps,
        )

        output_weights_path = os.path.join(output_dir, output_weights_name)
        print(f"** set output weights path to: {output_weights_path} **")

        print("** check multiple gpu availability **")
        gpus = len(os.getenv("CUDA_VISIBLE_DEVICES", "1").split(","))
        if gpus > 1:
            print(f"** multi_gpu_model is used! gpus={gpus} **")
            model_train = multi_gpu_model(model, gpus)
            # FIXME: currently (Keras 2.1.2) checkpoint doesn't work with multi_gpu_model
            checkpoint = MultiGPUModelCheckpoint(
                filepath=output_weights_path,
                base_model=model,
            )
        else:
            model_train = model
            checkpoint = ModelCheckpoint(
                output_weights_path,
                save_weights_only=True,
                save_best_only=True,
                verbose=1,
            )

        print("** compile model with class weights **")
        optimizer = Adam(lr=initial_learning_rate)
        model_train.compile(optimizer=optimizer, loss="binary_crossentropy")
        auroc = MultipleClassAUROC(
            sequence=validation_sq,
            class_names=class_names2,
            weights_path=output_weights_path,
            stats=training_stats,
            workers=generator_workers,
        )
        callbacks = [
            checkpoint,
            TensorBoard(log_dir=os.path.join(output_dir, "logs"),
                        batch_size=batch_size),
            ReduceLROnPlateau(monitor="val_loss", factor=0.1,
                              patience=patience_reduce_lr,
                              verbose=1, mode="min", min_lr=min_lr),
            auroc,
        ]

        print("** start training **")
        history = model_train.fit_generator(
            generator=train_sq,
            steps_per_epoch=train_steps,
            epochs=epochs,
            validation_data=validation_sq,
            validation_steps=validation_steps,
            callbacks=callbacks,
            class_weight=class_weights,
            workers=generator_workers,
            shuffle=False,
        )

        # dump history
        print("** dump history **")
        with open(os.path.join(output_dir, "history.pkl"), "wb") as f:
            pickle.dump({
                "history": history.history,
                "auroc": auroc.aurocs,
            }, f)
        print("** done! **")
    finally:
        os.remove(running_flag_file)
def main():
    # parser config
    config_file = "./config.ini"
    cp = ConfigParser()
    cp.read(config_file)

    # default config
    output_dir = cp["DEFAULT"].get("output_dir")
    image_source_dir = cp["DEFAULT"].get("image_source_dir")
    train_patient_count = cp["DEFAULT"].getint("train_patient_count")
    dev_patient_count = cp["DEFAULT"].getint("dev_patient_count")
    data_entry_file = cp["DEFAULT"].get("data_entry_file")
    class_names = cp["DEFAULT"].get("class_names").split(",")

    # train config
    use_base_model_weights = cp["TRAIN"].getboolean("use_base_model_weights")
    use_trained_model_weights = cp["TRAIN"].getboolean("use_trained_model_weights")
    use_best_weights = cp["TRAIN"].getboolean("use_best_weights")
    output_weights_name = cp["TRAIN"].get("output_weights_name")
    epochs = cp["TRAIN"].getint("epochs")
    batch_size = cp["TRAIN"].getint("batch_size")
    initial_learning_rate = cp["TRAIN"].getfloat("initial_learning_rate")
    train_steps = cp["TRAIN"].get("train_steps")
    patience_reduce_lr = cp["TRAIN"].getint("patience_reduce_lr")
    validation_steps = cp["TRAIN"].get("validation_steps")
    positive_weights_multiply = cp["TRAIN"].getfloat("positive_weights_multiply")
    use_class_balancing = cp["TRAIN"].getboolean("use_class_balancing")
    use_default_split = cp["TRAIN"].getboolean("use_default_split")

    # if previously trained weights are used, never re-split
    if use_trained_model_weights:
        # resuming mode
        print("** use trained model weights, turn on use_skip_split automatically **")
        use_skip_split = True
        # load training status for resuming
        training_stats_file = os.path.join(output_dir, ".training_stats.json")
        if os.path.isfile(training_stats_file):
            # TODO: add loading previous learning rate?
            training_stats = json.load(open(training_stats_file))
        else:
            training_stats = {}
    else:
        # start over
        use_skip_split = cp["TRAIN"].getboolean("use_skip_split")
        training_stats = {}

    split_dataset_random_state = cp["TRAIN"].getint("split_dataset_random_state")
    show_model_summary = cp["TRAIN"].getboolean("show_model_summary")
    # end parser config

    # check output_dir, create it if it does not exist
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    running_flag_file = os.path.join(output_dir, ".training.lock")
    if os.path.isfile(running_flag_file):
        raise RuntimeError("A process is already running in this directory!")
    else:
        open(running_flag_file, "a").close()

    try:
        print(f"backup config file to {output_dir}")
        shutil.copy(config_file,
                    os.path.join(output_dir, os.path.split(config_file)[1]))

        # split train/dev/test
        if use_default_split:
            datasets = ["train", "dev", "test"]
            for dataset in datasets:
                shutil.copy(f"./data/default_split/{dataset}.csv", output_dir)
        elif not use_skip_split:
            print("** split dataset **")
            split_data(
                data_entry_file,
                class_names,
                train_patient_count,
                dev_patient_count,
                output_dir,
                split_dataset_random_state,
            )

        # get train/dev sample counts
        train_counts, train_pos_counts = get_sample_counts(
            output_dir, "train", class_names)
        dev_counts, _ = get_sample_counts(output_dir, "dev", class_names)

        # compute steps
        if train_steps == "auto":
            train_steps = int(train_counts / batch_size)
        else:
            try:
                train_steps = int(train_steps)
            except ValueError:
                raise ValueError(f"train_steps: {train_steps} is invalid, "
                                 f"please use 'auto' or an integer.")
        print(f"** train_steps: {train_steps} **")

        if validation_steps == "auto":
            validation_steps = int(dev_counts / batch_size)
        else:
            try:
                validation_steps = int(validation_steps)
            except ValueError:
                raise ValueError(f"validation_steps: {validation_steps} is invalid, "
                                 f"please use 'auto' or an integer.")
        print(f"** validation_steps: {validation_steps} **")

        # compute class weights
        print("** compute class weights from training data **")
        class_weights = get_class_weights(
            train_counts,
            train_pos_counts,
            multiply=positive_weights_multiply,
            use_class_balancing=use_class_balancing)
        print("** class_weights **")
        for c, w in class_weights.items():
            print(f"  {c}: {w}")

        print("** load model **")
        if use_base_model_weights:
            base_model_weights_file = cp["TRAIN"].get("base_model_weights_file")
        else:
            base_model_weights_file = None
        if use_trained_model_weights:
            if use_best_weights:
                model_weights_file = os.path.join(output_dir,
                                                  f"best_{output_weights_name}")
            else:
                model_weights_file = os.path.join(output_dir, output_weights_name)
        else:
            model_weights_file = None

        model = get_model(class_names, base_model_weights_file, model_weights_file)
        if show_model_summary:
            print(model.summary())

        # recreate symlink folder for ImageDataGenerator
        symlink_dir_name = "image_links"
        create_symlink(image_source_dir, output_dir, symlink_dir_name)

        print("** create image generators **")
        train_data_path = f"{output_dir}/{symlink_dir_name}/train/"
        train_generator = custom_image_generator(
            ImageDataGenerator(horizontal_flip=True, rescale=1. / 255),
            train_data_path,
            batch_size=batch_size,
            class_names=class_names,
        )
        dev_data_path = f"{output_dir}/{symlink_dir_name}/dev/"
        dev_generator = custom_image_generator(
            ImageDataGenerator(horizontal_flip=True, rescale=1. / 255),
            dev_data_path,
            batch_size=batch_size,
            class_names=class_names,
        )

        output_weights_path = os.path.join(output_dir, output_weights_name)
        print(f"** set output weights path to: {output_weights_path} **")

        print("** check multiple gpu availability **")
        gpus = len(os.getenv("CUDA_VISIBLE_DEVICES", "1").split(","))
        if gpus > 1:
            print(f"** multi_gpu_model is used! gpus={gpus} **")
            model_train = multi_gpu_model(model, gpus)
            # FIXME: currently (Keras 2.1.2) checkpoint doesn't work with multi_gpu_model
            checkpoint = MultiGPUModelCheckpoint(
                filepath=output_weights_path,
                base_model=model,
            )
        else:
            model_train = model
            checkpoint = ModelCheckpoint(output_weights_path)

        print("** compile model with class weights **")
        optimizer = Adam(lr=initial_learning_rate)
        model_train.compile(optimizer=optimizer, loss="binary_crossentropy")
        auroc = MultipleClassAUROC(
            generator=dev_generator,
            steps=validation_steps,
            class_names=class_names,
            weights_path=output_weights_path,
            stats=training_stats,
        )
        callbacks = [
            checkpoint,
            TensorBoard(log_dir=os.path.join(output_dir, "logs"),
                        batch_size=batch_size),
            ReduceLROnPlateau(monitor="val_loss", factor=0.1,
                              patience=patience_reduce_lr, verbose=1),
            auroc,
        ]

        print("** start training **")
        history = model_train.fit_generator(
            generator=train_generator,
            steps_per_epoch=train_steps,
            epochs=epochs,
            validation_data=dev_generator,
            validation_steps=validation_steps,
            callbacks=callbacks,
            class_weight=class_weights,
        )

        # dump history
        print("** dump history **")
        with open(os.path.join(output_dir, "history.pkl"), "wb") as f:
            pickle.dump({
                "history": history.history,
                "auroc": auroc.aurocs,
            }, f)
        print("** done! **")
    finally:
        os.remove(running_flag_file)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_epoch', type=int, default=0)
    args = parser.parse_args()

    # set parameters
    base_model_name = "DenseNet121"
    use_base_model_weights = True
    weights_path = None
    image_dimension = 224
    batch_size = 32
    epochs = 20
    class_names = ["Nodule", "Pneumothorax"]
    csv_path = './data/classification'
    image_source_dir = '/media/nfs/CXR/NIH/chest_xrays/NIH/data/images_1024x1024/'
    augmenter = None
    # if train_steps is None, compute it as len(train) / batch_size
    train_steps = None
    positive_weights_multiply = 1
    outputs_path = './experiments/ae'
    weights_name = f'weights{args.model_epoch}.h5'
    output_weights_path = os.path.join(outputs_path, weights_name)
    initial_learning_rate = 0.0001
    training_stats = {}

    # get sample and total counts from training data, then compute class weights
    train_counts, train_pos_counts = get_sample_counts(csv_path, "train",
                                                       class_names)
    if train_steps is None:
        train_steps = int(train_counts / batch_size)
    dev_counts, _ = get_sample_counts(csv_path, "test", class_names)
    validation_steps = int(dev_counts / batch_size)

    print('*** Compute Class Weights ***')
    class_weights = get_class_weights(train_counts,
                                      train_pos_counts,
                                      multiply=positive_weights_multiply)
    print(class_weights)

    # create image sequences
    train_sequence = AugmentedImageSequence(
        dataset_csv_file=os.path.join(csv_path, "train.csv"),
        class_names=class_names,
        source_image_dir=image_source_dir,
        batch_size=batch_size,
        target_size=(image_dimension, image_dimension),
        augmenter=augmenter,
        steps=train_steps,
        model_epoch=args.model_epoch)
    validation_sequence = AugmentedImageSequence(
        dataset_csv_file=os.path.join(csv_path, "test.csv"),
        class_names=class_names,
        source_image_dir=image_source_dir,
        batch_size=batch_size,
        target_size=(image_dimension, image_dimension),
        augmenter=augmenter,
        steps=validation_steps,
        shuffle_on_epoch_end=False,
        model_epoch=args.model_epoch)

    # build model
    factory = ModelFactory()
    model = factory.get_model(class_names,
                              model_name=base_model_name,
                              use_base_weights=use_base_model_weights,
                              weights_path=weights_path,
                              input_shape=(image_dimension, image_dimension, 3))

    print("** check multiple gpu availability **")
    gpus = len(os.getenv("CUDA_VISIBLE_DEVICES", "1").split(","))
    if gpus > 1:
        print(f"** multi_gpu_model is used! gpus={gpus} **")
        model_train = multi_gpu_model(model, gpus)
        # FIXME: currently (Keras 2.1.2) checkpoint doesn't work with multi_gpu_model
        checkpoint = MultiGPUModelCheckpoint(
            filepath=output_weights_path,
            base_model=model,
        )
    else:
        model_train = model
        checkpoint = ModelCheckpoint(
            output_weights_path,
            save_weights_only=True,
            save_best_only=True,
            verbose=1,
        )

    auroc = MultipleClassAUROC(sequence=validation_sequence,
                               class_names=class_names,
                               weights_path=output_weights_path,
                               stats=training_stats,
                               workers=8,
                               model_epoch=args.model_epoch)
    callbacks = [
        checkpoint,
        TensorBoard(log_dir=os.path.join(outputs_path, "logs"),
                    batch_size=batch_size),
        ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=1,
                          verbose=1, mode="min", min_lr=1e-8),
        auroc,
    ]

    # compile model
    print('*** Start Compiling ***')
    optimizer = Adam(lr=initial_learning_rate)
    model_train.compile(optimizer=optimizer, loss="binary_crossentropy")

    # train
    print("** start training **")
    history = model_train.fit_generator(
        generator=train_sequence,
        steps_per_epoch=train_steps,
        epochs=epochs,
        validation_data=validation_sequence,
        validation_steps=validation_steps,
        callbacks=callbacks,
        class_weight=class_weights,
        workers=8,
        shuffle=False,
    )

    # dump history
    print("** dump history **")
    with open(os.path.join(outputs_path, f"history{args.model_epoch}.pkl"),
              "wb") as f:
        pickle.dump({
            "history": history.history,
            "auroc": auroc.aurocs,
        }, f)
    print("** done! **")
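# Entry-point guard (assumed; not present in the excerpt above). Note that
# with two main() definitions in this file, Python binds the name to the last
# one, i.e. the argparse-driven trainer directly above.
if __name__ == "__main__":
    main()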