def make_steps(step, ampl):
    """
    Perform training epochs.
    @param step Number of epochs to perform.
    @param ampl Amplitude K of the random component added to the score matrix.
    """
    global w2ts, t2i, steps, features, score, histories

    # shuffle the training pictures
    random.shuffle(train)

    # Map whale id to the list of associated training picture hash values
    w2ts = {}
    for w, hs in w2hs.items():
        for h in hs:
            if h in train_set:
                if w not in w2ts:
                    w2ts[w] = []
                if h not in w2ts[w]:
                    w2ts[w].append(h)
    for w, ts in w2ts.items():
        w2ts[w] = np.array(ts)

    # Map training picture hash value to index in 'train' array
    t2i = {}
    for i, t in enumerate(train):
        t2i[t] = i

    # Compute the match score for each picture pair
    features, score = compute_score()

    csv_logger = CSVLogger(
        os.path.join(history_dir, f'trained_{steps + step}.csv'))

    print("** check multiple gpu availability **")
    output_weights_path = os.path.join(models_dir, 'model_finetuning.h5')
    gpus = len(os.getenv("CUDA_VISIBLE_DEVICES", "0,1").split(","))
    if gpus > 1:
        print(f"** multi_gpu_model is used! gpus={gpus} **")
        model_train = multi_gpu_model(model, gpus)
        # FIXME: currently (Keras 2.1.2) checkpoint doesn't work with multi_gpu_model
        checkpoint = MultiGPUModelCheckpoint(
            filepath=output_weights_path,
            base_model=model,
            save_best_only=False,
            save_weights_only=False,
        )
    else:
        model_train = model
        checkpoint = ModelCheckpoint(
            output_weights_path,
            # save_weights_only=True,
            save_best_only=False,
            verbose=1,
        )

    # model_train.compile(Adam(lr=64e-5), loss=focal_loss(gamma=2., alpha=.5),
    #                     metrics=['binary_crossentropy', 'acc'])
    model_train.compile(Adam(lr=64e-5), loss='binary_crossentropy',
                        metrics=['binary_crossentropy', 'acc'])

    callbacks = [
        csv_logger,
        checkpoint,
        TensorBoard(log_dir=os.path.join(output_dir, "logs"),
                    batch_size=batch_size),
    ]

    # Train model_train for 'step' epochs
    history = model_train.fit_generator(
        TrainingData(score + ampl * np.random.random_sample(size=score.shape),
                     steps=step, batch_size=batch_size),
        initial_epoch=steps,
        epochs=steps + step,
        max_queue_size=max_queue_size,
        workers=workers,
        verbose=1,
        callbacks=callbacks,
    ).history
    steps += step

    # Collect history data
    history['epochs'] = steps
    history['ms'] = np.mean(score)
    history['lr'] = get_lr(model_train)
    print(history['epochs'], history['lr'], history['ms'])
    histories.append(history)
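# --- Usage sketch (illustrative, not part of the original script) -----------
# make_steps() is typically driven by a loop that anneals `ampl`: a large
# random component early on makes pair sampling nearly uniform, and a smaller
# one later concentrates training on hard pairs. The names used here (train,
# w2hs, train_set, model, batch_size, ...) are the same globals make_steps
# already assumes; the exact schedule below is only an assumption.
#
#     steps = 0
#     histories = []
#     make_steps(10, 1000)        # warm-up with a heavily randomized score matrix
#     ampl = 100.0
#     for _ in range(10):
#         make_steps(5, ampl)
#         ampl = max(1.0, 100 ** -0.1 * ampl)   # decay the random component
# -----------------------------------------------------------------------------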
def main():
    # parser config
    config_file = "./config.ini"
    cp = ConfigParser()
    cp.read(config_file)

    # default config
    output_dir = cp["DEFAULT"].get("output_dir")
    image_source_dir = cp["DEFAULT"].get("image_source_dir")
    train_patient_count = cp["DEFAULT"].getint("train_patient_count")
    dev_patient_count = cp["DEFAULT"].getint("dev_patient_count")
    data_entry_file = cp["DEFAULT"].get("data_entry_file")
    class_names = cp["DEFAULT"].get("class_names").split(",")

    # train config
    use_base_model_weights = cp["TRAIN"].getboolean("use_base_model_weights")
    use_trained_model_weights = cp["TRAIN"].getboolean(
        "use_trained_model_weights")
    use_best_weights = cp["TRAIN"].getboolean("use_best_weights")
    output_weights_name = cp["TRAIN"].get("output_weights_name")
    epochs = cp["TRAIN"].getint("epochs")
    batch_size = cp["TRAIN"].getint("batch_size")
    initial_learning_rate = cp["TRAIN"].getfloat("initial_learning_rate")
    train_steps = cp["TRAIN"].get("train_steps")
    patience_reduce_lr = cp["TRAIN"].getint("patience_reduce_lr")
    validation_steps = cp["TRAIN"].get("validation_steps")
    positive_weights_multiply = cp["TRAIN"].getfloat(
        "positive_weights_multiply")
    use_class_balancing = cp["TRAIN"].getboolean("use_class_balancing")
    use_default_split = cp["TRAIN"].getboolean("use_default_split")

    # if previously trained weights are used, never re-split
    if use_trained_model_weights:
        # resuming mode
        print("** use trained model weights, turn on use_skip_split automatically **")
        use_skip_split = True

        # load training status for resuming
        training_stats_file = os.path.join(output_dir, ".training_stats.json")
        if os.path.isfile(training_stats_file):
            # TODO: add loading previous learning rate?
            training_stats = json.load(open(training_stats_file))
        else:
            training_stats = {}
    else:
        # start over
        use_skip_split = cp["TRAIN"].getboolean("use_skip_split")
        training_stats = {}

    split_dataset_random_state = cp["TRAIN"].getint(
        "split_dataset_random_state")
    show_model_summary = cp["TRAIN"].getboolean("show_model_summary")
    # end parser config

    # check output_dir, create it if it does not exist
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    running_flag_file = os.path.join(output_dir, ".training.lock")
    if os.path.isfile(running_flag_file):
        raise RuntimeError("A process is running in this directory!!!")
    else:
        open(running_flag_file, "a").close()

    try:
        print(f"backup config file to {output_dir}")
        shutil.copy(config_file,
                    os.path.join(output_dir, os.path.split(config_file)[1]))

        # split train/dev/test
        if use_default_split:
            datasets = ["train", "dev", "test"]
            for dataset in datasets:
                shutil.copy(f"./data/default_split/{dataset}.csv", output_dir)
        elif not use_skip_split:
            print("** split dataset **")
            split_data(
                data_entry_file,
                class_names,
                train_patient_count,
                dev_patient_count,
                output_dir,
                split_dataset_random_state,
            )

        # get train/dev sample counts
        train_counts, train_pos_counts = get_sample_counts(
            output_dir, "train", class_names)
        dev_counts, _ = get_sample_counts(output_dir, "dev", class_names)

        # compute steps
        if train_steps == "auto":
            train_steps = int(train_counts / batch_size)
        else:
            try:
                train_steps = int(train_steps)
            except ValueError:
                raise ValueError(f"""
                train_steps: {train_steps} is invalid,
                please use 'auto' or integer.
                """)
        print(f"** train_steps: {train_steps} **")

        if validation_steps == "auto":
            validation_steps = int(dev_counts / batch_size)
        else:
            try:
                validation_steps = int(validation_steps)
            except ValueError:
                raise ValueError(f"""
                validation_steps: {validation_steps} is invalid,
                please use 'auto' or integer.
                """)
        print(f"** validation_steps: {validation_steps} **")

        # compute class weights
        print("** compute class weights from training data **")
        class_weights = get_class_weights(
            train_counts,
            train_pos_counts,
            multiply=positive_weights_multiply,
            use_class_balancing=use_class_balancing)
        print("** class_weights **")
        for c, w in class_weights.items():
            print(f" {c}: {w}")

        print("** load model **")
        if use_base_model_weights:
            base_model_weights_file = cp["TRAIN"].get(
                "base_model_weights_file")
        else:
            base_model_weights_file = None
        if use_trained_model_weights:
            if use_best_weights:
                model_weights_file = os.path.join(
                    output_dir, f"best_{output_weights_name}")
            else:
                model_weights_file = os.path.join(output_dir,
                                                  output_weights_name)
        else:
            model_weights_file = None

        model = get_model(class_names, base_model_weights_file,
                          model_weights_file)
        if show_model_summary:
            print(model.summary())

        # recreate symlink folder for ImageDataGenerator
        symlink_dir_name = "image_links"
        create_symlink(image_source_dir, output_dir, symlink_dir_name)

        print("** create image generators **")
        train_data_path = f"{output_dir}/{symlink_dir_name}/train/"
        train_generator = custom_image_generator(
            ImageDataGenerator(horizontal_flip=True, rescale=1. / 255),
            train_data_path,
            batch_size=batch_size,
            class_names=class_names,
        )
        dev_data_path = f"{output_dir}/{symlink_dir_name}/dev/"
        dev_generator = custom_image_generator(
            ImageDataGenerator(horizontal_flip=True, rescale=1. / 255),
            dev_data_path,
            batch_size=batch_size,
            class_names=class_names,
        )

        output_weights_path = os.path.join(output_dir, output_weights_name)
        print(f"** set output weights path to: {output_weights_path} **")

        print("** check multiple gpu availability **")
        gpus = len(os.getenv("CUDA_VISIBLE_DEVICES", "1").split(","))
        if gpus > 1:
            print(f"** multi_gpu_model is used! gpus={gpus} **")
            model_train = multi_gpu_model(model, gpus)
            # FIXME: currently (Keras 2.1.2) checkpoint doesn't work with multi_gpu_model
            checkpoint = MultiGPUModelCheckpoint(
                filepath=output_weights_path,
                base_model=model,
            )
        else:
            model_train = model
            checkpoint = ModelCheckpoint(output_weights_path)

        print("** compile model with class weights **")
        optimizer = Adam(lr=initial_learning_rate)
        model_train.compile(optimizer=optimizer, loss="binary_crossentropy")

        auroc = MultipleClassAUROC(
            generator=dev_generator,
            steps=validation_steps,
            class_names=class_names,
            weights_path=output_weights_path,
            stats=training_stats,
        )
        callbacks = [
            checkpoint,
            TensorBoard(log_dir=os.path.join(output_dir, "logs"),
                        batch_size=batch_size),
            ReduceLROnPlateau(monitor='val_loss', factor=0.1,
                              patience=patience_reduce_lr, verbose=1),
            auroc,
        ]

        print("** training start **")
        history = model_train.fit_generator(
            generator=train_generator,
            steps_per_epoch=train_steps,
            epochs=epochs,
            validation_data=dev_generator,
            validation_steps=validation_steps,
            callbacks=callbacks,
            class_weight=class_weights,
        )

        # dump history
        print("** dump history **")
        with open(os.path.join(output_dir, "history.pkl"), "wb") as f:
            pickle.dump({
                "history": history.history,
                "auroc": auroc.aurocs,
            }, f)
        print("** done! **")

    finally:
        os.remove(running_flag_file)
def main():
    # parser config
    print("### Input configuration file ### \n")
    config_file = "./config.ini"
    cp = ConfigParser()
    cp.read(config_file)

    # default config
    print("### Read default configurations ### \n")
    output_dir = cp["DEFAULT"].get("output_dir")
    image_train_source_dir = cp["DEFAULT"].get("image_train_source_dir")
    image_valid_source_dir = cp["DEFAULT"].get("image_valid_source_dir")
    base_model_name = cp["DEFAULT"].get("base_model_name")
    class_names = cp["DEFAULT"].get("class_names").split(",")

    # train config
    print("### Reading training configurations ### \n")
    use_base_model_weights = cp["TRAIN"].getboolean("use_base_model_weights")
    use_trained_model_weights = cp["TRAIN"].getboolean(
        "use_trained_model_weights")
    use_best_weights = cp["TRAIN"].getboolean("use_best_weights")
    output_weights_name = cp["TRAIN"].get("output_weights_name")
    epochs = cp["TRAIN"].getint("epochs")
    batch_size = cp["TRAIN"].getint("batch_size")
    initial_learning_rate = cp["TRAIN"].getfloat("initial_learning_rate")
    generator_workers = cp["TRAIN"].getint("generator_workers")
    image_dimension = cp["TRAIN"].getint("image_dimension")
    patience_reduce_lr = cp["TRAIN"].getint("patience_reduce_lr")
    min_lr = cp["TRAIN"].getfloat("min_lr")
    positive_weights_multiply = cp["TRAIN"].getfloat(
        "positive_weights_multiply")
    dataset_csv_dir = cp["TRAIN"].get("dataset_csv_dir")

    # if previously trained weights are used, never re-split
    if use_trained_model_weights:
        # resuming mode
        print("** use trained model weights **")
        # load training status for resuming
        training_stats_file = os.path.join(output_dir, ".training_stats.json")
        if os.path.isfile(training_stats_file):
            # TODO: add loading previous learning rate?
            training_stats = json.load(open(training_stats_file))
        else:
            training_stats = {}
    else:
        # start over
        training_stats = {}

    print("### Show model summary ### \n")
    show_model_summary = cp["TRAIN"].getboolean("show_model_summary")
    # end parser config

    print("### Check output directory ### \n")
    # check output_dir, create it if it does not exist
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    running_flag_file = os.path.join(output_dir, ".training.lock")
    if os.path.isfile(running_flag_file):
        raise RuntimeError("A process is running in this directory!!!")
    else:
        open(running_flag_file, "a").close()

    try:
        print("### Backup config file to {} \n".format(output_dir))
        shutil.copy(config_file,
                    os.path.join(output_dir, os.path.split(config_file)[1]))

        datasets = ["train", "valid"]
        for dataset in datasets:
            shutil.copy(os.path.join(dataset_csv_dir, dataset + '.csv'),
                        output_dir)

        # get train/dev sample counts
        print("### Get class frequencies ### \n")
        train_counts, train_pos_counts = get_sample_counts(
            output_dir, "train", class_names)
        dev_counts, _ = get_sample_counts(output_dir, "valid", class_names)

        # compute steps
        print("### Compute step size ### \n")
        train_steps = int(train_counts / batch_size)
        validation_steps = int(dev_counts / batch_size)

        # compute class weights
        print("### Class weights ### \n")
        class_weights = get_class_weights(
            train_counts,
            train_pos_counts,
            multiply=positive_weights_multiply,
        )
        print("### Class_weights ### \n")
        print(class_weights)
        print("\n")

        print("### Loading model ### \n")
        if use_trained_model_weights:
            if use_best_weights:
                model_weights_file = os.path.join(
                    output_dir, "best_" + output_weights_name)
            else:
                model_weights_file = os.path.join(output_dir,
                                                  output_weights_name)
        else:
            model_weights_file = None

        model_factory = ModelFactory()
        print("### Get model ### \n")
        model = model_factory.get_model(
            class_names,
            model_name=base_model_name,
            use_base_weights=use_base_model_weights,
            weights_path=model_weights_file,
            input_shape=(image_dimension, image_dimension, 3))

        print("Show model summary? {}".format(show_model_summary))
        if show_model_summary:
            print(model.summary())

        print("\n ### Create image generators ### \n")
        train_sequence = AugmentedImageSequence(
            dataset_csv_file=os.path.join(output_dir, "train.csv"),
            class_names=class_names,
            source_image_dir=image_train_source_dir,
            batch_size=batch_size,
            target_size=(image_dimension, image_dimension),
            augmenter=augmenter,
            steps=train_steps,
        )
        validation_sequence = AugmentedImageSequence(
            dataset_csv_file=os.path.join(output_dir, "valid.csv"),
            class_names=class_names,
            source_image_dir=image_valid_source_dir,
            batch_size=batch_size,
            target_size=(image_dimension, image_dimension),
            augmenter=augmenter,
            steps=validation_steps,
            shuffle_on_epoch_end=False,
        )

        output_weights_path = os.path.join(output_dir, output_weights_name)
        print("### Set output weights path to {} ### \n".format(
            output_weights_path))

        print("### Check multiple gpu availability ### \n")
        # gpus = len(os.getenv("CUDA_VISIBLE_DEVICES").split(","))
        if False:  # multi-GPU training is turned off in this script
            print("### Multi_gpu_model is used! gpus={} ###".format(gpus))
            model_train = multi_gpu_model(model, gpus)
            # FIXME: currently (Keras 2.1.2) checkpoint doesn't work with multi_gpu_model
            checkpoint = MultiGPUModelCheckpoint(
                filepath=output_weights_path,
                base_model=model,
            )
        else:
            model_train = model
            checkpoint = ModelCheckpoint(
                output_weights_path,
                save_weights_only=True,
                save_best_only=True,
                verbose=1,
            )

        print("### Compile model with class weights ### \n")
        optimizer = Adam(lr=initial_learning_rate)
        model_train.compile(optimizer=optimizer, loss="binary_crossentropy")

        auroc = MultipleClassAUROC(
            sequence=validation_sequence,
            class_names=class_names,
            weights_path=output_weights_path,
            stats=training_stats,
            workers=generator_workers,
        )
        callbacks = [
            checkpoint,
            TensorBoard(log_dir=os.path.join(output_dir, "logs")),
            ReduceLROnPlateau(monitor='val_loss', factor=0.1,
                              patience=patience_reduce_lr, verbose=1,
                              mode="min", min_lr=min_lr),
            auroc,
        ]

        print("### Start training ### \n")
        history = model_train.fit(
            train_sequence,
            steps_per_epoch=train_steps,
            epochs=epochs,
            validation_data=validation_sequence,
            validation_steps=validation_steps,
            callbacks=callbacks,
            class_weight=class_weights,
            workers=generator_workers,
            shuffle=False,
        )

        # dump history
        print("### Dump history ### \n")
        with open(os.path.join(output_dir, "history.pkl"), "wb") as f:
            pickle.dump({
                "history": history.history,
                "auroc": auroc.aurocs,
            }, f)
        print("** done! **")

    finally:
        os.remove(running_flag_file)
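# The AugmentedImageSequence instances above are passed an `augmenter` that is
# defined outside this snippet. A minimal imgaug-based definition consistent
# with that usage could look like the following; this is an assumption, not
# necessarily the project's exact augmentation pipeline.
from imgaug import augmenters as iaa

augmenter = iaa.Sequential(
    [
        iaa.Fliplr(0.5),  # horizontally flip 50% of the images
    ],
    random_order=True,
)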
def train_rsna_clf(train_data=None, validation_data=None, remove_running=True):
    # parser config
    config_file = "./config.ini"
    cp = ConfigParser()
    cp.read(config_file)

    # default config
    output_dir = cp["DEFAULT"].get("output_dir")
    image_source_dir = cp["DEFAULT"].get("image_source_dir")
    base_model_name = cp["DEFAULT"].get("base_model_name")
    class_names1 = cp["DEFAULT"].get("class_names1").split(",")
    class_names2 = cp["DEFAULT"].get("class_names2").split(",")

    # train config
    train_image_source_dir = cp["TRAIN"].get("train_image_source_dir")
    train_class_info = cp["TRAIN"].get("train_class_info")
    train_box_info = cp["TRAIN"].get("train_box_info")
    use_base_model_weights = cp["TRAIN"].getboolean("use_base_model_weights")
    use_trained_model_weights = cp["TRAIN"].getboolean(
        "use_trained_model_weights")
    use_best_weights = cp["TRAIN"].getboolean("use_best_weights")
    input_weights_name = cp["TRAIN"].get("input_weights_name")
    output_weights_name = cp["TRAIN"].get("output_weights_name")
    epochs = cp["TRAIN"].getint("epochs")
    batch_size = cp["TRAIN"].getint("batch_size")
    initial_learning_rate = cp["TRAIN"].getfloat("initial_learning_rate")
    generator_workers = cp["TRAIN"].getint("generator_workers")
    image_dimension = cp["TRAIN"].getint("image_dimension")
    train_steps = cp["TRAIN"].get("train_steps")
    patience_reduce_lr = cp["TRAIN"].getint("patience_reduce_lr")
    min_lr = cp["TRAIN"].getfloat("min_lr")
    validation_steps = cp["TRAIN"].get("validation_steps")
    positive_weights_multiply = cp["TRAIN"].getfloat(
        "positive_weights_multiply")
    dataset_csv_dir = cp["TRAIN"].get("dataset_csv_dir")

    # if previously trained weights are used, never re-split
    if use_trained_model_weights:
        # resuming mode
        print("** use trained model weights **")
        # load training status for resuming
        training_stats_file = os.path.join(output_dir, ".training_stats.json")
        if os.path.isfile(training_stats_file):
            # TODO: add loading previous learning rate?
            training_stats = json.load(open(training_stats_file))
        else:
            training_stats = {}
    else:
        # start over
        training_stats = {}

    show_model_summary = cp["TRAIN"].getboolean("show_model_summary")
    # end parser config

    # check output_dir, create it if it does not exist
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    running_flag_file = os.path.join(output_dir, ".training.lock")
    if os.path.isfile(running_flag_file):
        if remove_running:
            os.remove(running_flag_file)
            open(running_flag_file, "a").close()
        else:
            raise RuntimeError("A process is running in this directory!!!")
    else:
        open(running_flag_file, "a").close()

    try:
        print(f"backup config file to {output_dir}")
        shutil.copy(config_file,
                    os.path.join(output_dir, os.path.split(config_file)[1]))

        # get train/dev sample counts
        train_counts, train_pos_counts = get_sample_counts(
            train_data.df, class_names2)
        validation_counts, _ = get_sample_counts(validation_data.df,
                                                 class_names2)

        # compute steps
        if train_steps == "auto":
            train_steps = int(train_counts / batch_size)
        else:
            try:
                train_steps = int(train_steps)
            except ValueError:
                raise ValueError(f"""
                train_steps: {train_steps} is invalid,
                please use 'auto' or integer.
                """)
        print(f"** train_steps: {train_steps} **")

        if validation_steps == "auto":
            validation_steps = int(validation_counts / batch_size)
        else:
            try:
                validation_steps = int(validation_steps)
            except ValueError:
                raise ValueError(f"""
                validation_steps: {validation_steps} is invalid,
                please use 'auto' or integer.
                """)
        print(f"** validation_steps: {validation_steps} **")

        # compute class weights
        print("** compute class weights from training data **")
        class_weights = get_class_weights(
            train_counts,
            train_pos_counts,
            multiply=positive_weights_multiply,
        )
        print("** class_weights **")
        print(class_weights)

        print("** load model **")
        if use_trained_model_weights:
            if use_best_weights:
                model_weights_file = os.path.join(
                    output_dir, f"best_{input_weights_name}")
            else:
                model_weights_file = os.path.join(output_dir,
                                                  input_weights_name)
        else:
            model_weights_file = None

        model_factory = ModelFactory()
        model = model_factory.get_model(
            class_names1,
            model_name=base_model_name,
            use_base_weights=use_base_model_weights,
            weights_path=model_weights_file,
            input_shape=(image_dimension, image_dimension, 3))
        model = modify_last_layer(model, class_names2)
        if show_model_summary:
            print(model.summary())

        train_sq = AugmentedLabelSequence_clf(
            train_data,
            batch_size=batch_size,
            target_size=(image_dimension, image_dimension),
            augmenter=augmenter,
            steps=train_steps,
        )
        validation_sq = AugmentedLabelSequence_clf(
            validation_data,
            batch_size=batch_size,
            target_size=(image_dimension, image_dimension),
            augmenter=augmenter,
            steps=validation_steps,
        )

        output_weights_path = os.path.join(output_dir, output_weights_name)
        print(f"** set output weights path to: {output_weights_path} **")

        print("** check multiple gpu availability **")
        gpus = len(os.getenv("CUDA_VISIBLE_DEVICES", "1").split(","))
        if gpus > 1:
            print(f"** multi_gpu_model is used! gpus={gpus} **")
            model_train = multi_gpu_model(model, gpus)
            # FIXME: currently (Keras 2.1.2) checkpoint doesn't work with multi_gpu_model
            checkpoint = MultiGPUModelCheckpoint(
                filepath=output_weights_path,
                base_model=model,
            )
        else:
            model_train = model
            checkpoint = ModelCheckpoint(
                output_weights_path,
                save_weights_only=True,
                save_best_only=True,
                verbose=1,
            )

        print("** compile model with class weights **")
        optimizer = Adam(lr=initial_learning_rate)
        model_train.compile(optimizer=optimizer, loss="binary_crossentropy")

        auroc = MultipleClassAUROC(
            sequence=validation_sq,
            class_names=class_names2,
            weights_path=output_weights_path,
            stats=training_stats,
            workers=generator_workers,
        )
        callbacks = [
            checkpoint,
            TensorBoard(log_dir=os.path.join(output_dir, "logs"),
                        batch_size=batch_size),
            ReduceLROnPlateau(monitor='val_loss', factor=0.1,
                              patience=patience_reduce_lr, verbose=1,
                              mode="min", min_lr=min_lr),
            auroc,
        ]

        print("** start training **")
        history = model_train.fit_generator(
            generator=train_sq,
            steps_per_epoch=train_steps,
            epochs=epochs,
            validation_data=validation_sq,
            validation_steps=validation_steps,
            callbacks=callbacks,
            class_weight=class_weights,
            workers=generator_workers,
            shuffle=False,
        )

        # dump history
        print("** dump history **")
        with open(os.path.join(output_dir, "history.pkl"), "wb") as f:
            pickle.dump({
                "history": history.history,
                "auroc": auroc.aurocs,
            }, f)
        print("** done! **")

    finally:
        os.remove(running_flag_file)
    rescale=1.0 / 255)

validation_generator = (validation_datagen.flow_from_directory(
    validation_dir,
    target_size=(224, 224),
    batch_size=32,
    class_mode="categorical"
) if len(class_names) > 2 else validation_datagen.flow_from_directory(
    validation_dir,
    target_size=(224, 224),
    batch_size=32,
    class_mode="binary"))

gpus = len(os.getenv("CUDA_VISIBLE_DEVICES", "1").split(","))
if gpus > 1:
    print(f"** multi_gpu_model is used! gpus={gpus} **")
    model_train = multi_gpu_model(model, gpus)
    # FIXME: currently (Keras 2.1.2) checkpoint doesn't work with multi_gpu_model
    checkpoint = MultiGPUModelCheckpoint(filepath=output_weights_path,
                                         base_model=model)
else:
    model_train = model
    checkpoint = ModelCheckpoint(output_weights_path,
                                 save_weights_only=True,
                                 save_best_only=True,
                                 verbose=1)

optimizer = Adam(lr=0.001)
if len(class_names) > 2:
    model_train.compile(optimizer=optimizer,
                        loss="categorical_crossentropy",
                        metrics=["acc"])
else:
    model_train.compile(optimizer=optimizer,
                        loss="binary_crossentropy",