def on_batch_end(self, batch, logs=None):
    """
    Saves the model weights and a log snapshot every `save_each` batches.

    :param batch: batch index supplied by the caller. It is not used here: the callback keeps
                  its own running counter in `self.batch`.
    :param logs: [optional] mapping of values for the batch that just ended. `None` is treated
                 as an empty mapping.
    """
    # if set to False, will keep weights from previous batches (uses a lot of storage!)
    overwrite = True

    if self.batch % self.save_each == 0:
        weights_name = 'weights.h5' if overwrite else f'weights_{self.batch}.h5'
        weights_path = pj(self.exp_dir, weights_name)
        log_path = weights_path.replace('.h5', '.log')
        self.model.save_weights(weights_path)

        # `logs` defaults to None and `**None` raises TypeError, so fall back to an empty mapping
        log = {**(logs or {}), **(self.params or {})}
        dump(log.items(), log_path)

    # count every batch, whether or not weights were saved for it
    self.batch += 1
def gen_exp_data_dir(gender, train_samples, validation_samples, subjects=None):
    """
    Generates a data dir for the experiment, using either random or specified subjects of a
    given gender with the specified train/validation sets sizes.

    Both subjects always get the same number of train and validation images. If the subject
    with fewer images cannot satisfy the requested sizes, the sizes are scaled down while
    keeping the requested train/validation ratio.

    :param gender: the gender of the subjects to train model on
    :param train_samples: the number of images to use in training phase
    :param validation_samples: the number of images to use in validation phase
    :param subjects: [optional] a tuple of exactly two subject names, of the same gender.
    :return: the experiment name, the experiment data dir, actual number of training samples,
             actual number of validation samples
    :raises AssertionError: if `subjects` is invalid, if fewer than two subject dirs are
                            available, or if either resulting set would be empty
    """
    gender_source_dir = pj(Paths.data_dir, gender)

    if subjects:
        # use specified subjects
        assert len(subjects) == 2, \
            f'Invalid size ({len(subjects)}) for subjects argument. Must be exactly 2!'
        assert all(subject in SUBJECTS[gender] for subject in subjects), \
            f'Both subjects must be of the specified gender: {gender}'
        subject1_name = subjects[0]
        subject2_name = subjects[1]
        subject1_source_dir = pj(gender_source_dir, subject1_name)
        subject2_source_dir = pj(gender_source_dir, subject2_name)
        assert pe(subject1_source_dir) and pe(subject2_source_dir), \
            f'Images dir for either {subjects[0]} or {subjects[1]} was not found!'
    else:
        # get two random subjects from relevant gender dir
        gender_source_subjects = glob(pj(gender_source_dir, '*'))
        # fail with a clear message instead of an IndexError on the lookups below
        assert len(gender_source_subjects) >= 2, \
            f'At least two subjects are required in {gender_source_dir}, ' \
            f'found {len(gender_source_subjects)}'
        random.shuffle(gender_source_subjects)
        subject1_source_dir = gender_source_subjects[0]
        subject2_source_dir = gender_source_subjects[1]
        subject1_name = ps(subject1_source_dir)[1]
        subject2_name = ps(subject2_source_dir)[1]

    # get images for both subjects
    subject1_image_paths = glob(pj(subject1_source_dir, '*.jpg'))
    subject2_image_paths = glob(pj(subject2_source_dir, '*.jpg'))

    # get the minimum number of images between subject 1 and 2
    # (to create train/validation sets of same sizes)
    min_num_images = min(len(subject1_image_paths), len(subject2_image_paths))

    # if there are not enough images for requested train and validation samples,
    # use same ratio of train/validation with available images
    if train_samples + validation_samples > min_num_images:
        train_validation_ratio = train_samples / (train_samples + validation_samples)
        train_samples = int(min_num_images * train_validation_ratio)
        validation_samples = min_num_images - train_samples

    assert train_samples and validation_samples, 'Train and Validation sets must be larger than 0'

    # print sets sizes
    print(f'Using {train_samples} training samples and {validation_samples} validation samples.')

    # the validation slice starts right after the training slice, so the two never overlap
    validation_end = train_samples + validation_samples

    # get random training and validation images for subject 1
    random.shuffle(subject1_image_paths)
    subject1_train_image_paths = subject1_image_paths[:train_samples]
    subject1_validation_image_paths = subject1_image_paths[train_samples:validation_end]

    # get random training and validation images for subject 2
    random.shuffle(subject2_image_paths)
    subject2_train_image_paths = subject2_image_paths[:train_samples]
    subject2_validation_image_paths = subject2_image_paths[train_samples:validation_end]

    # init experiment data dir with train and validation data dirs
    # (the timestamp has minute resolution)
    timestamp = dt.datetime.now().strftime('%Y-%m-%d_%H%M')
    exp_name = f'{subject1_name}_{subject2_name}_{timestamp}'
    exp_data_dir = pj(Paths.experiments_dir, exp_name)

    # init train data dirs
    train_dir = pj(exp_data_dir, 'train')
    subject1_train_dir = pj(train_dir, subject1_name)
    subject2_train_dir = pj(train_dir, subject2_name)
    mkdirs(subject1_train_dir)
    mkdirs(subject2_train_dir)

    # init validation data dirs
    validation_dir = pj(exp_data_dir, 'validation')
    subject1_validation_dir = pj(validation_dir, subject1_name)
    subject2_validation_dir = pj(validation_dir, subject2_name)
    mkdirs(subject1_validation_dir)
    mkdirs(subject2_validation_dir)

    # copy training and validation images for subject 1 and 2 to exp_data_dir
    copy_plan = (
        (subject1_train_image_paths, subject1_train_dir),
        (subject2_train_image_paths, subject2_train_dir),
        (subject1_validation_image_paths, subject1_validation_dir),
        (subject2_validation_image_paths, subject2_validation_dir),
    )
    for image_paths, target_dir in copy_plan:
        for image_path in image_paths:
            shutil.copy(image_path, target_dir)

    # write experiment metadata
    metadata = {
        'Gender': gender,
        'Training samples': train_samples,
        'Validation samples': validation_samples
    }
    dump(metadata.items(), pj(exp_data_dir, 'metadata.txt'))
    dump(subject1_train_image_paths + subject2_train_image_paths,
         pj(exp_data_dir, 'train_paths.txt'))
    dump(subject1_validation_image_paths + subject2_validation_image_paths,
         pj(exp_data_dir, 'validation_paths.txt'))

    return exp_name, exp_data_dir, train_samples, validation_samples
validation_data=validation_generator, validation_steps=valid_steps_per_epoch, callbacks=[WeightsSaver(model, save_each, exp_dir)]) # save final model weights to disk model.save_weights(pj(exp_dir, f'{EXP_NAME}_weights_final.h5')) # make predictions on the validation/test set p_validation = model.predict_generator(validation_generator, verbose=1) # cross-entropy loss score on the validation/test set # loss_valid = log_loss(validation_generator, p_validation) # save model predictions to disk p_valid_path = pj(exp_dir, f'{EXP_NAME}_pred_valid.csv') dump(p_validation, p_valid_path) # log end and run times end_time = dt.datetime.now() print(f'End time: {end_time}') print(f'Run time: {end_time - start_time}') # save experiment statistics to disk exp_stats = { 'Exp name:': EXP_NAME, 'Start time:': start_time, 'End time:': end_time, 'Run time:': (end_time - start_time), # 'Loss (valid):': loss_valid, '': '' } # TODO : add more statistics to the log file.
def run_experiment(gender=None,
                   train_samples=None,
                   validation_samples=None,
                   subjects=None,
                   img_size=224,
                   img_channels=3,
                   num_classes=2,
                   batch_size=16,
                   epochs=10,
                   freeze_first_layers=36,
                   save_each=5,
                   learning_rate=0.001,
                   model='vgg16',
                   prev_exp_dir=None):
    """
    Train a model on a pair of subjects, according to specified arguments and get predictions
    on validation set, as well as performance metrics.

    Everything produced by the run (final weights, predictions csv/pkl, classification report
    and a statistics log) is written into the experiment data dir.

    :param gender: the gender of the subjects for which to create an experiment
    :param train_samples: number of training samples per class
    :param validation_samples: number of validation samples per class
    :param subjects: [optional] a tuple of exactly two subject names, of the same gender.
                     if not set, two random subjects of the same gender will be used.
    :param img_size: [optional] height / width of input images
    :param img_channels: [optional] number of color channels in input images
    :param num_classes: [optional] number of output classes
    :param batch_size: [optional] number of training samples per gradient update
    :param epochs: [optional] number of iteration over the entire training set
    :param freeze_first_layers: [optional] number of layers to freeze
    :param save_each: [optional] number of batches after which to save intermediate weights h5
                      file. Currently only recorded in the statistics log, because the
                      weights-saving callback is disabled in the training call below.
    :param learning_rate: [optional] the step to use in each gradient update
    :param model: [optional] the name of the model to use (vgg16/bcn)
    :param prev_exp_dir: [optional] an existing experiment dir to use train/validation images
                         from (if specified, no need to specify gender, train_samples,
                         validation_samples).
    :return: None
    :raises AssertionError: if the arguments are incomplete, the model name is not supported,
                            or the subjects of `prev_exp_dir` are of different genders
    :raises ValueError: if the model name is listed as supported but has no loader here
    """
    assert (gender and train_samples and validation_samples) or prev_exp_dir, \
        'Either prev_exp_dir or gender and train_samples and validation_samples must be specified!'
    assert model in models, f'{model} is not supported! available models: {str(models)}'

    # print start time
    start_time = dt.datetime.now()
    print(f'Start time: {start_time}')

    # generate data for experiment
    if prev_exp_dir:
        # init experiment data dir with subject names from prev_exp_dir and new timestamp.
        # normpath drops a trailing separator, which would otherwise yield an empty dir name.
        prev_exp_name = ps(os.path.normpath(prev_exp_dir))[1]
        subject1_name, subject2_name, _ = prev_exp_name.split('_', 2)

        # validate before touching the disk, so a bad prev_exp_dir leaves no half-made exp dir
        assert (subject1_name in M_SUBJECTS and subject2_name in M_SUBJECTS) or (
            subject1_name in F_SUBJECTS and subject2_name in F_SUBJECTS
        ), f'Subjects in {prev_exp_dir} are from different genders!'
        subjects = (subject1_name, subject2_name)
        gender = 'M' if subject1_name in M_SUBJECTS else 'F'

        timestamp = dt.datetime.now().strftime('%Y-%m-%d_%H%M')
        exp_name = f'{subject1_name}_{subject2_name}_{timestamp}'
        exp_data_dir = pj(Paths.experiments_dir, exp_name)
        mkdirs(exp_data_dir)

        # copy train and validation images from prev_exp_dir to exp_data_dir
        shutil.copytree(pj(prev_exp_dir, 'train'), pj(exp_data_dir, 'train'))
        shutil.copytree(pj(prev_exp_dir, 'validation'), pj(exp_data_dir, 'validation'))

        # totals cover both subjects
        total_train_samples = len(
            glob(pj(exp_data_dir, 'train', '**', '*.jpg'), recursive=True))
        total_validation_samples = len(
            glob(pj(exp_data_dir, 'validation', '**', '*.jpg'), recursive=True))
        print(
            f'Using data for experiment | Exp name: {exp_name} | Gender: {gender} | '
            f'Total train samples: {total_train_samples} | '
            f'Total validation samples: {total_validation_samples} | Subjects: {subjects}'
        )
    else:
        print(
            f'Generating data for experiment | Gender: {gender} | '
            f'Requested train samples (per class): {train_samples} | '
            f'Requested validation samples (per class): {validation_samples} | '
            f'Subjects: {subjects}'
        )
        exp_name, exp_data_dir, actual_train_samples, actual_validation_samples = gen_exp_data_dir(
            gender, train_samples, validation_samples, subjects)
        # gen_exp_data_dir reports per-subject sizes; there are two subjects
        total_train_samples = actual_train_samples * 2
        total_validation_samples = actual_validation_samples * 2
        print(
            f'Generated data for experiment | Exp name: {exp_name} | '
            f'Total train samples: {total_train_samples} | '
            f'Total validation samples: {total_validation_samples} | Exp dir: {exp_data_dir}'
        )

    print(f'Using exp data dir: {exp_data_dir}')

    # get training and validation data generators
    print(
        f'Getting train and validation generators | Batch size: {batch_size} | Image size: {img_size}'
    )
    train_generator, validation_generator = get_train_and_valid_generators(
        exp_data_dir, batch_size, img_size)

    # load model
    metrics = ['accuracy']
    if model == 'vgg16':
        # load vgg16 model
        from cnn_finetune.vgg16 import vgg16_model
        initial_weights_path = pj(Paths.pretrained_dir,
                                  'vgg16_weights_tf_dim_ordering_tf_kernels.h5')
        initial_weights_num_classes = 1000
        initial_weights = (initial_weights_path, initial_weights_num_classes)
        print(
            f'Loading vgg16 model | Initial weights path: {initial_weights_path} | '
            f'Initial weights number of classes: {initial_weights_num_classes}'
        )
        model = vgg16_model(img_size, img_size, img_channels, num_classes,
                            initial_weights, freeze_first_layers, learning_rate, metrics)
    elif model == 'bcn':
        # load binary convnet model
        from cnn_finetune.small_convnet import binary_convnet_model
        model = binary_convnet_model(img_size, img_size, img_channels, metrics=metrics)
    else:
        # only reachable if `models` lists a name without a loader above; fail clearly here
        # instead of with an AttributeError on None further down
        raise ValueError(f'{model} is listed as supported but has no loader in run_experiment')

    # start fine-tuning the model
    print(f'Training model | Epochs: {epochs}')
    steps_per_epoch = total_train_samples // batch_size
    model.fit_generator(
        train_generator,
        steps_per_epoch=steps_per_epoch,
        epochs=epochs,
        validation_data=validation_generator,
        validation_steps=total_validation_samples // batch_size,
        # intermediate weights saving is currently disabled:
        # callbacks=[WeightsSaver(model, save_each, exp_data_dir)]
    )

    # save final model weights to disk
    final_weights_path = pj(exp_data_dir, f'{exp_name}_weights_final.h5')
    print(f'Saving model weights | Path: {final_weights_path}')
    model.save_weights(final_weights_path)

    # make predictions on the validation/test set
    print('Making predictions on validation set')
    validation_predictions = model.predict_generator(validation_generator, verbose=1)

    # convert the probabilities matrix to an array of predicted classes
    if num_classes > 2:
        validation_y_pred = np.array(
            [np.argmax(p) for p in validation_predictions], dtype=np.float32)
    else:
        # if binary classification
        # NOTE(review): a score above 0.5 maps to class 0; presumably the model output is the
        # probability of class 0 (it is exported below as 'prob_0') - confirm.
        validation_y_pred = np.array(
            [0 if p > 0.5 else 1 for p in validation_predictions], dtype=np.float32)

    # generate an array of true classes.
    # Important: validation generator must be used with shuffle=False for this to work.
    validation_y_true = validation_generator.classes

    # generate predictions object and save it to exp_data_dir
    predictions_data_path = pj(exp_data_dir, f'{exp_name}_validations_predictions.csv')
    print(f'Saving predictions on validation set | Path: {predictions_data_path}')
    class_indices = validation_generator.class_indices
    filenames = validation_generator.filenames
    predictions_data = [('class', 'filename', 'y_true', 'y_pred', 'prob_0')]
    for ind, pred in enumerate(validation_predictions):
        # each generator filename is split into its class dir and the image file name
        name_parts = ps(filenames[ind])
        predictions_data.append((class_indices[name_parts[0]],
                                 name_parts[1],
                                 validation_y_true[ind],
                                 int(validation_y_pred[ind]),
                                 pred[0]))
    dump(predictions_data, predictions_data_path, delimiter=',')
    objdump([validation_y_true, validation_y_pred],
            pj(exp_data_dir, f'{exp_name}_validations_predictions.pkl'))

    # metrics on the validation/test set
    print('Getting metrics on validation set predictions')
    # NOTE(review): log_loss receives the hard predicted classes, not the predicted
    # probabilities - confirm this is the intended loss value.
    validation_loss = log_loss(validation_y_true, validation_y_pred)
    validation_accuracy = accuracy_score(validation_y_true, validation_y_pred)
    validation_precision = precision_score(validation_y_true, validation_y_pred, average='micro')
    validation_recall = recall_score(validation_y_true, validation_y_pred, average='micro')
    validation_f1 = f1_score(validation_y_true, validation_y_pred, average='micro')

    # save classification report.
    # labels / target_names are passed by keyword: recent scikit-learn versions do not accept
    # them positionally.
    report = classification_report(validation_y_true,
                                   validation_y_pred,
                                   labels=list(class_indices.values()),
                                   target_names=list(class_indices.keys()))
    print(report)
    dump(report, pj(exp_data_dir, 'report.txt'))

    # log end and run times
    end_time = dt.datetime.now()
    print(f'End time: {end_time}')
    print(f'Run time: {end_time - start_time}')

    # save experiment statistics to disk
    exp_stats = {
        'Exp name:': exp_name,
        'Gender:': gender,
        'Start time:': start_time,
        'End time:': end_time,
        'Run time:': (end_time - start_time),
        '-': '-',
        'Train samples:': total_train_samples,
        'Epochs:': epochs,
        'Batch size:': batch_size,
        'Steps per epoch:': steps_per_epoch,
        'Freeze first layers:': freeze_first_layers,
        'Learning rate:': learning_rate,
        'Save each:': save_each,
        '--': '--',
        'Validation samples:': total_validation_samples,
        'Validation loss:': validation_loss,
        'Validation accuracy:': validation_accuracy,
        'Validation precision:': validation_precision,
        'Validation recall:': validation_recall,
        'Validation F1:': validation_f1,
        '': ''
    }
    exp_stats_path = pj(exp_data_dir, f'{exp_name}.log')
    dump(exp_stats.items(), exp_stats_path, append=True)

    # if final weights are saved, delete intermediate weights file
    intermediate_weights_path = pj(exp_data_dir, 'weights.h5')
    if pe(final_weights_path) and pe(intermediate_weights_path):
        os.remove(intermediate_weights_path)
def predict_with_model(data_dir,
                       out_dir,
                       model_name,
                       initial_weights_path,
                       num_classes,
                       batch_size=16):
    """
    Load a model with the given weights, predict on the images of a data dir and write the
    predictions, a classification report and a statistics log to an output dir.

    :param data_dir: the dir with the images to predict on, passed to the data generator
    :param out_dir: the dir to write all outputs to (created via mkdirs)
    :param model_name: the name of the model to use (vgg16/bcn)
    :param initial_weights_path: path of the weights file to load into the model
    :param num_classes: number of output classes. Values of 2 or less are treated as binary
                        classification.
    :param batch_size: [optional] batch size passed to the data generator
    :return: None
    :raises AssertionError: if `model_name` is not one of the handled models
    """
    # create output dir
    mkdirs(out_dir)

    # load model
    model = None
    img_size = None
    if model_name == 'vgg16':
        # for the binary case the weights are declared as having a single output class
        initial_weights = (initial_weights_path, num_classes if num_classes > 2 else 1)
        model = vgg16_model(num_classes=num_classes, initial_weights=initial_weights)
        img_size = vgg16_img_size
    elif model_name == 'bcn':
        model = binary_convnet_model(initial_weights_path=initial_weights_path)
        img_size = bcn_img_size

    # compare against None explicitly, so the check does not depend on the truthiness of
    # the model object
    assert model is not None, f'model {model_name} not loaded!'
    print(
        f'Loading {model_name} model | Initial weights path: {initial_weights_path} | '
        f'Initial weights number of classes: {num_classes}'
    )

    # init data generator
    data_generator = get_data_generator(data_dir, img_size, batch_size)

    # make predictions
    data_predictions = model.predict_generator(data_generator, verbose=1)

    # convert the probabilities matrix to an array of predicted classes
    if num_classes > 2:
        data_y_pred = np.array([np.argmax(p) for p in data_predictions], dtype=np.float32)
    else:
        # if binary classification
        # NOTE(review): a score above 0.5 maps to class 0; presumably the model output is the
        # probability of class 0 (it is exported below as 'prob_0') - confirm.
        data_y_pred = np.array([0 if p > 0.5 else 1 for p in data_predictions],
                               dtype=np.float32)

    # generate an array of true classes.
    # Important: data generator must be used with shuffle=False for this to work.
    data_y_true = data_generator.classes

    # generate predictions object and save it to out_dir
    exp_name = 'exp'  # TODO : construct exp name
    predictions_data_path = pj(out_dir, f'{exp_name}_predictions.csv')
    print(f'Saving predictions on data set | Path: {predictions_data_path}')
    class_indices = data_generator.class_indices
    filenames = data_generator.filenames
    predictions_data = [('class', 'filename', 'y_true', 'y_pred', 'prob_0')]
    for ind, pred in enumerate(data_predictions):
        # each generator filename is split into its class dir and the image file name
        name_parts = ps(filenames[ind])
        predictions_data.append((class_indices[name_parts[0]],
                                 name_parts[1],
                                 data_y_true[ind],
                                 int(data_y_pred[ind]),
                                 pred[0]))
    dump(predictions_data, predictions_data_path, delimiter=',')
    objdump([data_y_true, data_y_pred], pj(out_dir, f'{exp_name}_predictions.pkl'))

    # metrics on the data/test set
    print('Getting metrics on data set predictions')
    # NOTE(review): log_loss receives the hard predicted classes, not the predicted
    # probabilities - confirm this is the intended loss value.
    data_loss = log_loss(data_y_true, data_y_pred)
    data_accuracy = accuracy_score(data_y_true, data_y_pred)
    data_precision = precision_score(data_y_true, data_y_pred, average='micro')
    data_recall = recall_score(data_y_true, data_y_pred, average='micro')
    data_f1 = f1_score(data_y_true, data_y_pred, average='micro')

    # save classification report.
    # labels / target_names are passed by keyword: recent scikit-learn versions do not accept
    # them positionally.
    report = classification_report(data_y_true,
                                   data_y_pred,
                                   labels=list(class_indices.values()),
                                   target_names=list(class_indices.keys()))
    print(report)
    dump(report, pj(out_dir, 'report.txt'))

    # save experiment statistics to disk
    exp_stats = {
        'Exp name:': exp_name,
        # 'Gender:': gender,
        # 'Start time:': start_time,
        # 'End time:': end_time,
        # 'Run time:': (end_time - start_time),
        '-': '-',
        'Data dir:': data_dir,
        'Model name:': model_name,
        'Initial weights path:': initial_weights_path,
        'Num. classess:': num_classes,
        '--': '--',
        # 'Validation samples:': total_data_samples,
        'Validation loss:': data_loss,
        'Validation accuracy:': data_accuracy,
        'Validation precision:': data_precision,
        'Validation recall:': data_recall,
        'Validation F1:': data_f1,
        '': ''
    }
    exp_stats_path = pj(out_dir, f'{exp_name}.log')
    dump(exp_stats.items(), exp_stats_path, append=True)
# cross-entropy loss score on the validation/test set loss_valid = log_loss(y_valid, p_valid) # generate predictions object and save it to exp_data_dir predictions_data_path = pj(exp_dir, f'{EXP_NAME}_validations_predictions.csv') predictions_data = [ ('y_true', 'y_pred', 'prob_0', 'prob_1', 'prob_2', 'prob_3', 'prob_4', 'prob_5', 'prob_6', 'prob_7', 'prob_8', 'prob_9') ] predictions_data += [(np.argmax(y_valid[ind]), np.argmax(p_valid[ind]), pred[0], pred[1], pred[2], pred[3], pred[4], pred[5], pred[6], pred[7], pred[8], pred[9]) for ind, pred in enumerate(p_valid)] dump(predictions_data, predictions_data_path, delimiter=',') # cross-entropy loss score on the validation/test set # p_valid_one_hot = np.array([[int(i == np.argmax(pv)) for i in range(0, len(pv))] for pv in p_valid], dtype=np.float32) # convert the probabilities matrix to an array of 1-hot vectors. yv = [np.argmax(y) for y in y_valid] pv = [np.argmax(p) for p in p_valid] objdump([y_valid, p_valid, yv, pv], pj(exp_dir, f'{EXP_NAME}_validations_predictions.pkl')) validation_loss = log_loss(y_valid, p_valid) validation_accuracy = accuracy_score(yv, pv) validation_precision = precision_score(yv, pv, average='micro') validation_recall = recall_score(yv, pv, average='micro') validation_f1 = f1_score(yv, pv, average='micro') # save classification report