def augmentation_generator(df_imgs, image_folder_path, batch_size, n_augmentation, input_shape, labels,
                           preprocessing, TrainImageGen):
    nb_imgs = df_imgs.shape[0]
    batch_size_adapted = int(batch_size / n_augmentation)
    nb_batch = int(math.ceil(nb_imgs * 1.0 / batch_size_adapted))
    # Number of classes, derived from the labels list (used for one-hot encoding below)
    n_classes = len(labels)
    while True:
        for num_batch in range(nb_batch):
            df_imgs_batch = df_imgs.iloc[num_batch * batch_size_adapted:(num_batch + 1) * batch_size_adapted, :]
            nb_imgs_batch = df_imgs_batch.shape[0]
            X_batch_list = []
            y_batch_list = []
            for num_img in range(nb_imgs_batch):
                row = df_imgs_batch.iloc[num_img, :]
                img_filename = row[constants.FILENAME]
                img_path = utils.get_file_path(image_folder_path, img_filename)
                label = row[constants.LABEL]
                label_index = labels.index(label)
                try:
                    x = utils.preprocess_img(img_path, input_shape, preprocessing)
                    x = np.tile(x, (n_augmentation, 1, 1, 1))
                    # TrainImageGen.flow yields batches indefinitely: take the first
                    # augmented batch of size n_augmentation, then break out
                    for batch in TrainImageGen.flow(x, batch_size=n_augmentation):
                        X_batch_list.append(batch)
                        y_batch_list.extend([label_index] * n_augmentation)
                        break
                except IOError as e:
                    print("Cannot read the image '{}', skipping it. Error: {}".format(img_filename, e))
            X_batch = np.concatenate(X_batch_list)
            actual_batch_size = X_batch.shape[0]
            # One-hot encode the labels of the batch
            y_batch = np.zeros((actual_batch_size, n_classes))
            y_batch[range(actual_batch_size), y_batch_list] = 1
            yield (X_batch, y_batch)
def no_augmentation_generator(df_imgs, image_folder_path, batch_size, input_shape, labels, preprocessing):
    nb_imgs = df_imgs.shape[0]
    nb_batch = int(math.ceil(nb_imgs * 1.0 / batch_size))
    # Number of classes, derived from the labels list (used for one-hot encoding below)
    n_classes = len(labels)
    while True:
        for num_batch in range(nb_batch):
            df_imgs_batch = df_imgs.iloc[num_batch * batch_size:(num_batch + 1) * batch_size, :]
            nb_imgs_batch = df_imgs_batch.shape[0]
            X_batch_list = []
            y_batch_list = []
            for num_img in range(nb_imgs_batch):
                row = df_imgs_batch.iloc[num_img, :]
                img_filename = row[constants.FILENAME]
                img_path = utils.get_file_path(image_folder_path, img_filename)
                label = row[constants.LABEL]
                label_index = labels.index(label)
                try:
                    x = utils.preprocess_img(img_path, input_shape, preprocessing)
                    X_batch_list.append(x)
                    y_batch_list.append(label_index)
                except IOError as e:
                    print("Cannot read the image '{}', skipping it. Error: {}".format(img_filename, e))
            X_batch = np.array(X_batch_list)
            actual_batch_size = X_batch.shape[0]
            # One-hot encode the labels of the batch
            y_batch = np.zeros((actual_batch_size, n_classes))
            y_batch[range(actual_batch_size), y_batch_list] = 1
            yield (X_batch, y_batch)
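# Hypothetical usage sketch, not part of the original recipes: how the two generators
# above could be wired to Keras. The names df_train, df_val, image_folder_path,
# input_shape, labels, preprocessing and model are assumed to be defined earlier in the
# training recipe; the augmentation parameters and epoch count are illustrative only.
from keras.preprocessing.image import ImageDataGenerator

batch_size = 32
n_augmentation = 2
TrainImageGen = ImageDataGenerator(rotation_range=20, horizontal_flip=True)
train_gen = augmentation_generator(df_train, image_folder_path, batch_size=batch_size,
                                   n_augmentation=n_augmentation, input_shape=input_shape,
                                   labels=labels, preprocessing=preprocessing,
                                   TrainImageGen=TrainImageGen)
val_gen = no_augmentation_generator(df_val, image_folder_path, batch_size=batch_size,
                                    input_shape=input_shape, labels=labels,
                                    preprocessing=preprocessing)
model.fit_generator(train_gen,
                    # each augmented step covers batch_size // n_augmentation source images
                    steps_per_epoch=int(math.ceil(df_train.shape[0] * 1.0 / (batch_size // n_augmentation))),
                    validation_data=val_gen,
                    validation_steps=int(math.ceil(df_val.shape[0] * 1.0 / batch_size)),
                    epochs=10)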
def download_files_to_managed_folder(folder_path, files_info, chunk_size=8192):
    # Note: relies on an update_percent(percent, last_update_time) helper being available
    # in the enclosing scope to report download progress (see the macro below)
    total_size = 0
    bytes_so_far = 0
    for file_info in files_info:
        response = requests.get(file_info["url"], stream=True)
        total_size += int(response.headers.get('content-length'))
        file_info["response"] = response
    update_time = time.time()
    for file_info in files_info:
        with open(utils.get_file_path(folder_path, file_info["filename"]), "wb") as f:
            for content in file_info["response"].iter_content(chunk_size=chunk_size):
                bytes_so_far += len(content)
                # Only scale to 80% because the model summary still needs to be computed after the download
                percent = int(float(bytes_so_far) / total_size * 80)
                update_time = update_percent(percent, update_time)
                f.write(content)
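# Hypothetical usage sketch, for illustration only: downloading two files into a managed
# folder with the helper above. The folder name, URLs and filenames are assumptions, and
# update_percent must be defined as in the macro further down.
folder_path = dataiku.Folder("downloaded_models").get_path()  # assumed folder name
files_info = [
    {"url": "https://example.com/weights.h5", "filename": "weights.h5"},
    {"url": "https://example.com/weights_notop.h5", "filename": "weights_notop.h5"},
]
download_files_to_managed_folder(folder_path, files_info)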
def predict(limit=5, min_threshold=0):
    batch_size = 100
    n = 0
    results = {"prediction": [], "error": []}
    num_images = len(images_paths)
    while True:
        if (n * batch_size) >= num_images:
            break
        next_batch_list = []
        error_indices = []
        for index_in_batch, i in enumerate(range(n * batch_size, min((n + 1) * batch_size, num_images))):
            img_path = images_paths[i]
            try:
                preprocessed_img = utils.preprocess_img(
                    utils.get_file_path(image_folder_path, img_path),
                    model_input_shape,
                    preprocessing)
                next_batch_list.append(preprocessed_img)
            except IOError as e:
                print("Cannot read the image '{}', skipping it. Error: {}".format(img_path, e))
                error_indices.append(index_in_batch)
        next_batch = np.array(next_batch_list)
        prediction_batch = utils.get_predictions(model, next_batch, limit, min_threshold, labels_df)
        error_batch = [0] * len(prediction_batch)
        # Re-insert placeholders for the images that could not be read,
        # so that the results stay aligned with images_paths
        for err_index in error_indices:
            prediction_batch.insert(err_index, None)
            error_batch.insert(err_index, 1)
        results["prediction"].extend(prediction_batch)
        results["error"].extend(error_batch)
        n += 1
        print("{} images processed, out of {}".format(min(n * batch_size, num_images), num_images))
    return results
utils.check_managed_folder_filesystem(model_folder)
model_folder_path = model_folder.get_path()

output_name = get_output_names_for_role('scored_dataset')[0]
output_dataset = dataiku.Dataset(output_name)

# Model
model_and_pp = utils.load_instantiate_keras_model_preprocessing(model_folder_path, goal=constants.SCORING)
model = model_and_pp["model"]
preprocessing = model_and_pp["preprocessing"]
model_input_shape = utils.get_model_input_shape(model, model_folder_path)

# (classId -> Name) mapping
labels_df = None
labels_path = utils.get_file_path(model_folder_path, constants.MODEL_LABELS_FILE)
if os.path.isfile(labels_path):
    labels_df = pd.read_csv(labels_path, sep=",")
    labels_df = labels_df.set_index('id')
else:
    print("------ \n Info: No csv file in the model folder, will not use class names. \n ------")

# Image paths
images_paths = os.listdir(image_folder_path)

###################################################################################################################
## COMPUTING SCORE
###################################################################################################################
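# Hypothetical sketch of the scoring step, for illustration only: the output column
# names and this exact call sequence are assumptions based on the predict() helper
# shown earlier, not the recipe's verbatim code.
results = predict(limit=5, min_threshold=0)
output_df = pd.DataFrame({
    "images": images_paths,
    "prediction": results["prediction"],
    "error": results["error"]
})
output_dataset.write_with_schema(output_df)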
def run(self, progress_callback):
    # Retrieving parameters
    output_folder_name = self.config.get('outputName', '')
    model = self.config.get('model', '')
    architecture, trained_on = model.split('_')

    # Creating a new managed folder if needed
    project = self.client.get_project(self.project_key)
    output_folder_found = False
    for folder in project.list_managed_folders():
        if output_folder_name == folder['name']:
            output_folder = project.get_managed_folder(folder['id'])
            output_folder_found = True
            break
    if not output_folder_found:
        output_folder = project.create_managed_folder(output_folder_name)
    output_folder_path = dataiku.Folder(output_folder.get_definition()["id"],
                                        project_key=self.project_key).get_path()

    # Building the config file
    config = {
        "architecture": architecture,
        "trained_on": trained_on,
        "extract_layer_default_index": utils.get_extract_layer_index(architecture, trained_on)
    }

    # Downloading weights
    url_to_weights = utils.get_weights_urls(architecture, trained_on)

    def update_percent(percent, last_update_time):
        # Throttle progress updates to at most one every 3 seconds
        new_time = time.time()
        if (new_time - last_update_time) > 3:
            progress_callback(percent)
            return new_time
        else:
            return last_update_time

    def download_files_to_managed_folder(folder_path, files_info, chunk_size=8192):
        total_size = 0
        bytes_so_far = 0
        for file_info in files_info:
            response = requests.get(file_info["url"], stream=True)
            total_size += int(response.headers.get('content-length'))
            file_info["response"] = response
        update_time = time.time()
        for file_info in files_info:
            with open(utils.get_file_path(folder_path, file_info["filename"]), "wb") as f:
                for content in file_info["response"].iter_content(chunk_size=chunk_size):
                    bytes_so_far += len(content)
                    # Only scale to 80% because the model summary still needs to be computed after the download
                    percent = int(float(bytes_so_far) / total_size * 80)
                    update_time = update_percent(percent, update_time)
                    f.write(content)

    files_to_dl = [{
        "url": url_to_weights["top"],
        "filename": utils.get_weights_filename(output_folder_path, config)
    }, {
        "url": url_to_weights["no_top"],
        "filename": utils.get_weights_filename(output_folder_path, config, "_notop")
    }]

    if trained_on == constants.IMAGENET:
        # Downloading the (id <-> name) mapping for the ImageNet classes.
        # This is the file used by Keras in its 'decode_predictions' methods, found here:
        # https://github.com/keras-team/keras/blob/2.1.1/keras/applications/imagenet_utils.py
        imagenet_id_class_mapping_url = "https://s3.amazonaws.com/deep-learning-models/image-models/imagenet_class_index.json"
        imagenet_class_mapping_temp_file = "imagenet_classes_mapping.json"
        files_to_dl.append({
            "url": imagenet_id_class_mapping_url,
            "filename": imagenet_class_mapping_temp_file
        })

    output_folder.put_file(constants.CONFIG_FILE, json.dumps(config))
    download_files_to_managed_folder(output_folder_path, files_to_dl)

    if trained_on == constants.IMAGENET:
        # Convert the class mapping from JSON to CSV
        mapping_df = pd.read_json(utils.get_file_path(output_folder_path, imagenet_class_mapping_temp_file),
                                  orient="index")
        mapping_df = mapping_df.reset_index()
        mapping_df = mapping_df.rename(columns={"index": "id", 1: "className"})[["id", "className"]]
        mapping_df.to_csv(utils.get_file_path(output_folder_path, constants.MODEL_LABELS_FILE),
                          index=False, sep=",")
        os.remove(utils.get_file_path(output_folder_path, imagenet_class_mapping_temp_file))

    # Computing model info
    utils.save_model_info(output_folder_path)

    return "<span>DONE</span>"
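# For reference, after the macro above has run, the managed folder contains:
#   - the JSON config file (constants.CONFIG_FILE) with keys "architecture",
#     "trained_on" and "extract_layer_default_index",
#   - the two downloaded weights files (with and without the top layers),
#   - when trained on ImageNet, the class-name CSV (constants.MODEL_LABELS_FILE)
#     with columns "id" and "className", which the scoring recipe loads as labels_df.
# This summary is inferred from the code above, not taken from separate documentation.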
                                 save_best_only=True,
                                 save_weights_only=should_save_weights_only)
else:
    mcheck = ModelCheckpoint(model_weights_path,
                             monitor="val_loss",
                             save_best_only=True,
                             save_weights_only=should_save_weights_only)
callback_list.append(mcheck)

###################################################################################################################
## TENSORBOARD
###################################################################################################################

if tensorboard:
    log_path = utils.get_file_path(output_model_folder_path, constants.TENSORBOARD_LOGS)
    # If a folder already exists at log_path, delete it
    if os.path.isdir(log_path):
        shutil.rmtree(log_path)
    tsboard = TensorBoard(log_dir=log_path, write_graph=True)
    callback_list.append(tsboard)

###################################################################################################################
## TRAIN MODEL
###################################################################################################################

model.fit_generator(train_generator,
                    steps_per_epoch=nb_steps_per_epoch,
                    epochs=nb_epochs,