def augmentation_generator(df_imgs, image_folder_path, batch_size,
                           n_augmentation, input_shape, labels, preprocessing,
                           TrainImageGen):
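    # Yields (X, y) batches in which every source image is replicated
    # n_augmentation times and run through TrainImageGen (a Keras-style
    # ImageDataGenerator); y is the one-hot encoded label matrix.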

    nb_imgs = df_imgs.shape[0]
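    # Each source image is expanded into n_augmentation augmented copies, so
    # fewer source images are read per batch to keep the yielded batch size
    # close to batch_size.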
    batch_size_adapted = int(batch_size / n_augmentation)
    nb_batch = int(math.ceil(nb_imgs * 1.0 / batch_size_adapted))

    while True:

        for num_batch in range(nb_batch):

            df_imgs_batch = df_imgs.iloc[num_batch *
                                         batch_size_adapted:(num_batch + 1) *
                                         batch_size_adapted, :]
            nb_imgs_batch = df_imgs_batch.shape[0]

            X_batch_list = []
            y_batch_list = []

            for num_img in range(nb_imgs_batch):

                row = df_imgs_batch.iloc[num_img, :]
                img_filename = row[constants.FILENAME]
                img_path = utils.get_file_path(image_folder_path, img_filename)
                label = row[constants.LABEL]
                label_index = labels.index(label)

                try:
                    x = utils.preprocess_img(img_path, input_shape,
                                             preprocessing)
                    x = np.tile(x, (n_augmentation, 1, 1, 1))

                    # TrainImageGen.flow is an infinite generator: take only its
                    # first batch of n_augmentation augmented copies of the image
                    for batch in TrainImageGen.flow(x,
                                                    batch_size=n_augmentation):
                        X_batch_list.append(batch)
                        y_batch_list.extend([label_index] * n_augmentation)
                        break
                except IOError as e:
                    print("Cannot read the image '{}', skipping it. Error: {}".
                          format(img_filename, e))

            X_batch = np.concatenate(X_batch_list)

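            # One-hot encode the labels; n_classes is expected to be defined in
            # the enclosing module scope.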
            actual_batch_size = X_batch.shape[0]
            y_batch = np.zeros((actual_batch_size, n_classes))
            y_batch[range(actual_batch_size), y_batch_list] = 1

            yield (X_batch, y_batch)


def no_augmentation_generator(df_imgs, image_folder_path, batch_size,
                              input_shape, labels, preprocessing):
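    # Same as augmentation_generator but yields each preprocessed image exactly
    # once, without data augmentation.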

    nb_imgs = df_imgs.shape[0]
    nb_batch = int(math.ceil(nb_imgs * 1.0 / batch_size))

    while True:

        for num_batch in range(nb_batch):

            df_imgs_batch = df_imgs.iloc[num_batch *
                                         batch_size:(num_batch + 1) *
                                         batch_size, :]
            nb_imgs_batch = df_imgs_batch.shape[0]
            X_batch_list = []
            y_batch_list = []

            for num_img in range(nb_imgs_batch):

                row = df_imgs_batch.iloc[num_img, :]
                img_filename = row[constants.FILENAME]

                img_path = utils.get_file_path(image_folder_path, img_filename)
                label = row[constants.LABEL]
                label_index = labels.index(label)
                try:
                    x = utils.preprocess_img(img_path, input_shape,
                                             preprocessing)
                    X_batch_list.append(x)
                    y_batch_list.append(label_index)
                except IOError as e:
                    print("Cannot read the image '{}', skipping it. Error: {}".
                          format(img_filename, e))

            X_batch = np.array(X_batch_list)

            actual_batch_size = X_batch.shape[0]
            y_batch = np.zeros((actual_batch_size, n_classes))
            y_batch[range(actual_batch_size), y_batch_list] = 1

            yield (X_batch, y_batch)
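

# A minimal usage sketch (hypothetical names: train_df, folder_path,
# class_labels and preproc stand for objects built elsewhere in the recipe):
#
#     gen = no_augmentation_generator(train_df, folder_path, batch_size=32,
#                                     input_shape=(224, 224, 3),
#                                     labels=class_labels,
#                                     preprocessing=preproc)
#     model.fit_generator(gen,
#                         steps_per_epoch=int(math.ceil(train_df.shape[0] / 32.0)),
#                         epochs=10)

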
def predict(limit=5, min_threshold=0):
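    # Scores images_paths in batches of `batch_size`; returns a dict with one
    # prediction per image and an error flag (1 when the image could not be
    # read, in which case the prediction is None).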
    batch_size = 100
    n = 0
    results = {"prediction": [], "error": []}
    num_images = len(images_paths)
    while True:
        if (n * batch_size) >= num_images:
            break

        next_batch_list = []
        error_indices = []
        for index_in_batch, i in enumerate(
                range(n * batch_size, min((n + 1) * batch_size, num_images))):
            img_path = images_paths[i]
            try:
                preprocessed_img = utils.preprocess_img(
                    utils.get_file_path(image_folder_path, img_path),
                    model_input_shape, preprocessing)
                next_batch_list.append(preprocessed_img)
            except IOError as e:
                print("Cannot read the image '{}', skipping it. Error: {}".
                      format(img_path, e))
                error_indices.append(index_in_batch)
        next_batch = np.array(next_batch_list)

        prediction_batch = utils.get_predictions(model, next_batch, limit,
                                                 min_threshold, labels_df)
        error_batch = [0] * len(prediction_batch)

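        # Re-insert placeholders at the positions of unreadable images so the
        # results stay aligned with images_paths.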
        for err_index in error_indices:
            prediction_batch.insert(err_index, None)
            error_batch.insert(err_index, 1)

        results["prediction"].extend(prediction_batch)
        results["error"].extend(error_batch)
        n += 1
        print("{} images treated, out of {}".format(
            min(n * batch_size, num_images), num_images))
    return results
utils.check_managed_folder_filesystem(model_folder)
model_folder_path = model_folder.get_path()

output_name = get_output_names_for_role('scored_dataset')[0]
output_dataset = dataiku.Dataset(output_name)

# Model
model_and_pp = utils.load_instantiate_keras_model_preprocessing(
    model_folder_path, goal=constants.SCORING)
model = model_and_pp["model"]
preprocessing = model_and_pp["preprocessing"]
model_input_shape = utils.get_model_input_shape(model, model_folder_path)

# (classId -> Name) mapping
labels_df = None
labels_path = utils.get_file_path(model_folder_path,
                                  constants.MODEL_LABELS_FILE)
if os.path.isfile(labels_path):
    labels_df = pd.read_csv(labels_path, sep=",")
    labels_df = labels_df.set_index('id')
else:
    print(
        "------ \n Info: No CSV label mapping file in the model folder, class names will not be used. \n ------"
    )

# Image paths
images_paths = os.listdir(image_folder_path)

###################################################################################################################
## COMPUTING SCORE
###################################################################################################################
    def run(self, progress_callback):

        # Retrieving parameters
        output_folder_name = self.config.get('outputName', '')
        model = self.config.get('model', '')
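        # Model identifiers are expected to follow the
        # '<architecture>_<trained_on>' naming convention.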
        architecture, trained_on = model.split('_')

        # Creating new Managed Folder if needed
        project = self.client.get_project(self.project_key)
        output_folder_found = False

        for folder in project.list_managed_folders():
            if output_folder_name == folder['name']:
                output_folder = project.get_managed_folder(folder['id'])
                output_folder_found = True
                break

        if not output_folder_found:
            output_folder = project.create_managed_folder(output_folder_name)

        output_folder_path = dataiku.Folder(
            output_folder.get_definition()["id"],
            project_key=self.project_key).get_path()

        # Building config file
        config = {
            "architecture": architecture,
            "trained_on": trained_on,
            "extract_layer_default_index":
                utils.get_extract_layer_index(architecture, trained_on)
        }

        # Downloading weights
        url_to_weights = utils.get_weights_urls(architecture, trained_on)

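        # Throttle progress reporting to at most one callback every 3 seconds.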
        def update_percent(percent, last_update_time):
            new_time = time.time()
            if (new_time - last_update_time) > 3:
                progress_callback(percent)
                return new_time
            else:
                return last_update_time

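        # Stream each file into the managed folder in chunks, reporting overall
        # download progress through update_percent.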
        def download_files_to_managed_folder(folder_path,
                                             files_info,
                                             chunk_size=8192):
            total_size = 0
            bytes_so_far = 0
            for file_info in files_info:
                response = requests.get(file_info["url"], stream=True)
                total_size += int(response.headers.get('content-length'))
                file_info["response"] = response
            update_time = time.time()
            for file_info in files_info:
                with open(
                        utils.get_file_path(folder_path,
                                            file_info["filename"]), "wb") as f:
                    for content in file_info["response"].iter_content(
                            chunk_size=chunk_size):
                        bytes_so_far += len(content)
                        # Scale progress to 80% only: the model summary still
                        # has to be computed after the download completes
                        percent = int(float(bytes_so_far) / total_size * 80)
                        update_time = update_percent(percent, update_time)
                        f.write(content)

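        # Download both the full weights (with the classification head) and the
        # '_notop' weights (feature extraction only).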
        files_to_dl = [{
            "url": url_to_weights["top"],
            "filename": utils.get_weights_filename(output_folder_path, config)
        }, {
            "url": url_to_weights["no_top"],
            "filename": utils.get_weights_filename(output_folder_path, config,
                                                   "_notop")
        }]

        if trained_on == constants.IMAGENET:
            # Downloading mapping id <-> name for imagenet classes
            # File used by Keras in all its 'decode_predictions' methods
            # Found here: https://github.com/keras-team/keras/blob/2.1.1/keras/applications/imagenet_utils.py
            imagenet_id_class_mapping_url = "https://s3.amazonaws.com/deep-learning-models/image-models/imagenet_class_index.json"
            imagenet_class_mapping_temp_file = "imagenet_classes_mapping.json"
            files_to_dl.append({
                "url": imagenet_id_class_mapping_url,
                "filename": imagenet_class_mapping_temp_file
            })

        output_folder.put_file(constants.CONFIG_FILE, json.dumps(config))
        download_files_to_managed_folder(output_folder_path, files_to_dl)

        if trained_on == constants.IMAGENET:
            # Convert class mapping from json to csv
            mapping_df = pd.read_json(utils.get_file_path(
                output_folder_path, imagenet_class_mapping_temp_file),
                                      orient="index")
            mapping_df = mapping_df.reset_index()
            mapping_df = mapping_df.rename(columns={
                "index": "id",
                1: "className"
            })[["id", "className"]]
            mapping_df.to_csv(utils.get_file_path(output_folder_path,
                                                  constants.MODEL_LABELS_FILE),
                              index=False,
                              sep=",")
            os.remove(
                utils.get_file_path(output_folder_path,
                                    imagenet_class_mapping_temp_file))

        # Computing model info
        utils.save_model_info(output_folder_path)

        return "<span>DONE</span>"
        save_best_only=True,
        save_weights_only=should_save_weights_only)
else:
    mcheck = ModelCheckpoint(model_weights_path,
                             monitor="val_loss",
                             save_best_only=True,
                             save_weights_only=should_save_weights_only)

callback_list.append(mcheck)

###################################################################################################################
## TENSORBOARD
###################################################################################################################

if tensorboard:
    log_path = utils.get_file_path(output_model_folder_path,
                                   constants.TENSORBOARD_LOGS)

    # If a folder already exists at log_path, delete it
    if os.path.isdir(log_path):
        shutil.rmtree(log_path)

    tsboard = TensorBoard(log_dir=log_path, write_graph=True)
    callback_list.append(tsboard)

###################################################################################################################
## TRAIN MODEL
###################################################################################################################

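# train_generator, nb_steps_per_epoch and nb_epochs are assumed to be defined
# earlier in the training recipe.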
model.fit_generator(train_generator,
                    steps_per_epoch=nb_steps_per_epoch,
                    epochs=nb_epochs,