def evaluate_model_area(model_name: str, test_words_type: str) -> None:
    r"""MAKEDOC: what is evaluate_model_area doing?"""
    logg = logging.getLogger(f"c.{__name__}.evaluate_model_area")
    # logg.setLevel("INFO")
    logg.debug("Start evaluate_model_area")

    # magic to fix the GPUs
    setup_gpus()

    # # VAN_opa1_lr05_bs32_en15_dsaug07_wLTall
    # hypa = {
    #     "batch_size_type": "32",
    #     "dataset_name": "aug07",
    #     "epoch_num_type": "15",
    #     "learning_rate_type": "03",
    #     "net_type": "VAN",
    #     "optimizer_type": "a1",
    #     # "words_type": "LTall",
    #     "words_type": train_words_type,
    # }
    # # use_validation = True
    # use_validation = False
    # dataset_name = hypa["dataset_name"]

    # get the model name
    # model_name = build_area_name(hypa, use_validation)
    logg.debug(f"model_name: {model_name}")

    dataset_re = re.compile("_ds(.*?)_")
    match = dataset_re.search(model_name)
    if match is not None:
        logg.debug(f"match[1]: {match[1]}")
        dataset_name = match[1]

    train_words_type_re = re.compile("_w(.*?)[_.]")
    match = train_words_type_re.search(model_name)
    if match is not None:
        logg.debug(f"match[1]: {match[1]}")
        train_words_type = match[1]
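    # e.g. a model name containing "_dsaug07_" and "_wLTall_" yields
    # dataset_name="aug07" and train_words_type="LTall" with the patterns above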

    # load the model
    model_folder = Path("trained_models") / "area"
    model_path = model_folder / f"{model_name}.h5"
    model = tf_models.load_model(model_path)
    # model.summary()

    train_words = words_types[train_words_type]
    logg.debug(f"train_words: {train_words}")
    test_words = words_types[test_words_type]
    logg.debug(f"test_words: {test_words}")

    # input data
    processed_path = Path("data_proc") / f"{dataset_name}"
    data, labels = load_processed(processed_path, test_words)
    logg.debug(f"list(data.keys()): {list(data.keys())}")
    logg.debug(f"data['testing'].shape: {data['testing'].shape}")

    # evaluate on the test split of the chosen test words
    logg.debug("Evaluate on test data:")
    model.evaluate(data["testing"], labels["testing"])
    # model.evaluate(data["validation"], labels["validation"])

    # predict labels/cm/fscore
    y_pred = model.predict(data["testing"])
    cm = pred_hot_2_cm(labels["testing"], y_pred, test_words)
    # y_pred = model.predict(data["validation"])
    # cm = pred_hot_2_cm(labels["validation"], y_pred, test_words)
    fscore = analyze_confusion(cm, test_words)
    logg.debug(f"fscore: {fscore}")

    fig, ax = plt.subplots(figsize=(12, 12))
    plot_confusion_matrix(cm, ax, model_name, test_words, fscore, train_words)

    fig_name = f"{model_name}_test{test_words_type}_cm.{{}}"
    cm_folder = Path("plot_results") / "cm"
    if not cm_folder.exists():
        cm_folder.mkdir(parents=True, exist_ok=True)

    plot_cm_path = cm_folder / fig_name.format("png")
    fig.savefig(plot_cm_path)
    plot_cm_path = cm_folder / fig_name.format("pdf")
    fig.savefig(plot_cm_path)

    plt.show()
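

# minimal usage sketch for evaluate_model_area (the example model name mirrors
# the commented-out VAN configuration above; the matching .h5 file under
# trained_models/area and the processed dataset must already exist):
# evaluate_model_area("VAN_opa1_lr05_bs32_en15_dsaug07_wLTall", "LTall")
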
def train_transfer(
    hypa: ty.Dict[str, str],
    force_retrain: bool,
    use_validation: bool,
    trained_folder: Path,
    root_info_folder: Path,
    tensorboard_logs_folder: Path,
) -> None:
    """MAKEDOC: what is train_transfer doing?

    https://www.tensorflow.org/guide/keras/transfer_learning/#build_a_model
    """
    logg = logging.getLogger(f"c.{__name__}.train_transfer")
    # logg.setLevel("INFO")
    logg.debug("Start train_transfer")

    ##########################################################
    #   Setup folders
    ##########################################################

    # name the model
    model_name = build_transfer_name(hypa, use_validation)
    logg.debug(f"model_name: {model_name}")

    # save the trained model here
    model_path = trained_folder / f"{model_name}.h5"
    placeholder_path = trained_folder / f"{model_name}.txt"

    # check if this model has already been trained
    if placeholder_path.exists():
        if force_retrain:
            logg.warn("\nRETRAINING MODEL!!\n")
        else:
            logg.debug("Already trained")
            return

    # save info regarding the model training in this folder
    model_info_folder = root_info_folder / model_name
    if not model_info_folder.exists():
        model_info_folder.mkdir(parents=True, exist_ok=True)

    # magic to fix the GPUs
    setup_gpus()

    ##########################################################
    #   Load data
    ##########################################################

    # grab a few hypas
    words_type = hypa["words_type"]
    datasets_type = hypa["datasets_type"]

    # get the partition of the data
    partition, ids2labels = prepare_partitions(words_type)

    # get the word list
    words = words_types[words_type]
    num_labels = len(words)

    # get the dataset name list
    datasets_types, datasets_shapes = get_datasets_types()
    dataset_names = datasets_types[datasets_type]
    dataset_shape = datasets_shapes[datasets_type]

    # the shape of each sample
    input_shape = (*dataset_shape, 3)
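    # the three dataset variants are presumably stacked as the three channels,
    # so each sample looks like an RGB image to the pretrained base model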

    # from hypa extract training param (epochs, batch, opt, ...)
    training_param = get_training_param_transfer(hypa, use_validation,
                                                 tensorboard_logs_folder,
                                                 model_path)

    # load datasets
    processed_folder = Path("data_split")
    data_split_paths = [processed_folder / f"{dn}" for dn in dataset_names]
    # data, labels = load_triple(data_paths, words)

    # assemble the gen_param for the generators
    gen_param = {
        "dim": dataset_shape,
        "batch_size": training_param["batch_sizes"][0],
        "shuffle": True,
        "label_names": words,
        "data_split_paths": data_split_paths,
    }

    # maybe concatenate the validation and training lists
    val_generator: ty.Optional[AudioGenerator] = None
    if use_validation:
        val_generator = AudioGenerator(partition["validation"], ids2labels,
                                       **gen_param)
        logg.debug("Using validation data")
    else:
        partition["training"].extend(partition["validation"])
        logg.debug("NOT using validation data")

    # create the training generator with the modified (maybe) list of IDs
    training_generator = AudioGenerator(partition["training"], ids2labels,
                                        **gen_param)
    logg.debug(f"len(training_generator): {len(training_generator)}")

    # always create the test generator
    # do not shuffle the test data
    gen_param["shuffle"] = False
    # do not batch it, no loss of stray data at the end
    gen_param["batch_size"] = 1
    testing_generator = AudioGenerator(partition["testing"], ids2labels,
                                       **gen_param)

    ##########################################################
    #   Setup model
    ##########################################################

    # from hypa extract model param
    model_param = get_model_param_transfer(hypa, num_labels, input_shape)

    # get mean and var to normalize the data
    data_mean, data_variance = get_generator_mean_var_cached(
        training_generator, words_type, datasets_type, processed_folder)

    # get the model
    model, base_model = TRAmodel(data_mean=data_mean,
                                 data_variance=data_variance,
                                 **model_param)
    model.summary()

    # a dict to recreate this training
    recap: ty.Dict[str, ty.Any] = {}
    recap["words"] = words
    recap["hypa"] = hypa
    recap["model_param"] = model_param
    recap["use_validation"] = use_validation
    recap["model_name"] = model_name
    recap["batch_sizes"] = training_param["batch_sizes"]
    recap["epoch_num"] = training_param["epoch_num"]
    recap["version"] = "003"

    # logg.debug(f"recap: {recap}")
    recap_path = model_info_folder / "recap.json"
    recap_path.write_text(json.dumps(recap, indent=4))

    ##########################################################
    #   Compile and fit model the first time
    ##########################################################

    model.compile(
        optimizer=training_param["opt"][0],
        loss=tf_losses.CategoricalCrossentropy(),
        metrics=training_param["metrics"][0],
    )

    results_freeze = model.fit(
        training_generator,
        validation_data=val_generator,
        epochs=training_param["epoch_num"][0],
        callbacks=training_param["callbacks"][0],
    )

    # reload the best weights saved by the ModelCheckpoint
    model.load_weights(str(model_path))

    ##########################################################
    #   Save results, history, performance
    ##########################################################

    # results_freeze_recap
    results_freeze_recap: ty.Dict[str, ty.Any] = {}
    results_freeze_recap["model_name"] = model_name
    results_freeze_recap["results_recap_version"] = "001"

    # save the histories
    results_freeze_recap["history_train"] = {
        mn: results_freeze.history[mn]
        for mn in model.metrics_names
    }
    if use_validation:
        results_freeze_recap["history_val"] = {
            f"val_{mn}": results_freeze.history[f"val_{mn}"]
            for mn in model.metrics_names
        }

    # save the results
    res_recap_path = model_info_folder / "results_freeze_recap.json"
    res_recap_path.write_text(json.dumps(results_freeze_recap, indent=4))

    ##########################################################
    #   Compile and fit model the second time
    ##########################################################

    # Unfreeze the base_model. Note that it keeps running in inference mode
    # since we passed `training=False` when calling it. This means that
    # the batchnorm layers will not update their batch statistics.
    # This prevents the batchnorm layers from undoing all the training
    # we've done so far.
    base_model.trainable = True
    model.summary()

    model.compile(
        optimizer=training_param["opt"][1],  # Low learning rate
        loss=tf_losses.CategoricalCrossentropy(),
        metrics=training_param["metrics"][1],
    )

    results_full = model.fit(
        training_generator,
        validation_data=val_generator,
        epochs=training_param["epoch_num"][1],
        callbacks=training_param["callbacks"][1],
    )

    # reload the best weights saved by the ModelCheckpoint
    model.load_weights(str(model_path))

    ##########################################################
    #   Save results, history, performance
    ##########################################################

    results_full_recap: ty.Dict[str, ty.Any] = {}
    results_full_recap["model_name"] = model_name
    results_full_recap["results_recap_version"] = "001"

    # evaluate performance
    eval_testing = model.evaluate(testing_generator)
    for metrics_name, value in zip(model.metrics_names, eval_testing):
        logg.debug(f"{metrics_name}: {value}")
        results_full_recap[metrics_name] = value

    # compute the confusion matrix
    y_pred = model.predict(testing_generator)
    y_pred_labels = testing_generator.pred2labelnames(y_pred)
    y_true = testing_generator.get_true_labels()
    # cm = pred_hot_2_cm(y_true, y_pred, words)
    cm = confusion_matrix(y_true, y_pred_labels)
    results_full_recap["cm"] = cm.tolist()

    # compute the fscore
    fscore = analyze_confusion(cm, words)
    logg.debug(f"fscore: {fscore}")
    results_full_recap["fscore"] = fscore

    # plot the cm
    fig, ax = plt.subplots(figsize=(12, 12))
    plot_confusion_matrix(cm, ax, model_name, words, fscore)
    plot_cm_path = model_info_folder / "test_confusion_matrix.png"
    fig.savefig(plot_cm_path)
    plt.close(fig)

    # save the histories
    results_full_recap["history_train"] = {
        mn: results_full.history[mn]
        for mn in model.metrics_names
    }
    if use_validation:
        results_full_recap["history_val"] = {
            f"val_{mn}": results_full.history[f"val_{mn}"]
            for mn in model.metrics_names
        }

    # save the results
    res_recap_path = model_info_folder / "results_full_recap.json"
    res_recap_path.write_text(json.dumps(results_full_recap, indent=4))

    # save the trained model
    model.save(model_path)

    # save the placeholder
    placeholder_path.write_text(f"Trained. F-score: {fscore}")
def train_model(hypa, force_retrain):
    """MAKEDOC: What is train_model doing?"""
    logg = logging.getLogger(f"c.{__name__}.train_model")
    # logg.debug("Starting train_model")

    # get the words
    words = words_types[hypa["words"]]

    # name the model
    model_name = build_cnn_name(hypa)
    logg.debug(f"model_name: {model_name}")

    # save the trained model here
    model_folder = Path("trained_models") / "cnn"
    if not model_folder.exists():
        model_folder.mkdir(parents=True, exist_ok=True)
    model_path = model_folder / f"{model_name}.h5"
    # logg.debug(f"model_path: {model_path}")

    placeholder_path = model_folder / f"{model_name}.txt"
    # check if this model has already been trained
    if placeholder_path.exists():
        if force_retrain:
            logg.warn("\nRETRAINING MODEL!!\n")
        else:
            logg.debug("Already trained")
            return

    # save info regarding the model training in this folder
    info_folder = Path("info") / "cnn" / model_name
    if not info_folder.exists():
        info_folder.mkdir(parents=True, exist_ok=True)

    # magic to fix the GPUs
    setup_gpus()

    # input data
    processed_path = Path("data_proc") / f"{hypa['dataset']}"
    data, labels = load_processed(processed_path, words)

    # from hypa extract model param
    model_param = {}
    model_param["num_labels"] = len(words)
    model_param["input_shape"] = data["training"][0].shape
    model_param["base_filters"] = hypa["base_filters"]
    model_param["base_dense_width"] = hypa["base_dense_width"]

    # translate types to actual values

    kernel_size_types = {
        "01": [(2, 2), (2, 2), (2, 2)],
        "02": [(5, 1), (3, 3), (3, 3)],
        "03": [(1, 5), (3, 3), (3, 3)],
    }
    model_param["kernel_sizes"] = kernel_size_types[hypa["kernel_size_type"]]

    pool_size_types = {
        "01": [(2, 2), (2, 2), (2, 2)],
        "02": [(2, 1), (2, 2), (2, 2)],
        "03": [(1, 2), (2, 2), (2, 2)],
    }
    model_param["pool_sizes"] = pool_size_types[hypa["pool_size_type"]]

    dropout_types = {"01": [0.03, 0.01], "02": [0.3, 0.1]}
    model_param["dropouts"] = dropout_types[hypa["dropout_type"]]

    # a dict to recreate this training
    recap = {}
    recap["words"] = words
    recap["hypa"] = hypa
    recap["model_param"] = model_param
    recap["model_name"] = model_name
    recap["version"] = "002"
    # logg.debug(f"recap: {recap}")
    recap_path = info_folder / "recap.json"
    recap_path.write_text(json.dumps(recap, indent=4))

    learning_rate_types = {
        "01": "fixed01",
        "02": "fixed02",
        "03": "fixed03",
        "e1": "exp_decay_keras_01",
        "04": "exp_decay_step_01",
        "05": "exp_decay_smooth_01",
        "06": "exp_decay_smooth_02",
    }
    learning_rate_type = hypa["learning_rate_type"]
    lr_value = learning_rate_types[learning_rate_type]

    # setup opt fixed lr values
    if lr_value.startswith("fixed"):
        if lr_value == "fixed01":
            lr = 1e-2
        elif lr_value == "fixed02":
            lr = 1e-3
        elif lr_value == "fixed03":
            lr = 1e-4
    else:
        lr = 1e-3

    if lr_value == "exp_decay_keras_01":
        lr = ExponentialDecay(0.1, decay_steps=100000, decay_rate=0.96, staircase=True)

    optimizer_types = {
        "a1": Adam(learning_rate=lr),
        "r1": RMSprop(learning_rate=lr),
    }
    opt = optimizer_types[hypa["optimizer_type"]]

    # create the model
    model = CNNmodel(**model_param)
    # model.summary()

    metrics = [
        tf.keras.metrics.CategoricalAccuracy(),
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
    ]

    model.compile(
        optimizer=opt,
        loss=tf.keras.losses.CategoricalCrossentropy(),
        metrics=metrics,
    )

    # setup callbacks
    callbacks = []

    # setup exp decay step / smooth
    if lr_value.startswith("exp_decay"):
        if lr_value == "exp_decay_step_01":
            exp_decay_part = partial(exp_decay_step, epochs_drop=5)
        elif lr_value == "exp_decay_smooth_01":
            exp_decay_part = partial(exp_decay_smooth, epochs_drop=5)
        elif lr_value == "exp_decay_smooth_02":
            exp_decay_part = partial(
                exp_decay_smooth, epochs_drop=5, initial_lrate=1e-2
            )
        lrate = LearningRateScheduler(exp_decay_part)
        callbacks.append(lrate)

    # # setup early stopping
    # early_stop = EarlyStopping(
    #     # monitor="val_categorical_accuracy",
    #     monitor="val_loss",
    #     patience=4,
    #     verbose=1,
    #     restore_best_weights=True,
    # )
    # callbacks.append(early_stop)

    # get training parameters
    BATCH_SIZE = hypa["batch_size"]
    SHUFFLE_BUFFER_SIZE = BATCH_SIZE
    EPOCH_NUM = hypa["epoch_num"]

    # load the datasets
    datasets = {}
    for which in ["training", "validation", "testing"]:
        # logg.debug(f"data[{which}].shape: {data[which].shape}")
        datasets[which] = Dataset.from_tensor_slices((data[which], labels[which]))
        # logg.debug(f"datasets[{which}]: {datasets[which]}")
        datasets[which] = datasets[which].shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE)
        # logg.debug(f"datasets[{which}]: {datasets[which]}")

    # train the model
    results = model.fit(
        data["training"],
        labels["training"],
        # validation_data=datasets["validation"],
        validation_data=(data["validation"], labels["validation"]),
        batch_size=BATCH_SIZE,
        epochs=EPOCH_NUM,
        verbose=1,
        callbacks=callbacks,
    )

    # save the trained model
    model.save(model_path)

    results_recap = {}
    results_recap["model_name"] = model_name

    # version of the results saved
    results_recap["results_recap_version"] = "002"

    # quickly evaluate the results
    # logg.debug(f"\nmodel.metrics_names: {model.metrics_names}")
    # for which in ["training", "validation", "testing"]:
    #     model_eval = model.evaluate(datasets[which])
    #     logg.debug(f"{which}: model_eval: {model_eval}")

    # save the evaluation results
    logg.debug("Evaluate on test data:")
    # eval_testing = model.evaluate(datasets["testing"])
    # results_recap[model.metrics_names[0]] = eval_testing[0]
    # results_recap[model.metrics_names[1]] = eval_testing[1]
    eval_testing = model.evaluate(data["testing"], labels["testing"])
    for metrics_name, value in zip(model.metrics_names, eval_testing):
        logg.debug(f"{metrics_name}: {value}")
        results_recap[metrics_name] = value

    # compute the confusion matrix
    # y_pred = model.predict(datasets["testing"])
    y_pred = model.predict(data["testing"])
    cm = pred_hot_2_cm(labels["testing"], y_pred, words)
    # logg.debug(f"cm: {cm}")
    results_recap["cm"] = cm.tolist()

    # compute the fscore
    fscore = analyze_confusion(cm, words)
    logg.debug(f"fscore: {fscore}")

    # plot the cm
    fig, ax = plt.subplots(figsize=(12, 12))
    plot_confusion_matrix(cm, ax, model_name, words, fscore)
    plot_cm_path = info_folder / "test_confusion_matrix.png"
    fig.savefig(plot_cm_path)
    plt.close(fig)

    # save the histories
    results_recap["history"] = {
        "loss": results.history["loss"],
        "val_loss": results.history["val_loss"],
        "categorical_accuracy": results.history["categorical_accuracy"],
        "val_categorical_accuracy": results.history["val_categorical_accuracy"],
    }

    # save the results
    res_recap_path = info_folder / "results_recap.json"
    res_recap_path.write_text(json.dumps(results_recap, indent=4))

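    # sanity check: also predict through the tf.data pipeline; note that
    # datasets["testing"] was shuffled above, so these predictions may not
    # line up with labels["testing"] and fscore_dataset can differ from fscore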
    y_pred_dataset = model.predict(datasets["testing"])
    cm_dataset = pred_hot_2_cm(labels["testing"], y_pred_dataset, words)
    fscore_dataset = analyze_confusion(cm_dataset, words)
    logg.debug(f"fscore_dataset: {fscore_dataset} fscore {fscore}")
    # for i, (ys, yd) in enumerate(zip(y_pred, y_pred_dataset)):
    #     pred_split = np.argmax(ys)
    #     pred_dataset = np.argmax(yd)
    #     logg.debug(f"i: {i} pred_split: {pred_split} pred_dataset: {pred_dataset}")

    # plt.show()

    placeholder_path.write_text(f"Trained. F-score: {fscore}")

    return "done_training"
def train_attention(hypa: ty.Dict[str, str], force_retrain: bool,
                    use_validation: bool) -> None:
    """MAKEDOC: what is train_attention doing?"""
    logg = logging.getLogger(f"c.{__name__}.train_attention")
    # logg.setLevel("INFO")
    logg.debug("Start train_attention")

    # build the model name
    model_name = build_attention_name(hypa, use_validation)
    logg.debug(f"model_name: {model_name}")

    # save the trained model here
    model_folder = Path("trained_models") / "attention"
    if not model_folder.exists():
        model_folder.mkdir(parents=True, exist_ok=True)
    model_path = model_folder / f"{model_name}.h5"
    placeholder_path = model_folder / f"{model_name}.txt"

    # check if this model has already been trained
    if placeholder_path.exists():
        if force_retrain:
            logg.warn("\nRETRAINING MODEL!!\n")
        else:
            logg.debug("Already trained")
            return

    # save info regarding the model training in this folder
    info_folder = Path("info") / "attention" / model_name
    if not info_folder.exists():
        info_folder.mkdir(parents=True, exist_ok=True)

    # get the word list
    words = words_types[hypa["words_type"]]
    num_labels = len(words)

    # load data
    processed_folder = Path("data_proc")
    processed_path = processed_folder / f"{hypa['dataset_name']}"
    data, labels = load_processed(processed_path, words)

    # concatenate train and val for final train
    val_data = None
    if use_validation:
        x = data["training"]
        y = labels["training"]
        val_data = (data["validation"], labels["validation"])
        logg.debug("Using validation data")
    else:
        x = np.concatenate((data["training"], data["validation"]))
        y = np.concatenate((labels["training"], labels["validation"]))
        logg.debug("NOT using validation data")

    # the shape of each sample
    input_shape = data["training"][0].shape

    # from hypa extract model param
    model_param = get_model_param_attention(hypa, num_labels, input_shape)

    batch_size_types = {"01": 32, "02": 16}
    batch_size = batch_size_types[hypa["batch_size_type"]]

    epoch_num_types = {"01": 15, "02": 30, "03": 2, "04": 4}
    epoch_num = epoch_num_types[hypa["epoch_num_type"]]

    # magic to fix the GPUs
    setup_gpus()

    model = AttentionModel(**model_param)
    # model.summary()

    metrics = [
        tf.keras.metrics.CategoricalAccuracy(),
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
    ]

    learning_rate_types = {
        "01": "fixed01",
        "02": "fixed02",
        "03": "exp_decay_step_01",
        "04": "exp_decay_smooth_01",
        "05": "clr_triangular2_01",
        "06": "clr_triangular2_02",
        "07": "clr_triangular2_03",
        "08": "clr_triangular2_04",
        "09": "clr_triangular2_05",
        "10": "exp_decay_smooth_02",
    }
    learning_rate_type = hypa["learning_rate_type"]
    lr_value = learning_rate_types[learning_rate_type]

    # setup opt fixed lr values
    if lr_value.startswith("fixed"):
        if lr_value == "fixed01":
            lr = 1e-3
        elif lr_value == "fixed02":
            lr = 1e-4
    else:
        lr = 1e-3

    optimizer_types = {
        "a1": Adam(learning_rate=lr),
        "r1": RMSprop(learning_rate=lr)
    }
    opt = optimizer_types[hypa["optimizer_type"]]

    model.compile(
        optimizer=opt,
        loss=tf.keras.losses.CategoricalCrossentropy(),
        metrics=metrics,
    )

    # setup callbacks
    callbacks = []

    # setup exp decay step / smooth
    if lr_value.startswith("exp_decay"):
        if lr_value == "exp_decay_step_01":
            exp_decay_part = partial(exp_decay_step, epochs_drop=5)
        elif lr_value == "exp_decay_smooth_01":
            exp_decay_part = partial(exp_decay_smooth, epochs_drop=5)
        elif lr_value == "exp_decay_smooth_02":
            exp_decay_part = partial(exp_decay_smooth,
                                     epochs_drop=5,
                                     initial_lrate=1e-2)
        lrate = LearningRateScheduler(exp_decay_part)
        callbacks.append(lrate)

    # setup cyclic learning rate
    if lr_value.startswith("clr_triangular2"):
        base_lr = 1e-5
        max_lr = 1e-3

        # training iteration per epoch = num samples // batch size
        # step size suggested = 2~8 * iterations
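        # hypothetical example: with ~30000 training samples and batch_size 32
        # there are ~937 iterations per epoch, so step_factor 8 would give a
        # step_size (half cycle) of roughly 7500 iterations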
        if lr_value == "clr_triangular2_01":
            step_factor = 8
            step_size = step_factor * x.shape[0] // batch_size

        elif lr_value == "clr_triangular2_02":
            step_factor = 2
            step_size = step_factor * x.shape[0] // batch_size

        # target_cycles = the number of cycles we want in those epochs
        # it_per_epoch = num_samples // batch_size
        # total_iterations = it_per_epoch * epoch_num
        # step_size = total_iterations // target_cycles
        elif lr_value == "clr_triangular2_03":
            # the number of cycles we want in those epochs
            target_cycles = 4
            it_per_epoch = x.shape[0] // batch_size
            total_iterations = it_per_epoch * epoch_num
            step_size = total_iterations // (target_cycles * 2)

        elif lr_value == "clr_triangular2_04":
            # the number of cycles we want in those epochs
            target_cycles = 2
            it_per_epoch = x.shape[0] // batch_size
            total_iterations = it_per_epoch * epoch_num
            step_size = total_iterations // (target_cycles * 2)

        elif lr_value == "clr_triangular2_05":
            # the number of cycles we want in those epochs
            target_cycles = 2
            it_per_epoch = x.shape[0] // batch_size
            total_iterations = it_per_epoch * epoch_num
            step_size = total_iterations // (target_cycles * 2)
            # set bigger starting value
            max_lr = 1e-2

        logg.debug(f"x.shape[0]: {x.shape[0]}")
        logg.debug(f"CLR is using step_size: {step_size}")

        mode = "triangular2"
        cyclic_lr = CyclicLR(base_lr, max_lr, step_size, mode)
        callbacks.append(cyclic_lr)

    # setup early stopping
    if learning_rate_type in ["01", "02", "03", "04"]:
        metric_to_monitor = "val_loss" if use_validation else "loss"
        early_stop = EarlyStopping(
            monitor=metric_to_monitor,
            patience=4,
            restore_best_weights=True,
            verbose=1,
        )
        callbacks.append(early_stop)

    # model_checkpoint = ModelCheckpoint(
    #     model_name,
    #     monitor="val_loss",
    #     save_best_only=True,
    # )

    # a dict to recreate this training
    # FIXME this should be right before fit and have epoch_num/batch_size/lr info
    recap: ty.Dict[str, ty.Any] = {}
    recap["words"] = words
    recap["hypa"] = hypa
    recap["model_param"] = model_param
    recap["use_validation"] = use_validation
    recap["model_name"] = model_name
    recap["version"] = "001"
    # logg.debug(f"recap: {recap}")
    recap_path = info_folder / "recap.json"
    recap_path.write_text(json.dumps(recap, indent=4))

    results = model.fit(
        x,
        y,
        validation_data=val_data,
        epochs=epoch_num,
        batch_size=batch_size,
        callbacks=callbacks,
    )

    results_recap: ty.Dict[str, ty.Any] = {}
    results_recap["model_name"] = model_name
    results_recap["results_recap_version"] = "002"

    # eval performance on the various metrics
    eval_testing = model.evaluate(data["testing"], labels["testing"])
    for metrics_name, value in zip(model.metrics_names, eval_testing):
        logg.debug(f"{metrics_name}: {value}")
        results_recap[metrics_name] = value

    # compute the confusion matrix
    y_pred = model.predict(data["testing"])
    cm = pred_hot_2_cm(labels["testing"], y_pred, words)
    # logg.debug(f"cm: {cm}")
    results_recap["cm"] = cm.tolist()

    # compute the fscore
    fscore = analyze_confusion(cm, words)
    logg.debug(f"fscore: {fscore}")
    results_recap["fscore"] = fscore

    # save the histories
    results_recap["history_train"] = {
        mn: results.history[mn]
        for mn in model.metrics_names
    }
    if use_validation:
        results_recap["history_val"] = {
            f"val_{mn}": results.history[f"val_{mn}"]
            for mn in model.metrics_names
        }

    # plot the cm
    fig, ax = plt.subplots(figsize=(12, 12))
    plot_confusion_matrix(cm, ax, model_name, words, fscore)
    plot_cm_path = info_folder / "test_confusion_matrix.png"
    fig.savefig(plot_cm_path)
    plt.close(fig)

    # save the results
    res_recap_path = info_folder / "results_recap.json"
    res_recap_path.write_text(json.dumps(results_recap, indent=4))

    # if cyclic_lr was used save the history
    if lr_value.startswith("clr_triangular2"):
        logg.debug(f"cyclic_lr.history.keys(): {cyclic_lr.history.keys()}")
        clr_recap = {}
        for metric_name, values in cyclic_lr.history.items():
            clr_recap[metric_name] = list(float(v) for v in values)
        clr_recap_path = info_folder / "clr_recap.json"
        clr_recap_path.write_text(json.dumps(clr_recap, indent=4))

    # save the trained model
    model.save(model_path)

    placeholder_path.write_text(f"Trained. F-score: {fscore}")
def recompute_fscore_cnn() -> None:
    """MAKEDOC: what is recompute_fscore_cnn doing?"""
    logg = logging.getLogger(f"c.{__name__}.recompute_fscore_cnn")
    # logg.setLevel("INFO")
    logg.debug("Start recompute_fscore_cnn")

    info_folder = Path("info")
    trained_folder = Path("trained_models")

    for model_folder in info_folder.iterdir():
        # logg.debug(f"model_folder: {model_folder}")

        # check that it is a CNN
        model_name = model_folder.name
        if not model_name.startswith("CNN"):
            continue

        # check that the model is trained and not a placeholder
        model_path = trained_folder / f"{model_name}.h5"
        found_model = False
        if model_path.exists():
            if model_path.stat().st_size > 100:
                found_model = True
        if not found_model:
            continue

        # load it
        model = models.load_model(model_path)

        res_recap_path = model_folder / "results_recap.json"
        if not res_recap_path.exists():
            continue
        results_recap = json.loads(res_recap_path.read_text())
        # logg.debug(f"results_recap['cm']: {results_recap['cm']}")

        recap_path = model_folder / "recap.json"
        recap = json.loads(recap_path.read_text())
        # logg.debug(f"recap['words']: {recap['words']}")

        words = recap["words"]
        hypa = recap["hypa"]

        # check that the data is available
        dn = hypa["dataset"]
        wt = hypa["words"]
        if dn.startswith("mel") or dn.startswith("mfcc"):
            preprocess_spec(dn, wt)
        elif dn.startswith("aug"):
            do_augmentation(dn, wt)

        processed_path = Path("data_proc") / f"{hypa['dataset']}"
        data, labels = load_processed(processed_path, words)

        y_pred = model.predict(data["testing"])
        cm = pred_hot_2_cm(labels["testing"], y_pred, words)
        fscore = analyze_confusion(cm, words)
        # logg.debug(f"fscore: {fscore}")

        # overwrite the cm
        results_recap["cm"] = cm.tolist()
        # add the fscore
        results_recap["fscore"] = fscore
        # increase the version
        results_recap["results_recap_version"] = "002"
        # write the new results
        res_recap_path.write_text(json.dumps(results_recap, indent=4))

        # increase the recap version (shows that it is after this debacle)
        recap["version"] = "002"
        recap_path.write_text(json.dumps(recap, indent=4))

        # save the new plots
        fig, ax = plt.subplots(figsize=(12, 12))
        plot_confusion_matrix(cm, ax, model_name, words, fscore)
        # save the plot inside this model's info folder
        plot_cm_path = model_folder / "test_confusion_matrix.png"
        fig.savefig(plot_cm_path)
        plt.close(fig)
def delete_bad_models_cnn(args) -> None:
    """MAKEDOC: what is delete_bad_models_cnn doing?"""
    logg = logging.getLogger(f"c.{__name__}.delete_bad_models_cnn")
    # logg.setLevel("INFO")
    logg.debug("Start delete_bad_models_cnn")

    info_folder = Path("info") / "cnn"
    trained_folder = Path("trained_models") / "cnn"
    deleted = 0
    recreated = 0
    bad_models = 0
    good_models = 0

    for model_folder in info_folder.iterdir():
        # logg.debug(f"model_folder: {model_folder}")

        model_name = model_folder.name
        if not model_name.startswith("CNN"):
            continue

        model_name = model_folder.name
        model_path = trained_folder / f"{model_name}.h5"
        placeholder_path = trained_folder / f"{model_name}.txt"

        res_recap_path = model_folder / "results_recap.json"
        if not res_recap_path.exists():
            logg.warn(f"Skipping {res_recap_path}, not found")
            continue
        results_recap = json.loads(res_recap_path.read_text())

        recap_path = model_folder / "recap.json"
        recap = json.loads(recap_path.read_text())

        # load info
        words_type = recap["hypa"]["words"]
        words = recap["words"]

        cm = np.array(results_recap["cm"])
        fscore = analyze_confusion(cm, words)
        # logg.debug(f"fscore: {fscore}")

        categorical_accuracy = results_recap["categorical_accuracy"]
        # logg.debug(f"categorical_accuracy: {categorical_accuracy}")

        if "all" in words_type:
            f_tresh = 0.9
            ca_tresh = 0.9
        elif "f1" in words_type:
            f_tresh = 0.975
            ca_tresh = 0.975
        elif "f2" in words_type:
            f_tresh = 0.97
            ca_tresh = 0.97
        elif "dir" in words_type:
            f_tresh = 0.97
            ca_tresh = 0.97
        elif "num" in words_type:
            f_tresh = 0.965
            ca_tresh = 0.965
        elif "k1" in words_type:
            f_tresh = 0.9
            ca_tresh = 0.9
        elif "w2" in words_type:
            f_tresh = 0.85
            ca_tresh = 0.85
        else:
            logg.warn(f"Not specified f_tresh for words_type: {words_type}")
            f_tresh = 0.8
            ca_tresh = 0.8

        if fscore < f_tresh and categorical_accuracy < ca_tresh:
            bad_models += 1

            if model_path.exists():
                # manually uncomment this when ready
                # model_path.unlink()
                deleted += 1
                logg.debug(f"Deleting model_path: {model_path}")
                logg.debug(f"\tfscore: {fscore}")
                logg.debug(f"\tcategorical_accuracy: {categorical_accuracy}")

            # check that a placeholder is there, you have info for this model
            else:
                if not placeholder_path.exists():
                    placeholder_path.write_text("Deleted")
                    logg.debug(
                        f"Recreating placeholder_path: {placeholder_path}")
                    recreated += 1

        else:
            # logg.debug(f"Good model_path {model_path} {words_type}")
            # logg.debug(f"\tfscore: {fscore}")
            # logg.debug(f"\tcategorical_accuracy: {categorical_accuracy}")
            good_models += 1

    logg.debug(f"bad_models: {bad_models}")
    logg.debug(f"good_models: {good_models}")
    logg.debug(f"deleted: {deleted}")
    logg.debug(f"recreated: {recreated}")
def test_audio_generator(words_type: str) -> None:
    """MAKEDOC: what is test_audio_generator doing?"""
    logg = logging.getLogger(f"c.{__name__}.test_audio_generator")
    # logg.setLevel("INFO")
    logg.debug("Start test_audio_generator")

    partition, ids2labels = prepare_partitions(words_type)

    for fold in partition:
        logg.debug(f"partition[{fold}][:4]: {partition[fold][:4]}")

    logg.debug(f"\nlen(ids2labels): {len(ids2labels)}")
    for ID in ids2labels:
        logg.debug(f"ids2labels[{ID}]: {ids2labels[ID]}")
        break

    words = words_types[words_type]
    processed_folder = Path("data_split") / "mel04"
    data_split_paths = [processed_folder]

    params = {
        "dim": (64, 64),
        "batch_size": 32,
        "shuffle": True,
        "label_names": words,
        "data_split_paths": data_split_paths,
    }

    training_generator = AudioGenerator(partition["training"], ids2labels, **params)
    logg.debug(f"len(training_generator): {len(training_generator)}")

    val_generator = AudioGenerator(partition["validation"], ids2labels, **params)
    logg.debug(f"len(val_generator): {len(val_generator)}")

    # do not shuffle the test data
    params["shuffle"] = False
    # do not batch it, no loss of stray data at the end
    params["batch_size"] = 1
    testing_generator = AudioGenerator(partition["testing"], ids2labels, **params)
    logg.debug(f"len(testing_generator): {len(testing_generator)}")

    X, y = training_generator[0]
    logg.debug(f"X.shape: {X.shape} y.shape: {y.shape}")

    model_param: ty.Dict[str, ty.Any] = {}
    model_param["num_labels"] = len(words)
    model_param["input_shape"] = (64, 64, 1)
    model_param["base_dense_width"] = 32
    model_param["base_filters"] = 20
    model_param["dropouts"] = [0.03, 0.01]
    model_param["kernel_sizes"] = [(5, 1), (3, 3), (3, 3)]
    model_param["pool_sizes"] = [(2, 1), (2, 2), (2, 2)]
    model = CNNmodel(**model_param)
    # model.summary()

    metrics = [
        tf.keras.metrics.CategoricalAccuracy(),
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
    ]
    opt = tf.optimizers.Adam()
    loss = tf.keras.losses.CategoricalCrossentropy()
    model.compile(optimizer=opt, loss=loss, metrics=metrics)

    EPOCH_NUM = 5
    model.fit(training_generator, validation_data=val_generator, epochs=EPOCH_NUM)

    eval_testing = model.evaluate(testing_generator)
    for metrics_name, value in zip(model.metrics_names, eval_testing):
        logg.debug(f"{metrics_name}: {value}")

    y_pred = model.predict(testing_generator)
    y_pred_labels = testing_generator.pred2labelnames(y_pred)
    y_true = testing_generator.get_true_labels()

    cm = confusion_matrix(y_true, y_pred_labels)
    fscore = analyze_confusion(cm, words)
    logg.debug(f"fscore: {fscore}")

    fig, ax = plt.subplots(figsize=(12, 12))
    plot_confusion_matrix(cm, ax, "Test generator", words, fscore)
    fig.tight_layout()
    plt.show()
def evaluate_model_cnn(which_dataset: str, train_words_type: str,
                       test_words_type: str) -> None:
    """MAKEDOC: what is evaluate_model_cnn doing?"""
    logg = logging.getLogger(f"c.{__name__}.evaluate_model_cnn")
    # logg.setLevel("INFO")
    logg.debug("Start evaluate_model_cnn")

    # magic to fix the GPUs
    setup_gpus()

    # setup the parameters
    # hypa: ty.Dict[str, ty.Union[str, int]] = {}
    # hypa["base_dense_width"] = 32
    # hypa["base_filters"] = 20
    # hypa["batch_size"] = 32
    # hypa["dropout_type"] = "01"
    # # hypa["epoch_num"] = 16
    # hypa["epoch_num"] = 15
    # hypa["kernel_size_type"] = "02"
    # # hypa["pool_size_type"] = "02"
    # hypa["pool_size_type"] = "01"
    # # hypa["learning_rate_type"] = "02"
    # hypa["learning_rate_type"] = "04"
    # hypa["optimizer_type"] = "a1"
    # hypa["dataset"] = which_dataset
    # hypa["words"] = train_words_type

    # hypa: ty.Dict[str, ty.Union[str, int]] = {}
    # hypa["base_dense_width"] = 32
    # hypa["base_filters"] = 32
    # hypa["batch_size"] = 32
    # hypa["dropout_type"] = "02"
    # hypa["epoch_num"] = 15
    # hypa["kernel_size_type"] = "02"
    # hypa["pool_size_type"] = "01"
    # hypa["learning_rate_type"] = "04"
    # hypa["optimizer_type"] = "a1"
    # hypa["dataset"] = which_dataset
    # hypa["words"] = train_words_type

    hypa: ty.Dict[str, ty.Union[str, int]] = {
        "base_dense_width": 32,
        "base_filters": 32,
        "batch_size": 32,
        # "dataset": "aug07",
        "dropout_type": "01",
        "epoch_num": 15,
        "kernel_size_type": "02",
        "learning_rate_type": "04",
        "optimizer_type": "a1",
        "pool_size_type": "01",
        # "words": "all",
    }

    hypa["dataset"] = which_dataset
    hypa["words"] = train_words_type

    # get the words
    # train_words = words_types[train_words_type]
    test_words = words_types[test_words_type]

    model_name = build_cnn_name(hypa)
    logg.debug(f"model_name: {model_name}")

    model_folder = Path("trained_models") / "cnn"
    model_path = model_folder / f"{model_name}.h5"
    if not model_path.exists():
        logg.error(f"Model not found at: {model_path}")
        raise FileNotFoundError

    model = tf.keras.models.load_model(model_path)
    model.summary()

    # input data
    processed_path = Path("data_proc") / f"{which_dataset}"
    data, labels = load_processed(processed_path, test_words)
    logg.debug(f"data['testing'].shape: {data['testing'].shape}")

    # evaluate on the test split of the chosen test words
    logg.debug("Evaluate on test data:")
    model.evaluate(data["testing"], labels["testing"])
    # model.evaluate(data["validation"], labels["validation"])

    # predict labels/cm/fscore
    y_pred = model.predict(data["testing"])
    cm = pred_hot_2_cm(labels["testing"], y_pred, test_words)
    # y_pred = model.predict(data["validation"])
    # cm = pred_hot_2_cm(labels["validation"], y_pred, test_words)
    fscore = analyze_confusion(cm, test_words)
    logg.debug(f"fscore: {fscore}")

    fig, ax = plt.subplots(figsize=(12, 12))
    plot_confusion_matrix(cm, ax, model_name, test_words, fscore)

    plt.show()
def build_cnn_results_df() -> pd.DataFrame:
    """MAKEDOC: what is build_cnn_results_df doing?"""
    logg = logging.getLogger(f"c.{__name__}.build_cnn_results_df")
    logg.setLevel("INFO")
    logg.debug("Start build_cnn_results_df")

    info_folder = Path("info") / "cnn"

    pandito: ty.Dict[str, ty.List[str]] = {
        "dense_width": [],
        "filters": [],
        "batch_size": [],
        "dataset": [],
        "dropout": [],
        "epoch_num": [],
        "kernel_size": [],
        "pool_size": [],
        "lr": [],
        "opt": [],
        "words": [],
        "fscore": [],
        "loss": [],
        "cat_acc": [],
        "model_name": [],
    }

    for model_folder in info_folder.iterdir():
        logg.debug(f"model_folder: {model_folder}")

        model_name = model_folder.name
        if not model_name.startswith("CNN"):
            continue

        res_recap_path = model_folder / "results_recap.json"
        if not res_recap_path.exists():
            continue

        results_recap = json.loads(res_recap_path.read_text())
        logg.debug(f"results_recap['cm']: {results_recap['cm']}")

        recap_path = model_folder / "recap.json"
        recap = json.loads(recap_path.read_text())
        logg.debug(f"recap['words']: {recap['words']}")

        cm = np.array(results_recap["cm"])
        fscore = analyze_confusion(cm, recap["words"])
        logg.debug(f"fscore: {fscore}")

        categorical_accuracy = results_recap["categorical_accuracy"]
        logg.debug(f"categorical_accuracy: {categorical_accuracy}")

        pandito["dense_width"].append(recap["hypa"]["base_dense_width"])
        pandito["filters"].append(recap["hypa"]["base_filters"])
        pandito["batch_size"].append(recap["hypa"]["batch_size"])
        pandito["dataset"].append(recap["hypa"]["dataset"])
        pandito["dropout"].append(recap["hypa"]["dropout_type"])
        pandito["epoch_num"].append(recap["hypa"]["epoch_num"])
        pandito["kernel_size"].append(recap["hypa"]["kernel_size_type"])
        pandito["pool_size"].append(recap["hypa"]["pool_size_type"])
        pandito["words"].append(recap["hypa"]["words"])
        pandito["cat_acc"].append(results_recap["categorical_accuracy"])
        pandito["loss"].append(results_recap["loss"])
        pandito["model_name"].append(results_recap["model_name"])
        # pandito["fscore"].append(fscore)
        pandito["fscore"].append(categorical_accuracy)

        if "version" in recap:
            if recap["version"] == "001":
                pandito["lr"].append(recap["hypa"]["learning_rate_type"])
                pandito["opt"].append(recap["hypa"]["optimizer_type"])
            elif recap["version"] == "002":
                if "learning_rate_type" in recap["hypa"]:
                    pandito["lr"].append(recap["hypa"]["learning_rate_type"])
                    pandito["opt"].append(recap["hypa"]["optimizer_type"])
                else:
                    # pandito["lr"].append("default")
                    # pandito["opt"].append("adam")
                    pandito["lr"].append("01")
                    pandito["opt"].append("a1")
        else:
            # pandito["lr"].append("default")
            # pandito["opt"].append("adam")
            pandito["lr"].append("01")
            pandito["opt"].append("a1")

    df = pd.DataFrame(pandito)
    return df
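
# usage sketch (assumes the info/cnn folder is already populated with
# recap.json / results_recap.json files):
# df = build_cnn_results_df()
# print(df.sort_values("fscore", ascending=False).head(10))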
def train_img(
    hypa: ty.Dict[str, str],
    force_retrain: bool,
    use_validation: bool,
    trained_folder: Path,
    root_info_folder: Path,
) -> None:
    """MAKEDOC: what is train_img doing?"""
    logg = logging.getLogger(f"c.{__name__}.train_img")
    # logg.setLevel("INFO")
    logg.debug("Start train_img")

    ##########################################################
    #   Setup folders
    ##########################################################

    # name the model
    model_name = build_img_name(hypa, use_validation)
    logg.debug(f"model_name: {model_name}")

    # save the trained model here
    model_path = trained_folder / f"{model_name}.h5"
    placeholder_path = trained_folder / f"{model_name}.txt"

    # check if this model has already been trained
    if placeholder_path.exists():
        if force_retrain:
            logg.warn("\nRETRAINING MODEL!!\n")
        else:
            logg.debug("Already trained")
            return

    # save info regarding the model training in this folder
    model_info_folder = root_info_folder / model_name
    if not model_info_folder.exists():
        model_info_folder.mkdir(parents=True, exist_ok=True)

    # magic to fix the GPUs
    setup_gpus()

    ##########################################################
    #   Load data
    ##########################################################

    label_type = hypa["words_type"]
    label_list = get_label_list(label_type)
    num_labels = len(label_list)

    dataset_raw_folder = (Path.home() / "datasets" / "imagenet" /
                          "imagenet_images")
    dataset_proc_base_folder = Path.home() / "datasets" / "imagenet"

    # get the partition of the data
    partition, ids2labels = prepare_partitions(label_list, dataset_raw_folder)

    num_samples = len(partition["training"])

    # from hypa extract training param (epochs, batch, opt, ...)
    training_param = get_training_param_img(hypa, use_validation, model_path,
                                            num_samples)

    preprocess_type = hypa["dataset_name"]
    dataset_proc_folder = dataset_proc_base_folder / preprocess_type

    val_generator: ty.Optional[ImageNetGenerator] = None
    if use_validation:
        val_generator = ImageNetGenerator(
            partition["validation"],
            ids2labels,
            label_list,
            dataset_proc_folder=dataset_proc_folder,
            dataset_raw_folder=dataset_raw_folder,
            preprocess_type=preprocess_type,
            save_processed=True,
            batch_size=training_param["batch_size"],
            shuffle=True,
        )
        logg.debug("Using validation data")
    else:
        partition["training"].extend(partition["validation"])
        logg.debug("NOT using validation data")

    training_generator = ImageNetGenerator(
        partition["training"],
        ids2labels,
        label_list,
        dataset_proc_folder=dataset_proc_folder,
        dataset_raw_folder=dataset_raw_folder,
        preprocess_type=preprocess_type,
        save_processed=True,
        batch_size=training_param["batch_size"],
        shuffle=True,
    )

    testing_generator = ImageNetGenerator(
        partition["testing"],
        ids2labels,
        label_list,
        dataset_proc_folder=dataset_proc_folder,
        dataset_raw_folder=dataset_raw_folder,
        preprocess_type=preprocess_type,
        save_processed=True,
        batch_size=1,
        shuffle=False,
    )

    ##########################################################
    #   Setup model
    ##########################################################

    input_shape = training_generator.get_img_shape()

    # from hypa extract model param
    model_param = get_model_param_img(hypa, num_labels, input_shape)

    # get the model with the chosen params
    net_type = hypa["net_type"]
    if net_type == "ARN":
        model = AreaNet.build(**model_param)
    elif net_type == "AAN":
        model = ActualAreaNet.build(**model_param)
    elif net_type == "VAN":
        model = VerticalAreaNet.build(**model_param)
    elif net_type.startswith("SI"):
        if net_type == "SIM":
            sim_type = "1"
        elif net_type == "SI2":
            sim_type = "2"
        model = SimpleNet.build(sim_type=sim_type, **model_param)

    # a few metrics to track
    metrics = [
        tf.keras.metrics.CategoricalAccuracy(),
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
    ]

    # compile the model
    model.compile(
        optimizer=training_param["opt"],
        loss=tf.keras.losses.CategoricalCrossentropy(),
        metrics=metrics,
    )

    # recap
    recap: ty.Dict[str, ty.Any] = {}
    recap["model_name"] = model_name
    recap["words"] = label_list
    recap["hypa"] = hypa
    recap["model_param"] = model_param
    recap["use_validation"] = use_validation
    recap["batch_size"] = training_param["batch_size"]
    recap["epochs"] = training_param["epochs"]
    recap["lr_name"] = training_param["lr_name"]
    recap["version"] = "002"

    # logg.debug(f"recap: {recap}")
    recap_path = model_info_folder / "recap.json"
    recap_path.write_text(json.dumps(recap, indent=4))

    # https://stackoverflow.com/a/45546663/2237151
    model_summary_path = model_info_folder / "model_summary.txt"
    with model_summary_path.open("w") as msf:
        model.summary(line_length=150, print_fn=lambda x: msf.write(x + "\n"))

    ##########################################################
    #   Fit model
    ##########################################################

    results = model.fit(
        training_generator,
        validation_data=val_generator,
        epochs=training_param["epochs"],
        batch_size=training_param["batch_size"],
        callbacks=training_param["callbacks"],
    )

    ##########################################################
    #   Save results, history, performance
    ##########################################################

    # results_recap
    results_recap: ty.Dict[str, ty.Any] = {}
    results_recap["model_name"] = model_name
    results_recap["results_recap_version"] = "001"

    # evaluate performance
    eval_testing = model.evaluate(testing_generator)
    for metrics_name, value in zip(model.metrics_names, eval_testing):
        logg.debug(f"{metrics_name}: {value}")
        results_recap[metrics_name] = value

    # confusion matrix
    y_pred = model.predict(testing_generator)
    y_pred_labels = testing_generator.pred2labelnames(y_pred)
    y_true = testing_generator.get_true_labels()
    cm = confusion_matrix(y_true, y_pred_labels)
    results_recap["cm"] = cm.tolist()

    # fscore
    fscore = analyze_confusion(cm, label_list)
    logg.debug(f"fscore: {fscore}")
    results_recap["fscore"] = fscore

    # save the histories
    results_recap["history_train"] = {
        mn: results.history[mn]
        for mn in model.metrics_names
    }
    if use_validation:
        results_recap["history_val"] = {
            f"val_{mn}": results.history[f"val_{mn}"]
            for mn in model.metrics_names
        }

    # save the results
    res_recap_path = model_info_folder / "results_recap.json"
    res_recap_path.write_text(json.dumps(results_recap, indent=4))

    # plot the cm
    fig, ax = plt.subplots(figsize=(12, 12))
    plot_confusion_matrix(cm, ax, model_name, label_list, fscore)
    plot_cm_path = model_info_folder / "test_confusion_matrix.png"
    fig.savefig(plot_cm_path)
    plt.close(fig)

    # save the trained model
    model.save(model_path)

    # save the placeholder
    placeholder_path.write_text(f"Trained. F-score: {fscore}")
def evaluate_model_cm(model_name: str, test_words_type: str) -> float:
    r"""MAKEDOC: what is evaluate_model_cm doing?"""
    logg = logging.getLogger(f"c.{__name__}.evaluate_model_cm")
    # logg.setLevel("INFO")
    # logg.debug("\nStart evaluate_model_cm")

    # magic to fix the GPUs
    setup_gpus()

    logg.debug(f"\nmodel_name: {model_name}")

    dataset_re = re.compile("_ds(.*?)_")
    match = dataset_re.search(model_name)
    if match is not None:
        logg.debug(f"match[1]: {match[1]}")
        dataset_name = match[1]

    train_words_type_re = re.compile("_w(.*?)[_.]")
    match = train_words_type_re.search(model_name)
    if match is not None:
        logg.debug(f"match[1]: {match[1]}")
        train_words_type = match[1]

    arch_type = model_name[:3]

    if arch_type == "ATT":
        train_type_tag = "attention"
    else:
        train_type_tag = "area"

    # load the model
    model_folder = Path("trained_models") / train_type_tag
    model_path = model_folder / f"{model_name}.h5"
    model = tf_models.load_model(model_path)
    # model.summary()

    train_words = words_types[train_words_type]
    logg.debug(f"train_words: {train_words}")
    test_words = words_types[test_words_type]
    logg.debug(f"test_words: {test_words}")

    # input data must exist
    if dataset_name.startswith("mel"):
        preprocess_spec(dataset_name, test_words_type)
    elif dataset_name.startswith("aug"):
        do_augmentation(dataset_name, test_words_type)

    # input data
    processed_path = Path("data_proc") / f"{dataset_name}"
    data, labels = load_processed(processed_path, test_words)
    logg.debug(f"list(data.keys()): {list(data.keys())}")
    logg.debug(f"data['testing'].shape: {data['testing'].shape}")

    # evaluate on the test split of the chosen test words
    logg.debug("Evaluate on test data:")
    model.evaluate(data["testing"], labels["testing"])
    # model.evaluate(data["validation"], labels["validation"])

    # predict labels/cm/fscore
    y_pred = model.predict(data["testing"])
    cm = pred_hot_2_cm(labels["testing"], y_pred, test_words)
    # y_pred = model.predict(data["validation"])
    # cm = pred_hot_2_cm(labels["validation"], y_pred, test_words)
    fscore = analyze_confusion(cm, test_words)
    logg.debug(f"fscore: {fscore}")

    fig, ax = plt.subplots(figsize=(12, 12))
    plot_confusion_matrix(cm, ax, model_name, test_words, fscore, train_words)

    fig_name = f"{model_name}_test{test_words_type}_cm.{{}}"
    cm_folder = Path("plot_results") / "cm_all01"
    if not cm_folder.exists():
        cm_folder.mkdir(parents=True, exist_ok=True)

    plot_cm_path = cm_folder / fig_name.format("png")
    fig.savefig(plot_cm_path)
    plot_cm_path = cm_folder / fig_name.format("pdf")
    fig.savefig(plot_cm_path)

    # plt.show()
    return fscore
def train_area(
    hypa: ty.Dict[str, str],
    force_retrain: bool,
    use_validation: bool,
    trained_folder: Path,
    root_info_folder: Path,
) -> None:
    """MAKEDOC: what is train_area doing?"""
    logg = logging.getLogger(f"c.{__name__}.train_area")
    # logg.setLevel("INFO")
    logg.debug("Start train_area")

    ##########################################################
    #   Setup folders
    ##########################################################

    # name the model
    model_name = build_area_name(hypa, use_validation)
    logg.debug(f"model_name: {model_name}")

    # save the trained model here
    model_path = trained_folder / f"{model_name}.h5"
    placeholder_path = trained_folder / f"{model_name}.txt"

    # check if this model has already been trained
    if placeholder_path.exists():
        if force_retrain:
            logg.warn("\nRETRAINING MODEL!!\n")
        else:
            logg.debug("Already trained")
            return

    # save info regarding the model training in this folder
    model_info_folder = root_info_folder / model_name
    if not model_info_folder.exists():
        model_info_folder.mkdir(parents=True, exist_ok=True)

    # magic to fix the GPUs
    setup_gpus()

    ##########################################################
    #   Load data
    ##########################################################

    # get the words
    words = words_types[hypa["words_type"]]
    num_labels = len(words)

    # load data
    processed_folder = Path("data_proc")
    processed_path = processed_folder / f"{hypa['dataset_name']}"
    data, labels = load_processed(processed_path, words)

    # concatenate train and val for final train
    val_data = None
    if use_validation:
        x = data["training"]
        y = labels["training"]
        val_data = (data["validation"], labels["validation"])
        logg.debug("Using validation data")
    else:
        x = np.concatenate((data["training"], data["validation"]))
        y = np.concatenate((labels["training"], labels["validation"]))
        logg.debug("NOT using validation data")

    ##########################################################
    #   Setup model
    ##########################################################

    # the shape of each sample
    input_shape = data["training"][0].shape

    # from hypa extract model param
    model_param = get_model_param_area(hypa, num_labels, input_shape)

    # get the model with the chosen params
    net_type = hypa["net_type"]
    if net_type == "ARN":
        model = AreaNet.build(**model_param)
    elif net_type == "AAN":
        model = ActualAreaNet.build(**model_param)
    elif net_type == "VAN":
        model = VerticalAreaNet.build(**model_param)
    elif net_type.startswith("SI"):
        if net_type == "SIM":
            sim_type = "1"
        elif net_type == "SI2":
            sim_type = "2"
        model = SimpleNet.build(sim_type=sim_type, **model_param)

    num_samples = x.shape[0]
    logg.debug(f"num_samples: {num_samples}")

    # from hypa extract training param (epochs, batch, opt, ...)
    training_param = get_training_param_area(hypa, use_validation, model_path,
                                             num_samples)

    # a few metrics to track
    metrics = [
        tf.keras.metrics.CategoricalAccuracy(),
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
    ]

    # compile the model
    model.compile(
        optimizer=training_param["opt"],
        loss=tf.keras.losses.CategoricalCrossentropy(),
        metrics=metrics,
    )

    # recap
    recap: ty.Dict[str, ty.Any] = {}
    recap["model_name"] = model_name
    recap["words"] = words
    recap["hypa"] = hypa
    recap["model_param"] = model_param
    recap["use_validation"] = use_validation
    recap["batch_size"] = training_param["batch_size"]
    recap["epochs"] = training_param["epochs"]
    recap["lr_name"] = training_param["lr_name"]
    recap["version"] = "002"

    # logg.debug(f"recap: {recap}")
    recap_path = model_info_folder / "recap.json"
    recap_path.write_text(json.dumps(recap, indent=4))

    # https://stackoverflow.com/a/45546663/2237151
    model_summary_path = model_info_folder / "model_summary.txt"
    with model_summary_path.open("w") as msf:
        model.summary(line_length=150, print_fn=lambda x: msf.write(x + "\n"))

    ##########################################################
    #   Fit model
    ##########################################################

    results = model.fit(
        x,
        y,
        validation_data=val_data,
        epochs=training_param["epochs"],
        batch_size=training_param["batch_size"],
        callbacks=training_param["callbacks"],
    )

    ##########################################################
    #   Save results, history, performance
    ##########################################################

    # results_recap
    results_recap: ty.Dict[str, ty.Any] = {}
    results_recap["model_name"] = model_name
    results_recap["results_recap_version"] = "001"

    # evaluate performance
    eval_testing = model.evaluate(data["testing"], labels["testing"])
    for metrics_name, value in zip(model.metrics_names, eval_testing):
        logg.debug(f"{metrics_name}: {value}")
        results_recap[metrics_name] = value

    # confusion matrix
    y_pred = model.predict(data["testing"])
    cm = pred_hot_2_cm(labels["testing"], y_pred, words)
    results_recap["cm"] = cm.tolist()

    # fscore
    fscore = analyze_confusion(cm, words)
    logg.debug(f"fscore: {fscore}")
    results_recap["fscore"] = fscore

    # save the histories
    results_recap["history_train"] = {
        mn: results.history[mn]
        for mn in model.metrics_names
    }
    if use_validation:
        results_recap["history_val"] = {
            f"val_{mn}": results.history[f"val_{mn}"]
            for mn in model.metrics_names
        }

    # save the results
    res_recap_path = model_info_folder / "results_recap.json"
    res_recap_path.write_text(json.dumps(results_recap, indent=4))

    # plot the cm
    fig, ax = plt.subplots(figsize=(12, 12))
    plot_confusion_matrix(cm, ax, model_name, words, fscore)
    plot_cm_path = model_info_folder / "test_confusion_matrix.png"
    fig.savefig(plot_cm_path)
    plt.close(fig)

    # save the trained model
    model.save(model_path)

    # save the placeholder
    placeholder_path.write_text(f"Trained. F-score: {fscore}")
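
# usage sketch for train_area, reusing the hyperparameters of the VAN example
# commented out in evaluate_model_area above (the folder paths are assumptions):
# hypa = {
#     "net_type": "VAN", "optimizer_type": "a1", "learning_rate_type": "03",
#     "batch_size_type": "32", "epoch_num_type": "15",
#     "dataset_name": "aug07", "words_type": "LTall",
# }
# train_area(hypa, force_retrain=False, use_validation=False,
#            trained_folder=Path("trained_models") / "area",
#            root_info_folder=Path("info") / "area")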