Code Example #1
File: feedforward_nn.py  Project: tesseract-42/libra
# Assumed imports for this snippet; libra's own helpers (DataReader, logger,
# clearLog, get_file, initial_preprocessor, get_keras_model_class,
# generate_classification_plots, generate_id, save, and the module-level
# indentation counter `counter`) come from elsewhere in the package.
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import EarlyStopping


def classification_ann(instruction,
                       callback=False,
                       dataset=None,
                       text=[],
                       ca_threshold=None,
                       preprocess=True,
                       callback_mode='min',
                       drop=None,
                       random_state=49,
                       test_size=0.2,
                       epochs=50,
                       generate_plots=True,
                       maximizer="val_accuracy",
                       save_model=False,
                       save_path=os.getcwd(),
                       add_layer={}):
    '''
    Body of the classification function that is called by the neural network
    query when the target data is categorical.
    :param instruction: the query used to identify the target column; the
        remaining keyword arguments control preprocessing, tuning, plot
        generation, and the parameterization of the trained network.
    :return: dictionary that holds all the information for the finished model.
    '''

    if dataset is None:
        dataReader = DataReader(get_file())
    else:
        dataReader = DataReader(dataset)
    logger("Reading in dataset")
    data = dataReader.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    data, y, remove, full_pipeline = initial_preprocessor(
        data,
        instruction,
        preprocess,
        ca_threshold,
        text,
        test_size=test_size,
        random_state=random_state)
    logger("->", "Target column found: {}".format(remove))

    # Needed to make a custom label encoder due to train test split changes
    # Can still be inverse transformed, just a bit of extra work
    y = pd.concat([y['train'], y['test']], axis=0)

    num_classes = len(np.unique(y))

    if num_classes < 2:
        raise Exception("Number of classes must be greater than or equal to 2")

    X_train = data['train']
    X_test = data['test']

    # num_classes >= 2 is guaranteed by the check above, so the target can
    # always be one-hot encoded for the ANN classifier
    # (get_feature_names was renamed get_feature_names_out in newer
    # scikit-learn releases)
    one_hotencoder = OneHotEncoder()
    y = pd.DataFrame(one_hotencoder.fit_transform(
        np.reshape(y.values, (-1, 1))).toarray(),
                     columns=one_hotencoder.get_feature_names())

    y_train = y.iloc[:len(X_train)]
    y_test = y.iloc[len(X_train):]

    models = []
    losses = []
    accuracies = []
    model_data = []

    logger("Establishing callback function")

    # early stopping callback
    es = EarlyStopping(monitor=maximizer, mode='max', verbose=0, patience=5)

    callback_value = None
    if callback is not False:
        callback_value = [es]

    i = 0
    model = get_keras_model_class(data, i, num_classes, add_layer)
    logger("Training initial model")

    history = model.fit(X_train,
                        y_train,
                        callbacks=callback_value,
                        epochs=epochs,
                        validation_data=(X_test, y_test),
                        verbose=0)

    model_data.append(model)
    models.append(history)
    col_name = [[
        "Initial number of layers ", "| Training Accuracy ", "| Test Accuracy "
    ]]
    col_width = max(len(word) for row in col_name for word in row) + 2
    for row in col_name:
        print((" " * 2 * counter) + "| " +
              ("".join(word.ljust(col_width) for word in row)) + " |")
    values = []
    values.append(str(len(model.layers)))
    values.append("| " + str(history.history['accuracy'][-1]))
    values.append("| " + str(history.history['val_accuracy'][-1]))
    datax = []
    datax.append(values)
    for row in datax:
        print((" " * 2 * counter) + "| " +
              ("".join(word.ljust(col_width) for word in row)) + " |")
    # print((" " * 2 * counter)+ tabulate(datax, headers=col_name, tablefmt='orgtbl'))
    losses.append(history.history[maximizer][-1])
    accuracies.append(history.history['val_accuracy'][-1])
    # keeps building and fitting deeper models until the validation
    # accuracy stops improving

    logger("Testing number of layers")
    col_name = [[
        "Current number of layers", "| Training Accuracy", "| Test Accuracy"
    ]]
    col_width = max(len(word) for row in col_name for word in row) + 2

    for row in col_name:
        print((" " * 2 * counter) + "| " +
              ("".join(word.ljust(col_width) for word in row)) + " |")
    datax = []
    # while all(x < y for x, y in zip(accuracies, accuracies[1:])):
    while len(accuracies) <= 2 or accuracies[-1] > accuracies[-2]:
        model = get_keras_model_class(data, i, num_classes, add_layer)
        history = model.fit(X_train,
                            y_train,
                            callbacks=callback_value,
                            epochs=epochs,
                            validation_data=(X_test, y_test),
                            verbose=0)

        values = []
        datax = []
        values.append(str(len(model.layers)))
        values.append("| " + str(history.history['accuracy'][-1]))
        values.append("| " + str(history.history['val_accuracy'][-1]))
        datax.append(values)
        for row in datax:
            print((" " * 2 * counter) + "| " +
                  ("".join(word.ljust(col_width) for word in row)) + " |")
        del values, datax
        losses.append(history.history[maximizer][-1])
        accuracies.append(history.history['val_accuracy'][-1])
        models.append(history)
        model_data.append(model)

        i += 1
    # print((" " * 2 * counter)+ tabulate(datax, headers=col_name, tablefmt='orgtbl'))
    # del values, datax

    final_model = model_data[accuracies.index(max(accuracies))]
    final_hist = models[accuracies.index(max(accuracies))]

    print("")
    logger('->',
           "Best number of layers found: " + str(len(final_model.layers)))
    logger(
        '->', "Training Accuracy: " + str(final_hist.history['accuracy'][-1]))
    logger(
        '->', "Test Accuracy: " + str(final_hist.history['val_accuracy'][-1]))

    # generates appropriate classification plots by feeding all information
    plots = {}
    if generate_plots:
        # note: plots are built from the most recent history, not
        # necessarily the best-performing model
        plots = generate_classification_plots(models[-1])

    if save_model:
        save(final_model, save_model, save_path)

    print("")
    logger("Stored model under 'classification_ANN' key")
    clearLog()

    K.clear_session()

    # stores the values and plots into the object dictionary
    return {
        'id': generate_id(),
        "model": final_model,
        'num_classes': num_classes,
        "plots": plots,
        "target": remove,
        "preprocessor": full_pipeline,
        "interpreter": one_hotencoder,
        'test_data': {
            'X': X_test,
            'y': y_test
        },
        'losses': {
            'training_loss': final_hist.history['loss'],
            'val_loss': final_hist.history['val_loss']
        },
        'accuracy': {
            'training_accuracy': final_hist.history['accuracy'],
            'validation_accuracy': final_hist.history['val_accuracy']
        }
    }
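
For context, a minimal invocation of the function above might look like the sketch below. The file name and instruction string are hypothetical placeholders, not part of the libra source; the function infers the target column from the natural-language instruction.

# Hypothetical usage sketch (file name and instruction are placeholders,
# not part of the libra source above)
results = classification_ann(
    instruction="predict survived",  # natural-language target query
    dataset="titanic.csv",           # any file DataReader can load
    epochs=20,
    generate_plots=False)

print(results['target'])       # inferred target column
print(results['num_classes'])  # number of classes detected
best_model = results['model']  # best Keras model by validation accuracy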
Code Example #2
File: feedforward_nn.py  Project: Samyak-Doshi/libra
# This fork tunes on validation loss rather than validation accuracy:
# `maximizer` defaults to "val_loss", EarlyStopping runs with mode='min',
# and the layer search continues while the recorded losses strictly
# decrease. Imports and libra helpers are the same as in Code Example #1.
def classification_ann(instruction,
                       dataset=None,
                       text=None,
                       ca_threshold=None,
                       preprocess=True,
                       callback_mode='min',
                       drop=None,
                       random_state=49,
                       test_size=0.2,
                       epochs=50,
                       generate_plots=True,
                       maximizer="val_loss",
                       save_model=True,
                       save_path=os.getcwd()):

    global currLog
    logger("Reading in dataset...")

    dataReader = DataReader(dataset)
    data = dataReader.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    data, y, remove, full_pipeline = initial_preprocesser(
        data, instruction, preprocess, ca_threshold, text)
    logger("->", "Target Column Found: {}".format(remove))

    # Needed to make a custom label encoder due to train test split changes
    # Can still be inverse transformed, just a bit of extra work
    y = pd.concat([y['train'], y['test']], axis=0)

    num_classes = len(np.unique(y))

    X_train = data['train']
    X_test = data['test']

    # ANN needs target one hot encoded for classification
    one_hot_encoder = OneHotEncoder()

    y = pd.DataFrame(one_hot_encoder.fit_transform(
        np.reshape(y.values, (-1, 1))).toarray(),
                     columns=one_hot_encoder.get_feature_names())

    y_train = y.iloc[:len(X_train)]
    y_test = y.iloc[len(X_train):]

    models = []
    losses = []
    accuracies = []
    model_data = []

    logger("Establishing callback function...")

    # early stopping callback
    es = EarlyStopping(monitor=maximizer, mode='min', verbose=0, patience=5)

    i = 0
    model = get_keras_model_class(data, i, num_classes)
    logger("Training initial model...")
    history = model.fit(X_train,
                        y_train,
                        epochs=epochs,
                        validation_data=(X_test, y_test),
                        callbacks=[es],
                        verbose=0)

    model_data.append(model)
    models.append(history)
    col_name = [[
        "Initial number of layers ", "| Training Loss ", "| Test Loss "
    ]]
    col_width = max(len(word) for row in col_name for word in row) + 2
    for row in col_name:
        print((" " * 2 * counter) + "| " +
              ("".join(word.ljust(col_width) for word in row)) + " |")
    values = []
    values.append(str(len(model.layers)))
    values.append("| " + str(history.history['loss'][-1]))
    values.append("| " + str(history.history['val_loss'][-1]))
    datax = []
    datax.append(values)
    for row in datax:
        print((" " * 2 * counter) + "| " +
              ("".join(word.ljust(col_width) for word in row)) + " |")
    #print((" " * 2 * counter)+ tabulate(datax, headers=col_name, tablefmt='orgtbl'))
    losses.append(history.history[maximizer][-1])
    # keeps running model and fit functions until the validation loss stops
    # decreasing

    logger("Testing number of layers...")
    col_name = [["Current number of layers", "| Training Loss", "| Test Loss"]]
    col_width = max(len(word) for row in col_name for word in row) + 2

    for row in col_name:
        print((" " * 2 * counter) + "| " +
              ("".join(word.ljust(col_width) for word in row)) + " |")
    datax = []
    while all(x > y for x, y in zip(losses, losses[1:])):
        model = get_keras_model_class(data, i, num_classes)
        history = model.fit(X_train,
                            y_train,
                            epochs=epochs,
                            validation_data=(X_test, y_test),
                            callbacks=[es],
                            verbose=0)

        values = []
        datax = []
        values.append(str(len(model.layers)))
        values.append("| " + str(history.history['loss'][-1]))
        values.append("| " + str(history.history['val_loss'][-1]))
        datax.append(values)
        for row in datax:
            print((" " * 2 * counter) + "| " +
                  ("".join(word.ljust(col_width) for word in row)) + " |")
        losses.append(history.history[maximizer][-1])
        accuracies.append(history.history['val_accuracy'][-1])
        # bug fix: record each candidate so models/model_data can be
        # indexed by the best loss below (the original only stored the
        # initial model, which raises an IndexError when a deeper model wins)
        models.append(history)
        model_data.append(model)
        i += 1
    #print((" " * 2 * counter)+ tabulate(datax, headers=col_name, tablefmt='orgtbl'))
    #del values, datax
    final_model = model_data[losses.index(min(losses))]
    final_hist = models[losses.index(min(losses))]
    print("")
    logger('->',
           "Best number of layers found: " + str(len(final_model.layers)))
    logger(
        '->', "Training Accuracy: " + str(final_hist.history['accuracy'][-1]))
    logger(
        '->', "Test Accuracy: " + str(final_hist.history['val_accuracy'][-1]))

    # generates appropriate classification plots by feeding all information
    # (wrapped in a check so the generate_plots parameter is honored,
    # as in Code Example #1)
    plots = {}
    if generate_plots:
        plots = generate_classification_plots(models[-1], data, y,
                                              model, X_test, y_test)

    if save_model:
        save(final_model, save_model)

    print("")
    logger("Stored model under 'classification_ANN' key")

    # stores the values and plots into the object dictionary
    return {
        'id': generate_id(),
        "model": final_model,
        'num_classes': num_classes,
        "plots": plots,
        "target": remove,
        "preprocesser": full_pipeline,
        "interpreter": one_hot_encoder,
        'losses': {
            'training_loss': final_hist.history['loss'],
            'val_loss': final_hist.history['val_loss']
        },
        'accuracy': {
            'training_accuracy': final_hist.history['accuracy'],
            'validation_accuracy': final_hist.history['val_accuracy']
        }
    }
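
Both variants return the fitted OneHotEncoder under the 'interpreter' key, so predicted probability vectors can be mapped back to the original labels. A minimal sketch, assuming a results dictionary returned by either function and a preprocessed feature matrix X_new (both hypothetical here); inverse_transform is the standard scikit-learn OneHotEncoder API.

import numpy as np

# Sketch: recover original class labels from one-hot model outputs
probs = results['model'].predict(X_new)  # shape (n_samples, num_classes)
one_hot = np.eye(results['num_classes'])[probs.argmax(axis=1)]
labels = results['interpreter'].inverse_transform(one_hot)
print(labels[:5])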