Exemple #1
0
def regression_ann(instruction,
                   callback=False,
                   ca_threshold=None,
                   text=None,
                   dataset=None,
                   drop=None,
                   preprocess=True,
                   test_size=0.2,
                   random_state=49,
                   epochs=50,
                   generate_plots=True,
                   callback_mode='min',
                   maximizer="val_loss",
                   save_model=False,
                   save_path=os.getcwd(),
                   add_layer=None):
    '''
    Train a series of regression networks and return the best-scoring one.

    The dataset is read, preprocessed by ``initial_preprocessor`` and the
    target is standardised with a ``StandardScaler`` fitted on the training
    split. Candidate models are built by ``get_keras_model_reg`` for
    successive values of a size index and trained until the final value of
    ``maximizer`` stops decreasing (at least three candidates are always
    trained). The candidate with the lowest final ``maximizer`` value wins.

    :param instruction: forwarded to ``initial_preprocessor`` to locate the target.
    :param callback: anything other than ``False`` attaches an ``EarlyStopping``
        callback (patience 5) to every fit.
    :param ca_threshold: forwarded to ``initial_preprocessor``.
    :param text: forwarded to ``initial_preprocessor``; ``None`` means an empty list.
    :param dataset: source handed to ``DataReader``; when ``None`` the result of
        ``get_file()`` is used instead.
    :param drop: column label(s) dropped from the data before preprocessing.
    :param preprocess: forwarded to ``initial_preprocessor``.
    :param test_size: forwarded to ``initial_preprocessor``.
    :param random_state: forwarded to ``initial_preprocessor``.
    :param epochs: number of epochs for every ``fit`` call.
    :param generate_plots: when truthy, ``generate_regression_plots`` is called
        for the selected model's training history.
    :param callback_mode: ``mode`` of the ``EarlyStopping`` callback.
    :param maximizer: history key monitored by ``EarlyStopping`` and used to
        compare candidates (lower is treated as better).
    :param save_model: when truthy, passed to ``save`` together with ``save_path``.
    :param save_path: location handed to ``save``.
    :param add_layer: forwarded to ``get_keras_model_reg``; ``None`` means an
        empty dict. Format noted by the original author:
        ``{<object>: list of indexes}``.
    :return dictionary with the selected model, target name, plots, fitted
        preprocessing pipeline, target scaler, test data (target in scaled
        space) and the loss history of the selected model.
    '''
    # Fresh containers per call; a mutable default would be shared by every call.
    text = [] if text is None else text
    add_layer = {} if add_layer is None else add_layer

    def print_row(cells, width):
        # `counter` is resolved from module scope (it is not defined here).
        print((" " * 2 * counter) + "| " +
              ("".join(cell.ljust(width) for cell in cells)) + " |")

    def print_header(cells):
        # Column width is derived from the widest header cell plus padding.
        width = max(len(cell) for cell in cells) + 2
        print_row(cells, width)
        return width

    def result_cells(net, fitted):
        # One table row: layer count, final training loss, final test loss.
        return [
            str(len(net.layers)),
            "| " + str(fitted.history['loss'][-1]),
            "| " + str(fitted.history['val_loss'][-1]),
        ]

    if dataset is None:
        dataReader = DataReader(get_file())
    else:
        dataReader = DataReader(dataset)
    logger("Reading in dataset")
    data = dataReader.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)
    data, y, target, full_pipeline = initial_preprocessor(
        data,
        instruction,
        preprocess,
        ca_threshold,
        text,
        test_size=test_size,
        random_state=random_state)
    logger("->", "Target column found: {}".format(target))

    X_train = data['train']
    X_test = data['test']

    # Target scaling: fit on the training target only, then apply to test.
    target_scaler = StandardScaler()
    y_train = target_scaler.fit_transform(np.array(y['train']).reshape(-1, 1))
    y_test = target_scaler.transform(np.array(y['test']).reshape(-1, 1))

    logger("Establishing callback function")

    models = []      # training histories, one per candidate
    losses = []      # final `maximizer` value, one per candidate
    model_data = []  # trained models, one per candidate

    es = EarlyStopping(monitor=maximizer,
                       mode=callback_mode,
                       verbose=0,
                       patience=5)

    callback_value = None
    if callback is not False:
        callback_value = [es]

    def fit(net):
        return net.fit(X_train,
                       y_train,
                       epochs=epochs,
                       validation_data=(X_test, y_test),
                       callbacks=callback_value,
                       verbose=0)

    i = 0

    # Build and train the initial candidate.
    model = get_keras_model_reg(data, i, add_layer)

    logger("Training initial model")
    history = fit(model)
    models.append(history)
    model_data.append(model)

    col_width = print_header(
        ["Initial number of layers ", "| Training Loss ", "| Test Loss "])
    print_row(result_cells(model, history), col_width)

    losses.append(history.history[maximizer][-1])

    # Keep training new candidates until the monitored value stops
    # decreasing; `len(losses) <= 2` guarantees at least three candidates.
    # NOTE: `i` is only incremented at the end of each iteration, so the
    # first iteration builds with the same index (0) as the initial model.
    logger("Testing number of layers")
    col_width = print_header(
        ["Current number of layers", "| Training Loss", "| Test Loss"])
    while len(losses) <= 2 or losses[-1] < losses[-2]:
        model = get_keras_model_reg(data, i, add_layer)
        history = fit(model)
        model_data.append(model)
        models.append(history)

        print_row(result_cells(model, history), col_width)
        losses.append(history.history[maximizer][-1])
        i += 1

    best = losses.index(min(losses))
    final_model = model_data[best]
    final_hist = models[best]
    print("")
    logger('->',
           "Best number of layers found: " + str(len(final_model.layers)))

    logger('->', "Training Loss: " + str(final_hist.history['loss'][-1]))
    logger('->', "Test Loss: " + str(final_hist.history['val_loss'][-1]))

    # Plot the history of the model that is actually returned, so the plots
    # agree with the 'model' and 'losses' entries of the result.
    plots = {}
    if generate_plots:
        init_plots, plot_names = generate_regression_plots(
            final_hist, data, y)
        for x, plot_name in enumerate(plot_names):
            plots[str(plot_name)] = init_plots[x]

    if save_model:
        save(final_model, save_model, save_path)

    print("")
    logger("Stored model under 'regression_ANN' key")
    clearLog()

    K.clear_session()

    return {
        'id': generate_id(),
        'model': final_model,
        "target": target,
        "num_classes": 1,
        "plots": plots,
        "preprocessor": full_pipeline,
        "interpreter": target_scaler,
        'test_data': {
            'X': X_test,
            'y': y_test
        },
        'losses': {
            'training_loss': final_hist.history['loss'],
            'val_loss': final_hist.history['val_loss']
        }
    }
Exemple #2
0
def text_classification_query(self,
                              instruction,
                              drop=None,
                              preprocess=True,
                              label_column=None,
                              test_size=0.2,
                              random_state=49,
                              learning_rate=1e-2,
                              epochs=20,
                              monitor="val_loss",
                              batch_size=32,
                              max_text_length=200,
                              max_features=20000,
                              generate_plots=True,
                              save_model=False,
                              save_path=os.getcwd()):
    """
    Train a Keras text classifier on ``self.dataset`` and store the result.

    Arguments are validated first; the data is then read, optionally cleaned,
    lemmatized and encoded, split into train/test parts, padded to
    ``max_text_length`` and fitted with an early-stopping callback.
    :param many params: used to hyperparametrize the function.
    :return the dictionary saved under ``self.models["text_classification"]``
        (model, classes, plots, target values, vocabulary, label mapping,
        max text length, test data, losses and accuracy).
    """

    # ---- argument validation (order and messages matter to callers) ----
    if test_size < 0:
        raise Exception("Test size must be a float between 0 and 1")
    if test_size >= 1:
        raise Exception(
            "Test size must be a float between 0 and 1 (a test size greater than or equal to 1 results in no training "
            "data)")
    if epochs < 1:
        raise Exception(
            "Epoch number is less than 1 (model will not be trained)")
    if batch_size < 1:
        raise Exception("Batch size must be equal to or greater than 1")
    if max_text_length < 1:
        raise Exception("Max text length must be equal to or greater than 1")
    if save_model and not os.path.exists(save_path):
        raise Exception("Save path does not exists")

    # A test size of exactly zero means no validation metrics are reported.
    testing = test_size != 0

    # ---- load and tidy the raw data ----
    data = DataReader(self.dataset).data_generator()
    if preprocess:
        data.fillna(0, inplace=True)
    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    label = "label" if label_column is None else label_column

    X, Y, target = get_target_values(data, instruction, label)
    Y = np.array(Y)
    classes = np.unique(Y)

    logger("->", "Target Column Found: {}".format(target))

    # ---- text preprocessing ----
    vocab = {}
    if preprocess:
        logger("Preprocessing data")
        cleaned = text_clean_up(X.array)
        X = lemmatize_text(cleaned)
        vocab = X  # the lemmatized text doubles as the stored vocabulary
        X = encode_text(X, X)

    X = np.array(X)

    model = get_keras_text_class(max_features, len(classes), learning_rate)
    logger("Building Keras LSTM model dynamically")

    # ---- split, pad and encode labels ----
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state)

    X_train = sequence.pad_sequences(X_train, maxlen=max_text_length)
    X_test = sequence.pad_sequences(X_test, maxlen=max_text_length)

    # Map every distinct label to its position in the sorted unique labels.
    y_vals = np.unique(np.append(y_train, y_test))
    label_mappings = {raw: position for position, raw in enumerate(y_vals)}
    to_index = np.vectorize(lambda raw: label_mappings[raw])
    y_train = to_index(y_train)
    y_test = to_index(y_test)

    logger("Training initial model")

    # early stopping callback
    stopper = EarlyStopping(monitor=monitor,
                            mode='auto',
                            verbose=0,
                            patience=5)

    history = model.fit(X_train,
                        y_train,
                        validation_data=(X_test, y_test),
                        batch_size=batch_size,
                        epochs=epochs,
                        callbacks=[stopper],
                        verbose=0)

    # ---- report final metrics ----
    logger("->",
           "Final training loss: {}".format(history.history["loss"][-1]))
    if not testing:
        logger("->",
               "Final validation loss: {}".format("0, No validation done"))
        losses = {'training_loss': history.history['loss']}
        accuracy = {'training_accuracy': history.history['accuracy']}
    else:
        logger(
            "->", "Final validation loss: {}".format(
                history.history["val_loss"][-1]))
        logger(
            "->", "Final validation accuracy: {}".format(
                history.history["val_accuracy"][-1]))
        losses = {
            'training_loss': history.history['loss'],
            'val_loss': history.history['val_loss']
        }
        accuracy = {
            'training_accuracy': history.history['accuracy'],
            'validation_accuracy': history.history['val_accuracy']
        }

    plots = {}
    if generate_plots:
        # the plot helper receives the history, the full data, the model
        # and the held-out split
        logger("Generating plots")
        plots = generate_classification_plots(history, X, Y, model, X_test,
                                              y_test)

    if save_model:
        save(model, save_model, save_path=save_path)

    logger(
        "Storing information in client object under key 'text_classification'")

    # ---- publish the result on the client object ----
    result = {
        "model": model,
        "classes": classes,
        "plots": plots,
        "target": Y,
        "vocabulary": vocab,
        "interpreter": label_mappings,
        "max_text_length": max_text_length,
        'test_data': {
            'X': X_test,
            'y': y_test
        },
        'losses': losses,
        'accuracy': accuracy
    }
    self.models["text_classification"] = result
    clearLog()
    return self.models["text_classification"]
Exemple #3
0
def classification_ann(instruction,
                       callback=False,
                       dataset=None,
                       text=None,
                       ca_threshold=None,
                       preprocess=True,
                       callback_mode='min',
                       drop=None,
                       random_state=49,
                       test_size=0.2,
                       epochs=50,
                       generate_plots=True,
                       maximizer="val_accuracy",
                       save_model=False,
                       save_path=os.getcwd(),
                       add_layer=None):
    '''
    Train a series of classification networks and return the most accurate one.

    The dataset is read, preprocessed by ``initial_preprocessor`` and the
    target is one-hot encoded. Candidate models are built by
    ``get_keras_model_class`` for successive values of a size index and
    trained until the final validation accuracy stops increasing (at least
    three candidates are always trained). The candidate with the highest
    final validation accuracy wins.

    :param instruction: forwarded to ``initial_preprocessor`` to locate the target.
    :param callback: anything other than ``False`` attaches an ``EarlyStopping``
        callback (patience 5) to every fit.
    :param dataset: source handed to ``DataReader``; when ``None`` the result of
        ``get_file()`` is used instead.
    :param text: forwarded to ``initial_preprocessor``; ``None`` means an empty list.
    :param ca_threshold: forwarded to ``initial_preprocessor``.
    :param preprocess: forwarded to ``initial_preprocessor``.
    :param callback_mode: accepted for interface compatibility but currently
        not used; the early-stopping callback always runs with ``mode='max'``.
    :param drop: column label(s) dropped from the data before preprocessing.
    :param random_state: forwarded to ``initial_preprocessor``.
    :param test_size: forwarded to ``initial_preprocessor``.
    :param epochs: number of epochs for every ``fit`` call.
    :param generate_plots: when truthy, ``generate_classification_plots`` is
        called for the selected model's training history.
    :param maximizer: history key monitored by ``EarlyStopping``. Model
        selection itself always uses ``val_accuracy``.
    :param save_model: when truthy, passed to ``save`` together with ``save_path``.
    :param save_path: location handed to ``save``.
    :param add_layer: forwarded to ``get_keras_model_class``; ``None`` means an
        empty dict.
    :return dictionary with the selected model, number of classes, plots,
        target name, fitted preprocessing pipeline, fitted one-hot encoder,
        test data and the loss/accuracy history of the selected model.
    :raises Exception: when fewer than two distinct classes are found.
    '''
    # Fresh containers per call; a mutable default would be shared by every call.
    text = [] if text is None else text
    add_layer = {} if add_layer is None else add_layer

    def print_row(cells, width):
        # `counter` is resolved from module scope (it is not defined here).
        print((" " * 2 * counter) + "| " +
              ("".join(cell.ljust(width) for cell in cells)) + " |")

    def print_header(cells):
        # Column width is derived from the widest header cell plus padding.
        width = max(len(cell) for cell in cells) + 2
        print_row(cells, width)
        return width

    def result_cells(net, fitted):
        # One table row: layer count, final training and test accuracy.
        return [
            str(len(net.layers)),
            "| " + str(fitted.history['accuracy'][-1]),
            "| " + str(fitted.history['val_accuracy'][-1]),
        ]

    if dataset is None:
        dataReader = DataReader(get_file())
    else:
        dataReader = DataReader(dataset)
    logger("Reading in dataset")
    data = dataReader.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    data, y, remove, full_pipeline = initial_preprocessor(
        data,
        instruction,
        preprocess,
        ca_threshold,
        text,
        test_size=test_size,
        random_state=random_state)
    logger("->", "Target column found: {}".format(remove))

    # Train and test targets are encoded together so both splits share one
    # set of one-hot columns; they are separated again by position below.
    y = pd.concat([y['train'], y['test']], axis=0)

    num_classes = len(np.unique(y))

    if num_classes < 2:
        raise Exception("Number of classes must be greater than or equal to 2")

    X_train = data['train']
    X_test = data['test']

    # ANN needs the target one-hot encoded for classification.
    one_hotencoder = OneHotEncoder()
    encoded = one_hotencoder.fit_transform(
        np.reshape(y.values, (-1, 1))).toarray()
    # Newer scikit-learn releases expose get_feature_names_out and no longer
    # provide get_feature_names; fall back to the old name on older releases.
    feature_names = (getattr(one_hotencoder, "get_feature_names_out", None)
                     or one_hotencoder.get_feature_names)
    y = pd.DataFrame(encoded, columns=feature_names())

    y_train = y.iloc[:len(X_train)]
    y_test = y.iloc[len(X_train):]

    models = []      # training histories, one per candidate
    losses = []      # final `maximizer` value per candidate (not used for selection)
    accuracies = []  # final validation accuracy, one per candidate
    model_data = []  # trained models, one per candidate

    logger("Establishing callback function")

    # early stopping callback
    es = EarlyStopping(monitor=maximizer, mode='max', verbose=0, patience=5)

    callback_value = None
    if callback is not False:
        callback_value = [es]

    def fit(net):
        return net.fit(X_train,
                       y_train,
                       callbacks=callback_value,
                       epochs=epochs,
                       validation_data=(X_test, y_test),
                       verbose=0)

    i = 0
    model = get_keras_model_class(data, i, num_classes, add_layer)
    logger("Training initial model")

    history = fit(model)

    model_data.append(model)
    models.append(history)
    col_width = print_header([
        "Initial number of layers ", "| Training Accuracy ", "| Test Accuracy "
    ])
    print_row(result_cells(model, history), col_width)

    losses.append(history.history[maximizer][-1])
    accuracies.append(history.history['val_accuracy'][-1])

    # Keep training new candidates until validation accuracy stops
    # increasing; `len(accuracies) <= 2` guarantees at least three candidates.
    # NOTE: `i` is only incremented at the end of each iteration, so the
    # first iteration builds with the same index (0) as the initial model.
    logger("Testing number of layers")
    col_width = print_header([
        "Current number of layers", "| Training Accuracy", "| Test Accuracy"
    ])
    while len(accuracies) <= 2 or accuracies[-1] > accuracies[-2]:
        model = get_keras_model_class(data, i, num_classes, add_layer)
        history = fit(model)

        print_row(result_cells(model, history), col_width)
        losses.append(history.history[maximizer][-1])
        accuracies.append(history.history['val_accuracy'][-1])
        models.append(history)
        model_data.append(model)

        i += 1

    best = accuracies.index(max(accuracies))
    final_model = model_data[best]
    final_hist = models[best]

    print("")
    logger('->',
           "Best number of layers found: " + str(len(final_model.layers)))
    logger('->',
           "Training Accuracy: " + str(final_hist.history['accuracy'][-1]))
    logger('->',
           "Test Accuracy: " + str(final_hist.history['val_accuracy'][-1]))

    # Plot the history of the model that is actually returned, so the plots
    # agree with the 'model', 'losses' and 'accuracy' entries of the result.
    plots = {}
    if generate_plots:
        plots = generate_classification_plots(final_hist)

    if save_model:
        save(final_model, save_model, save_path)

    print("")
    logger("Stored model under 'classification_ANN' key")
    clearLog()

    K.clear_session()

    # stores the values and plots into the object dictionary
    return {
        'id': generate_id(),
        "model": final_model,
        'num_classes': num_classes,
        "plots": plots,
        "target": remove,
        "preprocessor": full_pipeline,
        "interpreter": one_hotencoder,
        'test_data': {
            'X': X_test,
            'y': y_test
        },
        'losses': {
            'training_loss': final_hist.history['loss'],
            'val_loss': final_hist.history['val_loss']
        },
        'accuracy': {
            'training_accuracy': final_hist.history['accuracy'],
            'validation_accuracy': final_hist.history['val_accuracy']
        }
    }