def regression_ann(
        instruction,
        callback=False,
        ca_threshold=None,
        text=None,
        dataset=None,
        drop=None,
        preprocess=True,
        test_size=0.2,
        random_state=49,
        epochs=50,
        generate_plots=True,
        callback_mode='min',
        maximizer="val_loss",
        save_model=False,
        save_path=None,
        add_layer=None):
    '''
    Body of the regression function used that is called in the neural network
    query if the data is numerical.

    Grows an ANN one step at a time (via get_keras_model_reg) until the
    monitored metric stops improving, then returns the best model found.

    :param many parameters: used to preprocess, tune, plot generation, and
        parameterizing the neural network trained.
    :return dictionary that holds all the information for the finished model.
    '''
    # Normalize defaults here instead of in the signature: [] and {} as
    # defaults are shared across calls (mutable-default bug), and
    # os.getcwd() as a default would be frozen at import time.
    if text is None:
        text = []
    if add_layer is None:
        add_layer = {}
    if save_path is None:
        save_path = os.getcwd()

    if dataset is None:
        dataReader = DataReader(get_file())
    else:
        dataReader = DataReader(dataset)
    logger("Reading in dataset")
    data = dataReader.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    data, y, target, full_pipeline = initial_preprocessor(
        data,
        instruction,
        preprocess,
        ca_threshold,
        text,
        test_size=test_size,
        random_state=random_state)
    logger("->", "Target column found: {}".format(target))

    X_train = data['train']
    X_test = data['test']

    # Standardize the regression target; the fitted scaler is returned as
    # the 'interpreter' so callers can inverse-transform predictions.
    target_scaler = StandardScaler()
    y_train = target_scaler.fit_transform(np.array(y['train']).reshape(-1, 1))
    y_test = target_scaler.transform(np.array(y['test']).reshape(-1, 1))

    logger("Establishing callback function")

    models = []
    losses = []
    model_data = []

    # callback function to store lowest loss value
    es = EarlyStopping(
        monitor=maximizer,
        mode=callback_mode,
        verbose=0,
        patience=5)

    callback_value = None
    if callback is not False:
        callback_value = [es]

    i = 0

    # add_layer format: {<object> : list of indexs}
    # get the first 3 layer model
    model = get_keras_model_reg(data, i, add_layer)

    logger("Training initial model")
    history = model.fit(
        X_train,
        y_train,
        epochs=epochs,
        validation_data=(X_test, y_test),
        callbacks=callback_value,
        verbose=0)
    models.append(history)
    model_data.append(model)

    col_name = [["Initial number of layers ",
                 "| Training Loss ", "| Test Loss "]]
    col_width = max(len(word) for row in col_name for word in row) + 2
    for row in col_name:
        print((" " * 2 * counter) + "| " +
              ("".join(word.ljust(col_width) for word in row)) + " |")
    values = []
    values.append(str(len(model.layers)))
    # Keras history lists have one entry per trained epoch, so [-1] is the
    # final metric value ('loss' and 'val_loss' always have equal length).
    values.append("| " + str(history.history['loss'][-1]))
    values.append("| " + str(history.history['val_loss'][-1]))
    datax = []
    datax.append(values)
    for row in datax:
        print((" " * 2 * counter) + "| " +
              ("".join(word.ljust(col_width) for word in row)) + " |")

    losses.append(history.history[maximizer][-1])

    # keeps running model and fit functions until the validation loss stops
    # decreasing
    logger("Testing number of layers")
    col_name = [["Current number of layers", "| Training Loss", "| Test Loss"]]
    col_width = max(len(word) for row in col_name for word in row) + 2
    for row in col_name:
        print((" " * 2 * counter) + "| " +
              ("".join(word.ljust(col_width) for word in row)) + " |")
    datax = []
    while (len(losses) <= 2 or losses[-1] < losses[-2]):
        model = get_keras_model_reg(data, i, add_layer)
        history = model.fit(
            X_train,
            y_train,
            callbacks=callback_value,
            epochs=epochs,
            validation_data=(X_test, y_test),
            verbose=0)
        model_data.append(model)
        models.append(history)

        values = []
        datax = []
        values.append(str(len(model.layers)))
        values.append("| " + str(history.history['loss'][-1]))
        values.append("| " + str(history.history['val_loss'][-1]))
        datax.append(values)
        for row in datax:
            print((" " * 2 * counter) + "| " +
                  ("".join(word.ljust(col_width) for word in row)) + " |")
        del values, datax
        losses.append(history.history[maximizer][-1])
        # NOTE(review): i is incremented after building, so the first loop
        # iteration rebuilds with the same i=0 as the initial model —
        # confirm this is intentional in get_keras_model_reg's contract.
        i += 1

    # Best model = the one with the minimum monitored metric value.
    final_model = model_data[losses.index(min(losses))]
    final_hist = models[losses.index(min(losses))]
    print("")
    logger('->', "Best number of layers found: " +
           str(len(final_model.layers)))
    logger('->', "Training Loss: " + str(final_hist.history['loss'][-1]))
    logger('->', "Test Loss: " + str(final_hist.history['val_loss'][-1]))

    # calls function to generate plots in plot generation
    plots = {}
    if generate_plots:
        init_plots, plot_names = generate_regression_plots(
            models[-1], data, y)
        for x in range(len(plot_names)):
            plots[str(plot_names[x])] = init_plots[x]

    if save_model:
        save(final_model, save_model, save_path)

    # stores values in the client object models dictionary field
    print("")
    logger("Stored model under 'regression_ANN' key")
    clearLog()

    K.clear_session()

    return {
        'id': generate_id(),
        'model': final_model,
        "target": target,
        "num_classes": 1,
        "plots": plots,
        "preprocessor": full_pipeline,
        "interpreter": target_scaler,
        'test_data': {'X': X_test, 'y': y_test},
        'losses': {
            'training_loss': final_hist.history['loss'],
            'val_loss': final_hist.history['val_loss']}}
def text_classification_query(self, instruction, drop=None,
                              preprocess=True,
                              label_column=None,
                              test_size=0.2,
                              random_state=49,
                              learning_rate=1e-2,
                              epochs=20,
                              monitor="val_loss",
                              batch_size=32,
                              max_text_length=200,
                              max_features=20000,
                              generate_plots=True,
                              save_model=False,
                              save_path=None):
    """
    function to apply text_classification algorithm for sentiment analysis

    Validates the hyperparameters, preprocesses/encodes the text, trains an
    LSTM classifier with early stopping, and stores the results on the
    client object under the 'text_classification' key.

    :param many params: used to hyperparametrize the function.
    :return a dictionary object with all of the information for the algorithm.
    """
    # Resolve the default save path at call time; an os.getcwd() default in
    # the signature would be frozen at import time.
    if save_path is None:
        save_path = os.getcwd()

    if test_size < 0:
        raise Exception("Test size must be a float between 0 and 1")

    if test_size >= 1:
        raise Exception(
            "Test size must be a float between 0 and 1 (a test size greater than or equal to 1 results in no training "
            "data)")

    if epochs < 1:
        raise Exception(
            "Epoch number is less than 1 (model will not be trained)")

    if batch_size < 1:
        raise Exception("Batch size must be equal to or greater than 1")

    if max_text_length < 1:
        raise Exception("Max text length must be equal to or greater than 1")

    if save_model:
        if not os.path.exists(save_path):
            # fixed grammar of the user-facing error message
            raise Exception("Save path does not exist")

    # test_size == 0 means train on everything and skip validation metrics
    if test_size == 0:
        testing = False
    else:
        testing = True

    data = DataReader(self.dataset)
    data = data.data_generator()

    if preprocess:
        data.fillna(0, inplace=True)

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    if label_column is None:
        label = "label"
    else:
        label = label_column

    X, Y, target = get_target_values(data, instruction, label)
    Y = np.array(Y)
    classes = np.unique(Y)
    logger("->", "Target Column Found: {}".format(target))

    vocab = {}
    if preprocess:
        logger("Preprocessing data")
        X = lemmatize_text(text_clean_up(X.array))
        # the cleaned/lemmatized corpus doubles as the stored vocabulary
        vocab = X
        X = encode_text(X, X)

    X = np.array(X)

    model = get_keras_text_class(max_features, len(classes), learning_rate)
    logger("Building Keras LSTM model dynamically")

    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state)

    X_train = sequence.pad_sequences(X_train, maxlen=max_text_length)
    X_test = sequence.pad_sequences(X_test, maxlen=max_text_length)

    # Map raw label values to contiguous integer ids; the mapping is
    # returned as the 'interpreter' so predictions can be decoded.
    y_vals = np.unique(np.append(y_train, y_test))
    label_mappings = {}
    for i in range(len(y_vals)):
        label_mappings[y_vals[i]] = i
    map_func = np.vectorize(lambda x: label_mappings[x])
    y_train = map_func(y_train)
    y_test = map_func(y_test)

    logger("Training initial model")

    # early stopping callback
    es = EarlyStopping(
        monitor=monitor,
        mode='auto',
        verbose=0,
        patience=5)

    history = model.fit(X_train, y_train,
                        validation_data=(X_test, y_test),
                        batch_size=batch_size,
                        epochs=epochs, callbacks=[es], verbose=0)

    # history lists have one entry per trained epoch; [-1] is the final value
    logger("->", "Final training loss: {}".format(
        history.history["loss"][-1]))
    if testing:
        logger("->", "Final validation loss: {}".format(
            history.history["val_loss"][-1]))
        logger("->", "Final validation accuracy: {}".format(
            history.history["val_accuracy"][-1]))
        losses = {
            'training_loss': history.history['loss'],
            'val_loss': history.history['val_loss']}
        accuracy = {
            'training_accuracy': history.history['accuracy'],
            'validation_accuracy': history.history['val_accuracy']}
    else:
        logger("->", "Final validation loss: {}".format(
            "0, No validation done"))
        losses = {'training_loss': history.history['loss']}
        accuracy = {'training_accuracy': history.history['accuracy']}

    plots = {}
    if generate_plots:
        # generates appropriate classification plots by feeding all
        # information
        logger("Generating plots")
        plots = generate_classification_plots(
            history, X, Y, model, X_test, y_test)

    if save_model:
        save(model, save_model, save_path=save_path)

    logger(
        "Storing information in client object under key 'text_classification'")

    # storing values the model dictionary
    self.models["text_classification"] = {
        "model": model,
        "classes": classes,
        "plots": plots,
        "target": Y,
        "vocabulary": vocab,
        "interpreter": label_mappings,
        "max_text_length": max_text_length,
        'test_data': {'X': X_test, 'y': y_test},
        'losses': losses,
        'accuracy': accuracy}
    clearLog()
    return self.models["text_classification"]
def classification_ann(instruction,
                       callback=False,
                       dataset=None,
                       text=None,
                       ca_threshold=None,
                       preprocess=True,
                       callback_mode='min',
                       drop=None,
                       random_state=49,
                       test_size=0.2,
                       epochs=50,
                       generate_plots=True,
                       maximizer="val_accuracy",
                       save_model=False,
                       save_path=None,
                       add_layer=None):
    '''
    Body of the classification function used that is called in the neural
    network query if the data is categorical.

    Grows an ANN one step at a time (via get_keras_model_class) while the
    monitored validation accuracy keeps improving, then returns the best
    model found.

    :param many parameters: used to preprocess, tune, plot generation, and
        parameterizing the neural network trained.
    :return dictionary that holds all the information for the finished model.
    '''
    # Normalize defaults here instead of in the signature: [] and {} as
    # defaults are shared across calls (mutable-default bug), and
    # os.getcwd() as a default would be frozen at import time.
    if text is None:
        text = []
    if add_layer is None:
        add_layer = {}
    if save_path is None:
        save_path = os.getcwd()

    if dataset is None:
        dataReader = DataReader(get_file())
    else:
        dataReader = DataReader(dataset)
    logger("Reading in dataset")
    data = dataReader.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    data, y, remove, full_pipeline = initial_preprocessor(
        data,
        instruction,
        preprocess,
        ca_threshold,
        text,
        test_size=test_size,
        random_state=random_state)
    logger("->", "Target column found: {}".format(remove))

    # Needed to make a custom label encoder due to train test split changes
    # Can still be inverse transformed, just a bit of extra work
    y = pd.concat([y['train'], y['test']], axis=0)

    num_classes = len(np.unique(y))

    if num_classes < 2:
        raise Exception("Number of classes must be greater than or equal to 2")

    X_train = data['train']
    X_test = data['test']

    if num_classes >= 2:
        # ANN needs target one hot encoded for classification
        one_hotencoder = OneHotEncoder()
        y = pd.DataFrame(
            one_hotencoder.fit_transform(
                np.reshape(y.values, (-1, 1))).toarray(),
            columns=one_hotencoder.get_feature_names())

    # y was built as train followed by test, so split it back by position
    y_train = y.iloc[:len(X_train)]
    y_test = y.iloc[len(X_train):]

    models = []
    losses = []
    accuracies = []
    model_data = []

    logger("Establishing callback function")

    # early stopping callback
    # NOTE(review): mode is hard-coded to 'max' here, so the callback_mode
    # parameter is effectively unused in this function (unlike
    # regression_ann, which passes it through) — confirm this is intended.
    es = EarlyStopping(
        monitor=maximizer,
        mode='max',
        verbose=0,
        patience=5)

    callback_value = None
    if callback is not False:
        callback_value = [es]

    i = 0
    model = get_keras_model_class(data, i, num_classes, add_layer)
    logger("Training initial model")

    history = model.fit(
        X_train,
        y_train,
        callbacks=callback_value,
        epochs=epochs,
        validation_data=(X_test, y_test),
        verbose=0)

    model_data.append(model)
    models.append(history)

    col_name = [["Initial number of layers ",
                 "| Training Accuracy ", "| Test Accuracy "]]
    col_width = max(len(word) for row in col_name for word in row) + 2
    for row in col_name:
        print((" " * 2 * counter) + "| " +
              ("".join(word.ljust(col_width) for word in row)) + " |")
    values = []
    values.append(str(len(model.layers)))
    # Keras history lists have one entry per trained epoch, so [-1] is the
    # final metric value ('accuracy'/'val_accuracy' have equal lengths).
    values.append("| " + str(history.history['accuracy'][-1]))
    values.append("| " + str(history.history['val_accuracy'][-1]))
    datax = []
    datax.append(values)
    for row in datax:
        print((" " * 2 * counter) + "| " +
              ("".join(word.ljust(col_width) for word in row)) + " |")

    losses.append(history.history[maximizer][-1])
    accuracies.append(history.history['val_accuracy'][-1])

    # keeps running model and fit functions until the validation loss stops
    # decreasing
    logger("Testing number of layers")
    col_name = [["Current number of layers",
                 "| Training Accuracy", "| Test Accuracy"]]
    col_width = max(len(word) for row in col_name for word in row) + 2
    for row in col_name:
        print((" " * 2 * counter) + "| " +
              ("".join(word.ljust(col_width) for word in row)) + " |")
    datax = []
    while (len(accuracies) <= 2
           or accuracies[-1] > accuracies[-2]):
        model = get_keras_model_class(data, i, num_classes, add_layer)
        history = model.fit(
            X_train,
            y_train,
            callbacks=callback_value,
            epochs=epochs,
            validation_data=(X_test, y_test),
            verbose=0)

        values = []
        datax = []
        values.append(str(len(model.layers)))
        values.append("| " + str(history.history['accuracy'][-1]))
        values.append("| " + str(history.history['val_accuracy'][-1]))
        datax.append(values)
        for row in datax:
            print((" " * 2 * counter) + "| " +
                  ("".join(word.ljust(col_width) for word in row)) + " |")
        del values, datax
        losses.append(history.history[maximizer][-1])
        accuracies.append(history.history['val_accuracy'][-1])
        models.append(history)
        model_data.append(model)
        # NOTE(review): i is incremented after building, so the first loop
        # iteration rebuilds with the same i=0 as the initial model —
        # confirm this is intentional in get_keras_model_class's contract.
        i += 1

    # Best model = the one with the maximum validation accuracy.
    final_model = model_data[accuracies.index(max(accuracies))]
    final_hist = models[accuracies.index(max(accuracies))]

    print("")
    logger('->', "Best number of layers found: " +
           str(len(final_model.layers)))
    logger('->', "Training Accuracy: " +
           str(final_hist.history['accuracy'][-1]))
    logger('->', "Test Accuracy: " +
           str(final_hist.history['val_accuracy'][-1]))

    # generates appropriate classification plots by feeding all information
    plots = {}
    if generate_plots:
        plots = generate_classification_plots(models[-1])

    if save_model:
        save(final_model, save_model, save_path)

    print("")
    logger("Stored model under 'classification_ANN' key")
    clearLog()

    K.clear_session()

    # stores the values and plots into the object dictionary
    return {
        'id': generate_id(),
        "model": final_model,
        'num_classes': num_classes,
        "plots": plots,
        "target": remove,
        "preprocessor": full_pipeline,
        "interpreter": one_hotencoder,
        'test_data': {'X': X_test, 'y': y_test},
        'losses': {
            'training_loss': final_hist.history['loss'],
            'val_loss': final_hist.history['val_loss']},
        'accuracy': {
            'training_accuracy': final_hist.history['accuracy'],
            'validation_accuracy': final_hist.history['val_accuracy']}}