def get_ner(self, instruction):
    """
    Detect named entities in the text column referenced by the instruction.

    Reads the dataset at ``self.dataset``, picks the column most similar to
    the value extracted from the instruction, strips stopwords from every
    row and runs the cleaned text through a Hugging Face ``'ner'`` pipeline.
    The result is stored in ``self.models["named_entity_recognition"]``.

    :param instruction: Used to get target column
    :return: dictionary with the keys ``"model"``, ``"tokenizer"`` and
        ``'name_entities'`` (row index -> entities returned by the pipeline)
    """
    data = DataReader(self.dataset)
    data = data.data_generator()

    target = get_similar_column(get_value_instruction(instruction), data)
    logger("->", "Target Column Found: {}".format(target))

    # Remove stopwords if any from the detection column.
    # The stopword list is loaded once and held in a set: the previous code
    # called stopwords.words() again for every single word of every row.
    stop_words = set(stopwords.words())
    data['combined_text_for_ner'] = data[target].apply(
        lambda x: ' '.join(
            word for word in x.split() if word not in stop_words))

    # NOTE(review): the count logged here is rows + 1 - confirm whether the
    # extra one is intentional.
    logger("Detecting Name Entities from : {} data files".format(
        data.shape[0] + 1))

    # Named entity recognition pipeline, default model selection.
    # Both the pipeline construction and the inference run with the
    # standard streams silenced.
    with NoStdStreams():
        hugging_face_ner_detector = pipeline(
            'ner', grouped_entities=True, framework='tf')
        data['ner'] = data['combined_text_for_ner'].apply(
            lambda x: hugging_face_ner_detector(x))
    logger("NER detection status complete")
    logger("Storing information in client object under key "
           "'named_entity_recognition'")

    name_entities = data['ner'].to_dict()
    self.models["named_entity_recognition"] = {
        "model": hugging_face_ner_detector.model,
        "tokenizer": hugging_face_ner_detector.tokenizer,
        'name_entities': name_entities}

    logger("Output: ", name_entities)
    clearLog()
    return self.models["named_entity_recognition"]
def decision_tree(instruction,
                  dataset=None,
                  preprocess=True,
                  ca_threshold=None,
                  text=None,
                  test_size=0.2,
                  drop=None):
    '''
    function to train a decision tree classifier on a tabular dataset
    :param instruction: written query used to locate the target column
    :param dataset: dataset location handed to DataReader
    :param preprocess: forwarded to initial_preprocesser
    :param ca_threshold: forwarded to initial_preprocesser
    :param text: forwarded to initial_preprocesser
    :param test_size: accepted for interface compatibility; this function
        does not forward it to the preprocesser
    :param drop: optional column label(s) removed before preprocessing
    :return a dictionary object with the fitted model, target column,
        test accuracy, preprocessing pipeline, label mapping and a 3-fold
        cross validation score.
    '''
    logger("Reading in dataset....")

    dataReader = DataReader(dataset)
    data = dataReader.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    data, y, remove, full_pipeline = initial_preprocesser(
        data, instruction, preprocess, ca_threshold, text)
    logger("->", "Target Column Found: {}".format(remove))

    X_train = data['train']
    y_train = y['train']
    X_test = data['test']
    y_test = y['test']

    # Custom label encoder: the mapping is built from the labels of BOTH
    # splits so that train and test share one encoding.  It can still be
    # inverse transformed by reversing the dictionary.
    y_vals = np.unique(pd.concat([y['train'], y['test']], axis=0))
    label_mappings = {label: index for index, label in enumerate(y_vals)}

    y_train = y_train.apply(lambda x: label_mappings[x]).values
    y_test = y_test.apply(lambda x: label_mappings[x]).values

    # fitting and storing
    logger("Fitting Decision Tree...")

    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(X_train, y_train)

    score = accuracy_score(clf.predict(X_test), y_test)
    logger("->", "Score found on testing set: {}".format(score))
    logger("Stored model under 'decision_tree' key")
    clearLog()

    return {
        'id': generate_id(),
        "model": clf,
        "target": remove,
        "accuracy_score": score,
        "preprocesser": full_pipeline,
        # The misspelled key is kept for existing callers; the correctly
        # spelled one matches the other model functions in this file.
        "interpeter": label_mappings,
        "interpreter": label_mappings,
        "cross_val_score": cross_val_score(clf, X_train, y_train, cv=3)
    }
def dimensionality_RF(instruction, dataset, target="", y="", n_features=10):
    '''
    function to reduce dimensionality in dataset via random forest feature importances
    :param instruction: command sent to client instance in written query.
    :param dataset: dataset location handed to DataReader
    :param target: column name of response variable/feature
    :param y: replaced by the preprocesser output when target is empty
    :param n_features: accepted for interface compatibility; not used here
    :return: tuple of (best dataset, baseline accuracy, best accuracy,
        list of columns kept for the best dataset)
    '''
    global counter

    dataReader = DataReader(dataset)

    # NOTE(review): the train/test splits are only built when no target is
    # supplied - confirm how callers that pass a target are meant to work.
    if target == "":
        data = dataReader.data_generator()
        data.fillna(0, inplace=True)
        remove = get_similar_column(get_value_instruction(instruction), data)
        data, y, target, full_pipeline = initial_preprocessor(
            data, instruction, True, 0.2, [], 0.2, random_state=49)

        X_train = data['train']
        y_train = y['train']
        X_test = data['test']
        y_test = y['test']

        # Fit ONE encoder on the labels of both splits.  Re-fitting on the
        # test labels (as before) could assign different integers to the
        # same class in train and test.
        le = preprocessing.LabelEncoder()
        le.fit(np.concatenate([np.asarray(y_train), np.asarray(y_test)]))
        y_train = le.transform(y_train)
        y_test = le.transform(y_test)

    # Baseline: a decision tree on every feature.
    first_classifier = tree.DecisionTreeClassifier()
    first_classifier.fit(X_train, y_train)

    first_classifier_acc = accuracy_score(
        first_classifier.predict(X_test), y_test)

    accuracy_scores = [first_classifier_acc]
    columns = []
    datas = []
    datas.append(dataset)
    columns.append([])

    # NOTE(review): `i` is not used inside the loop body, so every value of
    # `x` is evaluated once per `i` - confirm whether max_depth was meant
    # to be `i`.
    for i, x in product(range(3, 10), range(4, len(X_train.columns))):
        feature_model = RandomForestRegressor(random_state=1, max_depth=x)
        feature_model.fit(X_train, y_train)

        # Keep the x features with the highest importance.
        importances = feature_model.feature_importances_
        indices = np.argsort(importances)[-x:]
        columns.append(X_train.columns[indices])

        X_temp_train = X_train[X_train.columns[indices]]
        X_temp_test = X_test[X_train.columns[indices]]

        # Reduced dataset (train rows followed by test rows) plus target.
        val = pd.DataFrame(np.r_[X_temp_train, X_temp_test])
        val[target] = np.r_[y_train, y_test]
        datas.append(val)

        vr = tree.DecisionTreeClassifier()
        vr.fit(X_temp_train, y_train)

        accuracy_scores.append(accuracy_score(vr.predict(X_temp_test), y_test))

    the_index = accuracy_scores.index(max(accuracy_scores))
    print(accuracy_scores)
    return datas[the_index], accuracy_scores[0], max(
        accuracy_scores), list(columns[the_index])
def dimensionality_KPCA(instruction, dataset, target="", y=""):
    '''
    function to reduce dimensionality in dataset via kernel principal component analysis
    :param instruction: command sent to client instance in written query.
    :param dataset: feature frame that is transformed and split
    :param target: column name of response variable/feature
    :param y: labels aligned row-by-row with dataset (derived from the last
        data file when target is empty)
    :return: tuple of (transformed frame with target column, baseline
        accuracy, best accuracy on transformed data, column count delta)
    '''
    global currLog
    global counter

    dataReader = DataReader("./data/" + get_last_file()[0])

    if target == "":
        data = dataReader.data_generator()
        data.fillna(0, inplace=True)
        remove = get_similar_column(get_value_instruction(instruction), data)

        y = data[remove]
        del data[remove]
        le = preprocessing.LabelEncoder()
        y = le.fit_transform(y)

    kpca = KernelPCA(n_components=len(dataset.columns), kernel="rbf")
    data_modified = kpca.fit_transform(dataset)

    # Same random_state on both calls, so the original and the transformed
    # data are split into identical row sets.
    X_train, X_test, y_train, y_test = train_test_split(
        dataset, y, test_size=0.2, random_state=49)
    X_train_mod, X_test_mod, y_train_mod, y_test_mod = train_test_split(
        data_modified, y, test_size=0.2, random_state=49)

    clf = tree.DecisionTreeClassifier()
    clf.fit(X_train, y_train)

    clf_mod = tree.DecisionTreeClassifier()
    clf_mod.fit(X_train_mod, y_train_mod)
    acc = []
    acc.append(accuracy_score(
        clf_mod.predict(X_test_mod), y_test_mod))

    # Small grid over tree depth and split criterion on transformed data.
    for i, j in product(range(3, 10), ["entropy", "gini"]):
        model = tree.DecisionTreeClassifier(criterion=j, max_depth=i)
        model = model.fit(X_train_mod, y_train_mod)
        acc.append(accuracy_score(model.predict(X_test_mod), y_test_mod))
    del i, j

    data_modified = pd.DataFrame(data_modified)
    # data_modified keeps the ORIGINAL row order, so it must receive y as
    # is.  The previous np.r_[y_train, y_test] was in shuffled split order
    # and attached labels to the wrong rows.
    data_modified[target] = y
    # data_modified.to_csv("./data/housingPCA.csv")

    return data_modified, accuracy_score(
        clf.predict(X_test), y_test), max(acc), (len(
            dataset.columns) - len(data_modified.columns))


def booster(dataset, obj):
    '''
    function to fit an XGBoost classifier and report its hold-out accuracy
    :param dataset: feature frame to split and train on
    :param obj: XGBoost objective, e.g. "reg:linear" or "multi:softmax"
    :return: accuracy of the classifier on the held-out 20% split
    '''
    # obj=["reg:linear","multi:softmax "]

    # NOTE(review): `y` is not a parameter; it is resolved from module
    # scope at call time - confirm where it is expected to be defined.
    X_train, X_test, y_train, y_test = train_test_split(
        dataset, y, test_size=0.2, random_state=49)
    clf = XGBClassifier(
        objective=obj,
        learning_rate=0.1,
        silent=1,
        alpha=10)
    clf.fit(X_train, y_train)
    # Score on this function's own split; the previous code referenced
    # X_test_mod / y_test_mod, which are never defined in this scope.
    return accuracy_score(clf.predict(X_test), y_test)
def dimensionality_KPCA(instruction, dataset, target="", y=""):
    '''
    function to reduce dimensionality in dataset via kernel principal component analysis
    :param instruction: command sent to client instance in written query.
    :param dataset: dataset location handed to DataReader
    :param target: overwritten with the target name returned by the preprocesser
    :param y: overwritten with the train/test labels returned by the preprocesser
    :return: tuple of (transformed train+test frame with target column,
        baseline accuracy on untransformed data, best accuracy on
        transformed data, raw column count minus transformed column count)
    '''
    pca = KernelPCA(kernel='rbf')

    reader = DataReader(dataset)
    dataset = reader.data_generator()

    splits, labels, target, full_pipeline = initial_preprocesser(
        dataset, instruction, True, 0.2, [], 0.2, random_state=49)

    X_train, X_test = splits['train'], splits['test']
    y_train, y_test = labels['train'], labels['test']

    # The projection is learned on the training split only.
    X_train_mod = pca.fit_transform(X_train)
    X_test_mod = pca.transform(X_test)

    # Baseline tree on raw features, then a tree on the projected ones.
    baseline_tree = tree.DecisionTreeClassifier()
    projected_tree = tree.DecisionTreeClassifier()
    baseline_tree.fit(X_train, y_train)
    projected_tree.fit(X_train_mod, y_train)

    scores = [accuracy_score(projected_tree.predict(X_test_mod), y_test)]

    # Depth / criterion sweep on the projected data.
    for depth, criterion in product(range(3, 10), ["entropy", "gini"]):
        candidate = tree.DecisionTreeClassifier(
            criterion=criterion, max_depth=depth).fit(X_train_mod, y_train)
        scores.append(accuracy_score(candidate.predict(X_test_mod), y_test))

    # Train rows first, test rows after, with the matching labels appended.
    train_frame = pd.DataFrame(X_train_mod)
    test_frame = pd.DataFrame(X_test_mod)
    data_modified = pd.concat([train_frame, test_frame], axis=0)
    data_modified[target] = np.r_[y_train, y_test]
    # data_modified.to_csv("./data/housingPCA.csv")

    baseline_accuracy = accuracy_score(baseline_tree.predict(X_test), y_test)
    column_delta = len(dataset.columns) - len(data_modified.columns)
    return data_modified, baseline_accuracy, max(scores), column_delta
def dimensionality_RF(instruction, dataset, target="", y="", n_features=10):
    '''
    function to reduce dimensionality in dataset via random forest feature importances
    :param instruction: command sent to client instance in written query.
    :param dataset: feature frame that is split and reduced
    :param target: column name of response variable/feature
    :param y: labels aligned with dataset (derived from the last data file
        when target is empty)
    :param n_features: accepted for interface compatibility; not used here
    :return: tuple of (best dataset, baseline accuracy, best accuracy,
        list of columns kept for the best dataset)
    '''
    global currLog
    global counter

    dataReader = DataReader("./data/" + get_last_file()[0])

    if target == "":
        data = dataReader.data_generator()
        data.fillna(0, inplace=True)
        remove = get_similar_column(get_value_instruction(instruction), data)
        data = structured_preprocesser(data)

        y = data[remove]
        del data[remove]
        encoder = preprocessing.LabelEncoder()
        y = encoder.fit_transform(y)

    X_train, X_test, y_train, y_test = train_test_split(
        dataset, y, test_size=0.2, random_state=49)

    # Baseline: decision tree trained on every column.
    baseline = tree.DecisionTreeClassifier()
    baseline.fit(X_train, y_train)

    # Position 0 of each list is the untouched baseline.
    accuracy_scores = [accuracy_score(baseline.predict(X_test), y_test)]
    columns = [[]]
    datas = [dataset]

    for depth, keep in product(range(3, 10), range(4, len(dataset.columns))):
        forest = RandomForestRegressor(random_state=1, max_depth=depth)
        forest.fit(X_train, y_train)

        # Column labels of the `keep` most important features.
        top_positions = np.argsort(forest.feature_importances_)[-keep:]
        selected = dataset.columns[top_positions]
        columns.append(selected)

        reduced_train = X_train[selected]
        reduced_test = X_test[selected]

        # Reduced dataset: train rows, then test rows, plus the target.
        candidate = pd.DataFrame(np.r_[reduced_train, reduced_test])
        candidate[target] = np.r_[y_train, y_test]
        datas.append(candidate)

        reduced_tree = tree.DecisionTreeClassifier()
        reduced_tree.fit(reduced_train, y_train)
        accuracy_scores.append(
            accuracy_score(reduced_tree.predict(reduced_test), y_test))

    best_accuracy = max(accuracy_scores)
    the_index = accuracy_scores.index(best_accuracy)
    return (datas[the_index], accuracy_scores[0], best_accuracy,
            list(columns[the_index]))
def nearest_neighbors(instruction=None,
                      dataset=None,
                      ca_threshold=None,
                      preprocess=True,
                      drop=None,
                      min_neighbors=3,
                      max_neighbors=10,
                      text=None):
    '''
    function to train a k-nearest-neighbors classifier, trying several k
    :param instruction: written query used to locate the target column
    :param dataset: dataset location handed to DataReader
    :param ca_threshold: forwarded to initial_preprocesser
    :param preprocess: forwarded to initial_preprocesser
    :param drop: optional column label(s) removed before preprocessing
    :param min_neighbors: first value of k that is tried (inclusive)
    :param max_neighbors: upper bound on k (exclusive)
    :param text: forwarded to initial_preprocesser
    :return a dictionary object with the best model, its test accuracy,
        the preprocessing pipeline, label mapping, target column and a
        3-fold cross validation score.
    :raises ValueError: if the neighbor range is empty
    '''
    # An empty range would leave nothing to select from further down.
    if min_neighbors >= max_neighbors:
        raise ValueError(
            "max_neighbors must be greater than min_neighbors")

    logger("Reading in dataset....")
    # Reads in dataset
    dataReader = DataReader(dataset)
    data = dataReader.data_generator()
    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    # `text` used to be referenced here without being defined anywhere.
    data, y, remove, full_pipeline = initial_preprocesser(
        data, instruction, preprocess, ca_threshold, text)
    logger("->", "Target Column Found: {}".format(remove))

    X_train = data['train']
    y_train = y['train']
    X_test = data['test']
    y_test = y['test']

    # Encode the labels as integers; the mapping is built from both splits
    # so that train and test share one encoding.
    y_vals = np.unique(pd.concat([y['train'], y['test']], axis=0))
    label_mappings = {label: index for index, label in enumerate(y_vals)}

    y_train = y_train.apply(lambda x: label_mappings[x]).values
    y_test = y_test.apply(lambda x: label_mappings[x]).values

    models = []
    scores = []
    logger("Fitting Nearest Neighbor...")
    logger("Identifying optimal number of neighbors...")
    # Tries all neighbor possibilities, based on either defaults or user
    # specified values
    for x in range(min_neighbors, max_neighbors):
        knn = KNeighborsClassifier(n_neighbors=x)
        knn.fit(X_train, y_train)
        models.append(knn)
        scores.append(accuracy_score(knn.predict(X_test), y_test))

    logger("Stored model under 'nearest_neighbors' key")

    # Keep the model with the HIGHEST test accuracy (the previous code
    # picked the minimum and reported its list index as the accuracy).
    best_score = max(scores)
    knn = models[scores.index(best_score)]

    # Previously placed after the return statement and never executed.
    clearLog()

    return {
        'id': generate_id(),
        "model": knn,
        "accuracy_score": best_score,
        "preprocesser": full_pipeline,
        "interpreter": label_mappings,
        "target": remove,
        "cross_val_score": cross_val_score(knn, X_train, y_train, cv=3)
    }
def dimensionality_PCA(instruction, dataset, ca_threshold=None):
    '''
    function to reduce dimensionality in dataset via principal component analysis
    :param instruction: command sent to client instance in written query.
    :param dataset: dataset location handed to DataReader
    :param ca_threshold: accepted for interface compatibility; not used here
    :return: tuple of (transformed train+test frame with target column,
        baseline accuracy on untransformed data, best accuracy on
        transformed data, raw column count minus transformed column count)
    '''
    global counter

    # A float n_components is passed straight to PCA.
    pca = PCA(0.92)

    reader = DataReader(dataset)
    dataset = reader.data_generator()

    splits, labels, target, full_pipeline = initial_preprocesser(
        dataset, instruction, True, 0.2, [], 0.2, random_state=49)

    X_train, X_test = splits['train'], splits['test']
    y_train, y_test = labels['train'], labels['test']

    # Components are learned on the training split only.
    X_train_mod = pca.fit_transform(X_train)
    X_test_mod = pca.transform(X_test)

    # Baseline tree on raw features, then a tree on the projected ones.
    baseline_tree = tree.DecisionTreeClassifier()
    projected_tree = tree.DecisionTreeClassifier()
    baseline_tree.fit(X_train, y_train)
    projected_tree.fit(X_train_mod, y_train)

    scores = [accuracy_score(projected_tree.predict(X_test_mod), y_test)]

    # Depth / criterion sweep on the projected data.
    for depth, criterion in product(range(3, 10), ["entropy", "gini"]):
        candidate = tree.DecisionTreeClassifier(
            criterion=criterion, max_depth=depth).fit(X_train_mod, y_train)
        scores.append(accuracy_score(candidate.predict(X_test_mod), y_test))

    # Train rows first, test rows after, with the matching labels appended.
    train_frame = pd.DataFrame(X_train_mod)
    test_frame = pd.DataFrame(X_test_mod)
    data_modified = pd.concat([train_frame, test_frame], axis=0)
    data_modified[target] = np.r_[y_train, y_test]
    # data_modified.to_csv("./data/housingPCA.csv")

    baseline_accuracy = accuracy_score(baseline_tree.predict(X_test), y_test)
    column_delta = len(dataset.columns) - len(data_modified.columns)
    return data_modified, baseline_accuracy, max(scores), column_delta
def dimensionality_KPCA(instruction, dataset, target="", y=""):
    '''
    function to reduce dimensionality in dataset via kernel principal component analysis
    :param instruction: command sent to client instance in written query.
    :param dataset: feature frame that is transformed and split
    :param target: column name of response variable/feature
    :param y: labels aligned row-by-row with dataset (derived from the last
        data file when target is empty)
    :return: tuple of (transformed frame with target column, baseline
        accuracy, best accuracy on transformed data, column count delta)
    '''
    global counter

    dataReader = DataReader("./data/" + get_last_file()[0])

    if target == "":
        data = dataReader.data_generator()
        data.fillna(0, inplace=True)
        remove = get_similar_column(get_value_instruction(instruction), data)

        y = data[remove]
        del data[remove]
        le = preprocessing.LabelEncoder()
        y = le.fit_transform(y)

    kpca = KernelPCA(n_components=len(dataset.columns), kernel="rbf")
    data_modified = kpca.fit_transform(dataset)

    # Same random_state on both calls, so the original and the transformed
    # data are split into identical row sets.
    X_train, X_test, y_train, y_test = train_test_split(
        dataset, y, test_size=0.2, random_state=49)
    X_train_mod, X_test_mod, y_train_mod, y_test_mod = train_test_split(
        data_modified, y, test_size=0.2, random_state=49)

    clf = tree.DecisionTreeClassifier()
    clf.fit(X_train, y_train)

    clf_mod = tree.DecisionTreeClassifier()
    clf_mod.fit(X_train_mod, y_train_mod)
    acc = []
    acc.append(accuracy_score(
        clf_mod.predict(X_test_mod), y_test_mod))

    # Small grid over tree depth and split criterion on transformed data.
    for i, j in product(range(3, 10), ["entropy", "gini"]):
        model = tree.DecisionTreeClassifier(criterion=j, max_depth=i)
        model = model.fit(X_train_mod, y_train_mod)
        acc.append(accuracy_score(model.predict(X_test_mod), y_test_mod))
    del i, j

    data_modified = pd.DataFrame(data_modified)
    # data_modified keeps the ORIGINAL row order, so it must receive y as
    # is.  The previous np.r_[y_train, y_test] was in shuffled split order
    # and attached labels to the wrong rows.
    data_modified[target] = y
    # data_modified.to_csv("./data/housingPCA.csv")

    return data_modified, accuracy_score(
        clf.predict(X_test), y_test), max(acc), (len(
            dataset.columns) - len(data_modified.columns))
def train_svm(instruction,
              dataset=None,
              test_size=0.2,
              kernel='linear',
              text=[],
              preprocess=True,
              ca_threshold=None,
              drop=None,
              cross_val_size=0.3,
              degree=3,
              gamma='scale',
              coef0=0.0,
              max_iter=-1,
              random_state=49):
    '''
    function to train a support vector machine classifier
    :param instruction: written query used to locate the target column
    :param dataset: dataset location handed to DataReader
    :param test_size: forwarded to initial_preprocesser
    :param kernel, degree, gamma, coef0, max_iter: forwarded to svm.SVC
    :param text, preprocess, ca_threshold: forwarded to initial_preprocesser
    :param drop: optional column label(s) removed before preprocessing
    :param cross_val_size: accepted for interface compatibility; not used here
    :param random_state: forwarded to initial_preprocesser
    :return a dictionary object with all of the information for the algorithm.
    '''
    logger("Reading in dataset")

    dataReader = DataReader(dataset)
    data = dataReader.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    logger("Preprocessing data")
    data, y, target, full_pipeline = initial_preprocesser(
        data, instruction, preprocess, ca_threshold, text,
        test_size=test_size, random_state=random_state)
    logger("->", "Target column found: {}".format(target))

    X_train = data['train']
    y_train = y['train']
    X_test = data['test']
    y_test = y['test']

    # Needed to make a custom label encoder due to train test split changes
    # Can still be inverse transformed, just a bit of extra work
    y_vals = np.unique(pd.concat([y['train'], y['test']], axis=0))

    # Count the distinct labels.  `y` is the {'train', 'test'} dict, so the
    # previous np.unique(y) counted the dict itself rather than the classes.
    num_classes = len(y_vals)

    label_mappings = sklearn.preprocessing.LabelEncoder()
    label_mappings.fit(y_vals)

    y_train = label_mappings.transform(y_train)
    y_test = label_mappings.transform(y_test)

    # Fitting to SVM and storing in the model dictionary
    logger("Fitting Support Vector Machine")
    clf = svm.SVC(
        kernel=kernel,
        degree=degree,
        gamma=gamma,
        coef0=coef0,
        max_iter=max_iter)
    clf.fit(X_train, y_train)

    score = accuracy_score(clf.predict(X_test), y_test)

    logger("->", "Accuracy found on testing set: {}".format(score))
    logger('->', "Stored model under 'svm' key")
    clearLog()

    return {
        'id': generate_id(),
        "model": clf,
        'num_classes': num_classes,
        "accuracy": {
            'cross_val_score': cross_val_score(clf, X_train, y_train),
            'accuracy_score': score
        },
        "target": target,
        "preprocesser": full_pipeline,
        "interpreter": label_mappings,
        'test_data': {
            'X': X_test,
            'y': y_test
        }
    }
def tune_helper(model_to_tune=None,
                dataset=None,
                models=None,
                max_layers=10,
                min_layers=2,
                min_dense=32,
                max_dense=512,
                executions_per_trial=3,
                max_trials=1,
                activation='relu',
                loss='categorical_crossentropy',
                metrics='accuracy',
                seed=42,
                objective='val_accuracy',
                directory='my_dir',
                epochs=10,
                step=32,
                verbose=0,
                test_size=0.2):
    '''
    function to tune the hyperparameters of a previously stored model
    :param model_to_tune: key of the model to tune; 'regression_ANN',
        'classification_ANN' and 'convolutional_NN' are handled
    :param dataset: dataset location (DataReader for the ANN branches,
        get_image_data for the CNN branch)
    :param models: dictionary of stored models; updated and returned
    :param many params: forwarded to tuneReg / tuneClass / tuneCNN.
    :return the models dictionary with the tuned entry updated.
    '''
    logger("Getting target model for tuning...")

    # checks to see which requested model is in the self.models

    # processing for regression feed forward NN
    if model_to_tune == 'regression_ANN':
        logger("Tuning model hyperparameters...")
        dataReader = DataReader(dataset)
        data = dataReader.data_generator()
        target = models['regression_ANN']['target']
        target_column = data[models['regression_ANN']['target']]
        # Re-apply the preprocessing pipeline stored with the model.
        data = models['regression_ANN']['preprocesser'].transform(
            data.drop(target, axis=1))
        returned_model, returned_pms, history = tuneReg(
            data,
            target_column,
            max_layers=max_layers,
            min_layers=min_layers,
            min_dense=min_dense,
            max_dense=max_dense,
            executions_per_trial=executions_per_trial,
            max_trials=max_trials,
            epochs=epochs,
            activation=activation,
            step=step,
            verbose=verbose,
            test_size=test_size)
        models['regression_ANN'] = {
            'model': returned_model,
            'hyperparametes': returned_pms,
            'losses': {
                'training_loss': history.history['loss'],
                'val_loss': history.history['val_loss']
            }
        }

    # processing for classification feed forward NN
    elif model_to_tune == "classification_ANN":
        logger("Tuning model hyperparameters...")
        dataReader = DataReader(dataset)
        data = dataReader.data_generator()
        target = models['classification_ANN']['target']
        target_column = data[models['classification_ANN']['target']]
        # Re-apply the preprocessing pipeline stored with the model.
        data = models['classification_ANN']['preprocesser'].transform(
            data.drop(target, axis=1))
        returned_model, returned_pms, history = tuneClass(
            data,
            target_column,
            models['classification_ANN']['num_classes'],
            max_layers=max_layers,
            min_layers=min_layers,
            min_dense=min_dense,
            max_dense=max_dense,
            executions_per_trial=executions_per_trial,
            max_trials=max_trials,
            activation=activation,
            loss=loss,
            metrics=metrics,
            epochs=epochs,
            step=step,
            verbose=verbose,
            test_size=test_size)
        models['classification_ANN'] = {
            'model': returned_model,
            'hyperparametes': returned_pms,
            'losses': {
                'training_loss': history.history['loss'],
                'val_loss': history.history['val_loss']
            }
        }

    # processing for convolutional NN
    elif model_to_tune == "convolutional_NN":
        logger("Tuning model hyperparameters...")
        X_train, X_test, height, width, num_classes = get_image_data(dataset)
        model, returned_pms, history = tuneCNN(
            X_train,
            X_test,
            height,
            width,
            num_classes,
            executions_per_trial=executions_per_trial,
            max_trials=max_trials,
            seed=seed,
            objective=objective,
            directory=directory,
            epochs=epochs,
            verbose=verbose,
            test_size=test_size)
        # Unlike the ANN branches, the existing entry is updated in place.
        models["convolutional_NN"]["model"] = model
        # A stray trailing comma used to wrap this value in a 1-tuple,
        # unlike the other two branches.
        models["convolutional_NN"]["hyperparametes"] = returned_pms
        models["convolutional_NN"]["losses"] = {
            'training_loss': history.history['loss'],
            'val_loss': history.history['val_loss']
        }

    return models
def dimensionality_ICA(instruction, dataset, target="", y=""):
    '''
    function to reduce dimensionality in dataset via independent component analysis
    :param instruction: command sent to client instance in written query.
    :param dataset: dataset location handed to DataReader
    :param target: column name of response variable/feature
    :param y: replaced by the preprocesser output when target is empty
    :return: tuple of (best transformed frame with target column, its
        accuracy, number of columns removed relative to the training data)
    '''
    global counter

    dataReader = DataReader(dataset)

    # NOTE(review): the train/test splits are only built when no target is
    # supplied - confirm how callers that pass a target are meant to work.
    if target == "":
        data = dataReader.data_generator()
        data.fillna(0, inplace=True)
        remove = get_similar_column(get_value_instruction(instruction), data)

        data, y, target, full_pipeline = initial_preprocessor(
            data, instruction, True, 0.2, [], 0.2, random_state=49)

        X_train = data['train']
        X_test = data['test']

        y_train = y['train']
        y_test = y['test']

    def _combined_frame(train_part, test_part):
        # Train rows followed by test rows, with the matching labels.
        # pd.concat replaces DataFrame.append, which pandas 2 removed.
        frame = pd.concat(
            [pd.DataFrame(train_part), pd.DataFrame(test_part)])
        frame[target] = np.r_[y_train, y_test]
        return frame

    # Full-width decomposition first.
    pca = FastICA(n_components=len(X_train.columns))
    X_train_mod = pca.fit_transform(X_train)
    # The test split is projected with the model fitted on the training
    # split; re-fitting on the test data (as before) produced components
    # unrelated to the ones the classifier was trained on.
    X_test_mod = pca.transform(X_test)

    clf = tree.DecisionTreeClassifier()
    clf.fit(X_train, y_train)

    clf_mod = tree.DecisionTreeClassifier()
    clf_mod.fit(X_train_mod, y_train)
    acc = []
    sets = []
    acc.append(accuracy_score(
        clf_mod.predict(X_test_mod), y_test))

    sets.append(_combined_frame(X_train_mod, X_test_mod))

    # Then every smaller number of components.  No `del i` afterwards: it
    # raised when the range was empty (fewer than three feature columns).
    for i in range(2, len(X_train.columns)):
        pca = FastICA(n_components=i)
        X_train_mod = pca.fit_transform(X_train)
        X_test_mod = pca.transform(X_test)

        sets.append(_combined_frame(X_train_mod, X_test_mod))

        clf_mod = tree.DecisionTreeClassifier()
        clf_mod.fit(X_train_mod, y_train)

        acc.append(accuracy_score(
            clf_mod.predict(X_test_mod), y_test))

    score = max(acc)
    data_modified = sets[acc.index(score)]

    # +1 accounts for the target column added to data_modified.
    return data_modified, score, ((len(
        X_train.columns) + 1) - len(data_modified.columns))
def classification_ann(instruction,
                       callback=False,
                       dataset=None,
                       text=[],
                       ca_threshold=None,
                       preprocess=True,
                       callback_mode='min',
                       drop=None,
                       random_state=49,
                       test_size=0.2,
                       epochs=50,
                       generate_plots=True,
                       maximizer="val_accuracy",
                       save_model=False,
                       save_path=os.getcwd(),
                       add_layer={}):
    '''
    Body of the classification function that is called in the neural network query
    if the data is categorical. Models with an increasing layer index are trained
    until the validation accuracy stops improving; the most accurate one is returned.
    :param many parameters: used to preprocess, tune, plot generation, and parameterizing the neural network trained.
    :return dictionary that holds all the information for the finished model.
    '''

    def _print_row(cells, width):
        # One pipe-delimited table row, indented by the module-level counter.
        print((" " * 2 * counter) + "| " +
              ("".join(cell.ljust(width) for cell in cells)) + " |")

    def _train(network):
        # Every candidate is trained with identical settings.
        return network.fit(X_train,
                           y_train,
                           callbacks=callback_value,
                           epochs=epochs,
                           validation_data=(X_test, y_test),
                           verbose=0)

    source = get_file() if dataset is None else dataset
    dataReader = DataReader(source)
    logger("Reading in dataset")
    data = dataReader.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    data, y, remove, full_pipeline = initial_preprocessor(
        data, instruction, preprocess, ca_threshold, text,
        test_size=test_size, random_state=random_state)
    logger("->", "Target column found: {}".format(remove))

    # Train labels are stacked on top of test labels so both are encoded
    # together and can be sliced apart again by position afterwards.
    y = pd.concat([y['train'], y['test']], axis=0)

    num_classes = len(np.unique(y))
    if num_classes < 2:
        raise Exception("Number of classes must be greater than or equal to 2")

    X_train = data['train']
    X_test = data['test']

    if num_classes >= 2:
        # ANN needs target one hot encoded for classification
        one_hotencoder = OneHotEncoder()
        encoded = one_hotencoder.fit_transform(
            np.reshape(y.values, (-1, 1))).toarray()
        y = pd.DataFrame(encoded, columns=one_hotencoder.get_feature_names())

    split_at = len(X_train)
    y_train = y.iloc[:split_at]
    y_test = y.iloc[split_at:]

    models = []      # training histories, one per candidate
    losses = []
    accuracies = []  # final validation accuracy, one per candidate
    model_data = []  # the trained networks themselves

    logger("Establishing callback function")

    # early stopping callback
    es = EarlyStopping(
        monitor=maximizer,
        mode='max',
        verbose=0,
        patience=5)
    callback_value = [es] if callback is not False else None

    i = 0
    model = get_keras_model_class(data, i, num_classes, add_layer)
    logger("Training initial model")

    history = _train(model)

    model_data.append(model)
    models.append(history)

    header = [
        "Initial number of layers ", "| Training Accuracy ",
        "| Test Accuracy "
    ]
    col_width = max(len(word) for word in header) + 2
    _print_row(header, col_width)

    record = history.history
    last_epoch = len(record['val_accuracy']) - 1
    _print_row([
        str(len(model.layers)),
        "| " + str(record['accuracy'][last_epoch]),
        "| " + str(record['val_accuracy'][last_epoch]),
    ], col_width)
    # print((" " * 2 * counter)+ tabulate(datax, headers=col_name, tablefmt='orgtbl'))

    losses.append(record[maximizer][-1])
    accuracies.append(record['val_accuracy'][-1])

    # keeps building and fitting models until the validation accuracy of
    # the newest one no longer beats the one before it
    logger("Testing number of layers")
    header = [
        "Current number of layers", "| Training Accuracy", "| Test Accuracy"
    ]
    col_width = max(len(word) for word in header) + 2
    _print_row(header, col_width)

    # while all(x < y for x, y in zip(accuracies, accuracies[1:])):
    while len(accuracies) <= 2 or accuracies[-1] > accuracies[-2]:
        model = get_keras_model_class(data, i, num_classes, add_layer)
        history = _train(model)

        record = history.history
        _print_row([
            str(len(model.layers)),
            "| " + str(record['accuracy'][-1]),
            "| " + str(record['val_accuracy'][-1]),
        ], col_width)

        losses.append(record[maximizer][-1])
        accuracies.append(record['val_accuracy'][-1])
        models.append(history)
        model_data.append(model)

        i += 1

    # Select the candidate with the highest validation accuracy.
    best = accuracies.index(max(accuracies))
    final_model = model_data[best]
    final_hist = models[best]
    final_record = final_hist.history
    final_epoch = len(final_record['val_accuracy']) - 1

    print("")
    logger('->', "Best number of layers found: " +
           str(len(final_model.layers)))
    logger('->', "Training Accuracy: " +
           str(final_record['accuracy'][final_epoch]))
    logger('->', "Test Accuracy: " +
           str(final_record['val_accuracy'][final_epoch]))

    # generates appropriate classification plots; the history of the LAST
    # trained model is what gets passed in
    plots = {}
    if generate_plots:
        plots = generate_classification_plots(models[-1])

    if save_model:
        save(final_model, save_model, save_path)

    print("")
    logger("Stored model under 'classification_ANN' key")
    clearLog()

    K.clear_session()

    # stores the values and plots into the object dictionary
    return {
        'id': generate_id(),
        "model": final_model,
        'num_classes': num_classes,
        "plots": plots,
        "target": remove,
        "preprocessor": full_pipeline,
        "interpreter": one_hotencoder,
        'test_data': {
            'X': X_test,
            'y': y_test
        },
        'losses': {
            'training_loss': final_hist.history['loss'],
            'val_loss': final_hist.history['val_loss']
        },
        'accuracy': {
            'training_accuracy': final_hist.history['accuracy'],
            'validation_accuracy': final_hist.history['val_accuracy']
        }
    }
def train_xgboost(instruction,
                  dataset=None,
                  learning_rate=0.1,
                  n_estimators=1000,
                  ca_threshold=None,
                  max_depth=6,
                  min_child_weight=1,
                  gamma=0,
                  subsample=0.8,
                  colsample_bytree=0.8,
                  objective='binary:logistic',
                  random_state=27,
                  test_size=0.2,
                  text=[],
                  preprocess=True,
                  verbosity=0,
                  drop=None):
    '''
    function to train a xgboost algorithm
    :param instruction: written query used to locate the target column
    :param dataset: dataset location handed to DataReader
    :param objective: XGBoost objective; replaced by 'multi:softmax' when
        more than two classes are found
    :param many params: used to hyperparametrize the function.
    :return a dictionary object with all of the information for the algorithm.
    '''
    logger("Reading in dataset")

    dataReader = DataReader(dataset)
    data = dataReader.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    logger("Preprocessing data")
    data, y, target, full_pipeline = initial_preprocesser(
        data, instruction, preprocess, ca_threshold, text,
        test_size=test_size, random_state=random_state)
    logger("->", "Target column found: {}".format(target))

    X_train = data['train']
    y_train = y['train']
    X_test = data['test']
    y_test = y['test']

    # Needed to make a custom label encoder due to train test split changes
    # Can still be inverse transformed, just a bit of extra work
    y_vals = np.unique(pd.concat([y['train'], y['test']], axis=0))

    # Count the distinct labels.  `y` is the {'train', 'test'} dict, so the
    # previous np.unique(y) never exceeded 2 and the multi-class objective
    # below was never selected.
    num_classes = len(y_vals)
    if num_classes > 2:
        objective = 'multi:softmax'

    label_mappings = sklearn.preprocessing.LabelEncoder()
    label_mappings.fit(y_vals)

    y_train = label_mappings.transform(y_train)
    y_test = label_mappings.transform(y_test)

    # Fitting to XGBoost and storing in the model dictionary
    logger("Fitting XGBoost")
    clf = XGBClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_child_weight=min_child_weight,
        gamma=gamma,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        objective=objective,
        verbosity=verbosity,
        random_state=random_state)
    clf.fit(X_train, y_train)

    score = accuracy_score(clf.predict(X_test), y_test)

    logger("->", "Accuracy found on testing set: {}".format(score))
    logger('->', "Stored model under 'xgboost' key")
    # NOTE(review): clearLog() is called twice in a row here - kept as is,
    # confirm whether the second call is needed.
    clearLog()
    clearLog()

    return {
        'id': generate_id(),
        "model": clf,
        "target": target,
        'num_classes': num_classes,
        "accuracy": {
            'cross_val_score': cross_val_score(
                clf,
                X_train,
                y_train,
            ),
            'accuracy_score': score
        },
        "accuracy_score": score,
        "preprocesser": full_pipeline,
        "interpreter": label_mappings,
        'test_data': {
            'X': X_test,
            'y': y_test
        }
    }
def summarization_query(self, instruction, preprocess=True, label_column=None,
                        drop=None,
                        epochs=10,
                        batch_size=32,
                        learning_rate=1e-4,
                        max_text_length=512,
                        max_summary_length=150,
                        test_size=0.2,
                        random_state=49,
                        gpu=False,
                        generate_plots=True,
                        save_model=False,
                        save_path=os.getcwd()):
    '''
    function to apply algorithm for text summarization
    :param instruction: written query used to locate the text column
    :param label_column: name of the summary column ("summary" when None)
    :param many params: used to hyperparametrize the function.
    :return a dictionary object with all of the information for the algorithm.
    :raises Exception: when an argument fails validation (message says which)
    '''
    if test_size < 0:
        raise Exception("Test size must be a float between 0 and 1")

    if test_size >= 1:
        raise Exception(
            "Test size must be a float between 0 and 1 (a test size greater than or equal to 1 results in no training "
            "data)")

    # Logical `or`: the previous bitwise `|` bound tighter than `<`, which
    # turned this into a chained comparison that let too-short lengths
    # through.
    if max_text_length < 2 or max_summary_length < 2:
        raise Exception("Text and summary must be at least of length 2")

    if epochs < 1:
        raise Exception(
            "Epoch number is less than 1 (model will not be trained)")

    if batch_size < 1:
        raise Exception("Batch size must be equal to or greater than 1")

    if max_text_length < 1:
        raise Exception("Max text length must be equal to or greater than 1")

    if max_summary_length < 1:
        raise Exception(
            "Max summary length must be equal to or greater than 1")

    if save_model:
        if not os.path.exists(save_path):
            raise Exception("Save path does not exists")

    # A test size of exactly 0 switches validation off entirely.
    testing = test_size != 0

    device = "cuda" if gpu else "cpu"

    data = DataReader(self.dataset)
    data = data.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    if preprocess:
        data.fillna(0, inplace=True)

    logger("Preprocessing data...")

    label = "summary" if label_column is None else label_column

    X, Y, target = get_target_values(data, instruction, label)
    df = pd.DataFrame({'text': Y, 'ctext': X})
    logger("->", "Target Column Found: {}".format(target))

    torch.manual_seed(random_state)
    np.random.seed(random_state)

    tokenizer = T5Tokenizer.from_pretrained("t5-small")

    train_size = 1 - test_size
    # Sample first and keep the original index so the validation rows can
    # be taken as the exact complement.  Resetting the index before the
    # drop (as before) removed the FIRST n rows instead of the sampled
    # ones, leaking training rows into validation.
    train_sample = df.sample(frac=train_size, random_state=random_state)
    train_dataset = train_sample.reset_index(drop=True)

    logger("Establishing dataset walkers")
    training_set = CustomDataset(
        train_dataset,
        tokenizer,
        max_text_length,
        max_summary_length)

    if testing:
        val_dataset = df.drop(train_sample.index).reset_index(drop=True)

        val_set = CustomDataset(
            val_dataset,
            tokenizer,
            max_text_length,
            max_summary_length)

        val_params = {
            'batch_size': batch_size,
            'shuffle': False,
            'num_workers': 0
        }
        val_loader = DataLoader(val_set, **val_params)
    else:
        val_loader = None

    train_params = {
        'batch_size': batch_size,
        'shuffle': True,
        'num_workers': 0
    }

    training_loader = DataLoader(training_set, **train_params)

    # used small model
    model = T5ForConditionalGeneration.from_pretrained("t5-small")
    model = model.to(device)

    optimizer = torch.optim.Adam(
        params=model.parameters(), lr=learning_rate)

    logger('Fine-Tuning the model on your dataset...')
    total_loss_train = []
    total_loss_val = []
    for epoch in range(epochs):
        loss_train, loss_val = train(
            epoch, tokenizer, model, device, training_loader, val_loader,
            optimizer, testing=testing)
        total_loss_train.append(loss_train)
        total_loss_val.append(loss_val)

    logger("->", "Final training loss: {}".format(loss_train))
    if testing:
        logger("->", "Final validation loss: {}".format(loss_val))
    else:
        logger("->", "Final validation loss: {}".format(
            "0, No validation done"))

    plots = {}
    if generate_plots:
        logger("Generating plots")
        plots.update({
            "loss": libra.plotting.nonkeras_generate_plots.plot_loss(
                total_loss_train, total_loss_val)
        })

    if save_model:
        logger("Saving model")
        # os.path.join inserts the separator that plain concatenation
        # omitted whenever save_path had no trailing slash.
        path = os.path.join(save_path, "DocSummarization.pth")
        torch.save(model, path)
        logger("->", "Saved model to disk as DocSummarization.pth")

    logger(
        "Storing information in client object under key 'doc_summarization'")

    self.models["doc_summarization"] = {
        "model": model,
        "max_text_length": max_text_length,
        "max_sum_length": max_summary_length,
        "plots": plots,
        # Losses of the final epoch only.
        'losses': {
            'training_loss': loss_train,
            'val_loss': loss_val
        }
    }
    clearLog()
    return self.models["doc_summarization"]
def dimensionality_reduc(
        instruction,
        dataset,
        arr=[
            "RF",
            "PCA",
            "KPCA",
            "ICA"],
        inplace=False):
    '''
    Run every ordered combination of the requested dimensionality reducers
    and print the final accuracy each combination reaches.

    :param instruction: text used to locate the target column.
    :param dataset: dataset location handed to DataReader (and written back
        to when inplace is True).
    :param arr: reducer names to combine; each must be one of
        "RF", "PCA", "KPCA" or "ICA". The list is only read, never mutated.
    :param inplace: when True, the preprocessed data is written to `dataset`
        as CSV.
    :return None; results are printed.
    :raises ValueError: when `arr` contains an unknown reducer name.
    '''
    dataReader = DataReader(dataset)

    logger("loading dataset...")
    data = dataReader.data_generator()
    data.fillna(0, inplace=True)

    logger("getting most similar column from instruction...")
    target = get_similar_column(get_value_instruction(instruction), data)

    y = data[target]
    del data[target]
    le = preprocessing.LabelEncoder()
    y = le.fit_transform(y)

    data = structured_preprocesser(data)

    logger("generating dimensionality permutations...")
    perms = []
    for size in range(1, len(arr) + 1):
        perms.extend(permutations(arr, size))

    logger("running each possible permutation...")
    logger("realigning tensors...")
    # finals[k] holds [data, beginning accuracy, final accuracy, removed
    # columns] of the LAST reducer applied in perms[k].
    finals = []
    for path in perms:
        currSet = data
        step_result = None
        for element in path:
            if element == "RF":
                step_result = dimensionality_RF(
                    instruction, currSet, target, y)
            elif element == "PCA":
                step_result = dimensionality_PCA(
                    instruction, currSet, target, y)
            elif element == "KPCA":
                step_result = dimensionality_KPCA(
                    instruction, currSet, target, y)
            elif element == "ICA":
                step_result = dimensionality_ICA(
                    instruction, currSet, target, y)
            else:
                # Previously an unknown name silently reused the result of
                # the preceding step (or hit an unbound variable).
                raise ValueError(
                    "Unknown dimensionality reduction method: {!r}".format(
                        element))
            data_mod, beg_acc, final_acc, col_removed = step_result
            # each reducer works on the output of the previous one
            currSet = data_mod
        finals.append([data_mod, beg_acc, final_acc, col_removed])

    logger("Fetching Best Accuracies...")
    accs = []
    baseline = finals[0][1]
    print("")
    print("Baseline Accuracy: " + str(baseline))
    print("----------------------------")
    # One result per permutation: pair them positionally. The Cartesian
    # product used before printed every result under every permutation.
    for perm, result in zip(perms, finals):
        summary = ("Permutation --> " + str(perm) +
                   " | Final Accuracy --> " + str(result[2]))
        print(summary)
        if baseline < result[2]:
            accs.append([summary])
    print("")
    print("Best Accuracies")
    print("----------------------------")
    for element in accs:
        print(element)

    if inplace:
        data.to_csv(dataset)
def nearest_neighbors(instruction=None,
                      dataset=None,
                      ca_threshold=None,
                      preprocess=True,
                      drop=None,
                      min_neighbors=3,
                      max_neighbors=10,
                      leaf_size=30,
                      p=2,
                      test_size=0.2,
                      random_state=49,
                      algorithm='auto',
                      text=[]):
    '''
    function to train a nearest neighbor algorithm

    One KNeighborsClassifier is fitted for every neighbor count in
    range(min_neighbors, max_neighbors) (upper bound excluded) and the one
    with the highest accuracy on the test split is returned.

    :param instruction: text used by the preprocessor to find the target.
    :param dataset: dataset location handed to DataReader.
    :param ca_threshold, preprocess, text: forwarded to initial_preprocesser.
    :param drop: column(s) removed from the dataset before preprocessing.
    :param min_neighbors: first neighbor count tried.
    :param max_neighbors: neighbor counts are tried up to, but not
        including, this value.
    :param leaf_size, p, algorithm: forwarded to KNeighborsClassifier.
    :param test_size, random_state: forwarded to initial_preprocesser.
    :return a dictionary object with all of the information for the algorithm.
    :raises ValueError: when the neighbor range is empty.
    '''
    logger("Reading in dataset")
    # Reads in dataset
    dataReader = DataReader(dataset)
    data = dataReader.data_generator()
    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    logger("Preprocessing data")
    data, y, remove, full_pipeline = initial_preprocesser(
        data,
        instruction,
        preprocess,
        ca_threshold,
        text,
        test_size=test_size,
        random_state=random_state)
    logger("->", "Target column found: {}".format(remove))

    X_train = data['train']
    y_train = y['train']
    X_test = data['test']
    y_test = y['test']

    # encodes the label dataset into integer classes
    y_vals = np.unique(pd.concat([y['train'], y['test']], axis=0))
    # Count classes from the label values; `y` itself is the train/test
    # dict, so np.unique(y) does not count labels.
    num_classes = len(y_vals)
    label_mappings = sklearn.preprocessing.LabelEncoder()
    label_mappings.fit(y_vals)
    y_train = label_mappings.transform(y_train)
    y_test = label_mappings.transform(y_test)
    logger("Labels being mapped to appropriate classes")

    models = []
    scores = []
    num_neighbors = []
    logger("Fitting nearest neighbors model")
    logger("Identifying optimal number of neighbors")
    # Tries all neighbor possibilities, based on either defaults or user
    # specified values
    for x in range(min_neighbors, max_neighbors):
        knn = KNeighborsClassifier(n_neighbors=x,
                                   leaf_size=leaf_size,
                                   p=p,
                                   algorithm=algorithm)
        knn.fit(X_train, y_train)
        models.append(knn)
        scores.append(accuracy_score(knn.predict(X_test), y_test))
        num_neighbors.append(x)

    if not scores:
        raise ValueError(
            "No neighbor counts to try: min_neighbors must be smaller than "
            "max_neighbors")

    best_index = scores.index(max(scores))
    best_score = scores[best_index]
    logger(
        "->", "Optimal number of neighbors found: {}".format(
            num_neighbors[best_index]))
    logger("->", "Accuracy found on testing set: {}".format(best_score))
    logger("Stored model under 'nearest_neighbors' key")

    # Return the model that produced the reported (highest) accuracy; the
    # lowest-scoring model used to be selected here by mistake.
    knn = models[best_index]
    clearLog()
    return {
        'id': generate_id(),
        "model": knn,
        'num_classes': num_classes,
        "accuracy": {
            'accuracy_score': best_score,
            'cross_val_score': cross_val_score(knn, X_train, y_train, cv=3)
        },
        "preprocesser": full_pipeline,
        "interpreter": label_mappings,
        'test_data': {
            'X': X_test,
            'y': y_test
        },
        "target": remove
    }
def classification_ann(instruction,
                       dataset=None,
                       text=None,
                       ca_threshold=None,
                       preprocess=True,
                       callback_mode='min',
                       drop=None,
                       random_state=49,
                       test_size=0.2,
                       epochs=50,
                       generate_plots=True,
                       maximizer="val_loss",
                       save_model=True,
                       save_path=os.getcwd()):
    '''
    Train feed-forward classifiers with an increasing layer index until the
    monitored metric stops decreasing, and return the best one.

    :param instruction: text used by the preprocessor to find the target.
    :param dataset: dataset location handed to DataReader.
    :param text, ca_threshold, preprocess: forwarded to initial_preprocesser.
    :param callback_mode: `mode` of the EarlyStopping callback.
    :param drop: column(s) removed from the dataset before preprocessing.
    :param random_state, test_size: accepted but not used by this function.
    :param epochs: maximum epochs per fit.
    :param generate_plots: when False, no plots are generated and "plots"
        is an empty dict.
    :param maximizer: history key monitored by EarlyStopping and compared
        between candidate models (lower is treated as better).
    :param save_model: when truthy, the best model is passed to save().
    :param save_path: accepted but not used by this function.
    :return dictionary with the best model, its training history, plots,
        the fitted preprocessor and the one-hot encoder.
    '''
    logger("Reading in dataset...")

    dataReader = DataReader(dataset)
    data = dataReader.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    data, y, remove, full_pipeline = initial_preprocesser(
        data, instruction, preprocess, ca_threshold, text)
    logger("->", "Target Column Found: {}".format(remove))

    # Needed to make a custom label encoder due to train test split changes
    # Can still be inverse transformed, just a bit of extra work
    y = pd.concat([y['train'], y['test']], axis=0)

    num_classes = len(np.unique(y))

    X_train = data['train']
    X_test = data['test']

    # ANN needs target one hot encoded for classification
    one_hot_encoder = OneHotEncoder()

    y = pd.DataFrame(one_hot_encoder.fit_transform(
        np.reshape(y.values, (-1, 1))).toarray(),
        columns=one_hot_encoder.get_feature_names())

    # y was concatenated train-first, so the split point is len(X_train)
    y_train = y.iloc[:len(X_train)]
    y_test = y.iloc[len(X_train):]

    models = []       # fit histories, index-aligned with model_data/losses
    losses = []
    model_data = []   # fitted models

    def _print_rows(rows, width):
        # one "| cell cell cell |" line per row, indented by the log depth
        for row in rows:
            print((" " * 2 * counter) + "| " +
                  ("".join(word.ljust(width) for word in row)) + " |")

    def _loss_row(fitted_model, fit_history):
        # layer count plus last-epoch training and validation loss
        last = len(fit_history.history['val_loss']) - 1
        return [
            str(len(fitted_model.layers)),
            "| " + str(fit_history.history['loss'][last]),
            "| " + str(fit_history.history['val_loss'][last])
        ]

    logger("Establishing callback function...")

    # early stopping callback; honours callback_mode (it used to be fixed
    # to 'min' regardless of the argument)
    es = EarlyStopping(monitor=maximizer,
                       mode=callback_mode,
                       verbose=0,
                       patience=5)

    i = 0
    model = get_keras_model_class(data, i, num_classes)
    logger("Training initial model...")
    history = model.fit(X_train,
                        y_train,
                        epochs=epochs,
                        validation_data=(X_test, y_test),
                        callbacks=[es],
                        verbose=0)

    model_data.append(model)
    models.append(history)
    col_name = [[
        "Initial number of layers ", "| Training Loss ", "| Test Loss "
    ]]
    col_width = max(len(word) for row in col_name for word in row) + 2
    _print_rows(col_name, col_width)
    _print_rows([_loss_row(model, history)], col_width)
    losses.append(history.history[maximizer][-1])

    # keeps running model and fit functions until the validation loss stops
    # decreasing
    logger("Testing number of layers...")
    col_name = [["Current number of layers", "| Training Loss", "| Test Loss"]]
    col_width = max(len(word) for row in col_name for word in row) + 2
    _print_rows(col_name, col_width)

    # NOTE(review): the first pass reuses layer index 0, the same index as
    # the initial model — confirm this is intended.
    while all(x > y for x, y in zip(losses, losses[1:])):
        model = get_keras_model_class(data, i, num_classes)
        history = model.fit(X_train,
                            y_train,
                            epochs=epochs,
                            validation_data=(X_test, y_test),
                            callbacks=[es],
                            verbose=0)
        # Record every candidate so the index of the best loss below refers
        # to the matching model/history; without this only the initial
        # model was ever stored.
        model_data.append(model)
        models.append(history)

        _print_rows([_loss_row(model, history)], col_width)
        losses.append(history.history[maximizer][-1])
        i += 1

    best_index = losses.index(min(losses))
    final_model = model_data[best_index]
    final_hist = models[best_index]

    print("")
    logger('->',
           "Best number of layers found: " + str(len(final_model.layers)))
    last_epoch = len(final_hist.history['val_accuracy']) - 1
    logger(
        '->', "Training Accuracy: " +
        str(final_hist.history['accuracy'][last_epoch]))
    logger(
        '->', "Test Accuracy: " +
        str(final_hist.history['val_accuracy'][last_epoch]))

    # generates appropriate classification plots by feeding all information
    plots = {}
    if generate_plots:
        plots = generate_classification_plots(models[-1], data, y, model,
                                              X_test, y_test)

    if save_model:
        # NOTE(review): save_path is not forwarded; save()'s signature is
        # not visible here — confirm before changing.
        save(final_model, save_model)

    print("")
    logger("Stored model under 'classification_ANN' key")

    # stores the values and plots into the object dictionary
    return {
        'id': generate_id(),
        "model": final_model,
        'num_classes': num_classes,
        "plots": plots,
        "target": remove,
        "preprocesser": full_pipeline,
        "interpreter": one_hot_encoder,
        'losses': {
            'training_loss': final_hist.history['loss'],
            'val_loss': final_hist.history['val_loss']
        },
        'accuracy': {
            'training_accuracy': final_hist.history['accuracy'],
            'validation_accuracy': final_hist.history['val_accuracy']
        }
    }
def decision_tree(instruction,
                  dataset=None,
                  preprocess=True,
                  ca_threshold=None,
                  text=[],
                  test_size=0.2,
                  drop=None,
                  criterion='gini',
                  splitter='best',
                  max_depth=None,
                  min_samples_split=2,
                  min_samples_leaf=1,
                  min_weight_fraction_leaf=0.0,
                  max_leaf_nodes=None,
                  min_impurity_decrease=0.0,
                  ccp_alpha=0.0):
    '''
    function to train a decision tree algorithm.

    :param instruction: text used by the preprocessor to find the target.
    :param dataset: dataset location handed to DataReader.
    :param preprocess, ca_threshold, text, test_size: forwarded to
        initial_preprocesser.
    :param drop: column(s) removed from the dataset before preprocessing.
    :param criterion, splitter, max_depth, min_samples_split,
        min_samples_leaf, min_weight_fraction_leaf, max_leaf_nodes,
        min_impurity_decrease, ccp_alpha: forwarded unchanged to
        tree.DecisionTreeClassifier.
    :return a dictionary object with all of the information for the algorithm.
    '''
    logger("Reading in dataset")

    dataReader = DataReader(dataset)
    data = dataReader.data_generator()
    logger("Preprocessing data")
    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    # test_size is forwarded so the argument takes effect, matching how
    # nearest_neighbors calls the same preprocessor.
    data, y, remove, full_pipeline = initial_preprocesser(
        data, instruction, preprocess, ca_threshold, text,
        test_size=test_size)
    logger("->", "Target column found: {}".format(remove))

    X_train = data['train']
    y_train = y['train']
    X_test = data['test']
    y_test = y['test']

    # Needed to make a custom label encoder due to train test split changes
    # Can still be inverse transformed, just a bit of extra work
    y_vals = np.unique(pd.concat([y['train'], y['test']], axis=0))
    label_mappings = sklearn.preprocessing.LabelEncoder()
    label_mappings.fit(y_vals)

    y_train = label_mappings.transform(y_train)
    y_test = label_mappings.transform(y_test)
    logger("Labels being mapped to appropriate classes")

    # Count classes from the label values; `y` itself is the train/test
    # dict, so np.unique(y) does not count labels.
    num_classes = len(y_vals)

    # fitting and storing
    logger("Fitting Decision Tree")

    clf = tree.DecisionTreeClassifier(
        criterion=criterion,
        splitter=splitter,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        ccp_alpha=ccp_alpha)
    clf = clf.fit(X_train, y_train)

    score = accuracy_score(clf.predict(X_test), y_test)
    logger("->", "Score found on testing set: {}".format(score))
    logger("Stored model under 'decision_tree' key")
    clearLog()

    # "accuracy_score" appears both nested and at the top level; both are
    # kept so existing readers of either key continue to work.
    return {
        'id': generate_id(),
        "model": clf,
        "target": remove,
        'num_classes': num_classes,
        "accuracy": {
            'cross_val_score': cross_val_score(clf, X_train, y_train, cv=3),
            'accuracy_score': score
        },
        "accuracy_score": score,
        "preprocesser": full_pipeline,
        "interpreter": label_mappings,
        'test_data': {
            'X': X_test,
            'y': y_test
        }
    }
def regression_ann(instruction,
                   ca_threshold=None,
                   text=None,
                   dataset=None,
                   drop=None,
                   preprocess=True,
                   test_size=0.2,
                   random_state=49,
                   epochs=50,
                   generate_plots=True,
                   callback_mode='min',
                   maximizer="val_loss",
                   save_model=True,
                   save_path=os.getcwd()):
    '''
    Train feed-forward regressors with an increasing layer index until the
    monitored metric stops decreasing, and return the best one.

    :param instruction: text used by the preprocessor to find the target.
    :param ca_threshold, text, preprocess: forwarded to initial_preprocesser.
    :param dataset: dataset location handed to DataReader.
    :param drop: column(s) removed from the dataset before preprocessing.
    :param test_size, random_state: accepted but not used by this function.
    :param epochs: maximum epochs per fit.
    :param generate_plots: when False, "plots" is an empty dict.
    :param callback_mode: `mode` of the EarlyStopping callback.
    :param maximizer: history key monitored by EarlyStopping and compared
        between candidate models (lower is treated as better).
    :param save_model: when truthy, the best model is passed to save().
    :param save_path: accepted but not used by this function.
    :return dictionary with the best model, its losses, plots, the fitted
        preprocessor and the target scaler.
    '''
    global currLog
    logger("reading in dataset...")

    dataReader = DataReader(dataset)
    data = dataReader.data_generator()
    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    data, y, target, full_pipeline = initial_preprocesser(
        data, instruction, preprocess, ca_threshold, text)
    logger("->", "Target Column Found: {}".format(target))

    X_train = data['train']
    X_test = data['test']

    # Target scaling: fitted on the training targets only
    target_scaler = StandardScaler()
    y_train = target_scaler.fit_transform(np.array(y['train']).reshape(-1, 1))
    y_test = target_scaler.transform(np.array(y['test']).reshape(-1, 1))

    logger("Establishing callback function...")

    models = []       # fit histories, index-aligned with model_data/losses
    losses = []
    model_data = []   # fitted models

    def _print_rows(rows, width):
        # one "| cell cell cell |" line per row, indented by the log depth
        for row in rows:
            print((" " * 2 * counter) + "| " +
                  ("".join(word.ljust(width) for word in row)) + " |")

    def _loss_row(fitted_model, fit_history):
        # layer count plus last-epoch training and validation loss
        last = len(fit_history.history['val_loss']) - 1
        return [
            str(len(fitted_model.layers)),
            "| " + str(fit_history.history['loss'][last]),
            "| " + str(fit_history.history['val_loss'][last])
        ]

    # callback function to store lowest loss value
    es = EarlyStopping(monitor=maximizer,
                       mode=callback_mode,
                       verbose=0,
                       patience=5)

    i = 0

    # get the first 3 layer model
    model = get_keras_model_reg(data, i)

    logger("Training initial model...")
    history = model.fit(X_train,
                        y_train,
                        epochs=epochs,
                        validation_data=(X_test, y_test),
                        callbacks=[es],
                        verbose=0)
    models.append(history)
    model_data.append(model)

    col_name = [[
        "Initial number of layers ", "| Training Loss ", "| Test Loss "
    ]]
    col_width = max(len(word) for row in col_name for word in row) + 2
    _print_rows(col_name, col_width)
    _print_rows([_loss_row(model, history)], col_width)

    losses.append(history.history[maximizer][-1])

    # keeps running model and fit functions until the validation loss stops
    # decreasing
    logger("Testing number of layers...")
    # NOTE(review): looks like leftover debug output — confirm whether this
    # print is still wanted.
    print(currLog)
    col_name = [["Current number of layers", "| Training Loss", "| Test Loss"]]
    col_width = max(len(word) for row in col_name for word in row) + 2
    _print_rows(col_name, col_width)

    while all(x > y for x, y in zip(losses, losses[1:])):
        model = get_keras_model_reg(data, i)
        # Use the same early-stopping callback as the initial fit so all
        # candidates are trained under the same stopping rule.
        history = model.fit(X_train,
                            y_train,
                            epochs=epochs,
                            validation_data=(X_test, y_test),
                            callbacks=[es],
                            verbose=0)
        model_data.append(model)
        models.append(history)

        _print_rows([_loss_row(model, history)], col_width)
        losses.append(history.history[maximizer][-1])
        i += 1

    best_index = losses.index(min(losses))
    final_model = model_data[best_index]
    final_hist = models[best_index]
    print("")
    logger('->',
           "Best number of layers found: " + str(len(final_model.layers)))

    last_epoch = len(final_hist.history['val_loss']) - 1
    logger(
        '->', "Training Loss: " +
        str(final_hist.history['loss'][last_epoch]))
    logger(
        '->', "Test Loss: " +
        str(final_hist.history['val_loss'][last_epoch]))

    # calls function to generate plots in plot generation; `plots` is
    # initialised first so the return below works when plots are disabled
    plots = {}
    if generate_plots:
        init_plots, plot_names = generate_regression_plots(
            models[-1], data, y)
        for name, plot in zip(plot_names, init_plots):
            plots[str(name)] = plot

    if save_model:
        # NOTE(review): save_path is not forwarded; save()'s signature is
        # not visible here — confirm before changing.
        save(final_model, save_model)

    # stores values in the client object models dictionary field
    print("")
    logger("Stored model under 'regression_ANN' key")
    return {
        'id': generate_id(),
        'model': final_model,
        "target": target,
        "plots": plots,
        "preprocesser": full_pipeline,
        "interpreter": target_scaler,
        'losses': {
            'training_loss': final_hist.history['loss'],
            'val_loss': final_hist.history['val_loss']
        }
    }
def k_means_clustering(dataset=None,
                       scatters=[],
                       clusters=None,
                       preprocess=True,
                       generate_plots=True,
                       drop=None,
                       base_clusters=1,
                       verbose=0,
                       n_init=10,
                       max_iter=300,
                       random_state=42,
                       text=[]):
    '''
    function to train a k means clustering algorithm

    When `clusters` is None the cluster count is searched upwards from
    `base_clusters`; otherwise a single model with `clusters` clusters is
    fitted.

    :param dataset: dataset location handed to DataReader.
    :param scatters: forwarded to generate_clustering_plots.
    :param clusters: fixed number of clusters, or None to search.
    :param preprocess: when True, clustering_preprocessor is applied.
    :param generate_plots: plots are only generated when this is True and
        `clusters` is None.
    :param drop: column(s) removed from the dataset before preprocessing.
    :param base_clusters: first cluster count tried during the search.
    :param verbose, n_init, max_iter, random_state: forwarded to KMeans.
    :param text: accepted but not used by this function.
    :return a dictionary object with all of the information for the algorithm.
    '''
    logger("Reading in dataset")

    dataReader = DataReader(dataset)
    data = dataReader.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    # untouched copy handed to the plotting helper
    dataPandas = data.copy()

    full_pipeline = None
    if preprocess:
        logger("Preprocessing data")
        data, full_pipeline = clustering_preprocessor(data)
        data = np.array(data)

    modelStorage = []
    inertiaStor = []

    # processes dataset and runs KMeans algorithm on one cluster as
    # baseline
    if clusters is None:
        i = base_clusters
        logger("Creating unsupervised clustering task")
        kmeans = KMeans(n_clusters=i,
                        random_state=random_state,
                        verbose=verbose,
                        n_init=n_init,
                        max_iter=max_iter).fit(data)
        modelStorage.append(kmeans)
        # stores SSE values in an array for later comparison
        inertiaStor.append(kmeans.inertia_)

        logger("Identifying best centroid count and optimizing accuracy")

        col_name = [["Number of clusters ", "| Inertia "]]
        col_width = max(len(word) for row in col_name for word in row) + 2
        printtable(col_name, col_width)
        printtable([[str(i), "| " + str(inertiaStor[i - base_clusters])]],
                   col_width)

        i += 1

        # continues to increase cluster size until SSE values don't decrease by
        # 1000 - this value was decided based on precedence
        while all(earlier >= later
                  for earlier, later in zip(inertiaStor, inertiaStor[1:])):
            kmeans = KMeans(n_clusters=i,
                            random_state=random_state,
                            verbose=verbose,
                            n_init=n_init,
                            max_iter=max_iter).fit(data)
            modelStorage.append(kmeans)
            inertiaStor.append(kmeans.inertia_)

            printtable([[str(i), "| " + str(inertiaStor[i - base_clusters])]],
                       col_width)

            i += 1

            # checks to see if it should continue to run; need to improve this
            # algorithm
            if i > 3 and inertiaStor[-2] - 1000 <= inertiaStor[-1]:
                print()
                break

        # `i` was already advanced past the last fitted model, so the model
        # that is returned below has i - 1 clusters.
        logger("->", "Optimal number of clusters found: {}".format(i - 1))
        logger("->", "Final inertia of {}".format(inertiaStor[-1]))
    else:
        kmeans = KMeans(n_clusters=clusters,
                        random_state=random_state,
                        verbose=verbose,
                        n_init=n_init,
                        max_iter=max_iter).fit(data)

    # generates the clustering plots (only available for the searched case)
    plots = {}
    if generate_plots and clusters is None:
        logger("Generating plots and storing in model")
        init_plots, plot_names, elbow = generate_clustering_plots(
            modelStorage[-1], dataPandas, data, scatters, inertiaStor,
            base_clusters)
        for name, plot in zip(plot_names, init_plots):
            plots[str(name)] = plot
        plots['elbow'] = elbow

    logger("Stored model under 'k_means_clustering' key")
    clearLog()
    # stores plots and information in the dictionary client model
    return {
        'id': generate_id(),
        "model": (modelStorage[-1] if clusters is None else kmeans),
        "preprocesser": full_pipeline,
        "plots": plots
    }
def tune_helper(model_to_tune=None,
                dataset=None,
                models=None,
                max_layers=10,
                min_layers=2,
                min_dense=32,
                max_dense=512,
                executions_per_trial=3,
                max_trials=1,
                activation='relu',
                loss='categorical_crossentropy',
                metrics='accuracy',
                seed=42,
                objective='val_accuracy',
                generate_plots=True,
                directory='my_dir',
                epochs=10,
                step=32,
                patience=1,
                verbose=0,
                test_size=0.2):
    '''
    Helper function that calls the appropriate tuning function

    :param model_to_tune: key of the stored model to tune; one of
        'regression_ANN', 'classification_ANN' or 'convolutional_NN'. Any
        other value leaves `models` unchanged.
    :param dataset: dataset location handed to DataReader (ANN branches).
    :param models: dictionary of stored models; the tuned entry is replaced.
    :param max_layers, min_layers, min_dense, max_dense, activation, step:
        forwarded to the ANN tuners.
    :param loss, metrics: forwarded to the classification tuner.
    :param seed, objective, patience: forwarded to the CNN tuner.
    :param executions_per_trial, max_trials, directory, epochs, verbose,
        test_size: forwarded to whichever tuner is called.
    :param generate_plots: when True, plots are regenerated for the ANN
        branches.
    :return the updated models dictionary
    '''
    print("")
    logger("Getting target model for tuning...")

    # checks to see which requested model is in the self.models

    # processing for regression feed forward NN
    if model_to_tune == 'regression_ANN':
        logger("Reading in data")
        logger("Tuning model hyperparameters...")
        stored = models['regression_ANN']

        dataReader = DataReader(dataset)
        data = dataReader.data_generator()
        target = stored['target']
        target_column = data[target]
        data = stored['preprocesser'].transform(data.drop(target, axis=1))
        returned_model, returned_pms, history, X_test, y_test = tuneReg(
            data.values,
            target_column.values,
            max_layers=max_layers,
            min_layers=min_layers,
            min_dense=min_dense,
            max_dense=max_dense,
            executions_per_trial=executions_per_trial,
            max_trials=max_trials,
            epochs=epochs,
            activation=activation,
            step=step,
            directory=directory,
            verbose=verbose,
            test_size=test_size)
        plots = {}
        logger("->",
               'Best Hyperparameters Found: {}'.format(returned_pms.values))
        if generate_plots:
            logger("Generating updated plots")
            init_plots, plot_names = generate_regression_plots(
                history, data, target_column)
            for name, plot in zip(plot_names, init_plots):
                plots[str(name)] = plot

        models['regression_ANN'] = {
            'id': stored['id'],
            'model': returned_model,
            'target': target,
            "plots": plots,
            'preprocesser': stored['preprocesser'],
            'interpreter': stored['interpreter'],
            'test_data': {
                'X': X_test,
                'y': y_test
            },
            'hyperparameters': returned_pms.values,
            'losses': {
                'training_loss': history.history['loss'],
                'val_loss': history.history['val_loss']
            }
        }
        logger("Re-stored model under 'regression_ANN' key")

    # processing for classification feed forward NN
    elif model_to_tune == "classification_ANN":
        logger("Reading in data")
        logger("Tuning model hyperparameters...")
        stored = models['classification_ANN']

        dataReader = DataReader(dataset)
        data = dataReader.data_generator()
        target = stored['target']
        target_column = data[target]
        data = stored['preprocesser'].transform(data.drop(target, axis=1))
        returned_model, returned_pms, history, X_test, y_test = tuneClass(
            data,
            target_column,
            stored['num_classes'],
            max_layers=max_layers,
            min_layers=min_layers,
            min_dense=min_dense,
            max_dense=max_dense,
            executions_per_trial=executions_per_trial,
            max_trials=max_trials,
            activation=activation,
            loss=loss,
            directory=directory,
            metrics=metrics,
            epochs=epochs,
            step=step,
            verbose=verbose,
            test_size=test_size)
        plots = {}
        logger("->",
               'Best Hyperparameters Found: {}'.format(returned_pms.values))
        if generate_plots:
            logger("Generating updated plots")
            plots = generate_classification_plots(history, data,
                                                  target_column,
                                                  returned_model, X_test,
                                                  y_test)

        logger("Re-stored model under 'classification_ANN' key")
        models['classification_ANN'] = {
            'id': stored['id'],
            'model': returned_model,
            # kept so that the entry can be tuned again: this branch reads
            # 'num_classes' from the stored entry above
            'num_classes': stored['num_classes'],
            'hyperparameters': returned_pms.values,
            'plots': plots,
            'preprocesser': stored['preprocesser'],
            'interpreter': stored['interpreter'],
            'test_data': {
                'X': X_test,
                'y': y_test
            },
            'target': target,
            'losses': {
                'training_loss': history.history['loss'],
                'val_loss': history.history['val_loss']
            },
            'accuracy': {
                'training_accuracy': history.history['accuracy'],
                'validation_accuracy': history.history['val_accuracy']
            }
        }

    elif model_to_tune == "convolutional_NN":
        logger("Tuning model hyperparameters...")
        stored = models['convolutional_NN']
        X_train, X_test, height, width, num_classes = get_image_data(models)
        logger('Located image data')
        model, returned_pms, history = tuneCNN(
            X_train,
            X_test,
            height,
            width,
            num_classes,
            executions_per_trial=executions_per_trial,
            max_trials=max_trials,
            seed=seed,
            objective=objective,
            directory=directory,
            patience=patience,
            epochs=epochs,
            verbose=verbose,
            test_size=test_size)
        logger("->", "Optimal image size identified: {}".format(
            (height, width, 3)))
        logger('Packaging HyperModel')
        logger("->",
               'Best Hyperparameters Found: {}'.format(returned_pms.values))
        logger("Re-stored model under 'convolutional_NN' key")

        models['convolutional_NN'] = {
            'id': stored['id'],
            'data_type': stored['data_type'],
            'data_path': stored['data_path'],
            'data': {
                'train': X_train,
                'test': X_test
            },
            'shape': stored['shape'],
            'model': model,
            'num_classes': stored['num_classes'],
            'data_sizes': stored['data_sizes'],
            'losses': {
                'training_loss': history.history['loss'],
                'val_loss': history.history['val_loss']
            },
            'accuracy': {
                'training_accuracy': history.history['accuracy'],
                'validation_accuracy': history.history['val_accuracy']
            }
        }

    clearLog()
    return models
def regression_ann(instruction,
                   callback=False,
                   ca_threshold=None,
                   text=[],
                   dataset=None,
                   drop=None,
                   preprocess=True,
                   test_size=0.2,
                   random_state=49,
                   epochs=50,
                   generate_plots=True,
                   callback_mode='min',
                   maximizer="val_loss",
                   save_model=False,
                   save_path=os.getcwd(),
                   add_layer={}):
    '''
    Regression routine used by the neural network query for numerical
    targets: fits models with an increasing layer index and keeps the one
    with the lowest monitored loss.

    :param many parameters: control preprocessing, the early-stopping
        callback, plot generation, saving, and the network that is trained.
    :return dictionary that holds all the information for the finished model.
    '''

    def _emit_rows(rows, width):
        # print each row as "| cell cell cell |", indented by the log depth
        for cells in rows:
            padded = "".join(cell.ljust(width) for cell in cells)
            print((" " * 2 * counter) + "| " + padded + " |")

    def _summary_row(net, fit):
        # layer count and the losses at the last validated epoch
        last = len(fit.history['val_loss']) - 1
        return [
            str(len(net.layers)),
            "| " + str(fit.history['loss'][last]),
            "| " + str(fit.history['val_loss'][last]),
        ]

    # fall back to the interactively chosen file when no dataset is given
    source = get_file() if dataset is None else dataset
    dataReader = DataReader(source)
    logger("Reading in dataset")
    data = dataReader.data_generator()
    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    data, y, target, full_pipeline = initial_preprocessor(
        data,
        instruction,
        preprocess,
        ca_threshold,
        text,
        test_size=test_size,
        random_state=random_state)
    logger("->", "Target column found: {}".format(target))

    X_train, X_test = data['train'], data['test']

    # Target scaling (statistics come from the training targets)
    target_scaler = StandardScaler()
    train_targets = np.array(y['train']).reshape(-1, 1)
    test_targets = np.array(y['test']).reshape(-1, 1)
    y_train = target_scaler.fit_transform(train_targets)
    y_test = target_scaler.transform(test_targets)

    logger("Establishing callback function")

    histories = []
    losses = []
    networks = []

    # callback function to store lowest loss value; only attached to the
    # fits when the caller asked for it
    es = EarlyStopping(monitor=maximizer,
                       mode=callback_mode,
                       verbose=0,
                       patience=5)
    callback_value = [es] if callback is not False else None

    i = 0

    # add_layer format: {<object> : list of indexs}
    # get the first 3 layer model
    model = get_keras_model_reg(data, i, add_layer)

    logger("Training initial model")
    history = model.fit(X_train,
                        y_train,
                        epochs=epochs,
                        validation_data=(X_test, y_test),
                        callbacks=callback_value,
                        verbose=0)
    histories.append(history)
    networks.append(model)

    header = [[
        "Initial number of layers ", "| Training Loss ", "| Test Loss "
    ]]
    col_width = max(len(word) for row in header for word in row) + 2
    _emit_rows(header, col_width)
    _emit_rows([_summary_row(model, history)], col_width)

    losses.append(history.history[maximizer][-1])

    # keeps running model and fit functions until the validation loss stops
    # decreasing
    logger("Testing number of layers")
    header = [["Current number of layers", "| Training Loss", "| Test Loss"]]
    col_width = max(len(word) for row in header for word in row) + 2
    _emit_rows(header, col_width)

    # always try at least two further candidates, then continue only while
    # the newest loss improves on the previous one
    while len(losses) <= 2 or losses[-1] < losses[-2]:
        model = get_keras_model_reg(data, i, add_layer)
        history = model.fit(X_train,
                            y_train,
                            callbacks=callback_value,
                            epochs=epochs,
                            validation_data=(X_test, y_test),
                            verbose=0)
        networks.append(model)
        histories.append(history)

        _emit_rows([_summary_row(model, history)], col_width)
        losses.append(history.history[maximizer][-1])
        i += 1

    best = losses.index(min(losses))
    final_model = networks[best]
    final_hist = histories[best]
    print("")
    logger('->',
           "Best number of layers found: " + str(len(final_model.layers)))

    final_epoch = len(final_hist.history['val_loss']) - 1
    logger('->',
           "Training Loss: " + str(final_hist.history['loss'][final_epoch]))
    logger('->',
           "Test Loss: " + str(final_hist.history['val_loss'][final_epoch]))

    # calls function to generate plots in plot generation
    plots = {}
    if generate_plots:
        init_plots, plot_names = generate_regression_plots(
            histories[-1], data, y)
        for idx, plot_name in enumerate(plot_names):
            plots[str(plot_name)] = init_plots[idx]

    if save_model:
        save(final_model, save_model, save_path)

    # stores values in the client object models dictionary field
    print("")
    logger("Stored model under 'regression_ANN' key")
    clearLog()

    K.clear_session()

    result = {
        'id': generate_id(),
        'model': final_model,
        "target": target,
        "num_classes": 1,
        "plots": plots,
        "preprocessor": full_pipeline,
        "interpreter": target_scaler,
        'test_data': {
            'X': X_test,
            'y': y_test
        },
        'losses': {
            'training_loss': final_hist.history['loss'],
            'val_loss': final_hist.history['val_loss']
        }
    }
    return result
def image_caption_query(self,
                        instruction,
                        label_column=None,
                        drop=None,
                        epochs=10,
                        preprocess=True,
                        random_state=49,
                        test_size=0.2,
                        top_k=5000,
                        batch_size=32,
                        buffer_size=1000,
                        embedding_dim=256,
                        units=512,
                        gpu=False,
                        generate_plots=True,
                        save_model_decoder=False,
                        save_path_decoder=os.getcwd(),
                        save_model_encoder=False,
                        save_path_encoder=os.getcwd()):
    '''
    function to apply predictive algorithm for image_caption generation

    Extracts InceptionV3 features for every image path in the dataset,
    tokenizes the captions and trains a CNN_Encoder / RNN_Decoder pair with
    teacher forcing.

    :param instruction: written query; used to find the caption column when
        label_column is None.
    :param label_column: text used instead of the instruction to find the
        caption column.
    :param drop: column label(s) removed from the dataframe before training.
    :param epochs: number of passes over the training data (>= 1).
    :param preprocess: when True, missing values are filled with 0 and every
        caption is wrapped in '<start>' / '<end>' markers.
    :param random_state: seed given to numpy and tensorflow.
    :param test_size: validation fraction, 0 <= test_size < 1 (0 disables
        validation).
    :param top_k: number of words kept by the tokenizer (>= 1).
    :param batch_size: batch size of the training/validation datasets (>= 1).
    :param buffer_size: shuffle buffer size (>= 1).
    :param embedding_dim: passed to CNN_Encoder and RNN_Decoder (>= 1).
    :param units: passed to RNN_Decoder (>= 1).
    :param gpu: run on '/device:GPU:0' instead of '/device:CPU:0'.
    :param generate_plots: store a training/validation loss plot.
    :param save_model_decoder: save the decoder weights after training.
    :param save_path_decoder: existing directory for the decoder checkpoint.
    :param save_model_encoder: save the encoder weights after training.
    :param save_path_encoder: existing directory for the encoder checkpoint.
    :return a dictionary object with all of the information for the algorithm
        (also stored under self.models["image_caption"]).
    :raises Exception: when an argument is out of range, a save path does not
        exist, or gpu=True and no GPU device is found.
    '''

    if test_size < 0:
        raise Exception("Test size must be a float between 0 and 1")

    if test_size >= 1:
        raise Exception(
            "Test size must be a float between 0 and 1 (a test size greater than or equal to 1 results in no training "
            "data)")

    if top_k < 1:
        raise Exception("Top_k value must be equal to or greater than 1")

    if batch_size < 1:
        raise Exception("Batch size must be equal to or greater than 1")

    if buffer_size < 1:
        raise Exception("Buffer size must be equal to or greater than 1")

    if embedding_dim < 1:
        raise Exception(
            "Embedding dimension must be equal to or greater than 1")

    if units < 1:
        raise Exception("Units must be equal to or greater than 1")

    if epochs < 1:
        raise Exception(
            "Epoch number is less than 1 (model will not be trained)")

    if save_model_decoder and not os.path.exists(save_path_decoder):
        raise Exception("Decoder save path does not exists")

    if save_model_encoder and not os.path.exists(save_path_encoder):
        raise Exception("Encoder save path does not exists")

    # a test size of exactly 0 switches validation off entirely
    testing = test_size != 0

    if gpu:
        if tf.test.gpu_device_name():
            print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
        else:
            raise Exception("Please install GPU version of Tensorflow")

        device = '/device:GPU:0'
    else:
        device = '/device:CPU:0'

    np.random.seed(random_state)
    tf.random.set_seed(random_state)

    data = DataReader(self.dataset)
    df = data.data_generator()

    if preprocess:
        df.fillna(0, inplace=True)
    if drop is not None:
        df.drop(drop, axis=1, inplace=True)

    logger("Preprocessing data")

    train_captions = []
    img_name_vector = []

    label = instruction if label_column is None else label_column

    x = get_path_column(df)
    y = get_similar_column(get_value_instruction(label), df)
    logger("->", "Target Column Found: {}".format(y))

    for _, row in df.iterrows():
        if preprocess:
            caption = '<start> ' + row[y] + ' <end>'
        else:
            # Previously `caption` was unbound on this path (NameError).
            # NOTE(review): raw captions are presumably expected to carry
            # their own '<start>' / '<end>' markers, since the training step
            # looks up '<start>' in the tokenizer -- confirm.
            caption = row[y]

        img_name_vector.append(row[x])
        train_captions.append(caption)

    image_model = tf.keras.applications.InceptionV3(include_top=False,
                                                    weights='imagenet')
    new_input = image_model.input
    hidden_layer = image_model.layers[-1].output

    logger("Extracting features from model")
    image_features_extract_model = tf.keras.Model(new_input, hidden_layer)

    # every distinct image is pushed through InceptionV3 exactly once
    image_dataset = tf.data.Dataset.from_tensor_slices(
        sorted(set(img_name_vector)))
    image_dataset = image_dataset.map(
        load_image,
        num_parallel_calls=tf.data.experimental.AUTOTUNE).batch(16)

    for img, path in image_dataset:
        batch_features = image_features_extract_model(img)
        batch_features = tf.reshape(
            batch_features,
            (batch_features.shape[0], -1, batch_features.shape[3]))

        # np.save appends '.npy', so the features land next to each image
        for bf, p in zip(batch_features, path):
            path_of_feature = p.numpy().decode("utf-8")
            np.save(path_of_feature, bf.numpy())

    logger("->", "Tokenizing top {} words".format(top_k))
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=top_k,
        oov_token="<unk>",
        filters='!"#$%&()*+.,-/:;=?@[\\]^_`{|}~ ')
    tokenizer.fit_on_texts(train_captions)
    tokenizer.word_index['<pad>'] = 0
    tokenizer.index_word[0] = '<pad>'
    train_seqs = tokenizer.texts_to_sequences(train_captions)
    cap_vector = tf.keras.preprocessing.sequence.pad_sequences(train_seqs,
                                                               padding='post')

    vocab_size = top_k + 1

    if testing:
        img_name_train, img_name_val, cap_train, cap_val = train_test_split(
            img_name_vector, cap_vector, test_size=test_size, random_state=0)
    else:
        img_name_train = img_name_vector
        cap_train = cap_vector

    dataset = tf.data.Dataset.from_tensor_slices((img_name_train, cap_train))

    dataset = dataset.map(lambda item1, item2: tf.numpy_function(
        map_func, [item1, item2], [tf.float32, tf.int32]),
        num_parallel_calls=tf.data.experimental.AUTOTUNE)

    # Shuffle and batch
    logger("Shuffling dataset")
    dataset = dataset.shuffle(buffer_size).batch(batch_size)
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    if testing:
        dataset_val = tf.data.Dataset.from_tensor_slices(
            (img_name_val, cap_val))

        dataset_val = dataset_val.map(
            lambda item1, item2: tf.numpy_function(map_func, [item1, item2],
                                                   [tf.float32, tf.int32]),
            num_parallel_calls=tf.data.experimental.AUTOTUNE)

        # Shuffle and batch
        dataset_val = dataset_val.shuffle(buffer_size).batch(batch_size)
        dataset_val = dataset_val.prefetch(
            buffer_size=tf.data.experimental.AUTOTUNE)

    logger("Establishing encoder decoder framework")
    encoder = CNN_Encoder(embedding_dim)
    decoder = RNN_Decoder(embedding_dim, units, vocab_size)

    optimizer = tf.keras.optimizers.Adam()
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')

    def loss_function(real, pred):
        # positions holding the '<pad>' index (0) contribute no loss
        mask = tf.math.logical_not(tf.math.equal(real, 0))
        loss_ = loss_object(real, pred)

        mask = tf.cast(mask, dtype=loss_.dtype)
        loss_ *= mask

        return tf.reduce_mean(loss_)

    @tf.function
    def train_step(img_tensor, target):
        with tf.device(device):
            loss = 0

            # initializing the hidden state for each batch
            # because the captions are not related from image to image
            hidden = decoder.reset_state(batch_size=target.shape[0])

            dec_input = tf.expand_dims(
                [tokenizer.word_index['<start>']] * target.shape[0], 1)

            with tf.GradientTape() as tape:
                features = encoder(img_tensor)

                for i in range(1, target.shape[1]):
                    # passing the features through the decoder
                    predictions, hidden, _ = decoder(dec_input, features,
                                                     hidden)

                    loss += loss_function(target[:, i], predictions)

                    # using teacher forcing
                    dec_input = tf.expand_dims(target[:, i], 1)

            total_loss = (loss / int(target.shape[1]))

            trainable_variables = encoder.trainable_variables + decoder.trainable_variables

            gradients = tape.gradient(loss, trainable_variables)

            optimizer.apply_gradients(zip(gradients, trainable_variables))

            return loss, total_loss

    @tf.function
    def val_step(img_tensor, target):
        # forward pass only: no gradients are taken and no weights change
        with tf.device(device):
            loss = 0

            # initializing the hidden state for each batch
            # because the captions are not related from image to image
            hidden = decoder.reset_state(batch_size=target.shape[0])

            dec_input = tf.expand_dims(
                [tokenizer.word_index['<start>']] * target.shape[0], 1)

            features = encoder(img_tensor)

            for i in range(1, target.shape[1]):
                # passing the features through the decoder
                predictions, hidden, _ = decoder(dec_input, features, hidden)

                loss += loss_function(target[:, i], predictions)

                # using teacher forcing
                dec_input = tf.expand_dims(target[:, i], 1)

            return loss / int(target.shape[1])

    logger("Training model...")
    with tf.device(device):
        loss_plot_train = []
        loss_plot_val = []
        for epoch in range(epochs):
            total_loss = 0
            total_loss_val = 0

            for img_tensor, target in dataset:
                _, t_loss = train_step(img_tensor, target)
                total_loss += t_loss

            loss_plot_train.append(total_loss.numpy())

            if testing:
                # The held-out batches used to go through train_step, which
                # updated the weights on validation data; evaluate only.
                for img_tensor, target in dataset_val:
                    total_loss_val += val_step(img_tensor, target)

                loss_plot_val.append(total_loss_val.numpy())

    # remove the cached feature files; this deletes every '.npy' file in the
    # directory of the first image
    dir_name = os.path.dirname(img_name_vector[0])
    for item in os.listdir(dir_name):
        if item.endswith(".npy"):
            os.remove(os.path.join(dir_name, item))

    plots = {}
    if generate_plots:
        logger("Generating plots")
        plots.update({
            "loss":
                libra.plotting.nonkeras_generate_plots.plot_loss(
                    loss_plot_train, loss_plot_val)
        })

    logger("->", "Final training loss: {}".format(str(total_loss.numpy())))
    total_loss = total_loss.numpy()
    if testing:
        total_loss_val = total_loss_val.numpy()
        total_loss_val_str = str(total_loss_val)
    else:
        total_loss_val = 0
        total_loss_val_str = str("0, No validation done")

    logger("->", "Final validation loss: {}".format(total_loss_val_str))

    if save_model_decoder:
        logger("Saving decoder checkpoint...")
        # previously the encoder weights were written to the decoder file
        decoder.save_weights(
            os.path.join(save_path_decoder, "decoderImgCap.ckpt"))

    if save_model_encoder:
        logger("Saving encoder checkpoint...")
        encoder.save_weights(
            os.path.join(save_path_encoder, "encoderImgCap.ckpt"))

    logger("Storing information in client object under key 'image_caption'")

    self.models["image_caption"] = {
        "decoder": decoder,
        "encoder": encoder,
        "tokenizer": tokenizer,
        "feature_extraction": image_features_extract_model,
        "plots": plots,
        'losses': {
            'Training loss': total_loss,
            'Validation loss': total_loss_val
        }
    }
    clearLog()
    return self.models["image_caption"]
def dimensionality_RF(instruction, dataset, target="", y="", n_features=10):
    '''
    function to reduce dimensionality in dataset via random forest method

    A decision tree trained on all columns gives the baseline accuracy. For
    every forest depth in 3..9 the columns are ranked by random forest
    importance, and for every feature count from 4 up to (number of
    columns - 1) a decision tree is trained on the top-ranked columns. The
    most accurate candidate is returned.

    :param instruction: command sent to client instance in written query.
    :param dataset: data instantiated in client instance passed to the algorithm
    :param target: column name of response variable/feature
    :param y: values associated with response variable/feature
    :param n_features: maximum number of features to choose to analyze/select
        (currently not used by the body)
    :return: tuple of (best dataset, baseline accuracy, best accuracy,
        list of selected columns)
    '''

    if target == "":
        # The last data file is only needed when no target was handed in, so
        # it is no longer opened on every call.
        dataReader = DataReader("./data/" + get_last_file()[0])
        data = dataReader.data_generator()
        data.fillna(0, inplace=True)
        remove = get_similar_column(get_value_instruction(instruction), data)
        data = structured_preprocesser(data)

        y = data[remove]
        del data[remove]
        le = preprocessing.LabelEncoder()
        y = le.fit_transform(y)
        # NOTE(review): the split below still uses `dataset`, not the `data`
        # frame loaded here, and `target` stays "" -- confirm this is intended.

    X_train, X_test, y_train, y_test = train_test_split(
        dataset, y, test_size=0.2, random_state=49)

    first_classifier = tree.DecisionTreeClassifier()
    first_classifier.fit(X_train, y_train)

    # index 0 of each list describes the untouched dataset (the baseline)
    accuracy_scores = [
        accuracy_score(first_classifier.predict(X_test), y_test)
    ]
    columns = [[]]
    datas = [dataset]

    feature_counts = range(4, len(dataset.columns))
    # no forest is fitted when there is no feature count to try
    depths = range(3, 10) if feature_counts else ()

    for depth in depths:
        # The forest depends only on the depth, so it is fitted once per depth
        # instead of once per (depth, feature count) pair; the fixed
        # random_state keeps the importances identical.
        feature_model = RandomForestRegressor(random_state=1, max_depth=depth)
        feature_model.fit(X_train, y_train)
        ranked = np.argsort(feature_model.feature_importances_)

        for count in feature_counts:
            selected = dataset.columns[ranked[-count:]]
            columns.append(selected)

            X_temp_train = X_train[selected]
            X_temp_test = X_test[selected]

            val = pd.DataFrame(np.r_[X_temp_train, X_temp_test])
            val[target] = np.r_[y_train, y_test]
            datas.append(val)

            vr = tree.DecisionTreeClassifier()
            vr.fit(X_temp_train, y_train)

            accuracy_scores.append(
                accuracy_score(vr.predict(X_temp_test), y_test))

    best_accuracy = max(accuracy_scores)
    the_index = accuracy_scores.index(best_accuracy)

    return datas[the_index], accuracy_scores[0], best_accuracy, list(
        columns[the_index])
def k_means_clustering(dataset=None,
                       preprocess=True,
                       generate_plots=True,
                       drop=None,
                       base_clusters=1):
    '''
    function to run KMeans with a growing number of clusters until the
    inertia (SSE) stops improving noticeably

    :param dataset: passed to DataReader to load the data.
    :param preprocess: run clustering_preprocessor on the data before fitting.
    :param generate_plots: build the clustering plots for the final model.
    :param drop: column label(s) removed from the data before clustering.
    :param base_clusters: number of clusters of the first (baseline) model.
    :return: dictionary with the keys 'id', 'model' (the last fitted KMeans),
        'preprocesser' and 'plots' (empty when generate_plots is False).
    '''
    logger("Reading dataset...")
    # loads dataset and replaces n/a with zero
    dataReader = DataReader(dataset)
    data = dataReader.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    # untouched copy handed to the plotting helper
    dataPandas = data.copy()

    full_pipeline = None
    if preprocess:
        logger("Preprocessing data...")
        data, full_pipeline = clustering_preprocessor(data)
        data = np.array(data)

    modelStorage = []
    inertiaStor = []

    # processes dataset and runs KMeans algorithm on the baseline cluster
    # count
    i = base_clusters
    logger("Creating unsupervised clustering task...")
    kmeans = KMeans(n_clusters=i, random_state=0).fit(data)
    modelStorage.append(kmeans)

    # stores SSE values in an array for later comparison
    inertiaStor.append(kmeans.inertia_)
    i += 1

    logger("Identifying best centroid count and optimizing accuracy")
    # Keep adding clusters while the SSE does not increase. Only the newest
    # pair needs checking because the loop ends on the first increase. The
    # cluster count is capped at the number of samples, beyond which KMeans
    # cannot be fitted.
    while i <= len(data) and (len(inertiaStor) < 2
                              or inertiaStor[-2] >= inertiaStor[-1]):
        kmeans = KMeans(n_clusters=i, random_state=0).fit(data)
        modelStorage.append(kmeans)
        inertiaStor.append(kmeans.inertia_)
        i += 1

        # stop once an extra cluster lowers the SSE by no more than 1000
        # (threshold decided based on precedence); need to improve this
        # algorithm
        if i > 3 and inertiaStor[-2] - 1000 <= inertiaStor[-1]:
            break

    best_model = modelStorage[-1]

    # `i` already points at the next candidate, so report the cluster count
    # of the model that is actually returned
    logger("->", "Optimal number of clusters found: {}".format(
        best_model.n_clusters))

    # generates the clustering plots appropriately; stays empty when plots
    # are disabled (previously `plots` was unbound in that case)
    plots = {}
    if generate_plots:
        logger("Generating plots and storing in model")
        init_plots, plot_names = generate_clustering_plots(
            best_model, dataPandas, data)
        for name, plot in zip(plot_names, init_plots):
            plots[str(name)] = plot

    logger("Stored model under 'k_means_clustering' key")
    # the log reset used to sit after the return and never ran
    clearLog()

    # stores plots and information in the dictionary client model
    return {
        'id': generate_id(),
        "model": best_model,
        "preprocesser": full_pipeline,
        "plots": plots
    }
def summarization_query(self,
                        instruction,
                        preprocess=True,
                        label_column=None,
                        drop=None,
                        epochs=5,
                        batch_size=32,
                        learning_rate=3e-5,
                        max_text_length=512,
                        gpu=False,
                        test_size=0.2,
                        random_state=49,
                        generate_plots=True,
                        save_model=False,
                        save_path=os.getcwd()):
    '''
    function to apply algorithm for text summarization

    Fine-tunes the pretrained "t5-small" model on the text/summary columns of
    the dataset with a manual training loop.

    :param instruction: written query used to find the text column.
    :param preprocess: fill missing values with 0 and clean, lemmatize and
        prefix the texts before tokenizing.
    :param label_column: name of the summary column ("summary" when None).
    :param drop: column label(s) removed from the data before training.
    :param epochs: number of passes over the training data (>= 1).
    :param batch_size: batch size of the datasets (>= 1).
    :param learning_rate: learning rate of the Adam optimizer.
    :param max_text_length: passed to the tokenizing helper (>= 2).
    :param gpu: run on '/device:GPU:0' instead of '/device:CPU:0'.
    :param test_size: validation fraction, 0 <= test_size < 1 (0 disables
        validation).
    :param random_state: seed for tensorflow, numpy and the train/test split.
    :param generate_plots: store a training/validation loss plot.
    :param save_model: save the model weights after training.
    :param save_path: existing directory the checkpoint is written into.
    :return a dictionary object with all of the information for the algorithm
        (also stored under self.models["summarization"]).
    :raises Exception: when an argument is out of range, the save path does
        not exist, or gpu=True and no GPU device is found.
    '''

    if test_size < 0:
        raise Exception("Test size must be a float between 0 and 1")

    if test_size >= 1:
        raise Exception(
            "Test size must be a float between 0 and 1 (a test size greater than or equal to 1 results in no training "
            "data)")

    # this also covers max_text_length < 1, so that separate check (which
    # could never fire) was removed
    if max_text_length < 2:
        raise Exception("Text and summary must be at least of length 2")

    if epochs < 1:
        raise Exception(
            "Epoch number is less than 1 (model will not be trained)")

    if batch_size < 1:
        raise Exception("Batch size must be equal to or greater than 1")

    if save_model and not os.path.exists(save_path):
        raise Exception("Save path does not exist")

    # a test size of exactly 0 switches validation off entirely
    testing = test_size != 0

    if gpu:
        if tf.test.gpu_device_name():
            print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
        else:
            raise Exception("Please install GPU version of Tensorflow")

        device = '/device:GPU:0'
    else:
        device = '/device:CPU:0'

    tf.random.set_seed(random_state)
    np.random.seed(random_state)

    data = DataReader(self.dataset)
    data = data.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    if preprocess:
        data.fillna(0, inplace=True)

    logger("Preprocessing data...")

    label = "summary" if label_column is None else label_column

    tokenizer = T5Tokenizer.from_pretrained("t5-small")

    # Find target columns
    X, Y, target = get_target_values(data, instruction, label)
    logger("->", "Target Column Found: {}".format(target))
    logger("Establishing dataset walkers")

    # Clean up text
    if preprocess:
        logger("Preprocessing data")
        X = add_prefix(lemmatize_text(text_clean_up(X.array)), "summarize: ")
        Y = add_prefix(lemmatize_text(text_clean_up(Y.array)), "summarize: ")

    # tokenize text/summaries
    X = tokenize_for_input_ids(X, tokenizer, max_text_length)
    Y = tokenize_for_input_ids(Y, tokenizer, max_text_length)

    logger('Fine-Tuning the model on your dataset...')

    # Suppress unnecessary output
    with NoStdStreams():
        model = TFT5ForConditionalGeneration.from_pretrained(
            "t5-small", output_loading_info=False)

    if testing:
        X_train, X_test, y_train, y_test = train_test_split(
            X, Y, test_size=test_size, random_state=random_state)

        test_dataset = tf.data.Dataset.from_tensor_slices(
            (X_test, y_test)).shuffle(10000).batch(batch_size)
    else:
        X_train = X
        y_train = Y

    train_dataset = tf.data.Dataset.from_tensor_slices(
        (X_train, y_train)).shuffle(10000).batch(batch_size)

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    # one summed loss value per epoch
    total_training_loss = []
    total_validation_loss = []

    # Training Loop
    with tf.device(device):
        for epoch in range(epochs):
            total_loss = 0
            total_loss_val = 0
            for batch, truth in train_dataset:
                with tf.GradientTape() as tape:
                    out = model(inputs=batch, decoder_input_ids=batch)
                    loss_value = loss(truth, out[0])
                total_loss += loss_value
                grads = tape.gradient(loss_value, model.trainable_weights)
                optimizer.apply_gradients(
                    zip(grads, model.trainable_weights))

            total_training_loss.append(total_loss)

            # Validation Loop
            if testing:
                for batch, truth in test_dataset:
                    logits = model(inputs=batch,
                                   decoder_input_ids=batch,
                                   training=False)
                    val_loss = loss(truth, logits[0])
                    total_loss_val += val_loss

                total_validation_loss.append(total_loss_val)

    final_training_loss = total_training_loss[-1].numpy()
    logger("->", "Final training loss: {}".format(str(final_training_loss)))

    if testing:
        final_validation_loss = total_validation_loss[-1].numpy()
        total_loss_val_str = str(final_validation_loss)
        losses = {
            "Training loss": final_training_loss,
            "Validation loss": final_validation_loss
        }
    else:
        total_loss_val_str = str("0, No validation done")
        losses = {"Training loss": final_training_loss}

    logger("->", "Final validation loss: {}".format(total_loss_val_str))

    plots = None
    if generate_plots:
        logger("Generating plots")
        plots = {
            "loss":
                libra.plotting.nonkeras_generate_plots.plot_loss(
                    total_training_loss, total_validation_loss)
        }

    if save_model:
        logger("Saving model")
        # Join with a separator: plain concatenation wrote the checkpoint
        # beside the directory whenever save_path had no trailing slash (as
        # with the os.getcwd() default).
        model.save_weights(
            os.path.join(save_path, "summarization_checkpoint.ckpt"))

    logger("Storing information in client object under key 'summarization'")

    self.models["summarization"] = {
        "model": model,
        "max_text_length": max_text_length,
        "plots": plots,
        "tokenizer": tokenizer,
        'losses': losses
    }

    clearLog()
    return self.models["summarization"]
def dimensionality_reduc(
        instruction,
        dataset,
        arr=None,
        inplace=False):
    '''
    function to perform dimensionality reduction on the dataset (retrieve only
    features with most relevance from multidimensional space of the dataset)

    Every ordered combination (permutation) of the requested techniques is
    applied in sequence, each step working on the output of the previous one,
    and the final accuracy of every permutation is printed.

    :param instruction: command sent to client instance in written query
    :param dataset: data instantiated in client instance passed to the algorithm
    :param arr: list of options of algorithm/dimension reducing techniques
        options to choose from; defaults to ["RF", "PCA", "KPCA", "ICA"]
    :param inplace: when True, write the preprocessed data back to `dataset`
        as csv
    :raises ValueError: when `arr` contains an unknown technique name
    '''
    if arr is None:
        arr = ["RF", "PCA", "KPCA", "ICA"]

    dataReader = DataReader(dataset)

    logger("loading dataset...")
    data = dataReader.data_generator()
    data.fillna(0, inplace=True)

    logger("getting most similar column from instruction...")
    target = get_similar_column(get_value_instruction(instruction), data)

    y = data[target]
    del data[target]
    le = preprocessing.LabelEncoder()
    y = le.fit_transform(y)

    data = structured_preprocesser(data)

    perms = []
    overall_storage = []
    finals = []

    logger("generating dimensionality permutations...")
    for size in range(1, len(arr) + 1):
        perms.extend(permutations(arr, size))

    logger("running each possible permutation...")
    logger("realigning tensors...")
    for path in perms:
        currSet = data
        for element in path:
            if element == "RF":
                data_mod, beg_acc, final_acc, col_removed = dimensionality_RF(
                    instruction, currSet, target, y)
            elif element == "PCA":
                data_mod, beg_acc, final_acc, col_removed = dimensionality_PCA(
                    instruction, currSet, target, y)
            elif element == "KPCA":
                data_mod, beg_acc, final_acc, col_removed = dimensionality_KPCA(
                    instruction, currSet, target, y)
            elif element == "ICA":
                data_mod, beg_acc, final_acc, col_removed = dimensionality_ICA(
                    instruction, currSet, target, y)
            else:
                # an unknown name used to silently reuse the previous result
                raise ValueError(
                    "Unknown dimensionality reduction technique: {}".format(
                        element))
            overall_storage.append(
                [data_mod, beg_acc, final_acc, col_removed])
            # the next technique in the path works on the reduced data
            currSet = data_mod
        # only the outcome of the last technique describes the whole path
        finals.append(overall_storage[-1])

    logger("Fetching Best Accuracies...")
    accs = []
    baseline = finals[0][1]
    logger("->", "Baseline Accuracy: " + str(baseline))

    col_name = [["Permutation ", "| Final Accuracy "]]
    col_width = max(len(word) for row in col_name for word in row) + 5
    printtable(col_name, col_width)

    # One row per permutation, paired with its own result. The previous
    # product(range(len(finals)), finals) printed every label against every
    # result.
    for perm, result in zip(perms, finals):
        row = [str(perm), "| " + str(result[2])]
        printtable([row], col_width)
        if baseline < result[2]:
            accs.append(list(row))

    print("")
    logger("->", " Best Accuracies")
    printtable(col_name, col_width)
    # col_width used to be undefined at this point
    printtable(accs, col_width)

    if inplace:
        data.to_csv(dataset)
def text_classification_query(self, instruction, drop=None,
                              preprocess=True,
                              label_column=None,
                              test_size=0.2,
                              random_state=49,
                              learning_rate=1e-2,
                              epochs=20,
                              monitor="val_loss",
                              batch_size=32,
                              max_text_length=200,
                              max_features=20000,
                              generate_plots=True,
                              save_model=False,
                              save_path=os.getcwd()):
    """
    function to apply text_classification algorithm for sentiment analysis

    Builds a Keras text classifier through get_keras_text_class, trains it on
    padded, encoded text and stores the result in
    self.models["text_classification"].

    :param many params: used to hyperparametrize the function.
    :return a dictionary object with all of the information for the algorithm.
    """

    # argument validation, in the order the errors are reported
    if test_size < 0:
        raise Exception("Test size must be a float between 0 and 1")
    if test_size >= 1:
        raise Exception(
            "Test size must be a float between 0 and 1 (a test size greater than or equal to 1 results in no training "
            "data)")
    if epochs < 1:
        raise Exception(
            "Epoch number is less than 1 (model will not be trained)")
    if batch_size < 1:
        raise Exception("Batch size must be equal to or greater than 1")
    if max_text_length < 1:
        raise Exception("Max text length must be equal to or greater than 1")
    if save_model and not os.path.exists(save_path):
        raise Exception("Save path does not exists")

    # NOTE(review): with test_size == 0 the split and the validation_data
    # below are still used -- confirm that this path works as intended.
    testing = test_size != 0

    frame = DataReader(self.dataset).data_generator()

    if preprocess:
        frame.fillna(0, inplace=True)
    if drop is not None:
        frame.drop(drop, axis=1, inplace=True)

    label = "label" if label_column is None else label_column

    X, Y, target = get_target_values(frame, instruction, label)
    Y = np.array(Y)
    classes = np.unique(Y)

    logger("->", "Target Column Found: {}".format(target))

    vocab = {}
    if preprocess:
        logger("Preprocessing data")
        X = lemmatize_text(text_clean_up(X.array))
        # the cleaned text is kept as the vocabulary before it is encoded
        vocab = X
        X = encode_text(X, X)

    X = np.array(X)

    model = get_keras_text_class(max_features, len(classes), learning_rate)
    logger("Building Keras LSTM model dynamically")

    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state)

    X_train = sequence.pad_sequences(X_train, maxlen=max_text_length)
    X_test = sequence.pad_sequences(X_test, maxlen=max_text_length)

    # map every distinct label to its position in the sorted unique labels
    label_mappings = {
        value: position
        for position, value in enumerate(
            np.unique(np.append(y_train, y_test)))
    }
    encode_labels = np.vectorize(lambda value: label_mappings[value])
    y_train = encode_labels(y_train)
    y_test = encode_labels(y_test)

    logger("Training initial model")

    # early stopping callback
    es = EarlyStopping(monitor=monitor, mode='auto', verbose=0, patience=5)

    history = model.fit(X_train,
                        y_train,
                        validation_data=(X_test, y_test),
                        batch_size=batch_size,
                        epochs=epochs,
                        callbacks=[es],
                        verbose=0)

    metrics = history.history

    logger("->", "Final training loss: {}".format(metrics["loss"][-1]))

    losses = {'training_loss': metrics['loss']}
    accuracy = {'training_accuracy': metrics['accuracy']}
    if testing:
        logger("->",
               "Final validation loss: {}".format(metrics["val_loss"][-1]))
        logger(
            "->", "Final validation accuracy: {}".format(
                metrics["val_accuracy"][-1]))
        losses['val_loss'] = metrics['val_loss']
        accuracy['validation_accuracy'] = metrics['val_accuracy']
    else:
        logger("->",
               "Final validation loss: {}".format("0, No validation done"))

    plots = {}
    if generate_plots:
        # generates appropriate classification plots by feeding all
        # information
        logger("Generating plots")
        plots = generate_classification_plots(history, X, Y, model, X_test,
                                              y_test)

    if save_model:
        save(model, save_model, save_path=save_path)

    logger(
        "Storing information in client object under key 'text_classification'")

    # storing values the model dictionary
    result = {
        "model": model,
        "classes": classes,
        "plots": plots,
        "target": Y,
        "vocabulary": vocab,
        "interpreter": label_mappings,
        "max_text_length": max_text_length,
        'test_data': {
            'X': X_test,
            'y': y_test
        },
        'losses': losses,
        'accuracy': accuracy
    }
    self.models["text_classification"] = result

    clearLog()
    return self.models["text_classification"]
def train_svm(instruction,
              dataset=None,
              test_size=0.2,
              kernel='linear',
              text=None,
              preprocess=True,
              ca_threshold=None,
              drop=None,
              cross_val_size=0.3):
    '''
    function to fit a support vector machine classifier on the dataset

    :param instruction: written query passed to initial_preprocesser to find
        the target column.
    :param dataset: passed to DataReader to load the data.
    :param test_size: currently not used by the body.
    :param kernel: kernel handed to svm.SVC.
    :param text: passed through to initial_preprocesser.
    :param preprocess: passed through to initial_preprocesser.
    :param ca_threshold: passed through to initial_preprocesser.
    :param drop: column label(s) removed from the data before preprocessing.
    :param cross_val_size: currently not used by the body.
    :return: dictionary with the keys 'id', 'model', 'accuracy_score',
        'target', 'preprocesser', 'interpreter' and 'cross_val_score'.
    '''
    logger("Reading in dataset....")
    # reads dataset and fills n/a values with zeroes
    dataReader = DataReader(dataset)
    data = dataReader.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    data, y, target, full_pipeline = initial_preprocesser(
        data, instruction, preprocess, ca_threshold, text)
    logger("->", "Target Column Found: {}".format(target))

    X_train = data['train']
    y_train = y['train']
    X_test = data['test']
    y_test = y['test']

    # Needed to make a custom label encoder due to train test split changes
    # Can still be inverse transformed, just a bit of extra work
    y_vals = np.unique(pd.concat([y['train'], y['test']], axis=0))
    label_mappings = {value: index for index, value in enumerate(y_vals)}

    y_train = y_train.apply(lambda x: label_mappings[x]).values
    y_test = y_test.apply(lambda x: label_mappings[x]).values

    # Fitting to SVM and storing in the model dictionary
    logger("Fitting Support Vector Machine...")
    clf = svm.SVC(kernel=kernel)
    clf.fit(X_train, y_train)

    # computed once and reused for both the log line and the result
    score = accuracy_score(y_test, clf.predict(X_test))

    logger("->", "Accuracy found on testing set: {}".format(score))
    logger('->', "Stored model under 'svm' key")
    # the log reset used to sit after the return and never ran
    clearLog()

    return {
        'id': generate_id(),
        "model": clf,
        "accuracy_score": score,
        "target": target,
        "preprocesser": full_pipeline,
        "interpreter": label_mappings,
        "cross_val_score": cross_val_score(clf, X_train, y_train)
    }