def automatedKNN(train_X, train_y, test_X, test_y):
    """Executes K-nearest neighbour on the given Data.

    Parameters
    ----------
    train_X, test_X : numpy arrays
        Train and test Features.
    train_y, test_y : numpy array
        Train and test Targets.

    Returns
    -------
    multiclass_RocAuc_Score : float
        AUC score calculated by multiclass_RocAuc_Score.
    """
    log_it('Module: automatedKNN', 'Starting')
    # Cap k at roughly 80 % of the average number of samples per class and
    # draw candidate values on a log scale, dropping duplicates.
    k_max = round(train_y.shape[0] / len(Counter(train_y).keys())) * 0.8
    k_range = list(dict.fromkeys(geomspace(1, k_max, 50, dtype="int")))
    # Choose the neighbour-search algorithm based on data size and width.
    if train_y.shape[0] > 200:
        if train_X.shape[1] > 20:
            algorithm_sel = "ball_tree"
        else:
            algorithm_sel = "kd_tree"
    else:
        algorithm_sel = "auto"
    param_grid = {'n_neighbors': k_range,
                  'weights': ['uniform', 'distance']}
    model = KNeighborsClassifier(algorithm=algorithm_sel, n_jobs=core_count)
    model = run_RandomSearch(train_X, train_y, model, param_grid)
    pred = model.predict(test_X)
    return multiclass_RocAuc_Score(test_y, pred)
def automatedLogReg(train_X, train_y, test_X, test_y):
    """Executes Logistic Regression on the given Data.

    Parameters
    ----------
    train_X, test_X : numpy arrays
        Train and test Features.
    train_y, test_y : numpy array
        Train and test Targets.

    Returns
    -------
    multiclass_RocAuc_Score : float
        AUC score calculated by multiclass_RocAuc_Score.
    """
    log_it('Module: automatedLogReg', 'Starting')
    # Pick a solver suited to the data size and dimensionality.
    if train_y.shape[0] > 200:
        if train_X.shape[1] > 20:
            solver_sel = "sag"
        else:
            solver_sel = "saga"
    else:
        solver_sel = "liblinear"
    model = LogisticRegression(multi_class="ovr", solver=solver_sel,
                               n_jobs=core_count)
    # NOTE: 'sag' supports only the l2 penalty, so l1 candidates will fail
    # for that solver during the search.
    param_grid = {'C': logspace(-3, 3, 7),
                  'penalty': ["l1", "l2"]}
    model = run_RandomSearch(train_X, train_y, model, param_grid)
    pred = model.predict(test_X)
    return multiclass_RocAuc_Score(test_y, pred)
def automl(df, explicit=automl_explicit):
    """Main Function.

    Parameters
    ----------
    df : DataFrame
        DataFrame to be evaluated.
    explicit : boolean, optional
        If True the name of the best algorithm is returned as well.

    Returns
    -------
    algorithm_name : string, optional
        Name of the best performing algorithm (only when explicit is True).
    AUC : float
        Best AUC achieved.
    """
    global resultsDict
    log_it('Module: automl', 'Starting.')
    evaluate(df)
    best_algorithm = max(resultsDict, key=resultsDict.get)
    log_it('Module: automl',
           'Results: ' + str(best_algorithm) + ' - '
           + str(resultsDict[best_algorithm]))
    if explicit:
        return best_algorithm, resultsDict[best_algorithm]
    else:
        return max(list(resultsDict.values()))
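# Example usage (a minimal sketch; the CSV path below is illustrative only,
# and the expected DataFrame layout is whatever get_folds/evaluate assume
# elsewhere in this module):
#
#     import pandas as pd
#
#     df = pd.read_csv('dataset.csv')
#     best_algorithm, best_auc = automl(df, explicit=True)
#     print(best_algorithm, best_auc)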
def automatedSGDReg(train_X, train_y, test_X, test_y):
    """Executes Stochastic Gradient Descent Classifier.

    Parameters
    ----------
    train_X, test_X : numpy arrays
        Train and test Features.
    train_y, test_y : numpy array
        Train and test Targets.

    Returns
    -------
    multiclass_RocAuc_Score : float
        AUC score calculated by multiclass_RocAuc_Score.
    """
    log_it('Module: automatedSGDReg', 'Starting')
    param_grid = {'alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3],
                  'max_iter': [1000],
                  'loss': ['log', 'modified_huber'],
                  'penalty': ['l1', 'l2'],
                  'n_jobs': [-1]}
    model = SGDClassifier(n_jobs=core_count)
    model = run_RandomSearch(train_X, train_y, model, param_grid)
    pred = model.predict(test_X)
    return multiclass_RocAuc_Score(test_y, pred)
def multiclass_RocAuc_Score(test_y, pred, average="macro"):
    """Calculates the AUC Score.

    Parameters
    ----------
    test_y : numpy arrays
        Test Target.
    pred : numpy array
        Predictions.
    average : string, optional
        Averaging strategy passed on to roc_auc_score (default "macro").

    Returns
    -------
    roc_auc_score : float
        AUC score calculated by roc_auc_score.
    """
    log_it('Module: multiclass_RocAuc_Score', 'Starting')
    # Binarize labels and predictions so the multiclass problem is scored
    # one-vs-rest.
    lb = LabelBinarizer()
    lb.fit(test_y)
    y_test = lb.transform(test_y)
    y_pred = lb.transform(pred)
    return roc_auc_score(y_test, y_pred, average=average, multi_class="ovr")
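# Illustration of the binarization above (a sketch, not part of the module):
# for labels [0, 1, 2] LabelBinarizer yields one indicator column per class,
#
#     lb = LabelBinarizer().fit([0, 1, 2, 1])
#     lb.transform([0, 2])      # -> [[1, 0, 0], [0, 0, 1]]
#
# and roc_auc_score then averages a per-class AUC over those columns.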
def saveFinal(train_X, test_X, train_y, df_res, filename):
    """Saves the calculated results by calling the save_results function.

    Parameters
    ----------
    train_X, test_X : numpy arrays
        Train and test Features.
    train_y : numpy array
        Train Targets, used to count the number of classes.
    df_res : DataFrame
        DataFrame containing the calculated results.
    filename : string
        Name of the analyzed DataSet.
    """
    log_it('DataSet: ' + filename, 'Saving results')
    # Enrich the results with metadata describing the data set.
    df_res['algorithm'] = df_res.index
    df_res['dataset'] = filename
    df_res['num_rows'] = train_X.shape[0] + test_X.shape[0]
    df_res['num_vars'] = train_X.shape[1]
    df_res['num_clases'] = len(unique(train_y))
    # Best AUC first; ties broken by shortest runtime.
    df_res = df_res.sort_values(['AUC', 'time'], ascending=[False, True])
    df_res.apply(lambda row: save_results(row['dataset'], row['num_rows'],
                                          row['num_vars'], row['num_clases'],
                                          row['algorithm'], row['AUC'],
                                          row['time']),
                 axis=1)
def run_RandomSearch(X, y, clf, param_grid, cv=search_cv):
    """Runs a randomized hyperparameter search for the given estimator.

    Parameters
    ----------
    X : numpy arrays
        Features.
    y : numpy array
        Targets.
    clf : estimator
        Scikit-learn estimator to be tuned.
    param_grid : dict or list of dicts
        Parameters to be evaluated.
    cv : int
        Number of cross-validation folds, predefined in the config_file.

    Returns
    -------
    random_search : RandomizedSearchCV
        Fitted search object scored with 'roc_auc_ovr'.
    """
    log_it('Module: run_RandomSearch', 'Starting')
    random_search = RandomizedSearchCV(clf, param_distributions=param_grid,
                                       scoring='roc_auc_ovr', cv=cv,
                                       n_jobs=core_count)
    random_search.fit(X, y)
    return random_search
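# Example usage (a sketch; the estimator and grid below are illustrative and
# not part of this module):
#
#     from sklearn.ensemble import ExtraTreesClassifier
#
#     grid = {'n_estimators': [100, 300], 'max_depth': [None, 10]}
#     search = run_RandomSearch(train_X, train_y, ExtraTreesClassifier(), grid)
#     pred = search.predict(test_X)      # delegates to the best estimator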
def automatedRandomForest(train_X, train_y, test_X, test_y):
    """Executes Random Forest Classifier on the given Data.

    Parameters
    ----------
    train_X, test_X : numpy arrays
        Train and test Features.
    train_y, test_y : numpy array
        Train and test Targets.

    Returns
    -------
    multiclass_RocAuc_Score : float
        AUC score calculated by multiclass_RocAuc_Score.
    """
    log_it('Module: automatedRandomForest', 'Starting')
    param_grid = {'bootstrap': [True],
                  'max_depth': [50, 100, 200],
                  'min_samples_leaf': [2, 4, 8],
                  'min_samples_split': [2, 4, 8, 12],
                  'n_estimators': [100, 500, 1000]}
    model = RandomForestClassifier()
    model = run_RandomSearch(train_X, train_y, model, param_grid)
    pred = model.predict(test_X)
    return multiclass_RocAuc_Score(test_y, pred)
def runMods(train_X, train_y, test_X, test_y, list_of_mdls=training_mdls):
    """Executes all algorithms in the list for the given Data.

    Parameters
    ----------
    train_X, test_X : numpy arrays
        Train and test Features.
    train_y, test_y : numpy array
        Train and test Targets.
    list_of_mdls : list, optional
        List with the function names to be executed.
        Default set to all algorithms.

    Returns
    -------
    Results : DataFrame
        DataFrame with the AUC and runtime of every executed function.
    """
    log_it('Module: runMods', 'Starting')
    Results = pd.DataFrame(columns=['AUC', 'time'])
    for mdl in list_of_mdls:
        start = time()
        # Resolve the function by name, run it and record its AUC and runtime.
        row = pd.Series([eval(mdl + '(train_X, train_y, test_X, test_y)'),
                         time() - start],
                        index=['AUC', 'time'], name=mdl)
        Results = Results.append(row)
    return Results
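# Example usage (a sketch; the subset of names is illustrative, any of the
# automated* function names defined in this module can be passed):
#
#     results = runMods(train_X, train_y, test_X, test_y,
#                       list_of_mdls=['automatedKNN', 'automatedGaussNB'])
#     print(results.sort_values('AUC', ascending=False))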
def automatedBagging(train_X, train_y, test_X, test_y):
    """Executes Bagging Classifier on the given Data.

    Parameters
    ----------
    train_X, test_X : numpy arrays
        Train and test Features.
    train_y, test_y : numpy array
        Train and test Targets.

    Returns
    -------
    multiclass_RocAuc_Score : float
        AUC score calculated by multiclass_RocAuc_Score.
    """
    log_it('Module: automatedBagging', 'Starting')
    param_grid = {'n_estimators': [40, 100],
                  'base_estimator__max_depth': [4, 5, 6],
                  'base_estimator__max_leaf_nodes': [10, 25],
                  'max_samples': [0.05, 0.1, 0.2, 0.5]}
    model = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                              n_jobs=core_count)
    model = run_RandomSearch(train_X, train_y, model, param_grid)
    pred = model.predict(test_X)
    return multiclass_RocAuc_Score(test_y, pred)
def automatedDecisionTree(train_X, train_y, test_X, test_y):
    """Executes Decision Tree Classifier on the given Data.

    Parameters
    ----------
    train_X, test_X : numpy arrays
        Train and test Features.
    train_y, test_y : numpy array
        Train and test Targets.

    Returns
    -------
    multiclass_RocAuc_Score : float
        AUC score calculated by multiclass_RocAuc_Score.
    """
    log_it('Module: automatedDecisionTree', 'Starting')
    param_grid = {"criterion": ["gini", "entropy"],
                  "min_samples_split": geomspace(2, 50, 10, dtype=int),
                  "max_depth": geomspace(2, 50, 8, dtype=int),
                  "min_samples_leaf": geomspace(2, 50, 8, dtype=int),
                  "max_leaf_nodes": geomspace(2, 200, 10, dtype=int)}
    model = DecisionTreeClassifier()
    model = run_RandomSearch(train_X, train_y, model, param_grid)
    pred = model.predict(test_X)
    return multiclass_RocAuc_Score(test_y, pred)
def evaluate(df):
    """Executes models and generates new features until an AUC of 1 is
    reached or the iteration limit is exceeded.

    Parameters
    ----------
    df : DataFrame
        DataFrame to be evaluated.
    """
    log_it('Module: evaluate', 'Starting.')
    folds = get_folds(df)
    predicted_algorithm = default_algorithm
    iter_control = -1
    while True:
        # Run the current set of algorithms on every fold.
        count = 1
        for train_X, train_y, test_X, test_y in folds:
            globals()['results%s' % count] = runMods(train_X, train_y,
                                                     test_X, test_y,
                                                     predicted_algorithm)
            count += 1
        # Average the per-fold results and persist them.
        df_res = xVal_Means(results1, results2, results3, results4)
        df_res['algorithm'] = df_res.index
        df_res.apply(lambda row: save_result(row['algorithm'], row['AUC']),
                     axis=1)
        log_it('Module: evaluate',
               'Iteration: ' + str(iter_control)
               + ' Result: ' + str(df_res.iloc[0]['algorithm'])
               + ' : ' + str(df_res.iloc[0]['AUC']))
        # Stop once a perfect AUC is reached or the iteration budget is spent.
        if max(list(resultsDict.values())) == 1.0 or iter_control > 6:
            log_it('Module: evaluate',
                   'Exits, Iteration: ' + str(iter_control)
                   + ' AUC: ' + str(max(list(resultsDict.values()))))
            break
        else:
            # From the second pass on, generate additional features first.
            if iter_control >= 0:
                folds = add_features(folds, iter_control)
            folds_features = get_foldFeatures(folds).reshape(1, -1)
            if folds_features[0][-1] == 1.0:
                log_it('Module: evaluate', 'Exits, only one column left.')
                break
            # Let the meta decision tree pick the next algorithm to try.
            predicted_algorithm = decission_tree.predict(folds_features) \
                .tolist()
            iter_control += 1
            log_it('Module: evaluate',
                   'Predicted: ' + str(predicted_algorithm[0])
                   + ' For iteration: ' + str(iter_control))
def automatedPassiveAgr(train_X, train_y, test_X, test_y):
    """Executes Passive Aggressive Classifier on the given Data.

    Parameters
    ----------
    train_X, test_X : numpy arrays
        Train and test Features.
    train_y, test_y : numpy array
        Train and test Targets.

    Returns
    -------
    multiclass_RocAuc_Score : float
        AUC score calculated by multiclass_RocAuc_Score.
    """
    log_it('Module: automatedPassiveAgr', 'Starting')
    model = PassiveAggressiveClassifier(fit_intercept=True, n_jobs=core_count)
    model = model.fit(train_X, train_y)
    pred = model.predict(test_X)
    return multiclass_RocAuc_Score(test_y, pred)
def automatedGaussNB(train_X, train_y, test_X, test_y):
    """Executes Gaussian naive Bayes classifier on the given Data.

    Parameters
    ----------
    train_X, test_X : numpy arrays
        Train and test Features.
    train_y, test_y : numpy array
        Train and test Targets.

    Returns
    -------
    multiclass_RocAuc_Score : float
        AUC score calculated by multiclass_RocAuc_Score.
    """
    log_it('Module: automatedGaussNB', 'Starting')
    model = GaussianNB()
    model.fit(train_X, ravel(train_y))
    pred = model.predict(test_X)
    return multiclass_RocAuc_Score(test_y, pred)
def add_features(folds, counter):
    """Controls the addition of new features to every fold.

    Parameters
    ----------
    folds : array
        Array containing the different DataSet folds.
    counter : int
        Number to specify the type of feature generation to be used.

    Returns
    -------
    resulting_folds : array
        Array containing the different DataSet folds.
    """
    log_it('Module: add_features', 'Adding Features.')
    resulting_folds = []
    for train_X, train_y, test_X, test_y in folds:
        try:
            new_train_X, new_test_X = generateFeatures(train_X, test_X,
                                                       counter)
            new_train_X = format_df(new_train_X)
            new_test_X = format_df(new_test_X)
            new_train_X, new_test_X = transformer(new_train_X, new_test_X,
                                                  train_y)
            resulting_folds.append([new_train_X, train_y, new_test_X, test_y])
        except Exception:
            # If feature generation fails, keep the fold unchanged.
            log_it('Module: add_features', 'Generation skipped.')
            resulting_folds.append([train_X, train_y, test_X, test_y])
    log_it('Module: add_features', 'Returning new folds.')
    return resulting_folds
def automatedSVM(train_X, train_y, test_X, test_y):
    """Executes Support-Vector Machine Classifier on the given Data.

    Parameters
    ----------
    train_X, test_X : numpy arrays
        Train and test Features.
    train_y, test_y : numpy array
        Train and test Targets.

    Returns
    -------
    multiclass_RocAuc_Score : float
        AUC score calculated by multiclass_RocAuc_Score.
    """
    # Apply the variance filter to the training features before fitting.
    selector = getHighVariance(train_X)
    train_X = deleteHighVariance(train_X, selector)
    if train_X.shape[1] <= 2:
        log_it('Module: Bagging automatedSVM',
               'Skipped, Shape:' + str(train_X.shape))
        return 0
    elif (train_y.shape[0] > 200) and (train_X.shape[1] > 20):
        # For larger data sets, wrap the SVC in a BaggingClassifier.
        log_it('Module: Bagging automatedSVM', 'Starting')
        param_grid = [{'base_estimator__kernel': ['rbf'],
                       'base_estimator__gamma': [1e-2, 1e-3, 1e-5],
                       'base_estimator__C': [0.1, 1, 10, 50],
                       'n_estimators': [40, 100],
                       'max_samples': [0.1, 0.2, 0.5]},
                      {'base_estimator__kernel': ['sigmoid'],
                       'base_estimator__gamma': [1e-2, 1e-3, 1e-5],
                       'base_estimator__C': [0.1, 1, 10, 50],
                       'n_estimators': [40, 100],
                       'max_samples': [0.1, 0.2, 0.5]},
                      {'base_estimator__kernel': ['linear'],
                       'base_estimator__C': [0.1, 1, 10, 50],
                       'n_estimators': [40, 100],
                       'max_samples': [0.1, 0.2, 0.5]},
                      {'base_estimator__kernel': ['poly'],
                       'base_estimator__degree': [2, 3, 4],
                       'base_estimator__C': [0.1, 1, 10, 50],
                       'n_estimators': [40, 100],
                       'max_samples': [0.1, 0.2, 0.5]}]
        model = BaggingClassifier(base_estimator=SVC(), n_jobs=core_count)
        model = run_RandomSearch(train_X, train_y, model, param_grid)
    else:
        log_it('Module: automatedSVM', 'Starting')
        param_grid = [{'kernel': ['rbf'], 'gamma': [1e-2, 1e-3, 1e-5],
                       'C': [0.1, 1, 10, 50]},
                      {'kernel': ['sigmoid'], 'gamma': [1e-2, 1e-3, 1e-5],
                       'C': [0.001, 0.10, 10, 50]},
                      {'kernel': ['linear'], 'C': [0.1, 1, 10, 50]},
                      {'kernel': ['poly'], 'degree': [2, 3, 4],
                       'C': [0.1, 1, 10, 50]}]
        model = SVC(decision_function_shape='ovr', probability=True)
        model = run_RandomSearch(train_X, train_y, model, param_grid)
    # Apply the same variance filter to the test features before predicting.
    test_X = deleteHighVariance(test_X, selector)
    pred = model.predict(test_X)
    return multiclass_RocAuc_Score(test_y, pred)
def automatedBerNB(train_X, train_y, test_X, test_y):
    """Executes Bernoulli Naive Bayes classifier on the given Data.

    Parameters
    ----------
    train_X, test_X : numpy arrays
        Train and test Features.
    train_y, test_y : numpy array
        Train and test Targets.

    Returns
    -------
    multiclass_RocAuc_Score : float
        AUC score calculated by multiclass_RocAuc_Score.
    """
    log_it('Module: automatedBerNB', 'Starting')
    model = BernoulliNB()
    param_grid = {'alpha': linspace(0.1, 1, 10)}
    model = run_RandomSearch(train_X, train_y, model, param_grid)
    pred = model.predict(test_X)
    return multiclass_RocAuc_Score(test_y, pred)
def automatedRidgeReg(train_X, train_y, test_X, test_y):
    """Executes Ridge Classifier on the given Data.

    Parameters
    ----------
    train_X, test_X : numpy arrays
        Train and test Features.
    train_y, test_y : numpy array
        Train and test Targets.

    Returns
    -------
    multiclass_RocAuc_Score : float
        AUC score calculated by multiclass_RocAuc_Score.
    """
    log_it('Module: automatedRidgeReg', 'Starting')
    param_grid = {'alpha': [1, 0.1, 0.01, 0.001, 0.0001, 0]}
    model = RidgeClassifier(fit_intercept=False)
    model = GridSearchCV(estimator=model, param_grid=param_grid)
    model.fit(train_X, train_y)
    pred = model.predict(test_X)
    return multiclass_RocAuc_Score(test_y, pred)
def automatedHistGB(train_X, train_y, test_X, test_y):
    """Executes Histogram-based Gradient Boosting Classifier.

    Parameters
    ----------
    train_X, test_X : numpy arrays
        Train and test Features.
    train_y, test_y : numpy array
        Train and test Targets.

    Returns
    -------
    multiclass_RocAuc_Score : float
        AUC score calculated by multiclass_RocAuc_Score.
    """
    log_it('Module: automatedHistGB', 'Starting')
    param_grid = {'max_iter': [1000, 1200, 1500],
                  'learning_rate': [0.1],
                  'max_depth': [25, 50, 75]}
    model = HistGradientBoostingClassifier()
    model = run_RandomSearch(train_X, train_y, model, param_grid)
    pred = model.predict(test_X)
    return multiclass_RocAuc_Score(test_y, pred)