Example #1
def automatedKNN(train_X, train_y, test_X, test_y):
    """Executes K-nearest neighbour on the given Data.

    Parameters
    ----------
    train_X, test_X : numpy arrays
        Train and test Features.
    train_y, test_y : numpy array
        Train and test Targets.

    Returns
    -------
    multiclass_RocAuc_Score: float
        AUC score calculated by multiclass_RocAuc_Score.
    """

    log_it('Module: automatedKNN', 'Starting')
    k_max = round(train_y.shape[0] / len(Counter(train_y).keys())) * 0.8
    k_range = list(dict.fromkeys(geomspace(1, k_max, 50, dtype="int")))
    if(train_y.shape[0] > 200):
        if(train_X.shape[1] > 20):
            algorithm_sel = "ball_tree"
        else:
            algorithm_sel = "kd_tree"
    else:
        algorithm_sel = "auto"
    param_grid = {'n_neighbors': k_range, 'weights': ['uniform', 'distance']}
    model = KNeighborsClassifier(algorithm=algorithm_sel, n_jobs=core_count)
    model = run_RandomSearch(train_X, train_y, model, param_grid)
    pred = model.predict(test_X)
    return multiclass_RocAuc_Score(test_y, pred)
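A quick standalone sketch of the k-grid trick used above: geomspace yields log-spaced candidates, and dict.fromkeys drops the duplicates produced by the integer cast while preserving order (the bounds below are illustrative, not values taken from the module).

from numpy import geomspace

# 50 log-spaced k candidates between 1 and an assumed k_max of 80,
# cast to int and de-duplicated in order
k_range = list(dict.fromkeys(geomspace(1, 80, 50, dtype="int")))
print(k_range)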
Example #2
def automatedLogReg(train_X, train_y, test_X, test_y):
    """Executes Logistic Regression on the given Data.

    Parameters
    ----------
    train_X, test_X : numpy arrays
        Train and test Features.
    train_y, test_y : numpy array
        Train and test Targets.

    Returns
    -------
    multiclass_RocAuc_Score: float
        AUC score calculated by multiclass_RocAuc_Score.
    """

    log_it('Module: automatedLogReg', 'Starting')
    if(train_y.shape[0] > 200):
        if(train_X.shape[1] > 20):
            solver_sel = "sag"
        else:
            solver_sel = "saga"
    else:
        solver_sel = "liblinear"
    model = LogisticRegression(multi_class="ovr",
                               solver=solver_sel,
                               n_jobs=core_count)
    param_grid = {'C': logspace(-3, 3, 7), 'penalty': ["l1", "l2"]}
    model = run_RandomSearch(train_X, train_y, model, param_grid)
    pred = model.predict(test_X)
    return multiclass_RocAuc_Score(test_y, pred)
Example #3
def automl(df, explicit=automl_explicit):
    """Main Function.

    Parameters
    ----------
    df: DataFrame
        DataFrame to be evaluated.
    explicit: boolean, optional
        If True it will also return the algorithm name.

    Returns
    -------
    algorithm_name: string, optional
        Name of the best performing algorithm (only returned when
        explicit is True).
    AUC: float
        Best AUC achieved.
    """
    global resultsDict
    log_it('Module: automl', 'Starting.')
    evaluate(df)
    best_algorithm = max(resultsDict, key=resultsDict.get)
    log_it('Module: automl',
           'Results: ' + str(best_algorithm) + ' - ' +
           str(resultsDict[best_algorithm]))
    if explicit:
        return best_algorithm, resultsDict[best_algorithm]
    return max(resultsDict.values())
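A hedged usage sketch for the entry point above, assuming the module-level configuration (automl_explicit, resultsDict, the fitted decision tree, etc.) is initialised as in the source project; the CSV path is purely illustrative.

import pandas as pd

df = pd.read_csv('my_dataset.csv')   # hypothetical input file
best_algorithm, best_auc = automl(df, explicit=True)
print(best_algorithm, best_auc)      # name of the winning algorithm and its AUC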
Example #4
def automatedSGDReg(train_X, train_y, test_X, test_y):
    """Executes Stochastic Gradient Descent Classifier.

    Parameters
    ----------
    train_X, test_X : numpy arrays
        Train and test Features.
    train_y, test_y : numpy array
        Train and test Targets.

    Returns
    -------
    multiclass_RocAuc_Score: float
        AUC score calculated by multiclass_RocAuc_Score.
    """

    log_it('Module: automatedSGDReg', 'Starting')
    param_grid = {'alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3],
                  'max_iter': [1000],
                  'loss': ['log', 'modified_huber'],
                  'penalty': ['l1', 'l2'],
                  'n_jobs': [-1]}
    model = SGDClassifier(n_jobs=core_count)
    model = run_RandomSearch(train_X, train_y, model, param_grid)
    pred = model.predict(test_X)
    return multiclass_RocAuc_Score(test_y, pred)
Example #5
def multiclass_RocAuc_Score(test_y, pred, average="macro"):
    """Calculates the AUC Score.

    Parameters
    ----------
    test_y : numpy arrays
        Test Target.
    pred : numpy array
        Predictions.
    average : string, optional
        Averaging strategy passed to roc_auc_score (default "macro").

    Returns
    -------
    roc_auc_score : float
        AUC score calculated by roc_auc_score.
    """

    log_it('Module: multiclass_RocAuc_Score', 'Starting')
    lb = LabelBinarizer()
    lb.fit(test_y)
    y_test = lb.transform(test_y)
    y_pred = lb.transform(pred)
    return roc_auc_score(y_test, y_pred, average=average,
                         multi_class="ovr")
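A minimal, self-contained illustration of the binarize-then-score idea used above (toy labels only, not data from the project):

from numpy import array
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import roc_auc_score

y_true = array([0, 1, 2, 2, 1, 0])
y_pred = array([0, 2, 2, 2, 1, 0])

# one-hot encode both label vectors so roc_auc_score can treat the
# problem as a set of one-vs-rest binary tasks
lb = LabelBinarizer().fit(y_true)
print(roc_auc_score(lb.transform(y_true), lb.transform(y_pred),
                    average="macro"))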
Example #6
def saveFinal(train_X, test_X, train_y, df_res, filename):
    """Saves the calculated results by calling the save_results function.

    Parameters
    ----------
    train_X, test_X : numpy arrays
        Train and test Features.
    train_y : numpy array
        Train Targets.
    df_res: DataFrame
        DataFrame containing the calculated results.
    filename: string
        Name of the analyzed DataSet.
    """

    log_it('DataSet: ' + filename, 'Saving results')
    df_res['algorithm'] = df_res.index
    df_res['dataset'] = filename
    df_res['num_rows'] = train_X.shape[0] + test_X.shape[0]
    df_res['num_vars'] = train_X.shape[1]
    df_res['num_clases'] = len(unique(train_y))
    df_res = df_res.sort_values(['AUC', 'time'], ascending=[False, True])
    df_res.apply(lambda row: save_results(row['dataset'], row['num_rows'],
                                          row['num_vars'], row['num_clases'],
                                          row['algorithm'], row['AUC'],
                                          row['time']), axis=1)
Example #7
def run_RandomSearch(X, y, clf, param_grid, cv=search_cv):
    """Executes K-nearest neighbour on the given Data.

    Parameters
    ----------
    X : numpy array
        Features.
    y : numpy array
        Targets.
    clf : estimator
        Scikit-learn estimator to be tuned.
    param_grid : dict or list of dicts
        Parameters to be evaluated.
    cv : int
        Number of cross-validation folds, predefined in the config_file.

    Returns
    -------
    random_search : RandomizedSearchCV
        Fitted search object whose best estimator is used for prediction.
    """

    log_it('Module: run_RandomSearch', 'Starting')
    random_search = RandomizedSearchCV(clf,
                                       param_distributions=param_grid,
                                       scoring='roc_auc_ovr',
                                       cv=cv,
                                       n_jobs=core_count)
    random_search.fit(X, y)
    return random_search
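A hedged usage sketch of run_RandomSearch on a scikit-learn toy dataset, mirroring how the automated* helpers call it; core_count and search_cv are assumed to be defined in the surrounding config file, as in the module above.

from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier

X, y = load_iris(return_X_y=True)
search = run_RandomSearch(X, y, KNeighborsClassifier(),
                          {'n_neighbors': list(range(1, 30, 2)),
                           'weights': ['uniform', 'distance']})
print(search.best_params_)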
Example #8
def automatedRandomForest(train_X, train_y, test_X, test_y):
    """Executes Random Forest Classifier on the given Data.

    Parameters
    ----------
    train_X, test_X : numpy arrays
        Train and test Features.
    train_y, test_y : numpy array
        Train and test Targets.

    Returns
    -------
    multiclass_RocAuc_Score: float
        AUC score calculated by multiclass_RocAuc_Score.
    """

    log_it('Module: automatedRandomForest', 'Starting')
    param_grid = {'bootstrap': [True],
                  'max_depth': [50, 100, 200],
                  'min_samples_leaf': [2, 4, 8],
                  'min_samples_split': [2, 4, 8, 12],
                  'n_estimators': [100, 500, 1000]}
    model = RandomForestClassifier()
    model = run_RandomSearch(train_X, train_y, model, param_grid)
    pred = model.predict(test_X)
    return multiclass_RocAuc_Score(test_y, pred)
Example #9
def runMods(train_X, train_y, test_X, test_y,
            list_of_mdls=training_mdls):
    """Executes all algorithms in the list for the given Data.

    Parameters
    ----------
    train_X, test_X : numpy arrays
        Train and test Features.
    train_y, test_y : numpy array
        Train and test Targets.
    list_of_mdls: list, optional
        List with the function names to be executed.
        Default set to all algorithms.

    Returns
    -------
    Results : DataFrame
        DataFrame with the AUC and runtime of each executed function.
    """

    log_it('Module: runMods', 'Starting')
    Results = pd.DataFrame(columns=['AUC', 'time'])
    for mdl in list_of_mdls:
        start = time()
        row = pd.Series([eval(mdl + '(train_X, train_y, test_X, test_y)'),
                         time()-start], index=['AUC', 'time'], name=mdl)
        Results = pd.concat([Results, row.to_frame().T])
    return Results
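The eval() call above resolves each model name at run time. A hedged alternative sketch that dispatches through an explicit name-to-function mapping instead of eval (the mapping is illustrative and lists only a few of the module's functions):

model_dispatch = {
    'automatedKNN': automatedKNN,
    'automatedLogReg': automatedLogReg,
    'automatedGaussNB': automatedGaussNB,
}

def run_by_name(name, train_X, train_y, test_X, test_y):
    # look the function up by name and call it with the fold data
    return model_dispatch[name](train_X, train_y, test_X, test_y)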
Example #10
def automatedBagging(train_X, train_y, test_X, test_y):
    """Executes Bagging Classifier on the given Data.

    Parameters
    ----------
    train_X, test_X : numpy arrays
        Train and test Features.
    train_y, test_y : numpy array
        Train and test Targets.

    Returns
    -------
    multiclass_RocAuc_Score: float
        AUC score calculated by multiclass_RocAuc_Score.
    """

    log_it('Module: automatedBagging', 'Starting')
    param_grid = {'n_estimators': [40, 100],
                  'base_estimator__max_depth': [4, 5, 6],
                  'base_estimator__max_leaf_nodes': [10, 25],
                  'max_samples': [0.05, 0.1, 0.2, 0.5]}
    model = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                              n_jobs=core_count)
    model = run_RandomSearch(train_X, train_y, model, param_grid)
    pred = model.predict(test_X)
    return multiclass_RocAuc_Score(test_y, pred)
Example #11
def automatedDecisionTree(train_X, train_y, test_X, test_y):
    """Executes Decision Tree Classifier on the given Data.

    Parameters
    ----------
    train_X, test_X : numpy arrays
        Train and test Features.
    train_y, test_y : numpy array
        Train and test Targets.

    Returns
    -------
    multiclass_RocAuc_Score: float
        AUC score calculated by multiclass_RocAuc_Score.
    """

    log_it('Module: automatedDecisionTree', 'Starting')
    param_grid = {"criterion": ["gini", "entropy"],
                  "min_samples_split": geomspace(2, 50, 10, dtype=int),
                  "max_depth": geomspace(2, 50, 8, dtype=int),
                  "min_samples_leaf": geomspace(2, 50, 8, dtype=int),
                  "max_leaf_nodes": geomspace(2, 200, 10, dtype=int)}
    model = DecisionTreeClassifier()
    model = run_RandomSearch(train_X, train_y, model, param_grid)
    pred = model.predict(test_X)
    return multiclass_RocAuc_Score(test_y, pred)
Example #12
def evaluate(df):
    """Function to execute models and generating new features
    until AUC == 1 or the maximum time has elapsed.

    Parameters
    ----------
    df: DataFrame
        DataFrame to be evaluated.
    """
    log_it('Module: evaluate', 'Starting.')
    folds = get_folds(df)
    predicted_algorithm = default_algorithm
    iter_control = -1

    while True:
        count = 1
        for train_X, train_y, test_X, test_y in folds:
            globals()['results%s' % count] = runMods(train_X, train_y, test_X,
                                                     test_y,
                                                     predicted_algorithm)
            count += 1
        df_res = xVal_Means(results1, results2, results3, results4)
        df_res['algorithm'] = df_res.index
        df_res.apply(lambda row: save_result(row['algorithm'], row['AUC']),
                     axis=1)
        log_it(
            'Module: evaluate', 'Iteration: ' + str(iter_control) +
            ' Result: ' + str(df_res.iloc[0]['algorithm']) + ' : ' +
            str(df_res.iloc[0]['AUC']))
        if max(list(resultsDict.values())) == 1.0 or iter_control > 6:
            log_it(
                'Module: evaluate', 'Exits, Iteration: ' + str(iter_control) +
                ' AUC: ' + str(max(list(resultsDict.values()))))
            break
        else:
            if iter_control >= 0:
                folds = add_features(folds, iter_control)
            folds_features = get_foldFeatures(folds).reshape(1, -1)
            if folds_features[0][-1] == 1.0:
                log_it('Module: evaluate', 'Exits, only one column left.')
                break
            predicted_algorithm = decission_tree.predict(folds_features) \
                .tolist()
            iter_control += 1
            log_it(
                'Module: evaluate',
                'Predicted: ' + str(predicted_algorithm[0]) +
                ' For iteration: ' + str(iter_control))
Example #13
def automatedPassiveAgr(train_X, train_y, test_X, test_y):
    """Executes Passive Aggressive Classifier on the given Data.

    Parameters
    ----------
    train_X, test_X : numpy arrays
        Train and test Features.
    train_y, test_y : numpy array
        Train and test Targets.

    Returns
    -------
    multiclass_RocAuc_Score: float
        AUC score calculated by multiclass_RocAuc_Score.
    """

    log_it('Module: automatedPassiveAgr', 'Starting')
    model = PassiveAggressiveClassifier(fit_intercept=True, n_jobs=core_count)
    model = model.fit(train_X, train_y)
    pred = model.predict(test_X)
    return multiclass_RocAuc_Score(test_y, pred)
Example #14
def automatedGaussNB(train_X, train_y, test_X, test_y):
    """Executes Gaussian naive Bayes classifier on the given Data.

    Parameters
    ----------
    train_X, test_X : numpy arrays
        Train and test Features.
    train_y, test_y : numpy array
        Train and test Targets.

    Returns
    -------
    multiclass_RocAuc_Score: float
        AUC score calculated by multiclass_RocAuc_Score.
    """

    log_it('Module: automatedGaussNB', 'Starting')
    model = GaussianNB()
    model.fit(train_X, ravel(train_y))
    pred = model.predict(test_X)
    return multiclass_RocAuc_Score(test_y, pred)
Example #15
def add_features(folds, counter):
    """Controls the addition of new features to every fold.

    Parameters
    ----------
    folds : array
        Array containing the different DataSet folds.
    counter : int
        Number to specify the type of feature generation to be used.

    Returns
    -------
    resulting_folds : array
        Array containing the different DataSet folds.
    """
    log_it('Module: add_features', 'Adding Features.')
    resulting_folds = []
    for train_X, train_y, test_X, test_y in folds:
        try:
            new_train_X, new_test_X = generateFeatures(train_X, test_X,
                                                       counter)
            new_train_X = format_df(new_train_X)
            new_test_X = format_df(new_test_X)
            new_train_X, new_test_X = transformer(new_train_X, new_test_X,
                                                  train_y)
            resulting_folds.append([new_train_X, train_y, new_test_X, test_y])
        except Exception:
            log_it('Module: add_features', 'Generation skipped.')
            resulting_folds.append([train_X, train_y, test_X, test_y])
    log_it('Module: add_features', 'Returning new folds.')
    return resulting_folds
Example #16
def automatedSVM(train_X, train_y, test_X, test_y):
    """Executes Support-Vector Machine Classifier on the given Data.

    Parameters
    ----------
    train_X, test_X : numpy arrays
        Train and test Features.
    train_y, test_y : numpy array
        Train and test Targets.

    Returns
    -------
    multiclass_RocAuc_Score: float
        AUC score calculated by multiclass_RocAuc_Score.
    """

    selector = getHighVariance(train_X)
    train_X = deleteHighVariance(train_X, selector)
    if(train_X.shape[1] <= 2):
        log_it('Module: Bagging automatedSVM',
               'Skipped, Shape:'+str(train_X.shape))
        return 0
    elif(train_y.shape[0] > 200 & (train_X.shape[1] > 20)):
        log_it('Module: Bagging automatedSVM', 'Starting')
        param_grid = [{'base_estimator__kernel': ['rbf'],
                       'base_estimator__gamma': [1e-2, 1e-3, 1e-5],
                       'base_estimator__C': [0.1, 1, 10, 50],
                       'n_estimators': [40, 100],
                       'max_samples': [0.1, 0.2, 0.5]},
                      {'base_estimator__kernel': ['sigmoid'],
                       'base_estimator__gamma': [1e-2, 1e-3,  1e-5],
                       'base_estimator__C': [0.1, 1, 10, 50],
                       'n_estimators': [40, 100],
                       'max_samples': [0.1, 0.2, 0.5]},
                      {'base_estimator__kernel': ['linear'],
                       'base_estimator__C': [0.1, 1, 10, 50],
                       'n_estimators': [40, 100],
                       'max_samples': [0.1, 0.2, 0.5]},
                      {'base_estimator__kernel': ['poly'],
                       'base_estimator__degree': [2, 3, 4],
                       'base_estimator__C': [0.1, 1, 10, 50],
                       'n_estimators': [40, 100],
                       'max_samples': [0.1, 0.2, 0.5]}]
        model = BaggingClassifier(base_estimator=SVC(), n_jobs=core_count)
        model = run_RandomSearch(train_X, train_y, model, param_grid)
    else:
        log_it('Module: automatedSVM', 'Starting')
        param_grid = [{'kernel': ['rbf'], 'gamma': [1e-2, 1e-3,  1e-5],
                       'C': [0.1, 1, 10, 50]},
                      {'kernel': ['sigmoid'], 'gamma': [1e-2, 1e-3,  1e-5],
                       'C': [0.001, 0.10,  10, 50]},
                      {'kernel': ['linear'], 'C': [0.1, 1, 10, 50]},
                      {'kernel': ['poly'], 'degree': [2, 3, 4],
                       'C': [0.1, 1, 10, 50]}]
        model = SVC(decision_function_shape='ovr', probability=True)
        model = run_RandomSearch(train_X, train_y, model, param_grid)
    test_X = deleteHighVariance(test_X, selector)
    pred = model.predict(test_X)
    return multiclass_RocAuc_Score(test_y, pred)
Example #17
def automatedBerNB(train_X, train_y, test_X, test_y):
    """Executes Bernoulli Naive Bayes classifier on the given Data.

    Parameters
    ----------
    train_X, test_X : numpy arrays
        Train and test Features.
    train_y, test_y : numpy array
        Train and test Targets.

    Returns
    -------
    multiclass_RocAuc_Score: float
        AUC score calculated by multiclass_RocAuc_Score.
    """

    log_it('Module: automatedBerNB', 'Starting')
    model = BernoulliNB()
    param_grid = {'alpha': linspace(0.1, 1, 10)}
    model = run_RandomSearch(train_X, train_y, model, param_grid)
    pred = model.predict(test_X)
    return multiclass_RocAuc_Score(test_y, pred)
Example #18
def automatedRidgeReg(train_X, train_y, test_X, test_y):
    """Executes Ridge Classifier on the given Data.

    Parameters
    ----------
    train_X, test_X : numpy arrays
        Train and test Features.
    train_y, test_y : numpy array
        Train and test Targets.

    Returns
    -------
    multiclass_RocAuc_Score: float
        AUC score calculated by multiclass_RocAuc_Score.
    """

    log_it('Module: automatedRidgeReg', 'Starting')
    param_grid = {'alpha': [1, 0.1, 0.01, 0.001, 0.0001, 0]}
    model = RidgeClassifier(fit_intercept=False)
    model = GridSearchCV(estimator=model, param_grid=param_grid)
    model.fit(train_X, train_y)
    pred = model.predict(test_X)
    return multiclass_RocAuc_Score(test_y, pred)
Example #19
def automatedHistGB(train_X, train_y, test_X, test_y):
    """Executes Histogram-based Gradient Boosting Classifier.

    Parameters
    ----------
    train_X, test_X : numpy arrays
        Train and test Features.
    train_y, test_y : numpy array
        Train and test Targets.

    Returns
    -------
    multiclass_RocAuc_Score: float
        AUC score calculated by multiclass_RocAuc_Score.
    """

    log_it('Module: automatedHistGB', 'Starting')
    param_grid = {'max_iter': [1000, 1200, 1500],
                  'learning_rate': [0.1],
                  'max_depth': [25, 50, 75]}
    model = HistGradientBoostingClassifier()
    model = run_RandomSearch(train_X, train_y, model, param_grid)
    pred = model.predict(test_X)
    return multiclass_RocAuc_Score(test_y, pred)