Example #1
import matplotlib.pyplot as plt

def knn_test_several_params(trnX, tstX, trnY, tstY, multi_class, plot=False):
    # Grid-search KNN over neighbour counts and distance metrics, keeping
    # the model with the best AUC on the test split.
    nvalues = [1, 3]
    dist = ["manhattan", "euclidean", "chebyshev"]
    values = {}
    best_n_value = 0
    best_dist = ""
    best_score = 0
    best_model = None
    for d in dist:
        yvalues = []
        for n in nvalues:
            knn = knn_model(trnX, trnY, n, d)
            score = calculte_models_auc_score(knn, tstX, tstY, multi_class)
            if score > best_score:
                best_score = score
                best_n_value = n
                best_dist = d
                best_model = knn
            yvalues.append(score)
        values[d] = yvalues
    if plot:
        plt.figure()
        multiple_line_chart(plt.gca(),
                            nvalues,
                            values,
                            "KNN variants",
                            "n",
                            "Sensitivty",
                            percentage=True)
        plt.show()
    return best_model, best_score, best_dist, best_n_value
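The knn_model and calculte_models_auc_score helpers come from elsewhere in the project and are not shown in this listing. A minimal sketch of plausible implementations, assuming they are thin wrappers around scikit-learn (the names and signatures follow the calls above; the bodies are reconstructions, not the original code):

from sklearn.metrics import roc_auc_score
from sklearn.neighbors import KNeighborsClassifier

def knn_model(trnX, trnY, n, d):
    # Assumed: fit a KNN classifier with n neighbours and distance metric d.
    knn = KNeighborsClassifier(n_neighbors=n, metric=d)
    knn.fit(trnX, trnY)
    return knn

def calculte_models_auc_score(model, tstX, tstY, multi_class):
    # Assumed: score a fitted model on the test split via ROC AUC.
    proba = model.predict_proba(tstX)
    if proba.shape[1] == 2:
        # Binary problems: roc_auc_score expects the positive-class column.
        return roc_auc_score(tstY, proba[:, 1])
    return roc_auc_score(tstY, proba, multi_class=multi_class)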
Example #2
import xgboost as xgb

def xg_boost(trnX, tstX, trnY, tstY, multi_class, plot=False):
    # Grid-search XGBoost over learning rates, tree depths, and estimator
    # counts, keeping the model with the best AUC on the test split.
    best_score = 0
    best_model = None
    best_learning_rate = 0
    best_depth = 0
    best_estimator = 0
    learning_rates = [0.1, 0.05, 0.01]
    max_depths = [5, 25]
    n_estimators = [10, 50, 100, 200, 300, 400, 500]
    if plot:
        fig, axs = plt.subplots(1,
                                len(max_depths),
                                figsize=(16, 4),
                                squeeze=False)
    for k in range(len(max_depths)):
        max_depth = max_depths[k]
        values = {}
        for learning_rate in learning_rates:
            yvalues = []
            for n in n_estimators:
                model = xgb.XGBClassifier(
                    max_depth=max_depth,
                    learning_rate=learning_rate,
                    n_estimators=n,
                    early_stopping_rounds=10,
                )
                # early_stopping_rounds requires an evaluation set at fit
                # time; without one, recent xgboost versions raise an error.
                model.fit(trnX, trnY, eval_set=[(tstX, tstY)], verbose=False)
                score = calculte_models_auc_score(model, tstX, tstY,
                                                  multi_class)
                if score > best_score:
                    best_score = score
                    best_model = model
                    best_learning_rate = learning_rate
                    best_depth = max_depth
                    best_estimator = n
                yvalues.append(score)
            values[learning_rate] = yvalues
        if plot:
            # Draw each panel once, after all learning rates for this depth
            # have been scored (plotting inside the loop overdraws the axes).
            multiple_line_chart(
                axs[0, k],
                n_estimators,
                values,
                "XGBoost with max_depth=%s" % max_depth,
                "Number of estimators",
                "AUC",
                percentage=True,
            )
    if plot:
        plt.show()
    return best_model, best_score, best_learning_rate, best_depth, best_estimator
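A hypothetical call, assuming the data was already split (e.g. with scikit-learn's train_test_split). Note that using the test split both for early stopping and for model selection leaks information; that is tolerable for this kind of exploratory chart but not for a final evaluation:

model, score, lr, depth, n_est = xg_boost(trnX, tstX, trnY, tstY,
                                          multi_class="ovr", plot=True)
print("Best XGBoost: lr=%.2f, depth=%d, estimators=%d, AUC=%.3f"
      % (lr, depth, n_est, score))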
Example #3
def dt_plot_accuracy(trnX, tstX, trnY, tstY, multi_class, plot=False):
    # Grid-search a decision tree over split criteria, depths, and minimum
    # leaf sizes, keeping the model with the best AUC on the test split.
    best_samples_leaf = 0
    best_depth = 0
    best_criteria = ""
    best_score = 0
    best_model = None
    min_samples_leaf = [.01]
    max_depths = [10]
    criteria = ["entropy", "gini"]
    if plot:
        fig, axs = plt.subplots(1, 2, figsize=(16, 4), squeeze=False)
    for k in range(len(criteria)):
        f = criteria[k]
        values = {}
        for d in max_depths:
            yvalues = []
            for n in min_samples_leaf:
                tree = decision_tree(trnX,
                                     trnY,
                                     samples_leaf=n,
                                     depth=d,
                                     criterion=f)
                score = calculte_models_auc_score(tree, tstX, tstY,
                                                  multi_class)
                if score > best_score:
                    best_score = score
                    best_depth = d
                    best_criteria = f
                    best_samples_leaf = n
                    best_model = tree
                yvalues.append(score)
            values[d] = yvalues
        if plot:
            multiple_line_chart(
                axs[0, k],
                min_samples_leaf,
                values,
                "%s criterion" % f,
                "min samples leaf",
                "AUC",
                percentage=True,
            )
    if plot:
        plt.show()
    return best_model, best_score, best_samples_leaf, best_depth, best_criteria
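The decision_tree helper is also external to this listing; a plausible reconstruction, assuming it wraps scikit-learn's DecisionTreeClassifier (the keyword names mirror the call above, the body is an assumption):

from sklearn.tree import DecisionTreeClassifier

def decision_tree(trnX, trnY, samples_leaf=0.01, depth=10, criterion="entropy"):
    # A float min_samples_leaf is read by scikit-learn as a fraction of the
    # training samples, which matches the .01 value in the grid above.
    tree = DecisionTreeClassifier(min_samples_leaf=samples_leaf,
                                  max_depth=depth,
                                  criterion=criterion)
    tree.fit(trnX, trnY)
    return tree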
Example #4
def rf_test_different_params(trnX, tstX, trnY, tstY, multi_class, plot=False):
    # Grid-search a random forest over feature-sampling strategies, keeping
    # the model with the best AUC on the test split.
    best_numb_estimator = 0
    best_depth = 0
    best_feature = ""
    best_score = 0
    best_model = None
    n_estimators = [100]
    max_depths = [10]
    max_features = ["sqrt", "log2"]

    if plot:
        fig, axs = plt.subplots(1, 2, figsize=(10, 4), squeeze=False)
    for k in range(len(max_features)):
        f = max_features[k]
        values = {}
        for d in max_depths:
            yvalues = []
            for n in n_estimators:
                rf = random_forest(trnX, trnY, n, d, f)
                score = calculte_models_auc_score(rf, tstX, tstY, multi_class)
                if score > best_score:
                    best_score = score
                    best_depth = d
                    best_numb_estimator = n
                    best_feature = f
                    best_model = rf
                yvalues.append(score)
            values[d] = yvalues
        if plot:
            multiple_line_chart(
                axs[0, k],
                n_estimators,
                values,
                "RF with %s features" % f,
                "nr estimators",
                "Sensitivity",
                percentage=True,
            )
    if plot:
        plt.show()
    return best_model, best_score, best_numb_estimator, best_depth, best_feature
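Likewise, random_forest presumably wraps scikit-learn's RandomForestClassifier; a minimal sketch under that assumption:

from sklearn.ensemble import RandomForestClassifier

def random_forest(trnX, trnY, n_estimators, max_depth, max_features):
    # Assumed wrapper: fit a forest with the given size, depth, and per-split
    # feature-sampling strategy ("sqrt" or "log2" in the grid above).
    rf = RandomForestClassifier(n_estimators=n_estimators,
                                max_depth=max_depth,
                                max_features=max_features)
    rf.fit(trnX, trnY)
    return rf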
Example #5
def histogram_with_distributions(ax: plt.Axes, series: pd.Series, var: str):
    # Plot a density histogram of the series and overlay the best-fitting
    # known distributions on top of it.
    values = series.sort_values().values
    _, bins, _ = ax.hist(values, 20, density=True, edgecolor="grey")
    distributions = compute_known_distributions(values, bins)
    multiple_line_chart(ax, values, distributions, "Best fit for %s" % var,
                        var, "probability")