def knn_test_several_params(trnX, tstX, trnY, tstY, multi_class, plot=False):
    """Grid-search a KNN classifier over n_neighbors and distance metrics.

    Trains one model per (metric, n_neighbors) pair on the train split and
    scores it with the project's AUC helper on the test split, tracking the
    best combination seen.

    Args:
        trnX, trnY: training features / labels.
        tstX, tstY: test features / labels used for scoring.
        multi_class: forwarded to calculte_models_auc_score.
        plot: when True, draw one score curve per distance metric.

    Returns:
        (best_model, best_score, best_dist, best_n_value)
    """
    nvalues = [1, 3]
    dist = ["manhattan", "euclidean", "chebyshev"]
    values = {}
    best_n_value = 0
    # best_dist holds a metric name (string); None is a clearer "not yet
    # found" sentinel than the original 0.
    best_dist = None
    best_score = 0
    best_model = None
    for d in dist:
        yvalues = []
        for n in nvalues:
            knn = knn_model(trnX, trnY, n, d)
            score = calculte_models_auc_score(knn, tstX, tstY, multi_class)
            if score > best_score:
                best_score = score
                best_n_value = n
                best_dist = d
                best_model = knn
            yvalues.append(score)
        values[d] = yvalues
    if plot:
        plt.figure()
        # Label typo fixed ("Sensitivty" -> "Sensitivity") to match the
        # axis labels used by the other model-selection plots in this file.
        multiple_line_chart(plt.gca(), nvalues, values, "KNN variants", "n",
                            "Sensitivity", percentage=True)
        plt.show()
    return best_model, best_score, best_dist, best_n_value
def xg_boost(trnX, tstX, trnY, tstY, multi_class, plot=False):
    """Grid-search XGBoost over learning rate, max depth and n_estimators.

    Each candidate model is scored with the project's AUC helper on the test
    split; the best (score-wise) configuration is tracked and returned.

    Args:
        trnX, trnY: training features / labels.
        tstX, tstY: test features / labels, used both for scoring and as the
            early-stopping validation set.
        multi_class: forwarded to calculte_models_auc_score.
        plot: when True, draw one subplot per max_depth value.

    Returns:
        (best_model, best_score, best_learning_rate, best_depth, best_estimator)
    """
    best_score = 0
    best_model = None
    best_learning_rate = 0
    best_depth = 0
    best_estimator = 0
    learning_rates = [0.1, 0.05, 0.01]
    max_depths = [5, 25]
    n_estimators = [10, 50, 100, 200, 300, 400, 500]
    if plot:
        # plt.subplots() creates its own figure; the previous extra
        # plt.figure() call only opened a stray empty window.
        fig, axs = plt.subplots(1, len(max_depths), figsize=(16, 4), squeeze=False)
    for k, max_depth in enumerate(max_depths):
        values = {}
        for learning_rate in learning_rates:
            yvalues = []
            for n in n_estimators:
                model = xgb.XGBClassifier(
                    max_depth=max_depth,
                    learning_rate=learning_rate,
                    n_estimators=n,
                    early_stopping_rounds=10,
                )
                # early_stopping_rounds requires a validation set at fit
                # time; without eval_set recent xgboost versions raise
                # "Must have at least 1 validation dataset for early
                # stopping". Use the test split, which is already what
                # the models are scored on below.
                model.fit(trnX, trnY, eval_set=[(tstX, tstY)], verbose=False)
                score = calculte_models_auc_score(model, tstX, tstY, multi_class)
                if score > best_score:
                    best_score = score
                    best_model = model
                    best_learning_rate = learning_rate
                    best_depth = max_depth
                    best_estimator = n
                yvalues.append(score)
            values[learning_rate] = yvalues
        if plot:
            multiple_line_chart(
                axs[0, k],
                n_estimators,
                values,
                "XG Boost with %s depth" % max_depth,
                "Number of estimators",
                "Sensitivity",
                percentage=True,
            )
    if plot:
        plt.show()
    return best_model, best_score, best_learning_rate, best_depth, best_estimator
def dt_plot_accuracy(trnX, tstX, trnY, tstY, multi_class, plot=False):
    """Grid-search a decision tree over criterion, depth and min-samples-leaf.

    Scores each candidate tree with the project's AUC helper on the test
    split and keeps the best configuration.

    Args:
        trnX, trnY: training features / labels.
        tstX, tstY: test features / labels used for scoring.
        multi_class: forwarded to calculte_models_auc_score.
        plot: when True, draw one subplot per split criterion.

    Returns:
        (best_model, best_score, best_samples_leaf, best_depth, best_criteria)
    """
    best_samples_leaf = 0
    best_depth = 0
    best_criteria = ""
    best_score = 0
    best_model = None
    min_samples_leaf = [.01]
    max_depths = [10]
    criteria = ["entropy", "gini"]
    if plot:
        # plt.subplots() creates its own figure; the previous extra
        # plt.figure() call only opened a stray empty window.
        fig, axs = plt.subplots(1, 2, figsize=(16, 4), squeeze=False)
    for k, f in enumerate(criteria):
        values = {}
        for d in max_depths:
            yvalues = []
            for n in min_samples_leaf:
                tree = decision_tree(trnX, trnY, samples_leaf=n, depth=d, criterion=f)
                score = calculte_models_auc_score(tree, tstX, tstY, multi_class)
                if score > best_score:
                    best_score = score
                    best_depth = d
                    best_criteria = f
                    best_samples_leaf = n
                    best_model = tree
                yvalues.append(score)
            values[d] = yvalues
        if plot:
            multiple_line_chart(
                axs[0, k],
                min_samples_leaf,
                values,
                "%s criteria" % f,
                "min sample leaf",
                "Sensitivity",
                percentage=True,
            )
    if plot:
        plt.show()
    return best_model, best_score, best_samples_leaf, best_depth, best_criteria
def rf_test_different_params(trnX, tstX, trnY, tstY, multi_class, plot=False):
    """Grid-search a random forest over max_features, depth and n_estimators.

    Scores each candidate forest with the project's AUC helper on the test
    split and keeps the best configuration.

    Args:
        trnX, trnY: training features / labels.
        tstX, tstY: test features / labels used for scoring.
        multi_class: forwarded to calculte_models_auc_score.
        plot: when True, draw one subplot per max_features option.

    Returns:
        (best_model, best_score, best_numb_estimator, best_depth, best_feature)
    """
    best_numb_estimator = 0
    best_depth = 0
    best_feature = ""
    best_score = 0
    best_model = None
    n_estimators = [100]
    max_depths = [10]
    max_features = ["sqrt", "log2"]
    if plot:
        # plt.subplots() creates its own figure; the previous extra
        # plt.figure() call only opened a stray empty window.
        fig, axs = plt.subplots(1, 2, figsize=(10, 4), squeeze=False)
    for k, f in enumerate(max_features):
        values = {}
        for d in max_depths:
            yvalues = []
            for n in n_estimators:
                rf = random_forest(trnX, trnY, n, d, f)
                score = calculte_models_auc_score(rf, tstX, tstY, multi_class)
                if score > best_score:
                    best_score = score
                    best_depth = d
                    best_numb_estimator = n
                    best_feature = f
                    best_model = rf
                yvalues.append(score)
            values[d] = yvalues
        if plot:
            multiple_line_chart(
                axs[0, k],
                n_estimators,
                values,
                "RF with %s features" % f,
                "nr estimators",
                "Sensitivity",
                percentage=True,
            )
    if plot:
        plt.show()
    return best_model, best_score, best_numb_estimator, best_depth, best_feature
def histogram_with_distributions(ax: plt.Axes, series: pd.Series, var: str):
    """Draw a density histogram of *series* on *ax* and overlay the
    best-fitting known distributions computed from the histogram bins."""
    data = series.sort_values().values
    # 20 equal-width bins, normalized so the y-axis is a probability density.
    _, bin_edges, _ = ax.hist(data, 20, density=True, edgecolor="grey")
    fitted = compute_known_distributions(data, bin_edges)
    title = "Best fit for %s" % var
    multiple_line_chart(ax, data, fitted, title, var, "probability")