def show_learning_curve(
    est: BaseEstimator,
    conf_mat_labels: List,
    X_train: DataFrame,
    y_train: Series,
    X_test: DataFrame,
    y_test: Series,
    scoring_metric: str = "f1_micro",
    cv: StratifiedKFold = StratifiedKFold(n_splits=12),
    sizes: np.ndarray = np.linspace(0.3, 1.0, 10),
    fig_size: Tuple = (8, 8),
    savefig: Path = Path().cwd() / "reports" / "figures" / "cm.png",
) -> None:
    """Plot the learning curve of ``est`` and save it if no file exists yet.

    Parameters
    ----------
    est : estimator handed to ``LearningCurve``.
    conf_mat_labels : unused; kept so existing callers keep working.
    X_train, y_train : data the learning curve is computed on.
    X_test, y_test : unused; kept so existing callers keep working.
    scoring_metric : passed to ``LearningCurve`` as ``scoring``.
    cv : cross-validation splitter passed to ``LearningCurve``.
    sizes : training-set fractions passed as ``train_sizes``.
    fig_size : ``figsize`` of the created matplotlib figure.
    savefig : target path; the figure is written only when no file
        exists there already.
    """
    fig, ax = plt.subplots(figsize=fig_size)
    # One visualizer carrying *all* settings. Previously a second
    # LearningCurve(est, classes=..., ax=ax) replaced the configured one,
    # so cv / scoring / train_sizes / n_jobs were silently discarded.
    curve = LearningCurve(
        est,
        ax=ax,
        cv=cv,
        scoring=scoring_metric,
        train_sizes=sizes,
        n_jobs=-1,
    )
    curve.fit(X_train, y_train)
    # No score() call: it does not contribute to the drawn curve.
    curve.finalize()
    if not savefig.is_file():
        fig.savefig(savefig, bbox_inches="tight", dpi=300)
def evaluation(estimator, X, Y, x, y):
    """Show a learning curve, a confusion matrix and a ROC/AUC plot in one row.

    ``X``/``Y`` are used for fitting the ROC/AUC visualizer and the learning
    curve; ``x``/``y`` are the data every ``score`` call is made on.
    The confusion matrix is scored without a preceding ``fit`` call.
    """
    class_labels = [Y[1], Y[0]]
    _, (curve_ax, matrix_ax, roc_ax) = plt.subplots(1, 3, figsize=(18, 6))

    # Confusion matrix (middle panel)
    display_names = {0.0: 'Negativo', 1.0: 'Positivo'}
    matrix_viz = ConfusionMatrix(
        model=estimator,
        ax=matrix_ax,
        classes=class_labels,
        label_encoder=display_names,
    )
    matrix_viz.score(x, y)

    # ROC/AUC (right panel)
    roc_viz = ROCAUC(model=estimator, ax=roc_ax)
    roc_viz.fit(X, Y)
    roc_viz.score(x, y)

    # Learning curve (left panel)
    folds = StratifiedKFold(n_splits=3)
    train_fractions = np.linspace(0.3, 1.0, 10)
    curve_viz = LearningCurve(
        estimator,
        ax=curve_ax,
        cv=folds,
        scoring='roc_auc',
        train_sizes=train_fractions,
        n_jobs=4,
    )
    curve_viz.fit(X, Y)

    # Render in the same order as before: matrix, ROC, learning curve.
    matrix_viz.poof()
    roc_viz.poof()
    curve_viz.poof()
    plt.show()
def plot_learning_curve(X, y, model, figPath=None):
    """Plot a learning curve for a model, optionally saving it to a file.

    Args:
        X (numpy array): Features for the evaluation.
        y (numpy array): Targets for the evaluation.
        model (sklearn Model): The model to visualize.
        figPath (str): Directory to save the figure in; the file is written
            as ``<figPath>/learning.png``. ``None`` shows the figure
            without saving it.
    """
    # Stratified, shuffled folds so each split keeps the class proportions.
    cv = StratifiedKFold(n_splits=NUM_FOLDS, random_state=SEED, shuffle=True)
    visualizer = LearningCurve(
        model,
        cv=cv,
        scoring='f1_weighted',
        train_sizes=np.linspace(0.3, 1.0, 10),
        n_jobs=-1,
    )
    visualizer.fit(X, y)

    # Identity check: `== None` would dispatch to a custom __eq__.
    if figPath is None:
        visualizer.show()
    else:
        visualizer.show(outpath=f'{figPath}/learning.png')
def draw_learning_curve(self, cv, scoring='accuracy', n_jobs=5):
    """Fit a LearningCurve on the instance's training data and render it.

    Uses ``self.model`` as the estimator and ``self.training_data`` /
    ``self.training_labels`` as the data; ``cv``, ``scoring`` and ``n_jobs``
    are forwarded unchanged to ``LearningCurve``.
    """
    curve = LearningCurve(
        self.model,
        cv=cv,
        scoring=scoring,
        n_jobs=n_jobs,
    )
    curve.fit(self.training_data, self.training_labels)
    curve.poof()
def _fit_score_save(viz, X, y, X_test, y_test, outpath, **show_kwargs):
    """Fit ``viz`` on (X, y), score it on (X_test, y_test) and save the figure."""
    viz.fit(X, y)
    viz.score(X_test, y_test)
    viz.show(outpath=outpath, **show_kwargs)


def eva_model(c, n, X, y, X_test, y_test, class_names, outdir):
    """Evaluate an RFE-wrapped LinearSVC and write diagnostic plots to ``outdir``.

    Parameters
    ----------
    c : value passed to ``LinearSVC`` as ``C``.
    n : number of features RFE keeps (``n_features_to_select``).
    X, y : training data; ``X`` must expose ``.columns``.
    X_test, y_test : data every visualizer is scored on.
    class_names : class labels passed to the visualizers.
    outdir : directory receiving LC/CR/CM/PRC/CPE/RA ``.png`` files and
        ``features.csv`` (tab-separated list of retained features).

    Returns
    -------
    The weighted F1 score of the refitted RFE model on ``(X_test, y_test)``.
    """
    model = svm.LinearSVC(class_weight='balanced', dual=False, max_iter=10000, C=c)
    rfe = RFE(model, n_features_to_select=n)

    # Learning curve (cross-validated on the training data only).
    plt.clf()
    viz_LC = LearningCurve(rfe, scoring='f1_weighted', n_jobs=4)
    viz_LC.fit(X, y)
    viz_LC.show(outpath=outdir + '/LC.png')

    # Classification report
    plt.clf()
    _fit_score_save(
        ClassificationReport(rfe, classes=class_names, support=True),
        X, y, X_test, y_test, outdir + '/CR.png',
    )

    # Confusion matrix
    plt.clf()
    _fit_score_save(
        ConfusionMatrix(rfe, classes=class_names),
        X, y, X_test, y_test, outdir + '/CM.png',
    )

    # Precision-recall curve
    # NOTE(review): `size` is handed to show() here but to the constructor
    # for ROCAUC below — confirm which one is intended.
    plt.clf()
    _fit_score_save(
        PrecisionRecallCurve(rfe, per_class=True, iso_f1_curves=True,
                             fill_area=False, micro=False, classes=class_names),
        X, y, X_test, y_test, outdir + '/PRC.png', size=(1080, 720),
    )

    # Class prediction error
    plt.clf()
    _fit_score_save(
        ClassPredictionError(rfe, classes=class_names),
        X, y, X_test, y_test, outdir + '/CPE.png',
    )

    # ROC/AUC — scored on the test split like every other report
    # (it was previously scored on the training data).
    plt.clf()
    _fit_score_save(
        ROCAUC(rfe, classes=class_names, size=(1080, 720)),
        X, y, X_test, y_test, outdir + '/RA.png',
    )

    # Final fit for the returned score and the retained-feature list.
    fit = rfe.fit(X, y)
    y_predict = fit.predict(X_test)
    f1 = f1_score(y_test, y_predict, average='weighted')

    features_retained_RFE = X.columns[rfe.get_support()].values
    feature_df = pd.DataFrame(features_retained_RFE.tolist())
    feature_df.to_csv(outdir + '/features.csv', sep='\t', index=False)
    return f1
def generate_learning_curve(model, clf_name, scoring, sizes, cv, n_jobs, dataset_name, X_train, y_train):
    """Fit a learning curve for ``model`` and save it under ``results/``.

    The output file name is built from ``clf_name`` and ``dataset_name``;
    the current matplotlib figure is cleared afterwards.
    """
    curve = LearningCurve(
        model,
        cv=cv,
        scoring=scoring,
        train_sizes=sizes,
        n_jobs=n_jobs,
    )
    curve.fit(X_train, y_train)

    target_file = "results/{}_learning_curve_{}.png".format(clf_name, dataset_name)
    curve.show(target_file)
    plt.clf()
def learning_curve_clusterer(path="images/learning_curve_clusterer.png"):
    """Draw a KMeans learning curve on synthetic blobs and write it to ``path``."""
    # 1000 samples around 5 centers.
    X, y = make_blobs(n_samples=1000, centers=5)

    axes = plt.subplots()[1]
    train_fractions = np.linspace(0.3, 1.0, 10)
    visualizer = LearningCurve(
        KMeans(),
        ax=axes,
        train_sizes=train_fractions,
        scoring="adjusted_rand_score",
    )
    visualizer.fit(X, y)
    visualizer.poof(outpath=path)
def learning_curve_clusterer(path="images/learning_curve_clusterer.png"):
    """Plot the learning curve of KMeans on generated blob data.

    The figure is rendered through ``poof`` with ``outpath=path``.
    """
    X, y = make_blobs(n_samples=1000, centers=5)
    _, plot_ax = plt.subplots()

    curve = LearningCurve(
        KMeans(),
        ax=plot_ax,
        train_sizes=np.linspace(0.3, 1.0, 10),
        scoring="adjusted_rand_score",
    )
    curve.fit(X, y)
    curve.poof(outpath=path)
def visualizeLearningCurve(classifier, features, labels, scoring='precision'):
    """Render a 10-fold stratified learning curve for ``classifier``.

    The ``appid`` and ``name`` columns are dropped from ``features`` and each
    label is converted with ``convertLabelToNumber`` before fitting.
    """
    folds = StratifiedKFold(10)
    train_fractions = numpy.linspace(0.1, 1.0, 10)
    curve = LearningCurve(
        classifier,
        cv=folds,
        train_sizes=train_fractions,
        scoring=scoring,
        n_jobs=10,
    )

    feature_matrix = features.drop(["appid", "name"], axis=1)
    numeric_labels = [convertLabelToNumber(label) for label in labels]
    curve.fit(feature_matrix, numeric_labels)
    curve.poof()
def learning_curve_sklearn_example(path="images/learning_curve_sklearn_example.png"):
    """Plot GaussianNB and SVC learning curves on the digits data side by side.

    Both panels share the y axis; the combined figure is written to ``path``
    when the second visualizer is rendered.
    """
    digits = load_digits()
    X, y = digits.data, digits.target

    _, (left_ax, right_ax) = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(9,4))

    # Left panel: naive Bayes with 100 shuffle splits.
    nb_splits = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
    nb_curve = LearningCurve(GaussianNB(), ax=left_ax, cv=nb_splits, n_jobs=4)
    nb_curve.fit(X, y)
    nb_curve.finalize()

    # Right panel: SVC with 10 shuffle splits.
    svc_splits = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
    svc_curve = LearningCurve(SVC(gamma=0.001), ax=right_ax, cv=svc_splits, n_jobs=4)
    svc_curve.fit(X, y)
    svc_curve.poof(outpath=path)
def learning_curve_regressor(path="images/learning_curve_regressor.png"):
    """Plot an R2 learning curve of RidgeCV on the energy fixture.

    Every column other than the two load columns is a feature; the target is
    ``"heating load"``. The figure is written to ``path``.
    """
    frame = pd.read_csv(os.path.join(FIXTURES, "energy", "energy.csv"))

    target_columns = ["heating load", "cooling load"]
    feature_columns = [name for name in frame.columns if name not in target_columns]
    X = frame[feature_columns]
    y = frame[target_columns[0]]

    _, plot_ax = plt.subplots()
    curve = LearningCurve(
        RidgeCV(),
        ax=plot_ax,
        train_sizes=np.linspace(0.3, 1.0, 10),
        scoring='r2',
    )
    curve.fit(X, y)
    curve.poof(outpath=path)
def learning_curve_sklearn_example(
        path="images/learning_curve_sklearn_example.png"):
    """Draw two learning curves (GaussianNB, SVC) for the digits dataset.

    The first visualizer is only finalized; rendering the second one writes
    the shared two-panel figure to ``path``.
    """
    dataset = load_digits()
    X, y = dataset.data, dataset.target
    _, panels = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(9, 4))

    # Naive Bayes on the first panel, 100 shuffle splits.
    bayes_viz = LearningCurve(
        GaussianNB(),
        ax=panels[0],
        cv=ShuffleSplit(n_splits=100, test_size=0.2, random_state=0),
        n_jobs=4,
    )
    bayes_viz.fit(X, y)
    bayes_viz.finalize()

    # SVC on the second panel, 10 shuffle splits.
    svc_viz = LearningCurve(
        SVC(gamma=0.001),
        ax=panels[1],
        cv=ShuffleSplit(n_splits=10, test_size=0.2, random_state=0),
        n_jobs=4,
    )
    svc_viz.fit(X, y)
    svc_viz.poof(outpath=path)
def run_crossvalidation(model, x_train, y_train, cv=5, scoring="accuracy", learning_curve=False):
    """
    Cross-validate a model, show the per-fold scores and optionally a learning curve.

    Parameters
    ----------
    model : Model
        Model to cross validate

    x_train : nd-array
        Training features

    y_train : nd-array
        Training targets

    cv : int, Crossvalidation Generator, optional
        Cross validation method, by default 5

    scoring : str, optional
        Scoring method, by default 'accuracy'

    learning_curve : bool, optional
        If true, also plot a learning curve, by default False

    Returns
    -------
    The ``cv_scores_`` attribute of the fitted ``CVScores`` visualizer.
    """

    # TODO: Make curves slightly bigger
    scores_viz = CVScores(model, cv=cv, scoring=scoring)
    scores_viz.fit(x_train, y_train)
    scores_viz.show()

    if learning_curve:
        curve_viz = LearningCurve(model, cv=cv, scoring=scoring)
        curve_viz.fit(x_train, y_train)
        curve_viz.show()

    return scores_viz.cv_scores_
def run_crossvalidation(model, x_train, y_train, cv=5, scoring="accuracy", report=None, model_name=None):
    """
    Plot cross-validation scores and a learning curve next to each other.

    Parameters
    ----------
    model : Model
        Model to cross validate

    x_train : nd-array
        Training features

    y_train : nd-array
        Training targets

    cv : int, Crossvalidation Generator, optional
        Cross validation method, by default 5

    scoring : str, optional
        Scoring method, by default 'accuracy'

    report : optional
        When truthy, the figure is also saved, by default None

    model_name : str, optional
        Sub-directory of ``IMAGE_DIR`` that receives ``cv.png`` when the
        figure is saved, by default None
    """

    fig, (scores_ax, curve_ax) = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

    # Left panel: per-fold scores.
    scores_viz = CVScores(model, cv=cv, scoring=scoring, ax=scores_ax)
    scores_viz.fit(x_train, y_train)
    scores_viz.finalize()

    # Right panel: learning curve.
    curve_viz = LearningCurve(model, cv=cv, scoring=scoring, ax=curve_ax)
    curve_viz.fit(x_train, y_train)
    curve_viz.finalize()

    scores_viz.show()
    curve_viz.show()

    # Save when a report was passed or experiment tracking is switched on.
    if report or _global_config['track_experiments']:  # pragma: no cover
        image_file = os.path.join(IMAGE_DIR, model_name, "cv.png")
        fig.savefig(image_file)
def yellow_brick_learning_curve(model, x, y, cpu_count, cv_count, scoring_metric):
    """Fit and show a stratified k-fold learning curve for ``model``.

    ``cv_count`` is the number of folds, ``cpu_count`` is forwarded as
    ``n_jobs`` and ``scoring_metric`` as ``scoring``.
    """
    folds = StratifiedKFold(n_splits=cv_count)
    train_fractions = np.linspace(0.3, 1.0, 10)

    curve = LearningCurve(
        model,
        cv=folds,
        scoring=scoring_metric,
        train_sizes=train_fractions,
        n_jobs=cpu_count,
    )
    curve.fit(x, y)   # compute the curve on the supplied data
    curve.show()      # finalize and render the figure
def learning_curves(models, X, y):
    """
    Show one ROC-AUC learning curve per model.

    :params models: Models to be evaluated
    :params X: Training data, independent variables
    :params y: Training data, dependent variable
    :return: None; each learning-curve figure is displayed as it is built
    """
    folds = StratifiedKFold(n_splits=3)
    for candidate in models:
        curve = LearningCurve(
            candidate,
            cv=folds,
            scoring='roc_auc',
            train_sizes=np.linspace(0.3, 1.0, 10),
            n_jobs=4,
        )
        curve.fit(X, y)
        curve.show()
def learning_curve(model, X, y):
    """Render a negative-log-loss learning curve for ``model``.

    Uses repeated stratified k-fold (5 splits, 10 repeats, fixed seed) as the
    cross-validation strategy.
    """
    from sklearn.model_selection import RepeatedStratifiedKFold
    from yellowbrick.model_selection import LearningCurve

    folds = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=1)
    curve = LearningCurve(
        model,
        cv=folds,
        train_sizes=np.linspace(0.3, 1.0, 10),
        scoring='neg_log_loss',
        n_jobs=4,
    )

    # Fit, then render.
    curve.fit(X, y)
    curve.poof()
def learning_curve_classifier(path="images/learning_curve_classifier.png"):
    """Plot a weighted-F1 learning curve of MultinomialNB on the game fixture.

    All columns except ``"outcome"`` are one-hot encoded as features; the
    figure is written to ``path``.
    """
    frame = pd.read_csv(os.path.join(FIXTURES, "game", "game.csv"))

    label_column = "outcome"
    feature_columns = [name for name in frame.columns if name != label_column]
    X = pd.get_dummies(frame[feature_columns])
    y = frame[label_column]

    _, plot_ax = plt.subplots()
    folds = StratifiedKFold(12)
    train_fractions = np.linspace(0.3, 1.0, 10)

    curve = LearningCurve(
        MultinomialNB(),
        ax=plot_ax,
        cv=folds,
        n_jobs=4,
        train_sizes=train_fractions,
        scoring='f1_weighted',
    )
    curve.fit(X, y)
    curve.poof(outpath=path)
def learning_curve_classifier(path="images/learning_curve_classifier.png"):
    """Draw the MultinomialNB learning curve for the game dataset.

    Features are the dummy-encoded non-target columns, the target is the
    ``"outcome"`` column, and scoring is ``f1_weighted`` over 12 stratified
    folds. Output goes to ``path``.
    """
    csv_file = os.path.join(FIXTURES, "game", "game.csv")
    data = pd.read_csv(csv_file)

    target = "outcome"
    predictors = [column for column in data.columns if column != target]
    X = pd.get_dummies(data[predictors])
    y = data[target]

    _, axes = plt.subplots()
    visualizer = LearningCurve(
        MultinomialNB(),
        ax=axes,
        cv=StratifiedKFold(12),
        n_jobs=4,
        train_sizes=np.linspace(0.3, 1.0, 10),
        scoring='f1_weighted',
    )
    visualizer.fit(X, y)
    visualizer.poof(outpath=path)
prediction_error(model, x_test, preds, ax=ax) # Do some scoring on XGB estimators # Validation curve viz = ValidationCurve(XGBRegressor(objective="reg:squarederror"), param_name="max_depth", param_range=np.arange(1, 11), cv=5, scoring="r2") viz.fit(x_train, y_train) viz.show() # Learning curve model = XGBRegressor(objective="reg:squarederror") viz_2 = LearningCurve(model, scoring="r2") viz_2.fit(x_train, y_train) viz_2.show() model = RFECV(LassoCV(), cv=5, scoring='r2') model.fit(x_train, y_train) model.show() """ Section: 5 Time-Series Algorithms """ # Fitting ARIMA # Original Series # plt.rcParams.update({'figure.figsize':(9,7), 'figure.dpi':120}) fig, axes = plt.subplots(3, 1, sharex=True) plot_acf(main_data.traffic_volume, ax=axes[0])
#class weight balanced didn't improve it! from sklearn.model_selection import permutation_test_score score, permutation_scores, pvalue = permutation_test_score( tree1, X_train, y_train, scoring="accuracy", cv=5, n_permutations=20, n_jobs=1) print("Classification score %s (pvalue : %s)" % (score, pvalue)) scoring = make_scorer(f1_score, average = 'micro') from yellowbrick.model_selection import LearningCurve visualizer = LearningCurve( tree1, cv=10, scoring=scoring,verbose = 0 ) visualizer.fit(X_train, y_train) # Fit the data to the visualizer visualizer.show() """# > **Random Forest**""" from sklearn.ensemble import RandomForestClassifier clf = RandomForestClassifier(oob_score = True,n_estimators=250) clf.get_params clf.fit(X_train,y_train) y_clf = clf.predict(X_test) clf.score(X_test,y_test) clf.score(X_train,y_train)
def learning():
    """Fit an R2 learning curve of RidgeCV on the energy data and save it.

    Data comes from ``load_energy()``, the axes from ``newfig()``, and the
    fitted visualizer is handed to ``savefig`` under the name
    ``"learning_curve"``.
    """
    X, y = load_energy()
    train_fractions = np.linspace(0.3, 1.0, 10)
    visualizer = LearningCurve(
        RidgeCV(),
        train_sizes=train_fractions,
        scoring="r2",
        ax=newfig(),
    )
    visualizer.fit(X, y)
    savefig(visualizer, "learning_curve")
print(classification_report(y_test, y_test_pred)) # In[42]: #Training and Testing error with new data print(classification_report(y_train_New, y_train_pred_New)) print(classification_report(y_test_New, y_test_pred_New)) # In[44]: #Learning Curve from sklearn.model_selection import cross_validate from yellowbrick.model_selection import LearningCurve from sklearn.model_selection import StratifiedKFold cv = StratifiedKFold(n_splits=12) sizes = np.linspace(0.3, 1.0, 10) model = ExtraTreesRegressor(n_estimators=250, random_state=0, max_depth=80, max_features='auto') visualizer = LearningCurve(model, cv=cv, scoring='roc_auc', train_sizes=sizes, n_jobs=5) visualizer.fit(X_train_New, y_train_New) # Fit the data to the visualizer visualizer.show() # Finalize and render the figure
# %% best_params = dt.hyperParameterTuning(x_train,y_train) print(best_params) dt_tuned = DecisionTreeClassifier(max_depth=best_params['max_depth'], min_samples_leaf=best_params['min_samples_leaf'], random_state=rs) # %% sizes = np.linspace(0.3, 1.0, 10) # Instantiate the classification model and visualizer visualizer = LearningCurve( dt_tuned, scoring='f1_weighted', train_sizes=sizes, n_jobs=4 ) visualizer.fit(x_data,y_data) # Fit the data to the visualizer visualizer.show() # %% [markdown] # # 3. Support Vector Machine # %% class SupportVectorMachine(): def trainTest(self,x_train,x_test, y_train, y_test): scaler = StandardScaler() scaled_x_train = scaler.fit_transform(x_train) scaled_x_test = scaler.fit_transform(x_test) cs = [x/10000 for x in [1, 10, 100, 1000, 10000, 100000, 1000000]] df = [] for c in cs: for k in ["linear", "sigmoid"]:
def upper_region_classifier():
    """Train, tune and plot a learning curve for the upper-region classifier.

    Steps:
      1. Load ``ur_dataset.csv`` (``'?'`` read as missing, all columns
         categorical) and print the class counts.
      2. Cross-validate an oversample -> SelectKBest(15) -> RandomForest
         pipeline with micro-F1 and print the mean score.
      3. Grid-search ``n_estimators`` / ``max_depth`` of the forest and
         print the best parameters and score.
      4. Show a learning curve of the best estimator.

    Returns nothing; results are printed and plotted.
    """
    ur_dataset = pd.read_csv("../resources/datasets/ur_dataset.csv", na_values='?', dtype='category')
    print("UR classes", ur_dataset.groupby(SECOND_LEVEL_TARGET).size())

    # Separate training features & training labels
    X = ur_dataset.drop(['class'], axis=1)
    y = ur_dataset['class']

    # Repeated stratified k-fold keeps every class represented with the
    # same proportion in each split.
    cv = RepeatedStratifiedKFold(n_splits=3, random_state=42)

    # The hand-built FeatureUnion / Pipeline(estimators) that used to be
    # created here was never used (`model` is rebound to the grid-search
    # winner below), so it has been removed. Background:
    # https://machinelearningmastery.com/automate-machine-learning-workflows-pipelines-python-scikit-learn/
    imba_pipeline = make_pipeline(
        RandomOverSampler(random_state=42),
        SelectKBest(k=15),
        RandomForestClassifier(random_state=13),
    )

    scores = cross_val_score(imba_pipeline, X, y, scoring='f1_micro', cv=cv, n_jobs=-1)
    print("After oversampling mean", scores.mean())

    # ------------------------- Hyper-parameter tuning -------------------------
    params = {
        'n_estimators': [5, 10, 20, 30],
        'max_depth': [4, 6, 10, 12],
        'random_state': [13]
    }
    # Prefix each key with the step name make_pipeline gives the forest.
    new_params = {
        'randomforestclassifier__' + key: values for key, values in params.items()
    }
    grid_imba = GridSearchCV(imba_pipeline, param_grid=new_params, cv=cv, scoring='f1_micro',
                             return_train_score=True)
    grid_imba.fit(X, y)

    print(grid_imba.best_params_)
    print(grid_imba.best_score_)

    # refer - https://stackoverflow.com/questions/40057049/using-confusion-matrix-as-scoring-metric-in-cross-validation-in-scikit-learn
    model = grid_imba.best_estimator_

    sizes = np.linspace(0.3, 1.0, 10)
    # Instantiate the classification model and visualizer
    visualizer = LearningCurve(model, cv=cv, scoring='f1_micro', train_sizes=sizes, n_jobs=4)
    visualizer.fit(X, y)  # Fit the data to the visualizer
    visualizer.show()  # Finalize and render the figure
# Cross-validate the decision tree with the stratified k-fold splitter.
result2 = cross_val_score(arbre, x_train, y_train, cv=kplis_strat)
print(' score kplis strat cross validation :{}'.format(result2))
print('moyenne score cross validation : {:.2f}'.format(result2.mean()))

# Same model with the shuffle-split splitter.
result3 = cross_val_score(arbre, x_train, y_train, cv=shuffle)
print(' score shuffle split cross validation :{}'.format(result3))
print('moyenne score cross validation : {:.2f}'.format(result3.mean()))

# Confusion matrix (as percentages) on the test split.
cm = ConfusionMatrix(arbre, classes=[0, 1, 2, 3, 4, 5, 6], percent=True)
cm.fit(x_train, y_train)
cm.score(x_test, y_test)
cm.poof()

# Learning curve over 10%..80% of the training data.
size = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
# The keyword is `scoring`; the former `score='r2'` is not a LearningCurve
# parameter, so the requested metric was not applied.
# NOTE(review): 'r2' treats the class labels as numbers — confirm that this
# metric is really wanted for a classifier.
lc = LearningCurve(DecisionTreeClassifier(), train_sizes=size, scoring='r2')
lc.fit(x_train, y_train)
lc.poof()

''' ---------------------- Forêt aléatoire ------------------------'''

# Random forest: fit once, then cross-validate with 5 folds and with `kplis`.
foret = RandomForestClassifier(n_estimators=120, max_features='sqrt', n_jobs=-1, random_state=0)
foret.fit(x_train, y_train)
result = cross_val_score(foret, x_train, y_train, cv=5)
print(' score cross validation :{}'.format(result))
print('moyenne score cross validation : {:.2f}'.format(result.mean()))
result1 = cross_val_score(foret, x_train, y_train, cv=kplis)
print(' score kplis cross validation :{}'.format(result1))
# Load the sizing results; separators may be ';' or ',' (regex, python engine).
case_name = "mg_sizing_dataset_with_loc"
df = pd.read_csv("results/" + case_name + ".csv", sep=";|,", engine="python", index_col='index')
#df = df.loc[df['off-grid'] == 1]

# Scale the feature columns with the scaler fitted on them.
X = df[features]
scaler.fit(X)
X = scaler.transform(X)
# X = pd.DataFrame(scaler.transform(X), index=X.index, columns=X.columns)

targets = ["PV","BAT","RBAT","INV","GEN","NPV"]
y = df[targets]

param_range = np.arange(1, 30, 1)
# Shuffled 12-fold CV, shared by all three visualizers below.
# (A `cv = StratifiedKFold(12)` assignment that was overwritten by this one
# before any use has been removed.)
cv = KFold(n_splits=12, random_state=40, shuffle=True)

# Validation curve: R2 over n_neighbors = 1..29.
viz = ValidationCurve(
    KNeighborsRegressor(), param_name="n_neighbors",
    param_range=param_range, scoring="r2", cv=cv, n_jobs=8
)
viz.fit(X, y)
viz.show()

# Learning curve of the default KNN regressor.
visualizer = LearningCurve(KNeighborsRegressor(), scoring='r2', random_state=2, cv=cv, shuffle=True)
visualizer.fit(X, y)
visualizer.show()

# Per-fold R2 scores.
vis = CVScores(KNeighborsRegressor(), cv=cv, scoring='r2')
vis.fit(X, y)        # Fit the data to the visualizer
vis.show()
# n_jobs=-1, ax=ax) val_curve.fit(X, y) val_curve.poof() fig.tight_layout() plt.show() fig, ax = plt.subplots(figsize=(16, 9)) l_curve = LearningCurve( KNeighborsRegressor(n_neighbors=best_k), train_sizes=np.arange(.1, 1.01, .1), scoring=rmse_score, cv=5, # n_jobs=-1, ax=ax) l_curve.fit(X, y) l_curve.poof() fig.tight_layout() plt.show() # Binary Classification y_binary = (y > y.median()).astype(int) n_neighbors = tuple(range(5, 151, 10)) n_folds = 5 scoring = 'roc_auc' pipe = Pipeline([('scaler', StandardScaler()), ('knn', KNeighborsClassifier())]) param_grid = {'knn__n_neighbors': n_neighbors}