import numpy as np
from sklearn.model_selection import StratifiedKFold
from yellowbrick.model_selection import LearningCurve


def plot_learning_curve(X, y, model, figPath=None):
    """Plots a learning curve for a model (with optional file saving).

    Args:
        X (numpy array): Features for the evaluation.
        y (numpy array): Targets for the evaluation.
        model (sklearn Model): The model to visualize.
        figPath (str): Directory in which to save the figure.
            figPath=None does not save the figure.
    """
    # Create a stratified cross-validation splitter to ensure good class
    # representation in every fold (NUM_FOLDS and SEED are module-level
    # constants).
    cv = StratifiedKFold(n_splits=NUM_FOLDS, random_state=SEED, shuffle=True)

    visualizer = LearningCurve(model, cv=cv, scoring='f1_weighted',
                               train_sizes=np.linspace(0.3, 1.0, 10),
                               n_jobs=-1)
    visualizer.fit(X, y)

    if figPath is None:
        visualizer.show()
    else:
        visualizer.show(outpath=f'{figPath}/learning.png')
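# Usage sketch for plot_learning_curve (illustrative only): the dataset,
# estimator, and the NUM_FOLDS / SEED values below are placeholders chosen
# for this sketch, not taken from the original project.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

NUM_FOLDS = 5   # assumed module-level constant
SEED = 42       # assumed module-level constant

X_demo, y_demo = make_classification(n_samples=500, n_features=20,
                                     n_informative=5, random_state=SEED)
plot_learning_curve(X_demo, y_demo, LogisticRegression(max_iter=1000))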
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import svm
from sklearn.feature_selection import RFE
from sklearn.metrics import f1_score
from yellowbrick.classifier import (ClassificationReport, ClassPredictionError,
                                    ConfusionMatrix, PrecisionRecallCurve, ROCAUC)
from yellowbrick.model_selection import LearningCurve


def eva_model(c, n, X, y, X_test, y_test, class_names, outdir):
    """Evaluates a LinearSVC with RFE feature selection, saves a suite of
    yellowbrick diagnostic plots to `outdir`, writes the retained feature
    names to a CSV file, and returns the weighted F1 score on the test set."""
    model = svm.LinearSVC(class_weight='balanced', dual=False, max_iter=10000, C=c)
    rfe = RFE(model, n_features_to_select=n)

    ## learning curve
    plt.clf()
    viz_LC = LearningCurve(rfe, scoring='f1_weighted', n_jobs=4)
    viz_LC.fit(X, y)
    viz_LC.show(outpath=outdir + '/LC.png')

    ## classification report
    plt.clf()
    viz_CR = ClassificationReport(rfe, classes=class_names, support=True)
    viz_CR.fit(X, y)
    viz_CR.score(X_test, y_test)
    viz_CR.show(outpath=outdir + '/CR.png')

    ## confusion matrix
    plt.clf()
    viz_CM = ConfusionMatrix(rfe, classes=class_names)
    viz_CM.fit(X, y)
    viz_CM.score(X_test, y_test)
    viz_CM.show(outpath=outdir + '/CM.png')

    ## precision recall curve
    plt.clf()
    viz_PRC = PrecisionRecallCurve(rfe, per_class=True, iso_f1_curves=True,
                                   fill_area=False, micro=False,
                                   classes=class_names, size=(1080, 720))
    viz_PRC.fit(X, y)
    viz_PRC.score(X_test, y_test)
    viz_PRC.show(outpath=outdir + '/PRC.png')

    ## class prediction error
    plt.clf()
    viz_CPE = ClassPredictionError(rfe, classes=class_names)
    viz_CPE.fit(X, y)
    viz_CPE.score(X_test, y_test)
    viz_CPE.show(outpath=outdir + '/CPE.png')

    ## ROCAUC (note: scored on the training data here)
    plt.clf()
    viz_RA = ROCAUC(rfe, classes=class_names, size=(1080, 720))
    viz_RA.fit(X, y)
    viz_RA.score(X, y)
    viz_RA.show(outpath=outdir + '/RA.png')

    # Refit the RFE selector and compute the weighted F1 score on the test set.
    rfe.fit(X, y)
    y_predict = rfe.predict(X_test)
    f1 = f1_score(y_test, y_predict, average='weighted')

    # Save the names of the features retained by RFE (X must be a DataFrame).
    features_retained_RFE = X.columns[rfe.get_support()].values
    feature_df = pd.DataFrame(features_retained_RFE.tolist())
    feature_df.to_csv(outdir + '/features.csv', sep='\t', index=False)

    return f1
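# Usage sketch for eva_model (illustrative only): assumes X / X_test are pandas
# DataFrames (the function reads X.columns) and that `outdir` exists on disk.
# The dataset, C value, and number of selected features are placeholders.
import os
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

data = load_breast_cancer(as_frame=True)
X_tr, X_te, y_tr, y_te = train_test_split(data.data, data.target, test_size=0.3,
                                           random_state=42, stratify=data.target)
os.makedirs('eval_output', exist_ok=True)
f1_weighted = eva_model(c=1.0, n=10, X=X_tr, y=y_tr, X_test=X_te, y_test=y_te,
                        class_names=list(data.target_names), outdir='eval_output')
print(f'weighted F1 after RFE: {f1_weighted:.3f}')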
def generate_learning_curve(model, clf_name, scoring, sizes, cv, n_jobs,
                            dataset_name, X_train, y_train):
    viz = LearningCurve(model, cv=cv, scoring=scoring, train_sizes=sizes,
                        n_jobs=n_jobs)
    viz.fit(X_train, y_train)
    viz.show("results/{}_learning_curve_{}.png".format(clf_name, dataset_name))
    plt.clf()
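# Usage sketch for generate_learning_curve (illustrative only): assumes the
# imports used elsewhere in this file (LearningCurve, plt) are in scope and
# that a "results/" directory exists. Dataset and estimator are placeholders.
import os
import numpy as np
from sklearn.datasets import load_digits
from sklearn.tree import DecisionTreeClassifier

os.makedirs("results", exist_ok=True)
X_digits, y_digits = load_digits(return_X_y=True)
generate_learning_curve(DecisionTreeClassifier(random_state=0), "dtree",
                        scoring="accuracy", sizes=np.linspace(0.3, 1.0, 5),
                        cv=5, n_jobs=-1, dataset_name="digits",
                        X_train=X_digits, y_train=y_digits)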
def run_crossvalidation(model, x_train, y_train, cv=5, scoring="accuracy",
                        learning_curve=False):
    """
    Runs cross validation on a certain model.

    Parameters
    ----------
    model : Model
        Model to cross validate
    x_train : nd-array
        Training data
    y_train : nd-array
        Training targets
    cv : int, Crossvalidation Generator, optional
        Cross validation method, by default 5
    scoring : str, optional
        Scoring method, by default 'accuracy'
    learning_curve : bool, optional
        If true plot learning curve, by default False

    Returns
    -------
    list
        List of cross validation scores
    """
    # TODO: Make curves slightly bigger
    visualizer_scores = CVScores(model, cv=cv, scoring=scoring)
    visualizer_scores.fit(x_train, y_train)
    visualizer_scores.show()

    if learning_curve:
        visualizer_lcurve = LearningCurve(model, cv=cv, scoring=scoring)
        visualizer_lcurve.fit(x_train, y_train)
        visualizer_lcurve.show()

    return visualizer_scores.cv_scores_
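# Usage sketch for run_crossvalidation (illustrative only): assumes CVScores and
# LearningCurve from yellowbrick.model_selection are imported in this module.
# Dataset and estimator are placeholders.
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression

X_bc, y_bc = load_breast_cancer(return_X_y=True)
cv_scores = run_crossvalidation(LogisticRegression(max_iter=1000), X_bc, y_bc,
                                cv=5, scoring="f1", learning_curve=True)
print("mean CV f1:", cv_scores.mean())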
def run_crossvalidation(model, x_train, y_train, cv=5, scoring="accuracy",
                        report=None, model_name=None):
    """
    Runs cross validation on a certain model.

    Parameters
    ----------
    model : Model
        Model to cross validate
    x_train : nd-array
        Training data
    y_train : nd-array
        Training targets
    cv : int, Crossvalidation Generator, optional
        Cross validation method, by default 5
    scoring : str, optional
        Scoring method, by default 'accuracy'
    report : optional
        If truthy (or if experiment tracking is enabled), the figure is saved
        to disk, by default None
    model_name : str, optional
        Subdirectory under IMAGE_DIR used when saving the figure, by default None
    """
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

    visualizer_scores = CVScores(model, cv=cv, scoring=scoring, ax=axes[0])
    visualizer_scores.fit(x_train, y_train)
    visualizer_scores.finalize()

    visualizer_lcurve = LearningCurve(model, cv=cv, scoring=scoring, ax=axes[1])
    visualizer_lcurve.fit(x_train, y_train)
    visualizer_lcurve.finalize()

    visualizer_scores.show()
    visualizer_lcurve.show()

    if report or _global_config['track_experiments']:  # pragma: no cover
        fig.savefig(os.path.join(IMAGE_DIR, model_name, "cv.png"))
def yellow_brick_learning_curve(model, x, y, cpu_count, cv_count, scoring_metric):
    """Plot a learning curve for `model` using stratified k-fold
    cross-validation over a range of training-set sizes."""
    # Create the learning curve visualizer
    cv = StratifiedKFold(n_splits=cv_count)
    sizes = np.linspace(0.3, 1.0, 10)

    # Instantiate the classification model and visualizer
    visualizer = LearningCurve(model, cv=cv, scoring=scoring_metric,
                               train_sizes=sizes, n_jobs=cpu_count)

    visualizer.fit(x, y)    # Fit the data to the visualizer
    visualizer.show()       # Finalize and render the figure
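# Usage sketch for yellow_brick_learning_curve (illustrative only): assumes
# StratifiedKFold, np, and LearningCurve are imported in this module; the
# dataset and estimator are placeholders.
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

X_iris, y_iris = load_iris(return_X_y=True)
yellow_brick_learning_curve(RandomForestClassifier(n_estimators=100, random_state=0),
                            X_iris, y_iris, cpu_count=-1, cv_count=5,
                            scoring_metric="accuracy")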
def learning_curves(models, X, y):
    """
    :param models: Models to be evaluated
    :param X: Training data, independent variables
    :param y: Training data, dependent variable
    :return: Visualization of the learning curves
    """
    cv_strategy = StratifiedKFold(n_splits=3)

    for model in models:
        sizes = np.linspace(0.3, 1.0, 10)
        viz = LearningCurve(model, cv=cv_strategy, scoring='roc_auc',
                            train_sizes=sizes, n_jobs=4)
        viz.fit(X, y)
        viz.show()
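# Usage sketch for learning_curves (illustrative only): roc_auc scoring implies
# a binary target, so a binary dataset is used; the estimators are placeholders.
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

X_bin, y_bin = load_breast_cancer(return_X_y=True)
learning_curves([LogisticRegression(max_iter=1000),
                 GradientBoostingClassifier(random_state=0)],
                X_bin, y_bin)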
# Do some scoring on XGB estimators
import numpy as np
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.linear_model import LassoCV
from statsmodels.graphics.tsaplots import plot_acf
from yellowbrick.model_selection import LearningCurve, RFECV, ValidationCurve

# Validation curve
viz = ValidationCurve(XGBRegressor(objective="reg:squarederror"),
                      param_name="max_depth", param_range=np.arange(1, 11),
                      cv=5, scoring="r2")
viz.fit(x_train, y_train)
viz.show()

# Learning curve
model = XGBRegressor(objective="reg:squarederror")
viz_2 = LearningCurve(model, scoring="r2")
viz_2.fit(x_train, y_train)
viz_2.show()

# Recursive feature elimination; this uses yellowbrick's RFECV visualizer
# (sklearn's RFECV has no .show() method).
model = RFECV(LassoCV(), cv=5, scoring='r2')
model.fit(x_train, y_train)
model.show()

"""
Section: 5 Time-Series Algorithms
"""

# Fitting ARIMA

# Original Series
# plt.rcParams.update({'figure.figsize':(9,7), 'figure.dpi':120})
fig, axes = plt.subplots(3, 1, sharex=True)
plot_acf(main_data.traffic_volume, ax=axes[0])

# 1st Differencing
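# A minimal sketch (not from the original script) of how the remaining two
# panels are typically filled when checking the ARIMA differencing order:
# plot the ACF of the first- and second-differenced series, assuming
# main_data.traffic_volume is the pandas Series used above.
plot_acf(main_data.traffic_volume.diff().dropna(), ax=axes[1])
plot_acf(main_data.traffic_volume.diff().diff().dropna(), ax=axes[2])
plt.show()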
# class_weight='balanced' didn't improve it!
from sklearn.model_selection import permutation_test_score

score, permutation_scores, pvalue = permutation_test_score(
    tree1, X_train, y_train, scoring="accuracy", cv=5, n_permutations=20, n_jobs=1)
print("Classification score %s (pvalue : %s)" % (score, pvalue))

from sklearn.metrics import f1_score, make_scorer
scoring = make_scorer(f1_score, average='micro')

from yellowbrick.model_selection import LearningCurve

visualizer = LearningCurve(tree1, cv=10, scoring=scoring, verbose=0)
visualizer.fit(X_train, y_train)    # Fit the data to the visualizer
visualizer.show()

"""# > **Random Forest**"""

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(oob_score=True, n_estimators=250)
clf.get_params()
clf.fit(X_train, y_train)
y_clf = clf.predict(X_test)
clf.score(X_test, y_test)
clf.score(X_train, y_train)
print(classification_report(y_test, y_test_pred))


# In[42]:


# Training and testing error with new data
print(classification_report(y_train_New, y_train_pred_New))
print(classification_report(y_test_New, y_test_pred_New))


# In[44]:


# Learning Curve
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import cross_validate, StratifiedKFold
from yellowbrick.model_selection import LearningCurve

cv = StratifiedKFold(n_splits=12)
sizes = np.linspace(0.3, 1.0, 10)

model = ExtraTreesRegressor(n_estimators=250, random_state=0, max_depth=80,
                            max_features='auto')

visualizer = LearningCurve(model, cv=cv, scoring='roc_auc', train_sizes=sizes,
                           n_jobs=5)

visualizer.fit(X_train_New, y_train_New)    # Fit the data to the visualizer
visualizer.show()                           # Finalize and render the figure
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline, make_pipeline  # sampler-aware pipelines
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import (GridSearchCV, RepeatedStratifiedKFold,
                                     cross_val_score)
from sklearn.pipeline import FeatureUnion
from yellowbrick.model_selection import LearningCurve


def upper_region_classifier():
    # SECOND_LEVEL_TARGET is a module-level constant naming the target column.
    ur_dataset = pd.read_csv("../resources/datasets/ur_dataset.csv",
                             na_values='?', dtype='category')
    print("UR classes", ur_dataset.groupby(SECOND_LEVEL_TARGET).size())

    # Separate training features & training labels
    X = ur_dataset.drop(['class'], axis=1)
    y = ur_dataset['class']

    # Spot check
    # spot_check_algorithms(X, y)

    # pipeline = Pipeline([
    #     ('bow', CountVectorizer()),
    #     ('classifier', BernoulliNB()),
    # ])

    # Create a cross-validation strategy: stratified k-fold ensures every class
    # is represented in each split with the same proportion.
    cv = RepeatedStratifiedKFold(n_splits=3, random_state=42)

    # https://machinelearningmastery.com/automate-machine-learning-workflows-pipelines-python-scikit-learn/
    # Create feature union
    features = []
    # features.append(('pca', MCA(n_components=3)))
    features.append(('select_best', SelectKBest(k=15)))
    feature_union = FeatureUnion(features)

    # Create pipeline
    estimators = []
    estimators.append(('feature_union', feature_union))
    estimators.append(('ROS', RandomOverSampler(random_state=42)))
    estimators.append(('classifier', RandomForestClassifier(random_state=13)))
    model = Pipeline(estimators)

    imba_pipeline = make_pipeline(RandomOverSampler(random_state=42),
                                  SelectKBest(k=15),
                                  RandomForestClassifier(random_state=13))
    scores = cross_val_score(imba_pipeline, X, y, scoring='f1_micro', cv=cv, n_jobs=-1)
    print("After oversampling mean", scores.mean())

    ############################################# Hyper-parameter Tuning ###########################################
    params = {
        'n_estimators': [5, 10, 20, 30],
        'max_depth': [4, 6, 10, 12],
        'random_state': [13]
    }
    new_params = {'randomforestclassifier__' + key: params[key] for key in params}
    grid_imba = GridSearchCV(imba_pipeline, param_grid=new_params, cv=cv,
                             scoring='f1_micro', return_train_score=True)
    grid_imba.fit(X, y)
    print(grid_imba.best_params_)
    print(grid_imba.best_score_)

    # refer - https://stackoverflow.com/questions/40057049/using-confusion-matrix-as-scoring-metric-in-cross-validation-in-scikit-learn
    model = grid_imba.best_estimator_
    sizes = np.linspace(0.3, 1.0, 10)

    # Instantiate the classification model and visualizer
    visualizer = LearningCurve(model, cv=cv, scoring='f1_micro',
                               train_sizes=sizes, n_jobs=4)

    visualizer.fit(X, y)    # Fit the data to the visualizer
    visualizer.show()       # Finalize and render the figure