def evaluate_generalization(test_dataset, estimator):
    """Report how a fitted estimator performs on a held-out dataset.

    Predicts labels for ``test_dataset.X``, compares them with
    ``test_dataset.y``, saves the confusion matrix to
    ``generalization_confusion_matrix.png`` and prints the accuracy, the log
    loss and the classification report.

    Args:
        test_dataset: Object exposing the features as ``X`` and the true
            labels as ``y``.
        estimator: Fitted classifier providing ``predict``. The log loss is
            only computed when it also provides ``predict_proba``; otherwise
            ``None`` is printed for it.
    """
    predictions = estimator.predict(test_dataset.X)
    true_labels = test_dataset.y
    accuracy = accuracy_score(true_labels, predictions)

    # Log loss scores probability estimates, not hard class labels, so it has
    # to be computed from ``predict_proba`` rather than from ``predict``.
    if hasattr(estimator, "predict_proba"):
        probabilities = estimator.predict_proba(test_dataset.X)
        # Passing the estimator's classes keeps the columns aligned even when
        # a class is absent from the test labels.
        loss = log_loss(true_labels, probabilities,
                        labels=getattr(estimator, "classes_", None))
    else:
        loss = None

    ax = plot.confusion_matrix(true_labels, predictions)
    # Save the figure that owns the returned axes instead of depending on
    # pyplot's notion of the "current" figure.
    ax.get_figure().savefig("generalization_confusion_matrix.png",
                            format="png")

    print("Accuracy", accuracy)
    print("Log Loss", loss)
    print(classification_report(true_labels, predictions))
def plot_confusion_matrix(y, preds, classes, ofname, title='Confusion matrix',
                          figsize=(10, 10), cmap=cm.Blues, logscale=False,
                          verbose=0):
    """Draw a confusion matrix and write it to ``ofname``.

    Args:
        y: True labels.
        preds: Predicted labels.
        classes: Class names, forwarded as ``target_names``.
        ofname: Output path of the image file.
        title: Title placed on the axes.
        figsize: Figure size passed to ``plt.figure``.
        cmap: Colormap forwarded to the plotting routine.
        logscale: Accepted for interface compatibility; not used by this
            function.
        verbose: When greater than 0, a progress message is printed.
    """
    if verbose > 0:
        print("Plotting confusion matrix ...")

    fig = plt.figure(tight_layout=True, figsize=figsize)
    try:
        ax = fig.add_subplot(1, 1, 1)
        sklearnplot.confusion_matrix(y, preds, target_names=classes,
                                     cmap=cmap, ax=ax)
        # Honor the caller's title; previously the argument was ignored.
        ax.set_title(title)
        # NOTE(review): ensure_dir presumably creates the parent directory of
        # ofname -- confirm against its definition.
        ensure_dir(ofname)
        # Operate on this figure explicitly rather than on whatever pyplot
        # currently considers the active one.
        fig.savefig(ofname)
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)
def evaluate_generalization(test_dataset, estimator):
    """Report how a fitted estimator performs on a held-out dataset.

    Predicts labels for ``test_dataset.X``, compares them with
    ``test_dataset.y``, saves the confusion matrix (labelled with the ten
    class names below) to ``cifar_generalization_confusion_matrix.png`` and
    prints the accuracy and the classification report.

    Args:
        test_dataset: Object exposing the features as ``X`` and the true
            labels as ``y``.
        estimator: Fitted classifier providing ``predict``.
    """
    class_names = [
        "plane", "auto", "bird", "cat", "deer",
        "dog", "frog", "horse", "ship", "truck",
    ]

    predictions = estimator.predict(test_dataset.X)
    true_labels = test_dataset.y
    accuracy = accuracy_score(true_labels, predictions)

    ax = plot.confusion_matrix(true_labels, predictions,
                               target_names=class_names)
    # Save the figure that owns the returned axes instead of depending on
    # pyplot's notion of the "current" figure.
    ax.get_figure().savefig("cifar_generalization_confusion_matrix.png",
                            format="png")

    print("Accuracy", accuracy)
    print(classification_report(true_labels, predictions))
def test_normalized_confusion_matrix():
    """Plot the confusion matrix for the module fixtures with ``normalize=True``."""
    plot.confusion_matrix(
        y_test,
        y_pred,
        target_names=target_names,
        normalize=True,
    )
def test_confusion_matrix():
    """Plot the confusion matrix for the module fixtures with default options."""
    plot.confusion_matrix(
        y_test,
        y_pred,
        target_names=target_names,
    )
def test_confusion_matrix(self):
    """``plot.confusion_matrix`` raises ``ValueError`` when ``y_true`` is None.

    The error message is expected to contain "needed to plot".
    """
    # ``assertRaisesRegexp`` is a deprecated alias that no longer exists on
    # Python 3.12+; prefer the current name and fall back to the alias only
    # on interpreters that lack it.
    assert_raises_regex = (getattr(self, "assertRaisesRegex", None)
                           or self.assertRaisesRegexp)
    with assert_raises_regex(ValueError, "needed to plot"):
        plot.confusion_matrix(None, [1, 0])
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt
from sklearn_evaluation import plot

## Load and Split Dataset
digits = datasets.load_digits()
features = digits.data
labels = digits.target

# Split the data into 60% training and 40% testing.
x_train, x_test, y_train, y_test = train_test_split(features, labels,
                                                    test_size=.4)
print('Training samples is : ', len(x_train))
print('Testing samples is : ', len((x_test)))

## Training and Testing
ANN = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(10, ),
                    random_state=3)
print('Training ......')
# Fit on the training split only: fitting on the full dataset would let the
# model see the test samples and inflate the reported accuracy.
clf = ANN.fit(x_train, y_train)
predictions = clf.predict(x_test)
print('Accuracy is : ', accuracy_score(y_test, predictions))

## Plot the Confusion Matrix
plot.confusion_matrix(y_test, predictions)
plt.show()
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn_evaluation import plot

# ``sklearn.cross_validation`` was removed in scikit-learn 0.20; use its
# replacement and fall back only on installations that predate it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Keyword arguments keep this call valid on releases where the parameters
# after ``n_features`` are keyword-only.
data = datasets.make_classification(n_samples=200, n_features=10,
                                    n_informative=5, class_sep=0.65)
X = data[0]
y = data[1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

est = RandomForestClassifier()
est.fit(X_train, y_train)

y_pred = est.predict(X_test)
y_score = est.predict_proba(X_test)
y_true = y_test

plot.confusion_matrix(y_true, y_pred)
plt.show()
df.to_csv(root_path / "reports/train_test_split_selection.csv", index=False) # Save summary only df.loc[df["index"] == "accuracy"].reset_index(drop=True)[[ "pipeline", "train_percentage", "precision" ]].to_csv( root_path / "reports/train_test_split_accuracy.csv", index=False, header=["pipeline", "train_percentage", "accuracy"], ) #%% # Do a confusion matrix for the best train_percentage split # RandomForestClassifier ax = plot.confusion_matrix(best_test_y["RandomForestClassifier"], best_test_y_pred["RandomForestClassifier"], target_names=[ "red soil", "cotton crop", "grey soil", "damp grey soil", "vegetation stubble soil", "very damp grey soil" ]) fig = ax.get_figure() fig.set_figheight(15) fig.set_figwidth(15) fig.savefig(root_path / 'reports/figures/confusion_matrix_RandomForestClassifier.png') fig.clear() # SVC ax = plot.confusion_matrix(best_test_y["SVC"], best_test_y_pred["SVC"], target_names=[ "red soil", "cotton crop", "grey soil",
# Look up the estimator class from its module path and class name, then
# instantiate it with default arguments.
module_ = importlib.import_module(module_name)
class_ = getattr(module_, class_name)
clf = class_()

# Load the joined dataset and separate the label column from the features.
df = pd.read_parquet(str(upstream['join']))
y = df['target']
X = df.drop(columns='target')

# Grid search over the parameters that were passed in.
grid = GridSearchCV(clf, model_params, cv=2, n_jobs=-1)

# Nested cross-validation: the grid search itself is cross-validated, so the
# predictions estimate generalization performance while hyperparameters are
# being tuned.
y_pred = cross_val_predict(grid, X, y)
print(classification_report(y, y_pred))
plot.confusion_matrix(y, y_pred)

# Fit the search on all the data to find the best parameters.
grid.fit(X, y)
grid.best_params_

plot.grid_search(grid.cv_results_, change=list(model_params))

best = grid.best_estimator_
best

# Serialize the best estimator to the model product path.
with open(product['model'], 'wb') as model_file:
    pickle.dump(best, model_file)
def _load_pickle(path):
    """Return the object unpickled from ``path``."""
    # NOTE: unpickling can execute arbitrary code; only load artifacts that
    # come from a trusted source.
    with open(path, 'rb') as f:
        return pickle.load(f)


def _ensure_directory(path):
    """Create directory ``path`` if it is non-empty and does not exist yet."""
    # An empty string means "current directory" (e.g. the dirname of a bare
    # file name); os.makedirs('') would raise, so skip it.
    if path and not os.path.isdir(path):
        os.makedirs(path)


def main(args):
    """Evaluate a pickled classifier on each test CSV and record the scores.

    For every path in ``args.test_paths`` the data is vectorized with the
    pickled count/tf-idf transformers, predictions are made, a confusion
    matrix image is optionally saved, and one row of metrics is appended to
    the CSV at ``args.result_path`` (created with a header if missing).

    Args:
        args: Object providing ``pickle_dir``, ``model_path``,
            ``result_path``, ``confusion_matrix_dir`` and ``test_paths``.
    """
    print('Preparing...')

    # Load CountVectorizer and TfidfTransformer
    review_count = _load_pickle(
        os.path.join(args.pickle_dir, 'review_CountVectorizer.pickle'))
    review_tfidf = _load_pickle(
        os.path.join(args.pickle_dir, 'review_TfidfTransformer.pickle'))
    title_count = _load_pickle(
        os.path.join(args.pickle_dir, 'title_CountVectorizer.pickle'))
    title_tfidf = _load_pickle(
        os.path.join(args.pickle_dir, 'title_TfidfTransformer.pickle'))

    # Load model
    clf = _load_pickle(args.model_path)

    # binary or not
    binary = len(clf.classes_) == 2

    # Init Result File
    _ensure_directory(os.path.dirname(args.result_path))
    if not os.path.isfile(args.result_path):
        columns = ['Dataset Name', 'accuracy', 'precision', 'recall', 'f1']
        if binary:
            columns.append('roc_auc')
        pd.DataFrame(columns=columns).to_csv(args.result_path, index=False)

    # Init Confusion Matrix Directory
    if args.confusion_matrix_dir:
        _ensure_directory(args.confusion_matrix_dir)

    # Evaluating
    for test_path in args.test_paths:
        test_name = os.path.splitext(os.path.basename(test_path))[0]
        test_df = pd.read_csv(test_path)
        test_X, test_y = vectorize.count_tfidf_make_dataset(
            test_df, review_count, review_tfidf, title_count, title_tfidf)
        pred = clf.predict(test_X)

        # Save Confusion Matrix Image
        if args.confusion_matrix_dir:
            plot.confusion_matrix(test_y, pred)
            plt.savefig(
                os.path.join(args.confusion_matrix_dir,
                             '{}.png'.format(test_name)))
            # Clear the figure so the next dataset starts from a blank canvas.
            plt.clf()

        # Save Result: the file is re-read and rewritten on every iteration so
        # rows from earlier runs are preserved.
        result_df = pd.read_csv(args.result_path)
        if binary:
            row = {
                'Dataset Name': test_name,
                'accuracy': accuracy_score(test_y, pred),
                'precision': precision_score(test_y, pred),
                'recall': recall_score(test_y, pred),
                'f1': f1_score(test_y, pred),
                'roc_auc': roc_auc_score(test_y, pred),
            }
        else:
            row = {
                'Dataset Name': test_name,
                'accuracy': accuracy_score(test_y, pred),
                'precision': precision_score(test_y, pred,
                                             average='weighted'),
                'recall': recall_score(test_y, pred, average='weighted'),
                'f1': f1_score(test_y, pred, average='weighted'),
                # ROC AUC is not available on multi class
            }
        result_df.loc[len(result_df)] = row
        result_df.to_csv(args.result_path, index=False)

        print('{} Done...'.format(test_name))
# ``sklearn.cross_validation`` was removed in scikit-learn 0.20; use its
# replacement and fall back only on installations that predate it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn import datasets
from sklearn_evaluation.plot import confusion_matrix
import matplotlib.pyplot as plt
from matplotlib import style

style.use('seaborn-dark')

# Import some data to play with. Keyword arguments keep this call valid on
# releases where the parameters after ``n_features`` are keyword-only.
data = datasets.make_classification(n_samples=1000, n_features=10,
                                    n_informative=5, class_sep=0.7,
                                    n_classes=8)
X = data[0]
y = data[1]

# shuffle and split training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5,
                                                    random_state=0)

est = RandomForestClassifier()
est.fit(X_train, y_train)

y_pred = est.predict(X_test)
y_true = y_test

# Normalized matrix first, then the raw counts.
confusion_matrix(y_true, y_pred, normalize=True)
plt.show()

confusion_matrix(y_true, y_pred)
plt.show()
from sklearn.ensemble import RandomForestClassifier
from sklearn_evaluation import plot

# + tags=["parameters"]
upstream = ['join']
product = None
# -

# Load the joined dataset and separate the label column from the features.
df = pd.read_parquet(str(upstream['join']))
y = df['target']
X = df.drop(columns='target')

# Hold out a third of the rows for evaluation; the seed is fixed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.33)

# + tags=["model-training"]
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)
# -

# Score the fitted model on the held-out rows.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
plot.confusion_matrix(y_test, y_pred)

# Serialize the fitted model to the model product path.
with open(product['model'], 'wb') as model_file:
    pickle.dump(clf, model_file)
from sklearn.ensemble import RandomForestClassifier
# ``sklearn.cross_validation`` was removed in scikit-learn 0.20; use its
# replacement and fall back only on installations that predate it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn import datasets
from sklearn_evaluation.plot import confusion_matrix
import matplotlib.pyplot as plt
from matplotlib import style

style.use('seaborn-dark')

# Import some data to play with. Keyword arguments keep this call valid on
# releases where the parameters after ``n_features`` are keyword-only.
data = datasets.make_classification(n_samples=1000, n_features=10,
                                    n_informative=5, class_sep=0.7,
                                    n_classes=8)
X = data[0]
y = data[1]

# shuffle and split training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5,
                                                    random_state=0)

est = RandomForestClassifier()
est.fit(X_train, y_train)

y_pred = est.predict(X_test)
y_true = y_test

# Normalized matrix first, then the raw counts.
confusion_matrix(y_true, y_pred, normalize=True)
plt.show()

confusion_matrix(y_true, y_pred)
plt.show()
def _load_pickle(path):
    """Return the object unpickled from ``path``."""
    # NOTE: unpickling can execute arbitrary code; only load artifacts that
    # come from a trusted source.
    with open(path, 'rb') as f:
        return pickle.load(f)


def _ensure_directory(path):
    """Create directory ``path`` if it is non-empty and does not exist yet."""
    # An empty string means "current directory" (e.g. the dirname of a bare
    # file name); os.makedirs('') would raise, so skip it.
    if path and not os.path.isdir(path):
        os.makedirs(path)


def main(args):
    """Train several classifiers and record their dev-test scores.

    The train and dev-test CSVs are vectorized with either Doc2Vec models or
    pickled count/tf-idf transformers. Each classifier in the model list is
    fitted with ``class_weight='balanced'``, evaluated on the dev-test set,
    optionally plotted as a confusion matrix, and appended as one row to the
    CSV at ``args.result_path`` (created with a header if missing). The
    fitted ``LogisticRegression`` is additionally pickled under ``model/``.

    Args:
        args: Object providing ``train_path``, ``devtest_path``,
            ``vectorize``, ``result_path`` and ``confusion_matrix_dir``, plus
            ``doc2vec_title_path``/``doc2vec_review_path`` or ``pickle_dir``
            depending on the vectorization method.

    Raises:
        ValueError: If ``args.vectorize`` is neither ``'doc2vec'`` nor
            ``'count_tfidf'``.
    """
    print('Preparing...')

    # Load Dataset
    train_df = pd.read_csv(args.train_path)
    devtest_df = pd.read_csv(args.devtest_path)

    if args.vectorize == 'doc2vec':
        title_doc2vec = Doc2Vec.load(args.doc2vec_title_path)
        review_doc2vec = Doc2Vec.load(args.doc2vec_review_path)
        train_X, train_y = vectorize.doc2vec_make_dataset(
            train_df, review_doc2vec, title_doc2vec)
        devtest_X, devtest_y = vectorize.doc2vec_make_dataset(
            devtest_df, review_doc2vec, title_doc2vec)
    elif args.vectorize == 'count_tfidf':
        review_count = _load_pickle(
            os.path.join(args.pickle_dir, 'review_CountVectorizer.pickle'))
        review_tfidf = _load_pickle(
            os.path.join(args.pickle_dir, 'review_TfidfTransformer.pickle'))
        title_count = _load_pickle(
            os.path.join(args.pickle_dir, 'title_CountVectorizer.pickle'))
        title_tfidf = _load_pickle(
            os.path.join(args.pickle_dir, 'title_TfidfTransformer.pickle'))
        train_X, train_y = vectorize.count_tfidf_make_dataset(
            train_df, review_count, review_tfidf, title_count, title_tfidf)
        devtest_X, devtest_y = vectorize.count_tfidf_make_dataset(
            devtest_df, review_count, review_tfidf, title_count, title_tfidf)
    else:
        raise ValueError('vectorize method must be doc2vec or count_tfidf')

    # binary or not
    binary = len(np.unique(train_y)) == 2

    # Init Result File
    _ensure_directory(os.path.dirname(args.result_path))
    if not os.path.isfile(args.result_path):
        columns = ['Model Name', 'accuracy', 'precision', 'recall', 'f1']
        if binary:
            columns.append('roc_auc')
        pd.DataFrame(columns=columns).to_csv(args.result_path, index=False)

    # Init Confusion Matrix Directory
    if args.confusion_matrix_dir:
        _ensure_directory(args.confusion_matrix_dir)

    # Model list
    clf_models = [
        DecisionTreeClassifier,
        LogisticRegression,
        Perceptron,
        RandomForestClassifier,
        LinearSVC,
    ]

    # Training
    for model in clf_models:
        clf = model(class_weight='balanced')
        clf.fit(train_X, train_y)
        pred = clf.predict(devtest_X)

        # Save Best Model (LogisticRegression is the one that gets persisted)
        if model is LogisticRegression:
            if binary:
                best_path = 'model/binary_best.pickle'
            else:
                best_path = 'model/multi_best.pickle'
            # The target directory is hard-coded, so make sure it exists
            # before opening the file for writing.
            _ensure_directory(os.path.dirname(best_path))
            with open(best_path, 'wb') as f:
                pickle.dump(clf, f)

        # Save Confusion Matrix Image
        if args.confusion_matrix_dir:
            plot.confusion_matrix(devtest_y, pred)
            plt.savefig(
                os.path.join(args.confusion_matrix_dir,
                             '{}.png'.format(model.__name__)))
            # Clear the figure so the next model starts from a blank canvas.
            plt.clf()

        # Save Result: the file is re-read and rewritten on every iteration so
        # rows from earlier runs are preserved.
        result_df = pd.read_csv(args.result_path)
        if binary:
            row = {
                'Model Name': model.__name__,
                'accuracy': accuracy_score(devtest_y, pred),
                'precision': precision_score(devtest_y, pred),
                'recall': recall_score(devtest_y, pred),
                'f1': f1_score(devtest_y, pred),
                'roc_auc': roc_auc_score(devtest_y, pred),
            }
        else:
            row = {
                'Model Name': model.__name__,
                'accuracy': accuracy_score(devtest_y, pred),
                'precision': precision_score(devtest_y, pred,
                                             average='weighted'),
                'recall': recall_score(devtest_y, pred, average='weighted'),
                'f1': f1_score(devtest_y, pred, average='weighted'),
                # ROC AUC is not available on multi class
            }
        result_df.loc[len(result_df)] = row
        result_df.to_csv(args.result_path, index=False)

        print('{} Done...'.format(model.__name__))
# extract_upstream=True in your pipeline.yaml file, if this task has # dependencies, list them them here (e.g. upstream = ['some_task']), otherwise # leave as None upstream = ['get', 'petal-area', 'sepal-area'] # extract_product=False in your pipeline.yaml file, leave this as None, the # value in the YAML spec will be added here during task execution product = None # - df = pd.read_csv(upstream['get']['data']) petal = pd.read_csv(upstream['petal-area']['data']) sepal = pd.read_csv(upstream['sepal-area']['data']) train = df.join(petal).join(sepal) X = train.drop('target', axis='columns') y = train.target model = RandomForestClassifier() model.fit(X, y) y_pred = model.predict(X) confusion_matrix(y, y_pred) Path(product['model']).write_bytes(pickle.dumps(model))