Ejemplo n.º 1
0
def evaluate_generalization(test_dataset, estimator):
    """Report how a fitted estimator performs on a held-out dataset.

    Predicts on ``test_dataset.X``, compares against ``test_dataset.y``,
    saves a confusion-matrix image to
    ``generalization_confusion_matrix.png`` and prints the accuracy, the
    log loss and a classification report.

    Args:
        test_dataset: Object exposing the features as ``X`` and the true
            labels as ``y``.
        estimator: Fitted model providing ``predict``. When it also provides
            ``predict_proba`` the log loss is computed from the predicted
            probabilities.
    """
    predictions = estimator.predict(test_dataset.X)
    true_labels = test_dataset.y
    accuracy = accuracy_score(true_labels, predictions)

    # Log loss is defined on predicted probabilities, not on hard class
    # labels, so prefer predict_proba whenever the estimator offers it.
    if hasattr(estimator, "predict_proba"):
        probabilities = estimator.predict_proba(test_dataset.X)
        # Passing the estimator's classes keeps the probability columns
        # aligned even if some class is absent from the test labels.
        loss = log_loss(true_labels,
                        probabilities,
                        labels=getattr(estimator, "classes_", None))
    else:
        # No probabilities available: keep the previous computation so that
        # callers with such estimators see unchanged behavior.
        loss = log_loss(true_labels, predictions)

    ax = plot.confusion_matrix(true_labels, predictions)
    plt.subplot(ax)
    plt.savefig("generalization_confusion_matrix.png", format="png")

    print("Accuracy", accuracy)
    print("Log Loss", loss)
    print(classification_report(true_labels, predictions))
Ejemplo n.º 2
0
def plot_confusion_matrix(y,
                          preds,
                          classes,
                          ofname,
                          title='Confusion matrix',
                          figsize=(10, 10),
                          cmap=cm.Blues,
                          logscale=False,
                          verbose=0):
    """Plot a confusion matrix and save it to ``ofname``.

    Args:
        y: True labels.
        preds: Predicted labels.
        classes: Class names, forwarded as ``target_names``.
        ofname: Output file name; ``ensure_dir`` is called on it before
            saving.
        title: Title set on the axes.
        figsize: Figure size passed to ``plt.figure``.
        cmap: Colormap forwarded to the plotting call.
        logscale: Accepted for interface compatibility; currently unused.
        verbose: When greater than 0, print a progress message.
    """
    if verbose > 0:
        print("Plotting confusion matrix ...")
    fig = plt.figure(tight_layout=True, figsize=figsize)
    try:
        ax = fig.add_subplot(1, 1, 1)
        sklearnplot.confusion_matrix(y,
                                     preds,
                                     target_names=classes,
                                     cmap=cmap,
                                     ax=ax)
        # Apply the caller's title; it used to be silently ignored.
        ax.set_title(title)

        ensure_dir(ofname)
        # Save through the figure object rather than the implicit "current
        # figure", so the right figure is written regardless of pyplot state.
        fig.savefig(ofname)
    finally:
        # Always release this figure, even when plotting or saving fails.
        plt.close(fig)
Ejemplo n.º 3
0
def evaluate_generalization(test_dataset, estimator):
    """Score an estimator on a held-out dataset with ten named classes.

    Predicts on ``test_dataset.X``, saves a confusion matrix labelled with
    the class names below to ``cifar_generalization_confusion_matrix.png``,
    then prints the accuracy and a classification report. Log loss is
    deliberately not reported by this variant.
    """
    class_names = [
        "plane", "auto", "bird", "cat", "deer",
        "dog", "frog", "horse", "ship", "truck"
    ]

    y_hat = estimator.predict(test_dataset.X)
    y_true = test_dataset.y
    acc = accuracy_score(y_true, y_hat)

    matrix_ax = plot.confusion_matrix(y_true, y_hat, target_names=class_names)
    plt.subplot(matrix_ax)
    plt.savefig("cifar_generalization_confusion_matrix.png", format="png")

    print("Accuracy", acc)
    print(classification_report(y_true, y_hat))
def test_normalized_confusion_matrix():
    """Draw the confusion matrix with normalization switched on."""
    plot.confusion_matrix(
        y_test,
        y_pred,
        target_names,
        normalize=True,
    )
def test_confusion_matrix():
    """Draw the confusion matrix with the default (raw count) settings."""
    plot.confusion_matrix(
        y_test,
        y_pred,
        target_names,
    )
Ejemplo n.º 6
0
 def test_confusion_matrix(self):
     """Plotting without true labels must raise a descriptive ValueError."""
     # assertRaisesRegexp is a deprecated alias that was removed in
     # Python 3.12; assertRaisesRegex is the supported spelling.
     with self.assertRaisesRegex(ValueError, "needed to plot"):
         plot.confusion_matrix(None, [1, 0])
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt
from sklearn_evaluation import plot
## Load and Split Dataset
digits = datasets.load_digits()
features = digits.data
labels = digits.target

# split the data to 60% training and 40% testing
x_train, x_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=.4)
print('Training samples is : ', len(x_train))
print('Testing samples is : ', len(x_test))
## Training and Testing
print('Training ......')
ANN = MLPClassifier(solver='lbfgs',
                    alpha=1e-5,
                    hidden_layer_sizes=(10, ),
                    random_state=3)
# Fit on the training split only. Fitting on the full dataset would let the
# model see the test samples and make the reported accuracy meaningless.
clf = ANN.fit(x_train, y_train)
predictions = clf.predict(x_test)

print('Accuracy is : ', accuracy_score(y_test, predictions))
## Plot the Confusion Matrix
plot.confusion_matrix(y_test, predictions)
plt.show()
def test_normalized_confusion_matrix():
    """Draw the confusion matrix with normalization switched on."""
    plot.confusion_matrix(
        y_test,
        y_pred,
        target_names,
        normalize=True,
    )
Ejemplo n.º 9
0
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier

# sklearn.cross_validation was removed in scikit-learn 0.20; fall back to it
# only for very old installations that lack sklearn.model_selection.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

from sklearn_evaluation import plot

# Synthetic classification problem: 200 samples, 10 features, 5 informative.
# n_informative is passed by keyword because recent scikit-learn releases
# accept only the first two arguments positionally.
data = datasets.make_classification(200, 10, n_informative=5, class_sep=0.65)
X = data[0]
y = data[1]

# Hold out 30% of the samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

est = RandomForestClassifier()
est.fit(X_train, y_train)

y_pred = est.predict(X_test)
y_score = est.predict_proba(X_test)
y_true = y_test

# Plot the confusion matrix of the held-out predictions.
plot.confusion_matrix(y_true, y_pred)
plt.show()
Ejemplo n.º 10
0
# Persist the complete results table.
df.to_csv(root_path / "reports/train_test_split_selection.csv", index=False)
# Save summary only: keep the rows whose "index" is "accuracy" and export
# their "precision" column under an "accuracy" header.
accuracy_rows = df.loc[df["index"] == "accuracy"].reset_index(drop=True)
accuracy_rows[["pipeline", "train_percentage", "precision"]].to_csv(
    root_path / "reports/train_test_split_accuracy.csv",
    index=False,
    header=["pipeline", "train_percentage", "accuracy"],
)
#%%
# Do a confusion matrix for the best train_percentage split
# RandomForestClassifier
soil_class_names = [
    "red soil", "cotton crop", "grey soil",
    "damp grey soil", "vegetation stubble soil",
    "very damp grey soil"
]
ax = plot.confusion_matrix(best_test_y["RandomForestClassifier"],
                           best_test_y_pred["RandomForestClassifier"],
                           target_names=soil_class_names)
fig = ax.get_figure()
# 15 x 15 inch canvas, then write the image and wipe the figure.
fig.set_size_inches(15, 15)
fig.savefig(
    root_path / 'reports/figures/confusion_matrix_RandomForestClassifier.png')
fig.clear()

# SVC
ax = plot.confusion_matrix(best_test_y["SVC"],
                           best_test_y_pred["SVC"],
                           target_names=[
                               "red soil", "cotton crop", "grey soil",
Ejemplo n.º 11
0
# Look up the estimator class by module and class name, then build it with
# its default arguments.
module_ = importlib.import_module(module_name)
class_ = getattr(module_, class_name)
clf = class_()

# Features are every column except 'target'; 'target' is the label.
df = pd.read_parquet(str(upstream['join']))
X = df.drop(columns='target')
y = df['target']

# Perform grid search over the passed parameters
grid = GridSearchCV(clf, model_params, n_jobs=-1, cv=2)

# We want to estimate generalization performance *and* tune hyperparameters
# so we are using nested cross-validation
y_pred = cross_val_predict(grid, X, y)

print(classification_report(y, y_pred))

plot.confusion_matrix(y, y_pred)

# find best params
grid.fit(X, y)
grid.best_params_

plot.grid_search(grid.cv_results_, change=list(model_params))

best = grid.best_estimator_
best

# Serialize the best estimator to the declared model product.
with open(product['model'], 'wb') as f:
    f.write(pickle.dumps(best))
def main(args):
    """Evaluate a pickled classifier on each test CSV and record its metrics.

    Loads the pickled count/TF-IDF transformers from ``args.pickle_dir`` and
    the model from ``args.model_path``. For every path in
    ``args.test_paths`` it vectorizes the data, predicts, optionally saves a
    confusion-matrix image into ``args.confusion_matrix_dir`` and appends one
    row of metrics to the CSV at ``args.result_path``. The CSV is created
    with a header row when it does not exist yet.

    Args:
        args: Namespace providing ``pickle_dir``, ``model_path``,
            ``result_path``, ``confusion_matrix_dir`` and ``test_paths``.
    """
    print('Preparing...')

    # NOTE(security): pickle.load can execute arbitrary code. Only point
    # pickle_dir / model_path at files produced by a trusted source.

    # Load CountVectorizer and TfidfTransformer
    with open(os.path.join(args.pickle_dir, 'review_CountVectorizer.pickle'),
              'rb') as f:
        review_count = pickle.load(f)

    with open(os.path.join(args.pickle_dir, 'review_TfidfTransformer.pickle'),
              'rb') as f:
        review_tfidf = pickle.load(f)

    with open(os.path.join(args.pickle_dir, 'title_CountVectorizer.pickle'),
              'rb') as f:
        title_count = pickle.load(f)

    with open(os.path.join(args.pickle_dir, 'title_TfidfTransformer.pickle'),
              'rb') as f:
        title_tfidf = pickle.load(f)

    # Load model
    with open(args.model_path, 'rb') as f:
        clf = pickle.load(f)

    # binary or not
    binary = len(clf.classes_) == 2

    # Init Result File
    # result_dir is '' when result_path has no directory component;
    # os.makedirs('') would raise, so only create a real directory.
    result_dir = os.path.split(args.result_path)[0]
    if result_dir and not os.path.isdir(result_dir):
        os.makedirs(result_dir)

    if not os.path.isfile(args.result_path):
        columns = ['Dataset Name', 'accuracy', 'precision', 'recall', 'f1']
        if binary:
            columns.append('roc_auc')
        pd.DataFrame(columns=columns).to_csv(args.result_path, index=False)

    # Init Confusion Matrix Directory
    if args.confusion_matrix_dir:
        if not os.path.isdir(args.confusion_matrix_dir):
            os.makedirs(args.confusion_matrix_dir)

    # 'binary' is the default averaging mode of the scikit-learn scorers, so
    # naming it explicitly keeps the two-class results unchanged.
    average = 'binary' if binary else 'weighted'

    # Evaluating
    for test_path in args.test_paths:
        test_name = os.path.splitext(os.path.split(test_path)[1])[0]

        test_df = pd.read_csv(test_path)
        test_X, test_y = vectorize.count_tfidf_make_dataset(
            test_df, review_count, review_tfidf, title_count, title_tfidf)
        pred = clf.predict(test_X)

        # Save Confusion Matrix Image
        if args.confusion_matrix_dir:
            plot.confusion_matrix(test_y, pred)
            plt.savefig(
                os.path.join(args.confusion_matrix_dir,
                             '{}.png'.format(test_name)))
            plt.clf()

        # Save Result
        result_df = pd.read_csv(args.result_path)

        row = {
            'Dataset Name': test_name,
            'accuracy': accuracy_score(test_y, pred),
            'precision': precision_score(test_y, pred, average=average),
            'recall': recall_score(test_y, pred, average=average),
            'f1': f1_score(test_y, pred, average=average),
        }
        if binary:
            # ROC AUC is only recorded for the two-class case. It is
            # computed from the hard predictions, not from scores.
            row['roc_auc'] = roc_auc_score(test_y, pred)
        result_df.loc[len(result_df)] = row

        result_df.to_csv(args.result_path, index=False)
        print('{} Done...'.format(test_name))
Ejemplo n.º 13
0
# sklearn.cross_validation was removed in scikit-learn 0.20; fall back to it
# only for very old installations that lack sklearn.model_selection.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn import datasets
# RandomForestClassifier is used below; import it here so the example is
# self-contained.
from sklearn.ensemble import RandomForestClassifier
from sklearn_evaluation.plot import confusion_matrix

import matplotlib.pyplot as plt

from matplotlib import style
# NOTE(review): newer Matplotlib releases renamed the bundled seaborn styles
# (e.g. 'seaborn-v0_8-dark'); confirm against the installed version.
style.use('seaborn-dark')

# Import some data to play with
# n_informative is passed by keyword because recent scikit-learn releases
# accept only the first two arguments positionally.
data = datasets.make_classification(1000,
                                    10,
                                    n_informative=5,
                                    class_sep=0.7,
                                    n_classes=8)
X = data[0]
y = data[1]

# shuffle and split training and test sets
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=.5,
                                                    random_state=0)

est = RandomForestClassifier()
est.fit(X_train, y_train)
y_pred = est.predict(X_test)
y_true = y_test

# Normalized matrix first, then the raw counts.
confusion_matrix(y_true, y_pred, normalize=True)
plt.show()

confusion_matrix(y_true, y_pred)
plt.show()
Ejemplo n.º 14
0
from sklearn.ensemble import RandomForestClassifier
from sklearn_evaluation import plot

# + tags=["parameters"]
upstream = ['join']
product = None
# -

# Features are every column except 'target'; 'target' is the label.
df = pd.read_parquet(str(upstream['join']))
X = df.drop(columns='target')
y = df['target']

# Hold out a third of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)

# + tags=["model-training"]
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)
# -

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

plot.confusion_matrix(y_test, y_pred)

# Serialize the fitted classifier to the declared model product.
with open(product['model'], 'wb') as f:
    f.write(pickle.dumps(clf))
Ejemplo n.º 15
0
from sklearn.ensemble import RandomForestClassifier
# sklearn.cross_validation was removed in scikit-learn 0.20; fall back to it
# only for very old installations that lack sklearn.model_selection.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn import datasets
from sklearn_evaluation.plot import confusion_matrix

import matplotlib.pyplot as plt

from matplotlib import style
# NOTE(review): newer Matplotlib releases renamed the bundled seaborn styles
# (e.g. 'seaborn-v0_8-dark'); confirm against the installed version.
style.use('seaborn-dark')

# Import some data to play with
# n_informative is passed by keyword because recent scikit-learn releases
# accept only the first two arguments positionally.
data = datasets.make_classification(1000,
                                    10,
                                    n_informative=5,
                                    class_sep=0.7,
                                    n_classes=8)
X = data[0]
y = data[1]

# shuffle and split training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5,
                                                    random_state=0)


est = RandomForestClassifier()
est.fit(X_train, y_train)
y_pred = est.predict(X_test)
y_true = y_test

# Normalized matrix first, then the raw counts.
confusion_matrix(y_true, y_pred, normalize=True)
plt.show()

confusion_matrix(y_true, y_pred)
plt.show()
def test_confusion_matrix():
    """Draw the confusion matrix with the default (raw count) settings."""
    plot.confusion_matrix(
        y_test,
        y_pred,
        target_names,
    )
def main(args):
    """Train several classifiers and record their dev-test metrics.

    Reads the train and dev-test CSVs, vectorizes them with the method named
    by ``args.vectorize`` (``'doc2vec'`` or ``'count_tfidf'``), fits each
    classifier in the model list with ``class_weight='balanced'``, optionally
    saves a confusion-matrix image per model into
    ``args.confusion_matrix_dir`` and appends one row of metrics per model to
    the CSV at ``args.result_path``. The fitted ``LogisticRegression`` is
    additionally pickled under ``model/`` as the "best" model.

    Args:
        args: Namespace providing ``train_path``, ``devtest_path``,
            ``vectorize``, ``result_path`` and ``confusion_matrix_dir``, plus
            ``doc2vec_title_path`` / ``doc2vec_review_path`` or
            ``pickle_dir`` depending on the vectorize method.

    Raises:
        ValueError: If ``args.vectorize`` is neither ``'doc2vec'`` nor
            ``'count_tfidf'``.
    """
    print('Preparing...')

    # Load Dataset
    train_df = pd.read_csv(args.train_path)
    devtest_df = pd.read_csv(args.devtest_path)

    if args.vectorize == 'doc2vec':
        title_doc2vec = Doc2Vec.load(args.doc2vec_title_path)
        review_doc2vec = Doc2Vec.load(args.doc2vec_review_path)

        train_X, train_y = vectorize.doc2vec_make_dataset(
            train_df, review_doc2vec, title_doc2vec)
        devtest_X, devtest_y = vectorize.doc2vec_make_dataset(
            devtest_df, review_doc2vec, title_doc2vec)

    elif args.vectorize == 'count_tfidf':
        # NOTE(security): pickle.load can execute arbitrary code. Only point
        # pickle_dir at files produced by a trusted source.
        with open(
                os.path.join(args.pickle_dir, 'review_CountVectorizer.pickle'),
                'rb') as f:
            review_count = pickle.load(f)

        with open(
                os.path.join(args.pickle_dir,
                             'review_TfidfTransformer.pickle'), 'rb') as f:
            review_tfidf = pickle.load(f)

        with open(
                os.path.join(args.pickle_dir, 'title_CountVectorizer.pickle'),
                'rb') as f:
            title_count = pickle.load(f)

        with open(
                os.path.join(args.pickle_dir, 'title_TfidfTransformer.pickle'),
                'rb') as f:
            title_tfidf = pickle.load(f)

        train_X, train_y = vectorize.count_tfidf_make_dataset(
            train_df, review_count, review_tfidf, title_count, title_tfidf)
        devtest_X, devtest_y = vectorize.count_tfidf_make_dataset(
            devtest_df, review_count, review_tfidf, title_count, title_tfidf)

    else:
        raise ValueError('vectorize method must be doc2vec or count_tfidf')

    # binary or not
    binary = len(np.unique(train_y)) == 2

    # Init Result File
    # result_dir is '' when result_path has no directory component;
    # os.makedirs('') would raise, so only create a real directory.
    result_dir = os.path.split(args.result_path)[0]
    if result_dir and not os.path.isdir(result_dir):
        os.makedirs(result_dir)

    if not os.path.isfile(args.result_path):
        columns = ['Model Name', 'accuracy', 'precision', 'recall', 'f1']
        if binary:
            columns.append('roc_auc')
        pd.DataFrame(columns=columns).to_csv(args.result_path, index=False)

    # Init Confusion Matrix Directory
    if args.confusion_matrix_dir:
        if not os.path.isdir(args.confusion_matrix_dir):
            os.makedirs(args.confusion_matrix_dir)

    # Model list
    clf_models = [
        DecisionTreeClassifier,
        LogisticRegression,
        Perceptron,
        RandomForestClassifier,
        LinearSVC,
    ]

    # 'binary' is the default averaging mode of the scikit-learn scorers, so
    # naming it explicitly keeps the two-class results unchanged.
    average = 'binary' if binary else 'weighted'

    # Training
    for model in clf_models:
        clf = model(class_weight='balanced')
        clf.fit(train_X, train_y)
        pred = clf.predict(devtest_X)

        # Save Best Model
        if model is LogisticRegression:
            if binary:
                best_path = 'model/binary_best.pickle'
            else:
                best_path = 'model/multi_best.pickle'

            # Create the output directory if needed so that open() below
            # does not fail on a fresh checkout.
            best_dir = os.path.dirname(best_path)
            if best_dir and not os.path.isdir(best_dir):
                os.makedirs(best_dir)

            with open(best_path, 'wb') as f:
                pickle.dump(clf, f)

        # Save Confusion Matrix Image
        if args.confusion_matrix_dir:
            plot.confusion_matrix(devtest_y, pred)
            plt.savefig(
                os.path.join(args.confusion_matrix_dir,
                             '{}.png'.format(model.__name__)))
            plt.clf()

        # Save Result
        result_df = pd.read_csv(args.result_path)

        row = {
            'Model Name': model.__name__,
            'accuracy': accuracy_score(devtest_y, pred),
            'precision': precision_score(devtest_y, pred, average=average),
            'recall': recall_score(devtest_y, pred, average=average),
            'f1': f1_score(devtest_y, pred, average=average),
        }
        if binary:
            # ROC AUC is only recorded for the two-class case. It is
            # computed from the hard predictions, not from scores.
            row['roc_auc'] = roc_auc_score(devtest_y, pred)
        result_df.loc[len(result_df)] = row

        result_df.to_csv(args.result_path, index=False)
        print('{} Done...'.format(model.__name__))
Ejemplo n.º 18
0
# extract_upstream=True in your pipeline.yaml file, if this task has
# dependencies, list them here (e.g. upstream = ['some_task']), otherwise
# leave as None
upstream = ['get', 'petal-area', 'sepal-area']

# extract_product=False in your pipeline.yaml file, leave this as None, the
# value in the YAML spec  will be added here during task execution
product = None
# -

# Read the base table and the two derived feature tables.
df = pd.read_csv(upstream['get']['data'])

petal = pd.read_csv(upstream['petal-area']['data'])

sepal = pd.read_csv(upstream['sepal-area']['data'])

# Index-aligned join of the three tables into one training frame.
train = df.join(petal).join(sepal)

X = train.drop(columns='target')
y = train['target']

model = RandomForestClassifier()

model.fit(X, y)

# Predictions are made on the same rows the model was fitted on.
y_pred = model.predict(X)

confusion_matrix(y, y_pred)

# Serialize the fitted model to the declared model product.
model_path = Path(product['model'])
model_path.write_bytes(pickle.dumps(model))