Example #1
# (top of this snippet was truncated; heatmap of the Naive Bayes confusion
# matrix, with `mnb_cm` assumed to come from the cut-off portion above)
sns.heatmap(mnb_cm,
            annot=True,
            linewidth=.5,
            square=True,
            cmap='Blues_r',
            fmt='f')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.title('Confusion Matrix', size=15)

report = classification_report(test_df['test_label'], mnb_pred)
print(report)

# ## Passive Aggressive Classifier

## Instantiate a Passive Aggressive Classifier: pa_tfidf_clf
pa_tfidf_clf = PassiveAggressiveClassifier()

## Fit the classifier to the training data
pa_tfidf_clf.fit(count_train, train_df['train_label'])

## Create the predicted tags: pac_pred
pac_pred = pa_tfidf_clf.predict(count_test)
## Calculate the accuracy score: pac_score
pac_score = metrics.accuracy_score(test_df['test_label'], pac_pred)

## Calculate the confusion matrix: pac_cm
pac_cm = metrics.confusion_matrix(test_df['test_label'],
                                  pac_pred,
                                  labels=['true', 'false'])
print('Confusion Matrix --- PassiveAggressiveClassifier')
print(pac_cm)
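
# The PAC confusion matrix can be visualized like the Naive Bayes one above
# (a sketch reusing the earlier seaborn call):
sns.heatmap(pac_cm,
            annot=True,
            linewidth=.5,
            square=True,
            cmap='Blues_r',
            fmt='f')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.title('Confusion Matrix - Passive Aggressive', size=15)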
Example #2
    print("classification report:")
    print(metrics.classification_report(y_test, pred,
                                        target_names=categories))

    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time


results = []
for clf, name in (
        (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
        (Perceptron(max_iter=50), "Perceptron"),
        (PassiveAggressiveClassifier(max_iter=50), "Passive-Aggressive"),
        (KNeighborsClassifier(n_neighbors=10), "kNN"),
        (RandomForestClassifier(n_estimators=100), "Random forest")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))

for penalty in ["l2", "l1"]:
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    results.append(benchmark(LinearSVC(loss='squared_hinge', penalty=penalty,
                                       dual=False, tol=1e-3)))

    # Train SGD model
    results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=50,
                                           penalty=penalty)))
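
# (As in the fuller variant of this script later on this page, the benchmark
# typically continues with an Elastic-Net SGD run:)
print('=' * 80)
print("Elastic-Net penalty")
results.append(
    benchmark(SGDClassifier(alpha=.0001, max_iter=50, penalty="elasticnet")))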
Example #3
def modele_ens(clf, tr, y_tr, ts, y_ts):
    """Fit `clf` on the training split and print it with its test accuracy."""
    clf.fit(tr, y_tr)
    pred = clf.predict(ts)
    score = accuracy_score(y_ts, pred)
    print(clf)
    print(score)

# Ensemble methods
modele_ens(SVC(C=0.6, kernel='linear'), x_train2, y_train, x_test2, y_test)

modele_ens(PassiveAggressiveClassifier(C=1, random_state=42),
           x_train2, y_train, x_test2, y_test)

modele_ens(Perceptron(max_iter=3, alpha=1, eta0=0.01, warm_start=True),
           x_train2, y_train, x_test2, y_test)

# memory error
#modele_ens(RidgeClassifierCV(alphas=0.1),x_train2,y_train,x_test2,y_test)

# Unsupervised

#Neural Network

from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import SGD
from keras.layers.advanced_activations import PReLU
from keras.utils import np_utils
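
# The network itself is cut off in this snippet; a minimal sketch of a
# Sequential model built from the imports above (layer sizes, optimizer
# settings, and dense inputs are all assumptions):
n_features = x_train2.shape[1]   # assumes x_train2 is a dense array
n_classes = 2                    # assumed binary task, matching y_train above

model = Sequential()
model.add(Dense(128, input_dim=n_features))
model.add(PReLU())               # learnable leaky-ReLU activation
model.add(Dropout(0.5))          # regularization between layers
model.add(Dense(n_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer=SGD(lr=0.01))
model.fit(x_train2, np_utils.to_categorical(y_train), epochs=5, batch_size=32)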
Example #4
def test_classifier_undefined_methods():
    clf = PassiveAggressiveClassifier(max_iter=100)
    for meth in ("predict_proba", "predict_log_proba", "transform"):
        with pytest.raises(AttributeError):
            getattr(clf, meth)
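
# PassiveAggressiveClassifier deliberately exposes no predict_proba or
# predict_log_proba; a minimal sketch of getting confidence scores instead
# (synthetic data; the names below are assumptions, not part of the test):
from sklearn.datasets import make_classification
from sklearn.linear_model import PassiveAggressiveClassifier

X, y = make_classification(n_samples=100, random_state=0)
clf = PassiveAggressiveClassifier(max_iter=100).fit(X, y)
margins = clf.decision_function(X)  # signed hyperplane distances, not probabilities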
Example #5
    print("classification report:")
    print(metrics.classification_report(y_test,
                                        pred,
                                        target_names=categories))

    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time


results = []
for clf, name in ((RidgeClassifier(tol=1e-2, solver="lsqr"),
                   "Ridge Classifier"), (Perceptron(n_iter=50), "Perceptron"),
                  (PassiveAggressiveClassifier(n_iter=50),
                   "Passive-Aggressive"),
                  (KNeighborsClassifier(n_neighbors=10), "kNN")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))

for penalty in ["l2", "l1"]:
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    results.append(
        benchmark(LinearSVC(loss='squared_hinge', penalty=penalty, dual=False,
                            tol=1e-3)))

    # Train SGD model
    results.append(
        benchmark(SGDClassifier(alpha=.0001, max_iter=50, penalty=penalty)))
Example #6
]  # tail of a list whose head (including the `penalty` variable) was truncated

sgd_regressors = [('SGD-sl', SGDRegressor(loss='squared_error',
                                          penalty=penalty)),
                  ('SGD-h', SGDRegressor(loss='huber', penalty=penalty)),
                  ('SGD-ei',
                   SGDRegressor(loss='epsilon_insensitive', penalty=penalty)),
                  ('SGD-sei',
                   SGDRegressor(loss='squared_epsilon_insensitive',
                                penalty=penalty))]

selected_classifiers = [('SGD', SGDClassifier(loss='hinge', penalty='l1')),
                        ('Perceptron-I', Perceptron(penalty='l1')),
                        ('Perceptron-II', Perceptron(penalty='l2')),
                        ('Perceptron-III', Perceptron(penalty='elasticnet')),
                        ('PA-I', PassiveAggressiveClassifier(loss='hinge')),
                        ('PA-II',
                         PassiveAggressiveClassifier(loss='squared_hinge'))]

selected_regressors = [
    ('SGD-l1', SGDRegressor(loss='squared_error', penalty='l2')),  # note: label says l1, penalty is 'l2'
    ('PA-I', PassiveAggressiveRegressor(loss='epsilon_insensitive')),
    ('PA-II', PassiveAggressiveRegressor(loss='squared_epsilon_insensitive'))
]

scalers = [('Min-Max', MinMaxScaler()), ('Standardization', StandardScaler()),
           ('Max-Abs', MaxAbsScaler())]

data = json.load(
    open('machine_learner/collected_data/dataset_with_selected_features.json'))
features = data['features']
    print("classification report:")
    print(metrics.classification_report(y_test, pred,
                                        target_names=target_names))

    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time


results = []
for clf, name in ((RidgeClassifier(tol=1e-2,
                                   solver="sag"), "Ridge Classifier"),
                  (Perceptron(max_iter=50),
                   "Perceptron"), (PassiveAggressiveClassifier(max_iter=50),
                                   "Passive-Aggressive"),
                  (KNeighborsClassifier(n_neighbors=10),
                   "kNN"), (RandomForestClassifier(), "Random forest")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))

for penalty in ["l2", "l1"]:
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    results.append(benchmark(LinearSVC(penalty=penalty, dual=False, tol=1e-3)))

    # Train SGD model
    results.append(
        benchmark(SGDClassifier(alpha=.0001, max_iter=50, penalty=penalty)))

    def Define_pipelines(self):

        logging.info('Start defining pipelines...\n\n')

        SGD_tfidf_pipeline = Pipeline([
            ('tfidf', TfidfVectorizer()),
            ('clf', OneVsRestClassifier(SGDClassifier())),
        ])

        SGD_count_pipeline = Pipeline([
            ('count', CountVectorizer()),
            ('clf', OneVsRestClassifier(SGDClassifier())),
        ])

        SGD_count_embedding_300_15_pipeline = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_300_15, 'mean')),
            ('clf', OneVsRestClassifier(SGDClassifier())),
        ])

        SGD_tfidf_embedding_300_15_pipeline = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_300_15, 'mean')),
            ('clf', OneVsRestClassifier(SGDClassifier())),
        ])

        SGD_count_embedding_300_15_pipeline_sum = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_300_15, 'sum')),
            ('clf', OneVsRestClassifier(SGDClassifier())),
        ])

        SGD_tfidf_embedding_300_15_pipeline_sum = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_300_15, 'sum')),
            ('clf', OneVsRestClassifier(SGDClassifier())),
        ])

        SGD_count_embedding_300_15_pipeline_max = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_300_15, 'max')),
            ('clf', OneVsRestClassifier(SGDClassifier())),
        ])

        SGD_tfidf_embedding_300_15_pipeline_max = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_300_15, 'max')),
            ('clf', OneVsRestClassifier(SGDClassifier())),
        ])

        SGD_count_embedding_300_10_pipeline = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_300_10, 'mean')),
            ('clf', OneVsRestClassifier(SGDClassifier())),
        ])

        SGD_tfidf_embedding_300_10_pipeline = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_300_10, 'mean')),
            ('clf', OneVsRestClassifier(SGDClassifier())),
        ])

        SGD_count_embedding_300_10_pipeline_sum = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_300_10, 'sum')),
            ('clf', OneVsRestClassifier(SGDClassifier())),
        ])

        SGD_tfidf_embedding_300_10_pipeline_sum = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_300_10, 'sum')),
            ('clf', OneVsRestClassifier(SGDClassifier())),
        ])

        SGD_count_embedding_300_10_pipeline_max = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_300_10, 'max')),
            ('clf', OneVsRestClassifier(SGDClassifier())),
        ])

        SGD_tfidf_embedding_300_10_pipeline_max = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_300_10, 'max')),
            ('clf', OneVsRestClassifier(SGDClassifier())),
        ])

        SGD_count_embedding_100_10_pipeline = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_100_10, 'mean')),
            ('clf', OneVsRestClassifier(SGDClassifier())),
        ])

        SGD_tfidf_embedding_100_10_pipeline = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_100_10, 'mean')),
            ('clf', OneVsRestClassifier(SGDClassifier())),
        ])

        SGD_count_embedding_100_10_pipeline_sum = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_100_10, 'sum')),
            ('clf', OneVsRestClassifier(SGDClassifier())),
        ])

        SGD_tfidf_embedding_100_10_pipeline_sum = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_100_10, 'sum')),
            ('clf', OneVsRestClassifier(SGDClassifier())),
        ])

        SGD_count_embedding_100_10_pipeline_max = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_100_10, 'max')),
            ('clf', OneVsRestClassifier(SGDClassifier())),
        ])

        SGD_tfidf_embedding_100_10_pipeline_max = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_100_10, 'max')),
            ('clf', OneVsRestClassifier(SGDClassifier())),
        ])

        SGD_count_embedding_100_15_pipeline = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_100_15, 'mean')),
            ('clf', OneVsRestClassifier(SGDClassifier())),
        ])

        SGD_tfidf_embedding_100_15_pipeline = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_100_15, 'mean')),
            ('clf', OneVsRestClassifier(SGDClassifier())),
        ])

        SGD_count_embedding_100_15_pipeline_sum = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_100_15, 'sum')),
            ('clf', OneVsRestClassifier(SGDClassifier())),
        ])

        SGD_tfidf_embedding_100_15_pipeline_sum = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_100_15, 'sum')),
            ('clf', OneVsRestClassifier(SGDClassifier())),
        ])

        SGD_count_embedding_100_15_pipeline_max = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_100_15, 'max')),
            ('clf', OneVsRestClassifier(SGDClassifier())),
        ])

        SGD_tfidf_embedding_100_15_pipeline_max = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_100_15, 'max')),
            ('clf', OneVsRestClassifier(SGDClassifier())),
        ])

        SVC_tfidf_pipeline = Pipeline([
            ('tfidf', TfidfVectorizer()),
            ('clf', OneVsRestClassifier(SVC())),
        ])

        SVC_count_pipeline = Pipeline([
            ('count', CountVectorizer()),
            ('clf', OneVsRestClassifier(SVC())),
        ])

        SVC_count_embedding_300_15_pipeline = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_300_15, 'mean')),
            ('clf', OneVsRestClassifier(SVC())),
        ])

        SVC_tfidf_embedding_300_15_pipeline = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_300_15, 'mean')),
            ('clf', OneVsRestClassifier(SVC())),
        ])

        SVC_count_embedding_300_15_pipeline_sum = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_300_15, 'sum')),
            ('clf', OneVsRestClassifier(SVC())),
        ])

        SVC_tfidf_embedding_300_15_pipeline_sum = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_300_15, 'sum')),
            ('clf', OneVsRestClassifier(SVC())),
        ])

        SVC_count_embedding_300_15_pipeline_max = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_300_15, 'max')),
            ('clf', OneVsRestClassifier(SVC())),
        ])

        SVC_tfidf_embedding_300_15_pipeline_max = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_300_15, 'max')),
            ('clf', OneVsRestClassifier(SVC())),
        ])

        SVC_count_embedding_100_15_pipeline = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_100_15, 'mean')),
            ('clf', OneVsRestClassifier(SVC())),
        ])

        SVC_tfidf_embedding_100_15_pipeline = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_100_15, 'mean')),
            ('clf', OneVsRestClassifier(SVC())),
        ])

        SVC_count_embedding_100_15_pipeline_sum = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_100_15, 'sum')),
            ('clf', OneVsRestClassifier(SVC())),
        ])

        SVC_tfidf_embedding_100_15_pipeline_sum = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_100_15, 'sum')),
            ('clf', OneVsRestClassifier(SVC())),
        ])

        SVC_count_embedding_100_15_pipeline_max = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_100_15, 'max')),
            ('clf', OneVsRestClassifier(SVC())),
        ])

        SVC_tfidf_embedding_100_15_pipeline_max = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_100_15, 'max')),
            ('clf', OneVsRestClassifier(SVC())),
        ])

        SVC_count_embedding_100_10_pipeline = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_100_10, 'mean')),
            ('clf', OneVsRestClassifier(SVC())),
        ])

        SVC_tfidf_embedding_100_10_pipeline = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_100_10, 'mean')),
            ('clf', OneVsRestClassifier(SVC())),
        ])

        SVC_count_embedding_100_10_pipeline_sum = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_100_10, 'sum')),
            ('clf', OneVsRestClassifier(SVC())),
        ])

        SVC_tfidf_embedding_100_10_pipeline_sum = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_100_10, 'sum')),
            ('clf', OneVsRestClassifier(SVC())),
        ])

        SVC_count_embedding_100_10_pipeline_max = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_100_10, 'max')),
            ('clf', OneVsRestClassifier(SVC())),
        ])

        SVC_tfidf_embedding_100_10_pipeline_max = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_100_10, 'max')),
            ('clf', OneVsRestClassifier(SVC())),
        ])

        SVC_count_embedding_300_10_pipeline = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_300_10, 'mean')),
            ('clf', OneVsRestClassifier(SVC())),
        ])

        SVC_tfidf_embedding_300_10_pipeline = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_300_10, 'mean')),
            ('clf', OneVsRestClassifier(SVC())),
        ])

        SVC_count_embedding_300_10_pipeline_sum = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_300_10, 'sum')),
            ('clf', OneVsRestClassifier(SVC())),
        ])

        SVC_tfidf_embedding_300_10_pipeline_sum = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_300_10, 'sum')),
            ('clf', OneVsRestClassifier(SVC())),
        ])

        SVC_count_embedding_300_10_pipeline_max = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_300_10, 'max')),
            ('clf', OneVsRestClassifier(SVC())),
        ])

        SVC_tfidf_embedding_300_10_pipeline_max = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_300_10, 'max')),
            ('clf', OneVsRestClassifier(SVC())),
        ])

        PA_tfidf_pipeline = Pipeline([
            ('tfidf', TfidfVectorizer()),
            ('clf', OneVsRestClassifier(PassiveAggressiveClassifier())),
        ])

        PA_count_pipeline = Pipeline([
            ('count', CountVectorizer()),
            ('clf', OneVsRestClassifier(PassiveAggressiveClassifier())),
        ])

        PA_count_embedding_300_10_pipeline = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_300_10, 'mean')),
            ('clf', OneVsRestClassifier(PassiveAggressiveClassifier())),
        ])

        PA_tfidf_embedding_300_10_pipeline = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_300_10, 'mean')),
            ('clf', OneVsRestClassifier(PassiveAggressiveClassifier())),
        ])

        PA_count_embedding_300_10_pipeline_sum = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_300_10, 'sum')),
            ('clf', OneVsRestClassifier(PassiveAggressiveClassifier())),
        ])

        PA_tfidf_embedding_300_10_pipeline_sum = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_300_10, 'sum')),
            ('clf', OneVsRestClassifier(PassiveAggressiveClassifier())),
        ])

        PA_count_embedding_300_10_pipeline_max = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_300_10, 'max')),
            ('clf', OneVsRestClassifier(PassiveAggressiveClassifier())),
        ])

        PA_tfidf_embedding_300_10_pipeline_max = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_300_10, 'max')),
            ('clf', OneVsRestClassifier(PassiveAggressiveClassifier())),
        ])

        PA_count_embedding_100_10_pipeline = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_100_10, 'mean')),
            ('clf', OneVsRestClassifier(PassiveAggressiveClassifier())),
        ])

        PA_tfidf_embedding_100_10_pipeline = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_100_10, 'mean')),
            ('clf', OneVsRestClassifier(PassiveAggressiveClassifier())),
        ])

        PA_count_embedding_100_10_pipeline_sum = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_100_10, 'sum')),
            ('clf', OneVsRestClassifier(PassiveAggressiveClassifier())),
        ])

        PA_tfidf_embedding_100_10_pipeline_sum = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_100_10, 'sum')),
            ('clf', OneVsRestClassifier(PassiveAggressiveClassifier())),
        ])

        PA_count_embedding_100_10_pipeline_max = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_100_10, 'max')),
            ('clf', OneVsRestClassifier(PassiveAggressiveClassifier())),
        ])

        PA_tfidf_embedding_100_10_pipeline_max = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_100_10, 'max')),
            ('clf', OneVsRestClassifier(PassiveAggressiveClassifier())),
        ])

        #

        PA_count_embedding_100_15_pipeline = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_100_15, 'mean')),
            ('clf', OneVsRestClassifier(PassiveAggressiveClassifier())),
        ])

        PA_tfidf_embedding_100_15_pipeline = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_100_15, 'mean')),
            ('clf', OneVsRestClassifier(PassiveAggressiveClassifier())),
        ])

        PA_count_embedding_100_15_pipeline_sum = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_100_15, 'sum')),
            ('clf', OneVsRestClassifier(PassiveAggressiveClassifier())),
        ])

        PA_tfidf_embedding_100_15_pipeline_sum = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_100_15, 'sum')),
            ('clf', OneVsRestClassifier(PassiveAggressiveClassifier())),
        ])

        PA_count_embedding_100_15_pipeline_max = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_100_15, 'max')),
            ('clf', OneVsRestClassifier(PassiveAggressiveClassifier())),
        ])

        PA_tfidf_embedding_100_15_pipeline_max = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_100_15, 'max')),
            ('clf', OneVsRestClassifier(PassiveAggressiveClassifier())),
        ])

        #

        PA_count_embedding_300_15_pipeline = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_300_15, 'mean')),
            ('clf', OneVsRestClassifier(PassiveAggressiveClassifier())),
        ])

        PA_tfidf_embedding_300_15_pipeline = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_300_15, 'mean')),
            ('clf', OneVsRestClassifier(PassiveAggressiveClassifier())),
        ])

        PA_count_embedding_300_15_pipeline_sum = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_300_15, 'sum')),
            ('clf', OneVsRestClassifier(PassiveAggressiveClassifier())),
        ])

        PA_tfidf_embedding_300_15_pipeline_sum = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_300_15, 'sum')),
            ('clf', OneVsRestClassifier(PassiveAggressiveClassifier())),
        ])

        PA_count_embedding_300_15_pipeline_max = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_300_15, 'max')),
            ('clf', OneVsRestClassifier(PassiveAggressiveClassifier())),
        ])

        PA_tfidf_embedding_300_15_pipeline_max = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_300_15, 'max')),
            ('clf', OneVsRestClassifier(PassiveAggressiveClassifier())),
        ])

        #

        ET_tfidf_pipeline = Pipeline([
            ('tfidf', TfidfVectorizer()),
            ('clf', OneVsRestClassifier(ExtraTreesClassifier())),
        ])

        ET_count_pipeline = Pipeline([
            ('count', CountVectorizer()),
            ('clf', OneVsRestClassifier(ExtraTreesClassifier())),
        ])

        ET_count_embedding_300_15_pipeline = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_300_15, 'mean')),
            ('clf', OneVsRestClassifier(ExtraTreesClassifier())),
        ])

        ET_tfidf_embedding_300_15_pipeline = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_300_15, 'mean')),
            ('clf', OneVsRestClassifier(ExtraTreesClassifier())),
        ])

        ET_count_embedding_300_15_pipeline_sum = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_300_15, 'sum')),
            ('clf', OneVsRestClassifier(ExtraTreesClassifier())),
        ])

        ET_tfidf_embedding_300_15_pipeline_sum = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_300_15, 'sum')),
            ('clf', OneVsRestClassifier(ExtraTreesClassifier())),
        ])

        ET_count_embedding_300_15_pipeline_max = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_300_15, 'max')),
            ('clf', OneVsRestClassifier(ExtraTreesClassifier())),
        ])

        ET_tfidf_embedding_300_15_pipeline_max = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_300_15, 'max')),
            ('clf', OneVsRestClassifier(ExtraTreesClassifier())),
        ])

        ET_count_embedding_300_10_pipeline = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_300_10, 'mean')),
            ('clf', OneVsRestClassifier(ExtraTreesClassifier())),
        ])

        ET_tfidf_embedding_300_10_pipeline = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_300_10, 'mean')),
            ('clf', OneVsRestClassifier(ExtraTreesClassifier())),
        ])

        ET_count_embedding_300_10_pipeline_sum = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_300_10, 'sum')),
            ('clf', OneVsRestClassifier(ExtraTreesClassifier())),
        ])

        ET_tfidf_embedding_300_10_pipeline_sum = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_300_10, 'sum')),
            ('clf', OneVsRestClassifier(ExtraTreesClassifier())),
        ])

        ET_count_embedding_300_10_pipeline_max = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_300_10, 'max')),
            ('clf', OneVsRestClassifier(ExtraTreesClassifier())),
        ])

        ET_tfidf_embedding_300_10_pipeline_max = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_300_10, 'max')),
            ('clf', OneVsRestClassifier(ExtraTreesClassifier())),
        ])

        ET_count_embedding_100_10_pipeline = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_100_10, 'mean')),
            ('clf', OneVsRestClassifier(ExtraTreesClassifier())),
        ])

        ET_tfidf_embedding_100_10_pipeline = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_100_10, 'mean')),
            ('clf', OneVsRestClassifier(ExtraTreesClassifier())),
        ])

        ET_count_embedding_100_10_pipeline_sum = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_100_10, 'sum')),
            ('clf', OneVsRestClassifier(ExtraTreesClassifier())),
        ])

        ET_tfidf_embedding_100_10_pipeline_sum = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_100_10, 'sum')),
            ('clf', OneVsRestClassifier(ExtraTreesClassifier())),
        ])

        ET_count_embedding_100_10_pipeline_max = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_100_10, 'max')),
            ('clf', OneVsRestClassifier(ExtraTreesClassifier())),
        ])

        ET_tfidf_embedding_100_10_pipeline_max = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_100_10, 'max')),
            ('clf', OneVsRestClassifier(ExtraTreesClassifier())),
        ])

        ET_count_embedding_100_15_pipeline = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_100_15, 'mean')),
            ('clf', OneVsRestClassifier(ExtraTreesClassifier())),
        ])

        ET_tfidf_embedding_100_15_pipeline = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_100_15, 'mean')),
            ('clf', OneVsRestClassifier(ExtraTreesClassifier())),
        ])

        ET_count_embedding_100_15_pipeline_sum = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_100_15, 'sum')),
            ('clf', OneVsRestClassifier(ExtraTreesClassifier())),
        ])

        ET_tfidf_embedding_100_15_pipeline_sum = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_100_15, 'sum')),
            ('clf', OneVsRestClassifier(ExtraTreesClassifier())),
        ])

        ET_count_embedding_100_15_pipeline_max = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingCountVectorizer(
                 self.model_100_15, 'max')),
            ('clf', OneVsRestClassifier(ExtraTreesClassifier())),
        ])

        ET_tfidf_embedding_100_15_pipeline_max = Pipeline([
            ("Embedding",
             embeddingvectorizer.EmbeddingTfidfVectorizer(
                 self.model_100_15, 'max')),
            ('clf', OneVsRestClassifier(ExtraTreesClassifier())),
        ])

        all_models = [
            ("SGD tfidf", SGD_tfidf_pipeline),
            ("SGD count", SGD_count_pipeline),
            ("SGD count embedding 300 15",
             SGD_count_embedding_300_15_pipeline),
            ("SGD tfidf embedding 300 15",
             SGD_tfidf_embedding_300_15_pipeline),
            ("SGD count embedding sum 300 15",
             SGD_count_embedding_300_15_pipeline_sum),
            ("SGD tfidf embedding sum 300 15",
             SGD_tfidf_embedding_300_15_pipeline_sum),
            ("SGD count embedding max 300 15",
             SGD_count_embedding_300_15_pipeline_max),
            ("SGD tfidf embedding max 300 15",
             SGD_tfidf_embedding_300_15_pipeline_max),
            ("SGD count embedding 300 10",
             SGD_count_embedding_300_10_pipeline),
            ("SGD tfidf embedding 300 10",
             SGD_tfidf_embedding_300_10_pipeline),
            ("SGD count embedding sum 300 10",
             SGD_count_embedding_300_10_pipeline_sum),
            ("SGD tfidf embedding sum 300 10",
             SGD_tfidf_embedding_300_10_pipeline_sum),
            ("SGD count embedding max 300 10",
             SGD_count_embedding_300_10_pipeline_max),
            ("SGD tfidf embedding max 300 10",
             SGD_tfidf_embedding_300_10_pipeline_max),
            ("SGD count embedding 100 10",
             SGD_count_embedding_100_10_pipeline),
            ("SGD tfidf embedding 100 10",
             SGD_tfidf_embedding_100_10_pipeline),
            ("SGD count embedding sum 100 10",
             SGD_count_embedding_100_10_pipeline_sum),
            ("SGD tfidf embedding sum 100 10",
             SGD_tfidf_embedding_100_10_pipeline_sum),
            ("SGD count embedding max 100 10",
             SGD_count_embedding_100_10_pipeline_max),
            ("SGD tfidf embedding max 100 10",
             SGD_tfidf_embedding_100_10_pipeline_max),
            ("SGD count embedding 100 15",
             SGD_count_embedding_100_15_pipeline),
            ("SGD tfidf embedding 100 15",
             SGD_tfidf_embedding_100_15_pipeline),
            ("SGD count embedding sum 100 15",
             SGD_count_embedding_100_15_pipeline_sum),
            ("SGD tfidf embedding sum 100 15",
             SGD_tfidf_embedding_100_15_pipeline_sum),
            ("SGD count embedding max 100 15",
             SGD_count_embedding_100_15_pipeline_max),
            ("SGD tfidf embedding max 100 15",
             SGD_tfidf_embedding_100_15_pipeline_max),
            ("SVC tfidf", SVC_tfidf_pipeline),
            ("SVC count", SVC_count_pipeline),
            ("SVC count embedding 300 15",
             SVC_count_embedding_300_15_pipeline),
            ("SVC tfidf embedding 300 15",
             SVC_tfidf_embedding_300_15_pipeline),
            ("SVC count embedding sum 300 15",
             SVC_count_embedding_300_15_pipeline_sum),
            ("SVC tfidf embedding sum 300 15",
             SVC_tfidf_embedding_300_15_pipeline_sum),
            ("SVC count embedding max 300 15",
             SVC_count_embedding_300_15_pipeline_max),
            ("SVC tfidf embedding max 300 15",
             SVC_tfidf_embedding_300_15_pipeline_max),
            ("SVC count embedding 100 15",
             SVC_count_embedding_100_15_pipeline),
            ("SVC tfidf embedding 100 15",
             SVC_tfidf_embedding_100_15_pipeline),
            ("SVC count embedding sum 100 15",
             SVC_count_embedding_100_15_pipeline_sum),
            ("SVC tfidf embedding sum 100 15",
             SVC_tfidf_embedding_100_15_pipeline_sum),
            ("SVC count embedding max 100 15",
             SVC_count_embedding_100_15_pipeline_max),
            ("SVC tfidf embedding max 100 15",
             SVC_tfidf_embedding_100_15_pipeline_max),
            ("SVC count embedding 100 10",
             SVC_count_embedding_100_10_pipeline),
            ("SVC tfidf embedding 100 10",
             SVC_tfidf_embedding_100_10_pipeline),
            ("SVC count embedding sum 100 10",
             SVC_count_embedding_100_10_pipeline_sum),
            ("SVC tfidf embedding sum 100 10",
             SVC_tfidf_embedding_100_10_pipeline_sum),
            ("SVC count embedding max 100 10",
             SVC_count_embedding_100_10_pipeline_max),
            ("SVC tfidf embedding max 100 10",
             SVC_tfidf_embedding_100_10_pipeline_max),
            ("SVC count embedding 300 10",
             SVC_count_embedding_300_10_pipeline),
            ("SVC tfidf embedding 300 10",
             SVC_tfidf_embedding_300_10_pipeline),
            ("SVC count embedding sum 300 10",
             SVC_count_embedding_300_10_pipeline_sum),
            ("SVC tfidf embedding sum 300 10",
             SVC_tfidf_embedding_300_10_pipeline_sum),
            ("SVC count embedding max 300 10",
             SVC_count_embedding_300_10_pipeline_max),
            ("SVC tfidf embedding max 300 10",
             SVC_tfidf_embedding_300_10_pipeline_max),
            ("PA tfidf", PA_tfidf_pipeline), ("PA count", PA_count_pipeline),
            ("PA count embedding 300 15", PA_count_embedding_300_15_pipeline),
            ("PA tfidf embedding 300 15", PA_tfidf_embedding_300_15_pipeline),
            ("PA count embedding sum 300 15",
             PA_count_embedding_300_15_pipeline_sum),
            ("PA tfidf embedding sum 300 15",
             PA_tfidf_embedding_300_15_pipeline_sum),
            ("PA count embedding max 300 15",
             PA_count_embedding_300_15_pipeline_max),
            ("PA tfidf embedding max 300 15",
             PA_tfidf_embedding_300_15_pipeline_max),
            ("PA count embedding 100 15", PA_count_embedding_100_15_pipeline),
            ("PA tfidf embedding 100 15", PA_tfidf_embedding_100_15_pipeline),
            ("PA count embedding sum 100 15",
             PA_count_embedding_100_15_pipeline_sum),
            ("PA tfidf embedding sum 100 15",
             PA_tfidf_embedding_100_15_pipeline_sum),
            ("PA count embedding max 100 15",
             PA_count_embedding_100_15_pipeline_max),
            ("PA tfidf embedding max 100 15",
             PA_tfidf_embedding_100_15_pipeline_max),
            ("PA count embedding 100 10", PA_count_embedding_100_10_pipeline),
            ("PA tfidf embedding 100 10", PA_tfidf_embedding_100_10_pipeline),
            ("PA count embedding sum 100 10",
             PA_count_embedding_100_10_pipeline_sum),
            ("PA tfidf embedding sum 100 10",
             PA_tfidf_embedding_100_10_pipeline_sum),
            ("PA count embedding max 100 10",
             PA_count_embedding_100_10_pipeline_max),
            ("PA tfidf embedding max 100 10",
             PA_tfidf_embedding_100_10_pipeline_max),
            ("PA count embedding 300 10", PA_count_embedding_300_10_pipeline),
            ("PA tfidf embedding 300 10", PA_tfidf_embedding_300_10_pipeline),
            ("PA count embedding sum 300 10",
             PA_count_embedding_300_10_pipeline_sum),
            ("PA tfidf embedding sum 300 10",
             PA_tfidf_embedding_300_10_pipeline_sum),
            ("PA count embedding max 300 10",
             PA_count_embedding_300_10_pipeline_max),
            ("PA tfidf embedding max 300 10",
             PA_tfidf_embedding_300_10_pipeline_max),
            ("ET tfidf", ET_tfidf_pipeline), ("ET count", ET_count_pipeline),
            ("ET count embedding 100 15", ET_count_embedding_100_15_pipeline),
            ("ET tifdf embedding 100 15", ET_tfidf_embedding_100_15_pipeline),
            ("ET count embedding sum 100 15",
             ET_count_embedding_100_15_pipeline_sum),
            ("ET tifdf embedding sum 100 15",
             ET_tfidf_embedding_100_15_pipeline_sum),
            ("ET count embedding max 100 15",
             ET_count_embedding_100_15_pipeline_max),
            ("ET tifdf embedding max 100 15",
             ET_tfidf_embedding_100_15_pipeline_max),
            ("ET count embedding 100 10", ET_count_embedding_100_10_pipeline),
            ("ET tifdf embedding 100 10", ET_tfidf_embedding_100_10_pipeline),
            ("ET count embedding sum 100 10",
             ET_count_embedding_100_10_pipeline_sum),
            ("ET tifdf embedding sum 100 10",
             ET_tfidf_embedding_100_10_pipeline_sum),
            ("ET count embedding max 100 10",
             ET_count_embedding_100_10_pipeline_max),
            ("ET tifdf embedding max 100 10",
             ET_tfidf_embedding_100_10_pipeline_max),
            ("ET count embedding 300 10", ET_count_embedding_300_10_pipeline),
            ("ET tifdf embedding 300 10", ET_tfidf_embedding_300_10_pipeline),
            ("ET count embedding sum 300 10",
             ET_count_embedding_300_10_pipeline_sum),
            ("ET tifdf embedding sum 300 10",
             ET_tfidf_embedding_300_10_pipeline_sum),
            ("ET count embedding max 300 10",
             ET_count_embedding_300_10_pipeline_max),
            ("ET tifdf embedding max 300 10",
             ET_tfidf_embedding_300_10_pipeline_max),
            ("ET count embedding 300 15", ET_count_embedding_300_15_pipeline),
            ("ET tifdf embedding 300 15", ET_tfidf_embedding_300_15_pipeline),
            ("ET count embedding sum 300 15",
             ET_count_embedding_300_15_pipeline_sum),
            ("ET tifdf embedding sum 300 15",
             ET_tfidf_embedding_300_15_pipeline_sum),
            ("ET count embedding max 300 15",
             ET_count_embedding_300_15_pipeline_max),
            ("ET tifdf embedding max 300 15",
             ET_tfidf_embedding_300_15_pipeline_max)
        ]

        return all_models
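
# Hypothetical driver for the pipelines above (defined outside the original
# class; `runner`, `X`, and `y` are assumptions):
from sklearn.model_selection import cross_val_score

def evaluate_all(runner, X, y):
    """Cross-validate every (name, pipeline) pair from Define_pipelines."""
    for name, pipeline in runner.Define_pipelines():
        scores = cross_val_score(pipeline, X, y, cv=5)
        logging.info('%s: mean accuracy %.3f', name, scores.mean())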
Example #9
# (head of this snippet was truncated; a plausible reconstruction, assuming
# the surviving tail belonged to a LinearSVC pipeline named `SVC`, a variable
# that shadows sklearn's SVC class)
SVC = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', OneVsRestClassifier(LinearSVC(verbose=0,
                                          random_state=None,
                                          max_iter=1000),
                                n_jobs=1)),
])
for emotion in emotions:
    SVC.fit(X_train, train[emotion])
    prediction = SVC.predict(X_test)
    print('Test accuracy of ' + emotion + ':')
    print(accuracy_score(test[emotion], prediction))
# In[19]:

print("PAC")
pac = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', OneVsRestClassifier(PassiveAggressiveClassifier(random_state=0))),
])

for emotion in emotions:
    pac.fit(X_train, train[emotion])
    prediction = pac.predict(X_test)
    print('Test accuracy of ' + emotion + ':')
    print(accuracy_score(test[emotion], prediction))

# In[21]:

import pickle

pkl_filename = "pickle_model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(SVC, file)
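
# The PAC pipeline can be persisted the same way (the filename is an
# assumption):
with open("pac_model.pkl", 'wb') as file:
    pickle.dump(pac, file)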
Example #10
def evaluate_classifiers():
    train, test, labelsTrain, labelsTest = prepare_train_test()
    pipe = make_preprocessing_pipeline()
    pipe.fit(train, labelsTrain)

    def benchmark(clf):
        "Benchmarks an algorithm."
        print('_' * 10)
        print(clf)
        t0 = time()
        clf.fit(pipe.transform(train), labelsTrain)
        train_time = time() - t0
        print("train time: %0.3fs" % train_time)

        t0 = time()
        pred = clf.predict(pipe.transform(test))
        test_time = time() - t0
        print("test time:  %0.3fs" % test_time)

        score = accuracy_score(labelsTest, pred)
        print("accuracy:   %0.3f" % score)

        if hasattr(clf, 'coef_'):
            print("dimensionality: %d" % clf.coef_.shape[1])
            print("density: %f" % density(clf.coef_))
        clf_descr = str(clf).split('(')[0]
        return clf_descr, score, train_time, test_time

    results = []
    for clf, name in ((RidgeClassifier(tol=1e-2,
                                       solver="lsqr"), "Ridge Classifier"),
                      (Perceptron(max_iter=50),
                       "Perceptron"), (PassiveAggressiveClassifier(max_iter=50),
                                       "Passive-Aggressive"),
                      (KNeighborsClassifier(n_neighbors=10),
                       "kNN"), (RandomForestClassifier(n_estimators=100),
                                "Random forest")):
        print('=' * 10)
        print(name)
        results.append(benchmark(clf))

    for penalty in ["l2", "l1"]:
        print('=' * 10)
        print("%s penalty" % penalty.upper())
        # Train Liblinear model
        results.append(
            benchmark(
                LinearSVC(loss='squared_hinge', penalty=penalty, dual=False,
                          tol=1e-3)))

        # Train SGD model
        results.append(
            benchmark(SGDClassifier(alpha=.0001, max_iter=50, penalty=penalty)))

    # Train SGD with Elastic Net penalty
    print('=' * 10)
    print("Elastic-Net penalty")
    results.append(
        benchmark(SGDClassifier(alpha=.0001, max_iter=50, penalty="elasticnet")))

    # Train NearestCentroid without threshold
    print('=' * 10)
    print("NearestCentroid (aka Rocchio classifier)")
    results.append(benchmark(NearestCentroid()))

    # Train sparse Naive Bayes classifiers
    print('=' * 10)
    print("Naive Bayes")
    results.append(benchmark(MultinomialNB(alpha=.01)))
    results.append(benchmark(BernoulliNB(alpha=.01)))

    print('=' * 10)
    print("LinearSVC with L1-based feature selection")
    # The smaller C, the stronger the regularization.
    # The more regularization, the more sparsity.
    results.append(
        benchmark(
            Pipeline([('feature_selection',
                       LinearSVC(penalty="l1", dual=False, tol=1e-3)),
                      ('classification', LinearSVC())])))
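
    # A typical follow-up, sketched after scikit-learn's benchmark script
    # (not part of the original function): unpack the result tuples and rank.
    clf_names, scores, train_times, test_times = (list(t)
                                                  for t in zip(*results))
    for name, s in sorted(zip(clf_names, scores), key=lambda p: -p[1]):
        print("%-30s accuracy: %.3f" % (name, s))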
Example #11

def classify2(X, Y, classifier, X_test, Y_test):
    name = classifier[0]
    clf = classifier[1]
    print("training %s" % name)
    clf.fit(X, Y)
    y_pred = clf.predict(X_test)
    accuracy = np.mean(y_pred == Y_test) * 100
    print(accuracy)


# define different classifiers
classifiers = [("KNneighbors", KNeighborsClassifier(n_neighbors=3)),
               ("SVM", svm.SVC()),
               ("SAG", LogisticRegression(solver='sag', tol=1e-1)),
               ("SGD", SGDClassifier()), ("ASGD", SGDClassifier(average=True)),
               ("Perceptron", Perceptron()),
               ("Passive-Aggressive I",
                PassiveAggressiveClassifier(loss='hinge', C=1.0)),
               ("Passive-Aggressive II",
                PassiveAggressiveClassifier(loss='squared_hinge', C=1.0))]
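
# Hypothetical driver (not in the original): split a small subset once, then
# run every (name, clf) pair in `classifiers` through classify2.
from sklearn.model_selection import train_test_split
X_tr, X_te, Y_tr, Y_te = train_test_split(data[::50], labels_tr[::50],
                                           test_size=0.2)
for pair in classifiers:
    classify2(X_tr, Y_tr, pair, X_te, Y_te)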

# Running the k-NN algorithm with an 80:20 train:test split
# (`classify`, a split-based variant of classify2, is assumed defined earlier):
X = data[::50]  # <-- small subset of the data
Y = labels_tr[::50]
t1 = time()
classify(X, Y, ("KNeighbors", KNeighborsClassifier(n_neighbors=7)), 0.2)
t2 = time()
print(t2 - t1, 's')
Example #12
# Iterator over parsed Reuters SGML files.
data_stream = stream_reuters_documents()

# We learn a binary classification between the "acq" class and all the others.
# "acq" was chosen as it is more or less evenly distributed in the Reuters
# files. For other datasets, one should take care of creating a test set with
# a realistic portion of positive instances.
all_classes = np.array([0, 1])
positive_class = 'acq'

# Here are some classifiers that support the `partial_fit` method
partial_fit_classifiers = {
    'SGD': SGDClassifier(max_iter=5, tol=1e-3),
    'Perceptron': Perceptron(tol=1e-3),
    'NB Multinomial': MultinomialNB(alpha=0.01),
    'Passive-Aggressive': PassiveAggressiveClassifier(tol=1e-3),
}


def get_minibatch(doc_iter, size, pos_class=positive_class):
    """Extract a minibatch of examples, return a tuple X_text, y.

    Note: size is before excluding invalid docs with no topics assigned.

    """
    data = [('{title}\n\n{body}'.format(**doc), pos_class in doc['topics'])
            for doc in itertools.islice(doc_iter, size) if doc['topics']]
    if not len(data):
        return np.asarray([], dtype=int), np.asarray([], dtype=int)
    X_text, y = zip(*data)
    return X_text, np.asarray(y, dtype=int)
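
# A sketch of the out-of-core loop these pieces feed (modeled on scikit-learn's
# out-of-core classification example; the HashingVectorizer settings and the
# minibatch size are assumptions):
from sklearn.feature_extraction.text import HashingVectorizer

vectorizer = HashingVectorizer(decode_error='ignore', n_features=2 ** 18,
                               alternate_sign=False)
minibatch_size = 1000
X_text, y = get_minibatch(data_stream, minibatch_size)
while len(X_text):
    X = vectorizer.transform(X_text)
    for name, cls in partial_fit_classifiers.items():
        cls.partial_fit(X, y, classes=all_classes)
    X_text, y = get_minibatch(data_stream, minibatch_size)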
# (a separate snippet is fused in here; its head defining `x`, presumably rows
# of [height, weight, shoe_size] matching the prediction input below, was
# truncated)
y = [
    'male', 'female', 'female', 'female', 'male', 'male', 'male', 'female',
    'male', 'female', 'male'
]

treeCLF = tree.DecisionTreeClassifier()

treeCLF = treeCLF.fit(x, y)

treePredict = treeCLF.predict([[190, 70, 43]])

print('Decision Tree Classifier')
print(treePredict)
print('-------------------------')

pac = PassiveAggressiveClassifier(random_state=0)
pac.fit(x, y)

pacPredict = pac.predict([[190, 70, 43]])

print('Passive Aggressive Classifier')
print(pacPredict)
print('-------------------------')

nc = KNeighborsClassifier()
nc.fit(x, y)

ncPredict = nc.predict([[190, 70, 43]])

print('Neighbor Classifier')
print(ncPredict)
Example #14
def benchmark(clf):
    t0 = time()
    clf.fit(x_train, categorie_train)
    train_time = time() - t0
    print("  Train time: %0.3fs" % train_time)
    t0 = time()
    pred = clf.predict(x_test)
    test_time = time() - t0
    print("  Test time: %0.3fs" % test_time)
    score = metrics.accuracy_score(categorie_test, pred)
    print("  Accuratezza: %0.3f" % score)


#SGDClassifier
print(" SGDClassifier test:\n")
SDGC = SGDClassifier(penalty='l2', n_jobs=-1)
benchmark(SDGC)
print("\n\n")
#ComplementNB
print(" ComplementNB test:\n")
CNB = ComplementNB()
benchmark(CNB)
print("\n\n")
#PassiveAggressiveClassifier
print(" PassiveAggressiveClassifier test:\n")
PAC = PassiveAggressiveClassifier(max_iter=50, n_jobs=-1)
benchmark(PAC)

# End of script
print("\n")
print("\n---------- EXECUTION FINISHED! ----------\n\n")
Example #16
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Binarizer

from sklearn.linear_model import Perceptron, SGDClassifier, PassiveAggressiveClassifier

Data_reader = csv_reader.CsvReader('../DataSet/TalkingData')
Data_writer = csv_reader.CsvReader('../output')

clfdir = {
    'MNB': MultinomialNB(),
    'BNB': BernoulliNB(),
    'PT': Perceptron(),
    'SG': SGDClassifier(),
    'PAC': PassiveAggressiveClassifier()
}

# clfdir={'MNB':MultinomialNB(),
#         'BNB':BernoulliNB()}
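
# Hypothetical smoke test (not in the original snippet): every estimator in
# clfdir supports partial_fit, so one feature chunk can be streamed through
# all of them. The chunk here is synthetic stand-in data.
import numpy as np
X_chunk = np.random.rand(32, 8)        # non-negative features (MultinomialNB-safe)
y_chunk = np.random.randint(0, 2, 32)  # stand-in binary labels
for name, clf in clfdir.items():
    clf.partial_fit(X_chunk, y_chunk, classes=np.array([0, 1]))
    print(name, 'trained on one chunk')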


def get_next_data(train_data_chunk, persent=1, number=20000):
    print('get_next_data ')
    res_data = pd.DataFrame()
    test_data = pd.DataFrame()
    for data_chunk in train_data_chunk:
        if test_data.empty:
            test_data = data_chunk
        else:
            if random.randint(0, 10) > 2:
Example #17
def run_exp(outpath,
            random_states,
            datasets,
            model_list,
            alphas=None,
            proportion=None):
    run_id = time.time()
    directory = "%s/%i" % (outpath, int(run_id))
    stat_interval = 100  # how often (in batches) to log predictive likelihood

    if not os.path.exists(directory):
        os.makedirs(directory)

    for random_state in random_states:
        directory1 = "%s/%i" % (directory, random_state)
        if not os.path.exists(directory1):
            os.makedirs(directory1)

        for dataset in datasets:
            directory2 = "%s/%s" % (directory1, dataset)
            if not os.path.exists(directory2):
                os.makedirs(directory2)

            if dataset == "20news":
                data = load_20news(100000, random_state)
            elif dataset == "mnist":
                data = load_mnist_train_test(random_state)
            elif dataset == "product_type":
                data = load_product_type_dataset(random_state)
            else:
                raise Exception(
                    "Dataset %s not found. Download the data from Google Drive "
                    "and set the correct path to it." % dataset)

            train_data, train_target, test_data, test_target = data
            if proportion is not None:
                end = int(train_data.shape[0] * proportion)
                if end <= 0:
                    raise Exception("Proportion is invalid")
                train_data, train_target = train_data[:end], train_target[:end]

            n, d = train_data.shape
            c = np.unique(train_target).shape[0]

            scale = n / batch_size  # `batch_size` is assumed defined at module level
            # scale = 10e8

            if dataset == "20news":
                lr = 0.05
            else:
                lr = 0.01

            n_iter = 1

            print(
                "Dataset summary: number of samples %i, number of features %i,"
                " number of classes %i" % (n, d, c))

            # create models
            models = {}
            bmodels = []
            for model in model_list:
                if model == "AROW":
                    if dataset == "20news":
                        m_model = DummyModel()
                        models[model] = m_model
                    else:
                        if c > 2:
                            # m_model = sol.SOL('arow', c)
                            m_model = M_AROW(
                                c, d)  # TODO: verify the AROW implementation
                        else:
                            m_model = AROW(d)
                        models[model] = m_model
                elif model == "SGD":
                    m_model = SGDClassifier(max_iter=1, tol=1)
                    models[model] = m_model

                elif model == "PA":
                    m_model = PassiveAggressiveClassifier(max_iter=1, tol=1)
                    models[model] = m_model

                elif model == "BB-SVB":
                    m_model = BayesianClassifier("BB-SVB",
                                                 n_samples,
                                                 d,
                                                 c,
                                                 learning_rate=lr)
                    models[model] = m_model
                    bmodels.append("BB-SVB")

                elif model == "SSVB":
                    m_model = BayesianClassifier("SSVB",
                                                 n_samples,
                                                 d,
                                                 c,
                                                 learning_rate=lr)
                    bmodels.append("SSVB")
                    models[model] = m_model

                elif model == "SVI":
                    m_model = BayesianClassifier("SVI",
                                                 n_samples,
                                                 d,
                                                 c,
                                                 learning_rate=lr,
                                                 scale=scale)
                    bmodels.append("SVI")
                    models[model] = m_model

                elif model == "PVI":
                    if alphas is None or len(alphas) < 1:
                        raise Exception("%s require valid values for alpha" %
                                        model)
                    for alpha in alphas:
                        model = "PVI(%d)" % int(np.log10(alpha))
                        m_model = BayesianClassifier(model,
                                                     n_samples,
                                                     d,
                                                     c,
                                                     learning_rate=lr,
                                                     scale=alpha)
                        bmodels.append(model)
                        models[model] = m_model

                else:
                    raise Exception("Unknown model : %s" % model)

            loglik_arr = {}
            loss_arr = {}
            kl_arr = {}
            mean_arr = {}
            std_arr = {}
            for model in bmodels:
                loss_arr[model] = []
                kl_arr[model] = []
                mean_arr[model] = []
                std_arr[model] = []
                loglik_arr[model] = []

            error_counts = {}
            error_arr = {}

            out_stats = {
                "error": error_arr,
                "loss": loss_arr,
                "kl": kl_arr,
                "mean": mean_arr,
                "std": std_arr
            }

            for model in models:
                error_counts[model] = 0
                error_arr[model] = []

            for i in range(0, n, batch_size):

                X, y = get_minibatch(train_data, train_target, i, batch_size)

                for model in models:

                    if i == 0 and model in ["PA", "SGD"]:
                        # PA/SGD cannot predict before their first partial_fit
                        pred = [0]
                    else:
                        pred = models[model].predict(X)

                    # error logs
                    if pred[0] != y[0]:
                        error_counts[model] += 1
                    error_arr[model].append(error_counts[model])

                    if model in loss_arr:
                        loss, kl = models[model].fit(X, y, i, iter=n_iter)
                        if i % stat_interval == 0:
                            log_p = models[model].log_predictive_likelihood(
                                test_data, test_target)
                            loglik_arr[model].append(log_p[0])

                        _mean, _std = models[model].get_vars()

                        mean_arr[model].append(str(_mean.tolist()[:10]))
                        std_arr[model].append(str(_std.tolist()[:10]))

                        loss_arr[model].append(loss)
                        kl_arr[model].append(kl)

                    elif model in ["PA", "SGD"]:
                        models[model].partial_fit(X, y, range(0, c))
                    else:
                        models[model].fit(X, y)

                losses_str = "%d" % i
                for model in loss_arr:
                    losses_str += " %s : (%f, %f)" % (
                        model, loss_arr[model][-1], kl_arr[model][-1])
                print(losses_str)

                error_str = "Error Counts ------->"
                for model in models:
                    error_str += " %s:%d," % (model, error_counts[model])
                print(error_str)

                print(
                    "========================================================")

                if i % stat_rate == 0 or i == n - 1:
                    print("writing stats to directory : %s" % directory2)
                    for stat in out_stats:
                        path = "%s/%s.csv" % (directory2, stat)
                        stats = out_stats[stat]
                        df = pd.DataFrame(stats)
                        if not os.path.isfile(path):
                            df.to_csv(path)
                        else:
                            df.to_csv(path, mode='a', header=False)

                        for model in stats:
                            stats[model] = []

            error_str = "Final error rate ========>>"
            for model in models:
                error_str += " %s:%f," % (model,
                                          error_counts[model] / float(n))
            print(error_str)

            # final F1-score computation (binary F1 for 2 classes, micro-averaged otherwise)
            final_acc_arr = {}
            for model in models:
                pred = models[model].predict(test_data)

                if c <= 2:
                    acc = f1_score(test_target, np.array(pred).flatten())
                else:
                    acc = f1_score(test_target,
                                   np.array(pred).flatten(),
                                   average="micro")

                final_acc_arr[model] = [acc]

                print("Accuracy of %s : %f" % (model, acc))

            final_stats = {
                "accuracy": final_acc_arr,
                "likelihood": loglik_arr,
            }

            for stat in final_stats:
                path = "%s/%s.csv" % (directory2, stat)
                pd.DataFrame(final_stats[stat]).to_csv(path)
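# For reference, a self-contained sketch of the predict-then-train
# ("prequential") evaluation pattern the loop above implements, reduced to a
# single PassiveAggressiveClassifier on synthetic data; the dataset, batch
# size, and hyperparameters here are illustrative assumptions, not this
# script's actual configuration.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import PassiveAggressiveClassifier

X_all, y_all = make_classification(n_samples=1000, n_features=20, random_state=0)
classes = np.unique(y_all)
clf = PassiveAggressiveClassifier(max_iter=1)
errors = 0
for i in range(len(X_all)):
    X_i, y_i = X_all[i:i + 1], y_all[i:i + 1]
    if i > 0:  # before the first update the model has nothing to predict with
        errors += int(clf.predict(X_i)[0] != y_i[0])
    clf.partial_fit(X_i, y_i, classes=classes)  # classes required on first call
print("online error rate: %.4f" % (errors / float(len(X_all) - 1)))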
Ejemplo n.º 18
0
    },
    'cnb': {
        'cnb__alpha': [1, 0.1, 0.01, 0.001]
    },
    'ridge': {},
    'perceptron': {},
    'pa': {},
    'rf': {},
    'nc': {}
}
models = {
    'mnb': MultinomialNB(),
    'ridge': RidgeClassifier(tol=1e-2, solver="sag"),
    'perceptron': Perceptron(max_iter=50),
    'pa': PassiveAggressiveClassifier(max_iter=50),
    'knn': KNeighborsClassifier(n_neighbors=10),
    'rf': RandomForestClassifier(),
    'lsvc': LinearSVC(dual=False, tol=1e-3),
    'sgd': SGDClassifier(alpha=.0001, max_iter=50),
    'nc': NearestCentroid(),
    'cnb': ComplementNB()
}

for key in models:
    print(key)
    params = {}
    steps = {}
    steps[key] = Pipeline(pre_steps + [(key, models[key])])
    # fall back to an empty grid for models without an entry (e.g. 'knn', 'lsvc', 'sgd')
    params[key] = [{**vect_params, **models_param_grid.get(key, {})}
                   for vect_params in vect_param]
    estimator = EstimatorSelectionHelper(steps, params)
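# The fit/report step is cut off above, and EstimatorSelectionHelper's API is
# not shown in this excerpt; as an illustrative stand-in, the same
# pipeline-plus-grid search can be run through plain GridSearchCV (the toy
# data and grid below are assumptions, not the script's actual inputs).
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import ComplementNB
from sklearn.pipeline import Pipeline

toy_pipe = Pipeline([('tfidf', TfidfVectorizer()), ('cnb', ComplementNB())])
toy_grid = {'cnb__alpha': [1, 0.1, 0.01, 0.001]}
search = GridSearchCV(toy_pipe, toy_grid, cv=2)
search.fit(["good movie", "bad movie", "great film", "awful film"],
           [1, 0, 1, 0])
print(search.best_params_, search.best_score_)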
save_classifier = open("c:/users/user/desktop/project/logistics_classifier.pickle","wb")
pickle.dump(logistics_classifier,save_classifier)
save_classifier.close()

#SGDClassifier
SGDClassifier_classifier=SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print('Original SGDClassifier algo accuracy percent:',(nltk.classify.accuracy(SGDClassifier_classifier,testing_set))*100)

with open("c:/users/user/desktop/project/SGDClassifier_classifier.pickle", "wb") as save_classifier:
    pickle.dump(SGDClassifier_classifier, save_classifier)

#PassiveAggressiveClassifier
PassiveAggressiveClassifier_classifier=SklearnClassifier(PassiveAggressiveClassifier())
PassiveAggressiveClassifier_classifier.train(training_set)
print('Original Passive Aggressive Classifier algo accuracy percent:',(nltk.classify.accuracy(PassiveAggressiveClassifier_classifier,testing_set))*100)

with open("c:/users/user/desktop/project/PassiveAggressiveClassifier_classifier.pickle", "wb") as save_classifier:
    pickle.dump(PassiveAggressiveClassifier_classifier, save_classifier)

#LinearSVC_classifier
LinearSVC_classifier=SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print('Original Linear SVC_classifier algo accuracy percent:',(nltk.classify.accuracy(LinearSVC_classifier,testing_set))*100)

with open("c:/users/user/desktop/project/LinearSVC_classifier.pickle", "wb") as save_classifier:
    pickle.dump(LinearSVC_classifier, save_classifier)
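# Loading a pickled classifier back is the mirror image of the saves above;
# a minimal sketch using the LinearSVC path from the previous block:
import pickle

with open("c:/users/user/desktop/project/LinearSVC_classifier.pickle", "rb") as f:
    loaded_classifier = pickle.load(f)
# the restored SklearnClassifier can then label new feature dicts, e.g.:
# loaded_classifier.classify(features)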
        SVC(kernel="linear", C=100, class_weight='balanced'),
        SVC(kernel="linear", C=500, class_weight='balanced'),
        SVC(kernel="linear", C=1000, class_weight='balanced'),
        SVC(kernel="linear", C=0.25, class_weight='balanced'),
        SVC(gamma=2, C=0.1, class_weight='balanced'),
        SVC(gamma=2, C=0.25, class_weight='balanced'),
        SVC(C=0.1, class_weight='balanced'),
        SVC(C=5, class_weight='balanced'),
        SVC(C=10, class_weight='balanced'),
        SVC(C=50, class_weight='balanced'),
        SVC(C=100, class_weight='balanced'),
        SVC(C=500, class_weight='balanced'),
        SVC(C=1000, class_weight='balanced'),
        SVC(class_weight='balanced')
    ], [GaussianNB()], [RandomForestClassifier()],
           [PassiveAggressiveClassifier()], [AdaBoostClassifier()],
           [GradientBoostingClassifier()]]

    score = {}
    res = {}
    kfold = KFold(n_splits=10)

    for i in range(len(clf)):
        scores = []
        for j in clf[i]:
            fold_score = []
            for train_index, test_index in kfold.split(x_pol, y_pol):
                x_train = x_pol[train_index]
                x_test = x_pol[test_index]
                y_train = y_pol[train_index]
                y_test = y_pol[test_index]
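# The fold body above is cut off; this self-contained sketch shows how such a
# K-fold loop is usually completed -- fit on the train fold, score on the test
# fold, average across folds (toy data; the real x_pol/y_pol arrays and the
# classifier grid are not reproduced here):
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.svm import SVC

X_toy, y_toy = make_classification(n_samples=100, random_state=0)
fold_score = []
for train_index, test_index in KFold(n_splits=10).split(X_toy):
    model = SVC(class_weight='balanced')
    model.fit(X_toy[train_index], y_toy[train_index])
    fold_score.append(model.score(X_toy[test_index], y_toy[test_index]))
print("mean CV accuracy: %.4f" % np.mean(fold_score))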
Ejemplo n.º 21
0
# Learn vocabulary from train set
countVec.fit(trainText)

# Transform list of review to matrix of bag-of-word vectors
trainX = countVec.transform(trainText)
devX = countVec.transform(devText)
testX = countVec.transform(testText)

print("Shape of Train X {}\n".format(trainX.shape))
print("Sample of the vocab:\n {}".format(
    np.random.choice(countVec.get_feature_names(), 20)))

#%% PICK A MODEL AND EXPERIMENT
lr = LogisticRegression()
passAgg = PassiveAggressiveClassifier()
perceptron = Perceptron()

lr.fit(trainX, trainY)
print("Logistic Regression Train:", lr.score(trainX, trainY))
print("Logistic Regression Dev:", lr.score(devX, devY))
print("--")

passAgg.fit(trainX, trainY)
print("Passive Aggressive Train:", passAgg.score(trainX, trainY))
print("Passive Aggressive Dev:", passAgg.score(devX, devY))
print("--")

perceptron.fit(trainX, trainY)
print("Perceptron Train:", perceptron.score(trainX, trainY))
print("Perceptron Dev:", perceptron.score(devX, devY))
Ejemplo n.º 22
0
def train_and_test_classifiers(train_set, test_set):
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print("Classic Naive Bayes Classifier accuracy percent:",
          (nltk.classify.accuracy(classifier, test_set)) * 100)
    # classifier.show_most_informative_features(15)

    MNB_classifier = SklearnClassifier(
        MultinomialNB(alpha=0.01, fit_prior=False))
    MNB_classifier.train(train_set)
    print("Multinomial Naive Bayes Classifier accuracy percent:",
          (nltk.classify.accuracy(MNB_classifier, test_set)) * 100)

    print("Skipping Gaussian Bayes Classifier accuracy percent")
    # GNB_classifier = SklearnClassifier(GaussianNB())
    # GNB_classifier.fit(features_train, target_train)
    # target_pred = clf.predict(features_test)
    # GNB_classifier.train(train_set)
    # print("Gaussian Naive Bayes Classifier accuracy percent:", (nltk.classify.accuracy(GNB_classifier, test_set))*100)

    BNB_classifier = SklearnClassifier(BernoulliNB(alpha=.01))
    BNB_classifier.train(train_set)
    print("Bernoulli Naive Bayes Classifier accuracy percent:",
          (nltk.classify.accuracy(BNB_classifier, test_set)) * 100)

    LG_classifier = SklearnClassifier(LogisticRegression(random_state=42))
    LG_classifier.train(train_set)
    print("Logistic Regression Classifier accuracy percent:",
          (nltk.classify.accuracy(LG_classifier, test_set)) * 100)

    # Train SGD with hinge penalty
    SGD_classifier1 = SklearnClassifier(
        SGDClassifier(loss='hinge',
                      penalty='l2',
                      alpha=1e-3,
                      random_state=42,
                      max_iter=1000,
                      tol=None))
    # SGD_classifier = SklearnClassifier(SGDClassifier(alpha=0.0005, max_iter=1000))
    SGD_classifier1.train(train_set)
    print("Stochastic Gradient Descent Classifier 1 accuracy percent:",
          (nltk.classify.accuracy(SGD_classifier1, test_set)) * 100)

    # Train SGD with Elastic Net penalty
    SGD_classifier2 = SklearnClassifier(
        SGDClassifier(alpha=1e-3,
                      random_state=42,
                      penalty="elasticnet",
                      max_iter=1000,
                      tol=None))
    SGD_classifier2.train(train_set)
    print("Stochastic Gradient Descent Classifier 2 accuracy percent:",
          (nltk.classify.accuracy(SGD_classifier2, test_set)) * 100)

    # print("Skipping C-Support Vector Classifier")
    # print("Skipping Linear-Support Vector Classifier")
    SVC_classifier = SklearnClassifier(SVC(), sparse=False)
    SVC_classifier.train(train_set)
    print("C-Support Vector Classifier accuracy percent:",
          (nltk.classify.accuracy(SVC_classifier, test_set)) * 100)
    LinearSVC_classifier1 = SklearnClassifier(
        SVC(kernel='linear', probability=True, tol=1e-3))
    LinearSVC_classifier1.train(train_set)
    print("Linear Support Vector Classifier 1 accuracy percent:",
          (nltk.classify.accuracy(LinearSVC_classifier1, test_set)) * 100)
    LinearSVC_classifier2 = SklearnClassifier(
        LinearSVC(penalty="l1", dual=False, tol=1e-3))
    LinearSVC_classifier2.train(train_set)
    print("Linear Support Vector Classifier 2 accuracy percent:",
          (nltk.classify.accuracy(LinearSVC_classifier2, test_set)) * 100)
    LinearSVC_classifier3 = SklearnClassifier(
        LinearSVC(penalty="l2", dual=False, tol=1e-3))
    LinearSVC_classifier3.train(train_set)
    print("Linear Support Vector Classifier 3 accuracy percent:",
          (nltk.classify.accuracy(LinearSVC_classifier3, test_set)) * 100)

    NuSVC_classifier = SklearnClassifier(NuSVC())
    NuSVC_classifier.train(train_set)
    print("Nu-Support Vector Classifier accuracy percent:",
          (nltk.classify.accuracy(NuSVC_classifier, test_set)) * 100)

    # new code

    # Train NearestCentroid (aka Rocchio classifier) without threshold
    Nearest_Centroid_classifier = SklearnClassifier(NearestCentroid())
    Nearest_Centroid_classifier.train(train_set)
    print("Nearest Centroid Classifier accuracy percent:",
          (nltk.classify.accuracy(Nearest_Centroid_classifier, test_set)) *
          100)

    Ridge_classifier = SklearnClassifier(
        RidgeClassifier(alpha=0.5, tol=1e-2, solver="sag"))
    Ridge_classifier.train(train_set)
    print("Ridge Classifier accuracy percent:",
          (nltk.classify.accuracy(Ridge_classifier, test_set)) * 100)

    Perceptron_classifier = SklearnClassifier(Perceptron(max_iter=1000))
    Perceptron_classifier.train(train_set)
    print("Perceptron Classifier accuracy percent:",
          (nltk.classify.accuracy(Perceptron_classifier, test_set)) * 100)

    Passive_Aggressive_classifier = SklearnClassifier(
        PassiveAggressiveClassifier(max_iter=1000))
    Passive_Aggressive_classifier.train(train_set)
    print("Passive-Aggressive Classifier accuracy percent:",
          (nltk.classify.accuracy(Passive_Aggressive_classifier, test_set)) *
          100)

    kNN_classifier = SklearnClassifier(KNeighborsClassifier(n_neighbors=10))
    kNN_classifier.train(train_set)
    print("kNN Classifier accuracy percent:",
          (nltk.classify.accuracy(kNN_classifier, test_set)) * 100)

    voted_classifier = VoteClassifier(classifier, MNB_classifier,
                                      BNB_classifier, LG_classifier,
                                      SGD_classifier2, LinearSVC_classifier2,
                                      NuSVC_classifier)
    print("Voted Classifier Classifier accuracy percent:",
          (nltk.classify.accuracy(voted_classifier, test_set)) * 100)
    print("Classification: ", voted_classifier.classify(test_set[0][0]),
          "Confidence: %",
          voted_classifier.confidence(test_set[0][0]) * 100)
    print("Classification: ", voted_classifier.classify(test_set[2][0]),
          "Confidence: %",
          voted_classifier.confidence(test_set[2][0]) * 100)
    print("Classification: ", voted_classifier.classify(test_set[3][0]),
          "Confidence: %",
          voted_classifier.confidence(test_set[3][0]) * 100)
    print("Classification: ", voted_classifier.classify(test_set[4][0]),
          "Confidence: %",
          voted_classifier.confidence(test_set[4][0]) * 100)
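# VoteClassifier is defined elsewhere in this project; a plausible minimal
# sketch consistent with the classify()/confidence() calls above is a
# majority-vote wrapper over NLTK-style classifiers (an assumption, not the
# project's actual implementation):
from collections import Counter

from nltk.classify import ClassifierI


class VoteClassifierSketch(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = [c.classify(features) for c in self._classifiers]
        return Counter(votes).most_common(1)[0][0]

    def confidence(self, features):
        votes = [c.classify(features) for c in self._classifiers]
        top_label, top_count = Counter(votes).most_common(1)[0]
        return top_count / len(votes)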
Ejemplo n.º 23
0
def test_partial_fit_weight_class_balanced():
    # partial_fit with class_weight='balanced' not supported
    clf = PassiveAggressiveClassifier(class_weight="balanced", max_iter=100)
    with pytest.raises(ValueError):
        clf.partial_fit(X, y, classes=np.unique(y))
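# The test pins down a real limitation: class_weight='balanced' needs the full
# class distribution up front, which streaming updates cannot provide. A
# hedged sketch of the two supported alternatives (X and y as in this module):
clf_full = PassiveAggressiveClassifier(class_weight="balanced", max_iter=100)
clf_full.fit(X, y)  # full-batch fit sees all class counts, so this works

clf_online = PassiveAggressiveClassifier(max_iter=100)  # drop class_weight
clf_online.partial_fit(X, y, classes=np.unique(y))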
Ejemplo n.º 24
0
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

from baseline_estimators import BaselineClassifier, BaselineRegressor


def getts():
    return dt.datetime.today().strftime('%Y-%m-%d %H:%M:%S')


classifiers = dict(
    lr=LogisticRegression(),
    lrcv=LogisticRegressionCV(),
    sgd=SGDClassifier(),
    perceptron=Perceptron(),
    pac=PassiveAggressiveClassifier(),
    lsvc=LinearSVC(C=5),
    dt=DecisionTreeClassifier(),
    rf=RandomForestClassifier(max_depth=50,
                              n_estimators=100,
                              min_samples_split=20),
    gbm=GradientBoostingClassifier(max_depth=20,
                                   n_estimators=50,
                                   min_samples_split=20),
    gs_lr=GridSearchCV(estimator=LogisticRegression(solver='liblinear'),
                       # liblinear supports both the 'l1' and 'l2' penalties in the grid
                       param_grid=dict(C=[1, 3, 5, 10], penalty=['l1', 'l2']),
                       cv=3),
    gs_dt=GridSearchCV(estimator=DecisionTreeClassifier(),
                       param_grid=dict(max_depth=[5, 7, 10],
                                       min_samples_split=[10, 20, 30]),
                       cv=3),
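# The registry above is cut off, but a dict like this is presumably consumed
# by looping over it and benchmarking each entry; a hedged sketch with toy
# data, reusing getts() from above for timestamped log lines:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X_toy, y_toy = make_classification(n_samples=200, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X_toy, y_toy, random_state=0)
for name, clf in classifiers.items():
    clf.fit(X_tr, y_tr)
    print("%s  %-12s %.4f" % (getts(), name, clf.score(X_te, y_te)))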
Ejemplo n.º 25
0
# Split the dataset
x_train,x_test,y_train,y_test=train_test_split(df['text'], labels, test_size=0.2, random_state=7)


# Initialize a TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)

# Fit and transform train set, transform test set
tfidf_train = tfidf_vectorizer.fit_transform(x_train) 
tfidf_test = tfidf_vectorizer.transform(x_test)

# print(tfidf_test)

# Initialize a PassiveAggressiveClassifier
pac=PassiveAggressiveClassifier(max_iter=50)
pac.fit(tfidf_train,y_train)

# Predict on the test set and calculate accuracy
y_pred=pac.predict(tfidf_test)
score=accuracy_score(y_test,y_pred)
# print(f'Accuracy: {round(score*100,2)}%')

title = "Darren's Amazing Super Powers"
text = "I have 300 IQ, I am 50 feet tall and can shoot lasers out of my eyes. I am the greatest of them all and everyone refer to me as Darren The Great."

# print(pac.predict(tfidf_vectorizer.transform([text])))  # raw text must be vectorized first

with open('input.txt', "r") as f:
    input_data = f.read()
vectorized_input_data = tfidf_vectorizer.transform([input_data])
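# The snippet stops after vectorizing input.txt; the obvious final step,
# mirroring the commented-out predict above (an assumption about intent), is:
prediction = pac.predict(vectorized_input_data)
print(prediction[0])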
Ejemplo n.º 26
0
# Predict and calculate accuracy
clf_pred = clf.predict(tfidf_test)
score = accuracy_score(y_test, clf_pred)
print(f'Accuracy: {round(score*100,2)}%')

# ## PASSIVE AGGRESSIVE CLASSIFIER

# In[269]:

from sklearn.linear_model import PassiveAggressiveClassifier

# In[302]:

# Initialize the PassiveAggressiveClassifier and fit training sets
pa_classifier = PassiveAggressiveClassifier(max_iter=1000,
                                            early_stopping=True,
                                            random_state=42)
pa_classifier.fit(tfidf_train, y_train)

# In[303]:

# Predict and calculate accuracy
y_pred = pa_classifier.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(score*100,2)}%')

# In[272]:

# Build confusion matrix
confusion_matrix(y_test, y_pred, labels=[0, 1])
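# The 2x2 matrix above unpacks into the four raw counts (taking label 1 as the
# positive class), from which precision and recall follow directly:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
precision = tp / (tp + fp)
recall = tp / (tp + fn)
print(f'Precision: {round(precision * 100, 2)}%  Recall: {round(recall * 100, 2)}%')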
Ejemplo n.º 27
0
    GradientBoostingClassifier(),
    # GaussianProcessClassifier(multi_class = 'one_vs_rest', n_jobs = -1), # Out of memory: cannot allocate the required memory
    KNeighborsClassifier(n_jobs=-1),
    LGBMClassifier(n_jobs=-1),
    # LabelSpreading(n_jobs = -1),          # Out of memory: cannot allocate the required memory
    LinearDiscriminantAnalysis(),
    LogisticRegression(solver='newton-cg',
                       multi_class='multinomial',
                       n_jobs=-1),
    LogisticRegressionCV(solver='newton-cg',
                         multi_class='multinomial',
                         n_jobs=-1),
    MLPClassifier(),
    NearestCentroid(),
    # LabelPropagation(n_jobs = -1),        # Out of memory: cannot allocate the required memory
    PassiveAggressiveClassifier(n_jobs=-1),
    Perceptron(n_jobs=-1),
    QuadraticDiscriminantAnalysis(),
    # RadiusNeighborsClassifier(n_jobs = -1), # Out of memory: the process was killed
    RandomForestClassifier(n_jobs=-1),
    RidgeClassifier(),
    RidgeClassifierCV(),
    XGBClassifier(n_jobs=-1)
]

# master record of all training reports
train_reports = []

for test_size in test_size_list:
    for random_seed in random_seed_list:
Ejemplo n.º 28
0
def featureSelection(globalIndex, variableSize, run):

    # a few hard-coded values
    numberOfFolds = 10

    # list of classifiers, selected on the basis of our previous paper
    classifierList = [
        # ensemble
        #[AdaBoostClassifier(), "AdaBoostClassifier"],
        #[AdaBoostClassifier(n_estimators=300), "AdaBoostClassifier(n_estimators=300)"],
        #[AdaBoostClassifier(n_estimators=1500), "AdaBoostClassifier(n_estimators=1500)"],
        #[BaggingClassifier(), "BaggingClassifier"],
        [
            GradientBoostingClassifier(n_estimators=300),
            "GradientBoostingClassifier(n_estimators=300)"
        ],
        [
            RandomForestClassifier(n_estimators=300),
            "RandomForestClassifier(n_estimators=300)"
        ],
        [LogisticRegression(), "LogisticRegression"],
        [PassiveAggressiveClassifier(), "PassiveAggressiveClassifier"],
        [SGDClassifier(), "SGDClassifier"],
        [SVC(kernel='linear'), "SVC(linear)"],
        [RidgeClassifier(), "RidgeClassifier"],
        [
            BaggingClassifier(n_estimators=300),
            "BaggingClassifier(n_estimators=300)"
        ],
        #[ExtraTreesClassifier(), "ExtraTreesClassifier"],
        #[ExtraTreesClassifier(n_estimators=300), "ExtraTreesClassifier(n_estimators=300)"],
        #[GradientBoostingClassifier(), "GradientBoostingClassifier"], # features_importances_
        #[GradientBoostingClassifier(n_estimators=300), "GradientBoostingClassifier(n_estimators=300)"],
        #[GradientBoostingClassifier(n_estimators=1000), "GradientBoostingClassifier(n_estimators=1000)"],
        #[RandomForestClassifier(), "RandomForestClassifier"],
        #[RandomForestClassifier(n_estimators=300), "RandomForestClassifier(n_estimators=300)"],
        #[RandomForestClassifier(n_estimators=1000), "RandomForestClassifier(n_estimators=1000)"], # features_importances_

        # linear
        #[ElasticNet(), "ElasticNet"],
        #[ElasticNetCV(), "ElasticNetCV"],
        #[Lasso(), "Lasso"],
        #[LassoCV(), "LassoCV"],
        #[LogisticRegression(), "LogisticRegression"], # coef_
        #[LogisticRegressionCV(), "LogisticRegressionCV"],
        #[PassiveAggressiveClassifier(), "PassiveAggressiveClassifier"], # coef_
        #[RidgeClassifier(), "RidgeClassifier"], # coef_
        #[RidgeClassifierCV(), "RidgeClassifierCV"],
        #[SGDClassifier(), "SGDClassifier"], # coef_
        #[SVC(kernel='linear'), "SVC(linear)"], # coef_, but only if the kernel is linear...the default is 'rbf', which is NOT linear

        # naive Bayes
        #[BernoulliNB(), "BernoulliNB"],
        #[GaussianNB(), "GaussianNB"],
        #[MultinomialNB(), "MultinomialNB"],

        # neighbors
        #[KNeighborsClassifier(), "KNeighborsClassifier"], # no way to return feature importance
        # TODO this one creates issues
        #[NearestCentroid(), "NearestCentroid"], # it does not have some necessary methods, apparently
        #[RadiusNeighborsClassifier(), "RadiusNeighborsClassifier"],

        # tree
        #[DecisionTreeClassifier(), "DecisionTreeClassifier"],
        #[ExtraTreeClassifier(), "ExtraTreeClassifier"],
    ]

    # this is just a hack to check a few things
    #classifierList = [
    #	   [RandomForestClassifier(), "RandomForestClassifier"]
    #	   ]

    print("Loading dataset...")
    if (globalIndex == 0):
        X, y, biomarkerNames = loadDatasetOriginal(run)
    else:
        X, y, biomarkerNames = loadDataset(globalIndex, run)

    numberOfTopFeatures = int(variableSize)
    # create folder
    folderName = "./run" + str(run) + "/"
    if not os.path.exists(folderName): os.makedirs(folderName)

    # prepare folds
    skf = StratifiedKFold(n_splits=numberOfFolds, shuffle=True)
    indexes = [(training, test) for training, test in skf.split(X, y)]

    # this will be used for the top features
    topFeatures = dict()

    # iterate over all classifiers
    classifierIndex = 0

    globalAccuracy = 0

    for originalClassifier, classifierName in classifierList:

        print("\nClassifier " + classifierName)
        classifierPerformance = []
        classifierTopFeatures = dict()

        # iterate over all folds
        for train_index, test_index in indexes:

            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # let's normalize, anyway
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            classifier = copy.deepcopy(originalClassifier)
            classifier.fit(X_train, y_train)
            scoreTraining = classifier.score(X_train, y_train)
            scoreTest = classifier.score(X_test, y_test)

            print("\ttraining: %.4f, test: %.4f" % (scoreTraining, scoreTest))
            classifierPerformance.append(scoreTest)

            # now, let's get a list of the most important features, then mark the ones in the top X
            orderedFeatures = relativeFeatureImportance(classifier)
            for i in range(0, numberOfTopFeatures):

                feature = int(orderedFeatures[i][1])

                if feature in topFeatures:
                    topFeatures[feature] += 1
                else:
                    topFeatures[feature] = 1

                if feature in classifierTopFeatures:
                    classifierTopFeatures[feature] += 1
                else:
                    classifierTopFeatures[feature] = 1

        line = "%s\t%.4f\t%.4f\n" % (classifierName,
                                     np.mean(classifierPerformance),
                                     np.std(classifierPerformance))

        globalAccuracy = globalAccuracy + np.mean(classifierPerformance)

        print(line)
        fileName = folderName + "results.txt"
        fo = open(fileName, 'a')
        fo.write(line)
        fo.close()
        # save most important features for the classifier
        with open(os.path.join(folderName, classifierName + ".csv"),
                  "w") as fp:

            fp.write("feature,frequencyInTop" + str(numberOfTopFeatures) +
                     "\n")

            # transform dictionary into list
            listOfClassifierTopFeatures = [(key, classifierTopFeatures[key])
                                           for key in classifierTopFeatures]
            listOfClassifierTopFeatures = sorted(listOfClassifierTopFeatures,
                                                 key=lambda x: x[1],
                                                 reverse=True)

            for feature, frequency in listOfClassifierTopFeatures:
                fp.write(
                    str(biomarkerNames[feature]) + "," +
                    str(float(frequency / numberOfFolds)) + "\n")

    # save most important features overall
    with open(
            os.path.join(folderName,
                         "global_" + str(int(globalIndex)) + ".csv"),
            "w") as fp:

        fp.write("feature,frequencyInTop" + str(numberOfTopFeatures) + "\n")

        # transform dictionary into list
        listOfTopFeatures = [(key, topFeatures[key]) for key in topFeatures]
        listOfTopFeatures = sorted(listOfTopFeatures,
                                   key=lambda x: x[1],
                                   reverse=True)

        tempIndex = 0
        for feature, frequency in listOfTopFeatures:
            if tempIndex < numberOfTopFeatures:
                fp.write(
                    str(biomarkerNames[feature]) + "," +
                    str(float(frequency / numberOfFolds)) + "\n")
            tempIndex = tempIndex + 1
    globalAccuracy = globalAccuracy / len(classifierList)
    return globalAccuracy
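# relativeFeatureImportance() is not shown in this excerpt; a plausible sketch
# consistent with its use above -- (importance, feature_index) pairs sorted
# best-first, for models exposing feature_importances_ or coef_ -- is given
# below as an assumption, not the authors' actual helper:
def relativeFeatureImportanceSketch(classifier):
    import numpy as np
    if hasattr(classifier, "feature_importances_"):
        weights = np.abs(classifier.feature_importances_)
    elif hasattr(classifier, "coef_"):
        # linear models: aggregate absolute coefficients across classes
        weights = np.sum(np.abs(np.atleast_2d(classifier.coef_)), axis=0)
    else:
        raise ValueError("classifier exposes no feature importance")
    order = np.argsort(weights)[::-1]
    return [(weights[i], i) for i in order]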
Ejemplo n.º 29
0
    t0 = time()
    pred = clf.predict(X_test_counts)
    test_time = time() - t0
    test_times.append(test_time)
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    scores.append(score)
    print("accuracy:   %0.3f" % score)


results = []
for clf, name in (
        (RidgeClassifier(tol=1e-2, solver="sag", max_iter=1200000), "Ridge Classifier"),
        (Perceptron(max_iter=50), "Perceptron"),
        (PassiveAggressiveClassifier(max_iter=1200000),
         "Passive-Aggressive"),
        (KNeighborsClassifier(n_neighbors=10), "kNN")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))

for penalty in ["l2", "l1"]:
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    results.append(benchmark(LinearSVC(penalty=penalty, dual=False,
                                       tol=1e-3)))
    # Train SGD model
    results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=1200000,
                                           penalty=penalty)))
Ejemplo n.º 30
0
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import RidgeClassifierCV
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier

classifiers_list = {
    'AdaBoostClassifier': AdaBoostClassifier(),
    'BaggingClassifier': BaggingClassifier(),
    'ExtraTreesClassifier': ExtraTreesClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
    'LogisticRegression': LogisticRegression(),
    'PassiveAggressiveClassifier': PassiveAggressiveClassifier(),
    'RidgeClassifier': RidgeClassifier(),
    'RidgeClassifierCV': RidgeClassifierCV(),
    'SGDClassifier': SGDClassifier(),
    'MLPClassifier': MLPClassifier(),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'ExtraTreeClassifier': ExtraTreeClassifier()
}
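# A minimal sketch of how such a registry is typically consumed: fit every
# model on a common split and rank them by held-out accuracy (toy data here;
# the real dataset and evaluation are outside this excerpt):
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=300, n_features=20, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
results = {}
for name, model in classifiers_list.items():
    model.fit(X_train, y_train)
    results[name] = model.score(X_test, y_test)
for name, acc in sorted(results.items(), key=lambda kv: kv[1], reverse=True):
    print("%-28s %.4f" % (name, acc))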