# NOTE(review): this chunk opens with the dangling tail of a heatmap-style
# plotting call whose opening line lies outside this view — presumably a
# seaborn heatmap of a confusion matrix; confirm against the full file.
          annot=True, linewidth=.5, square=True, cmap='Blues_r', fmt='f')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.title('Confusion Matrix', size=15)

# Per-class precision/recall/F1 for the (presumably MultinomialNB) predictions.
report = classification_report(test_df['test_label'], mnb_pred)
print(report)


# ## Passive Aggressive Classifier

## Instantiating a Passive Aggressive Classifier : pa_tfidf_clf
pa_tfidf_clf = PassiveAggressiveClassifier()

## Fit the classifier to the training data
# NOTE(review): despite the "_tfidf_" name, this model is trained on the
# count-vectorized features (count_train) — verify which matrix was intended.
pa_tfidf_clf.fit(count_train, train_df['train_label'])

## Create the predicted tags: pac_pred
pac_pred = pa_tfidf_clf.predict(count_test)

## Calculate the accuracy score: pac_score
pac_score = metrics.accuracy_score(test_df['test_label'], pac_pred)

## Calculate the confusion matrix: pac_cm
# Rows/columns are ordered by the explicit labels ['true', 'false'].
pac_cm = metrics.confusion_matrix(test_df['test_label'], pac_pred,
                                  labels=['true', 'false'])
print('Confusion Matrix --- PassiveAggressiveClassifier')
print(pac_cm)
# NOTE(review): this chunk starts mid-call — the tail of a
# metrics.classification_report(...) invocation inside a benchmark() helper
# whose definition begins above this view.
                                            target_names=categories))
    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))
    print()
    # Classifier name = text before the first '(' of its repr.
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time


results = []
for clf, name in (
        (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
        # NOTE(review): `n_iter` was renamed `max_iter` in scikit-learn
        # 0.19+ and later removed — confirm the pinned sklearn version.
        (Perceptron(n_iter=50), "Perceptron"),
        (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"),
        (KNeighborsClassifier(n_neighbors=10), "kNN"),
        (RandomForestClassifier(n_estimators=100), "Random forest")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))

for penalty in ["l2", "l1"]:
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    # NOTE(review): LinearSVC(loss='l2') is the legacy alias of
    # loss='squared_hinge'; removed in newer scikit-learn releases.
    results.append(benchmark(LinearSVC(loss='l2', penalty=penalty,
                                       dual=False, tol=1e-3)))
    # Train SGD model
    # NOTE(review): truncated — this SGDClassifier(...) call continues past
    # the end of this chunk.
    results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,
def modele_ens(clf, tr, y_tr, ts, y_ts):
    """Fit *clf* on the training split and report its test accuracy.

    Parameters
    ----------
    clf : estimator with ``fit``/``predict`` (scikit-learn style)
    tr, y_tr : training features and labels
    ts, y_ts : test features and labels

    Returns
    -------
    float
        Accuracy on (ts, y_ts).  The original version computed this value
        and discarded it (and built an unused ``np.array`` copy of the
        labels, then ``del``-ed locals that would be freed anyway); the
        dead code is removed and the score is now returned so callers can
        collect results.  Callers that ignore the return value behave as
        before.
    """
    clf.fit(tr, y_tr)
    pred = clf.predict(ts)
    score = accuracy_score(y_ts, pred)
    print(clf)
    print(score)
    return score


# Ensemble-style / linear models
modele_ens(SVC(C=0.6, kernel='linear'), x_train2, y_train, x_test2, y_test)
modele_ens(PassiveAggressiveClassifier(C=1, random_state=42),
           x_train2, y_train, x_test2, y_test)
modele_ens(Perceptron(n_iter=3, alpha=1, eta0=0.01, warm_start=True),
           x_train2, y_train, x_test2, y_test)
# memory error
#modele_ens(RidgeClassifierCV(alphas=0.1),x_train2,y_train,x_test2,y_test)

# Non supervisés (unsupervised models)

# Neural Network
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import SGD
from keras.layers.advanced_activations import PReLU
from keras.utils import np_utils
def test_classifier_undefined_methods():
    # A PassiveAggressiveClassifier exposes only hinge-style decision
    # functions, so the probability / transform accessors must raise
    # AttributeError rather than silently existing.
    clf = PassiveAggressiveClassifier(max_iter=100)
    missing = ("predict_proba", "predict_log_proba", "transform")
    for attr_name in missing:
        with pytest.raises(AttributeError):
            getattr(clf, attr_name)
# NOTE(review): chunk starts mid-call — the tail of a
# metrics.classification_report(y_test, pred, ...) invocation inside a
# benchmark() helper whose definition begins above this view.
                                        pred, target_names=categories))
    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))
    print()
    # Classifier name = text before the first '(' of its repr.
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time


results = []
# NOTE(review): `n_iter` was renamed `max_iter` in scikit-learn 0.19+ and
# later removed — confirm the pinned sklearn version.
for clf, name in ((RidgeClassifier(tol=1e-2, solver="lsqr"),
                   "Ridge Classifier"),
                  (Perceptron(n_iter=50), "Perceptron"),
                  (PassiveAggressiveClassifier(n_iter=50),
                   "Passive-Aggressive"),
                  (KNeighborsClassifier(n_neighbors=10), "kNN")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))

for penalty in ["l2", "l1"]:
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    # NOTE(review): LinearSVC(loss='l2') is the legacy alias of
    # loss='squared_hinge'; removed in newer scikit-learn releases.
    results.append(
        benchmark(LinearSVC(loss='l2', penalty=penalty, dual=False,
                            tol=1e-3)))
    # Train SGD model
    # NOTE(review): truncated — the benchmarked estimator construction
    # continues past the end of this chunk.
    results.append(
# NOTE(review): chunk opens with the dangling `]` of a list literal defined
# above this view; the free variable `penalty` is also bound outside it.
]

# SGD regressors, one per supported loss, sharing the externally-bound penalty.
sgd_regressors = [('SGD-sl', SGDRegressor(loss='squared_loss', penalty=penalty)),
                  ('SGD-h', SGDRegressor(loss='huber', penalty=penalty)),
                  ('SGD-ei', SGDRegressor(loss='epsilon_insensitive', penalty=penalty)),
                  ('SGD-sei', SGDRegressor(loss='squared_epsilon_insensitive', penalty=penalty))]

# NOTE(review): the label 'Perceptron-II' appears twice below — the
# elasticnet entry was likely meant to be 'Perceptron-III'; confirm before
# using these labels as unique keys.
selected_classifiers = [('SGD', SGDClassifier(loss='hinge', penalty='l1')),
                        ('Perceptron-I', Perceptron(penalty='l1')),
                        ('Perceptron-II', Perceptron(penalty='l2')),
                        ('Perceptron-II', Perceptron(penalty='elasticnet')),
                        ('PA-I', PassiveAggressiveClassifier(loss='hinge')),
                        ('PA-II', PassiveAggressiveClassifier(loss='squared_hinge'))]

selected_regressors = [
    ('SGD-l1', SGDRegressor(loss='squared_loss', penalty='l2')),
    ('PA-I', PassiveAggressiveRegressor(loss='epsilon_insensitive')),
    ('PA-II', PassiveAggressiveRegressor(loss='squared_epsilon_insensitive'))
]

# Candidate feature scalers ("Standarization" spelling kept — it may be a key
# used elsewhere).
scalers = [('Min-Max', MinMaxScaler()),
           ('Standarization', StandardScaler()),
           ('Max-Abs', MaxAbsScaler())]

# NOTE(review): the file handle from open() is never closed — consider a
# with-block when this area is next touched.
data = json.load(
    open('machine_learner/collected_data/dataset_with_selected_features.json'))
features = data['features']
# NOTE(review): chunk starts mid-call — the tail of a
# metrics.classification_report(...) invocation inside a benchmark() helper
# whose definition begins above this view.  Unlike the sibling variants in
# this file, this copy already uses the modern `max_iter` parameter.
                                        target_names=target_names))
    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))
    print()
    # Classifier name = text before the first '(' of its repr.
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time


results = []
for clf, name in ((RidgeClassifier(tol=1e-2, solver="sag"),
                   "Ridge Classifier"),
                  (Perceptron(max_iter=50), "Perceptron"),
                  (PassiveAggressiveClassifier(max_iter=50),
                   "Passive-Aggressive"),
                  (KNeighborsClassifier(n_neighbors=10), "kNN"),
                  (RandomForestClassifier(), "Random forest")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))

for penalty in ["l2", "l1"]:
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    results.append(benchmark(LinearSVC(penalty=penalty, dual=False,
                                       tol=1e-3)))
    # Train SGD model
    # NOTE(review): truncated — the benchmarked estimator construction
    # continues past the end of this chunk.
    results.append(
def Define_pipelines(self):
    """Build every benchmark (name, pipeline) combination.

    The original body hand-wrote ~100 near-identical ``Pipeline`` blocks
    (4 classifier families x {plain tfidf, plain count} and x {count,
    tfidf embedding} x 4 word2vec models x {mean, sum, max} pooling) and
    then listed them all again in ``all_models``.  This version generates
    the same list data-driven.  The returned names, their order, and each
    pipeline's structure are preserved exactly, including two historical
    quirks kept for backward compatibility of result keys:

    * each family enumerated its embedding models in a *different* order
      (captured in ``model_orders`` below);
    * the ET family's tfidf-embedding entries misspell "tfidf" as
      "tifdf" (flagged, not fixed, since downstream code may key on the
      published names).

    Returns
    -------
    list[tuple[str, Pipeline]]
        One entry per model, each pipeline being a vectorizer step
        followed by a OneVsRestClassifier-wrapped estimator.
    """
    logging.info('Start defining pipelines...\n\n')

    # (name prefix, estimator class) — a fresh estimator instance is
    # created per pipeline, exactly as the hand-written version did.
    families = [
        ('SGD', SGDClassifier),
        ('SVC', SVC),
        ('PA', PassiveAggressiveClassifier),
        ('ET', ExtraTreesClassifier),
    ]
    # Per-family enumeration order of the self.model_<dim>_<window>
    # embedding models, preserved from the original all_models list.
    model_orders = {
        'SGD': ('300_15', '300_10', '100_10', '100_15'),
        'SVC': ('300_15', '100_15', '100_10', '300_10'),
        'PA':  ('300_15', '100_15', '100_10', '300_10'),
        'ET':  ('100_15', '100_10', '300_10', '300_15'),
    }
    # NOTE(review): the original ET embedding names spell "tifdf";
    # kept so downstream lookups by name keep working.
    tfidf_word = {'SGD': 'tfidf', 'SVC': 'tfidf', 'PA': 'tfidf', 'ET': 'tifdf'}

    all_models = []
    for prefix, clf_cls in families:
        # Plain (non-embedding) vectorizer pipelines first, tfidf then count.
        all_models.append((
            '%s tfidf' % prefix,
            Pipeline([
                ('tfidf', TfidfVectorizer()),
                ('clf', OneVsRestClassifier(clf_cls())),
            ]),
        ))
        all_models.append((
            '%s count' % prefix,
            Pipeline([
                ('count', CountVectorizer()),
                ('clf', OneVsRestClassifier(clf_cls())),
            ]),
        ))
        for key in model_orders[prefix]:
            model = getattr(self, 'model_' + key)
            dim_win = key.replace('_', ' ')  # e.g. '300_15' -> '300 15'
            for agg in ('mean', 'sum', 'max'):
                # 'mean' pooling carried no marker in the original names.
                agg_part = '' if agg == 'mean' else agg + ' '
                for vec_word, vec_cls in (
                        ('count',
                         embeddingvectorizer.EmbeddingCountVectorizer),
                        (tfidf_word[prefix],
                         embeddingvectorizer.EmbeddingTfidfVectorizer)):
                    all_models.append((
                        '%s %s embedding %s%s' % (prefix, vec_word,
                                                  agg_part, dim_win),
                        Pipeline([
                            ('Embedding', vec_cls(model, agg)),
                            ('clf', OneVsRestClassifier(clf_cls())),
                        ]),
                    ))
    return all_models
# NOTE(review): chunk starts mid-expression — the tail of a Pipeline([...])
# literal assigned (above this view) to a variable named `SVC`, which
# shadows scikit-learn's SVC class; confirm against the full file.
                     verbose=0, random_state=None, max_iter=1000), n_jobs=1)),
])

# One binary fit/evaluate pass per emotion label.
for emotion in emotions:
    SVC.fit(X_train, train[emotion])
    prediction = SVC.predict(X_test)
    print('Test accuracy of :' + emotion)
    print(accuracy_score(test[emotion], prediction))


# In[19]:

print("PAC")
pac = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', OneVsRestClassifier(PassiveAggressiveClassifier(random_state=0))),
])
for emotion in emotions:
    pac.fit(X_train, train[emotion])
    prediction = pac.predict(X_test)
    print('Test accuracy of :' + emotion)
    print(accuracy_score(test[emotion], prediction))


# In[21]:

import pickle

pkl_filename = "pickle_model.pkl"
# NOTE(review): the loop above refits `SVC` per emotion, so only the model
# for the *last* emotion is persisted here — confirm that is intended.
with open(pkl_filename, 'wb') as file:
    pickle.dump(SVC, file)
def evaluate_classifiers():
    """Benchmark a suite of scikit-learn classifiers on the prepared data.

    Loads the train/test split, fits the preprocessing pipeline once, then
    times fitting/prediction and prints accuracy for each model.  Results
    tuples ``(name, score, train_time, test_time)`` accumulate in ``results``.
    """
    train, test, labelsTrain, labelsTest = prepare_train_test()
    pipe = make_preprocessing_pipeline()
    pipe.fit(train, labelsTrain)

    # Transform once up front instead of re-running the (potentially
    # expensive) preprocessing pipeline inside every benchmark call.
    X_train = pipe.transform(train)
    X_test = pipe.transform(test)

    def benchmark(clf):
        """Fit ``clf``, print timings/accuracy, return summary stats."""
        print('_' * 10)
        print(clf)

        t0 = time()
        clf.fit(X_train, labelsTrain)
        train_time = time() - t0
        print("train time: %0.3fs" % train_time)

        t0 = time()
        pred = clf.predict(X_test)
        test_time = time() - t0
        print("test time: %0.3fs" % test_time)

        score = accuracy_score(labelsTest, pred)
        print("accuracy: %0.3f" % score)

        if hasattr(clf, 'coef_'):
            print("dimensionality: %d" % clf.coef_.shape[1])
            print("density: %f" % density(clf.coef_))

        clf_descr = str(clf).split('(')[0]
        return clf_descr, score, train_time, test_time

    results = []
    # `n_iter` was removed from Perceptron/PassiveAggressive/SGD in
    # scikit-learn 0.21; `max_iter` is the replacement.
    for clf, name in ((RidgeClassifier(tol=1e-2, solver="lsqr"),
                       "Ridge Classifier"),
                      (Perceptron(max_iter=50), "Perceptron"),
                      (PassiveAggressiveClassifier(max_iter=50),
                       "Passive-Aggressive"),
                      (KNeighborsClassifier(n_neighbors=10), "kNN"),
                      (RandomForestClassifier(n_estimators=100),
                       "Random forest")):
        print('=' * 10)
        print(name)
        results.append(benchmark(clf))

    for penalty in ["l2", "l1"]:
        print('=' * 10)
        print("%s penalty" % penalty.upper())
        # Train Liblinear model.  `loss='l2'` is the removed alias of
        # 'squared_hinge'; use the canonical name.
        results.append(
            benchmark(
                LinearSVC(loss='squared_hinge', penalty=penalty, dual=False,
                          tol=1e-3)))
        # Train SGD model
        results.append(
            benchmark(SGDClassifier(alpha=.0001, max_iter=50,
                                    penalty=penalty)))

    # Train SGD with Elastic Net penalty
    print('=' * 10)
    print("Elastic-Net penalty")
    results.append(
        benchmark(SGDClassifier(alpha=.0001, max_iter=50,
                                penalty="elasticnet")))

    # Train NearestCentroid without threshold
    print('=' * 10)
    print("NearestCentroid (aka Rocchio classifier)")
    results.append(benchmark(NearestCentroid()))

    # Train sparse Naive Bayes classifiers
    print('=' * 10)
    print("Naive Bayes")
    results.append(benchmark(MultinomialNB(alpha=.01)))
    results.append(benchmark(BernoulliNB(alpha=.01)))

    print('=' * 10)
    print("LinearSVC with L1-based feature selection")
    # The smaller C, the stronger the regularization.
    # The more regularization, the more sparsity.
    results.append(
        benchmark(
            Pipeline([('feature_selection',
                       LinearSVC(penalty="l1", dual=False, tol=1e-3)),
                      ('classification', LinearSVC())])))
def classify2(X, Y, classifier, X_test, Y_test):
    """Train a classifier on (X, Y) and evaluate it on a held-out split.

    Parameters
    ----------
    X, Y : training features and labels.
    classifier : tuple ``(name, estimator)`` where the estimator follows the
        scikit-learn fit/predict interface.
    X_test, Y_test : evaluation features and labels.

    Returns
    -------
    float
        Percentage accuracy on the test split (also printed).  The original
        only printed it; returning the value lets callers aggregate results
        without changing existing call sites.
    """
    name, clf = classifier
    print("training %s" % name)
    clf.fit(X, Y)
    y_pred = clf.predict(X_test)
    accuracy = np.mean(y_pred == Y_test) * 100
    print(accuracy)
    return accuracy


# define different classifiers
classifiers = [("KNneighbors", KNeighborsClassifier(n_neighbors=3)),
               ("SVM", svm.SVC()),
               ("SAG", LogisticRegression(solver='sag', tol=1e-1)),
               ("SGD", SGDClassifier()),
               ("ASGD", SGDClassifier(average=True)),
               ("Perceptron", Perceptron()),
               ("Passive-Aggressive I",
                PassiveAggressiveClassifier(loss='hinge', C=1.0)),
               ("Passive-Aggressive II",
                PassiveAggressiveClassifier(loss='squared_hinge', C=1.0))]

# Run the k-NN algorithm with an 80:20 training:test split.
X = data[::50]  # <-- every 50th sample: small subset to keep the demo fast
Y = labels_tr[::50]
t1 = time()
classify(X, Y, ("KNneighbors", KNeighborsClassifier(n_neighbors=7)), 0.2)
t2 = time()
print(t2 - t1, 's')
# Iterator over parsed Reuters SGML files.
data_stream = stream_reuters_documents()

# We learn a binary classification between the "acq" class and all the others.
# "acq" was chosen as it is more or less evenly distributed in the Reuters
# files. For other datasets, one should take care of creating a test set with
# a realistic portion of positive instances.
all_classes = np.array([0, 1])
positive_class = 'acq'

# Here are some classifiers that support the `partial_fit` method
partial_fit_classifiers = {
    'SGD': SGDClassifier(max_iter=5, tol=1e-3),
    'Perceptron': Perceptron(tol=1e-3),
    'NB Multinomial': MultinomialNB(alpha=0.01),
    'Passive-Aggressive': PassiveAggressiveClassifier(tol=1e-3),
}


def get_minibatch(doc_iter, size, pos_class=positive_class):
    """Extract a minibatch of examples, return a tuple X_text, y.

    Note: size is before excluding invalid docs with no topics assigned.
    """
    batch = []
    for doc in itertools.islice(doc_iter, size):
        # Documents without any topic labels are considered invalid.
        if doc['topics']:
            text = '{title}\n\n{body}'.format(**doc)
            batch.append((text, pos_class in doc['topics']))
    if not batch:
        return np.asarray([], dtype=int), np.asarray([], dtype=int)
    X_text, y = zip(*batch)
    return X_text, np.asarray(y, dtype=int)
y = [
    'male', 'female', 'female', 'female', 'male', 'male', 'male', 'female',
    'male', 'female', 'male'
]

# Single query point, reused for every model below.
# NOTE(review): presumably [height, weight, shoe size] — confirm against the
# definition of `x` earlier in the file.
sample = [[190, 70, 43]]

# Decision tree
treeCLF = tree.DecisionTreeClassifier().fit(x, y)
treePredict = treeCLF.predict(sample)
print('Decision Tree Classifier')
print(treePredict)
print('-------------------------')

# Passive-aggressive linear model
pac = PassiveAggressiveClassifier(random_state=0).fit(x, y)
pacPredict = pac.predict(sample)
print('Passive Aggressive Classifier')
print(pacPredict)
print('-------------------------')

# k-nearest neighbours
nc = KNeighborsClassifier().fit(x, y)
ncPredict = nc.predict(sample)
print('Neighbor Classifier')
print(ncPredict)
def benchmark(clf):
    """Fit ``clf`` on the global train split and report timing + accuracy.

    Relies on module-level globals defined earlier in the script:
    ``x_train``, ``x_test``, ``categorie_train`` and ``categorie_test``.

    Returns
    -------
    tuple
        ``(train_time, test_time, score)`` — the original only printed these;
        returning them lets callers collect results without breaking existing
        call sites that ignore the return value.
    """
    t0 = time()
    clf.fit(x_train, categorie_train)
    train_time = time() - t0
    print(" Train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(x_test)
    test_time = time() - t0
    print(" Test time: %0.3fs" % test_time)

    score = metrics.accuracy_score(categorie_test, pred)
    print(" Accuratezza: %0.3f" % score)
    return train_time, test_time, score


# SGDClassifier
print(" SGDClassifier test:\n")
SDGC = SGDClassifier(penalty='l2', n_jobs=-1)
benchmark(SDGC)
print("\n\n")

# ComplementNB
print(" ComplementNB test:\n")
CNB = ComplementNB()
benchmark(CNB)
print("\n\n")

# PassiveAggressiveClassifier
print(" PassiveAggressiveClassifier test:\n")
PAC = PassiveAggressiveClassifier(max_iter=50, n_jobs=-1)
benchmark(PAC)

# End of script
print("\n")
print("\n---------- ESECUZIONE TERMINATA! ----------\n\n")
# Iterator over parsed Reuters SGML files. data_stream = stream_reuters_documents() # We learn a binary classification between the "acq" class and all the others. # "acq" was chosen as it is more or less evenly distributed in the Reuters # files. For other datasets, one should take care of creating a test set with # a realistic portion of positive instances. all_classes = np.array([0, 1]) positive_class = 'acq' # Here are some classifiers that support the `partial_fit` method partial_fit_classifiers = { 'SGD': SGDClassifier(), 'Perceptron': Perceptron(), 'NB Multinomial': MultinomialNB(alpha=0.01), 'Passive-Aggressive': PassiveAggressiveClassifier(), } def get_minibatch(doc_iter, size, pos_class=positive_class): """Extract a minibatch of examples, return a tuple X_text, y. Note: size is before excluding invalid docs with no topics assigned. """ data = [(u'{title}\n\n{body}'.format(**doc), pos_class in doc['topics']) for doc in itertools.islice(doc_iter, size) if doc['topics']] if not len(data): return np.asarray([], dtype=int), np.asarray([], dtype=int) X_text, y = zip(*data)
from sklearn.metrics import roc_auc_score, roc_curve from sklearn import metrics from sklearn.model_selection import train_test_split from sklearn.preprocessing import Binarizer from sklearn.linear_model import Perceptron, SGDClassifier, PassiveAggressiveClassifier Data_reader = csv_reader.CsvReader('../DataSet/TalkingData') Data_writer = csv_reader.CsvReader('../output') clfdir = { 'MNB': MultinomialNB(), 'BNB': BernoulliNB(), 'PT': Perceptron(), 'SG': SGDClassifier(), 'PAC': PassiveAggressiveClassifier() } # clfdir={'MNB':MultinomialNB(), # 'BNB':BernoulliNB()} def get_next_data(train_data_chunk, persent=1, number=20000): print('get_next_data ') res_data = pd.DataFrame() test_data = pd.DataFrame() for data_chunk in train_data_chunk: if test_data.empty: test_data = data_chunk else: if random.randint(0, 10) > 2:
def run_exp(outpath, random_states, datasets, model_list, alphas=None, proportion=None):
    """Run streaming-classification experiments and dump statistics as CSV.

    For every (random_state, dataset) combination, builds the models named in
    ``model_list``, streams the training set in minibatches, tracks per-model
    progressive error counts (plus loss/KL/log-likelihood for the Bayesian
    models), and writes the accumulated statistics under
    ``<outpath>/<timestamp>/<random_state>/<dataset>/``.  Returns nothing.

    NOTE(review): depends on module-level globals not visible in this chunk —
    ``batch_size``, ``stat_interval``, ``n_samples``, ``get_minibatch``, the
    ``load_*`` helpers and the model classes (DummyModel, M_AROW, AROW,
    BayesianClassifier).  Confirm against the rest of the file.
    """
    # NOTE(review): `id` shadows the builtin of the same name; kept as-is.
    id = time.time()
    directory = "%s/%i" % (outpath, int(id))
    stat_rate = 100  # flush accumulated stats to disk every `stat_rate` samples
    if not os.path.exists(directory):
        os.makedirs(directory)
    for random_state in random_states:
        directory1 = "%s/%i" % (directory, random_state)
        if not os.path.exists(directory1):
            os.makedirs(directory1)
        for dataset in datasets:
            directory2 = "%s/%s" % (directory1, dataset)
            if not os.path.exists(directory2):
                os.makedirs(directory2)
            # Load the requested dataset split.
            if dataset == "20news":
                data = load_20news(100000, random_state)
            elif dataset == "mnist":
                data = load_mnist_train_test(random_state)
            elif dataset == "product_type":
                data = load_product_type_dataset(random_state)
            else:
                raise Exception(
                    "Dataset %s is not found. Download data from google drive. Set the correct path to data"
                    % dataset)
            train_data, train_target, test_data, test_target = data
            # Optionally truncate the training set to a fraction of its rows.
            if proportion is not None:
                end = int(train_data.shape[0] * proportion)
                if end <= 0:
                    raise Exception("Proportion is invalid")
                train_data, train_target = train_data[:end], train_target[:end]
            n, d = train_data.shape
            c = np.unique(train_target).shape[0]
            scale = n / batch_size  # scaling factor passed to the SVI model
            # scale = 10e8
            # Per-dataset learning rate for the Bayesian models.
            if dataset == "20news":
                lr = 0.05
            else:
                lr = 0.01
            n_iter = 1  # inner fit iterations per minibatch
            print(
                "Data set summery : number of samples %i, number of features %i, number of classes %i"
                % (n, d, c))
            # create models
            models = {}   # model name -> model instance
            bmodels = []  # names of Bayesian models (these get loss/kl logs)
            for model in model_list:
                if model == "AROW":
                    # AROW is replaced by a dummy on 20news; multiclass
                    # problems use the M_AROW variant.
                    if dataset == "20news":
                        m_model = DummyModel()
                        models[model] = m_model
                    else:
                        if c > 2:
                            # m_model = sol.SOL('arow', c)
                            m_model = M_AROW(
                                c, d)  # TODO verify the arrow implementation
                        else:
                            m_model = AROW(d)
                        models[model] = m_model
                elif model == "SGD":
                    m_model = SGDClassifier(max_iter=1, tol=1)
                    models[model] = m_model
                elif model == "PA":
                    m_model = PassiveAggressiveClassifier(max_iter=1, tol=1)
                    models[model] = m_model
                elif model == "BB-SVB":
                    m_model = BayesianClassifier("BB-SVB", n_samples, d, c,
                                                 learning_rate=lr)
                    models[model] = m_model
                    bmodels.append("BB-SVB")
                elif model == "SSVB":
                    m_model = BayesianClassifier("SSVB", n_samples, d, c,
                                                 learning_rate=lr)
                    bmodels.append("SSVB")
                    models[model] = m_model
                elif model == "SVI":
                    m_model = BayesianClassifier("SVI", n_samples, d, c,
                                                 learning_rate=lr,
                                                 scale=scale)
                    bmodels.append("SVI")
                    models[model] = m_model
                elif model == "PVI":
                    if alphas is None or len(alphas) < 1:
                        raise Exception("%s require valid values for alpha" %
                                        model)
                    # One PVI model per alpha.  NOTE(review): this rebinds the
                    # loop variable `model` to the derived "PVI(..)" name.
                    for alpha in alphas:
                        model = "PVI(%d)" % int(np.log10(alpha))
                        m_model = BayesianClassifier(model, n_samples, d, c,
                                                     learning_rate=lr,
                                                     scale=alpha)
                        bmodels.append(model)
                        models[model] = m_model
                else:
                    raise Exception("Unknown model : %s" % model)
            # Per-model statistic accumulators (Bayesian models only, except
            # for the error arrays which cover every model).
            loglik_arr = {}
            loss_arr = {}
            kl_arr = {}
            mean_arr = {}
            std_arr = {}
            for model in bmodels:
                loss_arr[model] = []
                kl_arr[model] = []
                mean_arr[model] = []
                std_arr[model] = []
                loglik_arr[model] = []
            error_counts = {}
            error_arr = {}
            out_stats = {
                "error": error_arr,
                "loss": loss_arr,
                "kl": kl_arr,
                "mean": mean_arr,
                "std": std_arr
            }
            for model in models:
                error_counts[model] = 0
                error_arr[model] = []
            # Stream the training data in minibatches.
            for i in range(0, n, batch_size):
                X, y = get_minibatch(train_data, train_target, i, batch_size)
                for model in models:
                    # PA/SGD cannot predict before their first partial_fit,
                    # so a placeholder prediction is used on the first batch.
                    if i == 0:
                        if model in ["PA", "SGD"]:
                            pred = [0]
                        else:
                            pred = models[model].predict(X)
                    else:
                        pred = models[model].predict(X)
                    # error logs
                    # NOTE(review): only the first sample of each batch feeds
                    # the progressive error count.
                    if pred[0] != y[0]:
                        error_counts[model] += 1
                    error_arr[model].append(error_counts[model])
                    if model in loss_arr:
                        # Bayesian model: fit returns (loss, kl divergence).
                        loss, kl = models[model].fit(X, y, i, iter=n_iter)
                        if i % stat_interval == 0:
                            log_p = models[model].log_predictive_likelihood(
                                test_data, test_target)
                            loglik_arr[model].append(log_p[0])
                            _mean, _std = models[model].get_vars()
                            mean_arr[model].append(str(_mean.tolist()[:10]))
                            std_arr[model].append(str(_std.tolist()[:10]))
                        loss_arr[model].append(loss)
                        kl_arr[model].append(kl)
                    elif model in ["PA", "SGD"]:
                        models[model].partial_fit(X, y, range(0, c))
                    else:
                        models[model].fit(X, y)
                losses_str = "%d" % i
                for model in loss_arr:
                    losses_str += " %s : (%f, %f)" % (model,
                                                      loss_arr[model][-1],
                                                      kl_arr[model][-1])
                print(losses_str)
                error_str = "Error Counts ------->"
                for model in models:
                    error_str += " %s:%d," % (model, error_counts[model])
                print(error_str)
                print(
                    "========================================================")
                # Periodically append buffered stats to CSV and clear them.
                if i % stat_rate == 0 or i == n - 1:
                    print("writing stats to directory : %s" % directory2)
                    for stat in out_stats:
                        path = "%s/%s.csv" % (directory2, stat)
                        stats = out_stats[stat]
                        df = pd.DataFrame(stats)
                        if not os.path.isfile(path):
                            df.to_csv(path)
                        else:
                            df.to_csv(path, mode='a', header=False)
                        # Reset the flushed buffers.
                        for model in stats:
                            stats[model] = []
            error_str = "Final error rate ========>>"
            for model in models:
                error_str += " %s:%f," % (model,
                                          error_counts[model] / float(n))
            print(error_str)
            # final accuracy computation
            final_acc_arr = {}
            for model in models:
                pred = models[model].predict(test_data)
                # Binary tasks use plain F1; multiclass uses micro-averaged F1.
                if c <= 2:
                    acc = f1_score(test_target, np.array(pred).flatten())
                else:
                    acc = f1_score(test_target,
                                   np.array(pred).flatten(),
                                   average="micro")
                final_acc_arr[model] = [acc]
                print("Accuracy of %s : %f" % (model, acc))
            final_stats = {
                "accuracy": final_acc_arr,
                "likelihood": loglik_arr,
            }
            for stat in final_stats:
                path = "%s/%s.csv" % (directory2, stat)
                pd.DataFrame(final_stats[stat]).to_csv(path)
}, 'cnb': { 'cnb__alpha': [1, 0.1, 0.01, 0.001] }, 'ridge': {}, 'perceptron': {}, 'pa': {}, 'rf': {}, 'nc': {}, 'cnb': {} } models = { 'mnb': MultinomialNB(), 'ridge': RidgeClassifier(tol=1e-2, solver="sag"), 'perceptron': Perceptron(max_iter=50), 'pa': PassiveAggressiveClassifier(max_iter=50), 'knn': KNeighborsClassifier(n_neighbors=10), 'rf': RandomForestClassifier(), 'lsvc': LinearSVC(dual=False, tol=1e-3), 'sgd': SGDClassifier(alpha=.0001, max_iter=50), 'nc': NearestCentroid(), 'cnb': ComplementNB() } for key in models: print(key) params = {} steps = {} steps[key] = Pipeline(pre_steps + [(key, models[key])]) params[key] = [{**map, **models_param_grid[key]} for map in vect_param] estimator = EstimatorSelectionHelper(steps, params)
# Persist each trained classifier.  `with open(...)` guarantees the file
# handle is closed even if pickling raises (the original used bare
# open()/close() pairs, which leak the handle on error).

# Logistic regression model trained earlier in the script.
with open("c:/users/user/desktop/project/logistics_classifier.pickle",
          "wb") as save_classifier:
    pickle.dump(logistics_classifier, save_classifier)

# SGDClassifier
SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print('Original SGDC Classifier algo accuracy percent:',
      (nltk.classify.accuracy(SGDClassifier_classifier, testing_set)) * 100)
with open("c:/users/user/desktop/project/SGDClassifier_classifier.pickle",
          "wb") as save_classifier:
    pickle.dump(SGDClassifier_classifier, save_classifier)

# PassiveAgressiveClassifier
PassiveAggressiveClassifier_classifier = SklearnClassifier(
    PassiveAggressiveClassifier())
PassiveAggressiveClassifier_classifier.train(training_set)
print('Original Passive Aggressive Classifier algo accuracy percent:',
      (nltk.classify.accuracy(PassiveAggressiveClassifier_classifier,
                              testing_set)) * 100)
with open(
        "c:/users/user/desktop/project/PassiveAggressiveClassifier_classifier.pickle",
        "wb") as save_classifier:
    pickle.dump(PassiveAggressiveClassifier_classifier, save_classifier)

# LinearSVC_classifier
LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print('Original Linear SVC_classifier algo accuracy percent:',
      (nltk.classify.accuracy(LinearSVC_classifier, testing_set)) * 100)
with open("c:/users/user/desktop/project/LinearSVC_classifier.pickle",
          "wb") as save_classifier:
    pickle.dump(LinearSVC_classifier, save_classifier)
SVC(kernel="linear", C=100, class_weight='balanced'), SVC(kernel="linear", C=500, class_weight='balanced'), SVC(kernel="linear", C=1000, class_weight='balanced'), SVC(kernel="linear", C=0.25, class_weight='balanced'), SVC(gamma=2, C=0.1, class_weight='balanced'), SVC(gamma=2, C=0.25, class_weight='balanced'), SVC(C=0.1, class_weight='balanced'), SVC(C=5, class_weight='balanced'), SVC(C=10, class_weight='balanced'), SVC(C=50, class_weight='balanced'), SVC(C=100, class_weight='balanced'), SVC(C=500, class_weight='balanced'), SVC(C=1000, class_weight='balanced'), SVC(class_weight='balanced') ], [GaussianNB()], [RandomForestClassifier()], [PassiveAggressiveClassifier()], [AdaBoostClassifier()], [GradientBoostingClassifier()]] score = {} res = {} kfold = KFold(n_splits=10) for i in range(len(clf)): scores = [] for j in clf[i]: fold_score = [] for train_index, test_index in kfold.split(x_pol, y_pol): x_train = x_pol[train_index] x_test = x_pol[test_index] y_train = y_pol[train_index] y_test = y_pol[test_index]
# Learn vocabulary from train set
countVec.fit(trainText)

# Turn each list of reviews into a bag-of-words matrix.
trainX = countVec.transform(trainText)
devX = countVec.transform(devText)
testX = countVec.transform(testText)

print("Shape of Train X {}\n".format(trainX.shape))
vocab_sample = np.random.choice(countVec.get_feature_names(), 20)
print("Sample of the vocab:\n {}".format(vocab_sample))

#%% PICK A MODEL AND EXPERIMENT
lr = LogisticRegression()
passAgg = PassiveAggressiveClassifier()
perceptron = Perceptron()

# Fit each model on the training matrix and compare train vs dev accuracy.
lr.fit(trainX, trainY)
print("Logistic Regression Train:", lr.score(trainX, trainY))
print("Logistic Regression Dev:", lr.score(devX, devY))
print("--")

passAgg.fit(trainX, trainY)
print("Passive Aggressive Train:", passAgg.score(trainX, trainY))
print("Passive Aggressive Dev:", passAgg.score(devX, devY))
print("--")

perceptron.fit(trainX, trainY)
print("Perceptron Train:", perceptron.score(trainX, trainY))
print("Perceptron Dev:", perceptron.score(devX, devY))
def train_and_test_classifiers(train_set, test_set):
    """Train a battery of classifiers on ``train_set`` and print accuracies.

    Each model is wrapped in nltk's ``SklearnClassifier`` (except the classic
    nltk Naive Bayes), trained, and its ``test_set`` accuracy printed.  The
    function ends with a majority-vote ensemble of the stronger models and
    prints its classification + confidence on a few sample documents.
    """

    def _benchmark(label, sk_classifier):
        """Train an SklearnClassifier wrapper, print its test accuracy,
        and return the trained classifier."""
        sk_classifier.train(train_set)
        print(label + " accuracy percent:",
              (nltk.classify.accuracy(sk_classifier, test_set)) * 100)
        return sk_classifier

    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print("Classic Naive Bayes Classifier accuracy percent:",
          (nltk.classify.accuracy(classifier, test_set)) * 100)
    # classifier.show_most_informative_features(15)

    MNB_classifier = _benchmark(
        "Multinomial Naive Bayes Classifier",
        SklearnClassifier(MultinomialNB(alpha=0.01, fit_prior=False)))

    # GaussianNB requires dense feature arrays, which the nltk feature dicts
    # don't provide, so it is skipped.
    print("Skipping Gaussian Bayes Classifier accuracy percent")

    BNB_classifier = _benchmark("Bernoulli Naive Bayes Classifier",
                                SklearnClassifier(BernoulliNB(alpha=.01)))

    LG_classifier = _benchmark(
        "Logistic Regression Classifier",
        SklearnClassifier(LogisticRegression(random_state=42)))

    # SGD with hinge loss (linear SVM)
    _benchmark(
        "Stochastic Gradient Descent Classifier 1",
        SklearnClassifier(
            SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                          random_state=42, max_iter=1000, tol=None)))

    # SGD with Elastic Net penalty
    SGD_classifier2 = _benchmark(
        "Stochastic Gradient Descent Classifier 2",
        SklearnClassifier(
            SGDClassifier(alpha=1e-3, random_state=42, penalty="elasticnet",
                          max_iter=1000, tol=None)))

    # Trained once here; the original trained this classifier twice
    # (once via the chained .train() and once explicitly).
    _benchmark("C-Support Vector Classifier",
               SklearnClassifier(SVC(), sparse=False))

    _benchmark(
        "Linear Support Vector Classifier 1",
        SklearnClassifier(SVC(kernel='linear', probability=True, tol=1e-3)))

    LinearSVC_classifier2 = _benchmark(
        "Linear Support Vector Classifier 2",
        SklearnClassifier(LinearSVC("l1", dual=False, tol=1e-3)))

    _benchmark("Linear Support Vector Classifier 3",
               SklearnClassifier(LinearSVC("l2", dual=False, tol=1e-3)))

    NuSVC_classifier = _benchmark("Nu-Support Vector Classifier",
                                  SklearnClassifier(NuSVC()))

    # NearestCentroid (aka Rocchio classifier) without threshold
    _benchmark("Nearest Centroid Classifier",
               SklearnClassifier(NearestCentroid()))

    _benchmark(
        "Ridge Classifier",
        SklearnClassifier(RidgeClassifier(alpha=0.5, tol=1e-2, solver="sag")))

    _benchmark("Perceptron Classifier",
               SklearnClassifier(Perceptron(max_iter=1000)))

    _benchmark("Passive-Aggressive Classifier",
               SklearnClassifier(PassiveAggressiveClassifier(max_iter=1000)))

    _benchmark("kNN Classifier",
               SklearnClassifier(KNeighborsClassifier(n_neighbors=10)))

    # Majority-vote ensemble over the better-performing models.
    voted_classifier = VoteClassifier(classifier, MNB_classifier,
                                      BNB_classifier, LG_classifier,
                                      SGD_classifier2, LinearSVC_classifier2,
                                      NuSVC_classifier)
    print("Voted Classifier Classifier accuracy percent:",
          (nltk.classify.accuracy(voted_classifier, test_set)) * 100)

    # Spot-check a few individual documents (indices as in the original).
    for idx in (0, 2, 3, 4):
        print("Classification: ",
              voted_classifier.classify(test_set[idx][0]), "Confidence: %",
              voted_classifier.confidence(test_set[idx][0]) * 100)
def test_partial_fit_weight_class_balanced():
    """partial_fit must reject class_weight='balanced' with a ValueError."""
    estimator = PassiveAggressiveClassifier(class_weight="balanced",
                                            max_iter=100)
    classes = np.unique(y)
    with pytest.raises(ValueError):
        estimator.partial_fit(X, y, classes=classes)
from sklearn.pipeline import Pipeline from sklearn.decomposition import PCA from baseline_estimators import BaselineClassifier, BaselineRegressor def getts(): return dt.datetime.today().strftime('%Y-%m-%d %H:%M:%S') classifiers = dict( lr=LogisticRegression(), lrcv=LogisticRegressionCV(), sgd=SGDClassifier(), perceptron=Perceptron(), pac=PassiveAggressiveClassifier(), lsvc=LinearSVC(C=5), dt=DecisionTreeClassifier(), rf=RandomForestClassifier(max_depth=50, n_estimators=100, min_samples_split=20), gbm=GradientBoostingClassifier(max_depth=20, n_estimators=50, min_samples_split=20), gs_lr=GridSearchCV(estimator=LogisticRegression(), param_grid=dict(C=[1, 3, 5, 10], penalty=['l1', 'l2']), cv=3), gs_dt=GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=dict(max_depth=[5, 7, 10], min_samples_split=[10, 20, 30]), cv=3),
# Split the dataset: 80% train / 20% test, fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(df['text'], labels,
                                                    test_size=0.2,
                                                    random_state=7)

# Initialize a TfidfVectorizer; terms appearing in >70% of documents are
# discarded as too common to be discriminative.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)

# Fit and transform train set, transform test set
tfidf_train = tfidf_vectorizer.fit_transform(x_train)
tfidf_test = tfidf_vectorizer.transform(x_test)
# print(tfidf_test)

# Initialize a PassiveAggressiveClassifier
pac = PassiveAggressiveClassifier(max_iter=50)
pac.fit(tfidf_train, y_train)

# Predict on the test set and calculate accuracy
y_pred = pac.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)
# print(f'Accuracy: {round(score*100,2)}%')

# Sample article kept for manual experimentation (may be referenced further
# down the script).
title = "Darren's Amazing Super Powers"
text = "I have 300 IQ, I am 50 feet tall and can shoot lasers out of my eyes. I am the greatest of them all and everyone refer to me as Darren The Great."
# print(pac.predict([[text]]))

# Use a context manager so the input file is always closed — the original
# opened it without ever calling close(), leaking the handle.
with open('input.txt', "r") as f:
    input_data = f.read()
vectorized_input_data = tfidf_vectorizer.transform([input_data])
# Score the earlier classifier on the held-out TF-IDF features.
clf_pred = clf.predict(tfidf_test)
score = accuracy_score(y_test, clf_pred)
print(f'Accuracy: {round(score*100,2)}%')

# ## PASSIVE AGGRESSIVE CLASIFIER

# In[269]:

from sklearn.linear_model import PassiveAggressiveClassifier

# In[302]:

# Fit a passive-aggressive model on the training TF-IDF matrix; fit()
# returns the estimator itself, so construction and training chain.
pa_classifier = PassiveAggressiveClassifier(
    max_iter=1000, early_stopping=True,
    random_state=42).fit(tfidf_train, y_train)

# In[303]:

# Accuracy of the passive-aggressive model on the test split.
y_pred = pa_classifier.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(score*100,2)}%')

# In[272]:

# Confusion matrix over the two classes (bare expression — it was displayed
# by the original notebook cell).
confusion_matrix(y_test, y_pred, labels=[0, 1])
GradientBoostingClassifier(), # GaussianProcessClassifier(multi_class = 'one_vs_rest', n_jobs = -1), # Out of memory 配置不了所需記憶體 KNeighborsClassifier(n_jobs=-1), LGBMClassifier(n_jobs=-1), # LabelSpreading(n_jobs = -1), # Out of Memory 配置不了所需記憶體 LinearDiscriminantAnalysis(), LogisticRegression(solver='newton-cg', multi_class='multinomial', n_jobs=-1), LogisticRegressionCV(solver='newton-cg', multi_class='multinomial', n_jobs=-1), MLPClassifier(), NearestCentroid(), # LabelPropagation(n_jobs = -1), # Out of Memory 配置不了所需記憶體 PassiveAggressiveClassifier(n_jobs=-1), Perceptron(n_jobs=-1), QuadraticDiscriminantAnalysis(), # RadiusNeighborsClassifier(n_jobs = -1), # Out of Memory 被砍掉 RandomForestClassifier(n_jobs=-1), RidgeClassifier(), RidgeClassifierCV(), XGBClassifier(n_jobs=-1) ] # 訓練報告總記錄 train_reports = [] for test_size in test_size_list: for random_seed in random_seed_list:
def featureSelection(globalIndex, variableSize, run):
    """Cross-validated feature-importance analysis over a panel of classifiers.

    For each classifier, runs stratified k-fold CV on the dataset selected by
    ``globalIndex``/``run``, records the test accuracy and the
    ``int(variableSize)`` most important features of every fold, and writes:

      - ``./run<run>/results.txt``     — one "name<TAB>mean<TAB>std" line per classifier
      - ``./run<run>/<name>.csv``      — per-classifier top-feature frequencies
      - ``./run<run>/global_<i>.csv``  — top-feature frequencies over all classifiers

    Returns the mean test accuracy averaged over all classifiers.
    """
    # a few hard-coded values
    numberOfFolds = 10

    # List of classifiers, selected on the basis of our previous paper.
    # (The original carried a large block of commented-out alternatives,
    # removed here; see version history if they need to be revived.)
    classifierList = [
        [GradientBoostingClassifier(n_estimators=300),
         "GradientBoostingClassifier(n_estimators=300)"],
        [RandomForestClassifier(n_estimators=300),
         "RandomForestClassifier(n_estimators=300)"],
        [LogisticRegression(), "LogisticRegression"],
        [PassiveAggressiveClassifier(), "PassiveAggressiveClassifier"],
        [SGDClassifier(), "SGDClassifier"],
        [SVC(kernel='linear'), "SVC(linear)"],
        [RidgeClassifier(), "RidgeClassifier"],
        [BaggingClassifier(n_estimators=300),
         "BaggingClassifier(n_estimators=300)"],
    ]

    print("Loading dataset...")
    if globalIndex == 0:
        X, y, biomarkerNames = loadDatasetOriginal(run)
    else:
        X, y, biomarkerNames = loadDataset(globalIndex, run)
    numberOfTopFeatures = int(variableSize)

    # create output folder
    folderName = "./run" + str(run) + "/"
    if not os.path.exists(folderName):
        os.makedirs(folderName)

    # prepare folds once, so every classifier sees the same splits
    skf = StratifiedKFold(n_splits=numberOfFolds, shuffle=True)
    indexes = [(training, test) for training, test in skf.split(X, y)]

    topFeatures = dict()  # feature -> top-N count across ALL classifiers
    globalAccuracy = 0

    for originalClassifier, classifierName in classifierList:
        print("\nClassifier " + classifierName)
        classifierPerformance = []
        classifierTopFeatures = dict()  # feature -> top-N count, this classifier

        for train_index, test_index in indexes:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # let's normalize, anyway
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            # deep copy so each fold trains a fresh, untouched estimator
            classifier = copy.deepcopy(originalClassifier)
            classifier.fit(X_train, y_train)
            scoreTraining = classifier.score(X_train, y_train)
            scoreTest = classifier.score(X_test, y_test)
            print("\ttraining: %.4f, test: %.4f" %
                  (scoreTraining, scoreTest))
            classifierPerformance.append(scoreTest)

            # tally this fold's top-N features in both counters
            orderedFeatures = relativeFeatureImportance(classifier)
            for i in range(numberOfTopFeatures):
                feature = int(orderedFeatures[i][1])
                topFeatures[feature] = topFeatures.get(feature, 0) + 1
                classifierTopFeatures[feature] = \
                    classifierTopFeatures.get(feature, 0) + 1

        line = "%s\t%.4f\t%.4f\n" % (classifierName,
                                     np.mean(classifierPerformance),
                                     np.std(classifierPerformance))
        globalAccuracy += np.mean(classifierPerformance)
        print(line)

        # append the summary line; `with` guarantees the handle is closed
        # (the original used open()/close() without error protection)
        with open(folderName + "results.txt", 'a') as fo:
            fo.write(line)

        # save most important features for the classifier
        with open(os.path.join(folderName, classifierName + ".csv"),
                  "w") as fp:
            fp.write("feature,frequencyInTop" + str(numberOfTopFeatures) +
                     "\n")
            listOfClassifierTopFeatures = sorted(
                classifierTopFeatures.items(), key=lambda x: x[1],
                reverse=True)
            for feature, frequency in listOfClassifierTopFeatures:
                fp.write(str(biomarkerNames[feature]) + "," +
                         str(float(frequency / numberOfFolds)) + "\n")

    # save most important features overall (only the top N rows)
    with open(
            os.path.join(folderName,
                         "global_" + str(int(globalIndex)) + ".csv"),
            "w") as fp:
        fp.write("feature,frequencyInTop" + str(numberOfTopFeatures) + "\n")
        listOfTopFeatures = sorted(topFeatures.items(), key=lambda x: x[1],
                                   reverse=True)
        for feature, frequency in listOfTopFeatures[:numberOfTopFeatures]:
            fp.write(str(biomarkerNames[feature]) + "," +
                     str(float(frequency / numberOfFolds)) + "\n")

    # Average over however many classifiers actually ran; the original
    # divided by a hard-coded 8, which silently skews the result whenever
    # classifierList is edited.
    globalAccuracy = globalAccuracy / len(classifierList)
    return globalAccuracy
t0 = time() pred = clf.predict(X_test_counts) test_time = time() - t0 test_times.append(test_time) print("test time: %0.3fs" % test_time) score = metrics.accuracy_score(y_test, pred) scores.append(score) print("accuracy: %0.3f" % score) results = [] for clf, name in ( (RidgeClassifier(tol=1e-2, solver="sag",max_iter=1200000), "Ridge Classifier"), (Perceptron(max_iter=50), "Perceptron"), (PassiveAggressiveClassifier(max_iter=1200000), "Passive-Aggressive"), (KNeighborsClassifier(n_neighbors=10), "kNN")): print('=' * 80) print(name) results.append(benchmark(clf)) for penalty in ["l2", "l1"]: print('=' * 80) print("%s penalty" % penalty.upper()) # Train Liblinear model results.append(benchmark(LinearSVC(penalty=penalty, dual=False, tol=1e-3))) # Train SGD model results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=1200000, penalty=penalty)))
from sklearn.ensemble import AdaBoostClassifier from sklearn.ensemble import BaggingClassifier from sklearn.ensemble import ExtraTreesClassifier from sklearn.ensemble import GradientBoostingClassifier from sklearn.ensemble import RandomForestClassifier from sklearn.linear_model import LogisticRegression from sklearn.linear_model import PassiveAggressiveClassifier from sklearn.linear_model import RidgeClassifier from sklearn.linear_model import RidgeClassifierCV from sklearn.linear_model import SGDClassifier from sklearn.neural_network import MLPClassifier from sklearn.tree import DecisionTreeClassifier from sklearn.tree import ExtraTreeClassifier classifiers_list = { 'AdaBoostClassifier': AdaBoostClassifier(), 'BaggingClassifier': BaggingClassifier(), 'ExtraTreesClassifier': ExtraTreesClassifier(), 'GradientBoostingClassifier': GradientBoostingClassifier(), 'RandomForestClassifier': RandomForestClassifier(), 'LogisticRegression': LogisticRegression(), 'PassiveAggressiveClassifier': PassiveAggressiveClassifier(), 'RidgeClassifier': RidgeClassifier(), 'RidgeClassifierCV': RidgeClassifierCV(), 'SGDClassifier': SGDClassifier(), 'MLPClassifier': MLPClassifier(), 'DecisionTreeClassifier': DecisionTreeClassifier(), 'ExtraTreeClassifier': ExtraTreeClassifier() }