# ## Perceptrón # Un viejo conocido, en este caso también tiene opción de regularización. He probado, como se puede ver más abajo, con y sin ella (norma l2). # # - Itera por todos los datos. # - Si está bien situado para el dato dado, no cambia. # - Si no lo está, se corrige. # - Si no cambia en una pasada completa o llega a las iteraciones máximas, para. # In[7]: # Modelo Perceptrón from sklearn.linear_model import Perceptron models = [ Perceptron(random_state=1), Perceptron(alpha=0.0001, penalty='l2', random_state=1), Perceptron(alpha=0.00025, penalty='l2', random_state=1), Perceptron(alpha=0.0004, penalty='l2', random_state=1), ] model_strings = [ 'Perceptron sin regularización', 'Perceptron alpha = 0.0001', 'Perceptron alpha = 0.00025', 'Perceptron alpha = 0.0004', ] print(validate_models(model_strings, models, X_train, Y_train)) input('\nPulse cualquier tecla para continuar\n')
def test_predict_proba(create_X_y):
    """Fit LCA on a pool holding the same fitted Perceptron twice."""
    features, labels = create_X_y
    base_clf = Perceptron()
    base_clf.fit(features, labels)
    pool = [base_clf, base_clf]
    LCA(pool).fit(features, labels)
def classify2(X, Y, classifier, X_test, Y_test):
    """Fit one named classifier and print its test accuracy as a percentage.

    ``classifier`` is indexed as ``(name, estimator)``: element 0 is printed,
    element 1 is fitted on ``X``/``Y`` and used to predict ``X_test``.
    """
    label, model = classifier[0], classifier[1]
    print("training %s" % label)
    model.fit(X, Y)
    predictions = model.predict(X_test)
    hit_rate = np.mean(predictions == Y_test)
    print(hit_rate * 100)


# define different classifiers
classifiers = [
    ("KNneighbors", KNeighborsClassifier(n_neighbors=3)),
    ("SVM", svm.SVC()),
    ("SAG", LogisticRegression(solver='sag', tol=1e-1)),
    ("SGD", SGDClassifier()),
    ("ASGD", SGDClassifier(average=True)),
    ("Perceptron", Perceptron()),
    ("Passive-Aggressive I",
     PassiveAggressiveClassifier(loss='hinge', C=1.0)),
    ("Passive-Aggressive II",
     PassiveAggressiveClassifier(loss='squared_hinge', C=1.0)),
]

##data_test=np.array(1)
##for i in range(10):
##    if np.array_equal(data_test,np.array(1)):
##        cur=np.load('data_test%d_10_7.npy' % i)
##        shape=[1]+list(cur.shape)
##        data_test=np.reshape(cur,shape)
##    else:
##        cur=np.load('data_test%d_10_7.npy' % i)
##        shape=[1]+list(cur.shape)
##        data_test=np.append(data_test,np.reshape(cur,shape),axis=0)
def main():
    """Run the classifiers selected on the command line on the WDBC data.

    Parses the arguments, loads the CSV named by ``args.file``, splits it
    into train/validation/test sets, optionally normalizes and plots it, and
    then runs every classifier whose flag is set on ``args``.
    """
    args = cmd_line.parse_args()
    util.prefix_init(args)
    util.pre_dataset = "wdbc"
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)
    log.configure_logger(logger, util.pre_dataset)
    logger.info("--- WDBC dataset ---")

    # Column 1 is converted to a number: 'M' becomes 1.0, anything else 0.0.
    # NOTE(review): depending on the NumPy version/encoding, genfromtxt may
    # pass bytes to converters, in which case x=='M' never matches - confirm.
    data = np.genfromtxt(args.file, delimiter=",",
                         converters={1: lambda x: 1.0 if x=='M' else 0.0})
    Y = data[:, 1].astype(int)
    X = data[:, features_to_use()]

    X, Y, X_valid, Y_valid, X_test, Y_test = \
        data_util.split_into_train_test_sets(X, Y, args.validation_portion,
                                             args.test_portion)
    logger.debug("%s %s", X.shape, X_test.shape)

    if args.normalize:
        logger.info("Normalizing...")
        util.pre_norm = "n"
        X, X_valid, X_test = data_util.normalize_all(X, X_valid, X_test)

    if args.draw_classes_histogram:
        draw_classes_histogram(X, Y)

    if args.draw_classes_data:
        util.draw_classes_data(X, Y, 5, 6)

    if args.bayes:
        logger.info("Bayes classifier...")
        util.pre_alg = "bayes"
        from ml_lib.gaussian_plugin_classifier import GaussianPlugInClassifier
        # Gaussian plug-in classifier
        gpi_classifier = GaussianPlugInClassifier(X, Y, 2)
        # util.report_accuracy(gpi_classifier.classify(X, Y, 0.5)[0])
        util.report_accuracy(
            gpi_classifier.classify(X_test, Y_test, [0.5, 0.5])[0])
        util.draw_ROC_curve(X_test, Y_test, gpi_classifier)
        # util.draw_classes_pdf(X, Y, gpi_classifier, [0.5, 0.5], 3)

    if args.naive:
        logger.info("Naive Bayes classifier...")
        util.pre_alg = "naive"
        from ml_lib.gaussian_naive_classifier import GaussianNaiveClassifier
        # Gaussian naive classifier
        gn_classifier = GaussianNaiveClassifier(X, Y, 2)
        # util.report_accuracy(gn_classifier.classify(X, Y, 0.5)[0])
        util.report_accuracy(
            gn_classifier.classify(X_test, Y_test, [0.5, 0.5])[0])
        util.draw_ROC_curve(X_test, Y_test, gn_classifier)

    if args.sklearn_perceptron:
        logger.info("Scikit-learn Perceptron...")
        util.pre_alg = "scikitperceptron"
        from sklearn.linear_model import Perceptron
        # tol=None: no tolerance-based stopping, only the max_iter cap.
        perceptron = Perceptron(tol=None, max_iter=300000)
        perceptron.fit(X, Y)
        # The score is computed on the same X, Y the model was fitted on.
        logger.info("Mean accuracy: %s%%", 100 * perceptron.score(X, Y))

    if args.perceptron:
        logger.info("Perceptron...")
        util.pre_alg = "perceptron"
        # Project-local Perceptron; the name is local to this function.
        from ml_lib.perceptron import Perceptron
        helper.classify_one_vs_one([], X, Y, X_test, Y_test, 2,
            lambda X, Y: Perceptron(X, Y, args.stochastic, 1, 30000, 0))

    if args.logistic:
        logger.info("Logistic Regression...")
        util.pre_alg = "logistic"
        from ml_lib.logistic import Logistic
        helper.classify_one_vs_one([], X, Y, X_test, Y_test, 2,
            lambda X, Y: Logistic(X, Y, step_size=0.001, max_steps=15000,
                                  reg_constant=1))

    if args.knn:
        logger.info("k-Nearest Neighbor...")
        util.pre_alg = "knn"
        from ml_lib.knn import KNN
        k_range = 10
        p_range = 6 # / 2.0
        # a_matrix[k, p]: accuracy for (k+1) neighbors and dist_p = (p+1)/2.
        a_matrix = np.zeros((k_range, p_range))
        for k in range(k_range):
            logger.info("%s-NN", k+1)
            for p in range(p_range):
                knn_classifier = KNN(X, Y, 1+k, dist_p=(p+1)/2.0)
                a_matrix[k, p] = util.get_accuracy(
                    knn_classifier.classify(X_test, Y_test))
        logger.info("%s", a_matrix)

    if args.svm:
        logger.info("Support Vector Machine...")
        util.pre_alg = "svm"
        from ml_lib.svm import SVM, RBFKernel

        # The four experiments below are toggled by hard-coded booleans,
        # all of which are currently False.
        single_svm_test = False
        if single_svm_test:
            cm = SVM(X, Y, lam=None).classify(X_test, Y_test)
            util.report_accuracy(cm)

        single_svm_rbf_test = False
        if single_svm_rbf_test:
            svm = SVM(X, Y, lam=100, kernel=RBFKernel(0.3))
            cm = svm.classify(X_test, Y_test)
            util.report_accuracy(cm)

        linear_svm_validation = False
        if linear_svm_validation:
            #lam_val = [math.pow(1.2, p) for p in range(-10,20)]
            lam_val = [p/2 for p in range(1,200)]
            acc = np.zeros(len(lam_val))
            # Evaluate each lambda on the validation split.
            for i, lam in enumerate(lam_val):
                svm_classifier = SVM(X, Y, lam)#), kernel=RBFKernel(1))
                #util.report_accuracy(svm_classifier.classify(X, Y))
                cm = svm_classifier.classify(X_valid, Y_valid)
                util.report_accuracy(cm)
                acc[i] = util.get_accuracy(cm)
            logger.info("\nAccuracies found for lambda:")
            for i, lam in enumerate(lam_val):
                logger.info("%f: \t%f", lam, acc[i])
            util.plot_accuracy(acc, lam_val)

        rbf_svm_validation = False
        if rbf_svm_validation:
            # Two passes: "b" uses 27 RBF widths and 7 lambdas, "l" uses
            # 27 lambdas and 7 RBF widths.
            for reps in range(2):
                pre_svm_cv_x = "b" if reps == 0 else "l"
                if pre_svm_cv_x == "b":
                    lam_val = [math.pow(1.5, p+1)*10 for p in range(7)]
                    b_val = [(p+1)/20 for p in range(27)]
                elif pre_svm_cv_x == "l":
                    lam_val = [math.pow(1.2, p+1)*10 for p in range(27)]
                    b_val = [(p+1)/10 for p in range(7)]
                logger.debug(lam_val)
                logger.debug(b_val)

                # Use a single instance so K matrix can be shared better
                single_svm = SVM(X)
                # svm=single_svm binds the instance as a default argument.
                lmbd_classifier = lambda X, Y, b, lam, svm=single_svm: \
                    svm.initialize(Y, lam, RBFKernel(b))
                cm, acc_2d_list = helper.classify([b_val, lam_val], X, Y,
                                                  X_valid, Y_valid,
                                                  lmbd_classifier)
                acc_matrix = np.array(acc_2d_list)
                logger.info("%s", acc_matrix)

                # Save the accuracy grid as CSV, then plot it.
                suff = "val_%s"%(pre_svm_cv_x)
                np.savetxt(util.prefix() + suff + ".csv", acc_matrix,
                           delimiter=",", fmt='%.3f')
                if pre_svm_cv_x == 'b':
                    util.plot_accuracies(acc_matrix.T, b_val, "RBF width b",
                                         lam_val, "Lambda (C)", suff)
                elif pre_svm_cv_x == 'l':
                    util.plot_accuracies(acc_matrix, lam_val, "Lambda (C)",
                                         b_val, "RBF width b", suff)
# Use only two columns of the iris data (indices 2 and 3) as features.
X = iris.data[:, [2, 3]]
y = iris.target  # labels are already encoded as 0, 1, 2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Scale the features to get the best out of the learning and optimization
# algorithms.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)  # estimate each feature's mean and standard deviation
sc.mean_  # inspect the per-feature means (notebook-style expression)
sc.scale_  # inspect the per-feature standard deviations
X_train_std = sc.transform(X_train)
# Note: the test set is standardized with the same (training) parameters so
# that the training and test sets stay comparable.
X_test_std = sc.transform(X_test)

# Train the perceptron model.
from sklearn.linear_model import Perceptron
# max_iter: number of passes (epochs) over the training data
# tol=None: disable tolerance-based early stopping so all 40 epochs run,
#           matching the removed ``n_iter=40`` argument
# eta0: learning rate
# random_state: seed, so the training data is shuffled identically each run
ppn = Perceptron(max_iter=40, tol=None, eta0=0.1, random_state=0)
ppn.fit(X_train_std, y_train)

# Classify the test set; this returns an array of predicted labels.
y_pred = ppn.predict(X_test_std)

# Accuracy of the model on the test set (the author reports 0.9).
accuracy_score(y_test, y_pred)
import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import Perceptron

iris = load_iris()
x = iris.data[:, (2, 3)]  # petal length and petal width
# Binary target: is it iris setosa? ``np.int`` was removed from NumPy, so
# the builtin ``int`` is used for the cast.
y = (iris.target == 0).astype(int)

# Per the original author's note, the defaults make this the same as an
# SGDClassifier with loss='perceptron', a constant learning rate, eta0=1
# and no penalty.
per_clf = Perceptron()
per_clf.fit(x, y)

y_pred = per_clf.predict([[2, 0.5]])
print(y_pred)  # [0]
from sklearn.metrics import accuracy_score
import numpy as np


def generate(count):
    """Generate ``count`` random applicants and their admission labels.

    Each sample is ``[math, physics, russian, disabled]`` where the three
    grades are random integers in 1..5 and ``disabled`` is 0 or 1. The label
    is 1 when the applicant is disabled, or when both math and physics are
    at least 4 and the three grades sum to at least 11; otherwise 0.

    Returns:
        A pair ``(x, y)`` of NumPy arrays: the samples and their labels.
    """
    x = []
    y = []
    for _ in range(count):
        # Local names avoid shadowing the stdlib ``math`` module name.
        math_grade = np.random.randint(1, 6)
        physics_grade = np.random.randint(1, 6)
        russian_grade = np.random.randint(1, 6)
        disabled = np.random.randint(0, 2)
        x.append([math_grade, physics_grade, russian_grade, disabled])

        math_plus = math_grade >= 4
        physics_plus = physics_grade >= 4
        sum_plus = math_grade + physics_grade + russian_grade >= 11
        y.append(1 if disabled == 1 or (math_plus and physics_plus and sum_plus) else 0)
    return np.array(x), np.array(y)


if __name__ == '__main__':
    X, y = generate(100)
    X_test, y_test = generate(20)

    perceptron = Perceptron(tol=0.0000001)
    perceptron.fit(X, y)

    predict = perceptron.predict(X_test)
    # The accuracy used to be computed and thrown away; report it, passing
    # the arguments in the documented (y_true, y_pred) order.
    print("Accuracy:", accuracy_score(y_test, predict))

    x_example = [[3, 3, 5, 1]]
    # Index the single prediction instead of using an array as a truth value.
    print("Passed" if perceptron.predict(x_example)[0] == 1 else "Not passed")
TRAIN_DATA_NUM = 2 TEST_DATA_NUM = 1000 #%% if __name__ == "__main__": dataset = Dataset() svm_erro = 0 sv_vs_pla = 0 sv_num = 0 pla_erro = 0 for _ in range(RUNS): dataset.create_target_function() dataset.generate_data(TRAIN_DATA_NUM) pla = Perceptron(max_iter=1000).fit( dataset.X, dataset.Y ) # if fit_intercept=False the learning algorithm will force y intercept at the origin 0 svm = SVM() svm.fit(dataset.X, dataset.Y) plot_contour(svm, dataset) dataset.generate_data(TEST_DATA_NUM) svm_predict = svm.predict(dataset.X) pla_predict = pla.predict(dataset.X) sv_num += len(svm.alphas[svm.sv]) svm_erro += sum(svm_predict != dataset.Y) / TEST_DATA_NUM pla_erro += sum(pla_predict != dataset.Y) / TEST_DATA_NUM if sum(svm_predict != dataset.Y) / TEST_DATA_NUM < sum( pla_predict != dataset.Y) / TEST_DATA_NUM:
non_negative=True) # Iterator over parsed Reuters SGML files. data_stream = stream_reuters_documents() # We learn a binary classification between the "acq" class and all the others. # "acq" was chosen as it is more or less evenly distributed in the Reuters # files. For other datasets, one should take care of creating a test set with # a realistic portion of positive instances. all_classes = np.array([0, 1]) positive_class = 'acq' # Here are some classifiers that support the `partial_fit` method partial_fit_classifiers = { 'SGD': SGDClassifier(), 'Perceptron': Perceptron(), 'NB Multinomial': MultinomialNB(alpha=0.01), 'Passive-Aggressive': PassiveAggressiveClassifier(), } def get_minibatch(doc_iter, size, pos_class=positive_class): """Extract a minibatch of examples, return a tuple X_text, y. Note: size is before excluding invalid docs with no topics assigned. """ data = [(u'{title}\n\n{body}'.format(**doc), pos_class in doc['topics']) for doc in itertools.islice(doc_iter, size) if doc['topics']] if not len(data): return np.asarray([], dtype=int), np.asarray([], dtype=int)
"Perceptron", # "XGBreglinear", # "XGBreglogistic", "NearestNeighbors", # "LinearSVM", "DecisionTree", "RandomForest", "AdaBoost", #"NeuralNet", #"NaiveBayes", #"LDA", #"QDA" ] classifiers = [ Perceptron(), # xgb.XGBClassifier(objective='reg:linear'), # xgb.XGBClassifier(objective='reg:logistic'), KNeighborsClassifier(10), # SVC(kernel="linear"), DecisionTreeClassifier(max_depth=5), RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1), AdaBoostClassifier(), #MLPClassifier(verbose=False), #GaussianNB(), #LinearDiscriminantAnalysis(), #QuadraticDiscriminantAnalysis() ] def getBestClassifiers(X, y, testPerc=0.4):
# Perceptron used to classify newsgroup documents.
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import f1_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from sklearn.linear_model import Perceptron

categories = ['rec.sport.hockey', 'rec.sport.baseball', 'rec.autos']
# Headers, footers and quoted replies are removed from every post.
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories,
                                      remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories,
                                     remove=('headers', 'footers', 'quotes'))

# Fit the TF-IDF vocabulary on the training posts only, then reuse it.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(newsgroups_train['data'])
X_test = vectorizer.transform(newsgroups_test['data'])

# ``n_iter`` was removed from scikit-learn; ``max_iter=100`` with
# ``tol=None`` keeps the original meaning of exactly 100 epochs.
classifier = Perceptron(max_iter=100, tol=None, eta0=0.1)
classifier.fit(X_train, newsgroups_train['target'])
predictions = classifier.predict(X_test)
print(classification_report(newsgroups_test['target'], predictions))
# Iterator over parsed Reuters SGML files. data_stream = stream_reuters_documents() # We learn a binary classification between the "acq" class and all the others. # "acq" was chosen as it is more or less evenly distributed in the Reuters # files. For other datasets, one should take care of creating a test set with # a realistic portion of positive instances. all_classes = np.array([0, 1]) positive_class = 'acq' # Here are some classifiers that support the `partial_fit` method partial_fit_classifiers = { 'SGD': SGDClassifier(max_iter=5), 'Perceptron': Perceptron(tol=1e-3), 'NB Multinomial': MultinomialNB(alpha=0.01), 'Passive-Aggressive': PassiveAggressiveClassifier(tol=1e-3), } def get_minibatch(doc_iter, size, pos_class=positive_class): """Extract a minibatch of examples, return a tuple X_text, y. Note: size is before excluding invalid docs with no topics assigned. """ data = [(u'{title}\n\n{body}'.format(**doc), pos_class in doc['topics']) for doc in itertools.islice(doc_iter, size) if doc['topics']] if not len(data):
def get_model_from_name(model_name, training_params=None):
    """Return an estimator for ``model_name`` with its parameters applied.

    Stock parameters for the model (if any) are looked up, overwritten by
    ``training_params`` when given, and applied with ``set_params``.

    Args:
        model_name: Key identifying the model, e.g. ``'Perceptron'``. Names
            starting with ``'DeepLearning'`` trigger a lazy Keras import.
        training_params: Optional dict of parameters that override the
            stock defaults.

    Returns:
        The estimator instance after ``set_params(**model_params)``.

    Raises:
        KeyError: If ``model_name`` is not a known or available model.
    """
    global keras_imported

    # For Keras
    epochs = 250
    if os.environ.get('is_test_suite', 0) == 'True' and model_name[:12] == 'DeepLearning':
        print('Heard that this is the test suite. Limiting number of epochs, which will increase training speed dramatically at the expense of model accuracy')
        epochs = 30

    all_model_params = {
        'LogisticRegression': {'n_jobs': -2},
        'RandomForestClassifier': {'n_jobs': -2},
        'ExtraTreesClassifier': {'n_jobs': -1},
        'AdaBoostClassifier': {'n_estimators': 10},
        'SGDClassifier': {'n_jobs': -1},
        'Perceptron': {'n_jobs': -1},
        'LinearSVC': {'dual': False},
        'LinearRegression': {'n_jobs': -2},
        'RandomForestRegressor': {'n_jobs': -2},
        'LinearSVR': {'dual': False, 'loss': 'squared_epsilon_insensitive'},
        'ExtraTreesRegressor': {'n_jobs': -1},
        'MiniBatchKMeans': {'n_clusters': 8},
        'GradientBoostingRegressor': {'presort': False, 'learning_rate': 0.05, 'warm_start': True},
        'GradientBoostingClassifier': {'presort': False, 'learning_rate': 0.05, 'warm_start': True},
        'SGDRegressor': {'shuffle': False},
        'PassiveAggressiveRegressor': {'shuffle': False},
        'AdaBoostRegressor': {'n_estimators': 10},
        'XGBRegressor': {'nthread':-1, 'n_estimators': 200},
        'XGBClassifier': {'nthread':-1, 'n_estimators': 200},
        'LGBMRegressor': {'n_estimators': 2000, 'learning_rate': 0.05, 'num_leaves': 8, 'lambda_l2': 0.001},
        'LGBMClassifier': {'n_estimators': 2000, 'learning_rate': 0.05, 'num_leaves': 8, 'lambda_l2': 0.001},
        'DeepLearningRegressor': {'epochs': epochs, 'batch_size': 50, 'verbose': 2},
        'DeepLearningClassifier': {'epochs': epochs, 'batch_size': 50, 'verbose': 2},
        'CatBoostRegressor': {},
        'CatBoostClassifier': {}
    }

    model_params = all_model_params.get(model_name, None)
    if model_params is None:
        model_params = {}

    if training_params is not None:
        print('Now using the model training_params that you passed in:')
        print(training_params)
        # Overwrite our stock params with what the user passes in (i.e., if the user wants 10,000 trees, we will let them do it)
        model_params.update(training_params)
        print('After overwriting our defaults with your values, here are the final params that will be used to initialize the model:')
        print(model_params)

    model_map = {
        # Classifiers
        'LogisticRegression': LogisticRegression(),
        'RandomForestClassifier': RandomForestClassifier(),
        'RidgeClassifier': RidgeClassifier(),
        'GradientBoostingClassifier': GradientBoostingClassifier(),
        'ExtraTreesClassifier': ExtraTreesClassifier(),
        'AdaBoostClassifier': AdaBoostClassifier(),
        'SGDClassifier': SGDClassifier(),
        'Perceptron': Perceptron(),
        'PassiveAggressiveClassifier': PassiveAggressiveClassifier(),
        'LinearSVC': LinearSVC(),

        # Regressors
        'LinearRegression': LinearRegression(),
        'RandomForestRegressor': RandomForestRegressor(),
        'Ridge': Ridge(),
        'LinearSVR': LinearSVR(),
        'ExtraTreesRegressor': ExtraTreesRegressor(),
        'AdaBoostRegressor': AdaBoostRegressor(),
        'RANSACRegressor': RANSACRegressor(),
        'GradientBoostingRegressor': GradientBoostingRegressor(),
        'Lasso': Lasso(),
        'ElasticNet': ElasticNet(),
        'LassoLars': LassoLars(),
        'OrthogonalMatchingPursuit': OrthogonalMatchingPursuit(),
        'BayesianRidge': BayesianRidge(),
        'ARDRegression': ARDRegression(),
        'SGDRegressor': SGDRegressor(),
        'PassiveAggressiveRegressor': PassiveAggressiveRegressor(),

        # Clustering
        'MiniBatchKMeans': MiniBatchKMeans()
    }

    # Optional libraries are only registered when their import succeeded.
    if xgb_installed:
        model_map['XGBClassifier'] = XGBClassifier()
        model_map['XGBRegressor'] = XGBRegressor()

    if lgb_installed:
        model_map['LGBMRegressor'] = LGBMRegressor()
        model_map['LGBMClassifier'] = LGBMClassifier()

    if catboost_installed:
        model_map['CatBoostRegressor'] = CatBoostRegressor(calc_feature_importance=True)
        model_map['CatBoostClassifier'] = CatBoostClassifier(calc_feature_importance=True)

    if model_name[:12] == 'DeepLearning':
        if not keras_imported:
            # Suppress some level of logs if TF is installed (but allow it to not be installed, and use Theano instead)
            try:
                os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '3'
                os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
                from tensorflow import logging
                logging.set_verbosity(logging.INFO)
            except Exception:
                # Best effort only: TensorFlow may be missing or expose a
                # different logging API. Unlike a bare ``except``, this
                # lets KeyboardInterrupt/SystemExit propagate.
                pass

            # Keras is imported lazily and published as module globals so
            # later calls and sibling functions can use the names.
            global maxnorm
            global Dense, Dropout
            global LeakyReLU, PReLU
            global Sequential
            global keras_load_model
            global regularizers
            global KerasRegressor, KerasClassifier
            from keras.constraints import maxnorm
            from keras.layers import Dense, Dropout
            from keras.layers.advanced_activations import LeakyReLU, PReLU
            from keras.models import Sequential
            from keras.models import load_model as keras_load_model
            from keras import regularizers
            from keras.wrappers.scikit_learn import KerasRegressor, KerasClassifier
            # Bare references; presumably kept so linters do not flag the
            # imports above as unused.
            maxnorm
            Dense
            Dropout
            LeakyReLU
            PReLU
            Sequential
            keras_load_model
            regularizers
            KerasRegressor
            KerasClassifier
            keras_imported = True

        model_map['DeepLearningClassifier'] = KerasClassifier(build_fn=make_deep_learning_classifier)
        model_map['DeepLearningRegressor'] = KerasRegressor(build_fn=make_deep_learning_model)

    try:
        model_without_params = model_map[model_name]
    except KeyError:
        print('It appears you are trying to use a library that is not available when we try to import it, or using a value for model_names that we do not recognize')
        # Re-raise the same KeyError with its original traceback.
        raise

    model_with_params = model_without_params.set_params(**model_params)

    return model_with_params
def test_predict_proba():
    """Fit DESP on a pool holding the same fitted Perceptron twice."""
    features, labels = X_dsel_ex1, y_dsel_ex1
    base_clf = Perceptron()
    base_clf.fit(features, labels)
    pool = [base_clf, base_clf]
    DESP(pool).fit(features, labels)
# Logistic regression CV takes too much time to compile
#print "\nUsing Logistic regression CV"
#clf_LGCV = LogisticRegressionCV()
#scores = cross_val_score(clf_LGCV, feature_normal, labels, cv=10, n_jobs = 4)
#print scores
#print "Accuracy", scores.mean()

# Every section below runs 10-fold cross-validation with 4 parallel jobs and
# prints the per-fold scores followed by their mean. The prints use the
# single-argument call form, which produces the same output on Python 2 and
# is valid syntax on Python 3.
print("\nUsing MLPClassifier single hidden layer")
mlp = MLPClassifier(alpha=1)
scores = cross_val_score(mlp, feature_normal, labels, cv=10, n_jobs=4)
print(scores)
print("Accuracy %s" % scores.mean())

print("\nUsing the perceptron")
# ``n_iter`` was removed from scikit-learn; ``max_iter=10`` with ``tol=None``
# keeps the meaning of exactly 10 epochs.
per = Perceptron(fit_intercept=False, max_iter=10, tol=None, shuffle=False)
scores = cross_val_score(per, feature_normal, labels, cv=10, n_jobs=4)
print(scores)
print("Accuracy %s" % scores.mean())

print("\nUsing MLPClassifier 3 hidden layer")
mlp = MLPClassifier(hidden_layer_sizes=(30, 30, 30))
scores = cross_val_score(mlp, feature_normal, labels, cv=10, n_jobs=4)
print(scores)
print("Accuracy %s" % scores.mean())

print("\nUsing Passive aggressive Classifier")
pac = PassiveAggressiveClassifier()
scores = cross_val_score(pac, feature_normal, labels, cv=10, n_jobs=4)
print(scores)
print("Accuracy %s" % scores.mean())
# Accuracy (in percent) of the section that precedes this one.
accuracy = total_correct_predictions / total_predictions_made * 100

###################### Passive Aggressive ###########################--Code from ASTD
# ``n_iter`` was removed from scikit-learn; ``max_iter=100`` with
# ``tol=None`` keeps the meaning of exactly 100 epochs.
classifier = PassiveAggressiveClassifier(max_iter=100, tol=None)
classifier.fit(X_train, Y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_test, y_pred)
# Correct predictions are the diagonal of the confusion matrix; np.trace
# sums it for any number of classes instead of assuming exactly three.
total_correct_predictions = np.trace(cm)
total_predictions_made = np.sum(cm)
accuracy = total_correct_predictions / total_predictions_made * 100

###################### Perceptron ###################################--Code from ASTD
classifier = Perceptron(max_iter=100, tol=None)
classifier.fit(X_train, Y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_test, y_pred)
total_correct_predictions = np.trace(cm)
total_predictions_made = np.sum(cm)
accuracy = total_correct_predictions / total_predictions_made * 100

###################### bnb ###########################################--Code from ASTD
classifier = BernoulliNB(binarize=0.5)
classifier.fit(X_train, Y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)
# Load iris and keep two feature columns (indices 2 and 3).
iris = datasets.load_iris()
X, y = iris.data[:, [2, 3]], iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

# Standardize both splits with statistics estimated on the training split.
sc = StandardScaler()
sc.fit(X_train)
X_train_std, X_test_std = sc.transform(X_train), sc.transform(X_test)

# NOTE(review): n_iter_no_change is the early-stopping patience, not the
# number of epochs - confirm that max_iter was not intended here.
ppn = Perceptron(n_iter_no_change=40, eta0=0.1, random_state=0)
ppn.fit(X_train_std, y_train)
# y_pred = ppn.predict(X_test_std)
# print(f'Misclassified samples: {(y_test != y_pred).sum()}')

# Stack train and test back together for plotting; the test samples occupy
# rows 105..149 of the combined arrays.
X_combined_std = np.vstack([X_train_std, X_test_std])
y_combined = np.hstack([y_train, y_test])
plot_decision_regions(
    X=X_combined_std,
    y=y_combined,
    classifier=ppn,
    test_idx=range(105, 150),
)
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.show()
# Pull the raw text out of the parser output, optionally dropping English
# stop words after tokenizing.
if USE_NTLK_TOKENIZER:
    stopWords: set = set(stopwords.words('english'))
    trainingX: list = [
        ' '.join(word for word in word_tokenize(userData.data) if word not in stopWords)
        for userData in trainingData
    ]
    testX: list = [
        ' '.join(word for word in word_tokenize(userData.data) if word not in stopWords)
        for userData in testData
    ]
else:
    trainingX: list = [userData.data for userData in trainingData]
    testX: list = [userData.data for userData in testData]

# Target labels for the two tasks.
trainingYGender: list = [userData.gender for userData in trainingData]
testYGender: list = [userData.gender for userData in testData]
trainingYAge: list = [userData.age for userData in trainingData]
testYAge: list = [userData.age for userData in testData]

# A single classifier instance is shared by every benchmark below.
defaultClassifier = Perceptron()

print('INFO: Benchmarking Count Vectorizer (1/2)...', flush = True)
# Word-level n-grams, benchmarked in this order on the gender labels.
for benchName, ngramRange in (
        ('CountVectorizer.Ngram12.Word', (1, 2)),
        ('CountVectorizer.Ngram23.Word', (2, 3)),
        ('CountVectorizer.Ngram34.Word', (3, 4)),
):
    BenchmarkVectorizer(
        benchName,
        CountVectorizer(max_features = FEATURES_COUNT, ngram_range = ngramRange, analyzer = 'word'),
        defaultClassifier,
    ).run(timeStr, trainingX, trainingYGender, testX, testYGender)

print('INFO: Benchmarking Count Vectorizer (2/2)...', flush = True)
# Character-level n-grams, same procedure.
for benchName, ngramRange in (
        ('CountVectorizer.Ngram34.Char', (3, 4)),
        ('CountVectorizer.Ngram45.Char', (4, 5)),
        ('CountVectorizer.Ngram56.Char', (5, 6)),
):
    BenchmarkVectorizer(
        benchName,
        CountVectorizer(max_features = FEATURES_COUNT, ngram_range = ngramRange, analyzer = 'char'),
        defaultClassifier,
    ).run(timeStr, trainingX, trainingYGender, testX, testYGender)

print('INFO: Benchmarking TFIDF Vectorizer (1/2)...', flush = True)
# 5.4 k-Nearest Neighbors algorithm (or k-NN for short) knn = KNeighborsClassifier(n_neighbors = 3) knn.fit(X_train, Y_train) Y_pred = knn.predict(X_test) acc_knn = round(knn.score(X_train, Y_train) * 100, 2) acc_knn # 5.5 Gaussian Naive Bayes gaussian = GaussianNB() gaussian.fit(X_train, Y_train) Y_pred = gaussian.predict(X_test) acc_gaussian = round(gaussian.score(X_train, Y_train) * 100, 2) acc_gaussian # 5.6 Perceptron perceptron = Perceptron() perceptron.fit(X_train, Y_train) Y_pred = perceptron.predict(X_test) acc_perceptron = round(perceptron.score(X_train, Y_train) * 100, 2) acc_perceptron # 5.7 Linear SVC linear_svc = LinearSVC() linear_svc.fit(X_train, Y_train) Y_pred = linear_svc.predict(X_test) acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2) acc_linear_svc # 5.8 Stochastic Gradient Descent sgd = SGDClassifier() sgd.fit(X_train, Y_train)
def test_predict_proba(example_estimate_competence):
    """Fit DESClustering on a pool holding the same fitted Perceptron twice."""
    features, labels = example_estimate_competence[0:2]
    base_clf = Perceptron()
    base_clf.fit(features, labels)
    pool = [base_clf, base_clf]
    DESClustering(pool).fit(features, labels)
#path_test="./test/train_sample_100k.txt"
# The doubled separator before "MlCC" is written as a valid escape now; the
# original ``\M`` was an invalid escape sequence. The resulting path strings
# are unchanged.
path_train="D:\\PhD\\Clone\\\\MlCC\\train_samples\\train_equal_cloneNonClone.txt"
# The second assignment wins; the one above is kept as a manual toggle.
path_train="D:\\PhD\\Clone\\\\MlCC\\train_samples\\train_sample_100k.txt"
path_test="D:\\PhD\\Clone\\\\MlCC\\train_samples\\train_sample_100k.txt"

colNames=["block1", "block2", "isClone", "COMP", "NOCL", "NOS", "HLTH", "HVOC", "HEFF", "HBUG", "CREF", "XMET", "LMET", "NLOC", "NOC", "NOA", "MOD", "HDIF", "VDEC", "EXCT", "EXCR", "CAST", "TDN", "HVOL", "NAND", "VREF", "NOPR", "MDN", "NEXP", "LOOP"]

# Test set: column 2 ("isClone") is the label, columns 3..29 the features.
clones_test = pd.read_csv(path_test, names=colNames)
array = clones_test.values
X_test = array[:,3:30]
Y_test = array[:,2]
print("test loaded")

chunkSize=1024
#clf=SGDClassifier()
#clf=PassiveAggressiveClassifier()
clf=Perceptron()

# partial_fit needs the full label set on every call. Deriving it from each
# chunk breaks as soon as a chunk holds a single class, so the boolean label
# set is fixed up front (equal to the per-chunk value whenever both classes
# are present).
all_classes = numpy.array([False, True])

# Train incrementally, one CSV chunk at a time.
for chunk in pd.read_csv(path_train, names=colNames, chunksize=chunkSize):
    chunk = chunk.sample(frac=1).reset_index(drop=True)  # shuffle data
    array = chunk.values
    X_train = array[:, 3:30]
    Y_train = array[:, 2]
    start_time = time.time()
    model =clf.partial_fit(X_train,Y_train,classes=all_classes)
    end_time=time.time()
    print("one chunk complete")

filename = 'sgd_model.sav'
# Use a context manager so the file handle is closed (and flushed) reliably.
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)
print("model saved")
# load the model from disk
def test_not_clustering_algorithm(create_X_y):
    """Expect ValueError when a Perceptron is passed as ``clustering``."""
    features, labels = create_X_y
    not_a_clusterer = Perceptron()
    selector = DESClustering(clustering=not_a_clusterer)
    with pytest.raises(ValueError):
        selector.fit(features, labels)
# Read train data: the first CSV column is the class, the other two are
# the features.
train_data = pandas.read_csv('perceptron-train.csv',
                             names=['Class', 'Sign1', 'Sign2'])
train_class = train_data['Class']
train_signs = train_data.drop(['Class'], axis=1, inplace=False)

# Read test data (same layout).
test_data = pandas.read_csv('perceptron-test.csv',
                            names=['Class', 'Sign1', 'Sign2'])
test_class = test_data['Class']
test_signs = test_data.drop(['Class'], axis=1, inplace=False)

# Train the Perceptron on the raw features.
clf = Perceptron(random_state=241)
clf.fit(train_signs, train_class)

# Check the prediction accuracy for non-normalized data.
predictions = clf.predict(test_signs)
accuracy = sklearn.metrics.accuracy_score(test_class, predictions)
print('Non-normalized data ', accuracy, '\n')

# Check the prediction accuracy for normalized data.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Normalize train and test data: fit on the training features only, then
# apply the same transform to the test features.
train_signs_scaled = scaler.fit_transform(train_signs)
test_signs_scaled = scaler.transform(test_signs)
# Data split for training and testing (stratified on the labels)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=1, stratify=y)

# Scaling: the scaler is fitted on the training data only, and the same
# transform is then applied to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

# Creating perceptron with hyperparameters
ppn = Perceptron(max_iter=40, eta0=0.45, random_state=1)

# This is training the model
ppn.fit(X_train_std, y_train)

# The scaler is deliberately NOT re-fitted on X_test here. Doing so would
# standardize the test data with different statistics than the ones the
# model was trained on and leak test-set information into preprocessing.
# X_test_std from above is already correctly scaled.

# Testing the model data
y_pred = ppn.predict(X_test_std)

# View the predict test data
y_pred

# View model accuracy
# Stratified 70/30 split, then report the label distribution of each part.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1, stratify=y)

print('Labels counts in y:', np.bincount(y))
print('Labels counts in y_train:', np.bincount(y_train))
print('Labels counts in y_test:', np.bincount(y_test))

# Standardize both splits with statistics estimated on the training split.
sc = StandardScaler()
sc.fit(X_train)
X_train_std, X_test_std = sc.transform(X_train), sc.transform(X_test)

ppn = Perceptron(max_iter=4, eta0=0.1, random_state=1)
ppn.fit(X_train_std, y_train)
y_pred = ppn.predict(X_test_std)

# Report the error count and the accuracy (computed two equivalent ways).
print('Misclassified samples: %d' % (y_test != y_pred).sum())
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
print('Accuracy std: %.2f' % ppn.score(X_test_std, y_test))

# Stack train and test back together for plotting; the test samples occupy
# rows 105..149 of the combined arrays.
X_combined_std = np.vstack([X_train_std, X_test_std])
y_combined = np.hstack([y_train, y_test])
plot_decision_regions(
    X=X_combined_std,
    y=y_combined,
    classifier=ppn,
    test_idx=range(105, 150),
)
plt.xlabel('petal length [standardized]')
"""
perceptron : the simplest artificial neural network architecture
TLU (threshold logic unit) : the perceptron is based on the TLU artificial
neuron; it computes a weighted sum of its inputs and then applies a step
function to that sum to produce its output
"""
import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import Perceptron

iris = load_iris()
X = iris.data[:, (2, 3)]
# Binary target: 1 where the class index is 0. ``np.int`` was removed from
# NumPy, so the builtin ``int`` is used for the cast.
y = (iris.target == 0).astype(int)

per_Clf = Perceptron()
per_Clf.fit(X, y)

# Predictions on the training data itself.
y_pred = per_Clf.predict(X)
print(y_pred)
'''
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0]
'''
print("density: %f" % density(clf.coef_)) print("classification report:") print(metrics.classification_report(y_test, pred, target_names=categories)) print("confusion matrix:") print(metrics.confusion_matrix(y_test, pred)) print() clf_descr = str(clf).split('(')[0] return clf_descr, score, train_time, test_time results = [] for clf, name in ((RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"), (Perceptron(n_iter=50), "Perceptron"), (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"), (KNeighborsClassifier(n_neighbors=10), "kNN")): print('=' * 80) print(name) results.append(benchmark(clf)) for penalty in ["l2", "l1"]: print('=' * 80) print("%s penalty" % penalty.upper()) # Train Liblinear model results.append( benchmark(LinearSVC(loss='l2', penalty=penalty, dual=False, tol=1e-3))) # Train SGD model
# import pandas as pd # from sklearn.model_selection import train_test_split # Initialize relevant models # Note: PasiveAggressive and MLP not supported by AdaBoost models = [ None, DecisionTreeClassifier(criterion='entropy', splitter='best', max_features=None), BernoulliNB(), LogisticRegression(solver='lbfgs', multi_class='multinomial', penalty='l2', C=1.0), Perceptron(penalty='l1', alpha=0.0001), SVC(kernel='rbf', probability=True) ] algorithms = ['SAMME.R', 'SAMME'] # USER INPUTS ############################################# modelIndex = 1 # Default is DecisionTree n_estimators = 100 # Default is 50 algorithmIndex = 0 # Use 1 for Perceptron, 0 for all others # Choose number of training images to use NUM_TRAINING_IMAGES = 500
# Copy the first 10 target values into the plain list `foo`.
foo.extend(target[:10].values.ravel())

#splitting data - DO NOT RUN WITH FULL SET UNLESS YOU HAVE SEVERAL DAYS TO SPARE!
#x_train, x_test, y_train, y_test = train_test_split(augmented_input, target, test_size = test_size, random_state = random_state)

#splitting much smaller data
x_train, x_test, y_train, y_test = train_test_split(
    augmented_input[0:10], foo,
    test_size = test_size, random_state = random_state)

n_iter =10  #iterations
eta0 =0.1  #learning rate
# NOTE(review): n_iter is passed as n_iter_no_change (the early-stopping
# patience), not as the epoch limit max_iter - confirm this is intended.
perc = Perceptron(
    n_iter_no_change=n_iter,
    eta0=eta0,
    random_state=random_state,
)
perc.fit(x_train,y_train)

#predictions (change to desired test)
y_pred = perc.predict(x_test)
print("accuracy: {0:.2f}%". format(accuracy_score(y_test, y_pred)*100))

#n = 100
#change target values into something sklearn likes
foo = list(target[:100].values.ravel())
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.model_selection import cross_val_score

# Set random state and number of estimators for tree based models
random_state = 4
n_estimators = 100

# Candidate classifiers. Every model that is given a random_state shares
# the same seed; KNeighborsClassifier and GaussianNB are built with
# defaults only.
models = [
    LogisticRegression(random_state=random_state),
    Perceptron(random_state=random_state),
    SGDClassifier(random_state=random_state),
    SVC(random_state=random_state),
    KNeighborsClassifier(),
    GaussianNB(),
    DecisionTreeClassifier(random_state=random_state),
    RandomForestClassifier(random_state=random_state, n_estimators=n_estimators),
    ExtraTreesClassifier(random_state=random_state, n_estimators=n_estimators),
    AdaBoostClassifier(random_state=random_state, n_estimators=n_estimators),
    GradientBoostingClassifier(random_state=random_state, n_estimators=n_estimators)
]

# Lists to store the results
model_name = []