def mnist_digit_recognition():
    train_set, test_set = load_mnist_dataset()
    n_labels = 10  # digits 0-9
    n_features = 28 * 28
    draw_ex_images(5, 4, train_set[0].shape[0], train_set[0])

    mnist_model = GaussianNaiveBayes(n_labels, n_features)
    start = time.time()
    mnist_model.train(train_set[0], train_set[1])
    end = time.time()
    print(end - start)
    mnist_model.save_model()

    mean, var, pi = mnist_model.get_parameters()
    print(f"Model parameters: mean {mean}, var {var}, pi {pi}")

    test_data, labels = test_set
    limit = 150
    test_data, labels = test_data[:limit], labels[:limit]
    results = np.arange(limit, dtype=int)  # np.int was removed in NumPy >= 1.24
    for n in range(limit):
        results[n] = mnist_model.classify(test_data[n])
        print(f"{n} : predicted {results[n]}, correct {labels[n]}")
    print("recognition rate: ", (results == labels).mean())
def test_bayes():
    iris = load_iris()
    # DataFrame input required
    df = pd.DataFrame(data=np.c_[iris['data'], iris['target']],
                      columns=iris['feature_names'] + ['target'])
    X, y = df.drop(columns='target'), df[['target']]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=1, stratify=y, shuffle=True)
    clas = GaussianNaiveBayes()
    clas.fit(X_train, y_train)
    pred = clas.predict(X_test)
    # f1_score expects (y_true, y_pred)
    print("F1: ", f1_score(y_test, pred, average='micro'))
def figure_1(train_set_results, X_test, y_test):
    """Learning curves: best SVM vs GNB."""
    # pick the SVM configuration with the highest mean test score
    best_SVM_params = max(
        [(statistics.mean(group_results['test_score']),
          group_results['estimator'][0].best_params_)
         for group_results in train_set_results['tf']['plain'].values()],
        key=lambda x: x[0])
    best_SVM = Pipeline([("estimator", SVC(max_iter=1000000))])
    best_SVM.set_params(**best_SVM_params[1])
    plot_learning_curves_macro(best_SVM, GaussianNaiveBayes(), "SVM", "GNB",
                               "SVM vs GNB learning curves", X_test, y_test)
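# plot_learning_curves_macro is defined elsewhere in the repository; the
# sketch below is NOT that implementation, only a minimal illustration of
# what such a helper could look like, assuming it plots mean macro-F1
# cross-validation scores over growing training-set sizes for the two
# estimators (the signature mirrors the call in figure_1 above).
def plot_learning_curves_macro_sketch(est_a, est_b, name_a, name_b, title, X, y):
    import matplotlib.pyplot as plt
    from sklearn.model_selection import learning_curve
    sizes = np.linspace(0.1, 1.0, 5)
    for est, name in ((est_a, name_a), (est_b, name_b)):
        # learning_curve refits the estimator on increasing training fractions
        train_sizes, _, test_scores = learning_curve(
            est, X, y, train_sizes=sizes, cv=5, scoring="f1_macro")
        plt.plot(train_sizes, test_scores.mean(axis=1), marker="o", label=name)
    plt.xlabel("training examples")
    plt.ylabel("macro F1 (CV mean)")
    plt.title(title)
    plt.legend()
    plt.show()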
def naive_bayes_test():
    n_labels, n_features = 2, 2
    nb = GaussianNaiveBayes(n_labels, n_features)
    # prepare sample training data: two 2D Gaussian blobs
    data1 = np.random.multivariate_normal([1, 4], [[2, 0], [0, 2]], size=100)
    data2 = np.random.multivariate_normal([5, 7], [[3, 0], [0, 1]], size=100)
    data = np.concatenate((data1, data2), axis=0)
    # prepare training label data
    labels = np.concatenate((np.array([0] * 100), np.array([1] * 100)), axis=0)
    print("correct labels")
    print(labels)
    # nb.load()
    nb.train(data, labels)
    # nb.save()
    results = nb.classify(data)
    print("predicted labels")
    print(results)
    print("recognition rate: ", (results == labels).mean())
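# The tests above assume a GaussianNaiveBayes class with an
# (n_labels, n_features) constructor and train/classify/get_parameters
# methods. That class is defined elsewhere in the repository; the sketch
# below is NOT that implementation, only a minimal illustration of the
# underlying algorithm: fit a per-class mean, variance, and prior, then
# classify by maximum Gaussian log-likelihood plus log-prior.
class GaussianNaiveBayesSketch:
    def __init__(self, n_labels, n_features):
        self.n_labels = n_labels
        self.n_features = n_features

    def train(self, data, labels):
        # per-class feature means, variances (floored for numerical
        # stability), and class priors
        self.mean = np.stack([data[labels == c].mean(axis=0)
                              for c in range(self.n_labels)])
        self.var = np.stack([data[labels == c].var(axis=0) + 1e-9
                             for c in range(self.n_labels)])
        self.pi = np.array([(labels == c).mean() for c in range(self.n_labels)])

    def classify(self, x):
        # log P(c | x) ∝ log pi_c + sum_j log N(x_j; mean_cj, var_cj)
        # (self.pi holds class priors; np.pi is the constant)
        log_likelihood = -0.5 * (np.log(2 * np.pi * self.var)
                                 + (x - self.mean) ** 2 / self.var).sum(axis=1)
        return int(np.argmax(np.log(self.pi) + log_likelihood))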
def mnist_digit_recognition():
    train_set, valid_set, test_set = load_mnist_dataset("mnist.pkl.gz")
    n_labels = 10  # digits 0-9
    n_features = 28 * 28
    mnist_model = GaussianNaiveBayes(n_labels, n_features)
    mnist_model.train(train_set[0], train_set[1])
    [mean, var], pi = mnist_model.get_parameters()
    # visualization of learned means
    create_2D_images_horizontal(mean, w=28, h=28)
    show()

    test_data, labels = test_set
    # slice the test set; set limit = len(test_data) for a full run
    limit = 50
    test_data, labels = test_data[:limit], labels[:limit]
    results = np.arange(limit, dtype=int)  # np.int was removed in NumPy >= 1.24
    for n in range(limit):
        results[n] = mnist_model.classify(test_data[n])
        print("%d : predicted %s, correct %s" % (n, results[n], labels[n]))
    # results = mnist_model.classify(test_data)
    print("recognition rate: ", (results == labels).mean())
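# load_mnist_dataset is a project helper; a minimal sketch of one possible
# implementation, assuming the classic mnist.pkl.gz layout: a gzip-compressed
# pickle of (train, valid, test) tuples, each an (images, labels) pair,
# originally pickled under Python 2 (hence encoding="latin1").
def load_mnist_dataset_sketch(path="mnist.pkl.gz"):
    import gzip
    import pickle
    with gzip.open(path, "rb") as f:
        return pickle.load(f, encoding="latin1")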
def main():
    """
    Loads the dataset, either computes results or loads precomputed ones,
    and uses them (where needed) to run the task designated by the task
    variable.
    :return: Happiness
    """
    from pathlib import Path

    class_names = ["ham", "spam"]
    X, y = load_set()
    # X, _, y, _ = train_test_split(X, y, train_size=0.12)  # <- use to make the dataset smaller for testing
    recompute = False
    pickled_train_set_results = "gridsearch.pickle"

    # load the precomputed results file if it exists
    if Path(pickled_train_set_results).is_file() and not recompute:
        with open(pickled_train_set_results, 'rb') as file:
            results = pickle.load(file)
    else:
        # compute the test results (may take a long time)
        pipelines = {
            "plain": Pipeline([("estimator", SVC(max_iter=1000000))]),
            "normalized": Pipeline([("preprocessing", Normalizer()),
                                    ("estimator", SVC(max_iter=1000000))]),
            "scaled": Pipeline([("preprocessing", MaxAbsScaler()),
                                ("estimator", SVC(max_iter=1000000))]),
        }
        datasets = {"tf": X, "tf_idf": transform_tf_to_tf_idf(X)}
        Cs = [10.0**x for x in range(-2, 5)]
        gammas = [10.0**x for x in range(-8, 1)] + ["scale"]
        param_grid = {
            "svm-rbf": {'estimator__kernel': ['rbf'],
                        'estimator__gamma': gammas,
                        'estimator__C': [10.0**x for x in range(-1, 10)]},
            "svm-linear": {'estimator__kernel': ['linear'],
                           'estimator__C': Cs},
            "svm-poly-d2-c00": {'estimator__kernel': ['poly'],
                                'estimator__C': Cs,
                                'estimator__degree': [2]},
            "svm-poly-d2-c01": {'estimator__kernel': ['poly'],
                                'estimator__C': Cs,
                                'estimator__degree': [2],
                                'estimator__coef0': [1]},
            "svm-poly-d3-c00": {'estimator__kernel': ['poly'],
                                'estimator__C': Cs,
                                'estimator__degree': [3]},
            "svm-poly-d3-c01": {'estimator__kernel': ['poly'],
                                'estimator__C': Cs,
                                'estimator__degree': [3],
                                'estimator__coef0': [1]},
            "gnb-sk": {'estimator': [GaussianNB()]},
            "gnb-my": {'estimator': [GaussianNaiveBayes()]},
            "multinomialnb": {'estimator': [MultinomialNB()]},
        }
        results = {
            dataset: {pipeline: {group: {} for group in param_grid.keys()}
                      for pipeline in pipelines.keys()}
            for dataset in datasets.keys()
        }
        param_search_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        eval_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

        # progress counters: [dataset, pipeline, group]
        progress = np.zeros(3, dtype=np.uint16)
        for dataset_name, dataset in datasets.items():
            progress[0] += 1
            progress[1:] = 0
            print(f"Processing dataset {progress[0]} of {len(datasets)}")
            for pipeline_name, pipeline in pipelines.items():
                progress[1] += 1
                progress[2] = 0
                print(f"Processing pipeline {progress[1]} of {len(pipelines)}")
                for group_name, group in param_grid.items():
                    progress[2] += 1
                    print(f"Processing group {progress[2]} of {len(param_grid)}")
                    # nested CV: grid search inside each outer evaluation fold
                    param_search = GridSearchCV(pipeline, group, n_jobs=-1,
                                                cv=param_search_cv, verbose=1)
                    cv_results = cross_validate(param_search, dataset, y,
                                                cv=eval_cv,
                                                return_estimator=True,
                                                verbose=1)
                    results[dataset_name][pipeline_name][group_name] = cv_results
        # save the results for future reuse
        with open(pickled_train_set_results, 'wb') as file:
            pickle.dump(results, file)

    # select your task
    task = "table_7"  # <----- here
    if task == "table_1":
        table_1(results)
    elif task == "table_2":
        table_2(results)
    elif task == "table_3":
        table_3(results)
    elif task == "table_4":
        table_4(results)
    elif task == "table_5":
        table_5(results)
    elif task == "table_6":
        table_6(results)
    elif task == "table_7":
        table_7(results)
    elif task == "table_8":
        table_8(results)
    elif task == "figure_1":
        figure_1(results, X, y)
    elif task == "figure_2":
        figure_2(results, X, y)
    elif task == "figure_3":
        figure_3(results, X, y, class_names)
    elif task == "figure_4":
        figure_4(X, y, class_names)
    elif task == "figure_5":
        figure_5(X, y, class_names)
    elif task == "experiment":
        experiment(X, y)
    elif task == "histograms":
        plot_feature_histogram(minmax_scale(X), y, feature=7)
    else:
        print(f"Unknown task: {task}")