def prob0():
    arff = Arff('datasets/labor.arff', label_count=1)
    # Trim the id column
    arff = arff.create_subset_arff(col_idx=slice(1, None))
    arff = arff.get_features()
    km = KMeans(5)
    km.train(arff, verbose=True, centers=arff.data[:5])
def test_load_arff(self):
    """ Tests downloading and loading an arff file """
    t = Arff()
    t.load_arff(self.iris_path)
    self.assertListEqual(t.data[t.shape[0] - 1].tolist(),
                         [5.9, 3.0, 5.1, 1.8, 2.0])
def prob_0():
    arff = Arff('datasets/lenses.arff')
    d = DecisionTreeLearner()
    f = arff.get_features()
    l = arff.get_labels()
    d.train(f, l)
    print(d.tree)
def prob0haccomplete():
    arff = Arff('datasets/labor.arff', label_count=1)
    # Trim the id column
    arff = arff.create_subset_arff(col_idx=slice(1, None))
    arff = arff.get_features()
    hac = HAC(simple=False)
    hac.train(arff, verbose=True, printk=[5])
def setUp(self):
    path = os.path.join(utils.get_root(), "test/datasets/cm1_req.arff")
    data = Arff(arff=path)
    self.features = data.get_features()
    self.labels = data.get_labels()
    self.learner = BaselineLearner()
def setUp(self):
    # NOTE: for discrete attributes, at least one value must be a float in order
    # for numpy array functions to work properly.
    data = np.array([[1.5, -6, 1.0],
                     [2.3, -8, 2],
                     [4.1, self.infinity, 2]])
    m = Arff(data, label_count=1)
    m.attr_names = ['A', 'B', 'C']
    m.str_to_enum = [{}, {}, {'R': 0, 'G': 1, 'B': 2}]
    m.enum_to_str = [{}, {}, {0: 'R', 1: 'G', 2: 'B'}]
    self.m = m

    data2 = np.array([[0.0, 1.0, 2.0, 3.0, 0.0],
                      [0.1, 1.1, 2.1, 3.1, 1.0],
                      [0.2, 1.2, 2.2, 3.2, 1.0],
                      [0.3, 1.3, 2.3, 3.3, 2.0],
                      [0.4, 1.4, 2.4, 3.4, 2.0]])
    m2 = Arff(data2, label_count=1)
    m2.attr_names = ['A', 'B', 'C', 'D', 'E']
    m2.str_to_enum = [{}, {}, {}, {}, {'R': 0, 'G': 1, 'B': 2}]
    m2.enum_to_str = [{}, {}, {}, {}, {0: 'R', 1: 'G', 2: 'B'}]
    self.m2 = m2

    self.credit_data_path = os.path.join(utils.get_root(), "test/datasets/creditapproval.arff")
    self.iris_path = os.path.join(utils.get_root(), "test/datasets/iris.arff")
def main():
    arff = Arff(sys.argv[1])
    pl = PerceptronLearner()
    features = arff.get_features()
    labels = arff.get_labels()
    accuracy_matrix = np.zeros((5, 20))
    for i in range(5):
        pl.train(features, labels)
        a = pl.accuracy_tracker[:20]
        # Pad to make 20 wide
        a = np.pad(a, (0, 20 - len(a)), 'constant', constant_values=a[-1])
        accuracy_matrix[i] = a
    # Average the accuracies of each step
    print(accuracy_matrix)
    avg_accuracy = np.sum(accuracy_matrix, axis=0) / 5
    print(avg_accuracy)
    plt.plot(1 - avg_accuracy)
    plt.xlabel("Epochs")
    plt.ylabel("Avg Misclassification Rate")
    plt.title("Avg Misclassification Rate Over Epochs")
    plt.show()
def main():
    arff = Arff(sys.argv[1])
    features = arff.get_features()
    labels = arff.get_labels()
    pl = PerceptronLearner()
    pl.train(features, labels)
    visualize_training(features, labels, pl)
def setup():
    arff = Arff('datasets/labor.arff', label_count=1)
    # Trim the id column
    arff = arff.create_subset_arff(col_idx=slice(1, None))
    arff = arff.get_features()
    hac = HAC()
    hac.nominal_indicies = np.where(np.array(arff.attr_types) == 'nominal')[0]
    print('33,44', hac.get_distance(arff.data[33], arff.data[44]))
    print('25,34', hac.get_distance(arff.data[25], arff.data[34]))
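# HAC.get_distance is part of the clustering implementation and is not shown here.
# As a rough, assumed illustration of how a mixed numeric/nominal distance is
# commonly computed (0/1 mismatch on nominal columns, squared difference on the
# rest), a minimal sketch follows; the function name, NaN handling, and signature
# are hypothetical and not necessarily how HAC.get_distance is written.
def mixed_distance_sketch(a, b, nominal_indices):
    """Hypothetical Euclidean-style distance treating nominal columns as match/mismatch."""
    total = 0.0
    for i, (x, y) in enumerate(zip(a, b)):
        if np.isnan(x) or np.isnan(y):
            total += 1.0                     # assumed: unknown values count as maximally different
        elif i in nominal_indices:
            total += 0.0 if x == y else 1.0  # nominal columns: 0 if equal, 1 otherwise
        else:
            total += (x - y) ** 2            # continuous columns: squared difference
    return np.sqrt(total)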
def test_arff_constructor(self):
    """ Tests construction of Arff from path, arff, numpy array """
    ## Create a Matrix object from arff
    credit = Arff(arff=self.credit_data_path)
    credit3 = Arff(arff=credit)
    credit2 = Arff(arff=credit.data)
    np.testing.assert_array_almost_equal(credit.data, credit2.data)
    np.testing.assert_array_almost_equal(credit2.data, credit3.data)
def prob4h():
    arff = Arff('datasets/abalone.arff', label_count=0)
    arff.normalize()
    domain = np.arange(2, 8)
    print('single link --------------------')
    hac = HAC()
    hac.train(arff, printk=domain, silhouette=True)
    print('complete link -----------------------')
    hac = HAC(simple=False)
    hac.train(arff, printk=domain, silhouette=True)
def prob4():
    arff = Arff('datasets/abalone.arff', label_count=0)
    arff.normalize()
    domain = np.arange(2, 8)
    ssekmm = []
    for k in domain:
        km = KMeans(k)
        ssek = km.train(arff)
        ssekmm.append(ssek)
        print(km.calc_silhouette_score())
def prob2():
    iris = Arff('datasets/iris.arff')
    features = iris.get_features()
    # features.normalize()
    # Train k-means for k = 2-7
    ks = [2, 3, 4, 5, 6, 7]
    for k in ks:
        km = KMeans(k)
        km.train(features)
    hac2 = HAC(simple=False)
    hac2.train(features, printk=ks)
def test_create_subset_arff(self):
    m2 = Arff(self.m2, [1, 2], slice(1, 3))
    self.assertEqual(m2.shape, (2, 2))
    m2 = Arff(self.m2, slice(1, 3), [1, 2])
    self.assertEqual(m2.shape, (2, 2))

    # Automatic label inference
    self.m2.label_count = 3
    m2 = Arff(self.m2, slice(1, 3), slice(1, None), label_count=None)
    self.assertEqual(3, m2.label_count)
    m2 = Arff(self.m2, slice(1, 3), slice(1, -1), label_count=None)
    self.assertEqual(2, m2.label_count)
def prob5():
    arff = Arff(sys.argv[2])
    imp_atts = [1, 3, 4, 5, 7, 9, 11, 12, 13]
    arff.shuffle()
    n = len(arff.get_labels().data)
    t = int(n * .55)
    v = n - int(n * .20)
    train_set = arff.create_subset_arff(row_idx=slice(0, t, 1), col_idx=imp_atts)
    test_set = arff.create_subset_arff(row_idx=slice(t, v, 1), col_idx=imp_atts)
    validation_set = arff.create_subset_arff(row_idx=slice(v, n, 1), col_idx=imp_atts)
    epochs = []
    momentums = np.linspace(0, 1.5, 20)
    # momentums = [.5, 1]
    for momentum in momentums:
        print(momentum)
        nn = NeuralNetwork(8, [30], 11, LR=.1, momentum=momentum)
        all_acc_va, all_mse_va, all_mse_te, all_mse_tr = nn.train_set(
            train_set, test_set, validation_set, w=5)
        epochs.append(len(all_acc_va))
    plt.plot(momentums, epochs)
    plt.title("Vowel Momentum vs Epoch Convergence")
    plt.xlabel("Momentum")
    plt.ylabel("Epochs til Conv.")
    plt.show()
def test_copy_and_slice(self):
    d = Arff(self.credit_data_path)
    e = d.copy()
    e._copy_and_slice_arff(d, 1, 5)
    self.assertEqual(e.shape, (1, 1))
    e._copy_and_slice_arff(d, slice(1, 4), slice(2, 4))
    self.assertEqual(e.shape, (3, 2))
    # This will create a 1D array, returning coords (1,1), (2,5), (3,7), (7,8)
    e._copy_and_slice_arff(d, [1, 2, 3, 7], [1, 5, 7, 8])
    self.assertEqual(e.shape, (4,))
    e._copy_and_slice_arff(d, [1, 2, 3, 7], slice(0, 5))
    self.assertEqual(e.shape, (4, 5))
def test_append_columns(self):
    test_matrix = Arff(self.m)
    # Verify it works with other matrices
    test_matrix.append_columns(test_matrix)
    assert test_matrix.data.shape == (3, 6)
    # Verify it works with numpy array
    test_matrix.append_columns(test_matrix.data)
    assert test_matrix.data.shape == (3, 12)
    # Verify it works with 2D list
    test_matrix.append_columns(test_matrix.data.tolist())
    assert test_matrix.data.shape == (3, 24)
    # Verify incompatible number of rows raises
    with self.assertRaises(Exception) as context:
        test_matrix.append_columns(self.m.data[:1, :])
    self.assertTrue('Incompatible number of rows' in str(context.exception))
def test_append_rows(self):
    test_matrix = Arff(self.m)
    # Verify it works with other matrices
    test_matrix.append_rows(test_matrix)
    assert test_matrix.data.shape == (6, 3)
    # Verify it works with numpy array
    test_matrix.append_rows(test_matrix.data)
    assert test_matrix.data.shape == (12, 3)
    # Verify it works with 2D list
    test_matrix.append_rows(test_matrix.data.tolist())
    assert test_matrix.data.shape == (24, 3)
    # Verify incompatible number of columns raises
    with self.assertRaises(Exception) as context:
        test_matrix.append_rows(self.m.data[:, :2])
    print(str(context.exception))
    self.assertTrue('Incompatible number of columns' in str(context.exception))
def prob2wclass():
    iris = Arff('datasets/iris.arff', label_count=0)
    # features.normalize()
    # Train k-means for k = 2-7
    ks = [2, 3, 4, 5, 6, 7]
    for k in ks:
        km = KMeans(k)
        km.train(iris)
    hac2 = HAC(simple=False)
    hac2.train(iris, printk=ks)
def prob_4(lr):
    # Read in the vowels dataset
    arff = Arff('datasets/vowels.arff')
    # Leave out the test/train and person features, which are unnecessary.
    arff = arff.create_subset_arff(col_idx=slice(2, None), label_count=1)
    # Get a 75/25 split
    arff.shuffle()
    training = arff.create_subset_arff(slice(arff.instance_count // 4))
    test = arff.create_subset_arff(slice(arff.instance_count // 4, -1))
    t_features = test.get_features()
    t_labels = test.get_labels()
    # Get a 15% validation set
    validation = training.create_subset_arff(slice(arff.instance_count // 5))
    training = training.create_subset_arff(
        slice(arff.instance_count // 5, None))
    v_features = validation.get_features()
    v_labels = validation.get_labels()

    domain = 2 ** np.arange(0, 8)
    training_mse = []
    validation_mse = []
    test_mse = []
    for nodes in domain:
        mse = 0
        vmse = 0
        tmse = 0
        for _ in range(100):
            learner = MultilayerPerceptronLearner(
                [training.features_count, nodes, 11], momentum=0)
            # learner.zero_weights()
            learner.lr = lr
            learner.max_epoch = 500
            learner.train(training.get_features(), training.get_labels(),
                          validation.get_features(), validation.get_labels())
            tmse += learner.get_mse(test.get_features(), test.get_labels())
            mse += learner.get_mse(training.get_features(), training.get_labels())
            vmse += learner.get_mse(validation.get_features(), validation.get_labels())
        training_mse.append(mse / 100)
        validation_mse.append(vmse / 100)
        test_mse.append(tmse / 100)
    plt.semilogx(domain, test_mse, basex=2, label="Test Set MSE")
    plt.semilogx(domain, training_mse, basex=2, label="Training Set MSE")
    plt.semilogx(domain, validation_mse, basex=2, label="Validation Set MSE")
    plt.title("MSE vs Number of Hidden Nodes")
    plt.xlabel("Number of Hidden Nodes")
    plt.ylabel("Mean Squared Error")
    plt.legend()
    plt.show()
def prob_6():
    # Repeat experiments for magic telescope and housing using weights (w = 1/dist**2)
    arff = Arff('datasets/credit.arff')
    arff.shuffle()
    test = arff.create_subset_arff(slice(arff.instance_count // 4))
    train = arff.create_subset_arff(slice(arff.instance_count // 4, None))
    train.normalize()
    test.normalize()
    krange = np.arange(1, 16, 2)
    accs = []
    for k in krange:
        knn = KNN(k, weighting=True, vdm=True)
        predictions = knn.knn(train.get_features(), train.get_labels(),
                              test.get_features())
        acc = predictions == np.ravel(test.get_labels().data)
        print("k:", k, "accuracy:", sum(acc) / len(acc))
        accs.append(sum(acc) / len(acc))
    plt.plot(krange, accs)
    plt.title("K Size Versus Accuracy on Credit Approval (Weighted)")
    plt.xlabel("K")
    plt.ylabel("Accuracy")
    plt.show()
def prob_1():
    arff = Arff('datasets/lenses.arff')
    acc = []
    for _ in range(10):
        arff.shuffle()
        testing = arff.create_subset_arff(slice(arff.instance_count // 5))
        training = arff.create_subset_arff(slice(arff.instance_count // 5, None))
        d = DecisionTreeLearner()
        features = training.get_features()
        labels = training.get_labels()
        t_feat = testing.get_features()
        t_labels = testing.get_labels()
        d.train(features, labels)
        accuracy = d.get_accuracy(t_feat, t_labels)
        print(accuracy)
        acc.append(accuracy)
    print(sum(acc) / len(acc))
def test_cases():
    # test_1()
    attr_types = [
        "real", "real", "real", "real", "cat", "real", "cat", "real", "real",
        "cat", "real", "cat", "cat", "cat", "cat", "cat", "cat"
    ]
    attr_idx = [[], [], [], [],
                ['none', 'tcf', 'tc'], [],
                ['none', 'ret_allw', 'empl_contr'], [], [],
                ['yes', 'no'], [],
                ['below_average', 'average', 'generous'],
                ['yes', 'no'],
                ['none', 'half', 'full'],
                ['yes', 'no'],
                ['none', 'half', 'full'],
                ['bad', 'good']]
    k = 5
    arff = Arff("labor.arff")
    # arff.normalize()
    features = arff.get_features().data
    labels = arff.get_labels().data
    # attributes = arff.get_attr_names()
    data = np.hstack((features, labels))[:, 1:-1]
    kmc = HA_Clustering(k, data, data, attr_types, "complete link", attr_idx)
def prob_5():
    cont_mask = [1, 2, 7, 10, 13, 14, 16]
    cate_mask = [0, 3, 4, 5, 6, 8, 9, 11, 12, 15]
    arff = Arff("credit_approval_data.arff")
    arff.shuffle()
    arff.normalize()
    n = len(arff.get_labels().data)
    t = int(n * .7)
    train_data = arff.create_subset_arff(row_idx=slice(0, t, 1))
    test_data = arff.create_subset_arff(row_idx=slice(t, n, 1))
    test_data = np.hstack((test_data.get_features().data,
                           test_data.get_labels().data))
    train_data = np.hstack((train_data.get_features().data,
                            train_data.get_labels().data))
    # b,30.83,0,u,g,w,v,1.25,t,t,01,f,g,00202,0,+
    dist_matrix = np.ones((16, 16))
    np.fill_diagonal(dist_matrix, 0)
    KNNC = KNNClassifier(8, train_data, test_data)
    print(KNNC.get_accuracy_mixed(cate_mask, cont_mask, dist_matrix))
def main():
    # Train each perceptron on its own split data set
    fast_v_mid = Arff('datasets/restaurants/fast_v_mid.arff')
    fast_v_fine = Arff('datasets/restaurants/fast_v_fine.arff')
    mid_v_fine = Arff('datasets/restaurants/mid_v_fine.arff')

    pl_fast_mid = PerceptronLearner()
    pl_fast_fine = PerceptronLearner()
    pl_mid_v_fine = PerceptronLearner()

    # Train each perceptron
    train_perceptron(pl_fast_mid, fast_v_mid)
    train_perceptron(pl_fast_fine, fast_v_fine)
    train_perceptron(pl_mid_v_fine, mid_v_fine)

    # Run on new data
    # Burger King
    burger_king = [4, 2, 2]
    # Cheesecake Factory
    cheesecake_factory = [2, 4, 4]
    # Best fine-dining in the world
    best_fine_dining = [1, 5, 3]

    print_findings(
        "Burger King",
        determine_category(burger_king, pl_fast_mid, pl_fast_fine, pl_mid_v_fine))
    print_findings(
        "Cheesecake Factory",
        determine_category(cheesecake_factory, pl_fast_mid, pl_fast_fine, pl_mid_v_fine))
    print_findings(
        "'Best Food In The World'",
        determine_category(best_fine_dining, pl_fast_mid, pl_fast_fine, pl_mid_v_fine))
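# train_perceptron, determine_category, and print_findings are defined elsewhere in
# this project. As a rough illustration of the one-vs-one voting idea used above, a
# minimal sketch of how the three pairwise perceptrons could be combined follows;
# the function name, the predict() call, and the 0/1 label encoding are assumptions,
# not the actual implementation.
def determine_category_sketch(instance, pl_fast_mid, pl_fast_fine, pl_mid_v_fine):
    """Hypothetical: combine three pairwise perceptrons into a single class vote."""
    votes = {'fast food': 0, 'mid range': 0, 'fine dining': 0}
    # Each pairwise learner votes for one of its two classes (assumed 0 = first class)
    votes['fast food' if pl_fast_mid.predict(instance) == 0 else 'mid range'] += 1
    votes['fast food' if pl_fast_fine.predict(instance) == 0 else 'fine dining'] += 1
    votes['mid range' if pl_mid_v_fine.predict(instance) == 0 else 'fine dining'] += 1
    # Return the class with the most pairwise wins
    return max(votes, key=votes.get)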
def prob3():
    arff = Arff(sys.argv[2])
    imp_atts = [1, 3, 4, 5, 7, 9, 11, 12, 13]
    arff.shuffle()
    n = len(arff.get_labels().data)
    t = int(n * .55)
    v = n - int(n * .20)
    train_set = arff.create_subset_arff(row_idx=slice(0, t, 1), col_idx=imp_atts)
    test_set = arff.create_subset_arff(row_idx=slice(t, v, 1), col_idx=imp_atts)
    validation_set = arff.create_subset_arff(row_idx=slice(v, n, 1), col_idx=imp_atts)
    best_mse_te = []
    best_mse_tr = []
    best_mse_va = []
    epochs = []
    LRS = [.01, .1, .5, .8, 1.5]
    for LR in LRS:
        # print(LR)
        nn = NeuralNetwork(8, [16], 11, LR=LR, momentum=0)
        all_acc_va, all_mse_va, all_mse_te, all_mse_tr = nn.train_set(
            train_set, test_set, validation_set, w=5)
        best_mse_te.append(min(all_mse_te))
        best_mse_tr.append(min(all_mse_tr))
        best_mse_va.append(min(all_mse_va))
        epochs.append(len(all_mse_va))
    plt.plot(LRS, best_mse_te, label="MSE Te")
    plt.plot(LRS, best_mse_tr, label="MSE Tr")
    plt.plot(LRS, best_mse_va, label="MSE V.A")
    plt.title("Vowel MSE vs Learning Rate")
    plt.xlabel("Learning Rate")
    plt.ylabel("MSE")
    plt.legend()
    plt.show()

    plt.plot(LRS, epochs)
    plt.title("Vowel Epochs vs Learning Rate")
    plt.xlabel("Learning Rate")
    plt.ylabel("Epochs")
    plt.legend()
    plt.show()
def prob3_normalized():
    arff = Arff('datasets/abalone.arff', label_count=0)
    arff.normalize()
    domain = np.arange(2, 8)
    ssekmm = []
    for k in domain:
        km = KMeans(k)
        ssek = km.train(arff)
        ssekmm.append(ssek)
    hac = HAC()
    hac2 = HAC(simple=False)
    ssehac = hac.train(arff, printk=domain)
    ssehac2 = hac2.train(arff, printk=domain)
    plt.plot(domain, ssekmm, label="K-Means SSE")
    plt.plot(domain, ssehac[::-1], label="HAC (Single-Link) SSE")
    plt.plot(domain, ssehac2[::-1], label="HAC (Complete-Link) SSE")
    plt.title("Abalone SSE (Normalized) vs # of Clusters")
    plt.xlabel("# of Clusters")
    plt.ylabel('SSE')
    plt.legend()
    plt.show()
def prob_6_b():
    # Read in the vowels dataset
    arff = Arff('datasets/vowels.arff')
    # Leave out the test/train and person features, which are unnecessary.
    arff = arff.create_subset_arff(col_idx=slice(2, None), label_count=1)
    # Get a 75/25 split
    arff.shuffle()
    training = arff.create_subset_arff(slice(arff.instance_count // 4))
    test = arff.create_subset_arff(slice(arff.instance_count // 4, -1))
    t_features = test.get_features()
    t_labels = test.get_labels()
    # Get a 15% validation set
    validation = training.create_subset_arff(slice(arff.instance_count // 5))
    training = training.create_subset_arff(
        slice(arff.instance_count // 5, None))
    v_features = validation.get_features()
    v_labels = validation.get_labels()
    features = training.get_features()
    labels = training.get_labels()

    taccuracy = []
    accuracy = []
    domain = 2 ** np.arange(1, 7)
    for i in domain:
        tacc = 0
        acc = 0
        for _ in range(3):
            learner = MultilayerPerceptronLearner(
                [training.features_count] + [i] * (32 // i) + [11], momentum=.85)
            # learner.zero_weights()
            learner.lr = .1
            learner.max_epoch = 500
            learner.train(training.get_features(), training.get_labels(),
                          validation.get_features(), validation.get_labels())
            tacc += learner.get_accuracy(t_features, t_labels)
            acc += learner.get_accuracy(features, labels)
        accuracy.append(acc / 3)
        taccuracy.append(tacc / 3)
    plt.semilogx(domain, accuracy, basex=2, label="Training Set Accuracy")
    plt.semilogx(domain, taccuracy, basex=2, label="Test Set Accuracy")
    plt.title("Node Distribution vs Accuracy")
    plt.xlabel("Number of Nodes per Hidden Layer")
    plt.ylabel("Accuracy")
    plt.legend()
    plt.show()
def prob_5(lr, hidden):
    # Read in the vowels dataset
    arff = Arff('datasets/vowels.arff')
    # Leave out the test/train and person features, which are unnecessary.
    arff = arff.create_subset_arff(col_idx=slice(2, None), label_count=1)
    # Get a 75/25 split
    arff.shuffle()
    training = arff.create_subset_arff(slice(arff.instance_count // 4))
    test = arff.create_subset_arff(slice(arff.instance_count // 4, -1))
    t_features = test.get_features()
    t_labels = test.get_labels()
    # Get a 15% validation set
    validation = training.create_subset_arff(slice(arff.instance_count // 5))
    training = training.create_subset_arff(
        slice(arff.instance_count // 5, None))
    v_features = validation.get_features()
    v_labels = validation.get_labels()

    epochs = []
    accuracy = []
    domain = np.linspace(0, 1, 20)
    for momentum in domain:
        e = 0
        acc = 0
        for _ in range(10):
            learner = MultilayerPerceptronLearner(
                [training.features_count, hidden, 11], momentum=momentum)
            # learner.zero_weights()
            learner.lr = lr
            learner.max_epoch = 500
            learner.train(training.get_features(), training.get_labels(),
                          validation.get_features(), validation.get_labels())
            acc += learner.get_accuracy(t_features, t_labels)
            e += learner.epochs
        epochs.append(e / 10)
        accuracy.append(acc / 10)
    print(accuracy)
    plt.plot(domain, epochs)
    plt.title("Number of Training Epochs vs Momentum")
    plt.xlabel("Momentum Constant")
    plt.ylabel("Number of Training Epochs")
    plt.show()
def prob_2():
    # Get accuracy on cars.arff
    arff = Arff('datasets/cars.arff')
    arff.shuffle()
    acc, tacc = k_fold_cv(arff, 10)
    print('cars:')
    print('acc', acc)
    print('tacc', tacc)
    print('tot', sum(tacc) / len(tacc))
    print()

    # Get accuracy on voting.arff
    arff = Arff('datasets/voting.arff')
    arff.shuffle()
    acc, tacc = k_fold_cv(arff, 10)
    print('voting:')
    print('acc', acc)
    print('tacc', tacc)
    print('tot', sum(tacc) / len(tacc))
    print()
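# k_fold_cv is defined elsewhere in this project. The sketch below shows one
# plausible way a 10-fold cross-validation loop could be written against the
# Arff/DecisionTreeLearner API as it is used in this file; the function name,
# the list-of-rows argument to create_subset_arff, and the (train_accs, test_accs)
# return convention are assumptions, not the actual implementation.
def k_fold_cv_sketch(arff, k):
    """Hypothetical k-fold cross-validation returning per-fold train/test accuracies."""
    fold_size = arff.instance_count // k
    train_accs, test_accs = [], []
    for i in range(k):
        # Hold out the i-th fold as the test set
        test = arff.create_subset_arff(slice(i * fold_size, (i + 1) * fold_size))
        train_rows = [r for r in range(arff.instance_count)
                      if r < i * fold_size or r >= (i + 1) * fold_size]
        train = arff.create_subset_arff(train_rows)
        learner = DecisionTreeLearner()
        learner.train(train.get_features(), train.get_labels())
        train_accs.append(learner.get_accuracy(train.get_features(), train.get_labels()))
        test_accs.append(learner.get_accuracy(test.get_features(), test.get_labels()))
    return train_accs, test_accs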