def main():
    """Read two ARFF files from the command line, normalize them, and print the outliers of each."""
    arff_file1, arff_file2 = read_args()
    arff1 = Arff.fromfile(arff_file1)
    arff2 = Arff.fromfile(arff_file2)
    # Normalize attributes so no single feature dominates the distance metric.
    norm_arff1 = arff1.normalize()
    norm_arff2 = arff2.normalize()
    space1= Space(len(norm_arff1.attributes))
    populate_space_from_arff(norm_arff1, space1)
    print "Outliers from "+arff_file1+":"
    outliers1 = get_outliers(3, 5, space1)
    if outliers1:
        for x in outliers1:
            print x
    else:
        print "None"
    space2 = Space(len(norm_arff2.attributes))
    populate_space_from_arff(norm_arff2, space2)
    print "Outliers from "+arff_file2+":"
    # NOTE(review): second file uses a different neighbor threshold (12 vs 5) — confirm intentional.
    outliers2 = get_outliers(3, 12, space2)
    if outliers2:
        for x in outliers2:
            print x
    else:
        print "None"
def part1():
    """Run KNN (k=3, inverse-distance weighting) on the diabetes evaluation set.

    Fits on the training ARFF, writes per-instance predictions to
    diabetes-prediction.csv, and prints test accuracy as a percentage.
    """
    print('Running Part 1...')
    # Debug Data sets:
    # mat = Arff("../data/knn/debug/seismic-bumps_train.arff",label_count=1)
    # mat2 = Arff("../data/knn/debug/seismic-bumps_test.arff",label_count=1)
    # Evaluation Data sets:
    mat = Arff("../data/knn/evaluation/diabetes.arff", label_count=1)
    mat2 = Arff("../data/knn/evaluation/diabetes_test.arff", label_count=1)
    k_neighbors = 3

    # Last column is the label; everything before it is a feature.
    # (Removed unused `h, w = raw_data.shape` unpackings.)
    raw_data = mat.data
    train_data = raw_data[:, :-1]
    train_labels = raw_data[:, -1]

    raw_data2 = mat2.data
    test_data = raw_data2[:, :-1]
    test_labels = raw_data2[:, -1]

    KNN = KNNClassifier(label_type='classification',
                        weight_type='inverse_distance',
                        k_neighbors=k_neighbors)
    print("Fitting data ...")
    KNN.fit(train_data, train_labels)
    print("Predict data ...")
    pred = KNN.predict(test_data)
    print("Scoring data ...")
    score = KNN.score(test_data, test_labels)
    np.savetxt("diabetes-prediction.csv", pred, delimiter=',', fmt="%i")
    print("Accuracy = [{:.2f}]\n".format(score * 100))
def sk_learn(data="oldGames.arff", min_split=300, min_leaf=15):
    """10-fold cross-validation of sklearn's DecisionTreeClassifier on an ARFF file.

    Prints the mean test accuracy, renders the last fold's tree with graphviz
    to "old_games", and returns the mean accuracy.
    """
    folds = 10
    mat = Arff(data, label_count=1)
    counts = [] ## this is so you know how many types for each column
    for i in range(mat.data.shape[1]):
        counts += [mat.unique_value_count(i)]
    # np.random.seed(35)
    np.random.shuffle(mat.data)
    splits = np.array_split(mat.data, folds)
    Acc = 0
    # min_split = 300
    # print("Minsplit: {}".format(min_split))
    for f in range(folds):
        # print("Fold {}:".format(f))
        train = np.array([])
        # Concatenate every fold except the held-out fold f into the training set.
        for other in range(folds):
            if train.size == 0 and other != f:
                train = splits[other].copy()
            elif other != f:
                train = np.concatenate((train, splits[other]))
        data = train[:, 0:-1]
        labels = train[:, -1].reshape(-1, 1)
        clf = tree.DecisionTreeClassifier(
        )  #min_samples_split=min_split, min_samples_leaf=min_leaf
        clf = clf.fit(data, labels)
        pred = clf.predict(data)
        new_acc = score(pred, labels)
        # print("\tTrain Acc {}".format(new_acc))
        data2 = splits[f][:, 0:-1]
        labels2 = splits[f][:, -1].reshape(-1, 1)
        pred = clf.predict(data2)
        new_acc = score(pred, labels2)
        # print("\tTest Acc {}".format(new_acc))
        Acc += new_acc
    Acc = Acc / folds
    print("Accuracy = [{:.4f}]".format(Acc))
    classes = [
        "Overwhelmingly_Positive", "Very_Positive", "Positive",
        "Mostly_Positive", "Mixed", "Mostly_Negative", "Negative",
        "Very_Negative", "Overwhelmingly_Negative"
    ]
    # NOTE(review): exports only the classifier from the final fold, not the best-scoring one.
    dot_data = tree.export_graphviz(clf,
                                    out_file=None,
                                    feature_names=mat.get_attr_names()[:-1],
                                    class_names=classes,
                                    filled=True,
                                    rounded=True)  # max_depth=6,
    graph = graphviz.Source(dot_data)
    graph.render("old_games")
    return Acc
def part2(): print('Running Part 2...') # Part 2 Data sets: mat = Arff("../data/knn/magic-telescope/mt_training.arff", label_count=1) mat2 = Arff("../data/knn/magic-telescope/mt_testing.arff", label_count=1) k_neighbors = 3 raw_data = mat.data h, w = raw_data.shape train_data = raw_data[:, :-1] train_labels = raw_data[:, -1] raw_data2 = mat2.data h2, w2 = raw_data2.shape test_data = raw_data2[:, :-1] test_labels = raw_data2[:, -1] KNN = KNNClassifier(label_type='classification', weight_type='no_weight', k_neighbors=k_neighbors) print("Fitting data ...") KNN.fit(train_data, train_labels) print("Scoring data ...") score = KNN.score(test_data, test_labels) print("Accuracy = [{:.2f}]\n".format(score * 100)) norm_train_data, norm_test_data = normalizeDataSets(train_data, test_data) print("Fitting normalized data ...") KNN.fit(norm_train_data, train_labels) print("Scoring normalized data ...") score = KNN.score(norm_test_data, test_labels) print("Accuracy = [{:.2f}]\n".format(score * 100)) print('Running K Values tests...') k_values = [1, 3, 5, 7, 9, 11, 13, 15] scores = [] for k_val in k_values: KNN = KNNClassifier(label_type='classification', weight_type='no_weight', k_neighbors=k_val) KNN.fit(norm_train_data, train_labels) score = KNN.score(norm_test_data, test_labels) scores.append(score * 100) print('Plotting K values vs Scores w/ Normalization...') fig = plt.figure() ax = fig.add_subplot(111) plt.plot(k_values, scores, label='Accuracy') for xy in zip(k_values, scores): # <-- ax.annotate('(%s, %.1f)' % xy, xy=xy, textcoords='data') plt.title('Accuracy Using Different K Values and No Distance Weighting') plt.xlabel('K Values') plt.ylabel('Accuracy (%)') plt.legend() plt.savefig('k-value-and-accuracy.png') plt.show()
def main(self):
    """Dispatch evaluation according to self.eval_method.

    Methods: 'training' (train/evaluate on the full set), 'random'
    (random split; eval_parameter is the train percentage), 'static'
    (eval_parameter names a separate test ARFF), 'cross'
    (eval_parameter is the fold count; confusion matrix unsupported).

    Raises:
        Exception: if self.eval_method is not one of the four methods.
    """
    if self.eval_method == "training":
        self.train(self.arff.get_features(), self.arff.get_labels())
        self._print_confusion_matrix(self.arff.get_features(),
                                     self.arff.get_labels())
    elif self.eval_method == "random":
        train_features, train_labels, test_features, test_labels = \
            self.training_test_split(train_percent=self.eval_parameter)
        self.train(train_features, train_labels)
        self.test(test_features, test_labels)
        self._print_confusion_matrix(test_features, test_labels)
    elif self.eval_method == "static":
        self.train(self.arff.get_features(), self.arff.get_labels())
        arff_file = self.eval_parameter
        test_data = Arff(arff_file)
        if self.normalize:
            test_data.normalize()
        self.test(features=test_data.get_features(),
                  labels=test_data.get_labels())
        self._print_confusion_matrix(features=test_data.get_features(),
                                     labels=test_data.get_labels())
    elif self.eval_method == "cross":
        # print('PARAMETER')
        self.eval_parameter = int(self.eval_parameter)
        # Confusion matrix not supported for CV.
        # (Removed a stray no-op `type(self.eval_parameter)` expression statement.)
        self.cross_validate(self.eval_parameter)
    else:
        raise Exception("Unrecognized evaluation method '{}'".format(
            self.eval_method))
def separable():
    """Sweep perceptron learning rates on a separable data set, then graph the final decision boundary."""
    print("----------------separable-----------------------")
    mat = Arff("./separableIsSquare.arff", label_count=1)
    np_mat = mat.data
    data = mat[:, :-1]
    labels = mat[:, -1].reshape(-1, 1)
    print(data[:, 1])
    print(labels)
    ### Make the Classifier #####
    P3Class = None
    # Learning rates 1.0 down to 0.1; the final fit (lr=0.1) is the one graphed below.
    for lr in range(10, 0, -1):
        P3Class = PerceptronClassifier(lr=0.1*lr, shuffle=False)
        P3Class.fit(data, labels, standard_weight_value=None)
        Accuracy = P3Class.score(data, labels)
        print("Learning Rate = ", 0.1*lr)
        print("Accuracy = [{:.2f}]".format(Accuracy))
        print("Epochs = ", P3Class.get_epochs_trained())
        # print(P3Class)
    ## could not get graphing to work in time...
    # graph(data[:, 0], data[:, 1], labels=mat[:, -1])
    # Decision boundary: w0*x + w1*y + w2 = 0  =>  y = -(w0/w1)*x - (w2/w1).
    w = P3Class.get_weights()
    y = lambda x: (-w[0]/w[1])*x - (w[2]/w[1])
    grapher = Grapher()
    grapher.graph(data[:, 0], data[:, 1], labels=mat[:, -1], title="Separable")
    grapher.add_function(y)
    grapher.show("separable.svg")
def inseparable():
    """Sweep perceptron learning rates (10 deterministic epochs each) on an inseparable set; graph the boundary."""
    print("----------------Inseparable-----------------------")
    mat = Arff("./impossible.arff", label_count=1)
    np_mat = mat.data
    data = mat[:, :-1]
    labels = mat[:, -1].reshape(-1, 1)
    ### Make the Classifier #####
    P4Class = None
    # Learning rates 1.0 down to 0.1; the final fit (lr=0.1) is graphed below.
    for lr in range(10, 0, -1):
        P4Class = PerceptronClassifier(lr=0.1*lr, deterministic=10, shuffle=False)
        P4Class.fit(data, labels, standard_weight_value=None)
        Accuracy = P4Class.score(data, labels)
        print("Learning Rate = ", 0.1*lr)
        print("Accuracy = [{:.2f}]".format(Accuracy))
        print("Epochs = ", P4Class.get_epochs_trained())
    # Decision boundary: w0*x + w1*y + w2 = 0  =>  y = -(w0/w1)*x - (w2/w1).
    w = P4Class.get_weights()
    y = lambda x: (-w[0]/w[1])*x - (w[2]/w[1])
    grapher = Grapher()
    grapher.graph(data[:, 0], data[:, 1], labels=mat[:, -1], title="Inseparable")
    grapher.add_function(y)
    grapher.show("Inseparable.svg")
def all_lenses():
    """Fit the custom decision tree on lenses.arff and score it against all_lenses.arff."""
    print("---------all-lenses----------")
    train_arff = Arff("./lenses.arff", label_count=1)
    test_arff = Arff("./all_lenses.arff", label_count=1)

    # Last column of each matrix is the label.
    X_train = train_arff.data[:, :-1]
    y_train = train_arff.data[:, -1].reshape(-1, 1)
    X_test = test_arff.data[:, :-1]
    y_test = test_arff.data[:, -1].reshape(-1, 1)

    model = DTClassifier(features=train_arff.get_attr_names())
    model.fit(X_train, y_train)

    test_accuracy = model.score(X_test, y_test)
    print("Train Accuracy=[{:.2f}]".format(model.score(X_train, y_train)))
    print("Accuracy=[{:.2f}]".format(test_accuracy))
def soybean():
    """Random search (16 draws) over tree depth / feature fraction with 10-fold CV on soybean.arff.

    Saves a per-fold trace to soybean.csv and exports the best tree (full and
    depth-5 truncated) via graphviz.
    """
    print("----------------soybean------------------")
    mat = Arff("./soybean.arff", label_count=1, missing=float(37.0))
    # data = mat.data[:, 0:-1]
    # labels = mat.data[:, -1]#.reshape(-1, 1)
    splits = 10
    kfolder = KFold(n_splits=splits)
    # Hold out 25% as a final test set; CV runs on the remaining 75%.
    data, tData, labels, tLabels = train_test_split(mat.data[:, :-1],
                                                    mat.data[:, -1].reshape(
                                                        -1, 1),
                                                    test_size=.25)
    # (best_cv_score, train_score, fitted_tree, max_depth, max_features)
    best_tree = (0, 0, None, -.1, -.1)
    trace = []
    for dummy_iterator in range(16):
        # Sample hyperparameters: depth in [1, feature count), feature fraction in [.1, 1).
        max_depth = np.random.randint(1, mat.features_count)
        max_features = np.random.uniform(.1, 1)
        print("max depth", max_depth, "max features", max_features)
        which_split = 0
        for train, validate in kfolder.split(data, labels):
            # print(train, validate)
            dtree = DecisionTreeClassifier(max_depth=max_depth,
                                           max_features=max_features)
            dtree.fit(data[train], labels[train])
            score = dtree.score(data[validate], labels[validate])
            train_score = dtree.score(data[train], labels[train])
            trace.append([
                dummy_iterator, which_split, score, train_score, max_depth,
                max_features
            ])
            which_split = which_split + 1
            # Track the single best validation fold across all draws.
            if score > best_tree[0]:
                print("score update", score)
                best_tree = (score, train_score, dtree, max_depth,
                             max_features)
    print(best_tree)
    print('Best tree accuracy: {:.2f}'.format(best_tree[2].score(
        tData, tLabels)))
    np.savetxt(
        "soybean.csv",
        trace,
        delimiter=',',
        header=
        "iteration_of_10_fold,which_fold,score,train_score,max_depth,max_features"
    )
    export_graphviz(best_tree[2], out_file="soybean_tree")
    export_graphviz(best_tree[2], out_file="soybean_tree_truncated", max_depth=5)
def voting():
    """10-fold CV of the custom decision tree on voting.arff; save fold scores and render a tree."""
    print("----------------voting------------------")
    mat = Arff("./voting.arff", label_count=1)
    # data = mat.data[:, 0:-1]
    # labels = mat.data[:, -1]#.reshape(-1, 1)
    splits = 10
    kfolder = KFold(n_splits=splits)
    # scores[0]: validation accuracy per fold; scores[1]: train accuracy per fold.
    scores = [[], []]
    data, tData, labels, tLabels = train_test_split(mat.data[:, :-1],
                                                    mat.data[:, -1].reshape(
                                                        -1, 1),
                                                    test_size=.25)
    best_tree = (0, None)  # (best validation accuracy, fitted tree)
    for train, validate in kfolder.split(data, labels):
        # print(train, validate)
        dtree = DTClassifier(features=mat.get_attr_names())
        dtree.fit(data[train], labels[train])
        scores[0].append(dtree.score(data[validate], labels[validate]))
        scores[1].append(dtree.score(data[train], labels[train]))
        if scores[0][-1] > best_tree[0]:
            best_tree = (scores[0][-1], dtree)
    # Append the per-row average as a final column.
    average = np.sum(scores, axis=1) / splits
    scores[0].append(average[0])
    scores[1].append(average[1])
    header_text = ''
    for x in range(splits):
        header_text = header_text + str(x) + ' '
    np.savetxt("voting.csv",
               scores,
               header=header_text + 'average',
               delimiter=',')
    print(scores)
    print('Average CV accuracy: {:.2f}'.format(scores[0][-1]))
    print('Best tree accuracy: {:.2f}'.format(best_tree[1].score(
        tData, tLabels)))
    # NOTE(review): graphs the final fold's tree (dtree), not best_tree — confirm intent.
    f = open("voting_tree", "w")
    f.write(dtree.graph(class_translator=lambda x: mat.attr_value(-1, x)))
    f.close()
def part3():
    """KNN regression (no distance weighting) on housing prices: sweep k values and plot test MSE."""
    print('Running Part 3...')
    # Part 3 Data sets:
    mat = Arff("../data/knn/housing-price/hp_training.arff", label_count=1)
    mat2 = Arff("../data/knn/housing-price/hp_testing.arff", label_count=1)
    k_neighbors = 3

    # Last column is the regression target.
    # (Removed unused `h, w = raw_data.shape` unpackings.)
    raw_data = mat.data
    train_data = raw_data[:, :-1]
    train_labels = raw_data[:, -1]
    raw_data2 = mat2.data
    test_data = raw_data2[:, :-1]
    test_labels = raw_data2[:, -1]

    # Normalize Data.
    train_data, test_data = normalizeDataSets(train_data, test_data)

    print('Running K Values tests...')
    k_values = [1, 3, 5, 7, 9, 11, 13, 15]
    mses = []
    for k_val in k_values:
        KNN = KNNClassifier(label_type='regression',
                            weight_type='no_weight',
                            k_neighbors=k_val)
        KNN.fit(train_data, train_labels)
        # assumes score() returns MSE in regression mode — TODO confirm
        score = KNN.score(test_data, test_labels)
        mses.append(score)

    print('Plotting K values vs MSE Scores...')
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.plot(k_values, mses, label='MSE')
    for xy in zip(k_values, mses):  # annotate each (k, mse) point
        ax.annotate('(%s, %.2f)' % xy, xy=xy, textcoords='data')
    # Fixed typo in chart title ("K Value sand" -> "K Values and").
    plt.title('MSE Using Different K Values and No Distance Weighting')
    plt.xlabel('K Values')
    plt.ylabel('MSE')
    plt.legend()
    plt.savefig('k-value-and-mse.png')
    plt.show()
def main():
    """Print the outliers (threshold 0.05) of two ARFF files given on the command line."""
    arff_file1, arff_file2 = read_args()
    arff1 = Arff.fromfile(arff_file1)
    arff2 = Arff.fromfile(arff_file2)
    # Normalize attributes so no single feature dominates the distance metric.
    norm_arff1 = arff1.normalize()
    norm_arff2 = arff2.normalize()
    space1= Space(len(norm_arff1.attributes))
    populate_space_from_arff(norm_arff1, space1)
    print "Outliers from "+arff_file1+":"
    for x in get_outliers(space1, 0.05):
        print x
    space2 = Space(len(norm_arff2.attributes))
    populate_space_from_arff(norm_arff2, space2)
    print "Outliers from "+arff_file2+":"
    for x in get_outliers(space2, 0.05):
        print x
def debug():
    """Deterministic 10-epoch perceptron run on the linsep debug set; print accuracy and weights."""
    print("------------arff-------------------")
    mat = Arff("../data/perceptron/debug/linsep2nonorigin.arff", label_count=1)
    data = mat.data[:, 0:-1]
    labels = mat.data[:, -1].reshape(-1, 1)
    PClass = PerceptronClassifier(lr=0.1,
                                  shuffle=False,
                                  deterministic=10,
                                  printIt=False)
    PClass.fit(data, labels)
    Accuracy = PClass.score(data, labels)
    # Fixed typo in output label ("Accuray" -> "Accuracy").
    print("Accuracy = [{:.2f}]".format(Accuracy))
    print("Final Weights =", PClass.get_weights())
def nan_lenses():
    """Train and test the custom decision tree on nan_lenses.arff with a 75/25 split."""
    print("----------------nan_lenses------------------")
    arff = Arff("./nan_lenses.arff", label_count=1)
    features = arff.data[:, :-1]
    targets = arff.data[:, -1].reshape(-1, 1)
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        targets,
                                                        test_size=.25)

    model = DTClassifier(features=arff.get_attr_names())
    model.fit(X_train, y_train)
    print(model.tree)

    accuracy = model.score(X_test, y_test)
    print("Accuracy=[{:.2f}]".format(accuracy))
def evaluate():
    """Train the MLP on the banknote-authentication set and print its CSV trace."""
    print("------------eval-------------------")
    arff = Arff("../data/perceptron/evaluation/data_banknote_authentication.arff",
                label_count=1)
    features = arff.data[:, 0:-1]
    targets = arff.data[:, -1].reshape(-1, 1)

    # One hidden layer with twice as many nodes as input features.
    net = MLPClassifier([2 * np.shape(features)[1]],
                        lr=0.1,
                        shuffle=False,
                        deterministic=10)
    net.fit(features, targets, momentum=0.5, percent_verify=0, standard_weight=0)
    accuracy = net.score(features, targets)
    print(net.csv_print())
def evaluation():
    """Deterministic 10-epoch perceptron run on the banknote-authentication evaluation set."""
    print("--------------arf2------------------------------")
    mat = Arff("../data/perceptron/evaluation/data_banknote_authentication.arff",
               label_count=1)
    np_mat = mat.data
    data = mat[:, :-1]
    labels = mat[:, -1].reshape(-1, 1)
    #### Make Classifier ####
    P2Class = PerceptronClassifier(lr=0.1, shuffle=False, deterministic=10)
    P2Class.fit(data, labels)
    Accuracy = P2Class.score(data, labels)
    # Fixed typo in output label ("Accuray" -> "Accuracy").
    print("Accuracy = [{:.2f}]".format(Accuracy))
    print("Final Weights =", P2Class.get_weights())
def evaluation():
    """Train on zoo.arff, predict all_zoo.arff, save the predictions, and report accuracies."""
    print("----------------evaluation---------------")
    train_arff = Arff("./zoo.arff", label_count=1)
    test_arff = Arff("./all_zoo.arff", label_count=1)

    X_train = train_arff.data[:, :-1]
    y_train = train_arff.data[:, -1].reshape(-1, 1)
    X_test = test_arff.data[:, :-1]
    y_test = test_arff.data[:, -1].reshape(-1, 1)

    model = DTClassifier(features=train_arff.get_attr_names())
    model.fit(X_train, y_train)
    print("Train Accuracy=[{:.2f}]".format(model.score(X_train, y_train)))

    # Persist per-instance predictions before scoring on the full set.
    predictions = model.predict(X_test)
    np.savetxt('predicted_zoo.csv',
               predictions,
               delimiter=',',
               header="predicted")
    test_accuracy = model.score(X_test, y_test)
    print("Accuracy=[{:.2f}]".format(test_accuracy))
def main():
    """Generate 100 two-dimensional normally distributed points and write them to synthetic.arff."""
    num_points = 100
    dim = 2
    means = [2.4, 5.5, 6.2]
    sigmas = [1, 1.5, 0.5]
    raw_points = generate_norm(num_points, dim, means, sigmas)

    # One 'real' attribute per dimension: f0, f1, ...
    attrs = [('f' + str(d), 'real') for d in range(dim)]

    # Each point is a (1-based id, {attribute name: value}) pair.
    points = []
    for idx in range(num_points):
        values = {}
        for col, (name, _) in enumerate(attrs):
            values[name] = raw_points[idx][col]
        points.append((idx + 1, values))

    relation = 'DATASET'
    arff = Arff(relation, attrs, points)
    arff._print_arff()
    arff.output_file("synthetic.arff")
def debug():
    """Train the MLP on the linsep debug set and dump each layer's weights to its own CSV file."""
    print("------------arff-------------------")
    arff = Arff("../data/perceptron/debug/linsep2nonorigin.arff", label_count=1)
    features = arff.data[:, 0:-1]
    targets = arff.data[:, -1].reshape(-1, 1)

    # One hidden layer with twice as many nodes as input features.
    net = MLPClassifier([2 * np.shape(features)[1]],
                        lr=0.1,
                        shuffle=False,
                        deterministic=10)
    net.fit(features, targets, momentum=0.5, percent_verify=0, standard_weight=0)
    accuracy = net.score(features, targets)

    # Persist the learned weights, one CSV per layer.
    for idx, layer_weights in enumerate(net.get_weights()):
        np.savetxt("linsep_weights_eval_" + str(idx) + ".csv",
                   layer_weights,
                   delimiter=',')
def runMahCode(arff, shuffle=True, determ=0, training=False, lr=.1, quiet=False):
    """Train a perceptron on an ARFF file and return its accuracy.

    Args:
        arff: path to the ARFF data file (last column is the label).
        shuffle: shuffle data between epochs.
        determ: deterministic epoch count passed to the classifier (0 = off).
        training: if True, score on a held-out split instead of the training data.
        lr: learning rate.
        quiet: if True, suppress the accuracy/weights printout.

    Returns:
        The accuracy score. (Previously the function returned None unless
        quiet was set; it now always returns the score — backward compatible.)
    """
    mat = Arff(arff, label_count=1)
    data = mat.data[:, 0:-1]
    labels = mat.data[:, -1:]
    PClass = PerceptronClassifier(lr=lr, shuffle=shuffle, deterministic=determ)
    if training:
        # Fit on the training portion, score on the held-out portion.
        X_train, y_train, X_test, y_test = PerceptronClassifier.split_training(
            data, labels)
        PClass.fit(X_train, y_train)
        Accuracy = PClass.score(X_test, y_test)
    else:
        # Fit and score on the full data set.
        PClass.fit(data, labels)
        Accuracy = PClass.score(data, labels)
    if not quiet:
        print("Accuracy = [{:.5f}]".format(Accuracy))
        print("Final Weights =", PClass.get_weights())
    return Accuracy
def extractAll(self, nomeArquivoArff=None, classes=None, overwrite=True):
    """Extract features from every image of each class in the image bank and write one ARFF file.

    nomeArquivoArff: optional output file name (resolved inside the bank folder;
        defaults to the bank's own ARFF name).
    classes: optional subset of classes (defaults to every class in the bank).
    overwrite: when False, skip extraction if the ARFF file already exists.
    """
    #print 'Gerando ARFF para o Banco de Imagens ' + self.nomeBancoImagens + "..."
    bancoImagens = BancoImagens(self.nomeBancoImagens, self.nomePastaRaiz)
    extratores = Extratores()
    print 'Localização do Banco ' + bancoImagens.pastaBancoImagens
    nomeArquivoArff = bancoImagens.nomeArquivoArff if nomeArquivoArff is None else bancoImagens.pastaBancoImagens + nomeArquivoArff
    if overwrite == False and os.path.isfile(nomeArquivoArff):
        print 'Arquivo ARFF encontrado em ' + nomeArquivoArff
        return
    if classes is None:
        classes = bancoImagens.classes
    print 'Classes Encontradas'
    print classes
    # Feature extraction for every image of each class starts here.
    dados = []
    nomesAtributos = []
    tiposAtributos = []
    valoresAtributos = []
    for classe in classes:
        imagens = bancoImagens.imagens_da_classe(classe)
        #print "Processando %s imagens da classe %s " % (len(imagens),classe)
        for imagem in imagens:
            nomesAtributos, tiposAtributos, valoresAtributos = extratores.extrai_todos(
                imagem)
            # Each data row: extracted attribute values followed by the class label.
            dados.append(valoresAtributos + [classe])
    if len(classes) > 0:
        Arff().cria(nomeArquivoArff, dados, self.nomeBancoImagens,
                    nomesAtributos, tiposAtributos, bancoImagens.classes)
        print 'Arquivo ARFF gerado em ' + nomeArquivoArff
def extractOneFile(self, nomeArquivoArff, nomeImagem):
    """Extract features from a single image and write them as a one-row ARFF file."""
    bancoImagens = BancoImagens(self.nomeBancoImagens, self.nomePastaRaiz)
    extratores = Extratores()
    nomeArquivoArff = bancoImagens.pastaBancoImagens + nomeArquivoArff

    # Load the image from the bank folder and run every extractor on it.
    imagem = cv2.imread(bancoImagens.pastaBancoImagens + nomeImagem)
    nomesAtributos, tiposAtributos, valoresAtributos = extratores.extrai_todos(
        imagem)

    # Single data row: feature values plus the bank's first class as the label.
    dados = [valoresAtributos + [bancoImagens.classes[0]]]
    Arff().cria(nomeArquivoArff, dados, self.nomeBancoImagens,
                nomesAtributos, tiposAtributos, bancoImagens.classes)
def sk_voting():
    """10-fold CV of sklearn's DecisionTreeClassifier on voting.arff; save fold scores to CSV."""
    print("----------------sk_voting------------------")
    mat = Arff("./voting.arff", label_count=1, missing=float(37.0))
    # data = mat.data[:, 0:-1]
    # labels = mat.data[:, -1]#.reshape(-1, 1)
    splits = 10
    kfolder = KFold(n_splits=splits)
    # scores[0]: validation accuracy per fold; scores[1]: train accuracy per fold.
    scores = [[], []]
    data, tData, labels, tLabels = train_test_split(mat.data[:, :-1],
                                                    mat.data[:, -1].reshape(
                                                        -1, 1),
                                                    test_size=.25)
    best_tree = (0, None)  # (best validation accuracy, fitted tree)
    for train, validate in kfolder.split(data, labels):
        # print(train, validate)
        dtree = DecisionTreeClassifier()
        dtree.fit(data[train], labels[train])
        scores[0].append(dtree.score(data[validate], labels[validate]))
        scores[1].append(dtree.score(data[train], labels[train]))
        if scores[0][-1] > best_tree[0]:
            best_tree = (scores[0][-1], dtree)
    # Append the per-row average as a final column.
    average = np.sum(scores, axis=1) / splits
    scores[0].append(average[0])
    scores[1].append(average[1])
    header_text = ''
    for x in range(splits):
        header_text = header_text + str(x) + ' '
    np.savetxt("sk_voting.csv",
               scores,
               header=header_text + 'average',
               delimiter=',')
    print(scores)
    print('Average CV accuracy: {:.2f}'.format(scores[0][-1]))
    print('Best tree accuracy: {:.2f}'.format(best_tree[1].score(
        tData, tLabels)))
def iris():
    """Train the MLP on the iris set with one-hot labels; save the training trace and print test accuracy."""
    print("-------------iris----------------")
    mat = Arff("../data/perceptron/iris.arff", label_count=3)
    y = mat.data[:,-1]
    # print(y)
    # One-hot encode the class labels.
    lb = preprocessing.LabelBinarizer()
    lb.fit(y)
    y = lb.transform(y)
    # split it
    # data, labels, tData, tLabels = _shuffle_split(mat.data[:, :-1], y, .25)
    data, tData, labels, tLabels = train_test_split(mat.data[:, :-1],
                                                    y,
                                                    test_size=.25)
    MLPClass = MLPClassifier([2*np.shape(data)[1]], lr=0.1, shuffle=True,
                             one_hot=True)
    MLPClass.fit(data, labels, momentum=0.5, percent_verify=.25)
    # NOTE(review): `reduce` is called with a single argument, so it cannot be
    # functools.reduce — presumably a project helper that joins the header row; confirm.
    np.savetxt("Iris_eval.csv",
               MLPClass.stupidData[1:],
               header=reduce(MLPClass.stupidData[0]),
               delimiter=',')
    accuracy = MLPClass.score(tData, tLabels)
    print("Test Accuracy = [{:.2f}]".format(accuracy))
def sci_kit():
    """Randomly sample MLP hyperparameters 16 times; compare the custom MLP to sklearn's on tic-tac-toe."""
    print("------------------------sci-kit learn------------------")
    mat = Arff("./tic-tac-toe.arff", label_count=1)
    y = mat.data[:,-1]
    # Binarize the labels for one-hot training.
    lb = preprocessing.LabelBinarizer()
    lb.fit(y)
    y = lb.transform(y)
    data, tData, labels, tLabels = train_test_split(mat.data[:, :-1],
                                                    y,
                                                    test_size=.25)
    # going to randomly search learn_rate, number of nodes, number of hidden layers, and momentum
    for dummy_iterator in range(16):  # 4 features times 4 features
        MLPClass = None
        real_one = None
        # Sample this trial's hyperparameters.
        learn_rate = np.random.uniform(0.001, 10)
        number_of_nodes = int(abs(round(np.random.normal(
            2*np.shape(data)[1], round(2*np.shape(data)[1] - 0.6)))))
        hidden_layers = np.random.randint(1, 4)
        momentum = abs(np.random.normal(0, 0.1))
        # Guard against a zero-node draw: at least one node per hidden layer.
        nodes = [1 if number_of_nodes == 0 else number_of_nodes] * hidden_layers
        print("learn rate", learn_rate, "layers", nodes, "momentum", momentum)
        # NOTE(review): the custom MLP is built with fixed lr/layers/momentum, so only the
        # sklearn model actually uses the sampled hyperparameters — confirm this is intended.
        MLPClass = MLPClassifier([2*np.shape(data)[1]], lr=0.1, shuffle=True,
                                 one_hot=True)
        MLPClass.fit(data, labels, momentum=0.5, percent_verify=.25)
        real_one = Skl_Classifier(nodes,
                                  momentum=momentum,
                                  learning_rate_init=learn_rate,
                                  activation='logistic',
                                  early_stopping=True,
                                  validation_fraction=.25)
        print("x")
        real_one.fit(data, np.reshape(labels, (-1,)))
        real_accuracy = real_one.score(tData, np.reshape(tLabels, (-1,)))
        accuracy = MLPClass.score(tData, tLabels)
        print(accuracy, "vs", real_accuracy)
def voting():
    """Run the perceptron five times on shuffled 70/30 splits of vote.arff; graph per-run score deltas."""
    print("--------------voting---------------------")
    mat = Arff("../data/perceptron/vote.arff", label_count=1)
    np_mat = mat.data
    avg = []
    for iteration in range(5):
        print("xxxxxxxxxxx " + str(iteration) + " xxxxxxxx")
        # Fresh 70/30 train/test split each iteration.
        training, testing = _shuffle_split(mat.data, .3)
        data = training[:, :-1]
        labels = training[:, -1].reshape(-1, 1)
        P5Class = PerceptronClassifier(lr=0.1, shuffle=True)
        P5Class.fit(data, labels)
        Accuracy = P5Class.score(data, labels)
        print("Accuracy = [{:.2f}]".format(Accuracy))
        print("Epochs = ", P5Class.get_epochs_trained())
        tData = testing[:, :-1]
        tLabels = testing[:, -1].reshape(-1, 1)
        tAccuracy = P5Class.score(tData, tLabels)
        print("Test Accuracy = [{:.2f}]".format(tAccuracy))
        # Rank weights by absolute magnitude to surface the most influential features.
        weights = P5Class.get_weights()
        print(weights)
        sort_weights = sorted(zip(weights, list(range(len(weights)))),
                              key=lambda x: abs(x[0]),
                              reverse=True)
        print("sorted:\r\n", sort_weights)
        # Average per-epoch score improvement for this run.
        scores = P5Class.getTrace().getColumns("epochScore")
        print('scores', scores)
        avg.append((float(scores[-2][0]) - float(scores[0][0])) / len(scores))
    print('avg', avg)
    grapher = Grapher()
    grapher.graph(list(range(len(avg))), avg, labels=[1]*len(avg),
                  points=False, title="Average Scores", xlabel="Iteration",
                  ylabel="score")
    grapher.show("AverageScores.svg")
def part5():
    """Run KNN (k=3, inverse-distance weighting) on the credit-approval set with a 67/33 split."""
    print('Running Part 5 ...')
    # Part credit approval Data sets:
    arff = Arff("../data/creditapproval.arff", label_count=1)
    k_val = 3

    full = arff.data
    features = full[:, :-1]
    targets = full[:, -1]

    # Split data and labels into test and train sets (no shuffling, 33% test).
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        targets,
                                                        test_size=0.33,
                                                        shuffle=False)

    # Normalize Data.
    X_train, X_test = normalizeDataSets(X_train, X_test)

    knn = KNNClassifier(label_type='classification',
                        weight_type='inverse_distance',
                        k_neighbors=k_val)
    knn.fit(X_train, y_train)
    score = knn.score(X_test, y_test)
from perceptron import PerceptronClassifier
from arff import Arff
import numpy as np

# Evaluation run: deterministic 10-epoch perceptron on the banknote-authentication set.
mat = Arff("../data/perceptron/evaluation/data_banknote_authentication.arff",
           label_count=1)
data = mat.data[:, 0:-1]
labels = mat.data[:, -1:]

PClass = PerceptronClassifier(lr=0.1, shuffle=False, deterministic=10)
PClass.fit(data, labels)
Accuracy = PClass.score(data, labels)

# Fixed typo in output label ("Accuray" -> "Accuracy").
print("Accuracy = [{:.5f}]".format(Accuracy))
print("Final Weights =", PClass.get_weights())
else: print("class array", classificationArray[0]) counts = np.bincount(classificationArray) finalPred = np.argmax(counts) # print(finalPred, "final pred") return finalPred if __name__ == "__main__": # mat = Arff("magic_telescope_train.arff",label_count=1) # mat2 = Arff("magic_telescope_test.arff",label_count=1) # mat = Arff("diabetes.arff",label_count=1) # mat2 = Arff("diabetes_test.arff",label_count=1) mat = Arff("seismic-bumps_train.arff",label_count=1) mat2 = Arff("seismic-bumps_test.arff",label_count=1) # mat = Arff("house_train.arff",label_count=1) # mat2 = Arff("house_test.arff",label_count=1) # mat = Arff("credit.arff",label_count=1) raw_data = mat.data h,w = raw_data.shape train_data = raw_data[:,:-1] train_labels = raw_data[:,-1] raw_data2 = mat2.data h2,w2 = raw_data2.shape test_data = raw_data2[:,:-1] test_labels = raw_data2[:,-1] # neigh = KNeighborsClassifier(n_neighbors=15)
from arff import Arff import numpy as np from sklearn.neural_network import MLPClassifier from my_shuffling import shuffle from splitter import split from warnings import filterwarnings filterwarnings('ignore') mat = Arff("default.arff", label_count=0) print("parsed") sum_of_incorrect_labels = 0 max_counted = 0 most_common_data_indexes = None data = mat.data[:, :-1] labels = mat.data[:, -1:] unique_data = np.unique(data, axis=0) for ud in unique_data: indexes = np.where((data == ud).all(axis=1))[0] labels_for_same_game = labels[indexes] labels_for_same_game = np.unique(labels_for_same_game, return_counts=True) if len(labels_for_same_game[1]) > 1: maximum_indx = np.argmax(labels_for_same_game[1]) incorrect_count = sum( labels_for_same_game[1]) - labels_for_same_game[1][maximum_indx] sum_of_incorrect_labels += incorrect_count # print(sum_of_incorrect_labels) if sum(labels_for_same_game[1]) > max_counted: max_counted = sum(labels_for_same_game[1])
classes = bancoImagens.classes print 'Classes Encontradas' print classes # Aqui começa a extração de atributos de todas as imagens de cada classe dados = [] nomesAtributos = [] tiposAtributos = [] valoresAtributos = [] for classe in classes: imagens = bancoImagens.imagens_da_classe(classe) print "Processando %s imagens da classe %s " % (len(imagens), classe) for imagem in imagens: nomesAtributos, tiposAtributos, valoresAtributos = extratores.extrai_todos( imagem) dados.append(valoresAtributos + [classe]) if len(classes) > 0: Arff().cria(bancoImagens.nomeArquivoArff, dados, nomeBancoImagens, nomesAtributos, tiposAtributos, classes) print 'Arquivo ARFF gerado em ' + bancoImagens.nomeArquivoArff
from perceptron import PerceptronClassifier
from arff import Arff
import numpy as np

# Debug run: deterministic 10-epoch perceptron on the linsep2nonorigin set.
mat = Arff("../data/perceptron/debug/linsep2nonorigin.arff", label_count=1)
data = mat.data[:, 0:-1]
labels = mat.data[:, -1].reshape(-1, 1)

PClass = PerceptronClassifier(lr=0.1, shuffle=False, deterministic=10)
PClass.fit(data, labels)
Accuracy = PClass.score(data, labels)

# Fixed typo in output label ("Accuray" -> "Accuracy").
print("Accuracy = [{:.2f}]".format(Accuracy))
print("Final Weights =", PClass.get_weights())
# # 4. Learn Voting # First I want to know the baseline if I just try to fit on the entire dataset # In[15]: runMahCode("standardVoting.arff") # Now trying it slightly random five times: # In[16]: mat = Arff("standardVoting.arff",label_count=1) data = mat.data[:,0:-1] labels = mat.data[:,-1:] test = [] train = [] iters = [] all_scores = [] print("Iterations | Training | Testing") for i in range(5): PClass = PerceptronClassifier(lr=.1,shuffle=True) Accuracy = 0.0 X_train, y_train, X_test, y_test = PerceptronClassifier.split_training(data,labels) trash, iterr, scores = PClass.fit(X_train,y_train, quiet=True) all_scores.append(scores) training = PClass.score(X_train,y_train) testing = PClass.score(X_test,y_test)