Example #1
def main():
  arff_file1, arff_file2 = read_args()
  arff1 = Arff.fromfile(arff_file1)
  arff2 = Arff.fromfile(arff_file2)
  norm_arff1 = arff1.normalize()
  norm_arff2 = arff2.normalize()

  space1 = Space(len(norm_arff1.attributes))
  populate_space_from_arff(norm_arff1, space1)

  print("Outliers from " + arff_file1 + ":")
  outliers1 = get_outliers(3, 5, space1)
  if outliers1:
    for x in outliers1:
      print(x)
  else:
    print("None")

  space2 = Space(len(norm_arff2.attributes))
  populate_space_from_arff(norm_arff2, space2)

  print("Outliers from " + arff_file2 + ":")
  outliers2 = get_outliers(3, 12, space2)
  if outliers2:
    for x in outliers2:
      print(x)
  else:
    print("None")
Example #2
def part1():
    print('Running Part 1...')
    # Debug Data sets:
    # mat = Arff("../data/knn/debug/seismic-bumps_train.arff",label_count=1)
    # mat2 = Arff("../data/knn/debug/seismic-bumps_test.arff",label_count=1)

    # Evaluation Data sets:
    mat = Arff("../data/knn/evaluation/diabetes.arff", label_count=1)
    mat2 = Arff("../data/knn/evaluation/diabetes_test.arff", label_count=1)

    k_neighbors = 3
    raw_data = mat.data
    h, w = raw_data.shape
    train_data = raw_data[:, :-1]
    train_labels = raw_data[:, -1]

    raw_data2 = mat2.data
    h2, w2 = raw_data2.shape
    test_data = raw_data2[:, :-1]
    test_labels = raw_data2[:, -1]

    KNN = KNNClassifier(label_type='classification',
                        weight_type='inverse_distance',
                        k_neighbors=k_neighbors)
    print("Fitting data ...")
    KNN.fit(train_data, train_labels)
    print("Predict data ...")
    pred = KNN.predict(test_data)
    print("Scoring data ...")
    score = KNN.score(test_data, test_labels)
    np.savetxt("diabetes-prediction.csv", pred, delimiter=',', fmt="%i")
    print("Accuracy = [{:.2f}]\n".format(score * 100))
Example #3
def sk_learn(data="oldGames.arff", min_split=300, min_leaf=15):
    folds = 10
    mat = Arff(data, label_count=1)

    counts = []  # number of unique values in each column (value types per attribute)
    for i in range(mat.data.shape[1]):
        counts += [mat.unique_value_count(i)]

    # np.random.seed(35)
    np.random.shuffle(mat.data)
    splits = np.array_split(mat.data, folds)

    Acc = 0
    # min_split = 300
    # print("Minsplit: {}".format(min_split))
    for f in range(folds):
        # print("Fold {}:".format(f))
        train = np.array([])
        for other in range(folds):
            if train.size == 0 and other != f:
                train = splits[other].copy()
            elif other != f:
                train = np.concatenate((train, splits[other]))

        data = train[:, 0:-1]
        labels = train[:, -1].reshape(-1, 1)

        clf = tree.DecisionTreeClassifier()  # min_samples_split=min_split, min_samples_leaf=min_leaf
        clf = clf.fit(data, labels)
        pred = clf.predict(data)
        new_acc = score(pred, labels)
        # print("\tTrain Acc {}".format(new_acc))

        data2 = splits[f][:, 0:-1]
        labels2 = splits[f][:, -1].reshape(-1, 1)
        pred = clf.predict(data2)
        new_acc = score(pred, labels2)
        # print("\tTest Acc {}".format(new_acc))
        Acc += new_acc

    Acc = Acc / folds
    print("Accuracy = [{:.4f}]".format(Acc))

    classes = [
        "Overwhelmingly_Positive", "Very_Positive", "Positive",
        "Mostly_Positive", "Mixed", "Mostly_Negative", "Negative",
        "Very_Negative", "Overwhelmingly_Negative"
    ]
    dot_data = tree.export_graphviz(clf,
                                    out_file=None,
                                    feature_names=mat.get_attr_names()[:-1],
                                    class_names=classes,
                                    filled=True,
                                    rounded=True)  # max_depth=6,
    graph = graphviz.Source(dot_data)
    graph.render("old_games")

    return Acc
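The score(pred, labels) helper used inside the folds is not defined in this snippet; it is presumably a plain accuracy function along these lines (an assumption):

import numpy as np

def score(pred, labels):
    # Fraction of predictions that match the true labels.
    return np.mean(pred.reshape(-1) == labels.reshape(-1))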
Example #4
def part2():
    print('Running Part 2...')
    # Part 2 Data sets:
    mat = Arff("../data/knn/magic-telescope/mt_training.arff", label_count=1)
    mat2 = Arff("../data/knn/magic-telescope/mt_testing.arff", label_count=1)

    k_neighbors = 3
    raw_data = mat.data
    h, w = raw_data.shape
    train_data = raw_data[:, :-1]
    train_labels = raw_data[:, -1]

    raw_data2 = mat2.data
    h2, w2 = raw_data2.shape
    test_data = raw_data2[:, :-1]
    test_labels = raw_data2[:, -1]

    KNN = KNNClassifier(label_type='classification',
                        weight_type='no_weight',
                        k_neighbors=k_neighbors)
    print("Fitting data ...")
    KNN.fit(train_data, train_labels)
    print("Scoring data ...")
    score = KNN.score(test_data, test_labels)
    print("Accuracy = [{:.2f}]\n".format(score * 100))

    norm_train_data, norm_test_data = normalizeDataSets(train_data, test_data)

    print("Fitting normalized data ...")
    KNN.fit(norm_train_data, train_labels)
    print("Scoring normalized data ...")
    score = KNN.score(norm_test_data, test_labels)
    print("Accuracy = [{:.2f}]\n".format(score * 100))

    print('Running K Values tests...')
    k_values = [1, 3, 5, 7, 9, 11, 13, 15]
    scores = []
    for k_val in k_values:
        KNN = KNNClassifier(label_type='classification',
                            weight_type='no_weight',
                            k_neighbors=k_val)
        KNN.fit(norm_train_data, train_labels)
        score = KNN.score(norm_test_data, test_labels)
        scores.append(score * 100)

    print('Plotting K values vs Scores w/ Normalization...')
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.plot(k_values, scores, label='Accuracy')
    for xy in zip(k_values, scores):  # annotate each (k, accuracy) point
        ax.annotate('(%s, %.1f)' % xy, xy=xy, textcoords='data')
    plt.title('Accuracy Using Different K Values and No Distance Weighting')
    plt.xlabel('K Values')
    plt.ylabel('Accuracy (%)')
    plt.legend()
    plt.savefig('k-value-and-accuracy.png')
    plt.show()
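normalizeDataSets is not shown. A common implementation, and a plausible guess here, is min-max scaling where the column ranges are computed on the training set only and then applied to both sets so nothing leaks from the test data:

import numpy as np

def normalizeDataSets(train_data, test_data):
    # Column-wise min-max scaling using training-set statistics only.
    col_min = train_data.min(axis=0)
    col_range = train_data.max(axis=0) - col_min
    col_range[col_range == 0] = 1  # guard against constant columns
    return ((train_data - col_min) / col_range,
            (test_data - col_min) / col_range)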
Example #5
    def main(self):
        if self.eval_method == "training":
            self.train(self.arff.get_features(), self.arff.get_labels())
            self._print_confusion_matrix(self.arff.get_features(),
                                         self.arff.get_labels())
        elif self.eval_method == "random":
            train_features, train_labels, test_features, test_labels = self.training_test_split(
                train_percent=self.eval_parameter)
            self.train(train_features, train_labels)
            self.test(test_features, test_labels)
            self._print_confusion_matrix(test_features, test_labels)

        elif self.eval_method == "static":
            self.train(self.arff.get_features(), self.arff.get_labels())
            arff_file = self.eval_parameter
            test_data = Arff(arff_file)
            if self.normalize:
                test_data.normalize()
            self.test(features=test_data.get_features(),
                      labels=test_data.get_labels())
            self._print_confusion_matrix(features=test_data.get_features(),
                                         labels=test_data.get_labels())

        elif self.eval_method == "cross":
            # print('PARAMETER')
            self.eval_parameter = int(self.eval_parameter)
            self.cross_validate(
                self.eval_parameter)  # confusion matrix not supported for CV
            type(self.eval_parameter)
        else:
            raise Exception("Unrecognized evaluation method '{}'".format(
                self.eval_method))
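training_test_split is a method of the surrounding class and is not shown. A hypothetical standalone equivalent that shuffles the rows and splits by percentage might look like:

import numpy as np

def training_test_split(features, labels, train_percent=0.7):
    # Shuffle row indices, then split at the requested fraction.
    idx = np.random.permutation(len(features))
    cut = int(len(features) * train_percent)
    return (features[idx[:cut]], labels[idx[:cut]],
            features[idx[cut:]], labels[idx[cut:]])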
Example #6
def separable():
    print("----------------separable-----------------------")

    mat = Arff("./separableIsSquare.arff", label_count=1)
    np_mat = mat.data
    data = mat[:, :-1]
    labels = mat[:, -1].reshape(-1, 1)
    print(data[:, 1])
    print(labels)

    ### Make the Classifier #####
    P3Class = None
    for lr in range(10, 0, -1):
        P3Class = PerceptronClassifier(lr=0.1*lr, shuffle=False)
        P3Class.fit(data, labels, standard_weight_value=None)
        Accuracy = P3Class.score(data, labels)
        print("Learning Rate = ", 0.1*lr)
        print("Accuracy = [{:.2f}]".format(Accuracy))
        print("Epochs = ", P3Class.get_epochs_trained())
    # print(P3Class)


    ## could not get graphing to work in time...
    # graph(data[:, 0], data[:, 1], labels=mat[:, -1])

    w = P3Class.get_weights()
    y = lambda x: (-w[0]/w[1])*x - (w[2]/w[1])

    grapher = Grapher()
    grapher.graph(data[:, 0], data[:, 1], labels=mat[:, -1], title="Separable")
    grapher.add_function(y)

    grapher.show("separable.svg")
Example #7
def inseparable():
    print("----------------Inseparable-----------------------")

    mat = Arff("./impossible.arff", label_count=1)
    np_mat = mat.data
    data = mat[:, :-1]
    labels = mat[:, -1].reshape(-1, 1)

    ### Make the Classifier #####
    P4Class = None
    for lr in range(10, 0, -1):
        P4Class = PerceptronClassifier(lr=0.1*lr, deterministic=10, shuffle=False)
        P4Class.fit(data, labels, standard_weight_value=None)
        Accuracy = P4Class.score(data, labels)
        print("Learning Rate = ", 0.1*lr)
        print("Accuracy = [{:.2f}]".format(Accuracy))
        print("Epochs = ", P4Class.get_epochs_trained())

    w = P4Class.get_weights()
    y = lambda x: (-w[0]/w[1])*x - (w[2]/w[1])

    grapher = Grapher()
    grapher.graph(data[:, 0], data[:, 1], labels=mat[:, -1], title="Inseparable")
    grapher.add_function(y)

    grapher.show("Inseparable.svg")
Example #8
def all_lenses():
    print("---------all-lenses----------")

    lens_data = Arff("./lenses.arff", label_count=1)
    all_lens_data = Arff("./all_lenses.arff", label_count=1)

    lens_train = lens_data.data[:, :-1]
    lens_label_train = lens_data.data[:, -1].reshape(-1, 1)
    lens_test = all_lens_data.data[:, :-1]
    lens_label_test = all_lens_data.data[:, -1].reshape(-1, 1)

    dtree = DTClassifier(features=lens_data.get_attr_names())
    dtree.fit(lens_train, lens_label_train)
    score = dtree.score(lens_test, lens_label_test)
    print("Train Accuracy=[{:.2f}]".format(
        dtree.score(lens_train, lens_label_train)))
    print("Accuracy=[{:.2f}]".format(score))
Example #9
def soybean():
    print("----------------soybean------------------")

    mat = Arff("./soybean.arff", label_count=1, missing=float(37.0))
    # data = mat.data[:, 0:-1]
    # labels = mat.data[:, -1]#.reshape(-1, 1)
    splits = 10
    kfolder = KFold(n_splits=splits)

    data, tData, labels, tLabels = train_test_split(
        mat.data[:, :-1], mat.data[:, -1].reshape(-1, 1), test_size=.25)

    best_tree = (0, 0, None, -.1, -.1)

    trace = []

    for dummy_iterator in range(16):
        max_depth = np.random.randint(1, mat.features_count)
        max_features = np.random.uniform(.1, 1)
        print("max depth", max_depth, "max features", max_features)
        which_split = 0
        for train, validate in kfolder.split(data, labels):
            # print(train, validate)
            dtree = DecisionTreeClassifier(max_depth=max_depth,
                                           max_features=max_features)
            dtree.fit(data[train], labels[train])

            score = dtree.score(data[validate], labels[validate])
            train_score = dtree.score(data[train], labels[train])

            trace.append([
                dummy_iterator, which_split, score, train_score, max_depth,
                max_features
            ])
            which_split = which_split + 1

            if score > best_tree[0]:
                print("score update", score)
                best_tree = (score, train_score, dtree, max_depth,
                             max_features)

    print(best_tree)
    print('Best tree accuracy: {:.2f}'.format(best_tree[2].score(
        tData, tLabels)))
    np.savetxt(
        "soybean.csv",
        trace,
        delimiter=',',
        header=
        "iteration_of_10_fold,which_fold,score,train_score,max_depth,max_features"
    )
    export_graphviz(best_tree[2], out_file="soybean_tree")
    export_graphviz(best_tree[2],
                    out_file="soybean_tree_truncated",
                    max_depth=5)
Example #10
def voting():
    print("----------------voting------------------")

    mat = Arff("./voting.arff", label_count=1)
    # data = mat.data[:, 0:-1]
    # labels = mat.data[:, -1]#.reshape(-1, 1)
    splits = 10
    kfolder = KFold(n_splits=splits)

    scores = [[], []]

    data, tData, labels, tLabels = train_test_split(
        mat.data[:, :-1], mat.data[:, -1].reshape(-1, 1), test_size=.25)
    best_tree = (0, None)
    for train, validate in kfolder.split(data, labels):
        # print(train, validate)
        dtree = DTClassifier(features=mat.get_attr_names())
        dtree.fit(data[train], labels[train])

        scores[0].append(dtree.score(data[validate], labels[validate]))
        scores[1].append(dtree.score(data[train], labels[train]))
        if scores[0][-1] > best_tree[0]:
            best_tree = (scores[0][-1], dtree)

    average = np.sum(scores, axis=1) / splits
    scores[0].append(average[0])
    scores[1].append(average[1])
    header_text = ''
    for x in range(splits):
        header_text = header_text + str(x) + ' '

    np.savetxt("voting.csv",
               scores,
               header=header_text + 'average',
               delimiter=',')
    print(scores)
    print('Average CV accuracy: {:.2f}'.format(scores[0][-1]))
    print('Best tree accuracy: {:.2f}'.format(best_tree[1].score(
        tData, tLabels)))
    f = open("voting_tree", "w")
    f.write(dtree.graph(class_translator=lambda x: mat.attr_value(-1, x)))
    f.close()
Example #11
def part3():
    print('Running Part 3...')
    # Part 3 Data sets:
    mat = Arff("../data/knn/housing-price/hp_training.arff", label_count=1)
    mat2 = Arff("../data/knn/housing-price/hp_testing.arff", label_count=1)

    k_neighbors = 3
    raw_data = mat.data
    h, w = raw_data.shape
    train_data = raw_data[:, :-1]
    train_labels = raw_data[:, -1]

    raw_data2 = mat2.data
    h2, w2 = raw_data2.shape
    test_data = raw_data2[:, :-1]
    test_labels = raw_data2[:, -1]

    # Normalize Data.
    train_data, test_data = normalizeDataSets(train_data, test_data)

    print('Running K Values tests...')
    k_values = [1, 3, 5, 7, 9, 11, 13, 15]
    mses = []
    for k_val in k_values:
        KNN = KNNClassifier(label_type='regression',
                            weight_type='no_weight',
                            k_neighbors=k_val)
        KNN.fit(train_data, train_labels)
        score = KNN.score(test_data, test_labels)
        mses.append(score)

    print('Plotting K values vs MSE Scores...')
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.plot(k_values, mses, label='MSE')
    for xy in zip(k_values, mses):  # annotate each (k, MSE) point
        ax.annotate('(%s, %.2f)' % xy, xy=xy, textcoords='data')
    plt.title('MSE Using Different K Values and No Distance Weighting')
    plt.xlabel('K Values')
    plt.ylabel('MSE')
    plt.legend()
    plt.savefig('k-value-and-mse.png')
    plt.show()
Example #12
def main():
  arff_file1, arff_file2 = read_args()
  arff1 = Arff.fromfile(arff_file1)
  arff2 = Arff.fromfile(arff_file2)
  norm_arff1 = arff1.normalize()
  norm_arff2 = arff2.normalize()

  space1 = Space(len(norm_arff1.attributes))
  populate_space_from_arff(norm_arff1, space1)

  print("Outliers from " + arff_file1 + ":")
  for x in get_outliers(space1, 0.05):
    print(x)

  space2 = Space(len(norm_arff2.attributes))
  populate_space_from_arff(norm_arff2, space2)

  print("Outliers from " + arff_file2 + ":")
  for x in get_outliers(space2, 0.05):
    print(x)
Example #13
def debug():
    print("------------arff-------------------")

    mat = Arff("../data/perceptron/debug/linsep2nonorigin.arff", label_count=1)
    data = mat.data[:, 0:-1]
    labels = mat.data[:, -1].reshape(-1, 1)
    PClass = PerceptronClassifier(
        lr=0.1, shuffle=False, deterministic=10, printIt=False)
    PClass.fit(data, labels)
    Accuracy = PClass.score(data, labels)
    print("Accuray = [{:.2f}]".format(Accuracy))
    print("Final Weights =", PClass.get_weights())
Example #14
def nan_lenses():
    print("----------------nan_lenses------------------")

    mat = Arff("./nan_lenses.arff", label_count=1)
    # data = mat.data[:, 0:-1]
    # labels = mat.data[:, -1].reshape(-1, 1)

    data, tData, labels, tLabels = train_test_split(
        mat.data[:, :-1], mat.data[:, -1].reshape(-1, 1), test_size=.25)

    dtree = DTClassifier(features=mat.get_attr_names())
    dtree.fit(data, labels)
    print(dtree.tree)

    # results = dtree.predict(tData)
    # for r, t in zip(results, tLabels):
    #     print(r, t)

    score = dtree.score(tData, tLabels)
    print("Accuracy=[{:.2f}]".format(score))
Example #15
def evaluate():
    print("------------eval-------------------")

    mat = Arff("../data/perceptron/evaluation/data_banknote_authentication.arff", label_count=1)
    data = mat.data[:, 0:-1]
    labels = mat.data[:, -1].reshape(-1, 1)
    # print("data\n", data)
    MLPClass = MLPClassifier([2*np.shape(data)[1]], lr=0.1, shuffle=False, deterministic=10)
    MLPClass.fit(data, labels, momentum=0.5, percent_verify=0, standard_weight=0)
    Accuracy = MLPClass.score(data, labels)
    # print(MLPClass)
    # print("Final Weights =", MLPClass.get_weights())
    # print(MLPClass)
    print(MLPClass.csv_print())
Example #16
def evaluation():
    print("--------------arf2------------------------------")

    mat = Arff("../data/perceptron/evaluation/data_banknote_authentication.arff", label_count=1)
    np_mat = mat.data
    data = mat[:, :-1]
    labels = mat[:, -1].reshape(-1, 1)

    #### Make Classifier ####
    P2Class = PerceptronClassifier(lr=0.1, shuffle=False, deterministic=10)
    P2Class.fit(data, labels)
    Accuracy = P2Class.score(data, labels)
    print("Accuray = [{:.2f}]".format(Accuracy))
    print("Final Weights =", P2Class.get_weights())
Example #17
def evaluation():
    print("----------------evaluation---------------")

    zoo_data = Arff("./zoo.arff", label_count=1)
    all_zoo_data = Arff("./all_zoo.arff", label_count=1)

    zoo_train = zoo_data.data[:, :-1]
    zoo_label_train = zoo_data.data[:, -1].reshape(-1, 1)
    zoo_test = all_zoo_data.data[:, :-1]
    zoo_label_test = all_zoo_data.data[:, -1].reshape(-1, 1)

    dtree = DTClassifier(features=zoo_data.get_attr_names())
    dtree.fit(zoo_train, zoo_label_train)
    print("Train Accuracy=[{:.2f}]".format(
        dtree.score(zoo_train, zoo_label_train)))

    predicted = dtree.predict(zoo_test)
    np.savetxt('predicted_zoo.csv',
               predicted,
               delimiter=',',
               header="predicted")
    score = dtree.score(zoo_test, zoo_label_test)
    print("Accuracy=[{:.2f}]".format(score))
Example #18
def main():
  num_points = 100
  dim = 2
  means = [2.4, 5.5, 6.2]
  sigmas = [1, 1.5, 0.5]
  raw_points = generate_norm(num_points, dim, means, sigmas)

  attrs = []
  for d in range(dim):
    attrs.append(('f'+str(d), 'real'))

  points = []
  for i in range(num_points):
    point = {}
    for j, (attr, _) in enumerate(attrs):
      point[attr] = raw_points[i][j]
    points.append((i+1, point))

  relation = 'DATASET'

  arff = Arff(relation, attrs, points)
  arff._print_arff()
  arff.output_file("synthetic.arff")
Example #19
def debug():
    print("------------arff-------------------")

    mat = Arff("../data/perceptron/debug/linsep2nonorigin.arff", label_count=1)
    data = mat.data[:, 0:-1]
    labels = mat.data[:, -1].reshape(-1, 1)
    # print("data\n", data)
    MLPClass = MLPClassifier([2*np.shape(data)[1]], lr=0.1, shuffle=False, deterministic=10)
    MLPClass.fit(data, labels, momentum=0.5, percent_verify=0, standard_weight=0)
    Accuracy = MLPClass.score(data, labels)
    # print(MLPClass)
    retrieved_weights = MLPClass.get_weights()

    for layer in range(len(retrieved_weights)):
        np.savetxt("linsep_weights_eval_" + str(layer) + ".csv", retrieved_weights[layer], delimiter=',')
Example #20
def runMahCode(arff, shuffle=True, determ=0, training=False, lr=.1, quiet=False):
    mat = Arff(arff,label_count=1)
    data = mat.data[:,0:-1]
    labels = mat.data[:,-1:]
    PClass = PerceptronClassifier(lr=lr,shuffle=shuffle,deterministic=determ)
    Accuracy = 0.0
    if training:
        X_train, y_train, X_test, y_test = PerceptronClassifier.split_training(data,labels)
        PClass.fit(X_train,y_train)
        Accuracy = PClass.score(X_test,y_test)
    else:
        PClass.fit(data,labels)
        Accuracy = PClass.score(data,labels)
    if not quiet:
        print("Accuracy = [{:.5f}]".format(Accuracy))
        print("Final Weights =",PClass.get_weights())
    else:
        return Accuracy
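For instance, reproducing the deterministic debug run from Example #13 would look something like this (the path is taken from that example and may differ in your layout):

# Deterministic 10-epoch run with no shuffling, default learning rate:
runMahCode("../data/perceptron/debug/linsep2nonorigin.arff",
           shuffle=False, determ=10)

# Quiet mode on a train/test split returns the accuracy instead of printing:
acc = runMahCode("../data/perceptron/debug/linsep2nonorigin.arff",
                 training=True, quiet=True)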
Example #21
    def extractAll(self, nomeArquivoArff=None, classes=None, overwrite=True):

        # print('Generating the ARFF for image database ' + self.nomeBancoImagens + '...')

        bancoImagens = BancoImagens(self.nomeBancoImagens, self.nomePastaRaiz)
        extratores = Extratores()

        print('Database location: ' + bancoImagens.pastaBancoImagens)

        nomeArquivoArff = (bancoImagens.nomeArquivoArff if nomeArquivoArff is None
                           else bancoImagens.pastaBancoImagens + nomeArquivoArff)

        if not overwrite and os.path.isfile(nomeArquivoArff):
            print('ARFF file found at ' + nomeArquivoArff)
            return

        if classes is None:
            classes = bancoImagens.classes

            print('Classes found:')
            print(classes)

        # Attribute extraction starts here for every image of each class

        dados = []
        nomesAtributos = []
        tiposAtributos = []
        valoresAtributos = []

        for classe in classes:
            imagens = bancoImagens.imagens_da_classe(classe)

            # print("Processing %s images of class %s" % (len(imagens), classe))

            for imagem in imagens:
                nomesAtributos, tiposAtributos, valoresAtributos = extratores.extrai_todos(imagem)

                dados.append(valoresAtributos + [classe])

        if len(classes) > 0:
            Arff().cria(nomeArquivoArff, dados, self.nomeBancoImagens,
                        nomesAtributos, tiposAtributos, bancoImagens.classes)

        print('ARFF file generated at ' + nomeArquivoArff)
Example #22
    def extractOneFile(self, nomeArquivoArff, nomeImagem):

        bancoImagens = BancoImagens(self.nomeBancoImagens, self.nomePastaRaiz)
        extratores = Extratores()

        nomeArquivoArff = bancoImagens.pastaBancoImagens + nomeArquivoArff

        dados = []
        nomesAtributos = []
        tiposAtributos = []
        valoresAtributos = []

        imagem = cv2.imread(bancoImagens.pastaBancoImagens + nomeImagem)
        nomesAtributos, tiposAtributos, valoresAtributos = extratores.extrai_todos(
            imagem)

        dados.append(valoresAtributos + [bancoImagens.classes[0]])

        Arff().cria(nomeArquivoArff, dados, self.nomeBancoImagens,
                    nomesAtributos, tiposAtributos, bancoImagens.classes)
Example #23
def sk_voting():
    print("----------------sk_voting------------------")

    mat = Arff("./voting.arff", label_count=1, missing=float(37.0))
    # data = mat.data[:, 0:-1]
    # labels = mat.data[:, -1]#.reshape(-1, 1)
    splits = 10
    kfolder = KFold(n_splits=splits)

    scores = [[], []]

    data, tData, labels, tLabels = train_test_split(
        mat.data[:, :-1], mat.data[:, -1].reshape(-1, 1), test_size=.25)
    best_tree = (0, None)
    for train, validate in kfolder.split(data, labels):
        # print(train, validate)
        dtree = DecisionTreeClassifier()
        dtree.fit(data[train], labels[train])

        scores[0].append(dtree.score(data[validate], labels[validate]))
        scores[1].append(dtree.score(data[train], labels[train]))
        if scores[0][-1] > best_tree[0]:
            best_tree = (scores[0][-1], dtree)

    average = np.sum(scores, axis=1) / splits
    scores[0].append(average[0])
    scores[1].append(average[1])
    header_text = ''
    for x in range(splits):
        header_text = header_text + str(x) + ' '

    np.savetxt("sk_voting.csv",
               scores,
               header=header_text + 'average',
               delimiter=',')
    print(scores)
    print('Average CV accuracy: {:.2f}'.format(scores[0][-1]))
    print('Best tree accuracy: {:.2f}'.format(best_tree[1].score(
        tData, tLabels)))
Example #24
def iris():
    print("-------------iris----------------")
    mat = Arff("../data/perceptron/iris.arff", label_count=3)

    y = mat.data[:,-1]
    # print(y)

    lb = preprocessing.LabelBinarizer()
    lb.fit(y)
    y = lb.transform(y)

    # split it
    # data, labels, tData, tLabels = _shuffle_split(mat.data[:, :-1], y, .25)
    data, tData, labels, tLabels = train_test_split(mat.data[:, :-1], y, test_size=.25)

    MLPClass = MLPClassifier([2*np.shape(data)[1]], lr=0.1, shuffle=True, one_hot=True)
    MLPClass.fit(data, labels, momentum=0.5, percent_verify=.25)

    np.savetxt("Iris_eval.csv", MLPClass.stupidData[1:], header=reduce(MLPClass.stupidData[0]), delimiter=',')

    accuracy = MLPClass.score(tData, tLabels)
    print("Test Accuracy = [{:.2f}]".format(accuracy))
Example #25
def sci_kit():
    print("------------------------sci-kit learn------------------")
    mat = Arff("./tic-tac-toe.arff", label_count=1)

    y = mat.data[:,-1]

    lb = preprocessing.LabelBinarizer()
    lb.fit(y)
    y = lb.transform(y)

    data, tData, labels, tLabels = train_test_split(mat.data[:, :-1], y, test_size=.25)

    # going to randomly search learn_rate, number of nodes, number of hidden layers, and momentum
    for dummy_iterator in range(16):  # 4 hyperparameters x 4 trials each
        MLPClass = None
        real_one = None

        learn_rate = np.random.uniform(0.001, 10)
        number_of_nodes = int(abs(round(np.random.normal(2*np.shape(data)[1], round(2*np.shape(data)[1] - 0.6)))))
        hidden_layers = np.random.randint(1, 4)
        momentum = abs(np.random.normal(0, 0.1))

        nodes = [1 if number_of_nodes == 0 else number_of_nodes] * hidden_layers

        print("learn rate", learn_rate, "layers", nodes, "momentum", momentum)


        # Note: the author's MLPClass keeps fixed hyperparameters here;
        # only the scikit-learn classifier receives the randomly drawn ones.
        MLPClass = MLPClassifier([2*np.shape(data)[1]], lr=0.1, shuffle=True, one_hot=True)
        MLPClass.fit(data, labels, momentum=0.5, percent_verify=.25)

        real_one = Skl_Classifier(nodes, momentum=momentum, learning_rate_init=learn_rate,
                                  activation='logistic', early_stopping=True,
                                  validation_fraction=.25)
        real_one.fit(data, np.reshape(labels, (-1,)))

        real_accuracy = real_one.score(tData, np.reshape(tLabels, (-1,)))
        accuracy = MLPClass.score(tData, tLabels)

        print(accuracy, "vs", real_accuracy)
Example #26
def voting():
    print("--------------voting---------------------")
    mat = Arff("../data/perceptron/vote.arff", label_count=1)
    np_mat = mat.data

    avg = []

    for iteration in range(5):
        print("xxxxxxxxxxx   " + str(iteration) + "  xxxxxxxx")
        training, testing = _shuffle_split(mat.data, .3)

        data = training[:, :-1]
        labels = training[:, -1].reshape(-1, 1)
        P5Class = PerceptronClassifier(lr=0.1, shuffle=True)
        P5Class.fit(data, labels)

        Accuracy = P5Class.score(data, labels)
        print("Accuracy = [{:.2f}]".format(Accuracy))
        print("Epochs = ", P5Class.get_epochs_trained())    

        tData = testing[:, :-1]
        tLabels = testing[:, -1].reshape(-1, 1)
        tAccuracy = P5Class.score(tData, tLabels)
        print("Test Accuracy = [{:.2f}]".format(tAccuracy))

        weights = P5Class.get_weights()
        print(weights)
        sort_weights = sorted(zip(weights, list(range(len(weights)))), key=lambda x: abs(x[0]), reverse=True)
        print("sorted:\r\n", sort_weights)

        scores = P5Class.getTrace().getColumns("epochScore")
        print('scores', scores)
        avg.append((float(scores[-2][0]) - float(scores[0][0])) / len(scores))
    
    print('avg', avg)
    grapher = Grapher()
    grapher.graph(list(range(len(avg))), avg, labels=[1]*len(avg), points=False, title="Average Scores", xlabel="Iteration", ylabel="score")
    grapher.show("AverageScores.svg")
Example #27
def part5():
    print('Running Part 5 ...')
    # Part credit approval Data sets:
    mat = Arff("../data/creditapproval.arff", label_count=1)

    k_val = 3
    raw_data = mat.data
    h, w = raw_data.shape
    data = raw_data[:, :-1]
    labels = raw_data[:, -1]

    # Split data and labels into test and train sets.
    X_train, X_test, y_train, y_test = train_test_split(data,
                                                        labels,
                                                        test_size=0.33,
                                                        shuffle=False)

    # Normalize Data.
    X_train, X_test = normalizeDataSets(X_train, X_test)
    KNN = KNNClassifier(label_type='classification',
                        weight_type='inverse_distance',
                        k_neighbors=k_val)
    KNN.fit(X_train, y_train)
    score = KNN.score(X_test, y_test)
    print("Accuracy = [{:.2f}]\n".format(score * 100))
Example #28
File: eval.py Project: pts314/CS472
from perceptron import PerceptronClassifier
from arff import Arff
import numpy as np

mat = Arff("../data/perceptron/evaluation/data_banknote_authentication.arff",
           label_count=1)
data = mat.data[:, 0:-1]
labels = mat.data[:, -1:]
PClass = PerceptronClassifier(lr=0.1, shuffle=False, deterministic=10)
PClass.fit(data, labels)
Accuracy = PClass.score(data, labels)
print("Accuray = [{:.5f}]".format(Accuracy))
print("Final Weights =", PClass.get_weights())
Example #29
File: KNN.py Project: ccrwong/cs472
                
            else:
                print("class array", classificationArray[0])
                counts = np.bincount(classificationArray)
                finalPred = np.argmax(counts)

        # print(finalPred, "final pred")
        return finalPred


if __name__ == "__main__":
    # mat = Arff("magic_telescope_train.arff",label_count=1)
    # mat2 = Arff("magic_telescope_test.arff",label_count=1)
    # mat = Arff("diabetes.arff",label_count=1)
    # mat2 = Arff("diabetes_test.arff",label_count=1)
    mat = Arff("seismic-bumps_train.arff",label_count=1)
    mat2 = Arff("seismic-bumps_test.arff",label_count=1)
    # mat = Arff("house_train.arff",label_count=1)
    # mat2 = Arff("house_test.arff",label_count=1)
    # mat = Arff("credit.arff",label_count=1)
    raw_data = mat.data
    h,w = raw_data.shape
    train_data = raw_data[:,:-1]
    train_labels = raw_data[:,-1]

    raw_data2 = mat2.data
    h2,w2 = raw_data2.shape
    test_data = raw_data2[:,:-1]
    test_labels = raw_data2[:,-1]

    # neigh = KNeighborsClassifier(n_neighbors=15)
Example #30
from arff import Arff
import numpy as np
from sklearn.neural_network import MLPClassifier
from my_shuffling import shuffle
from splitter import split
from warnings import filterwarnings
filterwarnings('ignore')

mat = Arff("default.arff", label_count=0)
print("parsed")

sum_of_incorrect_labels = 0
max_counted = 0
most_common_data_indexes = None
data = mat.data[:, :-1]
labels = mat.data[:, -1:]

unique_data = np.unique(data, axis=0)
for ud in unique_data:
    indexes = np.where((data == ud).all(axis=1))[0]
    labels_for_same_game = labels[indexes]
    labels_for_same_game = np.unique(labels_for_same_game, return_counts=True)
    if len(labels_for_same_game[1]) > 1:
        maximum_indx = np.argmax(labels_for_same_game[1])
        incorrect_count = sum(
            labels_for_same_game[1]) - labels_for_same_game[1][maximum_indx]
        sum_of_incorrect_labels += incorrect_count
        # print(sum_of_incorrect_labels)

        if sum(labels_for_same_game[1]) > max_counted:
            max_counted = sum(labels_for_same_game[1])
Example #31
classes = bancoImagens.classes

print('Classes found:')
print(classes)

# Attribute extraction starts here for every image of each class

dados = []
nomesAtributos = []
tiposAtributos = []
valoresAtributos = []

for classe in classes:

    imagens = bancoImagens.imagens_da_classe(classe)

    print("Processing %s images of class %s" % (len(imagens), classe))

    for imagem in imagens:

        nomesAtributos, tiposAtributos, valoresAtributos = extratores.extrai_todos(imagem)

        dados.append(valoresAtributos + [classe])

if len(classes) > 0:

    Arff().cria(bancoImagens.nomeArquivoArff, dados, nomeBancoImagens,
                nomesAtributos, tiposAtributos, classes)

print('ARFF file generated at ' + bancoImagens.nomeArquivoArff)
Example #32
from perceptron import PerceptronClassifier
from arff import Arff
import numpy as np


mat = Arff("../data/perceptron/debug/linsep2nonorigin.arff",label_count=1)
data = mat.data[:,0:-1]
labels = mat.data[:,-1].reshape(-1,1)
PClass = PerceptronClassifier(lr=0.1,shuffle=False,deterministic=10)
PClass.fit(data,labels)
Accuracy = PClass.score(data,labels)
print("Accuray = [{:.2f}]".format(Accuracy))
print("Final Weights =",PClass.get_weights())
Example #33
# # 4. Learn Voting
# First I want to know the baseline if I just try to fit on the entire dataset

# In[15]:


runMahCode("standardVoting.arff")


# Now run it five times with random shuffling:

# In[16]:


mat = Arff("standardVoting.arff",label_count=1)
data = mat.data[:,0:-1]
labels = mat.data[:,-1:]
test = []
train = []
iters = []
all_scores = []
print("Iterations | Training | Testing")
for i in range(5):
    PClass = PerceptronClassifier(lr=.1,shuffle=True)
    Accuracy = 0.0
    X_train, y_train, X_test, y_test = PerceptronClassifier.split_training(data,labels)
    trash, iterr, scores = PClass.fit(X_train,y_train, quiet=True)
    all_scores.append(scores)
    training = PClass.score(X_train,y_train)
    testing = PClass.score(X_test,y_test)