Example #1
0
def prob0():
    """Run 5-means on the labor features, seeded with the first five rows."""
    labor = Arff('datasets/labor.arff', label_count=1)
    # Trim the id column (column 0), then keep only the feature columns.
    features = labor.create_subset_arff(col_idx=slice(1, None)).get_features()
    clusterer = KMeans(5)
    clusterer.train(features, verbose=True, centers=features.data[:5])
Example #2
0
    def test_load_arff(self):
        """Load the iris ARFF file and check the values of its final row."""
        iris = Arff()
        iris.load_arff(self.iris_path)

        last_row = iris.data[iris.shape[0] - 1]
        expected = [5.9, 3.0, 5.1, 1.8, 2.0]
        self.assertListEqual(last_row.tolist(), expected)
def prob_0():
    """Train a decision tree on the full lenses dataset and print the tree."""
    lenses = Arff('datasets/lenses.arff')
    learner = DecisionTreeLearner()
    features, labels = lenses.get_features(), lenses.get_labels()
    learner.train(features, labels)
    print(learner.tree)
Example #4
0
def prob0haccomplete():
    """Run ``HAC(simple=False)`` on the labor features, with ``printk=[5]``.

    Judging by the function name, ``simple=False`` presumably selects
    complete-link clustering.
    """
    labor = Arff('datasets/labor.arff', label_count=1)
    # Trim the id column (column 0), then keep only the feature columns.
    features = labor.create_subset_arff(col_idx=slice(1, None)).get_features()
    clusterer = HAC(simple=False)
    clusterer.train(features, verbose=True, printk=[5])
Example #5
0
    def setUp(self):
        """Load the cm1_req dataset and create the BaselineLearner under test."""
        dataset_path = os.path.join(utils.get_root(),
                                    "test/datasets/cm1_req.arff")
        dataset = Arff(arff=dataset_path)

        self.features, self.labels = dataset.get_features(), dataset.get_labels()
        self.learner = BaselineLearner()
Example #6
0
    def setUp(self):
        """Create the fixtures shared by the tests.

        ``self.m`` wraps a 3x3 array and ``self.m2`` a 5x5 array; both are
        built with ``label_count=1`` and carry an R/G/B enumeration on their
        last column.  The credit-approval and iris dataset paths are stored
        for tests that load from disk.
        """
        # NOTE: for discrete attributes, at least one value must be a float
        # in order for numpy array functions to work properly.
        small = Arff(
            np.array([
                [1.5, -6, 1.0],
                [2.3, -8, 2],
                [4.1, self.infinity, 2],
            ]),
            label_count=1,
        )
        small.attr_names = list('ABC')
        small.str_to_enum = [{}, {}, {'R': 0, 'G': 1, 'B': 2}]
        small.enum_to_str = [{}, {}, {0: 'R', 1: 'G', 2: 'B'}]
        self.m = small

        large = Arff(
            np.array([
                [0.0, 1.0, 2.0, 3.0, 0.0],
                [0.1, 1.1, 2.1, 3.1, 1.0],
                [0.2, 1.2, 2.2, 3.2, 1.0],
                [0.3, 1.3, 2.3, 3.3, 2.0],
                [0.4, 1.4, 2.4, 3.4, 2.0],
            ]),
            label_count=1,
        )
        large.attr_names = list('ABCDE')
        large.str_to_enum = [{}, {}, {}, {}, {'R': 0, 'G': 1, 'B': 2}]
        large.enum_to_str = [{}, {}, {}, {}, {0: 'R', 1: 'G', 2: 'B'}]
        self.m2 = large

        self.credit_data_path = os.path.join(
            utils.get_root(), "test/datasets/creditapproval.arff")
        self.iris_path = os.path.join(
            utils.get_root(), "test/datasets/iris.arff")
def main():
    """Train a perceptron five times and plot the averaged misclassification rate.

    The dataset path is read from ``sys.argv[1]``.  After each ``train`` call
    the first 20 entries of ``accuracy_tracker`` are stored as one row of a
    5x20 matrix; the column averages are then plotted as ``1 - accuracy``.
    """
    runs, width = 5, 20

    dataset = Arff(sys.argv[1])
    learner = PerceptronLearner()
    features, labels = dataset.get_features(), dataset.get_labels()

    accuracy_matrix = np.zeros((runs, width))
    for run in range(runs):
        learner.train(features, labels)

        history = learner.accuracy_tracker[:width]
        # Pad short histories on the right with their last value so every
        # row is exactly `width` entries wide.
        missing = width - len(history)
        accuracy_matrix[run] = np.pad(history, (0, missing), 'constant',
                                      constant_values=history[-1])

    print(accuracy_matrix)
    # Average the accuracies of each step across the runs.
    avg_accuracy = np.sum(accuracy_matrix, axis=0) / runs
    print(avg_accuracy)

    plt.plot(1 - avg_accuracy)
    plt.title("Avg Misclassification Rate Over Epochs")
    plt.xlabel("Epochs")
    plt.ylabel("Avg Misclassification Rate")

    plt.show()
Example #8
0
def main():
    """Train a perceptron on the ARFF file named by ``sys.argv[1]`` and visualize it."""
    dataset = Arff(sys.argv[1])
    inputs, targets = dataset.get_features(), dataset.get_labels()

    learner = PerceptronLearner()
    learner.train(inputs, targets)

    visualize_training(inputs, targets, learner)
Example #9
0
def setup():
    """Print the HAC distance between two pairs of labor-dataset rows."""
    labor = Arff('datasets/labor.arff', label_count=1)
    # Trim the id column (column 0), then keep only the feature columns.
    features = labor.create_subset_arff(col_idx=slice(1, None)).get_features()

    hac = HAC()
    # Tell the clusterer which columns are nominal before asking for distances.
    is_nominal = np.array(features.attr_types) == 'nominal'
    hac.nominal_indicies = np.where(is_nominal)[0]

    for first, second in ((33, 44), (25, 34)):
        distance = hac.get_distance(features.data[first], features.data[second])
        print('{},{}'.format(first, second), distance)
Example #10
0
    def test_arff_constructor(self):
        """An Arff built from a path, another Arff, or a numpy array holds equal data."""
        from_path = Arff(arff=self.credit_data_path)
        from_arff = Arff(arff=from_path)
        from_array = Arff(arff=from_path.data)

        np.testing.assert_array_almost_equal(from_path.data, from_array.data)
        np.testing.assert_array_almost_equal(from_array.data, from_arff.data)
Example #11
0
def prob4h():
    """Run both HAC variants on normalized abalone data with silhouette output.

    ``HAC()`` runs under the 'single link' banner and ``HAC(simple=False)``
    under the 'complete link' banner, each with ``printk`` set to 2..7.
    """
    abalone = Arff('datasets/abalone.arff', label_count=0)
    abalone.normalize()
    cluster_counts = np.arange(2, 8)

    variants = (
        ('single link --------------------', {}),
        ('complete link -----------------------', {'simple': False}),
    )
    for banner, hac_kwargs in variants:
        print(banner)
        clusterer = HAC(**hac_kwargs)
        clusterer.train(abalone, printk=cluster_counts, silhouette=True)
Example #12
0
def prob4():
    """Train k-means for k = 2..7 on normalized abalone, printing each silhouette score."""
    abalone = Arff('datasets/abalone.arff', label_count=0)
    abalone.normalize()

    train_results = []
    for k in np.arange(2, 8):
        model = KMeans(k)
        # Keep whatever train() returns for each k (the original names it an SSE).
        train_results.append(model.train(abalone))
        print(model.calc_silhouette_score())
Example #13
0
def prob2():
    """Cluster the (un-normalized) iris features with k-means and HAC for k = 2..7."""
    features = Arff('datasets/iris.arff').get_features()

    # Train k-means once per cluster count.
    cluster_counts = list(range(2, 8))
    for k in cluster_counts:
        KMeans(k).train(features)

    HAC(simple=False).train(features, printk=cluster_counts)
Example #14
0
    def test_create_subset_arff(self):
        """Check subset shapes and label-count inference when slicing ``self.m2``."""
        # Row/column selectors may be an index list or a slice, in either slot.
        selectors = (
            ([1, 2], slice(1, 3)),
            (slice(1, 3), [1, 2]),
        )
        for rows, cols in selectors:
            subset = Arff(self.m2, rows, cols)
            self.assertEqual(subset.shape, (2, 2))

        # Automatic label inference: with label_count=None the subset's label
        # count is derived from how the column slice cuts the source's labels.
        self.m2.label_count = 3
        inference_cases = (
            (slice(1, None), 3),
            (slice(1, -1), 2),
        )
        for cols, expected_labels in inference_cases:
            subset = Arff(self.m2, slice(1, 3), cols, label_count=None)
            self.assertEqual(expected_labels, subset.label_count)
Example #15
0
def prob5():
    """Plot the number of recorded epochs against momentum.

    The dataset path is read from ``sys.argv[2]``.  Rows are shuffled and cut
    at 55% and 80% of the row count into train / test / validation subsets
    restricted to a fixed list of columns.  One network is trained per
    momentum value in ``np.linspace(0, 1.5, 20)``.
    """
    dataset = Arff(sys.argv[2])
    selected_cols = [1, 3, 4, 5, 7, 9, 11, 12, 13]
    dataset.shuffle()

    total = len(dataset.get_labels().data)
    train_end = int(total * .55)
    test_end = total - int(total * .20)

    def rows(start, stop):
        # Subset of the selected columns covering rows [start, stop).
        return dataset.create_subset_arff(row_idx=slice(start, stop, 1),
                                          col_idx=selected_cols)

    train_set = rows(0, train_end)
    test_set = rows(train_end, test_end)
    validation_set = rows(test_end, total)

    momentums = np.linspace(0, 1.5, 20)
    epochs = []
    for momentum in momentums:
        print(momentum)
        network = NeuralNetwork(8, [30], 11, LR=.1, momentum=momentum)
        # Only the length of the first returned sequence is used here.
        validation_acc, _, _, _ = network.train_set(
            train_set, test_set, validation_set, w=5)
        epochs.append(len(validation_acc))

    plt.plot(momentums, epochs)
    plt.title("Vowel Momentum vs Epoch Convergence")
    plt.xlabel("Momentum")
    plt.ylabel("Epochs til Conv.")
    plt.show()
Example #16
0
    def test_copy_and_slice(self):
        """Check the shape produced by ``_copy_and_slice_arff`` for several selectors."""
        source = Arff(self.credit_data_path)
        target = source.copy()

        cases = (
            # (row selector, column selector, expected shape)
            (1, 5, (1, 1)),
            (slice(1, 4), slice(2, 4), (3, 2)),
            # Two index lists are expected to yield a 1D result of length 4
            # (the lists pair up as (1,1), (2,5), (3,7), (7,8)).
            ([1, 2, 3, 7], [1, 5, 7, 8], (4, )),
            ([1, 2, 3, 7], slice(0, 5), (4, 5)),
        )
        for rows, cols, expected_shape in cases:
            target._copy_and_slice_arff(source, rows, cols)
            self.assertEqual(target.shape, expected_shape)
Example #17
0
    def test_append_columns(self):
        """Check that ``append_columns`` widens the data and rejects row mismatches.

        Uses ``assertEqual``/``assertIn`` rather than bare ``assert`` so the
        checks still run under ``python -O`` and report both values on failure.
        """
        test_matrix = Arff(self.m)

        # Appending an Arff to itself doubles the column count.
        test_matrix.append_columns(test_matrix)
        self.assertEqual(test_matrix.data.shape, (3, 6))

        # Appending a numpy array works as well.
        test_matrix.append_columns(test_matrix.data)
        self.assertEqual(test_matrix.data.shape, (3, 12))

        # NOTE(review): the original comment here said "2D list", but a numpy
        # array is passed again - confirm whether `.tolist()` was intended.
        test_matrix.append_columns(test_matrix.data)
        self.assertEqual(test_matrix.data.shape, (3, 24))

        # A one-row array cannot be appended to a three-row matrix.
        with self.assertRaises(Exception) as context:
            test_matrix.append_columns(self.m.data[:1, :])
        self.assertIn('Incompatible number of rows', str(context.exception))
Example #18
0
    def test_append_rows(self):
        """Check that ``append_rows`` lengthens the data and rejects column mismatches.

        Uses ``assertEqual``/``assertIn`` rather than bare ``assert`` so the
        checks still run under ``python -O`` and report both values on failure.
        """
        test_matrix = Arff(self.m)

        # Appending an Arff to itself doubles the row count.
        test_matrix.append_rows(test_matrix)
        self.assertEqual(test_matrix.data.shape, (6, 3))

        # Appending a numpy array works as well.
        test_matrix.append_rows(test_matrix.data)
        self.assertEqual(test_matrix.data.shape, (12, 3))

        # NOTE(review): the original comment here said "2D list", but a numpy
        # array is passed again - confirm whether `.tolist()` was intended.
        test_matrix.append_rows(test_matrix.data)
        self.assertEqual(test_matrix.data.shape, (24, 3))

        # A two-column array cannot be appended to a three-column matrix.
        with self.assertRaises(Exception) as context:
            test_matrix.append_rows(self.m.data[:, :2])
        # assertIn shows the actual message on failure, so the debug print
        # that used to live here is no longer needed.
        self.assertIn('Incompatible number of columns', str(context.exception))
Example #19
0
def prob2wclass():
    """Cluster iris loaded with ``label_count=0`` using k-means and HAC for k = 2..7."""
    iris = Arff('datasets/iris.arff', label_count=0)

    # Train k-means once per cluster count (data is not normalized).
    cluster_counts = list(range(2, 8))
    for k in cluster_counts:
        KMeans(k).train(iris)

    HAC(simple=False).train(iris, printk=cluster_counts)
Example #20
0
def prob_4(lr):
    """Plot mean train/validation/test MSE against hidden-layer width on vowels.

    For each width in 1, 2, 4, ..., 128, one hundred networks with a single
    hidden layer (momentum 0, ``max_epoch`` 500) are trained and their MSE on
    each split is averaged.

    Args:
        lr: Learning rate assigned to every learner.
    """
    trials = 100

    # read in vowels dataset
    arff = Arff('datasets/vowels.arff')

    # Leave out the test/train and person features, which are unnecessary.
    arff = arff.create_subset_arff(col_idx=slice(2, None), label_count=1)

    # Get a 75/25 train/test split.  The first quarter of the shuffled rows
    # is the test set and everything after it is training data.  (Previously
    # the two were swapped and `slice(..., -1)` dropped the last instance.)
    arff.shuffle()
    test = arff.create_subset_arff(slice(arff.instance_count // 4))
    training = arff.create_subset_arff(slice(arff.instance_count // 4, None))

    # Hold out the leading rows of the training split for validation.
    # NOTE(review): the size is a fifth of the full dataset's row count, not
    # the 15% the original comment mentioned - confirm the intended fraction.
    validation = training.create_subset_arff(slice(arff.instance_count // 5))
    training = training.create_subset_arff(
        slice(arff.instance_count // 5, None))

    domain = 2**np.arange(0, 8)
    training_mse = []
    validation_mse = []
    test_mse = []
    for nodes in domain:
        mse = 0
        vmse = 0
        tmse = 0
        for _ in range(trials):
            learner = MultilayerPerceptronLearner(
                [training.features_count, nodes, 11], momentum=0)
            learner.lr = lr
            learner.max_epoch = 500
            learner.train(training.get_features(), training.get_labels(),
                          validation.get_features(), validation.get_labels())

            tmse += learner.get_mse(test.get_features(), test.get_labels())
            mse += learner.get_mse(training.get_features(),
                                   training.get_labels())
            vmse += learner.get_mse(validation.get_features(),
                                    validation.get_labels())

        training_mse.append(mse / trials)
        validation_mse.append(vmse / trials)
        test_mse.append(tmse / trials)

    # NOTE(review): Matplotlib 3.3 renamed `basex` to `base`; switch the
    # keyword if this is run on a newer Matplotlib.
    plt.semilogx(domain, test_mse, basex=2, label="Test Set MSE")
    plt.semilogx(domain, training_mse, basex=2, label="Training Set MSE")
    plt.semilogx(domain, validation_mse, basex=2, label="Validation Set MSE")
    plt.title("MSE vs Number of Hidden Nodes")
    plt.xlabel("Number of Hidden Nodes")
    plt.ylabel("Mean Squared Error")
    plt.legend()
    plt.show()
Example #21
0
def prob_6():
    """Plot KNN accuracy on the credit dataset for odd k from 1 to 15.

    Uses ``KNN(k, weighting=True, vdm=True)``.  The first quarter of the
    shuffled rows is the test set and the remainder the training set; each
    subset is normalized separately.
    """
    # NOTE(review): the original comment mentioned the magic telescope and
    # housing datasets, but this function loads the credit dataset.
    credit = Arff('datasets/credit.arff')
    credit.shuffle()
    quarter = credit.instance_count // 4
    test = credit.create_subset_arff(slice(quarter))
    train = credit.create_subset_arff(slice(quarter, None))

    train.normalize()
    test.normalize()

    krange = np.arange(1, 16, 2)
    accs = []
    for k in krange:
        classifier = KNN(k, weighting=True, vdm=True)
        predictions = classifier.knn(train.get_features(), train.get_labels(),
                                     test.get_features())
        hits = predictions == np.ravel(test.get_labels().data)
        accuracy = sum(hits) / len(hits)
        print("k:", k, "accuracy:", accuracy)
        accs.append(accuracy)

    plt.plot(krange, accs)
    plt.title("K Size Versus Accuracy on Credit Approval (Weighted)")
    plt.xlabel("K")
    plt.ylabel("Accuracy")
    plt.show()
def prob_1():
    """Print decision-tree accuracy on lenses for ten shuffled splits, then the mean.

    In each round the first fifth of the shuffled rows is the test set and the
    remaining rows are the training set.
    """
    lenses = Arff('datasets/lenses.arff')
    scores = []
    for _ in range(10):
        lenses.shuffle()
        cut = lenses.instance_count // 5
        testing = lenses.create_subset_arff(slice(cut))
        training = lenses.create_subset_arff(slice(cut, None))

        learner = DecisionTreeLearner()

        train_features, train_labels = (training.get_features(),
                                        training.get_labels())
        test_features, test_labels = (testing.get_features(),
                                      testing.get_labels())

        learner.train(train_features, train_labels)

        score = learner.get_accuracy(test_features, test_labels)
        print(score)
        scores.append(score)

    print(sum(scores) / len(scores))
Example #23
0
def test_cases():
    """Build a complete-link ``HA_Clustering`` (k=5) over the labor data.

    Column types and category values are described per column index; the
    data passed in is features and labels stacked side by side with the
    first and last columns removed.
    """
    # Category values for each categorical column, keyed by column index.
    # Every column not listed here is real-valued.
    categories = {
        4: ['none', 'tcf', 'tc'],
        6: ['none', 'ret_allw', 'empl_contr'],
        9: ['yes', 'no'],
        11: ['below_average', 'average', 'generous'],
        12: ['yes', 'no'],
        13: ['none', 'half', 'full'],
        14: ['yes', 'no'],
        15: ['none', 'half', 'full'],
        16: ['bad', 'good'],
    }
    column_count = 17

    attr_types = ["cat" if col in categories else "real"
                  for col in range(column_count)]
    attr_idx = [categories.get(col, []) for col in range(column_count)]

    k = 5
    arff = Arff("labor.arff")
    features = arff.get_features().data
    labels = arff.get_labels().data
    # Stack features and labels, then drop the first and the last column.
    data = np.hstack((features, labels))[:, 1:-1]
    clustering = HA_Clustering(k, data, data, attr_types, "complete link",
                               attr_idx)
Example #24
0
def prob_5():
    """Print mixed-attribute KNN accuracy (k=8) on the credit approval data.

    The shuffled, normalized dataset is split 70/30 into train and test rows.
    Categorical columns use a 16x16 distance matrix that is 0 on the diagonal
    and 1 everywhere else.
    """
    cont_mask = [1, 2, 7, 10, 13, 14, 16]
    cate_mask = [0, 3, 4, 5, 6, 8, 9, 11, 12, 15]

    dataset = Arff("credit_approval_data.arff")
    dataset.shuffle()
    dataset.normalize()

    def as_matrix(subset):
        # Features and labels side by side as a single numpy array.
        return np.hstack((subset.get_features().data,
                          subset.get_labels().data))

    total = len(dataset.get_labels().data)
    split = int(total * .7)
    train_subset = dataset.create_subset_arff(row_idx=slice(0, split, 1))
    test_subset = dataset.create_subset_arff(row_idx=slice(split, total, 1))
    test_data = as_matrix(test_subset)
    train_data = as_matrix(train_subset)

    # Sample row: b,30.83,0,u,g,w,v,1.25,t,t,01,f,g,00202,0,+
    dist_matrix = np.ones((16, 16))
    np.fill_diagonal(dist_matrix, 0)

    classifier = KNNClassifier(8, train_data, test_data)
    print(classifier.get_accuracy_mixed(cate_mask, cont_mask, dist_matrix))
def main():
    """Train three pairwise perceptrons and report a category for three restaurants.

    One perceptron is trained per pairwise dataset (fast vs mid, fast vs
    fine, mid vs fine).  Each example restaurant's values are then passed to
    ``determine_category`` with all three learners and the result printed.
    """
    # Each perceptron is trained on its own split data set.
    dataset_paths = (
        'datasets/restaurants/fast_v_mid.arff',
        'datasets/restaurants/fast_v_fine.arff',
        'datasets/restaurants/mid_v_fine.arff',
    )
    datasets = [Arff(path) for path in dataset_paths]
    learners = [PerceptronLearner() for _ in datasets]
    for learner, dataset in zip(learners, datasets):
        train_perceptron(learner, dataset)

    # New, unseen restaurants to categorize.
    restaurants = (
        ("Burger King", [4, 2, 2]),
        ("Cheesecake Factory", [2, 4, 4]),
        # Best fine-dining in the world
        ("'Best Food In The World'", [1, 5, 3]),
    )
    for name, values in restaurants:
        print_findings(name, determine_category(values, *learners))
Example #26
0
def prob3():
    """Plot best MSE and recorded epoch count against learning rate.

    The dataset path is read from ``sys.argv[2]``.  Rows are shuffled and cut
    at 55% and 80% of the row count into train / test / validation subsets
    restricted to a fixed list of columns.  One network (momentum 0) is
    trained per learning rate; the minimum of each returned MSE sequence and
    the length of the validation sequence are then plotted in two figures.
    """
    arff = Arff(sys.argv[2])
    imp_atts = [1, 3, 4, 5, 7, 9, 11, 12, 13]
    arff.shuffle()
    n = len(arff.get_labels().data)
    t = int(n * .55)
    v = n - int(n * .20)
    train_set = arff.create_subset_arff(row_idx=slice(0, t, 1),
                                        col_idx=imp_atts)
    test_set = arff.create_subset_arff(row_idx=slice(t, v, 1),
                                       col_idx=imp_atts)
    validation_set = arff.create_subset_arff(row_idx=slice(v, n, 1),
                                             col_idx=imp_atts)

    best_mse_te = []
    best_mse_tr = []
    best_mse_va = []
    epochs = []

    LRS = [.01, .1, .5, .8, 1.5]
    for LR in LRS:
        nn = NeuralNetwork(8, [16], 11, LR=LR, momentum=0)
        all_acc_va, all_mse_va, all_mse_te, all_mse_tr = nn.train_set(
            train_set, test_set, validation_set, w=5)
        best_mse_te.append(min(all_mse_te))
        best_mse_tr.append(min(all_mse_tr))
        best_mse_va.append(min(all_mse_va))
        epochs.append(len(all_mse_va))

    plt.plot(LRS, best_mse_te, label="MSE Te")
    plt.plot(LRS, best_mse_tr, label="MSE Tr")
    plt.plot(LRS, best_mse_va, label="MSE V.A")
    plt.title("Vowel MSE vs Learning Rate")
    plt.xlabel("Learning Rate")
    plt.ylabel("MSE")
    plt.legend()
    plt.show()

    # Single unlabelled series: no legend is requested here, because calling
    # legend() without any labelled artist only emits a warning.
    plt.plot(LRS, epochs)
    plt.title("Vowel Epochs vs Learning Rate")
    plt.xlabel("Learning Rate")
    plt.ylabel("Epochs")
    plt.show()
Example #27
0
def prob3_normalized():
    """Plot k-means and HAC results on normalized abalone for 2..7 clusters.

    The value returned by each ``train`` call is plotted as SSE; the two HAC
    result sequences are reversed before plotting.
    """
    abalone = Arff('datasets/abalone.arff', label_count=0)
    abalone.normalize()
    domain = np.arange(2, 8)

    kmeans_sse = [KMeans(k).train(abalone) for k in domain]

    single_link = HAC()
    complete_link = HAC(simple=False)
    single_sse = single_link.train(abalone, printk=domain)
    complete_sse = complete_link.train(abalone, printk=domain)

    series = (
        (kmeans_sse, "K-Means SSE"),
        (single_sse[::-1], "HAC (Single-Link) SSE"),
        (complete_sse[::-1], "HAC (Complete-Link) SSE"),
    )
    for values, label in series:
        plt.plot(domain, values, label=label)
    plt.title("Abalone SSE (Normalized) vs # of Clusters")
    plt.xlabel("# of Clusters")
    plt.ylabel('SSE')
    plt.legend()
    plt.show()
Example #28
0
def prob_6_b():
    """Plot train/test accuracy against nodes per hidden layer on vowels.

    For each width ``i`` in 2, 4, ..., 64 the network has ``32 // i`` hidden
    layers of ``i`` nodes each.  Three networks (momentum 0.85, lr 0.1,
    ``max_epoch`` 500) are trained per width and their accuracies averaged.
    """
    trials = 3

    # read in vowels dataset
    arff = Arff('datasets/vowels.arff')

    # Leave out the test/train and person features, which are unnecessary.
    arff = arff.create_subset_arff(col_idx=slice(2, None), label_count=1)

    # Get a 75/25 train/test split.  The first quarter of the shuffled rows
    # is the test set and everything after it is training data.  (Previously
    # the two were swapped and `slice(..., -1)` dropped the last instance.)
    arff.shuffle()
    test = arff.create_subset_arff(slice(arff.instance_count // 4))
    training = arff.create_subset_arff(slice(arff.instance_count // 4, None))
    t_features = test.get_features()
    t_labels = test.get_labels()

    # Hold out the leading rows of the training split for validation.
    # NOTE(review): the size is a fifth of the full dataset's row count, not
    # the 15% the original comment mentioned - confirm the intended fraction.
    validation = training.create_subset_arff(slice(arff.instance_count // 5))
    training = training.create_subset_arff(
        slice(arff.instance_count // 5, None))
    features = training.get_features()
    labels = training.get_labels()

    taccuracy = []
    accuracy = []
    domain = 2**np.arange(1, 7)
    for i in domain:
        tacc = 0
        acc = 0
        for _ in range(trials):
            learner = MultilayerPerceptronLearner([training.features_count] +
                                                  [i] * (32 // i) + [11],
                                                  momentum=.85)
            learner.lr = .1
            learner.max_epoch = 500
            learner.train(training.get_features(), training.get_labels(),
                          validation.get_features(), validation.get_labels())
            tacc += learner.get_accuracy(t_features, t_labels)
            acc += learner.get_accuracy(features, labels)
        accuracy.append(acc / trials)
        taccuracy.append(tacc / trials)

    # NOTE(review): Matplotlib 3.3 renamed `basex` to `base`; switch the
    # keyword if this is run on a newer Matplotlib.
    plt.semilogx(domain, accuracy, basex=2, label="Training Set Accuracy")
    plt.semilogx(domain, taccuracy, basex=2, label="Test Set Accuracy")
    plt.title("Node Distribution vs Accuracy")
    plt.xlabel("Number of Nodes per Hidden Layer")
    plt.ylabel("Accuracy")
    plt.legend()
    plt.show()
Example #29
0
def prob_5(lr, hidden):
    """Plot the mean number of training epochs against momentum on vowels.

    For each of 20 momentum values in [0, 1], ten single-hidden-layer networks
    are trained (``max_epoch`` 500).  Mean test accuracy per momentum is
    printed and mean ``learner.epochs`` is plotted.

    Args:
        lr: Learning rate assigned to every learner.
        hidden: Number of nodes in the single hidden layer.
    """
    trials = 10

    # read in vowels dataset
    arff = Arff('datasets/vowels.arff')

    # Leave out the test/train and person features, which are unnecessary.
    arff = arff.create_subset_arff(col_idx=slice(2, None), label_count=1)

    # Get a 75/25 train/test split.  The first quarter of the shuffled rows
    # is the test set and everything after it is training data.  (Previously
    # the two were swapped and `slice(..., -1)` dropped the last instance.)
    arff.shuffle()
    test = arff.create_subset_arff(slice(arff.instance_count // 4))
    training = arff.create_subset_arff(slice(arff.instance_count // 4, None))
    t_features = test.get_features()
    t_labels = test.get_labels()

    # Hold out the leading rows of the training split for validation.
    # NOTE(review): the size is a fifth of the full dataset's row count, not
    # the 15% the original comment mentioned - confirm the intended fraction.
    validation = training.create_subset_arff(slice(arff.instance_count // 5))
    training = training.create_subset_arff(
        slice(arff.instance_count // 5, None))

    epochs = []
    accuracy = []
    domain = np.linspace(0, 1, 20)
    for momentum in domain:
        e = 0
        acc = 0
        for _ in range(trials):
            learner = MultilayerPerceptronLearner(
                [training.features_count, hidden, 11], momentum=momentum)
            learner.lr = lr
            learner.max_epoch = 500
            learner.train(training.get_features(), training.get_labels(),
                          validation.get_features(), validation.get_labels())
            acc += learner.get_accuracy(t_features, t_labels)
            e += learner.epochs
        epochs.append(e / trials)
        accuracy.append(acc / trials)

    print(accuracy)
    plt.plot(domain, epochs)
    plt.title("Number of Training Epochs vs Momentum")
    plt.xlabel("Momentum Constant")
    plt.ylabel("Number of Training Epochs")
    plt.show()
def prob_2():
    """Run 10-fold cross-validation on the cars and voting datasets.

    For each dataset the two values returned by ``k_fold_cv`` are printed
    (presumably per-fold training and test accuracies - confirm against
    ``k_fold_cv``), followed by the mean of the second one.
    """
    for name in ('cars', 'voting'):
        arff = Arff('datasets/{}.arff'.format(name))
        arff.shuffle()
        acc, tacc = k_fold_cv(arff, 10)

        # Both headers now end with a colon; the voting header previously
        # ended with a stray semicolon.
        print('{}:'.format(name))
        print('acc', acc)
        print('tacc', tacc)
        print('tot', sum(tacc) / len(tacc))
        print()