def q_2_6():
    """Fit a decision tree on 800 shuffled Titanic rows and print it.

    The preprocessed rows are shuffled with a fixed seed (20) so the
    selection is reproducible; only the first 800 shuffled rows are used
    for fitting.
    """
    raw, _test_data, feature_names, class_names = load_titanic_data()
    prepared = preprocess_titanic(raw, True)

    # Fixed-seed shuffle, then keep the first 800 rows; the remaining rows
    # are not used by this function.
    row_count = prepared.shape[0]
    shuffle_order = np.random.RandomState(seed=20).permutation(row_count)
    train_rows = prepared[shuffle_order][:800]

    type_map, categories_map = gen_maps(train_rows)
    tree = DecisionTree(
        type_map,
        categories_map,
        feature_names,
        class_names,
    )
    # NOTE(review): the meaning of the positional arguments 3 and 10 is
    # defined by DecisionTree.fit — confirm there.
    tree.fit(train_rows, 3, 10)
    print(tree)
Ejemplo n.º 2
0
 def test_gini_impurity_2(self):
     """Check gini_impurity against a split where both sides are pure.

     With threshold 3, the values 1 and 2 (both labelled 1) lie below it
     and the values 4 and 6 (both labelled 0) lie above it, so each side
     holds a single class and the expected weighted impurity is 0.
     """
     features = np.array([1, 4, 2, 6])
     labels = np.array([1, 0, 1, 0])
     split_at = 3

     observed = DecisionTree.gini_impurity(features, labels, split_at)

     # Each side has two of the four samples, hence the 0.5 weights.
     impurity_above = 0
     impurity_under = 0
     expected = 0.5 * impurity_above + 0.5 * impurity_under
     self.assertEqual(observed, expected)
Ejemplo n.º 3
0
 def test_information_gain(self):
     """Check information_gain for a split that does not separate classes.

     With threshold 3, the values 1 and 2 (labels 1 and 0) lie below it and
     the values 4 and 6 (labels 0 and 1) lie above it, so the parent and
     both sides each hold one sample of every class. The expected value is
     built from natural-log entropies of a 50/50 distribution.
     """
     X = np.array([1, 4, 2, 6])
     y = np.array([1, 0, 0, 1])
     thresh = 3
     gain = DecisionTree.information_gain(X, y, thresh)

     # Entropy of a two-class 50/50 distribution: -2 * (0.5 * ln 0.5).
     H_y = -0.5 * np.log(0.5) * 2
     H_above = -0.5 * np.log(0.5) * 2
     H_under = -0.5 * np.log(0.5) * 2
     expected = H_y - 0.5 * H_above - 0.5 * H_under

     # Entropy arithmetic is floating point; compare with a tolerance so
     # that rounding differences alone cannot fail the test.
     self.assertAlmostEqual(gain, expected)
def kaggle():
    """Fit a decision tree on all Titanic training rows and export predictions.

    Prints the value of error_rate for the training rows (labels are read
    from column 9) and passes the flattened test-set predictions to
    results_to_csv.
    """
    raw_train, raw_test, _feature_names, _class_names = load_titanic_data()
    # NOTE(review): the boolean flag differs between the training and test
    # sets — its exact meaning is defined by preprocess_titanic.
    train_rows = preprocess_titanic(raw_train, True)
    test_rows = preprocess_titanic(raw_test, False)

    type_map, categories_map = gen_maps(train_rows)
    tree = DecisionTree(type_map, categories_map)
    tree.fit(train_rows, 4, 10)

    test_predictions = tree.predict(test_rows)
    train_predictions = tree.predict(train_rows)
    train_labels = extract_column(train_rows, 9)

    print(error_rate(train_predictions, train_labels))
    results_to_csv(test_predictions.flatten())
    """
def q_2_4():
    """Compare a decision tree and a random forest on the Titanic data.

    The preprocessed rows are shuffled with a fixed seed (20); the first
    800 shuffled rows are used for fitting and the rest for validation.
    For each model, the value of error_rate on both splits is printed.

    NOTE(review): the printed labels say "Accuracies" while the values come
    from error_rate — confirm which metric error_rate actually returns.
    """
    print("******RUNNING TITANIC DATA SET*****")

    raw, _test_data, _feature_names, _class_names = load_titanic_data()
    prepared = preprocess_titanic(raw, True)

    row_count = prepared.shape[0]
    shuffle_order = np.random.RandomState(seed=20).permutation(row_count)
    shuffled = prepared[shuffle_order]
    train_rows = shuffled[:800]
    valid_rows = shuffled[800:]
    # Labels are read from the last column of each split.
    label_col = train_rows.shape[1] - 1

    def predict_with_labels(model, rows):
        """Return (predictions, actual labels) for *rows*."""
        predicted = model.predict(rows)
        actual = extract_column(rows, label_col)
        return predicted, actual

    type_map, categories_map = gen_maps(train_rows)

    tree = DecisionTree(type_map, categories_map)
    tree.fit(train_rows, 4, 10)
    train_pair = predict_with_labels(tree, train_rows)
    valid_pair = predict_with_labels(tree, valid_rows)

    print("Decision Tree training Accuracies:       ",
          error_rate(*train_pair))
    print("Decision Tree Validation Accuracies:    ",
          error_rate(*valid_pair))

    # NOTE(review): the positional settings are defined by RandomForest's
    # constructor and fit — confirm their meaning there.
    forest = RandomForest(300, 300, 2, type_map, categories_map, 20)
    forest.fit(train_rows, 10, 10)
    train_pair = predict_with_labels(forest, train_rows)
    valid_pair = predict_with_labels(forest, valid_rows)

    print("Random Forest training Accuracies:       ",
          error_rate(*train_pair))
    print("Random Forest Validation Accuracies:    ",
          error_rate(*valid_pair))