예제 #1
0
def test_two_best_splits():
    """Pin the split chosen at the root for a small two-feature data set.

    A cut on ``v1`` at 0 separates the labels exactly: every row with
    ``v1 <= 0`` is labelled 0 and every row with ``v1 > 0`` is labelled 1.
    The assertion checks that the root's left child records that split.

    NOTE(review): the original comment stated that a cut on ``v2`` at 0 is
    equally perfect, but rows [-1, 3] and [-4.1, 2.8] have ``v2 > 0`` with
    label 0 -- confirm the intended fixture.
    """
    features = pd.DataFrame(
        [[3.1, 0.1], [4, 1], [7, 2], [6, 3],
         [0, 0], [-1, 3], [-4.1, 2.8], [-10, -10]],
        columns=['v1', 'v2'],
    )
    labels = pd.DataFrame([1, 1, 1, 1, 0, 0, 0, 0], columns=['y'])

    fitted = ct.DecisionTree(features, labels)

    expected_path = ['v1', ' <= ', '0.0']
    assert fitted.tnode.lhs.path == expected_path
예제 #2
0
def r2_tree():
    """Build a two-feature fixture tree and return it with its training data.

    The training rows are labelled by quadrant: y = 0 where ``var1`` and
    ``var2`` share a sign (quadrants 1 and 3) and y = 1 where the signs
    differ (quadrants 2 and 4).

    Returns
    -------
    tuple
        ``(tree, x_df, y_df)`` -- the fitted ``ct.DecisionTree`` and the
        feature / label frames it was fitted on.
    """
    # Rows are listed quadrant by quadrant; order matches the label list.
    quadrant_1 = [[1, 5], [4, 2], [1, 1], [2, 4], [3, 1]]
    quadrant_2 = [[-1, 6], [-2, 9], [-5, 1], [-7, 1], [-6, 4], [-0.1, 0.1]]
    quadrant_3 = [[-4, -5], [-1, -2], [-0.2, -5]]
    quadrant_4 = [[3, -4], [5, -1]]

    points = quadrant_1 + quadrant_2 + quadrant_3 + quadrant_4
    labels = ([0] * len(quadrant_1) + [1] * len(quadrant_2) +
              [0] * len(quadrant_3) + [1] * len(quadrant_4))

    x_df = pd.DataFrame(points, columns=['var1', 'var2'])
    y_df = pd.DataFrame(labels, columns=['y'])
    return ct.DecisionTree(x_df, y_df), x_df, y_df
예제 #3
0
def r1_tree():
    """Build a single-feature fixture tree plus a small test frame.

    Training labels: y = 1 for the values up to and including -2, y = 0 for
    the values 0 through 9, and y = 1 again for the values from 12 upwards.

    Returns
    -------
    tuple
        ``(t1, x_test_df)`` -- the fitted ``ct.DecisionTree`` and a frame of
        unseen ``var1`` values to predict on.
    """
    feature = ['var1']
    train_values = (-10, -5, -2, 0, 2, 6, 9, 12, 20, 100)
    train_labels = [1, 1, 1, 0, 0, 0, 0, 1, 1, 1]
    test_values = (-10, 5, 8, 12, 50)

    # One single-element row per value, matching a one-column frame.
    train_x = pd.DataFrame([[v] for v in train_values], columns=feature)
    train_y = pd.DataFrame(train_labels, columns=['y'])
    x_test_df = pd.DataFrame([[v] for v in test_values], columns=feature)

    return ct.DecisionTree(train_x, train_y), x_test_df
예제 #4
0
def r3_tree():
    """Build a three-feature fixture tree plus a small test frame.

    Training labels: y = 1 when ``var1 <= 0`` and ``var2 <= 0``, or when
    ``var1 > 0`` and ``var3 < 0``; every other row is 0 (the one row with
    ``var1 > 0`` and ``var3 == 0`` is labelled 0).

    Returns
    -------
    tuple
        ``(t3, x_test_df, y_df)`` -- the fitted ``ct.DecisionTree``, a frame
        of unseen rows to predict on, and the training labels.
    """
    feature_names = ['var1', 'var2', 'var3']

    train_rows = [
        [-5, -4, -1], [-5, -1, -3], [2, -1, 17], [2.5, 5, -10], [2, 2, 2],
        [-6, 1, 8], [4, -5, 0], [5, 0, 0.5], [16, 4, -5], [100, 3, 3],
        [4, -10, 17], [8, -1, 5], [-3, 5, 6], [-0.1, -0.1, 0.1],
        [10, -0.1, -4],
    ]
    train_labels = [1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1]
    test_rows = [[-1, -1, 5], [1, 2, -1], [6, 0, 0], [5, -1, 6]]

    train_x = pd.DataFrame(train_rows, columns=feature_names)
    y_df = pd.DataFrame(train_labels, columns=['y'])
    x_test_df = pd.DataFrame(test_rows, columns=feature_names)

    return ct.DecisionTree(train_x, y_df), x_test_df, y_df
예제 #5
0
    def build_tree(self):
        """Fit one decision tree on a random subset of rows and columns.

        ``self.n_features`` column positions are drawn without replacement
        and ``self.sample_size`` row positions are drawn with replacement.
        The row draw is repeated until the sampled labels contain exactly two
        distinct values, so the fitted tree always sees both classes.

        Returns
        -------
        ct.DecisionTree
            A single tree fitted on the sampled rows/columns using
            ``self.criterion``.

        Raises
        ------
        ValueError
            If no row sample could ever contain two distinct labels, i.e.
            the first column of ``self.y_train`` has fewer than two distinct
            values or ``self.sample_size`` is below 2.

        Examples
        --------
        >>> tree = self.build_tree()

        """
        # A sample can only hold both classes if the label column itself has
        # at least two distinct values and at least two rows are drawn;
        # without this guard the resampling loop below would never terminate.
        all_labels = self.y_train.iloc[:, 0]
        if len(all_labels.unique()) < 2 or self.sample_size < 2:
            raise ValueError(
                "build_tree needs at least two distinct labels in y_train "
                "and sample_size >= 2 to draw a sample with both classes")

        # Positions of every row / column in X
        row_list = list(range(self.rows))
        x_col_list = list(range(self.xcols))

        # Random subset of columns, sampled without replacement
        x_col_rand = np.random.choice(x_col_list, self.n_features,
                                      replace=False)

        # Redraw rows (with replacement) until the sampled labels contain
        # exactly two distinct values
        uniquey = 0
        while uniquey != 2:
            x_row_rand = np.random.choice(row_list, self.sample_size)
            y_train = self.y_train.iloc[x_row_rand, :]
            uniquey = len(y_train.iloc[:, 0].unique())

        # Fit decision tree on the sampled rows and columns
        x_train = self.x_train.iloc[x_row_rand, x_col_rand]
        return ct.DecisionTree(x_train, y_train, self.criterion)
예제 #6
0
def test_random_valid():
    """Fit a tree on random data and check that it reports itself valid.

    Features are 50 x 5 standard-normal draws; labels are uniform draws on
    [0, 1) rounded to 0.0 or 1.0.
    """
    n_rows, n_cols = 50, 5

    features = pd.DataFrame(np.random.randn(n_rows, n_cols),
                            columns=range(n_cols))
    uniform_draws = pd.DataFrame(np.random.random((n_rows, 1)), columns=['y'])
    labels = uniform_draws.round()

    fitted = ct.DecisionTree(features, labels)
    assert fitted.is_valid
예제 #7
0
def time_buildtree():
    """Fit a decision tree on the module-level ``x_df`` / ``y_df`` and return it.

    NOTE(review): the name suggests this is a timing-benchmark target --
    confirm against the caller.
    """
    fitted = ct.DecisionTree(x_df, y_df)
    return fitted
예제 #8
0
def test_sql_python_same():
    """Check that the in-memory tree and the SQL-backed tree predict alike.

    The same 15 training rows are fitted once with ``ct.DecisionTree`` (from
    DataFrames) and once with ``st.SQLTree`` (from the ``datatable`` SQL
    table). Both trees then predict the same 7 test rows; the results are
    sorted by ``v1``, re-indexed, and compared with ``DataFrame.equals``.
    The comparison is repeated after pruning both trees at alpha = 0.2.

    Requires a reachable database described by ``cd.host``, ``cd.database``,
    ``cd.user`` and ``cd.password``.
    """
    # Cursor used for every SQL statement in this test.
    cur = st.sqlconnect(host=cd.host,
                        database=cd.database,
                        user=cd.user,
                        password=cd.password)
    # Training features; the same values are inserted into `datatable` below.
    x_df = pd.DataFrame([[1.35090528, -0.22763714, 0.62503887],
                         [-0.0715539, -0.64119863, -0.19062135],
                         [-1.11177092, 0.50165846, -0.86722735],
                         [1.24392279, -0.08266315, -0.82700858],
                         [0.41391078, -1.06708343, -0.591038],
                         [-0.11328491, 2.19414569, -1.0890808],
                         [1.00572935, -0.92290436, 1.38861161],
                         [-0.78596497, 1.56025647, 0.95610325],
                         [1.59251311, 2.18732072, -0.73577758],
                         [-1.16918551, -0.21258418, 1.27649019],
                         [0.70237481, 1.82188747, -0.04181062],
                         [-0.56060812, 0.56029165, -0.90909157],
                         [0.44574311, 0.94814604, -0.01507905],
                         [-1.3072048, 1.62805262, -0.56249722],
                         [0.62097551, -1.33599419, 0.1845642]],
                        columns=['v1', 'v2', 'v3'])

    # Training labels, one per row of x_df.
    y_df = pd.DataFrame([[1], [0], [0], [1], [0], [0], [1], [0], [1], [0], [0],
                         [1], [0], [1], [0]],
                        columns=['y'])

    # Python-side tree.
    tree_ct = ct.DecisionTree(x_df, y_df)

    # Test features; the same values are inserted into `testtable` below.
    x_test = pd.DataFrame([[0.31269028, 1.86935075, 1.3147904],
                           [1.47276502, -1.77782668, -0.36375857],
                           [1.59640162, -1.21098536, -0.07769382],
                           [-0.40091173, -0.7496455, 0.39000357],
                           [-0.29370055, -0.40686242, 1.44866448],
                           [0.06426318, -1.30074211, 0.49274947],
                           [0.16542666, 0.61140155, -1.94330865]],
                          columns=['v1', 'v2', 'v3'])

    # Python predictions, attached to the test rows as an int 'preds' column,
    # then sorted by v1 and re-indexed so row order matches the SQL result.
    tree_ct_preds = tree_ct.predict(x_test)
    tree_preds_df = pd.concat([
        x_test,
        pd.DataFrame([int(p) for p in tree_ct_preds], columns=['preds'])
    ],
                              axis=1).sort_values('v1')
    tree_preds_df.index = range(tree_preds_df.shape[0])

    # Make sure the training table exists and is empty before inserting.
    cur.execute(
        "CREATE TABLE IF NOT EXISTS datatable (v1 FLOAT, v2 FLOAT, v3 FLOAT, y INT);"
    )
    cur.execute("DELETE FROM datatable;")

    # One multi-row INSERT, split across several literals for line length.
    a = "INSERT INTO datatable (v1, v2, v3, y) VALUES (1.35090528, -0.22763714,  0.62503887, 1), "
    b = "(-0.0715539 , -0.64119863, -0.19062135, 0), (-1.11177092,  0.50165846, -0.86722735, 0),"
    c = "(1.24392279, -0.08266315, -0.82700858, 1), (0.41391078, -1.06708343, -0.591038, 0),"
    d = "(-0.11328491,  2.19414569, -1.0890808, 0),(1.00572935, -0.92290436,  1.38861161, 1),"
    e = "(-0.78596497,  1.56025647,  0.95610325, 0),(1.59251311,  2.18732072, -0.73577758, 1),"
    f = "(-1.16918551, -0.21258418,  1.27649019, 0),(0.70237481,  1.82188747, -0.04181062, 0),"
    g = "(-0.56060812,  0.56029165, -0.90909157, 1), (0.44574311,  0.94814604, -0.01507905, 0),"
    h = "(-1.3072048 ,  1.62805262, -0.56249722, 1),(0.62097551, -1.33599419,  0.1845642, 0);"
    query_data = a + b + c + d + e + f + g + h
    cur.execute(query_data)
    # SQL-side tree built from the table name, feature columns and label column.
    tree_st = st.SQLTree("datatable", ['v1', 'v2', 'v3'], 'y', cur)
    # Make sure the test table exists and is empty before inserting.
    cur.execute(
        "CREATE TABLE IF NOT EXISTS testtable (v1 FLOAT, v2 FLOAT, v3 FLOAT);")
    cur.execute("DELETE FROM testtable;")

    # Multi-row INSERT of the same 7 test rows as x_test.
    i = "INSERT INTO testtable (v1, v2, v3) VALUES (0.31269028,  1.86935075,  1.3147904), "
    j = "(1.47276502, -1.77782668, -0.36375857), (1.59640162, -1.21098536, -0.07769382), "
    k = "(-0.40091173, -0.7496455, 0.39000357), (-0.29370055, -0.40686242,  1.44866448), "
    l = "(0.06426318, -1.30074211,  0.49274947), (0.16542666,  0.61140155, -1.94330865);"
    query_test = i + j + k + l
    cur.execute(query_test)
    # NOTE(review): predict presumably adds a 'preds' column to testtable --
    # the SELECT below is read back as four columns and that column is
    # dropped before the second predict call; confirm against SQLTree.
    tree_st.predict("testtable")
    cur.execute("SELECT * FROM testtable;")
    preds_sql = cur.fetchall()
    # Same sort / re-index as on the Python side so the frames line up.
    preds_df_sql = pd.DataFrame(preds_sql, columns=['v1', 'v2', 'v3',
                                                    'preds']).sort_values('v1')
    preds_df_sql.index = range(preds_df_sql.shape[0])

    assert tree_preds_df.equals(preds_df_sql)

    # We should also get the same results if we prune at the same level
    tree_ct.prune(alphas=[0.2], cross_validate=False)
    tree_st.prune(alpha=0.2)

    # Python preds, pruned
    tree_ct_preds_pruned = tree_ct.predict(x_test)
    tree_preds_df_pruned = pd.concat([
        x_test,
        pd.DataFrame([int(p) for p in tree_ct_preds_pruned], columns=['preds'])
    ],
                                     axis=1).sort_values('v1')
    tree_preds_df_pruned.index = range(tree_preds_df_pruned.shape[0])

    # SQL preds, pruned
    # Remove the earlier predictions so predict can run on the table again.
    cur.execute("ALTER TABLE testtable DROP COLUMN preds;")
    tree_st.predict("testtable")
    cur.execute("SELECT * FROM testtable;")
    preds_sql_pruned = cur.fetchall()
    preds_df_sql_pruned = pd.DataFrame(preds_sql_pruned,
                                       columns=['v1', 'v2', 'v3',
                                                'preds']).sort_values('v1')
    preds_df_sql_pruned.index = range(preds_df_sql_pruned.shape[0])

    assert tree_preds_df_pruned.equals(preds_df_sql_pruned)
예제 #9
0
# Pruning strengths to evaluate, one single-tree run per value.
alphas = [0, 0.2, 0.4, 0.6, 0.8]
# Values iterated as `ntree` in the second inner loop.
trees = [1, 2, 5, 10]

# Repeat the whole experiment three times; every run draws fresh splits.
for i in range(3):
    for alpha in alphas:

        # Fresh random train/test split of X, y for this alpha
        # (presumably scikit-learn's train_test_split with a 33% test share
        # -- confirm against the file's imports).
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.33)

        # Wrap each piece in a DataFrame before passing it to the tree.
        X_train, X_test = pd.DataFrame(X_train), pd.DataFrame(X_test),
        y_train, y_test = pd.DataFrame(y_train), pd.DataFrame(y_test)

        # Fit a decision tree, prune it at this single alpha without
        # cross-validation, and print the confusion matrix on the test split.
        tree = ct.DecisionTree(X_train, y_train)
        tree.prune(alphas=[alpha], cross_validate=False)
        preds = tree.predict(X_test)
        print("Confusion matrix at alpha = " + str(alpha) + " is:")
        print(confusion_matrix(np.array(y_test), np.array(preds)))

    for ntree in trees:

        # Fresh random train/test split of X, y for this entry of `trees`.
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.33)

        # Wrap each piece in a DataFrame, as in the alpha loop.
        X_train, X_test = pd.DataFrame(X_train), pd.DataFrame(X_test),
        y_train, y_test = pd.DataFrame(y_train), pd.DataFrame(y_test)