def test_two_best_splits():
    # Note that splitting on x1 @ 0 and x2 @ 0 both split the data perfectly
    x_train = [[3.1, 0.1], [4, 1], [7, 2], [6, 3],
               [0, 0], [-1, 3], [-4.1, 2.8], [-10, -10]]
    y_train = [1, 1, 1, 1, 0, 0, 0, 0]
    x_df = pd.DataFrame(x_train, columns=['v1', 'v2'])
    y_df = pd.DataFrame(y_train, columns=['y'])
    dtree = ct.DecisionTree(x_df, y_df)
    assert dtree.tnode.lhs.path == ['v1', ' <= ', '0.0']
def r2_tree():
    # Split into quadrants: y = 0 if x is in Quadrant 1 or 3, and y = 1 otherwise
    x_data = [[1, 5], [4, 2], [1, 1], [2, 4], [3, 1],
              [-1, 6], [-2, 9], [-5, 1], [-7, 1], [-6, 4], [-0.1, 0.1],
              [-4, -5], [-1, -2], [-0.2, -5],
              [3, -4], [5, -1]]
    x_df = pd.DataFrame(x_data, columns=['var1', 'var2'])
    y_data = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1]
    y_df = pd.DataFrame(y_data, columns=['y'])
    tree = ct.DecisionTree(x_df, y_df)
    return tree, x_df, y_df
def r1_tree():
    # If x <= -2, then y = 1; if -2 < x <= 9, then y = 0; otherwise y = 1
    x_data = [[-10], [-5], [-2], [0], [2], [6], [9], [12], [20], [100]]
    x_df = pd.DataFrame(x_data, columns=['var1'])
    y_data = [1, 1, 1, 0, 0, 0, 0, 1, 1, 1]
    y_df = pd.DataFrame(y_data, columns=['y'])
    x_test_data = [[-10], [5], [8], [12], [50]]
    x_test_df = pd.DataFrame(x_test_data, columns=['var1'])
    t1 = ct.DecisionTree(x_df, y_df)
    return t1, x_test_df
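# A minimal sketch (not part of the original suite) of how the r1_tree fixture
# could be consumed. The expected labels follow the rule stated in the fixture
# comment and assume the fitted tree recovers the same thresholds from the
# training data.
def test_r1_tree_predictions():
    t1, x_test_df = r1_tree()
    preds = t1.predict(x_test_df)
    # x_test values: -10 -> 1, 5 -> 0, 8 -> 0, 12 -> 1, 50 -> 1
    assert [int(p) for p in preds] == [1, 0, 0, 1, 1]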
def r3_tree():
    # y = 1 if (x1 <= 0 and x2 <= 0) or (x1 > 0 and x3 <= 0); otherwise y = 0
    x_data = [[-5, -4, -1], [-5, -1, -3], [2, -1, 17], [2.5, 5, -10],
              [2, 2, 2], [-6, 1, 8], [4, -5, 0], [5, 0, 0.5],
              [16, 4, -5], [100, 3, 3], [4, -10, 17], [8, -1, 5],
              [-3, 5, 6], [-0.1, -0.1, 0.1], [10, -0.1, -4]]
    x_df = pd.DataFrame(x_data, columns=['var1', 'var2', 'var3'])
    y_data = [1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1]
    y_df = pd.DataFrame(y_data, columns=['y'])
    x_test_data = [[-1, -1, 5], [1, 2, -1], [6, 0, 0], [5, -1, 6]]
    x_test_df = pd.DataFrame(x_test_data, columns=['var1', 'var2', 'var3'])
    t3 = ct.DecisionTree(x_df, y_df)
    return t3, x_test_df, y_df
def build_tree(self):
    """Build a single tree, starting from the parent node, by randomly
    sampling rows and columns.

    1. Start with the parent node.
    2. Create lhs and rhs nodes.
    3. Use recursion so that each of the children has children, etc.

    Returns
    -------
    A fitted ct.DecisionTree built on the sampled data

    Examples
    --------
    >>> tree = self.build_tree()

    See Also
    --------
    - get_data: Uses the past_split node while maintaining the x_train and
      y_train data as separate entities
    - find_impurity: Finds impurity of a particular cutoff point / variable
      pair given the criterion measure
    """
    # List of all rows/columns in X
    row_list = list(range(self.rows))
    x_col_list = list(range(self.xcols))

    # Randomly choose columns in X
    x_col_rand = np.random.choice(x_col_list, self.n_features,
                                  False)  # Sample without replacement

    # Make sure that the corresponding y's chosen for the rows have 0's and 1's
    uniquey = 0
    while uniquey != 2:
        x_row_rand = np.random.choice(
            row_list, self.sample_size)  # Sample with replacement
        y_train = self.y_train.iloc[x_row_rand, :]
        uniquey = len(y_train.iloc[:, 0].unique())

    # Fit decision tree with sampled data
    x_train = self.x_train.iloc[x_row_rand, x_col_rand]
    tree = ct.DecisionTree(x_train, y_train, self.criterion)
    return tree
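# A minimal sketch of how build_tree could be called repeatedly to assemble a
# forest. The method name build_trees and the attributes self.n_trees and
# self.trees are assumptions for illustration, not necessarily the class's
# actual API.
def build_trees(self):
    """Build self.n_trees bootstrap-sampled trees and store them on the instance."""
    self.trees = [self.build_tree() for _ in range(self.n_trees)]
    return self.trees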
def test_random_valid():
    x_df = pd.DataFrame(np.random.randn(50, 5), columns=range(5))
    y_df = pd.DataFrame(np.random.random((50, 1)), columns=['y']).round()
    dt = ct.DecisionTree(x_df, y_df)
    assert dt.is_valid
def time_buildtree():
    return ct.DecisionTree(x_df, y_df)
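# The benchmark above relies on module-level x_df and y_df. A possible setup,
# mirroring the random data used in test_random_valid, is sketched below; the
# seed and dimensions are illustrative assumptions, and in the benchmark module
# this would normally sit above time_buildtree.
import numpy as np
import pandas as pd

np.random.seed(0)
x_df = pd.DataFrame(np.random.randn(200, 5), columns=range(5))
y_df = pd.DataFrame(np.random.random((200, 1)), columns=['y']).round()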
def test_sql_python_same():
    cur = st.sqlconnect(host=cd.host, database=cd.database,
                        user=cd.user, password=cd.password)
    x_df = pd.DataFrame(
        [[1.35090528, -0.22763714, 0.62503887],
         [-0.0715539, -0.64119863, -0.19062135],
         [-1.11177092, 0.50165846, -0.86722735],
         [1.24392279, -0.08266315, -0.82700858],
         [0.41391078, -1.06708343, -0.591038],
         [-0.11328491, 2.19414569, -1.0890808],
         [1.00572935, -0.92290436, 1.38861161],
         [-0.78596497, 1.56025647, 0.95610325],
         [1.59251311, 2.18732072, -0.73577758],
         [-1.16918551, -0.21258418, 1.27649019],
         [0.70237481, 1.82188747, -0.04181062],
         [-0.56060812, 0.56029165, -0.90909157],
         [0.44574311, 0.94814604, -0.01507905],
         [-1.3072048, 1.62805262, -0.56249722],
         [0.62097551, -1.33599419, 0.1845642]],
        columns=['v1', 'v2', 'v3'])
    y_df = pd.DataFrame([[1], [0], [0], [1], [0], [0], [1], [0],
                         [1], [0], [0], [1], [0], [1], [0]], columns=['y'])
    tree_ct = ct.DecisionTree(x_df, y_df)

    x_test = pd.DataFrame(
        [[0.31269028, 1.86935075, 1.3147904],
         [1.47276502, -1.77782668, -0.36375857],
         [1.59640162, -1.21098536, -0.07769382],
         [-0.40091173, -0.7496455, 0.39000357],
         [-0.29370055, -0.40686242, 1.44866448],
         [0.06426318, -1.30074211, 0.49274947],
         [0.16542666, 0.61140155, -1.94330865]],
        columns=['v1', 'v2', 'v3'])

    # Python preds, unpruned
    tree_ct_preds = tree_ct.predict(x_test)
    tree_preds_df = pd.concat([
        x_test,
        pd.DataFrame([int(p) for p in tree_ct_preds], columns=['preds'])
    ], axis=1).sort_values('v1')
    tree_preds_df.index = range(tree_preds_df.shape[0])

    # Load the same training data into SQL and fit a SQLTree
    cur.execute(
        "CREATE TABLE IF NOT EXISTS datatable (v1 FLOAT, v2 FLOAT, v3 FLOAT, y INT);"
    )
    cur.execute("DELETE FROM datatable;")
    a = "INSERT INTO datatable (v1, v2, v3, y) VALUES (1.35090528, -0.22763714, 0.62503887, 1), "
    b = "(-0.0715539, -0.64119863, -0.19062135, 0), (-1.11177092, 0.50165846, -0.86722735, 0),"
    c = "(1.24392279, -0.08266315, -0.82700858, 1), (0.41391078, -1.06708343, -0.591038, 0),"
    d = "(-0.11328491, 2.19414569, -1.0890808, 0), (1.00572935, -0.92290436, 1.38861161, 1),"
    e = "(-0.78596497, 1.56025647, 0.95610325, 0), (1.59251311, 2.18732072, -0.73577758, 1),"
    f = "(-1.16918551, -0.21258418, 1.27649019, 0), (0.70237481, 1.82188747, -0.04181062, 0),"
    g = "(-0.56060812, 0.56029165, -0.90909157, 1), (0.44574311, 0.94814604, -0.01507905, 0),"
    h = "(-1.3072048, 1.62805262, -0.56249722, 1), (0.62097551, -1.33599419, 0.1845642, 0);"
    query_data = a + b + c + d + e + f + g + h
    cur.execute(query_data)
    tree_st = st.SQLTree("datatable", ['v1', 'v2', 'v3'], 'y', cur)

    # Load the test data into SQL and predict
    cur.execute(
        "CREATE TABLE IF NOT EXISTS testtable (v1 FLOAT, v2 FLOAT, v3 FLOAT);")
    cur.execute("DELETE FROM testtable;")
    i = "INSERT INTO testtable (v1, v2, v3) VALUES (0.31269028, 1.86935075, 1.3147904), "
    j = "(1.47276502, -1.77782668, -0.36375857), (1.59640162, -1.21098536, -0.07769382), "
    k = "(-0.40091173, -0.7496455, 0.39000357), (-0.29370055, -0.40686242, 1.44866448), "
    l = "(0.06426318, -1.30074211, 0.49274947), (0.16542666, 0.61140155, -1.94330865);"
    query_test = i + j + k + l
    cur.execute(query_test)
    tree_st.predict("testtable")
    cur.execute("SELECT * FROM testtable;")
    preds_sql = cur.fetchall()
    preds_df_sql = pd.DataFrame(
        preds_sql, columns=['v1', 'v2', 'v3', 'preds']).sort_values('v1')
    preds_df_sql.index = range(preds_df_sql.shape[0])
    assert tree_preds_df.equals(preds_df_sql)

    # We should also get the same results if we prune at the same level
    tree_ct.prune(alphas=[0.2], cross_validate=False)
    tree_st.prune(alpha=0.2)

    # Python preds, pruned
    tree_ct_preds_pruned = tree_ct.predict(x_test)
    tree_preds_df_pruned = pd.concat([
        x_test,
        pd.DataFrame([int(p) for p in tree_ct_preds_pruned], columns=['preds'])
    ], axis=1).sort_values('v1')
    tree_preds_df_pruned.index = range(tree_preds_df_pruned.shape[0])

    # SQL preds, pruned
    cur.execute("ALTER TABLE testtable DROP COLUMN preds;")
    tree_st.predict("testtable")
    cur.execute("SELECT * FROM testtable;")
    preds_sql_pruned = cur.fetchall()
    preds_df_sql_pruned = pd.DataFrame(
        preds_sql_pruned, columns=['v1', 'v2', 'v3', 'preds']).sort_values('v1')
    preds_df_sql_pruned.index = range(preds_df_sql_pruned.shape[0])
    assert tree_preds_df_pruned.equals(preds_df_sql_pruned)
alphas = [0, 0.2, 0.4, 0.6, 0.8]
trees = [1, 2, 5, 10]

for i in range(3):
    for alpha in alphas:
        # Randomly split
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
        X_train, X_test = pd.DataFrame(X_train), pd.DataFrame(X_test)
        y_train, y_test = pd.DataFrame(y_train), pd.DataFrame(y_test)

        # Create decision tree, prune at this alpha, and evaluate on the test split
        tree = ct.DecisionTree(X_train, y_train)
        tree.prune(alphas=[alpha], cross_validate=False)
        preds = tree.predict(X_test)
        print("Confusion matrix at alpha = " + str(alpha) + " is:")
        print(confusion_matrix(np.array(y_test), np.array(preds)))

    for ntree in trees:
        # Randomly split
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
        X_train, X_test = pd.DataFrame(X_train), pd.DataFrame(X_test)
        y_train, y_test = pd.DataFrame(y_train), pd.DataFrame(y_test)