def learn_with_purity(purity):
    # Train a decision tree with the given purity threshold.
    dtree = DecisionTree(x_train, y_train, max_depth=n_attr, purity=purity)
    dtree.fit()
    # train_accuracy = dtree.accuracy(x_train, y_train)
    # test_accuracy = dtree.accuracy(x_test, y_test)
    test_preds = dtree.predict(x_test)
    return test_preds
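
# Example use (a sketch; the purity values are illustrative and y_test is
# assumed to be in scope as in the surrounding scripts): sweep the purity
# threshold and report test accuracy at each setting.
for purity in [0.80, 0.85, 0.90, 0.95, 1.00]:
    preds = learn_with_purity(purity)
    print(purity, (preds == y_test).mean())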
def fit(self, X, y):
    '''The common way is, at each node, to pick d features at random and
    select the one that maximizes the information gain for splitting.
    This is done for performance reasons, and it is particularly useful
    when the number of features in the dataset is very large. [In the API
    of sklearn's DecisionTreeClassifier class, this option is given through
    the parameter 'splitter'. In the API of DecisionForest, the attribute
    'max_features' specifies the number of features to consider.]
    However, since the number of features of our dataset was limited to 14,
    we decided to do an exhaustive search of the features at each node.'''
    self.estimators_ = []

    # Russell's method: each tree sees a random subset of the features.
    if self.russells_method:
        for i in range(self.n_estimators):
            size = int(X.shape[1] / 1.5)
            idxs = np.random.choice(range(X.shape[1]), size, replace=False)
            samples = (X.T[idxs]).T
            tree = DecisionTree(max_depth=4)
            tree.fit(samples, y)
            self.estimators_.append(tree)
        return

    # Standard method: draw n samples WITH replacement (bootstrapping).
    # Each tree then trains on a perturbed resample of the data, which
    # decorrelates the trees and reduces the variance of the ensemble.
    for i in range(self.n_estimators):
        idxs = np.random.choice(range(X.shape[0]), X.shape[0],
                                replace=True if self.bootstrap else False)
        tree = DecisionTree(max_depth=4)
        tree.fit(X[idxs], y[idxs])
        self.estimators_.append(tree)
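
# A quick standalone sanity check of the bootstrapping above (a sketch, not
# part of the class): a size-n sample drawn with replacement contains on
# average about 1 - 1/e ~ 63.2% unique rows, so each tree effectively trains
# on a different subset of the data.
import numpy as np

n = 10000
boot = np.random.choice(n, size=n, replace=True)
print(len(np.unique(boot)) / n)  # ~0.632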
def assessLoan(self, event):
    """Event handler for the Assess Loan Application button."""
    click = event.GetEventObject().GetLabel()
    text_entered = self.textbox.GetValue()
    print(text_entered)
    dt = DecisionTree()
    dt.setCreditScore(float(text_entered))
    dt.executeTree()
    self.textbox2.SetValue(dt.printState())
def k_fold_cross_validation(x, y, k, shf=False):
    # Optionally shuffle x and y together so the folds are not order-dependent.
    if shf:
        to_shf = np.column_stack((x, y))
        to_shf = list(to_shf)
        shuffle(to_shf)
        to_shf = np.array(to_shf)
        x = np.delete(to_shf, -1, axis=1)
        y = to_shf[:, -1]
    train_acc = np.zeros((k, n_attr))
    val_acc = np.zeros((k, n_attr))
    for d in range(k):
        print(d, "th fold...")
        # Every k-th sample (offset d) goes to validation; the rest train.
        x_train = np.array([row for i, row in enumerate(x) if i % k != d])
        x_val = np.array([row for i, row in enumerate(x) if i % k == d])
        y_train = np.array([val for i, val in enumerate(y) if i % k != d])
        y_val = np.array([val for i, val in enumerate(y) if i % k == d])
        for depth in range(n_attr):
            dtree = DecisionTree(x_train, y_train, max_depth=depth)
            dtree.fit()
            # train_acc[d, depth] = dtree.accuracy(x_train, y_train)
            val_acc[d, depth] = dtree.accuracy(x_val, y_val)
    return val_acc
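
# Example use (a sketch; assumes x_train, y_train, and n_attr are defined as
# in the surrounding scripts): average the validation accuracy over the k
# folds and pick the depth that maximizes it.
val_acc = k_fold_cross_validation(x_train, y_train, k=5, shf=True)
mean_acc = val_acc.mean(axis=0)  # shape (n_attr,): one score per depth
best_depth = int(np.argmax(mean_acc))
print("best depth:", best_depth, "mean val accuracy:", mean_acc[best_depth])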
def learn_depths():
    # Train a decision tree at each depth and record train/test accuracy.
    # Note: max_depth runs 0..n_attr-1 here, while the table below labels
    # the depths 1..n_attr.
    train_acc = np.zeros(n_attr)
    test_acc = np.zeros(n_attr)
    for depth in range(n_attr):
        dtree = DecisionTree(x_train, y_train, max_depth=depth)
        dtree.fit()
        train_acc[depth] = dtree.accuracy(x_train, y_train)
        test_acc[depth] = dtree.accuracy(x_test, y_test)
    df = pd.DataFrame({
        'depth': range(1, n_attr + 1),
        'Train accuracy': train_acc,
        'Test accuracy': test_acc
    })
    # df.to_csv('res/acc.csv')
    return df
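
# Example use (a sketch; assumes matplotlib is installed): plot both curves
# to see where test accuracy flattens while train accuracy keeps climbing,
# i.e. where the tree starts to overfit.
import matplotlib.pyplot as plt

df = learn_depths()
df.plot(x='depth', y=['Train accuracy', 'Test accuracy'], marker='o')
plt.xlabel('max depth')
plt.ylabel('accuracy')
plt.show()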
def main():
    parser = argparse.ArgumentParser(description="csv data file path")
    parser.add_argument("--csv", type=str, help="The data file path")
    parser.add_argument(
        "--eval",
        type=str,
        default="gini",
        help="The evaluation function; can be gini or entropy. Defaults to gini."
    )
    cli_args = parser.parse_args()
    if cli_args.eval not in ['gini', 'entropy']:
        print('The evaluation function should be gini or entropy')
        exit(0)

    data = pd.read_csv(cli_args.csv)

    tree = DecisionTree()
    # An xgboost-style text dump of a fitted tree: each line is either an
    # internal node "id:[feature<threshold] yes=..,no=..,missing=.." or a
    # terminal node "id:leaf=value". The feature names are Chinese clinical
    # variables (CRP, white blood cell count, platelet count, birth weight,
    # number of abnormal clinical signs, PCT, neutrophil percentages) and
    # must match the CSV column names exactly.
    str_list = '''
0:[CRP(mg/L)<5.5] yes=1,no=2,missing=2
1:[白细胞总数(x10^9/L)<18.6850014] yes=3,no=4,missing=3
3:[血小板计数(x10^9/L)<171.5] yes=7,no=8,missing=7
7:[白细胞总数(x10^9/L)<11.8999996] yes=13,no=14,missing=13
13:[CRP(mg/L)<1.5] yes=19,no=20,missing=20
19:[中性粒细胞百分比(%)<52.5999985] yes=27,no=28,missing=28
27:leaf=-0.0121546965
28:leaf=0.0117647061
20:[出生时体重(g)<1840] yes=29,no=30,missing=29
29:leaf=0.0510822535
30:leaf=0.00118343194
14:[白细胞总数(x10^9/L)<14.71] yes=21,no=22,missing=21
21:leaf=-0.0139534893
22:leaf=0.00118343194
8:[临床表现异常数<1.5] yes=15,no=16,missing=15
15:[PCT(ng/ML)<0.375] yes=23,no=24,missing=23
23:[中性杆状核粒细胞百分比(%)<5] yes=31,no=32,missing=31
31:leaf=-0.146943495
32:leaf=0.00930232555
24:[中性粒细胞百分比(%)<41.0500031] yes=33,no=34,missing=34
33:leaf=0.0122905029
34:leaf=-0.00952380989
16:[出生时体重(g)<1340] yes=25,no=26,missing=25
25:leaf=-0.00346820801
26:[出生时体重(g)<1670] yes=35,no=36,missing=35
35:leaf=0.0171428584
36:leaf=-0.00116959063
4:[PCT(ng/ML)<0.13499999] yes=9,no=10,missing=10
9:leaf=-0.00952380989
10:[出生时体重(g)<2270] yes=17,no=18,missing=17
17:leaf=0.084153004
18:leaf=0.00118343194
2:[CRP(mg/L)<6.5] yes=5,no=6,missing=6
5:[白细胞总数(x10^9/L)<12.04] yes=11,no=12,missing=11
11:leaf=0.0200000014
12:leaf=-0.00952380989
6:leaf=0.117241383
'''
    tree.buildFromString(str_list.split('\n'), data, {0: 1, 1: 2.5})
    print(cal_metric(tree.classify(data), data.values[:, -1]))
    tree.index_to_class = {0: '无感染', 1: '感染'}  # 0 = "no infection", 1 = "infection"
    tree.savePDF('parse_output.pdf')
    tree.savePNG('parse_output.png')
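
# A minimal sketch of how one line of the dump format above could be parsed.
# The real parsing lives in DecisionTree.buildFromString; parse_node and its
# return format are hypothetical, shown only to illustrate the format.
import re

def parse_node(line):
    m = re.match(r'(\d+):\[(.+?)<(.+?)\] yes=(\d+),no=(\d+),missing=(\d+)',
                 line.strip())
    if m:
        nid, feat, thr, yes, no, missing = m.groups()
        return {'id': int(nid), 'feature': feat, 'threshold': float(thr),
                'yes': int(yes), 'no': int(no), 'missing': int(missing)}
    m = re.match(r'(\d+):leaf=(.+)', line.strip())
    return {'id': int(m.group(1)), 'leaf': float(m.group(2))}

print(parse_node('0:[CRP(mg/L)<5.5] yes=1,no=2,missing=2'))
print(parse_node('27:leaf=-0.0121546965'))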
if len(args) == 1:
    print("usage: main <train_path> <test_path> <test_out_path>")
    exit(0)
elif len(args) != 4:
    print("incorrect number of arguments, 3 args are needed, you have {:d}"
          .format(len(args) - 1))
    print("usage: main <train_path> <test_path> <test_out_path>")
    exit(0)

train_path = args[1]
test_path = args[2]
test_out_path = args[3]

train_data = utils.load_csv(train_path)
test_data = utils.load_csv(test_path)

tree = DecisionTree(method="cart")
enable_prune = [False]  # boxed in a list so the closures below can rebind it

def configure(config):
    for k, v in config.items():
        if k in tree.config:
            tree.config[k] = v
    enable_prune[0] = config["prune"]

def train(x, y):
    tree.train(x, y)
    if enable_prune[0]:
        # Prune against the training set: columns 0-5 are features, column 6 the label.
        tree.prune(train_data[:, 0:6], train_data[:, 6], min_gain=0.001)

trainer = LabelTrainer(data=train_data[:, 0:6],
def test_data():
    df = pd.read_csv('test.csv')
    df = clean.titanicClean(df)
    X = df.values
    y = pd.read_csv('gender_submission.csv')
    y = y.set_index('PassengerId')
    y = y.values
    y = y.flatten()
    return X, y

clfs = [SVC(C=1.0, kernel='rbf'),
        LinearSVC(penalty='l2', C=1.0, max_iter=1000),
        LogisticRegression(),
        DecisionTreeClassifier(max_depth=4, splitter='best'),
        RandomForestClassifier(n_estimators=3, max_depth=4),
        DecisionTree(max_depth=4),
        DecisionForest(n_estimators=5, bootstrap=True),
        DecisionForest(n_estimators=100, russells_method=True)]

X_train, y_train = train_data()
X_test, y_test = test_data()

# Train each classifier and report [train accuracy / test accuracy].
for clf in clfs:
    clf.fit(X_train, y_train)
    print("%s: \t[%f/%f]" % (clf.__class__.__name__,
                             clf.score(X_train, y_train),
                             clf.score(X_test, y_test)))
    if isinstance(clf, DecisionTreeClassifier):
        sklearn.tree.export_graphviz(clf, out_file='tree.dot')
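
# To render the tree exported above (assumes Graphviz is installed):
#   dot -Tpng tree.dot -o tree.png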
def main():
    parser = argparse.ArgumentParser(description="csv data file path")
    parser.add_argument("--csv", type=str, help="The data file path")
    parser.add_argument(
        "--eval",
        type=str,
        default="gini",
        help="The evaluation function; can be gini or entropy. Defaults to gini."
    )
    cli_args = parser.parse_args()
    if cli_args.eval not in ['gini', 'entropy']:
        print('The evaluation function should be gini or entropy')
        exit(0)

    data = pd.read_csv(cli_args.csv)
    # 75/25 train/test split; the test set is the complement of the sample,
    # recovered by dropping the rows that appear in both frames.
    train = data.sample(frac=0.75, random_state=0)
    test = pd.concat([train, data]).drop_duplicates(keep=False)

    class_weights = {'setosa': 1, 'versicolor': 1, 'virginica': 1}
    tree = DecisionTree()
    # Note: the evaluation function is hardcoded to gini here, so --eval is
    # validated but not actually used.
    tree.fit(train, class_weights, gini)
    # print(tree._error_rate(tree.root))
    print(tree._count_leaf(tree.root))
    # tree.prune(test, 0.0)
    print(tree.treeToString())

    # Classify a single flower with a missing PetalLength value.
    data = pd.DataFrame(
        [[5.1, 3.5, np.nan, 1]],
        columns=['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth'])
    print(tree.classify(data))
    tree.savePDF('output.pdf')
    tree.savePNG('output.png')
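
# An equivalent, arguably clearer way to take the complement split used in
# main() above (a standalone sketch; relies on DataFrame.sample preserving
# the original index labels):
import pandas as pd

frame = pd.DataFrame({'a': range(8)})
sampled = frame.sample(frac=0.75, random_state=0)
rest = frame.drop(sampled.index)
print(len(sampled), len(rest))  # 6 2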
# %%
import numpy as np
from preprocess import get_train_data, get_test_data
from dtree import DecisionTree

x_train, y_train = get_train_data()
x_test, y_test = get_test_data()

decision_tree = DecisionTree(x_train, y_train, max_depth=1)
decision_tree.fit()
decision_tree.traverse()
y_hat = decision_tree.predict(x_test)
print("accuracy: ", decision_tree.accuracy(x_test, y_test))

# %%
def get_stats():
    # Confusion-matrix counts for the binary predictions.
    TP = np.sum(np.logical_and(y_test == 1, y_hat == 1))
    FP = np.sum(np.logical_and(y_test == 0, y_hat == 1))
    TN = np.sum(np.logical_and(y_test == 0, y_hat == 0))
    FN = np.sum(np.logical_and(y_test == 1, y_hat == 0))
    return TP, FP, TN, FN

def specificity():
    # True-negative rate: TN / (TN + FP).
    TP, FP, TN, FN = get_stats()
    return TN / (TN + FP)

def sensitivity():
    # True-positive rate (recall): TP / (TP + FN).
    TP, FP, TN, FN = get_stats()
    return TP / (TP + FN)
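
# The same counts also give precision and F1 (additions in the same style;
# not part of the original script):
def precision():
    # Fraction of positive predictions that are correct: TP / (TP + FP).
    TP, FP, TN, FN = get_stats()
    return TP / (TP + FP)

def f1_score():
    # Harmonic mean of precision and recall (sensitivity).
    p, r = precision(), sensitivity()
    return 2 * p * r / (p + r)

print("sensitivity:", sensitivity())
print("specificity:", specificity())
print("F1:", f1_score())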
if __name__ == '__main__':
    train_data = np.array(read_file('data/spect_train.txt', sep=','))
    train_labels = train_data[:, 0]
    train_data = train_data[:, 1:]
    test_data = np.array(read_file('data/spect_test.txt', sep=','))
    test_labels = test_data[:, 0]
    test_data = test_data[:, 1:]

    # Create a map of the attributes so we can retain the original column
    # numbers as the tree splits the data.
    attributes = list(range(len(train_data[0])))

    # Do an initial run with the full training dataset.
    correct = []
    p_max = 1.0
    level_max = 9
    tree = DecisionTree(train_data, train_labels, attributes,
                        p_threshold=p_max, max_level=level_max)
    y = tree.classify(test_data)
    print("correct = {}".format(
        sum(np.asarray(y == test_labels, dtype=int)) / len(y) * 100))

    # Do 10 runs of 25-round bootstrap training, varying the depth of the
    # trees from 1 to 10 levels.
    n = 25
    num_depths = 10
    bias = np.zeros(num_depths)
    variance = np.zeros(num_depths)
    accuracy = np.zeros(num_depths)
    depths = np.arange(1, num_depths + 1)
    for depth in depths:
        y = np.zeros((n, len(test_data)))
        # We are assuming that N(x) = 0, so there's no noise; hence y_star = y_t.
        y_star = t = test_labels
        for i in range(n):
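
# The body of the "for i in range(n):" loop above is truncated in the source.
# One plausible round (a sketch, not the original code; assumes the names
# defined above are in scope): train on a bootstrap resample of the training
# set and record that round's predictions.
idxs = np.random.choice(len(train_data), len(train_data), replace=True)
boot_tree = DecisionTree(train_data[idxs], train_labels[idxs], attributes,
                         p_threshold=p_max, max_level=depth)
y[i] = boot_tree.classify(test_data)

# After the n rounds, the arrays allocated above could be filled with a
# Domingos-style 0-1 loss decomposition (again an assumption, suggested by
# the bias/variance/accuracy names):
y_main = np.round(y.mean(axis=0))            # majority vote per test point
bias[depth - 1] = np.mean(y_main != y_star)  # main prediction differs from truth
variance[depth - 1] = np.mean(y != y_main)   # spread of the runs around the vote
accuracy[depth - 1] = np.mean(y == y_star)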
from dtree import DecisionTree
from id3 import information_gain

# The classic "play tennis" dataset: outlook, temperature, humidity, and
# wind predict the target (whether to play).
values = [
    "sunny hot high weak no",
    "sunny hot high strong no",
    "overcast hot high weak yes",
    "rain mild high weak yes",
    "rain cool normal weak yes",
    "rain cool normal strong no",
    "overcast cool normal strong yes",
    "sunny mild high weak no",
    "sunny cool normal weak yes",
    "rain mild normal weak yes",
    "sunny mild normal strong yes",
    "overcast mild high strong yes",
    "overcast hot normal weak yes",
    "rain mild high strong no"
]
keys = ["outlook", "temperature", "humidity", "wind", "target"]
data = [dict(zip(keys, x.split())) for x in values]

if __name__ == '__main__':
    tree = DecisionTree(data, keys[:-1], information_gain, target_name=keys[-1])
    from pprint import PrettyPrinter
    pp = PrettyPrinter()
    pp.pprint(tree.tree)
    print([(x[0]['target'], x[1]) for x in tree.classify(data)])
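
# A quick standalone check of the gain driving the first split (independent
# of the id3 module; entropy and gain computed directly from the data above):
from math import log2
from collections import Counter

def entropy(labels):
    counts = Counter(labels)
    total = len(labels)
    return -sum(c / total * log2(c / total) for c in counts.values())

def info_gain(attr):
    base = entropy([d['target'] for d in data])
    remainder = 0.0
    for v in {d[attr] for d in data}:
        subset = [d['target'] for d in data if d[attr] == v]
        remainder += len(subset) / len(data) * entropy(subset)
    return base - remainder

for attr in keys[:-1]:
    print(attr, round(info_gain(attr), 3))  # outlook has the largest gain (~0.247)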