def fit(self, X, y):
    """Train the ensemble on ``X`` / ``y``.

    ``self.trees`` is reset to an empty list. Then, ``self.n_trees`` times,
    a ``Decision_Tree`` is built from ``self.min_sample_split``,
    ``self.max_depth`` and ``self.n_feats``, fitted on the pair returned by
    ``bootstrap_sample(X, y)``, and appended to ``self.trees``.

    Args:
        X: Feature data, passed unchanged to ``bootstrap_sample``.
        y: Target data, passed unchanged to ``bootstrap_sample``.
    """
    self.trees = []
    for _round in range(self.n_trees):
        learner = Decision_Tree(
            min_samples_split=self.min_sample_split,
            max_depth=self.max_depth,
            n_feats=self.n_feats,
        )
        # Every tree gets its own freshly drawn sample of the training data.
        sampled_X, sampled_y = bootstrap_sample(X, y)
        learner.fit(sampled_X, sampled_y)
        self.trees.append(learner)
def decision_tree_test():
    """Fit a depth-5 ``Decision_Tree`` on six columns and predict 20 rows.

    The dataset comes from ``load_boston()``; only columns 0, 2, 4, 5, 7 and
    11 of its ``data`` are used. The rows predicted are the last 20 of the
    same matrix the tree was fitted on, so this is a smoke test rather than a
    held-out evaluation.

    Returns:
        Whatever ``Decision_Tree.predict`` returns for those 20 rows.
    """
    # NOTE(review): load_boston is presumably sklearn.datasets.load_boston,
    # which newer scikit-learn releases no longer ship - confirm.
    boston = load_boston()
    kept_columns = [0, 2, 4, 5, 7, 11]
    features = boston.data[:, kept_columns]
    targets = boston.target

    model = Decision_Tree(max_depth=5)
    model.fit(features, targets)
    return model.predict(features[-20:])
def fit(self, X, Y):
    """Train ``self.n_trees`` Gini classification trees.

    ``self.tree`` is reset to an empty list. Each round draws a sample with
    ``boost_trap(X, Y)``, fits a ``Decision_Tree`` (``classifier=True``,
    ``Loss="Gini"``, depth limited by ``self.max_depth``) on it, and appends
    the fitted tree to ``self.tree``.

    Args:
        X: Feature data, passed unchanged to ``boost_trap``.
        Y: Label data, passed unchanged to ``boost_trap``.
    """
    # Train every decision tree in turn.
    self.tree = []
    for _ in range(self.n_trees):
        sample_features, sample_labels = boost_trap(X, Y)
        member = Decision_Tree(max_depth=self.max_depth, classifier=True, Loss="Gini")
        member.fit(sample_features, sample_labels)
        self.tree.append(member)
def train(self, training_data):
    """Create the model named by ``self.classifier_type`` and train it.

    The model is stored on the instance before training:
    ``self.nn`` for 'neural_network', ``self.nb`` for 'naive_bayes' and
    ``self.dt`` for 'decision_tree'. Any other ``classifier_type`` leaves the
    instance untouched.

    Settings are read from ``self.params``:
      * neural network: 'num_input', 'num_output', and the flags 'one'
        (``alt_weight``) and 'two' (``momentum``); the hidden layer size is
        fixed at 1000.
      * decision tree: the flags 'one' (``pruning``) and 'two'
        (``info_gain_ratio``).
    A flag is on exactly when its value is the string '1'.

    Args:
        training_data: n x (m+1) numpy matrix, where n is the number of
            examples and m the number of features; the first element of
            each row is the label.
    """
    kind = self.classifier_type

    if kind == 'neural_network':
        settings = self.params
        # num_input / num_output have to match the shape of the data.
        network = Neural_Network(
            "neural_network",
            weights=[],
            num_input=settings['num_input'],
            num_hidden=1000,
            num_output=settings['num_output'],
            alt_weight=settings['one'] == '1',
            momentum=settings['two'] == '1',
        )
        self.nn = network
        self.nn.train(training_data)
        return

    if kind == 'naive_bayes':
        self.nb = Naive_Bayes("naive_bayes")
        self.nb.train(training_data)
        return

    if kind == 'decision_tree':
        settings = self.params
        self.dt = Decision_Tree(
            "decision_tree",
            pruning=settings['one'] == '1',
            info_gain_ratio=settings['two'] == '1',
        )
        self.dt.train(training_data)
def decision_tree(self, training_data, training_labels, testing_data):
    """Build a ``Decision_Tree``, print it, and predict every test row.

    Args:
        training_data: Training examples handed to ``Decision_Tree``.
        training_labels: Labels handed to ``Decision_Tree``.
        testing_data: Indexable collection of rows; row ``i`` is read as
            ``testing_data[i]`` for ``i`` in ``range(len(testing_data))``.

    Returns:
        A list with one ``tree.predict(...)`` result per test row, in order.
    """
    # Constructing the object is what creates and builds the tree.
    model = Decision_Tree(training_data, training_labels)
    model.print_tree_dfs()

    # Manual check for a category never seen in training, e.g.
    #   model.predict(["low", "high", "high", "high", "high", "high", "high", "potato"])

    # Rows are fetched by position, exactly as many as len() reports.
    return [model.predict(testing_data[row]) for row in range(len(testing_data))]
from decision_tree import Decision_Tree
from sklearn.datasets import load_iris
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn import tree

# Exercise the hand-written Decision_Tree on the Iris dataset.
dataset = load_iris()
X, y = dataset.data, dataset.target
clf_iris = Decision_Tree(max_depth=5)

# Replace the integer targets with strings: classes 1 and 2 become "one",
# everything else becomes "zero".
y = np.array(["one" if val in (1, 2) else "zero" for val in y])

# Non-integer targets have to be ordinally encoded back to integers.
if "int" not in str(y.dtype):
    encoder = OrdinalEncoder()
    # The encoder is given a column (n, 1); its output is cast to int and
    # flattened back to one dimension afterwards.
    y = encoder.fit_transform(y.reshape(-1, 1)).astype(int).ravel()

clf_iris.fit(X, y)

# Single-row sample inputs with four feature values each.
temp1 = np.array([[3, 2, 1, 0.5]])
temp2 = np.array([[4, 2.9, 1.3, 0.2]])
temp3 = np.array([[3.8, 3, 1.4, 0.4]])
temp4 = np.array([[7.7, 2.8, 6.7, 2]])
from read_file import Read_File
from data_processor import Data_Processor
from svm import SVM
from decision_tree import Decision_Tree

# Driver script: load the census-income sample, clean it, then run the SVM
# and decision-tree experiments on it.
fileName = 'data/census-income_10percentData.csv'

file_reader = Read_File(fileName)
file_reader.read()
features = file_reader.get_features()
labels = file_reader.get_labels()

# NOTE(review): presumably fills missing values in `features` in place, since
# the same `features` object is reused below - confirm against Data_Processor.
data_fill = Data_Processor(features)
data_fill.fill_empty_fields()

# SVM task.
my_svm = SVM(features, labels)
my_svm.calculate_info_gain()
my_svm.stratified_k_fold(10)
my_svm.svm()
my_svm.draw_svm()

# Decision-tree task: one run per setting 1..13, each under its own banner.
my_tree = Decision_Tree(features, labels)
my_tree.calculate_info_gain()
for i in range(1, 14):
    # Parenthesised single-argument print emits the same text on Python 2
    # and Python 3; the bare print statement is a SyntaxError on Python 3.
    print("============================{}============================".format(i))
    my_tree.decision(i)
def decisionTree_class():
    """Fit a Gini ``Decision_Tree`` classifier on generated blob data.

    The data is produced by ``make_blobs`` with 100 samples, 10 centers,
    10 features and ``random_state=5``. The tree (``max_depth=5``) is fitted
    on all of it and then asked to predict the same samples.

    Returns:
        A ``(predictions, labels)`` pair: the tree's predictions for the
        generated samples and the labels ``make_blobs`` returned.
    """
    samples, labels = make_blobs(n_samples=100, centers=10, n_features=10, random_state=5)
    classifier = Decision_Tree(max_depth=5, classifier=True, Loss="Gini")
    classifier.fit(samples, labels)
    predictions = classifier.predict(samples)
    return predictions, labels
class Classifier:
    """Front end that trains and evaluates one of several models.

    ``classifier_type`` selects the algorithm ('neural_network',
    'naive_bayes' or 'decision_tree'). The keyword arguments given at
    construction are kept in ``self.params`` for that algorithm to read.
    """

    def __init__(self, classifier_type, **kwargs):
        """Store the algorithm name and its keyword parameters.

        Args:
            classifier_type: String naming the algorithm this classifier
                uses.
            **kwargs: Parameters specific to that algorithm. For example, a
                neural net with 30 input nodes, a hidden layer of 10 units
                and 3 output nodes could be created as
                ``Classifier('neural_network', weights=[], num_input=30,
                num_hidden=10, num_output=3)``, with the weight matrices
                kept in the (initially empty) list ``weights``.
        """
        self.classifier_type = classifier_type
        # The keyword arguments arrive as a plain dictionary; keep it so the
        # other methods can read the settings.
        self.params = kwargs

    def train(self, training_data):
        """Create the model named by ``classifier_type`` and train it.

        The model is stored as ``self.nn`` ('neural_network'), ``self.nb``
        ('naive_bayes') or ``self.dt`` ('decision_tree'). Any other
        ``classifier_type`` leaves the instance unchanged.

        Settings read from ``self.params``:
          * neural network: 'num_input', 'num_output', 'one' (alt_weight)
            and 'two' (momentum); the hidden layer size is fixed at 1000.
          * decision tree: 'one' (pruning) and 'two' (info_gain_ratio).
        A flag is on exactly when its value is the string '1'.

        Args:
            training_data: n x (m+1) numpy matrix where n is the number of
                examples and m the number of features; the first element
                of each row is the label.

        Raises:
            KeyError: If a setting the selected algorithm needs is missing
                from ``self.params``.
        """
        if self.classifier_type == 'neural_network':
            # num_input / num_output have to match the shape of the data.
            self.nn = Neural_Network(
                "neural_network",
                weights=[],
                num_input=self.params['num_input'],
                num_hidden=1000,
                num_output=self.params['num_output'],
                alt_weight=self.params['one'] == '1',
                momentum=self.params['two'] == '1',
            )
            self.nn.train(training_data)
        elif self.classifier_type == 'naive_bayes':
            self.nb = Naive_Bayes("naive_bayes")
            self.nb.train(training_data)
        elif self.classifier_type == 'decision_tree':
            self.dt = Decision_Tree(
                "decision_tree",
                pruning=self.params['one'] == '1',
                info_gain_ratio=self.params['two'] == '1',
            )
            self.dt.train(training_data)

    def _trained_model(self):
        """Return the model stored by ``train``, or None for an unknown type."""
        if self.classifier_type == 'neural_network':
            return self.nn
        if self.classifier_type == 'naive_bayes':
            return self.nb
        if self.classifier_type == 'decision_tree':
            return self.dt
        return None

    @staticmethod
    def _ratio(numerator, denominator):
        """Return numerator / denominator as a float, or 0.0 if undefined."""
        if not denominator:
            return 0.0
        return numerator / float(denominator)

    def predict(self, data):
        """Predict the class of a single data vector.

        Args:
            data: 1 x (m+1) numpy matrix where m is the number of features;
                the first element is the label. It is handed unchanged to
                the underlying model's ``predict``.

        Returns:
            The label predicted by the underlying model, or None when
            ``classifier_type`` is not one of the known algorithms.

        Raises:
            AttributeError: If the selected model has not been trained yet.
        """
        model = self._trained_model()
        if model is None:
            return None
        return model.predict(data)

    def test(self, test_data):
        """Print accuracy, precision and recall on ``test_data``.

        Label 0 is treated as the class of interest: recall is the share of
        rows labelled 0 that were predicted 0, precision the share of rows
        predicted 0 that are labelled 0. A metric whose denominator is zero
        (empty data, no row labelled 0, or no row predicted 0) is reported
        as 0.0 instead of raising ZeroDivisionError.

        Args:
            test_data: n x (m+1) numpy matrix where n is the number of
                examples and m the number of features; the first element
                of each row is the label.
        """
        model = self._trained_model()
        relevant_and_retrieved, relevant, retrieved, total, hit = 0, 0, 0, 0, 0

        for person in test_data:
            # An unknown classifier_type predicts 0 for every row.
            predicted = model.predict(person) if model is not None else 0
            actual = person[0]
            if predicted == actual:
                if predicted == 0:
                    relevant_and_retrieved += 1
                hit += 1
            if actual == 0:
                relevant += 1
            if predicted == 0:
                retrieved += 1
            total += 1

        accuracy = self._ratio(hit, total)
        recall = self._ratio(relevant_and_retrieved, relevant)
        precision = self._ratio(relevant_and_retrieved, retrieved)

        # One pre-formatted string per line prints the same text on Python 2
        # and Python 3 (label, separating space, value).
        print("Accuracy:  {}".format(accuracy))
        print("Precision  {}".format(precision))
        print("Recall:  {}".format(recall))
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from decision_tree import Decision_Tree


def accuracy(y_true, y_pred):
    """Return the fraction of positions where ``y_true == y_pred``."""
    matches = np.sum(y_true == y_pred)
    return matches / len(y_true)


# Breast-cancer dataset, split 80/20 with a fixed seed.
data = datasets.load_breast_cancer()
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1234
)

# Fit a depth-10 tree on the training part and score it on the held-out part.
clf = Decision_Tree(max_depth=10)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

acc = accuracy(y_test, y_pred)
print("Accuracy:", acc)
{ # Choosing month of date "date": row[0].split("/")[0], # log10 of confirmed "confirmed": int(math.log10(int(row[1]))), # log10 of recovered "recovered": int(math.log10(int(row[2]))), # log10 of deaths "deaths": int(math.log10(int(row[3]))), }, discretise_target(row[4])) examples.append(data) # Shuffling for randomness random.shuffle(examples) tre = list() tee = list() # Split the data 80/20 split_index = (int)((80 / 100) * len(examples)) tre = examples[:split_index] tee = examples[split_index:] decision_tree = Decision_Tree(tre, depth, pruning) print("DECISION TREE") print(decision_tree) print("MAXIMUM DEPTH REACHED") print(decision_tree.depth_reached) print("ACCURACY OVER TESTING") print(decision_tree.test_accuracy(tee) * 100, "%")