def train(self, data, labels):
    """Fit the ensemble, one DecisionTree per bootstrap resample of the rows.

    ``data`` and ``labels`` are stored on the instance. Then, for each of
    ``self.num_trees`` rounds, ``self.num_sample`` row indices are drawn
    with replacement, a ``DecisionTree(self.max_depth, self.num_feature)``
    is trained on the selected rows, and the tree is appended to
    ``self.trees``.

    Args:
        data: Array indexed as ``data[rows, :]``; its first axis is sampled.
        labels: Array indexed with the same row indices as ``data``.
    """
    self.data, self.labels = data, labels

    for _ in range(self.num_trees):
        # Draw the resample right before fitting each tree so the global
        # NumPy RNG is consumed in the same order on every round.
        picked_rows = np.random.choice(
            self.data.shape[0], self.num_sample, replace=True
        )
        bootstrap_data = self.data[picked_rows, :]
        bootstrap_labels = self.labels[picked_rows]

        learner = DecisionTree(self.max_depth, self.num_feature)
        learner.train(bootstrap_data, bootstrap_labels)
        self.trees.append(learner)
def test_all_file(self):
    """Train on benchmark.csv and check each row is predicted as labelled.

    Every row of the training frame is fed back to the trained model with
    its label removed; the prediction must equal the row's own label.
    """
    label_column = "Joga"
    options = {
        'df': pd.read_csv("benchmark.csv", sep=';'),
        'label_column': label_column,
    }

    classifier = DecisionTree()
    model = classifier.train(options)

    # The frame is looked up in ``options`` after training, not captured
    # beforehand, so whatever ``train`` left under 'df' is what is iterated.
    for _, sample in options['df'].iterrows():
        expected = sample[label_column]
        prediction = model.predict(sample.drop(label_column))
        self.assertEqual(expected, prediction)
def test_benchmark(self):
    """Train on benchmark.csv and check one hand-written sample predicts 'Sim'."""
    options = {
        'df': pd.read_csv("benchmark.csv", sep=';'),
        'label_column': "Joga",
    }

    classifier = DecisionTree()
    model = classifier.train(options)

    # A single unlabelled observation, keyed by the feature column names.
    feature_names = ["Tempo", "Temperatura", "Umidade", "Ventoso"]
    feature_values = ["Ensolarado", "Quente", "Normal", "Verdadeiro"]
    inf_data = pd.Series(
        feature_values,
        index=feature_names,
        name="InferenceData",
    )

    prediction = model.predict(inf_data)
    self.assertEqual(prediction, 'Sim')
def train(self, options):
    """Build a random forest by fitting decision trees on bootstrap samples.

    Keys read from ``options``:
        'n_trees': number of trees to fit.
        'df': pandas dataframe with the training data.
        'bootstrap_size': size passed to ``get_bootstrap`` for each tree;
            entries not used in a bootstrap are ignored for that tree.
        'label_column': name of the column to be predicted.

    Returns:
        ``self``. The result of each tree's ``train`` call is appended to
        ``self.ensemble``.
    """
    tree_count = options['n_trees']
    frame = options['df']
    sample_size = options['bootstrap_size']

    # A single options dict is shared by all trees; only its 'df' entry is
    # re-pointed at a fresh bootstrap sample on each round.
    per_tree_options = {'label_column': options['label_column']}

    for _ in range(tree_count):
        per_tree_options['df'] = get_bootstrap(frame, sample_size)
        learner = DecisionTree()
        self.ensemble.append(learner.train(per_tree_options))

    return self
cat_data_test = np.array(cat_data_test, dtype='float') # zip categorical and non-categorical data together train_data = np.concatenate((cat_data, non_cat_data), axis=1) train_label = data[:, -1].astype(int) validation_data = train_data[:200, :] validation_label = train_label[:200] train_data = train_data[:, :] train_label = train_label[:] test_data = np.concatenate((cat_data_test, non_cat_data_test), axis=1) # decision tree tree = DecisionTree(5, train_data.shape[0]) tree.train(train_data, train_label) res = tree.predict(validation_data) score = 0 for i in range(len(res)): if res[i] == validation_label[i]: score += 1 score /= len(res) print(score) # random forest forest = RandomForest(100,5,train_data.shape[0],6) forest.train(train_data, train_label) res = forest.predict(validation_data)