def train(self, X, y):
    """Grow ``self.n_trees`` classification trees, each on a bootstrap sample.

    For every tree, ``int(len(X) * self.fraction)`` row indices are drawn with
    ``np.random.choice`` (sampling with replacement) and a new
    ``ClassificationTree`` built from ``self.depth_limit`` and
    ``self.impurity`` is trained on those rows. Fitted trees are stored in
    ``self.trees[t]`` and ``self.trained`` is set to True. If the object was
    already trained, ``self.__untrain()`` is called first.

    Args:
        X: samples, indexed as ``X[rows, :]``; anything that is not already
            an ndarray is passed through ``self.__numpify``.
        y: labels, one per row of X; same conversion rule as X.

    Raises:
        IndexError: if ``len(X) != len(y)``.
        TypeError: if X or y is still not an ndarray after conversion.
    """
    # check dimensions
    if len(X) != len(y):
        raise IndexError("The number of samples in X and y do not match")

    # check if X and y are numpy arrays
    if type(X) is not np.ndarray:
        X = self.__numpify(X)
        # `not X` raises ValueError on any array with more than one element,
        # so test the type instead of the truth value.
        # NOTE(review): assumes __numpify returns a non-array value on
        # failure - confirm against its definition.
        if not isinstance(X, np.ndarray):
            raise TypeError("input dataset X is not a valid numeric array")
    if type(y) is not np.ndarray:
        y = self.__numpify(y)
        if not isinstance(y, np.ndarray):
            raise TypeError("input label vector y is not a valid numeric array")

    # check if trained
    if self.trained:
        self.__untrain()

    indices = np.arange(len(X))
    # determine the size of the bootstrap sample
    # (np.int was removed from NumPy; builtin int truncates the same way)
    strapsize = int(len(X) * self.fraction)

    for t in range(self.n_trees):
        # create a new classification tree
        tree = ClassificationTree(depth_limit=self.depth_limit,
                                  impurity=self.impurity)
        # bootstrap a sample
        bootstrap = np.random.choice(indices, strapsize)
        Xstrap = X[bootstrap, :]
        ystrap = y[bootstrap]
        # train the t-th tree with the strapped sample
        tree.train(Xstrap, ystrap)
        # NOTE(review): relies on self.trees already having a slot for index t
        self.trees[t] = tree

    self.trained = True
    print("%d trees grown" % self.n_trees)
def train(self, X, y):
    """Fit a bag of ``self.n_trees`` classification trees on bootstrap samples.

    Each iteration draws ``int(len(X) * self.fraction)`` row indices with
    ``np.random.choice`` (sampling with replacement), builds a
    ``ClassificationTree`` from ``self.depth_limit`` and ``self.impurity``,
    trains it on the drawn rows and stores it in ``self.trees[t]``.
    ``self.trained`` is set to True at the end; a previously trained object
    is reset through ``self.__untrain()`` before any tree is grown.

    Args:
        X: samples, indexed as ``X[rows, :]``; non-ndarray input is passed
            through ``self.__numpify``.
        y: labels, one per row of X; same conversion rule as X.

    Raises:
        IndexError: if ``len(X) != len(y)``.
        TypeError: if X or y is still not an ndarray after conversion.
    """
    # check dimensions
    if len(X) != len(y):
        raise IndexError("The number of samples in X and y do not match")

    # check if X and y are numpy arrays
    if type(X) is not np.ndarray:
        X = self.__numpify(X)
        # Truth-testing an array (`not X`) raises ValueError when it holds
        # more than one element, so check the resulting type instead.
        # NOTE(review): assumes __numpify returns a non-array value on
        # failure - confirm against its definition.
        if not isinstance(X, np.ndarray):
            raise TypeError("input dataset X is not a valid numeric array")
    if type(y) is not np.ndarray:
        y = self.__numpify(y)
        if not isinstance(y, np.ndarray):
            raise TypeError(
                "input label vector y is not a valid numeric array")

    # check if trained
    if self.trained:
        self.__untrain()

    indices = np.arange(len(X))
    # determine the size of the bootstrap sample
    # (builtin int replaces the removed np.int alias)
    strapsize = int(len(X) * self.fraction)

    for t in range(self.n_trees):
        # create a new classification tree
        tree = ClassificationTree(depth_limit=self.depth_limit,
                                  impurity=self.impurity)
        # bootstrap a sample
        bootstrap = np.random.choice(indices, strapsize)
        Xstrap = X[bootstrap, :]
        ystrap = y[bootstrap]
        # train the t-th tree with the strapped sample
        tree.train(Xstrap, ystrap)
        # NOTE(review): relies on self.trees already having a slot for index t
        self.trees[t] = tree

    self.trained = True
    print("%d trees grown" % self.n_trees)
def train(self, X, y):
    """Grow ``self.n_trees`` trees on bootstrapped rows and sampled columns.

    For every tree:
      * ``int(len(X) * self.fraction)`` row indices are drawn with
        ``np.random.choice`` (with replacement);
      * a subset of column indices is drawn without replacement, of size
        ``ceil(sqrt(n_features))`` when X has at least 9 columns and all
        columns otherwise;
      * a new ``ClassificationTree`` (built from ``self.depth_limit`` and
        ``self.impurity``) is trained on that row/column selection.

    Each entry ``self.trees[t]`` is the pair ``[tree, subfeature]`` so the
    columns a tree was trained on are kept next to it. ``self.trained`` is
    set to True at the end; a previously trained object is reset through
    ``self.__untrain()`` first.

    Args:
        X: 2-D samples (``X.shape[1]`` is read as the feature count);
            non-ndarray input is passed through ``self.__numpify``.
        y: labels, one per row of X; same conversion rule as X.

    Raises:
        IndexError: if ``len(X) != len(y)``.
        TypeError: if X or y is still not an ndarray after conversion.
    """
    # check dimensions
    if len(X) != len(y):
        raise IndexError("The number of samples in X and y do not match")

    # check if X and y are numpy arrays
    if type(X) is not np.ndarray:
        X = self.__numpify(X)
        # `not X` raises ValueError on any array with more than one element,
        # so test the type instead of the truth value.
        # NOTE(review): assumes __numpify returns a non-array value on
        # failure - confirm against its definition.
        if not isinstance(X, np.ndarray):
            raise TypeError("input dataset X is not a valid numeric array")
    if type(y) is not np.ndarray:
        y = self.__numpify(y)
        if not isinstance(y, np.ndarray):
            raise TypeError(
                "input label vector y is not a valid numeric array")

    # check if trained
    if self.trained:
        self.__untrain()

    indices = np.arange(len(X))
    # determine the size of the bootstrap sample
    # (np.int was removed from NumPy; builtin int truncates the same way)
    strapsize = int(len(X) * self.fraction)

    n_features = X.shape[1]
    features = np.arange(n_features)
    # determine the number of features to subsample each iteration:
    # the sqrt(n) rule of thumb once there are at least 9 features,
    # otherwise every feature is used
    if n_features >= 9:
        subsize = int(np.ceil(np.sqrt(n_features)))
    else:
        subsize = n_features

    # start growing the trees
    for t in range(self.n_trees):
        # create a new classification tree
        tree = ClassificationTree(depth_limit=self.depth_limit,
                                  impurity=self.impurity)
        # bootstrap a sample
        bootstrap = np.random.choice(indices, strapsize)
        # features are not sampled with replacement
        subfeature = np.random.choice(features, subsize, replace=False)
        Xstrap = X[bootstrap, :][:, subfeature]
        ystrap = y[bootstrap]
        # train the t-th tree with the strapped sample
        tree.train(Xstrap, ystrap)
        # for each tree, need to save which features to use
        # NOTE(review): relies on self.trees already having a slot for index t
        self.trees[t] = [tree, subfeature]

    self.trained = True
    print("%d trees grown" % self.n_trees)
def train(self, X, y):
    """Fit ``self.n_trees`` trees, each on sampled rows and sampled columns.

    Per tree, ``int(len(X) * self.fraction)`` row indices are drawn with
    replacement via ``np.random.choice`` and a column subset is drawn
    without replacement. The subset holds ``ceil(sqrt(n_features))`` columns
    when X has at least 9 columns, and every column otherwise. A new
    ``ClassificationTree`` (built from ``self.depth_limit`` and
    ``self.impurity``) is trained on that selection, and
    ``self.trees[t]`` is set to ``[tree, subfeature]``. ``self.trained``
    becomes True at the end; an already trained object is reset through
    ``self.__untrain()`` before growing.

    Args:
        X: 2-D samples (``X.shape[1]`` is read as the feature count);
            non-ndarray input is passed through ``self.__numpify``.
        y: labels, one per row of X; same conversion rule as X.

    Raises:
        IndexError: if ``len(X) != len(y)``.
        TypeError: if X or y is still not an ndarray after conversion.
    """
    # check dimensions
    if len(X) != len(y):
        raise IndexError("The number of samples in X and y do not match")

    # check if X and y are numpy arrays
    if type(X) is not np.ndarray:
        X = self.__numpify(X)
        # Truth-testing an array (`not X`) raises ValueError when it holds
        # more than one element, so check the resulting type instead.
        # NOTE(review): assumes __numpify returns a non-array value on
        # failure - confirm against its definition.
        if not isinstance(X, np.ndarray):
            raise TypeError("input dataset X is not a valid numeric array")
    if type(y) is not np.ndarray:
        y = self.__numpify(y)
        if not isinstance(y, np.ndarray):
            raise TypeError("input label vector y is not a valid numeric array")

    # check if trained
    if self.trained:
        self.__untrain()

    indices = np.arange(len(X))
    # determine the size of the bootstrap sample
    # (builtin int replaces the removed np.int alias)
    strapsize = int(len(X) * self.fraction)

    n_features = X.shape[1]
    features = np.arange(n_features)
    # determine the number of features to subsample each iteration:
    # the sqrt(n) rule of thumb once there are at least 9 features,
    # otherwise every feature is used
    if n_features >= 9:
        subsize = int(np.ceil(np.sqrt(n_features)))
    else:
        subsize = n_features

    # start growing the trees (range works on both Python 2 and 3)
    for t in range(self.n_trees):
        # create a new classification tree
        tree = ClassificationTree(depth_limit=self.depth_limit,
                                  impurity=self.impurity)
        # bootstrap a sample
        bootstrap = np.random.choice(indices, strapsize)
        # features are not sampled with replacement
        subfeature = np.random.choice(features, subsize, replace=False)
        Xstrap = X[bootstrap, :][:, subfeature]
        ystrap = y[bootstrap]
        # train the t-th tree with the strapped sample
        tree.train(Xstrap, ystrap)
        # for each tree, need to save which features to use
        # NOTE(review): relies on self.trees already having a slot for index t
        self.trees[t] = [tree, subfeature]

    self.trained = True
    print("%d trees grown" % self.n_trees)
__author__ = "metjush"

# Example script for the decision_tree repository.
# It uses a scikit-learn dataset to demonstrate classification with a single
# tree, a bagged forest and a random forest.
# Run this file if you just want to see whether the package works.

# Requirements
import numpy as np
from ClassTree import ClassificationTree
from ClassTreeBagging import TreeBagger
from ClassForest import RandomForest
from sklearn.datasets import load_iris  # iris classification

# Load the Iris dataset and split it into a feature matrix and label vector
iris = load_iris()
X_iris, y_iris = iris.data, iris.target

# Classifier objects: one plain tree and two 50-tree ensembles
tree = ClassificationTree()
bag = TreeBagger(n_trees=50)
forest = RandomForest(n_trees=50)

# Train classifiers with Iris data
# Simple tree training
tree.train(X_iris, y_iris)
# Save the uploaded CSV under a hash-derived name, train a classification
# tree on it and export the tree as JSON.
if not fileitem.filename:
    raise IOError("Upload of file failed")

# strip leading path
fn = os.path.basename(fileitem.filename)
# NOTE(review): if filehash is a hashlib object, Python 3 requires bytes
# here (fn.encode(...)) - confirm where filehash is created.
filehash.update(fn)
name = filehash.hexdigest()
csv_path = UPLOAD_DIR + name + ".csv"
# context managers close the handles; the original left both files open
with open(csv_path, 'wb') as upload:
    upload.write(fileitem.file.read())

# we will be returning a json file, so set header
message_header = "header('Content-type: application/json');"

# Import ClassificationTree class
from ClassTree import ClassificationTree
import numpy as np

# read the saved file as a numpy array; ndmin=2 keeps a one-row file 2-D so
# the column indexing below still works
with open(csv_path, 'r') as savedfile:
    data = np.loadtxt(savedfile, delimiter=",", ndmin=2)

# Features are every column except the label column. The original
# concatenated the two column slices along axis 0 (rows), which either
# fails on mismatched widths or stacks unrelated columns.
X = np.delete(data, label_column, axis=1)
y = data[:, label_column]

train_tree = ClassificationTree(depth_limit=depth)
# train() needs the data; the original call passed no arguments
train_tree.train(X, y)
train_json = train_tree.to_json(JSON_DIR + name + ".json")
# Read the training options from the submitted form, save the uploaded CSV
# under a hash-derived name, train a classification tree on it and export
# the tree as JSON.
depth_field = form['depth']
label_field = form['label']
# Both values are used numerically below (label_column is a column index),
# so convert them to int up front.
# NOTE(review): getattr(..., 'value', ...) covers both a cgi-style field
# object exposing `.value` and a plain string - confirm what `form` is.
depth = int(getattr(depth_field, 'value', depth_field))
label_column = int(getattr(label_field, 'value', label_field))

# checking
if not fileitem.filename:
    raise IOError("Upload of file failed")

# strip leading path
fn = os.path.basename(fileitem.filename)
# NOTE(review): if filehash is a hashlib object, Python 3 requires bytes
# here (fn.encode(...)) - confirm where filehash is created.
filehash.update(fn)
name = filehash.hexdigest()
csv_path = UPLOAD_DIR + name + ".csv"
# context managers close the handles; the original left both files open
with open(csv_path, 'wb') as upload:
    upload.write(fileitem.file.read())

# we will be returning a json file, so set header
message_header = "header('Content-type: application/json');"

# Import ClassificationTree class
from ClassTree import ClassificationTree
import numpy as np

# read the saved file as a numpy array; ndmin=2 keeps a one-row file 2-D so
# the column indexing below still works
with open(csv_path, 'r') as savedfile:
    data = np.loadtxt(savedfile, delimiter=",", ndmin=2)

# Features are every column except the label column. The original
# concatenated the two column slices along axis 0 (rows), which either
# fails on mismatched widths or stacks unrelated columns.
X = np.delete(data, label_column, axis=1)
y = data[:, label_column]

train_tree = ClassificationTree(depth_limit=depth)
# train() needs the data; the original call passed no arguments
train_tree.train(X, y)
train_json = train_tree.to_json(JSON_DIR + name + ".json")
import numpy as np
from ClassTree import ClassificationTree
from ClassTreeBagging import TreeBagger
from ClassForest import RandomForest
import sys
import string
import copy
from collections import Counter
from operator import itemgetter

# Classifier objects: one plain tree and two 50-tree ensembles
tree = ClassificationTree()
bag = TreeBagger(n_trees=50)
forest = RandomForest(n_trees=50)

# Get datasets from scikit-learn
from sklearn.datasets import load_iris  # iris classification


def process_str(s):
    """Return the lowercase whitespace-separated words of *s*, punctuation removed."""
    without_punctuation = s.translate(
        str.maketrans('', '', string.punctuation))
    lowered = without_punctuation.lower()
    return lowered.split()


def read_dataset(file_name):
    """Parse a tab-separated file into ``(int(class_label), words)`` tuples.

    Each line must split into exactly three tab-separated fields
    (index, class label, text); the text is tokenised with ``process_str``.
    """
    # NOTE(review): no return statement is visible for this function; the
    # collected list is left as-is rather than guessing at missing code.
    dataset = []
    with open(file_name) as handle:
        for raw_line in handle:
            fields = raw_line.strip().split('\t')
            index, class_label, text = fields
            label = int(class_label)
            dataset.append((label, process_str(text)))