def train(self, X, y):
    """Grow ``self.n_trees`` classification trees, each on a bootstrap sample.

    If ``self.trained`` is already set, ``self.__untrain()`` is called first.
    Every tree is a ``ClassificationTree`` built with ``self.depth_limit`` and
    ``self.impurity``, trained on ``int(len(X) * self.fraction)`` rows drawn
    with replacement, and stored in ``self.trees[t]``. On completion
    ``self.trained`` is set to True and a summary line is printed.

    Args:
        X: Sample matrix, indexed as ``X[rows, :]``. Anything that is not a
            numpy array is converted with ``self.__numpify``.
        y: Label vector with one entry per row of ``X``; converted the same
            way when it is not a numpy array.

    Raises:
        IndexError: ``X`` and ``y`` have different lengths.
        TypeError: conversion of ``X`` or ``y`` did not produce a numpy array.
    """
    # check dimensions
    if len(X) != len(y):
        raise IndexError("The number of samples in X and y do not match")

    # check if X and y are numpy arrays
    if not isinstance(X, np.ndarray):
        X = self.__numpify(X)
        # Test the type, not the truthiness: `not X` raises ValueError for an
        # array with more than one element and is True for a valid `[0]`.
        # NOTE(review): assumes __numpify returns a non-array sentinel
        # (False/None) on failure -- confirm against its definition.
        if not isinstance(X, np.ndarray):
            raise TypeError("input dataset X is not a valid numeric array")
    if not isinstance(y, np.ndarray):
        y = self.__numpify(y)
        if not isinstance(y, np.ndarray):
            raise TypeError("input label vector y is not a valid numeric array")

    # check if trained
    if self.trained:
        self.__untrain()

    indices = np.arange(len(X))
    # determine the size of the bootstrap sample; the builtin int replaces
    # np.int, which was only an alias and no longer exists in current NumPy
    strapsize = int(len(X) * self.fraction)
    for t in range(self.n_trees):
        # create a new classification tree
        tree = ClassificationTree(depth_limit=self.depth_limit, impurity=self.impurity)
        # bootstrap a sample (np.random.choice samples with replacement by default)
        bootstrap = np.random.choice(indices, strapsize)
        Xstrap = X[bootstrap, :]
        ystrap = y[bootstrap]
        # train the t-th tree with the strapped sample
        tree.train(Xstrap, ystrap)
        # NOTE(review): self.trees must already accept index t (pre-sized
        # list or dict); its construction is not visible here.
        self.trees[t] = tree
    self.trained = True
    print("%d trees grown" % self.n_trees)
Beispiel #2
0
    def train(self, X, y):
        """Fit the bagged ensemble on ``X`` and ``y``.

        A previous fit is discarded through ``self.__untrain()`` when
        ``self.trained`` is set. Then ``self.n_trees`` ``ClassificationTree``
        objects (configured with ``self.depth_limit`` and ``self.impurity``)
        are each trained on ``int(len(X) * self.fraction)`` rows sampled with
        replacement and written to ``self.trees[t]``. Finally ``self.trained``
        becomes True and the number of trees is printed.

        Args:
            X: Sample matrix, indexed as ``X[rows, :]``; passed through
                ``self.__numpify`` when it is not a numpy array.
            y: Label vector of the same length as ``X``; passed through
                ``self.__numpify`` when it is not a numpy array.

        Raises:
            IndexError: ``len(X)`` differs from ``len(y)``.
            TypeError: ``X`` or ``y`` is still not a numpy array after
                conversion.
        """
        #check dimensions
        if len(X) != len(y):
            raise IndexError("The number of samples in X and y do not match")

        #check if X and y are numpy arrays
        if not isinstance(X, np.ndarray):
            X = self.__numpify(X)
            # An ndarray has no unambiguous truth value (`not X` raises
            # ValueError for more than one element), so check the type.
            # NOTE(review): assumes __numpify signals failure with a
            # non-array value such as False/None -- confirm.
            if not isinstance(X, np.ndarray):
                raise TypeError("input dataset X is not a valid numeric array")
        if not isinstance(y, np.ndarray):
            y = self.__numpify(y)
            if not isinstance(y, np.ndarray):
                raise TypeError(
                    "input label vector y is not a valid numeric array")

        #check if trained
        if self.trained:
            self.__untrain()

        indices = np.arange(len(X))
        #determine the size of the bootstrap sample (np.int is gone from
        #current NumPy; the builtin int is what it aliased)
        strapsize = int(len(X) * self.fraction)
        # range works on both Python 2 and 3; xrange is Python 2 only
        for t in range(self.n_trees):
            #create a new classification tree
            tree = ClassificationTree(depth_limit=self.depth_limit,
                                      impurity=self.impurity)
            #bootstrap a sample (drawn with replacement by default)
            bootstrap = np.random.choice(indices, strapsize)
            #train the t-th tree with the strapped sample
            tree.train(X[bootstrap, :], y[bootstrap])
            # NOTE(review): relies on self.trees already accepting index t.
            self.trees[t] = tree
        self.trained = True
        print("%d trees grown" % self.n_trees)
Beispiel #3
0
    def train(self, X, y):
        """Grow a random forest: bootstrap rows and subsample features per tree.

        A previous fit is cleared with ``self.__untrain()`` when
        ``self.trained`` is set. For each of ``self.n_trees`` iterations a
        ``ClassificationTree`` (``self.depth_limit``, ``self.impurity``) is
        trained on ``int(len(X) * self.fraction)`` rows drawn with replacement
        and on a feature subset drawn without replacement. The subset has
        ``ceil(sqrt(n_features))`` columns when ``X`` has at least 9 columns,
        otherwise all columns. ``self.trees[t]`` receives
        ``[tree, subfeature]`` so the chosen columns are kept with each tree.

        Args:
            X: 2-D sample matrix (``X.shape[1]`` is read as the feature
                count); converted with ``self.__numpify`` when it is not a
                numpy array.
            y: Label vector with one entry per row of ``X``; converted the
                same way when it is not a numpy array.

        Raises:
            IndexError: ``X`` and ``y`` have different lengths.
            TypeError: conversion of ``X`` or ``y`` did not yield a numpy
                array.
        """
        # check dimensions
        if len(X) != len(y):
            raise IndexError("The number of samples in X and y do not match")

        # check if X and y are numpy arrays
        if not isinstance(X, np.ndarray):
            X = self.__numpify(X)
            # `not X` on an ndarray raises ValueError for multi-element
            # arrays, so validate by type instead.
            # NOTE(review): assumes __numpify returns a non-array sentinel
            # (False/None) on failure -- confirm.
            if not isinstance(X, np.ndarray):
                raise TypeError("input dataset X is not a valid numeric array")
        if not isinstance(y, np.ndarray):
            y = self.__numpify(y)
            if not isinstance(y, np.ndarray):
                raise TypeError(
                    "input label vector y is not a valid numeric array")

        # check if trained
        if self.trained:
            self.__untrain()

        indices = np.arange(len(X))
        # determine the size of the bootstrap sample (builtin int: the
        # np.int alias no longer exists in current NumPy)
        strapsize = int(len(X) * self.fraction)

        n_features = X.shape[1]
        features = np.arange(n_features)
        # determine the number of features to subsample each iteration:
        # sqrt(n) rule of thumb once there are at least 9 features,
        # otherwise every feature is used
        if n_features >= 9:
            subsize = int(np.ceil(np.sqrt(n_features)))
        else:
            subsize = n_features

        # start growing the trees
        for t in range(self.n_trees):
            # create a new classification tree
            tree = ClassificationTree(depth_limit=self.depth_limit,
                                      impurity=self.impurity)
            # bootstrap a sample (rows are drawn with replacement)
            bootstrap = np.random.choice(indices, strapsize)
            # features are not sampled with replacement
            subfeature = np.random.choice(features, subsize, replace=False)
            Xstrap = X[bootstrap, :][:, subfeature]
            ystrap = y[bootstrap]
            # train the t-th tree with the strapped sample
            tree.train(Xstrap, ystrap)
            # for each tree, need to save which features to use
            # NOTE(review): relies on self.trees already accepting index t.
            self.trees[t] = [tree, subfeature]
        self.trained = True
        print("%d trees grown" % self.n_trees)
Beispiel #4
0
    def train(self, X, y):
        """Train the forest on ``X``/``y`` using row bootstraps and feature subsets.

        When ``self.trained`` is set, ``self.__untrain()`` runs first. Each of
        the ``self.n_trees`` trees is a ``ClassificationTree`` created with
        ``self.depth_limit`` and ``self.impurity``. It sees
        ``int(len(X) * self.fraction)`` rows sampled with replacement and a
        column subset sampled without replacement: ``ceil(sqrt(n_features))``
        columns when there are at least 9 features, all columns otherwise.
        The pair ``[tree, subfeature]`` is stored in ``self.trees[t]``.

        Args:
            X: 2-D sample matrix; non-ndarray input goes through
                ``self.__numpify``.
            y: Label vector aligned with the rows of ``X``; non-ndarray input
                goes through ``self.__numpify``.

        Raises:
            IndexError: ``len(X)`` differs from ``len(y)``.
            TypeError: ``X`` or ``y`` is not a numpy array after conversion.
        """
        # check dimensions
        if len(X) != len(y):
            raise IndexError("The number of samples in X and y do not match")

        # check if X and y are numpy arrays
        if not isinstance(X, np.ndarray):
            X = self.__numpify(X)
            # Checking the type avoids `not X`, which raises ValueError on
            # any array with more than one element.
            # NOTE(review): assumes __numpify returns False/None (not an
            # array) when conversion fails -- confirm.
            if not isinstance(X, np.ndarray):
                raise TypeError("input dataset X is not a valid numeric array")
        if not isinstance(y, np.ndarray):
            y = self.__numpify(y)
            if not isinstance(y, np.ndarray):
                raise TypeError("input label vector y is not a valid numeric array")

        # check if trained
        if self.trained:
            self.__untrain()

        indices = np.arange(len(X))
        # determine the size of the bootstrap sample; np.int was a plain
        # alias of int and no longer exists in current NumPy
        strapsize = int(len(X) * self.fraction)

        n_features = X.shape[1]
        features = np.arange(n_features)
        # determine the number of features to subsample each iteration
        # using the sqrt(n) rule of thumb if n >= 9
        subsize = int(np.ceil(np.sqrt(n_features))) if n_features >= 9 else n_features

        # start growing the trees (range instead of the Python 2-only xrange)
        for t in range(self.n_trees):
            # create a new classification tree
            tree = ClassificationTree(depth_limit=self.depth_limit, impurity=self.impurity)
            # bootstrap a sample (with replacement, the np.random.choice default)
            bootstrap = np.random.choice(indices, strapsize)
            # features are not sampled with replacement
            subfeature = np.random.choice(features, subsize, replace=False)
            Xstrap = X[bootstrap, :][:, subfeature]
            ystrap = y[bootstrap]
            # train the t-th tree with the strapped sample
            tree.train(Xstrap, ystrap)
            # for each tree, need to save which features to use
            # NOTE(review): relies on self.trees already accepting index t.
            self.trees[t] = [tree, subfeature]
        self.trained = True
        print("%d trees grown" % self.n_trees)
Beispiel #5
0
__author__ = "metjush"

# An example file for the decision_tree repository, using datasets from scikit-learn
# to demonstrate classification with a single tree, bagged forest and random forest.
# If you just want to see if the package works, run this file.

# Importing all requirements: third-party packages first, then the project's
# own modules. All imports sit together so a missing dependency fails before
# any classifier is constructed.

import numpy as np
from sklearn.datasets import load_iris  # iris classification

from ClassTree import ClassificationTree
from ClassTreeBagging import TreeBagger
from ClassForest import RandomForest

# Create the classifier objects: one single tree plus two 50-tree ensembles
tree = ClassificationTree()
bag = TreeBagger(n_trees=50)
forest = RandomForest(n_trees=50)

# Load the iris dataset and save its samples and labels to arrays
iris = load_iris()

X_iris = iris.data
y_iris = iris.target

# Train classifiers with Iris data

# Simple tree training
tree.train(X_iris, y_iris)
# Reject the request early when no file was uploaded.
if not fileitem.filename:
    raise IOError("Upload of file failed")

# strip leading path
fn = os.path.basename(fileitem.filename)
# The stored name is the hex digest of the client file name.
# NOTE(review): presumably filehash is a hashlib object -- on Python 3 those
# only accept bytes, so text is encoded before hashing.
filehash.update(fn if isinstance(fn, bytes) else fn.encode("utf-8"))
name = filehash.hexdigest()
csv_path = UPLOAD_DIR + name + ".csv"
# write through a context manager so the upload is flushed and closed
# before it is read back
with open(csv_path, 'wb') as upload:
    upload.write(fileitem.file.read())

# we will be returning a json file, so set header
# NOTE(review): this string uses PHP header() syntax and is only stored here,
# never emitted in the visible code -- confirm how it is used.
message_header = "header('Content-type: application/json');"

# Import ClassificationTree class
from ClassTree import ClassificationTree
import numpy as np

# read the saved file as a numpy array
with open(csv_path, 'r') as savedfile:
    data = np.loadtxt(savedfile, delimiter=",")

# Features are every column except the label column. np.delete removes that
# column along axis 1 (the original concatenate call had no axis argument and
# therefore stacked the two slices as rows).
# NOTE(review): label_column and depth are defined outside this fragment and
# must be integers here.
X = np.delete(data, label_column, axis=1)
y = data[:, label_column]

train_tree = ClassificationTree(depth_limit=depth)
# train() takes the samples and labels, as in tree.train(X_iris, y_iris)
train_tree.train(X, y)
train_json = train_tree.to_json(JSON_DIR + name + ".json")

Beispiel #7
0
# Both values are used as integers below (tree depth limit, column index).
# NOTE(review): form looks like a cgi.FieldStorage, whose items expose the
# submitted text as `.value`; getattr falls back to the item itself so a
# plain mapping of strings or ints works too -- confirm the form type.
depth_field = form['depth']
label_field = form['label']
depth = int(getattr(depth_field, 'value', depth_field))
label_column = int(getattr(label_field, 'value', label_field))

# checking
if not fileitem.filename:
    raise IOError("Upload of file failed")

# strip leading path
fn = os.path.basename(fileitem.filename)
# The stored name is the hex digest of the client file name.
# NOTE(review): presumably filehash is a hashlib object -- on Python 3 those
# only accept bytes, so text is encoded before hashing.
filehash.update(fn if isinstance(fn, bytes) else fn.encode("utf-8"))
name = filehash.hexdigest()
csv_path = UPLOAD_DIR + name + ".csv"
# the context manager guarantees the upload is flushed and closed before
# it is read back
with open(csv_path, 'wb') as upload:
    upload.write(fileitem.file.read())

# we will be returning a json file, so set header
# NOTE(review): this string uses PHP header() syntax and is only stored here,
# never emitted in the visible code -- confirm how it is used.
message_header = "header('Content-type: application/json');"

# Import ClassificationTree class
from ClassTree import ClassificationTree
import numpy as np

# read the saved file as a numpy array
with open(csv_path, 'r') as savedfile:
    data = np.loadtxt(savedfile, delimiter=",")

# Features are every column except the label column; np.delete drops it along
# axis 1 (the original concatenate call had no axis argument and stacked the
# two slices as rows instead of joining columns).
X = np.delete(data, label_column, axis=1)
y = data[:, label_column]

train_tree = ClassificationTree(depth_limit=depth)
# train() needs the samples and labels that were just loaded
train_tree.train(X, y)
train_json = train_tree.to_json(JSON_DIR + name + ".json")
Beispiel #8
0
__author__ = "metjush"

# An example file for the decision_tree repository, using datasets from scikit-learn
# to demonstrate classification with a single tree, bagged forest and random forest.
# If you just want to see if the package works, run this file.

# Importing all requirements. Every import is grouped here (third-party, then
# project modules) rather than partway through the script, so an absent
# package is reported before any object is built.

import numpy as np
from sklearn.datasets import load_iris  # iris classification

from ClassTree import ClassificationTree
from ClassTreeBagging import TreeBagger
from ClassForest import RandomForest

# Create the classifier objects (the two ensembles use 50 trees each)
tree = ClassificationTree()
bag = TreeBagger(n_trees=50)
forest = RandomForest(n_trees=50)

# Get the iris dataset from scikit-learn and save it to arrays
iris = load_iris()

X_iris = iris.data
y_iris = iris.target

# Train classifiers with Iris data

# Simple tree training
tree.train(X_iris, y_iris)
Beispiel #9
0
# Standard library
import copy
import string
import sys
from collections import Counter
from operator import itemgetter

# Third-party
import numpy as np
from sklearn.datasets import load_iris  # iris classification

# Project modules
from ClassTree import ClassificationTree
from ClassTreeBagging import TreeBagger
from ClassForest import RandomForest

# Create the classifier objects: a single tree and two 50-tree ensembles
tree = ClassificationTree()
bag = TreeBagger(n_trees=50)
forest = RandomForest(n_trees=50)


def process_str(s):
    """Return the lowercase whitespace-separated tokens of ``s`` without punctuation.

    Every character in ``string.punctuation`` is deleted (not replaced by a
    space), so ``"stop-me"`` becomes the single token ``"stopme"``.
    """
    # Mapping a code point to None tells str.translate to drop that character.
    punctuation_table = dict.fromkeys(map(ord, string.punctuation))
    stripped = s.translate(punctuation_table)
    lowered = stripped.lower()
    return lowered.split()

def read_dataset(file_name):
    dataset = []
    with open(file_name) as f:
        for line in f:
            index, class_label, text = line.strip().split('\t')
            words = process_str(text)
            dataset.append( (int(class_label), words) )