Example #1
def test_random_hasher_sparse_data():
    X, y = datasets.make_multilabel_classification(return_indicator=True,
                                                   random_state=0)
    hasher = RandomTreesEmbedding(n_estimators=30, random_state=1)
    X_transformed = hasher.fit_transform(X)
    X_transformed_sparse = hasher.fit_transform(csc_matrix(X))
    assert_array_equal(X_transformed_sparse.toarray(), X_transformed.toarray())
Example #2
def test_random_trees_dense_equal():
    # Test that the `sparse_output` parameter of RandomTreesEmbedding
    # works by returning the same array for both argument values.

    # Create the RTEs
    hasher_dense = RandomTreesEmbedding(n_estimators=10, sparse_output=False, random_state=0)
    hasher_sparse = RandomTreesEmbedding(n_estimators=10, sparse_output=True, random_state=0)
    X, y = datasets.make_circles(factor=0.5)
    X_transformed_dense = hasher_dense.fit_transform(X)
    X_transformed_sparse = hasher_sparse.fit_transform(X)

    # Assert that dense and sparse hashers have same array.
    assert_array_equal(X_transformed_sparse.toarray(), X_transformed_dense)
Example #3
def rt_embedding(X, n_estimators=100, max_depth=10, n_jobs=-1):
    """Embed data matrix X in a random forest.

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        The data matrix.
    n_estimators : int, optional
        The number of trees in the embedding.
    max_depth : int, optional
        The maximum depth of each tree.
    n_jobs : int, optional
        Number of parallel jobs used when fitting the trees. -1 means
        use all processors on the current machine.

    Returns
    -------
    rt : RandomTreesEmbedding object
        The embedding object.
    X_transformed : sparse matrix
        The transformed data.
    """
    rt = RandomTreesEmbedding(n_estimators=n_estimators, max_depth=max_depth,
                              n_jobs=n_jobs)
    X_transformed = rt.fit_transform(X)
    return rt, X_transformed
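
A minimal usage sketch for rt_embedding on synthetic data (the random matrix and parameter values here are made up for illustration; RandomTreesEmbedding must be importable as in the function above):

import numpy as np
from sklearn.ensemble import RandomTreesEmbedding

X = np.random.RandomState(0).rand(100, 4)
rt, X_transformed = rt_embedding(X, n_estimators=50, max_depth=5)
# sparse one-hot leaf encoding: one active leaf per tree per sample
print(X_transformed.shape)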
Example #4
def test_random_trees_dense_type():
    # Test that the `sparse_output` parameter of RandomTreesEmbedding
    # works by returning a dense array.

    # Create the RTE with sparse=False
    hasher = RandomTreesEmbedding(n_estimators=10, sparse_output=False)
    X, y = datasets.make_circles(factor=0.5)
    X_transformed = hasher.fit_transform(X)

    # Assert that type is ndarray, not scipy.sparse.csr.csr_matrix
    assert_equal(type(X_transformed), np.ndarray)
Example #5
def do_TRT(ne=10, md=3):
    from sklearn.ensemble import RandomTreesEmbedding
    from sklearn.naive_bayes import BernoulliNB
    train_X, train_Y, test_X, test_Y = analysis_glass()
    all_X = np.vstack((train_X, test_X))
    hasher = RandomTreesEmbedding(n_estimators=ne,
                                  random_state=0, max_depth=md)
    all_X_trans = hasher.fit_transform(all_X)
    # slice the *transformed* matrix, not the raw one
    train_X_trans = all_X_trans[0:149, :]
    test_X_trans = all_X_trans[149:, :]

    nb = BernoulliNB()
    nb.fit(train_X_trans, train_Y)

    return nb.score(test_X_trans, test_Y)
Example #6
def RandomTrees(X, labels, imgs, **kwargs):
    """
    Stable

    """
    # Random Trees embedding of the dataset
    print("Computing Random Trees embedding")
    hasher = RandomTreesEmbedding(n_estimators=200,
                                  random_state=0,
                                  max_depth=5)
    t = time()
    X_transformed = hasher.fit_transform(X)
    pca = TruncatedSVD(n_components=2)
    X_reduced = pca.fit_transform(X_transformed)

    plot_embedding(
        X_reduced, labels, imgs,
        "Random Trees embedding of the dataset (time %.2fs)" % (time() - t),
        **kwargs)
Example #7
def test_random_hasher():
    # test random forest hashing on circles dataset
    # make sure that it is linearly separable.
    # even after projected to two pca dimensions
    hasher = RandomTreesEmbedding(n_estimators=30, random_state=0)
    X, y = datasets.make_circles(factor=0.5)
    X_transformed = hasher.fit_transform(X)

    # test fit and transform:
    hasher = RandomTreesEmbedding(n_estimators=30, random_state=0)
    assert_array_equal(hasher.fit(X).transform(X).toarray(), X_transformed.toarray())

    # one leaf active per data point per forest
    assert_equal(X_transformed.shape[0], X.shape[0])
    assert_array_equal(X_transformed.sum(axis=1), hasher.n_estimators)
    pca = RandomizedPCA(n_components=2)
    X_reduced = pca.fit_transform(X_transformed)
    linear_clf = LinearSVC()
    linear_clf.fit(X_reduced, y)
    assert_equal(linear_clf.score(X_reduced, y), 1.0)
Example #9
def test_random_hasher():
    # test random forest hashing on circles dataset
    # make sure that it is linearly separable.
    # even after projected to two SVD dimensions
    # Note: Not all random_states produce perfect results.
    hasher = RandomTreesEmbedding(n_estimators=30, random_state=1)
    X, y = datasets.make_circles(factor=0.5)
    X_transformed = hasher.fit_transform(X)

    # test fit and transform:
    hasher = RandomTreesEmbedding(n_estimators=30, random_state=1)
    assert_array_equal(
        hasher.fit(X).transform(X).toarray(), X_transformed.toarray())

    # one leaf active per data point per forest
    assert X_transformed.shape[0] == X.shape[0]
    assert_array_equal(X_transformed.sum(axis=1), hasher.n_estimators)
    svd = TruncatedSVD(n_components=2)
    X_reduced = svd.fit_transform(X_transformed)
    linear_clf = LinearSVC()
    linear_clf.fit(X_reduced, y)
    assert linear_clf.score(X_reduced, y) == 1.
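
The row-sum assertion in these tests reflects the core invariant of the embedding: each tree assigns a sample to exactly one leaf, so every encoded row sums to n_estimators. A standalone sketch of that check:

import numpy as np
from sklearn.datasets import make_circles
from sklearn.ensemble import RandomTreesEmbedding

X, _ = make_circles(factor=0.5, random_state=0)
emb = RandomTreesEmbedding(n_estimators=30, random_state=1).fit_transform(X)
# every sample activates exactly one leaf per tree
assert (np.asarray(emb.sum(axis=1)).ravel() == 30).all()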
Example #10
class Clustering():
    def __init__(self, compounds, output=False, seed=False):
        np.random.seed(seed=seed)
        self.seed = seed
        self.compounds = compounds
        self.count = 0
        self.count_1 = 0
        self.output = output
        self.tools = clustertools()
        if self.output is not False:
            self.figures = clusterfigures(self.compounds)
        self.testcompound = []

    def cluster_training(self, train, distance=False):
        '''
        This is the basic clustering function
        '''
        self.train_matrix = train.train
        '''
        Step one is to make sure that there is a distance matrix in place.
        It is best to feed an existing distance matrix if one is available.
        '''
        if distance is False:
            self.p_feat_matrix = self.tools.pairwise_distance_matrix(
                train.train, 'jaccard')
        else:
            self.p_feat_matrix = distance
        '''
        Step two is to cluster your data using a random trees embedding, i.e. a
        random ensemble of trees. This transforms the data into a
        high-dimensional, sparse space
        '''
        self.clf = RandomTreesEmbedding(n_estimators=512,
                                        random_state=self.seed,
                                        max_depth=5)
        #self.clf.fit(self.train_matrix)
        X_transformed = self.clf.fit_transform(self.train_matrix)
        '''
        Step three performs truncated SVD (similar to PCA). It operates on the sample
        vectors directly, rather than the covariance matrix. It takes the first two
        components. Essentially this reduces the sparse embedding to a low dimensional
        representation.
        '''
        self.svd = TruncatedSVD(n_components=2)
        self.svd.clf = self.svd.fit(X_transformed)
        self.model = self.svd.clf.transform(X_transformed)
        '''
        The next step is to take the transformed model and the original dataset and
        determine the max silhouette_score of clusters
        '''
        (self.cluster_assignment, self.cluster_num,
         self.cluster_score) = self.tools.identify_accurate_number_of_clusters(
             self.model, self.compounds)
        self.individualclusters = []
        '''
        The individual datapoints are assessed with regard to the best clustering scheme
        '''
        for i in range(self.cluster_num):
            self.individualclusters.append([])
            for j in range(len(self.cluster_assignment)):
                if self.cluster_assignment[j] == i:
                    self.individualclusters[i].append(self.model[j, :])
            self.individualclusters[i] = np.array(self.individualclusters[i])
        '''
        Finally, this clustering scheme is used to generate a one class Support
        Vector Machine decision boundary.
        '''
        (self.clf_OCSVM,
         self.OCSVM_model) = self.tools.determine_test_similarity(
             self.individualclusters)

    def cluster_testing(self, testing):
        '''Create a RandomTreesEmbedding of the data'''
        clf = RandomTreesEmbedding(n_estimators=512,
                                   random_state=self.seed,
                                   max_depth=5)
        '''Fit the embedding on the testing data and transform it'''
        clf.fit(testing)
        X_transformed = clf.transform(testing)
        n_components = 2
        '''SVD transform data'''
        svd = TruncatedSVD(n_components=n_components)
        svd.clf = svd.fit(X_transformed)
        svd.model = svd.clf.transform(X_transformed)
        '''Train transformed data using original model'''
        train_transformed = clf.transform(self.train_matrix)
        train_model = svd.clf.transform(train_transformed)
        '''Generate One Class SVM rejection criteria'''
        (clf_OCSVM_t, OCSVMmodel_t
         ) = self.tools.determine_testing_data_similarity(train_model)
        predicted = []
        '''Remove testing compounds outside rejection margin'''
        for i in range(len(svd.model)):
            p = OCSVMmodel_t.predict(svd.model[i, :].reshape(1, -1))
            if p == 1:
                predicted.append(i)
        return predicted
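
The embed-then-reduce core of cluster_training can be exercised in isolation. A minimal sketch on synthetic data (clustertools, the silhouette search, and the one-class SVM helpers are project-specific and omitted):

import numpy as np
from sklearn.ensemble import RandomTreesEmbedding
from sklearn.decomposition import TruncatedSVD

X = np.random.RandomState(0).rand(60, 16)   # stand-in for train.train
embedder = RandomTreesEmbedding(n_estimators=512, max_depth=5, random_state=0)
X_sparse = embedder.fit_transform(X)        # high-dimensional, sparse embedding
model = TruncatedSVD(n_components=2).fit_transform(X_sparse)
print(model.shape)                          # (60, 2), ready for clustering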
Example #11
"""Embed the data with totally random trees and compare a classifier
trained on the transformed representation with one trained on the
original data.
"""
import pylab as pl
import numpy as np

from sklearn.datasets import make_circles
from sklearn.ensemble import RandomTreesEmbedding, ExtraTreesClassifier
from sklearn.decomposition import RandomizedPCA
from sklearn.naive_bayes import BernoulliNB

# make a synthetic dataset
X, y = make_circles(factor=0.5, random_state=0, noise=0.05)

# use RandomTreesEmbedding to transform data
hasher = RandomTreesEmbedding(n_estimators=10, random_state=0, max_depth=3)
X_transformed = hasher.fit_transform(X)

# Visualize result using PCA
pca = RandomizedPCA(n_components=2)
X_reduced = pca.fit_transform(X_transformed)

# Learn a Naive Bayes classifier on the transformed data
nb = BernoulliNB()
nb.fit(X_transformed, y)


# Learn an ExtraTreesClassifier for comparison
trees = ExtraTreesClassifier(max_depth=3, n_estimators=10, random_state=0)
trees.fit(X, y)

Example #12
"""Embed the data with totally random trees and compare a classifier
trained on the transformed representation with one trained on the
original data.
"""
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import make_circles
from sklearn.ensemble import RandomTreesEmbedding, ExtraTreesClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.naive_bayes import BernoulliNB

# make a synthetic dataset
X, y = make_circles(factor=0.5, random_state=0, noise=0.05)

# use RandomTreesEmbedding to transform data
hasher = RandomTreesEmbedding(n_estimators=10, random_state=0, max_depth=3)
X_transformed = hasher.fit_transform(X)

# Visualize result after dimensionality reduction using truncated SVD
svd = TruncatedSVD(n_components=2)
X_reduced = svd.fit_transform(X_transformed)

# Learn a Naive Bayes classifier on the transformed data
nb = BernoulliNB()
nb.fit(X_transformed, y)

# Learn an ExtraTreesClassifier for comparison
trees = ExtraTreesClassifier(max_depth=3, n_estimators=10, random_state=0)
trees.fit(X, y)

# scatter plot of original and reduced data
fig = plt.figure(figsize=(9, 8))
Example #13
class UnsupervisedVisualBagClassifier(Classifier):
    """
    ===============================
    UnsupervisedVisualBagClassifier
    ===============================
    1. Unsupervised
    2. Binary bag of words
    3. Totally random trees
    """
    def __init__(self,
                 coordinator,
                 base_classifier,
                 n_estimators=10,
                 max_depth=5,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 n_jobs=-1,
                 random_state=None,
                 verbose=0,
                 min_density=None):
        Classifier.__init__(self, coordinator, base_classifier)
        self.histoSize = 0
        self._visualBagger = RandomTreesEmbedding(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose,
            min_density=min_density)

    def _preprocess(self, image_buffer, learningPhase):
        if learningPhase:
            self.setTask(1, "Extracting the features (model creation)")
        else:
            self.setTask(1, "Extracting the features (prediction)")

        X_pred, y = self._coord.process(image_buffer,
                                        learningPhase=learningPhase)

        y_user = self._convertLabel(y)

        #Cleaning up
        self._coord.clean(y)
        del y

        self.endTask()

        #Bag-of-word transformation
        self.setTask(1, "Transforming data into bag-of-words (Tree part)")

        X2 = None
        if learningPhase:
            X2 = self._visualBagger.fit_transform(X_pred, y_user)
            self.histoSize = X2.shape[1]
        else:
            X2 = self._visualBagger.transform(X_pred)

        #Cleaning up
        self._coord.clean(X_pred)
        del X_pred
        del y_user

        self.endTask()

        nbFactor = X2.shape[0] // len(image_buffer)

        if not sps.isspmatrix_csr(X2):
            X2 = X2.tocsr()

        if nbFactor == 1:
            return X2

        self.setTask(len(image_buffer),
                     "Transforming data into bag-of-words (Histogram part)")
        nbTrees = self._visualBagger.n_estimators
        X3 = computeHistogram(len(image_buffer), nbFactor, nbTrees, X2)
        self.endTask()

        #Cleaning up
        del X2  # Should be useless

        return X3

    def fit_histogram(self, hist, y):
        #Delegating the classification
        self.setTask(1, "Learning the model")

        self._classifier.fit(hist, y)

        self.endTask()

        return self

    def fit(self, image_buffer):
        """
        Fits the data contained in the :class:`ImageBuffer` instance

        Parameters
        -----------
        image_buffer : :class:`ImageBuffer`
            The data to learn from

        Return
        -------
        self : :class:`Classifier`
            This instance
        """
        #Updating the labels
        y_user = image_buffer.getLabels()
        self._buildLUT(y_user)
        y = self._convertLabel(y_user)

        X = self._preprocess(image_buffer, learningPhase=True)

        return self.fit_histogram(X, y)

    def predict(self, image_buffer):
        """
        Classify the data contained in the :class:`ImageBuffer` instance

        Parameters
        -----------
        image_buffer : :class:`ImageBuffer`
            The data to classify

        Return
        -------
        list : list of int
            each entry is the classification label corresponding to the input
        """

        X = self._preprocess(image_buffer, learningPhase=False)
        y_classif = self._classifier.predict(X)
        return self._convertLabelsBackToUser(y_classif)

    def predict_proba(self, image_buffer):
        """
        Classify softly the data contained in the :class:`ImageBuffer`
        instance, i.e. yield a probability vector of belonging to each
        class

        Parameters
        -----------
        image_buffer : :class:`ImageBuffer`
            The data to classify

        Return
        -------
        list : list of list of float
            each entry is the probability vector of the input of the same
            index as computed by the base classifier
        """
        if not hasattr(self._classifier, "predict_proba"):
            #Early error
            self._classifier.predict_proba(np.zeros((1, 1)))

        X = self._preprocess(image_buffer, learningPhase=False)
        return self._classifier.predict_proba(X)
Example #15
def random_forest_embedding():
    import numpy as np
    import matplotlib.pyplot as plt

    from sklearn.datasets import make_circles
    from sklearn.ensemble import RandomTreesEmbedding, ExtraTreesClassifier
    from sklearn.decomposition import TruncatedSVD
    from sklearn.naive_bayes import BernoulliNB

    # build the dataset
    X, y = make_circles(factor=0.5, random_state=0, noise=0.05)

    # print y
    # print X.shape  # X is 100 x 2, y is 100 x 1 (an array of 0s and 1s)

    # transform the data: set the parameters and build the model
    hasher = RandomTreesEmbedding(n_estimators=10, random_state=0, max_depth=3)
    X_transformed = hasher.fit_transform(X)

    # print X_transformed[99]
    # print X_transformed.shape  # 100 x 74 -- why does the high-dimensional
    # sparse representation help classification?
    # RandomTreesEmbedding provides a way to map data to a very high-dimensional,
    # sparse representation, which might be beneficial for classification.

    pca = TruncatedSVD(n_components=2)
    X_reduced = pca.fit_transform(X_transformed)

    # print X_reduced  # X_reduced is 100 x 2

    # learn a Naive Bayes classifier on the transformed data
    nb = BernoulliNB()
    nb.fit(X_transformed, y)  # train on the high-dimensional sparse matrix and y

    # learn an ExtraTreesClassifier for comparison
    trees = ExtraTreesClassifier(max_depth=3, n_estimators=10, random_state=0)
    trees.fit(X, y)  # train on the original 2-D X and y

    # scatter plot of original and reduced data
    fig = plt.figure(figsize=(9, 8))
    ax = plt.subplot(221)
    # X[:, 0] is the x coordinate, X[:, 1] the y coordinate; y is the label
    ax.scatter(X[:, 0], X[:, 1], c=y, s=50)
    ax.set_title("Original Data (2d)")
    ax.set_xticks(())
    ax.set_yticks(())

    ax = plt.subplot(222)
    # note: X has been transformed but the labels are unchanged, so the labels
    # can be used to judge the effect of the transform
    ax.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y, s=50)
    ax.set_title("pca reduction (2d) of transformed data (%dd)" % X_transformed.shape[1])
    ax.set_xticks(())
    ax.set_yticks(())

    # plot the decision in original space
    h = 0.01
    x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
    y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5

    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    # transform the grid using RandomTreesEmbedding, then predict with nb
    transformed_grid = hasher.transform(np.c_[xx.ravel(), yy.ravel()])
    y_grid_pred = nb.predict_proba(transformed_grid)[:, 1]

    ax = plt.subplot(223)
    ax.set_title("Naive Bayes on Transformed data")
    ax.pcolormesh(xx, yy, y_grid_pred.reshape(xx.shape))
    ax.scatter(X[:, 0], X[:, 1], c=y, s=50)

    ax.set_ylim(-1.4, 1.4)
    ax.set_xlim(-1.4, 1.4)
    ax.set_xticks(())
    ax.set_yticks(())

    # predict on the grid with the ExtraTreesClassifier for comparison
    y_grid_pred = trees.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

    ax = plt.subplot(224)
    ax.set_title("ExtraTrees predictions")
    ax.pcolormesh(xx, yy, y_grid_pred.reshape(xx.shape))
    ax.scatter(X[:, 0], X[:, 1], c=y, s=50)

    ax.set_ylim(-1.4, 1.4)
    ax.set_xlim(-1.4, 1.4)
    ax.set_xticks(())
    ax.set_yticks(())

    plt.tight_layout()
    plt.show()
Example #16
    def post(self, request):

        age = request.data["age"]
        sex = request.data["gen"]
        cp = request.data["c_pain"]
        trestbps = request.data["bp_lvl"]
        chol = request.data["choles"]
        fbs = request.data["bp_fast"]
        restecg = request.data["ecg"]
        talach = request.data["h_rate"]
        exang = request.data["i_exe"]
        oldpeak = request.data["d_exe"]
        slope = request.data["sd_seg"]
        ca = request.data["his"]
        thal = request.data["thal_scn"]

        dspath = settings.BASE_DIR + settings.STATIC_URL + "heart.xlsx"
        data = read_excel(dspath, "heart")
        # data = read_csv("heart.csv")
        X = data.iloc[:, 0:13].values
        y = data.iloc[:, 13].values

        hasher = RandomTreesEmbedding(n_estimators=10,
                                      random_state=0,
                                      max_depth=3)
        X_transformed = hasher.fit_transform(X)

        clf = LogisticRegression(random_state=0).fit(X_transformed, y)

        inp = "#".join([age, sex, cp, trestbps, chol, fbs, restecg, talach,
                        exang, oldpeak, slope, ca, thal])
        import numpy as np
        inpa = np.fromstring(inp, dtype=float, sep='#')

        transformed_grid = hasher.transform([inpa])

        o = clf.predict(transformed_grid)
        print(o)

        obj = AddMedicalRecord()
        obj.uid = request.data["uid"]
        obj.date = datetime.date.today()
        # obj.date = "2020-02-02"
        if o == [1]:
            obj.result = "HEART PATIENT"
        if o == [0]:
            obj.result = "NO HEART DISEASE"

        dspath = settings.BASE_DIR + settings.STATIC_URL + "heart.xlsx"
        data = read_excel(dspath, "heart")
        # data = read_csv("heart.csv")
        X = data.iloc[:, 0:13].values
        y = data.iloc[:, 13].values
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=.3,
                                                            random_state=0)
        hasher = RandomTreesEmbedding(n_estimators=10,
                                      random_state=0,
                                      max_depth=3)
        X_transformed = hasher.fit_transform(X_train)

        clf = LogisticRegression(random_state=0).fit(X_transformed, y_train)
        X_test = hasher.transform(X_test)  # reuse the encoding fitted on X_train
        y_score = clf.predict(X_test)

        sc = accuracy_score(y_test, y_score)

        obj.accu = sc * 100
        obj.save()

        return HttpResponse("Success with Acc :" + str(sc))
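
The accuracy computation above relies on encoding the test split with the hasher fitted on the training split. A standalone sketch of that fit_transform/transform pattern on synthetic stand-in data (shapes and labels here are made up for illustration):

import numpy as np
from sklearn.ensemble import RandomTreesEmbedding
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

rng = np.random.RandomState(0)
X = rng.rand(200, 13)                      # stand-in for the 13 clinical features
y = rng.randint(0, 2, 200)                 # stand-in labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3,
                                                    random_state=0)
hasher = RandomTreesEmbedding(n_estimators=10, random_state=0, max_depth=3)
X_train_t = hasher.fit_transform(X_train)  # fit the leaf encoding on train only
X_test_t = hasher.transform(X_test)        # reuse the same encoding on test
clf = LogisticRegression(random_state=0).fit(X_train_t, y_train)
print(accuracy_score(y_test, clf.predict(X_test_t)))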
Example #18
        'Gradient Boost'
    ]
    return classifier_list, classifier_name_list


def print_evaluation_metrics(trained_model, trained_model_name, X_test,
                             y_test):
    print('--------- For Model : ', trained_model_name)
    predicted_values = trained_model.predict(X_test)
    print(metrics.classification_report(y_test, predicted_values))
    print("Accuracy Score : ", metrics.accuracy_score(y_test, predicted_values))
    print("---------------------------------------\n")


filename = 'train.csv'
imperial_frame = pd.read_csv(filename)
feature_hash = hashfeatures.FeatureHash(max_feature_num=5000)
insult_features = feature_hash.get_feature_set(
    list(imperial_frame['Comment'].values))
class_labels = list(imperial_frame['Insult'].values)
rf_embed_features = RandomTreesEmbedding(n_estimators=151, random_state=42)
insult_features = rf_embed_features.fit_transform(insult_features)
X_train, X_test, y_train, y_test = train_test_split(insult_features,
                                                    class_labels,
                                                    test_size=0.1,
                                                    random_state=42)
classifier_list, classifier_name_list = get_ensemble_models()
for classifier, classifier_name in zip(classifier_list, classifier_name_list):
    classifier.fit(X_train, y_train)
    print_evaluation_metrics(classifier, classifier_name, X_test, y_test)
Example #19
# pca = PCA(n_components=2)
# X = pca.fit_transform(X)
# print(X.shape)

fig, (ax1, ax2) = plt.subplots(1, 2)

for n_components in range_n_components:
    values = []

    for n_clusters in range_n_clusters:
        # reducer = PCA(n_components=n_components)
        # reducer = FastICA(n_components=n_components)
        # reducer = SparseRandomProjection(n_components=n_components)
        # x = reducer.fit_transform(X)
        reducer = RandomTreesEmbedding(n_estimators=n_components, max_depth=3)
        x = reducer.fit_transform(X).toarray()
        print(x.shape)
        # x = X

        # # Create a subplot with 1 row and 2 columns
        # fig, (ax1, ax2) = plt.subplots(1, 2)
        # fig.set_size_inches(18, 7)

        # # The 1st subplot is the silhouette plot
        # # The silhouette coefficient can range from -1, 1 but in this example all
        # # lie within [-0.1, 1]
        # ax1.set_xlim([-0.1, 1])
        # # The (n_clusters+1)*10 is for inserting blank space between silhouette
        # # plots of individual clusters, to demarcate them clearly.
        # ax1.set_ylim([0, len(x) + (n_clusters + 1) * 10])
Example #20
                 min_samples_leaf=1,
                 min_weight_fraction_leaf=0.,
                 max_leaf_nodes=None,
                 min_impurity_decrease=0.,
                 min_impurity_split=None,
                 sparse_output=True,  # whether to output a sparse matrix
                 n_jobs=1,
                 random_state=None,
                 verbose=0,
                 warm_start=False):
    """
    algo = RandomTreesEmbedding(n_estimators=100,
                                max_depth=2,
                                sparse_output=True)
    # train the model
    X_train2 = algo.fit_transform(X_train)
    print(X_train2)

    # inspect the fitted attributes
    x_test2 = [[6.9, 3.1, 5.1, 2.3], [6.1, 2.8, 4.0, 1.3],
               [5.2, 3.4, 1.4, 0.2], [4.7, 3.2, 1.6, 0.2]]
    print("Transformed values of the samples:")
    print(algo.transform(x_test2))
    # # evaluate the model
    # print('Accuracy on the training set: {}'.format(algo.score(X_train, Y_train)))
    # print('Accuracy on the test set: {}'.format(algo.score(X_test, Y_test)))

    print("All fitted sub-estimators:\n{}".format(algo.estimators_))

    # visualize each sub-estimator
    for k, estimator in enumerate(algo.estimators_):
Example #21
def test_random_hasher_sparse_data():
    X, y = datasets.make_multilabel_classification(random_state=0)
    hasher = RandomTreesEmbedding(n_estimators=30, random_state=1)
    X_transformed = hasher.fit_transform(X)
    X_transformed_sparse = hasher.fit_transform(csc_matrix(X))
    assert_array_equal(X_transformed_sparse.toarray(), X_transformed.toarray())
Example #22
def randomtrees_embedding_others(df2):
    df = df2.copy()
    rte = RandomTreesEmbedding(random_state=seed)
    df = pd.DataFrame(rte.fit_transform(df).toarray())
    return df
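
A minimal usage sketch, assuming the imports and the module-level seed that the snippet expects (pandas as pd and RandomTreesEmbedding from sklearn.ensemble):

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomTreesEmbedding

seed = 0  # the function reads this module-level value
df_in = pd.DataFrame(np.random.RandomState(seed).rand(10, 4))
df_out = randomtrees_embedding_others(df_in)
print(df_out.shape)  # (10, total number of leaves across the forest)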