def get_new_model(self, X, Y, corr_mat):
    # Draw a random tree depth from [1, log2(n_samples)].
    max_depth = int(np.log2(X.shape[0]))
    depth = np.random.randint(max_depth) + 1
    # Create the model: a single totally random tree.
    hasher = RandomTreesEmbedding(n_estimators=1, max_depth=depth)
    hasher.fit(X)
    x_trans_dense = hasher.transform(X).toarray()
    y_trans_dense = hasher.transform(Y).toarray()
    # For every leaf (column), bump corr_mat for each pair of samples
    # from Y and X that land in that same leaf.
    for i in range(x_trans_dense.shape[1]):
        index_array_x = np.where(x_trans_dense[:, i] == 1.0)[0]
        index_array_y = np.where(y_trans_dense[:, i] == 1.0)[0]
        for idx in index_array_y:
            corr_mat[idx, index_array_x] += 1
    # corr_mat is updated in place.
    return
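A minimal standalone sketch of the same leaf co-occurrence idea (toy data and names are hypothetical; only NumPy and scikit-learn are assumed):

import numpy as np
from sklearn.ensemble import RandomTreesEmbedding

# Toy data: samples that fall into the same leaf count as co-occurring.
X_demo = np.random.rand(20, 3)
hasher = RandomTreesEmbedding(n_estimators=1, max_depth=3).fit(X_demo)
leaves = hasher.transform(X_demo).toarray()  # one-hot leaf membership

# leaves @ leaves.T counts shared leaves for every pair of samples.
corr_demo = leaves @ leaves.T
print(corr_demo.shape)  # (20, 20)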
class RandomTreesEmbeddingPrim(primitive):
    def __init__(self, random_state=0):
        super(RandomTreesEmbeddingPrim,
              self).__init__(name='RandomTreesEmbedding')
        self.id = 54
        self.PCA_LAPACK_Prim = []
        self.type = 'feature engineering'
        self.description = "RandomTreesEmbedding: an unsupervised transformation using a forest of completely random trees."
        self.hyperparams_run = {'default': True}
        self.pca = RandomTreesEmbedding(random_state=random_state)
        self.accept_type = 'c_t'

    def can_accept(self, data):
        return self.can_accept_c(data)

    def is_needed(self, data):
        # data = handle_data(data)
        return True

    def fit(self, data):
        data = handle_data(data)
        self.pca.fit(data['X'])

    def produce(self, data):
        output = handle_data(data)
        cols = list(output['X'].columns)
        code = ''.join(word[0] for word in cols)[:10]
        result = self.pca.transform(output['X']).toarray()
        new_cols = list(map(str, list(range(result.shape[1]))))
        cols = ["{}_rfembdng{}".format(x, code) for x in new_cols]
        output['X'] = pd.DataFrame(result, columns=cols)
        final_output = {0: output}
        return final_output
    def test_random_trees_embedding(self):
        X, _ = make_regression(
            n_features=5, n_samples=100, n_targets=1, random_state=42,
            n_informative=3)
        X = X.astype(numpy.float32)

        model = RandomTreesEmbedding(
            n_estimators=3, max_depth=2, sparse_output=False).fit(X)
        model.transform(X)
        model_onnx = to_onnx(
            model, X[:1], target_opset=TARGET_OPSET)
        with open("model.onnx", "wb") as f:
            f.write(model_onnx.SerializeToString())
        self.check_model(model_onnx, X)
        dump_data_and_model(
            X.astype(numpy.float32), model, model_onnx,
            basename="SklearnRandomTreesEmbedding")
class _RandomTreesEmbeddingImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
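A hedged usage sketch for the wrapper above, assuming Op is bound to sklearn.ensemble.RandomTreesEmbedding (toy data; names are illustrative):

import numpy as np
from sklearn.ensemble import RandomTreesEmbedding as Op  # assumption: Op aliases this class

impl = _RandomTreesEmbeddingImpl(n_estimators=5, max_depth=3)
X_demo = np.random.rand(10, 4)
X_emb = impl.fit(X_demo).transform(X_demo)  # sparse one-hot leaf encoding
print(X_emb.shape)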
Example #5
class RandomTreesEmbeddingTransformation(Transformer):
    def __init__(self,
                 n_estimators=10,
                 max_depth=5,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 min_weight_fraction_leaf=1.0,
                 max_leaf_nodes='None',
                 sparse_output=True,
                 bootstrap='False',
                 n_jobs=-1,
                 random_state=1):
        super().__init__("random_trees_embedding", 18)
        self.input_type = [NUMERICAL, DISCRETE, CATEGORICAL]
        self.compound_mode = 'only_new'
        self.output_type = CATEGORICAL

        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.max_leaf_nodes = max_leaf_nodes
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.bootstrap = bootstrap
        self.sparse_output = sparse_output
        self.n_jobs = n_jobs
        self.random_state = random_state

    @ease_trans
    def operate(self, input_datanode: DataNode, target_fields=None):
        from sklearn.ensemble import RandomTreesEmbedding

        X, y = input_datanode.data
        if target_fields is None:
            target_fields = collect_fields(input_datanode.feature_types,
                                           self.input_type)
        X_new = X[:, target_fields]
        if not self.model:
            self.n_estimators = int(self.n_estimators)
            if check_none(self.max_depth):
                self.max_depth = None
            else:
                self.max_depth = int(self.max_depth)

            # Skip heavy computation on large datasets: cap max_depth at 6.
            if X.shape[0] > 5000 and self.max_depth is not None:
                self.max_depth = min(6, self.max_depth)

            self.min_samples_split = int(self.min_samples_split)
            self.min_samples_leaf = int(self.min_samples_leaf)
            if check_none(self.max_leaf_nodes):
                self.max_leaf_nodes = None
            else:
                self.max_leaf_nodes = int(self.max_leaf_nodes)
            self.min_weight_fraction_leaf = float(
                self.min_weight_fraction_leaf)
            self.bootstrap = check_for_bool(self.bootstrap)

            self.model = RandomTreesEmbedding(
                n_estimators=self.n_estimators,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                max_leaf_nodes=self.max_leaf_nodes,
                sparse_output=self.sparse_output,
                n_jobs=self.n_jobs,
                random_state=self.random_state)

            self.model.fit(X_new)

        _X = self.model.transform(X_new).toarray()

        return _X

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None,
                                        optimizer='smac'):
        n_estimators = UniformIntegerHyperparameter(name="n_estimators",
                                                    lower=10,
                                                    upper=100,
                                                    default_value=10)
        max_depth = UniformIntegerHyperparameter(name="max_depth",
                                                 lower=2,
                                                 upper=10,
                                                 default_value=5)
        min_samples_split = UniformIntegerHyperparameter(
            name="min_samples_split", lower=2, upper=20, default_value=2)
        min_samples_leaf = UniformIntegerHyperparameter(
            name="min_samples_leaf", lower=1, upper=20, default_value=1)
        min_weight_fraction_leaf = Constant('min_weight_fraction_leaf', 1.0)
        max_leaf_nodes = UnParametrizedHyperparameter(name="max_leaf_nodes",
                                                      value="None")
        bootstrap = CategoricalHyperparameter('bootstrap', ['True', 'False'])
        cs = ConfigurationSpace()
        cs.add_hyperparameters([
            n_estimators, max_depth, min_samples_split, min_samples_leaf,
            min_weight_fraction_leaf, max_leaf_nodes, bootstrap
        ])
        return cs
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, RandomTreesEmbedding
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

n_estimator = 10
X, y = make_classification(n_samples=80000)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
# It is important to train the ensemble of trees on a different subset
# of the training data than the linear regression model to avoid
# overfitting, in particular if the total number of leaves is
# similar to the number of training samples
X_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train,
                                                            y_train,
                                                            test_size=0.5)

# Unsupervised transformation based on totally random trees
rt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator)
rt_lm = LogisticRegression()
rt.fit(X_train, y_train)
rt_lm.fit(rt.transform(X_train_lr), y_train_lr)

y_pred_rt = rt_lm.predict_proba(rt.transform(X_test))[:, 1]
fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)

# Supervised transformation based on random forests
rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
rf_enc = OneHotEncoder()
rf_lm = LogisticRegression()
rf.fit(X_train, y_train)
rf_enc.fit(rf.apply(X_train))
rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)

y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]
fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)
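The two ROC curves computed above are typically compared on a single plot; a minimal sketch, assuming matplotlib is available:

import matplotlib.pyplot as plt

plt.figure()
plt.plot([0, 1], [0, 1], 'k--')  # chance line
plt.plot(fpr_rt_lm, tpr_rt_lm, label='RT embedding + LR')
plt.plot(fpr_rf_lm, tpr_rf_lm, label='RF apply + LR')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='best')
plt.show()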
def RandomForest_Codebook(num_features, num_descriptors):
    # root folder with images
    folder_name = 'data/Caltech_101/101_ObjectCategories'
    # list of folders of images classes
    class_list = os.listdir(folder_name)
    # macOS: discard '.DS_Store' file
    if '.DS_Store' in class_list:
        class_list.remove('.DS_Store')

    # SIFT feature extractor
    sift = cv2.xfeatures2d.SIFT_create()

    # TRAINING
    # list of descriptors
    descriptors_train = []
    raw_train = defaultdict(dict)
    # iterate over image classes
    for c in range(len(class_list)):
        # subfolder pointer
        sub_folder_name = os.path.join(folder_name, class_list[c])
        # filter non-image files out
        img_list = glob.glob(os.path.join(sub_folder_name, '*.jpg'))
        # shuffle images to break correlation
        np.random.shuffle(img_list)
        # training examples
        img_train = img_list[:15]
        # iterate over image samples of a class
        for i in range(len(img_train)):
            # fetch image sample
            raw_img = cv2.imread(img_train[i])
            img = raw_img.copy()
            # convert to gray scale for SIFT compatibility
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            # apply SIFT algorithm
            kp, des = sift.detectAndCompute(gray, None)
            # store descriptors
            raw_train[c][i] = des
            for d in des:
                descriptors_train.append(d)
    # NumPy-friendly array of descriptors
    descriptors_train = np.asarray(descriptors_train)
    # random selection of descriptors WITHOUT REPLACEMENT
    descriptors_random = descriptors_train[np.random.choice(
        len(descriptors_train), min(len(descriptors_train),
                                    num_descriptors),
        replace=False)]

    # TESTING
    raw_test = defaultdict(dict)
    # iterate over image classes
    for c in range(len(class_list)):
        # subfolder pointer
        sub_folder_name = os.path.join(folder_name, class_list[c])
        # filter non-image files out
        img_list = glob.glob(os.path.join(sub_folder_name, '*.jpg'))
        # testing examples
        img_test = img_list[15:30]
        # iterate over image samples of a class
        for i in range(len(img_test)):
            # fetch image sample
            raw_img = cv2.imread(img_test[i])
            img = raw_img.copy()
            # convert to gray scale for SIFT compatibility
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            # apply SIFT algorithm
            kp, des = sift.detectAndCompute(gray, None)
            # store descriptors
            raw_test[c][i] = des

    # Random-trees-embedding codebook (totally random trees, not K-Means)
    codebook_algorithm = RandomTreesEmbedding(
        n_estimators=num_features).fit(descriptors_random)

    n_out = codebook_algorithm.transform(raw_train[0][0]).sum(axis=0).shape[1]

    # vector quantisation
    data_train = np.zeros(
        (len(class_list)*15, n_out+1))

    for i in range(len(class_list)):
        for j in range(15):
            # set features
            data_train[15 * (i)+j, :-1] = codebook_algorithm.transform(
                raw_train[i][j]).sum(axis=0).ravel()
            # set label
            data_train[15*(i)+j, -1] = i

    # vector quantisation
    data_query = np.zeros(
        (len(class_list)*15, n_out+1))

    for i in range(len(class_list)):
        for j in range(15):
            # set features
            data_query[15 *
                       (i)+j, :-1] = codebook_algorithm.transform(
                raw_test[i][j]).sum(axis=0).ravel()
            # set label
            data_query[15*(i)+j, -1] = i

    return data_train, data_query
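A hypothetical downstream use of the returned histograms, training a simple classifier on the codebook features (the last column holds the label, as set above):

from sklearn.neighbors import KNeighborsClassifier

data_train, data_query = RandomForest_Codebook(num_features=10,
                                               num_descriptors=100000)
knn = KNeighborsClassifier().fit(data_train[:, :-1], data_train[:, -1])
print(knn.score(data_query[:, :-1], data_query[:, -1]))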
Example #8
bootstrap=True,
# whether sub-datasets are drawn with replacement (bootstrap); default True
oob_score=False,
n_jobs=None,
random_state=None,
verbose=0,
warm_start=False,
class_weight=None)
'''
algo = RandomTreesEmbedding(n_estimators=100, max_depth=3)

# 5. Train the model
algo.fit(x_train, y_train)

# 6. Get the expanded features directly from the model
x_train2 = algo.transform(x_train)
x_test2 = algo.transform(x_test)
print('Size before expansion: {}, after expansion: {}'.format(x_train.shape, x_train2.shape))
print('Size before expansion: {}, after expansion: {}'.format(x_test.shape, x_test2.shape))

# 8. Visualize the random forest
print('Number of sub-models in the random forest: {}'.format(len(algo.estimators_)))
# Method 2: use the pydotplus plugin to generate pdf files directly
from sklearn import tree
import pydotplus

# feature_names / class_names give the name info for the feature and target attributes
for i in range(len(algo.estimators_)):
    dt = algo.estimators_[i]
    dot_data = tree.export_graphviz(decision_tree=dt, out_file=None)
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_pdf('tree_{}.pdf'.format(i))
Example #9
def rt_embedding(training_features, testing_features):
    rt = RandomTreesEmbedding()
    rt.fit(training_features)
    testing_features = rt.transform(testing_features)
    training_features = rt.transform(training_features)
    return training_features, testing_features
# The dimensionality of the resulting representation is
# n_out <= n_estimators * max_leaf_nodes. If max_leaf_nodes == None, the
# number of leaf nodes is at most n_estimators * 2 ** max_depth.

data_tr_rf = []
data_te_rf = []
codebook = RandomTreesEmbedding(n_estimators=100,
                                max_depth=20,
                                min_samples_split=3,
                                max_leaf_nodes=50,
                                n_jobs=-1).fit(desc_sel)

for nem in desc_tr.keys():  #keys are the same for training and test
    i = 0
    while i < len(desc_tr[nem]):
        #training data
        this_col = desc_tr[nem][i]  #get the image we want
        hp = codebook.transform(this_col)
        hp2 = np.asarray(hp.sum(axis=0).ravel()).flatten()
        data_tr_rf.append(hp2)

        #test data
        this_img = desc_te[nem][i]
        hp = codebook.transform(this_img)
        hp2 = np.asarray(hp.sum(axis=0).ravel()).flatten()
        data_te_rf.append(hp2)

        i = i + 1

len(data_tr_rf[0])

treenos = [2, 5, 10, 20, 50, 100, 200]
max_depth = [5, 10, 25, 50]
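A quick check of the dimensionality bound noted above (n_out <= n_estimators * max_leaf_nodes); a sketch on toy data:

import numpy as np
from sklearn.ensemble import RandomTreesEmbedding

X_toy = np.random.rand(200, 8)
emb = RandomTreesEmbedding(n_estimators=100, max_leaf_nodes=50).fit(X_toy)
n_out = emb.transform(X_toy).shape[1]
assert n_out <= 100 * 50
print(n_out)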
def random_forest_embedding():
    import numpy as np
    import matplotlib.pyplot as plt

    from sklearn.datasets import make_circles
    from sklearn.ensemble import RandomTreesEmbedding, ExtraTreesClassifier
    from sklearn.decomposition import TruncatedSVD
    from sklearn.naive_bayes import BernoulliNB

    # Build the dataset
    X, y = make_circles(factor=0.5, random_state=0, noise=0.05)
    # X is 100 x 2, y is 100 x 1 (an array of 0s and 1s)

    # Transform data: set the parameters and build the model
    hasher = RandomTreesEmbedding(n_estimators=10, random_state=0, max_depth=3)
    X_transformed = hasher.fit_transform(X)
    # X_transformed is roughly 100 x 74 -- why does a high-dimensional sparse
    # representation help classification?
    # RandomTreesEmbedding provides a way to map data to a very high-dimensional,
    # sparse representation, which might be beneficial for classification.

    pca = TruncatedSVD(n_components=2)
    X_reduced = pca.fit_transform(X_transformed)  # X_reduced is 100 x 2

    # Learn a Naive Bayes classifier on the transformed data
    nb = BernoulliNB()
    nb.fit(X_transformed, y)  # train on the high-dimensional sparse matrix and y

    # Learn an ExtraTreesClassifier for comparison
    trees = ExtraTreesClassifier(max_depth=3, n_estimators=10, random_state=0)
    trees.fit(X, y)  # trained on the original 2-d X and y

    # Scatter plot of original and reduced data
    fig = plt.figure(figsize=(9, 8))
    ax = plt.subplot(221)
    # X[:, 0] is the x coordinate, X[:, 1] the y coordinate, y the label
    ax.scatter(X[:, 0], X[:, 1], c=y, s=50)
    ax.set_title("Original Data (2d)")
    ax.set_xticks(())
    ax.set_yticks(())

    ax = plt.subplot(222)
    # Although X has been transformed, the labels are unchanged, so they can be
    # used to judge how well the transform separates the classes.
    ax.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y, s=50)
    ax.set_title("PCA reduction (2d) of transformed data (%dd)" % X_transformed.shape[1])
    ax.set_xticks(())
    ax.set_yticks(())

    # Plot the decision in original space
    h = 0.01
    x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
    y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    # Transform the grid using RandomTreesEmbedding, then predict with nb
    transformed_grid = hasher.transform(np.c_[xx.ravel(), yy.ravel()])
    y_grid_pred = nb.predict_proba(transformed_grid)[:, 1]

    ax = plt.subplot(223)
    ax.set_title("Naive Bayes on Transformed data")
    ax.pcolormesh(xx, yy, y_grid_pred.reshape(xx.shape))
    ax.scatter(X[:, 0], X[:, 1], c=y, s=50)
    ax.set_ylim(-1.4, 1.4)
    ax.set_xlim(-1.4, 1.4)
    ax.set_xticks(())
    ax.set_yticks(())

    # Predict on the grid with the ExtraTreesClassifier
    y_grid_pred = trees.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

    ax = plt.subplot(224)
    ax.set_title("ExtraTrees predictions")
    ax.pcolormesh(xx, yy, y_grid_pred.reshape(xx.shape))
    ax.scatter(X[:, 0], X[:, 1], c=y, s=50)
    ax.set_ylim(-1.4, 1.4)
    ax.set_xlim(-1.4, 1.4)
    ax.set_xticks(())
    ax.set_yticks(())

    plt.tight_layout()
    plt.show()
ax = plt.subplot(222)
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y, s=50, edgecolor='k')
ax.set_title("Truncated SVD reduction (2d) of transformed data (%dd)" %
             X_transformed.shape[1])
ax.set_xticks(())
ax.set_yticks(())

# Plot the decision in original space. For that, we will assign a color
# to each point in the mesh [x_min, x_max]x[y_min, y_max].
h = .01
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# transform grid using RandomTreesEmbedding
transformed_grid = hasher.transform(np.c_[xx.ravel(), yy.ravel()])
y_grid_pred = nb.predict_proba(transformed_grid)[:, 1]

ax = plt.subplot(223)
ax.set_title("Naive Bayes on Transformed data")
ax.pcolormesh(xx, yy, y_grid_pred.reshape(xx.shape))
ax.scatter(X[:, 0], X[:, 1], c=y, s=50, edgecolor='k')
ax.set_ylim(-1.4, 1.4)
ax.set_xlim(-1.4, 1.4)
ax.set_xticks(())
ax.set_yticks(())

# transform grid using ExtraTreesClassifier
y_grid_pred = trees.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

ax = plt.subplot(224)
Example #14
class UnsupervisedVisualBagClassifier(Classifier):
    """
    ===============================
    UnsupervisedVisualBagClassifier
    ===============================
    1. Unsupervised
    2. Binary bag of words
    3. Totally random trees
    """

    def __init__(self, coordinator, base_classifier, n_estimators=10,
                 max_depth=5, min_samples_split=2, min_samples_leaf=1,
                 n_jobs=-1, random_state=None, verbose=0, min_density=None):
        Classifier.__init__(self, coordinator, base_classifier)
        self.histoSize = 0
        self._visualBagger = RandomTreesEmbedding(n_estimators=n_estimators,
                                                  max_depth=max_depth,
                                                  min_samples_split=min_samples_split,
                                                  min_samples_leaf=min_samples_leaf,
                                                  n_jobs=n_jobs,
                                                  random_state=random_state,
                                                  verbose=verbose,
                                                  min_density=min_density)


    def _preprocess(self, image_buffer, learningPhase):
        if learningPhase:
            self.setTask(1, "Extracting the features (model creation)")
        else:
            self.setTask(1, "Extracting the features (prediction)")

        X_pred, y = self._coord.process(image_buffer,
                                        learningPhase=learningPhase)

        y_user = self._convertLabel(y)

        #Cleaning up
        self._coord.clean(y)
        del y

        self.endTask()

        #Bag-of-word transformation
        self.setTask(1, "Transforming data into bag-of-words (Tree part)")

        X2 = None
        if learningPhase:
            X2 = self._visualBagger.fit_transform(X_pred, y_user)
            self.histoSize = X2.shape[1]
        else:
            X2 = self._visualBagger.transform(X_pred)

        #Cleaning up
        self._coord.clean(X_pred)
        del X_pred
        del y_user

        self.endTask()

        nbFactor = X2.shape[0] // len(image_buffer)

        if not sps.isspmatrix_csr(X2):
            X2 = X2.tocsr()

        if nbFactor == 1:
            return X2

        self.setTask(len(image_buffer), "Transforming data into bag-of-words (Histogram part)")
        nbTrees = self._visualBagger.n_estimators
        X3 = computeHistogram(len(image_buffer), nbFactor, nbTrees, X2)
        self.endTask()

        #Cleaning up
        del X2  # Should be useless

        return X3

    def fit_histogram(self, hist, y):
        #Delegating the classification
        self.setTask(1, "Learning the model")

        self._classifier.fit(hist, y)

        self.endTask()

        return self

    def fit(self, image_buffer):
        """
        Fits the data contained in the :class:`ImageBuffer` instance

        Parameters
        -----------
        image_buffer : :class:`ImageBuffer`
            The data to learn from

        Return
        -------
        self : :class:`Classifier`
            This instance
        """
        #Updating the labels
        y_user = image_buffer.getLabels()
        self._buildLUT(y_user)
        y = self._convertLabel(y_user)

        X = self._preprocess(image_buffer, learningPhase=True)

        return self.fit_histogram(X, y)

    def predict(self, image_buffer):
        """
        Classify the data contained in the :class:`ImageBuffer` instance

        Parameters
        -----------
        image_buffer : :class:`ImageBuffer`
            The data to classify

        Return
        -------
        list : list of int
            each entry is the classification label corresponding to the input
        """

        X = self._preprocess(image_buffer, learningPhase=False)
        y_classif = self._classifier.predict(X)
        return self._convertLabelsBackToUser(y_classif)

    def predict_proba(self, image_buffer):
        """
        Softly classify the data contained in the :class:`ImageBuffer`
        instance, i.e. yield a probability vector of belonging to each
        class.

        Parameters
        -----------
        image_buffer : :class:`ImageBuffer`
            The data to classify

        Return
        -------
        list : list of list of float
            each entry is the probability vector of the input of the same
            index as computed by the base classifier
        """
        if not hasattr(self._classifier, "predict_proba"):
            #Early error
            self._classifier.predict_proba(np.zeros((1, 1)))

        X = self._preprocess(image_buffer, learningPhase=False)
        return self._classifier.predict_proba(X)
Example #15
def getCaltech_RandomForest(savefig_images: bool = False,
                            num_features: int = 10,
                            num_descriptors: int = 100000,
                            num_training_samples_per_class: int = 15,
                            num_testing_samples_per_class: int = 15,
                            random_state: int = None,
                            pickle_dump: bool = True) -> Data:
    """Caltech 101 training and testing data generator
    using Random Forest Codebook.

    Parameters
    ----------
    savefig_images: bool
        Save raw training & testing images and their
        SIFT masked grayscale transforms
    num_features: int
        Number of trees (``n_estimators``) in the random-trees codebook
    num_descriptors: int
        Number of SIFT descriptors kept for BoW
    num_training_samples_per_class: int
        Number of samples per class used for training
    num_testing_samples_per_class: int
        Number of samples per class used for testing
    random_state: int
        `np.random.seed` initial state
    pickle_dump: bool
        Whether to cache the generated data to disk via pickle

    Returns
    -------
    data: NamedTuple
        * data_train: numpy.ndarray
        * data_query: numpy.ndarray
    """
    class_list, descriptors_random, raw_train, raw_test, images_train, \
        images_test = getCaltech_pre(num_features, num_descriptors,
                                     num_training_samples_per_class,
                                     num_testing_samples_per_class,
                                     random_state, pickle_dump)

    if savefig_images:
        getCaltech_plot(class_list, images_train, images_test)

    # Random-trees-embedding codebook (totally random trees, not K-Means)
    codebook_algorithm = RandomTreesEmbedding(
        n_estimators=num_features).fit(descriptors_random)

    n_out = codebook_algorithm.transform(raw_train[0][0]).sum(axis=0).shape[1]

    # vector quantisation
    data_train = np.zeros(
        (len(class_list) * num_training_samples_per_class, n_out + 1))

    for i in range(len(class_list)):
        for j in range(num_training_samples_per_class):
            # set features
            data_train[num_training_samples_per_class * (i) +
                       j, :-1] = codebook_algorithm.transform(
                           raw_train[i][j]).sum(axis=0).ravel()
            # set label
            data_train[num_training_samples_per_class * (i) + j, -1] = i

    # vector quantisation
    data_query = np.zeros(
        (len(class_list) * num_testing_samples_per_class, n_out + 1))

    for i in range(len(class_list)):
        for j in range(num_testing_samples_per_class):
            # set features
            data_query[num_testing_samples_per_class * (i) +
                       j, :-1] = codebook_algorithm.transform(
                           raw_test[i][j]).sum(axis=0).ravel()
            # set label
            data_query[num_testing_samples_per_class * (i) + j, -1] = i

    # cache data to avoid recalculation every time
    if pickle_dump:
        pickle.dump(Data(data_train, data_query),
                    open('tmp/models/codebooks/caltech_rf.pkl', 'wb'))

    return Data(data_train, data_query)
mpl.rcParams["font.family"] = 'Arial Unicode MS'

names = ['A', 'B', 'C', 'D', 'cla', ]
df = pd.read_csv('../../data_set/iris.data', names=names)
df.info()

X = df[names[0:-1]]

"""
n_estimators: Any = 100,最终训练的子模型数量
max_depth: Any = 5,最大树深
min_samples_split: Any = 2,树分裂的最小样本数目
min_samples_leaf: Any = 1,叶子节点最小样本数目
min_weight_fraction_leaf: Any = 0.,样本权重的最小加成参数(暂无用)
max_leaf_nodes: Any = None,最多允许的叶子节点数目 None 不限制
min_impurity_decrease: Any = 0.,分裂导致不纯度减少大于等于该值则分裂
min_impurity_split: Any = None,分裂提前停止阈值,一个节点不纯度大于此阈值才能分裂
sparse_output: Any = True,是否返回稀疏矩阵
warm_start: Any = False, 是否预热(重用之前的模型进行训练) 默认否
n_jobs: Any = None, 线程数
random_state: Any = None, 随机数种子
verbose: Any = 0  是否打印训练过程 0不打印 1打印
"""
algo = RandomTreesEmbedding(n_estimators=10, max_depth=3, sparse_output=False)
algo.fit(X)
x_ex = algo.transform(X)

# Show the first few expanded feature rows
for x in x_ex[0:10]:
    print(x)
Example #18
# 10. Other special APIs
print("Sub-model list:\n{}".format(algo.estimators_))

from sklearn import tree
import pydotplus

k = 0
for algo1 in algo.estimators_:
    dot_data = tree.export_graphviz(decision_tree=algo1, out_file=None,
                                    feature_names=['A', 'B', 'C', 'D'],
                                    class_names=['1', '2', '3'],
                                    filled=True, rounded=True,
                                    special_characters=True
                                    )

    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_png('trte_{}.png'.format(k))
    k += 1
    if k > 3:
        break

# Expand the feature dimensionality
print("*" * 100)
x_test2 = x_test.iloc[:2, :]
print(x_test2)
# apply() returns the leaf-node indices
print(algo.apply(x_test2))
# transform() converts the data (essentially apply() plus one-hot encoding)
print(algo.transform(x_test2))
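The apply/one-hot relationship can be checked directly; a sketch assuming algo and x_test2 as above:

leaf_idx = algo.apply(x_test2)      # (n_samples, n_estimators) leaf indices
one_hot = algo.transform(x_test2)   # sparse (n_samples, total number of leaves)
print(leaf_idx.shape, one_hot.shape)
print(one_hot.sum(axis=1))          # one active leaf per tree, so each row sums to n_estimators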
                 random_state=None,
                 verbose=0,
                 warm_start=False):
    """
    algo = RandomTreesEmbedding(n_estimators=100,
                                max_depth=2,
                                sparse_output=True)
    # Train the model
    X_train2 = algo.fit_transform(X_train)
    print(X_train2)

    # Inspect the API attributes
    x_test2 = [[6.9, 3.1, 5.1, 2.3], [6.1, 2.8, 4.0, 1.3],
               [5.2, 3.4, 1.4, 0.2], [4.7, 3.2, 1.6, 0.2]]
    print("Transformed values of the samples:")
    print(algo.transform(x_test2))
    # # Model evaluation
    # print('Accuracy on the training set: {}'.format(algo.score(X_train, Y_train)))
    # print('Accuracy on the test set: {}'.format(algo.score(X_test, Y_test)))

    print("All trained sub-models:\n{}".format(algo.estimators_))

    # Visualize all sub-models
    for k, estimator in enumerate(algo.estimators_):
        dot_data = tree.export_graphviz(decision_tree=estimator,
                                        out_file=None,
                                        feature_names=['f1', 'f2', 'f3', 'f4'],
                                        class_names=['A', 'B', 'C'],
                                        rounded=True,
                                        filled=True,
                                        special_characters=True)
ax = pl.subplot(222)
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y, s=50)
ax.set_title("PCA reduction (2d) of transformed data (%dd)" %
             X_transformed.shape[1])
ax.set_xticks(())
ax.set_yticks(())

# Plot the decision in original space. For that, we will assign a color to each
# point in the mesh [x_min, m_max] x [y_min, y_max].
h = .01
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# transform grid using RandomTreesEmbedding
transformed_grid = hasher.transform(np.c_[xx.ravel(), yy.ravel()])
y_grid_pred = nb.predict_proba(transformed_grid)[:, 1]

ax = pl.subplot(223)
ax.set_title("Naive Bayes on Transformed data")
ax.pcolormesh(xx, yy, y_grid_pred.reshape(xx.shape))
ax.scatter(X[:, 0], X[:, 1], c=y, s=50)
ax.set_ylim(-1.4, 1.4)
ax.set_xlim(-1.4, 1.4)
ax.set_xticks(())
ax.set_yticks(())

# transform grid using ExtraTreesClassifier
y_grid_pred = trees.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

ax = pl.subplot(224)
Example #21
    def post(self, request):

        age = request.data["age"]
        sex = request.data["gen"]
        cp = request.data["c_pain"]
        trestbps = request.data["bp_lvl"]
        chol = request.data["choles"]
        fbs = request.data["bp_fast"]
        restecg = request.data["ecg"]
        talach = request.data["h_rate"]
        exang = request.data["i_exe"]
        oldpeak = request.data["d_exe"]
        slope = request.data["sd_seg"]
        ca = request.data["his"]
        thal = request.data["thal_scn"]

        dspath = settings.BASE_DIR + settings.STATIC_URL + "heart.xlsx"
        data = read_excel(dspath, "heart")
        # data = read_csv("heart.csv")
        X = data.iloc[:, 0:13].values
        y = data.iloc[:, 13].values

        hasher = RandomTreesEmbedding(n_estimators=10,
                                      random_state=0,
                                      max_depth=3)
        X_transformed = hasher.fit_transform(X)

        clf = LogisticRegression(random_state=0).fit(X_transformed, y)

        inp = "#".join([age, sex, cp, trestbps, chol, fbs, restecg,
                        talach, exang, oldpeak, slope, ca, thal])
        import numpy as np
        # np.fromstring with np.float is deprecated; parse the string explicitly.
        inpa = np.array(inp.split('#'), dtype=float)

        transformed_grid = hasher.transform([inpa])

        o = clf.predict(transformed_grid)
        print(o)

        obj = AddMedicalRecord()
        obj.uid = request.data["uid"]
        obj.date = datetime.date.today()
        # obj.date = "2020-02-02"
        if o == [1]:
            obj.result = "HEART PATIENT"
        if o == [0]:
            obj.result = "NO HEART DISEASE"

        dspath = settings.BASE_DIR + settings.STATIC_URL + "heart.xlsx"
        data = read_excel(dspath, "heart")
        # data = read_csv("heart.csv")
        X = data.iloc[:, 0:13].values
        y = data.iloc[:, 13].values
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=.3,
                                                            random_state=0)
        hasher = RandomTreesEmbedding(n_estimators=10,
                                      random_state=0,
                                      max_depth=3)
        X_transformed = hasher.fit_transform(X_train)

        clf = LogisticRegression(random_state=0).fit(X_transformed, y_train)
        # Transform the test set with the hasher fitted on the training data;
        # refitting here would yield an inconsistent encoding.
        X_test = hasher.transform(X_test)
        y_score = clf.predict(X_test)

        sc = accuracy_score(y_test, y_score)

        obj.accu = sc * 100
        obj.save()

        return HttpResponse("Success with Acc :" + str(sc))