def get_new_model(self, X, Y, corr_mat):
    # Sample a tree depth uniformly from [1, log2(n_samples)].
    max_depth = np.log2(X.shape[0])
    depth = np.random.randint(int(max_depth)) + 1

    # Fit a single totally random tree and hash both datasets into
    # leaf-indicator matrices.
    hasher = RandomTreesEmbedding(n_estimators=1, max_depth=depth)
    hasher.fit(X)
    x_trans_dense = hasher.transform(X).toarray()
    y_trans_dense = hasher.transform(Y).toarray()

    # For every leaf (column), increment corr_mat[i, j] whenever row i of Y
    # and row j of X fall into the same leaf.
    for i in range(x_trans_dense.shape[1]):
        index_array_x = np.where(x_trans_dense[:, i] == 1.0)[0]
        index_array_y = np.where(y_trans_dense[:, i] == 1.0)[0]
        for idx in index_array_y:
            corr_mat[idx, index_array_x] += 1

    # corr_mat is updated in place; nothing is returned.
    return
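# A self-contained sketch of the leaf co-occurrence counting performed by
# get_new_model above; the toy arrays are assumptions for illustration, not
# from the source. The matrix product is equivalent to the per-leaf double
# loop in the method.
import numpy as np
from sklearn.ensemble import RandomTreesEmbedding

X = np.random.rand(20, 3)
Y = np.random.rand(10, 3)
corr_mat = np.zeros((Y.shape[0], X.shape[0]))

hasher = RandomTreesEmbedding(n_estimators=1, max_depth=3).fit(X)
x_leaves = hasher.transform(X).toarray()
y_leaves = hasher.transform(Y).toarray()
# corr_mat[i, j] counts how often row i of Y and row j of X share a leaf.
corr_mat += y_leaves @ x_leaves.T
print(corr_mat)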
class RandomTreesEmbeddingPrim(primitive):
    def __init__(self, random_state=0):
        super(RandomTreesEmbeddingPrim, self).__init__(name='RandomTreesEmbedding')
        self.id = 54
        self.PCA_LAPACK_Prim = []
        self.type = 'feature engineering'
        self.description = "RandomTreesEmbedding: an unsupervised transformation that encodes data by the leaves of an ensemble of totally random trees."
        self.hyperparams_run = {'default': True}
        # Attribute name kept for framework compatibility; it holds the
        # RandomTreesEmbedding model, not a PCA.
        self.pca = RandomTreesEmbedding(random_state=random_state)
        self.accept_type = 'c_t'

    def can_accept(self, data):
        return self.can_accept_c(data)

    def is_needed(self, data):
        return True

    def fit(self, data):
        data = handle_data(data)
        self.pca.fit(data['X'])

    def produce(self, data):
        output = handle_data(data)
        cols = list(output['X'].columns)
        code = ''.join(word[0] for word in cols)[:10]
        result = self.pca.transform(output['X']).toarray()
        new_cols = list(map(str, range(result.shape[1])))
        cols = ["{}_rfembdng{}".format(x, code) for x in new_cols]
        output['X'] = pd.DataFrame(result, columns=cols)
        final_output = {0: output}
        return final_output
def test_random_trees_embedding(self):
    X, _ = make_regression(
        n_features=5, n_samples=100, n_targets=1, random_state=42,
        n_informative=3)
    X = X.astype(numpy.float32)
    model = RandomTreesEmbedding(
        n_estimators=3, max_depth=2, sparse_output=False).fit(X)
    model.transform(X)
    model_onnx = to_onnx(model, X[:1], target_opset=TARGET_OPSET)
    with open("model.onnx", "wb") as f:
        f.write(model_onnx.SerializeToString())
    self.check_model(model_onnx, X)
    dump_data_and_model(
        X.astype(numpy.float32), model, model_onnx,
        basename="SklearnRandomTreesEmbedding")
class _RandomTreesEmbeddingImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
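# A hedged usage sketch of the wrapper above, assuming Op is bound to
# sklearn.ensemble.RandomTreesEmbedding as in the surrounding code; the toy
# data is an assumption for illustration.
import numpy as np
from sklearn.ensemble import RandomTreesEmbedding as Op

model = _RandomTreesEmbeddingImpl(n_estimators=5, max_depth=3)
X = np.random.rand(30, 4)
emb = model.fit(X).transform(X)
print(emb.shape)  # sparse leaf-indicator matrix: one column per leaf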
class RandomTreesEmbeddingTransformation(Transformer):
    def __init__(self, n_estimators=10, max_depth=5, min_samples_split=2,
                 min_samples_leaf=1, min_weight_fraction_leaf=1.0,
                 max_leaf_nodes='None', sparse_output=True, bootstrap='False',
                 n_jobs=-1, random_state=1):
        super().__init__("random_trees_embedding", 18)
        self.input_type = [NUMERICAL, DISCRETE, CATEGORICAL]
        self.compound_mode = 'only_new'
        self.output_type = CATEGORICAL

        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.max_leaf_nodes = max_leaf_nodes
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.bootstrap = bootstrap
        self.sparse_output = sparse_output
        self.n_jobs = n_jobs
        self.random_state = random_state

    @ease_trans
    def operate(self, input_datanode: DataNode, target_fields=None):
        from sklearn.ensemble import RandomTreesEmbedding

        X, y = input_datanode.data
        if target_fields is None:
            target_fields = collect_fields(input_datanode.feature_types,
                                           self.input_type)
        X_new = X[:, target_fields]

        if not self.model:
            self.n_estimators = int(self.n_estimators)
            if check_none(self.max_depth):
                self.max_depth = None
            else:
                self.max_depth = int(self.max_depth)
                # Skip heavy computation: cap max_depth at 6 on large datasets.
                if X.shape[0] > 5000:
                    self.max_depth = min(6, self.max_depth)
            self.min_samples_split = int(self.min_samples_split)
            self.min_samples_leaf = int(self.min_samples_leaf)
            if check_none(self.max_leaf_nodes):
                self.max_leaf_nodes = None
            else:
                self.max_leaf_nodes = int(self.max_leaf_nodes)
            self.min_weight_fraction_leaf = float(self.min_weight_fraction_leaf)
            self.bootstrap = check_for_bool(self.bootstrap)

            self.model = RandomTreesEmbedding(
                n_estimators=self.n_estimators,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                max_leaf_nodes=self.max_leaf_nodes,
                sparse_output=self.sparse_output,
                n_jobs=self.n_jobs,
                random_state=self.random_state)
            self.model.fit(X_new)

        _X = self.model.transform(X_new).toarray()
        return _X

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None,
                                        optimizer='smac'):
        n_estimators = UniformIntegerHyperparameter(
            name="n_estimators", lower=10, upper=100, default_value=10)
        max_depth = UniformIntegerHyperparameter(
            name="max_depth", lower=2, upper=10, default_value=5)
        min_samples_split = UniformIntegerHyperparameter(
            name="min_samples_split", lower=2, upper=20, default_value=2)
        min_samples_leaf = UniformIntegerHyperparameter(
            name="min_samples_leaf", lower=1, upper=20, default_value=1)
        min_weight_fraction_leaf = Constant('min_weight_fraction_leaf', 1.0)
        max_leaf_nodes = UnParametrizedHyperparameter(
            name="max_leaf_nodes", value="None")
        bootstrap = CategoricalHyperparameter('bootstrap', ['True', 'False'])

        cs = ConfigurationSpace()
        cs.add_hyperparameters([
            n_estimators, max_depth, min_samples_split, min_samples_leaf,
            min_weight_fraction_leaf, max_leaf_nodes, bootstrap
        ])
        return cs
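# A hedged sketch of sampling from the search space defined above; requires
# the ConfigSpace package, and the seed value is an assumption.
cs = RandomTreesEmbeddingTransformation.get_hyperparameter_search_space()
cs.seed(1)
config = cs.sample_configuration()
print(config)  # one random hyperparameter configuration drawn from the space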
n_estimator = 10
X, y = make_classification(n_samples=80000)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

# It is important to train the ensemble of trees on a different subset
# of the training data than the linear regression model to avoid
# overfitting, in particular if the total number of leaves is
# similar to the number of training samples.
X_train, X_train_lr, y_train, y_train_lr = train_test_split(
    X_train, y_train, test_size=0.5)

# Unsupervised transformation based on totally random trees
rt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator)
rt_lm = LogisticRegression()
rt.fit(X_train, y_train)
rt_lm.fit(rt.transform(X_train_lr), y_train_lr)

y_pred_rt = rt_lm.predict_proba(rt.transform(X_test))[:, 1]
fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)

# Supervised transformation based on random forests
rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
rf_enc = OneHotEncoder()
rf_lm = LogisticRegression()
rf.fit(X_train, y_train)
rf_enc.fit(rf.apply(X_train))
rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)

y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]
fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)
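# A minimal sketch of how the two ROC curves computed above could be
# compared; the plotting details (labels, styling) are assumptions, not from
# the source.
import matplotlib.pyplot as plt

plt.figure()
plt.plot([0, 1], [0, 1], 'k--')  # chance level
plt.plot(fpr_rt_lm, tpr_rt_lm, label='RT embedding + LR')
plt.plot(fpr_rf_lm, tpr_rf_lm, label='RF apply + OHE + LR')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='best')
plt.show()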
def RandomForest_Codebook(num_features, num_descriptors):
    # root folder with images
    folder_name = 'data/Caltech_101/101_ObjectCategories'
    # list of folders of image classes
    class_list = os.listdir(folder_name)
    # macOS: discard the '.DS_Store' file
    if '.DS_Store' in class_list:
        class_list.remove('.DS_Store')
    # SIFT feature extractor
    sift = cv2.xfeatures2d.SIFT_create()

    # TRAINING
    # list of descriptors
    descriptors_train = []
    raw_train = defaultdict(dict)
    # iterate over image classes
    for c in range(len(class_list)):
        # subfolder pointer
        sub_folder_name = os.path.join(folder_name, class_list[c])
        # filter non-image files out
        img_list = glob.glob(os.path.join(sub_folder_name, '*.jpg'))
        # shuffle images to break correlation
        np.random.shuffle(img_list)
        # training examples
        img_train = img_list[:15]
        # iterate over image samples of a class
        for i in range(len(img_train)):
            # fetch image sample
            raw_img = cv2.imread(img_train[i])
            img = raw_img.copy()
            # convert to grayscale for SIFT compatibility
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            # apply the SIFT algorithm
            kp, des = sift.detectAndCompute(gray, None)
            # store descriptors
            raw_train[c][i] = des
            for d in des:
                descriptors_train.append(d)

    # NumPy-friendly array of descriptors
    descriptors_train = np.asarray(descriptors_train)
    # random selection of descriptors WITHOUT REPLACEMENT
    descriptors_random = descriptors_train[np.random.choice(
        len(descriptors_train),
        min(len(descriptors_train), num_descriptors),
        replace=False)]

    # TESTING
    raw_test = defaultdict(dict)
    # iterate over image classes
    for c in range(len(class_list)):
        # subfolder pointer
        sub_folder_name = os.path.join(folder_name, class_list[c])
        # filter non-image files out
        img_list = glob.glob(os.path.join(sub_folder_name, '*.jpg'))
        # testing examples
        img_test = img_list[15:30]
        # iterate over image samples of a class
        for i in range(len(img_test)):
            # fetch image sample
            raw_img = cv2.imread(img_test[i])
            img = raw_img.copy()
            # convert to grayscale for SIFT compatibility
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            # apply the SIFT algorithm
            kp, des = sift.detectAndCompute(gray, None)
            # store descriptors
            raw_test[c][i] = des

    # random-trees-embedding codebook
    codebook_algorithm = RandomTreesEmbedding(
        n_estimators=num_features).fit(descriptors_random)
    n_out = codebook_algorithm.transform(raw_train[0][0]).sum(axis=0).shape[1]

    # vector quantisation (training)
    data_train = np.zeros((len(class_list) * 15, n_out + 1))
    for i in range(len(class_list)):
        for j in range(15):
            # set features
            data_train[15 * i + j, :-1] = codebook_algorithm.transform(
                raw_train[i][j]).sum(axis=0).ravel()
            # set label
            data_train[15 * i + j, -1] = i

    # vector quantisation (testing)
    data_query = np.zeros((len(class_list) * 15, n_out + 1))
    for i in range(len(class_list)):
        for j in range(15):
            # set features
            data_query[15 * i + j, :-1] = codebook_algorithm.transform(
                raw_test[i][j]).sum(axis=0).ravel()
            # set label
            data_query[15 * i + j, -1] = i

    return data_train, data_query
    bootstrap=True,    # whether sub-datasets are drawn with replacement; defaults to True
    oob_score=False,
    n_jobs=None,
    random_state=None,
    verbose=0,
    warm_start=False,
    class_weight=None)
'''
algo = RandomTreesEmbedding(n_estimators=100, max_depth=3)

# 5. Train the model
algo.fit(x_train, y_train)

# 6. Fetch the model's expanded features directly
x_train2 = algo.transform(x_train)
x_test2 = algo.transform(x_test)
print('Size before expansion: {}, after expansion: {}'.format(x_train.shape, x_train2.shape))
print('Size before expansion: {}, after expansion: {}'.format(x_test.shape, x_test2.shape))

# 8. Visualize the random forest
print('Number of sub-models in the forest: {}'.format(len(algo.estimators_)))

# Option 2: use the pydotplus plugin to generate image/PDF files directly
from sklearn import tree
import pydotplus

# feature_names / class_names give the names of the input features and target classes
for i in range(len(algo.estimators_)):
    dt = algo.estimators_[i]
    dot_data = tree.export_graphviz(
        decision_tree=dt,
def rt_embedding(training_features, testing_features):
    rt = RandomTreesEmbedding()
    rt.fit(training_features)
    testing_features = rt.transform(testing_features)
    training_features = rt.transform(training_features)
    return training_features, testing_features
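# A hedged usage sketch for rt_embedding; the toy data below is an assumption
# for illustration, not from the source.
import numpy as np
from sklearn.ensemble import RandomTreesEmbedding

X_train = np.random.rand(100, 4)
X_test = np.random.rand(20, 4)
X_train_emb, X_test_emb = rt_embedding(X_train, X_test)
# Both outputs are sparse leaf-indicator matrices from the same fitted embedding.
print(X_train_emb.shape, X_test_emb.shape)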
# The dimensionality of the resulting representation is
# n_out <= n_estimators * max_leaf_nodes. If max_leaf_nodes == None, the
# number of leaf nodes is at most n_estimators * 2 ** max_depth.
data_tr_rf = []
data_te_rf = []
codebook = RandomTreesEmbedding(n_estimators=100, max_depth=20,
                                min_samples_split=3, max_leaf_nodes=50,
                                n_jobs=-1).fit(desc_sel)

for nem in desc_tr.keys():  # keys are the same for training and test
    i = 0
    while i < len(desc_tr[nem]):
        # training data
        this_col = desc_tr[nem][i]  # get the image we want
        hp = codebook.transform(this_col)
        hp2 = np.asarray(hp.sum(axis=0).ravel()).flatten()
        data_tr_rf.append(hp2)
        # test data
        this_img = desc_te[nem][i]
        hp = codebook.transform(this_img)
        hp2 = np.asarray(hp.sum(axis=0).ravel()).flatten()
        data_te_rf.append(hp2)
        i = i + 1

len(data_tr_rf[0])

treenos = [2, 5, 10, 20, 50, 100, 200]
max_depth = [5, 10, 25, 50]
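# A hedged sketch of sweeping the treenos / max_depth grids defined above; it
# reuses desc_sel from the snippet, and the histogram rebuild and scoring are
# left as comments because the downstream classifier is not given in the
# source.
for n_trees in treenos:
    for depth in max_depth:
        cb = RandomTreesEmbedding(n_estimators=n_trees, max_depth=depth,
                                  min_samples_split=3, max_leaf_nodes=50,
                                  n_jobs=-1).fit(desc_sel)
        # ... rebuild data_tr_rf / data_te_rf with `cb` as above, then
        # evaluate a downstream classifier on the resulting histograms.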
def random_forest_embedding():
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.datasets import make_circles
    from sklearn.ensemble import RandomTreesEmbedding, ExtraTreesClassifier
    from sklearn.decomposition import TruncatedSVD
    from sklearn.naive_bayes import BernoulliNB

    # Build the dataset: X is 100 x 2, y is a length-100 array of 0/1 labels.
    X, y = make_circles(factor=0.5, random_state=0, noise=0.05)

    # Transform the data. Why does the high-dimensional sparse representation
    # help? RandomTreesEmbedding provides a way to map data to a very
    # high-dimensional, sparse representation, which might be beneficial
    # for classification.
    hasher = RandomTreesEmbedding(n_estimators=10, random_state=0, max_depth=3)
    X_transformed = hasher.fit_transform(X)  # e.g. 100 x 74

    pca = TruncatedSVD(n_components=2)
    X_reduced = pca.fit_transform(X_transformed)  # X_reduced is 100 x 2

    # Learn a naive Bayes classifier on the transformed data
    nb = BernoulliNB()
    nb.fit(X_transformed, y)  # train on the high-dimensional sparse matrix

    # Learn an ExtraTreesClassifier for comparison, trained on the raw 2-d X
    trees = ExtraTreesClassifier(max_depth=3, n_estimators=10, random_state=0)
    trees.fit(X, y)

    # Scatter plot of original and reduced data
    fig = plt.figure(figsize=(9, 8))
    ax = plt.subplot(221)
    ax.scatter(X[:, 0], X[:, 1], c=y, s=50)  # color points by label
    ax.set_title("Original Data (2d)")
    ax.set_xticks(())
    ax.set_yticks(())

    # The labels are unchanged by the transformation, so they can be used to
    # judge how well the embedding separates the classes.
    ax = plt.subplot(222)
    ax.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y, s=50)
    ax.set_title("PCA reduction (2d) of transformed data (%dd)"
                 % X_transformed.shape[1])
    ax.set_xticks(())
    ax.set_yticks(())

    # Plot the decision boundary in the original space
    h = 0.01
    x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
    y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # Transform the grid using RandomTreesEmbedding and predict with nb
    transformed_grid = hasher.transform(np.c_[xx.ravel(), yy.ravel()])
    y_grid_pred = nb.predict_proba(transformed_grid)[:, 1]

    ax = plt.subplot(223)
    ax.set_title("Naive Bayes on Transformed data")
    ax.pcolormesh(xx, yy, y_grid_pred.reshape(xx.shape))
    ax.scatter(X[:, 0], X[:, 1], c=y, s=50)
    ax.set_ylim(-1.4, 1.4)
    ax.set_xlim(-1.4, 1.4)
    ax.set_xticks(())
    ax.set_yticks(())

    # Predict on the grid with the ExtraTreesClassifier
    y_grid_pred = trees.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

    ax = plt.subplot(224)
    ax.set_title("ExtraTrees predictions")
    ax.pcolormesh(xx, yy, y_grid_pred.reshape(xx.shape))
    ax.scatter(X[:, 0], X[:, 1], c=y, s=50)
    ax.set_ylim(-1.4, 1.4)
    ax.set_xlim(-1.4, 1.4)
    ax.set_xticks(())
    ax.set_yticks(())

    plt.tight_layout()
    plt.show()
ax = plt.subplot(222)
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y, s=50, edgecolor='k')
ax.set_title("Truncated SVD reduction (2d) of transformed data (%dd)"
             % X_transformed.shape[1])
ax.set_xticks(())
ax.set_yticks(())

# Plot the decision in original space. For that, we will assign a color
# to each point in the mesh [x_min, x_max] x [y_min, y_max].
h = .01
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# transform grid using RandomTreesEmbedding
transformed_grid = hasher.transform(np.c_[xx.ravel(), yy.ravel()])
y_grid_pred = nb.predict_proba(transformed_grid)[:, 1]

ax = plt.subplot(223)
ax.set_title("Naive Bayes on Transformed data")
ax.pcolormesh(xx, yy, y_grid_pred.reshape(xx.shape))
ax.scatter(X[:, 0], X[:, 1], c=y, s=50, edgecolor='k')
ax.set_ylim(-1.4, 1.4)
ax.set_xlim(-1.4, 1.4)
ax.set_xticks(())
ax.set_yticks(())

# transform grid using ExtraTreesClassifier
y_grid_pred = trees.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
ax = plt.subplot(224)
class UnsupervisedVisualBagClassifier(Classifier):
    """
    ===============================
    UnsupervisedVisualBagClassifier
    ===============================
    1. Unsupervised
    2. Binary bag of words
    3. Totally random trees
    """

    def __init__(self, coordinator, base_classifier, n_estimators=10,
                 max_depth=5, min_samples_split=2, min_samples_leaf=1,
                 n_jobs=-1, random_state=None, verbose=0, min_density=None):
        Classifier.__init__(self, coordinator, base_classifier)
        self.histoSize = 0
        self._visualBagger = RandomTreesEmbedding(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose,
            min_density=min_density)

    def _preprocess(self, image_buffer, learningPhase):
        if learningPhase:
            self.setTask(1, "Extracting the features (model creation)")
        else:
            self.setTask(1, "Extracting the features (prediction)")

        X_pred, y = self._coord.process(image_buffer,
                                        learningPhase=learningPhase)
        y_user = self._convertLabel(y)

        # Cleaning up
        self._coord.clean(y)
        del y
        self.endTask()

        # Bag-of-words transformation
        self.setTask(1, "Transforming data into bag-of-words (Tree part)")
        X2 = None
        if learningPhase:
            X2 = self._visualBagger.fit_transform(X_pred, y_user)
            self.histoSize = X2.shape[1]
        else:
            X2 = self._visualBagger.transform(X_pred)

        # Cleaning up
        self._coord.clean(X_pred)
        del X_pred
        del y_user
        self.endTask()

        nbFactor = X2.shape[0] // len(image_buffer)

        if not sps.isspmatrix_csr(X2):
            X2 = X2.tocsr()

        if nbFactor == 1:
            return X2

        self.setTask(len(image_buffer),
                     "Transforming data into bag-of-words (Histogram part)")
        nbTrees = self._visualBagger.n_estimators
        X3 = computeHistogram(len(image_buffer), nbFactor, nbTrees, X2)
        self.endTask()

        # Cleaning up
        del X2  # Should be useless

        return X3

    def fit_histogram(self, hist, y):
        # Delegating the classification
        self.setTask(1, "Learning the model")
        self._classifier.fit(hist, y)
        self.endTask()
        return self

    def fit(self, image_buffer):
        """
        Fits the data contained in the :class:`ImageBuffer` instance

        Parameters
        -----------
        image_buffer : :class:`ImageBuffer`
            The data to learn from

        Return
        -------
        self : :class:`Classifier`
            This instance
        """
        # Updating the labels
        y_user = image_buffer.getLabels()
        self._buildLUT(y_user)
        y = self._convertLabel(y_user)

        X = self._preprocess(image_buffer, learningPhase=True)
        return self.fit_histogram(X, y)

    def predict(self, image_buffer):
        """
        Classify the data contained in the :class:`ImageBuffer` instance

        Parameters
        -----------
        image_buffer : :class:`ImageBuffer`
            The data to classify

        Return
        -------
        list : list of int
            each entry is the classification label corresponding to the input
        """
        X = self._preprocess(image_buffer, learningPhase=False)
        y_classif = self._classifier.predict(X)
        return self._convertLabelsBackToUser(y_classif)

    def predict_proba(self, image_buffer):
        """
        Classify softly the data contained in the :class:`ImageBuffer`
        instance, i.e. yields a probability vector of belonging to each class

        Parameters
        -----------
        image_buffer : :class:`ImageBuffer`
            The data to classify

        Return
        -------
        list : list of list of float
            each entry is the probability vector of the input of the same
            index as computed by the base classifier
        """
        if not hasattr(self._classifier, "predict_proba"):
            # Early error
            self._classifier.predict_proba(np.zeros((1, 1)))

        X = self._preprocess(image_buffer, learningPhase=False)
        return self._classifier.predict_proba(X)
def getCaltech_RandomForest(savefig_images: bool = False,
                            num_features: int = 10,
                            num_descriptors: int = 100000,
                            num_training_samples_per_class: int = 15,
                            num_testing_samples_per_class: int = 15,
                            random_state: int = None,
                            pickle_dump: bool = True) -> Data:
    """Caltech 101 training and testing data generator using a Random Forest
    codebook.

    Parameters
    ----------
    savefig_images: bool
        Save raw training & testing images and their SIFT masked grayscale
        transforms
    num_features: int
        Number of trees in the codebook (`n_estimators`)
    num_descriptors: int
        Number of SIFT descriptors kept for BoW
    num_training_samples_per_class: int
        Number of samples per class used for training
    num_testing_samples_per_class: int
        Number of samples per class used for testing
    random_state: int
        `np.random.seed` initial state
    pickle_dump: bool
        Cache the generated data to disk

    Returns
    -------
    data: NamedTuple
        * data_train: numpy.ndarray
        * data_query: numpy.ndarray
    """
    class_list, descriptors_random, raw_train, raw_test, images_train, \
        images_test = getCaltech_pre(num_features, num_descriptors,
                                     num_training_samples_per_class,
                                     num_testing_samples_per_class,
                                     random_state, pickle_dump)

    if savefig_images:
        getCaltech_plot(class_list, images_train, images_test)

    # random-trees-embedding codebook
    codebook_algorithm = RandomTreesEmbedding(
        n_estimators=num_features).fit(descriptors_random)
    n_out = codebook_algorithm.transform(raw_train[0][0]).sum(axis=0).shape[1]

    # vector quantisation (training)
    data_train = np.zeros(
        (len(class_list) * num_training_samples_per_class, n_out + 1))
    for i in range(len(class_list)):
        for j in range(num_training_samples_per_class):
            # set features
            data_train[num_training_samples_per_class * i + j,
                       :-1] = codebook_algorithm.transform(
                           raw_train[i][j]).sum(axis=0).ravel()
            # set label
            data_train[num_training_samples_per_class * i + j, -1] = i

    # vector quantisation (testing)
    data_query = np.zeros(
        (len(class_list) * num_testing_samples_per_class, n_out + 1))
    for i in range(len(class_list)):
        for j in range(num_testing_samples_per_class):
            # set features
            data_query[num_testing_samples_per_class * i + j,
                       :-1] = codebook_algorithm.transform(
                           raw_test[i][j]).sum(axis=0).ravel()
            # set label
            data_query[num_testing_samples_per_class * i + j, -1] = i

    # cache data to avoid recalculation every time
    if pickle_dump:
        pickle.dump(Data(data_train, data_query),
                    open('tmp/models/codebooks/caltech_rf.pkl', 'wb'))

    return Data(data_train, data_query)
mpl.rcParams["font.family"] = 'Arial Unicode MS'
names = ['A', 'B', 'C', 'D', 'cla']
df = pd.read_csv('../../data_set/iris.data', names=names)
df.info()
X = df[names[0:-1]]

"""
n_estimators: Any = 100          # number of sub-models to train
max_depth: Any = 5               # maximum tree depth
min_samples_split: Any = 2       # minimum number of samples required to split a node
min_samples_leaf: Any = 1        # minimum number of samples required at a leaf node
min_weight_fraction_leaf: Any = 0.   # minimum weighted fraction of samples at a leaf (rarely used)
max_leaf_nodes: Any = None       # maximum number of leaf nodes allowed; None means unlimited
min_impurity_decrease: Any = 0.  # split only if impurity decreases by at least this value
min_impurity_split: Any = None   # early-stopping threshold; a node splits only above this impurity
sparse_output: Any = True        # whether to return a sparse matrix
warm_start: Any = False          # whether to reuse the previous model when fitting (default False)
n_jobs: Any = None               # number of parallel jobs
random_state: Any = None         # random seed
verbose: Any = 0                 # training verbosity: 0 silent, 1 verbose
"""
algo = RandomTreesEmbedding(n_estimators=10, max_depth=3, sparse_output=False)
algo.fit(X)
x_ex = algo.transform(X)  # the expanded feature matrix
for x in x_ex[0:10]:
    print(x)
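# A quick dimensionality check, following the bound quoted earlier in this
# section: with n_estimators=10 and max_depth=3, each tree has at most
# 2 ** 3 = 8 leaves, so the embedding has at most 10 * 8 = 80 columns (the
# exact count depends on the fitted trees).
print(x_ex.shape)  # e.g. (150, 74): 150 iris rows, <= 80 leaf-indicator columns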
# 10. Other special APIs
print("List of sub-models:\n{}".format(algo.estimators_))

from sklearn import tree
import pydotplus

k = 0
for algo1 in algo.estimators_:
    dot_data = tree.export_graphviz(decision_tree=algo1, out_file=None,
                                    feature_names=['A', 'B', 'C', 'D'],
                                    class_names=['1', '2', '3'],
                                    filled=True, rounded=True,
                                    special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_png('trte_{}.png'.format(k))
    k += 1
    if k > 3:
        break

# Do a dimensionality expansion
print("*" * 100)
x_test2 = x_test.iloc[:2, :]
print(x_test2)
# apply() returns the leaf-node index for each sample
print(algo.apply(x_test2))
# transform() converts the data (essentially apply() plus one-hot encoding)
print(algo.transform(x_test2))
    random_state=None, verbose=0, warm_start=False):
"""
algo = RandomTreesEmbedding(n_estimators=100, max_depth=2, sparse_output=True)

# Model training
X_train2 = algo.fit_transform(X_train)
print(X_train2)

# Inspect the API attributes
x_test2 = [[6.9, 3.1, 5.1, 2.3],
           [6.1, 2.8, 4.0, 1.3],
           [5.2, 3.4, 1.4, 0.2],
           [4.7, 3.2, 1.6, 0.2]]
print("Transformed values of the samples:")
print(algo.transform(x_test2))

# # Model evaluation
# print('Accuracy on the training set: {}'.format(algo.score(X_train, Y_train)))
# print('Accuracy on the test set: {}'.format(algo.score(X_test, Y_test)))
print("All trained sub-models:\n{}".format(algo.estimators_))

# Visualize every sub-model
for k, estimator in enumerate(algo.estimators_):
    dot_data = tree.export_graphviz(decision_tree=estimator, out_file=None,
                                    feature_names=['f1', 'f2', 'f3', 'f4'],
                                    class_names=['A', 'B', 'C'],
                                    rounded=True, filled=True,
                                    special_characters=True)
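    # The loop computes dot_data but never renders it; as a sketch, render
    # each tree with pydotplus, mirroring the pattern used in the previous
    # snippet (the 'rte_{}.png' output name is an assumption).
    import pydotplus
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_png('rte_{}.png'.format(k))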
ax = pl.subplot(222)
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y, s=50)
ax.set_title("PCA reduction (2d) of transformed data (%dd)"
             % X_transformed.shape[1])
ax.set_xticks(())
ax.set_yticks(())

# Plot the decision in original space. For that, we will assign a color to
# each point in the mesh [x_min, x_max] x [y_min, y_max].
h = .01
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# transform grid using RandomTreesEmbedding
transformed_grid = hasher.transform(np.c_[xx.ravel(), yy.ravel()])
y_grid_pred = nb.predict_proba(transformed_grid)[:, 1]

ax = pl.subplot(223)
ax.set_title("Naive Bayes on Transformed data")
ax.pcolormesh(xx, yy, y_grid_pred.reshape(xx.shape))
ax.scatter(X[:, 0], X[:, 1], c=y, s=50)
ax.set_ylim(-1.4, 1.4)
ax.set_xlim(-1.4, 1.4)
ax.set_xticks(())
ax.set_yticks(())

# transform grid using ExtraTreesClassifier
y_grid_pred = trees.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
ax = pl.subplot(224)
def post(self, request):
    age = request.data["age"]
    sex = request.data["gen"]
    cp = request.data["c_pain"]
    trestbps = request.data["bp_lvl"]
    chol = request.data["choles"]
    fbs = request.data["bp_fast"]
    restecg = request.data["ecg"]
    talach = request.data["h_rate"]
    exang = request.data["i_exe"]
    oldpeak = request.data["d_exe"]
    slope = request.data["sd_seg"]
    ca = request.data["his"]
    thal = request.data["thal_scn"]

    dspath = settings.BASE_DIR + settings.STATIC_URL + "heart.xlsx"
    data = read_excel(dspath, "heart")
    X = data.iloc[:, 0:13].values
    y = data.iloc[:, 13].values

    # Embed the features with totally random trees, then fit a logistic
    # regression on the leaf-indicator representation.
    hasher = RandomTreesEmbedding(n_estimators=10, random_state=0, max_depth=3)
    X_transformed = hasher.fit_transform(X)
    clf = LogisticRegression(random_state=0).fit(X_transformed, y)

    inp = (age + "#" + sex + "#" + cp + "#" + trestbps + "#" + chol + "#" +
           fbs + "#" + restecg + "#" + talach + "#" + exang + "#" + oldpeak +
           "#" + slope + "#" + ca + "#" + thal)
    import numpy as np
    # Parse the '#'-separated values into a float array (np.fromstring with
    # np.float is deprecated).
    inpa = np.array(inp.split('#'), dtype=float)
    transformed_grid = hasher.transform([inpa])
    o = clf.predict(transformed_grid)
    print(o)

    obj = AddMedicalRecord()
    obj.uid = request.data["uid"]
    obj.date = datetime.date.today()
    if o == [1]:
        obj.result = "HEART PATIENT"
    if o == [0]:
        obj.result = "NO HEART DISEASE"

    # Estimate accuracy on a held-out split (X and y were already loaded above).
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3,
                                                        random_state=0)
    hasher = RandomTreesEmbedding(n_estimators=10, random_state=0, max_depth=3)
    X_transformed = hasher.fit_transform(X_train)
    clf = LogisticRegression(random_state=0).fit(X_transformed, y_train)
    # Use transform here, not fit_transform: refitting the hasher on the test
    # set would change the leaf encoding and invalidate the classifier.
    X_test = hasher.transform(X_test)
    y_score = clf.predict(X_test)
    sc = accuracy_score(y_test, y_score)
    obj.accu = sc * 100
    obj.save()
    return HttpResponse("Success with Acc :" + str(sc))