Example #1
def rf_codebook(desc_tr,
                desc_te,
                desc_sizes,
                max_depth,
                n_estimators,
                n_leafs,
                return_time=False):

    print('Computing RF Codebook...')

    # Reformat the training and testing data
    for i in range(10):
        for n in range(15):
            if i == 0 and n == 0:
                data_train = desc_tr[i][n]
                data_test = desc_te[i][n]
            else:
                data_train = np.hstack((data_train, desc_tr[i][n]))
                data_test = np.hstack((data_test, desc_te[i][n]))
    data_train = data_train.T
    data_test = data_test.T

    # Compute the random forest
    # max_depth = 10
    # n_estimators = 100
    RFE = RandomTreesEmbedding(n_estimators=n_estimators,
                               max_depth=max_depth,
                               max_leaf_nodes=n_leafs,
                               random_state=0,
                               n_jobs=3)

    RFE.fit(data_train)

    # Compute the bag of words for each of the predictions
    histogram_train = bag_of_words_rf_jorge(desc_tr, desc_sizes, RFE, n_leafs)
    histogram_test = bag_of_words_rf_jorge(desc_te, desc_sizes, RFE, n_leafs)

    print('Done')

    return histogram_train, histogram_test
Example #2
    def operate(self, input_datanode: DataNode, target_fields=None):
        from sklearn.ensemble import RandomTreesEmbedding

        X, y = input_datanode.data
        if target_fields is None:
            target_fields = collect_fields(input_datanode.feature_types,
                                           self.input_type)
        X_new = X[:, target_fields]

        if not self.model:
            self.n_estimators = int(self.n_estimators)
            if check_none(self.max_depth):
                self.max_depth = None
            else:
                self.max_depth = int(self.max_depth)
            self.min_samples_split = int(self.min_samples_split)
            self.min_samples_leaf = int(self.min_samples_leaf)
            if check_none(self.max_leaf_nodes):
                self.max_leaf_nodes = None
            else:
                self.max_leaf_nodes = int(self.max_leaf_nodes)
            self.min_weight_fraction_leaf = float(
                self.min_weight_fraction_leaf)
            self.bootstrap = check_for_bool(self.bootstrap)

            self.model = RandomTreesEmbedding(
                n_estimators=self.n_estimators,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                max_leaf_nodes=self.max_leaf_nodes,
                sparse_output=self.sparse_output,
                n_jobs=self.n_jobs,
                random_state=self.random_state)

            self.model.fit(X_new)

        _X = self.model.transform(X_new).toarray()

        return _X
Example #3
def learn_representation_sparse(represent_train,
                                represent_test,
                                labels_train,
                                labels_test,
                                train_id,
                                test_id,
                                depth,
                                ntree,
                                random_seed,
                                is_terminal=True,
                                normal=False):
    rt = RandomTreesEmbedding(max_depth=depth,
                              n_estimators=ntree,
                              random_state=random_seed,
                              n_jobs=-1)

    traincv = represent_train
    testcv = represent_test
    trainind = np.unique(train_id)
    testind = np.unique(test_id)

    trainlabels = labels_train
    testlabels = labels_test

    randTrees = rt.fit(traincv.values)

    trainRep = randTrees.apply(traincv.values)
    testRep = randTrees.apply(testcv.values)

    trainbow = np.apply_along_axis(train_f, 0, trainRep, train_id)
    trainbow = sparse.hstack(trainbow)

    testbow = np.apply_along_axis(train_f, 0, testRep, test_id)
    testbow = sparse.hstack(testbow)

    if normal:
        trainbow = normalize(trainbow, norm='l1', axis=1)
        testbow = normalize(testbow, norm='l1', axis=1)

    return trainbow, testbow
Example #4
def learn_representation_sparse_2_complex(represent_train, represent_test,
                                          labels_train, labels_test,
                                          train_id, test_id,
                                          depth, ntree, random_seed,
                                          is_terminal=True, normal=False):

    rt = RandomTreesEmbedding(max_depth=depth, n_estimators=ntree,
                              random_state=random_seed, n_jobs=-1)

    traincv = represent_train
    trainind = np.unique(train_id)
    trainlabels = labels_train

    randTrees = rt.fit(traincv.values)
    trainRep = randTrees.apply(traincv.values)

    test_time = 0.0  # stays 0.0 when no test data is supplied

    if represent_test is not None:
        testcv = represent_test
        testind = np.unique(test_id)
        testlabels = labels_test
        test_time = time.time()
        testRep = randTrees.apply(testcv.values)
        test_time = time.time() - test_time

        allRep = np.vstack((trainRep, testRep))
        allids = np.concatenate((np.array(train_id),
                                 np.array(test_id) + np.max(train_id) + 1),
                                axis=0)
    else:
        allRep = trainRep
        allids = np.array(train_id)

    ids = np.tile(allids, ntree)

    # shift the leaf indices of each tree so that different trees occupy
    # different column ranges of the bag-of-words matrix
    increments = np.arange(0, ntree) * (2 ** depth)
    allRep = allRep + increments
    node_ids = allRep.flatten('F')

    data = np.repeat(1, len(ids))

    allbow = sparse.coo_matrix((data, (ids, node_ids)), dtype=np.int8).tocsr()

    select_ind = trainind.shape[0]
    return allbow[:select_ind, :], allbow[select_ind:, :], test_time
Example #5
    def __init__(self,
                 coordinator,
                 base_classifier,
                 n_estimators=10,
                 max_depth=5,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 n_jobs=-1,
                 random_state=None,
                 verbose=0,
                 min_density=None):
        Classifier.__init__(self, coordinator, base_classifier)
        self.histoSize = 0
        self._visualBagger = RandomTreesEmbedding(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose,
            min_density=min_density)
Example #6
from sklearn.metrics import classification_report

## labels for the data
dic = pickle.load(open('letterdict_normalized.pickle', 'rb'))
mypath = '/home/asriva20/SrivastavaA/Data/3_AD_Normal/'
names = [name for name in sorted(listdir(mypath))]
Y = [1 if n[2:8] in dic['AD'] else \
     0 if n[2:8] in  dic['Normal'] else \
    -1 for n in names]
Y = np.asarray(Y)

mat = sio.loadmat('X.mat')
X = mat['Data']
print(np.shape(X))

forest = RandomTreesEmbedding(n_estimators=50, max_depth=3)
forest.fit(X)
print(forest.apply(X))
sum = 0
for tree in forest.estimators_:
    n_nodes = tree.tree_.node_count
    children_left = tree.tree_.children_left
    children_right = tree.tree_.children_right
    feature = tree.tree_.feature
    threshold = tree.tree_.threshold

    node_depth = np.zeros(shape=n_nodes)
    is_leaves = np.zeros(shape=n_nodes, dtype=bool)
    parent_id = {}
    # seed is the root node id and its parent depth
    stack = [(0, -1)]
Example #7
    test_images = np.zeros((10, 28 * 28))
    for label in range(10):
        index = np.where(y_test == label)[0][0]
        test_images[label] = x_test[index]
    results = np.zeros((2, 10, 28 * 28))
    for mi, model in enumerate(("supervised", "unsupervised")):
        print("Start Autoencoder using {} model".format(model))
        if model == "supervised":
            eforest = RandomForestClassifier(n_estimators=n_trees,
                                             max_depth=None,
                                             n_jobs=-1,
                                             random_state=0)
            eforest.fit(x_train, y_train)
        else:
            eforest = RandomTreesEmbedding(n_estimators=n_trees,
                                           max_depth=None,
                                           n_jobs=-1,
                                           random_state=0)
            eforest.fit(x_train)
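        # Note (not in the original source): encode() and decode() below are not
        # scikit-learn APIs; the snippet presumably relies on a forest-autoencoder
        # (eForest-style) implementation imported elsewhere in the project.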

        x_encode = eforest.encode(test_images)
        x_decode = eforest.decode(x_encode)
        results[mi] = x_decode
    rheads = ["origin", "supervised", "unsupervised"]
    test_images = test_images.reshape(1, 10, 28, 28)
    results = results.reshape(2, 10, 28, 28)
    fig = plot_mnist(rheads, np.vstack((test_images, results)))
    plt.show()

    import IPython
    IPython.embed()
Example #8
                                          random_state=0)
    elif n_algo == 'ada':
        fi_model = AdaBoostClassifier(n_estimators=1000,
                                      learning_rate=0.50,
                                      random_state=0)
    elif n_algo == 'ext':
        fi_model = ExtraTreesClassifier(n_estimators=1000,
                                        max_features=0.90,
                                        random_state=0)
    elif n_algo == 'gbm':
        fi_model = GradientBoostingClassifier(n_estimators=1000,
                                              max_features=0.90,
                                              max_depth=6,
                                              random_state=0)
    elif n_algo == 'rte':
        fi_model = RandomTreesEmbedding(n_estimators=1000, random_state=0)

    print('       Train:', n_algo)
    fi_model.fit(X_all, y)
    print('       Save:', n_algo)
    fi = sorted(zip(map(lambda x: round(x, 4), fi_model.feature_importances_),
                    names),
                reverse=True)
    df_fi = pd.DataFrame.from_records(fi)
    df_fi.to_csv('data/feat_eng/rv/_' + n_algo + '_feature_importance.csv',
                 sep=',')

print("-" * 100)
print("Perform Chi^2 Feature Selection and Save Results to File")
sel = SelectKBest(
    chi2, k='all')  # (All) X features (you can use reduced k as a cutoff also)
Example #9
plt.title(
    "UMAP Projection of Titanic Dataset\n Using Extra Trees Classifier Embedding"
)

# Use DecisionTreeClassifier Embedding
model = DecisionTreeClassifier(max_leaf_nodes=2)
tree = model.fit(X, y)
clusters = tree.apply(X)
# plotting the embedding
plt.figure()
plt.scatter(embedding[:, 0], embedding[:, 1], c=clusters, cmap='Spectral', s=8)
plt.gca().set_aspect("equal", "datalim")
plt.title(
    'UMAP Projection of Titanic Dataset\n Using Clustering with Decision Trees'
)

# Use RandomTreesEmbedding
model = RandomTreesEmbedding(n_estimators=100, max_leaf_nodes=2)
model.fit(X, y)
leaves = model.apply(X)
reducer = umap.UMAP(metric='hamming', random_state=42)
embedding = reducer.fit_transform(leaves)
# plotting the embedding
plt.figure()
plt.scatter(embedding[:, 0], embedding[:, 1], c=y, cmap="Spectral", s=8)
plt.gca().set_aspect("equal", "datalim")
cb = plt.colorbar()
loc = np.arange(0, max(y) + 0.5, 1)
cb.set_ticks(loc)
plt.title("UMAP Projection of Titanic Dataset\n Using Random Trees Embedding")
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve

n_estimator = 10
X, y = make_classification(n_samples=80000)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
# It is important to train the ensemble of trees on a different subset
# of the training data than the linear regression model to avoid
# overfitting, in particular if the total number of leaves is
# similar to the number of training samples
X_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train,
                                                            y_train,
                                                            test_size=0.5)

# Unsupervised transformation based on totally random trees
rt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator)
rt_lm = LogisticRegression()
rt.fit(X_train, y_train)
rt_lm.fit(rt.transform(X_train_lr), y_train_lr)

y_pred_rt = rt_lm.predict_proba(rt.transform(X_test))[:, 1]
fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)

# Supervised transformation based on random forests
rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
rf_enc = OneHotEncoder()
rf_lm = LogisticRegression()
rf.fit(X_train, y_train)
rf_enc.fit(rf.apply(X_train))
rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)
Example #11
        return KMeans(n_clusters=2,
                      random_state=RandomState(17)).fit_predict(rdc_data)

    print_results("kmeans", labels, compute_kmeans(data))
    # print_results("kmeans+rdc", labels, compute_kmeans_rdc(data))

    tsne_embedding_data = TSNE(n_components=3,
                               verbose=10,
                               n_jobs=4,
                               random_state=17).fit_transform(data)
    print_results("tsne fast kmeans", labels,
                  compute_kmeans(tsne_embedding_data))

    tree_embedding_data = RandomTreesEmbedding(n_estimators=200,
                                               random_state=0,
                                               max_depth=5).fit_transform(data)
    print_results("tree kmeans", labels, compute_kmeans(tree_embedding_data))

    0 / 0  # deliberate ZeroDivisionError: halt the script before the slower embeddings below
    srp_emb_data = random_projection.SparseRandomProjection(
        n_components=20, random_state=42).fit_transform(data)
    print_results("SparseRandomProjection kmeans", labels,
                  compute_kmeans(srp_emb_data))

    iso_emb_data = manifold.Isomap(n_neighbors=30, n_components=2).fit_transform(data)
    print_results("iso kmeans", labels, compute_kmeans(iso_emb_data))

    # lle_emb_data = manifold.LocallyLinearEmbedding(10, n_components=2, method='ltsa').fit_transform(data)
    # print_results("lle kmeans", labels, compute_kmeans(lle_emb_data))
    def fit(self, X, y=None):
        '''
        y can be the array of starting states of the demonstrated trajectories/policies.
        If it is None, a MaxEnt model is implied; otherwise it serves as the feature mapping
        of the starting state. This data might also be used for learning the passive dynamics
        for purely model-free learning with some regressors and regularization.
        '''
        #check parameters...
        assert (type(self.n_estimators) == int)
        assert (self.n_estimators > 0)
        assert (type(self.max_depth) == int)
        assert (self.max_depth > 0)
        assert (type(self.min_samples_split) == int)
        assert (self.min_samples_split > 0)
        assert (type(self.min_samples_leaf) == int)
        assert (self.min_samples_leaf > 0)
        assert (type(self.em_itrs) == int)

        #an initial partitioning of data with random forest embedding
        self.random_embedding_mdl_ = RandomTreesEmbedding(
            n_estimators=self.n_estimators,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            random_state=self.random_state)

        #we probably do not need the data type to differentiate it is a demonstration
        #of trajectory or commanded state, do we?
        if self.passive_dyn_func is not None and self.passive_dyn_ctrl is not None and self.passive_dyn_noise is not None:
            self.random_embedding_mdl_.fit(X[:, X.shape[1] // 2:])
            indices = self.random_embedding_mdl_.apply(X[:, X.shape[1] // 2:])
            # X_tmp = np.array(X)
            # X_tmp[:, X.shape[1]/2:] = X_tmp[:, X.shape[1]/2:] - X_tmp[:, :X.shape[1]/2]
            # self.random_embedding_mdl_.fit(X_tmp)

            # indices = self.random_embedding_mdl_.apply(X_tmp)
        else:
            self.random_embedding_mdl_.fit(X)
            #figure out indices
            indices = self.random_embedding_mdl_.apply(X)

        partitioned_data = defaultdict(list)

        leaf_idx = defaultdict(set)
        weight_idx = defaultdict(float)
        #group data belongs to the same partition and have the weights...
        #is weight really necessary for EM steps? Hmm, seems to be for the initialization
        #d_idx: data index; p_idx: partition index (comprised of estimator index and leaf index)
        for d_idx, d, p_idx in zip(range(len(X)), X, indices):
            for e_idx, l_idx in enumerate(p_idx):
                partitioned_data[e_idx, l_idx].append(d)
                leaf_idx[e_idx] |= {l_idx}

            for e_idx, l_idx in enumerate(p_idx):
                weight_idx[e_idx, l_idx] = float(
                    len(partitioned_data[e_idx, l_idx])) / len(X)
                # weight_idx[e_idx, l_idx] = 1. / len(p_idx)

        #for each grouped data, solve an easy IOC problem by assuming quadratic cost-to-go function
        #note that, if the passive dynamics need to be learned, extra steps is needed to train a regressor with weighted data
        #otherwise, just a simply gaussian for each conditional probability distribution model
        self.estimators_ = []
        #another copy to store the parameters all together, for EM/evaluation on all of the models
        self.estimators_full_ = defaultdict(list)
        #<hyin/Feb-6th-2016> an estimator and leaf indexed structure to record the passive likelihood of data...
        passive_likelihood_dict = defaultdict(list)
        for e_idx in range(self.n_estimators):
            #for each estimator
            estimator_parms = defaultdict(list)
            for l_idx in leaf_idx[e_idx]:
                if self.verbose:
                    print('Processing {0}-th estimator and {1}-th leaf...'.format(
                        e_idx, l_idx))
                #and for each data partition
                data_partition = np.array(partitioned_data[e_idx, l_idx])
                if self.passive_dyn_func is not None and self.passive_dyn_ctrl is not None and self.passive_dyn_noise is not None:
                    X_new = data_partition[:, data_partition.shape[1] // 2:]
                    X_old = data_partition[:, 0:data_partition.shape[1] // 2]
                    X_new_passive = np.array([
                        self.passive_dyn_func(X_old[sample_idx])
                        for sample_idx in range(data_partition.shape[0])
                    ])
                    passive_likelihood = _passive_dyn_likelihood(
                        X_new, X_new_passive, self.passive_dyn_noise,
                        self.passive_dyn_ctrl, self.reg)

                    weights = passive_likelihood / np.sum(passive_likelihood)
                    weighted_mean = np.sum((weights * X_new.T).T, axis=0)

                    estimator_parms['means'].append(weighted_mean)
                    estimator_parms['covars'].append(
                        _frequency_weighted_covariance(X_new,
                                                       weighted_mean,
                                                       weights,
                                                       spherical=False))

                    #for full estimators
                    self.estimators_full_['means'].append(
                        estimator_parms['means'][-1])
                    self.estimators_full_['covars'].append(
                        estimator_parms['covars'][-1])

                    #<hyin/Feb-6th-2016> also remember the data weight according to the passive likelihood
                    #this could be useful if the weights according to the passive likelihood is desired for other applications
                    #to evaluate some statistics within the data parition
                    passive_likelihood_dict[e_idx, l_idx] = weights
                else:
                    estimator_parms['means'].append(
                        np.mean(data_partition, axis=0))
                    estimator_parms['covars'].append(np.cov(data_partition.T))

                    #for full estimators
                    self.estimators_full_['means'].append(
                        estimator_parms['means'][-1])
                    self.estimators_full_['covars'].append(
                        estimator_parms['covars'][-1])

                    #for MaxEnt, uniform passive likelihood
                    passive_likelihood_dict[e_idx, l_idx] = np.ones(
                        len(data_partition)) / float(len(data_partition))

                estimator_parms['weights'].append(weight_idx[e_idx, l_idx])
                self.estimators_full_['weights'].append(
                    weight_idx[e_idx, l_idx] / float(self.n_estimators))

            self.estimators_.append(estimator_parms)
        #can stop here or go for expectation maximization for each estimator...
        if self.em_itrs > 0:
            #prepare em results for each estimator
            em_res = [
                self._em_steps(e_idx, X, y)
                for e_idx in range(self.n_estimators)
            ]
            #or do EM on the full model?
            # <hyin/Dec-2nd-2015> no, doing this seems to harm the learning as the aggregated model is really
            # complex so optimizing that model tends to overfit...
            # em_res = self._em_steps(None, X, y)
            #then use them
            self.estimators_ = em_res

        self.prepare_inv_and_constants()
        return indices, leaf_idx, passive_likelihood_dict
Example #13
def getCaltech_RandomForest(savefig_images: bool = False,
                            num_features: int = 10,
                            num_descriptors: int = 100000,
                            num_training_samples_per_class: int = 15,
                            num_testing_samples_per_class: int = 15,
                            random_state: int = None,
                            pickle_dump: bool = True) -> Data:
    """Caltech 101 training and testing data generator
    using Random Forest Codebook.

    Parameters
    ----------
    savefig_images: bool
        Save raw training & testing images and their
        SIFT masked grayscale transforms
    num_features: int
        Number of trees (`n_estimators`) used in the Random Forest codebook
    num_descriptors: int
        Number of SIFT descriptors kept for BoW
    num_training_samples_per_class: int
        Number of samples per class used for training
    num_testing_samples_per_class: int
        Number of samples per class used for testing
    random_state: int
        `np.random.seed` initial state
    pickle_dump: bool
        Cache the generated data to disk

    Returns
    -------
    data: NamedTuple
        * data_train: numpy.ndarray
        * data_query: numpy.ndarray
    """
    class_list, descriptors_random, raw_train, raw_test, images_train, \
        images_test = getCaltech_pre(num_features, num_descriptors,
                                     num_training_samples_per_class,
                                     num_testing_samples_per_class,
                                     random_state, pickle_dump)

    if savefig_images:
        getCaltech_plot(class_list, images_train, images_test)

    # Random Forest codebook (Random Trees Embedding)
    codebook_algorithm = RandomTreesEmbedding(
        n_estimators=num_features).fit(descriptors_random)

    n_out = codebook_algorithm.transform(raw_train[0][0]).sum(axis=0).shape[1]

    # vector quantisation
    data_train = np.zeros(
        (len(class_list) * num_training_samples_per_class, n_out + 1))

    for i in range(len(class_list)):
        for j in range(num_training_samples_per_class):
            # set features
            data_train[num_training_samples_per_class * (i) +
                       j, :-1] = codebook_algorithm.transform(
                           raw_train[i][j]).sum(axis=0).ravel()
            # set label
            data_train[num_training_samples_per_class * (i) + j, -1] = i

    # vector quantisation
    data_query = np.zeros(
        (len(class_list) * num_testing_samples_per_class, n_out + 1))

    for i in range(len(class_list)):
        for j in range(num_testing_samples_per_class):
            # set features
            data_query[num_testing_samples_per_class * (i) +
                       j, :-1] = codebook_algorithm.transform(
                           raw_test[i][j]).sum(axis=0).ravel()
            # set label
            data_query[num_testing_samples_per_class * (i) + j, -1] = i

    # cache data to avoid recalculation every time
    if pickle_dump:
        pickle.dump(Data(data_train, data_query),
                    open('tmp/models/codebooks/caltech_rf.pkl', 'wb'))

    return Data(data_train, data_query)
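
An illustrative call of the generator above (not part of the original source; the argument values are arbitrary and the Caltech-101 folder layout the function expects must already be in place):

data = getCaltech_RandomForest(num_features=100,
                               num_descriptors=100000,
                               random_state=0,
                               pickle_dump=False)
# the last column of each array holds the class label, the rest the codebook histogram
X_train, y_train = data.data_train[:, :-1], data.data_train[:, -1]
X_query, y_query = data.data_query[:, :-1], data.data_query[:, -1]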
Example #14
def randomtrees_embedding_others(df2):
    df = df2.copy()
    rte = RandomTreesEmbedding(random_state=seed)
    df = pd.DataFrame(rte.fit_transform(df).toarray())
    return df
def trainModel(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    # It is important to train the ensemble of trees on a different subset
    # of the training data than the linear regression model to avoid
    # overfitting, in particular if the total number of leaves is
    # similar to the number of training samples
    X_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train,
                                                                y_train,
                                                                test_size=0.3)

    # Unsupervised transformation based on totally random trees
    rt = RandomTreesEmbedding(max_depth=3,
                              n_estimators=n_estimator,
                              random_state=0)

    rt_lm = LogisticRegression(max_iter=1000)
    pipeline = make_pipeline(rt, rt_lm)
    pipeline.fit(X_train, y_train)
    y_pred_rt = pipeline.predict_proba(X_test)[:, 1]
    fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)

    # Supervised transformation based on random forests
    rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
    rf_enc = OneHotEncoder()
    rf_lm = LogisticRegression(max_iter=1000)
    rf.fit(X_train, y_train)
    rf_enc.fit(rf.apply(X_train))
    rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)

    y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:,
                                                                           1]
    fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)

    # Supervised transformation based on gradient boosted trees
    grd = GradientBoostingClassifier(n_estimators=n_estimator)
    grd_enc = OneHotEncoder()
    grd_lm = LogisticRegression(max_iter=1000)
    grd.fit(X_train, y_train)
    grd_enc.fit(grd.apply(X_train)[:, :, 0])
    grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)

    y_pred_grd_lm = grd_lm.predict_proba(
        grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1]
    fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm)

    # The gradient boosted model by itself
    y_pred_grd = grd.predict_proba(X_test)[:, 1]
    fpr_grd, tpr_grd, _ = roc_curve(y_test, y_pred_grd)

    # The random forest model by itself
    y_pred_rf = rf.predict_proba(X_test)[:, 1]
    fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf)

    # open the file in binary write mode
    file = open("./grd_lm.pickle", "wb")
    pickle.dump(grd, file)
    file.close()

    plt.figure(1)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr_rt_lm, tpr_rt_lm, label='RT + LR')
    plt.plot(fpr_rf, tpr_rf, label='RF')
    plt.plot(fpr_rf_lm, tpr_rf_lm, label='RF + LR')
    plt.plot(fpr_grd, tpr_grd, label='GBT')
    plt.plot(fpr_grd_lm, tpr_grd_lm, label='GBT + LR')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc='best')
    plt.show()

    plt.figure(2)
    plt.xlim(0, 0.2)
    plt.ylim(0.8, 1)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr_rt_lm, tpr_rt_lm, label='RT + LR')
    plt.plot(fpr_rf, tpr_rf, label='RF')
    plt.plot(fpr_rf_lm, tpr_rf_lm, label='RF + LR')
    plt.plot(fpr_grd, tpr_grd, label='GBT')
    plt.plot(fpr_grd_lm, tpr_grd_lm, label='GBT + LR')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve (zoomed in at top left)')
    plt.legend(loc='best')
    plt.show()
Example #16
mat = sio.loadmat('./data/X.mat')
X = mat['Data']

gamma = np.zeros((51,8))
n_pos = len([n for n in Y if n == 1])
n_neg = len([n for n in Y if n == -1]) 
X_transformed = np.zeros((51,16))
Y_transformed = np.zeros((51,8))

print(np.shape(X), n_pos, n_neg)

# estimators fixed from 50 to 100 to 400; we force the tree to have at least 2 elements at the
# leaf, as there is a numerical error when calculating the covariance of a single element!
# there should be some parameter or tree option that grows balanced trees.
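# Aside (not in the original script): with NumPy's default ddof=1 the covariance
# of a single-sample leaf partition is undefined, which is what the
# min_samples_leaf=2 constraint below guards against, e.g.
#   np.cov(np.array([[1.0, 2.0, 3.0]]).T)   # -> 3x3 matrix of nan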
forest = RandomTreesEmbedding(n_estimators=400, max_depth=3, random_state=0, min_samples_leaf=2)
forest.fit(X)

count = 0

# for each decision tree in the forest we want to find the values in all
# the leaf nodes. after that we must have mu_m = {- 1,...,L, + 1,...,L}
# TODO! assumption: the odd-numbered leaves are negative and vice versa
 
for tree in forest.estimators_:
    n_nodes = tree.tree_.node_count
    children_left = tree.tree_.children_left
    children_right = tree.tree_.children_right
    feature = tree.tree_.feature
    threshold = tree.tree_.threshold   
    
Example #17
"""## K-Means"""

# perform kmeans with k clusters and pca data
k = 250

kmeans = KMeans(init='k-means++', n_clusters=k, n_init=10,
                random_state=0, verbose=0).fit(data)

codewords = kmeans.cluster_centers_
codewords.shape

"""## Random Trees Embedding"""

rtree = RandomTreesEmbedding(n_estimators=1000, max_depth=70, 
                             min_samples_leaf=1, min_samples_split=2,
                             verbose=1, random_state=0)

rtree.fit(data)

# For each datapoint x in X and for each tree in the forest, 
# return the index of the leaf x ends up in.
leafs = rtree.apply(data)

leafs.shape

"""# Histogram of visual words"""

# count how many SIFT descriptors there are per image, knowing there are 150 images

def count_sifts_per_image(x):
Example #18
min_weight_fraction_leaf=0.,
max_features="auto",
# same as in the decision tree algorithm
max_leaf_nodes=None,
min_impurity_decrease=0.,
min_impurity_split=None,
bootstrap=True,
# whether to build the sub-datasets by sampling with replacement; the default True means yes
oob_score=False,
n_jobs=None,
random_state=None,
verbose=0,
warm_start=False,
class_weight=None)
'''
algo = RandomTreesEmbedding(n_estimators=100, max_depth=3)

#5. Train the model
algo.fit(x_train, y_train)

#6. Obtain the model's expanded (transformed) features directly
x_train2 = algo.transform(x_train)
x_test2 = algo.transform(x_test)
print('Size before expansion: {}, size after expansion: {}'.format(x_train.shape, x_train2.shape))
print('Size before expansion: {}, size after expansion: {}'.format(x_test.shape, x_test2.shape))

#8. Visualize the random forest
print('Number of sub-models in the random forest: {}'.format(len(algo.estimators_)))
#2. Option 2: use the pydotplus plugin directly to generate and save a PDF file
from sklearn import tree
import pydotplus
Example #19
test = pd.read_csv('Data/test.csv')
sample = pd.read_csv('Data/sampleSubmission.csv')

# drop ids and get labels
labels = train.target.values
train = train.drop('id', axis=1)
train = train.drop('target', axis=1)
test = test.drop('id', axis=1)

# scale features
scaler = StandardScaler()
train = scaler.fit_transform(train.astype(float))
test = scaler.transform(test.astype(float))

# random trees embedding
rte = RandomTreesEmbedding(n_estimators=50, verbose=1)
rte.fit(train)
tran = rte.apply(train)

# encode labels 
lbl_enc = LabelEncoder()
labels = lbl_enc.fit_transform(labels)

# set up datasets for cross eval
x_train, x_test, y_train, y_test = train_test_split(train, labels)
#label_binary = LabelBinarizer()
#y_test = label_binary.fit_transform(y_test)

# train a logistic regression classifier
clf = LogisticRegression()
clf.fit(x_train, y_train)
Example #20
    def post(self, request):

        age = request.data["age"]
        sex = request.data["gen"]
        cp = request.data["c_pain"]
        trestbps = request.data["bp_lvl"]
        chol = request.data["choles"]
        fbs = request.data["bp_fast"]
        restecg = request.data["ecg"]
        talach = request.data["h_rate"]
        exang = request.data["i_exe"]
        oldpeak = request.data["d_exe"]
        slope = request.data["sd_seg"]
        ca = request.data["his"]
        thal = request.data["thal_scn"]

        dspath = settings.BASE_DIR + settings.STATIC_URL + "heart.xlsx"
        data = read_excel(dspath, "heart")
        # data = read_csv("heart.csv")
        X = data.iloc[:, 0:13].values
        y = data.iloc[:, 13].values

        hasher = RandomTreesEmbedding(n_estimators=10,
                                      random_state=0,
                                      max_depth=3)
        X_transformed = hasher.fit_transform(X)

        clf = LogisticRegression(random_state=0).fit(X_transformed, y)

        inp = age + "#" + sex + "#" + cp + "#" + trestbps + "#" + chol + "#" + fbs + "#" + restecg + "#" + talach + "#" + exang + "#" + oldpeak + "#" + slope + "#" + ca + "#" + thal
        import numpy as np
        inpa = np.fromstring(inp, dtype=float, sep='#')

        transformed_grid = hasher.transform([inpa])

        o = clf.predict(transformed_grid)
        print(o)

        obj = AddMedicalRecord()
        obj.uid = request.data["uid"]
        obj.date = datetime.date.today()
        # obj.date = "2020-02-02"
        if o == [1]:
            obj.result = "HEART PATIENT"
        if o == [0]:
            obj.result = "NO HEART DISEASE"

        dspath = settings.BASE_DIR + settings.STATIC_URL + "heart.xlsx"
        data = read_excel(dspath, "heart")
        # data = read_csv("heart.csv")
        X = data.iloc[:, 0:13].values
        y = data.iloc[:, 13].values
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=.3,
                                                            random_state=0)
        hasher = RandomTreesEmbedding(n_estimators=10,
                                      random_state=0,
                                      max_depth=3)
        X_transformed = hasher.fit_transform(X_train)

        clf = LogisticRegression(random_state=0).fit(X_transformed, y_train)
        X_test = hasher.transform(X_test)  # use the already-fitted hasher on the test set
        y_score = clf.predict(X_test)

        sc = accuracy_score(y_test, y_score)

        obj.accu = sc * 100
        obj.save()

        return HttpResponse("Success with Acc :" + str(sc))
    elif trans_name == 'scaler1':
        from sklearn.preprocessing import MinMaxScaler

        qt = MinMaxScaler()
    elif trans_name == 'scaler2':
        from sklearn.preprocessing import StandardScaler

        qt = StandardScaler()
    elif trans_name == 'scaler3':
        from sklearn.preprocessing import RobustScaler

        qt = RobustScaler()
    elif trans_name == 'random_tree_embedding':
        from sklearn.ensemble import RandomTreesEmbedding

        qt = RandomTreesEmbedding()
    elif trans_name == 'polynomial':
        from sklearn.preprocessing import PolynomialFeatures

        qt = PolynomialFeatures()
    elif trans_name == 'pca':
        from sklearn.decomposition import PCA

        qt = PCA()
    elif trans_name == 'nystronem':
        from sklearn.kernel_approximation import Nystroem

        qt = Nystroem()
    elif trans_name == 'kernel_pca':
        from solnml.components.feature_engineering.transformations.utils import KernelPCA
Example #22
def rt_embedding(training_features, testing_features):
    rt = RandomTreesEmbedding()
    rt.fit(training_features)
    testing_features = rt.transform(testing_features)
    training_features = rt.transform(training_features)
    return training_features, testing_features
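
An illustrative call of the helper above (not from the original source); it assumes RandomTreesEmbedding has already been imported from sklearn.ensemble as in the surrounding examples:

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
X_tr, X_te, _, _ = train_test_split(X, y, random_state=0)
X_tr_emb, X_te_emb = rt_embedding(X_tr, X_te)
print(X_tr_emb.shape, X_te_emb.shape)  # sparse one-hot leaf-membership features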
Example #23
import matplotlib.pyplot as plt

path = ''
data = pd.read_csv(path + 'feature_score.csv', header=None)

X = data[range(3000)]
y = data[[3000]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train,
                                                            y_train,
                                                            test_size=0.5)

n_estimator = 10
# Unsupervised transformation based on totally random trees
rt = RandomTreesEmbedding(max_depth=3,
                          n_estimators=n_estimator,
                          random_state=0)
rt_lm = LogisticRegression()
pipeline = make_pipeline(rt, rt_lm)
pipeline.fit(X_train, y_train)
y_pred_rt = pipeline.predict_proba(X_test)[:, 1]
fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)

# Supervised transformation based on random forests
rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
rf_enc = OneHotEncoder()
rf_lm = LogisticRegression()
rf.fit(X_train, y_train)
rf_enc.fit(rf.apply(X_train))
rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)
Example #24
                                       random_state=10)
random_forest.fit(X_train_ensemble, y_train_ensemble)

gradient_boosting = GradientBoostingClassifier(n_estimators=n_estimators,
                                               max_depth=max_depth,
                                               random_state=10)
_ = gradient_boosting.fit(X_train_ensemble, y_train_ensemble)

# %%
# The :class:`~sklearn.ensemble.RandomTreesEmbedding` is an unsupervised method
# and thus does not need to be trained independently.

from sklearn.ensemble import RandomTreesEmbedding

random_tree_embedding = RandomTreesEmbedding(n_estimators=n_estimators,
                                             max_depth=max_depth,
                                             random_state=0)

# %%
# Now, we will create three pipelines that will use the above embedding as
# a preprocessing stage.
#
# The random trees embedding can be directly pipelined with the logistic
# regression because it is a standard scikit-learn transformer.

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

rt_model = make_pipeline(random_tree_embedding,
                         LogisticRegression(max_iter=1000))
rt_model.fit(X_train_linear, y_train_linear)
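
The excerpt stops after the first pipeline; a minimal sketch of the other two pipelines the comment above mentions might look like the following, assuming the random_forest and gradient_boosting models fitted earlier and the same X_train_linear / y_train_linear split (the leaf indices returned by apply() are one-hot encoded before the logistic regression):

from sklearn.preprocessing import FunctionTransformer, OneHotEncoder

def rf_apply(X, model):
    return model.apply(X)

rf_leaves_yielder = FunctionTransformer(rf_apply,
                                        kw_args={"model": random_forest})
rf_model = make_pipeline(rf_leaves_yielder,
                         OneHotEncoder(handle_unknown="ignore"),
                         LogisticRegression(max_iter=1000))
rf_model.fit(X_train_linear, y_train_linear)

def gbdt_apply(X, model):
    # GradientBoostingClassifier.apply returns shape (n_samples, n_estimators, 1)
    return model.apply(X)[:, :, 0]

gbdt_leaves_yielder = FunctionTransformer(gbdt_apply,
                                          kw_args={"model": gradient_boosting})
gbdt_model = make_pipeline(gbdt_leaves_yielder,
                           OneHotEncoder(handle_unknown="ignore"),
                           LogisticRegression(max_iter=1000))
gbdt_model.fit(X_train_linear, y_train_linear)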
Example #25
    "Isomap embedding": Isomap(n_neighbors=n_neighbors, n_components=2),
    "Standard LLE embedding": LocallyLinearEmbedding(
        n_neighbors=n_neighbors, n_components=2, method="standard"
    ),
    "Modified LLE embedding": LocallyLinearEmbedding(
        n_neighbors=n_neighbors, n_components=2, method="modified"
    ),
    "Hessian LLE embedding": LocallyLinearEmbedding(
        n_neighbors=n_neighbors, n_components=2, method="hessian"
    ),
    "LTSA LLE embedding": LocallyLinearEmbedding(
        n_neighbors=n_neighbors, n_components=2, method="ltsa"
    ),
    "MDS embedding": MDS(n_components=2, n_init=1, max_iter=100),
    "Random Trees embedding": make_pipeline(
        RandomTreesEmbedding(n_estimators=200, max_depth=5, random_state=0),
        TruncatedSVD(n_components=2),
    ),
    "Spectral embedding": SpectralEmbedding(
        n_components=2, random_state=0, eigen_solver="arpack"
    ),
    "t-SNE embeedding": TSNE(
        n_components=2, init="pca", learning_rate="auto", random_state=0
    ),
    "NCA embedding": NeighborhoodComponentsAnalysis(
        n_components=2, init="random", random_state=0
    ),
}

# %%
# Once we have declared all the methods of interest, we can run and perform the projections
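
The excerpt ends before the projection loop; a minimal sketch of what the comment above describes, assuming the dictionary built above is named embeddings and the dataset is held in X, y (those names are not visible in this excerpt):

projections = {}
for name, transformer in embeddings.items():
    # fit each method and keep its 2-D projection for later plotting
    projections[name] = transformer.fit_transform(X, y)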
Example #26
def GetAllModelsForComparison(X_train, Y_train):
    models = {
        'ARDRegression': ARDRegression(),
        'BayesianRidge': BayesianRidge(),
        'ElasticNet': ElasticNet(),
        'ElasticNetCV': ElasticNetCV(),
        'Hinge': Hinge(),
        #'Huber': Huber(),
        'HuberRegressor': HuberRegressor(),
        'Lars': Lars(),
        'LarsCV': LarsCV(),
        'Lasso': Lasso(),
        'LassoCV': LassoCV(),
        'LassoLars': LassoLars(),
        'LassoLarsCV': LassoLarsCV(),
        'LinearRegression': LinearRegression(),
        'Log': Log(),
        'LogisticRegression': LogisticRegression(),
        'LogisticRegressionCV': LogisticRegressionCV(),
        'ModifiedHuber': ModifiedHuber(),
        'MultiTaskElasticNet': MultiTaskElasticNet(),
        'MultiTaskElasticNetCV': MultiTaskElasticNetCV(),
        'MultiTaskLasso': MultiTaskLasso(),
        'MultiTaskLassoCV': MultiTaskLassoCV(),
        'OrthogonalMatchingPursuit': OrthogonalMatchingPursuit(),
        'OrthogonalMatchingPursuitCV': OrthogonalMatchingPursuitCV(),
        'PassiveAggressiveClassifier': PassiveAggressiveClassifier(),
        'PassiveAggressiveRegressor': PassiveAggressiveRegressor(),
        'Perceptron': Perceptron(),
        'RANSACRegressor': RANSACRegressor(),
        #'RandomizedLasso': RandomizedLasso(),
        #'RandomizedLogisticRegression': RandomizedLogisticRegression(),
        'Ridge': Ridge(),
        'RidgeCV': RidgeCV(),
        'RidgeClassifier': RidgeClassifier(),
        'SGDClassifier': SGDClassifier(),
        'SGDRegressor': SGDRegressor(),
        'SquaredLoss': SquaredLoss(),
        'TheilSenRegressor': TheilSenRegressor(),
        'BaseEstimator': BaseEstimator(),
        'ClassifierMixin': ClassifierMixin(),
        'LinearClassifierMixin': LinearClassifierMixin(),
        'LinearDiscriminantAnalysis': LinearDiscriminantAnalysis(),
        'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(),
        'StandardScaler': StandardScaler(),
        'TransformerMixin': TransformerMixin(),
        'BaseEstimator': BaseEstimator(),
        'KernelRidge': KernelRidge(),
        'RegressorMixin': RegressorMixin(),
        'LinearSVC': LinearSVC(),
        'LinearSVR': LinearSVR(),
        'NuSVC': NuSVC(),
        'NuSVR': NuSVR(),
        'OneClassSVM': OneClassSVM(),
        'SVC': SVC(),
        'SVR': SVR(),
        'SGDClassifier': SGDClassifier(),
        'SGDRegressor': SGDRegressor(),
        #'BallTree': BallTree(),
        #'DistanceMetric': DistanceMetric(),
        #'KDTree': KDTree(),
        'KNeighborsClassifier': KNeighborsClassifier(),
        'KNeighborsRegressor': KNeighborsRegressor(),
        'KernelDensity': KernelDensity(),
        #'LSHForest': LSHForest(),
        'LocalOutlierFactor': LocalOutlierFactor(),
        'NearestCentroid': NearestCentroid(),
        'NearestNeighbors': NearestNeighbors(),
        'RadiusNeighborsClassifier': RadiusNeighborsClassifier(),
        'RadiusNeighborsRegressor': RadiusNeighborsRegressor(),
        #'GaussianProcess': GaussianProcess(),
        'GaussianProcessRegressor': GaussianProcessRegressor(),
        'GaussianProcessClassifier': GaussianProcessClassifier(),
        'CCA': CCA(),
        'PLSCanonical': PLSCanonical(),
        'PLSRegression': PLSRegression(),
        'PLSSVD': PLSSVD(),
        #'ABCMeta': ABCMeta(),
        #'BaseDiscreteNB': BaseDiscreteNB(),
        'BaseEstimator': BaseEstimator(),
        #'BaseNB': BaseNB(),
        'BernoulliNB': BernoulliNB(),
        'ClassifierMixin': ClassifierMixin(),
        'GaussianNB': GaussianNB(),
        'LabelBinarizer': LabelBinarizer(),
        'MultinomialNB': MultinomialNB(),
        'DecisionTreeClassifier': DecisionTreeClassifier(),
        'DecisionTreeRegressor': DecisionTreeRegressor(),
        'ExtraTreeClassifier': ExtraTreeClassifier(),
        'AdaBoostClassifier': AdaBoostClassifier(),
        'AdaBoostRegressor': AdaBoostRegressor(),
        'BaggingClassifier': BaggingClassifier(),
        'BaggingRegressor': BaggingRegressor(),
        #'BaseEnsemble': BaseEnsemble(),
        'ExtraTreesClassifier': ExtraTreesClassifier(),
        'ExtraTreesRegressor': ExtraTreesRegressor(),
        'GradientBoostingClassifier': GradientBoostingClassifier(),
        'GradientBoostingRegressor': GradientBoostingRegressor(),
        'IsolationForest': IsolationForest(),
        'RandomForestClassifier': RandomForestClassifier(),
        'RandomForestRegressor': RandomForestRegressor(),
        'RandomTreesEmbedding': RandomTreesEmbedding(),
        #'VotingClassifier': VotingClassifier(),
        'BaseEstimator': BaseEstimator(),
        'ClassifierMixin': ClassifierMixin(),
        'LabelBinarizer': LabelBinarizer(),
        'MetaEstimatorMixin': MetaEstimatorMixin(),
        #'OneVsOneClassifier': OneVsOneClassifier(),
        #'OneVsRestClassifier': OneVsRestClassifier(),
        #'OutputCodeClassifier': OutputCodeClassifier(),
        'Parallel': Parallel(),
        #'ABCMeta': ABCMeta(),
        'BaseEstimator': BaseEstimator(),
        #'ClassifierChain': ClassifierChain(),
        'ClassifierMixin': ClassifierMixin(),
        'MetaEstimatorMixin': MetaEstimatorMixin(),
        #'MultiOutputClassifier': MultiOutputClassifier(),
        #'MultiOutputEstimator': MultiOutputEstimator(),
        #'MultiOutputRegressor': MultiOutputRegressor(),
        'Parallel': Parallel(),
        'RegressorMixin': RegressorMixin(),
        'LabelPropagation': LabelPropagation(),
        'LabelSpreading': LabelSpreading(),
        'BaseEstimator': BaseEstimator(),
        'IsotonicRegression': IsotonicRegression(),
        'RegressorMixin': RegressorMixin(),
        'TransformerMixin': TransformerMixin(),
        'BernoulliRBM': BernoulliRBM(),
        'MLPClassifier': MLPClassifier(),
        'MLPRegressor': MLPRegressor()
    }
    return models
space with an ExtraTreesClassifier forests learned on the
original data.
"""
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import make_circles
from sklearn.ensemble import RandomTreesEmbedding, ExtraTreesClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.naive_bayes import BernoulliNB

# make a synthetic dataset
X, y = make_circles(factor=0.5, random_state=0, noise=0.05)

# use RandomTreesEmbedding to transform data
hasher = RandomTreesEmbedding(n_estimators=10, random_state=0, max_depth=3)
X_transformed = hasher.fit_transform(X)

# Visualize result after dimensionality reduction using truncated SVD
svd = TruncatedSVD(n_components=2)
X_reduced = svd.fit_transform(X_transformed)

# Learn a Naive Bayes classifier on the transformed data
nb = BernoulliNB()
nb.fit(X_transformed, y)

# Learn an ExtraTreesClassifier for comparison
trees = ExtraTreesClassifier(max_depth=3, n_estimators=10, random_state=0)
trees.fit(X, y)

# scatter plot of original and reduced data
        desc_te[nem][i] = des_te

        i = i + 1

#randomly select 100k SIFT descriptors for clustering
desc_sel = np.concatenate(desc_sl)
rand100k = random.sample(range(0, len(desc_sel)), 100000)
desc_sel_100k = desc_sel[rand100k, :]

#The dimensionality of the resulting representation is
#n_out <= n_estimators * max_leaf_nodes. If max_leaf_nodes == None,
#the number of leaf nodes is at most n_estimators * 2 ** max_depth.

data_tr_rf = []
data_te_rf = []
codebook = RandomTreesEmbedding(n_estimators=100,
                                max_depth=20,
                                min_samples_split=3,
                                max_leaf_nodes=50,
                                n_jobs=-1).fit(desc_sel)
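# Aside (not part of the original snippet): the bound stated in the comment above
# can be checked directly, since transform() yields one indicator column per leaf
# and the codebook uses n_estimators=100, max_leaf_nodes=50.
n_out = codebook.transform(desc_sel[:1]).shape[1]
assert n_out <= 100 * 50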

for nem in desc_tr.keys():  #keys are the same for training and test
    i = 0
    while i < len(desc_tr[nem]):
        #training data
        this_col = desc_tr[nem][i]  #get the image we want
        hp = codebook.transform(this_col)
        hp2 = np.asarray(hp.sum(axis=0).ravel()).flatten()
        data_tr_rf.append(hp2)

        #test data
        this_img = desc_te[nem][i]
        hp = codebook.transform(this_img)
Example #29
def test_random_hasher_sparse_data():
    X, y = datasets.make_multilabel_classification(random_state=0)
    hasher = RandomTreesEmbedding(n_estimators=30, random_state=1)
    X_transformed = hasher.fit_transform(X)
    X_transformed_sparse = hasher.fit_transform(csc_matrix(X))
    assert_array_equal(X_transformed_sparse.toarray(), X_transformed.toarray())
Example #30
def RandomForest_Codebook(num_features, num_descriptors):
    # root folder with images
    folder_name = 'data/Caltech_101/101_ObjectCategories'
    # list of folders of images classes
    class_list = os.listdir(folder_name)
    # macOS: discard '.DS_Store' file
    if '.DS_Store' in class_list:
        class_list.remove('.DS_Store')

    # SIFT feature extractor
    sift = cv2.xfeatures2d.SIFT_create()

    # TRAINING
    # list of descriptors
    descriptors_train = []
    raw_train = defaultdict(dict)
    # iterate over image classes
    for c in range(len(class_list)):
        # subfolder pointer
        sub_folder_name = os.path.join(folder_name, class_list[c])
        # filter non-images files out
        img_list = glob.glob(os.path.join(sub_folder_name, '*.jpg'))
        # shuffle images to break correlation
        np.random.shuffle(img_list)
        # training examples
        img_train = img_list[:15]
        # iterate over image samples of a class
        for i in range(len(img_train)):
            # fetch image sample
            raw_img = cv2.imread(img_train[i])
            img = raw_img.copy()
            # convert to gray scale for SIFT compatibility
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            # apply SIFT algorithm
            kp, des = sift.detectAndCompute(gray, None)
            # store descriptors
            raw_train[c][i] = des
            for d in des:
                descriptors_train.append(d)
    # NumPy-friendly array of descriptors
    descriptors_train = np.asarray(descriptors_train)
    # random selection of descriptors WITHOUT REPLACEMENT
    descriptors_random = descriptors_train[np.random.choice(
        len(descriptors_train), min(len(descriptors_train),
                                    num_descriptors),
        replace=False)]

    # TESTING
    raw_test = defaultdict(dict)
    # iterate over image classes
    for c in range(len(class_list)):
        # subfolder pointer
        sub_folder_name = os.path.join(folder_name, class_list[c])
        # filter non-images files out
        img_list = glob.glob(os.path.join(sub_folder_name, '*.jpg'))
        # testing examples
        img_test = img_list[15:30]
        # iterate over image samples of a class
        for i in range(len(img_test)):
            # fetch image sample
            raw_img = cv2.imread(img_test[i])
            img = raw_img.copy()
            # convert to gray scale for SIFT compatibility
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            # apply SIFT algorithm
            kp, des = sift.detectAndCompute(gray, None)
            # store descriptors
            raw_test[c][i] = des

    # Random Forest codebook (Random Trees Embedding)
    codebook_algorithm = RandomTreesEmbedding(
        n_estimators=num_features).fit(descriptors_random)

    n_out = codebook_algorithm.transform(raw_train[0][0]).sum(axis=0).shape[1]

    # vector quantisation
    data_train = np.zeros(
        (len(class_list)*15, n_out+1))

    for i in range(len(class_list)):
        for j in range(15):
            # set features
            data_train[15 * (i)+j, :-1] = codebook_algorithm.transform(
                raw_train[i][j]).sum(axis=0).ravel()
            # set label
            data_train[15*(i)+j, -1] = i

    # vector quantisation
    data_query = np.zeros(
        (len(class_list)*15, n_out+1))

    for i in range(len(class_list)):
        for j in range(15):
            # set features
            data_query[15 *
                       (i)+j, :-1] = codebook_algorithm.transform(
                raw_test[i][j]).sum(axis=0).ravel()
            # set label
            data_query[15*(i)+j, -1] = i

    return data_train, data_query