def get_new_model(self, X, Y, corr_mat):
        # cap tree depth at log2(n_samples) and pick a random depth in [1, max_depth]
        max_depth = int(np.log2(X.shape[0]))
        depth = np.random.randint(max_depth) + 1
        # Create Model
        hasher = RandomTreesEmbedding(n_estimators=1, max_depth=depth)
        hasher.fit(X)
        x_transformed = hasher.transform(X)
        x_trans_dense = x_transformed.todense()
        y_transformed = hasher.transform(Y)
        y_trans_dense = y_transformed.todense()
        for i in range(x_trans_dense.shape[1]):
            # print(x_trans_dense)
            index_array_x = np.where(x_trans_dense[:, i] == 1.0)[0]

            index_array_y = np.where(y_trans_dense[:, i] == 1.0)[0]
            # print("Index array ", i, index_array)
            for idx in index_array_y:
                corr_mat[idx, index_array_x] += 1
        # corr_mat is updated in place, so there is nothing to return
        return
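The leaf co-occurrence count built above can be reproduced outside the class in a few lines; a minimal sketch, assuming X and Y are 2-D NumPy arrays with the same number of features:

import numpy as np
from sklearn.ensemble import RandomTreesEmbedding

rng = np.random.RandomState(0)
X, Y = rng.rand(20, 3), rng.rand(10, 3)
hasher = RandomTreesEmbedding(n_estimators=1, max_depth=3, random_state=0).fit(X)
x_leaf = np.asarray(hasher.transform(X).todense())
y_leaf = np.asarray(hasher.transform(Y).todense())
# corr_mat[i, j] counts how many leaves Y[i] and X[j] share (0 or 1 for a single tree)
corr_mat = y_leaf @ x_leaf.T
print(corr_mat.shape)  # (10, 20)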
class RandomTreesEmbeddingPrim(primitive):
    def __init__(self, random_state=0):
        super(RandomTreesEmbeddingPrim,
              self).__init__(name='RandomTreesEmbedding')
        self.id = 54
        self.PCA_LAPACK_Prim = []
        self.type = 'feature engineering'
        self.description = "FastICA: a fast algorithm for Independent Component Analysis."
        self.hyperparams_run = {'default': True}
        self.pca = RandomTreesEmbedding(random_state=random_state)
        self.accept_type = 'c_t'

    def can_accept(self, data):
        return self.can_accept_c(data)

    def is_needed(self, data):
        # data = handle_data(data)
        return True

    def fit(self, data):
        data = handle_data(data)
        self.pca.fit(data['X'])

    def produce(self, data):
        output = handle_data(data)
        cols = list(output['X'].columns)
        code = ''.join(word[0] for word in cols)[:10]
        result = self.pca.transform(output['X']).toarray()
        new_cols = list(map(str, list(range(result.shape[1]))))
        cols = ["{}_rfembdng{}".format(x, code) for x in new_cols]
        output['X'] = pd.DataFrame(result, columns=cols)
        final_output = {0: output}
        return final_output
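A standalone sketch of the transform-and-rename logic inside produce(), using plain pandas and scikit-learn (handle_data and the primitive base class belong to the surrounding framework and are only assumed here):

import pandas as pd
from sklearn.ensemble import RandomTreesEmbedding

X = pd.DataFrame({'age': [1, 2, 3, 4], 'balance': [10.0, 20.0, 15.0, 5.0]})
emb = RandomTreesEmbedding(n_estimators=5, random_state=0)
result = emb.fit_transform(X).toarray()

code = ''.join(word[0] for word in X.columns)[:10]                 # 'ab'
cols = ["{}_rfembdng{}".format(i, code) for i in range(result.shape[1])]
X_new = pd.DataFrame(result, columns=cols)
print(X_new.shape)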
Example #3
    def random_forest_embedding(self, data, n_estimators=30, random_state=0, max_depth=3, min_samples_leaf=1):
        """
        learn a density with random forest representation
        """
        """
        scikit-learn only supports axis-aligned separation; let's first stick to this and see how it works
        """
        # n_estimators = 400
        # random_state = 0
        # max_depth = 5
        rf_mdl = RandomTreesEmbedding(
            n_estimators=n_estimators,
            random_state=random_state,
            max_depth=max_depth,
            min_samples_leaf=min_samples_leaf)
        rf_mdl.fit(data)

        indices = rf_mdl.apply(data)
        samples_by_node = defaultdict(list)
        idx_by_node = defaultdict(list)
        #kde_by_node = defaultdict(KernelDensity)

        for idx, sample, est_data in zip(range(len(data)), data, indices):
            for est_ind, leaf in enumerate(est_data):
                samples_by_node[ est_ind, leaf ].append(sample)
                idx_by_node[ est_ind, leaf ].append(idx)

        res_mdl = dict()
        res_mdl['rf_mdl'] = rf_mdl
        res_mdl['samples_dict'] = samples_by_node
        res_mdl['idx_dict'] = idx_by_node
        # res_mdl['kde_dict'] = kde_by_node
        return res_mdl
Example #4
def train():
    embedder = RandomTreesEmbedding(n_estimators=10,
                                    random_state=1,
                                    max_leaf_nodes=30)
    import retro

    raw_env = retro.make("SuperMarioBros-Nes")
    env = FrameStack(raw_env, 4)
    first_obs = env.reset()
    index_right = raw_env.buttons.index("RIGHT")
    index_a = raw_env.buttons.index("A")
    index_b = raw_env.buttons.index("B")
    all_observations = []
    for i in range(100):
        action = [0] * 9
        action[index_right] = 1
        action[index_b] = 1
        action[index_a] = 1 if random.random() > 0.1 else 0
        obs, reward, end_of_episode, aditional_information = env.step(action)
        flat = np.array(obs).flatten()
        all_observations.append(flat)
        if end_of_episode:
            print("dead")
            env.reset()
        if i % 300 == 0:
            env.render()
    raw_env.render(close=True)
    embedder.fit(all_observations)
    print("done")
    with open("data/embedder2.pickle", "wb") as pickle_outfile:
        pickle.dump(embedder, pickle_outfile)
Example #5
def get_rf_codebook_times():
    # note: desc_tr, desc_te and n_leafs are assumed to be defined at module level

    print('Computing RF Codebook...')

    # Reformat the training and testing data
    for i in range(10):
        for n in range(15):
            if i == 0 and n == 0:
                data_train = desc_tr[i][n]
                data_test = desc_te[i][n]
            else:
                data_train = np.hstack((data_train, desc_tr[i][n]))
                data_test = np.hstack((data_test, desc_te[i][n]))
    data_train = data_train.T
    data_test = data_test.T

    # Compute the random forest
    max_depth = 10
    times = []
    vocabulary_sizes = [
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 17, 20, 24, 28, 33, 39, 45, 50,
        60, 75, 100, 150, 200, 250, 300, 350, 400, 500, 600, 750, 850, 1000
    ]
    for n_estimators in vocabulary_sizes:
        start_time = time.time()
        RFE = RandomTreesEmbedding(n_estimators=n_estimators,
                                   max_depth=max_depth,
                                   max_leaf_nodes=n_leafs,
                                   random_state=0,
                                   n_jobs=3)

        RFE.fit(data_train)
        times.append(time.time() - start_time)
    return times
Example #6
def random_forest_embedding(data,
                            n_estimators=400,
                            random_state=0,
                            max_depth=5,
                            min_samples_leaf=1):
    """
    learn a density with random forest representation
    """
    """
    scikit-learn only supports axis-aligned separation; let's first stick to this and see how it works
    """
    # n_estimators = 400
    # random_state = 0
    # max_depth = 5
    rf_mdl = RandomTreesEmbedding(n_estimators=n_estimators,
                                  random_state=random_state,
                                  max_depth=max_depth,
                                  min_samples_leaf=min_samples_leaf)
    rf_mdl.fit(data)

    # forestClf.fit(trainingData, trainingLabels)
    # indices = forestClf.apply(trainingData)
    # samples_by_node = defaultdict(list)
    # for est_ind, est_data in enumerate(indices.T):
    # for sample_ind, leaf in enumerate(est_data):
    # samples_by_node[ est_ind, leaf ].append(sample_ind)
    # indexOfSamples = samples_by_node[0,10]
    # # samples_by_node[treeIndex, leafIndex within that tree]
    # leafNodeSamples = trainingAngles[indexOfSamples]
    # kde = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(leafNodeSamples)

    indices = rf_mdl.apply(data)
    samples_by_node = defaultdict(list)
    idx_by_node = defaultdict(list)
    kde_by_node = defaultdict(KernelDensity)

    for idx, sample, est_data in zip(range(len(data)), data, indices):
        for est_ind, leaf in enumerate(est_data):
            samples_by_node[est_ind, leaf].append(sample)
            idx_by_node[est_ind, leaf].append(idx)

    #Kernel Density Estimation for each leaf node
    # for k,v in samples_by_node.iteritems():
    #     est_ind, leaf = k
    # params = {'bandwidth': np.logspace(-1, 1, 20)}
    # grid = GridSearchCV(KernelDensity(), params)
    # grid.fit(v)

    #     kde_by_node[ est_ind, leaf ] = grid.best_estimator_

    res_mdl = dict()
    res_mdl['rf_mdl'] = rf_mdl
    res_mdl['samples_dict'] = samples_by_node
    res_mdl['idx_dict'] = idx_by_node
    # res_mdl['kde_dict'] = kde_by_node
    return res_mdl
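A short usage sketch of the function above on synthetic data (keys of the returned dictionaries are (tree_index, leaf_id) pairs); it assumes the same module-level imports the function relies on (numpy, defaultdict, KernelDensity, RandomTreesEmbedding):

import numpy as np

data = np.random.RandomState(0).randn(200, 3)
mdl = random_forest_embedding(data, n_estimators=10, max_depth=3)
leaves_in_tree0 = [leaf for (est, leaf) in mdl['samples_dict'] if est == 0]
print('tree 0 has', len(leaves_in_tree0), 'populated leaves')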
Example #7
def random_forest_embedding(data, n_estimators=400, random_state=0, max_depth=5, min_samples_leaf=1):
    """
    learn a density with random forest representation
    """
    """
    scikit-learn only supports axis-aligned separation; let's first stick to this and see how it works
    """
    # n_estimators = 400
    # random_state = 0
    # max_depth = 5
    rf_mdl = RandomTreesEmbedding(
        n_estimators=n_estimators, 
        random_state=random_state, 
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf)
    rf_mdl.fit(data)
    
    # forestClf.fit(trainingData, trainingLabels)
    # indices = forestClf.apply(trainingData)
    # samples_by_node = defaultdict(list)
    # for est_ind, est_data in enumerate(indices.T):
    # for sample_ind, leaf in enumerate(est_data):
    # samples_by_node[ est_ind, leaf ].append(sample_ind)
    # indexOfSamples = samples_by_node[0,10]
    # # samples_by_node[treeIndex, leafIndex within that tree]
    # leafNodeSamples = trainingAngles[indexOfSamples]
    # kde = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(leafNodeSamples)

    indices = rf_mdl.apply(data)
    samples_by_node = defaultdict(list)
    idx_by_node = defaultdict(list)
    kde_by_node = defaultdict(KernelDensity)

    for idx, sample, est_data in zip(range(len(data)), data, indices):
        for est_ind, leaf in enumerate(est_data):
            samples_by_node[ est_ind, leaf ].append(sample)
            idx_by_node[ est_ind, leaf ].append(idx)

        
    #Kernel Density Estimation for each leaf node
    # for k,v in samples_by_node.iteritems():
    #     est_ind, leaf = k
          # params = {'bandwidth': np.logspace(-1, 1, 20)}
          # grid = GridSearchCV(KernelDensity(), params)
          # grid.fit(v)

    #     kde_by_node[ est_ind, leaf ] = grid.best_estimator_

    res_mdl = dict()
    res_mdl['rf_mdl'] = rf_mdl
    res_mdl['samples_dict'] = samples_by_node
    res_mdl['idx_dict'] = idx_by_node
    # res_mdl['kde_dict'] = kde_by_node
    return res_mdl
Example #8
def RandomTreesEmbeddingAlgo(x_train_vft, y_train, x_test_vft, y_test, vec):
    print("Random Trees Embedding")
    # NOTE: RandomTreesEmbedding is an unsupervised transformer: it ignores y and
    # has no predict(), so the calls below fail unless a classifier is substituted
    # (see the sketch after this example).
    rte = RandomTreesEmbedding(n_jobs=2, random_state=0)
    rte.fit(x_train_vft, y_train)
    y_predict_class = rte.predict(x_test_vft)
    print("Confusion Matrix")
    print(confusion_matrix(np.array(y_test), np.array(y_predict_class)))
    print('Accuracy Score :', accuracy_score(y_test, y_predict_class))
    print('ROC(Receiver Operating Characteristic) and AUC(Area Under Curve)', roc_auc_score(y_test, y_predict_class))
    print('Average Precision Score:', average_precision_score(y_test, y_predict_class))
    if rte.predict(vec) == [1]:
        return "Positive"
    else:
        return "Negative"
class _RandomTreesEmbeddingImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
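A minimal usage sketch of the wrapper above, assuming Op aliases sklearn.ensemble.RandomTreesEmbedding (as the class name suggests):

import numpy as np
from sklearn.ensemble import RandomTreesEmbedding as Op

X = np.random.RandomState(0).rand(50, 4)
impl = _RandomTreesEmbeddingImpl(n_estimators=5, random_state=0)
X_emb = impl.fit(X).transform(X)  # sparse one-hot leaf encoding
print(X_emb.shape)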
Example #10
def learn_representation(represent_train, represent_test,labels_train,labels_test,train_id,test_id,depth, ntree,random_seed,is_terminal=True,normal=False):
	rt = RandomTreesEmbedding(max_depth=depth,n_estimators=ntree,random_state =random_seed,n_jobs=-1)

	traincv=represent_train
	testcv=represent_test
	trainind=np.unique(train_id)
	testind=np.unique(test_id)

	trainlabels=labels_train
	testlabels=labels_test
	
	randTrees=rt.fit(traincv.values)
	
	if is_terminal:
		trainRep=randTrees.transform(traincv.values)
		testRep=randTrees.transform(testcv.values)
	else:
		trainRep=randTrees.decision_path(traincv)[0]
		testRep=randTrees.decision_path(testcv)[0]

	newId=np.unique(train_id)
	Mask = sparse.csr_matrix((np.ones(traincv.shape[0],int),(train_id, np.arange(traincv.shape[0]))), shape=(newId.shape[0],traincv.shape[0]))

	trainbow = Mask * trainRep

	newId=np.unique(test_id)
	Mask = sparse.csr_matrix((np.ones(testcv.shape[0],int),(test_id, np.arange(testcv.shape[0]))), shape=(newId.shape[0],testcv.shape[0]))

	testbow = Mask * testRep              

	if normal:
		trainbow = normalize(trainbow, norm='l1', axis=1)
		testbow = normalize(testbow, norm='l1', axis=1)
	
	return trainbow,testbow
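A hypothetical usage sketch for learn_representation: ten training instances grouped into two bags and four test instances in one bag. It reuses the module-level imports the function above already depends on (pandas, numpy, scipy.sparse, sklearn's normalize, RandomTreesEmbedding):

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
train_df, test_df = pd.DataFrame(rng.rand(10, 3)), pd.DataFrame(rng.rand(4, 3))
train_id = np.repeat([0, 1], 5)   # instance -> bag assignment
test_id = np.zeros(4, dtype=int)
trainbow, testbow = learn_representation(train_df, test_df, None, None,
                                         train_id, test_id,
                                         depth=3, ntree=5, random_seed=0)
print(trainbow.shape, testbow.shape)   # one bag-of-leaves row per bag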
Example #11
def learn_representation_sparse(represent_train, represent_test,labels_train,labels_test,train_id,test_id,depth, ntree,random_seed,is_terminal=True,normal=False):
	rt = RandomTreesEmbedding(max_depth=depth,n_estimators=ntree,random_state =random_seed,n_jobs=-1)

	traincv=represent_train
	testcv=represent_test
	trainind=np.unique(train_id)
	testind=np.unique(test_id)

	trainlabels=labels_train
	testlabels=labels_test
	
	randTrees=rt.fit(traincv.values)
	
	trainRep=randTrees.apply(traincv.values)
	testRep=randTrees.apply(testcv.values)
            
	trainbow = np.apply_along_axis(train_f, 0, trainRep,train_id)
	trainbow = sparse.hstack(trainbow)
	
	testbow = np.apply_along_axis(train_f, 0, testRep,test_id)
	testbow = sparse.hstack(testbow)
	
	if normal:
		trainbow = normalize(trainbow, norm='l1', axis=1)
		testbow = normalize(testbow, norm='l1', axis=1)
	
	return trainbow,testbow
Example #12
class TopRandomTreesEmbedding(BaseEstimator,TransformerMixin):
    def __init__(self, k=100,n_estimators=20, max_depth=10):
        self.k = k
        self.n_estimators = n_estimators
        self.max_depth = max_depth

    def fit(self, X, y):
        self._rtree = RandomTreesEmbedding(n_estimators=self.n_estimators, max_depth=self.max_depth,sparse_output=False) #sparse_output=False,,sparse_output=False
        self._rtree.fit(X, y)
        non_zero_indics = np.nonzero(self._rtree.feature_importances_)[0]
        important_indics = self._rtree.feature_importances_.argsort()[::-1][:self.k]
        self.important_indices = np.intersect1d(important_indics,non_zero_indics)
        return self

    def transform(self, X):
        return X[:,self.important_indices].toarray()
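A usage sketch with a sparse input matrix (transform() above calls .toarray(), so X is expected to be scipy-sparse); it assumes the same imports as the class above. Note the ranking comes from feature_importances_ of trees grown against random targets, so treat it as a heuristic:

import numpy as np
from scipy import sparse

rng = np.random.RandomState(0)
X = sparse.csr_matrix(rng.rand(100, 50))
y = rng.randint(0, 2, 100)
selector = TopRandomTreesEmbedding(k=10, n_estimators=20, max_depth=5)
X_top = selector.fit(X, y).transform(X)
print(X_top.shape)   # at most (100, 10) columns retained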
Example #13
 def cluster_testing(self, testing):
     '''Create RandomTreesEmbedding of data'''
     clf = RandomTreesEmbedding(n_estimators=512,
                                random_state=self.seed,
                                max_depth=5)
     '''Fit testing data to training model'''
     clf.fit = self.clf.fit(testing)
     X_transformed = self.clf.fit_transform(testing)
     n_components = 2
     '''SVD transform data'''
     svd = TruncatedSVD(n_components=n_components)
     svd.clf = svd.fit(X_transformed)
     svd.model = svd.clf.transform(X_transformed)
     '''Train transformed data using original model'''
     train_transformed = clf.fit.transform(self.train_matrix)
     train_model = svd.clf.transform(train_transformed)
     '''Generate One Class SVM rejection criteria'''
     (clf_OCSVM_t, OCSVMmodel_t
      ) = self.tools.determine_testing_data_similarity(train_model)
     predicted = []
     '''Remove testing compounds outside rejection margin'''
     for i in range(len(svd.model)):
         p = OCSVMmodel_t.predict(svd.model[i, :].reshape(1, -1))
         pred = OCSVMmodel_t.decision_function(svd.model[i, :].reshape(
             1, -1)).ravel()
         if (p == 1):
             predicted.append(i)
     return predicted
Example #14
def learn_representation_sparse_new(represent_train,labels_train,train_id,depth, ntree,random_seed,is_terminal=True,normal=False):
	
	rt = RandomTreesEmbedding(max_depth=depth,n_estimators=ntree,random_state =random_seed,n_jobs=-1)
	
	traincv=represent_train
	trainind=np.unique(train_id)
	trainlabels=labels_train

	randTrees=rt.fit(traincv.values)	
	trainRep=randTrees.apply(traincv.values)

	allRep = trainRep
	allids = np.array(train_id)

	ids=np.tile(allids,ntree)
		
	increments=np.arange(0,ntree)*(2**depth)
	allRep=allRep+increments
	node_ids=allRep.flatten('F')
	
	data=np.repeat(1,len(ids))
	allbow=sparse.coo_matrix((data,(ids,node_ids)), dtype=np.int8).tocsr()
	select_ind =trainind.shape[0]

	allbow = normalize(allbow, norm='l1', axis=1)
		
	return allbow,randTrees
Example #15
def learn_representation_sparse_2_complex(represent_train,
                                          represent_test,
                                          labels_train,
                                          labels_test,
                                          train_id,
                                          test_id,
                                          depth,
                                          ntree,
                                          random_seed,
                                          is_terminal=True,
                                          normal=False):

    rt = RandomTreesEmbedding(max_depth=depth,
                              n_estimators=ntree,
                              random_state=random_seed,
                              n_jobs=-1)

    traincv = represent_train
    trainind = np.unique(train_id)
    trainlabels = labels_train

    randTrees = rt.fit(traincv.values)
    trainRep = randTrees.apply(traincv.values)

    if represent_test is not None:

        testcv = represent_test
        testind = np.unique(test_id)
        testlabels = labels_test
        test_time = time.time()
        testRep = randTrees.apply(testcv.values)
        test_time = time.time() - test_time

        allRep = np.vstack((trainRep, testRep))
        allids = np.concatenate(
            (np.array(train_id), np.array(test_id) + np.max(train_id) + 1),
            axis=0)
    else:
        allRep = trainRep
        allids = np.array(train_id)

    ids = np.tile(allids, ntree)

    increments = np.arange(0, ntree) * (2**depth)
    allRep = allRep + increments
    node_ids = allRep.flatten('F')

    data = np.repeat(1, len(ids))

    allbow = sparse.coo_matrix((data, (ids, node_ids)), dtype=np.int8).tocsr()

    select_ind = trainind.shape[0]
    return allbow[:select_ind, :], allbow[select_ind:, :], test_time
Example #16
    def fit(self, X, y=None, max_depth=5):
        self.unsupervised = (y is None)
        self.n_attributes = X.shape[1]
        self.in_size = X.shape[0]
        if y is None:
            forest = RandomTreesEmbedding(self.out_size, max_depth=max_depth)
            forest.fit(X)
        else:
            forest = RandomForestClassifier(n_estimators=self.out_size,
                                            max_depth=max_depth)
            forest.fit(X, y)
        for i in range(self.out_size):
            self.trees.append(
                CompletelyRandomTree(forest.estimators_[i], self.n_attributes))

        self.global_lower_bounds = np.min(X, axis=0).astype(np.double)
        self.global_upper_bounds = np.max(X, axis=0).astype(np.double)

        self.default_path_rule = PathRule(self.n_attributes)
        self.default_path_rule.set_global_bounds(self.global_lower_bounds,
                                                 self.global_upper_bounds)
Example #17
def rf_codebook(desc_tr,
                desc_te,
                desc_sizes,
                max_depth,
                n_estimators,
                n_leafs,
                return_time=False):

    print('Computing RF Codebook...')

    # Reformat the training and testing data
    for i in range(10):
        for n in range(15):
            if i == 0 and n == 0:
                data_train = desc_tr[i][n]
                data_test = desc_te[i][n]
            else:
                data_train = np.hstack((data_train, desc_tr[i][n]))
                data_test = np.hstack((data_test, desc_te[i][n]))
    data_train = data_train.T
    data_test = data_test.T

    # Compute the random forest
    # max_depth = 10
    # n_estimators = 100
    RFE = RandomTreesEmbedding(n_estimators=n_estimators,
                               max_depth=max_depth,
                               max_leaf_nodes=n_leafs,
                               random_state=0,
                               n_jobs=3)

    RFE.fit(data_train)

    # Compute the bag of words for each of the predictions
    histogram_train = bag_of_words_rf_jorge(desc_tr, desc_sizes, RFE, n_leafs)
    histogram_test = bag_of_words_rf_jorge(desc_te, desc_sizes, RFE, n_leafs)

    print('Done')

    return histogram_train, histogram_test
Example #18
def test_random_hasher():
    # test random forest hashing on circles dataset
    # make sure that it is linearly separable.
    # even after projected to two pca dimensions
    hasher = RandomTreesEmbedding(n_estimators=30, random_state=0)
    X, y = datasets.make_circles(factor=0.5)
    X_transformed = hasher.fit_transform(X)

    # test fit and transform:
    hasher = RandomTreesEmbedding(n_estimators=30, random_state=0)
    assert_array_equal(hasher.fit(X).transform(X).toarray(), X_transformed.toarray())

    # one leaf active per data point per forest
    assert_equal(X_transformed.shape[0], X.shape[0])
    assert_array_equal(X_transformed.sum(axis=1), hasher.n_estimators)
    pca = RandomizedPCA(n_components=2)  # deprecated class; newer scikit-learn uses PCA(svd_solver='randomized')
    X_reduced = pca.fit_transform(X_transformed)
    linear_clf = LinearSVC()
    linear_clf.fit(X_reduced, y)
    assert_equal(linear_clf.score(X_reduced, y), 1.0)
Example #19
def test_random_hasher():
    # test random forest hashing on circles dataset
    # make sure that it is linearly separable.
    # even after projected to two pca dimensions
    hasher = RandomTreesEmbedding(n_estimators=30, random_state=0)
    X, y = datasets.make_circles(factor=0.5)
    X_transformed = hasher.fit_transform(X)

    # test fit and transform:
    hasher = RandomTreesEmbedding(n_estimators=30, random_state=0)
    assert_array_equal(
        hasher.fit(X).transform(X).toarray(), X_transformed.toarray())

    # one leaf active per data point per forest
    assert_equal(X_transformed.shape[0], X.shape[0])
    assert_array_equal(X_transformed.sum(axis=1), hasher.n_estimators)
    pca = RandomizedPCA(n_components=2)  # deprecated class; newer scikit-learn uses PCA(svd_solver='randomized')
    X_reduced = pca.fit_transform(X_transformed)
    linear_clf = LinearSVC()
    linear_clf.fit(X_reduced, y)
    assert_equal(linear_clf.score(X_reduced, y), 1.)
Example #20
def test_random_hasher():
    # test random forest hashing on circles dataset
    # make sure that it is linearly separable.
    # even after projected to two SVD dimensions
    # Note: Not all random_states produce perfect results.
    hasher = RandomTreesEmbedding(n_estimators=30, random_state=1)
    X, y = datasets.make_circles(factor=0.5)
    X_transformed = hasher.fit_transform(X)

    # test fit and transform:
    hasher = RandomTreesEmbedding(n_estimators=30, random_state=1)
    assert_array_equal(
        hasher.fit(X).transform(X).toarray(), X_transformed.toarray())

    # one leaf active per data point per forest
    assert X_transformed.shape[0] == X.shape[0]
    assert_array_equal(X_transformed.sum(axis=1), hasher.n_estimators)
    svd = TruncatedSVD(n_components=2)
    X_reduced = svd.fit_transform(X_transformed)
    linear_clf = LinearSVC()
    linear_clf.fit(X_reduced, y)
    assert linear_clf.score(X_reduced, y) == 1.
Example #21
 def cluster_testing(self, testing):
     '''Create RandomTreesEmbedding of data'''
     clf = RandomTreesEmbedding(n_estimators=512, random_state=self.seed, max_depth=5)
     '''Fit testing data to training model'''
     clf.fit = self.clf.fit(testing)
     X_transformed = self.clf.fit_transform(testing)
     n_components = 2
     '''SVD transform data'''
     svd = TruncatedSVD(n_components=n_components)
     svd.clf = svd.fit(X_transformed)
     svd.model = svd.clf.transform(X_transformed)
     '''Train transformed data using original model'''
     train_transformed = clf.fit.transform(self.train_matrix)
     train_model = svd.clf.transform(train_transformed)
     '''Generate One Class SVM rejection criteria'''
     (clf_OCSVM_t, OCSVMmodel_t) = self.tools.determine_testing_data_similarity(train_model)
     predicted = []
     '''Remove testing compounds outside rejection margin'''
     for i in range(len(svd.model)):
         p = OCSVMmodel_t.predict(svd.model[i, :].reshape(1, -1))
         pred = OCSVMmodel_t.decision_function(svd.model[i, :].reshape(1, -1)).ravel()
         if (p == 1):
             predicted.append(i)
     return predicted
Example #22
    --n_estimators=<n>    Number of trees in the forest [default: 10]
"""


import pandas as pd
import sys
import numpy as np
import cPickle
from sklearn.ensemble import RandomTreesEmbedding
from docopt import docopt

arguments = docopt(__doc__)
input_path = arguments["<training_set>"]
n = int(arguments["--n_estimators"])
output_path = arguments["<mapper_path>"]

print "Reading Data"
data = pd.read_csv(input_path,header=None).values[:,1:]


print "Constructing Mapper"
mapper = RandomTreesEmbedding(n_estimators=n)
mapper.fit(data)

print "Saving Mapper to {}".format(output_path)
with open(output_path,"w") as f:
    cPickle.dump(mapper,f)

    

Example #23
sample = pd.read_csv('Data/sampleSubmission.csv')

# drop ids and get labels
labels = train.target.values
train = train.drop('id', axis=1)
train = train.drop('target', axis=1)
test = test.drop('id', axis=1)

# scale features
scaler = StandardScaler()
train = scaler.fit_transform(train.astype(float))
test = scaler.transform(test.astype(float))

# random trees embedding
rte = RandomTreesEmbedding(n_estimators = 50, verbose = 1)
rte.fit(train)
tran = rte.apply(train)

# encode labels 
lbl_enc = LabelEncoder()
labels = lbl_enc.fit_transform(labels)

# set up datasets for cross eval
x_train, x_test, y_train, y_test = train_test_split(train, labels)
#label_binary = LabelBinarizer()
#y_test = label_binary.fit_transform(y_test)

# train a logistic regression classifier
clf = LogisticRegression()
clf.fit(x_train, y_train)
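A quick sanity check on the held-out split could follow the fit above, e.g.:

print('validation accuracy:', clf.score(x_test, y_test))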
Example #24
class RandomTreesEmbeddingTransformation(Transformer):
    def __init__(self,
                 n_estimators=10,
                 max_depth=5,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 min_weight_fraction_leaf=1.0,
                 max_leaf_nodes='None',
                 sparse_output=True,
                 bootstrap='False',
                 n_jobs=-1,
                 random_state=1):
        super().__init__("random_trees_embedding", 18)
        self.input_type = [NUMERICAL, DISCRETE, CATEGORICAL]
        self.compound_mode = 'only_new'
        self.output_type = CATEGORICAL

        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.max_leaf_nodes = max_leaf_nodes
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.bootstrap = bootstrap
        self.sparse_output = sparse_output
        self.n_jobs = n_jobs
        self.random_state = random_state

    @ease_trans
    def operate(self, input_datanode: DataNode, target_fields=None):
        from sklearn.ensemble import RandomTreesEmbedding

        X, y = input_datanode.data
        if target_fields is None:
            target_fields = collect_fields(input_datanode.feature_types,
                                           self.input_type)
        X_new = X[:, target_fields]
        if not self.model:
            self.n_estimators = int(self.n_estimators)
            if check_none(self.max_depth):
                self.max_depth = None
            else:
                self.max_depth = int(self.max_depth)

            # Skip heavy computation: cap max_depth at 6 for larger datasets.
            if X.shape[0] > 5000:
                self.max_depth = min(6, self.max_depth)

            self.min_samples_split = int(self.min_samples_split)
            self.min_samples_leaf = int(self.min_samples_leaf)
            if check_none(self.max_leaf_nodes):
                self.max_leaf_nodes = None
            else:
                self.max_leaf_nodes = int(self.max_leaf_nodes)
            self.min_weight_fraction_leaf = float(
                self.min_weight_fraction_leaf)
            self.bootstrap = check_for_bool(self.bootstrap)

            self.model = RandomTreesEmbedding(
                n_estimators=self.n_estimators,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                max_leaf_nodes=self.max_leaf_nodes,
                sparse_output=self.sparse_output,
                n_jobs=self.n_jobs,
                random_state=self.random_state)

            self.model.fit(X_new)

        _X = self.model.transform(X_new).toarray()

        return _X

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None,
                                        optimizer='smac'):
        n_estimators = UniformIntegerHyperparameter(name="n_estimators",
                                                    lower=10,
                                                    upper=100,
                                                    default_value=10)
        max_depth = UniformIntegerHyperparameter(name="max_depth",
                                                 lower=2,
                                                 upper=10,
                                                 default_value=5)
        min_samples_split = UniformIntegerHyperparameter(
            name="min_samples_split", lower=2, upper=20, default_value=2)
        min_samples_leaf = UniformIntegerHyperparameter(
            name="min_samples_leaf", lower=1, upper=20, default_value=1)
        min_weight_fraction_leaf = Constant('min_weight_fraction_leaf', 1.0)
        max_leaf_nodes = UnParametrizedHyperparameter(name="max_leaf_nodes",
                                                      value="None")
        bootstrap = CategoricalHyperparameter('bootstrap', ['True', 'False'])
        cs = ConfigurationSpace()
        cs.add_hyperparameters([
            n_estimators, max_depth, min_samples_split, min_samples_leaf,
            min_weight_fraction_leaf, max_leaf_nodes, bootstrap
        ])
        return cs
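A sketch of drawing one random configuration from the space above and instantiating the transformer with it (assumes the ConfigSpace package and the framework classes this transformer depends on are importable):

cs = RandomTreesEmbeddingTransformation.get_hyperparameter_search_space()
config = cs.sample_configuration().get_dictionary()
transformer = RandomTreesEmbeddingTransformation(**config)
print(config)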
Example #25
max_leaf_nodes=None,
min_impurity_decrease=0.,
min_impurity_split=None,
bootstrap=True,
# whether sub-datasets are drawn with replacement (bootstrap); default True means yes
oob_score=False,
n_jobs=None,
random_state=None,
verbose=0,
warm_start=False,
class_weight=None)
'''
algo = RandomTreesEmbedding(n_estimators=100, max_depth=3)

# 5. Train the model
algo.fit(x_train, y_train)

# 6. Get the expanded (embedded) features directly from the model
x_train2 = algo.transform(x_train)
x_test2 = algo.transform(x_test)
print('size before embedding: {}, size after: {}'.format(x_train.shape, x_train2.shape))
print('size before embedding: {}, size after: {}'.format(x_test.shape, x_test2.shape))

# 8. Visualize the random forest
print('number of sub-estimators in the random forest: {}'.format(len(algo.estimators_)))
# Option 2: use the pydotplus plugin to export each tree directly to a PDF file
from sklearn import tree
import pydotplus

# feature_names=None, class_names=None give the names of the feature attributes and the target classes, respectively
for i in range(len(algo.estimators_)):
Example #26
## labels for the data
dic = pickle.load(open('letterdict_normalized.pickle'))
mypath = '/home/asriva20/SrivastavaA/Data/3_AD_Normal/'
names = [name for name in sorted(listdir(mypath))]
Y = [1 if n[2:8] in dic['AD'] else \
     0 if n[2:8] in  dic['Normal'] else \
    -1 for n in names]
Y = np.asarray(Y)

mat = sio.loadmat('X.mat')
X = mat['Data']
print np.shape(X)

forest = RandomTreesEmbedding(n_estimators=50, max_depth=3)
forest.fit(X)
print(forest.apply(X))
sum = 0
for tree in forest.estimators_:
    n_nodes = tree.tree_.node_count
    children_left = tree.tree_.children_left
    children_right = tree.tree_.children_right
    feature = tree.tree_.feature
    threshold = tree.tree_.threshold

    node_depth = np.zeros(shape=n_nodes)
    is_leaves = np.zeros(shape=n_nodes, dtype=bool)
    parent_id = {}
    # seed is root node and id is parent depth
    stack = [(0, -1)]
Example #27
class Clustering():
    def __init__(self, compounds, output=False, seed=False):
        np.random.seed(seed=seed)
        self.seed = seed
        self.compounds = compounds
        self.count = 0
        self.count_1 = 0
        self.output = output
        self.tools = clustertools()
        if self.output is not False:
            self.figures = clusterfigures(self.compounds)
        self.testcompound = []

    def cluster_training(self, train, distance=False):
        '''
        This is the basic clustering function
        '''
        self.train_matrix = train.train
        '''
        Step one is to make sure that there is a distance matrix in place.
        It is best to feed an existing distance matrix if one is available.
        '''
        if distance is False:
            self.p_feat_matrix = self.tools.pairwise_distance_matrix(train.train, 'jaccard')
        else:
            self.p_feat_matrix = distance
        '''
        Step two is to cluster your data using a random trees embedding. This is a
        random ensemble of trees: a transformation of the data into a
        high-dimensional, sparse space.
        '''
        self.clf = RandomTreesEmbedding(n_estimators=512, random_state=self.seed, max_depth=5)
        #self.clf.fit(self.train_matrix)
        X_transformed = self.clf.fit_transform(self.train_matrix)
        '''
        Step three performs truncated SVD (similar to PCA). It operates on the sample
        vectors directly, rather than the covariance matrix. It takes the first two
        components. Essentially this reduces the sparse embedding to a low dimensional
        representation.
        '''
        self.svd = TruncatedSVD(n_components=2)
        self.svd.clf = self.svd.fit(X_transformed)
        self.model = self.svd.clf.transform(X_transformed)
        '''
        The next step is to take the transformed model and the original dataset and
        determine the max silhouette_score of clusters
        '''
        (self.cluster_assignment,
         self.cluster_num,
         self.cluster_score) = self.tools.identify_accurate_number_of_clusters(self.model, self.compounds)
        self.individualclusters = []
        '''
        The individual datapoints are assessed with regard to the best clustering scheme
        '''
        for i in range(self.cluster_num):
            self.individualclusters.append([])
            for j in range(len(self.cluster_assignment)):
                if self.cluster_assignment[j] == i:
                    self.individualclusters[i].append(self.model[j, :])
            self.individualclusters[i] = np.array(self.individualclusters[i])
        '''
        Finally, this clustering scheme is used to generate a one class Support
        Vector Machine decision boundary.
        '''
        (self.clf_OCSVM,
         self.OCSVM_model) = self.tools.determine_test_similarity(self.individualclusters)

    def cluster_testing(self, testing):
        '''Create RandomTreesEmbedding of data'''
        clf = RandomTreesEmbedding(n_estimators=512, random_state=self.seed, max_depth=5)
        '''Fit testing data to training model'''
        clf.fit = self.clf.fit(testing)
        X_transformed = self.clf.fit_transform(testing)
        n_components = 2
        '''SVD transform data'''
        svd = TruncatedSVD(n_components=n_components)
        svd.clf = svd.fit(X_transformed)
        svd.model = svd.clf.transform(X_transformed)
        '''Train transformed data using original model'''
        train_transformed = clf.fit.transform(self.train_matrix)
        train_model = svd.clf.transform(train_transformed)
        '''Generate One Class SVM rejection criteria'''
        (clf_OCSVM_t, OCSVMmodel_t) = self.tools.determine_testing_data_similarity(train_model)
        predicted = []
        '''Remove testing compounds outside rejection margin'''
        for i in range(len(svd.model)):
            p = OCSVMmodel_t.predict(svd.model[i, :].reshape(1, -1))
            pred = OCSVMmodel_t.decision_function(svd.model[i, :].reshape(1, -1)).ravel()
            if (p == 1):
                predicted.append(i)
        return predicted
            trans_test = v.transform(X_test[f_cat])
            for nb in range(nb_comp):
                new_col = '{}_{:03d}'.format(k, nb + 1)
                X_train[new_col] = trans_train[:, nb]
                X_valid[new_col] = trans_valid[:, nb]
                X_test[new_col] = trans_test[:, nb]

        #known cluster
        f = 'f_clu_{:03d}'.format(n_clust)
        f_y_enc.append(f)
        X_train[f] = clust.fit_predict(X_train)
        X_valid[f] = clust.predict(X_valid)
        X_test[f] = clust.predict(X_test)

        #embed
        embed.fit(X_train)
        trans_train = embed.apply(X_train)
        trans_valid = embed.apply(X_valid)
        trans_test = embed.apply(X_test)

        for tree in range(trans_train.shape[1]):
            f = 'f_embed_{:04d}'.format(tree)
            f_y_enc.append(f)
            leaf_lbl = LabelEncoder()
            leaf_train = trans_train[:, tree].tolist()
            leaf_valid = trans_valid[:, tree].tolist()
            leaf_test = trans_test[:, tree].tolist()

            leaf_lbl.fit(leaf_train + leaf_valid + leaf_test)
            X_train[f] = leaf_lbl.transform(leaf_train)
            X_valid[f] = leaf_lbl.transform(leaf_valid)
n_estimator = 10
X, y = make_classification(n_samples=80000)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
# It is important to train the ensemble of trees on a different subset
# of the training data than the linear regression model to avoid
# overfitting, in particular if the total number of leaves is
# similar to the number of training samples
X_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train,
                                                            y_train,
                                                            test_size=0.5)

# Unsupervised transformation based on totally random trees
rt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator)
rt_lm = LogisticRegression()
rt.fit(X_train, y_train)
rt_lm.fit(rt.transform(X_train_lr), y_train_lr)

y_pred_rt = rt_lm.predict_proba(rt.transform(X_test))[:, 1]
fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)


# Supervised transformation based on random forests
rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
rf_enc = OneHotEncoder()
rf_lm = LogisticRegression()
rf.fit(X_train, y_train)
rf_enc.fit(rf.apply(X_train))
rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)

y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]
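A small numerical follow-up comparing the two pipelines (the original scikit-learn example plots the ROC curves instead):

from sklearn.metrics import roc_auc_score
print('RT embedding + LR AUC:', roc_auc_score(y_test, y_pred_rt))
print('RF apply + LR AUC    :', roc_auc_score(y_test, y_pred_rf_lm))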
class EnsembleIOC(BaseEstimator, RegressorMixin):

    def __init__(self,  n_estimators=20, 
                        max_depth=5, min_samples_split=10, min_samples_leaf=10,
                        random_state=0,
                        em_itrs=5,
                        regularization=0.05,
                        passive_dyn_func=None,
                        passive_dyn_ctrl=None,
                        passive_dyn_noise=None,
                        verbose=False):
        '''
        n_estimators        - number of ensembled models
        ...                 - a batch of parameters used for RandomTreesEmbedding, see relevant documents
        em_itrs             - maximum number of EM iterations to take
        regularization      - small positive scalar to prevent singularity of matrix inversion
        passive_dyn_func    - function to evaluate passive dynamics; None for MaxEnt model
        passive_dyn_ctrl    - function to return the control matrix which might depend on the state...
        passive_dyn_noise   - covariance of a Gaussian noise; only applicable when passive_dyn is Gaussian; None for MaxEnt model.
                                Note this implies a dynamical system with constant input gain; it could be extended to a state-dependent
                                input gain, in which case a covariance per data point would be needed.
        verbose             - output training information
        '''
        BaseEstimator.__init__(self)

        self.n_estimators=n_estimators
        self.max_depth=max_depth
        self.min_samples_split=min_samples_split
        self.min_samples_leaf=min_samples_leaf
        self.random_state=random_state
        self.em_itrs=em_itrs
        self.reg=regularization
        self.passive_dyn_func=passive_dyn_func
        self.passive_dyn_ctrl=passive_dyn_ctrl
        self.passive_dyn_noise=passive_dyn_noise
        self.verbose=verbose
        return

    def fit(self, X, y=None):
        '''
        y could be the array of starting state of the demonstrated trajectories/policies
        if it is None, it implicitly implies a MaxEnt model. Otherwise, it serves as the feature mapping
        of the starting state. This data might also be potentially used for learning the passive dynamics
        for a pure model-free learning with some regressors and regularization.
        '''
        #check parameters...
        assert(type(self.n_estimators)==int)
        assert(self.n_estimators > 0)
        assert(type(self.max_depth)==int)
        assert(self.max_depth > 0)
        assert(type(self.min_samples_split)==int)
        assert(self.min_samples_split > 0)
        assert(type(self.min_samples_leaf)==int)
        assert(self.min_samples_leaf > 0)
        assert(type(self.em_itrs)==int)

        #an initial partitioning of data with random forest embedding
        self.random_embedding_mdl_ = RandomTreesEmbedding(
            n_estimators=self.n_estimators,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            random_state=self.random_state
            )

        #we probably do not need the data type to differentiate whether it is a demonstration
        #of a trajectory or a commanded state, do we?
        if self.passive_dyn_func is not None and self.passive_dyn_ctrl is not None and self.passive_dyn_noise is not None:
            self.random_embedding_mdl_.fit(X[:, X.shape[1]/2:])
            indices = self.random_embedding_mdl_.apply(X[:, X.shape[1]/2:])
            # X_tmp = np.array(X)
            # X_tmp[:, X.shape[1]/2:] = X_tmp[:, X.shape[1]/2:] - X_tmp[:, :X.shape[1]/2]
            # self.random_embedding_mdl_.fit(X_tmp)

            # indices = self.random_embedding_mdl_.apply(X_tmp)
        else:
            self.random_embedding_mdl_.fit(X)
            #figure out indices
            indices = self.random_embedding_mdl_.apply(X)

        partitioned_data = defaultdict(list)

        leaf_idx = defaultdict(set)
        weight_idx = defaultdict(float)
        #group data belongs to the same partition and have the weights...
        #is weight really necessary for EM steps? Hmm, seems to be for the initialization
        #d_idx: data index; p_idx: partition index (comprised of estimator index and leaf index)
        for d_idx, d, p_idx in zip(range(len(X)), X, indices):
            for e_idx, l_idx in enumerate(p_idx):
                partitioned_data[e_idx, l_idx].append(d)
                leaf_idx[e_idx] |= {l_idx}

            for e_idx, l_idx in enumerate(p_idx):
                weight_idx[e_idx, l_idx] = float(len(partitioned_data[e_idx, l_idx])) / len(X)
                # weight_idx[e_idx, l_idx] = 1. / len(p_idx)

        #for each grouped data, solve an easy IOC problem by assuming quadratic cost-to-go function
        #note that, if the passive dynamics need to be learned, extra steps are needed to train a regressor with weighted data
        #otherwise, just a simply gaussian for each conditional probability distribution model
        self.estimators_ = []
        #another copy to store the parameters all together, for EM/evaluation on all of the models
        self.estimators_full_ = defaultdict(list)
        #<hyin/Feb-6th-2016> an estimator and leaf indexed structure to record the passive likelihood of data...
        passive_likelihood_dict = defaultdict(list)
        for e_idx in range(self.n_estimators):
            #for each estimator
            estimator_parms = defaultdict(list)
            for l_idx in leaf_idx[e_idx]:
                if self.verbose:
                    print 'Processing {0}-th estimator and {1}-th leaf...'.format(e_idx, l_idx)
                #and for each data partition
                data_partition=np.array(partitioned_data[e_idx, l_idx])
                if self.passive_dyn_func is not None and self.passive_dyn_ctrl is not None and self.passive_dyn_noise is not None:
                    X_new         = data_partition[:, data_partition.shape[1]/2:]
                    X_old         = data_partition[:, 0:data_partition.shape[1]/2]
                    X_new_passive = np.array([self.passive_dyn_func(X_old[sample_idx]) for sample_idx in range(data_partition.shape[0])])
                    passive_likelihood = _passive_dyn_likelihood(X_new, X_new_passive, self.passive_dyn_noise, self.passive_dyn_ctrl, self.reg)

                    weights = passive_likelihood / np.sum(passive_likelihood)
                    weighted_mean = np.sum((weights*X_new.T).T, axis=0)

                    estimator_parms['means'].append(weighted_mean)
                    estimator_parms['covars'].append(_frequency_weighted_covariance(X_new, weighted_mean, weights, spherical=False))

                    #for full estimators
                    self.estimators_full_['means'].append(estimator_parms['means'][-1])
                    self.estimators_full_['covars'].append(estimator_parms['covars'][-1])

                    #<hyin/Feb-6th-2016> also remember the data weight according to the passive likelihood
                    #this could be useful if the weights according to the passive likelihood is desired for other applications
                    #to evaluate some statistics within the data parition
                    passive_likelihood_dict[e_idx, l_idx] = weights
                else:
                    estimator_parms['means'].append(np.mean(data_partition, axis=0))
                    estimator_parms['covars'].append(np.cov(data_partition.T))

                    #for full estimators
                    self.estimators_full_['means'].append(estimator_parms['means'][-1])
                    self.estimators_full_['covars'].append(estimator_parms['covars'][-1])

                    #for MaxEnt, uniform passive likelihood
                    passive_likelihood_dict[e_idx, l_idx] = np.ones(len(data_partition)) / float(len(data_partition))


                estimator_parms['weights'].append(weight_idx[e_idx, l_idx])
                self.estimators_full_['weights'].append(weight_idx[e_idx, l_idx]/float(self.n_estimators))

            self.estimators_.append(estimator_parms)
        #can stop here or go for expectation maximization for each estimator...
        if self.em_itrs > 0:
            #prepare em results for each estimator
            em_res = [self._em_steps(e_idx, X, y) for e_idx in range(self.n_estimators)]
            #or do EM on the full model?
            # <hyin/Dec-2nd-2015> no, doing this seems to harm the learning as the aggregated model is really
            # complex so optimizing that model tends to overfit...
            # em_res = self._em_steps(None, X, y)
            #then use them
            self.estimators_=em_res

        self.prepare_inv_and_constants()
        return indices, leaf_idx, passive_likelihood_dict

    def _em_steps(self, estimator_idx, X, y=None):
        #use current estimation as initialization to perform expectation-maximization
        #now reuse the procedure implemented by scikit-learn; actually a customized implementation
        #is required if the passive dynamics also needs to be learned.
        if self.verbose:
            if estimator_idx is not None:
                print 'EM steps for the estimator {0}'.format(estimator_idx)
            else:
                print 'EM steps...'

        if self.passive_dyn_func is not None and self.passive_dyn_ctrl is not None and self.passive_dyn_noise is not None:
            #extract X_old, X_new, X_new_passive
            X_old = X[:, 0:X.shape[1]/2]
            X_new = X[:, X.shape[1]/2:]
            X_new_passive = np.array([self.passive_dyn_func(X_old[sample_idx]) for sample_idx in range(X.shape[0])])


            # EM algorithms
            current_log_likelihood = None
            # reset self.converged_ to False
            converged = False
            # this line should be removed when 'thresh' is removed in v0.18
            tol = 1e-4
            #use the internal EM steps for non-uniform passive dynamics case
            for i in range(self.em_itrs):
                prev_log_likelihood = current_log_likelihood
                # Expectation step
                log_likelihoods, responsibilities = self._do_estep(
                    estimator_idx, X_new_passive, X_new, y)
                current_log_likelihood = log_likelihoods.mean()

                if self.verbose:
                    print 'current_log_likelihood:', current_log_likelihood
                if prev_log_likelihood is not None:
                    change = abs(current_log_likelihood - prev_log_likelihood)
                    if change < tol:
                        converged = True
                        break

                # Maximization step
                if estimator_idx is not None:
                    self._do_mstep(X_new_passive, X_new, responsibilities, self.estimators_[estimator_idx])
                else:
                    self._do_mstep(X_new_passive, X_new, responsibilities, self.estimators_full_)

            if estimator_idx is None:
                res=self.estimators_full_
            else:
                res=self.estimators_[estimator_idx]
        else:
            if estimator_idx is not None:
                n_partitions=len(self.estimators_[estimator_idx]['weights'])
                #use our own initialization
                g = mixture.GMM(n_components=n_partitions, n_iter=self.em_itrs, init_params='',
                    covariance_type='full')
                g.means_=np.array(self.estimators_[estimator_idx]['means'])
                g.covars_=np.array(self.estimators_[estimator_idx]['covars'])
                g.weights_=np.array(self.estimators_[estimator_idx]['weights'])
            else:
                n_partitions=len(self.estimators_full_['weights'])
                g = mixture.GMM(n_components=n_partitions, n_iter=self.em_itrs, init_params='',
                    covariance_type='full')
                g.means_=np.array(self.estimators_full_['means'])
                g.covars_=np.array(self.estimators_full_['covars'])
                g.weights_=np.array(self.estimators_full_['weights'])

            g.fit(X)

            #prepare to return a defaultdict
            res=defaultdict(list)
            res['means']=list(g.means_)
            res['covars']=list(g.covars_)
            res['weights']=list(g.weights_)

        return res

    def _do_estep(self, estimator_idx, X_new_passive, X_new, y):
        return self._score_sample_for_passive_mdl_helper(
                    estimator_idx, X_new_passive, X_new, y)

    def _do_mstep(self, X_new_passive, X_new, responsibilities, parms, min_covar=1e-7):
        """
        X_new_passive    -  An array of the old states propagated through the passive dynamics
        X_new            -  An array of the new states that were observed
        responsibilities -  array_like, shape (n_samples, n_components)
                            Posterior probabilities of each mixture component for each data
        """
        n_samples, n_dim = X_new.shape
        weights = responsibilities.sum(axis=0)
        weighted_X_new_sum = np.dot(responsibilities.T, X_new)
        weighted_X_new_passive_sum = np.dot(responsibilities.T, X_new_passive)
        inverse_weights = 1.0 / (weights[:, np.newaxis] + 10 * EPS)
        weighted_X_new_mean = weighted_X_new_sum * inverse_weights
        weighted_X_new_passive_mean = weighted_X_new_passive_sum * inverse_weights

        if 'weights' in parms:
            parms['weights'] = (weights / (weights.sum() + 10 * EPS) + EPS)

        # delta_X_new                 = [None] * n_samples
        # delta_X_new_passive         = [None] * n_samples
        # delta_X_new_passive_Sigma_0 = [None] * n_samples
        # one_array = np.ones(n_dim)
        # for c in range(len(parms['weights'])):
        #     delta_X_new[c]                 = X_new - weighted_X_new_mean[c]
        #     delta_X_new_passive[c]         = X_new_passive - weighted_X_new_passive_mean[c]
        #     delta_X_new_passive_Sigma_0[c] = (1./self.passive_dyn_noise * np.eye(n_dim).dot(delta_X_new_passive[c].T)).T

        # if 'covars' in parms:
        #     #now only support diagonal covariance matrix
        #     for c, old_covar in enumerate(parms['covars']):
        #         constant=np.sum(delta_X_new[c]*delta_X_new[c]*responsibilities[:, c][:, np.newaxis], axis=0)#*inverse_weights[c, 0]
        #         so_coeff=np.sum(delta_X_new_passive_Sigma_0[c]*delta_X_new_passive_Sigma_0[c]*responsibilities[:, c][:, np.newaxis], axis=0)#*inverse_weights[c, 0]
        #         #take the roots for S matrix
        #         S_k=(np.sqrt(one_array+4*so_coeff*constant)-one_array)/(2*so_coeff)
        #         #get Sigma_k from S_k through S_k^(-1) = Sigma_k^(-1) + Sigma_0^(-1)
        #         Sigma_k = 1./(1./S_k -  1./self.passive_dyn_noise * np.ones(n_dim))
        #         print S_k, Sigma_k
        #         parms['covars'][c] = np.diag(Sigma_k)
        # if 'means' in parms:
        #     for c, old_mean in enumerate(parms['means']):
        #         Sigma_k_array = np.diag(parms['covars'][c])
        #         S_k=1./Sigma_k_array + 1./self.passive_dyn_noise * np.ones(n_dim)
        #         coeff_mat = np.diag(Sigma_k_array*(1./S_k))
        #         #difference betwen X_new and X_new_passive
        #         delta_X_new_X_new_passive = X_new - (np.diag(S_k).dot(X_new_passive.T)).T
        #         parms['means'][c] = coeff_mat.dot(np.sum(delta_X_new_X_new_passive*responsibilities[:, c][:, np.newaxis]*inverse_weights[c, 0], axis=0))
        #<hyin/Oct-23rd-2015> Try the formulation from the Bellman equation; this seems to lead to a weighted linear regression problem...
        # c = (X_new - X_new_passive)
        #<hyin/OCt-27th-2015> Try the closed-form solutions for a relaxed lower-bound
        # if 'means' in parms:
        #     parms['means'] = weighted_X_new_mean
        # if 'covars' in parms:
        #     for c, old_covar in enumerate(parms['covars']):
        #         data_weights = responsibilities[:, c]
        #         parms['covars'][c] = _frequency_weighted_covariance(X_new, parms['means'][c], data_weights)

        #<hyin/Nov-20th-2015> As far as I realize, the above closed-form solution actually optimizes a value lower than the actual objective;
        #however, this approximation is not tight, thus unfortunately we cannot guarantee the optimum is also obtained for the actual objective...
        #another idea is to simplify the model by only learning the mean, or say the center of the RBF function;
        #the width of the RBF basis can be adapted by solving a one-dimensional numerical optimization, which should lead to
        #a generalized EM algorithm
        #<hyin/Jan-22nd-2016> note that without the adaptation of covariance, the shift of mean
        #is not that great an option, so let's only keep the weights adaptation. We need numerical optimization for the covariance adaptation
        #to see if it would help the mean shift 
        if 'means' in parms:
            for c, old_mean in enumerate(parms['means']):
                Sigma_k_array = parms['covars'][c]
                # S_k = self.passive_dyn_noise * self.passive_dyn_ctrl + Sigma_k_array + 1e-5*np.eye(X_new.shape[1])
                # # coeff_mat = np.diag(Sigma_k_array*(1./S_k))
                # inv_Sigma_k_array = np.linalg.pinv(Sigma_k_array)
                # inv_Sigma_sum = np.linalg.pinv(S_k + Sigma_k_array)
                # #could use woodbury here...
                # coeff_mat = np.linalg.pinv(inv_Sigma_k_array - inv_Sigma_sum)
                # #difference betwen X_new and X_new_passive
                # delta_X_new_X_new_passive = (inv_Sigma_k_array.dot(X_new.T) - inv_Sigma_sum.dot(X_new_passive.T)).T

                # parms['means'][c] = coeff_mat.dot(np.sum(delta_X_new_X_new_passive*responsibilities[:, c][:, np.newaxis]*inverse_weights[c, 0], axis=0))

                # #another formulation? which one is correct?
                # <hyin/Dec-2nd-2015> this seems more straightforward and at least gives a monotonically increasing likelihood;
                # need to check the original formulation to see what the problem is
                inv_Sigma_k_array = np.linalg.pinv(Sigma_k_array)
                inv_Sigma_0 = np.linalg.pinv(self.passive_dyn_noise * self.passive_dyn_ctrl + self.reg*np.eye(X_new.shape[1]))
                coeff_mat = Sigma_k_array
                inv_Sigma_sum = inv_Sigma_k_array + inv_Sigma_0
                delta_X_new_X_new_passive = (inv_Sigma_sum.dot(X_new.T) - inv_Sigma_0.dot(X_new_passive.T)).T
                parms['means'][c] = coeff_mat.dot(np.sum(delta_X_new_X_new_passive*responsibilities[:, c][:, np.newaxis]*inverse_weights[c, 0], axis=0))
        # return

    def sample(self, n_samples=1, random_state=None):
        '''
        return samples that are synthesized from the model
        '''
        if not hasattr(self, 'estimators_'):
            print('The model has not been trained yet...')
            return
        else:
            pass
        return

    def score(self, X, y=None):
        #take log likelihood for each estimator for a given trajectory/state
        #without considering the passive dynamics: MaxEnt model
        estimator_scores=[_log_multivariate_normal_density_full(
                            X,
                            np.array(self.estimators_[e_idx]['means']),
                            np.array(self.estimators_[e_idx]['covars']))
                            +np.log(self.estimators_[e_idx]['weights']) for e_idx in range(self.n_estimators)]

        # concatenate different models...
        # estimator_scores=np.concatenate(estimator_scores,axis=1)
        # res=[logsumexp(x)-np.log(1./self.n_estimators) for x in np.array(estimator_scores)]
        # another way: mean of evaluated cost functions
        # helper to evaluate a single model
        mdl_eval = lambda scores: [logsumexp(x_score) for x_score in scores]
        estimator_scores = np.array([mdl_eval(scores) for scores in estimator_scores])

        responsibilities = [np.exp(estimator_scores[e_idx] - estimator_scores[e_idx][:, np.newaxis]) for e_idx in range(self.n_estimators)]
        #average seems to be more reasonable...
        res=np.mean(estimator_scores,axis=0)
        res_responsibilities = np.mean(np.array(responsibilities), axis=0)
        return -np.array(res), res_responsibilities

    def score_samples(self, X, y=None, min_covar=1.e-7):
        #a different version to evaluate the quality/likelihood of state pairs
        if self.passive_dyn_func is not None and self.passive_dyn_ctrl is not None and self.passive_dyn_noise is not None:
            X_old = X[:, 0:X.shape[1]//2]
            X_new = X[:, X.shape[1]//2:]
            X_new_passive = np.array([self.passive_dyn_func(X_old[sample_idx]) for sample_idx in range(X.shape[0])])

            log_prob_lst = [None] * self.n_estimators
            respon_lst = [None] * self.n_estimators
            for e_idx in range(self.n_estimators):
                log_prob_lst[e_idx], respon_lst[e_idx] = self._score_sample_for_passive_mdl_helper(
                    e_idx, X_new_passive, X_new, y, min_covar)
            res = -np.mean(np.array(log_prob_lst),axis=0)
            res_responsibilities = np.mean(np.array(respon_lst), axis=0)
        else:
            #this should be a trajectory/maximum ent model, use score...
            res, res_responsibilities = self.score(X, y)
        return res, res_responsibilities 


    def value_eval_samples(self, X, y=None, average=False, full=True, const=True):
        #switching off the constant term seems to smooth the value function
        #I don't quite understand why; my current guess is that the axis-aligned partition results in
        #oversized covariance matrices, making the constant terms extremely large for some partitions.
        #This can be shown by adding a fixed term to the covariance matrices to mitigate the singularity;
        #this could be cast as a kind of regularization

        #the new switch is actually equivalent to average=True, but since the training parameters are separated
        #let's keep this ugly solution...
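        # What is computed below (in all branches) is, roughly, a soft-min over
        # unnormalized Gaussian components:
        #   V(x) = -logsumexp_k(-(0.5*(x - mu_k)^T Sigma_k^{-1} (x - mu_k) + beta_k*const), b=w_k)
        # with beta_k and the inverse covariances prepared in prepare_inv_and_constants().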
        n_samples, n_dim = X.shape

        if not average:
            if not full:
                weights = []
                for idx in range(self.n_estimators):
                    weights = weights + (np.array(self.estimators_[idx]['weights'])/self.n_estimators).tolist()
                #the real function to evaluate the value functions, which are actually un-normalized Gaussians
                def value_estimator_eval(d):
                    res = []
                    for idx in range(self.n_estimators):
                        for i, (m, c_inv) in enumerate(   zip(self.estimators_[idx]['means'], 
                                                    self.estimators_[idx]['inv_covars'])):
                            diff_data = d - m
                            res.append(.5*diff_data.dot(c_inv).dot(diff_data) + self.estimators_[idx]['beta'][i]*const)
                    return np.array(res)

                res = np.array([ -logsumexp(-value_estimator_eval(d), b=np.array(weights)) for d in X])
            else:
                res = np.zeros(X.shape[0])
                res_mat = np.zeros((X.shape[0], len(self.estimators_full_['means'])))
                for i, (m, c_inv)   in enumerate(   zip(self.estimators_full_['means'], 
                                                self.estimators_full_['inv_covars'])):
                    diff_data = X - m
                    res_mat[:, i] = np.array([e_prod.dot(e)*0.5 + self.estimators_full_['beta'][i]*const for e_prod, e in zip(diff_data.dot(c_inv), diff_data)])
                for d_idx, r in enumerate(res_mat):
                    res[d_idx] = -logsumexp(-r, b=self.estimators_full_['weights'])
        else:
            #the real function to evaluate the value functions, which are actually un-normalized Gaussians
            def value_estimator_eval(idx):
                res = np.zeros((X.shape[0], len(self.estimators_[idx]['means'])))
                logsumexp_res=np.zeros(len(res))
                for i, (m, c_inv) in enumerate(   zip(self.estimators_[idx]['means'], 
                                            self.estimators_[idx]['inv_covars'])):
                    diff_data = X - m
                    res[:, i] = np.array([e_prod.dot(e)*0.5 + self.estimators_[idx]['beta'][i]*const for e_prod, e in zip(diff_data.dot(c_inv), diff_data)])
                for d_idx, r in enumerate(res):
                    logsumexp_res[d_idx] = -logsumexp(-r, b=self.estimators_[idx]['weights'])

                return logsumexp_res
                
            estimator_scores = [ value_estimator_eval(e_idx) for e_idx in range(self.n_estimators) ]
            #take average
            res = np.mean(np.array(estimator_scores), axis=0)
        return res
 
    def _score_sample_for_passive_mdl_helper(self, estimator_idx, X_new_passive, X_new, y, min_covar=1.e-7):
        #for the specified estimator with a passive dynamics model,
        #evaluate the likelihood for given state pairs
        #to call this, ensure passive dynamics and noise are available
        n_samples, n_dim = X_new.shape

        #incorporate the likelihood of passive dynamics - a Gaussian
        """
                        P_0(x'|x) exp^(V(x'))
        P(x'|x) = --------------------------------- = N(x', m(x), S)
                    int_x'' P_0(x''|x) exp^(V(x''))
        """
        """
        for sake of maximization step and simplicity, evaluate a lower-bound instead
        log(P(x'|x)) > -0.5 * D * log(2*pi) + 0.5*log(det(S^{-1})) -0.5*log2 + 0.5*log2 - 0.5*(x'-f(x))^TSigma^{-1}(x'-f(x)) - 0.5*(x'-mu_k)^TSigma_k^{-1}(x'-mu_k) + 0.5*(mu_k-f(x))^TM^{-1}(mu_k-f(x))
                     > -0.5 * D * log(2*pi) + 0.5*log(det(S^{-1})/2) + 0.5*log2 - 0.5*(x'-f(x))^TSigma^{-1}(x'-f(x)) - 0.5*(x'-mu_k)^TSigma_k^{-1}(x'-mu_k)
                     > -0.5 * D * log(2*pi) + 0.5*log((det(Sigma_k)^{-1}+det(Sigma_0)^{-1})/2) + 0.5*log2 - 0.5*(x'-f(x))^TSigma^{-1}(x'-f(x)) - 0.5*(x'-mu_k)^TSigma_k^{-1}(x'-mu_k) + 0.5*(mu_k-f(x))^TM^{-1}(mu_k-f(x))
                     > -0.5 * D * log(2*pi) + 0.5*log(det(Sigma_k)^{-1})/2 + 0.5*log(det(Sigma_0))/2 + 0.5*log2 - 0.5*(x'-f(x))^TSigma^{-1}(x'-f(x)) - 0.5*(x'-mu_k)^TSigma_k^{-1}(x'-mu_k) + 0.5*(mu_k-f(x))^TM^{-1}(mu_k-f(x))
        Any way to bound the last term to also make it independent from matrix other than Sigma_k?
        """

        # regularize to prevent numerical instability
        Sigma_0 = self.passive_dyn_noise * self.passive_dyn_ctrl + self.reg*np.eye(X_new.shape[1])
        # + 1e-2 * np.eye(X_new.shape[1])
        Sigma_0_inv = np.linalg.pinv(Sigma_0)
        if estimator_idx is not None:
            Sigma   = self.estimators_[estimator_idx]['covars']
            mu      = self.estimators_[estimator_idx]['means']
            w       = self.estimators_[estimator_idx]['weights']
        else:
            Sigma   = self.estimators_full_['covars']
            mu      = self.estimators_full_['means']
            w       = self.estimators_full_['weights']
        nmix    = len(mu)

        log_prob  = np.empty((n_samples, nmix))
        for c, (mu_k, Sigma_k) in enumerate(zip(mu, Sigma)):
            #obviously, this fraction can be optimized by exploiting the structure of covariance matrix
            #using say Cholesky decomposition
            Sigma_k_inv = np.linalg.pinv(Sigma_k)
            S_inv       = Sigma_k_inv + Sigma_0_inv
            S           = np.linalg.pinv(S_inv)
            try:
                S_chol = linalg.cholesky(S, lower=True)
            except linalg.LinAlgError:
                # The model is most probably stuck in a component with too
                # few observations; we need to reinitialize this component
                S_chol = linalg.cholesky(S + min_covar * np.eye(n_dim),
                                          lower=True)
            m = S.dot((Sigma_k_inv.dot(mu_k)+Sigma_0_inv.dot(X_new_passive.T).T).T).T
            #fraction part of above equation
            # scale_log_det = -.5 * (np.log(2*np.pi) + np.sum(np.log(S_inv)) + 
            #     2*np.sum(np.log(np.diag(Sigma_k_chol))) + np.sum(np.log(np.diag(Sigma_0))))
            # #exp() part of the above equation
            # S_sol = linalg.solve_triangular(M_chol, (X_new - X_old).T, lower=True).T

            # scale_log_rbf = -.5 * (np.sum(M_sol**2), axis=1)
            S_log_det = 2 * np.sum(np.log(np.diag(S_chol)))
            # print 'S_log_det:', S_log_det
            S_sol = linalg.solve_triangular(S_chol, (X_new - m).T, lower=True).T
            log_prob[:, c] = -.5 * (np.sum(S_sol**2, axis=1) + n_dim * np.log(2 * np.pi) + S_log_det)
        lpr = log_prob + np.log(w)
        # print 'log_prob:', log_prob
        # print 'w:', w
        # print 'lpr:', lpr
        logprob = logsumexp(lpr, axis=1)
        responsibilities = np.exp(lpr - logprob[:, np.newaxis])
        return logprob, responsibilities

    def prepare_inv_and_constants(self):
        '''
        supplement steps to prepare inverse of variance matrices and constant terms
        ''' 
        regularization = self.reg
        for idx in range(self.n_estimators):
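            # inv_covars: regularized precision matrices; beta: the Gaussian
            # log-normalization constant .5*log(det(Sigma_reg)) + .5*D*log(2*pi),
            # used as the constant term of each quadratic component.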
            self.estimators_[idx]['inv_covars'] = [ np.linalg.pinv(covar + np.eye(covar.shape[0])*regularization) for covar in self.estimators_[idx]['covars']]
            self.estimators_[idx]['beta'] = [.5*np.log(pseudo_determinant(covar + np.eye(covar.shape[0])*regularization)) + .5*np.log(2*np.pi)*covar.shape[0] for covar in self.estimators_[idx]['covars']]

        self.estimators_full_['weights'] = []
        self.estimators_full_['means'] = []
        self.estimators_full_['covars'] = []
        for e_idx in range(self.n_estimators):
            for leaf_idx in range(len(self.estimators_[e_idx]['weights'])):
                self.estimators_full_['weights'].append(self.estimators_[e_idx]['weights'][leaf_idx]/float(self.n_estimators))
                self.estimators_full_['covars'].append(self.estimators_[e_idx]['covars'][leaf_idx])
                self.estimators_full_['means'].append(self.estimators_[e_idx]['means'][leaf_idx])
        # self.estimators_full_['inv_covars'] = [ np.linalg.pinv(covar) for covar in self.estimators_full_['covars']]
        # self.estimators_full_['beta'] = [.5*np.log(pseudo_determinant(covar)) + .5*np.log(2*np.pi)*covar.shape[0] for covar in self.estimators_full_['covars']]
                self.estimators_full_['inv_covars'].append(self.estimators_[e_idx]['inv_covars'][leaf_idx])
                self.estimators_full_['beta'].append(self.estimators_[e_idx]['beta'][leaf_idx])
        return
"""
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import make_circles
from sklearn.ensemble import RandomTreesEmbedding, ExtraTreesClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import SelectFromModel
from sklearn.naive_bayes import BernoulliNB

# make a synthetic dataset
X, y = make_circles(factor=0.5, random_state=0, noise=0.05)

# use RandomTreesEmbedding to transform data
hasher = RandomTreesEmbedding(n_estimators=10, random_state=0, max_depth=3)
hasher.fit(X)
model = SelectFromModel(hasher, prefit=True)
X_transformed = model.transform(X)

# Visualize result using PCA
pca = TruncatedSVD(n_components=2)
X_reduced = pca.fit_transform(X_transformed)

# Learn a Naive Bayes classifier on the transformed data
nb = BernoulliNB()
nb.fit(X_transformed, y)

# Learn an ExtraTreesClassifier for comparison
trees = ExtraTreesClassifier(max_depth=3, n_estimators=10, random_state=0)
trees.fit(X, y)
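
# A possible continuation (not part of the original snippet): visualize the
# SVD-reduced embedding and compare the two classifiers on the training data.
fig, (ax_orig, ax_emb) = plt.subplots(1, 2, figsize=(9, 4))
ax_orig.scatter(X[:, 0], X[:, 1], c=y, s=20)
ax_orig.set_title("Original data")
ax_emb.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y, s=20)
ax_emb.set_title("Truncated SVD of the transformed data")
print("BernoulliNB accuracy on transformed data:", nb.score(X_transformed, y))
print("ExtraTrees accuracy on raw data:", trees.score(X, y))
plt.show()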
Example #32
# perform kmeans with k clusters and pca data
k = 250

kmeans = KMeans(init ='k-means++', n_clusters = k, n_init = 10, 
                random_state=0, verbose = 0).fit(data)

codewords = kmeans.cluster_centers_
codewords.shape

"""## Random Trees Embedding"""

rtree = RandomTreesEmbedding(n_estimators=1000, max_depth=70, 
                             min_samples_leaf=1, min_samples_split=2,
                             verbose=1, random_state=0)

rtree.fit(data)

# For each datapoint x in X and for each tree in the forest, 
# return the index of the leaf x ends up in.
leafs = rtree.apply(data)

leafs.shape
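
# Sketch (not from the original notebook): one-hot encoding the leaf indices
# yields essentially the same sparse embedding that rtree.transform(data)
# would return directly.
from sklearn.preprocessing import OneHotEncoder
leaf_embedding = OneHotEncoder().fit_transform(leafs)
leaf_embedding.shape  # (n_samples, total number of leaves over all trees)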

"""# Histogram of visual words"""

# note how many SIFT descriptors there are per image, knowing there are 150 images

def count_sifts_per_image(x):
  sift = cv.xfeatures2d.SIFT_create()
  n_sift = []
  for label_img in x:
Example #33
                    constraints.append(line)
                else:
                    res.append(self._parse_line(constraints[:]))
                    visited_leaf = True
                    leaf_depth = line.count('|   ')
            res_.append(res[:])
        return res_


# Load data.
data = load_digits()
X = data.data

# Fit the encoder and define the decoder.
rte = RandomTreesEmbedding(n_estimators=20, sparse_output=False, max_depth=50)
rte.fit(X)
rted = RandomTreesEmbeddingDecoder(rte)

# Encode and decode the pictures.
e = rted.encode(X)
d = rted.decode(e, dim=64, method='mean')
np.abs(X - d).mean()

# Plot a single number.
to_plot = 0
fig = plt.figure(figsize=(12, 6))
ax1 = fig.add_subplot(121)
ax1.imshow(X[to_plot].reshape(8, 8))
ax1.set_title('Original')

ax3 = fig.add_subplot(122)
mpl.rcParams["font.family"] = 'Arial Unicode MS'

names = ['A', 'B', 'C', 'D', 'cla', ]
df = pd.read_csv('../../data_set/iris.data', names=names)
df.info()

X = df[names[0:-1]]

"""
n_estimators: Any = 100, number of trees (sub-models) to train
max_depth: Any = 5, maximum tree depth
min_samples_split: Any = 2, minimum number of samples required to split a node
min_samples_leaf: Any = 1, minimum number of samples required at a leaf node
min_weight_fraction_leaf: Any = 0., minimum weighted fraction of samples required at a leaf (rarely used)
max_leaf_nodes: Any = None, maximum number of leaf nodes allowed; None means unlimited
min_impurity_decrease: Any = 0., a node is split if the split decreases the impurity by at least this value
min_impurity_split: Any = None, early-stopping threshold; a node is split only if its impurity exceeds this value
sparse_output: Any = True, whether to return a sparse matrix
warm_start: Any = False, whether to reuse the previously fitted model when fitting again (default False)
n_jobs: Any = None, number of parallel jobs
random_state: Any = None, random seed
verbose: Any = 0, whether to print training progress (0: silent, 1: verbose)
"""
algo = RandomTreesEmbedding(n_estimators=10, max_depth=3, sparse_output=False)
algo.fit(X)
x_ex = algo.transform(X)

# print the appended (embedded) feature rows
for x in x_ex[0:10]:
    print(x)
n_estimator = 10
X, y = make_classification(n_samples=80000)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
# It is important to train the ensemble of trees on a different subset
# of the training data than the linear regression model to avoid
# overfitting, in particular if the total number of leaves is
# similar to the number of training samples
X_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train,
                                                            y_train,
                                                            test_size=0.5)

# Unsupervised transformation based on totally random trees
rt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator)
rt_lm = LogisticRegression()
rt.fit(X_train, y_train)
rt_lm.fit(rt.transform(X_train_lr), y_train_lr)

y_pred_rt = rt_lm.predict_proba(rt.transform(X_test))[:, 1]
fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)

# Supervised transformation based on random forests
rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
rf_enc = OneHotEncoder()
rf_lm = LogisticRegression()
rf.fit(X_train, y_train)
rf_enc.fit(rf.apply(X_train))
rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)

y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]
fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)
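
# A possible way to compare the two pipelines (a sketch, assuming the usual
# matplotlib import): plot the ROC curves computed above.
import matplotlib.pyplot as plt
plt.figure()
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_rt_lm, tpr_rt_lm, label='RT + LR')
plt.plot(fpr_rf_lm, tpr_rf_lm, label='RF + LR')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='best')
plt.show()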
Example #36
def rt_embedding(training_features, testing_features):
    rt = RandomTreesEmbedding()
    rt.fit(training_features)
    testing_features = rt.transform(testing_features)
    training_features = rt.transform(training_features)
    return training_features, testing_features
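
# Minimal usage sketch for the helper above (synthetic data, hypothetical names):
# import numpy as np
# X_tr, X_te = np.random.rand(100, 5), np.random.rand(20, 5)
# X_tr_emb, X_te_emb = rt_embedding(X_tr, X_te)
# print(X_tr_emb.shape, X_te_emb.shape)  # sparse one-hot leaf encodings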
class EnsembleIOC(BaseEstimator, RegressorMixin):
    def __init__(self,
                 n_estimators=20,
                 max_depth=5,
                 min_samples_split=10,
                 min_samples_leaf=10,
                 random_state=0,
                 em_itrs=5,
                 regularization=0.05,
                 passive_dyn_func=None,
                 passive_dyn_ctrl=None,
                 passive_dyn_noise=None,
                 verbose=False):
        '''
        n_estimators        - number of ensembled models
        ...                 - a batch of parameters used for RandomTreesEmbedding, see relevant documents
        em_itrs             - maximum number of EM iterations to take
        regularization      - small positive scalar to prevent singularity of matrix inversion
        passive_dyn_func    - function to evaluate passive dynamics; None for MaxEnt model
        passive_dyn_ctrl    - function to return the control matrix which might depend on the state...
        passive_dyn_noise   - covariance of a Gaussian noise; only applicable when passive_dyn is Gaussian; None for MaxEnt model
                                note this implies a dynamical system with constant input gain. It is extendable to have state dependent
                                input gain then we need covariance for each data point
        verbose             - output training information
        '''
        BaseEstimator.__init__(self)

        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.random_state = random_state
        self.em_itrs = em_itrs
        self.reg = regularization
        self.passive_dyn_func = passive_dyn_func
        self.passive_dyn_ctrl = passive_dyn_ctrl
        self.passive_dyn_noise = passive_dyn_noise
        self.verbose = verbose
        return

    def fit(self, X, y=None):
        '''
        y could be the array of starting state of the demonstrated trajectories/policies
        if it is None, it implicitly implies a MaxEnt model. Otherwise, it serves as the feature mapping
        of the starting state. This data might also potentially be used for learning the passive dynamics
        for purely model-free learning with some regressors and regularization.
        '''
        #check parameters...
        assert (type(self.n_estimators) == int)
        assert (self.n_estimators > 0)
        assert (type(self.max_depth) == int)
        assert (self.max_depth > 0)
        assert (type(self.min_samples_split) == int)
        assert (self.min_samples_split > 0)
        assert (type(self.min_samples_leaf) == int)
        assert (self.min_samples_leaf > 0)
        assert (type(self.em_itrs) == int)

        #an initial partitioning of data with random forest embedding
        self.random_embedding_mdl_ = RandomTreesEmbedding(
            n_estimators=self.n_estimators,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            random_state=self.random_state)

        #we probably do not need the data type to differentiate whether it is a demonstration
        #of a trajectory or a commanded state, do we?
        if self.passive_dyn_func is not None and self.passive_dyn_ctrl is not None and self.passive_dyn_noise is not None:
            self.random_embedding_mdl_.fit(X[:, X.shape[1] // 2:])
            indices = self.random_embedding_mdl_.apply(X[:, X.shape[1] // 2:])
            # X_tmp = np.array(X)
            # X_tmp[:, X.shape[1]/2:] = X_tmp[:, X.shape[1]/2:] - X_tmp[:, :X.shape[1]/2]
            # self.random_embedding_mdl_.fit(X_tmp)

            # indices = self.random_embedding_mdl_.apply(X_tmp)
        else:
            self.random_embedding_mdl_.fit(X)
            #figure out indices
            indices = self.random_embedding_mdl_.apply(X)

        partitioned_data = defaultdict(list)

        leaf_idx = defaultdict(set)
        weight_idx = defaultdict(float)
        #group data belonging to the same partition and compute the weights...
        #is the weight really necessary for the EM steps? Hmm, it seems to be for the initialization
        #d_idx: data index; p_idx: partition index (comprised of estimator index and leaf index)
        for d_idx, d, p_idx in zip(range(len(X)), X, indices):
            for e_idx, l_idx in enumerate(p_idx):
                partitioned_data[e_idx, l_idx].append(d)
                leaf_idx[e_idx] |= {l_idx}

            for e_idx, l_idx in enumerate(p_idx):
                weight_idx[e_idx, l_idx] = float(
                    len(partitioned_data[e_idx, l_idx])) / len(X)
                # weight_idx[e_idx, l_idx] = 1. / len(p_idx)

        #for each group of data, solve an easy IOC problem by assuming a quadratic cost-to-go function
        #note that, if the passive dynamics need to be learned, extra steps are needed to train a regressor with weighted data
        #otherwise, it is simply a Gaussian for each conditional probability distribution model
        self.estimators_ = []
        #another copy to store the parameters all together, for EM/evaluation on all of the models
        self.estimators_full_ = defaultdict(list)
        #<hyin/Feb-6th-2016> an estimator and leaf indexed structure to record the passive likelihood of data...
        passive_likelihood_dict = defaultdict(list)
        for e_idx in range(self.n_estimators):
            #for each estimator
            estimator_parms = defaultdict(list)
            for l_idx in leaf_idx[e_idx]:
                if self.verbose:
                    print('Processing {0}-th estimator and {1}-th leaf...'.format(
                        e_idx, l_idx))
                #and for each data partition
                data_partition = np.array(partitioned_data[e_idx, l_idx])
                if self.passive_dyn_func is not None and self.passive_dyn_ctrl is not None and self.passive_dyn_noise is not None:
                    X_new = data_partition[:, data_partition.shape[1] // 2:]
                    X_old = data_partition[:, 0:data_partition.shape[1] // 2]
                    X_new_passive = np.array([
                        self.passive_dyn_func(X_old[sample_idx])
                        for sample_idx in range(data_partition.shape[0])
                    ])
                    passive_likelihood = _passive_dyn_likelihood(
                        X_new, X_new_passive, self.passive_dyn_noise,
                        self.passive_dyn_ctrl, self.reg)

                    weights = passive_likelihood / np.sum(passive_likelihood)
                    weighted_mean = np.sum((weights * X_new.T).T, axis=0)

                    estimator_parms['means'].append(weighted_mean)
                    estimator_parms['covars'].append(
                        _frequency_weighted_covariance(X_new,
                                                       weighted_mean,
                                                       weights,
                                                       spherical=False))

                    #for full estimators
                    self.estimators_full_['means'].append(
                        estimator_parms['means'][-1])
                    self.estimators_full_['covars'].append(
                        estimator_parms['covars'][-1])

                    #<hyin/Feb-6th-2016> also remember the data weight according to the passive likelihood
                    #this could be useful if the weights according to the passive likelihood are desired for other applications
                    #to evaluate some statistics within the data partition
                    passive_likelihood_dict[e_idx, l_idx] = weights
                else:
                    estimator_parms['means'].append(
                        np.mean(data_partition, axis=0))
                    estimator_parms['covars'].append(np.cov(data_partition.T))

                    #for full estimators
                    self.estimators_full_['means'].append(
                        estimator_parms['means'][-1])
                    self.estimators_full_['covars'].append(
                        estimator_parms['covars'][-1])

                    #for MaxEnt, uniform passive likelihood
                    passive_likelihood_dict[e_idx, l_idx] = np.ones(
                        len(data_partition)) / float(len(data_partition))

                estimator_parms['weights'].append(weight_idx[e_idx, l_idx])
                self.estimators_full_['weights'].append(
                    weight_idx[e_idx, l_idx] / float(self.n_estimators))

            self.estimators_.append(estimator_parms)
        #can stop here or go for expectation maximization for each estimator...
        if self.em_itrs > 0:
            #prepare em results for each estimator
            em_res = [
                self._em_steps(e_idx, X, y)
                for e_idx in range(self.n_estimators)
            ]
            #or do EM on the full model?
            # <hyin/Dec-2nd-2015> no, doing this seems to harm the learning as the aggregated model is really
            # complex so optimizing that model tends to overfit...
            # em_res = self._em_steps(None, X, y)
            #then use them
            self.estimators_ = em_res

        self.prepare_inv_and_constants()
        return indices, leaf_idx, passive_likelihood_dict

    def _em_steps(self, estimator_idx, X, y=None):
        #use current estimation as initialization to perform expectation-maximization
        #now reuse the procedure implemented by scikit-learn; actually a customized implementation
        #is required if the passive dynamics also need to be learned.
        if self.verbose:
            if estimator_idx is not None:
                print('EM steps for the estimator {0}'.format(estimator_idx))
            else:
                print('EM steps...')

        if self.passive_dyn_func is not None and self.passive_dyn_ctrl is not None and self.passive_dyn_noise is not None:
            #extract X_old, X_new, X_new_passive
            X_old = X[:, 0:X.shape[1] // 2]
            X_new = X[:, X.shape[1] // 2:]
            X_new_passive = np.array([
                self.passive_dyn_func(X_old[sample_idx])
                for sample_idx in range(X.shape[0])
            ])

            # EM algorithms
            current_log_likelihood = None
            # reset self.converged_ to False
            converged = False
            # this line should be removed when 'thresh' is removed in v0.18
            tol = 1e-4
            #use the internal EM steps for non-uniform passive dynamics case
            for i in range(self.em_itrs):
                prev_log_likelihood = current_log_likelihood
                # Expectation step
                log_likelihoods, responsibilities = self._do_estep(
                    estimator_idx, X_new_passive, X_new, y)
                current_log_likelihood = log_likelihoods.mean()

                if self.verbose:
                    print('current_log_likelihood:', current_log_likelihood)
                if prev_log_likelihood is not None:
                    change = abs(current_log_likelihood - prev_log_likelihood)
                    if change < tol:
                        converged = True
                        break

                # Maximization step
                if estimator_idx is not None:
                    self._do_mstep(X_new_passive, X_new, responsibilities,
                                   self.estimators_[estimator_idx])
                else:
                    self._do_mstep(X_new_passive, X_new, responsibilities,
                                   self.estimators_full_)

            if estimator_idx is None:
                res = self.estimators_full_
            else:
                res = self.estimators_[estimator_idx]
        else:
            if estimator_idx is not None:
                n_partitions = len(self.estimators_[estimator_idx]['weights'])
                #use our own initialization
                g = mixture.GMM(n_components=n_partitions,
                                n_iter=self.em_itrs,
                                init_params='',
                                covariance_type='full')
                g.means_ = np.array(self.estimators_[estimator_idx]['means'])
                g.covars_ = np.array(self.estimators_[estimator_idx]['covars'])
                g.weights_ = np.array(
                    self.estimators_[estimator_idx]['weights'])
            else:
                n_partitions = len(self.estimators_full_['weights'])
                g = mixture.GMM(n_components=n_partitions,
                                n_iter=self.em_itrs,
                                init_params='',
                                covariance_type='full')
                g.means_ = np.array(self.estimators_full_['means'])
                g.covars_ = np.array(self.estimators_full_['covars'])
                g.weights_ = np.array(self.estimators_full_['weights'])

            g.fit(X)

            #prepare to return a defaultdict
            res = defaultdict(list)
            res['means'] = list(g.means_)
            res['covars'] = list(g.covars_)
            res['weights'] = list(g.weights_)

        return res

    def _do_estep(self, estimator_idx, X_new_passive, X_new, y):
        return self._score_sample_for_passive_mdl_helper(
            estimator_idx, X_new_passive, X_new, y)

    def _do_mstep(self,
                  X_new_passive,
                  X_new,
                  responsibilities,
                  parms,
                  min_covar=1e-7):
        """
        X_new_passive    -  An array of the propagation of the old state through the passive dynamics
        X_new            -  An array of the new states that were observed
        responsibilities -  array_like, shape (n_samples, n_components)
                            Posterior probabilities of each mixture component for each data
        """
        n_samples, n_dim = X_new.shape
        weights = responsibilities.sum(axis=0)
        weighted_X_new_sum = np.dot(responsibilities.T, X_new)
        weighted_X_new_passive_sum = np.dot(responsibilities.T, X_new_passive)
        inverse_weights = 1.0 / (weights[:, np.newaxis] + 10 * EPS)
        weighted_X_new_mean = weighted_X_new_sum * inverse_weights
        weighted_X_new_passive_mean = weighted_X_new_passive_sum * inverse_weights

        if 'weights' in parms:
            parms['weights'] = (weights / (weights.sum() + 10 * EPS) + EPS)

        # delta_X_new                 = [None] * n_samples
        # delta_X_new_passive         = [None] * n_samples
        # delta_X_new_passive_Sigma_0 = [None] * n_samples
        # one_array = np.ones(n_dim)
        # for c in range(len(parms['weights'])):
        #     delta_X_new[c]                 = X_new - weighted_X_new_mean[c]
        #     delta_X_new_passive[c]         = X_new_passive - weighted_X_new_passive_mean[c]
        #     delta_X_new_passive_Sigma_0[c] = (1./self.passive_dyn_noise * np.eye(n_dim).dot(delta_X_new_passive[c].T)).T

        # if 'covars' in parms:
        #     #now only support diagonal covariance matrix
        #     for c, old_covar in enumerate(parms['covars']):
        #         constant=np.sum(delta_X_new[c]*delta_X_new[c]*responsibilities[:, c][:, np.newaxis], axis=0)#*inverse_weights[c, 0]
        #         so_coeff=np.sum(delta_X_new_passive_Sigma_0[c]*delta_X_new_passive_Sigma_0[c]*responsibilities[:, c][:, np.newaxis], axis=0)#*inverse_weights[c, 0]
        #         #take the roots for S matrix
        #         S_k=(np.sqrt(one_array+4*so_coeff*constant)-one_array)/(2*so_coeff)
        #         #get Sigma_k from S_k through S_k^(-1) = Sigma_k^(-1) + Sigma_0^(-1)
        #         Sigma_k = 1./(1./S_k -  1./self.passive_dyn_noise * np.ones(n_dim))
        #         print S_k, Sigma_k
        #         parms['covars'][c] = np.diag(Sigma_k)
        # if 'means' in parms:
        #     for c, old_mean in enumerate(parms['means']):
        #         Sigma_k_array = np.diag(parms['covars'][c])
        #         S_k=1./Sigma_k_array + 1./self.passive_dyn_noise * np.ones(n_dim)
        #         coeff_mat = np.diag(Sigma_k_array*(1./S_k))
        #         #difference between X_new and X_new_passive
        #         delta_X_new_X_new_passive = X_new - (np.diag(S_k).dot(X_new_passive.T)).T
        #         parms['means'][c] = coeff_mat.dot(np.sum(delta_X_new_X_new_passive*responsibilities[:, c][:, np.newaxis]*inverse_weights[c, 0], axis=0))
        #<hyin/Oct-23rd-2015> Try the formulation from the Bellman equation; this seems to lead to a weighted linear regression problem...
        # c = (X_new - X_new_passive)
        #<hyin/Oct-27th-2015> Try the closed-form solutions for a relaxed lower-bound
        # if 'means' in parms:
        #     parms['means'] = weighted_X_new_mean
        # if 'covars' in parms:
        #     for c, old_covar in enumerate(parms['covars']):
        #         data_weights = responsibilities[:, c]
        #         parms['covars'][c] = _frequency_weighted_covariance(X_new, parms['means'][c], data_weights)

        #<hyin/Nov-20th-2015> As far as I can tell, the above closed-form solution actually optimizes a value lower than the actual objective;
        #however, this approximation is not tight, so unfortunately we cannot guarantee the optimum is also attained for the actual objective...
        #another idea is to simplify the model by only learning the mean, i.e., the center of the RBF function;
        #the width of the RBF basis can then be adapted by solving a one-dimensional numerical optimization, which should lead to
        #a generalized EM algorithm
        #<hyin/Jan-22nd-2016> note that without the adaptation of the covariance, shifting the mean
        #is not a great option, so let's only keep the weights adaptation. We need numerical optimization for the covariance adaptation
        #to see if it would help the mean shift
        if 'means' in parms:
            for c, old_mean in enumerate(parms['means']):
                Sigma_k_array = parms['covars'][c]
                # S_k = self.passive_dyn_noise * self.passive_dyn_ctrl + Sigma_k_array + 1e-5*np.eye(X_new.shape[1])
                # # coeff_mat = np.diag(Sigma_k_array*(1./S_k))
                # inv_Sigma_k_array = np.linalg.pinv(Sigma_k_array)
                # inv_Sigma_sum = np.linalg.pinv(S_k + Sigma_k_array)
                # #could use woodbury here...
                # coeff_mat = np.linalg.pinv(inv_Sigma_k_array - inv_Sigma_sum)
                # #difference between X_new and X_new_passive
                # delta_X_new_X_new_passive = (inv_Sigma_k_array.dot(X_new.T) - inv_Sigma_sum.dot(X_new_passive.T)).T

                # parms['means'][c] = coeff_mat.dot(np.sum(delta_X_new_X_new_passive*responsibilities[:, c][:, np.newaxis]*inverse_weights[c, 0], axis=0))

                # #another formulation? which one is correct?
                # <hyin/Dec-2nd-2015> this seems more straightforward and at least gives a monotonically increasing likelihood
                # need to check the original formulation to see what the problem is
                inv_Sigma_k_array = np.linalg.pinv(Sigma_k_array)
                inv_Sigma_0 = np.linalg.pinv(self.passive_dyn_noise *
                                             self.passive_dyn_ctrl +
                                             self.reg * np.eye(X_new.shape[1]))
                coeff_mat = Sigma_k_array
                inv_Sigma_sum = inv_Sigma_k_array + inv_Sigma_0
                delta_X_new_X_new_passive = (
                    inv_Sigma_sum.dot(X_new.T) -
                    inv_Sigma_0.dot(X_new_passive.T)).T
                parms['means'][c] = coeff_mat.dot(
                    np.sum(delta_X_new_X_new_passive *
                           responsibilities[:, c][:, np.newaxis] *
                           inverse_weights[c, 0],
                           axis=0))
        # return

    def sample(self, n_samples=1, random_state=None):
        '''
        return samples that are synthesized from the model
        '''
        if not hasattr(self, 'estimators_'):
            print('The model has not been trained yet...')
            return
        else:
            pass
        return

    def score(self, X, y=None):
        #take log likelihood for each estimator for a given trajectory/state
        #without considering the passive dynamics: MaxEnt model
        estimator_scores = [
            _log_multivariate_normal_density_full(
                X, np.array(self.estimators_[e_idx]['means']),
                np.array(self.estimators_[e_idx]['covars'])) +
            np.log(self.estimators_[e_idx]['weights'])
            for e_idx in range(self.n_estimators)
        ]

        # concatenate different models...
        # estimator_scores=np.concatenate(estimator_scores,axis=1)
        # res=[logsumexp(x)-np.log(1./self.n_estimators) for x in np.array(estimator_scores)]
        # another way: mean of evaluated cost functions
        # helper to evaluate a single model
        mdl_eval = lambda scores: [logsumexp(x_score) for x_score in scores]
        estimator_scores = np.array(
            [mdl_eval(scores) for scores in estimator_scores])

        responsibilities = [
            np.exp(estimator_scores[e_idx] -
                   estimator_scores[e_idx][:, np.newaxis])
            for e_idx in range(self.n_estimators)
        ]
        #average seems to be more reasonable...
        res = np.mean(estimator_scores, axis=0)
        res_responsibilities = np.mean(np.array(responsibilities), axis=0)
        return -np.array(res), res_responsibilities

    def score_samples(self, X, y=None, min_covar=1.e-7):
        #a different version to evaluate the quality/likelihood of state pairs
        if self.passive_dyn_func is not None and self.passive_dyn_ctrl is not None and self.passive_dyn_noise is not None:
            X_old = X[:, 0:X.shape[1] // 2]
            X_new = X[:, X.shape[1] // 2:]
            X_new_passive = np.array([
                self.passive_dyn_func(X_old[sample_idx])
                for sample_idx in range(X.shape[0])
            ])

            log_prob_lst = [None] * self.n_estimators
            respon_lst = [None] * self.n_estimators
            for e_idx in range(self.n_estimators):
                log_prob_lst[e_idx], respon_lst[
                    e_idx] = self._score_sample_for_passive_mdl_helper(
                        e_idx, X_new_passive, X_new, y, min_covar)
            res = -np.mean(np.array(log_prob_lst), axis=0)
            res_responsibilities = np.mean(np.array(respon_lst), axis=0)
        else:
            #this should be a trajectory/maximum ent model, use score...
            res, res_responsibilities = self.score(X, y)
        return res, res_responsibilities

    def value_eval_samples(self,
                           X,
                           y=None,
                           average=False,
                           full=True,
                           const=True):
        #switching off the constant term seems to smooth the value function
        #I don't quite understand why; my current guess is that the axis-aligned partition results in
        #oversized covariance matrices, making the constant terms extremely large for some partitions.
        #This can be shown by adding a fixed term to the covariance matrices to mitigate the singularity;
        #this could be cast as a kind of regularization

        #the new switch is actually equivalent to average=True, but since the training parameters are separated
        #let's keep this ugly solution...
        n_samples, n_dim = X.shape

        if not average:
            if not full:
                weights = []
                for idx in range(self.n_estimators):
                    weights = weights + (
                        np.array(self.estimators_[idx]['weights']) /
                        self.n_estimators).tolist()
                #the real function to evaluate the value functions, which are actually un-normalized Gaussians
                def value_estimator_eval(d):
                    res = []
                    for idx in range(self.n_estimators):
                        for i, (m, c_inv) in enumerate(
                                zip(self.estimators_[idx]['means'],
                                    self.estimators_[idx]['inv_covars'])):
                            diff_data = d - m
                            res.append(
                                .5 * diff_data.dot(c_inv).dot(diff_data) +
                                self.estimators_[idx]['beta'][i] * const)
                    return np.array(res)

                res = np.array([
                    -logsumexp(-value_estimator_eval(d), b=np.array(weights))
                    for d in X
                ])
            else:
                res = np.zeros(X.shape[0])
                res_mat = np.zeros(
                    (X.shape[0], len(self.estimators_full_['means'])))
                for i, (m, c_inv) in enumerate(
                        zip(self.estimators_full_['means'],
                            self.estimators_full_['inv_covars'])):
                    diff_data = X - m
                    res_mat[:, i] = np.array([
                        e_prod.dot(e) * 0.5 +
                        self.estimators_full_['beta'][i] * const
                        for e_prod, e in zip(diff_data.dot(c_inv), diff_data)
                    ])
                for d_idx, r in enumerate(res_mat):
                    res[d_idx] = -logsumexp(-r,
                                            b=self.estimators_full_['weights'])
        else:
            #the real function to evaluate the value functions, which are actually un-normalized Gaussians
            def value_estimator_eval(idx):
                res = np.zeros(
                    (X.shape[0], len(self.estimators_[idx]['means'])))
                logsumexp_res = np.zeros(len(res))
                for i, (m, c_inv) in enumerate(
                        zip(self.estimators_[idx]['means'],
                            self.estimators_[idx]['inv_covars'])):
                    diff_data = X - m
                    res[:, i] = np.array([
                        e_prod.dot(e) * 0.5 +
                        self.estimators_[idx]['beta'][i] * const
                        for e_prod, e in zip(diff_data.dot(c_inv), diff_data)
                    ])
                for d_idx, r in enumerate(res):
                    logsumexp_res[d_idx] = -logsumexp(
                        -r, b=self.estimators_[idx]['weights'])

                return logsumexp_res

            estimator_scores = [
                value_estimator_eval(e_idx)
                for e_idx in range(self.n_estimators)
            ]
            #take average
            res = np.mean(np.array(estimator_scores), axis=0)
        return res

    def _score_sample_for_passive_mdl_helper(self,
                                             estimator_idx,
                                             X_new_passive,
                                             X_new,
                                             y,
                                             min_covar=1.e-7):
        #for the specified estimator with a passive dynamics model,
        #evaluate the likelihood for given state pairs
        #to call this, ensure passive dynamics and noise are available
        n_samples, n_dim = X_new.shape

        #incorporate the likelihood of passive dynamics - a Gaussian
        """
                        P_0(x'|x) exp^(V(x'))
        P(x'|x) = --------------------------------- = N(x', m(x), S)
                    int_x'' P_0(x''|x) exp^(V(x''))
        """
        """
        for sake of maximization step and simplicity, evaluate a lower-bound instead
        log(P(x'|x)) > -0.5 * D * log(2*pi) + 0.5*log(det(S^{-1})) -0.5*log2 + 0.5*log2 - 0.5*(x'-f(x))^TSigma^{-1}(x'-f(x)) - 0.5*(x'-mu_k)^TSigma_k^{-1}(x'-mu_k) + 0.5*(mu_k-f(x))^TM^{-1}(mu_k-f(x))
                     > -0.5 * D * log(2*pi) + 0.5*log(det(S^{-1})/2) + 0.5*log2 - 0.5*(x'-f(x))^TSigma^{-1}(x'-f(x)) - 0.5*(x'-mu_k)^TSigma_k^{-1}(x'-mu_k)
                     > -0.5 * D * log(2*pi) + 0.5*log((det(Sigma_k)^{-1}+det(Sigma_0)^{-1})/2) + 0.5*log2 - 0.5*(x'-f(x))^TSigma^{-1}(x'-f(x)) - 0.5*(x'-mu_k)^TSigma_k^{-1}(x'-mu_k) + 0.5*(mu_k-f(x))^TM^{-1}(mu_k-f(x))
                     > -0.5 * D * log(2*pi) + 0.5*log(det(Sigma_k)^{-1})/2 + 0.5*log(det(Sigma_0))/2 + 0.5*log2 - 0.5*(x'-f(x))^TSigma^{-1}(x'-f(x)) - 0.5*(x'-mu_k)^TSigma_k^{-1}(x'-mu_k) + 0.5*(mu_k-f(x))^TM^{-1}(mu_k-f(x))
        Any way to bound the last term to also make it independent from matrix other than Sigma_k?
        """

        # regularize to prevent numerical instability
        Sigma_0 = self.passive_dyn_noise * self.passive_dyn_ctrl + self.reg * np.eye(
            X_new.shape[1])
        # + 1e-2 * np.eye(X_new.shape[1])
        Sigma_0_inv = np.linalg.pinv(Sigma_0)
        if estimator_idx is not None:
            Sigma = self.estimators_[estimator_idx]['covars']
            mu = self.estimators_[estimator_idx]['means']
            w = self.estimators_[estimator_idx]['weights']
        else:
            Sigma = self.estimators_full_['covars']
            mu = self.estimators_full_['means']
            w = self.estimators_full_['weights']
        nmix = len(mu)

        log_prob = np.empty((n_samples, nmix))
        for c, (mu_k, Sigma_k) in enumerate(zip(mu, Sigma)):
            #obviously, this fraction can be optimized by exploiting the structure of covariance matrix
            #using say Cholesky decomposition
            Sigma_k_inv = np.linalg.pinv(Sigma_k)
            S_inv = Sigma_k_inv + Sigma_0_inv
            S = np.linalg.pinv(S_inv)
            try:
                S_chol = linalg.cholesky(S, lower=True)
            except linalg.LinAlgError:
                # The model is most probably stuck in a component with too
                # few observations; we need to reinitialize this component
                S_chol = linalg.cholesky(S + min_covar * np.eye(n_dim),
                                         lower=True)
            m = S.dot((Sigma_k_inv.dot(mu_k) +
                       Sigma_0_inv.dot(X_new_passive.T).T).T).T
            #fraction part of above equation
            # scale_log_det = -.5 * (np.log(2*np.pi) + np.sum(np.log(S_inv)) +
            #     2*np.sum(np.log(np.diag(Sigma_k_chol))) + np.sum(np.log(np.diag(Sigma_0))))
            # #exp() part of the above equation
            # S_sol = linalg.solve_triangular(M_chol, (X_new - X_old).T, lower=True).T

            # scale_log_rbf = -.5 * (np.sum(M_sol**2), axis=1)
            S_log_det = 2 * np.sum(np.log(np.diag(S_chol)))
            # print 'S_log_det:', S_log_det
            S_sol = linalg.solve_triangular(S_chol, (X_new - m).T,
                                            lower=True).T
            log_prob[:, c] = -.5 * (np.sum(S_sol**2, axis=1) +
                                    n_dim * np.log(2 * np.pi) + S_log_det)
        lpr = log_prob + np.log(w)
        # print 'log_prob:', log_prob
        # print 'w:', w
        # print 'lpr:', lpr
        logprob = logsumexp(lpr, axis=1)
        responsibilities = np.exp(lpr - logprob[:, np.newaxis])
        return logprob, responsibilities

    def prepare_inv_and_constants(self):
        '''
        supplement steps to prepare inverse of variance matrices and constant terms
        '''
        regularization = self.reg
        for idx in range(self.n_estimators):
            self.estimators_[idx]['inv_covars'] = [
                np.linalg.pinv(covar + np.eye(covar.shape[0]) * regularization)
                for covar in self.estimators_[idx]['covars']
            ]
            self.estimators_[idx]['beta'] = [
                .5 * np.log(
                    pseudo_determinant(covar + np.eye(covar.shape[0]) *
                                       regularization)) +
                .5 * np.log(2 * np.pi) * covar.shape[0]
                for covar in self.estimators_[idx]['covars']
            ]

        self.estimators_full_['weights'] = []
        self.estimators_full_['means'] = []
        self.estimators_full_['covars'] = []
        for e_idx in range(self.n_estimators):
            for leaf_idx in range(len(self.estimators_[e_idx]['weights'])):
                self.estimators_full_['weights'].append(
                    self.estimators_[e_idx]['weights'][leaf_idx] /
                    float(self.n_estimators))
                self.estimators_full_['covars'].append(
                    self.estimators_[e_idx]['covars'][leaf_idx])
                self.estimators_full_['means'].append(
                    self.estimators_[e_idx]['means'][leaf_idx])
                # self.estimators_full_['inv_covars'] = [ np.linalg.pinv(covar) for covar in self.estimators_full_['covars']]
                # self.estimators_full_['beta'] = [.5*np.log(pseudo_determinant(covar)) + .5*np.log(2*np.pi)*covar.shape[0] for covar in self.estimators_full_['covars']]
                self.estimators_full_['inv_covars'].append(
                    self.estimators_[e_idx]['inv_covars'][leaf_idx])
                self.estimators_full_['beta'].append(
                    self.estimators_[e_idx]['beta'][leaf_idx])
        return
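
# Minimal usage sketch for EnsembleIOC (synthetic data and a hypothetical identity
# passive dynamics model; not part of the original file):
# import numpy as np
# n_dim = 4
# passive_dyn = lambda x: x                      # identity passive dynamics f(x) = x
# X_pairs = np.random.rand(200, 2 * n_dim)       # each row is a state pair [x, x']
# mdl = EnsembleIOC(n_estimators=5, max_depth=3, em_itrs=0,
#                   passive_dyn_func=passive_dyn,
#                   passive_dyn_ctrl=np.eye(n_dim),
#                   passive_dyn_noise=0.05)
# mdl.fit(X_pairs)
# costs, _ = mdl.score_samples(X_pairs)          # lower cost = more likely transition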
"""
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import make_circles
from sklearn.ensemble import RandomTreesEmbedding, ExtraTreesClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import SelectFromModel
from sklearn.naive_bayes import BernoulliNB

# make a synthetic dataset
X, y = make_circles(factor=0.5, random_state=0, noise=0.05)

# use RandomTreesEmbedding to transform data
hasher = RandomTreesEmbedding(n_estimators=10, random_state=0, max_depth=3)
hasher.fit(X)
model = SelectFromModel(hasher, prefit=True)
X_transformed = model.transform(X)

# Visualize result using PCA
pca = TruncatedSVD(n_components=2)
X_reduced = pca.fit_transform(X_transformed)

# Learn a Naive Bayes classifier on the transformed data
nb = BernoulliNB()
nb.fit(X_transformed, y)


# Learn an ExtraTreesClassifier for comparison
trees = ExtraTreesClassifier(max_depth=3, n_estimators=10, random_state=0)
trees.fit(X, y)