Example #1
1
class EnsembleIOC(BaseEstimator, RegressorMixin):
    '''
    Handling state/successor-state pairs as input
    '''
    def __init__(self,  n_estimators=20,
                        max_depth=5, min_samples_split=10, min_samples_leaf=10, clustering=0,
                        random_state=0,
                        em_itrs=5,
                        regularization=0.05,
                        passive_dyn_func=None,
                        passive_dyn_ctrl=None,
                        passive_dyn_noise=None,
                        verbose=False):
        '''
        n_estimators        - number of ensemble models
        ...                 - a batch of parameters used for RandomTreesEmbedding, see the relevant documentation
        clustering          - whether or not to force the number of subsets. If non-zero, call a clustering scheme with the learned metric
        em_itrs             - maximum number of EM iterations to take if one would like to increase the likelihood of the MaxEnt approximation
        regularization      - small positive scalar to prevent singularity in matrix inversion. This is especially necessary when passive dynamics
                              are considered. Notably, the underactuated system will assume zero covariance for uncontrolled state dimensions, but this might
                              not be the case in reality since the collected data could be corrupted by noise.
        passive_dyn_func    - function to evaluate passive dynamics; None for MaxEnt model
        passive_dyn_ctrl    - function to return the control matrix which might depend on the state...
        passive_dyn_noise   - covariance of a Gaussian noise; only applicable when passive_dyn is Gaussian; None for MaxEnt model
                                note this implies a dynamical system with constant input gain. It is extendable to have state dependent
                                input gain then we need covariance for each data point
        verbose             - output training information
        '''
        BaseEstimator.__init__(self)

        self.n_estimators=n_estimators
        self.max_depth=max_depth
        self.min_samples_split=min_samples_split
        self.min_samples_leaf=min_samples_leaf
        self.clustering=clustering
        self.random_state=random_state
        self.em_itrs=em_itrs
        self.reg=regularization
        self.passive_dyn_func=passive_dyn_func
        self.passive_dyn_ctrl=passive_dyn_ctrl
        self.passive_dyn_noise=passive_dyn_noise
        self.verbose=verbose
        return

    def predict(self, X):
        n_samples, n_dim = X.shape

        # use the approximated GMM to capture the correlation; this provides an initialization
        # for iterating the MAP estimation
        tmp_gmm = gmm.GMM(  n_components=len(self.gmm_estimators_full_['weights']),
                            priors=np.array(self.gmm_estimators_full_['weights']),
                            means=np.array(self.gmm_estimators_full_['means']),
                            covariances=self.gmm_estimators_full_['covars'])

        init_guess, init_covar = tmp_gmm.predict_with_covariance(indices=range(n_dim), X=X)

        def objfunc(x, *args):
            prior_mu, prior_inv_var = args
            vals, grads = self.value_eval_samples_helper(np.array([x]), average=False, const=True)
            prior_prob = .5*(x - prior_mu).dot(prior_inv_var).dot(x - prior_mu)
            prior_grad = prior_inv_var.dot(x-prior_mu)
            return vals[0] + prior_prob, grads[0] + prior_grad

        res = []
        for sample_idx in range(n_samples):
            opt_res = sciopt.minimize(  fun=objfunc,
                                        x0=init_guess[sample_idx, :],
                                        args=(init_guess[sample_idx, :], np.linalg.pinv(init_covar[sample_idx])),
                                        method='BFGS',
                                        jac=True,
                                        options={'gtol': 1e-8, 'disp': False})
            # print opt_res.message, opt_res.x,
            # print opt_res.fun, opt_res.jac
            # print init_guess[sample_idx, :], init_covar[sample_idx], opt_res.x
            res.append(opt_res.x)
        res = np.array(res)
        return res

    def _check_grads(self, X):
        n_samples, n_dim = X.shape

        # #predict the next state x_{t+1} given x_{t}
        tmp_gmm = gmm.GMM(  n_components=len(self.gmm_estimators_full_['weights']),
                            priors=np.array(self.gmm_estimators_full_['weights']),
                            means=np.array(self.gmm_estimators_full_['means']),
                            covariances=self.gmm_estimators_full_['covars'])

        init_guess, init_covar = tmp_gmm.predict_with_covariance(indices=range(n_dim), X=X)

        def objfunc(x, *args):
            prior_mu, prior_var = args
            vals, grads = self.value_eval_samples_helper(np.array([x]), average=False, const=True)
            prior_prob = .5*(x - prior_mu).dot(prior_var).dot(x - prior_mu)
            prior_grad = prior_var.dot(x-prior_mu)
            return vals[0] + prior_prob, grads[0] + prior_grad

        res = []
        for sample_idx in range(n_samples):
            def check_grad_fun(x):
                return objfunc(x, init_guess[sample_idx, :], init_covar[sample_idx])[0]
            def check_grad_fun_jac(x):
                return objfunc(x, init_guess[sample_idx, :], init_covar[sample_idx])[1]

            res.append(sciopt.check_grad(check_grad_fun, check_grad_fun_jac, X[sample_idx, :]))

        return np.mean(res)

    def fit(self, X, y=None):
        '''
        X - an array of concatenated features X_i = (x_{t-1}, x_{t}) corresponding to the infinite horizon case
        '''
        #check parameters...
        assert(type(self.n_estimators)==int)
        assert(self.n_estimators > 0)
        assert(type(self.max_depth)==int)
        assert(self.max_depth > 0)
        assert(type(self.min_samples_split)==int)
        assert(self.min_samples_split > 0)
        assert(type(self.min_samples_leaf)==int)
        assert(self.min_samples_leaf > 0)
        assert(type(self.em_itrs)==int)

        n_samples, n_dims = X.shape

        #an initial partitioning of data with random forest embedding
        self.random_embedding_mdl_ = RandomTreesEmbedding(
            n_estimators=self.n_estimators,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            random_state=self.random_state
            )

        #we probably do not need the data type to differentiate whether it is a demonstrated
        #trajectory or a commanded state, do we?
        if self.passive_dyn_func is not None and self.passive_dyn_ctrl is not None and self.passive_dyn_noise is not None:
            # self.random_embedding_mdl_.fit(X[:, X.shape[1]/2:])
            # indices = self.random_embedding_mdl_.apply(X[:, X.shape[1]/2:])
            self.random_embedding_mdl_.fit(X[:, :X.shape[1]/2])
            indices = self.random_embedding_mdl_.apply(X[:, :X.shape[1]/2])
            # X_tmp = np.array(X)
            # X_tmp[:, X.shape[1]/2:] = X_tmp[:, X.shape[1]/2:] - X_tmp[:, :X.shape[1]/2]
            # self.random_embedding_mdl_.fit(X_tmp)

            # indices = self.random_embedding_mdl_.apply(X_tmp)
        else:
            self.random_embedding_mdl_.fit(X)
            #figure out indices
            indices = self.random_embedding_mdl_.apply(X)

        #prepare ensemble for prediction
        self.random_prediction_mdl_ = RandomForestRegressor(
            n_estimators=self.n_estimators,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            random_state=self.random_state
            )

        self.random_prediction_mdl_.fit(X[:, :X.shape[1]/2], X[:, X.shape[1]/2:])

        if self.clustering > 0:
            #we need to force the data into the given number of clusters using the random embeddings
            #first construct affinity
            #use extracted indices as sparse features to construct an affinity matrix
            if self.n_estimators > 1:
                if self.verbose:
                    print 'Building {0} subsets of data depending on their random embedding similarity...'.format(self.clustering)
                #it makes sense to use the random embedding to do the clustering if we have ensembled features
                aff_mat = _affinity_matrix_from_indices(indices, 'binary')
                #using spectral mapping (Laplacian eigenmap)
                self.cluster = SpectralClustering(n_clusters=self.clustering, affinity='precomputed')
                self.cluster.fit(aff_mat)
            else:
                if self.verbose:
                    print 'Building {0} subsets of data depending on their Euclidean similarity...'.format(self.clustering)
                #otherwise, use Euclidean distance; this should be enough when the state space is low-dimensional
                self.cluster = KMeans(n_clusters=self.clustering, max_iter=200, n_init=5)
                self.cluster.fit(X)

            partitioned_data = defaultdict(list)
            leaf_idx = defaultdict(set)
            weight_idx = defaultdict(float)
            for d_idx, d, p_idx in zip(range(len(X)), X, self.cluster.labels_):
                partitioned_data[0, p_idx].append(d)
                leaf_idx[0] |= {p_idx}
            for p_idx in range(self.clustering):
                weight_idx[0, p_idx] = 1./self.clustering
            num_estimators = 1
        else:
            partitioned_data = defaultdict(list)
            leaf_idx = defaultdict(set)
            weight_idx = defaultdict(float)
            #group data belonging to the same partition and compute the weights...
            #are the weights really necessary for the EM steps? Hmm, they seem to be for the initialization
            #d_idx: data index; p_idx: partition index (comprised of estimator index and leaf index)
            for d_idx, d, p_idx in zip(range(len(X)), X, indices):
                for e_idx, l_idx in enumerate(p_idx):
                    partitioned_data[e_idx, l_idx].append(d)
                    leaf_idx[e_idx] |= {l_idx}

                for e_idx, l_idx in enumerate(p_idx):
                    weight_idx[e_idx, l_idx] = float(len(partitioned_data[e_idx, l_idx])) / len(X)
                    # weight_idx[e_idx, l_idx] = 1. / len(p_idx)
            num_estimators = self.n_estimators

        #for each group of data, solve an easy IOC problem by assuming a quadratic cost-to-go function
        #note that, if the passive dynamics need to be learned, extra steps are needed to train a regressor with weighted data
        #otherwise, just a simple Gaussian for each conditional probability distribution model
        self.estimators_ = []
        #another copy to store the parameters all together, for EM/evaluation on all of the models
        self.estimators_full_ = defaultdict(list)

        #<hyin/Feb-6th-2016> an estimator and leaf indexed structure to record the passive likelihood of data...
        passive_likelihood_dict = defaultdict(list)
        for e_idx in range(num_estimators):
            #for each estimator
            estimator_parms = defaultdict(list)
            for l_idx in leaf_idx[e_idx]:
                if self.verbose:
                    print 'Processing {0}-th estimator and {1}-th leaf/partition...'.format(e_idx, l_idx)
                #and for each data partition
                data_partition=np.array(partitioned_data[e_idx, l_idx])

                estimator_parms['means'].append(np.mean(data_partition, axis=0))
                estimator_parms['covars'].append(np.cov(data_partition.T) + np.eye(data_partition.shape[1])*self.reg)

                #for MaxEnt, uniform passive likelihood
                passive_likelihood_dict[e_idx, l_idx] = np.ones(len(data_partition)) / float(len(data_partition))


                estimator_parms['weights'].append(weight_idx[e_idx, l_idx])

            self.estimators_.append(estimator_parms)

        #can stop here or go for expectation maximization for each estimator...
        if self.em_itrs > 0:
            #prepare em results for each estimator
            em_res = [self._em_steps(e_idx, X, y) for e_idx in range(num_estimators)]

            self.estimators_ = em_res

        #record the gmm approximation
        self.gmm_estimators_ = copy.deepcopy(self.estimators_)
        self.gmm_estimators_full_ = defaultdict(list)

        for est in self.estimators_:
            for comp_idx in range(len(est['weights'])):
                est['means'][comp_idx] = est['means'][comp_idx][(n_dims/2):]
                est['covars'][comp_idx] = est['covars'][comp_idx][(n_dims/2):, (n_dims/2):]
                self.estimators_full_['weights'].append(est['weights'][comp_idx]/float(num_estimators))
                #for full estimators
                self.estimators_full_['means'].append(est['means'][comp_idx])
                self.estimators_full_['covars'].append(est['covars'][comp_idx])

        if self.passive_dyn_func is not None and self.passive_dyn_ctrl is not None and self.passive_dyn_noise is not None:
            X_new         = X[:, X.shape[1]/2:]
            X_old         = X[:, 0:X.shape[1]/2]

            #merge the model knowledge if passive dynamics model is available, use MaxEnt assumption otherwise
            X_new_passive = np.array([self.passive_dyn_func(X_old[sample_idx]) for sample_idx in range(X.shape[0])])
            passive_likelihood = _passive_dyn_likelihood(X_new, X_new_passive, self.passive_dyn_noise, self.passive_dyn_ctrl, self.reg)
            weights = passive_likelihood / (np.sum(passive_likelihood) + self.reg)

            if np.sum(weights) < 1e-10:
                weights = 1./len(weights) * np.ones(len(weights))
            #a GMM as a MaxEnt surrogate
            tmp_gmm = gmm.GMM(  n_components=len(self.estimators_[0]['weights']),
                                priors=self.estimators_[0]['weights'],
                                means=self.estimators_[0]['means'],
                                covariances=self.estimators_[0]['covars'])
            for e_idx in range(num_estimators):
                tmp_gmm.n_components = len(self.estimators_[e_idx]['weights'])
                tmp_gmm.priors = self.estimators_[e_idx]['weights']
                tmp_gmm.means = self.estimators_[e_idx]['means']
                tmp_gmm.covariances = self.estimators_[e_idx]['covars']

                responsibilities = tmp_gmm.to_responsibilities(X_new)
                responsibilities = responsibilities / (np.sum(responsibilities, axis=0) + 1e-10)
                new_weights = (weights * responsibilities.T).T

                new_weights = (new_weights + 1e-10) / (np.sum(new_weights +1e-10, axis=0))

                weighted_means = [np.sum((new_weight*X_new.T).T, axis=0) for new_weight in new_weights.T]

                weighted_covars =[ _frequency_weighted_covariance(X_new, weighted_mean, new_weight, spherical=False)
                                        for new_weight, weighted_mean in zip(new_weights.T, weighted_means)]

                self.estimators_[e_idx]['means'] = weighted_means
                self.estimators_[e_idx]['covars'] = weighted_covars


        self.prepare_inv_and_constants()
        return indices, leaf_idx, partitioned_data, passive_likelihood_dict

    def _em_steps(self, estimator_idx, X, y=None):
        #use current estimation as initialization to perform expectation-maximization
        #now reuse the procedure implemented by scikit-learn; actually a customized implementation
        #is required if the passive dynamics also need to be learned.
        if self.verbose:
            if estimator_idx is not None:
                print 'EM steps for the estimator {0}'.format(estimator_idx)
            else:
                print 'EM steps...'

        if estimator_idx is not None:
            n_partitions=len(self.estimators_[estimator_idx]['weights'])
            if self.verbose:
                print 'num of partitions:', n_partitions
            #use our own initialization
            g = gmm.GMM(n_components=n_partitions, priors=np.array(self.estimators_[estimator_idx]['weights']),
                means=np.array(self.estimators_[estimator_idx]['means']),
                covariances=np.array(self.estimators_[estimator_idx]['covars']),
                n_iter=self.em_itrs,
                covariance_type='full')
        else:
            n_partitions=len(self.estimators_full_['weights'])
            #estimator_idx is None: initialize from the aggregated (full) model parameters
            g = gmm.GMM(n_components=n_partitions, priors=np.array(self.estimators_full_['weights']),
                means=np.array(self.estimators_full_['means']),
                covariances=np.array(self.estimators_full_['covars']),
                n_iter=self.em_itrs,
                covariance_type='full')

        # g.fit(X[:, (X.shape[1]/2):])
        g.fit(X)

        #prepare to return a defaultdict
        res=defaultdict(list)
        res['means']=list(g.means)
        res['covars']=list(g.covariances)
        res['weights']=list(g.priors)

        return res

    def sample(self, n_samples=1, random_state=None):
        '''
        return samples that are synthesized from the model
        '''
        if not hasattr(self, 'estimators_'):
            print 'The model has not been trained yet...'
            return
        else:
            pass
        return

    def score(self, X, y=None):
        return self.value_eval_samples(X, y, False, True)

    def value_eval_samples(self, X, y=None, average=False, const=True):
        scores, grads = self.value_eval_samples_helper(X, y, average, const)
        return scores

    def value_eval_samples_helper(self, X, y=None, average=False, const=True):
        n_samples, n_dim = X.shape

        grads = np.zeros((n_samples, n_dim))

        if self.clustering > 0:
            num_estimators = 1
        else:
            num_estimators = self.n_estimators

        if not average:
            res = np.zeros(X.shape[0])
            res_mat = np.zeros((X.shape[0], len(self.estimators_full_['means'])))
            res_grad_tmp = []
            for i, (m, c_inv)   in enumerate(   zip(self.estimators_full_['means'],
                                            self.estimators_full_['inv_covars'])):
                diff_data = X - m
                res_mat[:, i] = np.array([e_prod.dot(e)*0.5 + self.estimators_full_['beta'][i]*const for e_prod, e in zip(diff_data.dot(c_inv), diff_data)])
                res_grad_tmp.append(c_inv.dot(diff_data.T).T)
            for d_idx, r in enumerate(res_mat):
                res[d_idx] = -logsumexp(-r, b=np.array(self.estimators_full_['weights']))
            resp = ((np.exp(-res_mat)*np.array(self.estimators_full_['weights'])).T / np.exp(-res)).T
            for e_idx in range(res_mat.shape[1]):
                grads += (res_grad_tmp[e_idx].T * resp[:, e_idx]).T
        else:
            def value_estimator_eval(d, est_idx):
                res = []
                for i, (m, c_inv) in enumerate(   zip(self.estimators_[est_idx]['means'],
                                            self.estimators_[est_idx]['inv_covars'])):
                    diff_data = d - m
                    res.append((.5*diff_data.dot(c_inv).dot(diff_data.T) + self.estimators_[est_idx]['beta'][i]*const)[0])
                return np.array(res).T
            def value_estimator_grad(d, est_idx, val):
                res_grad = 0
                for i, (m, c_inv) in enumerate(   zip(self.estimators_[est_idx]['means'],
                                            self.estimators_[est_idx]['inv_covars'])):
                    diff_data = d - m
                    resp = np.exp(-(.5*diff_data.dot(c_inv).dot(diff_data.T) + self.estimators_[est_idx]['beta'][i]*const)[0]) * self.estimators_[est_idx]['weights'][i]
                    grad_comp = c_inv.dot(diff_data.T).T
                    res_grad += (grad_comp.T * (resp / np.exp(-val))).T
                return res_grad
            res = np.array([-logsumexp(-value_estimator_eval(X, idx), axis=1, b=self.estimators_[idx]['weights']) for idx in range(num_estimators)]).T
            res_grad = [value_estimator_grad(X, idx, res[:, idx]) for idx in range(num_estimators)]
            res = np.mean(res, axis=1)
            grads = np.mean(res_grad, axis=0)
        return res, grads

    def prepare_inv_and_constants(self):
        '''
        supplementary steps to prepare the inverses of the covariance matrices and the constant terms
        '''
        regularization = self.reg

        if self.clustering > 0:
            num_estimators = 1
        else:
            num_estimators = self.n_estimators

        for idx in range(num_estimators):
            self.estimators_[idx]['inv_covars'] = [ np.linalg.pinv(covar + np.eye(covar.shape[0])*regularization) for covar in self.estimators_[idx]['covars']]
            self.estimators_[idx]['beta'] = [.5*np.log(pseudo_determinant(covar + np.eye(covar.shape[0])*regularization)) + .5*np.log(2*np.pi)*covar.shape[0] for covar in self.estimators_[idx]['covars']]

        self.estimators_full_['weights'] = []
        self.estimators_full_['means'] = []
        self.estimators_full_['covars'] = []

        self.gmm_estimators_full_['weights'] = []
        self.gmm_estimators_full_['means'] = []
        self.gmm_estimators_full_['covars'] = []
        for e_idx in range(num_estimators):
            for leaf_idx in range(len(self.estimators_[e_idx]['weights'])):
                self.estimators_full_['weights'].append(self.estimators_[e_idx]['weights'][leaf_idx]/float(num_estimators))
                self.estimators_full_['covars'].append(self.estimators_[e_idx]['covars'][leaf_idx])
                self.estimators_full_['means'].append(self.estimators_[e_idx]['means'][leaf_idx])

                self.estimators_full_['inv_covars'].append(self.estimators_[e_idx]['inv_covars'][leaf_idx])
                self.estimators_full_['beta'].append(self.estimators_[e_idx]['beta'][leaf_idx])

                self.gmm_estimators_full_['weights'].append(self.gmm_estimators_[e_idx]['weights'][leaf_idx]/float(num_estimators))
                self.gmm_estimators_full_['covars'].append(self.gmm_estimators_[e_idx]['covars'][leaf_idx])
                self.gmm_estimators_full_['means'].append(self.gmm_estimators_[e_idx]['means'][leaf_idx])
        return
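
# A minimal usage sketch for the EnsembleIOC class above (an addition, not part of the
# original source). It assumes the class and the module-level helpers it relies on (the
# gmm module, _passive_dyn_likelihood, _frequency_weighted_covariance, pseudo_determinant,
# _affinity_matrix_from_indices) are importable. The state pairs below are synthetic and
# no passive dynamics model is supplied, so the MaxEnt variant is exercised.
import numpy as np

rng = np.random.RandomState(0)
x_t = rng.randn(200, 2)                        # states x_t
x_tp1 = x_t + 0.1 * rng.randn(200, 2)          # noisy successor states x_{t+1}
X_demo = np.hstack([x_t, x_tp1])               # concatenated features (x_t, x_{t+1})

ioc_mdl = EnsembleIOC(n_estimators=5, max_depth=3, em_itrs=0)
ioc_mdl.fit(X_demo)                            # partition the data and fit local Gaussians
costs = ioc_mdl.value_eval_samples(x_tp1)      # evaluate the learned cost-to-go
x_next = ioc_mdl.predict(x_t)                  # MAP prediction of the successor states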
Example #2
0
def rt_embedding(X, n_estimators=100, max_depth=10, n_jobs=-1):
    """Embed data matrix X in a random forest.

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        The data matrix.
    n_estimators : int, optional
        The number of trees in the embedding.
    max_depth : int, optional
        The maximum depth of each tree.
    n_jobs : int, optional
        Number of compute jobs when fitting the trees. -1 means number
        of processors on the current computer.

    Returns
    -------
    rt : RandomTreesEmbedding object
        The embedding object.
    X_transformed : sparse matrix
        The transformed data.
    """
    rt = RandomTreesEmbedding(n_estimators=n_estimators, max_depth=max_depth,
                              n_jobs=n_jobs)
    X_transformed = rt.fit_transform(X)
    return rt, X_transformed
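
# A short usage sketch for rt_embedding above (an addition, not part of the original
# source). Each row of the returned sparse matrix has exactly one active leaf indicator
# per tree, so it sums to n_estimators.
import numpy as np

X_demo = np.random.RandomState(0).randn(100, 4)
rt, X_embedded = rt_embedding(X_demo, n_estimators=20, max_depth=5)
print(X_embedded.shape)                 # (100, total number of leaves across the 20 trees)
print(X_embedded.sum(axis=1).max())     # 20.0, one active leaf per tree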
Example #3
0
def test_random_hasher_sparse_data():
    X, y = datasets.make_multilabel_classification(return_indicator=True,
                                                   random_state=0)
    hasher = RandomTreesEmbedding(n_estimators=30, random_state=1)
    X_transformed = hasher.fit_transform(X)
    X_transformed_sparse = hasher.fit_transform(csc_matrix(X))
    assert_array_equal(X_transformed_sparse.toarray(), X_transformed.toarray())
Example #4
0
    def random_forest_embedding(self, data, n_estimators=30, random_state=0, max_depth=3, min_samples_leaf=1):
        """
        learn a density with random forest representation
        """
        """
        scikit-learn only supports axis-aligned separation; let's first stick to this and see how it works
        """
        # n_estimators = 400
        # random_state = 0
        # max_depth = 5
        rf_mdl = RandomTreesEmbedding(
            n_estimators=n_estimators,
            random_state=random_state,
            max_depth=max_depth,
            min_samples_leaf=min_samples_leaf)
        rf_mdl.fit(data)

        indices = rf_mdl.apply(data)
        samples_by_node = defaultdict(list)
        idx_by_node = defaultdict(list)
        #kde_by_node = defaultdict(KernelDensity)

        for idx, sample, est_data in zip(range(len(data)), data, indices):
            for est_ind, leaf in enumerate(est_data):
                samples_by_node[ est_ind, leaf ].append(sample)
                idx_by_node[ est_ind, leaf ].append(idx)

        res_mdl = dict()
        res_mdl['rf_mdl'] = rf_mdl
        res_mdl['samples_dict'] = samples_by_node
        res_mdl['idx_dict'] = idx_by_node
        # res_mdl['kde_dict'] = kde_by_node
        return res_mdl
Example #5
0
def random_forest_embedding(data, n_estimators=400, random_state=0, max_depth=5, min_samples_leaf=1):
    """
    learn a density with random forest representation
    """
    """
    scikit-learn only supports axis-aligned separation; let's first stick to this and see how it works
    """
    # n_estimators = 400
    # random_state = 0
    # max_depth = 5
    rf_mdl = RandomTreesEmbedding(
        n_estimators=n_estimators, 
        random_state=random_state, 
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf)
    rf_mdl.fit(data)
    
    # forestClf.fit(trainingData, trainingLabels)
    # indices = forestClf.apply(trainingData)
    # samples_by_node = defaultdict(list)
    # for est_ind, est_data in enumerate(indices.T):
    # for sample_ind, leaf in enumerate(est_data):
    # samples_by_node[ est_ind, leaf ].append(sample_ind)
    # indexOfSamples = samples_by_node[0,10]
    # # samples_by_node[treeIndex, leafIndex within that tree]
    # leafNodeSamples = trainingAngles[indexOfSamples]
    # kde = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(leafNodeSamples)

    indices = rf_mdl.apply(data)
    samples_by_node = defaultdict(list)
    idx_by_node = defaultdict(list)
    kde_by_node = defaultdict(KernelDensity)

    for idx, sample, est_data in zip(range(len(data)), data, indices):
        for est_ind, leaf in enumerate(est_data):
            samples_by_node[ est_ind, leaf ].append(sample)
            idx_by_node[ est_ind, leaf ].append(idx)

        
    #Kernel Density Estimation for each leaf node
    # for k,v in samples_by_node.iteritems():
    #     est_ind, leaf = k
          # params = {'bandwidth': np.logspace(-1, 1, 20)}
          # grid = GridSearchCV(KernelDensity(), params)
          # grid.fit(v)

    #     kde_by_node[ est_ind, leaf ] = grid.best_estimator_

    res_mdl = dict()
    res_mdl['rf_mdl'] = rf_mdl
    res_mdl['samples_dict'] = samples_by_node
    res_mdl['idx_dict'] = idx_by_node
    # res_mdl['kde_dict'] = kde_by_node
    return res_mdl
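
# A hedged sketch of how the returned dictionaries might be used: fit a KernelDensity
# model per (tree, leaf) partition, as the commented-out code above suggests. The helper
# below is an assumption for illustration, not part of the original source.
import numpy as np
from sklearn.neighbors import KernelDensity

def leaf_kde_models(res_mdl, bandwidth=0.2):
    kde_by_node = dict()
    for (est_ind, leaf), samples in res_mdl['samples_dict'].items():
        samples = np.asarray(samples)
        if len(samples) > 1:
            # one Gaussian KDE per leaf-node partition
            kde_by_node[est_ind, leaf] = KernelDensity(kernel='gaussian',
                                                       bandwidth=bandwidth).fit(samples)
    return kde_by_node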
Example #6
0
def test_random_trees_dense_type():
    # Test that the `sparse_output` parameter of RandomTreesEmbedding
    # works by returning a dense array.

    # Create the RTE with sparse=False
    hasher = RandomTreesEmbedding(n_estimators=10, sparse_output=False)
    X, y = datasets.make_circles(factor=0.5)
    X_transformed = hasher.fit_transform(X)

    # Assert that type is ndarray, not scipy.sparse.csr.csr_matrix
    assert_equal(type(X_transformed), np.ndarray)
Example #7
0
def test_random_trees_dense_equal():
    # Test that the `sparse_output` parameter of RandomTreesEmbedding
    # works by returning the same array for both argument values.

    # Create the RTEs
    hasher_dense = RandomTreesEmbedding(n_estimators=10, sparse_output=False, random_state=0)
    hasher_sparse = RandomTreesEmbedding(n_estimators=10, sparse_output=True, random_state=0)
    X, y = datasets.make_circles(factor=0.5)
    X_transformed_dense = hasher_dense.fit_transform(X)
    X_transformed_sparse = hasher_sparse.fit_transform(X)

    # Assert that dense and sparse hashers have same array.
    assert_array_equal(X_transformed_sparse.toarray(), X_transformed_dense)
Example #8
0
def do_TRT(ne=10, md=3):
    from sklearn.ensemble import RandomTreesEmbedding
    from sklearn.naive_bayes import BernoulliNB
    train_X, train_Y, test_X, test_Y = analysis_glass()
    all_X = np.vstack((train_X, test_X))
    hasher = RandomTreesEmbedding(n_estimators=ne,\
                                  random_state=0, max_depth=md)
    all_X_trans = hasher.fit_transform(all_X)
    train_X_trans = all_X_trans[0:149, :]
    test_X_trans = all_X_trans[149:, :]

    nb = BernoulliNB()
    nb.fit(train_X_trans, train_Y)

    return nb.score(test_X_trans, test_Y)
Example #9
0
def test_random_hasher():
    # test random forest hashing on circles dataset
    # make sure that it is linearly separable.
    # even after projected to two pca dimensions
    hasher = RandomTreesEmbedding(n_estimators=30, random_state=0)
    X, y = datasets.make_circles(factor=0.5)
    X_transformed = hasher.fit_transform(X)

    # test fit and transform:
    hasher = RandomTreesEmbedding(n_estimators=30, random_state=0)
    assert_array_equal(hasher.fit(X).transform(X).toarray(), X_transformed.toarray())

    # one leaf active per data point per forest
    assert_equal(X_transformed.shape[0], X.shape[0])
    assert_array_equal(X_transformed.sum(axis=1), hasher.n_estimators)
    pca = RandomizedPCA(n_components=2)
    X_reduced = pca.fit_transform(X_transformed)
    linear_clf = LinearSVC()
    linear_clf.fit(X_reduced, y)
    assert_equal(linear_clf.score(X_reduced, y), 1.0)
Example #10
0
 def cluster_training(self, train, distance=False):
     '''
     This is the basic clustering function
     '''
     self.train_matrix = train.train
     '''
     Step one is to make sure that there is a distance matrix in place.
     It is best to feed an existing distance matrix if one is available.
     '''
     if distance is False:
         self.p_feat_matrix = self.tools.pairwise_distance_matrix(train.train, 'jaccard')
     else:
         self.p_feat_matrix = distance
     '''
     Step two is to cluster your data using a random trees embedding. This is a
     random ensemble of trees and amounts to a transformation of the data into a
     high-dimensional, sparse space
     '''
     self.clf = RandomTreesEmbedding(n_estimators=512, random_state=self.seed, max_depth=5)
     #self.clf.fit(self.train_matrix)
     X_transformed = self.clf.fit_transform(self.train_matrix)
     '''
     Step three performs truncated SVD (similar to PCA). It operates on the sample
     vectors directly, rather than the covariance matrix. It takes the first two
     components. Essentially this reduces the sparse embedding to a low dimensional
     representation.
     '''
     self.svd = TruncatedSVD(n_components=2)
     self.svd.clf = self.svd.fit(X_transformed)
     self.model = self.svd.clf.transform(X_transformed)
     '''
     The next step is to take the transformed model and the original dataset and
     determine the number of clusters that maximizes the silhouette_score
     '''
     (self.cluster_assignment,
      self.cluster_num,
      self.cluster_score) = self.tools.identify_accurate_number_of_clusters(self.model, self.compounds)
     self.individualclusters = []
     '''
     The individual datapoints are assessed with regard to the best clustering scheme
     '''
     for i in range(self.cluster_num):
         self.individualclusters.append([])
         for j in range(len(self.cluster_assignment)):
             if self.cluster_assignment[j] == i:
                 self.individualclusters[i].append(self.model[j, :])
         self.individualclusters[i] = np.array(self.individualclusters[i])
     '''
     Finally, this clustering scheme is used to generate a one class Support
     Vector Machine decision boundary.
     '''
     (self.clf_OCSVM,
      self.OCSVM_model) = self.tools.determine_test_similarity(self.individualclusters)
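
# A stand-alone sketch of the embedding + dimensionality-reduction pipeline described in
# the docstrings above, using plain scikit-learn. The silhouette-based selection of the
# cluster count performed by self.tools is replaced here by a fixed KMeans; this is an
# assumption for illustration, not the original implementation.
import numpy as np
from sklearn.ensemble import RandomTreesEmbedding
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans

X_demo = np.random.RandomState(0).rand(60, 8)
embedding = RandomTreesEmbedding(n_estimators=512, max_depth=5, random_state=0)
X_sparse = embedding.fit_transform(X_demo)                      # sparse leaf-indicator features
X_low = TruncatedSVD(n_components=2).fit_transform(X_sparse)    # 2-D dense representation
labels = KMeans(n_clusters=3, random_state=0).fit_predict(X_low)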
Example #11
0
 def __init__(self, coordinator, base_classifier, n_estimators=10,
              max_depth=5, min_samples_split=2, min_samples_leaf=1,
              n_jobs=-1, random_state=None, verbose=0, min_density=None):
     Classifier.__init__(self, coordinator, base_classifier)
     self.histoSize = 0
     self._visualBagger = RandomTreesEmbedding(n_estimators=n_estimators,
                                               max_depth=max_depth,
                                               min_samples_split=min_samples_split,
                                               min_samples_leaf=min_samples_leaf,
                                               n_jobs=n_jobs,
                                               random_state=random_state,
                                               verbose=verbose,
                                               min_density=min_density)
Example #12
0
 def cluster_testing(self, testing):
     '''Create RandomTreesEmbedding of data'''
     clf = RandomTreesEmbedding(n_estimators=512, random_state=self.seed, max_depth=5)
     '''Fit testing data to training model'''
     clf.fit = self.clf.fit(testing)
     X_transformed = self.clf.fit_transform(testing)
     n_components = 2
     '''SVD transform data'''
     svd = TruncatedSVD(n_components=n_components)
     svd.clf = svd.fit(X_transformed)
     svd.model = svd.clf.transform(X_transformed)
     '''Train transformed data using original model'''
     train_transformed = clf.fit.transform(self.train_matrix)
     train_model = svd.clf.transform(train_transformed)
     '''Generate One Class SVM rejection criteria'''
     (clf_OCSVM_t, OCSVMmodel_t) = self.tools.determine_testing_data_similarity(train_model)
     predicted = []
     '''Remove testing compounds outside rejection margin'''
     for i in range(len(svd.model)):
         p = OCSVMmodel_t.predict(svd.model[i, :].reshape(1, -1))
         pred = OCSVMmodel_t.decision_function(svd.model[i, :].reshape(1, -1)).ravel()
         if (p == 1):
             predicted.append(i)
     return predicted
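
# A hedged sketch of the one-class-SVM rejection step used above with plain scikit-learn;
# the helper self.tools.determine_testing_data_similarity is assumed to wrap something
# roughly equivalent to this. The data below are synthetic stand-ins.
import numpy as np
from sklearn.svm import OneClassSVM

train_model_demo = np.random.RandomState(0).randn(50, 2)   # stand-in for the SVD-reduced training data
test_model_demo = np.random.RandomState(1).randn(10, 2)    # stand-in for the SVD-reduced testing data

ocsvm = OneClassSVM(nu=0.1, kernel='rbf', gamma='auto').fit(train_model_demo)
accepted = [i for i in range(len(test_model_demo))
            if ocsvm.predict(test_model_demo[i, :].reshape(1, -1))[0] == 1]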
Example #13
0
    def fit(self, X, y=None):
        '''
        y could be the array of starting states of the demonstrated trajectories/policies.
        If it is None, it implicitly implies a MaxEnt model. Otherwise, it serves as the feature mapping
        of the starting state. These data might also be used for learning the passive dynamics
        in a purely model-free setting with some regressors and regularization.
        '''
        #check parameters...
        assert(type(self.n_estimators)==int)
        assert(self.n_estimators > 0)
        assert(type(self.max_depth)==int)
        assert(self.max_depth > 0)
        assert(type(self.min_samples_split)==int)
        assert(self.min_samples_split > 0)
        assert(type(self.min_samples_leaf)==int)
        assert(self.min_samples_leaf > 0)
        assert(type(self.em_itrs)==int)

        #an initial partitioning of data with random forest embedding
        self.random_embedding_mdl_ = RandomTreesEmbedding(
            n_estimators=self.n_estimators,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            random_state=self.random_state
            )

        #we probably do not need the data type to differentiate whether it is a demonstrated
        #trajectory or a commanded state, do we?
        if self.passive_dyn_func is not None and self.passive_dyn_ctrl is not None and self.passive_dyn_noise is not None:
            self.random_embedding_mdl_.fit(X[:, X.shape[1]/2:])
            indices = self.random_embedding_mdl_.apply(X[:, X.shape[1]/2:])
            # X_tmp = np.array(X)
            # X_tmp[:, X.shape[1]/2:] = X_tmp[:, X.shape[1]/2:] - X_tmp[:, :X.shape[1]/2]
            # self.random_embedding_mdl_.fit(X_tmp)

            # indices = self.random_embedding_mdl_.apply(X_tmp)
        else:
            self.random_embedding_mdl_.fit(X)
            #figure out indices
            indices = self.random_embedding_mdl_.apply(X)

        partitioned_data = defaultdict(list)

        leaf_idx = defaultdict(set)
        weight_idx = defaultdict(float)
        #group data belonging to the same partition and compute the weights...
        #are the weights really necessary for the EM steps? Hmm, they seem to be for the initialization
        #d_idx: data index; p_idx: partition index (comprised of estimator index and leaf index)
        for d_idx, d, p_idx in zip(range(len(X)), X, indices):
            for e_idx, l_idx in enumerate(p_idx):
                partitioned_data[e_idx, l_idx].append(d)
                leaf_idx[e_idx] |= {l_idx}

            for e_idx, l_idx in enumerate(p_idx):
                weight_idx[e_idx, l_idx] = float(len(partitioned_data[e_idx, l_idx])) / len(X)
                # weight_idx[e_idx, l_idx] = 1. / len(p_idx)

        #for each group of data, solve an easy IOC problem by assuming a quadratic cost-to-go function
        #note that, if the passive dynamics need to be learned, extra steps are needed to train a regressor with weighted data
        #otherwise, just a simple Gaussian for each conditional probability distribution model
        self.estimators_ = []
        #another copy to store the parameters all together, for EM/evaluation on all of the models
        self.estimators_full_ = defaultdict(list)
        #<hyin/Feb-6th-2016> an estimator and leaf indexed structure to record the passive likelihood of data...
        passive_likelihood_dict = defaultdict(list)
        for e_idx in range(self.n_estimators):
            #for each estimator
            estimator_parms = defaultdict(list)
            for l_idx in leaf_idx[e_idx]:
                if self.verbose:
                    print 'Processing {0}-th estimator and {1}-th leaf...'.format(e_idx, l_idx)
                #and for each data partition
                data_partition=np.array(partitioned_data[e_idx, l_idx])
                if self.passive_dyn_func is not None and self.passive_dyn_ctrl is not None and self.passive_dyn_noise is not None:
                    X_new         = data_partition[:, data_partition.shape[1]/2:]
                    X_old         = data_partition[:, 0:data_partition.shape[1]/2]
                    X_new_passive = np.array([self.passive_dyn_func(X_old[sample_idx]) for sample_idx in range(data_partition.shape[0])])
                    passive_likelihood = _passive_dyn_likelihood(X_new, X_new_passive, self.passive_dyn_noise, self.passive_dyn_ctrl, self.reg)

                    weights = passive_likelihood / np.sum(passive_likelihood)
                    weighted_mean = np.sum((weights*X_new.T).T, axis=0)

                    estimator_parms['means'].append(weighted_mean)
                    estimator_parms['covars'].append(_frequency_weighted_covariance(X_new, weighted_mean, weights, spherical=False))

                    #for full estimators
                    self.estimators_full_['means'].append(estimator_parms['means'][-1])
                    self.estimators_full_['covars'].append(estimator_parms['covars'][-1])

                    #<hyin/Feb-6th-2016> also remember the data weight according to the passive likelihood
                    #this could be useful if the weights according to the passive likelihood are desired for other applications
                    #to evaluate some statistics within the data partition
                    passive_likelihood_dict[e_idx, l_idx] = weights
                else:
                    estimator_parms['means'].append(np.mean(data_partition, axis=0))
                    estimator_parms['covars'].append(np.cov(data_partition.T))

                    #for full estimators
                    self.estimators_full_['means'].append(estimator_parms['means'][-1])
                    self.estimators_full_['covars'].append(estimator_parms['covars'][-1])

                    #for MaxEnt, uniform passive likelihood
                    passive_likelihood_dict[e_idx, l_idx] = np.ones(len(data_partition)) / float(len(data_partition))


                estimator_parms['weights'].append(weight_idx[e_idx, l_idx])
                self.estimators_full_['weights'].append(weight_idx[e_idx, l_idx]/float(self.n_estimators))

            self.estimators_.append(estimator_parms)
        #can stop here or go for expectation maximization for each estimator...
        if self.em_itrs > 0:
            #prepare em results for each estimator
            em_res = [self._em_steps(e_idx, X, y) for e_idx in range(self.n_estimators)]
            #or do EM on the full model?
            # <hyin/Dec-2nd-2015> no, doing this seems to harm the learning as the aggregated model is really
            # complex so optimizing that model tends to overfit...
            # em_res = self._em_steps(None, X, y)
            #then use them
            self.estimators_=em_res

        self.prepare_inv_and_constants()
        return indices, leaf_idx, passive_likelihood_dict
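
# The helper _frequency_weighted_covariance used above is not shown in these snippets; a
# plausible minimal version of a weighted covariance (weights assumed to sum to one, no
# Bessel correction) is sketched below as an assumption, not the author's code.
import numpy as np

def _frequency_weighted_covariance_sketch(X, mean, weights, spherical=False):
    weights = np.asarray(weights, dtype=float)
    diff = X - mean                                  # (n_samples, n_dims)
    cov = (weights[:, None] * diff).T.dot(diff)      # sum_i w_i (x_i - m)(x_i - m)^T
    if spherical:
        cov = np.eye(X.shape[1]) * np.mean(np.diag(cov))
    return cov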
Example #14
0
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_curve

n_estimator = 10
X, y = make_classification(n_samples=80000)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
# It is important to train the ensemble of trees on a different subset
# of the training data than the linear regression model to avoid
# overfitting, in particular if the total number of leaves is
# similar to the number of training samples
X_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train,
                                                            y_train,
                                                            test_size=0.5)

# Unsupervised transformation based on totally random trees
rt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator)
rt_lm = LogisticRegression()
rt.fit(X_train, y_train)
rt_lm.fit(rt.transform(X_train_lr), y_train_lr)

y_pred_rt = rt_lm.predict_proba(rt.transform(X_test))[:, 1]
fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)


# Supervised transformation based on random forests
rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
rf_enc = OneHotEncoder()
rf_lm = LogisticRegression()
rf.fit(X_train, y_train)
rf_enc.fit(rf.apply(X_train))
rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)
Example #15
0
# print(true_label)
# print(Y)

# 4. Data split
# train_size: fraction of the data used for training after the split; default 0.75
# random_state: random seed used for the split; default None uses the current timestamp; a non-None value makes repeated runs reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.8, random_state=28)
print("Shape of training data X: {}, type: {}".format(x_train.shape, type(x_train)))
print("Shape of test data X: {}".format(x_test.shape))
print("Type of training data Y: {}".format(type(y_train)))

# 5. Feature engineering
# NOTE: no feature engineering is done

# 6. Build the model object
algo = RandomTreesEmbedding(n_estimators=10, max_depth=2, sparse_output=False)

# 7. Train the model
algo.fit(x_train)

# 10. Other special APIs
print("List of sub-models:\n{}".format(algo.estimators_))

from sklearn import tree
import pydotplus

k = 0
for algo1 in algo.estimators_:
    dot_data = tree.export_graphviz(decision_tree=algo1, out_file=None,
                                    feature_names=['A', 'B', 'C', 'D'],
                                    class_names=['1', '2', '3'],
Example #16
0
#featuresnp = np.array(features[0:2000]+features[-2000:], dtype='float32')
#targetnp = np.array(target[0:2000]+target[-2000:], dtype='int32')

featuresnp = np.array(features, dtype='float32')
targetnp = np.array(target, dtype='int32')

featuresnp -= np.mean(featuresnp, axis=0)
featuresnp /= np.std(featuresnp, axis=0)


# make a synthetic dataset
X, y = featuresnp, targetnp

# use RandomTreesEmbedding to transform data
hasher = RandomTreesEmbedding(n_estimators=50, random_state=0, max_depth=1)
X_transformed = hasher.fit_transform(X)

## Visualize result using PCA
#pca = RandomizedPCA(n_components=50)
#X_reduced = pca.fit_transform(X_transformed)

print("Computing Isomap embedding")

X_reduced = manifold.Isomap(n_neighbors=30, n_components=2).fit_transform(X)
print("Done.")

#print("Computing Spectral embedding")
#embedder = manifold.SpectralEmbedding(n_components=2, random_state=0,
#                                      eigen_solver="arpack")
#X_reduced = embedder.fit_transform(X)
Example #17
0
    return prob


# use the Iris dataset as a simple example

iris = datasets.load_iris()

samples = iris.data
labels = iris.target
n_estimator = 200

plt.figure(figsize=(15, 10))

# train a random trees embedding model of 200 estimators

clf = RandomTreesEmbedding(n_estimators=n_estimator, max_depth=10)
clf = clf.fit(samples)
forest = clf.estimators_

# compute the distances

d_Nearest_Common_Ancestor = NCA(
    forest, samples)  # the depth of the nearest common ancestor
d_shortest_path = SP(forest, samples)  # the shortest path
prob = PM(clf, samples)  # the proximity matrix

# plot the heatmap

plt.subplot(1, 3, 1)
plt.imshow(d_Nearest_Common_Ancestor)
plt.subplot(1, 3, 2)
Example #18
0
"""

import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import make_circles
from sklearn.ensemble import RandomTreesEmbedding, ExtraTreesClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.naive_bayes import BernoulliNB

# make a synthetic dataset
X, y = make_circles(factor=0.5, random_state=0, noise=0.05)

# use RandomTreesEmbedding to transform data
hasher = RandomTreesEmbedding(n_estimators=10, random_state=0, max_depth=3)
X_transformed = hasher.fit_transform(X)

# Visualize result after dimensionality reduction using truncated SVD
svd = TruncatedSVD(n_components=2)
X_reduced = svd.fit_transform(X_transformed)

# Learn a Naive Bayes classifier on the transformed data
nb = BernoulliNB()
nb.fit(X_transformed, y)


# Learn an ExtraTreesClassifier for comparison
trees = ExtraTreesClassifier(max_depth=3, n_estimators=10, random_state=0)
trees.fit(X, y)
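
# A short follow-up (an addition, not in the original example): compare the training
# accuracy of the Naive Bayes model on the hashed features with that of the ExtraTrees
# classifier on the raw features. The printed numbers are only illustrative.
print("BernoulliNB on hashed features: {0:.3f}".format(nb.score(X_transformed, y)))
print("ExtraTreesClassifier on raw features: {0:.3f}".format(trees.score(X, y)))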
Example #19
0
class EnsembleIOCTraj(BaseEstimator, RegressorMixin):
    '''
    Handling entire trajectories as input
    '''
    def __init__(self,  traj_clusters=3, ti=True,
                        n_estimators=20,
                        max_depth=5, min_samples_split=10, min_samples_leaf=10, state_n_estimators=100, state_n_clusters=0,
                        random_state=0,
                        em_itrs=5,
                        regularization=0.05,
                        passive_dyn_func=None,
                        passive_dyn_ctrl=None,
                        passive_dyn_noise=None,
                        verbose=False):
        '''
        traj_clusters       - number of clusters of trajectories
        ti                  - whether or not to extract time-invariant states

        ***The remaining parameters are for the state IOC estimators***
        n_estimators        - number of ensemble models
        ...                 - a batch of parameters used for RandomTreesEmbedding, see the relevant documentation

        state_n_estimators  - number of state estimators
        state_n_clusters    - number of clusters for states for each trajectory group
        em_itrs             - maximum number of EM iterations to take
        regularization      - small positive scalar to prevent singularity of matrix inversion
        passive_dyn_func    - function to evaluate passive dynamics; None for MaxEnt model
        passive_dyn_ctrl    - function to return the control matrix which might depend on the state...
        passive_dyn_noise   - covariance of a Gaussian noise; only applicable when passive_dyn is Gaussian; None for MaxEnt model
                                note this implies a dynamical system with constant input gain. It is extendable to have state dependent
                                input gain then we need covariance for each data point
        verbose             - output training information
        '''
        self.n_traj_clusters = traj_clusters
        if isinstance(state_n_clusters, int):
            state_clusters_lst = [state_n_clusters] * self.n_traj_clusters
        else:
            state_clusters_lst = state_n_clusters

        self.eioc_mdls = [ EnsembleIOC( n_estimators=state_n_estimators,
                                        max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, clustering=state_clusters_lst[i],  #let the random embedding decide how many clusters we should have
                                        random_state=random_state,
                                        em_itrs=em_itrs,
                                        regularization=regularization,
                                        passive_dyn_func=passive_dyn_func,
                                        passive_dyn_ctrl=passive_dyn_ctrl,
                                        passive_dyn_noise=passive_dyn_noise,
                                        verbose=verbose) for i in range(self.n_traj_clusters) ]
        self.ti = ti
        self.n_estimators=n_estimators
        self.max_depth=max_depth
        self.min_samples_split=min_samples_split
        self.min_samples_leaf=min_samples_leaf
        self.random_state=random_state
        self.state_n_estimators = state_n_estimators
        self.state_n_clusters = state_n_clusters
        self.em_itrs=em_itrs
        self.reg=regularization
        self.passive_dyn_func=passive_dyn_func
        self.passive_dyn_ctrl=passive_dyn_ctrl
        self.passive_dyn_noise=passive_dyn_noise
        self.verbose=verbose

        self.clustered_trajs = None
        return

    def cluster_trajectories(self, trajs):
        #cluster the trajectories according to the random embedding parameters and the number of clusters
        #flatten each trajectory
        flattened_trajs = np.array([np.array(traj).T.flatten() for traj in trajs])

        #an initial partitioning of data with random forest embedding
        self.random_embedding_mdl_ = RandomTreesEmbedding(
            n_estimators=self.n_estimators,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            random_state=self.random_state
            )

        self.random_embedding_mdl_.fit(flattened_trajs)
        #figure out indices
        indices = self.random_embedding_mdl_.apply(flattened_trajs)

        #we need to force the data into the given number of clusters using the random embeddings
        #first construct affinity
        #use extracted indices as sparse features to construct an affinity matrix
        if self.verbose:
            print 'Building {0} subsets of trajectories depending on their random embedding similarity...'.format(self.n_traj_clusters)
        aff_mat = _affinity_matrix_from_indices(indices, 'binary')
        #using spectral mapping (Laplacian eigenmap)
        self.cluster = SpectralClustering(n_clusters=self.n_traj_clusters, affinity='precomputed')
        self.cluster.fit(aff_mat)

        clustered_trajs = [[] for i in range(self.n_traj_clusters)]

        for d_idx, d, p_idx in zip(range(len(trajs)), trajs, self.cluster.labels_):
            clustered_trajs[p_idx].append(d)

        #let's see how the DBSCAN works
        #here it determines at least how many trajectories are needed to form a cluster
        #don't know why it always assigns all of the data as noise...
        # self.cluster = DBSCAN(eps=0.5, min_samples=self.n_traj_clusters, metric='euclidean', algorithm='auto')
        # flatten_trajs = [traj.T.flatten() for traj in trajs]
        # self.cluster.fit(flatten_trajs)
        # labels = self.cluster.labels_
        # print labels
        # # Number of clusters in labels, ignoring noise if present.
        # n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        #
        # clustered_trajs = [[] for i in range(n_clusters_)]
        #
        # for d_idx, d, p_idx in zip(range(len(trajs)), trajs, labels):
        #     clustered_trajs[p_idx].append(d)

        return np.array(clustered_trajs)

    def fit(self, X, y=None):
        '''
        X is an array of trajectories
        '''
        #first cluster these trajectories into locally similar data sets (here 'locally' does not necessarily mean Euclidean distance)
        clustered_trajs = self.cluster_trajectories(X)

        for i in range(len(clustered_trajs)):
            #for each clustered trajectories train the sub eioc model
            #reform the trajectories if necessary
            if not self.ti:
                #time-varying system, just flatten them
                flattened_trajs = [np.array(traj).T.flatten() for traj in clustered_trajs[i]]
                self.eioc_mdls[i].clustering=1
                self.eioc_mdls[i].fit(flattened_trajs)
                #note the fit model retains mean and covariance of the flattened trajectories
            else:
                #time invariant
                aug_states = []
                for traj in clustered_trajs[i]:
                    for t_idx in range(len(traj)-1):
                        aug_states.append(np.array(traj)[t_idx:t_idx+2, :].flatten())

                self.eioc_mdls[i].fit(np.array(aug_states))

        self.clustered_trajs = clustered_trajs
        return

    def score(self, X, gamma=1.0, average=False):
        #score a query state
        if self.clustered_trajs is not None:
            #the model ensemble has been trained
            # score_ensemble = [np.array(model.score(X)[0]) for model in self.eioc_mdls]
            score_ensemble = [np.array(model.value_eval_samples(X,average=average)) for model in self.eioc_mdls]
            #average (maximum likelihood) or logsumexp (softmaximum -> maximum posterior)
            if gamma is None:
                res = np.mean(score_ensemble, axis=0)
            else:
                # mdl_eval = lambda scores: [logsumexp(x_score) for x_score in scores]
                res = np.array([-logsumexp(-gamma*np.array([score[sample_idx] for score in score_ensemble])) for sample_idx, sample in enumerate(X)])

        return res
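
# A minimal usage sketch for EnsembleIOCTraj (an addition, not part of the original
# source). The trajectories are synthetic random walks, and the dependencies of the class
# (EnsembleIOC, _affinity_matrix_from_indices, SpectralClustering, ...) are assumed to be
# importable from the same module.
import numpy as np

rng = np.random.RandomState(0)
trajs = [np.cumsum(rng.randn(50, 2), axis=0) for _ in range(30)]   # 30 random-walk trajectories

traj_mdl = EnsembleIOCTraj(traj_clusters=3, ti=True,
                           state_n_estimators=10, state_n_clusters=0, em_itrs=0)
traj_mdl.fit(trajs)                   # cluster trajectories, then fit one EnsembleIOC per cluster
scores = traj_mdl.score(trajs[0])     # soft-minimum cost of the states of one trajectory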
Example #20
0
                 n_estimators=10,  number of sub-models
                 max_depth=5, maximum depth of each decision tree
                 min_samples_split=2,
                 min_samples_leaf=1,
                 min_weight_fraction_leaf=0.,
                 max_leaf_nodes=None,
                 min_impurity_decrease=0.,
                 min_impurity_split=None,
                 sparse_output=True, whether to output a sparse matrix
                 n_jobs=1,
                 random_state=None,
                 verbose=0,
                 warm_start=False):
    """
    algo = RandomTreesEmbedding(n_estimators=100,
                                max_depth=2,
                                sparse_output=True)
    # Train the model
    X_train2 = algo.fit_transform(X_train)
    print(X_train2)

    # Inspect the API attributes
    x_test2 = [[6.9, 3.1, 5.1, 2.3], [6.1, 2.8, 4.0, 1.3],
               [5.2, 3.4, 1.4, 0.2], [4.7, 3.2, 1.6, 0.2]]
    print("Transformed values of the samples:")
    print(algo.transform(x_test2))
    # # Model evaluation
    # print('Accuracy on the training set: {}'.format(algo.score(X_train, Y_train)))
    # print('Accuracy on the test set: {}'.format(algo.score(X_test, Y_test)))

    print("All trained sub-models:\n{}".format(algo.estimators_))
Example #21
0
 def __init__(self, **hyperparams):
     self._hyperparams = hyperparams
     self._wrapped_model = Op(**self._hyperparams)
Example #22
0
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_curve

n_estimator = 10
X, y = make_classification(n_samples=80000)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
# It is important to train the ensemble of trees on a different subset
# of the training data than the linear regression model to avoid
# overfitting, in particular if the total number of leaves is
# similar to the number of training samples
X_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train,
                                                            y_train,
                                                            test_size=0.5)

# Unsupervised transformation based on totally random trees
rt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator)
rt_lm = LogisticRegression()
rt.fit(X_train, y_train)
rt_lm.fit(SelectFromModel(rt, prefit=True).transform(X_train_lr), y_train_lr)

y_pred_rt = rt_lm.predict_proba(
    SelectFromModel(rt, prefit=True).transform(X_test))[:, 1]
fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)

# Supervised transformation based on random forests
rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
rf_enc = OneHotEncoder()
rf_lm = LogisticRegression()
rf.fit(X_train, y_train)
rf_enc.fit(rf.apply(X_train))
rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)
Example #23
0
def dummy_multiMethods(df3):
    data = df3[['in hour', 'dayofweek', 'Mall ID_x']]
    data_dummy_feature = pd.get_dummies(data)
    data_dummy_label = df3['delay label']

    # method_1: regression
    from sklearn import datasets, linear_model
    from sklearn.model_selection import train_test_split

    X_train, X_test, y_train, y_test = train_test_split(data_dummy_feature,
                                                        data_dummy_label,
                                                        test_size=0.20,
                                                        random_state=42)
    MODEL = linear_model.LogisticRegression().fit(X_train, y_train)
    ##predict new samples
    result = MODEL.predict(X_test)
    # score
    score = MODEL.score(X_test, y_test)
    print('accuracy=' + str(score))  # 0.7674
    plot_roc(y_test, result, 'lr')

    # method_2:  random
    from sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier,
                                  GradientBoostingClassifier)
    from sklearn.pipeline import make_pipeline

    n_estimator = 10
    rt = RandomTreesEmbedding(max_depth=3,
                              n_estimators=n_estimator,
                              random_state=0)
    rt_lm = linear_model.LogisticRegression()
    pipeline = make_pipeline(rt, rt_lm)
    pipeline.fit(X_train, y_train)
    y_pred_randomTree = pipeline.predict_proba(X_test)[:, 1]
    plot_roc(y_test, y_pred_randomTree, 'randomTrees')
    score_RandomTrees = pipeline.score(X_test, y_test)
    print('randomTree accuracy=' + str(score_RandomTrees))

    # method_3: Supervised transformation based on random forests
    rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
    from sklearn.preprocessing import OneHotEncoder
    rf_enc = OneHotEncoder()
    rf_lm = linear_model.LogisticRegression()
    rf.fit(X_train, y_train)
    rf_enc.fit(rf.apply(X_train))
    rf_lm.fit(rf_enc.transform(rf.apply(X_train)), y_train)
    y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:,
                                                                           1]
    plot_roc(y_test, y_pred_rf_lm, 'randomForest')

    ## method_4: gradient boosting
    grd = GradientBoostingClassifier(n_estimators=n_estimator)
    grd_enc = OneHotEncoder()
    grd_lm = linear_model.LogisticRegression()
    grd.fit(X_train, y_train)
    grd_enc.fit(grd.apply(X_train)[:, :, 0])
    grd_lm.fit(grd_enc.transform(grd.apply(X_train)[:, :, 0]), y_train)
    y_pred_grd_lm = grd_lm.predict_proba(
        grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1]
    #y_pred_grd_lm = grd_lm.predict(grd_enc.transform(grd.apply(X_test)[:, :, 0]))
    #a=y_pred_grd_lm==y_test
    #accuraccy=a.sum()*1.0/len(a)
    plot_roc(y_test, y_pred_grd_lm, 'Logistic regression')

    # method_5: The gradient boosted model by itself
    y_pred_grd = grd.predict_proba(X_test)[:, 1]
    plot_roc(y_test, y_pred_grd, 'Pure_GradientBoosting')

    # method_6: The random forest model by itself
    y_pred_rf = rf.predict_proba(X_test)[:, 1]
    plot_roc(y_test, y_pred_rf, 'Pure_randomForest')
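dummy_multiMethods relies on a plot_roc helper that is not shown in the snippet. A minimal sketch of what such a helper could look like; the name and signature come from the calls above, the body is an assumption:

import matplotlib.pyplot as plt
from sklearn.metrics import auc, roc_curve

def plot_roc(y_true, y_score, label):
    # y_score may be a predicted probability or a hard 0/1 prediction, as in the calls above
    fpr, tpr, _ = roc_curve(y_true, y_score)
    plt.plot(fpr, tpr, label='{} (AUC = {:.3f})'.format(label, auc(fpr, tpr)))
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.legend(loc='best')
    plt.show()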
Example #24
0
class UnsupervisedVisualBagClassifier(Classifier):
    """
    ===============================
    UnsupervisedVisualBagClassifier
    ===============================
    1. Unsupervised
    2. Binary bag of words
    3. Totally random trees
    """
    def __init__(self,
                 coordinator,
                 base_classifier,
                 n_estimators=10,
                 max_depth=5,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 n_jobs=-1,
                 random_state=None,
                 verbose=0,
                 min_density=None):
        Classifier.__init__(self, coordinator, base_classifier)
        self.histoSize = 0
        self._visualBagger = RandomTreesEmbedding(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose,
            min_density=min_density)

    def _preprocess(self, image_buffer, learningPhase):
        if learningPhase:
            self.setTask(1, "Extracting the features (model creation)")
        else:
            self.setTask(1, "Extracting the features (prediction)")

        X_pred, y = self._coord.process(image_buffer,
                                        learningPhase=learningPhase)

        y_user = self._convertLabel(y)

        #Cleaning up
        self._coord.clean(y)
        del y

        self.endTask()

        #Bag-of-word transformation
        self.setTask(1, "Transforming data into bag-of-words (Tree part)")

        X2 = None
        if learningPhase:
            X2 = self._visualBagger.fit_transform(X_pred, y_user)
            self.histoSize = X2.shape[1]
        else:
            X2 = self._visualBagger.transform(X_pred)

        #Cleaning up
        self._coord.clean(X_pred)
        del X_pred
        del y_user

        self.endTask()

        nbFactor = X2.shape[0] // len(image_buffer)

        if not sps.isspmatrix_csr(X2):
            X2 = X2.tocsr()

        if nbFactor == 1:
            return X2

        self.setTask(len(image_buffer),
                     "Transforming data into bag-of-words (Histogram part)")
        nbTrees = self._visualBagger.n_estimators
        X3 = computeHistogram(len(image_buffer), nbFactor, nbTrees, X2)
        self.endTask()

        #Cleaning up
        del X2  # Should be useless

        return X3

    def fit_histogram(self, hist, y):
        #Delegating the classification
        self.setTask(1, "Learning the model")

        self._classifier.fit(hist, y)

        self.endTask()

        return self

    def fit(self, image_buffer):
        """
        Fits the data contained in the :class:`ImageBuffer` instance

        Parameters
        -----------
        image_buffer : :class:`ImageBuffer`
            The data to learn from

        Return
        -------
        self : :class:`Classifier`
            This instance
        """
        #Updating the labels
        y_user = image_buffer.getLabels()
        self._buildLUT(y_user)
        y = self._convertLabel(y_user)

        X = self._preprocess(image_buffer, learningPhase=True)

        return self.fit_histogram(X, y)

    def predict(self, image_buffer):
        """
        Classify the data contained in the :class:`ImageBuffer` instance

        Parameters
        -----------
        image_buffer : :class:`ImageBuffer`
            The data to classify

        Return
        -------
        list : list of int
            each entry is the classification label corresponding to the input
        """

        X = self._preprocess(image_buffer, learningPhase=False)
        y_classif = self._classifier.predict(X)
        return self._convertLabelsBackToUser(y_classif)

    def predict_proba(self, image_buffer):
        """
        Classify softly the data contained in the :class:`ImageBuffer`
        instance, i.e. yields a probability vector of belonging to each
        class

        Parameters
        -----------
        image_buffer : :class:`ImageBuffer`
            The data to classify

        Return
        -------
        list : list of list of float
            each entry is the probability vector of the input of the same
            index as computed by the base classifier
        """
        if not hasattr(self._classifier, "predict_proba"):
            #Early error
            self._classifier.predict_proba(np.zeros((1, 1)))

        X = self._preprocess(image_buffer, learningPhase=False)
        return self._classifier.predict_proba(X)
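_preprocess above delegates the bag-of-words pooling to a computeHistogram helper that is not part of the snippet. A rough sketch of such a pooling step, under the assumption that nbFactor consecutive rows of the CSR matrix belong to the same image (nbTrees is kept only to match the call signature):

import numpy as np
import scipy.sparse as sps

def computeHistogram(n_images, nbFactor, nbTrees, X2):
    # X2: CSR matrix of shape (n_images * nbFactor, n_leaf_columns); summing the
    # rows of each image gives a histogram of leaf (visual word) activations
    rows = [X2[i * nbFactor:(i + 1) * nbFactor].sum(axis=0) for i in range(n_images)]
    return sps.csr_matrix(np.vstack(rows))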
Example #25
0
 LocallyLinearEmbedding(n_neighbors=n_neighbors,
                        n_components=2,
                        method="modified"),
 "Hessian LLE embedding":
 LocallyLinearEmbedding(n_neighbors=n_neighbors,
                        n_components=2,
                        method="hessian"),
 "LTSA LLE embedding":
 LocallyLinearEmbedding(n_neighbors=n_neighbors,
                        n_components=2,
                        method="ltsa"),
 "MDS embedding":
 MDS(n_components=2, n_init=1, max_iter=120, n_jobs=2),
 "Random Trees embedding":
 make_pipeline(
     RandomTreesEmbedding(n_estimators=200, max_depth=5, random_state=0),
     TruncatedSVD(n_components=2),
 ),
 "Spectral embedding":
 SpectralEmbedding(n_components=2, random_state=0, eigen_solver="arpack"),
 "t-SNE embeedding":
 TSNE(
     n_components=2,
     init="pca",
     learning_rate="auto",
     n_iter=500,
     n_iter_without_progress=150,
     n_jobs=2,
     random_state=0,
 ),
 "NCA embedding":
Example #26
0
    results = np.zeros((2, 10, 32 * 32, 3))
    for mi, model in enumerate(("supervised", "unsupervised")):
        print("Start Autoencoder using {} model".format(model))
        eforest_channels = []
        if model == "supervised":
            for c in range(3):
                eforest = RandomForestClassifier(n_estimators=n_trees,
                                                 max_depth=None,
                                                 n_jobs=-1,
                                                 random_state=0)
                eforest.fit(x_train[:, :, c], y_train)
                eforest_channels.append(eforest)
        else:
            for c in range(3):
                eforest = RandomTreesEmbedding(n_estimators=n_trees,
                                               max_depth=None,
                                               n_jobs=-1,
                                               random_state=0)
                eforest.fit(x_train[:, :, c])
                eforest_channels.append(eforest)

        for c in range(3):
            # encode/decode each channel with the forest trained for that channel
            x_encode = eforest_channels[c].encode(test_images[:, :, c])
            x_decode = eforest_channels[c].decode(x_encode)
            results[mi, :, :, c] = x_decode
    rheads = ["origin", "supervised", "unsupervised"]
    test_images = test_images.reshape(1, 10, 32, 32, 3).astype(np.uint8)
    results = results.reshape(2, 10, 32, 32, 3).astype(np.uint8)
    fig = plot_cifar10(rheads, np.vstack((test_images, results)))
    plt.show()

    import IPython
Example #27
0
class Clustering():
    def __init__(self, compounds, output=False, seed=False):
        np.random.seed(seed=seed)
        self.seed = seed
        self.compounds = compounds
        self.count = 0
        self.count_1 = 0
        self.output = output
        self.tools = clustertools()
        if self.output is not False:
            self.figures = clusterfigures(self.compounds)
        self.testcompound = []

    def cluster_training(self, train, distance=False):
        '''
        This is the basic clustering function
        '''
        self.train_matrix = train.train
        '''
        Step one is to make sure that there is a distance matrix in place.
        It is best to feed an existing distance matrix if one is available.
        '''
        if distance is False:
            self.p_feat_matrix = self.tools.pairwise_distance_matrix(train.train, 'jaccard')
        else:
            self.p_feat_matrix = distance
        '''
        Step two is to cluster your data using a random trees embedding. This is a
        random ensemble of trees that transforms the data into a
        high-dimensional, sparse space
        '''
        self.clf = RandomTreesEmbedding(n_estimators=512, random_state=self.seed, max_depth=5)
        #self.clf.fit(self.train_matrix)
        X_transformed = self.clf.fit_transform(self.train_matrix)
        '''
        Step three performs truncated SVD (similar to PCA). It operates on the sample
        vectors directly, rather than the covariance matrix. It takes the first two
        components. Essentially this reduces the sparse embedding to a low dimensional
        representation.
        '''
        self.svd = TruncatedSVD(n_components=2)
        self.svd.clf = self.svd.fit(X_transformed)
        self.model = self.svd.clf.transform(X_transformed)
        '''
        The next step is to take the transformed model and the original dataset and
        determine the max silhouette_score of clusters
        '''
        (self.cluster_assignment,
         self.cluster_num,
         self.cluster_score) = self.tools.identify_accurate_number_of_clusters(self.model, self.compounds)
        self.individualclusters = []
        '''
        The individual datapoints are assessed with regard to the best clustering scheme
        '''
        for i in range(self.cluster_num):
            self.individualclusters.append([])
            for j in range(len(self.cluster_assignment)):
                if self.cluster_assignment[j] == i:
                    self.individualclusters[i].append(self.model[j, :])
            self.individualclusters[i] = np.array(self.individualclusters[i])
        '''
        Finally, this clustering scheme is used to generate a one class Support
        Vector Machine decision boundary.
        '''
        (self.clf_OCSVM,
         self.OCSVM_model) = self.tools.determine_test_similarity(self.individualclusters)

    def cluster_testing(self, testing):
        '''Create RandomTreesEmbedding of data'''
        clf = RandomTreesEmbedding(n_estimators=512, random_state=self.seed, max_depth=5)
        '''Fit testing data to training model'''
        clf.fit = self.clf.fit(testing)
        X_transformed = self.clf.fit_transform(testing)
        n_components = 2
        '''SVD transform data'''
        svd = TruncatedSVD(n_components=n_components)
        svd.clf = svd.fit(X_transformed)
        svd.model = svd.clf.transform(X_transformed)
        '''Train transformed data using original model'''
        train_transformed = clf.fit.transform(self.train_matrix)
        train_model = svd.clf.transform(train_transformed)
        '''Generate One Class SVM rejection criteria'''
        (clf_OCSVM_t, OCSVMmodel_t) = self.tools.determine_testing_data_similarity(train_model)
        predicted = []
        '''Remove testing compounds outside rejection margin'''
        for i in range(len(svd.model)):
            p = OCSVMmodel_t.predict(svd.model[i, :].reshape(1, -1))
            pred = OCSVMmodel_t.decision_function(svd.model[i, :].reshape(1, -1)).ravel()
            if (p == 1):
                predicted.append(i)
        return predicted
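Condensed, the docstrings above describe a fixed pipeline: embed with totally random trees, reduce with truncated SVD, pick a cluster count, then fit a one-class SVM per cluster. A self-contained sketch of that flow on synthetic data; the silhouette-based selection and the per-cluster one-class SVMs stand in for the clustertools helpers, which are not shown here:

import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomTreesEmbedding
from sklearn.metrics import silhouette_score
from sklearn.svm import OneClassSVM

X, _ = make_blobs(n_samples=300, centers=4, random_state=0)

# step two: high-dimensional sparse embedding with totally random trees
X_sparse = RandomTreesEmbedding(n_estimators=512, max_depth=5,
                                random_state=0).fit_transform(X)

# step three: truncated SVD down to two components
model = TruncatedSVD(n_components=2).fit_transform(X_sparse)

# choose the number of clusters by silhouette score (the helper's job above)
scores = {k: silhouette_score(model, KMeans(n_clusters=k, n_init=10,
                                            random_state=0).fit_predict(model))
          for k in range(2, 7)}
best_k = max(scores, key=scores.get)
labels = KMeans(n_clusters=best_k, n_init=10, random_state=0).fit_predict(model)

# one one-class SVM per cluster as a rejection boundary for new compounds
ocsvms = [OneClassSVM(nu=0.1, gamma='scale').fit(model[labels == k])
          for k in range(best_k)]
print(best_k, [int(np.sum(labels == k)) for k in range(best_k)])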
Example #29
0
    def fit(self, X, y=None):
        '''
        X - an array of concatenated features X_i = (x_{t-1}, x_{t}) corresponding to the infinite horizon case
        '''
        #check parameters...
        assert(type(self.n_estimators)==int)
        assert(self.n_estimators > 0)
        assert(type(self.max_depth)==int)
        assert(self.max_depth > 0)
        assert(type(self.min_samples_split)==int)
        assert(self.min_samples_split > 0)
        assert(type(self.min_samples_leaf)==int)
        assert(self.min_samples_leaf > 0)
        assert(type(self.em_itrs)==int)

        n_samples, n_dims = X.shape

        #an initial partitioning of data with random forest embedding
        self.random_embedding_mdl_ = RandomTreesEmbedding(
            n_estimators=self.n_estimators,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            random_state=self.random_state
            )

        #we probably do not need the data type to differentiate it is a demonstration
        #of trajectory or commanded state, do we?
        if self.passive_dyn_func is not None and self.passive_dyn_ctrl is not None and self.passive_dyn_noise is not None:
            # self.random_embedding_mdl_.fit(X[:, X.shape[1]/2:])
            # indices = self.random_embedding_mdl_.apply(X[:, X.shape[1]/2:])
            self.random_embedding_mdl_.fit(X[:, :X.shape[1]//2])
            indices = self.random_embedding_mdl_.apply(X[:, :X.shape[1]//2])
            # X_tmp = np.array(X)
            # X_tmp[:, X.shape[1]/2:] = X_tmp[:, X.shape[1]/2:] - X_tmp[:, :X.shape[1]/2]
            # self.random_embedding_mdl_.fit(X_tmp)

            # indices = self.random_embedding_mdl_.apply(X_tmp)
        else:
            self.random_embedding_mdl_.fit(X)
            #figure out indices
            indices = self.random_embedding_mdl_.apply(X)

        #prepare ensemble for prediction
        self.random_prediction_mdl_ = RandomForestRegressor(
            n_estimators=self.n_estimators,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            random_state=self.random_state
            )

        self.random_prediction_mdl_.fit(X[:, :X.shape[1]//2], X[:, X.shape[1]//2:])

        if self.clustering > 0:
            #we need to force the data to situate in clusters with the given number and the random embeddings
            #first construct affinity
            #use extracted indices as sparse features to construct an affinity matrix
            if self.n_estimators > 1:
                if self.verbose:
                    print('Building {0} subsets of data depending on their random embedding similarity...'.format(self.clustering))
                #it makes sense to use the random embedding to do the clustering if we have ensembled features
                aff_mat = _affinity_matrix_from_indices(indices, 'binary')
                #using spectral mapping (Laplacian eigenmap)
                self.cluster = SpectralClustering(n_clusters=self.clustering, affinity='precomputed')
                self.cluster.fit(aff_mat)
            else:
                if self.verbose:
                    print('Building {0} subsets of data depending on their Euclidean similarity...'.format(self.clustering))
                #otherwise, use euclidean distance, this should be enough when the state space is low dimensional
                self.cluster = KMeans(n_clusters=self.clustering, max_iter=200, n_init=5)
                self.cluster.fit(X)

            partitioned_data = defaultdict(list)
            leaf_idx = defaultdict(set)
            weight_idx = defaultdict(float)
            for d_idx, d, p_idx in zip(range(len(X)), X, self.cluster.labels_):
                partitioned_data[0, p_idx].append(d)
                leaf_idx[0] |= {p_idx}
            for p_idx in range(self.clustering):
                weight_idx[0, p_idx] = 1./self.clustering
            num_estimators = 1
        else:
            partitioned_data = defaultdict(list)
            leaf_idx = defaultdict(set)
            weight_idx = defaultdict(float)
            #group data belongs to the same partition and have the weights...
            #is weight really necessary for EM steps? Hmm, seems to be for the initialization
            #d_idx: data index; p_idx: partition index (comprised of estimator index and leaf index)
            for d_idx, d, p_idx in zip(range(len(X)), X, indices):
                for e_idx, l_idx in enumerate(p_idx):
                    partitioned_data[e_idx, l_idx].append(d)
                    leaf_idx[e_idx] |= {l_idx}

                for e_idx, l_idx in enumerate(p_idx):
                    weight_idx[e_idx, l_idx] = float(len(partitioned_data[e_idx, l_idx])) / len(X)
                    # weight_idx[e_idx, l_idx] = 1. / len(p_idx)
            num_estimators = self.n_estimators

        #for each grouped data, solve an easy IOC problem by assuming quadratic cost-to-go function
        #note that, if the passive dynamics need to be learned, extra steps are needed to train a regressor with weighted data
        #otherwise, just a simply gaussian for each conditional probability distribution model
        self.estimators_ = []
        #another copy to store the parameters all together, for EM/evaluation on all of the models
        self.estimators_full_ = defaultdict(list)

        #<hyin/Feb-6th-2016> an estimator and leaf indexed structure to record the passive likelihood of data...
        passive_likelihood_dict = defaultdict(list)
        for e_idx in range(num_estimators):
            #for each estimator
            estimator_parms = defaultdict(list)
            for l_idx in leaf_idx[e_idx]:
                if self.verbose:
                    print('Processing {0}-th estimator and {1}-th leaf/partition...'.format(e_idx, l_idx))
                #and for each data partition
                data_partition=np.array(partitioned_data[e_idx, l_idx])

                estimator_parms['means'].append(np.mean(data_partition, axis=0))
                estimator_parms['covars'].append(np.cov(data_partition.T) + np.eye(data_partition.shape[1])*self.reg)

                #for MaxEnt, uniform passive likelihood
                passive_likelihood_dict[e_idx, l_idx] = np.ones(len(data_partition)) / float(len(data_partition))


                estimator_parms['weights'].append(weight_idx[e_idx, l_idx])

            self.estimators_.append(estimator_parms)

        #can stop here or go for expectation maximization for each estimator...
        if self.em_itrs > 0:
            #prepare em results for each estimator
            em_res = [self._em_steps(e_idx, X, y) for e_idx in range(num_estimators)]

            self.estimators_ = em_res

        #record the gmm approximation
        self.gmm_estimators_ = copy.deepcopy(self.estimators_)
        self.gmm_estimators_full_ = defaultdict(list)

        for est in self.estimators_:
            for comp_idx in range(len(est['weights'])):
                est['means'][comp_idx] = est['means'][comp_idx][(n_dims//2):]
                est['covars'][comp_idx] = est['covars'][comp_idx][(n_dims//2):, (n_dims//2):]
                self.estimators_full_['weights'].append(est['weights'][comp_idx]/float(num_estimators))
                #for full estimators
                self.estimators_full_['means'].append(est['means'][comp_idx])
                self.estimators_full_['covars'].append(est['covars'][comp_idx])

        if self.passive_dyn_func is not None and self.passive_dyn_ctrl is not None and self.passive_dyn_noise is not None:
            X_new         = X[:, X.shape[1]//2:]
            X_old         = X[:, 0:X.shape[1]//2]

            #merge the model knowledge if passive dynamics model is available, use MaxEnt assumption otherwise
            X_new_passive = np.array([self.passive_dyn_func(X_old[sample_idx]) for sample_idx in range(X.shape[0])])
            passive_likelihood = _passive_dyn_likelihood(X_new, X_new_passive, self.passive_dyn_noise, self.passive_dyn_ctrl, self.reg)
            weights = passive_likelihood / (np.sum(passive_likelihood) + self.reg)

            if np.sum(weights) < 1e-10:
                weights = 1./len(weights) * np.ones(len(weights))
            #a GMM as a MaxEnt surrogate
            tmp_gmm = gmm.GMM(  n_components=len(self.estimators_[0]['weights']),
                                priors=self.estimators_[0]['weights'],
                                means=self.estimators_[0]['means'],
                                covariances=self.estimators_[0]['covars'])
            for e_idx in range(num_estimators):
                tmp_gmm.n_components = len(self.estimators_[e_idx]['weights'])
                tmp_gmm.priors = self.estimators_[e_idx]['weights']
                tmp_gmm.means = self.estimators_[e_idx]['means']
                tmp_gmm.covariances = self.estimators_[e_idx]['covars']

                responsibilities = tmp_gmm.to_responsibilities(X_new)
                responsibilities = responsibilities / (np.sum(responsibilities, axis=0) + 1e-10)
                new_weights = (weights * responsibilities.T).T

                new_weights = (new_weights + 1e-10) / (np.sum(new_weights +1e-10, axis=0))

                weighted_means = [np.sum((new_weight*X_new.T).T, axis=0) for new_weight in new_weights.T]

                weighted_covars =[ _frequency_weighted_covariance(X_new, weighted_mean, new_weight, spherical=False)
                                        for new_weight, weighted_mean in zip(new_weights.T, weighted_means)]

                self.estimators_[e_idx]['means'] = weighted_means
                self.estimators_[e_idx]['covars'] = weighted_covars


        self.prepare_inv_and_constants()
        return indices, leaf_idx, partitioned_data, passive_likelihood_dict
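As the docstring notes, each row of X is a concatenated pair (x_{t-1}, x_t). A hedged toy call, assuming the EnsembleIOC class above is importable as shown and using a made-up random-walk trajectory rather than data from the original project:

import numpy as np
# from ensemble_ioc import EnsembleIOC   # assumed import path for the class above

rng = np.random.RandomState(0)
traj = np.cumsum(rng.randn(201, 2) * 0.1, axis=0)    # one noisy 2-D trajectory
X = np.hstack([traj[:-1], traj[1:]])                  # rows are (x_{t-1}, x_t) pairs

mdl = EnsembleIOC(n_estimators=5, max_depth=3, min_samples_split=4,
                  min_samples_leaf=2, em_itrs=0)      # MaxEnt model: no passive dynamics
mdl.fit(X)
# per-state costs could then be queried the same way score() does above, e.g.
# costs = mdl.value_eval_samples(traj[1:], average=True)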
Example #30
0
def test_random_hasher_sparse_data():
    X, y = datasets.make_multilabel_classification(random_state=0)
    hasher = RandomTreesEmbedding(n_estimators=30, random_state=1)
    X_transformed = hasher.fit_transform(X)
    X_transformed_sparse = hasher.fit_transform(csc_matrix(X))
    assert_array_equal(X_transformed_sparse.toarray(), X_transformed.toarray())
Example #31
0
def wrapper_feature_transformer_ensembles_trees_clf_v2(
    X,
    y,
    X_test,
    y_test,
    n_estimator=10,
    clf_obj=LogisticRegression(max_iter=1000)):

    numeric_transformer = Pipeline(
        steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler())])

    categorical_transformer = Pipeline(
        steps=[('imputer',
                SimpleImputer(strategy='constant', fill_value='missing')
                ), ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(
        transformers=[('num', numeric_transformer,
                       selector(dtype_exclude="category")),
                      ('cat', categorical_transformer,
                       selector(dtype_include="category"))])

    scaler = StandardScaler()
    scaler.fit(X)
    x_train_scaled = scaler.transform(X)
    x_test_scaled = scaler.transform(X_test)

    clf_name = str(clf_obj).split('(')[0]

    # It is important to train the ensemble of trees on a different subset
    # of the training data than the linear regression model to avoid
    # overfitting, in particular if the total number of leaves is
    # similar to the number of training samples
    X_train, X_train_lr, y_train, y_train_lr = train_test_split(x_train_scaled,
                                                                y,
                                                                test_size=0.5,
                                                                random_state=0)

    # Unsupervised transformation based on totally random trees
    rt = RandomTreesEmbedding(max_depth=3,
                              n_estimators=n_estimator,
                              random_state=0)
    rt_clf = sklearn.base.clone(clf_obj)
    pipeline = make_pipeline(rt, rt_clf)
    pipeline.fit(X_train, y_train)
    y_pred_rt = pipeline.predict(X_test)
    fpr_rt_clf, tpr_rt_clf, _ = roc_curve(y_test, y_pred_rt)

    # Supervised transformation based on random forests
    rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
    rf_enc = OneHotEncoder()
    rf_clf = sklearn.base.clone(clf_obj)
    rf.fit(X_train, y_train)
    rf_enc.fit(rf.apply(X_train))
    rf_clf.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)

    y_pred_rf_clf = rf_clf.predict(rf_enc.transform(rf.apply(x_test_scaled)))
    fpr_rf_clf, tpr_rf_clf, _ = roc_curve(y_test, y_pred_rf_clf)

    # Supervised transformation based on gradient boosted trees
    grd = GradientBoostingClassifier(n_estimators=n_estimator)
    grd_enc = OneHotEncoder()
    grd_clf = sklearn.base.clone(clf_obj)
    grd.fit(X_train, y_train)
    grd_enc.fit(grd.apply(X_train)[:, :, 0])
    grd_clf.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)

    y_pred_grd_clf = grd_clf.predict(
        # grd_enc.transform(grd.apply(x_test_scaled)[:, :, 0]))[:, 1]
        grd_enc.transform(grd.apply(x_test_scaled)[:, :, 0]))
    fpr_grd_clf, tpr_grd_clf, _ = roc_curve(y_test, y_pred_grd_clf)

    # The gradient boosted model by itself
    y_pred_grd = grd.predict(x_test_scaled)
    fpr_grd, tpr_grd, _ = roc_curve(y_test, y_pred_grd)

    # The random forest model by itself
    y_pred_rf = rf.predict(X_test)
    fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf)

    plt.figure(1)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr_rt_clf, tpr_rt_clf, label='RT + LR')
    plt.plot(fpr_rf, tpr_rf, label='RF')
    plt.plot(fpr_rf_clf, tpr_rf_clf, label='RF + LR')
    plt.plot(fpr_grd, tpr_grd, label='GBT')
    plt.plot(fpr_grd_clf, tpr_grd_clf, label='GBT + LR')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title(f'ROC curve - {clf_name}')
    plt.legend(loc='best')
    plt.show()

    plt.figure(2)
    plt.xlim(0, 0.2)
    plt.ylim(0.8, 1)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr_rt_clf, tpr_rt_clf, label='RT + LR')
    plt.plot(fpr_rf, tpr_rf, label='RF')
    plt.plot(fpr_rf_clf, tpr_rf_clf, label='RF + LR')
    plt.plot(fpr_grd, tpr_grd, label='GBT')
    plt.plot(fpr_grd_clf, tpr_grd_clf, label='GBT + LR')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title(f'ROC curve (zoomed in at top left) - {clf_name}')
    plt.legend(loc='best')
    plt.show()

    pass
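A hedged usage sketch for the wrapper above on a synthetic binary problem; it assumes the wrapper's module-level names (selector, the sklearn imports, matplotlib) are already in scope, since the function itself only scales the inputs and never applies the ColumnTransformer it builds:

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X_all, y_all = make_classification(n_samples=5000, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X_all, y_all, test_size=0.3, random_state=0)

# draws two ROC figures comparing the RT/RF/GBT feature transformations
wrapper_feature_transformer_ensembles_trees_clf_v2(X_tr, y_tr, X_te, y_te,
                                                   n_estimator=10)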
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomTreesEmbedding, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

n_estimator = 10
X, y = make_classification(n_samples=80000)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
# It is important to train the ensemble of trees on a different subset
# of the training data than the linear regression model to avoid
# overfitting, in particular if the total number of leaves is
# similar to the number of training samples
X_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train,
                                                            y_train,
                                                            test_size=0.5)

# Unsupervised transformation based on totally random trees
rt = RandomTreesEmbedding(max_depth=3,
                          n_estimators=n_estimator,
                          random_state=0)

rt_lm = LogisticRegression()
pipeline = make_pipeline(rt, rt_lm)
pipeline.fit(X_train, y_train)
y_pred_rt = pipeline.predict_proba(X_test)[:, 1]
fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)

# Supervised transformation based on random forests
rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
rf_enc = OneHotEncoder()
rf_lm = LogisticRegression()
rf.fit(X_train, y_train)
rf_enc.fit(rf.apply(X_train))
rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)
Example #33
0
# pca = PCA(n_components=2)
# X = pca.fit_transform(X)
# print(X.shape)

fig, (ax2, ax3) = plt.subplots(1, 2)

for n_components in range_n_components:
    values = []

    for n_clusters in range_n_clusters:
        # reducer = PCA(n_components=n_components)
        # reducer = SparseRandomProjection(n_components=n_components)
        # reducer = FastICA(n_components=n_components)
        # x = reducer.fit_transform(X)
        reducer = RandomTreesEmbedding(n_estimators=n_components, max_depth=3)
        x = reducer.fit_transform(X).toarray()
        print(x.shape)

        clusterer = GaussianMixture(n_components=n_clusters)
        cluster_labels = clusterer.fit_predict(x)

        silhouette_avg = 0  #silhouette_score(X, cluster_labels)

        log_prob, aic, bic = clusterer.score(x), clusterer.aic(
            x), clusterer.bic(x)
        print("The average log_prob is:", log_prob)
        print("The aic is:", aic)
        print("The bic is:", bic)

        values.append([silhouette_avg, log_prob, aic, bic])
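The snippet creates ax2 and ax3 but the plotting half is missing; a plausible continuation plots the collected criteria against the number of clusters. A sketch, assuming numpy and matplotlib are imported as np and plt:

    # still inside the n_components loop, after the inner loop has filled `values`
    values = np.array(values)
    ax2.plot(range_n_clusters, values[:, 2], marker='o',
             label='n_estimators={}'.format(n_components))
    ax3.plot(range_n_clusters, values[:, 3], marker='o',
             label='n_estimators={}'.format(n_components))

# once both loops have finished
ax2.set_title('AIC')
ax3.set_title('BIC')
ax2.set_xlabel('n_clusters')
ax3.set_xlabel('n_clusters')
ax2.legend()
ax3.legend()
plt.show()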
Example #34
0
                                       random_state=10)
random_forest.fit(X_train_ensemble, y_train_ensemble)

gradient_boosting = GradientBoostingClassifier(n_estimators=n_estimators,
                                               max_depth=max_depth,
                                               random_state=10)
_ = gradient_boosting.fit(X_train_ensemble, y_train_ensemble)

# %%
# The :class:`~sklearn.ensemble.RandomTreesEmbedding` is an unsupervised method
# and thus does not need to be trained independently.

from sklearn.ensemble import RandomTreesEmbedding

random_tree_embedding = RandomTreesEmbedding(n_estimators=n_estimators,
                                             max_depth=max_depth,
                                             random_state=0)

# %%
# Now, we will create three pipelines that will use the above embedding as
# a preprocessing stage.
#
# The random trees embedding can be directly pipelined with the logistic
# regression because it is a standard scikit-learn transformer.

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

rt_model = make_pipeline(random_tree_embedding,
                         LogisticRegression(max_iter=1000))
rt_model.fit(X_train_linear, y_train_linear)
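A quick check of the fitted pipeline; X_test and y_test are assumed to come from the same split that produced X_train_linear and y_train_linear:

from sklearn.metrics import roc_auc_score

# predict_proba is exposed through the final LogisticRegression step of the pipeline
y_proba = rt_model.predict_proba(X_test)[:, 1]
print('RT embedding + LR test ROC AUC: {:.3f}'.format(roc_auc_score(y_test, y_proba)))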
def main():

    # Checks for correct number of arguments
    if len(sys.argv) != 3:
        print(
            'usage: ./troll_identifier.py [TRAIN DATASET] [TEST/DEV DATASET]')
        sys.exit()

    # set up dataset
    data_train = pd.read_csv(sys.argv[1])
    data_test = pd.read_csv(sys.argv[2])

    print('train: {}'.format(sys.argv[1]))
    print('test: {}'.format(sys.argv[2]))

    x_train = data_train.drop(
        [data_train.columns[0], data_train.columns[1], data_train.columns[-1]],
        axis=1).apply(pd.to_numeric, errors='ignore')
    y_train = pd.Series(data_train.iloc[:, -1])
    x_test = data_test.drop(
        [data_test.columns[0], data_test.columns[1], data_test.columns[-1]],
        axis=1).apply(pd.to_numeric, errors='ignore')
    y_test = pd.Series(data_test.iloc[:, -1])

    type = int(input('type: [1: supervised, 2: semi-supervised, 3: unsupervised] '))
    if type == 1:
        method = int(input('method: [1: classification, 2: regression] '))
        if method == 1:
            classifier = int(input(
                'classifier: [1: decision tree, 2: extra tree, 3: extra trees, 4: k nearest neighbor, 5: naive bayes, 6: radius neighbors, 7: random forest, 8: support vector machine, 9: gradient boosting, 10: gaussian process, 11: stochastic gradient descent, 12: passive aggressive, 13: nearest centroid, 14: perceptron, 15: multi-layer perceptron, 16: ada boost] '
            ))
            if classifier == 1:
                criterion = int(input('criterion: [1: gini, 2: entropy] '))
                if criterion == 1:
                    print(type, method, classifier, criterion)
                    model = DecisionTreeClassifier(criterion='gini')
                elif criterion == 2:
                    print(type, method, classifier, criterion)
                    model = DecisionTreeClassifier(criterion='entropy')
                else:
                    print('no criterion chosen')
                    exit()
            elif classifier == 2:
                print(type, method, classifier)
                model = ExtraTreeClassifier()
            elif classifier == 3:
                print(type, method, classifier)
                model = ExtraTreesClassifier()
            elif classifier == 4:
                n = int(input('n: [1: 1, 2: 3, 3: 5] '))
                if n == 1:
                    print(type, method, classifier, n)
                    model = KNeighborsClassifier(n_neighbors=1)
                elif n == 2:
                    print(type, method, classifier, n)
                    model = KNeighborsClassifier(n_neighbors=3)
                elif n == 3:
                    print(type, method, classifier, n)
                    model = KNeighborsClassifier(n_neighbors=5)
                else:
                    print('no n chosen')
                    exit()
            elif classifier == 5:
                version = int(input(
                    'version: [1: gaussian, 2: bernoulli, 3: multinomial, 4: complement] '
                ))
                if version == 1:
                    print(type, method, classifier, version)
                    model = GaussianNB()
                elif version == 2:
                    print(type, method, classifier, version)
                    model = BernoulliNB()
                elif version == 3:
                    print(type, method, classifier, version)
                    model = MultinomialNB()
                elif version == 4:
                    print(type, method, classifier, version)
                    model = ComplementNB()
                else:
                    print('no version chosen')
                    exit()
            elif classifier == 6:
                print(type, method, classifier)
                model = RadiusNeighborsClassifier(radius=1.0)
            elif classifier == 7:
                print(type, method, classifier)
                model = RandomForestClassifier(n_estimators=50, random_state=1)
            elif classifier == 8:
                print(type, method, classifier)
                model = LinearSVC(
                    multi_class='crammer_singer')  #multi_class='ovr'
            elif classifier == 9:
                print(type, method, classifier)
                model = GradientBoostingClassifier()
            elif classifier == 10:
                print(type, method, classifier)
                model = GaussianProcessClassifier(multi_class='one_vs_one')
                # model = GaussianProcessClassifier(multi_class='one_vs_rest')
            elif classifier == 11:
                print(type, method, classifier)
                model = SGDClassifier()
            elif classifier == 12:
                print(type, method, classifier)
                model = PassiveAggressiveClassifier()
            elif classifier == 13:
                print(type, method, classifier)
                model = NearestCentroid()
            elif classifier == 14:
                print(type, method, classifier)
                model = Perceptron(tol=1e-3, random_state=0)
            elif classifier == 15:
                print(type, method, classifier)
                model = MLPClassifier()
            elif classifier == 16:
                print(type, method, classifier)
                model = AdaBoostClassifier(n_estimators=100)
            else:
                print('no classifier chosen')
                exit()
            # train the model using the training sets and check score
            model.fit(x_train, y_train)
            model.score(x_train, y_train)

            # predict output
            predictions = pd.Series(model.predict(x_test))

            filename = '{},{},{}.txt'.format(type, method, classifier)
            with open(filename, 'w') as output:
                output.write('{:10}\t{:10}\t{:10}\t{:10}\n'.format(
                    'actual', 'predict', 'approximate', 'match?'))
                for i in range(len(predictions)):
                    match = True if (y_test[i] == predictions[i]) else False
                    output.write('{:10}\t{:10}\t{:10}\n'.format(
                        y_test[i], predictions[i], match))
                output.write('accuracy: {:7.2f}%'.format(
                    100 * accuracy_score(y_test, predictions)))

            print('accuracy: {:7.2f}%'.format(
                100 * accuracy_score(y_test, predictions)))
            print(
                classification_report(
                    y_test,
                    predictions,
                    target_names=['RightTroll', 'LeftTroll', 'Other']))
            print(
                confusion_matrix(y_test,
                                 predictions,
                                 labels=["RightTroll", "LeftTroll", "Other"]))
        elif method == 2:
            # transform into binary classification problem
            # y_train = y_train.apply(lambda x: 0 if x == 'Other' else 1)
            # y_test = y_test.apply(lambda x: 0 if x == 'Other' else 1)

            # transform string labels into integers
            # le = LabelEncoder()
            # le.fit(y_train) # print(le.transform(['LeftTroll', 'Other', 'Other', 'RightTroll'])), print(le.inverse_transform([0, 1, 2, 1]))
            # print(le.classes_)
            #
            # y_train = le.transform(y_train)
            # y_test = le.transform(y_test)

            regressor = int(input(
                'regressor: [1: linear discriminant analysis, 2: logistic regression, 3: ridge regression, 4: quadratic discriminant analysis, 5: linear regression, 6: decision tree regression, 7: pls regression, 8: pls canonical, 9: canonical correlation analysis, 10: lasso, 11: multi-task lasso, 12: elastic net, 13: multi-task elastic net, 14: least angle regression, 15: least angle regression lasso, 16: orthogonal matching pursuit, 17: bayesian ridge, 18: automatic relevance determination, 19: theil sen regression, 20: huber regressor, 21: random sample consensus] '
            ))
            if regressor == 1:
                print(type, method, regressor)
                model = LinearDiscriminantAnalysis()
            elif regressor == 2:
                print(type, method, regressor)
                model = LogisticRegression(
                    solver='lbfgs', multi_class='multinomial')  #'newton-cg'
            elif regressor == 3:
                print(type, method, regressor)
                model = RidgeClassifier()
            elif regressor == 4:
                print(type, method, regressor)
                model = QuadraticDiscriminantAnalysis()
            elif regressor == 5:
                strategy = int(input('strategy: [1: one vs rest, 2: one vs one] '))
                if strategy == 1:
                    print(type, method, strategy, regressor)
                    model = OneVsRestClassifier(LinearRegression())
                elif strategy == 2:
                    print(type, method, strategy, regressor)
                    model = OneVsOneClassifier(LinearRegression())
                else:
                    print('no strategy selected')
                    exit()
            elif regressor == 6:
                strategy = int(input('strategy: [1: one vs rest, 2: one vs one] '))
                if strategy == 1:
                    print(type, method, strategy, regressor)
                    model = OneVsRestClassifier(DecisionTreeRegressor())
                elif strategy == 2:
                    print(type, method, strategy, regressor)
                    model = OneVsOneClassifier(DecisionTreeRegressor())
                else:
                    print('no strategy selected')
                    exit()
            elif regressor == 7:
                print(type, method, regressor)
                model = PLSRegression(n_components=2)
            elif regressor == 8:
                print(type, method, regressor)
                model = PLSCanonical(n_components=2)
            elif regressor == 9:
                print(type, method, regressor)
                model = CCA(n_components=1)
            elif regressor == 10:
                print(type, method, regressor)
                model = Lasso(alpha=0.1)
            elif regressor == 11:
                print(type, method, regressor)
                model = MultiTaskLasso(alpha=0.1)
            elif regressor == 12:
                print(type, method, regressor)
                model = ElasticNet(random_state=0)
            elif regressor == 13:
                print(type, method, regressor)
                model = MultiTaskElasticNet(random_state=0)
            elif regressor == 14:
                print(type, method, regressor)
                model = Lars(n_nonzero_coefs=1)
            elif regressor == 15:
                print(type, method, regressor)
                model = LassoLars(alpha=.1)
            elif regressor == 16:
                print(type, method, regressor)
                model = OrthogonalMatchingPursuit()
            elif regressor == 17:
                print(type, method, regressor)
                model = BayesianRidge()
            elif regressor == 18:
                print(type, method, regressor)
                model = ARDRegression()
            elif regressor == 19:
                print(type, method, regressor)
                model = TheilSenRegressor(random_state=0)
            elif regressor == 20:
                print(type, method, regressor)
                model = HuberRegressor()
            elif regressor == 21:
                print(type, method, regressor)
                model = RANSACRegressor(random_state=0)
            else:
                print('no regressor chosen')
                exit()

            # train the model using the training sets and check score
            model.fit(x_train, y_train)
            model.score(x_train, y_train)

            # print('coefficient:', model.coef_)
            # print('intercept:', model.intercept_)

            # predict output
            predictions = pd.Series(model.predict(x_test))
            print('{:10}\t{:10}\t{:10}'.format('actual', 'predict', 'match?'))

            # calculate accuracy
            numerator = 0.0
            denominator = float(len(predictions))
            for i in range(len(predictions)):
                match = True if (y_test[i] == predictions[i]) else False
                numerator += 1 if match else 0
                print('{:10}\t{:10}\t{:10}'.format(y_test[i], predictions[i],
                                                   match))
            print('accuracy = {:7.2f}%'.format(100 * numerator / denominator))

        else:
            print('no method chosen')
            exit()
    elif type == 2:
        classifier = int(input(
            'classifier: [1: label propagation, 2: label spreading] '))
        if classifier == 1:
            print(type, classifier)
            model = LabelPropagation()
        elif classifier == 2:
            print(type, classifier)
            model = LabelSpreading()
        else:
            print('no classifier chosen')
            exit()
        # train the model using the training sets and check score
        model.fit(x_train, y_train)
        model.score(x_train, y_train)

        # predict output
        predictions = pd.Series(model.predict(x_test))
        print('{:10}\t{:10}\t{:10}'.format('actual', 'predict', 'match?'))

        # calculate accuracy
        numerator = 0.0
        denominator = float(len(predictions))
        for i in range(len(predictions)):
            match = True if (y_test[i] == predictions[i]) else False
            numerator += 1 if match else 0
            print('{:10}\t{:10}\t{:10}'.format(y_test[i], predictions[i],
                                               match))
        print('accuracy = {:7.2f}%'.format(100 * numerator / denominator))
    elif type == 3:
        method = int(input(
            'method: [1: clustering, 2: random trees embedding, 3: nearest neighbors] '
        ))
        if method == 1:
            clusterer = int(input('clusterer: [1: k means] '))
            if clusterer == 1:
                clusters = int(input('clusters: [1: 1, 2: 2, 3: 3] '))
                if clusters == 1:
                    print(type, method, clusters)
                    model = KMeans(n_clusters=1, random_state=0)
                elif clusters == 2:
                    print(type, method, clusters)
                    model = KMeans(n_clusters=2, random_state=0)
                elif clusters == 3:
                    print(type, method, clusters)
                    model = KMeans(n_clusters=3, random_state=0)
                else:
                    print('no clusters chosen')
                    exit()
            else:
                print('no clusterer chosen')
                exit()
            # train the model using the training sets and check score
            model.fit(x_train)

            # predict output
            predictions = model.predict(x_test)
            print('{:10}\t{:10}\t{:10}'.format('actual', 'predict', 'match?'))

            # check details
            print('centroids: {}'.format(model.cluster_centers_))
            # print('labels: ' + model.labels_)
        elif method == 2:
            model = RandomTreesEmbedding()
            # train the model using the training sets and check score
            model.fit(x_train)

            # predict output
            predictions = model.apply(x_test)
            print('{:10}\t{:10}\t{:10}'.format('actual', 'predict', 'match?'))
        elif method == 3:
            model = NearestNeighbors(n_neighbors=2, algorithm='ball_tree')
            # train the model using the training sets and check score
            model.fit(x_train)
            distances, indices = model.kneighbors(x_test)

        else:
            print('no method chosen')
            exit()

        # calculate accuracy
        numerator = 0.0
        denominator = float(len(predictions))
        for i in range(len(predictions)):
            match = True if (y_test[i] == predictions[i]) else False
            numerator += 1 if match else 0
            print('{:10}\t{:10}\t{:10}'.format(y_test[i], predictions[i],
                                               match))
        print('accuracy = {:7.2f}%'.format(100 * numerator / denominator))
    else:
        print('no type chosen')
        exit()
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomTreesEmbedding, RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

n_estimator = 10
X, y = make_classification(n_samples=80000)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
# It is important to train the ensemble of trees on a different subset
# of the training data than the linear regression model to avoid
# overfitting, in particular if the total number of leaves is
# similar to the number of training samples
X_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train,
                                                            y_train,
                                                            test_size=0.5)

# Unsupervised transformation based on totally random trees
rt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator)
rt_lm = LogisticRegression()
rt.fit(X_train, y_train)
rt_lm.fit(SelectFromModel(rt, prefit=True).transform(X_train_lr), y_train_lr)

y_pred_rt = rt_lm.predict_proba(
	SelectFromModel(rt, prefit=True).transform(X_test))[:, 1]
fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)

# Supervised transformation based on random forests
rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
rf_enc = OneHotEncoder()
rf_lm = LogisticRegression()
rf.fit(X_train, y_train)
rf_enc.fit(rf.apply(X_train))
rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)
        'Gradient Boost'
    ]
    return classifier_list, classifier_name_list


def print_evaluation_metrics(trained_model, trained_model_name, X_test,
                             y_test):
    print('--------- For Model : ', trained_model_name)
    predicted_values = trained_model.predict(X_test)
    print(metrics.classification_report(y_test, predicted_values))
    print("Accuracy Score : ", metrics.accuracy_score(y_test, predicted_values))
    print("---------------------------------------\n")


filename = 'train.csv'
imperial_frame = pd.read_csv(filename)
feature_hash = hashfeatures.FeatureHash(max_feature_num=5000)
insult_features = feature_hash.get_feature_set(
    list(imperial_frame['Comment'].values))
class_labels = list(imperial_frame['Insult'].values)
rf_embed_features = RandomTreesEmbedding(n_estimators=151, random_state=42)
insult_features = rf_embed_features.fit_transform(insult_features)
X_train, X_test, y_train, y_test = train_test_split(insult_features,
                                                    class_labels,
                                                    test_size=0.1,
                                                    random_state=42)
classifier_list, classifier_name_list = get_ensemble_models()
for classifier, classifier_name in zip(classifier_list, classifier_name_list):
    classifier.fit(X_train, y_train)
    print_evaluation_metrics(classifier, classifier_name, X_test, y_test)
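Only the tail of get_ensemble_models survives in the fragment above; a hypothetical reconstruction consistent with that tail (parallel lists of estimators and display names, ending with 'Gradient Boost') might look like:

from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier,
                              GradientBoostingClassifier)

def get_ensemble_models():
    # hypothetical helper: the real project may use different models and settings
    classifier_list = [
        RandomForestClassifier(n_estimators=100, random_state=42),
        AdaBoostClassifier(random_state=42),
        GradientBoostingClassifier(random_state=42),
    ]
    classifier_name_list = ['Random Forest', 'AdaBoost', 'Gradient Boost']
    return classifier_list, classifier_name_list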
class EnsembleIOC(BaseEstimator, RegressorMixin):

    def __init__(self,  n_estimators=20, 
                        max_depth=5, min_samples_split=10, min_samples_leaf=10,
                        random_state=0,
                        em_itrs=5,
                        regularization=0.05,
                        passive_dyn_func=None,
                        passive_dyn_ctrl=None,
                        passive_dyn_noise=None,
                        verbose=False):
        '''
        n_estimators        - number of ensembled models
        ...                 - a batch of parameters used for RandomTreesEmbedding, see relevant documents
        em_itrs             - maximum number of EM iterations to take
        regularization      - small positive scalar to prevent singularity of matrix inversion
        passive_dyn_func    - function to evaluate passive dynamics; None for MaxEnt model
        passive_dyn_ctrl    - function to return the control matrix which might depend on the state...
        passive_dyn_noise   - covariance of a Gaussian noise; only applicable when passive_dyn is Gaussian; None for MaxEnt model
                                note this implies a dynamical system with constant input gain. It is extendable to have state dependent
                                input gain then we need covariance for each data point
        verbose             - output training information
        '''
        BaseEstimator.__init__(self)

        self.n_estimators=n_estimators
        self.max_depth=max_depth
        self.min_samples_split=min_samples_split
        self.min_samples_leaf=min_samples_leaf
        self.random_state=random_state
        self.em_itrs=em_itrs
        self.reg=regularization
        self.passive_dyn_func=passive_dyn_func
        self.passive_dyn_ctrl=passive_dyn_ctrl
        self.passive_dyn_noise=passive_dyn_noise
        self.verbose=verbose
        return

    def fit(self, X, y=None):
        '''
        y could be the array of starting states of the demonstrated trajectories/policies;
        if it is None, it implicitly implies a MaxEnt model. Otherwise, it serves as the feature mapping
        of the starting state. This data might also potentially be used for learning the passive dynamics
        for a purely model-free learning scheme with some regressors and regularization.
        '''
        #check parameters...
        assert(type(self.n_estimators)==int)
        assert(self.n_estimators > 0)
        assert(type(self.max_depth)==int)
        assert(self.max_depth > 0)
        assert(type(self.min_samples_split)==int)
        assert(self.min_samples_split > 0)
        assert(type(self.min_samples_leaf)==int)
        assert(self.min_samples_leaf > 0)
        assert(type(self.em_itrs)==int)

        #an initial partitioning of data with random forest embedding
        self.random_embedding_mdl_ = RandomTreesEmbedding(
            n_estimators=self.n_estimators,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            random_state=self.random_state
            )

        #we probably do not need the data type to differentiate whether it is a demonstration
        #of a trajectory or a commanded state, do we?
        if self.passive_dyn_func is not None and self.passive_dyn_ctrl is not None and self.passive_dyn_noise is not None:
            self.random_embedding_mdl_.fit(X[:, X.shape[1]//2:])
            indices = self.random_embedding_mdl_.apply(X[:, X.shape[1]//2:])
            # X_tmp = np.array(X)
            # X_tmp[:, X.shape[1]/2:] = X_tmp[:, X.shape[1]/2:] - X_tmp[:, :X.shape[1]/2]
            # self.random_embedding_mdl_.fit(X_tmp)

            # indices = self.random_embedding_mdl_.apply(X_tmp)
        else:
            self.random_embedding_mdl_.fit(X)
            #figure out indices
            indices = self.random_embedding_mdl_.apply(X)

        partitioned_data = defaultdict(list)

        leaf_idx = defaultdict(set)
        weight_idx = defaultdict(float)
        #group data that belong to the same partition and compute the weights...
        #is the weight really necessary for the EM steps? Hmm, it seems to be for the initialization
        #d_idx: data index; p_idx: partition index (comprised of estimator index and leaf index)
        for d_idx, d, p_idx in zip(range(len(X)), X, indices):
            for e_idx, l_idx in enumerate(p_idx):
                partitioned_data[e_idx, l_idx].append(d)
                leaf_idx[e_idx] |= {l_idx}

            for e_idx, l_idx in enumerate(p_idx):
                weight_idx[e_idx, l_idx] = float(len(partitioned_data[e_idx, l_idx])) / len(X)
                # weight_idx[e_idx, l_idx] = 1. / len(p_idx)

        #for each group of data, solve an easy IOC problem by assuming a quadratic cost-to-go function
        #note that, if the passive dynamics need to be learned, extra steps are needed to train a regressor with weighted data
        #otherwise, simply a Gaussian for each conditional probability distribution model
        self.estimators_ = []
        #another copy to store the parameters all together, for EM/evaluation on all of the models
        self.estimators_full_ = defaultdict(list)
        #<hyin/Feb-6th-2016> an estimator and leaf indexed structure to record the passive likelihood of data...
        passive_likelihood_dict = defaultdict(list)
        for e_idx in range(self.n_estimators):
            #for each estimator
            estimator_parms = defaultdict(list)
            for l_idx in leaf_idx[e_idx]:
                if self.verbose:
                    print('Processing {0}-th estimator and {1}-th leaf...'.format(e_idx, l_idx))
                #and for each data partition
                data_partition=np.array(partitioned_data[e_idx, l_idx])
                if self.passive_dyn_func is not None and self.passive_dyn_ctrl is not None and self.passive_dyn_noise is not None:
                    X_new         = data_partition[:, data_partition.shape[1]//2:]
                    X_old         = data_partition[:, 0:data_partition.shape[1]//2]
                    X_new_passive = np.array([self.passive_dyn_func(X_old[sample_idx]) for sample_idx in range(data_partition.shape[0])])
                    passive_likelihood = _passive_dyn_likelihood(X_new, X_new_passive, self.passive_dyn_noise, self.passive_dyn_ctrl, self.reg)

                    weights = passive_likelihood / np.sum(passive_likelihood)
                    weighted_mean = np.sum((weights*X_new.T).T, axis=0)

                    estimator_parms['means'].append(weighted_mean)
                    estimator_parms['covars'].append(_frequency_weighted_covariance(X_new, weighted_mean, weights, spherical=False))

                    #for full estimators
                    self.estimators_full_['means'].append(estimator_parms['means'][-1])
                    self.estimators_full_['covars'].append(estimator_parms['covars'][-1])

                    #<hyin/Feb-6th-2016> also remember the data weight according to the passive likelihood
                    #this could be useful if the weights according to the passive likelihood are desired for other applications
                    #to evaluate some statistics within the data partition
                    passive_likelihood_dict[e_idx, l_idx] = weights
                else:
                    estimator_parms['means'].append(np.mean(data_partition, axis=0))
                    estimator_parms['covars'].append(np.cov(data_partition.T))

                    #for full estimators
                    self.estimators_full_['means'].append(estimator_parms['means'][-1])
                    self.estimators_full_['covars'].append(estimator_parms['covars'][-1])

                    #for MaxEnt, uniform passive likelihood
                    passive_likelihood_dict[e_idx, l_idx] = np.ones(len(data_partition)) / float(len(data_partition))


                estimator_parms['weights'].append(weight_idx[e_idx, l_idx])
                self.estimators_full_['weights'].append(weight_idx[e_idx, l_idx]/float(self.n_estimators))

            self.estimators_.append(estimator_parms)
        #can stop here or go for expectation maximization for each estimator...
        if self.em_itrs > 0:
            #prepare em results for each estimator
            em_res = [self._em_steps(e_idx, X, y) for e_idx in range(self.n_estimators)]
            #or do EM on the full model?
            # <hyin/Dec-2nd-2015> no, doing this seems to harm the learning as the aggregated model is really
            # complex so optimizing that model tends to overfit...
            # em_res = self._em_steps(None, X, y)
            #then use them
            self.estimators_=em_res

        self.prepare_inv_and_constants()
        return indices, leaf_idx, passive_likelihood_dict

    def _em_steps(self, estimator_idx, X, y=None):
        #use current estimation as initialization to perform expectation-maximization
        #now reuse the procedure implemented by scikit-learn; actually, a customized implementation
        #is required if the passive dynamics also need to be learned.
        if self.verbose:
            if estimator_idx is not None:
                print('EM steps for the estimator {0}'.format(estimator_idx))
            else:
                print('EM steps...')

        if self.passive_dyn_func is not None and self.passive_dyn_ctrl is not None and self.passive_dyn_noise is not None:
            #extract X_old, X_new, X_new_passive
            X_old = X[:, 0:X.shape[1]//2]
            X_new = X[:, X.shape[1]//2:]
            X_new_passive = np.array([self.passive_dyn_func(X_old[sample_idx]) for sample_idx in range(X.shape[0])])


            # EM algorithms
            current_log_likelihood = None
            # reset self.converged_ to False
            converged = False
            # this line should be removed when 'thresh' is removed in v0.18
            tol = 1e-4
            #use the internal EM steps for non-uniform passive dynamics case
            for i in range(self.em_itrs):
                prev_log_likelihood = current_log_likelihood
                # Expectation step
                log_likelihoods, responsibilities = self._do_estep(
                    estimator_idx, X_new_passive, X_new, y)
                current_log_likelihood = log_likelihoods.mean()

                if self.verbose:
                    print('current_log_likelihood:', current_log_likelihood)
                if prev_log_likelihood is not None:
                    change = abs(current_log_likelihood - prev_log_likelihood)
                    if change < tol:
                        converged = True
                        break

                # Maximization step
                if estimator_idx is not None:
                    self._do_mstep(X_new_passive, X_new, responsibilities, self.estimators_[estimator_idx])
                else:
                    self._do_mstep(X_new_passive, X_new, responsibilities, self.estimators_full_)

            if estimator_idx is None:
                res=self.estimators_full_
            else:
                res=self.estimators_[estimator_idx]
        else:
            if estimator_idx is not None:
                n_partitions=len(self.estimators_[estimator_idx]['weights'])
                #use our own initialization
                g = mixture.GMM(n_components=n_partitions, n_iter=self.em_itrs, init_params='',
                    covariance_type='full')
                g.means_=np.array(self.estimators_[estimator_idx]['means'])
                g.covars_=np.array(self.estimators_[estimator_idx]['covars'])
                g.weights_=np.array(self.estimators_[estimator_idx]['weights'])
            else:
                n_partitions=len(self.estimators_full_['weights'])
                g = mixture.GMM(n_components=n_partitions, n_iter=self.em_itrs, init_params='',
                    covariance_type='full')
                g.means_=np.array(self.estimators_full_['means'])
                g.covars_=np.array(self.estimators_full_['covars'])
                g.weights_=np.array(self.estimators_full_['weights'])

            g.fit(X)

            #prepare to return a defaultdict
            res=defaultdict(list)
            res['means']=list(g.means_)
            res['covars']=list(g.covars_)
            res['weights']=list(g.weights_)

        return res

    def _do_estep(self, estimator_idx, X_new_passive, X_new, y):
        return self._score_sample_for_passive_mdl_helper(
                    estimator_idx, X_new_passive, X_new, y)

    def _do_mstep(self, X_new_passive, X_new, responsibilities, parms, min_covar=1e-7):
        """
        X_new_passive    -  An array of the old states propagated through the passive dynamics
        X_new            -  An array of the observed new states
        responsibilities -  array_like, shape (n_samples, n_components)
                            Posterior probabilities of each mixture component for each data point
        """
        n_samples, n_dim = X_new.shape
        weights = responsibilities.sum(axis=0)
        weighted_X_new_sum = np.dot(responsibilities.T, X_new)
        weighted_X_new_passive_sum = np.dot(responsibilities.T, X_new_passive)
        inverse_weights = 1.0 / (weights[:, np.newaxis] + 10 * EPS)
        weighted_X_new_mean = weighted_X_new_sum * inverse_weights
        weighted_X_new_passive_mean = weighted_X_new_passive_sum * inverse_weights

        if 'weights' in parms:
            parms['weights'] = (weights / (weights.sum() + 10 * EPS) + EPS)

        # delta_X_new                 = [None] * n_samples
        # delta_X_new_passive         = [None] * n_samples
        # delta_X_new_passive_Sigma_0 = [None] * n_samples
        # one_array = np.ones(n_dim)
        # for c in range(len(parms['weights'])):
        #     delta_X_new[c]                 = X_new - weighted_X_new_mean[c]
        #     delta_X_new_passive[c]         = X_new_passive - weighted_X_new_passive_mean[c]
        #     delta_X_new_passive_Sigma_0[c] = (1./self.passive_dyn_noise * np.eye(n_dim).dot(delta_X_new_passive[c].T)).T

        # if 'covars' in parms:
        #     #now only support diagonal covariance matrix
        #     for c, old_covar in enumerate(parms['covars']):
        #         constant=np.sum(delta_X_new[c]*delta_X_new[c]*responsibilities[:, c][:, np.newaxis], axis=0)#*inverse_weights[c, 0]
        #         so_coeff=np.sum(delta_X_new_passive_Sigma_0[c]*delta_X_new_passive_Sigma_0[c]*responsibilities[:, c][:, np.newaxis], axis=0)#*inverse_weights[c, 0]
        #         #take the roots for S matrix
        #         S_k=(np.sqrt(one_array+4*so_coeff*constant)-one_array)/(2*so_coeff)
        #         #get Sigma_k from S_k through S_k^(-1) = Sigma_k^(-1) + Sigma_0^(-1)
        #         Sigma_k = 1./(1./S_k -  1./self.passive_dyn_noise * np.ones(n_dim))
        #         print S_k, Sigma_k
        #         parms['covars'][c] = np.diag(Sigma_k)
        # if 'means' in parms:
        #     for c, old_mean in enumerate(parms['means']):
        #         Sigma_k_array = np.diag(parms['covars'][c])
        #         S_k=1./Sigma_k_array + 1./self.passive_dyn_noise * np.ones(n_dim)
        #         coeff_mat = np.diag(Sigma_k_array*(1./S_k))
        #         #difference betwen X_new and X_new_passive
        #         delta_X_new_X_new_passive = X_new - (np.diag(S_k).dot(X_new_passive.T)).T
        #         parms['means'][c] = coeff_mat.dot(np.sum(delta_X_new_X_new_passive*responsibilities[:, c][:, np.newaxis]*inverse_weights[c, 0], axis=0))
        #<hyin/Oct-23rd-2015> Try the formulation from the Bellman equation; this seems to lead to a weighted linear regression problem...
        # c = (X_new - X_new_passive)
        #<hyin/Oct-27th-2015> Try the closed-form solutions for a relaxed lower bound
        # if 'means' in parms:
        #     parms['means'] = weighted_X_new_mean
        # if 'covars' in parms:
        #     for c, old_covar in enumerate(parms['covars']):
        #         data_weights = responsibilities[:, c]
        #         parms['covars'][c] = _frequency_weighted_covariance(X_new, parms['means'][c], data_weights)

        #<hyin/Nov-20th-2015> As far as I can tell, the above closed-form solution actually optimizes a value lower than the actual objective;
        #however, this approximation is not tight, so unfortunately we cannot guarantee the optimum is also obtained for the actual objective...
        #another idea is to simplify the model by only learning the mean, i.e., the center of the RBF function;
        #the width of the RBF basis can be adapted by solving a one-dimensional numerical optimization, which should lead to
        #a generalized EM algorithm
        #<hyin/Jan-22nd-2016> note that without the adaptation of the covariance, shifting the mean
        #is not that great an option, so let's only keep the weights adaptation. We need numerical optimization for the covariance adaptation
        #to see if it would help the mean shift
        if 'means' in parms:
            for c, old_mean in enumerate(parms['means']):
                Sigma_k_array = parms['covars'][c]
                # S_k = self.passive_dyn_noise * self.passive_dyn_ctrl + Sigma_k_array + 1e-5*np.eye(X_new.shape[1])
                # # coeff_mat = np.diag(Sigma_k_array*(1./S_k))
                # inv_Sigma_k_array = np.linalg.pinv(Sigma_k_array)
                # inv_Sigma_sum = np.linalg.pinv(S_k + Sigma_k_array)
                # #could use woodbury here...
                # coeff_mat = np.linalg.pinv(inv_Sigma_k_array - inv_Sigma_sum)
                # #difference betwen X_new and X_new_passive
                # delta_X_new_X_new_passive = (inv_Sigma_k_array.dot(X_new.T) - inv_Sigma_sum.dot(X_new_passive.T)).T

                # parms['means'][c] = coeff_mat.dot(np.sum(delta_X_new_X_new_passive*responsibilities[:, c][:, np.newaxis]*inverse_weights[c, 0], axis=0))

                # #another formulation? which one is correct?
                # <hyin/Dec-2nd-2015> this seems more straightforward and at least gives a steadily increasing likelihood
                # need to check the original formulation to see what the problem is
                inv_Sigma_k_array = np.linalg.pinv(Sigma_k_array)
                inv_Sigma_0 = np.linalg.pinv(self.passive_dyn_noise * self.passive_dyn_ctrl + self.reg*np.eye(X_new.shape[1]))
                coeff_mat = Sigma_k_array
                inv_Sigma_sum = inv_Sigma_k_array + inv_Sigma_0
                delta_X_new_X_new_passive = (inv_Sigma_sum.dot(X_new.T) - inv_Sigma_0.dot(X_new_passive.T)).T
                parms['means'][c] = coeff_mat.dot(np.sum(delta_X_new_X_new_passive*responsibilities[:, c][:, np.newaxis]*inverse_weights[c, 0], axis=0))
        # return

    def sample(self, n_samples=1, random_state=None):
        '''
        return samples that are synthesized from the model
        '''
        if not hasattr(self, 'estimators_'):
            print('The model has not been trained yet...')
            return
        else:
            pass
        return

    def score(self, X, y=None):
        #take log likelihood for each estimator for a given trajectory/state
        #without considering the passive dynamics: MaxEnt model
        estimator_scores=[_log_multivariate_normal_density_full(
                            X,
                            np.array(self.estimators_[e_idx]['means']),
                            np.array(self.estimators_[e_idx]['covars']))
                            +np.log(self.estimators_[e_idx]['weights']) for e_idx in range(self.n_estimators)]

        # concatenate different models...
        # estimator_scores=np.concatenate(estimator_scores,axis=1)
        # res=[logsumexp(x)-np.log(1./self.n_estimators) for x in np.array(estimator_scores)]
        # another way: mean of evaluated cost functions
        # helper to evaluate a single model
        mdl_eval = lambda scores: [logsumexp(x_score) for x_score in scores]
        estimator_scores = np.array([mdl_eval(scores) for scores in estimator_scores])

        responsibilities = [np.exp(estimator_scores[e_idx] - estimator_scores[e_idx][:, np.newaxis]) for e_idx in range(self.n_estimators)]
        #average seems to be more reasonable...
        res=np.mean(estimator_scores,axis=0)
        res_responsibilities = np.mean(np.array(responsibilities), axis=0)
        return -np.array(res), res_responsibilities

    def score_samples(self, X, y=None, min_covar=1.e-7):
        #a different version to evaluate the quality/likelihood of state pairs
        if self.passive_dyn_func is not None and self.passive_dyn_ctrl is not None and self.passive_dyn_noise is not None:
            X_old = X[:, 0:X.shape[1]//2]
            X_new = X[:, X.shape[1]//2:]
            X_new_passive = np.array([self.passive_dyn_func(X_old[sample_idx]) for sample_idx in range(X.shape[0])])

            log_prob_lst = [None] * self.n_estimators
            respon_lst = [None] * self.n_estimators
            for e_idx in range(self.n_estimators):
                log_prob_lst[e_idx], respon_lst[e_idx] = self._score_sample_for_passive_mdl_helper(
                    e_idx, X_new_passive, X_new, y, min_covar)
            res = -np.mean(np.array(log_prob_lst),axis=0)
            res_responsibilities = np.mean(np.array(respon_lst), axis=0)
        else:
            #this should be a trajectory/maximum ent model, use score...
            res, res_responsibilities = self.score(X, y)
        return res, res_responsibilities 


    def value_eval_samples(self, X, y=None, average=False, full=True, const=True):
        #switching off the constant term seems to smooth the value function
        #I don't quite understand why; my current guess is that the axis-aligned partition results in
        #oversized covariance matrices, making the constant terms extremely large for some partitions
        #this can be shown by adding a fixed term to the covariance matrices to mitigate the singularity,
        #which could be cast as a kind of regularization

        #the new switch is actually equivalent to average=True, but since the training parameters are separated
        #let's keep this ugly solution...
        n_samples, n_dim = X.shape

        if not average:
            if not full:
                weights = []
                for idx in range(self.n_estimators):
                    weights = weights + (np.array(self.estimators_[idx]['weights'])/self.n_estimators).tolist()
                #the real function to evaluate the value functions, which are actually un-normalized Gaussians
                def value_estimator_eval(d):
                    res = []
                    for idx in range(self.n_estimators):
                        for i, (m, c_inv) in enumerate(   zip(self.estimators_[idx]['means'], 
                                                    self.estimators_[idx]['inv_covars'])):
                            diff_data = d - m
                            res.append(.5*diff_data.dot(c_inv).dot(diff_data) + self.estimators_[idx]['beta'][i]*const)
                    return np.array(res)

                res = np.array([ -logsumexp(-value_estimator_eval(d), b=np.array(weights)) for d in X])
            else:
                res = np.zeros(X.shape[0])
                res_mat = np.zeros((X.shape[0], len(self.estimators_full_['means'])))
                for i, (m, c_inv)   in enumerate(   zip(self.estimators_full_['means'], 
                                                self.estimators_full_['inv_covars'])):
                    diff_data = X - m
                    res_mat[:, i] = np.array([e_prod.dot(e)*0.5 + self.estimators_full_['beta'][i]*const for e_prod, e in zip(diff_data.dot(c_inv), diff_data)])
                for d_idx, r in enumerate(res_mat):
                    res[d_idx] = -logsumexp(-r, b=self.estimators_full_['weights'])
        else:
            #the real function to evaluate the value functions, which are actually un-normalized Gaussians
            def value_estimator_eval(idx):
                res = np.zeros((X.shape[0], len(self.estimators_[idx]['means'])))
                logsumexp_res=np.zeros(len(res))
                for i, (m, c_inv) in enumerate(   zip(self.estimators_[idx]['means'], 
                                            self.estimators_[idx]['inv_covars'])):
                    diff_data = X - m
                    res[:, i] = np.array([e_prod.dot(e)*0.5 + self.estimators_[idx]['beta'][i]*const for e_prod, e in zip(diff_data.dot(c_inv), diff_data)])
                for d_idx, r in enumerate(res):
                    logsumexp_res[d_idx] = -logsumexp(-r, b=self.estimators_[idx]['weights'])

                return logsumexp_res
                
            estimator_scores = [ value_estimator_eval(e_idx) for e_idx in range(self.n_estimators) ]
            #take average
            res = np.mean(np.array(estimator_scores), axis=0)
        return res
 
    def _score_sample_for_passive_mdl_helper(self, estimator_idx, X_new_passive, X_new, y, min_covar=1.e-7):
        #for the specified estimator with a passive dynamics model,
        #evaluate the likelihood for given state pairs
        #to call this, ensure passive dynamics and noise are available
        n_samples, n_dim = X_new.shape

        #incorporate the likelihood of passive dynamics - a Gaussian
        """
                        P_0(x'|x) exp(V(x'))
        P(x'|x) = --------------------------------- = N(x'; m(x), S)
                    int_x'' P_0(x''|x) exp(V(x''))
        """
        """
        For the sake of the maximization step and simplicity, evaluate a lower bound instead:
        log(P(x'|x)) > -0.5 * D * log(2*pi) + 0.5*log(det(S^{-1})) - 0.5*log2 + 0.5*log2 - 0.5*(x'-f(x))^T Sigma^{-1} (x'-f(x)) - 0.5*(x'-mu_k)^T Sigma_k^{-1} (x'-mu_k) + 0.5*(mu_k-f(x))^T M^{-1} (mu_k-f(x))
                     > -0.5 * D * log(2*pi) + 0.5*log(det(S^{-1})/2) + 0.5*log2 - 0.5*(x'-f(x))^T Sigma^{-1} (x'-f(x)) - 0.5*(x'-mu_k)^T Sigma_k^{-1} (x'-mu_k)
                     > -0.5 * D * log(2*pi) + 0.5*log((det(Sigma_k)^{-1}+det(Sigma_0)^{-1})/2) + 0.5*log2 - 0.5*(x'-f(x))^T Sigma^{-1} (x'-f(x)) - 0.5*(x'-mu_k)^T Sigma_k^{-1} (x'-mu_k) + 0.5*(mu_k-f(x))^T M^{-1} (mu_k-f(x))
                     > -0.5 * D * log(2*pi) + 0.5*log(det(Sigma_k)^{-1})/2 + 0.5*log(det(Sigma_0))/2 + 0.5*log2 - 0.5*(x'-f(x))^T Sigma^{-1} (x'-f(x)) - 0.5*(x'-mu_k)^T Sigma_k^{-1} (x'-mu_k) + 0.5*(mu_k-f(x))^T M^{-1} (mu_k-f(x))
        Any way to bound the last term so that it is also independent of matrices other than Sigma_k?
        """

        # regularize to prevent numerical instability
        Sigma_0 = self.passive_dyn_noise * self.passive_dyn_ctrl + self.reg*np.eye(X_new.shape[1])
        # + 1e-2 * np.eye(X_new.shape[1])
        Sigma_0_inv = np.linalg.pinv(Sigma_0)
        if estimator_idx is not None:
            Sigma   = self.estimators_[estimator_idx]['covars']
            mu      = self.estimators_[estimator_idx]['means']
            w       = self.estimators_[estimator_idx]['weights']
        else:
            Sigma   = self.estimators_full_['covars']
            mu      = self.estimators_full_['means']
            w       = self.estimators_full_['weights']
        nmix    = len(mu)

        log_prob  = np.empty((n_samples, nmix))
        for c, (mu_k, Sigma_k) in enumerate(zip(mu, Sigma)):
            #obviously, this fraction can be optimized by exploiting the structure of covariance matrix
            #using say Cholesky decomposition
            Sigma_k_inv = np.linalg.pinv(Sigma_k)
            S_inv       = Sigma_k_inv + Sigma_0_inv
            S           = np.linalg.pinv(S_inv)
            try:
                S_chol = linalg.cholesky(S, lower=True)
            except linalg.LinAlgError:
                # The model is most probably stuck in a component with too
                # few observations; we need to reinitialize this component
                S_chol = linalg.cholesky(S + min_covar * np.eye(n_dim),
                                          lower=True)
            m = S.dot((Sigma_k_inv.dot(mu_k)+Sigma_0_inv.dot(X_new_passive.T).T).T).T
            #fraction part of above equation
            # scale_log_det = -.5 * (np.log(2*np.pi) + np.sum(np.log(S_inv)) + 
            #     2*np.sum(np.log(np.diag(Sigma_k_chol))) + np.sum(np.log(np.diag(Sigma_0))))
            # #exp() part of the above equation
            # S_sol = linalg.solve_triangular(M_chol, (X_new - X_old).T, lower=True).T

            # scale_log_rbf = -.5 * (np.sum(M_sol**2), axis=1)
            S_log_det = 2 * np.sum(np.log(np.diag(S_chol)))
            # print 'S_log_det:', S_log_det
            S_sol = linalg.solve_triangular(S_chol, (X_new - m).T, lower=True).T
            log_prob[:, c] = -.5 * (np.sum(S_sol**2, axis=1) + n_dim * np.log(2 * np.pi) + S_log_det)
        lpr = log_prob + np.log(w)
        # print 'log_prob:', log_prob
        # print 'w:', w
        # print 'lpr:', lpr
        logprob = logsumexp(lpr, axis=1)
        responsibilities = np.exp(lpr - logprob[:, np.newaxis])
        return logprob, responsibilities

    def prepare_inv_and_constants(self):
        '''
        supplement steps to prepare inverse of variance matrices and constant terms
        ''' 
        regularization = self.reg
        for idx in range(self.n_estimators):
            self.estimators_[idx]['inv_covars'] = [ np.linalg.pinv(covar + np.eye(covar.shape[0])*regularization) for covar in self.estimators_[idx]['covars']]
            self.estimators_[idx]['beta'] = [.5*np.log(pseudo_determinant(covar + np.eye(covar.shape[0])*regularization)) + .5*np.log(2*np.pi)*covar.shape[0] for covar in self.estimators_[idx]['covars']]

        self.estimators_full_['weights'] = []
        self.estimators_full_['means'] = []
        self.estimators_full_['covars'] = []
        for e_idx in range(self.n_estimators):
            for leaf_idx in range(len(self.estimators_[e_idx]['weights'])):
                self.estimators_full_['weights'].append(self.estimators_[e_idx]['weights'][leaf_idx]/float(self.n_estimators))
                self.estimators_full_['covars'].append(self.estimators_[e_idx]['covars'][leaf_idx])
                self.estimators_full_['means'].append(self.estimators_[e_idx]['means'][leaf_idx])
        # self.estimators_full_['inv_covars'] = [ np.linalg.pinv(covar) for covar in self.estimators_full_['covars']]
        # self.estimators_full_['beta'] = [.5*np.log(pseudo_determinant(covar)) + .5*np.log(2*np.pi)*covar.shape[0] for covar in self.estimators_full_['covars']]
                self.estimators_full_['inv_covars'].append(self.estimators_[e_idx]['inv_covars'][leaf_idx])
                self.estimators_full_['beta'].append(self.estimators_[e_idx]['beta'][leaf_idx])
        return
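# A small numerical sketch (not part of the original class; names are illustrative)
# of the Gaussian-product identity that _score_sample_for_passive_mdl_helper relies on:
# combining the cost-to-go Gaussian N(mu_k, Sigma_k) with the passive-dynamics Gaussian
# N(f(x), Sigma_0) yields, up to a constant in x', a Gaussian with precision
# S^{-1} = Sigma_k^{-1} + Sigma_0^{-1} and mean m = S (Sigma_k^{-1} mu_k + Sigma_0^{-1} f(x)).
import numpy as np
from scipy.stats import multivariate_normal

rng = np.random.RandomState(0)
dim = 3
mu_k, fx = rng.randn(dim), rng.randn(dim)          # component mean and passive prediction f(x)
A, B = rng.randn(dim, dim), rng.randn(dim, dim)
Sigma_k = A.dot(A.T) + np.eye(dim)                 # cost-to-go covariance
Sigma_0 = B.dot(B.T) + np.eye(dim)                 # passive-dynamics covariance

S = np.linalg.inv(np.linalg.inv(Sigma_k) + np.linalg.inv(Sigma_0))
m = S.dot(np.linalg.inv(Sigma_k).dot(mu_k) + np.linalg.inv(Sigma_0).dot(fx))

def log_product_minus_combined(x):
    # log[N(x; mu_k, Sigma_k) * N(x; fx, Sigma_0)] - log N(x; m, S)
    return (multivariate_normal.logpdf(x, mu_k, Sigma_k)
            + multivariate_normal.logpdf(x, fx, Sigma_0)
            - multivariate_normal.logpdf(x, m, S))

# the difference is a normalizing constant, i.e. independent of the evaluation point
print(np.isclose(log_product_minus_combined(rng.randn(dim)),
                 log_product_minus_combined(rng.randn(dim))))   # expected: True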
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomTreesEmbedding
from sklearn.neural_network import MLPRegressor
from sklearn.mixture import BayesianGaussianMixture
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import RadiusNeighborsRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.gaussian_process import GaussianProcessRegressor

# sklearn random forest regression (and other regressors)
lr = LinearRegression()
dtr = DecisionTreeRegressor()
rfr = RandomForestRegressor()
rte = RandomTreesEmbedding()
mr = MLPRegressor(max_iter=1000)
omp = OrthogonalMatchingPursuit()
ran = RANSACRegressor()
tsr = TheilSenRegressor(random_state=42)
br = BayesianRidge(n_iter=300, tol=0.001)
bgm = BayesianGaussianMixture()
knr = KNeighborsRegressor(n_neighbors=5)
rnr = RadiusNeighborsRegressor(radius=1.0)
pls = PLSRegression(n_components=1)
gnb = GaussianNB()
mnb = MultinomialNB()
svl = SVR(kernel='linear')
svr = SVR()
las = Lasso()
en = ElasticNet()
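# A brief usage sketch (synthetic data; an illustration, not part of the original listing):
# unlike the regressors above, RandomTreesEmbedding is an unsupervised transformer and is
# used via fit_transform rather than fit/predict.
import numpy as np

_X_demo = np.random.RandomState(0).randn(100, 4)
_X_leaves = rte.fit_transform(_X_demo)   # sparse one-hot encoding of the leaves reached
print(_X_leaves.shape)                   # (100, total number of leaves across the trees)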
class RandomTreesEmbeddingTransformation(Transformer):
    def __init__(self,
                 n_estimators=10,
                 max_depth=5,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 min_weight_fraction_leaf=1.0,
                 max_leaf_nodes='None',
                 sparse_output=True,
                 bootstrap='False',
                 n_jobs=1,
                 random_state=None):
        super().__init__("random_trees_embedding", 18)
        self.input_type = [NUMERICAL, DISCRETE, CATEGORICAL]
        self.compound_mode = 'only_new'
        self.output_type = CATEGORICAL

        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.max_leaf_nodes = max_leaf_nodes
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.bootstrap = bootstrap
        self.sparse_output = sparse_output
        self.n_jobs = n_jobs
        self.random_state = random_state

    @ease_trans
    def operate(self, input_datanode: DataNode, target_fields=None):
        from sklearn.ensemble import RandomTreesEmbedding

        X, y = input_datanode.data
        if target_fields is None:
            target_fields = collect_fields(input_datanode.feature_types,
                                           self.input_type)
        X_new = X[:, target_fields]
        if not self.model:
            self.n_estimators = int(self.n_estimators)
            if check_none(self.max_depth):
                self.max_depth = None
            else:
                self.max_depth = int(self.max_depth)
            if X.shape[0] > 5000:
                self.max_depth = min(4, self.max_depth)
            self.min_samples_split = int(self.min_samples_split)
            self.min_samples_leaf = int(self.min_samples_leaf)
            if check_none(self.max_leaf_nodes):
                self.max_leaf_nodes = None
            else:
                self.max_leaf_nodes = int(self.max_leaf_nodes)
            self.min_weight_fraction_leaf = float(
                self.min_weight_fraction_leaf)
            self.bootstrap = check_for_bool(self.bootstrap)

            self.model = RandomTreesEmbedding(
                n_estimators=self.n_estimators,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                max_leaf_nodes=self.max_leaf_nodes,
                sparse_output=self.sparse_output,
                n_jobs=self.n_jobs,
                random_state=self.random_state)

            self.model.fit(X_new)

        _X = self.model.transform(X_new).toarray()

        return _X

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        n_estimators = UniformIntegerHyperparameter(name="n_estimators",
                                                    lower=10,
                                                    upper=100,
                                                    default_value=10)
        max_depth = UniformIntegerHyperparameter(name="max_depth",
                                                 lower=2,
                                                 upper=5,
                                                 default_value=5)
        min_samples_split = UniformIntegerHyperparameter(
            name="min_samples_split", lower=2, upper=20, default_value=2)
        min_samples_leaf = UniformIntegerHyperparameter(
            name="min_samples_leaf", lower=1, upper=20, default_value=1)
        min_weight_fraction_leaf = Constant('min_weight_fraction_leaf', 1.0)
        max_leaf_nodes = UnParametrizedHyperparameter(name="max_leaf_nodes",
                                                      value="None")
        bootstrap = CategoricalHyperparameter('bootstrap', ['True', 'False'])
        cs = ConfigurationSpace()
        cs.add_hyperparameters([
            n_estimators, max_depth, min_samples_split, min_samples_leaf,
            min_weight_fraction_leaf, max_leaf_nodes, bootstrap
        ])
        return cs
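# A hedged usage sketch, assuming the ConfigSpace package that provides the
# hyperparameter classes used above: the returned space can be sampled to obtain a
# concrete configuration for the transformer.
cs = RandomTreesEmbeddingTransformation.get_hyperparameter_search_space()
config = cs.sample_configuration()          # one random point in the search space
print(config)
# RandomTreesEmbeddingTransformation(**config.get_dictionary())  # illustrative only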
Example #41
0
class UnsupervisedVisualBagClassifier(Classifier):
    """
    ===============================
    UnsupervisedVisualBagClassifier
    ===============================
    1. Unsupervised
    2. Binary bag of words
    3. Totally random trees
    """

    def __init__(self, coordinator, base_classifier, n_estimators=10,
                 max_depth=5, min_samples_split=2, min_samples_leaf=1,
                 n_jobs=-1, random_state=None, verbose=0, min_density=None):
        Classifier.__init__(self, coordinator, base_classifier)
        self.histoSize = 0
        self._visualBagger = RandomTreesEmbedding(n_estimators=n_estimators,
                                                  max_depth=max_depth,
                                                  min_samples_split=min_samples_split,
                                                  min_samples_leaf=min_samples_leaf,
                                                  n_jobs=n_jobs,
                                                  random_state=random_state,
                                                  verbose=verbose,
                                                  min_density=min_density)


    def _preprocess(self, image_buffer, learningPhase):
        if learningPhase:
            self.setTask(1, "Extracting the features (model creation)")
        else:
            self.setTask(1, "Extracting the features (prediction)")

        X_pred, y = self._coord.process(image_buffer,
                                        learningPhase=learningPhase)

        y_user = self._convertLabel(y)

        #Cleaning up
        self._coord.clean(y)
        del y

        self.endTask()

        #Bag-of-word transformation
        self.setTask(1, "Transforming data into bag-of-words (Tree part)")

        X2 = None
        if learningPhase:
            X2 = self._visualBagger.fit_transform(X_pred, y_user)
            self.histoSize = X2.shape[1]
        else:
            X2 = self._visualBagger.transform(X_pred)

        #Cleaning up
        self._coord.clean(X_pred)
        del X_pred
        del y_user

        self.endTask()

        nbFactor = X2.shape[0] // len(image_buffer)

        if not sps.isspmatrix_csr(X2):
            X2 = X2.tocsr()

        if nbFactor == 1:
            return X2

        self.setTask(len(image_buffer), "Transforming data into bag-of-words (Histogram part)")
        nbTrees = self._visualBagger.n_estimators
        X3 = computeHistogram(len(image_buffer), nbFactor, nbTrees, X2)
        self.endTask()

        #Cleaning up
        del X2  # Should be useless

        return X3

    def fit_histogram(self, hist, y):
        #Delegating the classification
        self.setTask(1, "Learning the model")

        self._classifier.fit(hist, y)

        self.endTask()

        return self

    def fit(self, image_buffer):
        """
        Fits the data contained in the :class:`ImageBuffer` instance

        Parameters
        -----------
        image_buffer : :class:`ImageBuffer`
            The data to learn from

        Return
        -------
        self : :class:`Classifier`
            This instance
        """
        #Updating the labels
        y_user = image_buffer.getLabels()
        self._buildLUT(y_user)
        y = self._convertLabel(y_user)

        X = self._preprocess(image_buffer, learningPhase=True)

        return self.fit_histogram(X, y)

    def predict(self, image_buffer):
        """
        Classify the data contained in the :class:`ImageBuffer` instance

        Parameters
        -----------
        image_buffer : :class:`ImageBuffer`
            The data to classify

        Return
        -------
        list : list of int
            each entry is the classification label corresponding to the input
        """

        X = self._preprocess(image_buffer, learningPhase=False)
        y_classif = self._classifier.predict(X)
        return self._convertLabelsBackToUser(y_classif)

    def predict_proba(self, image_buffer):
        """
        Softly classify the data contained in the :class:`ImageBuffer`
        instance, i.e. yield a probability vector of belonging to each
        class

        Parameters
        -----------
        image_buffer : :class:`ImageBuffer`
            The data to classify

        Return
        -------
        list : list of list of float
            each entry is the probability vector of the input of the same
            index as computed by the base classifier
        """
        if not hasattr(self._classifier, "predict_proba"):
            #Early error
            self._classifier.predict_proba(np.zeros((1, 1)))

        X = self._preprocess(image_buffer, learningPhase=False)
        return self._classifier.predict_proba(X)
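# computeHistogram is defined elsewhere in this code base; the following is only a rough
# sketch (an assumption about its behaviour, not the original implementation) of the
# aggregation _preprocess relies on: summing the nbFactor consecutive leaf-indicator rows
# that belong to one image gives a bag-of-words histogram per image.
import numpy as np
import scipy.sparse as sps

def compute_histogram_sketch(nb_images, nb_factor, X2):
    # X2: csr matrix with nb_images * nb_factor rows of one-hot leaf indicators
    row = np.arange(nb_images * nb_factor) // nb_factor    # image index of each row
    col = np.arange(nb_images * nb_factor)
    agg = sps.csr_matrix((np.ones(nb_images * nb_factor), (row, col)),
                         shape=(nb_images, nb_images * nb_factor))
    return agg.dot(X2)   # shape (nb_images, histoSize): per-image leaf counts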
def random_forest_embedding():
	import numpy as np
	import matplotlib.pyplot as plt
	
	from sklearn.datasets import make_circles
	from sklearn.ensemble import RandomTreesEmbedding, ExtraTreesClassifier
	from sklearn.decomposition import TruncatedSVD
	from sklearn.naive_bayes import BernoulliNB
	
	#build the dataset
	X, y = make_circles(factor = 0.5, random_state = 0, noise = 0.05)
	
	#print y
	#print X.shape #X is 100 * 2, y is 100 * 1 (an array of 0s and 1s)
	
	
	#Transform data
	hasher = RandomTreesEmbedding(n_estimators = 10, random_state = 0, max_depth = 3) #set the parameters and build the model
	X_transformed = hasher.fit_transform(X)
	
	#print X_transformed[99]
	#print X_transformed.shape #100 * 74 ? the likely reason is stated below -- why does a high-dimensional sparse representation help classification?
	#RandomTreesEmbedding provides a way to map data to a very high-dimensional, 
	#sparse representation, which might be beneficial for classification. 
	
	pca = TruncatedSVD(n_components = 2)
	X_reduced = pca.fit_transform(X_transformed)
	
	#print X_reduced #here X_reduced is 100 * 2

	#Learn a Naive bayes classifier on the transformed data
	nb = BernoulliNB()
	nb.fit(X_transformed, y) #train with the high-dimensional sparse matrix and y
	
	#Learn a ExtraTreesClassifier for comparison
	trees = ExtraTreesClassifier(max_depth = 3, n_estimators = 10, random_state = 0)
	trees.fit(X, y) #here the original 2-d X and y are used for training
	
	#scatter plot of original and reduced data
	fig = plt.figure(figsize = (9, 8))
	ax = plt.subplot(221)
	ax.scatter(X[:, 0], X[:, 1], c = y, s = 50) #X[:, 0] is the x coordinate, X[:, 1] the y coordinate, y is the label
	ax.set_title("Original Data(2d)")
	ax.set_xticks(())
	ax.set_yticks(())
	
	ax = plt.subplot(222)
	#note that although X has been transformed, the corresponding labels are unchanged, so the labels can be used to assess the effect of the transform
	ax.scatter(X_reduced[:, 0], X_reduced[:, 1], c = y, s = 50) 
	ax.set_title("pca reduction (2d) of transformed data (%dd)" % X_transformed.shape[1]) 
	ax.set_xticks(())
	ax.set_yticks(())
	
	
	
	#Plot the decision in original space
	h = 0.01
	x_min, x_max = X[:, 0].min() - 0.5, X[:,0].max() + 0.5
	y_min, y_max = X[:, 1].min() - 0.5, X[:,1].max() + 0.5
	
	xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
	
	#transform grid using RandomTreesEmbedding
	#use nb to predict
	transformed_grid = hasher.transform(np.c_[xx.ravel(), yy.ravel()])
	y_grid_pred = nb.predict_proba(transformed_grid)[:, 1]
	
	
	ax = plt.subplot(223)
	ax.set_title("Naive Bayes on Transformed data")
	ax.pcolormesh(xx, yy, y_grid_pred.reshape(xx.shape))
	ax.scatter(X[:, 0], X[:, 1], c = y, s = 50) #X[:, 0] is the x coordinate, X[:, 1] the y coordinate, y is the label
	
	ax.set_ylim(-1.4, 1.4)
	ax.set_xlim(-1.4, 1.4)
	ax.set_xticks(())
	ax.set_yticks(())
	
	
	#transform grid using ExtraTreesClassifier
	#use trees to predict
	y_grid_pred = trees.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
	
	ax = plt.subplot(224)
	ax.set_title("ExtraTrees predictions")
	ax.pcolormesh(xx, yy, y_grid_pred.reshape(xx.shape))
	ax.scatter(X[:, 0], X[:, 1], c = y, s = 50) #X[:, 0] is the x coordinate, X[:, 1] the y coordinate, y is the label
	
	ax.set_ylim(-1.4, 1.4)
	ax.set_xlim(-1.4, 1.4)
	ax.set_xticks(())
	ax.set_yticks(())

	plt.tight_layout()
	plt.show()
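# A trivial usage note (illustrative, not from the original snippet): the helper above
# is self-contained and draws the four-panel comparison figure when called.
if __name__ == '__main__':
    random_forest_embedding()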
Example #43
0
def predefined_ops():
    '''return dict of user defined none-default instances of operators
    '''
    clean = {
        'clean':
        Cleaner(dtype_filter='not_datetime',
                na1='null',
                na2='mean',
                drop_uid=True),
        'cleanNA':
        Cleaner(dtype_filter='not_datetime', na1=None, na2=None),
        'cleanMean':
        Cleaner(dtype_filter='not_datetime', na1='most_frequent', na2='mean'),
        'cleanMn':
        Cleaner(dtype_filter='not_datetime', na1='missing', na2='mean'),
    }
    #
    encode = {
        'woe8': WoeEncoder(max_leaf_nodes=8),
        'woe5': WoeEncoder(max_leaf_nodes=5),
        'woeq8': WoeEncoder(q=8),
        'woeq5': WoeEncoder(q=5),
        'woeb5': WoeEncoder(bins=5),
        'woem': WoeEncoder(mono=True),
        'oht': OhtEncoder(),
        'ordi': OrdiEncoder(),

        # 'bin10': BinEncoder(bins=10, int_bins=True),  # 10 bin edges encoder
        # 'bin5': BinEncoder(bins=5, int_bins=True),  # 5 bin edges encoder
        # 'binm10': BinEncoder(max_leaf_nodes=10,
        #                      int_bins=True),  # 10 bin tree cut edges encoder
        # 'binm5': BinEncoder(max_leaf_nodes=5,
        #                     int_bins=True),  # 5 bin tree cut edges encoder
    }

    resample = {
        # over_sampling
        # under sampling controlled methods
        'runder':
        RandomUnderSampler(),
        'nearmiss':
        NearMiss(version=3),
        'pcart':
        InstanceHardnessThreshold(),
        # clean outliers
        'inlierForest':
        FunctionSampler(_outlier_rejection,
                        kw_args={
                            'method': 'IsolationForest',
                            'contamination': 0.1
                        }),
        'inlierLocal':
        FunctionSampler(_outlier_rejection,
                        kw_args={
                            'method': 'LocalOutlierFactor',
                            'contamination': 0.1
                        }),
        'inlierEllip':
        FunctionSampler(_outlier_rejection,
                        kw_args={
                            'method': 'EllipticEnvelope',
                            'contamination': 0.1
                        }),
        'inlierOsvm':
        FunctionSampler(_outlier_rejection,
                        kw_args={
                            'method': 'OneClassSVM',
                            'contamination': 0.1
                        }),
    }

    scale = {
        'stdscale': StandardScaler(),
        'minmax': MinMaxScaler(),
        'absmax': MaxAbsScaler(),
        'rscale': RobustScaler(quantile_range=(10, 90)),
        'quantile': QuantileTransformer(),  # uniform distribution
        'power': PowerTransformer(),  # Gaussian distribution
        'norm': Normalizer(),  # default L2 norm

        # scale sparse data
        'maxabs': MaxAbsScaler(),
        'stdscalesp': StandardScaler(with_mean=False),
    }
    # feature construction
    feature_c = {
        'pca': PCA(whiten=True),
        'spca': SparsePCA(n_jobs=-1),
        'ipca': IncrementalPCA(whiten=True),
        'kpca': KernelPCA(kernel='rbf', n_jobs=-1),
        'poly': PolynomialFeatures(degree=2),
        # kernel approximation
        'Nys': Nystroem(random_state=0),
        'rbf': RBFSampler(random_state=0),
        'rfembedding': RandomTreesEmbedding(n_estimators=10),
        'LDA': LinearDiscriminantAnalysis(),
        'QDA': QuadraticDiscriminantAnalysis(),
    }
    # select from model
    feature_m = {
        'fwoe':
        SelectFromModel(WoeEncoder(max_leaf_nodes=5)),
        'flog':
        SelectFromModel(LogisticRegression(penalty='l1', solver='saga',
                                           C=1e-2)),
        'fsgd':
        SelectFromModel(SGDClassifier(penalty="l1")),
        'fxgb':
        SelectFromModel(
            XGBClassifier(n_jobs=-1,
                          booster='gbtree',
                          max_depth=2,
                          n_estimators=50), ),
        'frf':
        SelectFromModel(ExtraTreesClassifier(n_estimators=50, max_depth=2)),

        # fixed number of features
        'fxgb20':
        SelectFromModel(XGBClassifier(n_jobs=-1, booster='gbtree'),
                        max_features=20),
        'frf20':
        SelectFromModel(ExtraTreesClassifier(n_estimators=100, max_depth=5),
                        max_features=20),
        'frf10':
        SelectFromModel(ExtraTreesClassifier(n_estimators=100, max_depth=5),
                        max_features=10),
        'fRFElog':
        RFE(LogisticRegression(penalty='l1', solver='saga', C=1e-2), step=0.1),
        'fRFExgb':
        RFE(XGBClassifier(n_jobs=-1, booster='gbtree'), step=0.1),
    }
    # Univariate feature selection
    feature_u = {
        'fchi2':
        GenericUnivariateSelect(chi2, 'percentile', 25),
        'fMutualclf':
        GenericUnivariateSelect(mutual_info_classif, 'percentile', 25),
        'fFclf':
        GenericUnivariateSelect(f_classif, 'percentile', 25),
    }

    imp = {
        "impXGB":
        XGBClassifier(n_jobs=-1,
                      booster='gbtree',
                      max_depth=2,
                      n_estimators=50),
        "impRF":
        ExtraTreesClassifier(n_estimators=100, max_depth=2)
    }

    instances = {}
    instances.update(**clean, **encode, **scale, **feature_c, **feature_m,
                     **feature_u, **resample, **imp)
    return instances
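# A hedged usage sketch (not part of the original function): the returned dict makes it
# easy to assemble pipelines by short names. Only sklearn-backed entries are used here;
# the custom operators (Cleaner, WoeEncoder, ...) come from elsewhere in this code base.
from sklearn.pipeline import Pipeline

ops = predefined_ops()
pipe = Pipeline([
    ('scale', ops['stdscale']),
    ('embed', ops['rfembedding']),
    ('select', ops['frf20']),
])
# pipe.fit(X, y) would scale the data, embed it with random trees and keep 20 features.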
Example #44
0
    --n_estimators=<n>    Number of trees in the forest [default: 10]
"""


import pandas as pd
import sys
import numpy as np
import pickle
from sklearn.ensemble import RandomTreesEmbedding
from docopt import docopt

arguments = docopt(__doc__)
input_path = arguments["<training_set>"]
n = int(arguments["--n_estimators"])
output_path = arguments["<mapper_path>"]

print "Reading Data"
data = pd.read_csv(input_path,header=None).values[:,1:]


print "Constructing Mapper"
mapper = RandomTreesEmbedding(n_estimators=n)
mapper.fit(data)

print "Saving Mapper to {}".format(output_path)
with open(output_path,"w") as f:
    cPickle.dump(mapper,f)
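# A companion sketch (not part of the original script; file names are illustrative):
# the pickled mapper can later be reloaded and applied to new data.
import pickle
import pandas as pd

with open("mapper.pkl", "rb") as f:
    mapper = pickle.load(f)
new_data = pd.read_csv("new_points.csv", header=None).values[:, 1:]
leaf_codes = mapper.transform(new_data)   # sparse one-hot encoding of the leaves reached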

    

Example #45
0
        return KMeans(n_clusters=2,
                      random_state=RandomState(17)).fit_predict(rdc_data)

    print_results("kmeans", labels, compute_kmeans(data))
    # print_results("kmeans+rdc", labels, compute_kmeans_rdc(data))

    tsne_embedding_data = TSNE(n_components=3,
                               verbose=10,
                               n_jobs=4,
                               random_state=17).fit_transform(data)
    print_results("tsne fast kmeans", labels,
                  compute_kmeans(tsne_embedding_data))

    tree_embedding_data = RandomTreesEmbedding(n_estimators=200,
                                               random_state=0,
                                               max_depth=5).fit_transform(data)
    print_results("tree kmeans", labels, compute_kmeans(tree_embedding_data))

    0 / 0
    srp_emb_data = random_projection.SparseRandomProjection(
        n_components=20, random_state=42).fit_transform(data)
    print_results("SparseRandomProjection kmeans", labels,
                  compute_kmeans(srp_emb_data))

    iso_emb_data = manifold.Isomap(30, n_components=2).fit_transform(data)
    print_results("iso kmeans", labels, compute_kmeans(iso_emb_data))

    # lle_emb_data = manifold.LocallyLinearEmbedding(10, n_components=2, method='ltsa').fit_transform(data)
    # print_results("lle kmeans", labels, compute_kmeans(lle_emb_data))
space with an ExtraTreesClassifier forests learned on the
original data.
"""
import pylab as pl
import numpy as np

from sklearn.datasets import make_circles
from sklearn.ensemble import RandomTreesEmbedding, ExtraTreesClassifier
from sklearn.decomposition import PCA
from sklearn.naive_bayes import BernoulliNB

# make a synthetic dataset
X, y = make_circles(factor=0.5, random_state=0, noise=0.05)

# use RandomTreesEmbedding to transform data
hasher = RandomTreesEmbedding(n_estimators=10, random_state=0, max_depth=3)
X_transformed = hasher.fit_transform(X)

# Visualize result using PCA
pca = PCA(n_components=2, svd_solver='randomized')
X_reduced = pca.fit_transform(X_transformed)

# Learn a Naive Bayes classifier on the transformed data
nb = BernoulliNB()
nb.fit(X_transformed, y)


# Learn an ExtraTreesClassifier for comparison
trees = ExtraTreesClassifier(max_depth=3, n_estimators=10, random_state=0)
trees.fit(X, y)
    tfs['grp'] = GaussianRandomProjection(n_components=nb_comp['grp'],
                                          eps=0.1,
                                          random_state=seed_tf)
    tfs['srp'] = SparseRandomProjection(n_components=nb_comp['srp'],
                                        dense_output=True,
                                        random_state=seed_tf)
    tfs['nmf'] = NMF(n_components=nb_comp['nmf'],
                     shuffle=True,
                     init='random',
                     random_state=seed_tf)
    #embedding
    trees, depth, leafs = 25, 8, 32  # depth 8 allows up to 2 ** 8 = 256 leaves; max_leaf_nodes caps this at 32
    embed = RandomTreesEmbedding(n_estimators=trees,
                                 max_depth=depth,
                                 max_leaf_nodes=leafs,
                                 min_samples_split=32,
                                 min_samples_leaf=8,
                                 sparse_output=False,
                                 n_jobs=-1,
                                 random_state=seed_val)

    #feats and data
    feats = list(
        set(df_train.columns.tolist()).difference([target, target_id]))
    train_X = df_train[feats]
    test_X = df_test[feats]

    #preds
    train_preds = pd.DataFrame()
    test_preds = pd.DataFrame()
    test_preds[target_id] = test_id
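    # A hedged continuation sketch (an assumption, not the original code; it presumes the
    # selected feature columns are numeric and NaN-free): the embedding defined above can
    # be fitted on the training split and applied to both splits as extra dense features.
    train_leaves = embed.fit_transform(train_X)
    test_leaves = embed.transform(test_X)
    print(train_leaves.shape, test_leaves.shape)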
Example #48
0
    def run(self):

        if not self.verify_data():
            print ("\x1b[31mERROR: training input data array shapes are incompatible!\x1b[0m")
            raise Exception("BadTrainingInputData")

        applyClassWeights = False
        if self.parameters['classifier'] == 'GradientBoostingClassifier':
            clf = GradientBoostingClassifier(
                    min_samples_leaf=self.parameters['min_samples_leaf'], 
                    max_depth=self.parameters['max_depth'], 
                    max_leaf_nodes=self.parameters['max_leaf_nodes'],
                    criterion=self.parameters['criterion'],
                    max_features=self.parameters['max_features'],
                    n_estimators=self.parameters['n_estimators'], 
                    learning_rate=self.parameters['learning_rate'], 
                    subsample=self.parameters['subsample'],
                    min_impurity_split=self.parameters['min_impurity_split'],
                )
            if self.parameters['class_weight'] == 'balanced':
                applyClassWeights = True
        elif self.parameters['classifier'] == 'RandomForestClassifier':
            clf = RandomForestClassifier(
                    min_samples_leaf=self.parameters['min_samples_leaf'], 
                    max_depth=self.parameters['max_depth'], 
                    max_leaf_nodes=self.parameters['max_leaf_nodes'],
                    criterion=self.parameters['criterion'],
                    max_features=self.parameters['max_features'],
                    n_estimators=self.parameters['n_estimators'], 
                    bootstrap=self.parameters['bootstrap'],
                )
            if self.parameters['class_weight'] == 'balanced':
                applyClassWeights = True
        elif self.parameters['classifier'] == 'ExtraTreesClassifier':
            clf = ExtraTreesClassifier(
                    min_samples_leaf=self.parameters['min_samples_leaf'], 
                    max_depth=self.parameters['max_depth'], 
                    max_leaf_nodes=self.parameters['max_leaf_nodes'],
                    criterion=self.parameters['criterion'],
                    max_features=self.parameters['max_features'],
                    n_estimators=self.parameters['n_estimators'], 
                    bootstrap=self.parameters['bootstrap'],
                )
            if self.parameters['class_weight'] == 'balanced':
                applyClassWeights = True
        elif self.parameters['classifier'] == 'FT_GradientBoostingClassifier':
            rt = RandomTreesEmbedding(max_depth=3, n_estimators=20, random_state=0)
            clf0 = GradientBoostingClassifier(
                    min_samples_leaf=self.parameters['min_samples_leaf'], 
                    max_depth=self.parameters['max_depth'], 
                    max_leaf_nodes=self.parameters['max_leaf_nodes'],
                    criterion=self.parameters['criterion'],
                    max_features=self.parameters['max_features'],
                    n_estimators=self.parameters['n_estimators'], 
                    learning_rate=self.parameters['learning_rate'], 
                    subsample=self.parameters['subsample'],
                    min_impurity_split=self.parameters['min_impurity_split'],
                )
            if self.parameters['class_weight'] == 'balanced':
                applyClassWeights = True
            clf = make_pipeline(rt, clf0)
        elif self.parameters['classifier'] == 'XGBClassifier':
            clf = XGBClassifier(
                    learning_rate=self.parameters['learning_rate'],
                    max_depth=self.parameters['max_depth'],
                    n_estimators=self.parameters['n_estimators'],
                    objective='binary:logitraw',
                    colsample_bytree=self.parameters['colsample_bytree'],
                    subsample=self.parameters['subsample'],
                    min_child_weight=self.parameters['min_child_weight'],
                    gamma=self.parameters['gamma'] if 'gamma' in self.parameters else 0.0,
                    #reg_alpha=8,
                    reg_lambda=self.parameters['reg_lambda'] if 'reg_lambda' in self.parameters else 1.0,
                    reg_alpha=self.parameters['reg_alpha'] if 'reg_alpha' in self.parameters else 0.0,
                    ) 
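            # objective='binary:logitraw' makes the booster output raw (pre-sigmoid) margins;
            # gamma / reg_lambda / reg_alpha fall back to the usual XGBoost defaults
            # (0.0 / 1.0 / 0.0) when they are not present in self.parameters.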
            if self.parameters['class_weight'] == 'balanced':
                applyClassWeights = True
        elif self.parameters['classifier'] == 'MLPClassifier':
            mlpParamKeys = ['solver', 'alpha', 'hidden_layer_sizes', 'max_iter', 'warm_start', 'learning_rate_init',
                            'learning_rate', 'momentum', 'epsilon', 'beta_1', 'beta_2', 'validation_fraction', 'early_stopping']
            classifierParams = {k: v for k, v in self.parameters.items() if k in mlpParamKeys}
            clf = MLPClassifier(**classifierParams)
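            # Only the keys listed above are forwarded to MLPClassifier; anything else in
            # self.parameters (e.g. the classifier name itself) would be rejected by its constructor.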
        elif self.parameters['classifier'] in ['SVC', 'LinearSVC']:
            '''
            clf = SVC(
                        C=1.0,
                        cache_size=4000,
                        class_weight='balanced',
                        coef0=0.0,
                        decision_function_shape='ovr',
                        degree=3,
                        gamma='auto',
                        kernel='rbf',
                        max_iter=100000,
                        probability=False,
                        random_state=None,
                        shrinking=True,
                        tol=0.001,
                        verbose=True
                    )
            '''
            bagged = int(self.parameters['bagged']) if 'bagged' in self.parameters else 0
            if self.parameters['classifier'] == 'LinearSVC':
                clf = LinearSVC(
                            class_weight='balanced',
                            dual=self.parameters['dual'],
                            max_iter=self.parameters['max_iter'],
                            C=self.parameters['C'],
                            penalty=self.parameters['penalty'],
                            loss=self.parameters['loss'],
                            tol=self.parameters['tol'],
                            verbose=True,
                        )
            else:
                # classifier='SVC':C=random.choice([1.0, 10.0, 100.0, 500.0, 1000.0]):kernel=random.choice(['rbf','poly','linear']):degree=random.choice([2,3,4]):gamma=random.choice(['auto', 0.1, 0.3, 0.6]):shrinking=random.choice([True, False]):max_iter=10000:penalty=random.choice(['l1','l2']):tol=random.choice([0.005, 0.001, 0.0005, 0.0001]):cache_size=1000
                clf = SVC(
                        C=self.parameters['C'],
                        cache_size=self.parameters['cache_size'],
                        class_weight='balanced',
                        coef0=0.0,
                        decision_function_shape='ovr',
                        degree=self.parameters['degree'],
                        gamma=self.parameters['gamma'],
                        kernel=self.parameters['kernel'],
                        max_iter=self.parameters['max_iter'],
                        probability=False,
                        random_state=None,
                        shrinking=self.parameters['shrinking'],
                        tol=self.parameters['tol'],
                        verbose=True
                    )

            if bagged:
                n_estimators = bagged
                if 'bag_oversampling' in self.parameters:
                    n_estimators = int(n_estimators * self.parameters['bag_oversampling'])

                clf0 = clf
                clf = BaggingClassifier(
                        clf0,
                        max_samples=1.0 / bagged,
                        max_features=self.parameters['baggedfeatures'] if 'baggedfeatures' in self.parameters else 1.0,
                        bootstrap_features=self.parameters['bootstrapfeatures'] if 'bootstrapfeatures' in self.parameters else False,
                        n_estimators=n_estimators,
                    )
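                # With max_samples=1.0/bagged each base SVC is fit on roughly 1/bagged of the training
                # events (optionally more estimators via bag_oversampling), which keeps the expensive
                # SVC training tractable while the ensemble as a whole still sees the full sample.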

        else:
            clf = AdaBoostClassifier(
                    DecisionTreeClassifier(
                        min_samples_leaf=self.parameters['min_samples_leaf'], 
                        max_depth=self.parameters['max_depth'], 
                        class_weight=self.parameters['class_weight'], 
                        criterion=self.parameters['criterion'],
                        splitter=self.parameters['splitter'],
                        max_features=self.parameters['max_features'],
                        ), 
                    n_estimators=self.parameters['n_estimators'], 
                    learning_rate=self.parameters['learning_rate'], 
                    algorithm=self.parameters['algorithm'],
                )

        #with open("/mnt/t3nfs01/data01/shome/berger_p2/VHbb/CMSSW_9_4_0_pre3/src/Xbb/python/logs_v25//test-scikit-svm/Logs//../cache/b7d92f50a52f8474e66cf4e2c3ad3fa4725aa489e7a6b288e4ed3855//clf2018-01-31_18-22-38_be9479a2.pkl","rb") as inputFile:
        #    clf = pickle.load(inputFile)

        # preprocessing
        print("transformation...")

        if 'scaler' in self.parameters:
            if self.parameters['scaler'] == 'standard':
                self.scaler = preprocessing.StandardScaler().fit(self.data['train']['X'])
            elif self.parameters['scaler'] == 'minmax':
                self.scaler = preprocessing.MinMaxScaler().fit(self.data['train']['X'])
            elif self.parameters['scaler'] == 'robust':
                self.scaler = preprocessing.RobustScaler().fit(self.data['train']['X'])
            else:
                self.scaler = None
        else:
            self.scaler = None

        if self.scaler:
            self.data['train']['X'] = self.scaler.transform(self.data['train']['X'])
            self.data['test']['X'] = self.scaler.transform(self.data['test']['X'])
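        # The scaler is fit on the training sample only and then applied to both training and test
        # data, so no information from the test sample leaks into the preprocessing.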

        # SHUFFLE all samples before
        self.shuffle = False
        if self.shuffle:
            print("shuffle input data...")
            for dataset in self.datasets:
                nSamples = self.data[dataset][self.varsets[0]].shape[0]
                randomPermutation = np.random.permutation(nSamples)
                for var in self.varsets:
                    self.data[dataset][var] = np.take(self.data[dataset][var], randomPermutation, axis=0)

        # LIMIT number of training samples
        # recommended to also shuffle samples before, because they are ordered by signal/background
        limitNumTrainingSamples = self.parameters['limit']
        if (limitNumTrainingSamples > 0):
            print("limit training samples to:", limitNumTrainingSamples)
            #for dataset in self.datasets:
            #    for var in self.varsets:
            #        self.data[dataset][var] = self.data[dataset][var][0:limitNumTrainingSamples]
            for dataset in self.datasets:
                self.data[dataset] = resample(self.data[dataset], n_samples=limitNumTrainingSamples, replace=False)

        # oversample
        upscale = self.parameters['upscalefactor'] if 'upscalefactor' in self.parameters else None
        if upscale:
            upscalemax = self.parameters['upscalemax'] if 'upscalemax' in self.parameters else 10
            upscalesignal = self.parameters['upscalefactorsignal'] if 'upscalefactorsignal' in self.parameters else 1.0
            indices = []
            for i in range(len(self.data['train']['sample_weight'])):
                x = self.data['train']['sample_weight'][i]
                if self.data['train']['y'][i] > 0.5:
                    x *= upscalesignal
                n = x * upscale
                # limit the oversampling factor per event
                if n > upscalemax:
                    n = upscalemax
                if n < 1:
                    n = 1
                intN = int(n)
                indices += [i] * intN
                #floatN = n-intN
                #if floatN > 0:
                #    if random.uniform(0.0,1.0) < floatN:
                #        indices += [i]

            self.data['train']['X'] = self.data['train']['X'][indices]
            self.data['train']['y'] = self.data['train']['y'][indices]
            self.data['train']['sample_weight'] = self.data['train']['sample_weight'][indices]
            self.verify_data()
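            # Each training event is now duplicated roughly sample_weight * upscalefactor times
            # (signal weights additionally scaled by upscalefactorsignal, clamped to [1, upscalemax]);
            # this emulates weighted training for classifiers that ignore sample_weight.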

        # BALANCE weights
        # calculate total weights and class_weights
        nSig = len([x for x in self.data['train']['y'] if x >= 0.5])
        nBkg = len([x for x in self.data['train']['y'] if x < 0.5])
        print("#SIG:", nSig)
        print("#BKG:", nBkg)
        weightsSignal = []
        weightsBackground = []
        for i in range(len(self.data['train']['sample_weight'])):
            if self.data['train']['y'][i] < 0.5:
                weightsBackground.append(self.data['train']['sample_weight'][i])
            else:
                weightsSignal.append(self.data['train']['sample_weight'][i])
        weightsSignal.sort()
        weightsBackground.sort()
        totalWeightSignal = sum(weightsSignal)
        totalWeightBackground = sum(weightsBackground)
        signalReweight = (totalWeightSignal+totalWeightBackground)/totalWeightSignal * self.parameters['additional_signal_weight']
        backgroundReweight = (totalWeightSignal+totalWeightBackground)/totalWeightBackground
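        # With these factors the summed background weight becomes W_S + W_B and the summed signal
        # weight becomes (W_S + W_B) * additional_signal_weight, i.e. both classes carry equal total
        # weight unless an extra signal weight is requested.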
        print("SUM of weights for signal:", totalWeightSignal)
        print("SUM of weights for background:", totalWeightBackground)
        
        if applyClassWeights:
            print("re-weight signals by:", signalReweight)
            print("re-weight background by:", backgroundReweight)
            for i in range(len(self.data['train']['sample_weight'])):
                if self.data['train']['y'][i] < 0.5:
                    self.data['train']['sample_weight'][i] *= backgroundReweight
                else:
                    self.data['train']['sample_weight'][i] *= signalReweight
        else:
            print("DO NOT re-weight signals by:", signalReweight)
        print("...")
        # TRAINING

        learningCurve = []
        if self.parameters['classifier'] == 'XGBClassifier':
            clf = clf.fit(self.data['train']['X'], self.data['train']['y'], self.data['train']['sample_weight'], verbose=True)
        else:
            try:
                clf = clf.fit(**self.data['train'])
            except Exception:
                clf = clf.fit(X=self.data['train']['X'], y=self.data['train']['y'])
                
                if 'rounds' in self.parameters and self.parameters['rounds'] > 1:
                    for rNumber in range(self.parameters['rounds']):
                        results = clf.predict_proba(self.data['test']['X']) 
                        auc1 = roc_auc_score(self.data['test']['y'], results[:,1], sample_weight=self.data['test']['sample_weight'])
                        print(" round ", rNumber, " AUC=", auc1)
                        learningCurve.append(auc1)
                        clf = clf.fit(X=self.data['train']['X'], y=self.data['train']['y'])

        print("***** FIT done")

        # TEST
        try:
            results = clf.decision_function(self.data['test']['X'])
            print("***** EVALUATION on test sample done")
            results_train = clf.decision_function(self.data['train']['X'])
            print("***** EVALUATION on training sample done")

            print("R:", results.shape, results)

            results = np.c_[np.ones(results.shape[0]), results]
            results_train = np.c_[np.ones(results_train.shape[0]), results_train]
        except Exception:
            results = clf.predict_proba(self.data['test']['X'])
            results_train = clf.predict_proba(self.data['train']['X'])
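        # decision_function returns a 1-D score, so a dummy first column is prepended above to mimic
        # the (n_samples, 2) layout of predict_proba; column 1 is used as the score in both cases.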

        # ROC curve
        print("calculating auc...")
        auc1 = roc_auc_score(self.data['test']['y'], results[:,1], sample_weight=self.data['test']['sample_weight'])
        auc_training = roc_auc_score(self.data['train']['y'], results_train[:,1], sample_weight=self.data['train']['sample_weight'])
        print("AUC:", auc1, " (training:", auc_training, ")")

        print("**** compute quantiles")
        qx = np.array([0.01, 0.99])
        qy = np.array([0.0, 0.0])
        thq = ROOT.TH1D("quant","quant",500000,-5.0,5.0)
        nS = len(results)
        for i in range(nS):
            thq.Fill(results[i][1])
        thq.GetQuantiles(2, qy, qx)
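        # TH1::GetQuantiles fills qy with the score values at the probabilities given in qx, i.e. the
        # 1% and 99% quantiles of the test score distribution; these are used below to clip outliers
        # before rescaling the score to [0, 1].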

        # rescaling of SCORE to [0, 1]
        minProb = 2.0
        maxProb = -1.0
        #for i in range(len(self.data['train']['X'])):
        #    if results_train[i][1] > maxProb:
        #        maxProb = results_train[i][1]
        #    if results_train[i][1] < minProb:
        #        minProb = results_train[i][1]
        #for i in range(len(self.data['test']['X'])):
        #    if results[i][1] > maxProb:
        #        maxProb = results[i][1]
        #    if results[i][1] < minProb:
        #        minProb = results[i][1]

        minProb = qy[0]
        maxProb = qy[1]
        delta = maxProb-minProb
        minProb -= delta * 0.01
        maxProb += delta * 0.10

        useSqrt = False

        # fill TRAINING SCORE histogram (class probability)
        h1t = ROOT.TH1D("h1t","h1t",50,0.0,1.0)
        h2t = ROOT.TH1D("h2t","h2t",50,0.0,1.0)
        for i in range(len(self.data['train']['X'])):
            result = (results_train[i][1]-minProb)/(maxProb-minProb)
Example #49
0
# LAPLACIAN EIGENMAP
print("Performing Laplacian Eigenmap (Spectral Embedding) ...")
plt.subplot(335)
model = SpectralEmbedding(n_components=2, n_neighbors=50)
se = model.fit_transform(X)
plt.scatter(se[:, 0], se[:, 1], c=Y, cmap='viridis', s=1)
plt.title('Laplacian Eigenmap')
#plt.colorbar()
plt.xlabel("LAP1")
plt.ylabel("LAP2")

# RANDOM FOREST EMBEDDING
print("Performing Random Forest Embedding (RFE) ...")
plt.subplot(336)
hasher = RandomTreesEmbedding(n_estimators=200, random_state=1, max_depth=5)
X_transformed = hasher.fit_transform(X)
model = TruncatedSVD(n_components=2)
svd = model.fit_transform(X_transformed)
plt.scatter(svd[:, 0], svd[:, 1], c=Y, cmap='viridis', s=1)
plt.title('Random Forest Embedding (RFE)')
#plt.colorbar()
plt.xlabel("RFE1")
plt.ylabel("RFE2")

# T-DISTRIBUTED STOCHASTIC NEIGHBOR EMBEDDING (tSNE)
print("Performing T-Distributed Stochastic Neighbor Embedding (tSNE) ...")
plt.subplot(337)
model = TSNE(learning_rate=10, n_components=2, random_state=123, perplexity=30)
tsne = model.fit_transform(X)
plt.scatter(tsne[:, 0], tsne[:, 1], c=Y, cmap='viridis', s=1)