def transform(self, X):
    """Transform data X according to the fitted model.

    Parameters
    ----------
    X : array-like or sparse matrix, shape=(n_samples, n_features)
        Document word matrix.

    Returns
    -------
    doc_topic_distr : shape=(n_samples, n_topics)
        Unnormalized document topic distribution for X.
    """
    X = self._check_inference(X, "HierarchicalDirichletProcess.transform")
    n_jobs = _get_n_jobs(self.n_jobs)
    verbose = max(0, self.verbose - 1)
    with Parallel(n_jobs=n_jobs, verbose=verbose) as parallel:
        doc_topic_distr, _, _ = self._e_step(X,
                                             cal_sstats=False,
                                             cal_doc_distr=True,
                                             cal_likelihood=False,
                                             parallel=parallel)
    return doc_topic_distr

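# Usage sketch (not part of the original class): `transform` returns an
# *unnormalized* document-topic matrix, so a caller who wants per-document
# topic proportions can renormalize each row to sum to one, as the small
# helper below does. The helper name is illustrative only.
import numpy as np


def _normalize_doc_topic(doc_topic_distr):
    """Row-normalize an unnormalized document-topic matrix."""
    return doc_topic_distr / doc_topic_distr.sum(axis=1, keepdims=True)
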
def _e_step(self, X, cal_sstats, random_init, parallel=None):
    """E-step in EM update.

    Parameters
    ----------
    X : array-like or sparse matrix, shape=(n_samples, n_features)
        Document word matrix.

    cal_sstats : boolean
        Parameter that indicates whether to calculate sufficient statistics
        or not. Set ``cal_sstats`` to True when we need to run the M-step.

    random_init : boolean
        Parameter that indicates whether to initialize the document topic
        distribution randomly in the E-step. Set it to True in training
        steps.

    parallel : joblib.Parallel (optional)
        Pre-initialized instance of joblib.Parallel.

    Returns
    -------
    (doc_topic_distr, suff_stats) :
        `doc_topic_distr` is the unnormalized topic distribution for each
        document. In the literature, this is called `gamma`.
        `suff_stats` is the expected sufficient statistics for the M-step.
        When `cal_sstats == False`, it will be None.
    """
    # Run e-step in parallel
    random_state = self.random_state_ if random_init else None

    # TODO: make Parallel._effective_n_jobs public instead?
    n_jobs = _get_n_jobs(self.n_jobs)
    if parallel is None:
        parallel = Parallel(n_jobs=n_jobs,
                            verbose=max(0, self.verbose - 1))
    results = parallel(
        delayed(_update_doc_distribution)(X[idx_slice, :],
                                          self.exp_dirichlet_component_,
                                          self.doc_topic_prior_,
                                          self.max_doc_update_iter,
                                          self.mean_change_tol, cal_sstats,
                                          random_state)
        for idx_slice in gen_even_slices(X.shape[0], n_jobs))

    # merge result
    doc_topics, sstats_list = zip(*results)
    doc_topic_distr = np.vstack(doc_topics)

    if cal_sstats:
        # This step finishes computing the sufficient statistics for the
        # M-step.
        suff_stats = np.zeros(self.components_.shape)
        for sstats in sstats_list:
            suff_stats += sstats
        suff_stats *= self.exp_dirichlet_component_
    else:
        suff_stats = None

    return (doc_topic_distr, suff_stats)

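# A minimal, standalone sketch of the parallel pattern used in `_e_step`
# above: split the rows of X into even slices, process each slice in a
# joblib worker, and stack the per-slice results back together.
# `_row_sums` is a stand-in for `_update_doc_distribution`, not part of the
# original code.
import numpy as np
from joblib import Parallel, delayed
from sklearn.utils import gen_even_slices


def _row_sums(X_slice):
    # placeholder for the real per-slice work
    return X_slice.sum(axis=1)


def _parallel_slices_demo(X, n_jobs=2):
    results = Parallel(n_jobs=n_jobs)(
        delayed(_row_sums)(X[idx_slice, :])
        for idx_slice in gen_even_slices(X.shape[0], n_jobs))
    return np.concatenate(results)
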
def _partition_estimators(n_estimators, n_jobs):
    """Private function used to partition estimators between jobs."""
    # Compute the number of jobs
    n_jobs = min(_get_n_jobs(n_jobs), n_estimators)

    # Partition estimators between jobs
    n_estimators_per_job = (n_estimators // n_jobs) * np.ones(n_jobs,
                                                              dtype=np.int)
    n_estimators_per_job[:n_estimators % n_jobs] += 1
    starts = np.cumsum(n_estimators_per_job)

    return n_jobs, n_estimators_per_job.tolist(), [0] + starts.tolist()

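# Worked example (standalone, not part of the original module): partitioning
# 10 estimators across 3 jobs gives counts [4, 3, 3] and start offsets
# [0, 4, 7, 10], so job i trains estimators starts[i]:starts[i + 1].
import numpy as np


def _partition_demo(n_estimators=10, n_jobs=3):
    n_per_job = (n_estimators // n_jobs) * np.ones(n_jobs, dtype=int)
    n_per_job[:n_estimators % n_jobs] += 1
    starts = [0] + np.cumsum(n_per_job).tolist()
    return n_per_job.tolist(), starts   # ([4, 3, 3], [0, 4, 7, 10])
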
def _approximate_bound(self, X):
    """Calculate approximate log-likelihood for the model.

    Parameters
    ----------
    X : array-like or sparse matrix, shape=(n_samples, n_features)
        Document word matrix.

    Returns
    -------
    likelihood : float
        Approximate log-likelihood for the variational parameters.
    """
    likelihood = 0.0

    # calculate doc likelihood
    n_jobs = _get_n_jobs(self.n_jobs)
    verbose = max(0, self.verbose - 1)
    with Parallel(n_jobs=n_jobs, verbose=verbose) as parallel:
        _, _, doc_likelihood = self._e_step(X,
                                            cal_sstats=False,
                                            cal_doc_distr=False,
                                            cal_likelihood=True,
                                            parallel=parallel)
    likelihood += doc_likelihood

    # E[log(p(beta|eta)) - log(q(beta|lambda))]
    # `beta` is a Dirichlet distribution
    lambda_ = self.lambda_
    elog_beta_ = self.elog_beta_
    n_features = lambda_.shape[1]

    likelihood += np.sum((self.eta - lambda_) * elog_beta_)
    likelihood += np.sum(gammaln(lambda_) - gammaln(self.eta))
    likelihood += np.sum(gammaln(self.eta * n_features) -
                         gammaln(np.sum(lambda_, 1)))

    # E[log(p(v_k|omega)) - log(q(v_k|a_k))]
    # `v_k` is a Beta distribution
    v_k = self.v_stick_
    likelihood += (v_k.shape[1] * np.log(self.omega))
    v_k_col_sum = np.sum(v_k, 0)
    dig_sum = psi(v_k_col_sum)
    likelihood += np.sum(
        (np.array([1.0, self.omega])[:, np.newaxis] - v_k) *
        (psi(v_k) - dig_sum))
    likelihood += np.sum(gammaln(v_k))
    likelihood -= np.sum(gammaln(v_k_col_sum))

    return likelihood

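# A standalone sketch of the Dirichlet part of the bound above,
# E[log p(beta|eta)] - E[log q(beta|lambda)], mirroring the lambda_/eta terms
# in `_approximate_bound`. The symmetric prior `eta` and the variational
# parameters `lambda_` passed in here are illustrative assumptions; the
# expectation E[log beta] is the usual psi-based Dirichlet expectation.
import numpy as np
from scipy.special import gammaln, psi


def _dirichlet_bound_term(lambda_, eta):
    n_features = lambda_.shape[1]
    elog_beta = psi(lambda_) - psi(np.sum(lambda_, 1))[:, np.newaxis]
    term = np.sum((eta - lambda_) * elog_beta)
    term += np.sum(gammaln(lambda_) - gammaln(eta))
    term += np.sum(gammaln(eta * n_features) - gammaln(np.sum(lambda_, 1)))
    return term
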
def get_neighbors(self, X=None, n_neighbors=None, return_distance=True):
    """Query ``self._tree`` for the nearest neighbors of each row of X,
    splitting the query across threads."""
    if n_neighbors is None:
        n_neighbors = self.n_neighbors
    query_is_train = False
    X = check_array(X, accept_sparse='csr')

    train_size = self._fit_X.shape[0]
    n_samples, _ = X.shape
    sample_range = np.arange(n_samples)[:, None]

    n_jobs = _get_n_jobs(self.n_jobs)
    result = Parallel(n_jobs, backend='threading')(
        delayed(self._tree.query, check_pickle=False)(
            X[s], n_neighbors, return_distance)
        for s in gen_even_slices(X.shape[0], n_jobs)
    )

    if return_distance:
        dist, neigh_ind = tuple(zip(*result))
        result = np.vstack(dist), np.vstack(neigh_ind)
    else:
        result = np.vstack(result)

    return result

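# Standalone sketch of the slice-and-query pattern used in `get_neighbors`:
# a KDTree is queried per slice of the query matrix under the threading
# backend, and the per-slice (distances, indices) pairs are stacked back
# together. Everything below is illustrative, not part of the original class.
import numpy as np
from joblib import Parallel, delayed
from sklearn.neighbors import KDTree
from sklearn.utils import gen_even_slices


def _parallel_tree_query(X_train, X_query, n_neighbors=3, n_jobs=2):
    tree = KDTree(X_train)
    results = Parallel(n_jobs=n_jobs, backend='threading')(
        delayed(tree.query)(X_query[s], k=n_neighbors, return_distance=True)
        for s in gen_even_slices(X_query.shape[0], n_jobs))
    dist, ind = zip(*results)
    return np.vstack(dist), np.vstack(ind)
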
def partial_fit(self, X, y=None):
    """Online VB with Mini-Batch update.

    Parameters
    ----------
    X : array-like or sparse matrix, shape=(n_samples, n_features)
        Document word matrix.

    y : Ignored.

    Returns
    -------
    self
    """
    self._check_params()
    X = self._check_non_neg_array(X,
                                  "LatentDirichletAllocation.partial_fit")
    n_samples, n_features = X.shape
    batch_size = self.batch_size

    # initialize parameters or check
    if not hasattr(self, 'components_'):
        self._init_latent_vars(n_features)

    if n_features != self.components_.shape[1]:
        raise ValueError(
            "The provided data has %d dimensions while "
            "the model was trained with feature size %d." %
            (n_features, self.components_.shape[1]))

    n_jobs = _get_n_jobs(self.n_jobs)
    with Parallel(n_jobs=n_jobs,
                  verbose=max(0, self.verbose - 1)) as parallel:
        for idx_slice in gen_batches(n_samples, batch_size):
            self._em_step(X[idx_slice, :],
                          total_samples=self.total_samples,
                          batch_update=False,
                          parallel=parallel)

    return self

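# Standalone illustration of the mini-batch slicing used in `partial_fit`:
# `gen_batches` yields contiguous slices of at most `batch_size` rows, so the
# online E/M updates see the corpus one mini-batch at a time.
from sklearn.utils import gen_batches

# e.g. 7 documents in batches of 3 ->
# [slice(0, 3, None), slice(3, 6, None), slice(6, 7, None)]
batches = list(gen_batches(7, 3))
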
def fit(self, X, y=None):
    """Learn model for the data X.

    Parameters
    ----------
    X : array-like or sparse matrix, shape=(n_samples, n_features)
        Document word matrix.

    Returns
    -------
    self
    """
    self._check_params()
    X = self._check_non_neg_array(
        X, "HierarchicalDirichletProcess.fit")
    self._init_global_latent_vars(*X.shape)

    n_jobs = _get_n_jobs(self.n_jobs)
    verbose = max(0, self.verbose - 1)
    evaluate_every = self.evaluate_every
    with Parallel(n_jobs=n_jobs, verbose=verbose) as parallel:
        for i in xrange(self.max_iter):
            # batch update
            _, sstats, _ = self._e_step(X,
                                        cal_sstats=True,
                                        cal_doc_distr=False,
                                        cal_likelihood=False,
                                        parallel=parallel)
            self._m_step(sstats, n_samples=X.shape[0],
                         online_update=False)

            # check perplexity
            if evaluate_every > 0 and (i + 1) % evaluate_every == 0:
                bound = self.score(X)
                if self.verbose:
                    print('iteration: %d, ELBO: %.4f' % (i + 1, bound))
            self.n_iter_ += 1
    return self

def fit(self, X, y=None):
    """Learn model for the data X with variational Bayes method.

    When `learning_method` is 'online', use mini-batch update.
    Otherwise, use batch update.

    Parameters
    ----------
    X : array-like or sparse matrix, shape=(n_samples, n_features)
        Document word matrix.

    y : Ignored.

    Returns
    -------
    self
    """
    self._check_params()
    X = self._check_non_neg_array(X, "LatentDirichletAllocation.fit")
    n_samples, n_features = X.shape
    max_iter = self.max_iter
    evaluate_every = self.evaluate_every
    learning_method = self.learning_method
    if learning_method is None:
        warnings.warn("The default value for 'learning_method' will be "
                      "changed from 'online' to 'batch' in the release "
                      "0.20. This warning was introduced in 0.18.",
                      DeprecationWarning)
        learning_method = 'online'

    batch_size = self.batch_size

    # initialize parameters
    self._init_latent_vars(n_features)
    # change to perplexity later
    last_bound = None
    n_jobs = _get_n_jobs(self.n_jobs)
    with Parallel(n_jobs=n_jobs,
                  verbose=max(0, self.verbose - 1)) as parallel:
        for i in xrange(max_iter):
            if learning_method == 'online':
                for idx_slice in gen_batches(n_samples, batch_size):
                    self._em_step(X[idx_slice, :],
                                  total_samples=n_samples,
                                  batch_update=False,
                                  parallel=parallel)
            else:
                # batch update
                self._em_step(X, total_samples=n_samples,
                              batch_update=True, parallel=parallel)

            # check perplexity
            if evaluate_every > 0 and (i + 1) % evaluate_every == 0:
                doc_topics_distr, _ = self._e_step(X, cal_sstats=False,
                                                   random_init=False,
                                                   parallel=parallel)
                bound = self._perplexity_precomp_distr(X, doc_topics_distr,
                                                       sub_sampling=False)
                if self.verbose:
                    print('iteration: %d of max_iter: %d, perplexity: %.4f'
                          % (i + 1, max_iter, bound))

                if last_bound and abs(last_bound - bound) < self.perp_tol:
                    break
                last_bound = bound
            elif self.verbose:
                print('iteration: %d of max_iter: %d' % (i + 1, max_iter))
            self.n_iter_ += 1

        # calculate final perplexity value on train set
        doc_topics_distr, _ = self._e_step(X, cal_sstats=False,
                                           random_init=False,
                                           parallel=parallel)

    self.bound_ = self._perplexity_precomp_distr(X, doc_topics_distr,
                                                 sub_sampling=False)

    return self

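# Usage sketch with the public scikit-learn estimator that this fit loop
# mirrors. Parameter names such as `n_components` follow the released
# sklearn API and may differ from this in-progress code; the toy count
# matrix is made up for illustration.
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation

rng = np.random.RandomState(0)
X_counts = rng.randint(0, 5, size=(20, 30))    # toy document-word matrix
lda = LatentDirichletAllocation(n_components=5, learning_method='batch',
                                max_iter=10, evaluate_every=2,
                                random_state=0)
lda.fit(X_counts)
train_perplexity = lda.perplexity(X_counts)
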
def partial_fit(self, X, y=None):
    """Online VB with Mini-Batch update.

    Parameters
    ----------
    X : array-like or sparse matrix, shape=(n_samples, n_features)
        Document word matrix.

    Returns
    -------
    doc_topic_distr : shape=(n_samples, n_topics)
        Document topic distribution for X.
    """
    self._check_params()
    X = self._check_non_neg_array(X,
                                  "LatentDirichletAllocation.partial_fit")
    n_samples, n_features = X.shape
    batch_size = self.batch_size
    self.total_samples += n_samples

    # the model must already be initialized (e.g. by a previous `fit`)
    if not hasattr(self, 'components_'):
        raise ValueError("partial_fit called before the model was "
                         "initialized; `components_` is not set.")
        # self._init_latent_vars(n_features)

    if n_features != self.components_.shape[1]:
        raise ValueError(
            "The provided data has %d dimensions while "
            "the model was trained with feature size %d." %
            (n_features, self.components_.shape[1]))

    n_jobs = _get_n_jobs(self.n_jobs)
    max_iter = self.partial_max_iter
    evaluate_every = self.partial_evaluate_every
    self.n_partial_iter_ = 0
    last_bound = None
    doc_topic_distr = None
    with Parallel(n_jobs=n_jobs,
                  verbose=max(0, self.verbose - 1)) as parallel:
        for i in xrange(max_iter):
            for idx_slice in gen_batches(n_samples, batch_size):
                self._em_step(X[idx_slice, :],
                              total_samples=self.total_samples,
                              batch_update=False,
                              parallel=parallel)

            # check perplexity
            if evaluate_every > 0 and (i + 1) % evaluate_every == 0:
                doc_topic_distr, _ = self._e_step(X, cal_sstats=False,
                                                  random_init=False,
                                                  parallel=parallel)
                bound = self.perplexity(X, doc_topic_distr,
                                        sub_sampling=False)
                if self.verbose:
                    print('iteration: %d, perplexity: %.4f'
                          % (i + 1, bound))

                if last_bound and abs(last_bound - bound) < self.perp_tol:
                    break
                last_bound = bound
            self.n_partial_iter_ += 1

    if doc_topic_distr is None:
        doc_topic_distr = self.transform(X)
    else:
        doc_topic_distr /= doc_topic_distr.sum(axis=1)[:, np.newaxis]

    return doc_topic_distr

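# Usage sketch of incremental (online) updates with the released scikit-learn
# estimator, whose `partial_fit` returns `self` rather than a document-topic
# matrix; the streaming loop below is illustrative only.
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.utils import gen_batches

rng = np.random.RandomState(0)
X_stream = rng.randint(0, 5, size=(60, 30))
online_lda = LatentDirichletAllocation(n_components=5, batch_size=16,
                                       random_state=0)
for batch in gen_batches(X_stream.shape[0], 16):
    online_lda.partial_fit(X_stream[batch])
doc_topic = online_lda.transform(X_stream)   # document-topic matrix
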
def kneighbors(self, X=None, E=None, n_neighbors=None,
               return_distance=True):
    # IY: modified to account for the standard deviation
    """Finds the K-neighbors of a point.

    Returns indices of and distances to the neighbors of each point.

    Parameters
    ----------
    X : array-like, shape (n_query, n_features), \
            or (n_query, n_indexed) if metric == 'precomputed'
        The query point or points.
        If not provided, neighbors of each indexed point are returned.
        In this case, the query point is not considered its own neighbor.

    n_neighbors : int
        Number of neighbors to get (default is the value
        passed to the constructor).

    return_distance : boolean, optional. Defaults to True.
        If False, distances will not be returned.

    Returns
    -------
    dist : array
        Array representing the lengths to points, only present if
        return_distance=True

    ind : array
        Indices of the nearest points in the population matrix.

    Examples
    --------
    In the following example, we construct a NeighborsClassifier
    class from an array representing our data set and ask who's
    the closest point to [1, 1, 1]

    >>> samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]]
    >>> from sklearn.neighbors import NearestNeighbors
    >>> neigh = NearestNeighbors(n_neighbors=1)
    >>> neigh.fit(samples)  # doctest: +ELLIPSIS
    NearestNeighbors(algorithm='auto', leaf_size=30, ...)
    >>> print(neigh.kneighbors([[1., 1., 1.]]))  # doctest: +ELLIPSIS
    (array([[ 0.5]]), array([[2]]...))

    As you can see, it returns [[0.5]], and [[2]], which means that the
    element is at distance 0.5 and is the third element of samples
    (indexes start at 0). You can also query for multiple points:

    >>> X = [[0., 1., 0.], [1., 0., 1.]]
    >>> neigh.kneighbors(X, return_distance=False)  # doctest: +ELLIPSIS
    array([[1],
           [2]]...)

    """
    if self._fit_method is None:  # IY: usually in SOMPY this is set to 'auto'
        raise NotFittedError("Must fit neighbors before querying.")

    if n_neighbors is None:
        n_neighbors = self.n_neighbors

    if X is not None:
        query_is_train = False
        X = check_array(X, accept_sparse='csr')
    else:
        query_is_train = True
        X = self._fit_X
        # Include an extra neighbor to account for the sample itself being
        # returned, which is removed later.  Results don't consider the
        # test point itself as a neighbor.
        n_neighbors += 1

    train_size = self._fit_X.shape[0]
    if n_neighbors > train_size:
        raise ValueError(
            "Expected n_neighbors <= n_samples, "
            " but n_samples = %d, n_neighbors = %d" %
            (train_size, n_neighbors))
    n_samples, _ = X.shape
    sample_range = np.arange(n_samples)[:, None]

    n_jobs = _get_n_jobs(self.n_jobs)  # IY: single core at the moment
    if self._fit_method == 'brute':
        # for efficiency, use squared euclidean distances
        if self.effective_metric_ == 'euclidean':
            dist = pairwise_chidistances(X, self._fit_X, 'euclidean',  # IY
                                         n_jobs=n_jobs, sigma=E,
                                         squared=True)
        else:
            raise ValueError("kneighbor for project_realdata() works only"
                             " with euclidean metric")

        neigh_ind = np.argpartition(dist, n_neighbors - 1, axis=1)
        neigh_ind = neigh_ind[:, :n_neighbors]
        # argpartition doesn't guarantee sorted order, so we sort again
        neigh_ind = neigh_ind[
            sample_range, np.argsort(dist[sample_range, neigh_ind])]

        if return_distance:
            result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind
        else:
            result = neigh_ind

    # IY: only brute force at the moment...
    # elif self._fit_method in ['ball_tree', 'kd_tree']:
    #     if issparse(X):
    #         raise ValueError(
    #             "%s does not work with sparse matrices. Densify the data, "
    #             "or set algorithm='brute'" % self._fit_method)
    #     result = Parallel(n_jobs, backend='threading')(
    #         delayed(self._tree.query, check_pickle=False)(
    #             X[s], n_neighbors, return_distance)
    #         for s in gen_even_slices(X.shape[0], n_jobs)
    #     )
    #     if return_distance:
    #         dist, neigh_ind = tuple(zip(*result))
    #         result = np.vstack(dist), np.vstack(neigh_ind)
    #     else:
    #         result = np.vstack(result)
    else:
        raise ValueError(
            "only brute force algorithm accepted as _fit_method")

    if not query_is_train:
        return result
    else:
        # If the query data is the same as the indexed data, we would like
        # to ignore the first nearest neighbor of every sample, i.e
        # the sample itself.
        if return_distance:
            dist, neigh_ind = result
        else:
            neigh_ind = result

        sample_mask = neigh_ind != sample_range

        # Corner case: When the number of duplicates are more
        # than the number of neighbors, the first NN will not
        # be the sample, but a duplicate.
        # In that case mask the first duplicate.
        dup_gr_nbrs = np.all(sample_mask, axis=1)
        sample_mask[:, 0][dup_gr_nbrs] = False

        neigh_ind = np.reshape(
            neigh_ind[sample_mask], (n_samples, n_neighbors - 1))

        if return_distance:
            dist = np.reshape(
                dist[sample_mask], (n_samples, n_neighbors - 1))
            return dist, neigh_ind
        return neigh_ind

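# Standalone sketch of the brute-force selection used above: compute squared
# euclidean distances, take the k smallest per row with `argpartition`, then
# sort just those k columns (argpartition alone does not return them in
# order). The custom `pairwise_chidistances`/`sigma` weighting from the
# original code is not reproduced here.
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances


def _brute_force_kneighbors(X_query, X_train, n_neighbors=3):
    dist = euclidean_distances(X_query, X_train, squared=True)
    sample_range = np.arange(X_query.shape[0])[:, None]
    neigh_ind = np.argpartition(dist, n_neighbors - 1,
                                axis=1)[:, :n_neighbors]
    neigh_ind = neigh_ind[sample_range,
                          np.argsort(dist[sample_range, neigh_ind])]
    return np.sqrt(dist[sample_range, neigh_ind]), neigh_ind
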