import numpy as np
from tqdm import trange

# Note: helper routines referenced below (get_random_state, speckled_mask,
# rand_orth, soft_kmeans_em, and the _poiss_cd_update* functions) are
# assumed to be defined elsewhere in this package.


def bicv_scores(
        model, X, fit_params=None, strategy="speckled",
        heldout_frac=0.1, n_repeats=10, seed=None):
    """
    Estimate train and test error for a model by bi-cross-validation.
    """

    # Initialize dictionary for fit keyword args.
    if fit_params is None:
        fit_params = dict()

    m, n = X.shape

    # Initialize random number generator.
    rs = get_random_state(seed)

    # Allocate space to store train/test scores.
    train_scores = np.empty(n_repeats)
    test_scores = np.empty(n_repeats)

    # Run cross-validation.
    for itr in range(n_repeats):

        # Create shuffled copy of the data.
        ii = rs.permutation(m)
        jj = rs.permutation(n)
        Xs = np.copy(X[ii][:, jj])

        # Partition columns and rows.
        si = int(m - m * heldout_frac)
        sj = int(n - n * heldout_frac)

        # Fit model to training set (upper-left block).
        model.fit(Xs[:si, :sj], mask=None, **fit_params)

        # Extend model factors to the held-out rows and columns.
        model.bicv_extend(Xs[:si, sj:], Xs[si:, :sj])

        # Construct mask for training set.
        train_mask = np.zeros((m, n), dtype=bool)
        train_mask[:si, :sj] = True

        # Construct mask for test set (lower-right block).
        test_mask = np.zeros((m, n), dtype=bool)
        test_mask[si:, sj:] = True

        # Compute performance on train and test partitions.
        train_scores[itr] = model.score(Xs, mask=train_mask)
        test_scores[itr] = model.score(Xs, mask=test_mask)

    return train_scores, test_scores
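# Usage sketch (illustrative, not part of the library API): bi-cross-
# validation fits on the upper-left block of the shuffled matrix, extends
# the factors to the held-out rows/columns, and scores on the lower-right
# block. The model class name `TSVD` is hypothetical; any estimator
# exposing the fit/bicv_extend/score interface used above would work.
#
#   >>> X = np.random.rand(100, 50)
#   >>> model = TSVD(n_components=5)  # hypothetical model class
#   >>> train, test = bicv_scores(model, X, heldout_frac=0.2, seed=0)
#   >>> train.mean(), test.mean()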
def speckled_cv_scores(
        model, X, fit_params=None, heldout_frac=0.1, n_repeats=10,
        resampler=None, return_params=False, seed=None, progress_bar=False):
    """
    Estimate train and test error for a model by cross-validation.
    """

    # Initialize dictionary for fit keyword args.
    if fit_params is None:
        fit_params = dict()

    # Initialize random number generator.
    rs = get_random_state(seed)

    # Allocate space to store train/test scores.
    train_scores = np.empty(n_repeats)
    test_scores = np.empty(n_repeats)
    params = []

    # Run cross-validation.
    pbar = trange(n_repeats) if progress_bar else range(n_repeats)
    for itr in pbar:

        # If desired, resample X (e.g. apply random shuffle).
        if resampler is not None:
            Xsamp = resampler(X)
        else:
            Xsamp = X

        # Generate a new holdout pattern.
        mask = speckled_mask(X.shape, heldout_frac, rs)

        # Fit model.
        model.fit(Xsamp, mask=mask, **fit_params)

        # Save parameters.
        if return_params:
            params.append(tuple(p.copy() for p in model.factors))

        # Compute performance on train and test partitions.
        train_scores[itr] = model.score(Xsamp, mask=mask)
        test_scores[itr] = model.score(Xsamp, mask=~mask)

    # Return data.
    return (
        (train_scores, test_scores, params)
        if return_params else (train_scores, test_scores)
    )
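# Usage sketch (illustrative): unlike bi-cross-validation, speckled
# cross-validation holds out a random scatter of individual entries on
# each repeat. `TSVD` is again a hypothetical model class; any estimator
# supporting masked fitting and scoring works.
#
#   >>> X = np.random.rand(100, 50)
#   >>> model = TSVD(n_components=5)  # hypothetical model class
#   >>> train, test = speckled_cv_scores(
#   ...     model, X, heldout_frac=0.1, n_repeats=5, seed=0)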
def mixed_poiss_cd(X, Y, rank, mask, tol, maxiter, seed):
    """
    Parameters
    ----------
    X : ndarray
        Matrix holding input data. Has shape (n_inputs, n_obs).
    Y : ndarray
        Matrix holding observed data. Has shape (n_features, n_obs).
    rank : int
        Number of low-rank components, in addition to the inputs.
    mask : ndarray
        Binary array specifying observed data points (where mask == 1)
        and unobserved data points (where mask == 0). Has shape
        (n_features, n_obs).
    tol : float
        Convergence tolerance.
    maxiter : int
        Maximum number of iterations.
    seed : int or np.random.RandomState
        Seed for random number generator for initialization.

    Returns
    -------
    U : ndarray
        First factor matrix. Has shape (n_features, n_inputs + rank).
    Vt : ndarray
        Second factor matrix; its first n_inputs rows hold the inputs X.
        Has shape (n_inputs + rank, n_obs).
    loss_hist : ndarray
        Vector holding loss values. Has shape (n_iterations,).
    """
    assert X.shape[1] == Y.shape[1]
    n_in, n_obs = X.shape
    n_features, n_obs = Y.shape

    # Initialize parameters.
    rs = get_random_state(seed)
    U = rs.uniform(-1, 1, size=(n_features, n_in + rank))
    Vt = rs.uniform(-1, 1, size=(n_in + rank, n_obs))
    Vt[:n_in] = X

    # Select update rule based on the presence of a mask.
    if mask is None:
        update_rule = _poiss_cd_update
        mask_T = None
    else:
        update_rule = _poiss_cd_update_with_mask
        mask_T = mask.T

    loss_hist = []

    for itr in range(maxiter):

        # Update U.
        update_rule(Y, U, Vt, mask)

        # Update rows of V without over-writing inputs (X). Note that
        # transposed slices are views, so updates write through to Vt.
        ut = U[:, n_in:].T
        v = Vt[n_in:].T
        ls = update_rule(Y.T, v, ut, mask_T)

        # Check convergence.
        loss_hist.append(ls)
        if itr > 0 and ((loss_hist[-2] - loss_hist[-1]) < tol):
            break

    return U, Vt, np.array(loss_hist)
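# Usage sketch (illustrative): the model approximates Y by U @ Vt under a
# Poisson observation model (the link function is handled inside the
# _poiss_cd_update routines), with the first n_inputs rows of Vt pinned to
# the known inputs X and the remaining `rank` rows left as free factors.
#
#   >>> X_in = np.random.randn(2, 500)                   # known inputs
#   >>> Y = np.random.poisson(1.0, (30, 500)).astype(float)
#   >>> U, Vt, loss = mixed_poiss_cd(
#   ...     X_in, Y, rank=3, mask=None, tol=1e-5, maxiter=100, seed=0)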
def _init_kmeans(X, rank, mask, init, seed):
    """
    Dispatches the desired initialization method.

    Parameters
    ----------
    X : ndarray
        Data matrix. Has shape (m, n).
    rank : int
        Number of cluster centroids.
    mask : ndarray
        Mask for missing data. Has shape (m, n).
    init : str
        Specifies initialization method.
    seed : int or numpy.random.RandomState
        Seeds random number generator.

    Returns
    -------
    centroids : ndarray
        Initial cluster centroids. Has shape (rank, n).
    """

    # Seed random number generator.
    rs = get_random_state(seed)

    # Random initialization: pick `rank` rows of X as centroids.
    if init == "rand":
        idx = rs.choice(X.shape[0], size=rank, replace=False)
        centroids = X[idx]

    # Soft k-means initialization.
    elif init == "soft":
        _, centroids = soft_kmeans_em(X, rank, mask, "rand", 100, 1e-5, seed)

    else:
        raise NotImplementedError("Did not recognize init method.")

    return centroids
def __init__(self, n_components, seed=None):
    self.nc = n_components
    self._rs = get_random_state(seed)
def __init__(self, seed=None):
    self._rs = get_random_state(seed)
def poisson_lorenz(
        n_out, n_steps, x0=None, dt=0.01, latent_noise_scale=10.0,
        max_rate=10.0, min_rate=0.01, seed=None):
    """
    Simulate a high-dimensional count data series following
    low-dimensional Lorenz attractor dynamics.

    Parameters
    ----------
    n_out : int
        Dimensionality of observations.
    n_steps : int
        Number of observed timesteps.
    x0 : ndarray, optional
        Initial latent state. Has shape (3,). Defaults to all ones.
    dt : float
        Euler integration step of the continuous time ODE.
    latent_noise_scale : float
        Scale of Wiener process noise on latent states. Note that
        the square root of dt also scales this noise source
        (Euler–Maruyama integration).
    max_rate : float
        Maximum rate parameter in the simulated dataset.
    min_rate : float
        Minimum rate parameter in the simulated dataset.
    seed : None, int, or np.random.RandomState
        Seed for random number generator.

    Returns
    -------
    data : ndarray
        Data array holding simulated count data.
        Has shape (n_steps, n_out).
    rates : ndarray
        True time-varying rate parameters, associated with 'data'.
        Has shape (n_steps, n_out).
    W : ndarray
        Weight matrix. Has shape (3, n_out).
    X : ndarray
        Simulated latent states. Has shape (n_steps, 3).
    """

    # Initialize random number generator.
    rs = get_random_state(seed)

    # Parameters of Lorenz equations (chaotic regime).
    sigma = 10.0
    beta = 8 / 3
    rho = 28.0

    # Allocate space for simulation.
    x = x0 if x0 is not None else np.ones(3)
    dxdt = np.empty(3)
    x_hist = np.empty((n_steps, 3))

    # Draw random readout matrix.
    W = rand_orth(3, n_out, seed=rs)

    # Simulate latent states.
    for t in range(n_steps):

        # Lorenz equations.
        dxdt[0] = sigma * (x[1] - x[0])
        dxdt[1] = x[0] * (rho - x[2]) - x[1]
        dxdt[2] = x[0] * x[1] - beta * x[2]

        # Euler–Maruyama integration.
        eta = latent_noise_scale * rs.randn(3)
        x = x + (dt * dxdt) + (np.sqrt(dt) * eta)

        # Store latent variable traces.
        x_hist[t] = x

    # Center the x's so they exert comparable effects
    # in the observed data.
    x_hist = x_hist - np.mean(x_hist, axis=0)

    # Rescale rates to desired range.
    log_rates = np.dot(x_hist, W)
    log_rates = (log_rates - np.min(log_rates)) / np.ptp(log_rates)
    log_rates = log_rates * np.log(max_rate / min_rate) + np.log(min_rate)
    rates = np.exp(log_rates)

    # Draw Poisson distributed observations.
    data = rs.poisson(rates)

    # Return quantities of interest.
    return data, rates, W, x_hist
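# Usage sketch (illustrative): generate a synthetic dataset and inspect
# its shapes; all keyword arguments below take the defaults documented
# above.
#
#   >>> data, rates, W, latents = poisson_lorenz(50, 1000, seed=0)
#   >>> data.shape, rates.shape, W.shape, latents.shape
#   ((1000, 50), (1000, 50), (3, 50), (1000, 3))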
def _init_tsvd(X, rank, mask, init, seed):
    """
    Dispatches the desired initialization method.

    Parameters
    ----------
    X : ndarray
        Data matrix. Has shape (m, n).
    rank : int
        Number of components.
    mask : ndarray
        Mask for missing data. Has shape (m, n).
    init : str
        Specifies initialization method.
    seed : int or numpy.random.RandomState
        Seeds random number generator.

    Returns
    -------
    U : ndarray
        First factor matrix. Has shape (m, rank).
    Vt : ndarray
        Second factor matrix. Has shape (rank, n).
    xtx : float
        Squared Frobenius norm of X. This is later used to scale
        the model loss.
    """

    # Data dimensions.
    m, n = X.shape

    # Mask data.
    if mask is not None:
        Xm = mask * X
    else:
        Xm = X

    # Compute squared Frobenius norm of masked data.
    xtx = np.dot(Xm.ravel(), Xm.ravel())

    # Seed random number generator.
    rs = get_random_state(seed)

    # Randomized initialization with orthogonal factors.
    if init == "rand_orth":
        U = rand_orth(m, rank, seed=rs)
        Vt = rand_orth(rank, n, seed=rs)

        # Determine appropriate scaling over the observed entries.
        e = np.dot(U, Vt)
        if mask is not None:
            e = e * mask
        alpha = np.sqrt(xtx / np.dot(e.ravel(), e.ravel()))

        # Scale randomized initialization.
        U *= alpha
        Vt *= alpha

    else:
        raise NotImplementedError("Did not recognize init method.")

    return U, Vt, xtx
def poisson_mf_cd(X, rank, mask, Vbasis, tol, maxiter, seed):
    """
    Parameters
    ----------
    X : ndarray
        Matrix holding input data. Has shape (m, n).
    rank : int
        Number of components.
    mask : ndarray
        Mask for missing data. Has shape (m, n).
    Vbasis : ndarray or None
        Optional basis set for the second factor matrix. If provided,
        the model is parameterized as U @ (Vt @ Vbasis), and Vt has
        shape (rank, Vbasis.shape[0]).
    tol : float
        Convergence tolerance.
    maxiter : int
        Maximum number of iterations.
    seed : int or np.random.RandomState
        Seed for random number generator for initialization.

    Returns
    -------
    U : ndarray
        First factor matrix. Has shape (m, rank).
    Vt : ndarray
        Second factor matrix. Has shape (rank, n).
    loss_hist : ndarray
        Vector holding loss values. Has shape (n_iterations,).
    """
    X = np.asarray(X, dtype='float')

    # Initialize loss history and random number generator.
    loss_hist = []
    m, n = X.shape
    rs = get_random_state(seed)

    # Account for masked entries by imputing the mean of observed data.
    if mask is not None:
        X = np.copy(X)
        X[~mask] = np.mean(X[mask])
        Xpred = np.empty((m, n))
        mask_T = mask.T
    else:
        mask_T = None

    # Initialize parameters.
    U = rs.uniform(-1, 1, size=(m, rank))
    if Vbasis is None:
        Vt = rs.uniform(-1, 1, size=(rank, n))
    else:
        Vt = rs.uniform(-1, 1, size=(rank, Vbasis.shape[0]))

    # Convergence check on parameters.
    Ulast = np.empty_like(U)
    Vlast = np.empty_like(Vt)

    # Main optimization loop.
    for itr in range(maxiter):

        # Update U.
        if Vbasis is None:
            _poiss_cd_update(X, U, Vt, mask)
        else:
            _poiss_cd_update(X, U, Vt @ Vbasis, mask)

        # Update V.
        if Vbasis is None:
            ls = _poiss_cd_update(X.T, Vt.T, U.T, mask_T)
        else:
            ls = _poiss_cd_update_with_basis(X.T, Vt.T, U.T, mask_T, Vbasis)

        # Update masked elements with the current model prediction.
        if mask is not None:
            if Vbasis is None:
                np.dot(U, Vt, out=Xpred)
            else:
                np.dot(U, Vt @ Vbasis, out=Xpred)
            X[~mask] = Xpred[~mask]

        # Store loss.
        loss_hist.append(ls / X.size)

        # Check convergence.
        U_upd = np.linalg.norm(Ulast - U) / np.linalg.norm(U)
        V_upd = np.linalg.norm(Vlast - Vt) / np.linalg.norm(Vt)
        if (itr > 0) and (U_upd < tol) and (V_upd < tol):
            break

        # Make copies of previous parameters.
        np.copyto(Ulast, U)
        np.copyto(Vlast, Vt)

    return U, Vt, np.array(loss_hist)
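# Usage sketch (illustrative): fit a rank-3 Poisson matrix factorization
# to count data with 10% of entries held out, reusing `speckled_mask`,
# the holdout-pattern helper used by the cross-validation routines above.
#
#   >>> Y = np.random.poisson(1.0, (40, 200)).astype(float)
#   >>> mask = speckled_mask(Y.shape, 0.1, np.random.RandomState(0))
#   >>> U, Vt, loss = poisson_mf_cd(
#   ...     Y, rank=3, mask=mask, Vbasis=None, tol=1e-5, maxiter=200, seed=0)
#   >>> loss[-1] <= loss[0]  # loss should decrease over iterations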