def _prepare_data(self):
    """
    Center and/or scale the adjusted data, then apply the weights.

    Column means and standard deviations (divisor equal to the number of
    non-NaN observations) are computed while ignoring NaNs and stored on
    ``self._mu`` and ``self._sigma``.  Standardizing takes precedence
    over demeaning; when neither flag is set the adjusted data are used
    unchanged.  In every case the result is divided by
    ``sqrt(self.weights)``.
    """
    raw = self._adjusted_data

    # Moments are always stored, even when they are not applied below
    self._mu = nanmean(raw, axis=0)
    centered = raw - self._mu
    self._sigma = np.sqrt(nanmean(centered ** 2.0, axis=0))

    if self._standardize:
        prepared = centered / self._sigma
    elif self._demean:
        prepared = centered
    else:
        prepared = raw

    return prepared / np.sqrt(self.weights)
def _prepare_data(self):
    """
    Standardize or demean data.

    Returns
    -------
    ndarray
        When every entry of ``self._adjusted_data`` is NaN (which is also
        the case for an empty array), a 1-d array of NaNs with one entry
        per column; ``self._mu`` and ``self._sigma`` are not touched in
        that case.  Otherwise the adjusted data, standardized if
        ``self._standardize`` is set, else demeaned if ``self._demean``
        is set, else unchanged, and then divided by
        ``sqrt(self.weights)``.

    Notes
    -----
    On the regular path the NaN-ignoring column means and standard
    deviations are stored on ``self._mu`` and ``self._sigma`` regardless
    of which transformation is applied.
    """
    adj_data = self._adjusted_data
    if np.all(np.isnan(adj_data)):
        # ndarray.fill operates in place and returns None, so chaining it
        # onto np.empty(...) made this branch return None.  Build the NaN
        # vector directly instead.
        return np.full(adj_data.shape[1], np.nan)

    self._mu = nanmean(adj_data, axis=0)
    self._sigma = np.sqrt(nanmean((adj_data - self._mu)**2.0, axis=0))
    if self._standardize:
        data = (adj_data - self._mu) / self._sigma
    elif self._demean:
        data = (adj_data - self._mu)
    else:
        data = adj_data
    return data / np.sqrt(self.weights)
def _prepare_data(self):
    """
    Standardize or demean data.

    The NaN-ignoring column means and standard deviations of
    ``self._adjusted_data`` are stored on ``self._mu`` and
    ``self._sigma``.  The data are then standardized when
    ``self._standardize`` is set, demeaned when only ``self._demean`` is
    set, or left unchanged, and finally divided by
    ``sqrt(self.weights)``.

    Returns
    -------
    ndarray
        The transformed data.  If every entry of the adjusted data is NaN
        (this includes an empty array), a 1-d NaN array with one entry
        per column is returned instead and ``self._mu`` / ``self._sigma``
        are left unset.
    """
    adj_data = self._adjusted_data
    if np.all(np.isnan(adj_data)):
        # The previous ``np.empty(n).fill(np.nan)`` evaluated to None
        # because ndarray.fill mutates in place and has no return value.
        return np.full(adj_data.shape[1], np.nan)

    self._mu = nanmean(adj_data, axis=0)
    self._sigma = np.sqrt(nanmean((adj_data - self._mu) ** 2.0, axis=0))
    if self._standardize:
        data = (adj_data - self._mu) / self._sigma
    elif self._demean:
        data = (adj_data - self._mu)
    else:
        data = adj_data
    return data / np.sqrt(self.weights)
def _fill_missing_em(self):
    """
    Impute missing values with an iterative (EM-style) PCA fit.

    Complete data are returned as-is.  Otherwise the NaNs of the
    transformed data are seeded with column means and then repeatedly
    replaced by the current projection until the relative change of the
    imputed values is no larger than ``self._tol_em`` or
    ``self._max_em_iter`` iterations have run.  The result is a copy of
    ``self._adjusted_data`` whose missing entries are taken from
    ``self.project()``.

    Raises
    ------
    ValueError
        If any row or column has fewer than ``self._ncomp`` observed
        values.
    """
    observed = np.logical_not(np.isnan(self.data))

    # Fast path: nothing to impute, hand the data back untouched
    if np.all(observed):
        return self.data

    # Step 1: transformed working copy, shared with self.transformed_data
    work = np.asarray(self._prepare_data())
    self.transformed_data = work

    # Step 2: every row and column needs enough observed values
    n_required = self._ncomp
    observed_per_row = np.sum(observed, axis=1)
    observed_per_col = np.sum(observed, axis=0)
    too_sparse = (np.any(observed_per_row < n_required) or
                  np.any(observed_per_col < n_required))
    if too_sparse:
        raise ValueError('Implementation requires that all columns and '
                         'all rows have at least ncomp non-missing values')

    # Steps 3-5: locate the holes and seed them with the column means
    holes = np.isnan(work)
    col_means = nanmean(work, 0)
    fitted = np.ones((self._nobs, 1)) * col_means
    fitted_at_holes = fitted[holes]
    work[holes] = fitted_at_holes

    # Step 6: alternate between fitting the PCA and re-imputing the holes
    rel_change = 1.0
    n_iter = 0
    while rel_change > self._tol_em and n_iter < self._max_em_iter:
        previous_at_holes = fitted_at_holes
        # The eigen-decomposition reads from self.transformed_data
        self.transformed_data = work
        self._compute_eig()
        self._compute_pca_from_eig()
        fitted = np.asarray(self.project(transform=False, unweight=False))
        fitted_at_holes = fitted[holes]
        work[holes] = fitted_at_holes
        step = previous_at_holes - fitted_at_holes
        rel_change = _norm(step) / _norm(fitted_at_holes)
        n_iter += 1

    # Adding 0.0 makes a copy so the stored adjusted data stay untouched
    filled = self._adjusted_data + 0.0
    final_projection = np.asarray(self.project())
    filled[holes] = final_projection[holes]
    return filled
def _fill_missing_em(self):
    """
    EM algorithm to fill missing values

    Returns
    -------
    array_like
        ``self.data`` itself when it contains no NaN.  Otherwise a copy
        of ``self._adjusted_data`` in which every entry that is NaN in
        the transformed data has been replaced by the corresponding
        value of ``self.project()``.

    Raises
    ------
    ValueError
        If any row or any column of ``self.data`` has fewer than
        ``self._ncomp`` non-missing values.

    Notes
    -----
    Side effects: ``self.transformed_data`` is set to the working array
    (which is modified in place), and ``self._compute_eig`` and
    ``self._compute_pca_from_eig`` are called once per iteration.  The
    loop stops once the relative change of the imputed values is no
    larger than ``self._tol_em`` or after ``self._max_em_iter``
    iterations.

    The outputs of ``self._prepare_data()`` and ``self.project()`` are
    coerced with ``np.asarray`` so that the boolean-mask indexing below
    behaves the same for ndarrays and for other array-likes; for ndarray
    inputs the coercion is a no-op.
    """
    non_missing = np.logical_not(np.isnan(self.data))

    # If nothing missing, return without altering the data
    if np.all(non_missing):
        return self.data

    # 1. Standardized data as needed
    data = self.transformed_data = np.asarray(self._prepare_data())

    ncomp = self._ncomp

    # 2. Check for all nans
    # Summing over axis 1 counts observed values per row, axis 0 per column
    col_non_missing = np.sum(non_missing, 1)
    row_non_missing = np.sum(non_missing, 0)
    if np.any(col_non_missing < ncomp) or np.any(row_non_missing < ncomp):
        raise ValueError('Implementation requires that all columns and '
                         'all rows have at least ncomp non-missing values')
    # 3. Get mask
    mask = np.isnan(data)

    # 4. Compute mean
    mu = nanmean(data, 0)

    # 5. Replace missing with mean
    projection = np.ones((self._nobs, 1)) * mu
    projection_masked = projection[mask]
    data[mask] = projection_masked

    # 6. Compute eigenvalues and fit
    diff = 1.0
    _iter = 0
    while diff > self._tol_em and _iter < self._max_em_iter:
        last_projection_masked = projection_masked
        # Set transformed data to compute eigenvalues
        self.transformed_data = data
        # Call correct eig function here
        self._compute_eig()
        # Call function to compute factors and projection
        self._compute_pca_from_eig()
        projection = np.asarray(self.project(transform=False,
                                             unweight=False))
        projection_masked = projection[mask]
        data[mask] = projection_masked
        delta = last_projection_masked - projection_masked
        diff = _norm(delta) / _norm(projection_masked)
        _iter += 1
    # Must copy to avoid overwriting original data since replacing values
    data = self._adjusted_data + 0.0
    projection = np.asarray(self.project())
    data[mask] = projection[mask]

    return data
def test_replace_missing(self):
    """
    Check each ``missing`` option of PCA.

    With NaNs present, the drop options must match a PCA fitted on the
    manually reduced data and 'fill-em' must match a hand-rolled
    imputation loop.  With complete data every option must agree with
    the plain PCA, and all-NaN input must raise ValueError.
    """
    holed = self.x.copy()
    holed[::5, ::7] = np.nan

    # drop-row: same as fitting on the rows without any NaN
    fitted = PCA(holed, missing='drop-row')
    row_has_nan = np.any(np.isnan(holed), 1)
    rows_kept = holed[np.logical_not(row_has_nan)]
    reference = PCA(rows_kept)
    assert_equal(fitted.projection, reference.projection)
    assert_equal(holed, fitted.data)

    # drop-col: same as fitting on the columns without any NaN
    fitted = PCA(holed, missing='drop-col')
    col_has_nan = np.any(np.isnan(holed), 0)
    cols_kept = holed[:, np.logical_not(col_has_nan)]
    reference = PCA(cols_kept)
    assert_equal(fitted.projection, reference.projection)
    assert_equal(holed, fitted.data)

    # drop-min: whichever reduction keeps more elements (ties -> columns)
    fitted = PCA(holed, missing='drop-min')
    larger = rows_kept if rows_kept.size > cols_kept.size else cols_kept
    reference = PCA(larger)
    assert_equal(fitted.projection, reference.projection)
    assert_equal(holed, fitted.data)

    # fill-em: rebuild the imputation by hand on standardized data
    fitted = PCA(holed, ncomp=3, missing='fill-em')
    is_nan = np.isnan(holed)
    col_mean = nanmean(holed, axis=0)
    centered = holed - col_mean
    col_std = np.sqrt(nanmean(centered ** 2, axis=0))
    standardized = centered / col_std
    standardized[is_nan] = 0.0
    previous = standardized[is_nan]
    rel_change = 1.0
    n_iter = 0
    while rel_change > 5e-8:
        step_fit = PCA(standardized, ncomp=3, standardize=False,
                       demean=False)
        standardized[is_nan] = step_fit.projection[is_nan]
        latest = standardized[is_nan]
        step = latest - previous
        rel_change = (np.sqrt(np.sum(step ** 2)) /
                      np.sqrt(np.sum(latest ** 2)))
        previous = latest
        n_iter += 1
    expected = self.x + 0.0
    rescaled = step_fit.projection * col_std + col_mean
    expected[is_nan] = rescaled[is_nan]
    assert_allclose(fitted._adjusted_data, expected)
    # The fixture itself must not have been modified
    assert_equal(self.x, self.x_copy)

    # Complete data: every drop option agrees with the plain PCA
    full = self.x
    baseline = PCA(full)
    for mode in ('drop-row', 'drop-col', 'drop-min'):
        reduced = PCA(full, missing=mode)
        assert_allclose(baseline.projection, reduced.projection,
                        atol=DECIMAL_5)

    baseline = PCA(full, ncomp=3)
    imputed = PCA(full, ncomp=3, missing='fill-em')
    assert_allclose(baseline.projection, imputed.projection,
                    atol=DECIMAL_5)

    # All-NaN input is rejected by every option
    empty = self.x.copy()
    empty[:, :] = np.nan
    for mode in ('drop-row', 'drop-col', 'drop-min', 'fill-em'):
        assert_raises(ValueError, PCA, empty, missing=mode)
def test_replace_missing(self):
    """
    Verify the handling of missing values for all ``missing`` modes.

    Covers data with NaNs (drop modes against manually reduced data,
    'fill-em' against an explicit iteration), complete data (all modes
    agree with the default PCA) and all-NaN data (ValueError).
    """
    data = self.x.copy()
    data[::5, ::7] = np.nan

    def check_exact(model, reduced_data):
        # The model must match a PCA of the reduced data exactly and
        # must expose the NaN-containing input unchanged.
        benchmark = PCA(reduced_data)
        assert_equal(model.projection, benchmark.projection)
        assert_equal(data, model.data)

    model = PCA(data, missing='drop-row')
    without_rows = data[np.logical_not(np.any(np.isnan(data), 1))]
    check_exact(model, without_rows)

    model = PCA(data, missing='drop-col')
    without_cols = data[:, np.logical_not(np.any(np.isnan(data), 0))]
    check_exact(model, without_cols)

    model = PCA(data, missing='drop-min')
    if without_rows.size > without_cols.size:
        check_exact(model, without_rows)
    else:
        check_exact(model, without_cols)

    # Reference implementation of the 'fill-em' imputation
    model = PCA(data, ncomp=3, missing='fill-em')
    nan_mask = np.isnan(data)
    means = nanmean(data, axis=0)
    deviations = data - means
    scales = np.sqrt(nanmean(deviations**2, axis=0))
    z = deviations / scales
    z[nan_mask] = 0.0
    prior = z[nan_mask]
    change = 1.0
    iterations = 0
    while change > 5e-8:
        inner = PCA(z, ncomp=3, standardize=False, demean=False)
        z[nan_mask] = inner.projection[nan_mask]
        now = z[nan_mask]
        moved = now - prior
        change = np.sqrt(np.sum(moved**2)) / np.sqrt(np.sum(now**2))
        prior = now
        iterations += 1
    target = self.x + 0.0
    back_transformed = inner.projection * scales + means
    target[nan_mask] = back_transformed[nan_mask]
    assert_allclose(model._adjusted_data, target)
    # Check data for no changes
    assert_equal(self.x, self.x_copy)

    # Without missing values each mode reproduces the default fit
    data = self.x
    model = PCA(data)
    for option in ('drop-row', 'drop-col', 'drop-min'):
        alternative = PCA(data, missing=option)
        assert_allclose(model.projection, alternative.projection,
                        atol=DECIMAL_5)

    model = PCA(data, ncomp=3)
    alternative = PCA(data, ncomp=3, missing='fill-em')
    assert_allclose(model.projection, alternative.projection,
                    atol=DECIMAL_5)

    # Test too many missing: all-NaN input fails for every mode
    data = self.x.copy()
    data[:, :] = np.nan
    for option in ('drop-row', 'drop-col', 'drop-min', 'fill-em'):
        assert_raises(ValueError, PCA, data, missing=option)