def pca(data):
    """
    Fit a simple PCA rotation whose columns are permuted to balance variance.

    This is a demonstration of how eigenvalue allocation is used to permute
    dimensions so that variance is balanced across subvectors. There are
    plenty of PCA implementations elsewhere; the important part is that the
    eigenvalues drive a variance-balancing dimension permutation.

    :param data:
        a 2-d array with one sample per row (``count`` rows, ``D`` columns)

    :returns:
        a tuple ``(P, mu)`` where ``P`` holds the eigenvectors as columns,
        reordered by the permutation from ``eigenvalue_allocation``, and
        ``mu`` is the per-dimension mean of ``data``
    """
    n_samples, D = data.shape

    # Per-dimension mean
    mu = data.sum(axis=0) / float(n_samples)

    # Accumulate the raw second moment one sample at a time, in row order.
    scatter = np.zeros((D, D))
    for row in data:
        scatter = scatter + np.outer(row, row)

    # NOTE(review): the second moment is divided by (count - 1) while
    # outer(mu, mu) is subtracted unscaled, so this is not exactly the
    # unbiased sample covariance - confirm this is intended.
    A = scatter / (n_samples - 1) - np.outer(mu, mu)

    # Eigen decomposition of the symmetric matrix A
    eigenvalues, rotation = np.linalg.eigh(A)

    # Permutation of dimensions that balances variance among 2 subvectors
    order = eigenvalue_allocation(2, eigenvalues)

    # Fold the permutation into the rotation matrix. The two steps could
    # alternately be kept separate: rotate first, then permute.
    return rotation[:, order], mu
def main(args):
    """
    Reduce a stored PCA model to ``args.D`` dimensions, balance variance
    across two halves, and write the resulting parameters back to HDFS.

    :param args:
        parsed arguments providing ``pca_params`` (source path, assumed to be
        an HDFS path), ``D`` (number of dimensions to keep) and ``output``
        (destination path handed to ``copy_to_hdfs``)
    """
    # assume hdfs path in params
    filename = copy_from_hdfs(args.pca_params)
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print('Loading PCA Model locally from {} copied from {}'.format(
        filename, args.pca_params))

    # NOTE(review): unpickling executes arbitrary code; only point this at
    # parameter files from a trusted source.
    try:
        # Pickle streams are binary; close the handle deterministically.
        with open(filename, 'rb') as params_file:
            params = pkl.load(params_file)
    finally:
        # Remove the local copy even if loading fails.
        os.remove(filename)

    P = params['P']
    E = params['E']
    mu = params['mu']

    # Reduce dimension - eigenvalues assumed in ascending order
    E = E[-args.D:]
    P = P[:, -args.D:]

    # Balance variance across halves
    permuted_inds = eigenvalue_allocation(2, E)
    P = P[:, permuted_inds]

    # Save new params
    f = NamedTemporaryFile(delete=False)
    try:
        # Closing the writer before the upload guarantees the pickle is
        # fully flushed to disk.
        with open(f.name, 'wb') as out:
            pkl.dump({'P': P, 'mu': mu}, out)
        f.close()
        # NOTE(review): the temp-file object (not f.name) is passed, as in
        # the original - confirm copy_to_hdfs expects that.
        copy_to_hdfs(f, args.output)
    finally:
        f.close()
        os.remove(f.name)
def pca(data):
    """
    A simple PCA implementation that demonstrates how eigenvalue allocation
    is used to permute dimensions in order to balance the variance across
    subvectors. There are plenty of PCA implementations elsewhere. What is
    important is that the eigenvalues can be used to compute a
    variance-balancing dimension permutation.

    :param data:
        a 2-d array with one sample per row (``count`` rows, ``D`` columns)

    :returns:
        a tuple ``(P, mu)``: the eigenvector matrix with its columns permuted
        by ``eigenvalue_allocation``, and the per-dimension mean of ``data``
    """
    # `reduce` is only a builtin on Python 2; importing it from functools
    # works on both Python 2.6+ and Python 3.
    from functools import reduce

    # Compute mean
    count, D = data.shape
    mu = data.sum(axis=0) / float(count)

    # Compute covariance
    # NOTE(review): the summed outer products are divided by (count - 1)
    # while outer(mu, mu) is subtracted unscaled - confirm this estimator is
    # intended.
    summed_covar = reduce(lambda acc, x: acc + np.outer(x, x), data,
                          np.zeros((D, D)))
    A = summed_covar / (count - 1) - np.outer(mu, mu)

    # Compute eigen decomposition
    eigenvalues, P = np.linalg.eigh(A)

    # Compute a permutation of dimensions to balance variance among 2
    # subvectors
    permuted_inds = eigenvalue_allocation(2, eigenvalues)

    # Build the permutation into the rotation matrix. One can alternately
    # keep these steps separate, rotating and then permuting, if desired.
    P = P[:, permuted_inds]

    return P, mu
def main(args):
    """
    Load a PCA model copied from HDFS, keep ``args.D`` dimensions, permute
    them to balance variance across two halves, and upload the new params.

    :param args:
        parsed arguments providing ``pca_params`` (source path, assumed to be
        an HDFS path), ``D`` (number of dimensions to keep) and ``output``
        (destination path handed to ``copy_to_hdfs``)
    """
    # assume hdfs path in params
    filename = copy_from_hdfs(args.pca_params)
    # Single-argument print in call form is valid on both Python 2 and 3.
    print('Loading PCA Model locally from {} copied from {}'.format(filename, args.pca_params))

    # NOTE(review): pickle can execute arbitrary code on load; the parameter
    # file must come from a trusted source.
    try:
        # Binary mode is required for pickle data; `with` closes the handle.
        with open(filename, 'rb') as local_params:
            params = pkl.load(local_params)
    finally:
        # Always clean up the local copy, including on a failed load.
        os.remove(filename)

    P = params['P']
    E = params['E']
    mu = params['mu']

    # Reduce dimension - eigenvalues assumed in ascending order
    E = E[-args.D:]
    P = P[:, -args.D:]

    # Balance variance across halves
    permuted_inds = eigenvalue_allocation(2, E)
    P = P[:, permuted_inds]

    # Save new params
    f = NamedTemporaryFile(delete=False)
    try:
        # The writer is closed (and therefore flushed) before the upload.
        with open(f.name, 'wb') as writer:
            pkl.dump({'P': P, 'mu': mu}, writer)
        f.close()
        # NOTE(review): passes the temp-file object rather than its name, as
        # the original did - confirm against copy_to_hdfs.
        copy_to_hdfs(f, args.output)
    finally:
        f.close()
        os.remove(f.name)
def test_eigenvalue_allocation():
    """
    Check ``eigenvalue_allocation`` with 4 subvectors against a stored
    reference permutation.

    The eigenvalues come from the eigen decomposition of the matrix pickled
    in ``testdata/test_eigenvalue_allocation_input.pkl``.
    """
    fixture_path = relpath('./testdata/test_eigenvalue_allocation_input.pkl')
    # Open in binary mode and close the fixture deterministically instead of
    # leaking the handle.
    with open(fixture_path, 'rb') as fixture:
        a = pkl.load(fixture)

    # Only the eigenvalues are needed; the eigenvectors are discarded.
    vals, _ = np.linalg.eigh(a)
    res = eigenvalue_allocation(4, vals)

    expected = np.array([
        63, 56, 52, 48, 44, 40, 36, 30, 26, 22, 18, 14, 10, 6, 3, 0,
        62, 57, 53, 51, 45, 41, 39, 33, 32, 31, 29, 25, 21, 17, 13, 9,
        61, 58, 54, 49, 47, 42, 38, 34, 28, 24, 20, 16, 12, 8, 5, 2,
        60, 59, 55, 50, 46, 43, 37, 35, 27, 23, 19, 15, 11, 7, 4, 1
    ])

    assert_true(np.equal(res, expected).all())
def test_eigenvalue_allocation_normalized_features():
    """
    Check that a 2-way allocation balances the two halves of a spectrum
    dominated by two large eigenvalues.

    The sum of log-eigenvalues (i.e. the log of their product) over the first
    16 returned indices must be within 0.1 of the sum over the remaining
    indices.
    """
    eigenvalues = np.array([
        2.02255824, 1.01940991, 0.01569471, 0.01355569,
        0.01264379, 0.01137654, 0.01108961, 0.01054673,
        0.01023358, 0.00989679, 0.00939045, 0.00900322,
        0.00878857, 0.00870027, 0.00850136, 0.00825236,
        0.00813437, 0.00800231, 0.00790201, 0.00782219,
        0.00763405, 0.00752334, 0.00739174, 0.00728246,
        0.00701366, 0.00697365, 0.00677283, 0.00669658,
        0.00654397, 0.00647679, 0.00630645, 0.00621057
    ])

    indices = eigenvalue_allocation(2, eigenvalues)

    # Log-product of the eigenvalues assigned to each half
    log_product_first = np.log(eigenvalues[indices[:16]]).sum()
    log_product_second = np.log(eigenvalues[indices[16:]]).sum()

    diff = np.abs(log_product_first - log_product_second)
    assert_true(diff < .1, "eigenvalue_allocation is not working correctly")
def test_eigenvalue_allocation():
    """
    Verify the 4-subvector permutation returned by ``eigenvalue_allocation``
    for the eigenvalues of a pickled fixture matrix.

    The fixture lives at ``testdata/test_eigenvalue_allocation_input.pkl``
    and the result is compared element-wise to a stored permutation.
    """
    # Pickle data is binary; the `with` block closes the fixture handle
    # rather than leaving it to the garbage collector.
    with open(relpath('./testdata/test_eigenvalue_allocation_input.pkl'),
              'rb') as fixture_file:
        a = pkl.load(fixture_file)

    # The eigenvectors are not used by this test.
    vals, _ = np.linalg.eigh(a)
    res = eigenvalue_allocation(4, vals)

    expected = np.array([
        63, 56, 52, 48, 44, 40, 36, 30, 26, 22, 18, 14, 10, 6, 3, 0,
        62, 57, 53, 51, 45, 41, 39, 33, 32, 31, 29, 25, 21, 17, 13, 9,
        61, 58, 54, 49, 47, 42, 38, 34, 28, 24, 20, 16, 12, 8, 5, 2,
        60, 59, 55, 50, 46, 43, 37, 35, 27, 23, 19, 15, 11, 7, 4, 1
    ])

    assert_true(np.equal(res, expected).all())
def pca(data):
    """
    A simple PCA implementation that demonstrates how eigenvalue allocation
    is used to permute dimensions in order to balance the variance across
    subvectors. There are plenty of PCA implementations elsewhere. What is
    important is that the eigenvalues can be used to compute a
    variance-balancing dimension permutation.

    :param data:
        a 2-d array with one sample per row (``count`` rows, ``D`` columns)

    :returns:
        a tuple ``(P, mu)``: the eigenvector matrix with columns reordered by
        ``eigenvalue_allocation``, and the per-dimension mean of ``data``
    """
    # Bare `reduce` exists only on Python 2; this import is valid on
    # Python 2.6+ and Python 3 alike.
    from functools import reduce

    count, D = data.shape

    # Per-dimension mean
    mu = data.sum(axis=0) / float(count)

    # Sum of per-sample outer products, then the covariance-style matrix.
    # NOTE(review): (count - 1) scales only the first term; outer(mu, mu) is
    # subtracted unscaled - confirm this estimator is intended.
    summed_covar = reduce(lambda acc, x: acc + np.outer(x, x), data,
                          np.zeros((D, D)))
    A = summed_covar / (count - 1) - np.outer(mu, mu)

    # Eigen decomposition of the symmetric matrix
    eigenvalues, P = np.linalg.eigh(A)

    # Permute the eigenvector columns to balance variance over 2 subvectors
    permuted_inds = eigenvalue_allocation(2, eigenvalues)
    P = P[:, permuted_inds]

    return P, mu
def main(args):
    """
    Reduce a pickled PCA model to ``args.D`` dimensions, permute them to
    balance variance across two halves, and pickle the result.

    :param args:
        parsed arguments providing ``pca_params`` (path of the input pickle
        holding ``'P'``, ``'E'`` and ``'mu'``), ``D`` (number of dimensions
        to keep) and ``output`` (path of the pickle to write)
    """
    # NOTE(review): unpickling executes arbitrary code; only load parameter
    # files from a trusted source.
    # Pickle streams are binary, and `with` closes the handle promptly.
    with open(args.pca_params, 'rb') as params_file:
        params = pkl.load(params_file)

    P = params['P']
    E = params['E']
    mu = params['mu']

    # Reduce dimension - eigenvalues assumed in ascending order
    E = E[-args.D:]
    P = P[:, -args.D:]

    # Balance variance across halves
    permuted_inds = eigenvalue_allocation(2, E)
    P = P[:, permuted_inds]

    # Save new params; closing the file guarantees the data is flushed
    # before this function returns.
    with open(args.output, 'wb') as output_file:
        pkl.dump({'P': P, 'mu': mu}, output_file)
def main(args):
    """
    Trim a stored PCA model to its last ``args.D`` components, apply a
    variance-balancing permutation over two halves, and save the new model.

    :param args:
        parsed arguments providing ``pca_params`` (path of the input pickle
        holding ``'P'``, ``'E'`` and ``'mu'``), ``D`` (number of dimensions
        to keep) and ``output`` (path of the pickle to write)
    """
    # NOTE(review): pickle can run arbitrary code on load; the input file
    # must be trusted.
    # Read in binary mode and release the handle as soon as loading is done.
    with open(args.pca_params, 'rb') as source:
        params = pkl.load(source)

    P = params['P']
    E = params['E']
    mu = params['mu']

    # Reduce dimension - eigenvalues assumed in ascending order
    E = E[-args.D:]
    P = P[:, -args.D:]

    # Balance variance across halves
    permuted_inds = eigenvalue_allocation(2, E)
    P = P[:, permuted_inds]

    # Save new params; the `with` block flushes and closes the output so the
    # file is complete on return.
    with open(args.output, 'wb') as sink:
        pkl.dump({'P': P, 'mu': mu}, sink)