Example #1
    def go_svd(self):
        """ Seperate SVD's for individual data chunks are computed and combined into a single, best estimate
        of the matrix S and V^T. The data is subsequently revisited to get a new estimate of the matrix U.  

        The estimate of U on the basis of the chunked SVD approach is possible, but not as accurate. 
        """

        # compute a truncated SVD of each chunk; results accumulate in the self.partial_* lists
        for this_chunk in self.parts:
            self.SVD_on_chunk(this_chunk)

        # do an SVD of the stacked per-chunk bases (the diag(s) @ vt blocks)
        tmp_bases = np.vstack(self.partial_bases)
        ub, sb, vb = isvd(tmp_bases.astype('float64'), self.k_singular)
        vbt = vb.transpose()
        new_s = sb
        new_vt = vbt

        # revisit the data to get the U matrix again
        new_us = []
        for part in self.parts:
            this_new_u = self.get_u_given_basis(new_s, new_vt, part)
            new_us.append(this_new_u)

        # stack it up and, if the rows were shuffled, restore the original order
        new_us = np.vstack(new_us)
        if self.randomize:
            new_us = new_us[self.inv_order]
        return new_us, sb, vbt
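
A minimal, self-contained sketch of the recombination idea above, using plain numpy.linalg.svd in place of the repo's isvd helper (an assumption; isvd's exact signature is not shown here): each chunk is summarized by diag(s) @ vt, the summaries are stacked, and a second SVD of the stack recovers the global S and V^T.

import numpy as np

rng = np.random.default_rng(0)
k = 4
# low-rank data plus a little noise, so truncation at k loses almost nothing
X = rng.standard_normal((400, k)) @ rng.standard_normal((k, 50))
X += 1e-4 * rng.standard_normal(X.shape)

bases = []
for chunk in np.array_split(np.arange(X.shape[0]), 4):
    u, s, vt = np.linalg.svd(X[chunk], full_matrices=False)
    bases.append(np.diag(s[:k]) @ vt[:k])  # k x M summary of this chunk

ub, sb, vbt = np.linalg.svd(np.vstack(bases), full_matrices=False)
print(sb[:k])  # approximates the top-k singular values of the full X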
Example #2
    def SVD_on_chunk(self, this_chunk):
        """Perform an SVD on a subset of the data

        Parameters
        ----------
        this_chunk : list of indices that maps back to the self.data array
        """

        tmp_data = self.data[this_chunk, :]

        u, s, v = isvd(tmp_data.astype('float64'), self.k_singular)
        vt = v.transpose()
        self.partial_svd_u.append(u)
        self.partial_svd_s.append(s)
        self.partial_svd_vt.append(vt)
        self.partial_bases.append(np.diag(s).dot(vt))
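
Why diag(s).dot(vt) is the right per-chunk summary: for a full (untruncated) thin SVD, B = diag(s) @ vt satisfies B.T @ B == chunk.T @ chunk, and the Gram matrix of the stacked data is the sum of the per-chunk Gram matrices, so stacking the B blocks preserves exactly what the second SVD needs. A quick standalone check of that identity with plain numpy (not the repo's isvd):

import numpy as np

rng = np.random.default_rng(1)
chunk = rng.standard_normal((80, 30))
u, s, vt = np.linalg.svd(chunk, full_matrices=False)
B = np.diag(s) @ vt  # the "partial basis" for this chunk
assert np.allclose(B.T @ B, chunk.T @ chunk)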
Example #3
def tst_batched():
    N = 10000  # number of observations
    M = 2000  # dimension of an observation
    K = 200  # chunk size for the batched SVD
    P = 4  # number of singular values

    data = test_function(N, M, 0.0001)
    print("Data constructed")
    e0 = time.time()
    u, s, v = isvd(data.astype('float64'), P)
    vt = v.transpose()
    e1 = time.time()

    bSVD = batched_SVD(data, K, P, randomize=False)
    e4 = time.time()
    us, ss, vst = bSVD.go_svd()
    e5 = time.time()

    assert np.std((s - ss) / ss) < 1e-3
    print("Singular values match between batch and full approach")

    # checking the reconstructions
    da = us.dot(np.diag(ss).dot(vst))
    dc = u.dot(np.diag(s).dot(vt))
    delta_both = np.std((da - dc))
    assert np.abs(delta_both) < 1e-2

    print('Reconstruction Error is similar in batched versus full approach')
    print('Time for full: %4.2f  batched: %4.2f' % (e1 - e0, e5 - e4))

    # Now we want to do this using a random order of the data

    bSVD = batched_SVD(data, K, P, randomize=True)
    e4 = time.time()
    us, ss, vst = bSVD.go_svd()
    e5 = time.time()

    assert np.std((s - ss) / s) < 1e-3
    print("Singular values match between randomized batch and full approach")
    # recompute the batched reconstruction from the randomized results
    da = us.dot(np.diag(ss).dot(vst))
    delta_both = np.std((da - dc))
    assert np.abs(delta_both) < 1e-2
    print('Reconstruction Error is decent')
    print('Time for full: %4.2f  batched: %4.2f' % (e1 - e0, e5 - e4))
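
The tests above rely on a helper test_function(N, M, eps) that is not shown in these examples. A plausible stand-in, to make the snippets runnable, is a low-rank matrix plus Gaussian noise of scale eps; the repo's actual helper may differ:

import numpy as np

def test_function(N, M, eps, rank=4, seed=0):
    """Hypothetical data generator: rank-`rank` structure plus noise of scale eps."""
    rng = np.random.default_rng(seed)
    left = rng.standard_normal((N, rank))
    right = rng.standard_normal((rank, M))
    return left @ right + eps * rng.standard_normal((N, M))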
Example #4
def tst_MPI():
    N = 10000  # number of observations
    M = 2000  # dimension of an observation
    P = 4  # number of singular values

    # make the data
    mpi_comm = MPI.COMM_WORLD
    if mpi_comm.Get_rank() == 0:
        data = test_function(N, M, 0.0001)

        e0 = time.time()
        u, s, vt = isvd(data.astype('float64'), P)
        e1 = time.time()
        print('Standard svd takes %12.3f seconds' % (e1 - e0))
        print('\n')

        e0 = time.time()
        # let's quickly check whether this works in the batched setup
        bSVD = batched_SVD(data, 1000, P, randomize=True)
        ub, sb, vbt = bSVD.go_svd()
        e1 = time.time()
        print(
            'Single core batched version with data in memory takes %12.3f seconds'
            % (e1 - e0))

        # write this to an hdf5 file
        print('Creating h5 data file')
        f = h5py.File('test_data.h5', 'w')
        dset = f.create_dataset("data", data=data, dtype='float32')
        f.close()
        del data

        # now we read the data
        f = h5py.File('test_data.h5', 'r')
        data = f['/data']
        print(data.shape)
        e0 = time.time()
        # let's quickly check whether this works in the batched setup
        bSVD = batched_SVD(data, 1000, P, randomize=False)
        ub, sb, vbt = bSVD.go_svd()
        e1 = time.time()
        print(
            'Single core batched version while reading data from HDF5 takes %12.3f seconds'
            % (e1 - e0))
        print(sb)
        f.close()

    mpi_comm.Barrier()
    f = h5py.File('test_data.h5', 'r')  # let's see if every rank can open the file read-only
    data = f['data']
    e2 = time.time()
    print('build it')
    mSVD = parallel_SVD_MPI(data, 1000, P, mpi_comm, False, None)
    print('go')
    u, s, v, sel = mSVD.go_svd()
    e3 = time.time()
    print(
        'Batched version on core %i takes %12.3f seconds, while reading data from hdf5'
        % (mpi_comm.rank, e3 - e2))
    print(s, mpi_comm.rank)

    # let's read the data into memory
    print('Reading data into memory')
    data2 = f['data'][:]  # Dataset.value is deprecated in h5py; slice instead
    print(data2.shape)
    print('done')

    mpi_comm.Barrier()
    e2 = time.time()
    mSVD = parallel_SVD_MPI(data2, 500, P, mpi_comm, True)
    u, s, v, sel = mSVD.go_svd()
    e3 = time.time()
    print(
        'Batched version on core %i takes %12.3f seconds, with data in memory'
        % (mpi_comm.rank, e3 - e2))

    mpi_comm.Barrier()
    selection = np.arange(1000)
    print('Testing setup with an included selection array')
    # zero a row just outside the selection; it should not affect the result
    data2[1000, :] = 0.
    mSVD = parallel_SVD_MPI(data2,
                            50000,
                            P,
                            mpi_comm,
                            False,
                            selection=selection)
    up, sp, vp, order_split = mSVD.go_svd()
    uf, sf, vf = isvd(data2[0:1000, :].astype('float64'), P)
    assert np.mean((np.abs(sp - sf) / sf)) < 2e-2

    mpi_comm.Barrier()
    if mpi_comm.Get_rank() == 0:
        # now remove this file again
        print('Removing h5 data file')
        os.remove('test_data.h5')
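
A note on the HDF5 access pattern: the batched classes index rows with data[chunk, :], which works on an open h5py dataset as well as on an in-memory array, but h5py's fancy indexing requires the index list to be in increasing order. That is presumably why the HDF5-backed runs above use randomize=False. A minimal illustration, using the test_data.h5 file as created above:

import h5py
import numpy as np

with h5py.File('test_data.h5', 'r') as f:
    dset = f['data']
    rows = np.array([3, 17, 42])  # must be increasing for h5py fancy indexing
    block = dset[rows, :]  # reads only these rows from disk
    print(block.shape)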
Example #5
    def go_svd(self):
        """Do the SVD ihn each core, on several chunks.
        """

        partial_u = []
        partial_s = []
        partial_vt = []
        partial_bases = []

        rank_selection = self.rank_splits[self.mpi_rank]
        # split this rank's rows into chunks of at most roughly N_max rows
        N_chunks = int(len(rank_selection) / self.N_max) + 1
        chunks = np.array_split(rank_selection, N_chunks)
        u = None
        s = None
        vt = None
        this_rank_bases = None
        for chunk in tqdm(chunks, position=self.mpi_rank):
            partial_data = self.data[chunk, :]
            u, s, vt = isvd(partial_data.astype('float64'), self.k_singular)
            partial_u.append(u)
            partial_s.append(s)
            partial_vt.append(vt)

            partial_bases.append(np.diag(s).dot(vt))
        # now that we have the partial SVD results, we bring everything together
        if N_chunks > 1:
            all_bases = np.vstack(partial_bases)
            uc, sc, vct = isvd(all_bases.astype('float64'), self.k_singular)
            # now we need to pass this on to the root rank
            this_rank_bases = np.diag(sc).dot(vct)
        else:
            this_rank_bases = np.diag(s).dot(vt)
        self.mpi_comm.Barrier()

        # the lowercase gather returns a list of arrays on the root rank
        # (and None elsewhere), so no receive buffer needs to be preallocated
        gathered_bases = self.mpi_comm.gather(this_rank_bases, root=0)

        final_sg = np.zeros([self.k_singular], dtype='float32')
        final_vgt = np.zeros([self.k_singular, self.N_dim], dtype='float32')

        if self.mpi_rank == 0:
            gathered_bases = np.vstack(gathered_bases)
            ug, sg, vgt = isvd(gathered_bases.astype('float64'),
                               self.k_singular)
            # copy into the preallocated float32 buffers, so that every rank
            # hands an identically typed and sized buffer to Bcast
            final_sg[:] = sg
            final_vgt[:] = vgt
        # we now need to broadcast the sg and vgt matrices back to all ranks
        self.mpi_comm.Barrier()
        self.mpi_comm.Bcast(final_sg, root=0)
        self.mpi_comm.Barrier()
        self.mpi_comm.Bcast(final_vgt, root=0)
        self.mpi_comm.Barrier()
        # now that we have the final and best sigma and Vt, we need to go back to the data and
        # re-estimate the U matrices: from X = U S V^T it follows that U = X V S^{-1}
        inv_multi = final_vgt.transpose().dot(np.diag(1.0 / final_sg))
        chunks_of_u = []
        for chunk in chunks:
            partial_data = self.data[chunk, :]
            this_u = partial_data.dot(inv_multi)
            chunks_of_u.append(this_u)

        chunks_of_u = np.vstack(chunks_of_u)
        # we need to return this, including a placement array
        return chunks_of_u, final_sg, final_vgt, self.inv_order_split
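
The final step above follows from the factorization itself: if X = U S V^T with orthonormal V, then X V = U S and hence U = X V S^{-1}, which is what inv_multi implements per chunk. A standalone numpy check of that identity (illustrative names, not the repo's API):

import numpy as np

rng = np.random.default_rng(2)
k = 5
X = rng.standard_normal((60, k)) @ rng.standard_normal((k, 20))  # exactly rank k
u, s, vt = np.linalg.svd(X, full_matrices=False)
u_re = X @ vt[:k].T @ np.diag(1.0 / s[:k])  # U = X V S^{-1}
assert np.allclose(u_re @ np.diag(s[:k]) @ vt[:k], X)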