def get_minibatch_iterator(seed=8675309, dataorderseed=0, nBatch=10,
                           nObsBatch=None, nObsTotal=25000, nLap=1,
                           startLap=0, **kwargs):
    ''' Generate toy data and wrap it in a bnpy MinibatchIterator.

    Args
    --------
    seed : integer seed for random number generator,
            used for actually *generating* the data
    dataorderseed : integer seed that determines
            (a) how data is divided into minibatches
            (b) order these minibatches are traversed
    nBatch : number of batches to divide the dataset into
    nObsBatch : number of observations per batch (None lets the
            iterator choose)
    nObsTotal : total number of observations to generate
    nLap : number of laps (full passes) through the data
    startLap : lap number at which iteration begins

    Returns
    -------
    bnpy MinibatchIterator object, with nObsTotal observations
        divided into nBatch batches
    '''
    X, TrueZ = get_X(seed, nObsTotal)
    # Consistency fix: attach TrueZ, as get_data() in this module does,
    # so true labels are available to downstream diagnostics.
    Data = XData(X=X, TrueZ=TrueZ)
    Data.summary = get_data_info()
    DataIterator = MinibatchIterator(Data, nBatch=nBatch,
                                     nObsBatch=nObsBatch,
                                     nLap=nLap, startLap=startLap,
                                     dataorderseed=dataorderseed)
    return DataIterator
def get_size_of_batch_from_file(self, filepath):
    ''' Count the units (documents or observations) stored in a batch file.

    Args
    -------
    filepath : string path to one batch file on disk

    Returns
    -------
    size : int
        number of lines for .ldac files, nDoc for GroupXData datasets,
        nObs for XData datasets

    Raises
    -------
    ValueError if neither the extension nor self.dataset_type is recognized
    '''
    # Bug fix: removed a stray trailing triple-quote that broke the
    # original source.
    if filepath.endswith('.ldac'):
        # ldac format stores one document per line
        with open(filepath, 'r') as f:
            return len(f.readlines())
    elif self.dataset_type == 'GroupXData':
        # NOTE(review): this calls XData.read_file but reads .nDoc, which
        # suggests a grouped dataset; confirm XData.read_file returns an
        # object exposing nDoc, or whether GroupXData was intended here.
        return XData.read_file(filepath).nDoc
    elif self.dataset_type == 'XData':
        return XData.read_file(filepath).nObs
    else:
        raise ValueError('Unrecognized file type: ' + filepath)
def get_data(filepath='', **kwargs):
    ''' Load a dense matrix from a plain-text file as a bnpy XData object.

    Args
    -------
    filepath : string path to a whitespace-delimited text file
        readable by np.loadtxt

    Returns
    -------
    Data : bnpy XData object, with nObsTotal observations
    '''
    # Bug fix: filepath was documented but missing from the signature,
    # so callers' filepath kwarg was swallowed by **kwargs and the
    # np.loadtxt call below raised NameError.
    X = np.loadtxt(filepath, dtype=np.float64)
    Data = XData(X=X)
    Data.name = get_short_name()
    Data.summary = get_data_info()
    return Data
def MakeData(self, K=3, Nperclass=1000):
    ''' Attach a simple toy dataset to self for testing.

    Draws Nperclass points from each of K well-separated 2D Gaussians
    with identity covariance and means
      mu0 = [-10, -10]
      mu1 = [0, 0]
      mu2 = [10, 10]
    Also builds self.TrueResp, the one-hot true responsibility matrix.
    '''
    rng = np.random.RandomState(8675309)
    # Component means: rows [-10 -10], [0 0], [10 10]
    Mu = np.zeros((3, 2))
    Mu[0] -= 10
    Mu[2] += 10
    # Identity covariance shared by every component
    Sigma = np.eye(2)
    self.TrueResp = np.zeros((K * Nperclass, K))
    chunks = []
    for k in range(K):
        chunks.append(mvnrand(Mu[k], Sigma, Nperclass, rng))
        rowStart = k * Nperclass
        self.TrueResp[rowStart:rowStart + Nperclass, k] = 1.0
    self.Data = XData(X=np.vstack(chunks))
    self.Mu = Mu
    assert np.abs(self.TrueResp.sum() - self.Data.nObs) < 1e-2
def get_data(seed=8675309, nObsTotal=25000, **kwargs):
    ''' Generate the toy dataset and wrap it as a bnpy XData object.

    Args
    -------
    seed : integer seed for random number generator,
            used for actually *generating* the data
    nObsTotal : total number of observations for the dataset.

    Returns
    -------
    Data : bnpy XData object, with nObsTotal observations
    '''
    obs, labels = get_X(seed, nObsTotal)
    dataset = XData(X=obs, TrueZ=labels)
    dataset.summary = get_data_info()
    return dataset
def setUp(self):
    ''' Build random 100x3 data and a VB MixModel/ZMGauss model for tests. '''
    self.Data = XData(X=np.random.randn(100, 3))
    allocParams = dict(alpha0=1.0)
    obsParams = dict(dF=5, ECovMat='eye', sF=1.0)
    self.hmodel = HModel.CreateEntireModel(
        'VB', 'MixModel', 'ZMGauss', allocParams, obsParams, self.Data)
def MakeData(self, K=3, Nperclass=1000):
    ''' Create simple toy XData with K components, add as attribute to self

    Simple 3 component data with eye covar and distinct, well-sep means
      mu0 = [-10, -10]
      mu1 = [0, 0]
      mu2 = [10, 10]
    Also builds
      self.TrueResp : (N, K) one-hot true responsibilities
      self.DupResp : (N, 2K) responsibilities splitting each true
          cluster's mass evenly between components k and K + k
    '''
    Mu = np.zeros((3, 2))
    Mu[0] = Mu[0] - 10
    Mu[2] = Mu[2] + 10
    Sigma = np.eye(2)
    self.TrueResp = np.zeros((K * Nperclass, K))
    self.DupResp = np.zeros((K * Nperclass, 2 * K))
    Xlist = list()
    for k in range(K):
        Xcur = mvnrand(Mu[k], Sigma, Nperclass)
        Xlist.append(Xcur)
        self.TrueResp[k * Nperclass:(k + 1) * Nperclass, k] = 1.0
        start = k * Nperclass
        stop = (k + 1) * Nperclass
        # Bug fix: 0.5 * (start + stop) produced a float, which is
        # invalid as a slice index under modern numpy/Python.
        # Integer floor division gives the same midpoint for even counts.
        half = (start + stop) // 2
        self.DupResp[start:half, k] = 1.0
        self.DupResp[half:stop, K + k] = 1.0
    X = np.vstack(Xlist)
    self.Data = XData(X=X)
    self.Mu = Mu
    assert np.abs(self.TrueResp.sum() - self.Data.nObs) < 1e-2
    assert np.abs(self.DupResp.sum() - self.Data.nObs) < 1e-2
def generateRandomBinaryDataFromMixture(**kwargs):
    ''' Draw a toy binary dataset from a K-component Bernoulli mixture.

    Keyword args (seed, nObsTotal, ...) override module-level Defaults.

    Returns
    -------
    Data : XData with X an (nObsTotal, D) binary matrix stored as float64,
        and TrueParams dict holding beta, phi, Z.
    '''
    # Fill any unspecified kwargs from the module-level Defaults dict
    for key in Defaults:
        if key not in kwargs:
            kwargs[key] = Defaults[key]
    # Per-cluster Bernoulli success probabilities, shape (K, D)
    phi = makePhi(**kwargs)
    nObsTotal = kwargs['nObsTotal']
    PRNG = np.random.RandomState(kwargs['seed'])
    # Select number of observations from each cluster
    # (K and D below are module-level constants)
    beta = 1.0 / K * np.ones(K)  # uniform mixture weights
    if nObsTotal < 2 * K:
        # force examples from every cluster
        # NOTE(review): ceil(nObsTotal / K) * K can exceed nObsTotal when
        # nObsTotal is not a multiple of K, overrunning the preallocated
        # X/Z buffers below -- confirm callers avoid that combination.
        nPerCluster = np.ceil(nObsTotal / K) * np.ones(K)
    else:
        # Multinomial draw of cluster sizes, flattened to a 1D vector
        nPerCluster = as1D(PRNG.multinomial(nObsTotal, beta, size=1))
    nPerCluster = np.int32(nPerCluster)
    # Generate data from each cluster!
    X = np.zeros((nObsTotal, D))
    Z = np.zeros(nObsTotal, dtype=np.int32)
    start = 0
    for k in xrange(K):
        stop = start + nPerCluster[k]
        # Threshold uniform draws against phi[k] to get 0/1 observations
        X[start:stop] = np.float64(
            PRNG.rand(nPerCluster[k], D) < phi[k, :][np.newaxis, :])
        Z[start:stop] = k
        start = stop
    TrueParams = dict()
    TrueParams['beta'] = beta
    TrueParams['phi'] = phi
    TrueParams['Z'] = Z
    return XData(X, TrueParams=TrueParams)
def setUp(self):
    ''' Build random 100x3 data and an EM MixModel/ZMGauss model for tests. '''
    self.Data = XData(X=np.random.randn(100, 3))
    allocParams = dict(alpha0=1.0)
    obsParams = dict(min_covar=1e-9)
    self.hmodel = HModel.CreateEntireModel(
        'EM', 'MixModel', 'ZMGauss', allocParams, obsParams, self.Data)
def get_data(seed=8675309, nObsTotal=25000, **kwargs):
    ''' Create and return toy dataset from 1D standard normal distribution.

    Args
    -------
    seed : integer seed for random number generator,
            used for actually *generating* the data
    nObsTotal : total number of observations for the dataset.

    Returns
    -------
    Data : bnpy XData object, with nObsTotal observations
    '''
    obs, labels = generate_data(seed, nObsTotal)
    dataset = XData(X=obs, TrueZ=labels)
    dataset.name = get_short_name()
    dataset.summary = get_data_info()
    return dataset
def MakeData(self, K=5, Nperclass=1000):
    ''' Attach toy 2D data to self: K tight clusters centered at (k, k). '''
    rng = np.random.RandomState(867)
    noiseScale = 1e-3
    blocks = [k + noiseScale * rng.randn(Nperclass, 2) for k in range(K)]
    self.Data = XData(np.vstack(blocks))
def get_data(seed=8675309, nObsTotal=None, nPerState=20, **kwargs):
    ''' Build the toy dataset and return it as a bnpy XData object.

    Args
    -------
    seed : integer seed for random number generator,
            used for actually *generating* the data
    nObsTotal : total number of observations for the dataset; when given,
            it overrides nPerState.
    nPerState : number of observations drawn per state.

    Returns
    -------
    Data : bnpy XData object, with nObsTotal observations
    '''
    if nObsTotal is not None:
        # Derive per-state count from the requested total
        # (K is a module-level constant)
        nPerState = nObsTotal // K
    obs, labels = genToyData(seed=seed, nPerState=nPerState)
    dataset = XData(X=obs, TrueZ=labels)
    dataset.name = get_short_name()
    dataset.summary = get_data_info()
    return dataset
def setUp(self):
    ''' Build seeded random data and an initialized EM MixModel/ZMGauss. '''
    rng = np.random.RandomState(867)
    self.Data = XData(X=rng.randn(100, 2))
    allocParams = dict(alpha0=1.0)
    obsParams = dict(min_covar=1e-9)
    self.hmodel = HModel.CreateEntireModel(
        'EM', 'MixModel', 'ZMGauss', allocParams, obsParams, self.Data)
    # Initialize global params from 5 randomly chosen examples
    self.hmodel.init_global_params(
        self.Data, initname='randexamples', seed=0, K=5)
def MakeData(self, nObsC=200):
    ''' Sample nObsC points per component of self.obsM, attach as self.Data.

    No-op when self.obsM is None.
    '''
    if self.obsM is None:
        return
    np.random.seed(505)
    samples = []
    for k in range(self.obsM.K):
        covar = self.obsM.get_covar_mat_for_comp(k)
        mean = self.obsM.get_mean_for_comp(k)
        samples.append(mvnrand(mean, covar, nObsC))
    self.nObsC = nObsC
    self.Data = XData(X=np.vstack(samples))
def init_global_params(hmodel, Data, initname='randexamples', seed=0, K=0,
                       **kwargs):
    ''' Initialize hmodel's global parameters from Data via one M-step.

    Builds a (nObs, K) responsibility matrix `resp` according to the
    chosen initname, then runs hmodel's suff-stat + update cycle.
    Mutates hmodel in place; returns None.

    NOTE(review): an unrecognized initname leaves `resp` unbound and
    raises NameError at the LP construction below -- confirm callers
    only pass the four names handled here.
    '''
    PRNG = np.random.RandomState(seed)
    X = Data.X  # NOTE(review): unused local; body reads Data.X directly
    if initname == 'randexamples':
        ''' Choose K items uniformly at random from the Data
            then component params by M-step given those single items
        '''
        resp = np.zeros((Data.nObs, K))
        permIDs = PRNG.permutation(Data.nObs).tolist()
        # One-hot: item permIDs[k] solely responsible for component k
        for k in xrange(K):
            resp[permIDs[k], k] = 1.0
    elif initname == 'randexamplesbydist':
        ''' Choose K items from the Data,
            selecting the first at random,
            then subsequently proportional to euclidean distance to the closest item
        '''
        objID = discrete_single_draw(np.ones(Data.nObs), PRNG)
        chosenObjIDs = list([objID])
        minDistVec = np.inf * np.ones(Data.nObs)
        for k in range(1, K):
            # Distance from every item to the most recently chosen item
            curDistVec = np.sum((Data.X - Data.X[objID])**2, axis=1)
            # Track distance to the *closest* chosen item so far
            minDistVec = np.minimum(minDistVec, curDistVec)
            objID = discrete_single_draw(minDistVec, PRNG)
            chosenObjIDs.append(objID)
        resp = np.zeros((Data.nObs, K))
        for k in xrange(K):
            resp[chosenObjIDs[k], k] = 1.0
    elif initname == 'randsoftpartition':
        ''' Randomly assign all data items some mass in each of K components
            then create component params by M-step given that soft partition
        '''
        resp = PRNG.rand(Data.nObs, K)
        # Normalize rows so each item's responsibilities sum to one
        resp = resp / np.sum(resp, axis=1)[:, np.newaxis]
    elif initname == 'randomnaive':
        ''' Generate K "fake" examples from the diagonalized data covariance,
            creating params by assigning each "fake" example to a component.
        '''
        # Per-dimension standard deviations of the observed data
        Sig = np.sqrt(np.diag(np.cov(Data.X.T)))
        Xfake = Sig * PRNG.randn(K, Data.dim)
        # Rebind Data: suff stats below are computed on the fake examples
        Data = XData(Xfake)
        resp = np.eye(K)
    LP = dict(resp=resp)
    SS = hmodel.get_global_suff_stats(Data, LP)
    hmodel.update_global_params(SS)
def setUp(self): oDict = dict(inferType='EM', min_covar=0.0) compDictList = [dict(Sigma=np.eye(2)), dict(Sigma=100 * np.eye(2))] obsPrior = None self.obsM = ZMGaussObsModel.CreateWithAllComps(oDict, obsPrior, compDictList) self.C = 10 XList = list() for k in range(self.obsM.K): Xcur = np.random.randn(self.C, 2) sig = np.sqrt(self.obsM.comp[k].Sigma[0, 0]) XList.append(sig * Xcur) self.Data = XData(X=np.vstack(XList)) print self.Data.X
def loadDataForBatch(self, batchID):
    ''' Load the data assigned to a particular batch

    Returns
    -------
    Dchunk : bnpy.data.DataObj subclass
    '''
    path = self.datafileList[batchID]
    # ldac files always hold bag-of-words data, regardless of dataset_type
    if path.endswith('.ldac'):
        return BagOfWordsData.LoadFromFile_ldac(path, **self.DataInfo)
    if self.dataset_type == 'GroupXData':
        return GroupXData.LoadFromFile(path, **self.DataInfo)
    return XData.read_file(path, **self.DataInfo)
def loadDataForSlice(filepath='', dataset_type='', **kwargs):
    """ Return data object loaded from specific file.

    Keyword args
    ------------
    workerID
    nWorkers
    """
    # ldac files always hold bag-of-words data, regardless of dataset_type
    if filepath.endswith('.ldac'):
        return BagOfWordsData.LoadFromFile_ldac(filepath, **kwargs)
    if dataset_type == 'GroupXData':
        return GroupXData.LoadFromFile(filepath, **kwargs)
    return XData.LoadFromFile(filepath, **kwargs)
def MakeData(self, N=10000):
    ''' Attach toy data from 4 rotated anisotropic Gaussians to self.

    Each component is the same elongated covariance rotated by a
    multiple of pi/4; self.trueresp holds one-hot true assignments.
    '''
    base = np.asarray([[100, 0], [0, 0.01]])
    Sigma = np.zeros((2, 2, 4))
    Sigma[:, :, 0] = base
    for j in range(1, 4):
        Sigma[:, :, j] = RandUtil.rotateCovMat(base, theta=j * np.pi / 4)
    self.Sigma = Sigma
    Xparts = []
    Rparts = []
    for k in range(Sigma.shape[2]):
        Xparts.append(RandUtil.mvnrand([0, 0], Sigma[:, :, k], N))
        onehot = np.zeros((N, 4))
        onehot[:, k] = 1.0
        Rparts.append(onehot)
    self.Data = XData(X=np.vstack(Xparts))
    self.trueresp = np.vstack(Rparts)
def setUp(self, K=7):
    ''' Create random data, and a K component MixModel to go with it

    Call this original model "hmodel".
    We copy hmodel into "modelB", and then save to file via save_model()
    '''
    self.K = K
    rng = np.random.RandomState(867)
    self.Data = XData(X=rng.randn(100, 2))
    self.hmodel = HModel.CreateEntireModel(
        'EM', 'MixModel', 'ZMGauss',
        dict(alpha0=1.0), dict(min_covar=1e-9), self.Data)
    modelB = self.hmodel.copy()
    modelB.init_global_params(
        self.Data, initname='randexamples', seed=0, K=self.K)
    ModelWriter.save_model(modelB, '/tmp/', 'Test')
    self.modelB = modelB
def get_data(seed=8675309, nObsTotal=25000, **kwargs):
    ''' Generate the toy dataset and wrap it as a summarized XData object. '''
    obs, labels = generateData(seed, nObsTotal)
    dataset = XData(X=obs, TrueZ=labels)
    dataset.summary = get_data_info()
    return dataset
def get_minibatch_iterator(seed=8675309, nObsTotal=25000, **kwargs):
    ''' Generate toy data and wrap it in a MinibatchIterator.

    Extra keyword args (nBatch, nLap, ...) pass through to
    the MinibatchIterator constructor.
    '''
    obs, labels = generateData(seed, nObsTotal)
    iterator = MinibatchIterator(XData(X=obs, TrueZ=labels), **kwargs)
    iterator.summary = get_data_info()
    return iterator
def setUp(self):
    ''' Build random 100x3 data and a 10-batch, 10-lap minibatch iterator. '''
    self.Data = XData(X=np.random.randn(100, 3))
    self.DataIterator = MinibatchIterator(self.Data, nBatch=10, nLap=10)
def get_data(seed=8675309, nObsTotal=25000, **kwargs):
    ''' Generate the toy dataset; return as a named, summarized XData. '''
    obs, labels = generateData(seed, nObsTotal)
    dataset = XData(X=obs, TrueZ=labels)
    dataset.name = get_short_name()
    dataset.summary = get_data_info()
    return dataset