def get_data(seed=123456, nDocTotal=32, T=1000, **kwargs):
    ''' Generate several data sequences, returned as a bnpy data-object

    Args
    -------
    seed : integer seed for random number generator,
        used for actually *generating* the data
    nDocTotal : int, forwarded unchanged to get_X
        (presumably the number of sequences to generate -- confirm
        against get_X)
    T : int, forwarded unchanged to get_X
        (presumably the number of observations in each sequence --
        confirm against get_X)
    **kwargs : accepted so callers may pass extra options; ignored here

    Returns
    -------
    Data : bnpy GroupXData object built from the vertically stacked
        sequences, with the generating labels attached as TrueZ
    '''
    fullX, fullZ, doc_range = get_X(seed, nDocTotal, T)
    X = np.vstack(fullX)
    Z = np.asarray(fullZ)

    # Warn when the generated labels contain fewer than K distinct values.
    nUsedStates = len(np.unique(Z))
    if nUsedStates < K:
        # Parenthesized single-argument form prints the same text on
        # Python 2 and Python 3 (the bare print statement is a
        # SyntaxError on Python 3).
        print('WARNING: NOT ALL TRUE STATES USED IN GENERATED DATA')

    Data = GroupXData(X=X, doc_range=doc_range, TrueZ=Z)
    Data.name = get_short_name()
    Data.summary = get_data_info()
    return Data
def get_data(**kwargs):
    ''' Returns data from audio tracks

    When DATAFILE_MAT exists, the dataset is loaded from that file.
    Otherwise the 'gfccs' dataset of every group in '../tracks.h5' is
    read (groups without one are skipped), the per-track arrays are
    stacked row-wise, and the result is saved to DATAFILE_MAT.

    Args
    -------
    **kwargs : accepted so callers may pass extra options; ignored here

    Returns
    -------
    Data : bnpy GroupXData object, with one doc_range entry per track
        that provided a 'gfccs' dataset
    '''
    if os.path.exists(DATAFILE_MAT):
        Data = GroupXData.LoadFromFile(DATAFILE_MAT)
    else:
        obs = []
        doc_range = [0]
        count = 0
        with h5py.File('../tracks.h5', 'r') as tracks:
            for _track, grp in ProgressBar(tracks.items()):
                if 'gfccs' not in grp:
                    continue
                data = grp['gfccs']
                # doc_range holds cumulative row offsets, one per track.
                count += data.shape[0]
                doc_range.append(count)
                # Dataset.value was deprecated and then removed in h5py 3;
                # indexing with an empty tuple reads the whole dataset
                # and works on old and new h5py alike.
                obs.append(data[()].astype(np.float64))
        X = np.vstack(obs)
        Data = GroupXData(X=X, doc_range=doc_range)
        Data.save_to_mat(DATAFILE_MAT)
    Data.name = 'AudioCorpus'
    # NOTE: the counts in this summary are hard-coded, not computed from
    # the data that was actually loaded.
    Data.summary = 'Audio Corpus. obs=10.5M docs=559'
    return Data
def get_data(seed=8675309, nDocTotal=52, T=800, **kwargs):
    ''' Build the toy dataset and wrap it as a bnpy data-object

    Args
    -------
    seed : integer seed for random number generator,
        passed through to genToyData
    nDocTotal : passed through to genToyData
    T : passed through to genToyData
    **kwargs : accepted so callers may pass extra options; ignored here

    Returns
    -------
    Data : bnpy GroupXData object, with its name and summary
        attributes filled in
    '''
    obs, prevObs, labels, bounds = genToyData(
        seed=seed, nDocTotal=nDocTotal, T=T)
    dataset = GroupXData(
        X=obs, TrueZ=labels, Xprev=prevObs, doc_range=bounds)
    dataset.name = get_short_name()
    dataset.summary = get_data_info()
    return dataset
def get_data(seed=86758, seqLens=((3000, 3000, 3000, 3000, 500)), **kwargs):
    ''' Generate several data sequences, returned as a bnpy data-object

    Args
    -------
    seed : integer seed for random number generator,
        passed through to get_X
    seqLens : tuple of ints, passed through to get_X
        (presumably the length of each generated sequence -- confirm
        against get_X)
    **kwargs : accepted so callers may pass extra options; ignored here

    Returns
    -------
    Data : bnpy GroupXData object built from the vertically stacked
        sequences, with the generating labels attached as TrueZ
    '''
    sequences, labels, boundaries = get_X(seed, seqLens)

    # Convert the generator output into arrays before building the dataset.
    stackedObs = np.vstack(sequences)
    labelArr = np.asarray(labels)
    boundaryArr = np.asarray(boundaries)

    dataset = GroupXData(
        X=stackedObs, doc_range=boundaryArr, TrueZ=labelArr)
    dataset.name = get_short_name()
    dataset.summary = get_data_info()
    return dataset
def get_data(seed=DEFAULT_SEED, T=DEFAULT_LEN, **kwargs):
    ''' Generate toy data sequences, returned as a bnpy data-object

    Args
    -------
    seed : integer seed for random number generator,
        used for actually *generating* the data
    T : int
        number of observations in each sequence
    **kwargs : accepted so callers may pass extra options; ignored here

    Returns
    -------
    Data : bnpy GroupXData object, carrying X, Xprev, doc_range and the
        generating labels (as TrueZ) returned by get_X
    '''
    X, Xprev, Z, doc_range = get_X(seed, T)

    # Warn when the generated labels contain fewer than K distinct values.
    nUsedStates = len(np.unique(Z))
    if nUsedStates < K:
        # Parenthesized single-argument form prints the same text on
        # Python 2 and Python 3 (the bare print statement is a
        # SyntaxError on Python 3).
        print('WARNING: NOT ALL TRUE STATES USED IN GENERATED DATA')

    Data = GroupXData(X=X, Xprev=Xprev, doc_range=doc_range, TrueZ=Z)
    Data.name = get_short_name()
    Data.summary = get_data_info()
    return Data
def train_image_specific_topics(self, y, sigma, Niter=50, Kfresh=100,
                                pixelMask=None):
    ''' Fit extra clusters to one image's patches and merge them into the model

    Patches are extracted from y, clustered with Bregman k-means, turned
    into sufficient statistics, and added to the training statistics
    loaded from 'models/HDP/SS.dump'. The combined statistics are then
    used to update self.patchModel.

    Args
    -------
    y : image array. With pixelMask=None it is passed to im2col as-is
        (gray-scale case); otherwise it is indexed as y[:, :, c] for
        c in 0..2 (color case).
    sigma : scalar; sigma**2 is added to / subtracted from covariance
        diagonals (presumably the noise standard deviation -- confirm
        with callers)
    Niter : int, number of iterations passed to runKMeans_BregmanDiv
    Kfresh : int, number of clusters requested from k-means. The number
        actually added is recomputed from the largest label returned.
    pixelMask : optional array. Any patch containing a nonzero mask
        entry is discarded before clustering.

    Returns
    -------
    None. Updates self.patchModel in place and then calls
    self.calcGlobalParams().
    '''
    print('Training %d image-specific clusters...' % Kfresh)
    D, patchSize, GP = self.D, int(np.sqrt(self.D)), self.GP

    # gather fully observable patches
    if pixelMask is None:  # gray-scale image denoising
        v = im2col(y, patchSize)
    else:  # color image inpainting
        C = 3
        # keep only patches that contain no masked pixel
        patchMask = np.logical_not(
            np.any(im2col(pixelMask, patchSize), axis=0))
        v = np.hstack(
            [im2col(y[:, :, c], patchSize)[:, patchMask] for c in range(C)])
    # remove each patch's mean, then switch to one patch per row
    v -= np.mean(v, axis=0)
    v = v.T
    testData = GroupXData(X=v, doc_range=[0, len(v)], nDocTotal=1)
    testData.name = 'test_image_patches'

    # set up hyper-parameters and run Bregman k-means
    cached_B_name = 'models/HDP/B.mat'
    cachedStats = loadmat(cached_B_name)  # read the file once, not twice
    xBar = cachedStats['Cov']
    xBar2 = cachedStats['Cov2']
    tmp0 = (np.diag(xBar) + sigma**2)**2
    tmp1 = np.diag(xBar2) + 6 * np.diag(xBar) * sigma**2 + 3 * sigma**4
    nu = D + 3 + 2 * np.sum(tmp0) / np.sum(tmp1 - tmp0)
    B = (nu - D - 1) * (xBar + sigma**2 * np.eye(D))
    obsModel = ZeroMeanGaussObsModel(
        D=D, min_covar=1e-8, inferType='memoVB', B=B, nu=nu)
    Z, _, _ = runKMeans_BregmanDiv(
        testData.X, Kfresh, obsModel, Niter=Niter, assert_monotonic=False)

    # Shift the new labels so they follow the Korig existing components;
    # Kfresh becomes the number of labels actually spanned by k-means.
    Korig = self.K
    Kall = np.max(Z) + Korig + 1
    Kfresh = Kall - Korig
    Z += Korig

    # load SuffStats of training images
    trainSS = loadSuffStatBag('models/HDP/SS.dump')
    trainSS.insertEmptyComps(Kfresh)

    # construct SuffStats of the test image
    DocTopicCount = np.bincount(Z, minlength=int(Kall)).reshape((1, Kall))
    DocTopicCount = np.array(DocTopicCount, dtype=np.float64)
    # hard (one-hot) assignment of every patch to its k-means cluster
    resp = np.zeros((len(Z), Kall))
    resp[np.arange(len(Z)), Z] = 1.0
    testLP = dict(resp=resp, DocTopicCount=DocTopicCount)
    # split the remaining prior mass evenly over the Kfresh new
    # components plus one remainder term
    alphaPi0Rem = GP.alphaPi0Rem / (Kfresh + 1)
    alphaPi0 = np.hstack((GP.alphaPi0, alphaPi0Rem * np.ones(Kfresh)))
    testLP = updateLPGivenDocTopicCount(
        testLP, DocTopicCount, alphaPi0, alphaPi0Rem)
    testSS = self.patchModel.get_global_suff_stats(
        testData, testLP, doPrecompEntropy=1, doTrackTruncationGrowth=1)

    xxT = np.zeros((Kall, D, D))
    for k in range(Korig, Kall):
        idx = Z == k
        tmp = np.einsum('nd,ne->de', v[idx], v[idx])
        tmp -= testSS.N[k] * sigma**2 * np.eye(D)
        # tmp is symmetric, so use eigh: it returns real eigenvalues and
        # orthonormal eigenvectors, which is what makes the
        # vec * diag(val) * vec.T reconstruction below valid.
        val, vec = np.linalg.eigh(tmp)
        # floor the eigenvalues at EPS before rebuilding the matrix
        val[val < EPS] = EPS
        xxT[k] = np.dot(vec * val, vec.T)
    testSS.setField('xxT', xxT, dims=('K', 'D', 'D'))
    testSS.setUIDs(trainSS.uids)

    # combine training and test SS; update model parameters
    combinedSS = trainSS + testSS
    self.patchModel.update_global_params(combinedSS)
    self.calcGlobalParams()