def _loadWikicorp():
    DIR = os.path.dirname(os.path.realpath(__file__)).split('vae_sparse')[0] + 'vae_sparse/optvaedatasets/wikicorp'
    if not os.path.exists(DIR):
        os.mkdir(DIR)
    locations = {}
    locations['WestburyLab.wikicorp.201004.txt.bz2'] = 'http://nlp.stanford.edu/data/WestburyLab.wikicorp.201004.txt.bz2'
    if not os.path.exists(DIR + '/WestburyLab.wikicorp.201004.txt'):
        _getData(DIR, locations)
    if not os.path.exists(DIR + '/data-learning.h5') or not os.path.exists(DIR + '/misc-learning.pkl'):
        raise ValueError('Run ProcessWikicorp(-learning/-large).ipynb to setup data.h5')
    else:
        dataset = {}
        dataset['data_type'] = 'bow'
        dataset['train'] = loadSparseHDF5('train', DIR + '/data-learning.h5')
        dataset['valid'] = loadSparseHDF5('valid', DIR + '/data-learning.h5')
        dataset['test'] = loadSparseHDF5('test', DIR + '/data-learning.h5')
        dataset['dim_observations'] = dataset['train'].shape[1]
        objs = readPickle(DIR + '/misc-learning.pkl', nobjects=3)
        dataset['mapIdx'] = objs[0]
        dataset['vocabulary'] = objs[1]
        dataset['vocabulary_singular'] = objs[2]
        return dataset
def load_lorenz():
    curdir = os.path.dirname(os.path.realpath(__file__))
    fname = curdir + '/lorenz.pkl'
    if os.path.exists(fname):
        print 'Reloading dataset from ' + fname
        return readPickle(fname)[0]
    state = generate_data()
    shufidx = np.random.permutation(N)
    ntrain = int(0.7*N)
    ntest = int(0.2*N)
    nval = N - ntrain - ntest
    indices = {}
    indices['train'] = shufidx[:ntrain]
    indices['valid'] = shufidx[ntrain:ntrain+nval]
    indices['test'] = shufidx[ntrain+nval:]
    dataset = {}
    dataset['dim_observations'] = D
    for k in ['train', 'valid', 'test']:
        dataset[k] = {}
        dataset[k]['tensor'] = state[indices[k]]
        # dataset[k]['tensor_Z'] = []
        dataset[k]['mask'] = np.ones_like(dataset[k]['tensor'][:, :, 0])
    dataset['data_type'] = 'real'
    standardize_data(dataset)
    savePickle([dataset], curdir + '/lorenz.pkl')
    print 'Saving...'
    return dataset
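# Usage sketch (illustrative, not part of the original loaders): load_lorenz() returns a
# dict with 'train'/'valid'/'test' splits, each holding a trajectory 'tensor' and an
# all-ones 'mask', plus 'dim_observations' and 'data_type' as built above.
lorenz_dataset = load_lorenz()
for split in ['train', 'valid', 'test']:
    print split, lorenz_dataset[split]['tensor'].shape, lorenz_dataset[split]['mask'].shape
print 'dim_observations: ', lorenz_dataset['dim_observations']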
def sample_from_model():
    # Sampling from the model
    DIR = './chkpt/lorenz/'
    prefix = 'DMM_lr-0_0008-dh-40-ds-' + str(DIM_STOCHASTIC) + '-nl-relu-bs-200-ep-1000-rs-80-rd-0_1-infm-R-tl-2-el-2-ar-2_0-use_p-approx-rc-lstm-uid'
    pfile = os.path.join(DIR, prefix + '-config.pkl')
    params = readPickle(pfile, quiet=True)[0]
    EP = '-EP990'
    reloadFile = os.path.join(DIR, prefix + EP + '-params.npz')
    print 'Model parameters in: ', reloadFile
    params['validate_only'] = True
    dmm_reloaded = DMM(params, paramFile=pfile, reloadFile=reloadFile)
    # (mu, logcov): parameters of the emission distribution
    # zvec: samples in the latent space
    (mu, logcov), zvec = DMM_evaluate.sample(dmm_reloaded, T=40, nsamples=10)
    print("mu.shape=" + str(mu.shape))
    print("zvec.shape=" + str(zvec.shape))
    visualize_data(mu, n_samples=10)
    plt.title("Mean trajectories")
    fig, axlist_x = plt.subplots(3, 1, figsize=(8, 10))
    nsamples = 10
    T = zvec.shape[1]
    SNUM = range(nsamples)
    for idx, ax in enumerate(axlist_x.ravel()):
        z = zvec[SNUM, :, idx]
        ax.plot(np.arange(T), np.transpose(z), '-*', label='Dim' + str(idx))
        ax.legend()
        ax.set_xlabel('Time')
    plt.suptitle('3 dimensional samples of latent space')
    plt.show()
def reload_dmm(prefix, dmm_dir='./dmm_models/', ep='-EP100'):
    pfile = os.path.join(dmm_dir, prefix + '-config.pkl')
    print 'Hyperparameters in: ', pfile, 'Found: ', os.path.exists(pfile)
    params = readPickle(pfile, quiet=True)[0]
    reload_file = os.path.join(dmm_dir, prefix + ep + '-params.npz')
    print 'Model parameters in: ', reload_file
    # Don't load the training functions for the model since it is time consuming
    params['validate_only'] = True
    dmm_reloaded = DMM(params, paramFile=pfile, reloadFile=reload_file)
    return dmm_reloaded
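# Usage sketch (illustrative): reload_dmm only needs the unique-id prefix shared by the
# saved '-config.pkl' and '-params.npz' files. The prefix, directory, and epoch below are
# placeholders (the prefix mirrors one used elsewhere in these notebooks); substitute the
# values produced by your own training run.
example_prefix = 'DMM_lr-0_0008-dh-40-ds-2-nl-relu-bs-200-ep-40-rs-80-rd-0_1-infm-R-tl-2-el-2-ar-2_0-use_p-approx-rc-lstm-uid'
dmm = reload_dmm(example_prefix, dmm_dir='./dmm_models/', ep='-EP100')
print 'Reloaded model type: ', type(dmm)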
def reload_model(epoch):
    DIR = './chkpt/lorenz_coupled/'
    pfile = DIR + 'DMM_lr-0_0008-dh-40-ds-9-nl-relu-bs-100-ep-10000-rs-80-rd-0_1-infm-R-tl-2-el-2-ar-2_0-use_p-approx-rc-lstm-uid-config.pkl'
    # The hyperparameters are saved in a pickle file - let's load them here
    hparam = readPickle(pfile, quiet=True)[0]
    reloadFile = DIR + 'DMM_lr-0_0008-dh-40-ds-9-nl-relu-bs-100-ep-10000-rs-80-rd-0_1-infm-R-tl-2-el-2-ar-2_0-use_p-approx-rc-lstm-uid-EP' + str(epoch) + '-params.npz'
    # Don't load the training functions for the model since it is time consuming
    hparam['validate_only'] = True
    dmm = DMM(hparam, paramFile=pfile, reloadFile=reloadFile)
    return dmm
def set_parameters(dataset):
    # Sets up the training hyperparameters
    params = readPickle('../default.pkl')[0]
    for k in params:
        print k, '\t', params[k]
    params['data_type'] = dataset['data_type']
    params['dim_observations'] = dataset['dim_observations']
    # The dataset is small; let's change some of the default parameters and the unique ID
    params['dim_stochastic'] = DIM_STOCHASTIC
    params['dim_hidden'] = 40
    params['rnn_size'] = 80
    params['epochs'] = EPOCHS
    params['batch_size'] = 200
    params['unique_id'] = params['unique_id'].replace('ds-100', 'ds-' + str(DIM_STOCHASTIC)).replace('dh-200', 'dh-40').replace('rs-600', 'rs-80')
    params['unique_id'] = params['unique_id'].replace('ep-2000', 'ep-1000').replace('bs-20', 'bs-200')
    # Create a temporary directory to save checkpoints
    params['savedir'] = params['savedir'] + '/lorenz/'
    os.system('mkdir -p ' + params['savedir'])
    return params
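# Usage sketch (illustrative, assuming load_lorenz() from the dataset loaders above is in
# scope and DIM_STOCHASTIC/EPOCHS are set at module level): set_parameters takes the dataset
# dict and returns the hyperparameter dict that is later handed to DMM.
dataset = load_lorenz()
params = set_parameters(dataset)
print 'unique_id: ', params['unique_id']
print 'checkpoints will be saved to: ', params['savedir']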
def load_lorenz_coupled():
    n = 1000
    delta_t = 0.025
    t0 = 0.0
    T = 100.0
    curdir = os.path.dirname(os.path.realpath(__file__))
    fname = curdir + '/lorenz_coupled.pkl'
    if os.path.exists(fname):
        print 'Reloading dataset from ' + fname
        return readPickle(fname)[0]
    print(fname + " not found. Generating data from scratch.")
    state = generate_data(n=n, t0=t0, T=T, delta_t=delta_t)
    shufidx = np.random.permutation(n)
    ntrain = int(0.7*n)
    ntest = int(0.2*n)
    nval = n - ntrain - ntest
    indices = {}
    indices['train'] = shufidx[:ntrain]
    indices['valid'] = shufidx[ntrain:ntrain+nval]
    indices['test'] = shufidx[ntrain+nval:]
    dataset = {}
    dataset['dim_observations'] = state.shape[2]
    for k in ['train', 'valid', 'test']:
        dataset[k] = {}
        dataset[k]['tensor'] = state[indices[k]]
        # dataset[k]['tensor_Z'] = []
        dataset[k]['mask'] = np.ones_like(dataset[k]['tensor'][:, :, 0])
    dataset['data_type'] = 'real'
    standardize_data(dataset)
    savePickle([dataset], fname)
    print 'Saving...'
    return dataset
def loadSyntheticData():
    curdir = os.path.dirname(os.path.realpath(__file__))
    if os.path.exists(curdir + '/synthetic.pkl'):
        print 'Reloading...'
        return readPickle(curdir + '/synthetic.pkl')[0]
    """ Generate simple synthetic data """
    params = {}
    params['train'] = 10000
    params['valid'] = 1000
    params['test'] = 1000
    N = np.sum([params[k] for k in params])
    T = 10
    DIM_OBS = 3
    np.random.seed(0)
    data, data_Z = simulateLinearData(N, T, DIM_OBS)
    """ Split into train/valid/test """
    shufidx = np.random.permutation(N)
    indices = {}
    indices['train'] = shufidx[:params['train']]
    indices['valid'] = shufidx[params['train']:params['train'] + params['valid']]
    indices['test'] = shufidx[params['train'] + params['valid']:]
    """ Setup dataset to return """
    dataset = {}
    for k in ['train', 'valid', 'test']:
        dataset[k] = {}
        dataset[k]['tensor'] = data[indices[k]]
        dataset[k]['tensor_Z'] = data_Z[indices[k]]
        dataset[k]['mask'] = np.ones_like(dataset[k]['tensor'][:, :, 0])
    dataset['data_type'] = 'real'
    dataset['dim_observations'] = 3
    savePickle([dataset], curdir + '/synthetic.pkl')
    print 'Saving...'
    return dataset
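# Usage sketch (illustrative): the synthetic dataset pairs each observation 'tensor' with
# the latent 'tensor_Z' that generated it, which is handy for sanity-checking inference.
# Given the split sizes above, the train tensor is expected to have shape (10000, 10, 3).
synthetic = loadSyntheticData()
print 'train tensor:   ', synthetic['train']['tensor'].shape
print 'train tensor_Z: ', synthetic['train']['tensor_Z'].shape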
def _loadWikicorpSubset(kval):
    assert kval in [1000, 5000, 10000, 15000], 'Bad value: ' + str(kval)
    DIR = os.path.dirname(os.path.realpath(__file__)).split('vae_sparse')[0] + 'vae_sparse/optvaedatasets/wikicorp'
    assert type(kval) is int, 'Expecting kval as int'
    h5file = DIR + '/data-learning.h5'
    pklfile = DIR + '/misc-learning.pkl'
    assert os.path.exists(h5file) and os.path.exists(pklfile), 'Please run _loadWikicorp to generate data.h5'
    # Load Wikicorp raw data
    train = loadSparseHDF5('train', h5file).tocsc()
    valid = loadSparseHDF5('valid', h5file).tocsc()
    test = loadSparseHDF5('test', h5file).tocsc()
    objs = readPickle(DIR + '/misc-learning.pkl', nobjects=3)
    vocabulary = objs[1]
    # Keep the kval most frequent features (by total count in the training set)
    sumfeats = np.array(train.sum(0)).squeeze()
    idx_sort = np.argsort(sumfeats)
    idx_to_keep = idx_sort[-kval:]
    dset = {}
    dset['vocabulary'] = [vocabulary[idx] for idx in idx_to_keep.squeeze().tolist()]
    train_tmp = train[:, idx_to_keep].tocsr()
    valid_tmp = valid[:, idx_to_keep].tocsr()
    test_tmp = test[:, idx_to_keep].tocsr()
    # Use only documents with more than five words in them
    train_cts_idx = np.where(np.array(train_tmp.sum(1)).squeeze() > 5)[0]
    valid_cts_idx = np.where(np.array(valid_tmp.sum(1)).squeeze() > 5)[0]
    test_cts_idx = np.where(np.array(test_tmp.sum(1)).squeeze() > 5)[0]
    dset['train'] = train_tmp[train_cts_idx]
    dset['valid'] = valid_tmp[valid_cts_idx]
    dset['test'] = test_tmp[test_cts_idx]
    dset['dim_observations'] = dset['train'].shape[1]
    dset['data_type'] = 'bow'
    return dset
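# Usage sketch (illustrative): restrict the Wikicorp bag-of-words data to the 10000 most
# frequent terms; kval must be one of 1000/5000/10000/15000 per the assert above, and
# _loadWikicorp must have been run first to create the HDF5/pickle files.
wiki_10k = _loadWikicorpSubset(10000)
print 'vocabulary size:    ', len(wiki_10k['vocabulary'])
print 'train matrix shape: ', wiki_10k['train'].shape
print 'dim_observations:   ', wiki_10k['dim_observations']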
for k in opt_params:
    print k, opt_params[k].shape

import glob, os, sys, time
sys.path.append('../')
from utils.misc import getConfigFile, readPickle, displayTime

start_time = time.time()
from model_th.dmm import DMM
import model_th.learning as DMM_learn
import model_th.evaluate as DMM_evaluate
displayTime('importing DMM', start_time, time.time())

# This is the prefix we will use
DIR = './chkpt-ipython/'
prefix = 'DMM_lr-0_0008-dh-40-ds-2-nl-relu-bs-200-ep-40-rs-80-rd-0_1-infm-R-tl-2-el-2-ar-2_0-use_p-approx-rc-lstm-uid'
pfile = os.path.join(DIR, prefix + '-config.pkl')
print 'Hyperparameters in: ', pfile, 'Found: ', os.path.exists(pfile)

# The hyperparameters are saved in a pickle file - let's load them here
params = readPickle(pfile, quiet=True)[0]

# Reload the model at epoch 30
EP = '-EP30'
# File containing the model parameters
reloadFile = os.path.join(DIR, prefix + EP + '-params.npz')
print 'Model parameters in: ', reloadFile

# Don't load the training functions for the model since it is time consuming
params['validate_only'] = True
dmm_reloaded = DMM(params, paramFile=pfile, reloadFile=reloadFile)
print 'Model Reloaded: ', type(dmm_reloaded)
def _cifar10():
    # CIFAR 10 Dataset
    DIR = getPYDIR() + '/datasets/cifar10'
    if not os.path.exists(DIR):
        os.system('mkdir -p ' + DIR)
    savef = os.path.join(DIR, 'cifar-10-python.tar.gz')
    if not os.path.exists(savef):
        urllib.urlretrieve('https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz', savef)
    cifarfile = os.path.join(DIR, 'cifar10.h5')
    if not os.path.exists(cifarfile):
        print 'Extracting CIFAR...'
        tf = tarfile.open(savef)
        tf.extractall(DIR)
        tf.close()
        EDIR = DIR + '/cifar-10-batches-py/'
        h5f = h5py.File(cifarfile, mode='w')
        traindatalist, trainlabellist = [], []
        for k in range(5):
            print k,
            hmap = readPickle(EDIR + '/data_batch_' + str(k + 1))[0]
            traindatalist.append(hmap['data'])
            trainlabellist.append(hmap['labels'])
        alltrainx = np.concatenate(traindatalist, axis=0)
        alltrainy = np.concatenate(trainlabellist, axis=0)
        np.random.seed(1)
        idxlist = np.random.permutation(alltrainx.shape[0])
        val_idx = idxlist[:int(0.1 * alltrainx.shape[0])]
        tr_idx = idxlist[int(0.1 * alltrainx.shape[0]):]
        TRAINX = alltrainx[tr_idx]
        TRAINY = alltrainy[tr_idx]
        VALIDX = alltrainx[val_idx]
        VALIDY = alltrainy[val_idx]
        h5f.create_dataset('train', data=reshapeMatrix(TRAINX))
        h5f.create_dataset('valid', data=reshapeMatrix(VALIDX))
        h5f.create_dataset('train_y', data=TRAINY)
        h5f.create_dataset('valid_y', data=VALIDY)
        hmap = readPickle(EDIR + '/test_batch')[0]
        h5f.create_dataset('test', data=reshapeMatrix(hmap['data']))
        h5f.create_dataset('test_y', data=np.array(hmap['labels']))
        hmap = readPickle(EDIR + '/batches.meta')[0]
        h5f.create_dataset('label_names', data=np.array(hmap['label_names'], dtype='|S10'))
        h5f.close()
        print '\nCreated CIFAR h5 file'
    else:
        print 'Found CIFAR h5 file'
    h5f = h5py.File(cifarfile, mode='r')
    dataset = {}
    dataset['label_names'] = h5f['label_names'].value
    dataset['train'] = h5f['train'].value
    dataset['test'] = h5f['test'].value
    dataset['valid'] = h5f['valid'].value
    dataset['train_y'] = h5f['train_y'].value
    dataset['test_y'] = h5f['test_y'].value
    dataset['valid_y'] = h5f['valid_y'].value
    dataset['dim_observations'] = np.prod(dataset['train'].shape[1:])
    dataset['num_channels'] = dataset['train'].shape[-3]
    dataset['dim_h'] = dataset['train'].shape[-2]
    dataset['dim_w'] = dataset['train'].shape[-1]
    dataset['data_type'] = 'image'
    h5f.close()
    return dataset
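# Usage sketch (illustrative): _cifar10() downloads and extracts CIFAR-10 on first use and
# thereafter reloads the cached cifar10.h5 file; the last three axes of the image arrays
# are channels, height, and width, matching the fields set above.
cifar = _cifar10()
print 'train: ', cifar['train'].shape, ' labels: ', cifar['train_y'].shape
print 'channels/h/w: ', cifar['num_channels'], cifar['dim_h'], cifar['dim_w']
print 'label names: ', cifar['label_names']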
def getTF(dataset):
    tfidf = TfidfTransformer(norm=None)
    tfidf.fit(dataset['train'])
    return tfidf.idf_

additional_attrs_wiki['idf'] = getTF(dataset_wiki)

for mname in MODELS_TO_USE:
    if 'wikicorp' not in mname:
        continue
    print 'Model: ', mname
    pfile = models[mname].split('uid')[0] + 'uid-config.pkl'
    params = readPickle(pfile)[0]
    suffix = '-EP' + str(epochval[mname]) + '-params.npz'
    rfile = models[mname] + suffix
    assert os.path.exists(rfile), 'not found'
    params['EVALUATION'] = True
    if 'wikicorp' in mname:
        vae = VAE(params, paramFile=pfile, reloadFile=rfile, additional_attrs=additional_attrs_wiki)
        trainData = dataset_wiki['train']
        validData = dataset_wiki['valid']
        Ntrain = trainData.shape[0]
        np.random.seed(0)
        trainData = trainData[np.random.permutation(Ntrain)[:100000]]
    else:
Read saved results to obtain tables
"""
import glob
from utils.misc import loadHDF5, getConfigFile, readPickle

DIR = './'
datasets = ['20newsgroups', 'rcv2']
result_best = {}
result_last = {}
for dataset in datasets:
    print 'Dataset: ', dataset
    for f in glob.glob(DIR + '/chkpt-' + dataset + '-*/*evaluate.h5'):
        if 'mnist' in f or 'qvary' in f:
            continue
        dataset = f.split('chkpt-')[1].split('-')[0]
        opt_type = f.split('chkpt-')[1].split('-')[1].split('/')[0]
        params = readPickle(getConfigFile(f.replace('evaluate.h5', '')))[0]
        dset = loadHDF5(f)
        if params['opt_type'] == 'finopt':
            name = str(params['p_layers']) + '-M' + str(params['n_steps']) + '-' + params['input_type']
        else:
            name = str(params['p_layers']) + '-M1-' + params['input_type']
        result_best[params['dataset'] + '-' + name] = (dset['perp_0_best'], dset['perp_f_best'])
        result_last[params['dataset'] + '-' + name] = (dset['test_perp_0'], dset['test_perp_f'])
        print name, (dset['perp_0_best'], dset['perp_f_best'])

for dataset in datasets:
    for itype in ['normalize', 'tfidf']:
        for layer in ['0', '2']:
            for M in ['M1', 'M100']:
print 'Dimensionality of the observations: ', dataset['dim_observations']
print 'Data type of features: ', dataset['data_type']
for dtype in ['train', 'valid', 'test']:
    print 'dtype: ', dtype, ' type(dataset[dtype]): ', type(dataset[dtype])
    print [(k, type(dataset[dtype][k]), dataset[dtype][k].shape) for k in dataset[dtype]]
    print '--------\n'

start_time = time.time()
from model_th.dmm import DMM
import model_th.learning as DMM_learn
import model_th.evaluate as DMM_evaluate
displayTime('importing DMM', start_time, time.time())

params = readPickle('../default.pkl')[0]
for k in params:
    print k, '\t', params[k]
params['data_type'] = dataset['data_type']
params['dim_observations'] = dataset['dim_observations']

# The dataset is small; let's change some of the default parameters and the unique ID
params['dim_stochastic'] = 2
params['dim_hidden'] = 40
params['rnn_size'] = 80
params['epochs'] = 40
params['batch_size'] = 200
params['unique_id'] = params['unique_id'].replace('ds-100', 'ds-2').replace('dh-200', 'dh-40').replace('rs-600', 'rs-80')
params['unique_id'] = params['unique_id'].replace('ep-2000', 'ep-40').replace('bs-20', 'bs-200')
""" Compile aggregate timing information for different runs """ import glob,os import numpy as np from utils.misc import loadHDF5,getConfigFile,readPickle for f in glob.glob('./chkpt-*/*-EP50-stats.h5'): code = 'ds'+os.path.basename(f).split('-ql')[0].split('ds')[1] if 'finopt' in f: code = 'finopt-'+code else: code = 'none-'+code data = loadHDF5(f) params=readPickle(getConfigFile(f))[0] code = params['dataset']+'-'+code runtimes = [] for edata in data['batch_time']: if int(edata[0])%params['savefreq']==0: continue else: runtimes.append(edata[1]) print code, np.mean(runtimes)