Example #1
def _loadWikicorp():
    DIR = os.path.dirname(os.path.realpath(__file__)).split(
        'vae_sparse')[0] + 'vae_sparse/optvaedatasets/wikicorp'
    if not os.path.exists(DIR):
        os.mkdir(DIR)
    locations = {}
    locations[
        'WestburyLab.wikicorp.201004.txt.bz2'] = 'http://nlp.stanford.edu/data/WestburyLab.wikicorp.201004.txt.bz2'
    if not os.path.exists(DIR + '/WestburyLab.wikicorp.201004.txt'):
        _getData(DIR, locations)
    if not os.path.exists(DIR + '/data-learning.h5') or not os.path.exists(
            DIR + '/misc-learning.pkl'):
        raise ValueError('Run ProcessWikicorp(-learning/-large).ipynb to setup data.h5')
    else:
        dataset = {}
        dataset['data_type'] = 'bow'
        dataset['train'] = loadSparseHDF5('train', DIR + '/data-learning.h5')
        dataset['valid'] = loadSparseHDF5('valid', DIR + '/data-learning.h5')
        dataset['test'] = loadSparseHDF5('test', DIR + '/data-learning.h5')
        dataset['dim_observations'] = dataset['train'].shape[1]
        objs = readPickle(DIR + '/misc-learning.pkl', nobjects=3)
        dataset['mapIdx'] = objs[0]
        dataset['vocabulary'] = objs[1]
        dataset['vocabulary_singular'] = objs[2]
        return dataset
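loadSparseHDF5 and readPickle are helper functions from the repository's utilities; only their call signatures are visible above. As a point of reference, a minimal, hypothetical loader for a sparse matrix stored as its CSR components in an HDF5 group could look like this (the repository's actual on-disk layout may differ):

# Hypothetical sketch only: assumes each split ('train', 'valid', 'test') is an
# HDF5 group holding the CSR components 'data', 'indices', 'indptr', 'shape'.
import h5py
import scipy.sparse as sp

def load_sparse_csr(name, h5path):
    with h5py.File(h5path, 'r') as f:
        g = f[name]
        return sp.csr_matrix((g['data'][:], g['indices'][:], g['indptr'][:]),
                             shape=tuple(g['shape'][:]))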
Example #2
def load_lorenz():
    curdir = os.path.dirname(os.path.realpath(__file__))
    fname = curdir+'/lorenz.pkl'
    if os.path.exists(fname):
        print 'Reloading dataset from ' + fname
        return readPickle(fname)[0]

    state = generate_data()

    shufidx = np.random.permutation(N)
    ntrain = int(0.7*N)
    ntest = int(0.2*N)
    nval = N - ntrain - ntest
    indices = {}
    indices['train'] = shufidx[:ntrain]
    indices['valid'] = shufidx[ntrain:ntrain+nval]
    indices['test'] = shufidx[ntrain+nval:]

    dataset = {}
    dataset['dim_observations'] = D
    for k in ['train', 'valid', 'test']:
        dataset[k] = {}
        dataset[k]['tensor'] = state[indices[k]]
        # dataset[k]['tensor_Z'] = []
        dataset[k]['mask'] = np.ones_like(dataset[k]['tensor'][:, :, 0])
    dataset['data_type'] = 'real'

    standardize_data(dataset)
    savePickle([dataset], curdir+'/lorenz.pkl')
    print 'Saving...'
    return dataset
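standardize_data is defined elsewhere in the repository. Assuming it z-scores every observation dimension using statistics computed on the training split only, a minimal sketch would be:

# Hypothetical sketch of a standardize_data-style helper; the real function
# may normalize differently or store the statistics in the dataset dict.
import numpy as np

def standardize_inplace(dataset, eps=1e-6):
    train = dataset['train']['tensor']              # shape (N, T, D)
    flat = train.reshape(-1, train.shape[-1])
    mean, std = flat.mean(axis=0), flat.std(axis=0) + eps
    for split in ['train', 'valid', 'test']:
        dataset[split]['tensor'] = (dataset[split]['tensor'] - mean) / std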
Example #3
def sample_from_model():
    # Sampling from the model
    DIR = './chkpt/lorenz/'
    prefix = 'DMM_lr-0_0008-dh-40-ds-'+str(DIM_STOCHASTIC)+'-nl-relu-bs-200-ep-1000-rs-80-rd-0_1-infm-R-tl-2-el-2-ar-2_0-use_p-approx-rc-lstm-uid'
    pfile = os.path.join(DIR, prefix + '-config.pkl')
    params = readPickle(pfile, quiet=True)[0]
    EP = '-EP990'
    reloadFile = os.path.join(DIR, prefix + EP + '-params.npz')
    print 'Model parameters in: ', reloadFile
    params['validate_only'] = True
    dmm_reloaded = DMM(params, paramFile=pfile, reloadFile=reloadFile)

    # (mu, logcov): parameters of emission distributions
    # zvec: samples in the latent space
    (mu, logcov), zvec = DMM_evaluate.sample(dmm_reloaded, T=40, nsamples=10)

    print("mu.shape=" + str(mu.shape))
    print("zvec.shape=" + str(mu.shape))

    visualize_data(mu, n_samples=10)
    plt.title("Mean trajectories")


    fig, axlist_x = plt.subplots(3, 1, figsize=(8, 10))
    nsamples = 10
    T = zvec.shape[1]
    SNUM = range(nsamples)
    for idx, ax in enumerate(axlist_x.ravel()):
        z = zvec[SNUM, :, idx]
        ax.plot(np.arange(T), np.transpose(z), '-*', label='Dim' + str(idx))
        ax.legend()
    ax.set_xlabel('Time')
    plt.suptitle('3 dimensional samples of latent space')
    plt.show()
Example #4
def reload_dmm(prefix, dmm_dir='./dmm_models/', ep='-EP100'):
    pfile = os.path.join(dmm_dir, prefix + '-config.pkl')
    print 'Hyperparameters in: ', pfile, 'Found: ', os.path.exists(pfile)
    params = readPickle(pfile, quiet=True)[0]

    reload_file = os.path.join(dmm_dir, prefix + ep + '-params.npz')
    print 'Model parameters in: ', reload_file

    # Don't load the training functions for the model since it's time-consuming
    params['validate_only'] = True
    dmm_reloaded = DMM(params, paramFile=pfile, reloadFile=reload_file)
    return dmm_reloaded
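A minimal usage sketch (the prefix below is a placeholder, and the matching -config.pkl and -EP100-params.npz files must already exist; DMM_evaluate.sample is used as in Example #3):

# Hypothetical usage: reload a checkpoint and draw a few sample trajectories.
dmm = reload_dmm('DMM_<your-unique-id>', dmm_dir='./dmm_models/', ep='-EP100')
(mu, logcov), zvec = DMM_evaluate.sample(dmm, T=25, nsamples=5)
print(mu.shape, zvec.shape)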
Example #5
def reload_model(epoch):
    DIR = './chkpt/lorenz_coupled/'
    pfile = DIR + 'DMM_lr-0_0008-dh-40-ds-9-nl-relu-bs-100-ep-10000-rs-80-rd-0_1-infm-R-tl-2-el-2-ar-2_0-use_p-approx-rc-lstm-uid-config.pkl'

    # The hyperparameters are saved in a pickle file - let's load them here
    hparam = readPickle(pfile, quiet=True)[0]
    reloadFile = DIR + 'DMM_lr-0_0008-dh-40-ds-9-nl-relu-bs-100-ep-10000-rs-80-rd-0_1-infm-R-tl-2-el-2-ar-2_0-use_p-approx-rc-lstm-uid-EP' + str(epoch) + '-params.npz'

    # Don't load the training functions for the model since it's time-consuming
    hparam['validate_only'] = True
    dmm = DMM(hparam, paramFile=pfile, reloadFile=reloadFile)
    return dmm
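The config and parameter filenames share the same long unique-ID prefix. A small helper (a sketch, not part of the repository) keeps the two paths in sync instead of repeating the string:

# Hypothetical helper: derive both checkpoint paths from one prefix and epoch.
import os

def checkpoint_paths(chkpt_dir, prefix, epoch):
    pfile = os.path.join(chkpt_dir, prefix + '-config.pkl')
    rfile = os.path.join(chkpt_dir, prefix + '-EP' + str(epoch) + '-params.npz')
    return pfile, rfile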
Example #6
def set_parameters(dataset):
    # sets up parameters
    params = readPickle('../default.pkl')[0]
    for k in params:
        print k, '\t', params[k]
    params['data_type'] = dataset['data_type']
    params['dim_observations'] = dataset['dim_observations']

    # The dataset is small, let's change some of the default parameters and the unique ID
    params['dim_stochastic'] = DIM_STOCHASTIC
    params['dim_hidden'] = 40
    params['rnn_size'] = 80
    params['epochs'] = EPOCHS
    params['batch_size'] = 200
    params['unique_id'] = params['unique_id'].replace('ds-100', 'ds-'+str(DIM_STOCHASTIC)).replace('dh-200','dh-40').replace('rs-600','rs-80')
    params['unique_id'] = params['unique_id'].replace('ep-2000','ep-1000').replace('bs-20','bs-200')

    # Create a temporary directory to save checkpoints
    params['savedir'] = params['savedir']+'/lorenz/'
    os.system('mkdir -p '+ params['savedir'])

    return params
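set_parameters relies on the module-level constants DIM_STOCHASTIC and EPOCHS. A usage sketch on top of the Lorenz loader from Example #2 (the constant values are assumptions taken from the unique IDs shown in the other examples):

# Hypothetical usage of set_parameters with assumed module-level constants.
DIM_STOCHASTIC = 2
EPOCHS = 1000
dataset = load_lorenz()
params = set_parameters(dataset)
print(params['unique_id'])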
Example #7
def load_lorenz_coupled():
    n = 1000
    delta_t = 0.025
    t0 = 0.0
    T = 100.0

    curdir = os.path.dirname(os.path.realpath(__file__))
    fname = curdir + '/lorenz_coupled.pkl'

    if os.path.exists(fname):
        print 'Reloading dataset from ' + fname
        return readPickle(fname)[0]

    print(fname + " not found. Generating data from scratch.")

    state = generate_data(n=n, t0=t0, T=T, delta_t=delta_t)

    shufidx = np.random.permutation(n)
    ntrain = int(0.7*n)
    ntest = int(0.2*n)
    nval = n - ntrain - ntest
    indices = {}
    indices['train'] = shufidx[:ntrain]
    indices['valid'] = shufidx[ntrain:ntrain+nval]
    indices['test'] = shufidx[ntrain+nval:]

    dataset = {}
    dataset['dim_observations'] = state.shape[2]
    for k in ['train', 'valid', 'test']:
        dataset[k] = {}
        dataset[k]['tensor'] = state[indices[k]]
        # dataset[k]['tensor_Z'] = []
        dataset[k]['mask'] = np.ones_like(dataset[k]['tensor'][:, :, 0])
    dataset['data_type'] = 'real'

    standardize_data(dataset)
    savePickle([dataset], fname)
    print 'Saving...'
    return dataset
Example #8
def loadSyntheticData():
    curdir = os.path.dirname(os.path.realpath(__file__))
    if os.path.exists(curdir + '/synthetic.pkl'):
        print 'Reloading...'
        return readPickle(curdir + '/synthetic.pkl')[0]
    """ Generate simple synthetic data """
    params = {}
    params['train'] = 10000
    params['valid'] = 1000
    params['test'] = 1000
    N = np.sum([params[k] for k in params])
    T = 10
    DIM_OBS = 3
    np.random.seed(0)
    data, data_Z = simulateLinearData(N, T, DIM_OBS)
    """
    Split into train/valid/test
    """
    shufidx = np.random.permutation(N)
    indices = {}
    indices['train'] = shufidx[:params['train']]
    indices['valid'] = shufidx[params['train']:params['train'] +
                               params['valid']]
    indices['test'] = shufidx[params['train'] + params['valid']:]
    """
    Setup dataset to return
    """
    dataset = {}
    for k in ['train', 'valid', 'test']:
        dataset[k] = {}
        dataset[k]['tensor'] = data[indices[k]]
        dataset[k]['tensor_Z'] = data_Z[indices[k]]
        dataset[k]['mask'] = np.ones_like(dataset[k]['tensor'][:, :, 0])
    dataset['data_type'] = 'real'
    dataset['dim_observations'] = 3
    savePickle([dataset], curdir + '/synthetic.pkl')
    print 'Saving...'
    return dataset
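simulateLinearData is defined elsewhere in the repository. A hypothetical stand-in that matches the shapes used above, i.e. a linear-Gaussian state-space model returning observations of shape (N, T, DIM_OBS) together with the latent trajectories:

# Hypothetical sketch; the repository's simulateLinearData may use different
# dynamics, dimensionalities, and noise scales.
import numpy as np

def simulate_linear_data(N, T, dim_obs, dim_z=2, seed=0):
    rng = np.random.RandomState(seed)
    A = 0.9 * np.eye(dim_z)                 # latent transition matrix
    C = rng.randn(dim_z, dim_obs)           # emission matrix
    Z = np.zeros((N, T, dim_z))
    X = np.zeros((N, T, dim_obs))
    z = rng.randn(N, dim_z)
    for t in range(T):
        z = z.dot(A) + 0.1 * rng.randn(N, dim_z)
        Z[:, t] = z
        X[:, t] = z.dot(C) + 0.1 * rng.randn(N, dim_obs)
    return X, Z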
Example #9
def _loadWikicorpSubset(kval):
    assert kval in [1000, 5000, 10000, 15000], 'Bad value: ' + str(kval)
    DIR = os.path.dirname(os.path.realpath(__file__)).split(
        'vae_sparse')[0] + 'vae_sparse/optvaedatasets/wikicorp'
    assert type(kval) is int, 'Expecting kval as int'
    h5file = DIR + '/data-learning.h5'
    pklfile = DIR + '/misc-learning.pkl'
    assert os.path.exists(h5file) and os.path.exists(
        pklfile), 'Please run _loadWikicorp to generate data.h5'
    #Load Wikicorp raw data
    train = loadSparseHDF5('train', h5file).tocsc()
    valid = loadSparseHDF5('valid', h5file).tocsc()
    test = loadSparseHDF5('test', h5file).tocsc()
    objs = readPickle(DIR + '/misc-learning.pkl', nobjects=3)
    vocabulary = objs[1]

    sumfeats = np.array(train.sum(0)).squeeze()
    idx_sort = np.argsort(sumfeats)
    idx_to_keep = idx_sort[-kval:]
    dset = {}
    dset['vocabulary'] = [
        vocabulary[idx] for idx in idx_to_keep.squeeze().tolist()
    ]
    train_tmp = train[:, idx_to_keep].tocsr()
    valid_tmp = valid[:, idx_to_keep].tocsr()
    test_tmp = test[:, idx_to_keep].tocsr()
    #Keep documents with more than five words in them
    train_cts_idx = np.where(np.array(train_tmp.sum(1)).squeeze() > 5)[0]
    valid_cts_idx = np.where(np.array(valid_tmp.sum(1)).squeeze() > 5)[0]
    test_cts_idx = np.where(np.array(test_tmp.sum(1)).squeeze() > 5)[0]
    dset['train'] = train_tmp[train_cts_idx]
    dset['valid'] = valid_tmp[valid_cts_idx]
    dset['test'] = test_tmp[test_cts_idx]
    dset['dim_observations'] = dset['train'].shape[1]
    dset['data_type'] = 'bow'
    return dset
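A usage sketch (assumes _loadWikicorp has already been run, so data-learning.h5 and misc-learning.pkl exist):

# Hypothetical usage: build the 10,000-word subset and inspect its shape.
dset = _loadWikicorpSubset(10000)
print(dset['train'].shape, len(dset['vocabulary']), dset['dim_observations'])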
Example #10
for k in opt_params:
    print k, opt_params[k].shape

import glob, os, sys, time
sys.path.append('../')
from utils.misc import getConfigFile, readPickle, displayTime
start_time = time.time()
from model_th.dmm import DMM
import model_th.learning as DMM_learn
import model_th.evaluate as DMM_evaluate
displayTime('importing DMM', start_time, time.time())

#This is the prefix we will use
DIR = './chkpt-ipython/'
prefix = 'DMM_lr-0_0008-dh-40-ds-2-nl-relu-bs-200-ep-40-rs-80-rd-0_1-infm-R-tl-2-el-2-ar-2_0-use_p-approx-rc-lstm-uid'
pfile = os.path.join(DIR, prefix + '-config.pkl')
print 'Hyperparameters in: ', pfile, 'Found: ', os.path.exists(pfile)

#The hyperparameters are saved in a pickle file - let's load them here
params = readPickle(pfile, quiet=True)[0]

#Reload the model at Epoch 30
EP = '-EP30'
#File containing model parameters
reloadFile = os.path.join(DIR, prefix + EP + '-params.npz')
print 'Model parameters in: ', reloadFile
#Don't load the training functions for the model since it's time-consuming
params['validate_only'] = True
dmm_reloaded = DMM(params, paramFile=pfile, reloadFile=reloadFile)

print 'Model Reloaded: ', type(dmm_reloaded)
Example #11
def _cifar10():
    #CIFAR 10 Dataset
    DIR = getPYDIR() + '/datasets/cifar10'
    if not os.path.exists(DIR):
        os.system('mkdir -p ' + DIR)
    savef = os.path.join(DIR, 'cifar-10-python.tar.gz')
    if not os.path.exists(savef):
        urllib.urlretrieve(
            'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz', savef)
    cifarfile = os.path.join(DIR, 'cifar10.h5')
    if not os.path.exists(cifarfile):
        print 'Extracting CIFAR...'
        tf = tarfile.open(savef)
        tf.extractall(DIR)
        tf.close()
        EDIR = DIR + '/cifar-10-batches-py/'
        h5f = h5py.File(cifarfile, mode='w')
        traindatalist, trainlabellist = [], []
        for k in range(5):
            print k,
            hmap = readPickle(EDIR + '/data_batch_' + str(k + 1))[0]
            traindatalist.append(hmap['data'])
            trainlabellist.append(hmap['labels'])
        alltrainx = np.concatenate(traindatalist, axis=0)
        alltrainy = np.concatenate(trainlabellist, axis=0)
        np.random.seed(1)
        idxlist = np.random.permutation(alltrainx.shape[0])
        val_idx = idxlist[:int(0.1 * alltrainx.shape[0])]
        tr_idx = idxlist[int(0.1 * alltrainx.shape[0]):]
        TRAINX = alltrainx[tr_idx]
        TRAINY = alltrainy[tr_idx]
        VALIDX = alltrainx[val_idx]
        VALIDY = alltrainy[val_idx]
        h5f.create_dataset('train', data=reshapeMatrix(TRAINX))
        h5f.create_dataset('valid', data=reshapeMatrix(VALIDX))
        h5f.create_dataset('train_y', data=TRAINY)
        h5f.create_dataset('valid_y', data=VALIDY)
        hmap = readPickle(EDIR + '/test_batch')[0]
        h5f.create_dataset('test', data=reshapeMatrix(hmap['data']))
        h5f.create_dataset('test_y', data=np.array(hmap['labels']))
        hmap = readPickle(EDIR + '/batches.meta')[0]
        h5f.create_dataset('label_names',
                           data=np.array(hmap['label_names'], dtype='|S10'))
        h5f.close()
        print '\nCreated CIFAR h5 file'
    else:
        print 'Found CIFAR h5 file'
    h5f = h5py.File(cifarfile, mode='r')
    dataset = {}
    dataset['label_names'] = h5f['label_names'].value
    dataset['train'] = h5f['train'].value
    dataset['test'] = h5f['test'].value
    dataset['valid'] = h5f['valid'].value
    dataset['train_y'] = h5f['train_y'].value
    dataset['test_y'] = h5f['test_y'].value
    dataset['valid_y'] = h5f['valid_y'].value
    dataset['dim_observations'] = np.prod(dataset['train'].shape[1:])
    dataset['num_channels'] = dataset['train'].shape[-3]
    dataset['dim_h'] = dataset['train'].shape[-2]
    dataset['dim_w'] = dataset['train'].shape[-1]
    dataset['data_type'] = 'image'
    h5f.close()
    return dataset
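reshapeMatrix is another repository helper. Since each raw CIFAR-10 row holds 3072 channel-major pixel values and the code above reads num_channels, dim_h and dim_w from axes -3, -2 and -1, a plausible sketch is:

# Hypothetical sketch: reshape flat CIFAR-10 rows (N, 3072) into (N, 3, 32, 32).
import numpy as np

def reshape_matrix(flat_rows):
    return np.asarray(flat_rows).reshape(-1, 3, 32, 32)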
Example #12
from sklearn.feature_extraction.text import TfidfTransformer


def getTF(dataset):
    tfidf = TfidfTransformer(norm=None)
    tfidf.fit(dataset['train'])
    return tfidf.idf_


additional_attrs_wiki['idf'] = getTF(dataset_wiki)

for mname in MODELS_TO_USE:
    if 'wikicorp' not in mname:
        continue
    print 'Model: ', mname
    pfile = models[mname].split('uid')[0] + 'uid-config.pkl'
    params = readPickle(pfile)[0]
    suffix = '-EP' + str(epochval[mname]) + '-params.npz'
    rfile = models[mname] + suffix
    assert os.path.exists(rfile), 'not found'
    params['EVALUATION'] = True
    if 'wikicorp' in mname:
        vae = VAE(params,
                  paramFile=pfile,
                  reloadFile=rfile,
                  additional_attrs=additional_attrs_wiki)
        trainData = dataset_wiki['train']
        validData = dataset_wiki['valid']
        Ntrain = trainData.shape[0]
        np.random.seed(0)
        trainData = trainData[np.random.permutation(Ntrain)[:100000]]
    else:
Example #13
"""
Read saved results to obtain tables
"""
import glob
from utils.misc import loadHDF5, getConfigFile, readPickle
DIR = './'
datasets = ['20newsgroups', 'rcv2']
result_best = {}
result_last = {}
for dataset in datasets:
    print 'Dataset: ', dataset
    for f in glob.glob(DIR + '/chkpt-' + dataset + '-*/*evaluate.h5'):
        if 'mnist' in f or 'qvary' in f:
            continue
        dataset = f.split('chkpt-')[1].split('-')[0]
        opt_type = f.split('chkpt-')[1].split('-')[1].split('/')[0]
        params = readPickle(getConfigFile(f.replace('evaluate.h5', '')))[0]
        dset = loadHDF5(f)
        if params['opt_type'] == 'finopt':
            name = str(params['p_layers']) + '-M' + str(
                params['n_steps']) + '-' + params['input_type']
        else:
            name = str(params['p_layers']) + '-M1-' + params['input_type']
        result_best[params['dataset'] + '-' + name] = (dset['perp_0_best'],
                                                       dset['perp_f_best'])
        result_last[params['dataset'] + '-' + name] = (dset['test_perp_0'],
                                                       dset['test_perp_f'])
        print name, (dset['perp_0_best'], dset['perp_f_best'])
for dataset in datasets:
    for itype in ['normalize', 'tfidf']:
        for layer in ['0', '2']:
            for M in ['M1', 'M100']:
Example #14
print 'Dimensionality of the observations: ', dataset['dim_observations']
print 'Data type of features:', dataset['data_type']
for dtype in ['train', 'valid', 'test']:
    print 'dtype: ', dtype, ' type(dataset[dtype]): ', type(dataset[dtype])
    print [(k, type(dataset[dtype][k]), dataset[dtype][k].shape)
           for k in dataset[dtype]]
    print '--------\n'

start_time = time.time()
from model_th.dmm import DMM
import model_th.learning as DMM_learn
import model_th.evaluate as DMM_evaluate
displayTime('importing DMM', start_time, time.time())

params = readPickle('../default.pkl')[0]
for k in params:
    print k, '\t', params[k]
params['data_type'] = dataset['data_type']
params['dim_observations'] = dataset['dim_observations']

#The dataset is small, let's change some of the default parameters and the unique ID
params['dim_stochastic'] = 2
params['dim_hidden'] = 40
params['rnn_size'] = 80
params['epochs'] = 40
params['batch_size'] = 200
params['unique_id'] = params['unique_id'].replace('ds-100', 'ds-2').replace(
    'dh-200', 'dh-40').replace('rs-600', 'rs-80')
params['unique_id'] = params['unique_id'].replace('ep-2000', 'ep-40').replace(
    'bs-20', 'bs-200')
Example #15
"""
Compile aggregate timing information for different runs
"""
import glob, os
import numpy as np
from utils.misc import loadHDF5, getConfigFile, readPickle

for f in glob.glob('./chkpt-*/*-EP50-stats.h5'):
    code = 'ds'+os.path.basename(f).split('-ql')[0].split('ds')[1]
    if 'finopt' in f:
        code = 'finopt-'+code
    else:
        code = 'none-'+code
    data = loadHDF5(f)
    params = readPickle(getConfigFile(f))[0]
    code = params['dataset']+'-'+code
    runtimes = []
    for edata in data['batch_time']:
        if int(edata[0]) % params['savefreq'] == 0:
            continue
        else:
            runtimes.append(edata[1])
    print code, np.mean(runtimes)
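All of the examples above revolve around readPickle and savePickle from utils.misc. Their behavior as used here (savePickle takes a list of objects; readPickle returns a list and accepts nobjects and quiet) is consistent with helpers along these lines, sketched as an assumption rather than the repository's actual code:

# Hypothetical sketch of readPickle/savePickle-style helpers.
import pickle

def save_pickle(objects, fname):
    with open(fname, 'wb') as f:
        for obj in objects:                          # one record per object
            pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)

def read_pickle(fname, nobjects=1, quiet=False):
    if not quiet:
        print('Reading from ' + fname)
    with open(fname, 'rb') as f:
        return [pickle.load(f) for _ in range(nobjects)]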