Example #1
def _processMNIST():
    """
    Move to processed h5 file
    """
    pfile = getPYDIR() + '/datasets/mnist/proc-mnist.h5'
    DIR = os.path.dirname(pfile)
    if not os.path.exists(DIR):
        print 'Making: ', DIR
        os.mkdir(DIR)
    if not os.path.exists(os.path.join(DIR, 'mnist.pkl.gz')):
        print 'Downloading data'
        urllib.urlretrieve('http://deeplearning.net/data/mnist/mnist.pkl.gz',
                           os.path.join(DIR, 'mnist.pkl.gz'))
    if os.path.exists(pfile):
        print 'Found: ', pfile
        return pfile
    print 'Processing MNIST'
    f = gzip.open(os.path.join(DIR, 'mnist.pkl.gz'))
    train, valid, test = cPickle.load(f)
    f.close()
    h5f = h5py.File(pfile, mode='w')
    h5f.create_dataset('train', data=train[0])
    h5f.create_dataset('train_y', data=train[1])
    h5f.create_dataset('test', data=test[0])
    h5f.create_dataset('test_y', data=test[1])
    h5f.create_dataset('valid', data=valid[0])
    h5f.create_dataset('valid_y', data=valid[1])
    h5f.close()
    print 'Done processing MNIST'
    return pfile
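For reference, the processed file can be read back with h5py. A minimal sketch, assuming _processMNIST has already created proc-mnist.h5; the split names match the datasets written above, and loadProcessedMNIST is a hypothetical helper, not part of the original code:

import h5py

def loadProcessedMNIST(pfile):
    # Open the HDF5 file written by _processMNIST and copy each split into memory
    with h5py.File(pfile, mode='r') as h5f:
        return dict((key, h5f[key][:]) for key in
                    ['train', 'train_y', 'valid', 'valid_y', 'test', 'test_y'])

# Example usage:
# data = loadProcessedMNIST(_processMNIST())
# data['train'].shape is expected to be (50000, 784) for the deeplearning.net split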
Example #2
def _processPolyphonic(name):
    DIR = getPYDIR()+'/datasets'
    assert os.path.exists(DIR),'Directory does not exist: '+DIR
    polyphonicDIR = DIR+'/polyphonic/'
    if not os.path.exists(polyphonicDIR):
        os.mkdir(polyphonicDIR)
    fname = polyphonicDIR+'/'+name+'.h5'
    if os.path.exists(fname):
        print 'Found: ',fname
        return fname
    #Setup polyphonic datasets from scratch
    if not os.path.exists(os.path.join(polyphonicDIR,'piano.pkl')) or \
    not os.path.exists(os.path.join(polyphonicDIR,'musedata.pkl')) or \
    not os.path.exists(os.path.join(polyphonicDIR,'jsb.pkl')) or \
    not os.path.exists(os.path.join(polyphonicDIR,'nottingham.pkl')):
        print 'Downloading polyphonic pickled data into: ',polyphonicDIR
        os.system('wget '+'http://www-etud.iro.umontreal.ca/~boulanni/JSB%20Chorales.pickle -O '+os.path.join(polyphonicDIR,'jsb.pkl'))
        os.system('wget '+'http://www-etud.iro.umontreal.ca/~boulanni/Nottingham.pickle -O '+os.path.join(polyphonicDIR,'nottingham.pkl'))
        os.system('wget '+'http://www-etud.iro.umontreal.ca/~boulanni/MuseData.pickle -O '+os.path.join(polyphonicDIR,'musedata.pkl'))
        os.system('wget '+'http://www-etud.iro.umontreal.ca/~boulanni/Piano-midi.de.pickle -O '+os.path.join(polyphonicDIR,'piano.pkl'))
    else:
        print 'Polyphonic pickle files found'
    #Helper function to sort by sequence length
    def getSortedVersion(data,mask):
        idx         = np.argsort(mask.sum(1))
        return data[idx,:,:], mask[idx,:]
    for dset in ['jsb','piano','nottingham','musedata','jsb-sorted','piano-sorted','nottingham-sorted','musedata-sorted']:
        datafile = os.path.join(polyphonicDIR,dset.replace('-sorted','')+'.pkl')
        savefile = os.path.join(polyphonicDIR,dset+'.h5')
        print '\n\nDataset: ',dset
        print 'Reading from: ',datafile
        print 'Saving to: ',savefile
        MAX = 108
        MIN = 21
        DIM = MAX-MIN+1
        alldata = cPickle.load(file(datafile))
        h5file = h5py.File(savefile,mode='w')
        for dtype in ['train','valid','test']:
            print '----',dtype,'----'
            dataset = alldata[dtype]
            N_SEQUENCES = len(dataset)
            #First, find the maximum sequence length
            MAX_LENGTH  = max(len(dataset[k]) for k in range(N_SEQUENCES))
            print N_SEQUENCES,' sequences with max length ',MAX_LENGTH
            mask         = np.zeros((N_SEQUENCES, MAX_LENGTH))
            compileddata = np.zeros((N_SEQUENCES, DIM, MAX_LENGTH))
            for idxseq,seq in enumerate(dataset):
                T = len(seq)
                mask[idxseq,:T] = 1
                for t in range(T):
                    compileddata[idxseq,np.array(seq[t]).astype(int)-MIN,t]=1
            if 'sorted' in dset:
                compileddata,mask = getSortedVersion(compileddata,mask)
            #Save as bs x T x dim
            compileddata      = compileddata.swapaxes(1,2)
            print 'First and last lengths: ',mask.sum(1)[:5].tolist(),'....',mask.sum(1)[-5:].tolist()
            print 'Saving tensor data: ',compileddata.shape,' Mask: ',mask.shape
            h5file.create_dataset(dtype,data = compileddata)
            h5file.create_dataset('mask_'+dtype,data = mask)
        h5file.close()
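Each split written above is a binary piano-roll tensor of shape (N_SEQUENCES, MAX_LENGTH, 88) together with a 0/1 mask over timesteps (DIM = 108 - 21 + 1 = 88). A small sketch, assuming one of the files produced above (e.g. jsb.h5), of how the mask can be used to recover per-sequence lengths; loadPolyphonicSplit is a hypothetical helper:

import h5py
import numpy as np

def loadPolyphonicSplit(h5path, split='train'):
    # Read one split and its mask from an h5 file written by _processPolyphonic
    with h5py.File(h5path, mode='r') as h5f:
        data = h5f[split][:]             # (N, T, 88) binary piano-roll
        mask = h5f['mask_' + split][:]   # (N, T), 1 where the timestep is real
    lengths = mask.sum(axis=1).astype(int)   # true length of each sequence
    return data, mask, lengths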
Example #3
def _processBinarizedMNIST():
    """
    Move to processed h5 file
    """
    pfile = getPYDIR()+'/datasets/mnist/proc-bmnist.h5'
    DIR = os.path.dirname(pfile)
    if not os.path.exists(DIR):
        print 'Making: ',DIR
        os.mkdir(DIR)
    if not os.path.exists(os.path.join(DIR,'binarized_mnist_train.amat')):
        print 'Downloading binarized mnist'
        urllib.urlretrieve('http://www.cs.toronto.edu/~larocheh/public/datasets/binarized_mnist/binarized_mnist_train.amat',os.path.join(DIR,'binarized_mnist_train.amat'))
        urllib.urlretrieve('http://www.cs.toronto.edu/~larocheh/public/datasets/binarized_mnist/binarized_mnist_valid.amat',os.path.join(DIR,'binarized_mnist_valid.amat'))
        urllib.urlretrieve('http://www.cs.toronto.edu/~larocheh/public/datasets/binarized_mnist/binarized_mnist_test.amat',os.path.join(DIR,'binarized_mnist_test.amat'))
    if os.path.exists(pfile):
        print 'Found: ',pfile
        return pfile
    print 'Processing binarized MNIST'
    h5f   = h5py.File(pfile, mode='w')
    h5f.create_dataset('train',data = np.loadtxt(os.path.join(DIR,'binarized_mnist_train.amat')))
    h5f.create_dataset('test' ,data = np.loadtxt(os.path.join(DIR,'binarized_mnist_test.amat')))
    h5f.create_dataset('valid',data = np.loadtxt(os.path.join(DIR,'binarized_mnist_valid.amat')))
    h5f.close()
    print 'Done processing binarized MNIST'
    return pfile
Example #4
def runBaselines(DIR, name):
    DATADIR = getPYDIR() + '/datasets/synthetic'
    assert os.path.exists(
        DATADIR), DATADIR + ' not found; required to run baselines'
    if not os.path.exists(DIR):
        os.mkdir(DIR)

    for f in glob.glob(DATADIR + '/*.h5'):
        dataset = os.path.basename(f).replace('.h5', '')
        if name not in dataset:  #'synthetic' not in dataset or 'synthetic11' in dataset:
            continue
        print dataset, f
        if os.path.exists(DIR + '/' + dataset + '-baseline.h5'):
            print DIR + '/' + dataset + '-baseline.h5', ' found....not rerunning baseline'
            continue
        print 'Reading from: ', f, ' Saving to: ', DIR + '/' + dataset + '-baseline.h5'

        filterType = params_synthetic[dataset]['baseline']
        h5fout = h5py.File(DIR + '/' + dataset + '-baseline.h5', mode='w')
        h5f = h5py.File(f, mode='r')

        if int(dataset.split('synthetic')[1]) in [9, 10, 11]:
            print 'Running filter: ', filterType, ' on train'
            X = h5f['train'].value
            mus, cov, ll = runFilter(X, params_synthetic, dataset, filterType)
            h5fout.create_dataset('train_mu', data=mus)
            h5fout.create_dataset('train_cov', data=cov)
            h5fout.create_dataset('train_ll', data=np.array([ll]))
            rmse = np.sqrt(
                np.square(mus - h5f['train_z'].value.squeeze()).mean())
            h5fout.create_dataset('train_rmse', data=np.array([rmse]))

        #Always run exact inference on the validation set
        print 'Running filter: ', filterType, ' on valid'
        X = h5f['valid'].value
        mus, cov, ll = runFilter(X, params_synthetic, dataset, filterType)
        h5fout.create_dataset('valid_mu', data=mus)
        h5fout.create_dataset('valid_cov', data=cov)
        h5fout.create_dataset('valid_ll', data=np.array([ll]))
        rmse = np.sqrt(np.square(mus - h5f['valid_z'].value.squeeze()).mean())
        h5fout.create_dataset('valid_rmse', data=np.array([rmse]))

        if int(dataset.split('synthetic')[1]) in [9, 10, 11]:
            print 'Running filter: ', filterType, ' on test'
            X = h5f['test'].value
            mus, cov, ll = runFilter(X, params_synthetic, dataset, filterType)
            h5fout.create_dataset('test_mu', data=mus)
            h5fout.create_dataset('test_cov', data=cov)
            h5fout.create_dataset('test_ll', data=np.array([ll]))
            rmse = np.sqrt(
                np.square(mus - h5f['test_z'].value.squeeze()).mean())
            h5fout.create_dataset('test_rmse', data=np.array([rmse]))

        h5f.close()
        h5fout.close()
Example #5
def runBaselines(DIR):
    DATADIR = getPYDIR() + '/datasets/synthetic'
    assert os.path.exists(
        DATADIR), DATADIR + ' not found; required to run baselines'
    if not os.path.exists(DIR):
        os.mkdir(DIR)

    for f in glob.glob(DATADIR + '/*.h5'):
        dataset = os.path.basename(f).replace('.h5', '')
        print 'Reading from: ', f, ' Saving to: ', DIR + '/' + dataset + '-baseline.h5'

        filterType = params_synthetic[dataset]['baseline']
        h5fout = h5py.File(DIR + '/' + dataset + '-baseline.h5', mode='w')
        h5f = h5py.File(f, mode='r')

        print 'Running filter: ', filterType, ' on train'
        X = h5f['train'].value
        mus, cov, ll = runFilter(X, params_synthetic, dataset, filterType)
        h5fout.create_dataset('train_mu', data=mus)
        h5fout.create_dataset('train_cov', data=cov)
        h5fout.create_dataset('train_ll', data=np.array([ll]))
        rmse = np.sqrt(np.square(mus - h5f['train_z'].value.squeeze()).mean())
        h5fout.create_dataset('train_rmse', data=np.array([rmse]))

        print 'Running filter: ', filterType, ' on valid'
        X = h5f['valid'].value
        mus, cov, ll = runFilter(X, params_synthetic, dataset, filterType)
        h5fout.create_dataset('valid_mu', data=mus)
        h5fout.create_dataset('valid_cov', data=cov)
        h5fout.create_dataset('valid_ll', data=np.array([ll]))
        rmse = np.sqrt(np.square(mus - h5f['valid_z'].value.squeeze()).mean())
        h5fout.create_dataset('valid_rmse', data=np.array([rmse]))

        print 'Running filter: ', filterType, ' on test'
        X = h5f['test'].value
        mus, cov, ll = runFilter(X, params_synthetic, dataset, filterType)
        h5fout.create_dataset('test_mu', data=mus)
        h5fout.create_dataset('test_cov', data=cov)
        h5fout.create_dataset('test_ll', data=np.array([ll]))
        rmse = np.sqrt(np.square(mus - h5f['test_z'].value.squeeze()).mean())
        h5fout.create_dataset('test_rmse', data=np.array([rmse]))

        h5f.close()
        h5fout.close()
Example #6
def _processFashionMNIST():
    pfile = getPYDIR()+'/datasets/fashion_mnist/proc-fashion_mnist.h5'
    DIR = os.path.dirname(pfile)
    createIfAbsent(DIR)
    if not os.path.exists(os.path.join(DIR,'train-images-idx3-ubyte.gz')):
        print 'Downloading data'
        urllib.urlretrieve('http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz',os.path.join(DIR,'train-images-idx3-ubyte.gz'))
        urllib.urlretrieve('http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz',os.path.join(DIR,'train-labels-idx1-ubyte.gz'))
        urllib.urlretrieve('http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz',os.path.join(DIR,'t10k-images-idx3-ubyte.gz'))
        urllib.urlretrieve('http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz',os.path.join(DIR,'t10k-labels-idx1-ubyte.gz'))
    if os.path.exists(pfile):
        print 'Found: ',pfile
        return pfile
    print DIR
    X, Y= readData(os.path.join(DIR,'train-images-idx3-ubyte.gz'), os.path.join(DIR,'train-labels-idx1-ubyte.gz'))
    np.random.seed(0)
    idxshuf   = np.random.permutation(X.shape[0])
    valid_idx = idxshuf[:10000]
    train_idx = idxshuf[10000:]
    train_x, train_y = np.clip(X[train_idx]/255., a_min=0.0, a_max=1.0), Y[train_idx]
    valid_x, valid_y = np.clip(X[valid_idx]/255., a_min=0.0, a_max=1.0), Y[valid_idx]
    test_x, test_y = readData(os.path.join(DIR,'t10k-images-idx3-ubyte.gz'), os.path.join(DIR,'t10k-labels-idx1-ubyte.gz'))
    test_x = np.clip(test_x/255., a_min=0.0, a_max=1.0)
    print 'Processing Fashion MNIST'
    h5f   = h5py.File(pfile, mode='w')
    h5f.create_dataset('train',data = train_x)
    h5f.create_dataset('train_y',data = train_y)
    h5f.create_dataset('test' ,data = test_x)
    h5f.create_dataset('test_y' ,data = test_y)
    h5f.create_dataset('valid',data = valid_x)
    h5f.create_dataset('valid_y',data = valid_y)
    h5f.close()
    for dd in [train_x, train_y, valid_x, valid_y, test_x, test_y]:
        print dd.shape, dd.min(), dd.max()
    print 'Done processing Fashion MNIST....',pfile
    return pfile
Example #7
def _cifar10():
    #CIFAR 10 Dataset
    DIR = getPYDIR() + '/datasets/cifar10'
    if not os.path.exists(DIR):
        os.system('mkdir -p ' + DIR)
    savef = os.path.join(DIR, 'cifar-10-python.tar.gz')
    if not os.path.exists(savef):
        urllib.urlretrieve(
            'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz', savef)
    cifarfile = os.path.join(DIR, 'cifar10.h5')
    if not os.path.exists(cifarfile):
        print 'Extracting CIFAR...'
        tf = tarfile.open(savef)
        tf.extractall(DIR)
        tf.close()
        EDIR = DIR + '/cifar-10-batches-py/'
        h5f = h5py.File(cifarfile, mode='w')
        traindatalist, trainlabellist = [], []
        for k in range(5):
            print k,
            hmap = readPickle(EDIR + '/data_batch_' + str(k + 1))[0]
            traindatalist.append(hmap['data'])
            trainlabellist.append(hmap['labels'])
        alltrainx = np.concatenate(traindatalist, axis=0)
        alltrainy = np.concatenate(trainlabellist, axis=0)
        np.random.seed(1)
        idxlist = np.random.permutation(alltrainx.shape[0])
        val_idx = idxlist[:int(0.1 * alltrainx.shape[0])]
        tr_idx = idxlist[int(0.1 * alltrainx.shape[0]):]
        TRAINX = alltrainx[tr_idx]
        TRAINY = alltrainy[tr_idx]
        VALIDX = alltrainx[val_idx]
        VALIDY = alltrainy[val_idx]
        h5f.create_dataset('train', data=reshapeMatrix(TRAINX))
        h5f.create_dataset('valid', data=reshapeMatrix(VALIDX))
        h5f.create_dataset('train_y', data=TRAINY)
        h5f.create_dataset('valid_y', data=VALIDY)
        hmap = readPickle(EDIR + '/test_batch')[0]
        h5f.create_dataset('test', data=reshapeMatrix(hmap['data']))
        h5f.create_dataset('test_y', data=np.array(hmap['labels']))
        hmap = readPickle(EDIR + '/batches.meta')[0]
        h5f.create_dataset('label_names',
                           data=np.array(hmap['label_names'], dtype='|S10'))
        h5f.close()
        print '\nCreated CIFAR h5 file'
    else:
        print 'Found CIFAR h5 file'
    h5f = h5py.File(cifarfile, mode='r')
    dataset = {}
    dataset['label_names'] = h5f['label_names'].value
    dataset['train'] = h5f['train'].value
    dataset['test'] = h5f['test'].value
    dataset['valid'] = h5f['valid'].value
    dataset['train_y'] = h5f['train_y'].value
    dataset['test_y'] = h5f['test_y'].value
    dataset['valid_y'] = h5f['valid_y'].value
    dataset['dim_observations'] = np.prod(dataset['train'].shape[1:])
    dataset['num_channels'] = dataset['train'].shape[-3]
    dataset['dim_h'] = dataset['train'].shape[-2]
    dataset['dim_w'] = dataset['train'].shape[-1]
    dataset['data_type'] = 'image'
    h5f.close()
    return dataset
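The returned dictionary keeps the image tensors alongside their metadata. A minimal sketch of pulling out one image for display, assuming reshapeMatrix stores images in (N, channels, height, width) order, which is what the num_channels/dim_h/dim_w lookups above imply; cifarImageHWC is a hypothetical helper:

import numpy as np

def cifarImageHWC(dataset, split='train', idx=0):
    # Move channels last so standard plotting libraries can display the image
    img = dataset[split][idx]                 # assumed shape (3, 32, 32)
    return np.transpose(img, (1, 2, 0))

# Example usage:
# dataset = _cifar10()
# cifarImageHWC(dataset).shape is expected to be (32, 32, 3)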
Example #8
def _processSynthetic(dset):
    DIR = getPYDIR() + '/datasets'
    assert os.path.exists(DIR), 'Directory does not exist: ' + DIR
    syntheticDIR = DIR + '/synthetic/'
    if not os.path.exists(syntheticDIR):
        os.mkdir(syntheticDIR)
    fname = syntheticDIR + '/' + dset + '.h5'
    #assert dset in ['synthetic9','synthetic10','synthetic11','synthetic12','synthetic13','synthetic14'] ,'Only synthetic 9/10/11 supported'
    """
    9: linear    ds = 1
    10:nonlinear ds = 1
    11:nonlinear ds = 2 [param estimation]

    Checking scalability of ST-R
    12:linear    ds = 10
    13:linear    ds = 100
    14:linear    ds = 250

    Checking scalability of ST-R - dimz = dimobs
    15:linear    ds = 10
    16:linear    ds = 100
    17:linear    ds = 250

    Checking scalability of ST-R - dimz = dimobs and diagonal weight matrices
    18:linear    ds = 10
    19:linear    ds = 100
    20:linear    ds = 250
    """
    if os.path.exists(fname):
        print 'Found: ', fname
        return fname
    #Old=np.random.seed(1)
    def sampleGaussian(mu, cov):
        assert type(cov) is float or type(
            cov) is np.ndarray, 'invalid type: ' + str(cov) + ' type: ' + str(
                type(cov))
        return mu + np.random.randn(*mu.shape) * np.sqrt(cov)

    def createDataset(N, T, t_fxn, e_fxn, init_mu, init_cov, trans_cov,
                      obs_cov, model_params, dim_stochastic, dim_obs):
        all_z = []
        z_prev = sampleGaussian(
            np.ones((N, 1, dim_stochastic)) * init_mu, init_cov)
        all_z.append(np.copy(z_prev))
        for t in range(T - 1):
            z_prev = sampleGaussian(t_fxn(z_prev, fxn_params=model_params),
                                    trans_cov)
            all_z.append(z_prev)
        Z_true = np.concatenate(all_z, axis=1)
        assert Z_true.shape[1] == T, 'Expecting T in dim 2 of Z_true'
        X = sampleGaussian(e_fxn(Z_true, fxn_params=model_params), obs_cov)
        assert X.shape[2] == dim_obs, 'Shape mismatch'
        return Z_true, X

    if not np.all([
            os.path.exists(os.path.join(syntheticDIR, sname + '.h5'))
            for sname in ['synthetic' + str(i) for i in range(9, 21)]
    ]):
        #Create all datasets
        for s in range(9, 21):
            if os.path.exists(
                    os.path.join(syntheticDIR, 'synthetic' + str(s) + '.h5')):
                print 'Found ', s
                continue
            print 'Creating: ', s
            dataset = {}
            transition_fxn = params_synthetic['synthetic' +
                                              str(s)]['trans_fxn']
            emission_fxn = params_synthetic['synthetic' + str(s)]['obs_fxn']
            init_mu = params_synthetic['synthetic' + str(s)]['init_mu']
            init_cov = params_synthetic['synthetic' + str(s)]['init_cov']
            trans_cov = params_synthetic['synthetic' + str(s)]['trans_cov']
            obs_cov = params_synthetic['synthetic' + str(s)]['obs_cov']
            model_params = params_synthetic['synthetic' + str(s)]['params']
            dim_obs, dim_stoc = params_synthetic[
                'synthetic' +
                str(s)]['dim_obs'], params_synthetic['synthetic' +
                                                     str(s)]['dim_stoc']
            if s in [12, 13, 14, 15, 16, 17, 18, 19, 20]:
                Ntrain = 1000
                Ttrain = 25
                Ttest = 25
            else:
                Ntrain = 5000
                Ttrain = 25
                Ttest = 50
            Nvalid = 500
            Ntest = 500
            #New-
            np.random.seed(1)
            train_Z, train_dataset = createDataset(
                Ntrain, Ttrain, transition_fxn, emission_fxn, init_mu,
                init_cov, trans_cov, obs_cov, model_params, dim_stoc, dim_obs)
            valid_Z, valid_dataset = createDataset(
                Nvalid, Ttrain, transition_fxn, emission_fxn, init_mu,
                init_cov, trans_cov, obs_cov, model_params, dim_stoc, dim_obs)
            test_Z, test_dataset = createDataset(Ntest, Ttest, transition_fxn,
                                                 emission_fxn, init_mu,
                                                 init_cov, trans_cov, obs_cov,
                                                 model_params, dim_stoc,
                                                 dim_obs)
            savefile = syntheticDIR + '/synthetic' + str(s) + '.h5'
            h5file = h5py.File(savefile, mode='w')
            h5file.create_dataset('train_z', data=train_Z)
            h5file.create_dataset('test_z', data=test_Z)
            h5file.create_dataset('valid_z', data=valid_Z)
            h5file.create_dataset('train', data=train_dataset)
            h5file.create_dataset('test', data=test_dataset)
            h5file.create_dataset('valid', data=valid_dataset)
            h5file.close()
            print 'Created: ', savefile
    print 'REMEMBER TO RUN BASELINES!'
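For reference, the docstring above describes synthetic9 as a linear model with dim_stochastic = 1. Below is a self-contained sketch of the kind of generative process createDataset implements for such a configuration; the real transition/emission functions and covariances come from params_synthetic (not shown), so the values here are purely illustrative:

import numpy as np

def sampleLinearSSM(N=5, T=25, init_mu=0.0, init_cov=1.0, trans_cov=0.1, obs_cov=0.1):
    # z_t = z_{t-1} + noise, x_t = z_t + noise : a 1-d linear-Gaussian state space model
    Z = np.zeros((N, T, 1))
    Z[:, 0, :] = init_mu + np.random.randn(N, 1) * np.sqrt(init_cov)
    for t in range(1, T):
        Z[:, t, :] = Z[:, t - 1, :] + np.random.randn(N, 1) * np.sqrt(trans_cov)
    X = Z + np.random.randn(N, T, 1) * np.sqrt(obs_cov)
    return Z, X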