    def create_datasets(cls, datasets=None, overwrite=False,
                        img_dir=DATA_DIR, output_dir=DATA_DIR):
        """Creates the requested datasets, and writes them to disk.
        """
        datasets = datasets or cls.ALL_DATASETS
        serial.mkdir(output_dir)

        for dataset_name in list(datasets):
            file_path_fn = lambda ext: os.path.join(
                output_dir, '%s.%s' % (dataset_name, ext))

            output_files = dict([(ext, file_path_fn(ext))
                                 for ext in ['pkl', 'npy']])
            files_missing = np.any(
                [not os.path.isfile(f) for f in output_files.values()])

            if overwrite or np.any(files_missing):
                print("Loading the %s data" % dataset_name)
                dataset = cls(which_set=dataset_name, img_dir=img_dir)

                print("Saving the %s data" % dataset_name)
                dataset.use_design_loc(output_files['npy'])
                serial.save(output_files['pkl'], dataset)
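
A hypothetical usage sketch for the classmethod above; the class name, split names, and paths below are placeholders, since the class that owns create_datasets is not shown in this excerpt.

# Hypothetical caller: 'FaceImages' stands in for whatever dataset class
# defines create_datasets; the split names stand in for cls.ALL_DATASETS.
FaceImages.create_datasets(datasets=['train', 'test'],
                           overwrite=False,
                           img_dir='/data/raw_images',
                           output_dir='/data/pickled')
# Each requested split is then written as <output_dir>/<name>.pkl, with its
# design matrix stored alongside in <name>.npy (via use_design_loc above).
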
def main():
    data_dir = string.preprocess('${PYLEARN2_DATA_PATH}/stl10')

    print('Loading STL-10 unlabeled and train datasets...')
    downsampled_dir = data_dir + '/stl10_32x32'

    data = serial.load(downsampled_dir + '/unlabeled.pkl')
    supplement = serial.load(downsampled_dir + '/train.pkl')

    print('Concatenating datasets...')
    data.set_design_matrix(np.concatenate((data.X, supplement.X), axis=0))
    del supplement

    print("Preparing output directory...")
    patch_dir = data_dir + '/stl10_patches_8x8'
    serial.mkdir(patch_dir)
    README = open(patch_dir + '/README', 'w')

    README.write(textwrap.dedent("""
    The .pkl files in this directory may be opened in python using
    cPickle, pickle, or pylearn2.serial.load.

    data.pkl contains a pylearn2 Dataset object defining an unlabeled
    dataset of 2 million 8x8 approximately whitened, contrast-normalized
    patches drawn uniformly at random from a downsampled (to 32x32)
    version of the STL-10 train and unlabeled datasets.

    preprocessor.pkl contains a pylearn2 Pipeline object that was used
    to extract the patches and approximately whiten / contrast normalize
    them. This object is necessary when extracting features for
    supervised learning or test set classification, because the
    extracted features must be computed using inputs that have been
    whitened with the ZCA matrix learned and stored by this Pipeline.

    They were created with the pylearn2 script make_stl10_patches.py.

    All other files in this directory, including this README, were
    created by the same script and are necessary for the other files
    to function correctly.
    """))

    README.close()

    print("Preprocessing the data...")
    pipeline = preprocessing.Pipeline()
    pipeline.items.append(preprocessing.ExtractPatches(patch_shape=(8, 8),
                          num_patches=2*1000*1000))
    pipeline.items.append(
        preprocessing.GlobalContrastNormalization(sqrt_bias=10., use_std=True))
    pipeline.items.append(preprocessing.ZCA())
    data.apply_preprocessor(preprocessor=pipeline, can_fit=True)

    data.use_design_loc(patch_dir + '/data.npy')

    serial.save(patch_dir + '/data.pkl', data)

    serial.save(patch_dir + '/preprocessor.pkl', pipeline)
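
As the README above notes, the saved files can be reopened with pylearn2.serial.load; a minimal sketch, assuming the script above has already been run and PYLEARN2_DATA_PATH is set:

from pylearn2.utils import serial
from pylearn2.utils import string_utils as string

patch_dir = string.preprocess('${PYLEARN2_DATA_PATH}/stl10/stl10_patches_8x8')
patches = serial.load(patch_dir + '/data.pkl')            # DenseDesignMatrix of patches
pipeline = serial.load(patch_dir + '/preprocessor.pkl')   # fitted ExtractPatches + GCN + ZCA Pipeline
print(patches.X.shape)                                    # one row per extracted patch
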
Example #3
def main():
    data_dir = string_utils.preprocess('${PYLEARN2_DATA_PATH}/cifar100')

    print('Loading CIFAR-100 train dataset...')
    train = CIFAR100(which_set='train', gcn=55.)

    print("Preparing output directory...")
    output_dir = data_dir + '/pylearn2_gcn_whitened'
    serial.mkdir(output_dir)
    README = open(output_dir + '/README', 'w')

    README.write(
        textwrap.dedent("""
    The .pkl files in this directory may be opened in python using
    cPickle, pickle, or pylearn2.serial.load.

    train.pkl and test.pkl each contain a pylearn2 Dataset object
    defining a labeled, 32x32, contrast-normalized, approximately
    whitened version of the CIFAR-100 dataset.
    train.pkl contains labeled train examples.
    test.pkl contains labeled test examples.

    preprocessor.pkl contains a pylearn2 ZCA object that was used
    to approximately whiten the images. You may want to use this
    object later to preprocess other images.

    They were created with the pylearn2 script make_cifar100_gcn_whitened.py.

    All other files in this directory, including this README, were
    created by the same script and are necessary for the other files
    to function correctly.
    """))

    README.close()

    print("Learning the preprocessor \
           and preprocessing the unsupervised train data...")
    preprocessor = preprocessing.ZCA()
    train.apply_preprocessor(preprocessor=preprocessor, can_fit=True)

    print('Saving the training data')
    train.use_design_loc(output_dir + '/train.npy')
    serial.save(output_dir + '/train.pkl', train)

    print("Loading the test data")
    test = CIFAR100(which_set='test', gcn=55.)

    print("Preprocessing the test data")
    test.apply_preprocessor(preprocessor=preprocessor, can_fit=False)

    print("Saving the test data")
    test.use_design_loc(output_dir + '/test.npy')
    serial.save(output_dir + '/test.pkl', test)

    serial.save(output_dir + '/preprocessor.pkl', preprocessor)
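
The README above points out that preprocessor.pkl can be reused on other images; a sketch of doing so, with can_fit=False so the already-learned ZCA statistics are applied rather than re-estimated (paths assume the script above has been run):

from pylearn2.utils import serial
from pylearn2.utils import string_utils
from pylearn2.datasets.cifar100 import CIFAR100

output_dir = string_utils.preprocess(
    '${PYLEARN2_DATA_PATH}/cifar100') + '/pylearn2_gcn_whitened'
zca = serial.load(output_dir + '/preprocessor.pkl')

other = CIFAR100(which_set='test', gcn=55.)               # any DenseDesignMatrix works
other.apply_preprocessor(preprocessor=zca, can_fit=False)
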
Example #4
def emit_eta_h(method, directory, n, eta_h):
    directory = directory + '/eta_h_'+str(eta_h)

    serial.mkdir(directory)

    if method == 'cg':
        emit_cg(directory, n, eta_h)
    else:
        assert method == 'heuristic'
        emit_heuristic(directory, n, eta_h)
def main():
    data_dir = string_utils.preprocess('${PYLEARN2_DATA_PATH}')

    print('Loading CIFAR-100 train dataset...')
    data = CIFAR100(which_set='train')

    print("Preparing output directory...")
    patch_dir = data_dir + '/cifar100/cifar100_patches'
    serial.mkdir(patch_dir)
    README = open(patch_dir + '/README', 'w')

    README.write(
        textwrap.dedent("""
    The .pkl files in this directory may be opened in python using
    cPickle, pickle, or pylearn2.serial.load.

    data.pkl contains a pylearn2 Dataset object defining an unlabeled
    dataset of 2 million 6x6 approximately whitened, contrast-normalized
    patches drawn uniformly at random from the CIFAR-100 train set.

    preprocessor.pkl contains a pylearn2 Pipeline object that was used
    to extract the patches and approximately whiten / contrast normalize
    them. This object is necessary when extracting features for
    supervised learning or test set classification, because the
    extracted features must be computed using inputs that have been
    whitened with the ZCA matrix learned and stored by this Pipeline.

    They were created with the pylearn2 script make_cifar100_patches.py.

    All other files in this directory, including this README, were
    created by the same script and are necessary for the other files
    to function correctly.
    """))

    README.close()

    print("Preprocessing the data...")
    pipeline = preprocessing.Pipeline()
    pipeline.items.append(
        preprocessing.ExtractPatches(patch_shape=(6, 6),
                                     num_patches=2 * 1000 * 1000))
    pipeline.items.append(
        preprocessing.GlobalContrastNormalization(sqrt_bias=10., use_std=True))
    pipeline.items.append(preprocessing.ZCA())
    data.apply_preprocessor(preprocessor=pipeline, can_fit=True)

    data.use_design_loc(patch_dir + '/data.npy')

    serial.save(patch_dir + '/data.pkl', data)

    serial.save(patch_dir + '/preprocessor.pkl', pipeline)
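
A quick sanity check on the result, as a sketch (assuming the script above has finished): 6x6 RGB patches flatten to 6 * 6 * 3 = 108 columns, matching the "nvis": 108 used by the S3C configurations for 6x6 patch datasets later in this listing.

from pylearn2.utils import serial
from pylearn2.utils import string_utils

patch_dir = string_utils.preprocess(
    '${PYLEARN2_DATA_PATH}') + '/cifar100/cifar100_patches'
patches = serial.load(patch_dir + '/data.pkl')
assert patches.X.shape == (2 * 1000 * 1000, 6 * 6 * 3)    # 2M patches x 108 features
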
Example #8
def deal_npy_file(whitenFile_label, whitenFile_feature, txtfile, mode):
    y = np.load(whitenFile_label)
    print(y.shape)

    x = np.load(whitenFile_feature)
    x = x.reshape((x.shape[0], 3, 32, 32)).transpose(0, 2, 3, 1)
    print(x.shape)

    output_dir = "./cifar10_npy/" + mode
    serial.mkdir(output_dir)
    file_names = []
    for i in range(x.shape[0]):
        np.save(output_dir + "/" + mode + str(i), x[i])
        file_names.append(
            str(y[i][0]) + "," + output_dir + "/" + mode + str(i) + ".npy" +
            "\n")

    open(txtfile, "w").writelines(file_names)
    print(len(file_names))
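
Each line written above has the form "<label>,<path-to-.npy>", so downstream code can read the index back like this (a sketch; "train_index.txt" is a placeholder for whatever was passed as txtfile):

import numpy as np

with open("train_index.txt") as index:
    for line in index:
        label, path = line.strip().split(",", 1)
        sample = np.load(path)            # the (32, 32, 3) array saved above
        print(label, sample.shape)
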
Example #9
def deal_npy_file(whitenFile_label, whitenFile_feature, txtfile, mode):
    y = np.load(whitenFile_label)
    print(y.shape)

    x = np.load(whitenFile_feature)
    x = x.reshape((x.shape[0], 3, 32, 32)).transpose(0, 2, 3, 1)
    print(x.shape)

    output_dir = "./cifar10_images_from_npy/" + mode + "_misc"
    serial.mkdir(output_dir)
    file_names = []
    for i in range(x.shape[0]):
        # for i in range(1):
        name = output_dir + "/" + mode + str(i) + "_misc.png"
        # plt.imsave(name, x[i])
        misc.imsave(name, x[i])
        file_names.append(str(y[i][0]) + "," + name + "\n")

    open(txtfile, "w").writelines(file_names)
    print(len(file_names))
params = yaml_parse.load_path('params.yaml')

validate = open('validate.yaml', 'r')
validate_template = validate.read()
validate.close()

for expnum, line in enumerate(lines):
    elems = line.split(' ')
    assert elems[-1] == '\n'
    obj = elems[0]
    if obj == 'P':
        expdir = '/RQexec/goodfell/experiment_6/%d' % expnum
        if os.path.exists(expdir):
            continue
        try:
            mkdir(expdir)

            config = {}
            for param, value in safe_zip(params, elems[2:-1]):
                if param['type'] == 'float':
                    value = float(value)
                elif param['type'] == 'int':
                    value = int(value)
                else:
                    raise NotImplementedError()
                if 'postprocess' in param:
                    value = param['postprocess'](value)
                if 'joint_postprocess' in param:
                    try:
                        value = param['joint_postprocess'](value, config)
                    except Exception, e:
This script also translates the data to lie in [-127.5, 127.5] instead of
[0,255]. This makes it play nicer with some of pylearn's visualization tools.
"""

from pylearn2.datasets.stl10 import STL10
from pylearn2.datasets.preprocessing import Downsample
from pylearn2.utils import string_utils as string
from pylearn2.utils import serial
import numpy as np

print 'Preparing output directory...'

data_dir = string.preprocess('${PYLEARN2_DATA_PATH}')
downsampled_dir = data_dir + '/stl10_32x32'
serial.mkdir( downsampled_dir )
README = open(downsampled_dir + '/README','w')

README.write("""
The .pkl files in this directory may be opened in python using
cPickle, pickle, or pylearn2.serial.load. They contain pylearn2
Dataset objects defining the STL-10 dataset, but downsampled to
size 32x32 and translated to lie in [-127.5, 127.5 ].

They were created with the pylearn2 script make_downsampled_stl10.py

All other files in this directory, including this README, were
created by the same script and are necessary for the other files
to function correctly.
""")
import os
from pylearn2.utils import serial

assert len(sys.argv) in [2, 3, 4]
train_file = sys.argv[1]

if train_file.endswith('.npy'):
    pieces = train_file.split('.npy')
elif train_file.endswith('mat'):
    pieces = train_file.split('.mat')
else:
    assert False
assert len(pieces) == 2

results_dir = pieces[0]
serial.mkdir(results_dir)

if len(sys.argv) > 3:
    memreq = sys.argv[3]
else:
    memreq = '12G'

command = ('jobdispatch --duree=48:00:00 --whitespace --mem=%(memreq)s '
           '/RQusagers/goodfell/cifar100_fold_point_worker ' % locals())
command += ' "{{'

if len(sys.argv) > 2:

    C_list = sys.argv[2]

    C_list = [float(C) for C in C_list.split(',')]
import sys
from pylearn2.utils import serial

ignore, model_path, script_dir = sys.argv

serial.mkdir(script_dir)

chunk_size = 1000

m = 10000

assert m % chunk_size == 0

num_chunks = m / chunk_size
assert num_chunks == 10

for i in xrange(num_chunks):
    start = i * chunk_size
    stop = (i+1)*chunk_size
    name = 'chunk_%d.yaml' % i
    f = open(script_dir + '/' + name, 'w')
    f.write("""!obj:galatea.pddbm.extract_features.FeatureExtractor {
            batch_size : 1,
            model_path : %(model_path)s,
            pooling_region_counts : [ 3 ],
            save_paths : [ %(script_dir)s/chunk_%(i)d.npy ],
            feature_type : "exp_h,exp_g",
            dataset_family : galatea.pddbm.extract_features.cifar100,
            which_set : "test",
            restrict : [ %(start)d, %(stop)d ]
        }""" % locals() )
Example #16
if arg == 'public_test':
    base = preprocess(
        '${PYLEARN2_DATA_PATH}/icml_2013_multimodal/public_test_images')
    outdir = base[:-6] + 'lcn'
    expected_num_images = 500
elif arg == 'private_test':
    base = preprocess(
        '${PYLEARN2_DATA_PATH}/icml_2013_multimodal/private_test_images')
    outdir = base[:-6] + 'lcn'
    expected_num_images = 500
else:
    usage()
    print 'Unrecognized argument value:', arg
    print 'Recognized values are: public_test, private_test'

serial.mkdir(outdir)

paths = os.listdir(base)
if len(paths) != expected_num_images:
    raise AssertionError("Something is wrong with your " + base \
            + "directory. It should contain " + str(expected_num_images) + \
            " image files, but contains " + str(len(paths)))

kernel_shape = 7

from theano import tensor as T
from pylearn2.utils import sharedX
from pylearn2.datasets.preprocessing import gaussian_filter
from theano.tensor.nnet import conv2d

X = T.TensorType(dtype='float32', broadcastable=(True, False, False, True))()
Example #17
    learning_rate =  10. ** rng.uniform(-2., -.5)

    if rng.randint(2):
        msat = 2
    else:
        msat = rng.randint(2, 1000)

    final_momentum = rng.uniform(.5, .9)

    lr_sat = rng.randint(200, 1000)

    decay = 10. ** rng.uniform(-3, -1)


    task_0_yaml_str = task_0_template % locals()

    serial.mkdir('{}exp/'.format(EXP_PATH) + str(job_id))
    train_file_full_stem = '{}exp/'.format(EXP_PATH)+str(job_id)+'/'
    f = open(train_file_full_stem + 'task_0.yaml', 'w')
    f.write(task_0_yaml_str)
    f.close()

    task_1_yaml_str = task_1_template % locals()

    serial.mkdir('{}exp/'.format(EXP_PATH) + str(job_id))
    f = open(train_file_full_stem + 'task_1.yaml', 'w')
    f.write(task_1_yaml_str)
    f.close()

Example #18
from pylearn2.datasets.tfd import TFD
from pylearn2.utils import string_utils
from hossrbm import preproc as my_preproc

data_dir = string_utils.preprocess('/data/lisatmp2/desjagui/data')

pipeline = preprocessing.Pipeline()
pipeline.items.append(preprocessing.GlobalContrastNormalization(subtract_mean=True))
pipeline.items.append(my_preproc.LeCunLCN((1,48,48)))
pipeline.items.append(preprocessing.RemoveMean(axis=0))
pipeline.items.append(preprocessing.ExtractPatches(patch_shape=(14,14), num_patches=5*1000*1000))

#### Build full-sized image dataset. ####
print "Preparing output directory for unlabeled patches..."
outdir = data_dir + '/tfd_lcn_v1'
serial.mkdir(outdir)
README = open(outdir + '/README', 'w')
README.write("""
File generated from hossrbm/scripts/tfd/make_tfd_lcn.py.
""")
README.close()

print 'Loading TFD unlabeled dataset...'
print "Preprocessing the data..."
data = TFD('unlabeled')
data.apply_preprocessor(preprocessor = pipeline, can_fit = True)
data.use_design_loc(outdir + '/unlabeled_patches.npy')
serial.save(outdir + '/unlabeled_patches.pkl',data)

#### For supervised dataset, we work on the full-image dataset ####
pipeline.items.pop()
Example #19
    h0_bias = sigmoid_bias()
    h1_bias = sigmoid_bias()

    learning_rate = 10.**rng.uniform(-2., -.5)

    if rng.randint(2):
        msat = 2
    else:
        msat = rng.randint(2, 1000)

    final_momentum = rng.uniform(.5, .9)

    lr_sat = rng.randint(200, 1000)

    decay = 10.**rng.uniform(-3, -1)

    task_0_yaml_str = task_0_template % locals()

    serial.mkdir('{}exp/'.format(EXP_PATH) + str(job_id))
    train_file_full_stem = '{}exp/'.format(EXP_PATH) + str(job_id) + '/'
    f = open(train_file_full_stem + 'task_0.yaml', 'w')
    f.write(task_0_yaml_str)
    f.close()

    task_1_yaml_str = task_1_template % locals()

    serial.mkdir('{}exp/'.format(EXP_PATH) + str(job_id))
    f = open(train_file_full_stem + 'task_1.yaml', 'w')
    f.write(task_1_yaml_str)
    f.close()
Example #20
import sys
from pylearn2.utils import serial

ignore, model_path, script_dir = sys.argv

serial.mkdir(script_dir)

chunk_size = 1000

m = 50000

assert m % chunk_size == 0

num_chunks = m / chunk_size

for i in xrange(num_chunks):
    start = i * chunk_size
    stop = (i + 1) * chunk_size
    name = 'chunk_%d.yaml' % i
    f = open(script_dir + '/' + name, 'w')
    f.write("""!obj:galatea.pddbm.extract_features.FeatureExtractor {
            batch_size : 1,
            model_path : %(model_path)s,
            pooling_region_counts : [ 3 ],
            save_paths : [ %(script_dir)s/chunk_%(i)d.npy ],
            feature_type : "exp_h,exp_g",
            dataset_family : galatea.pddbm.extract_features.cifar100,
            which_set : "train",
            restrict : [ %(start)d, %(stop)d ]
        }""" % locals())
    f.close()
    thumbnail_path = image_path.replace(input_path,output_path)
    thumbnail_path = thumbnail_path.replace('.JPEG','.npy')

    t1 = time.time()
    e =  os.path.exists(thumbnail_path)
    t2 = time.time()
    print t2-t1

    if e:
        continue

    thumbnail_subdir = '/'.join(thumbnail_path.split('/')[:-1])

    if thumbnail_subdir not in created_subdirs:
        serial.mkdir(thumbnail_subdir)
        created_subdirs = created_subdirs.union([thumbnail_subdir])

    try:
        t1 = time.time()
        img = image.load(image_path)
        t2 = time.time()
    except Exception, e:
        print "Encountered a problem: "+str(e)
        img = None

    if img is not None:
        assert len(img.shape) == 3
        thumbnail = image.make_letterboxed_thumbnail(img, image_shape)
        t3 = time.time()
def main():
    data_dir = string.preprocess('${PYLEARN2_DATA_PATH}/stl10')

    print('Loading STL-10 unlabeled and train datasets...')
    downsampled_dir = data_dir + '/stl10_32x32'

    data = serial.load(downsampled_dir + '/unlabeled.pkl')
    supplement = serial.load(downsampled_dir + '/train.pkl')

    print('Concatenating datasets...')
    data.set_design_matrix(np.concatenate((data.X, supplement.X), axis=0))

    print("Preparing output directory...")
    output_dir = data_dir + '/stl10_32x32_whitened'
    serial.mkdir(output_dir)
    README = open(output_dir + '/README', 'w')

    README.write(textwrap.dedent("""
    The .pkl files in this directory may be opened in python using
    cPickle, pickle, or pylearn2.serial.load.

    unsupervised.pkl, unlabeled.pkl, train.pkl, and test.pkl each contain
    a pylearn2 Dataset object defining an unlabeled
    dataset of a 32x32 approximately whitened version of the STL-10
    dataset. unlabeled.pkl contains unlabeled train examples. train.pkl
    contains labeled train examples. unsupervised.pkl contains the union
    of these (without any labels). test.pkl contains the labeled test
    examples.

    preprocessor.pkl contains a pylearn2 ZCA object that was used
    to approximately whiten the images. You may want to use this
    object later to preprocess other images.

    They were created with the pylearn2 script make_stl10_whitened.py.

    All other files in this directory, including this README, were
    created by the same script and are necessary for the other files
    to function correctly.
    """))

    README.close()

    print("Learning the preprocessor \
          and preprocessing the unsupervised train data...")
    preprocessor = preprocessing.ZCA()
    data.apply_preprocessor(preprocessor=preprocessor, can_fit=True)

    print('Saving the unsupervised data')
    data.use_design_loc(output_dir+'/unsupervised.npy')
    serial.save(output_dir + '/unsupervised.pkl', data)

    X = data.X
    unlabeled = X[0:100*1000, :]
    labeled = X[100*1000:, :]
    del X

    print("Saving the unlabeled data")
    data.X = unlabeled
    data.use_design_loc(output_dir + '/unlabeled.npy')
    serial.save(output_dir + '/unlabeled.pkl', data)
    del data
    del unlabeled

    print("Saving the labeled train data")
    supplement.X = labeled
    supplement.use_design_loc(output_dir+'/train.npy')
    serial.save(output_dir+'/train.pkl', supplement)
    del supplement
    del labeled

    print("Loading the test data")
    test = serial.load(downsampled_dir + '/test.pkl')

    print("Preprocessing the test data")
    test.apply_preprocessor(preprocessor=preprocessor, can_fit=False)

    print("Saving the test data")
    test.use_design_loc(output_dir+'/test.npy')
    serial.save(output_dir+'/test.pkl', test)

    serial.save(output_dir + '/preprocessor.pkl', preprocessor)
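
An optional consistency check, as a sketch to run after main() above finishes: the unsupervised matrix was built as unlabeled plus train, so the row counts of the saved splits should add up (100,000 being the standard STL-10 unlabeled count).

from pylearn2.utils import serial
from pylearn2.utils import string_utils as string

out = string.preprocess('${PYLEARN2_DATA_PATH}/stl10') + '/stl10_32x32_whitened'
unsup = serial.load(out + '/unsupervised.pkl')
unlabeled = serial.load(out + '/unlabeled.pkl')
train = serial.load(out + '/train.pkl')
assert unsup.X.shape[0] == unlabeled.X.shape[0] + train.X.shape[0]
assert unlabeled.X.shape[0] == 100 * 1000
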
for key in sorted(params.keys()):
    val = params[key]
    if isinstance(val, list):
        val = np.asarray(val)
    if str(val.dtype) == 'bool':
        val = val.astype('int')
        params[key] = val
    assert val.shape == (num_jobs, )
    #print key,':',(val.min(),val.mean(),val.max())


ref = {"layer_2_target":0.0890535860395, "layer_2_irange":0.0301747773266, "layer_2_init_bias":-0.741101442887, "layer_1_init_bias":-0.397164399345, "balance":0}
yaml.dump(ref)

mkdir(out_dir)
for i in xrange(num_jobs):
    cur_dir = out_dir +'/'+str(i)
    mkdir(cur_dir)
    path = cur_dir + '/stage_00_inpaint_params.yaml'

    obj = dict([(key, params[key][i]) for key in params])

    assert all([isinstance(key, str) for key in obj])
    assert all([isinstance(val, (int, float)) for val in obj.values()])

    # numpy has actually given us subclassed ints/floats that yaml doesn't know how to serialize
    for key in obj:
        if isinstance(obj[key], float):
            obj[key] = float(obj[key])
        elif isinstance(obj[key], int):
Example #26

    learning_rate =  10. ** rng.uniform(-2., -.5)

    if rng.randint(2):
        msat = 2
    else:
        msat = rng.randint(2, 1000)

    final_momentum = rng.uniform(.5, .9)

    lr_sat = rng.randint(200, 1000)

    decay = 10. ** rng.uniform(-3, -1)


    task_0_yaml_str = task_0_template % locals()

    serial.mkdir('exp/' + str(job_id))
    train_file_full_stem = 'exp/'+str(job_id)+'/'
    f = open(train_file_full_stem + 'task_0.yaml', 'w')
    f.write(task_0_yaml_str)
    f.close()

    task_1_yaml_str = task_1_template % locals()

    serial.mkdir('exp/' + str(job_id))
    f = open(train_file_full_stem + 'task_1.yaml', 'w')
    f.write(task_1_yaml_str)
    f.close()
from pylearn2.utils import serial
from pylearn2.utils import string_utils
import numpy
import argparse

from hossrbm.scripts.conv_pipeline import cssrbm_feature_extractor as featext

print "Preparing output directory..."
data_dir = string_utils.preprocess('/data/lisatmp2/desjagui/data')
indir  = data_dir + '/tfd_cn'
outdir = data_dir + '/tfd_cn_layer2'
serial.mkdir(outdir)

parser = argparse.ArgumentParser()
parser.add_argument('--model', help='Path of model .pkl file.')
args = parser.parse_args()

"""
print 'Processing unlabeled set...'
in_dset_fname = '%s/%s.pkl' % (indir, 'unlabeled')
out_dset_fname = '%s/%s.pkl' % (outdir, 'unlabeled')
featext.run(args.model,
        in_dset_fname,
        batch_size = 128,
        image_width = 48,
        patch_width = 14,
        pool_width = 12,
        output_width = 9216,
        output_file = out_dset_fname)
"""
import os
from pylearn2.utils import serial

assert len(sys.argv) in [2,3,4]
train_file = sys.argv[1]

if train_file.endswith('.npy'):
    pieces = train_file.split('.npy')
elif train_file.endswith('mat'):
    pieces = train_file.split('.mat')
else:
    assert False
assert len(pieces) == 2

results_dir = pieces[0]
serial.mkdir(results_dir)

if len(sys.argv) > 3:
    assert False
    memreq = sys.argv[3]
else:
    memreq = '15000M'

command = 'jobdispatch --duree=48:00:00 --whitespace --mem=%(memreq)s /data/lisatmp2/goodfeli/cifar10_fold_point_worker ' % locals()
command += ' "{{'

if len(sys.argv) > 2:

    C_list = sys.argv[2]

    C_list = [ float(C) for C in C_list.split(',') ]
"""

from __future__ import print_function

from theano.compat.six.moves import xrange
from pylearn2.datasets.stl10 import STL10
from pylearn2.datasets.preprocessing import Downsample
from pylearn2.utils import string_utils as string
from pylearn2.utils import serial
import numpy as np

print('Preparing output directory...')

data_dir = string.preprocess('${PYLEARN2_DATA_PATH}')
downsampled_dir = data_dir + '/stl10_32x32'
serial.mkdir(downsampled_dir)
README = open(downsampled_dir + '/README', 'w')

README.write("""
The .pkl files in this directory may be opened in python using
cPickle, pickle, or pylearn2.serial.load. They contain pylearn2
Dataset objects defining the STL-10 dataset, but downsampled to
size 32x32 and translated to lie in [-127.5, 127.5 ].

They were created with the pylearn2 script make_downsampled_stl10.py

All other files in this directory, including this README, were
created by the same script and are necessary for the other files
to function correctly.
""")
data_dir = string.preprocess('${PYLEARN2_DATA_PATH}/stl10')

print 'Loading STL-10 unlabeled and train datasets...'
downsampled_dir = data_dir + '/stl10_32x32'

data = serial.load(downsampled_dir + '/unlabeled.pkl')
supplement = serial.load(downsampled_dir + '/train.pkl')

print 'Concatenating datasets...'
data.set_design_matrix(np.concatenate((data.X,supplement.X),axis=0))
del supplement


print "Preparing output directory..."
patch_dir = data_dir + '/stl10_patches_8x8'
serial.mkdir( patch_dir )
README = open(patch_dir + '/README','w')

README.write("""
The .pkl files in this directory may be opened in python using
cPickle, pickle, or pylearn2.serial.load.

data.pkl contains a pylearn2 Dataset object defining an unlabeled
dataset of 2 million 8x8 approximately whitened, contrast-normalized
patches drawn uniformly at random from a downsampled (to 32x32)
version of the STL-10 train and unlabeled datasets.

preprocessor.pkl contains a pylearn2 Pipeline object that was used
to extract the patches and approximately whiten / contrast normalize
them. This object is necessary when extracting features for
supervised learning or test set classification, because the
Example #32
            for output, count in zip(outputs, pooling_region_counts):
                output[i:i + batch_size, ...] = average_pool(count)

            t6 = time.time()

            print(t6 - t1, t2 - t1, t3 - t2, t4 - t3, t5 - t4, t6 - t5)

        return outputs[0]


if __name__ == '__main__':
    assert len(sys.argv) == 3
    ipath = sys.argv[1]
    opath = sys.argv[2]

    serial.mkdir(opath)

    model = serial.load('/data/lisatmp/goodfeli/darpa_s3c.pkl')
    preprocessor = serial.load(
        '/data/lisatmp/goodfeli/darpa_imagenet_patch_6x6_train_preprocessor.pkl'
    )
    patchifier = ExtractGridPatches(patch_shape=(size, size),
                                    patch_stride=(1, 1))
    preprocessor.items.insert(0, patchifier)

    extractor = FeatureExtractor(model=model, preprocessor=preprocessor)

    contents = os.listdir(ipath)

    for i, fname in enumerate(contents):
        print str(i + 1) + '/' + str(len(contents))
Example #33
    h0_bias = sigmoid_bias()
    h1_bias = sigmoid_bias()

    learning_rate = 10.**rng.uniform(-2., -.5)

    if rng.randint(2):
        msat = 2
    else:
        msat = rng.randint(2, 1000)

    final_momentum = rng.uniform(.5, .9)

    lr_sat = rng.randint(200, 1000)

    decay = 10.**rng.uniform(-3, -1)

    task_0_yaml_str = task_0_template % locals()

    serial.mkdir('exp/' + str(job_id))
    train_file_full_stem = 'exp/' + str(job_id) + '/'
    f = open(train_file_full_stem + 'task_0.yaml', 'w')
    f.write(task_0_yaml_str)
    f.close()

    task_1_yaml_str = task_1_template % locals()

    serial.mkdir('exp/' + str(job_id))
    f = open(train_file_full_stem + 'task_1.yaml', 'w')
    f.write(task_1_yaml_str)
    f.close()
Example #34
        dataset_str = {
            'stlfull': '${STL10_WHITENED_UNSUP}',
            'stlpatch': '${STL10_PATCHES_6x6}',
            'cifarfull': '${CIFAR10_WHITENED_TRAIN}',
            'cifarpatch': '${CIFAR10_PATCHES_6x6}'
        }[dataset + kind]

        for size in ['small', 'med', 'big']:

            N = {'small': 625, 'med': 1600, 'big': 4000}[size]

            directory = 'models/%s/%s/%s' % (dataset, kind, size)
            path = '%s/random_patches.yaml' % (directory)

            serial.mkdir(directory)

            f = open(path, 'w')

            f.write("""
!obj:pylearn2.scripts.train.Train {
    "dataset": !pkl: &src "%s",
    "model": !obj:galatea.s3c.s3c.S3C {
               "nvis" : 108,
               "nhid" : %d,
               "init_bias_hid" : -4.,
               "max_bias_hid" : 0.,
               "min_bias_hid" : -7.,
               "irange"  : .02,
               "constrain_W_norm" : 1,
               "init_B"  : 3.,
"""
from __future__ import print_function

from pylearn2.utils import serial
from pylearn2.datasets import preprocessing
from pylearn2.datasets.cifar100 import CIFAR100
from pylearn2.utils import string_utils as string

data_dir = string.preprocess('${PYLEARN2_DATA_PATH}')

print('Loading CIFAR-100 train dataset...')
data = CIFAR100(which_set='train')

print("Preparing output directory...")
patch_dir = data_dir + '/cifar100/cifar100_patches_8x8'
serial.mkdir(patch_dir)
README = open(patch_dir + '/README', 'w')

README.write("""
The .pkl files in this directory may be opened in python using
cPickle, pickle, or pylearn2.serial.load.

data.pkl contains a pylearn2 Dataset object defining an unlabeled
dataset of 2 million 8x8 approximately whitened, contrast-normalized
patches drawn uniformly at random from the CIFAR-100 train set.

preprocessor.pkl contains a pylearn2 Pipeline object that was used
to extract the patches and approximately whiten / contrast normalize
them. This object is necessary when extracting features for
supervised learning or test set classification, because the
extracted features must be computed using inputs that have been
Example #36
    for kind in [ 'full', 'patch' ]:

        dataset_str = { 'stlfull' : '${STL10_WHITENED_UNSUP}',
                    'stlpatch' : '${STL10_PATCHES_6x6}',
                    'cifarfull' : '${CIFAR10_WHITENED_TRAIN}',
                    'cifarpatch' : '${CIFAR10_PATCHES_6x6}'
                    }[dataset+kind]

        for size in [ 'small', 'med', 'big' ]:

            N = { 'small' : 625, 'med' : 1600, 'big' : 4000 }[size]

            directory = 'models/%s/%s/%s' % (dataset, kind, size)
            path = '%s/random_patches.yaml' % (directory)

            serial.mkdir(directory)

            f = open(path,'w')

            f.write("""
!obj:pylearn2.scripts.train.Train {
    "dataset": !pkl: &src "%s",
    "model": !obj:galatea.s3c.s3c.S3C {
               "nvis" : 108,
               "nhid" : %d,
               "init_bias_hid" : -4.,
               "max_bias_hid" : 0.,
               "min_bias_hid" : -7.,
               "irange"  : .02,
               "constrain_W_norm" : 1,
               "init_B"  : 3.,
Example #37
"""

from pylearn2.utils import serial
from pylearn2.datasets import preprocessing
from pylearn2.utils import string_utils
from pylearn2.datasets.cifar100 import CIFAR100

data_dir = string_utils.preprocess('${PYLEARN2_DATA_PATH}/cifar100')

print 'Loading CIFAR-100 train dataset...'
train = CIFAR100(which_set = 'train', gcn = 55.)

print "Preparing output directory..."
output_dir = data_dir + '/pylearn2_gcn_whitened'
serial.mkdir( output_dir )
README = open(output_dir + '/README','w')

README.write("""
The .pkl files in this directory may be opened in python using
cPickle, pickle, or pylearn2.serial.load.

train.pkl and test.pkl each contain
a pylearn2 Dataset object defining a labeled
dataset of a 32x32 contrast normalized, approximately whitened version of the CIFAR-100
dataset. train.pkl contains labeled train examples. test.pkl
contains labeled test examples.

preprocessor.pkl contains a pylearn2 ZCA object that was used
to approximately whiten the images. You may want to use this
object later to preprocess other images.
Example #38
            #average pooling
            for output, count in zip(outputs, pooling_region_counts):
                output[i:i+batch_size,...] = average_pool(count)

            t6 = time.time()

            print (t6-t1, t2-t1, t3-t2, t4-t3, t5-t4, t6-t5)

        return outputs[0]

if __name__ == '__main__':
    assert len(sys.argv) == 3
    ipath = sys.argv[1]
    opath = sys.argv[2]

    serial.mkdir(opath)

    model = serial.load('/data/lisatmp/goodfeli/darpa_s3c.pkl')
    preprocessor = serial.load('/data/lisatmp/goodfeli/darpa_imagenet_patch_6x6_train_preprocessor.pkl')
    patchifier = ExtractGridPatches( patch_shape = (size,size), patch_stride = (1,1) )
    preprocessor.items.insert(0,patchifier)

    extractor = FeatureExtractor( model = model, preprocessor = preprocessor)

    contents = os.listdir(ipath)

    for i, fname in enumerate(contents):
        print str(i+1)+'/'+str(len(contents))
        X = np.load(ipath+'/'+fname)
        X = extractor(X)
        np.save(opath+'/'+fname,X)