Example 1
    def eval_acts(self, inp):
        """ Evaluate Tagger on given inp data

        Here we are only evaluating the clean path."""

        clean_path_kv = [[k, v] for k, v in self.clean.iteritems()]

        givens = {}
        givens.update({self.x: inp['features_labeled']})
        givens.update({self.y: inp['targets_labeled']})
        givens.update({self.x_only: inp['features_unlabeled']})
        givens.update(
            {self.masks_unlabeled: np.float32(inp['masks_unlabeled'])})

        params, args = zip(*givens.iteritems())
        # on_unused_input is set to 'ignore' to suppress warnings about
        # unused inputs. Change it to suit your use case.
        function = theano.function(params, [k[1] for k in clean_path_kv],
                                   on_unused_input='ignore')

        clean_path_v_outputs = function(*args)

        acts = AttributeDict(clean=AttributeDict(
            zip([k[0] for k in clean_path_kv], clean_path_v_outputs)))

        return acts
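The `zip(*givens.iteritems())` idiom above just splits the mapping into two aligned tuples (the Theano input variables and their values). A standalone sketch of that pattern, shown here with Python 3's dict.items():

givens = {'x': 1, 'y': 2}
params, args = zip(*givens.items())
# params and args stay aligned, so the pairing can be reconstructed
assert dict(zip(params, args)) == givens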
Example 2
        def encoder(input_, path_name, input_noise_std=0, noise_std=[]):
            h = input_

            logger.info('  0: noise %g' % input_noise_std)
            if input_noise_std > 0.:
                h = h + self.noise_like(h) * input_noise_std

            d = AttributeDict()
            d.unlabeled = self.new_activation_dict()
            d.labeled = self.new_activation_dict()
            d.labeled.z[0] = self.labeled(h)
            d.unlabeled.z[0] = self.unlabeled(h)
            prev_dim = input_dim
            for i, (spec, _, act_f) in layers[1:]:
                d.labeled.h[i - 1], d.unlabeled.h[i - 1] = self.split_lu(h)
                noise = noise_std[i] if i < len(noise_std) else 0.
                curr_dim, z, m, s, h = self.f(h,
                                              prev_dim,
                                              spec,
                                              i,
                                              act_f,
                                              path_name=path_name,
                                              noise_std=noise)
                assert self.layer_dims.get(i) in (None, curr_dim)
                self.layer_dims[i] = curr_dim
                d.labeled.z[i], d.unlabeled.z[i] = self.split_lu(z)
                d.unlabeled.s[i] = s
                d.unlabeled.m[i] = m
                prev_dim = curr_dim
            d.labeled.h[i], d.unlabeled.h[i] = self.split_lu(h)
            return d
Example 3
    def build_datasets(self, dataset_class, p):
        train_split = ['train']
        train_set = dataset_class(which_sets=train_split)

        # Take all indices and permute them
        all_ind = numpy.arange(train_set.num_examples)
        rng = numpy.random.RandomState(seed=p.seed)
        rng.shuffle(all_ind)

        valid_set = dataset_class(which_sets=["valid"])
        valid_ind = numpy.arange(valid_set.num_examples)
        trn_set_size = p.get('train_set_size', None)
        train_ind = all_ind[:trn_set_size]

        test_split = ['test']
        test_set = dataset_class(which_sets=test_split)
        test_ind = numpy.arange(test_set.num_examples)

        trn = AttributeDict(set=train_set,
                            ind=train_ind,
                            batch_size=p.batch_size)
        val = AttributeDict(set=valid_set,
                            ind=valid_ind,
                            batch_size=p.valid_batch_size)
        tst = AttributeDict(set=test_set,
                            ind=test_ind,
                            batch_size=p.valid_batch_size)

        return trn, val, tst
Example 4
        def encoder(input_, path_name, input_noise_std=0, noise_std=[]):
            h = input_

            logger.info('  0: noise %g' % input_noise_std)
            if input_noise_std > 0.:
                h = h + self.noise_like(h) * input_noise_std

            d = AttributeDict()
            d.unlabeled = self.new_activation_dict()
            d.labeled = self.new_activation_dict()
            d.labeled.z[0] = self.labeled(h)
            d.unlabeled.z[0] = self.unlabeled(h)
            prev_dim = input_dim
            for i, (spec, _, act_f) in layers[1:]:
                d.labeled.h[i - 1], d.unlabeled.h[i - 1] = self.split_lu(h)
                noise = noise_std[i] if i < len(noise_std) else 0.
                curr_dim, z, m, s, h = self.f(h, prev_dim, spec, i, act_f,
                                              path_name=path_name,
                                              noise_std=noise)
                assert self.layer_dims.get(i) in (None, curr_dim)
                self.layer_dims[i] = curr_dim
                d.labeled.z[i], d.unlabeled.z[i] = self.split_lu(z)
                d.unlabeled.s[i] = s
                d.unlabeled.m[i] = m
                prev_dim = curr_dim
            d.labeled.h[i], d.unlabeled.h[i] = self.split_lu(h)
            return d
Example 5
def load_and_log_params(cli_params):
    cli_params = AttributeDict(cli_params)
    if cli_params.get('load_from'):
        p = load_df(cli_params.load_from, 'params').to_dict()[0]
        p = AttributeDict(p)
        for key in cli_params.keys():
            if key not in p:
                p[key] = None
        new_params = cli_params
        loaded = True
    else:
        p = cli_params
        new_params = {}
        loaded = False

        # Make dseed equal to seed unless specified explicitly
        if p.get('dseed') is None and p.get('seed') is not None:
            p['dseed'] = p['seed']

    logger.info('== COMMAND LINE ==')
    logger.info(' '.join(sys.argv))

    logger.info('== PARAMETERS ==')
    for k, v in p.items():
        if new_params.get(k) is not None:
            p[k] = new_params[k]
            replace_str = "<- " + str(new_params.get(k))
        else:
            replace_str = ""
        logger.info(" {!s:20s}: {!s:<20s} {}".format(k, v, replace_str))
    return p, loaded
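A hypothetical invocation of load_and_log_params; the keys below merely stand in for whatever the project's argument parser produces, and logger/sys come from the surrounding module:

cli_params = {'seed': 1, 'dseed': None, 'load_from': None}
p, loaded = load_and_log_params(cli_params)
# Without load_from, the CLI params are used directly and dseed inherits seed.
assert loaded is False and p.seed == 1 and p.dseed == 1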
Example 6
    def decoder(self, clean, corr, batch_size):
        get_unlabeled = lambda x: x[batch_size:] if x is not None else x
        est = self.new_activation_dict()
        costs = AttributeDict()
        costs.denois = AttributeDict()
        for i, ((_, spec), act_f) in self.layers[::-1]:
            z_corr = get_unlabeled(corr.z[i])
            z_clean = get_unlabeled(clean.z[i])
            z_clean_s = get_unlabeled(clean.s.get(i))
            z_clean_m = get_unlabeled(clean.m.get(i))

            # It's the last layer
            if i == len(self.layers) - 1:
                fspec = (None, None)
                ver = get_unlabeled(corr.h[i])
                ver_dim = self.layer_dims[i]
                top_g = True
            else:
                fspec = self.layers[i + 1][1][0]
                ver = est.z.get(i + 1)
                ver_dim = self.layer_dims.get(i + 1)
                top_g = False

            z_est = self.g(z_lat=z_corr,
                           z_ver=ver,
                           in_dims=ver_dim,
                           out_dims=self.layer_dims[i],
                           num=i,
                           fspec=fspec,
                           top_g=top_g)

            # For the semi-supervised version
            if z_clean_s:
                z_est_norm = (z_est - z_clean_m) / z_clean_s
            else:
                z_est_norm = z_est
            # NOTE: this overrides the branch above, so the denoising cost is
            # always computed on the unnormalized z_est.
            z_est_norm = z_est

            se = SquaredError('denois' + str(i))
            costs.denois[i] = se.apply(z_est_norm.flatten(2),
                                       z_clean.flatten(2)) \
                / np.prod(self.layer_dims[i], dtype=floatX)
            costs.denois[i].name = 'denois' + str(i)

            # Store references for later use
            est.z[i] = z_est
            est.h[i] = apply_act(z_est, act_f)
            est.s[i] = None
            est.m[i] = None
        return est, costs
Example 7
    def decoder(self, clean, corr):
        est = self.new_activation_dict()
        costs = AttributeDict()
        costs.denois = AttributeDict()
        for i, ((_, spec), act_f) in self.layers[::-1]:
            z_corr = corr.unlabeled.z[i]
            z_clean = clean.unlabeled.z[i]
            z_clean_s = clean.unlabeled.s.get(i)
            z_clean_m = clean.unlabeled.m.get(i)

            # It's the last layer
            if i == len(self.layers) - 1:
                fspec = (None, None)
                ver = corr.unlabeled.h[i]
                ver_dim = self.layer_dims[i]
                top_g = True
            else:
                fspec = self.layers[i + 1][1][0]
                ver = est.z.get(i + 1)
                ver_dim = self.layer_dims.get(i + 1)
                top_g = False

            z_est = self.g(z_lat=z_corr,
                           z_ver=ver,
                           in_dims=ver_dim,
                           out_dims=self.layer_dims[i],
                           num=i,
                           fspec=fspec,
                           top_g=top_g)

            # Normalize with the clean path's statistics when available
            if z_clean_s:
                z_est_norm = (z_est - z_clean_m) / z_clean_s
            else:
                z_est_norm = z_est

            se = SquaredError('denois' + str(i))
            costs.denois[i] = se.apply(z_est_norm.flatten(2),
                                       z_clean.flatten(2)) \
                / np.prod(self.layer_dims[i], dtype=floatX)
            costs.denois[i].name = 'denois' + str(i)

            # Store references for later use
            est.z[i] = z_est
            est.h[i] = apply_act(z_est, act_f)
            est.s[i] = None
            est.m[i] = None
        return est, costs
Example 8
def setup_data(p, test_set=False):
    dataset_class, training_set_size = {
        'cifar10': (CIFAR10, 40000),
        'mnist': (MNIST, 50000),
        'reddit': (SubredditTopPhotosFeatures22, 20000)
    }[p.dataset]

    # Allow overriding the default from command line
    if p.get('unlabeled_samples') is not None:
        training_set_size = p.unlabeled_samples

    train_set = dataset_class(("train", ))

    # Take all indices and permute them
    all_ind = numpy.arange(train_set.num_examples)
    if p.get('dseed'):
        rng = numpy.random.RandomState(seed=p.dseed)
        rng.shuffle(all_ind)

    d = AttributeDict()

    # Choose the training set
    d.train = train_set
    d.train_ind = all_ind[:training_set_size]

    # Then choose validation set from the remaining indices
    d.valid = train_set
    d.valid_ind = numpy.setdiff1d(all_ind, d.train_ind)[:p.valid_set_size]
    logger.info('Using %d examples for validation' % len(d.valid_ind))

    # Only touch test data if requested
    if test_set:
        d.test = dataset_class(("test", ))
        d.test_ind = numpy.arange(d.test.num_examples)

    # Set up optional whitening (only used for CIFAR-10)
    in_dim = train_set.data_sources[train_set.sources.index(
        'features')].shape[1:]
    if len(in_dim) > 1 and p.whiten_zca > 0:
        assert numpy.product(in_dim) == p.whiten_zca, \
            'Need %d whitening dimensions, not %d' % (numpy.product(in_dim),
                                                      p.whiten_zca)
    cnorm = ContrastNorm(p.contrast_norm) if p.contrast_norm != 0 else None

    def get_data(d, i):
        data = d.get_data(request=i)[d.sources.index('features')]
        # Fuel provides Cifar in uint8, convert to float32
        data = numpy.require(data, dtype=numpy.float32)
        return data if cnorm is None else cnorm.apply(data)

    if p.whiten_zca > 0:
        logger.info('Whitening using %d ZCA components' % p.whiten_zca)
        whiten = ZCA()
        whiten.fit(p.whiten_zca, get_data(d.train, d.train_ind))
    else:
        whiten = None

    return in_dim, d, whiten, cnorm
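The validation split is simply whatever remains of the index range after the (shuffled) training slice; numpy.setdiff1d returns that remainder sorted. A toy check of the step:

import numpy as np

all_ind = np.arange(10)
train_ind = all_ind[:6]
valid_ind = np.setdiff1d(all_ind, train_ind)[:3]
assert list(valid_ind) == [6, 7, 8]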
Example 9
def mnist_load(train_size=50000, dseed=1):
    # borrowed from https://github.com/Lasagne/Lasagne/blob/master/examples/mnist.py
    # We can now download and read the training and test set images and labels.
    X_train = load_mnist_images('train-images-idx3-ubyte.gz')
    y_train = load_mnist_labels('train-labels-idx1-ubyte.gz')
    X_test = load_mnist_images('t10k-images-idx3-ubyte.gz')
    y_test = load_mnist_labels('t10k-labels-idx1-ubyte.gz')

    # Hold out the examples beyond train_size (after a seeded shuffle) for validation.
    rng = np.random.RandomState(dseed)
    randix = rng.permutation(X_train.shape[0])
    X_train, X_val = X_train[randix[:train_size]], X_train[randix[train_size:]]
    y_train, y_val = y_train[randix[:train_size]], y_train[randix[train_size:]]

    logger.debug('%d examples in training dataset' % X_train.shape[0])
    logger.debug('%d examples in validation dataset' % X_val.shape[0])
    logger.debug('%d examples in testing dataset' % X_test.shape[0])

    return AttributeDict({
        'X_train': X_train,
        'y_train': y_train,
        'X_val': X_val,
        'y_val': y_val,
        'X_test': X_test,
        'y_test': y_test
    })
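The split above is a seeded permutation followed by slicing, so the training and validation sets are disjoint and reproducible for a fixed dseed. A self-contained check of that logic on toy data:

import numpy as np

rng = np.random.RandomState(1)
X = np.arange(10).reshape(10, 1)
train_size = 8
randix = rng.permutation(X.shape[0])
X_train, X_val = X[randix[:train_size]], X[randix[train_size:]]
assert X_train.shape[0] == 8 and X_val.shape[0] == 2
assert set(randix[:train_size]).isdisjoint(randix[train_size:])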
Example 10
    def decoder(self, clean, corr, batch_size):
        get_unlabeled = lambda x: x[batch_size:] if x is not None else x
        est = self.new_activation_dict()
        costs = AttributeDict()
        costs.denois = AttributeDict()
        for i, ((_, spec), act_f) in self.layers[::-1]:
            z_corr = get_unlabeled(corr.z[i])
            z_clean = get_unlabeled(clean.z[i])
            z_clean_s = get_unlabeled(clean.s.get(i))
            z_clean_m = get_unlabeled(clean.m.get(i))

            # It's the last layer
            if i == len(self.layers) - 1:
                fspec = (None, None)
                ver = get_unlabeled(corr.h[i])
                ver_dim = self.layer_dims[i]
                top_g = True
            else:
                fspec = self.layers[i + 1][1][0]
                ver = est.z.get(i + 1)
                ver_dim = self.layer_dims.get(i + 1)
                top_g = False

            z_est = self.g(
                z_lat=z_corr, z_ver=ver, in_dims=ver_dim, out_dims=self.layer_dims[i], num=i, fspec=fspec, top_g=top_g
            )

            # For the semi-supervised version
            if z_clean_s:
                z_est_norm = (z_est - z_clean_m) / z_clean_s
            else:
                z_est_norm = z_est
            # NOTE: this overrides the branch above, so the denoising cost is
            # always computed on the unnormalized z_est.
            z_est_norm = z_est

            se = SquaredError("denois" + str(i))
            costs.denois[i] = se.apply(z_est_norm.flatten(2), z_clean.flatten(2)) / np.prod(
                self.layer_dims[i], dtype=floatX
            )
            costs.denois[i].name = "denois" + str(i)

            # Store references for later use
            est.z[i] = z_est
            est.h[i] = apply_act(z_est, act_f)
            est.s[i] = None
            est.m[i] = None
        return est, costs
Example 11
def load_and_log_params(cli_params):
    cli_params = AttributeDict(cli_params)
    if cli_params.get('load_from'):
        p = load_df(cli_params.load_from, 'params').to_dict()[0]
        p = AttributeDict(p)
        for key in cli_params.iterkeys():
            if key not in p:
                p[key] = None
        new_params = cli_params
        loaded = True
    else:
        p = cli_params
        new_params = {}
        loaded = False

        # Make dseed equal to seed unless specified explicitly
        if p.get('dseed') is None and p.get('seed') is not None:
            p['dseed'] = p['seed']

    logger.info('== COMMAND LINE ==')
    logger.info(' '.join(sys.argv))

    logger.info('== PARAMETERS ==')
    for k, v in p.iteritems():
        if new_params.get(k) is not None:
            p[k] = new_params[k]
            replace_str = "<- " + str(new_params.get(k))
        else:
            replace_str = ""
        logger.info(" {:20}: {:<20} {}".format(k, v, replace_str))
    return p, loaded
Example 12
def get_mnist_data_dict(unlabeled_samples, valid_set_size, test_set=False):
    train_set = MNIST(("train",))
    # Make sure the MNIST data is in the right format
    train_set.data_sources = (
        (train_set.data_sources[0] / 255.).astype(numpy.float32),
        train_set.data_sources[1])

    # Take all indices and permute them
    all_ind = numpy.arange(train_set.num_examples)
    rng = numpy.random.RandomState(seed=1)
    rng.shuffle(all_ind)

    data = AttributeDict()

    # Choose the training set
    data.train = train_set
    data.train_ind = all_ind[:unlabeled_samples]

    # Then choose validation set from the remaining indices
    data.valid = train_set
    data.valid_ind = numpy.setdiff1d(all_ind, data.train_ind)[:valid_set_size]
    logger.info('Using %d examples for validation' % len(data.valid_ind))
    # Only touch test data if requested
    if test_set:
        data.test = MNIST(("test",))
        data.test_ind = numpy.arange(data.test.num_examples)

    return data
Example 13
    def __init__(self, params):
        super().__init__()

        if not isinstance(params, AttributeDict):
            params = AttributeDict(params)

        self.hparams = params

        self.model = TransformerModel(params)
Example 14
def setup_data(p, test_set=False):
    dataset_class, training_set_size = {"cifar10": (CIFAR10, 40000), "mnist": (MNIST, 50000)}[p.dataset]

    # Allow overriding the default from command line
    if p.get("unlabeled_samples") is not None:
        training_set_size = p.unlabeled_samples

    train_set = dataset_class("train")

    # Make sure the MNIST data is in the right format
    if p.dataset == "mnist":
        d = train_set.data_sources[train_set.sources.index("features")]
        assert numpy.all(d <= 1.0) and numpy.all(d >= 0.0), "Make sure data is in float format and in range 0 to 1"

    # Take all indices and permute them
    all_ind = numpy.arange(train_set.num_examples)
    if p.get("dseed"):
        rng = numpy.random.RandomState(seed=p.dseed)
        rng.shuffle(all_ind)

    d = AttributeDict()

    # Choose the training set
    d.train = train_set
    d.train_ind = all_ind[:training_set_size]

    # Then choose validation set from the remaining indices
    d.valid = train_set
    d.valid_ind = numpy.setdiff1d(all_ind, d.train_ind)[: p.valid_set_size]
    logger.info("Using %d examples for validation" % len(d.valid_ind))

    # Only touch test data if requested
    if test_set:
        d.test = dataset_class("test")
        d.test_ind = numpy.arange(d.test.num_examples)

    # Set up optional whitening (only used for CIFAR-10)
    in_dim = train_set.data_sources[train_set.sources.index("features")].shape[1:]
    if len(in_dim) > 1 and p.whiten_zca > 0:
        assert numpy.product(in_dim) == p.whiten_zca, "Need %d whitening dimensions, not %d" % (
            numpy.product(in_dim),
            p.whiten_zca,
        )
    cnorm = ContrastNorm(p.contrast_norm) if p.contrast_norm != 0 else None

    def get_data(d, i):
        data = d.get_data(request=i)[d.sources.index("features")]
        # Fuel provides Cifar in uint8, convert to float32
        data = numpy.require(data, dtype=numpy.float32)
        return data if cnorm is None else cnorm.apply(data)

    if p.whiten_zca > 0:
        logger.info("Whitening using %d ZCA components" % p.whiten_zca)
        whiten = ZCA()
        whiten.fit(p.whiten_zca, get_data(d.train, d.train_ind))
    else:
        whiten = None

    return in_dim, d, whiten, cnorm
Example 15
def metrics(indexes=['data_objects', 'data_bundles']):
    """
    return document counts
    """
    return [
        AttributeDict({
            'name': index,
            'count': len(store.get(index, []))
        }) for index in indexes
    ]
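A usage sketch, assuming the module-level store (the AttributeDict shown in Example 35) is in scope and maps index names to lists of documents:

store['data_objects'] = [{'id': '1'}, {'id': '2'}]
store['data_bundles'] = []
for m in metrics():
    print('%s: %d' % (m.name, m.count))
# data_objects: 2
# data_bundles: 0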
Example 16
def setup_data(p, test_set=False):
    dataset_class, training_set_size = {
        'cifar10': (CIFAR10, 40000),
        'mnist': (MNIST, 50000),
        'reddit': (SubredditTopPhotosFeatures22, 20000)
    }[p.dataset]

    # Allow overriding the default from command line
    if p.get('unlabeled_samples') is not None:
        training_set_size = p.unlabeled_samples

    train_set = dataset_class(("train",))

    # Take all indices and permute them
    all_ind = numpy.arange(train_set.num_examples)
    if p.get('dseed'):
        rng = numpy.random.RandomState(seed=p.dseed)
        rng.shuffle(all_ind)

    d = AttributeDict()

    # Choose the training set
    d.train = train_set
    d.train_ind = all_ind[:training_set_size]

    # Then choose validation set from the remaining indices
    d.valid = train_set
    d.valid_ind = numpy.setdiff1d(all_ind, d.train_ind)[:p.valid_set_size]
    logger.info('Using %d examples for validation' % len(d.valid_ind))

    # Only touch test data if requested
    if test_set:
        d.test = dataset_class(("test",))
        d.test_ind = numpy.arange(d.test.num_examples)

    # Set up optional whitening (only used for CIFAR-10)
    in_dim = train_set.data_sources[train_set.sources.index('features')].shape[1:]
    if len(in_dim) > 1 and p.whiten_zca > 0:
        assert numpy.product(in_dim) == p.whiten_zca, \
            'Need %d whitening dimensions, not %d' % (numpy.product(in_dim),
                                                      p.whiten_zca)
    cnorm = ContrastNorm(p.contrast_norm) if p.contrast_norm != 0 else None

    def get_data(d, i):
        data = d.get_data(request=i)[d.sources.index('features')]
        # Fuel provides Cifar in uint8, convert to float32
        data = numpy.require(data, dtype=numpy.float32)
        return data if cnorm is None else cnorm.apply(data)

    if p.whiten_zca > 0:
        logger.info('Whitening using %d ZCA components' % p.whiten_zca)
        whiten = ZCA()
        whiten.fit(p.whiten_zca, get_data(d.train, d.train_ind))
    else:
        whiten = None

    return in_dim, d, whiten, cnorm
Example 17
def setup_data(p, test_set=False):
    dataset_class, training_set_size = {
        'cifar10': (CIFAR10, 40000),
        'mnist': (MNIST, 50000),
    }[p.dataset]

    # Allow overriding the default from command line
    if p.get('unlabeled_samples') is not None:
        training_set_size = p.unlabeled_samples

    train_set = dataset_class("train")

    # Make sure the MNIST data is in the right format
    if p.dataset == 'mnist':
        d = train_set.data_sources[train_set.sources.index('features')]
        assert numpy.all(d <= 1.0) and numpy.all(d >= 0.0), \
            'Make sure data is in float format and in range 0 to 1'

    # Take all indices and permute them
    all_ind = numpy.arange(train_set.num_examples)
    if p.get('dseed'):
        rng = numpy.random.RandomState(seed=p.dseed)
        rng.shuffle(all_ind)

    d = AttributeDict()

    # Choose the training set
    d.train = train_set
    d.train_ind = all_ind[:training_set_size]

    # Then choose validation set from the remaining indices
    d.valid = train_set
    d.valid_ind = numpy.setdiff1d(all_ind, d.train_ind)[:p.valid_set_size]
    logger.info('Using %d examples for validation' % len(d.valid_ind))

    # Only touch test data if requested
    if test_set:
        d.test = dataset_class("test")
        d.test_ind = numpy.arange(d.test.num_examples)

    in_dim = train_set.data_sources[train_set.sources.index(
        'features')].shape[1:]

    cnorm = None  # no contrast normalization in this variant

    def get_data(d, i):
        data = d.get_data(request=i)[d.sources.index('features')]
        # Fuel provides CIFAR in uint8, convert to float32
        data = numpy.require(data, dtype=numpy.float32)
        return data if cnorm is None else cnorm.apply(data)

    return in_dim, d
Example 18
    def doPreprocessing(self):
        results = AttributeDict()
        results.dataset = []
        for i in range(len(self.params.dataset)):
            # shall we just load it?
            filename = '%s/preprocessing-%s%s.mat' % (
                self.params.dataset[i].savePath,
                self.params.dataset[i].saveFile, self.params.saveSuffix)
            if self.params.dataset[i].preprocessing.load and os.path.isfile(
                    filename):
                r = loadmat(filename)
                print('Loading file %s ...' % filename)
                results.dataset[i].preprocessing = r.results_preprocessing
            else:
                # or shall we actually calculate it?
                p = deepcopy(self.params)
                p.dataset = self.params.dataset[i]
                d = AttributeDict()
                d.preprocessing = np.copy(SeqSLAM.preprocessing(p))
                results.dataset.append(d)

                if self.params.dataset[i].preprocessing.save:
                    results_preprocessing = results.dataset[i].preprocessing
                    savemat(filename,
                            {'results_preprocessing': results_preprocessing})

        return results
Example 19
    def _load_extends_settings(self, section_name, store):
        """
        Loads all settings from other template(s) specified by a section's
        'extends' setting.

        This method walks a dependency tree of sections from bottom up. Each
        step is a group of settings for a section in the form of a dictionary.
        A 'master' dictionary is updated with the settings at each step. This
        causes the next group of settings to override the previous, and so on.
        The 'section_name' settings are at the top of the dependency tree.
        """
        section = store[section_name]
        extends = section.get('extends')
        if extends is None:
            return
        if DEBUG_CONFIG:
            log.debug('%s extends %s' % (section_name, extends))
        extensions = [section]
        while True:
            extends = section.get('extends', None)
            if not extends:
                break
            try:
                section = store[extends]
                if section in extensions:
                    exts = ', '.join([self._get_section_name(x['__name__'])
                                      for x in extensions])
                    raise exception.ConfigError(
                        "Cyclical dependency between sections %s. "
                        "Check your EXTENDS settings." % exts)
                extensions.insert(0, section)
            except KeyError:
                raise exception.ConfigError(
                    "%s can't extend non-existent section %s" %
                    (section_name, extends))
        transform = AttributeDict()
        for extension in extensions:
            transform.update(extension)
        store[section_name] = transform
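The bottom-up walk means a section's own settings are applied last and therefore win over anything it extends. A small standalone illustration of that layering (the section dicts here are made up):

base = {'__name__': 'base', 'color': 'blue', 'size': 'small'}
child = {'__name__': 'child', 'extends': 'base', 'size': 'large'}
merged = {}
for section in (base, child):   # bottom of the dependency tree first
    merged.update(section)
assert merged['color'] == 'blue' and merged['size'] == 'large'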
Example 20
    def __init__(self, p):
        logger.debug(theano.config)
        self.p = p
        self.params = OrderedDict()
        self.rstream = RandomStreams(seed=p.seed)
        self.rng = np.random.RandomState(seed=p.seed)
        self.in_dim = data_dim(p)  # input dimensionality

        input_type = T.type.TensorType('float32',
                                       [False] * (len(self.in_dim) + 1))
        input_plus_one_type = T.type.TensorType('float32', [False] *
                                                (len(self.in_dim) + 2))
        self.x_only = input_type('features_unlabeled')
        self.x = input_type('features_labeled')
        self.y = theano.tensor.lmatrix('targets_labeled')
        self.masks_unlabeled = input_plus_one_type('masks_unlabeled')

        # We noticed that continuous case becomes more stable if we
        # have stable v, i.e. put Ladder's v through a sigmoid
        if p.input_type == 'continuous':
            decoder_spec = ('gauss_stable_v', )
        else:
            decoder_spec = ('gauss', )

        # Ladder Network, a.k.a Parametric Mapping

        ladder_p = AttributeDict({
            'seed': p.seed,
            'encoder_layers': p.encoder_proj,
            'decoder_spec': decoder_spec,
            'denoising_cost_x': (0.0, ) * len(p.encoder_proj),
            'act': 'relu',
            # Ladder doesn't add noise to its layers. Tagger handles all corruption.
            'f_local_noise_std': 0.,
            'super_noise_std': 0.,
            'zestbn': 'no',
            'top_c': True,
            'lr': 0.,
        })
        self.ladder = LadderAE(ladder_p)
        # disable logging from ladder
        logging.getLogger('main.model').setLevel(logging.WARNING)
Example 21
def merge_with_yaml(yaml_filename):
    """Load a yaml config file and merge it into the global config object"""
    global _g_conf
    with open(yaml_filename, 'r') as f:

        yaml_file = yaml.load(f)

        yaml_cfg = AttributeDict(yaml_file)

    _merge_a_into_b(yaml_cfg, _g_conf)

    path_parts = os.path.split(yaml_filename)
    _g_conf.EXPERIMENT_BATCH_NAME = os.path.split(path_parts[-2])[-1]
    _g_conf.EXPERIMENT_NAME = path_parts[-1].split('.')[-2]
Example 22
    def doPreprocessing(self):
        results = AttributeDict()
        results.dataset = []
        for i in range(len(self.params.dataset)):
            # shall we just load it?
            filename = '%s/preprocessing-%s%s.mat' % (self.params.dataset[i].savePath, self.params.dataset[i].saveFile, self.params.saveSuffix)
            if self.params.dataset[i].preprocessing.load and os.path.isfile(filename):         
                r = loadmat(filename)
                print('Loading file %s ...' % filename)
                results.dataset[i].preprocessing = r.results_preprocessing
            else:
                # or shall we actually calculate it?
                p = deepcopy(self.params)    
                p.dataset = self.params.dataset[i]
                d = AttributeDict()
                d.preprocessing = np.copy(SeqSLAM.preprocessing(p))
                results.dataset.append(d)
    
                if self.params.dataset[i].preprocessing.save:
                    results_preprocessing = results.dataset[i].preprocessing
                    savemat(filename, {'results_preprocessing': results_preprocessing})

        return results
Example 23
    def __init__(self, params):
        super().__init__()

        if not isinstance(params, AttributeDict):
            params = AttributeDict(params)

        self.hparams = params

        self.model = VQVCModel(params)

        self.spec_augmenter = SpecAugmentation(time_drop_width=3,
                                               time_stripes_num=2,
                                               freq_drop_width=3,
                                               freq_stripes_num=2)
Example 24
    def encoder(self, input_, path_name, input_noise_std, noise_std):
        h = input_
        h = h + (self.rstream.normal(size=h.shape).astype(floatX) *
                 input_noise_std)

        d = AttributeDict()
        d.unlabeled = self.new_activation_dict()
        d.labeled = self.new_activation_dict()
        d.labeled.z[0], d.unlabeled.z[0] = self.split_lu(h)
        prev_dim = self.input_dim
        for i, (spec, act_f) in self.layers[1:]:
            d.labeled.h[i - 1], d.unlabeled.h[i - 1] = self.split_lu(h)
            noise = noise_std[i] if i < len(noise_std) else 0.
            curr_dim, z, m, s, h = self.f(h, prev_dim, spec, i, act_f,
                                          path_name=path_name,
                                          noise_std=noise)
            self.layer_dims[i] = curr_dim
            d.labeled.z[i], d.unlabeled.z[i] = self.split_lu(z)
            d.unlabeled.s[i] = s
            d.unlabeled.m[i] = m
            prev_dim = curr_dim
        d.labeled.h[i], d.unlabeled.h[i] = self.split_lu(h)

        return d
Example 25
def get_dataset(root: str = DATASETS_PATH,
                train_size: float = .8,
                shuffle=True,
                seed=None) -> AttributeDict:
    dtype_dict = {
        # NOTE: both branches map to np.float_, so the conditional currently has no effect.
        x: np.float_ if x in CATEGORICAL_FEATURES else np.float_
        for x in ALL_FEATURES
    }
    dataset = AttributeDict(
        train=AttributeDict(X=pd.read_csv(os.path.join(root, 'X1.csv'),
                                          dtype=dtype_dict),
                            y=pd.read_csv(os.path.join(root, 'Y1.csv'),
                                          header=None,
                                          dtype={"shares": np.float_},
                                          names=["shares"])),
        validation=AttributeDict(X=None, y=None),
        test=AttributeDict(X=pd.read_csv(os.path.join(root, 'X2.csv'),
                                         dtype=dtype_dict),
                           y=None))

    if train_size < 1:
        dataset.train.X, dataset.validation.X, dataset.train.y, dataset.validation.y = train_test_split(
            dataset.train.X,
            dataset.train.y,
            train_size=train_size,
            shuffle=shuffle,
            random_state=seed)
    else:
        dataset.validation.X = pd.DataFrame(
            {c: []
             for c in dataset.train.X.columns.values})
        dataset.validation.y = pd.DataFrame(
            {c: []
             for c in dataset.train.y.columns.values})

    return dataset
Example 26
def setup_data(p, test_set=False):
    dataset_class, training_set_size = {
        'cifar10': (CIFAR10, 40000),
        'mnist': (MNIST, 50000),
    }[p.dataset]

    # Allow overriding the default from command line
    if p.get('unlabeled_samples') is not None:
        training_set_size = p.unlabeled_samples

    train_set = dataset_class("train")

    # Make sure the MNIST data is in the right format
    if p.dataset == 'mnist':
        d = train_set.data_sources[train_set.sources.index('features')]
        assert numpy.all(d <= 1.0) and numpy.all(d >= 0.0), \
            'Make sure data is in float format and in range 0 to 1'

    # Take all indices and permute them
    all_ind = numpy.arange(train_set.num_examples)
    if p.get('dseed'):
        rng = numpy.random.RandomState(seed=p.dseed)
        rng.shuffle(all_ind)

    d = AttributeDict()

    # Choose the training set
    d.train = train_set
    d.train_ind = all_ind[:training_set_size]

    # Then choose validation set from the remaining indices
    d.valid = train_set
    d.valid_ind = numpy.setdiff1d(all_ind, d.train_ind)[:p.valid_set_size]
    logger.info('Using %d examples for validation' % len(d.valid_ind))

    # Only touch test data if requested
    if test_set:
        d.test = dataset_class("test")
        d.test_ind = numpy.arange(d.test.num_examples)

    in_dim = train_set.data_sources[train_set.sources.index('features')].shape[1:]

    cnorm = None  # no contrast normalization in this variant

    def get_data(d, i):
        data = d.get_data(request=i)[d.sources.index('features')]
        # Fuel provides CIFAR in uint8, convert to float32
        data = numpy.require(data, dtype=numpy.float32)
        return data if cnorm is None else cnorm.apply(data)

    return in_dim, d
Example 27
def metrics(indexes=['data_objects', 'data_bundles']):
    """
    return document counts
    """
    def _count(index):
        try:
            s = Search(using=client, index=index, doc_type=index[:-1])
            return s.count()
        except elasticsearch.exceptions.NotFoundError as no_found:
            log.info('NotFoundError {} (expected on empty db)'.format(index))
            return 0
        except Exception as e:
            log.error('error getting count of documents in {}'.format(index))
            log.exception(e)
            raise e
    return [
        AttributeDict(
            {'name': index, 'count': _count(index)}
            ) for index in indexes
        ]
Example 28
def merge_with_yaml(yaml_filename):
    """Load a yaml config file and merge it into the global config object"""
    global _g_conf
    with open(yaml_filename, 'r') as f:

        yaml_file = yaml.load(f)

        yaml_cfg = AttributeDict(yaml_file)

    print("yaml here", yaml_cfg)

    print("batch size ", yaml_cfg.BATCH_SIZE)

    _merge_a_into_b(yaml_cfg, _g_conf)

    #TODO: Merging is missing

    path_parts = os.path.split(yaml_filename)
    _g_conf.EXPERIMENT_BATCH_NAME = os.path.split(path_parts[-2])[-1]
    _g_conf.EXPERIMENT_NAME = path_parts[-1].split('.')[-2]
Example 29
def load_and_log_params(cli_params):
    cli_params = AttributeDict(cli_params)
    # If load_from is given, the parameters are read from a file (HDF format)
    if cli_params.get('load_from'):
        # the load_from value plus 'params' forms the full lookup key
        # string => dict
        p = load_df(cli_params.load_from, 'params').to_dict()[0]
        # dict => AttributeDict
        p = AttributeDict(p)

        for key in cli_params.iterkeys():
            if key not in p:
                p[key] = None
        new_params = cli_params
        loaded = True

    # Without load_from, just wrap cli_params directly
    else:
        p = cli_params
        new_params = {}
        loaded = False

        # Make dseed equal to seed unless specified explicitly
        # (when dseed is empty but seed is given, copy seed into dseed)
        if p.get('dseed') is None and p.get('seed') is not None:
            p['dseed'] = p['seed']

    # Logging
    logger.info('== COMMAND LINE ==')
    logger.info(' '.join(sys.argv))

    logger.info('== PARAMETERS ==')
    for k, v in p.iteritems():
        if new_params.get(k) is not None:
            p[k] = new_params[k]
            replace_str = "<- " + str(new_params.get(k))
        else:
            replace_str = ""
        logger.info(" {:20}: {:<20} {}".format(k, v, replace_str))
    return p, loaded
Example 30
def update(_id, doc, index='data_objects'):
    """
    partial update using the contructed _id
    """
    def merge(a, b, path=None):
        "merges b into a"
        if path is None:
            path = []
        for key in b:
            if key in a:
                if isinstance(a[key], dict) and isinstance(b[key], dict):
                    merge(a[key], b[key], path + [str(key)])
                elif a[key] == b[key]:
                    pass  # same leaf value
                else:
                    a[key] = b[key]  # update with new
            else:
                a[key] = b[key]
        return a

    store[index].append(
        AttributeDict(merge(_get(_id, index=index, current=True),
                            doc)))  # noqa
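The inner merge behaves like a recursive dict update: nested dicts are merged key by key, and plain values from the patch win. A standalone demo of that behaviour (the helper is re-declared locally just for the demo):

def merge(a, b):
    for key in b:
        if key in a and isinstance(a[key], dict) and isinstance(b[key], dict):
            merge(a[key], b[key])
        else:
            a[key] = b[key]
    return a

old = {'meta': {'id': '42', 'version': 1}, 'name': 'x'}
patch = {'meta': {'version': 2}, 'size': 10}
assert merge(old, patch) == {'meta': {'id': '42', 'version': 2}, 'name': 'x', 'size': 10}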
Example 31
def manage_review_view(request, review_id=None):
    if not request.user.id:
        return HttpResponseRedirect('/login')

    if request.method in CREATE_OR_UPDATE_METHODS:
        data = request.POST
        if data.get('action') == 'delete':
            Review.objects.filter(user=request.user, id=review_id).delete()
            next_url = '/'
        elif data.get('action') == 'autofill':
            fill_missing_review_data(review_id)
            next_url = request.path
        else:
            review, created = Review.create_or_update(request.user,
                                                      data,
                                                      id=review_id)
            if created:
                fill_missing_review_data(review.id)
            next_url = '/?submittedReview={}'.format(review.id)

        return HttpResponseRedirect(next_url)
    else:
        if review_id:
            review = Review.objects.get(user=request.user, id=review_id)
        else:
            # pre-populate fields from url parameters.
            review = AttributeDict({
                k: v
                for k, v in request.GET.items()
                if k in {'url', 'rating', 'text'}
            })

    return render_with_globals(request, 'manage_review.html', {
        'review': review,
        'review_id': review_id
    })
Example 32
def save(doc, index='data_objects'):
    """
    save the body in the index, ensure version and id set
    """
    doc = AttributeDict(doc)
    version = doc.get('version', None)
    if not version:
        doc['version'] = now()
    if not doc.get('id', None):
        temp_id = str(uuid.uuid4())
        doc['id'] = temp_id

    if _is_duplicate(doc, index):
        raise Exception("duplicate document")

    if index not in store:
        store[index] = []
    doc.meta = AttributeDict({'id': doc.id})
    store[index].append(doc)
    log.info(doc.id)
    return doc
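A hypothetical call, assuming the module's _is_duplicate helper (not shown here) reports no duplicate and now() returns a timestamp:

saved = save({'name': 'example object'})
assert saved.id and saved.version          # filled in because they were missing
assert store['data_objects'][-1] is saved  # appended to the in-memory index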
Example 33
def setup_data(p, use_unlabeled=True, use_labeled=True):
    assert use_unlabeled or use_labeled, 'Cannot train without cost'
    dataset_class = DATASETS[p.dataset]
    dataset = dataset_class(p)
    train_ind = dataset.trn.ind

    if 'labeled_samples' not in p or p.labeled_samples == 0:
        n_labeled = len(train_ind)
    else:
        n_labeled = p.labeled_samples

    if 'unlabeled_samples' not in p:
        n_unlabeled = len(train_ind)
    else:
        n_unlabeled = p.unlabeled_samples

    assert p.batch_size <= n_labeled, "batch size too large"
    assert len(train_ind) >= n_labeled
    assert len(train_ind) >= n_unlabeled, "not enough training samples"
    assert n_labeled <= n_unlabeled, \
        "need at least as many unlabeled samples as labeled samples"

    # If not using all labels, let's balance classes
    balance_classes = n_labeled < len(train_ind)

    if balance_classes and use_labeled:
        # Ensure each label is equally represented
        y = dataset.get_train_labels()
        n_classes = numpy.max(y) + 1

        n_from_each_class = n_labeled / n_classes
        logger.info('n_sample_from_each_class {0}'.format(n_from_each_class))
        assert n_labeled % n_classes == 0

        i_labeled = []
        for c in xrange(n_classes):
            i = (train_ind[y[:, 0] == c])[:n_from_each_class]
            if len(i) < n_from_each_class:
                logger.warning('Class {0} : only got {1}'.format(c, len(i)))
            i_labeled += list(i)

    else:
        i_labeled = train_ind[:n_labeled]

    def make_unlabeled_set(train_ind, i_labeled, n_unlabeled):
        """ i_unused_labeled: the labels that are not used in i_labeled.
        n_unlabeled_needed: the number of need for i_unlabeled beyond len(i_labeled)
        """
        i_unused_labeled = list(set(train_ind) - set(i_labeled))
        n_unlabeled_needed = n_unlabeled - len(i_labeled)
        i_unlabeled = i_unused_labeled[:n_unlabeled_needed]
        i_unlabeled.extend(i_labeled)

        return i_unlabeled

    i_unlabeled = make_unlabeled_set(train_ind, i_labeled, n_unlabeled)

    logger.info('Creating data set with %d labeled and %d total samples' %
                (len(i_labeled), len(i_unlabeled)))

    streams = AttributeDict()

    def make(kind, ind_labeled, ind_unlabeled):
        ds_labeled, ds_unlabeled = None, None
        if use_labeled:
            ds_labeled = dataset.get_datastream(kind, ind_labeled)
        if use_unlabeled:
            ds_unlabeled = dataset.get_datastream(kind, ind_unlabeled)

        return combine_datastreams(ds_labeled, ds_unlabeled)

    streams.train = make('trn', i_labeled, i_unlabeled)
    streams.valid = make('val', None, None)  # use all indices
    streams.test = make('tst', None, None)  # use all indices

    return streams
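The class-balancing branch above just takes the first n_from_each_class training indices per class. A small numpy check of that idea:

import numpy as np

y = np.array([[0], [1], [0], [1], [0], [1]])
train_ind = np.arange(len(y))
n_from_each_class = 2
i_labeled = []
for c in range(int(y.max()) + 1):
    i_labeled += list(train_ind[y[:, 0] == c][:n_from_each_class])
assert sorted(i_labeled) == [0, 1, 2, 3]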
Example 34
    def apply(self, input_labeled, target_labeled, input_unlabeled):
        self.layer_counter = 0
        input_dim = self.p.encoder_layers[0]

        # Store the dimension tuples in the same order as layers.
        layers = self.layers
        self.layer_dims = {0: input_dim}

        self.lr = self.shared(self.default_lr, 'learning_rate', role=None)

        self.costs = costs = AttributeDict()
        self.costs.denois = AttributeDict()

        self.act = AttributeDict()
        self.error = AttributeDict()
        self.oos = AttributeDict()

        top = len(layers) - 1

        N = input_labeled.shape[0]
        self.join = lambda l, u: T.concatenate([l, u], axis=0)
        self.labeled = lambda x: x[:N] if x is not None else x
        self.unlabeled = lambda x: x[N:] if x is not None else x
        self.split_lu = lambda x: (self.labeled(x), self.unlabeled(x))

        input_concat = self.join(input_labeled, input_unlabeled)

        def encoder(input_, path_name, input_noise_std=0, noise_std=[]):
            h = input_

            logger.info('  0: noise %g' % input_noise_std)
            if input_noise_std > 0.:
                h = h + self.noise_like(h) * input_noise_std

            d = AttributeDict()
            d.unlabeled = self.new_activation_dict()
            d.labeled = self.new_activation_dict()
            d.labeled.z[0] = self.labeled(h)
            d.unlabeled.z[0] = self.unlabeled(h)
            prev_dim = input_dim
            for i, (spec, _, act_f) in layers[1:]:
                d.labeled.h[i - 1], d.unlabeled.h[i - 1] = self.split_lu(h)
                noise = noise_std[i] if i < len(noise_std) else 0.
                curr_dim, z, m, s, h = self.f(h,
                                              prev_dim,
                                              spec,
                                              i,
                                              act_f,
                                              path_name=path_name,
                                              noise_std=noise)
                assert self.layer_dims.get(i) in (None, curr_dim)
                self.layer_dims[i] = curr_dim
                d.labeled.z[i], d.unlabeled.z[i] = self.split_lu(z)
                d.unlabeled.s[i] = s
                d.unlabeled.m[i] = m
                prev_dim = curr_dim
            d.labeled.h[i], d.unlabeled.h[i] = self.split_lu(h)
            return d

        # Clean, supervised
        logger.info('Encoder: clean, labeled')
        clean = self.act.clean = encoder(input_concat, 'clean')

        # Corrupted, supervised
        logger.info('Encoder: corr, labeled')
        corr = self.act.corr = encoder(input_concat,
                                       'corr',
                                       input_noise_std=self.p.super_noise_std,
                                       noise_std=self.p.f_local_noise_std)
        est = self.act.est = self.new_activation_dict()

        # Decoder path in opposite order
        logger.info('Decoder: z_corr -> z_est')
        for i, ((_, spec), l_type, act_f) in layers[::-1]:
            z_corr = corr.unlabeled.z[i]
            z_clean = clean.unlabeled.z[i]
            z_clean_s = clean.unlabeled.s.get(i)
            z_clean_m = clean.unlabeled.m.get(i)
            fspec = layers[i + 1][1][0] if len(layers) > i + 1 else (None, None)

            if i == top:
                ver = corr.unlabeled.h[i]
                ver_dim = self.layer_dims[i]
                top_g = True
            else:
                ver = est.z.get(i + 1)
                ver_dim = self.layer_dims.get(i + 1)
                top_g = False

            z_est = self.g(z_lat=z_corr,
                           z_ver=ver,
                           in_dims=ver_dim,
                           out_dims=self.layer_dims[i],
                           l_type=l_type,
                           num=i,
                           fspec=fspec,
                           top_g=top_g)

            if z_est is not None:
                # Denoising cost

                if z_clean_s and self.p.zestbn == 'bugfix':
                    z_est_norm = (z_est - z_clean_m
                                  ) / T.sqrt(z_clean_s + np.float32(1e-10))
                elif z_clean_s is None or self.p.zestbn == 'no':
                    z_est_norm = z_est
                else:
                    assert False, 'Not supported path'

                se = SquaredError('denois' + str(i))
                costs.denois[i] = se.apply(z_est_norm.flatten(2),
                                           z_clean.flatten(2)) \
                    / np.prod(self.layer_dims[i], dtype=floatX)
                costs.denois[i].name = 'denois' + str(i)
                denois_print = 'denois %.2f' % self.p.denoising_cost_x[i]
            else:
                denois_print = ''

            # Store references for later use
            est.h[i] = self.apply_act(z_est, act_f)
            est.z[i] = z_est
            est.s[i] = None
            est.m[i] = None
            logger.info('  g%d: %10s, %s, dim %s -> %s' %
                        (i, l_type, denois_print, self.layer_dims.get(i + 1),
                         self.layer_dims.get(i)))

        # Costs
        y = target_labeled.flatten()

        Q = int(self.layer_dims[top][0]) - 1
        logger.info('Q=%d' % Q)
        costs.class_clean = CategoricalCrossEntropyIV(
            Q=Q,
            alpha=self.p.alpha,
            beta=self.p.beta,
            dbeta=self.p.dbeta,
            gamma=self.p.gamma,
            gamma1=self.p.gamma1).apply(y, clean.labeled.h[top])
        costs.class_clean.name = 'cost_class_clean'

        costs.class_corr = CategoricalCrossEntropyIV(
            Q=Q,
            alpha=self.p.alpha,
            beta=self.p.beta,
            dbeta=self.p.dbeta,
            gamma=self.p.gamma,
            gamma1=self.p.gamma1,
        ).apply(y, corr.labeled.h[top])
        costs.class_corr.name = 'cost_class_corr'

        # This will be used for training
        costs.total = costs.class_corr * 1.0
        for i in range(top + 1):
            if costs.denois.get(i) and self.p.denoising_cost_x[i] > 0:
                costs.total += costs.denois[i] * self.p.denoising_cost_x[i]
        if self.p.alpha_clean:
            y_true = y
            eps = np.float32(1e-6)

            # scale preds so that the class probas of each sample sum to 1
            y_pred = clean.labeled.h[top] + eps
            y_pred /= y_pred.sum(axis=-1, keepdims=True)

            # out-of-set or unlabeled examples
            y0 = T.or_(T.eq(y_true, 0), T.gt(y_true, Q))
            y0sum = y0.sum() + eps  # number of oos/unlabeled examples

            cost1 = T.nnet.categorical_crossentropy(y_pred, y_pred)
            # average entropy over the out-of-set / unlabeled examples selected by y0
            cost1 = T.dot(y0, cost1) / y0sum
            costs.total += self.p.alpha_clean * cost1

        costs.total.name = 'cost_total'

        # Classification error
        mr = MisclassificationRateIV(oos_thr=self.p.oos_thr)
        self.error.clean = mr.apply(y, clean.labeled.h[top]) * np.float32(100.)
        self.error.clean.name = 'error_rate_clean'
        oosr = OOSRateIV()
        self.oos.clean = oosr.apply(y, clean.labeled.h[top]) * np.float32(100.)
        self.oos.clean.name = 'oos_rate_clean'
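The join/labeled/unlabeled lambdas above only concatenate the two batches along axis 0 and slice them back apart at N. A numpy analogue of that bookkeeping:

import numpy as np

labeled = np.zeros((3, 4), dtype=np.float32)
unlabeled = np.ones((5, 4), dtype=np.float32)
N = labeled.shape[0]
joined = np.concatenate([labeled, unlabeled], axis=0)
assert (joined[:N] == labeled).all() and (joined[N:] == unlabeled).all()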
Example 35
# -*- encoding: utf-8 -*-

# customize/override for your backend

import logging
import os
import uuid

from utils import AttributeDict, now, add_created_timestamps, \
                  add_updated_timestamps

log = logging.getLogger(__name__)

DEFAULT_PAGE_SIZE = 100

store = AttributeDict({})


def save(doc, index='data_objects'):
    """
    save the body in the index, ensure version and id set
    """
    doc = AttributeDict(doc)
    version = doc.get('version', None)
    if not version:
        doc['version'] = now()
    if not doc.get('id', None):
        temp_id = str(uuid.uuid4())
        doc['id'] = temp_id

    if _is_duplicate(doc, index):
Example 36
def defaultParameters():

    params = AttributeDict()

    # switches
    params.DO_PREPROCESSING = 1
    params.DO_RESIZE        = 0
    params.DO_GRAYLEVEL     = 1
    params.DO_PATCHNORMALIZATION    = 1 #!!!! 1
    params.DO_SAVE_PREPROCESSED_IMG = 0
    params.DO_DIFF_MATRIX   = 1
    params.DO_CONTRAST_ENHANCEMENT  = 1
    params.DO_FIND_MATCHES  = 1


    # parameters for preprocessing
    params.downsample = AttributeDict()
    params.downsample.size = [32, 64]  # height, width
    try:
        params.downsample.method = Image.LANCZOS
    except:
        params.downsample.method = Image.ANTIALIAS
    params.normalization = AttributeDict()
    params.normalization.sideLength = 8
    params.normalization.mode = 1
            
    
    # parameters regarding the matching between images
    params.matching = AttributeDict()
    params.matching.ds = 10 
    params.matching.Rrecent=5
    params.matching.vmin = 0.8
    params.matching.vskip = 0.1
    params.matching.vmax = 1.2  
    params.matching.Rwindow = 10
    params.matching.save = 1
    params.matching.load = 0 #1
    
    # parameters for contrast enhancement on difference matrix
    params.contrastEnhancement = AttributeDict()  
    params.contrastEnhancement.R = 10

    # load old results or re-calculate? save results?
    params.differenceMatrix = AttributeDict()
    params.differenceMatrix.save = 1
    params.differenceMatrix.load = 0 #1
    
    params.contrastEnhanced = AttributeDict()
    params.contrastEnhanced.save = 1
    params.contrastEnhanced.load = 0 #1
    
    # suffix appended on files containing the results
    params.saveSuffix=''
    
    return params
Example 37
    def new_activation_dict(self):
        return AttributeDict({'z': {}, 'h': {}, 's': {}, 'm': {}})
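For context, AttributeDict in these snippets is a dict whose keys are also accessible as attributes. A minimal sketch of the idea; the real utils.AttributeDict used by these projects may add more behaviour:

class AttributeDict(dict):
    # Minimal sketch: keys are readable and writable as attributes.
    def __getattr__(self, name):
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name)

    def __setattr__(self, name, value):
        self[name] = value

d = AttributeDict({'z': {}, 'h': {}, 's': {}, 'm': {}})
d.z[0] = 'input layer'
d.top_c = True
assert d['z'][0] == 'input layer' and d['top_c'] is True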
Example 38
def demo():

    # set the parameters

    # start with default parameters
    params = defaultParameters()

    # Nordland spring dataset
    ds = AttributeDict()
    ds.name = 'spring'

    path = os.environ['DATASET_1_PATH']

    ds.imagePath = path

    ds.prefix = 'images-'
    ds.extension = '.png'
    ds.suffix = ''
    ds.imageSkip = 1  # use every n-nth image
    ds.imageIndices = range(1, 26, ds.imageSkip)
    ds.savePath = 'results'
    ds.saveFile = '%s-%d-%d-%d' % (ds.name, ds.imageIndices[0], ds.imageSkip,
                                   ds.imageIndices[-1])

    ds.preprocessing = AttributeDict()
    ds.preprocessing.save = 1
    ds.preprocessing.load = 0  #1
    ds.crop = []

    spring = ds

    ds2 = deepcopy(ds)
    # Nordland winter dataset
    ds2.name = 'winter'
    #ds.imagePath = '../datasets/nordland/64x32-grayscale-1fps/winter'
    path = os.environ['DATASET_2_PATH']

    ds2.saveFile = '%s-%d-%d-%d' % (ds2.name, ds2.imageIndices[0],
                                    ds2.imageSkip, ds2.imageIndices[-1])
    ds2.crop = []

    winter = ds2

    params.dataset = [spring, winter]

    # load old results or re-calculate?
    params.differenceMatrix.load = 0
    params.contrastEnhanced.load = 0
    params.matching.load = 0

    # where to save / load the results
    params.savePath = 'results'

    ## now process the dataset
    ss = SeqSLAM(params)
    t1 = time.time()
    results = ss.run()
    t2 = time.time()
    print "time taken: " + str(t2 - t1)

    ## show some results
    if len(results.matches) > 0:
        # The LARGER the score, the WEAKER the match.
        m = results.matches[:, 0]
        # you can calculate a precision-recall plot by varying this threshold
        thresh = 0.90
        # remove the weakest matches
        m[results.matches[:, 1] > thresh] = np.nan
        plt.plot(m, '.')  # ideally, this would only be the diagonal
        plt.title('Matchings')
        plt.show()
    else:
        print "Zero matches"
Example 39
from __future__ import unicode_literals

from ast import literal_eval
from utils import AttributeDict
import copy
import numpy as np
import os
import os.path as osp
import yaml

from configs.namer import generate_name
from logger.coil_logger import create_log, add_message

# TODO: NAMing conventions ?

_g_conf = AttributeDict()
"""#### GENERAL CONFIGURATION PARAMETERS ####"""
_g_conf.NUMBER_OF_LOADING_WORKERS = 12
_g_conf.SENSORS = {'rgb': (3, 128, 128)}
_g_conf.MEASUREMENTS = {'targets': (31)}
_g_conf.TARGETS = ['steer', 'throttle', 'brake']
_g_conf.INPUTS = ['speed_module']
_g_conf.BALANCE_DATA = True
_g_conf.STEERING_DIVISION = [0.05, 0.05, 0.1, 0.3, 0.3, 0.1, 0.05, 0.05]
#_g_conf.STEERING_DIVISION = [0.01, 0.02, 0.07, 0.4, 0.4, 0.07, 0.02, 0.01]  # Forcing curves alot
_g_conf.LABELS_DIVISION = [[0, 2, 5], [3], [4]]
_g_conf.BATCH_SIZE = 120

#_g_conf.AUGMENTATION_SUITE = [iag.ToGPU()]#, iag.Add((0, 0)), iag.Dropout(0, 0), iag.Multiply((1, 1.04)),
#                             #iag.GaussianBlur(sigma=(0.0, 3.0)),
#                             iag.ContrastNormalization((0.5, 1.5))
Example 40
def setup_data(p, use_unlabeled=True, use_labeled=True):
    assert use_unlabeled or use_labeled, 'Cannot train without cost'
    dataset_class = DATASETS[p.dataset]
    dataset = dataset_class(p)
    train_ind = dataset.trn.ind

    if 'labeled_samples' not in p or p.labeled_samples == 0:
        n_labeled = len(train_ind)
    else:
        n_labeled = p.labeled_samples

    if 'unlabeled_samples' not in p:
        n_unlabeled = len(train_ind)
    else:
        n_unlabeled = p.unlabeled_samples

    assert p.batch_size <= n_labeled, "batch size too large"
    assert len(train_ind) >= n_labeled
    assert len(train_ind) >= n_unlabeled, "not enough training samples"
    assert n_labeled <= n_unlabeled, \
        "need at least as many unlabeled samples as labeled samples"

    # If not using all labels, let's balance classes
    balance_classes = n_labeled < len(train_ind)

    if balance_classes and use_labeled:
        # Ensure each label is equally represented
        y = dataset.get_train_labels()
        n_classes = numpy.max(y) + 1

        n_from_each_class = n_labeled / n_classes
        logger.info('n_sample_from_each_class {0}'.format(n_from_each_class))
        assert n_labeled % n_classes == 0

        i_labeled = []
        for c in xrange(n_classes):
            i = (train_ind[y[:, 0] == c])[:n_from_each_class]
            if len(i) < n_from_each_class:
                logger.warning('Class {0} : only got {1}'.format(c, len(i)))
            i_labeled += list(i)

    else:
        i_labeled = train_ind[:n_labeled]

    def make_unlabeled_set(train_ind, i_labeled, n_unlabeled):
        """ i_unused_labeled: the labels that are not used in i_labeled.
        n_unlabeled_needed: the number of need for i_unlabeled beyond len(i_labeled)
        """
        i_unused_labeled = list(set(train_ind) - set(i_labeled))
        n_unlabeled_needed = n_unlabeled - len(i_labeled)
        i_unlabeled = i_unused_labeled[:n_unlabeled_needed]
        i_unlabeled.extend(i_labeled)

        return i_unlabeled

    i_unlabeled = make_unlabeled_set(train_ind, i_labeled, n_unlabeled)

    logger.info('Creating data set with %d labeled and %d total samples' %
                (len(i_labeled), len(i_unlabeled)))

    streams = AttributeDict()

    def make(kind, ind_labeled, ind_unlabeled):
        ds_labeled, ds_unlabeled = None, None
        if use_labeled:
            ds_labeled = dataset.get_datastream(kind, ind_labeled)
        if use_unlabeled:
            ds_unlabeled = dataset.get_datastream(kind, ind_unlabeled)

        return combine_datastreams(ds_labeled, ds_unlabeled)

    streams.train = make('trn', i_labeled, i_unlabeled)
    streams.valid = make('val', None, None)  # use all indices
    streams.test = make('tst', None, None)  # use all indices

    return streams
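
# --- Hedged sketch (not from the original example): the class-balancing step
# above, written as a standalone numpy function. Assumes integer labels y
# aligned with train_ind and n_labeled divisible by the number of classes.
import numpy

def balanced_label_subset(train_ind, y, n_labeled):
    n_classes = int(numpy.max(y)) + 1
    assert n_labeled % n_classes == 0
    n_per_class = n_labeled // n_classes
    chosen = []
    for c in range(n_classes):
        # take the first n_per_class training indices whose label is c
        chosen.extend(train_ind[y == c][:n_per_class])
    return numpy.asarray(chosen)

# e.g. balanced_label_subset(numpy.arange(6), numpy.array([0, 1, 0, 1, 0, 1]), 4)
# -> array([0, 2, 1, 3])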
Ejemplo n.º 41
0
def demo():

    # set the parameters

    # start with default parameters
    params = defaultParameters()    
    
    # Nordland spring dataset
    ds = AttributeDict()
    ds.name = 'spring'
    
    try:
        path = os.environ['DATASET_1_PATH']
    except KeyError:
        path = '../datasets/nordland/64x32-grayscale-1fps/spring'
        print "Warning: Environment variable DATASET_1_PATH not found! Trying '"+path+"'"
    ds.imagePath = path
    
    ds.prefix='images-'
    ds.extension='.png'
    ds.suffix=''
    ds.imageSkip = 100     # use every n-th image
    ds.imageIndices = range(1, 35700, ds.imageSkip)    
    ds.savePath = 'results'
    ds.saveFile = '%s-%d-%d-%d' % (ds.name, ds.imageIndices[0], ds.imageSkip, ds.imageIndices[-1])
    
    ds.preprocessing = AttributeDict()
    ds.preprocessing.save = 1
    ds.preprocessing.load = 0 #1
    #ds.crop=[1 1 60 32]  # x0 y0 x1 y1  cropping will be done AFTER resizing!
    ds.crop=[]
    
    spring=ds

    ds2 = deepcopy(ds)
    # Nordland winter dataset
    ds2.name = 'winter'
    #ds.imagePath = '../datasets/nordland/64x32-grayscale-1fps/winter'
    try:
        path = os.environ['DATASET_2_PATH']
    except KeyError:
        path = '../datasets/nordland/64x32-grayscale-1fps/winter'
        print "Warning: Environment variable DATASET_2_PATH not found! Trying '"+path+"'"
    ds2.imagePath = path
    ds2.saveFile = '%s-%d-%d-%d' % (ds2.name, ds2.imageIndices[0], ds2.imageSkip, ds2.imageIndices[-1])
    # ds.crop=[5 1 64 32]
    ds2.crop=[]
    
    winter=ds2      

    params.dataset = [spring, winter]

    # load old results or re-calculate?
    params.differenceMatrix.load = 0
    params.contrastEnhanced.load = 0
    params.matching.load = 0
    
    # where to save / load the results
    params.savePath='results'
              
    ## now process the dataset
    ss = SeqSLAM(params)  
    t1=time.time()
    results = ss.run()
    t2=time.time()          
    print "time taken: "+str(t2-t1)
    
    ## show some results
    if len(results.matches) > 0:
        m = results.matches[:,0] # The LARGER the score, the WEAKER the match.
        thresh=0.9  # you can calculate a precision-recall plot by varying this threshold
        m[results.matches[:,1]>thresh] = np.nan # remove the weakest matches
        plt.plot(m,'.')      # ideally, this would only be the diagonal
        plt.title('Matchings')   
        plt.show()    
    else:
        print "Zero matches"          
Ejemplo n.º 42
0
def defaultParameters():

    params = AttributeDict()

    # switches
    params.DO_PREPROCESSING = 1
    params.DO_RESIZE = 1
    params.DO_GRAYLEVEL = 1
    params.DO_PATCHNORMALIZATION = 0  #!!!! 1
    params.DO_SAVE_PREPROCESSED_IMG = 1
    params.DO_DIFF_MATRIX = 1
    params.DO_CONTRAST_ENHANCEMENT = 1
    params.DO_FIND_MATCHES = 1

    # parameters for preprocessing
    params.downsample = AttributeDict()
    params.downsample.size = [48, 64]  # height, width
    try:
        params.downsample.method = Image.LANCZOS
    except AttributeError:
        params.downsample.method = Image.ANTIALIAS
    params.normalization = AttributeDict()
    params.normalization.sideLength = 8
    params.normalization.mode = 1

    # parameters regarding the matching between images
    params.matching = AttributeDict()
    params.matching.ds = 10
    params.matching.Rrecent = 5
    params.matching.vmin = 0.01
    params.matching.vskip = 0.1
    params.matching.vmax = 1.2
    params.matching.Rwindow = 8
    params.matching.save = 1
    params.matching.load = 0  #1

    # parameters for contrast enhancement on difference matrix
    params.contrastEnhancement = AttributeDict()
    params.contrastEnhancement.R = 10

    # load old results or re-calculate? save results?
    params.differenceMatrix = AttributeDict()
    params.differenceMatrix.save = 1
    params.differenceMatrix.load = 0  #1

    params.contrastEnhanced = AttributeDict()
    params.contrastEnhanced.save = 1
    params.contrastEnhanced.load = 0  #1

    # suffix appended on files containing the results
    params.saveSuffix = ''

    return params
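
# --- Hedged sketch (not from the original example): the patch normalization
# that DO_PATCHNORMALIZATION and params.normalization.sideLength refer to,
# assuming mode 1 means each non-overlapping sideLength x sideLength patch is
# normalized to zero mean and unit standard deviation. The exact mode
# semantics are an assumption for illustration.
import numpy as np

def patch_normalize(img, side_length):
    img = np.asarray(img, dtype=np.float32).copy()
    h, w = img.shape
    for y in range(0, h - h % side_length, side_length):
        for x in range(0, w - w % side_length, side_length):
            patch = img[y:y + side_length, x:x + side_length]
            std = patch.std()
            img[y:y + side_length, x:x + side_length] = \
                (patch - patch.mean()) / (std if std > 0 else 1.0)
    return img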
Ejemplo n.º 43
0
    def apply_tagger(self, x, apply_noise, y=None):
        """ Build one path of Tagger """
        mb_size = x.shape[1]
        input_shape = (self.p.n_groups, mb_size) + self.in_dim
        in_dim = np.prod(self.in_dim)

        # Add noise
        x_corr = self.corrupt(x) if apply_noise else x
        # Repeat input
        x_corr = T.repeat(x_corr, self.p.n_groups, 0)

        # Compute v
        if self.p.input_type == 'binary':
            v = None
        elif self.p.input_type == 'continuous':
            v = self.weight(1., 'v')
            v = v * T.alloc(1., *input_shape)
            # Cap to positive range
            v = nn.exp_inv_sinh(v)

        d = AttributeDict()

        if y:
            d.pred = []
            d.class_error, d.class_cost = [], []
            # here we have the book-keeping of z and m for the visualizations.
            d.z = []
            d.m = []
        else:
            d.denoising_cost, d.ami_score, d.ami_score_per_sample = [], [], []

        assert self.p.n_iterations >= 1

        # z_hat is the value for the next iteration of tagger.
        # z is the current iteration tagger input
        # m is the current iteration mask input
        # m_hat is the value for the next iteration of tagger.
        # m_lh is the mask likelihood.
        # z_delta is the gradient of z, which depends on x, z and m.
        for step in xrange(self.p.n_iterations):
            # Encoder
            # =======

            # Compute m, z and z_hat_pre_bin
            if step == 0:
                # No values from previous iteration, so let's make them up
                m, z = self.init_m_z(input_shape)
                z_hat_pre_bin = None
                # keep them in the bookkeeping for the visualizations.
                if y:
                    d.z.append(z)
                    d.m.append(m)
            else:
                # Feed in the previous iteration's estimates
                z = z_hat
                m = m_hat

            # Compute m_lh
            m_lh = self.m_lh(x_corr, z, v)
            z_delta = self.f_z_deriv(x_corr, z, m)

            z_tilde = z_hat_pre_bin if z_hat_pre_bin is not None else z
            # Concatenate all inputs
            inputs = [z_tilde, z_delta, m, m_lh]
            inputs = T.concatenate(inputs, axis=2)

            # Projection, batch-normalization and activation to a hidden layer
            z = self.proj(inputs, in_dim * 4, self.p.encoder_proj[0])

            z -= z.mean((0, 1), keepdims=True)
            z /= T.sqrt(z.var((0, 1), keepdims=True) + np.float32(1e-10))

            z += self.bias(0.0 * np.ones(self.p.encoder_proj[0]), 'b')
            h = self.apply_act(z, 'relu')

            # The first dimension is the group. Let's flatten together with
            # minibatch in order to have parametric mapping compute all groups
            # in parallel
            h, undo_flatten = flatten_first_two_dims(h)

            # Parametric Mapping
            # ==================

            self.ladder.apply(None, self.y, h)
            ladder_encoder_output = undo_flatten(
                self.ladder.act.corr.unlabeled.h[len(self.p.encoder_proj) - 1])
            ladder_decoder_output = undo_flatten(self.ladder.act.est.z[0])

            # Decoder
            # =======

            # compute z_hat
            z_u = self.proj(ladder_decoder_output,
                            self.p.encoder_proj[0],
                            in_dim,
                            scope='z_u')

            z_u -= z_u.mean((0, 1), keepdims=True)
            z_u /= T.sqrt(z_u.var((0, 1), keepdims=True) + np.float32(1e-10))

            z_hat = self.weight(np.ones(in_dim), 'c1') * z_u + self.bias(
                np.zeros(in_dim), 'b1')
            z_hat = z_hat.reshape(input_shape)

            # compute m_hat
            m_u = self.proj(ladder_decoder_output,
                            self.p.encoder_proj[0],
                            in_dim,
                            scope='m_u')

            m_u -= m_u.mean((0, 1), keepdims=True)
            m_u /= T.sqrt(m_u.var((0, 1), keepdims=True) + np.float32(1e-10))

            c = self.weight(np.float32(1), 'c2')
            m_hat = nn.softmax_n(m_u * c, axis=0)
            m_hat = m_hat.reshape(input_shape)

            # Apply sigmoid activation if input_type is binary
            if self.p.input_type == 'binary':
                z_hat_pre_bin = z_hat
                z_hat = self.apply_act(z_hat, 'sigmoid')

            # Collapse layer
            # ==============

            # Drop the last class, which is assumed to be the 'None' class
            pred = ladder_encoder_output[:, :, :-1]
            # Normalize
            pred /= T.sum(T.sum(pred, axis=2, keepdims=True),
                          axis=0,
                          keepdims=True)

            # Denoising and Classification costs
            # ==================================

            if y:
                class_cost, class_error = self.compute_classification_cost_and_error(
                    pred, y)
                d.pred.append(pred)
                d.class_cost.append(class_cost)
                d.class_error.append(class_error)

                d.m.append(m_hat)
                d.z.append(z_hat)
            else:
                d.denoising_cost.append(self.denoising_cost(
                    z_hat, m_hat, x, v))

                ami_score, ami_score_per_sample = self.mask_accuracy(
                    self.masks_unlabeled, m_hat)
                d.ami_score.append(ami_score)
                d.ami_score_per_sample.append(ami_score_per_sample)

        # stack the list of tensors into one
        d = AttributeDict(
            {key: T.stacklists(val)
             for key, val in d.iteritems()})

        return d
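
# --- Hedged sketch (not from the original example): the role of the softmax
# over the group axis in the m_hat computation above. Per element, the masks
# of the n_groups groups sum to one, which is what makes m_hat a soft
# segmentation of the input among the groups.
import numpy as np

def group_softmax(m_u, axis=0):
    e = np.exp(m_u - m_u.max(axis=axis, keepdims=True))  # numerically stable
    return e / e.sum(axis=axis, keepdims=True)

# m_u = np.random.randn(4, 2, 28 * 28)   # (n_groups, mb_size, in_dim)
# masks = group_softmax(m_u)             # masks.sum(axis=0) == 1 everywhere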
Ejemplo n.º 44
0
def setup_data(p, test_set=False):
    dataset_class, training_set_size = {
        'cifar10': (CIFAR10, 40000),
        'mnist': (MNIST, 50000),
        'conll': (EMBOOT_CONLL, 13900),
        'ontonotes': (EMBOOT_ONTO, 67000)
    }[p.dataset]

    print("p.dataset = ", p.dataset)

    # Allow overriding the default from command line
    if p.get('unlabeled_samples') is not None:
        training_set_size = p.unlabeled_samples
        print("Training set size : ", training_set_size)

    train_set = dataset_class(["train"])
    print("train_set.num_examples : ", train_set.num_examples)

    # Make sure the MNIST data is in right format
    if p.dataset == 'mnist':
        d = train_set.data_sources[train_set.sources.index('features')]
        assert numpy.all(d <= 1.0) and numpy.all(d >= 0.0), \
            'Make sure data is in float format and in range 0 to 1'

    # Take all indices and permutate them
    all_ind = numpy.arange(train_set.num_examples)
    if p.get('dseed'):
        rng = numpy.random.RandomState(seed=p.dseed)
        rng.shuffle(all_ind)

    d = AttributeDict()

    # Choose the training set
    d.train = train_set
    d.train_ind = all_ind[:training_set_size]

    # Then choose validation set from the remaining indices
    d.valid = train_set
    d.valid_ind = numpy.setdiff1d(all_ind, d.train_ind)[:p.valid_set_size]
    logger.info('Using %d examples for validation' % len(d.valid_ind))

    # Only touch test data if requested
    if test_set:
        d.test = dataset_class(["test"])
        d.test_ind = numpy.arange(d.test.num_examples)
        print("d.test.num_examples = ", d.test.num_examples)

    in_dim = train_set.data_sources[train_set.sources.index(
        'features')].shape[1:]

    if p.dataset == 'conll' or p.dataset == 'ontonotes':
        whiten = None
        cnorm = None

    # Setup optional whitening, only used for Cifar-10
    elif p.dataset == 'cifar10':
        if len(in_dim) > 1 and p.whiten_zca > 0:
            assert numpy.product(in_dim) == p.whiten_zca, \
                'Need %d whitening dimensions, not %d' % (numpy.product(in_dim),
                                                          p.whiten_zca)
        cnorm = ContrastNorm(p.contrast_norm) if p.contrast_norm != 0 else None

        def get_data(d, i):
            data = d.get_data(request=list(i))[d.sources.index('features')]
            # Fuel provides Cifar in uint8, convert to float32
            data = numpy.require(data, dtype=numpy.float32)
            return data if cnorm is None else cnorm.apply(data)

        if p.whiten_zca > 0:
            logger.info('Whitening using %d ZCA components' % p.whiten_zca)
            whiten = ZCA()
            whiten.fit(p.whiten_zca, get_data(d.train, d.train_ind))
        else:
            whiten = None

    return in_dim, d, whiten, cnorm
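
# --- Hedged sketch (not from the original example): what the ZCA whitening
# step does, in plain numpy. This mirrors standard full-rank ZCA (center the
# data, eigendecompose the covariance, rotate back so whitened features stay
# in the original coordinate frame); the component-count argument used by the
# ZCA class above is omitted here for brevity.
import numpy

class SimpleZCA(object):
    def fit(self, X, eps=1e-2):
        X = X.reshape(X.shape[0], -1)
        self.mean = X.mean(axis=0)
        cov = numpy.cov(X - self.mean, rowvar=False)
        eigval, eigvec = numpy.linalg.eigh(cov)
        self.W = eigvec.dot(numpy.diag(1.0 / numpy.sqrt(eigval + eps))).dot(eigvec.T)
        return self

    def apply(self, X):
        shape = X.shape
        X = X.reshape(shape[0], -1)
        return (X - self.mean).dot(self.W).reshape(shape)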
Ejemplo n.º 45
0
def setup_data(p, test_set=False):

    # CIFAR10 and MNIST are both pre-packaged HDF5 datasets
    # p.dataset is a command-line argument selecting either cifar10 or mnist
    dataset_class, training_set_size = {
        'cifar10': (CIFAR10, 40000),
        'mnist': (MNIST, 50000),
    }[p.dataset]

    # The training set size can be specified from the command line
    # Allow overriding the default from command line
    if p.get('unlabeled_samples') is not None:
        training_set_size = p.unlabeled_samples

    # Select the 'train' subset of the chosen dataset
    train_set = dataset_class("train")

    # Make sure the MNIST data is in right format
    # Sanity-check MNIST: all values must be floats in the range 0 to 1
    if p.dataset == 'mnist':
        # features has shape 60000*1*28*28 (num_examples*channels*height*width); MNIST is grayscale, so channels = 1
        d = train_set.data_sources[train_set.sources.index('features')]
        assert numpy.all(d <= 1.0) and numpy.all(d >= 0.0), \
            'Make sure data is in float format and in range 0 to 1'

    # Randomly shuffle the sample order
    # Take all indices and permutate them
    all_ind = numpy.arange(train_set.num_examples)
    if p.get('dseed'):
        # Build a random number generator from dseed to shuffle the sample indices
        rng = numpy.random.RandomState(seed=p.dseed)
        rng.shuffle(all_ind)

    d = AttributeDict()

    # Choose the training set
    d.train = train_set
    # The indices have been shuffled at this point
    # Take the first training_set_size samples as the training set (indices)
    d.train_ind = all_ind[:training_set_size]

    # Pick part of the data as the validation set
    # Then choose validation set from the remaining indices
    d.valid = train_set
    # Remove the training samples from the full index set; the rest becomes the validation set
    d.valid_ind = numpy.setdiff1d(all_ind, d.train_ind)[:p.valid_set_size]

    logger.info('Using %d examples for validation' % len(d.valid_ind))

    # If a test set is requested, build indices for the test data
    # Only touch test data if requested
    if test_set:
        d.test = dataset_class("test")
        d.test_ind = numpy.arange(d.test.num_examples)

    # Setup optional whitening, only used for Cifar-10
    # Compute the feature dimensionality; shape[1:] is the shape of a single sample
    in_dim = train_set.data_sources[train_set.sources.index('features')].shape[1:]
    if len(in_dim) > 1 and p.whiten_zca > 0:
        assert numpy.product(in_dim) == p.whiten_zca, \
            'Need %d whitening dimensions, not %d' % (numpy.product(in_dim),
                                                      p.whiten_zca)

    # If the contrast-normalization parameter is non-zero, create the normalizer
    cnorm = ContrastNorm(p.contrast_norm) if p.contrast_norm != 0 else None

    def get_data(d, i):
        data = d.get_data(request=i)[d.sources.index('features')]

        # Fuel provides Cifar in uint8, convert to float32
        # Make sure the data items are float32
        data = numpy.require(data, dtype=numpy.float32)
        # TODO ContrastNorm.apply
        return data if cnorm is None else cnorm.apply(data)

    if p.whiten_zca > 0:
        logger.info('Whitening using %d ZCA components' % p.whiten_zca)
        # TODO ZCA
        whiten = ZCA()
        whiten.fit(p.whiten_zca, get_data(d.train, d.train_ind))
    else:
        whiten = None

    return in_dim, d, whiten, cnorm
Ejemplo n.º 46
0
Archivo: run.py Proyecto: josvr/ladder
def setup_data(p, test_set=False):
    dataset_class = {
        'cifar10': (CIFAR10),
        'jos' : (JOS),
        'mnist': (MNIST),
    }[p.dataset]

    training_set_size = p.unlabeled_samples 

    # Allow overriding the default from command line
    if p.get('unlabeled_samples') is not None:
        training_set_size = p.unlabeled_samples

    train_set = dataset_class(["train"])

    # Make sure the MNIST data is in right format
    if p.dataset == 'mnist':
        d = train_set.data_sources[train_set.sources.index('features')]
        assert numpy.all(d <= 1.0) and numpy.all(d >= 0.0), \
            'Make sure data is in float format and in range 0 to 1'

    # Take all indices and permutate them
    all_ind = numpy.arange(train_set.num_examples)
    if p.get('dseed'):
        rng = numpy.random.RandomState(seed=p.dseed)
        rng.shuffle(all_ind)

    d = AttributeDict()

    # Choose the training set
    d.train = train_set
    d.train_ind = all_ind[:training_set_size]

    # Then choose validation set from the remaining indices
    d.valid = train_set
    d.valid_ind = numpy.setdiff1d(all_ind, d.train_ind)[:p.valid_set_size]
    logger.info('Using %d examples for validation' % len(d.valid_ind))

    # Only touch test data if requested
    if test_set:
        d.test = dataset_class(["test"])
        d.test_ind = numpy.arange(d.test.num_examples)

    # Setup optional whitening, only used for Cifar-10
    fn = find_in_data_path(train_set.filename)
    #iprint(fn)
    s1 = H5PYDataset(fn, ("train",))
    handle = s1.open()
    in_dim = s1.get_data(handle, slice(0, 1))[0].shape[1:]
    s1.close(handle)
    #in_dim = train_set.data_sources[train_set.sources.index('features')].shape[1:]
    if len(in_dim) > 1 and p.whiten_zca > 0:
        assert numpy.product(in_dim) == p.whiten_zca, \
            'Need %d whitening dimensions, not %d' % (numpy.product(in_dim),
                                                      p.whiten_zca)
    cnorm = ContrastNorm(p.contrast_norm) if p.contrast_norm != 0 else None

    def get_data(d, i):
        data = d.get_data(request=list(i))[d.sources.index('features')]
        # Fuel provides Cifar in uint8, convert to float32
        data = numpy.require(data, dtype=numpy.float32)
        return data if cnorm is None else cnorm.apply(data)

    if p.whiten_zca > 0:
        logger.info('Whitening using %d ZCA components' % p.whiten_zca)
        whiten = ZCA()
        whiten.fit(p.whiten_zca, get_data(d.train, d.train_ind))
    else:
        whiten = None

    return in_dim, d, whiten, cnorm
Ejemplo n.º 47
0
def setup_data(p, test_set=False):
    if p.dataset in ['cifar10','mnist']:
        dataset_class, training_set_size = {
            'cifar10': (CIFAR10, 40000),
            'mnist': (MNIST, 50000),
        }[p.dataset]
    else:
        from fuel.datasets import H5PYDataset
        from fuel.utils import find_in_data_path
        from functools import partial
        fn = p.dataset
        fn = os.path.join(fn, fn + '.hdf5')
        def dataset_class(which_sets):
            return H5PYDataset(file_or_path=find_in_data_path(fn),
                               which_sets=which_sets,
                               load_in_memory=True)
        training_set_size = None

    train_set = dataset_class(["train"])

    # Allow overriding the default from command line
    if p.get('unlabeled_samples') is not None and p.unlabeled_samples >= 0:
        training_set_size = p.unlabeled_samples
    elif training_set_size is None:
        training_set_size = train_set.num_examples

    # Make sure the MNIST data is in right format
    if p.dataset == 'mnist':
        d = train_set.data_sources[train_set.sources.index('features')]
        assert numpy.all(d <= 1.0) and numpy.all(d >= 0.0), \
            'Make sure data is in float format and in range 0 to 1'

    # Take all indices and permutate them
    all_ind = numpy.arange(train_set.num_examples)
    if p.get('dseed'):
        rng = numpy.random.RandomState(seed=p.dseed)
        rng.shuffle(all_ind)

    d = AttributeDict()

    # Choose the training set
    d.train = train_set
    d.train_ind = all_ind[:training_set_size]

    # Then choose validation set from the remaining indices
    d.valid = train_set
    d.valid_ind = numpy.setdiff1d(all_ind, d.train_ind)[:p.valid_set_size]
    logger.info('Using %d examples for validation' % len(d.valid_ind))

    # Only touch test data if requested
    if test_set:
        d.test = dataset_class(["test"])
        d.test_ind = numpy.arange(d.test.num_examples)

    # Setup optional whitening, only used for Cifar-10
    in_dim = train_set.data_sources[train_set.sources.index('features')].shape[1:]
    if len(in_dim) > 1 and p.whiten_zca > 0:
        assert numpy.product(in_dim) == p.whiten_zca, \
            'Need %d whitening dimensions, not %d' % (numpy.product(in_dim),
                                                      p.whiten_zca)
    cnorm = ContrastNorm(p.contrast_norm) if p.contrast_norm != 0 else None

    def get_data(d, i):
        data = d.get_data(request=i)[d.sources.index('features')]
        # Fuel provides Cifar in uint8, convert to float32
        data = numpy.require(data, dtype=numpy.float32)
        return data if cnorm is None else cnorm.apply(data)

    if p.whiten_zca > 0:
        logger.info('Whitening using %d ZCA components' % p.whiten_zca)
        whiten = ZCA()
        whiten.fit(p.whiten_zca, get_data(d.train, d.train_ind))
    else:
        whiten = None

    return in_dim, d, whiten, cnorm
Ejemplo n.º 48
0
    def apply(self, input_labeled, target_labeled, input_unlabeled):
        self.layer_counter = 0
        input_dim = self.p.encoder_layers[0]

        # Store the dimension tuples in the same order as layers.
        layers = self.layers
        self.layer_dims = {0: input_dim}

        self.lr = self.shared(self.default_lr, 'learning_rate', role=None)

        self.costs = costs = AttributeDict()
        self.costs.denois = AttributeDict()

        self.act = AttributeDict()
        self.error = AttributeDict()

        top = len(layers) - 1

        N = input_labeled.shape[0]
        self.join = lambda l, u: T.concatenate([l, u], axis=0)
        self.labeled = lambda x: x[:N] if x is not None else x
        self.unlabeled = lambda x: x[N:] if x is not None else x
        self.split_lu = lambda x: (self.labeled(x), self.unlabeled(x))

        input_concat = self.join(input_labeled, input_unlabeled)

        def encoder(input_, path_name, input_noise_std=0, noise_std=[]):
            h = input_

            logger.info('  0: noise %g' % input_noise_std)
            if input_noise_std > 0.:
                h = h + self.noise_like(h) * input_noise_std

            d = AttributeDict()
            d.unlabeled = self.new_activation_dict()
            d.labeled = self.new_activation_dict()
            d.labeled.z[0] = self.labeled(h)
            d.unlabeled.z[0] = self.unlabeled(h)
            prev_dim = input_dim
            for i, (spec, _, act_f) in layers[1:]:
                d.labeled.h[i - 1], d.unlabeled.h[i - 1] = self.split_lu(h)
                noise = noise_std[i] if i < len(noise_std) else 0.
                curr_dim, z, m, s, h = self.f(h,
                                              prev_dim,
                                              spec,
                                              i,
                                              act_f,
                                              path_name=path_name,
                                              noise_std=noise)
                assert self.layer_dims.get(i) in (None, curr_dim)
                self.layer_dims[i] = curr_dim
                d.labeled.z[i], d.unlabeled.z[i] = self.split_lu(z)
                d.unlabeled.s[i] = s
                d.unlabeled.m[i] = m
                prev_dim = curr_dim
            d.labeled.h[i], d.unlabeled.h[i] = self.split_lu(h)
            return d

        # Clean, supervised
        logger.info('Encoder: clean, labeled')
        clean = self.act.clean = encoder(input_concat, 'clean')

        # Corrupted, supervised
        logger.info('Encoder: corr, labeled')
        corr = self.act.corr = encoder(input_concat,
                                       'corr',
                                       input_noise_std=self.p.super_noise_std,
                                       noise_std=self.p.f_local_noise_std)
        est = self.act.est = self.new_activation_dict()

        # Decoder path in opposite order
        logger.info('Decoder: z_corr -> z_est')
        for i, ((_, spec), l_type, act_f) in layers[::-1]:
            z_corr = corr.unlabeled.z[i]
            z_clean = clean.unlabeled.z[i]
            z_clean_s = clean.unlabeled.s.get(i)
            z_clean_m = clean.unlabeled.m.get(i)
            fspec = layers[i + 1][1][0] if len(layers) > i + 1 else (None,
                                                                     None)

            if i == top:
                ver = corr.unlabeled.h[i]
                ver_dim = self.layer_dims[i]
                top_g = True
            else:
                ver = est.z.get(i + 1)
                ver_dim = self.layer_dims.get(i + 1)
                top_g = False

            z_est = self.g(z_lat=z_corr,
                           z_ver=ver,
                           in_dims=ver_dim,
                           out_dims=self.layer_dims[i],
                           l_type=l_type,
                           num=i,
                           fspec=fspec,
                           top_g=top_g)

            if z_est is not None:
                # Denoising cost
                if z_clean_s:
                    z_est_norm = (z_est - z_clean_m) / z_clean_s
                else:
                    z_est_norm = z_est

                se = SquaredError('denois' + str(i))
                costs.denois[i] = se.apply(z_est_norm.flatten(2),
                                           z_clean.flatten(2)) \
                    / np.prod(self.layer_dims[i], dtype=floatX)
                costs.denois[i].name = 'denois' + str(i)
                denois_print = 'denois %.2f' % self.p.denoising_cost_x[i]
            else:
                denois_print = ''

            # Store references for later use
            est.h[i] = self.apply_act(z_est, act_f)
            est.z[i] = z_est
            est.s[i] = None
            est.m[i] = None
            logger.info('  g%d: %10s, %s, dim %s -> %s' %
                        (i, l_type, denois_print, self.layer_dims.get(i + 1),
                         self.layer_dims.get(i)))

        # Costs
        y = target_labeled.flatten()

        costs.class_clean = CategoricalCrossEntropy().apply(
            y, clean.labeled.h[top])
        costs.class_clean.name = 'cost_class_clean'

        costs.class_corr = CategoricalCrossEntropy().apply(
            y, corr.labeled.h[top])
        costs.class_corr.name = 'cost_class_corr'

        # This will be used for training
        costs.total = costs.class_corr * 1.0
        for i in range(top + 1):
            if costs.denois.get(i) and self.p.denoising_cost_x[i] > 0:
                costs.total += costs.denois[i] * self.p.denoising_cost_x[i]
        costs.total.name = 'cost_total'

        # Classification error
        mr = MisclassificationRate()
        self.error.clean = mr.apply(y, clean.labeled.h[top]) * np.float32(100.)
        self.error.clean.name = 'error_rate_clean'
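
# --- Hedged sketch (not from the original example): the convention behind the
# join / labeled / unlabeled / split_lu helpers above. Labeled and unlabeled
# examples are concatenated along axis 0, and the first N rows are labeled.
import numpy as np

def make_split_helpers(n_labeled):
    join = lambda l, u: np.concatenate([l, u], axis=0)
    labeled = lambda x: x[:n_labeled] if x is not None else x
    unlabeled = lambda x: x[n_labeled:] if x is not None else x
    split_lu = lambda x: (labeled(x), unlabeled(x))
    return join, labeled, unlabeled, split_lu

# join, labeled, unlabeled, split_lu = make_split_helpers(3)
# batch = join(np.zeros((3, 5)), np.ones((7, 5)))
# lab, unlab = split_lu(batch)     # lab.shape == (3, 5), unlab.shape == (7, 5)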
Ejemplo n.º 49
0
import sys
from utils import AttributeDict
from tagger_exp import TaggerExperiment

p = AttributeDict()

p.encoder_proj = (3000, 2000, 1000)
p.input_noise = 0.2
p.class_cost_x = 0.
p.zhat_init_value = 0.5

p.n_iterations = 3
p.n_groups = 4
p.lr = 0.001
p.labeled_samples = 1000
p.save_freq = 50
p.seed = 1
p.num_epochs = 150
p.batch_size = 100
p.valid_batch_size = 100
p.objects_per_sample = 2

p.dataset = 'freq20-2mnist'
p.input_type = 'continuous'

if __name__ == '__main__':
    if len(sys.argv) == 2 and sys.argv[1] == '--pretrain':
        p.save_to = 'freq20-2mnist-pretraining'
        experiment = TaggerExperiment(p)
        experiment.train()
    elif len(sys.argv) == 3 and sys.argv[1] == '--continue':
Ejemplo n.º 50
0
 def __init__(self, *args, **kwargs):
     AttributeDict.__init__(self)
     
     self.env  = Environment.getInstance()
     self.resourceType = self.__class__.__name__
     self.isUpdated = False
     
     seen = set()
     
     if not hasattr(self, '_schema'):
         raise Fail("Resource failed to define a valid _schema")
     
     # union global schema with local schema
     schema = copy.deepcopy(self._schema)
     for key in self.s_globalSchema:
         if not key in schema:
             schema[key] = self.s_globalSchema[key]
     
     resolvedArgs = { }
     keys = schema.keys()
     keysLen = len(keys)
     index = 0
     
     # resolve unnamed arguments with names corresponding to the order 
     # they were passed to Resource's ctor and their relative definitions 
     # in the subclass' ResourceArgumentSchema (which is an OrderedDict, 
     # so as to retain this ordering information).
     for arg in args:
         if index < keysLen:
             key = keys[index]
             resolvedArgs[keys[index]] = arg
         else:
             raise InvalidArgument("Invalid unnamed argument %s provided to resource %s" % (arg, str(self)))
         
         index += 1
     
     for arg in kwargs:
         if arg in resolvedArgs:
             raise InvalidArgument("Invalid mixture of named and unnamed arguments provided to resource %s, possibly around argument %s" % (str(self), arg))
         else:
             resolvedArgs[arg] = kwargs[arg]
     
     utils.log("Initializing resource '%s' with args: %s" % (self.resourceType, resolvedArgs))
     
     # validate resource arguments
     output = schema.validate(resolvedArgs)
     for key in output:
         self[key] = output[key]
     
     self.subscriptions = {
         'immediate' : set(), 
         'delayed' : set()
     }
     
     for sub in self.subscribes:
         if len(sub) == 2:
             action, resource = sub
             immediate = False
         else:
             action, resource, immediate = sub
         
         resource.subscribe(action, self, immediate)
     
     for sub in self.notifies:
         self.subscribe(*sub)
     
     self._validate()
     self._register()
     utils.log("Added new resource '%s'" % (str(self), ))
Ejemplo n.º 51
0
    def apply_tagger(self, x, apply_noise, y=None):
        """ Build one path of Tagger """
        mb_size = x.shape[1]
        input_shape = (self.p.n_groups, mb_size) + self.in_dim
        in_dim = np.prod(self.in_dim)

        # Add noise
        x_corr = self.corrupt(x) if apply_noise else x
        # Repeat input
        x_corr = T.repeat(x_corr, self.p.n_groups, 0)

        # Compute v
        if self.p.input_type == 'binary':
            v = None
        elif self.p.input_type == 'continuous':
            v = self.weight(1., 'v')
            v = v * T.alloc(1., *input_shape)
            # Cap to positive range
            v = nn.exp_inv_sinh(v)

        d = AttributeDict()

        if y:
            d.pred = []
            d.class_error, d.class_cost = [], []
            # here we have the book-keeping of z and m for the visualizations.
            d.z = []
            d.m = []
        else:
            d.denoising_cost, d.ami_score, d.ami_score_per_sample = [], [], []

        assert self.p.n_iterations >= 1

        # z_hat is the value for the next iteration of tagger.
        # z is the current iteration tagger input
        # m is the current iteration mask input
        # m_hat is the value for the next iteration of tagger.
        # m_lh is the mask likelihood.
        # z_delta is the gradient of z, which depends on x, z and m.
        for step in xrange(self.p.n_iterations):
            # Encoder
            # =======

            # Compute m, z and z_hat_pre_bin
            if step == 0:
                # No values from previous iteration, so let's make them up
                m, z = self.init_m_z(input_shape)
                z_hat_pre_bin = None
                # keep them in the bookkeeping for the visualizations.
                if y:
                    d.z.append(z)
                    d.m.append(m)
            else:
                # Feed in the previous iteration's estimates
                z = z_hat
                m = m_hat

            # Compute m_lh
            m_lh = self.m_lh(x_corr, z, v)
            z_delta = self.f_z_deriv(x_corr, z, m)

            z_tilde = z_hat_pre_bin if z_hat_pre_bin is not None else z
            # Concatenate all inputs
            inputs = [z_tilde, z_delta, m, m_lh]
            inputs = T.concatenate(inputs, axis=2)

            # Projection, batch-normalization and activation to a hidden layer
            z = self.proj(inputs, in_dim * 4, self.p.encoder_proj[0])

            z -= z.mean((0, 1), keepdims=True)
            z /= T.sqrt(z.var((0, 1), keepdims=True) + np.float32(1e-10))

            z += self.bias(0.0 * np.ones(self.p.encoder_proj[0]), 'b')
            h = self.apply_act(z, 'relu')

            # The first dimension is the group. Let's flatten together with
            # minibatch in order to have parametric mapping compute all groups
            # in parallel
            h, undo_flatten = flatten_first_two_dims(h)

            # Parametric Mapping
            # ==================

            self.ladder.apply(None, self.y, h)
            ladder_encoder_output = undo_flatten(self.ladder.act.corr.unlabeled.h[len(self.p.encoder_proj) - 1])
            ladder_decoder_output = undo_flatten(self.ladder.act.est.z[0])

            # Decoder
            # =======

            # compute z_hat
            z_u = self.proj(ladder_decoder_output, self.p.encoder_proj[0], in_dim, scope='z_u')

            z_u -= z_u.mean((0, 1), keepdims=True)
            z_u /= T.sqrt(z_u.var((0, 1), keepdims=True) + np.float32(1e-10))

            z_hat = self.weight(np.ones(in_dim), 'c1') * z_u + self.bias(np.zeros(in_dim), 'b1')
            z_hat = z_hat.reshape(input_shape)

            # compute m_hat
            m_u = self.proj(ladder_decoder_output, self.p.encoder_proj[0], in_dim, scope='m_u')

            m_u -= m_u.mean((0, 1), keepdims=True)
            m_u /= T.sqrt(m_u.var((0, 1), keepdims=True) + np.float32(1e-10))

            c = self.weight(np.float32(1), 'c2')
            m_hat = nn.softmax_n(m_u * c, axis=0)
            m_hat = m_hat.reshape(input_shape)

            # Apply sigmoid activation if input_type is binary
            if self.p.input_type == 'binary':
                z_hat_pre_bin = z_hat
                z_hat = self.apply_act(z_hat, 'sigmoid')

            # Collapse layer
            # ==============

            # Drop the last class, which is assumed to be the 'None' class
            pred = ladder_encoder_output[:, :, :-1]
            # Normalize
            pred /= T.sum(T.sum(pred, axis=2, keepdims=True), axis=0, keepdims=True)

            # Denoising and Classification costs
            # ==================================

            if y:
                class_cost, class_error = self.compute_classification_cost_and_error(pred, y)
                d.pred.append(pred)
                d.class_cost.append(class_cost)
                d.class_error.append(class_error)

                d.m.append(m_hat)
                d.z.append(z_hat)
            else:
                d.denoising_cost.append(self.denoising_cost(z_hat, m_hat, x, v))

                ami_score, ami_score_per_sample = self.mask_accuracy(self.masks_unlabeled, m_hat)
                d.ami_score.append(ami_score)
                d.ami_score_per_sample.append(ami_score_per_sample)

        # stack the list of tensors into one
        d = AttributeDict({key: T.stacklists(val) for key, val in d.iteritems()})

        return d
Ejemplo n.º 52
0
from utils import AttributeDict
from tagger_exp import TaggerExperiment

p = AttributeDict()

p.encoder_proj = (2000, 1000, 500)
p.input_noise = 0.2
p.class_cost_x = 0
p.zhat_init_value = 0.26  # mean of the input data.

p.n_iterations = 3
p.n_groups = 4
p.lr = 0.0004
p.seed = 10
p.num_epochs = 100
p.batch_size = 100
p.valid_batch_size = 100

p.dataset = 'shapes50k20x20'
p.input_type = 'binary'

p.save_to = 'shapes50k20x20'

if __name__ == '__main__':
    experiment = TaggerExperiment(p)
    experiment.train()