Example #1
    def get_updates(self, grads):
        grads = OrderedDict(grads)
        updates = OrderedDict()

        for param in grads.keys():
            # mean_squared_grad := E[g^2], an exponential moving average of the squared gradient
            mean_squared_grad = theano.shared(
                theano._asarray(param.get_value() * 0.0, dtype=theano.config.floatX),
                name="mean_square_grad_" + param.name,
                borrow=False,
            )
            self.parameters.append(mean_squared_grad)

            # Accumulate gradient
            new_mean_squared_grad = T.cast(
                self.decay * mean_squared_grad + (1 - self.decay) * T.sqr(grads[param]), dtype=theano.config.floatX
            )

            # Compute update
            root_mean_squared = T.sqrt(new_mean_squared_grad + self.epsilon)

            # Apply update
            updates[mean_squared_grad] = new_mean_squared_grad
            updates[param] = param - (self.learning_rate / root_mean_squared) * grads[param]

        return updates
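The rule above is an RMSProp-style update: each parameter keeps an exponential moving average of its squared gradient, and the step is the raw gradient scaled by the inverse root of that average. A minimal NumPy sketch of the same arithmetic, with illustrative values for decay, learning_rate and epsilon (in the class they are attributes of the optimizer):

    import numpy as np

    decay, learning_rate, epsilon = 0.9, 0.01, 1e-6

    param = np.array([0.5, -0.3])
    grad = np.array([0.2, -0.1])
    mean_squared_grad = np.zeros_like(param)      # running E[g^2]

    # accumulate the squared gradient, then scale the step by its root
    mean_squared_grad = decay * mean_squared_grad + (1 - decay) * grad ** 2
    param = param - learning_rate / np.sqrt(mean_squared_grad + epsilon) * grad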
Example #2
    def get_gradients(self, model, data, **kwargs):
        indiv_results = []
        composite_specs, mapping = self.get_composite_specs_and_mapping(model)
        nested_data = mapping.nest(data)
        for cost, cost_data in safe_zip(self.costs, nested_data):
            result = cost.get_gradients(model, cost_data, **kwargs)
            indiv_results.append(result)

        grads = OrderedDict()
        updates = OrderedDict()
        params = model.get_params()

        for coeff, packed in zip(self.coeffs, indiv_results):
            g, u = packed
            for param in g:
                if param not in params:
                    raise ValueError(
                        "A shared variable (" + str(param) + ") that is not a parameter appeared "
                        "a cost gradient dictionary."
                    )
            for param in g:
                assert param.ndim == g[param].ndim
                v = coeff * g[param]
                if param not in grads:
                    grads[param] = v
                else:
                    grads[param] = grads[param] + v
                assert grads[param].ndim == param.ndim
            assert not any([state in updates for state in u])
            assert not any([state in params for state in u])
            updates.update(u)

        return grads, updates
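This get_gradients merges one gradient dictionary per cost into a single dictionary, weighting each by its coefficient and collecting any extra updates. A stripped-down sketch of that bookkeeping with plain numbers standing in for Theano expressions (the parameter names and values are made up):

    from collections import OrderedDict

    coeffs = [1.0, 0.5]
    indiv_results = [
        ({"W": 2.0, "b": 1.0}, {}),   # (gradients, updates) from the first cost
        ({"W": 4.0}, {}),             # (gradients, updates) from the second cost
    ]

    grads, updates = OrderedDict(), OrderedDict()
    for coeff, (g, u) in zip(coeffs, indiv_results):
        for param, value in g.items():
            grads[param] = grads.get(param, 0.0) + coeff * value
        updates.update(u)

    print(grads)   # OrderedDict([('W', 4.0), ('b', 1.0)])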
Example #3
    def get_params(self):
        """
        This returns the dictionary of theano shared variables that will be trained by the
        :class:`Optimizer`. These parameters are used when computing the gradients.

        This includes all of the parameters in every model in the Prototype, without duplication.

        Returns
        -------
        dict(str: SharedVariable)
            Dictionary of {string_name: theano shared variables} to be trained with an :class:`Optimizer`.
            These are the parameters to be trained.
        """
        params = OrderedDict()
        model_index = 0
        for model in self.models:
            if isinstance(model, Model):
                model_params = model.get_params()
                # append the parameters only if they aren't already in the list!
                for name, param in model_params.items():
                    if param not in list(params.values()):
                        name = model._classname + "_%d_" % model_index + name
                        params[name] = param
                model_index += 1
        return params
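get_params above deduplicates shared variables across models and builds a unique key from the class name, the model index, and the parameter name. A small sketch of that naming and deduplication with placeholder objects (DummyModel and the parameter names are hypothetical, not part of the library):

    from collections import OrderedDict

    class DummyModel(object):
        _classname = "Dense"
        def __init__(self, params):
            self._params = params
        def get_params(self):
            return self._params

    shared_W = object()   # stands in for a theano shared variable
    models = [DummyModel(OrderedDict([("W", shared_W), ("b", object())])),
              DummyModel(OrderedDict([("W", shared_W)]))]   # re-uses the same W

    params = OrderedDict()
    for model_index, model in enumerate(models):
        for name, param in model.get_params().items():
            if param not in list(params.values()):
                params["%s_%d_%s" % (model._classname, model_index, name)] = param

    print(list(params.keys()))   # ['Dense_0_W', 'Dense_0_b'] -- the duplicate W is skipped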
Example #4
    def get_monitoring_channels(self, model, X, Y=None, **kwargs):
        if Y is None and self.supervised:
            raise ValueError("no targets provided while some of the " + "costs in the sum are supervised costs")

        rval = OrderedDict()

        for i, cost in enumerate(self.costs):
            try:
                rval.update(cost.get_monitoring_channels(model, X, Y, **kwargs))
            except TypeError:
                print "SumOfCosts.get_monitoring_channels encountered TypeError while calling " + str(
                    type(cost)
                ) + ".get_monitoring_channels"
                raise

            Y_to_pass = Y
            if not cost.supervised:
                Y_to_pass = None

            value = cost(model, X, Y_to_pass, **kwargs)
            if value is not None:
                name = ""
                if hasattr(value, "name") and value.name is not None:
                    name = "_" + value.name
                rval["term_" + str(i) + name] = value

        return rval
Example #5
    def __init__(self, valid=None, invalid=None, valid_equivalent=None):
        """
        Check if variables can be expressed without using variables in invalid.

        valid_equivalent provides a dictionary mapping some invalid
        variables to valid ones that can be used instead.
        """

        if valid is None:
            valid = []
        if invalid is None:
            invalid = []
        if valid_equivalent is None:
            valid_equivalent = OrderedDict()

        # Nodes that are valid to have in the graph computing outputs
        self.valid = set(valid)

        # Nodes that are NOT valid to have in the graph computing outputs
        self.invalid = set(invalid)

        # Mapping from invalid variables to equivalent valid ones.
        self.valid_equivalent = valid_equivalent.copy()
        self.valid.update(valid_equivalent.values())
        self.invalid.update(valid_equivalent.keys())
Example #6
    def get_monitoring_channels(self, model, data, **kwargs):
        self.get_data_specs(model)[0].validate(data)
        rval = OrderedDict()
        composite_specs, mapping = self.get_composite_specs_and_mapping(model)
        nested_data = mapping.nest(data)

        for i, cost in enumerate(self.costs):
            cost_data = nested_data[i]
            try:
                channels = cost.get_monitoring_channels(model, cost_data, **kwargs)
                rval.update(channels)
            except TypeError:
                print (
                    "SumOfCosts.get_monitoring_channels encountered "
                    "TypeError while calling " + str(type(cost)) + ".get_monitoring_channels"
                )
                raise

            value = cost.expr(model, cost_data, **kwargs)
            if value is not None:
                name = ""
                if hasattr(value, "name") and value.name is not None:
                    name = "_" + value.name
                rval["term_" + str(i) + name] = value

        return rval
Example #7
    def get_updates(self, grads):
        grads = OrderedDict(grads)
        updates = OrderedDict()

        i_t = self.i + 1.0
        fix1 = 1.0 - (1.0 - self.b1) ** i_t
        fix2 = 1.0 - (1.0 - self.b2) ** i_t
        lr_t = self.learning_rate * (T.sqrt(fix2) / fix1)

        for param in grads.keys():
            m = theano.shared(param.get_value() * 0.0)
            self.parameters.append(m)
            v = theano.shared(param.get_value() * 0.0)
            self.parameters.append(v)

            b1t = 1.0 - (1.0 - self.b1) * self.lmbda ** (i_t - 1)
            m_t = b1t * grads[param] + (1.0 - b1t) * m
            v_t = self.b2 * T.sqr(grads[param]) + (1.0 - self.b2) * v
            g_t = m_t / (T.sqrt(v_t) + self.epsilon)
            p_t = param - (lr_t * g_t)

            updates[m] = m_t
            updates[v] = v_t
            updates[param] = p_t
        updates[self.i] = i_t

        return updates
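The get_updates above is an Adam-style rule written in the notation of the original paper, with b1 and b2 playing the role of 1 - beta1 and 1 - beta2 and an extra lambda decay applied to b1. A single step transcribed into NumPy, with illustrative hyper-parameter values (in the class they come from self.b1, self.b2, self.lmbda, self.learning_rate, self.epsilon and the shared counter self.i):

    import numpy as np

    b1, b2, lmbda = 0.1, 0.001, 1.0 - 1e-8
    learning_rate, epsilon = 0.0002, 1e-8

    param = np.array([0.5, -0.3])
    grad = np.array([0.2, -0.1])
    m = np.zeros_like(param)   # first-moment estimate
    v = np.zeros_like(param)   # second-moment estimate
    i = 0.0                    # step counter

    i_t = i + 1.0
    fix1 = 1.0 - (1.0 - b1) ** i_t              # bias correction for m
    fix2 = 1.0 - (1.0 - b2) ** i_t              # bias correction for v
    lr_t = learning_rate * np.sqrt(fix2) / fix1

    b1t = 1.0 - (1.0 - b1) * lmbda ** (i_t - 1.0)
    m = b1t * grad + (1.0 - b1t) * m
    v = b2 * grad ** 2 + (1.0 - b2) * v
    param = param - lr_t * m / (np.sqrt(v) + epsilon)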
    def get_funcs(self, learning_rate, grads, inp, cost, errors, lr_scalers=None):
        """
        Compute the AdaDelta updates

        Parameters
        ----------
        learning_rate : float
            Learning rate coefficient.
        grads : dict
            A dictionary mapping from the model's parameters to their
            gradients.
        lr_scalers : dict
            A dictionary mapping from the model's parameters to a learning
            rate multiplier.
        """
        updates = OrderedDict()

        tot_norm_up = 0

        gshared = OrderedDict({p: sharedX(p.get_value() * 0.0, name="%s_grad" % p.name) for p, g in grads.iteritems()})

        gsup = [(gshared[p], g) for p, g in grads.iteritems()]
        get_norms = lambda x: T.sqrt(sum(map(lambda y: (y ** 2).sum(), x)))
        gnorm = get_norms(grads.values())
        pnorm = get_norms(grads.keys())
        f_grad_shared = theano.function(inp, [cost, errors, gnorm, pnorm], updates=gsup)

        for param in gshared.keys():
            # mean_squared_grad := E[g^2]_{t-1}
            mean_square_grad = sharedX(param.get_value() * 0.0)
            # mean_square_dx := E[(\Delta x)^2]_{t-1}
            mean_square_dx = sharedX(param.get_value() * 0.0)

            if param.name is not None:
                mean_square_grad.name = "mean_square_grad_" + param.name
                mean_square_dx.name = "mean_square_dx_" + param.name

            # Accumulate gradient
            new_mean_squared_grad = self.decay * mean_square_grad + (1 - self.decay) * T.sqr(gshared[param])

            # Compute update
            epsilon = learning_rate
            rms_dx_tm1 = T.sqrt(mean_square_dx + epsilon)
            rms_grad_t = T.sqrt(new_mean_squared_grad + epsilon)
            delta_x_t = -rms_dx_tm1 / rms_grad_t * gshared[param]

            # Accumulate updates
            new_mean_square_dx = self.decay * mean_square_dx + (1 - self.decay) * T.sqr(delta_x_t)

            # Apply update
            updates[mean_square_grad] = new_mean_squared_grad
            updates[mean_square_dx] = new_mean_square_dx
            updates[param] = param + delta_x_t

            tot_norm_up += delta_x_t.norm(2)

        f_update = theano.function([learning_rate], [tot_norm_up], updates=updates, on_unused_input="ignore")

        return f_grad_shared, f_update
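get_funcs above computes the AdaDelta updates: it keeps decaying averages of both the squared gradient and the squared update, and the step is their RMS ratio, so no explicit learning rate is needed (note that the code reuses the learning_rate argument as the epsilon inside the square roots). A NumPy sketch of one AdaDelta step with a separate small epsilon and illustrative values:

    import numpy as np

    decay, epsilon = 0.95, 1e-6

    param = np.array([0.5, -0.3])
    grad = np.array([0.2, -0.1])
    mean_square_grad = np.zeros_like(param)   # E[g^2]
    mean_square_dx = np.zeros_like(param)     # E[(delta x)^2]

    mean_square_grad = decay * mean_square_grad + (1 - decay) * grad ** 2
    delta_x = -np.sqrt(mean_square_dx + epsilon) / np.sqrt(mean_square_grad + epsilon) * grad
    mean_square_dx = decay * mean_square_dx + (1 - decay) * delta_x ** 2
    param = param + delta_x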
    def get_funcs(self, learning_rate, grads, inp, cost, errors, lr_scalers=None):
        """
        .. todo::

            WRITEME
        """
        if self.gradient_clipping is not None:
            grads_norm = sum(map(lambda X: T.sqr(X).sum(), [grads[param] for param in grads.keys()]))
            grads_norm = T.sqrt(grads_norm)
            scaling_den = T.maximum(self.gradient_clipping, grads_norm)
            scaling_num = self.gradient_clipping
            for param in grads.keys():
                grads[param] = scaling_num * grads[param] / scaling_den

        updates = OrderedDict()
        velocity = OrderedDict()
        normalized_velocities = OrderedDict()

        counter = sharedX(0, "counter")
        tot_norm_up = 0
        gshared = OrderedDict({p: sharedX(p.get_value() * 0.0, name="%s_grad" % p.name) for p, g in grads.iteritems()})

        gsup = [(gshared[p], g) for p, g in grads.iteritems()]
        get_norms = lambda x: T.sqrt(sum(map(lambda y: (y ** 2).sum(), x)))
        gnorm = get_norms(grads.values())
        pnorm = get_norms(grads.keys())
        f_grad_shared = theano.function(inp, [cost, errors, gnorm, pnorm], updates=gsup)
        for param in gshared.keys():
            avg_grad_sqr = sharedX(np.zeros_like(param.get_value()))
            velocity[param] = sharedX(np.zeros_like(param.get_value()))

            next_counter = counter + 1.0

            fix_first_moment = 1.0 - self.momentum ** next_counter
            fix_second_moment = 1.0 - self.averaging_coeff ** next_counter

            if param.name is not None:
                avg_grad_sqr.name = "avg_grad_sqr_" + param.name

            new_avg_grad_sqr = self.averaging_coeff * avg_grad_sqr + (1 - self.averaging_coeff) * T.sqr(gshared[param])

            rms_grad_t = T.sqrt(new_avg_grad_sqr)
            rms_grad_t = T.maximum(rms_grad_t, self.stabilizer)
            new_velocity = self.momentum * velocity[param] - (1 - self.momentum) * gshared[param]
            normalized_velocity = (new_velocity * T.sqrt(fix_second_moment)) / (rms_grad_t * fix_first_moment)

            tot_norm_up += learning_rate * normalized_velocity.norm(2)

            normalized_velocities[param] = normalized_velocity
            updates[avg_grad_sqr] = new_avg_grad_sqr
            updates[velocity[param]] = new_velocity
            updates[param] = param + normalized_velocities[param]

        updates[counter] = counter + 1
        f_update = theano.function([learning_rate], [tot_norm_up], updates=updates, on_unused_input="ignore")

        return f_grad_shared, f_update
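The variant above first rescales all gradients so that their global L2 norm does not exceed gradient_clipping, then applies an RMSProp-with-momentum step with Adam-like bias correction. The clipping part on its own, as a NumPy sketch with made-up gradient values:

    import numpy as np

    gradient_clipping = 1.0   # maximum allowed global L2 norm (illustrative)

    grads = {"W": np.array([3.0, 4.0]), "b": np.array([0.0])}
    grads_norm = np.sqrt(sum((g ** 2).sum() for g in grads.values()))      # 5.0 here
    scale = gradient_clipping / max(gradient_clipping, grads_norm)
    grads = {name: scale * g for name, g in grads.items()}

    print(np.sqrt(sum((g ** 2).sum() for g in grads.values())))            # ~1.0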
def main():
    var = theano.shared(T.zeros(shape=(88, 100), dtype=theano.config.floatX).eval(), name="W")
    updates = [(var, add_uniform(input=var, noise_level=0.02))]

    stats = get_stats(var)
    l1 = stats.pop("l1")
    l2 = stats.pop("l2")
    min = stats.pop("min")
    max = stats.pop("max")
    var = stats.pop("var")
    std = stats.pop("std")
    mean = stats.pop("mean")

    mean_monitor = Monitor("mean", mean, train=True, valid=True, out_service=FileService("outs/mean.txt"))
    var_monitor = Monitor("var", var, out_service=FileService("outs/var.txt"))

    w_channel = MonitorsChannel("W", monitors=mean_monitor)

    stat_channel = MonitorsChannel("stats", monitors=[var_monitor])

    monitors = [w_channel, stat_channel]

    train_collapsed_raw = collapse_channels(monitors, train=True)
    train_collapsed = OrderedDict([(item[0], item[1]) for item in train_collapsed_raw])
    train_services = OrderedDict([(item[0], item[2]) for item in train_collapsed_raw])
    valid_collapsed_raw = collapse_channels(monitors, valid=True)
    valid_collapsed = OrderedDict([(item[0], item[1]) for item in valid_collapsed_raw])
    valid_services = OrderedDict([(item[0], item[2]) for item in valid_collapsed_raw])

    log.debug("compiling...")
    f = theano.function(inputs=[], outputs=list(train_collapsed.values()), updates=updates)
    f2 = theano.function(inputs=[], outputs=list(valid_collapsed.values()), updates=updates)
    log.debug("done")

    t1 = time.time()

    for epoch in range(10):
        t = time.time()
        log.debug(epoch)
        vals = f()
        m = OrderedDict(zip(train_collapsed.keys(), vals))
        for name, service in train_services.items():
            if name in m:
                service.write(m[name], "train")
        log.debug("----- " + make_time_units_string(time.time() - t))

    for epoch in range(10):
        t = time.time()
        log.debug(epoch)
        vals = f2()
        m = OrderedDict(zip(valid_collapsed.keys(), vals))
        for name, service in valid_services.items():
            if name in m:
                service.write(m[name], "valid")
        log.debug("----- " + make_time_units_string(time.time() - t))

    log.debug("TOTAL TIME " + make_time_units_string(time.time() - t1))
Example #11
    def get_layer_monitoring_channels(self, state_below=None, state=None, targets=NotImplementedError):

        if self.no_affine:
            return OrderedDict()

        W_class = self.W_class
        W_cluster = self.W_cluster

        assert W_class.ndim == 3
        assert W_cluster.ndim == 2

        sq_W = T.sqr(W_cluster)
        sq_W_class = T.sqr(W_class)

        row_norms = T.sqrt(sq_W.sum(axis=1))
        col_norms = T.sqrt(sq_W.sum(axis=0))

        row_norms_class = T.sqrt(sq_W_class.sum(axis=1))
        col_norms_class = T.sqrt(sq_W_class.sum(axis=0))

        rval = OrderedDict(
            [
                ("row_norms_min", row_norms.min()),
                ("row_norms_mean", row_norms.mean()),
                ("row_norms_max", row_norms.max()),
                ("col_norms_min", col_norms.min()),
                ("col_norms_mean", col_norms.mean()),
                ("col_norms_max", col_norms.max()),
                ("class_row_norms_min", row_norms_class.min()),
                ("class_row_norms_mean", row_norms_class.mean()),
                ("class_row_norms_max", row_norms_class.max()),
                ("class_col_norms_min", col_norms_class.min()),
                ("class_col_norms_mean", col_norms_class.mean()),
                ("class_col_norms_max", col_norms_class.max()),
            ]
        )

        if (state_below is not None) or (state is not None):
            if state is None:

                # for value in get_debug_values(state_below):
                # print 'value is'+ value
                state = self.fprop(state_below, targets)
            # print state
            probclass, probcluster = state
            mx = probclass.max(axis=1)
            rval.update(
                OrderedDict([("mean_max_class", mx.mean()), ("max_max_class", mx.max()), ("min_max_class", mx.min())])
            )
            if targets is not None:
                rval["nll"] = self.cost(Y=targets, Y_hat=(probclass, probcluster))
                rval["perplexity"] = 10 ** (rval["nll"] / np.log(10).astype("float32"))
                rval["entropy"] = rval["nll"] / np.log(2).astype("float32")
        return rval
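The last two channels are unit conversions of the mean negative log-likelihood (in nats): 10 ** (nll / ln 10) is just exp(nll), i.e. the perplexity, and dividing by ln 2 expresses the same quantity in bits. A quick NumPy check with an arbitrary nll value:

    import numpy as np

    nll = 2.3                                    # mean negative log-likelihood in nats
    perplexity = 10 ** (nll / np.log(10))        # identical to np.exp(nll)
    entropy = nll / np.log(2)                    # nats converted to bits

    print(np.isclose(perplexity, np.exp(nll)))   # True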
Example #12
    def get_updates(self, grads):
        grads = OrderedDict(grads)
        updates = OrderedDict()

        for param in grads.keys():
            decreased_learning_rate = T.cast(
                self.learning_rate / (1 + (self.decrease_constant * self.current_iteration)), dtype=theano.config.floatX
            )
            updates[param] = param - decreased_learning_rate * grads[param]

        updates[self.current_iteration] = self.current_iteration + 1

        return updates
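This optimizer is plain SGD with a 1 / (1 + k*t) learning-rate schedule, where t is the shared current_iteration counter incremented once per call. The schedule for a few illustrative iteration counts:

    learning_rate, decrease_constant = 0.1, 1e-3

    for iteration in (0, 1000, 10000):
        lr_t = learning_rate / (1 + decrease_constant * iteration)
        print(iteration, lr_t)   # 0.1, 0.05, ~0.00909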
Example #13
    def get_gradients(self, model, data, **kwargs):

        cost_cd, cost_ci = model.cost_from_X(data)
        params_dict = model.get_params()
        params = list(params_dict)

        zero_grads = []
        if self.zero_ci_grad_for_cd:
            # how to get this in less explicit way, i.e. using only dict?
            print "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
            assert model.layers[-1].M in params_dict
            assert model.layers[-1].m in params_dict
            zero_grads = [model.layers[-1].M, model.layers[-1].m]

        grads_cd = T.grad(cost_cd, params, disconnected_inputs="ignore", consider_constant=zero_grads)
        grads_ci = T.grad(cost_ci, params, disconnected_inputs="ignore")

        gradients_cd = OrderedDict(izip(params, grads_cd))
        gradients_ci = OrderedDict(izip(params, grads_ci))

        indiv_results = []
        indiv_results.append((gradients_cd, OrderedDict()))
        indiv_results.append((gradients_ci, OrderedDict()))

        grads = OrderedDict()
        updates = OrderedDict()
        params = model.get_params()

        for coeff, packed in zip([self.coeff_cd, self.coeff_ci], indiv_results):
            g, u = packed
            for param in g:
                if param not in params:
                    raise ValueError(
                        "A shared variable ("
                        + str(param)
                        + ") that is not a parameter appeared in a cost gradient dictionary."
                    )
            for param in g:
                assert param.ndim == g[param].ndim
                v = coeff * g[param]
                if param not in grads:
                    grads[param] = v
                else:
                    grads[param] = grads[param] + v
                assert grads[param].ndim == param.ndim
            assert not any([state in updates for state in u])
            assert not any([state in params for state in u])
            updates.update(u)

        return grads, updates
Example #14
    def get_gradients(self, model, data, **kwargs):
        cost = self._cost(model, data, **kwargs)

        params = list(model.get_params())

        grads = T.grad(cost, params, disconnected_inputs="ignore", consider_constant=[self.sampler.particles])

        gradients = OrderedDict(izip(params, grads))

        updates = OrderedDict()

        sampler_updates = self.sampler.updates()
        updates.update(sampler_updates)
        return gradients, updates
Example #15
 def __init__(
     self,
     parent=[],
     parent_dim=[],
     nout=None,
     init_W=InitCell("randn"),
     init_b=InitCell("zeros"),
     cons=0.0,
     name=None,
     lr_scaler=None,
     **kwargs
 ):
     super(StemCell, self).__init__(**kwargs)
     if name is None:
          name = self.__class__.__name__.lower()
     self.name = name
     self.nout = nout
     self.init_W = init_W
     self.init_b = init_b
     self.cons = cons
     self.parent = OrderedDict()
     parent_dim = tolist(parent_dim)
     for i, par in enumerate(tolist(parent)):
         if len(parent_dim) != 0 and len(parent) != 0:
             if len(parent) != len(parent_dim):
                  raise AssertionError(
                      "You probably made a mistake and provided the wrong "
                      "number of parent_dim values; this would end up with "
                      "a model containing a bug."
                  )
             self.parent[par] = parent_dim[i]
         else:
             self.parent[par] = None
     self.params = OrderedDict()
     self.lr_scaler = lr_scaler
Example #16
class StemCell(NonlinCell):
    """
    WRITEME

    Parameters
    ----------
    .. todo::
    """

    def __init__(
        self,
        parent=[],
        parent_dim=[],
        nout=None,
        init_W=InitCell("randn"),
        init_b=InitCell("zeros"),
        cons=0.0,
        name=None,
        lr_scaler=None,
        **kwargs
    ):
        super(StemCell, self).__init__(**kwargs)
        if name is None:
            name = self.__class__.__name__.lower()
        self.name = name
        self.nout = nout
        self.init_W = init_W
        self.init_b = init_b
        self.cons = cons
        self.parent = OrderedDict()
        parent_dim = tolist(parent_dim)
        for i, par in enumerate(tolist(parent)):
            if len(parent_dim) != 0 and len(parent) != 0:
                if len(parent) != len(parent_dim):
                    raise AssertionError(
                        "You probably made a mistake and provided the wrong "
                        "number of parent_dim values; this would end up with "
                        "a model containing a bug."
                    )
                self.parent[par] = parent_dim[i]
            else:
                self.parent[par] = None
        self.params = OrderedDict()
        self.lr_scaler = lr_scaler

    def get_params(self):
        return self.params

    def fprop(self, x=None):
        raise NotImplementedError(str(type(self)) + " does not implement Layer.fprop.")

    def alloc(self, x):
        self.params[x.name] = x

    def initialize(self):
        for parname, parout in self.parent.items():
            W_shape = (parout, self.nout)
            W_name = "W_" + parname + "__" + self.name
            self.alloc(self.init_W.get(W_shape, W_name))
        self.alloc(self.init_b.get(self.nout, "b_" + self.name))
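initialize above allocates one (parent_dim, nout) weight matrix per parent, named after both the parent and the layer, plus a single bias vector. A sketch of just the shape and name bookkeeping (the layer name "dense" and the dimensions are made up):

    from collections import OrderedDict

    nout = 4
    parent = OrderedDict([("x", 3), ("h", 5)])   # parent name -> parent output dim

    shapes = OrderedDict()
    for parname, parout in parent.items():
        shapes["W_" + parname + "__dense"] = (parout, nout)
    shapes["b_dense"] = (nout,)

    print(shapes)
    # OrderedDict([('W_x__dense', (3, 4)), ('W_h__dense', (5, 4)), ('b_dense', (4,))])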
Example #17
    def __init__(self, model):
        """
        Makes a monitor for `model`. Assumes the model has not been
        trained at all yet.

        Parameters
        ----------
        model : pylearn2.models.model.Model instance
        """
        self.training_succeeded = False
        self.model = model
        self.channels = OrderedDict()
        self._num_batches_seen = 0
        self._examples_seen = 0
        self._epochs_seen = 0
        self._datasets = []
        self._iteration_mode = []
        self._batch_size = []
        self._num_batches = []
        self._dirty = True
        self._rng_seed = []
        self.names_to_del = ["theano_function_mode"]
        self.t0 = time.time()
        self.theano_function_mode = None

        # Initialize self._nested_data_specs, self._data_specs_mapping,
        # and self._flat_data_specs
        self._build_data_specs()
Example #18
 def __init__(
     self,
     recurrent=[],
     recurrent_dim=[],
     skip_list=[],
     use_fast_fprop=0,
     self_recurrent=1,
     init_state_cons=0.0,
     init_U=InitCell("ortho"),
     **kwargs
 ):
     super(RecurrentLayer, self).__init__(**kwargs)
     self.recurrent = OrderedDict()
     if self_recurrent:
         self.recurrent[self.name] = self.nout
     recurrent_dim = tolist(recurrent_dim)
     for i, rec in enumerate(tolist(recurrent)):
         if len(recurrent_dim) != 0:
             self.recurrent[rec] = recurrent_dim[i]
         else:
             self.recurrent[rec] = None
     self.init_U = init_U
     self.init_states = OrderedDict()
     self.init_state_cons = init_state_cons
     self.use_fast_fprop = use_fast_fprop
     self.skip_list = tolist(skip_list)
     if len(self.skip_list) > 0:
         if len(self.skip_list) != len(self.parent):
             raise ValueError("length of parents and skip list should match")
Example #19
    def build_train_fn(self,):
        self.lr_theano = T.scalar("lr")
        self.grad_inputs = self.inputs + [self.lr_theano]
        if self.momentum:
            self.mom_theano = T.scalar("mom")
            self.grad_inputs = self.grad_inputs + [self.mom_theano]

        self.gparams = T.grad(self.costs[0], self.params, consider_constant=self.consider_constant)
        if not self.momentum:
            print "Building SGD optimization graph without momentum"
            updates = OrderedDict((i, i - self.lr_theano * j) for i, j in zip(self.params, self.gparams))
        else:
            print "Building SGD optimization graph with momentum"
            updates = OrderedDict()
            for param, param_mom, gparam in zip(self.params, self.params_mom, self.gparams):
                param_inc = self.mom_theano * param_mom - self.lr_theano * gparam
                updates[param_mom] = param_inc
                updates[param] = param + param_inc
        self.calc_cost = theano.function(self.inputs, self.costs)
        if self.updates_old:
            # Copy first to avoid mutating an updates dict that belongs to the
            # model class (a very unlikely case).
            self.updates_old = copy.copy(self.updates_old)
            self.updates_old.update(updates)
        else:
            self.updates_old = OrderedDict()
            self.updates_old.update(updates)

        self.f = theano.function(self.grad_inputs, self.costs, updates=self.updates_old)
Example #20
    def __init__(self, name, data, model, optimizer, cost, outputs, debug_print=0, trainlog=None, extension=None):
        self.name = name
        self.data = data
        self.model = model
        self.optimizer = optimizer
        self.inputs = model.inputs
        self.cost = cost
        self.outputs = tolist(outputs)
        self.updates = OrderedDict()
        self.updates.update(model.updates)
        self.extension = extension
        self.debug_print = debug_print
        lr_scalers = OrderedDict()
        for node in self.model.nodes:
            lr_scalers[node.name] = node.lr_scaler
        self.optimizer.lr_scalers = lr_scalers

        t0 = time.time()
        self.cost_fn = self.build_training_graph()
        print "Elapsed compilation time: %f" % (time.time() - t0)
        if self.debug_print:
            from theano.printing import debugprint

            debugprint(self.cost_fn)
        if trainlog is None:
            self.trainlog = TrainLog()
        else:
            self.trainlog = trainlog
        self.endloop = 0
Example #21
    def __init__(self, model):
        """
        Makes a monitor for `model`. Assumes the model has not been
        trained at all yet.

        Parameters
        ----------
        model : pylearn2.models.model.Model instance
        """
        self.training_succeeded = False
        self.model = model
        self.channels = OrderedDict()
        self._num_batches_seen = 0
        self._examples_seen = 0
        self._epochs_seen = 0
        self._datasets = []
        self._iteration_mode = []
        self._batch_size = []
        self._num_batches = []
        self._dirty = True
        self._rng_seed = []
        self.names_to_del = ["theano_function_mode"]
        self.t0 = time.time()
        # Determine whether the model should use topological or vector form of
        # examples. If the model acts on a space with more than the batch index
        # and channel dimension, the model has topological dimensions, so the
        # topological view of the data should be used.
        vector = model.get_input_space().make_theano_batch(name="monitoring_input")
        if isinstance(vector.type, theano.sparse.SparseType):
            self.topo = False
        else:
            self.topo = len(vector.type.broadcastable) > 2

        self.require_label = False
        self.theano_function_mode = None
Example #22
    def on_attach(self, fgraph):
        """
        When attaching to a new fgraph, check that
            1) This DestroyHandler wasn't already attached to some fgraph
               (its data structures are only set up to serve one)
            2) The FunctionGraph doesn't already have a DestroyHandler.
               This would result in it validating everything twice, causing
               compilation to be slower.

        Give the FunctionGraph instance:
            1) A new method "destroyers(var)"
                TODO: what does this do exactly?
            2) A new attribute, "destroy_handler"
        TODO: WRITEME: what does this do besides the checks?
        """

        ####### Do the checking ###########
        already_there = False
        if self.fgraph is fgraph:
            already_there = True
        if self.fgraph is not None:
            raise Exception("A DestroyHandler instance can only serve one" " FunctionGraph. (Matthew 6:24)")
        for attr in ("destroyers", "destroy_handler"):
            if hasattr(fgraph, attr):
                already_there = True

        if already_there:
            # FunctionGraph.attach_feature catches AlreadyThere and cancels the attachment
            raise toolbox.AlreadyThere(
                "DestroyHandler feature is already present" " or in conflict with another plugin."
            )

        ####### Annotate the FunctionGraph ############
        self.unpickle(fgraph)
        fgraph.destroy_handler = self

        self.fgraph = fgraph
        self.destroyers = OrderedSet()  # set of Apply instances with non-null destroy_map
        self.view_i = OrderedDict()  # variable -> variable used in calculation
        self.view_o = OrderedDict()  # variable -> set of variables that use this one as a direct input
        # clients: how many times does an apply use a given variable
        self.clients = OrderedDict()  # variable -> apply -> ninputs
        self.stale_droot = True

        self.debug_all_apps = OrderedSet()
        if self.do_imports_on_attach:
            toolbox.Bookkeeper.on_attach(self, fgraph)
Example #23
    def get_layer_monitoring_channels(self, state_below=None, state=None, targets=None):

        # channels that do not require state information
        if self.no_affine:
            return OrderedDict()

        W = self.W

        assert W.ndim == 2

        sq_W = T.sqr(W)

        row_norms = T.sqrt(sq_W.sum(axis=1))
        col_norms = T.sqrt(sq_W.sum(axis=0))

        rval = OrderedDict(
            [
                ("row_norms_min", row_norms.min()),
                ("row_norms_mean", row_norms.mean()),
                ("row_norms_max", row_norms.max()),
                ("col_norms_min", col_norms.min()),
                ("col_norms_mean", col_norms.mean()),
                ("col_norms_max", col_norms.max()),
            ]
        )

        if (state_below is not None) or (state is not None):
            if state is None:
                state = self.fprop(state_below)

            mx = state.max(axis=1)

            rval.update(
                OrderedDict([("mean_max_class", mx.mean()), ("max_max_class", mx.max()), ("min_max_class", mx.min())])
            )

            if targets is not None:
                y_hat = T.argmax(state, axis=1)
                y = T.argmax(targets, axis=1)
                misclass = T.neq(y, y_hat).mean()
                misclass = T.cast(misclass, config.floatX)
                rval["misclass"] = misclass
                rval["nll"] = self.cost(Y_hat=state, Y=targets)
                rval["perplexity"] = 2 ** (rval["nll"] / T.log(2))

        return rval
    def get_funcs(self, learning_rate, grads, inp, cost, errors, lr_scalers=None):
        """
        Provides the updates for learning with gradient descent + momentum.

        Parameters
        ----------
        learning_rate : float
            Learning rate coefficient.
        grads : dict
            A dictionary mapping from the model's parameters to their
            gradients.
        lr_scalers : dict
            A dictionary mapping from the model's parameters to a learning
            rate multiplier.
        """
        gshared = OrderedDict({p: sharedX(p.get_value() * 0.0, name="%s_grad" % p.name) for p, g in grads.iteritems()})

        gsup = [(gs, g) for gs, g in zip(gshared.values(), grads.values())]
        get_norms = lambda x: T.sqrt(sum(map(lambda y: (y ** 2).sum(), x)))
        gnorm = get_norms(grads.values())
        pnorm = get_norms(grads.keys())
        f_grad_shared = theano.function(inp, [cost, errors, gnorm, pnorm], updates=gsup)
        updates = OrderedDict()

        for param, grad in gshared.items():
            vel = sharedX(param.get_value() * 0.0)
            assert param.dtype == vel.dtype
            assert grad.dtype == param.dtype
            if param.name is not None:
                vel.name = "vel_" + param.name

            scaled_lr = learning_rate * lr_scalers.get(param, 1.0)
            updates[vel] = self.momentum * vel - scaled_lr * grad

            inc = updates[vel]
            if self.nesterov_momentum:
                inc = self.momentum * inc - scaled_lr * grad

            assert inc.dtype == vel.dtype
            updates[param] = param + inc

        f_update = theano.function([learning_rate], [], updates=updates, on_unused_input="ignore")

        return f_grad_shared, f_update
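The momentum rule above updates a velocity per parameter and, in the Nesterov variant, applies the momentum once more to the freshly updated velocity before stepping. A NumPy sketch of one step, with illustrative hyper-parameters and the lr_scalers handling omitted:

    import numpy as np

    momentum, learning_rate, nesterov = 0.9, 0.01, True

    param = np.array([0.5, -0.3])
    grad = np.array([0.2, -0.1])
    vel = np.zeros_like(param)

    vel = momentum * vel - learning_rate * grad
    inc = vel
    if nesterov:
        # look one step ahead along the new velocity
        inc = momentum * vel - learning_rate * grad
    param = param + inc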
Example #25
    def get_layer_monitoring_channels(self, state_below=None, state=None, targets=None):

        # channels that do not require state information
        #         if self.no_affine:
        #             rval = OrderedDict()
        #
        #         W = self.W
        #
        #         assert W.ndim == 2
        #
        #         sq_W = T.sqr(W)
        #
        #         row_norms = T.sqrt(sq_W.sum(axis=1))
        #         col_norms = T.sqrt(sq_W.sum(axis=0))
        #
        #         rval = OrderedDict([('row_norms_min',  row_norms.min()),
        #                             ('row_norms_mean', row_norms.mean()),
        #                             ('row_norms_max',  row_norms.max()),
        #                             ('col_norms_min',  col_norms.min()),
        #                             ('col_norms_mean', col_norms.mean()),
        #                             ('col_norms_max',  col_norms.max()), ])

        rval = OrderedDict()
        if (state_below is not None) or (state is not None):
            if state is None:
                state = self.fprop(state_below)

            mx = state.max(axis=1)

            rval.update(
                OrderedDict([("mean_max_class", mx.mean()), ("max_max_class", mx.max()), ("min_max_class", mx.min())])
            )

            if targets is not None:
                y_hat = self.target_convert(T.argmax(state, axis=1))
                # Assume target is in [0,1] as binary one-hot
                y = self.target_convert(T.argmax(targets, axis=1))
                misclass = T.neq(y, y_hat).mean()
                misclass = T.cast(misclass, config.floatX)
                rval["misclass"] = misclass
                rval["nll"] = self.cost(Y_hat=state, Y=targets)

        return rval
Example #26
    def __init__(
        self, batch_size, x_axes=["b", "c", 0, 1], fprop_code=True, lr=0.01, n_steps=10, truncate=-1, *args, **kwargs
    ):

        super(ConvSparseCoding, self).__init__(*args, **kwargs)
        self.batch_size = batch_size
        self.fprop_code = fprop_code
        self.n_steps = n_steps
        self.truncate = truncate
        self.lr = lr
        self._scan_updates = OrderedDict()
Example #27
    def get_gradients(self, model, X, Y=None, **kwargs):

        if Y is None and self.supervised:
            raise ValueError("no targets provided while some of the " + "costs in the sum are supervised costs")

        indiv_results = []
        for cost in self.costs:
            if cost.supervised:
                Y_to_pass = Y
            else:
                Y_to_pass = None
            result = cost.get_gradients(model, X, Y_to_pass, **kwargs)
            indiv_results.append(result)

        grads = OrderedDict()
        updates = OrderedDict()

        params = model.get_params()

        for coeff, packed in zip(self.coeffs, indiv_results):
            g, u = packed
            for param in g:
                if param not in params:
                    raise ValueError(
                        "A shared variable ("
                        + str(param)
                        + ") that is not a parameter appeared in a cost gradient dictionary."
                    )
            for param in g:
                assert param.ndim == g[param].ndim
                v = coeff * g[param]
                if param not in grads:
                    grads[param] = v
                else:
                    grads[param] = grads[param] + v
                assert grads[param].ndim == param.ndim
            assert not any([state in updates for state in u])
            assert not any([state in params for state in u])
            updates.update(u)

        return grads, updates
Example #28
 def __init__(self, recurrent=[], recurrent_dim=[], self_recurrent=1, init_U=InitCell("ortho"), **kwargs):
     super(RecurrentLayer, self).__init__(**kwargs)
     self.recurrent = OrderedDict()
     if self_recurrent:
         self.recurrent[self.name] = self.nout
     recurrent_dim = tolist(recurrent_dim)
     for i, rec in enumerate(tolist(recurrent)):
         if len(recurrent_dim) != 0:
             self.recurrent[rec] = recurrent_dim[i]
         else:
             self.recurrent[rec] = None
     self.init_U = init_U
    def get_lr_scalers(self):
        rval = OrderedDict()

        params = self.get_params()

        for layer in self.layers[:-1]:
            contrib = layer.get_lr_scalers()

            assert isinstance(contrib, OrderedDict)
            # No two layers can contend to scale a parameter
            assert not any([key in rval for key in contrib])
            # Don't try to scale anything that's not a parameter
            assert all([key in params for key in contrib])

            rval.update(contrib)

        for layer in self.layers[-1:]:
            contrib = layer.get_lr_scalers()

            assert isinstance(contrib, OrderedDict)
            # No two layers can contend to scale a parameter
            assert not any([key in rval for key in contrib])
            # Don't try to scale anything that's not a parameter
            assert all([key in params for key in contrib])

            rval.update(contrib)

        assert all([isinstance(val, float) for val in rval.values()])

        return rval
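get_lr_scalers simply merges the per-layer dictionaries, asserting that no parameter is claimed by two layers. The merge in isolation, with invented parameter names and scalers:

    from collections import OrderedDict

    layer_scalers = [OrderedDict([("W0", 0.5)]),
                     OrderedDict([("W1", 1.0), ("b1", 2.0)])]

    rval = OrderedDict()
    for contrib in layer_scalers:
        # no two layers may contend to scale the same parameter
        assert not any(key in rval for key in contrib)
        rval.update(contrib)

    print(rval)   # OrderedDict([('W0', 0.5), ('W1', 1.0), ('b1', 2.0)])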
Example #30
class RecurrentLayer(StemCell):
    """
    Abstract class for recurrent layers

    Parameters
    ----------
    .. todo::
    """

    def __init__(
        self,
        recurrent=[],
        recurrent_dim=[],
        skip_list=[],
        use_fast_fprop=0,
        self_recurrent=1,
        init_state_cons=0.0,
        init_U=InitCell("ortho"),
        **kwargs
    ):
        super(RecurrentLayer, self).__init__(**kwargs)
        self.recurrent = OrderedDict()
        if self_recurrent:
            self.recurrent[self.name] = self.nout
        recurrent_dim = tolist(recurrent_dim)
        for i, rec in enumerate(tolist(recurrent)):
            if len(recurrent_dim) != 0:
                self.recurrent[rec] = recurrent_dim[i]
            else:
                self.recurrent[rec] = None
        self.init_U = init_U
        self.init_states = OrderedDict()
        self.init_state_cons = init_state_cons
        self.use_fast_fprop = use_fast_fprop
        self.skip_list = tolist(skip_list)
        if len(self.skip_list) > 0:
            if len(self.skip_list) != len(self.parent):
                raise ValueError("length of parents and skip list should match")

    def get_init_state(self, batch_size):
        state = T.zeros((batch_size, self.nout), dtype=theano.config.floatX) + self.init_state_cons
        state = T.unbroadcast(state, *range(state.ndim))
        return state

    def initialize(self):
        super(RecurrentLayer, self).initialize()
        for recname, recout in self.recurrent.items():
            U_shape = (recout, self.nout)
            U_name = "U_" + recname + "__" + self.name
            self.alloc(self.init_U.get(U_shape, U_name))