Example #1
0
    def __init__(self, layers, dataset=None, weights_only=False, name="model", optimizer=None):
        """
        Set up the bookkeeping attributes and build or load the layer container.

        Arguments:
            layers: a layer container (Sequential, Tree or SingleOutputTree), a
                    raw list of layers (wrapped in a Sequential), a
                    ModelDescription / dict handed to deserialize, or a str
                    handed to load_params
            dataset: deprecated; a warning is logged when it is not None
            weights_only (bool): negated and forwarded as ``load_states`` when
                                 loading from a description or a str
            name (str): forwarded to the parent constructor
            optimizer: stored on the instance unchanged
        """
        super(Model, self).__init__(name)

        # progress counters and status flags
        self.epoch_index = 0
        self.nbatches = 0
        self.ndata = 0
        self.finished = False
        self.initialized = False

        self.cost = None
        self.optimizer = optimizer
        self.params = None  # should be able to remove
        self.states = None  # should be able to remove

        if dataset is not None:
            logger.warning('dataset is a deprecated argument and will be ignored')

        layers_type = type(layers)
        if layers_type in (ModelDescription, dict):
            # serialized model description (dataset could be None here)
            self.deserialize(layers, load_states=(not weights_only))
        elif layers_type is str:
            self.load_params(layers, load_states=(not weights_only))
        elif layers_type in (Sequential, Tree, SingleOutputTree):
            # already a container: use it directly
            self.layers = layers
        else:
            # anything else is wrapped in a Sequential container
            self.layers = Sequential(layers)
        self.layers.propagate_parallelism("Data")
Example #2
0
    def __init__(self):
        """
        Build a two-stream network (two Affine stacks merged with
        MergeMultistream, then Dropout, LSTM and a Softmax Affine layer), wrap it
        in a Model and initialize that model with self.in_shape and a
        GeneralizedCostMask over CrossEntropyMulti.
        """
        self.in_shape = [1024, (2538, 38)]

        zero_init = Constant(0)

        # the two input streams; construction order is kept as in the layer list
        image_path = Sequential([
            Affine(20, zero_init, bias=zero_init),
            Affine(10, zero_init, bias=zero_init),
        ])
        sent_path = Sequential([
            Affine(30, zero_init, bias=zero_init),
            Affine(10, zero_init),
        ])

        merged = MergeMultistream(layers=[image_path, sent_path], merge="recurrent")
        dropout = Dropout(keep=0.5)
        recurrent = LSTM(4, zero_init, activation=Logistic(), gate_activation=Tanh(),
                         reset_cells=True)
        classifier = Affine(20, zero_init, bias=zero_init, activation=Softmax())

        layers = [merged, dropout, recurrent, classifier]
        self.layers = layers
        self.cost = GeneralizedCostMask(CrossEntropyMulti())

        self.model = Model(layers=layers)
        self.model.initialize(self.in_shape, cost=self.cost)
Example #3
0
def inception_bare(ref_module, kvals, name="i"):
    """
    Build the three inception branches as separate Sequential containers and
    copy the 'W' weights over from the corresponding branches of ref_module[0].

    Arguments:
        ref_module: indexable whose first element exposes ``layers`` holding
                    exactly three reference branches
        kvals: triple (p1, p2, p3) of per-branch settings; p1[0], p2[0], p2[1]
               and p3[1] are passed to fshape, p3[0] is the pooling op
        name (str): not used by this function

    Returns:
        tuple: the ``layers`` lists of branch 1, branch 2 and branch 3
    """
    (p1, p2, p3) = kvals

    def copy_weights(branch, branch_ref):
        # mirror W from each reference layer into its parameterized counterpart
        for layer, ref_layer in zip(branch.layers, branch_ref.layers):
            if layer.has_params:
                layer.set_params({'params': {'W': ref_layer.W.get()}})

    layers1 = []
    if p1[0]:
        layers1.append(Conv(fshape(1, p1[0]), **common))

    layers2 = [Conv(fshape(1, p2[0]), **common),
               Conv(fshape(3, p2[1]), **commonp1)]

    layers3 = [Pooling(op=p3[0], **pool3s1p1)]
    if p3[1]:
        layers3.append(Conv(fshape(1, p3[1]), **common))

    branch1 = Sequential(layers1)
    branch2 = Sequential(layers2)
    branch3 = Sequential(layers3)

    (branch1_ref, branch2_ref, branch3_ref) = ref_module[0].layers

    # branches 1 and 3 only receive weights when their optional conv exists
    if p1[0]:
        copy_weights(branch1, branch1_ref)
    copy_weights(branch2, branch2_ref)
    if p3[1]:
        copy_weights(branch3, branch3_ref)

    return (branch1.layers, branch2.layers, branch3.layers)
Example #4
0
def inception(kvals, name="i"):
    """
    Assemble an inception module as a single depth-merged MergeBroadcast layer.

    Arguments:
        kvals: triple (p1, p2, p3) of per-branch settings; the first branch is
               only created when p1[0] is truthy, and the pooling branch only
               gets a trailing conv when p3[1] is truthy
        name (str): not used by this function

    Returns:
        list: one-element list holding the MergeBroadcast layer
    """
    (p1, p2, p3) = kvals

    partitions = []

    # optional 1x1 branch
    if p1[0]:
        partitions.append(Sequential([Conv(fshape(1, p1[0]), **common)]))

    # 1x1 followed by 3x3 branch
    partitions.append(Sequential([Conv(fshape(1, p2[0]), **common),
                                  Conv(fshape(3, p2[1]), **commonp1)]))

    # pooling branch with an optional 1x1 conv behind it
    pool_path = [Pooling(op=p3[0], **pool3s1p1)]
    if p3[1]:
        pool_path.append(Conv(fshape(1, p3[1]), **common))
    partitions.append(Sequential(pool_path))

    return [MergeBroadcast(layers=partitions, merge="depth")]
Example #5
0
    def __init__(self, layers, dataset=None, weights_only=False, name="model", optimizer=None):
        """
        Set up the bookkeeping attributes and build or load the layer container.

        Arguments:
            layers: a layer container (Sequential, Tree or SingleOutputTree), a
                    raw list of layers (wrapped in a Sequential), a
                    ModelDescription / dict handed to deserialize, or a str
                    handed to load_params
            dataset: deprecated; a warning is logged when it is not None
            weights_only (bool): negated and forwarded as ``load_states`` when
                                 loading from a description or a str
            name (str): forwarded to the parent constructor
            optimizer: stored on the instance unchanged
        """
        super(Model, self).__init__(name)

        # progress counters and status flags
        self.epoch_index = 0
        self.nbatches = 0
        self.ndata = 0
        self.finished = False
        self.initialized = False

        self.cost = None
        self.optimizer = optimizer
        self.params = None  # should be able to remove
        self.states = None  # should be able to remove

        if dataset is not None:
            logger.warning('dataset is a deprecated argument and will be ignored')

        layers_type = type(layers)
        if layers_type in (ModelDescription, dict):
            # serialized model description (dataset could be None here)
            self.deserialize(layers, load_states=(not weights_only))
        elif layers_type is str:
            self.load_params(layers, load_states=(not weights_only))
        elif layers_type in (Sequential, Tree, SingleOutputTree):
            # already a container: use it directly
            self.layers = layers
        else:
            # anything else is wrapped in a Sequential container
            self.layers = Sequential(layers)
        self.layers.propagate_parallelism("Data")
Example #6
0
def test_concat_l1_l1(backend_default, allrand_args):
    """
    Check fprop and bprop of two linear (Affine) streams merged with
    MergeMultistream(merge="stack") against a numpy reference.
    """
    dtypeu = np.float32
    w_rng, _rngmax = allrand_args
    # streams deliberately differ in both input and output size
    nins = [128, 1024]
    nouts = [64, 2048]
    batch_size = 16
    NervanaObject.be.bsz = batch_size
    be = NervanaObject.be

    init_unif = Uniform(low=w_rng[0], high=w_rng[1])

    layers = []
    for nout in nouts:
        layers.append(Sequential(Affine(nout=nout, init=init_unif)))

    inputs = []
    for nin in nins:
        host_input = dtypeu(np.random.random((nin, batch_size)))
        inputs.append(be.array(host_input))

    merge = MergeMultistream(layers, merge="stack")
    assert len(inputs) == len(layers)
    merge.configure(inputs)
    merge.allocate()
    merge.set_deltas(None)
    out = merge.fprop(inputs).get()

    # reference forward pass: W . x per stream, stacked along the first axis
    sublayers = [stream.layers[0] for stream in layers]
    weights = [sub.W.get() for sub in sublayers]
    expected_parts = []
    for w, inp in zip(weights, inputs):
        expected_parts.append(np.dot(w, inp.get()))
    out_exp = np.concatenate(expected_parts)

    assert np.allclose(out, out_exp, atol=1e-3)

    # reference backward pass: dW = err . x^T per stream
    err_lst = []
    for nout in nouts:
        err_lst.append(dtypeu(np.random.random((nout, batch_size))))
    err_concat = np.concatenate(err_lst)
    merge.bprop(be.array(err_concat))

    dW_exp_lst = []
    for err, inp in zip(err_lst, inputs):
        dW_exp_lst.append(np.dot(err, inp.get().T))

    for sub, dW_exp in zip(sublayers, dW_exp_lst):
        assert np.allclose(sub.dW.get(), dW_exp)
Example #7
0
    def __init__(self, layers, name="model", optimizer=None):
        """
        Set up the bookkeeping attributes and store the layer container.

        Arguments:
            layers: a Sequential or Tree container, or anything else, which is
                    wrapped in a Sequential
            name (str): forwarded to the parent constructor
            optimizer: stored on the instance unchanged
        """
        super(Model, self).__init__(name)

        # status flags
        self.epoch_index = 0
        self.finished = False
        self.initialized = False

        self.cost = None
        self.optimizer = optimizer
        self.params = None  # should be able to remove
        self.states = None  # should be able to remove

        if type(layers) in (Sequential, Tree):
            # already a container: use it directly
            self.layers = layers
        else:
            # raw list of layers: wrap it in a Sequential container
            self.layers = Sequential(layers)
        self.layers_to_optimize = self.layers.layers_to_optimize
Example #8
0
def test_concat_sequence_l1_l1(backend_default, allrand_args, deltas_buffer):
    """
    Check fprop and bprop of two linear (Affine) streams merged with
    MergeMultistream(merge="recurrent") against a numpy reference; the streams
    share feature sizes but use different numbers of steps.
    """
    dtypeu = np.float32
    w_rng, _rngmax = allrand_args
    # same feature sizes, different number of input steps per stream
    nin = 128
    steps = [32, 64]
    nout = 256
    batch_size = 16
    NervanaObject.be.bsz = batch_size
    be = NervanaObject.be

    init_unif = Uniform(low=w_rng[0], high=w_rng[1])

    layers = []
    for _unused in (0, 1):
        layers.append(Sequential(Affine(nout=nout, init=init_unif)))

    inputs = []
    for step in steps:
        host_input = dtypeu(np.random.random((nin, batch_size * step)))
        inputs.append(be.array(host_input))

    merge = MergeMultistream(layers, merge="recurrent")
    assert len(inputs) == len(layers)
    merge.configure(inputs)
    merge.allocate()

    merge.allocate_deltas(deltas_buffer)
    deltas_buffer.allocate_buffers()
    merge.set_deltas(deltas_buffer)

    out = merge.fprop(inputs).get()

    # reference forward pass: W . x per stream, joined along axis 1
    sublayers = [stream.layers[0] for stream in layers]
    weights = [sub.W.get() for sub in sublayers]
    expected_parts = []
    for w, inp in zip(weights, inputs):
        expected_parts.append(np.dot(w, inp.get()))
    out_exp = np.concatenate(expected_parts, axis=1)

    assert allclose_with_out(out, out_exp, atol=1e-3)

    # reference backward pass: dW = err . x^T per stream
    err_lst = []
    for step in steps:
        err_lst.append(dtypeu(np.random.random((nout, batch_size * step))))
    err_concat = be.array(np.concatenate(err_lst, axis=1))
    merge.bprop(err_concat)

    dW_exp_lst = []
    for err, inp in zip(err_lst, inputs):
        dW_exp_lst.append(np.dot(err, inp.get().T))

    for sub, dW_exp in zip(sublayers, dW_exp_lst):
        assert allclose_with_out(sub.dW.get(), dW_exp)
Example #9
0
    def __init__(self, layers, dataset=None, weights_only=False, name="model", optimizer=None):
        """
        Set up the bookkeeping attributes and build or load the layer container.

        Arguments:
            layers: a layer container (Sequential, Tree or SingleOutputTree), a
                    raw list of layers (wrapped in a Sequential), or a
                    ModelDescription / dict handed to deserialize
            dataset: forwarded to deserialize when loading from a description
            weights_only (bool): stored on the instance; its negation is passed
                                 to deserialize as the load-states flag
            name (str): forwarded to the parent constructor
            optimizer: stored on the instance unchanged
        """
        super(Model, self).__init__(name)

        # progress counters and status flags
        self.epoch_index = 0
        self.nbatches = 0
        self.ndata = 0
        self.finished = False
        self.initialized = False

        self.cost = None
        self.optimizer = optimizer
        self.weights_only = weights_only
        self.params = None  # should be able to remove
        self.states = None  # should be able to remove

        layers_type = type(layers)
        if layers_type is ModelDescription or layers_type is dict:
            # serialized model description (dataset could be None here)
            self.deserialize(layers, dataset, not self.weights_only)
        elif layers_type in (Sequential, Tree, SingleOutputTree):
            # already a container: use it directly
            self.layers = layers
        else:
            # raw list of layers: wrap it in a Sequential container
            self.layers = Sequential(layers)
        self.layers_to_optimize = self.layers.layers_to_optimize
Example #10
0
File: model.py Project: maony/neon
    def __init__(self, layers, dataset=None, inference=False, name="model", optimizer=None):
        """
        Set up the bookkeeping attributes and build or load the layer container.

        Arguments:
            layers: a layer container (Sequential, Tree or SingleOutputTree), a
                    raw list of layers (wrapped in a Sequential), or a
                    ModelDescription / dict handed to deserialize
            dataset: required (asserted not None) when loading from a
                     description; forwarded to deserialize
            inference (bool): stored on the instance unchanged
            name (str): forwarded to the parent constructor
            optimizer: stored on the instance unchanged
        """
        super(Model, self).__init__(name)

        # progress counters and status flags
        self.epoch_index = 0
        self.nbatches = 0
        self.ndata = 0
        self.finished = False
        self.initialized = False

        self.cost = None
        self.optimizer = optimizer
        self.inference = inference
        self.params = None  # should be able to remove
        self.states = None  # should be able to remove

        layers_type = type(layers)
        if layers_type is ModelDescription or layers_type is dict:
            # serialized model description
            assert dataset is not None, 'Need data set to initialize model from serialized file'
            self.deserialize(layers, dataset, weights_only=False)
        elif layers_type in (Sequential, Tree, SingleOutputTree):
            # already a container: use it directly
            self.layers = layers
        else:
            # raw list of layers: wrap it in a Sequential container
            self.layers = Sequential(layers)
        self.layers_to_optimize = self.layers.layers_to_optimize
def test_branch_model():
    """
    Compare a Sequential model containing an inception (branch/merge) module
    against a hand-assembled reference that runs the trunk, each branch and the
    top layers separately, for both the forward outputs and the deltas that
    reach the root of the branches.

    Relies on names defined elsewhere in the module (main_branch, top_branch,
    inception, inception_bare, batch_size).
    """
    NervanaObject.be = gen_backend("gpu", batch_size=64)
    be = NervanaObject.be
    main1 = main_branch()
    i1 = inception([(32,), (32, 32), ('max', 16)])
    top = top_branch()
    neon_layer = Sequential(main1 + i1 + top)

    inshape = (3, 224, 224)
    insize = np.prod(inshape)
    inpa = np.random.random((insize, batch_size))
    neon_layer.configure(inshape)
    inp = neon_layer.be.array(inpa)

    neon_layer.allocate()
    print(neon_layer.nested_str())
    neon_layer.layers[0].prev_layer = True
    neon_layer.allocate_deltas()
    neon_layer.layers[0].set_deltas([be.iobuf(inshape)])
    neon_out = neon_layer.fprop(inp).get()

    # Now make the reference pathways:
    main_trunk2 = Sequential(main_branch())
    main_trunk2.configure(inshape)
    main2 = main_trunk2.layers
    main2[0].prev_layer = True
    main2[0].set_deltas([be.iobuf(inshape)])
    (b1, b2, b3) = inception_bare(i1, [(32,), (32, 32), ('max', 16)])

    # Configure every branch behind the trunk so shapes propagate through it
    for bb in (b1, b2, b3):
        oshape = inshape
        for ll in main2 + bb:
            oshape = ll.configure(oshape)

    # NOTE(review): assumes the first 8 layers of neon_layer are the main trunk
    # and layer 8 is the merge layer -- confirm against main_branch()
    main1_trunk = neon_layer.layers[:8]
    for ll, lo in zip(main2, main1_trunk):
        if ll.has_params:
            ll.set_params(lo.W.get())
        ll.allocate()
        ll.set_deltas([be.iobuf(ll.in_shape)])
    for bb in (b1, b2, b3):
        for ll in bb:
            ll.allocate()
            ll.set_deltas([be.iobuf(ll.in_shape)])

    # Create the combined output buffer
    merge_output = be.empty_like(neon_layer.layers[8].outputs)

    x = inp
    for ll in main2:
        x = ll.fprop(x)

    # Run each branch on the trunk output and stack the results along axis 0
    start = 0
    for bb in (b1, b2, b3):
        xb = x
        for ll in bb:
            xb = ll.fprop(xb)
        end = start + xb.shape[0]
        merge_output[start:end] = xb
        start = end

    x = merge_output

    top_trunk = Sequential(top).layers
    for ll in top_trunk:
        x = ll.fprop(x)

    neon_out_ref = x.get()
    difference = neon_out_ref - neon_out
    # print before asserting so the magnitude is visible when the check fails
    print(np.max(np.abs(difference)))
    assert np.max(np.abs(difference)) < 1e-7

    print("Beginning Back prop")
    erra = np.random.random(neon_out.shape)
    err = be.array(erra)
    for ll in reversed(neon_layer.layers[8:]):
        err = ll.bprop(err)

    neon_deltas = err.get()
    # Back-propagate each branch's slice of the error through the reference branch
    for bb, errb in zip((b1, b2, b3), neon_layer.layers[8].error_views):
        for ll in reversed(bb):
            errb = ll.bprop(errb)

    # Now sum up the deltas at the root of the branch layer and compare
    ref_deltas = be.zeros_like(b1[0].deltas)
    ref_deltas[:] = b1[0].deltas + b2[0].deltas + b3[0].deltas

    neon_ref_deltas = ref_deltas.get()

    difference = neon_deltas - neon_ref_deltas
    print(np.max(np.abs(difference)))
    assert np.max(np.abs(difference)) < 1e-8
Example #12
0
def insert_branch_layer(network, b):
    """
    Return a Sequential container whose layers are ``network`` followed by ``b``.
    """
    chained = (network, b)
    return Sequential(layers=chained)
Example #13
0
class Model(NervanaObject):
    """
    Basic model class which stores a list of layers describing the model. Can train the layer
    weights on a dataset, evaluate on a test set and serialize the model.
    Additional functionality can be added to fit through callback functions.

    Arguments:
        layers: layer container, or a list of layers (that will be containerized),
                or a serialized model description
        dataset (iterator): Data set (ignored, will be removed)
        weights_only (bool): set to True if you do not want to recreate layers
                             and states during deserialization from a serialized model
                             description.  Defaults to False.
        name (str): Model name.  Defaults to "model"
        optimizer (Optimizer): Optimizer object which defines the learning rule
                               for updating model parameters (ie DescentMomentum, AdaDelta)
    """

    def __init__(self, layers, dataset=None, weights_only=False, name="model", optimizer=None):
        """
        Set up the bookkeeping attributes and build or load the layer container.
        See the class docstring for the meaning of the arguments.
        """
        super(Model, self).__init__(name)
        self.optimizer = optimizer
        self.params = None  # should be able to remove
        self.states = None  # should be able to remove
        self.epoch_index = 0
        self.finished = False
        self.initialized = False
        self.cost = None
        self.nbatches = 0
        self.ndata = 0

        if dataset is not None:
            logger.warning('dataset is a deprecated argument and will be ignored')

        if type(layers) in (ModelDescription, dict):
            # load up the model from a serialized file (dataset could be None here)
            self.deserialize(layers, load_states=(not weights_only))
        elif type(layers) is str:
            # a str is treated as the path of a serialized parameter file
            self.load_params(layers, load_states=(not weights_only))
        else:
            # Wrap the list of layers in a Sequential container if a raw list of layers
            if type(layers) in (Sequential, Tree, SingleOutputTree):
                self.layers = layers
            else:
                self.layers = Sequential(layers)
        self.layers.propagate_parallelism("Data")

    @property
    def layers_to_optimize(self):
        """
        The ``layers_to_optimize`` attribute of the layer container.
        """
        return self.layers.layers_to_optimize

    def set_shortcut(self):
        """
        Enable the bprop shortcut on the final activation when the cost function
        is CrossEntropyBinary and the last layer is an Activation with a Logistic
        transform. Does nothing if self.cost is not set or the check fails.
        """
        # infer whether bprop shortcut can be used on final activation
        # self.cost should be set to run this otherwise do nothing
        lastlayer = self.layers[-1]
        try:
            if self.cost.costfunc.__class__ is CrossEntropyBinary:
                if (lastlayer.__class__ is Activation and
                   lastlayer.transform.__class__ is Logistic):
                    lastlayer.transform.set_shortcut(True)
        except Exception:
            # if any attributes are not set or any other exception
            # is thrown leave transform.shortcut as is (do nothing);
            # KeyboardInterrupt / SystemExit are deliberately not swallowed
            pass

    def initialize(self, dataset, cost=None):
        """
        Configure the layers from the input description, initialize the cost
        (when given) with the configured output and allocate layer buffers and
        deltas. Calls after the first successful one return immediately.

        Arguments:
            dataset: object passed to the layer container's ``configure``
            cost (Cost, optional): if given, initialized with the value returned
                                   by ``configure`` and stored as self.cost
        """
        if self.initialized:
            return

        # Propagate shapes through the layers to configure
        prev_input = dataset
        prev_input = self.layers.configure(prev_input)

        if cost is not None:
            cost.initialize(prev_input)
            self.cost = cost

        # Now allocate space
        self.layers.allocate()
        self.layers.allocate_deltas()
        self.initialized = True

    def __str__(self):
        """
        String representation of model's layers
        """
        config_string = "Network Layers:\n" + self.layers.nested_str()
        return config_string

    def fit(self, dataset, cost, optimizer, num_epochs, callbacks):
        """
        Trains the model parameters on a dataset by minimizing the cost function through
        gradient descent and updates the layer weights according to a learning rule
        defined in optimizer.

        Arguments:
            dataset (iterator): An iterable of minibatches where each
                element is a (x, y) tuple where x is the input data and y are the labels.
                x is of dimension (feature_size, batch_size)
                y is of dimension (label_size, batch_size)
                Length of the iterator is num_batches which is num_data / batch_size
            cost (Cost): Defines the function which the model is minimizing based
                on the output of the last layer and the input labels
            optimizer (Optimizer): Defines the learning rule for updating the model parameters
            num_epochs: Number of times to iterate over the dataset.
            callbacks (Callbacks): Defines callbacks to run at the beginning and end of
                training, of each epoch and of each mini-batch.
        """
        self.nbatches = dataset.nbatches
        self.ndata = dataset.ndata
        # self.set_shortcut()  # infer if bprop shortcut can be used
        self.total_cost = self.be.empty((1, 1), dtype=np.float32)
        self.optimizer = optimizer
        self.initialize(dataset, cost)

        callbacks.on_train_begin(num_epochs)
        # self.finished lets something outside this loop stop training early
        while self.epoch_index < num_epochs and not self.finished:
            self.nbatches = dataset.nbatches

            callbacks.on_epoch_begin(self.epoch_index)

            self._epoch_fit(dataset, callbacks)

            callbacks.on_epoch_end(self.epoch_index)

            self.epoch_index += 1

        callbacks.on_train_end()

    def _epoch_fit(self, dataset, callbacks):
        """
        Helper function for fit which performs training on a dataset for one epoch.

        Arguments:
            dataset (iterable): Dataset iterator to perform fit on
            callbacks (Callbacks): notified at the beginning and end of each minibatch
        """
        epoch = self.epoch_index
        self.total_cost[:] = 0
        # iterate through minibatches of the dataset
        for mb_idx, (x, t) in enumerate(dataset):
            callbacks.on_minibatch_begin(epoch, mb_idx)
            self.be.begin(Block.minibatch, mb_idx)

            x = self.fprop(x)

            self.total_cost[:] = self.total_cost + self.cost.get_cost(x, t)

            # deltas back propagate through layers
            # for every layer in reverse except the 0th one
            delta = self.cost.get_errors(x, t)

            self.bprop(delta)

            self.optimizer.optimize(self.layers_to_optimize, epoch=epoch)

            self.be.end(Block.minibatch, mb_idx)
            callbacks.on_minibatch_end(epoch, mb_idx)

        # now we divide total cost by the number of batches,
        # so it was never total cost, but sum of averages
        # across all the minibatches we trained on
        self.total_cost[:] = self.total_cost / dataset.nbatches

    def fprop(self, x, inference=False):
        """
        Forward propagates a minibatch x through the model.

        Arguments:
            x (Tensor): Input minibatch data
            inference (bool): Flag for performing training or inference
                Only affects batch norm and dropout layers.

        Returns:
            Tensor: the output of the final layer in the model
        """
        return self.layers.fprop(x, inference)

    def bprop(self, delta):
        """
        Back propagates the error of a minibatch through the model.

        Arguments:
            delta (Tensor): Derivative of cost with respect to the last layer's output

        Returns:
            whatever the layer container's ``bprop`` returns
        """
        return self.layers.bprop(delta)

    def eval(self, dataset, metric):
        """
        Evaluates a model on a dataset according to an input metric.

        Arguments:
            dataset (iterable): dataset to evaluate on.
            metric (Cost): what function to evaluate dataset on.

        Returns:
            numpy array: one value per entry of ``metric.metric_names``, averaged
            over the number of processed items
        """
        self.initialize(dataset)
        running_error = np.zeros((len(metric.metric_names)), dtype=np.float32)
        nprocessed = 0
        dataset.reset()
        for x, t in dataset:
            x = self.fprop(x, inference=True)

            # This logic is for handling partial batch sizes at the end of the dataset.
            # Floor division keeps nsteps an integer (needed by slice below) even
            # when true division is in effect.
            nsteps = x.shape[1] // self.be.bsz if not isinstance(x, list) else \
                x[0].shape[1] // self.be.bsz

            bsz = min(dataset.ndata - nprocessed, self.be.bsz)
            running_error += metric(x, t, calcrange=slice(0, nsteps * bsz)) * nsteps * bsz
            nprocessed += bsz * nsteps
        running_error /= nprocessed
        return running_error

    def get_outputs(self, dataset):
        """
        Get the activation outputs of the final model layer for the dataset

        Arguments:
            dataset (iterable): Dataset iterator to perform fit on

        Returns:
            Host numpy array: the output of the final layer for the entire Dataset
        """
        self.initialize(dataset)
        dataset.reset()  # Move "pointer" back to beginning of dataset
        n = dataset.nbatches
        x = self.layers.layers[-1].outputs
        assert not isinstance(x, list), "Can not get_outputs with Branch terminal"
        Ypred = None
        for idx, (x, t) in enumerate(dataset):
            x = self.fprop(x, inference=True)
            if Ypred is None:
                # size the host buffer from the first batch's output shape
                (dim0, dim1) = x.shape
                Ypred = np.empty((n * dim1, dim0), dtype=x.dtype)
                # floor division keeps nsteps an integer for the reshape below
                nsteps = dim1 // self.be.bsz
            cur_batch = slice(idx * dim1, (idx + 1) * dim1)
            Ypred[cur_batch] = x.get().T

        # Handle the recurrent case.
        if nsteps != 1:
            b, s = (self.be.bsz, nsteps)
            Ypred = Ypred.reshape((n, s, b, -1)).transpose(0, 2, 1, 3).copy().reshape(n*b, s, -1)

        return Ypred[:dataset.ndata]

    def get_description(self, get_weights=False, keep_states=False):
        """
        Gets a description of the model required to reconstruct the model with
        no weights like from a yaml file.

        Arguments:
            get_weights (bool): forwarded to the layer container's get_description
            keep_states (bool): forwarded to the layer container's get_description

        Returns:
            dict: Description of each component of the model.
        """
        pdict = dict()
        pdict['neon_version'] = __neon_version__
        compat_mode = self.be.compat_mode if self.be.compat_mode is not None else 'neon'
        pdict['backend'] = {'type': self.be.__class__.__name__,
                            'compat_mode': compat_mode,
                            'rng_seed': self.be.rng_seed,
                            'rng_state': self.be.rng_get_state()}

        if self.cost:
            pdict['cost'] = self.cost.get_description()
        if self.optimizer:
            pdict['optimizer'] = self.optimizer.get_description()

        pdict['model'] = self.layers.get_description(get_weights=get_weights,
                                                     keep_states=keep_states)
        return pdict

    def save_params(self, param_path, keep_states=True):
        """
        Serializes and saves model parameters to the path specified.

        Arguments:
            param_path (str): File to write serialized parameter dict to.
            keep_states (bool): Whether to save optimizer states too.
                                Defaults to True.
        """
        self.serialize(keep_states=keep_states, fn=param_path)

    def load_params(self, param_path, load_states=True):
        """
        Loads the model parameters (per layer weights, epochs run, optimizer
        states) saved in param_path from serialize().

        Arguments:
            param_path (str): File containing serialized python dict with layer
                              weights and states.
            load_states (bool):  if False, then only the weights will be loaded
                                 into a model in which the layers have already been
                                 created, otherwise will (re)create the layers from
                                 the serialized parameters and set the learning
                                 states as well
        """
        self.deserialize(load_obj(param_path), load_states=load_states)
        logger.info('Model weights loaded from %s', param_path)

    def load_weights(self, weight_path):
        """
        .. deprecated:: 1.1.4
           Use :func:`load_params` instead
        """
        logger.warning('Calling deprecated load_weights function.  Use '
                       'load_params instead')
        self.load_params(weight_path)

    def deserialize(self, model_dict, data=None, load_states=True):
        """
        Loads per layer (weights, states) and other model parameters from the
        dictionary passed.

        Arguments:
            model_dict (dict): dictionary describing the model including layers,
                               cost, optimizers, backend settings, etc.
                               generated by the serialize function
            data (iterator):   Data set (ignored, will be removed)

            load_states (bool):  if False, then only the weights will be loaded
                                 into a model in which the layers have already been
                                 created, otherwise will (re)create the layers from
                                 the serialized parameters and set the learning
                                 states as well
        """

        if data is not None:
            logger.warning('data is a deprecated argument and will be ignored')

        if 'epoch_index' in model_dict:
            self.epoch_index = model_dict['epoch_index']
        if 'model' not in model_dict:
            # legacy format: a flat list of per-layer parameter dicts that is
            # matched positionally against the layers to optimize
            logger.error('Using old model serialization format. '
                         'Serialized the model into new format')

            param_layers = [l for l in self.layers_to_optimize]
            param_dict_list = model_dict['layer_params_states']
            for l, ps in zip(param_layers, param_dict_list):
                l.set_params(ps)
                if 'states' in ps and load_states:
                    l.set_states(ps)
            return

        if 'backend' in model_dict:
            if 'compat_mode' in model_dict['backend']:
                self.be.compat_mode = model_dict['backend']['compat_mode']
        else:
            # guarantee the key exists for the rng_state lookup below
            model_dict['backend'] = {}

        typ = model_dict['model']['type']
        main_container = load_class(typ)

        # only (re)create the layer container when none has been set yet
        if not hasattr(self, 'layers'):
            self.layers = main_container.gen_class(model_dict['model']['config'])

        self.layers.load_weights(model_dict['model'], load_states)

        if load_states and 'rng_state' in model_dict['backend']:
            try:
                self.be.rng_set_state(model_dict['backend']['rng_state'])
            except ValueError as e:
                # could come about when switching backend types (ex GPU to CPU)
                logger.warning("Problems restoring existing RNG state: %s", str(e))

    # serialize tells how to write out the parameters we've learned so
    # far and associate them with layers. it can ignore layers with no
    # learned parameters. the model stores states to pass to the
    # optimizers.  if we're saving the model out for inference, we
    # don't need to remember states.
    def serialize(self, fn=None, keep_states=True):
        """
        Creates a dictionary storing the layer parameters and epochs complete.

        Arguments:
            fn (str): file to save pkl formatted model dictionary
            keep_states (bool): Whether to save optimizer states.

        Returns:
            dict: Model data including layer parameters and epochs complete,
            or None when ``fn`` is given (the dictionary is saved to ``fn`` instead).
        """

        # get the model dict with the weights
        pdict = self.get_description(get_weights=True, keep_states=keep_states)
        pdict['epoch_index'] = self.epoch_index + 1
        if self.initialized:
            pdict['train_input_shape'] = self.layers.in_shape
        if fn is not None:
            save_obj(pdict, fn)
            return
        return pdict

    def set_batch_size(self, N):
        """
        Set the actual minibatch size, so even though the buffers are allocated considering
        excessive padding, the processing for some layers may be shortened.
        Currently most of the neon layers don't use that to control the processing. The
        interface is here only for when someone wants to set that information and experiment.
        """
        return self.layers.set_batch_size(N)

    def set_seq_len(self, S):
        """
        Set the actual minibatch sequence length, so even though the buffers are allocated
        considering excessive padding, the processing for some layers may be shortened.
        Currently most of the neon layers don't use that to control the processing. The
        interface is here only for when someone wants to set that information and experiment.
        """
        return self.layers.set_seq_len(S)

    def benchmark(self, dataset, inference=False, cost=None, optimizer=None,
                  niterations=20, nskip=2):
        """
        Measure runtime for computing fprop and bprop separately, as well as
        full minibatch run times. For the inference case, only fprop is timed.

        Arguments:
              dataset (iterable): Dataset iterator to perform fit on

              inference (bool): when True only fprop is run and timed; otherwise
                                cost and optimizer are required

              cost (Cost): Defines the function which the model is minimizing based
                            on the output of the last layer and the input labels

              optimizer (Optimizer): learning rule applied after each bprop when
                                     inference is False

             niterations (optional, int): Number of minibatches to average over

             nskip (optional, int): number of iterations at the beginning to skip
                                    when calculating the runtime statistics

        Returns:
            dictionary with fprop, bprop run times
        """
        # initialize model
        if inference is False:
            assert cost is not None and optimizer is not None, (
                "Need cost and optimizer to benchmark bprop and update")
        self.cost = cost
        self.initialize(dataset, cost)
        self.optimizer = optimizer
        self.total_cost = self.be.empty((1, 1))
        self.total_cost[:] = 0

        # iterate through minibatches of the dataset
        times = OrderedDict()
        time_keys = ['fprop'] if inference else ['fprop', 'bprop', 'iteration']
        for ky in time_keys:
            times[ky] = np.full(niterations + nskip, -1.0)
        count = 0

        fprop_start = self.be.init_mark()
        fprop_end = self.be.init_mark()
        bprop_end = self.be.init_mark()

        # keep cycling over the dataset until enough minibatches were timed
        while count < niterations + nskip:
            dataset.reset()
            for mb_idx, (x, t) in enumerate(dataset):

                self.be.record_mark(fprop_start)  # mark start of fprop

                x = self.fprop(x)

                if inference is False:
                    self.total_cost[:] = self.total_cost + self.cost.get_cost(x, t)

                self.be.record_mark(fprop_end)  # mark end of fprop and start of bprop

                if inference is False:
                    delta = self.cost.get_errors(x, t)
                    self.bprop(delta)
                    self.optimizer.optimize(self.layers_to_optimize, epoch=0)

                    self.be.record_mark(bprop_end)  # mark end of bprop
                    self.be.synchronize_mark(bprop_end)
                else:
                    self.be.synchronize_mark(fprop_end)

                times['fprop'][count] = self.be.get_time(fprop_start, fprop_end)
                if inference is False:
                    times['bprop'][count] = self.be.get_time(fprop_end, bprop_end)
                    times['iteration'][count] = times['fprop'][count] + times['bprop'][count]

                count += 1
                if count >= niterations + nskip:
                    break

        # print results
        header = ('Func', 'Mean', 'Median', 'Min', 'Max', 'Units')
        # the lower-cased column titles double as the numpy function names used below
        stats = tuple(stat.lower() for stat in header[1:-1])

        fmt_titles = '| {:^11} '*len(header) + '|'
        fmt_nums = '| {func:<11} ' + '|  {%s:<10.5g} '*len(stats) % (stats) + '| {units:^11} |'

        head_str = fmt_titles.format(*header)
        sep = '-'*len(head_str)
        head_str = sep + '\n' + head_str + '\n' + sep
        print(head_str)
        out_stats = {}
        for step in times:
            # drop the first nskip warm-up iterations from the statistics
            timesu = np.array(times[step][nskip:])  # in ms
            out_stats[step] = {}
            for stat in stats:
                out_stats[step][stat] = getattr(np, stat)(timesu)
            print(fmt_nums.format(units='msec', func=step, **out_stats[step]))
        print(sep)
        return out_stats
Example #14
0
def test_model_serialize(backend_default, data):
    """Check that a trained model survives a save_params / Model(path) round trip.

    A two-stream MNIST network (a conv path and an affine path merged with
    ``MergeMultistream``) is trained for a few minibatches, serialized with
    ``keep_states=True``, rebuilt from the pickle, and the rebuilt model's
    inference outputs, layer states and layer params are compared against the
    values captured before saving.

    Arguments:
        backend_default: presumably a fixture that sets up the backend; it is
                         not referenced in the body.
        data: path handed to ``load_mnist``.

    The temporary pickle is written to the current working directory and is
    removed in a ``finally`` clause so that a failing check does not leave the
    file behind.
    """
    (X_train, y_train), (X_test, y_test), nclass = load_mnist(path=data)

    # the same images feed both streams of the merged model
    train_set = ArrayIterator([X_train, X_train],
                              y_train,
                              nclass=nclass,
                              lshape=(1, 28, 28))

    init_norm = Gaussian(loc=0.0, scale=0.01)

    # initialize model
    path1 = Sequential([
        Conv((5, 5, 16),
             init=init_norm,
             bias=Constant(0),
             activation=Rectlin()),
        Pooling(2),
        Affine(nout=20, init=init_norm, bias=init_norm, activation=Rectlin())
    ])
    path2 = Sequential([
        Affine(nout=100,
               init=init_norm,
               bias=Constant(0),
               activation=Rectlin()),
        Dropout(keep=0.5),
        Affine(nout=20, init=init_norm, bias=init_norm, activation=Rectlin())
    ])
    layers = [
        MergeMultistream(layers=[path1, path2], merge="stack"),
        Affine(nout=20, init=init_norm, batch_norm=True, activation=Rectlin()),
        Affine(nout=10, init=init_norm, activation=Logistic(shortcut=True))
    ]

    tmp_save = 'test_model_serialize_tmp_save.pickle'
    mlp = Model(layers=layers)
    mlp.optimizer = GradientDescentMomentum(learning_rate=0.1,
                                            momentum_coef=0.9)
    mlp.cost = GeneralizedCost(costfunc=CrossEntropyBinary())
    mlp.initialize(train_set, cost=mlp.cost)
    n_test = 3
    num_epochs = 3
    # Train for num_epochs; the break fires once i exceeds n_test, so up to
    # n_test + 2 minibatches are processed per epoch
    for epoch in range(num_epochs):
        for i, (x, t) in enumerate(train_set):
            x = mlp.fprop(x)
            delta = mlp.cost.get_errors(x, t)
            mlp.bprop(delta)
            mlp.optimizer.optimize(mlp.layers_to_optimize, epoch=epoch)
            if i > n_test:
                break

    # Get expected outputs of the first batches and states of all layers
    outputs_exp = []
    pdicts_exp = [l.get_params_serialize() for l in mlp.layers_to_optimize]
    for i, (x, t) in enumerate(train_set):
        outputs_exp.append(mlp.fprop(x, inference=True))
        if i > n_test:
            break

    try:
        # Serialize model
        mlp.save_params(tmp_save, keep_states=True)

        # Load model
        mlp = Model(tmp_save)

        mlp.initialize(train_set)
        outputs = []
        pdicts = [l.get_params_serialize() for l in mlp.layers_to_optimize]
        for i, (x, t) in enumerate(train_set):
            outputs.append(mlp.fprop(x, inference=True))
            if i > n_test:
                break

        # Check outputs, states, and params are the same
        for output, output_exp in zip(outputs, outputs_exp):
            assert np.allclose(output.get(), output_exp.get())

        for pd, pd_exp in zip(pdicts, pdicts_exp):
            for s, s_e in zip(pd['states'], pd_exp['states']):
                if isinstance(s, list):  # this is the batch norm case
                    for _s, _s_e in zip(s, s_e):
                        assert np.allclose(_s, _s_e)
                else:
                    assert np.allclose(s, s_e)
            for p, p_e in zip(pd['params'], pd_exp['params']):
                assert type(p) == type(p_e)
                if isinstance(p, list):  # this is the batch norm case
                    for _p, _p_e in zip(p, p_e):
                        assert np.allclose(_p, _p_e)
                elif isinstance(p, np.ndarray):
                    assert np.allclose(p, p_e)
                else:
                    assert p == p_e
    finally:
        # always clean up the scratch pickle, even when a check above failed;
        # the existence guard avoids masking an error raised before the file
        # was written
        if os.path.exists(tmp_save):
            os.remove(tmp_save)
Example #15
0
def mergesum_test_config(modfunc, use_stride=1):
    """Compare a module built by ``modfunc`` against two separately built paths.

    A GPU backend is created, a Conv layer followed by ``modfunc(16, use_stride)``
    is run forward, and the result is compared with ``relu(path1 + path2)`` where
    the two paths come from ``module_factory_copy``.  Back-propagation through the
    last layers of the combined network is then compared with the sum of the
    errors propagated through the two reference paths.

    Arguments:
        modfunc: callable invoked as ``modfunc(16, use_stride)``; its result is
                 concatenated to a list, so it presumably returns a list of layers.
        use_stride: forwarded unchanged to ``modfunc`` and ``module_factory_copy``.

    NOTE(review): this function only prints the maximum absolute differences;
    the one assertion in it is commented out, so it cannot fail on a mismatch.
    """
    NervanaObject.be = gen_backend("gpu", batch_size=64)
    be = NervanaObject.be
    l1 = Conv(**conv_params(3, 16))
    neon_layer = modfunc(16, use_stride)
    inshape = (16, 32, 32)
    insize = np.prod(inshape)
    # batch_size is not defined in this function; it is read from an enclosing scope
    inpa = np.random.random((insize, batch_size))

    neon_seq = Sequential([l1] + neon_layer)
    neon_seq.configure(inshape)
    inp = be.array(inpa)

    neon_seq.allocate()
    # print neon_layer.nested_str()
    # neon_layer.layers[0].prev_layer = True
    neon_seq.allocate_deltas()
    neon_out = neon_seq.fprop(inp).get()

    # Now make the reference pathways:
    p1, p2 = module_factory_copy(neon_layer, modfunc, 16, use_stride)
    l11 = Conv(**conv_params(3, 16))
    l12 = Conv(**conv_params(3, 16))

    # copy the parameters of the leading conv into both reference convs
    for ll in (l11, l12):
        for lcopy, lref in zip(ll, l1):
            if lcopy.has_params:
                lcopy.set_params(lref.get_params_serialize())

    path1 = Sequential([l11] + p1)
    path2 = Sequential([l12] + p2)
    for ll in (path1, path2):
        ll.configure(inshape)
        ll.allocate()
        ll.allocate_deltas()

    o1 = path1.fprop(inp).get()
    o2 = path2.fprop(inp).get()
    # Now relu it
    neon_out_ref = np.maximum(o1+o2, 0)
    difference = neon_out_ref - neon_out

    print np.max(np.abs(difference))
    # need to have bsum false for this test to be valid
    # assert np.max(np.abs(difference)) < 1e-7
    print "Fprop matching"

    print "Beginning Back prop"
    erra = np.random.random(neon_out.shape)
    err = be.array(erra)
    # NOTE(review): indices 4 and 3 are hard-coded; presumably the last two
    # layers of neon_seq for the modules this is used with -- TODO confirm
    ebr = neon_seq.layers[4].bprop(err)
    print "Orig Error", ebr.get()[0, :20]
    ebr = neon_seq.layers[3].bprop(ebr)

    trunk_neon = ebr.get()

    # reference error: mask the incoming error where the relu output is positive
    err = be.array(erra)
    err[:] = be.greater(be.array(neon_out_ref), 0) * err

    eb1 = err
    for l in reversed(path1.layers[3:]):
        eb1 = l.bprop(eb1)
    t1 = eb1.get()

    # rebuild the masked error from scratch before walking the second path
    err = be.array(erra)
    err[:] = be.greater(be.array(neon_out_ref), 0) * err
    eb2 = err
    for l in reversed(path2.layers[3:]):
        eb2 = l.bprop(eb2)
    t2 = eb2.get()

    # the combined network's error should equal the sum over both paths
    print np.max(np.abs(trunk_neon - (t1 + t2)))
Example #16
0
# Script: builds a two-stream MNIST classifier and then an image-captioning
# network.
# NOTE(review): from the "setup backend" line onward, train_set, layers and cost
# are all rebound and hidden_size is used without being defined in this part of
# the script -- this looks like two separate examples joined together; confirm
# against the original sources.

# parse the command line arguments
parser = NeonArgparser(__doc__)
args = parser.parse_args()

# hyperparameters
num_epochs = args.epochs

# the same images are supplied twice, once per input stream of the merged model
(X_train, y_train), (X_test, y_test), nclass = load_mnist(path=args.data_dir)
train_set = ArrayIterator([X_train, X_train], y_train, nclass=nclass, lshape=(1, 28, 28))
valid_set = ArrayIterator([X_test, X_test], y_test, nclass=nclass, lshape=(1, 28, 28))

# weight initialization
init_norm = Gaussian(loc=0.0, scale=0.01)

# initialize model
path1 = Sequential(layers=[Affine(nout=100, init=init_norm, activation=Rectlin()),
                           Affine(nout=100, init=init_norm, activation=Rectlin())])

path2 = Sequential(layers=[Affine(nout=100, init=init_norm, activation=Rectlin()),
                           Affine(nout=100, init=init_norm, activation=Rectlin())])

# the two paths are merged with merge="stack" ahead of the 10-unit output layer
layers = [MergeMultistream(layers=[path1, path2], merge="stack"),
          Affine(nout=10, init=init_norm, activation=Logistic(shortcut=True))]

model = Model(layers=layers)
cost = GeneralizedCost(costfunc=CrossEntropyBinary())

# fit and validate
optimizer = GradientDescentMomentum(learning_rate=0.1, momentum_coef=0.9)

# configure callbacks
callbacks = Callbacks(model, eval_set=valid_set, **args.callback_args)
# setup backend
be = gen_backend(**extract_valid_args(args, gen_backend))

# download dataset
data_path = load_flickr8k(path=args.data_dir)  # Other setnames are flickr30k and coco

# load data
train_set = ImageCaption(path=data_path, max_images=-1)

# weight initialization
init = Uniform(low=-0.08, high=0.08)
# bias of the output layer is initialized from the dataset's bias_init values
init2 = Constant(val=train_set.be.array(train_set.bias_init))

# model initialization
image_path = Sequential([Affine(hidden_size, init, bias=Constant(val=0.0))])
sent_path = Sequential([Affine(hidden_size, init, linear_name='sent')])

layers = [
    MergeMultistream(layers=[image_path, sent_path], merge="recurrent"),
    Dropout(keep=0.5),
    LSTM(hidden_size, init, activation=Logistic(), gate_activation=Tanh(), reset_cells=True),
    Affine(train_set.vocab_size, init, bias=init2, activation=Softmax())
]

cost = GeneralizedCostMask(costfunc=CrossEntropyMulti(usebits=True))

# configure callbacks
# NOTE(review): the path starts with "~"; nothing here expands it, so whoever
# consumes save_path must call os.path.expanduser -- confirm
checkpoint_model_path = "~/image_caption2.pickle"
# only fall back to the default checkpoint path when none was supplied
if args.callback_args['save_path'] is None:
    args.callback_args['save_path'] = checkpoint_model_path
Example #18
0
def test_branch_model_fork():
    """Check a two-output ``Tree`` network against manually wired reference layers.

    The tree shares ``main_branch()`` and a ``BranchNode`` between two paths: the
    first continues through an inception block and ``top_branch()``, the second
    goes straight from the branch node into another ``top_branch()``.  Forward
    outputs of both paths and the deltas at two points of the backward pass are
    compared with values computed by running equivalent standalone layers.

    Creates its own GPU backend with batch size 64.
    """
    from neon.layers import BranchNode, Tree

    NervanaObject.be = gen_backend("gpu", batch_size=64)
    be = NervanaObject.be
    bnode = BranchNode()
    i1 = inception([(32, ), (32, 32), ('max', 16)])
    top1 = top_branch()
    top2 = top_branch()
    # both paths contain the same bnode object, which is where the tree forks
    p1 = Sequential(main_branch() + [bnode, i1] + top1)
    p2 = [bnode] + top2

    # weight given to the second path; it reappears in the reference delta sum below
    alpha2 = 0.3
    neon_layer = Tree([p1, p2], alphas=[1.0, alpha2])

    inshape = (3, 224, 224)
    insize = np.prod(inshape)
    # batch_size is not defined in this function; it is read from an enclosing scope
    inpa = np.random.random((insize, batch_size))
    neon_layer.configure(inshape)
    inp = neon_layer.be.array(inpa)

    neon_layer.allocate()
    print neon_layer.nested_str()
    neon_layer.layers[0].layers[0].prev_layer = True
    neon_layer.allocate_deltas()
    neon_layer.layers[0].layers[0].set_deltas([be.iobuf(inshape)])
    neon_out_dev = neon_layer.fprop(inp)
    # one host array per tree output
    neon_out = [d.get() for d in neon_out_dev]

    # Now make the reference pathways:
    main_trunk2 = Sequential(main_branch())
    main_trunk2.configure(inshape)
    main2 = main_trunk2.layers
    main2[0].prev_layer = True
    main2[0].set_deltas([be.iobuf(inshape)])

    branch2 = Sequential(top_branch())
    lbranch2 = branch2.layers
    (b1, b2, b3) = inception_bare(i1, [(32, ), (32, 32), ('max', 16)])

    # configure every reference branch as if it followed the reference trunk
    for bb in (b1, b2, b3, lbranch2):
        oshape = inshape
        for ll in main2 + bb:
            oshape = ll.configure(oshape)

    # copy trunk weights from the tree into the reference trunk
    main1_trunk = neon_layer.layers[0].layers[:8]
    for ll, lo in zip(main2, main1_trunk):
        if ll.has_params:
            ll.set_params({'params': {'W': lo.W.get()}})
        ll.allocate()
        ll.set_deltas([be.iobuf(ll.in_shape)])

    # copy weights of the tree's second path (skipping its first layer) into branch2
    for ll, lo in zip(lbranch2, neon_layer.layers[1].layers[1:]):
        if ll.has_params:
            ll.set_params({'params': {'W': lo.W.get()}})

    for bb in (b1, b2, b3, lbranch2):
        for ll in bb:
            ll.allocate()
            ll.set_deltas([be.iobuf(ll.in_shape)])

    # Create the combined output buffer
    # NOTE(review): index 9 is presumably the inception/merge layer of the first
    # path -- TODO confirm against main_branch()
    merge_output = be.empty_like(neon_layer.layers[0].layers[9].outputs)

    x = inp
    for ll in main2:
        x = ll.fprop(x)
    main2_out = x

    # run each inception branch on the trunk output and stack results along axis 0
    start = 0
    for bb in (b1, b2, b3):
        xb = main2_out
        for ll in bb:
            xb = ll.fprop(xb)
        end = start + xb.shape[0]
        merge_output[start:end] = xb
        start = end

    x = merge_output

    top_trunk = Sequential(top1).layers
    for ll in top_trunk:
        x = ll.fprop(x)

    neon_out_ref = x.get()
    difference = neon_out_ref - neon_out[0]
    assert np.max(np.abs(difference)) < 1e-7
    print np.max(np.abs(difference))

    # Now do second branch
    neon_out_ref2 = branch2.fprop(main2_out).get()
    difference = neon_out_ref2 - neon_out[1]
    assert np.max(np.abs(difference)) < 1e-7
    print np.max(np.abs(difference))

    print "Beginning Back prop"
    # one random error array per tree output
    erra = [np.random.random(d.shape) for d in neon_out]
    err = [be.array(d) for d in erra]
    neon_layer.layers[0].layers[0].deltas = be.iobuf(inshape)
    neon_layer.bprop(err)

    bottom_neon_deltas = neon_layer.layers[0].layers[1].deltas.get()
    middle_neon_deltas = neon_layer.layers[1].layers[1].deltas.get()

    # propagate the same errors through the reference layers
    err0 = err[0]
    for ll in reversed(top_trunk):
        err0 = ll.bprop(err0)

    err1 = err[1]
    for ll in reversed(lbranch2):
        err1 = ll.bprop(err1)

    # each reference inception branch starts from the matching error view of the tree
    for bb, errb in zip((b1, b2, b3),
                        neon_layer.layers[0].layers[-5].error_views):
        for ll in reversed(bb):
            errb = ll.bprop(errb)

    # Now sum up the deltas at the root of the branch layer and compare
    ref_deltas = be.zeros_like(b1[0].deltas)
    ref_deltas[:] = b1[0].deltas + b2[0].deltas + b3[
        0].deltas + alpha2 * lbranch2[0].deltas

    neon_ref_deltas = ref_deltas.get()
    difference = middle_neon_deltas - neon_ref_deltas

    print np.max(np.abs(difference))
    assert np.max(np.abs(difference)) < 1e-8

    # continue the reference backward pass through the trunk
    x = ref_deltas
    main2[0].deltas = be.iobuf(inshape)

    for ll in reversed(main2):
        x = ll.bprop(x)

    bottom_neon_ref_deltas = main2[1].deltas.get()
    difference = bottom_neon_deltas - bottom_neon_ref_deltas
    print np.max(np.abs(difference))
    assert np.max(np.abs(difference)) < 1e-8
Example #19
0
def test_branch_model_cpu(backend_cpu64):
    """Compare a trunk + inception + head network with hand-wired reference layers.

    The network under test is ``Sequential(main_branch() + inception(...) +
    top_branch())``.  The reference runs a second ``main_branch()`` trunk, feeds
    its output through the three bare inception branches, stacks their outputs
    into one buffer and pushes that through the same head layers.  Forward
    outputs and the deltas summed at the branch root must match.

    Arguments:
        backend_cpu64: presumably a fixture providing the backend; it is not
                       referenced in the body.
    """
    def own_deltas(layer):
        # give a reference layer its own, freshly allocated delta buffers
        tree = DeltasTree()
        layer.allocate_deltas(tree)
        tree.allocate_buffers()
        layer.set_deltas(tree)

    np.random.seed(0)
    be = NervanaObject.be
    be.bsz = 32

    # network under test
    trunk_layers = main_branch()
    i1 = inception([(32,), (32, 32), ('max', 16)])
    head_layers = top_branch()
    net = Sequential(trunk_layers + i1 + head_layers)

    in_shape = (4, 224, 224)
    n_features = np.prod(in_shape)
    # batch_size comes from an enclosing scope, as in the original
    host_input = np.random.random((n_features, batch_size))
    net.configure(in_shape)
    dev_input = net.be.array(host_input)
    net.allocate()
    neon_logger.display(net.nested_str())
    net.layers[0].prev_layer = True

    net.allocate_deltas()

    net_out = net.fprop(dev_input).get()

    # reference trunk and bare inception branches
    ref_trunk = Sequential(main_branch())
    ref_trunk.configure(in_shape)
    ref_trunk_layers = ref_trunk.layers
    ref_trunk_layers[0].prev_layer = True
    ref_trunk_layers[0].deltas = be.iobuf(in_shape)
    b1, b2, b3 = inception_bare(i1, [(32,), (32, 32), ('max', 16)])
    branches = (b1, b2, b3)

    # configure each branch as a continuation of the reference trunk
    for branch in branches:
        shape = in_shape
        for layer in ref_trunk_layers + branch:
            shape = layer.configure(shape)

    # mirror the trunk weights of the network under test
    for ref_layer, net_layer in zip(ref_trunk_layers, net.layers[:8]):
        if ref_layer.has_params:
            ref_layer.set_params({'params': {'W': net_layer.W.get()}})
        ref_layer.allocate()
        own_deltas(ref_layer)

    for branch in branches:
        for layer in branch:
            layer.allocate()
            own_deltas(layer)

    # buffer that receives the stacked branch outputs
    merged = be.empty_like(net.layers[8].outputs)

    act = dev_input
    for layer in ref_trunk_layers:
        act = layer.fprop(act)

    row = 0
    for branch in branches:
        branch_act = act
        for layer in branch:
            branch_act = layer.fprop(branch_act)
        stop = row + branch_act.shape[0]
        merged[row:stop] = branch_act
        row = stop

    act = merged
    for layer in Sequential(head_layers).layers:
        act = layer.fprop(act)

    assert allclose_with_out(net_out, act.get(), rtol=0)

    neon_logger.display("Beginning Back prop")
    host_err = np.random.random(net_out.shape)
    err = be.array(host_err)
    # back-propagate through the head and the inception layer only
    for layer in reversed(net.layers[8:]):
        err = layer.bprop(err)

    net_deltas = err.get()
    # each reference branch starts from the matching error view of the net
    for branch, branch_err in zip(branches, net.layers[8].error_views):
        for layer in reversed(branch):
            branch_err = layer.bprop(branch_err)

    # deltas at the branch root are the sum over the three branches
    ref_sum = be.zeros_like(b1[0].deltas)
    ref_sum[:] = b3[0].deltas + b2[0].deltas + b1[0].deltas

    assert allclose_with_out(net_deltas, ref_sum.get(), rtol=0)
Example #20
0
def test_branch_model():
    """Compare a trunk + inception + head network with hand-wired reference layers.

    Creates its own GPU backend with batch size 64, runs
    ``Sequential(main_branch() + inception(...) + top_branch())`` forward and
    partially backward, and checks the results against a reference built from a
    second trunk plus the three bare inception branches.
    """
    NervanaObject.be = gen_backend("gpu", batch_size=64)
    be = NervanaObject.be
    main1 = main_branch()
    i1 = inception([(32, ), (32, 32), ('max', 16)])
    top = top_branch()
    neon_layer = Sequential(main1 + i1 + top)

    inshape = (3, 224, 224)
    insize = np.prod(inshape)
    # batch_size is not defined in this function; it is read from an enclosing scope
    inpa = np.random.random((insize, batch_size))
    neon_layer.configure(inshape)
    inp = neon_layer.be.array(inpa)

    neon_layer.allocate()
    print neon_layer.nested_str()
    neon_layer.layers[0].prev_layer = True
    neon_layer.allocate_deltas()
    neon_layer.layers[0].set_deltas([be.iobuf(inshape)])
    neon_out = neon_layer.fprop(inp).get()

    # Now make the reference pathways:
    main_trunk2 = Sequential(main_branch())
    main_trunk2.configure(inshape)
    main2 = main_trunk2.layers
    main2[0].prev_layer = True
    main2[0].set_deltas([be.iobuf(inshape)])
    (b1, b2, b3) = inception_bare(i1, [(32, ), (32, 32), ('max', 16)])

    # configure each branch as a continuation of the reference trunk
    for bb in (b1, b2, b3):
        oshape = inshape
        for ll in main2 + bb:
            oshape = ll.configure(oshape)

    # copy trunk weights from the network under test into the reference trunk
    main1_trunk = neon_layer.layers[:8]
    for ll, lo in zip(main2, main1_trunk):
        if ll.has_params:
            ll.set_params({'params': {'W': lo.W.get()}})
        ll.allocate()
        ll.set_deltas([be.iobuf(ll.in_shape)])
    for bb in (b1, b2, b3):
        for ll in bb:
            ll.allocate()
            ll.set_deltas([be.iobuf(ll.in_shape)])

    # Create the combined output buffer
    merge_output = be.empty_like(neon_layer.layers[8].outputs)

    x = inp
    for ll in main2:
        x = ll.fprop(x)

    # run each branch on the trunk output and stack the results along axis 0
    start = 0
    for bb in (b1, b2, b3):
        xb = x
        for ll in bb:
            xb = ll.fprop(xb)
        end = start + xb.shape[0]
        merge_output[start:end] = xb
        start = end

    x = merge_output

    top_trunk = Sequential(top).layers
    for ll in top_trunk:
        x = ll.fprop(x)

    neon_out_ref = x.get()
    difference = neon_out_ref - neon_out
    assert np.max(np.abs(difference)) < 1e-7
    print np.max(np.abs(difference))

    print "Beginning Back prop"
    erra = np.random.random(neon_out.shape)
    err = be.array(erra)
    # back-propagate only through layers from index 8 onward
    for ll in reversed(neon_layer.layers[8:]):
        err = ll.bprop(err)

    neon_deltas = err.get()
    # each reference branch starts from the matching error view of layer 8
    for bb, errb in zip((b1, b2, b3), neon_layer.layers[8].error_views):
        for ll in reversed(bb):
            errb = ll.bprop(errb)

    # Now sum up the deltas at the root of the branch layer and compare
    ref_deltas = be.zeros_like(b1[0].deltas)
    ref_deltas[:] = b1[0].deltas + b2[0].deltas + b3[0].deltas

    neon_ref_deltas = ref_deltas.get()

    difference = neon_deltas - neon_ref_deltas
    print np.max(np.abs(difference))
    assert np.max(np.abs(difference)) < 1e-8
def test_branch_model(backend_gpu):
    """Compare a trunk + inception + head network with hand-wired reference layers.

    Same scheme as the CPU variant of this test, with a backend batch size of
    64, a six-layer trunk slice, and both ``W`` and ``weight_bias`` copied into
    the reference trunk.

    Arguments:
        backend_gpu: presumably a fixture providing the backend; it is not
                     referenced in the body.
    """
    def own_deltas(layer):
        # give a reference layer its own, freshly allocated delta buffers
        tree = DeltasTree()
        layer.allocate_deltas(tree)
        tree.allocate_buffers()
        layer.set_deltas(tree)

    np.random.seed(0)
    be = NervanaObject.be
    be.bsz = 64

    # network under test
    trunk_layers = main_branch()
    i1 = inception([(32,), (32, 32), ('max', 16)])
    head_layers = top_branch()
    net = Sequential(trunk_layers + i1 + head_layers)

    in_shape = (4, 224, 224)
    n_features = np.prod(in_shape)
    # batch_size comes from an enclosing scope, as in the original
    host_input = np.random.random((n_features, batch_size))
    net.configure(in_shape)
    dev_input = net.be.array(host_input)
    net.allocate()
    neon_logger.display(net.nested_str())
    net.layers[0].prev_layer = True

    net.allocate_deltas()

    net_out = net.fprop(dev_input).get()

    # reference trunk and bare inception branches
    ref_trunk = Sequential(main_branch())
    ref_trunk.configure(in_shape)
    ref_trunk_layers = ref_trunk.layers
    ref_trunk_layers[0].prev_layer = True
    ref_trunk_layers[0].deltas = be.iobuf(in_shape)
    b1, b2, b3 = inception_bare(i1, [(32,), (32, 32), ('max', 16)])
    branches = (b1, b2, b3)

    # configure each branch as a continuation of the reference trunk
    for branch in branches:
        shape = in_shape
        for layer in ref_trunk_layers + branch:
            shape = layer.configure(shape)

    # mirror weights and weight biases of the network under test
    for ref_layer, net_layer in zip(ref_trunk_layers, net.layers[:6]):
        if ref_layer.has_params:
            copied = {'W': net_layer.W.get(),
                      'weight_bias': net_layer.weight_bias.get()}
            ref_layer.set_params({'params': copied})
        ref_layer.allocate()
        own_deltas(ref_layer)

    for branch in branches:
        for layer in branch:
            layer.allocate()
            own_deltas(layer)

    # buffer that receives the stacked branch outputs
    merged = be.empty_like(net.layers[6].outputs)

    act = dev_input
    for layer in ref_trunk_layers:
        act = layer.fprop(act)

    row = 0
    for branch in branches:
        branch_act = act
        for layer in branch:
            branch_act = layer.fprop(branch_act)
        stop = row + branch_act.shape[0]
        merged[row:stop] = branch_act
        row = stop

    act = merged
    for layer in Sequential(head_layers).layers:
        act = layer.fprop(act)

    assert allclose_with_out(net_out, act.get(), rtol=0)

    neon_logger.display("Beginning Back prop")
    host_err = np.random.random(net_out.shape)
    err = be.array(host_err)
    # back-propagate through the head and the inception layer only
    for layer in reversed(net.layers[6:]):
        err = layer.bprop(err)

    net_deltas = err.get()
    # each reference branch starts from the matching error view of the net
    for branch, branch_err in zip(branches, net.layers[6].error_views):
        for layer in reversed(branch):
            branch_err = layer.bprop(branch_err)

    # deltas at the branch root are the sum over the three branches
    ref_sum = be.zeros_like(b1[0].deltas)
    ref_sum[:] = b3[0].deltas + b2[0].deltas + b1[0].deltas

    assert allclose_with_out(net_deltas, ref_sum.get(), rtol=0)
def test_branch_model_fork_cpu(backend_cpu64):
    """Check a two-output ``Tree`` network against manually wired reference layers.

    The tree shares ``main_branch()`` and a ``BranchNode`` between two paths: the
    first continues through an inception block and ``top_branch()``, the second
    goes straight from the branch node into another ``top_branch()``.  Forward
    outputs of both paths and the deltas at two points of the backward pass are
    compared with values computed by running equivalent standalone layers.

    Arguments:
        backend_cpu64: presumably a fixture providing the backend; it is not
                       referenced in the body.
    """
    from neon.layers import BranchNode, Tree
    np.random.seed(0)
    be = NervanaObject.be
    be.bsz = 32
    bnode = BranchNode()
    i1 = inception([(32,), (32, 32), ('max', 16)])
    top1 = top_branch()
    top2 = top_branch()
    # both paths contain the same bnode object, which is where the tree forks
    p1 = Sequential(main_branch() + [bnode, i1] + top1)
    p2 = [bnode] + top2

    # weight given to the second path; it reappears in the reference delta sum below
    alpha2 = 0.3
    neon_layer = Tree([p1, p2], alphas=[1.0, alpha2])

    inshape = (4, 224, 224)
    insize = np.prod(inshape)
    # batch_size is not defined in this function; it is read from an enclosing scope
    inpa = np.random.random((insize, batch_size))
    neon_layer.configure(inshape)
    inp = neon_layer.be.array(inpa)

    neon_layer.allocate()

    neon_layer.layers[0].layers[0].prev_layer = True
    neon_layer.allocate_deltas()

    neon_out_dev = neon_layer.fprop(inp)
    # one host array per tree output
    neon_out = [d.get() for d in neon_out_dev]

    # Now make the reference pathways:
    main_trunk2 = Sequential(main_branch())
    main_trunk2.configure(inshape)
    main2 = main_trunk2.layers
    main2[0].prev_layer = True
    main2[0].deltas = be.iobuf(inshape)

    branch2 = Sequential(top_branch())
    lbranch2 = branch2.layers
    (b1, b2, b3) = inception_bare(i1, [(32,), (32, 32), ('max', 16)])

    # configure every reference branch as if it followed the reference trunk
    for bb in (b1, b2, b3, lbranch2):
        oshape = inshape
        for ll in main2 + bb:
            oshape = ll.configure(oshape)

    # copy trunk weights from the tree and give each reference layer its own deltas
    main1_trunk = neon_layer.layers[0].layers[:8]
    for ll, lo in zip(main2, main1_trunk):
        if ll.has_params:
            ll.set_params({'params': {'W': lo.W.get()}})
        ll.allocate()
        temp_deltas = DeltasTree()
        temp_deltas.proc_layer(ll)
        temp_deltas.allocate_buffers()
        ll.set_deltas(temp_deltas)

    # copy weights of the tree's second path (skipping its first layer) into branch2
    for ll, lo in zip(lbranch2, neon_layer.layers[1].layers[1:]):
        if ll.has_params:
            ll.set_params({'params': {'W': lo.W.get()}})

    for bb in (b1, b2, b3, lbranch2):
        for ll in bb:
            ll.allocate()
            temp_deltas = DeltasTree()
            temp_deltas.proc_layer(ll)
            temp_deltas.allocate_buffers()
            ll.set_deltas(temp_deltas)

    # Create the combined output buffer
    # NOTE(review): index 9 is presumably the inception/merge layer of the first
    # path -- TODO confirm against main_branch()
    merge_output = be.empty_like(neon_layer.layers[0].layers[9].outputs)

    x = inp
    for ll in main2:
        x = ll.fprop(x)
    main2_out = x

    # run each inception branch on the trunk output and stack results along axis 0
    start = 0
    for bb in (b1, b2, b3):
        xb = main2_out
        for ll in bb:
            xb = ll.fprop(xb)
        end = start + xb.shape[0]
        merge_output[start:end] = xb
        start = end

    x = merge_output

    top_trunk = Sequential(top1).layers
    for ll in top_trunk:
        x = ll.fprop(x)

    neon_out_ref = x.get()
    assert allclose_with_out(neon_out_ref, neon_out[0], rtol=0)

    # Now do second branch
    neon_out_ref2 = branch2.fprop(main2_out).get()
    # note: unlike the other comparisons here, this one does not pass rtol=0
    assert allclose_with_out(neon_out_ref2, neon_out[1])

    neon_logger.display("Beginning Back prop")
    # one random error array per tree output
    erra = [np.random.random(d.shape) for d in neon_out]
    err = [be.array(d) for d in erra]
    neon_layer.layers[0].layers[0].deltas = be.iobuf(inshape)
    neon_layer.bprop(err)

    bottom_neon_deltas = neon_layer.layers[0].layers[1].deltas.get()
    middle_neon_deltas = neon_layer.layers[1].layers[1].deltas.get()

    # propagate the same errors through the reference layers
    err0 = err[0]
    for ll in reversed(top_trunk):
        err0 = ll.bprop(err0)

    err1 = err[1]
    for ll in reversed(lbranch2):
        err1 = ll.bprop(err1)

    # each reference inception branch starts from the matching error view of the tree
    for bb, errb in zip((b1, b2, b3), neon_layer.layers[0].layers[-5].error_views):
        for ll in reversed(bb):
            errb = ll.bprop(errb)

    # Now sum up the deltas at the root of the branch layer and compare
    ref_deltas = be.zeros_like(b1[0].deltas)
    # start from the alpha-scaled second path, then add the three inception branches
    ref_deltas[:] = alpha2 * lbranch2[0].deltas
    ref_deltas[:] = ref_deltas + b3[0].deltas + b2[0].deltas + b1[0].deltas
    neon_ref_deltas = ref_deltas.get()
    assert allclose_with_out(middle_neon_deltas, neon_ref_deltas, rtol=0)

    # continue the reference backward pass through the trunk
    x = ref_deltas
    main2[0].deltas = be.iobuf(inshape)

    for ll in reversed(main2):
        x = ll.bprop(x)

    bottom_neon_ref_deltas = main2[1].deltas.get()
    assert allclose_with_out(bottom_neon_deltas, bottom_neon_ref_deltas, rtol=0)
Example #23
0
def mergesum_test_config(be, modfunc, use_stride=1):
    """Compare a module built by ``modfunc`` against two separately built paths.

    A Conv layer followed by ``modfunc(16, use_stride)`` is run forward and the
    result must equal ``max(path1 + path2, 0)`` where the two paths come from
    ``module_factory_copy``.  The error back-propagated through the last two
    layers of the combined network must equal the sum of the errors propagated
    through the two reference paths.

    Arguments:
        be: backend object used for array creation and element-wise ops.
        modfunc: callable invoked as ``modfunc(16, use_stride)``; its result is
                 concatenated to a list, so it presumably returns a list of layers.
        use_stride: forwarded unchanged to ``modfunc`` and ``module_factory_copy``.
    """
    l1 = Conv(**conv_params(3, 16))
    neon_layer = modfunc(16, use_stride)
    inshape = (16, 32, 32)
    insize = np.prod(inshape)
    # batch_size is not defined in this function; it is read from an enclosing scope
    inpa = np.random.random((insize, batch_size))

    neon_seq = Sequential([l1] + neon_layer)
    neon_seq.configure(inshape)
    inp = be.array(inpa)

    neon_seq.allocate()
    # print neon_layer.nested_str()
    # neon_layer.layers[0].prev_layer = True
    neon_seq.allocate_deltas()
    neon_out = neon_seq.fprop(inp).get()

    # Now make the reference pathways:
    p1, p2 = module_factory_copy(neon_layer, modfunc, 16, use_stride)
    l11 = Conv(**conv_params(3, 16))
    l12 = Conv(**conv_params(3, 16))

    # copy the parameters of the leading conv into both reference convs
    for ll in (l11, l12):
        for lcopy, lref in zip(ll, l1):
            if lcopy.has_params:
                lcopy.set_params(lref.get_params_serialize())

    path1 = Sequential([l11] + p1)
    path2 = Sequential([l12] + p2)
    for ll in (path1, path2):
        ll.configure(inshape)
        ll.allocate()
        ll.allocate_deltas()

    o1 = path1.fprop(inp)
    o2 = path2.fprop(inp)
    # reference output: element-wise max(o1 + o2, 0) computed on the backend
    neon_out_ref = be.empty_like(o1)
    neon_out_ref[:] = be.maximum(o1 + o2, 0)

    # need to have bsum false for this test to be valid
    assert allclose_with_out(neon_out_ref.get(), neon_out, rtol=0)
    print "Fprop matching"
    print "Beginning Back prop"
    erra = np.random.random(neon_out.shape)
    err = be.array(erra)

    # back-propagate through the last two layers of the combined network
    ebr = neon_seq.layers[-1].bprop(err)
    ebr = neon_seq.layers[-2].bprop(ebr)
    trunk_neon = ebr.get()

    # reference error: mask the incoming error where the reference output is positive
    err = be.array(erra)
    err[:] = be.greater(neon_out_ref, 0) * err

    # skip the leading conv's layers (len(l1) of them) when walking each path back
    pstart = len(l1)
    eb1 = err
    for l in reversed(path1.layers[pstart:]):
        eb1 = l.bprop(eb1)

    eb2 = err
    for l in reversed(path2.layers[pstart:]):
        eb2 = l.bprop(eb2)

    # the combined network's error should equal the sum over both paths
    err_ref = be.empty_like(eb1)
    err_ref[:] = eb1 + eb2

    assert allclose_with_out(err_ref.get(), trunk_neon, rtol=0)
Example #24
0
def create_model(dis_model='dc',
                 gen_model='dc',
                 cost_type='wasserstein',
                 noise_type='normal',
                 im_size=64,
                 n_chan=3,
                 n_noise=100,
                 n_gen_ftr=64,
                 n_dis_ftr=64,
                 depth=4,
                 n_extra_layers=0,
                 batch_norm=True,
                 gen_squash=None,
                 dis_squash=None,
                 dis_iters=5,
                 wgan_param_clamp=None,
                 wgan_train_sched=False):
    """
    Build a GAN model for image generation together with its cost function.

    Arguments:
        dis_model (str): Discriminator type, 'mlp' for a simple MLP or 'dc' for
                         a DC-GAN style model. (defaults to 'dc')
        gen_model (str): Generator type, 'mlp' for a simple MLP or 'dc' for a
                         DC-GAN style model. (defaults to 'dc')
        cost_type (str): Cost type, 'original' or 'modified' following
                         Goodfellow2014, or 'wasserstein' following Arjovsky2017
                         (defaults to 'wasserstein')
        noise_type (str): Noise distribution, 'uniform' or 'normal'
                          (defaults to 'normal')
        im_size (int): Image size (defaults to 64)
        n_chan (int): Number of image channels (defaults to 3)
        n_noise (int): Number of noise dimensions (defaults to 100)
        n_gen_ftr (int): Number of generator feature maps (defaults to 64)
        n_dis_ftr (int): Number of discriminator feature maps (defaults to 64)
        depth (int): Depth of layers in case of MLP (defaults to 4)
        n_extra_layers (int): Number of extra conv layers in case of DC (defaults to 0)
        batch_norm (bool): Enable batch normalization; only forwarded to the DC
                           builders, the MLP builders always get batch_norm=False
                           (defaults to True)
        gen_squash (str or None): Squashing function at the end of the generator,
                                  one of 'nosquash', 'sym', 'asym'.  When falsy, a
                                  default is chosen from cost_type and gen_model.
        dis_squash (str or None): Squashing function at the end of the
                                  discriminator, same choices.  When falsy, a
                                  default is chosen from cost_type.
        dis_iters (int): Number of critics for discriminator (defaults to 5)
        wgan_param_clamp (float or None): In case of WGAN weight clamp value, None for others
        wgan_train_sched (bool): Enable training schedule of number of critics (defaults to False)

    Returns:
        tuple: (GAN model, GeneralizedGANCost cost)
    """
    model_kinds = ('mlp', 'dc')
    squash_kinds = ('nosquash', 'sym', 'asym')

    assert dis_model in model_kinds, \
        "Unsupported model type for discriminator net, supported: 'mlp' and 'dc'"
    assert gen_model in model_kinds, \
        "Unsupported model type for generator net, supported: 'mlp' and 'dc'"
    assert cost_type in ('original', 'modified', 'wasserstein'), \
        "Unsupported GAN cost function type, supported: 'original', 'modified' and 'wasserstein'"

    # final activations selectable by squash name
    activations = {'nosquash': Identity(), 'sym': Tanh(), 'asym': Logistic()}

    # fill in squash defaults only where the caller left them unset
    if cost_type != 'wasserstein':
        gen_squash = gen_squash or 'sym'
        dis_squash = dis_squash or 'asym'
    else:
        if gen_model == 'dc':
            gen_squash = gen_squash or 'sym'
        elif gen_model == 'mlp':
            gen_squash = gen_squash or 'nosquash'
        dis_squash = dis_squash or 'nosquash'

    assert gen_squash in squash_kinds, \
        "Unsupported final squashing function for generator," \
        " supported: 'nosquash', 'sym' and 'asym'"
    assert dis_squash in squash_kinds, \
        "Unsupported final squashing function for discriminator," \
        " supported: 'nosquash', 'sym' and 'asym'"

    gen_final_act = activations[gen_squash]
    dis_final_act = activations[dis_squash]

    # generator layers and the matching noise shape
    if gen_model == 'dc':
        noise_dim = (n_noise, 1, 1)
        gen = create_dc_generator(im_size, n_chan, n_noise, n_gen_ftr,
                                  n_extra_layers, batch_norm,
                                  finact=gen_final_act)
    elif gen_model == 'mlp':
        noise_dim = (n_noise, )
        gen = create_mlp_generator(im_size, n_chan, n_gen_ftr, depth,
                                   batch_norm=False, finact=gen_final_act)

    # discriminator layers
    if dis_model == 'dc':
        dis = create_dc_discriminator(im_size, n_chan, n_dis_ftr,
                                      n_extra_layers, batch_norm,
                                      finact=dis_final_act)
    elif dis_model == 'mlp':
        dis = create_mlp_discriminator(im_size, n_dis_ftr, depth,
                                       batch_norm=False, finact=dis_final_act)

    generator = Sequential(gen, name="Generator")
    discriminator = Sequential(dis, name="Discriminator")
    layers = GenerativeAdversarial(generator=generator,
                                   discriminator=discriminator)

    gan = GAN(layers=layers, noise_dim=noise_dim, noise_type=noise_type,
              k=dis_iters, wgan_param_clamp=wgan_param_clamp,
              wgan_train_sched=wgan_train_sched)
    gan_cost = GeneralizedGANCost(costfunc=GANCost(func=cost_type))
    return gan, gan_cost
def test_branch_model_fork():
    """
    Compare a forked Tree container against a layer-by-layer reference.

    A Tree is built from two paths that share one BranchNode: the first runs
    main_branch(), the branch node, an inception module and a top branch; the
    second hangs another top branch off the same node and is weighted by
    alpha2.  The Tree's fprop outputs and bprop deltas are then checked
    against the same computation carried out manually on separately
    configured layers that are loaded with the Tree's weights.
    """
    from neon.layers import BranchNode, Tree

    NervanaObject.be = gen_backend("gpu", batch_size=64)
    be = NervanaObject.be
    bnode = BranchNode()
    i1 = inception([(32,), (32, 32), ('max', 16)])
    top1 = top_branch()
    top2 = top_branch()
    p1 = Sequential(main_branch() + [bnode, i1] + top1)
    p2 = [bnode] + top2

    alpha2 = 0.3
    neon_layer = Tree([p1, p2], alphas=[1.0, alpha2])

    inshape = (3, 224, 224)
    insize = np.prod(inshape)
    inpa = np.random.random((insize, batch_size))
    neon_layer.configure(inshape)
    inp = neon_layer.be.array(inpa)

    neon_layer.allocate()
    print(neon_layer.nested_str())
    # Mark the first layer as having a predecessor and give it a delta buffer
    # (presumably so that it computes input deltas during bprop - TODO confirm)
    neon_layer.layers[0].layers[0].prev_layer = True
    neon_layer.allocate_deltas()
    neon_layer.layers[0].layers[0].set_deltas([be.iobuf(inshape)])
    neon_out_dev = neon_layer.fprop(inp)
    neon_out = [d.get() for d in neon_out_dev]

    # Now make the reference pathways:
    main_trunk2 = Sequential(main_branch())
    main_trunk2.configure(inshape)
    main2 = main_trunk2.layers
    main2[0].prev_layer = True
    main2[0].set_deltas([be.iobuf(inshape)])

    branch2 = Sequential(top_branch())
    lbranch2 = branch2.layers
    (b1, b2, b3) = inception_bare(i1, [(32,), (32, 32), ('max', 16)])

    # Configure every reference branch as a continuation of the reference trunk
    for bb in (b1, b2, b3, lbranch2):
        oshape = inshape
        for ll in main2 + bb:
            oshape = ll.configure(oshape)

    # Copy the Tree's trunk weights into the reference trunk
    main1_trunk = neon_layer.layers[0].layers[:8]
    for ll, lo in zip(main2, main1_trunk):
        if ll.has_params:
            ll.set_params(lo.W.get())
        ll.allocate()
        ll.set_deltas([be.iobuf(ll.in_shape)])

    # Copy the second path's weights (skipping its leading branch node)
    for ll, lo in zip(lbranch2, neon_layer.layers[1].layers[1:]):
        if ll.has_params:
            ll.set_params(lo.W.get())

    for bb in (b1, b2, b3, lbranch2):
        for ll in bb:
            ll.allocate()
            ll.set_deltas([be.iobuf(ll.in_shape)])

    # Create the combined output buffer
    merge_output = be.empty_like(neon_layer.layers[0].layers[9].outputs)

    x = inp
    for ll in main2:
        x = ll.fprop(x)
    main2_out = x

    # Run each inception branch and stack its output rows into merge_output
    start = 0
    for bb in (b1, b2, b3):
        xb = main2_out
        for ll in bb:
            xb = ll.fprop(xb)
        end = start + xb.shape[0]
        merge_output[start:end] = xb
        start = end

    x = merge_output

    top_trunk = Sequential(top1).layers
    for ll in top_trunk:
        x = ll.fprop(x)

    neon_out_ref = x.get()
    difference = neon_out_ref - neon_out[0]
    # report the error before asserting so it is visible when the check fails
    print(np.max(np.abs(difference)))
    assert np.max(np.abs(difference)) < 1e-7

    # Now do second branch
    neon_out_ref2 = branch2.fprop(main2_out).get()
    difference = neon_out_ref2 - neon_out[1]
    print(np.max(np.abs(difference)))
    assert np.max(np.abs(difference)) < 1e-7

    print("Beginning Back prop")
    erra = [np.random.random(d.shape) for d in neon_out]
    err = [be.array(d) for d in erra]
    neon_layer.layers[0].layers[0].deltas = be.iobuf(inshape)
    neon_layer.bprop(err)

    bottom_neon_deltas = neon_layer.layers[0].layers[1].deltas.get()
    middle_neon_deltas = neon_layer.layers[1].layers[1].deltas.get()

    # Reference back prop: top branch of path 1, then path 2, then each
    # inception branch fed from the Tree's own error views.
    err0 = err[0]
    for ll in reversed(top_trunk):
        err0 = ll.bprop(err0)

    err1 = err[1]
    for ll in reversed(lbranch2):
        err1 = ll.bprop(err1)

    for bb, errb in zip((b1, b2, b3), neon_layer.layers[0].layers[-5].error_views):
        for ll in reversed(bb):
            errb = ll.bprop(errb)

    # Now sum up the deltas at the root of the branch layer and compare
    ref_deltas = be.zeros_like(b1[0].deltas)
    ref_deltas[:] = b1[0].deltas + b2[0].deltas + b3[0].deltas + alpha2 * lbranch2[0].deltas

    neon_ref_deltas = ref_deltas.get()
    difference = middle_neon_deltas - neon_ref_deltas

    print(np.max(np.abs(difference)))
    assert np.max(np.abs(difference)) < 1e-8

    x = ref_deltas
    main2[0].deltas = be.iobuf(inshape)

    for ll in reversed(main2):
        x = ll.bprop(x)

    bottom_neon_ref_deltas = main2[1].deltas.get()
    difference = bottom_neon_deltas - bottom_neon_ref_deltas
    print(np.max(np.abs(difference)))
    assert np.max(np.abs(difference)) < 1e-8
Example #26
0
class Model(NervanaObject):
    """
    Basic model class which stores a list of layers describing the model. Can train the layer
    weights on a dataset, evaluate on a test set and serialize the model.
    Additional functionality can be added to fit through callback functions.

    Arguments:
        layers: layer container, or a list of layers (that will be containerized),
                or a serialized model description
        dataset (iterator): Data set (ignored, will be removed)
        weights_only (bool): set to True if you do not want to recreate layers
                             and states during deserialization from a serialized model
                             description.  Defaults to False.
        name (str): Model name.  Defaults to "model"
        optimizer (Optimizer): Optimizer object which defines the learning rule
                               for updating model parameters (ie DescentMomentum, AdaDelta)
    """

    def __init__(self, layers, dataset=None, weights_only=False, name="model", optimizer=None):
        super(Model, self).__init__(name)
        self.optimizer = optimizer
        self.params = None  # should be able to remove
        self.states = None  # should be able to remove
        self.epoch_index = 0
        self.finished = False
        self.initialized = False
        self.cost = None
        self.nbatches = 0
        self.ndata = 0

        if dataset is not None:
            logger.warning('dataset is a deprecated argument and will be ignored')

        # NOTE: the exact-type checks below are deliberate; subclasses of the
        # listed containers fall through to the Sequential wrapping branch.
        if type(layers) in (ModelDescription, dict):
            # load up the model from a serialized file (dataset could be None here)
            self.deserialize(layers, load_states=(not weights_only))
        elif type(layers) is str:
            self.load_params(layers, load_states=(not weights_only))
        else:
            # Wrap the list of layers in a Sequential container if a raw list of layers
            if type(layers) in (Sequential, Tree, SingleOutputTree):
                self.layers = layers
            else:
                self.layers = Sequential(layers)
        self.layers.propagate_parallelism("Data")

    @property
    def layers_to_optimize(self):
        """
        The layers handed to the optimizer, as reported by the layer container.
        """
        return self.layers.layers_to_optimize

    def set_shortcut(self):
        """
        Enable the bprop shortcut on the final Logistic activation when the
        cost function is CrossEntropyBinary.  Does nothing if self.cost is not
        set or the last layer does not match.
        """
        # infer whether bprop shortcut can be used on final activation
        # self.cost should be set to run this otherwise do nothing
        lastlayer = self.layers[-1]
        try:
            if self.cost.costfunc.__class__ is CrossEntropyBinary:
                if (lastlayer.__class__ is Activation and
                   lastlayer.transform.__class__ is Logistic):
                    lastlayer.transform.set_shortcut(True)
        except Exception:
            # if any attributes are not set or any other exception
            # is thrown leave transform.shortcut as is (do nothing).
            # KeyboardInterrupt / SystemExit are intentionally not swallowed.
            pass

    def initialize(self, dataset, cost=None):
        """
        Configure and allocate the layers (and the cost, if given).  Only the
        first call has any effect; later calls return immediately.

        Arguments:
            dataset: object passed to the layer container's configure() to
                     propagate shapes through the layers
            cost (Cost, optional): cost to initialize with the container's
                                   configure() result and store on the model
        """
        if self.initialized:
            return

        # Propagate shapes through the layers to configure
        prev_input = dataset
        prev_input = self.layers.configure(prev_input)

        if cost is not None:
            cost.initialize(prev_input)
            self.cost = cost

        # Now allocate space
        self.layers.allocate()
        self.layers.allocate_deltas()
        self.initialized = True

    def __str__(self):
        """
        String representation of model's layers
        """
        config_string = "Network Layers:\n" + self.layers.nested_str()
        return config_string

    def fit(self, dataset, cost, optimizer, num_epochs, callbacks):
        """
        Trains the model parameters on a dataset by minimizing the cost function through
        gradient descent and updates the layer weights according to a learning rule
        defined in optimizer.

        Arguments:
            dataset (iterator): An iterable of minibatches where each
                element is a (x, y) tuple where x is the input data and y are the labels.
                x is of dimension (feature_size, batch_size)
                y is of dimension (label_size, batch_size)
                Length of the iterator is num_batches which is num_data / batch_size
            cost (Cost): Defines the function which the model is minimizing based
                on the output of the last layer and the input labels
            optimizer (Optimizer): Defines the learning rule for updating the model parameters
            num_epochs: Number of times to iterate over the dataset.
            callbacks (Callbacks): Defines callbacks to run at the end of each mini-batch / epoch.
        """
        self.nbatches = dataset.nbatches
        self.ndata = dataset.ndata
        # self.set_shortcut()  # infer if bprop shortcut can be used
        self.total_cost = self.be.empty((1, 1), dtype=np.float32)
        self.optimizer = optimizer
        self.initialize(dataset, cost)

        callbacks.on_train_begin(num_epochs)
        # training resumes from self.epoch_index and stops early once
        # self.finished is set
        while self.epoch_index < num_epochs and not self.finished:
            self.nbatches = dataset.nbatches

            callbacks.on_epoch_begin(self.epoch_index)

            self._epoch_fit(dataset, callbacks)

            callbacks.on_epoch_end(self.epoch_index)

            self.epoch_index += 1

        callbacks.on_train_end()

    def _epoch_fit(self, dataset, callbacks):
        """
        Helper function for fit which performs training on a dataset for one epoch.

        Arguments:
            dataset (iterable): Dataset iterator to perform fit on
            callbacks (Callbacks): callbacks notified at the start and end of
                                   every minibatch
        """
        epoch = self.epoch_index
        self.total_cost[:] = 0
        # iterate through minibatches of the dataset
        for mb_idx, (x, t) in enumerate(dataset):
            callbacks.on_minibatch_begin(epoch, mb_idx)
            self.be.begin(Block.minibatch, mb_idx)

            x = self.fprop(x)

            self.total_cost[:] = self.total_cost + self.cost.get_cost(x, t)

            # deltas back propagate through layers
            # for every layer in reverse except the 0th one
            delta = self.cost.get_errors(x, t)

            self.bprop(delta)
            self.optimizer.optimize(self.layers_to_optimize, epoch=epoch)

            self.be.end(Block.minibatch, mb_idx)
            callbacks.on_minibatch_end(epoch, mb_idx)

        # now we divide total cost by the number of batches,
        # so it was never total cost, but sum of averages
        # across all the minibatches we trained on
        self.total_cost[:] = self.total_cost / dataset.nbatches

    def fprop(self, x, inference=False):
        """
        Forward propagates a minibatch x through the model.

        Arguments:
            x (Tensor): Input minibatch data
            inference (bool): Flag for performing training or inference
                Only affects batch norm and dropout layers.

        Returns:
            Tensor: the output of the final layer in the model
        """
        return self.layers.fprop(x, inference)

    def bprop(self, delta):
        """
        Back propagates the error of a minibatch through the model.

        Arguments:
            delta (Tensor): Derivative of cost with respect to the last layer's output
        """
        return self.layers.bprop(delta)

    def eval(self, dataset, metric):
        """
        Evaluates a model on a dataset according to an input metric.

        Arguments:
            dataset (iterable): dataset to evaluate on.
            metric (Cost): what function to evaluate dataset on.

        Returns:
            numpy array: one averaged value per entry in metric.metric_names
        """
        self.initialize(dataset)
        running_error = np.zeros((len(metric.metric_names)), dtype=np.float32)
        nprocessed = 0
        dataset.reset()
        for x, t in dataset:
            x = self.fprop(x, inference=True)

            # This logic is for handling partial batch sizes at the end of the dataset.
            # Floor division keeps nsteps an int so it can be used in slice()
            # on Python 3 as well as Python 2.
            nsteps = x.shape[1] // self.be.bsz if not isinstance(x, list) else \
                x[0].shape[1] // self.be.bsz

            bsz = min(dataset.ndata - nprocessed, self.be.bsz)
            running_error += metric(x, t, calcrange=slice(0, nsteps * bsz)) * nsteps * bsz
            nprocessed += bsz * nsteps
        running_error /= nprocessed
        return running_error

    def get_outputs(self, dataset):
        """
        Get the activation outputs of the final model layer for the dataset

        Arguments:
            dataset (iterable): Dataset iterator to perform fit on

        Returns:
            Host numpy array: the output of the final layer for the entire Dataset
        """
        self.initialize(dataset)
        dataset.reset()  # Move "pointer" back to beginning of dataset
        n = dataset.nbatches
        x = self.layers.layers[-1].outputs
        assert not isinstance(x, list), "Can not get_outputs with Branch terminal"
        Ypred = None
        # NOTE: Ypred and nsteps are only set inside the loop, so a dataset
        # that yields no minibatches is not supported here.
        for idx, (x, t) in enumerate(dataset):
            x = self.fprop(x, inference=True)
            if Ypred is None:
                (dim0, dim1) = x.shape
                Ypred = np.empty((n * dim1, dim0), dtype=x.dtype)
                # floor division: nsteps is used as a reshape dimension below
                # and must stay an int on Python 3
                nsteps = dim1 // self.be.bsz
            cur_batch = slice(idx * dim1, (idx + 1) * dim1)
            Ypred[cur_batch] = x.get().T

        # Handle the recurrent case.
        if nsteps != 1:
            b, s = (self.be.bsz, nsteps)
            Ypred = Ypred.reshape((n, s, b, -1)).transpose(0, 2, 1, 3).copy().reshape(n*b, s, -1)

        return Ypred[:dataset.ndata]

    def get_description(self, get_weights=False, keep_states=False):
        """
        Gets a description of the model required to reconstruct the model with
        no weights like from a yaml file.

        Arguments:
            get_weights (bool): forwarded to the layer container's get_description
            keep_states (bool): forwarded to the layer container's get_description

        Returns:
            dict: Description of each component of the model.
        """
        pdict = dict()
        pdict['neon_version'] = __neon_version__
        compat_mode = self.be.compat_mode if self.be.compat_mode is not None else 'neon'
        pdict['backend'] = {'type': self.be.__class__.__name__,
                            'compat_mode': compat_mode,
                            'rng_seed': self.be.rng_seed,
                            'rng_state': self.be.rng_get_state()}

        if self.cost:
            pdict['cost'] = self.cost.get_description()
        if self.optimizer:
            pdict['optimizer'] = self.optimizer.get_description()

        pdict['model'] = self.layers.get_description(get_weights=get_weights,
                                                     keep_states=keep_states)
        return pdict

    def save_params(self, param_path, keep_states=True):
        """
        Serializes and saves model parameters to the path specified.

        Arguments:
            param_path (str): File to write serialized parameter dict to.
            keep_states (bool): Whether to save optimizer states too.
                                Defaults to True.
        """
        self.serialize(keep_states=keep_states, fn=param_path)

    def load_params(self, param_path, load_states=True):
        """
        Loads the model parameters (per layer weights, epochs run, optimizer
        states) saved in param_path from serialize().

        Arguments:
            param_path (str): File containing serialized python dict with layer
                              weights and states.
            load_states (bool):  if False, then only the weights will be loaded
                                 into a model in which the layers have already been
                                 created, otherwise will (re)create the layers from
                                 the serialized parameters and set the learning
                                 states as well
        """
        self.deserialize(load_obj(param_path), load_states=load_states)
        logger.info('Model weights loaded from %s', param_path)

    def load_weights(self, weight_path):
        """
        .. deprecated:: 1.1.4
           Use :func:`load_params` instead
        """
        logger.warning('Calling deprecated load_weights function.  Use '
                       'load_params instead')
        self.load_params(weight_path)

    def deserialize(self, model_dict, data=None, load_states=True):
        """
        Loads per layer (weights, states) and other model parameters from the
        dictionary passed.  The dictionary itself is not modified.

        Arguments:
            model_dict (dict): dictionary describing the model including layers,
                               cost, optimizers, backend settings, etc.
                               generated by the serialize function
            data (iterator):   Data set (ignored, will be removed)

            load_states (bool):  if False, then only the weights will be loaded
                                 into a model in which the layers have already been
                                 created, otherwise will (re)create the layers from
                                 the serialized parameters and set the learning
                                 states as well
        """

        if data is not None:
            logger.warning('data is a deprecated argument and will be ignored')

        if 'epoch_index' in model_dict:
            self.epoch_index = model_dict['epoch_index']
        if 'model' not in model_dict:
            # Legacy format: a flat list of per-layer parameter dicts that is
            # matched positionally against the layers to optimize.
            logger.error('Using old model serialization format. '
                         'Serialized the model into new format')

            param_layers = [l for l in self.layers_to_optimize]
            param_dict_list = model_dict['layer_params_states']
            for l, ps in zip(param_layers, param_dict_list):
                l.set_params(ps)
                if 'states' in ps and load_states:
                    l.set_states(ps)
            return

        # Read backend settings through a local so that a missing 'backend'
        # entry does not have to be written into the caller's dictionary.
        backend_cfg = model_dict['backend'] if 'backend' in model_dict else {}
        if 'compat_mode' in backend_cfg:
            self.be.compat_mode = backend_cfg['compat_mode']

        typ = model_dict['model']['type']
        main_container = load_class(typ)

        # only build the layer container if the model does not have one yet
        if not hasattr(self, 'layers'):
            self.layers = main_container.gen_class(model_dict['model']['config'])

        self.layers.load_weights(model_dict['model'], load_states)

        if load_states and 'rng_state' in backend_cfg:
            try:
                self.be.rng_set_state(backend_cfg['rng_state'])
            except ValueError as e:
                # could come about when switching backend types (ex GPU to CPU)
                logger.warning("Problems restoring existing RNG state: %s", str(e))

    # serialize tells how to write out the parameters we've learned so
    # far and associate them with layers. it can ignore layers with no
    # learned parameters. the model stores states to pass to the
    # optimizers.  if we're saving the model out for inference, we
    # don't need to remember states.
    def serialize(self, fn=None, keep_states=True):
        """
        Creates a dictionary storing the layer parameters and epochs complete.

        Arguments:
            fn (str): file to save pkl formatted model dictionary
            keep_states (bool): Whether to save optimizer states.

        Returns:
            dict: Model data including layer parameters and epochs complete.
                  Nothing is returned when fn is given; the dictionary is
                  written to that file instead.
        """

        # get the model dict with the weights
        pdict = self.get_description(get_weights=True, keep_states=keep_states)
        pdict['epoch_index'] = self.epoch_index + 1
        if self.initialized:
            pdict['train_input_shape'] = self.layers.in_shape
        if fn is not None:
            save_obj(pdict, fn)
            return
        return pdict

    def set_batch_size(self, N):
        """
        Set the actual minibatch size, so even though the buffers are allocated considering
        excessive padding, the processing for some layers may be shortened.
        Currently most of the neon layers don't use that to control the processing. The
        interface is here only for when someone wants to set that information and experiment.
        """
        return self.layers.set_batch_size(N)

    def set_seq_len(self, S):
        """
        Set the actual minibatch sequence length, so even though the buffers are allocated
        considering excessive padding, the processing for some layers may be shortened.
        Currently most of the neon layers don't use that to control the processing. The
        interface is here only for when someone wants to set that information and experiment.
        """
        return self.layers.set_seq_len(S)

    def benchmark(self, dataset, inference=False, cost=None, optimizer=None,
                  niterations=20, nskip=2):
        """
        Measure runtime for computing fprop and bprop separately, as well as
        full minibatch run times. For the inference case, only fprop is timed.

        Arguments:
              dataset (iterable): Dataset iterator to perform fit on

              inference (bool): if True, only time fprop and skip cost,
                                bprop and the optimizer update

              cost (Cost): Defines the function which the model is minimizing based
                            on the output of the last layer and the input labels

              optimizer (Optimizer): learning rule applied after each bprop;
                                     required unless inference is True

             niterations (optional, int): Number of minibatches to average over

             nskip (optional, int): number of iterations at the beginning to skip
                                    when calculating the runtime statistics

        Returns:
            dictionary keyed by timed step ('fprop', and 'bprop' / 'iteration'
            when not in inference mode), each holding the mean, median, min
            and max of the recorded run times
        """
        # initialize model
        if inference is False:
            assert cost is not None and optimizer is not None, (
                "Need cost and optimizer to benchmark bprop and update")
        self.cost = cost
        self.initialize(dataset, cost)
        self.optimizer = optimizer
        self.total_cost = self.be.empty((1, 1))
        self.total_cost[:] = 0

        # iterate through minibatches of the dataset
        times = OrderedDict()
        time_keys = ['fprop'] if inference else ['fprop', 'bprop', 'iteration']
        for ky in time_keys:
            times[ky] = np.full(niterations + nskip, -1.0)
        count = 0

        fprop_start = self.be.init_mark()
        fprop_end = self.be.init_mark()
        bprop_end = self.be.init_mark()

        # keep cycling over the dataset until enough minibatches were timed
        while count < niterations + nskip:
            dataset.reset()
            for mb_idx, (x, t) in enumerate(dataset):

                self.be.record_mark(fprop_start)  # mark start of fprop

                x = self.fprop(x)

                if inference is False:
                    self.total_cost[:] = self.total_cost + self.cost.get_cost(x, t)

                self.be.record_mark(fprop_end)  # mark end of fprop and start of bprop

                if inference is False:
                    delta = self.cost.get_errors(x, t)
                    self.bprop(delta)
                    self.optimizer.optimize(self.layers_to_optimize, epoch=0)

                    self.be.record_mark(bprop_end)  # mark end of bprop
                    self.be.synchronize_mark(bprop_end)
                else:
                    self.be.synchronize_mark(fprop_end)

                times['fprop'][count] = self.be.get_time(fprop_start, fprop_end)
                if inference is False:
                    times['bprop'][count] = self.be.get_time(fprop_end, bprop_end)
                    times['iteration'][count] = times['fprop'][count] + times['bprop'][count]

                count += 1
                if count >= niterations + nskip:
                    break

        # print results
        header = ('Func', 'Mean', 'Median', 'Min', 'Max', 'Units')
        # the lower-cased column titles double as numpy function names below
        stats = tuple(stat.lower() for stat in header[1:-1])

        fmt_titles = '| {:^11} '*len(header) + '|'
        fmt_nums = '| {func:<11} ' + '|  {%s:<10.5g} '*len(stats) % (stats) + '| {units:^11} |'

        head_str = fmt_titles.format(*header)
        sep = '-'*len(head_str)
        head_str = sep + '\n' + head_str + '\n' + sep
        print(head_str)
        out_stats = {}
        for step in times:
            timesu = np.array(times[step][nskip:])  # in ms
            out_stats[step] = {}
            for stat in stats:
                out_stats[step][stat] = getattr(np, stat)(timesu)
            print(fmt_nums.format(units='msec', func=step, **out_stats[step]))
        print(sep)
        return out_stats
Example #27
0
def mergesum_test_config(be, modfunc, use_stride=1):
    """
    Check the module built by ``modfunc`` against two explicit reference paths.

    The module is run behind a shared Conv front end; its forward output and
    the error back-propagated through its last two layers are compared with
    the rectified sum of two independent paths that carry copies of the same
    Conv parameters.

    Arguments:
        be: backend used to create and convert tensors
        modfunc (callable): factory called as ``modfunc(16, use_stride)`` that
                            returns the list of layers under test
        use_stride (int): forwarded to ``modfunc``.  Defaults to 1.
    """
    front_conv = Conv(**conv_params(3, 16))
    module_layers = modfunc(16, use_stride)
    in_shape = (16, 32, 32)
    host_input = np.random.random((np.prod(in_shape), batch_size))

    merged = Sequential([front_conv] + module_layers)
    merged.configure(in_shape)
    dev_input = be.array(host_input)

    merged.allocate()
    merged.allocate_deltas()

    merged_out = merged.fprop(dev_input).get()

    # Reference pathways: each gets its own Conv front end that is loaded
    # with the parameters of the shared one.
    ref_tail1, ref_tail2 = module_factory_copy(module_layers, modfunc, 16, use_stride)
    ref_conv1 = Conv(**conv_params(3, 16))
    ref_conv2 = Conv(**conv_params(3, 16))

    for ref_conv in (ref_conv1, ref_conv2):
        for dst, src in zip(ref_conv, front_conv):
            if not dst.has_params:
                continue
            dst.set_params(src.get_params_serialize())

    ref_path1 = Sequential([ref_conv1] + ref_tail1)
    ref_path2 = Sequential([ref_conv2] + ref_tail2)
    for ref_path in (ref_path1, ref_path2):
        ref_path.configure(in_shape)
        ref_path.allocate()
        ref_path.allocate_deltas()

    out1 = ref_path1.fprop(dev_input)
    out2 = ref_path2.fprop(dev_input)
    # convert mkl buffer to cpu for following cpu execution
    for out in (out1, out2):
        be.convert_data(out, False)
    expected_out = be.empty_like(out1)
    expected_out[:] = be.maximum(out1 + out2, 0)

    # need to have bsum false for this test to be valid
    assert allclose_with_out(expected_out.get(), merged_out, rtol=0)

    host_err = np.random.random(merged_out.shape)

    # Back-propagate through the last two layers of the merged sequence
    trunk_err = be.array(host_err)
    for layer in (merged.layers[-1], merged.layers[-2]):
        trunk_err = layer.bprop(trunk_err)
    trunk_neon = trunk_err.get()

    # Reference error: zero it wherever the rectified reference output is not positive
    masked_err = be.array(host_err)
    masked_err[:] = be.greater(expected_out, 0) * masked_err

    # the reference paths are only walked back down to the Conv front end
    skip = len(front_conv)

    def _bprop_tail(ref_path):
        # push the masked error back through everything after the Conv front end
        delta = masked_err
        for layer in reversed(ref_path.layers[skip:]):
            delta = layer.bprop(delta)
        return delta

    back1 = _bprop_tail(ref_path1)
    back2 = _bprop_tail(ref_path2)

    for back in (back1, back2):
        be.convert_data(back, False)
    expected_err = be.empty_like(back1)
    expected_err[:] = back1 + back2

    assert allclose_with_out(expected_err.get(), trunk_neon, rtol=0)
Example #28
0
def test_branch_model_fork(backend_gpu):
    """
    Compare a forked Tree container against a layer-by-layer reference.

    A Tree is built from two paths that share one BranchNode: the first runs
    main_branch(), the branch node, an inception module and a top branch; the
    second hangs another top branch off the same node and is weighted by
    alpha2.  The Tree's fprop outputs and bprop deltas are then checked
    against the same computation carried out manually on separately
    configured layers that are loaded with the Tree's weights.

    The test is skipped when the backend reports less than 6.1GB of GPU memory.
    """
    from neon.layers import BranchNode, Tree
    np.random.seed(0)
    be = NervanaObject.be
    if be.gpu_memory_size < 6.1 * 1024 * 1024 * 1024:
        # message passed positionally: the keyword was renamed from ``msg`` to
        # ``reason`` across pytest versions
        pytest.skip('Test requires more than 6.1GB')
    be.bsz = 64
    bnode = BranchNode()
    i1 = inception([(32,), (32, 32), ('max', 16)])
    top1 = top_branch()
    top2 = top_branch()
    p1 = Sequential(main_branch() + [bnode, i1] + top1)
    p2 = [bnode] + top2

    alpha2 = 0.3
    neon_layer = Tree([p1, p2], alphas=[1.0, alpha2])

    inshape = (4, 224, 224)
    insize = np.prod(inshape)
    inpa = np.random.random((insize, batch_size))
    neon_layer.configure(inshape)
    inp = neon_layer.be.array(inpa)

    neon_layer.allocate()

    # Mark the first layer as having a predecessor before deltas are allocated
    # (presumably so that it computes input deltas during bprop - TODO confirm)
    neon_layer.layers[0].layers[0].prev_layer = True
    neon_layer.allocate_deltas()

    neon_out_dev = neon_layer.fprop(inp)
    neon_out = [d.get() for d in neon_out_dev]

    # Now make the reference pathways:
    main_trunk2 = Sequential(main_branch())
    main_trunk2.configure(inshape)
    main2 = main_trunk2.layers
    main2[0].prev_layer = True
    main2[0].deltas = be.iobuf(inshape)

    branch2 = Sequential(top_branch())
    lbranch2 = branch2.layers
    (b1, b2, b3) = inception_bare(i1, [(32,), (32, 32), ('max', 16)])

    # Configure every reference branch as a continuation of the reference trunk
    for bb in (b1, b2, b3, lbranch2):
        oshape = inshape
        for ll in main2 + bb:
            oshape = ll.configure(oshape)

    # Copy the Tree's trunk weights into the reference trunk and give each
    # reference layer its own delta buffers
    main1_trunk = neon_layer.layers[0].layers[:8]
    for ll, lo in zip(main2, main1_trunk):
        if ll.has_params:
            ll.set_params({'params': {'W': lo.W.get()}})
        ll.allocate()
        temp_deltas = DeltasTree()
        temp_deltas.proc_layer(ll)
        temp_deltas.allocate_buffers()
        ll.set_deltas(temp_deltas)

    # Copy the second path's weights (skipping its leading branch node)
    for ll, lo in zip(lbranch2, neon_layer.layers[1].layers[1:]):
        if ll.has_params:
            ll.set_params({'params': {'W': lo.W.get()}})

    for bb in (b1, b2, b3, lbranch2):
        for ll in bb:
            ll.allocate()
            temp_deltas = DeltasTree()
            temp_deltas.proc_layer(ll)
            temp_deltas.allocate_buffers()
            ll.set_deltas(temp_deltas)

    # Create the combined output buffer
    merge_output = be.empty_like(neon_layer.layers[0].layers[9].outputs)

    x = inp
    for ll in main2:
        x = ll.fprop(x)
    main2_out = x

    # Run each inception branch and stack its output rows into merge_output
    start = 0
    for bb in (b1, b2, b3):
        xb = main2_out
        for ll in bb:
            xb = ll.fprop(xb)
        end = start + xb.shape[0]
        merge_output[start:end] = xb
        start = end

    x = merge_output

    top_trunk = Sequential(top1).layers
    for ll in top_trunk:
        x = ll.fprop(x)

    neon_out_ref = x.get()
    assert allclose_with_out(neon_out_ref, neon_out[0], rtol=0)

    # Now do second branch
    neon_out_ref2 = branch2.fprop(main2_out).get()
    assert allclose_with_out(neon_out_ref2, neon_out[1])

    neon_logger.display("Beginning Back prop")
    erra = [np.random.random(d.shape) for d in neon_out]
    err = [be.array(d) for d in erra]
    neon_layer.layers[0].layers[0].deltas = be.iobuf(inshape)
    neon_layer.bprop(err)

    bottom_neon_deltas = neon_layer.layers[0].layers[1].deltas.get()
    middle_neon_deltas = neon_layer.layers[1].layers[1].deltas.get()

    # Reference back prop: top branch of path 1, then path 2, then each
    # inception branch fed from the Tree's own error views.
    err0 = err[0]
    for ll in reversed(top_trunk):
        err0 = ll.bprop(err0)

    err1 = err[1]
    for ll in reversed(lbranch2):
        err1 = ll.bprop(err1)

    for bb, errb in zip((b1, b2, b3), neon_layer.layers[0].layers[-5].error_views):
        for ll in reversed(bb):
            errb = ll.bprop(errb)

    # Now sum up the deltas at the root of the branch layer and compare
    ref_deltas = be.zeros_like(b1[0].deltas)
    ref_deltas[:] = alpha2 * lbranch2[0].deltas
    ref_deltas[:] = ref_deltas + b3[0].deltas + b2[0].deltas + b1[0].deltas
    neon_ref_deltas = ref_deltas.get()
    assert allclose_with_out(middle_neon_deltas, neon_ref_deltas, rtol=0)

    x = ref_deltas
    main2[0].deltas = be.iobuf(inshape)

    for ll in reversed(main2):
        x = ll.bprop(x)

    bottom_neon_ref_deltas = main2[1].deltas.get()
    assert allclose_with_out(bottom_neon_deltas, bottom_neon_ref_deltas, rtol=0)
Example #29
0
# Keyword-argument sets shared by the generator's Conv layers below
conv4 = {
    'init': init,
    'batch_norm': True,
    'activation': lrelu,
    'dilation': {'dil_h': 2, 'dil_w': 2, 'dil_d': 2},
}
conv5 = {
    'init': init,
    'batch_norm': True,
    'activation': lrelu,
    'padding': {'pad_h': 2, 'pad_w': 2, 'pad_d': 0},
    'dilation': {'dil_h': 2, 'dil_w': 2, 'dil_d': 3},
}
conv6 = {
    'init': init,
    'batch_norm': False,
    'activation': lrelu,
    'padding': {'pad_h': 1, 'pad_w': 0, 'pad_d': 3},
}

# Generator: the Linear layer emits 64 * 7 * 7 values, the same count as the
# (7, 7, 8, 8) volume it is reshaped into before the Conv layers.
# TODO(review): the input volume feeding the Linear layer is not specified here.
G_layers = [
    Linear(64 * 7 * 7, init=init),
    Reshape((7, 7, 8, 8)),
    Conv((6, 6, 8, 64), **conv4),
    Conv((6, 5, 8, 6), **conv5),
    Conv((3, 3, 8, 6), **conv6),
    Conv((2, 2, 2, 1), init=init, batch_norm=False, activation=relu),
]
# TODO(review): open question from the original author about an Embedding layer

generator = Sequential(G_layers, name="Generator")
discriminator = Sequential(D_layers, name="Discriminator")
layers = GenerativeAdversarial(generator=generator,
                               discriminator=discriminator)

# GAN cost wrapped in the generalized GAN cost, using the "modified" variant
cost = GeneralizedGANCost(costfunc=GANCost(func="modified"))
Example #30
0
def test_branch_model(backend_gpu):
    """Compare a two-branch tree container with a hand-built reference.

    The container returned by ``make_tree`` is run forward and backward.
    Its branch outputs and the deltas at its root layer are then compared
    with the values produced by three separate ``Sequential`` containers
    (trunk, branch 1, branch 2) that are given the same ``W`` weights.
    Both comparisons require a maximum absolute difference below 1e-9.

    Arguments:
        backend_gpu: test fixture; not referenced in the body (presumably
            it selects the backend exposed through ``NervanaObject.be``).
    """
    be = NervanaObject.be

    # Layer specifications: a shared trunk followed by two parallel branches.
    trunk = [
        dict(layer=Conv, config=dict(fshape=(5, 5, 16), **common)),
        dict(layer=Pooling, config=dict(op='max', **pool2s1p1)),
    ]
    branch1 = [
        dict(layer=Conv, config=dict(fshape=(5, 5, 32), **common)),
        dict(layer=Pooling, config=dict(op='max', **pool2s1p1)),
        dict(layer=Affine, config=dict(nout=200, **common)),
        dict(layer=Affine, config=dict(nout=10, init=init1, activation=relu)),
    ]
    branch2 = [
        dict(layer=Conv, config=dict(fshape=(3, 3, 32), **common)),
        dict(layer=Pooling, config=dict(op='max', **pool2s1p1)),
        dict(layer=Affine, config=dict(nout=256, **common)),
        dict(layer=Affine, config=dict(nout=10, init=init1, activation=relu)),
    ]

    alphas = [1, 1]
    neon_layer, trunk_layers, b1_layers, b2_layers = make_tree(
        trunk, branch1, branch2, alphas)

    # Random input with one column per batch item.
    inshape = (16, 32, 32)
    inp = be.array(np.random.random((np.prod(inshape), be.bsz)))

    # Forward pass through the tree container under test.
    neon_layer.configure(inshape)
    neon_layer.allocate()
    neon_layer.allocate_deltas()
    neon_out = [out.get() for out in neon_layer.fprop(inp)]

    # Reference model: trunk and branches as independent containers, with
    # both branches configured on the trunk's output shape.
    ref_trunk = Sequential(trunk_layers)
    ref_branches = [Sequential(b1_layers), Sequential(b2_layers)]
    ref_layers = [ref_trunk] + ref_branches

    ref_trunk.configure(inshape)
    for branch in ref_branches:
        branch.configure(ref_trunk.out_shape)
    for ref in ref_layers:
        ref.allocate()
    for ref in ref_layers:
        ref.allocate_deltas()

    # Copy the weights from the tree into the reference layers, pairing
    # parameterised layers in trunk, branch 1, branch 2 order.
    ref_weight_layers = [
        lyr for ref in ref_layers for lyr in ref.layers if lyr.has_params
    ]
    for ref_lyr, neon_lyr in zip(ref_weight_layers,
                                 neon_layer.layers_to_optimize):
        ref_lyr.set_params({'params': {'W': neon_lyr.W.get()}})

    # Reference forward pass: trunk first, then each branch on its output.
    inp_middle = ref_trunk.fprop(inp)
    ref_out = [branch.fprop(inp_middle).get() for branch in ref_branches]

    for got, expected in zip(neon_out, ref_out):
        difference = np.max(np.abs(got - expected))
        assert difference < 1e-9

    # Back prop: one random error array per branch output.
    erra = [np.random.random(out.shape) for out in neon_out]
    err = [be.array(e) for e in erra]

    # Root layer of the trunk inside the tree.  It is flagged as having a
    # previous layer and given a delta buffer -- presumably so that bprop
    # computes deltas all the way down to the input.
    input_layer = neon_layer.layers[0].layers[0]
    input_layer.prev_layer = True
    input_layer.set_deltas([be.iobuf(inshape)])

    neon_layer.bprop(err)
    errp = input_layer.deltas.get()

    # Same setup for the first layer of every reference container: the
    # trunk receives the model input shape, the branches the trunk output.
    for idx, ref in enumerate(ref_layers):
        first = ref.layers[0]
        first.prev_layer = True
        if idx == 0:
            delta_shape = inshape
        else:
            delta_shape = ref_trunk.out_shape
        first.set_deltas([be.iobuf(delta_shape)])

    joined_err = be.iobuf(ref_trunk.out_shape)

    # Back-propagate the branches in reverse order (branch 2, then branch 1)
    # and sum their errors at the point where they join the trunk.
    branch_errs = []
    for branch, branch_err, alpha in list(zip(ref_branches, err, alphas))[::-1]:
        branch_errs.append(branch.bprop(branch_err, alpha))
    joined_err[:] = branch_errs[0] + branch_errs[1]

    err_ref = ref_trunk.bprop(joined_err).get()

    difference = np.max(np.abs(err_ref - errp))
    neon_logger.display("Max difference: {}".format(difference))
    assert difference < 1e-9