Example #1
    def initialize(self,
                   inputs,
                   loss,
                   target,
                   priority_expr,
                   givens=None,
                   lr_mult=1):
        self._target = target
        params = target.get_params(trainable=True)
        gradients = theano.grad(loss, wrt=params, disconnected_inputs="ignore")

        if self._scale_conv_grads:  # (for dueling network architecture)
            gradients = scale_conv_gradients(params,
                                             gradients,
                                             scale=2**(-1 / 2))

        gradients, grad_norm = apply_grad_norm_clip(gradients,
                                                    self._grad_norm_clip)
        lr = self._learning_rate * lr_mult  # (lr_mult can be shared variable)
        updates = self._update_method(gradients, params, learning_rate=lr)
        self._f_opt = ext.compile_function(
            inputs=inputs,
            outputs=[priority_expr, loss],
            updates=updates,
            givens=givens,
            log_name="grad_and_update",
        )
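The helper apply_grad_norm_clip is not shown on this page; a minimal sketch of what it might look like, assuming the usual global-norm clipping recipe (the returned grad_norm is the quantity the later examples log):

import theano.tensor as T

def apply_grad_norm_clip(gradients, clip=None):
    # Global L2 norm over all gradient tensors.
    grad_norm = T.sqrt(sum(T.sum(g ** 2) for g in gradients))
    if clip is not None:
        # Rescale every gradient by a common factor whenever the norm
        # exceeds the clip threshold; otherwise leave them untouched.
        scale = clip / T.maximum(grad_norm, clip)
        gradients = [g * scale for g in gradients]
    return gradients, grad_norm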
Example #2
    def initialize(self, inputs, loss, target, priority_expr,
            givens=None, lr_mult=1):
        self._target = target
        params = target.get_params(trainable=True)
        gradients = theano.grad(loss, wrt=params, disconnected_inputs="ignore")

        if self._scale_conv_grads:
            gradients = scale_conv_gradients(params, gradients,
                scale=2 ** (-1 / 2))

        # Compute gradient and save to GPU vector.
        flat_grad, shared_grad, flat_update = flat_shared_grad(target, gradients)
        self._shared_grad = shared_grad

        # All-reduce gradient in-place in shared_grad, then reshape
        gradients, avg_factor_var = avg_grads_from_flat(shared_grad, params)
        self._avg_factor_var = avg_factor_var

        gradients, grad_norm = apply_grad_norm_clip(gradients, self._grad_norm_clip)
        lr = self._learning_rate * lr_mult  # (lr_mult can be shared variable)
        updates = self._update_method(gradients, params, learning_rate=lr)

        self._f_gradient = ext.compile_function(
            inputs=inputs,
            outputs=[priority_expr, loss],
            updates=[flat_update],
            givens=givens,
            log_name="gradient",
        )

        self._f_update = ext.compile_function(
            inputs=[],
            updates=updates,
            log_name="update",
        )
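flat_shared_grad packs all gradients into a single GPU buffer so it can be all-reduced in place between _f_gradient and _f_update. A rough sketch, assuming the buffer is sized from the trainable parameters:

import numpy as np
import theano
import theano.tensor as T

def flat_shared_grad(target, gradients):
    # Concatenate every gradient tensor into one flat symbolic vector.
    flat_grad = T.concatenate([T.flatten(g) for g in gradients])
    # Persistent GPU buffer of the same total size, written by flat_update.
    n = sum(int(np.prod(p.get_value(borrow=True).shape))
            for p in target.get_params(trainable=True))
    shared_grad = theano.shared(
        np.zeros(n, dtype=theano.config.floatX), name="flat_grad")
    flat_update = (shared_grad, flat_grad)
    return flat_grad, shared_grad, flat_update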
Example #3
    def initialize(self, inputs, losses, constraints, target,
            givens=None, lr_mult=1):
        self._target = target
        loss = sum(losses)
        params = target.get_params(trainable=True)
        gradients = theano.grad(loss, wrt=params, disconnected_inputs='ignore')

        # Phase 1: Compute gradient and save to GPU vector
        flat_grad, shared_grad, flat_update = flat_shared_grad(target, gradients)
        self._shared_grad = shared_grad

        # Phase 2: All-reduce gradient in-place in shared_grad, then reshape
        gradients, avg_factor_var = avg_grads_from_flat(shared_grad, params)
        self._avg_factor_var = avg_factor_var  # (set later as 1 / n_gpu)

        # Phase 3: Apply combined gradient locally
        gradients, grad_norm = apply_grad_norm_clip(gradients, self._grad_norm_clip)
        lr = self._learning_rate * lr_mult  # (lr_mult can be shared variable)
        updates = self._update_method(gradients, params, learning_rate=lr)

        self._f_grad = ext.compile_function(
            inputs=inputs,
            outputs=loss,
            updates=[flat_update],
            givens=givens,
            log_name="gradient",
        )
        self._f_update = ext.compile_function(
            inputs=[],
            outputs=grad_norm,
            updates=updates,
            log_name="update",
        )
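avg_grads_from_flat does the reverse mapping: it slices the (all-reduced) flat buffer back into per-parameter gradients and rescales by the averaging factor noted in the comment on self._avg_factor_var. A sketch under those assumptions:

import numpy as np
import theano

def avg_grads_from_flat(shared_grad, params):
    # Averaging factor applied after the all-reduce sum; the caller is
    # expected to set it later (e.g. to 1 / n_gpu).
    avg_factor_var = theano.shared(
        np.array(1.0, dtype=theano.config.floatX), name="avg_factor")
    gradients = []
    i = 0
    for p in params:
        shape = p.get_value(borrow=True).shape
        size = int(np.prod(shape))
        # Slice out this parameter's gradient and restore its shape.
        gradients.append(
            shared_grad[i:i + size].reshape(shape) * avg_factor_var)
        i += size
    return gradients, avg_factor_var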
Example #4
    def initialize(self,
                   inputs,
                   losses,
                   constraints,
                   target,
                   givens=None,
                   lr_mult=1):
        self._target = target
        loss = sum(losses)
        params = target.get_params(trainable=True)
        gradients = theano.grad(loss, wrt=params, disconnected_inputs='ignore')

        gradients, grad_norm = apply_grad_norm_clip(gradients,
                                                    self._grad_norm_clip)
        lr = self._learning_rate * lr_mult  # (lr_mult can be shared variable)
        updates = self._update_method(gradients, params, learning_rate=lr)

        # Prepare to load data onto GPU (and shuffle indexes there).
        load_updates, givens, opt_inputs = make_shared_inputs(
            inputs, self._shuffle)

        self._f_load = ext.compile_function(
            inputs=inputs,
            updates=load_updates,
            log_name="load",
        )
        self._f_opt = ext.compile_function(
            inputs=opt_inputs,
            outputs=[loss, grad_norm],
            updates=updates,
            givens=givens,
            log_name="grad_and_update",
        )
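make_shared_inputs is what lets _f_load push the whole batch onto the GPU once, after which _f_opt only receives minibatch indexes. A rough sketch of one way it could work, assuming index-based givens:

import numpy as np
import theano
import theano.tensor as T

def make_shared_inputs(inputs, shuffle):
    # One resizable GPU buffer per symbolic input; _f_load fills them.
    shared_vars = [
        theano.shared(np.zeros((1,) * inp.ndim, dtype=inp.dtype))
        for inp in inputs]
    load_updates = [(s, inp) for s, inp in zip(shared_vars, inputs)]
    if shuffle:
        # The optimization function takes a vector of (shuffled) indexes
        # and reads the corresponding rows from the GPU buffers.
        idxs = T.ivector("idxs")
        givens = [(inp, s[idxs]) for inp, s in zip(inputs, shared_vars)]
        opt_inputs = [idxs]
    else:
        start, stop = T.lscalar("start"), T.lscalar("stop")
        givens = [(inp, s[start:stop]) for inp, s in zip(inputs, shared_vars)]
        opt_inputs = [start, stop]
    return load_updates, givens, opt_inputs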
Example #5
    def initialize(self, inputs, losses, constraints, target, lr_mult=1):
        self._target = target
        loss = sum(losses)
        params = target.get_params(trainable=True)
        gradients = theano.grad(loss, wrt=params, disconnected_inputs='ignore')

        gradients, grad_norm = apply_grad_norm_clip(gradients, self._grad_norm_clip)

        # Phase 1: Compute gradient and save to GPU vector
        flat_grad, shared_grad, flat_update = flat_shared_grad(target, gradients)

        # Phase 2: apply gradient chunks to update central params; e.g. rmsprop
        lr = self._learning_rate * lr_mult
        if self.n_update_chunks > 1:
            updates_args = (shared_grad, lr, self.n_update_chunks, self._update_method_args)
            chunk_inputs, outputs_list, updates, idxs = \
                chunked_updates(self._update_method_name, updates_args)
            self._chunk_idxs = idxs
        else:
            whole_inputs, whole_outputs, whole_updates = \
                whole_update(self._update_method_name, shared_grad, lr, self._update_method_args)

        # Phase 3: copy new param values from shared_grad to params
        copy_updates = copy_params_from_flat(params, shared_grad)

        # Phase 1
        self._f_gradient = ext.compile_function(
            inputs=inputs,
            outputs=[loss, grad_norm],
            updates=[flat_update],
            log_name="gradient",
        )

        # Phase 2
        if self.n_update_chunks > 1:
            f_update_chunks = list()
            for i, (outputs, update) in enumerate(zip(outputs_list, updates)):
                f_update_chunks.append(ext.compile_function(
                    inputs=chunk_inputs,
                    outputs=outputs,
                    updates=[update],
                    log_name="update_chunk_{}".format(i))
                )
            self._f_update_chunks = f_update_chunks
        else:
            self._f_update = ext.compile_function(
                inputs=whole_inputs,
                outputs=whole_outputs,
                updates=whole_updates,
                log_name="update",
            )

        # Phase 3
        self._f_copy = ext.compile_function(
            inputs=[],
            updates=copy_updates,
            log_name="copy_params",
        )
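copy_params_from_flat closes the loop for Phase 3: once the chunked update has written the new parameter values into the flat buffer, each slice is copied back into the corresponding parameter. A sketch under that assumption:

import numpy as np

def copy_params_from_flat(params, shared_grad):
    # One update per parameter: take its slice of the flat buffer and
    # reshape it back to the parameter's own shape.
    copy_updates = []
    i = 0
    for p in params:
        shape = p.get_value(borrow=True).shape
        size = int(np.prod(shape))
        copy_updates.append((p, shared_grad[i:i + size].reshape(shape)))
        i += size
    return copy_updates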
Example #6
    def initialize(self,
                   inputs,
                   loss,
                   target,
                   priority_expr,
                   givens=None,
                   lr_mult=1):
        self._target = target
        params = target.get_params(trainable=True)
        gradients = theano.grad(loss, wrt=params, disconnected_inputs="ignore")

        if self._scale_conv_grads:
            gradients = scale_conv_gradients(params,
                                             gradients,
                                             scale=2**(-1 / 2))
        # if self._layerwise_stats:
        #     self._n_params = len(params)
        #     param_grad_norms = [T.sqrt(T.sum(g ** 2)) for g in gradients]

        gradients, grad_norm = apply_grad_norm_clip(gradients,
                                                    self._grad_norm_clip)
        lr = self._learning_rate * lr_mult  # (lr_mult can be shared variable)
        updates, steps = self._update_method(gradients,
                                             params,
                                             learning_rate=lr)

        # step_norm = T.sqrt(sum(T.sum(s ** 2) for s in steps))
        # if self._layerwise_stats:
        #     param_step_norms = [T.sqrt(T.sum(s ** 2)) for s in steps]

        # outputs = [priority_expr, loss, grad_norm, step_norm]
        outputs = [priority_expr, loss]
        # if self._layerwise_stats:
        #     outputs += param_grad_norms + param_step_norms

        self._f_opt = ext.compile_function(
            inputs=inputs,
            outputs=outputs,
            updates=updates,
            givens=givens,
            log_name="grad_and_update",
        )
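scale_conv_gradients is also external to these snippets; per the comment in Example #1 it implements the dueling-network trick of rescaling the gradients of the shared convolutional layers by 2 ** (-1 / 2). A minimal sketch, assuming convolutional parameters can be recognized by name:

def scale_conv_gradients(params, gradients, scale):
    # Multiply the gradients of convolutional-layer parameters by `scale`
    # (2 ** -0.5 above), leaving the remaining gradients unchanged.
    return [g * scale if "conv" in (p.name or "") else g
            for p, g in zip(params, gradients)]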
Example #7
    def initialize(self,
                   inputs,
                   loss,
                   target,
                   priority_expr,
                   givens=None,
                   lr_mult=1):
        self._target = target
        params = target.get_params(trainable=True)
        gradients = theano.grad(loss, wrt=params, disconnected_inputs="ignore")

        if self._scale_conv_grads:
            gradients = scale_conv_gradients(params,
                                             gradients,
                                             scale=2**(-1 / 2))

        gradients, grad_norm = apply_grad_norm_clip(gradients,
                                                    self._grad_norm_clip)

        # Phase 1: Compute gradient and save to GPU vector
        flat_grad, shared_grad, flat_update = flat_shared_grad(
            target, gradients)

        # Phase 2: apply gradient chunks to central params
        lr = self._learning_rate * lr_mult
        updates_args = (shared_grad, lr, self._n_update_chunks,
                        self._update_method_args)
        chunk_inputs, outputs_list, updates, idxs = \
            chunked_updates(self._update_method_name, updates_args)
        self._chunk_idxs = idxs

        # Phase 3: copy new param values from shared_grad to params
        copy_updates = copy_params_from_flat(params, shared_grad)

        if self._update_method_name == "adam":
            copy_updates.append(updates.pop())  # (Move the t update)

        # Phase 1
        self._f_gradient = ext.compile_function(
            inputs=inputs,
            outputs=[priority_expr, loss, grad_norm],
            updates=[flat_update],
            givens=givens,
            log_name="gradient",
        )

        # Phase 2
        f_update_chunks = list()
        for i, (outputs, update) in enumerate(zip(outputs_list, updates)):
            f_update_chunks.append(
                ext.compile_function(inputs=chunk_inputs,
                                     outputs=outputs,
                                     updates=[update],
                                     log_name="update_chunk_{}".format(i)))
        self._f_update_chunks = f_update_chunks

        # Phase 3
        self._f_copy = ext.compile_function(
            inputs=[],
            updates=copy_updates,
            log_name="copy_params",
        )
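The compiled functions imply a three-phase training step. A hypothetical driver is sketched below; the assumption that each chunk function receives its slice boundaries from self._chunk_idxs is a guess about the chunk interface, which is not shown on this page:

def optimize_step(self, *inputs):
    # Phase 1: compute the gradient and write it into the flat GPU buffer.
    priority, loss, grad_norm = self._f_gradient(*inputs)
    # (An all-reduce of the shared gradient buffer would go here when
    #  running with multiple GPUs.)
    # Phase 2: apply the update method chunk by chunk.
    for f_chunk, idxs in zip(self._f_update_chunks, self._chunk_idxs):
        f_chunk(*idxs)
    # Phase 3: copy the new parameter values back into the network.
    self._f_copy()
    return priority, loss, grad_norm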
Example #8
    def initialize(self, inputs, losses, constraints, target, batch_size, givens=None, lr_mult=1):
        self._target = target
        params = target.get_params(trainable=True)
        loss = T.mean(losses)
        gradients = theano.grad(loss, wrt=params, disconnected_inputs='ignore')

        # sep_grads = [theano.grad(losses[d], wrt=params, disconnected_inputs='ignore') for d in range(batch_size)]
        # lyr_sep_grads = [list() for _ in range(len(params))]
        # for d_grads in sep_grads:
        #     for d_grad, lyr_sep_grad in zip(d_grads, lyr_sep_grads):
        #         lyr_sep_grad.append(d_grad)
        # lyr_sep_grads = [T.stack(lsg) for lsg in lyr_sep_grads]

        # lyr_sep_grd_sqnorms = [T.sum(lsg ** 2) for lsg in lyr_sep_grads]
        # lyr_cmb_grd_sqnorms = [T.sum(T.sum(lsg, axis=0) ** 2) for lsg in lyr_sep_grads]
        # lyr_grad_diversities = [sep / cmb for sep, cmb in
        #     zip(lyr_sep_grd_sqnorms, lyr_cmb_grd_sqnorms)]
        # tot_sep_grd_sqnorms = T.sum(lyr_sep_grd_sqnorms)
        # tot_cmb_grd_sqnorms = T.sum(lyr_cmb_grd_sqnorms)
        # grad_diversity = tot_sep_grd_sqnorms / tot_cmb_grd_sqnorms
        # batch_size_bound = batch_size * grad_diversity

        # # Jacobians broke inside Theano.
        # # jacobians = theano.gradient.jacobian(losses, wrt=params, disconnected_inputs='ignore')

        # # lyr_sep_grd_sqnorms = [T.sum(j ** 2) for j in jacobians]
        # # lyr_cmb_grd_sqnorms = [T.sum(T.sum(j, axis=0) ** 2) for j in jacobians]
        # # lyr_grad_diversities = [sep / cmb for sep, cmb in
        # #     zip(lyr_sep_grd_sqnorms, lyr_cmb_grd_sqnorms)]
        # # tot_sep_grd_sqnorms = T.sum(lyr_sep_grd_sqnorms)
        # # tot_cmb_grd_sqnorms = T.sum(lyr_cmb_grd_sqnorms)
        # # grad_diversity = tot_sep_grd_sqnorms / tot_cmb_grd_sqnorms
        # # n = inputs[0].shape[0]
        # # batch_size_bound = n * grad_diversity

        # check_cmb_grd_sqnorms = sum(T.sum(g ** 2) for g in gradients) * batch_size

        if self._layerwise_stats:
            self._n_params = len(params)
            param_grad_norms = [T.sqrt(T.sum(g ** 2)) for g in gradients]

        gradients, grad_norm = apply_grad_norm_clip(gradients, self._grad_norm_clip)
        lr = self._learning_rate * lr_mult  # (lr_mult can be theano shared variable)
        updates, steps = self._update_method(gradients, params, learning_rate=lr)

        step_norm = T.sqrt(sum(T.sum(s ** 2) for s in steps))
        if self._layerwise_stats:
            param_step_norms = [T.sqrt(T.sum(s ** 2)) for s in steps]

        outputs = [grad_norm, step_norm]
        # grad_div_outputs = [grad_diversity, batch_size_bound]
        if self._layerwise_stats:
            outputs += param_grad_norms + param_step_norms
            # grad_div_outputs += lyr_grad_diversities
        # ipdb.set_trace()
        self._opt_fun["optimize"] = ext.compile_function(
            inputs=inputs,
            outputs=outputs,
            updates=updates,
            givens=givens,
            log_name="grad_and_update",
        )

        self._opt_fun["gradient"] = ext.compile_function(
            inputs=inputs,
            outputs=gradients,
            log_name="gradients_only",
        )
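Unlike the earlier examples, the update method here returns both the updates and the per-parameter steps, so that step_norm (and the optional layerwise step norms) can be logged. A minimal sketch of an update method with that interface, using plain SGD as a stand-in:

def sgd_with_steps(gradients, params, learning_rate):
    # Return the steps alongside the updates so the caller can build
    # step_norm = sqrt(sum of squared step elements).
    steps = [-learning_rate * g for g in gradients]
    updates = [(p, p + s) for p, s in zip(params, steps)]
    return updates, steps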