Example #1
    def __init__(self, config, models):
        model = models[0]
        assert isinstance(model, Model)
        self.config = config
        self.model = model
        self.opt = tf.train.AdadeltaOptimizer(config.init_lr)

        self.var_list = model.get_var_list()
        self.global_step = model.get_global_step()
        self.summary = model.summary
        self.models = models
        losses = []
        grads_list = []
        for gpu_idx, model in enumerate(models):
            with tf.name_scope("grads_{}".format(gpu_idx)), tf.device(
                    "/{}:{}".format(config.device_type, gpu_idx)):
                loss = model.get_loss()
                grads = self.opt.compute_gradients(loss,
                                                   var_list=self.var_list)
                #grads = [(tf.clip_by_value(grad, -1., 1.), var) for (grad, var) in grads if not grad is None]
                losses.append(loss)
                grads_list.append(grads)

        self.loss = tf.add_n(losses) / len(losses)
        self.grads = average_gradients(grads_list)
        self.train_op = self.opt.apply_gradients(self.grads,
                                                 global_step=self.global_step)
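All of the examples on this page rely on an average_gradients helper that is not shown. Below is a minimal sketch of what such a helper typically looks like in TF 1.x, assuming each entry of grads_list is the (gradient, variable) list returned by compute_gradients on one tower; the actual implementations in these repositories may differ.

import tensorflow as tf

def average_gradients(grads_list):
    """Average per-tower (gradient, variable) pairs into one list."""
    averaged = []
    # zip(*grads_list) groups the i-th (grad, var) pair of every tower.
    for grad_and_vars in zip(*grads_list):
        var = grad_and_vars[0][1]  # towers share variables, any reference works
        grads = [g for g, _ in grad_and_vars if g is not None]
        if not grads:
            averaged.append((None, var))
            continue
        grad = tf.reduce_mean(tf.stack(grads, axis=0), axis=0)
        averaged.append((grad, var))
    return averaged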
Example #2
    def __init__(self, config, models):
        model = models[0]
        assert isinstance(model, Model)
        self.config = config
        self.model = model
        self.opt = tf.train.GradientDescentOptimizer(config.init_lr)
        self.var_list = model.get_var_list()  # it's None here
        self.global_step = model.get_global_step()  # a placeholder
        self.summary = model.summary
        self.models = models
        losses = []
        grads_list = []
        for gpu_idx, model in enumerate(models):
            with tf.name_scope("grads_{}".format(gpu_idx)), tf.device(
                    "/{}:{}".format(config.device_type, gpu_idx)):
                loss = model.get_loss()
                grads = self.opt.compute_gradients(loss,
                                                   var_list=self.var_list)
                losses.append(loss)
                grads_list.append(grads)

        self.loss = tf.add_n(losses) / len(losses)
        self.grads = average_gradients(grads_list)
        self.train_op = self.opt.apply_gradients(self.grads,
                                                 global_step=self.global_step)
Example #3
    def __init__(self, config, models):
        model = models[0]
        assert isinstance(model, Model)
        self.config = config
        self.model = model
        self.opt = tf.train.AdamOptimizer(config.init_lr)
        self.var_list = model.get_var_list('model_network')
        self.global_step = model.get_global_step()
        self.summary = model.summary
        self.models = models
        losses, grads_list = [], []

        for gpu_idx, model in enumerate(models):
            with tf.name_scope("grads_{}".format(gpu_idx)), tf.device(
                    "/{}:{}".format(config.device_type, gpu_idx)):
                loss = model.get_loss()
                grads = self.opt.compute_gradients(loss,
                                                   var_list=self.var_list)
                losses.append(loss)
                grads_list.append(grads)

        self.loss = tf.add_n(losses) / len(losses)
        self.grads = average_gradients(grads_list)

        grad_vars = [x[1] for x in self.grads]
        gradients = [x[0] for x in self.grads]
        clipped, _ = tf.clip_by_global_norm(gradients, 2)

        self.train_op = self.opt.apply_gradients(
            zip(clipped, grad_vars), global_step=self.global_step)

        with tf.control_dependencies([self.train_op]):
            self.dummy = tf.constant(0, name='dummy')
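The dummy constant above is a small trick: because of the control dependency, fetching it forces train_op to run first. A hypothetical usage, assuming an open session sess, a trainer instance of this class, and a prepared feed_dict:

# One sess.run takes a training step (via the control dependency) and
# returns the averaged loss.
_, loss_value = sess.run([trainer.dummy, trainer.loss], feed_dict=feed_dict)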
Example #4
    def __init__(self, config, models):
        model = models[0]
        assert isinstance(model, Model)
        self.config = config
        self.model = model
        self.opt = tf.train.AdamOptimizer(config.init_lr)
        self.var_list = model.get_var_list()
        self.global_step = model.get_global_step()
        self.summary = model.summary
        self.models = models
        losses_task1 = []
        losses_task2 = []
        grads_list_task1 = []
        grads_list_task2 = []
        #print("VAR LIST",self.var_list) # TODO: Check is this should not be None?
        for gpu_idx, model in enumerate(models):
            with tf.name_scope("grads_{}".format(gpu_idx)), tf.device(
                    "/{}:{}".format(config.device_type, gpu_idx)):
                loss_task1 = model.get_loss_task1()
                loss_task2 = model.get_loss_task2()
                grads_task1 = self.opt.compute_gradients(
                    loss_task1, var_list=self.var_list)
                grads_task2 = self.opt.compute_gradients(
                    loss_task2, var_list=self.var_list)
                losses_task1.append(loss_task1)
                losses_task2.append(loss_task2)
                grads_list_task1.append(grads_task1)
                grads_list_task2.append(grads_task2)

        self.loss_task1 = tf.add_n(losses_task1) / len(losses_task1)
        self.loss_task2 = tf.add_n(losses_task2) / len(losses_task2)
        self.grads_task1 = average_gradients(grads_list_task1)
        self.grads_task2 = average_gradients(grads_list_task2)

        self.train_op_task1 = self.opt.apply_gradients(
            self.grads_task1, global_step=self.global_step)
        self.train_op_task2 = self.opt.apply_gradients(
            self.grads_task2, global_step=self.global_step)
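Example #4 builds one train op per task, and each apply_gradients call advances the shared global_step. A hypothetical training loop that alternates between the tasks; sess, trainer, num_steps and the batch feeds are assumed names, not part of the example:

for step in range(num_steps):
    # global_step counts optimizer updates across both tasks in this scheme.
    if step % 2 == 0:
        sess.run(trainer.train_op_task1, feed_dict=task1_batch_feed)
    else:
        sess.run(trainer.train_op_task2, feed_dict=task2_batch_feed)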
Example #5
    def __init__(self, config, models):
        model = models[0]
        assert isinstance(model, Model)
        self.config = config
        self.model = model
        self.global_step = model.get_global_step()
        if 'adam' == config.optimizer:
            self.opt = tf.train.AdamOptimizer(config.init_lr)
        elif 'gd' == config.optimizer:
            lr = tf.train.exponential_decay(config.init_lr,
                                            self.global_step,
                                            tf.to_int32(config.num_steps / 3),
                                            0.1,
                                            staircase=True)
            self.opt = tf.train.GradientDescentOptimizer(lr)
        else:
            raise ValueError('Unsupported optimizer')
        self.var_list = model.get_var_list()
        self.summary = model.summary
        self.models = models
        losses = []
        grads_list = []
        for gpu_idx, _model in enumerate(models):
            with tf.name_scope("grads_{}".format(gpu_idx)), tf.device(
                    "/{}:{}".format(config.device_type, gpu_idx)):
                loss = _model.get_loss()
                grads = self.opt.compute_gradients(loss,
                                                   var_list=self.var_list)
                losses.append(loss)
                grads_list.append(grads)

        self.loss = tf.add_n(losses) / len(losses)
        self.grads = average_gradients(grads_list)
        if config.freeze_mode:
            self.grads = zerout_gradients_for_zero_weights(
                self.grads, mode=config.freeze_mode)
        self.train_op = self.opt.apply_gradients(self.grads,
                                                 global_step=self.global_step)
        if model.get_sparsity_op():
            with tf.control_dependencies([self.train_op]):
                self.train_op = tf.group(self.train_op,
                                         model.get_sparsity_op())
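zerout_gradients_for_zero_weights is another helper that is not shown. Judging only from its name and the freeze_mode flag, it presumably masks out gradients for weights that are already zero (for example after pruning), so frozen weights stay frozen. A guessed sketch under that assumption; the real helper and the meaning of mode may differ:

import tensorflow as tf

def zerout_gradients_for_zero_weights(grads_and_vars, mode='element'):
    """Keep each gradient only where the corresponding weight is non-zero."""
    # 'mode' is kept for signature compatibility; this sketch ignores it.
    frozen = []
    for grad, var in grads_and_vars:
        if grad is None:
            frozen.append((None, var))
            continue
        mask = tf.cast(tf.not_equal(var, 0.0), grad.dtype)
        frozen.append((grad * mask, var))
    return frozen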
Example #6
    def __init__(self, config, models):
        model = models[0]
        assert isinstance(model, Model)
        self.config = config
        self.model = model
        self.opt = tf.train.AdamOptimizer(config.init_lr)
        self.var_list = model.get_var_list()
        self.global_step = model.get_global_step()
        self.summary = model.summary
        self.models = models
        losses = []
        grads_list = []
        for gpu_idx, model in enumerate(models):
            with tf.name_scope("grads_{}".format(gpu_idx)), tf.device(
                    "/{}:{}".format(config.device_type, gpu_idx)):
                loss = model.get_loss()
                grads = self.opt.compute_gradients(loss,
                                                   var_list=self.var_list)
                losses.append(loss)
                grads_list.append(grads)

        self.loss = tf.add_n(losses) / len(losses)
        self.grads = average_gradients(grads_list)
        self.train_op = self.opt.apply_gradients(self.grads,
                                                 global_step=self.global_step)
        self.logits = model.logits
        self.logits2 = model.logits2
        self.tensorDic = model.tensor_dict
        self.y = model.y
        self.y2 = model.y2
        self.wy = model.wy
        self.yp = model.yp
        self.yp2 = model.yp2
        self.wyp = model.wyp
        self.correctIndex = model.correctIndex
        self.Pos1 = model.Pos1
        self.Neg1 = model.Neg1
        self.mask1 = model.mask1
        self.invmask1 = model.invmask1
Example #7
    def __init__(self, config, models):
        model = models[0]
        assert isinstance(model, Model)
        self.config = config
        self.model = model
        self.opt = tf.train.AdadeltaOptimizer(config.init_lr)
        self.var_list = model.get_var_list()
        self.global_step = model.get_global_step()
        self.summary = model.summary
        self.models = models
        losses = []
        grads_list = []
        for gpu_idx, model in enumerate(models):
            with tf.name_scope("grads_{}".format(gpu_idx)), tf.device("/gpu:{}".format(gpu_idx)):
                loss = model.get_loss()
                grads = self.opt.compute_gradients(loss, var_list=self.var_list)
                losses.append(loss)
                grads_list.append(grads)

        self.loss = tf.add_n(losses)/len(losses)
        self.grads = average_gradients(grads_list)
        self.train_op = self.opt.apply_gradients(self.grads, global_step=self.global_step)
Example #8
    def __init__(self, config, models):
        model = models[0]
        assert isinstance(model, Model)
        self.config = config
        self.model = model
        self.opt = tf.train.AdadeltaOptimizer(config.init_lr)
        self.global_step = model.get_global_step()
        self.summary = model.summary
        self.models = models
        losses = []
        grads_list = []
        with tf.variable_scope("grad"):
            for gpu_idx, model in enumerate(models):
                with tf.name_scope("grads_{}".format(gpu_idx)), tf.device("/{}:{}".format(config.device_type, gpu_idx)):
                    loss = model.get_loss()
                    grads = self.opt.compute_gradients(loss)
                    losses.append(loss)
                    grads_list.append(grads)
                    tf.get_variable_scope().reuse_variables()

        self.loss = tf.add_n(losses)/len(losses)
        self.grads = average_gradients(grads_list)
        self.train_op = self.opt.apply_gradients(self.grads, global_step=self.global_step)
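Example #8 calls tf.get_variable_scope().reuse_variables() inside the tower loop so that all towers share one set of parameters. A minimal standalone sketch of that TF 1.x sharing pattern, using a hypothetical build_tower in place of the Model class:

import tensorflow as tf

def build_tower(x):
    # tf.get_variable creates "w" on the first call and, once the scope is
    # marked for reuse, returns the same variable on later calls.
    w = tf.get_variable("w", shape=[4, 4])
    return tf.matmul(x, w)

with tf.variable_scope("grad"):
    outputs = []
    for gpu_idx in range(2):
        with tf.name_scope("grads_{}".format(gpu_idx)), tf.device("/gpu:{}".format(gpu_idx)):
            x = tf.placeholder(tf.float32, [None, 4])
            outputs.append(build_tower(x))
            # After the first tower defines the variables, later towers reuse them.
            tf.get_variable_scope().reuse_variables()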
Example #9
    def initialize(self):
        params = self.params
        sess = self.sess
        device_type = params.device_type
        summaries = []

        global_step = tf.get_variable('global_step', shape=[], dtype='int32',
                                      initializer=tf.constant_initializer(0), trainable=False)
        self.tensors['global_step'] = global_step

        epoch = tf.get_variable('epoch', shape=[], dtype='int32',
                                initializer=tf.constant_initializer(0), trainable=False)
        self.tensors['epoch'] = epoch

        learning_rate = tf.placeholder('float32', name='learning_rate')
        summaries.append(tf.scalar_summary("learning_rate", learning_rate))
        self.placeholders['learning_rate'] = learning_rate

        if params.opt == 'basic':
            opt = tf.train.GradientDescentOptimizer(learning_rate)
        elif params.opt == 'adagrad':
            opt = tf.train.AdagradOptimizer(learning_rate)
        else:
            raise Exception()

        grads_tensors = []
        correct_tensors = []
        loss_tensors = []
        for device_id, tower in enumerate(self.towers):
            with tf.device("/%s:%d" % (device_type, device_id)), tf.name_scope("%s_%d" % (device_type, device_id)) as scope:
                tower.initialize(scope)
                tf.get_variable_scope().reuse_variables()
                loss_tensor = tower.get_loss_tensor()
                loss_tensors.append(loss_tensor)
                correct_tensor = tower.get_correct_tensor()
                correct_tensors.append(correct_tensor)
                grads_tensor = opt.compute_gradients(loss_tensor)
                grads_tensors.append(grads_tensor)

        with tf.name_scope("gpu_sync"):
            loss_tensor = tf.reduce_mean(tf.pack(loss_tensors), 0, name='loss')
            correct_tensor = tf.concat(0, correct_tensors, name="correct")
            with tf.name_scope("average_gradients"):
                grads_tensor = average_gradients(grads_tensors)

        self.tensors['loss'] = loss_tensor
        self.tensors['correct'] = correct_tensor
        summaries.append(tf.scalar_summary(loss_tensor.op.name, loss_tensor))

        for grad, var in grads_tensor:
            if grad is not None:
                summaries.append(tf.histogram_summary(var.op.name+'/gradients', grad))
        self.tensors['grads'] = grads_tensor

        for var in tf.trainable_variables():
            summaries.append(tf.histogram_summary(var.op.name, var))

        apply_grads_op = opt.apply_gradients(grads_tensor, global_step=global_step)

        train_op = tf.group(apply_grads_op)
        self.tensors['train'] = train_op

        saver = tf.train.Saver(tf.all_variables())
        self.saver = saver

        summary_op = tf.merge_summary(summaries)
        self.tensors['summary'] = summary_op

        init_op = tf.initialize_all_variables()
        sess.run(init_op)
        self.writer = tf.train.SummaryWriter(params.log_dir, sess.graph)
        self.initialized = True
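In the initialize() examples the learning rate is a placeholder rather than a constant, so it must be fed on every training step. A hypothetical step, where trainer is an initialized instance of this class and batch_feed already holds the tower inputs:

feed = {trainer.placeholders['learning_rate']: 0.5}
feed.update(batch_feed)
_, loss_value, step = sess.run(
    [trainer.tensors['train'], trainer.tensors['loss'], trainer.tensors['global_step']],
    feed_dict=feed)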
Example #10
    def initialize(self):
        params = self.params
        sess = self.sess
        device_type = params.device_type
        summaries = []

        global_step = tf.get_variable('global_step',
                                      shape=[],
                                      dtype='int32',
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)
        self.tensors['global_step'] = global_step

        epoch = tf.get_variable('epoch',
                                shape=[],
                                dtype='int32',
                                initializer=tf.constant_initializer(0),
                                trainable=False)
        self.tensors['epoch'] = epoch

        learning_rate = tf.placeholder('float32', name='learning_rate')
        summaries.append(tf.scalar_summary("learning_rate", learning_rate))
        self.placeholders['learning_rate'] = learning_rate

        if params.opt == 'basic':
            opt = tf.train.GradientDescentOptimizer(learning_rate)
        elif params.opt == 'adagrad':
            opt = tf.train.AdagradOptimizer(learning_rate)
        elif params.opt == 'adam':
            opt = tf.train.AdamOptimizer()
        elif params.opt == 'adadelta':
            opt = tf.train.AdadeltaOptimizer(learning_rate)
        else:
            raise Exception()

        grads_pairs_dict = defaultdict(list)
        correct_tensors = []
        loss_tensors = []
        with tf.variable_scope("towers"):
            for device_id, tower in enumerate(self.towers):
                with tf.device("/%s:%d" %
                               (device_type, device_id)), tf.name_scope(
                                   "%s_%d" % (device_type, device_id)):
                    tower.initialize()
                    tf.get_variable_scope().reuse_variables()
                    loss_tensor = tower.get_loss_tensor()
                    loss_tensors.append(loss_tensor)
                    correct_tensor = tower.get_correct_tensor()
                    correct_tensors.append(correct_tensor)

                    for key, variables in tower.variables_dict.items():
                        grads_pair = opt.compute_gradients(loss_tensor,
                                                           var_list=variables)
                        grads_pairs_dict[key].append(grads_pair)

        with tf.name_scope("gpu_sync"):
            loss_tensor = tf.reduce_mean(tf.pack(loss_tensors), 0, name='loss')
            correct_tensor = tf.concat(0, correct_tensors, name="correct")
            with tf.name_scope("average_gradients"):
                grads_pair_dict = {
                    key: average_gradients(grads_pairs)
                    for key, grads_pairs in grads_pairs_dict.items()
                }
                if params.max_grad_norm:
                    grads_pair_dict = {
                        key: [(tf.clip_by_norm(grad,
                                               params.max_grad_norm), var)
                              for grad, var in grads_pair]
                        for key, grads_pair in grads_pair_dict.items()
                    }

        self.tensors['loss'] = loss_tensor
        self.tensors['correct'] = correct_tensor
        summaries.append(tf.scalar_summary(loss_tensor.op.name, loss_tensor))

        for key, grads_pair in grads_pair_dict.items():
            for grad, var in grads_pair:
                if grad is not None:
                    summaries.append(
                        tf.histogram_summary(var.op.name + '/gradients/' + key,
                                             grad))

        for var in tf.trainable_variables():
            summaries.append(tf.histogram_summary(var.op.name, var))

        apply_grads_op_dict = {
            key: opt.apply_gradients(grads_pair, global_step=global_step)
            for key, grads_pair in grads_pair_dict.items()
        }

        self.train_ops = {
            key: tf.group(apply_grads_op)
            for key, apply_grads_op in apply_grads_op_dict.items()
        }

        saver = tf.train.Saver(tf.all_variables(), max_to_keep=2)
        self.saver = saver

        summary_op = tf.merge_summary(summaries)
        self.tensors['summary'] = summary_op

        init_op = tf.initialize_all_variables()
        sess.run(init_op)
        if self.write_log:
            self.writer = tf.train.SummaryWriter(params.log_dir, sess.graph)
        self.initialized = True
Example #11
    def initialize(self):
        params = self.params
        sess = self.sess
        device_type = params.device_type
        summaries = []

        global_step = tf.get_variable('global_step', shape=[], dtype='int32',
                                      initializer=tf.constant_initializer(0), trainable=False)
        self.tensors['global_step'] = global_step

        epoch = tf.get_variable('epoch', shape=[], dtype='int32',
                                initializer=tf.constant_initializer(0), trainable=False)
        self.tensors['epoch'] = epoch

        learning_rate = tf.placeholder('float32', name='learning_rate')
        summaries.append(tf.scalar_summary("learning_rate", learning_rate))
        self.placeholders['learning_rate'] = learning_rate

        if params.opt == 'basic':
            opt = tf.train.GradientDescentOptimizer(learning_rate)
        elif params.opt == 'adagrad':
            opt = tf.train.AdagradOptimizer(learning_rate)
        elif params.opt == 'adam':
            opt = tf.train.AdamOptimizer()
        elif params.opt == 'adadelta':
            opt = tf.train.AdadeltaOptimizer(learning_rate)
        else:
            raise Exception()

        grads_pairs_dict = defaultdict(list)
        correct_tensors = []
        loss_tensors = []
        with tf.variable_scope("towers"):
            for device_id, tower in enumerate(self.towers):
                with tf.device("/%s:%d" % (device_type, device_id)), tf.name_scope("%s_%d" % (device_type, device_id)):
                    tower.initialize()
                    tf.get_variable_scope().reuse_variables()
                    loss_tensor = tower.get_loss_tensor()
                    loss_tensors.append(loss_tensor)
                    correct_tensor = tower.get_correct_tensor()
                    correct_tensors.append(correct_tensor)

                    for key, variables in tower.variables_dict.items():
                        grads_pair = opt.compute_gradients(loss_tensor, var_list=variables)
                        grads_pairs_dict[key].append(grads_pair)

        with tf.name_scope("gpu_sync"):
            loss_tensor = tf.reduce_mean(tf.pack(loss_tensors), 0, name='loss')
            correct_tensor = tf.concat(0, correct_tensors, name="correct")
            with tf.name_scope("average_gradients"):
                grads_pair_dict = {key: average_gradients(grads_pairs)
                                   for key, grads_pairs in grads_pairs_dict.items()}
                if params.max_grad_norm:
                    grads_pair_dict = {key: [(tf.clip_by_norm(grad, params.max_grad_norm), var)
                                             for grad, var in grads_pair]
                                       for key, grads_pair in grads_pair_dict.items()}

        self.tensors['loss'] = loss_tensor
        self.tensors['correct'] = correct_tensor
        summaries.append(tf.scalar_summary(loss_tensor.op.name, loss_tensor))

        for key, grads_pair in grads_pair_dict.items():
            for grad, var in grads_pair:
                if grad is not None:
                    summaries.append(tf.histogram_summary(var.op.name+'/gradients/'+key, grad))

        for var in tf.trainable_variables():
            summaries.append(tf.histogram_summary(var.op.name, var))

        apply_grads_op_dict = {key: opt.apply_gradients(grads_pair, global_step=global_step)
                               for key, grads_pair in grads_pair_dict.items()}

        self.train_ops = {key: tf.group(apply_grads_op)
                          for key, apply_grads_op in apply_grads_op_dict.items()}

        saver = tf.train.Saver(tf.all_variables(), max_to_keep=2)
        self.saver = saver

        summary_op = tf.merge_summary(summaries)
        self.tensors['summary'] = summary_op

        init_op = tf.initialize_all_variables()
        sess.run(init_op)
        if self.write_log:
            self.writer = tf.train.SummaryWriter(params.log_dir, sess.graph)
        self.initialized = True
Example #12
    def initialize(self):
        params = self.params
        sess = self.sess
        device_type = params.device_type
        summaries = []

        global_step = tf.get_variable('global_step', shape=[], dtype='int32',
                                      initializer=tf.constant_initializer(0), trainable=False)
        self.tensors['global_step'] = global_step

        epoch = tf.get_variable('epoch', shape=[], dtype='int32',
                                initializer=tf.constant_initializer(0), trainable=False)
        self.tensors['epoch'] = epoch

        learning_rate = tf.placeholder('float32', name='learning_rate')
        summaries.append(tf.scalar_summary("learning_rate", learning_rate))
        self.placeholders['learning_rate'] = learning_rate

        if params.opt == 'basic':
            opt = tf.train.GradientDescentOptimizer(learning_rate)
        elif params.opt == 'adagrad':
            opt = tf.train.AdagradOptimizer(learning_rate)
        else:
            raise Exception()

        grads_tensors = []
        correct_tensors = []
        loss_tensors = []
        for device_id, tower in enumerate(self.towers):
            with tf.device("/%s:%d" % (device_type, device_id)), tf.name_scope("%s_%d" % (device_type, device_id)) as scope:
                tower.initialize(scope)
                tf.get_variable_scope().reuse_variables()
                loss_tensor = tower.get_loss_tensor()
                loss_tensors.append(loss_tensor)
                correct_tensor = tower.get_correct_tensor()
                correct_tensors.append(correct_tensor)
                grads_tensor = opt.compute_gradients(loss_tensor)
                grads_tensors.append(grads_tensor)

        with tf.name_scope("gpu_sync"):
            loss_tensor = tf.reduce_mean(tf.pack(loss_tensors), 0, name='loss')
            correct_tensor = tf.concat(0, correct_tensors, name="correct")
            with tf.name_scope("average_gradients"):
                grads_tensor = average_gradients(grads_tensors)

        self.tensors['loss'] = loss_tensor
        self.tensors['correct'] = correct_tensor
        summaries.append(tf.scalar_summary(loss_tensor.op.name, loss_tensor))

        for grad, var in grads_tensor:
            if grad is not None:
                summaries.append(tf.histogram_summary(var.op.name+'/gradients', grad))
        self.tensors['grads'] = grads_tensor

        for var in tf.trainable_variables():
            summaries.append(tf.histogram_summary(var.op.name, var))

        apply_grads_op = opt.apply_gradients(grads_tensor, global_step=global_step)

        train_op = tf.group(apply_grads_op)
        self.tensors['train'] = train_op

        saver = tf.train.Saver(tf.all_variables())
        self.saver = saver

        summary_op = tf.merge_summary(summaries)
        self.tensors['summary'] = summary_op

        init_op = tf.initialize_all_variables()
        sess.run(init_op)
        self.writer = tf.train.SummaryWriter(params.log_dir, sess.graph)
        self.initialized = True
Example #13
    def __init__(self, config, models):
        model = models[0]
        assert isinstance(model, Model)
        self.config = config
        self.model = model
        self.global_step = model.get_global_step()
        self.opt = tf.train.AdamOptimizer(config.init_lr)

        if config.train_nmn_ctrl_separately:
            self.var_list = model.get_var_list('nmn')
            self.controller_var_list = model.get_var_list('controller')
            controller_grads_list = []
        else:
            self.var_list = model.get_var_list('all')

        self.summary = model.summary
        self.models = models
        losses, grads_list = [], []

        for gpu_idx, model in enumerate(models):
            with tf.name_scope("grads_{}".format(gpu_idx)), tf.device(
                    "/{}:{}".format(config.device_type, gpu_idx)):
                loss = model.get_loss()
                grads = self.opt.compute_gradients(loss,
                                                   var_list=self.var_list)
                losses.append(loss)
                grads_list.append(grads)
                if config.train_nmn_ctrl_separately:
                    controller_grads = self.opt.compute_gradients(
                        loss, var_list=self.controller_var_list)
                    controller_grads_list.append(controller_grads)

        self.loss = tf.add_n(losses) / len(losses)
        self.grads = average_gradients(grads_list)
        if config.train_nmn_ctrl_separately:
            self.controller_grads = average_gradients(controller_grads_list)
            controller_grad_vars = [x[1] for x in self.controller_grads]
            controller_gradients = [x[0] for x in self.controller_grads]
            controller_clipped, _ = tf.clip_by_global_norm(
                controller_gradients, 2)

            ctrl_accum_vars = [
                tf.Variable(tf.zeros_like(tv.initialized_value()),
                            trainable=False) for tv in self.controller_var_list
            ]
            self.ctrl_zero_ops = [
                tv.assign(tf.zeros_like(tv)) for tv in ctrl_accum_vars
            ]
            self.ctrl_accum_ops = [
                ctrl_accum_vars[i].assign_add(gv)
                for i, gv in enumerate(controller_clipped)
            ]

            if config.gradient_accum_steps == 1:
                self.controller_train_op = self.opt.apply_gradients(
                    zip(controller_clipped, controller_grad_vars),
                    global_step=self.global_step)
            else:
                self.controller_train_op = self.opt.apply_gradients(
                    [(ctrl_accum_vars[i], gv[1])
                     for i, gv in enumerate(self.controller_grads)],
                    global_step=self.global_step)

        #self.grads, global_norm = tf.clip_by_global_norm(self.grads, 2)

        grad_vars = [x[1] for x in self.grads]
        gradients = [x[0] for x in self.grads]
        clipped, _ = tf.clip_by_global_norm(gradients, 2)

        accum_vars = [
            tf.Variable(tf.zeros_like(tv.initialized_value()), trainable=False)
            for tv in self.var_list
        ]
        self.zero_ops = [tv.assign(tf.zeros_like(tv)) for tv in accum_vars]
        self.accum_ops = [
            accum_vars[i].assign_add(gv) for i, gv in enumerate(clipped)
        ]
        if config.gradient_accum_steps == 1:
            self.train_op = self.opt.apply_gradients(
                zip(clipped, grad_vars), global_step=self.global_step)
        else:
            self.train_op = self.opt.apply_gradients(
                [(accum_vars[i], gv[1]) for i, gv in enumerate(self.grads)],
                global_step=self.global_step)

        with tf.control_dependencies([self.train_op]):
            self.dummy = tf.constant(0, name='dummy')
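For the gradient_accum_steps > 1 path in Example #13, the zero, accumulate and apply ops are meant to be run in sequence. A hypothetical outer loop; sess, trainer, config and next_batch_feed are assumed names:

sess.run(trainer.zero_ops)                 # clear the accumulator variables
for _ in range(config.gradient_accum_steps):
    # add one batch's clipped gradients into the accumulators
    sess.run(trainer.accum_ops, feed_dict=next_batch_feed())
sess.run(trainer.train_op)                 # apply the accumulated gradients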