def __init__(self, config, models):
    model = models[0]
    assert isinstance(model, Model)
    self.config = config
    self.model = model
    self.opt = tf.train.AdadeltaOptimizer(config.init_lr)
    self.var_list = model.get_var_list()
    self.global_step = model.get_global_step()
    self.summary = model.summary
    self.models = models

    # Build one loss and one set of gradients per GPU tower, then average them.
    losses = []
    grads_list = []
    for gpu_idx, model in enumerate(models):
        with tf.name_scope("grads_{}".format(gpu_idx)), tf.device(
                "/{}:{}".format(config.device_type, gpu_idx)):
            loss = model.get_loss()
            grads = self.opt.compute_gradients(loss, var_list=self.var_list)
            # grads = [(tf.clip_by_value(grad, -1., 1.), var)
            #          for (grad, var) in grads if grad is not None]
            losses.append(loss)
            grads_list.append(grads)

    self.loss = tf.add_n(losses) / len(losses)
    self.grads = average_gradients(grads_list)
    self.train_op = self.opt.apply_gradients(self.grads, global_step=self.global_step)
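
# `average_gradients` is used throughout these constructors but is not defined in the
# snippets. The sketch below is an assumption based on the standard TensorFlow multi-GPU
# pattern (TF 1.x-style API), not the project's actual helper: for each variable, it
# averages the gradient across the per-tower (grad, var) lists from compute_gradients.
import tensorflow as tf

def average_gradients(tower_grads):
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # grad_and_vars: ((grad_gpu0, var), (grad_gpu1, var), ...) for a single variable.
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
        grad = tf.reduce_mean(tf.concat(grads, axis=0), 0)
        average_grads.append((grad, grad_and_vars[0][1]))
    return average_grads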
def __init__(self, config, models):
    model = models[0]
    assert isinstance(model, Model)
    self.config = config
    self.model = model
    self.opt = tf.train.GradientDescentOptimizer(config.init_lr)
    self.var_list = model.get_var_list()  # it's None here
    self.global_step = model.get_global_step()  # a placeholder
    self.summary = model.summary
    self.models = models

    losses = []
    grads_list = []
    for gpu_idx, model in enumerate(models):
        with tf.name_scope("grads_{}".format(gpu_idx)), tf.device(
                "/{}:{}".format(config.device_type, gpu_idx)):
            loss = model.get_loss()
            grads = self.opt.compute_gradients(loss, var_list=self.var_list)
            losses.append(loss)
            grads_list.append(grads)

    self.loss = tf.add_n(losses) / len(losses)
    self.grads = average_gradients(grads_list)
    self.train_op = self.opt.apply_gradients(self.grads, global_step=self.global_step)
def __init__(self, config, models):
    model = models[0]
    assert isinstance(model, Model)
    self.config = config
    self.model = model
    self.opt = tf.train.AdamOptimizer(config.init_lr)
    self.var_list = model.get_var_list('model_network')
    self.global_step = model.get_global_step()
    self.summary = model.summary
    self.models = models

    losses, grads_list = [], []
    for gpu_idx, model in enumerate(models):
        with tf.name_scope("grads_{}".format(gpu_idx)), tf.device(
                "/{}:{}".format(config.device_type, gpu_idx)):
            loss = model.get_loss()
            grads = self.opt.compute_gradients(loss, var_list=self.var_list)
            losses.append(loss)
            grads_list.append(grads)

    self.loss = tf.add_n(losses) / len(losses)
    self.grads = average_gradients(grads_list)

    # Clip the averaged gradients by global norm before applying them.
    grad_vars = [x[1] for x in self.grads]
    gradients = [x[0] for x in self.grads]
    clipped, _ = tf.clip_by_global_norm(gradients, 2)
    self.train_op = self.opt.apply_gradients(
        zip(clipped, grad_vars), global_step=self.global_step)

    # A constant created under a control dependency on the train op.
    with tf.control_dependencies([self.train_op]):
        self.dummy = tf.constant(0, name='dummy')
def __init__(self, config, models):
    model = models[0]
    assert isinstance(model, Model)
    self.config = config
    self.model = model
    self.opt = tf.train.AdamOptimizer(config.init_lr)
    self.var_list = model.get_var_list()
    self.global_step = model.get_global_step()
    self.summary = model.summary
    self.models = models

    losses_task1 = []
    losses_task2 = []
    grads_list_task1 = []
    grads_list_task2 = []
    # print("VAR LIST", self.var_list)
    # TODO: Check whether this should be None.
    for gpu_idx, model in enumerate(models):
        with tf.name_scope("grads_{}".format(gpu_idx)), tf.device(
                "/{}:{}".format(config.device_type, gpu_idx)):
            loss_task1 = model.get_loss_task1()
            loss_task2 = model.get_loss_task2()
            grads_task1 = self.opt.compute_gradients(loss_task1, var_list=self.var_list)
            grads_task2 = self.opt.compute_gradients(loss_task2, var_list=self.var_list)
            losses_task1.append(loss_task1)
            losses_task2.append(loss_task2)
            grads_list_task1.append(grads_task1)
            grads_list_task2.append(grads_task2)

    self.loss_task1 = tf.add_n(losses_task1) / len(losses_task1)
    self.loss_task2 = tf.add_n(losses_task2) / len(losses_task2)
    self.grads_task1 = average_gradients(grads_list_task1)
    self.grads_task2 = average_gradients(grads_list_task2)
    self.train_op_task1 = self.opt.apply_gradients(
        self.grads_task1, global_step=self.global_step)
    self.train_op_task2 = self.opt.apply_gradients(
        self.grads_task2, global_step=self.global_step)
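
# A hypothetical driver for the two task-specific train ops above; only the trainer
# attributes it reads come from the snippet, the function name and feed dicts are
# assumptions. The two tasks are updated alternately, and note that global_step is
# advanced by both ops as constructed.
def multitask_step(sess, trainer, feed_task1, feed_task2):
    loss1, _ = sess.run([trainer.loss_task1, trainer.train_op_task1], feed_dict=feed_task1)
    loss2, _ = sess.run([trainer.loss_task2, trainer.train_op_task2], feed_dict=feed_task2)
    return loss1, loss2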
def __init__(self, config, models):
    model = models[0]
    assert isinstance(model, Model)
    self.config = config
    self.model = model
    self.global_step = model.get_global_step()

    # Pick the optimizer from the config.
    if 'adam' == config.optimizer:
        self.opt = tf.train.AdamOptimizer(config.init_lr)
    elif 'gd' == config.optimizer:
        lr = tf.train.exponential_decay(config.init_lr, self.global_step,
                                        tf.to_int32(config.num_steps / 3), 0.1,
                                        staircase=True)
        self.opt = tf.train.GradientDescentOptimizer(lr)
    else:
        raise ValueError('Unsupported optimizer')

    self.var_list = model.get_var_list()
    self.summary = model.summary
    self.models = models

    losses = []
    grads_list = []
    for gpu_idx, _model in enumerate(models):
        with tf.name_scope("grads_{}".format(gpu_idx)), tf.device(
                "/{}:{}".format(config.device_type, gpu_idx)):
            loss = _model.get_loss()
            grads = self.opt.compute_gradients(loss, var_list=self.var_list)
            losses.append(loss)
            grads_list.append(grads)

    self.loss = tf.add_n(losses) / len(losses)
    self.grads = average_gradients(grads_list)

    # Optionally zero out gradients for weights that are already zero (freeze mode).
    if config.freeze_mode:
        self.grads = zerout_gradients_for_zero_weights(self.grads, mode=config.freeze_mode)
    self.train_op = self.opt.apply_gradients(self.grads, global_step=self.global_step)

    # Group the model's sparsity op with the train op, if one is provided.
    if model.get_sparsity_op():
        with tf.control_dependencies([self.train_op]):
            self.train_op = tf.group(self.train_op, model.get_sparsity_op())
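
# `zerout_gradients_for_zero_weights` is project-specific and not shown in these
# snippets. A minimal sketch under the assumption that it keeps pruned weights frozen:
# wherever a weight is already zero, its gradient is masked to zero. The `mode` argument
# is kept only to match the call site; this sketch ignores it.
def zerout_gradients_for_zero_weights(grads_and_vars, mode=None):
    masked = []
    for grad, var in grads_and_vars:
        if grad is None:
            masked.append((grad, var))
            continue
        keep = tf.cast(tf.not_equal(var, 0.0), grad.dtype)  # 1 where weight != 0, else 0
        masked.append((grad * keep, var))
    return masked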
def __init__(self, config, models):
    model = models[0]
    assert isinstance(model, Model)
    self.config = config
    self.model = model
    self.opt = tf.train.AdamOptimizer(config.init_lr)
    self.var_list = model.get_var_list()
    self.global_step = model.get_global_step()
    self.summary = model.summary
    self.models = models

    losses = []
    grads_list = []
    for gpu_idx, model in enumerate(models):
        with tf.name_scope("grads_{}".format(gpu_idx)), tf.device(
                "/{}:{}".format(config.device_type, gpu_idx)):
            loss = model.get_loss()
            grads = self.opt.compute_gradients(loss, var_list=self.var_list)
            losses.append(loss)
            grads_list.append(grads)

    self.loss = tf.add_n(losses) / len(losses)
    self.grads = average_gradients(grads_list)
    self.train_op = self.opt.apply_gradients(self.grads, global_step=self.global_step)

    self.logits = model.logits
    self.logits2 = model.logits2
    self.tensorDic = model.tensor_dict
    self.y = model.y
    self.y2 = model.y2
    self.wy = model.wy
    self.yp = model.yp
    self.yp2 = model.yp2
    self.wyp = model.wyp
    self.correctIndex = model.correctIndex
    self.Pos1 = model.Pos1
    self.Neg1 = model.Neg1
    self.mask1 = model.mask1
    self.invmask1 = model.invmask1
def __init__(self, config, models): model = models[0] assert isinstance(model, Model) self.config = config self.model = model self.opt = tf.train.AdadeltaOptimizer(config.init_lr) self.var_list = model.get_var_list() self.global_step = model.get_global_step() self.summary = model.summary self.models = models losses = [] grads_list = [] for gpu_idx, model in enumerate(models): with tf.name_scope("grads_{}".format(gpu_idx)), tf.device("/gpu:{}".format(gpu_idx)): loss = model.get_loss() grads = self.opt.compute_gradients(loss, var_list=self.var_list) losses.append(loss) grads_list.append(grads) self.loss = tf.add_n(losses)/len(losses) self.grads = average_gradients(grads_list) self.train_op = self.opt.apply_gradients(self.grads, global_step=self.global_step)
def __init__(self, config, models):
    model = models[0]
    assert isinstance(model, Model)
    self.config = config
    self.model = model
    self.opt = tf.train.AdadeltaOptimizer(config.init_lr)
    self.global_step = model.get_global_step()
    self.summary = model.summary
    self.models = models

    losses = []
    grads_list = []
    with tf.variable_scope("grad"):
        for gpu_idx, model in enumerate(models):
            with tf.name_scope("grads_{}".format(gpu_idx)), tf.device(
                    "/{}:{}".format(config.device_type, gpu_idx)):
                loss = model.get_loss()
                grads = self.opt.compute_gradients(loss)
                losses.append(loss)
                grads_list.append(grads)
                tf.get_variable_scope().reuse_variables()

    self.loss = tf.add_n(losses) / len(losses)
    self.grads = average_gradients(grads_list)
    self.train_op = self.opt.apply_gradients(self.grads, global_step=self.global_step)
def initialize(self): params = self.params sess = self.sess device_type = params.device_type summaries = [] global_step = tf.get_variable('global_step', shape=[], dtype='int32', initializer=tf.constant_initializer(0), trainable=False) self.tensors['global_step'] = global_step epoch = tf.get_variable('epoch', shape=[], dtype='int32', initializer=tf.constant_initializer(0), trainable=False) self.tensors['epoch'] = epoch learning_rate = tf.placeholder('float32', name='learning_rate') summaries.append(tf.scalar_summary("learning_rate", learning_rate)) self.placeholders['learning_rate'] = learning_rate if params.opt == 'basic': opt = tf.train.GradientDescentOptimizer(learning_rate) elif params.opt == 'adagrad': opt = tf.train.AdagradOptimizer(learning_rate) else: raise Exception() grads_tensors = [] correct_tensors = [] loss_tensors = [] for device_id, tower in enumerate(self.towers): with tf.device("/%s:%d" % (device_type, device_id)), tf.name_scope("%s_%d" % (device_type, device_id)) as scope: tower.initialize(scope) tf.get_variable_scope().reuse_variables() loss_tensor = tower.get_loss_tensor() loss_tensors.append(loss_tensor) correct_tensor = tower.get_correct_tensor() correct_tensors.append(correct_tensor) grads_tensor = opt.compute_gradients(loss_tensor) grads_tensors.append(grads_tensor) with tf.name_scope("gpu_sync"): loss_tensor = tf.reduce_mean(tf.pack(loss_tensors), 0, name='loss') correct_tensor = tf.concat(0, correct_tensors, name="correct") with tf.name_scope("average_gradients"): grads_tensor = average_gradients(grads_tensors) self.tensors['loss'] = loss_tensor self.tensors['correct'] = correct_tensor summaries.append(tf.scalar_summary(loss_tensor.op.name, loss_tensor)) for grad, var in grads_tensor: if grad is not None: summaries.append(tf.histogram_summary(var.op.name+'/gradients', grad)) self.tensors['grads'] = grads_tensor for var in tf.trainable_variables(): summaries.append(tf.histogram_summary(var.op.name, var)) apply_grads_op = opt.apply_gradients(grads_tensor, global_step=global_step) train_op = tf.group(apply_grads_op) self.tensors['train'] = train_op saver = tf.train.Saver(tf.all_variables()) self.saver = saver summary_op = tf.merge_summary(summaries) self.tensors['summary'] = summary_op init_op = tf.initialize_all_variables() sess.run(init_op) self.writer = tf.train.SummaryWriter(params.log_dir, sess.graph) self.initialized = True
def initialize(self): params = self.params sess = self.sess device_type = params.device_type summaries = [] global_step = tf.get_variable('global_step', shape=[], dtype='int32', initializer=tf.constant_initializer(0), trainable=False) self.tensors['global_step'] = global_step epoch = tf.get_variable('epoch', shape=[], dtype='int32', initializer=tf.constant_initializer(0), trainable=False) self.tensors['epoch'] = epoch learning_rate = tf.placeholder('float32', name='learning_rate') summaries.append(tf.scalar_summary("learning_rate", learning_rate)) self.placeholders['learning_rate'] = learning_rate if params.opt == 'basic': opt = tf.train.GradientDescentOptimizer(learning_rate) elif params.opt == 'adagrad': opt = tf.train.AdagradOptimizer(learning_rate) elif params.opt == 'adam': opt = tf.train.AdamOptimizer() elif params.opt == 'adadelta': opt = tf.train.AdadeltaOptimizer(learning_rate) else: raise Exception() grads_pairs_dict = defaultdict(list) correct_tensors = [] loss_tensors = [] with tf.variable_scope("towers"): for device_id, tower in enumerate(self.towers): with tf.device("/%s:%d" % (device_type, device_id)), tf.name_scope( "%s_%d" % (device_type, device_id)): tower.initialize() tf.get_variable_scope().reuse_variables() loss_tensor = tower.get_loss_tensor() loss_tensors.append(loss_tensor) correct_tensor = tower.get_correct_tensor() correct_tensors.append(correct_tensor) for key, variables in tower.variables_dict.items(): grads_pair = opt.compute_gradients(loss_tensor, var_list=variables) grads_pairs_dict[key].append(grads_pair) with tf.name_scope("gpu_sync"): loss_tensor = tf.reduce_mean(tf.pack(loss_tensors), 0, name='loss') correct_tensor = tf.concat(0, correct_tensors, name="correct") with tf.name_scope("average_gradients"): grads_pair_dict = { key: average_gradients(grads_pairs) for key, grads_pairs in grads_pairs_dict.items() } if params.max_grad_norm: grads_pair_dict = { key: [(tf.clip_by_norm(grad, params.max_grad_norm), var) for grad, var in grads_pair] for key, grads_pair in grads_pair_dict.items() } self.tensors['loss'] = loss_tensor self.tensors['correct'] = correct_tensor summaries.append(tf.scalar_summary(loss_tensor.op.name, loss_tensor)) for key, grads_pair in grads_pair_dict.items(): for grad, var in grads_pair: if grad is not None: summaries.append( tf.histogram_summary(var.op.name + '/gradients/' + key, grad)) for var in tf.trainable_variables(): summaries.append(tf.histogram_summary(var.op.name, var)) apply_grads_op_dict = { key: opt.apply_gradients(grads_pair, global_step=global_step) for key, grads_pair in grads_pair_dict.items() } self.train_ops = { key: tf.group(apply_grads_op) for key, apply_grads_op in apply_grads_op_dict.items() } saver = tf.train.Saver(tf.all_variables(), max_to_keep=2) self.saver = saver summary_op = tf.merge_summary(summaries) self.tensors['summary'] = summary_op init_op = tf.initialize_all_variables() sess.run(init_op) if self.write_log: self.writer = tf.train.SummaryWriter(params.log_dir, sess.graph) self.initialized = True
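
# A hypothetical training step for the tower-based runner above. The attributes it reads
# (train_ops, placeholders['learning_rate'], tensors['loss'], sess) come from the snippet;
# the method name, `batch_feed`, and `key` argument are assumptions. The caller chooses
# which parameter group to update and feeds the learning rate on every step.
def train_step(self, batch_feed, key, learning_rate):
    feed_dict = dict(batch_feed)
    feed_dict[self.placeholders['learning_rate']] = learning_rate
    _, loss = self.sess.run([self.train_ops[key], self.tensors['loss']], feed_dict=feed_dict)
    return loss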
def __init__(self, config, models):
    model = models[0]
    assert isinstance(model, Model)
    self.config = config
    self.model = model
    self.global_step = model.get_global_step()
    self.opt = tf.train.AdamOptimizer(config.init_lr)

    if config.train_nmn_ctrl_separately:
        self.var_list = model.get_var_list('nmn')
        self.controller_var_list = model.get_var_list('controller')
        controller_grads_list = []
    else:
        self.var_list = model.get_var_list('all')
    self.summary = model.summary
    self.models = models

    losses, grads_list = [], []
    for gpu_idx, model in enumerate(models):
        with tf.name_scope("grads_{}".format(gpu_idx)), tf.device(
                "/{}:{}".format(config.device_type, gpu_idx)):
            loss = model.get_loss()
            grads = self.opt.compute_gradients(loss, var_list=self.var_list)
            losses.append(loss)
            grads_list.append(grads)
            if config.train_nmn_ctrl_separately:
                controller_grads = self.opt.compute_gradients(
                    loss, var_list=self.controller_var_list)
                controller_grads_list.append(controller_grads)

    self.loss = tf.add_n(losses) / len(losses)
    self.grads = average_gradients(grads_list)

    # Separate gradient path for the controller variables: global-norm clipping plus
    # optional gradient accumulation across micro-batches.
    if config.train_nmn_ctrl_separately:
        self.controller_grads = average_gradients(controller_grads_list)
        controller_grad_vars = [x[1] for x in self.controller_grads]
        controller_gradients = [x[0] for x in self.controller_grads]
        controller_clipped, _ = tf.clip_by_global_norm(controller_gradients, 2)
        ctrl_accum_vars = [tf.Variable(tf.zeros_like(tv.initialized_value()), trainable=False)
                           for tv in self.controller_var_list]
        self.ctrl_zero_ops = [tv.assign(tf.zeros_like(tv)) for tv in ctrl_accum_vars]
        self.ctrl_accum_ops = [ctrl_accum_vars[i].assign_add(gv)
                               for i, gv in enumerate(controller_clipped)]
        if config.gradient_accum_steps == 1:
            self.controller_train_op = self.opt.apply_gradients(
                zip(controller_clipped, controller_grad_vars),
                global_step=self.global_step)
        else:
            self.controller_train_op = self.opt.apply_gradients(
                [(ctrl_accum_vars[i], gv[1]) for i, gv in enumerate(self.controller_grads)],
                global_step=self.global_step)

    # Same scheme for the main variable list: clip, optionally accumulate, then apply.
    # self.grads, global_norm = tf.clip_by_global_norm(self.grads, 2)
    grad_vars = [x[1] for x in self.grads]
    gradients = [x[0] for x in self.grads]
    clipped, _ = tf.clip_by_global_norm(gradients, 2)
    accum_vars = [tf.Variable(tf.zeros_like(tv.initialized_value()), trainable=False)
                  for tv in self.var_list]
    self.zero_ops = [tv.assign(tf.zeros_like(tv)) for tv in accum_vars]
    self.accum_ops = [accum_vars[i].assign_add(gv) for i, gv in enumerate(clipped)]
    if config.gradient_accum_steps == 1:
        self.train_op = self.opt.apply_gradients(
            zip(clipped, grad_vars), global_step=self.global_step)
    else:
        self.train_op = self.opt.apply_gradients(
            [(accum_vars[i], gv[1]) for i, gv in enumerate(self.grads)],
            global_step=self.global_step)

    # A constant created under a control dependency on the train op.
    with tf.control_dependencies([self.train_op]):
        self.dummy = tf.constant(0, name='dummy')
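
# Hypothetical usage of the accumulation ops above when config.gradient_accum_steps > 1:
# clear the accumulators, accumulate clipped gradients over several micro-batches, then
# apply them once via train_op. `sess`, `batches`, and `get_feed_dict` are assumptions,
# not names from the snippet.
def accumulated_step(sess, trainer, batches, get_feed_dict):
    sess.run(trainer.zero_ops)                                   # reset accumulators
    for batch in batches:                                        # len(batches) == gradient_accum_steps
        sess.run(trainer.accum_ops, feed_dict=get_feed_dict(batch))
    sess.run(trainer.train_op)                                   # apply accumulated gradients once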