def create_model(session, run_options, run_metadata):
    devices = get_device_address(FLAGS.N)
    dtype = tf.float32
    model = SeqModel(FLAGS._buckets,
                     FLAGS.size,
                     FLAGS.real_vocab_size,
                     FLAGS.num_layers,
                     FLAGS.max_gradient_norm,
                     FLAGS.batch_size,
                     FLAGS.learning_rate,
                     FLAGS.learning_rate_decay_factor,
                     withAdagrad=FLAGS.withAdagrad,
                     dropoutRate=FLAGS.keep_prob,
                     dtype=dtype,
                     devices=devices,
                     topk_n=FLAGS.topk,
                     run_options=run_options,
                     run_metadata=run_metadata)

    ckpt = tf.train.get_checkpoint_state(FLAGS.saved_model_dir)

    # Decoding/dumping modes always restore from a checkpoint; training
    # restores only when not starting from scratch and a checkpoint exists.
    if FLAGS.mode in ("DUMP_LSTM", "BEAM_DECODE", "FORCE_DECODE") or ((not FLAGS.fromScratch) and ckpt):
        mylog("Reading model parameters from %s" % ckpt.model_checkpoint_path)
        model.saver.restore(session, ckpt.model_checkpoint_path)
    else:
        mylog("Created model with fresh parameters.")
        session.run(tf.global_variables_initializer())

    return model
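
# --- Illustrative sketch (not part of the original repository) --------------
# The restore-or-initialize pattern used in create_model() can be exercised on
# its own. The helper below is hypothetical (its name and the checkpoint
# directory are made up); it assumes only TensorFlow 1.x and shows the same
# tf.train.get_checkpoint_state / saver.restore / global_variables_initializer
# flow on a toy graph.
def _demo_restore_or_init(ckpt_dir="/tmp/demo_ckpt"):
    import tensorflow as tf
    with tf.Graph().as_default():
        v = tf.get_variable("v", shape=[2], initializer=tf.zeros_initializer())
        saver = tf.train.Saver()
        with tf.Session() as sess:
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)
            if ckpt and ckpt.model_checkpoint_path:
                # Warm start: load previously saved parameters.
                saver.restore(sess, ckpt.model_checkpoint_path)
            else:
                # Cold start: initialize all variables from scratch.
                sess.run(tf.global_variables_initializer())
            return sess.run(v)
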
def create_model(session, embAttr, START_ID, run_options, run_metadata):
    devices = get_device_address(FLAGS.N)
    dtype = tf.float32
    model = SeqModel(_buckets,
                     FLAGS.size,
                     FLAGS.num_layers,
                     FLAGS.max_gradient_norm,
                     FLAGS.batch_size,
                     FLAGS.learning_rate,
                     FLAGS.learning_rate_decay_factor,
                     embAttr,
                     withAdagrad=FLAGS.withAdagrad,
                     num_samples=FLAGS.n_sampled,
                     dropoutRate=FLAGS.keep_prob,
                     START_ID=START_ID,
                     loss=FLAGS.loss,
                     dtype=dtype,
                     devices=devices,
                     use_concat=FLAGS.use_concat,
                     no_user_id=FLAGS.no_user_id,
                     output_feat=FLAGS.output_feat,
                     no_input_item_feature=FLAGS.no_input_item_feature,
                     topk_n=FLAGS.topk,
                     run_options=run_options,
                     run_metadata=run_metadata)

    ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)

    # Recommendation mode always restores from a checkpoint; training restores
    # only when not starting from scratch and a checkpoint exists.
    if FLAGS.recommend or ((not FLAGS.fromScratch) and ckpt):
        mylog("Reading model parameters from %s" % ckpt.model_checkpoint_path)
        model.saver.restore(session, ckpt.model_checkpoint_path)
    else:
        mylog("Created model with fresh parameters.")
        session.run(tf.global_variables_initializer())

    return model
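
# --- Illustrative sketch (not part of the original repository) --------------
# Both create_model() variants accept run_options / run_metadata so that the
# model's session.run() calls can be traced. A hypothetical caller might build
# them like this (the helper name and the `profile` argument are assumptions,
# not flags defined above); only standard TF 1.x API is used.
def _demo_make_run_options(profile=False):
    import tensorflow as tf
    if profile:
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        run_metadata = tf.RunMetadata()
    else:
        run_options, run_metadata = None, None
    return run_options, run_metadata
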
def __init__(self,
             buckets,
             size,
             from_vocab_size,
             target_vocab_size,
             num_layers,
             max_gradient_norm,
             batch_size,
             learning_rate,
             learning_rate_decay_factor,
             optimizer="adam",
             forward_only=False,
             dropoutRate=1.0,
             run_options=None,
             run_metadata=None,
             devices_per_model=None,
             topk_n=30,
             dtype=tf.float32,
             with_attention=False,
             beam_search=False,
             beam_buckets=None,
             n_samples=500,
             with_sampled_softmax=False,
             attention_style="additive",
             attention_scale=True,
             num_models=4,
             tie_input_output_embedding=False):
    '''
    LocalReplica: Model1[GPU0,GPU1] Model2[GPU3,GPU4], ...

    Each model keeps its own copy of the variables. After one step, the
    gradients are summed across the models (GPUs) and each model applies the
    aggregated update locally on its own GPU.

    devices_per_model: [["/gpu:0", ...], ...]
        devices_per_model[m][l] -- m: model index, l: layer index
    '''
    self.models = []
    self.devices_per_model = devices_per_model
    self.variable_mgr = VariableMgrLocalReplicated()
    self.num_models = num_models
    self.buckets = buckets
    self.run_options = run_options
    self.run_metadata = run_metadata

    # Generate models, one per device group ("tower").
    for d, devices_each_model in enumerate(self.devices_per_model):
        with tf.device(devices_each_model[0]):
            with self.variable_mgr.create_outer_variable_scope(d), \
                 tf.name_scope("tower_{}".format(d)) as name_scope:
                mylog("creating model #{} at devices: {}".format(
                    d, devices_each_model))
                seqModel = SeqModel(
                    buckets,
                    size,
                    from_vocab_size,
                    target_vocab_size,
                    num_layers,
                    max_gradient_norm,
                    batch_size,
                    learning_rate,
                    learning_rate_decay_factor,
                    optimizer=optimizer,
                    forward_only=forward_only,
                    dropoutRate=dropoutRate,
                    devices=devices_each_model,
                    run_options=run_options,
                    run_metadata=run_metadata,
                    topk_n=topk_n,
                    dtype=dtype,
                    with_attention=with_attention,
                    beam_search=beam_search,
                    beam_buckets=beam_buckets,
                    n_samples=n_samples,
                    with_sampled_softmax=with_sampled_softmax,
                    attention_style=attention_style,
                    attention_scale=attention_scale,
                    standalone=False,  # do not init the optimizer now
                    n_distributed_models=self.num_models,
                    tie_input_output_embedding=tie_input_output_embedding)

                self.models.append(seqModel)

    # Collect the per-model learning_rate_decay_op and dropout ops.
    self.learning_rate_dacay_ops = []
    self.dropout10_ops = []
    self.dropoutAssign_ops = []
    for model in self.models:
        self.learning_rate_dacay_ops.append(model.learning_rate_decay_op)
        self.dropout10_ops.append(model.dropout10_op)
        self.dropoutAssign_ops.append(model.dropoutAssign_op)

    # Aggregate the gradients
    section = "Aggregate Gradients "
    mylog_section(section)

    agg_grads = []

    for b in xrange(len(buckets)):
        mylog_subsection("Bucket {}".format(b))

        # For each bucket, gather every model's gradients and parameters.
        gradients = []  # [[grad * n_variable] * n_model]
        params = []     # [[param * n_variable] * n_model]
        for model in self.models:
            gradients.append(model.gradients[b])
            params.append(model.params)

        # Record which parameters are aggregated on each GPU, so aggregation
        # work can be spread evenly across devices.
        agg_grad_per_gpu = {}
        agg_grads_per_bucket = []

        for param_id in xrange(len(params[0])):
            grads_per_model = []
            params_per_model = []
            for model_id in xrange(len(params)):
                params_per_model.append(params[model_id][param_id])
                grads_per_model.append(gradients[model_id][param_id])

            # Choose the device with the fewest aggregations so far.
            device_for_agg = None
            min_n_agg = 1000000
            for param in params_per_model:
                dev = param.device
                if dev not in agg_grad_per_gpu:
                    agg_grad_per_gpu[dev] = []
                n_agg = len(agg_grad_per_gpu[dev])
                if min_n_agg > n_agg:
                    min_n_agg = n_agg
                    device_for_agg = dev

            agg_grad_per_gpu[device_for_agg].append(params[0][param_id])

            with tf.device(device_for_agg):
                if isinstance(grads_per_model[0], tf.IndexedSlices):
                    # Sparse gradients: concatenate values and indices.
                    values = tf.concat([x.values for x in grads_per_model], 0)
                    indices = tf.concat([x.indices for x in grads_per_model], 0)
                    agg_grad = tf.IndexedSlices(values, indices)
                else:
                    # Dense gradients: element-wise sum across models.
                    agg_grad = tf.add_n(grads_per_model)

            agg_grads_per_bucket.append(agg_grad)

        # Show the aggregation device placement.
        for device in agg_grad_per_gpu:
            mylog("Aggregated On {}:".format(device))
            for param in agg_grad_per_gpu[device]:
                mylog("\t" + param.name)

        agg_grads.append(agg_grads_per_bucket)

    # Send the aggregated grads to each model on different GPUs.
    for d, devices_each_model in enumerate(self.devices_per_model):
        self.models[d].init_agg_updates(agg_grads)

    # Combine losses, updates and gradient norms (per bucket).
    self.losses = []
    self.updates = []
    self.gradient_norms = []

    for b in xrange(len(buckets)):
        losses = []
        updates = []
        gradient_norms = []
        for i, model in enumerate(self.models):
            losses.append(model.losses[b])
            updates.append(model.updates[b])
            gradient_norms.append(model.gradient_norms[b])

        loss = tf.add_n(losses)
        self.losses.append(loss)
        self.updates.append(updates)
        self.gradient_norms.append(gradient_norms)

    # Get the init op groups.
    self.var_init_op = tf.global_variables_initializer()
    self.broadcast_ops = self.variable_mgr.get_post_init_ops()

    # Saver: only the first tower's ("v0") variables are checkpointed.
    all_vars = tf.global_variables()
    self.train_vars = []
    for var in all_vars:
        if var.name.startswith("v0"):
            self.train_vars.append(var)

    self.saver = tf.train.Saver(self.train_vars)
    self.best_saver = tf.train.Saver(self.train_vars)
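
# --- Illustrative sketch (not part of the original repository) --------------
# The aggregation loop above sums one gradient per model replica: tf.add_n for
# dense gradients, and a concat of values/indices for sparse tf.IndexedSlices
# gradients. A self-contained toy version of that rule (the helper name is
# made up; only standard TF 1.x API is used):
def _demo_aggregate_grads(grads_per_model):
    import tensorflow as tf
    if isinstance(grads_per_model[0], tf.IndexedSlices):
        # Sparse case: concatenating (values, indices) acts as a sum, because
        # duplicate indices are accumulated when the gradient is applied.
        values = tf.concat([g.values for g in grads_per_model], 0)
        indices = tf.concat([g.indices for g in grads_per_model], 0)
        return tf.IndexedSlices(values, indices)
    # Dense case: element-wise sum across replicas.
    return tf.add_n(grads_per_model)
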