def __init__(self, hps, mode="train", ps_device="/gpu:0"):
    self.hps = hps
    data_size = hps.batch_size * hps.num_gpus
    self.x = tf.placeholder(tf.int32, [data_size, hps.num_steps])
    self.y = tf.placeholder(tf.int32, [data_size, hps.num_steps])

    losses = []
    tower_grads = []
    # Split the batch along the first dimension, one slice per GPU tower.
    xs = tf.split(self.x, hps.num_gpus, 0)
    ys = tf.split(self.y, hps.num_gpus, 0)

    for i in range(hps.num_gpus):
        with tf.device(assign_to_gpu(i, ps_device)), tf.variable_scope(
                tf.get_variable_scope(), reuse=True if i > 0 else None):
            loss, softmax = self._forward(i, xs[i], ys[i])
            losses += [loss]
            if mode == "train":
                cur_grads = self._backward(
                    loss,
                    summaries=((i == hps.num_gpus - 1) and hps.do_summaries))
                tower_grads += [cur_grads]

    self.loss = tf.add_n(losses) / len(losses)
    self.softmax = softmax  # softmax from the last tower
    tf.summary.scalar("model/loss", self.loss)
    self.global_step = tf.get_variable(
        "global_step", [], tf.int32, trainable=False)

    if mode == "train":
        grads = average_grads(tower_grads)
        # Select the optimizer according to the hyper-parameter setting.
        if hps.optimizer == 1:
            optimizer = tf.train.MomentumOptimizer(hps.learning_rate, 0.9)
        elif hps.optimizer == 2:
            optimizer = tf.train.AdamOptimizer(hps.learning_rate)
        elif hps.optimizer == 3:
            optimizer = tf.train.RMSPropOptimizer(learning_rate=hps.learning_rate)
        elif hps.optimizer == 4:
            optimizer = tf.train.GradientDescentOptimizer(hps.learning_rate)
        else:
            optimizer = tf.train.AdagradOptimizer(
                hps.learning_rate,
                initial_accumulator_value=1.0 * float(hps.loss_scale) * float(hps.loss_scale))
        self.train_op = optimizer.apply_gradients(grads, global_step=self.global_step)
        self.summary_op = tf.summary.merge_all()
    else:
        self.train_op = tf.no_op()

    if mode in ["train", "eval"] and hps.average_params:
        # The outer name_scope(None) is needed due to EMA implementation silliness.
        with tf.name_scope(None):
            # Keep track of a moving average of the LSTM variables.
            ema = tf.train.ExponentialMovingAverage(decay=0.999)
            variables_to_average = find_trainable_variables("lstm")
            self.train_op = tf.group(*[self.train_op, ema.apply(variables_to_average)])
            self.avg_dict = ema.variables_to_restore(variables_to_average)
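# The constructor above relies on three helpers (assign_to_gpu, average_grads,
# find_trainable_variables) that are not shown in this section. The following is a
# minimal sketch of those helpers, assuming the standard TF1 multi-tower training
# pattern; the bodies in the actual codebase may differ (e.g. the real
# average_grads likely also handles tf.IndexedSlices from embedding lookups).
import tensorflow as tf


def assign_to_gpu(gpu=0, ps_device="/gpu:0"):
    """Device function: keep variables on ps_device, everything else on the tower GPU."""
    def _assign(op):
        node_def = op if isinstance(op, tf.NodeDef) else op.node_def
        if node_def.op in ("Variable", "VariableV2"):
            return ps_device
        return "/gpu:%d" % gpu
    return _assign


def average_grads(tower_grads):
    """Average per-tower (grad, var) lists into a single list of (grad, var) pairs."""
    averaged = []
    for grad_and_vars in zip(*tower_grads):
        # grad_and_vars = ((grad_gpu0, var), (grad_gpu1, var), ...) for one variable.
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
        grad = tf.reduce_mean(tf.concat(grads, 0), 0)
        averaged.append((grad, grad_and_vars[0][1]))
    return averaged


def find_trainable_variables(key):
    """Return trainable variables whose name contains `key` (e.g. "lstm")."""
    return [v for v in tf.trainable_variables() if key in v.name]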
def __init__(self, hps, mode="train", ps_device="/gpu:0"):
    self.hps = hps
    data_size = hps.batch_size * hps.num_gpus
    self.x = tf.placeholder(tf.int32, [data_size, hps.num_steps])
    self.y = tf.placeholder(tf.int32, [data_size, hps.num_steps])

    losses = []
    tower_grads = []
    logitses = []

    if mode == "predict_next":
        # Candidate word ids and their count, fed at prediction time.
        self.ind = tf.placeholder(tf.int32, name="ind")
        self.ind_len = tf.placeholder(tf.int32, name="ind_len")

    # Split the batch along the first dimension, one slice per GPU tower.
    xs = tf.split(self.x, hps.num_gpus, 0)
    ys = tf.split(self.y, hps.num_gpus, 0)
    print("ngpus:", hps.num_gpus)

    for i in range(hps.num_gpus):
        with tf.device(assign_to_gpu(i, ps_device)), tf.variable_scope(
                tf.get_variable_scope(), reuse=True if i > 0 else None):
            if mode == "predict_next":
                loss, logits = self._forward(i, xs[i], ys[i], mode=mode)
                logitses += [logits]
            else:
                loss = self._forward(i, xs[i], ys[i])
            losses += [loss]
            if mode == "train":
                cur_grads = self._backward(loss, summaries=(i == hps.num_gpus - 1))
                tower_grads += [cur_grads]

    if mode == "predict_next":
        # Assumes num_gpus = 1, num_layers = 1, num_steps = 1, so the squeeze
        # leaves a single [vocab_size] logits vector.
        self.logits = tf.squeeze(logitses)
        # Restrict the logits to the fed candidate ids and rank them.
        ind_logits = tf.reshape(self.logits, [hps.vocab_size, -1])
        ind_logits = tf.gather(ind_logits, self.ind)
        print("ind_logits:", logitses, ind_logits)
        ind_logits = tf.reshape(ind_logits, [-1, self.ind_len])
        self.top_k = tf.minimum(self.ind_len, hps.arg_max)
        _, self.ind_index = tf.nn.top_k(ind_logits, self.top_k)
        print("ind_index:", self.ind_index)
        # Top predictions over the full vocabulary.
        _, self.index = tf.nn.top_k(self.logits, hps.arg_max)

    self.loss = tf.add_n(losses) / len(losses)  # total loss
    tf.summary.scalar("model/loss", self.loss)
    self.global_step = tf.get_variable(
        "global_step", [], tf.int32, trainable=False)

    if mode == "train":
        grads = average_grads(tower_grads)
        optimizer = tf.train.AdagradOptimizer(
            hps.learning_rate, initial_accumulator_value=1.0)
        self.train_op = optimizer.apply_gradients(
            grads, global_step=self.global_step)
        self.summary_op = tf.summary.merge_all()
    else:
        self.train_op = tf.no_op()

    if mode in ["train", "eval", "predict_next"] and hps.average_params:
        # The outer name_scope(None) is needed due to EMA implementation silliness.
        with tf.name_scope(None):
            # Keep track of a moving average of the LSTM variables.
            ema = tf.train.ExponentialMovingAverage(decay=0.999)
            variables_to_average = find_trainable_variables("LSTM")
            self.train_op = tf.group(*[self.train_op, ema.apply(variables_to_average)])
            self.avg_dict = ema.variables_to_restore(variables_to_average)
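# A hedged usage sketch (not part of the original file) showing how the
# "predict_next" graph above could be driven from a session: feed a token id via
# self.x plus a candidate list via self.ind / self.ind_len, then fetch the ranked
# indices. The class name `LM`, the scope name "model", the checkpoint path and
# the hparams values are assumptions for illustration only.
import numpy as np
import tensorflow as tf


def predict_next_ids(hps, ckpt_path, token_id, candidate_ids):
    with tf.variable_scope("model"):
        model = LM(hps, mode="predict_next", ps_device="/gpu:0")  # class name assumed
    # If hps.average_params is set, restoring from model.avg_dict loads the
    # exponentially averaged weights instead of the raw ones.
    saver = tf.train.Saver(model.avg_dict if hps.average_params else None)
    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        saver.restore(sess, ckpt_path)
        # The prediction graph expects num_gpus = 1 and num_steps = 1.
        x = np.full((hps.batch_size * hps.num_gpus, hps.num_steps),
                    token_id, dtype=np.int32)
        y = np.zeros_like(x)  # targets are unused when only fetching logits
        feed = {model.x: x, model.y: y,
                model.ind: candidate_ids, model.ind_len: len(candidate_ids)}
        ind_index, index = sess.run([model.ind_index, model.index], feed)
        return ind_index, index  # ranking within candidates, and over the full vocab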