    def __init__(self, hps, mode="train", ps_device="/gpu:0"):
        self.hps = hps
        data_size = hps.batch_size * hps.num_gpus
        # Token ids for the whole multi-GPU batch; split into one shard per tower below.
        self.x = tf.placeholder(tf.int32, [data_size, hps.num_steps])
        self.y = tf.placeholder(tf.int32, [data_size, hps.num_steps])
        losses = []
        tower_grads = []
        xs = tf.split(self.x, hps.num_gpus, 0)
        ys = tf.split(self.y, hps.num_gpus, 0)
        for i in range(hps.num_gpus):
            # Build one tower per GPU; towers after the first reuse the variables
            # created by tower 0.
            with tf.device(assign_to_gpu(i, ps_device)), tf.variable_scope(
                    tf.get_variable_scope(), reuse=True if i > 0 else None):
                loss, softmax = self._forward(i, xs[i], ys[i])
                losses += [loss]
                if mode == "train":
                    cur_grads = self._backward(
                        loss, summaries=((i == hps.num_gpus - 1) and hps.do_summaries))
                    tower_grads += [cur_grads]

        # Average the per-tower losses; keep the softmax from the last tower built.
        self.loss = tf.add_n(losses) / len(losses)
        self.softmax = softmax
        tf.summary.scalar("model/loss", self.loss)

        self.global_step = tf.get_variable("global_step", [], tf.int32, trainable=False)

        if mode == "train":
            # Average gradients across towers before applying a single update.
            grads = average_grads(tower_grads)
            # hps.optimizer selects the update rule: 1 = Momentum, 2 = Adam,
            # 3 = RMSProp, 4 = plain SGD, anything else = Adagrad.
            if hps.optimizer == 1:
                optimizer = tf.train.MomentumOptimizer(hps.learning_rate, 0.9)
            elif hps.optimizer == 2:
                optimizer = tf.train.AdamOptimizer(hps.learning_rate)
            elif hps.optimizer == 3:
                optimizer = tf.train.RMSPropOptimizer(learning_rate=hps.learning_rate)
            elif hps.optimizer == 4:
                optimizer = tf.train.GradientDescentOptimizer(hps.learning_rate)
            else:
                optimizer = tf.train.AdagradOptimizer(
                    hps.learning_rate,
                    initial_accumulator_value=1.0 * float(hps.loss_scale) * float(hps.loss_scale))
            self.train_op = optimizer.apply_gradients(grads, global_step=self.global_step)
            self.summary_op = tf.summary.merge_all()
        else:
            self.train_op = tf.no_op()

        if mode in ["train", "eval"] and hps.average_params:
            with tf.name_scope(None):  # This is needed due to EMA implementation silliness.
                # Keep track of moving average of LSTM variables.
                ema = tf.train.ExponentialMovingAverage(decay=0.999)
                variables_to_average = find_trainable_variables("lstm")
                self.train_op = tf.group(*[self.train_op, ema.apply(variables_to_average)])
                self.avg_dict = ema.variables_to_restore(variables_to_average)
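
A minimal training-loop sketch for the graph built above. The constructor is passed in as model_ctor because the class name is not shown in the example, and get_batch is an assumed helper that yields int32 arrays of shape [batch_size * num_gpus, num_steps]; the checkpoint path is arbitrary.

# Hypothetical driver; model_ctor, get_batch, and the checkpoint path are assumptions.
import tensorflow as tf

def train_loop(model_ctor, hps, get_batch, num_iters=100):
    model = model_ctor(hps, mode="train", ps_device="/gpu:0")
    saver = tf.train.Saver()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for _ in range(num_iters):
            x, y = get_batch()  # int32, shape [batch_size * num_gpus, num_steps]
            loss, _ = sess.run([model.loss, model.train_op],
                               feed_dict={model.x: x, model.y: y})
        saver.save(sess, "/tmp/lm.ckpt")  # arbitrary checkpoint path
    return loss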
Example #2
    def __init__(self, hps, mode="train", ps_device="/gpu:0"):
        self.hps = hps
        data_size = hps.batch_size * hps.num_gpus
        self.x = tf.placeholder(tf.int32, [data_size, hps.num_steps])
        self.y = tf.placeholder(tf.int32, [data_size, hps.num_steps])
        losses = []
        tower_grads = []
        logitses = []
        if mode == "predict_next":
            # Candidate token ids to score, and their count, fed at prediction time.
            self.ind = tf.placeholder(tf.int32, name="ind")
            self.ind_len = tf.placeholder(tf.int32, name="ind_len")

        xs = tf.split(self.x, hps.num_gpus, 0)
        ys = tf.split(self.y, hps.num_gpus, 0)
        print("ngpus:", hps.num_gpus)
        for i in range(hps.num_gpus):
            # One tower per GPU; towers after the first reuse tower 0's variables.
            with tf.device(assign_to_gpu(i, ps_device)), tf.variable_scope(
                    tf.get_variable_scope(), reuse=True if i > 0 else None):
                if mode == "predict_next":
                    loss, logits = self._forward(i, xs[i], ys[i], mode=mode)
                    logitses += [logits]
                else:
                    loss = self._forward(i, xs[i], ys[i])
                losses += [loss]
                if mode == "train":
                    cur_grads = self._backward(
                        loss, summaries=(i == hps.num_gpus - 1))
                    tower_grads += [cur_grads]
        if mode == "predict_next":  # ngpus = 1, nlayers = 1, nums_step =1
            self.logits = tf.squeeze(logitses)
            """
            #add graph 
            logits_cache = tf.get_variable("logits_cache", hps.vocab_size)
            assign_cache = tf.assign(logits_cache,  self.logits)
            logits_bos = tf.get_variable("logits_bos", hps.vocab_size)
            assign_cache = tf.assign(logits_cache, self.logits)[0]
            assign_bos = tf.assign(logits_cache, logits_bos)[0]
            """
            ind_logits = tf.reshape(self.logits, [hps.vocab_size, -1])
            ind_logits = tf.gather(ind_logits, self.ind)
            print "ind_logits:", logitses, ind_logits
            ind_logits = tf.reshape(ind_logits, [-1, self.ind_len])
            self.top_k = tf.minimum(self.ind_len, hps.arg_max)
            _, self.ind_index = tf.nn.top_k(ind_logits, self.top_k)
            print "ind_index:", self.ind_index
            _, self.index = tf.nn.top_k(self.logits, hps.arg_max)

        self.loss = tf.add_n(losses) / len(losses)  # total loss
        tf.summary.scalar("model/loss", self.loss)

        self.global_step = tf.get_variable("global_step", [],
                                           tf.int32,
                                           trainable=False)

        if mode == "train":
            grads = average_grads(tower_grads)
            optimizer = tf.train.AdagradOptimizer(
                hps.learning_rate, initial_accumulator_value=1.0)
            self.train_op = optimizer.apply_gradients(
                grads, global_step=self.global_step)
            self.summary_op = tf.summary.merge_all()
        else:
            self.train_op = tf.no_op()

        if mode in ["train", "eval", "predict_next"] and hps.average_params:
            with tf.name_scope(
                    None
            ):  # This is needed due to EMA implementation silliness.
                # Keep track of moving average of LSTM variables.
                ema = tf.train.ExponentialMovingAverage(decay=0.999)
                variables_to_average = find_trainable_variables("LSTM")
                self.train_op = tf.group(
                    *[self.train_op,
                      ema.apply(variables_to_average)])
                self.avg_dict = ema.variables_to_restore(variables_to_average)
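
A sketch of how the "predict_next" graph above might be queried. The constructor is passed in because the class name is not shown, ckpt_path is whatever training produced, and prefix_ids/candidate_ids are caller-supplied token ids; all of these names are assumptions.

# Hypothetical next-token query; model_ctor, ckpt_path, prefix_ids, and
# candidate_ids are assumptions, not part of the example above.
import numpy as np
import tensorflow as tf

def predict_next(model_ctor, hps, ckpt_path, prefix_ids, candidate_ids):
    # The predict_next branch above assumes num_gpus = 1 and num_steps = 1.
    model = model_ctor(hps, mode="predict_next")
    # Restore EMA-averaged weights when avg_dict exists, raw weights otherwise.
    saver = tf.train.Saver(getattr(model, "avg_dict", None))
    with tf.Session() as sess:
        saver.restore(sess, ckpt_path)
        x = np.asarray(prefix_ids, dtype=np.int32).reshape(
            hps.batch_size * hps.num_gpus, hps.num_steps)
        y = np.zeros_like(x)  # targets are unused when only ranking logits
        index, ind_index = sess.run(
            [model.index, model.ind_index],
            feed_dict={model.x: x, model.y: y,
                       model.ind: candidate_ids,
                       model.ind_len: len(candidate_ids)})
    return index, ind_index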