Example #1
# Assumed imports for this example (TensorFlow 1.x). The Attentive_encoder,
# Pointer_decoder, Critic and variable_summaries helpers come from the
# surrounding project and are not shown here.
import itertools

import numpy as np
import tensorflow as tf


class Actor(object):
    def __init__(self, config):
        self.config = config

        # Data config
        self.batch_size = config.batch_size  # batch size
        self.max_length = config.nCells * config.nMuts  # input sequence length
        self.input_dimension = config.input_dimension  # dimension of a input

        # Reward config
        #self.avg_baseline = tf.Variable(config.init_baseline, trainable=False, name="moving_avg_baseline") # moving baseline for Reinforce
        # self.ma = config.ma # moving average update
        self.beta = config.beta  # hyperparameter for adjusting NLL

        # Training config (actor)
        self.global_step = tf.Variable(0, trainable=False,
                                       name="global_step")  # global step
        self.lr1_start = config.lr1_start  # initial learning rate
        self.lr1_decay_rate = config.lr1_decay_rate  # learning rate decay rate
        self.lr1_decay_step = config.lr1_decay_step  # learning rate decay step

        # Training config (critic)
        self.global_step2 = tf.Variable(0,
                                        trainable=False,
                                        name="global_step2")  # global step
        self.lr2_start = config.lr1_start  # initial learning rate
        self.lr2_decay_rate = config.lr1_decay_rate  # learning rate decay rate
        self.lr2_decay_step = config.lr1_decay_step  # learning rate decay step

        # Tensor block holding the input sequences [Batch Size, Sequence Length, Features]
        self.input_ = tf.placeholder(
            tf.float32,
            [self.batch_size, self.max_length, self.input_dimension],
            name="input_coordinates")

        self.build_permutation()
        self.build_critic()
        self.build_reward()
        self.build_optim()
        self.merged = tf.summary.merge_all()

    def count3gametes(self, input_):
        columnPairs = list(itertools.permutations(range(self.config.nMuts), 2))
        nColumnPairs = len(columnPairs)
        columnReplicationList = np.array(columnPairs).reshape(-1)
        l = []
        for i in range(input_.get_shape()[0]):
            for j in range(self.config.nCells):
                for k in columnReplicationList:
                    l.append([i, j, k])
        replicatedColumns = tf.reshape(tf.gather_nd(input_, l), [
            input_.get_shape()[0], self.config.nCells,
            len(columnReplicationList)
        ])
        replicatedColumns = tf.transpose(replicatedColumns, perm=[0, 2, 1])
        x = tf.reshape(
            replicatedColumns,
            [input_.get_shape()[0], nColumnPairs, 2, self.config.nCells])
        col10 = tf.count_nonzero(tf.greater(x[:, :, 0, :], x[:, :, 1, :]),
                                 axis=2)  # batch_size * nColumnPairs
        col01 = tf.count_nonzero(tf.greater(x[:, :, 1, :], x[:, :, 0, :]),
                                 axis=2)  # batch_size * nColumnPairs
        col11 = tf.count_nonzero(tf.equal(x[:, :, 0, :] + x[:, :, 1, :], 2),
                                 axis=2)  # batch_size * nColumnPairs
        eachColPair = col10 * col01 * col11  # batch_size * nColumnPairs
        return tf.reduce_sum(eachColPair, axis=1)  # batch_size
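
    # count3gametes() above implements the three-gametes test: a pair of mutation
    # columns conflicts with a perfect phylogeny when the row patterns (1,0),
    # (0,1) and (1,1) all occur, so col10 * col01 * col11 is non-zero exactly for
    # conflicting pairs and the sum counts violations per sample. A minimal NumPy
    # sketch of the same count (hypothetical helper, not part of this class; like
    # the graph version, each unordered pair is visited twice):
    #
    #   def count3gametes_np(m):  # m: 0/1 matrix of shape [nCells, nMuts]
    #       v = 0
    #       for a, b in itertools.permutations(range(m.shape[1]), 2):
    #           v += (np.sum(m[:, a] > m[:, b]) * np.sum(m[:, b] > m[:, a])
    #                 * np.sum(m[:, a] + m[:, b] == 2))
    #       return v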

    def build_permutation(self):

        with tf.variable_scope("encoder"):

            Encoder = Attentive_encoder(self.config)
            encoder_output = Encoder.encode(self.input_)

        with tf.variable_scope('decoder'):
            # Ptr-net returns permutations (self.positions), with their log-probability for backprop
            self.ptr = Pointer_decoder(encoder_output, self.config)
            self.positions, self.log_softmax = self.ptr.loop_decode()
            variable_summaries('log_softmax',
                               self.log_softmax,
                               with_max_min=True)

    def build_critic(self):

        with tf.variable_scope("critic"):
            # Critic predicts reward (parametric baseline for REINFORCE)
            self.critic = Critic(self.config)
            self.critic.predict_rewards(self.input_)
            variable_summaries('predictions',
                               self.critic.predictions,
                               with_max_min=True)

    def build_reward(self):

        with tf.name_scope('permutations'):

            # Reorder the input according to the sampled permutation

            inp_ = tf.identity(self.input_)
            pos = tf.identity(self.positions)

            x = tf.zeros([int(self.max_length / 2), self.batch_size],
                         tf.float32)
            for i in range(int(self.max_length / 2)):

                r = tf.range(start=0, limit=self.batch_size, delta=1)
                r = tf.expand_dims(r, 1)
                r = tf.expand_dims(r, 2)
                r3 = tf.cast(
                    tf.ones([self.max_length, 1]) * tf.cast(r, tf.float32),
                    tf.int32)

                r4 = tf.squeeze(r, axis=2)
                r5 = tf.expand_dims(tf.fill([self.batch_size], i), axis=1)
                u = tf.ones_like(r5)
                r4_r5 = tf.concat([r4, r5], axis=1)

                pos_mask = tf.squeeze(tf.scatter_nd(
                    indices=r4_r5,
                    updates=u,
                    shape=[self.batch_size, self.max_length, 1]),
                                      axis=2)

                pos_mask_cum1 = tf.cumsum(pos_mask,
                                          reverse=True,
                                          exclusive=True,
                                          axis=1)
                pos_mask_cum2 = tf.cumsum(pos_mask,
                                          reverse=False,
                                          exclusive=False,
                                          axis=1)  # for calculating NLL

                per_pos = tf.concat([r3, tf.expand_dims(pos, axis=2)], axis=2)

                per_ = tf.gather_nd(inp_, indices=per_pos)
                per_fp_fn = per_[:, :, 2:3]
                per_fp_fn_log = tf.log(1 / per_fp_fn)  # for N01 and N10
                per_fp_fn_com = tf.subtract(tf.ones_like(per_fp_fn),
                                            per_fp_fn)  # for N00 and N11
                per_fp_fn_com_log = tf.log(1 / per_fp_fn_com)

                NLL_N10_N01 = tf.reduce_sum(tf.multiply(
                    tf.squeeze(per_fp_fn_log, axis=2),
                    tf.cast(pos_mask_cum1, tf.float32)),
                                            axis=1,
                                            keepdims=True)

                per_matrix_mul_cum2 = tf.multiply(
                    tf.squeeze(per_[:, :, 3:4], axis=2),
                    tf.cast(pos_mask_cum2, tf.float32))
                N11 = tf.reduce_sum(per_matrix_mul_cum2, axis=1, keepdims=True)
                sum_mask_cum2 = tf.reduce_sum(tf.cast(pos_mask_cum2,
                                                      tf.float32),
                                              axis=1,
                                              keepdims=True)
                N00 = tf.subtract(sum_mask_cum2, N11)

                per_matrix = per_[:, :, 3:4]

                sum_per_matrix = tf.reduce_sum(tf.squeeze(per_matrix, axis=2),
                                               axis=1)
                sum_per_fp = tf.reduce_sum(tf.squeeze(tf.multiply(
                    per_fp_fn, per_matrix),
                                                      axis=2),
                                           axis=1)
                fp = tf.divide(sum_per_fp, sum_per_matrix)

                sum_per_fn = tf.subtract(
                    tf.reduce_sum(tf.squeeze(per_fp_fn, axis=2), axis=1),
                    sum_per_fp)
                q = tf.cast(
                    tf.tile(tf.constant([self.max_length]),
                            tf.constant([self.batch_size])), tf.float32)
                fn = tf.divide(sum_per_fn, tf.subtract(q, sum_per_matrix))

                fp_com = tf.log(1 / tf.subtract(
                    tf.cast(
                        tf.tile(tf.constant([1]), tf.constant(
                            [self.batch_size])), tf.float32), fp))
                fn_com = tf.log(1 / tf.subtract(
                    tf.cast(
                        tf.tile(tf.constant([1]), tf.constant(
                            [self.batch_size])), tf.float32), fn))

                N00_NLL = tf.multiply(tf.expand_dims(fp_com, axis=1), N00)
                N11_NLL = tf.multiply(tf.expand_dims(fn_com, axis=1), N11)

                NLL = tf.scalar_mul(self.beta,
                                    tf.add_n([NLL_N10_N01, N00_NLL, N11_NLL]))
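
                # NLL is the beta-scaled negative log-likelihood of the imputed
                # matrix: the entries flipped so far (the N10/N01 positions selected
                # by pos_mask_cum1) each contribute log(1/p), while the remaining
                # entries contribute N00*log(1/(1-fp)) and N11*log(1/(1-fn)) using
                # the batch-level fp/fn estimates computed above.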

                m1 = tf.multiply(tf.squeeze(per_matrix, axis=2),
                                 tf.cast(pos_mask_cum1, tf.float32))
                m1 = tf.subtract(tf.cast(pos_mask_cum1, tf.float32), m1)
                m2 = tf.multiply(tf.squeeze(per_matrix, axis=2),
                                 tf.cast(pos_mask_cum2, tf.float32))
                T_f = tf.add(m1, m2)

                per_flipped = tf.concat(
                    [per_[:, :, 0:3],
                     tf.expand_dims(T_f, axis=2)], axis=2)

                idx = tf.concat(
                    [r3, tf.cast(per_flipped[:, :, 0:2], tf.int32)], axis=2)
                m_f = tf.scatter_nd(indices=tf.expand_dims(idx, 2),
                                    updates=per_flipped[:, :, 3:4],
                                    shape=tf.constant([
                                        self.batch_size, self.config.nCells,
                                        self.config.nMuts
                                    ]))
                c_v = self.count3gametes(m_f)
                c_t = tf.expand_dims(tf.add(tf.squeeze(NLL, axis=1),
                                            tf.cast(c_v, tf.float32)),
                                     axis=0)
                ind = []
                for i1 in range(x.get_shape()[1]):
                    ind.append([i, i1])
                ind = tf.convert_to_tensor(ind)
                ind = tf.expand_dims(ind, axis=0)
                x_n = tf.scatter_nd(indices=ind,
                                    updates=c_t,
                                    shape=x.get_shape())
                # x_n is zero everywhere except row i, so accumulating fills one
                # row of x per loop iteration (x starts as all zeros).
                x = tf.add(x, x_n)
            x_m = tf.reduce_min(x, axis=0)
            self.cost = tf.identity(x_m)

        with tf.name_scope('environment'):

            cost = tf.identity(self.cost)

            # Define reward from the cost (NLL term plus three-gametes violations)
            self.reward = tf.cast(cost, tf.float32)
            variable_summaries('reward', self.reward, with_max_min=True)

    def build_optim(self):
        # Update moving_mean and moving_variance for batch normalization layers
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):

            with tf.name_scope('baseline'):
                # Moving-average baseline is disabled in this variant; the reward
                # moments are computed here but not used further.
                reward_mean, reward_var = tf.nn.moments(self.reward, axes=[0])

            with tf.name_scope('reinforce'):
                # Actor learning rate
                self.lr1 = tf.train.exponential_decay(self.lr1_start,
                                                      self.global_step,
                                                      self.lr1_decay_step,
                                                      self.lr1_decay_rate,
                                                      staircase=False,
                                                      name="learning_rate1")
                # Optimizer
                self.opt1 = tf.train.AdamOptimizer(learning_rate=self.lr1,
                                                   beta1=0.9,
                                                   beta2=0.99,
                                                   epsilon=0.0000001)
                # Discounted reward
                self.reward_baseline = tf.stop_gradient(
                    self.reward - self.critic.predictions)  # [Batch size, 1]
                variable_summaries('reward_baseline',
                                   self.reward_baseline,
                                   with_max_min=True)
                # Loss
                self.loss1 = tf.reduce_mean(
                    self.reward_baseline * self.log_softmax, 0)
                tf.summary.scalar('loss1', self.loss1)
                # Minimize step
                gvs = self.opt1.compute_gradients(self.loss1)
                capped_gvs = [(tf.clip_by_norm(grad, 1.), var)
                              for grad, var in gvs
                              if grad is not None]  # L2 clip
                self.train_step1 = self.opt1.apply_gradients(
                    capped_gvs, global_step=self.global_step)

            with tf.name_scope('state_value'):
                # Critic learning rate
                self.lr2 = tf.train.exponential_decay(self.lr2_start,
                                                      self.global_step2,
                                                      self.lr2_decay_step,
                                                      self.lr2_decay_rate,
                                                      staircase=False,
                                                      name="learning_rate1")
                # Optimizer
                self.opt2 = tf.train.AdamOptimizer(learning_rate=self.lr2,
                                                   beta1=0.9,
                                                   beta2=0.99,
                                                   epsilon=0.0000001)
                # Loss
                weights_ = 1.0  #weights_ = tf.exp(self.log_softmax-tf.reduce_max(self.log_softmax)) # probs / max_prob
                self.loss2 = tf.losses.mean_squared_error(
                    self.reward, self.critic.predictions, weights=weights_)
                tf.summary.scalar('loss2', self.loss2)
                # Minimize step
                gvs2 = self.opt2.compute_gradients(self.loss2)
                capped_gvs2 = [(tf.clip_by_norm(grad, 1.), var)
                               for grad, var in gvs2
                               if grad is not None]  # L2 clip
                self.train_step2 = self.opt2.apply_gradients(
                    capped_gvs2, global_step=self.global_step2)
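
A rough sketch of how an Actor graph like the one above is usually driven in
TF 1.x is given below. The config object, the next_batch() data helper and the
number of steps are assumptions for illustration, not part of this example:

# Hypothetical training loop for the Actor above (TF 1.x style).
actor = Actor(config)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(nb_steps):                      # nb_steps: assumed constant
        batch = next_batch(config.batch_size)         # [batch, nCells*nMuts, input_dimension]
        _, _, reward = sess.run([actor.train_step1, actor.train_step2, actor.reward],
                                feed_dict={actor.input_: batch})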
Example #2
class Actor(object):


    def __init__(self, config):
        self.config = config

        # Data config
        self.batch_size = config.batch_size # batch size
        self.max_length = config.max_length # input sequence length (number of cities)
        self.input_dimension = config.input_dimension # dimension of a city (coordinates)

        # Reward config
        self.avg_baseline = tf.Variable(config.init_baseline, trainable=False, name="moving_avg_baseline") # moving baseline for Reinforce
        self.alpha = config.alpha # moving average update

        # Training config (actor)
        self.global_step= tf.Variable(0, trainable=False, name="global_step") # global step
        self.lr1_start = config.lr1_start # initial learning rate
        self.lr1_decay_rate= config.lr1_decay_rate # learning rate decay rate
        self.lr1_decay_step= config.lr1_decay_step # learning rate decay step

        # Training config (critic)
        self.global_step2 = tf.Variable(0, trainable=False, name="global_step2") # global step
        self.lr2_start = config.lr1_start # initial learning rate
        self.lr2_decay_rate= config.lr1_decay_rate # learning rate decay rate
        self.lr2_decay_step= config.lr1_decay_step # learning rate decay step

        # Tensor block holding the input sequences [Batch Size, Sequence Length, Features]
        self.input_ = tf.placeholder(tf.float32, [self.batch_size, self.max_length, self.input_dimension], name="input_coordinates")

        self.build_permutation()
        self.build_critic()
        self.build_reward()
        self.build_optim()
        self.merged = tf.summary.merge_all()


    def build_permutation(self):

        with tf.variable_scope("encoder"):

            Encoder = Attentive_encoder(self.config)
            encoder_output = Encoder.encode(self.input_)

        with tf.variable_scope('decoder'):
            # Ptr-net returns permutations (self.positions), with their log-probability for backprop
            self.ptr = Pointer_decoder(encoder_output, self.config)
            self.positions, self.log_softmax = self.ptr.loop_decode()
            variable_summaries('log_softmax',self.log_softmax, with_max_min = True)
            

    def build_critic(self):

        with tf.variable_scope("critic"):
            # Critic predicts reward (parametric baseline for REINFORCE)
            self.critic = Critic(self.config)
            self.critic.predict_rewards(self.input_)
            variable_summaries('predictions',self.critic.predictions, with_max_min = True)


    def build_reward(self):

        with tf.name_scope('permutations'):

            # Reorder the input according to the sampled tour
            self.ordered_input_ = []
            for input_, path in zip(tf.unstack(self.input_,axis=0), tf.unstack(self.positions,axis=0)): # unstack along the batch axis
                self.ordered_input_.append(tf.gather_nd(input_,tf.expand_dims(path,1)))
            self.ordered_input_ = tf.transpose(tf.stack(self.ordered_input_,0),[2,1,0]) # [batch size, seq length +1 , features] to [features, seq length +1, batch_size]   Rq: +1 because end = start = first_city

            # Ordered coordinates
            ordered_x_ = self.ordered_input_[0] # [seq length +1, batch_size]
            delta_x2 = tf.transpose(tf.square(ordered_x_[1:]-ordered_x_[:-1]),[1,0]) # [batch_size, seq length]        delta_x**2
            ordered_y_ = self.ordered_input_[1] # [seq length +1, batch_size]
            delta_y2 = tf.transpose(tf.square(ordered_y_[1:]-ordered_y_[:-1]),[1,0]) # [batch_size, seq length]        delta_y**2

        with tf.name_scope('environment'):

            # Get tour length (euclidean distance)
            inter_city_distances = tf.sqrt(delta_x2+delta_y2) # sqrt(delta_x**2 + delta_y**2) this is the euclidean distance between each city: depot --> ... ---> depot      [batch_size, seq length]
            self.distances = tf.reduce_sum(inter_city_distances, axis=1) # [batch_size]
            #variable_summaries('tour_length',self.distances, with_max_min = True)

            # Define reward from tour length
            self.reward = tf.cast(self.distances,tf.float32)
            variable_summaries('reward',self.reward, with_max_min = True)


    def build_optim(self):
        # Update moving_mean and moving_variance for batch normalization layers
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):

            with tf.name_scope('baseline'):
                # Update baseline
                reward_mean, reward_var = tf.nn.moments(self.reward,axes=[0])
                self.base_op = tf.assign(self.avg_baseline, self.alpha*self.avg_baseline+(1.0-self.alpha)*reward_mean)
                tf.summary.scalar('average baseline',self.avg_baseline)

            with tf.name_scope('reinforce'):
                # Actor learning rate
                self.lr1 = tf.train.exponential_decay(self.lr1_start, self.global_step, self.lr1_decay_step,self.lr1_decay_rate, staircase=False, name="learning_rate1")
                # Optimizer
                self.opt1 = tf.train.AdamOptimizer(learning_rate=self.lr1,beta1=0.9,beta2=0.99, epsilon=0.0000001)
                # Discounted reward
                self.reward_baseline = tf.stop_gradient(self.reward - self.avg_baseline - self.critic.predictions) # [Batch size, 1] 
                variable_summaries('reward_baseline',self.reward_baseline, with_max_min = True)
                # Loss
                self.loss1 = tf.reduce_mean(self.reward_baseline*self.log_softmax,0)
                tf.summary.scalar('loss1', self.loss1)
                # Minimize step
                gvs = self.opt1.compute_gradients(self.loss1)
                capped_gvs = [(tf.clip_by_norm(grad, 1.), var) for grad, var in gvs if grad is not None] # L2 clip
                self.train_step1 = self.opt1.apply_gradients(capped_gvs, global_step=self.global_step)

            with tf.name_scope('state_value'):
                # Critic learning rate
                self.lr2 = tf.train.exponential_decay(self.lr2_start, self.global_step2, self.lr2_decay_step,self.lr2_decay_rate, staircase=False, name="learning_rate2")
                # Optimizer
                self.opt2 = tf.train.AdamOptimizer(learning_rate=self.lr2,beta1=0.9,beta2=0.99, epsilon=0.0000001)
                # Loss
                weights_ = 1.0 #weights_ = tf.exp(self.log_softmax-tf.reduce_max(self.log_softmax)) # probs / max_prob
                self.loss2 = tf.losses.mean_squared_error(self.reward - self.avg_baseline, self.critic.predictions, weights = weights_)
                tf.summary.scalar('loss2', self.loss2)
                # Minimize step
                gvs2 = self.opt2.compute_gradients(self.loss2)
                capped_gvs2 = [(tf.clip_by_norm(grad, 1.), var) for grad, var in gvs2 if grad is not None] # L2 clip
                self.train_step2 = self.opt2.apply_gradients(capped_gvs2, global_step=self.global_step2)
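
This variant keeps both a moving-average baseline (avg_baseline, updated by
base_op) and a learned critic, and the REINFORCE advantage subtracts both. A
minimal NumPy sketch of the exponential moving average applied by base_op
(names are illustrative only):

# EMA baseline update, mirroring base_op above.
def update_baseline(avg_baseline, rewards, alpha):
    # b <- alpha * b + (1 - alpha) * mean(R)
    return alpha * avg_baseline + (1.0 - alpha) * np.mean(rewards)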
Example #3
class Actor(object):
    def __init__(self, config):
        self.config = config

        # Data config
        self.batch_size = config.batch_size  # batch size
        self.max_length = config.max_length  # input sequence length (number of cities)
        self.input_dimension = config.input_dimension  # dimension of a city (coordinates)
        self.speed = config.speed  # agent's speed

        # Network config
        self.input_embed = config.input_embed  # dimension of embedding space
        self.num_neurons = config.hidden_dim  # dimension of hidden states (LSTM cell)
        self.initializer = tf.contrib.layers.xavier_initializer()  # variables initializer

        # Reward config
        self.beta = config.beta  # penalty for constraint

        # Training config (actor)
        self.global_step = tf.Variable(0, trainable=False,
                                       name="global_step")  # global step
        self.lr1_start = config.lr1_start  # initial learning rate
        self.lr1_decay_rate = config.lr1_decay_rate  # learning rate decay rate
        self.lr1_decay_step = config.lr1_decay_step  # learning rate decay step
        self.is_training = not config.inference_mode

        # Training config (critic)
        self.global_step2 = tf.Variable(0,
                                        trainable=False,
                                        name="global_step2")  # global step
        self.lr2_start = config.lr1_start  # initial learning rate
        self.lr2_decay_rate = config.lr1_decay_rate  # learning rate decay rate
        self.lr2_decay_step = config.lr1_decay_step  # learning rate decay step

        # Tensor block holding the input sequences [Batch Size, Sequence Length, Features]
        self.input_ = tf.placeholder(
            tf.float32,
            [self.batch_size, self.max_length + 1, self.input_dimension + 2],
            name="input_raw")  # +1 for depot / +2 for TW mean and TW width

        self.build_permutation()
        self.build_critic()
        self.build_reward()
        self.build_optim()
        self.merged = tf.summary.merge_all()

    def build_permutation(self):

        with tf.variable_scope("encoder"):

            with tf.variable_scope("embedding"):
                # Embed input sequence
                W_embed = tf.get_variable(
                    "weights", [1, self.input_dimension + 2, self.input_embed],
                    initializer=self.initializer)  # +2 for TW feat. here too
                embedded_input = tf.nn.conv1d(self.input_,
                                              W_embed,
                                              1,
                                              "VALID",
                                              name="embedded_input")
                # Batch Normalization
                embedded_input = tf.layers.batch_normalization(
                    embedded_input,
                    axis=2,
                    training=self.is_training,
                    name='layer_norm',
                    reuse=None)

            with tf.variable_scope("dynamic_rnn"):
                # Encode input sequence
                cell1 = LSTMCell(
                    self.num_neurons, initializer=self.initializer
                )  # BNLSTMCell(self.num_neurons, self.training) or cell1 = DropoutWrapper(cell1, output_keep_prob=0.9)
                # Return the output activations [Batch size, Sequence Length, Num_neurons] and last hidden state as tensors.
                encoder_output, encoder_state = tf.nn.dynamic_rnn(
                    cell1, embedded_input, dtype=tf.float32)

        with tf.variable_scope('decoder'):
            # Ptr-net returns permutations (self.positions), with their log-probability for backprop
            self.ptr = Pointer_decoder(encoder_output, self.config)
            self.positions, self.log_softmax, self.attending, self.pointing = self.ptr.loop_decode(
                encoder_state)
            variable_summaries('log_softmax',
                               self.log_softmax,
                               with_max_min=True)

    def build_critic(self):

        with tf.variable_scope("critic"):
            # Critic predicts reward (parametric baseline for REINFORCE)
            self.critic = Critic(self.config)
            self.critic.predict_rewards(self.input_)
            variable_summaries('predictions',
                               self.critic.predictions,
                               with_max_min=True)

    def build_reward(self):

        with tf.name_scope('permutations'):

            # Reorder the input according to the sampled tour
            self.permutations = tf.stack([
                tf.tile(
                    tf.expand_dims(tf.range(self.batch_size, dtype=tf.int32),
                                   1), [1, self.max_length + 2]),
                self.positions
            ], 2)
            self.ordered_input_ = tf.gather_nd(self.input_, self.permutations)
            self.ordered_input_ = tf.transpose(
                self.ordered_input_, [2, 1, 0]
            )  # [batch size, seq length +1 , features] to [features, seq length +1, batch_size]   Rq: +1 because end = start = depot

            # Ordered coordinates
            ordered_x_ = self.ordered_input_[0]  # [seq length +1, batch_size]
            delta_x2 = tf.transpose(
                tf.square(ordered_x_[1:] - ordered_x_[:-1]),
                [1, 0])  # [batch_size, seq length]        delta_x**2
            ordered_y_ = self.ordered_input_[1]  # [seq length +1, batch_size]
            delta_y2 = tf.transpose(
                tf.square(ordered_y_[1:] - ordered_y_[:-1]),
                [1, 0])  # [batch_size, seq length]        delta_y**2

            # Ordered TW constraints
            self.ordered_tw_mean_ = tf.transpose(
                self.ordered_input_[2][:-1],
                [1, 0])  # [seq length, batch_size] to [batch_size, seq length]
            self.ordered_tw_width_ = tf.transpose(
                self.ordered_input_[3][:-1],
                [1, 0])  # [seq length, batch_size] to [batch_size, seq length]

            self.ordered_tw_open_ = self.ordered_tw_mean_ - self.ordered_tw_width_ / 2
            self.ordered_tw_close_ = self.ordered_tw_mean_ + self.ordered_tw_width_ / 2

        with tf.name_scope('environment'):

            # Get tour length (euclidean distance)
            inter_city_distances = tf.sqrt(
                delta_x2 + delta_y2
            )  # sqrt(delta_x**2 + delta_y**2) this is the euclidean distance between each city: depot --> ... ---> depot      [batch_size, seq length]
            self.distances = tf.reduce_sum(inter_city_distances,
                                           axis=1)  # [batch_size]
            variable_summaries('tour_length',
                               self.distances,
                               with_max_min=True)

            # Get time at each city if no constraint
            self.time_at_cities = (1 / self.speed) * tf.cumsum(
                inter_city_distances, axis=1, exclusive=True
            ) - 10  # [batch size, seq length]          # Rq: -10 to be on time at depot (t_mean centered)

            # Apply constraints to each city
            self.constrained_delivery_time = []
            cumul_lateness = 0
            for time_open, delivery_time in zip(
                    tf.unstack(self.ordered_tw_open_, axis=1),
                    tf.unstack(self.time_at_cities,
                               axis=1)):  # unstack along the sequence axis
                delayed_delivery = delivery_time + cumul_lateness
                cumul_lateness += tf.maximum(
                    time_open - delayed_delivery, tf.zeros([
                        self.batch_size
                    ]))  # if you have to wait... wait (impacts further states)
                self.constrained_delivery_time.append(delivery_time +
                                                      cumul_lateness)
            self.constrained_delivery_time = tf.stack(
                self.constrained_delivery_time, 1)

            # Define delay from lateness
            self.delay = tf.maximum(
                self.constrained_delivery_time - self.ordered_tw_close_ -
                0.0001, tf.zeros([self.batch_size, self.max_length + 1])
            )  # Delay perceived by the client (doesn't care if the deliverer waits)
            self.delay = tf.count_nonzero(self.delay, 1)
            variable_summaries('delay',
                               tf.cast(self.delay, tf.float32),
                               with_max_min=True)

            # Define reward from tour length & delay
            self.reward = tf.cast(self.distances,
                                  tf.float32) + self.beta * tf.sqrt(
                                      tf.cast(self.delay, tf.float32))
            variable_summaries('reward', self.reward, with_max_min=True)
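
    # The build_reward() loop above propagates waiting time forward: if the vehicle
    # reaches a city before its window opens, the wait (time_open - arrival) is added
    # to cumul_lateness and pushes back every later delivery. The same recursion for
    # a single sample, in plain Python (illustrative only):
    #
    #   times, wait = [], 0.0
    #   for open_i, arrive_i in zip(tw_open, time_at_cities):
    #       wait += max(open_i - (arrive_i + wait), 0.0)
    #       times.append(arrive_i + wait)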

    def build_optim(self):
        # Update moving_mean and moving_variance for batch normalization layers
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):

            with tf.name_scope('reinforce'):
                # Actor learning rate
                self.lr1 = tf.train.exponential_decay(self.lr1_start,
                                                      self.global_step,
                                                      self.lr1_decay_step,
                                                      self.lr1_decay_rate,
                                                      staircase=False,
                                                      name="learning_rate1")
                # Optimizer
                self.opt1 = tf.train.AdamOptimizer(learning_rate=self.lr1,
                                                   beta1=0.9,
                                                   beta2=0.99,
                                                   epsilon=0.0000001)
                # Discounted reward
                self.reward_baseline = tf.stop_gradient(
                    self.reward - self.critic.predictions)  # [Batch size, 1]
                variable_summaries('reward_baseline',
                                   self.reward_baseline,
                                   with_max_min=True)
                # Loss
                self.loss1 = tf.reduce_mean(
                    self.reward_baseline * self.log_softmax, 0)
                tf.summary.scalar('loss1', self.loss1)
                # Minimize step
                gvs = self.opt1.compute_gradients(self.loss1)
                capped_gvs = [(tf.clip_by_norm(grad, 1.), var)
                              for grad, var in gvs
                              if grad is not None]  # L2 clip
                self.train_step1 = self.opt1.apply_gradients(
                    capped_gvs, global_step=self.global_step)

            with tf.name_scope('state_value'):
                # Critic learning rate
                self.lr2 = tf.train.exponential_decay(self.lr2_start,
                                                      self.global_step2,
                                                      self.lr2_decay_step,
                                                      self.lr2_decay_rate,
                                                      staircase=False,
                                                      name="learning_rate1")
                # Optimizer
                self.opt2 = tf.train.AdamOptimizer(learning_rate=self.lr2,
                                                   beta1=0.9,
                                                   beta2=0.99,
                                                   epsilon=0.0000001)
                # Loss
                self.loss2 = tf.losses.mean_squared_error(
                    self.reward, self.critic.predictions, weights=1.0)
                tf.summary.scalar('loss2', self.loss2)
                # Minimize step
                gvs2 = self.opt2.compute_gradients(self.loss2)
                capped_gvs2 = [(tf.clip_by_norm(grad, 1.), var)
                               for grad, var in gvs2
                               if grad is not None]  # L2 clip
                self.train_step2 = self.opt2.apply_gradients(
                    capped_gvs2, global_step=self.global_step2)
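
At inference time only the sampled orderings and their rewards are needed. A
minimal, assumed usage (the trained session, config object and next_batch()
helper are hypothetical, as above):

# Hypothetical inference call: fetch sampled positions and rewards only.
positions, reward = sess.run([actor.positions, actor.reward],
                             feed_dict={actor.input_: next_batch(config.batch_size)})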