Example #1
    def __init__(self, cfg, training=False):
        super(ActorCriticMLP, self).__init__()

        self.model_name = 'ActorCriticMLP'

        self.cfg = cfg
        self.training = training

        # network layers
        self.hidden1 = nn.Linear(96, 128)
        self.hidden2 = nn.Linear(128, 256)
        #self.hidden3 = nn.Linear(256, 256)

        # actor
        self.actor_mu = nn.Linear(256, self.cfg.NUM_ACTIONS)
        self.actor_sigma = nn.Linear(256, self.cfg.NUM_ACTIONS)

        # critic
        self.critic = nn.Linear(256, 1)

        # weight initialisation
        self.apply(ut.weight_init)

        self.actor_mu.weight.data = ut.normalized_columns_initializer(
            self.actor_mu.weight.data, 0.01)
        self.actor_mu.bias.data.fill_(0)

        self.actor_sigma.weight.data = ut.normalized_columns_initializer(
            self.actor_sigma.weight.data, 0.001)
        self.actor_sigma.bias.data.fill_(0)

        self.critic.weight.data = ut.normalized_columns_initializer(
            self.critic.weight.data, 1.0)
        self.critic.bias.data.fill_(0)
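
The two helpers used above (ut.weight_init and ut.normalized_columns_initializer) are not shown in the snippet. The sketch below gives the common A3C-style PyTorch definitions of both; treat it as an assumption about what the ut module contains, not the authors' exact code.

import numpy as np
import torch

def normalized_columns_initializer(weights, std=1.0):
    # Draw Gaussian noise and rescale each output row to L2 norm `std`
    # (the usual small-scale init for policy and value heads).
    out = torch.randn(weights.size())
    out *= std / torch.sqrt(out.pow(2).sum(1, keepdim=True))
    return out

def weight_init(m):
    # Fan-in/fan-out uniform init for Conv2d and Linear layers,
    # applied with model.apply(weight_init).
    classname = m.__class__.__name__
    if classname.find('Conv') != -1:
        weight_shape = list(m.weight.data.size())
        fan_in = np.prod(weight_shape[1:4])
        fan_out = np.prod(weight_shape[2:4]) * weight_shape[0]
        w_bound = np.sqrt(6. / (fan_in + fan_out))
        m.weight.data.uniform_(-w_bound, w_bound)
        m.bias.data.fill_(0)
    elif classname.find('Linear') != -1:
        fan_in, fan_out = m.weight.data.size(1), m.weight.data.size(0)
        w_bound = np.sqrt(6. / (fan_in + fan_out))
        m.weight.data.uniform_(-w_bound, w_bound)
        m.bias.data.fill_(0)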
Example #2
    def __init__(self, num_inputs, action_space):
        super(ActorCritic, self).__init__()
        self.conv1 = nn.Conv2d(num_inputs, 32, 3, stride=2, padding=1)
        self.conv2 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.conv3 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.conv4 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.lstm = nn.LSTMCell(32 * 3 * 3, 256)

        num_outputs = action_space.n
        self.critic_linear = nn.Linear(256, 1)
        self.actor_linear = nn.Linear(256, num_outputs)

        self.apply(weights_init)
        self.actor_linear.weight.data = normalized_columns_initializer(
            self.actor_linear.weight.data, 0.01)
        self.actor_linear.bias.data.fill_(0)
        self.critic_linear.weight.data = normalized_columns_initializer(
            self.critic_linear.weight.data, 1.0)
        self.critic_linear.bias.data.fill_(0)

        self.lstm.bias_ih.data.fill_(0)
        self.lstm.bias_hh.data.fill_(0)

        self.train()
        if USE_CUDA:
            self.cuda()
    def __init__(self, cfg, training=False, gpu_id=0):
        super(ActorCriticLSTM, self).__init__()

        self.model_name = 'ActorCriticLSTM'

        self.cfg = cfg
        self.training = training
        self.gpu_id = gpu_id

        self.lstm_layers = 1
        self.lstm_size = 512

        self.conv1 = nn.Conv2d(1, 32, 5, stride=1, padding=2)
        self.maxp1 = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(32, 32, 5, stride=1, padding=1)
        self.maxp2 = nn.MaxPool2d(2, 2)
        self.conv3 = nn.Conv2d(32, 64, 4, stride=1, padding=1)
        self.maxp3 = nn.MaxPool2d(2, 2)
        self.conv4 = nn.Conv2d(64, 64, 3, stride=1, padding=1)
        self.maxp4 = nn.MaxPool2d(2, 2)

        #self.lstm = nn.LSTMCell(256, 256)
        self.lstm = nn.LSTM(1024,
                            hidden_size=self.lstm_size,
                            num_layers=self.lstm_layers)

        # actor
        self.actor = nn.Linear(self.lstm_size, self.cfg.NUM_ACTIONS)

        # critic
        self.critic = nn.Linear(self.lstm_size, 1)

        # weight initialisation
        self.apply(ut.weight_init)

        relu_gain = nn.init.calculate_gain('relu')
        self.conv1.weight.data.mul_(relu_gain)
        self.conv2.weight.data.mul_(relu_gain)
        self.conv3.weight.data.mul_(relu_gain)
        self.conv4.weight.data.mul_(relu_gain)

        self.actor.weight.data = ut.normalized_columns_initializer(
            self.actor.weight.data, 0.01)
        self.actor.bias.data.fill_(0)

        self.critic.weight.data = ut.normalized_columns_initializer(
            self.critic.weight.data, 1.0)
        self.critic.bias.data.fill_(0)
        """
    def __init__(self, cfg, training=False):
        super(ActorCriticLSTM, self).__init__()

        self.model_name = 'ActorCriticLSTM'

        self.cfg = cfg
        self.training = training

        self.lstm_layers = 1
        self.lstm_size = 128

        # network layers
        self.hidden1 = nn.Linear(8, 128)
        #self.hidden3 = nn.Linear(256, 256)

        #self.lstm = nn.LSTMCell(256, 256)
        self.lstm = nn.LSTM(128,
                            hidden_size=self.lstm_size,
                            num_layers=self.lstm_layers)

        # actor
        self.actor_mu = nn.Linear(128, self.cfg.NUM_ACTIONS)
        self.actor_sigma = nn.Linear(128, self.cfg.NUM_ACTIONS)

        # critic
        self.critic = nn.Linear(128, 1)

        # weight initialisation
        self.apply(ut.weight_init)

        self.actor_mu.weight.data = ut.normalized_columns_initializer(
            self.actor_mu.weight.data, 0.01)
        self.actor_mu.bias.data.fill_(0)

        self.actor_sigma.weight.data = ut.normalized_columns_initializer(
            self.actor_sigma.weight.data, 0.001)
        self.actor_sigma.bias.data.fill_(0)

        self.critic.weight.data = ut.normalized_columns_initializer(
            self.critic.weight.data, 1.0)
        self.critic.bias.data.fill_(0)
        """
Example #5
    def __init__(self, num_inputs, num_outputs):
        super(ActorCriticLSTM, self).__init__()
        self.conv1 = nn.Conv2d(num_inputs, 32, 5, stride=1, padding=2)
        self.conv2 = nn.Conv2d(32, 32, 5, stride=1, padding=1)
        self.conv3 = nn.Conv2d(32, 64, 4, stride=1, padding=1)
        self.conv4 = nn.Conv2d(64, 64, 3, stride=1, padding=1)

        self.lstm = nn.LSTMCell(1024, 512)

        self.critic_linear = nn.Linear(512, 1)
        self.actor_linear = nn.Linear(512, num_outputs)

        self.apply(weights_init)
        self.actor_linear.weight.data = normalized_columns_initializer(
            self.actor_linear.weight.data, 0.01)
        self.actor_linear.bias.data.fill_(0)
        self.critic_linear.weight.data = normalized_columns_initializer(
            self.critic_linear.weight.data, 1.0)
        self.critic_linear.bias.data.fill_(0)

        self.lstm.bias_ih.data.fill_(0)
        self.lstm.bias_hh.data.fill_(0)

        self.reset()
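
Example #5 ends with self.reset(), which the snippet does not define. A plausible reading, sketched below under that assumption, is that it zeroes the LSTMCell state, which is then carried from step to step:

import torch
import torch.nn as nn

lstm = nn.LSTMCell(1024, 512)
hx = torch.zeros(1, 512)   # hidden state, batch of 1 (what reset() likely re-creates)
cx = torch.zeros(1, 512)   # cell state

features = torch.randn(1, 1024)     # stands in for the flattened conv features
hx, cx = lstm(features, (hx, cx))   # one environment step; reuse (hx, cx) next step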
Example #6
    def __init__(self, scope, trainer, global_step=None):
        with tf.variable_scope(scope):
            if FLAGS.meta:
                self.prev_rewards = tf.placeholder(shape=[None], dtype=tf.float32, name="Prev_Rewards")
                self.reward_multiplier = tf.placeholder(shape=[None], dtype=tf.float32, name="reward_multiplier")

                self.prev_rewards = tf.cast(tf.multiply(self.reward_multiplier, self.prev_rewards), dtype=tf.int32)
                # one_hot_indices = np.arange(0,1,0.1).tolist()[1:] + [1, 5]
                # one_hot_rewards = tf.one_hot(indices=one_hot_indices, depth=11, on_value=1, off_value=0,
                #                                       axis=-1)
                # self.prev_rewards_onehot = self.prev_rewards[]
                self.prev_rewards_onehot = tf.one_hot(self.prev_rewards, 12, dtype=tf.float32,
                                                      name="Prev_Rewards_OneHot")
            self.prev_actions = tf.placeholder(shape=[None], dtype=tf.int32, name="Prev_Actions")
            self.timestep = tf.placeholder(shape=[None, 1], dtype=tf.float32, name="timestep")
            self.prev_actions_onehot = tf.one_hot(self.prev_actions, FLAGS.nb_actions, dtype=tf.float32,
                                                  name="Prev_Actions_OneHot")

            if FLAGS.meta:
                hidden = tf.concat([self.prev_rewards_onehot, self.prev_actions_onehot, self.timestep], 1,
                               name="Concatenated_input")
            else:
                hidden = tf.concat([self.prev_actions_onehot, self.timestep], 1,
                                   name="Concatenated_input")

            lstm_cell = tf.contrib.rnn.BasicLSTMCell(48, state_is_tuple=True)
            c_init = np.zeros((1, lstm_cell.state_size.c), np.float32)
            h_init = np.zeros((1, lstm_cell.state_size.h), np.float32)
            self.state_init = [c_init, h_init]
            c_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.c], name="c_in")
            h_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.h], name="h_in")
            self.state_in = (c_in, h_in)

            rnn_in = tf.expand_dims(hidden, [0], name="RNN_input")
            step_size = tf.shape(self.timestep)[:1]
            state_in = tf.contrib.rnn.LSTMStateTuple(c_in, h_in)

            lstm_outputs, lstm_state = tf.nn.dynamic_rnn(
                lstm_cell, rnn_in, initial_state=state_in, sequence_length=step_size,
                time_major=False)

            lstm_c, lstm_h = lstm_state
            self.state_out = (lstm_c[:1, :], lstm_h[:1, :])
            rnn_out = tf.reshape(lstm_outputs, [-1, 48], name="RNN_out")

            fc_pol_w = tf.get_variable("FC_Pol_W", shape=[48, FLAGS.nb_actions],
                                       initializer=normalized_columns_initializer(0.01))
            self.policy = tf.nn.softmax(tf.matmul(rnn_out, fc_pol_w, name="Policy"), name="Policy_soft")

            fc_value_w = tf.get_variable("FC_Value_W", shape=[48, 1],
                                         initializer=normalized_columns_initializer(1.0))
            self.value = tf.matmul(rnn_out, fc_value_w, name="Value")

            if scope != 'global':
                self.actions = tf.placeholder(shape=[None], dtype=tf.int32, name="Actions")
                self.actions_onehot = tf.one_hot(self.actions, FLAGS.nb_actions, dtype=tf.float32,
                                                 name="Actions_Onehot")

                self.target_v = tf.placeholder(shape=[None], dtype=tf.float32)
                self.advantages = tf.placeholder(shape=[None], dtype=tf.float32)

                self.responsible_outputs = tf.reduce_sum(self.policy * self.actions_onehot, [1])

                # Loss functions
                self.value_loss = FLAGS.beta_v * tf.reduce_sum(tf.square(self.target_v - tf.reshape(self.value, [-1])))
                self.entropy = - tf.reduce_sum(self.policy * tf.log(self.policy + 1e-7))

                starter_beta_e = 1.0
                end_beta_e = 0.0
                decay_steps = FLAGS.max_nb_episodes_train
                self.beta_e = tf.train.polynomial_decay(starter_beta_e, global_step,
                                                        decay_steps, end_beta_e,
                                                        power=0.5)

                self.policy_loss = -tf.reduce_sum(
                    tf.log(self.responsible_outputs + 1e-7) * self.advantages)

                self.loss = self.value_loss + self.policy_loss - self.entropy * self.beta_e

                local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
                self.gradients = tf.gradients(self.loss, local_vars)
                self.var_norms = tf.global_norm(local_vars)
                grads, self.grad_norms = tf.clip_by_global_norm(self.gradients, FLAGS.gradient_clip_value)

                for grad, weight in zip(grads, local_vars):
                    tf.summary.histogram(weight.name + '_grad', grad)
                    tf.summary.histogram(weight.name, weight)

                self.merged_summary = tf.summary.merge_all()

                global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
                self.apply_grads = trainer.apply_gradients(zip(grads, global_vars))
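
The TensorFlow snippets pass normalized_columns_initializer(std) as a variable initializer but do not define it. A common TF 1.x definition is sketched below as an assumption:

import numpy as np
import tensorflow as tf  # TF 1.x, matching the graph-mode code above

def normalized_columns_initializer(std=1.0):
    # Returns an initializer that draws Gaussian noise and rescales each
    # column to L2 norm `std` (small std for the policy head, 1.0 for the value head).
    def _initializer(shape, dtype=None, partition_info=None):
        out = np.random.randn(*shape).astype(np.float32)
        out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True))
        return tf.constant(out)
    return _initializer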
Example #7
    def __init__(self, scope, trainer, global_step=None):
        with tf.variable_scope(scope):
            if FLAGS.meta:
                self.prev_rewards = tf.placeholder(shape=[None, 1],
                                                   dtype=tf.float32,
                                                   name="Prev_Rewards")
            self.prev_actions = tf.placeholder(shape=[None],
                                               dtype=tf.int32,
                                               name="Prev_Actions")
            self.timestep = tf.placeholder(shape=[None, 1],
                                           dtype=tf.float32,
                                           name="timestep")
            self.prev_actions_onehot = tf.one_hot(self.prev_actions,
                                                  FLAGS.nb_actions,
                                                  dtype=tf.float32,
                                                  name="Prev_Actions_OneHot")

            if FLAGS.meta:
                hidden = tf.concat([
                    self.prev_rewards, self.prev_actions_onehot, self.timestep
                ],
                                   1,
                                   name="Concatenated_input")
            else:
                hidden = tf.concat([self.prev_actions_onehot, self.timestep],
                                   1,
                                   name="Concatenated_input")

            lstm_cell = tf.contrib.rnn.BasicLSTMCell(48, state_is_tuple=True)
            c_init = np.zeros((1, lstm_cell.state_size.c), np.float32)
            h_init = np.zeros((1, lstm_cell.state_size.h), np.float32)
            self.state_init = [c_init, h_init]
            c_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.c],
                                  name="c_in")
            h_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.h],
                                  name="h_in")
            self.state_in = (c_in, h_in)

            rnn_in = tf.expand_dims(hidden, [0], name="RNN_input")
            step_size = tf.shape(self.timestep)[:1]
            state_in = tf.contrib.rnn.LSTMStateTuple(c_in, h_in)

            lstm_outputs, lstm_state = tf.nn.dynamic_rnn(
                lstm_cell,
                rnn_in,
                initial_state=state_in,
                sequence_length=step_size,
                time_major=False)

            lstm_c, lstm_h = lstm_state
            self.state_out = (lstm_c[:1, :], lstm_h[:1, :])
            rnn_out = tf.reshape(lstm_outputs, [-1, 48], name="RNN_out")

            fc_pol_w = tf.get_variable(
                "FC_Pol_W",
                shape=[48, FLAGS.nb_actions],
                initializer=normalized_columns_initializer(0.01))
            self.policy = tf.nn.softmax(tf.matmul(rnn_out,
                                                  fc_pol_w,
                                                  name="Policy"),
                                        name="Policy_soft")

            fc_value_w = tf.get_variable(
                "FC_Value_W",
                shape=[48, 1],
                initializer=normalized_columns_initializer(1.0))
            self.value = tf.matmul(rnn_out, fc_value_w, name="Value")

            if scope != 'global':
                self.actions = tf.placeholder(shape=[None],
                                              dtype=tf.int32,
                                              name="Actions")
                self.actions_onehot = tf.one_hot(self.actions,
                                                 FLAGS.nb_actions,
                                                 dtype=tf.float32,
                                                 name="Actions_Onehot")

                self.target_v = tf.placeholder(shape=[None], dtype=tf.float32)
                self.advantages = tf.placeholder(shape=[None],
                                                 dtype=tf.float32)

                self.responsible_outputs = tf.reduce_sum(
                    self.policy * self.actions_onehot, [1])

                # Loss functions
                self.value_loss = FLAGS.beta_v * tf.reduce_sum(
                    tf.square(self.target_v - tf.reshape(self.value, [-1])))
                self.entropy = -tf.reduce_sum(
                    self.policy * tf.log(self.policy + 1e-7))

                starter_beta_e = 1.0
                end_beta_e = 0.0
                decay_steps = 20000
                self.beta_e = tf.train.polynomial_decay(starter_beta_e,
                                                        global_step,
                                                        decay_steps,
                                                        end_beta_e,
                                                        power=0.5)

                self.policy_loss = -tf.reduce_sum(
                    tf.log(self.responsible_outputs + 1e-7) * self.advantages)

                self.loss = self.value_loss + self.policy_loss - self.entropy * self.beta_e

                local_vars = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, scope)
                self.gradients = tf.gradients(self.loss, local_vars)
                self.var_norms = tf.global_norm(local_vars)
                grads, self.grad_norms = tf.clip_by_global_norm(
                    self.gradients, FLAGS.gradient_clip_value)

                for grad, weight in zip(grads, local_vars):
                    tf.summary.histogram(weight.name + '_grad', grad)
                    tf.summary.histogram(weight.name, weight)

                self.merged_summary = tf.summary.merge_all()

                global_vars = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
                self.apply_grads = trainer.apply_gradients(
                    zip(grads, global_vars))
Example #8
    def __init__(self, scope, trainer, global_step=None):
        with tf.variable_scope(scope):
            self.prob_of_random_goal = tf.Variable(FLAGS.initial_random_goal_prob, trainable=False,
                                                   name="prob_of_random_goal", dtype=tf.float32)
            self.inputs = tf.placeholder(shape=[None, FLAGS.resized_height, FLAGS.resized_width, FLAGS.agent_history_length],
                                         dtype=tf.float32, name="Inputs")

            self.prev_rewards = tf.placeholder(shape=[None], dtype=tf.float32, name="Prev_Rewards")

            self.prev_rewards_onehot = tf.one_hot(tf.cast(self.prev_rewards, dtype=tf.int32), 2, dtype=tf.float32,
                                                  name="Prev_Rewards_OneHot")

            self.prev_rewards = tf.expand_dims(self.prev_rewards, 1, name="rewards")

            # self.prev_rewards_onehot = tf.expand_dims(self.prev_rewards, 0)

            self.prev_actions = tf.placeholder(shape=[None], dtype=tf.int32, name="Prev_Actions")
            self.prev_actions_onehot = tf.one_hot(self.prev_actions, FLAGS.nb_actions, dtype=tf.float32,
                                                  name="Prev_Actions_OneHot")

            self.prev_goal = tf.placeholder(shape=[None, FLAGS.hidden_dim], dtype=tf.float32, name="Prev_Goals")

            self.image_summaries = []

            if FLAGS.game not in flags.SUPPORTED_ENVS:
                self.conv0 = tf.contrib.layers.conv2d(
                    self.inputs, 16, 8, 4, activation_fn=tf.nn.elu, scope="conv0")
                with tf.variable_scope('conv0'):
                    tf.get_variable_scope().reuse_variables()
                    weights = tf.get_variable('weights')
                    grid = self.put_kernels_on_grid(weights)
                    self.image_summaries.append(
                        tf.summary.image('kernels', grid, max_outputs=1))
                self.conv = tf.contrib.layers.conv2d(
                    self.conv0, 32, 4, 2, activation_fn=tf.nn.elu, scope="conv1")
            else:
                self.conv = tf.contrib.layers.conv2d(
                    self.inputs, 32, 5, 2, activation_fn=tf.nn.elu, scope="conv1")
                with tf.variable_scope('conv1'):
                    tf.get_variable_scope().reuse_variables()
                    weights = tf.get_variable('weights')
                    grid = self.put_kernels_on_grid(weights)
                    self.image_summaries.append(
                        tf.summary.image('kernels', grid, max_outputs=1))

            with tf.variable_scope('inputs'):
                tf.get_variable_scope().reuse_variables()
                self.image_summaries.append(
                    tf.summary.image('input', self.inputs, max_outputs=100))

            self.conv_flat = tf.contrib.layers.flatten(self.conv)
            self.fc = tf.contrib.layers.fully_connected(self.conv_flat, FLAGS.hidden_dim)
            self.fc = tf.contrib.layers.layer_norm(self.fc)
            self.f_percept = tf.nn.elu(self.fc, name="Zt")

            if FLAGS.game not in flags.SUPPORTED_ENVS:
                self.f_percept = tf.concat(
                    [self.f_percept, self.prev_rewards], 1,
                    name="Zt_r")
            else:
                self.f_percept = tf.concat(
                    [self.f_percept, self.prev_rewards_onehot], 1,
                    name="Zt_r")

            summary_f_percept_act = tf.contrib.layers.summarize_activation(self.f_percept)

            ############################################################################################################
            # Manager network

            if FLAGS.meta:
                self.f_Mspace = tf.concat(
                    [self.f_percept, self.prev_goal], 1,
                    name="Zt_r")
            else:
                self.f_Mspace = tf.identity(self.f_percept, name="Zt_r")

            self.f_Mspace = tf.contrib.layers.fully_connected(self.f_Mspace, FLAGS.hidden_dim)

            self.f_percept = tf.concat(
                [self.f_percept, self.prev_actions_onehot], 1,
                name="Zt_r")

            self.f_Mspace = tf.contrib.layers.layer_norm(self.f_Mspace)
            self.f_Mspace = tf.nn.elu(self.f_Mspace, name="St")
            summary_f_Mspace_act = tf.contrib.layers.summarize_activation(self.f_Mspace)

            m_rnn_in = tf.expand_dims(self.f_Mspace, [0], name="Mrnn_in")
            step_size = tf.shape(self.inputs)[:1]

            m_lstm_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(FLAGS.hidden_dim)
            m_c_init = np.zeros((1, FLAGS.hidden_dim * FLAGS.manager_horizon), np.float32)
            m_h_init = np.zeros((1, FLAGS.hidden_dim * FLAGS.manager_horizon), np.float32)
            self.m_state_init = [m_c_init, m_h_init]
            m_c_in = tf.placeholder(tf.float32, [1, FLAGS.hidden_dim * FLAGS.manager_horizon], name="Mrnn_c_in")
            m_h_in = tf.placeholder(tf.float32, [1, FLAGS.hidden_dim * FLAGS.manager_horizon], name="Mrnn_h_in")
            self.m_state_in = (m_c_in, m_h_in)
            m_state_in = tf.contrib.rnn.LSTMStateTuple(m_c_in, m_h_in)

            m_lstm_outputs, m_lstm_state = self.fast_dlstm(m_rnn_in, m_state_in, m_lstm_cell, FLAGS.manager_horizon,
                                                           FLAGS.hidden_dim * FLAGS.manager_horizon)

            m_lstm_c, m_lstm_h = m_lstm_state
            self.m_state_out = (m_lstm_c[-1, :1, :], m_lstm_h[-1, :1, :])
            self.goals = tf.reshape(m_lstm_outputs, [-1, FLAGS.hidden_dim])
            # fully_connected takes a `scope` argument, not `name`
            self.normalized_goals = tf.contrib.layers.fully_connected(self.goals, FLAGS.hidden_dim, activation_fn=tf.tanh, scope="Gt")

            summary_goals = tf.contrib.layers.summarize_activation(self.normalized_goals)

            def randomize_goals(t):
                t = tf.cast(t, tf.int32)
                packed_tensors = tf.stack([tf.random_normal([FLAGS.hidden_dim, ]), self.normalized_goals[t, :]])

                to_update = tf.cond(
                    tf.less(self.prob_of_random_goal, tf.constant(FLAGS.final_random_goal_prob, dtype=tf.float32)),
                    lambda: tf.cast(
                        tf.multinomial(
                            tf.log([[self.prob_of_random_goal,
                                     tf.subtract(tf.constant(1.0),
                                                 self.prob_of_random_goal)]]), 1)[0][0], tf.int32),
                    lambda: tf.constant(1, tf.int32))

                resulted_tensor = tf.gather(packed_tensors, to_update)

                return resulted_tensor

            self.randomized_goals = tf.map_fn(lambda t: randomize_goals(t), tf.to_float(tf.range(0, step_size[0])),
                                              name="random_gt")

            summary_random_goals = tf.contrib.layers.summarize_activation(self.randomized_goals)

            self.decrease_prob_of_random_goal = tf.assign_sub(self.prob_of_random_goal, tf.constant(
                (FLAGS.initial_random_goal_prob - FLAGS.final_random_goal_prob) / FLAGS.explore_steps))

            m_fc_value_w = tf.get_variable("M_Value_W", shape=[FLAGS.hidden_dim, 1],
                                           initializer=normalized_columns_initializer(1.0))
            # the original line referenced an undefined `m_rnn_out`; the reshaped
            # manager RNN output (self.goals) is the likely intent
            self.m_value = tf.matmul(self.goals, m_fc_value_w, name="M_Value")

            summary_m_value_act = tf.contrib.layers.summarize_activation(self.m_value)

            ############################################################################################################

            # Worker network

            self.sum_prev_goals = tf.placeholder(shape=[None, FLAGS.hidden_dim], dtype=tf.float32, name="Prev_c_Goals_sum")

            w_rnn_in = tf.expand_dims(self.f_percept, [0], name="Wrnn_in")
            step_size = tf.shape(self.inputs)[:1]
            w_lstm_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(FLAGS.goal_embedding_size * FLAGS.nb_actions)
            w_c_init = np.zeros((1, w_lstm_cell.state_size.c), np.float32)
            w_h_init = np.zeros((1, w_lstm_cell.state_size.h), np.float32)
            self.w_state_init = [w_c_init, w_h_init]
            w_c_in = tf.placeholder(tf.float32, [1, w_lstm_cell.state_size.c], name="Wrnn_c_in")
            w_h_in = tf.placeholder(tf.float32, [1, w_lstm_cell.state_size.h], name="Wrnn_h_in")
            self.w_state_in = (w_c_in, w_h_in)
            w_state_in = tf.contrib.rnn.LSTMStateTuple(w_c_in, w_h_in)

            w_lstm_outputs, w_lstm_state = tf.nn.dynamic_rnn(
                w_lstm_cell, w_rnn_in, initial_state=w_state_in, sequence_length=step_size,
                time_major=False)

            w_lstm_c, w_lstm_h = w_lstm_state
            self.w_state_out = (w_lstm_c[:1, :], w_lstm_h[:1, :])
            Ut = tf.reshape(w_lstm_outputs, [step_size[0], FLAGS.nb_actions, FLAGS.goal_embedding_size],
                                   name="Ut")
            Ut_flat = tf.reshape(w_lstm_outputs, [step_size[0], FLAGS.nb_actions * FLAGS.goal_embedding_size],
                                        name="Ut_flat")

            summary_wrnn_act = tf.contrib.layers.summarize_activation(Ut)

            goal_encoding = tf.contrib.layers.fully_connected(self.sum_prev_goals, FLAGS.goal_embedding_size,
                                                              biases_initializer=None, scope="goal_emb")

            interm_rez = tf.squeeze(tf.matmul(Ut, tf.expand_dims(goal_encoding, 2)), 2)
            interm_rez = tf.contrib.layers.flatten(interm_rez)
            self.w_policy = tf.nn.softmax(interm_rez, name="W_Policy")

            summary_w_policy_act = tf.contrib.layers.summarize_activation(self.w_policy)

            w_fc_value_w = tf.get_variable("W_Value_W", shape=[FLAGS.nb_actions * FLAGS.goal_embedding_size + FLAGS.goal_embedding_size, 1],
                                           initializer=normalized_columns_initializer(1.0))
            self.w_value = tf.matmul(tf.concat([Ut_flat, goal_encoding], 1), w_fc_value_w, name="W_Value")

            summary_w_value_act = tf.contrib.layers.summarize_activation(self.w_value)

            if scope != 'global':

                self.w_extrinsic_return = tf.placeholder(shape=[None], dtype=tf.float32)
                self.m_extrinsic_return = tf.placeholder(shape=[None], dtype=tf.float32)
                self.w_intrinsic_return = tf.placeholder(shape=[None], dtype=tf.float32)

                def gather_state_at_horiz(t):
                    t = tf.cast(t, tf.int32)
                    f_Mspace_c = tf.gather(self.f_Mspace,
                                           tf.minimum(t + tf.constant(FLAGS.manager_horizon, dtype=tf.int32),
                                                      step_size[0] - 1))
                    return f_Mspace_c

                self.f_Mspace_c = tf.cast(
                    tf.map_fn(lambda t: gather_state_at_horiz(t), tf.to_float(tf.range(0, step_size[0])),
                              name="state_at_horiz"), dtype=tf.float32)
                self.state_diff = self.f_Mspace_c - self.f_Mspace
                self.cos_sim_state_diff = self.cosine_distance(tf.stop_gradient(self.state_diff), self.normalized_goals,
                                                               dim=1)

                self.m_advantages = self.m_extrinsic_return - tf.stop_gradient(tf.reshape(self.m_value, [-1]))
                self.goals_loss = - tf.reduce_sum(self.m_advantages * self.cos_sim_state_diff)
                self.m_value_loss = FLAGS.m_beta_v * tf.reduce_sum(
                    tf.square(self.m_extrinsic_return - tf.reshape(self.m_value, [-1])))

                self.actions = tf.placeholder(shape=[None], dtype=tf.int32, name="Actions")
                self.actions_onehot = tf.one_hot(self.actions, FLAGS.nb_actions, dtype=tf.float32,
                                                 name="Actions_Onehot")

                self.responsible_outputs = tf.reduce_sum(self.w_policy * self.actions_onehot, [1])

                self.intrinsic_return = FLAGS.alpha * self.w_intrinsic_return
                self.total_return = self.w_extrinsic_return + self.intrinsic_return
                self.w_advantages = self.total_return - tf.stop_gradient(tf.reshape(self.w_value, [-1]))

                # Loss functions
                self.w_value_loss = FLAGS.w_beta_v * tf.reduce_sum(
                    tf.square(self.total_return - tf.reshape(self.w_value, [-1])))
                self.entropy = - tf.reduce_sum(self.w_policy * tf.log(self.w_policy + 1e-7))

                self.w_policy_loss = -tf.reduce_sum(
                    tf.log(self.responsible_outputs + 1e-7) * self.w_advantages) - self.entropy * FLAGS.beta_e

                self.loss = self.w_value_loss + self.w_policy_loss + self.m_value_loss + self.goals_loss

                local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
                self.gradients = tf.gradients(self.loss, local_vars)
                self.var_norms = tf.global_norm(local_vars)
                grads, self.grad_norms = tf.clip_by_global_norm(self.gradients, FLAGS.gradient_clip_value)

                self.worker_summaries = [summary_f_percept_act, summary_f_Mspace_act, summary_goals,
                                         summary_random_goals,
                                         summary_m_value_act,
                                         summary_wrnn_act, summary_w_policy_act, summary_w_value_act]
                for grad, weight in zip(grads, local_vars):
                    self.worker_summaries.append(tf.summary.histogram(weight.name + '_grad', grad))
                    self.worker_summaries.append(tf.summary.histogram(weight.name, weight))

                self.merged_summary = tf.summary.merge(self.worker_summaries)

                global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
                self.apply_grads = trainer.apply_gradients(zip(grads, global_vars))
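
Example #8 relies on two methods the snippet does not show, self.fast_dlstm and self.cosine_distance. The dilated LSTM is too implementation-specific to reconstruct here, but a plausible cosine-distance helper is sketched below; this is an assumption, and the actual method may differ.

import tensorflow as tf  # TF 1.x

def cosine_distance(a, b, dim=1, eps=1e-8):
    # Despite the name, this returns the cosine similarity between matching
    # rows of `a` and `b`, which is how the goal/state-difference term is used
    # in the manager loss above.
    dot = tf.reduce_sum(a * b, axis=dim)
    norms = tf.norm(a, axis=dim) * tf.norm(b, axis=dim)
    return dot / (norms + eps)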
Example #9
    def __init__(self, scope, trainer, global_step=None):
        with tf.variable_scope(scope):
            self.inputs = tf.placeholder(shape=[None, FLAGS.game_size, FLAGS.game_size, FLAGS.game_channels],
                                         dtype=tf.float32, name="Inputs")

            self.conv = tf.contrib.layers.conv2d(
                self.inputs, 32, 5, 2, activation_fn=tf.nn.elu, scope="conv1")

            self.image_summaries = []
            with tf.variable_scope('conv1'):
                tf.get_variable_scope().reuse_variables()
                weights = tf.get_variable('weights')
                grid = self.put_kernels_on_grid(weights)
                self.image_summaries.append(
                    tf.summary.image('kernels', grid, max_outputs=1))


            with tf.variable_scope('inputs'):
                tf.get_variable_scope().reuse_variables()
                self.image_summaries.append(
                    tf.summary.image('input', self.inputs, max_outputs=1))

            self.fc = tf.contrib.layers.fully_connected(tf.contrib.layers.flatten(self.conv), 64)
            # self.conv = tf.contrib.layers.layer_norm(self.conv)
            self.elu = tf.nn.elu(self.fc)

            summary_conv_act = tf.contrib.layers.summarize_activation(self.elu)

            if FLAGS.meta:
                self.timestep = tf.placeholder(shape=[None, 1], dtype=tf.float32, name="timestep")
                self.prev_rewards = tf.placeholder(shape=[None], dtype=tf.int32, name="Prev_Rewards")
                self.prev_rewards_onehot = tf.one_hot(self.prev_rewards, 2, dtype=tf.float32,
                                                      name="Prev_Rewards_OneHot")
                self.prev_actions = tf.placeholder(shape=[None], dtype=tf.int32, name="Prev_Actions")
                self.prev_actions_onehot = tf.one_hot(self.prev_actions, FLAGS.nb_actions, dtype=tf.float32,
                                                      name="Prev_Actions_OneHot")

                if FLAGS.one_hot_reward:
                    hidden = tf.concat([self.elu, self.prev_rewards_onehot, self.prev_actions_onehot], 1, name="Concatenated_input")
                else:
                    hidden = tf.concat([self.elu, self.prev_rewards, self.prev_actions_onehot,
                                        self.timestep], 1, name="Concatenated_input")
            else:
                hidden = self.elu

            summary_hidden_act = tf.contrib.layers.summarize_activation(hidden)

            rnn_in = tf.expand_dims(hidden, [0], name="RNN_input")
            step_size = tf.shape(self.inputs)[:1]

            if FLAGS.fw:
                rnn_cell = LayerNormFastWeightsBasicRNNCell(48)
                # self.initial_state = rnn_cell.zero_state(tf.shape(self.inputs)[0], tf.float32)
                # self.initial_fast_weights = rnn_cell.zero_fast_weights(tf.shape(self.inputs)[0], tf.float32)
                h_init = np.zeros((1, 48), np.float32)
                fw_init = np.zeros((1, 48, 48), np.float32)
                self.state_init = [h_init, fw_init]
                h_in = tf.placeholder(tf.float32, [1, 48], name="hidden_state")
                fw_in = tf.placeholder(tf.float32, [1, 48, 48], name="fast_weights")
                self.state_in = (h_in, fw_in)

                rnn_outputs, rnn_state = tf.nn.dynamic_rnn(
                    rnn_cell, rnn_in, initial_state=self.state_in, sequence_length=step_size,
                    time_major=False)
                rnn_h, rnn_fw = rnn_state
                self.state_out = (rnn_h[:1, :], rnn_fw[:1, :])
                rnn_out = tf.reshape(rnn_outputs, [-1, 48], name="RNN_out")
            else:
                lstm_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(48)
                c_init = np.zeros((1, lstm_cell.state_size.c), np.float32)
                h_init = np.zeros((1, lstm_cell.state_size.h), np.float32)
                self.state_init = [c_init, h_init]
                c_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.c], name="c_in")
                h_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.h], name="h_in")
                self.state_in = (c_in, h_in)
                state_in = tf.contrib.rnn.LSTMStateTuple(c_in, h_in)

                lstm_outputs, lstm_state = tf.nn.dynamic_rnn(
                    lstm_cell, rnn_in, initial_state=state_in, sequence_length=step_size,
                    time_major=False)

                lstm_c, lstm_h = lstm_state
                self.state_out = (lstm_c[:1, :], lstm_h[:1, :])
                rnn_out = tf.reshape(lstm_outputs, [-1, 48], name="RNN_out")

            summary_rnn_act = tf.contrib.layers.summarize_activation(rnn_out)

            fc_pol_w = tf.get_variable("FC_Pol_W", shape=[48, FLAGS.nb_actions],
                                       initializer=normalized_columns_initializer(0.01))
            self.policy = tf.nn.softmax(tf.matmul(rnn_out, fc_pol_w, name="Policy"), name="Policy_soft")

            summary_policy_act = tf.contrib.layers.summarize_activation(self.policy)

            fc_value_w = tf.get_variable("FC_Value_W", shape=[48, 1],
                                         initializer=normalized_columns_initializer(1.0))
            self.value = tf.matmul(rnn_out, fc_value_w, name="Value")

            summary_value_act = tf.contrib.layers.summarize_activation(self.value)

            if scope != 'global':
                self.actions = tf.placeholder(shape=[None], dtype=tf.int32, name="Actions")
                self.actions_onehot = tf.one_hot(self.actions, FLAGS.nb_actions, dtype=tf.float32,
                                                 name="Actions_Onehot")

                self.target_v = tf.placeholder(shape=[None], dtype=tf.float32)
                self.advantages = tf.placeholder(shape=[None], dtype=tf.float32)

                self.responsible_outputs = tf.reduce_sum(self.policy * self.actions_onehot, [1])

                # Loss functions
                self.value_loss = FLAGS.beta_v * tf.reduce_sum(tf.square(self.target_v - tf.reshape(self.value, [-1])))
                self.entropy = - tf.reduce_sum(self.policy * tf.log(self.policy + 1e-7))

                # starter_beta_e = 1.0
                # end_beta_e = 0.0
                # decay_steps = 20000
                # self.beta_e = tf.train.polynomial_decay(starter_beta_e, global_step,
                #                                         decay_steps, end_beta_e,
                #                                         power=0.5)

                self.policy_loss = -tf.reduce_sum(
                    tf.log(self.responsible_outputs + 1e-7) * self.advantages) - self.entropy * FLAGS.beta_e

                self.loss = self.value_loss + self.policy_loss

                local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
                self.gradients = tf.gradients(self.loss, local_vars)
                self.var_norms = tf.global_norm(local_vars)
                grads, self.grad_norms = tf.clip_by_global_norm(self.gradients, FLAGS.gradient_clip_value)

                self.worker_summaries = [summary_conv_act, summary_hidden_act, summary_rnn_act, summary_policy_act,
                                         summary_value_act]
                for grad, weight in zip(grads, local_vars):
                    self.worker_summaries.append(tf.summary.histogram(weight.name + '_grad', grad))
                    self.worker_summaries.append(tf.summary.histogram(weight.name, weight))

                self.merged_summary = tf.summary.merge(self.worker_summaries)

                global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
                self.apply_grads = trainer.apply_gradients(zip(grads, global_vars))
Example #10
    def __init__(self, scope, trainer, global_step=None):
        with tf.variable_scope(scope):
            self.inputs = tf.placeholder(shape=[
                None, FLAGS.game_size, FLAGS.game_size, FLAGS.game_channels
            ],
                                         dtype=tf.float32,
                                         name="Inputs")

            self.conv = tf.contrib.layers.conv2d(self.inputs,
                                                 32,
                                                 5,
                                                 2,
                                                 activation_fn=tf.nn.elu,
                                                 scope="conv1")

            self.image_summaries = []
            with tf.variable_scope('conv1'):
                tf.get_variable_scope().reuse_variables()
                weights = tf.get_variable('weights')
                grid = self.put_kernels_on_grid(weights)
                self.image_summaries.append(
                    tf.summary.image('kernels', grid, max_outputs=1))

            with tf.variable_scope('inputs'):
                tf.get_variable_scope().reuse_variables()
                self.image_summaries.append(
                    tf.summary.image('input', self.inputs, max_outputs=1))

            self.fc = tf.contrib.layers.fully_connected(
                tf.contrib.layers.flatten(self.conv), 64)
            # self.conv = tf.contrib.layers.layer_norm(self.conv)
            self.elu = tf.nn.elu(self.fc)

            summary_conv_act = tf.contrib.layers.summarize_activation(self.elu)

            if FLAGS.meta:
                self.timestep = tf.placeholder(shape=[None, 1],
                                               dtype=tf.float32,
                                               name="timestep")
                self.prev_rewards = tf.placeholder(shape=[None],
                                                   dtype=tf.int32,
                                                   name="Prev_Rewards")
                self.prev_rewards_onehot = tf.one_hot(
                    self.prev_rewards,
                    2,
                    dtype=tf.float32,
                    name="Prev_Rewards_OneHot")
                self.prev_actions = tf.placeholder(shape=[None],
                                                   dtype=tf.int32,
                                                   name="Prev_Actions")
                self.prev_actions_onehot = tf.one_hot(
                    self.prev_actions,
                    FLAGS.nb_actions,
                    dtype=tf.float32,
                    name="Prev_Actions_OneHot")

                if FLAGS.one_hot_reward:
                    hidden = tf.concat([
                        self.elu, self.prev_rewards_onehot,
                        self.prev_actions_onehot
                    ],
                                       1,
                                       name="Concatenated_input")
                else:
                    hidden = tf.concat([
                        self.elu, self.prev_rewards, self.prev_actions_onehot,
                        self.timestep
                    ],
                                       1,
                                       name="Concatenated_input")
            else:
                hidden = self.elu

            summary_hidden_act = tf.contrib.layers.summarize_activation(hidden)

            rnn_in = tf.expand_dims(hidden, [0], name="RNN_input")
            step_size = tf.shape(self.inputs)[:1]

            if FLAGS.fw:
                rnn_cell = LayerNormFastWeightsBasicRNNCell(48)
                # self.initial_state = rnn_cell.zero_state(tf.shape(self.inputs)[0], tf.float32)
                # self.initial_fast_weights = rnn_cell.zero_fast_weights(tf.shape(self.inputs)[0], tf.float32)
                h_init = np.zeros((1, 48), np.float32)
                fw_init = np.zeros((1, 48, 48), np.float32)
                self.state_init = [h_init, fw_init]
                h_in = tf.placeholder(tf.float32, [1, 48], name="hidden_state")
                fw_in = tf.placeholder(tf.float32, [1, 48, 48],
                                       name="fast_weights")
                self.state_in = (h_in, fw_in)

                rnn_outputs, rnn_state = tf.nn.dynamic_rnn(
                    rnn_cell,
                    rnn_in,
                    initial_state=self.state_in,
                    sequence_length=step_size,
                    time_major=False)
                rnn_h, rnn_fw = rnn_state
                self.state_out = (rnn_h[:1, :], rnn_fw[:1, :])
                rnn_out = tf.reshape(rnn_outputs, [-1, 48], name="RNN_out")
            else:
                lstm_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(48)
                c_init = np.zeros((1, lstm_cell.state_size.c), np.float32)
                h_init = np.zeros((1, lstm_cell.state_size.h), np.float32)
                self.state_init = [c_init, h_init]
                c_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.c],
                                      name="c_in")
                h_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.h],
                                      name="h_in")
                self.state_in = (c_in, h_in)
                state_in = tf.contrib.rnn.LSTMStateTuple(c_in, h_in)

                lstm_outputs, lstm_state = tf.nn.dynamic_rnn(
                    lstm_cell,
                    rnn_in,
                    initial_state=state_in,
                    sequence_length=step_size,
                    time_major=False)

                lstm_c, lstm_h = lstm_state
                self.state_out = (lstm_c[:1, :], lstm_h[:1, :])
                rnn_out = tf.reshape(lstm_outputs, [-1, 48], name="RNN_out")

            summary_rnn_act = tf.contrib.layers.summarize_activation(rnn_out)

            fc_pol_w = tf.get_variable(
                "FC_Pol_W",
                shape=[48, FLAGS.nb_actions],
                initializer=normalized_columns_initializer(0.01))
            self.policy = tf.nn.softmax(tf.matmul(rnn_out,
                                                  fc_pol_w,
                                                  name="Policy"),
                                        name="Policy_soft")

            summary_policy_act = tf.contrib.layers.summarize_activation(
                self.policy)

            fc_value_w = tf.get_variable(
                "FC_Value_W",
                shape=[48, 1],
                initializer=normalized_columns_initializer(1.0))
            self.value = tf.matmul(rnn_out, fc_value_w, name="Value")

            summary_value_act = tf.contrib.layers.summarize_activation(
                self.value)

            if scope != 'global':
                self.actions = tf.placeholder(shape=[None],
                                              dtype=tf.int32,
                                              name="Actions")
                self.actions_onehot = tf.one_hot(self.actions,
                                                 FLAGS.nb_actions,
                                                 dtype=tf.float32,
                                                 name="Actions_Onehot")

                self.target_v = tf.placeholder(shape=[None], dtype=tf.float32)
                self.advantages = tf.placeholder(shape=[None],
                                                 dtype=tf.float32)

                self.responsible_outputs = tf.reduce_sum(
                    self.policy * self.actions_onehot, [1])

                # Loss functions
                self.value_loss = FLAGS.beta_v * tf.reduce_sum(
                    tf.square(self.target_v - tf.reshape(self.value, [-1])))
                self.entropy = -tf.reduce_sum(
                    self.policy * tf.log(self.policy + 1e-7))

                # starter_beta_e = 1.0
                # end_beta_e = 0.0
                # decay_steps = 20000
                # self.beta_e = tf.train.polynomial_decay(starter_beta_e, global_step,
                #                                         decay_steps, end_beta_e,
                #                                         power=0.5)

                self.policy_loss = -tf.reduce_sum(
                    tf.log(self.responsible_outputs + 1e-7) *
                    self.advantages) - self.entropy * FLAGS.beta_e

                self.loss = self.value_loss + self.policy_loss

                local_vars = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, scope)
                self.gradients = tf.gradients(self.loss, local_vars)
                self.var_norms = tf.global_norm(local_vars)
                grads, self.grad_norms = tf.clip_by_global_norm(
                    self.gradients, FLAGS.gradient_clip_value)

                self.worker_summaries = [
                    summary_conv_act, summary_hidden_act, summary_rnn_act,
                    summary_policy_act, summary_value_act
                ]
                for grad, weight in zip(grads, local_vars):
                    self.worker_summaries.append(
                        tf.summary.histogram(weight.name + '_grad', grad))
                    self.worker_summaries.append(
                        tf.summary.histogram(weight.name, weight))

                self.merged_summary = tf.summary.merge(self.worker_summaries)

                global_vars = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
                self.apply_grads = trainer.apply_gradients(
                    zip(grads, global_vars))
Example #11
    def __init__(self, scope, trainer, global_step=None):
        with tf.variable_scope(scope):
            self.prob_of_random_goal = tf.Variable(
                FLAGS.initial_random_goal_prob,
                trainable=False,
                name="prob_of_random_goal",
                dtype=tf.float32)
            self.inputs = tf.placeholder(shape=[
                None, FLAGS.resized_height, FLAGS.resized_width,
                FLAGS.agent_history_length
            ],
                                         dtype=tf.float32,
                                         name="Inputs")

            self.prev_rewards = tf.placeholder(shape=[None],
                                               dtype=tf.float32,
                                               name="Prev_Rewards")

            self.prev_rewards_onehot = tf.one_hot(tf.cast(self.prev_rewards,
                                                          dtype=tf.int32),
                                                  2,
                                                  dtype=tf.float32,
                                                  name="Prev_Rewards_OneHot")

            self.prev_rewards = tf.expand_dims(self.prev_rewards,
                                               1,
                                               name="rewards")

            # self.prev_rewards_onehot = tf.expand_dims(self.prev_rewards, 0)

            self.prev_actions = tf.placeholder(shape=[None],
                                               dtype=tf.int32,
                                               name="Prev_Actions")
            self.prev_actions_onehot = tf.one_hot(self.prev_actions,
                                                  FLAGS.nb_actions,
                                                  dtype=tf.float32,
                                                  name="Prev_Actions_OneHot")

            self.prev_goal = tf.placeholder(shape=[None, FLAGS.hidden_dim],
                                            dtype=tf.float32,
                                            name="Prev_Goals")

            self.image_summaries = []

            if FLAGS.game not in flags.SUPPORTED_ENVS:
                self.conv0 = tf.contrib.layers.conv2d(self.inputs,
                                                      16,
                                                      8,
                                                      4,
                                                      activation_fn=tf.nn.elu,
                                                      scope="conv0")
                with tf.variable_scope('conv0'):
                    tf.get_variable_scope().reuse_variables()
                    weights = tf.get_variable('weights')
                    grid = self.put_kernels_on_grid(weights)
                    self.image_summaries.append(
                        tf.summary.image('kernels', grid, max_outputs=1))
                self.conv = tf.contrib.layers.conv2d(self.conv0,
                                                     32,
                                                     4,
                                                     2,
                                                     activation_fn=tf.nn.elu,
                                                     scope="conv1")
            else:
                self.conv = tf.contrib.layers.conv2d(self.inputs,
                                                     32,
                                                     5,
                                                     2,
                                                     activation_fn=tf.nn.elu,
                                                     scope="conv1")
                with tf.variable_scope('conv1'):
                    tf.get_variable_scope().reuse_variables()
                    weights = tf.get_variable('weights')
                    grid = self.put_kernels_on_grid(weights)
                    self.image_summaries.append(
                        tf.summary.image('kernels', grid, max_outputs=1))

            with tf.variable_scope('inputs'):
                tf.get_variable_scope().reuse_variables()
                self.image_summaries.append(
                    tf.summary.image('input', self.inputs, max_outputs=100))

            self.conv_flat = tf.contrib.layers.flatten(self.conv)
            self.fc = tf.contrib.layers.fully_connected(
                self.conv_flat, FLAGS.hidden_dim)
            self.fc = tf.contrib.layers.layer_norm(self.fc)
            self.f_percept = tf.nn.elu(self.fc, name="Zt")

            if FLAGS.game not in flags.SUPPORTED_ENVS:
                self.f_percept = tf.concat([self.f_percept, self.prev_rewards],
                                           1,
                                           name="Zt_r")
            else:
                self.f_percept = tf.concat(
                    [self.f_percept, self.prev_rewards_onehot], 1, name="Zt_r")

            summary_f_percept_act = tf.contrib.layers.summarize_activation(
                self.f_percept)

            ############################################################################################################
            # Manager network

            if FLAGS.meta:
                self.f_Mspace = tf.concat([self.f_percept, self.prev_goal],
                                          1,
                                          name="Zt_r")
            else:
                self.f_Mspace = tf.identity(self.f_percept, name="Zt_r")

            self.f_Mspace = tf.contrib.layers.fully_connected(
                self.f_Mspace, FLAGS.hidden_dim)

            self.f_percept = tf.concat(
                [self.f_percept, self.prev_actions_onehot], 1, name="Zt_r")

            self.f_Mspace = tf.contrib.layers.layer_norm(self.f_Mspace)
            self.f_Mspace = tf.nn.elu(self.f_Mspace, name="St")
            summary_f_Mspace_act = tf.contrib.layers.summarize_activation(
                self.f_Mspace)

            m_rnn_in = tf.expand_dims(self.f_Mspace, [0], name="Mrnn_in")
            step_size = tf.shape(self.inputs)[:1]

            m_lstm_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(
                FLAGS.hidden_dim)
            m_c_init = np.zeros((1, FLAGS.hidden_dim * FLAGS.manager_horizon),
                                np.float32)
            m_h_init = np.zeros((1, FLAGS.hidden_dim * FLAGS.manager_horizon),
                                np.float32)
            self.m_state_init = [m_c_init, m_h_init]
            m_c_in = tf.placeholder(
                tf.float32, [1, FLAGS.hidden_dim * FLAGS.manager_horizon],
                name="Mrnn_c_in")
            m_h_in = tf.placeholder(
                tf.float32, [1, FLAGS.hidden_dim * FLAGS.manager_horizon],
                name="Mrnn_h_in")
            self.m_state_in = (m_c_in, m_h_in)
            m_state_in = tf.contrib.rnn.LSTMStateTuple(m_c_in, m_h_in)

            m_lstm_outputs, m_lstm_state = self.fast_dlstm(
                m_rnn_in, m_state_in, m_lstm_cell, FLAGS.manager_horizon,
                FLAGS.hidden_dim * FLAGS.manager_horizon)

            m_lstm_c, m_lstm_h = m_lstm_state
            self.m_state_out = (m_lstm_c[-1, :1, :], m_lstm_h[-1, :1, :])
            self.goals = tf.reshape(m_lstm_outputs, [-1, FLAGS.hidden_dim])
            # fully_connected takes a `scope` argument, not `name`
            self.normalized_goals = tf.contrib.layers.fully_connected(
                self.goals, FLAGS.hidden_dim, activation_fn=tf.tanh,
                scope="Gt")

            summary_goals = tf.contrib.layers.summarize_activation(
                self.normalized_goals)

            def randomize_goals(t):
                t = tf.cast(t, tf.int32)
                packed_tensors = tf.stack([
                    tf.random_normal([
                        FLAGS.hidden_dim,
                    ]), self.normalized_goals[t, :]
                ])

                to_update = tf.cond(
                    tf.less(
                        self.prob_of_random_goal,
                        tf.constant(FLAGS.final_random_goal_prob,
                                    dtype=tf.float32)),
                    lambda: tf.cast(
                        tf.multinomial(
                            tf.log([[
                                self.prob_of_random_goal,
                                tf.subtract(tf.constant(1.0),
                                            self.prob_of_random_goal)
                            ]]), 1)[0][0], tf.int32),
                    lambda: tf.constant(1, tf.int32))

                resulted_tensor = tf.gather(packed_tensors, to_update)

                return resulted_tensor

            self.randomized_goals = tf.map_fn(lambda t: randomize_goals(t),
                                              tf.to_float(
                                                  tf.range(0, step_size[0])),
                                              name="random_gt")

            summary_random_goals = tf.contrib.layers.summarize_activation(
                self.randomized_goals)

            self.decrease_prob_of_random_goal = tf.assign_sub(
                self.prob_of_random_goal,
                tf.constant(
                    (FLAGS.initial_random_goal_prob -
                     FLAGS.final_random_goal_prob) / FLAGS.explore_steps))

            m_fc_value_w = tf.get_variable(
                "M_Value_W",
                shape=[FLAGS.hidden_dim, 1],
                initializer=normalized_columns_initializer(1.0))
            # Manager value head, read from the raw (pre-tanh) RNN output.
            self.m_value = tf.matmul(self.goals, m_fc_value_w, name="M_Value")

            summary_m_value_act = tf.contrib.layers.summarize_activation(
                self.m_value)

            ############################################################################################################

            # Worker network
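            # The Worker LSTM consumes Zt (already concatenated with the
            # previous one-hot action) and outputs Ut, a per-step embedding of
            # shape [nb_actions, goal_embedding_size]. The summed recent goals,
            # fed through the sum_prev_goals placeholder, are linearly embedded
            # without a bias to goal_encoding; the policy is
            # softmax(Ut @ goal_encoding), and the Worker value is a linear
            # function of Ut_flat concatenated with goal_encoding.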

            self.sum_prev_goals = tf.placeholder(
                shape=[None, FLAGS.hidden_dim],
                dtype=tf.float32,
                name="Prev_c_Goals_sum")

            w_rnn_in = tf.expand_dims(self.f_percept, [0], name="Wrnn_in")
            step_size = tf.shape(self.inputs)[:1]
            w_lstm_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(
                FLAGS.goal_embedding_size * FLAGS.nb_actions)
            w_c_init = np.zeros((1, w_lstm_cell.state_size.c), np.float32)
            w_h_init = np.zeros((1, w_lstm_cell.state_size.h), np.float32)
            self.w_state_init = [w_c_init, w_h_init]
            w_c_in = tf.placeholder(tf.float32, [1, w_lstm_cell.state_size.c],
                                    name="Wrnn_c_in")
            w_h_in = tf.placeholder(tf.float32, [1, w_lstm_cell.state_size.h],
                                    name="Wrnn_h_in")
            self.w_state_in = (w_c_in, w_h_in)
            w_state_in = tf.contrib.rnn.LSTMStateTuple(w_c_in, w_h_in)

            w_lstm_outputs, w_lstm_state = tf.nn.dynamic_rnn(
                w_lstm_cell,
                w_rnn_in,
                initial_state=w_state_in,
                sequence_length=step_size,
                time_major=False)

            w_lstm_c, w_lstm_h = w_lstm_state
            self.w_state_out = (w_lstm_c[:1, :], w_lstm_h[:1, :])
            Ut = tf.reshape(
                w_lstm_outputs,
                [step_size[0], FLAGS.nb_actions, FLAGS.goal_embedding_size],
                name="Ut")
            Ut_flat = tf.reshape(
                w_lstm_outputs,
                [step_size[0], FLAGS.nb_actions * FLAGS.goal_embedding_size],
                name="Ut_flat")

            summary_wrnn_act = tf.contrib.layers.summarize_activation(Ut)

            goal_encoding = tf.contrib.layers.fully_connected(
                self.sum_prev_goals,
                FLAGS.goal_embedding_size,
                biases_initializer=None,
                scope="goal_emb")

            interm_rez = tf.squeeze(
                tf.matmul(Ut, tf.expand_dims(goal_encoding, 2)), 2)
            interm_rez = tf.contrib.layers.flatten(interm_rez)
            self.w_policy = tf.nn.softmax(interm_rez, name="W_Policy")

            summary_w_policy_act = tf.contrib.layers.summarize_activation(
                self.w_policy)

            w_fc_value_w = tf.get_variable(
                "W_Value_W",
                shape=[
                    FLAGS.nb_actions * FLAGS.goal_embedding_size +
                    FLAGS.goal_embedding_size, 1
                ],
                initializer=normalized_columns_initializer(1.0))
            self.w_value = tf.matmul(tf.concat([Ut_flat, goal_encoding], 1),
                                     w_fc_value_w,
                                     name="W_Value")

            summary_w_value_act = tf.contrib.layers.summarize_activation(
                self.w_value)

            if scope != 'global':
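                # Losses, gradients and summaries are only built for the local
                # (per-worker) copies; the gradients are applied to the shared
                # 'global' variables at the end of this block.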

                self.w_extrinsic_return = tf.placeholder(shape=[None],
                                                         dtype=tf.float32)
                self.m_extrinsic_return = tf.placeholder(shape=[None],
                                                         dtype=tf.float32)
                self.w_intrinsic_return = tf.placeholder(shape=[None],
                                                         dtype=tf.float32)

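                # For each step t, gather the goal-space state
                # FLAGS.manager_horizon steps ahead (clipped to the last step
                # of the rollout) to form the state difference
                # s_{t+c} - s_t used in the Manager's goal loss.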
                def gather_state_at_horiz(t):
                    t = tf.cast(t, tf.int32)
                    f_Mspace_c = tf.gather(
                        self.f_Mspace,
                        tf.minimum(
                            t +
                            tf.constant(FLAGS.manager_horizon, dtype=tf.int32),
                            step_size[0] - 1))
                    return f_Mspace_c

                self.f_Mspace_c = tf.cast(
                    tf.map_fn(lambda t: gather_state_at_horiz(t),
                              tf.to_float(tf.range(0, step_size[0])),
                              name="state_at_horiz"),
                    dtype=tf.float32)
                self.state_diff = self.f_Mspace_c - self.f_Mspace
                self.cos_sim_state_diff = self.cosine_distance(
                    tf.stop_gradient(self.state_diff),
                    self.normalized_goals,
                    dim=1)

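                # Manager loss: the extrinsic advantage weights the cosine
                # similarity between the (stop-gradient) state difference and
                # the emitted goals; the Manager value head is regressed
                # towards the extrinsic return, scaled by FLAGS.m_beta_v.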
                self.m_advantages = self.m_extrinsic_return - tf.stop_gradient(
                    tf.reshape(self.m_value, [-1]))
                self.goals_loss = -tf.reduce_sum(
                    self.m_advantages * self.cos_sim_state_diff)
                self.m_value_loss = FLAGS.m_beta_v * tf.reduce_sum(
                    tf.square(self.m_extrinsic_return -
                              tf.reshape(self.m_value, [-1])))

                self.actions = tf.placeholder(shape=[None],
                                              dtype=tf.int32,
                                              name="Actions")
                self.actions_onehot = tf.one_hot(self.actions,
                                                 FLAGS.nb_actions,
                                                 dtype=tf.float32,
                                                 name="Actions_Onehot")

                self.responsible_outputs = tf.reduce_sum(
                    self.w_policy * self.actions_onehot, [1])

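                # Worker return: the intrinsic return arrives via the
                # w_intrinsic_return placeholder, is scaled by FLAGS.alpha and
                # added to the extrinsic return; the Worker advantage is taken
                # against the stop-gradient Worker value.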
                self.intrinsic_return = FLAGS.alpha * self.w_intrinsic_return
                self.total_return = self.w_extrinsic_return + self.intrinsic_return
                self.w_advantages = self.total_return - tf.stop_gradient(
                    tf.reshape(self.w_value, [-1]))

                # Loss functions
                self.w_value_loss = FLAGS.w_beta_v * tf.reduce_sum(
                    tf.square(self.total_return -
                              tf.reshape(self.w_value, [-1])))
                self.entropy = -tf.reduce_sum(
                    self.w_policy * tf.log(self.w_policy + 1e-7))

                self.w_policy_loss = -tf.reduce_sum(
                    tf.log(self.responsible_outputs + 1e-7) *
                    self.w_advantages) - self.entropy * FLAGS.beta_e

                self.loss = self.w_value_loss + self.w_policy_loss + self.m_value_loss + self.goals_loss

                local_vars = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, scope)
                self.gradients = tf.gradients(self.loss, local_vars)
                self.var_norms = tf.global_norm(local_vars)
                grads, self.grad_norms = tf.clip_by_global_norm(
                    self.gradients, FLAGS.gradient_clip_value)

                self.worker_summaries = [
                    summary_f_percept_act, summary_f_Mspace_act, summary_goals,
                    summary_random_goals, summary_m_value_act,
                    summary_wrnn_act, summary_w_policy_act, summary_w_value_act
                ]
                for grad, weight in zip(grads, local_vars):
                    self.worker_summaries.append(
                        tf.summary.histogram(weight.name + '_grad', grad))
                    self.worker_summaries.append(
                        tf.summary.histogram(weight.name, weight))

                self.merged_summary = tf.summary.merge(self.worker_summaries)

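                # The clipped gradients computed on the local variables are
                # applied to the shared 'global' variables; zip pairs them by
                # position, so both collections must list variables in the
                # same order.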
                global_vars = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
                self.apply_grads = trainer.apply_gradients(
                    zip(grads, global_vars))