Example #1
 def __init__(self, hyperparams, policy, sess, actions):
     self.hyperparams = hyperparams
     self.sess = sess
     with tf.device('/cpu:0'):
         self.policy = policy.create_policy(trainable=False)
     self.sess.run(tf.initialize_all_variables())
     self.input_vars = get_input_vars(self.policy)
     self.y_hat = get_output(self.policy)
     self.experience_cache = []
     self.episode_cache = []
     self.prepare_epsilon()
     self.training = True
     self.actions = actions
     self.greedy_ind = None
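The constructor above wraps a frozen (non-trainable) copy of the policy network on the CPU, initializes the graph variables, and caches the network's input placeholders and output tensor. The prepare_epsilon, actions, and greedy_ind fields suggest epsilon-greedy action selection; the following is a minimal, self-contained sketch of that pattern with illustrative names, not this project's implementation:

    import numpy as np

    def choose_action(q_values, epsilon, rng=np.random):
        # With probability epsilon explore with a random action index;
        # otherwise act greedily on the predicted action values.
        if rng.rand() < epsilon:
            return rng.randint(len(q_values))
        return int(np.argmax(q_values))

Here q_values would presumably come from evaluating self.y_hat in self.sess for the current state.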
Example #2
 def __init__(self, hyperparams, q_model, sess, actions):
     self.hyperparams = hyperparams
     self.sess = sess
     with tf.device('/cpu:0'):
         self.net = q_model.create_net(trainable=False)
     self.sess.run(tf.initialize_all_variables())
     self.input_vars = get_input_vars(self.net)
     self.y_hat = get_output(self.net)
     self.experience_cache = []
     self.recent_train_q = deque(
         maxlen=self.hyperparams['num_recent_steps'])
     self.recent_eval_q = deque(maxlen=self.hyperparams['num_recent_steps'])
     self.prepare_epsilon()
     self.training = True
     self.actions = actions
     self.greedy_ind = None
Example #3
 def __init__(self, hyperparams, q_model, sess, actions):
     self.hyperparams = hyperparams
     self.sess = sess
     with tf.device('/cpu:0'):
         self.net = q_model.create_net(trainable=False)
     self.sess.run(tf.initialize_all_variables())
     self.input_vars = get_input_vars(self.net)
     self.y_hat = get_output(self.net)
     self.experience_cache = []
     self.recent_train_q = deque(
         maxlen=self.hyperparams['num_recent_steps'])
     self.recent_eval_q = deque(
         maxlen=self.hyperparams['num_recent_steps'])
     self.prepare_epsilon()
     self.training = True
     self.actions = actions
     self.greedy_ind = None
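recent_train_q and recent_eval_q are fixed-size windows: a collections.deque created with maxlen silently evicts its oldest entry once the limit is reached, so each deque holds only the last num_recent_steps Q-values. A small standalone illustration with made-up values, not taken from this codebase:

    from collections import deque

    import numpy as np

    recent_train_q = deque(maxlen=3)     # keep only the 3 most recent values
    for q in [0.1, 0.5, 0.2, 0.9]:
        recent_train_q.append(q)         # the fourth append evicts 0.1
    print(list(recent_train_q))          # [0.5, 0.2, 0.9]
    print(np.mean(recent_train_q))       # moving average over the window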
Example #4
    def build(self):
        print('Building agent ...')
        self.action_model.setup_net()

        self.state_model.setup_net()
        self.state_y = tf.placeholder(
            dtype=tf.float32,
            shape=(None, np.prod(self.state_shape)),
            name='expected_state_y')
        self.state_loss.build(self.state_model.y_hat, self.state_y)
        state_params = flatten_params(get_all_params(
            self.state_model.get_net()))
        # print(state_params)
        # self.state_train_step = self.state_optim.get_train_step(
        #     self.state_loss.loss, state_params)

        self.reward_model.setup_net()
        self.reward_y = tf.placeholder(
            dtype=tf.float32,
            shape=(None,),
            name='expected_reward_y')
        self.reward_loss.build(self.reward_model.y_hat, self.reward_y)
        reward_params = flatten_params(get_all_params(
            self.reward_model.get_net()))
        # self.reward_train_step = self.reward_optim.get_train_step(
        #     self.reward_loss.loss, reward_params)

        self.value_model.setup_net()
        self.value_y = tf.placeholder(
            dtype=tf.float32,
            shape=(None,),
            name='expected_value_y')
        self.value_loss.build(self.value_model.y_hat, self.value_y)
        value_params = flatten_params(get_all_params(
            self.value_model.get_net()))
        # self.value_train_step = self.value_optim.get_train_step(
        #     self.value_loss.loss, value_params)

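        # The three model losses above are summed into one objective and
        # optimized jointly over the combined parameter list; the per-model
        # train steps are left commented out.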
        partial_params = state_params + reward_params + value_params
        partial_loss = (self.state_loss.loss +
                        self.reward_loss.loss +
                        self.value_loss.loss)
        self.partial_train_step = self.state_optim.get_train_step(
            partial_loss, partial_params)

        reward_discount = self.hyperparams['reward_discount']
        batch_size = self.hyperparams['batch_size']
        self.seed_train_state = tf.placeholder(
            tf.float32,
            shape=(batch_size,) + self.state_shape,
            name='seed_train_state')

        # scale = self.hyperparams['action_train_scale']
        value_rollout_length = self.hyperparams['value_rollout_length']
        next_state = self.seed_train_state
        next_conv_state = tf.concat(3, [next_state] * value_rollout_length)
        total_reward = tf.zeros((batch_size,))
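        # Unroll the learned state/reward/value models for rollout_length
        # steps, accumulating a differentiable return estimate that is later
        # negated and used to train the action network.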
        for timestep in range(self.hyperparams['rollout_length']):
            state = next_state
            conv_state = next_conv_state

            action = get_output(self.action_model.get_net(),
                                {'state': tf.expand_dims(state, 1)},
                                timestep=True)

            # evil softmax to closer-to-one-hot magic
            # action_max = tf.reduce_max(action, reduction_indices=1)
            # action_max = tf.expand_dims(action_max, 1)

            # action_min = tf.reduce_min(action, reduction_indices=1)
            # action_min = tf.expand_dims(action_min, 1)

            # action = tf.pow((1 - (action_max - action) -
            #                  (1 - (action_max - action_min))) /
            #                 (action_max - action_min), scale)
            # print('action shape')
            # print(action.get_shape())

            next_state = get_output(self.state_model.get_net(),
                                    {'state': conv_state,
                                     'action': action})
            next_state = tf.reshape(next_state, (-1, *self.state_shape))
            next_conv_state = tf.concat(
                3, [next_conv_state[:, :, :, :value_rollout_length - 1],
                    next_state])

            reward = get_output(self.reward_model.net,
                                {'state': next_conv_state,
                                 'action': action})
            total_reward += reward_discount * tf.squeeze(reward, [1])
            value = get_output(self.value_model.get_net(),
                               {'state': next_conv_state})

        print('reward shape')
        print(reward.get_shape())
        print('value shape')
        print(value.get_shape())
        total_reward += reward_discount * tf.squeeze(value, [1])
        print('Total reward shape')
        print(total_reward.get_shape())
        self.exp_returns = tf.reduce_mean(total_reward)

        print('Flattening params ...')
        action_params = flatten_params(get_trainable_params(
            self.action_model.get_net()))
        print('Action params:')
        print(get_trainable_params(self.action_model.get_net()))
        self.action_train_step = self.action_optim.get_train_step(
            -self.exp_returns, action_params)

        self.action_preds = get_output(self.action_model.get_net(),
                                       None,
                                       timestep=True,
                                       input_hidden=True)
        # self.assign_hidden_ops = get_assign_hidden_ops(
        #     self.action_model.get_net())
        # self.zero_hidden_ops = get_assign_hidden_ops(
        #     self.action_model.get_net(),
        #     zero=True)
        # self.hidden_states = get_input_hidden_vars(
        #     self.action_model.get_net(),
        #     timestep=True)

        self.hidden_states = get_input_hidden_vars(
            self.action_model.get_net(),
            timestep=True)
        self.hidden_output_states = get_output_hidden_vars(
            self.action_model.get_net())
        self.hidden_state_vals = {}
        self.init_hidden = get_init_hidden(
            self.action_model.get_net())
        # for hidden_name, hidden_state in self.hidden_states.items():
        #     self.hidden_state_vals[hidden_state] = np.zeros(
        #         hidden_state.eval(session=self.sess).shape)
        #     self.hidden_state_vals[hidden_state] = None

        self.sess = tf.Session()
        self.sess.run(tf.initialize_all_variables())

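        # Sync the value target network's weights from the current value
        # network (update_value_target_weights presumably runs these ops).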
        self.update_value_target_weights_ops = set_all_params_ops(
            get_all_params(self.value_model.get_target_net()),
            get_all_params(self.value_model.get_net()))
        self.update_value_target_weights()

        self.prepare_epsilon()
        self.training = True
        self.part_experiences = []

        self.experience_replay = RDRLMem(self.hyperparams)
        self.experience_replay.build()

        self.greedy_ind = None
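Note how the rollout loop above discounts: every per-step reward and the final bootstrap value are scaled by the single reward_discount factor, not by reward_discount ** t. A numpy sketch that mirrors the accumulation exactly, with illustrative names:

    import numpy as np

    def rollout_return(rewards, final_value, reward_discount):
        # rewards: (rollout_length, batch_size); final_value: (batch_size,)
        rewards = np.asarray(rewards, dtype=float)
        final_value = np.asarray(final_value, dtype=float)
        total_reward = np.zeros_like(final_value)
        for reward in rewards:                   # one row per rollout step
            total_reward += reward_discount * reward
        total_reward += reward_discount * final_value
        return total_reward.mean()               # analogue of exp_returns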
Example #5
 def output_fn(self, state):
     return get_output(self.net, {get_input_name(self.net): state})
Example #6
 def setup_net(self):
     self.build_net()
     self.input_vars = get_input_vars(self.get_net())
     self.y_hat = get_output(self.get_net())
Example #7
 def setup_net(self):
     self.build_net()
     self.y_hat = get_output(self.get_net())
     self.target_y_hat = get_output(self.get_target_net())
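get_target_net points at the usual frozen target copy of the online network, refreshed periodically (the set_all_params_ops / update_value_target_weights pair in the build() method above plays that role for the value model). A minimal TF 1.x-style sketch of such a sync op, offered as an assumption about what those helpers do rather than their actual implementation:

    import tensorflow as tf

    def make_target_sync_op(online_vars, target_vars):
        # Assign each online variable's current value to the matching target
        # variable, grouped into one op that can be run periodically.
        assigns = [tf.assign(target, online)
                   for online, target in zip(online_vars, target_vars)]
        return tf.group(*assigns)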
Example #8
 def setup_net(self):
     self.build_net()
     self.policy_y_hat = get_output(self.policy)
     self.value_y_hat = get_output(self.value)
     self.policy_input_vars = get_input_vars(self.policy)
     self.value_input_vars = get_input_vars(self.value)
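The separate policy and value heads here follow the usual actor-critic split: the value output estimates expected return, and the policy update is typically weighted by the advantage (return minus value estimate). A tiny numpy sketch of that relationship, a standard pattern rather than this project's loss code:

    import numpy as np

    def actor_critic_targets(returns, value_preds):
        # The value head regresses toward the observed returns; the policy
        # head's update is weighted by how much better the outcome was than
        # the critic predicted (the advantage).
        returns = np.asarray(returns, dtype=float)
        value_preds = np.asarray(value_preds, dtype=float)
        advantages = returns - value_preds
        return advantages, returns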