Example #1
    def build_action_chooser(self):
        chooser = AsyncActionChooser(self.hyperparams, self.q_model, self.sess,
                                     self.actions)
        # ops that keep the chooser's network parameters in sync with the
        # shared Q-model's network
        self.update_chooser_weights_ops += set_all_params_ops(
            get_all_params(chooser.net),
            get_all_params(self.q_model.get_net()))
        return chooser
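Every example on this page pairs get_all_params with set_all_params_ops to copy weights from one network into another, but neither helper is shown. Judging from Example #9, get_all_params returns a nested dict of the form {layer_name: {param_name: tf.Variable}}. Under that assumption (and with the argument order, destination then source, inferred from the target-network sync in Examples #3 to #6), a minimal sketch of set_all_params_ops could look like this:

    def set_all_params_ops(dest_params, src_params):
        # Build one assign op per parameter; running the returned list in a
        # session copies every source variable's value into the matching
        # destination variable.
        ops = []
        for layer_name in dest_params:
            for param_name in dest_params[layer_name]:
                ops.append(dest_params[layer_name][param_name].assign(
                    src_params[layer_name][param_name]))
        return ops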
Example #2
    def build_action_chooser(self):
        chooser = AsyncEdistActionChooser(self.hyperparams, self.q_model,
                                          self.sess, self.actions)
        # ops that keep the chooser's network parameters in sync with the
        # shared Q-model's network
        self.update_chooser_weights_ops += set_all_params_ops(
            get_all_params(chooser.net),
            get_all_params(self.q_model.get_net()))
        return chooser
Example #3
    def build(self):
        print('Building agent ...')
        self.tmax = self.hyperparams['tmax']
        with tf.device('/gpu:0'):
            self.q_model.setup_net()
        self.mask = tf.placeholder(shape=(None, len(self.actions)),
                                   dtype=tf.float32,
                                   name='mask')
        self.y = tf.placeholder(dtype=tf.float32,
                                shape=(None, ),
                                name='expected_y')
        self.loss.build(self.q_model.y_hat, self.y, self.mask)
        self.train_step = self.optim.get_train_step(self.loss.loss)
        self.sess = tf.Session()
        self.sess.run(tf.initialize_all_variables())
        # self.sess.run(tf.initialize_variables(flatten_params(
        #     get_all_params(self.q_model.net))))

        self.prepare_debug_vars()
        self.training = True
        self.experience_replay = SNDQNExperienceReplay(self.hyperparams,
                                                       self.dtype)
        self.experience_replay.build()

        self.update_target_weights_ops = set_all_params_ops(
            get_all_params(self.q_model.get_target_net()),
            get_all_params(self.q_model.get_net()))
        self.update_target_weights()
        self.update_chooser_weights_ops = []
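Example #3 builds update_target_weights_ops and then calls update_target_weights(), which is not shown on this page. A minimal sketch, assuming it simply runs the assign ops prepared above:

    def update_target_weights(self):
        # Running the assign ops copies the online network's current
        # weights into the target network.
        self.sess.run(self.update_target_weights_ops)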
Example #4
    def build(self):
        print('Building agent ...')
        self.q_model.setup_net()
        self.mask = tf.placeholder(shape=(None, len(self.actions)),
                                   dtype=tf.float32,
                                   name='mask')
        self.y = tf.placeholder(dtype=tf.float32,
                                shape=(None, ),
                                name='expected_y')
        self.loss.build(self.q_model.y_hat, self.y, self.mask)
        self.train_step = self.optim.get_train_step(self.loss.loss)
        self.sess = tf.Session()
        self.sess.run(tf.initialize_all_variables())
        # self.sess.run(tf.initialize_variables(flatten_params(
        #     get_all_params(self.q_model.net))))

        self.prepare_epsilon()
        self.prepare_debug_vars()
        self.training = True
        self.recent_train_q = deque(
            maxlen=self.hyperparams['num_recent_steps'])
        self.recent_eval_q = deque(maxlen=self.hyperparams['num_recent_steps'])
        self.experience_replay = ExperienceReplay(self.hyperparams, self.dtype)
        self.experience_replay.build()

        self.update_target_weights_ops = set_all_params_ops(
            get_all_params(self.q_model.get_target_net()),
            get_all_params(self.q_model.get_net()))
        self.update_target_weights()
Example #5
    def build(self):
        print('Building agent ...')
        self.tmax = self.hyperparams['tmax']
        with tf.device('/gpu:0'):
            self.q_model.setup_net()
        self.mask = tf.placeholder(shape=(None, len(self.actions)),
                                   dtype=tf.float32,
                                   name='mask')
        self.y = tf.placeholder(
            dtype=tf.float32,
            shape=(None,),
            name='expected_y')
        self.loss.build(self.q_model.y_hat, self.y, self.mask)
        self.train_step = self.optim.get_train_step(self.loss.loss)
        self.sess = tf.Session()
        self.sess.run(tf.initialize_all_variables())
        # self.sess.run(tf.initialize_variables(flatten_params(
        #     get_all_params(self.q_model.net))))

        self.prepare_debug_vars()
        self.training = True
        self.experience_replay = SNDQNExperienceReplay(self.hyperparams,
                                                       self.dtype)
        self.experience_replay.build()

        self.update_target_weights_ops = set_all_params_ops(
            get_all_params(self.q_model.get_target_net()),
            get_all_params(self.q_model.get_net()))
        self.update_target_weights()
        self.update_chooser_weights_ops = []
Example #6
    def build(self):
        print('Building agent ...')
        self.q_model.setup_net()
        self.mask = tf.placeholder(shape=(None, len(self.actions)),
                                   dtype=tf.float32,
                                   name='mask')
        self.y = tf.placeholder(
            dtype=tf.float32,
            shape=(None,),
            name='expected_y')
        self.loss.build(self.q_model.y_hat, self.y, self.mask)
        self.train_step = self.optim.get_train_step(self.loss.loss)
        self.sess = tf.Session()
        self.sess.run(tf.initialize_all_variables())
        # self.sess.run(tf.initialize_variables(flatten_params(
        #     get_all_params(self.q_model.net))))

        self.prepare_epsilon()
        self.prepare_debug_vars()
        self.training = True
        self.recent_train_q = deque(
            maxlen=self.hyperparams['num_recent_steps'])
        self.recent_eval_q = deque(
            maxlen=self.hyperparams['num_recent_steps'])
        self.experience_replay = ExperienceReplay(self.hyperparams,
                                                  self.dtype)
        self.experience_replay.build()

        self.update_target_weights_ops = set_all_params_ops(
            get_all_params(self.q_model.get_target_net()),
            get_all_params(self.q_model.get_net()))
        self.update_target_weights()
Example #7
    def build_action_chooser(self):
        chooser = AsyncActionChooser(self.hyperparams, self.ac_model,
                                     self.sess, self.actions)
        # ops that keep the chooser's policy parameters in sync with the
        # ac_model's policy network
        self.update_chooser_weights_ops += set_all_params_ops(
            get_all_params(chooser.policy),
            get_all_params(self.ac_model.policy))
        return chooser
Example #8
    def build_action_chooser(self, train=True):
        chooser = AsyncActionChooser(self.hyperparams, self.ac_model,
                                     self.sess, self.actions, train=train)
        # ops that keep the chooser's policy parameters in sync with the
        # ac_model's policy network
        self.update_chooser_weights_ops += set_all_params_ops(
            get_all_params(chooser.policy),
            get_all_params(self.ac_model.policy))
        return chooser
Example #9
    def _load_params(self, net, fnm, sess):
        dest_params = get_all_params(net)
        with open(fnm, 'r') as f:
            src_params = json.loads(f.read())
        # assign every saved value to the matching variable, keyed by
        # layer name and parameter name
        for layer_name in dest_params.keys():
            for param_name in dest_params[layer_name].keys():
                dest_param = dest_params[layer_name][param_name]
                src_param = src_params[layer_name][param_name]
                sess.run(dest_param.assign(src_param))
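The loader in Example #9 expects the JSON file to contain a nested dict keyed by layer name and then parameter name, with plain list values that assign() can consume. A hypothetical companion _save_params writing files in that layout (not part of the examples above) might look like:

    def _save_params(self, net, fnm, sess):
        # Hypothetical inverse of _load_params: evaluate every parameter and
        # dump it as nested lists so json can serialize it.
        src_params = get_all_params(net)
        out = {}
        for layer_name, layer_params in src_params.items():
            out[layer_name] = {}
            for param_name, param in layer_params.items():
                out[layer_name][param_name] = sess.run(param).tolist()
        with open(fnm, 'w') as f:
            f.write(json.dumps(out))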
Example #10
    def build(self):
        print('Building agent ...')
        self.action_model.setup_net()

        self.state_model.setup_net()
        self.state_y = tf.placeholder(
            dtype=tf.float32,
            shape=(None, np.prod(self.state_shape)),
            name='expected_state_y')
        self.state_loss.build(self.state_model.y_hat, self.state_y)
        state_params = flatten_params(get_all_params(
            self.state_model.get_net()))
        # print(state_params)
        # self.state_train_step = self.state_optim.get_train_step(
        #     self.state_loss.loss, state_params)

        self.reward_model.setup_net()
        self.reward_y = tf.placeholder(
            dtype=tf.float32,
            shape=(None,),
            name='expected_reward_y')
        self.reward_loss.build(self.reward_model.y_hat, self.reward_y)
        reward_params = flatten_params(get_all_params(
            self.reward_model.get_net()))
        # self.reward_train_step = self.reward_optim.get_train_step(
        #     self.reward_loss.loss, reward_params)

        self.value_model.setup_net()
        self.value_y = tf.placeholder(
            dtype=tf.float32,
            shape=(None,),
            name='expected_value_y')
        self.value_loss.build(self.value_model.y_hat, self.value_y)
        value_params = flatten_params(get_all_params(
            self.value_model.get_net()))
        # self.value_train_step = self.value_optim.get_train_step(
        #     self.value_loss.loss, value_params)

        partial_params = state_params + reward_params + value_params
        partial_loss = (self.state_loss.loss +
                        self.reward_loss.loss +
                        self.value_loss.loss)
        self.partial_train_step = self.state_optim.get_train_step(
            partial_loss, partial_params)

        reward_discount = self.hyperparams['reward_discount']
        batch_size = self.hyperparams['batch_size']
        self.seed_train_state = tf.placeholder(
            tf.float32,
            shape=(batch_size,) + self.state_shape,
            name='seed_train_state')

        # scale = self.hyperparams['action_train_scale']
        value_rollout_length = self.hyperparams['value_rollout_length']
        next_state = self.seed_train_state
        next_conv_state = tf.concat(3, [next_state] * value_rollout_length)
        total_reward = tf.zeros((batch_size,))
        for timestep in range(self.hyperparams['rollout_length']):
            state = next_state
            conv_state = next_conv_state

            action = get_output(self.action_model.get_net(),
                                {'state': tf.expand_dims(state, 1)},
                                timestep=True)

            # evil softmax to closer-to-one-hot magic
            # action_max = tf.reduce_max(action, reduction_indices=1)
            # action_max = tf.expand_dims(action_max, 1)

            # action_min = tf.reduce_min(action, reduction_indices=1)
            # action_min = tf.expand_dims(action_min, 1)

            # action = tf.pow((1 - (action_max - action) -
            #                  (1 - (action_max - action_min))) /
            #                 (action_max - action_min), scale)
            # print('action shape')
            # print(action.get_shape())

            next_state = get_output(self.state_model.get_net(),
                                    {'state': conv_state,
                                     'action': action})
            next_state = tf.reshape(next_state, (-1, *self.state_shape))
            next_conv_state = tf.concat(
                3, [next_conv_state[:, :, :, :value_rollout_length - 1],
                    next_state])

            reward = get_output(self.reward_model.net,
                                {'state': next_conv_state,
                                 'action': action})
            total_reward += reward_discount * tf.squeeze(reward, [1])
            value = get_output(self.value_model.get_net(),
                               {'state': next_conv_state})

        print('reward shape')
        print(reward.get_shape())
        print('value shape')
        print(value.get_shape())
        total_reward += reward_discount * tf.squeeze(value, [1])
        print('Total reward shape')
        print(total_reward.get_shape())
        self.exp_returns = tf.reduce_mean(total_reward)

        print('Flattening params ...')
        action_params = flatten_params(get_trainable_params(
            self.action_model.get_net()))
        print('Action params:')
        print(get_trainable_params(self.action_model.get_net()))
        self.action_train_step = self.action_optim.get_train_step(
            -self.exp_returns, action_params)

        self.action_preds = get_output(self.action_model.get_net(),
                                       None,
                                       timestep=True,
                                       input_hidden=True)
        # self.assign_hidden_ops = get_assign_hidden_ops(
        #     self.action_model.get_net())
        # self.zero_hidden_ops = get_assign_hidden_ops(
        #     self.action_model.get_net(),
        #     zero=True)
        # self.hidden_states = get_input_hidden_vars(
        #     self.action_model.get_net(),
        #     timestep=True)

        self.hidden_states = get_input_hidden_vars(
            self.action_model.get_net(),
            timestep=True)
        self.hidden_output_states = get_output_hidden_vars(
            self.action_model.get_net())
        self.hidden_state_vals = {}
        self.init_hidden = get_init_hidden(
            self.action_model.get_net())
        # for hidden_name, hidden_state in self.hidden_states.items():
        #     self.hidden_state_vals[hidden_state] = np.zeros(
        #         hidden_state.eval(session=self.sess).shape)
        #     self.hidden_state_vals[hidden_state] = None

        self.sess = tf.Session()
        self.sess.run(tf.initialize_all_variables())

        self.update_value_target_weights_ops = set_all_params_ops(
            get_all_params(self.value_model.get_target_net()),
            get_all_params(self.value_model.get_net()))
        self.update_value_target_weights()

        self.prepare_epsilon()
        self.training = True
        self.part_experiences = []

        self.experience_replay = RDRLMem(self.hyperparams)
        self.experience_replay.build()

        self.greedy_ind = None
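All of the examples target the pre-1.0 TensorFlow API. If you run them on a TensorFlow 1.x release, the two calls that changed are tf.initialize_all_variables(), replaced by tf.global_variables_initializer(), and tf.concat, whose argument order was swapped in 1.0:

    # TensorFlow 1.x replacements for the deprecated calls used above
    self.sess.run(tf.global_variables_initializer())    # was tf.initialize_all_variables()
    next_conv_state = tf.concat([next_state] * value_rollout_length,
                                axis=3)                  # was tf.concat(3, [...])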