def build(self):
    print('Building agent ...')
    self.tmax = self.hyperparams['tmax']
    with tf.device('/gpu:0'):
        # Build the Q-network graph, loss and train step on the GPU.
        self.q_model.setup_net()
        self.mask = tf.placeholder(shape=(None, len(self.actions)),
                                   dtype=tf.float32, name='mask')
        self.y = tf.placeholder(dtype=tf.float32, shape=(None,),
                                name='expected_y')
        self.loss.build(self.q_model.y_hat, self.y, self.mask)
        self.train_step = self.optim.get_train_step(self.loss.loss)
    self.sess = tf.Session()
    self.sess.run(tf.initialize_all_variables())
    # self.sess.run(tf.initialize_variables(flatten_params(
    #     get_all_params(self.q_model.net))))
    self.prepare_debug_vars()
    self.training = True
    self.experience_replay = SNDQNExperienceReplay(self.hyperparams,
                                                   self.dtype)
    self.experience_replay.build()
    # Ops that copy the online Q-net weights into the target net.
    self.update_target_weights_ops = set_all_params_ops(
        get_all_params(self.q_model.get_target_net()),
        get_all_params(self.q_model.get_net()))
    self.update_target_weights()
    self.update_chooser_weights_ops = []
def build_action_chooser(self):
    chooser = AsyncActionChooser(self.hyperparams, self.q_model,
                                 self.sess, self.actions)
    self.update_chooser_weights_ops += set_all_params_ops(
        get_all_params(chooser.net),
        get_all_params(self.q_model.get_net()))
    return chooser
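# NOTE: set_all_params_ops / get_all_params are repo helpers defined
# elsewhere. The function below is NOT the repo's implementation -- it is
# a minimal hedged sketch, assuming both networks expose their variables
# in matching order, of how such a weight-copy helper can be built with
# this era's TensorFlow API.
def _sketch_set_all_params_ops(dst_params, src_params):
    # One tf.assign op per (destination, source) variable pair; running
    # the returned ops in a session copies the online-net weights into
    # the target or chooser net.
    return [tf.assign(dst, src)
            for dst, src in zip(dst_params, src_params)]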
def build(self):
    print('Building agent ...')
    self.q_model.setup_net()
    self.mask = tf.placeholder(shape=(None, len(self.actions)),
                               dtype=tf.float32, name='mask')
    self.y = tf.placeholder(dtype=tf.float32, shape=(None,),
                            name='expected_y')
    self.loss.build(self.q_model.y_hat, self.y, self.mask)
    self.train_step = self.optim.get_train_step(self.loss.loss)
    self.sess = tf.Session()
    self.sess.run(tf.initialize_all_variables())
    # self.sess.run(tf.initialize_variables(flatten_params(
    #     get_all_params(self.q_model.net))))
    self.prepare_epsilon()
    self.prepare_debug_vars()
    self.training = True
    self.recent_train_q = deque(
        maxlen=self.hyperparams['num_recent_steps'])
    self.recent_eval_q = deque(
        maxlen=self.hyperparams['num_recent_steps'])
    self.experience_replay = ExperienceReplay(self.hyperparams, self.dtype)
    self.experience_replay.build()
    self.update_target_weights_ops = set_all_params_ops(
        get_all_params(self.q_model.get_target_net()),
        get_all_params(self.q_model.get_net()))
    self.update_target_weights()
def build_action_chooser(self):
    chooser = AsyncEdistActionChooser(self.hyperparams, self.q_model,
                                      self.sess, self.actions)
    self.update_chooser_weights_ops += set_all_params_ops(
        get_all_params(chooser.net),
        get_all_params(self.q_model.get_net()))
    return chooser
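# NOTE: ExperienceReplay / SNDQNExperienceReplay are defined elsewhere in
# the repo; the class below is only a hedged sketch of a uniform replay
# buffer with a similar build/add/sample shape. The hyperparam keys
# 'replay_capacity' and 'batch_size' are assumptions for the
# illustration, not necessarily the repo's actual keys.
import random  # stdlib, used only by the sketch below


class _SketchExperienceReplay(object):
    def __init__(self, hyperparams):
        self.hyperparams = hyperparams

    def build(self):
        # Bounded FIFO buffer of (state, action, reward, next_state, done).
        self.buffer = deque(maxlen=self.hyperparams['replay_capacity'])

    def add(self, experience):
        self.buffer.append(experience)

    def sample(self):
        # Uniformly sample a training batch (smaller if the buffer is
        # not yet full).
        batch_size = self.hyperparams['batch_size']
        return random.sample(self.buffer, min(batch_size, len(self.buffer)))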
def build_action_chooser(self):
    chooser = AsyncActionChooser(self.hyperparams, self.ac_model,
                                 self.sess, self.actions)
    self.update_chooser_weights_ops += set_all_params_ops(
        get_all_params(chooser.policy),
        get_all_params(self.ac_model.policy))
    return chooser
def build_action_chooser(self, train=True):
    chooser = AsyncActionChooser(self.hyperparams, self.ac_model,
                                 self.sess, self.actions, train=train)
    self.update_chooser_weights_ops += set_all_params_ops(
        get_all_params(chooser.policy),
        get_all_params(self.ac_model.policy))
    return chooser
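# NOTE: hedged usage sketch, not code from the repo. Each async worker
# gets its own chooser via build_action_chooser(), and every call appends
# its copy ops to update_chooser_weights_ops; running the accumulated ops
# pushes the shared model's current weights into every worker's policy
# copy. The method name below is an assumption for illustration.
def _sketch_update_chooser_weights(self):
    # Copy shared-model parameters into all registered choosers at once.
    self.sess.run(self.update_chooser_weights_ops)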
def build(self):
    print('Building agent ...')
    self.action_model.setup_net()

    # Learned environment model: predicts the next state from the
    # current stacked state and the chosen action.
    self.state_model.setup_net()
    self.state_y = tf.placeholder(
        dtype=tf.float32,
        shape=(None, np.prod(self.state_shape)),
        name='expected_state_y')
    self.state_loss.build(self.state_model.y_hat, self.state_y)
    state_params = flatten_params(get_all_params(
        self.state_model.get_net()))
    # print(state_params)
    # self.state_train_step = self.state_optim.get_train_step(
    #     self.state_loss.loss, state_params)

    # Learned reward model.
    self.reward_model.setup_net()
    self.reward_y = tf.placeholder(
        dtype=tf.float32, shape=(None,), name='expected_reward_y')
    self.reward_loss.build(self.reward_model.y_hat, self.reward_y)
    reward_params = flatten_params(get_all_params(
        self.reward_model.get_net()))
    # self.reward_train_step = self.reward_optim.get_train_step(
    #     self.reward_loss.loss, reward_params)

    # Learned value model (with a separate target net, synced below).
    self.value_model.setup_net()
    self.value_y = tf.placeholder(
        dtype=tf.float32, shape=(None,), name='expected_value_y')
    self.value_loss.build(self.value_model.y_hat, self.value_y)
    value_params = flatten_params(get_all_params(
        self.value_model.get_net()))
    # self.value_train_step = self.value_optim.get_train_step(
    #     self.value_loss.loss, value_params)

    # Fit the state, reward and value models jointly on one combined loss.
    partial_params = state_params + reward_params + value_params
    partial_loss = (self.state_loss.loss + self.reward_loss.loss +
                    self.value_loss.loss)
    self.partial_train_step = self.state_optim.get_train_step(
        partial_loss, partial_params)

    reward_discount = self.hyperparams['reward_discount']
    batch_size = self.hyperparams['batch_size']
    self.seed_train_state = tf.placeholder(
        tf.float32,
        shape=(batch_size,) + self.state_shape,
        name='seed_train_state')
    # scale = self.hyperparams['action_train_scale']
    value_rollout_length = self.hyperparams['value_rollout_length']

    # Differentiable rollout through the learned models: starting from
    # the seed states, repeatedly pick an action, predict the next state
    # and reward, and accumulate the discounted return.
    next_state = self.seed_train_state
    next_conv_state = tf.concat(3, [next_state] * value_rollout_length)
    total_reward = tf.zeros((batch_size,))
    for timestep in range(self.hyperparams['rollout_length']):
        state = next_state
        conv_state = next_conv_state
        action = get_output(self.action_model.get_net(),
                            {'state': tf.expand_dims(state, 1)},
                            timestep=True)
        # evil softmax to closer-to-one-hot magic
        # action_max = tf.reduce_max(action, reduction_indices=1)
        # action_max = tf.expand_dims(action_max, 1)
        # action_min = tf.reduce_min(action, reduction_indices=1)
        # action_min = tf.expand_dims(action_min, 1)
        # action = tf.pow((1 - (action_max - action) -
        #                  (1 - (action_max - action_min))) /
        #                 (action_max - action_min), scale)
        # print('action shape')
        # print(action.get_shape())
        next_state = get_output(self.state_model.get_net(),
                                {'state': conv_state, 'action': action})
        next_state = tf.reshape(next_state, (-1, *self.state_shape))
        next_conv_state = tf.concat(
            3, [next_conv_state[:, :, :, :value_rollout_length - 1],
                next_state])
        reward = get_output(self.reward_model.net,
                            {'state': next_conv_state, 'action': action})
        total_reward += reward_discount * tf.squeeze(reward, [1])

    # Bootstrap the tail of the rollout with the learned value model.
    value = get_output(self.value_model.get_net(),
                       {'state': next_conv_state})
    print('reward shape')
    print(reward.get_shape())
    print('value shape')
    print(value.get_shape())
    total_reward += reward_discount * tf.squeeze(value, [1])
    print('Total reward shape')
    print(total_reward.get_shape())
    self.exp_returns = tf.reduce_mean(total_reward)

    # Train only the action model, maximising the expected return of the
    # imagined rollout (minimising its negation).
    print('Flattening params ...')
    action_params = flatten_params(get_trainable_params(
        self.action_model.get_net()))
    print('Action params:')
    print(get_trainable_params(self.action_model.get_net()))
    self.action_train_step = self.action_optim.get_train_step(
        -self.exp_returns, action_params)
    # Single-timestep action predictions for acting, with the recurrent
    # hidden state fed in explicitly.
    self.action_preds = get_output(self.action_model.get_net(),
                                   None,
                                   timestep=True,
                                   input_hidden=True)
    # self.assign_hidden_ops = get_assign_hidden_ops(
    #     self.action_model.get_net())
    # self.zero_hidden_ops = get_assign_hidden_ops(
    #     self.action_model.get_net(), zero=True)
    # self.hidden_states = get_input_hidden_vars(
    #     self.action_model.get_net(), timestep=True)
    self.hidden_states = get_input_hidden_vars(
        self.action_model.get_net(), timestep=True)
    self.hidden_output_states = get_output_hidden_vars(
        self.action_model.get_net())
    self.hidden_state_vals = {}
    self.init_hidden = get_init_hidden(self.action_model.get_net())
    # for hidden_name, hidden_state in self.hidden_states.items():
    #     self.hidden_state_vals[hidden_state] = np.zeros(
    #         hidden_state.eval(session=self.sess).shape)
    #     self.hidden_state_vals[hidden_state] = None

    self.sess = tf.Session()
    self.sess.run(tf.initialize_all_variables())

    # Ops that copy the online value-net weights into its target net.
    self.update_value_target_weights_ops = set_all_params_ops(
        get_all_params(self.value_model.get_target_net()),
        get_all_params(self.value_model.get_net()))
    self.update_value_target_weights()

    self.prepare_epsilon()
    self.training = True
    self.part_experiences = []
    self.experience_replay = RDRLMem(self.hyperparams)
    self.experience_replay.build()
    self.greedy_ind = None
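# NOTE: hedged usage sketch, not code from the repo. Training the
# model-based agent built above would typically alternate two session
# runs: one partial_train_step to fit the state/reward/value models on
# real transitions, and one action_train_step that maximises the expected
# discounted return of the imagined rollout from a batch of seed states.
# The method name and the model_feed argument (a feed_dict for the
# sub-models' own placeholders, including state_y, reward_y and value_y)
# are assumptions for illustration.
def _sketch_train_on_batch(self, seed_states, model_feed):
    # 1) Fit the learned environment model (state + reward + value).
    self.sess.run(self.partial_train_step, feed_dict=model_feed)
    # 2) Improve the action model by ascending the expected return of the
    #    differentiable rollout seeded with real states.
    self.sess.run(self.action_train_step,
                  feed_dict={self.seed_train_state: seed_states})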