def __init__(self, hyperparams, policy, sess, actions):
    self.hyperparams = hyperparams
    self.sess = sess
    with tf.device('/cpu:0'):
        self.policy = policy.create_policy(trainable=False)
    self.sess.run(tf.initialize_all_variables())
    self.input_vars = get_input_vars(self.policy)
    self.y_hat = get_output(self.policy)
    self.experience_cache = []
    self.episode_cache = []
    self.prepare_epsilon()
    self.training = True
    self.actions = actions
    self.greedy_ind = None

def __init__(self, hyperparams, q_model, sess, actions):
    self.hyperparams = hyperparams
    self.sess = sess
    with tf.device('/cpu:0'):
        self.net = q_model.create_net(trainable=False)
    self.sess.run(tf.initialize_all_variables())
    self.input_vars = get_input_vars(self.net)
    self.y_hat = get_output(self.net)
    self.experience_cache = []
    self.recent_train_q = deque(
        maxlen=self.hyperparams['num_recent_steps'])
    self.recent_eval_q = deque(
        maxlen=self.hyperparams['num_recent_steps'])
    self.prepare_epsilon()
    self.training = True
    self.actions = actions
    self.greedy_ind = None
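
# Minimal usage sketch, not part of the source: one plausible epsilon-greedy
# action selection for an agent built by the constructor above. The method
# name `get_action`, the `self.epsilon` attribute (presumably set up by
# prepare_epsilon()), the 'state' feed key and a module-level
# `import numpy as np` are assumptions; self.sess, self.input_vars,
# self.y_hat, self.actions and self.greedy_ind come from the original code.
def get_action(self, state):
    if self.training and np.random.rand() < self.epsilon:
        # Explore: no greedy index is recorded for a random action.
        self.greedy_ind = None
        return self.actions[np.random.randint(len(self.actions))]
    # Exploit: evaluate the Q-network and pick the highest-valued action.
    q_values = self.sess.run(
        self.y_hat, feed_dict={self.input_vars['state']: state})
    self.greedy_ind = int(np.argmax(q_values[0]))
    return self.actions[self.greedy_ind]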

def build(self):
    print('Building agent ...')
    self.action_model.setup_net()

    # Supervised heads: the state, reward and value models each get a
    # target placeholder and a loss.
    self.state_model.setup_net()
    self.state_y = tf.placeholder(
        dtype=tf.float32,
        shape=(None, np.prod(self.state_shape)),
        name='expected_state_y')
    self.state_loss.build(self.state_model.y_hat, self.state_y)
    state_params = flatten_params(get_all_params(
        self.state_model.get_net()))
    # print(state_params)
    # self.state_train_step = self.state_optim.get_train_step(
    #     self.state_loss.loss, state_params)

    self.reward_model.setup_net()
    self.reward_y = tf.placeholder(
        dtype=tf.float32, shape=(None,), name='expected_reward_y')
    self.reward_loss.build(self.reward_model.y_hat, self.reward_y)
    reward_params = flatten_params(get_all_params(
        self.reward_model.get_net()))
    # self.reward_train_step = self.reward_optim.get_train_step(
    #     self.reward_loss.loss, reward_params)

    self.value_model.setup_net()
    self.value_y = tf.placeholder(
        dtype=tf.float32, shape=(None,), name='expected_value_y')
    self.value_loss.build(self.value_model.y_hat, self.value_y)
    value_params = flatten_params(get_all_params(
        self.value_model.get_net()))
    # self.value_train_step = self.value_optim.get_train_step(
    #     self.value_loss.loss, value_params)

    # The three models are trained jointly on the sum of their losses.
    partial_params = state_params + reward_params + value_params
    partial_loss = (self.state_loss.loss + self.reward_loss.loss +
                    self.value_loss.loss)
    self.partial_train_step = self.state_optim.get_train_step(
        partial_loss, partial_params)

    reward_discount = self.hyperparams['reward_discount']
    batch_size = self.hyperparams['batch_size']
    self.seed_train_state = tf.placeholder(
        tf.float32,
        shape=(batch_size,) + self.state_shape,
        name='seed_train_state')
    # scale = self.hyperparams['action_train_scale']
    value_rollout_length = self.hyperparams['value_rollout_length']

    # Unroll the learned models from the seed state and accumulate the
    # discounted reward along the imagined trajectory.
    next_state = self.seed_train_state
    next_conv_state = tf.concat(3, [next_state] * value_rollout_length)
    total_reward = tf.zeros((batch_size,))
    for timestep in range(self.hyperparams['rollout_length']):
        state = next_state
        conv_state = next_conv_state
        action = get_output(self.action_model.get_net(),
                            {'state': tf.expand_dims(state, 1)},
                            timestep=True)
        # evil softmax to closer-to-one-hot magic
        # action_max = tf.reduce_max(action, reduction_indices=1)
        # action_max = tf.expand_dims(action_max, 1)
        # action_min = tf.reduce_min(action, reduction_indices=1)
        # action_min = tf.expand_dims(action_min, 1)
        # action = tf.pow((1 - (action_max - action) -
        #                  (1 - (action_max - action_min))) /
        #                 (action_max - action_min), scale)
        # print('action shape')
        # print(action.get_shape())
        next_state = get_output(self.state_model.get_net(),
                                {'state': conv_state, 'action': action})
        next_state = tf.reshape(next_state, (-1, *self.state_shape))
        next_conv_state = tf.concat(
            3, [next_conv_state[:, :, :, :value_rollout_length - 1],
                next_state])
        reward = get_output(self.reward_model.net,
                            {'state': next_conv_state, 'action': action})
        total_reward += reward_discount * tf.squeeze(reward, [1])

    # Bootstrap the tail of the rollout with the value estimate of the
    # final stacked state.
    value = get_output(self.value_model.get_net(),
                       {'state': next_conv_state})
    print('reward shape')
    print(reward.get_shape())
    print('value shape')
    print(value.get_shape())
    total_reward += reward_discount * tf.squeeze(value, [1])
    print('Total reward shape')
    print(total_reward.get_shape())

    # The action model ascends the expected return (descends its negation).
    self.exp_returns = tf.reduce_mean(total_reward)
    print('Flattening params ...')
    action_params = flatten_params(get_trainable_params(
        self.action_model.get_net()))
    print('Action params:')
    print(get_trainable_params(self.action_model.get_net()))
    self.action_train_step = self.action_optim.get_train_step(
        -self.exp_returns, action_params)
    self.action_preds = get_output(self.action_model.get_net(),
                                   None,
                                   timestep=True,
                                   input_hidden=True)
    # self.assign_hidden_ops = get_assign_hidden_ops(
    #     self.action_model.get_net())
    # self.zero_hidden_ops = get_assign_hidden_ops(
    #     self.action_model.get_net(),
    #     zero=True)
    # self.hidden_states = get_input_hidden_vars(
    #     self.action_model.get_net(),
    #     timestep=True)
    self.hidden_states = get_input_hidden_vars(
        self.action_model.get_net(), timestep=True)
    self.hidden_output_states = get_output_hidden_vars(
        self.action_model.get_net())
    self.hidden_state_vals = {}
    self.init_hidden = get_init_hidden(
        self.action_model.get_net())
    # for hidden_name, hidden_state in self.hidden_states.items():
    #     self.hidden_state_vals[hidden_state] = np.zeros(
    #         hidden_state.eval(session=self.sess).shape)
    #     self.hidden_state_vals[hidden_state] = None

    # Create the session and initialize all graph variables.
    self.sess = tf.Session()
    self.sess.run(tf.initialize_all_variables())

    # Ops that copy the online value-net parameters into its target net.
    self.update_value_target_weights_ops = set_all_params_ops(
        get_all_params(self.value_model.get_target_net()),
        get_all_params(self.value_model.get_net()))
    self.update_value_target_weights()

    self.prepare_epsilon()
    self.training = True
    self.part_experiences = []
    self.experience_replay = RDRLMem(self.hyperparams)
    self.experience_replay.build()
    self.greedy_ind = None
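
# Minimal usage sketch, not part of the source: how the ops created in
# build() might be driven for one training iteration. The method name
# `train_batch`, the batch dictionary keys and the `model_feed` helper that
# supplies the state/reward/value model inputs are assumptions; the
# placeholders and train steps (state_y, reward_y, value_y, seed_train_state,
# partial_train_step, action_train_step) come from build().
def train_batch(self, batch):
    # Supervised update of the state, reward and value models on targets
    # sampled from experience replay.
    feed = self.model_feed(batch)  # hypothetical helper: model inputs -> data
    feed[self.state_y] = batch['next_states']
    feed[self.reward_y] = batch['rewards']
    feed[self.value_y] = batch['returns']
    self.sess.run(self.partial_train_step, feed_dict=feed)

    # Policy update: maximize the expected discounted return of a model
    # rollout that starts from the sampled seed states.
    self.sess.run(self.action_train_step,
                  feed_dict={self.seed_train_state: batch['seed_states']})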

def output_fn(self, state):
    return get_output(self.net, {get_input_name(self.net): state})
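# Usage note (an assumption, not confirmed by the source): get_output with an
# input substitution appears to be symbolic, so output_fn(state) returns a
# tensor that still has to be evaluated in a session, e.g.:
#
#     q_tensor = agent.output_fn(state_placeholder)
#     q_values = sess.run(q_tensor, feed_dict={state_placeholder: state_batch})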

def setup_net(self):
    self.build_net()
    self.input_vars = get_input_vars(self.get_net())
    self.y_hat = get_output(self.get_net())

def setup_net(self):
    self.build_net()
    self.y_hat = get_output(self.get_net())
    self.target_y_hat = get_output(self.get_target_net())
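
# Minimal sketch, an assumption rather than source code: keeping the target
# net from setup_net() in sync with the online net, mirroring the
# set_all_params_ops / get_all_params pattern used for the value model above.
# The method name, the `sess` argument and the cached-ops attribute are
# assumptions.
def update_target_weights(self, sess):
    if not hasattr(self, '_update_target_ops'):
        # Ops that copy online-net parameters into the target net.
        self._update_target_ops = set_all_params_ops(
            get_all_params(self.get_target_net()),
            get_all_params(self.get_net()))
    sess.run(self._update_target_ops)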

def setup_net(self):
    self.build_net()
    self.policy_y_hat = get_output(self.policy)
    self.value_y_hat = get_output(self.value)
    self.policy_input_vars = get_input_vars(self.policy)
    self.value_input_vars = get_input_vars(self.value)