import cv2 as cv
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from gym_super_mario_bros.actions import RIGHT_ONLY

# EncoderModel, ActorModel, CriticModel, A2CGaeTrain, and create_train_env
# are this project's own classes/helpers, defined elsewhere in the codebase.


class ExplorerWorker:
    def __init__(self, name, actions):
        self.e_model = EncoderModel()
        self.act_model = ActorModel(actions)
        self.c_model = CriticModel()
        self.t_model = A2CGaeTrain(self.e_model, self.act_model,
                                   self.c_model, name, lr=2e-6)
        self.name = name
        self.steps = 0
        self.log_dir = 'logs/{}'.format(name)
        self.summary_writer = tf.summary.create_file_writer(self.log_dir)
        # Running means that back the TensorBoard scalars.
        self.boards = {
            'reward': tf.keras.metrics.Mean('reward_board', dtype=tf.float32),
            'actor_loss': tf.keras.metrics.Mean('train_loss', dtype=tf.float32),
            'critic_loss': tf.keras.metrics.Mean('train_loss_c', dtype=tf.float32),
        }

    def set_weights(self, model):
        # Copy the encoder/actor/critic weights from another worker.
        self.e_model.update_model(model.e_model)
        self.act_model.update_model(model.act_model)
        self.c_model.update_model(model.c_model)

    def get_weights(self):
        return (self.e_model.trainable_variables
                + self.act_model.trainable_variables
                + self.c_model.trainable_variables)

    def set_board(self, name, value):
        self.boards[name](value)

    def get_board(self, name):
        return self.boards[name].result()

    def get_values(self, state):
        # Encode the raw state, then query the actor and the critic.
        embedding = self.e_model(
            tf.convert_to_tensor(state[None, ...], dtype=tf.float32))
        action, p, log_policy = self.act_model.get_action(embedding)
        value = np.asarray(self.c_model(embedding))[0][0]
        return action, p, log_policy, value

    def train(self, states, actions, rewards, next_states, values,
              old_log_policies, R, adv, dones):
        return self.t_model.get_gradients(states, actions, rewards,
                                          next_states, values,
                                          old_log_policies, R, adv, dones)

    def update_model(self, cg, ag, eg):
        # Apply the critic, actor, and encoder gradients.
        self.t_model.apply_grads(cg, ag, eg)
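# --- Illustration (not part of the original source) -------------------------
# A minimal sketch of how the `R` (returns) and `adv` (advantages) passed to
# ExplorerWorker.train() are typically computed with Generalized Advantage
# Estimation, assuming `gamma` and `tau` as set in the script below and
# `next_value` as the critic's estimate for the state after the rollout.
# `compute_gae` is a hypothetical helper; the real computation lives inside
# A2CGaeTrain in this project.
def compute_gae(rewards, values, dones, next_value, gamma=0.9, tau=1.0):
    gae = 0.0
    R, adv = [], []
    values = list(values) + [next_value]
    for t in reversed(range(len(rewards))):
        mask = 1.0 - float(dones[t])            # zero the bootstrap at episode ends
        delta = rewards[t] + gamma * values[t + 1] * mask - values[t]
        gae = delta + gamma * tau * mask * gae  # tau plays the role of GAE's lambda
        adv.insert(0, gae)
        R.insert(0, gae + values[t])            # return target = advantage + baseline
    return R, adv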
cv.waitKey(1)

# Hyperparameters.
size = (84, 84)          # observation frame size (H, W)
bs = 1024                # batch size
buffer_memory = 5000
t_steps = 1000
n_episodes = 100000
num_local_steps = 100    # rollout length per update
num_epochs = 5
gamma = 0.9              # discount factor
tau = 1.0                # GAE smoothing parameter (lambda)

# World 1-1 with the RIGHT_ONLY action set.
env = create_train_env(1, 1, RIGHT_ONLY, random=True)

e_model = EncoderModel()
act_model = ActorModel(env.action_space.n)
c_model = CriticModel()
t_model = A2CGaeTrain(e_model, act_model, c_model)

fig, axs = plt.subplots(1)
fig.suptitle('Vertically stacked subplots')

curr_states, state_ = env.reset()
curr_episode = 0
ct = 0
while True:
    curr_episode += 1
    actions = []
    values = []
    states = []
    _states = []
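    rewards = []
    dones = []
    old_log_policies = []

    # --- Illustration (the original listing breaks off above) ---------------
    # A plausible continuation of such an A2C/GAE rollout loop, reusing the
    # names defined above and assuming the classic gym step API
    # (obs, reward, done, info). `compute_gae` is the hypothetical helper
    # sketched earlier; the project's real loop body is not in this excerpt.
    for _ in range(num_local_steps):
        embedding = e_model(
            tf.convert_to_tensor(curr_states[None, ...], dtype=tf.float32))
        action, p, log_policy = act_model.get_action(embedding)
        value = np.asarray(c_model(embedding))[0][0]
        next_states, reward, done, info = env.step(action)

        states.append(curr_states)
        _states.append(next_states)
        actions.append(action)
        values.append(value)
        rewards.append(reward)
        dones.append(done)
        old_log_policies.append(log_policy)

        if done:
            curr_states, state_ = env.reset()
        else:
            curr_states = next_states

    # Bootstrap from the last state and build GAE targets for the update.
    last_emb = e_model(
        tf.convert_to_tensor(curr_states[None, ...], dtype=tf.float32))
    next_value = np.asarray(c_model(last_emb))[0][0]
    R, adv = compute_gae(rewards, values, dones, next_value, gamma, tau)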