class WrappedEnv(gym.Env):
    """Gym environment wrapper that exposes learned feature encodings as
    observations.

    Wraps the environment named `env_name` and, on every `step()`, performs
    one online training update of three auxiliary models:

    - an ``Encoder`` mapping raw states to `feature_dim`-dim feature vectors,
    - a forward ``Dynamics`` model operating in feature space, and
    - an ``InverseDynamics`` model operating in feature space.

    Observations returned by `step()`/`reset()` are the encoder's feature
    vectors, so `observation_space` is an unbounded Box of size
    `feature_dim` while `action_space` is inherited from the wrapped env.
    """

    def __init__(self, sess=None, env_name=None, feature_dim=None,
                 encoder_gamma=None, encoder_hidden_size=None,
                 dynamics_hidden_size=None, invdyn_hidden_size=None,
                 encoder_lr=None, dynamics_lr=None, invdyn_lr=None):
        """Build the wrapped env and its encoder/dynamics models.

        Args:
            sess: session shared by all three models (presumably a
                TensorFlow session — confirm against Encoder/Dynamics).
            env_name: id passed to ``gym.make``.
            feature_dim: size of the encoder's output feature vector.
            encoder_gamma: stored but not used in this class as shown.
            encoder_hidden_size / dynamics_hidden_size / invdyn_hidden_size:
                width of each of the two hidden layers per model.
            encoder_lr / dynamics_lr / invdyn_lr: per-model learning rates.
        """
        super(WrappedEnv, self).__init__()
        self._sess = sess
        self._env = gym.make(env_name)
        self._state_dim = self._env.observation_space.shape[0]
        self._action_dim = self._env.action_space.shape[0]
        self._feature_dim = feature_dim
        self._encoder_gamma = encoder_gamma
        # NOTE(review): declared but never used anywhere in this class.
        self._experience_buffer_size = 50000
        # The agent observes encoder features, not raw env states.
        self.observation_space = spaces.Box(
            np.array([-np.inf] * self._feature_dim),
            np.array([np.inf] * self._feature_dim))
        self.action_space = self._env.action_space
        self._num_hidden_layers = 2
        self._encoder_hidden_sizes = (
            [encoder_hidden_size] * self._num_hidden_layers)
        self._dynamics_hidden_sizes = (
            [dynamics_hidden_size] * self._num_hidden_layers)
        self._invdyn_hidden_sizes = (
            [invdyn_hidden_size] * self._num_hidden_layers)
        self._encoder = Encoder(sess=self._sess,
                                input_dim=self._state_dim,
                                output_dim=feature_dim,
                                hidden_sizes=self._encoder_hidden_sizes,
                                learning_rate=encoder_lr)
        self._dynamics = Dynamics(sess=self._sess,
                                  state_dim=feature_dim,
                                  action_dim=self._action_dim,
                                  hidden_sizes=self._dynamics_hidden_sizes,
                                  learning_rate=dynamics_lr)
        self._inv_dynamics = InverseDynamics(
            sess=self._sess,
            state_dim=feature_dim,
            action_dim=self._action_dim,
            hidden_sizes=self._invdyn_hidden_sizes,
            learning_rate=invdyn_lr)
        self._state = self._env.reset()

    def step(self, action):
        """Step the wrapped env, train the models online on the resulting
        (s, a, s') transition, and return the ENCODED next state.

        Returns:
            (encoded_next_state, reward, terminal, info) — note the
            observation is the encoder's pre-update encoding of the raw
            next state, matching `observation_space`.
        """
        next_state, reward, terminal, info = self._env.step(action)

        # Encode current and next raw states once each. (The original code
        # ran the encoder forward pass on next_state twice per step.)
        encoded_state = self._encoder.predict(state=self._state)
        encoded_next_state = self._encoder.predict(state=next_state)

        # One online update of each dynamics model on this transition.
        self._dynamics.update(state=encoded_state,
                              action=action,
                              next_state=encoded_next_state)
        self._inv_dynamics.update(state=encoded_state,
                                  action=action,
                                  next_state=encoded_next_state)

        # Input-gradients of the model losses; the first `feature_dim`
        # entries of each correspond to the encoded-state input.
        dyn_gradients = self._dynamics.calc_gradients(
            state=encoded_state, action=action)
        invdyn_gradients = self._inv_dynamics.calc_gradients(
            state=encoded_state, next_state=encoded_next_state)
        # NOTE(review): the original bound this result to an unused local;
        # the call is kept in case calc_gradients has side effects — confirm
        # against Encoder and remove if it is pure.
        self._encoder.calc_gradients(state=self._state)

        dyn_state_gradients = dyn_gradients[:self._feature_dim]
        # (dyn_gradients[self._feature_dim:], the action part, is unused.)
        invdyn_state_gradients = invdyn_gradients[:self._feature_dim]
        invdyn_nstate_gradients = invdyn_gradients[self._feature_dim:]
        # Combine inverse-dynamics state/next-state gradients with the
        # forward-dynamics state gradient to drive the encoder update.
        encoder_optim_grads = np.dot(
            invdyn_state_gradients,
            invdyn_nstate_gradients) * dyn_state_gradients
        # Encoder is updated on the PRE-step state, before advancing.
        self._encoder.update(state=self._state,
                             optim_grads=encoder_optim_grads)

        self._state = next_state
        return encoded_next_state, reward, terminal, info

    def reset(self):
        """Reset the wrapped env and return the ENCODED initial state."""
        self._state = self._env.reset()
        encoded_state = self._encoder.predict(state=self._state)
        return encoded_state

    def render(self):
        """Delegate rendering to the wrapped environment."""
        self._env.render()
# NOTE(review): fragment of a module-level smoke-test script. The names
# `encoder`, `encoded_1`, `state_1`, `state_2`, `reward_1`, `reward_2`,
# `wasserstein`, `state_dim`, `action_dim`, and `dynamics` are all defined
# earlier in the file (not visible in this chunk).
print("Before:", encoded_1)
# Single encoder update on the two sampled states; positional call —
# presumably matches Encoder.update's signature, TODO confirm (it is
# called with keyword args `state=` / `optim_grads=` elsewhere).
encoder.update(state_1, state_2, reward_1, reward_2, wasserstein)
# Re-encode state_1 to show the effect of the update.
encoded_1 = encoder.predict(state=state_1)
print("After:", encoded_1)
#################################################################################
############################ Test Dynamics ######################################
# Smoke-test the forward dynamics model: sample a random (state, action,
# next_state) triple, encode the states, and print the dynamics model's
# prediction before and after one supervised update on that triple.
state = np.random.rand(state_dim)
next_state = np.random.rand(state_dim)
action = np.random.rand(action_dim)
encoded_state = encoder.predict(state=state)
encoded_next_state = encoder.predict(state=next_state)
next_state_from_dynamics = dynamics.predict(state=encoded_state, action=action)
print("Before:", next_state_from_dynamics)
dynamics.update(state=encoded_state, action=action,
                next_state=encoded_next_state)
next_state_from_dynamics = dynamics.predict(state=encoded_state, action=action)
print("After:", next_state_from_dynamics)
#################################################################################