Example #1
# Imports assumed by this snippet (import paths follow the usual stable-baselines layout):
import numpy as np
import tensorflow as tf
from gym.spaces import Box
from stable_baselines.common.math_util import scale_action, unscale_action


def test_batch_shape_invariant_to_scaling():
    """
    Test that scale_action and unscale_action preserve the batch shape
    for both TensorFlow tensors and NumPy arrays.
    action_space = Box(np.array([-10., -5., -1.]), np.array([10., 3., 2.]))

    tensor = tf.constant(1., shape=[2, 3])
    matrix = np.ones((2, 3))

    assert scale_action(action_space, tensor).shape == (2, 3)
    assert scale_action(action_space, matrix).shape == (2, 3)

    assert unscale_action(action_space, tensor).shape == (2, 3)
    assert unscale_action(action_space, matrix).shape == (2, 3)
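
Both helpers apply an element-wise affine map between the action space bounds and the tanh co-domain [-1, 1], which is why the batch shape is preserved above. A minimal NumPy sketch of that transformation, assuming Box-style low/high bounds (a reference illustration, not necessarily the library's exact source):

import numpy as np

def scale_action_sketch(action_space, action):
    # Map an action from [low, high] to [-1, 1], element-wise and shape-preserving.
    low, high = action_space.low, action_space.high
    return 2.0 * ((action - low) / (high - low)) - 1.0

def unscale_action_sketch(action_space, scaled_action):
    # Map a scaled action from [-1, 1] back to [low, high].
    low, high = action_space.low, action_space.high
    return low + 0.5 * (scaled_action + 1.0) * (high - low)

Composing the two is the identity up to floating-point error, which is the round trip Example #3 below checks at the low, middle and high values.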
Example #2
    def predict(self,
                observation,
                state=None,
                mask=None,
                deterministic=True,
                action_mask=None):
        observation = np.array(observation)
        vectorized_env = self._is_vectorized_observation(
            observation, self.observation_space)

        observation = observation.reshape((-1, ) +
                                          self.observation_space.shape)
        actions = self.policy_tf.step(observation)

        if self.action_noise is not None and not deterministic:
            actions = np.clip(actions + self.action_noise(), -1, 1)

        actions = actions.reshape(
            (-1, ) +
            self.action_space.shape)  # reshape to the correct action shape
        actions = unscale_action(
            self.action_space, actions)  # rescale from [-1, 1] to the action space bounds

        if not vectorized_env:
            actions = actions[0]

        return actions, None
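
A hedged usage sketch of the predict() method above; `model` stands for an instance of the class that defines it and `env` for its (non-vectorized) gym environment, both placeholders:

# `model` and `env` are placeholders for a trained model exposing predict()
# and the gym environment it was trained on.
obs = env.reset()
action, _ = model.predict(obs, deterministic=True)
obs, reward, done, info = env.step(action)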
Example #3
def check_scaled_actions_from_range(low, high, scalar=False):
    """
    helper method which creates dummy action space spanning between respective components of low and high
    and then checks scaling to and from tanh co-domain for low, middle and high value from  that action space
    :param low: (np.ndarray), (int) or (float)
    :param high: (np.ndarray), (int) or (float)
    :param scalar: (bool) Whether consider scalar range or wrap it into 1d vector
    """

    if scalar and isinstance(low, (int, float)):
        ones = 1.
        action_space = Box(low, high, shape=(1, ))
    else:
        low = np.atleast_1d(low)
        high = np.atleast_1d(high)
        ones = np.ones_like(low)
        action_space = Box(low, high)

    mid = 0.5 * (low + high)

    expected_mapping = [(low, -ones), (mid, 0. * ones), (high, ones)]

    for (not_scaled, scaled) in expected_mapping:
        assert np.allclose(scale_action(action_space, not_scaled), scaled)
        assert np.allclose(unscale_action(action_space, scaled), not_scaled)
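
The helper above can then be exercised over scalar and vector ranges; the concrete bounds below are illustrative only:

# Illustrative calls: any bounds with low < high should satisfy the round trip.
check_scaled_actions_from_range(-1.0, 1.0, scalar=True)
check_scaled_actions_from_range(-10.0, 2.0, scalar=True)
check_scaled_actions_from_range(np.array([-10., -5., -1.]), np.array([10., 3., 2.]))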
Example #4
    def predict(self, observation, state=None, mask=None, deterministic=False):
        observation = np.array(observation)
        vectorized_env = self._is_vectorized_observation(
            observation, self.observation_space)

        observation = observation.reshape((-1, ) +
                                          self.observation_space.shape)
        actions = self.policy_tf.step(observation, deterministic=deterministic)
        actions = actions.reshape(
            (-1, ) +
            self.action_space.shape)  # reshape to the correct action shape
        actions = unscale_action(
            self.action_space, actions)  # rescale from [-1, 1] to the action space bounds

        if not vectorized_env:
            actions = actions[0]

        return actions, None
Example #5
    def learn(self, total_timesteps, callback=None,
              log_interval=1, tb_log_name="SAC", print_freq=100):

        with TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer:

            self._setup_learn()

            # Transform to callable if needed
            self.learning_rate = get_schedule_fn(self.learning_rate)

            start_time = time.time()
            episode_rewards = [0.0]
            is_teleop_env = hasattr(self.env, "wait_for_teleop_reset")
            # TeleopEnv
            if is_teleop_env:
                print("Waiting for teleop")
                obs = self.env.wait_for_teleop_reset()
            else:
                obs = self.env.reset()

            self.episode_reward = np.zeros((1,))
            ep_info_buf = deque(maxlen=100)
            ep_len = 0
            self.n_updates = 0
            infos_values = []
            mb_infos_vals = []

            for step in range(total_timesteps):
                # Compute the current learning rate (see the schedule sketch after this example)
                frac = 1.0 - step / total_timesteps
                current_lr = self.learning_rate(frac)
                print('STEP: %d' % step)

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                # Before training starts, randomly sample actions
                # from a uniform distribution for better exploration.
                # Afterwards, use the learned policy.
                if step < self.learning_starts:
                    action = self.env.action_space.sample()
                    # No need to rescale when sampling random action
                    rescaled_action = action
                else:
                    action = self.policy_tf.step(obs[None], deterministic=False).flatten()
                    print("action %s" % str(action))
                    # Rescale from [-1, 1] to the correct bounds
                    # rescaled_action = action * np.abs(self.env.action_space.low)
                    rescaled_action = unscale_action(self.action_space, action)
                    print("rescaled_action %s" % str(rescaled_action))

                print("self.action_space %s" % str(self.action_space))

                assert action.shape == self.env.action_space.shape

                new_obs, reward, done, info = self.env.step(rescaled_action)
                ep_len += 1
                print('got reward %s' % str(reward))

                if print_freq > 0 and ep_len % print_freq == 0 and ep_len > 0:
                    print("{} steps".format(ep_len))

                # Store transition in the replay buffer.
                self.replay_buffer.add(obs, action, reward, new_obs, float(done))
                obs = new_obs
                print('obs: ' + str(obs.shape))

                # Retrieve reward and episode length if using Monitor wrapper
                maybe_ep_info = info.get('episode')
                if maybe_ep_info is not None:
                    ep_info_buf.extend([maybe_ep_info])

                print('WRITER OUT %s' % str(writer))
                if writer is not None:
                    # Write reward per episode to tensorboard
                    ep_reward = np.array([reward]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_reward,
                                                                      ep_done, writer, step)

                if ep_len > self.train_freq:
                    print("Additional training")
                    self.env.reset()
                    mb_infos_vals = self.optimize(step, writer, current_lr)
                    done = True

                episode_rewards[-1] += reward
                if done:
                    print('DONE')
                    if not (isinstance(self.env, VecEnv) or is_teleop_env):
                        obs = self.env.reset()

                    print("Episode finished. Reward: {:.2f} {} Steps".format(episode_rewards[-1], ep_len))
                    episode_rewards.append(0.0)
                    ep_len = 0
                    mb_infos_vals = self.optimize(step, writer, current_lr)

                    # Refresh obs when using TeleopEnv
                    if is_teleop_env:
                        print("Waiting for teleop")
                        obs = self.env.wait_for_teleop_reset()

                # Log losses and entropy, useful for monitoring training
                if len(mb_infos_vals) > 0:
                    infos_values = np.mean(mb_infos_vals, axis=0)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_reward = -np.inf
                else:
                    mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                if self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0:
                    fps = int(step / (time.time() - start_time))
                    logger.logkv("episodes", num_episodes)
                    logger.logkv("mean 100 episode reward", mean_reward)
                    logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                    logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv("n_updates", self.n_updates)
                    logger.logkv("current_lr", current_lr)
                    logger.logkv("fps", fps)
                    logger.logkv('time_elapsed', "{:.2f}".format(time.time() - start_time))
                    if len(infos_values) > 0:
                        for (name, val) in zip(self.infos_names, infos_values):
                            logger.logkv(name, val)
                    logger.logkv("total timesteps", step)
                    logger.dumpkvs()
                    # Reset infos:
                    infos_values = []
                    
                # Check whether the game is over (disabled: the `False and` guard makes this a no-op)
                if False and self.env.is_game_over():
                    while self.env.is_game_over():
                        print('waiting for control')
                        time.sleep(1)
                    
            if is_teleop_env:
                self.env.is_training = False
            # Use last batch
            print("Final optimization before saving")
            self.env.reset()
            mb_infos_vals = self.optimize(step, writer, current_lr)
        return self
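
The frac / current_lr computation near the top of the step loop converts a remaining-progress fraction into a learning rate via a schedule callable. A self-contained sketch of that pattern, with illustrative schedule functions (the names below are assumptions, not the library's get_schedule_fn):

def constant_schedule(value):
    # Ignore the progress fraction and always return the same learning rate.
    return lambda progress_remaining: value

def linear_schedule(initial_value):
    # Anneal linearly from initial_value to 0 as progress_remaining goes 1 -> 0.
    return lambda progress_remaining: progress_remaining * initial_value

total_timesteps = 1000
schedule = linear_schedule(3e-4)
for step in range(total_timesteps):
    frac = 1.0 - step / total_timesteps  # fraction of training still remaining
    current_lr = schedule(frac)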