# Imports assumed by these tests; scale_action/unscale_action are expected to
# live in stable_baselines.common.math_util, as in upstream stable-baselines.
import numpy as np
import tensorflow as tf
from gym.spaces import Box

from stable_baselines.common.math_util import scale_action, unscale_action


def test_batch_shape_invariant_to_scaling():
    """
    Test that scaling handles batches given as tensors and as numpy matrices,
    in terms of output shape.
    """
    action_space = Box(np.array([-10., -5., -1.]), np.array([10., 3., 2.]))

    tensor = tf.constant(1., shape=[2, 3])
    matrix = np.ones((2, 3))

    assert scale_action(action_space, tensor).shape == (2, 3)
    assert scale_action(action_space, matrix).shape == (2, 3)

    assert unscale_action(action_space, tensor).shape == (2, 3)
    assert unscale_action(action_space, matrix).shape == (2, 3)
def predict(self, observation, state=None, mask=None, deterministic=True, action_mask=None):
    observation = np.array(observation)
    vectorized_env = self._is_vectorized_observation(observation, self.observation_space)

    observation = observation.reshape((-1,) + self.observation_space.shape)
    actions = self.policy_tf.step(observation)

    if self.action_noise is not None and not deterministic:
        actions = np.clip(actions + self.action_noise(), -1, 1)

    actions = actions.reshape((-1,) + self.action_space.shape)  # reshape to the correct action shape
    actions = unscale_action(self.action_space, actions)  # scale the output for the prediction

    if not vectorized_env:
        actions = actions[0]

    return actions, None
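# The noisy branch in predict() above expects ``self.action_noise`` to be a
# callable returning a perturbation in the squashed [-1, 1] co-domain. A minimal
# sketch of building such a noise object, assuming the stable_baselines
# NormalActionNoise helper is available; how this model actually wires its noise
# is not shown in this excerpt.
from stable_baselines.common.noise import NormalActionNoise


def _make_action_noise_sketch(env, sigma=0.1):
    # zero-mean Gaussian noise, one component per action dimension
    n_actions = env.action_space.shape[-1]
    return NormalActionNoise(mean=np.zeros(n_actions), sigma=sigma * np.ones(n_actions))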
def check_scaled_actions_from_range(low, high, scalar=False):
    """
    Helper method which creates a dummy action space spanning between the respective
    components of low and high, and then checks scaling to and from the tanh co-domain
    for the low, middle and high values of that action space.

    :param low: (np.ndarray), (int) or (float)
    :param high: (np.ndarray), (int) or (float)
    :param scalar: (bool) Whether to consider a scalar range or wrap it into a 1d vector
    """
    if scalar and isinstance(low, (float, int)):
        ones = 1.
        action_space = Box(low, high, shape=(1,))
    else:
        low = np.atleast_1d(low)
        high = np.atleast_1d(high)
        ones = np.ones_like(low)
        action_space = Box(low, high)

    mid = 0.5 * (low + high)

    expected_mapping = [(low, -ones), (mid, 0. * ones), (high, ones)]

    for (not_scaled, scaled) in expected_mapping:
        assert np.allclose(scale_action(action_space, not_scaled), scaled)
        assert np.allclose(unscale_action(action_space, scaled), not_scaled)
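# For reference while reading the test above: a minimal sketch of the affine
# mapping that scale_action/unscale_action are expected to implement between the
# action-space bounds [low, high] and the tanh co-domain [-1, 1]. These are
# illustrative re-implementations, not the functions under test.
def _scale_action_sketch(action_space, action):
    # [low, high] -> [-1, 1]
    low, high = action_space.low, action_space.high
    return 2.0 * ((action - low) / (high - low)) - 1.0


def _unscale_action_sketch(action_space, scaled_action):
    # [-1, 1] -> [low, high]
    low, high = action_space.low, action_space.high
    return low + 0.5 * (scaled_action + 1.0) * (high - low)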
def predict(self, observation, state=None, mask=None, deterministic=False):
    observation = np.array(observation)
    vectorized_env = self._is_vectorized_observation(observation, self.observation_space)

    observation = observation.reshape((-1,) + self.observation_space.shape)
    actions = self.policy_tf.step(observation, deterministic=deterministic)
    actions = actions.reshape((-1,) + self.action_space.shape)  # reshape to the correct action shape
    actions = unscale_action(self.action_space, actions)  # scale the output for the prediction

    if not vectorized_env:
        actions = actions[0]

    return actions, None
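# A minimal usage sketch for predict() above; ``model`` and ``env`` are
# hypothetical stand-ins for a trained instance of this class and a gym
# environment. predict() already returns actions unscaled to the env bounds,
# so they can be passed straight to env.step().
def _rollout_sketch(model, env, deterministic=True):
    obs = env.reset()
    done = False
    total_reward = 0.0
    while not done:
        action, _states = model.predict(obs, deterministic=deterministic)
        obs, reward, done, _info = env.step(action)
        total_reward += reward
    return total_reward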
def learn(self, total_timesteps, callback=None,
          log_interval=1, tb_log_name="SAC", print_freq=100):

    with TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer:

        self._setup_learn()

        # Transform to callable if needed
        self.learning_rate = get_schedule_fn(self.learning_rate)

        start_time = time.time()
        episode_rewards = [0.0]
        is_teleop_env = hasattr(self.env, "wait_for_teleop_reset")  # TeleopEnv
        if is_teleop_env:
            print("Waiting for teleop")
            obs = self.env.wait_for_teleop_reset()
        else:
            obs = self.env.reset()

        self.episode_reward = np.zeros((1,))
        ep_info_buf = deque(maxlen=100)
        ep_len = 0
        self.n_updates = 0
        infos_values = []
        mb_infos_vals = []

        for step in range(total_timesteps):
            # Compute current learning_rate
            frac = 1.0 - step / total_timesteps
            current_lr = self.learning_rate(frac)

            print('STEP: %d' % step)
            if callback is not None:
                # Only stop training if return value is False, not when it is None. This is for backwards
                # compatibility with callbacks that have no return statement.
                if callback(locals(), globals()) is False:
                    break

            # Before training starts, randomly sample actions
            # from a uniform distribution for better exploration.
            # Afterwards, use the learned policy.
            if step < self.learning_starts:
                action = self.env.action_space.sample()
                # No need to rescale when sampling random action
                rescaled_action = action
            else:
                action = self.policy_tf.step(obs[None], deterministic=False).flatten()
                print("action %s" % str(action))
                # Rescale from [-1, 1] to the correct bounds
                # rescaled_action = action * np.abs(self.env.action_space.low)
                rescaled_action = unscale_action(self.action_space, action)
                print("rescaled_action %s" % str(rescaled_action))
                print("self.action_space %s" % str(self.action_space))

            assert action.shape == self.env.action_space.shape

            new_obs, reward, done, info = self.env.step(rescaled_action)
            ep_len += 1
            print('got reward %s' % str(reward))

            if print_freq > 0 and ep_len % print_freq == 0 and ep_len > 0:
                print("{} steps".format(ep_len))

            # Store transition in the replay buffer.
            self.replay_buffer.add(obs, action, reward, new_obs, float(done))
            obs = new_obs
            print('obs: ' + str(obs.shape))

            # Retrieve reward and episode length if using Monitor wrapper
            maybe_ep_info = info.get('episode')
            if maybe_ep_info is not None:
                ep_info_buf.extend([maybe_ep_info])

            print('WRITER OUT %s' % str(writer))
            if writer is not None:
                # Write reward per episode to tensorboard
                ep_reward = np.array([reward]).reshape((1, -1))
                ep_done = np.array([done]).reshape((1, -1))
                self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_reward,
                                                                  ep_done, writer, step)

            if ep_len > self.train_freq:
                print("Additional training")
                self.env.reset()
                mb_infos_vals = self.optimize(step, writer, current_lr)
                done = True

            episode_rewards[-1] += reward
            if done:
                print('DONE')
                if not (isinstance(self.env, VecEnv) or is_teleop_env):
                    obs = self.env.reset()

                print("Episode finished. Reward: {:.2f} {} Steps".format(episode_rewards[-1], ep_len))
                episode_rewards.append(0.0)
                ep_len = 0
                mb_infos_vals = self.optimize(step, writer, current_lr)

                # Refresh obs when using TeleopEnv
                if is_teleop_env:
                    print("Waiting for teleop")
                    obs = self.env.wait_for_teleop_reset()

            # Log losses and entropy, useful for monitoring training
            if len(mb_infos_vals) > 0:
                infos_values = np.mean(mb_infos_vals, axis=0)

            if len(episode_rewards[-101:-1]) == 0:
                mean_reward = -np.inf
            else:
                mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)

            num_episodes = len(episode_rewards)
            if self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0:
                fps = int(step / (time.time() - start_time))
                logger.logkv("episodes", num_episodes)
                logger.logkv("mean 100 episode reward", mean_reward)
                logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                logger.logkv("n_updates", self.n_updates)
                logger.logkv("current_lr", current_lr)
                logger.logkv("fps", fps)
                logger.logkv('time_elapsed', "{:.2f}".format(time.time() - start_time))
                if len(infos_values) > 0:
                    for (name, val) in zip(self.infos_names, infos_values):
                        logger.logkv(name, val)
                logger.logkv("total timesteps", step)
                logger.dumpkvs()
                # Reset infos:
                infos_values = []

            # check if game is over (the False guard keeps this branch disabled)
            if False and self.env.is_game_over():
                while self.env.is_game_over():
                    print('waiting for control')
                    time.sleep(1)

        if is_teleop_env:
            self.env.is_training = False
        # Use last batch
        print("Final optimization before saving")
        self.env.reset()
        mb_infos_vals = self.optimize(step, writer, current_lr)

    return self
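# A minimal end-to-end usage sketch for the learn() loop above. ``model_cls``
# and ``env`` are placeholders: the excerpt does not show the class name or its
# constructor, so this assumes it follows the upstream stable_baselines SAC
# signature (policy name first, then the environment).
def _train_sketch(model_cls, env, total_timesteps=10000):
    model = model_cls('MlpPolicy', env, verbose=1)
    model.learn(total_timesteps=total_timesteps, log_interval=10, print_freq=100)
    return model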