def create_predictor(trainer, model_type, use_gpu, action_dim=None):
    if model_type == ModelType.SOFT_ACTOR_CRITIC.value:
        predictor = GymSACPredictor(trainer, action_dim)
    elif model_type in (
        ModelType.PYTORCH_DISCRETE_DQN.value,
        ModelType.PYTORCH_PARAMETRIC_DQN.value,
    ):
        predictor = GymDQNPredictor(trainer, action_dim)
    else:
        raise NotImplementedError()
    return predictor
def create_predictor(trainer, model_type, use_gpu):
    if model_type == ModelType.CONTINUOUS_ACTION.value:
        predictor = GymDDPGPredictor(trainer)
    elif model_type in (
        ModelType.PYTORCH_DISCRETE_DQN.value,
        ModelType.PYTORCH_PARAMETRIC_DQN.value,
    ):
        predictor = GymDQNPredictor(trainer)
    else:
        raise NotImplementedError()
    return predictor
def create_predictor(trainer, model_type, use_gpu):
    # Device option for the legacy Caffe2 predictor path (GPU if requested, else CPU).
    c2_device = core.DeviceOption(caffe2_pb2.CUDA if use_gpu else caffe2_pb2.CPU)
    if model_type == ModelType.CONTINUOUS_ACTION.value:
        predictor = GymDDPGPredictor(trainer)
    elif model_type in (
        ModelType.PYTORCH_DISCRETE_DQN.value,
        ModelType.PYTORCH_PARAMETRIC_DQN.value,
    ):
        predictor = GymDQNPredictorPytorch(trainer)
    else:
        predictor = GymDQNPredictor(trainer, c2_device)
    return predictor
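# Illustrative usage only (not part of the original module): `trainer` is assumed
# to be an already-constructed trainer from the surrounding gym test harness, and
# the ModelType values are the ones referenced above. A PyTorch DQN model type
# yields the PyTorch predictor; other model types fall back to the Caffe2-device
# or DDPG predictors as shown in the versions above.
example_predictor = create_predictor(
    trainer, ModelType.PYTORCH_DISCRETE_DQN.value, use_gpu=False
)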
def run(
    c2_device,
    gym_env,
    model_type,
    trainer,
    test_run_name,
    score_bar,
    num_episodes=301,
    max_steps=None,
    train_every_ts=100,
    train_after_ts=10,
    test_every_ts=100,
    test_after_ts=10,
    num_train_batches=10,
    avg_over_num_episodes=100,
    render=False,
    render_every=10,
    save_timesteps_to_dataset=None,
):
    avg_reward_history = []

    if model_type == ModelType.CONTINUOUS_ACTION.value:
        predictor = GymDDPGPredictor(trainer)
    else:
        predictor = GymDQNPredictor(trainer, c2_device)

    total_timesteps = 0

    for i in range(num_episodes):
        terminal = False
        next_state = gym_env.transform_state(gym_env.env.reset())
        next_action = gym_env.policy(predictor, next_state, False)
        reward_sum = 0
        ep_timesteps = 0

        if model_type == ModelType.CONTINUOUS_ACTION.value:
            trainer.noise.clear()

        while not terminal:
            state = next_state
            action = next_action

            if render:
                gym_env.env.render()

            if gym_env.action_type == EnvType.DISCRETE_ACTION:
                action_index = np.argmax(action)
                next_state, reward, terminal, _ = gym_env.env.step(action_index)
            else:
                next_state, reward, terminal, _ = gym_env.env.step(action)

            next_state = gym_env.transform_state(next_state)
            ep_timesteps += 1
            total_timesteps += 1
            next_action = gym_env.policy(predictor, next_state, False)
            reward_sum += reward

            (
                possible_next_actions,
                possible_next_actions_lengths,
            ) = get_possible_next_actions(gym_env, model_type, terminal)

            gym_env.insert_into_memory(
                np.float32(state),
                action,
                np.float32(reward),
                np.float32(next_state),
                next_action,
                terminal,
                possible_next_actions,
                possible_next_actions_lengths,
                1,
            )

            if save_timesteps_to_dataset:
                # TODO: handle continuous/parametric actions.
                assert (
                    gym_env.action_type == EnvType.DISCRETE_ACTION
                ), "Save to file supports discrete actions only."
                action_str = str(np.argmax(action).item())
                possible_actions = [str(a) for a in range(gym_env.action_dim)]
                save_timesteps_to_dataset.insert(
                    i,
                    ep_timesteps - 1,
                    np.float32(state).tolist(),
                    action_str,
                    np.float32(reward).item(),
                    possible_actions,
                )
                if terminal:
                    save_timesteps_to_dataset.insert(
                        i,
                        ep_timesteps,
                        np.float32(next_state).tolist(),
                        None,
                        0.0,
                        [],
                    )

            # Training loop
            if (
                total_timesteps % train_every_ts == 0
                and total_timesteps > train_after_ts
                and len(gym_env.replay_memory) >= trainer.minibatch_size
            ):
                for _ in range(num_train_batches):
                    if model_type == ModelType.CONTINUOUS_ACTION.value:
                        samples = gym_env.sample_memories(trainer.minibatch_size)
                        trainer.train(samples)
                    else:
                        with core.DeviceScope(c2_device):
                            gym_env.sample_and_load_training_data_c2(
                                trainer.minibatch_size,
                                model_type,
                                trainer.maxq_learning,
                            )
                            trainer.train()

            # Evaluation loop
            if total_timesteps % test_every_ts == 0 and total_timesteps > test_after_ts:
                avg_rewards = gym_env.run_ep_n_times(
                    avg_over_num_episodes, predictor, test=True
                )
                avg_reward_history.append(avg_rewards)
                logger.info(
                    "Achieved an average reward score of {} over {} evaluations."
                    " Total episodes: {}, total timesteps: {}.".format(
                        avg_rewards, avg_over_num_episodes, i + 1, total_timesteps
                    )
                )
                if score_bar is not None and avg_rewards > score_bar:
                    logger.info(
                        "Avg. reward history for {}: {}".format(
                            test_run_name, avg_reward_history
                        )
                    )
                    return avg_reward_history

            if max_steps and ep_timesteps >= max_steps:
                break

        # Always eval on last episode if previous eval loop didn't return.
        if i == num_episodes - 1:
            avg_rewards = gym_env.run_ep_n_times(
                avg_over_num_episodes, predictor, test=True
            )
            avg_reward_history.append(avg_rewards)
            logger.info(
                "Achieved an average reward score of {} over {} evaluations."
                " Total episodes: {}, total timesteps: {}.".format(
                    avg_rewards, avg_over_num_episodes, i + 1, total_timesteps
                )
            )

    logger.info(
        "Avg. reward history for {}: {}".format(test_run_name, avg_reward_history)
    )
    return avg_reward_history
def run(
    c2_device,
    gym_env,
    model_type,
    trainer,
    test_run_name,
    score_bar,
    num_episodes=301,
    max_steps=None,
    train_every_ts=100,
    train_after_ts=10,
    test_every_ts=100,
    test_after_ts=10,
    num_train_batches=1,
    avg_over_num_episodes=100,
    render=False,
    save_timesteps_to_dataset=None,
    start_saving_from_episode=0,
    batch_rl_file_path=None,
):
    if model_type == ModelType.CONTINUOUS_ACTION.value:
        predictor = GymDDPGPredictor(trainer)
    elif model_type in (
        ModelType.PYTORCH_DISCRETE_DQN.value,
        ModelType.PYTORCH_PARAMETRIC_DQN.value,
    ):
        predictor = GymDQNPredictorPytorch(trainer)
    else:
        predictor = GymDQNPredictor(trainer, c2_device)

    if batch_rl_file_path is not None:
        return train_gym_batch_rl(
            model_type,
            trainer,
            predictor,
            batch_rl_file_path,
            gym_env,
            num_train_batches,
            test_every_ts,
            test_after_ts,
            avg_over_num_episodes,
            score_bar,
            test_run_name,
        )
    else:
        return train_gym_online_rl(
            c2_device,
            gym_env,
            model_type,
            trainer,
            predictor,
            test_run_name,
            score_bar,
            num_episodes,
            max_steps,
            train_every_ts,
            train_after_ts,
            test_every_ts,
            test_after_ts,
            num_train_batches,
            avg_over_num_episodes,
            render,
            save_timesteps_to_dataset,
            start_saving_from_episode,
        )
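# Hypothetical invocation of the dispatching run() above (not from the original
# module): c2_device, gym_env, and trainer are assumed to come from the
# surrounding test script, and the run name and score bar are example values.
# Supplying batch_rl_file_path routes to train_gym_batch_rl; leaving it None
# routes to train_gym_online_rl.
avg_reward_history = run(
    c2_device,
    gym_env,
    ModelType.PYTORCH_DISCRETE_DQN.value,
    trainer,
    test_run_name="cartpole_dqn_example",
    score_bar=195.0,
    save_timesteps_to_dataset=None,
    batch_rl_file_path=None,  # set to a timestep dataset path for batch RL
)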
def __init__(self, trainer, action_dim):
    GymDQNPredictor.__init__(self, trainer, action_dim)
def run(
    c2_device,
    gym_env,
    model_type,
    trainer,
    test_run_name,
    score_bar,
    num_episodes=301,
    max_steps=None,
    train_every_ts=100,
    train_after_ts=10,
    test_every_ts=100,
    test_after_ts=10,
    num_train_batches=10,
    avg_over_num_episodes=100,
    render=False,
    render_every=10,
):
    avg_reward_history = []

    if model_type == ModelType.CONTINUOUS_ACTION.value:
        predictor = GymDDPGPredictor(trainer)
    else:
        predictor = GymDQNPredictor(trainer, c2_device)

    total_timesteps = 0

    for i in range(num_episodes):
        terminal = False
        next_state = gym_env.transform_state(gym_env.env.reset())
        next_action = gym_env.policy(predictor, next_state, False)
        reward_sum = 0
        ep_timesteps = 0

        if model_type == ModelType.CONTINUOUS_ACTION.value:
            trainer.noise.clear()

        while not terminal:
            state = next_state
            action = next_action

            if render:
                gym_env.env.render()

            if gym_env.action_type == EnvType.DISCRETE_ACTION:
                action_index = np.argmax(action)
                next_state, reward, terminal, _ = gym_env.env.step(action_index)
            else:
                next_state, reward, terminal, _ = gym_env.env.step(action)

            next_state = gym_env.transform_state(next_state)
            ep_timesteps += 1
            total_timesteps += 1
            next_action = gym_env.policy(predictor, next_state, False)
            reward_sum += reward

            if model_type == ModelType.DISCRETE_ACTION.value:
                possible_next_actions = [
                    0 if terminal else 1 for __ in range(gym_env.action_dim)
                ]
                possible_next_actions_lengths = gym_env.action_dim
            elif model_type == ModelType.PARAMETRIC_ACTION.value:
                if terminal:
                    possible_next_actions = np.array([])
                    possible_next_actions_lengths = 0
                else:
                    possible_next_actions = np.eye(gym_env.action_dim)
                    possible_next_actions_lengths = gym_env.action_dim
            elif model_type == ModelType.CONTINUOUS_ACTION.value:
                possible_next_actions = None
                possible_next_actions_lengths = None

            gym_env.insert_into_memory(
                np.float32(state),
                action,
                np.float32(reward),
                np.float32(next_state),
                next_action,
                terminal,
                possible_next_actions,
                possible_next_actions_lengths,
                1,
            )

            # Training loop
            if (
                total_timesteps % train_every_ts == 0
                and total_timesteps > train_after_ts
                and len(gym_env.replay_memory) >= trainer.minibatch_size
            ):
                for _ in range(num_train_batches):
                    if model_type == ModelType.CONTINUOUS_ACTION.value:
                        samples = gym_env.sample_memories(trainer.minibatch_size)
                        trainer.train(samples)
                    else:
                        with core.DeviceScope(c2_device):
                            gym_env.sample_and_load_training_data_c2(
                                trainer.minibatch_size,
                                model_type,
                                trainer.maxq_learning,
                            )
                            trainer.train(episode_values=None, evaluator=None)

            # Evaluation loop
            if total_timesteps % test_every_ts == 0 and total_timesteps > test_after_ts:
                avg_rewards = gym_env.run_ep_n_times(
                    avg_over_num_episodes, predictor, test=True
                )
                avg_reward_history.append(avg_rewards)
                logger.info(
                    "Achieved an average reward score of {} over {} evaluations."
                    " Total episodes: {}, total timesteps: {}.".format(
                        avg_rewards, avg_over_num_episodes, i + 1, total_timesteps
                    )
                )
                if score_bar is not None and avg_rewards > score_bar:
                    logger.info(
                        "Avg. reward history for {}: {}".format(
                            test_run_name, avg_reward_history
                        )
                    )
                    return avg_reward_history

            if max_steps and ep_timesteps >= max_steps:
                break

        # Always eval on last episode if previous eval loop didn't return.
        if i == num_episodes - 1:
            avg_rewards = gym_env.run_ep_n_times(
                avg_over_num_episodes, predictor, test=True
            )
            avg_reward_history.append(avg_rewards)
            logger.info(
                "Achieved an average reward score of {} over {} evaluations."
                " Total episodes: {}, total timesteps: {}.".format(
                    avg_rewards, avg_over_num_episodes, i + 1, total_timesteps
                )
            )

    logger.info(
        "Avg. reward history for {}: {}".format(test_run_name, avg_reward_history)
    )
    return avg_reward_history