def test_open_ai_gym_generate_samples_multi_step(self):
    env = OpenAIGymEnvironment(
        "CartPole-v0",
        epsilon=1.0,  # take random actions to collect training data
        softmax_policy=False,
        gamma=0.9,
    )
    num_samples = 1000
    num_steps = 5
    samples = env.generate_random_samples(
        num_samples, use_continuous_action=True, epsilon=1.0, multi_steps=num_steps
    )
    self._check_samples(samples, num_samples, num_steps, True)
def main(model_path, temperature):
    model_path = glob.glob(model_path)[0]
    predictor = DiscreteDqnTorchPredictor(torch.jit.load(model_path))
    predictor.softmax_temperature = temperature
    env = OpenAIGymEnvironment(gymenv=ENV)
    avg_rewards, avg_discounted_rewards = env.run_ep_n_times(
        AVG_OVER_NUM_EPS, predictor, test=True
    )
    logger.info(
        "Achieved an average reward score of {} over {} evaluations.".format(
            avg_rewards, AVG_OVER_NUM_EPS
        )
    )
def main(model_path):
    predictor = DQNPredictor.load(model_path, "minidb", int_features=False)
    env = OpenAIGymEnvironment(gymenv=ENV)
    avg_rewards, avg_discounted_rewards = env.run_ep_n_times(
        AVG_OVER_NUM_EPS, predictor, test=True
    )
    logger.info(
        "Achieved an average reward score of {} over {} evaluations.".format(
            avg_rewards, AVG_OVER_NUM_EPS
        )
    )
def mdnrnn_gym(params, gpu_id, feature_importance):
    logger.info("Running gym with params")
    logger.info(params)
    env_type = params["env"]
    env = OpenAIGymEnvironment(env_type, epsilon=1.0, softmax_policy=True, gamma=0.99)

    use_gpu = gpu_id != USE_CPU
    if use_gpu:
        raise NotImplementedError()
    trainer = create_trainer(params, env)
    c2_device = core.DeviceOption(
        caffe2_pb2.CUDA if use_gpu else caffe2_pb2.CPU, int(gpu_id)
    )
    _, _, trainer = train_sgd(
        c2_device,
        env,
        trainer,
        "{} test run".format(env_type),
        params["mdnrnn"]["minibatch_size"],
        **params["run_details"],
    )
    if feature_importance:
        calculate_feature_importance(env, trainer, **params["run_details"])
def run_gym(
    params: OpenAiGymParameters,
    score_bar,
    embed_rl_dataset: RLDataset,
    gym_env: Env,
    mdnrnn: MemoryNetwork,
    max_embed_seq_len: int,
):
    assert params.rl is not None
    rl_parameters = params.rl
    env_type = params.env
    model_type = params.model_type
    epsilon, epsilon_decay, minimum_epsilon = create_epsilon(
        offline_train=True, rl_parameters=rl_parameters, params=params
    )

    replay_buffer = OpenAIGymMemoryPool(params.max_replay_memory_size)
    for row in embed_rl_dataset.rows:
        replay_buffer.insert_into_memory(**row)

    assert replay_buffer.memory_buffer is not None
    state_mem = replay_buffer.memory_buffer.state
    state_min_value = torch.min(state_mem).item()
    state_max_value = torch.max(state_mem).item()
    state_embed_env = StateEmbedGymEnvironment(
        gym_env, mdnrnn, max_embed_seq_len, state_min_value, state_max_value
    )
    open_ai_env = OpenAIGymEnvironment(
        state_embed_env,
        epsilon,
        rl_parameters.softmax_policy,
        rl_parameters.gamma,
        epsilon_decay,
        minimum_epsilon,
    )
    rl_trainer = create_trainer(params, open_ai_env)
    rl_predictor = create_predictor(
        rl_trainer, model_type, params.use_gpu, open_ai_env.action_dim
    )

    assert (
        params.run_details.max_steps is not None
        and params.run_details.offline_train_epochs is not None
    ), "Missing data required for offline training: {}".format(str(params.run_details))
    return train_gym_offline_rl(
        gym_env=open_ai_env,
        replay_buffer=replay_buffer,
        model_type=model_type,
        trainer=rl_trainer,
        predictor=rl_predictor,
        test_run_name="{} offline rl state embed".format(env_type),
        score_bar=score_bar,
        max_steps=params.run_details.max_steps,
        avg_over_num_episodes=params.run_details.avg_over_num_episodes,
        offline_train_epochs=params.run_details.offline_train_epochs,
        num_batch_per_epoch=None,
    )
def run_gym(
    params: OpenAiGymParameters,
    offline_train,
    score_bar,
    seed=None,
    save_timesteps_to_dataset=None,
    start_saving_from_score=None,
    path_to_pickled_transitions=None,
    warm_trainer=None,
    reward_shape_func=None,
):
    use_gpu = params.use_gpu
    logger.info("Running gym with params")
    logger.info(params)
    assert params.rl is not None
    rl_parameters = params.rl

    env_type = params.env
    model_type = params.model_type
    epsilon, epsilon_decay, minimum_epsilon = create_epsilon(
        offline_train, rl_parameters, params
    )
    env = OpenAIGymEnvironment(
        env_type,
        epsilon,
        rl_parameters.softmax_policy,
        rl_parameters.gamma,
        epsilon_decay,
        minimum_epsilon,
        seed,
    )
    replay_buffer = create_replay_buffer(
        env, params, model_type, offline_train, path_to_pickled_transitions
    )

    trainer = warm_trainer if warm_trainer else create_trainer(params, env)
    predictor = create_predictor(trainer, model_type, use_gpu, env.action_dim)

    c2_device = core.DeviceOption(caffe2_pb2.CUDA if use_gpu else caffe2_pb2.CPU)
    return train(
        c2_device,
        env,
        offline_train,
        replay_buffer,
        model_type,
        trainer,
        predictor,
        "{} test run".format(env_type),
        score_bar,
        params.run_details,
        save_timesteps_to_dataset=save_timesteps_to_dataset,
        start_saving_from_score=start_saving_from_score,
        reward_shape_func=reward_shape_func,
    )
def run_gym(
    params,
    use_gpu,
    score_bar,
    embed_rl_dataset: RLDataset,
    gym_env: Env,
    mdnrnn: MemoryNetwork,
    max_embed_seq_len: int,
):
    rl_parameters = RLParameters(**params["rl"])
    env_type = params["env"]
    model_type = params["model_type"]
    epsilon, epsilon_decay, minimum_epsilon = create_epsilon(
        offline_train=True, rl_parameters=rl_parameters, params=params
    )

    replay_buffer = OpenAIGymMemoryPool(params["max_replay_memory_size"])
    for row in embed_rl_dataset.rows:
        replay_buffer.insert_into_memory(**row)

    state_mem = torch.cat([m[0] for m in replay_buffer.replay_memory])
    state_min_value = torch.min(state_mem).item()
    state_max_value = torch.max(state_mem).item()
    state_embed_env = StateEmbedGymEnvironment(
        gym_env, mdnrnn, max_embed_seq_len, state_min_value, state_max_value
    )
    open_ai_env = OpenAIGymEnvironment(
        state_embed_env,
        epsilon,
        rl_parameters.softmax_policy,
        rl_parameters.gamma,
        epsilon_decay,
        minimum_epsilon,
    )
    rl_trainer = create_trainer(
        params["model_type"], params, rl_parameters, use_gpu, open_ai_env
    )
    rl_predictor = create_predictor(
        rl_trainer, model_type, use_gpu, open_ai_env.action_dim
    )
    return train_gym_offline_rl(
        open_ai_env,
        replay_buffer,
        model_type,
        rl_trainer,
        rl_predictor,
        "{} offline rl state embed".format(env_type),
        score_bar,
        max_steps=params["run_details"]["max_steps"],
        avg_over_num_episodes=params["run_details"]["avg_over_num_episodes"],
        offline_train_epochs=params["run_details"]["offline_train_epochs"],
        bcq_imitator_hyper_params=None,
    )
def run_gym(
    params,
    offline_train,
    score_bar,
    gpu_id,
    seed=None,
    save_timesteps_to_dataset=None,
    start_saving_from_score=None,
    path_to_pickled_transitions=None,
):
    logger.info("Running gym with params")
    logger.info(params)
    rl_parameters = RLParameters(**params["rl"])

    env_type = params["env"]
    model_type = params["model_type"]
    epsilon, epsilon_decay, minimum_epsilon = create_epsilon(
        offline_train, rl_parameters, params
    )
    env = OpenAIGymEnvironment(
        env_type,
        epsilon,
        rl_parameters.softmax_policy,
        rl_parameters.gamma,
        epsilon_decay,
        minimum_epsilon,
        seed,
    )
    replay_buffer = create_replay_buffer(
        env, params, model_type, offline_train, path_to_pickled_transitions
    )

    use_gpu = gpu_id != USE_CPU
    trainer = create_trainer(params["model_type"], params, rl_parameters, use_gpu, env)
    predictor = create_predictor(trainer, model_type, use_gpu, env.action_dim)
    c2_device = core.DeviceOption(
        caffe2_pb2.CUDA if use_gpu else caffe2_pb2.CPU, int(gpu_id)
    )
    return train(
        c2_device,
        env,
        offline_train,
        replay_buffer,
        model_type,
        trainer,
        predictor,
        "{} test run".format(env_type),
        score_bar,
        **params["run_details"],
        save_timesteps_to_dataset=save_timesteps_to_dataset,
        start_saving_from_score=start_saving_from_score,
    )
def mdnrnn_gym(
    params: OpenAiGymParameters,
    feature_importance: bool = False,
    feature_sensitivity: bool = False,
    save_embedding_to_path: Optional[str] = None,
    seed: Optional[int] = None,
):
    assert params.mdnrnn is not None
    use_gpu = params.use_gpu
    logger.info("Running gym with params")
    logger.info(params)

    env_type = params.env
    env = OpenAIGymEnvironment(
        env_type, epsilon=1.0, softmax_policy=False, gamma=0.99, random_seed=seed
    )

    # create test data once
    assert params.run_details.max_steps is not None
    test_replay_buffer = get_replay_buffer(
        params.run_details.num_test_episodes,
        params.run_details.seq_len,
        params.run_details.max_steps,
        env,
    )
    test_batch = test_replay_buffer.sample_memories(
        test_replay_buffer.memory_size, use_gpu=use_gpu, batch_first=True
    )

    trainer = create_trainer(params, env, use_gpu)
    _, _, trainer = train_sgd(
        env,
        trainer,
        use_gpu,
        "{} test run".format(env_type),
        params.mdnrnn.minibatch_size,
        params.run_details,
        test_batch=test_batch,
    )
    feature_importance_map, feature_sensitivity_map, dataset = None, None, None
    if feature_importance:
        feature_importance_map = calculate_feature_importance(
            env, trainer, use_gpu, params.run_details, test_batch=test_batch
        )
    if feature_sensitivity:
        feature_sensitivity_map = calculate_feature_sensitivity_by_actions(
            env, trainer, use_gpu, params.run_details, test_batch=test_batch
        )
    if save_embedding_to_path:
        dataset = RLDataset(save_embedding_to_path)
        create_embed_rl_dataset(env, trainer, dataset, use_gpu, params.run_details)
        dataset.save()
    return env, trainer, feature_importance_map, feature_sensitivity_map, dataset
def run_parametric_dqn_cartpole(config):
    trainer = build_trainer(config)
    num_episodes = PARAMETRIC_DQN_CARTPOLE_NUM_EPISODES

    env = gym.make(config.env)
    wrapped_env = OpenAIGymEnvironment(config.env)
    action_shape = np.array(wrapped_env.actions).shape
    action_type = np.float32
    replay_buffer = ReplayBuffer(
        observation_shape=env.reset().shape,
        stack_size=1,
        replay_capacity=config.max_replay_memory_size,
        batch_size=trainer.minibatch_size,
        observation_dtype=np.float32,
        action_shape=action_shape,
        action_dtype=action_type,
        reward_shape=(),
        reward_dtype=np.float32,
        extra_storage_types=[
            ReplayElement("possible_actions_mask", action_shape, action_type),
            ReplayElement("log_prob", (), np.float32),
        ],
    )
    actions = wrapped_env.actions
    normalization = wrapped_env.normalization
    policy = Policy(
        scorer=parametric_dqn_scorer(len(actions), trainer.q_network),
        sampler=SoftmaxActionSampler(),
        policy_preprocessor=tiled_numpy_policy_preprocessor(len(actions)),
    )
    agent = Agent(
        policy=policy,
        action_preprocessor=discrete_action_preprocessor,
        replay_buffer=replay_buffer,
        replay_buffer_add_fn=replay_buffer_add_fn,
        replay_buffer_train_fn=replay_buffer_train_fn(
            trainer=trainer,
            trainer_preprocessor=parametric_dqn_trainer_preprocessor(
                len(actions), normalization
            ),
            training_freq=config.run_details.train_every_ts,
            batch_size=trainer.minibatch_size,
            replay_burnin=config.run_details.train_after_ts,
        ),
    )

    reward_history = run(
        env=env,
        agent=agent,
        num_episodes=num_episodes,
        max_steps=config.run_details.max_steps,
    )
    return reward_history
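# A hedged usage sketch (not from the source) of feeding one transition into the
# buffer constructed above. It assumes the Dopamine-style positional signature
# add(observation, action, reward, terminal, *extras), with the extras supplied
# in the declared extra_storage_types order ("possible_actions_mask", then
# "log_prob"), and reuses `env`, `replay_buffer`, `action_shape`, and
# `action_type` from the preceding snippet. All concrete values are illustrative.
def add_one_transition_example(env, replay_buffer, action_shape, action_type):
    obs = env.reset()
    action = np.zeros(action_shape, dtype=np.float32)  # hypothetical action vector
    replay_buffer.add(
        obs,  # observation
        action,  # action taken
        0.0,  # reward
        0,  # terminal flag (0 = episode continues)
        np.ones(action_shape, dtype=action_type),  # possible_actions_mask extra
        np.float32(0.0),  # log_prob extra
    )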
def run_gym(
    params,
    score_bar,
    gpu_id,
    save_timesteps_to_dataset=None,
    start_saving_from_episode=0,
):
    # Caffe2 core uses the min of caffe2_log_level and minloglevel
    # to determine loglevel. See caffe2/caffe2/core/logging.cc for more info.
    core.GlobalInit(["caffe2", "--caffe2_log_level=2", "--minloglevel=2"])

    logger.info("Running gym with params")
    logger.info(params)
    rl_parameters = RLParameters(**params["rl"])

    env_type = params["env"]
    env = OpenAIGymEnvironment(
        env_type,
        rl_parameters.epsilon,
        rl_parameters.softmax_policy,
        rl_parameters.gamma,
    )
    replay_buffer = OpenAIGymMemoryPool(params["max_replay_memory_size"])
    model_type = params["model_type"]
    use_gpu = gpu_id != USE_CPU

    trainer = create_trainer(params["model_type"], params, rl_parameters, use_gpu, env)
    predictor = create_predictor(trainer, model_type, use_gpu)
    c2_device = core.DeviceOption(
        caffe2_pb2.CUDA if use_gpu else caffe2_pb2.CPU, gpu_id
    )
    return train_sgd(
        c2_device,
        env,
        replay_buffer,
        model_type,
        trainer,
        predictor,
        "{} test run".format(env_type),
        score_bar,
        **params["run_details"],
        save_timesteps_to_dataset=save_timesteps_to_dataset,
        start_saving_from_episode=start_saving_from_episode,
    )
def run_gym(
    params,
    offline_train,
    score_bar,
    gpu_id,
    save_timesteps_to_dataset=None,
    start_saving_from_episode=0,
):
    logger.info("Running gym with params")
    logger.info(params)
    rl_parameters = RLParameters(**params["rl"])

    env_type = params["env"]
    if offline_train:
        # take random actions during data collection
        epsilon = 1.0
    else:
        epsilon = rl_parameters.epsilon
    env = OpenAIGymEnvironment(
        env_type, epsilon, rl_parameters.softmax_policy, rl_parameters.gamma
    )
    replay_buffer = OpenAIGymMemoryPool(params["max_replay_memory_size"])
    model_type = params["model_type"]
    use_gpu = gpu_id != USE_CPU

    trainer = create_trainer(params["model_type"], params, rl_parameters, use_gpu, env)
    predictor = create_predictor(trainer, model_type, use_gpu)
    c2_device = core.DeviceOption(
        caffe2_pb2.CUDA if use_gpu else caffe2_pb2.CPU, int(gpu_id)
    )
    return train_sgd(
        c2_device,
        env,
        offline_train,
        replay_buffer,
        model_type,
        trainer,
        predictor,
        "{} test run".format(env_type),
        score_bar,
        **params["run_details"],
        save_timesteps_to_dataset=save_timesteps_to_dataset,
        start_saving_from_episode=start_saving_from_episode,
    )
def main(args):
    parser = argparse.ArgumentParser(
        description="Train a RL net to play in an OpenAI Gym environment."
    )
    parser.add_argument("-p", "--parameters", help="Path to JSON parameters file.")
    parser.add_argument(
        "-s",
        "--score-bar",
        help="Bar for averaged tests scores.",
        type=float,
        default=None,
    )
    parser.add_argument(
        "-g",
        "--gpu_id",
        help="If set, will use GPU with specified ID. Otherwise will use CPU.",
        default=USE_CPU,
    )
    args = parser.parse_args(args)

    with open(args.parameters, 'r') as f:
        params = json.load(f)

    rl_settings = params['rl']
    training_settings = params['training']
    rl_settings['gamma'] = rl_settings['reward_discount_factor']
    del rl_settings['reward_discount_factor']
    training_settings['gamma'] = training_settings['learning_rate_decay']
    del training_settings['learning_rate_decay']

    env_type = params['env']
    env = OpenAIGymEnvironment(env_type, rl_settings['epsilon'])
    trainer_params = DiscreteActionModelParameters(
        actions=env.actions,
        rl=RLParameters(**rl_settings),
        training=TrainingParameters(**training_settings),
    )

    device = core.DeviceOption(
        caffe2_pb2.CPU if args.gpu_id == USE_CPU else caffe2_pb2.CUDA, args.gpu_id
    )
    with core.DeviceScope(device):
        trainer = DiscreteActionTrainer(
            env.normalization, trainer_params, skip_normalization=True
        )
    return run(
        env,
        trainer,
        "{} test run".format(env_type),
        args.score_bar,
        **params["run_details"],
    )
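# A hypothetical JSON parameters file for the entry point above. The top-level
# keys ('env', 'rl', 'training', 'run_details') and the renamed fields
# ('reward_discount_factor', 'learning_rate_decay') are the ones the script
# actually reads; the concrete values are illustrative assumptions only.
#
# {
#   "env": "CartPole-v0",
#   "rl": {"epsilon": 0.2, "reward_discount_factor": 0.99, ...},
#   "training": {"learning_rate_decay": 0.999, ...},
#   "run_details": {...}
# }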
def mdnrnn_gym(
    params: OpenAiGymParameters,
    feature_importance: bool = False,
    feature_sensitivity: bool = False,
    save_embedding_to_path: Optional[str] = None,
):
    assert params.mdnrnn is not None
    use_gpu = params.use_gpu
    logger.info("Running gym with params")
    logger.info(params)

    env_type = params.env
    env = OpenAIGymEnvironment(env_type, epsilon=1.0, softmax_policy=True, gamma=0.99)

    trainer = create_trainer(params, env, use_gpu)
    _, _, trainer = train_sgd(
        env,
        trainer,
        use_gpu,
        "{} test run".format(env_type),
        params.mdnrnn.minibatch_size,
        params.run_details,
    )
    feature_importance_map, feature_sensitivity_map, dataset = None, None, None
    if feature_importance:
        feature_importance_map = calculate_feature_importance(
            env, trainer, use_gpu, params.run_details
        )
    if feature_sensitivity:
        feature_sensitivity_map = calculate_feature_sensitivity_by_actions(
            env, trainer, use_gpu, params.run_details
        )
    if save_embedding_to_path:
        dataset = RLDataset(save_embedding_to_path)
        create_embed_rl_dataset(env, trainer, dataset, use_gpu, params.run_details)
        dataset.save()
    return env, trainer, feature_importance_map, feature_sensitivity_map, dataset
def run_gym(params, score_bar, gpu_id):
    rl_settings = params['rl']
    training_settings = params['training']
    rl_settings['gamma'] = rl_settings['reward_discount_factor']
    del rl_settings['reward_discount_factor']
    training_settings['gamma'] = training_settings['learning_rate_decay']
    del training_settings['learning_rate_decay']

    env_type = params['env']
    env = OpenAIGymEnvironment(env_type, rl_settings['epsilon'])
    trainer_params = DiscreteActionModelParameters(
        actions=env.actions,
        rl=RLParameters(**rl_settings),
        training=TrainingParameters(**training_settings),
    )

    device = core.DeviceOption(
        caffe2_pb2.CPU if gpu_id == USE_CPU else caffe2_pb2.CUDA, gpu_id
    )
    with core.DeviceScope(device):
        if env.img:
            trainer = DiscreteActionConvTrainer(
                DiscreteActionConvModelParameters(
                    fc_parameters=trainer_params,
                    cnn_parameters=CNNModelParameters(**params['cnn']),
                    num_input_channels=env.num_input_channels,
                    img_height=env.height,
                    img_width=env.width,
                ),
                env.normalization,
            )
        else:
            trainer = DiscreteActionTrainer(trainer_params, env.normalization)
    return run(
        env,
        trainer,
        "{} test run".format(env_type),
        score_bar,
        **params["run_details"],
    )
def run_gym(params, score_bar, gpu_id, save_timesteps_to_dataset=None):
    logger.info("Running gym with params")
    logger.info(params)
    rl_parameters = RLParameters(**params["rl"])

    env_type = params["env"]
    env = OpenAIGymEnvironment(
        env_type,
        rl_parameters.epsilon,
        rl_parameters.softmax_policy,
        params["max_replay_memory_size"],
    )
    model_type = params["model_type"]
    c2_device = core.DeviceOption(
        caffe2_pb2.CPU if gpu_id == USE_CPU else caffe2_pb2.CUDA, gpu_id
    )

    if model_type == ModelType.DISCRETE_ACTION.value:
        with core.DeviceScope(c2_device):
            training_settings = params["training"]
            training_parameters = TrainingParameters(**training_settings)
            if env.img:
                assert (
                    training_parameters.cnn_parameters is not None
                ), "Missing CNN parameters for image input"
                training_parameters.cnn_parameters = CNNParameters(
                    **training_settings["cnn_parameters"]
                )
                training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
                training_parameters.cnn_parameters.input_height = env.height
                training_parameters.cnn_parameters.input_width = env.width
                training_parameters.cnn_parameters.num_input_channels = (
                    env.num_input_channels
                )
            else:
                assert (
                    training_parameters.cnn_parameters is None
                ), "Extra CNN parameters for non-image input"
            trainer_params = DiscreteActionModelParameters(
                actions=env.actions, rl=rl_parameters, training=training_parameters
            )
            trainer = DiscreteActionTrainer(trainer_params, env.normalization)
    elif model_type == ModelType.PARAMETRIC_ACTION.value:
        with core.DeviceScope(c2_device):
            training_settings = params["training"]
            training_parameters = TrainingParameters(**training_settings)
            if env.img:
                assert (
                    training_parameters.cnn_parameters is not None
                ), "Missing CNN parameters for image input"
                training_parameters.cnn_parameters = CNNParameters(
                    **training_settings["cnn_parameters"]
                )
                training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
            else:
                assert (
                    training_parameters.cnn_parameters is None
                ), "Extra CNN parameters for non-image input"
            trainer_params = ContinuousActionModelParameters(
                rl=rl_parameters,
                training=training_parameters,
                knn=KnnParameters(model_type="DQN"),
            )
            trainer = ContinuousActionDQNTrainer(
                trainer_params, env.normalization, env.normalization_action
            )
    elif model_type == ModelType.CONTINUOUS_ACTION.value:
        training_settings = params["shared_training"]
        actor_settings = params["actor_training"]
        critic_settings = params["critic_training"]
        trainer_params = DDPGModelParameters(
            rl=rl_parameters,
            shared_training=DDPGTrainingParameters(**training_settings),
            actor_training=DDPGNetworkParameters(**actor_settings),
            critic_training=DDPGNetworkParameters(**critic_settings),
        )
        # DDPG can handle continuous and discrete action spaces
        if env.action_type == EnvType.CONTINUOUS_ACTION:
            action_range = env.action_space.high
        else:
            action_range = None
        trainer = DDPGTrainer(
            trainer_params,
            env.normalization,
            env.normalization_action,
            use_gpu=False,
            action_range=action_range,
        )
    else:
        raise NotImplementedError("Model of type {} not supported".format(model_type))

    return run(
        c2_device,
        env,
        model_type,
        trainer,
        "{} test run".format(env_type),
        score_bar,
        **params["run_details"],
        save_timesteps_to_dataset=save_timesteps_to_dataset,
    )
def train_gym_offline_rl(
    gym_env: OpenAIGymEnvironment,
    replay_buffer: OpenAIGymMemoryPool,
    model_type: str,
    trainer: RLTrainer,
    predictor: OnPolicyPredictor,
    test_run_name: str,
    score_bar: Optional[float],
    max_steps: int,
    avg_over_num_episodes: int,
    offline_train_epochs: int,
    num_batch_per_epoch: Optional[int],
    bcq_imitator_hyper_params: Optional[Dict[str, Any]] = None,
):
    if num_batch_per_epoch is None:
        num_batch_per_epoch = replay_buffer.size // trainer.minibatch_size
    assert num_batch_per_epoch > 0, "The size of replay buffer is not sufficient"

    logger.info(
        "{} offline transitions in replay buffer.\n"
        "Training will take {} epochs, with each epoch having {} mini-batches"
        " and each mini-batch having {} samples".format(
            replay_buffer.size,
            offline_train_epochs,
            num_batch_per_epoch,
            trainer.minibatch_size,
        )
    )

    avg_reward_history, epoch_history = [], []

    # Pre-train a GBDT imitator if doing batch constrained q-learning in Gym
    if getattr(trainer, "bcq", None):
        assert bcq_imitator_hyper_params is not None
        gbdt = GradientBoostingClassifier(
            n_estimators=bcq_imitator_hyper_params["gbdt_trees"],
            max_depth=bcq_imitator_hyper_params["max_depth"],
        )
        samples = replay_buffer.sample_memories(replay_buffer.size, model_type)
        X, y = samples.states.numpy(), torch.max(samples.actions, dim=1)[1].numpy()
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
        logger.info("Fitting GBDT...")
        gbdt.fit(X_train, y_train)
        train_score = round(gbdt.score(X_train, y_train) * 100, 1)
        test_score = round(gbdt.score(X_test, y_test) * 100, 1)
        logger.info(
            "GBDT train accuracy {}% || test accuracy {}%".format(
                train_score, test_score
            )
        )
        trainer.bcq_imitator = gbdt.predict_proba  # type: ignore

    # Offline training
    for i_epoch in range(offline_train_epochs):
        for _ in range(num_batch_per_epoch):
            samples = replay_buffer.sample_memories(trainer.minibatch_size, model_type)
            samples.set_device(trainer.device)
            trainer.train(samples)

        batch_td_loss = float(
            torch.mean(
                torch.tensor(
                    [stat.td_loss for stat in trainer.loss_reporter.incoming_stats]
                )
            )
        )
        trainer.loss_reporter.flush()
        logger.info(
            "Average TD loss: {} in epoch {}".format(batch_td_loss, i_epoch + 1)
        )

        # test model performance for this epoch
        avg_rewards, avg_discounted_rewards = gym_env.run_ep_n_times(
            avg_over_num_episodes, predictor, test=True, max_steps=max_steps
        )
        avg_reward_history.append(avg_rewards)
        # For offline training, use epoch number as timestep history since
        # we have a fixed batch of data to count epochs over.
        epoch_history.append(i_epoch)
        logger.info(
            "Achieved an average reward score of {} over {} evaluations"
            " after epoch {}.".format(avg_rewards, avg_over_num_episodes, i_epoch)
        )
        if score_bar is not None and avg_rewards > score_bar:
            logger.info(
                "Avg. reward history for {}: {}".format(
                    test_run_name, avg_reward_history
                )
            )
            return avg_reward_history, epoch_history, trainer, predictor, gym_env

    logger.info(
        "Avg. reward history for {}: {}".format(test_run_name, avg_reward_history)
    )
    return avg_reward_history, epoch_history, trainer, predictor, gym_env
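# A minimal, hypothetical invocation sketch for train_gym_offline_rl above. The
# env, buffer, trainer, and predictor are assumed to come from snippets like the
# run_gym variants in this collection; the numeric values are illustrative
# assumptions, not values taken from the source. The two hyperparameter keys
# mirror the ones the function actually reads: "gbdt_trees" and "max_depth".
def offline_rl_invocation_example(env, replay_buffer, model_type, trainer, predictor):
    return train_gym_offline_rl(
        gym_env=env,
        replay_buffer=replay_buffer,  # pre-filled OpenAIGymMemoryPool
        model_type=model_type,
        trainer=trainer,
        predictor=predictor,
        test_run_name="CartPole-v0 offline run",  # hypothetical run name
        score_bar=195.0,  # hypothetical pass threshold
        max_steps=200,
        avg_over_num_episodes=100,
        offline_train_epochs=30,
        num_batch_per_epoch=None,  # derived as replay_buffer.size // minibatch_size
        bcq_imitator_hyper_params={"gbdt_trees": 50, "max_depth": 3},
    )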
def build_trainer(config):
    return create_trainer(config, OpenAIGymEnvironment(config.env))
def test_open_ai_gym_generate_samples_multi_step(self):
    env = OpenAIGymEnvironment(
        "CartPole-v0",
        epsilon=1.0,  # take random actions to collect training data
        softmax_policy=False,
        gamma=0.9,
    )
    num_samples = 1000
    num_steps = 5
    samples = env.generate_random_samples(
        num_samples, use_continuous_action=True, epsilon=1.0, multi_steps=num_steps
    )
    for i in range(num_samples):
        if samples.terminals[i][0]:
            break
        if i < num_samples - 1:
            self.assertEqual(samples.mdp_ids[i], samples.mdp_ids[i + 1])
            self.assertEqual(
                samples.sequence_numbers[i] + 1, samples.sequence_numbers[i + 1]
            )
        for j in range(len(samples.terminals[i])):
            self.assertEqual(samples.rewards[i][j], samples.rewards[i + j][0])
            self.assertDictEqual(
                samples.next_states[i][j], samples.next_states[i + j][0]
            )
            self.assertDictEqual(
                samples.next_actions[i][j], samples.next_actions[i + j][0]
            )
            self.assertEqual(samples.terminals[i][j], samples.terminals[i + j][0])
            self.assertListEqual(
                samples.possible_next_actions[i][j],
                samples.possible_next_actions[i + j][0],
            )
            if samples.terminals[i][j]:
                continue
            self.assertDictEqual(samples.next_states[i][j], samples.states[i + j + 1])
            self.assertDictEqual(
                samples.next_actions[i][j], samples.actions[i + j + 1]
            )
            self.assertListEqual(
                samples.possible_next_actions[i][j],
                samples.possible_actions[i + j + 1],
            )

    single_step_samples = samples.to_single_step()
    for i in range(num_samples):
        if single_step_samples.terminals[i] is True:
            break
        self.assertEqual(single_step_samples.mdp_ids[i], samples.mdp_ids[i])
        self.assertEqual(
            single_step_samples.sequence_numbers[i], samples.sequence_numbers[i]
        )
        self.assertDictEqual(single_step_samples.states[i], samples.states[i])
        self.assertDictEqual(single_step_samples.actions[i], samples.actions[i])
        self.assertEqual(
            single_step_samples.action_probabilities[i],
            samples.action_probabilities[i],
        )
        self.assertEqual(single_step_samples.rewards[i], samples.rewards[i][0])
        self.assertListEqual(
            single_step_samples.possible_actions[i], samples.possible_actions[i]
        )
        self.assertDictEqual(
            single_step_samples.next_states[i], samples.next_states[i][0]
        )
        self.assertDictEqual(
            single_step_samples.next_actions[i], samples.next_actions[i][0]
        )
        self.assertEqual(single_step_samples.terminals[i], samples.terminals[i][0])
        self.assertListEqual(
            single_step_samples.possible_next_actions[i],
            samples.possible_next_actions[i][0],
        )
def multi_step_sample_generator(
    gym_env: OpenAIGymEnvironment,
    num_transitions: int,
    max_steps: Optional[int],
    multi_steps: int,
    include_shorter_samples_at_start: bool,
    include_shorter_samples_at_end: bool,
):
    """
    Convert gym env multi-step sample format to mdn-rnn multi-step sample format.

    :param gym_env: the environment used to generate multi-step samples
    :param num_transitions: number of samples to return
    :param max_steps: an episode terminates when the horizon is beyond max_steps
    :param multi_steps: number of steps of states and actions per sample
    :param include_shorter_samples_at_start: whether to keep samples with fewer
        steps that are generated at the beginning of an episode
    :param include_shorter_samples_at_end: whether to keep samples with fewer
        steps that are generated at the end of an episode
    """
    samples = gym_env.generate_random_samples(
        num_transitions=num_transitions,
        use_continuous_action=True,
        max_step=max_steps,
        multi_steps=multi_steps,
        include_shorter_samples_at_start=include_shorter_samples_at_start,
        include_shorter_samples_at_end=include_shorter_samples_at_end,
    )

    for j in range(num_transitions):
        sample_steps = len(samples.terminals[j])  # type: ignore
        state = dict_to_np(samples.states[j], np_size=gym_env.state_dim, key_offset=0)
        action = dict_to_np(
            samples.actions[j],
            np_size=gym_env.action_dim,
            key_offset=gym_env.state_dim,
        )
        next_actions = np.float32(  # type: ignore
            [
                dict_to_np(
                    samples.next_actions[j][k],
                    np_size=gym_env.action_dim,
                    key_offset=gym_env.state_dim,
                )
                for k in range(sample_steps)
            ]
        )
        next_states = np.float32(  # type: ignore
            [
                dict_to_np(
                    samples.next_states[j][k], np_size=gym_env.state_dim, key_offset=0
                )
                for k in range(sample_steps)
            ]
        )
        rewards = np.float32(samples.rewards[j])  # type: ignore
        terminals = np.float32(samples.terminals[j])  # type: ignore
        not_terminals = np.logical_not(terminals)
        ordered_states = np.vstack((state, next_states))
        ordered_actions = np.vstack((action, next_actions))
        mdnrnn_states = ordered_states[:-1]
        mdnrnn_actions = ordered_actions[:-1]
        mdnrnn_next_states = ordered_states[-multi_steps:]
        mdnrnn_next_actions = ordered_actions[-multi_steps:]

        # Pad zeros so that all samples have an equal number of steps.
        # The general rule is to pad zeros at the end of sequences.
        # In addition, if the sequence only has one step (i.e., the
        # first state of an episode), pad one zero row ahead of the
        # sequence, which enables embeddings to be generated properly
        # for one-step samples.
        num_padded_top_rows = 1 if multi_steps > 1 and sample_steps == 1 else 0
        num_padded_bottom_rows = multi_steps - sample_steps - num_padded_top_rows
        sample_steps_next = len(mdnrnn_next_states)
        num_padded_top_rows_next = 0
        num_padded_bottom_rows_next = multi_steps - sample_steps_next

        yield (
            np.pad(
                mdnrnn_states,
                ((num_padded_top_rows, num_padded_bottom_rows), (0, 0)),
                "constant",
                constant_values=0.0,
            ),
            np.pad(
                mdnrnn_actions,
                ((num_padded_top_rows, num_padded_bottom_rows), (0, 0)),
                "constant",
                constant_values=0.0,
            ),
            np.pad(
                rewards,
                (num_padded_top_rows, num_padded_bottom_rows),
                "constant",
                constant_values=0.0,
            ),
            np.pad(
                mdnrnn_next_states,
                ((num_padded_top_rows_next, num_padded_bottom_rows_next), (0, 0)),
                "constant",
                constant_values=0.0,
            ),
            np.pad(
                mdnrnn_next_actions,
                ((num_padded_top_rows_next, num_padded_bottom_rows_next), (0, 0)),
                "constant",
                constant_values=0.0,
            ),
            np.pad(
                not_terminals,
                (num_padded_top_rows, num_padded_bottom_rows),
                "constant",
                constant_values=0.0,
            ),
            sample_steps,
            sample_steps_next,
        )
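# A hedged consumption sketch (not from the source): collecting the generator's
# per-sample tuples into batched arrays. The field order follows the yield
# statement above; `env` is assumed to be an OpenAIGymEnvironment as in the
# other snippets, and the batch size and step counts are illustrative.
import numpy as np

def stack_mdnrnn_samples(env):
    outputs = list(
        multi_step_sample_generator(
            gym_env=env,
            num_transitions=16,
            max_steps=200,
            multi_steps=5,
            include_shorter_samples_at_start=False,
            include_shorter_samples_at_end=True,
        )
    )
    # Thanks to the zero padding, every sample has multi_steps rows, so the
    # first six fields stack into (batch, multi_steps, ...) arrays.
    states, actions, rewards, next_states, next_actions, not_terminals = (
        np.stack([out[i] for out in outputs]) for i in range(6)
    )
    return states, actions, rewards, next_states, next_actions, not_terminals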
def run_gym(
    params,
    score_bar,
    gpu_id,
    save_timesteps_to_dataset=None,
    start_saving_from_episode=0,
    batch_rl_file_path=None,
):
    # Caffe2 core uses the min of caffe2_log_level and minloglevel
    # to determine loglevel. See caffe2/caffe2/core/logging.cc for more info.
    core.GlobalInit(["caffe2", "--caffe2_log_level=2", "--minloglevel=2"])

    logger.info("Running gym with params")
    logger.info(params)
    rl_parameters = RLParameters(**params["rl"])

    env_type = params["env"]
    env = OpenAIGymEnvironment(
        env_type,
        rl_parameters.epsilon,
        rl_parameters.softmax_policy,
        params["max_replay_memory_size"],
        rl_parameters.gamma,
    )
    model_type = params["model_type"]
    c2_device = core.DeviceOption(
        caffe2_pb2.CPU if gpu_id == USE_CPU else caffe2_pb2.CUDA, gpu_id
    )
    use_gpu = gpu_id != USE_CPU

    if model_type == ModelType.PYTORCH_DISCRETE_DQN.value:
        training_settings = params["training"]
        training_parameters = TrainingParameters(**training_settings)
        if env.img:
            assert (
                training_parameters.cnn_parameters is not None
            ), "Missing CNN parameters for image input"
            training_parameters.cnn_parameters = CNNParameters(
                **training_settings["cnn_parameters"]
            )
            training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
            training_parameters.cnn_parameters.input_height = env.height
            training_parameters.cnn_parameters.input_width = env.width
            training_parameters.cnn_parameters.num_input_channels = (
                env.num_input_channels
            )
        else:
            assert (
                training_parameters.cnn_parameters is None
            ), "Extra CNN parameters for non-image input"
        trainer_params = DiscreteActionModelParameters(
            actions=env.actions, rl=rl_parameters, training=training_parameters
        )
        trainer = DQNTrainer(trainer_params, env.normalization, use_gpu)
    elif model_type == ModelType.DISCRETE_ACTION.value:
        with core.DeviceScope(c2_device):
            training_settings = params["training"]
            training_parameters = TrainingParameters(**training_settings)
            if env.img:
                assert (
                    training_parameters.cnn_parameters is not None
                ), "Missing CNN parameters for image input"
                training_parameters.cnn_parameters = CNNParameters(
                    **training_settings["cnn_parameters"]
                )
                training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
                training_parameters.cnn_parameters.input_height = env.height
                training_parameters.cnn_parameters.input_width = env.width
                training_parameters.cnn_parameters.num_input_channels = (
                    env.num_input_channels
                )
            else:
                assert (
                    training_parameters.cnn_parameters is None
                ), "Extra CNN parameters for non-image input"
            trainer_params = DiscreteActionModelParameters(
                actions=env.actions, rl=rl_parameters, training=training_parameters
            )
            trainer = DiscreteActionTrainer(trainer_params, env.normalization)
    elif model_type == ModelType.PYTORCH_PARAMETRIC_DQN.value:
        training_settings = params["training"]
        training_parameters = TrainingParameters(**training_settings)
        if env.img:
            assert (
                training_parameters.cnn_parameters is not None
            ), "Missing CNN parameters for image input"
            training_parameters.cnn_parameters = CNNParameters(
                **training_settings["cnn_parameters"]
            )
            training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
        else:
            assert (
                training_parameters.cnn_parameters is None
            ), "Extra CNN parameters for non-image input"
        trainer_params = ContinuousActionModelParameters(
            rl=rl_parameters,
            training=training_parameters,
            knn=KnnParameters(model_type="DQN"),
        )
        trainer = ParametricDQNTrainer(
            trainer_params, env.normalization, env.normalization_action, use_gpu
        )
    elif model_type == ModelType.PARAMETRIC_ACTION.value:
        with core.DeviceScope(c2_device):
            training_settings = params["training"]
            training_parameters = TrainingParameters(**training_settings)
            if env.img:
                assert (
                    training_parameters.cnn_parameters is not None
                ), "Missing CNN parameters for image input"
                training_parameters.cnn_parameters = CNNParameters(
                    **training_settings["cnn_parameters"]
                )
                training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
            else:
                assert (
                    training_parameters.cnn_parameters is None
                ), "Extra CNN parameters for non-image input"
            trainer_params = ContinuousActionModelParameters(
                rl=rl_parameters,
                training=training_parameters,
                knn=KnnParameters(model_type="DQN"),
            )
            trainer = ContinuousActionDQNTrainer(
                trainer_params, env.normalization, env.normalization_action
            )
    elif model_type == ModelType.CONTINUOUS_ACTION.value:
        training_settings = params["shared_training"]
        actor_settings = params["actor_training"]
        critic_settings = params["critic_training"]
        trainer_params = DDPGModelParameters(
            rl=rl_parameters,
            shared_training=DDPGTrainingParameters(**training_settings),
            actor_training=DDPGNetworkParameters(**actor_settings),
            critic_training=DDPGNetworkParameters(**critic_settings),
        )
        action_range_low = env.action_space.low.astype(np.float32)
        action_range_high = env.action_space.high.astype(np.float32)
        trainer = DDPGTrainer(
            trainer_params,
            env.normalization,
            env.normalization_action,
            torch.from_numpy(action_range_low).unsqueeze(dim=0),
            torch.from_numpy(action_range_high).unsqueeze(dim=0),
            use_gpu,
        )
    else:
        raise NotImplementedError("Model of type {} not supported".format(model_type))

    return run(
        c2_device,
        env,
        model_type,
        trainer,
        "{} test run".format(env_type),
        score_bar,
        **params["run_details"],
        save_timesteps_to_dataset=save_timesteps_to_dataset,
        start_saving_from_episode=start_saving_from_episode,
        batch_rl_file_path=batch_rl_file_path,
    )
def run_gym(params, score_bar, gpu_id):
    rl_settings = params['rl']
    rl_settings['gamma'] = rl_settings['reward_discount_factor']
    del rl_settings['reward_discount_factor']

    env_type = params['env']
    env = OpenAIGymEnvironment(env_type, rl_settings['epsilon'])
    model_type = params['model_type']
    c2_device = core.DeviceOption(
        caffe2_pb2.CPU if gpu_id == USE_CPU else caffe2_pb2.CUDA, gpu_id
    )

    if model_type == ModelType.DISCRETE_ACTION.value:
        with core.DeviceScope(c2_device):
            training_settings = params['training']
            training_settings['gamma'] = training_settings['learning_rate_decay']
            del training_settings['learning_rate_decay']
            trainer_params = DiscreteActionModelParameters(
                actions=env.actions,
                rl=RLParameters(**rl_settings),
                training=TrainingParameters(**training_settings),
            )
            if env.img:
                trainer = DiscreteActionConvTrainer(
                    DiscreteActionConvModelParameters(
                        fc_parameters=trainer_params,
                        cnn_parameters=CNNModelParameters(**params['cnn']),
                        num_input_channels=env.num_input_channels,
                        img_height=env.height,
                        img_width=env.width,
                    ),
                    env.normalization,
                )
            else:
                trainer = DiscreteActionTrainer(trainer_params, env.normalization)
    elif model_type == ModelType.PARAMETRIC_ACTION.value:
        with core.DeviceScope(c2_device):
            training_settings = params['training']
            training_settings['gamma'] = training_settings['learning_rate_decay']
            del training_settings['learning_rate_decay']
            trainer_params = ContinuousActionModelParameters(
                rl=RLParameters(**rl_settings),
                training=TrainingParameters(**training_settings),
                knn=KnnParameters(model_type='DQN'),
            )
            trainer = ContinuousActionDQNTrainer(
                trainer_params, env.normalization, env.normalization_action
            )
    elif model_type == ModelType.CONTINUOUS_ACTION.value:
        training_settings = params['shared_training']
        training_settings['gamma'] = training_settings['learning_rate_decay']
        del training_settings['learning_rate_decay']
        actor_settings = params['actor_training']
        critic_settings = params['critic_training']
        trainer_params = DDPGModelParameters(
            rl=DDPGRLParameters(**rl_settings),
            shared_training=DDPGTrainingParameters(**training_settings),
            actor_training=DDPGNetworkParameters(**actor_settings),
            critic_training=DDPGNetworkParameters(**critic_settings),
        )
        trainer = DDPGTrainer(
            trainer_params,
            EnvDetails(
                state_dim=env.state_dim,
                action_dim=env.action_dim,
                action_range=(env.action_space.low, env.action_space.high),
            ),
        )
    else:
        raise NotImplementedError("Model of type {} not supported".format(model_type))

    return run(
        env,
        model_type,
        trainer,
        "{} test run".format(env_type),
        score_bar,
        **params["run_details"],
    )