def create_embed_rl_dataset(
    gym_env: OpenAIGymEnvironment,
    trainer: MDNRNNTrainer,
    dataset: RLDataset,
    use_gpu: bool,
    run_details: OpenAiRunDetails,
):
    """Build an RLDataset of MDN-RNN state embeddings.

    Each transition's state is embedded as the world model's LSTM hidden
    output (summarizing all steps up to the previous one) concatenated with
    the raw current gym state; the embedded transitions are inserted into
    `dataset`, which is returned.
    """
    assert run_details.max_steps is not None
    old_mdnrnn_mode = trainer.mdnrnn.mdnrnn.training
    trainer.mdnrnn.mdnrnn.eval()
    num_transitions = run_details.num_state_embed_episodes * run_details.max_steps
    device = torch.device("cuda") if use_gpu else torch.device("cpu")  # type: ignore

    (
        state_batch,
        action_batch,
        reward_batch,
        next_state_batch,
        next_action_batch,
        not_terminal_batch,
        step_batch,
        next_step_batch,
    ) = map(
        list,
        zip(
            *multi_step_sample_generator(
                gym_env=gym_env,
                num_transitions=num_transitions,
                max_steps=run_details.max_steps,
                # +1 because MDNRNN embeds the first seq_len steps and then
                # the embedded state will be concatenated with the last step
                multi_steps=run_details.seq_len + 1,
                include_shorter_samples_at_start=True,
                include_shorter_samples_at_end=False,
            )
        ),
    )

    def concat_batch(batch):
        return torch.cat(
            [
                torch.tensor(
                    np.expand_dims(x, axis=1), dtype=torch.float, device=device
                )
                for x in batch
            ],
            dim=1,
        )

    # shape: seq_len x batch_size x feature_dim
    mdnrnn_state = concat_batch(state_batch)
    next_mdnrnn_state = concat_batch(next_state_batch)
    mdnrnn_action = concat_batch(action_batch)
    next_mdnrnn_action = concat_batch(next_action_batch)

    mdnrnn_input = rlt.PreprocessedStateAction.from_tensors(
        state=mdnrnn_state, action=mdnrnn_action
    )
    next_mdnrnn_input = rlt.PreprocessedStateAction.from_tensors(
        state=next_mdnrnn_state, action=next_mdnrnn_action
    )

    # batch-compute state embedding
    mdnrnn_output = trainer.mdnrnn(mdnrnn_input)
    next_mdnrnn_output = trainer.mdnrnn(next_mdnrnn_input)

    for i in range(len(state_batch)):
        # Embed the state as the hidden layer's output
        # until the previous step + current state
        hidden_idx = 0 if step_batch[i] == 1 else step_batch[i] - 2  # type: ignore
        next_hidden_idx = next_step_batch[i] - 2  # type: ignore
        hidden_embed = (
            mdnrnn_output.all_steps_lstm_hidden[hidden_idx, i, :]
            .squeeze()
            .detach()
            .cpu()
        )
        state_embed = torch.cat(
            (hidden_embed, torch.tensor(state_batch[i][hidden_idx + 1]))  # type: ignore
        )
        next_hidden_embed = (
            next_mdnrnn_output.all_steps_lstm_hidden[next_hidden_idx, i, :]
            .squeeze()
            .detach()
            .cpu()
        )
        next_state_embed = torch.cat(
            (
                next_hidden_embed,
                torch.tensor(next_state_batch[i][next_hidden_idx + 1]),  # type: ignore
            )
        )
        logger.debug(
            "create_embed_rl_dataset:\nstate batch\n{}\naction batch\n{}\nlast "
            "action: {}, reward: {}\nstate embed {}\nnext state embed {}\n".format(
                state_batch[i][: hidden_idx + 1],  # type: ignore
                action_batch[i][: hidden_idx + 1],  # type: ignore
                action_batch[i][hidden_idx + 1],  # type: ignore
                reward_batch[i][hidden_idx + 1],  # type: ignore
                state_embed,
                next_state_embed,
            )
        )

        terminal = 1 - not_terminal_batch[i][hidden_idx + 1]  # type: ignore
        possible_actions, possible_actions_mask = get_possible_actions(
            gym_env, ModelType.PYTORCH_PARAMETRIC_DQN.value, False
        )
        possible_next_actions, possible_next_actions_mask = get_possible_actions(
            gym_env, ModelType.PYTORCH_PARAMETRIC_DQN.value, terminal
        )
        dataset.insert(
            state=state_embed,
            action=torch.tensor(action_batch[i][hidden_idx + 1]),  # type: ignore
            reward=reward_batch[i][hidden_idx + 1],  # type: ignore
            next_state=next_state_embed,
            next_action=torch.tensor(
                next_action_batch[i][next_hidden_idx + 1]  # type: ignore
            ),
            terminal=torch.tensor(terminal),
            possible_next_actions=possible_next_actions,
            possible_next_actions_mask=possible_next_actions_mask,
            time_diff=torch.tensor(1),
            possible_actions=possible_actions,
            possible_actions_mask=possible_actions_mask,
            policy_id=0,
        )

    logger.info(
        "Inserted {} transitions into the state embed dataset".format(len(state_batch))
    )
    trainer.mdnrnn.mdnrnn.train(old_mdnrnn_mode)
    return dataset
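

# Hedged usage sketch (illustrative only; nothing below is called by the flow
# above). It assumes the caller has already built the gym environment, a
# trained MDN-RNN trainer, the run details, and an empty RLDataset via the
# surrounding run_gym setup code; no Horizon APIs beyond those already named
# in this module are introduced, and this helper is a hypothetical wrapper.
def example_build_state_embed_dataset(
    gym_env: OpenAIGymEnvironment,
    mdnrnn_trainer: MDNRNNTrainer,
    run_details: OpenAiRunDetails,
    dataset: RLDataset,
) -> RLDataset:
    # Delegate to create_embed_rl_dataset; downstream trainers then see the
    # concatenated [LSTM hidden state, raw gym state] vector as each
    # transition's state.
    return create_embed_rl_dataset(
        gym_env=gym_env,
        trainer=mdnrnn_trainer,
        dataset=dataset,
        use_gpu=torch.cuda.is_available(),
        run_details=run_details,
    )
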
def custom_train_gym_online_rl(
    c2_device,
    gym_env,
    replay_buffer,
    model_type,
    trainer,
    predictor,
    test_run_name,
    score_bar,
    num_episodes,
    max_steps,
    train_every_ts,
    train_after_ts,
    test_every_ts,
    test_after_ts,
    num_train_batches,
    avg_over_num_episodes,
    render,
    save_timesteps_to_dataset,
    start_saving_from_score,
    solved_reward_threshold,
    max_episodes_to_run_after_solved,
    stop_training_after_solved,
    timesteps_total,
    checkpoint_after_ts,
    avg_over_num_steps,
):
    """Train off of a dynamic set of transitions generated on-policy."""
    ep_i = 0
    ts = 0
    policy_id = 0

    # logging
    average_reward_train, num_episodes_train = [], []
    average_reward_eval, num_episodes_eval = [], []
    timesteps_history = []
    reward_hist = list()

    while ep_i < num_episodes and ts < timesteps_total:
        terminal = False
        next_state = gym_env.transform_state(gym_env.env.reset())
        next_action, next_action_probability = gym_env.policy(
            predictor, next_state, False
        )
        reward_sum = 0
        ep_timesteps = 0

        if model_type == ModelType.CONTINUOUS_ACTION.value:
            trainer.noise.clear()

        while not terminal:
            state = next_state
            action = next_action
            action_probability = next_action_probability

            # Get possible actions
            possible_actions, _ = horizon_runner.get_possible_actions(
                gym_env, model_type, terminal
            )

            if render:
                gym_env.env.render()

            timeline_format_action, gym_action = horizon_runner._format_action_for_log_and_gym(
                action, gym_env.action_type, model_type
            )
            next_state, reward, terminal, _ = gym_env.env.step(gym_action)
            next_state = gym_env.transform_state(next_state)

            ep_timesteps += 1
            ts += 1
            next_action, next_action_probability = gym_env.policy(
                predictor, next_state, False
            )
            reward_sum += reward

            possible_actions, possible_actions_mask = horizon_runner.get_possible_actions(
                gym_env, model_type, False
            )

            # Get possible next actions
            (
                possible_next_actions,
                possible_next_actions_mask,
            ) = horizon_runner.get_possible_actions(gym_env, model_type, terminal)

            replay_buffer.insert_into_memory(
                np.float32(state),
                action,
                np.float32(reward),
                np.float32(next_state),
                next_action,
                terminal,
                possible_next_actions,
                possible_next_actions_mask,
                1,
                possible_actions,
                possible_actions_mask,
                policy_id,
            )

            if save_timesteps_to_dataset and (
                ts % checkpoint_after_ts == 0 or ts == timesteps_total
            ):
                save_timesteps_to_dataset.insert(
                    mdp_id=ep_i,
                    sequence_number=ep_timesteps - 1,
                    state=state,
                    action=action,
                    timeline_format_action=timeline_format_action,
                    action_probability=action_probability,
                    reward=reward,
                    next_state=next_state,
                    next_action=next_action,
                    terminal=terminal,
                    possible_next_actions=possible_next_actions,
                    possible_next_actions_mask=possible_next_actions_mask,
                    time_diff=1,
                    possible_actions=possible_actions,
                    possible_actions_mask=possible_actions_mask,
                    policy_id=policy_id,
                )

            # Training loop
            if (
                ts % train_every_ts == 0
                and ts > train_after_ts
                and len(replay_buffer.replay_memory) >= trainer.minibatch_size
            ):
                for _ in range(num_train_batches):
                    samples = replay_buffer.sample_memories(
                        trainer.minibatch_size, model_type
                    )
                    samples.set_type(trainer.dtype)
                    trainer.train(samples)
                    # Every time we train, the policy changes
                    policy_id += 1

            # Evaluation loop
            if ts % test_every_ts == 0 and ts > test_after_ts:
                avg_ep_count, avg_rewards = gym_env.run_n_steps(
                    avg_over_num_steps, predictor, test=True
                )

                # save Tensorboard statistics
                timesteps_history.append(ts)
                avg_train_reward = sum(reward_hist) / len(reward_hist)
                average_reward_train.append(avg_train_reward)
                num_episodes_train.append(len(reward_hist))
                average_reward_eval.append(avg_rewards)
                num_episodes_eval.append(avg_ep_count)

                logger.info(
                    "Achieved an average reward score of {} over {} evaluations."
                    " Total episodes: {}, total timesteps: {}.".format(
                        avg_rewards, avg_ep_count, ep_i + 1, ts
                    )
                )
                logger.info(
                    "Achieved an average reward score of {} during {} training episodes."
                    " Total episodes: {}, total timesteps: {}.".format(
                        avg_train_reward, len(reward_hist), ep_i + 1, ts
                    )
                )
                reward_hist.clear()

                if score_bar is not None and avg_rewards > score_bar:
                    logger.info(
                        "Avg. reward history during evaluation for {}: {}".format(
                            test_run_name, average_reward_eval
                        )
                    )
                    logger.info(
                        "Avg. reward history during training for {}: {}".format(
                            test_run_name, average_reward_train
                        )
                    )
                    return (
                        average_reward_train,
                        num_episodes_train,
                        average_reward_eval,
                        num_episodes_eval,
                        timesteps_history,
                        trainer,
                        predictor,
                        gym_env,
                    )

            if max_steps and ep_timesteps >= max_steps:
                break

        reward_hist.append(reward_sum)

        # Always eval on last episode if previous eval loop didn't return.
        if ep_i == num_episodes - 1:
            avg_ep_count, avg_rewards = gym_env.run_n_steps(
                avg_over_num_steps, predictor, test=True
            )

            # save Tensorboard statistics
            timesteps_history.append(ts)
            avg_train_reward = sum(reward_hist) / len(reward_hist)
            average_reward_train.append(avg_train_reward)
            num_episodes_train.append(len(reward_hist))
            average_reward_eval.append(avg_rewards)
            num_episodes_eval.append(avg_ep_count)

            logger.info(
                "Achieved an average reward score of {} over {} evaluations."
                " Total episodes: {}, total timesteps: {}.".format(
                    avg_rewards, avg_ep_count, ep_i + 1, ts
                )
            )
            logger.info(
                "Achieved an average reward score of {} during {} training episodes."
                " Total episodes: {}, total timesteps: {}.".format(
                    avg_train_reward, len(reward_hist), ep_i + 1, ts
                )
            )
            reward_hist.clear()

        gym_env.decay_epsilon()
        ep_i += 1

    logger.info(
        "Avg. reward history during evaluation for {}: {}".format(
            test_run_name, average_reward_eval
        )
    )
    logger.info(
        "Avg. reward history during training for {}: {}".format(
            test_run_name, average_reward_train
        )
    )
    return (
        average_reward_train,
        num_episodes_train,
        average_reward_eval,
        num_episodes_eval,
        timesteps_history,
        trainer,
        predictor,
        gym_env,
    )
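

# Hedged usage sketch (illustrative only; never invoked in this module). The
# hyperparameter values below are placeholders chosen for readability, not
# values taken from any Horizon config, and `gym_env`, `replay_buffer`,
# `trainer`, and `predictor` are assumed to come from the surrounding run_gym
# setup code. This wrapper is hypothetical and introduces no new Horizon APIs.
def example_online_rl_run(gym_env, replay_buffer, trainer, predictor):
    return custom_train_gym_online_rl(
        c2_device=None,  # not referenced inside custom_train_gym_online_rl
        gym_env=gym_env,
        replay_buffer=replay_buffer,
        model_type=ModelType.PYTORCH_PARAMETRIC_DQN.value,
        trainer=trainer,
        predictor=predictor,
        test_run_name="example_run",
        score_bar=None,
        num_episodes=300,
        max_steps=200,
        train_every_ts=1,
        train_after_ts=1,
        test_every_ts=2000,
        test_after_ts=1,
        num_train_batches=1,
        avg_over_num_episodes=100,
        render=False,
        save_timesteps_to_dataset=None,
        start_saving_from_score=None,
        solved_reward_threshold=None,
        max_episodes_to_run_after_solved=None,
        stop_training_after_solved=False,
        timesteps_total=100000,
        checkpoint_after_ts=1,
        avg_over_num_steps=1000,
    )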