class TestDQNAgent(unittest.TestCase):
    def setUp(self):
        self.state_size = 3
        self.action_size = 5
        fc = nn.Sequential(nn.Linear(self.state_size, 5), nn.ReLU(),
                           nn.Linear(5, 7), nn.ReLU(),
                           nn.Linear(7, 9), nn.ReLU(),
                           nn.Linear(9, self.action_size))
        # Note: both networks wrap the same `fc` module here, so they share
        # weights; fine for a smoke test, but a real target network should be
        # a separate copy.
        self.main_model = QNetwork(name="my_network", fc=fc)
        self.target_model = QNetwork(name="my_network", fc=fc)
        self.agent = DQNAgent(main_model=self.main_model,
                              target_network=self.target_model,
                              memory=WeightedReplayBuffer(buffer_size=12,
                                                          batch_size=3))
        self.eps_greediness = 0.01

    def test_allruns(self):
        """No explosions?"""
        # act
        state_value = [random()] * self.agent.state_size
        self.agent.act(state=state_value, eps=self.eps_greediness)
        agent_learned = False
        while not agent_learned:
            # Keep stepping until the agent reports a learning step.
            agent_learned = self.agent.step(
                state=[random()] * self.agent.state_size,
                action=np.random.randint(self.agent.action_size),
                reward=random(),
                next_state=[random()] * self.agent.state_size,
                done=random() > 0.75)
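# Not part of the original snippet: the standard unittest entry point, so the
# test case above can also be run directly as a script.
if __name__ == "__main__":
    unittest.main()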
def main(args):
    with open(args.param, "r") as f:
        config = json.load(f)
    env = gym.make('Freeway-v0')
    env.seed(args.seed)
    env = FrameStack(env, config)
    print('State shape: ', env.observation_space.shape)
    print('Action size: ', env.action_space.n)
    agent = DQNAgent(state_size=200, action_size=env.action_space.n,
                     config=config)
    # agent_r.load("models-28_11_2020_22:25:27/2000-")
    env = gym.wrappers.Monitor(env, "./vid",
                               video_callable=lambda episode_id: True,
                               force=True)
    # agent.qnetwork_local.load_state_dict(torch.load('checkpoint-score80.47156817885116_epi_125.pth'))
    agent.qnetwork_local.load_state_dict(
        torch.load('search_results/models/eval-{}/_q_net.pth'.format(args.agent)))
    agent.encoder.load_state_dict(
        torch.load('search_results/models/eval-{}/_encoder.pth'.format(args.agent)))
    n_episodes = 1
    max_t = 3000
    eps = 0
    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        score = 0
        for t in range(max_t):
            action = agent.act(state, eps)
            next_state, reward, done, _ = env.step(action)
            score += reward
            time.sleep(0.01)
            state = next_state
            env.render()
            if done:
                break
        print("Episode {} Reward {} Steps {}".format(i_episode, score, t))
    env.close()
def train_speed_agent(coach):
    """Takes a coach (LSTM) that modifies the target reward function for the agent."""
    score = 0
    coaching_score_keep = []
    coaching_episode_keep = []
    env = gym.make('CartPole-v0')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    coaching = DQNAgent(len(env.reset()), env.action_space.n)
    done = False
    batch_size = 32
    index = 0
    for e in range(EPISODES):
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        for time in range(500):
            # env.render()
            action = coaching.act(state)
            next_state, reward, done, _ = env.step(action)
            reward = reward if not done else -10
            next_state = np.reshape(next_state, [1, state_size])
            index = index + 1
            coaching.remember(state, action, reward, next_state, done, index)
            score = score + reward
            success = determine_sucess(done, score)
            coaching.lstm_data(state, action, reward, next_state, done, success)
            state = next_state
            if done:
                print("episode: {}/{}, score: {}, e: {:.2}".format(
                    e, EPISODES, time, coaching.epsilon))
                coaching_score_keep.append(score)
                coaching_episode_keep.append(e)
                score = 0
                break
            if len(coaching.memory) > batch_size:
                coaching.replay(batch_size, coach)
    # Return the trained agent (was `agent`, which is undefined in this scope).
    return coaching, coaching_score_keep, coaching_episode_keep
def train_expert():
    """Creates an agent that is trained to an optimal policy and captures all
    values required to train the LSTM."""
    agent_score_keep = []
    agent_episode_keep = []
    score = 0
    env = gym.make('CartPole-v1')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DQNAgent(len(env.reset()), env.action_space.n)
    # agent.load("./save/cartpole-dqn.h5")
    done = False
    batch_size = 32
    index = 0
    for e in range(EPISODES):
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        for time in range(500):
            # env.render()
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            a_reward = reward if not done else -10
            next_state = np.reshape(next_state, [1, state_size])
            index = index + 1
            agent.remember(state, action, a_reward, next_state, done, index)
            score = score + reward
            success = determine_sucess(done, score)
            agent.lstm_data(state, action, reward, next_state, done, success)
            state = next_state
            if done:
                print("episode: {}/{}, score: {}, e: {:.2}".format(
                    e, EPISODES, score, agent.epsilon))
                agent_score_keep.append(score)
                agent_episode_keep.append(e)
                score = 0
                break
            if len(agent.memory) > batch_size:
                agent.replay(batch_size, None)
    return agent, agent_score_keep, agent_episode_keep
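# determine_sucess (sic) is called in both training loops above but is not
# defined in these snippets. A minimal hypothetical sketch, assuming "success"
# means the episode ended at or above the CartPole-v1 solving threshold; the
# real implementation may use a different criterion.
def determine_sucess(done, score):
    return bool(done and score >= 475)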
        input_shape=[len(obs)],
        policy=policy,
        obs_processer=obs_processer)
    agent.compile()
    result = []
    nb_episodes = 1000
    for episode in range(nb_episodes):
        agent.reset()
        observation = env.reset()
        observation = deepcopy(observation)
        agent.observe(observation)
        done = False
        while not done:
            action = deepcopy(agent.act())
            observation, reward, done, info = env.step(action)
            observation = deepcopy(observation)
            agent.observe(observation, reward, done)
            if done:
                break
    agent.training = False
    observation = env.reset()
    agent.observe(observation)
    done = False
    step = 0
    while not done:
        # env.render()  # display
        step += 1
        action = agent.act()
crt_num_episodes = 0
crt_ep_reward = 0.0
total_rewards = 0.0
render = False
i = 0
while agent.config['episodes_left']:
    obs = np.array(env.reset())
    done = False
    while not done:
        if render:
            env.render()
        action = agent.act(obs)
        new_obs, reward, done, _ = env.step(action)
        new_obs = np.array(new_obs)
        agent.remember(obs, action, reward, new_obs, done)
        # fname = get_img_name(agent.config, action)
        # matplotlib.image.imsave(fname, obs[:, :, 3])
        obs = new_obs
        crt_ep_reward += reward
        i += 1
        if len(agent.memory.buffer) >= 500:
            loss = agent.train()
            if i % 100 == 0:
                print('loss:', loss)
    print('No weights found from previous learning session. Unable to proceed.')
    exit(-1)
return_history = []
for episodes in range(1, NUM_EPISODES + 1):
    # Reset the environment
    state = env.reset()
    # This reshape is needed to keep compatibility with Keras
    state = np.reshape(state, [1, state_size])
    # Cumulative reward is the return since the beginning of the episode
    cumulative_reward = 0.0
    for time in range(1, 500):
        # Render the environment for visualization
        env.render()
        # Select action
        action = agent.act(state)
        # Take action, observe reward and new state
        next_state, reward, done, _ = env.step(action)
        # Reshaping to keep compatibility with Keras
        next_state = np.reshape(next_state, [1, state_size])
        # Apply the same reward engineering that was used during training
        reward = reward_engineering_mountain_car(state[0], action, reward,
                                                 next_state[0], done)
        state = next_state
        # Accumulate the discounted return
        cumulative_reward = agent.gamma * cumulative_reward + reward
        if done:
            print("episode: {}/{}, time: {}, score: {:.6}, epsilon: {:.3}"
                  .format(episodes, NUM_EPISODES, time, cumulative_reward,
                          agent.epsilon))
            break
    return_history.append(cumulative_reward)
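# reward_engineering_mountain_car is not defined in this snippet. A purely
# illustrative sketch, assuming the common MountainCar shaping of rewarding
# built-up speed and progress toward the flag; the original function may be
# entirely different.
def reward_engineering_mountain_car(state, action, reward, next_state, done):
    # MountainCar-v0 observations are (position, velocity); the flag is at x = 0.5.
    position, velocity = next_state[0], next_state[1]
    shaped = reward + 10.0 * abs(velocity) + (position - state[0])
    if done and position >= 0.5:
        shaped += 100.0  # bonus for actually reaching the flag
    return shaped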
class DQNTrainer:
    """
    Trainer for a Deep Q-Network agent on a given environment.
    Currently only supports epsilon-greedy exploration.

    Parameters
    ----------
    env : gym.Env
        Environment in which the training may occur.
    log_frequency : int, optional
        Frequency, in timesteps, at which to log training information.
        (Default is 1000)
    exploration : dict, optional
        Exploration algorithm to use in the training. Expects a dict in
        either of the following formats.
        1. For linear epsilon decay:
           {'algorithm': 'epsilon_greedy', 'decay': 'linear',
            'initial_epsilon': 1.0, 'final_epsilon': 0.01,
            'decay_timesteps': 1000}
        2. For exponential epsilon decay:
           {'algorithm': 'epsilon_greedy', 'decay': 'exponential',
            'initial_epsilon': 1.0, 'epsilon_decay': 0.995}
        (Default is as in 1.)
    **kwargs
        Optional keyword arguments for the DQNAgent's hyperparameters.

    Attributes
    ----------
    agent : DQNAgent
        Agent to be trained by the DQNTrainer on the given environment.
    env : gym.Env
        Training environment for the agent.
    log_frequency : int
        Frequency, in timesteps, at which to log training information.
    exploration_config : dict
        Parameters for the exploration algorithm, including which one to use.
    update_explo_param : function
        Updates the exploration parameter when called.
    """

    def __init__(self, env: gym.Env, log_frequency=1000, exploration=None, **kwargs):
        self.log_frequency = log_frequency
        self.agent = DQNAgent(action_dim=env.action_space.n,
                              state_dim=env.observation_space.shape[0],
                              **kwargs)
        self.env = env
        # Avoid a mutable default argument
        if exploration is None:
            exploration = {'algorithm': 'epsilon_greedy',
                           'decay': 'linear',
                           'initial_epsilon': 1.0,
                           'final_epsilon': 0.01,
                           'decay_timesteps': 1000}
        self.exploration_config = exploration
        # Parse the exploration dict to build the update_explo_param function
        if self.exploration_config['algorithm'] == 'epsilon_greedy':
            if self.exploration_config['decay'] == 'linear':
                update_term = (self.exploration_config['initial_epsilon']
                               - self.exploration_config['final_epsilon']) \
                              / self.exploration_config['decay_timesteps']
                self.update_explo_param = (
                    lambda epsilon: epsilon - update_term
                    if epsilon > self.exploration_config['final_epsilon']
                    else epsilon)
            elif self.exploration_config['decay'] == 'exponential':
                self.update_explo_param = (
                    lambda epsilon: epsilon * self.exploration_config['epsilon_decay'])
        else:
            raise NotImplementedError

    def train(self, num_timesteps=100000, render=False):
        """
        Perform the training loop for num_timesteps duration.

        Parameters
        ----------
        num_timesteps : int, optional
            Number of timesteps to perform during the training session.
            (Default is 100000)
        render : bool, optional
            Whether to render the environment or not. (Default is False)
        """
        # Initialize log metrics
        episode_rewards = []
        episode_lengths = []
        episode_losses = []
        num_episodes = 0
        episode_reward = 0
        episode_length = 0
        episode_loss = 0
        # Get initial state and set initial epsilon
        state = self.env.reset()
        epsilon = self.exploration_config['initial_epsilon']
        # Perform the training loop num_timesteps times
        for timestep in range(num_timesteps):
            # Get agent action and perform a step on the environment
            action = self.agent.act(state, epsilon)
            if render:
                self.env.render()
            next_state, reward, done, info = self.env.step(action)
            # Add the (S, A, R, S', done) transition to the agent's buffer
            self.agent.replay_buffer.add_transition(
                (state, action, reward, next_state, done))
            # Optimize the agent once it has enough experiences stored
            if timestep > self.agent.batch_size:
                loss = self.agent.optimize(timestep)
            else:
                loss = 0
            # Update the epsilon value and log metrics
            epsilon = self.update_explo_param(epsilon)
            episode_reward += reward
            episode_loss += loss
            episode_length += 1
            # Reset the environment if the episode has ended, logging more information
            if done:
                num_episodes += 1
                episode_rewards.append(episode_reward)
                episode_losses.append(episode_loss)
                episode_lengths.append(episode_length)
                episode_reward = 0
                episode_loss = 0
                episode_length = 0
                state = self.env.reset()
            else:
                state = next_state
            # Report the log metrics every log_frequency timesteps
            if timestep % self.log_frequency == 0:
                print('Num episodes: ', num_episodes)
                print('Num timesteps: ', timestep)
                print('Mean episode reward: ',
                      sum(episode_rewards[-20:]) / min(20, max(1, len(episode_rewards))))
                print('Mean episode loss: ',
                      sum(episode_losses[-20:]) / min(20, max(1, len(episode_losses))))
                print('Mean episode length: ',
                      sum(episode_lengths[-20:]) / min(20, max(1, len(episode_lengths))))
                print('\n')
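# A short usage sketch (not from the original source), constructing the
# trainer with the exponential-decay config documented in the docstring
# above. The environment name and timestep count are arbitrary choices.
import gym

env = gym.make('CartPole-v1')
trainer = DQNTrainer(env,
                     log_frequency=500,
                     exploration={'algorithm': 'epsilon_greedy',
                                  'decay': 'exponential',
                                  'initial_epsilon': 1.0,
                                  'epsilon_decay': 0.995})
trainer.train(num_timesteps=20000, render=False)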
for phase in range(7, 8):
    env.number_of_grids = phase + 3
    agent.epsilon = 1.0
    agent.memory = deque(maxlen=2000)
    phase_scores = deque(maxlen=5)
    for e in range(EPISODES):
        done = False
        state = env.reset()
        # env.seed(0)
        state = np.reshape(state, [1, state_size])
        score = 0
        # while not done:
        for _ in range(500):
            # env.render()
            possible_actions = env.get_possible_actions()
            action = agent.act(state, possible_actions)
            # if action not in possible_actions:
            #     reward = -1 * 99999
            #     env.done = True
            #     done = True
            # else:
            next_state, reward, done, _ = env.step(action)
            score += reward
            next_state = np.reshape(next_state, [1, state_size])
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                print("episode: {}/{}, score: {}, e: {:.2}".format(
                    e, EPISODES, score, agent.epsilon))
def train_agent(env, config):
    """Train a DQN agent whose CNN encoder maps [1, 3, 84, 84] observations
    to a [1, 200] feature vector."""
    now = datetime.now()
    dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
    torch.manual_seed(config["seed"])
    np.random.seed(config["seed"])
    # pathname = str(args.locexp) + "/" + str(args.env_name) + '_agent_' + str(args.policy)
    # pathname += "_batch_size_" + str(args.batch_size) + "_lr_act_" + str(args.lr_actor)
    # pathname += "_lr_critc_" + str(args.lr_critic) + "_lr_decoder_"
    pathname = dt_string
    tensorboard_name = str(config["locexp"]) + '/runs/' + pathname
    agent = DQNAgent(state_size=200, action_size=env.action_space.n, config=config)
    writer = SummaryWriter(tensorboard_name)
    print("action_size {}".format(env.action_space.n))
    # eval_policy(env, agent, writer, 0, config)
    memory = ReplayBuffer((3, config["size"], config["size"]), (1, ),
                          config["expert_buffer_size"], int(config["image_pad"]),
                          config["device"])
    if config["create_buffer"]:
        create_buffer(env, memory, config)
        memory.load_memory("/export/leiningc/" + config["buffer_path"])
    else:
        print("load Buffer")
        memory.load_memory("/export/leiningc/" + config["buffer_path"])
    print("Buffer size {}".format(memory.idx))
    eps = config["eps_start"]
    eps_end = config["eps_end"]
    eps_decay = config["eps_decay"]
    scores_window = deque(maxlen=100)
    scores = []
    t0 = time.time()
    for i_episode in range(config["train_episodes"]):
        obs = env.reset()
        score = 0
        for t in range(config["max_t"]):
            action = agent.act(obs, eps)
            # action = env.action_space.sample()
            next_obs, reward, done_no_max, _ = env.step(action)
            done = done_no_max
            if t + 1 == config["max_t"]:
                print("t ", t)
                done = 0  # don't treat hitting the step limit as a terminal state
            memory.add(obs, action, reward, next_obs, done, done_no_max)
            agent.step(memory, writer)
            obs = next_obs
            eps = max(eps_end, eps_decay * eps)  # decrease epsilon
            score += reward
            if done:
                break
        scores_window.append(score)  # save most recent score
        scores.append(score)  # save most recent score
        ave_score = np.mean(scores_window)
        writer.add_scalar("ave_score", ave_score, i_episode)
        writer.add_scalar("episode_score", score, i_episode)
        print('\rEpisode {} score {} \tAverage Score: {:.2f} eps: {:.2f} time: {}'
              .format(i_episode, score, np.mean(scores_window), eps,
                      time_format(time.time() - t0)), end="")
        if i_episode % config["eval"] == 0:
            eval_policy(env, agent, writer, i_episode, config)
            agent.save(str(config["locexp"]) + "/models/eval-{}/".format(i_episode))
            print('Episode {} Average Score: {:.2f} eps: {:.2f} time: {}'
                  .format(i_episode, np.mean(scores_window), eps,
                          time_format(time.time() - t0)))
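# time_format is used above but not defined in these snippets. A minimal
# hypothetical sketch, assuming it renders elapsed seconds as H:MM:SS; the
# original helper may format differently.
def time_format(seconds):
    h, rem = divmod(int(seconds), 3600)
    m, s = divmod(rem, 60)
    return "{}:{:02d}:{:02d}".format(h, m, s)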
def main(model=None, mode='train', start_episode=0):
    my_xml = '''<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
    <Mission xmlns="http://ProjectMalmo.microsoft.com" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
        <About>
            <Summary>Hill Descent.</Summary>
        </About>
        <ModSettings>
            <MsPerTick>20</MsPerTick>
        </ModSettings>
        <ServerSection>
            <ServerInitialConditions>
                <Time><StartTime>1</StartTime></Time>
            </ServerInitialConditions>
            <ServerHandlers>
                <DefaultWorldGenerator seed="-999595225643433963" forceReset="false" destroyAfterUse="false" />
                <ServerQuitFromTimeUp timeLimitMs="100000000"/>
                <ServerQuitWhenAnyAgentFinishes/>
            </ServerHandlers>
        </ServerSection>
        <AgentSection mode="Survival">
            <Name>Bob</Name>
            <AgentStart>
                <Placement x="28.5" y="87" z="330.5" pitch="-90" yaw="0"/>
            </AgentStart>
            <AgentHandlers>
                <DiscreteMovementCommands/>
                <MissionQuitCommands quitDescription="done"/>
                <ChatCommands/>
                <ObservationFromFullStats/>
                <ObservationFromGrid>
                    <Grid name="sight">
                        <min x="{}" y="{}" z="{}"/>
                        <max x="{}" y="{}" z="{}"/>
                    </Grid>
                    <Grid name="feet">
                        <min x="0" y="-1" z="0"/>
                        <max x="0" y="-1" z="0"/>
                    </Grid>
                </ObservationFromGrid>
                <AgentQuitFromTouchingBlockType>
                    <Block type="cobblestone" />
                </AgentQuitFromTouchingBlockType>
            </AgentHandlers>
        </AgentSection>
    </Mission>
    '''.format(-(grid_width - 1) // 2, -grid_height, -(grid_width - 1) // 2,
               (grid_width - 1) // 2, grid_height, (grid_width - 1) // 2)
    batch_size = 100
    agent = DQNAgent(state_size, action_size, learning_rate, discount_rate,
                     epsilon, epsilon_min, epsilon_decay)
    if model is not None:
        agent.load(model)
        if mode == 'test':
            agent.epsilon = 0.0
        print('loaded model: {}'.format(model))
    else:
        clear_csv('./data/results.csv')
        clear_csv('./data/moves.csv')
    my_client_pool = MalmoPython.ClientPool()
    my_client_pool.add(MalmoPython.ClientInfo("127.0.0.1", 10001))
    agent_host = MalmoPython.AgentHost()
    for e in range(start_episode + 1, episodes + 1):
        my_mission = MalmoPython.MissionSpec(my_xml, True)
        my_mission_record = MalmoPython.MissionRecordSpec()
        my_mission.requestVideo(800, 500)
        my_mission.setViewpoint(2)
        print("Waiting for the mission to start", end=' ')
        agent_host.startMission(my_mission, my_mission_record)
        world_state = agent_host.getWorldState()
        while not world_state.has_mission_begun:
            print(".", end="")
            time.sleep(0.1)
            world_state = agent_host.getWorldState()
            for error in world_state.errors:
                print("Error:", error.text)
        print()
        agent_host.sendCommand('chat /kill @e[type=Chicken]')
        agent_host.sendCommand('chat /kill @e[type=Pig]')
        agent_host.sendCommand('chat /kill @e[type=Cow]')
        moves = 0
        episode_reward = 0
        while world_state.is_mission_running:
            world_state = agent_host.getWorldState()
            if world_state.number_of_observations_since_last_state > 0:
                try:
                    obvsText = world_state.observations[-1].text
                    data = json.loads(obvsText)
                except Exception:
                    print("Error when getting state")
                    continue
                state = get_state(data)
                prev_x = data.get(u'XPos', 0)
                prev_y = data.get(u'YPos', 0)
                prev_z = data.get(u'ZPos', 0)
                useful_state = [state[2], state[6], state[7], state[8],
                                state[10], state[11], state[13],
                                state[14], state[16], state[17],
                                state[18], state[22]]
                action = agent.act(useful_state)
                # Jump if the block in the chosen direction is clear.
                if ((action == 0 and state[grid_center - grid_width] == 0)
                        or (action == 1 and state[grid_center + 1] == 0)
                        or (action == 2 and state[grid_center + grid_width] == 0)
                        or (action == 3 and state[grid_center - 1] == 0)):
                    agent_host.sendCommand(jump_directions[action])
                else:
                    agent_host.sendCommand(directions[action])
                time.sleep(0.25)
                # print("North:", state[grid_center - grid_width],
                #       " East:", state[grid_center + 1],
                #       " South:", state[grid_center + grid_width],
                #       " West:", state[grid_center - 1])
                try:
                    world_state = wait_world_state(agent_host, world_state)
                    obvsText = world_state.observations[-1].text
                    data = json.loads(obvsText)
                except Exception:
                    print("Error when getting state")
                    continue
                current_x = data.get(u'XPos', 0)
                current_y = data.get(u'YPos', 0)
                current_z = data.get(u'ZPos', 0)
                damage_taken = calculate_damage(prev_y, current_y)
                next_state = get_state(data)
                # Was built from `state`; it must describe the *next* state.
                useful_next_state = [next_state[2], next_state[6], next_state[7],
                                     next_state[8], next_state[10], next_state[11],
                                     next_state[13], next_state[14], next_state[16],
                                     next_state[17], next_state[18], next_state[22]]
                # print("previous and current y", prev_y, current_y)
                # print("damage taken", damage_taken)
                # print("X:", prev_x, current_x, "\n",
                #       "Y:", prev_y, current_y, "\n",
                #       "Z:", prev_z, current_z, "\n")
                # Reward descending (drop in y), penalize damage and idling.
                reward = (2 * (prev_y - current_y) - 50 * damage_taken - 1
                          if prev_x != current_x or prev_y != current_y or prev_z != current_z
                          else -1000)
                episode_reward += reward
                done = (current_y <= goal_height
                        or not world_state.is_mission_running
                        or data['Life'] <= 0)
                agent.remember(useful_state, action, reward, useful_next_state, done)
                if ((action == 0 and state[grid_center - grid_width] == 0)
                        or (action == 1 and state[grid_center + 1] == 0)
                        or (action == 2 and state[grid_center + grid_width] == 0)
                        or (action == 3 and state[grid_center - 1] == 0)):
                    print('episode {}/{}, action: {}, reward: {}, e: {:.2}, move: {}, done: {}'
                          .format(e, episodes, jump_directions[action], reward,
                                  agent.epsilon, moves, done))
                else:
                    print('episode {}/{}, action: {}, reward: {}, e: {:.2}, move: {}, done: {}'
                          .format(e, episodes, directions[action], reward,
                                  agent.epsilon, moves, done))
                moves += 1
                if mode == 'train' or model is None:
                    write_to_csv('./data/moves.csv',
                                 [e, current_x, current_y, current_z, reward])
                    if e > batch_size:
                        agent.replay(batch_size)
                if done or moves > max_moves:
                    agent_host.sendCommand("quit")
        if (mode == 'train' or model is None) and (e in checkpoints
                                                   or agent.epsilon <= epsilon_min):
            print('saving model at episode {}'.format(e))
            agent.save('./models/model_{}'.format(e))
            if agent.epsilon <= epsilon_min:
                break
        time.sleep(1)
        # my_mission.forceWorldReset()
        if mode == 'train' or model is None:
            write_to_csv('./data/results.csv',
                         [e, episode_reward, moves, int(episode_reward > 0)])
class Executor:
    def __init__(self, config, env, eval_env):
        self.config = config
        self.env = env
        self.eval_env = eval_env

        self.stats = None
        self.student_agent = None
        self.evaluation_dir = None
        self.save_videos_path = None

        self.steps_reward = 0.0
        self.steps_error_in = 0.0
        self.steps_error_out = 0.0
        self.episode_duration = 0
        self.episode_reward = 0.0
        self.episode_error_in = 0.0
        self.episode_error_out = 0.0
        self.episode_visited = set()
        self.obs_images = None
        self.tr_info = None
        # ==============================================================================================================
        self.process = None
        self.run_id = None
        self.scripts_dir = None
        self.local_workspace_dir = None
        self.runs_local_dir = None
        self.summaries_dir = None
        self.checkpoints_dir = None
        self.copy_scripts_dir = None
        self.videos_dir = None
        self.save_summary_path = None
        self.save_model_path = None
        self.save_scripts_path = None
        self.plots_subdirs = None
        self.save_plots_paths = None
        self.session = None
        self.summary_writer = None
        self.saver = None
        self.teacher_agent = None
        self.rnd_rm = None
        self.rnd_uncertainty_c = None
        self.rnd_uncertainty_d = None
        self.action_advising_enabled = None
        self.action_advising_budget = None
        self.action_advising_method = None
        self.action_advising_rm_th = self.config['action_advising_rm_th']
        self.action_advising_check_rm = None

        # RND observation normalization
        self.obs_running_mean = None
        self.obs_running_std = None
        self.obs_norm_n = 0
        self.obs_norm_max_n = 5000 if self.config['env_type'] == 1 else 1000

        # Frames
        self.data_collection_period = 500 if self.config['env_type'] == 1 else 100
        self.data_collection_step = 0
        self.data_rnd_uncertainty = None
        self.data_rnd_rm = None

        # Online counters
        self.rm_obs_counter_all = None
        self.rm_obs_counter_teacher = None
        self.rm_tr_counter_all = None
        self.rm_tr_counter_teacher = None

        # Snapshots
        self.data_rm_obs_counter_all = None
        self.data_rm_obs_counter_teacher = None
        self.data_rm_tr_counter_all = None
        self.data_rm_tr_counter_teacher = None

    # ------------------------------------------------------------------------------------------------------------------
    def render(self, env):
        if self.config['env_type'] == 0:
            return env.render()
        elif self.config['env_type'] == 1:
            return env.render_state()

    # ------------------------------------------------------------------------------------------------------------------
    def run(self):
        os.environ['PYTHONHASHSEED'] = str(self.config['seed'])
        random.seed(self.config['seed'])
        np.random.seed(self.config['seed'])
        tf.set_random_seed(self.config['seed'])

        self.run_id = self.config['run_id'] if self.config['run_id'] is not None \
            else strftime("%Y%m%d-%H%M%S", localtime()) + '-' + str(self.config['process_index'])
        self.seed_id = str(self.config['seed'])
        print('Run ID: {}'.format(self.run_id))

        self.scripts_dir = os.path.dirname(os.path.abspath(__file__))
        self.local_workspace_dir = os.path.join(
            str(pathlib.Path(self.scripts_dir).parent.parent.parent.parent))
        print('{} (Scripts directory)'.format(self.scripts_dir))
        print('{} (Local workspace directory)'.format(self.local_workspace_dir))

        self.runs_local_dir = os.path.join(self.local_workspace_dir, 'Runs')
        os.makedirs(self.runs_local_dir, exist_ok=True)
        self.summaries_dir = os.path.join(self.runs_local_dir, 'Summaries')
        os.makedirs(self.summaries_dir, exist_ok=True)
        self.checkpoints_dir = os.path.join(self.runs_local_dir, 'Checkpoints')
        os.makedirs(self.checkpoints_dir, exist_ok=True)
        self.copy_scripts_dir = os.path.join(self.runs_local_dir, 'Scripts')
        os.makedirs(self.copy_scripts_dir, exist_ok=True)
        self.videos_dir = os.path.join(self.runs_local_dir, 'Videos')
        os.makedirs(self.videos_dir, exist_ok=True)
        self.plots_dir = os.path.join(self.runs_local_dir, 'Plots')
        os.makedirs(self.plots_dir, exist_ok=True)
        self.data_dir = os.path.join(self.runs_local_dir, 'Data')
        os.makedirs(self.data_dir, exist_ok=True)

        # --------------------------------------------------------------------------------------------------------------
        self.save_summary_path = os.path.join(self.summaries_dir, self.run_id, self.seed_id)
        self.save_model_path = os.path.join(self.checkpoints_dir, self.run_id, self.seed_id)
        self.save_scripts_path = os.path.join(self.copy_scripts_dir, self.run_id, self.seed_id)
        self.save_videos_path = os.path.join(self.videos_dir, self.run_id, self.seed_id)
        self.save_data_path = os.path.join(self.data_dir, self.run_id, self.seed_id)

        self.plots_subdirs = []
        self.plots_subdirs.append(os.path.join(self.plots_dir, 'TD-Error-All'))       # 0
        self.plots_subdirs.append(os.path.join(self.plots_dir, 'State-Uncertainty'))  # 1
        self.plots_subdirs.append(os.path.join(self.plots_dir, 'Combined'))           # 2
        self.plots_subdirs.append(os.path.join(self.plots_dir, 'ER'))                 # 3
        for graphs_subdir in self.plots_subdirs:
            os.makedirs(graphs_subdir, exist_ok=True)

        self.save_plots_paths = [os.path.join(plots_subdir, self.run_id, self.seed_id)
                                 for plots_subdir in self.plots_subdirs]
        for save_plots_path in self.save_plots_paths:
            os.makedirs(save_plots_path, exist_ok=True)

        if self.config['save_models']:
            os.makedirs(self.save_model_path, exist_ok=True)
        os.makedirs(self.save_videos_path, exist_ok=True)
        os.makedirs(self.save_data_path, exist_ok=True)
        self.copy_scripts(self.save_scripts_path)

        if self.config['use_gpu']:
            print('Using GPU.')
            session_config = tf.ConfigProto(intra_op_parallelism_threads=1,
                                            inter_op_parallelism_threads=1)
        else:
            print('Not using GPU.')
            session_config = tf.ConfigProto(intra_op_parallelism_threads=1,
                                            inter_op_parallelism_threads=1,
                                            allow_soft_placement=True,
                                            device_count={'CPU': 1, 'GPU': 0})

        self.session = tf.InteractiveSession(graph=tf.get_default_graph(),
                                             config=session_config)
        self.summary_writer = tf.summary.FileWriter(self.save_summary_path,
                                                    self.session.graph)

        # --------------------------------------------------------------------------------------------------------------
        # Experiment setup format: abc (0: No Advising)
        # a: Action advising method
        # -- 1: Early advising
        # -- 2: Uniformly random advising
        # -- 3: Uncertainty based advising (RND)
        # -- 4: Uncertainty based bootstrapped DQN
        # b: Replay memory checking (True/False)
        # c: Budget
        if self.config['experiment_setup'] == 0:  # Self exploration
            self.action_advising_enabled = False
            self.action_advising_check_rm = False
            self.action_advising_budget = 0
        else:
            es_1 = self.config['experiment_setup']
            es_2 = es_1 % 100
            es_3 = es_2 % 10
            action_advising_budgets = {0: 500, 1: 1000, 2: 2500, 3: 5000,
                                       4: 10000, 5: 25000, 6: 50000, 7: 100000}
            self.action_advising_enabled = True
            self.action_advising_method = es_1 // 100
            self.action_advising_check_rm = es_2 // 10
            self.action_advising_budget = action_advising_budgets[es_3]

        # --------------------------------------------------------------------------------------------------------------
        # Config to be passed to agents
        if self.config['env_type'] == 0:
            self.config['env_obs_dims'] = self.env.obs_space.shape
            self.config['env_n_actions'] = self.env.action_space.n
        elif self.config['env_type'] == 1:
            self.config['env_obs_dims'] = self.env.state_shape()
            self.config['env_n_actions'] = self.env.num_actions()

        student_agent_name = self.run_id.replace('-', '') + '0' + '_' + str(self.config['seed'])
        self.student_agent = DQNAgent(student_agent_name, self.config, self.session, 'task')
        self.config['student_model_name'] = self.student_agent.name
        print('Student agent name: {}'.format(self.student_agent.name))
        self.save_config(self.config, os.path.join(self.save_summary_path, 'config.txt'))

        if self.config['env_type'] == 1 and self.config['experiment_setup'] != 0:
            self.teacher_agent = DQNAgent(
                self.config['expert_agent_id'].replace("-", "") + '0_'
                + self.config['expert_agent_seed'],
                self.config, self.session, 'task')
            print('Expert agent name: {}'.format(self.teacher_agent.name))

        # --------------------------------------------------------------------------------------------------------------
        # Initialize RND:
        if self.action_advising_check_rm == 2:  # RM checking with RND
            self.rnd_rm = RND(student_agent_name + '_RM', self.config, self.session,
                              self.config['rm_rnd_learning_rate'],
                              self.config['rm_rnd_adam_epsilon'])
            self.student_agent.rnd_rm = self.rnd_rm

        # --------------------------------------------------------------------------------------------------------------
        if self.config['env_type'] == 0:
            n_data_points = int(self.config['n_training_frames']
                                / self.data_collection_period) + 1
            self.data_rnd_uncertainty = \
                np.zeros((self.config['env_obs_dims'][0],
                          self.config['env_obs_dims'][1], n_data_points),
                         dtype=np.float32)
            self.data_rnd_rm = \
                np.zeros((self.config['env_obs_dims'][0],
                          self.config['env_obs_dims'][1], n_data_points),
                         dtype=np.float32)

            # Online counters
            self.rm_obs_counter_all = np.zeros(self.env.n_states, dtype=int)
            self.rm_obs_counter_teacher = np.zeros(self.env.n_states, dtype=int)
            self.rm_tr_counter_all = np.zeros(self.env.n_transitions, dtype=int)
            self.rm_tr_counter_teacher = np.zeros(self.env.n_transitions, dtype=int)

            # Snapshots
            self.data_rm_obs_counter_all = np.zeros((self.env.n_states, n_data_points), dtype=int)
            self.data_rm_obs_counter_teacher = np.zeros((self.env.n_states, n_data_points), dtype=int)
            self.data_rm_tr_counter_all = np.zeros((self.env.n_transitions, n_data_points), dtype=int)
            self.data_rm_tr_counter_teacher = np.zeros((self.env.n_transitions, n_data_points), dtype=int)

        # --------------------------------------------------------------------------------------------------------------
        total_parameters = 0
        for variable in tf.trainable_variables():
            shape = variable.get_shape()
            variable_parameters = 1
            for dim in shape:
                variable_parameters *= dim.value
            total_parameters += variable_parameters
        print('Number of parameters: {}'.format(total_parameters))

        self.saver = tf.train.Saver()
        self.session.run(tf.global_variables_initializer())
        self.stats = Statistics(self.summary_writer, self.session)

        # Restore
        if self.config['experiment_setup'] != 0:
            if self.config['env_type'] == 1:
                self.teacher_agent.restore(
                    self.checkpoints_dir,
                    self.config['expert_agent_id'] + '/' + self.config['expert_agent_seed'],
                    self.config['expert_agent_checkpoint'])

        if not self.config['save_models']:
            tf.get_default_graph().finalize()

        reward_is_seen = False

        if self.stats.n_env_steps % self.config['evaluation_period'] == 0:
            eval_score = self.evaluate()
            print('Evaluation @ {} | {}'.format(self.stats.n_env_steps, eval_score))

        render = self.stats.n_episodes % self.config['visualization_period'] == 0
        if render:
            self.obs_images = []
            self.tr_info = []

        obs = None
        if self.config['env_type'] == 0:
            obs = self.env.reset()
        elif self.config['env_type'] == 1:
            self.env.reset()
            obs = self.env.state().astype(dtype=np.float32)

        if render:
            self.obs_images.append(self.render(self.env))

        if self.config['env_type'] == 0:
            self.record_data()

        while True:
            if self.obs_norm_n < self.obs_norm_max_n:
                obs_mean = obs.mean(axis=(0, 1))
                obs_std = obs.std(axis=(0, 1))
                if self.obs_norm_n == 0:
                    self.obs_running_mean = obs_mean
                    self.obs_running_std = obs_std
                else:
                    self.obs_running_mean = \
                        self.obs_running_mean + (obs_mean - self.obs_running_mean) / (self.obs_norm_n + 1)
                    self.obs_running_std = \
                        self.obs_running_std + (obs_std - self.obs_running_std) / (self.obs_norm_n + 1)
                self.obs_norm_n += 1
                if self.obs_norm_n == self.obs_norm_max_n:
                    print(self.obs_running_mean)
                    print(self.obs_running_std)
                    self.obs_norm_n += 1

            # state_id is needed below for the env_type 0 counters, so it must
            # stay active (it was commented out in the flattened source).
            state_id = self.env.state_id_dict[(self.env.state.agent_pos[0],
                                               self.env.state.agent_pos[1])] \
                if self.config['env_type'] == 0 else None

            # ----------------------------------------------------------------------------------------------------------
            # Action advising
            get_action_advice = False
            if self.action_advising_enabled and self.action_advising_budget > 0:
                if self.action_advising_method == 1:
                    get_action_advice = True
                elif self.action_advising_method == 2:
                    if random.random() < 0.5:
                        get_action_advice = True

                # Second-factor check against the replay memory (RM)
                if get_action_advice and self.action_advising_check_rm != 0:
                    if self.action_advising_check_rm == 1:
                        if self.rm_obs_counter_teacher[state_id] >= self.action_advising_rm_th:
                            get_action_advice = False
                    elif self.action_advising_check_rm == 2:
                        sparsity = self.rnd_rm.get_error(obs, normalize=True)
                        if sparsity < self.action_advising_rm_th:
                            get_action_advice = False

            if get_action_advice:
                self.action_advising_budget -= 1
                self.stats.advices_taken += 1
                self.stats.advices_taken_cumulative += 1
                if self.config['env_type'] == 0:
                    action = self.env.optimal_action()
                elif self.config['env_type'] == 1:
                    action = self.teacher_agent.greedy_action(obs, evaluation=True)
                source = 1
            else:
                action = self.student_agent.act(obs, evaluation=False)
                source = 0

            # ----------------------------------------------------------------------------------------------------------
            transition_id = self.env.transition_id_dict[(self.env.state.agent_pos[0],
                                                         self.env.state.agent_pos[1],
                                                         action)] \
                if self.config['env_type'] == 0 else None

            obs_next, reward, done = None, None, None
            if self.config['env_type'] == 0:
                obs_next, reward, done = self.env.step(action)
            elif self.config['env_type'] == 1:
                reward, done = self.env.act(action)
                obs_next = self.env.state().astype(dtype=np.float32)

            td_error = self.student_agent.get_td_error(obs, action, reward, obs_next, done)

            if render:
                self.obs_images.append(self.render(self.env))

            self.episode_error_in += td_error
            self.episode_reward += reward
            self.episode_duration += 1
            self.steps_error_in += td_error
            self.steps_reward += reward
            self.stats.n_env_steps += 1

            if reward > 0 and reward_is_seen is False:
                reward_is_seen = True
                print(">>> Reward is seen at ", self.stats.n_episodes, "|",
                      self.episode_duration)

            if source == 1:
                if self.action_advising_check_rm == 2:
                    self.rnd_rm.train_model(obs, loss_id=0, is_batch=False, normalize=True)

            if self.config['env_type'] == 0:
                self.rm_obs_counter_all[state_id] += 1
                self.rm_tr_counter_all[transition_id] += 1
                if source == 1:
                    self.rm_obs_counter_teacher[state_id] += 1
                    self.rm_tr_counter_teacher[transition_id] += 1

            # ----------------------------------------------------------------------------------------------------------
            # Transition dropped from the RM, if it was full
            old_transition = self.student_agent.feedback_observe(
                obs, action, reward, obs_next, done, source, state_id)
            if old_transition is not None:
                if self.action_advising_check_rm == 2 and old_transition[5] == 1:
                    self.rnd_rm.train_model(old_transition[0], loss_id=1,
                                            is_batch=False, normalize=True)
                if self.config['env_type'] == 0:
                    old_state_id = old_transition[6]
                    old_action = old_transition[1]
                    old_agent_pos = self.env.agent_pos_dict[old_state_id]
                    old_transition_id = self.env.transition_id_dict[(old_agent_pos[0],
                                                                     old_agent_pos[1],
                                                                     old_action)]
                    self.rm_obs_counter_all[old_state_id] -= 1
                    self.rm_tr_counter_all[old_transition_id] -= 1
                    if old_transition[5] == 1:
                        self.rm_obs_counter_teacher[old_state_id] -= 1
                        self.rm_tr_counter_teacher[old_transition_id] -= 1

            # ----------------------------------------------------------------------------------------------------------
            td_error_batch, loss = self.student_agent.feedback_learn()
            td_error_batch_sum = np.sum(td_error_batch)
            self.episode_error_out += td_error_batch_sum
            self.steps_error_out += td_error_batch_sum
            self.stats.loss += loss

            obs = obs_next

            if self.config['env_type'] == 0 \
                    and self.stats.n_env_steps % self.data_collection_period == 0:
                self.record_data()

            if done:
                self.action_advising_countdown = 0
                self.stats.n_episodes += 1
                self.stats.episode_reward_auc += np.trapz(
                    [self.stats.episode_reward_last, self.episode_reward])
                self.stats.episode_reward_last = self.episode_reward
                self.stats.update_summary_episode(
                    self.episode_reward, self.stats.episode_reward_auc,
                    self.episode_duration, self.episode_error_in,
                    self.episode_error_out)
                print('ER: {:.1f} ({}) (error: {:.3f}) @ {} frames - {}'.format(
                    self.episode_reward, self.stats.n_episodes,
                    self.episode_error_in, self.stats.n_env_steps,
                    self.stats.advices_taken_cumulative))
                if render:
                    self.write_video(self.obs_images, '{}_{}'.format(
                        str(self.stats.n_episodes - 1),
                        str(self.stats.n_env_steps - self.episode_duration)))
                    self.obs_images.clear()
                    self.tr_info.clear()
                self.episode_duration = 0
                self.episode_reward = 0.0
                self.episode_error_in = 0.0
                self.episode_error_out = 0.0
                render = self.stats.n_episodes % self.config['visualization_period'] == 0
                obs = None
                if self.config['env_type'] == 0:
                    obs = self.env.reset()
                elif self.config['env_type'] == 1:
                    self.env.reset()
                    obs = self.env.state().astype(dtype=np.float32)
                if render:
                    self.obs_images.append(self.render(self.env))

            # Per-N-steps summary update
            if self.stats.n_env_steps % self.stats.n_steps_per_update == 0:
                self.stats.steps_reward_auc += np.trapz(
                    [self.stats.steps_reward_last, self.steps_reward])
                self.stats.steps_reward_last = self.steps_reward
                self.stats.epsilon = self.student_agent.epsilon
                self.stats.update_summary_steps(self.steps_reward,
                                                self.stats.steps_reward_auc,
                                                self.steps_error_in,
                                                self.steps_error_out)
                self.stats.advices_taken = 0.0
                self.stats.exploration_steps_taken = 0
                self.steps_reward = 0.0
                self.steps_error_in = 0.0
                self.steps_error_out = 0.0

            if self.stats.n_env_steps % self.config['evaluation_period'] == 0:
                evaluation_score = self.evaluate()
                print('Evaluation ({}): {}'.format(self.stats.n_episodes,
                                                   evaluation_score))

            if self.config['save_models'] \
                    and self.stats.n_env_steps % self.config['model_save_period'] == 0:
                model_path = os.path.join(self.save_model_path,
                                          'model-{}.ckpt'.format(self.stats.n_env_steps))
                print('[{}] Saving model... {}'.format(self.stats.n_env_steps, model_path))
                self.saver.save(self.session, model_path)

            if self.stats.n_env_steps >= self.config['n_training_frames']:
                if self.config['save_models']:
                    model_path = os.path.join(self.save_model_path,
                                              'model-{}.ckpt'.format(self.stats.n_env_steps))
                    print('[{}] Saving model... {}'.format(self.stats.n_env_steps, model_path))
                    self.saver.save(self.session, model_path)
                break

        print('Env steps: {}'.format(self.stats.n_env_steps))
        if self.config['env_type'] == 0:
            self.save_data()
        self.session.close()

    def write_video(self, images, filename):
        v_w = np.shape(images[0])[0]
        v_h = np.shape(images[0])[1]
        filename_full = os.path.join(self.save_videos_path, str(filename))
        video = cv2.VideoWriter(filename_full + '.avi',
                                cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'),
                                20, (v_h, v_w))
        for image in images:
            video.write(image)
        video.release()

    def copy_scripts(self, target_directory):
        if not os.path.exists(target_directory):
            os.makedirs(target_directory)
        files = glob.iglob(os.path.join(self.scripts_dir, '*.py'))
        for file in files:
            if os.path.isfile(file):
                shutil.copy2(file, target_directory)

    def save_config(self, config, filepath):
        with open(filepath, "w") as fo:
            for k, v in config.items():
                fo.write(str(k) + '>> ' + str(v) + '\n')

    def evaluate(self):
        eval_render = self.stats.n_evaluations % self.config['evaluation_visualization_period'] == 0
        eval_total_reward = 0.0
        eval_duration = 0
        if self.config['env_type'] == 0 or self.config['env_type'] == 1:
            self.eval_env.set_random_state(self.config['env_evaluation_seed'])
        for i_eval_trial in range(self.config['n_evaluation_trials']):
            eval_obs_images = []
            eval_obs = None
            if self.config['env_type'] == 0:
                eval_obs = self.eval_env.reset()
            elif self.config['env_type'] == 1:
                self.eval_env.reset()
                eval_obs = self.eval_env.state().astype(dtype=np.float32)
            eval_episode_reward = 0.0
            eval_episode_duration = 0
            while True:
                if eval_render:
                    eval_obs_images.append(self.render(self.eval_env))
                eval_action = self.student_agent.greedy_action(eval_obs, evaluation=True)
                eval_obs_next, eval_reward, eval_done = None, None, None
                if self.config['env_type'] == 0:
                    eval_obs_next, eval_reward, eval_done = self.eval_env.step(eval_action)
                elif self.config['env_type'] == 1:
                    eval_reward, eval_done = self.eval_env.act(eval_action)
                    eval_obs_next = self.eval_env.state().astype(dtype=np.float32)
                eval_episode_reward += eval_reward
                eval_duration += 1
                eval_episode_duration += 1
                eval_obs = eval_obs_next
                if eval_done:
                    if self.config['env_type'] == 0:
                        if eval_episode_reward == 1.0:
                            eval_episode_reward = 1.0 - (eval_episode_duration - 24) / 76.0
                    if eval_render:
                        eval_obs_images.append(self.render(self.eval_env))
                        self.write_video(eval_obs_images,
                                         'E_{}_{}'.format(str(self.stats.n_episodes),
                                                          str(self.stats.n_env_steps)))
                        eval_obs_images.clear()
                        eval_render = False
                    eval_total_reward += eval_episode_reward
                    break
        eval_mean_reward = eval_total_reward / float(self.config['n_evaluation_trials'])
        self.stats.evaluation_reward_auc += np.trapz(
            [self.stats.evaluation_reward_last, eval_mean_reward])
        self.stats.evaluation_reward_last = eval_mean_reward
        self.stats.n_evaluations += 1
        self.stats.update_summary_evaluation(eval_mean_reward, eval_duration,
                                             self.stats.evaluation_reward_auc)
        return eval_mean_reward

    # ------------------------------------------------------------------------------------------------------------------
    def record_data(self):
        # Grid
        for n in range(len(self.env.passage_positions[0])):
            y = self.env.passage_positions[0][n]
            x = self.env.passage_positions[1][n]
            obs = self.env.generate_obs((y, x))
            if self.action_advising_method == 3:
                self.data_rnd_uncertainty[y, x, self.data_collection_step] = \
                    self.student_agent.get_uncertainty(obs)
            if self.rnd_rm is not None:
                self.data_rnd_rm[y, x, self.data_collection_step] = \
                    self.rnd_rm.get_error(obs, normalize=True)
        self.data_rm_obs_counter_all[:, self.data_collection_step] = \
            self.rm_obs_counter_all.copy()
        self.data_rm_obs_counter_teacher[:, self.data_collection_step] = \
            self.rm_obs_counter_teacher.copy()
        self.data_rm_tr_counter_all[:, self.data_collection_step] = \
            self.rm_tr_counter_all.copy()
        self.data_rm_tr_counter_teacher[:, self.data_collection_step] = \
            self.rm_tr_counter_teacher.copy()
        self.data_collection_step += 1

    # ------------------------------------------------------------------------------------------------------------------
    def save_data(self):
        np.save(os.path.join(self.save_data_path, 'RND_Uncertainty.npy'),
                self.data_rnd_uncertainty)
        np.save(os.path.join(self.save_data_path, 'RND_RM.npy'), self.data_rnd_rm)
        np.save(os.path.join(self.save_data_path, 'RM_Obs_All.npy'),
                self.data_rm_obs_counter_all)
        np.save(os.path.join(self.save_data_path, 'RM_Obs_Teacher.npy'),
                self.data_rm_obs_counter_teacher)
        np.save(os.path.join(self.save_data_path, 'RM_TR_All.npy'),
                self.data_rm_tr_counter_all)
        np.save(os.path.join(self.save_data_path, 'RM_TR_Teacher.npy'),
                self.data_rm_tr_counter_teacher)
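# Worked example (not from the original source) of the experiment_setup digit
# encoding parsed in Executor.run() above. The value 213 is an arbitrary
# illustration.
setup = 213
method = setup // 100             # 2 -> uniformly random advising
check_rm = (setup % 100) // 10    # 1 -> replay-memory checking enabled
budget_key = setup % 10           # 3 -> budget of 5000 via action_advising_budgets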
env = gym.make('LunarLander-v2')
env.seed(0)
print('State shape: ', env.observation_space.shape)
print('Number of actions: ', env.action_space.n)
agent = DQNAgent(state_size=8, action_size=4, seed=0)
agent.qnetwork_local.load_state_dict(torch.load('checkpoint.pth'))
memory = ReplayBuffer((8, ), (1, ), 20000, 'cuda')
n_episodes = 40
max_t = 500
eps = 0
for i_episode in range(1, n_episodes + 1):
    state = env.reset()
    score = 0
    for t in range(max_t):
        action = agent.act(state, eps)
        next_state, reward, done, _ = env.step(action)
        score += reward
        memory.add(state, action, reward, next_state, done, done)
        state = next_state
        # env.render()
        if done:
            print("Episode {} Reward {}".format(i_episode, score))
            break
mkdir("", "expert_policy")
print("save memory ...")
memory.save_memory("expert_policy")
print("... memory saved")
def run():
    # environment name
    env = gym.make('LunarLander-v2')
    plt.figure()
    all_scores = []
    all_losses = []
    all_t = []
    # The first 2 state entries are the x position and y position (height),
    # the next 2 are the x and y velocities, then the lander angle and
    # angular velocity, and finally the left and right leg ground-contact
    # flags (bool).
    agent = DQNAgent(
        env.observation_space.shape[0],
        env.action_space.n,
        args
    )
    is_end = False
    for e in range(args.episodes):
        s_t0 = env.reset()
        reward_total = 0
        episode_loss = []
        is_win = False
        for t in range(args.max_steps):
            if args.is_render and len(all_scores):  # and all_scores[-1] > 0:
                # if e % 10 == 0 and all_scores[-1] > 0:
                env.render()
            a_t0 = agent.act(s_t0)
            s_t1, r_t1, is_end, _ = env.step(a_t0)
            reward_total += r_t1
            if t == args.max_steps - 1:
                r_t1 = -100
                is_end = True
            agent.replay_memory.push((s_t0, a_t0, r_t1, s_t1, is_end))
            s_t0 = s_t1
            if len(agent.replay_memory) > args.batch_size:
                loss = agent.replay()
                episode_loss.append(loss)
            if is_end:
                all_scores.append(reward_total)
                all_losses.append(np.mean(episode_loss))
                # If the terminal reward is >= 100, the lander has landed:
                # https://github.com/openai/gym/blob/master/gym/envs/box2d/lunar_lander.py#L381
                if r_t1 >= 100:
                    is_win = True
                break
        all_t.append(t)
        metrics_episode = {
            'loss': all_losses[-1],
            'score': reward_total,
            't': t,
            'e': agent.epsilon,
            'is_win': is_win
        }
        if args.is_csv is True:
            CsvUtils.add_hparams(
                sequence_dir=os.path.join('.', args.sequence_name),
                sequence_name=args.sequence_name,
                run_name=args.run_name,
                args_dict=args.__dict__,
                metrics_dict=metrics_episode,
                global_step=e
            )
        else:
            logging.info(f'episode: {e}/{args.episodes} {metrics_episode}')
            print(f'episode: {e}/{args.episodes} ', metrics_episode)
        if e % 100 == 0 and not args.is_inference:
            # Save logs, plots and weights during training
            plt.clf()
            plt.subplot(3, 1, 1)
            plt.ylabel('Score')
            plt.plot(all_scores)
            plt.subplot(3, 1, 2)
            plt.ylabel('Loss')
            plt.plot(all_losses)
            plt.subplot(3, 1, 3)
            plt.ylabel('Steps')
            plt.plot(all_t)
            plt.xlabel('Episode')
            plt.savefig(os.path.join(seq_run_name, f'plt-{e}.png'))
            torch.save(agent.q_model.cpu().state_dict(),
                       os.path.join(seq_run_name, f'model-{e}.pt'))
    env.close()
def train(conf: dict) -> dict:
    env = gym.make(**conf['env'])
    env.seed(conf['seed'])
    conf['action_size'] = env.action_space.n
    conf['device'] = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    module, model_to_use = conf['model_to_use']
    model = getattr(globals()[module], model_to_use)
    conf['model'] = model
    crop_params = conf['preprocess']['exclude']
    n_episodes = conf['n_episodes']
    scores = []
    epsilons = []
    scores_window = deque(maxlen=20)
    eps = conf['eps_start']
    # Evaluate the agent based on the mean of the Q values on a fixed set
    # of states
    fixed_states = collect_fixed_set_of_states(conf, env)
    average_action_values = []
    agent = DQNAgent(**conf)  # was `**exp_conf`, which is undefined here
    agent_hps = np.inf
    for i_episode in range(1, n_episodes + 1):
        state = stack_frames(None, env.reset(), crop_params, True)
        score = 0
        epsilons.append(eps)
        eps = decay_epsilon(conf, i_episode)
        while True:
            # env.render()
            action = agent.act(state, eps)
            next_state, reward, done, info = env.step(action)
            # Small penalty per rewardless step; large penalty on losing a life
            if reward == 0.0 and not done:
                reward += -0.01
            if agent_hps == np.inf:
                agent_hps = info['ale.lives']
            elif info['ale.lives'] < agent_hps:
                reward += -50.0
                agent_hps += -1
            score += reward
            next_state = stack_frames(state, next_state, crop_params, False)
            agent.step(state, action, reward, next_state, done)
            state = next_state
            if done:
                break
        scores_window.append(score)  # save most recent score
        scores.append(score)
        avg_av = agent.evaluate_on_fixed_set(fixed_states)
        average_action_values.append(avg_av)
        print(f'Episode {i_episode}\tAverage Score: '
              f'{round(np.mean(scores_window), 4)}\tEpsilon: {round(eps, 4)}\t'
              f'Average Q value: {round(avg_av, 4)}')
        if i_episode % conf['save_every'] == 0 and i_episode > 0:
            print(f'Saving model at iteration: {i_episode}')
            save_model(conf, agent)
    env.close()
    return {
        'scores': scores,
        'epsilons': epsilons,
        'avg_action_values': average_action_values
    }
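# decay_epsilon is called above but not defined in these snippets. A minimal
# hypothetical sketch, assuming exponential decay from 'eps_start' driven by
# 'eps_decay' and floored at 'eps_end'; the key names beyond 'eps_start' are
# assumptions.
def decay_epsilon(conf, i_episode):
    eps = conf['eps_start'] * (conf['eps_decay'] ** i_episode)
    return max(conf['eps_end'], eps)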
def run():
    game = ple.games.flappybird.FlappyBird()
    # game = ple.games.snake.Snake(width=512, height=512)
    # game = ple.games.pong.Pong(width=512, height=512)
    p = ple.PLE(game, fps=30, display_screen=args.is_render)
    p.init()
    plt.figure()
    all_scores = []
    all_losses = []
    all_t = []
    agent = DQNAgent(len(p.getGameState()), len(p.getActionSet()), args)
    is_end = p.game_over()
    for e in range(args.episodes):
        p.reset_game()
        s_t0 = np.asarray(list(p.getGameState().values()), dtype=np.float32)
        reward_total = 0
        pipes = 0
        episode_loss = []
        for t in range(args.max_steps):
            a_t0_idx = agent.act(s_t0)
            a_t0 = p.getActionSet()[a_t0_idx]
            r_t1 = p.act(a_t0)
            is_end = p.game_over()
            s_t1 = np.asarray(list(p.getGameState().values()), dtype=np.float32)
            reward_total += r_t1
            # From /PyGame-Learning-Environment/ple/games/base/pygamewrapper.py:
            # self.rewards = {
            #     "positive": 1.0,
            #     "negative": -1.0,
            #     "tick": 0,
            #     "loss": -5.0,
            #     "win": 5.0,
            # }
            if r_t1 == 1.0:
                pipes += 1
            if t == args.max_steps - 1:
                r_t1 = -100
                is_end = True
            agent.replay_memory.push((s_t0, a_t0_idx, r_t1, s_t1, is_end))
            s_t0 = s_t1
            if len(agent.replay_memory) > args.batch_size:
                loss = agent.replay()
                episode_loss.append(loss)
            if is_end:
                all_scores.append(reward_total)
                all_losses.append(np.mean(episode_loss))
                break
        all_t.append(t)
        metrics_episode = {
            'loss': all_losses[-1],
            'score': reward_total,
            't': t,
            'e': agent.epsilon,
            'pipes': pipes
        }
        if args.is_csv is True:
            CsvUtils.add_hparams(
                sequence_dir=os.path.join('.', args.sequence_name),
                sequence_name=args.sequence_name,
                run_name=args.run_name,
                args_dict=args.__dict__,
                metrics_dict=metrics_episode,
                global_step=e
            )
        else:
            logging.info(f'episode: {e}/{args.episodes} {metrics_episode}')
            print(f'episode: {e}/{args.episodes} ', metrics_episode)
        if e % 100 == 0 and not args.is_inference:
            # Save logs, plots and weights during training
            plt.clf()
            plt.subplot(3, 1, 1)
            plt.ylabel('Score')
            plt.plot(all_scores)
            plt.subplot(3, 1, 2)
            plt.ylabel('Loss')
            plt.plot(all_losses)
            plt.subplot(3, 1, 3)
            plt.ylabel('Steps')
            plt.plot(all_t)
            plt.xlabel('Episode')
            plt.savefig(os.path.join(seq_run_name, f'plt-{e}.png'))
            torch.save(agent.q_model.cpu().state_dict(),
                       os.path.join(seq_run_name, f'model-{e}.pt'))