def train(rank, device, args):
    """Run the DDPG training loop for one worker.

    Sets up a timestamped TensorBoard log directory (one per agent rank),
    builds the environment, replay buffer and agent, then trains for
    ``args.max_eps - 1`` episodes, logging per-episode reward/length and
    periodically checkpointing models under ``<log dir>/models``.

    Args:
        rank: worker index, used only to name the log directory.
        device: torch device passed through to the agent.
        args: parsed CLI namespace (log_dir, env, pri, buffer_size,
            model_dir, obs, start_learning, max_eps, eval_eps, ...).
    """
    current_time = datetime.now().strftime('%b%d_%H-%M')
    LOGGER_DIR = os.path.join(args.log_dir, args.env, current_time, 'Agent:{}'.format(rank))
    writer = SummaryWriter(LOGGER_DIR)
    MODEL_DIR = os.path.join(LOGGER_DIR, 'models')
    # exist_ok=True avoids the check-then-create race of the original
    # `if not os.path.exists(...): os.makedirs(...)` pattern.
    os.makedirs(MODEL_DIR, exist_ok=True)

    env = create_env(args.env, args)
    # Prioritized vs. uniform experience replay, chosen by flag.
    if args.pri:
        ram = PrioMemoryBuffer(args.buffer_size)
    else:
        ram = MemoryBuffer(args.buffer_size)
    player = DDPGAgent(env.observation_space, env.action_space, ram, writer, device, args)
    if args.model_dir is not None:
        player.load_models(args.model_dir)

    steps_done = 0
    episode_rewards = []
    max_score = float('-inf')  # was -9999; -inf is safe for any reward scale
    count_eps = 0
    # NOTE(review): range(1, max_eps) runs max_eps - 1 episodes; kept as-is
    # to preserve the original episode count.
    for _ep in range(1, args.max_eps):
        observation = env.reset()
        total_reward = 0
        count_eps += 1
        for r in range(10000):  # hard cap on episode length
            # Image observations get a leading batch axis; vector
            # observations are cast to float32 for the policy network.
            if 'img' in args.obs:
                state = np.expand_dims(observation, axis=0)
            else:
                state = np.float32(observation)
            action, action_rescale = player.get_exploration_action(state)
            new_observation, reward, done, info = env.step(action_rescale)
            steps_done += 1
            total_reward += reward
            # NOTE(review): the buffer stores (s, a, r, s') without the
            # `done` flag — confirm MemoryBuffer handles terminals elsewhere.
            ram.add(observation, np.expand_dims(action, axis=0), reward, new_observation)
            observation = new_observation
            # perform optimization once the warm-up phase has filled the buffer
            if steps_done > args.start_learning:
                player.optimize()
            if done:
                break

        # logger: r is the last 0-based step index, so the episode length
        # is r + 1 (the original logged the off-by-one index).
        writer.add_scalar('episode/reward', total_reward, steps_done)
        writer.add_scalar('episode/length', r + 1, steps_done)
        episode_rewards.append(total_reward)

        if _ep % args.eval_eps == 0:
            reward_ave = np.array(episode_rewards).mean()
            print('Train, episode %d, steps: %d reward: %.3f,ave_reward: %.3f' %
                  (count_eps, steps_done, episode_rewards[-1], reward_ave))
            # Checkpoint: 'best' when the windowed average improves,
            # otherwise overwrite the rolling 'new' checkpoint.
            if reward_ave > max_score:
                player.save_models(os.path.join(MODEL_DIR, 'best'))
                max_score = reward_ave
                print('Save Best!')
            else:
                player.save_models(os.path.join(MODEL_DIR, 'new'))
            episode_rewards = []
        # check memory consumption and clear memory
        gc.collect()
def test(device, args):
    """Evaluate a trained DDPG agent indefinitely.

    Loads the agent in test mode, then repeatedly runs batches of
    ``args.eval_eps - 1`` greedy episodes, printing the running success
    rate and per-batch average reward/length after each batch. Runs
    forever (Ctrl-C to stop).

    Args:
        device: torch device passed through to the agent.
        args: parsed CLI namespace (env, model_dir, obs, ar, render,
            eval_eps, ...).
    """
    env = create_env(args.env, args)
    ram = MemoryBuffer(1)  # dummy one-slot buffer; no learning happens here
    player = DDPGAgent(env.observation_space, env.action_space, ram, None, device, args)
    if args.model_dir is not None:
        player.load_models(args.model_dir, test=True)

    steps_done = 0
    count_eps = 0
    count_success = 0
    while True:  # evaluate forever, reporting after every batch
        episode_rewards = []
        episode_lengths = []  # fixed typo: was `episode_lenghts`
        # NOTE(review): range(1, eval_eps) runs eval_eps - 1 episodes per
        # batch; kept as-is to preserve the original batch size.
        for _ep in range(1, args.eval_eps):
            if args.ar:
                env.seed(True)
            observation = env.reset()
            total_reward = 0
            for steps in range(1000):  # hard cap on episode length
                # Same state preprocessing as training: batch axis for
                # image observations, float32 cast for vectors.
                if 'img' in args.obs:
                    state = np.expand_dims(observation, axis=0)
                else:
                    state = np.float32(observation)
                # Greedy (exploitation-only) action during evaluation.
                action, action_rescale = player.get_exploitation_action(state)
                new_observation, reward, done, info = env.step(action_rescale)
                observation = new_observation
                total_reward += reward
                steps_done += 1
                if args.render:
                    env.render()
                if done:
                    episode_rewards.append(total_reward)
                    count_eps += 1
                    episode_lengths.append(steps)
                    # NOTE(review): final reward > 1 is treated as success —
                    # confirm this threshold matches the env's reward scheme.
                    if reward > 1:
                        count_success += 1.0
                    break
            # check memory consumption and clear memory
            gc.collect()

        # Guard: if no episode terminated within the step cap, the lists are
        # empty and count_eps may be 0 — the original crashed (ZeroDivision /
        # mean of empty array) here; skip the report instead.
        if episode_rewards:
            reward_ave = np.array(episode_rewards).mean()
            length_ave = np.array(episode_lengths).mean()
            print('Test, episode %d, steps: %d, Success_rate: %.3f ave_reward: %.3f, ave_length: %.3f' %
                  (count_eps, steps_done, count_success / count_eps, reward_ave, length_ave))
    # Unreachable (the loop above never breaks); kept to match the original.
    env.close()
# NOTE(review): this chunk arrived with all newlines stripped and begins
# mid-expression — the opening of the hyperparameter dict (and the `if` branch
# it belongs to) lies outside the visible source, so the original line
# structure cannot be reconstructed without guessing. Left byte-identical.
# Apparent content: the tail of an environment-specific hyperparameter dict
# ('tau', 'noise_sigma'), an else-branch that aborts on unknown environments,
# then a greedy evaluation loop — presumably 200 episodes of up to 250 steps,
# with console-Enter-toggled rendering via `rendering(env, render, r)` and
# per-episode returns accumulated in `ret` — TODO confirm against the
# original (pre-flattening) file.
'tau': 0.01, 'noise_sigma': 0.2 } else: print('Environment unknown!') exit() num_episodes = 200 num_steps = 250 #------------------------------# #------Hyperparameters---------# #------------------------------# agent = DDPGAgent(env, hyperparameter) agent.load_models(model_path) rets = [] render = False for e in range(num_episodes): ret = 0 s = env.reset() for step in range(num_steps): a = agent.take_action(s, greedy=True) s_next, r, done, _ = env.step(a) # Press Enter in the console to activate/deactivate Rendering render = rendering(env, render, r) ret += r if done: break s = s_next
# NOTE(review): flattened chunk. The leading `environment.close()` appears to
# be the tail of a function defined above this chunk — its indentation level
# is unknown, so the line is left byte-identical rather than restructured.
# Apparent content of the `__main__` guard: parse `--testing` / `--mode` CLI
# flags, pick an imitation-data file by game mode (shooting / defense /
# normal), build the laser-hockey opponent and DDPG agent from the
# environment's spaces, then either run 20 test episodes (testing mode,
# loading saved models) or train — presumably matching the project's
# `train`/`test` signatures; TODO confirm against the original file.
environment.close() if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--testing', type=int, default=0) parser.add_argument('--mode', type=int, default=PENDULUM) args = parser.parse_args() mode = args.mode player2 = lh.BasicOpponent() if mode == TRAIN_SHOOTING: imitation_data = "imitations_shooting.pt" elif mode == TRAIN_DEFENSE: imitation_data = "imitations_defense.pt" else: imitation_data = "imitations_normal.pt" environment, action_size = create_environment(mode, args.testing) agent = DDPGAgent(environment.observation_space.shape[0], action_size, environment.action_space.high[0], environment.action_space.low[0], imitation_data) if args.testing: agent.load_models() for _ in range(20): test(environment, agent, mode, player2, True) else: #agent.load_models() train(environment, agent, mode, player2)