def train(cfg):
    print('Start to train!\n')
    env = NormalizedActions(gym.make("Pendulum-v0"))
    # add exploration noise to the actions (Ornstein-Uhlenbeck process)
    ou_noise = OUNoise(env.action_space)
    n_states = env.observation_space.shape[0]
    n_actions = env.action_space.shape[0]
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    agent = DDPG(n_states, n_actions, device=device, critic_lr=1e-3,
                 actor_lr=1e-4, gamma=0.99, soft_tau=1e-2,
                 memory_capacity=100000, batch_size=128)
    rewards = []
    moving_average_rewards = []
    ep_steps = []
    log_dir = os.path.split(
        os.path.abspath(__file__))[0] + "/logs/train/" + SEQUENCE
    writer = SummaryWriter(log_dir)
    for i_episode in range(1, cfg.train_eps + 1):
        state = env.reset()
        ou_noise.reset()
        ep_reward = 0
        for i_step in range(1, cfg.train_steps + 1):
            action = agent.select_action(state)
            # the "random process" exploration noise from the DDPG paper
            action = ou_noise.get_action(action, i_step)
            next_state, reward, done, _ = env.step(action)
            ep_reward += reward
            agent.memory.push(state, action, reward, next_state, done)
            agent.update()
            state = next_state
            if done:
                break
        print('Episode:', i_episode, ' Reward: %i' % int(ep_reward),
              'n_steps:', i_step)
        ep_steps.append(i_step)
        rewards.append(ep_reward)
        # exponential moving average of the episode rewards
        if i_episode == 1:
            moving_average_rewards.append(ep_reward)
        else:
            moving_average_rewards.append(
                0.9 * moving_average_rewards[-1] + 0.1 * ep_reward)
        writer.add_scalars('rewards', {
            'raw': rewards[-1],
            'moving_average': moving_average_rewards[-1]
        }, i_episode)
        writer.add_scalar('steps_of_each_episode', ep_steps[-1], i_episode)
    writer.close()
    print('Complete training!')
    # save the trained model
    if not os.path.exists(SAVED_MODEL_PATH):  # create the folder if it does not exist
        os.mkdir(SAVED_MODEL_PATH)
    agent.save_model(SAVED_MODEL_PATH + 'checkpoint.pth')
    # save the rewards and related training results
    if not os.path.exists(RESULT_PATH):  # create the folder if it does not exist
        os.mkdir(RESULT_PATH)
    np.save(RESULT_PATH + 'rewards_train.npy', rewards)
    np.save(RESULT_PATH + 'moving_average_rewards_train.npy',
            moving_average_rewards)
    np.save(RESULT_PATH + 'steps_train.npy', ep_steps)
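train() above relies on two helpers that are not shown in this snippet: a NormalizedActions wrapper, which rescales the policy's [-1, 1] outputs to the environment's action bounds, and an OUNoise class with reset() and get_action(action, t). Below is a minimal sketch of a compatible Ornstein-Uhlenbeck noise class; the parameter values (theta=0.15, sigma=0.3, the decay schedule) are common defaults and an assumption, not the original implementation.

import numpy as np

class OUNoise:
    """Temporally correlated exploration noise (Ornstein-Uhlenbeck process)."""
    def __init__(self, action_space, mu=0.0, theta=0.15,
                 max_sigma=0.3, min_sigma=0.3, decay_period=100000):
        self.mu = mu
        self.theta = theta
        self.sigma = max_sigma
        self.max_sigma = max_sigma
        self.min_sigma = min_sigma
        self.decay_period = decay_period
        self.n_actions = action_space.shape[0]
        self.low = action_space.low
        self.high = action_space.high
        self.reset()

    def reset(self):
        # restart the process at the mean at the beginning of each episode
        self.state = np.ones(self.n_actions) * self.mu

    def get_action(self, action, t=0):
        # evolve the OU process by one step
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.randn(self.n_actions)
        self.state = self.state + dx
        # optionally anneal sigma from max_sigma to min_sigma over decay_period steps
        self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) \
            * min(1.0, t / self.decay_period)
        return np.clip(action + self.state, self.low, self.high)

Compared with uncorrelated Gaussian noise, the OU process produces smoother action perturbations, which suits the continuous torque control of Pendulum-v0.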
np.random.seed(42)
env.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed(42)

writer = SummaryWriter(log_dir='logs/')
agent = DDPG(env, writer)
all_timesteps = 0
for e in range(epoch):
    noise = OUActionNoise(env.action_space.shape[0])
    state = env.reset()
    cumulative_reward = 0
    for timestep in range(200):
        action = agent.get_action(state, noise, timestep)
        # the policy outputs actions in [-1, 1]; scale them to the env's range
        state_, reward, done, _ = env.step(action * env.action_space.high[0])
        # env.render()
        agent.store_transition(state, action, state_, reward, done)
        state = state_
        cumulative_reward += reward
        agent.update(all_timesteps)
        all_timesteps += 1
    print('Epoch : {} / {}, Cumulative Reward : {}'.format(
        e, epoch, cumulative_reward))
    writer.add_scalar("reward", cumulative_reward, e)
agent.save_model()
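This second script is a condensed variant of the same training loop: it seeds NumPy, the environment, and PyTorch for reproducibility, then trains for `epoch` episodes of 200 steps each. It assumes that `env`, `epoch`, `DDPG`, and `OUActionNoise` are already defined earlier in the file; a minimal sketch of that missing setup (the names and values here are illustrative assumptions, not the original code) might look like:

import gym
import numpy as np
import torch
from torch.utils.tensorboard import SummaryWriter

env = gym.make('Pendulum-v0')   # continuous-control task; torque range [-2, 2]
epoch = 1000                    # number of training episodes (an assumption)
# DDPG and OUActionNoise are project-specific classes assumed to be
# defined (or imported) elsewhere in the same file.

Note that env.step(action * env.action_space.high[0]) rescales the policy's [-1, 1] output to the environment's torque range, playing the same role as the NormalizedActions wrapper in the first script.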
agent = DDPG(env, writer)
all_timesteps = 0
for e in range(epoch):
    noise = OUActionNoise(env.action_space.shape[0])
    env.reset()
    # build the initial observation from a stack of the 3 most recent frames
    pixel = env.render(mode='rgb_array')
    state = deque([get_screen(pixel) for _ in range(3)], maxlen=3)
    cumulative_reward = 0
    for timestep in range(200):
        action = agent.get_action(np.array(state)[np.newaxis], noise, timestep)
        _, reward, done, _ = env.step(action * env.action_space.high[0])
        pixel = env.render(mode='rgb_array')
        # push the newest frame into a copy of the frame stack
        state_ = state.copy()
        state_.append(get_screen(pixel))
        agent.store_transition(np.array(state), action, np.array(state_),
                               reward, done)
        state = state_
        cumulative_reward += reward
        agent.update(all_timesteps, batch_size=16)
        all_timesteps += 1
    print('Epoch : {} / {}, Cumulative Reward : {}'.format(
        e, epoch, cumulative_reward))
    writer.add_scalar("reward", cumulative_reward, e)
    if e % 500 == 0:
        agent.save_model('models/' + str(e) + '_')
agent.save_model()
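The pixel-based variant feeds the agent a stack of the three most recent rendered frames instead of the low-dimensional state, so it depends on collections.deque for the frame stack and on a get_screen() preprocessor that is not shown here. A minimal sketch of a compatible helper follows; the grayscale conversion, the 84x84 size, and the OpenCV dependency are assumptions, not the original implementation.

import cv2
import numpy as np

def get_screen(pixel):
    """Turn an RGB frame from env.render(mode='rgb_array') into a small,
    normalized grayscale array suitable for frame stacking."""
    gray = cv2.cvtColor(pixel, cv2.COLOR_RGB2GRAY)              # (H, W) uint8
    small = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_AREA)
    return small.astype(np.float32) / 255.0                     # values in [0, 1]

With three stacked 84x84 frames, np.array(state) has shape (3, 84, 84), and the [np.newaxis] indexing adds the batch dimension the actor network expects.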