def train(cfg, env, agent):
    print('Start training!')
    print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo}, Device: {cfg.device}')
    ou_noise = OUNoise(env.action_space)  # Ornstein-Uhlenbeck action noise
    rewards = []     # total reward of each episode
    ma_rewards = []  # moving-average reward of each episode
    ep_steps = []    # number of steps taken in each episode
    for i_ep in range(cfg.train_eps):
        state = env.reset()
        ou_noise.reset()
        done = False
        ep_reward = 0
        i_step = 0
        while not done:
            i_step += 1
            action = agent.choose_action(state)
            action = ou_noise.get_action(action, i_step)  # the random process N from the DDPG paper
            next_state, reward, done, _ = env.step(action)
            ep_reward += reward
            agent.memory.push(state, action, reward, next_state, done)
            agent.update()
            state = next_state
        if (i_ep + 1) % 10 == 0:
            print('Episode: {}/{}, Reward: {:.2f}'.format(i_ep + 1, cfg.train_eps, ep_reward))
        ep_steps.append(i_step)
        rewards.append(ep_reward)
        if ma_rewards:
            ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
        else:
            ma_rewards.append(ep_reward)
    print('Finished training!')
    return rewards, ma_rewards
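
# The training loop above assumes an OUNoise helper constructed from
# env.action_space and exposing reset() and get_action(action, t).
# Below is a minimal sketch of a typical Ornstein-Uhlenbeck noise wrapper
# with that interface; the mu/theta/sigma values and the sigma decay
# schedule are illustrative assumptions, not taken from the original code.
import numpy as np

class OUNoise:
    def __init__(self, action_space, mu=0.0, theta=0.15, max_sigma=0.3,
                 min_sigma=0.3, decay_period=100000):
        self.mu = mu
        self.theta = theta
        self.sigma = max_sigma
        self.max_sigma = max_sigma
        self.min_sigma = min_sigma
        self.decay_period = decay_period
        self.action_dim = action_space.shape[0]
        self.low = action_space.low
        self.high = action_space.high
        self.reset()

    def reset(self):
        # restart the noise process at its mean
        self.obs = np.ones(self.action_dim) * self.mu

    def evolve_obs(self):
        # one Euler step of the Ornstein-Uhlenbeck process
        dx = self.theta * (self.mu - self.obs) + self.sigma * np.random.randn(self.action_dim)
        self.obs = self.obs + dx
        return self.obs

    def get_action(self, action, t=0):
        # add exploration noise and clip to the valid action range;
        # sigma is annealed from max_sigma to min_sigma over decay_period steps
        ou_obs = self.evolve_obs()
        self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period)
        return np.clip(action + ou_obs, self.low, self.high)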