def main():
    env = ContinuousCartPoleEnv()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Build the agent with the PARL framework
    model = Model(act_dim)
    algorithm = DDPG(model, gamma=GAMMA, tau=TAU,
                     actor_lr=ACTOR_LR, critic_lr=CRITIC_LR)
    agent = Agent(algorithm, obs_dim, act_dim)

    # Create the replay memory
    rpm = ReplayMemory(MEMORY_SIZE)

    # Pre-fill the replay memory before training starts
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(agent, env, rpm)

    episode = 0
    is_render = False
    while episode < TRAIN_EPISODE:
        for i in range(50):
            total_reward = run_episode(agent, env, rpm)
            episode += 1
        # eval_reward = evaluate(env, agent, render=False)
        eval_reward = evaluate(env, agent, is_render)
        logger.info('episode:{} Test reward:{}'.format(episode, eval_reward))
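# Hedged sketch (added for illustration): one plausible run_episode(agent, env, rpm)
# for the PARL-style loop above. agent.sample/agent.learn and rpm.append/rpm.sample
# are assumed tutorial-style APIs; BATCH_SIZE is an assumed hyperparameter.
def run_episode(agent, env, rpm):
    obs = env.reset()
    total_reward = 0
    while True:
        action = agent.sample(obs)  # assumed: policy action plus exploration noise
        next_obs, reward, done, _ = env.step(action)
        rpm.append((obs, action, reward, next_obs, done))  # store the transition
        # Start learning only once the warm-up buffer is filled
        if len(rpm) > MEMORY_WARMUP_SIZE:
            batch_obs, batch_action, batch_reward, batch_next_obs, batch_done = \
                rpm.sample(BATCH_SIZE)
            agent.learn(batch_obs, batch_action, batch_reward,
                        batch_next_obs, batch_done)
        obs = next_obs
        total_reward += reward
        if done:
            break
    return total_reward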
def train(n_episodes=5000, max_t=700):
    env = gym.make('BipedalWalker-v2')
    env.seed(10)
    scores_deque = deque(maxlen=100)
    scores = []
    max_score = -np.inf
    obs_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]

    model = Model(state_size=obs_size, action_size=action_size)
    target_model = Model(state_size=obs_size, action_size=action_size)
    alg = DDPG(model, target_model, gamma=0.99, tau=1e-3,
               actor_lr=1e-4, critic_lr=3e-4)
    agent = Agent(alg, BUFFER_SIZE, BATCH_SIZE, seed=10)

    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        # agent.reset()
        score = 0
        for t in range(max_t):
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward * 0.01, next_state, done)  # scale rewards down for stability
            state = next_state
            score += reward
            if done:
                break
        scores_deque.append(score)
        scores.append(score)
        # print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}'.format(
        #     i_episode, np.mean(scores_deque), score), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_deque)))
        if np.mean(scores_deque) > 200:
            torch.save(agent.alg.model.actor_model.state_dict(),
                       'walker_actor.pth')
            torch.save(agent.alg.model.critic_model.state_dict(),
                       'walker_critic.pth')
            break
    return scores
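# Usage sketch (added for illustration): run training and plot the per-episode
# scores. Assumes matplotlib is installed; train() is the function defined above.
import matplotlib.pyplot as plt

scores = train()
plt.plot(range(1, len(scores) + 1), scores)
plt.xlabel('Episode')
plt.ylabel('Score')
plt.title('DDPG on BipedalWalker-v2')
plt.show()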
def main():
    env = ContinuousCartPoleEnv()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Build the agent with the PARL framework
    model = Model(act_dim)
    algorithm = DDPG(model, gamma=GAMMA, tau=TAU,
                     actor_lr=ACTOR_LR, critic_lr=CRITIC_LR)
    agent = Agent(algorithm, obs_dim, act_dim)

    # If a checkpoint exists, restore it, evaluate once, and exit
    if os.path.exists('./model.ckpt'):
        agent.restore('./model.ckpt')
        eval_reward = evaluate(env, agent, render=True)
        print("eval_reward=", eval_reward)
        exit()

    # Create the replay memory
    rpm = ReplayMemory(MEMORY_SIZE)

    # Pre-fill the replay memory before training starts
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(agent, env, rpm)

    episode = 0
    while episode < TRAIN_EPISODE:
        print("start training, episode=", episode)
        for i in range(50):
            total_reward = run_episode(agent, env, rpm)
            episode += 1
            print("episode=", episode, "total_reward=", total_reward)
        eval_reward = evaluate(env, agent, render=False)
        logger.info('episode:{} Test reward:{}'.format(episode, eval_reward))
        agent.save('./model.ckpt')
def evaluate(render=True):
    env = ContinuousCartPoleEnv()
    obs_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]

    model = Model(state_size=obs_size, action_size=action_size)
    target_model = Model(state_size=obs_size, action_size=action_size)
    alg = DDPG(model, target_model, gamma=0.99, tau=1e-3,
               actor_lr=1e-4, critic_lr=3e-4)
    agent = Agent(alg, BUFFER_SIZE, BATCH_SIZE, seed=10)
    agent.alg.model.actor_model.load_state_dict(
        torch.load("cart_pole_actor.pth"))
    agent.alg.model.critic_model.load_state_dict(
        torch.load("cart_pole_critic.pth"))

    eval_reward = []
    for i in range(10):
        obs = env.reset()
        total_reward = 0
        steps = 0
        while True:
            action = agent.act(obs)
            steps += 1
            next_obs, reward, done, info = env.step(action)
            obs = next_obs
            total_reward += reward
            if render:
                env.render()
            if done:
                break
        eval_reward.append(total_reward)
    return np.mean(eval_reward)
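# Usage sketch (added for illustration): run a headless evaluation and report
# the mean return. Assumes the two .pth checkpoints loaded above exist on disk.
if __name__ == '__main__':
    print('mean eval reward:', evaluate(render=False))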
def main():
    env = StockTradingEnv(df)
    env.reset()
    act_dim = env.action_space.shape[0]
    obs_dim = env.observation_space.shape[0]

    rpm = ReplayMemory(MEMORY_SIZE, obs_dim, act_dim)
    model = StockModel(act_dim=act_dim)
    algorithm = DDPG(model, gamma=dc, tau=TAU, actor_lr=al, critic_lr=cl)
    agent = Agent(algorithm, obs_dim, act_dim)

    total_steps, test_flag = 0, 0
    while total_steps < TRAIN_TOTAL_STEPS:
        # train part
        train_reward, steps, aloss, closs = run_episode(env, agent, rpm, bs)
        total_steps += steps
        logger.info("Step {}, Train Reward {}.".format(total_steps, train_reward))
        tb_logger.add_scalar(tag="Train/Reward", step=total_steps, value=train_reward)
        tb_logger.add_scalar(tag="Train/Actor", step=total_steps, value=aloss)
        tb_logger.add_scalar(tag="Train/Critic", step=total_steps, value=closs)

        # test part
        if total_steps // TEST_EVERY_STEPS >= test_flag:
            while total_steps // TEST_EVERY_STEPS >= test_flag:
                test_flag += 1  # keep incrementing until the condition is violated
            eval_reward = evaluate(env, agent)
            logger.info('Step:{}, Test Reward:{}'.format(total_steps, eval_reward))
            tb_logger.add_scalar(tag="Test/Reward", step=total_steps, value=eval_reward)

    # Training finished; save the model
    save_path = 'check_point/ddpg_%s_%s_%s_%s.ckpt' % (bs, dc, al, cl)
    agent.save(save_path)
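# Hedged sketch (added for illustration): a plausible evaluate(env, agent) for
# the test branch above. agent.predict is an assumed PARL-style API returning
# the deterministic action; the episode count of 5 is arbitrary.
def evaluate(env, agent, eval_episodes=5):
    returns = []
    for _ in range(eval_episodes):
        obs = env.reset()
        episode_reward, done = 0, False
        while not done:
            action = agent.predict(obs)  # no exploration noise at test time
            obs, reward, done, _ = env.step(action)
            episode_reward += reward
        returns.append(episode_reward)
    return np.mean(returns)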
def test_ddpg():
    args = DDPGInput()
    DDPG(args)
# ...tail of a truncated run_episode(env, agent) definition:
            break
    return total_reward, steps


env = make_env("Quadrotor", task="hovering_control")
env.reset()
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.shape[0]

model = Model(state_size=obs_dim, action_size=act_dim)
target_model = Model(state_size=obs_dim, action_size=act_dim)
alg = DDPG(model, target_model, gamma=GAMMA, tau=TAU,
           actor_lr=ACTOR_LR, critic_lr=CRITIC_LR)
agent = Agent(alg, BUFFER_SIZE, BATCH_SIZE, seed=10)

total_steps = 0
while total_steps < TRAIN_TOTAL_STEPS:
    train_reward, steps = run_episode(env, agent)
    total_steps += steps
    print('Steps: {} Reward: {}'.format(total_steps, train_reward))  # log the training reward
    if total_steps % TEST_EVERY_STEPS == 0:  # checkpoint the model every TEST_EVERY_STEPS steps
        torch.save(agent.alg.model.actor_model.state_dict(),
                   f'flighter_actor_{total_steps}.pth')
        torch.save(agent.alg.model.critic_model.state_dict(),
                   f'flighter_critic_{total_steps}.pth')  # filename assumed, symmetric with the actor checkpoint
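# Hedged sketch (added for illustration): one plausible full run_episode(env, agent)
# whose tail survives at the top of the snippet above. agent.act/agent.step follow
# the Agent interface used elsewhere in this file; everything else is assumed.
def run_episode(env, agent):
    obs = env.reset()
    total_reward, steps, done = 0, 0, False
    while not done:
        action = agent.act(obs)
        next_obs, reward, done, _ = env.step(action)
        agent.step(obs, action, reward, next_obs, done)  # store transition and learn
        obs = next_obs
        total_reward += reward
        steps += 1
    return total_reward, steps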
from algorithm import DDPG, Actor, Critic
import gym
import torch.optim as optim
import torch.nn as nn

env = gym.make("MountainCarContinuous-v0")

actor = Actor(env.observation_space.shape[0], 24, 24, 1)
t_actor = Actor(env.observation_space.shape[0], 24, 24, 1)
optimA = optim.Adam(actor.parameters(), lr=0.00001)

critic = Critic(env.observation_space.shape[0], 24, 24)
t_critic = Critic(env.observation_space.shape[0], 24, 24)
optimC = optim.Adam(critic.parameters(), lr=0.00005)

loss = nn.MSELoss()
agent = DDPG(env, actor, t_actor, optimA, critic, t_critic, optimC, loss, 10000)
agent.run(50, 10, 0.5, 64, 0.99, 0.001)
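# Hedged sketch (added for illustration): Actor/Critic MLPs consistent with the
# constructor shapes used above (state_dim, hidden1, hidden2[, action_dim]).
# Only the layer sizes come from the call sites; the architecture is a guess.
import torch
import torch.nn as nn
import torch.nn.functional as F

class Actor(nn.Module):
    def __init__(self, state_dim, h1, h2, action_dim):
        super().__init__()
        self.fc1 = nn.Linear(state_dim, h1)
        self.fc2 = nn.Linear(h1, h2)
        self.out = nn.Linear(h2, action_dim)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return torch.tanh(self.out(x))  # bound actions to [-1, 1]

class Critic(nn.Module):
    def __init__(self, state_dim, h1, h2, action_dim=1):
        super().__init__()
        self.fc1 = nn.Linear(state_dim + action_dim, h1)
        self.fc2 = nn.Linear(h1, h2)
        self.out = nn.Linear(h2, 1)

    def forward(self, state, action):
        x = torch.cat([state, action], dim=-1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.out(x)  # scalar Q-value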