def test():
    env = gym.make('CartPole-v0')
    action_dim = env.action_space.n
    obs_shape = env.observation_space.shape

    rpm = ReplayMemory(MEMORY_SIZE)
    model = CartpoleModel(act_dim=action_dim)
    algorithm = parl.algorithms.DQN(
        model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = CartpoleAgent(
        algorithm,
        obs_dim=obs_shape[0],
        act_dim=action_dim,
        e_greed=0.1,  # explore
        e_greed_decrement=1e-6)  # probability of exploring decreases during training

    # warm up replay memory
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(agent, env, rpm)

    # restore trained parameters if a checkpoint exists
    if os.path.exists('./model_dir'):
        agent.restore('./model_dir')

    eval_reward = evaluate(agent, env, True)
    logger.info('test_reward:{}'.format(eval_reward))
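# The snippet above calls an `evaluate` helper that is not shown here.
# Below is a minimal sketch of what it might look like, assuming the agent
# exposes a greedy `predict(obs)` method and the classic gym API
# (reset() -> obs, step() -> obs, reward, done, info). The episode count of 5
# and the function body are illustrative assumptions, not the original code.
def evaluate(agent, env, render=False):
    eval_rewards = []
    for _ in range(5):  # average over a few episodes to reduce noise
        obs = env.reset()
        episode_reward = 0
        done = False
        while not done:
            action = agent.predict(obs)  # greedy action, no exploration
            obs, reward, done, _ = env.step(action)
            episode_reward += reward
            if render:
                env.render()
        eval_rewards.append(episode_reward)
    return np.mean(eval_rewards)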
def main():
    env = gym.make("CartPole-v0")
    model = CartpoleModel(act_dim=ACT_DIM)
    alg = parl.algorithms.PolicyGradient(model, lr=LEARNING_RATE)
    agent = CartpoleAgent(alg, obs_dim=OBS_DIM, act_dim=ACT_DIM)

    # if the file already exists, restore parameters from it
    if os.path.exists('./model.ckpt'):
        agent.restore('./model.ckpt')

    for i in range(1000):
        obs_list, action_list, reward_list = run_episode(env, agent)
        if i % 10 == 0:
            logger.info("Episode {}, Reward Sum {}.".format(
                i, sum(reward_list)))

        batch_obs = np.array(obs_list)
        batch_action = np.array(action_list)
        batch_reward = calc_reward_to_go(reward_list)

        agent.learn(batch_obs, batch_action, batch_reward)

        if (i + 1) % 100 == 0:
            _, _, reward_list = run_episode(env, agent, train_or_test='test')
            total_reward = np.sum(reward_list)
            logger.info('Test reward: {}'.format(total_reward))

    # save the parameters to ./model.ckpt
    agent.save('./model.ckpt')
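# `calc_reward_to_go` is assumed to turn the per-step reward list into the
# discounted return from each step onward, which is what the policy-gradient
# update is weighted by. A minimal sketch under that assumption; the gamma
# value of 0.99 and the normalization step are illustrative choices, not
# taken from the original helper.
def calc_reward_to_go(reward_list, gamma=0.99):
    returns = np.zeros_like(reward_list, dtype='float32')
    running = 0.0
    # iterate backwards: G_t = r_t + gamma * G_{t+1}
    for t in reversed(range(len(reward_list))):
        running = reward_list[t] + gamma * running
        returns[t] = running
    # normalizing the returns usually stabilizes REINFORCE updates
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)
    return returns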
def main():
    env = gym.make('CartPole-v0')
    action_dim = env.action_space.n
    obs_shape = env.observation_space.shape

    rpm = ReplayMemory(MEMORY_SIZE)
    model = CartpoleModel(act_dim=action_dim)
    algorithm = parl.algorithms.DQN(
        model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = CartpoleAgent(
        algorithm,
        obs_dim=obs_shape[0],
        act_dim=action_dim,
        e_greed=0.1,  # explore
        e_greed_decrement=1e-6)  # probability of exploring decreases during training

    # warm up replay memory
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(agent, env, rpm)

    max_episode = 2000

    # start training
    episode = 0
    while episode < max_episode:
        # train part
        for i in range(0, 50):
            total_reward = run_episode(agent, env, rpm)
            episode += 1

        eval_reward = evaluate(agent, env)
        logger.info('episode:{} test_reward:{}'.format(
            episode, eval_reward))
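# The DQN `run_episode` helper used above is not shown. A minimal sketch of
# one training episode that fills the replay memory and, once warmed up,
# samples mini-batches for `agent.learn`. The `sample`/`learn`/`rpm.sample`
# signatures and the LEARN_FREQ/BATCH_SIZE constants are assumptions made
# for illustration and may not match the original helpers exactly.
def run_episode(agent, env, rpm):
    total_reward = 0
    obs = env.reset()
    step = 0
    while True:
        step += 1
        action = agent.sample(obs)  # epsilon-greedy exploration
        next_obs, reward, done, _ = env.step(action)
        rpm.append((obs, action, reward, next_obs, done))

        # train every few steps once the replay memory is warmed up
        if len(rpm) > MEMORY_WARMUP_SIZE and step % LEARN_FREQ == 0:
            (batch_obs, batch_action, batch_reward, batch_next_obs,
             batch_done) = rpm.sample(BATCH_SIZE)
            agent.learn(batch_obs, batch_action, batch_reward,
                        batch_next_obs, batch_done)

        total_reward += reward
        obs = next_obs
        if done:
            break
    return total_reward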
def main():
    logger.info("------------------- SAC ---------------------")
    logger.info('Env: {}, Seed: {}'.format(args.env, args.seed))
    logger.info("---------------------------------------------")
    logger.set_dir('./{}_{}'.format(args.env, args.seed))

    env = ContinuousCartPoleEnv()  # gym.make(args.env)
    env.seed(args.seed)
    obs_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    print('obs_dim, action_dim', (obs_dim, action_dim))

    # Initialize model, algorithm, agent, replay_memory
    model = CartpoleModel(obs_dim, action_dim)
    algorithm = SAC(
        model,
        gamma=GAMMA,
        tau=TAU,
        alpha=args.alpha,
        actor_lr=ACTOR_LR,
        critic_lr=CRITIC_LR)
    agent = CartpoleAgent(algorithm)
    rpm = ReplayMemory(
        max_size=MEMORY_SIZE, obs_dim=obs_dim, act_dim=action_dim)

    # if a checkpoint exists, restore it and run evaluation only
    if os.path.exists('sac_model.ckpt'):
        logger.info("model restored, running evaluation only")
        agent.restore('sac_model.ckpt')
        run_evaluate_episodes(agent, env, EVAL_EPISODES, render=True)
        exit()

    total_steps = 0
    test_flag = 0
    while total_steps < args.train_total_steps:
        # Train episode
        episode_reward, episode_steps = run_train_episode(agent, env, rpm)
        total_steps += episode_steps

        tensorboard.add_scalar('train/episode_reward', episode_reward,
                               total_steps)
        logger.info('Total Steps: {} Reward: {}'.format(
            total_steps, episode_reward))

        # Evaluate episode
        if (total_steps + 1) // args.test_every_steps >= test_flag:
            while (total_steps + 1) // args.test_every_steps >= test_flag:
                test_flag += 1
            avg_reward = run_evaluate_episodes(
                agent, env, EVAL_EPISODES, render=False)
            tensorboard.add_scalar('eval/episode_reward', avg_reward,
                                   total_steps)
            logger.info('Evaluation over: {} episodes, Reward: {}'.format(
                EVAL_EPISODES, avg_reward))

    # save the parameters once training is finished
    agent.save('sac_model.ckpt')
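# `run_train_episode` is referenced above but not defined. A minimal sketch
# of one off-policy SAC training episode, assuming the agent exposes
# `sample` (stochastic action) and `learn`, and that the replay memory has
# `append`, `sample_batch`, and `size` methods. WARMUP_STEPS and BATCH_SIZE
# are illustrative constants that are not defined in the original snippet.
def run_train_episode(agent, env, rpm):
    obs = env.reset()
    episode_reward, episode_steps = 0, 0
    done = False
    while not done:
        episode_steps += 1
        # act randomly until enough transitions have been collected
        if rpm.size() < WARMUP_STEPS:
            action = env.action_space.sample()
        else:
            action = agent.sample(obs)
        next_obs, reward, done, _ = env.step(action)
        rpm.append(obs, action, reward, next_obs, done)
        obs = next_obs
        episode_reward += reward

        # one gradient step per environment step once warmed up
        if rpm.size() >= WARMUP_STEPS:
            batch_obs, batch_action, batch_reward, batch_next_obs, \
                batch_terminal = rpm.sample_batch(BATCH_SIZE)
            agent.learn(batch_obs, batch_action, batch_reward,
                        batch_next_obs, batch_terminal)
    return episode_reward, episode_steps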
def main():
    env = gym.make('CartPole-v0')
    model = CartpoleModel(obs_dim=OBS_DIM, act_dim=ACT_DIM)
    alg = parl.algorithms.PolicyGradient(model, LEARNING_RATE)
    agent = CartpoleAgent(alg)

    for i in range(1000):  # 1000 episodes
        obs_list, action_list, reward_list = run_episode(env, agent)
        if i % 10 == 0:
            logger.info("Episode {}, Reward Sum {}.".format(
                i, sum(reward_list)))

        batch_obs = np.array(obs_list)
        batch_action = np.array(action_list)
        batch_reward = calc_reward_to_go(reward_list)

        agent.learn(batch_obs, batch_action, batch_reward)

        if (i + 1) % 100 == 0:
            _, _, reward_list = run_episode(env, agent, train_or_test='test')
            total_reward = np.sum(reward_list)
            logger.info('Test reward: {}'.format(total_reward))
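# The on-policy `run_episode` used by the policy-gradient examples is assumed
# to collect one full trajectory, sampling actions during training and acting
# greedily during testing. A minimal sketch under those assumptions; the
# `sample`/`predict` method names follow the usual PARL agent convention but
# are not confirmed by these snippets.
def run_episode(env, agent, train_or_test='train'):
    obs_list, action_list, reward_list = [], [], []
    obs = env.reset()
    while True:
        obs_list.append(obs)
        if train_or_test == 'train':
            action = agent.sample(obs)   # sample from the policy distribution
        else:
            action = agent.predict(obs)  # greedy action for evaluation
        action_list.append(action)

        obs, reward, done, _ = env.step(action)
        reward_list.append(reward)
        if done:
            break
    return obs_list, action_list, reward_list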
def main():
    env = gym.make('CartPole-v0')
    # env = env.unwrapped  # remove the TimeLimit wrapper (episode step cap)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.n
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    # build an agent
    model = CartpoleModel(obs_dim=obs_dim, act_dim=act_dim)
    alg = morl.algorithms.PolicyGradient(model, lr=LEARNING_RATE)
    agent = CartpoleAgent(alg)

    # if a checkpoint exists, restore it and run evaluation only
    if os.path.exists('simple_model.ckpt'):
        logger.info("model restored, running evaluation only")
        agent.restore('simple_model.ckpt')
        evaluate(env, agent, render=True)
        exit()

    for i in range(1000):
        obs_list, action_list, reward_list = run_episode(env, agent)
        if i % 10 == 0:
            logger.info("Episode {}, Reward Sum {}.".format(
                i, sum(reward_list)))

        batch_obs = np.array(obs_list)
        batch_action = np.array(action_list)
        batch_reward = calc_reward_to_go(reward_list)

        agent.learn(batch_obs, batch_action, batch_reward)

        if (i + 1) % 100 == 0:
            total_reward = evaluate(env, agent, render=True)
            logger.info('Test reward: {}'.format(total_reward))

    # save the parameters to simple_model.ckpt
    agent.save('simple_model.ckpt')
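# `CartpoleModel` is referenced throughout but never defined in these
# snippets. A minimal sketch of a small policy network for the discrete
# policy-gradient examples, assuming the PaddlePaddle-based PARL API
# (parl.Model, paddle.nn); the hidden-layer sizing is an illustrative choice.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
import parl


class CartpoleModel(parl.Model):
    def __init__(self, obs_dim, act_dim):
        super(CartpoleModel, self).__init__()
        hid_size = act_dim * 10
        self.fc1 = nn.Linear(obs_dim, hid_size)
        self.fc2 = nn.Linear(hid_size, act_dim)

    def forward(self, obs):
        out = paddle.tanh(self.fc1(obs))
        # action probabilities over the discrete CartPole actions
        prob = F.softmax(self.fc2(out), axis=-1)
        return prob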