def main():
    """Train a DDPG agent on Pendulum-v0 and save per-episode rewards.

    Relies on module-level ``gym``, ``np`` (numpy) and ``Agent`` being
    imported elsewhere in the file. Writes the reward history to
    ``reward.npy`` via ``np.save`` when training finishes.
    """
    # Pendulum's torque bounds, wrapped in 1-element arrays as the
    # Agent API expects (original built the scalar then re-wrapped it).
    action_high = np.array([2])
    action_low = np.array([-2])
    buffer_size = 100000
    minibatch_size = 256
    num_episode = 500

    env = gym.make("Pendulum-v0")
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    agent = Agent(state_size, action_size, buffer_size, minibatch_size,
                  action_high, action_low)

    reward_list = []
    for i_episode in range(num_episode):
        print("episode: %d" % i_episode)
        state = env.reset()
        total_reward = 0
        # Hoist the loop-invariant episode length out of the step loop
        # (it was read on every iteration and again in the break test).
        max_steps = env.spec.timestep_limit
        for t_timesteps in range(max_steps):
            env.render()
            action = agent.choose_action(state)
            next_state, reward, done, info = env.step(action)
            total_reward += reward
            # Transition tuple ordered as the Agent.train API consumes it.
            transition = [state, action, next_state, reward, done]
            agent.train(transition)
            state = next_state
            if done or t_timesteps == max_steps - 1:
                print("Episode finish---time steps: %d" % t_timesteps)
                print("total reward: %d" % total_reward)
                reward_list.append(total_reward)
                break
    # NOTE(review): collapsed source is ambiguous about placement; saving
    # once after all episodes — confirm against the original layout.
    np.save('reward', reward_list)
from ddpg import Agent
import gym
import numpy as np

# LunarLander's continuous variant: 8-dim observation, 2-dim action.
env = gym.make('LunarLanderContinuous-v2')
agent = Agent(alpha=0.000025, beta=0.00025, input_dims=[8], tau=0.001,
              env=env, batch_size=64, layer1_size=400, layer2_size=300,
              n_actions=2)

np.random.seed(42)

score_history = []
for episode in range(1000):
    done = False
    score = 0
    observation = env.reset()
    # Roll one full episode, learning after every environment step.
    while not done:
        action = agent.choose_action(observation)
        next_observation, reward, done, info = env.step(action)
        agent.remember(observation, action, reward, next_observation,
                       int(done))
        agent.learn()
        score += reward
        observation = next_observation
    score_history.append(score)
    print("Episode - {} Score - {} 100 game average {}".format(
        episode, score, np.mean(score_history[-100:])))
    # Checkpoint the networks every 25 episodes.
    if episode % 25 == 0:
        agent.save_models()

# NOTE(review): chunk is truncated here mid-assignment — the right-hand
# side of this statement lies outside the visible source; confirm
# against the full file.
filename = l
# agent.load_models()
np.random.seed(0)

score_history = []
for episode in range(100000):
    # Goal-conditioned env: reset() returns a dict of arrays.
    # (Remove the 'observation' / 'desired_goal' indexing for envs
    # whose reset() returns a flat array instead of a dict.)
    env_params = env.reset()
    obs = env_params['observation']
    d_goal = env_params['desired_goal']
    # The policy network consumes observation and goal concatenated.
    net_input = np.hstack((obs, d_goal))

    done = False
    score = 0
    while not done:
        act = agent.choose_action(net_input)
        new_state, reward, done, info = env.step(act)
        # Rebuild the concatenated network input from the step result.
        new_state = np.hstack((new_state['observation'],
                               new_state['desired_goal']))
        agent.remember(net_input, act, reward, new_state, int(done))
        agent.learn()
        score += reward
        net_input = new_state
        # env.render()

    score_history.append(score)
    # Checkpoint the networks every 25 episodes.
    if episode % 25 == 0:
        agent.save_models()
# Rolling window of the last 100 episode results.
episode_history = deque(maxlen=100)

# FIX: `xrange` is a Python-2-only builtin (NameError on Python 3);
# `range` iterates identically in a for-loop on both versions.
for i in range(MAX_EPISODES):
    # initialize
    state = env.reset()
    total_rewards = 0
    # Pre-sample one exploration-noise value per step of the episode.
    noise = exploration(0.0, 0.2, MAX_STEPS)

    for t in range(MAX_STEPS):
        env.render()
        # Add noise and make sure action stays within bounds
        action = learner.choose_action(state)
        action = np.clip(action + noise[t], -action_scale, action_scale)
        next_state, reward, done, _ = env.step(action)
        next_state = next_state.flatten()
        # Note how reward is scaled for monitoring purposes
        reward = reward / 10.
        total_rewards += reward
        learner.update_buffer(state, action, reward, next_state, done)
        state = next_state
        # NOTE(review): the step loop never breaks on `done` — confirm
        # this is intentional (episodes always run MAX_STEPS steps).
        # Fill up some of the experience replay memory before trying to learn
# NOTE(review): this chunk begins MID-CALL — `input_dims=[3], ...` is the
# tail of an `Agent(...)` constructor whose opening lies outside the
# visible source, so the line is kept byte-identical rather than
# reconstructed. Presumably the head is `agent = Agent(alpha=..., beta=...,`
# — confirm against the full file. The remainder is a standard training
# loop: one episode per iteration, learn after every step, print a
# 100-episode rolling average, then plot the learning curve.
input_dims=[3], tau=0.001, env=env, n_actions=1) np.random.seed(0) score_history = [] for episode in range(1000): state = env.reset() done = False score = 0 while not done: action = agent.choose_action(state) next_state, reward, done, info = env.step(action) agent.remember(state, action, reward, next_state, int(done)) agent.learn() score += reward state = next_state score_history.append(score) print('Episode {}, Score: {:.2f}, 100 game average: {:.2f}'.format( episode, score, np.mean(score_history[-100:]))) filename = 'pendulum.png' plotLearning(score_history, filename, window=100)
# NOTE(review): this chunk begins MID-BLOCK — the dangling `else:` implies
# an `if` (presumably `if load_checkpoint:`) whose opening lies outside the
# visible source, so the line is kept byte-identical rather than
# reconstructed; confirm against the full file. The visible logic: warm up
# the replay buffer with random actions, load saved models and set
# evaluate=True when resuming from a checkpoint (else evaluate=False);
# then run n_episodes of rollouts, learning only when not resuming, and
# save models whenever the 100-episode average score improves.
action = env.action_space.sample() observation_, reward, done, info = env.step(action) agent.remember(observation, action, reward, observation_, done) n_steps += 1 agent.learn() agent.load_models() evaluate = True else: evaluate = False for i in range(n_episodes): observation = env.reset() done = False score = 0 while not done: action = agent.choose_action(observation, evaluate) env.render() observation_, reward, done, info = env.step(action) score += reward agent.remember(observation, action, reward, observation_, done) if not load_checkpoint: agent.learn() observation = observation_ score_history.append(score) avg_score = np.mean(score_history[-100:]) if avg_score > best_score: best_score = avg_score if not load_checkpoint: agent.save_models()
# LunarLander's continuous variant: 8-dim observation, 2-dim action.
env = gym.make('LunarLanderContinuous-v2')
agent = Agent(alpha=0.000025, beta=0.00025, input_dims=[8], tau=0.001,
              env=env, n_actions=2, layer1_dims=400, layer2_dims=300,
              batch_size=64)
np.random.seed(0)

score_history = []
score = 0
n_episodes = 2500
for i in range(n_episodes):
    # Reports the PREVIOUS episode's score (0 on the first print).
    print('episode: ', i, 'score %.3f' % score)
    done = False
    score = 0
    state = env.reset()
    # Roll one full episode, learning after every environment step.
    while not done:
        action = agent.choose_action(state)
        state_, reward, done, info = env.step(action)
        agent.remember(state, action, reward, state_, int(done))
        agent.learn()
        state = state_
        score += reward
    score_history.append(score)
    # Stream the episode score to Weights & Biases.
    wandb.log({"score": score})