Example #1
# Imports assumed by this example; the Agent import path follows Example #2.
import gym
import numpy as np
from ddpg import Agent


def main():
    action_high = 2
    action_low = -2
    action_high = np.array([action_high])
    action_low = np.array([action_low])
    buffer_size = 100000
    minibatch_size = 256
    num_episode = 500

    env = gym.make("Pendulum-v0")
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    agent = Agent(state_size, action_size, buffer_size, minibatch_size,
                  action_high, action_low)
    reward_list = []
    for i_episode in range(num_episode):
        print("episode: %d" % i_episode)
        state = env.reset()
        total_reward = 0
        # env.spec.timestep_limit is the per-episode step cap in older gym
        # releases (newer versions expose it as env.spec.max_episode_steps)
        for t_timesteps in range(env.spec.timestep_limit):
            env.render()
            action = agent.choose_action(state)
            next_state, reward, done, info = env.step(action)
            total_reward += reward
            transition = [state, action, next_state, reward, done]
            agent.train(transition)
            state = next_state
            if (done or t_timesteps == env.spec.timestep_limit - 1):
                print("Episode finish---time steps: %d" % t_timesteps)
                print("total reward: %d" % total_reward)
                reward_list.append(total_reward)
                break
    np.save('reward', reward_list)


if __name__ == '__main__':
    main()
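
Example #1 hands action_high and action_low to the Agent, presumably so the agent can rescale its bounded policy output into the environment's torque range. A minimal sketch of that rescaling, under the assumption that the policy emits a tanh-squashed action in [-1, 1] (the Agent internals are not shown in the example):

import numpy as np

def scale_action(raw_action, low, high):
    # Map a tanh-squashed action in [-1, 1] onto the env's [low, high] range.
    # Hypothetical helper, not part of the example's Agent class.
    return low + (raw_action + 1.0) * 0.5 * (high - low)

# For Pendulum-v0, whose torque range is [-2, 2]:
scale_action(np.array([0.5]), np.array([-2.0]), np.array([2.0]))  # -> array([1.])
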
Example #2
from ddpg import Agent
import gym
import numpy as np

env = gym.make('LunarLanderContinuous-v2')

agent = Agent(alpha=0.000025, beta=0.00025, input_dims=[8], tau=0.001,
              env=env, batch_size=64, layer1_size=400, layer2_size=300,
              n_actions=2)

np.random.seed(42)
score_history = []

for i in range(1000):
	done = False
	score = 0 
	obs = env.reset()
	while not done:
		act = agent.choose_action(obs)
		new_state, reward, done, info = env.step(act)
		agent.remember(obs, act, reward, new_state, int(done))
		agent.learn()
		score += reward
		obs = new_state

	score_history.append(score)
	print("Episode - {} Score - {} 100 game average {}".format(i, score, np.mean(score_history[-100:])))

	if i % 25 == 0:
		agent.save_models()

filename = l
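
Example #2 seeds NumPy only; if reproducibility matters, the environment and its action space can be seeded as well. A short sketch under the assumption that the pre-0.26 gym API used throughout these examples is available (the torch line applies only if the Agent's networks are PyTorch modules):

import gym
import numpy as np

def seed_everything(env, seed=42):
    # Seed NumPy (replay sampling / exploration noise, if they use NumPy),
    # the environment dynamics and the action-space sampler.
    np.random.seed(seed)
    env.seed(seed)               # old gym API, matching the 4-tuple env.step used here
    env.action_space.seed(seed)
    # torch.manual_seed(seed)   # if the agent's networks are implemented in PyTorch

env = gym.make('LunarLanderContinuous-v2')
seed_everything(env, seed=42)
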
Example #3
#agent.load_models()
np.random.seed(0)

score_history = []
for i in range(100000):
    env_params = env.reset()
    obs = env_params['observation']  # remove 'observation' indexing for envs with no dict
    d_goal = env_params['desired_goal']
    net_input = np.hstack((obs, d_goal))
    # print(obs.shape)
    done = False
    score = 0
    while not done:
        act = agent.choose_action(net_input)
        # print(act)
        new_state, reward, done, info = env.step(act)
        new_state = np.hstack((new_state['observation'],
                               new_state['desired_goal']))  # remove 'observation' indexing for envs with no dict
        agent.remember(net_input, act, reward, new_state, int(done))
        agent.learn()
        score += reward
        net_input = new_state
        #env.render()
    score_history.append(score)

    if i % 25 == 0:
        agent.save_models()
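
Example #3 trains against a goal-conditioned environment whose reset/step return dict observations, and its two inline comments describe stripping the dict for plain envs. A small helper capturing that pattern (hypothetical; the source simply inlines the np.hstack calls):

import numpy as np

def flatten_goal_obs(env_obs):
    # Concatenate a goal-conditioned gym observation into one flat vector;
    # pass plain-array observations through unchanged (the "no dict" case).
    if isinstance(env_obs, dict):
        return np.hstack((env_obs['observation'], env_obs['desired_goal']))
    return env_obs
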
Example #4

from collections import deque

# MAX_EPISODES, MAX_STEPS, action_scale, learner and exploration are defined
# in the part of the original script that is not shown here.
episode_history = deque(maxlen=100)
for i in range(MAX_EPISODES):

    # initialize
    state = env.reset()
    total_rewards = 0

    noise = exploration(0.0, 0.2, MAX_STEPS)

    for t in range(MAX_STEPS):
        env.render()

        # Add noise and make sure action stays within bounds
        action = learner.choose_action(state)
        action = np.clip(action + noise[t], -action_scale, action_scale)

        next_state, reward, done, _ = env.step(action)
        next_state = next_state.flatten()

        # Note how reward is scaled for monitoring purposes
        reward = reward / 10.

        total_rewards += reward

        learner.update_buffer(state, action, reward, next_state, done)

        state = next_state

        # Fill up some of the experience replay memory before trying to learn
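
exploration is not defined in this fragment; from the call exploration(0.0, 0.2, MAX_STEPS) and the per-step indexing noise[t], it appears to return one additive noise sample per step. A minimal Gaussian stand-in consistent with that usage (the original may well use Ornstein-Uhlenbeck noise instead, as many DDPG implementations do):

import numpy as np

def exploration(mu, sigma, n_steps):
    # One additive exploration-noise sample per environment step.
    # Plain Gaussian sketch; temporally correlated OU noise is a common alternative.
    return np.random.normal(mu, sigma, size=n_steps)
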
Example #5

# Tail of an Agent(alpha=..., beta=..., ...) call; the opening of this
# example is not included in the excerpt, only these keyword arguments:
              input_dims=[3],
              tau=0.001,
              env=env,
              n_actions=1)

np.random.seed(0)
score_history = []

for episode in range(1000):

    state = env.reset()
    done = False
    score = 0

    while not done:
        action = agent.choose_action(state)

        next_state, reward, done, info = env.step(action)
        agent.remember(state, action, reward, next_state, int(done))

        agent.learn()

        score += reward
        state = next_state

    score_history.append(score)
    print('Episode {}, Score: {:.2f}, 100 game average: {:.2f}'.format(
        episode, score, np.mean(score_history[-100:])))

filename = 'pendulum.png'
plotLearning(score_history, filename, window=100)
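
plotLearning comes from the project's utility module and is not shown in any of these examples; a minimal stand-in with the same call signature, plotting each episode's score together with its window-episode running average, might look like this (an assumption about what the real helper produces):

import numpy as np
import matplotlib.pyplot as plt

def plotLearning(scores, filename, window=100):
    # Plot raw episode scores and their running mean over `window` episodes.
    scores = np.asarray(scores, dtype=float)
    running_avg = np.array([scores[max(0, i - window + 1):i + 1].mean()
                            for i in range(len(scores))])
    plt.plot(scores, alpha=0.4, label='score')
    plt.plot(running_avg, label='%d-episode average' % window)
    plt.xlabel('episode')
    plt.ylabel('score')
    plt.legend()
    plt.savefig(filename)
    plt.close()
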
Example #6
            action = env.action_space.sample()
            observation_, reward, done, info = env.step(action)
            agent.remember(observation, action, reward, observation_, done)
            n_steps += 1
        agent.learn()
        agent.load_models()
        evaluate = True
    else:
        evaluate = False

    for i in range(n_episodes):
        observation = env.reset()
        done = False
        score = 0
        while not done:
            action = agent.choose_action(observation, evaluate)
            env.render()
            observation_, reward, done, info = env.step(action)
            score += reward
            agent.remember(observation, action, reward, observation_, done)
            if not load_checkpoint:
                agent.learn()
            observation = observation_

        score_history.append(score)
        avg_score = np.mean(score_history[-100:])

        if avg_score > best_score:
            best_score = avg_score
            if not load_checkpoint:
                agent.save_models()
Example #7
    env = gym.make('LunarLanderContinuous-v2')
    agent = Agent(alpha=0.000025,
                  beta=0.00025,
                  input_dims=[8],
                  tau=0.001,
                  env=env,
                  n_actions=2,
                  layer1_dims=400,
                  layer2_dims=300,
                  batch_size=64)

    np.random.seed(0)
    score_history = []
    score = 0
    n_episodes = 2500

    for i in range(n_episodes):
        print('episode: ', i, 'score %.3f' % score)
        done = False
        score = 0
        observation = env.reset()
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            agent.remember(observation, action, reward, observation_,
                           int(done))
            agent.learn()
            observation = observation_
            score += reward
        score_history.append(score)
        wandb.log({"score": score})
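
This last fragment logs each score to Weights & Biases, but the wandb import and run initialization sit in the part of the script that is not shown. A typical setup consistent with the wandb.log call (the project name is a placeholder; the config values just echo the hyperparameters passed to Agent above):

import wandb

# Start a W&B run before the training loop so wandb.log has somewhere to go.
wandb.init(project='ddpg-lunarlander',  # placeholder project name
           config={'alpha': 0.000025, 'beta': 0.00025,
                   'tau': 0.001, 'batch_size': 64})
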