Example #1
    def random_play_and_save(self, env, current_state, act_list):

        # Decide action
        action_index = env.action_space.sample()

        # Check if all previous actions are NOOP
        all_zeros = all(p == 0 for p in act_list)

        # Change action if all previous actions are NOOP and current is NOOP
        if all_zeros and action_index == 0:
            action_index = random.randint(1, env.action_space.n - 1)

        # Build action vector
        action = np.eye(self.action_size, dtype=np.int8)[action_index]

        # Advance the game to the next state based on the action.
        obs, reward, is_done, _ = env.step(action_index)

        # Pre-process observation
        obs = preprocess(obs)

        # Build next state
        next_state = get_next_state(current_state, obs)

        # Pre-process reward
        transformed_reward = transform_reward(reward)

        # Remember the previous state, action, reward, and done
        self.memory.append((current_state, action, transformed_reward, next_state, is_done))

        return next_state, reward, is_done, action_index
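random_play_and_save only plays a single step; a warm-up phase that pre-fills the replay memory before learning would typically call it in a loop. The following is a minimal sketch only, assuming env and agent are set up as in the later examples; RANDOM_WARMUP_STEPS and the reset bookkeeping are assumptions, not part of the original code.

# Hypothetical warm-up loop (sketch only, not from the original project)
RANDOM_WARMUP_STEPS = 50000

obs = preprocess(env.reset())
current_state = np.array([[obs, obs, obs, obs]], dtype=np.uint8).reshape((105, 80, 4))
act_list = []
for _ in range(RANDOM_WARMUP_STEPS):
    # Take one random step and store the transition in the replay memory
    current_state, reward, is_done, action_index = agent.random_play_and_save(env, current_state, act_list)
    act_list.append(action_index)
    if is_done:
        # Start a new episode with the same frame stacked four times
        obs = preprocess(env.reset())
        current_state = np.array([[obs, obs, obs, obs]], dtype=np.uint8).reshape((105, 80, 4))
        act_list = []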
Example #2
    def q_iteration(self, env, current_state, act_list):

        # Choose the action
        if random.random() < self.epsilon:
            action_index = env.action_space.sample()

        else:
            action_index = self.choose_best_action(current_state)

        # Check if all previous actions are NOOP
        all_zeros = all(p == 0 for p in act_list)

        # Change action if all previous actions are NOOP and current is NOOP
        if all_zeros and action_index == 0:
            action_index = random.randint(1, env.action_space.n - 1)

        # Build action vector
        action = np.eye(self.action_size, dtype=np.int8)[action_index]

        # Play one game iteration
        obs, reward, is_done, _ = env.step(action_index)

        # Pre-process observation
        obs = preprocess(obs)

        # Build next state
        next_state = get_next_state(current_state, obs)

        # Pre-process reward
        transformed_reward = transform_reward(reward)

        # Remember the previous state, action, reward, and done
        self.memory.append((current_state, action, transformed_reward, next_state, is_done))

        # Sample and fit
        batch = self.memory.sample_batch(self.batch_size)
        self.fit_batch(batch)

        return next_state, reward, is_done, action_index
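q_iteration delegates the learning step to fit_batch, which this page does not show. The following is only a sketch of what a standard DQN update inside fit_batch could look like, assuming the sampled batch unpacks into the five fields stored in memory and that the agent exposes self.model (a Keras model) and a discount factor self.gamma; none of these names are confirmed by the examples.

    def fit_batch(self, batch):
        # Sketch of a standard DQN update (not the project's actual implementation).
        # Assumes sample_batch returns the five stored fields as separate arrays.
        start_states, actions, rewards, next_states, is_done = batch

        # Greedy value of the next state: max_a' Q(s', a')
        next_q_values = self.model.predict(np.asarray(next_states, dtype=np.float32))
        max_next_q = np.max(next_q_values, axis=1)

        # Bellman target r + gamma * max_a' Q(s', a'), without bootstrapping on terminal transitions
        rewards = np.asarray(rewards, dtype=np.float32)
        targets = rewards + self.gamma * max_next_q * (1.0 - np.asarray(is_done, dtype=np.float32))

        # Only the Q-value of the action that was actually taken is moved towards the target
        q_values = self.model.predict(np.asarray(start_states, dtype=np.float32))
        q_values[np.arange(len(targets)), np.argmax(np.asarray(actions), axis=1)] = targets
        self.model.fit(np.asarray(start_states, dtype=np.float32), q_values, epochs=1, verbose=0)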
Example #3
    def nonrandom_play_and_save(self, env, current_state, act_list):
        """This method is designed to be used when you need to continue training after a (initial)training session is
        finished.
        I have found that initializing the memory with random play in that situation can make the agent to diverge."""

        # Choose the action according to the behaviour policy
        if random.random() < 0.05:
            action_index = env.action_space.sample()
        else:
            action_index = self.choose_best_action(current_state)

        # Check if all previous actions are NOOP
        all_zeros = all(p == 0 for p in act_list)

        # Change action if all previous actions are NOOP and current is NOOP
        if all_zeros and action_index == 0:
            action_index = random.randint(1, env.action_space.n - 1)

        # Build action vector
        action = np.eye(self.action_size, dtype=np.int8)[action_index]

        # Advance the game to the next state based on the action.
        obs, reward, is_done, _ = env.step(action_index)

        # Pre-process observation
        obs = preprocess(obs)

        # Build next state
        next_state = get_next_state(current_state, obs)

        # Pre-process reward
        transformed_reward = transform_reward(reward)

        # Remember the previous state, action, reward, and done
        self.memory.append((current_state, action, transformed_reward, next_state, is_done))

        return next_state, reward, is_done, action_index
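All three methods defer greedy action selection to choose_best_action, which is also not shown here. A minimal sketch, assuming self.model maps a batch of stacked-frame states to one Q-value per action; the exact model interface is an assumption.

    def choose_best_action(self, current_state):
        # Sketch only: the greedy action is the argmax over the predicted Q-values
        q_values = self.model.predict(current_state[np.newaxis, ...].astype(np.float32))
        return int(np.argmax(q_values[0]))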
Example #4
import random

import gym
import numpy as np
from DQL_agents_preprocessing import preprocess, transform_reward, get_next_state
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
# DQLAgent is assumed to be imported from the project's agent module

# initialize gym environment and the agent
env = gym.make('PongDeterministic-v4')
agent = DQLAgent('pong')
agent.model.load_weights('DQN_pong_weights_13000000.hdf5')
# To test transfer just change the weights
# agent.model.load_weights('DQN_breakout_weights_20000000.hdf5')

returns = []
episode_return = 0
for episode in xrange(100):
    # Observe reward and initialize first state
    obs = preprocess(env.reset())

    # Initialize the first state with the same 4 images
    current_state = np.array([[obs, obs, obs, obs]], dtype=np.uint8).reshape((105, 80, 4))

    for time_step in xrange(20000):
        # print "episode:", e, "time_step:", time_step

        # turn this on if you want to render
        # env.render()

        # Choose the action according to the behaviour policy
        if random.random() < 0.05:
            action_index = env.action_space.sample()
        else:
            action_index = agent.choose_best_action(current_state)
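The snippet is cut off right after the action is chosen. Below is a sketch of how the rest of the evaluation step could look, following the same pattern as the training methods above; the bookkeeping with episode_return and returns is an assumption based on the variables initialized at the top of the script.

        # Sketch of the remainder of the evaluation step (not part of the original snippet)
        obs, reward, is_done, _ = env.step(action_index)
        episode_return += reward

        # Update the stacked-frame state from the new observation
        obs = preprocess(obs)
        current_state = get_next_state(current_state, obs)

        if is_done:
            returns.append(episode_return)
            episode_return = 0
            break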
Example #5
# Get data from OpenAI
env = gym.make('BreakoutDeterministic-v4')

# Define agent
agent = DQLAgent('breakout')
agent.model.load_weights('DQN_breakout_weights12000000.hdf5')

frame_counter = 0

# Generate random indexes to shuffle database
random_indexes = np.arange(MAX_STATES)
np.random.shuffle(random_indexes)

for episode in xrange(MAX_EPISODES):
    raw_obs = env.reset()
    obs = preprocess(raw_obs)
    # Initialize the first state with the same 4 images
    current_state = np.array([[obs, obs, obs, obs]], dtype=np.uint8).reshape((105, 80, 4))

    for t in xrange(MAX_EPISODE_STATES):
        # run environment
        # env.render()

        # Choose the action according to the behaviour policy
        if random.random() < 0.4:
            action_index = env.action_space.sample()
        else:
            action_index = agent.choose_best_action(current_state)

        # Play one game iteration
        raw_obs, reward, is_done, _ = env.step(action_index)
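Both scripts lean on the helpers imported from DQL_agents_preprocessing, whose implementations are not included on this page. The following is a rough sketch of what such helpers commonly do in DQN Atari code (downsample to a 105x80 single-channel frame, clip rewards, slide the 4-frame window); the actual project code may differ.

def preprocess(frame):
    # Sketch: downsample the 210x160x3 Atari frame by 2 and keep a single channel
    return frame[::2, ::2, 0].astype(np.uint8)  # shape (105, 80)

def transform_reward(reward):
    # Sketch: clip the reward to -1, 0 or +1, the usual DQN trick
    return np.sign(reward)

def get_next_state(current_state, obs):
    # Sketch: drop the oldest of the 4 stacked frames and append the newest one
    return np.append(current_state[:, :, 1:], obs[:, :, np.newaxis], axis=2)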