game.new_episode()
game_state = game.get_state()
misc = game_state.game_variables  # [KILLCOUNT, AMMO, HEALTH]
prev_misc = misc

action_size = game.get_available_buttons_size()

img_rows, img_cols = 64, 64  # Convert the image to 64x64 grayscale.
img_channels = 4             # We stack 4 frames.
state_size = (img_rows, img_cols, img_channels)

agent = DoubleDQNAgent(state_size, action_size)
agent.model = Networks.dueling_dqn(state_size, action_size, agent.learning_rate)
agent.target_model = Networks.dueling_dqn(state_size, action_size, agent.learning_rate)

x_t = game_state.screen_buffer  # 480 x 640
x_t = preprocessImg(x_t, size=(img_rows, img_cols))
s_t = np.stack(([x_t] * 4), axis=2)  # Becomes 64x64x4
s_t = np.expand_dims(s_t, axis=0)    # 1x64x64x4

is_terminated = game.is_episode_finished()

# Start training
epsilon = agent.initial_epsilon
GAME = 0
t = 0
max_life = 0  # Maximum episode life (proxy for agent performance)
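# --- Illustrative sketch (not the project's actual helper): one plausible
# preprocessImg that matches its use above -- convert the screen buffer to
# grayscale and resize it to (img_rows, img_cols). The real implementation may
# crop or normalize differently; skimage is assumed to be available.
import numpy as np
from skimage import color, transform


def preprocessImg_sketch(img, size):
    """Return a grayscale float image of shape `size` from a screen buffer."""
    img = np.asarray(img)
    if img.ndim == 3 and img.shape[0] == 3:   # channels-first (3, H, W) buffer
        img = np.moveaxis(img, 0, -1)         # -> (H, W, 3)
    if img.ndim == 3:
        img = color.rgb2gray(img)             # -> (H, W), values in [0, 1]
    return transform.resize(img, size)        # -> (img_rows, img_cols)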
def main():
    env = make(game='SonicTheHedgehog2-Genesis', state='EmeraldHillZone.Act1')

    # Parameters for observation image size processing.
    img_rows = 128
    img_cols = 128
    img_stack = 4
    action_size = 8  # 8 valid button combinations.

    # Inputs to the agent's prediction network will have the following shape.
    input_size = (img_rows, img_cols, img_stack)

    # File paths.
    stat_path = '../statistics/dqn_n-step'
    model_path = '../models/dqn_n-step'

    # Prioritized Experience Replay.
    if PER_AGENT:
        print('PER agent')
        stat_path += '_PER'
        model_path += '_PER'
        dqn_agent = DQN_PER_Agent(input_size, action_size)
    else:
        dqn_agent = DQN_Agent(input_size, action_size)

    # Use the noisy dueling network.
    if NOISY:
        stat_path += '_noisy_dueling'
        model_path += '_noisy_dueling'
        print('NOISY Dueling agent')
        dqn_agent.main_model = Networks.noisy_dueling_dqn(
            input_size, action_size, dqn_agent.main_lr)
        dqn_agent.target_model = Networks.noisy_dueling_dqn(
            input_size, action_size, dqn_agent.target_lr)
        dqn_agent.noisy = True
    # Use the normal dueling network.
    elif DUELING:
        stat_path += '_dueling'
        model_path += '_dueling'
        print('Dueling agent')
        dqn_agent.main_model = Networks.dueling_dqn(input_size, action_size,
                                                    dqn_agent.main_lr)
        dqn_agent.target_model = Networks.dueling_dqn(input_size, action_size,
                                                      dqn_agent.target_lr)
    # Normal DQN.
    else:
        dqn_agent.main_model = Networks.dqn(input_size, action_size,
                                            dqn_agent.main_lr)
        dqn_agent.target_model = Networks.dqn(input_size, action_size,
                                              dqn_agent.target_lr)

    # Append the correct suffix and file type to the paths.
    stat_path += '_stats.csv'
    main_model_path = model_path + '_main.h5'
    target_model_path = model_path + '_target.h5'

    # Load previous models, or start from freshly instantiated networks.
    if LOAD_MODELS:
        dqn_agent.load_models(main_model_path, target_model_path)

    # Set the starting epsilon value.
    if EPSILON == START:
        dqn_agent.epsilon = dqn_agent.initial_epsilon
    elif EPSILON == MIDDLE:
        # Midpoint of the annealing range.
        dqn_agent.epsilon = (dqn_agent.final_epsilon
                             + (dqn_agent.initial_epsilon
                                - dqn_agent.final_epsilon) / 2)
    else:
        dqn_agent.epsilon = dqn_agent.final_epsilon

    # Store rewards and states from the previous n (state, action) pairs to
    # build n-step experiences.
    prev_n_rewards = deque(maxlen=dqn_agent.n_step)
    prev_n_exp = deque(maxlen=dqn_agent.n_step)

    # One episode is at most 4500 steps if the zone is not completed:
    # roughly 5 minutes of gameplay, since each step spans 4 frames at 60 Hz.
    total_timestep = 0  # Total number of timesteps over all episodes.
    for episode in range(EPISODES):
        done = False
        reward_sum = 0  # Cumulative reward within the episode.
        timestep = 0    # Track timesteps within the episode.

        # Rewards and states must be consecutive to preserve temporal
        # structure, so reset the deques at the start of each episode to
        # account for the sudden scene change.
        prev_n_rewards.clear()
        prev_n_exp.clear()

        # Experiences are a stack of the img_stack most recent frames to
        # provide temporal information. Initialize this sequence to the first
        # observation stacked 4 times.
        first_obs = env.reset()
        processed = preprocess_obs(first_obs, size=(img_rows, img_cols))
        # (img_rows, img_cols, img_stack)
        exp_stack = np.stack(([processed] * img_stack), axis=2)
        # Expand dimensions so multiple exp_stacks can be submitted as a batch:
        # (1, img_rows, img_cols, img_stack).
        exp_stack = np.expand_dims(exp_stack, axis=0)

        # Continue until the end of the zone is reached or 4500 timesteps
        # have passed.
        while not done:
            # Predict an action to take based on the most recent experience.
            #
            # Note that the first dimension of (1, img_rows, img_cols,
            # img_stack) is ignored by the network here, as it represents a
            # batch size of 1.
            act_idx, action = dqn_agent.act(exp_stack)
            obs, reward, done, info = env.step(action)
            # env.render()

            timestep += 1
            total_timestep += 1
            reward_sum += reward

            # Create a 1st dimension for stacking experiences and a 4th for
            # stacking img_stack frames.
            obs = preprocess_obs(obs, size=(img_rows, img_cols))
            obs = np.reshape(obs, (1, img_rows, img_cols, 1))

            # Append the new observation to the front of the stack and remove
            # the oldest (4th) frame.
            exp_stack_new = np.append(obs, exp_stack[:, :, :, :3], axis=3)

            # Save the previous state, selected action, and resulting reward.
            prev_n_rewards.append(reward)
            prev_n_exp.append((exp_stack, act_idx, done))
            exp_stack = exp_stack_new

            # Once sufficient steps have been taken, discount the rewards and
            # save the nth previous experience.
            if len(prev_n_rewards) >= dqn_agent.n_step:
                # Compute the discounted n-step return. Both deques are
                # appended on the right, so index 0 holds the oldest entry;
                # the oldest reward is discounted the least and the most
                # recent reward the most.
                discounted_reward = 0
                for idx in range(len(prev_n_rewards)):
                    prev_reward = prev_n_rewards[idx]
                    discounted_reward += ((dqn_agent.gamma ** idx) * prev_reward)

                # The nth previous (oldest) experience sits at index 0.
                original_state, original_act, _ = prev_n_exp[0]
                # The state reached after those n steps is the freshly updated
                # exp_stack; `done` flags whether that final step ended the
                # episode.
                nth_state, nth_done = exp_stack, done

                # Save the nth previous state and its predicted action together
                # with the discounted sum of rewards and the final state over
                # the next n steps.
                dqn_agent.save_memory(original_state, original_act,
                                      discounted_reward, nth_state, nth_done)

            # In the observation phase, skip training updates and epsilon
            # decrements.
            if total_timestep >= dqn_agent.observation_timesteps:
                # Update the target model with the main model's weights.
                if (total_timestep % dqn_agent.update_target_freq) == 0:
                    dqn_agent.update_target_model()

                # Train the agent on saved experiences.
                if (total_timestep % dqn_agent.timestep_per_train) == 0:
                    dqn_agent.replay_update()
                    dqn_agent.save_models(main_model_path, target_model_path)

                if dqn_agent.epsilon > dqn_agent.final_epsilon:
                    # Decrease epsilon by a fraction of the range so that it
                    # anneals over "exploration_timesteps".
                    dec = ((dqn_agent.initial_epsilon - dqn_agent.final_epsilon)
                           / dqn_agent.exploration_timesteps)
                    dqn_agent.epsilon -= dec

            # print(info)
            print("Episode:", episode, " Timestep:", timestep,
                  " Action:", act_idx, " Episode Reward Sum:", reward_sum,
                  " Epsilon:", dqn_agent.epsilon)

        # At the end of the episode, append the cumulative reward and timestep
        # count to the stats file.
        with open(stat_path, "a") as stats_fd:
            reward_str = ("Episode Cumulative Reward: " + str(reward_sum)
                          + ", Episode Timesteps: " + str(timestep) + ",\n")
            stats_fd.write(reward_str)
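# --- Illustrative helper (not part of the original source): the n-step return
# accumulated by the loop above, written as a standalone function. Rewards are
# assumed to be ordered oldest -> newest, matching prev_n_rewards.
def n_step_return(rewards, gamma):
    """Return sum_{k=0}^{n-1} gamma**k * r_k, with r_0 the oldest reward."""
    return sum((gamma ** k) * r for k, r in enumerate(rewards))


# Example: with gamma = 0.99 and rewards [1.0, 0.0, 2.0] (oldest first), the
# 3-step return is 1.0 + 0.99 * 0.0 + 0.99**2 * 2.0 = 2.9602. The agent then
# bootstraps from the state reached after those 3 steps.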
def main():
    env = make(game='SonicTheHedgehog2-Genesis', state='EmeraldHillZone.Act1')

    # Parameters for observation image size processing.
    img_rows = 128
    img_cols = 128
    img_stack = 4
    action_size = 8  # 8 valid button combinations.

    # Inputs to the agent's prediction network will have the following shape.
    input_size = (img_rows, img_cols, img_stack)

    # File paths.
    stat_path = '../statistics/dqn'
    model_path = '../models/dqn'

    # Prioritized Experience Replay.
    if PER_AGENT:
        print('PER agent')
        stat_path += '_PER'
        model_path += '_PER'
        dqn_agent = DQN_PER_Agent(input_size, action_size)
    # Distributional (C51) agent.
    elif DIST_AGENT:
        stat_path += '_DIST'
        model_path += '_DIST'
        dqn_agent = DistributionalDQN(input_size, action_size)
    else:
        dqn_agent = DQN_Agent(input_size, action_size)

    # Use the noisy dueling network.
    if NOISY:
        stat_path += '_noisy_dueling'
        model_path += '_noisy_dueling'
        print('NOISY Dueling agent')
        dqn_agent.main_model = Networks.noisy_dueling_dqn(
            input_size, action_size, dqn_agent.main_lr)
        dqn_agent.target_model = Networks.noisy_dueling_dqn(
            input_size, action_size, dqn_agent.target_lr)
        dqn_agent.noisy = True
    # Use the dueling distributional (C51) network.
    elif DUELING and DIST_AGENT:
        stat_path += '_dueling'
        model_path += '_dueling'
        print('Dueling distributional')
        dqn_agent.main_model = Networks.dueling_C51(input_size, action_size,
                                                    dqn_agent.main_lr)
        dqn_agent.target_model = Networks.dueling_C51(input_size, action_size,
                                                      dqn_agent.target_lr)
    # Use the normal dueling network.
    elif DUELING:
        stat_path += '_dueling'
        model_path += '_dueling'
        print('Dueling agent')
        dqn_agent.main_model = Networks.dueling_dqn(input_size, action_size,
                                                    dqn_agent.main_lr)
        dqn_agent.target_model = Networks.dueling_dqn(input_size, action_size,
                                                      dqn_agent.target_lr)
    # Use the plain distributional (C51) network.
    elif DIST_AGENT:
        dqn_agent.main_model = Networks.C51(input_size, action_size,
                                            dqn_agent.main_lr)
        dqn_agent.target_model = Networks.C51(input_size, action_size,
                                              dqn_agent.target_lr)
    # Normal DQN.
    else:
        dqn_agent.main_model = Networks.dqn(input_size, action_size,
                                            dqn_agent.main_lr)
        dqn_agent.target_model = Networks.dqn(input_size, action_size,
                                              dqn_agent.target_lr)

    # Append the correct suffix and file type to the paths.
    stat_path += '_stats.csv'
    main_model_path = model_path + '_main.h5'
    target_model_path = model_path + '_target.h5'

    # Load previous models.
    if LOAD_MODELS:
        dqn_agent.load_models(main_model_path, target_model_path)

    # Set the starting epsilon value.
    if EPSILON == START:
        dqn_agent.epsilon = dqn_agent.initial_epsilon
    elif EPSILON == MIDDLE:
        # Midpoint of the annealing range.
        dqn_agent.epsilon = (dqn_agent.final_epsilon
                             + (dqn_agent.initial_epsilon
                                - dqn_agent.final_epsilon) / 2)
    else:
        dqn_agent.epsilon = dqn_agent.final_epsilon

    # One episode is at most 4500 steps if the zone is not completed:
    # roughly 5 minutes of gameplay, since each step spans 4 frames at 60 Hz.
    total_timestep = 0  # Total number of timesteps over all episodes.
    for episode in range(EPISODES):
        done = False
        reward_sum = 0  # Cumulative reward within the episode.
        timestep = 0    # Track timesteps within the episode.

        first_obs = env.reset()

        # Experiences are a stack of the img_stack most recent frames to
        # provide temporal information. Initialize this sequence to the first
        # observation stacked 4 times.
        processed = preprocess_obs(first_obs, size=(img_rows, img_cols))
        # (img_rows, img_cols, img_stack)
        exp_stack = np.stack(([processed] * img_stack), axis=2)
        # Expand dimensions so multiple exp_stacks can be submitted as a batch:
        # (1, img_rows, img_cols, img_stack).
        exp_stack = np.expand_dims(exp_stack, axis=0)

        # Punish the agent for not moving forward.
        prev_state = {}
        steps_stuck = 0

        # Continue until the end of the zone is reached or 4500 timesteps
        # have passed.
        while not done:
            # Predict an action to take based on the most recent experience.
            #
            # Note that the first dimension of (1, img_rows, img_cols,
            # img_stack) is ignored by the network here, as it represents a
            # batch size of 1.
            act_idx, action = dqn_agent.act(exp_stack)
            obs, reward, done, info = env.step(action)
            # env.render()

            # Punish the agent for standing still for too long.
            if prev_state == info:
                steps_stuck += 1
            else:
                steps_stuck = 0
            prev_state = info

            # The position-based reward sum does not include the stagnation
            # punishment.
            reward_sum += reward
            if steps_stuck > 20:
                reward -= 1

            # Track timestep counters.
            timestep += 1
            total_timestep += 1

            obs = preprocess_obs(obs, size=(img_rows, img_cols))
            # Create a 1st dimension for stacking experiences and a 4th for
            # stacking img_stack frames.
            obs = np.reshape(obs, (1, img_rows, img_cols, 1))

            # Append the new observation to the front of the stack and remove
            # the oldest (4th) frame.
            exp_stack_new = np.append(obs, exp_stack[:, :, :, :3], axis=3)

            # Save the experience: <state, action, reward, next_state, done>.
            dqn_agent.save_memory(exp_stack, act_idx, reward, exp_stack_new,
                                  done)
            exp_stack = exp_stack_new

            # In the observation phase, skip training updates and epsilon
            # decrements.
            if total_timestep >= dqn_agent.observation_timesteps:
                # Update the target model with the main model's weights.
                if (total_timestep % dqn_agent.update_target_freq) == 0:
                    dqn_agent.update_target_model()

                # Train the agent on saved experiences.
                if (total_timestep % dqn_agent.timestep_per_train) == 0:
                    dqn_agent.replay_update()
                    dqn_agent.save_models(main_model_path, target_model_path)

                if dqn_agent.epsilon > dqn_agent.final_epsilon:
                    # Decrease epsilon by a fraction of the range so that it
                    # anneals over "exploration_timesteps".
                    dec = ((dqn_agent.initial_epsilon - dqn_agent.final_epsilon)
                           / dqn_agent.exploration_timesteps)
                    dqn_agent.epsilon -= dec

            # print(info)
            print("Episode:", episode, " Timestep:", timestep,
                  " Action:", act_idx, " Episode Reward Sum:", reward_sum,
                  " Epsilon:", dqn_agent.epsilon)

        # At the end of the episode, append the cumulative reward and timestep
        # count to the stats file.
        with open(stat_path, "a") as stats_fd:
            reward_str = ("Episode Cumulative Reward: " + str(reward_sum)
                          + ", Episode Timesteps: " + str(timestep) + ",\n")
            stats_fd.write(reward_str)
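# --- Hedged sketch (an assumption, not the project's actual Networks module):
# a minimal Keras dueling DQN that fits the (img_rows, img_cols, img_stack)
# input and action_size output used by the training loops above. The dueling
# head combines a state-value stream and an advantage stream as
# Q(s, a) = V(s) + A(s, a) - mean_a A(s, a).
from tensorflow.keras import layers, models, optimizers
from tensorflow.keras import backend as K


def dueling_dqn_sketch(input_size, action_size, learning_rate):
    inputs = layers.Input(shape=input_size)
    x = layers.Conv2D(32, 8, strides=4, activation='relu')(inputs)
    x = layers.Conv2D(64, 4, strides=2, activation='relu')(x)
    x = layers.Conv2D(64, 3, strides=1, activation='relu')(x)
    x = layers.Flatten()(x)

    value = layers.Dense(256, activation='relu')(x)
    value = layers.Dense(1)(value)                      # V(s)

    advantage = layers.Dense(256, activation='relu')(x)
    advantage = layers.Dense(action_size)(advantage)    # A(s, a)

    # Combine the streams into per-action Q-values.
    q_values = layers.Lambda(
        lambda streams: streams[0] + streams[1]
        - K.mean(streams[1], axis=1, keepdims=True)
    )([value, advantage])

    model = models.Model(inputs=inputs, outputs=q_values)
    model.compile(loss='mse', optimizer=optimizers.Adam(learning_rate))
    return model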