Example 1
 def build_dqn(self):
     """
     # TODO
         You need to build your DQN here
         and load the pre-trained model saved as './best_model.ckpt'.
         For example:
             saver.restore(self.sess, './best_model.ckpt')
     """
     self.dqn = dqn.DeepQNetwork(len(self.min_action_set), "./", self.args)
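A minimal sketch of the restore step the TODO asks for, assuming a TensorFlow 1.x setup in which the agent already holds a session in self.sess (an assumption; the session is not shown in the excerpt):

    # Sketch only: assumes `import tensorflow as tf` and that self.sess already exists.
    def build_dqn(self):
        self.dqn = dqn.DeepQNetwork(len(self.min_action_set), "./", self.args)
        saver = tf.train.Saver()                       # covers all saveable variables
        saver.restore(self.sess, './best_model.ckpt')  # load the pre-trained weights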
Example 2
    def __init__(self, env):
        """An agent that maximizes its score using deep Q-learning.

        Args:
            env: An AtariWrapper object (see 'environment.py') that wraps over an OpenAI Gym
                environment.
        """

        self.env = env
        self.dqn = dqn.DeepQNetwork(env.state_shape, env.num_actions)
Example 3
    def __init__(self, env, num_hidden_units):
        """An agent that maximizes its score using deep Q-learning.

        Args:
            env: An EnvironmentWrapper object (see 'environment.py') that wraps over an OpenAI Gym
                environment.
            num_hidden_units: Number of units in the hidden layer of the network.
        """

        self.env = env
        self.dqn = dqn.DeepQNetwork(env.num_features, num_hidden_units, env.num_actions)
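The (num_features, num_hidden_units, num_actions) signature suggests a fully connected network with a single hidden layer. A hypothetical TensorFlow 1.x stand-in with the same constructor signature (a sketch, not the repository's actual DeepQNetwork implementation):

import tensorflow as tf

class TinyQNetwork(object):
    """Hypothetical stand-in with the same constructor signature as DeepQNetwork."""

    def __init__(self, num_features, num_hidden_units, num_actions):
        self.x = tf.placeholder(tf.float32, [None, num_features], name='State')
        hidden = tf.layers.dense(self.x, num_hidden_units, activation=tf.nn.relu)
        # One Q-value estimate per action for each input state.
        self.action_values = tf.layers.dense(hidden, num_actions)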
Example 4
)
parser.add_argument("--model",
                    help="tensorflow model checkpoint file to initialize from")
parser.add_argument("rom", help="rom file to run")
args = parser.parse_args()

print('Arguments: %s' % args)

baseOutputDir = 'game-out-' + time.strftime("%Y-%m-%d-%H-%M-%S")
os.makedirs(baseOutputDir)

State.setup(args)

environment = AtariEnvironment(args, baseOutputDir)

dqn = dqn.DeepQNetwork(environment.getNumActions(), baseOutputDir, args)

replayMemory = replay.ReplayMemory(args)


def runEpoch(minEpochSteps, evalWithEpsilon=None):
    stepStart = environment.getStepNumber()
    isTraining = evalWithEpsilon is None
    startGameNumber = environment.getGameNumber()
    epochTotalScore = 0

    while environment.getStepNumber() - stepStart < minEpochSteps:

        startTime = lastLogTime = time.time()
        stateReward = 0
        state = None
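The excerpt breaks off inside the epoch loop. For context, a function like runEpoch is typically invoked alternately for training and evaluation; a hypothetical driver (the epoch count, step counts, and evaluation epsilon below are made-up values, not taken from the snippet):

for epoch in range(100):                    # hypothetical epoch count
    runEpoch(250000)                        # train: evalWithEpsilon is None, so isTraining is True
    runEpoch(125000, evalWithEpsilon=0.05)  # evaluate with a small, fixed exploration rate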
Example 5
base_output_dir = 'run-out-' + time.strftime("%Y-%m-%d-%H-%M-%S")
os.makedirs(base_output_dir)

tensorboard_dir = base_output_dir + "/tensorboard/"
os.makedirs(tensorboard_dir)
summary_writer = tf.summary.create_file_writer(tensorboard_dir)
with summary_writer.as_default():
    tf.summary.text('params', str(args), step=0)

State.setup(args)

environment = CarEnv(args)
replay_memory = replay.ReplayMemory(base_output_dir, args)
dqn = dqn.DeepQNetwork(environment.get_num_actions(),
                       environment.get_state_size(), replay_memory,
                       base_output_dir, tensorboard_dir, args)

train_epsilon = args.epsilon  # don't reset epsilon between epochs
start_time = datetime.datetime.now()
train_episodes = 0
eval_episodes = 0
episode_train_reward_list = []
episode_eval_reward_list = []

#################################
# stop handler
#################################

stop = False
pause = False
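The excerpt ends just as a stop handler is being set up. A minimal sketch of how the stop flag is typically wired to a signal handler (an assumption; the actual handler code is not shown):

import signal

def stop_handler(signum, frame):
    # Ask the training loop to finish the current episode and exit cleanly.
    global stop
    print('Stop requested; finishing the current episode...')
    stop = True

signal.signal(signal.SIGINT, stop_handler)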
Example 6
    def __init__(self, env, start_epsilon, end_epsilon, anneal_duration,
                 train_interval, target_network_reset_interval, batch_size,
                 learning_rate, max_gradient_norm, discount):
        """An agent that learns to play Atari games using deep Q-learning.

        Args:
            env: An AtariWrapper object (see 'environment.py') that wraps over an OpenAI Gym Atari
                environment.
            start_epsilon: Initial value for epsilon (exploration chance) used when training.
            end_epsilon: Final value for epsilon (exploration chance) used when training.
            anneal_duration: Number of time steps needed to decrease epsilon from start_epsilon to
                end_epsilon when training.
            train_interval: Number of experiences to accumulate before another round of training
                starts.
            target_network_reset_interval: Rate at which target Q-network values reset to actual
                Q-network values. Using a delayed target Q-network improves training stability.
            batch_size: Number of experiences sampled and trained on at once.
            learning_rate: The speed with which the network learns from new examples.
            max_gradient_norm: Maximum value allowed for the L2-norms of gradients. Gradients with
                norms that would otherwise surpass this value are scaled down.
            discount: Discount factor for future rewards.
        """

        self.env = env
        self.dqn = dqn.DeepQNetwork(env.state_shape, env.num_actions)
        self.start_epsilon = start_epsilon
        self.end_epsilon = end_epsilon
        self.anneal_duration = anneal_duration
        self.train_interval = train_interval
        self.target_network_reset_interval = target_network_reset_interval
        self.batch_size = batch_size
        self.discount = discount
        self.time_step = 0
        self.episodes_played = 0
        self.epsilon = self._get_epsilon()

        # Create target Q-network.
        dqn_params = tf.trainable_variables()
        self.target_dqn = dqn.DeepQNetwork(env.state_shape, env.num_actions)
        target_dqn_params = tf.trainable_variables()[len(dqn_params):]

        # Reset target Q-network values to the actual Q-network values.
        self.reset_target_dqn = [
            old.assign(new) for old, new in zip(target_dqn_params, dqn_params)
        ]

        # Define the optimization scheme for the deep Q-network.
        self.reward = tf.placeholder(tf.float32, [None],
                                     name='Observed_Reward')
        self.ongoing = tf.placeholder(tf.bool, [None],
                                      name='State_Is_Nonterminal')

        # Determine the true action values using double Q-learning (van Hasselt et al., 2015):
        # estimate optimal actions using the Q-network, but estimate their values using the
        # (delayed) target Q-network. This reduces the likelihood that Q is overestimated.
        next_optimal_action_value = self.target_dqn.estimated_action_value
        observed_action_value = tf.stop_gradient(
            self.reward + tf.cast(self.ongoing, tf.float32) * discount *
            next_optimal_action_value)

        # Compute the loss function and regularize it by clipping the norm of its gradients.
        loss = tf.nn.l2_loss(self.dqn.estimated_action_value -
                             observed_action_value)
        gradients = tf.gradients(loss, dqn_params)
        clipped_gradients, _ = tf.clip_by_global_norm(gradients,
                                                      max_gradient_norm)

        # Perform gradient descent.
        grads_and_vars = list(zip(clipped_gradients, dqn_params))
        self.global_step = tf.Variable(tf.constant(0, tf.int64),
                                       False,
                                       name='Global_Step')
        self.train_step = tf.train.AdamOptimizer(
            learning_rate).apply_gradients(grads_and_vars, self.global_step)
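self._get_epsilon() is referenced above but not included in the excerpt. Given the documented start_epsilon, end_epsilon, and anneal_duration arguments, a linear annealing schedule is the usual choice; a hypothetical sketch (not necessarily the file's actual implementation):

    def _get_epsilon(self):
        # Linearly anneal epsilon from start_epsilon to end_epsilon over
        # anneal_duration time steps, then hold it at end_epsilon.
        if self.anneal_duration <= 0:
            return self.end_epsilon
        fraction = min(float(self.time_step) / self.anneal_duration, 1.0)
        return self.start_epsilon + fraction * (self.end_epsilon - self.start_epsilon)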
Example 7
import numpy as np   # used below for the rolling state buffer
import Bot2DWrapper  # provides the Bot2DEnv environment used below
import dqn
import matplotlib.pyplot as plt
import json
#%%
seq_size = 3
memory_size = 1000
env = Bot2DWrapper.Bot2DEnv(obs_size=64,
                            grid_size=3,
                            map_path="Image/map7.png")

RL = dqn.DeepQNetwork(
    n_actions=3,
    feature_size=[64, 64, seq_size],
    sensor_size=60,
    learning_rate=2e-4,
    reward_decay=0.95,
    e_greedy=0.98,
    replace_target_iter=100,
    memory_size=memory_size,
    e_greedy_increment=0.0001,
)
#%%
if __name__ == '__main__':
    total_step = 0
    state_m_rec = np.zeros([64, 64, seq_size], np.float32)
    reward_rec = []
    for eps in range(250):
        print('[ Episode ' + str(eps) + ' ]')
        state_m, state_s = env.reset()
        step = 0
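state_m_rec above is a rolling buffer of the last seq_size map observations, matching feature_size=[64, 64, seq_size]. A sketch of how such a buffer is typically updated each step, assuming state_m is a single 64x64 array (an assumption based on obs_size=64):

# Drop the oldest channel and append the newest observation on the channel axis.
state_m_rec = np.concatenate(
    [state_m_rec[:, :, 1:], state_m[:, :, np.newaxis]], axis=2)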
Example 8
import GSlamBot2DWrapper  # provides the Bot2DEnv environment used below
import dqn
import matplotlib.pyplot as plt
import json
import cv2
import models
#%%
env = GSlamBot2DWrapper.Bot2DEnv(obs_size=128,
                                 grid_size=3,
                                 map_path="Image/map9.png")
memory_size = 800
RL = dqn.DeepQNetwork(
    qnet=models.QNetNavMap,
    n_actions=3,
    learning_rate=2e-4,
    reward_decay=0.95,
    replace_target_iter=100,
    memory_size=memory_size,
    batch_size=64,
    e_greedy=0.95,
    e_greedy_increment=0.00004,
)
#%%
seq_size = 3
if __name__ == '__main__':
    total_step = 0
    reward_rec = []
    learn_count = 0
    for eps in range(400):
        state = env.reset()
        state_m = cv2.resize(state["map"], (64, 64),
                             interpolation=cv2.INTER_LINEAR)
Example 9
    def __init__(self,
                 env,
                 start_epsilon,
                 end_epsilon,
                 anneal_duration,
                 train_interval,
                 target_network_reset_interval,
                 batch_size,
                 num_hidden_units,
                 initial_learning_rate,
                 learning_rate_decay_factor,
                 learning_rate_decay_frequency,
                 max_gradient_norm,
                 discount):
        """An agent that learns to maximize its score using deep Q-learning.

        Args:
            env: An EnvironmentWrapper object (see 'environment.py') that wraps over an OpenAI Gym
                environment.
            start_epsilon: Initial value for epsilon (exploration chance) used when training.
            end_epsilon: Final value for epsilon (exploration chance) used when training.
            anneal_duration: Number of time steps needed to decrease epsilon from start_epsilon to
                end_epsilon when training.
            train_interval: Number of experiences to accumulate before another round of training
                starts.
            target_network_reset_interval: Rate at which target Q-network values reset to actual
                Q-network values. Using a delayed target Q-network improves training stability.
            batch_size: Number of experiences sampled and trained on at once.
            num_hidden_units: Number of units in the hidden layer of the network.
            initial_learning_rate: Initial speed with which the network learns from new examples.
            learning_rate_decay_factor: The value with which the learning rate is multiplied when it
                decays.
            learning_rate_decay_frequency: The frequency (measured in training steps) at which the
                learning rate is reduced.
            max_gradient_norm: Maximum value allowed for the L2-norms of gradients. Gradients with
                norms that would otherwise surpass this value are scaled down.
            discount: Discount factor for future rewards.
        """

        self.env = env
        self.dqn = dqn.DeepQNetwork(env.num_features, num_hidden_units, env.num_actions)
        self.start_epsilon = start_epsilon
        self.end_epsilon = end_epsilon
        self.anneal_duration = anneal_duration
        self.train_interval = train_interval
        self.target_network_reset_interval = target_network_reset_interval
        self.batch_size = batch_size
        self.time_step = 0
        self.episodes_played = 0
        self.epsilon = self._get_epsilon()

        # Create target Q-network.
        dqn_params = tf.trainable_variables()
        self.target_dqn = dqn.DeepQNetwork(env.num_features, num_hidden_units, env.num_actions)
        target_dqn_params = tf.trainable_variables()[len(dqn_params):]

        # Reset target Q-network values to the actual Q-network values.
        self.reset_target_dqn = [old.assign(new) for old, new in zip(target_dqn_params, dqn_params)]

        # Define the optimization scheme for the deep Q-network.
        self.reward = tf.placeholder(tf.float32, [None], name='Observed_Reward')
        self.ongoing = tf.placeholder(tf.bool, [None], name='State_Is_Nonterminal')

        # Determine the true action values.
        #
        #                    { r, if next state is terminal
        # Q(state, action) = {
        #                    { r + discount * max(Q(next state, <any action>)), otherwise
        next_optimal_action_value = tf.stop_gradient(self.target_dqn.optimal_action_value)
        observed_action_value = (
            self.reward + tf.cast(self.ongoing, tf.float32) * discount * next_optimal_action_value)

        # Compute the loss function and regularize it by clipping the norm of its gradients.
        loss = tf.nn.l2_loss(self.dqn.estimated_action_value - observed_action_value)
        gradients = tf.gradients(loss, dqn_params)
        clipped_gradients, _ = tf.clip_by_global_norm(gradients, max_gradient_norm)

        # Perform gradient descent.
        grads_and_vars = list(zip(clipped_gradients, dqn_params))
        self.global_step = tf.Variable(tf.constant(0, tf.int64), False, name='Global_Step')
        self.learning_rate = tf.train.exponential_decay(initial_learning_rate,
                                                        self.global_step,
                                                        learning_rate_decay_frequency,
                                                        learning_rate_decay_factor,
                                                        staircase=True)
        self.train_step = tf.train.AdamOptimizer(self.learning_rate).apply_gradients(
            grads_and_vars, self.global_step)
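The target defined above is a batched form of the standard Q-learning update shown in the comment. A small numpy illustration of the same expression, with made-up values:

import numpy as np

discount = 0.99
reward = np.array([1.0, 0.0, -1.0])          # observed rewards for a batch of three transitions
ongoing = np.array([True, True, False])      # the last transition is terminal
next_optimal_q = np.array([2.0, 5.0, 7.0])   # max over actions of the target network's Q-values

# Mirrors: reward + cast(ongoing, float32) * discount * next_optimal_action_value
target = reward + ongoing.astype(np.float32) * discount * next_optimal_q
print(target)  # [ 2.98  4.95 -1.  ]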