def build_dqn(self):
    """
    # TODO
    You need to build your DQN here and load the pre-trained model named
    './best_model.ckpt'. For example: saver.restore(self.sess, './best_model.ckpt')
    """
    self.dqn = dqn.DeepQNetwork(len(self.min_action_set), "./", self.args)
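# A minimal sketch of the restore step the TODO above asks for, following its own
# example call. It assumes TensorFlow 1.x imported as `tf`, a session stored on
# `self.sess`, and that DeepQNetwork has already created its variables; those are
# assumptions not shown in this snippet, and `load_pretrained` is a hypothetical
# helper name.
def load_pretrained(self):
    saver = tf.train.Saver()
    saver.restore(self.sess, './best_model.ckpt')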
def __init__(self, env):
    """An agent that maximizes its score using deep Q-learning.

    Args:
        env: An AtariWrapper object (see 'environment.py') that wraps over an
            OpenAI Gym environment.
    """

    self.env = env
    self.dqn = dqn.DeepQNetwork(env.state_shape, env.num_actions)
def __init__(self, env, num_hidden_units):
    """An agent that maximizes its score using deep Q-learning.

    Args:
        env: An EnvironmentWrapper object (see 'environment.py') that wraps over
            an OpenAI Gym environment.
        num_hidden_units: Number of units in the hidden layer of the network.
    """

    self.env = env
    self.dqn = dqn.DeepQNetwork(env.num_features, num_hidden_units, env.num_actions)
)
parser.add_argument("--model", help="tensorflow model checkpoint file to initialize from")
parser.add_argument("rom", help="rom file to run")
args = parser.parse_args()

print('Arguments: %s' % args)

baseOutputDir = 'game-out-' + time.strftime("%Y-%m-%d-%H-%M-%S")
os.makedirs(baseOutputDir)

State.setup(args)

environment = AtariEnvironment(args, baseOutputDir)
dqn = dqn.DeepQNetwork(environment.getNumActions(), baseOutputDir, args)
replayMemory = replay.ReplayMemory(args)


def runEpoch(minEpochSteps, evalWithEpsilon=None):
    stepStart = environment.getStepNumber()
    isTraining = evalWithEpsilon is None
    startGameNumber = environment.getGameNumber()
    epochTotalScore = 0

    while environment.getStepNumber() - stepStart < minEpochSteps:
        startTime = lastLogTime = time.time()
        stateReward = 0
        state = None
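# A hedged sketch of how runEpoch is typically driven once its loop body
# (truncated above) is complete: alternate a training epoch, during which
# epsilon is annealed, with an evaluation epoch at a small fixed epsilon.
# The step counts and the 0.05 evaluation epsilon are illustrative
# assumptions, not values taken from this snippet.
while True:
    runEpoch(250000)                       # train with annealed epsilon
    runEpoch(125000, evalWithEpsilon=.05)  # evaluate with fixed epsilon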
base_output_dir = 'run-out-' + time.strftime("%Y-%m-%d-%H-%M-%S")
os.makedirs(base_output_dir)

tensorboard_dir = base_output_dir + "/tensorboard/"
os.makedirs(tensorboard_dir)
summary_writer = tf.summary.create_file_writer(tensorboard_dir)
with summary_writer.as_default():
    tf.summary.text('params', str(args), step=0)

State.setup(args)

environment = CarEnv(args)
replay_memory = replay.ReplayMemory(base_output_dir, args)
dqn = dqn.DeepQNetwork(environment.get_num_actions(), environment.get_state_size(),
                       replay_memory, base_output_dir, tensorboard_dir, args)

train_epsilon = args.epsilon  # don't want to reset epsilon between epochs
start_time = datetime.datetime.now()
train_episodes = 0
eval_episodes = 0
episode_train_reward_list = []
episode_eval_reward_list = []

#################################
# stop handler
#################################

stop = False
pause = False
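# A minimal sketch of the stop handler the header above introduces, assuming the
# usual SIGINT-based pattern: the first Ctrl+C requests a clean stop at the end of
# the current episode, a second one exits immediately. The handler name and exact
# behaviour are assumptions, not taken from this snippet.
import signal

def stop_handler(signum, frame):
    global stop
    if stop:
        raise KeyboardInterrupt  # second Ctrl+C: give up waiting and exit now
    print('Stop requested; finishing current episode...')
    stop = True

signal.signal(signal.SIGINT, stop_handler)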
def __init__(self, env, start_epsilon, end_epsilon, anneal_duration, train_interval,
             target_network_reset_interval, batch_size, learning_rate, max_gradient_norm,
             discount):
    """An agent that learns to play Atari games using deep Q-learning.

    Args:
        env: An AtariWrapper object (see 'environment.py') that wraps over an OpenAI Gym
            Atari environment.
        start_epsilon: Initial value for epsilon (exploration chance) used when training.
        end_epsilon: Final value for epsilon (exploration chance) used when training.
        anneal_duration: Number of time steps needed to decrease epsilon from start_epsilon
            to end_epsilon when training.
        train_interval: Number of experiences to accumulate before another round of training
            starts.
        target_network_reset_interval: Rate at which target Q-network values reset to actual
            Q-network values. Using a delayed target Q-network improves training stability.
        batch_size: Number of experiences sampled and trained on at once.
        learning_rate: The speed with which the network learns from new examples.
        max_gradient_norm: Maximum value allowed for the L2-norms of gradients. Gradients
            with norms that would otherwise surpass this value are scaled down.
        discount: Discount factor for future rewards.
    """

    self.env = env
    self.dqn = dqn.DeepQNetwork(env.state_shape, env.num_actions)
    self.start_epsilon = start_epsilon
    self.end_epsilon = end_epsilon
    self.anneal_duration = anneal_duration
    self.train_interval = train_interval
    self.target_network_reset_interval = target_network_reset_interval
    self.batch_size = batch_size
    self.discount = discount
    self.time_step = 0
    self.episodes_played = 0
    self.epsilon = self._get_epsilon()

    # Create target Q-network.
    dqn_params = tf.trainable_variables()
    self.target_dqn = dqn.DeepQNetwork(env.state_shape, env.num_actions)
    target_dqn_params = tf.trainable_variables()[len(dqn_params):]

    # Reset target Q-network values to the actual Q-network values.
    self.reset_target_dqn = [
        old.assign(new) for old, new in zip(target_dqn_params, dqn_params)
    ]

    # Define the optimization scheme for the deep Q-network.
    self.reward = tf.placeholder(tf.float32, [None], name='Observed_Reward')
    self.ongoing = tf.placeholder(tf.bool, [None], name='State_Is_Nonterminal')

    # Determine the true action values using double Q-learning (van Hasselt et al., 2015):
    # estimate optimal actions using the Q-network, but estimate their values using the
    # (delayed) target Q-network. This reduces the likelihood that Q is overestimated.
    next_optimal_action_value = self.target_dqn.estimated_action_value
    observed_action_value = tf.stop_gradient(
        self.reward + tf.cast(self.ongoing, tf.float32) * discount * next_optimal_action_value)

    # Compute the loss function and regularize it by clipping the norm of its gradients.
    loss = tf.nn.l2_loss(self.dqn.estimated_action_value - observed_action_value)
    gradients = tf.gradients(loss, dqn_params)
    clipped_gradients, _ = tf.clip_by_global_norm(gradients, max_gradient_norm)

    # Perform gradient descent.
    grads_and_vars = list(zip(clipped_gradients, dqn_params))
    self.global_step = tf.Variable(tf.constant(0, tf.int64), False, name='Global_Step')
    self.train_step = tf.train.AdamOptimizer(
        learning_rate).apply_gradients(grads_and_vars, self.global_step)
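# A minimal sketch of the epsilon schedule referenced above via self._get_epsilon(),
# assuming the linear annealing described in the docstring: epsilon moves from
# start_epsilon to end_epsilon over anneal_duration time steps and then stays at
# end_epsilon. The method body is an assumption; the original is not shown in the
# snippet.
def _get_epsilon(self):
    if self.anneal_duration <= 0:
        return self.end_epsilon
    fraction = min(self.time_step / float(self.anneal_duration), 1.0)
    return self.start_epsilon + fraction * (self.end_epsilon - self.start_epsilon)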
import dqn
import matplotlib.pyplot as plt
import json
import numpy as np      # needed for np.zeros below
import Bot2DWrapper     # provides Bot2DEnv

#%%
seq_size = 3
memory_size = 1000
env = Bot2DWrapper.Bot2DEnv(obs_size=64, grid_size=3, map_path="Image/map7.png")
RL = dqn.DeepQNetwork(
    n_actions=3,
    feature_size=[64, 64, seq_size],
    sensor_size=60,
    learning_rate=2e-4,
    reward_decay=0.95,
    e_greedy=0.98,
    replace_target_iter=100,
    memory_size=memory_size,
    e_greedy_increment=0.0001,
)

#%%
if __name__ == '__main__':
    total_step = 0
    state_m_rec = np.zeros([64, 64, seq_size], np.float32)
    reward_rec = []
    for eps in range(250):
        print('[ Episode ' + str(eps) + ' ]')
        state_m, state_s = env.reset()
        step = 0
import dqn
import matplotlib.pyplot as plt
import json
import cv2
import models
import GSlamBot2DWrapper  # provides Bot2DEnv

#%%
env = GSlamBot2DWrapper.Bot2DEnv(obs_size=128, grid_size=3, map_path="Image/map9.png")
memory_size = 800
RL = dqn.DeepQNetwork(
    qnet=models.QNetNavMap,
    n_actions=3,
    learning_rate=2e-4,
    reward_decay=0.95,
    replace_target_iter=100,
    memory_size=memory_size,
    batch_size=64,
    e_greedy=0.95,
    e_greedy_increment=0.00004,
)

#%%
seq_size = 3
if __name__ == '__main__':
    total_step = 0
    reward_rec = []
    learn_count = 0
    for eps in range(400):
        state = env.reset()
        state_m = cv2.resize(state["map"], (64, 64), interpolation=cv2.INTER_LINEAR)
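# A small, hedged sketch of what the reward_rec bookkeeping above is typically used
# for once training ends: dump the per-episode rewards to JSON and plot the learning
# curve with the matplotlib/json imports already present. The helper name and the
# file names are illustrative assumptions.
def save_and_plot_rewards(reward_rec, json_path="reward_rec.json", fig_path="reward_curve.png"):
    with open(json_path, "w") as f:
        json.dump(reward_rec, f)
    plt.plot(reward_rec)
    plt.xlabel("episode")
    plt.ylabel("total reward")
    plt.savefig(fig_path)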
def __init__(self, env, start_epsilon, end_epsilon, anneal_duration, train_interval,
             target_network_reset_interval, batch_size, num_hidden_units, initial_learning_rate,
             learning_rate_decay_factor, learning_rate_decay_frequency, max_gradient_norm,
             discount):
    """An agent that learns to maximize its score using deep Q-learning.

    Args:
        env: An EnvironmentWrapper object (see 'environment.py') that wraps over an OpenAI Gym
            environment.
        start_epsilon: Initial value for epsilon (exploration chance) used when training.
        end_epsilon: Final value for epsilon (exploration chance) used when training.
        anneal_duration: Number of time steps needed to decrease epsilon from start_epsilon to
            end_epsilon when training.
        train_interval: Number of experiences to accumulate before another round of training
            starts.
        target_network_reset_interval: Rate at which target Q-network values reset to actual
            Q-network values. Using a delayed target Q-network improves training stability.
        batch_size: Number of experiences sampled and trained on at once.
        num_hidden_units: Number of units in the hidden layer of the network.
        initial_learning_rate: Initial speed with which the network learns from new examples.
        learning_rate_decay_factor: The value with which the learning rate is multiplied when
            it decays.
        learning_rate_decay_frequency: The frequency (measured in training steps) at which the
            learning rate is reduced.
        max_gradient_norm: Maximum value allowed for the L2-norms of gradients. Gradients with
            norms that would otherwise surpass this value are scaled down.
        discount: Discount factor for future rewards.
    """

    self.env = env
    self.dqn = dqn.DeepQNetwork(env.num_features, num_hidden_units, env.num_actions)
    self.start_epsilon = start_epsilon
    self.end_epsilon = end_epsilon
    self.anneal_duration = anneal_duration
    self.train_interval = train_interval
    self.target_network_reset_interval = target_network_reset_interval
    self.batch_size = batch_size
    self.time_step = 0
    self.episodes_played = 0
    self.epsilon = self._get_epsilon()

    # Create target Q-network.
    dqn_params = tf.trainable_variables()
    self.target_dqn = dqn.DeepQNetwork(env.num_features, num_hidden_units, env.num_actions)
    target_dqn_params = tf.trainable_variables()[len(dqn_params):]

    # Reset target Q-network values to the actual Q-network values.
    self.reset_target_dqn = [old.assign(new) for old, new in zip(target_dqn_params, dqn_params)]

    # Define the optimization scheme for the deep Q-network.
    self.reward = tf.placeholder(tf.float32, [None], name='Observed_Reward')
    self.ongoing = tf.placeholder(tf.bool, [None], name='State_Is_Nonterminal')

    # Determine the true action values.
    #
    #                    { r,                                                 if next state is terminal
    # Q(state, action) = {
    #                    { r + discount * max(Q(next state, <any action>)),  otherwise
    next_optimal_action_value = tf.stop_gradient(self.target_dqn.optimal_action_value)
    observed_action_value = (
        self.reward + tf.cast(self.ongoing, tf.float32) * discount * next_optimal_action_value)

    # Compute the loss function and regularize it by clipping the norm of its gradients.
    loss = tf.nn.l2_loss(self.dqn.estimated_action_value - observed_action_value)
    gradients = tf.gradients(loss, dqn_params)
    clipped_gradients, _ = tf.clip_by_global_norm(gradients, max_gradient_norm)

    # Perform gradient descent.
    grads_and_vars = list(zip(clipped_gradients, dqn_params))
    self.global_step = tf.Variable(tf.constant(0, tf.int64), False, name='Global_Step')
    self.learning_rate = tf.train.exponential_decay(initial_learning_rate,
                                                    self.global_step,
                                                    learning_rate_decay_frequency,
                                                    learning_rate_decay_factor,
                                                    staircase=True)
    self.train_step = tf.train.AdamOptimizer(self.learning_rate).apply_gradients(
        grads_and_vars, self.global_step)
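# A hedged sketch of how the train_step and reset_target_dqn ops built above are
# usually driven from the training loop: run a training step every train_interval
# time steps and copy the Q-network weights into the target network every
# target_network_reset_interval steps, as the docstring describes. The method name,
# session handle, and feed-dict plumbing are assumptions; only attributes defined
# above are referenced.
def _maybe_train(self, sess, feed_dict):
    if self.time_step % self.train_interval == 0:
        sess.run(self.train_step, feed_dict=feed_dict)
    if self.time_step % self.target_network_reset_interval == 0:
        sess.run(self.reset_target_dqn)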