class DQN: """ General DQN agent. Can be applied to any standard environment The implementation follows: Mnih et. al - Playing Atari with Deep Reinforcement Learning https://arxiv.org/pdf/1312.5602.pdf The q-network structure is different from the original paper see also: David Silver's RL course lecture 6: https://www.youtube.com/watch?v=UoPei5o4fps&t=1s """ def __init__(self, env, batchsize=64, pic_size=(96, 96), num_frame_stack=4, gamma=0.95, frame_skip=1, train_freq=4, initial_epsilon=1.0, min_epsilon=0.1, render=True, epsilon_decay_steps=int(1e6), min_experience_size=int(1e3), experience_capacity=int(1e5), network_update_freq=5000, regularization=1e-6, optimizer_params=None, action_map=None): self.exp_history = ExperienceHistory(num_frame_stack, capacity=experience_capacity, pic_size=pic_size) # in playing mode we don't store the experience to agent history # but this cache is still needed to get the current frame stack self.playing_cache = ExperienceHistory(num_frame_stack, capacity=num_frame_stack * 5 + 10, pic_size=pic_size) if action_map is not None: self.dim_actions = len(action_map) else: self.dim_actions = env.action_space.n self.network_update_freq = network_update_freq self.action_map = action_map self.env = env self.batchsize = batchsize self.num_frame_stack = num_frame_stack self.gamma = gamma self.frame_skip = frame_skip self.train_freq = train_freq self.initial_epsilon = initial_epsilon self.min_epsilon = min_epsilon self.epsilon_decay_steps = epsilon_decay_steps self.render = render self.min_experience_size = min_experience_size self.pic_size = pic_size self.regularization = regularization # These default magic values always work with Adam self.optimizer_params = optimizer_params or dict(learning_rate=0.0004, epsilon=1e-7) self.do_training = True self.playing_epsilon = 0.0 self.session = None self.state_size = (self.num_frame_stack, ) + self.pic_size self.global_counter = 0 self.episode_counter = 0 @staticmethod def process_image(img): return 2 * color.rgb2gray(transform.rescale(img[34:194], 0.5)) - 1 def build_graph(self): input_dim_with_batch = (self.batchsize, self.num_frame_stack) + self.pic_size input_dim_general = (None, self.num_frame_stack) + self.pic_size self.input_prev_state = tf.placeholder(tf.float32, input_dim_general, "prev_state") self.input_next_state = tf.placeholder(tf.float32, input_dim_with_batch, "next_state") self.input_reward = tf.placeholder(tf.float32, self.batchsize, "reward") self.input_actions = tf.placeholder(tf.int32, self.batchsize, "actions") self.input_done_mask = tf.placeholder(tf.int32, self.batchsize, "done_mask") # These are the state action values for all states # The target Q-values come from the fixed network with tf.variable_scope("fixed"): qsa_targets = self.create_network(self.input_next_state, trainable=False) # with tf.variable_scope("Dueling_DDQN"): # qsa_targets_DDQN = tf.stop_gradient(self.create_network(self.input_next_state,trainable=True)) # target_action = tf.argmax(qsa_targets_DDQN, axis=1) # target_action_onehot = tf.one_hot(indices=target_action,depth=self.dim_actions) # qsa_targets = tf.stop_gradient(tf.reduce_sum(tf.multiply(qsa_targets,target_action_onehot),reduction_indices=[1,])) with tf.variable_scope("train"): qsa_estimates = self.create_network(self.input_prev_state, trainable=True) self.best_action = tf.argmax(qsa_estimates, axis=1) not_done = tf.cast( tf.logical_not(tf.cast(self.input_done_mask, "bool")), "float32") q_target = tf.reduce_max( qsa_targets, -1) * self.gamma * not_done + self.input_reward # 
q_target = qsa_targets * self.gamma * not_done + self.input_reward # select the chosen action from each row # in numpy this is qsa_estimates[range(batchsize), self.input_actions] action_slice = tf.stack( [tf.range(0, self.batchsize), self.input_actions], axis=1) q_estimates_for_input_action = tf.gather_nd(qsa_estimates, action_slice) training_loss = tf.nn.l2_loss( q_target - q_estimates_for_input_action) / self.batchsize optimizer = tf.train.AdamOptimizer(**(self.optimizer_params)) reg_loss = tf.add_n(tf.losses.get_regularization_losses()) self.train_op = optimizer.minimize(reg_loss + training_loss) train_params = self.get_variables("train") fixed_params = self.get_variables("fixed") assert (len(train_params) == len(fixed_params)) self.copy_network_ops = [ tf.assign(fixed_v, train_v) for train_v, fixed_v in zip(train_params, fixed_params) ] def get_variables(self, scope): vars = [ t for t in tf.global_variables() if "%s/" % scope in t.name and "Adam" not in t.name ] return sorted(vars, key=lambda v: v.name) # the dueling network def create_network(self, input, trainable): if trainable: wr = slim.l2_regularizer(self.regularization) else: wr = None # the input is stack of black and white frames. # put the stack in the place of channel (last in tf) input_t = tf.transpose(input, [0, 2, 3, 1]) net = slim.conv2d(input_t, 8, (7, 7), data_format="NHWC", activation_fn=tf.nn.relu, stride=3, weights_regularizer=wr, trainable=trainable) net = slim.max_pool2d(net, 2, 2) net = slim.conv2d(net, 16, (3, 3), data_format="NHWC", activation_fn=tf.nn.relu, weights_regularizer=wr, trainable=trainable) net = slim.max_pool2d(net, 2, 2) net = slim.flatten(net) fc_1 = slim.fully_connected(net, 256, activation_fn=tf.nn.relu, weights_regularizer=wr, trainable=trainable) fc_2 = slim.fully_connected(net, 256, activation_fn=tf.nn.relu, weights_regularizer=wr, trainable=trainable) value = slim.fully_connected(fc_1, 1, activation_fn=None, weights_regularizer=wr, trainable=trainable) advantage = slim.fully_connected(fc_2, self.dim_actions, activation_fn=None, weights_regularizer=wr, trainable=trainable) q_state_action_values = value + (advantage - tf.reduce_mean( advantage, reduction_indices=[ 1, ], keepdims=True)) return q_state_action_values def check_early_stop(self, reward, totalreward): return False, 0.0 def get_random_action(self): return np.random.choice(self.dim_actions) def get_epsilon(self): if not self.do_training: return self.playing_epsilon elif self.global_counter >= self.epsilon_decay_steps: return self.min_epsilon else: # linear decay r = 1.0 - self.global_counter / float(self.epsilon_decay_steps) return self.min_epsilon + (self.initial_epsilon - self.min_epsilon) * r def train(self): batch = self.exp_history.sample_mini_batch(self.batchsize) fd = { self.input_reward: "reward", self.input_prev_state: "prev_state", self.input_next_state: "next_state", self.input_actions: "actions", self.input_done_mask: "done_mask" } fd1 = {ph: batch[k] for ph, k in fd.items()} self.session.run([self.train_op], fd1) def play_episode(self): eh = (self.exp_history if self.do_training else self.playing_cache) total_reward = 0 frames_in_episode = 0 first_frame = self.env.reset() first_frame_pp = self.process_image(first_frame) eh.start_new_episode(first_frame_pp) while True: if np.random.rand() > self.get_epsilon(): action_idx = self.session.run(self.best_action, { self.input_prev_state: eh.current_state()[np.newaxis, ...] 
})[0] else: action_idx = self.get_random_action() if self.action_map is not None: action = self.action_map[action_idx] else: action = action_idx reward = 0 for _ in range(self.frame_skip): observation, r, done, info = self.env.step(action) if self.render: self.env.render() reward += r if done: break early_done, punishment = self.check_early_stop( reward, total_reward) if early_done: reward += punishment done = done or early_done total_reward += reward frames_in_episode += 1 eh.add_experience(self.process_image(observation), action_idx, done, reward) if self.do_training: self.global_counter += 1 if self.global_counter % self.network_update_freq: self.update_target_network() train_cond = ( self.exp_history.counter >= self.min_experience_size and self.global_counter % self.train_freq == 0) if train_cond: self.train() if done: if self.do_training: self.episode_counter += 1 return total_reward, frames_in_episode def update_target_network(self): self.session.run(self.copy_network_ops)
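# --- Usage sketch (not part of the original listing) ---
# A minimal way to wire the dueling agent above to an environment. The environment
# name below is an assumption for illustration; any Gym environment with image
# observations and a discrete action space (or a suitable action_map) would do, and
# pic_size must match whatever process_image returns for that environment's frames.
if __name__ == "__main__":
    import gym

    env = gym.make("Breakout-v0")   # assumed Atari-style environment
    agent = DQN(env, render=False)
    agent.build_graph()

    with tf.Session() as sess:
        agent.session = sess
        sess.run(tf.global_variables_initializer())
        for episode in range(5):
            total_reward, frames = agent.play_episode()
            print("episode %d: reward %.1f over %d frames" % (episode, total_reward, frames))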
class DQN:
    def __init__(self, env, batchsize=64, pic_size=(96, 96), num_frame_stack=4,
                 gamma=0.95, frame_skip=1, train_freq=4, initial_epsilon=1.0,
                 min_epsilon=0.1, render=True, epsilon_decay_steps=int(1e6),
                 min_experience_size=int(1e3), experience_capacity=int(1e5),
                 network_update_freq=5000, regularization=1e-6,
                 optimizer_params=None, action_map=None):
        self.exp_history = ExperienceHistory(num_frame_stack,
                                             capacity=experience_capacity,
                                             pic_size=pic_size)

        # In playing mode we don't store the experience in the agent history,
        # but this cache is still needed to build the current frame stack.
        self.playing_cache = ExperienceHistory(num_frame_stack,
                                               capacity=num_frame_stack * 5 + 10,
                                               pic_size=pic_size)

        if action_map is not None:
            self.dim_actions = len(action_map)
        else:
            self.dim_actions = env.action_space.n

        self.network_update_freq = network_update_freq
        self.action_map = action_map
        self.env = env
        self.batchsize = batchsize
        self.num_frame_stack = num_frame_stack
        self.gamma = gamma
        self.frame_skip = frame_skip
        self.train_freq = train_freq
        self.initial_epsilon = initial_epsilon
        self.min_epsilon = min_epsilon
        self.epsilon_decay_steps = epsilon_decay_steps
        self.render = render
        self.min_experience_size = min_experience_size
        self.pic_size = pic_size
        self.regularization = regularization
        self.optimizer_params = optimizer_params or dict(learning_rate=0.0004, epsilon=1e-7)

        self.do_training = True
        self.playing_epsilon = 0.0
        self.session = None

        self.state_size = (self.num_frame_stack,) + self.pic_size
        self.global_counter = 0
        self.episode_counter = 0

    @staticmethod
    def preprocessing(img):
        # Convert the incoming frame to grayscale in [-1, 1].
        return 2 * color.rgb2gray(transform.rescale(img[34:194], 0.5)) - 1

    def build_graph(self):
        input_dim_with_batch = (self.batchsize, self.num_frame_stack) + self.pic_size
        input_dim_general = (None, self.num_frame_stack) + self.pic_size

        self.input_prev_state = tf.placeholder(tf.float32, input_dim_general, "prev_state")
        self.input_next_state = tf.placeholder(tf.float32, input_dim_with_batch, "next_state")
        self.input_reward = tf.placeholder(tf.float32, self.batchsize, "reward")
        self.input_actions = tf.placeholder(tf.int32, self.batchsize, "actions")
        self.input_done_mask = tf.placeholder(tf.int32, self.batchsize, "done_mask")

        # These are the state-action values for all states.
        # The target Q-values come from the fixed network.
        with tf.variable_scope("fixed"):
            qsa_targets = self.create_network(self.input_next_state, trainable=False)

        with tf.variable_scope("train"):
            qsa_estimates = self.create_network(self.input_prev_state, trainable=True)

        self.best_action = tf.argmax(qsa_estimates, axis=1)

        not_done = tf.cast(tf.logical_not(tf.cast(self.input_done_mask, "bool")), "float32")
        q_target = tf.reduce_max(qsa_targets, -1) * self.gamma * not_done + self.input_reward

        # Select the chosen action from each row.
        action_slice = tf.stack([tf.range(0, self.batchsize), self.input_actions], axis=1)
        q_estimates_for_input_action = tf.gather_nd(qsa_estimates, action_slice)

        training_loss = tf.nn.l2_loss(q_target - q_estimates_for_input_action) / self.batchsize

        optimizer = tf.train.AdamOptimizer(**self.optimizer_params)
        reg_loss = tf.add_n(tf.losses.get_regularization_losses())
        self.train_op = optimizer.minimize(reg_loss + training_loss)

        train_params = self.get_variables("train")
        fixed_params = self.get_variables("fixed")

        assert len(train_params) == len(fixed_params)
        self.copy_network_ops = [tf.assign(fixed_v, train_v)
                                 for train_v, fixed_v in zip(train_params, fixed_params)]

    def get_variables(self, scope):
        vars = [t for t in tf.global_variables()
                if "%s/" % scope in t.name and "Adam" not in t.name]
        return sorted(vars, key=lambda v: v.name)

    def create_network(self, input, trainable):
        if trainable:
            wr = slim.l2_regularizer(self.regularization)
        else:
            wr = None

        # The input is a stack of black-and-white frames;
        # put the stack in the channel dimension (last in tf).
        input_t = tf.transpose(input, [0, 2, 3, 1])

        net = slim.conv2d(input_t, 8, (7, 7), data_format="NHWC", activation_fn=tf.nn.relu,
                          stride=3, weights_regularizer=wr, trainable=trainable)
        net = slim.max_pool2d(net, 2, 2)
        net = slim.conv2d(net, 16, (3, 3), data_format="NHWC", activation_fn=tf.nn.relu,
                          weights_regularizer=wr, trainable=trainable)
        net = slim.max_pool2d(net, 2, 2)
        net = slim.flatten(net)
        net = slim.fully_connected(net, 256, activation_fn=tf.nn.relu,
                                   weights_regularizer=wr, trainable=trainable)
        q_state_action_values = slim.fully_connected(net, self.dim_actions, activation_fn=None,
                                                     weights_regularizer=wr, trainable=trainable)
        return q_state_action_values

    def check_early_stop(self, reward, totalreward):
        return False, 0.0

    def get_random_action(self):
        return np.random.choice(self.dim_actions)

    def get_epsilon(self):
        if not self.do_training:
            return self.playing_epsilon
        elif self.global_counter >= self.epsilon_decay_steps:
            return self.min_epsilon
        else:
            # linear decay
            r = 1.0 - self.global_counter / float(self.epsilon_decay_steps)
            return self.min_epsilon + (self.initial_epsilon - self.min_epsilon) * r

    def train(self):
        batch = self.exp_history.sample_mini_batch(self.batchsize)
        fd = {
            self.input_reward: "reward",
            self.input_prev_state: "prev_state",
            self.input_next_state: "next_state",
            self.input_actions: "actions",
            self.input_done_mask: "done_mask",
        }
        fd1 = {ph: batch[k] for ph, k in fd.items()}
        self.session.run([self.train_op], fd1)

    def play_episode(self):
        eh = self.exp_history if self.do_training else self.playing_cache
        total_reward = 0
        frames_in_episode = 0

        first_frame = self.env.reset()
        first_frame_pp = self.preprocessing(first_frame)
        eh.start_new_episode(first_frame_pp)

        while True:
            if np.random.rand() > self.get_epsilon():
                action_idx = self.session.run(
                    self.best_action,
                    {self.input_prev_state: eh.current_state()[np.newaxis, ...]})[0]
            else:
                action_idx = self.get_random_action()

            if self.action_map is not None:
                action = self.action_map[action_idx]
            else:
                action = action_idx

            reward = 0
            for _ in range(self.frame_skip):
                observation, r, done, info = self.env.step(action)
                if self.render:
                    self.env.render()
                reward += r
                if done:
                    break

            early_done, punishment = self.check_early_stop(reward, total_reward)
            if early_done:
                reward += punishment
            done = done or early_done

            total_reward += reward
            frames_in_episode += 1

            eh.add_experience(self.preprocessing(observation), action_idx, done, reward)

            if self.do_training:
                self.global_counter += 1
                # Sync the target network every network_update_freq steps.
                if self.global_counter % self.network_update_freq == 0:
                    self.update_target_network()
                train_cond = (self.exp_history.counter >= self.min_experience_size
                              and self.global_counter % self.train_freq == 0)
                if train_cond:
                    self.train()

            if done:
                if self.do_training:
                    self.episode_counter += 1
                return total_reward, frames_in_episode

    def update_target_network(self):
        self.session.run(self.copy_network_ops)
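# --- Hypothetical discrete action map (not part of the original listing) ---
# The action_map argument lets the discrete Q-network drive an environment with a
# continuous action space: the network picks an index and play_episode() passes
# action_map[index] to env.step(). The particular action set below is an illustrative
# assumption for Gym's CarRacing, whose actions are (steer, gas, brake) triples.
CAR_RACING_ACTIONS = [
    np.array([-1.0, 0.0, 0.0]),  # steer left
    np.array([+1.0, 0.0, 0.0]),  # steer right
    np.array([0.0, 1.0, 0.0]),   # accelerate
    np.array([0.0, 0.0, 0.8]),   # brake
    np.array([0.0, 0.0, 0.0]),   # do nothing
]
# agent = DQN(env, action_map=CAR_RACING_ACTIONS)  # dim_actions becomes len(action_map) == 5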
class DQN: """ General DQN agent. Can be applied to any standard environment The implementation follows: Mnih et. al - Playing Atari with Deep Reinforcement Learning https://arxiv.org/pdf/1312.5602.pdf The q-network structure is different from the original paper see also: David Silver's RL course lecture 6: https://www.youtube.com/watch?v=UoPei5o4fps&t=1s """ def __init__(self, env, batchsize=64, pic_size=(96, 96), num_frame_stack=4, gamma=0.95, frame_skip=1, train_freq=4, initial_epsilon=1.0, min_epsilon=0.1, render=True, epsilon_decay_steps=int(1e6), min_experience_size=int(1e3), experience_capacity=int(1e5), network_update_freq=5000, regularization=1e-6, optimizer_params=None, action_map=None): self.exp_history = ExperienceHistory(num_frame_stack, capacity=experience_capacity, pic_size=pic_size) # in playing mode we don't store the experience to agent history # but this cache is still needed to get the current frame stack self.playing_cache = ExperienceHistory(num_frame_stack, capacity=num_frame_stack * 5 + 10, pic_size=pic_size) if action_map is not None: self.dim_actions = len(action_map) else: self.dim_actions = env.action_space.n self.network_update_freq = network_update_freq self.action_map = action_map self.env = env self.batchsize = batchsize self.num_frame_stack = num_frame_stack self.gamma = gamma self.frame_skip = frame_skip self.train_freq = train_freq self.initial_epsilon = initial_epsilon self.min_epsilon = min_epsilon self.epsilon_decay_steps = epsilon_decay_steps self.render = render self.min_experience_size = min_experience_size self.pic_size = pic_size self.regularization = regularization # These default magic values always work with Adam self.optimizer_params = optimizer_params or dict(learning_rate=0.0004, epsilon=1e-7) self.do_training = True self.playing_epsilon = 0.0 self.session = None self.state_size = (self.num_frame_stack, ) + self.pic_size self.global_counter = 0 self.episode_counter = 0 @staticmethod def process_image(img): return 2 * color.rgb2gray(transform.rescale(img[34:194], 0.5)) - 1 def kl_divergence(p, q): return tf.reduce_sum(p * tf.log(p / q)) def sample_z(self, mu, logvar): eps = tf.random_normal(shape=tf.shape(mu)) return mu + tf.exp(logvar / 2) * eps def RGB(self, x): #x = tf.placeholder(tf.float32, [64, 96, 96, 3], name='image') #x = tf.image.resize_images(x, [64, 64]) x = tf.compat.v1.image.resize(x, [64, 96, 96, 3]) #x = tf.image.resize_images(x, [None, 96, 96, 3]) x = tf.layers.conv2d(x, filters=32, kernel_size=4, strides=2, padding='valid', activation=tf.nn.elu) x = tf.layers.conv2d(x, filters=32, kernel_size=4, strides=2, padding='valid', activation=tf.nn.elu) # x = tf.layers.conv2d(x, filters=64, kernel_size=4, strides=2, padding='valid', activation=tf.nn.elu) # x = tf.layers.conv2d(x, filters=128, kernel_size=4, strides=2, padding='valid', activation=tf.nn.elu) # x = tf.layers.conv2d(x, filters=256, kernel_size=4, strides=2, padding='valid', activation=tf.nn.elu) x = tf.layers.flatten(x) x = tf.reshape(x, [-1, 6272]) print(x) z_mu = slim.fully_connected(x, 5, activation_fn=tf.nn.elu) z_var = slim.fully_connected(x, 5, activation_fn=tf.nn.elu) print("slim", z_mu) return z_mu, z_var def Event(self, x): #x = tf.placeholder(tf.float32, [64, 96, 96, 3], name='image') #x = tf.image.resize_images(x, [64, 64]) x = tf.compat.v1.image.resize(x, [64, 96, 96, 3]) #x = tf.image.resize_images(x,[None, 96, 96, 3]) x = tf.layers.conv2d(x, filters=32, kernel_size=4, strides=2, padding='valid', activation=tf.nn.elu) x = tf.layers.conv2d(x, filters=32, 
kernel_size=4, strides=2, padding='valid', activation=tf.nn.elu) # x = tf.layers.conv2d(x, filters=64, kernel_size=4, strides=2, padding='valid', activation=tf.nn.elu) # x = tf.layers.conv2d(x, filters=128, kernel_size=4, strides=2, padding='valid', activation=tf.nn.elu) # x = tf.layers.conv2d(x, filters=256, kernel_size=4, strides=2, padding='valid', activation=tf.nn.elu) x = tf.layers.flatten(x) x = tf.reshape(x, [-1, 6272]) print(x) z_mu = slim.fully_connected(x, 5, activation_fn=tf.nn.elu) z_var = slim.fully_connected(x, 5, activation_fn=tf.nn.elu) print("slim", z_mu) return z_mu, z_var def encoderRGB(self, x): #x = tf.placeholder(tf.float32, [None, 96, 96, 3], name='image') x = tf.image.resize_images(x, [96, 96]) x = tf.layers.conv2d(x, filters=32, kernel_size=4, strides=2, padding='valid', activation=tf.nn.elu) x = tf.layers.conv2d(x, filters=64, kernel_size=4, strides=2, padding='valid', activation=tf.nn.elu) x = tf.layers.conv2d(x, filters=128, kernel_size=4, strides=2, padding='valid', activation=tf.nn.elu) x = tf.layers.conv2d(x, filters=256, kernel_size=4, strides=2, padding='valid', activation=tf.nn.elu) x = tf.layers.flatten(x) fc1 = tf.reshape(x, [-1, 4096]) shapes = tf.shape(x) #z_mu = slim.fully_connected(fc1, 5, activation_fn=tf.nn.elu) z_mean = tf_contrib.layers.fully_connected(fc1, 32) shape = x.get_shape().as_list() z_mua = tf.layers.dense(fc1, units=32, name='z_mu') z_logvara = tf.layers.dense(fc1, units=32, name='z_logvar') # dim = np.prod(shape[1:]) # x2 = tf.reshape(-1, x.get_shape()) #print("dimension!!!",fc1) # x = tf.reshape(-1,4096) tf.reset_default_graph() # z_mus = tf.layers.dense(x2, units=32, name='z_mu') # z_logvars = tf.layers.dense(x2, units=32, name='z_logvar') return z_mean, z_logvara def encoderEvent(self, input): wr = slim.l2_regularizer(1e-6) input_t = tf.transpose(input, [0, 2, 3]) net = slim.conv2d(input_t, 8, (7, 7), data_format="NHWC", activation_fn=tf.nn.relu, stride=3, weights_regularizer=wr) net = slim.max_pool2d(net, 2, 2) net = slim.conv2d( net, 16, (3, 3), data_format="NHWC", activation_fn=tf.nn.relu, weights_regularizer=wr, ) net = slim.max_pool2d(net, 2, 2) net = slim.flatten(net) net = slim.fully_connected(net, 256, activation_fn=tf.nn.relu, weights_regularizer=wr) return net def compute_loss(self): logits_flat = tf.layers.flatten(self.reconstructions) labels_flat = tf.layers.flatten(self.resized_image) reconstruction_loss = tf.reduce_sum(tf.square(logits_flat - labels_flat), axis=1) kl_loss = 0.5 * tf.reduce_sum( tf.exp(self.z_logvar) + self.z_mu**2 - 1. 
- self.z_logvar, 1) vae_loss = tf.reduce_mean(reconstruction_loss + kl_loss) return vae_loss def build_graph(self, env): input_dim_with_batch = (self.batchsize, self.num_frame_stack) + self.pic_size input_dim_general = (None, self.num_frame_stack) + self.pic_size self.input_prev_state = tf.placeholder(tf.float32, input_dim_general, "prev_state") self.input_next_state = tf.placeholder(tf.float32, input_dim_with_batch, "next_state") self.input_reward = tf.placeholder(tf.float32, self.batchsize, "reward") self.input_actions = tf.placeholder(tf.int32, self.batchsize, "actions") self.input_done_mask = tf.placeholder(tf.int32, self.batchsize, "done_mask") #self.first_frame = tf.placeholder(tf.int32, self.batchsize, "first_frame") self.loss_vector = tf.placeholder(tf.float32, shape=self.batchsize) # These are the state action values for all states # The target Q-values come from the fixed network #ENCODER AND LOSS CALCULATION #################################################################### first_frame = env.reset() self.val = tf.placeholder(tf.float32, [64, 96, 96, 3], name='rgb') self.event = tf.placeholder(tf.float32, [64, 96, 96, 3], name='event') self.val, self.event = env.returnRgb() rgb_mu, rgb_var = self.RGB(self.val) events_mu, events_var = self.Event(self.event) rgb_std = tf.math.sqrt(rgb_var) event_std = tf.math.sqrt(events_mu) self.val_for_loss = ((rgb_std**2) + ((rgb_mu - events_mu)**2)) / (2 * (rgb_std**2)) self.loss_vector = tf.math.log( event_std / rgb_std) + self.val_for_loss - (1 / 2) # self.val_latent = self.sample_z(vals_mu,vals_var) # self.val_event = self.sample_z(events_mu,events_var) # x = tf.Print(self.loss_vector,[self.loss_vector]) # sess = tf.InteractiveSession() # sess.run(x) # sess.close() # X = tf.distributions.Normal(probs=self.val_latent) # Y = tf.distributions.Normal(probs=self.val_event) # self.loss_vector = tf.distributions.kl_divergence(X, Y) ##################################################################### with tf.variable_scope("fixed"): qsa_targets = self.create_network(self.input_next_state, trainable=False) with tf.variable_scope("train"): qsa_estimates = self.create_network(self.input_prev_state, trainable=True) self.best_action = tf.argmax(qsa_estimates, axis=1) not_done = tf.cast( tf.logical_not(tf.cast(self.input_done_mask, "bool")), "float32") q_target = tf.reduce_max( qsa_targets, -1) * self.gamma * not_done + self.input_reward # select the chosen action from each row # in numpy this is qsa_estimates[range(batchsize), self.input_actions] action_slice = tf.stack( [tf.range(0, self.batchsize), self.input_actions], axis=1) q_estimates_for_input_action = tf.gather_nd(qsa_estimates, action_slice) training_loss = tf.nn.l2_loss( q_target - q_estimates_for_input_action) / self.batchsize optimizer = tf.train.AdamOptimizer(**(self.optimizer_params)) reg_loss = tf.add_n(tf.losses.get_regularization_losses()) # x = tf.Print(reg_loss, [reg_loss]) # sess = tf.InteractiveSession() # sess.run(x) # sess.close() self.train_op = optimizer.minimize(reg_loss + training_loss) #+ self.loss_vector) tf.print(self.train_op) train_params = self.get_variables("train") fixed_params = self.get_variables("fixed") assert (len(train_params) == len(fixed_params)) self.copy_network_ops = [ tf.assign(fixed_v, train_v) for train_v, fixed_v in zip(train_params, fixed_params) ] return self.train_op def get_variables(self, scope): vars = [ t for t in tf.global_variables() if "%s/" % scope in t.name and "Adam" not in t.name ] return sorted(vars, key=lambda v: v.name) def 
create_network(self, input, trainable): if trainable: wr = slim.l2_regularizer(self.regularization) else: wr = None # the input is stack of black and white frames. # put the stack in the place of channel (last in tf) input_t = tf.transpose(input, [0, 2, 3, 1]) net = slim.conv2d(input_t, 8, (7, 7), data_format="NHWC", activation_fn=tf.nn.relu, stride=3, weights_regularizer=wr, trainable=trainable) net = slim.max_pool2d(net, 2, 2) net = slim.conv2d(net, 16, (3, 3), data_format="NHWC", activation_fn=tf.nn.relu, weights_regularizer=wr, trainable=trainable) net = slim.max_pool2d(net, 2, 2) net = slim.flatten(net) net = slim.fully_connected(net, 256, activation_fn=tf.nn.relu, weights_regularizer=wr, trainable=trainable) q_state_action_values = slim.fully_connected(net, self.dim_actions, activation_fn=None, weights_regularizer=wr, trainable=trainable) return q_state_action_values def check_early_stop(self, reward, totalreward): return False, 0.0 def get_random_action(self): return np.random.choice(self.dim_actions) def get_epsilon(self): if not self.do_training: return self.playing_epsilon elif self.global_counter >= self.epsilon_decay_steps: return self.min_epsilon else: # linear decay r = 1.0 - self.global_counter / float(self.epsilon_decay_steps) return self.min_epsilon + (self.initial_epsilon - self.min_epsilon) * r def train(self): batch = self.exp_history.sample_mini_batch(self.batchsize) fd = { self.input_reward: "reward", self.input_prev_state: "prev_state", self.input_next_state: "next_state", self.input_actions: "actions", self.input_done_mask: "done_mask" } fd1 = {ph: batch[k] for ph, k in fd.items()} results = self.session.run([self.train_op, self.loss_vector], fd1) self.graph = build_graph() with tf.InteractiveSession() as sess: print(sess.run([self.graph], fd1)) def play_episode(self): eh = (self.exp_history if self.do_training else self.playing_cache) total_reward = 0 frames_in_episode = 0 # with tf.compat.v1.Session() as sess: first_frame = self.env.reset() #From CarRacing # sess.close() #value = tfp.distributions.kl_divergence(latent, latent_event, allow_nan_stats=True, name=None) #kl_r = rel_entr(latent,latent_event) #clear_session() # a = tf.constant([[4,3],[3,3]]) # print(type(a)) # sess = tf.InteractiveSession() # xo = tf.Print(mu,[mu]) # sess.run(xo) # sess.close() wr = slim.l2_regularizer(self.regularization) # k = tf.keras.losses.KLDivergence() # loss = k(latent,latent_event) #PRINTING VALUES # a = tf.constant([[4,3],[3,3]]) # x = tf.Print(self.val_latent,[self.val_latent]) # sess = tf.InteractiveSession() # sess.run(x) # sess.close() first_frame_pp = self.process_image(first_frame) eh.start_new_episode(first_frame_pp) while True: if np.random.rand() > self.get_epsilon(): action_idx = self.session.run(self.best_action, { self.input_prev_state: eh.current_state()[np.newaxis, ...] 
})[0] # action_idx, loss_vec = self.session.run( # [self.best_action, self.loss_vector], # {self.input_prev_state: eh.current_state()[np.newaxis, ...]} # ) # print("Loss vec:", self.loss_vector) else: action_idx = self.get_random_action() if self.action_map is not None: action = self.action_map[action_idx] else: action = action_idx reward = 0 for _ in range(self.frame_skip): observation, r, done, info = self.env.step(action) if self.render: self.env.render() reward += r if done: break early_done, punishment = self.check_early_stop( reward, total_reward) if early_done: reward += punishment done = done or early_done total_reward += reward frames_in_episode += 1 eh.add_experience(self.process_image(observation), action_idx, done, reward) if self.do_training: self.global_counter += 1 if self.global_counter % self.network_update_freq: self.update_target_network() train_cond = ( self.exp_history.counter >= self.min_experience_size and self.global_counter % self.train_freq == 0) if train_cond: self.train() if done: if self.do_training: self.episode_counter += 1 return total_reward, frames_in_episode def update_target_network(self): self.session.run(self.copy_network_ops)
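# --- Reference sketch: closed-form Gaussian KL divergence (not part of the original listing) ---
# The loss_vector built in build_graph() is based on the closed-form KL divergence
# between two univariate Gaussians,
#   KL( N(mu1, s1^2) || N(mu2, s2^2) ) = log(s2/s1) + (s1^2 + (mu1 - mu2)^2) / (2*s2^2) - 1/2.
# For reference, a small NumPy illustration of that standard formula, with
# illustrative variable names:
def gaussian_kl(mu1, var1, mu2, var2):
    """Element-wise KL( N(mu1, var1) || N(mu2, var2) ) for diagonal Gaussians."""
    return (np.log(np.sqrt(var2) / np.sqrt(var1))
            + (var1 + (mu1 - mu2) ** 2) / (2.0 * var2)
            - 0.5)

# Identical distributions give zero divergence:
# gaussian_kl(np.zeros(3), np.ones(3), np.zeros(3), np.ones(3)) -> array([0., 0., 0.])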
def test_many_frames(self):
    n_frames = 1000
    size = 30
    frames = np.ones((n_frames, 2, 2)).astype("float32") * np.arange(n_frames).reshape(-1, 1, 1)
    start_frame = np.ones((2, 2), "float32") * 10000

    h = ExperienceHistory(num_frame_stack=num_frame_stack, capacity=30, pic_size=pic_size)
    h.start_new_episode(start_frame)

    # add 10 frames
    for f in frames[:10]:
        h.add_experience(f, 12, False, 5.0)

    this_state = h.current_state()
    h.add_experience(frames[10], 10, False, 4)

    def a():
        assert np.all(this_state == frames[7:10])
        assert h.rewards[10] == 4
        assert h.actions[10] == 10
        assert not h.is_done[10]
        assert np.all(h.frames[h.prev_states[10]] == frames[7:10])
        assert np.all(h.frames[h.next_states[10]] == frames[8:11])

    # Check that adding one frame doesn't mess up the existing history.
    a()

    # Add 29 more experiences and check that the past experience is not changed.
    for f in frames[11:40]:
        done = np.random.rand() > 0.5
        h.add_experience(f, 0, done, 1.0)
        if done:
            h.start_new_episode(start_frame)
    a()

    # Adding one more experience should overwrite the oldest experience:
    h.add_experience(frames[40], 1, False, 2.0)
    assert h.rewards[10] == 2.0
    assert h.actions[10] == 1
    with self.assertRaises(AssertionError):
        a()
def test_add_frame(self):
    h = ExperienceHistory(num_frame_stack=num_frame_stack, capacity=size, pic_size=pic_size)

    # Can't do anything because no episode has started.
    with self.assertRaises(AssertionError):
        h.current_state()
    with self.assertRaises(AssertionError):
        h.add_experience(None, None, None, None)

    frames = np.random.rand(4, 2, 2).astype("float32")

    # Add the beginning frame.
    h.start_new_episode(frames[0])

    # Check that padding works correctly.
    assert (h.current_state() == frames[0]).all()
    assert h.current_state().shape == (num_frame_stack,) + pic_size

    # Now add the next frame. The action is the action taken before this frame,
    # the reward is the reward observed for that action, and done is a flag
    # marking whether we ended in a terminal state.
    h.add_experience(frames[1], 4, False, 1.0)
    assert (h.current_state()[:2] == frames[0]).all()
    assert (h.current_state()[2] == frames[1]).all()
    assert h.current_state().shape == (num_frame_stack,) + pic_size

    # Add one more experience and mark the episode as finished.
    h.add_experience(frames[2], 5, True, 2.0)

    # Now there should not be any padding in the current state.
    assert (h.current_state() == frames[:3]).all()
    assert h.current_state().shape == (num_frame_stack,) + pic_size

    assert np.all(h.next_states[:3] == np.array([[0, 0, 1], [0, 1, 2], [-1, -1, -1]]))
    assert np.all(h.prev_states[:3] == np.array([[0, 0, 0], [0, 0, 1], [-1, -1, -1]]))

    h.start_new_episode(frames[3])
    assert (h.current_state() == frames[3]).all()
    assert h.current_state().shape == (num_frame_stack,) + pic_size

    batch = h.sample_mini_batch(20)

    # Check that we don't sample from the part which is not yet written,
    # i.e. we shouldn't see zeros (the caches are initialized with zeros).
    assert np.all(np.in1d(batch["reward"], [1., 2.]))
    assert np.all(np.in1d(batch["actions"], [4., 5.]))

    # Arriving at the 2nd frame was the only time the episode was not done.
    dm = ~batch["done_mask"].astype(bool)
    assert np.all(batch["next_state"][dm] == np.array(frames[[0, 0, 1]]))

    # frames[2] in the history is overwritten by frames[3] because a new episode
    # has started; this doesn't matter because the terminal state shouldn't be
    # used anywhere.
    assert np.all(batch["next_state"][~dm] == np.array(frames[[0, 1, 3]]))
    assert np.all((batch["prev_state"] == frames[0]) | (batch["prev_state"] == frames[1]))
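# --- Test harness sketch (not part of the original listings) ---
# The two tests above use self.assertRaises and reference module-level
# num_frame_stack, pic_size and size, so they are written as unittest.TestCase
# methods. A minimal harness under those assumptions; the class name is illustrative
# and the parameter values are inferred from the assertions (state stacks of
# 3 frames, each 2x2 pixels).
import unittest

import numpy as np
# ExperienceHistory comes from the project's replay-buffer module.

num_frame_stack = 3
pic_size = (2, 2)
size = 30

class TestExperienceHistory(unittest.TestCase):
    # test_add_frame and test_many_frames from the listings above belong here.
    pass

if __name__ == "__main__":
    unittest.main()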