import numpy as np

# NN, ReplayMemory and the hyperparameter constants (MEMORY_CAPACITY, BATCH_SIZE,
# SIZE_HIDDEN, LEARNING_RATE, ACTIVATION, EPSILON_*, TARGET_UPDATE, MAX_STEPS,
# N_EPISODES, GAMMA, LEARNING_START) are assumed to be defined elsewhere in the project.


class DQNAgent:

    def __init__(self, environment):
        self.env = environment
        self.memory = ReplayMemory(MEMORY_CAPACITY)
        self.dim_actions = self.env.action_space.n
        self.dim_states = self.env.observation_space.shape
        self.NN = NN(self.env.observation_space.shape,
                     self.env.action_space.n,
                     BATCH_SIZE,
                     SIZE_HIDDEN,
                     LEARNING_RATE,
                     ACTIVATION)
        self.observers = []

        self.episode_count = 0
        self.step_count_total = 1
        self.step_count_episode = 1

        self.epsilon_min = EPSILON_MIN
        self.epsilon_max = EPSILON_MAX
        self.epsilon_decay = EPSILON_DECAY
        self.target_update = TARGET_UPDATE
        self.max_steps = MAX_STEPS
        self.n_episodes = N_EPISODES
        self.epsilon = EPSILON_MAX
        self.batch_size = BATCH_SIZE
        self.usetarget = False
        self.gamma = GAMMA
        self.loss = 0
        self.done = False
        self.reward = 0
        self.reward_episode = 0
        self.learning_switch = False
        self.learning_start = LEARNING_START

    def notify(self, event):
        for observer in self.observers:
            observer(event)

    def act(self, state):
        self.step_count_total += 1
        action = self.choose_action(state)
        return action

    def learn(self, obs):
        self.memory.store(obs)
        if self.learning_switch:
            self.backup()
        self.notify('step_done')

    def backup(self):
        self.flashback()
        if self.step_count_total % self.target_update == 0:
            print('update')
            print(self.epsilon)
            self.NN.update_target()
            self.usetarget = True

    def flashback(self):
        X, y = self._make_batch()
        self.loss = self.NN.train(X, y)
        if np.isnan(self.loss.history['loss']).any():
            print('Warning, loss is {}'.format(self.loss))

    def choose_action(self, state):
        # epsilon-greedy action selection
        if np.random.rand() <= self.epsilon:
            choice = self.random_choice()
        else:
            choice = self.greedy_choice(state)
        return choice

    def greedy_choice(self, state):
        greedy_choice = self.NN.best_action(state, usetarget=False)
        return greedy_choice

    def random_choice(self):
        random_choice = np.random.randint(0, self.dim_actions)
        return random_choice

    def _make_batch(self):
        # Double-DQN-style target: the online network selects the next action,
        # the (optional) target network evaluates it.
        X = []
        y = []
        batch = self.memory.get_batch(self.batch_size)
        for state, action, newstate, reward, done in batch:
            X.append(state)
            target = self.NN.predict(state, False)
            q_vals_new_t = self.NN.predict(newstate, self.usetarget)
            a_select = self.NN.best_action(newstate, False)
            if done:
                target[action] = reward
            else:
                target[action] = reward + self.gamma * q_vals_new_t[a_select]
            y.append(target)
        return X, y

    def add_observer(self, observer):
        self.observers.append(observer)
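
# --- Hypothetical usage sketch (not part of the original implementation) ---
# Shows how this agent might be driven by a gym-style loop; assumes env.step
# returns (next_state, reward, done, info) and that the experience tuple
# ordering matches _make_batch: (state, action, newstate, reward, done).
def run_dqnagent_example(env):
    agent = DQNAgent(env)
    for _ in range(agent.n_episodes):
        state = env.reset()
        done = False
        while not done:
            action = agent.act(state)
            next_state, reward, done, info = env.step(action)
            agent.learn((state, action, next_state, reward, done))
            # switch training on once enough transitions have been collected
            if agent.step_count_total > agent.learning_start:
                agent.learning_switch = True
            state = next_state
    return agent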
import logging

import numpy as np
import tensorflow as tf

# Written against the TensorFlow 1.x API (tf.summary.FileWriter, tf.variable_scope).
# ReplayMemory, EpsilonDecayer, Qfunc, Normalizer and random_uniform are assumed
# to be defined or imported elsewhere in the project.


class Agent(object):
    """
    The learner and decision maker.

    Based on the DQN algorithm - ref Mnih et al. 2015,
    i.e. Q-Learning with experience replay & a target network.

    All calls to TensorFlow are wrapped into methods.

    Support for environments is currently manually configured.
    """

    def __init__(self,
                 env,
                 discount,
                 tau,
                 sess,
                 total_steps,
                 batch_size,
                 layers,
                 learning_rate,
                 epsilon_decay_fraction=0.5,
                 memory_fraction=0.25,
                 process_observation=False,
                 process_target=False,
                 **kwargs):

        self.env = env
        self.discount = discount
        self.tau = tau
        self.sess = sess
        self.batch_size = batch_size

        # number of steps where epsilon is decayed from 1.0 to 0.1
        decay_steps = total_steps * epsilon_decay_fraction
        self.epsilon_getter = EpsilonDecayer(decay_steps)

        # the counter is stepped up every time we act or learn
        self.counter = 0

        if repr(env) == '<TimeLimit<CartPoleEnv<CartPole-v1>>>':
            obs_space_shape = env.observation_space.shape
            # the shape of the gym Discrete space is the number of actions
            # not the shape of a single action array
            # create a tuple to specify the action space
            self.action_space_shape = (1, )
            # a list of all possible actions
            self.actions = [act for act in range(env.action_space.n)]

        elif repr(env) == '<TimeLimit<PendulumEnv<Pendulum-v0>>>':
            raise ValueError('Build in progress')
            obs_space_shape = env.observation_space.shape
            self.action_space_shape = env.action_space.shape
            self.actions = np.linspace(env.action_space.low,
                                       env.action_space.high,
                                       num=20,
                                       endpoint=True).tolist()

        elif repr(env) == '<TimeLimit<MountainCarEnv<MountainCar-v0>>>':
            obs_space_shape = env.observation_space.shape
            self.action_space_shape = (1, )
            self.actions = [act for act in range(env.action_space.n)]

        else:
            raise ValueError('Environment not supported')

        self.memory = ReplayMemory(obs_space_shape,
                                   self.action_space_shape,
                                   size=int(total_steps * memory_fraction))

        model_config = {
            'input_shape': obs_space_shape,
            'output_shape': (len(self.actions), ),
            'layers': layers,
            'learning_rate': learning_rate
        }

        # the two approximations of Q(s,a)
        # use the same config dictionary for both
        self.online = Qfunc(model_config, scope='online')
        self.target = Qfunc(model_config, scope='target')

        # set up the operations to copy the online network parameters to
        # the target network
        self.update_ops = self.make_target_net_update_ops()

        if process_observation:
            self.observation_processor = Normalizer(obs_space_shape[0])

        if process_target:
            self.target_processor = Normalizer(1)

        self.acting_writer = tf.summary.FileWriter('./results/acting',
                                                   graph=self.sess.graph)
        self.learning_writer = tf.summary.FileWriter('./results/learning',
                                                     graph=self.sess.graph)

        self.sess.run(tf.global_variables_initializer())
        self.update_target_network()

    def __repr__(self):
        return '<class DQN Agent>'

    def make_target_net_update_ops(self):
        """
        Creates the TensorFlow operations to update the target network.

        The two lists of TensorFlow Variables (one for the online net, one
        for the target net) are iterated over together and new weights are
        assigned to the target network.
        """
        with tf.variable_scope('update_target_network'):
            update_ops = []
            for online, target in zip(self.online.params, self.target.params):
                logging.debug('copying {} to {}'.format(
                    online.name, target.name))
                val = tf.add(tf.multiply(online, self.tau),
                             tf.multiply(target, 1 - self.tau))
                operation = target.assign(val)
                update_ops.append(operation)
        return update_ops

    def remember(self, observation, action, reward, next_observation, done):
        """
        Store experience in the agent's memory.

        args
            observation (np.array)
            action (np.array)
            reward (np.array)
            next_observation (np.array)
            done (np.array)
        """
        if hasattr(self, 'observation_processor'):
            observation = self.observation_processor(observation)
            next_observation = self.observation_processor(next_observation)

        return self.memory.remember(observation, action, reward,
                                    next_observation, done)

    def predict_target(self, observations):
        """
        The target network is used to predict the maximum discounted expected
        return for the next_observation as experienced by the agent.

        args
            observations (np.array)

        returns
            max_q (np.array) shape=(batch_size, 1)
        """
        fetches = [
            self.target.q_values, self.target.max_q,
            self.target.acting_summary
        ]
        feed_dict = {self.target.observation: observations}
        q_vals, max_q, summary = self.sess.run(fetches, feed_dict)
        self.learning_writer.add_summary(summary, self.counter)

        logging.debug('predict_target - next_obs {}'.format(observations))
        logging.debug('predict_target - q_vals {}'.format(q_vals))
        logging.debug('predict_target - max_q {}'.format(max_q))

        return max_q.reshape(observations.shape[0], 1)

    def predict_online(self, observation):
        """
        We use our online network to choose actions.

        args
            observation (np.array) a single observation

        returns
            action
        """
        obs = observation.reshape((1, *self.env.observation_space.shape))

        fetches = [
            self.online.q_values, self.online.max_q,
            self.online.optimal_action_idx, self.online.acting_summary
        ]
        feed_dict = {self.online.observation: obs}
        q_values, max_q, action_idx, summary = self.sess.run(
            fetches, feed_dict)
        self.acting_writer.add_summary(summary, self.counter)

        max_q = max_q.flatten()[0]
        max_q_sum = tf.Summary(
            value=[tf.Summary.Value(tag='max_q_acting', simple_value=max_q)])
        self.acting_writer.add_summary(max_q_sum, self.counter)
        self.acting_writer.flush()

        # index at zero because TF returns an array
        action = self.actions[action_idx[0]]

        logging.debug('predict_online - observation {}'.format(obs))
        logging.debug('predict_online - pred_q_values {}'.format(q_values))
        logging.debug('predict_online - max_q {}'.format(max_q))
        logging.debug('predict_online - action_index {}'.format(action_idx))
        logging.debug('predict_online - action {}'.format(action))

        return action

    def update_target_network(self):
        """
        Updates the target network weights using the parameter tau.

        Relies on the sorted lists of tf.Variables kept in each Qfunc object.
        """
        logging.debug('updating target net at count {}'.format(self.counter))
        return self.sess.run(self.update_ops)

    def act(self, observation):
        """
        Our agent attempts to manipulate the world.

        Acting according to an epsilon-greedy policy.

        args
            observation (np.array)

        returns
            action (np.array)
        """
        self.counter += 1
        epsilon = self.epsilon_getter.epsilon
        logging.debug('epsilon is {}'.format(epsilon))

        if epsilon > random_uniform():
            action = self.env.action_space.sample()
            logging.debug('acting randomly - action is {}'.format(action))
        else:
            action = self.predict_online(observation)
            logging.debug('acting optimally - action is {}'.format(action))

        epsilon_sum = tf.Summary(
            value=[tf.Summary.Value(tag='epsilon', simple_value=epsilon)])
        self.acting_writer.add_summary(epsilon_sum, self.counter)
        self.acting_writer.flush()

        # return np.array(action).reshape(1, *self.action_space_shape)
        return action

    def learn(self):
        """
        Our agent attempts to make sense of the world.

        A batch sampled using experience replay is used to train the online
        network using targets from the target network.

        returns
            train_info (dict)
        """
        batch = self.memory.get_batch(self.batch_size)
        observations = batch['observations']
        actions = batch['actions']
        rewards = batch['rewards']
        terminals = batch['terminal']
        next_observations = batch['next_observations']

        next_obs_q = self.predict_target(next_observations)
        # if next state is terminal, set the value to zero
        next_obs_q[terminals] = 0

        # creating a target for Q(s,a) using the Bellman equation
        rewards = rewards.reshape(rewards.shape[0], 1)
        target = rewards + self.discount * next_obs_q

        if hasattr(self, 'target_processor'):
            target = self.target_processor(target)

        indices = np.zeros((actions.shape[0], 1), dtype=int)
        for arr, action in zip(indices, actions):
            idx = self.actions.index(action)
            arr[0] = idx

        rng = np.arange(actions.shape[0]).reshape(actions.shape[0], 1)
        indices = np.concatenate([rng, indices], axis=1)

        fetches = [
            self.online.q_values, self.online.q_value, self.online.loss,
            self.online.train_op, self.online.learning_summary
        ]
        feed_dict = {
            self.online.observation: observations,
            self.online.action: indices,
            self.online.target: target
        }
        q_vals, q_val, loss, train_op, train_sum = self.sess.run(
            fetches, feed_dict)

        logging.debug('learning - observations {}'.format(observations))
        logging.debug('learning - rewards {}'.format(rewards))
        logging.debug('learning - terminals {}'.format(terminals))
        logging.debug('learning - next_obs_q {}'.format(next_obs_q))
        logging.debug('learning - actions {}'.format(actions))
        logging.debug('learning - indices {}'.format(indices))
        logging.debug('learning - q_values {}'.format(q_vals))
        logging.debug('learning - q_value {}'.format(q_val))
        logging.debug('learning - target {}'.format(target))
        logging.debug('learning - loss {}'.format(loss))

        self.learning_writer.add_summary(train_sum, self.counter)
        self.update_target_network()

        return {'loss': loss}
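
# --- Hypothetical usage sketch (not part of the original implementation) ---
# Shows how the act/remember/learn cycle above might be driven for the
# CartPole-v1 environment that the constructor explicitly supports. The
# hyperparameter values and the `layers` tuple format are assumptions.
def run_cartpole_example(total_steps=50000):
    import gym

    with tf.Session() as sess:
        env = gym.make('CartPole-v1')
        agent = Agent(env, discount=0.99, tau=0.01, sess=sess,
                      total_steps=total_steps, batch_size=64,
                      layers=(64, 64), learning_rate=1e-3)
        observation = env.reset()
        for step in range(total_steps):
            action = agent.act(observation)
            next_observation, reward, done, info = env.step(action)
            agent.remember(observation, action, reward, next_observation, done)
            # start learning once at least one batch of experience is stored
            if step > agent.batch_size:
                agent.learn()
            observation = env.reset() if done else next_observation
    return agent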
import datetime
import math
import pickle

import numpy as np
import progressbar
import tensorflow as tf

# ReplayMemory (a prioritized replay buffer built on a sum tree) and
# NoisyNetDense are assumed to be defined or imported elsewhere in the project.
# The network input shape (32, 290, 4) and the two-action output are specific
# to the environment this agent was written for.


class Agent:

    def __init__(self,
                 environment,
                 optimizer,
                 memory_length,
                 dueling=True,
                 loss='mse',
                 noisy_net=False,
                 egreedy=False,
                 save_memory=None,
                 save_weights=None,
                 verbose_action=False,
                 ):
        self.environment = environment
        self._optimizer = optimizer
        self._loss = loss
        self.dueling = dueling
        self.egreedy = egreedy
        self.noisy_net = noisy_net

        # Initialize discount and exploration rate, etc.
        self.total_steps = 0
        self.gamma = 0.99
        self.epsilon = 1
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.00005
        self.tau = 0.05
        self.pretraining_steps = 0

        # Build networks
        self.q_network = self._build_compile_model()
        self.target_network = self._build_compile_model()
        self.align_target_model(how='hard')

        self.memory = ReplayMemory(memory_length)
        self.save_weights_fp = save_weights
        self.save_memory_fp = save_memory
        self.start_time = datetime.datetime.now()
        self.verbose_action = verbose_action

    def load_memory(self, fp):
        with open(fp, 'rb') as f:
            self.memory.load_memory(pickle.load(f))
        print(f'loading {self.memory.length} memories...')

    def save_memory(self, fp):
        if fp:
            with open(fp, 'wb') as f:
                print('saving replay memory...')
                pickle.dump(self.memory.get_memory(), f)

    def load_weights(self, weights_fp):
        if weights_fp:
            print('loading weights...')
            self.q_network.load_weights(weights_fp)
            self.align_target_model(how='hard')

    def save_weights(self, weights_fp):
        if weights_fp:
            self.q_network.save_weights(weights_fp)

    def set_epsilon_decay_schedule(self, epsilon, epsilon_min, annealed_steps):
        # exponential decay: epsilon reaches epsilon_min after annealed_steps decays
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = math.log(self.epsilon / self.epsilon_min) / annealed_steps

    def set_beta_schedule(self, beta_start, beta_max, annealed_samplings):
        # anneal the importance-sampling exponent used by prioritized replay
        self.memory.beta = beta_start
        self.memory.beta_max = beta_max
        self.memory.beta_increment_per_sampling = \
            (self.memory.beta_max - self.memory.beta) / annealed_samplings

    def predict(self, state, use_target=False):
        if use_target:
            return self.target_network.predict(state)
        return self.q_network.predict(state)

    def _decay_epsilon(self):
        self.epsilon = self.epsilon * np.exp(-self.epsilon_decay)

    def store(self, state, action, reward, next_state, terminated):
        self.memory.add((state, action, reward, next_state, terminated))
        self.total_steps += 1
        if not self.egreedy:
            if (self.epsilon > self.epsilon_min) and (self.memory.length > self.pretraining_steps):
                self._decay_epsilon()

    def batch_store(self, batch_load):
        batch_load[-2][2] = -0.1  # custom reward altering
        for row in batch_load:
            self.store(*row)

    def _build_compile_model(self):
        inputs = tf.keras.layers.Input(shape=(32, 290, 4))
        conv1 = tf.keras.layers.Conv2D(32, (8, 8), strides=4, padding='same',
                                       activation='relu')(inputs)
        conv2 = tf.keras.layers.Conv2D(64, (4, 4), strides=2, padding='same',
                                       activation='relu')(conv1)
        conv3 = tf.keras.layers.Conv2D(64, (3, 3), strides=1, padding='same',
                                       activation='relu')(conv2)
        conv3 = tf.keras.layers.Flatten()(conv3)

        # advantage (or plain Q) stream
        if self.noisy_net:
            advt = NoisyNetDense(256, activation='relu')(conv3)
            final = NoisyNetDense(2)(advt)
        else:
            advt = tf.keras.layers.Dense(256, activation='relu')(conv3)
            final = tf.keras.layers.Dense(2)(advt)

        if self.dueling:
            # separate state-value stream, combined with the mean-centred advantages
            if self.noisy_net:
                value = NoisyNetDense(256, activation='relu')(conv3)
                value = NoisyNetDense(1)(value)
            else:
                value = tf.keras.layers.Dense(256, activation='relu')(conv3)
                value = tf.keras.layers.Dense(1)(value)
            advt = tf.keras.layers.Lambda(
                lambda x: x - tf.reduce_mean(x, axis=1, keepdims=True))(final)
            final = tf.keras.layers.Add()([value, advt])

        model = tf.keras.models.Model(inputs=inputs, outputs=final)
        model.compile(optimizer=self._optimizer, loss=self._loss, metrics=['accuracy'])
        return model

    def align_target_model(self, how):
        assert how in ('hard', 'soft'), '"how" must be either "hard" or "soft"'
        if how == 'hard':
            self.target_network.set_weights(self.q_network.get_weights())
        elif how == 'soft':
            for t, e in zip(self.target_network.trainable_variables,
                            self.q_network.trainable_variables):
                t.assign(t * (1 - self.tau) + (e * self.tau))

    def choose_action(self, state):
        if not self.egreedy:
            if np.random.rand() <= self.epsilon:
                action = self.environment.action_space.sample()
                if self.verbose_action:
                    print(f'action: {action}, q: random')
                return action
        q_values = self.predict(state, use_target=False)
        action = np.argmax(q_values[0])
        if self.verbose_action:
            print(f'action: {action}, q: {q_values}')
        return action

    def train(self, batch, is_weights):
        td_errors = np.zeros(len(batch))
        states = np.zeros((len(batch), 32, 290, 4))
        targets = np.zeros((len(batch), 2))
        for i, (state, action, reward, next_state, terminated) in enumerate(batch):
            target, td_error = self._get_target(state, action, reward,
                                                next_state, terminated)
            states[i] = state.reshape(32, 290, 4)
            targets[i] = target
            td_errors[i] = td_error
        self.q_network.fit(states, targets, sample_weight=is_weights,
                           batch_size=32, epochs=1, verbose=0)
        self.align_target_model(how='soft')
        return td_errors

    def replay(self, batch_size, epoch_steps=None):
        num_batches = 1
        if epoch_steps:
            num_batches = int(np.max([np.floor(epoch_steps / 4), 1]))
        bar = progressbar.ProgressBar(
            maxval=num_batches,
            widgets=['training - ', progressbar.widgets.Counter(),
                     f'/{num_batches} ', progressbar.Bar('=', '[', ']'), ' ',
                     progressbar.Percentage()])
        bar.start()
        for i in range(num_batches):
            # prioritized experience replay
            leaf_idx, batch, is_weights = self.memory.get_batch(batch_size)
            td_errors = self.train(batch, is_weights)
            self.memory.update_sum_tree(leaf_idx, td_errors)
            bar.update(i + 1)
        bar.finish()
        self.save_weights(self.save_weights_fp)

    def _get_target(self, state, action, reward, next_state, terminated):
        target = self.predict(state, use_target=False)
        prev_target = target[0][action]
        if terminated:
            target[0][action] = reward
        else:
            # double Q Network: online network selects the action,
            # target network evaluates it
            a = np.argmax(self.predict(next_state, use_target=False)[0])
            target[0][action] = reward + (self.gamma *
                                          self.predict(next_state, use_target=True)[0][a])
        td_error = abs(prev_target - target[0][action])
        return target, td_error
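
# --- Hypothetical usage sketch (not part of the original implementation) ---
# Shows one way the prioritized-replay agent above might be wired to an
# environment that emits (32, 290, 4) observations and two discrete actions.
# The optimizer, episode count and schedule values are illustrative assumptions.
def run_per_example(env, n_episodes=100, batch_size=32):
    agent = Agent(env, optimizer=tf.keras.optimizers.Adam(1e-4),
                  memory_length=50000, dueling=True)
    # decay epsilon from 1.0 to 0.01 over ~100k stored transitions and
    # anneal the importance-sampling exponent beta from 0.4 to 1.0
    agent.set_epsilon_decay_schedule(1.0, 0.01, annealed_steps=100000)
    agent.set_beta_schedule(0.4, 1.0, annealed_samplings=10000)
    for _ in range(n_episodes):
        state = env.reset()
        done = False
        while not done:
            action = agent.choose_action(state.reshape(1, 32, 290, 4))
            next_state, reward, done, info = env.step(action)
            agent.store(state, action, reward, next_state, done)
            state = next_state
        # one pass of prioritized replay at the end of each episode
        agent.replay(batch_size)
    return agent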