import numpy as np
import tensorflow as tf

from grid_env import GridEnv
# CMPPolicy, get_cfg_defaults, and prepare_feed_dict are assumed to be provided by the
# project's CMP modules; their import paths are not shown in this snippet.


class CMPRunner(object):
    def __init__(self, model_path):
        config = self._get_tf_config()
        args = get_cfg_defaults()
        self.tf_graph = tf.Graph()
        with self.tf_graph.as_default():
            self.policy = CMPPolicy(args,
                                    is_training=False,
                                    batch_norm_is_training=False,
                                    only_single_step_graph=True)
            self.sess = tf.Session(config=config)
            self.sess.run(self.policy.init_op)
            self.policy.saver_op.restore(self.sess, model_path)

    def set_new_goal(self, goal):
        goal_raw = np.array(goal)
        self.grid_env = GridEnv()
        _ = self.grid_env.reset(goal_raw)
        self.state_features, self.step_data_cache = [], []
        # initialize the recurrent network state and key it by state name
        feed_dict = {}
        self.net_state = self.sess.run(self.policy.train_ops['init_state'],
                                       feed_dict=feed_dict)
        self.net_state = dict(
            zip(self.policy.train_ops['state_names'], self.net_state))

    def compute_action(self, image):
        # assemble the step features: env features, current image, and network state
        f = self.grid_env.get_features()
        f['imgs'] = image.copy()
        f = self.grid_env.pre_features(f)
        f.update(self.net_state)
        self.state_features.append(f)

        feed_dict = prepare_feed_dict(self.policy.input_tensors['step'],
                                      self.state_features[-1])
        feed_dict[self.policy.train_ops['batch_norm_is_training_op']] = False
        kwargs = {}
        outs = self.sess.run([self.policy.train_ops['step'],
                              self.policy.train_ops['step_data_cache'],
                              self.policy.train_ops['updated_state']],
                             feed_dict=feed_dict, **kwargs)

        # greedy action from the step's action probabilities
        action_probs = np.expand_dims(outs[0], axis=1)
        action = [np.argmax(action_probs[0, 0, :])]
        self.step_data_cache.append(
            dict(zip(self.policy.train_ops['step_data_cache_names'], outs[1])))
        self.net_state = outs[2]
        assert action_probs.shape[1] == 1

        next_state = self.grid_env.take_action(action[0])
        self.net_state = dict(
            zip(self.policy.train_ops['state_names'], self.net_state))
        return action, next_state

    def _get_tf_config(self):
        config = tf.ConfigProto()
        config.device_count['GPU'] = 1
        config.gpu_options.allow_growth = True
        config.intra_op_parallelism_threads = 0
        config.inter_op_parallelism_threads = 0
        return config
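# Usage sketch (assumption, not from the original source): drives CMPRunner for a few
# steps with a dummy image. The checkpoint path, goal cell, and image shape below are
# illustrative guesses and would need to match the actual CMP model and GridEnv.
if __name__ == "__main__":
    runner = CMPRunner(model_path="/path/to/cmp_checkpoint")      # hypothetical path
    runner.set_new_goal([2, 3])                                   # hypothetical goal cell
    dummy_image = np.zeros((1, 224, 224, 3), dtype=np.float32)    # assumed input shape
    for _ in range(5):
        action, next_state = runner.compute_action(dummy_image)
        print(action, next_state)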
import os

import numpy as np

from grid_env import GridEnv
from agent import Agent

NUM_EPISODE = 1000

if __name__ == "__main__":
    env = GridEnv()
    agent = Agent(env)
    for n_episode in range(NUM_EPISODE):
        state = env.reset()
        while True:
            action = agent.get_action(state)
            next_state, reward, done = env.step(action)
            next_action = agent.get_action(next_state)
            # SARSA-style update: bootstraps on the action actually chosen in next_state
            agent.update_table(state, action, reward, next_state, next_action)
            state = next_state
            if done:
                break

    # build an ASCII grid string for debugging output (truncated in the original)
    debug_str = ""
    for h in range(env.height):
        for w in range(env.width):
            debug_str += '****************'
        debug_str += "*\n"
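# Sketch (assumption, not the project's agent.py): a minimal tabular SARSA agent with
# the interface the loop above uses (get_action / update_table). The hyperparameters,
# the integer state index, and env.height / env.width / env.num_actions are guesses.
import numpy as np


class TabularSarsaAgent:
    def __init__(self, env, alpha=0.1, gamma=0.99, eps=0.1):
        self.env = env
        self.alpha, self.gamma, self.eps = alpha, gamma, eps
        # one row per grid cell, one column per action (assumed env attributes)
        self.q_table = np.zeros((env.height * env.width, env.num_actions))

    def get_action(self, state):
        # epsilon-greedy over the tabular Q values
        if np.random.uniform() < self.eps:
            return np.random.randint(self.q_table.shape[1])
        return int(np.argmax(self.q_table[state]))

    def update_table(self, state, action, reward, next_state, next_action):
        # SARSA target uses the action actually taken in next_state
        td_target = reward + self.gamma * self.q_table[next_state, next_action]
        td_error = td_target - self.q_table[state, action]
        self.q_table[state, action] += self.alpha * td_error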
from __future__ import division

import tensorflow as tf
import numpy as np
import gym
import itertools
import collections

from grid_env import GridEnv
from lib import plotting
from matplotlib import pyplot as plt

env = GridEnv()
obs_dim, act_dim, grid_size = env.get_dimensions()


# Coordinated Exploration
class MultiAgent():
    def __init__(self, learning_rate=0.01):
        # regularization constants: 0.0005 for the hard env, 0.0002 for the easy env
        self.reg_constant_1 = 0.0005
        self.reg_constant_2 = 0.0005

        with tf.variable_scope('agent1'):
            # Define policy for agent 1
            self.lambda_1 = tf.placeholder(tf.float32, name='lambda_1')
            self.state_1 = tf.placeholder(tf.int32, [], "state_1")
            self.action_1 = tf.placeholder(dtype=tf.int32, name="action_1")
            self.target_1 = tf.placeholder(dtype=tf.float32, name="target_1")
            self.ratio_1 = tf.placeholder(dtype=tf.float32, name="ratio_1")
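# Sketch (assumption; the MultiAgent definition above is truncated at this point):
# a self-contained example of how placeholders like those above are commonly wired
# into a softmax policy with a REINFORCE-style loss plus a scaled regularizer.
# The layer sizes, the loss form, and the ratio/lambda usage are illustrative only,
# not the original implementation.
def build_policy_head(state_ph, action_ph, target_ph, ratio_ph, lambda_ph,
                      n_states, n_actions, reg_constant, learning_rate=0.01):
    state_one_hot = tf.one_hot(state_ph, n_states)                    # encode state index
    logits = tf.layers.dense(tf.expand_dims(state_one_hot, 0), n_actions)
    action_probs = tf.squeeze(tf.nn.softmax(logits))
    picked_prob = tf.gather(action_probs, action_ph)
    pg_loss = -tf.log(picked_prob) * target_ph                        # policy-gradient term
    regularizer = lambda_ph * ratio_ph                                # placeholder-driven penalty
    loss = pg_loss + reg_constant * regularizer
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    return action_probs, loss, train_op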
class DQN_agent():
    def __init__(self):
        self.eps = 0.1
        self.env = GridEnv(3)
        self.batch_size = 20

        if prioritized_replay and replay_type == "proportional":
            self.replay = ProportionalReplay(max_buffer_size,
                                             prioritized_replay_alpha)
        elif prioritized_replay and replay_type == "ranked":
            N_list = [self.batch_size] + [
                int(x) for x in np.linspace(100, max_buffer_size, 5)
            ]
            save_quantiles(N_list=N_list,
                           k=self.batch_size,
                           alpha=prioritized_replay_alpha)
            self.replay = RankBasedReplay(max_buffer_size,
                                          prioritized_replay_alpha)
        else:
            self.replay = ExperienceReplay(max_buffer_size)  # passing size of buffer

        # define graph
        self.inputs = tf.placeholder(tf.float32,
                                     shape=(None, self.env.state_size))
        self.target_values = tf.placeholder(tf.float32, shape=(None,))
        self.actions = tf.placeholder(tf.int32, shape=(None,))
        # importance sampling weights for prioritized replay
        self.is_weights = tf.placeholder(tf.float32, shape=(None,))

        # build main network and an identical target network
        self.Q_out_op, self.Q_update_op, self.td_error_op = self.build_graph()
        self.target_Q_out_op, _, _ = self.build_graph('target')

        self.init_op = tf.global_variables_initializer()
        self.sess = tf.Session()

    def build_graph(self, scope='main'):
        with tf.variable_scope(scope):
            h = tf.layers.dense(self.inputs, 16, activation=tf.nn.relu, name="h")
            outputs = tf.layers.dense(h,
                                      self.env.num_actions,
                                      activation=tf.nn.softmax,
                                      name="outputs")

            # everything is now the same shape (batch_size, num_actions);
            # nonzero error only for selected actions
            action_mask = tf.one_hot(self.actions,
                                     self.env.num_actions,
                                     on_value=True,
                                     off_value=False)
            targets = tf.tile(tf.expand_dims(self.target_values, 1),
                              [1, self.env.num_actions])
            # takes target value where mask is true, takes outputs value otherwise
            target_outputs = tf.where(action_mask, targets, outputs)

            # only one element in each row is non-zero
            td_error = target_outputs - outputs
            # weights are all 1s when not using priority replay
            weights = tf.tile(tf.expand_dims(self.is_weights, 1),
                              [1, self.env.num_actions])
            weighted_td_error = weights * td_error  # element-wise multiplication

            loss = tf.reduce_sum(tf.square(weighted_td_error))
            update = tf.train.AdamOptimizer().minimize(loss)
        return outputs, update, td_error

    def train(self):
        steps_per_ep = np.zeros(episodes)
        for episode in range(episodes):
            print(episode)
            self.env.reset()
            state = self.env.state

            done = False
            num_steps = 0
            while not done:
                num_steps += 1

                action = self.get_eps_action(state, self.eps)
                next_state, reward, done, _ = self.env.step(action)
                # store in experience replay
                self.replay.add((state, action, reward, next_state, done))

                # sample from experience replay
                if prioritized_replay:
                    # linear annealing schedule for IS weights
                    beta = beta0 + episode * (1 - beta0) / episodes
                    states, actions, rewards, next_states, dones, weights, indices = \
                        self.replay.sample(self.batch_size, beta)
                    self.net_update(states, actions, rewards, next_states,
                                    dones, weights, indices)  # qlearning
                else:
                    states, actions, rewards, next_states, dones = \
                        self.replay.sample(self.batch_size)
                    self.net_update(states, actions, rewards, next_states,
                                    dones)  # qlearning

                # slowly update target network
                if num_steps % update_every == 0:
                    self.target_net_update()

                # sort max heap periodically
                if num_steps % sort_every == 0:
                    if prioritized_replay and replay_type == "ranked":
                        self.replay.sort()

                state = next_state

            steps_per_ep[episode] = num_steps
        return steps_per_ep

    # from https://tomaxent.com/2017/07/09/Using-Tensorflow-and-Deep-Q-Network-Double-DQN-to-Play-Breakout/
    def target_net_update(self):
        # get sorted lists of parameters in each of the networks
        main_params = [
            t for t in tf.trainable_variables() if t.name.startswith("main")
        ]
        main_params = sorted(main_params, key=lambda v: v.name)
        target_params = [
            t for t in tf.trainable_variables() if t.name.startswith("target")
        ]
        target_params = sorted(target_params, key=lambda v: v.name)

        # copy each main-network variable into the matching target-network variable
        update_ops = []
        for main_v, target_v in zip(main_params, target_params):
            op = target_v.assign(main_v)
            update_ops.append(op)
        self.sess.run(update_ops)

    # minibatch qlearning
    def net_update(self, states, actions, rewards, next_states, dones,
                   weights=None, indices=None):
        not_dones = np.logical_not(dones)

        # create a shape (batch_size,) array of target values
        target_values = rewards.astype(float)  # np.array of shape (batch_size,)
        next_inputs = next_states[not_dones]   # np.array of shape (#not done, state_size)
        next_Qs = self.sess.run(self.Q_out_op,
                                {self.inputs: next_inputs})  # shape (#not done, num_actions)
        max_Qs = np.max(next_Qs, axis=1)        # np.array of shape (#not done,)
        target_values[not_dones] += gamma * max_Qs

        # if not using prioritized replay
        if weights is None:
            weights = np.ones(self.batch_size)

        # compute gradients and update parameters
        _, td_error = self.sess.run(
            [self.Q_update_op, self.td_error_op],
            {self.inputs: states,
             self.target_values: target_values,
             self.actions: actions,
             self.is_weights: weights})

        # update priority replay priorities
        if indices is not None:
            td_error = td_error.ravel()[np.flatnonzero(td_error)]  # shape (batch_size,)
            # add small number to prevent never sampling 0 error transitions
            self.replay.update_priorities(indices, np.abs(td_error) + 1e-3)

    # returns eps-greedy action with respect to Q
    def get_eps_action(self, state, eps):
        if self.env.np_random.uniform() < eps:
            action = self.env.sample()
        else:
            Q = self.sess.run(self.Q_out_op, {self.inputs: np.array([state])})
            max_actions = np.where(np.ravel(Q) == Q.max())[0]
            # break ties among argmax actions randomly
            action = self.env.np_random.choice(max_actions)
        return action
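# Sketch (assumption, not the project's replay module): a minimal uniform replay buffer
# with the interface DQN_agent uses above (add, sample). The prioritized variants
# (ProportionalReplay / RankBasedReplay) additionally return importance-sampling
# weights and indices and support update_priorities; they are not reproduced here.
import random
from collections import deque

import numpy as np


class SimpleExperienceReplay:
    def __init__(self, max_size):
        self.buffer = deque(maxlen=max_size)  # drops oldest transitions when full

    def add(self, transition):
        # transition is a (state, action, reward, next_state, done) tuple
        self.buffer.append(transition)

    def sample(self, batch_size):
        # uniform sampling without replacement; clip to buffer size while it fills up
        batch = random.sample(self.buffer, min(batch_size, len(self.buffer)))
        states, actions, rewards, next_states, dones = map(np.array, zip(*batch))
        return states, actions, rewards, next_states, dones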