import sys
import threading
import time
import numpy as np
import tensorflow as tf
# Agent, Network, ExpReplay and the `constants` module are this repo's own
# code; their import paths are not shown in this fragment.


# Recurrent variant of the DQN agent: an LSTM is unrolled over sequences of
# `conf.unroll` consecutive steps.
class RDQN_Agent(Agent):

    def __init__(self, sess, env, conf):
        super(RDQN_Agent, self).__init__(env, conf)
        self.num_lstm_units = 32
        self.lstm = True
        if conf.network_config == 'R_MLP':
            self.net_config = constants.R_MLP
            print 'Init new Recurrent MLP network...'
        else:
            print 'Bad network config given! Exiting...'
            sys.exit(-1)
        self.sess = sess
        self.discrete = False
        self.batch_size = conf.batch_size
        self.queue_size = self.batch_size * 4
        self.unroll = conf.unroll
        self.exp_replay = ExpReplay(self.env.state_shape[-1:], self.batch_size,
                                    self.unroll, capacity=conf.exp_replay_size)
        self.train_freq = conf.train_freq
        self.update_target_freq = conf.update_target_freq
        self.double_q = conf.double_q
        self.train_iterations = 0
        self.coord = tf.train.Coordinator()
        self.replay_lock = threading.Lock()
        self._initModel(conf.lr, conf.lr_minimum, conf.lr_decay_step, conf.lr_decay)
        self._updateTargetModel()
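# Added note: this constructor mirrors the feed-forward DQN_Agent.__init__
# (shown further down). The differences are the recurrent R_MLP network config
# with 32 LSTM units, a smaller input queue (batch_size * 4 vs. * 25), and a
# replay memory that samples sequences of `conf.unroll` consecutive steps
# instead of `env.history_length` stacked frames.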
import gym
import tensorflow as tf
# ActorNetwork, CriticNetwork, OUProcess, ExpReplay and DDPG are this repo's
# own classes; their import paths are not shown in this fragment. STATE_SIZE,
# ACTION_SIZE and the other upper-case names are hyperparameter constants
# defined elsewhere in the script.

env = gym.make('Pendulum-v0')
ob_space = env.observation_space
ac_space = env.action_space
print("Observation space: ", ob_space, ob_space.dtype)
print("Action space: ", ac_space, ac_space.shape)  # Pendulum actions are a continuous Box, which has no .n
s_size = ob_space.shape[0]
a_size = ac_space.shape[0]  # use shape, not .n, for a continuous action space
print('size: ' + str(s_size) + '/' + str(a_size))

actor = ActorNetwork(state_size=STATE_SIZE, action_size=ACTION_SIZE,
                     lr=ACTOR_LEARNING_RATE, n_h1=N_H1, n_h2=N_H2, tau=TAU)
critic = CriticNetwork(state_size=STATE_SIZE, action_size=ACTION_SIZE,
                       lr=CRITIC_LEARNING_RATE, n_h1=N_H1, n_h2=N_H2, tau=TAU)
noise = OUProcess(ACTION_SIZE)
exprep = ExpReplay(mem_size=MEM_SIZE, start_mem=START_MEM,
                   state_size=[STATE_SIZE], kth=-1, batch_size=BATCH_SIZE)

sess = tf.Session()
with tf.device('/{}:0'.format('CPU')):
    agent = DDPG(actor=actor, critic=critic, exprep=exprep, noise=noise,
                 action_bound=ACTION_RANGE)
sess.run(tf.global_variables_initializer())  # initialize_all_variables() is deprecated

# TensorBoard summary writer: create it once, not once per episode
summary_writer = tf.summary.FileWriter('/tmp/pendulum-log-0' + '/train',
                                       graph=tf.get_default_graph())

for i in range(NUM_EPISODES):
    cur_state = env.reset()
    cum_reward = 0
    if (i % EVALUATE_EVERY) == 0:
        print('====evaluation====')
    for t in range(MAX_STEPS):
class DQN_Agent(Agent):

    def __init__(self, sess, env, conf):
        super(DQN_Agent, self).__init__(env, conf)
        self.lstm = False
        if conf.network_config == 'MLP':
            self.net_config = constants.MLP
            print 'Init new MLP network...'
        elif conf.network_config == 'CNN':
            self.net_config = constants.CNN
            print 'Init new CNN network...'
        else:
            print 'Bad network config given! Exiting...'
            sys.exit(-1)
        self.sess = sess
        self.discrete = False
        self.batch_size = conf.batch_size
        self.queue_size = self.batch_size * 25
        self.exp_replay = ExpReplay(self.env.getStateShape()[-1:], self.batch_size,
                                    self.env.history_length,
                                    capacity=conf.exp_replay_size)
        self.train_freq = conf.train_freq
        self.update_target_freq = conf.update_target_freq
        self.double_q = conf.double_q
        self.unroll = self.env.history_length
        self.sgd_steps = 10
        self.train_iterations = 0
        self.coord = tf.train.Coordinator()
        self.replay_lock = threading.Lock()
        self._initModel(conf.lr, conf.lr_minimum, conf.lr_decay_step, conf.lr_decay)
        self._updateTargetModel()

    # Init the TensorFlow network models (online and target)
    def _initModel(self, lr, lr_min, lr_decay_step, lr_decay):
        state_shape = [self.env.history_length, self.env.getStateShape()[-1]]
        self.q_model = Network('q_network', state_shape, self.env.num_actions,
                               self.net_config, lr, lr_min, lr_decay_step,
                               lr_decay, self.batch_size, self.queue_size)
        self.t_model = Network('t_network', state_shape, self.env.num_actions,
                               self.net_config, lr, lr_min, lr_decay_step,
                               lr_decay, self.batch_size, self.queue_size)
        self.sess.run(tf.global_variables_initializer())
        self.copy_op = self._setupTargetUpdates()

    # Create ops to copy weights from the online net to the target net
    def _setupTargetUpdates(self):
        copy_ops = list()
        for key in self.q_model.weights.keys():
            copy_ops.append(
                self.t_model.weights[key].assign(self.q_model.weights[key]))
        return tf.group(*copy_ops, name='copy_op')

    # Run the online->target update ops
    def _updateTargetModel(self):
        self.sess.run(self.copy_op)

    # Take the given number of random steps and store the experiences
    def randomExplore(self, num_steps):
        step = 0
        self.newGame()
        while step < num_steps:
            action = self.env.gym_env.action_space.sample()
            self.env.takeAction(action)
            self._storeExperience(action)
            step += 1
            if self.env.done or step >= num_steps:
                self.newGame()

    # Run the agent for the desired number of steps, training as it goes
    def train(self, num_steps):
        step = 0
        episode_num = 0
        eps = self.train_eps
        while step < num_steps:
            reward_sum = 0.0
            self.newGame()
            while not self.env.done:
                # Epsilon-greedy: take a random action with probability eps,
                # otherwise act from the network
                if np.random.rand() < eps:
                    action = np.random.randint(self.env.num_actions)
                else:
                    state = self.env.getState(self.discrete)
                    action = self._selectAction(state, 0.1)
                self.env.takeAction(action)
                self._storeExperience(action)
                eps = self._decayEps()
                reward_sum += self.env.reward
                step += 1
                # Train and update networks as necessary
                if step % self.train_freq == 0:
                    self._trainNetwork()
                if step % self.update_target_freq == self.update_target_freq - 1:
                    self._updateTargetModel()
                # Handle episode termination
                if self.env.done or step >= num_steps:
                    episode_num += 1
                    self.callback.onStep(action, self.env.reward, True, eps)
                    break
                else:
                    self.callback.onStep(action, self.env.reward, self.env.done, eps)

    # Sample an action from the network's Q-value distribution
    def _selectAction(self, state, eps):
        state = state.reshape([1] + self.env.getStateShape())
        q_probs = self.sess.run(self.q_model.q_dist,
                                feed_dict={self.q_model.batch_input: state,
                                           self.q_model.keep_prob: eps})
        action_value = np.random.choice(q_probs[0], p=q_probs[0])
        return np.argmax(q_probs[0] == action_value)

    # Store the latest transition into replay memory
    def _storeExperience(self, action):
        with self.replay_lock:
            state = self.env.getState(self.discrete)
            if self.env.history_length == 1:
                self.exp_replay.storeExperience(state, action, self.env.reward,
                                                self.env.done)
            else:
                self.exp_replay.storeExperience(state[-1], action, self.env.reward,
                                                self.env.done)

    # Get target Q values based on the predicted max future reward
    def _getTargetQValues(self, states, actions, rewards, states_, done_flags):
        errors = None  # set up front so both branches can return it
        if self.double_q:
            # Double DQN: select actions with the online net,
            # evaluate them with the target net
            future_actions = self.sess.run(
                self.q_model.predict_op,
                feed_dict={self.q_model.batch_input: states_})
            target_q_values_with_idxs = self.sess.run(
                self.t_model.q_values_with_idxs,
                feed_dict={
                    self.t_model.batch_input: states_,
                    self.t_model.q_value_idxs: [[idx, future_a] for idx, future_a
                                                in enumerate(future_actions)]
                })
            pred_q_values = (1.0 - done_flags) * self.discount * \
                target_q_values_with_idxs + rewards
        else:
            max_future_q_values = self.sess.run(
                self.t_model.max_q_values,
                feed_dict={self.t_model.batch_input: states_})
            pred_q_values = (1.0 - done_flags) * self.discount * \
                max_future_q_values + rewards
        return errors, pred_q_values

    # Run the train ops
    def _trainNetwork(self):
        for i in range(self.sgd_steps):
            # Busy-wait until the queue holds at least one full batch
            while self.sess.run(self.q_model.queue_size_op) < self.batch_size:
                continue
            q_values, loss, _ = self.sess.run([self.q_model.q_values,
                                               self.q_model.loss,
                                               self.q_model.train_op])
            self.train_iterations += 1
            if self.callback:
                self.callback.onTrain(q_values, loss)

    # Start threads that load training data into the network queue
    def startEnqueueThreads(self):
        threads = list()
        for i in range(constants.NUM_QUEUE_THREADS):
            t = threading.Thread(target=self._enqueueThread)
            t.setDaemon(True)
            t.start()
            threads.append(t)
            self.coord.register_thread(t)
            time.sleep(0.1)

    # Stop the threads that load training data
    def stopEnqueueThreads(self):
        self.coord.request_stop()

    # Enqueue training data into the network queue
    def _enqueueThread(self):
        while not self.coord.should_stop():
            # Only enqueue when enough experiences exist and the queue has
            # room, so batches stay recent
            if self.exp_replay.size < self.batch_size or \
                    self.sess.run(self.q_model.queue_size_op) == self.queue_size:
                continue
            with self.replay_lock:
                states, actions, rewards, states_, done_flags = \
                    self.exp_replay.getBatch()
            errors, pred_q_values = self._getTargetQValues(
                states, actions, rewards, states_, done_flags)
            feed_dict = {
                self.q_model.queue_input: states,
                self.q_model.queue_action: actions,
                self.q_model.queue_label: pred_q_values
            }
            try:
                self.sess.run(self.q_model.enqueue_op, feed_dict=feed_dict)
            except tf.errors.CancelledError:
                return
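# Hypothetical driver (added for illustration; not part of the original
# source). It only calls methods defined on DQN_Agent above; `sess`, `env`
# and `conf` are assumed to come from the project's own setup code.
def run_dqn_agent(sess, env, conf, warmup_steps=50000, train_steps=1000000):
    agent = DQN_Agent(sess, env, conf)
    agent.randomExplore(warmup_steps)   # seed replay memory with random transitions
    agent.startEnqueueThreads()         # background threads feed batches into the TF queue
    try:
        agent.train(train_steps)        # epsilon-greedy interaction + periodic target sync
    finally:
        agent.stopEnqueueThreads()      # ask the coordinator to stop the enqueue threads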
print("make env") env = ns3env.Ns3Env(port=setting.port, stepTime=setting.stepTime, startSim=startSim, simSeed=setting.seed, simArgs=setting.simArgs, debug=setting.debug) print("done env"); env._max_episode_steps = setting.MAX_STEPS delay_history = [] rew_history = [] util_history = [] signal.signal(signal.SIGINT, signal_handler) actor = ActorNetwork(state_size=setting.STATE_SIZE, action_size=setting.ACTION_SIZE, lr=setting.ACTOR_LEARNING_RATE, n_h1=setting.N_H1, n_h2=setting.N_H2, tau=setting.TAU) critic = CriticNetwork(state_size=setting.STATE_SIZE, action_size=setting.ACTION_SIZE, lr=setting.CRITIC_LEARNING_RATE, n_h1=setting.N_H1, n_h2=setting.N_H2, tau=setting.TAU) noise = OUProcess(setting.ACTION_SIZE) exprep = ExpReplay(mem_size=setting.MEM_SIZE, start_mem=setting.START_MEM, state_size=[setting.STATE_SIZE], kth=-1, batch_size=setting.BATCH_SIZE) sess = tf.Session() with tf.device('/{}:0'.format('CPU')): agent = DDPG(actor=actor, critic=critic, exprep=exprep, noise=noise, action_bound=setting.ACTION_RANGE) sess.run(tf.initialize_all_variables()) for i in range(setting.NUM_EPISODES): cur_state = env.reset() cum_reward = 0 if (i % setting.EVALUATE_EVERY) == 0: print ('====evaluation====') for t in range(setting.MAX_STEPS): print("Time step: " + str(t)) if (i % setting.EVALUATE_EVERY) == 0: env.render()
import gym  # needed for gym.make below; missing from the original imports
from my_dqn import DQN
from exp_replay import ExpReplay
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
# from gym.wrappers.monitoring.video_recorder import VideoRecorder

env_name = 'MountainCar-v0'
env = gym.make(env_name)
dump_path = './checkpoints/cartpole'
# recorder = None
# recorder = VideoRecorder(env, f'/home/sudipto/Reinforcement/Videos/{env_name}-b256-h32-lr3e-3.mp4', enabled=True)

action_space = env.action_space.n
dqn = DQN(action_space, hunits1=16, hunits2=16)
replay = ExpReplay(100000)

num_episodes = 1000
batch_size = 128
cur_frame = 0
epsilon = 0.99
window = []
mean_rewards = []

# For training the agent
for episode in range(1, num_episodes + 1):
    state = env.reset()
    ep_reward, done = 0, False
    while not done:
        # env.render()
        state_in = tf.expand_dims(state, axis=0)
        action = dqn.chose_action(state_in, epsilon)
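        # Hedged sketch of the rest of the training loop (added; the original
        # snippet ends mid-iteration). `replay.add_exp`, `replay.sample` and
        # `dqn.train_step` are assumed names; the real ExpReplay/DQN APIs in
        # exp_replay/my_dqn may differ.
        next_state, reward, done, _ = env.step(action)
        ep_reward += reward
        replay.add_exp((state, action, reward, next_state, done))  # assumed signature
        state = next_state
        cur_frame += 1
        if cur_frame >= batch_size:
            dqn.train_step(replay.sample(batch_size))              # assumed API
    epsilon = max(0.05, epsilon * 0.995)   # simple multiplicative decay (assumption)
    window.append(ep_reward)
    if len(window) > 100:                  # keep a 100-episode moving window
        window.pop(0)
    mean_rewards.append(np.mean(window))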