def Main():
    print("Starting")
    env = gameEnv(partial=False, size=9)  # fully observable variant (overwritten below)
    env = gameEnv(partial=True, size=9)   # partially observable variant actually used
    Train(env)
    Test(env)
    print("Finished")
def show_game(ckpt, game_len): """ loads a model up from ckpt and plays a single game to show. """ scope = cur_scope() env = gridworld.gameEnv(partial=False, size=5) tf.reset_default_graph() qnet = get_dummy_net(scope, env.state.shape, env.actions) plt.ion() plt.show() init = tf.global_variables_initializer() saver = tf.train.Saver() with tf.Session() as sess: sess.run(init) restore_from_ckpt(sess, scope, ckpt, env.state.shape, env.actions) state = env.reset() for _ in range(game_len): plt.imshow(state) plt.draw() plt.pause(.1) action = qnet.predict(sess, np.array([state]))[0] state, _, _ = env.step(action) plt.close() plt.ioff()
def __init__(self):
    self.batch_size = 64               # How many experiences to use for each training step
    self.train_frequency = 5           # How often to update the network
    self.num_epochs = 20               # How many epochs to train for when updating the network
    self.y = 0.99                      # Discount factor
    self.prob_random_start = 0.6       # Starting chance of random action
    self.prob_random_end = 0.1         # Ending chance of random action
    self.annealing_steps = 1000.       # Steps of training over which to reduce start_e -> end_e
    self.max_num_episodes = 10000      # Max number of episodes allowed for training
    self.min_pre_train_episodes = 100  # Number of episodes played with random actions before training starts
    self.max_num_step = 50             # Maximum allowed episode length
    self.goal = 15                     # Total reward we want to achieve while playing a game

    # Set up the environment
    self.env = gameEnv(partial=False, size=5)

    # Reset everything from the Keras session
    K.clear_session()

    # Set up our Q-networks
    self.main_qn = Qnetwork()
    self.target_qn = Qnetwork()

    # Set up our experience replay
    self.experience_replay = ExperienceReplay()
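The annealing fields above define a linear schedule from prob_random_start to prob_random_end over annealing_steps training steps. A minimal sketch of that schedule using the values above (the per-step update is an assumption; this config class does not show it here):

# Hypothetical annealing sketch derived from the config fields above; not part of the original class.
prob_random = 0.6                    # prob_random_start
drop_per_step = (0.6 - 0.1) / 1000.  # (prob_random_start - prob_random_end) / annealing_steps
for _ in range(1000):
    prob_random = max(0.1, prob_random - drop_per_step)
# prob_random is now clamped at prob_random_end (0.1)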
def test_model(ckpt, num_games, game_len): """ Loads up a model and plays a number of games with it. Doesn't do any training, but reports the net reward at the end. """ scope = cur_scope() env = gridworld.gameEnv(partial=False, size=5) tf.reset_default_graph() qnet = get_dummy_net(scope, env.state.shape, env.actions) net_reward = 0 init = tf.global_variables_initializer() saver = tf.train.Saver() with tf.Session() as sess: sess.run(init) restore_from_ckpt(sess, scope, ckpt, env.state.shape, env.actions) for _ in range(num_games): state = env.reset() for i in range(game_len): action = qnet.predict(sess, np.array([state]))[0] state, reward, _ = env.step(action) net_reward += reward print('num_games =', num_games, ' game_len =', game_len, ' score =', net_reward, ' ckpt =', ckpt)
def evaluate(env_name, online_net): # env = gym.make(env_name) env = gameEnv(partial=False, size=32, object_size=1) avg_reward = 0. for ep in range(20): state = env.reset() # screen = env.render(mode='rgb_array') frame_state = np.zeros((4 * 3, 32, 32), dtype=np.float32) frame_state[9:, :, :] = np.transpose(state, (2, 0, 1)) total_reward = 0 step = 0.0 while (step < maxSteps): q_values = online_net( Variable( torch.from_numpy(frame_state).unsqueeze(0).cuda().type( torch.cuda.FloatTensor))) action = get_action(q_values, 0.05, env) next_state, reward, done = env.step(action) total_reward += reward # screen = env.render(mode='rgb_array') next_frame_state = np.copy(frame_state) for i in range(3): next_frame_state[i * 3:(i + 1) * 3, :, :] = next_frame_state[(i + 1) * 3:(i + 2) * 3, :, :] # next_frame_state[9:, :, :] = np.transpose(process_frame(screen), (2, 0, 1)) next_frame_state[9:, :, :] = np.transpose(next_state, (2, 0, 1)) frame_state = next_frame_state if done: break print('Evaluation done of episode:', ep + 1) avg_reward += total_reward avg_reward /= 20. return avg_reward
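The channel-shifting loop in evaluate() maintains a stack of the last four RGB frames. A standalone sketch of that frame-stacking step, assuming 32x32 RGB observations as above (the helper name is made up for illustration):

import numpy as np

def push_frame(frame_state, observation):
    # frame_state: (12, 32, 32) = four stacked RGB frames, channels-first.
    # observation: (32, 32, 3) frame returned by env.step().
    new_state = np.copy(frame_state)
    new_state[:-3] = new_state[3:]                          # drop the oldest frame
    new_state[-3:] = np.transpose(observation, (2, 0, 1))   # append the newest frame
    return new_state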
def __init__(self):
    # Define your network architecture here. It is also a good idea to define any training operations
    # and optimizers here, initialize your variables, or alternately compile your model here.
    self.gymEnv = gameEnv(partial=False, size=5)
    # self.stateSize = self.gymEnv.observation_space.shape[0]
    self.actionSize = self.gymEnv.actions
    self.maxSteps = 50  # limit maximum steps per episode
    self.numEpisodes = 10000
    self.learningRate = 0.0001
    self.epsilon_start = 1
    self.epsilon = self.epsilon_start
    self.epsilon_end = 0.1
    self.annealingSteps = 10000
    self.epsilon_decay = (self.epsilon_start - self.epsilon_end) / self.annealingSteps
    self.max_epLength = 50
    # Discount factor: 1 for MountainCar, and 0.99 for CartPole and Space Invaders.
    self.gamma = 0.99
    self.network = "dueling_double"
    self.batch_size = 32
    self.skip = 4

    # Create a main model and a target model
    self.model = self._createModel()
    self.target_model = self._createModel()
    # Initialize the target model so that the parameters in the two models are the same
    self.update_target_model()

    # Summaries for loss, reward, epsilon and Q
    self.loss = tf.placeholder(tf.float32)
    # print("loss shape", tf.shape(self.loss))  # shape=(?,)
    tf.summary.scalar('loss-81ffb76', tf.reduce_mean(self.loss))
    self.reward = tf.placeholder(tf.float32)
    tf.summary.scalar('reward', self.reward)
    self.currentEpsilon = tf.placeholder(tf.float32)
    tf.summary.scalar('epsilon', self.currentEpsilon)
    self.Q = tf.placeholder(tf.float32)
    tf.summary.scalar('Q', self.Q)
    self.merged = tf.summary.merge_all()
def combine_nets(combine_dir, submodels_dir, episodes=0):
    """
    Combines a set of neural networks by taking the average of each weight,
    weighted by the number of experiences each submodel has seen.
    :param combine_dir: directory to save the combined model to
    :param submodels_dir: directory to load models from
    :param episodes: base number of episodes to add to the model total;
        ends up being the total count
    :return: path to the combined model
    """
    avg_scope = cur_scope()
    env = gridworld.gameEnv(partial=False, size=5)
    tf.reset_default_graph()
    # Create the network that will be used to combine the others.
    avg_qnet = get_dummy_net(avg_scope, env.state.shape, env.actions)
    avg_vars = tf.trainable_variables(avg_scope)
    init = tf.global_variables_initializer()
    avg_saver = tf.train.Saver(
        tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=avg_scope))
    models = get_models_to_combine(submodels_dir)
    with tf.Session() as sess:
        sess.run(init)
        for i, m in enumerate(models):
            _, m_scope, eps = m.split(DELIM)
            eps = int(eps)
            m_qnet = get_dummy_net(m_scope, env.state.shape, env.actions)
            restore_from_ckpt(sess, m_scope,
                              os.path.join(submodels_dir, m + '.ckpt'),
                              env.state.shape, env.actions)
            m_vars = tf.trainable_variables(m_scope)
            for avg, loaded in zip(avg_vars, m_vars):
                sess.run(
                    avg.assign(
                        avg.value() * episodes / (episodes + eps) +
                        loaded.value() * eps / (episodes + eps)))
            episodes += eps
        # Save the combined network
        ckpt_path = make_ckpt_path(combine_dir, avg_scope, episodes)
        avg_saver.save(sess, ckpt_path)
    return ckpt_path
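The assign() above folds each submodel into a running average weighted by experience count. A plain-Python sanity check of that incremental rule (the numbers are made up for illustration):

# Incremental experience-weighted average, mirroring the assign() in combine_nets above.
episodes = 0   # experiences already folded into the running average
avg = 0.0      # running average of a single weight
for weight, eps in [(1.0, 100), (3.0, 300)]:
    avg = avg * episodes / (episodes + eps) + weight * eps / (episodes + eps)
    episodes += eps
print(avg)     # 2.5 == (1.0 * 100 + 3.0 * 300) / 400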
def __init__(self):
    self.path = '../model/'
    self.network = 'DRQN_runner'
    self.numEpisodes = 10000
    self.learningRate = 0.0001
    self.epsilon_start = 1
    self.epsilon = self.epsilon_start
    self.epsilon_end = 0.1
    self.annealingSteps = 10000
    self.epsilon_decay = (self.epsilon_start - self.epsilon_end) / self.annealingSteps
    self.gamma = 0.99
    self.gymEnv = gameEnv(partial=False, size=5)
    self.actionSize = self.gymEnv.actions
    self.duelDQN = duelDQN(self.learningRate, self.actionSize)
    self.model = self.duelDQN.model
    self.target_model = self.duelDQN.target_model
    self.batch_size = 32
    self.skip = 4
    self.max_epLength = 50
    self.update_Q_steps = 1000
# This code deals with "partial obserbability markov decision process" question with deep recurrent q network and convolution layer import numpy as np import random import tensorflow as tf import matplotlib.pyplot as plt import scipy.misc import os import csv import itertools import tensorflow.contrib.slim as slim # %matplotlib inline from helper import * from gridworld import gameEnv env = gameEnv(partial=False, size=9) env = gameEnv(partial=True, size=9) # @ # You build neural network class Qnetwork(): """ This class is for q network object\n Args: 1.h_size: 1.rnn_cell: 1.myScope: """ def __init__(self, h_size, rnn_cell, myScope): # Network takes in frame as vectorized array from game
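A hedged usage sketch for a Qnetwork with this signature, following the common DRQN pattern of one recurrent cell per network (the cell type, h_size value, and scope names are assumptions, not taken from this snippet):

# Hypothetical instantiation sketch; cell type and sizes are assumptions.
h_size = 512  # size of the final conv layer feeding the recurrent cell
cell = tf.contrib.rnn.BasicLSTMCell(num_units=h_size, state_is_tuple=True)
cellT = tf.contrib.rnn.BasicLSTMCell(num_units=h_size, state_is_tuple=True)
mainQN = Qnetwork(h_size, cell, 'main')
targetQN = Qnetwork(h_size, cellT, 'target')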
def __init__(self, size, max_steps, total_steps):
    self.env = gameEnv(partial=False, size=size)
    self.max_steps = max_steps
    self.total_steps = total_steps
def part_4(): import gym import numpy as np import random import tensorflow as tf import tensorflow.contrib.slim as slim import matplotlib.pyplot as plt import scipy.misc import os from gridworld import gameEnv env = gameEnv(partial=False, size=5) # print(env) # print(env.renderEnv()) class Qnetwork(): def __init__(self, h_size): # The network recieves a frame from the game, flattened into an array. # It then resizes it and processes it through four convolutional layers. self.scalarInput = tf.placeholder(shape=[None, 21168], dtype=tf.float32) self.imageIn = tf.reshape(self.scalarInput, shape=[-1, 84, 84, 3]) self.conv1 = slim.conv2d(inputs=self.imageIn, num_outputs=32, kernel_size=[8, 8], stride=[4, 4], padding='VALID', biases_initializer=None) self.conv2 = slim.conv2d(inputs=self.conv1, num_outputs=64, kernel_size=[4, 4], stride=[2, 2], padding='VALID', biases_initializer=None) self.conv3 = slim.conv2d(inputs=self.conv2, num_outputs=64, kernel_size=[3, 3], stride=[1, 1], padding='VALID', biases_initializer=None) self.conv4 = slim.conv2d(inputs=self.conv3, num_outputs=h_size, kernel_size=[7, 7], stride=[1, 1], padding='VALID', biases_initializer=None) # We take the output from the final convolutional layer and split it into separate advantage and value streams. self.streamAC, self.streamVC = tf.split(self.conv4, 2, 3) self.streamA = slim.flatten(self.streamAC) self.streamV = slim.flatten(self.streamVC) xavier_init = tf.contrib.layers.xavier_initializer() self.AW = tf.Variable(xavier_init([h_size // 2, env.actions])) self.VW = tf.Variable(xavier_init([h_size // 2, 1])) self.Advantage = tf.matmul(self.streamA, self.AW) self.Value = tf.matmul(self.streamV, self.VW) # Then combine them together to get our final Q-values. self.Qout = self.Value + tf.subtract(self.Advantage, tf.reduce_mean(self.Advantage, axis=1, keep_dims=True)) self.predict = tf.argmax(self.Qout, 1) # Below we obtain the loss by taking the sum of squares difference between the target and prediction Q values. self.targetQ = tf.placeholder(shape=[None], dtype=tf.float32) self.actions = tf.placeholder(shape=[None], dtype=tf.int32) self.actions_onehot = tf.one_hot(self.actions, env.actions, dtype=tf.float32) self.Q = tf.reduce_sum(tf.multiply(self.Qout, self.actions_onehot), axis=1) self.td_error = tf.square(self.targetQ - self.Q) self.loss = tf.reduce_mean(self.td_error) self.trainer = tf.train.AdamOptimizer(learning_rate=0.0001) self.updateModel = self.trainer.minimize(self.loss) class experience_buffer(): def __init__(self, buffer_size=50000): self.buffer = [] self.buffer_size = buffer_size def add(self, experience): if len(self.buffer) + len(experience) >= self.buffer_size: self.buffer[0:(len(experience) + len(self.buffer)) - self.buffer_size] = [] self.buffer.extend(experience) def sample(self, size): return np.reshape(np.array(random.sample(self.buffer, size)), [size, 5]) def processState(states): return np.reshape(states, [21168]) def updateTargetGraph(tfVars, tau): total_vars = len(tfVars) op_holder = [] for idx, var in enumerate(tfVars[0:total_vars // 2]): op_holder.append(tfVars[idx + total_vars // 2].assign( (var.value() * tau) + ((1 - tau) * tfVars[idx + total_vars // 2].value()))) return op_holder def updateTarget(op_holder, sess): for op in op_holder: sess.run(op) batch_size = 32 # How many experiences to use for each training step. update_freq = 4 # How often to perform a training step. 
y = .99 # Discount factor on the target Q-values startE = 1 # Starting chance of random action endE = 0.1 # Final chance of random action annealing_steps = 10000. # How many steps of training to reduce startE to endE. num_episodes = 220 # How many episodes of game environment to train network with. ##### default = 10000 pre_train_steps = 10000 # How many steps of random actions before training begins. max_epLength = 50 # The max allowed length of our episode. load_model = False # Whether to load a saved model. path = "./dqn" # The path to save our model to. h_size = 512 # The size of the final convolutional layer before splitting it into Advantage and Value streams. tau = 0.001 # Rate to update target network toward primary network tf.reset_default_graph() mainQN = Qnetwork(h_size) targetQN = Qnetwork(h_size) init = tf.global_variables_initializer() saver = tf.train.Saver() trainables = tf.trainable_variables() targetOps = updateTargetGraph(trainables, tau) myBuffer = experience_buffer() # Set the rate of random action decrease. e = startE stepDrop = (startE - endE) / annealing_steps # create lists to contain total rewards and steps per episode jList = [] rList = [] total_steps = 0 # Make a path for our model to be saved in. if not os.path.exists(path): os.makedirs(path) with tf.Session() as sess: sess.run(init) if load_model == True: print('Loading Model...') ckpt = tf.train.get_checkpoint_state(path) saver.restore(sess, ckpt.model_checkpoint_path) for i in range(num_episodes): print('episode:', i) episodeBuffer = experience_buffer() # Reset environment and get first new observation s = env.reset() s = processState(s) d = False rAll = 0 j = 0 # The Q-Network while j < max_epLength: # If the agent takes longer than 200 moves to reach either of the blocks, end the trial. j += 1 # Choose an action by greedily (with e chance of random action) from the Q-network if np.random.rand(1) < e or total_steps < pre_train_steps: a = np.random.randint(0, 4) else: a = sess.run(mainQN.predict, feed_dict={mainQN.scalarInput: [s]})[0] s1, r, d = env.step(a) s1 = processState(s1) total_steps += 1 episodeBuffer.add( np.reshape(np.array([s, a, r, s1, d]), [1, 5])) # Save the experience to our episode buffer. if total_steps > pre_train_steps: if e > endE: e -= stepDrop if total_steps % (update_freq) == 0: trainBatch = myBuffer.sample(batch_size) # Get a random batch of experiences. # Below we perform the Double-DQN update to the target Q-values Q1 = sess.run(mainQN.predict, feed_dict={mainQN.scalarInput: np.vstack(trainBatch[:, 3])}) Q2 = sess.run(targetQN.Qout, feed_dict={targetQN.scalarInput: np.vstack(trainBatch[:, 3])}) end_multiplier = -(trainBatch[:, 4] - 1) doubleQ = Q2[range(batch_size), Q1] targetQ = trainBatch[:, 2] + (y * doubleQ * end_multiplier) # Update the network with our target values. _ = sess.run(mainQN.updateModel, feed_dict={mainQN.scalarInput: np.vstack(trainBatch[:, 0]), mainQN.targetQ: targetQ, mainQN.actions: trainBatch[:, 1]}) updateTarget(targetOps, sess) # Update the target network toward the primary network. rAll += r s = s1 if d == True: break myBuffer.add(episodeBuffer.buffer) jList.append(j) rList.append(rAll) # Periodically save the model. 
if i % 1000 == 0: saver.save(sess, path + '/model-' + str(i) + '.ckpt') print("Saved Model") if len(rList) % 10 == 0: print(total_steps, np.mean(rList[-10:]), e) saver.save(sess, path + '/model-' + str(i) + '.ckpt') print("Percent of successful episodes: " + str(sum(rList) / num_episodes) + "%") rMat = np.resize(np.array(rList), [len(rList) // 100, 100]) rMean = np.average(rMat, 1) plt.plot(rMean) plt.show()
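The Double-DQN target computed in the training loop above (Q1 from the main network selects actions, Q2 from the target network evaluates them) can be sanity-checked with small numpy arrays; the numbers below are made up for illustration:

import numpy as np
# Hypothetical mini-batch of 2 transitions; y = 0.99 discount as above.
Q1 = np.array([1, 0])                       # argmax actions from mainQN on s'
Q2 = np.array([[0.2, 0.9], [0.4, 0.1]])     # Q-values from targetQN on s'
rewards = np.array([0.0, 1.0])
dones = np.array([0, 1])
end_multiplier = -(dones - 1)               # 1 for non-terminal, 0 for terminal
doubleQ = Q2[range(2), Q1]                  # evaluate mainQN's actions with targetQN
targetQ = rewards + 0.99 * doubleQ * end_multiplier
print(targetQ)                              # [0.891, 1.0]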
import numpy as np
import random
import tensorflow as tf
import matplotlib.pyplot as plt
import scipy.misc
import os
import csv
import itertools
import tensorflow.contrib.slim as slim
from helper import updateTargetGraph, updateTarget, processState, saveToCenter
from gridworld import gameEnv

# With partial=True the environment is partially observable; with partial=False it is fully observable.
env = gameEnv(partial=True, size=9)

class Qnetwork():
    def __init__(self, h_size, rnn_cell, myScope):
        self.scalarInput = tf.placeholder(shape=[None, 21168], dtype=tf.float32)
        self.imageIn = tf.reshape(self.scalarInput, shape=[-1, 84, 84, 3])
        self.conv1 = slim.convolution2d(inputs=self.imageIn, num_outputs=32, kernel_size=[8, 8],
                                        stride=[4, 4], padding='VALID', biases_initializer=None,
                                        scope=myScope + "_conv1")  # [-1, 20, 20, 32]
        self.conv2 = slim.convolution2d(inputs=self.conv1, num_outputs=64, kernel_size=[4, 4],
                                        stride=[2, 2], padding='VALID', biases_initializer=None,
                                        scope=myScope + "_conv2")  # [-1, 9, 9, 64]
        self.conv3 = slim.convolution2d(inputs=self.conv2, num_outputs=64, kernel_size=[3, 3],
                                        stride=[1, 1], padding='VALID', biases_initializer=None,
                                        scope=myScope + "_conv3")  # [-1, 7, 7, 64]
        self.conv4 = slim.convolution2d(inputs=self.conv3, num_outputs=h_size, kernel_size=[7, 7],
                                        stride=[1, 1], padding='VALID', biases_initializer=None,
                                        scope=myScope + "_conv4")  # [-1, 1, 1, h_size]
def create_env():
    return gameEnv(False, 5, True)
# # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== import numpy as np import random import tensorflow as tf import os from gridworld import gameEnv env = gameEnv(size=5) class Qnetwork(): def __init__(self, h_size): #The network recieves a frame from the game, flattened into an array. #It then resizes it and processes it through four convolutional layers. self.scalarInput = tf.placeholder(shape=[None, 21168], dtype=tf.float32) self.imageIn = tf.reshape(self.scalarInput, shape=[-1, 84, 84, 3]) self.conv1 = tf.contrib.layers.convolution2d( \ inputs=self.imageIn,num_outputs=32,kernel_size=[8,8],stride=[4,4],padding='VALID', biases_initializer=None) self.conv2 = tf.contrib.layers.convolution2d( \ inputs=self.conv1,num_outputs=64,kernel_size=[4,4],stride=[2,2],padding='VALID', biases_initializer=None) self.conv3 = tf.contrib.layers.convolution2d( \ inputs=self.conv2,num_outputs=64,kernel_size=[3,3],stride=[1,1],padding='VALID', biases_initializer=None)
import numpy as np
import random
import matplotlib.pyplot as plt
# https://github.com/awjuliani/DeepRL-Agents/blob/master/gridworld.py
from gridworld import gameEnv
from keras.models import Model
from keras.layers import Input, Dense, Lambda
from keras.layers.convolutional import Conv2D
from keras import backend as K
from keras.engine.topology import Layer
from keras.layers.merge import _Merge, Multiply

env_size = 5
env = gameEnv(partial=False, size=env_size)

class Experience():
    def __init__(self, buffer_size):
        self.replay_buffer = []
        self.buffer_size = buffer_size

    def storeExperience(self, exp):
        # Evict the oldest entries if adding exp would overflow the buffer.
        if len(exp) + len(self.replay_buffer) >= self.buffer_size:
            del self.replay_buffer[:(len(exp) + len(self.replay_buffer) - self.buffer_size)]
        self.replay_buffer.extend(exp)
from gridworld import gameEnv

slim = tf.contrib.layers

np.random.seed(2)
tf.set_random_seed(2)  # reproducible

# Hyperparameters
OUTPUT_GRAPH = True
MAX_EPISODE = 30000
DISPLAY_REWARD_THRESHOLD = 200  # renders environment if total episode reward is greater than this threshold
MAX_EP_STEPS = 50               # maximum time steps in one episode
RENDER = False                  # rendering wastes time
GAMMA = 0.9                     # reward discount in TD error
LR_A = 0.0001                   # learning rate for actor
LR_C = 0.001                    # learning rate for critic

env = gameEnv(8)
# env.seed(1)  # reproducible
# env = env.unwrapped

N_F = 21168
N_A = 4

def processState(states):
    return np.reshape(states, [21168])

class Actor(object):
    def __init__(self, sess, n_features, n_actions, lr=0.001, h_size=512):
        self.sess = sess
        self.s = tf.placeholder(tf.float32, [1, n_features], "state")
        self.a = tf.placeholder(tf.int32, None, "act")
# Example of Deep Recurrent Q-Network for partially observable environment import numpy as np import random import tensorflow as tf import matplotlib.pyplot as plt import scipy.misc import os import csv import itertools import tensorflow.contrib.slim as slim from helper import * from gridworld import gameEnv env = gameEnv(partial=True, size=9) # class for Q-network class Qnetwork(): def __init__(self, h_size, rnn_cell, myScope): self.scalarInput = tf.placeholder(shape=[None, 21168], dtype=tf.float32) self.imageIn = tf.reshape(self.scalarInput, shape=[-1, 84, 84, 3]) self.conv1 = slim.conv2d(inputs=self.imageIn, num_outputs=32, kernel_size=[8, 8], stride=[4, 4], padding='VALID', biases_initializer=None, scope=myScope + '_conv1') self.conv2 = slim.conv2d(inputs=self.conv1,
For more reinforcement learning tutorials, as well as the required gridworld.py file, see: https://github.com/awjuliani/DeepRL-Agents

In [ ]:
import gym
import numpy as np
import random
import tensorflow as tf
import matplotlib.pyplot as plt
import scipy.misc
import os
%matplotlib inline

Load the game environment

Feel free to adjust the size of the gridworld. Making it smaller provides an easier task for our DQN agent, while making the world larger increases the challenge.

In [ ]:
from gridworld import gameEnv
env = gameEnv(partial=False, size=5)

Above is an example of a starting environment in our simple game. The agent controls the blue square, and can move up, down, left, or right. The goal is to move to the green square (for +1 reward) and avoid the red square (for -1 reward). The position of the three blocks is randomized every episode.

Implementing the network itself

In [ ]:
class Qnetwork():
    def __init__(self, h_size):
        # The network receives a frame from the game, flattened into an array.
        # It then resizes it and processes it through four convolutional layers.
        self.scalarInput = tf.placeholder(shape=[None, 21168], dtype=tf.float32)
        self.imageIn = tf.reshape(self.scalarInput, shape=[-1, 84, 84, 3])
        self.conv1 = tf.contrib.layers.convolution2d(
            inputs=self.imageIn, num_outputs=32, kernel_size=[8, 8], stride=[4, 4],
            padding='VALID', biases_initializer=None)
        self.conv2 = tf.contrib.layers.convolution2d(
            inputs=self.conv1, num_outputs=64, kernel_size=[4, 4], stride=[2, 2],
            padding='VALID', biases_initializer=None)
        self.conv3 = tf.contrib.layers.convolution2d(
            inputs=self.conv2, num_outputs=64, kernel_size=[3, 3], stride=[1, 1],
            padding='VALID', biases_initializer=None)
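A minimal interaction sketch for the environment described above (env.step() returning a state, reward, done triple matches its use elsewhere in these snippets; the random policy is just for illustration and is not part of the original notebook):

# Play one short episode with random actions to see the environment interface.
state = env.reset()                               # 84x84x3 RGB rendering of the grid
total_reward = 0
for _ in range(50):
    action = np.random.randint(0, env.actions)    # one of the four movement actions
    state, reward, done = env.step(action)
    total_reward += reward
    if done:
        break
print('random-policy reward:', total_reward)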
def main(): env = gridworld.gameEnv(partial=False, size=5) input_shape = env.state.shape tf.reset_default_graph() e = .5 # Describe the NN used to select actions input_layer = tf.placeholder(shape=input_shape, dtype=tf.float32) actions = make_nn(input_layer) # Create learning parameters (loss) next_actions = tf.placeholder(shape=(1, env.actions), dtype=tf.float32) loss = tf.reduce_sum(tf.square(next_actions - actions)) train_op = tf.train.GradientDescentOptimizer( learning_rate=LEARN_RATE).minimize(loss) rewards = [] plt.ion() plt.show() init = tf.global_variables_initializer() saver = tf.train.Saver() with tf.Session() as sess: sess.run(init) try: saver.restore(sess, model_dir) except: print('New Model') for i in range(EPISODES): s = env.reset() # specific state net_reward = 0 done = False j = 0 while not done and j < 200: # Choose the action greedily but with chance e of random action # an action function (action) becomes a specific action (a) # a set of action functions (actions) becomes a set of Q values (Q) Q = sess.run(actions, feed_dict={input_layer: s}) a = np.argmax(Q, 1)[0] if np.random.rand(1) < e: a = np.random.randint(0, env.actions) # Perform action and get the next state and reward next_s, reward, done = env.step(a) maybe_show(i, s) # Obtain the next Q values by feeding the new state to the network # and use it to train. next_Q = sess.run(actions, feed_dict={input_layer: next_s}) Q[0, a] = reward + DISCOUNT * np.max(next_Q) sess.run(train_op, feed_dict={input_layer: s, next_actions: Q}) net_reward += reward s = next_s j += 1 e *= .99 rewards.append(net_reward) if i % 1000 == 0: plt.close() if i >= 1000: print('Round', i, '- Net Reward last 100:', sum(rewards[-1000:])) save_path = saver.save(sess, model_dir) print(save_path) plt.close() plt.ioff() plt.plot(rewards) plt.show()
Q(s,a) = V(s) + A(s,a)

Dueling DQN has resulted in improved performance, stability, and faster learning.

Deep Q-Network using both Double DQN and Dueling DQN. The agent learns to solve a
navigation task in a basic grid world.

Goal: move the blue block to the green block while avoiding the red one. The agent
controls the blue square and can move up, down, left, and right. Reaching the green
square gives reward +1, hitting the red square gives reward -1. The positions of the
three blocks are randomized every episode.
'''

# Load environment
# partial=False: the whole state of the environment is visible to the network.
env = gameEnv(partial=False, size=5)  # 5x5 grid

# Define a conv layer
def conv2d(x, W, b, strides=1, padding='SAME', activation=tf.nn.relu):
    x = tf.nn.conv2d(x, W, strides=[1, strides, strides, 1], padding=padding)
    x = tf.nn.bias_add(x, b)
    return activation(x)

# Size of the final conv layer before splitting into Advantage and Value streams (Dueling)
h_size = 512

# We want 4 conv layers in both the Q-network and the target network
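The Q = V + A decomposition above is usually implemented with a mean-advantage subtraction so the two streams are identifiable. A hedged TensorFlow sketch of that aggregation (the stream tensor names and shapes are assumptions; this is not necessarily how this particular script combines them):

# Combine the value and mean-centered advantage streams into Q-values.
# streamV: [batch, 1], streamA: [batch, n_actions] -- assumed shapes.
def dueling_q(streamV, streamA):
    return streamV + (streamA - tf.reduce_mean(streamA, axis=1, keep_dims=True))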
def train(args): """ Create a GridworldQnet and have it play games and train. """ scope = cur_scope() if not os.path.exists(args.model_dir): os.makedirs(args.model_dir) e = args.random_action env = gridworld.gameEnv(partial=False, size=5) tf.reset_default_graph() # Convert update_target_net_rate from number of games between copies to # the number of 'updates' between. update_target_net_rate = \ args.update_target_net_rate * args.game_len * args.replays qnet = get_net( scope, env.state.shape, env.actions, args.discount, args.learning_rate, args.experiences_size, args.batch_size, update_target_net_rate) rewards = [] init = tf.global_variables_initializer() saver = tf.train.Saver() with tf.Session() as sess: sess.run(init) restore_from_ckpt(sess, scope, args.restore_model, env.state.shape, env.actions) for ep in range(args.episodes): state = normalize_img(env.reset()) done = False turn = 0 net_reward = 0 n_updates = 0 while not done and turn < args.game_len: action = qnet.predict(sess, np.array([state]))[0] if np.random.rand(1) < e: action = qnet.rand_action() next_state, reward, done = env.step(action) next_state = normalize_img(next_state) qnet.add_experience(state, action, reward, next_state, done) for i in range(abs(int(reward))*10): # Add lots of examples when there is a reward, cuz sparse qnet.add_experience(state, action, reward, next_state, done) turn += 1 state = next_state net_reward += reward while qnet.experiences_full and n_updates < (turn * args.replays): # This is where the actual training happens. We look back # back and sample experiences we have had and learn by # replaying them. n_updates += args.batch_size qnet.update(sess) e *= args.random_decay rewards.append(net_reward) if (ep + 1) % args.ckpt_every == 0 and ep > 0: ckpt_path = make_ckpt_path(args.model_dir, scope, ep + 1) print(datetime.now().strftime("%Y-%m-%d %H:%M:%S "), os.path.basename(ckpt_path), ' ', resource.getrusage(resource.RUSAGE_SELF).ru_maxrss//2**10, 'kB random_action={:.5f}'.format(e), ' game_len=', args.game_len, ' Net reward last ', args.ckpt_every, ' games: ', sum(rewards[-args.ckpt_every:]), sep='') saver.save(sess, ckpt_path)
import random
import sys
import psutil
import tensorflow as tf
import gridworld as gWorld

if "../" not in sys.path:
    sys.path.append("../")
from lib import plotting
from collections import deque, namedtuple

# In[ ]:
# env = gym.envs.make("Breakout-v0")
env = gWorld.gameEnv(True, 5)

# In[ ]:
# Atari actions: 0 (noop), 1 (fire), 2 (left) and 3 (right) are valid actions
VALID_ACTIONS = [0, 1, 2, 3]

# In[ ]:
class StateProcessor():
    """
    Processes a raw Atari image. Resizes it and converts it to grayscale.
    """
    def __init__(self):
        # Build the TensorFlow graph
from __future__ import division import gym import numpy as np import random import tensorflow as tf import tensorflow.contrib.slim as slim import matplotlib.pyplot as plt import scipy.misc import os from gridworld import gameEnv env = gameEnv(partial=False, size=5) class Qnetwork(): def __init__(self, h_size): # The network receives a frame from the game, flattened into an array. # It then resizes it and processes it through four convolutional layers. self.scalarInput = tf.placeholder(shape=[None, 21168], dtype=tf.float32) self.imageIn = tf.reshape(self.scalarInput, shape=[-1, 84, 84, 3]) self.conv1 = slim.conv2d( \ inputs=self.imageIn, num_outputs=32, kernel_size=[8, 8], stride=[4, 4], padding='VALID', biases_initializer=None) self.conv2 = slim.conv2d( \ inputs=self.conv1, num_outputs=64, kernel_size=[4, 4], stride=[2, 2], padding='VALID', biases_initializer=None) self.conv3 = slim.conv2d( \ inputs=self.conv2, num_outputs=64, kernel_size=[3, 3], stride=[1, 1], padding='VALID',
avg_reward += total_reward avg_reward /= 20. return avg_reward args = parse_arguments() num_episodes = 100000 maxSteps = 300 # num_s_iterations = 1000000 # environment_name = args.env gamma = 1.0 C = args.C tau = 1e-2 env = gameEnv(partial=False, size=32, object_size=1) # env = gym.make(environment_name) # env = gym.wrappers.Monitor(env, 'space_invaders_dqn_color_curr', video_callable=lambda episode_id: episode_id%(num_episodes//100)==0, force=True) memory = Replay_Memory(burn_in=0) # nets = torch.load('space_invaders_dqn_target_new.pth', map_location=lambda storage, loc: storage) # target_net = nets['target_net'] # online_net = nets['online_net'] target_net = QNetworkConv(env) online_net = QNetworkConv(env) online_net.cuda() target_net.cuda() optimizer = optim.Adam(online_net.parameters(), lr=1e-4) s_iter = 0