def __init__(self):
    # Our main model: the POLICY NETWORK
    self.dqn = DeepQNetwork(model_name=MODEL_NAME,
                            input_dim=INPUT_DIM,
                            n_actions=N_ACTIONS,
                            layer1_units=LAYER1_UNITS,
                            layer2_units=LAYER2_UNITS,
                            lr=LEARNING_RATE)
    self.model = self.dqn.create_model()

    # TARGET NETWORK
    self.target_model = self.dqn.create_model()
    self.target_model.set_weights(self.model.get_weights())

    # An array with the last n steps for training
    self.replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)

    # Tensorboard for logging results
    self.tensorboard = ModifiedTensorBoard(
        log_dir=f'../logs/{MODEL_NAME}-{int(time.time())}')

    # Target update counter
    self.target_update_counter = 0
def replay_train(mainDQN: DeepQNetwork, targetDQN: DeepQNetwork, train_batch: list) -> float:
    """Trains `mainDQN` with target Q values given by `targetDQN`

    Args:
        mainDQN (DeepQNetwork): Main DQN that will be trained
        targetDQN (DeepQNetwork): Target DQN that will predict Q_target
        train_batch (list): Minibatch of replay memory.
            Each element is (s, a, r, s', done):
            [(state, action, reward, next_state, done), ...]

    Returns:
        float: After updating `mainDQN`, it returns a `loss`
    """
    # Unpack the whole minibatch consistently (the original mixed the full batch
    # with a [:FLAGS.batch_size] slice, which breaks if the two lengths differ).
    states = np.vstack([x[0] for x in train_batch])
    actions = np.array([x[1] for x in train_batch])
    rewards = np.array([x[2] for x in train_batch])
    next_states = np.vstack([x[3] for x in train_batch])
    done = np.array([x[4] for x in train_batch])

    predict_result = targetDQN.predict(next_states)
    Q_target = rewards + FLAGS.discount_rate * np.max(predict_result, axis=1) * (1 - done)

    X = states
    y = mainDQN.predict(states)
    y[np.arange(len(X)), actions] = Q_target

    # Train our network using target and predicted Q values on each episode
    return mainDQN.update(X, y)
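# Usage sketch (an assumption, not part of the original source): replay_train is
# normally called on a random minibatch drawn from a replay deque, and the target
# network is refreshed from the main network every few episodes. `replay_buffer`,
# `TARGET_UPDATE_FREQUENCY`, and `copy_ops` are illustrative names, not from the source.
import random

if len(replay_buffer) > FLAGS.batch_size:
    minibatch = random.sample(replay_buffer, FLAGS.batch_size)
    loss = replay_train(mainDQN, targetDQN, minibatch)

if episode % TARGET_UPDATE_FREQUENCY == 0:
    sess.run(copy_ops)  # copy main-network weights into the target network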
def __init__(self, gamma, epsilon, lr, n_actions, input_dims,
             mem_size, batch_size, eps_min=0.01, eps_dec=5e-7,
             replace=1000, algo=None, env_name=None, chkpt_dir='tmp/dqn'):
    self.gamma = gamma
    self.epsilon = epsilon
    self.lr = lr
    self.n_actions = n_actions
    self.input_dims = input_dims
    self.batch_size = batch_size
    self.eps_min = eps_min
    self.eps_dec = eps_dec
    self.replace_target_cnt = replace
    self.algo = algo
    self.env_name = env_name
    self.chkpt_dir = chkpt_dir
    self.action_space = [i for i in range(n_actions)]
    self.learn_step_counter = 0

    self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

    self.q_eval = DeepQNetwork(self.lr, self.n_actions,
                               input_dims=self.input_dims,
                               name=self.env_name + '_' + self.algo + '_q_eval',
                               chkpt_dir=self.chkpt_dir)

    self.q_next = DeepQNetwork(self.lr, self.n_actions,
                               input_dims=self.input_dims,
                               name=self.env_name + '_' + self.algo + '_q_next',
                               chkpt_dir=self.chkpt_dir)
def __init__(self):
    # from the original base.agent
    self.reward = 0
    self.episodes = 0
    self.steps = 0
    self.obs_spec = None
    self.action_spec = None

    self.dqn = DeepQNetwork(
        len(smart_actions),
        10,  # one of the most important values; it needs to be updated manually
        learning_rate=0.001,
        reward_decay=0.9,
        e_greedy=0.9,
        replace_target_iter=200,
        memory_size=5000,
        batch_size=32,
        e_greedy_increment=None,
        output_graph=True)

    # self-defined vars
    self.fighting = False
    self.player_hp = []
    self.enemy_hp = []
    self.previous_enemy_hp = []
    self.previous_player_hp = []
    self.leftover_enemy_hp = []
    self.win = 0
    self.count = 0
    self.previous_action = None
    self.previous_state = None
def __init__(self, gamma, epsilon, lr, input_dims, batch_size, n_actions,
             max_mem_size=100000, eps_end=0.01, eps_dec=5e-5):
    self.gamma = gamma
    self.epsilon = epsilon
    self.eps_min = eps_end
    self.eps_dec = eps_dec
    self.lr = lr
    self.action_space = [i for i in range(n_actions)]
    self.mem_size = max_mem_size
    self.batch_size = batch_size
    self.mem_cntr = 0

    self.Q_eval = DeepQNetwork(self.lr, n_actions=n_actions,
                               input_dims=input_dims,
                               fc1_dims=256, fc2_dims=256)

    self.state_memory = np.zeros((self.mem_size, *input_dims), dtype=np.float32)
    self.new_state_memory = np.zeros((self.mem_size, *input_dims), dtype=np.float32)
    self.action_memory = np.zeros(self.mem_size, dtype=np.int32)
    self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
    # np.bool is removed in recent NumPy releases; use the builtin bool dtype instead
    self.terminal_memory = np.zeros(self.mem_size, dtype=bool)
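# Illustrative sketch (not from the original source): a store_transition method is the
# usual counterpart to the memory arrays above, writing each transition at
# index = mem_cntr % mem_size so the buffer wraps around once it is full.
def store_transition(self, state, action, reward, state_, done):
    index = self.mem_cntr % self.mem_size   # circular-buffer index
    self.state_memory[index] = state
    self.new_state_memory[index] = state_
    self.action_memory[index] = action
    self.reward_memory[index] = reward
    self.terminal_memory[index] = done
    self.mem_cntr += 1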
def run(arguments) -> None:
    # Create the env
    env = GameEnv()
    agent1 = DeepQNetwork.restore(arguments["TRAINED_MODEL_1"])
    agent2 = DeepQNetwork.restore(arguments["TRAINED_MODEL_2"])

    # Test the agents that were trained
    for e_test in range(TEST_Episodes):
        state = env.reset()
        state = np.reshape(state, [1, agent1.nS])
        tot_reward1 = 0
        tot_reward2 = 0

        if WRITE_VIDEO and e_test == 0:
            fig = plt.figure()
            frames = []

        for t_test in range(1000):
            if SHOW_GAME:
                show_game(env.render_env(), t_test, tot_reward1, tot_reward2)
            if WRITE_VIDEO and e_test == 0:
                temp = env.render_env()
                frames.append([
                    plt.text(0, -1, "Time: " + str(t_test), fontsize=8),
                    plt.text(7, -1,
                             "Total reward - player 1: " + str(tot_reward1) +
                             ", player 2: " + str(tot_reward2), fontsize=8),
                    plt.imshow(temp, animated=True)
                ])

            agent1_action = agent1.test_action(state)
            agent2_action = agent2.test_action(state)
            reward1, reward2 = env.move(agent1_action, agent2_action)
            nstate = tf.reshape(env.contribute_metrix(), [-1])
            nstate = np.reshape(nstate, [1, agent1.nS])
            tot_reward1 += reward1
            tot_reward2 += reward2

            # DON'T STORE ANYTHING DURING TESTING
            state = nstate

            if t_test == 999:
                print("episode: {}/{}, scores: {}, {}".format(
                    e_test, TEST_Episodes, tot_reward1, tot_reward2))
                break

        if WRITE_VIDEO and e_test == 0:
            Writer = matplotlib.animation.writers['ffmpeg']
            writer = Writer(fps=15, metadata=dict(artist='Me'), bitrate=1800)
            ani = matplotlib.animation.ArtistAnimation(fig, frames, interval=20, blit=True)
            ani.save('movies/' + arguments["TRAINED_MODEL_1"].split('/')[-1] +
                     '_test_2players.mp4', writer=writer)
            print('Video saved.')
def __init__(self, inputs, n_actions):
    self.brain = DeepQNetwork(inputs, 16, 16, outputNum=n_actions)
    self.target_brain = DeepQNetwork(inputs, 16, 16, outputNum=n_actions)
    self.target_brain.load_state_dict(self.brain.state_dict())
    self.target_brain.eval()
    self.set_params()
    self.optimizer = torch.optim.Adam(self.brain.parameters())
    self.memory = ReplayMemory(50000)
    self.action_space = [0, 1]
class DQNAgent(Agent):
    def __init__(self, *args, **kwargs):
        super(DQNAgent, self).__init__(*args, **kwargs)

        self.q_eval = DeepQNetwork(self.lr, self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo + '_q_eval',
                                   chkpt_dir=self.chkpt_dir)

        self.q_next = DeepQNetwork(self.lr, self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo + '_q_next',
                                   chkpt_dir=self.chkpt_dir)

    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            state = T.tensor([observation], dtype=T.float).to(self.q_eval.device)
            actions = self.q_eval.forward(state)
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)
        return action

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        self.q_eval.optimizer.zero_grad()
        self.replace_target_network()

        states, actions, rewards, states_, dones = self.sample_memory()
        indices = np.arange(self.batch_size)

        q_pred = self.q_eval.forward(states)[indices, actions]
        q_next = self.q_next.forward(states_).max(dim=1)[0]

        q_next[dones] = 0.0
        q_target = rewards + self.gamma * q_next

        loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)
        loss.backward()
        self.q_eval.optimizer.step()
        self.learn_step_counter += 1

        self.decrement_epsilon()
def __init__(self, *args, **kwargs):
    super(DQNAgent, self).__init__(*args, **kwargs)

    self.q_eval = DeepQNetwork(self.lr, self.n_actions,
                               input_dims=self.input_dims,
                               name=self.env_name + '_' + self.algo + '_q_eval',
                               chkpt_dir=self.chkpt_dir)

    self.q_next = DeepQNetwork(self.lr, self.n_actions,
                               input_dims=self.input_dims,
                               name=self.env_name + '_' + self.algo + '_q_next',
                               chkpt_dir=self.chkpt_dir)
def __init__(self, *args, **kwargs):
    super(DQNAgent, self).__init__(*args, **kwargs)

    # define Q-evaluation network and target Q-network for the agent
    self.q_eval = DeepQNetwork(self.lr, self.n_actions,
                               input_dims=self.input_dims,
                               name=self.env_name + '_' + self.algo + '_q_eval',
                               chkpt_dir=self.chkpt_dir)

    # we will never perform gradient descent or backpropagation with the Q-next network
    self.q_next = DeepQNetwork(self.lr, self.n_actions,
                               input_dims=self.input_dims,
                               name=self.env_name + '_' + self.algo + '_q_next',
                               chkpt_dir=self.chkpt_dir)
def __init__(self):
    # Our main model: the POLICY NETWORK
    self.dqn = DeepQNetwork(model_name=MODEL_NAME,
                            input_dim=INPUT_DIM,
                            n_actions=N_ACTIONS,
                            layer1_units=LAYER1_UNITS,
                            layer2_units=LAYER2_UNITS,
                            lr=LEARNING_RATE)

    if LOAD_MODEL is not None:
        print(f"Loading {LOAD_MODEL}")
        self.model = load_model(LOAD_MODEL)
        print(f"Loaded Model: {LOAD_MODEL}")
    else:
        self.model = self.dqn.create_model()

    # TARGET NETWORK
    self.target_model = self.dqn.create_model()
    self.target_model.set_weights(self.model.get_weights())

    # An array with the last n steps for training
    self.replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)

    # Target update counter
    self.target_update_counter = 0
def __init__(self):
    self.dqn = DeepQNetwork(n_actions=524, n_features=13)
    self.previous_action = None
    self.previous_state = None
    self.episodes = 0
    self.steps = 0
    self.reward = 0
    self.reward_weights = np.array([
        .2,       # blizz_score
        .2, .2,   # total_unit_value, total_structure_value
        .2, .3,   # killed_unit_value, killed_building_value
        .2, .2,   # mineral_rate, mineral_spent
        .2, .1,   # supply_used, supply_limit
        .3, .3,   # army_supply, worker_supply
        .3        # army_count
    ])
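# Illustrative sketch (an assumption, not part of the original source): with one weight
# per score component as above, a shaped reward is commonly computed as the weighted
# sum of the change in each component between consecutive observations.
import numpy as np

def shaped_reward(reward_weights, current_scores, previous_scores):
    # both score arrays must be aligned with reward_weights (one entry per component)
    delta = (np.asarray(current_scores, dtype=np.float32) -
             np.asarray(previous_scores, dtype=np.float32))
    return float(np.dot(reward_weights, delta))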
def main():
    env = gym.make('Catcher-v0')
    model = make_model()
    model.load_weights('data/weights520000.dat')
    net = DeepQNetwork(env, model, 10000)
    # net.train()
    net.play()
    net.play()
    net.play()
def main(argv=None):
    if argv is None:
        argv = sys.argv
    try:
        try:
            opts, args = getopt.getopt(argv[1:], "hn:", ["help", "network="])
        except getopt.error as msg:
            raise Usage(msg)
    except Usage as err:
        print(err.msg, file=sys.stderr)
        print("for help use --help", file=sys.stderr)
        return 2

    if len(opts) == 0:
        print("Please specify parameters!")
        return 1

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(__doc__)
            return 0
        elif opt in ("-n", "--network"):
            if arg == 'dqn':
                from dqn import DeepQNetwork
                score_graph_path = './saved_model/'
                network = DeepQNetwork(
                    e_greedy=0.1,
                    output_graph=True,
                    save_path=score_graph_path,
                )
            elif arg == 'doubledqn':
                from double_dqn import DoubleDQN
                score_graph_path = './saved_model_doubledqn/'
                network = DoubleDQN(
                    e_greedy=0.1,
                    output_graph=True,
                    save_path=score_graph_path,
                )
            else:
                print("You can choose 'dqn' or 'doubledqn' as the network parameter")
                return 1

    train(network, score_graph_path)
    return 0
def bot_play(mainDQN: DeepQNetwork, env: gym.Env) -> None:
    """Runs a test episode with rendering and logs the total score

    Args:
        mainDQN (DeepQNetwork): DQN agent to run a test
        env (gym.Env): Gym Environment
    """
    state = env.reset()
    reward_sum = 0
    while True:
        env.render()
        action = np.argmax(mainDQN.predict(state))
        state, reward, done, _ = env.step(action)
        reward_sum += reward
        if done:
            logger.info("Total score: {}".format(reward_sum))
            break
def CartPoleDQN():
    env = gym.make('CartPole-v0')
    env = env.unwrapped
    env.reset()
    rl0 = DeepQNetwork(env.action_space.n,
                       env.state.shape[0],
                       learning_rate=0.01,
                       reward_decay=0.9,
                       e_greedy=0.9,
                       replace_target_iter=200,
                       memory_size=100000,
                       batch_size=32,
                       e_greedy_increment=None,
                       output_graph=None)
    game = Game(env, rl0)
    game.run()
def run(render):
    net = DeepQNetwork(sess, N_A, N_S,
                       learning_rate=0.01,
                       reward_decay=0.9,
                       e_greedy=0.9,
                       replace_target_iter=200,
                       memory_size=2000,
                       scope='dqn_{0}'.format(0),
                       # output_graph=True
                       )
    sess.run(tf.global_variables_initializer())

    step = 0
    for episode in range(300):
        # initial observation
        s = env.reset()

        while True:
            # RL chooses an action based on the observation
            a, q = net.choose_action(s)

            # RL takes the action and gets the next observation and reward
            s_, r, d, _ = env.step(a)
            if render:
                env.render()
            # print('rewards: {0}'.format(r))

            net.store_transition(s, a, r, s_)

            if (step > 200) and (step % 5 == 0):
                net.learn()

            # swap observation
            s = s_

            # break the while loop at the end of this episode
            if d:
                break
            step += 1
import gym
from dqn import DeepQNetwork

env = gym.make('CartPole-v0')
env = env.unwrapped

print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

RL = DeepQNetwork(
    n_actions=env.action_space.n,
    n_features=env.observation_space.shape[0],
    lr=0.01,
    e_greedy=0.9,
    replace_target_iter=100,
    memory_size=2000,
    e_greedy_increment=0.001
)

total_steps = 0

for i_episode in range(100):
    observation = env.reset()
    ep_r = 0
    while True:
        env.render()
        action = RL.choose_action(observation)
class DDQNAgent(object):
    def __init__(self, gamma, epsilon, lr, n_actions, input_dims,
                 mem_size, batch_size, eps_min=0.01, eps_dec=5e-7,
                 replace=1000, algo=None, env_name=None, chkpt_dir='tmp/dqn'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.algo = algo
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        self.q_eval = DeepQNetwork(self.lr, self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo + '_q_eval',
                                   chkpt_dir=self.chkpt_dir)

        self.q_next = DeepQNetwork(self.lr, self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo + '_q_next',
                                   chkpt_dir=self.chkpt_dir)

    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def sample_memory(self):
        state, action, reward, new_state, done = \
            self.memory.sample_buffer(self.batch_size)

        states = T.tensor(state).to(self.q_eval.device)
        rewards = T.tensor(reward).to(self.q_eval.device)
        dones = T.tensor(done).to(self.q_eval.device)
        actions = T.tensor(action).to(self.q_eval.device)
        states_ = T.tensor(new_state).to(self.q_eval.device)

        return states, actions, rewards, states_, dones

    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            state = T.tensor([observation], dtype=T.float).to(self.q_eval.device)
            actions = self.q_eval.forward(state)
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)
        return action

    def replace_target_network(self):
        if self.replace_target_cnt is not None and \
           self.learn_step_counter % self.replace_target_cnt == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        self.epsilon = self.epsilon - self.eps_dec \
            if self.epsilon > self.eps_min else self.eps_min

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        self.q_eval.optimizer.zero_grad()
        self.replace_target_network()

        states, actions, rewards, states_, dones = self.sample_memory()
        indices = np.arange(self.batch_size)

        q_pred = self.q_eval.forward(states)[indices, actions]
        q_next = self.q_next.forward(states_)
        q_eval = self.q_eval.forward(states_)

        max_actions = T.argmax(q_eval, dim=1)
        q_next[dones] = 0.0

        q_target = rewards + self.gamma * q_next[indices, max_actions]

        loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)
        loss.backward()
        self.q_eval.optimizer.step()
        self.learn_step_counter += 1

        self.decrement_epsilon()

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()
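# Illustrative sketch (not part of the original source): a typical loop that drives an
# agent with the interface above (choose_action / store_transition / learn). The gym
# environment, episode count, and hyperparameters are placeholders; the DeepQNetwork
# used by this agent may expect image-shaped observations, so treat this purely as a
# usage outline.
import gym

env = gym.make('CartPole-v1')
agent = DDQNAgent(gamma=0.99, epsilon=1.0, lr=1e-4,
                  n_actions=env.action_space.n,
                  input_dims=env.observation_space.shape,
                  mem_size=50000, batch_size=32,
                  env_name='CartPole-v1', algo='DDQNAgent')

for episode in range(500):
    observation = env.reset()
    done = False
    while not done:
        action = agent.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        agent.store_transition(observation, action, reward, observation_, done)
        agent.learn()          # the target network is replaced periodically inside learn()
        observation = observation_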
def playGame(train_indicator=1):  # 1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001      # Target Network HyperParameters
    LRA = 0.0001     # Learning rate for Actor
    LRC = 0.001      # Learning rate for Critic

    action_dim = 3       # Steering/Acceleration/Brake
    state_dim = 16384    # of sensors input

    np.random.seed(61502)

    vision = True

    EXPLORE = 100000.
    episode_count = 2000
    max_steps = 40000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0
    esar2 = []
    esar4 = []

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    # from keras import backend as K
    # K.set_session(sess)

    # 1. CREATE DQN NETWORK.
    num_actions_steering = 13      # before it was 13
    num_actions_acceleration = 9   # before it was 3
    num_actions_break = 9          # before it was 3
    num_dqn_actions = num_actions_steering * num_actions_acceleration * num_actions_break
    base_dir = "/home/sergio/Projects/apclypsr/DDPG-Keras-Torcs/"
    args = {
        "save_model_freq": 1000,
        "target_model_update_freq": 1000,
        "normalize_weights": True,
        # "learning_rate": 0.00025,
        'learning_rate': 0.00025,
        "model": None
    }
    dqn = DeepQNetwork(sess, num_dqn_actions, base_dir, args)

    # Tensorflow saver
    saver = tf.train.Saver()

    # actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    # critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    # Now load the weights
    # print("Now we load the weight")
    # try:
    #     # actor.model.load_weights("actormodel2.h5")
    #     # critic.model.load_weights("criticmodel2.h5")
    #     # actor.target_model.load_weights("actormodel2.h5")
    #     # critic.target_model.load_weights("criticmodel2.h5")
    #     # print("Weight load successfully")
    #     saver.restore(sess, base_dir + "dqn.ckpt")
    #     print("model restored")
    # except:
    #     print("Cannot find the weight")

    print("TORCS Experiment Start.")
    for i in range(episode_count):
        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 500) == 0:
            # relaunch TORCS every 500 episodes because of the memory leak error
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        # 0. BUILD THE 4 images.
        s_t = np.hstack((ob.img))
        s_t_four_images_list = []
        for j in range(4):
            s_t_four_images_list.append(np.zeros((128, 128), dtype=np.float64))
        s_t_phi = get_phi_from_four_images(s_t_four_images_list)

        total_reward = 0.
        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            # 2. EVALUATE the first image
            a_t_original_dqn_discrete = dqn.inference(s_t_phi)
            # a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))

            # 2.5 TRANSFORM from discrete to continuous.
            a_t_original_dqn = from_discrete_actions_to_continuous_actions(
                a_t_original_dqn_discrete, num_actions_steering,
                num_actions_acceleration, num_actions_break)
            print("actions: ", a_t_original_dqn)

            # a_t_original[0][0] steering: [-1, 1]
            # noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original_dqn[0], 0.0, 0.60, 0.30)
            # a_t_original[0][1] acceleration: [0, 1]. discretize in 6.
            # noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original_dqn[1], 0.5, 1.00, 0.10)
            # a_t_original[0][2] brake: [0, 1]
            # noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], -0.1, 1.00, 0.05)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original_dqn[2], -0.1, 1.00, 0.05)

            # The following code does the stochastic brake
            if random.random() <= 0.05:
                print("********Now we apply the brake***********")
                # noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], 0.2, 1.00, 0.10)
                noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                    a_t_original_dqn[2], 0.2, 1.00, 0.10)

            # a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            # a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            # a_t[0][2] = a_t_original[0][2] + noise_t[0][2]
            a_t[0][0] = a_t_original_dqn[0] + noise_t[0][0]
            a_t[0][1] = a_t_original_dqn[1] + noise_t[0][1]
            a_t[0][2] = a_t_original_dqn[2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])

            # 0. UPDATE THE LAST FOUR IMAGES
            s_t1 = np.hstack((ob.img))
            if len(s_t_four_images_list) >= 4:
                s_t_four_images_list.pop(0)
            image = np.reshape(ob.img, (128, 128))
            s_t_four_images_list.append(image)
            # print greyscale image
            # plt.imshow(image, origin='lower')
            # plt.draw()
            # plt.pause(0.001)

            # get phi for the new observed state
            s_t1_phi = get_phi_from_four_images(s_t_four_images_list)

            # Add to replay buffer
            # buff.add(s_t, a_t[0], r_t, s_t1, done)
            buff.add(s_t_phi,
                     from_continuous_actions_to_discrete_actions(
                         a_t[0], num_actions_steering,
                         num_actions_acceleration, num_actions_break),
                     r_t, s_t1_phi, done)

            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            # states = np.asarray([e[0] for e in batch])
            # actions = np.asarray([e[1] for e in batch])
            # rewards = np.asarray([e[2] for e in batch])
            # new_states = np.asarray([e[3] for e in batch])
            # dones = np.asarray([e[4] for e in batch])
            # y_t = np.asarray([e[1] for e in batch])
            states = [e[0] for e in batch]
            states = np.concatenate(states, axis=0)
            actions = [e[1] for e in batch]
            rewards = [e[2] for e in batch]
            new_states = [e[3] for e in batch]
            new_states = np.concatenate(new_states, axis=0)
            dones = [e[4] for e in batch]
            y_t = [e[1] for e in batch]

            # 3. TRAINING
            # target_q_values = critic.target_model.predict([new_states, actor.target_model.predict(new_states)])
            # for k in range(len(batch)):
            #     if dones[k]:
            #         y_t[k] = rewards[k]
            #     else:
            #         y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if (train_indicator):
                # 4. TRAIN
                loss = dqn.train(s_t=states,
                                 s_t1=new_states,
                                 rewards=rewards,
                                 actions=actions,
                                 terminals=dones,
                                 stepNumber=step)
                # loss += critic.model.train_on_batch([states, actions], y_t)
                # a_for_grad = actor.model.predict(states)
                # grads = critic.gradients(states, a_for_grad)
                # actor.train(states, grads)
                # actor.target_train()
                # critic.target_train()

            total_reward += r_t
            s_t = s_t1

            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss)
            esar = (i, step, a_t, r_t, loss)
            esar2.append(esar)

            step += 1
            if done:
                break

        if np.mod(i, 3) == 0:
            if (train_indicator):
                # print("Now we save model")
                # actor.model.save_weights("actormodelIMG.h5", overwrite=True)
                # with open("actormodel.json", "w") as outfile:
                #     json.dump(actor.model.to_json(), outfile)
                #
                # critic.model.save_weights("criticmodelIMG.h5", overwrite=True)
                # with open("criticmodel.json", "w") as outfile:
                #     json.dump(critic.model.to_json(), outfile)
                save_path = saver.save(sess, base_dir + "dqn.ckpt")
                print("Model saved in file: %s" % save_path)

        print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")
        esar3 = (i, step, total_reward)
        esar4.append(esar3)

    def save_object(obj, filename):
        with open(filename, 'w+b') as output:
            pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)

    save_object(esar2, 'IntraEpisode.pkl')
    save_object(esar4, 'InterEpisode.pkl')

    env.end()  # This is for shutting down TORCS
    print("Finish.")
    print("Saving esars.")
class My_Env(Env):
    def __init__(self, p):
        super().__init__(p)
        self.step_value = ACTIONSTEP
        self.maxlen = MAXLEN


if __name__ == "__main__":
    env = My_Env(P)
    RL = DeepQNetwork(
        env.action_space,
        env.observation_space,
        learning_rate=0.01,
        reward_decay=0.9,
        e_greedy=0.95,
        replace_target_iter=500,
        memory_size=1000,
        # output_graph=True,
        e_greedy_increment=0.01)

    step = 0
    for i in range(EPISODE):
        accu_reward = 0
        env.reset()
        env.step(np.random.randint(0, env.action_space))
        observation = env.observation()
        while 1:
            action = RL.choose_action(observation)
            # the first 200 steps are random exploration; learning starts after that
            if (step > 200) and (step % 5 == 0):
                RL.learn()

            # move to the next state
            state = state_

            # end this episode
            if done:
                break
            step += 1

    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":
    # maze game
    env = Maze()
    RL = DeepQNetwork(env.n_actions)

    env.after(100, update)
    env.mainloop()

    RL.plot_cost()
n_action = 11
n_width = 8
n_height = 3
n_channel = 1
n_episode = 1000
e_greedy_increment = 0.001
learning_rate = 0.005
memory_size = 3000
dueling = True
prioritized = True
double_q = True

dqn = DeepQNetwork(n_action, n_width, n_height, n_channel,
                   e_greedy_increment=e_greedy_increment,
                   memory_size=memory_size,
                   learning_rate=learning_rate,
                   dueling=dueling,
                   prioritized=prioritized,
                   double_q=double_q)
# dqn.load(372)

counter = 0
state = deque([], maxlen=n_width)
state_ = deque([], maxlen=n_width)

for i in range(n_episode):
    observation = env.reset()
    state_.append(observation)
    # observation = np.identity(16)[observation:observation + 1]
    # observation = np.expand_dims(observation, axis=2)
    # observation = np.expand_dims(observation, axis=3)
def playGame(train_indicator=1):  # 1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001      # Target Network HyperParameters
    LRA = 0.0001     # Learning rate for Actor
    LRC = 0.001      # Learning rate for Critic

    action_dim = 3    # Steering/Acceleration/Brake
    state_dim = 512   # of sensors input

    np.random.seed(61502)

    vision = True

    EXPLORE = 100000.
    episode_count = 600000
    max_steps = 1800
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0
    esar2 = []
    esar4 = []

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    # We insert the Deep Q image-processing module
    args = {
        'save_model_freq': 10000,
        'target_model_update_freq': 10000,
        'normalize_weights': True,
        'learning_rate': .00025,
        'model': None
    }
    # print(args["save_model_freq"])
    C = DeepQNetwork(state_dim, sess, '/home/lou/DDPG-Keras-Torcs', args=args)
    # print(C)
    x, h_fc1 = C.buildNetwork('test', trainable=True, numActions=1)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    # Now load the weights
    print("Now we load the weight")
    try:
        actor.model.load_weights("actormodelIMG.h5")
        critic.model.load_weights("criticmodelIMG.h5")
        actor.target_model.load_weights("actormodelIMG.h5")
        critic.target_model.load_weights("criticmodelIMG.h5")
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    print("TORCS Experiment Start.")
    for i in range(episode_count):
        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 500) == 0:
            # relaunch TORCS every 500 episodes because of the memory leak error
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        imgfinal = np.zeros((1, 480, 640, 12), dtype=np.int32)
        s_t = C.getFC7(imgfinal)
        # print('ST FIRST', s_t)
        # print('STSHAPE FIRST', np.shape(s_t))

        total_reward = 0.
        imglst = []
        speed = 0
        stepreset = 1

        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            # a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            a_t_original = actor.model.predict(C.getFC7(imgfinal))
            # print('ATORIGINAL', a_t_original)

            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            # The following code does the stochastic brake
            if random.random() <= 0.05:
                print("********Now we apply the brake***********")
                noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                    a_t_original[0][2], 0.2, 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])

            # print('GTD1SUM:', np.sum(generate_training_data(config=MyConfig)))
            # print('GTD SHAPE', np.shape(generate_training_data(config=MyConfig)))
            imglst.append(generate_training_data(config=MyConfig))
            if len(imglst) == 4:
                imgcopy = imglst[:]
                imgfinal = np.stack(imgcopy)
                # print("Original stacked matrix", imgfinal)
                imgfinal = np.reshape(imgfinal, (4, 480, 640, 3))
                # print("Reshaped stacked matrix", imgfinal)
                # Switch 3 and 0 if you want to switch RGB or Batch
                imgfinal = np.transpose(imgfinal, (1, 2, 3, 0))
                # print("Transposed stacked matrix", imgfinal)
                imgfinal = np.reshape(imgfinal, (1, 480, 640, 12))
                # print("Shape of imgfinal", imgfinal.shape)

            s_t1 = C.getFC7(imgfinal)
            # print('STL', s_t1)
            # print('STLSHAPE', np.shape(s_t1))
            # print('IMGFINAL', imgfinal)

            buff.add(s_t, a_t[0], r_t, s_t1, done)  # Add to replay buffer

            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            # print('STATESSHAPE1', states)
            # print('SUMARRAY', np.sum(states))
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])
            # print('NEW STATES', new_states)

            # target_q_values = critic.target_model.predict([C.getFC7(imgfinal), actor.target_model.predict(C.getFC7(imgfinal))])
            # print('ACTOR TARGET MODEL PREDICT', C.getFC7(imgfinal))
            new_states = np.reshape(new_states, (-1, state_dim))
            target_q_values = critic.target_model.predict(
                [new_states, actor.target_model.predict(new_states)])
            # print('TARGET Q VALUES', target_q_values)
            # print('NEW STATES', new_states)
            # print('ACTOR MODEL PREDICT NEW STATES', actor.target_model.predict(new_states))
            # print('REWARDS', rewards)

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if (train_indicator):
                states = np.reshape(states, (-1, state_dim))
                # print('STATESSHAPE2', np.shape(states))
                # print('ACTIONSSHAPE', np.shape(actions))
                # print('YT', np.shape(y_t))
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1
            speed += ob.speedX * 300
            speedavg = speed / stepreset
            # print("SPEED X", ob.speedX)

            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t,
                  "Loss", loss, "Average Speed", speedavg)
            esar = (i, step, a_t, r_t, loss, speedavg)
            esar2.append(esar)

            step += 1
            stepreset += 1
            if len(imglst) >= 4:
                del imglst[0]
            # print("Length of imglist", len(imglst))
            # print("List itself", imgfinal)
            if done:
                break

        if np.mod(i, 50) == 0:
            if (train_indicator):
                print("Now we save model")
                actor.model.save_weights("actormodelIMG.h5", overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("criticmodelIMG.h5", overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")
        esar3 = (i, step, total_reward, speedavg)
        esar4.append(esar3)

        if np.mod(i, 50) == 0:
            save_object(esar2, 'IntraEpisode.pkl')
            save_object(esar4, 'InterEpisode.pkl')

    env.end()  # This is for shutting down TORCS
    print("Finish.")
    print("Saving esars.")
n_actions = 24
# one state consists of 18 dimensions
#   0-7:   motor signal
#   8:     action
#   9-14:  imu data
#   15-17: position
# n_features = [s_t, s_t-1, s_t-2, s_t-3, s_t-4]
n_features = 90
EPISODE = 1000
TIMESTEP = 1000

if __name__ == "__main__":
    env = SpyndraEnv()
    print("Start model initialization...")
    RL = DeepQNetwork(
        n_actions,
        n_features,
        learning_rate=0.1,
        reward_decay=0.9,
        e_greedy=0.9,
        replace_target_iter=200,
        memory_size=4000,
        # output_graph=True
    )
    print("Model initialization complete")
    # env._render()
    run(EPISODE, TIMESTEP)
    # RL.plot_cost()
class DQNAgent(Agent):
    ''' Agent based on the Deep Q-Network (DQN) '''

    def __init__(self, *args, **kwargs):
        super(DQNAgent, self).__init__(*args, **kwargs)

        # define Q-evaluation network and target Q-network for the agent
        self.q_eval = DeepQNetwork(self.lr, self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo + '_q_eval',
                                   chkpt_dir=self.chkpt_dir)

        # we will never perform gradient descent or backpropagation with the Q-next network
        self.q_next = DeepQNetwork(self.lr, self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo + '_q_next',
                                   chkpt_dir=self.chkpt_dir)

    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            state = T.tensor([observation], dtype=T.float).to(self.q_eval.device)
            actions = self.q_eval.forward(state)
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)
        return action

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        # zero out previous gradient calculations
        self.q_eval.optimizer.zero_grad()
        self.replace_target_network()

        states, actions, rewards, states_, dones = self.sample_memory()
        indices = np.arange(self.batch_size)

        # calculate Q-predicted and Q-target values (gives action values for batch states)
        '''
        dims --> batch_size x n_actions
        This is what the target network has to say about the values of the new states
        that result from the agent's actions. We want the values of the maximal actions
        for that particular set of states, which we find by taking the max along the
        action dimension.
        '''
        q_pred = self.q_eval.forward(states)[indices, actions]
        q_next = self.q_next.forward(states_).max(dim=1)[0]  # [0] max value, [1] index of max value

        # done flag used as a type of mask
        q_next[dones] = 0.0
        q_target = rewards + self.gamma * q_next

        loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)
        loss.backward()                 # backpropagate the loss
        self.q_eval.optimizer.step()    # step the optimizer to update the weights
        self.learn_step_counter += 1    # track steps so the target network is replaced at the right frequency
class DDQNAgent(Agent):
    ''' Agent based on the Double Deep Q-Network (Double DQN) '''

    def __init__(self, *args, **kwargs):
        super(DDQNAgent, self).__init__(*args, **kwargs)

        self.q_eval = DeepQNetwork(self.lr, self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo + '_q_eval',
                                   chkpt_dir=self.chkpt_dir)

        self.q_next = DeepQNetwork(self.lr, self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo + '_q_next',
                                   chkpt_dir=self.chkpt_dir)

    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            state = T.tensor([observation], dtype=T.float).to(self.q_eval.device)
            actions = self.q_eval.forward(state)
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)
        return action

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        # zero out previous gradient calculations
        self.q_eval.optimizer.zero_grad()
        self.replace_target_network()

        states, actions, rewards, states_, dones = self.sample_memory()
        indices = np.arange(self.batch_size)

        q_pred = self.q_eval.forward(states)[indices, actions]
        q_next = self.q_next.forward(states_)
        q_eval = self.q_eval.forward(states_)

        # online network selects the action, target network evaluates it
        max_actions = T.argmax(q_eval, dim=1)

        # done flag used as a type of mask
        q_next[dones] = 0.0
        q_target = rewards + self.gamma * q_next[indices, max_actions]

        loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)
        loss.backward()                 # backpropagate the loss
        self.q_eval.optimizer.step()    # step the optimizer to update the weights
        self.learn_step_counter += 1    # track steps so the target network is replaced at the right frequency

        self.decrement_epsilon()
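# Side-by-side note (added for clarity, not from the original source): the only
# substantive difference between the two agents above is how the bootstrap target
# is formed.
#
#   DQN target:        y = r + gamma * max_a' Q_target(s', a')
#   Double-DQN target: y = r + gamma * Q_target(s', argmax_a' Q_eval(s', a'))
#
# Using q_eval to *select* the next action and q_next (the target network) to
# *evaluate* it reduces the overestimation bias of plain DQN.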
class DQNAgent:
    def __init__(self, inputs, n_actions):
        self.brain = DeepQNetwork(inputs, 16, 16, outputNum=n_actions)
        self.target_brain = DeepQNetwork(inputs, 16, 16, outputNum=n_actions)
        self.target_brain.load_state_dict(self.brain.state_dict())
        self.target_brain.eval()
        self.set_params()
        self.optimizer = torch.optim.Adam(self.brain.parameters())
        self.memory = ReplayMemory(50000)
        self.action_space = [0, 1]

    def set_params(self):
        self.batch_size = 64
        self.max_exploration_rate = 1
        self.min_exploration_rate = 0.05
        self.exploration_decay_rate = 0.0005
        self.steps_done = 0

    def select_action(self, state):
        sample = np.random.random()
        exploration_rate = self.min_exploration_rate + (
            self.max_exploration_rate - self.min_exploration_rate) * np.exp(
                -self.steps_done * self.exploration_decay_rate)
        self.steps_done += 1
        if sample > exploration_rate:
            with torch.no_grad():
                actions = self.brain(state)
            return torch.argmax(actions).item()
        else:
            return np.random.choice(self.action_space)

    def learn(self):
        if len(self.memory) < self.batch_size:
            return
        self.optimizer.zero_grad()

        # the minibatch is drawn directly from the replay memory
        # (the original also computed an unused random index array here)
        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))

        non_final_mask = torch.tensor(
            tuple(map(lambda s: s is not None, batch.next_state)),
            dtype=torch.bool,
        )
        non_final_next_states = torch.tensor(
            [s for s in batch.next_state if s is not None])
        state_batch = torch.tensor(batch.state)
        action_batch = torch.tensor(batch.action)
        reward_batch = torch.tensor(batch.reward, dtype=torch.float)

        state_action_values = self.brain(state_batch).gather(
            1, action_batch.unsqueeze(-1))

        next_state_values = torch.zeros(self.batch_size)
        next_state_values[non_final_mask] = self.target_brain(
            non_final_next_states).max(1)[0]

        gamma = 0.99
        expected_state_action_values = (gamma * next_state_values +
                                        reward_batch / reward_batch.max())

        self.loss = torch.nn.MSELoss()(
            expected_state_action_values.unsqueeze(-1), state_action_values)
        self.optimizer.zero_grad()
        self.loss.backward()
        self.optimizer.step()
class DQNAgent:
    def __init__(self):
        # Our main model: the POLICY NETWORK
        self.dqn = DeepQNetwork(model_name=MODEL_NAME,
                                input_dim=INPUT_DIM,
                                n_actions=N_ACTIONS,
                                layer1_units=LAYER1_UNITS,
                                layer2_units=LAYER2_UNITS,
                                lr=LEARNING_RATE)
        self.model = self.dqn.create_model()

        # TARGET NETWORK
        self.target_model = self.dqn.create_model()
        self.target_model.set_weights(self.model.get_weights())

        # An array with the last n steps for training
        self.replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)

        # Tensorboard for logging results
        self.tensorboard = ModifiedTensorBoard(
            log_dir=f'../logs/{MODEL_NAME}-{int(time.time())}')

        # Target update counter
        self.target_update_counter = 0

    def update_replay_memory(self, transition):
        ''' Update the replay memory with the step's experience '''
        self.replay_memory.append(transition)

    def get_q_values(self, state):
        ''' Get the Q values learned thus far '''
        return self.model.predict(np.array(state).reshape(-1, *state.shape))[0]

    def train(self, terminal_state, step):
        ''' This is where we actually train the Agent '''
        # Start training only if a certain number of samples is already saved in REPLAY MEMORY;
        # otherwise keep taking steps, which are added to the REPLAY MEMORY.
        if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
            return

        # Get a minibatch of random samples from replay memory
        minibatch = random.sample(self.replay_memory, MINIBATCH_SIZE)

        # Get current states from minibatch, then query the NN model for Q values
        current_states = np.array([transition[0] for transition in minibatch])
        current_qs_list = self.model.predict(current_states)

        # Get future states from minibatch, then query the Target NN model for Q values
        # (this computes the max term in the Bellman equation)
        new_current_states = np.array(
            [transition[3] for transition in minibatch])
        future_qs_list = self.target_model.predict(new_current_states)

        X = []
        y = []

        # Now we need to enumerate our batches
        for index, (current_state, action, reward, new_current_state,
                    done) in enumerate(minibatch):
            if not done:
                max_future_q = np.max(future_qs_list[index])
                new_q = reward + DISCOUNT * max_future_q
            else:
                new_q = reward

            # Update Q value for the given state
            current_qs = current_qs_list[index]
            current_qs[action] = new_q

            # And append to our training data
            X.append(current_state)
            y.append(current_qs)

        # Fit on all samples as one batch, log only on terminal state
        self.model.fit(
            np.array(X),
            np.array(y),
            batch_size=MINIBATCH_SIZE,
            verbose=0,
            shuffle=False,
            callbacks=[self.tensorboard] if terminal_state else None)

        # Update target network counter every episode
        if terminal_state:
            self.target_update_counter += 1

        # If the counter reaches the set value, update the target network with the main network's weights
        if self.target_update_counter > UPDATE_TARGET_EVERY:
            self.target_model.set_weights(self.model.get_weights())
            self.target_update_counter = 0
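# Illustrative sketch (not from the original source): an outer episode loop is the usual
# way to drive this Keras-style agent. The environment, EPISODES, and the epsilon
# schedule below are placeholders / assumptions, not names from the source.
agent = DQNAgent()
epsilon = 1.0

for episode in range(EPISODES):
    current_state = env.reset()
    done = False
    step = 1
    while not done:
        # epsilon-greedy action selection using the policy network
        if np.random.random() > epsilon:
            action = np.argmax(agent.get_q_values(current_state))
        else:
            action = np.random.randint(0, N_ACTIONS)

        new_state, reward, done = env.step(action)

        # store the transition, then train on a sampled minibatch
        agent.update_replay_memory((current_state, action, reward, new_state, done))
        agent.train(done, step)

        current_state = new_state
        step += 1

    epsilon = max(0.01, epsilon * 0.995)  # simple exponential decay (assumed)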
HIDDENS = [128]
network = Header(inpt_shape=env.observation_space.shape,
                 hiddens=HIDDENS,
                 opt_size=env.action_space.n,
                 network=MLP,
                 dueling=args.dueling)
target_network = Header(inpt_shape=env.observation_space.shape,
                        hiddens=HIDDENS,
                        opt_size=env.action_space.n,
                        network=MLP,
                        dueling=args.dueling) if args.double else None
dqn = DeepQNetwork(network=network,
                   memory_size=MEMORY_SIZE,
                   use_double_dqn=args.double,
                   target_network=target_network,
                   dueling=args.dueling,
                   prioritized=args.prioritized)

# ==== train & test ====
# choose one of the two pipelines
if env_id == 0:
    train_pipeline_conservative(env, dqn, score_threshold,
                                n_epoch=500,
                                n_rollout=100,
                                n_train=1000,
                                batch_size=256)
if env_id == 1 or env_id == 2:
    train_pipeline_progressive(env,