def _init_replay_memory(args):
    if MODEL not in args:
        info("Using empty replay memory.")
        return ReplayMemory(DEFAULT_PARAMS[REPLAY_MEMORY_SIZE])
    replay_memory = load(args[MODEL], REPLAY_MEMORY_EXT)
    if replay_memory is None:
        info("Using empty replay memory.")
        return ReplayMemory(DEFAULT_PARAMS[REPLAY_MEMORY_SIZE])
    info("Successfully loaded saved replay memory of model %s." % args[MODEL])
    return replay_memory
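# The snippets in this section all rely on some ReplayMemory implementation whose
# definition is not shown here, and their interfaces differ (push/sample, add/miniBatch,
# add_experience, pop, ...). As a reference point only, a minimal sketch of a
# uniform-sampling replay buffer is given below; the Transition fields, the
# push/sample/__len__ interface, and the class name are assumptions, not the
# interface of any particular snippet above or below.
import random
from collections import deque, namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))


class MinimalReplayMemory:
    """Fixed-capacity buffer that drops the oldest transitions when full."""

    def __init__(self, capacity):
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        # Store one transition, e.g. push(state, action, next_state, reward)
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        # Uniformly sample a minibatch of stored transitions
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)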
def __init__(self, args, n_actions):
    self.model = DQN(args.img_width, args.img_height, args.channels, n_actions).to(device)
    self.n_action = n_actions
    self.epsilon_start = args.epsilon
    self.epsilon = args.epsilon
    self.decay_start = args.decay_start
    self.decay_end = args.n_epochs * 0.8
    self.memory = ReplayMemory(args)
    self.batch_size = args.batch_size
    self.actions = [[1, 0, 0], [0, 1, 0], [0, 0, 1]]
def __init__(self, config, session):
    # build the net
    self.config = config
    self.sess = session
    self.RM = ReplayMemory(config)
    self.step_count = 0
    self.episode = 0
    self.isTesting = False
    self.game_state = np.zeros((1, 84, 84, self.config.buff_size), dtype=np.uint8)
    self.reset_game()
    self.timeout_option = tf.RunOptions(timeout_in_ms=5000)
    # if the new agent needs other action modes define a different dict
    self.action_modes = {str(config.testing_epsilon) + "_greedy": self.e_greedy_action}
    self.default_action_mode = next(iter(self.action_modes))  # dict views are not indexable in Python 3
    self.action_mode = self.default_action_mode
class Brain:
    def __init__(self, h, w, num_actions, device):
        self.device = device
        self.num_actions = num_actions
        self.memory = ReplayMemory(Config.CAPACITY)
        self.main_q_network = Net(h, w, num_actions).to(self.device)
        self.target_q_network = Net(h, w, num_actions).to(self.device)
        self.loss = 0.
        self.optimizer = optim.Adam(self.main_q_network.parameters(), lr=0.0001)
        self.epsilon = 1.0
        print(self.main_q_network)

    def replay(self):
        if len(self.memory) < Config.LEARNING_START:
            return
        self.batch, self.state_batch, self.action_batch, self.reward_batch, \
            self.non_final_next_states = self.make_minibatch()
        self.state_batch = self.state_batch.to(self.device)
        self.action_batch = self.action_batch.to(self.device)
        self.reward_batch = self.reward_batch.to(self.device)
        self.non_final_next_states = self.non_final_next_states.to(self.device)
        self.expected_state_action_values = self.get_expected_state_action_values()
        self.update_main_q_network()

    def decide_action(self, state):
        state = state.to(self.device)
        if self.epsilon <= np.random.uniform(0, 1):
            self.main_q_network.eval()
            with torch.no_grad():
                action = self.main_q_network(state).max(1)[1].view(1, 1)
        else:
            action = torch.LongTensor([[random.randrange(self.num_actions)]])
        return action

    def make_minibatch(self):
        transitions = self.memory.sample(Config.BATCH_SIZE)
        batch = Transition(*zip(*transitions))
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])
        return batch, state_batch, action_batch, reward_batch, non_final_next_states

    def get_expected_state_action_values(self):
        self.main_q_network.eval()
        self.target_q_network.eval()
        self.state_action_values = self.main_q_network(self.state_batch).gather(1, self.action_batch)

        # Double DQN: choose greedy actions with the main network, evaluate them with the target network
        non_final_mask = torch.BoolTensor(
            tuple(map(lambda s: s is not None, self.batch.next_state))).to(self.device)
        a_m = torch.zeros(Config.BATCH_SIZE).type(torch.LongTensor).to(self.device)
        a_m[non_final_mask] = self.main_q_network(self.non_final_next_states).detach().max(1)[1]
        a_m_non_final_next_states = a_m[non_final_mask].view(-1, 1)

        next_state_values = torch.zeros(Config.BATCH_SIZE).to(self.device)
        next_state_values[non_final_mask] = self.target_q_network(
            self.non_final_next_states).gather(1, a_m_non_final_next_states).detach().squeeze()

        expected_state_action_values = self.reward_batch + Config.GAMMA * next_state_values
        return expected_state_action_values

    def update_main_q_network(self):
        self.main_q_network.train()
        self.loss = F.smooth_l1_loss(self.state_action_values,
                                     self.expected_state_action_values.unsqueeze(1))
        self.optimizer.zero_grad()
        self.loss.backward()
        self.optimizer.step()

    def update_target_q_function(self):
        self.target_q_network.load_state_dict(self.main_q_network.state_dict())

    def update_epsilon(self):
        if self.epsilon > Config.EPSILON_MIN:
            self.epsilon -= 1 / (Config.NUM_EPISODES - Config.START_TRAIN_EP)

    def model_save(self):
        torch.save(self.main_q_network, 'puckworkd_model.pth')
def main():
    parser = argparse.ArgumentParser(description='Train using Gazebo Simulations')
    parser.add_argument('--seed', default=10, type=int, help='Random seed')
    parser.add_argument('--input_shape', default=(80, 100), help='Input shape')
    parser.add_argument('--gamma', default=0.99, help='Discount factor')
    parser.add_argument('--epsilon', default=0.1, help='Exploration probability in epsilon-greedy')
    parser.add_argument('--learning_rate', default=0.00001, help='learning rate')
    parser.add_argument('--window_size', default=4, type=int, help='Number of frames to feed to the Q-network')
    parser.add_argument('--num_time', default=4, type=int, help='Number of steps in RNN')
    parser.add_argument('--num_actions', default=7, type=int, help='Number of actions')
    parser.add_argument('--batch_size', default=64, type=int, help='Batch size of the training part')
    parser.add_argument('--num_iteration', default=500000, type=int, help='number of iterations to train')
    parser.add_argument('--eval_every', default=0.01, type=float,
                        help='What fraction of num_iteration to run between evaluations')
    args = parser.parse_args()

    random.seed(args.seed)
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    batch_environment = GazeboWorld()
    print('Environment initialized')

    replay_memory = ReplayMemory(REPLAYMEMORY_SIZE, args.window_size, args.input_shape)
    online_model, online_params = create_model(args.window_size, args.input_shape, args.num_actions,
                                               'online_model', create_duel_q_network, trainable=True)
    target_model, target_params = create_model(args.window_size, args.input_shape, args.num_actions,
                                               'target_model', create_duel_q_network, trainable=False)
    update_target_params_ops = [t.assign(s) for s, t in zip(online_params, target_params)]

    agent = DQNAgent(online_model, target_model, replay_memory, args.num_actions, args.gamma,
                     TARGET_UPDATE_FREQENCY, update_target_params_ops, args.batch_size, args.learning_rate)

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.8)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    with sess.as_default():
        # saving and loading networks
        trainables = tf.trainable_variables()
        trainable_saver = tf.train.Saver(trainables, max_to_keep=1)
        sess.run(tf.global_variables_initializer())
        checkpoint = tf.train.get_checkpoint_state("saved_networks")
        print('checkpoint:', checkpoint)
        if checkpoint and checkpoint.model_checkpoint_path:
            trainable_saver.restore(sess, checkpoint.model_checkpoint_path)
            print("Successfully loaded:", checkpoint.model_checkpoint_path)
        else:
            print("Could not find old network weights")

        # make target_model equal to online_model
        sess.run(update_target_params_ops)

        print('Prepare fixed samples for mean max Q.')
        fixed_samples = get_fixed_samples(batch_environment, args.num_actions, NUM_FIXED_SAMPLES)

        # initialize replay buffer
        print('Burn in replay_memory.')
        agent.fit(sess, batch_environment, NUM_BURN_IN, do_train=False)

        # start training:
        fit_iteration = int(args.num_iteration * args.eval_every)
        for i in range(0, args.num_iteration, fit_iteration):
            # evaluate:
            reward_mean, reward_var, reward_max, reward_min, reward = agent.evaluate(sess, batch_environment)
            mean_max_Q1, mean_max_Q2 = agent.get_mean_max_Q(sess, fixed_samples)
            print("%d, %f, %f, %f, %f, %f, %f" % (i, mean_max_Q1, mean_max_Q2,
                                                  reward_mean, reward_var, reward_max, reward_min))
            # train:
            agent.fit(sess, batch_environment, fit_iteration, do_train=True)
            trainable_saver.save(sess, 'saved_networks/', global_step=i)

        # final evaluation after training
        reward_mean, reward_var, reward_max, reward_min, reward = agent.evaluate(sess, batch_environment)
        mean_max_Q1, mean_max_Q2 = agent.get_mean_max_Q(sess, fixed_samples)
        print("%d, %f, %f, %f, %f, %f, %f" % (i, mean_max_Q1, mean_max_Q2,
                                              reward_mean, reward_var, reward_max, reward_min))
class Dqn():
    """
    Whole process of the Deep Q-Learning algorithm
    """

    def __init__(self, input_size, nb_action, gamma):
        """
        Initialize Deep Q-Learning
        @param input_size: input size of the neural network
        @param nb_action: number of possible actions
        @param gamma: gamma parameter of the Deep Q-learning equation
        """
        self.gamma = gamma
        self.reward_window = []
        self.model = Network(input_size, nb_action)
        self.memory = ReplayMemory(100000)
        # optimization algorithm
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)
        # unsqueeze(0) => torch tensor of size 1 x input_size
        self.last_state = torch.Tensor(input_size).unsqueeze(0)
        self.last_action = 0
        self.last_reward = 0

    def select_action(self, state):
        """
        Select the next action of the car.
        Use a softmax over the Q-values to favor the best action while still exploring:
        1. Generate Q-values for all possible actions
        2. Turn them into a probability distribution
        3. Draw the action from that distribution
        @param state: input state of the neural network
        @return: action to play
        """
        # T=100 temperature parameter: a higher temperature makes the softmax
        # distribution more confident about the best action
        with torch.no_grad():  # replaces the deprecated Variable(..., volatile=True)
            probs = F.softmax(self.model(state) * 100, dim=1)
        # Random draw from the probability distribution of each state
        action = probs.multinomial(num_samples=1)
        return action.data[0, 0]

    def learn(self, batch_state, batch_next_state, batch_reward, batch_action):
        """
        Train the deep neural network:
        1. Get the output by forward propagation
        2. Get the target
        3. Compare output and target to compute the loss
        4. Back-propagate the loss through the network
        5. Update the weights with the optimizer
        @param batch_state: current states
        @param batch_next_state: next states
        @param batch_reward: rewards
        @param batch_action: actions
        """
        # Q-values of the actions that were actually taken:
        # gather(1, batch_action.unsqueeze(1)) selects one action per row,
        # squeeze(1) removes the fake dimension to get a simple vector
        outputs = self.model(batch_state).gather(1, batch_action.unsqueeze(1)).squeeze(1)
        # Detach the next-state Q-values from the graph
        next_outputs = self.model(batch_next_state).detach().max(1)[0]
        # Bellman equation
        target = self.gamma * next_outputs + batch_reward
        # Temporal-difference loss
        td_loss = F.smooth_l1_loss(outputs, target)
        # re-initialize the optimizer gradients
        self.optimizer.zero_grad()
        # backward propagation (retain_graph replaces the old retain_variables flag)
        td_loss.backward(retain_graph=True)
        # Update the weights of the neural network
        self.optimizer.step()

    def update(self, reward, new_signal):
        """
        Update everything once the AI reaches a new state
        @param reward: last reward
        @param new_signal: last signal
        @return: action to play
        """
        new_state = torch.Tensor(new_signal).float().unsqueeze(0)
        self.memory.push((self.last_state, new_state,
                          torch.LongTensor([int(self.last_action)]),
                          torch.Tensor([self.last_reward])))
        action = self.select_action(new_state)
        if len(self.memory.memory) > 100:
            batch_state, batch_next_state, batch_action, batch_reward = self.memory.sample(100)
            self.learn(batch_state, batch_next_state, batch_reward, batch_action)
        self.last_action = action
        self.last_state = new_state
        self.last_reward = reward
        self.reward_window.append(reward)
        if len(self.reward_window) > 1000:
            del self.reward_window[0]
        return action

    def score(self):
        """
        Get the current score (sliding average of recent rewards)
        @return: score
        """
        return sum(self.reward_window) / (len(self.reward_window) + 1.)
    def save(self):
        """
        Save the current neural network and optimizer state to a file
        """
        torch.save({
            'state_dict': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict(),
        }, 'last_brain.pth')

    def load(self):
        """
        Load an existing neural network checkpoint if one exists
        """
        if os.path.isfile('last_brain.pth'):
            print("=> loading checkpoint... ")
            checkpoint = torch.load('last_brain.pth')
            self.model.load_state_dict(checkpoint['state_dict'])
            self.optimizer.load_state_dict(checkpoint['optimizer'])
            print("done !")
        else:
            print("no checkpoint found...")
class AgentAtari:
    def __init__(self, p):
        self.p = p
        self.target_cnn = Convolutional()
        self.eval_cnn = Convolutional()
        self.memory = ReplayMemory(self.p['MEMORY_SIZE'], [4, 84, 84])
        self.optimizer = torch.optim.Adam(self.eval_cnn.parameters(), self.p['LEARNING_RATE'])
        try:
            self.eval_cnn.load_state_dict(torch.load("Model/eval_cnn4ac.data"))
            self.target_cnn.load_state_dict(torch.load("Model/eval_cnn4ac.data"))
            print("Data has been loaded successfully")
        except FileNotFoundError:
            print("No existing data")

    def act(self, state):
        # Epsilon-greedy action selection
        r = random.random()
        if r > self.p['EPSILON']:
            x = torch.FloatTensor(state).to(device)
            q_value = self.eval_cnn(x)
            return torch.argmax(q_value).item()
        else:
            action = random.randint(0, self.p['N_ACTIONS'] - 1)
            return action

    def learn(self, losss):
        if self.memory.index < self.p['BATCH_SIZE']:
            return

        # Soft update: blend the evaluation weights into the target network
        eval_dict = self.eval_cnn.state_dict()
        target_dict = self.target_cnn.state_dict()
        for w in eval_dict:
            target_dict[w] = (1 - self.p['ALPHA']) * target_dict[w] + self.p['ALPHA'] * eval_dict[w]
        self.target_cnn.load_state_dict(target_dict)

        # Get a sample of size BATCH_SIZE
        batch_state, batch_action, batch_next_state, batch_reward, batch_done = self.memory.pop(
            self.p['BATCH_SIZE'])

        # Decay the exploration threshold used by act() every time the agent learns
        if self.p["EPSILON"] > self.p["EPSILON_MIN"]:
            self.p["EPSILON"] *= self.p["EPSILON_DECAY"]

        loss = nn.MSELoss()
        # Q-values of the actions actually taken
        q_eval = self.eval_cnn(batch_state).gather(
            1, batch_action.long().unsqueeze(1)).reshape([self.p["BATCH_SIZE"]])
        # Q-values of the next states, estimated by the target network
        q_next = self.target_cnn(batch_next_state).detach()
        # Bootstrapped targets
        q_target = batch_reward + q_next.max(1)[0].reshape([self.p["BATCH_SIZE"]]) * self.p["GAMMA"]

        self.optimizer.zero_grad()
        l = loss(q_eval, q_target)
        losss.append(l.item())  # store the scalar, not the graph-holding tensor
        l.backward()
        self.optimizer.step()

    def atari(self):
        env = gym.make('BreakoutNoFrameskip-v4')
        env = env.unwrapped
        env = AtariPreprocessing(env, frame_skip=4, grayscale_obs=True, scale_obs=True)
        env = FrameStack(env, 4)
        rewards = []
        losss = []
        print(env.get_action_meanings())
        for i in range(self.p['N_EPISODE']):
            state = env.reset()
            rewards.append(0)
            env.step(1)  # FIRE to start the game
            actual_life = 5
            for s in range(self.p['N_STEPS']):
                env.render()
                action = self.act(state)
                n_state, reward, done, _ = env.step(action)
                if env.env.ale.lives() != actual_life:
                    reward = -1
                    actual_life -= 1
                    env.step(1)
                rewards[-1] += reward
                self.memory.push(state, action, n_state, reward, done)
                self.learn(losss)
                state = n_state
            print("Episode : ", i, ", Rewards : ", rewards[-1])
            torch.save(self.eval_cnn.state_dict(), "Model/eval_cnn4ac.data")

        # Display results
        plt.ylabel("Rewards")
        plt.xlabel("Episode")
        plt.plot(rewards)
        plt.grid()
        plt.show()
        plt.ylabel("loss")
        plt.xlabel("Episode")
        plt.plot(losss)
        plt.grid()
        plt.show()
        env.close()
def main():
    parser = argparse.ArgumentParser(description='Run DQN on Atari Space Invaders')
    parser.add_argument('--env', default='SpaceInvaders-v0', help='Atari env name')
    parser.add_argument('--seed', default=10703, type=int, help='Random seed')
    parser.add_argument('--input_shape', default=(84, 84), help='Input shape')
    parser.add_argument('--gamma', default=0.99, help='Discount factor')
    parser.add_argument('--epsilon', default=0.1, help='Exploration probability in epsilon-greedy')
    parser.add_argument('--learning_rate', default=0.00025, help='Training learning rate.')
    parser.add_argument('--window_size', default=4, type=int, help='Number of frames to feed to the Q-network')
    parser.add_argument('--batch_size', default=32, type=int, help='Batch size of the training part')
    parser.add_argument('--num_process', default=3, type=int, help='Number of parallel environments')
    parser.add_argument('--num_iteration', default=20000000, type=int, help='Number of iterations to train')
    parser.add_argument('--eval_every', default=0.001, type=float,
                        help='What fraction of num_iteration to run between evaluations.')
    parser.add_argument('--is_duel', default=1, type=int,
                        help='Whether to use duel DQN, 0 means no, 1 means yes.')
    parser.add_argument('--is_double', default=1, type=int,
                        help='Whether to use double DQN, 0 means no, 1 means yes.')
    parser.add_argument('--is_per', default=1, type=int,
                        help='Whether to use PriorityExperienceReplay, 0 means no, 1 means yes.')
    parser.add_argument('--is_distributional', default=1, type=int,
                        help='Whether to use distributional DQN, 0 means no, 1 means yes.')
    parser.add_argument('--num_step', default=1, type=int,
                        help='Num Step for multi-step DQN, 3 is recommended')
    parser.add_argument('--is_noisy', default=1, type=int,
                        help='Whether to use NoisyNet, 0 means no, 1 means yes.')
    args = parser.parse_args()
    args.input_shape = tuple(args.input_shape)

    print('Environment: %s.' % (args.env, ))
    env = gym.make(args.env)
    num_actions = env.action_space.n
    print('number_actions: %d.' % (num_actions, ))
    env.close()

    random.seed(args.seed)
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    batch_environment = BatchEnvironment(args.env, args.num_process, args.window_size,
                                         args.input_shape, NUM_FRAME_PER_ACTION, MAX_EPISODE_LENGTH)

    if args.is_per == 1:
        replay_memory = PriorityExperienceReplay(REPLAYMEMORY_SIZE, args.window_size, args.input_shape)
    else:
        replay_memory = ReplayMemory(REPLAYMEMORY_SIZE, args.window_size, args.input_shape)

    create_network_fn = create_deep_q_network if args.is_duel == 0 else create_duel_q_network
    create_model_fn = create_model if args.is_distributional == 0 else create_distributional_model
    noisy = True if args.is_noisy == 1 else False

    online_model, online_params = create_model_fn(args.window_size, args.input_shape, num_actions,
                                                  'online_model', create_network_fn,
                                                  trainable=True, noisy=noisy)
    target_model, target_params = create_model_fn(args.window_size, args.input_shape, num_actions,
                                                  'target_model', create_network_fn,
                                                  trainable=False, noisy=noisy)
    update_target_params_ops = [t.assign(s) for s, t in zip(online_params, target_params)]

    agent = DQNAgent(online_model, target_model, replay_memory, num_actions, args.gamma,
                     UPDATE_FREQUENCY, TARGET_UPDATE_FREQENCY, update_target_params_ops,
                     args.batch_size, args.is_double, args.is_per, args.is_distributional,
                     args.num_step, args.is_noisy, args.learning_rate,
                     RMSP_DECAY, RMSP_MOMENTUM, RMSP_EPSILON)

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.4)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    with sess.as_default():
        sess.run(tf.global_variables_initializer())
        # make target_model equal to online_model
        sess.run(update_target_params_ops)

        print('Prepare fixed samples for mean max Q.')
        fixed_samples = get_fixed_samples(batch_environment, num_actions, NUM_FIXED_SAMPLES)

        print('Burn in replay_memory.')
        agent.fit(sess, batch_environment, NUM_BURN_IN, do_train=False)

        # Begin to train:
        fit_iteration = int(args.num_iteration * args.eval_every)
        for i in range(0, args.num_iteration, fit_iteration):
            # Evaluate:
            reward_mean, reward_var = agent.evaluate(sess, batch_environment, NUM_EVALUATE_EPSIODE)
            mean_max_Q = agent.get_mean_max_Q(sess, fixed_samples)
            print("%d, %f, %f, %f" % (i, mean_max_Q, reward_mean, reward_var))
            # Train:
            agent.fit(sess, batch_environment, fit_iteration, do_train=True)

    batch_environment.close()
def train(self, env, steps_per_epoch=128, epochs=10000):
    # Every four actions a gradient descent step is performed
    UPDATE_FREQ = 4
    # Number of chosen actions between updates of the target network
    NETW_UPDATE_FREQ = 10000
    # Minimum replay memory size before training starts
    REPLAY_MEMORY_START_SIZE = 33

    # Create network model
    self.main_network.compile(optimizer=tf.keras.optimizers.Adam(), loss='mse')

    # Replay memory
    my_replay_memory = ReplayMemory()

    # Metrics
    loss_avg = tf.keras.metrics.Mean()
    train_reward_tot = tf.keras.metrics.Sum()
    train_rew_comf_tot = tf.keras.metrics.Sum()
    train_rew_eff_tot = tf.keras.metrics.Sum()
    train_rew_safe_tot = tf.keras.metrics.Sum()
    train_coll_rate = tf.keras.metrics.Mean()
    train_speed_rate = tf.keras.metrics.Mean()

    # Training loop: collect samples, send them to the optimizer, repeat
    next_obs = env.reset(gui=True, numVehicles=40)
    first_epoch = 0
    try:
        for epoch in range(first_epoch, epochs):
            ep_rewards = 0
            for step in range(steps_per_epoch):
                # current state
                state = next_obs.copy()
                # get action
                action = self.act(state, self.main_network)
                # do step
                next_obs, rewards_info, done, collision = env.step(action)
                # process observation and get rewards
                avg_speed_perc = env.speed / env.target_speed
                rewards_tot, R_comf, R_eff, R_safe = rewards_info
                # Add experience
                my_replay_memory.add_experience(action=action, frame=next_obs,
                                                reward=rewards_tot, terminal=done)
                # Update metrics
                train_reward_tot.update_state(rewards_tot)
                train_rew_comf_tot.update_state(R_comf)
                train_rew_eff_tot.update_state(R_eff)
                train_rew_safe_tot.update_state(R_safe)
                train_coll_rate.update_state(collision)
                train_speed_rate.update_state(avg_speed_perc)

                # Train every UPDATE_FREQ steps
                if self.steps_done > REPLAY_MEMORY_START_SIZE:
                    loss_value = self.train_step_(my_replay_memory)
                    loss_avg.update_state(loss_value)
                    self.update_network()
                else:
                    loss_avg.update_state(-1)

                # Copy weights from main to target network every NETW_UPDATE_FREQ steps
                if step % NETW_UPDATE_FREQ == 0 and step > REPLAY_MEMORY_START_SIZE:
                    self.target_network.set_weights(self.main_network.get_weights())

                self.steps_done += 1

            # Write summaries
            with self.train_summary_writer.as_default():
                tf.summary.scalar('loss', loss_avg.result(), step=epoch)
                tf.summary.scalar('reward_tot', train_reward_tot.result(), step=epoch)
                tf.summary.scalar('rewards_comf', train_rew_comf_tot.result(), step=epoch)
                tf.summary.scalar('rewards_eff', train_rew_eff_tot.result(), step=epoch)
                tf.summary.scalar('rewards_safe', train_rew_safe_tot.result(), step=epoch)
                tf.summary.scalar('collission_rate', train_coll_rate.result(), step=epoch)
                tf.summary.scalar('avg speed wrt maximum', train_speed_rate.result(), step=epoch)

            # Reset metrics
            train_reward_tot.reset_states()
            train_rew_comf_tot.reset_states()
            train_rew_eff_tot.reset_states()
            train_rew_safe_tot.reset_states()
            train_coll_rate.reset_states()
            train_speed_rate.reset_states()

            # Save model
            if epoch % 1000 == 0:
                tf.keras.models.save_model(self.main_network,
                                           self.model_dir + "/" + str(epoch) + "_main_network.hp5",
                                           save_format="h5")
                tf.keras.models.save_model(self.target_network,
                                           self.model_dir + "/" + str(epoch) + "_target_network.hp5",
                                           save_format="h5")
    except KeyboardInterrupt:
        # self.model.save_weights(self.model_dir + "/model.ckpt")
        tf.keras.models.save_model(self.main_network,
                                   self.model_dir + "/" + str(epoch) + "_main_network.hp5",
                                   save_format="h5")
        tf.keras.models.save_model(self.target_network,
                                   self.model_dir + "/" + str(epoch) + "_target_network.hp5",
                                   save_format="h5")
    env.close()
    return 0
def train(sess, env, args, actors, critics, noise):
    summary_ops, summary_vars = build_summaries()
    # summary_ops, episode_reward1 = build_summaries()
    init = tf.global_variables_initializer()  # tf.initialize_all_variables() is deprecated
    sess.run(init)
    writer = tf.summary.FileWriter(args['summary_dir'], sess.graph)

    for a in actors:
        a.update_target()
    for b in critics:
        b.update_target()

    replayMemory = ReplayMemory(int(args['buffer_size']), int(args['random_seed']))

    for ep in range(int(args['max_episodes']) + 1):
        print('starting running')
        print('this is epoch {}'.format(ep))
        s = env.reset()
        episode_reward = np.zeros((env.n, ))

        if ep % 1000 == 0:
            for k in range(env.n):
                file1 = 'results/actor' + str(k) + str(ep) + '.h5'
                # file2 = 'results/actor' + str(k) + '/target' + str(ep) + '.h5'
                file3 = 'results/critic' + str(k) + str(ep) + '.h5'
                # file4 = 'results/critic' + str(k) + '/target' + str(ep) + '.h5'
                actor = actors[k]
                critic = critics[k]
                actor.mainModel.save(file1)
                # actor.targetModel.save(file2)
                critic.mainModel.save(file3)
                # critic.targetModel.save(file4)

        plt.close()
        plt.figure()

        for stp in range(int(args['max_episode_len'])):
            if args['render_env']:
                env.render(s)
                plt.clf()

            a = []  # shape = (n, actor.action_dim)
            for i in range(env.n):
                actor = actors[i]
                a.append(actor.act(np.reshape(s[i], (-1, actor.state_dim)),
                                   noise[i]()).reshape(actor.action_dim, ))

            # a is a list with each element being an array
            s2, r, done = env.step(a)
            replayMemory.add(s, a, r, done, s2)
            s = s2

            action_dims_done = 0
            for i in range(env.n):
                actor = actors[i]
                critic = critics[i]
                if replayMemory.size() > int(args['minibatch_size']):
                    s_batch, a_batch, r_batch, d_batch, s2_batch = replayMemory.miniBatch(
                        int(args['minibatch_size']))

                    # Target actions for the critic update; batch processing is much more
                    # efficient even though reshaping has to be done
                    a = []
                    for j in range(env.n):
                        state_batch_j = np.asarray([x for x in s_batch[:, j]])
                        a.append(actors[j].predict_target(state_batch_j))
                    a_temp = np.transpose(np.asarray(a), (1, 0, 2))
                    a_for_critic = np.asarray([x.flatten() for x in a_temp])
                    s2_batch_i = np.asarray([x for x in s2_batch[:, i]])
                    targetQ = critic.predict_target(s2_batch_i, a_for_critic)

                    yi = []
                    for k in range(int(args['minibatch_size'])):
                        if d_batch[:, i][k]:
                            yi.append(r_batch[:, i][k])
                        else:
                            yi.append(r_batch[:, i][k] + critic.gamma * targetQ[k])
                    s_batch_i = np.asarray([x for x in s_batch[:, i]])
                    critic.train(s_batch_i,
                                 np.asarray([x.flatten() for x in a_batch]),
                                 np.reshape(yi, (int(args['minibatch_size']), 1)))

                    actions_pred = []
                    for j in range(env.n):
                        state_batch_j = np.asarray([x for x in s2_batch[:, j]])
                        actions_pred.append(actors[j].predict(state_batch_j))
                    a_temp = np.transpose(np.asarray(actions_pred), (1, 0, 2))
                    a_for_critic_pred = np.asarray([x.flatten() for x in a_temp])
                    s_batch_i = np.asarray([x for x in s_batch[:, i]])
                    grads = critic.action_gradients(
                        s_batch_i, a_for_critic_pred)[:, action_dims_done:action_dims_done + actor.action_dim]
                    actor.train(s_batch_i, grads)
                    actor.update_target()
                    critic.update_target()
                action_dims_done = action_dims_done + actor.action_dim

            episode_reward += r
            # print(done)
            if sum(done):
                summary_str = sess.run(summary_ops,
                                       feed_dict={summary_vars[0]: episode_reward[0],
                                                  summary_vars[1]: episode_reward[2]})
                writer.add_summary(summary_str, ep)
                writer.flush()
                break
class AgentCartpole:
    def __init__(self, p):
        self.p = p
        self.target_dqn = DQN(self.p['HIDDEN_DIM'])
        self.eval_dqn = DQN(self.p['HIDDEN_DIM'])
        self.memory = ReplayMemory(self.p['MEMORY_SIZE'], [4])
        self.optimizer = torch.optim.Adam(self.eval_dqn.parameters(), self.p['LEARNING_RATE'])
        try:
            self.eval_dqn.load_state_dict(torch.load("Model/eval_dqn.data"))
            self.target_dqn.load_state_dict(torch.load("Model/eval_dqn.data"))
            print("Data has been loaded successfully")
        except FileNotFoundError:
            print("No existing data")

    def act(self, state):
        # Epsilon-greedy action selection
        r = random.random()
        if r > self.p['EPSILON']:
            x = torch.FloatTensor(state)
            q_value = self.eval_dqn(x)
            action = torch.argmax(q_value).item()
            return action
        else:
            action = random.randint(0, self.p['N_ACTIONS'] - 1)
            return action

    def learn(self):
        if self.memory.index < self.p['BATCH_SIZE']:
            return

        # Soft update: blend the evaluation weights into the target network
        eval_dict = self.eval_dqn.state_dict()
        target_dict = self.target_dqn.state_dict()
        for w in eval_dict:
            target_dict[w] = (1 - self.p['ALPHA']) * target_dict[w] + self.p['ALPHA'] * eval_dict[w]
        self.target_dqn.load_state_dict(target_dict)

        # Get a sample of size BATCH_SIZE
        batch_state, batch_action, batch_next_state, batch_reward, batch_done = self.memory.pop(
            self.p['BATCH_SIZE'])

        # Decay the exploration threshold used by act() every time the agent learns
        if self.p["EPSILON"] > self.p["EPSILON_MIN"]:
            self.p["EPSILON"] *= self.p["EPSILON_DECAY"]

        loss = nn.MSELoss()
        # Q-values of the actions actually taken
        q_eval = self.eval_dqn(batch_state).gather(
            1, batch_action.long().unsqueeze(1)).reshape([self.p["BATCH_SIZE"]])
        # Q-values of the next states, estimated by the target network
        q_next = self.target_dqn(batch_next_state).detach()
        # Bootstrapped targets
        q_target = batch_reward + q_next.max(1)[0].reshape([self.p["BATCH_SIZE"]]) * self.p["GAMMA"]

        self.optimizer.zero_grad()
        l = loss(q_eval, q_target)
        l.backward()
        self.optimizer.step()

    def random(self):
        env = gym.make('CartPole-v1')
        env = env.unwrapped
        env.reset()
        rewards = []
        while True:
            env.render()
            action = env.action_space.sample()  # sample a random action; pop() is not an action_space method
            observation, reward, done, info = env.step(action)
            rewards.append(reward)
            if done:
                break
        env.close()

        plt.ylabel("Rewards")
        plt.xlabel("Nb interactions")
        plt.plot(rewards)
        plt.grid()
        plt.show()

    def dqn_cartpole(self):
        env = gym.make('CartPole-v1')
        env = env.unwrapped
        rewards = []
        for i in range(self.p['N_EPISODE']):
            state = env.reset()
            rewards.append(0)
            for s in range(self.p['N_STEPS']):
                # env.render()
                action = self.act(state)
                n_state, reward, done, _ = env.step(action)
                if done:
                    reward = -1
                rewards[-1] += reward
                self.memory.push(state, action, n_state, reward, done)
                self.learn()
                state = n_state
            print('Episode : ', i, ', Rewards : ', rewards[-1])
            # Save the eval model after each episode
            torch.save(self.eval_dqn.state_dict(), "Model/eval_dqn.data")

        # Display results: raw rewards and a 50-episode block average
        n = 50
        res = sum(([a] * n for a in [sum(rewards[i:i + n]) // n for i in range(0, len(rewards), n)]), [])
        print(rewards)
        plt.ylabel("Rewards")
        plt.xlabel("Episode")
        plt.plot(rewards)
        plt.plot(res)
        plt.grid()
        plt.legend(['Rewards per episode', 'Last 50 runs average'])
        plt.show()
        env.close()
def algorithmImpl():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    env = gym.make("BreakoutDeterministic-v4")  # Deterministic-v4: frameskip = 4
    numActions = env.action_space.n

    mem = ReplayMemory(MEM_CAPACITY)
    agent = Agent(EP_START, EP_END, EP_DECAY, numActions, device)
    policyNet = DQN(numActions).to(device)
    targetNet = DQN(numActions).to(device)
    targetNet.load_state_dict(policyNet.state_dict())
    targetNet.eval()
    optimizer = optim.Adam(params=policyNet.parameters(), lr=LEARNIN_RATE)

    stepCount = 0
    for ep in range(EPISODES):
        print('episode: ', ep + 1)
        done = False
        obv = env.reset()
        preproObv = preprocessing(obv)
        frames = [preproObv]
        nextFrames = []
        lastAction = 0
        totalReward = 0
        for t in count():
            if len(frames) == 4:
                # stack the last four frames into a 1x4x84x84 state tensor
                state = torch.cat(frames, dim=1).to(device)
                action = agent.selectAction(state, policyNet)
                frames = []
            else:
                action = lastAction

            obv, r, done, _ = env.step(action)
            preproObv = preprocessing(obv)
            frames.append(preproObv)
            nextFrames.append(preproObv)
            totalReward += r  # for evaluation
            if done:
                r = -1.0
            reward = torch.tensor(r).reshape(1, 1).to(device)
            lastAction = action

            if len(nextFrames) == 4:
                nextState = torch.cat(nextFrames, dim=1).to(device)  # 1x4x84x84 tensor
                nextFrames = []
                mem.push(Experience(state, action, reward, nextState))
                state = nextState

            if mem.canProvideSample(BATCH_SIZE):
                exps = mem.sample(BATCH_SIZE)
                states, actions, rewards, nextStates = extractTensors(exps)

                qPred = policyNet(states).gather(1, actions)
                qTarget = targetNet(nextStates).max(dim=1, keepdim=True)[0].detach()
                target = GAMMA * qTarget + rewards

                loss = functional.mse_loss(qPred, target)
                policyNet.zero_grad()
                loss.backward()
                optimizer.step()

            stepCount += 1
            if stepCount == TARGET_UPDATE:
                stepCount = 0
                targetNet.load_state_dict(policyNet.state_dict())
                print("SavedModels/Saved model")
                torch.save(policyNet, "Policy.pt")

            if ep % 20 == 0:
                env.render()
            if done:
                break

    torch.save(policyNet, "SavedModels/Policy.pt")
print("device", device) if not os.path.exists(file_path): os.makedirs(file_path) write_lr(lr) #lrのtextファイルを作成する now = datetime.datetime.now() print('{0:%Y%m%d}'.format(now)) #tensorboarx writer_x = SummaryWriter('tfbx2/' + '_' + '{0:%Y%m%d%H%M%S_}'.format(now) + model_filename + MEMO + '/') ban = Env(BANHEN, WINREN) memory = ReplayMemory(CAPACITY, ban) brain = Brain_dqn(NeuralNet_cnn, device, ban.size, ban, memory, GAMMA, BATCH_SIZE, lr, T, BANHEN, BANSIZE) match_is_continue = True #試合が継続しているかどうか train_is_continue = True #訓練を継続するか reward = 0 #報酬 step = 0 #何手目か step_sum = 0 gen_num = 0 #モデルの初期値 episode_sum = 0 #エピソードの累積 search_depth = 3 ep_random_data = 0 log_print("lrはtextファイルから読み取り")
weapons = ["Candlestick", "Knife", "Lead Pipe", "Revolver", "Rope", "Wrench"] characters = [ "Mr. Green", "Colonel Mustard", "Mrs. Peacock", "Professor Plum", "Ms. Scarlet", "Mrs. White" ] if numQlearnPlayers > 0: qtbl = QTable(rooms, weapons, characters) else: qtbl = {} if numDeepQLearnPlayers > 0: if os.path.exists("QNetworks.pickle"): nets = pickle.load(open("QNetworks.pickle", "rb")) rm = ReplayMemory( 100000, namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))) qNetworks = (nets[0], nets[1], rm) else: n1 = QNetwork(6, 6, 67220).to( torch.device("cuda" if torch.cuda.is_available() else "cpu")) n2 = QNetwork(6, 6, 67220).to( torch.device("cuda" if torch.cuda.is_available() else "cpu")) rm = ReplayMemory( 100000, namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))) qNetworks = (n1, n2, rm) else: qNetworks = ()
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Setup Tensorboard
tsb_folder = "tensorboard"
tsb_dirname = "DQN_%d" % (args.target_update)
tsb_path = os.path.join(tsb_folder, tsb_dirname)
writer = SummaryWriter(tsb_path)

BATCH_SIZE = 32
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))

print("TRAINING ON RANDOM INSTANCES")
memory = ReplayMemory(args.memory_capacity)
brain = DQNet(args)
i_episode = 0
steps_done = 0
trainPath = "./Data/MediumData/"

print("Start Initialization")
while len(memory) < 5 * BATCH_SIZE:
    # generate random dataset
    randomFile = random.choice(os.listdir(trainPath))
    randomFile = os.path.join(trainPath, randomFile)
    print(randomFile)
    instance = readMatInstance(randomFile)
    loss, reward, real_reward = run_episode(args, i_episode, instance, memory_initialization=True)
gamma = 0.999
eps_start = 1
eps_end = 0.01
eps_decay = 0.001
target_update = 10      # update the target network every 10 episodes
memory_size = 100000
lr = 0.001              # learning rate
num_episodes = 1000

# set the device: CPU or GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# environment manager
em = CartPoleEnvManager(device)
# create the exploration strategy
strategy = EpsilonGreedyStrategy(eps_start, eps_end, eps_decay)
# create the agent
agent = Agent(strategy, em.num_actions_available(), device)
# create the replay memory
memory = ReplayMemory(memory_size)

# create policy network and target network;
# pass screen height and width to build the appropriate input shape
policy_net = DQN(em.get_screen_height(), em.get_screen_width()).to(device)
target_net = DQN(em.get_screen_height(), em.get_screen_width()).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.Adam(params=policy_net.parameters(), lr=lr)
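# A setup like the one above is usually followed by a standard DQN optimization step.
# The sketch below is illustrative only: it assumes that memory.sample(batch_size)
# returns a list of (state, action, reward, next_state) tuples of batched tensors,
# that actions are int64 tensors, that the memory defines __len__, and that terminal
# states are handled elsewhere. None of these interfaces are shown in this section.
import torch.nn.functional as F

batch_size = 256  # assumed value, not taken from the snippet above


def optimize_model():
    if len(memory) < batch_size:
        return
    transitions = memory.sample(batch_size)
    states, actions, rewards, next_states = (torch.cat(t) for t in zip(*transitions))

    # Q(s, a) for the actions that were actually taken
    current_q = policy_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)
    # Bootstrapped targets from the frozen target network
    with torch.no_grad():
        next_q = target_net(next_states).max(1)[0]
    target_q = rewards + gamma * next_q

    loss = F.smooth_l1_loss(current_q, target_q)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()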
device = args.device
gamma = args.gamma
learning_rate = args.learning_rate
exploration_steps = args.exploration_steps
initial_epsilon = args.initial_epsilon
final_epsilon = args.final_epsilon
sync_rate = args.sync_rate
save_summary_rate = args.save_summary_rate


def get_epsilon():
    # linearly anneal epsilon from initial_epsilon to final_epsilon over exploration_steps
    if global_step < config.exploration_steps:
        return config.initial_epsilon - (
            (config.initial_epsilon - config.final_epsilon) / config.exploration_steps) * global_step
    else:
        return config.final_epsilon


RM = ReplayMemory(config)


def flush_print(str):
    print(str)
    sys.stdout.flush()


def preprocess(new_frame, state):
    # resize the frame and push it into the last slot of the rolling frame buffer
    frame = cv2.resize(new_frame, (84, 84))
    new_state = np.roll(state, -1, axis=3)
    new_state[0, :, :, config.buff_size - 1] = frame
    return new_state


with tf.device(config.device):
    input_state_ph = tf.placeholder(tf.float32, [config.batch_size, 84, 84, 4], name="input_state_ph")
class BaseAgent:
    # must be implemented by each agent
    def update(self):
        return

    def __init__(self, config, session):
        # build the net
        self.config = config
        self.sess = session
        self.RM = ReplayMemory(config)
        self.step_count = 0
        self.episode = 0
        self.isTesting = False
        self.game_state = np.zeros((1, 84, 84, self.config.buff_size), dtype=np.uint8)
        self.reset_game()
        self.timeout_option = tf.RunOptions(timeout_in_ms=5000)
        # if the new agent needs other action modes define a different dict
        self.action_modes = {str(config.epsilon) + "_greedy": self.e_greedy_action}
        self.default_action_mode = next(iter(self.action_modes))  # dict views are not indexable in Python 3
        self.action_mode = self.default_action_mode
        self.representations = []

    def step(self, screen, reward):
        # the reward is clipped inside observe()
        if not self.isTesting:
            # add the last transition
            self.RM.add(self.game_state[:, :, :, -1], self.game_action, self.game_reward, False)
            self.observe(screen, reward)
            self.game_action = self.e_greedy_action(self.epsilon())
            if self.step_count > self.config.steps_before_training:
                self.update()
            self.step_count += 1
        else:
            # if the agent is testing
            self.observe(screen, reward)
            self.game_action = self.e_greedy_action(0.01)
        return self.game_action

    # Add the final transition to the RM and reset the internal state for the
    # next episode
    def terminal(self):
        if not self.isTesting:
            self.RM.add(self.game_state[:, :, :, -1], self.game_action, self.game_reward, True)
        self.reset_game()

    def observe(self, screen, reward):
        self.game_reward = max(-1, min(1, reward))
        screen = cv2.resize(screen, (84, 84))
        screen = cv2.cvtColor(screen, cv2.COLOR_RGB2GRAY)
        self.game_state = np.roll(self.game_state, -1, axis=3)
        self.game_state[0, :, :, -1] = screen

    def e_greedy_action(self, epsilon):
        ops = [self.Q] + self.representations
        res = self.sess.run(ops, feed_dict={self.state_ph: self.game_state})
        self.Q_np = res[0]
        self.representations_np = res[1:]
        action = np.argmax(self.Q_np)
        if np.random.uniform() < epsilon:
            action = random.randint(0, self.config.action_num - 1)
        return action

    def testing(self, t=True):
        self.isTesting = t

    def set_action_mode(self, mode):
        if mode not in self.action_modes:
            raise Exception(str(mode) + " is not a valid action mode")
        self.select_action = self.action_modes[mode]

    def reset_game(self):
        self.game_state.fill(0)
        self.game_action = 0
        self.game_reward = 0
        if not self.isTesting:
            # add initial black screens for the next episode
            for i in range(self.config.buff_size - 1):
                self.RM.add(np.zeros((84, 84)), 0, 0, False)

    def epsilon(self):
        if self.step_count < self.config.exploration_steps:
            return self.config.initial_epsilon - \
                ((self.config.initial_epsilon - self.config.final_epsilon) /
                 self.config.exploration_steps) * self.step_count
        else:
            return self.config.final_epsilon
if config.h_to_h not in ["oh_concat", "expanded_concat", "conditional"]:
    raise ValueError("Not a valid transition function")  # raising a bare string is invalid in Python 3


def get_epsilon():
    if global_step < config.exploration_steps:
        return config.initial_epsilon - (
            (config.initial_epsilon - config.final_epsilon) / config.exploration_steps) * global_step
    else:
        return config.final_epsilon


RM = ReplayMemory(config)


def flush_print(str):
    print(str)
    sys.stdout.flush()


def preprocess(new_frame, state):
    frame = cv2.resize(new_frame, (84, 84))
    new_state = np.roll(state, -1, axis=3)
    new_state[0, :, :, config.buff_size - 1] = frame
    return new_state


with tf.device(config.device):