def __init__(self, env, mode, pre_trained_model, tensorboard_writer=None):
    super(DQNAgent, self).__init__(env, mode, tensorboard_writer)
    self.agent_name = 'DQN' + str(self.agent_no)
    self.memory = ReplayMemory()
    self.network = DeepQNetwork(self.obs_space[0], self.action_space)
    if self.mode == 'play':
        self.network.load_params(pre_trained_model)
        self.network.eval()
    elif self.mode == 'train':
        self.eval_network = DeepQNetwork(self.obs_space[0], self.action_space)
        self.eval_network.eval()
        if pre_trained_model:
            self.eval_network.load_params(pre_trained_model)
        self.optimizer = optim.RMSprop(self.network.parameters(), lr=LR)
        self.loss_func = SmoothL1Loss()
    else:
        raise ValueError(
            'Please set a valid mode for the agent (play or train)')
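# None of the snippets in this section include the ReplayMemory implementation they
# construct, and their constructors differ (no arguments, a capacity, or capacity plus
# state/action dimensions). For reference only, a minimal uniform-sampling buffer
# sketch, assuming a deque-based design; it is not the actual ReplayMemory class used
# by any of these agents.
import random
from collections import deque, namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'reward', 'next_state', 'done'))


class MinimalReplayMemory:
    def __init__(self, capacity=100000):
        self.buffer = deque(maxlen=capacity)

    def push(self, *args):
        # store one transition, evicting the oldest when full
        self.buffer.append(Transition(*args))

    def sample(self, batch_size):
        # uniform random minibatch of transitions
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)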
def __init__(self, sess, s_size, a_size, scope, queues, trainer):
    self.queue = queues[0]
    self.param_queue = queues[1]
    self.replaymemory = ReplayMemory(100000)
    self.sess = sess
    self.learner_net = network(s_size, a_size, scope, 20)

    self.q = self.learner_net.q
    self.Q = self.learner_net.Q

    self.actions_q = tf.placeholder(shape=[None, a_size, N], dtype=tf.float32)
    self.q_target = tf.placeholder(shape=[None, N], dtype=tf.float32)
    self.ISWeights = tf.placeholder(shape=[None, N], dtype=tf.float32)

    self.q_actiona = tf.multiply(self.q, self.actions_q)
    self.q_action = tf.reduce_sum(self.q_actiona, axis=1)
    self.u = tf.abs(self.q_target - self.q_action)
    self.loss = tf.reduce_mean(
        tf.reduce_sum(tf.square(self.u) * self.ISWeights, axis=1))

    self.local_vars = self.learner_net.local_vars  # tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
    self.gradients = tf.gradients(self.loss, self.local_vars)
    # grads, self.grad_norms = tf.clip_by_norm(self.gradients, 40.0)
    self.apply_grads = trainer.apply_gradients(
        zip(self.gradients, self.local_vars))

    self.sess.run(tf.global_variables_initializer())
def __init__(self, policy_net, target_net, durability, optimizer, name, constants):
    """An agent class that takes action on the environment and optimizes
    the action based on the reward.

    Parameters
    ----------
    policy_net : DQN
        [description]
    target_net : DQN
        [description]
    durability : int
        [description]
    optimizer : [type]
        [description]
    name : str
        The name of agent
    constants: Constants
        The hyper-parameters from Constants class
    """
    self.CONSTANTS = constants
    self.policy_net = policy_net
    self.target_net = target_net
    self.target_net.load_state_dict(policy_net.state_dict())
    self.durability = durability
    self.optimizer = optimizer
    self.name = name
    self.memory = ReplayMemory(self.CONSTANTS.MEMORY_SIZE)
    self.steps_done = 0
    self.total_reward = 0.0
    self.reward = 0.0
    self.obtained_reward = 0.0
    self.n_best = 0
    self.policy_net_flag = False
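# The agent above tracks steps_done and holds a policy_net, which suggests
# epsilon-greedy action selection with a decaying epsilon. Its actual method is not
# shown; this sketch follows the common PyTorch DQN-tutorial pattern, and EPS_START,
# EPS_END, EPS_DECAY and n_actions are assumed hyper-parameters, not values taken
# from the Constants class above.
import math
import random

import torch


def select_action_sketch(agent, state, n_actions, device,
                         EPS_START=0.9, EPS_END=0.05, EPS_DECAY=200):
    eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(-1.0 * agent.steps_done / EPS_DECAY)
    agent.steps_done += 1
    if random.random() > eps_threshold:
        with torch.no_grad():
            # greedy action from the policy network
            return agent.policy_net(state).max(1)[1].view(1, 1)
    # otherwise explore with a random action
    return torch.tensor([[random.randrange(n_actions)]], device=device, dtype=torch.long)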
def __init__(self, state_size, action_size, seed, is_double_q=False):
    '''Initialize an Agent.

    Params
    ======
        state_size (int): the dimension of the state
        action_size (int): the number of actions
        seed (int): random seed
    '''
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)
    self.t_step = 0  # Initialize time step (for tracking LEARN_EVERY_STEP and UPDATE_EVERY_STEP)
    self.running_loss = 0
    self.training_cnt = 0
    self.is_double_q = is_double_q

    self.qnetwork_local = QNetwork(self.state_size, self.action_size, seed).to(device)
    self.qnetwork_target = QNetwork(self.state_size, self.action_size, seed).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

    self.replay_memory = ReplayMemory(BATCH_SIZE, BUFFER_SIZE, seed)
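# The is_double_q flag above implies the learning step switches between vanilla and
# Double DQN targets. The actual learn() is not shown; this sketch only illustrates
# how the two targets are usually computed from qnetwork_local / qnetwork_target
# (assumed tensor shapes: rewards and dones are (batch, 1), actions are long indices).
import torch


def td_targets_sketch(qnetwork_local, qnetwork_target, rewards, next_states, dones,
                      gamma, is_double_q):
    with torch.no_grad():
        if is_double_q:
            # Double DQN: choose the action with the local net, evaluate it with the target net
            best_actions = qnetwork_local(next_states).argmax(dim=1, keepdim=True)
            q_next = qnetwork_target(next_states).gather(1, best_actions)
        else:
            # vanilla DQN: max over the target net
            q_next = qnetwork_target(next_states).max(dim=1, keepdim=True)[0]
    return rewards + gamma * q_next * (1 - dones)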
def __init__(self, load_checkpoint, n_states, n_actions, checkpoint_file,
             mem_size=10**6, batch_size=64, n_hid1=400, n_hid2=300,
             alpha=1e-4, beta=1e-3, gamma=0.99, tau=0.99):
    self.batch_size = batch_size
    self.gamma = gamma
    self.tau = tau
    self.actor = ActorNetwork(n_states, n_actions, n_hid1, n_hid2, alpha,
                              checkpoint_file, name='actor')
    self.critic = CriticNetwork(n_states, n_actions, n_hid1, n_hid2, beta,
                                checkpoint_file, name='critic')
    self.actor_target = ActorNetwork(n_states, n_actions, n_hid1, n_hid2, alpha,
                                     checkpoint_file, name='actor_target')
    self.critic_target = CriticNetwork(n_states, n_actions, n_hid1, n_hid2, beta,
                                       checkpoint_file, name='critic_target')
    self.noise = OUActionNoise(mu=np.zeros(n_actions))
    self.memory = ReplayMemory(mem_size, n_states, n_actions)
    self.update_network_parameters_phil(tau=1)
    if load_checkpoint:
        self.actor.eval()
    self.load_checkpoint = load_checkpoint
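# update_network_parameters_phil() is not shown; calling it with tau=1 at the end of
# __init__ suggests the usual Polyak averaging, where tau=1 degenerates to a hard copy
# of the online networks into the targets. A generic sketch, not the author's code:
def soft_update_sketch(online_net, target_net, tau):
    for online_param, target_param in zip(online_net.parameters(), target_net.parameters()):
        # target <- tau * online + (1 - tau) * target
        target_param.data.copy_(tau * online_param.data + (1.0 - tau) * target_param.data)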
def __init__(self, env, name, s_size, a_size, trainer, model_path, global_episodes):
    self.name = "worker_" + str(name)
    self.number = name
    self.model_path = model_path
    self.trainer = trainer
    self.global_episodes = global_episodes
    self.increment = self.global_episodes.assign_add(1)
    self.episode_rewards = []
    self.episode_lengths = []
    self.episode_mean_values = []

    # Create the local copy of the network and the tensorflow op to copy global parameters to the local network
    self.local_Q = Q_Network(s_size, a_size, self.name, trainer)
    self.update_local_ops = update_target_graph('global', self.name)
    self.env = env
    self.replaymemory = ReplayMemory(max_memory)
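# update_target_graph('global', self.name) is referenced but not defined in this
# snippet. In A3C-style workers it is commonly a list of assign ops that copy the
# global network's trainable variables into the worker's local scope; a TF1 sketch
# (an assumption, not necessarily the author's version):
import tensorflow as tf


def update_target_graph_sketch(from_scope, to_scope):
    from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope)
    to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)
    # one assign op per variable; run them all to sync the local copy
    return [to_var.assign(from_var) for from_var, to_var in zip(from_vars, to_vars)]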
def __init__(self, num_states, num_actions, Double, Dueling, PER):
    self.num_actions = num_actions  # number of possible actions (2)
    self.Double = Double
    self.Dueling = Dueling
    self.PER = PER

    # create the memory object that stores transitions
    self.memory = ReplayMemory(CAPACITY)

    # build the networks
    n_in, n_mid, n_out = num_states, 32, num_actions
    self.main_q_network = Net(n_in, n_mid, n_out, Dueling)    # uses the Net class
    self.target_q_network = Net(n_in, n_mid, n_out, Dueling)  # uses the Net class
    print(self.main_q_network)  # print the network architecture

    # choose the optimizer
    self.optimizer = optim.Adam(self.main_q_network.parameters(), lr=0.0001)

    # PER - create the memory object that stores TD errors
    if self.PER == True:
        self.td_error_memory = TDerrorMemory(CAPACITY)
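# TDerrorMemory is not shown. With PER enabled, minibatch indices are typically drawn
# with probability proportional to |TD error| plus a small constant; a minimal sketch
# of that sampling rule (the stored class may implement it differently):
import numpy as np


def prioritized_indexes_sketch(td_errors, batch_size, td_epsilon=1e-4):
    priorities = np.abs(np.asarray(td_errors, dtype=np.float64)) + td_epsilon
    probabilities = priorities / priorities.sum()
    # sample transition indices according to their priority
    return np.random.choice(len(td_errors), size=batch_size, p=probabilities)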
def __init__(self, dim):
    self.critic_path = cst.CN_CKPT_PATH
    self.actor_path = cst.AN_CKPT_PATH
    self.replaymemory_path = cst.RM_PATH

    self.dim_body = dim[0]
    self.dim_sensor = dim[1]
    self.dim_state = dim[0] + dim[1] * 3
    self.dim_action = dim[2]

    self.sess = tf.InteractiveSession()

    self.act_lr = cst.ACT_LEARNING_RATE
    self.cri_lr = cst.CRI_LEARNING_RATE
    self.tau = cst.TAU
    self.batch_size = cst.BATCH_SIZE
    self.gamma = cst.REWARD_DECAY

    self.actorNN = ActorNetwork(self.sess, self.dim_state, self.dim_action,
                                self.act_lr, self.tau, self.batch_size)
    self.criticNN = CriticNetwork(self.sess, self.dim_state, self.dim_action,
                                  self.cri_lr, self.tau, self.gamma,
                                  self.actorNN.get_num_trainable_vars())

    self.sess.run(tf.global_variables_initializer())

    self.actorNN.update_target_network()
    self.criticNN.update_target_network()

    self.rm = ReplayMemory('DDPG')

    self.agent_count = cst.AGENT_COUNT
    self.exploration_rate = cst.EXPLORATION_RATE
    self.epsilon = cst.CRITIC_EPSILON
    self.LOSS_ITERATION = cst.LOSS_ITERATION

    self.expl_noise = OUNoise(self.dim_action)
    self.expl = False
    self.expl_decay = cst.EXPLORATION_DECAY
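# The OUNoise / OUActionNoise classes used by the DDPG agents in this section are not
# included. They normally implement an Ornstein-Uhlenbeck process for temporally
# correlated exploration noise; a textbook sketch (parameter names are assumptions):
import numpy as np


class OUNoiseSketch:
    def __init__(self, mu, theta=0.15, sigma=0.2, dt=1e-2):
        self.mu = np.asarray(mu, dtype=np.float64)
        self.theta, self.sigma, self.dt = theta, sigma, dt
        self.reset()

    def reset(self):
        self.x = np.zeros_like(self.mu)

    def __call__(self):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)
        self.x = self.x + self.theta * (self.mu - self.x) * self.dt \
            + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        return self.x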
def main(game, episodes, training_mode=False, log=False, no_ops=30):
    env = gym.make(game)
    num_actions = env.action_space.n
    dqn = DeepQNetwork(num_actions, (4, 84, 84))
    replay = ReplayMemory(100000)

    obs = env.reset()
    h, w, c = obs.shape
    phi = Phi(4, 84, 84, c, h, w)
    agent = Agent(replay, dqn, training_mode=training_mode)
    stats = Stats('results/results.csv')

    for i_episode in range(episodes):
        env.reset()
        for i in range(random.randint(1, no_ops)):
            observation, _, _, _ = env.step(0)
        pre_state = phi.add(observation)
        game_score = 0
        done = False
        t = 0
        while not done:
            t += 1
            env.render()
            action = agent.get_action(pre_state)
            observation, reward, done, _ = env.step(action)
            post_state = phi.add(observation)
            if training_mode:
                agent.update_replay_memory(pre_state, action, reward, post_state, done)
                if agent.time_step > agent.replay_start_size:
                    stats.log_time_step(agent.get_loss())
            pre_state = post_state
            game_score += reward
        print("Episode {} finished after {} time steps with score {}".format(
            i_episode, t, game_score))
        phi.reset()
        if agent.time_step > agent.replay_start_size:
            stats.log_game(game_score, t)

    stats.close()
    if log:
        dqn.save_model('results/model_weights.hdf5')
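# Phi(4, 84, 84, c, h, w) is not defined in this snippet; from its usage it appears to
# keep a stack of the 4 most recent frames, resized to 84x84 grayscale, in the spirit
# of the DQN Atari preprocessing. A rough sketch of such a helper, assuming OpenCV is
# available (not the actual Phi implementation):
import cv2
import numpy as np


class FrameStackSketch:
    def __init__(self, k=4, height=84, width=84):
        self.k = k
        self.height, self.width = height, width
        self.frames = np.zeros((k, height, width), dtype=np.uint8)

    def add(self, rgb_frame):
        gray = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2GRAY)
        small = cv2.resize(gray, (self.width, self.height), interpolation=cv2.INTER_AREA)
        # drop the oldest frame and append the newest
        self.frames = np.roll(self.frames, shift=-1, axis=0)
        self.frames[-1] = small
        return self.frames.copy()

    def reset(self):
        self.frames[:] = 0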
def main():
    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=False)
    env_evaluate = PLE(game, fps=30, display_screen=False)
    obs_dim = len(env.getGameState())
    action_dim = 2  # only the "up" key plus one other action (do nothing), so 2

    # rpm = ReplayMemory(MEMORY_SIZE, obs_dim, action_dim)
    rpm = ReplayMemory(MEMORY_SIZE)
    model = Model(act_dim=action_dim)
    algorithm = parl.algorithms.DQN(model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(
        algorithm,
        obs_dim=obs_dim,
        act_dim=action_dim,
        e_greed=0.2,  # explore
        e_greed_decrement=1e-6)  # probability of exploring decreases during training

    if os.path.exists('./model_dir'):
        agent.restore('./model_dir')

    # while rpm.size() < MEMORY_WARMUP_SIZE:  # warm up replay memory
    while len(rpm) < MEMORY_WARMUP_SIZE:  # warm up replay memory
        run_episode(agent, env, rpm)

    max_episode = 5000

    # start training
    episode = 0
    while episode < max_episode:
        # train part
        for i in range(0, 50):
            total_reward = run_episode(agent, env, rpm)
            episode += 1

        eval_reward = evaluate(agent, env_evaluate)
        logger.info('episode:{} test_reward:{}'.format(episode, eval_reward))

    agent.save('./model_dir')
EPISODES = 500
START_RANDOM = False
MAX_EPISODE_COUNTER = 3600 * 24 * 2.0 / PERIOD
ACTION_DIM = 1
STATE_DIM = 6
ACTION_MAX = 1.0
MAX_BUFFER = 100000
MAX_TOTAL_REWARD = 300
EPISODE_PLOT = 25

# -------------------------------------------- #
# LOAD USEFUL CLASSES.
# -------------------------------------------- #

# Load the memory.
memory = ReplayMemory(MAX_BUFFER)

# Load the environment.
env = Environment(FILENAME, QUOTE_QTY, TRADE_QTY)

# Load the trainer.
trainer = Trainer(STATE_DIM, ACTION_DIM, ACTION_MAX, memory)

# Load the window.
window = Window(LOOK_BACK)
window.add_norm("#t", method="log_change", ref="close_price_#t")

# Load the tensorboard writer.
writer = SummaryWriter("tensorboard/runs")

# -------------------------------------------- #
def __init__(self, load_checkpoint, checkpoint_file, env, n_states, n_actions,
             update_actor_interval=2, warmup=1000, mem_size=10**6, batch_size=100,
             n_hid1=400, n_hid2=300, lr_alpha=1e-3, lr_beta=1e-3,
             gamma=0.99, tau=5e-3, noise_mean=0, noise_sigma=0.1):
    self.load_checkpoint = load_checkpoint
    self.checkpoint_file = checkpoint_file
    # needed for clamping in the learn function
    self.env = env
    self.max_action = float(env.action_space.high[0])
    self.low_action = float(env.action_space.low[0])
    self.n_actions = n_actions
    # to keep track of how often we call the "learn" function, for the actor network
    self.learn_step_counter = 0
    # to handle the countdown to the end of the warmup period, incremented every time we choose an action
    self.time_step = 0
    self.update_actor_interval = update_actor_interval
    self.warmup = warmup
    self.gamma = gamma
    self.tau = tau
    self.batch_size = batch_size
    self.noise_mean = noise_mean
    self.noise_sigma = noise_sigma

    self.actor = TD3ActorNetwork(n_states, n_actions, n_hid1, n_hid2, lr_alpha,
                                 checkpoint_file, name='actor')
    self.target_actor = TD3ActorNetwork(n_states, n_actions, n_hid1, n_hid2, lr_alpha,
                                        checkpoint_file, name='target_actor')
    self.critic_1 = TD3CriticNetwork(n_states, n_actions, n_hid1, n_hid2, lr_beta,
                                     checkpoint_file, name='critic_1')
    self.critic_2 = TD3CriticNetwork(n_states, n_actions, n_hid1, n_hid2, lr_beta,
                                     checkpoint_file, name='critic_2')
    self.target_critic_1 = TD3CriticNetwork(n_states, n_actions, n_hid1, n_hid2, lr_beta,
                                            checkpoint_file, name='target_critic_1')
    self.target_critic_2 = TD3CriticNetwork(n_states, n_actions, n_hid1, n_hid2, lr_beta,
                                            checkpoint_file, name='target_critic_2')

    self.memory = ReplayMemory(mem_size, n_states, n_actions)

    # tau=1 performs an exact copy of the networks to the respective targets
    # self.update_network_parameters(tau=1)
    self.update_network_parameters(self.actor, self.target_actor, tau=1)
    self.update_network_parameters(self.critic_1, self.target_critic_1, tau=1)
    self.update_network_parameters(self.critic_2, self.target_critic_2, tau=1)
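# The warmup, noise_mean/noise_sigma and max_action/low_action attributes above point
# to the usual TD3 action-selection recipe: pure Gaussian exploration during the
# warm-up phase, then actor output plus Gaussian noise, clamped to the action bounds.
# The agent's real choose_action() is not shown; this is only a sketch of that recipe.
import numpy as np
import torch


def choose_action_sketch(agent, observation):
    noise = torch.tensor(np.random.normal(loc=agent.noise_mean, scale=agent.noise_sigma,
                                          size=(agent.n_actions,)), dtype=torch.float)
    if agent.time_step < agent.warmup:
        mu = noise  # explore without consulting the actor yet
    else:
        state = torch.tensor(observation, dtype=torch.float)
        mu = agent.actor(state) + noise
    mu = torch.clamp(mu, agent.low_action, agent.max_action)
    agent.time_step += 1
    return mu.detach().numpy()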
image_dimensions = 210 * 160 * 3
num_episodes = 50
target_episode_update = 5
action_threshold = 250
train_batch_size = 64
GAMMA = 0.999
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200
steps_done = 0

n_actions = env.action_space.n
screen_height = 210
screen_width = 160

memory = ReplayMemory(10000)
policy_net = DQN(screen_height, screen_width, n_actions).to(device)
target_net = DQN(screen_height, screen_width, n_actions).to(device)
target_net.eval()
optimizer = optim.RMSprop(policy_net.parameters())


def optimize_model():
    if len(memory) < train_batch_size:
        return
    transitions = memory.sample(train_batch_size)
    print('Training on:', len(transitions))
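# optimize_model() above stops right after sampling the minibatch; the rest of a
# standard PyTorch-tutorial-style update would look roughly like the following. It
# assumes the usual Transition namedtuple and reuses policy_net, target_net,
# optimizer, device, GAMMA and train_batch_size from the snippet above; it is a
# sketch of the common pattern, not the file's missing code.
from collections import namedtuple

import torch
import torch.nn.functional as F

# assumed layout of what memory.sample() returns
Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))


def optimize_model_continued(transitions):
    batch = Transition(*zip(*transitions))
    non_final_mask = torch.tensor([s is not None for s in batch.next_state],
                                  device=device, dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Q(s_t, a) for the actions actually taken
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # V(s_{t+1}) from the target network, zero for terminal states
    next_state_values = torch.zeros(train_batch_size, device=device)
    next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0].detach()
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()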
class SingleAgent(object): def __init__(self, game, mem_size = 1000000, state_buffer_size = 4, batch_size = 64, learning_rate = 1e-5, pretrained_model = None, frameskip = 4 ): """ Inputs: - game: string to select the game - mem_size: int length of the replay memory - state_buffer_size: int number of recent frames used as input for neural network - batch_size: int - learning_rate: float - pretrained_model: str path to the model - record: boolean to enable record option """ # Namestring self.game = game # Environment self.env = Environment(game_name[game], dimensions[game], frameskip=frameskip) # Cuda self.use_cuda = torch.cuda.is_available() # Neural network self.net = DQN(channels_in = state_buffer_size, num_actions = self.env.get_number_of_actions()) self.target_net = DQN(channels_in = state_buffer_size, num_actions = self.env.get_number_of_actions()) if self.use_cuda: self.net.cuda() self.target_net.cuda() if pretrained_model: self.net.load(pretrained_model) self.target_net.load(pretrained_model) self.pretrained_model = True else: self.pretrained_model = False # Optimizer self.learning_rate = learning_rate self.optimizer = optim.Adam(self.net.parameters(), lr=learning_rate) #self.optimizer = optim.RMSprop(self.net.parameters(), lr=learning_rate,alpha=0.95, eps=0.01) self.batch_size = batch_size self.optimize_each_k = 1 self.update_target_net_each_k_steps = 10000 self.noops_count = 0 # Replay Memory (Long term memory) self.replay = ReplayMemory(capacity=mem_size, num_history_frames=state_buffer_size) self.mem_size = mem_size # Fill replay memory before training if not self.pretrained_model: self.start_train_after = 50000 else: self.start_train_after = mem_size//2 # Buffer for the most recent states (Short term memory) self.num_stored_frames = state_buffer_size # Steps self.steps = 0 # Save net self.save_net_each_k_episodes = 500 def select_action(self, observation, mode='train'): """ Select an random action from action space or an proposed action from neural network depending on epsilon Inputs: - observation: np.array with the observation Returns: action: int """ # Hyperparameters EPSILON_START = 1 EPSILON_END = 0.1 EPSILON_DECAY = 1000000 EPSILON_PLAY = 0.01 MAXNOOPS = 30 # Decrease of epsilon value if not self.pretrained_model: #epsilon = EPSILON_END + (EPSILON_START - EPSILON_END) * \ # np.exp(-1. 
* (self.steps-self.batch_size) / EPSILON_DECAY) epsilon = EPSILON_START - self.steps * (EPSILON_START - EPSILON_END) / EPSILON_DECAY elif mode=='play': epsilon = EPSILON_PLAY else: epsilon = EPSILON_END if epsilon < random(): # Action according to neural net # Wrap tensor into variable state_variable = Variable(observation, volatile=True) # Evaluate network and return action with maximum of activation action = self.net(state_variable).data.max(1)[1].view(1,1) # Prevent noops if action[0,0]!=1: self.noops_count += 1 if self.noops_count == MAXNOOPS: action[0,0] = 1 self.noops_count = 0 else: self.noops_count = 0 else: # Random action action = self.env.sample_action() action = LongTensor([[action]]) return action def optimize(self, net_updates): """ Optimizer function Inputs: - net_updates: int Returns: - loss: float - q_value: float - exp_q_value: float """ # Hyperparameter GAMMA = 0.99 # not enough memory yet if len(self.replay) < self.start_train_after: return # Sample a transition batch = self.replay.sampleTransition(self.batch_size) # Mask to indicate which states are not final (=done=game over) non_final_mask = ByteTensor(list(map(lambda ns: ns is not None, batch.next_state))) # Wrap tensors in variables state_batch = Variable(torch.cat(batch.state)) action_batch = Variable(torch.cat(batch.action)) reward_batch = Variable(torch.cat(batch.reward)) non_final_next_states = Variable(torch.cat([ns for ns in batch.next_state if ns is not None]), volatile=True) # volatile==true prevents calculation of the derivative next_state_values = Variable(torch.zeros(self.batch_size).type(FloatTensor), volatile=False) if self.use_cuda: state_batch = state_batch.cuda() action_batch = action_batch.cuda() reward_batch = reward_batch.cuda() non_final_mask = non_final_mask.cuda() non_final_next_states = non_final_next_states.cuda() next_state_values = next_state_values.cuda() # Compute Q(s_t, a) - the self.net computes Q(s_t), then we select the # columns of actions taken state_action_values = self.net(state_batch).gather(1, action_batch) # Compute V(s_{t+1}) for all next states. 
next_max_values = self.target_net(non_final_next_states).detach().max(1)[0] next_state_values[non_final_mask]= next_max_values # Compute the expected Q values expected_state_action_values = (next_state_values * GAMMA) + reward_batch # Compute Huber loss loss = F.smooth_l1_loss(state_action_values, expected_state_action_values) self.optimizer.zero_grad() loss.backward() for param in self.net.parameters(): param.grad.data.clamp_(-1, 1) self.optimizer.step() if net_updates%self.update_target_net_each_k_steps==0: self.target_net.load_state_dict(self.net.state_dict()) print('target_net update!') return loss.data.cpu().numpy()[0] def play(self, n): """ Play a game with the current net and render it Inputs: - n: games to play """ for i in range(n): done = False # games end indicator variable score = 0 # Reset game screen = self.env.reset() # list of k last frames last_k_frames = [] for j in range(self.num_stored_frames): last_k_frames.append(None) last_k_frames[j] = gray2pytorch(screen) # frame is saved as ByteTensor -> convert to gray value between 0 and 1 state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0 while not done: action = self.select_action(state, mode='play')[0,0] screen, reward, _, done, _ = self.env.step(action, mode='play') score += reward # save latest frame, discard oldest for j in range(self.num_stored_frames-1): last_k_frames[j] = last_k_frames[j+1] last_k_frames[self.num_stored_frames-1] = gray2pytorch(screen) # convert frames to range 0 to 1 again state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0 self.state = state print('Game ({}/{}) - Final score {}: {}'.format(i+1, n, self.game, score)) self.env.game.close() def play_stats(self, n_games, mode='random'): """ Play N games randomly or evaluate a net and log results for statistics Input: - n_games: int Number of games to play - mode: str 'random' or 'evaluation' """ # Subdirectory for logging sub_dir = mode + '_' + self.game + '/' if not os.path.exists(sub_dir): os.makedirs(sub_dir) # Store history reward_history = [] reward_clamped_history = [] # Number of actions to sample from n_actions = self.env.get_number_of_actions() for i_episode in range(1, n_games+1): # Reset game screen = self.env.reset() # Store screen if mode=='evaluation': # list of k last frames last_k_frames = [] for j in range(self.num_stored_frames): last_k_frames.append(None) last_k_frames[j] = gray2pytorch(screen) # frame is saved as ByteTensor -> convert to gray value between 0 and 1 state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0 # games end indicator variable done = False # reset score with initial lives, because every lost live adds -1 total_reward = 0 total_reward_clamped = self.env.get_lives() while not done: if mode=='random': action = randrange(n_actions) elif mode=='evaluation': action = self.select_action(state, mode='play')[0,0] screen, reward, reward_clamped, done, _ = self.env.step(action) total_reward += int(reward) total_reward_clamped += int(reward_clamped) if mode=='evaluation': # save latest frame, discard oldest for j in range(self.num_stored_frames-1): last_k_frames[j] = last_k_frames[j+1] last_k_frames[self.num_stored_frames-1] = gray2pytorch(screen) # convert frames to range 0 to 1 again state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0 # Print current result print('Episode: {:6}/{:6} | '.format(i_episode, n_games), 'score: ({:4}/{:4})'.format(total_reward_clamped,total_reward)) # Save rewards reward_history.append(total_reward) reward_clamped_history.append(total_reward_clamped) avg_reward = 
np.sum(reward_history)/len(reward_history) avg_reward_clamped = np.sum(reward_clamped_history)/len(reward_clamped_history) # Print final result print('\n\n=============================================\n' + 'avg score after {:6} episodes: ({:.2f}/{:.2f})\n'.format(n_games, avg_reward_clamped, avg_reward)) # Log results to files with open(sub_dir + mode + '.txt', 'w') as fp: fp.write('avg score after {:6} episodes: ({:.2f}/{:.2f})\n'.format(n_games, avg_reward_clamped, avg_reward)) with open(sub_dir + mode + '_reward.pickle', 'wb') as fp: pickle.dump(reward_history, fp) with open(sub_dir + mode + '_reward_clamped.pickle', 'wb') as fp: pickle.dump(reward_clamped_history, fp) def train(self): """ Train the agent """ num_episodes = 100000 net_updates = 0 # Logging sub_dir = self.game + '_' + datetime.now().strftime('%Y%m%d_%H%M%S') + '/' os.makedirs(sub_dir) logfile = sub_dir + self.game + '_train.txt' loss_file = sub_dir + 'loss.pickle' reward_file = sub_dir + 'reward.pickle' reward_clamped_file = sub_dir + 'reward_clamped.pickle' log_avg_episodes = 50 best_score = 0 best_score_clamped = 0 avg_score = 0 avg_score_clamped = 0 loss_history = [] reward_history = [] reward_clamped_history = [] # Initialize logfile with header with open(logfile, 'w') as fp: fp.write(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + '\n' + 'Trained game: ' + str(self.game) + '\n' + 'Learning rate: ' + str(self.learning_rate) + '\n' + 'Batch size: ' + str(self.batch_size) + '\n' + 'Memory size(replay): ' + str(self.mem_size) + '\n' + 'Pretrained: ' + str(self.pretrained_model) + '\n' + 'Started training after k frames: ' + str(self.start_train_after) + '\n' + 'Optimized after k frames: ' + str(self.optimize_each_k) + '\n' + 'Target net update after k frame: ' + str(self.update_target_net_each_k_steps) + '\n\n' + '------------------------------------------------------' + '--------------------------------------------------\n') print('Started training...\nLogging to', sub_dir) for i_episode in range(1,num_episodes): # reset game at the start of each episode screen = self.env.reset() # list of k last frames last_k_frames = [] for j in range(self.num_stored_frames): last_k_frames.append(None) last_k_frames[j] = gray2pytorch(screen) if i_episode == 1: self.replay.pushFrame(last_k_frames[0].cpu()) # frame is saved as ByteTensor -> convert to gray value between 0 and 1 state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0 done = False # games end indicator variable # reset score with initial lives, because every lost live adds -1 total_reward = 0 total_reward_clamped = self.env.get_lives() # Loop over one game while not done: self.steps +=1 action = self.select_action(state) # perform selected action on game screen, reward, reward_clamped, done, _ = self.env.step(action[0,0]) total_reward += int(reward) total_reward_clamped += int(reward_clamped) # Wrap into tensor reward = torch.Tensor([reward_clamped]) # save latest frame, discard oldest for j in range(self.num_stored_frames-1): last_k_frames[j] = last_k_frames[j+1] last_k_frames[self.num_stored_frames-1] = gray2pytorch(screen) # convert frames to range 0 to 1 again if not done: next_state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0 else: next_state = None # Store transition self.replay.pushFrame(last_k_frames[self.num_stored_frames - 1].cpu()) self.replay.pushTransition((self.replay.getCurrentIndex()-1)%self.replay.capacity, action, reward, done) # only optimize each kth step if self.steps%self.optimize_each_k == 0: loss = self.optimize(net_updates) # Logging 
loss_history.append(loss) #q_history.append(q_value) #exp_q_history.append(exp_q_value) net_updates += 1 # set current state to next state to select next action if next_state is not None: state = next_state if self.use_cuda: state = state.cuda() # plays episode until there are no more lives left ( == done) if done: break; # Save rewards reward_history.append(total_reward) reward_clamped_history.append(total_reward_clamped) print('Episode: {:6} | '.format(i_episode), 'steps {:8} | '.format(self.steps), 'loss: {:.2E} | '.format(loss if loss else 0), 'score: ({:4}/{:4}) | '.format(total_reward_clamped,total_reward), 'best score: ({:4}/{:4}) | '.format(best_score_clamped,best_score), 'replay size: {:7}'.format(len(self.replay))) avg_score_clamped += total_reward_clamped avg_score += total_reward if total_reward_clamped > best_score_clamped: best_score_clamped = total_reward_clamped if total_reward > best_score: best_score = total_reward if i_episode % log_avg_episodes == 0 and i_episode!=0: avg_score_clamped /= log_avg_episodes avg_score /= log_avg_episodes print('----------------------------------------------------------------' '-----------------------------------------------------------------', '\nLogging to file: \nEpisode: {:6} '.format(i_episode), 'steps: {:8} '.format(self.steps), 'avg on last {:4} games ({:6.1f}/{:6.1f}) '.format(log_avg_episodes, avg_score_clamped,avg_score), 'best score: ({:4}/{:4})'.format(best_score_clamped, best_score), '\n---------------------------------------------------------------' '------------------------------------------------------------------') # Logfile with open(logfile, 'a') as fp: fp.write('Episode: {:6} | '.format(i_episode) + 'steps: {:8} | '.format(self.steps) + 'avg on last {:4} games ({:6.1f}/{:6.1f}) | '.format(log_avg_episodes, avg_score_clamped,avg_score) + 'best score: ({:4}/{:4})\n'.format(best_score_clamped, best_score)) # Dump loss & reward with open(loss_file, 'wb') as fp: pickle.dump(loss_history, fp) with open(reward_file, 'wb') as fp: pickle.dump(reward_history, fp) with open(reward_clamped_file, 'wb') as fp: pickle.dump(reward_clamped_history, fp) avg_score_clamped = 0 avg_score = 0 if i_episode % self.save_net_each_k_episodes == 0: with open(logfile, 'a') as fp: fp.write('Saved model at episode ' + str(i_episode) + '...\n') self.target_net.save(sub_dir + self.game + '-' + str(i_episode) + '_episodes.model') print('Training done!') self.target_net.save(sub_dir + self.game + '.model')
class Agent(object): def __init__( self, game, mem_size=512 * 512, #1024*512, state_buffer_size=4, batch_size=64, learning_rate=1e-5, pretrained_model=None, frameskip=4, #1 record=False): """ Inputs: - game: string to select the game - mem_size: int length of the replay memory - state_buffer_size: int number of recent frames used as input for neural network - batch_size: int - learning_rate: float - pretrained_model: str path to the model - record: boolean to enable record option """ # Namestring self.game = game # dimensions: tuple (h1,h2,w1,w2) with dimensions of the game (to crop borders) #if self.game == 'Breakout-v0': # dimensions = (32, 195, 8, 152) #elif self.game == 'SpaceInvaders-v0': # dimensions = (21, 195, 20, 141) #elif self.game == 'Assault-v0': # dimensions = (50, 240, 5, 155) #elif self.game == 'Phoenix-v0': # dimensions = (23, 183, 0, 160) #elif self.game == 'Skiing-v0': # dimensions = (55, 202, 8, 152) #elif self.game == 'Enduro-v0': # dimensions = (50, 154, 8, 160) #elif self.game == 'BeamRider-v0': # dimensions = (32, 180, 9, 159) if self.game == 'BreakoutAndSpace': dimensions_break = (32, 195, 8, 152) dimensions_space = (21, 195, 20, 141) elif self.game != 'BreakoutAndSpace': print( 'Error! This version is for playing BreakOut and SpaceInvaders at the same time.' ) # Environment self.env_break = Environment('BreakoutNoFrameskip-v4', dimensions_break, frameskip=frameskip) self.env_space = Environment('SpaceInvaders-v0', dimensions_space, frameskip=frameskip) # Cuda self.use_cuda = torch.cuda.is_available() # Neural network self.net = DQN(channels_in=state_buffer_size, num_actions=self.env_space.get_number_of_actions()) self.target_net = DQN( channels_in=state_buffer_size, num_actions=self.env_space.get_number_of_actions()) if self.use_cuda: self.net.cuda() self.target_net.cuda() if pretrained_model: self.net.load(pretrained_model) self.target_net.load(pretrained_model) self.pretrained_model = True else: self.pretrained_model = False # Optimizer self.learning_rate = learning_rate self.optimizer = optim.Adam(self.net.parameters(), lr=learning_rate) #self.optimizer = optim.RMSprop(self.net.parameters(), lr = 0.00025,alpha=0.95, eps=0.01) self.batch_size = batch_size self.optimize_each_k = 4 self.update_target_net_each_k_steps = 10000 self.noops_count = 0 # Replay Memory (Long term memory) self.replay = ReplayMemory(capacity=mem_size, num_history_frames=state_buffer_size) self.mem_size = mem_size # Fill replay memory before training if not self.pretrained_model: self.start_train_after = 25000 else: self.start_train_after = mem_size // 2 # Buffer for the most recent states (Short term memory) self.num_stored_frames = state_buffer_size # Steps self.steps = 0 # Save net self.save_net_each_k_episodes = 500 def select_action(self, observation, mode='train'): """ Select an random action from action space or an proposed action from neural network depending on epsilon Inputs: - observation: np.array with the observation Returns: action: int """ # Hyperparameters EPSILON_START = 1 EPSILON_END = 0.1 EPSILON_DECAY = 1000000 MAXNOOPS = 30 # Decrease of epsilon value if not self.pretrained_model: #epsilon = EPSILON_END + (EPSILON_START - EPSILON_END) * \ # np.exp(-1. 
* (self.steps-self.batch_size) / EPSILON_DECAY) epsilon = EPSILON_START - self.steps * ( EPSILON_START - EPSILON_END) / EPSILON_DECAY else: epsilon = EPSILON_END if epsilon > random() or mode == 'play': # Action according to neural net # Wrap tensor into variable state_variable = Variable(observation, volatile=True) # Evaluate network and return action with maximum of activation action = self.net(state_variable).data.max(1)[1].view(1, 1) # Prevent noops if action[0, 0] == 0: self.noops_count += 1 if self.noops_count == MAXNOOPS: action[0, 0] = 1 self.noops_count = 0 else: self.noops_count = 0 else: # Random action action = self.env_space.sample_action() action = LongTensor([[action]]) return action def optimize(self, net_updates): """ Optimizer function Inputs: - net_updates: int Returns: - loss: float - q_value: float - exp_q_value: float """ # Hyperparameter GAMMA = 0.99 # not enough memory yet if len(self.replay) < self.start_train_after: return # Sample a transition batch = self.replay.sampleTransition(self.batch_size) # Mask to indicate which states are not final (=done=game over) non_final_mask = ByteTensor( list(map(lambda ns: ns is not None, batch.next_state))) # Wrap tensors in variables state_batch = Variable(torch.cat(batch.state)) action_batch = Variable(torch.cat(batch.action)) reward_batch = Variable(torch.cat(batch.reward)) non_final_next_states = Variable( torch.cat([ns for ns in batch.next_state if ns is not None]), volatile=True ) # volatile==true prevents calculation of the derivative next_state_values = Variable(torch.zeros( self.batch_size).type(FloatTensor), volatile=False) if self.use_cuda: state_batch = state_batch.cuda() action_batch = action_batch.cuda() reward_batch = reward_batch.cuda() non_final_mask = non_final_mask.cuda() non_final_next_states = non_final_next_states.cuda() next_state_values = next_state_values.cuda() # Compute Q(s_t, a) - the self.net computes Q(s_t), then we select the # columns of actions taken state_action_values = self.net(state_batch).gather(1, action_batch) # Compute V(s_{t+1}) for all next states. 
next_max_values = self.target_net(non_final_next_states).detach().max( 1)[0] next_state_values[non_final_mask] = next_max_values # Compute the expected Q values expected_state_action_values = (next_state_values * GAMMA) + reward_batch # Compute Huber loss loss = F.smooth_l1_loss(state_action_values, expected_state_action_values) self.optimizer.zero_grad() loss.backward() for param in self.net.parameters(): param.grad.data.clamp_(-1, 1) self.optimizer.step() if net_updates % self.update_target_net_each_k_steps == 0: self.target_net.load_state_dict(self.net.state_dict()) print('target_net update!') return loss.data.cpu().numpy()[0] def play(self): """ Play a game with the current net and render it """ done = False # games end indicator variable score = 0 # Reset game screen_break = self.env_break.reset() screen_space = self.env_space.reset() # list of k last frames ############old version: #breakout part #last_k_frames_break = [] #for j in range(self.num_stored_frames): # last_k_frames_break.append(None) # last_k_frames_break[j] = gray2pytorch(screen_break) #spaceinvaders part #last_k_frames_space = [] #for j in range(self.num_stored_frames): # last_k_frames_space.append(None) # last_k_frames_space[j] = gray2pytorch(screen_space) ################# last_k_frames = [] for j in range(self.num_stored_frames): last_k_frames.append(None) last_k_frames[j] = torch.cat( (gray2pytorch(screen_break), gray2pytorch(screen_space)), dim=2) # frame is saved as ByteTensor -> convert to gray value between 0 and 1 ############old version: #state_break = torch.cat(last_k_frames_break, 1).type(FloatTensor) / 255.0 #state_space = torch.cat(last_k_frames_space, 1).type(FloatTensor) / 255.0 #state = torch.cat((state_break,state_space), 2) state = torch.cat(last_k_frames, 1).type(FloatTensor) / 255.0 while not done: action = self.select_action(state, mode='play') # Render game self.env_break.game.render(mode='human') self.env_space.game.render(mode='human') # maps actions from space invaders to breakout (shot-left to left, shot-right to right) if action[0, 0] == 4: action_break = 2 elif action[0, 0] == 5: action_break = 3 elif action[0, 0] != 5: action_break = action[0, 0] screen_break, _, reward_break, done_break, info_break = self.env_break.step( action_break, mode='play') screen_space, _, reward_space, done_space, info_space = self.env_space.step( action[0, 0], mode='play') score += reward_break score += reward_space done = done_break or done_space ############old # save latest frame, discard oldest #for j in range(self.num_stored_frames - 1): # last_k_frames_break[j] = last_k_frames_break[j + 1] # last_k_frames_space[j] = last_k_frames_space[j + 1] #last_k_frames_break[self.num_stored_frames - 1] = gray2pytorch(screen_break) #last_k_frames_space[self.num_stored_frames - 1] = gray2pytorch(screen_space) # convert frames to range 0 to 1 again #state_break = torch.cat(last_k_frames_break, 1).type(FloatTensor) / 255.0 #state_space = torch.cat(last_k_frames_space, 1).type(FloatTensor) / 255.0 #state = torch.cat((state_break, state_space), 2) #############old_end # save latest frame, discard oldest for j in range(self.num_stored_frames - 1): last_k_frames[j] = last_k_frames[j + 1] last_k_frames[self.num_stored_frames - 1] = torch.cat( (gray2pytorch(screen_break), gray2pytorch(screen_space)), dim=2) # convert frames to range 0 to 1 again state = torch.cat(last_k_frames, 1).type(FloatTensor) / 255.0 done = done_break or done_space print('Final score:', score) self.env.game.close() #for both changen def train(self): """ 
Train the agent """ num_episodes = 100000 net_updates = 0 # Logging sub_dir = self.game + '_' + datetime.now().strftime( '%Y%m%d_%H%M%S') + '/' os.makedirs(sub_dir) logfile = sub_dir + self.game + '_train.log' loss_file = sub_dir + 'loss.pickle' reward_file = sub_dir + 'reward.pickle' reward_clamped_file = sub_dir + 'reward_clamped.pickle' log_avg_episodes = 50 best_score = 0 best_score_clamped = 0 avg_score = 0 avg_score_clamped = 0 loss_history = [] reward_history = [] reward_clamped_history = [] # Initialize logfile with header with open(logfile, 'w') as fp: fp.write( datetime.now().strftime('%Y%m%d_%H%M%S') + '\n' + 'Trained game: ' + str(self.game) + '\n' + 'Learning rate: ' + str(self.learning_rate) + '\n' + 'Batch size: ' + str(self.batch_size) + '\n' + 'Pretrained: ' + str(self.pretrained_model) + '\n' + 'Started training after k frames: ' + str(self.start_train_after) + '\n' + 'Optimized after k frames: ' + str(self.optimize_each_k) + '\n' + 'Target net update after k frame: ' + str(self.update_target_net_each_k_steps) + '\n\n' + '--------------------------------------------------------------------------------\n' ) print('Started training...\nLogging to', sub_dir) for i_episode in range(1, num_episodes): # reset game at the start of each episode screen_break = self.env_break.reset() screen_space = self.env_space.reset() # list of k last frames last_k_frames_break = [] last_k_frames_space = [] for j in range(self.num_stored_frames): last_k_frames_break.append(None) last_k_frames_space.append(None) last_k_frames_break[j] = gray2pytorch(screen_break) last_k_frames_space[j] = gray2pytorch(screen_space) if i_episode == 1: frames_both = torch.cat((last_k_frames_break[0].cpu(), last_k_frames_space[0].cpu()), 2) #self.replay.pushFrame(last_k_frames_break[0].cpu()) #self.replay.pushFrame(last_k_frames_space[0].cpu()) self.replay.pushFrame(frames_both) # frame is saved as ByteTensor -> convert to gray value between 0 and 1 state_break = torch.cat(last_k_frames_break, 1).type(FloatTensor) / 255.0 state_space = torch.cat(last_k_frames_space, 1).type(FloatTensor) / 255.0 state = torch.cat((state_break, state_space), 2) done = False # games end indicator variable # reset score with initial lives, because every lost live adds -1 total_reward = self.env_break.get_lives() total_reward += self.env_space.get_lives() total_reward_clamped = self.env_break.get_lives() total_reward_clamped += self.env_space.get_lives() ########### # Loop over one game while not done: self.steps += 1 action = self.select_action(state) # perform selected action on game # screen, reward, done, info = self.env.step(action[0,0])#envTest.step(action[0,0]) #maps actions from space invaders to breakout (shot-left to left, shot-right to right) screen_space, _, reward_space, done_space, info_space = self.env_space.step( action[0, 0]) action_break = action[0, 0] if action_break > 3: #shoot+right/left --> right/left action_break = action_break - 2 screen_break, _, reward_break, done_break, info_break = self.env_break.step( action_break) total_reward += int(reward_break) total_reward += int(reward_space) done = done_break or done_space # clamp rewards reward_break = torch.Tensor([np.clip(reward_break, -1, 1)]) reward_space = torch.Tensor([np.clip(reward_space, -1, 1)]) reward = reward_break + reward_space total_reward_clamped += int(reward_break[0]) total_reward_clamped += int(reward_space[0]) # save latest frame, discard oldest for j in range(self.num_stored_frames - 1): last_k_frames_break[j] = last_k_frames_break[j + 1] 
last_k_frames_space[j] = last_k_frames_space[j + 1] last_k_frames_break[self.num_stored_frames - 1] = gray2pytorch(screen_break) last_k_frames_space[self.num_stored_frames - 1] = gray2pytorch(screen_space) # convert frames to range 0 to 1 again if not done: next_state_break = torch.cat(last_k_frames_break, 1).type(FloatTensor) / 255.0 next_state_space = torch.cat(last_k_frames_space, 1).type(FloatTensor) / 255.0 next_state = torch.cat( (next_state_break, next_state_space), 2) else: next_state = None #Store transition #Frame concat, Trasition not (try) frame_break = last_k_frames_break[self.num_stored_frames - 1].cpu() frame_space = last_k_frames_space[self.num_stored_frames - 1].cpu() frame_both = torch.cat((frame_break, frame_space), 2) self.replay.pushFrame(frame_both) self.replay.pushTransition( (self.replay.getCurrentIndex() - 1) % self.replay.capacity, action, reward, done) # only optimize each kth step if self.steps % self.optimize_each_k == 0: loss = self.optimize(net_updates) # Logging loss_history.append(loss) #q_history.append(q_value) #exp_q_history.append(exp_q_value) net_updates += 1 # set current state to next state to select next action if next_state is not None: state = next_state if self.use_cuda: state = state.cuda() # plays episode until there are no more lives left ( == done) if done: break # Save rewards reward_history.append(total_reward) reward_clamped_history.append(total_reward_clamped) print( 'Episode: {:6} | '.format(i_episode), 'steps {:8} | '.format(self.steps), 'loss: {:.2E} | '.format(loss if loss else 0), 'score: ({:4}/{:4}) | '.format(total_reward_clamped, total_reward), 'best score: ({:4}/{:4}) | '.format(best_score_clamped, best_score), 'replay size: {:7}'.format(len(self.replay))) avg_score_clamped += total_reward_clamped avg_score += total_reward if total_reward_clamped > best_score_clamped: best_score_clamped = total_reward_clamped if total_reward > best_score: best_score = total_reward if i_episode % log_avg_episodes == 0 and i_episode != 0: avg_score_clamped /= log_avg_episodes avg_score /= log_avg_episodes print( '----------------------------------------------------------------' '-----------------------------------------------------------------', '\nLogging to file: \nEpisode: {:6} '.format(i_episode), 'steps: {:8} '.format(self.steps), 'avg on last {:4} games ({:6.1f}/{:6.1f}) '.format( log_avg_episodes, avg_score_clamped, avg_score), 'best score: ({:4}/{:4})'.format(best_score_clamped, best_score), '\n---------------------------------------------------------------' '------------------------------------------------------------------' ) # Logfile with open(logfile, 'a') as fp: fp.write( 'Episode: {:6} | '.format(i_episode) + 'steps: {:8} | '.format(self.steps) + 'avg on last {:4} games ({:6.1f}/{:6.1f}) | '.format( log_avg_episodes, avg_score_clamped, avg_score) + 'best score: ({:4}/{:4})\n'.format( best_score_clamped, best_score)) # Dump loss & reward with open(loss_file, 'wb') as fp: pickle.dump(loss_history, fp) with open(reward_file, 'wb') as fp: pickle.dump(reward_history, fp) with open(reward_clamped_file, 'wb') as fp: pickle.dump(reward_clamped_history, fp) avg_score_clamped = 0 avg_score = 0 if i_episode % self.save_net_each_k_episodes == 0: with open(logfile, 'a') as fp: fp.write('Saved model at episode ' + str(i_episode) + '...\n') self.target_net.save(sub_dir + self.game + '-' + str(i_episode) + '_episodes.model') print('Training done!') self.target_net.save(sub_dir + self.game + '.model')
class DoubleAgent(object): def __init__(self, game1, game2, mem_size = 1000000, state_buffer_size = 4, batch_size = 64, learning_rate = 1e-5, pretrained_model = None, pretrained_subnet1 = False, pretrained_subnet2 = False, frameskip = 4, frozen = False ): """ Inputs: - game 1: string to select the game 1 - game 2: string to select the game 2 - mem_size: int length of the replay memory - state_buffer_size: int number of recent frames used as input for neural network - batch_size: int - learning_rate: float - pretrained_model: str path to the model - pretrained_subnet1: str path to the model of the subnet - pretrained_subnet2: str path to the model of the subnet - frozen: boolean freeze pretrained subnets """ # Namestring self.game1 = game1 self.game2 = game2 # Environment self.env1 = Environment(game_name[game1], dimensions[game1], frameskip=frameskip) self.env2 = Environment(game_name[game2], dimensions[game2], frameskip=frameskip) # Neural net self.pretrained_subnet1 = pretrained_subnet1 self.pretrained_subnet2 = pretrained_subnet2 self.net = TwinDQN(channels_in = state_buffer_size, num_actions = self.env2.get_number_of_actions(), pretrained_subnet1 = pretrained_subnet1, pretrained_subnet2 = pretrained_subnet2, frozen = frozen) self.target_net = TwinDQN(channels_in = state_buffer_size, num_actions = self.env2.get_number_of_actions(), pretrained_subnet1 = pretrained_subnet1, pretrained_subnet2 = pretrained_subnet2, frozen = frozen) # Cuda self.use_cuda = torch.cuda.is_available() if self.use_cuda: self.net.cuda() self.target_net.cuda() # Pretrained if pretrained_model: self.net.load(pretrained_model) self.target_net.load(pretrained_model) self.pretrained_model = True else: self.pretrained_model = False # Optimizer self.learning_rate = learning_rate self.optimizer = optim.Adam(filter(lambda p: p.requires_grad, self.net.parameters()), lr=learning_rate) #self.optimizer = optim.RMSprop(filter(lambda p: p.requires_grad, self.net.parameters()), # lr=learning_rate,alpha=0.95, eps=0.01) self.batch_size = batch_size self.optimize_each_k = 1 self.update_target_net_each_k_steps = 10000 self.noops_count = 0 # Replay Memory (Long term memory) self.replay = ReplayMemory(capacity=mem_size, num_history_frames=state_buffer_size) self.mem_size = mem_size # Fill replay memory before training if not self.pretrained_model: self.start_train_after = 50000 else: self.start_train_after = mem_size//2 # Buffer for the most recent states (Short term memory) self.num_stored_frames = state_buffer_size # Steps self.steps = 0 # Save net self.save_net_each_k_episodes = 500 def select_action(self, observation, mode='train'): """ Select an random action from action space or an proposed action from neural network depending on epsilon Inputs: - observation: np.array with the observation Returns: - action: int """ # Hyperparameters EPSILON_START = 1 EPSILON_END = 0.1 EPSILON_DECAY = 1000000 EPSILON_PLAY = 0.01 MAXNOOPS = 30 # Decrease of epsilon value if not self.pretrained_model: #epsilon = EPSILON_END + (EPSILON_START - EPSILON_END) * \ # np.exp(-1. 
* (self.steps-self.batch_size) / EPSILON_DECAY) epsilon = EPSILON_START - self.steps * (EPSILON_START - EPSILON_END) / EPSILON_DECAY elif mode=='play': epsilon = EPSILON_PLAY else: epsilon = EPSILON_END if epsilon < random(): # Action according to neural net # Wrap tensor into variable state_variable = Variable(observation, volatile=True) # Evaluate network and return action with maximum of activation action = self.net(state_variable).data.max(1)[1].view(1,1) # Prevent noops if action[0,0]!=1: self.noops_count += 1 if self.noops_count == MAXNOOPS: action[0,0] = 1 self.noops_count = 0 else: self.noops_count = 0 else: # Random action action = self.env2.sample_action() action = LongTensor([[action]]) return action def map_action(self, action): """ Maps action from game with more actions to game with less actions Inputs: - action: int Returns: - action: int """ # Map SpaceInvaders on Breakout if self.game1=='Breakout' and self.game2=='SpaceInvaders': if action>3: # shoot+right/left --> right/left return action-2 # Map Assault on SpaceInvaders if self.game1=='SpaceInvaders' and self.game2=='Assault': if action!=0: # all actions except 2nd idle return action-1 # Map Phoenix on SpaceInvaders if self.game1=='SpaceInvaders' and self.game2=='Phoenix': if action==4: # shield --> idle return 0 if action==7: # shield+shot --> shot return 1 if action>4: # shoot+right/left --> shoot+right/left return action-1 # Map Phoenix on Assault if self.game1=='Assault' and self.game2=='Phoenix': if action==4: # shield --> idle return 0 if action==7: # shield+shot --> shot return 2 if 1<= action and action<=3: # shot/right/left --> shot/right/left return action+1 # No mapping necessary return action def optimize(self, net_updates): """ Optimizer function Inputs: - net_updates: int Returns: - loss: float - q_value: float - exp_q_value: float """ # Hyperparameter GAMMA = 0.99 # not enough memory yet if len(self.replay) < self.start_train_after: return # Sample a transition batch = self.replay.sampleTransition(self.batch_size) # Mask to indicate which states are not final (=done=game over) non_final_mask = ByteTensor(list(map(lambda ns: ns is not None, batch.next_state))) # Wrap tensors in variables state_batch = Variable(torch.cat(batch.state)) action_batch = Variable(torch.cat(batch.action)) reward_batch = Variable(torch.cat(batch.reward)) non_final_next_states = Variable(torch.cat([ns for ns in batch.next_state if ns is not None]), volatile=True) # volatile==true prevents calculation of the derivative next_state_values = Variable(torch.zeros(self.batch_size).type(FloatTensor), volatile=False) if self.use_cuda: state_batch = state_batch.cuda() action_batch = action_batch.cuda() reward_batch = reward_batch.cuda() non_final_mask = non_final_mask.cuda() non_final_next_states = non_final_next_states.cuda() next_state_values = next_state_values.cuda() # Compute Q(s_t, a) - the self.net computes Q(s_t), then we select the # columns of actions taken state_action_values = self.net(state_batch).gather(1, action_batch) # Compute V(s_{t+1}) for all next states. 
next_max_values = self.target_net(non_final_next_states).detach().max(1)[0] next_state_values[non_final_mask]= next_max_values # Compute the expected Q values expected_state_action_values = (next_state_values * GAMMA) + reward_batch # Compute Huber loss loss = F.smooth_l1_loss(state_action_values, expected_state_action_values) self.optimizer.zero_grad() loss.backward() for param in self.net.parameters(): param.grad.data.clamp_(-1, 1) self.optimizer.step() if net_updates%self.update_target_net_each_k_steps==0: self.target_net.load_state_dict(self.net.state_dict()) print('target_net update!') return loss.data.cpu().numpy()[0] def play(self, n): """ Play a game with the current net and render it Inputs: - n: games to play """ for i in range(n): done = False # games end indicator variable # Score counter total_reward_game1 = 0 total_reward_game2 = 0 total_reward = 0 # Reset game screen1 = self.env1.reset() screen2 = self.env2.reset() # list of k last frames last_k_frames = [] for j in range(self.num_stored_frames): last_k_frames.append(None) last_k_frames[j] = torch.cat((gray2pytorch(screen1),gray2pytorch(screen2)),dim=1) # frame is saved as ByteTensor -> convert to gray value between 0 and 1 state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0 while not done: action = self.select_action(state, mode='play')[0,0] action1 = self.map_action(action) action2 = action # perform selected action on game screen1, reward1, _, done1, _ = self.env1.step(action1, mode='play') screen2, reward2, _, done2, _ = self.env2.step(action2, mode='play') # Logging total_reward_game1 += int(reward1) total_reward_game2 += int(reward2) total_reward += int(reward1) + int(reward2) # save latest frame, discard oldest for j in range(self.num_stored_frames-1): last_k_frames[j] = last_k_frames[j+1] last_k_frames[self.num_stored_frames-1] = torch.cat((gray2pytorch(screen1),gray2pytorch(screen2)),dim=1) # convert frames to range 0 to 1 again state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0 # Merged game over indicator done = done1 or done2 print('Final scores Game ({}/{}): {}: {} '.format(i+1, n, self.game1, total_reward_game1) + '{}: {} '.format(self.game2, total_reward_game2) + 'total: {}'.format(total_reward)) self.env1.game.close() self.env2.game.close() def play_stats(self, n_games, mode='random'): """ Play N games randomly or evaluate a net and log results for statistics Input: - n_games: int Number of games to play - mode: str 'random' or 'evaluation' """ # Subdirectory for logging sub_dir = mode + '_' + self.game1 + '+' + self.game2 + '/' if not os.path.exists(sub_dir): os.makedirs(sub_dir) # Store history total reward_history = [] reward_clamped_history = [] # Store history game 1 reward_history_game1 = [] reward_clamped_history_game1 = [] # Store history game 2 reward_history_game2 = [] reward_clamped_history_game2 = [] # Number of actions to sample from n_actions = self.env2.get_number_of_actions() for i_episode in range(1, n_games+1): # Reset game screen1 = self.env1.reset() screen2 = self.env2.reset() # Store screen if mode=='evaluation': # list of k last frames last_k_frames = [] for j in range(self.num_stored_frames): last_k_frames.append(None) last_k_frames[j] = torch.cat((gray2pytorch(screen1),gray2pytorch(screen2)),dim=1) # frame is saved as ByteTensor -> convert to gray value between 0 and 1 state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0 # games end indicator variable done = False # reset score with initial lives, because every lost live adds -1 total_reward_game1 = 0 
total_reward_clamped_game1 = self.env1.get_lives() total_reward_game2 = 0 total_reward_clamped_game2 = self.env2.get_lives() # total scores for both games total_reward = total_reward_game1 + total_reward_game2 total_reward_clamped = total_reward_clamped_game1 + total_reward_clamped_game2 while not done: if mode=='random': action = randrange(n_actions) elif mode=='evaluation': action = self.select_action(state, mode='play')[0,0] action1 = self.map_action(action) action2 = action screen1, reward1, reward1_clamped, done1, _ = self.env1.step(action1) screen2, reward2, reward2_clamped, done2, _ = self.env2.step(action2) # Logging total_reward_game1 += int(reward1) total_reward_game2 += int(reward2) total_reward += int(reward1) + int(reward2) total_reward_clamped_game1 += reward1_clamped total_reward_clamped_game2 += reward2_clamped total_reward_clamped += reward1_clamped + reward2_clamped if mode=='evaluation': # save latest frame, discard oldest for j in range(self.num_stored_frames-1): last_k_frames[j] = last_k_frames[j+1] last_k_frames[self.num_stored_frames-1] = torch.cat((gray2pytorch(screen1),gray2pytorch(screen2)),dim=1) # convert frames to range 0 to 1 again state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0 # Merged game over indicator done = done1 or done2 # Print current result print('Episode: {:6}/{:6} | '.format(i_episode, n_games) + 'score total: ({:6.1f}/{:7.1f}) | '.format(total_reward_clamped,total_reward) + 'score game1: ({:6.1f}/{:7.1f}) | '.format(total_reward_clamped_game1,total_reward_game1) + 'score game2: ({:6.1f}/{:7.1f})'.format(total_reward_clamped_game2,total_reward_game2)) # Save rewards reward_history_game1.append(total_reward_game1) reward_history_game2.append(total_reward_game2) reward_history.append(total_reward) reward_clamped_history_game1.append(total_reward_clamped_game1) reward_clamped_history_game2.append(total_reward_clamped_game2) reward_clamped_history.append(total_reward_clamped) avg_reward_total = np.sum(reward_history) / len(reward_history) avg_reward_total_clamped = np.sum(reward_clamped_history) / len(reward_clamped_history) avg_reward_game1 = np.sum(reward_history_game1) / len(reward_history_game1) avg_reward_game1_clamped = np.sum(reward_clamped_history_game1) / len(reward_clamped_history_game1) avg_reward_game2 = np.sum(reward_history_game2) / len(reward_history_game2) avg_reward_game2_clamped = np.sum(reward_clamped_history_game2) / len(reward_clamped_history_game2) # Print final result print('\n\n===========================================\n' + 'avg score after {:6} episodes:\n'.format(n_games) + 'avg total: ({:6.1f}/{:7.1f})\n'.format(avg_reward_total_clamped,avg_reward_total) + 'avg game1: ({:6.1f}/{:7.1f})\n'.format(avg_reward_game1_clamped,avg_reward_game1) + 'avg game2: ({:6.1f}/{:7.1f})\n'.format(avg_reward_game2_clamped,avg_reward_game2)) # Log results to files with open(sub_dir + mode + '.txt', 'w') as fp: fp.write('avg score after {:6} episodes:\n'.format(n_games) + 'avg total: ({:6.1f}/{:7.1f})\n'.format(avg_reward_total_clamped,avg_reward_total) + 'avg game1: ({:6.1f}/{:7.1f})\n'.format(avg_reward_game1_clamped,avg_reward_game1) + 'avg game2: ({:6.1f}/{:7.1f})\n'.format(avg_reward_game2_clamped,avg_reward_game2)) # Dump reward with open(sub_dir + mode + '_reward_game1.pickle', 'wb') as fp: pickle.dump(reward_history_game1, fp) with open(sub_dir + mode + '_reward_game2.pickle', 'wb') as fp: pickle.dump(reward_history_game2, fp) with open(sub_dir + mode + '_reward_total.pickle', 'wb') as fp: pickle.dump(reward_history, fp) 
with open(sub_dir + mode + '_reward_clamped_game1', 'wb') as fp: pickle.dump(reward_clamped_history_game1, fp) with open(sub_dir + mode + '_reward_clamped_game2', 'wb') as fp: pickle.dump(reward_clamped_history_game2, fp) with open(sub_dir + mode + '_reward_clamped_total', 'wb') as fp: pickle.dump(reward_clamped_history, fp) def train(self): """ Train the agent """ num_episodes = 100000 net_updates = 0 # Logging sub_dir = self.game1 + '+' + self.game2 + '_' + datetime.now().strftime('%Y%m%d_%H%M%S') + '/' os.makedirs(sub_dir) logfile = sub_dir + 'train.txt' reward_file = sub_dir + 'reward.pickle' reward_file_game1 = sub_dir + 'reward_game1.pickle' reward_file_game2 = sub_dir + 'reward_game2.pickle' reward_clamped_file = sub_dir + 'reward_clamped.pickle' reward_clamped_file_game1 = sub_dir + 'reward_clamped_game1.pickle' reward_clamped_file_game2 = sub_dir + 'reward_clamped_game2.pickle' reward_clamped_file = sub_dir + 'reward_clamped.pickle' log_avg_episodes = 50 # Total scores best_score = 0 best_score_clamped = 0 avg_score = 0 avg_score_clamped = 0 reward_history = [] reward_clamped_history = [] # Scores game 1 avg_score_game1 = 0 avg_score_clamped_game1 = 0 reward_history_game1 = [] reward_clamped_history_game1 = [] # Scores game 2 avg_score_game2 = 0 avg_score_clamped_game2 = 0 reward_history_game2 = [] reward_clamped_history_game2 = [] # Initialize logfile with header with open(logfile, 'w') as fp: fp.write(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + '\n' + 'Trained game (first): {}\n'.format(self.game1) + 'Trained game (second): {}\n'.format(self.game2) + 'Learning rate: {:.2E}\n'.format(self.learning_rate) + 'Batch size: {:d}\n'.format(self.batch_size) + 'Memory size(replay): {:d}\n'.format(self.mem_size) + 'Pretrained: {}\n'.format(self.pretrained_model) + 'Pretrained subnet 1: {}\n'.format(self.pretrained_subnet1) + 'Pretrained subnet 2: {}\n'.format(self.pretrained_subnet2) + 'Started training after k frames: {:d}\n'.format(self.start_train_after) + 'Optimized after k frames: {:d}\n'.format(self.optimize_each_k) + 'Target net update after k frame: {:d}\n\n'.format(self.update_target_net_each_k_steps) + '--------+-----------+----------------------+------------' + '----------+----------------------+--------------------\n' + 'Episode | Steps | ' + '{:3} games avg total | '.format(log_avg_episodes) + '{:3} games avg game1 | '.format(log_avg_episodes) + '{:3} games avg game2 | '.format(log_avg_episodes) + 'best score total \n' + '--------+-----------+----------------------+------------' + '----------+----------------------+--------------------\n') print('Started training...\nLogging to {}\n'.format(sub_dir) + 'Episode | Steps | score total | score game 1 | ' + 'score game 2 | best score total') for i_episode in range(1,num_episodes): # reset game at the start of each episode screen1 = self.env1.reset() screen2 = self.env2.reset() # list of k last frames last_k_frames = [] for j in range(self.num_stored_frames): last_k_frames.append(None) last_k_frames[j] = torch.cat((gray2pytorch(screen1), gray2pytorch(screen2)), dim=1) if i_episode == 1: self.replay.pushFrame(last_k_frames[0].cpu()) # frame is saved as ByteTensor -> convert to gray value between 0 and 1 state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0 # games end indicator variable done1 = False done2 = False # reset score with initial lives, because every lost live adds -1 total_reward_game1 = 0 total_reward_clamped_game1 = self.env1.get_lives() total_reward_game2 = 0 total_reward_clamped_game2 = self.env2.get_lives() # 
total scores for both games total_reward = total_reward_game1 + total_reward_game2 total_reward_clamped = total_reward_clamped_game1 + total_reward_clamped_game2 # Loop over one game while not done1 and not done2: self.steps +=1 action = self.select_action(state) action1 = self.map_action(action[0,0]) action2 = action[0,0] # perform selected action on game screen1, reward1, reward1_clamped, done1, _ = self.env1.step(action1) screen2, reward2, reward2_clamped, done2, _ = self.env2.step(action2) # Logging total_reward_game1 += int(reward1) total_reward_game2 += int(reward2) total_reward += int(reward1) + int(reward2) total_reward_clamped_game1 += reward1_clamped total_reward_clamped_game2 += reward2_clamped total_reward_clamped += reward1_clamped + reward2_clamped # Bake reward into tensor reward = torch.FloatTensor([reward1_clamped+reward2_clamped]) # save latest frame, discard oldest for j in range(self.num_stored_frames-1): last_k_frames[j] = last_k_frames[j+1] last_k_frames[self.num_stored_frames-1] = torch.cat((gray2pytorch(screen1), gray2pytorch(screen2)), dim=1) # convert frames to range 0 to 1 again if not done1 and not done2: next_state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0 else: next_state = None # Store transition self.replay.pushFrame(last_k_frames[self.num_stored_frames - 1].cpu()) self.replay.pushTransition((self.replay.getCurrentIndex()-1)%self.replay.capacity, action, reward, done1 or done2) # only optimize each kth step if self.steps%self.optimize_each_k == 0: self.optimize(net_updates) net_updates += 1 # set current state to next state to select next action if next_state is not None: state = next_state if self.use_cuda: state = state.cuda() # plays episode until there are no more lives left ( == done) if done1 or done2: break; # Save rewards reward_history_game1.append(total_reward_game1) reward_history_game2.append(total_reward_game2) reward_history.append(total_reward) reward_clamped_history_game1.append(total_reward_clamped_game1) reward_clamped_history_game2.append(total_reward_clamped_game2) reward_clamped_history.append(total_reward_clamped) # Sum up for averages avg_score_clamped_game1 += total_reward_clamped_game1 avg_score_clamped_game2 += total_reward_clamped_game2 avg_score_clamped += total_reward_clamped avg_score_game1 += total_reward_game1 avg_score_game2 += total_reward_game2 avg_score += total_reward if total_reward_clamped > best_score_clamped: best_score_clamped = total_reward_clamped if total_reward > best_score: best_score = total_reward print('{:7} | '.format(i_episode) + '{:9} | '.format(self.steps) + '({:6.1f}/{:7.1f}) | '.format(total_reward_clamped,total_reward) + '({:6.1f}/{:7.1f}) | '.format(total_reward_clamped_game1,total_reward_game1) + '({:6.1f}/{:7.1f}) | '.format(total_reward_clamped_game2,total_reward_game2) + '({:6.1f}/{:8.1f})'.format(best_score_clamped, best_score)) if i_episode % log_avg_episodes == 0 and i_episode!=0: avg_score_clamped_game1 /= log_avg_episodes avg_score_clamped_game2 /= log_avg_episodes avg_score_clamped /= log_avg_episodes avg_score_game1 /= log_avg_episodes avg_score_game2 /= log_avg_episodes avg_score /= log_avg_episodes print('--------+-----------+----------------------+------------' + '----------+----------------------+--------------------\n' + 'Episode | Steps | ' + '{:3} games avg total | '.format(log_avg_episodes) + '{:3} games avg game1 | '.format(log_avg_episodes) + '{:3} games avg game2 | '.format(log_avg_episodes) + 'best score total \n' + '{:7} | '.format(i_episode) + '{:9} | 
'.format(self.steps) + '({:6.1f}/{:7.1f}) | '.format(avg_score_clamped,avg_score) + '({:6.1f}/{:7.1f}) | '.format(avg_score_clamped_game1,avg_score_game1) + '({:6.1f}/{:7.1f}) | '.format(avg_score_clamped_game2,avg_score_game2) + '({:6.1f}/{:8.1f})\n'.format(best_score_clamped, best_score) + '\nLogging to file...\n\n' '--------+-----------+----------------------+------------' + '----------+----------------------+--------------------\n' + 'Episode | Steps | score total | score game 1 | ' + 'score game 2 | best score total') # Logfile with open(logfile, 'a') as fp: fp.write('{:7} | '.format(i_episode) + '{:9} | '.format(self.steps) + '({:6.1f}/{:7.1f}) | '.format(avg_score_clamped,avg_score) + '({:6.1f}/{:7.1f}) | '.format(avg_score_clamped_game1,avg_score_game1) + '({:6.1f}/{:7.1f}) | '.format(avg_score_clamped_game2,avg_score_game2) + '({:6.1f}/{:8.1f})\n'.format(best_score_clamped, best_score)) # Dump reward with open(reward_file_game1, 'wb') as fp: pickle.dump(reward_history_game1, fp) with open(reward_file_game2, 'wb') as fp: pickle.dump(reward_history_game2, fp) with open(reward_file, 'wb') as fp: pickle.dump(reward_history, fp) with open(reward_clamped_file_game1, 'wb') as fp: pickle.dump(reward_clamped_history_game1, fp) with open(reward_clamped_file_game2, 'wb') as fp: pickle.dump(reward_clamped_history_game2, fp) with open(reward_clamped_file, 'wb') as fp: pickle.dump(reward_clamped_history, fp) avg_score_clamped_game1 = 0 avg_score_clamped_game2 = 0 avg_score_clamped = 0 avg_score_game1 = 0 avg_score_game2 = 0 avg_score = 0 if i_episode % self.save_net_each_k_episodes == 0: with open(logfile, 'a') as fp: fp.write('Saved model at episode ' + str(i_episode) + '...\n') self.target_net.save(sub_dir + str(i_episode) + '_episodes.model') print('Training done!') self.target_net.save(sub_dir + 'final.model')
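# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original source): a minimal
# driver for the two-game agent whose train()/play()/play_stats() methods are
# shown above. The class name `TwoGameAgent` and its constructor arguments are
# assumptions for illustration only; substitute the real class of this module.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices=['train', 'play', 'stats'], default='train')
    parser.add_argument('--episodes', type=int, default=5,
                        help='number of games for the play/stats modes')
    args = parser.parse_args()

    agent = TwoGameAgent(game1='Breakout', game2='SpaceInvaders')  # hypothetical name
    if args.mode == 'train':
        agent.train()
    elif args.mode == 'play':
        agent.play(args.episodes)
    else:
        agent.play_stats(args.episodes, mode='evaluation')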
def __init__(self, load_checkpoint, checkpoint_file, env, n_states, n_actions, mem_size=10**6, batch_size=256, n_hid1=256, n_hid2=256, lr=3e-4, gamma=0.99, tau=5e-3, reward_scale=2): self.load_checkpoint = load_checkpoint self.max_action = float(env.action_space.high[0]) self.low_action = float(env.action_space.low[0]) self.batch_size = batch_size self.gamma = gamma self.tau = tau self.reward_scale = reward_scale self.memory_counter = 0 self.memory = ReplayMemory(mem_size, n_states, n_actions) self.actor = ActorNetwork(n_states, n_actions, n_hid1, n_hid2, self.max_action, lr, checkpoint_file, name='_actor') self.critic_1 = CriticNetwork(n_states, n_actions, n_hid1, n_hid2, lr, checkpoint_file, name='_crtic1') self.critic_2 = CriticNetwork(n_states, n_actions, n_hid1, n_hid2, lr, checkpoint_file, name='_crtic2') self.value_net = ValueNetwork(n_states, n_hid1, n_hid2, lr, checkpoint_file, name='_value') self.target_value_net = ValueNetwork(n_states, n_hid1, n_hid2, lr, checkpoint_file, name='_value_target') # tau=1 performs an exact copy of the networks to the respective targets # self.update_network_parameters(tau=1) self.update_network_parameters(self.value_net, self.target_value_net, tau=1)
class Agent: def __init__(self, load_checkpoint, checkpoint_file, env, n_states, n_actions, mem_size=10**6, batch_size=256, n_hid1=256, n_hid2=256, lr=3e-4, gamma=0.99, tau=5e-3, reward_scale=2): self.load_checkpoint = load_checkpoint self.max_action = float(env.action_space.high[0]) self.low_action = float(env.action_space.low[0]) self.batch_size = batch_size self.gamma = gamma self.tau = tau self.reward_scale = reward_scale self.memory_counter = 0 self.memory = ReplayMemory(mem_size, n_states, n_actions) self.actor = ActorNetwork(n_states, n_actions, n_hid1, n_hid2, self.max_action, lr, checkpoint_file, name='_actor') self.critic_1 = CriticNetwork(n_states, n_actions, n_hid1, n_hid2, lr, checkpoint_file, name='_crtic1') self.critic_2 = CriticNetwork(n_states, n_actions, n_hid1, n_hid2, lr, checkpoint_file, name='_crtic2') self.value_net = ValueNetwork(n_states, n_hid1, n_hid2, lr, checkpoint_file, name='_value') self.target_value_net = ValueNetwork(n_states, n_hid1, n_hid2, lr, checkpoint_file, name='_value_target') # tau=1 performs an exact copy of the networks to the respective targets # self.update_network_parameters(tau=1) self.update_network_parameters(self.value_net, self.target_value_net, tau=1) # self.update_network_parameters_phil(tau=1) def store_transition(self, obs, action, reward, obs_, done): self.memory.store_transition(obs, action, reward, obs_, done) def sample_transitions(self): state_batch, action_batch, reward_batch, new_state_batch, done_batch = self.memory.sample_buffer( self.batch_size) # no need to care about the device, it is the same for all class objects (cuda or cpu is the same despite the class) state_batch = torch.tensor(state_batch, dtype=torch.float).to(self.actor.device) action_batch = torch.tensor(action_batch, dtype=torch.float).to(self.actor.device) reward_batch = torch.tensor(reward_batch, dtype=torch.float).to(self.actor.device) new_state_batch = torch.tensor(new_state_batch, dtype=torch.float).to(self.actor.device) done_batch = torch.tensor(done_batch).to(self.actor.device) return state_batch, action_batch, reward_batch, new_state_batch, done_batch def update_network_parameters(self, network, target_network, tau=None): for par, target_par in zip(network.parameters(), target_network.parameters()): target_par.data.copy_(tau * par.data + (1 - tau) * target_par.data) def choose_action(self, obs): obs = torch.tensor([obs], dtype=torch.float).to(self.actor.device) actions, _ = self.actor.sample_normal(obs, reparametrize=False) return actions.cpu().detach().numpy()[0] def learn_phil(self): if self.memory.mem_counter < self.batch_size: return state, action, reward, new_state, done = \ self.memory.sample_buffer(self.batch_size) reward = torch.tensor(reward, dtype=torch.float).to(self.critic_1.device) done = torch.tensor(done).to(self.critic_1.device) state_ = torch.tensor(new_state, dtype=torch.float).to(self.critic_1.device) state = torch.tensor(state, dtype=torch.float).to(self.critic_1.device) action = torch.tensor(action, dtype=torch.float).to(self.critic_1.device) value = self.value_net(state).view(-1) value_ = self.target_value_net(state_).view(-1) value_[done] = 0.0 actions, log_probs = self.actor.sample_normal(state, reparametrize=False) # actions, log_probs = self.actor.sample_mvnormal(state, reparameterize=False) log_probs = log_probs.view(-1) q1_new_policy = self.critic_1.forward(state, actions) q2_new_policy = self.critic_2.forward(state, actions) critic_value = torch.min(q1_new_policy, q2_new_policy) critic_value = critic_value.view(-1) 
self.value_net.optimizer.zero_grad()
        value_target = critic_value - log_probs
        value_loss = 0.5 * (F.mse_loss(value, value_target))
        value_loss.backward(retain_graph=True)
        self.value_net.optimizer.step()

        actions, log_probs = self.actor.sample_normal(state, reparametrize=True)
        # actions, log_probs = self.actor.sample_mvnormal(state, reparameterize=False)
        log_probs = log_probs.view(-1)
        q1_new_policy = self.critic_1.forward(state, actions)
        q2_new_policy = self.critic_2.forward(state, actions)
        critic_value = torch.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        actor_loss = log_probs - critic_value
        actor_loss = torch.mean(actor_loss)
        self.actor.optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor.optimizer.step()

        self.critic_1.optimizer.zero_grad()
        self.critic_2.optimizer.zero_grad()
        q_hat = self.reward_scale * reward + self.gamma * value_
        q1_old_policy = self.critic_1.forward(state, action).view(-1)
        q2_old_policy = self.critic_2.forward(state, action).view(-1)
        critic_1_loss = 0.5 * F.mse_loss(q1_old_policy, q_hat)
        critic_2_loss = 0.5 * F.mse_loss(q2_old_policy, q_hat)
        critic_loss = critic_1_loss + critic_2_loss
        critic_loss.backward()
        self.critic_1.optimizer.step()
        self.critic_2.optimizer.step()

        self.update_network_parameters(self.value_net, self.target_value_net, self.tau)
        # self.update_network_parameters_phil()

    def learn(self):
        if self.memory.mem_counter < self.batch_size:
            return

        state_batch, action_batch, reward_batch, new_state_batch, done_batch = self.sample_transitions()
        # state_batch, action_batch, reward_batch, new_state_batch, done_batch = \
        #     self.memory.sample_buffer(self.batch_size)
        #
        # reward_batch = torch.tensor(reward_batch, dtype=torch.float).to(self.critic_1.device)
        # done_batch = torch.tensor(done_batch).to(self.critic_1.device)
        # new_state_batch = torch.tensor(new_state_batch, dtype=torch.float).to(self.critic_1.device)
        # state_batch = torch.tensor(state_batch, dtype=torch.float).to(self.critic_1.device)
        # action_batch = torch.tensor(action_batch, dtype=torch.float).to(self.critic_1.device)

        '''Compute Value Network loss'''
        self.value_net.optimizer.zero_grad()
        val = self.value_net(state_batch).view(-1)
        val_ = self.target_value_net(new_state_batch).view(-1)
        val_[done_batch] = 0.0
        actions, log_probs = self.actor.sample_normal(state_batch, reparametrize=False)
        log_probs = log_probs.view(-1)
        q1 = self.critic_1(state_batch, actions)  # action_batch)
        q2 = self.critic_2(state_batch, actions)  # action_batch)
        q = torch.min(q1, q2).view(-1)
        value_target = q - log_probs
        value_loss = 0.5 * F.mse_loss(val, value_target)
        value_loss.backward(retain_graph=True)
        self.value_net.optimizer.step()
        # val = val - q + log_prob

        '''Compute Actor loss'''
        self.actor.optimizer.zero_grad()
        # here we need to reparametrize and thus use rsample to make the distribution
        # differentiable, because the log prob of the chosen action will be part of our loss.
actions, log_probs = self.actor.sample_normal(state_batch, reparametrize=True) log_probs = log_probs.view(-1) q1 = self.critic_1(state_batch, actions) q2 = self.critic_2(state_batch, actions) q = torch.min(q1, q2).view(-1) actor_loss = log_probs - q actor_loss = torch.mean(actor_loss) actor_loss.backward(retain_graph=True) self.actor.optimizer.step() '''Compute Critic loss''' self.critic_1.optimizer.zero_grad() self.critic_2.optimizer.zero_grad() val_ = self.target_value_net(new_state_batch).view( -1) # value for the critic update val_[done_batch] = 0.0 q_hat = self.reward_scale * reward_batch + self.gamma * val_ q1_old_policy = self.critic_1(state_batch, action_batch).view(-1) q2_old_policy = self.critic_2(state_batch, action_batch).view(-1) critic_1_loss = 0.5 * F.mse_loss(q1_old_policy, q_hat) critic_2_loss = 0.5 * F.mse_loss(q2_old_policy, q_hat) critic_loss = critic_1_loss + critic_2_loss critic_loss.backward() self.critic_1.optimizer.step() self.critic_2.optimizer.step() self.update_network_parameters(self.value_net, self.target_value_net, self.tau) # self.update_network_parameters_phil() def save_models(self): self.actor.save_checkpoint() self.critic_1.save_checkpoint() self.critic_2.save_checkpoint() self.value_net.save_checkpoint() self.target_value_net.save_checkpoint() def load_models(self): self.actor.load_checkpoint() self.critic_1.load_checkpoint() self.critic_2.load_checkpoint() self.value_net.load_checkpoint() self.target_value_net.load_checkpoint() def update_network_parameters_phil(self, tau=None): if tau is None: tau = self.tau target_value_params = self.target_value_net.named_parameters() value_params = self.value_net.named_parameters() target_value_state_dict = dict(target_value_params) value_state_dict = dict(value_params) for name in value_state_dict: value_state_dict[name] = tau*value_state_dict[name].clone() + \ (1-tau)*target_value_state_dict[name].clone() self.target_value_net.load_state_dict(value_state_dict)
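# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not from the original code): a minimal training
# loop for the SAC `Agent` above, assuming a Gym-style continuous-control task
# and the classic 4-tuple step API. The environment id, episode count and
# checkpoint path are placeholders; adapt reset()/step() for newer gym or
# gymnasium versions that return (obs, info) and terminated/truncated flags.
# ---------------------------------------------------------------------------
import gym

env = gym.make('Pendulum-v1')
agent = Agent(load_checkpoint=False, checkpoint_file='tmp/sac', env=env,
              n_states=env.observation_space.shape[0],
              n_actions=env.action_space.shape[0])

for episode in range(250):
    obs = env.reset()
    done = False
    score = 0.0
    while not done:
        action = agent.choose_action(obs)
        obs_, reward, done, info = env.step(action)
        agent.store_transition(obs, action, reward, obs_, done)
        agent.learn()                  # no-op until a full batch is in memory
        score += reward
        obs = obs_
    print('episode {:4d} score {:8.1f}'.format(episode, score))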
class DDPGAgent(): def __init__(self, load_checkpoint, n_states, n_actions, checkpoint_file, mem_size=10**6, batch_size=64, n_hid1=400, n_hid2=300, alpha=1e-4, beta=1e-3, gamma=0.99, tau=0.99): self.batch_size = batch_size self.gamma = gamma self.tau = tau self.actor = ActorNetwork(n_states, n_actions, n_hid1, n_hid2, alpha, checkpoint_file, name='actor') self.critic = CriticNetwork(n_states, n_actions, n_hid1, n_hid2, beta, checkpoint_file, name='critic') self.actor_target = ActorNetwork(n_states, n_actions, n_hid1, n_hid2, alpha, checkpoint_file, name='actor_target') self.critic_target = CriticNetwork(n_states, n_actions, n_hid1, n_hid2, beta, checkpoint_file, name='critic_target') self.noise = OUActionNoise(mu=np.zeros(n_actions)) self.memory = ReplayMemory(mem_size, n_states, n_actions) self.update_network_parameters_phil(tau=1) if load_checkpoint: self.actor.eval() self.load_checkpoint = load_checkpoint def reset_noise(self): self.noise.reset() def __copy_param(self, net_param_1, net_param_2, tau): # a.copy_(b) reads content from b and copy it to a for par, target_par in zip(net_param_1, net_param_2): with torch.no_grad(): val_to_copy = tau * par.weight + (1 - tau) * target_par.weight target_par.weight.copy_(val_to_copy) if target_par.bias is not None: val_to_copy = tau * par.bias + (1 - tau) * target_par.bias target_par.bias.copy_(val_to_copy) def update_network_parameters(self, tau=None): # TODO: Controlla equivalenza con metodo Phil # during the class initialization we call this method with tau=1, to perform an exact copy of the nets to targets # then when we call this without specifying tau, we use the field stored if tau is None: tau = self.tau actor_params = self.actor.children() actor_target_params = self.actor_target.children() self.__copy_param(actor_params, actor_target_params, tau) critic_params = self.critic.children() critic_target_params = self.critic_target.children() self.__copy_param(critic_params, critic_target_params, tau) def choose_action(self, obs): # when using layer norm, we do not want to calculate statistics for the forward propagation. 
Not needed # if using batchnorm or dropout self.actor.eval() obs = torch.tensor(obs, dtype=torch.float).to(self.actor.device) # compute actions mu = self.actor(obs) # add some random noise for exploration mu_prime = mu if not self.load_checkpoint: mu_prime = mu + torch.tensor(self.noise(), dtype=torch.float).to( self.actor.device) self.actor.train() return mu_prime.cpu().detach().numpy() def store_transitions(self, obs, action, reward, obs_, done): self.memory.store_transition(obs, action, reward, obs_, done) def sample_transitions(self): state_batch, action_batch, reward_batch, new_state_batch, done_batch = self.memory.sample_buffer( self.batch_size) # no need to care about the device, it is the same for all class objects (cuda or cpu is the same despite the class) state_batch = torch.tensor(state_batch, dtype=torch.float).to(self.actor.device) action_batch = torch.tensor(action_batch, dtype=torch.float).to(self.actor.device) reward_batch = torch.tensor(reward_batch, dtype=torch.float).to(self.actor.device) new_state_batch = torch.tensor(new_state_batch, dtype=torch.float).to(self.actor.device) done_batch = torch.tensor(done_batch).to(self.actor.device) return state_batch, action_batch, reward_batch, new_state_batch, done_batch def save_models(self): self.actor.save_checkpoint() self.actor_target.save_checkpoint() self.critic.save_checkpoint() self.critic_target.save_checkpoint() def load_models(self): self.actor.load_checkpoint() self.actor_target.load_checkpoint() self.critic.load_checkpoint() self.critic_target.load_checkpoint() def learn(self): # deal with the situation in which we still not have filled the memory to batch size if self.memory.mem_counter < self.batch_size: return state_batch, action_batch, reward_batch, new_state_batch, done_batch = self.sample_transitions( ) ''' compute actor_target actions and critic_target values, then use obtained values to compute target y_i ''' target_actions = self.actor_target( new_state_batch ) # + torch.tensor(self.noise(), dtype=torch.float).to(self.actor.device) target_critic_value_ = self.critic_target(new_state_batch, target_actions) # target_critic_value_next[done_batch==1] = 0.0 # if done_batch is integer valued target_critic_value_[ done_batch] = 0.0 # if done_batch is bool -- see if it works this way target_critic_value_ = target_critic_value_.view( -1) # necessary for operations on matching shapes target = reward_batch + self.gamma * target_critic_value_ target = target.view(self.batch_size, 1) ''' zero out gradients ''' self.actor.optimizer.zero_grad() self.critic.optimizer.zero_grad() ''' compute critic loss ''' critic_value = self.critic(state_batch, action_batch) critic_loss = F.mse_loss(target, critic_value) ''' compute actor loss''' # cannot directly use critic value, because it is evaluating a certain (s,a) pair. 
# The formula in the paper uses the critic to evaluate the actions produced by
        # the current actor for the given states (not the stored actions), so we cannot
        # simply reuse critic_value here.
        # actor_loss = torch.mean(critic_value)

        # step the critic before the actor's backward pass, so the actor loss cannot
        # deposit gradients into the critic parameters that are about to be applied
        critic_loss.backward()
        self.critic.optimizer.step()

        actor_loss = -self.critic(state_batch, self.actor(state_batch))
        actor_loss = torch.mean(actor_loss)
        actor_loss.backward()
        self.actor.optimizer.step()

        self.update_network_parameters_phil()

    def __copy_params_phil(self, net_a, net_b, tau):
        # Polyak blend of net_a's parameters with net_b's, returned as a state dict
        net_a_params = net_a.named_parameters()
        net_b_params = net_b.named_parameters()
        net_a_state_dict = dict(net_a_params)
        net_b_state_dict = dict(net_b_params)
        for name in net_a_state_dict:
            net_a_state_dict[name] = tau * net_a_state_dict[name].clone() + \
                (1 - tau) * net_b_state_dict[name].clone()
        return net_a_state_dict

    def update_network_parameters_phil(self, tau=None):
        if tau is None:
            tau = self.tau
        updated_actor_state_dict = self.__copy_params_phil(self.actor, self.actor_target, tau)
        updated_critic_state_dict = self.__copy_params_phil(self.critic, self.critic_target, tau)
        self.actor_target.load_state_dict(updated_actor_state_dict)
        self.critic_target.load_state_dict(updated_critic_state_dict)
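# ---------------------------------------------------------------------------
# Soft-update sketch (illustrative): both __copy_param and __copy_params_phil
# above implement the Polyak rule  theta_target <- tau*theta + (1-tau)*theta_target.
# A tiny standalone check of that arithmetic on plain tensors (values made up):
# ---------------------------------------------------------------------------
import torch

theta = torch.tensor([1.0, 2.0])          # online network parameters
theta_target = torch.tensor([0.0, 0.0])   # target network parameters

# tau = 1 performs an exact copy, as done once during initialization
theta_target = 1.0 * theta + (1 - 1.0) * theta_target       # -> tensor([1., 2.])

# a small tau makes the target track the online network slowly
theta_target = torch.tensor([0.0, 0.0])
tau = 0.001
theta_target = tau * theta + (1 - tau) * theta_target       # -> tensor([0.0010, 0.0020])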
class Agent: def __init__(self, policy_net, target_net, durability, optimizer, name, constants): """An agent class that takes action on the environment and optimizes the action based on the reward. Parameters ---------- policy_net : DQN [description] target_net : DQN [description] durability : int [description] optimizer : [type] [description] name : str The name of agent constants: Constants The hyper-parameters from Constants class """ self.CONSTANTS = constants self.policy_net = policy_net self.target_net = target_net self.target_net.load_state_dict(policy_net.state_dict()) self.durability = durability self.optimizer = optimizer self.name = name self.memory = ReplayMemory(self.CONSTANTS.MEMORY_SIZE) self.steps_done = 0 self.total_reward = 0.0 self.reward = 0.0 self.obtained_reward = 0.0 self.n_best = 0 self.policy_net_flag = False def select_action(self, state, is_first=False): sample = random.random() eps_threshold = self.CONSTANTS.EPS_END + (self.CONSTANTS.EPS_START - self.CONSTANTS.EPS_END) * \ math.exp(-1. * self.steps_done / self.CONSTANTS.EPS_DECAY) self.steps_done += 1 if is_first: self.writer.add_graph(self.policy_net, input_to_model=state.to( self.CONSTANTS.DEVICE), profile_with_cuda=True) if sample > eps_threshold: with torch.no_grad(): self.policy_net_flag = True return self.policy_net(state.to( self.CONSTANTS.DEVICE)).max(1)[1].view(1, 1) else: return torch.tensor([[random.randrange(self.CONSTANTS.N_ACTIONS)]], device=self.CONSTANTS.DEVICE, dtype=torch.long) def select_core_action(self, best_agent_state, flag, best_agent_action): self.steps_done += 1 if flag: with torch.no_grad(): if best_agent_state is None: return self.policy_net(self.state.to( self.CONSTANTS.DEVICE)).max(1)[1].view(1, 1) else: return self.policy_net( best_agent_state.to( self.CONSTANTS.DEVICE)).max(1)[1].view(1, 1) else: return best_agent_action def optimize_model(self): if len(self.memory) < self.CONSTANTS.BATCH_SIZE: return transitions = self.memory.sample(self.CONSTANTS.BATCH_SIZE) # zip(*transitions) unzips the transitions into # Transition(*) creates new named tuple # batch.state - tuple of all the states (each state is a tensor) # batch.next_state - tuple of all the next states (each state is a tensor) # batch.reward - tuple of all the rewards (each reward is a float) # batch.action - tuple of all the actions (each action is an int) # Transition = ReplayMemory.get_transition() transition = self.CONSTANTS.TRANSITION batch = transition(*zip(*transitions)) actions = tuple( (map(lambda a: torch.tensor([[a]], device=self.CONSTANTS.DEVICE), batch.action))) rewards = tuple( (map(lambda r: torch.tensor([r], device=self.CONSTANTS.DEVICE), batch.reward))) non_final_mask = torch.tensor(tuple( map(lambda s: s is not None, batch.next_state)), device=utils.get_device(), dtype=torch.bool) non_final_next_states = torch.cat([ s for s in batch.next_state if s is not None ]).to(self.CONSTANTS.DEVICE) state_batch = torch.cat(batch.state).to(self.CONSTANTS.DEVICE) action_batch = torch.cat(actions) reward_batch = torch.cat(rewards) state_action_values = self.policy_net(state_batch).gather( 1, action_batch) next_state_values = torch.zeros(self.CONSTANTS.BATCH_SIZE, device=self.CONSTANTS.DEVICE) next_state_values[non_final_mask] = self.target_net( non_final_next_states).max(1)[0].detach() expected_state_action_values = (next_state_values * self.CONSTANTS.GAMMA) + reward_batch loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1)) self.optimizer.zero_grad() loss.backward() for param in 
self.policy_net.parameters(): param.grad.data.clamp_(-1, 1) self.optimizer.step() def set_tf_writer(self, path): self.writer = self._set_tf_writer(path) def _set_tf_writer(self, path): if self.name == "core": writer = SummaryWriter(log_dir="{}/tf-board/core/".format(path)) else: writer = SummaryWriter( log_dir="{}/tf-board/{}".format(path, self.name)) return writer def get_state(self): return self.state def get_next_state(self): return self.next_state def get_init_state(self): return self.init_state def get_name(self): return self.name def get_policy_net_flag(self): return self.policy_net_flag def set_init_state(self, state): self.init_state = state def set_state(self, state): self.state = state self.next_state = state def set_env(self, env): self.env = env def get_env(self): return self.env def set_action(self, action): self.action = action def get_action(self): return self.action def get_durability(self): return self.durability def get_policy_net(self): return self.policy_net def reduce_durability(self, value): self.durability = self.durability - value def heal_durability(self, value): self.durability = self.durability + value def set_done_state(self, done): self.done = done def set_total_reward(self, reward): self.reward = reward if reward > 0.0: self.obtained_reward += reward self.total_reward += reward def reset_total_reward(self): self.total_reward = 0.0 self.obtained_reward = 0.0 def get_reward(self): return self.reward def get_obtained_reward(self): return self.obtained_reward def best_counter(self): self.n_best += 1 def get_n_best(self): return self.n_best def get_total_reward(self): return self.total_reward def set_step_retrun_value(self, obs, done, info): self.obs = obs self.done = done self.info = info def is_done(self): return self.done
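# ---------------------------------------------------------------------------
# Epsilon-decay sketch (illustrative): select_action above anneals exploration
# with  eps = EPS_END + (EPS_START - EPS_END) * exp(-steps_done / EPS_DECAY).
# The constants below are common example values, not the ones in Constants.
# ---------------------------------------------------------------------------
import math

EPS_START, EPS_END, EPS_DECAY = 0.9, 0.05, 200

for steps_done in (0, 200, 1000):
    eps = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * steps_done / EPS_DECAY)
    print(steps_done, round(eps, 3))   # 0 -> 0.9, 200 -> ~0.363, 1000 -> ~0.056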
class Worker(): def __init__(self, env, name, s_size, a_size, trainer, model_path, global_episodes): self.name = "worker_" + str(name) self.number = name self.model_path = model_path self.trainer = trainer self.global_episodes = global_episodes self.increment = self.global_episodes.assign_add(1) self.episode_rewards = [] self.episode_lengths = [] self.episode_mean_values = [] #Create the local copy of the network and the tensorflow op to copy global paramters to local network self.local_Q = Q_Network(s_size, a_size, self.name, trainer) self.update_local_ops = update_target_graph('global', self.name) self.env = env self.replaymemory = ReplayMemory(max_memory) def train(self, rollout, sess, gamma, ISWeights): rollout = np.array(rollout) observations = rollout[:, 0] actions = rollout[:, 1] rewards = rollout[:, 2] next_observations = rollout[:, 3] dones = rollout[:, 4] Q_target = sess.run( self.local_Q.Q, feed_dict={self.local_Q.inputs: np.vstack(next_observations)}) actions_ = np.argmax(Q_target, axis=1) action = np.zeros((batch_size, a_size)) action_ = np.zeros((batch_size, a_size)) for i in range(batch_size): action[i][actions[i]] = 1 action_[i][actions_[i]] = 1 action_now = np.zeros((batch_size, a_size, N)) action_next = np.zeros((batch_size, a_size, N)) for i in range(batch_size): for j in range(a_size): for k in range(N): action_now[i][j][k] = action[i][j] action_next[i][j][k] = action_[i][j] q_target = sess.run(self.local_Q.q_action, feed_dict={ self.local_Q.inputs: np.vstack(next_observations), self.local_Q.actions_q: action_next }) q_target_batch = [] for i in range(len(q_target)): qi = q_target[i] # * (1 - dones[i]) z_target_step = [] for j in range(len(qi)): z_target_step.append(gamma * qi[j] + rewards[i]) q_target_batch.append(z_target_step) q_target_batch = np.array(q_target_batch) #print q_target_batch isweight = np.zeros((batch_size, N)) for i in range(batch_size): for j in range(N): isweight[i, j] = ISWeights[i] feed_dict = { self.local_Q.inputs: np.vstack(observations), self.local_Q.actions_q: action_now, self.local_Q.q_target: q_target_batch, self.local_Q.ISWeights: isweight } u, l, g_n, v_n, _ = sess.run([ self.local_Q.u, self.local_Q.loss, self.local_Q.grad_norms, self.local_Q.var_norms, self.local_Q.apply_grads ], feed_dict=feed_dict) return l / len(rollout), g_n, v_n, Q_target, u def work(self, gamma, sess, coord, saver): global GLOBAL_STEP episode_count = sess.run(self.global_episodes) total_steps = 0 epsilon = 0.2 print("Starting worker " + str(self.number)) best_mean_episode_reward = -float('inf') with sess.as_default(), sess.graph.as_default(): while not coord.should_stop(): sess.run(self.update_local_ops) #episode_buffer = [] episode_reward = 0 episode_step_count = 0 d = False s = self.env.reset() s = process_frame(s) if epsilon > 0.01: epsilon = epsilon * 0.997 while not d: #self.env.render() GLOBAL_STEP += 1 #Take an action using probabilities from policy network output. 
if random.random() > epsilon: a_dist_list = sess.run( self.local_Q.Q, feed_dict={self.local_Q.inputs: [s]}) a_dist = a_dist_list[0] a = np.argmax(a_dist) else: a = random.randint(0, 5) s1, r, d, _ = self.env.step(a) if d == False: s1 = process_frame(s1) else: s1 = s self.replaymemory.add([s, a, r, s1, d]) episode_reward += r s = s1 total_steps += 1 episode_step_count += 1 if total_steps % 2 == 0 and d != True and total_steps > 50000: episode_buffer, tree_idx, ISWeights = self.replaymemory.sample( batch_size) l, g_n, v_n, Q_target, u = self.train( episode_buffer, sess, gamma, ISWeights) u = np.mean(u, axis=1) + 1e-6 self.replaymemory.update_priorities(tree_idx, u) #sess.run(self.update_local_ops) if d == True: break sess.run(self.update_local_ops) self.episode_rewards.append(episode_reward) self.episode_lengths.append(episode_step_count) # Periodically save gifs of episodes, model parameters, and summary statistics. if episode_count % 5 == 0 and episode_count != 0 and total_steps > max_memory: if self.name == 'worker_0' and episode_count % 5 == 0: print('\n episode: ', episode_count, 'global_step:', \ GLOBAL_STEP, 'mean episode reward: ', np.mean(self.episode_rewards[-10:]), \ 'epsilon: ', epsilon) print('loss', l, 'Qtargetmean', np.mean(Q_target)) #print 'p_target', p_target if episode_count % 100 == 0 and self.name == 'worker_0' and total_steps > 10000: saver.save( sess, self.model_path + '/qr-dqn-' + str(episode_count) + '.cptk') print("Saved Model") mean_reward = np.mean(self.episode_rewards[-100:]) if episode_count > 20 and best_mean_episode_reward < mean_reward: best_mean_episode_reward = mean_reward if self.name == 'worker_0': sess.run(self.increment) #if episode_count%1==0: #print('\r {} {}'.format(episode_count, episode_reward),end=' ') episode_count += 1
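# ---------------------------------------------------------------------------
# Quantile-target sketch (illustrative): Worker.train() above builds, for each
# sampled transition, one Bellman target per quantile,
#   z_j = r + gamma * q_j(s', a*),  j = 1..N,
# where a* is the greedy action in the next state. A vectorised NumPy version
# of that inner loop with made-up numbers (N = 3, batch of 2):
# ---------------------------------------------------------------------------
import numpy as np

gamma = 0.99
rewards = np.array([1.0, 0.0])
q_next = np.array([[0.5, 1.0, 1.5],     # quantile values of a* in s'_1
                   [2.0, 2.0, 2.0]])    # quantile values of a* in s'_2

q_target = rewards[:, None] + gamma * q_next   # shape (batch, N)
# -> [[1.495, 1.99, 2.485], [1.98, 1.98, 1.98]]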
class Brain: def __init__(self, num_actions, Double, Dueling, PER): self.num_actions = num_actions # 행동 가짓수(2)를 구함 self.Double = Double self.Dueling = Dueling self.PER = PER # transition을 기억하기 위한 메모리 객체 생성 self.memory = ReplayMemory(CAPACITY) # 신경망 구성 n_out = num_actions self.main_q_network = Net_CNN(n_out, Dueling) # Net 클래스를 사용 self.target_q_network = Net_CNN(n_out, Dueling) # Net 클래스를 사용 print(self.main_q_network) # 신경망의 구조를 출력 # 최적화 기법을 선택 self.optimizer = optim.Adam(self.main_q_network.parameters(), lr=0.0001) # PER - TD 오차를 기억하기 위한 메모리 객체 생성 if self.PER == True: self.td_error_memory = TDerrorMemory(CAPACITY) def replay(self, episode=0): ''' Experience Replay로 신경망의 결합 가중치 학습 ''' # 1. 저장된 transition 수 확인 if len(self.memory) < BATCH_SIZE: return # 2. 미니배치 생성 if self.PER == True: self.batch, self.state_batch, self.action_batch, self.reward_batch, self.non_final_next_states = self.make_minibatch( episode) else: self.batch, self.state_batch, self.action_batch, self.reward_batch, self.non_final_next_states = self.make_minibatch( ) # 3. 정답신호로 사용할 Q(s_t, a_t)를 계산 self.expected_state_action_values = self.get_expected_state_action_values( ) # 4. 결합 가중치 수정 self.update_main_q_network() def decide_action(self, state, episode): '''현재 상태로부터 행동을 결정함''' # e-greedy 알고리즘에서 서서히 최적행동의 비중을 늘린다 epsilon = 0.5 * (1 / (episode + 1)) if epsilon <= np.random.uniform(0, 1): self.main_q_network.eval() # 신경망을 추론 모드로 전환 with torch.no_grad(): action = self.main_q_network(state).max(1)[1].view(1, 1) # 신경망 출력의 최댓값에 대한 인덱스 = max(1)[1] # .view(1,1)은 [torch.LongTensor of size 1]을 size 1*1로 변환하는 역할을 함 else: # 행동을 무작위로 반환 (0 혹은 1) action = torch.LongTensor([[random.randrange(self.num_actions)] ]) #행동을 무작위로 반환(0 혹은 1) # action은 [torch.LongTensor of size 1*1] 형태가 된다. return action def make_minibatch(self, episode=0): '''2. 미니배치 생성''' if self.PER == True: # 2.1 PER - 메모리 객체에서 미니배치를 추출 # def make_minibatch(self, episode): if episode < 30: transitions = self.memory.sample(BATCH_SIZE) else: # TD 오차를 이용해 미니배치를 추출하도록 수정 indexes = self.td_error_memory.get_prioritized_indexes( BATCH_SIZE) transitions = [self.memory.memory[n] for n in indexes] else: # 2.1 메모리 객체에서 미니배치를 추출 transitions = self.memory.sample(BATCH_SIZE) # 2.2 각 변수를 미니배치에 맞는 형태로 변형 # transitions는 각 단계별로 (state, action, state_next, reward) 형태로 BATCH_SIZE 개수만큼 저장됨 # 다시 말해, (state, action, state_next, reward) * BATCH_SIZE 형태가 된다. # 이를 미니배치로 만들기 위해 # (state*BATCH_SIZE, action*BATCH_SIZE), state_next*BATCH_SIZE, reward*BATCH_SIZE) # 형태로 변환한다. batch = Transition(*zip(*transitions)) # 2.3 각 변수의 요소를 미니배치에 맞게 변형하고, 신경망으로 다룰 수 있게 Variable로 만든다 # state를 예로 들면, [torch.FloatTensor of size 1*4] 형태의 요소가 BATCH_SIZE 개수만큼 있는 형태다 # 이를 torch.FloatTensor of size BATCH_SIZE*4 형태로 변형한다 # 상태, 행동, 보상, non_final 상태로 된 미니배치를 나타내는 Variable을 생성 # cat은 Concatenates(연접)를 의미한다. state_batch = torch.cat(batch.state) action_batch = torch.cat(batch.action) reward_batch = torch.cat(batch.reward) non_final_next_states = torch.cat( [s for s in batch.next_state if s is not None]) return batch, state_batch, action_batch, reward_batch, non_final_next_states def get_expected_state_action_values(self): ''' 정답 신호로 사용할 Q(s_t,a_t)를 계산''' # 3.1 신경망을 추론 모드로 전환 self.main_q_network.eval() self.target_q_network.eval() # 3.2 신경망으로 Q(s_t, a_t)를 계산 # self.model(state_batch)은 왼쪽, 오른쪽에 대한 Q값을 출력하며 # [torch.FloatTensor of size BATCH_SIZEx2] 형태다 # 여기서부터는 실행한 행동 a_t에 대한 Q값을 계산하므로 action_batch에서 취한 행동 # a_t가 왼쪽이냐 오른쪽이냐에 대한 인덱스를 구하고, 이에 대한 Q값을 gather메서드로 모아온다. 
self.state_action_values = self.main_q_network( self.state_batch).gather(1, self.action_batch) # 3.3 max{Q(s_t+1, a)}값을 계산한다. 이때 다음 상태가 존재하는지에 주의해야 한다 # cartpole이 done 상태가 아니고, next_state가 존재하는지 확인하는 인덱스 마스크를 만듬 non_final_mask = torch.ByteTensor( tuple(map(lambda s: s is not None, self.batch.next_state))) # 먼저 전체를 0으로 초기화 next_state_values = torch.zeros(BATCH_SIZE) # Double DQN if self.Double == True: a_m = torch.zeros(BATCH_SIZE).type(torch.LongTensor) # 다음 상태에서 Q값이 최대가 되는 행동 a_m을 Main Q-Network로 계산 # 마지막에 붙은 [1]로 행동에 해당하는 인덱스를 구함 a_m[non_final_mask] = self.main_q_network( self.non_final_next_states).detach().max(1)[1] # 다음 상태가 있는 것만을 걸러내고, size 32를 32*1로 변환 a_m_non_final_next_states = a_m[non_final_mask].view(-1, 1) # 다음 상태가 있는 인덱스에 대해 행동 a_m의 Q값을 target Q-Network로 계산 # detach() 메서드로 값을 꺼내옴 # squeeze()메서드로 size[minibatch*1]을 [minibatch]로 변환 next_state_values[non_final_mask] = self.target_q_network( self.non_final_next_states).gather( 1, a_m_non_final_next_states).detach().squeeze() else: # 다음 상태가 있는 인덱스에 대한 최대 Q값을 구한다 # 출력에 접근해서 열방향 최댓값(max(1))이 되는 [값, 인덱스]를 구한다 # 그리고 이 Q값(인덱스 = 0)을 출력한 다음 # detach 메서드로 이 값을 꺼내온다 next_state_values[non_final_mask] = self.target_q_network( self.non_final_next_states).max(1)[0].detach() # 3.4 정답신호로 사용할 Q(s_t, a_t) 값을 Q러닝 식으로 계산 expected_state_action_values = self.reward_batch + GAMMA * next_state_values return expected_state_action_values def update_main_q_network(self): ''' 4. 결합 가중치 수정 ''' # 4.1 신경망을 학습 모드로 전환 self.main_q_network.train() # 4.2 손실함수를 계산(smooth_l1_loss는 Huber 함수) # expected_state_action_values은 size가 [minibatch]이므로 unsqueeze해서 [minibatch*1]로 만듦 loss = F.smooth_l1_loss(self.state_action_values, self.expected_state_action_values.unsqueeze(1)) # 4.3 결합 가중치를 수정 self.optimizer.zero_grad() # 경사를 초기화 loss.backward() # 역전파 계산 self.optimizer.step() # 결합 가중치 수정 def update_target_q_network(self): # DDQN에서 추가됨 ''' Target Q-Network을 Main Q-Network와 맞춤 ''' self.target_q_network.load_state_dict(self.main_q_network.state_dict()) def update_td_error_memory(self): # Prioritized Experience Replay 에서 추가됨 ''' TD 오차 메모리에 저장된 TD 오차를 업데이트 ''' # 신경망을 추론 모드로 전환 self.main_q_network.eval() self.target_q_network.eval() # 전체 transition으로 미니배치를 생성 transitions = self.memory.memory batch = Transition(*zip(*transitions)) state_batch = torch.cat(batch.state) action_batch = torch.cat(batch.action) reward_batch = torch.cat(batch.reward) non_final_next_states = torch.cat( [s for s in batch.next_state if s is not None]) # 신경망의 출력 Q(s_t, a_t)를 계산 state_action_values = self.main_q_network(state_batch).gather( 1, action_batch) # cartpole이 done 상태가 아니고, next_state가 존재하는지 확인하는 인덱스 마스크를 만듦 non_final_mask = torch.ByteTensor( tuple(map(lambda s: s is not None, batch.next_state))) # 먼저 전체를 0으로 초기화, 크기는 기억한 transition 개수만큼 next_state_values = torch.zeros(len(self.memory)) a_m = torch.zeros(len(self.memory)).type(torch.LongTensor) # 다음 상태에서 Q값이 최대가 되는 행동 a_m을 Main Q-Network로 계산 # 마지막에 붙은 [1]로 행동에 해당하는 인덱스를 구함 a_m[non_final_mask] = self.main_q_network( non_final_next_states).detach().max(1)[1] # 다음 상태가 있는 것만을 걸러내고, size 32를 32*1 로 변환 a_m_non_final_next_states = a_m[non_final_mask].view(-1, 1) # 다음 상태가 있는 인덱스에 대해 행동 a_m의 Q값을 target Q-Network로 계산 # detach() 메서드로 값을 꺼내옴 # squeeze() 메서드로 size[minibatch*1]을 [minibatch]로 변환 next_state_values[non_final_mask] = self.target_q_network( non_final_next_states).gather( 1, a_m_non_final_next_states).detach().squeeze() # TD 오차를 계산 td_errors = (reward_batch + GAMMA * next_state_values) - state_action_values.squeeze() # state_action_values는 
has size [minibatch, 1], so squeeze it down to size [minibatch]
        # Update the TD-error memory: pull the values out with detach(), convert the
        # tensor to a NumPy array, and store it back as a plain Python list
        self.td_error_memory.memory = td_errors.detach().numpy().tolist()
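# ---------------------------------------------------------------------------
# Double-DQN target sketch (illustrative): get_expected_state_action_values()
# above switches between
#   vanilla DQN : r + GAMMA * max_a Q_target(s', a)
#   Double DQN  : r + GAMMA * Q_target(s', argmax_a Q_main(s', a)).
# A toy single-transition example with made-up Q values:
# ---------------------------------------------------------------------------
import torch

GAMMA = 0.99
r = 1.0
q_main_next = torch.tensor([0.2, 0.9])    # main network, next state
q_target_next = torch.tensor([0.8, 0.3])  # target network, next state

vanilla = r + GAMMA * q_target_next.max()      # 1 + 0.99 * 0.8 = 1.792
a_star = q_main_next.argmax()                  # Double DQN picks action 1 ...
double = r + GAMMA * q_target_next[a_star]     # ... and evaluates it: 1 + 0.99 * 0.3 = 1.297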
class DDPG: def __init__(self, dim): self.critic_path = cst.CN_CKPT_PATH self.actor_path = cst.AN_CKPT_PATH self.replaymemory_path = cst.RM_PATH self.dim_body = dim[0] self.dim_sensor = dim[1] self.dim_state = dim[0] + dim[1] * 3 self.dim_action = dim[2] self.sess = tf.InteractiveSession() self.act_lr = cst.ACT_LEARNING_RATE self.cri_lr = cst.CRI_LEARNING_RATE self.tau = cst.TAU self.batch_size = cst.BATCH_SIZE self.gamma = cst.REWARD_DECAY self.actorNN = ActorNetwork(self.sess, self.dim_state, self.dim_action, self.act_lr, self.tau, self.batch_size) self.criticNN = CriticNetwork(self.sess, self.dim_state, self.dim_action, self.cri_lr, self.tau, self.gamma, self.actorNN.get_num_trainable_vars()) self.sess.run(tf.global_variables_initializer()) self.actorNN.update_target_network() self.criticNN.update_target_network() self.rm = ReplayMemory('DDPG') self.agent_count = cst.AGENT_COUNT self.exploration_rate = cst.EXPLORATION_RATE self.epsilon = cst.CRITIC_EPSILON self.LOSS_ITERATION = cst.LOSS_ITERATION self.expl_noise = OUNoise(self.dim_action) self.expl = False self.expl_decay = cst.EXPLORATION_DECAY #=====================action=========================== def Action(self, obs, action_type, run_type): if action_type == 'GREEDY': return self.action_greedy(obs) self.isExploration(run_type == 'TRAIN') action_list = [] agent_num = len(obs['agent']) for i in range(0, agent_num): agent_obs = obs['agent'][i] if np.linalg.norm(agent_obs['d'] - agent_obs['p']) < cst.AGENT_RADIUS + 10: action = {} action['theta'] = 0 action['velocity'] = 0 action['stop'] = True else: action = self.get_action(agent_obs, run_type == 'TEST') if self.expl: action = self.action_random(action) action_list.append(action) return action_list def action(self, obs, action_type, run_type): if action_type == 'GREEDY': return self.action_greedy(obs) self.isExploration(run_type == 'TRAIN') action_list = [] for i in range(0, self.agent_count): agent_obs = obs['agent'][i] if np.linalg.norm(agent_obs['d'] - agent_obs['p']) < agent_obs['r'] + 10: action = {} action['theta'] = 0 action['velocity'] = 0 action['stop'] = True else: action = self.get_action(agent_obs, run_type == 'TEST') if self.expl: action = self.action_random(action) action_list.append(action) # for i in range(self.agent_count): # agent_obs = obs['agent'][i] # if np.linalg.norm(agent_obs['d']-agent_obs['p']) < agent_obs['r'] + 5: # action = {} # action['theta'] = 0 # action['velocity'] = 0 # action['stop'] = True # else: # if i == 0: # action = self.get_action(agent_obs, run_type=='TEST') # if self.expl: # action = self.action_random() # else: # action = self.get_action_greedy(agent_obs) # action_list.append(action) return action_list def get_action(self, agent_obs, action_target=False): state_ = {} state_ = self.preprocess(agent_obs) state_body = np.reshape(state_['body'], (1, self.dim_body)) state_sensor = np.reshape(state_['sensor'], (1, self.dim_sensor)) if action_target: prediction = self.actorNN.predict_target(state_body, state_sensor) else: prediction = self.actorNN.predict(state_body, state_sensor) action = {} action['theta'] = prediction[0][0] action['velocity'] = prediction[0][1] action['stop'] = False return action def action_greedy(self, obs): action_list = [] agent_num = len(obs['agent']) for i in range(agent_num): agent_obs = obs['agent'][i] action = self.get_action_greedy(agent_obs) action_list.append(action) return action_list def get_action_greedy(self, agent_obs): if np.linalg.norm(agent_obs['d'] - agent_obs['p']) < 10 + 10: action = {} action['theta'] = 
0 action['velocity'] = 0 action['stop'] = True return action greedy_dis = None angle_num = 20 next_angle = (190 / 2.0) offset = 2 direction = np.array(agent_obs['d']) - np.array(agent_obs['p']) direction /= np.linalg.norm(direction) greedy_dir = 0 if random.random() < 0.5: greedy_dir = 1 for angle in range(angle_num): if agent_obs['d_map'][angle] < 10 + offset: continue curr_angle = 190 / 2 - angle * 10 curr_q = mMath.AngleToCoor( curr_angle + agent_obs['front']) * agent_obs['d_map'][angle] curr_dis = direction[0] * curr_q[0] + direction[1] * curr_q[1] if greedy_dir == 0: if (greedy_dis is None) or (greedy_dis < curr_dis): next_angle = curr_angle greedy_dis = curr_dis next_q = curr_q else: if (greedy_dis is None) or (greedy_dis <= curr_dis): next_angle = curr_angle greedy_dis = curr_dis next_q = curr_q action = {} action['theta'] = np.clip(next_angle, -10, 10) / 10.0 if greedy_dis is None: action['velocity'] = -1 else: action['velocity'] = 1 action['stop'] = False return action def action_random(self, action=None): if action is None: action = dict() action['theta'] = np.random.normal() action['velocity'] = np.random.normal() else: noise_theta, noise_vel = self.expl_noise.noise() action['theta'] = action['theta'] + noise_theta action['velocity'] = action['velocity'] + noise_vel action['stop'] = False return action #=====================update========================== def Update(self): if len(self.rm.memory['critic']) > 0 and len( self.rm.memory['actor']) > 0: self.update_network() def update_network(self): rm_critic_batch = self.rm.getRandomMemories('critic') s_body_batch, s_sensor_batch, a_batch, r_batch, t_batch, s2_body_batch, s2_sensor_batch = [], [], [], [], [], [], [] for m in rm_critic_batch: state_ = copy.copy(self.preprocess(m['state']['agent'][0])) state_body = copy.copy(state_['body']) state_sensor = copy.copy(state_['sensor']) action = copy.copy( np.array([m['action'][0]['theta'], m['action'][0]['velocity']])) next_state_ = copy.copy( self.preprocess(m['next_state']['agent'][0])) next_state_body = copy.copy(next_state_['body']) next_state_sensor = copy.copy(next_state_['sensor']) s_body_batch.append(state_body[0]) s_sensor_batch.append(state_sensor[0]) a_batch.append(action) r_batch.append(m['reward']) t_batch.append(m['term']) s2_body_batch.append(next_state_body[0]) s2_sensor_batch.append(next_state_sensor[0]) target_q = self.criticNN.predict_target( s2_body_batch, s2_sensor_batch, self.actorNN.predict_target(s2_body_batch, s2_sensor_batch)) y_i = [] c_batch_size = len(rm_critic_batch) for k in range(c_batch_size): if t_batch[k]: y_i.append(r_batch[k]) else: y_i.append(r_batch[k] + self.gamma * target_q[k]) # Update the critic given the targets predicted_q_value, _ = self.criticNN.train( s_body_batch, s_sensor_batch, a_batch, np.reshape(y_i, (int(c_batch_size), 1))) # Update the actor policy using the sampled gradient rm_actor_batch = self.rm.getRandomMemories('actor') actor_body_batch, actor_sensor_batch, actor_a_batch = [], [], [] for m in rm_actor_batch: state_ = copy.copy(self.preprocess(m['state']['agent'][0])) state_body = copy.copy(state_['body']) state_sensor = copy.copy(state_['sensor']) action = copy.copy( np.array([m['action'][0]['theta'], m['action'][0]['velocity']])) actor_body_batch.append(state_body[0]) actor_sensor_batch.append(state_sensor[0]) actor_a_batch.append(action) act_batch = self.actorNN.predict(actor_body_batch, actor_sensor_batch) grads = self.criticNN.action_gradients(actor_body_batch, actor_sensor_batch, act_batch) 
self.actorNN.train(actor_body_batch, actor_sensor_batch, grads[0]) # Update target networks self.actorNN.update_target_network() self.criticNN.update_target_network() #===================evaluate=========================== def evaluate(self, obs, agent_idx, action, run_type='TRAIN'): state_ = {} agent_obs = obs['agent'][agent_idx] state_['body'] = np.array( self.preprocess_body(agent_obs['p'], agent_obs['q'], agent_obs['v'], agent_obs['d'])) state_['action'] = np.array([action['theta'], action['velocity']]) state_['sensor'] = np.array( self.preprocess_sensor(agent_obs['d_map'], agent_obs['v_map'], agent_obs['q_lim'], agent_obs['v_depth'])) state_body = np.reshape(state_['body'], (1, self.dim_body)) state_sensor = np.reshape(state_['sensor'], (1, self.dim_sensor)) action = np.reshape(state_['action'], (1, self.dim_action)) if run_type == 'TEST': prediction = self.criticNN.predict_target(state_body, state_sensor, action)[0] else: prediction = self.criticNN.predict(state_body, state_sensor, action)[0] return prediction def expl_rate_decay(self): if self.exploration_rate > 0.2: self.exploration_rate *= self.expl_decay print "exploration rate : ", self.exploration_rate #=====================replay_memory=========================== def addMemory(self, is_greedy, obs, act, next_state, reward, is_term): if is_greedy: self.rm.addMemory('actor', obs, act, next_state, reward, is_term) self.rm.addMemory('critic', obs, act, next_state, reward, is_term) else: if self.expl: self.rm.addMemory('actor', obs, act, next_state, reward, is_term) self.expl = False else: self.rm.addMemory('critic', obs, act, next_state, reward, is_term) #==================save & load========================== def save(self, m_replay=False, training_time=0, eval_list=None): cur_time = strftime("%Y%m%d_%I%M.ckpt", localtime()) print "Save Critic Network : ", self.criticNN.save(self.critic_path, cur_time) print "Save Actor Network : ", self.actorNN.save(self.actor_path, cur_time) print "Parameters Saved...!" self.save_parameters(cur_time, training_time) print "Networks Saved...!" if m_replay: print "Replay Memories Saved...!" self.save_replaymemory(cur_time) if eval_list != None: print "Evaluation List Saved...!" 
self.save_evaluation(cur_time, eval_list) def save_replaymemory(self, cur_time): f = open(cst.RM_PATH + "checkpoint", 'w') f.write(cur_time) f.close() f = open(cst.RM_PATH + "rm_" + cur_time, 'w') pickle.dump(self.rm, f, protocol=pickle.HIGHEST_PROTOCOL) f.close() def save_evaluation(self, cur_time, eval_list=None): f = open(cst.EVAL_PATH + "checkpoint", 'w') f.write(cur_time) f.close() f = open(cst.EVAL_PATH + "eval_" + cur_time, 'w') pickle.dump(eval_list, f, protocol=pickle.HIGHEST_PROTOCOL) f.close() def save_parameters(self, cur_time, training_time): f_read = open(cst.PM_READ_PATH, 'r') f_write = open(cst.PM_WRITE_PATH + "pm_" + cur_time + ".txt", 'w') f_write.write("traning time : " + str(training_time)) while True: line = f_read.readline() if not line: break f_write.write(line) f_read.close() f_write.close() def load_network(self, type): if type == 'actor': print "Load Recent Actor Network : ", self.actorNN.load(self.actor_path) elif type == 'critic': print "Load Recent Critic Network : ", self.criticNN.load(self.critic_path) def load_memory(self): f = open(cst.RM_PATH + "checkpoint", 'r') recent_file_name = f.readline() f.close() f_rm = open(cst.RM_PATH + "rm_" + recent_file_name, 'r') self.rm = pickle.load(f_rm) f_rm.close() print "Load Replay Memory : ", cst.RM_PATH, "rm_", recent_file_name def load_eval(self): f = open(cst.EVAL_PATH + "checkpoint", 'r') recent_file_name = f.readline() f.close() f_eval = open(cst.EVAL_PATH + "eval_" + recent_file_name, 'r') self.eval = pickle.load(f_eval) f_eval.close() print "Load Eval List : ", cst.EVAL_PATH, "eval_", recent_file_name #=================other=============================== def preprocess(self, agent_obs): state = {} state['body'] = np.array( self.preprocess_body(agent_obs['p'], agent_obs['q'], agent_obs['v'], agent_obs['d'])).reshape( (1, self.dim_body)) state['sensor'] = np.array( self.preprocess_sensor(agent_obs['d_map'], agent_obs['delta'], 20, cst.VISION_DEPTH)).reshape((1, 40)) return state def preprocess_body(self, p, q, v, d): p_ = np.array(p) q_ = np.array(q) d_ = np.array(d) width = cst.WINDOW_WIDTH / 2.0 height = cst.WINDOW_HEIGHT / 2.0 p_[0] = p_[0] / width p_[1] = p_[1] / height d_[0] = d_[0] / width d_[1] = d_[1] / height q_norm = np.linalg.norm(q_) q_ = (q_ / q_norm) pd = np.array(d_ - p_) pd_len = np.linalg.norm(pd) pd_vec = pd / pd_len inner = mMath.InnerProduct(q_, pd_vec) cross = mMath.CrossProduct(q_, pd_vec) cross_val = 1.0 if cross < 0: cross_val = 0.0 return [v, inner, cross_val, pd_len] def preprocess_sensor(self, d_map, delta_map, q_lim, vision_depth): depth = [d / float(vision_depth) for d in d_map] delta = [d / float(vision_depth) for d in delta_map] # print "depth : ", depth # print "delta : ", delta sensor = depth + delta return np.array(sensor) def get_agent_count(self, is_train, obs): if is_train: return 1 else: return len(obs['agent']) def isExploration(self, flag): self.expl = (flag and random.random() < self.exploration_rate)
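# ---------------------------------------------------------------------------
# Critic-target sketch (illustrative): update_network() above forms the TD
# target  y_i = r_i  for terminal transitions and
#          y_i = r_i + gamma * Q'(s'_i, mu'(s'_i))  otherwise.
# A toy version of that loop with made-up numbers:
# ---------------------------------------------------------------------------
gamma = 0.95
r_batch = [1.0, -1.0]
t_batch = [False, True]        # second transition is terminal
target_q = [2.0, 3.0]          # Q'(s', mu'(s')) from the target networks

y_i = [r if t else r + gamma * q for r, q, t in zip(r_batch, target_q, t_batch)]
# -> [2.9, -1.0]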
def __init__(self,
             game,
             mem_size=1000000,
             state_buffer_size=4,
             batch_size=64,
             learning_rate=1e-5,
             pretrained_model=None,
             frameskip=4):
    """
    Inputs:
    - game: string to select the game
    - mem_size: int length of the replay memory
    - state_buffer_size: int number of recent frames used as input for the neural network
    - batch_size: int
    - learning_rate: float
    - pretrained_model: str path to the model
    - frameskip: int number of frames each action is repeated for
    """
    # Namestring
    self.game = game

    # Environment
    self.env = Environment(game_name[game], dimensions[game], frameskip=frameskip)

    # Cuda
    self.use_cuda = torch.cuda.is_available()

    # Neural networks
    self.net = DQN(channels_in=state_buffer_size,
                   num_actions=self.env.get_number_of_actions())
    self.target_net = DQN(channels_in=state_buffer_size,
                          num_actions=self.env.get_number_of_actions())
    if self.use_cuda:
        self.net.cuda()
        self.target_net.cuda()

    if pretrained_model:
        self.net.load(pretrained_model)
        self.target_net.load(pretrained_model)
        self.pretrained_model = True
    else:
        self.pretrained_model = False

    # Optimizer
    self.learning_rate = learning_rate
    self.optimizer = optim.Adam(self.net.parameters(), lr=learning_rate)
    # self.optimizer = optim.RMSprop(self.net.parameters(), lr=learning_rate, alpha=0.95, eps=0.01)

    self.batch_size = batch_size
    self.optimize_each_k = 1
    self.update_target_net_each_k_steps = 10000
    self.noops_count = 0

    # Replay memory (long-term memory)
    self.replay = ReplayMemory(capacity=mem_size, num_history_frames=state_buffer_size)
    self.mem_size = mem_size

    # Fill the replay memory before training
    if not self.pretrained_model:
        self.start_train_after = 50000
    else:
        self.start_train_after = mem_size // 2

    # Buffer for the most recent states (short-term memory)
    self.num_stored_frames = state_buffer_size

    # Steps
    self.steps = 0

    # Save net
    self.save_net_each_k_episodes = 500
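# The `state_buffer_size` most recent frames are stacked channel-wise to form the DQN
# input. A minimal sketch of that short-term buffer, independent of the ReplayMemory
# class above (the 84x84 frame shape and the padding strategy are assumptions):
from collections import deque
import numpy as np

buffer_size = 4
frame_buffer = deque(maxlen=buffer_size)

def push_and_stack(frame):
    """Append the newest frame and return a (buffer_size, H, W) array usable as network input."""
    frame_buffer.append(frame)
    while len(frame_buffer) < buffer_size:   # pad with copies until the buffer is full
        frame_buffer.append(frame)
    return np.stack(frame_buffer, axis=0)

state = push_and_stack(np.zeros((84, 84), dtype=np.float32))
print(state.shape)  # (4, 84, 84)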
class Agent():
    def __init__(self, game, agent_type, display, load_model, record, test):
        self.name = game
        self.agent_type = agent_type
        self.ale = ALEInterface()
        self.ale.setInt(str.encode('random_seed'), np.random.randint(100))
        self.ale.setBool(str.encode('display_screen'), display or record)
        if record:
            self.ale.setString(str.encode('record_screen_dir'),
                               str.encode('./data/recordings/{}/{}/tmp/'.format(game, agent_type)))

        self.ale.loadROM(str.encode('./roms/{}.bin'.format(self.name)))
        self.action_list = list(self.ale.getMinimalActionSet())
        self.frame_shape = np.squeeze(self.ale.getScreenGrayscale()).shape
        if test:
            self.name += '_test'

        if 'space_invaders' in self.name:
            # Account for blinking bullets
            self.frameskip = 2
        else:
            self.frameskip = 3

        self.frame_buffer = deque(maxlen=4)
        if load_model and not record:
            self.load_replaymemory()
        else:
            self.replay_memory = ReplayMemory(500000, 32)

        model_input_shape = self.frame_shape + (4,)
        model_output_shape = len(self.action_list)

        if agent_type == 'dqn':
            self.model = DeepQN(
                model_input_shape,
                model_output_shape,
                self.action_list,
                self.replay_memory,
                self.name,
                load_model
            )
        elif agent_type == 'double':
            self.model = DoubleDQN(
                model_input_shape,
                model_output_shape,
                self.action_list,
                self.replay_memory,
                self.name,
                load_model
            )
        else:
            self.model = DuelingDQN(
                model_input_shape,
                model_output_shape,
                self.action_list,
                self.replay_memory,
                self.name,
                load_model
            )

        print('{} Loaded!'.format(' '.join(self.name.split('_')).title()))
        print('Displaying: ', display)
        print('Frame Shape: ', self.frame_shape)
        print('Frame Skip: ', self.frameskip)
        print('Action Set: ', self.action_list)
        print('Model Input Shape: ', model_input_shape)
        print('Model Output Shape: ', model_output_shape)
        print('Agent: ', agent_type)

    def training(self, steps):
        '''
        Trains the agent for :steps number of weight updates.

        Returns the average model loss
        '''
        loss = []

        # Initialize frame buffer. np.squeeze removes empty dimensions e.g. if shape=(210,160,__)
        self.frame_buffer.append(np.squeeze(self.ale.getScreenGrayscale()))
        self.frame_buffer.append(np.squeeze(self.ale.getScreenGrayscale()))
        self.frame_buffer.append(np.squeeze(self.ale.getScreenGrayscale()))
        self.frame_buffer.append(np.squeeze(self.ale.getScreenGrayscale()))

        try:
            for step in range(steps):
                gameover = False
                initial_state = np.stack(self.frame_buffer, axis=-1)
                action = self.model.predict_action(initial_state)

                # Back up data
                if step % 5000 == 0:
                    self.model.save_model()
                    self.model.save_hyperparams()
                    self.save_replaymemory()

                # If using a target model, check for weight updates
                if hasattr(self.model, 'tau'):
                    if self.model.tau == 0:
                        self.model.update_target_model()
                        self.model.tau = 10000
                    else:
                        self.model.tau -= 1

                # Frame-skipping technique:
                # https://danieltakeshi.github.io/2016/11/25/frame-skipping-and-preprocessing-for-deep-q-networks-on-atari-2600-games/
                # Repeat the action for `frameskip` frames, then act once more to read the reward
                lives_before = self.ale.lives()
                for _ in range(self.frameskip):
                    self.ale.act(action)
                reward = self.ale.act(action)
                self.frame_buffer.append(np.squeeze(self.ale.getScreenGrayscale()))
                lives_after = self.ale.lives()

                if lives_after < lives_before:
                    # Treat losing a life as the end of an episode (heuristic)
                    gameover = True
                    reward = -1

                if self.ale.game_over():
                    gameover = True
                    reward = -1
                    self.ale.reset_game()

                new_state = np.stack(self.frame_buffer, axis=-1)

                # Experiment with clipping rewards for stability purposes
                reward = np.clip(reward, -1, 1)
                self.replay_memory.add(
                    initial_state,
                    action,
                    reward,
                    gameover,
                    new_state
                )

                loss += self.model.replay_train()
        except:
            # Save progress before re-raising, so training can be resumed
            self.model.save_model()
            self.model.save_hyperparams()
            self.save_replaymemory()
            raise KeyboardInterrupt

        return np.mean(loss, axis=0)

    def simulate_random(self):
        print('Simulating game randomly')
        done = False
        total_reward = 0
        while not done:
            action = np.random.choice(self.ale.getMinimalActionSet())
            reward = self.ale.act(action)
            total_reward += reward

            if self.ale.game_over():
                reward = -1
                done = True

            reward = np.clip(reward, -1, 1)
            if reward != 0:
                print(reward)

        frames_survived = self.ale.getEpisodeFrameNumber()
        self.ale.reset_game()
        return total_reward, frames_survived

    def simulate_intelligent(self, evaluating=False):
        done = False
        total_score = 0

        self.frame_buffer.append(np.squeeze(self.ale.getScreenGrayscale()))
        self.frame_buffer.append(np.squeeze(self.ale.getScreenGrayscale()))
        self.frame_buffer.append(np.squeeze(self.ale.getScreenGrayscale()))
        self.frame_buffer.append(np.squeeze(self.ale.getScreenGrayscale()))

        while not done:
            state = np.stack(self.frame_buffer, axis=-1)
            action = self.model.predict_action(state, evaluating)

            for _ in range(self.frameskip):
                self.ale.act(action)
            # Remember, ale.act returns the increase in game score with this action
            total_score += self.ale.act(action)

            # Pushes the oldest frame out
            self.frame_buffer.append(np.squeeze(self.ale.getScreenGrayscale()))
            if self.ale.game_over():
                done = True

        frames_survived = self.ale.getEpisodeFrameNumber()
        print('   Game Over')
        print('   Frames Survived: ', frames_survived)
        print('   Score: ', total_score)
        print('===========================')
        self.ale.reset_game()
        return total_score, frames_survived

    def save_replaymemory(self):
        with bz2.BZ2File('./data/{}/{}_replaymem.obj'.format(self.agent_type, self.name), 'wb') as f:
            pickle.dump(self.replay_memory, f, protocol=pickle.HIGHEST_PROTOCOL)
            print('Saved replay memory at ', datetime.now())

    def load_replaymemory(self):
        try:
            with bz2.BZ2File('./data/{}/{}_replaymem.obj'.format(self.agent_type, self.name), 'rb') as f:
                self.replay_memory = pickle.load(f)
                print('Loaded replay memory at ', datetime.now())
        except FileNotFoundError:
            print('No replay memory file found')
            raise KeyboardInterrupt
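# The class above assumes a ReplayMemory exposing add(state, action, reward, done, next_state)
# plus minibatch sampling. A minimal ring-buffer sketch of that interface, with uniform
# sampling; the class name, fields and sampling scheme are assumptions, not the repo's code:
import random
from collections import deque

class SimpleReplayMemory:
    def __init__(self, capacity, batch_size):
        self.buffer = deque(maxlen=capacity)   # oldest transitions are dropped automatically
        self.batch_size = batch_size

    def add(self, state, action, reward, done, next_state):
        self.buffer.append((state, action, reward, done, next_state))

    def sample(self):
        """Return a uniformly sampled minibatch (smaller if the buffer is not yet full)."""
        return random.sample(list(self.buffer), min(self.batch_size, len(self.buffer)))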
class Agent():
    def __init__(self, load_checkpoint, checkpoint_file, env, n_states, n_actions, update_actor_interval=2,
                 warmup=1000, mem_size=10**6, batch_size=100, n_hid1=400, n_hid2=300, lr_alpha=1e-3,
                 lr_beta=1e-3, gamma=0.99, tau=5e-3, noise_mean=0, noise_sigma=0.1):
        self.load_checkpoint = load_checkpoint
        self.checkpoint_file = checkpoint_file
        # needed for clamping in the learn function
        self.env = env
        self.max_action = float(env.action_space.high[0])
        self.low_action = float(env.action_space.low[0])
        self.n_actions = n_actions
        # keeps track of how often we call the "learn" function, for the delayed actor update
        self.learn_step_counter = 0
        # handles the countdown to the end of the warmup period, incremented every time we choose an action
        self.time_step = 0
        self.update_actor_interval = update_actor_interval
        self.warmup = warmup
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.noise_mean = noise_mean
        self.noise_sigma = noise_sigma

        self.actor = TD3ActorNetwork(n_states, n_actions, n_hid1, n_hid2, lr_alpha, checkpoint_file, name='actor')
        self.target_actor = TD3ActorNetwork(n_states, n_actions, n_hid1, n_hid2, lr_alpha, checkpoint_file, name='target_actor')
        self.critic_1 = TD3CriticNetwork(n_states, n_actions, n_hid1, n_hid2, lr_beta, checkpoint_file, name='critic_1')
        self.critic_2 = TD3CriticNetwork(n_states, n_actions, n_hid1, n_hid2, lr_beta, checkpoint_file, name='critic_2')
        self.target_critic_1 = TD3CriticNetwork(n_states, n_actions, n_hid1, n_hid2, lr_beta, checkpoint_file, name='target_critic_1')
        self.target_critic_2 = TD3CriticNetwork(n_states, n_actions, n_hid1, n_hid2, lr_beta, checkpoint_file, name='target_critic_2')

        self.memory = ReplayMemory(mem_size, n_states, n_actions)

        # tau=1 performs an exact copy of the networks to the respective targets
        self.update_network_parameters(self.actor, self.target_actor, tau=1)
        self.update_network_parameters(self.critic_1, self.target_critic_1, tau=1)
        self.update_network_parameters(self.critic_2, self.target_critic_2, tau=1)

    def choose_action(self, obs):
        if self.time_step < self.warmup:
            self.time_step += 1
            action = torch.tensor(self.env.action_space.sample())
        else:
            obs = torch.tensor(obs, dtype=torch.float).to(self.actor.device)
            action = self.actor(obs)
            # exploratory noise, scaled w.r.t. the action scale (max_action)
            noise = torch.tensor(
                np.random.normal(self.noise_mean, self.noise_sigma * self.max_action,
                                 size=self.n_actions)).to(self.actor.device)
            action += noise
        action = torch.clamp(action, self.low_action, self.max_action)
        return action.cpu().detach().numpy()

    def choose_action_eval(self, obs):
        obs = torch.tensor(obs, dtype=torch.float).to(self.actor.device)
        action = self.actor(obs)
        action = torch.clamp(action, self.low_action, self.max_action)
        return action.cpu().detach().numpy()

    def store_transition(self, obs, action, reward, obs_, done):
        self.memory.store_transition(obs, action, reward, obs_, done)

    def sample_transitions(self):
        state_batch, action_batch, reward_batch, new_state_batch, done_batch = self.memory.sample_buffer(
            self.batch_size)
        # the device is the same for every network, so the actor's device is used for all batches
        state_batch = torch.tensor(state_batch, dtype=torch.float).to(self.actor.device)
        action_batch = torch.tensor(action_batch, dtype=torch.float).to(self.actor.device)
        reward_batch = torch.tensor(reward_batch, dtype=torch.float).to(self.actor.device)
        new_state_batch = torch.tensor(new_state_batch, dtype=torch.float).to(self.actor.device)
        done_batch = torch.tensor(done_batch).to(self.actor.device)
        return state_batch, action_batch, reward_batch, new_state_batch, done_batch

    def __copy_param(self, net_param_1, net_param_2, tau):
        # a.copy_(b) reads the content of b and copies it into a
        for par, target_par in zip(net_param_1, net_param_2):
            val_to_copy = tau * par.weight + (1 - tau) * target_par.weight
            target_par.weight.copy_(val_to_copy)
            if target_par.bias is not None:
                val_to_copy = tau * par.bias + (1 - tau) * target_par.bias
                target_par.bias.copy_(val_to_copy)

    def update_network_parameters(self, network, target_network, tau=None):
        # during initialization this is called with tau=1 to make the targets exact copies;
        # when called without tau, the stored field is used for soft (Polyak) updates
        if tau is None:
            tau = self.tau
        for par, target_par in zip(network.parameters(), target_network.parameters()):
            target_par.data.copy_(tau * par.data + (1 - tau) * target_par.data)

        # # TODO: check equivalence with Phil's method
        # actor_params = self.actor.children()
        # target_actor_params = self.target_actor.children()
        # self.__copy_param(actor_params, target_actor_params, tau)
        #
        # critic_params1 = self.critic_1.children()
        # target_critic_1_params = self.target_critic_1.children()
        # self.__copy_param(critic_params1, target_critic_1_params, tau)
        #
        # critic_params2 = self.critic_2.children()
        # target_critic_2_params = self.target_critic_2.children()
        # self.__copy_param(critic_params2, target_critic_2_params, tau)

    def learn(self):
        self.learn_step_counter += 1
        # deal with the situation in which we have not yet filled the memory up to the batch size
        if self.memory.mem_counter < self.batch_size:
            return

        state_batch, action_batch, reward_batch, new_state_batch, done_batch = self.sample_transitions()

        # target policy smoothing: noise clipped to +-0.5 as per the paper
        # (to be revisited if min and max actions are not symmetric, e.g. -2 and 1)
        noise = torch.tensor(
            np.clip(np.random.normal(self.noise_mean, 0.2, size=self.n_actions), -0.5, 0.5)).to(self.actor.device)
        target_next_action = torch.clamp(self.target_actor(new_state_batch) + noise,
                                         self.low_action, self.max_action)

        target_q1_ = self.target_critic_1(new_state_batch, target_next_action)
        target_q2_ = self.target_critic_2(new_state_batch, target_next_action)
        # take the minimum q-value for every element in the batch
        target_q_ = torch.min(target_q1_, target_q2_)
        target_q_[done_batch] = 0.0
        target = target_q_.view(-1)  # probably not needed
        target = reward_batch + self.gamma * target
        target = target.view(self.batch_size, 1)  # probably not needed

        q_val1 = self.critic_1(state_batch, action_batch)
        q_val2 = self.critic_2(state_batch, action_batch)

        critic_loss1 = F.mse_loss(q_val1, target)
        critic_loss2 = F.mse_loss(q_val2, target)
        critic_loss = critic_loss1 + critic_loss2

        self.critic_1.optimizer.zero_grad()
        self.critic_2.optimizer.zero_grad()
        critic_loss.backward()
        self.critic_1.optimizer.step()
        self.critic_2.optimizer.step()

        # delayed policy update
        if self.learn_step_counter % self.update_actor_interval == 0:
            action = self.actor(state_batch)
            # actor loss is proportional to critic_1's estimate for (state, action) pairs,
            # where the action is recomputed with the current policy
            actor_loss = -torch.mean(self.critic_1(state_batch, action))
            self.actor.optimizer.zero_grad()
            actor_loss.backward()
            self.actor.optimizer.step()

            self.update_network_parameters(self.actor, self.target_actor, self.tau)
            self.update_network_parameters(self.critic_1, self.target_critic_1, self.tau)
            self.update_network_parameters(self.critic_2, self.target_critic_2, self.tau)

    def save_models(self):
        self.actor.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.critic_1.save_checkpoint()
        self.critic_2.save_checkpoint()
        self.target_critic_1.save_checkpoint()
        self.target_critic_2.save_checkpoint()

    def load_models(self):
        self.actor.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.critic_1.load_checkpoint()
        self.critic_2.load_checkpoint()
        self.target_critic_1.load_checkpoint()
        self.target_critic_2.load_checkpoint()
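# A hedged sketch of the interaction loop this TD3 agent expects: act, store the
# transition, then call learn() every environment step. `env` is assumed to be a
# classic Gym-style continuous-control environment; the episode count and the
# reset()/step() signatures are assumptions, not part of the class above.
def run_training(agent, env, n_episodes=1000):
    for episode in range(n_episodes):
        obs = env.reset()
        done = False
        episode_reward = 0.0
        while not done:
            action = agent.choose_action(obs)              # random during warmup, noisy policy afterwards
            obs_, reward, done, _ = env.step(action)
            agent.store_transition(obs, action, reward, obs_, done)
            agent.learn()                                  # no-op until the memory holds a full batch
            obs = obs_
            episode_reward += reward
        print(episode, episode_reward)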
class Learner():
    def __init__(self, sess, s_size, a_size, scope, queues, trainer):
        self.queue = queues[0]
        self.param_queue = queues[1]
        self.replaymemory = ReplayMemory(100000)
        self.sess = sess
        self.learner_net = network(s_size, a_size, scope, 20)

        self.q = self.learner_net.q
        self.Q = self.learner_net.Q

        self.actions_q = tf.placeholder(shape=[None, a_size, N], dtype=tf.float32)
        self.q_target = tf.placeholder(shape=[None, N], dtype=tf.float32)
        self.ISWeights = tf.placeholder(shape=[None, N], dtype=tf.float32)

        self.q_actiona = tf.multiply(self.q, self.actions_q)
        self.q_action = tf.reduce_sum(self.q_actiona, axis=1)
        self.u = tf.abs(self.q_target - self.q_action)
        self.loss = tf.reduce_mean(tf.reduce_sum(tf.square(self.u) * self.ISWeights, axis=1))

        self.local_vars = self.learner_net.local_vars  # tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
        self.gradients = tf.gradients(self.loss, self.local_vars)
        # grads, self.grad_norms = tf.clip_by_norm(self.gradients, 40.0)
        self.apply_grads = trainer.apply_gradients(zip(self.gradients, self.local_vars))
        self.sess.run(tf.global_variables_initializer())

    def run(self, gamma, s_size, a_size, batch_size, env):
        print('start learning')
        step, train1 = 0, False
        epi_q = []
        self.env = env
        while True:
            # drain the queue of transitions pushed by the workers
            if not self.queue.empty():
                while not self.queue.empty():
                    t_error = self.queue.get()
                    step += 1
                    self.replaymemory.add(t_error)

            # publish the current parameters whenever the workers have consumed them
            if self.param_queue.empty():
                params = self.sess.run(self.local_vars)
                self.param_queue.put(params)

            if step >= 10000:
                train1 = True
                step = 0

            if train1:
                episode_buffer, tree_idx, ISWeights = self.replaymemory.sample(batch_size)
                episode_buffer = np.array(episode_buffer)

                observations = episode_buffer[:, 0]
                actions = episode_buffer[:, 1]
                rewards = episode_buffer[:, 2]
                observations_next = episode_buffer[:, 3]
                dones = episode_buffer[:, 4]

                # greedy next actions, chosen from the mean of the quantiles (Q)
                Q_target = self.sess.run(self.Q,
                                         feed_dict={self.learner_net.inputs: np.vstack(observations_next)})
                actions_ = np.argmax(Q_target, axis=1)

                # one-hot masks for the taken and the greedy next actions
                action = np.zeros((batch_size, a_size))
                action_ = np.zeros((batch_size, a_size))
                for i in range(batch_size):
                    action[i][actions[i]] = 1
                    action_[i][actions_[i]] = 1

                # broadcast the masks over the N quantiles
                action_now = np.zeros((batch_size, a_size, N))
                action_next = np.zeros((batch_size, a_size, N))
                for i in range(batch_size):
                    for j in range(a_size):
                        for k in range(N):
                            action_now[i][j][k] = action[i][j]
                            action_next[i][j][k] = action_[i][j]

                # quantiles of the selected next action
                q_target = self.sess.run(self.q_action,
                                         feed_dict={self.learner_net.inputs: np.vstack(observations_next),
                                                    self.actions_q: action_next})

                # Bellman backup applied quantile-wise
                q_target_batch = []
                for i in range(len(q_target)):
                    qi = q_target[i]
                    z_target_step = []
                    for j in range(len(qi)):
                        z_target_step.append(gamma * qi[j] * (1 - dones[i]) + rewards[i])
                    q_target_batch.append(z_target_step)
                q_target_batch = np.array(q_target_batch)

                # replicate the importance-sampling weight of each transition over the N quantiles
                isweight = np.zeros((batch_size, N))
                for i in range(batch_size):
                    for j in range(N):
                        isweight[i, j] = ISWeights[i]

                feed_dict = {
                    self.q_target: q_target_batch,
                    self.learner_net.inputs: np.vstack(observations),
                    self.actions_q: action_now,
                    self.ISWeights: isweight
                }
                l, abs_errors, _ = self.sess.run([self.loss, self.u, self.apply_grads], feed_dict=feed_dict)
                abs_errors = np.mean(abs_errors, axis=1) + 1e-6
                self.replaymemory.update_priorities(tree_idx, abs_errors)
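# The nested Python loops above broadcast one-hot action masks and IS weights over the
# N quantiles. A vectorized NumPy sketch of the same construction (batch_size, a_size and
# N are assumed to be plain ints; `actions` is a 1-D array of action indices):
import numpy as np

def quantile_action_mask(actions, a_size, N):
    """Return a (batch_size, a_size, N) mask that is 1 for the chosen action's quantiles."""
    one_hot = np.eye(a_size)[np.asarray(actions, dtype=int)]   # (batch_size, a_size)
    return np.repeat(one_hot[:, :, None], N, axis=2)           # tile over the quantile axis

def tile_is_weights(is_weights, N):
    """Replicate each transition's importance-sampling weight over its N quantiles."""
    return np.tile(np.asarray(is_weights).reshape(-1, 1), (1, N))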
class Worker():
    def __init__(self, env, name, s_size, a_size, trainer, model_path, global_episodes):
        self.name = "worker_" + str(name)
        self.number = name
        self.model_path = model_path
        self.trainer = trainer
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_mean_values = []

        # Create the local copy of the network and the tensorflow op to copy global parameters to the local network
        self.local_Q = Q_Network(s_size, a_size, self.name, trainer)
        self.update_local_ops = update_target_graph('global', self.name)
        self.env = env
        self.replaymemory = ReplayMemory(max_memory)

    def train(self, rollout, sess, gamma, ISWeights):
        rollout = np.array(rollout)
        observations = rollout[:, 0]
        actions = rollout[:, 1]
        rewards = rollout[:, 2]
        next_observations = rollout[:, 3]
        dones = rollout[:, 4]

        Q_target = sess.run(self.local_Q.Q,
                            feed_dict={self.local_Q.inputs: np.vstack(next_observations)})
        actions_ = np.argmax(Q_target, axis=1)

        action = np.zeros((batch_size, a_size))
        action_ = np.zeros((batch_size, a_size))
        for i in range(batch_size):
            action[i][actions[i]] = 1
            action_[i][actions_[i]] = 1

        action_now = np.zeros((batch_size, a_size, N))
        action_next = np.zeros((batch_size, a_size, N))
        for i in range(batch_size):
            for j in range(a_size):
                for k in range(N):
                    action_now[i][j][k] = action[i][j]
                    action_next[i][j][k] = action_[i][j]

        q_target = sess.run(self.local_Q.q_action,
                            feed_dict={self.local_Q.inputs: np.vstack(next_observations),
                                       self.local_Q.actions_q: action_next})
        q_target_batch = []
        for i in range(len(q_target)):
            qi = q_target[i]  # * (1 - dones[i])
            z_target_step = []
            for j in range(len(qi)):
                z_target_step.append(gamma * qi[j] + rewards[i])
            q_target_batch.append(z_target_step)
        q_target_batch = np.array(q_target_batch)

        isweight = np.zeros((batch_size, N))
        for i in range(batch_size):
            for j in range(N):
                isweight[i, j] = ISWeights[i]

        feed_dict = {self.local_Q.inputs: np.vstack(observations),
                     self.local_Q.actions_q: action_now,
                     self.local_Q.q_target: q_target_batch,
                     self.local_Q.ISWeights: isweight}
        u, l, g_n, v_n, _ = sess.run([self.local_Q.u,
                                      self.local_Q.loss,
                                      self.local_Q.grad_norms,
                                      self.local_Q.var_norms,
                                      self.local_Q.apply_grads],
                                     feed_dict=feed_dict)
        return l / len(rollout), g_n, v_n, Q_target, u

    def work(self, gamma, sess, coord, saver):
        global GLOBAL_STEP
        episode_count = sess.run(self.global_episodes)
        total_steps = 0
        epsilon = 0.2
        print("Starting worker " + str(self.number))
        best_mean_episode_reward = -float('inf')

        with sess.as_default(), sess.graph.as_default():
            while not coord.should_stop():
                sess.run(self.update_local_ops)
                episode_reward = 0
                episode_step_count = 0
                d = False
                s = self.env.reset()
                s = process_frame(s)
                if epsilon > 0.01:
                    epsilon = epsilon * 0.997

                while not d:
                    # self.env.render()
                    GLOBAL_STEP += 1
                    # Take an epsilon-greedy action based on the local Q network output.
                    if random.random() > epsilon:
                        a_dist_list = sess.run(self.local_Q.Q,
                                               feed_dict={self.local_Q.inputs: [s]})
                        a_dist = a_dist_list[0]
                        a = np.argmax(a_dist)
                    else:
                        a = random.randint(0, 5)

                    s1, r, d, _ = self.env.step(a)
                    if not d:
                        s1 = process_frame(s1)
                    else:
                        s1 = s
                    self.replaymemory.add([s, a, r, s1, d])
                    episode_reward += r
                    s = s1
                    total_steps += 1
                    episode_step_count += 1

                    if total_steps % 2 == 0 and not d and total_steps > 50000:
                        episode_buffer, tree_idx, ISWeights = self.replaymemory.sample(batch_size)
                        l, g_n, v_n, Q_target, u = self.train(episode_buffer, sess, gamma, ISWeights)
                        u = np.mean(u, axis=1) + 1e-6
                        self.replaymemory.update_priorities(tree_idx, u)
                    if d:
                        break

                sess.run(self.update_local_ops)
                self.episode_rewards.append(episode_reward)
                self.episode_lengths.append(episode_step_count)

                # Periodically save model parameters and summary statistics.
                if episode_count % 5 == 0 and episode_count != 0 and total_steps > max_memory:
                    if self.name == 'worker_0' and episode_count % 5 == 0:
                        print('\n episode: ', episode_count, 'global_step:', GLOBAL_STEP,
                              'mean episode reward: ', np.mean(self.episode_rewards[-10:]),
                              'epsilon: ', epsilon)
                        print('loss', l, 'Qtargetmean', np.mean(Q_target))
                    if episode_count % 100 == 0 and self.name == 'worker_0' and total_steps > 10000:
                        saver.save(sess, self.model_path + '/qr-dqn-' + str(episode_count) + '.cptk')
                        print("Saved Model")
                    mean_reward = np.mean(self.episode_rewards[-100:])
                    if episode_count > 20 and best_mean_episode_reward < mean_reward:
                        best_mean_episode_reward = mean_reward

                if self.name == 'worker_0':
                    sess.run(self.increment)
                episode_count += 1
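# The exploration schedule in work() multiplies epsilon by 0.997 once per episode until it
# reaches 0.01. A small standalone sketch of that decay, useful for checking how long the
# schedule takes (the starting value 0.2 matches the code above):
def epsilon_schedule(start=0.2, decay=0.997, floor=0.01):
    eps, episodes = start, 0
    while eps > floor:
        eps *= decay
        episodes += 1
    return episodes

print(epsilon_schedule())  # roughly 1000 episodes to decay from 0.2 to 0.01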