def __init__(self, net, actionSet, goalSet, defaultNSample, defaultRandomPlaySteps,
             controllerMemCap, explorationSteps, trainFreq, hard_update,
             controllerEpsilon=defaultControllerEpsilon):
    self.actionSet = actionSet
    self.controllerEpsilon = controllerEpsilon
    self.goalSet = goalSet
    self.nSamples = defaultNSample
    self.gamma = defaultGamma
    self.net = net
    self.controllerMemCap = controllerMemCap
    self.memory = PrioritizedReplayBuffer(controllerMemCap, alpha=prioritized_replay_alpha)
    self.enable_double_dqn = True
    self.exploration = LinearSchedule(schedule_timesteps=explorationSteps,
                                      initial_p=1.0, final_p=0.02)
    self.defaultRandomPlaySteps = defaultRandomPlaySteps
    self.trainFreq = trainFreq
    self.randomPlay = True
    self.learning_done = False
    self.hard_update = hard_update

def __init__(self, stateShape, actionSpace, numPicks, memorySize, burnin=1000):
    self.numPicks = numPicks
    self.memorySize = memorySize
    self.replayMemory = PrioritizedReplayBuffer(memorySize, 0.6)
    self.stateShape = stateShape
    self.actionSpace = actionSpace
    self.step = 0
    self.sync = 200
    self.burnin = burnin
    self.alpha = 0.001
    self.epsilon = 1
    self.epsilon_decay = 0.5
    self.epsilon_min = 0.01
    self.eps_threshold = 0
    self.gamma = 0.99
    self.trainNetwork = self.createNetwork(stateShape, len(actionSpace), self.alpha)
    self.targetNetwork = self.createNetwork(stateShape, len(actionSpace), self.alpha)
    self.targetNetwork.set_weights(self.trainNetwork.get_weights())

def __init__(self, stateShape, actionSpace, numPicks, memorySize, sync=10, burnin=1000,
             alpha=0.0001, epsilon=1, epsilon_decay=0.05, epsilon_min=0.01, gamma=0.99):
    self.numPicks = numPicks
    self.replayMemory = PrioritizedReplayBuffer(memorySize, 0.6)
    self.stateShape = stateShape
    self.actionSpace = actionSpace
    self.step = 0
    self.sync = sync
    self.burnin = burnin
    self.alpha = alpha
    self.epsilon = epsilon
    self.epsilon_decay = epsilon_decay
    self.epsilon_min = epsilon_min
    self.gamma = gamma
    self.walpha = 0.01
    self.delay = 1
    self.trainNetwork = self.createNetwork(stateShape, len(actionSpace), self.alpha)
    self.targetNetwork = self.createNetwork(stateShape, len(actionSpace), self.alpha)
    self.targetNetwork.set_weights(self.trainNetwork.get_weights())

def __init__(self, memory_size, batch_size, learn_start_time, learn_fre, lr, replay_iters,
             eps_T, eps_t_init, gamma, update_period, board, device, model_path,
             r_memory_Fname, o_model_name, model_load=False):
    self.step_now = 0                    # record the current step
    self.reward_num = 0
    self.reward_accumulated = 0          # delayed reward
    self.final_tem = 10                  # just for now
    self.step_last_update = 0            # record the last update time
    self.update_period = update_period   # for the off-policy target update
    self.learn_start_time = learn_start_time
    self.gamma = gamma
    self.batch_size = batch_size
    self.memory_size = memory_size
    self.alpha = 0.6
    self.beta = 0.4
    self.replay_bata_iters = replay_iters
    self.replay_eps = 1e-6
    self.memory_min_num = 1000           # the min number of stored transitions before learning
    self.step_last_learn = 0             # record the last learning step
    self.learn_fre = learn_fre           # step frequency of learning
    self.e_greedy = 1                    # record the e-greedy value
    self.eps_T = eps_T                   # parameter for updating eps (maybe 800,000 steps)
    self.eps_t_init = eps_t_init         # parameter for updating eps
    self.device = device
    self.model_path = model_path
    self.mode_enjoy = model_load
    if not model_load:
        self.policy_net = DQN(board[0], board[1], action_num).to(device)
        self.target_net = DQN(board[0], board[1], action_num).to(device)
        self.optimizer = optim.Adagrad(self.policy_net.parameters(), lr=lr)
        self.loss_fn = nn.functional.mse_loss  # MSE loss
        self.memory = PrioritizedReplayBuffer(memory_size, self.alpha)
        self.beta_schedule = LinearSchedule(self.replay_bata_iters, self.beta, 1.0)
    else:
        self.load(o_model_name)
    # self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=lr)
    self.obs_new = None
    self.obs_old = None
    self.action = None
    self.action_old = None
    self.dqn_direct_flag = False  # whether the dqn action is done
    self.model_save_flag = False

def clear_memory(self, goal):
    self.learning_done = True  # set the done-learning flag
    # del self.trainable_model
    del self.memory
    self.memory = PrioritizedReplayBuffer(self.controllerMemCap,
                                          alpha=prioritized_replay_alpha)
    gpu = self.net.gpu
    device = '/cpu' if gpu < 0 else '/gpu:' + str(gpu)
    # del self.net
    gc.collect()
    rmsProp = optimizers.RMSprop(lr=LEARNING_RATE, rho=0.95, epsilon=1e-08, decay=0.0)
    with tf.device(device):
        self.simple_net = Sequential()
        self.simple_net.add(Conv2D(32, (8, 8), strides=4, activation='relu',
                                   padding='valid', input_shape=(84, 84, 4)))
        self.simple_net.add(Conv2D(64, (4, 4), strides=2, activation='relu', padding='valid'))
        self.simple_net.add(Conv2D(64, (3, 3), strides=1, activation='relu', padding='valid'))
        self.simple_net.add(Flatten())
        self.simple_net.add(Dense(HIDDEN_NODES, activation='relu',
                                  kernel_initializer=initializers.random_normal(
                                      stddev=0.01, seed=SEED)))
        self.simple_net.add(Dense(nb_Action, activation='linear',
                                  kernel_initializer=initializers.random_normal(
                                      stddev=0.01, seed=SEED)))
        self.simple_net.compile(loss='mse', optimizer=rmsProp)
        self.simple_net.load_weights(recordFolder + '/policy_subgoal_' + str(goal) + '.h5')
        self.simple_net.reset_states()

def test_per(capacity):
    # test implementation of prioritized replay buffer
    p_buffer = PrioritizedReplayBuffer(capacity)

    # populate the buffer
    for _ in range(capacity // 2):
        p_buffer.add(Experience())

    # update batches of experience
    n_batches = 10
    batch_size = 100
    for _ in range(n_batches):
        # randomly sample $batch_size tree indices
        idx = random.sample([x for x in range(capacity - 1, 2 * capacity - 1)], batch_size)
        td_errors = np.random.uniform(0, 10, batch_size)
        p_buffer.batch_update(idx, td_errors)
        assert p_buffer.tree.max_priority == np.max(p_buffer.tree.tree[-capacity:])

    # test sampling
    for _ in range(10):
        p_buffer.sample(batch_size)
    return

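# The index range sampled in test_per relies on a flat-array sum tree: the
# capacity - 1 internal nodes come first and the leaves occupy indices
# capacity - 1 through 2 * capacity - 2, which is also why the leaf priorities
# live in tree.tree[-capacity:]. A minimal sketch of that layout, assuming a
# power-of-two capacity; this SumTree is illustrative, not the buffer's actual
# implementation.
import numpy as np

class SumTree:
    def __init__(self, capacity):
        self.capacity = capacity
        self.tree = np.zeros(2 * capacity - 1)  # internal nodes, then leaves

    def update(self, leaf_idx, priority):
        # set the leaf, then propagate the change up to the root
        delta = priority - self.tree[leaf_idx]
        self.tree[leaf_idx] = priority
        while leaf_idx > 0:
            leaf_idx = (leaf_idx - 1) // 2
            self.tree[leaf_idx] += delta

    def total(self):
        return self.tree[0]  # root holds the sum of all leaf priorities
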
def __init__(self, net, target_net, alpha=0.6, beta=0.4, beta_delta=1.001, e=1e-8, **kwargs):
    super(DDQNAgentPER, self).__init__(net, target_net, **kwargs)
    self.memory = PrioritizedReplayBuffer(**kwargs)
    self.__alpha = alpha
    self.__beta = beta
    self.__beta_delta = beta_delta
    self.__e = e

def __init__(self, state_size, action_size, seed):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)

    # Q-Network
    self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
    # Target network (w-)
    self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

    # Replay memory
    if PRIORITIZED_REPLY_ENABLED:
        self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
    else:
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
    self.B = .001

def __init__(self, total_timesteps=100000, buffer_size=50000, type_buffer="HER",
             prioritized_replay=True, prioritized_replay_alpha=0.6,
             prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None,
             prioritized_replay_eps=1e-6):
    self.buffer_size = buffer_size
    self.prioritized_replay_eps = prioritized_replay_eps
    self.type_buffer = type_buffer
    if prioritized_replay:
        if type_buffer == "PER":
            self.replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                         alpha=prioritized_replay_alpha)
        if type_buffer == "HER":
            self.replay_buffer = HighlightReplayBuffer(buffer_size,
                                                       alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = total_timesteps
        self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                            initial_p=prioritized_replay_beta0,
                                            final_p=1.0)
    else:
        self.replay_buffer = ReplayBuffer(buffer_size)
        self.beta_schedule = None

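# The constructor above only builds the buffer and the beta schedule; the loop
# that consumes them is outside this snippet. A hedged sketch of the usual
# baselines-style pairing, written as a hypothetical method on the same class
# (batch_size and compute_td_errors are assumed names, not from the snippet):
def train_step(self, t, batch_size, compute_td_errors):
    (obses, actions, rewards, next_obses, dones, weights,
     batch_idxes) = self.replay_buffer.sample(batch_size,
                                              beta=self.beta_schedule.value(t))
    td_errors = compute_td_errors(obses, actions, rewards, next_obses, dones, weights)
    # the small epsilon keeps every priority strictly positive, so no
    # transition's sampling probability ever collapses to zero
    self.replay_buffer.update_priorities(batch_idxes,
                                         np.abs(td_errors) + self.prioritized_replay_eps)
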
def __init__(self, state_size, action_size, config=RLConfig()):
    self.seed = random.seed(config.seed)
    self.state_size = state_size
    self.action_size = action_size
    self.batch_size = config.batch_size
    self.batch_indices = torch.arange(config.batch_size).long().to(device)
    self.samples_before_learning = config.samples_before_learning
    self.learn_interval = config.learning_interval
    self.parameter_update_interval = config.parameter_update_interval
    self.per_epsilon = config.per_epsilon
    self.tau = config.tau
    self.gamma = config.gamma

    if config.useDuelingDQN:
        self.qnetwork_local = DuelingDQN(state_size, action_size, config.seed).to(device)
        self.qnetwork_target = DuelingDQN(state_size, action_size, config.seed).to(device)
    else:
        self.qnetwork_local = DQN(state_size, action_size, config.seed).to(device)
        self.qnetwork_target = DQN(state_size, action_size, config.seed).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=config.learning_rate)

    self.doubleDQN = config.useDoubleDQN

    self.usePER = config.usePER
    if self.usePER:
        self.memory = PrioritizedReplayBuffer(config.buffer_size, config.per_alpha)
    else:
        self.memory = ReplayBuffer(config.buffer_size)

    self.t_step = 0

def __init__(self, frame_dims, n_actions, priority_replay=True, epsilon=.99, discount=.99):
    self.epsilon_start = epsilon
    self.epsilon = epsilon
    self.epsilon_min = 0.1
    self.final_exploration_frame = 1e6
    self.epsilon_decay = 0.99
    self.learning_rate = 0.00025
    self.discount = discount
    self.frame_dims = frame_dims
    self.n_actions = n_actions
    self.alpha = 0.7
    self.update_freq = 10000
    self.batch_size = 32
    self.tb = TensorBoard(log_dir='./logs', write_graph=True, write_images=False)
    # self.summary_writer = K.summary.FileWriter('./logs/')
    self.beta = 0.5
    self.priority_replay_eps = 1e-6
    self.priority_replay = priority_replay
    self.avg_q = -1
    if priority_replay:
        self.memory = PrioritizedReplayBuffer(100000, self.alpha)
    else:
        self.memory = ReplayBuffer(600000)

def td_learning(args):
    agent = DQNAgent(args)
    replay_memory = PrioritizedReplayBuffer(1000000, args.alpha)
    # eval_game(agent, 500)
    outer = tqdm(range(args.total_steps), desc='Total steps', position=0)
    game = init_game()
    ave_score = 0
    count = 0
    for step in outer:
        board = copy.deepcopy(game.gameboard.board)
        if step < args.start_learn:
            avail_choices = game.gameboard.get_available_choices()
            index = np.random.randint(len(avail_choices))
            choice = avail_choices[index]
        else:
            choice = agent.greedy_policy(board, game.gameboard.get_available_choices())
        next_board, reward = game.input_pos(choice[0], choice[1])
        next_board = copy.deepcopy(next_board)
        #####
        replay_memory.add(board, choice, reward, next_board)
        #####
        if game.termination():
            ave_score += game.gameboard.score
            count += 1
            game = init_game()
        if step >= args.start_learn and step % args.train_freq == 0:
            if count > 0:
                message = "ave score of " + str(count) + " game: " + str(ave_score / count)
                out_fd.write("{} {}\n".format(step, ave_score / count))
                outer.write(message)
                ave_score = 0
                count = 0
            if step == args.start_learn:
                experience = replay_memory.sample(args.start_learn, beta=agent.beta)
            else:
                experience = replay_memory.sample(args.train_data_size, beta=agent.beta)
            boards, choices, rewards, next_boards, weights, batch_idxes = experience
            td_errors = agent.train((boards, choices, rewards, next_boards, weights))
            new_priorities = np.abs(td_errors) + prioritized_replay_eps
            replay_memory.update_priorities(batch_idxes, new_priorities)
            agent.update_target(args.soft_tau)
            agent.update_epsilon()
            agent.update_beta()
    eval_game(agent, 500)
    out_fd.close()

def __init__(self, state_size, action_size, seed, lr_decay=0.9999,
             double_dqn=False, duel_dqn=False, prio_exp=False):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
        lr_decay (float): decay factor for the alpha learning rate
        double_dqn (bool): indicator for Double Deep Q-Network
        duel_dqn (bool): indicator for Dueling Deep Q-Network
        prio_exp (bool): indicator for Prioritized Experience Replay
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)
    self.lr_decay = lr_decay
    self.DOUBLE_DQN = double_dqn
    self.DUEL_DQN = duel_dqn
    self.PRIORITISED_EXPERIENCE = prio_exp

    # Determine which Deep Q-Network to use
    if self.DUEL_DQN:
        self.qnetwork_local = DuelQNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = DuelQNetwork(state_size, action_size, seed).to(device)
    else:
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)

    # Initialize optimizer
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

    # Determine whether prioritized experience replay will be used
    if self.PRIORITISED_EXPERIENCE:
        self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed,
                                              alpha=0.6, beta=0.4, beta_anneal=1.0001)
    else:
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0

class DQNAgentPER(DQNAgentBase):
    def __init__(self, net, target_net, alpha=0.6, beta=0.4, beta_delta=1.001, e=1e-8, **kwargs):
        super(DQNAgentPER, self).__init__(net, target_net, **kwargs)
        self.memory = PrioritizedReplayBuffer(**kwargs)
        self.__alpha = alpha
        self.__beta = beta
        self.__beta_delta = beta_delta
        self.__e = e

    def _learn(self, samples):
        states, actions, rewards, next_states, dones, idxs, probs = samples
        expected_q_values = self.net(states, training=True).gather(1, actions)
        # DQN target
        target_q_values_next = self.target_net(next_states,
                                               training=True).detach().max(1)[0].unsqueeze(1)
        target_q_values = rewards + self.gamma * target_q_values_next * (1 - dones)
        td_err = expected_q_values - target_q_values  # calc td error
        # importance-sampling weights, normalized by their maximum
        weights = (probs * self.memory.size()).pow(-self.__beta).to(self.device)
        weights = weights / weights.max()
        loss = torch.mean(td_err.pow(2).squeeze() * weights)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.memory.update(
            idxs.cpu().numpy(),
            td_err.abs().detach().cpu().numpy().squeeze()**self.__alpha + self.__e)
        return loss.detach().cpu().numpy()

    def step(self, state, action, reward, next_state, done):
        loss = super(DQNAgentPER, self).step(state, action, reward, next_state, done)
        if done:
            self.__beta = min(1., self.__beta * self.__beta_delta)
        return loss

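# The weights line in _learn above implements the PER importance-sampling
# correction w_i = (N * P(i))^(-beta), normalized by max_j w_j. A small
# self-contained numeric check of that formula (NumPy only; the probabilities
# are arbitrary toy values, not taken from the snippet):
import numpy as np

N = 4                                    # transitions currently stored
probs = np.array([0.4, 0.3, 0.2, 0.1])   # sampling probabilities P(i)
beta = 0.4

weights = (N * probs) ** (-beta)
weights /= weights.max()                 # the rarest sample gets weight 1.0
print(weights.round(3))                  # -> [0.574 0.644 0.758 1.   ]
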
def create_replay_buffer(self, prioritized_replay, prioritized_replay_eps, size_buffer,
                         alpha_prioritized_replay, prioritized_replay_beta0,
                         prioritized_replay_beta_iters, steps_total):
    self.prioritized_replay = prioritized_replay
    self.prioritized_replay_eps = prioritized_replay_eps
    if prioritized_replay:
        self.replay_buffer = PrioritizedReplayBuffer(size_buffer,
                                                     alpha=alpha_prioritized_replay)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = steps_total
        self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                            initial_p=prioritized_replay_beta0,
                                            final_p=1.0)
    else:
        self.replay_buffer = ReplayBuffer(size_buffer)
        self.beta_schedule = None

def __init__(self, state_size, action_size, layer_spec, seed=0):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.layer_spec = layer_spec
    self.seed = random.seed(seed)

    # Q-Network
    self.qnetwork_local = QNetwork(state_size, action_size, layer_spec).to(device)
    self.qnetwork_target = QNetwork(state_size, action_size, layer_spec).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

    # (Prioritized) experience replay setup
    self.buffer_size = BUFFER_SIZE
    self.batch_size = BATCH_SIZE
    self.min_prio = MIN_PRIO
    self.alpha = ALPHA
    self.beta = INIT_BETA
    self.beta_increment = BETA_INC
    if USE_PER:
        self.memory = PrioritizedReplayBuffer(size=self.buffer_size, alpha=self.alpha)
    else:
        self.memory = DequeReplayBuffer(action_size=self.action_size,
                                        buffer_size=self.buffer_size,
                                        batch_size=self.batch_size,
                                        seed=42)

    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0

    # print info about the Agent
    print('Units in the hidden layers are {}.'.format(str(layer_spec)))
    print('Using Double-DQN is "{}".'.format(str(USE_DDQN)))
    print('Using prioritized experience replay is "{}".'.format(str(USE_PER)))

def learn(env, args):
    ob = env.reset()
    ob_shape = ob.shape
    num_action = int(env.action_space.n)
    agent = TestAgent(ob_shape, num_action, args)
    replay_buffer = PrioritizedReplayBuffer(args.buffer_size,
                                            alpha=args.prioritized_replay_alpha)
    args.prioritized_replay_beta_iters = args.max_timesteps
    beta_schedule = LinearSchedule(args.prioritized_replay_beta_iters,
                                   initial_p=args.prioritized_replay_beta0,
                                   final_p=1.0)
    episode_rewards = [0.0]
    saved_mean_reward = None
    n_step_seq = []
    agent.sample_noise()
    agent.update_target()
    for t in range(args.max_timesteps):
        action = agent.act(ob)
        new_ob, rew, done, _ = env.step(action)
        replay_buffer.add(ob, action, rew, new_ob, float(done))
        ob = new_ob
        episode_rewards[-1] += rew
        if done:
            ob = env.reset()
            episode_rewards.append(0.0)
            reset = True
        if t > args.learning_starts and t % args.replay_period == 0:
            experience = replay_buffer.sample(args.batch_size, beta=beta_schedule.value(t))
            (obs, actions, rewards, obs_next, dones, weights, batch_idxes) = experience
            agent.sample_noise()
            kl_errors = agent.update(obs, actions, rewards, obs_next, dones, weights)
            replay_buffer.update_priorities(batch_idxes, np.abs(kl_errors) + 1e-6)
        if t > args.learning_starts and t % args.target_network_update_freq == 0:
            agent.update_target()
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and args.print_freq is not None and len(episode_rewards) % args.print_freq == 0:
            print('steps {} episodes {} mean reward {}'.format(t, num_episodes,
                                                               mean_100ep_reward))

def __init__(self, total_timesteps=100000, buffer_size=50000, prioritized_replay=False,
             prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4,
             prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6):
    self.buffer_size = buffer_size
    if prioritized_replay:
        self.replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                     alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = total_timesteps
        # store on self so the schedule survives __init__ (the original bound
        # it to a local variable, which was discarded on return)
        self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                            initial_p=prioritized_replay_beta0,
                                            final_p=1.0)
    else:
        self.replay_buffer = ReplayBuffer(buffer_size)
        self.beta_schedule = None

class DuelingDoubleDQNagent():
    def __init__(self):
        # self.action_space = [0, 1, 2, 3, 4, 5, 6]
        self.action_space = [i for i in range(4 * 7)]  # 28 grouped actions: board 7x14
        self.action_size = len(self.action_space)
        self.next_stone_size = 6
        self.state_size = (rows + 1, cols, 1)
        self.discount_factor = 0.99
        # In the DeepMind paper, data sampled with PER produces larger updates, so for
        # training stability the learning rate was cut to about 1/4 of the value used
        # with the earlier uniform random sampling; that is reflected here.
        # self.learning_rate = 0.00025
        self.learning_rate = 0.0000625
        self.epsilon = 0.  # 1.
        self.epsilon_min = 0.0
        self.epsilon_decay = 1000000  # 1000000
        self.model = self.build_model()
        self.target_model = self.build_model()
        # A custom loss function is defined separately and used for training.
        self.model_updater = self.model_optimizer()
        self.batch_size = 64
        self.train_start = 50000  # 50000

        # PER declaration and related hyperparameters.
        # beta controls how strongly the importance-sampling ratio is applied.
        # Not an exact definition, but very loosely:
        #   larger beta  -> correct the data bias introduced by PER more strongly
        #                   -> less learning from high-TD-error samples, training a bit more stable
        #   smaller beta -> correct the bias introduced by PER less
        #                   -> more learning from high-TD-error samples, training a bit less stable
        # The paper starts beta at 0.4 and anneals it linearly to 1 by the end of training.
        # alpha sets how much the TD error's magnitude is reflected; as a formula, (TD-error)^alpha.
        #   alpha near 0 -> the magnitude is ignored, close to the usual uniform sampling
        #   alpha near 1 -> the magnitude is fully reflected, close to pure PER
        # The paper uses alpha = 0.6.
        # prioritized_replay_eps is a very small constant added to the TD error when
        # computing (TD-error)^alpha, to guard against a TD error of exactly zero.
        self.memory = PrioritizedReplayBuffer(1000000, alpha=0.6)  # 1000000
        self.beta = 0.4  # 0.4
        self.beta_max = 1.0
        self.beta_decay = 2000000  # 5000000
        self.prioritized_replay_eps = 0.000001

        # TensorBoard setup
        self.sess = tf.InteractiveSession()
        K.set_session(self.sess)
        self.summary_placeholders, self.update_ops, self.summary_op = self.setup_summary()
        self.summary_writer = tf.summary.FileWriter('summary/tetris_dqn', self.sess.graph)
        self.sess.run(tf.global_variables_initializer())

        self.load_model = True
        if self.load_model:
            self.model.load_weights("./DQN_tetris_model_0311.h5")
        self.imitation_mode = False

    # Record training statistics for each episode.
    def setup_summary(self):
        episode_total_reward = tf.Variable(0.)
        episode_avg_max_q = tf.Variable(0.)
        episode_duration = tf.Variable(0.)
        episode_avg_loss = tf.Variable(0.)
        tf.summary.scalar('Total Reward/Episode', episode_total_reward)
        tf.summary.scalar('Total Clear Line/Episode', episode_avg_max_q)
        # tf.summary.scalar('Duration/Episode', episode_duration)
        # tf.summary.scalar('Average Loss/Episode', episode_avg_loss)
        # tf.train.AdamOptimizer
        summary_vars = [episode_total_reward, episode_avg_max_q,
                        episode_duration, episode_avg_loss]
        summary_placeholders = [tf.placeholder(tf.float32)
                                for _ in range(len(summary_vars))]
        update_ops = [summary_vars[i].assign(summary_placeholders[i])
                      for i in range(len(summary_vars))]
        summary_op = tf.summary.merge_all()
        return summary_placeholders, update_ops, summary_op

    def build_model(self):
        # Dueling DQN
        state = Input(shape=(self.state_size[0], self.state_size[1], self.state_size[2]))
        layer = Conv2D(32, (5, 5), strides=(1, 1), activation='relu', padding='same',
                       kernel_initializer='he_uniform')(state)  # 64, (4, 4)
        layer = Conv2D(32, (3, 3), strides=(1, 1), activation='relu', padding='same',
                       kernel_initializer='he_uniform')(layer)
        ## layer = Conv2D(32, (1, 1), strides=(1, 1), activation='relu', padding='same', kernel_initializer='he_uniform')(layer)
        ## layer = Conv2D(32, (3, 3), strides=(1, 1), activation='relu', padding='same', kernel_initializer='he_uniform')(layer)
        ## layer = Conv2D(32, (1, 1), strides=(1, 1), activation='relu', padding='same', kernel_initializer='he_uniform')(layer)
        ##
        pool_1 = MaxPooling2D(pool_size=(3, 3), strides=(1, 1), padding='valid',
                              data_format=None)(layer)
        layer_2 = Conv2D(64, (3, 3), strides=(1, 1), activation='relu', padding='same',
                         kernel_initializer='he_uniform')(pool_1)
        ## layer_2 = Conv2D(32, (1, 1), strides=(1, 1), activation='relu', padding='same', kernel_initializer='he_uniform')(layer_2)
        ## layer_2 = Conv2D(64, (3, 3), strides=(1, 1), activation='relu', padding='same', kernel_initializer='he_uniform')(layer_2)
        pool_2 = MaxPooling2D(pool_size=(2, 2), strides=(1, 1), padding='valid',
                              data_format=None)(layer_2)
        layer_r = Conv2D(32, (rows + 1, 1), strides=(1, 1), activation='relu', padding='same',
                         kernel_initializer='he_uniform')(state)
        layer_c = Conv2D(32, (1, cols), strides=(1, 1), activation='relu', padding='same',
                         kernel_initializer='he_uniform')(state)
        pool_1_r = Conv2D(32, (13, 1), strides=(1, 1), activation='relu', padding='same',
                          kernel_initializer='he_uniform')(pool_1)
        pool_1_c = Conv2D(32, (1, 5), strides=(1, 1), activation='relu', padding='same',
                          kernel_initializer='he_uniform')(pool_1)
        pool_2_r = Conv2D(32, (12, 1), strides=(1, 1), activation='relu', padding='same',
                          kernel_initializer='he_uniform')(pool_2)
        pool_2_c = Conv2D(32, (1, 4), strides=(1, 1), activation='relu', padding='same',
                          kernel_initializer='he_uniform')(pool_2)
        layer = Flatten()(layer)
        layer_2 = Flatten()(layer_2)
        pool_1 = Flatten()(pool_1)
        pool_2 = Flatten()(pool_2)
        layer_r = Flatten()(layer_r)
        layer_c = Flatten()(layer_c)
        pool_1_r = Flatten()(pool_1_r)
        pool_1_c = Flatten()(pool_1_c)
        pool_2_r = Flatten()(pool_2_r)
        pool_2_c = Flatten()(pool_2_c)
        merge_layer = concatenate([layer, layer_2, pool_1, pool_2, pool_1_c, pool_1_r,
                                   pool_2_c, pool_2_r, layer_c, layer_r], axis=1)
        merge_layer = Dense(128, activation='relu',
                            kernel_initializer='he_uniform')(merge_layer)
        vlayer = Dense(64, activation='relu', kernel_initializer='he_uniform')(merge_layer)
        alayer = Dense(64, activation='relu', kernel_initializer='he_uniform')(merge_layer)
        v = Dense(1, activation='linear', kernel_initializer='he_uniform')(vlayer)
        v = Lambda(lambda v: tf.tile(v, [1, self.action_size]))(v)
        a = Dense(self.action_size, activation='linear',
                  kernel_initializer='he_uniform')(alayer)
        a = Lambda(lambda a: a - tf.reduce_mean(a, axis=-1, keep_dims=True))(a)
        q = Add()([v, a])
        model = Model(inputs=state, outputs=q)
        # The compile call is commented out because a custom loss and optimizer are used.
        # model.compile(loss='logcosh', optimizer=Adam(lr=self.learning_rate))
        model.summary()
        return model

    def update_target_model(self):
        self.target_model.set_weights(self.model.get_weights())

    '''
    def get_action(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        else:
            state = np.float32(state)
            q_values = self.model.predict(state)
            return np.argmax(q_values[0])

    def get_action(self, env, state):
        if np.random.rand() <= self.epsilon:
            if env.new_stone_flag:
                return random.randrange(4)
            else:
                return random.randrange(self.action_size)
        else:
            state = np.float32(state)
            q_values = self.model.predict(state)
            return np.argmax(q_values[0])
    '''

    def get_action(self, env, state):
        if np.random.rand() <= self.epsilon:
            if env.stone_number(env.stone) == 1:
                return random.randrange(14)
            elif env.stone_number(env.stone) == 4 or env.stone_number(env.stone) == 6:
                return random.randrange(2) * 7 + random.randrange(6)
            elif env.stone_number(env.stone) == 2 or env.stone_number(env.stone) == 5 \
                    or env.stone_number(env.stone) == 7:
                return random.randrange(4) * 7 + random.randrange(6)
            elif env.stone_number(env.stone) == 3:
                return random.randrange(6)
        else:
            state = np.float32(state)
            q_values = self.model.predict(state)
            return np.argmax(q_values[0])

    def model_optimizer(self):
        target = K.placeholder(shape=[None, self.action_size])
        weight = K.placeholder(shape=[None, ])
        # Huber loss.
        clip_delta = 1.0
        pred = self.model.output
        err = target - pred
        cond = K.abs(err) < clip_delta
        squared_loss = 0.5 * K.square(err)
        linear_loss = clip_delta * (K.abs(err) - 0.5 * clip_delta)
        loss1 = tf.where(cond, squared_loss, linear_loss)
        # PER loss: the Huber loss multiplied by the importance-sampling ratio.
        weighted_loss = tf.multiply(tf.expand_dims(weight, -1), loss1)
        loss = K.mean(weighted_loss, axis=-1)
        optimizer = Adam(lr=self.learning_rate)
        updates = optimizer.get_updates(self.model.trainable_weights, [], loss)
        train = K.function([self.model.input, target, weight], [err], updates=updates)
        return train

    def train_model(self):
        (update_input, action, reward, update_target, done, weight,
         batch_idxes) = self.memory.sample(self.batch_size, beta=self.beta)
        target = self.model.predict(update_input)
        target_val = self.target_model.predict(update_target)
        target_val_arg = self.model.predict(update_target)
        # Double DQN: the online model picks the argmax action,
        # the target model evaluates it.
        for i in range(self.batch_size):
            if done[i]:
                target[i][action[i]] = reward[i]
            else:
                a = np.argmax(target_val_arg[i])
                target[i][action[i]] = reward[i] + self.discount_factor * target_val[i][a]
        # Train on the mini-batch sampled from PER; err is returned separately so
        # the freshly computed TD errors can be written back into the buffer.
        err = self.model_updater([update_input, target, weight])
        err = np.reshape(err, [self.batch_size, self.action_size])
        # Only the acted entry of each row of err is nonzero (target equals the
        # prediction everywhere else), so summing over the action axis recovers the
        # per-sample TD error; a small constant keeps priorities away from zero.
        new_priorities = np.abs(np.sum(err, axis=1)) + self.prioritized_replay_eps
        # Update the newly computed TD errors for the sampled transitions.
        self.memory.update_priorities(batch_idxes, new_priorities)

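# The class above stores beta, beta_max, and beta_decay, but the annealing step
# itself is not part of the snippet. Under the linear 0.4 -> 1.0 schedule its
# comments describe, a plausible per-step update would look like the following
# hypothetical method (the name update_beta is an assumption):
def update_beta(self):
    # anneal beta linearly from its initial 0.4 toward beta_max = 1.0
    # over roughly beta_decay training steps
    self.beta = min(self.beta_max, self.beta + (self.beta_max - 0.4) / self.beta_decay)
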
class DQNAgent:
    def __init__(
            self,
            env,
            memory_size,
            batch_size,
            target_update=100,
            gamma=0.99,
            # replay parameters
            alpha=0.2,
            beta=0.6,
            prior_eps=1e-6,
            # Categorical DQN parameters
            v_min=0,
            v_max=200,
            atom_size=51,
            # N-step Learning
            n_step=3,
            start_train=32,
            save_weights=True,
            log=True,
            lr=0.001,
            seed=0,
            episodes=200):
        self.env = env
        obs_dim = self.env.observation_dim
        action_dim = self.env.action_dim
        self.batch_size = batch_size
        self.target_update = target_update
        self.gamma = gamma
        self.lr = lr
        self.memory_size = memory_size
        self.seed = seed

        # device: cpu / gpu
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(self.device)

        # memory for 1-step learning
        self.beta = beta
        self.prior_eps = prior_eps
        self.memory = PrioritizedReplayBuffer(obs_dim, memory_size, batch_size, alpha=alpha)

        # memory for N-step learning
        self.use_n_step = True if n_step > 1 else False
        if self.use_n_step:
            self.n_step = n_step
            self.memory_n = ReplayBuffer(obs_dim, memory_size, batch_size,
                                         n_step=n_step, gamma=gamma)

        # Categorical DQN parameters
        self.v_min = v_min
        self.v_max = v_max
        self.atom_size = atom_size
        self.support = torch.linspace(self.v_min, self.v_max,
                                      self.atom_size).to(self.device)

        # networks: dqn, dqn_target
        self.dqn = Network(obs_dim, action_dim, self.atom_size, self.support).to(self.device)
        self.dqn_target = Network(obs_dim, action_dim, self.atom_size,
                                  self.support).to(self.device)
        self.dqn_target.load_state_dict(self.dqn.state_dict())
        self.dqn_target.eval()

        # optimizer
        self.optimizer = optim.Adam(self.dqn.parameters(), lr=self.lr)

        # transition to store in memory
        self.transition = list()

        self.fig, (self.ax1, self.ax2) = plt.subplots(2, figsize=(10, 10))
        self.start_train = start_train
        self.save_weights = save_weights
        self.time = datetime.datetime.now().timetuple()
        self.path = f"weights/{self.time[2]}-{self.time[1]}-{self.time[0]}_{self.time[3]}-{self.time[4]}"
        self.log = log
        self.episode_cnt = 0
        self.episodes = episodes
        if self.save_weights is True:
            self.create_save_directory()
        plt.ion()

    def create_save_directory(self):
        try:
            os.mkdir(self.path)
        except OSError:
            print("Creation of the directory %s failed" % self.path)
        else:
            print("Successfully created the directory %s " % self.path)

    def select_action(self, state):
        """Select an action from the input state."""
        # NoisyNet: no epsilon-greedy action selection
        selected_action = self.dqn(torch.FloatTensor(state).to(self.device)).argmax()
        selected_action = selected_action.detach().cpu().numpy()
        self.transition = [state, selected_action]
        return selected_action

    def step(self, action):
        """Take an action and return the response of the env."""
        next_state, reward, done = self.env.step(action)
        self.transition += [reward, next_state, done]
        # N-step transition
        if self.use_n_step:
            one_step_transition = self.memory_n.store(*self.transition)
        # 1-step transition
        else:
            one_step_transition = self.transition
        # add a single step transition
        if one_step_transition:
            self.memory.store(*one_step_transition)
        return next_state, reward, done

    def update_model(self):
        """Update the model by gradient descent."""
        # PER needs beta to calculate weights
        samples = self.memory.sample_batch(self.beta)
        weights = torch.FloatTensor(samples["weights"].reshape(-1, 1)).to(self.device)
        indices = samples["indices"]

        # 1-step learning loss
        elementwise_loss = self._compute_dqn_loss(samples, self.gamma)
        # PER: importance sampling before averaging
        loss = torch.mean(elementwise_loss * weights)

        # N-step learning loss: the 1-step and n-step losses are combined to
        # prevent high variance; the original Rainbow employs the n-step loss only.
        if self.use_n_step:
            gamma = self.gamma**self.n_step
            samples = self.memory_n.sample_batch_from_idxs(indices)
            elementwise_loss_n_loss = self._compute_dqn_loss(samples, gamma)
            elementwise_loss += elementwise_loss_n_loss
            # PER: importance sampling before averaging
            loss = torch.mean(elementwise_loss * weights)

        self.optimizer.zero_grad()
        loss.backward()
        # print(loss)
        clip_grad_norm_(self.dqn.parameters(), 10.0)
        self.optimizer.step()

        # PER: update priorities
        loss_for_prior = elementwise_loss.detach().cpu().numpy()
        new_priorities = loss_for_prior + self.prior_eps
        self.memory.update_priorities(indices, new_priorities)

        # NoisyNet: reset noise
        self.dqn.reset_noise()
        self.dqn_target.reset_noise()

        return loss.item()

    def train(self, num_frames, plotting_interval=100):
        """Train the agent."""
        if self.log:
            pass
            # config = {'gamma': self.gamma, 'log_interval': plotting_interval,
            #           'learning_rate': self.lr, 'directory': self.path, 'type': 'dqn',
            #           'replay_memory': self.memory_size, 'environment': 'normal',
            #           'seed': self.seed}
            # wandb.init(project='is_os', entity='pydqn', config=config,
            #            notes=self.env.reward_function, reinit=True, tags=['report'])
            # wandb.watch(self.dqn)
        self.env.reset()
        state = self.env.get_state()
        won = False
        update_cnt = 0
        losses = []
        scores = []
        score = 0
        frame_cnt = 0
        self.episode_cnt = 0

        for frame_idx in range(1, num_frames + 1):
            frame_cnt += 1
            action = self.select_action(state)
            next_state, reward, done = self.step(action)
            state = next_state
            score += reward

            # PER: anneal beta toward 1
            fraction = min(frame_cnt / num_frames, 1.0)
            self.beta = self.beta + fraction * (1.0 - self.beta)

            # if the agent has run 500 frames, terminate the episode
            if frame_cnt == 500:
                done = True

            # if the episode ends
            if done:
                if reward > 0:
                    won = True
                self.env.reset()
                state = self.env.get_state()
                self.episode_cnt += 1
                scores.append(score)
                score = 0
                frame_cnt = 0

            # if training is ready
            if len(self.memory) >= self.batch_size:
                loss = self.update_model()
                losses.append(loss)
                update_cnt += 1
                # if a hard update is needed
                if update_cnt % self.target_update == 0:
                    self._target_hard_update()

            # plotting
            if frame_idx % plotting_interval == 0:
                self._plot(frame_idx, scores, losses)
            if frame_idx % 1000 == 0:
                torch.save(self.dqn.state_dict(), f'{self.path}/{frame_idx}.tar')
                print(f"model saved at:\n {self.path}/{frame_idx}.tar")

        # wandb.run.summary['won'] = won
        self.env.close()

    def _compute_dqn_loss(self, samples, gamma):
        """Return categorical dqn loss."""
        device = self.device  # for shortening the following lines
        state = torch.FloatTensor(samples["obs"]).to(device)
        next_state = torch.FloatTensor(samples["next_obs"]).to(device)
        action = torch.LongTensor(samples["acts"]).to(device)
        reward = torch.FloatTensor(samples["rews"].reshape(-1, 1)).to(device)
        done = torch.FloatTensor(samples["done"].reshape(-1, 1)).to(device)

        # Categorical DQN algorithm
        delta_z = float(self.v_max - self.v_min) / (self.atom_size - 1)

        with torch.no_grad():
            # Double DQN
            next_action = self.dqn(next_state).argmax(1)
            next_dist = self.dqn_target.dist(next_state)
            next_dist = next_dist[range(self.batch_size), next_action]

            t_z = reward + (1 - done) * gamma * self.support
            t_z = t_z.clamp(min=self.v_min, max=self.v_max)
            b = (t_z - self.v_min) / delta_z
            l = b.floor().long()
            u = b.ceil().long()

            offset = (torch.linspace(0, (self.batch_size - 1) * self.atom_size,
                                     self.batch_size).long().unsqueeze(1).expand(
                                         self.batch_size, self.atom_size).to(self.device))

            proj_dist = torch.zeros(next_dist.size(), device=self.device)
            proj_dist.view(-1).index_add_(0, (l + offset).view(-1),
                                          (next_dist * (u.float() - b)).view(-1))
            proj_dist.view(-1).index_add_(0, (u + offset).view(-1),
                                          (next_dist * (b - l.float())).view(-1))

        dist = self.dqn.dist(state)
        log_p = torch.log(dist[range(self.batch_size), action])
        elementwise_loss = -(proj_dist * log_p).sum(1)
        return elementwise_loss

    def _target_hard_update(self):
        """Hard update: target <- local."""
        self.dqn_target.load_state_dict(self.dqn.state_dict())

    def _plot(self, frame_cnt, scores, losses):
        self.ax1.cla()
        self.ax1.set_title(f'frames: {frame_cnt} score: {np.mean(scores[-10:])}')
        self.ax1.plot(scores[-999:], color='red')
        self.ax2.cla()
        self.ax2.set_title(f'loss: {np.mean(losses[-10:])}')
        self.ax2.plot(losses[-999:], color='blue')
        plt.show()
        plt.pause(0.1)
        # needed for wandb to not log nans
        # if frame_cnt < self.start_train + 11:
        #     loss = 0
        # else:
        #     loss = np.mean(losses[-10:])
        if self.log:
            pass

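# For intuition on the gamma**n_step discount used in update_model above: with
# n_step = 3 the n-step target is G = r0 + gamma*r1 + gamma^2*r2 + gamma^3 * Q(s3, a3),
# so the bootstrap term is discounted by gamma**n_step. A toy check with
# arbitrary numbers (not taken from the snippet):
gamma, n_step = 0.99, 3
rewards = [1.0, 0.0, 2.0]   # r0, r1, r2
bootstrap_q = 5.0           # Q(s3, a3)
g = sum((gamma ** k) * r for k, r in enumerate(rewards)) + (gamma ** n_step) * bootstrap_q
print(round(g, 4))          # -> 7.8117
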
class Agent:
    # @todo: when instantiating two of these, it raises an Exception, because it tries
    # @todo: to redefine the scopes or variables (the names are already taken)
    # @todo: FIX THIS !!!
    """
    We don't use the bundle entropy method to optimize wrt actions,
    but rather plain SGD (or rather Adam).
    """

    def __init__(self, dimO, dimA, beta, layers_dim, finalize_graph=True):
        """
        :param finalize_graph: if you want to restore a model, using .restore(),
            set this param to False
        """
        self.dimA = dimA
        self.dimO = dimO
        self.beta = beta
        self.layers_dim = layers_dim

        tau = FLAGS.tau
        discount = FLAGS.discount
        l2norm = FLAGS.l2norm
        learning_rate = FLAGS.rate

        self.opt = self.adam
        self.rm = PrioritizedReplayBuffer(FLAGS.rmsize, FLAGS.alpha)
        self.sess = tf.Session(config=tf.ConfigProto(
            inter_op_parallelism_threads=FLAGS.thread,
            log_device_placement=False,
            allow_soft_placement=True))
        self.noise = np.zeros(self.dimA)

        per_weights = tf.placeholder(tf.float32, [None], 'per_weights')
        obs = tf.placeholder(tf.float32, [None, dimO], "obs")
        act = tf.placeholder(tf.float32, [None, dimA], "act")
        rew = tf.placeholder(tf.float32, [None], "rew")

        with tf.variable_scope('q'):
            negQ = self.negQ(obs, act)
        q = -negQ
        act_grad, = tf.gradients(negQ, act)

        obs_target = tf.placeholder(tf.float32, [None, dimO], "obs_target")
        act_target = tf.placeholder(tf.float32, [None, dimA], "act_target")
        term_target = tf.placeholder(tf.bool, [None], "term_target")

        with tf.variable_scope('q_target'):
            negQ_target = self.negQ(obs_target, act_target)
        act_target_grad, = tf.gradients(negQ_target, act_target)
        q_target = -negQ_target

        y = tf.where(term_target, rew, rew + discount * q_target)
        y = tf.maximum(q - 1., y)
        y = tf.minimum(q + 1., y)
        y = tf.stop_gradient(y)
        print('y shape', y.get_shape())
        print('q shape', q.get_shape())

        td_error = q - y
        print('per weights shape', per_weights.get_shape())
        print('multi td error^2 per weights shape',
              tf.multiply(tf.square(td_error), per_weights).get_shape())
        ms_td_error = tf.reduce_sum(tf.multiply(tf.square(td_error), per_weights), 0)
        print('ms td error shape', ms_td_error.get_shape())

        regLosses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES, scope='q/')
        loss_q = ms_td_error + \
            l2norm * tf.reduce_sum(regLosses) + \
            FLAGS.alpha_beyond * tf.reduce_sum(
                tf.where(q > FLAGS.RMAX, tf.square(q - FLAGS.RMAX),
                         tf.zeros((FLAGS.bsize,))) +
                tf.where(q < FLAGS.RMIN, tf.square(q - FLAGS.RMIN),
                         tf.zeros((FLAGS.bsize,))),
                0)

        self.theta_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='q/')
        self.theta_cvx_ = [v for v in self.theta_
                           if 'proj' in v.name and 'W:' in v.name]
        self.makeCvx = [v.assign(tf.abs(v)) for v in self.theta_cvx_]
        self.proj = [v.assign(tf.maximum(v, 0)) for v in self.theta_cvx_]
        self.theta_target_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                               scope='q_target/')
        update_target = [theta_target_i.assign_sub(tau * (theta_target_i - theta_i))
                         for theta_i, theta_target_i in zip(self.theta_, self.theta_target_)]

        optim_q = tf.train.AdamOptimizer(learning_rate=learning_rate)
        grads_and_vars_q = optim_q.compute_gradients(loss_q)
        optimize_q = optim_q.apply_gradients(grads_and_vars_q)

        summary_writer = tf.summary.FileWriter(os.path.join(FLAGS.outdir, 'board'),
                                               self.sess.graph)
        tf.summary.scalar('Qvalue (batch avg)', tf.reduce_mean(q))
        tf.summary.scalar('Qvalue (batch max)', tf.reduce_max(q))
        tf.summary.scalar('Qvalue (batch min)', tf.reduce_min(q))
        tf.summary.scalar('Q targets (batch avg)', tf.reduce_mean(q_target))
        tf.summary.scalar('Q targets (batch min)', tf.reduce_min(q_target))
        tf.summary.scalar('Q targets (batch max)', tf.reduce_max(q_target))
        tf.summary.scalar('loss', ms_td_error)
        tf.summary.scalar('td error', tf.reduce_mean(tf.abs(td_error)))
        tf.summary.scalar('reward', tf.reduce_mean(rew))
        tf.summary.scalar('chosen actions', tf.reduce_mean(act))
        tf.summary.scalar('maximizing action (batch avg)', tf.reduce_mean(act_target))
        tf.summary.scalar('maximizing action (batch max)', tf.reduce_max(act_target))
        tf.summary.scalar('maximizing action (batch min)', tf.reduce_min(act_target))
        merged = tf.summary.merge_all()

        # tf functions
        with self.sess.as_default():
            self._train = Fun([obs, act, rew, obs_target, act_target, term_target, per_weights],
                              [optimize_q, update_target, loss_q, tf.abs(td_error), q, q_target],
                              merged, summary_writer)
            self._fg = Fun([obs, act], [negQ, act_grad])
            self._fg_target = Fun([obs_target, act_target], [negQ_target, act_target_grad])

        # initialize tf variables
        self.saver = tf.train.Saver(max_to_keep=100)
        ckpt = tf.train.latest_checkpoint(FLAGS.outdir + "/tf")
        if ckpt:
            self.saver.restore(self.sess, ckpt)
        else:
            self.sess.run(tf.global_variables_initializer())
            self.sess.run(self.makeCvx)
            self.sess.run([theta_target_i.assign(theta_i)
                           for theta_i, theta_target_i in zip(self.theta_, self.theta_target_)])
        if finalize_graph:
            self.sess.graph.finalize()

        self.t = 0  # global training time (number of observations)

    def adam(self, func, obs, plot=False):
        """Optimizer to find the greedy action"""
        # if npr.random() < 1. / 20:
        #     plot = True
        b1 = 0.9
        b2 = 0.999
        lam = 0.5
        eps = 1e-8
        alpha = 0.01
        nBatch = obs.shape[0]
        act = np.zeros((nBatch, self.dimA))
        m = np.zeros_like(act)
        v = np.zeros_like(act)
        b1t, b2t = 1., 1.
        act_best, a_diff, f_best = [None] * 3
        hist = {'act': [], 'f': [], 'g': []}
        for i in range(1000):
            f, g = func(obs, act)
            if plot:
                hist['act'].append(act.copy())
                hist['f'].append(f)
                hist['g'].append(g)
            if i == 0:
                act_best = act.copy()
                f_best = f.copy()
            else:
                prev_act_best = act_best.copy()
                I = (f < f_best)
                act_best[I] = act[I]
                f_best[I] = f[I]
                a_diff_i = np.mean(np.linalg.norm(act_best - prev_act_best, axis=1))
                a_diff = a_diff_i if a_diff is None \
                    else lam * a_diff + (1. - lam) * a_diff_i
                # print(a_diff_i, a_diff, np.sum(f))
                if a_diff < 1e-3 and i > 5:
                    if plot:
                        self.adam_plot(func, obs, hist)
                    return act_best
            m = b1 * m + (1. - b1) * g
            v = b2 * v + (1. - b2) * (g * g)
            b1t *= b1
            b2t *= b2
            mhat = m / (1. - b1t)
            vhat = v / (1. - b2t)
            act -= alpha * mhat / (np.sqrt(v) + eps)
            act = np.clip(act, FLAGS.a_min + 1e-8, FLAGS.a_max - 1e-8)
        print(' + Warning: Adam did not converge.')
        if plot:
            self.adam_plot(func, obs, hist)
        return act_best

    def adam_plot(self, func, obs, hist):
        hist['act'] = np.array(hist['act']).T
        hist['f'] = np.array(hist['f']).T
        hist['g'] = np.array(hist['g']).T
        if self.dimA == 1:
            xs = np.linspace(-1. + 1e-8, 1. - 1e-8, 100)
            ys = [func(obs[[0], :], [[xi]])[0] for xi in xs]
            fig = plt.figure()
            plt.plot(xs, ys)
            plt.plot(hist['act'][0, 0, :], hist['f'][0, :], label='Adam')
            plt.legend()
            fname = os.path.join(FLAGS.outdir, 'adamPlt.png')
            print("Saving Adam plot to {}".format(fname))
            plt.savefig(fname)
            plt.close(fig)
        elif self.dimA == 2:
            assert (False)
        else:
            xs = npr.uniform(-1., 1., (5000, self.dimA))
            ys = np.array([func(obs[[0], :], [xi])[0] for xi in xs])
            epi = np.hstack((xs, ys))
            pca = PCA(n_components=2).fit(epi)
            W = pca.components_[:, :-1]
            xs_proj = xs.dot(W.T)
            fig = plt.figure()
            X = Y = np.linspace(xs_proj.min(), xs_proj.max(), 100)
            Z = griddata(xs_proj[:, 0], xs_proj[:, 1], ys.ravel(), X, Y, interp='linear')
            plt.contourf(X, Y, Z, 15)
            plt.colorbar()
            adam_x = hist['act'][:, 0, :].T
            adam_x = adam_x.dot(W.T)
            plt.plot(adam_x[:, 0], adam_x[:, 1], label='Adam', color='k')
            plt.legend()
            fname = os.path.join(FLAGS.outdir, 'adamPlt.png')
            print("Saving Adam plot to {}".format(fname))
            plt.savefig(fname)
            plt.close(fig)

    def reset(self, obs):
        self.noise = np.zeros(self.dimA)
        self.observation = obs  # initial observation

    def act(self, test=False):
        """
        Greedily choose an action.
        There is noise during training.
        """
        with self.sess.as_default():
            obs = np.expand_dims(self.observation, axis=0)
            f = self._fg
            tflearn.is_training(False)
            action = self.opt(f, obs)
            tflearn.is_training(not test)
            if not test:
                # sig = (self.t < 40000) * (self.t * (FLAGS.ousigma_end - FLAGS.ousigma_start) / 40000 + FLAGS.ousigma_start) + (self.t >= 40000) * FLAGS.ousigma_end
                # self.noise = sig * npr.randn(self.dimA)
                self.noise -= FLAGS.outheta * self.noise - FLAGS.ousigma * npr.randn(self.dimA)
                action += self.noise
            action = np.clip(action, FLAGS.a_min, FLAGS.a_max)
            self.action = np.atleast_1d(np.squeeze(action, axis=0))
            return self.action

    def observe(self, rew, term, obs2, test=False):
        obs1 = self.observation
        self.observation = obs2
        # train
        if not test:
            self.rm.add(*(obs1, self.action, rew, obs2, term))
            if self.t > FLAGS.warmup:
                for i in range(FLAGS.iter):
                    loss = self.train()

    def train(self):
        self.t += 1
        beta = self.beta(self.t)
        with self.sess.as_default():
            obs, act, rew, ob2, term2, w, idx = self.rm.sample(FLAGS.bsize, beta)
            rew, term2, w = rew.squeeze(), term2.squeeze(), w.squeeze()  # fix dimensions
            # w = np.ones(w.shape)  # no prioritization
            f = self._fg_target
            tflearn.is_training(False)
            act2 = self.opt(f, ob2)
            tflearn.is_training(True)
            _, _, loss, td_error, q, q_target = self._train(obs, act, rew, ob2, act2, term2, w,
                                                            log=FLAGS.summary,
                                                            global_step=self.t)
            self.sess.run(self.proj)  # keep some weights positive
            # self.rm.update_priorities(idx, np.array(td_error.shape[0] * [1.]))  # no prioritization
            self.rm.update_priorities(idx, td_error + 1e-2)
            return loss, td_error, q, q_target

    def negQ(self, x, y, reuse=False):
        """Architecture of the neural network"""
        print('x shape', x.get_shape())
        print('y shape', y.get_shape())
        szs = self.layers_dim
        assert (len(szs) >= 1)
        fc = tflearn.fully_connected
        bn = tflearn.batch_normalization
        lrelu = tflearn.activations.leaky_relu
        if reuse:
            tf.get_variable_scope().reuse_variables()
        nLayers = len(szs)
        us = []
        zs = []
        z_zs = []
        z_ys = []
        z_us = []
        reg = 'L2'

        prevU = x
        for i in range(nLayers):
            with tf.variable_scope('u' + str(i), reuse=reuse) as s:
                u = fc(prevU, szs[i], reuse=reuse, scope=s, regularizer=reg)
                if i < nLayers - 1:
                    u = tf.nn.relu(u)
                    if FLAGS.icnn_bn:
                        u = bn(u, reuse=reuse, scope=s, name='bn')
                variable_summaries(u, suffix='u{}'.format(i))
                us.append(u)
                prevU = u

        prevU, prevZ = x, y
        for i in range(nLayers + 1):
            sz = szs[i] if i < nLayers else 1
            z_add = []
            if i > 0:
                with tf.variable_scope('z{}_zu_u'.format(i), reuse=reuse) as s:
                    zu_u = fc(prevU, szs[i - 1], reuse=reuse, scope=s, activation='relu',
                              bias=True, regularizer=reg,
                              bias_init=tf.constant_initializer(1.))
                    variable_summaries(zu_u, suffix='zu_u{}'.format(i))
                with tf.variable_scope('z{}_zu_proj'.format(i), reuse=reuse) as s:
                    z_zu = fc(tf.multiply(prevZ, zu_u), sz, reuse=reuse, scope=s,
                              bias=False, regularizer=reg)
                    variable_summaries(z_zu, suffix='z_zu{}'.format(i))
                z_zs.append(z_zu)
                z_add.append(z_zu)
            with tf.variable_scope('z{}_yu_u'.format(i), reuse=reuse) as s:
                yu_u = fc(prevU, self.dimA, reuse=reuse, scope=s, bias=True,
                          regularizer=reg, bias_init=tf.constant_initializer(1.))
                variable_summaries(yu_u, suffix='yu_u{}'.format(i))
            with tf.variable_scope('z{}_yu'.format(i), reuse=reuse) as s:
                z_yu = fc(tf.multiply(y, yu_u), sz, reuse=reuse, scope=s,
                          bias=False, regularizer=reg)
                z_ys.append(z_yu)
                variable_summaries(z_yu, suffix='z_yu{}'.format(i))
            z_add.append(z_yu)
            with tf.variable_scope('z{}_u'.format(i), reuse=reuse) as s:
                z_u = fc(prevU, sz, reuse=reuse, scope=s, bias=True,
                         regularizer=reg, bias_init=tf.constant_initializer(0.))
                variable_summaries(z_u, suffix='z_u{}'.format(i))
            z_us.append(z_u)
            z_add.append(z_u)
            z = tf.add_n(z_add)
            variable_summaries(z, suffix='z{}_preact'.format(i))
            if i < nLayers:
                # z = tf.nn.relu(z)
                z = lrelu(z, alpha=FLAGS.lrelu)
                variable_summaries(z, suffix='z{}_act'.format(i))
            zs.append(z)
            prevU = us[i] if i < nLayers else None
            prevZ = z

        print('z shape', z.get_shape())
        z = tf.reshape(z, [-1], name='energies')
        return z

    def save(self, path):
        self.saver.save(self.sess, path)

    def restore(self, filename):
        """
        IMPORTANT: `filename` should be the filepath to the 4 following files:
            - 50314.index
            - 50314.meta
            - 50314.data-00000-of-00001
            - checkpoint
        Note that it shouldn't include any extension. In this case, it would
        therefore be `tensorboard/models/50314`. It is `50314` because the
        global training step was used as the filename when saving the model.
        !!!! BESIDES, YOU SHOULD HAVE INSTANTIATED THE AGENT WITH
        `finalize_graph=False` !!!!
        """
        self.saver = tf.train.import_meta_graph(filename + '.meta')
        self.saver.restore(self.sess, filename)
        self.sess.graph.finalize()

    def __del__(self):
        self.sess.close()

def main(): # env = gym.make("CartPoleRob-v0") # env = gym.make("CartPole-v0") # env = gym.make("CartPole-v1") # env = gym.make("Acrobot-v1") # env = gym.make("MountainCarRob-v0") # env = gym.make("FrozenLake-v0") # env = gym.make("FrozenLake8x8-v0") # env = gym.make("FrozenLake8x8rob-v0") # env = gym.make("FrozenLake16x16rob-v0") env = gym.make("TestRob3-v0") # same as getDeictic except this one just calculates for the observation # input: n x n x channels # output: dn x dn x channels def getDeicticObs(obses_t, windowLen): deicticObses_t = [] for i in range(np.shape(obses_t)[0] - windowLen): for j in range(np.shape(obses_t)[1] - windowLen): deicticObses_t.append(obses_t[i:i+windowLen,j:j+windowLen,:]) return np.array(deicticObses_t) # get set of deictic alternatives # input: batch x n x n x channels # output: (batch x deictic) x dn x dn x channels def getDeictic(obses_t, actions, obses_tp1, weights, windowLen): deicticObses_t = [] deicticActions = [] deicticObses_tp1 = [] deicticWeights = [] for i in range(np.shape(obses_t)[0]): for j in range(np.shape(obses_t)[1] - windowLen): for k in range(np.shape(obses_t)[2] - windowLen): deicticObses_t.append(obses_t[i,j:j+windowLen,k:k+windowLen,:]) deicticActions.append(actions[i]) deicticObses_tp1.append(obses_tp1[i,j:j+windowLen,k:k+windowLen,:]) deicticWeights.append(weights[i]) return np.array(deicticObses_t), np.array(deicticActions), np.array(deicticObses_tp1), np.array(deicticWeights) # conv model parameters: (num_outputs, kernel_size, stride) model = models.cnn_to_mlp( # convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], # used in pong # hiddens=[256], # used in pong # convs=[(8,4,1)], # used for non-deictic TestRob3-v0 convs=[(4,3,1)], # used for deictic TestRob3-v0 hiddens=[16], dueling=True ) # parameters q_func=model lr=1e-3 # max_timesteps=100000 # max_timesteps=50000 max_timesteps=20000 buffer_size=50000 exploration_fraction=0.1 # exploration_fraction=0.3 exploration_final_eps=0.02 # exploration_final_eps=0.1 train_freq=1 batch_size=32 print_freq=10 checkpoint_freq=10000 learning_starts=1000 gamma=1. 
target_network_update_freq=500 prioritized_replay=False # prioritized_replay=True prioritized_replay_alpha=0.6 prioritized_replay_beta0=0.4 prioritized_replay_beta_iters=None prioritized_replay_eps=1e-6 num_cpu=16 deicticShape = (3,3,1) def make_obs_ph(name): # return U.BatchInput(env.observation_space.shape, name=name) return U.BatchInput(deicticShape, name=name) matchShape = (batch_size*25,) def make_match_ph(name): return U.BatchInput(matchShape, name=name) sess = U.make_session(num_cpu) sess.__enter__() # act, train, update_target, debug = build_graph.build_train( # getq, train, trainWOUpdate, update_target, debug = build_graph.build_train_deictic( # getq, train, trainWOUpdate, debug = build_graph.build_train_deictic( # getq, train, trainWOUpdate, update_target, debug = build_graph.build_train_deictic( getq, train, trainWOUpdate, update_target, debug = build_graph.build_train_deictic_min( make_obs_ph=make_obs_ph, make_match_ph=make_match_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10 ) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() # with tempfile.TemporaryDirectory() as td: model_saved = False # model_file = os.path.join(td, "model") for t in range(max_timesteps): # get action to take # action = act(np.array(obs)[None], update_eps=exploration.value(t))[0] # qvalues = getq(np.array(obs)[None]) # action = np.argmax(qvalues) # if np.random.rand() < exploration.value(t): # action = np.random.randint(env.action_space.n) deicticObs = getDeicticObs(obs,3) qvalues = getq(np.array(deicticObs)) action = np.argmax(np.max(qvalues,0)) if np.random.rand() < exploration.value(t): action = np.random.randint(env.action_space.n) # # temporarily take uniformly random actions all the time # action = np.random.randint(env.action_space.n) new_obs, rew, done, _ = env.step(action) # Store transition in the replay buffer. 
replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) if t > learning_starts and t % train_freq == 0: # Get batch if prioritized_replay: experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) weights, batch_idxes = np.ones_like(rewards), None # Convert batch to deictic format obses_t_deic, actions_deic, obses_tp1_deic, weights_deic = getDeictic(obses_t, actions, obses_tp1, weights, 3) obses_t_deic_fingerprints = [np.reshape(obses_t_deic[i],[9]) for i in range(np.shape(obses_t_deic)[0])] _, _, fingerprintMatch = np.unique(obses_t_deic_fingerprints,axis=0,return_index=True,return_inverse=True) # matchTemplates = [fingerprintMatch == i for i in range(np.max(fingerprintMatch)+1)] # td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) # td_errors = train(obses_t_deic, actions_deic, rewards, obses_tp1_deic, dones, weights_deic) # debug1, debug2, debug3 = trainWOUpdate(obses_t_deic, actions_deic, rewards, obses_tp1_deic, dones, weights_deic) # debug1, debug2, debug3, debug4 = trainWOUpdate(obses_t_deic, actions_deic, rewards, obses_tp1_deic, fingerprintMatch, dones, weights_deic) td_errors = train(obses_t_deic, actions_deic, rewards, obses_tp1_deic, fingerprintMatch, dones, weights_deic) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len(episode_rewards) % print_freq == 0: print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t)))) num2avg = 20 rListAvg = np.convolve(episode_rewards,np.ones(num2avg))/num2avg plt.plot(rListAvg) # plt.plot(episode_rewards) plt.show()
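# Worked example of the deictic windowing above (self-contained; the loops mirror
# getDeicticObs): an 8x8x1 observation with windowLen=3 yields (8-3)**2 = 25 patches
# of shape 3x3x1, which is where the batch_size*25 match shape used earlier comes from.
import numpy as np

obs, windowLen = np.zeros((8, 8, 1)), 3
patches = np.array([obs[i:i+windowLen, j:j+windowLen, :]
                    for i in range(obs.shape[0] - windowLen)
                    for j in range(obs.shape[1] - windowLen)])
print(patches.shape)  # (25, 3, 3, 1)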
def main(): np.set_printoptions(formatter={'float_kind': lambda x: "%.2f" % x}) env = gym.make("FrozenLake-v0") # env = gym.make("FrozenLake8x8-v0") # Dictionary-based value function q_func_tabular = {} defaultQValue = np.ones(env.action_space.n) # Given an integer, return the corresponding boolean array def getBoolBits(state): return np.unpackbits(np.uint8(state), axis=1) == 1 # cols of vectorKey must be boolean less than 64 bits long def getTabularKeys(vectorKey): obsBits = np.packbits(vectorKey, 1) obsKeys = 0 for i in range(np.shape(obsBits)[1]): # IMPORTANT: the number of bits in the type cast below (UINT64) must be at least as big # as the bits required to encode obsBits. If it is too small, we get hash collisions... obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:, i]) return obsKeys def getTabular(vectorKey): keys = getTabularKeys(vectorKey) return np.array([ q_func_tabular[x] if x in q_func_tabular else defaultQValue for x in keys ]) # def trainTabular(vectorKey,qCurrTargets,weights): def trainTabular(vectorKey, qCurrTargets): keys = getTabularKeys(vectorKey) alpha = 0.1 for i in range(len(keys)): if keys[i] in q_func_tabular: q_func_tabular[keys[i]] = (1 - alpha) * q_func_tabular[ keys[i]] + alpha * qCurrTargets[i] # q_func_tabular[keys[i]] = q_func_tabular[keys[i]] + alpha*weights[i,:]*(qCurrTargets[i] - q_func_tabular[keys[i]]) # (1-alpha)*q_func[keys[i]] + alpha*qCurrTargets[i] else: q_func_tabular[keys[i]] = qCurrTargets[i] max_timesteps = 200000 exploration_fraction = 0.3 exploration_final_eps = 0.02 print_freq = 1 gamma = .98 num_cpu = 16 # Used by buffering and DQN learning_starts = 10 buffer_size = 100 batch_size = 10 target_network_update_freq = 1 train_freq = 1 lr = 0.0003 valueFunctionType = "TABULAR" # valueFunctionType = "DQN" # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Set up replay buffer prioritized_replay = True # prioritized_replay=False prioritized_replay_alpha = 0.6 prioritized_replay_beta0 = 0.4 prioritized_replay_beta_iters = None prioritized_replay_eps = 1e-6 if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None def make_obs_ph(name): return U.BatchInput(env.observation_space.spaces[0].shape, name=name) sess = U.make_session(num_cpu) sess.__enter__() state = env.reset() episode_rewards = [0.0] timerStart = time.time() for t in range(max_timesteps): # np.unpackbits(np.uint8(np.reshape(states_tp1,[batch_size,1])),axis=1) qCurr = getTabular(getBoolBits([[state]])) qCurrNoise = qCurr + np.random.random(np.shape( qCurr)) * 0.01 # add small amount of noise to break ties randomly # select the greedy action; epsilon-greedy exploration below action = np.argmax(qCurrNoise) if np.random.rand() < exploration.value(t): action = np.random.randint(env.action_space.n) # take action nextState, rew, done, _ = env.step(action) replay_buffer.add(state, action, rew, nextState, float(done)) if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
if prioritized_replay: beta = beta_schedule.value(t) states_t, actions, rewards, states_tp1, dones, weights, batch_idxes = replay_buffer.sample( batch_size, beta) else: states_t, actions, rewards, states_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None qNext = getTabular( getBoolBits(np.reshape(states_tp1, [batch_size, 1]))) qNextmax = np.max(qNext, axis=1) targets = rewards + (1 - dones) * gamma * qNextmax qCurrTarget = getTabular( getBoolBits(np.reshape(states_t, [batch_size, 1]))) td_error = qCurrTarget[range(batch_size), actions] - targets qCurrTarget[range(batch_size), actions] = targets trainTabular(getBoolBits(np.reshape(states_t, [batch_size, 1])), qCurrTarget) if prioritized_replay: new_priorities = np.abs(td_error) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) # bookkeeping for storing episode rewards episode_rewards[-1] += rew if done: nextState = env.reset() # reset into nextState so the copy below starts the new episode from the reset state episode_rewards.append(0.0) mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: timerFinal = time.time() print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart)) timerStart = timerFinal state = np.copy(nextState)
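# Worked example of the tabular key encoding above: getBoolBits expands each integer
# state to its 8-bit pattern, and getTabularKeys packs those bits into a single uint64
# dictionary key. For single-byte states the round trip simply recovers the state.
import numpy as np

states = np.array([[3], [7]])
bits = np.unpackbits(np.uint8(states), axis=1) == 1  # what getBoolBits returns
packed = np.packbits(bits, 1)                        # shape (2, 1): one byte per state
keys = sum((256 ** i) * np.uint64(packed[:, i]) for i in range(packed.shape[1]))
print(keys)  # [3 7] -- distinct bit vectors map to distinct keys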
class DQNAgent(object): def __init__(self, stateShape, actionSpace, numPicks, memorySize, burnin=1000): self.numPicks = numPicks self.memorySize = memorySize self.replayMemory = PrioritizedReplayBuffer(memorySize, 0.6) self.stateShape = stateShape self.actionSpace = actionSpace self.step = 0 self.sync = 200 self.burnin = burnin self.alpha = 0.001 self.epsilon = 1 self.epsilon_decay = 0.5 self.epsilon_min = 0.01 self.eps_threshold = 0 self.gamma = 0.99 self.trainNetwork = self.createNetwork( stateShape, len(actionSpace), self.alpha) self.targetNetwork = self.createNetwork( stateShape, len(actionSpace), self.alpha) self.targetNetwork.set_weights( self.trainNetwork.get_weights()) def createNetwork(self, n_input, n_output, learningRate): model = keras.models.Sequential() model.add(keras.layers.Dense( 24, activation='relu', input_shape=n_input)) model.add(keras.layers.Dense(48, activation='relu')) model.add(keras.layers.Dense(n_output, activation='linear')) model.compile( loss='mse', optimizer=keras.optimizers.Adam(lr=learningRate)) model.summary() return model def trainDQN(self): if len(self.replayMemory) <= self.numPicks or len(self.replayMemory) < self.burnin: return 0 beta = min(1.0, 0.4 + self.step * (1.0 - 0.4) / 300) # anneal PER beta toward 1 and clamp it there samples = self.replayMemory.sample(self.numPicks, beta) #batch = Transition(*zip(*samples)) currStates, actions, rewards, nextStates, dones, weights, indices = samples currStates = np.squeeze(np.array(currStates), 1) Q_currents = self.trainNetwork(currStates, training=False).numpy() nextStates = np.squeeze(np.array(nextStates), 1) Q_futures = self.targetNetwork(nextStates, training=False).numpy().max(axis=1) rewards = np.array(rewards).reshape(self.numPicks,).astype(float) actions = np.array(actions).reshape(self.numPicks,).astype(int) dones = np.array(dones).astype(bool) notDones = (~dones).astype(float) dones = dones.astype(float) Q_currents_cp = deepcopy(Q_currents) Q_currents_cp[np.arange(self.numPicks), actions] = rewards * dones + (rewards + Q_futures * self.gamma)*notDones loss = tf.multiply(tf.pow(tf.subtract(Q_currents[np.arange(self.numPicks), actions], Q_currents_cp[np.arange(self.numPicks), actions]), 2), weights).numpy() prios = loss + 1e-5 self.replayMemory.update_priorities(indices, prios) loss = self.trainNetwork.train_on_batch(currStates, Q_currents_cp) # fit toward the Bellman targets, not the stale predictions return loss def selectAction(self, state): self.step += 1 if self.step % self.sync == 0: self.targetNetwork.set_weights( self.trainNetwork.get_weights()) q = -100000 if np.random.rand(1) < self.epsilon: action = np.random.randint(0, len(self.actionSpace)) else: preds = np.squeeze(self.trainNetwork( state, training=False).numpy(), axis=0) action = np.argmax(preds) q = preds[action] return action, q def addMemory(self, state, action, reward, nextState, done): self.replayMemory.add(state, action, reward, nextState, done) def save(self): save_path = ( f"./mountain_car_tfngmo_{int(self.step)}.chkpt" ) self.trainNetwork.save( save_path ) print(f"MountainNet saved to {save_path} done!")
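# A small numeric check of the target construction in trainDQN above: a terminal
# transition keeps only its reward, a non-terminal one gets r + gamma * max_a' Q'(s', a').
# The numbers here are made up.
import numpy as np

gamma = 0.99
rewards = np.array([1.0, -1.0])
Q_futures = np.array([2.0, 3.0])  # max target-network values for the next states
dones = np.array([1.0, 0.0])      # first transition is terminal
notDones = 1.0 - dones
targets = rewards * dones + (rewards + Q_futures * gamma) * notDones
print(targets)  # [1.   1.97]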
def main(): # env = gym.make("CartPoleRob-v0") # env = gym.make("CartPole-v0") # env = gym.make("CartPole-v1") # env = gym.make("Acrobot-v1") # env = gym.make("MountainCarRob-v0") # env = gym.make("FrozenLake-v0") # env = gym.make("FrozenLake8x8-v0") env = gym.make("FrozenLake8x8nohole-v0") # robShape = (2,) # robShape = (3,) # robShape = (200,) # robShape = (16,) robShape = (64,) def make_obs_ph(name): # return U.BatchInput(env.observation_space.shape, name=name) return U.BatchInput(robShape, name=name) # # these params are specific to mountaincar # def getOneHotObs(obs): # obsFraction = (obs[0] + 1.2) / 1.8 # idx1 = np.int32(np.trunc(obsFraction*100)) # obsFraction = (obs[1] + 0.07) / 0.14 # idx2 = np.int32(np.trunc(obsFraction*100)) # ident = np.identity(100) # return np.r_[ident[idx1,:],ident[idx2,:]] # these params are specific to frozenlake def getOneHotObs(obs): # ident = np.identity(16) ident = np.identity(64) return ident[obs,:] model = models.mlp([32]) # model = models.mlp([64]) # model = models.mlp([64], layer_norm=True) # model = models.mlp([16, 16]) # parameters q_func=model lr=1e-3 # max_timesteps=100000 max_timesteps=50000 # max_timesteps=10000 buffer_size=50000 exploration_fraction=0.1 # exploration_fraction=0.3 exploration_final_eps=0.02 # exploration_final_eps=0.1 train_freq=1 batch_size=32 print_freq=10 checkpoint_freq=10000 learning_starts=1000 gamma=1.0 target_network_update_freq=500 # prioritized_replay=False prioritized_replay=True prioritized_replay_alpha=0.6 prioritized_replay_beta0=0.4 prioritized_replay_beta_iters=None prioritized_replay_eps=1e-6 num_cpu=16 # # try mountaincar w/ different input dimensions # inputDims = [50,2] sess = U.make_session(num_cpu) sess.__enter__() act, train, update_target, debug = build_graph.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10 ) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() obs = getOneHotObs(obs) # with tempfile.TemporaryDirectory() as td: model_saved = False # model_file = os.path.join(td, "model") for t in range(max_timesteps): # Take action and update exploration to the newest value action = act(np.array(obs)[None], update_eps=exploration.value(t))[0] new_obs, rew, done, _ = env.step(action) new_obs = getOneHotObs(new_obs) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() obs = getOneHotObs(obs) episode_rewards.append(0.0) if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
if prioritized_replay: experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len(episode_rewards) % print_freq == 0: # if done: print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t)))) # if done and print_freq is not None and len(episode_rewards) % print_freq == 0: # logger.record_tabular("steps", t) # logger.record_tabular("episodes", num_episodes) # logger.record_tabular("mean 100 episode reward", mean_100ep_reward) # logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) # logger.dump_tabular() num2avg = 20 rListAvg = np.convolve(episode_rewards,np.ones(num2avg))/num2avg plt.plot(rListAvg) # plt.plot(episode_rewards) plt.show()
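# Quick check of getOneHotObs above: FrozenLake8x8 states are integers in [0, 64),
# and the helper just selects a row of the 64x64 identity matrix, so each encoded
# observation has exactly one nonzero entry, at the state's index.
import numpy as np

ident = np.identity(64)
obs = ident[10, :]
print(obs.sum(), obs[10])  # 1.0 1.0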
def learn(env, network, seed=None, lr=5e-5, total_timesteps=100000, buffer_size=500000, exploration_fraction=0.1, exploration_final_eps=0.01, train_freq=1, batch_size=32, print_freq=10, checkpoint_freq=100000, checkpoint_path=None, learning_starts=0, gamma=0.99, target_network_update_freq=10000, prioritized_replay=True, prioritized_replay_alpha=0.4, prioritized_replay_beta0=0.6, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-3, param_noise=False, callback=None, load_path=None, load_idx=None, demo_path=None, n_step=10, demo_prioritized_replay_eps=1.0, pre_train_timesteps=750000, epsilon_schedule="constant", **network_kwargs): # Create all the functions necessary to train the model set_global_seeds(seed) q_func = build_q_func(network, **network_kwargs) with tf.device('/GPU:0'): model = DQfD(q_func=q_func, observation_shape=env.observation_space.shape, num_actions=env.action_space.n, lr=lr, grad_norm_clipping=10, gamma=gamma, param_noise=param_noise) # Load model from checkpoint if load_path is not None: load_path = osp.expanduser(load_path) ckpt = tf.train.Checkpoint(model=model) manager = tf.train.CheckpointManager(ckpt, load_path, max_to_keep=None) if load_idx is None: ckpt.restore(manager.latest_checkpoint) print("Restoring from {}".format(manager.latest_checkpoint)) else: ckpt.restore(manager.checkpoints[load_idx]) print("Restoring from {}".format(manager.checkpoints[load_idx])) # Setup demo trajectory assert demo_path is not None with open(demo_path, "rb") as f: trajectories = pickle.load(f) # Create the replay buffer replay_buffer = PrioritizedReplayBuffer(buffer_size, prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) temp_buffer = deque(maxlen=n_step) is_demo = True for epi in trajectories: for obs, action, rew, new_obs, done in epi: obs, new_obs = np.expand_dims( np.array(obs), axis=0), np.expand_dims(np.array(new_obs), axis=0) if n_step: temp_buffer.append((obs, action, rew, new_obs, done, is_demo)) if len(temp_buffer) == n_step: n_step_sample = get_n_step_sample(temp_buffer, gamma) replay_buffer.demo_len += 1 replay_buffer.add(*n_step_sample) else: replay_buffer.demo_len += 1 replay_buffer.add(obs[0], action, rew, new_obs[0], float(done), float(is_demo)) logger.log("trajectory length:", replay_buffer.demo_len) # Create the schedule for exploration if epsilon_schedule == "constant": exploration = ConstantSchedule(exploration_final_eps) else: # not used exploration = LinearSchedule(schedule_timesteps=int( exploration_fraction * total_timesteps), initial_p=1.0, final_p=exploration_final_eps) model.update_target() # ============================================== pre-training ====================================================== start = time() num_episodes = 0 temp_buffer = deque(maxlen=n_step) for t in tqdm(range(pre_train_timesteps)): # sample and train experience = replay_buffer.sample(batch_size, beta=prioritized_replay_beta0) batch_idxes = experience[-1] if experience[6] is None: # for n_step = 0 obses_t, actions, rewards, obses_tp1, dones, is_demos = tuple( map(tf.constant, experience[:6])) obses_tpn, rewards_n, dones_n = None, None, None weights = tf.constant(experience[-2]) else: obses_t, actions, rewards, obses_tp1, dones, is_demos, obses_tpn, rewards_n, dones_n, weights = tuple( map(tf.constant, experience[:-1])) td_errors, n_td_errors, loss_dq, loss_n, loss_E, loss_l2, weighted_error = 
model.train( obses_t, actions, rewards, obses_tp1, dones, is_demos, weights, obses_tpn, rewards_n, dones_n) # Update priorities new_priorities = np.abs(td_errors) + np.abs( n_td_errors) + demo_prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) # Update target network periodically if t > 0 and t % target_network_update_freq == 0: model.update_target() # Logging elapsed_time = timedelta(seconds=time() - start) if print_freq is not None and t % 10000 == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", 0) logger.record_tabular("max 100 episode reward", 0) logger.record_tabular("min 100 episode reward", 0) logger.record_tabular("demo sample rate", 1) logger.record_tabular("epsilon", 0) logger.record_tabular("loss_td", np.mean(loss_dq.numpy())) logger.record_tabular("loss_n_td", np.mean(loss_n.numpy())) logger.record_tabular("loss_margin", np.mean(loss_E.numpy())) logger.record_tabular("loss_l2", np.mean(loss_l2.numpy())) logger.record_tabular("losses_all", weighted_error.numpy()) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.record_tabular("pre_train", True) logger.record_tabular("elapsed time", elapsed_time) logger.dump_tabular() # ============================================== exploring ========================================================= sample_counts = 0 demo_used_counts = 0 episode_rewards = deque(maxlen=100) this_episode_reward = 0. best_score = 0. saved_mean_reward = None is_demo = False obs = env.reset() # Always mimic the vectorized env obs = np.expand_dims(np.array(obs), axis=0) reset = True for t in tqdm(range(total_timesteps)): if callback is not None: if callback(locals(), globals()): break kwargs = {} if not param_noise: update_eps = tf.constant(exploration.value(t)) update_param_noise_threshold = 0. else: # not used update_eps = tf.constant(0.) update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs['update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action, epsilon, _, _ = model.step(tf.constant(obs), update_eps=update_eps, **kwargs) action = action[0].numpy() reset = False new_obs, rew, done, _ = env.step(action) # Store transition in the replay buffer. new_obs = np.expand_dims(np.array(new_obs), axis=0) if n_step: temp_buffer.append((obs, action, rew, new_obs, done, is_demo)) if len(temp_buffer) == n_step: n_step_sample = get_n_step_sample(temp_buffer, gamma) replay_buffer.add(*n_step_sample) else: replay_buffer.add(obs[0], action, rew, new_obs[0], float(done), 0.) obs = new_obs # invert log scaled score for logging this_episode_reward += np.sign(rew) * (np.exp(np.sign(rew) * rew) - 1.)
if done: num_episodes += 1 obs = env.reset() obs = np.expand_dims(np.array(obs), axis=0) episode_rewards.append(this_episode_reward) reset = True if this_episode_reward > best_score: best_score = this_episode_reward ckpt = tf.train.Checkpoint(model=model) manager = tf.train.CheckpointManager(ckpt, './best_model', max_to_keep=1) manager.save(t) logger.log("saved best model") this_episode_reward = 0.0 if t % train_freq == 0: experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) batch_idxes = experience[-1] if experience[6] is None: # for n_step = 0 obses_t, actions, rewards, obses_tp1, dones, is_demos = tuple( map(tf.constant, experience[:6])) obses_tpn, rewards_n, dones_n = None, None, None weights = tf.constant(experience[-2]) else: obses_t, actions, rewards, obses_tp1, dones, is_demos, obses_tpn, rewards_n, dones_n, weights = tuple( map(tf.constant, experience[:-1])) td_errors, n_td_errors, loss_dq, loss_n, loss_E, loss_l2, weighted_error = model.train( obses_t, actions, rewards, obses_tp1, dones, is_demos, weights, obses_tpn, rewards_n, dones_n) new_priorities = np.abs(td_errors) + np.abs( n_td_errors ) + demo_prioritized_replay_eps * is_demos + prioritized_replay_eps * ( 1. - is_demos) replay_buffer.update_priorities(batch_idxes, new_priorities) # for logging sample_counts += batch_size demo_used_counts += np.sum(is_demos) if t % target_network_update_freq == 0: # Update target network periodically. model.update_target() if t % checkpoint_freq == 0: save_path = checkpoint_path ckpt = tf.train.Checkpoint(model=model) manager = tf.train.CheckpointManager(ckpt, save_path, max_to_keep=10) manager.save(t) logger.log("saved checkpoint") elapsed_time = timedelta(seconds=time() - start) if done and num_episodes > 0 and num_episodes % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", np.mean(episode_rewards)) logger.record_tabular("max 100 episode reward", np.max(episode_rewards)) logger.record_tabular("min 100 episode reward", np.min(episode_rewards)) logger.record_tabular("demo sample rate", demo_used_counts / sample_counts) logger.record_tabular("epsilon", epsilon.numpy()) logger.record_tabular("loss_td", np.mean(loss_dq.numpy())) logger.record_tabular("loss_n_td", np.mean(loss_n.numpy())) logger.record_tabular("loss_margin", np.mean(loss_E.numpy())) logger.record_tabular("loss_l2", np.mean(loss_l2.numpy())) logger.record_tabular("losses_all", weighted_error.numpy()) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.record_tabular("pre_train", False) logger.record_tabular("elapsed time", elapsed_time) logger.dump_tabular() return model
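# get_n_step_sample is not defined in this file. A minimal sketch of what it
# presumably computes -- the n-step discounted return from the oldest transition in
# the window, truncated at episode end -- is below; the tuple layout matches how
# temp_buffer is filled above, but the exact return format is an assumption.
def n_step_return_sketch(window, gamma):
    obs0, action0, _, _, _, is_demo0 = window[0]
    ret, last_obs, done_n = 0.0, None, 0.0
    for k, (_, _, rew, new_obs, done, _) in enumerate(window):
        ret += (gamma ** k) * rew                # discounted sum of the window's rewards
        last_obs, done_n = new_obs, float(done)
        if done:                                 # stop accumulating past the episode end
            break
    return obs0, action0, ret, last_obs, done_n, float(is_demo0)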
def dist_learn(env, q_dist_func, num_atoms=51, V_max=10, lr=25e-5, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.01, exploration_final_eps=0.008, train_freq=1, batch_size=32, print_freq=1, checkpoint_freq=2000, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, num_cpu=1, callback=None): """Train a distributional deepq model. Parameters ------- env: gym.Env environment to train on q_dist_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. lr: float learning rate for adam optimizer max_timesteps: int number of env steps to optimize for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. batch_size: int size of a batch sampled from the replay buffer for training print_freq: int how often to print out training progress. Set to None to disable printing. checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training, set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: bool if True, a prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None, equals max_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. num_cpu: int number of cpus to use for training callback: (locals, globals) -> None function called at every step with the state of the algorithm. If callback returns true, training stops. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function.
""" # Create all the functions necessary to train the model sess = U.single_threaded_session() sess.__enter__() def make_obs_ph(name): print name return U.BatchInput(env.observation_space.shape, name=name) act, train, update_target, debug = build_dist_train( make_obs_ph=make_obs_ph, dist_func=q_dist_func, num_actions=env.action_space.n, num_atoms=num_atoms, V_max=V_max, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10) # act, train, update_target, debug = build_train( # make_obs_ph=make_obs_ph, # q_func=q_func, # num_actions=env.action_space.n, # optimizer=tf.train.AdamOptimizer(learning_rate=lr), # gamma=gamma, # grad_norm_clipping=10 # ) act_params = { 'make_obs_ph': make_obs_ph, 'q_dist_func': q_dist_func, 'num_actions': env.action_space.n, } # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join(td, "model") print model_file # mkdir_p(os.path.dirname(model_file)) for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value action = act(np.array(obs)[None], update_eps=exploration.value(t))[0] new_obs, rew, done, _ = env.step(action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: # print "CCCC" obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None # print "Come1" # print np.shape(obses_t), np.shape(actions), np.shape(rewards), np.shape(obses_tp1), np.shape(dones) td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) # print "Loss : {}".format(td_errors) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. 
update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: print("steps: {}".format(t)) print("episodes: {}".format(num_episodes)) print("mean 100 episode reward: {}".format(mean_100ep_reward)) # logger.record_tabular("episodes", num_episodes) # logger.record_tabular("mean 100 episode reward", mean_100ep_reward) # logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) # logger.dump_tabular() # logger.record_tabular("steps", t) # logger.record_tabular("episodes", num_episodes) # logger.record_tabular("mean 100 episode reward", mean_100ep_reward) # logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) # logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and t % checkpoint_freq == 0): print("==========================") print("Error: {}".format(td_errors)) if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: print("Saving model due to mean reward increase: {} -> {}".format( saved_mean_reward, mean_100ep_reward)) # logger.log("Saving model due to mean reward increase: {} -> {}".format( # saved_mean_reward, mean_100ep_reward)) U.save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: print("Restored model with mean reward: {}".format( saved_mean_reward)) # logger.log("Restored model with mean reward: {}".format(saved_mean_reward)) U.load_state(model_file) return ActWrapper(act, act_params)
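# The num_atoms / V_max arguments of dist_learn are the standard C51 setup: the return
# distribution lives on a fixed support of atoms, and Q-values are expectations under
# it. A minimal sketch of that support (the symmetric V_min = -V_max and the uniform
# probabilities here are assumptions for illustration; the real probabilities come
# from q_dist_func):
import numpy as np

num_atoms, V_max = 51, 10
V_min = -V_max
z = np.linspace(V_min, V_max, num_atoms)      # support atoms z_0 ... z_{N-1}
delta_z = (V_max - V_min) / (num_atoms - 1)   # atom spacing, 0.4 here
probs = np.full(num_atoms, 1.0 / num_atoms)   # dummy distribution over atoms
q_value = float(np.sum(z * probs))            # expected return under the distribution
print(delta_z, q_value)                       # 0.4 0.0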