Example #1
 def __init__(self,
              net,
              actionSet,
              goalSet,
              defaultNSample,
              defaultRandomPlaySteps,
              controllerMemCap,
              explorationSteps,
              trainFreq,
              hard_update,
              controllerEpsilon=defaultControllerEpsilon):
     self.actionSet = actionSet
     self.controllerEpsilon = controllerEpsilon
     self.goalSet = goalSet
     self.nSamples = defaultNSample
     self.gamma = defaultGamma
     self.net = net
     self.controllerMemCap = controllerMemCap
     self.memory = PrioritizedReplayBuffer(controllerMemCap,
                                           alpha=prioritized_replay_alpha)
     self.enable_double_dqn = True
     self.exploration = LinearSchedule(schedule_timesteps=explorationSteps,
                                       initial_p=1.0,
                                       final_p=0.02)
     self.defaultRandomPlaySteps = defaultRandomPlaySteps
     self.trainFreq = trainFreq
     self.randomPlay = True
     self.learning_done = False
     self.hard_update = hard_update
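The exploration member above is a baselines-style LinearSchedule annealed from 1.0 down to 0.02 over explorationSteps. A minimal sketch of how such a schedule is typically consumed for epsilon-greedy action selection; the select_action name, the step_count argument, and the net.predict call are illustrative assumptions, not part of the snippet:

import random

import numpy as np


def select_action(self, state, step_count):
    # read epsilon off the schedule for the current global step (1.0 -> 0.02)
    eps = self.exploration.value(step_count)
    if random.random() < eps:
        return random.choice(self.actionSet)          # explore
    q_values = self.net.predict(state)                # assumes the wrapped net exposes predict()
    return self.actionSet[int(np.argmax(q_values))]   # exploit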
Example #2
    def __init__(self, stateShape, actionSpace, numPicks, memorySize, burnin=1000):
        self.numPicks = numPicks
        self.memorySize = memorySize
        self.replayMemory = PrioritizedReplayBuffer(memorySize, 0.6)
        self.stateShape = stateShape
        self.actionSpace = actionSpace

        self.step = 0
        self.sync = 200
        self.burnin = burnin

        self.alpha = 0.001
        self.epsilon = 1
        self.epsilon_decay = 0.5
        self.epsilon_min = 0.01
        self.eps_threshold = 0

        self.gamma = 0.99

        self.trainNetwork = self.createNetwork(
            stateShape, len(actionSpace), self.alpha)
        self.targetNetwork = self.createNetwork(
            stateShape, len(actionSpace), self.alpha)
        self.targetNetwork.set_weights(
            self.trainNetwork.get_weights())
Example #3
    def __init__(self,
                 stateShape,
                 actionSpace,
                 numPicks,
                 memorySize,
                 sync=10,
                 burnin=1000,
                 alpha=0.0001,
                 epsilon=1,
                 epsilon_decay=0.05,
                 epsilon_min=0.01,
                 gamma=0.99):
        self.numPicks = numPicks
        self.replayMemory = PrioritizedReplayBuffer(memorySize, 0.6)
        self.stateShape = stateShape
        self.actionSpace = actionSpace

        self.step = 0

        self.sync = sync
        self.burnin = burnin
        self.alpha = alpha
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.gamma = gamma

        self.walpha = 0.01
        self.delay = 1

        self.trainNetwork = self.createNetwork(stateShape, len(actionSpace),
                                               self.alpha)
        self.targetNetwork = self.createNetwork(stateShape, len(actionSpace),
                                                self.alpha)
        self.targetNetwork.set_weights(self.trainNetwork.get_weights())
Example #4
    def __init__(self,
                 memory_size,
                 batch_size,
                 learn_start_time,
                 learn_fre,
                 lr,
                 replay_iters,
                 eps_T,
                 eps_t_init,
                 gamma,
                 update_period,
                 board,
                 device,
                 model_path,
                 r_memory_Fname,
                 o_model_name,
                 model_load=False):
        self.step_now = 0  # record the step
        self.reward_num = 0
        self.reward_accumulated = 0  # delay reward
        self.final_tem = 10  # just for now
        self.step_last_update = 0  # record the last update time
        self.update_period = update_period  # for the off policy
        self.learn_start_time = learn_start_time
        self.gamma = gamma
        self.batch_size = batch_size
        self.memory_size = memory_size
        self.alpha = 0.6
        self.beta = 0.4
        self.replay_beta_iters = replay_iters
        self.replay_eps = 1e-6
        self.memory_min_num = 1000  # minimum number of stored transitions before learning starts
        self.step_last_learn = 0  # record the last learn step
        self.learn_fre = learn_fre  # step frequency to learn
        self.e_greedy = 1  # record the e-greedy value
        self.eps_T = eps_T  # parameter for the eps decay schedule (roughly 800,000 steps)
        self.eps_t_init = eps_t_init  # parameter for the initial eps decay step

        self.device = device
        self.model_path = model_path
        self.mode_enjoy = model_load
        if not model_load:
            self.policy_net = DQN(board[0], board[1], action_num).to(device)
            self.target_net = DQN(board[0], board[1], action_num).to(device)
            self.optimizer = optim.Adagrad(self.policy_net.parameters(), lr=lr)
            self.loss_fn = nn.functional.mse_loss  # use the MSE loss
            self.memory = PrioritizedReplayBuffer(memory_size, self.alpha)
            self.beta_schedule = LinearSchedule(self.replay_beta_iters,
                                                self.beta, 1.0)
        else:
            self.load(o_model_name)
        #self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=lr)
        self.obs_new = None
        self.obs_old = None
        self.action = None
        self.action_old = None
        self.dqn_direct_flag = False  # show if the dqn action is done
        self.model_save_flag = False
Example #5
    def clear_memory(self, goal):

        self.learning_done = True  ## Set the done learning flag
        #del self.trainable_model

        del self.memory

        self.memory = PrioritizedReplayBuffer(self.controllerMemCap,
                                              alpha=prioritized_replay_alpha)

        gpu = self.net.gpu

        device = '/cpu' if gpu < 0 else '/gpu:' + str(gpu)

        #del self.net

        gc.collect()

        rmsProp = optimizers.RMSprop(lr=LEARNING_RATE,
                                     rho=0.95,
                                     epsilon=1e-08,
                                     decay=0.0)

        with tf.device(device):
            self.simple_net = Sequential()
            self.simple_net.add(
                Conv2D(32, (8, 8),
                       strides=4,
                       activation='relu',
                       padding='valid',
                       input_shape=(84, 84, 4)))
            self.simple_net.add(
                Conv2D(64, (4, 4),
                       strides=2,
                       activation='relu',
                       padding='valid'))
            self.simple_net.add(
                Conv2D(64, (3, 3),
                       strides=1,
                       activation='relu',
                       padding='valid'))
            self.simple_net.add(Flatten())
            self.simple_net.add(
                Dense(HIDDEN_NODES,
                      activation='relu',
                      kernel_initializer=initializers.random_normal(
                          stddev=0.01, seed=SEED)))
            self.simple_net.add(
                Dense(nb_Action,
                      activation='linear',
                      kernel_initializer=initializers.random_normal(
                          stddev=0.01, seed=SEED)))
            self.simple_net.compile(loss='mse', optimizer=rmsProp)
            self.simple_net.load_weights(recordFolder + '/policy_subgoal_' +
                                         str(goal) + '.h5')
            self.simple_net.reset_states()
Example #6
def test_per(capacity):
    # test implementation of prioritized replay buffer
    p_buffer = PrioritizedReplayBuffer(capacity)

    # populate the buffer
    for _ in range(capacity // 2):
        p_buffer.add(Experience())

    # update batches of experience
    n_batches = 10
    batch_size = 100
    for _ in range(n_batches):
        # randomly sample batch_size tree (leaf) indices
        idx = random.sample([x for x in range(capacity - 1, 2 * capacity - 1)],
                            batch_size)

        td_errors = np.random.uniform(0, 10, batch_size)

        p_buffer.batch_update(idx, td_errors)

        assert p_buffer.tree.max_priority == np.max(
            p_buffer.tree.tree[-capacity:])

    # test sample
    for _ in range(10):
        p_buffer.sample(batch_size)

    return
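Note that test_per draws tree indices from range(capacity - 1, 2 * capacity - 1), which assumes an array-backed sum tree whose internal nodes occupy the first capacity - 1 slots and whose leaves (one per stored transition) occupy the last capacity slots. A minimal sketch of that index arithmetic under the same assumption; the helper names are illustrative, not part of the buffer under test:

def leaf_index(data_index, capacity):
    # leaves live in tree[capacity - 1 : 2 * capacity - 1]
    return data_index + capacity - 1


def parent_index(tree_index):
    # standard binary-heap parent for an array-backed tree
    return (tree_index - 1) // 2


# with capacity 8, data slot 3 maps to tree slot 10, whose parent chain
# 10 -> 4 -> 1 -> 0 ends at the root holding the total priority mass
assert leaf_index(3, 8) == 10
assert parent_index(10) == 4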
Example #7
 def __init__(self,
              net,
              target_net,
              alpha=0.6,
              beta=0.4,
              beta_delta=1.001,
              e=1e-8,
              **kwargs):
     super(DDQNAgentPER, self).__init__(net, target_net, **kwargs)
     self.memory = PrioritizedReplayBuffer(**kwargs)
     self.__alpha = alpha
     self.__beta = beta
     self.__beta_delta = beta_delta
     self.__e = e
Example #8
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        # Target Q-network (w-)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        if (PRIORITIZED_REPLY_ENABLED):
            self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE,
                                                  BATCH_SIZE, seed)
        else:
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                       seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        self.B = .001
Example #9
 def __init__(self,
              total_timesteps=100000,
              buffer_size=50000,
              type_buffer="HER",
              prioritized_replay=True,
              prioritized_replay_alpha=0.6,
              prioritized_replay_beta0=0.4,
              prioritized_replay_beta_iters=None,
              prioritized_replay_eps=1e-6):
     self.buffer_size = buffer_size
     self.prioritized_replay_eps = prioritized_replay_eps
     self.type_buffer = type_buffer
     if prioritized_replay:
         if type_buffer == "PER":
             self.replay_buffer = PrioritizedReplayBuffer(
                 buffer_size, alpha=prioritized_replay_alpha)
         if type_buffer == "HER":
             self.replay_buffer = HighlightReplayBuffer(
                 buffer_size, alpha=prioritized_replay_alpha)
         if prioritized_replay_beta_iters is None:
             prioritized_replay_beta_iters = total_timesteps
         self.beta_schedule = LinearSchedule(
             prioritized_replay_beta_iters,
             initial_p=prioritized_replay_beta0,
             final_p=1.0)
     else:
         self.replay_buffer = ReplayBuffer(buffer_size)
         self.beta_schedule = None
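The buffer and beta schedule built above follow the baselines-style API, so a typical learning step samples with the annealed beta, trains on the importance weights, and writes |TD error| + eps back as the new priorities (this mirrors the loop in Example #17 below). The replay_step name and the agent.train signature are placeholders for this sketch, not part of the class:

import numpy as np


def replay_step(self, agent, batch_size, t):
    # one prioritized (or uniform) replay update using the members set in __init__
    if self.beta_schedule is not None:
        beta = self.beta_schedule.value(t)  # annealed from beta0 toward 1.0
        (obses, actions, rewards, next_obses, dones,
         weights, idxes) = self.replay_buffer.sample(batch_size, beta=beta)
        td_errors = agent.train(obses, actions, rewards, next_obses, dones, weights)
        # new priority = |TD error| + eps keeps every priority strictly positive
        new_priorities = np.abs(td_errors) + self.prioritized_replay_eps
        self.replay_buffer.update_priorities(idxes, new_priorities)
    else:
        obses, actions, rewards, next_obses, dones = self.replay_buffer.sample(batch_size)
        agent.train(obses, actions, rewards, next_obses, dones, np.ones_like(rewards))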
Example #10
    def __init__(self, state_size, action_size, config=RLConfig()):
        self.seed = random.seed(config.seed)
        self.state_size = state_size
        self.action_size = action_size
        self.batch_size = config.batch_size
        self.batch_indices = torch.arange(config.batch_size).long().to(device)
        self.samples_before_learning = config.samples_before_learning
        self.learn_interval = config.learning_interval
        self.parameter_update_interval = config.parameter_update_interval
        self.per_epsilon = config.per_epsilon
        self.tau = config.tau
        self.gamma = config.gamma

        if config.useDuelingDQN:
            self.qnetwork_local = DuelingDQN(state_size, action_size,
                                             config.seed).to(device)
            self.qnetwork_target = DuelingDQN(state_size, action_size,
                                              config.seed).to(device)
        else:
            self.qnetwork_local = DQN(state_size, action_size,
                                      config.seed).to(device)
            self.qnetwork_target = DQN(state_size, action_size,
                                       config.seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=config.learning_rate)

        self.doubleDQN = config.useDoubleDQN
        self.usePER = config.usePER
        if self.usePER:
            self.memory = PrioritizedReplayBuffer(config.buffer_size,
                                                  config.per_alpha)
        else:
            self.memory = ReplayBuffer(config.buffer_size)

        self.t_step = 0
Example #11
File: dqn.py Project: marcinic/dqn
 def __init__(self,
              frame_dims,
              n_actions,
              priority_replay=True,
              epsilon=.99,
              discount=.99):
     self.epsilon_start = epsilon
     self.epsilon = epsilon
     self.epsilon_min = 0.1
     self.final_exploration_frame = 1e6
     self.epsilon_decay = 0.99
     self.learning_rate = 0.00025
     self.discount = discount
     self.frame_dims = frame_dims
     self.n_actions = n_actions
     self.alpha = 0.7
     self.update_freq = 10000
     self.batch_size = 32
     self.tb = TensorBoard(log_dir='./logs',
                           write_graph=True,
                           write_images=False)
     #self.summary_writer = K.summary.FileWriter('./logs/')
     self.beta = 0.5
     self.priority_replay_eps = 1e-6
     self.priority_replay = priority_replay
     self.avg_q = -1
     if priority_replay:
         self.memory = PrioritizedReplayBuffer(100000, self.alpha)
     else:
         self.memory = ReplayBuffer(600000)
Example #12
def td_learning(args):
    agent = DQNAgent(args)
    replay_memory = PrioritizedReplayBuffer(1000000, args.alpha)
    #eval_game(agent, 500)
    outer = tqdm(range(args.total_steps), desc='Total steps', position=0)
    game = init_game()
    ave_score = 0
    count = 0
    for step in outer:
        board = copy.deepcopy(game.gameboard.board)
        if step < args.start_learn:
            avail_choices = game.gameboard.get_available_choices()
            index = np.random.randint(len(avail_choices))
            choice = avail_choices[index]
        else:
            choice = agent.greedy_policy(
                board, game.gameboard.get_available_choices())

        next_board, reward = game.input_pos(choice[0], choice[1])
        next_board = copy.deepcopy(next_board)
        #####

        replay_memory.add(board, choice, reward, next_board)
        #####
        if game.termination():
            ave_score += game.gameboard.score
            count += 1
            game = init_game()

        if step >= args.start_learn and step % args.train_freq == 0:
            if count > 0:
                message = "ave score of " + str(count) + " game: " + str(
                    ave_score / count)
                out_fd.write("{} {}\n".format(step, ave_score / count))
                outer.write(message)
                ave_score = 0
                count = 0
            if step == args.start_learn:
                experience = replay_memory.sample(args.start_learn,
                                                  beta=agent.beta)
            else:
                experience = replay_memory.sample(args.train_data_size,
                                                  beta=agent.beta)

            boards, choices, rewards, next_boards, weights, batch_idxes = experience

            td_errors = agent.train(
                (boards, choices, rewards, next_boards, weights))
            new_priorities = np.abs(td_errors) + prioritized_replay_eps
            replay_memory.update_priorities(batch_idxes, new_priorities)

            agent.update_target(args.soft_tau)
            agent.update_epsilon()
            agent.update_beta()

    eval_game(agent, 500)
    out_fd.close()
Example #13
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 lr_decay=0.9999,
                 double_dqn=False,
                 duel_dqn=False,
                 prio_exp=False):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): Dimension of each State
            action_size (int): Dimension of each Action
            seed (int): Random Seed
            lr_decay (float): Multiplicative decay factor for the learning rate
            double_dqn (bool): Enable Double Deep Q-Network
            duel_dqn (bool): Enable Dueling Deep Q-Network
            prio_exp (bool): Enable Prioritized Experience Replay
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.lr_decay = lr_decay
        self.DOUBLE_DQN = double_dqn
        self.DUEL_DQN = duel_dqn
        self.PRIORITISED_EXPERIENCE = prio_exp

        # Determine Deep Q-Network for use
        if self.DUEL_DQN:
            self.qnetwork_local = DuelQNetwork(state_size, action_size,
                                               seed).to(device)
            self.qnetwork_target = DuelQNetwork(state_size, action_size,
                                                seed).to(device)
        else:
            self.qnetwork_local = QNetwork(state_size, action_size,
                                           seed).to(device)
            self.qnetwork_target = QNetwork(state_size, action_size,
                                            seed).to(device)

        # Initialize Optimizer
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Determine if Prioritized Experience will be used
        if self.PRIORITISED_EXPERIENCE:
            self.memory = PrioritizedReplayBuffer(action_size,
                                                  BUFFER_SIZE,
                                                  BATCH_SIZE,
                                                  seed,
                                                  alpha=0.6,
                                                  beta=0.4,
                                                  beta_anneal=1.0001)
        else:
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                       seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
Example #14
class DQNAgentPER(DQNAgentBase):
    def __init__(self,
                 net,
                 target_net,
                 alpha=0.6,
                 beta=0.4,
                 beta_delta=1.001,
                 e=1e-8,
                 **kwargs):
        super(DQNAgentPER, self).__init__(net, target_net, **kwargs)
        self.memory = PrioritizedReplayBuffer(**kwargs)
        self.__alpha = alpha
        self.__beta = beta
        self.__beta_delta = beta_delta
        self.__e = e

    def _learn(self, samples):
        states, actions, rewards, next_states, dones, idxs, probs = samples
        expected_q_values = self.net(states, training=True).gather(1, actions)
        # DQN target
        target_q_values_next = self.target_net(
            next_states, training=True).detach().max(1)[0].unsqueeze(1)
        target_q_values = rewards + self.gamma * target_q_values_next * (1 -
                                                                         dones)
        td_err = expected_q_values - target_q_values  # calc td error
        weights = (probs * self.memory.size()).pow(-self.__beta).to(
            self.device)
        weights = weights / weights.max()
        loss = torch.mean(td_err.pow(2).squeeze() * weights)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.memory.update(
            idxs.cpu().numpy(),
            td_err.abs().detach().cpu().numpy().squeeze()**self.__alpha +
            self.__e)
        return loss.detach().cpu().numpy()

    def step(self, state, action, reward, next_state, done):
        loss = super(DQNAgentPER, self).step(state, action, reward, next_state,
                                             done)
        if done:
            self.__beta = min(1., self.__beta * self.__beta_delta)
        return loss
Example #15
 def create_replay_buffer(self, prioritized_replay, prioritized_replay_eps, size_buffer, alpha_prioritized_replay, prioritized_replay_beta0, prioritized_replay_beta_iters, steps_total):
     self.prioritized_replay = prioritized_replay
     self.prioritized_replay_eps = prioritized_replay_eps
     if prioritized_replay:
         self.replay_buffer = PrioritizedReplayBuffer(size_buffer, alpha=alpha_prioritized_replay)
         if prioritized_replay_beta_iters is None:
             prioritized_replay_beta_iters = steps_total
         self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0)
     else:
         self.replay_buffer = ReplayBuffer(size_buffer)
         self.beta_schedule = None
     pass
Example #16
    def __init__(self, state_size, action_size, layer_spec, seed=0):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.layer_spec = layer_spec
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       layer_spec).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        layer_spec).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # (Prioritized) experience replay setup
        self.buffer_size = BUFFER_SIZE
        self.batch_size = BATCH_SIZE
        self.min_prio = MIN_PRIO
        self.alpha = ALPHA
        self.beta = INIT_BETA
        self.beta_increment = BETA_INC
        if USE_PER:
            self.memory = PrioritizedReplayBuffer(size=self.buffer_size,
                                                  alpha=self.alpha)
        else:
            self.memory = DequeReplayBuffer(action_size=self.action_size,
                                            buffer_size=self.buffer_size,
                                            batch_size=self.batch_size,
                                            seed=42)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

        # print info about Agent
        print('Units in the hidden layers are {}.'.format(str(layer_spec)))
        print('Using Double-DQN is \"{}\".'.format(str(USE_DDQN)))
        print('Using prioritized experience replay is \"{}\".'.format(
            str(USE_PER)))
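Example #16 configures min_prio, beta, and beta_increment, but the corresponding update step is not shown in this excerpt. A hedged sketch of how those fields are commonly used after each learning step, assuming the baselines-style update_priorities API and that numpy is imported as np; the _per_update name is hypothetical:

    def _per_update(self, indices, td_errors):
        # clamp priorities from below with min_prio so no transition starves
        new_priorities = np.maximum(np.abs(td_errors), self.min_prio)
        self.memory.update_priorities(indices, new_priorities)
        # anneal the importance-sampling exponent toward 1.0
        self.beta = min(1.0, self.beta + self.beta_increment)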
Example #17
def learn(env, args):
    ob = env.reset()
    ob_shape = ob.shape
    num_action = int(env.action_space.n)

    agent = TestAgent(ob_shape, num_action, args)
    replay_buffer = PrioritizedReplayBuffer(args.buffer_size, alpha=args.prioritized_replay_alpha)
    args.prioritized_replay_beta_iters = args.max_timesteps
    beta_schedule = LinearSchedule(args.prioritized_replay_beta_iters, 
                                    initial_p=args.prioritized_replay_beta0, 
                                    final_p=1.0)

    episode_rewards = [0.0]
    saved_mean_reward = None
    n_step_seq = []

    agent.sample_noise()
    agent.update_target()

    for t in range(args.max_timesteps):
        action = agent.act(ob)
        new_ob, rew, done, _ = env.step(action)
        replay_buffer.add(ob, action, rew, new_ob, float(done))
        ob = new_ob

        episode_rewards[-1] += rew
        if done:
            ob = env.reset()
            episode_rewards.append(0.0)
            reset = True

        if t > args.learning_starts and t % args.replay_period == 0:
            experience = replay_buffer.sample(args.batch_size, beta=beta_schedule.value(t))
            (obs, actions, rewards, obs_next, dones, weights, batch_idxes) = experience
            agent.sample_noise()
            kl_errors = agent.update(obs, actions, rewards, obs_next, dones, weights)
            replay_buffer.update_priorities(batch_idxes, np.abs(kl_errors) + 1e-6)

        if t > args.learning_starts and t % args.target_network_update_freq == 0:
            agent.update_target()  

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and args.print_freq is not None and len(episode_rewards) % args.print_freq == 0:
            print('steps {} episodes {} mean reward {}'.format(t, num_episodes, mean_100ep_reward))
Example #18
 def __init__(self,
              total_timesteps=100000,
              buffer_size=50000,
              prioritized_replay=False,
              prioritized_replay_alpha=0.6,
              prioritized_replay_beta0=0.4,
              prioritized_replay_beta_iters=None,
              prioritized_replay_eps=1e-6):
     self.buffer_size = buffer_size
     if prioritized_replay:
         self.replay_buffer = PrioritizedReplayBuffer(
             buffer_size, alpha=prioritized_replay_alpha)
         if prioritized_replay_beta_iters is None:
             prioritized_replay_beta_iters = total_timesteps
         beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                        initial_p=prioritized_replay_beta0,
                                        final_p=1.0)
     else:
         self.replay_buffer = ReplayBuffer(buffer_size)
         beta_schedule = None
Example #19
    def __init__(self):
        #self.action_space = [0, 1, 2, 3, 4, 5, 6]
        self.action_space = [i for i in range(4 * 7)]  # 28 grouped actions: board is 7x14
        self.action_size = len(self.action_space)
        self.next_stone_size = 6
        self.state_size = (rows + 1, cols, 1)
        self.discount_factor = 0.99

        # In the DeepMind paper, samples drawn with PER tend to produce larger updates, so for
        # training stability the learning rate was reduced to about 1/4 of the value used with
        # uniform random sampling; that choice is reflected here.
        #self.learning_rate = 0.00025
        self.learning_rate = 0.0000625

        self.epsilon = 0.  #1.
        self.epsilon_min = 0.0
        self.epsilon_decay = 1000000  #1000000

        self.model = self.build_model()
        self.target_model = self.build_model()

        # A custom loss function is defined separately and used for training.
        self.model_updater = self.model_optimizer()

        self.batch_size = 64
        self.train_start = 50000  #50000

        # PER setup and related hyperparameters.

        # beta controls how strongly the importance-sampling correction is applied.
        # Loosely speaking (not a precise definition):
        # large beta -> the sampling bias introduced by PER is corrected more strongly -> smaller updates for high-TD-error samples, slightly more stable training overall
        # small beta -> the PER sampling bias is corrected less -> larger updates for high-TD-error samples, slightly less stable training overall
        # The paper starts beta at 0.4 and increases it linearly to 1 by the end of training.

        # alpha determines how much the magnitude of the TD error is reflected; priorities are computed as (TD-error)^alpha.
        # The closer alpha is to 0, the less the TD-error magnitude matters and the closer sampling is to uniform.
        # The closer alpha is to 1, the more the TD-error magnitude matters, i.e. full PER.
        # The paper uses alpha = 0.6.

        # prioritized_replay_eps is a tiny constant added to the TD error when computing (TD-error)^alpha, so that a priority never becomes exactly zero.
        # (A standalone numeric sketch of these formulas follows this example.)

        self.memory = PrioritizedReplayBuffer(1000000, alpha=0.6)  #1000000
        self.beta = 0.4  # 0.4
        self.beta_max = 1.0
        self.beta_decay = 2000000  #5000000
        self.prioritized_replay_eps = 0.000001

        # TensorBoard setup
        self.sess = tf.InteractiveSession()
        K.set_session(self.sess)

        self.summary_placeholders, self.update_ops, self.summary_op = \
            self.setup_summary()
        self.summary_writer = tf.summary.FileWriter('summary/tetris_dqn',
                                                    self.sess.graph)
        self.sess.run(tf.global_variables_initializer())

        self.load_model = True
        if self.load_model:
            self.model.load_weights("./DQN_tetris_model_0311.h5")

        self.imitation_mode = False
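The comments above describe how alpha shapes the sampling distribution and beta the importance-sampling correction. A standalone NumPy sketch of those formulas with illustrative values (not this agent's buffer internals):

import numpy as np

alpha, beta, eps = 0.6, 0.4, 1e-6
td_errors = np.array([2.0, 0.5, 0.0, 4.0])

priorities = (np.abs(td_errors) + eps) ** alpha  # p_i = (|delta_i| + eps)^alpha
probs = priorities / priorities.sum()            # P(i) = p_i / sum_k p_k
weights = (len(td_errors) * probs) ** (-beta)    # w_i = (N * P(i))^(-beta)
weights /= weights.max()                         # normalize so the largest weight is 1

print(probs)    # larger |TD error| -> sampled more often
print(weights)  # frequently sampled transitions get smaller update weights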
Example #20
    def __init__(
            self,
            env,
            memory_size,
            batch_size,
            target_update=100,
            gamma=0.99,
            # replay parameters
            alpha=0.2,
            beta=0.6,
            prior_eps=1e-6,
            # Categorical DQN parameters
            v_min=0,
            v_max=200,
            atom_size=51,
            # N-step Learning
            n_step=3,
            start_train=32,
            save_weights=True,
            log=True,
            lr=0.001,
            seed=0,
            episodes=200):

        self.env = env

        obs_dim = self.env.observation_dim
        action_dim = self.env.action_dim

        self.batch_size = batch_size
        self.target_update = target_update
        self.gamma = gamma
        self.lr = lr
        self.memory_size = memory_size
        self.seed = seed

        # device: cpu / gpu
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        print(self.device)

        # memory for 1-step Learning
        self.beta = beta
        self.prior_eps = prior_eps
        self.memory = PrioritizedReplayBuffer(obs_dim,
                                              memory_size,
                                              batch_size,
                                              alpha=alpha)

        # memory for N-step Learning
        self.use_n_step = n_step > 1
        if self.use_n_step:
            self.n_step = n_step
            self.memory_n = ReplayBuffer(obs_dim,
                                         memory_size,
                                         batch_size,
                                         n_step=n_step,
                                         gamma=gamma)

        # Categorical DQN parameters
        self.v_min = v_min
        self.v_max = v_max
        self.atom_size = atom_size
        self.support = torch.linspace(self.v_min, self.v_max,
                                      self.atom_size).to(self.device)

        # networks: dqn, dqn_target
        self.dqn = Network(obs_dim, action_dim, self.atom_size,
                           self.support).to(self.device)
        self.dqn_target = Network(obs_dim, action_dim, self.atom_size,
                                  self.support).to(self.device)

        self.dqn_target.load_state_dict(self.dqn.state_dict())
        self.dqn_target.eval()

        # optimizer
        self.optimizer = optim.Adam(self.dqn.parameters(), lr=self.lr)

        # transition to store in memory
        self.transition = list()

        self.fig, (self.ax1, self.ax2) = plt.subplots(2, figsize=(10, 10))

        self.start_train = start_train

        self.save_weights = save_weights

        self.time = datetime.datetime.now().timetuple()
        self.path = f"weights/{self.time[2]}-{self.time[1]}-{self.time[0]}_{self.time[3]}-{self.time[4]}"

        self.log = log
        self.episode_cnt = 0
        self.episodes = episodes

        if self.save_weights is True:
            self.create_save_directory()

        plt.ion()
Example #21
class DuelingDoubleDQNagent():
    def __init__(self):
        #self.action_space = [0, 1, 2, 3, 4, 5, 6]
        self.action_space = [i for i in range(4 * 7)]  # 28 grouped actions: board is 7x14
        self.action_size = len(self.action_space)
        self.next_stone_size = 6
        self.state_size = (rows + 1, cols, 1)
        self.discount_factor = 0.99

        # In the DeepMind paper, samples drawn with PER tend to produce larger updates, so for
        # training stability the learning rate was reduced to about 1/4 of the value used with
        # uniform random sampling; that choice is reflected here.
        #self.learning_rate = 0.00025
        self.learning_rate = 0.0000625

        self.epsilon = 0.  #1.
        self.epsilon_min = 0.0
        self.epsilon_decay = 1000000  #1000000

        self.model = self.build_model()
        self.target_model = self.build_model()

        # A custom loss function is defined separately and used for training.
        self.model_updater = self.model_optimizer()

        self.batch_size = 64
        self.train_start = 50000  #50000

        # PER setup and related hyperparameters.

        # beta controls how strongly the importance-sampling correction is applied.
        # Loosely speaking (not a precise definition):
        # large beta -> the sampling bias introduced by PER is corrected more strongly -> smaller updates for high-TD-error samples, slightly more stable training overall
        # small beta -> the PER sampling bias is corrected less -> larger updates for high-TD-error samples, slightly less stable training overall
        # The paper starts beta at 0.4 and increases it linearly to 1 by the end of training.

        # alpha determines how much the magnitude of the TD error is reflected; priorities are computed as (TD-error)^alpha.
        # The closer alpha is to 0, the less the TD-error magnitude matters and the closer sampling is to uniform.
        # The closer alpha is to 1, the more the TD-error magnitude matters, i.e. full PER.
        # The paper uses alpha = 0.6.

        # prioritized_replay_eps is a tiny constant added to the TD error when computing (TD-error)^alpha, so that a priority never becomes exactly zero.

        self.memory = PrioritizedReplayBuffer(1000000, alpha=0.6)  #1000000
        self.beta = 0.4  # 0.4
        self.beta_max = 1.0
        self.beta_decay = 2000000  #5000000
        self.prioritized_replay_eps = 0.000001

        # TensorBoard setup
        self.sess = tf.InteractiveSession()
        K.set_session(self.sess)

        self.summary_placeholders, self.update_ops, self.summary_op = \
            self.setup_summary()
        self.summary_writer = tf.summary.FileWriter('summary/tetris_dqn',
                                                    self.sess.graph)
        self.sess.run(tf.global_variables_initializer())

        self.load_model = True
        if self.load_model:
            self.model.load_weights("./DQN_tetris_model_0311.h5")

        self.imitation_mode = False

    # Log training information for each episode
    def setup_summary(self):
        episode_total_reward = tf.Variable(0.)
        episode_avg_max_q = tf.Variable(0.)
        episode_duration = tf.Variable(0.)
        episode_avg_loss = tf.Variable(0.)

        tf.summary.scalar('Total Reward/Episode', episode_total_reward)
        tf.summary.scalar('Total Clear Line/Episode', episode_avg_max_q)
        #tf.summary.scalar('Duration/Episode', episode_duration)
        #tf.summary.scalar('Average Loss/Episode', episode_avg_loss)
        #tf.train.AdamOptimizer
        summary_vars = [
            episode_total_reward, episode_avg_max_q, episode_duration,
            episode_avg_loss
        ]
        summary_placeholders = [
            tf.placeholder(tf.float32) for _ in range(len(summary_vars))
        ]
        update_ops = [
            summary_vars[i].assign(summary_placeholders[i])
            for i in range(len(summary_vars))
        ]
        summary_op = tf.summary.merge_all()
        return summary_placeholders, update_ops, summary_op

    def build_model(self):

        # Dueling DQN

        state = Input(shape=(
            self.state_size[0],
            self.state_size[1],
            self.state_size[2],
        ))
        layer = Conv2D(32, (5, 5),
                       strides=(1, 1),
                       activation='relu',
                       padding='same',
                       kernel_initializer='he_uniform')(state)  # 64, (4, 4)
        layer = Conv2D(32, (3, 3),
                       strides=(1, 1),
                       activation='relu',
                       padding='same',
                       kernel_initializer='he_uniform')(layer)  ##
        layer = Conv2D(32, (1, 1),
                       strides=(1, 1),
                       activation='relu',
                       padding='same',
                       kernel_initializer='he_uniform')(layer)  ##
        layer = Conv2D(32, (3, 3),
                       strides=(1, 1),
                       activation='relu',
                       padding='same',
                       kernel_initializer='he_uniform')(layer)  ##
        layer = Conv2D(32, (1, 1),
                       strides=(1, 1),
                       activation='relu',
                       padding='same',
                       kernel_initializer='he_uniform')(layer)  ##
        pool_1 = MaxPooling2D(pool_size=(3, 3),
                              strides=(1, 1),
                              padding='valid',
                              data_format=None)(layer)

        layer_2 = Conv2D(64, (3, 3),
                         strides=(1, 1),
                         activation='relu',
                         padding='same',
                         kernel_initializer='he_uniform')(pool_1)  ##
        layer_2 = Conv2D(32, (1, 1),
                         strides=(1, 1),
                         activation='relu',
                         padding='same',
                         kernel_initializer='he_uniform')(layer_2)  ##
        layer_2 = Conv2D(64, (3, 3),
                         strides=(1, 1),
                         activation='relu',
                         padding='same',
                         kernel_initializer='he_uniform')(layer_2)
        pool_2 = MaxPooling2D(pool_size=(2, 2),
                              strides=(1, 1),
                              padding='valid',
                              data_format=None)(layer_2)

        layer_r = Conv2D(32, (rows + 1, 1),
                         strides=(1, 1),
                         activation='relu',
                         padding='same',
                         kernel_initializer='he_uniform')(state)
        layer_c = Conv2D(32, (1, cols),
                         strides=(1, 1),
                         activation='relu',
                         padding='same',
                         kernel_initializer='he_uniform')(state)

        pool_1_r = Conv2D(32, (13, 1),
                          strides=(1, 1),
                          activation='relu',
                          padding='same',
                          kernel_initializer='he_uniform')(pool_1)
        pool_1_c = Conv2D(32, (1, 5),
                          strides=(1, 1),
                          activation='relu',
                          padding='same',
                          kernel_initializer='he_uniform')(pool_1)

        pool_2_r = Conv2D(32, (12, 1),
                          strides=(1, 1),
                          activation='relu',
                          padding='same',
                          kernel_initializer='he_uniform')(pool_2)
        pool_2_c = Conv2D(32, (1, 4),
                          strides=(1, 1),
                          activation='relu',
                          padding='same',
                          kernel_initializer='he_uniform')(pool_2)

        layer = Flatten()(layer)
        layer_2 = Flatten()(layer_2)
        pool_1 = Flatten()(pool_1)
        pool_2 = Flatten()(pool_2)
        layer_r = Flatten()(layer_r)
        layer_c = Flatten()(layer_c)
        pool_1_r = Flatten()(pool_1_r)
        pool_1_c = Flatten()(pool_1_c)
        pool_2_r = Flatten()(pool_2_r)
        pool_2_c = Flatten()(pool_2_c)

        merge_layer = concatenate([
            layer, layer_2, pool_1, pool_2, pool_1_c, pool_1_r, pool_2_c,
            pool_2_r, layer_c, layer_r
        ],
                                  axis=1)
        merge_layer = Dense(128,
                            activation='relu',
                            kernel_initializer='he_uniform')(merge_layer)

        vlayer = Dense(64, activation='relu',
                       kernel_initializer='he_uniform')(merge_layer)
        alayer = Dense(64, activation='relu',
                       kernel_initializer='he_uniform')(merge_layer)
        v = Dense(1, activation='linear',
                  kernel_initializer='he_uniform')(vlayer)
        v = Lambda(lambda v: tf.tile(v, [1, self.action_size]))(v)
        a = Dense(self.action_size,
                  activation='linear',
                  kernel_initializer='he_uniform')(alayer)
        a = Lambda(lambda a: a - tf.reduce_mean(a, axis=-1, keep_dims=True))(a)
        q = Add()([v, a])
        model = Model(inputs=state, outputs=q)
        # A custom loss and optimizer are used, so the compile call is left commented out.
        # model.compile(loss='logcosh', optimizer=Adam(lr=self.learning_rate))
        model.summary()

        return model

    def update_target_model(self):
        self.target_model.set_weights(self.model.get_weights())
        '''
    def get_action(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        else:
            state = np.float32(state)
            q_values = self.model.predict(state)
            return np.argmax(q_values[0])


    def get_action(self, env, state):
        if np.random.rand() <= self.epsilon:
            if env.new_stone_flag:
                return random.randrange(4)
            else:
                return random.randrange(self.action_size)
        else:
            state = np.float32(state)
            q_values = self.model.predict(state)
            return np.argmax(q_values[0])
    '''

    def get_action(self, env, state):
        if np.random.rand() <= self.epsilon:
            if env.stone_number(env.stone) == 1:
                return random.randrange(14)
            elif env.stone_number(env.stone) == 4 or env.stone_number(
                    env.stone) == 6:
                return random.randrange(2) * 7 + random.randrange(6)
            elif env.stone_number(env.stone) == 2 or env.stone_number(
                    env.stone) == 5 or env.stone_number(env.stone) == 7:
                return random.randrange(4) * 7 + random.randrange(6)
            elif env.stone_number(env.stone) == 3:
                return random.randrange(6)
        else:
            state = np.float32(state)
            q_values = self.model.predict(state)
            return np.argmax(q_values[0])

    def model_optimizer(self):
        target = K.placeholder(shape=[None, self.action_size])
        weight = K.placeholder(shape=[
            None,
        ])

        # Huber loss.

        clip_delta = 1.0

        pred = self.model.output

        err = target - pred

        cond = K.abs(err) < clip_delta

        squared_loss = 0.5 * K.square(err)
        linear_loss = clip_delta * (K.abs(err) - 0.5 * clip_delta)

        loss1 = tf.where(cond, squared_loss, linear_loss)

        # Define the PER loss: the Huber loss multiplied by the importance-sampling weights.
        weighted_loss = tf.multiply(tf.expand_dims(weight, -1), loss1)

        loss = K.mean(weighted_loss, axis=-1)

        optimizer = Adam(lr=self.learning_rate)
        updates = optimizer.get_updates(self.model.trainable_weights, [], loss)

        train = K.function([self.model.input, target, weight], [err],
                           updates=updates)

        return train

    def train_model(self):

        (update_input, action, reward, update_target, done, weight,
         batch_idxes) = self.memory.sample(self.batch_size, beta=self.beta)

        target = self.model.predict(update_input)
        target_val = self.target_model.predict(update_target)
        target_val_arg = self.model.predict(update_target)

        # Double DQN
        for i in range(self.batch_size):
            if done[i]:
                target[i][action[i]] = reward[i]
            else:
                a = np.argmax(target_val_arg[i])
                target[i][action[
                    i]] = reward[i] + self.discount_factor * target_val[i][a]

        # Train on the mini-batch sampled from the PER buffer.
        # The TD errors computed during this update (err) are returned so they can be written back into the buffer.
        err = self.model_updater([update_input, target, weight])

        err = np.reshape(err, [self.batch_size, self.action_size])

        # Add a small constant so that no priority becomes exactly zero.
        new_priorities = np.abs(np.sum(err,
                                       axis=1)) + self.prioritized_replay_eps

        # Update the priorities of the sampled transitions with the newly computed TD errors.
        self.memory.update_priorities(batch_idxes, new_priorities)
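This class defines beta_max and beta_decay, but the excerpt never updates beta. One plausible linear annealing step, assuming beta_decay is the number of training steps over which beta should reach beta_max and 0.4 is the starting value set in __init__; this helper is not part of the original class:

    def update_beta(self, beta_start=0.4):
        # hypothetical: move beta linearly from beta_start toward beta_max
        # over beta_decay training steps, then hold it there
        self.beta = min(self.beta_max,
                        self.beta + (self.beta_max - beta_start) / self.beta_decay)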
Example #22
    def __init__(self, dimO, dimA, beta, layers_dim, finalize_graph=True):
        """
        :param finalize_graph: if you want to restore a model using .restore(), set this param to False
        """
        self.dimA = dimA
        self.dimO = dimO
        self.beta = beta
        self.layers_dim = layers_dim

        tau = FLAGS.tau
        discount = FLAGS.discount
        l2norm = FLAGS.l2norm
        learning_rate = FLAGS.rate

        self.opt = self.adam

        self.rm = PrioritizedReplayBuffer(FLAGS.rmsize, FLAGS.alpha)
        self.sess = tf.Session(config=tf.ConfigProto(
            inter_op_parallelism_threads=FLAGS.thread,
            log_device_placement=False,
            allow_soft_placement=True))

        self.noise = np.zeros(self.dimA)
        per_weights = tf.placeholder(tf.float32, [None], 'per_weights')

        obs = tf.placeholder(tf.float32, [None, dimO], "obs")
        act = tf.placeholder(tf.float32, [None, dimA], "act")
        rew = tf.placeholder(tf.float32, [None], "rew")
        with tf.variable_scope('q'):
            negQ = self.negQ(obs, act)
        q = -negQ
        act_grad, = tf.gradients(negQ, act)

        obs_target = tf.placeholder(tf.float32, [None, dimO], "obs_target")
        act_target = tf.placeholder(tf.float32, [None, dimA], "act_target")
        term_target = tf.placeholder(tf.bool, [None], "term_target")
        with tf.variable_scope('q_target'):
            negQ_target = self.negQ(obs_target, act_target)
        act_target_grad, = tf.gradients(negQ_target, act_target)
        q_target = -negQ_target

        y = tf.where(term_target, rew, rew + discount * q_target)
        y = tf.maximum(q - 1., y)
        y = tf.minimum(q + 1., y)
        y = tf.stop_gradient(y)
        print('y shape', y.get_shape())
        print('q shape', q.get_shape())
        td_error = q - y
        print('per weights shape', per_weights.get_shape())
        print('multi td error^2 per weights shape', tf.multiply(tf.square(td_error), per_weights).get_shape())
        ms_td_error = tf.reduce_sum(tf.multiply(tf.square(td_error), per_weights), 0)
        print('ms td error shape', ms_td_error.get_shape())

        regLosses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES, scope='q/')
        loss_q = ms_td_error + \
                 l2norm * tf.reduce_sum(regLosses) + \
                 FLAGS.alpha_beyond * tf.reduce_sum(
                     tf.where(
                         q > FLAGS.RMAX,
                         tf.square(q - FLAGS.RMAX),
                         tf.zeros((FLAGS.bsize,))) +
                     tf.where(
                         q < FLAGS.RMIN,
                         tf.square(q - FLAGS.RMIN),
                         tf.zeros((FLAGS.bsize,))),
                     0
                 )

        self.theta_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='q/')
        self.theta_cvx_ = [v for v in self.theta_
                           if 'proj' in v.name and 'W:' in v.name]
        self.makeCvx = [v.assign(tf.abs(v)) for v in self.theta_cvx_]
        self.proj = [v.assign(tf.maximum(v, 0)) for v in self.theta_cvx_]

        self.theta_target_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                               scope='q_target/')
        update_target = [theta_target_i.assign_sub(tau * (theta_target_i - theta_i))
                         for theta_i, theta_target_i in zip(self.theta_, self.theta_target_)]

        optim_q = tf.train.AdamOptimizer(learning_rate=learning_rate)
        grads_and_vars_q = optim_q.compute_gradients(loss_q)
        optimize_q = optim_q.apply_gradients(grads_and_vars_q)

        summary_writer = tf.summary.FileWriter(os.path.join(FLAGS.outdir, 'board'),
                                               self.sess.graph)
        tf.summary.scalar('Qvalue (batch avg)', tf.reduce_mean(q))
        tf.summary.scalar('Qvalue (batch max)', tf.reduce_max(q))
        tf.summary.scalar('Qvalue (batch min)', tf.reduce_min(q))
        tf.summary.scalar('Q targets (batch avg)', tf.reduce_mean(q_target))
        tf.summary.scalar('Q targets (batch min)', tf.reduce_min(q_target))
        tf.summary.scalar('Q targets (batch max)', tf.reduce_max(q_target))
        tf.summary.scalar('loss', ms_td_error)
        tf.summary.scalar('td error', tf.reduce_mean(tf.abs(td_error)))
        tf.summary.scalar('reward', tf.reduce_mean(rew))
        tf.summary.scalar('chosen actions', tf.reduce_mean(act))
        tf.summary.scalar('maximizing action (batch avg)', tf.reduce_mean(act_target))
        tf.summary.scalar('maximizing action (batch max)', tf.reduce_max(act_target))
        tf.summary.scalar('maximizing action (batch min)', tf.reduce_min(act_target))
        merged = tf.summary.merge_all()

        # tf functions
        with self.sess.as_default():
            self._train = Fun([obs, act, rew, obs_target, act_target, term_target, per_weights],
                              [optimize_q, update_target, loss_q, tf.abs(td_error), q, q_target],
                              merged, summary_writer)
            self._fg = Fun([obs, act], [negQ, act_grad])
            self._fg_target = Fun([obs_target, act_target], [negQ_target, act_target_grad])

        # initialize tf variables
        self.saver = tf.train.Saver(max_to_keep=100)
        ckpt = tf.train.latest_checkpoint(FLAGS.outdir + "/tf")
        if ckpt:
            self.saver.restore(self.sess, ckpt)
        else:
            self.sess.run(tf.global_variables_initializer())
            self.sess.run(self.makeCvx)
            self.sess.run([theta_target_i.assign(theta_i)
                           for theta_i, theta_target_i in zip(self.theta_, self.theta_target_)])

        if finalize_graph:
            self.sess.graph.finalize()

        self.t = 0  # global training time (number of observations)
Example #23
class DQNAgent:
    def __init__(
            self,
            env,
            memory_size,
            batch_size,
            target_update=100,
            gamma=0.99,
            # replay parameters
            alpha=0.2,
            beta=0.6,
            prior_eps=1e-6,
            # Categorical DQN parameters
            v_min=0,
            v_max=200,
            atom_size=51,
            # N-step Learning
            n_step=3,
            start_train=32,
            save_weights=True,
            log=True,
            lr=0.001,
            seed=0,
            episodes=200):

        self.env = env

        obs_dim = self.env.observation_dim
        action_dim = self.env.action_dim

        self.batch_size = batch_size
        self.target_update = target_update
        self.gamma = gamma
        self.lr = lr
        self.memory_size = memory_size
        self.seed = seed

        # device: cpu / gpu
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        print(self.device)

        # memory for 1-step Learning
        self.beta = beta
        self.prior_eps = prior_eps
        self.memory = PrioritizedReplayBuffer(obs_dim,
                                              memory_size,
                                              batch_size,
                                              alpha=alpha)

        # memory for N-step Learning
        self.use_n_step = n_step > 1
        if self.use_n_step:
            self.n_step = n_step
            self.memory_n = ReplayBuffer(obs_dim,
                                         memory_size,
                                         batch_size,
                                         n_step=n_step,
                                         gamma=gamma)

        # Categorical DQN parameters
        self.v_min = v_min
        self.v_max = v_max
        self.atom_size = atom_size
        self.support = torch.linspace(self.v_min, self.v_max,
                                      self.atom_size).to(self.device)

        # networks: dqn, dqn_target
        self.dqn = Network(obs_dim, action_dim, self.atom_size,
                           self.support).to(self.device)
        self.dqn_target = Network(obs_dim, action_dim, self.atom_size,
                                  self.support).to(self.device)

        self.dqn_target.load_state_dict(self.dqn.state_dict())
        self.dqn_target.eval()

        # optimizer
        self.optimizer = optim.Adam(self.dqn.parameters(), lr=self.lr)

        # transition to store in memory
        self.transition = list()

        self.fig, (self.ax1, self.ax2) = plt.subplots(2, figsize=(10, 10))

        self.start_train = start_train

        self.save_weights = save_weights

        self.time = datetime.datetime.now().timetuple()
        self.path = f"weights/{self.time[2]}-{self.time[1]}-{self.time[0]}_{self.time[3]}-{self.time[4]}"

        self.log = log
        self.episode_cnt = 0
        self.episodes = episodes

        if self.save_weights is True:
            self.create_save_directory()

        plt.ion()

    def create_save_directory(self):
        try:
            os.mkdir(self.path)
        except OSError:
            print("Creation of the directory %s failed" % self.path)
        else:
            print("Successfully created the directory %s " % self.path)

    def select_action(self, state):
        """Select an action from the input state."""
        # NoisyNet: no epsilon greedy action selection
        selected_action = self.dqn(torch.FloatTensor(state).to(
            self.device)).argmax()
        selected_action = selected_action.detach().cpu().numpy()

        self.transition = [state, selected_action]

        return selected_action

    def step(self, action):
        """Take an action and return the response of the env."""
        next_state, reward, done = self.env.step(action)

        self.transition += [reward, next_state, done]

        # N-step transition
        if self.use_n_step:
            one_step_transition = self.memory_n.store(*self.transition)
        # 1-step transition
        else:
            one_step_transition = self.transition

        # add a single step transition
        if one_step_transition:
            self.memory.store(*one_step_transition)

        return next_state, reward, done

    def update_model(self):
        """Update the model by gradient descent."""
        # PER needs beta to calculate weights
        samples = self.memory.sample_batch(self.beta)
        weights = torch.FloatTensor(samples["weights"].reshape(-1, 1)).to(
            self.device)
        indices = samples["indices"]

        # 1-step Learning loss
        elementwise_loss = self._compute_dqn_loss(samples, self.gamma)

        # PER: importance sampling before average
        loss = torch.mean(elementwise_loss * weights)

        # N-step Learning loss
        # We combine the 1-step and n-step losses to reduce variance;
        # the original Rainbow uses the n-step loss only.
        if self.use_n_step:
            gamma = self.gamma**self.n_step
            samples = self.memory_n.sample_batch_from_idxs(indices)
            elementwise_loss_n_loss = self._compute_dqn_loss(samples, gamma)
            elementwise_loss += elementwise_loss_n_loss

            # PER: importance sampling before average
            loss = torch.mean(elementwise_loss * weights)

        self.optimizer.zero_grad()
        loss.backward()
        # print(loss)
        clip_grad_norm_(self.dqn.parameters(), 10.0)
        self.optimizer.step()

        # PER: update priorities
        loss_for_prior = elementwise_loss.detach().cpu().numpy()
        new_priorities = loss_for_prior + self.prior_eps
        self.memory.update_priorities(indices, new_priorities)

        # NoisyNet: reset noise
        self.dqn.reset_noise()
        self.dqn_target.reset_noise()

        return loss.item()

    def train(self, num_frames, plotting_interval=100):
        """Train the agent."""

        if self.log:
            pass
            # config = {'gamma': self.gamma, 'log_interval': plotting_interval, 'learning_rate': self.lr,
            #           'directory': self.path, 'type': 'dqn', 'replay_memory': self.memory_size, 'environment': 'normal', 'seed': self.seed}
            # wandb.init(project='is_os', entity='pydqn', config=config, notes=self.env.reward_function, reinit=True, tags=['report'])
            # wandb.watch(self.dqn)

        self.env.reset()
        state = self.env.get_state()
        won = False
        update_cnt = 0
        losses = []
        scores = []
        score = 0
        frame_cnt = 0
        self.episode_cnt = 0

        for frame_idx in range(1, num_frames + 1):
            frame_cnt += 1
            action = self.select_action(state)
            next_state, reward, done = self.step(action)

            state = next_state
            score += reward

            # PER: anneal beta toward 1.0 over the whole training run
            fraction = min(frame_idx / num_frames, 1.0)
            self.beta = self.beta + fraction * (1.0 - self.beta)

            # truncate the episode after 500 frames
            if frame_cnt == 500:
                done = True

            # if episode ends
            if done:
                if reward > 0:
                    won = True
                self.env.reset()
                state = self.env.get_state()
                self.episode_cnt += 1
                scores.append(score)
                score = 0
                frame_cnt = 0

            # if training is ready
            if len(self.memory) >= self.batch_size:
                loss = self.update_model()
                losses.append(loss)
                update_cnt += 1

                # if hard update is needed
                if update_cnt % self.target_update == 0:
                    self._target_hard_update()

            # plotting
            if frame_idx % plotting_interval == 0:
                self._plot(frame_idx, scores, losses)

            if frame_idx % 1000 == 0:
                torch.save(self.dqn.state_dict(),
                           f'{self.path}/{frame_idx}.tar')
                print(f"model saved at:\n {self.path}/{frame_idx}.tar")

        # wandb.run.summary['won'] = won
        self.env.close()

    def _compute_dqn_loss(self, samples, gamma):
        """Return categorical dqn loss."""
        device = self.device  # for shortening the following lines
        state = torch.FloatTensor(samples["obs"]).to(device)
        next_state = torch.FloatTensor(samples["next_obs"]).to(device)
        action = torch.LongTensor(samples["acts"]).to(device)
        reward = torch.FloatTensor(samples["rews"].reshape(-1, 1)).to(device)
        done = torch.FloatTensor(samples["done"].reshape(-1, 1)).to(device)

        # Categorical DQN algorithm
        delta_z = float(self.v_max - self.v_min) / (self.atom_size - 1)

        with torch.no_grad():
            # Double DQN
            next_action = self.dqn(next_state).argmax(1)
            next_dist = self.dqn_target.dist(next_state)
            next_dist = next_dist[range(self.batch_size), next_action]

            t_z = reward + (1 - done) * gamma * self.support
            t_z = t_z.clamp(min=self.v_min, max=self.v_max)
            b = (t_z - self.v_min) / delta_z
            l = b.floor().long()
            u = b.ceil().long()

            offset = (torch.linspace(
                0, (self.batch_size - 1) * self.atom_size,
                self.batch_size).long().unsqueeze(1).expand(
                    self.batch_size, self.atom_size).to(self.device))

            proj_dist = torch.zeros(next_dist.size(), device=self.device)
            proj_dist.view(-1).index_add_(0, (l + offset).view(-1),
                                          (next_dist *
                                           (u.float() - b)).view(-1))
            proj_dist.view(-1).index_add_(0, (u + offset).view(-1),
                                          (next_dist *
                                           (b - l.float())).view(-1))

        dist = self.dqn.dist(state)
        log_p = torch.log(dist[range(self.batch_size), action])
        elementwise_loss = -(proj_dist * log_p).sum(1)

        return elementwise_loss

    def _target_hard_update(self):
        """Hard update: target <- local."""
        self.dqn_target.load_state_dict(self.dqn.state_dict())

    def _plot(self, frame_cnt, scores, losses):
        self.ax1.cla()
        self.ax1.set_title(
            f'frames: {frame_cnt} score: {np.mean(scores[-10:])}')
        self.ax1.plot(scores[-999:], color='red')
        self.ax2.cla()
        self.ax2.set_title(f'loss: {np.mean(losses[-10:])}')
        self.ax2.plot(losses[-999:], color='blue')
        plt.show()
        plt.pause(0.1)

        # needed for wandb to not log nans
        # if frame_cnt < self.start_train + 11:
        #     loss = 0
        # else:
        #     loss = np.mean(losses[-10:])

        if self.log:
            pass
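
A quick aside on the loss above: the agent pairs Double DQN action selection with the C51 categorical projection inside _compute_dqn_loss. The snippet below is a minimal, framework-free sketch of just that projection step (NumPy only; the support bounds, atom count, and toy batch are illustrative values rather than the agent's settings, and the sketch handles explicitly the corner case where a projected atom lands exactly on a support point):

import numpy as np

def categorical_projection(next_dist, rewards, dones, gamma,
                           v_min=-10.0, v_max=10.0, atom_size=51):
    """Project r + gamma * z onto the fixed support (the C51 projection)."""
    batch_size = next_dist.shape[0]
    support = np.linspace(v_min, v_max, atom_size)
    delta_z = (v_max - v_min) / (atom_size - 1)

    # Bellman-updated atom locations, clipped into the support range
    t_z = rewards[:, None] + (1.0 - dones[:, None]) * gamma * support[None, :]
    t_z = np.clip(t_z, v_min, v_max)
    b = (t_z - v_min) / delta_z
    lower, upper = np.floor(b).astype(int), np.ceil(b).astype(int)

    proj = np.zeros((batch_size, atom_size))
    for i in range(batch_size):
        for j in range(atom_size):
            if lower[i, j] == upper[i, j]:
                # projected atom falls exactly on a support point
                proj[i, lower[i, j]] += next_dist[i, j]
            else:
                # split the probability mass between the two neighbouring atoms
                proj[i, lower[i, j]] += next_dist[i, j] * (upper[i, j] - b[i, j])
                proj[i, upper[i, j]] += next_dist[i, j] * (b[i, j] - lower[i, j])
    return proj

# toy usage: uniform next-state distributions, small random rewards
rng = np.random.default_rng(0)
next_dist = np.full((4, 51), 1.0 / 51)
proj = categorical_projection(next_dist, rng.normal(size=4), np.zeros(4), 0.99)
print(proj.sum(axis=1))  # every row still sums to 1
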
Example #24
0
class Agent:
    # @todo: when instantiating two of these, it raises an Exception, because it tries to redefine
    # @todo: the scopes or variables (the names are already taken)
    # @todo: FIX THIS !!!
    """
    We don't use the bundle entropy method to optimize wrt actions,
    but rather plain SGD (or rather Adam)
    """
    def __init__(self, dimO, dimA, beta, layers_dim, finalize_graph=True):
        """
        :param finalize_graph: if you want to restore a model, using .restore(), set this param to False
        """
        self.dimA = dimA
        self.dimO = dimO
        self.beta = beta
        self.layers_dim = layers_dim

        tau = FLAGS.tau
        discount = FLAGS.discount
        l2norm = FLAGS.l2norm
        learning_rate = FLAGS.rate

        self.opt = self.adam

        self.rm = PrioritizedReplayBuffer(FLAGS.rmsize, FLAGS.alpha)
        self.sess = tf.Session(config=tf.ConfigProto(
            inter_op_parallelism_threads=FLAGS.thread,
            log_device_placement=False,
            allow_soft_placement=True))

        self.noise = np.zeros(self.dimA)
        per_weights = tf.placeholder(tf.float32, [None], 'per_weights')

        obs = tf.placeholder(tf.float32, [None, dimO], "obs")
        act = tf.placeholder(tf.float32, [None, dimA], "act")
        rew = tf.placeholder(tf.float32, [None], "rew")
        with tf.variable_scope('q'):
            negQ = self.negQ(obs, act)
        q = -negQ
        act_grad, = tf.gradients(negQ, act)

        obs_target = tf.placeholder(tf.float32, [None, dimO], "obs_target")
        act_target = tf.placeholder(tf.float32, [None, dimA], "act_target")
        term_target = tf.placeholder(tf.bool, [None], "term_target")
        with tf.variable_scope('q_target'):
            negQ_target = self.negQ(obs_target, act_target)
        act_target_grad, = tf.gradients(negQ_target, act_target)
        q_target = -negQ_target

        y = tf.where(term_target, rew, rew + discount * q_target)
        y = tf.maximum(q - 1., y)
        y = tf.minimum(q + 1., y)
        y = tf.stop_gradient(y)
        print('y shape', y.get_shape())
        print('q shape', q.get_shape())
        td_error = q - y
        print('per weights shape', per_weights.get_shape())
        print('multi td error^2 per weights shape', tf.multiply(tf.square(td_error), per_weights).get_shape())
        ms_td_error = tf.reduce_sum(tf.multiply(tf.square(td_error), per_weights), 0)
        print('ms td error shape', ms_td_error.get_shape())

        regLosses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES, scope='q/')
        loss_q = ms_td_error + \
                 l2norm * tf.reduce_sum(regLosses) + \
                 FLAGS.alpha_beyond * tf.reduce_sum(
                     tf.where(
                         q > FLAGS.RMAX,
                         tf.square(q - FLAGS.RMAX),
                         tf.zeros((FLAGS.bsize,))) +
                     tf.where(
                         q < FLAGS.RMIN,
                         tf.square(q - FLAGS.RMIN),
                         tf.zeros((FLAGS.bsize,))),
                     0
                 )

        self.theta_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='q/')
        self.theta_cvx_ = [v for v in self.theta_
                           if 'proj' in v.name and 'W:' in v.name]
        self.makeCvx = [v.assign(tf.abs(v)) for v in self.theta_cvx_]
        self.proj = [v.assign(tf.maximum(v, 0)) for v in self.theta_cvx_]

        self.theta_target_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                               scope='q_target/')
        update_target = [theta_target_i.assign_sub(tau * (theta_target_i - theta_i))
                         for theta_i, theta_target_i in zip(self.theta_, self.theta_target_)]

        optim_q = tf.train.AdamOptimizer(learning_rate=learning_rate)
        grads_and_vars_q = optim_q.compute_gradients(loss_q)
        optimize_q = optim_q.apply_gradients(grads_and_vars_q)

        summary_writer = tf.summary.FileWriter(os.path.join(FLAGS.outdir, 'board'),
                                               self.sess.graph)
        tf.summary.scalar('Qvalue (batch avg)', tf.reduce_mean(q))
        tf.summary.scalar('Qvalue (batch max)', tf.reduce_max(q))
        tf.summary.scalar('Qvalue (batch min)', tf.reduce_min(q))
        tf.summary.scalar('Q targets (batch avg)', tf.reduce_mean(q_target))
        tf.summary.scalar('Q targets (batch min)', tf.reduce_min(q_target))
        tf.summary.scalar('Q targets (batch max)', tf.reduce_max(q_target))
        tf.summary.scalar('loss', ms_td_error)
        tf.summary.scalar('td error', tf.reduce_mean(tf.abs(td_error)))
        tf.summary.scalar('reward', tf.reduce_mean(rew))
        tf.summary.scalar('chosen actions', tf.reduce_mean(act))
        tf.summary.scalar('maximizing action (batch avg)', tf.reduce_mean(act_target))
        tf.summary.scalar('maximizing action (batch max)', tf.reduce_max(act_target))
        tf.summary.scalar('maximizing action (batch min)', tf.reduce_min(act_target))
        merged = tf.summary.merge_all()

        # tf functions
        with self.sess.as_default():
            self._train = Fun([obs, act, rew, obs_target, act_target, term_target, per_weights],
                              [optimize_q, update_target, loss_q, tf.abs(td_error), q, q_target],
                              merged, summary_writer)
            self._fg = Fun([obs, act], [negQ, act_grad])
            self._fg_target = Fun([obs_target, act_target], [negQ_target, act_target_grad])

        # initialize tf variables
        self.saver = tf.train.Saver(max_to_keep=100)
        ckpt = tf.train.latest_checkpoint(FLAGS.outdir + "/tf")
        if ckpt:
            self.saver.restore(self.sess, ckpt)
        else:
            self.sess.run(tf.global_variables_initializer())
            self.sess.run(self.makeCvx)
            self.sess.run([theta_target_i.assign(theta_i)
                           for theta_i, theta_target_i in zip(self.theta_, self.theta_target_)])

        if finalize_graph:
            self.sess.graph.finalize()

        self.t = 0  # global training time (number of observations)

    def adam(self, func, obs, plot=False):
        """Optimizer to find the greedy action"""
        # if npr.random() < 1./20:
        #     plot = True
        b1 = 0.9
        b2 = 0.999
        lam = 0.5
        eps = 1e-8
        alpha = 0.01
        nBatch = obs.shape[0]
        act = np.zeros((nBatch, self.dimA))
        m = np.zeros_like(act)
        v = np.zeros_like(act)

        b1t, b2t = 1., 1.
        act_best, a_diff, f_best = [None] * 3
        hist = {'act': [], 'f': [], 'g': []}
        for i in range(1000):
            f, g = func(obs, act)
            if plot:
                hist['act'].append(act.copy())
                hist['f'].append(f)
                hist['g'].append(g)

            if i == 0:
                act_best = act.copy()
                f_best = f.copy()
            else:
                prev_act_best = act_best.copy()
                I = (f < f_best)
                act_best[I] = act[I]
                f_best[I] = f[I]
                a_diff_i = np.mean(np.linalg.norm(act_best - prev_act_best, axis=1))
                a_diff = a_diff_i if a_diff is None \
                    else lam * a_diff + (1. - lam) * a_diff_i
                # print(a_diff_i, a_diff, np.sum(f))
                if a_diff < 1e-3 and i > 5:
                    if plot:
                        self.adam_plot(func, obs, hist)
                    return act_best

            m = b1 * m + (1. - b1) * g
            v = b2 * v + (1. - b2) * (g * g)
            b1t *= b1
            b2t *= b2
            mhat = m / (1. - b1t)
            vhat = v / (1. - b2t)

            act -= alpha * mhat / (np.sqrt(vhat) + eps)
            act = np.clip(act, FLAGS.a_min + 1e-8, FLAGS.a_max - 1e-8)

        print('  + Warning: Adam did not converge.')
        if plot:
            self.adam_plot(func, obs, hist)
        return act_best

    def adam_plot(self, func, obs, hist):
        hist['act'] = np.array(hist['act']).T
        hist['f'] = np.array(hist['f']).T
        hist['g'] = np.array(hist['g']).T
        if self.dimA == 1:
            xs = np.linspace(-1. + 1e-8, 1. - 1e-8, 100)
            ys = [func(obs[[0], :], [[xi]])[0] for xi in xs]
            fig = plt.figure()
            plt.plot(xs, ys)
            plt.plot(hist['act'][0, 0, :], hist['f'][0, :], label='Adam')
            plt.legend()
            fname = os.path.join(FLAGS.outdir, 'adamPlt.png')
            print("Saving Adam plot to {}".format(fname))
            plt.savefig(fname)
            plt.close(fig)
        elif self.dimA == 2:
            assert (False)
        else:
            xs = npr.uniform(-1., 1., (5000, self.dimA))
            ys = np.array([func(obs[[0], :], [xi])[0] for xi in xs])
            epi = np.hstack((xs, ys))
            pca = PCA(n_components=2).fit(epi)
            W = pca.components_[:, :-1]
            xs_proj = xs.dot(W.T)
            fig = plt.figure()

            X = Y = np.linspace(xs_proj.min(), xs_proj.max(), 100)
            Z = griddata(xs_proj[:, 0], xs_proj[:, 1], ys.ravel(),
                         X, Y, interp='linear')

            plt.contourf(X, Y, Z, 15)
            plt.colorbar()

            adam_x = hist['act'][:, 0, :].T
            adam_x = adam_x.dot(W.T)
            plt.plot(adam_x[:, 0], adam_x[:, 1], label='Adam', color='k')
            plt.legend()

            fname = os.path.join(FLAGS.outdir, 'adamPlt.png')
            print("Saving Adam plot to {}".format(fname))
            plt.savefig(fname)
            plt.close(fig)

    def reset(self, obs):
        self.noise = np.zeros(self.dimA)
        self.observation = obs  # initial observation

    def act(self, test=False):
        """
        Greedily choose action
        There is noise during training
        """
        with self.sess.as_default():
            obs = np.expand_dims(self.observation, axis=0)

            f = self._fg

            tflearn.is_training(False)
            action = self.opt(f, obs)
            tflearn.is_training(not test)

            if not test:
                # sig = (self.t < 40000) * (self.t * (FLAGS.ousigma_end - FLAGS.ousigma_start) / 40000 + FLAGS.ousigma_start) + (self.t >= 40000) * FLAGS.ousigma_end
                # self.noise = sig * npr.randn(self.dimA)
                self.noise -= FLAGS.outheta * self.noise - FLAGS.ousigma * npr.randn(self.dimA)
                action += self.noise
            action = np.clip(action, FLAGS.a_min, FLAGS.a_max)

            self.action = np.atleast_1d(np.squeeze(action, axis=0))
            return self.action

    def observe(self, rew, term, obs2, test=False):
        obs1 = self.observation
        self.observation = obs2

        # train
        if not test:

            self.rm.add(*(obs1, self.action, rew, obs2, term))

            if self.t > FLAGS.warmup:
                for i in range(FLAGS.iter):
                    loss = self.train()

    def train(self):
        self.t += 1
        beta = self.beta(self.t)
        with self.sess.as_default():
            obs, act, rew, ob2, term2, w, idx = self.rm.sample(FLAGS.bsize, beta)
            rew, term2, w = rew.squeeze(), term2.squeeze(), w.squeeze()  # fix dimensions
            # w = np.ones(w.shape)  # no prioritization
            f = self._fg_target
            tflearn.is_training(False)
            act2 = self.opt(f, ob2)
            tflearn.is_training(True)

            _, _, loss, td_error, q, q_target = self._train(obs, act, rew, ob2, act2, term2, w,
                                                            log=FLAGS.summary, global_step=self.t)
            self.sess.run(self.proj)  # keep some weights positive
            # self.rm.update_priorities(idx, np.array(td_error.shape[0] * [1.]))  # no prioritization
            self.rm.update_priorities(idx, td_error + 1e-2)
            return loss, td_error, q, q_target

    def negQ(self, x, y, reuse=False):
        """Architecture of the neural network"""
        print('x shape', x.get_shape())
        print('y shape', y.get_shape())
        szs = self.layers_dim
        assert (len(szs) >= 1)
        fc = tflearn.fully_connected
        bn = tflearn.batch_normalization
        lrelu = tflearn.activations.leaky_relu

        if reuse:
            tf.get_variable_scope().reuse_variables()

        nLayers = len(szs)
        us = []
        zs = []
        z_zs = []
        z_ys = []
        z_us = []

        reg = 'L2'

        prevU = x
        for i in range(nLayers):
            with tf.variable_scope('u' + str(i), reuse=reuse) as s:
                u = fc(prevU, szs[i], reuse=reuse, scope=s, regularizer=reg)
                if i < nLayers - 1:
                    u = tf.nn.relu(u)
                    if FLAGS.icnn_bn:
                        u = bn(u, reuse=reuse, scope=s, name='bn')
            variable_summaries(u, suffix='u{}'.format(i))
            us.append(u)
            prevU = u

        prevU, prevZ = x, y
        for i in range(nLayers + 1):
            sz = szs[i] if i < nLayers else 1
            z_add = []
            if i > 0:
                with tf.variable_scope('z{}_zu_u'.format(i), reuse=reuse) as s:
                    zu_u = fc(prevU, szs[i - 1], reuse=reuse, scope=s,
                              activation='relu', bias=True,
                              regularizer=reg, bias_init=tf.constant_initializer(1.))
                    variable_summaries(zu_u, suffix='zu_u{}'.format(i))
                with tf.variable_scope('z{}_zu_proj'.format(i), reuse=reuse) as s:
                    z_zu = fc(tf.multiply(prevZ, zu_u), sz, reuse=reuse, scope=s,
                              bias=False, regularizer=reg)
                    variable_summaries(z_zu, suffix='z_zu{}'.format(i))
                z_zs.append(z_zu)
                z_add.append(z_zu)

            with tf.variable_scope('z{}_yu_u'.format(i), reuse=reuse) as s:
                yu_u = fc(prevU, self.dimA, reuse=reuse, scope=s, bias=True,
                          regularizer=reg, bias_init=tf.constant_initializer(1.))
                variable_summaries(yu_u, suffix='yu_u{}'.format(i))
            with tf.variable_scope('z{}_yu'.format(i), reuse=reuse) as s:
                z_yu = fc(tf.multiply(y, yu_u), sz, reuse=reuse, scope=s, bias=False,
                          regularizer=reg)
                z_ys.append(z_yu)
                variable_summaries(z_yu, suffix='z_yu{}'.format(i))
            z_add.append(z_yu)

            with tf.variable_scope('z{}_u'.format(i), reuse=reuse) as s:
                z_u = fc(prevU, sz, reuse=reuse, scope=s,
                         bias=True, regularizer=reg,
                         bias_init=tf.constant_initializer(0.))
                variable_summaries(z_u, suffix='z_u{}'.format(i))
            z_us.append(z_u)
            z_add.append(z_u)

            z = tf.add_n(z_add)
            variable_summaries(z, suffix='z{}_preact'.format(i))
            if i < nLayers:
                # z = tf.nn.relu(z)
                z = lrelu(z, alpha=FLAGS.lrelu)
                variable_summaries(z, suffix='z{}_act'.format(i))

            zs.append(z)
            prevU = us[i] if i < nLayers else None
            prevZ = z

        print('z shape', z.get_shape())
        z = tf.reshape(z, [-1], name='energies')
        return z

    def save(self, path):
        self.saver.save(self.sess, path)

    def restore(self, filename):
        """
        IMPORTANT:
        Filename should be the filepath to the 4 following files:
            - 50314.index
            - 50314.meta
            - 50314.data-00000-of-00001
            - checkpoint
        Note that it shouldn't include any extension. In this case, it would therefore be `tensorboard/models/50314`
        Note that it is `50314` because I used the global training step as a filename to save model

        !!!! BESIDES YOU SHOULD HAVE INSTANTIATED THE AGENT WITH `finalize_graph=False` !!!!
        """
        self.saver = tf.train.import_meta_graph(filename+'.meta')
        self.saver.restore(self.sess, filename)
        self.sess.graph.finalize()

    def __del__(self):
        self.sess.close()
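
The exploration noise in Agent.act above is an Ornstein-Uhlenbeck process written as a single in-place update. Below is a minimal stand-alone sketch of the same process (NumPy only; theta, sigma, and dim are illustrative defaults, not the FLAGS used above):

import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process: noise <- noise + theta * (mu - noise) + sigma * N(0, 1)."""
    def __init__(self, dim, mu=0.0, theta=0.15, sigma=0.2, seed=0):
        self.mu, self.theta, self.sigma = mu, theta, sigma
        self.rng = np.random.default_rng(seed)
        self.noise = np.full(dim, mu, dtype=float)

    def reset(self):
        self.noise[:] = self.mu

    def sample(self):
        # mean-reverting drift plus a Gaussian perturbation, as in Agent.act above
        self.noise += self.theta * (self.mu - self.noise) \
            + self.sigma * self.rng.standard_normal(self.noise.shape)
        return self.noise.copy()

ou = OUNoise(dim=2)
for _ in range(3):
    print(ou.sample())
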
Example #25
0
def main():

#    env = gym.make("CartPoleRob-v0")
#    env = gym.make("CartPole-v0")
#    env = gym.make("CartPole-v1")
#    env = gym.make("Acrobot-v1")
#    env = gym.make("MountainCarRob-v0")
#    env = gym.make("FrozenLake-v0")
#    env = gym.make("FrozenLake8x8-v0")
#    env = gym.make("FrozenLake8x8rob-v0")
#    env = gym.make("FrozenLake16x16rob-v0")
    env = gym.make("TestRob3-v0")
    
    
    
    # same as getDeictic except this one just calculates for the observation
    # input: n x n x channels
    # output: dn x dn x channels
    def getDeicticObs(obses_t, windowLen):
        deicticObses_t = []
        for i in range(np.shape(obses_t)[0] - windowLen):
            for j in range(np.shape(obses_t)[1] - windowLen):
                deicticObses_t.append(obses_t[i:i+windowLen,j:j+windowLen,:])
        return np.array(deicticObses_t)

    # get set of deictic alternatives
    # input: batch x n x n x channels
    # output: (batch x deictic) x dn x dn x channels
    def getDeictic(obses_t, actions, obses_tp1, weights, windowLen):
        deicticObses_t = []
        deicticActions = []
        deicticObses_tp1 = []
        deicticWeights = []
        for i in range(np.shape(obses_t)[0]):
            for j in range(np.shape(obses_t)[1] - windowLen):
                for k in range(np.shape(obses_t)[2] - windowLen):
                    deicticObses_t.append(obses_t[i,j:j+windowLen,k:k+windowLen,:])
                    deicticActions.append(actions[i])
                    deicticObses_tp1.append(obses_tp1[i,j:j+windowLen,k:k+windowLen,:])
                    deicticWeights.append(weights[i])
        return np.array(deicticObses_t), np.array(deicticActions), np.array(deicticObses_tp1), np.array(deicticWeights)

    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(
#        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], # used in pong
#        hiddens=[256],  # used in pong
#        convs=[(8,4,1)], # used for non-deictic TestRob3-v0
        convs=[(4,3,1)], # used for deictic TestRob3-v0
        hiddens=[16],
        dueling=True
    )

    # parameters
    q_func=model
    lr=1e-3
#    max_timesteps=100000
#    max_timesteps=50000
    max_timesteps=20000
    buffer_size=50000
    exploration_fraction=0.1
#    exploration_fraction=0.3
    exploration_final_eps=0.02
#    exploration_final_eps=0.1
    train_freq=1
    batch_size=32
    print_freq=10
    checkpoint_freq=10000
    learning_starts=1000
    gamma=1.
    target_network_update_freq=500
    prioritized_replay=False
#    prioritized_replay=True
    prioritized_replay_alpha=0.6
    prioritized_replay_beta0=0.4
    prioritized_replay_beta_iters=None
    prioritized_replay_eps=1e-6
    num_cpu=16
    
    deicticShape = (3,3,1)
    def make_obs_ph(name):
#        return U.BatchInput(env.observation_space.shape, name=name)
        return U.BatchInput(deicticShape, name=name)

    matchShape = (batch_size*25,)
    def make_match_ph(name):
        return U.BatchInput(matchShape, name=name)

    
    sess = U.make_session(num_cpu)
    sess.__enter__()

#    act, train, update_target, debug = build_graph.build_train(
#    getq, train, trainWOUpdate, update_target, debug = build_graph.build_train_deictic(
#    getq, train, trainWOUpdate, debug = build_graph.build_train_deictic(
#    getq, train, trainWOUpdate, update_target, debug = build_graph.build_train_deictic(
    getq, train, trainWOUpdate, update_target, debug = build_graph.build_train_deictic_min(
        make_obs_ph=make_obs_ph,
        make_match_ph=make_match_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10
    )

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()


    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    
#    with tempfile.TemporaryDirectory() as td:
    model_saved = False
#        model_file = os.path.join(td, "model")
    for t in range(max_timesteps):
        
        # get action to take
#        action = act(np.array(obs)[None], update_eps=exploration.value(t))[0]
#        qvalues = getq(np.array(obs)[None])
#        action = np.argmax(qvalues)
#        if np.random.rand() < exploration.value(t):
#            action = np.random.randint(env.action_space.n)
        
        deicticObs = getDeicticObs(obs,3)
        qvalues = getq(np.array(deicticObs))
        action = np.argmax(np.max(qvalues,0))
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)
        
#        # temporarily take uniformly random actions all the time
#        action = np.random.randint(env.action_space.n)
        
        new_obs, rew, done, _ = env.step(action)
        
        # Store transition in the replay buffer.
        replay_buffer.add(obs, action, rew, new_obs, float(done))
        obs = new_obs
        
        episode_rewards[-1] += rew
        if done:
            obs = env.reset()
            episode_rewards.append(0.0)

        if t > learning_starts and t % train_freq == 0:
            
            # Get batch
            if prioritized_replay:
                experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
            else:
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None
            
            # Convert batch to deictic format
            obses_t_deic, actions_deic, obses_tp1_deic, weights_deic = getDeictic(obses_t, actions, obses_tp1, weights, 3)
            
            obses_t_deic_fingerprints = [np.reshape(obses_t_deic[i],[9]) for i in range(np.shape(obses_t_deic)[0])]
            _, _, fingerprintMatch = np.unique(obses_t_deic_fingerprints,axis=0,return_index=True,return_inverse=True)
#            matchTemplates = [fingerprintMatch == i for i in range(np.max(fingerprintMatch)+1)]
            
#            td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
#            td_errors = train(obses_t_deic, actions_deic, rewards, obses_tp1_deic, dones, weights_deic)
#            debug1, debug2, debug3 = trainWOUpdate(obses_t_deic, actions_deic, rewards, obses_tp1_deic, dones, weights_deic)
#            debug1, debug2, debug3, debug4 = trainWOUpdate(obses_t_deic, actions_deic, rewards, obses_tp1_deic, fingerprintMatch, dones, weights_deic)
            td_errors = train(obses_t_deic, actions_deic, rewards, obses_tp1_deic, fingerprintMatch, dones, weights_deic)

            if prioritized_replay:
                new_priorities = np.abs(td_errors) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

        if t > learning_starts and t % target_network_update_freq == 0:
            
            # Update target network periodically.
            update_target()

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))))
            

    num2avg = 20
    rListAvg = np.convolve(episode_rewards,np.ones(num2avg))/num2avg
    plt.plot(rListAvg)
#    plt.plot(episode_rewards)
    plt.show()

    sess
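
getDeicticObs above extracts every windowLen x windowLen crop of the observation by sliding a window over the first two axes. The sketch below illustrates the same idea on a toy array; note that it uses an inclusive upper bound (h - window + 1), which also covers the final window position that the range(... - windowLen) loop above stops short of:

import numpy as np

def sliding_windows(obs, window):
    """Return every window x window crop of an (H, W, C) observation."""
    h, w = obs.shape[0], obs.shape[1]
    crops = [obs[i:i + window, j:j + window, :]
             for i in range(h - window + 1)
             for j in range(w - window + 1)]
    return np.array(crops)

# toy 5x5 single-channel observation -> nine 3x3 crops
obs = np.arange(25).reshape(5, 5, 1)
print(sliding_windows(obs, 3).shape)  # (9, 3, 3, 1)
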
def main():

    np.set_printoptions(formatter={'float_kind': lambda x: "%.2f" % x})

    env = gym.make("FrozenLake-v0")
    #    env = gym.make("FrozenLake8x8-v0")

    # Dictionary-based value function
    q_func_tabular = {}
    defaultQValue = np.ones(env.action_space.n)

    # Given an integer, return the corresponding boolean array
    def getBoolBits(state):
        return np.unpackbits(np.uint8(state), axis=1) == 1

    # cols of vectorKey must be boolean less than 64 bits long
    def getTabularKeys(vectorKey):
        obsBits = np.packbits(vectorKey, 1)
        obsKeys = 0
        for i in range(np.shape(obsBits)[1]):
            # IMPORTANT: the number of bits in the type cast below (UINT64) must be at least as big
            # as the bits required to encode obsBits. If it is too small, we get hash collisions...
            obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:, i])
        return obsKeys

    def getTabular(vectorKey):
        keys = getTabularKeys(vectorKey)
        return np.array([
            q_func_tabular[x] if x in q_func_tabular else defaultQValue
            for x in keys
        ])

#    def trainTabular(vectorKey,qCurrTargets,weights):

    def trainTabular(vectorKey, qCurrTargets):
        keys = getTabularKeys(vectorKey)
        alpha = 0.1
        for i in range(len(keys)):
            if keys[i] in q_func_tabular:
                q_func_tabular[keys[i]] = (1 - alpha) * q_func_tabular[
                    keys[i]] + alpha * qCurrTargets[i]


#                q_func_tabular[keys[i]] = q_func_tabular[keys[i]] + alpha*weights[i,:]*(qCurrTargets[i] - q_func_tabular[keys[i]]) # (1-alpha)*q_func[keys[i]] + alpha*qCurrTargets[i]
            else:
                q_func_tabular[keys[i]] = qCurrTargets[i]

    max_timesteps = 200000
    exploration_fraction = 0.3
    exploration_final_eps = 0.02
    print_freq = 1
    gamma = .98
    num_cpu = 16

    # Used by buffering and DQN
    learning_starts = 10
    buffer_size = 100
    batch_size = 10
    target_network_update_freq = 1
    train_freq = 1
    print_freq = 1
    lr = 0.0003

    valueFunctionType = "TABULAR"
    #    valueFunctionType = "DQN"

    episode_rewards = [0.0]

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Set up replay buffer
    prioritized_replay = True
    #    prioritized_replay=False
    prioritized_replay_alpha = 0.6
    prioritized_replay_beta0 = 0.4
    prioritized_replay_beta_iters = None
    prioritized_replay_eps = 1e-6
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    state = env.reset()

    episode_rewards = [0.0]
    timerStart = time.time()
    for t in range(max_timesteps):

        #        np.unpackbits(np.uint8(np.reshape(states_tp1,[batch_size,1])),axis=1)
        qCurr = getTabular(getBoolBits([[state]]))

        qCurrNoise = qCurr + np.random.random(np.shape(
            qCurr)) * 0.01  # add small amount of noise to break ties randomly

        # pick the greedy action, then explore epsilon-greedily
        action = np.argmax(qCurrNoise)
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        nextState, rew, done, _ = env.step(action)

        replay_buffer.add(state, action, rew, nextState, float(done))

        if t > learning_starts and t % train_freq == 0:

            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if prioritized_replay:
                beta = beta_schedule.value(t)
                states_t, actions, rewards, states_tp1, dones, weights, batch_idxes = replay_buffer.sample(
                    batch_size, beta)
            else:
                states_t, actions, rewards, states_tp1, dones = replay_buffer.sample(
                    batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            qNext = getTabular(
                getBoolBits(np.reshape(states_tp1, [batch_size, 1])))

            qNextmax = np.max(qNext, axis=1)
            targets = rewards + (1 - dones) * gamma * qNextmax

            qCurrTarget = getTabular(
                getBoolBits(np.reshape(states_t, [batch_size, 1])))

            td_error = qCurrTarget[range(batch_size), actions] - targets
            qCurrTarget[range(batch_size), actions] = targets

            trainTabular(getBoolBits(np.reshape(states_t, [batch_size, 1])),
                         qCurrTarget)

            if prioritized_replay:
                new_priorities = np.abs(td_error) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            nextState = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            timerStart = timerFinal

        state = np.copy(nextState)
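
The tabular value function above turns each boolean observation row into a single integer dictionary key by packing the bits into bytes and accumulating them in radix 256. A stand-alone sketch of that keying scheme (NumPy only; the 8-bit toy states are illustrative):

import numpy as np

def bool_rows_to_keys(bool_rows):
    """Pack each boolean row into one uint64 key (rows must be shorter than 64 bits)."""
    packed = np.packbits(bool_rows, axis=1)            # (batch, ceil(bits / 8)) uint8
    keys = np.zeros(packed.shape[0], dtype=np.uint64)
    for i in range(packed.shape[1]):
        # same radix-256 accumulation as getTabularKeys above
        keys = keys + np.uint64(256 ** i) * packed[:, i].astype(np.uint64)
    return keys

states = np.array([[3], [3], [7]], dtype=np.uint8)
bits = np.unpackbits(states, axis=1) == 1              # boolean rows, 8 bits each
print(bool_rows_to_keys(bits))                         # identical states -> identical keys
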
class DQNAgent(object):
    def __init__(self, stateShape, actionSpace, numPicks, memorySize, burnin=1000):
        self.numPicks = numPicks
        self.memorySize = memorySize
        self.replayMemory = PrioritizedReplayBuffer(memorySize, 0.6)
        self.stateShape = stateShape
        self.actionSpace = actionSpace

        self.step = 0
        self.sync = 200
        self.burnin = burnin

        self.alpha = 0.001
        self.epsilon = 1
        self.epsilon_decay = 0.5
        self.epsilon_min = 0.01
        self.eps_threshold = 0

        self.gamma = 0.99

        self.trainNetwork = self.createNetwork(
            stateShape, len(actionSpace), self.alpha)
        self.targetNetwork = self.createNetwork(
            stateShape, len(actionSpace), self.alpha)
        self.targetNetwork.set_weights(
            self.trainNetwork.get_weights())

    def createNetwork(self, n_input, n_output, learningRate):
        model = keras.models.Sequential()

        model.add(keras.layers.Dense(
            24, activation='relu', input_shape=n_input))
        model.add(keras.layers.Dense(48, activation='relu'))
        model.add(keras.layers.Dense(n_output, activation='linear'))
        model.compile(
            loss='mse', optimizer=keras.optimizers.Adam(lr=learningRate))
        print(model.summary())
        return model

    def trainDQN(self):
        if len(self.replayMemory) <= self.numPicks or len(self.replayMemory) < self.burnin:
            return 0

        beta = min(1.0, 0.4 + self.step * (1.0 - 0.4) / 300)  # anneal PER beta toward 1.0
        samples = self.replayMemory.sample(self.numPicks, beta)
        #batch = Transition(*zip(*samples))
        currStates, actions, rewards, nextStates, dones, weights, indices = samples

        currStates = np.squeeze(np.array(currStates), 1)
        Q_currents = self.trainNetwork(currStates, training=False).numpy()

        nextStates = np.squeeze(np.array(nextStates), 1)
        Q_futures = self.targetNetwork(nextStates, training=False).numpy().max(axis=1)

        rewards = np.array(rewards).reshape(self.numPicks,).astype(float)
        actions = np.array(actions).reshape(self.numPicks,).astype(int)

        dones = np.array(dones).astype(bool)
        notDones = (~dones).astype(float)
        dones = dones.astype(float)

        Q_currents_cp = deepcopy(Q_currents)
        Q_currents_cp[np.arange(self.numPicks), actions] = rewards * dones + (rewards + Q_futures * self.gamma)*notDones

        loss = tf.multiply(tf.pow(tf.subtract(Q_currents[np.arange(self.numPicks), actions], Q_currents_cp[np.arange(self.numPicks), actions]), 2), weights).numpy()
        prios = loss + 1e-5
        self.replayMemory.update_priorities(indices, prios)

        # fit toward the updated targets rather than the network's own predictions
        loss = self.trainNetwork.train_on_batch(currStates, Q_currents_cp)
        return loss

    def selectAction(self, state):
        self.step += 1

        if self.step % self.sync == 0:
            self.targetNetwork.set_weights(
                self.trainNetwork.get_weights())

        q = -100000
        if np.random.rand(1) < self.epsilon:
            action = np.random.randint(0, len(self.actionSpace))
        else:
            preds = np.squeeze(self.trainNetwork(
                state, training=False).numpy(), axis=0)
            action = np.argmax(preds)
            q = preds[action]
        return action, q

    def addMemory(self, state, action, reward, nextState, done):
        self.replayMemory.add(state, action, reward, nextState, done)

    def save(self):
        save_path = (
            f"./mountain_car_tfngmo_{int(self.step)}.chkpt"
        )
        self.trainNetwork.save(
            save_path
        )
        print(f"MountainNet saved to {save_path} done!")
Example #28
0
def main():

#    env = gym.make("CartPoleRob-v0")
#    env = gym.make("CartPole-v0")
#    env = gym.make("CartPole-v1")
#    env = gym.make("Acrobot-v1")
#    env = gym.make("MountainCarRob-v0")
#    env = gym.make("FrozenLake-v0")
#    env = gym.make("FrozenLake8x8-v0")
    env = gym.make("FrozenLake8x8nohole-v0")
    
#    robShape = (2,)
#    robShape = (3,)
#    robShape = (200,)
#    robShape = (16,)
    robShape = (64,)
    def make_obs_ph(name):
#        return U.BatchInput(env.observation_space.shape, name=name)
        return U.BatchInput(robShape, name=name)

#    # these params are specific to mountaincar
#    def getOneHotObs(obs):
#        obsFraction = (obs[0] + 1.2) / 1.8
#        idx1 = np.int32(np.trunc(obsFraction*100))
#        obsFraction = (obs[1] + 0.07) / 0.14
#        idx2 = np.int32(np.trunc(obsFraction*100))
#        ident = np.identity(100)
#        return np.r_[ident[idx1,:],ident[idx2,:]]

    # these params are specific to frozenlake
    def getOneHotObs(obs):
#        ident = np.identity(16)
        ident = np.identity(64)
        return ident[obs,:]

    model = models.mlp([32])
#    model = models.mlp([64])
#    model = models.mlp([64], layer_norm=True)
#    model = models.mlp([16, 16])

    # parameters
    q_func=model
    lr=1e-3
#    max_timesteps=100000
    max_timesteps=50000
#    max_timesteps=10000
    buffer_size=50000
    exploration_fraction=0.1
#    exploration_fraction=0.3
    exploration_final_eps=0.02
#    exploration_final_eps=0.1
    train_freq=1
    batch_size=32
    print_freq=10
    checkpoint_freq=10000
    learning_starts=1000
    gamma=1.0
    target_network_update_freq=500
#    prioritized_replay=False
    prioritized_replay=True
    prioritized_replay_alpha=0.6
    prioritized_replay_beta0=0.4
    prioritized_replay_beta_iters=None
    prioritized_replay_eps=1e-6
    num_cpu=16

#    # try mountaincar w/ different input dimensions
#    inputDims = [50,2]
    
    sess = U.make_session(num_cpu)
    sess.__enter__()

    act, train, update_target, debug = build_graph.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10
    )

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()


    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    obs = getOneHotObs(obs)
    
#    with tempfile.TemporaryDirectory() as td:
    model_saved = False
#        model_file = os.path.join(td, "model")
    for t in range(max_timesteps):

        # Take action and update exploration to the newest value
        action = act(np.array(obs)[None], update_eps=exploration.value(t))[0]
        new_obs, rew, done, _ = env.step(action)
        new_obs = getOneHotObs(new_obs)
        
        # Store transition in the replay buffer.
        replay_buffer.add(obs, action, rew, new_obs, float(done))
        obs = new_obs
        
        episode_rewards[-1] += rew
        if done:
            obs = env.reset()
            obs = getOneHotObs(obs)
            episode_rewards.append(0.0)

        if t > learning_starts and t % train_freq == 0:
            
            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if prioritized_replay:
                experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
            else:
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None
            
            td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)

            if prioritized_replay:
                new_priorities = np.abs(td_errors) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

        if t > learning_starts and t % target_network_update_freq == 0:
            
            # Update target network periodically.
            update_target()

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
#        if done:
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))))
#            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
#                logger.record_tabular("steps", t)
#                logger.record_tabular("episodes", num_episodes)
#                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
#                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
#                logger.dump_tabular()
#        sess
            

    num2avg = 20
    rListAvg = np.convolve(episode_rewards,np.ones(num2avg))/num2avg
    plt.plot(rListAvg)
#    plt.plot(episode_rewards)
    plt.show()

    sess
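
Both deepq-style examples above anneal epsilon with baselines' LinearSchedule. The class below is a minimal stand-in with the same value(t) interface, under the assumption that the schedule interpolates linearly from initial_p to final_p over schedule_timesteps and then stays constant:

class SimpleLinearSchedule:
    """Linear interpolation from initial_p to final_p over schedule_timesteps."""
    def __init__(self, schedule_timesteps, initial_p=1.0, final_p=0.02):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)

sched = SimpleLinearSchedule(schedule_timesteps=5000, initial_p=1.0, final_p=0.02)
print(sched.value(0), sched.value(2500), sched.value(10000))  # 1.0 0.51 0.02
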
Example #29
0
def learn(env,
          network,
          seed=None,
          lr=5e-5,
          total_timesteps=100000,
          buffer_size=500000,
          exploration_fraction=0.1,
          exploration_final_eps=0.01,
          train_freq=1,
          batch_size=32,
          print_freq=10,
          checkpoint_freq=100000,
          checkpoint_path=None,
          learning_starts=0,
          gamma=0.99,
          target_network_update_freq=10000,
          prioritized_replay=True,
          prioritized_replay_alpha=0.4,
          prioritized_replay_beta0=0.6,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-3,
          param_noise=False,
          callback=None,
          load_path=None,
          load_idx=None,
          demo_path=None,
          n_step=10,
          demo_prioritized_replay_eps=1.0,
          pre_train_timesteps=750000,
          epsilon_schedule="constant",
          **network_kwargs):
    # Create all the functions necessary to train the model
    set_global_seeds(seed)
    q_func = build_q_func(network, **network_kwargs)

    with tf.device('/GPU:0'):
        model = DQfD(q_func=q_func,
                     observation_shape=env.observation_space.shape,
                     num_actions=env.action_space.n,
                     lr=lr,
                     grad_norm_clipping=10,
                     gamma=gamma,
                     param_noise=param_noise)

    # Load model from checkpoint
    if load_path is not None:
        load_path = osp.expanduser(load_path)
        ckpt = tf.train.Checkpoint(model=model)
        manager = tf.train.CheckpointManager(ckpt, load_path, max_to_keep=None)
        if load_idx is None:
            ckpt.restore(manager.latest_checkpoint)
            print("Restoring from {}".format(manager.latest_checkpoint))
        else:
            ckpt.restore(manager.checkpoints[load_idx])
            print("Restoring from {}".format(manager.checkpoints[load_idx]))

    # Setup demo trajectory
    assert demo_path is not None
    with open(demo_path, "rb") as f:
        trajectories = pickle.load(f)

    # Create the replay buffer
    replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                            prioritized_replay_alpha)
    if prioritized_replay_beta_iters is None:
        prioritized_replay_beta_iters = total_timesteps
    beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                   initial_p=prioritized_replay_beta0,
                                   final_p=1.0)
    temp_buffer = deque(maxlen=n_step)
    is_demo = True
    for epi in trajectories:
        for obs, action, rew, new_obs, done in epi:
            obs, new_obs = np.expand_dims(
                np.array(obs), axis=0), np.expand_dims(np.array(new_obs),
                                                       axis=0)
            if n_step:
                temp_buffer.append((obs, action, rew, new_obs, done, is_demo))
                if len(temp_buffer) == n_step:
                    n_step_sample = get_n_step_sample(temp_buffer, gamma)
                    replay_buffer.demo_len += 1
                    replay_buffer.add(*n_step_sample)
            else:
                replay_buffer.demo_len += 1
                replay_buffer.add(obs[0], action, rew, new_obs[0], float(done),
                                  float(is_demo))
    logger.log("trajectory length:", replay_buffer.demo_len)
    # Create the schedule for exploration
    if epsilon_schedule == "constant":
        exploration = ConstantSchedule(exploration_final_eps)
    else:  # not used
        exploration = LinearSchedule(schedule_timesteps=int(
            exploration_fraction * total_timesteps),
                                     initial_p=1.0,
                                     final_p=exploration_final_eps)

    model.update_target()

    # ============================================== pre-training ======================================================
    start = time()
    num_episodes = 0
    temp_buffer = deque(maxlen=n_step)
    for t in tqdm(range(pre_train_timesteps)):
        # sample and train
        experience = replay_buffer.sample(batch_size,
                                          beta=prioritized_replay_beta0)
        batch_idxes = experience[-1]
        if experience[6] is None:  # for n_step = 0
            obses_t, actions, rewards, obses_tp1, dones, is_demos = tuple(
                map(tf.constant, experience[:6]))
            obses_tpn, rewards_n, dones_n = None, None, None
            weights = tf.constant(experience[-2])
        else:
            obses_t, actions, rewards, obses_tp1, dones, is_demos, obses_tpn, rewards_n, dones_n, weights = tuple(
                map(tf.constant, experience[:-1]))
        td_errors, n_td_errors, loss_dq, loss_n, loss_E, loss_l2, weighted_error = model.train(
            obses_t, actions, rewards, obses_tp1, dones, is_demos, weights,
            obses_tpn, rewards_n, dones_n)

        # Update priorities
        new_priorities = np.abs(td_errors) + np.abs(
            n_td_errors) + demo_prioritized_replay_eps
        replay_buffer.update_priorities(batch_idxes, new_priorities)

        # Update target network periodically
        if t > 0 and t % target_network_update_freq == 0:
            model.update_target()

        # Logging
        elapsed_time = timedelta(seconds=time() - start)
        if print_freq is not None and t % 10000 == 0:
            logger.record_tabular("steps", t)
            logger.record_tabular("episodes", num_episodes)
            logger.record_tabular("mean 100 episode reward", 0)
            logger.record_tabular("max 100 episode reward", 0)
            logger.record_tabular("min 100 episode reward", 0)
            logger.record_tabular("demo sample rate", 1)
            logger.record_tabular("epsilon", 0)
            logger.record_tabular("loss_td", np.mean(loss_dq.numpy()))
            logger.record_tabular("loss_n_td", np.mean(loss_n.numpy()))
            logger.record_tabular("loss_margin", np.mean(loss_E.numpy()))
            logger.record_tabular("loss_l2", np.mean(loss_l2.numpy()))
            logger.record_tabular("losses_all", weighted_error.numpy())
            logger.record_tabular("% time spent exploring",
                                  int(100 * exploration.value(t)))
            logger.record_tabular("pre_train", True)
            logger.record_tabular("elapsed time", elapsed_time)
            logger.dump_tabular()

    # ============================================== exploring =========================================================
    sample_counts = 0
    demo_used_counts = 0
    episode_rewards = deque(maxlen=100)
    this_episode_reward = 0.
    best_score = 0.
    saved_mean_reward = None
    is_demo = False
    obs = env.reset()
    # Always mimic the vectorized env
    obs = np.expand_dims(np.array(obs), axis=0)
    reset = True
    for t in tqdm(range(total_timesteps)):
        if callback is not None:
            if callback(locals(), globals()):
                break
        kwargs = {}
        if not param_noise:
            update_eps = tf.constant(exploration.value(t))
            update_param_noise_threshold = 0.
        else:  # not used
            update_eps = tf.constant(0.)
            update_param_noise_threshold = -np.log(1. - exploration.value(t) +
                                                   exploration.value(t) /
                                                   float(env.action_space.n))
            kwargs['reset'] = reset
            kwargs[
                'update_param_noise_threshold'] = update_param_noise_threshold
            kwargs['update_param_noise_scale'] = True
        action, epsilon, _, _ = model.step(tf.constant(obs),
                                           update_eps=update_eps,
                                           **kwargs)
        action = action[0].numpy()
        reset = False
        new_obs, rew, done, _ = env.step(action)

        # Store transition in the replay buffer.
        new_obs = np.expand_dims(np.array(new_obs), axis=0)
        if n_step:
            temp_buffer.append((obs, action, rew, new_obs, done, is_demo))
            if len(temp_buffer) == n_step:
                n_step_sample = get_n_step_sample(temp_buffer, gamma)
                replay_buffer.add(*n_step_sample)
        else:
            replay_buffer.add(obs[0], action, rew, new_obs[0], float(done), 0.)
        obs = new_obs

        # invert log scaled score for logging
        this_episode_reward += np.sign(rew) * (np.exp(np.sign(rew) * rew) - 1.)
        if done:
            num_episodes += 1
            obs = env.reset()
            obs = np.expand_dims(np.array(obs), axis=0)
            episode_rewards.append(this_episode_reward)
            reset = True
            if this_episode_reward > best_score:
                best_score = this_episode_reward
                ckpt = tf.train.Checkpoint(model=model)
                manager = tf.train.CheckpointManager(ckpt,
                                                     './best_model',
                                                     max_to_keep=1)
                manager.save(t)
                logger.log("saved best model")
            this_episode_reward = 0.0

        if t % train_freq == 0:
            experience = replay_buffer.sample(batch_size,
                                              beta=beta_schedule.value(t))
            batch_idxes = experience[-1]
            if experience[6] is None:  # for n_step = 0
                obses_t, actions, rewards, obses_tp1, dones, is_demos = tuple(
                    map(tf.constant, experience[:6]))
                obses_tpn, rewards_n, dones_n = None, None, None
                weights = tf.constant(experience[-2])
            else:
                obses_t, actions, rewards, obses_tp1, dones, is_demos, obses_tpn, rewards_n, dones_n, weights = tuple(
                    map(tf.constant, experience[:-1]))
            td_errors, n_td_errors, loss_dq, loss_n, loss_E, loss_l2, weighted_error = model.train(
                obses_t, actions, rewards, obses_tp1, dones, is_demos, weights,
                obses_tpn, rewards_n, dones_n)
            new_priorities = np.abs(td_errors) + np.abs(
                n_td_errors
            ) + demo_prioritized_replay_eps * is_demos + prioritized_replay_eps * (
                1. - is_demos)
            replay_buffer.update_priorities(batch_idxes, new_priorities)

            # for logging
            sample_counts += batch_size
            demo_used_counts += np.sum(is_demos)

        if t % target_network_update_freq == 0:
            # Update target network periodically.
            model.update_target()

        if t % checkpoint_freq == 0:
            save_path = checkpoint_path
            ckpt = tf.train.Checkpoint(model=model)
            manager = tf.train.CheckpointManager(ckpt,
                                                 save_path,
                                                 max_to_keep=10)
            manager.save(t)
            logger.log("saved checkpoint")

        elapsed_time = timedelta(seconds=time() - start)
        if done and num_episodes > 0 and num_episodes % print_freq == 0:
            logger.record_tabular("steps", t)
            logger.record_tabular("episodes", num_episodes)
            logger.record_tabular("mean 100 episode reward",
                                  np.mean(episode_rewards))
            logger.record_tabular("max 100 episode reward",
                                  np.max(episode_rewards))
            logger.record_tabular("min 100 episode reward",
                                  np.min(episode_rewards))
            logger.record_tabular("demo sample rate",
                                  demo_used_counts / sample_counts)
            logger.record_tabular("epsilon", epsilon.numpy())
            logger.record_tabular("loss_td", np.mean(loss_dq.numpy()))
            logger.record_tabular("loss_n_td", np.mean(loss_n.numpy()))
            logger.record_tabular("loss_margin", np.mean(loss_E.numpy()))
            logger.record_tabular("loss_l2", np.mean(loss_l2.numpy()))
            logger.record_tabular("losses_all", weighted_error.numpy())
            logger.record_tabular("% time spent exploring",
                                  int(100 * exploration.value(t)))
            logger.record_tabular("pre_train", False)
            logger.record_tabular("elapsed time", elapsed_time)
            logger.dump_tabular()

    return model
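

# NOTE (illustrative sketch, not the repository's implementation): the n-step
# branch above calls a helper named get_n_step_sample that is defined elsewhere
# in this codebase. Assuming it collapses the buffered
# (obs, action, rew, new_obs, done, is_demo) tuples into the 9-tuple that
# replay_buffer.add(*n_step_sample) expects here, it could look roughly like
# the hypothetical version below (the name is suffixed to make clear this is
# not the actual helper).
def get_n_step_sample_sketch(buffer, gamma):
    obs_t, action, reward, obs_tp1, done, is_demo = buffer[0]
    n_step_return, done_n, last = 0., 0., 0
    for i, (_, _, rew_i, _, done_i, _) in enumerate(buffer):
        n_step_return += (gamma ** i) * rew_i
        last = i
        if done_i:
            done_n = 1.
            break
    obs_tpn = buffer[last][3]  # observation reached after the last step used
    return (obs_t[0], action, reward, obs_tp1[0], float(done), is_demo,
            obs_tpn[0], n_step_return, done_n)
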
def dist_learn(env,
               q_dist_func,
               num_atoms=51,
               V_max=10,
               lr=25e-5,
               max_timesteps=100000,
               buffer_size=50000,
               exploration_fraction=0.01,
               exploration_final_eps=0.008,
               train_freq=1,
               batch_size=32,
               print_freq=1,
               checkpoint_freq=2000,
               learning_starts=1000,
               gamma=1.0,
               target_network_update_freq=500,
               prioritized_replay=False,
               prioritized_replay_alpha=0.6,
               prioritized_replay_beta0=0.4,
               prioritized_replay_beta_iters=None,
               prioritized_replay_eps=1e-6,
               num_cpu=1,
               callback=None):
    """Train a deepq model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    q_dist_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns the tensor that parameterizes the per-action value distributions.
    num_atoms: int
        number of atoms in the discretized support of the value distribution
    V_max: float
        upper bound of the support of the value distribution
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version at
        the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True, a prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from its initial
        value to 1.0. If set to None, it defaults to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    num_cpu: int
        number of cpus to use for training
    callback: (locals, globals) -> None
        function called at every step with the state of the algorithm.
        If the callback returns True, training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
    """
    # Create all the functions necessary to train the model

    sess = U.single_threaded_session()
    sess.__enter__()

    def make_obs_ph(name):
        print(name)
        return U.BatchInput(env.observation_space.shape, name=name)

    act, train, update_target, debug = build_dist_train(
        make_obs_ph=make_obs_ph,
        dist_func=q_dist_func,
        num_actions=env.action_space.n,
        num_atoms=num_atoms,
        V_max=V_max,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10)

    # act, train, update_target, debug = build_train(
    #     make_obs_ph=make_obs_ph,
    #     q_func=q_func,
    #     num_actions=env.action_space.n,
    #     optimizer=tf.train.AdamOptimizer(learning_rate=lr),
    #     gamma=gamma,
    #     grad_norm_clipping=10
    # )
    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_dist_func': q_dist_func,
        'num_actions': env.action_space.n,
    }
    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join(td, "model")
        print(model_file)
        # mkdir_p(os.path.dirname(model_file))
        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            action = act(np.array(obs)[None],
                         update_eps=exploration.value(t))[0]
            new_obs, rew, done, _ = env.step(action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(
                        batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_idxes) = experience
                else:
                    # print "CCCC"
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                        batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                # print "Come1"
                # print np.shape(obses_t), np.shape(actions), np.shape(rewards), np.shape(obses_tp1), np.shape(dones)
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones,
                                  weights)
                # print "Loss : {}".format(td_errors)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes,
                                                    new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(
                    episode_rewards) % print_freq == 0:
                print "steps : {}".format(t)
                print "episodes : {}".format(num_episodes)
                print "mean 100 episode reward: {}".format(mean_100ep_reward)
                # print "mean 100 episode reward".format(mean_100ep_reward)
                # logger.record_tabular("episodes", num_episodes)
                # logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                # logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                # logger.dump_tabular()
                # logger.record_tabular("steps", t)
                # logger.record_tabular("episodes", num_episodes)
                # logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                # logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                # logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts
                    and t % checkpoint_freq == 0):
                print "=========================="
                print "Error: {}".format(td_errors)
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        print "Saving model due to mean reward increase: {} -> {}".format(
                            saved_mean_reward, mean_100ep_reward)
                        # logger.log("Saving model due to mean reward increase: {} -> {}".format(
                        #            saved_mean_reward, mean_100ep_reward))
                    U.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                print "Restored model with mean reward: {}".format(
                    saved_mean_reward)
                # logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            U.load_state(model_file)

    return ActWrapper(act, act_params)
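

# Illustrative only: dist_learn trains a categorical (C51-style) distributional
# DQN through build_dist_train. The standalone NumPy sketch below shows the
# distribution-projection step that such a trainer typically performs when
# forming the target: each atom of the fixed support is shifted by the Bellman
# update and its probability mass is split between the two neighbouring atoms.
# This is not the repository's implementation; the support is assumed to span
# [-V_max, V_max] with num_atoms evenly spaced atoms.
import numpy as np


def project_value_distribution_sketch(next_probs, rewards, dones, gamma,
                                      num_atoms=51, V_max=10.):
    """next_probs: (batch, num_atoms) probabilities of the greedy next action."""
    V_min = -V_max
    delta_z = (V_max - V_min) / (num_atoms - 1)
    z = np.linspace(V_min, V_max, num_atoms)           # fixed support
    m = np.zeros((next_probs.shape[0], num_atoms))     # projected target
    # Bellman-update every atom and clip it back onto the support.
    Tz = np.clip(rewards[:, None] + gamma * (1. - dones[:, None]) * z[None, :],
                 V_min, V_max)
    b = (Tz - V_min) / delta_z                         # fractional atom index
    lower = np.floor(b).astype(int)
    upper = np.ceil(b).astype(int)
    for i in range(m.shape[0]):
        for j in range(num_atoms):
            if lower[i, j] == upper[i, j]:
                m[i, lower[i, j]] += next_probs[i, j]
            else:
                m[i, lower[i, j]] += next_probs[i, j] * (upper[i, j] - b[i, j])
                m[i, upper[i, j]] += next_probs[i, j] * (b[i, j] - lower[i, j])
    return m  # target; train with cross-entropy against the predicted probs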