Code Example #1
File: dqn_learn.py  Project: jiangtaibai/pytorch_dqn
def main():
    policy_net = DQN(U_num, num_actions).to(device)  # initialize the Q network
    policy_net.apply(init_weights)
    if pretrained:
        ckp = torch.load('/data2/jiangjigang/ckp/dqn.pth')
        policy_net.load_state_dict(
            {k.replace('module.', ''): v
             for k, v in ckp.items()})
    target_net = DQN(U_num, num_actions).to(device)  # initialize the target Q network
    target_net.load_state_dict(policy_net.state_dict())  # copy the Q network's parameters into the target network
    target_net.eval()
    optimizer_policy = torch.optim.Adam(policy_net.parameters(),
                                        lr=learning_rate)  # Adam optimizer; can be replaced with another optimizer
    buffer = ReplayBuffer(buffer_size)  # replay buffer (see Buffer.py): stores transitions that are later sampled at random to update the network
    criterion = torch.nn.MSELoss(reduction='sum')

    # training
    for i_episode in range(num_episodes):

        state0 = [user_loc, user_dis, node_loc, use_buff]  # get an initial state
        error = 0.0
        all_reward = 0
        for t in count():
            # select an action with the epsilon-greedy policy
            action = e_greedy_select_action(state0, policy_net)
            a = np.array([action.data.cpu().numpy()])
            #print("action selected by e_greedy is {}".format(action))
            # transition function: get the next state and whether it is terminal, given the current state and action
            state1, done, flag = transition_function(state0, action)
            # reward function: compute the reward for this transition
            reward, cost_migration = reward_function(state0, action, state1,
                                                     flag)
            all_reward = all_reward + reward
            # store the transition in the replay buffer
            buffer.add(state0, a, reward, state1, done)

            # exit an episode after MAX_T steps
            if t > MAX_T:
                break

            # only update the network once several episodes have been collected, so the replay buffer holds enough data and training stays stable
            if i_episode > 1:

                # sample a training batch from the buffer; the batch size is set by BATCH_SIZE
                batch = buffer.getBatch(BATCH_SIZE)

                policy_net, target_net, bellman_error = optimize_model(
                    batch, policy_net, target_net, optimizer_policy, criterion)
                error = error + bellman_error.data.cpu().numpy()
            # move to the next state
            state0 = state1
        ave_error = error / (t * 1.00)
        ave_reward = all_reward / (t * 1.00)
        print(ave_error, ave_reward)
    torch.save(policy_net.state_dict(), '/data2/jiangjigang/ckp/dqn.pth')
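Example #1 depends on helpers defined elsewhere in the project (e_greedy_select_action, transition_function, reward_function, optimize_model) and on module-level globals such as device and num_actions. For orientation only, an epsilon-greedy action selector in the spirit of the one used above might look like the sketch below; the state flattening and the fixed EPSILON value are assumptions, not the project's actual code.

import random
import numpy as np
import torch

EPSILON = 0.1  # assumed fixed exploration rate; the real project may anneal it

def e_greedy_select_action(state, policy_net):
    # Hypothetical sketch: flatten the [user_loc, user_dis, node_loc, use_buff]
    # state list into a single float tensor for the Q network.
    state_vec = torch.tensor(np.concatenate([np.ravel(s) for s in state]),
                             dtype=torch.float32, device=device).unsqueeze(0)
    if random.random() < EPSILON:
        # explore: pick a random action index
        return torch.tensor(random.randrange(num_actions), device=device)
    with torch.no_grad():
        # exploit: pick the action with the highest predicted Q value
        return policy_net(state_vec).argmax(dim=1).squeeze()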
Code Example #2
    def _create_model(self):
        """
            Create a deep Q model for function approximation with Adam optimizer.

            :return: tgt_net, optimizer
            """
        tgt_net = DQN(self.env.observation_space.shape[0],
                      self.nactions**2).to(self.device)
        if self.load_from is not None:
            assert type(
                self.load_from
            ) == str, "Name of model to be loaded has to be a string!"
            self.net.load_state_dict(torch.load(self.load_from))
            tgt_net.load_state_dict(torch.load(self.load_from))
        optimizer = optim.Adam(self.net.parameters(), lr=self.learning_rate)
        return tgt_net, optimizer
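Examples #2 and #4 instantiate DQN(self.env.observation_space.shape[0], self.nactions**2), i.e. a Q network over a flat observation vector, but the network class itself is not part of the snippet. The minimal fully connected version below is an assumption about what such a class could look like, not the project's actual architecture.

import torch.nn as nn

class DQN(nn.Module):
    # Hypothetical fully connected Q network: maps a flat observation vector
    # to one Q value per discretized action.
    def __init__(self, obs_dim, n_actions, hidden=128):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_actions),
        )

    def forward(self, x):
        return self.net(x)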
Code Example #3
File: agent_dqn.py  Project: aditthapron/RL_project
class Agent_DQN(Agent):
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example: 
            parameters for neural network
            initialize Q net and target Q net
            parameters for replay buffer
            parameters for q-learning; decaying epsilon-greedy
            ...
        """

        super(Agent_DQN, self).__init__(env)
        self.action = env.get_action_space()

        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        print('Using device:', self.device)
        self.model = DQN().to(self.device)
        self.model_target = DQN().to(self.device)
        self.episode = 100000
        self.max_steps_per_episode = 14000
        self.update_target_network = 10000
        self.epsilon = 1.0
        self.min_epsilon = 0.1
        self.step_epsilon = (self.epsilon - self.min_epsilon) / (1E6)
        self.env = env
        self.history = []
        self.buffer_size = min(args.history_size // 5, 2000)
        self.history_size = args.history_size
        self.learning_rate = 1e-4
        self.name = args.name
        self.batch_size = 32
        self.gamma = 0.99
        self.priority = []
        self.w = 144
        self.h = 256
        self.mode = args.mode
        self.delay = args.delay
        self.epoch = args.continue_epoch
        if args.test_dqn or self.epoch > 0:
            #you can load your model here
            print('loading trained model')
            ###########################
            self.model.load_state_dict(
                torch.load(self.name + '.pth', map_location=self.device))
            self.model_target.load_state_dict(
                torch.load(self.name + '.pth', map_location=self.device))
            # YOUR IMPLEMENTATION HERE #

    def init_game_setting(self):
        """
        Testing function will call this function at the beginning of a new game
        Put anything you want to initialize if necessary.
        If no parameters need to be initialized, you can leave it as blank.
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #

        ###########################
        pass

    def make_action(self, observation, test=True):
        """
        Return predicted action of your agent
        Input:
            observation: np.array
                stack 4 last preprocessed frames, shape: (84, 84, 4)
        Return:
            action: int
                the predicted action from trained model
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.model.eval()
        with torch.no_grad():
            if not test:
                if np.random.random() < self.epsilon or len(
                        self.history) < self.buffer_size:
                    action = int(np.random.choice([0, 1], 1)[0])
                else:
                    obs = torch.from_numpy(observation).to(self.device).float()
                    action_prob = self.model(obs.view(1, 12, self.h, self.w))
                    action = torch.argmax(action_prob).detach().item()
                return action

            else:
                observation = np.swapaxes(observation, 0, 2) / 255.
                obs = torch.from_numpy(observation).to(self.device).float()
                action_prob = self.model(obs.view(1, 12, self.h, self.w))
                action = torch.argmax(action_prob).detach().item()

                return self.action[action]
        ###########################

    def push(self, state, action, reward, done, state_next, smooth=None):
        """ You can add additional arguments as you need. 
        Push new data to buffer and remove the old one if the buffer is full.
        
        Hints:
        -----
            you can consider deque(maxlen = 10000) list
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.history.append(
            np.array([state, action, reward, done, state_next, smooth]))

        if len(self.history) > self.history_size:
            self.history.pop(0)

        ###########################

    def replay_buffer(self, refresh=False):
        """ You can add additional arguments as you need.
        Select batch from buffer.
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        if 'prioritized' in self.mode.split('_'):
            if refresh:
                self.priority = np.zeros(len(self.history))
                for i in range(len(self.history)):
                    max_reward, _ = torch.max(self.model_target(
                        torch.from_numpy(self.history[i][4]).to(
                            self.device).float().view(1, 12, self.h, self.w)),
                                              axis=1)
                    max_reward = max_reward.detach().item()
                    Q = self.model(
                        torch.from_numpy(
                            self.history[i][0]).to(self.device).float().view(
                                1, 12, self.h,
                                self.w))[0,
                                         self.history[i][1]].detach().item()
                    self.priority[i] = abs(
                        (self.history[i][2] + self.gamma * max_reward - Q))
                self.priority = self.priority / sum(self.priority)
                return 0
            priority = np.zeros(len(self.history))
            priority[:len(self.priority)] = self.priority
            if sum(priority) == 0:
                indices = np.random.choice(range(len(self.history)),
                                           size=self.batch_size)
            else:
                indices = np.random.choice(range(len(self.history)),
                                           size=self.batch_size,
                                           p=priority)

            ###########################
            return indices
        else:
            return np.random.choice(range(len(self.history)),
                                    size=self.batch_size)

    def train(self):
        """
        Implement your training algorithm here
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #

        episode_reward_history = []
        best_reward = -10
        optimizer = torch.optim.Adam(self.model.parameters(),
                                     lr=self.learning_rate)
        # optimizer = torch.optim.RMSprop(self.model.parameters(), lr=self.learning_rate,momentum=0.5)
        loss_fn = torch.nn.SmoothL1Loss()
        frame_count = 0
        if self.epoch > 0:
            f = open(self.name + '.txt', "a")
        else:
            f = open(self.name + '.txt', "w")
        done = False
        for ep in range(self.epoch, self.episode):
            state = self.env.reset()
            state = np.swapaxes(state, 0, 2) / 255.
            episode_reward = 0
            pre_action = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
            smooth = 0
            for timestep in range(0, self.max_steps_per_episode):
                frame_count += 1
                action = self.make_action(state, test=False)
                if done:
                    action = 1

                # Decay
                self.epsilon -= self.step_epsilon
                self.epsilon = max(self.epsilon, self.min_epsilon)

                # next frame
                state_next, reward, done, _ = self.env.step(
                    self.action[action])
                state_next = np.swapaxes(state_next, 0, 2) / 255.
                episode_reward += reward
                # print(reward)
                #normalize reward
                # reward = np.sign(reward)
                # Save actions and states in replay buffer
                if 'smooth1' in self.mode.split('_'):
                    pre_action.pop(0)
                    pre_action.append(action)
                    smooth = float(np.mean(pre_action) - 0.5)

                self.push(state, action, reward, done, state_next, smooth)

                # Move to the next state only after the transition has been stored
                state = state_next

                if frame_count % 8 == 0 and len(
                        self.history) >= self.buffer_size:
                    if frame_count % (self.history_size // 10) == 0 and 'prioritized' in self.mode.split(
                            '_'):
                        #update priority vector
                        self.replay_buffer(refresh=True)
                    indice = self.replay_buffer()
                    self.model.train()
                    # data_batch = torch.from_numpy(np.array(self.history)[indice]).to(self.device).float()
                    state_sample = torch.from_numpy(
                        np.array([self.history[i][0]
                                  for i in indice])).to(self.device).float()
                    action_sample = torch.from_numpy(
                        np.array([self.history[i][1]
                                  for i in indice])).to(self.device).float()
                    rewards_sample = torch.from_numpy(
                        np.array([self.history[i][2]
                                  for i in indice])).to(self.device).float()
                    done_sample = torch.from_numpy(
                        np.array([self.history[i][3]
                                  for i in indice])).to(self.device).float()
                    next_state_sample = torch.from_numpy(
                        np.array([self.history[i][4]
                                  for i in indice])).to(self.device).float()
                    smooth_sample = torch.from_numpy(
                        np.array([self.history[i][5]
                                  for i in indice])).to(self.device).float()
                    future_rewards = self.model_target(next_state_sample)

                    max_reward, _ = torch.max(future_rewards, axis=1)
                    updated_q_values = rewards_sample + self.gamma * max_reward
                    updated_q_values = updated_q_values * (
                        1 - done_sample) - done_sample
                    mask = F.one_hot(action_sample.long(),
                                     2).to(self.device).float()

                    q_values = self.model(state_sample)
                    q_action = torch.sum(q_values * mask, axis=1)
                    loss = loss_fn(q_action, updated_q_values)

                    if 'smooth1' in self.mode.split('_') and self.delay < ep:
                        penalty = torch.abs((ep - self.delay) / self.episode *
                                            torch.sum(smooth_sample))
                        loss += penalty

                    optimizer.zero_grad()
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                    optimizer.step()

                if frame_count % self.update_target_network == 0:
                    self.model_target.load_state_dict(self.model.state_dict())

                if done:
                    break
            episode_reward_history.append(episode_reward)
            if len(episode_reward_history) > 30:
                del episode_reward_history[:1]
            running_reward = np.mean(episode_reward_history)
            #             if ep%500==0:
            #                 print("Episode:\t{},\t Avereged reward: {:.2f}\n".format(ep,running_reward))
            f.write("Episode:\t{},\t Avereged reward: {:.2f}\n".format(
                ep, running_reward))
            if running_reward > best_reward:
                best_reward = running_reward
                torch.save(self.model.state_dict(), self.name + '.pth')
        f.close()
Code Example #4
class Simulation:
    """
    Simulation for the game of 3D Pong.

    Parameters
    ----------
    params: dict
            Dictionary of all the simulation parameters
    """
    def __init__(self, params, player_n=0):
        # unpack the parameters:
        #### simulation
        self.device = params["device"]
        self.env_name = params["env_name"]
        self.training_frames = params["training_frames"]
        self.skip_frames = params["skip_frames"]
        self.nactions = params["nactions"]
        self.messages_enabled = params["messages_enabled"]
        self.selfplay = params["selfplay"]
        #### qnet model
        self.learning_rate = params["learning_rate"]
        self.sync = params["sync"]
        self.load_from = params["load_from"]
        #### buffer
        self.batch_size = params["batch_size"]
        self.replay_size = params["replay_size"]
        self.nstep = params["nstep"]
        #### agent model
        self.gamma = params["gamma"]
        self.eps_start = params["eps_start"]
        self.eps_end = params["eps_end"]
        self.eps_decay_rate = params["eps_decay_rate"]
        self.player_n = player_n
        self.double = params["double"]
        # initialize the simulation with shared properties
        self.env = gym.make(
            self.env_name
        )  # environment, agent etc. can't be created jointly in a server simulation
        self.net = DQN(self.env.observation_space.shape[0],
                       self.nactions**2).to(self.device)

    def _create_environment(self):
        """
            create a gym environment for the simulation.

            Actions are discretized into nactions and frames are skipped for faster training
            :return: env
            """
        env = gym.make(self.env_name)
        if self.selfplay:
            env.unwrapped.multiplayer(env,
                                      game_server_guid="selfplayer",
                                      player_n=self.player_n)
        env = wrappers.action_space_discretizer(env, n=self.nactions)
        env = wrappers.SkipEnv(env, skip=self.skip_frames)
        return env

    def _create_agent(self, env):
        """
            Create agent with buffer for the simulation.

            :return: agent
            """
        # buffer = ExperienceBuffer(self.replay_size)
        buffer = Extendedbuffer(self.replay_size,
                                nstep=self.nstep,
                                gamma=self.gamma)
        agent = pongagent.Pongagent(env, self.player_n, buffer)
        return agent

    def _create_model(self):
        """
            Create a deep Q model for function approximation with Adam optimizer.

            :return: tgt_net, optimizer
            """
        tgt_net = DQN(self.env.observation_space.shape[0],
                      self.nactions**2).to(self.device)
        if self.load_from is not None:
            assert type(
                self.load_from
            ) == str, "Name of model to be loaded has to be a string!"
            self.net.load_state_dict(torch.load(self.load_from))
            tgt_net.load_state_dict(torch.load(self.load_from))
        optimizer = optim.Adam(self.net.parameters(), lr=self.learning_rate)
        return tgt_net, optimizer

    def _init_non_shared(self, player_n):
        env = self._create_environment()
        tgt_net, optimizer = self._create_model()
        agent = self._create_agent(env)
        writer = SummaryWriter(
            comment="-" + "player" + str(player_n) + "batch" +
            str(self.batch_size) + "_n" + str(env.action_space.n) + "_eps" +
            str(self.eps_decay_rate) + "_skip" + str(self.skip_frames) +
            "learning_rate" + str(self.learning_rate))
        return env, agent, tgt_net, optimizer, writer

    def _fill_buffer(self, agent):
        if self.messages_enabled:
            print("Player populating Buffer ...")
        agent.exp_buffer.fill(agent.env, self.replay_size, self.nstep)
        if self.messages_enabled:
            print("Buffer_populated!")

    def train(self, net, player_n=0):
        self.net = net
        env, agent, tgt_net, optimizer, writer = self._init_non_shared(
            player_n)
        self._fill_buffer(agent)
        if self.messages_enabled:
            print("Player %i start training: " % player_n)
        reward = []
        for frame in range(self.training_frames):
            epsilon = max(self.eps_end,
                          self.eps_start - frame / self.eps_decay_rate)
            ep_reward = agent.play_step(net, epsilon, self.device)
            if ep_reward:
                reward.append(ep_reward)
                writer.add_scalar("episode_reward", ep_reward, frame)
                writer.add_scalar("mean100_reward", np.mean(reward[-100:]),
                                  frame)
            if (frame % self.sync) == 0:
                tgt_net.load_state_dict(
                    net.state_dict())  # Syncs target and Standard net
                if self.messages_enabled:
                    print("We are at: %7i / %7i frames" %
                          (frame, self.training_frames))
                if player_n == 0:
                    torch.save(net.state_dict(),
                               self.env_name + "-time_update.dat")

            optimizer.zero_grad()
            batch = agent.exp_buffer.sample(self.batch_size)
            loss_t = calc_loss(batch, net, tgt_net, self.gamma**self.nstep,
                               self.double, self.device)
            loss_t.backward()
            optimizer.step()

            writer.add_scalar("loss", loss_t, frame)
            writer.add_scalar("epsilon", epsilon, frame)

        writer.close()
        if self.messages_enabled:
            print("Player %i end training!" % player_n)
        torch.save(net.state_dict(), self.env_name + "end_of_training.dat")

        return np.mean(reward[-len(reward) // 2:])

    # TODO: clean this function!
    def run(self, mode="play"):
        """
        runs the simulation.
        :param mode: str, either "play" or "train"
        :return: mean reward over all episodes with eps_end
        """
        if mode == "train":
            reward = self.train(self.net)
            return reward
        elif mode == "play":
            # Run play.py to see model in action
            pass

        else:
            raise Exception("Mode should be either play or train")
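The training loop in Example #4 calls calc_loss(batch, net, tgt_net, gamma**nstep, double, device), which comes from elsewhere in the project. A sketch of a standard (double) DQN loss with that signature is shown below; the batch layout (states, actions, rewards, dones, next_states) and the use of MSE are assumptions.

import numpy as np
import torch
import torch.nn as nn

def calc_loss(batch, net, tgt_net, gamma, double, device="cpu"):
    # Hypothetical loss helper; the real project's buffer may return a
    # different batch layout.
    states, actions, rewards, dones, next_states = batch
    states_v = torch.tensor(np.asarray(states), dtype=torch.float32, device=device)
    next_states_v = torch.tensor(np.asarray(next_states), dtype=torch.float32, device=device)
    actions_v = torch.tensor(actions, dtype=torch.int64, device=device)
    rewards_v = torch.tensor(rewards, dtype=torch.float32, device=device)
    done_mask = torch.tensor(dones, dtype=torch.bool, device=device)

    q_sa = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)
    with torch.no_grad():
        if double:
            # Double DQN: choose the action with the online net, evaluate it with the target net
            next_actions = net(next_states_v).argmax(dim=1)
            next_q = tgt_net(next_states_v).gather(1, next_actions.unsqueeze(-1)).squeeze(-1)
        else:
            next_q = tgt_net(next_states_v).max(dim=1)[0]
        next_q[done_mask] = 0.0
    target = rewards_v + gamma * next_q
    return nn.MSELoss()(q_sa, target)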
Code Example #5
class Agent_DQN(Agent):
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example:
            parameters for neural network
            initialize Q net and target Q net
            parameters for replay buffer
            parameters for q-learning; decaying epsilon-greedy
            ...
        """

        super(Agent_DQN, self).__init__(env)
        ###########################
        # YOUR IMPLEMENTATION HERE #
        # Declare variables
        self.exp_id = uuid.uuid4().__str__().replace('-', '_')
        self.args = args
        self.env = env
        self.eps_threshold = None
        self.nA = env.action_space.n
        self.action_list = np.arange(self.nA)
        self.reward_list = deque(
            maxlen=args.window)  # np.zeros(args.window, np.float32)
        self.max_q_list = deque(
            maxlen=args.window)  # np.zeros(args.window, np.float32)
        self.loss_list = deque(
            maxlen=args.window)  # np.zeros(args.window, np.float32)
        self.probability_list = np.zeros(env.action_space.n, np.float32)
        self.cur_eps = self.args.eps
        self.t = 0
        self.ep_len = 0
        self.mode = None
        if self.args.use_pri_buffer:
            self.replay_buffer = NaivePrioritizedBuffer(
                capacity=self.args.capacity, args=self.args)
        else:
            self.replay_buffer = ReplayBuffer(capacity=self.args.capacity,
                                              args=self.args)
        self.position = 0

        self.args.save_dir += f'/{self.exp_id}/'
        os.system(f"mkdir -p {self.args.save_dir}")
        self.meta = MetaData(fp=open(
            os.path.join(self.args.save_dir, 'result.csv'), 'w'),
                             args=self.args)
        self.eps_delta = (self.args.eps -
                          self.args.eps_min) / self.args.eps_decay_window
        self.beta_by_frame = lambda frame_idx: min(
            1.0, args.pri_beta_start + frame_idx *
            (1.0 - args.pri_beta_start) / args.pri_beta_decay)

        # Create Policy and Target Networks
        if self.args.use_dueling:
            print("Using dueling dqn . . .")
            self.policy_net = DuelingDQN(env, self.args).to(self.args.device)
            self.target_net = DuelingDQN(env, self.args).to(self.args.device)
        elif self.args.use_crnn:
            print("Using dueling crnn . . .")
            self.policy_net = CrnnDQN(env).to(self.args.device)
            self.target_net = CrnnDQN(env).to(self.args.device)
        else:
            self.policy_net = DQN(env, self.args).to(self.args.device)
            self.target_net = DQN(env, self.args).to(self.args.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.optimizer = optim.Adam(self.policy_net.parameters(),
                                    lr=self.args.lr,
                                    eps=self.args.optimizer_eps)
        if self.args.lr_scheduler:
            print("Enabling LR Decay . . .")
            self.scheduler = optim.lr_scheduler.ExponentialLR(
                optimizer=self.optimizer, gamma=self.args.lr_decay)
        self.cur_lr = self.optimizer.param_groups[0]['lr']

        # Compute Huber loss
        self.loss = F.smooth_l1_loss

        # todo: Support for Multiprocessing. Bug in pytorch - https://github.com/pytorch/examples/issues/370
        self.policy_net.share_memory()
        self.target_net.share_memory()

        # Set defaults for networks
        self.policy_net.train()
        self.target_net.eval()
        self.target_net.load_state_dict(self.policy_net.state_dict())

        if args.test_dqn:
            # you can load your model here
            ###########################
            # YOUR IMPLEMENTATION HERE #
            print('loading trained model')
            self.load_model()

        if args.use_pri_buffer:
            print('Using priority buffer . . .')
        if args.use_double_dqn:
            print('Using double dqn . . .')

        if args.use_bnorm:
            print("Using batch normalization . . .")

        print("Arguments: \n", json.dumps(vars(self.args), indent=2), '\n')

    def init_game_setting(self):
        pass

    def make_action(self, observation, test=True):
        """
        Return predicted action of your agent
        Input:
            observation: np.array
                stack 4 last preprocessed frames, shape: (84, 84, 4)
        Return:
            action: int
                the predicted action from trained model
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        with torch.no_grad():
            if self.args.test_dqn:
                q, argq = self.policy_net(
                    Variable(
                        self.channel_first(observation))).data.cpu().max(1)
                return self.action_list[argq]
            # Fill up probability list equal for all actions
            self.probability_list.fill(self.cur_eps / self.nA)
            # Fetch q from the model prediction
            q, argq = self.policy_net(Variable(
                self.channel_first(observation))).data.cpu().max(1)
            # Increase the probability for the selected best action
            self.probability_list[argq[0].item()] += 1 - self.cur_eps
            # Use random choice to decide between a random action / best action
            action = torch.tensor(
                [np.random.choice(self.action_list, p=self.probability_list)])

        ###########################
        return action.item(), q.item()

    def optimize_model(self):
        """
        Function to perform optimization on DL Network
        :return: Loss
        """
        # Return if initial buffer is not filled.
        if len(self.replay_buffer.memory) < self.args.mem_init_size:
            return 0
        if self.args.use_pri_buffer:
            batch_state, batch_action, batch_next_state, batch_reward, batch_done, indices, weights = self.replay_buffer.sample(
                self.args.batch_size, beta=self.beta_by_frame(self.t))
        else:
            batch_state, batch_action, batch_next_state, batch_reward, batch_done = self.replay_buffer.sample(
                self.args.batch_size)
        batch_state = Variable(
            self.channel_first(
                torch.tensor(np.array(batch_state), dtype=torch.float32)))
        batch_action = Variable(
            torch.tensor(np.array(batch_action), dtype=torch.long))
        batch_next_state = Variable(
            self.channel_first(
                torch.tensor(np.array(batch_next_state), dtype=torch.float32)))
        batch_reward = Variable(
            torch.tensor(np.array(batch_reward), dtype=torch.float32))
        batch_done = Variable(
            torch.tensor(np.array(batch_done), dtype=torch.float32))
        policy_max_q = self.policy_net(batch_state).gather(
            1, batch_action.unsqueeze(1)).squeeze(1)
        if self.args.use_double_dqn:
            policy_ns_max_q = self.policy_net(batch_next_state)
            next_q_value = self.target_net(batch_next_state).gather(
                1,
                torch.max(policy_ns_max_q, 1)[1].unsqueeze(1)).squeeze(1)
            target_max_q = next_q_value * self.args.gamma * (1 - batch_done)
        else:
            target_max_q = self.target_net(batch_next_state).detach().max(
                1)[0].squeeze(0) * self.args.gamma * (1 - batch_done)
        # Compute Huber loss
        if self.args.use_pri_buffer:
            loss = (policy_max_q -
                    (batch_reward + target_max_q.detach())).pow(2) * Variable(
                        torch.tensor(weights, dtype=torch.float32))
            prios = loss + 1e-5
            loss = loss.mean()
        else:
            loss = self.loss(policy_max_q, batch_reward + target_max_q)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()

        # Clip gradients between -1 and 1
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)

        if self.args.use_pri_buffer:
            self.replay_buffer.update_priorities(indices,
                                                 prios.data.cpu().numpy())

        self.optimizer.step()
        return loss.cpu().detach().numpy()

    def train(self):
        """
        Implement your training algorithm here
        """

        ###########################
        # YOUR IMPLEMENTATION HERE #
        def train_fn():
            self.t = 1
            self.mode = "Random"
            train_start = time.time()
            if not self.args.load_dir == '':
                self.load_model()
            for i_episode in range(1, self.args.max_episodes + 1):
                # Initialize the environment and state
                start_time = time.time()
                state = self.env.reset()
                self.reward_list.append(0)
                self.loss_list.append(0)
                self.max_q_list.append(0)
                self.ep_len = 0
                done = False

                # Save Model
                self.save_model(i_episode)
                # Collect garbage
                self.collect_garbage(i_episode)

                # Run the game
                while not done:
                    # Update the target network, copying all weights and biases in DQN
                    if self.t % self.args.target_update == 0:
                        print("Updating target network . . .")
                        self.target_net.load_state_dict(
                            self.policy_net.state_dict())
                    # Select and perform an action
                    self.cur_eps = max(self.args.eps_min,
                                       self.cur_eps - self.eps_delta)
                    if self.cur_eps == self.args.eps_min:
                        self.mode = 'Exploit'
                    else:
                        self.mode = "Explore"
                    action, q = self.make_action(state)
                    next_state, reward, done, _ = self.env.step(action)
                    self.reward_list[-1] += reward
                    self.max_q_list[-1] = max(self.max_q_list[-1], q)
                    # Store the transition in memory
                    self.replay_buffer.push(state, action, next_state, reward,
                                            done)
                    self.meta.update_step(self.t, self.cur_eps,
                                          self.reward_list[-1],
                                          self.max_q_list[-1],
                                          self.loss_list[-1], self.cur_lr)

                    # Increment step and Episode Length
                    self.t += 1
                    self.ep_len += 1

                    # Move to the next state
                    state = next_state

                    # Perform one step of the optimization (on the target network)
                    if self.ep_len % self.args.learn_freq == 0:
                        loss = self.optimize_model()
                        self.loss_list[-1] += loss
                self.loss_list[-1] /= self.ep_len

                # Decay Step:
                if self.args.lr_scheduler:
                    self.cur_lr = self.scheduler.get_lr()[0]
                    if i_episode % self.args.lr_decay_step == 0 and self.cur_lr > self.args.lr_min:
                        self.scheduler.step(i_episode)

                # Update meta
                self.meta.update_episode(
                    i_episode, self.t,
                    time.time() - start_time,
                    time.time() - train_start, self.ep_len,
                    len(self.replay_buffer.memory),
                    self.cur_eps, self.reward_list[-1],
                    np.mean(self.reward_list), self.max_q_list[-1],
                    np.mean(self.max_q_list), self.loss_list[-1],
                    np.mean(self.loss_list), self.mode, self.cur_lr)

        import multiprocessing as mp
        processes = []
        for rank in range(4):
            p = mp.Process(target=train_fn)
            p.start()
            processes.append(p)
        for p in processes:
            p.join()
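Example #5 calls self.channel_first(...) on both single observations and sampled batches, but that helper is not included in the snippet. A minimal sketch, assuming frames arrive in (H, W, C) order:

    def channel_first(self, x):
        # Hypothetical helper: move the frame-stack axis from (H, W, C) to
        # PyTorch's (C, H, W) layout, adding a batch dimension for single
        # observations. Any scaling/normalization is assumed to happen elsewhere.
        t = torch.as_tensor(x, dtype=torch.float32, device=self.args.device)
        if t.dim() == 3:              # single observation (H, W, C)
            t = t.unsqueeze(0)        # -> (1, H, W, C)
        return t.permute(0, 3, 1, 2)  # -> (N, C, H, W)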
Code Example #6
class Agent_DQN():
    def __init__(self, env, test=False):
        self.cuda = torch.device('cuda')
        print("Using device: " + torch.cuda.get_device_name(self.cuda),
              flush=True)

        self.env = env
        self.state_shape = env.observation_space.shape
        self.n_actions = env.action_space.n

        self.memory = deque(maxlen=100000)
        self.batch_size = 32
        self.mem_threshold = 50000

        self.gamma = 0.99

        self.learning_rate = 1e-4

        self.epsilon = 1.0
        self.epsilon_min = 0.05
        self.epsilon_period = 10000
        self.epsilon_decay = (self.epsilon -
                              self.epsilon_min) / self.epsilon_period

        self.update_rate = 4

        self.start_epoch = 1
        self.epochs = 10
        self.epoch = 10000

        self.model = DQN(self.state_shape, self.n_actions).to(self.cuda)
        print("DQN parameters: {}".format(count_parameters(self.model)))

        self.target = DQN(self.state_shape, self.n_actions).to(self.cuda)
        self.target.eval()
        self.target_update = 10000

        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=self.learning_rate)
        if test:
            self.model.load_state_dict(torch.load('model.pt'))

    def init_game_setting(self):
        pass

    def make_action(self, observation, test=False):
        epsilon = 0.01 if test else self.epsilon
        # turn action into tensor
        observation = torch.tensor(observation,
                                   device=self.cuda,
                                   dtype=torch.float)
        # turn off learning
        self.model.eval()
        # epsilon greedy policy
        if random.random() > epsilon:
            # no need to calculate gradient
            with torch.no_grad():
                # choose highest value action
                b = self.model(observation)
                b = b.cpu().data.numpy()
                action = np.random.choice(
                    np.flatnonzero(np.isclose(b, b.max())))
        else:
            # random action
            action = random.choice(np.arange(self.n_actions))
        # turn learning back on
        self.model.train()
        return action

    def replay_buffer(self):
        # Return tuple of sars transitions
        states, actions, rewards, next_states, dones = zip(
            *random.sample(self.memory, self.batch_size))
        states = torch.tensor(np.vstack(states),
                              device=self.cuda,
                              dtype=torch.float)
        actions = torch.tensor(np.array(actions),
                               device=self.cuda,
                               dtype=torch.long)
        rewards = torch.tensor(np.array(rewards, dtype=np.float32),
                               device=self.cuda,
                               dtype=torch.float)
        next_states = torch.tensor(np.vstack(next_states),
                                   device=self.cuda,
                                   dtype=torch.float)
        dones = torch.tensor(np.array(dones, dtype=np.float32),
                             device=self.cuda,
                             dtype=torch.float)
        return states, actions, rewards, next_states, dones

    def experience_replay(self, n=0):
        # clamp gradient
        clamp = False
        # Reset gradient (because it accumulates by default)
        self.optimizer.zero_grad()
        # sample experience memory
        states, actions, rewards, next_states, dones = self.replay_buffer()
        # get Q(s,a) for sample
        Q = self.model(states).gather(1, actions.unsqueeze(-1)).squeeze(-1)
        # get max_a' Q(s',a')
        Q_prime = self.target(next_states).detach().max(1)[0]
        # calculate y = r + gamma * max_a' Q(s',a') for non-terminal states
        Y = rewards + (self.gamma * Q_prime) * (1 - dones)
        # Huber loss of Q and Y
        loss = F.smooth_l1_loss(Q, Y)
        # Compute dloss/dx
        loss.backward()
        # Clamp gradient
        if clamp:
            for param in self.model.parameters():
                param.grad.data.clamp_(-1, 1)
        # Change the weights
        self.optimizer.step()

    def train(self):
        step = 0
        learn_step = 0
        print("Begin Training:", flush=True)
        learn_curve = []
        last30 = deque(maxlen=30)
        for epoch in range(self.start_epoch, self.epochs + 1):
            durations = []
            rewards = []
            flag = []
            # progress bar
            epoch_bar = tqdm(range(self.epoch), total=self.epoch, ncols=200)
            for episode in epoch_bar:
                # reset state
                state = self.env.reset()
                # decay epsilon
                if self.epsilon > self.epsilon_min:
                    self.epsilon -= self.epsilon_decay
                # run one episode
                done = False
                ep_duration = 0
                ep_reward = 0
                while not done:
                    step += 1
                    ep_duration += 1
                    # get epsilon-greedy action
                    action = self.make_action(state)
                    # do action
                    next_state, reward, done, info = self.env.step(action)
                    ep_reward += reward
                    # add transition to replay memory
                    self.memory.append(
                        Transition(state, action, reward, next_state, done))
                    state = next_state
                    # learn from experience, if available
                    if step % self.update_rate == 0 and len(
                            self.memory) > self.mem_threshold:
                        self.experience_replay(learn_step)
                        learn_step += 1
                    # update target network
                    if step % self.target_update == 1:
                        self.target.load_state_dict(self.model.state_dict())

                durations.append(ep_duration)
                rewards.append(ep_reward)
                last30.append(ep_reward)
                learn_curve.append(np.mean(last30))
                flag.append(info['flag_get'])
                epoch_bar.set_description(
                    "epoch {}/{}, avg duration = {:.2f}, avg reward = {:.2f}, last30 = {:2f}"
                    .format(epoch, self.epochs, np.mean(durations),
                            np.mean(rewards), learn_curve[-1]))
            # save model every epoch
            plt.clf()
            plt.plot(learn_curve)
            plt.title(f"DQN Epoch {epoch} with {save_prefix} Reward")
            plt.xlabel('Episodes')
            plt.ylabel('Moving Average Reward')
            if not os.path.exists(f"{save_prefix}_DQN"):
                os.mkdir(f"{save_prefix}_DQN")
            torch.save(self.model.state_dict(),
                       f'{save_prefix}_DQN/DQN_model_ep{epoch}.pt')
            pickle.dump(
                rewards,
                open(f"{save_prefix}_DQN/DQN_reward_ep{epoch}.pkl", 'wb'))
            pickle.dump(flag,
                        open(f"{save_prefix}_DQN/flag_ep{epoch}.pkl", 'wb'))
            plt.savefig(f"{save_prefix}_DQN/epoch{epoch}.png")
            learn_curve = []
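Example #6 appends Transition(state, action, reward, next_state, done) to its replay memory, but the Transition type is defined outside the snippet. Given the unpacking order in replay_buffer, it is presumably a namedtuple along these lines:

from collections import namedtuple

# Field order matches the unpacking in Agent_DQN.replay_buffer above (assumed).
Transition = namedtuple('Transition',
                        ('state', 'action', 'reward', 'next_state', 'done'))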
Code Example #7
class Agent_DQN(Agent):
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example: 
            parameters for neural network
            initialize Q net and target Q net
            parameters for replay buffer
            parameters for q-learning; decaying epsilon-greedy
            ...
        """

        super(Agent_DQN, self).__init__(env)
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.memory = []
        self.env = env
        self.n_actions = env.env.action_space.n
        self.policy_net = DQN(4, self.n_actions).to(device).float()
        self.target_net = DQN(4, self.n_actions).to(device).float()
        # self.policy_net.load_state_dict(torch.load("best_weights_model.pt"))
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.eps_threshold = EPS_START
        self.args = args
        self.test_count = 0
        self.max_reward = 0
        self.max_reward_so_far = 0
        self.reward_buffer = []
        self.flag = 0
        self.steps_done = 0
        # self.target_net.eval()
        self.transition = []
        self.test_mean_reward = 0

        self.optimizer = torch.optim.Adam(self.policy_net.parameters(),
                                          lr=LEARNING_RATE)

        if args.test_dqn:
            #you can load your model here
            print('loading trained model')
            ###########################
            # YOUR IMPLEMENTATION HERE #
            self.policy_net.load_state_dict(
                torch.load("best_weights_model.pt",
                           map_location=torch.device('cpu')))

    def init_game_setting(self):
        """
        Testing function will call this function at the beginning of a new game
        Put anything you want to initialize if necessary.
        If no parameters need to be initialized, you can leave it as blank.
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        # self.env.reset()
        ###########################
        pass

    def make_action(self, observation, test=True):
        """
        Return predicted action of your agent
        Input:
            observation: np.array
                stack 4 last preprocessed frames, shape: (84, 84, 4)
        Return:
            action: int
                the predicted action from trained model
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        observation = np.transpose(observation, (2, 0, 1))
        if not test:
            # print("Helllo")
            # global steps_done

            sample = random.random()

            self.eps_threshold = self.eps_threshold - (EPS_START -
                                                       EPS_END) / EPS_DECAY

            if self.eps_threshold < EPS_END:
                self.eps_threshold = EPS_END
            # print("Steps after increment ", self.steps_done)
            if sample > self.eps_threshold:
                with torch.no_grad():

                    q_sa = self.policy_net(
                        torch.from_numpy(observation).unsqueeze(0).to(device))
                    index = torch.argmax(q_sa.data, dim=1).item()

                    return index
            else:
                return np.random.randint(0, self.n_actions)
        else:
            q_sa = self.policy_net(
                torch.from_numpy(observation).unsqueeze(0).to(device))
            index = torch.argmax(q_sa.data, dim=1).item()

            return index

        ###########################
        # return action

    def push(self):
        """ You can add additional arguments as you need. 
        Push new data to buffer and remove the old one if the buffer is full.
        
        Hints:
        -----
            you can consider deque(maxlen = 10000) list
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        if len(self.memory) >= 50000:
            self.memory.pop(0)
        self.memory.append(self.transition)

        # if(len(self.memory)%500==0 or len(self.memory)>= 50000):
        #     print("Memory size : ", len(self.memory))
        ###########################

    def replay_buffer(self, batch_size):
        """ You can add additional arguments as you need.
        Select batch from buffer.
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        transitions = random.sample(self.memory, batch_size)
        batch = Transition(*zip(*transitions))
        ###########################
        return batch

    def train(self):
        """
        Implement your training algorithm here
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #

        # reward_buffer = deque([])
        current_loss = 0.0
        mean_reward = 0.0
        for i_episode in range(NUM_EPISODES):
            # Initialize the environment and state
            # self.env.reset()
            # last_screen = get_screen()
            # current_screen = get_screen()
            state = self.env.reset()
            # state = np.transpose(state,(2,0,1)) #New
            # state = torch.tensor([state])
            episode_Reward = 0.0
            for t in range(EPISODE_STEP_LIMIT):
                # Render here
                # self.env.env.render()
                self.steps_done += 1

                action = self.make_action(state, False)
                # 'Transition',('state', 'action', 'next_state', 'reward', 'done'))

                next_state, reward, done, _ = self.env.step(action)
                episode_Reward += reward

                state = np.transpose(state, (2, 0, 1))  #New
                next_state = np.transpose(next_state, (2, 0, 1))
                self.transition = (state, action, next_state, reward, done)
                self.push()

                # Move to the next state
                state = next_state

                # self.env.render()

                # Update the target network, copying all weights and biases in DQN
                # print("Steps : ",steps_done)
                if self.steps_done % TARGET_UPDATE == 0:
                    print("**********Updating Target********")
                    self.target_net.load_state_dict(
                        self.policy_net.state_dict())

                # Perform one step of the optimization (on the target network)
                # optimize step start
                # print("Memory Size", len(self.memory))
                # print("Completed 10,000 steps")
                if len(self.memory) > 10000 and len(self.memory) % 4 == 0:
                    if self.flag == 0:
                        print("Crossed 10000")
                        self.flag = 1

                    batch = self.replay_buffer(BATCH_SIZE)

                    # 'Transition',('state', 'action', 'next_state', 'reward', 'done'))
                    state_batch = torch.from_numpy(np.asarray(batch[0]))
                    action_batch = torch.from_numpy(np.asarray(batch[1]))
                    next_state_batch = torch.from_numpy(np.asarray(batch[2]))
                    reward_batch = torch.from_numpy(np.asarray(
                        batch[3])).to(device)
                    done_batch = torch.from_numpy(np.asarray(
                        batch[4])).to(device)

                    state_action_values = self.policy_net(
                        state_batch.to(device)).gather(
                            1, action_batch[:, None].to(device)).squeeze(1)

                    q_max = self.target_net(
                        next_state_batch.to(device)).max(1)[0].detach()

                    q_max[done_batch] = 0

                    expected_state_action_values = (
                        q_max) * GAMMA + reward_batch
                    #print (state_action_values.double().size())

                    #print (expected_state_action_values.double().size())
                    loss = F.smooth_l1_loss(
                        state_action_values.double(),
                        expected_state_action_values.double())

                    current_loss = loss
                    # print("Episode : ", i_episode, ", iteration : ",t, " Loss :  ", current_loss, " Steps : ", steps_done," Epsilon : ", self.eps_threshold, " Mean Reward : ", mean_reward)

                    # optimize the model
                    self.optimizer.zero_grad()
                    loss.backward()

                    self.optimizer.step()

                if done:
                    if len(self.reward_buffer) >= REWARD_BUFFER_SIZE:
                        self.reward_buffer.pop(0)
                    self.reward_buffer.append(episode_Reward)
                    mean_reward = np.mean(self.reward_buffer)
                    break

            if (i_episode % 500 == 0):
                env2 = env('BreakoutNoFrameskip-v4',
                           self.args,
                           atari_wrapper=True,
                           test=True)
                test(self, env2, total_episodes=100)
                writer.add_scalar('Test Mean Reward', self.test_mean_reward,
                                  i_episode)
                if self.test_mean_reward > self.max_reward_so_far:
                    torch.save(self.policy_net.state_dict(),
                               "best_weights_model.pt")
                    self.max_reward_so_far = self.test_mean_reward

            writer.add_scalar('Train Mean Reward', mean_reward, i_episode)
            writer.add_scalar('Training LOSS', current_loss, i_episode)

            # To calculate mean reward
            if i_episode % 100 == 0:
                # print("*****************")
                print("TRAIN Mean Reward after ", i_episode, " episodes is ",
                      mean_reward, " Epsilon ", self.eps_threshold)
            if i_episode % 500 == 0:
                torch.save(self.policy_net.state_dict(), "saved_model.pt")
                print("Saved Model after ", i_episode, " episodes")
        self.env.env.close()
        writer.close()
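Example #7 relies on module-level globals that the snippet does not show: device, a TensorBoard writer, a Transition namedtuple, a test() evaluation helper, and the upper-case hyperparameters. The block below is purely illustrative; the field order of Transition follows the comment inside train(), while the numeric values are placeholders rather than the project's real settings.

import torch
from collections import namedtuple
from torch.utils.tensorboard import SummaryWriter

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
writer = SummaryWriter()
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward', 'done'))

# Placeholder hyperparameters (assumed values)
EPS_START = 1.0
EPS_END = 0.025
EPS_DECAY = 100000          # steps over which epsilon is annealed
LEARNING_RATE = 1.5e-4
BATCH_SIZE = 32
GAMMA = 0.99
TARGET_UPDATE = 5000        # steps between target-network syncs
NUM_EPISODES = 100000
EPISODE_STEP_LIMIT = 10000
REWARD_BUFFER_SIZE = 100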
Code Example #8
class Agent_DQN(Agent):
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example: building your model
        """

        super(Agent_DQN, self).__init__(env)
        self.env = env
        self.args = args
        self.episode = 0
        self.n_actions = self.env.action_space.n
        self.epsilon_start = 1.0
        self.epsilon_final = 0.025
        self.epsilon_decay = 3000
        self.epsilon_by_frame = lambda frame_idx: self.epsilon_final + (
            self.epsilon_start - self.epsilon_final) * math.exp(
                -1. * frame_idx / self.epsilon_decay)
        self.epsilon = 0
        self.eval_net = DQN().cuda()
        self.target_net = DQN().cuda()
        self.target_net.load_state_dict(self.eval_net.state_dict())

        self.criterion = nn.MSELoss()
        #self._model = Net(self.env.observation_space.shape, self.env.action_space.n)
        self._use_cuda = torch.cuda.is_available()
        self.optim = torch.optim.Adam(self.eval_net.parameters(),
                                      lr=self.args.learning_rate)

        if self._use_cuda:
            self.eval_net = self.eval_net.cuda()
            self.target_net = self.target_net.cuda()
            self.criterion = self.criterion.cuda()

#       self.replaybuffer = ReplayBuffer(args.buffer_size)
        self.buffer = deque(maxlen=10000)
        if args.test_dqn:
            #you can load your model here
            print('loading trained model')
            self.eval_net.load_state_dict(torch.load(args.model_dqn))
            self.target_net.load_state_dict(self.eval_net.state_dict())
            if self._use_cuda:
                self.eval_net = self.eval_net.cuda()
                self.target_net = self.target_net.cuda()

        ##################
        # YOUR CODE HERE #
        ##################

    def init_game_setting(self):
        """
        Testing function will call this function at the beginning of a new game
        Put anything you want to initialize if necessary
        """
        ##################
        # YOUR CODE HERE #
        ##################
        pass

    def push(self, state, action, reward, next_state, done):
        state = np.expand_dims(state, 0)
        next_state = np.expand_dims(next_state, 0)
        self.buffer.append((state, action, reward, next_state, done))

    def replay_buffer(self, batch_size):
        state, action, reward, next_state, done = zip(
            *random.sample(self.buffer, batch_size))
        return np.concatenate(state), action, reward, np.concatenate(
            next_state), done

    def train(self):
        """
        Implement your training algorithm here
        """
        ##################
        # YOUR CODE HERE #
        ##################

        print('begin train...')

        #        if self.args.log_file is not None:
        #        fp_log = open(self.args.log_file, 'w', buffering=1)
        fout = open('dqn_score.log', 'w')
        if not os.path.exists('model'):
            os.makedirs('model')

        losses = []
        all_rewards = []
        episode_reward = 0
        best_mean_reward = 0
        state = self.env.reset()
        for i_step in range(self.args.max_steps):
            self.epsilon = self.epsilon_by_frame(i_step)
            action = self.make_action(state)
            next_state, reward, done, _ = self.env.step(action)

            self.push(state, action, reward, next_state, done)
            state = next_state
            episode_reward += reward

            if done:
                state = self.env.reset()
                all_rewards.append(episode_reward)
                self.episode += 1
                print('{},{}'.format(self.episode, episode_reward))
                fout.write('Episode{},episode_reward{}\n'.format(
                    self.episode, episode_reward))
                episode_reward = 0

            if len(self.buffer) == self.args.buffer_size:
                if i_step % self.args.eval_net_update_step == 0:
                    loss = self.optimize_model()
                    losses.append(loss)

                if i_step % self.args.target_net_update_step == 0:
                    self.target_net.load_state_dict(self.eval_net.state_dict())

            if i_step % self.args.save_freq == 0:
                mean_reward = \
                    sum(all_rewards[-100:]) / 100
                if best_mean_reward < mean_reward:
                    print('save best model with mean reward = %f' %
                          mean_reward)
                    best_mean_reward = mean_reward
                    torch.save(self.eval_net.state_dict(), self.args.model_dqn)

    def make_action(self, observation, test=True):
        """
        Return predicted action of your agent
        Input:
            observation: np.array
                stack 4 last preprocessed frames, shape: (84, 84, 4)
        Return:
            action: int
                the predicted action from trained model
        """
        ##################
        # YOUR CODE HERE #
        ##################
        observation = torch.cuda.FloatTensor(
            observation.reshape((1, 84, 84, 4))).transpose(1,
                                                           3).transpose(2, 3)
        #        print(type(observation))
        Q_value = self.eval_net.forward(observation).data.cpu().numpy()
        if random.random() > self.epsilon:
            action = np.argmax(Q_value)
        else:
            action = self.env.get_random_action()
        return action

    def optimize_model(self):

        state, action, reward, next_state, done = self.replay_buffer(
            self.args.batch_size)

        state = torch.FloatTensor(np.float32(state)).permute(0, 3, 1, 2)
        next_state = torch.FloatTensor(np.float32(next_state)).permute(
            0, 3, 1, 2)
        action = torch.LongTensor(action)
        reward = torch.FloatTensor(reward)
        done = torch.FloatTensor(done)

        if self._use_cuda:
            state = state.cuda()
            next_state = next_state.cuda()
            action = action.cuda()
            reward = reward.cuda()
            done = done.cuda()

        q_values = self.eval_net(state)

        # next_q_values = self.target_net(next_state).detach()

        q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)
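        # q_value is Q(s, a) for the actions actually taken; the target below is
        # r + gamma * max_a' Q_target(s', a'), with (1 - done) dropping the
        # bootstrap term on terminal transitions.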

        next_q_values = self.target_net(next_state).detach()
        next_q_value = next_q_values.max(1)[0]

        expected_q_value = reward + self.args.gamma * next_q_value * (1 - done)

        loss = self.criterion(q_value, expected_q_value.data)

        self.optim.zero_grad()
        loss.backward()
        self.optim.step()

        return loss
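
Note: the train() loop above relies on self.epsilon_by_frame, which is defined in the omitted __init__. A minimal sketch of such a decaying-epsilon schedule, assuming an exponential anneal with the same endpoints that コード例 #11 below uses (the standalone function name and default values here are ours, not this repository's):

import math

def epsilon_by_frame(frame_idx, eps_start=1.0, eps_end=0.02, eps_decay=200000):
    # Decay epsilon exponentially from eps_start towards eps_end;
    # after roughly eps_decay frames it is within 1/e of eps_end.
    return eps_end + (eps_start - eps_end) * math.exp(-1.0 * frame_idx / eps_decay)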
コード例 #9
0
def ddqn_rankWeight_train(env, exploreScheduler, betaScheduler, optimizer_constructor, model_type, batch_size, rp_start, rp_size, 
	exp_frame, exp_initial, exp_final, prob_alpha, gamma, target_update_steps, frames_per_epoch, 
	frames_per_state, output_directory, last_checkpoint, max_frames, envo):

	"""
	Implementation of the training algorithm for DDQN using Rank-based prioritization.
	Information with regards to the algorithm can be found in the paper, 
	"Prioritized Experience Replay" by Tom Schaul, John Quan, Ioannis Antonoglou and
	David Silver. Refer to section 3.3 in the paper for more info. 
	"""
	
	gym.undo_logger_setup()
	logging.basicConfig(filename=envo+'_'+'ddqn_rank_weightedLoss_training.log',level=logging.INFO)
	num_actions = env.action_space.n
	env.reset()
	
	print('No. of actions: ', num_actions)
	print(env.unwrapped.get_action_meanings())

	# initialize action value and target network with the same weights
	model = DQN(num_actions)
	target = DQN(num_actions)

	if use_cuda:
		model.cuda()
		target.cuda()

	frames_count = 1

	if last_checkpoint:
		model.load_state_dict(torch.load(last_checkpoint))
		print(last_checkpoint)
		print('weights loaded...')

		#TODO: Implementation of resume
		# exp_replay = util.initialize_rank_replay_resume(env, rp_start, rp_size, frames_per_state, 
		# 	model, target, gamma, batch_size)
		# frames_count = get_index_from_checkpoint_path(last_checkpoint)

	else:
		exp_replay = util.initialize_rank_replay(env, rp_start, rp_size, frames_per_state, 
			model, target, gamma, prob_alpha)

	target.load_state_dict(model.state_dict())

	optimizer = optimizer_constructor.type(model.parameters(), lr=optimizer_constructor.kwargs['lr'],
		alpha=optimizer_constructor.kwargs['alpha'], eps=optimizer_constructor.kwargs['eps'] )

	episodes_count = 1
	frames_per_episode = 1
	epsiodes_durations = []
	rewards_per_episode = 0
	rewards_duration = []
	loss_per_epoch = []
	wLoss_func = Weighted_Loss()

	
	current_state, _, _, _ = util.play_game(env, frames_per_state)
	print('Starting training...')

	for frames_count in range(1, max_frames):

		epsilon=exploreScheduler.anneal_linear(frames_count)
		beta = betaScheduler.anneal_linear(frames_count)
		choice = random.uniform(0,1)

		# epsilon greedy algorithm
		if choice <= epsilon:
			action = LongTensor([[random.randrange(num_actions)]])

		else:
			action = util.get_greedy_action(model, current_state)

		curr_obs, reward, done, _ = util.play_game(env, frames_per_state, action[0][0])

		rewards_per_episode += reward
		reward = Tensor([[reward]])
		current_state_ex = Variable(current_state, volatile=True)
		curr_obs_ex = Variable(curr_obs, volatile=True)
		action_ex = Variable(action, volatile=True)
		reward_ex = Variable(reward, volatile=True)

		#compute td-error for one sample
		td_error = ddqn_compute_td_error(batch_size=1, state_batch=current_state_ex, reward_batch=reward_ex, action_batch=action_ex, 
			next_state_batch=curr_obs_ex, model=model, target=target, gamma=gamma)

		td_error = torch.pow(torch.abs(td_error)+1e-8, prob_alpha)
		exp_replay.push(current_state, action, reward, curr_obs, td_error)
		current_state = curr_obs

		# compute y 
		if len(exp_replay) >= batch_size:
			# Get batch samples
			obs_samples, obs_ranks, obs_priorityVals = exp_replay.sample(batch_size)
			num_samples_per_batch = len(obs_samples)
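			# Importance-sampling correction from the PER paper:
			# w_i = (N * P(i))^(-beta), normalised by the maximum weight so that
			# updates are only ever scaled down (P(i) is the stored priority).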
			obs_priorityTensor = torch.from_numpy(np.array(obs_priorityVals))
			p_batch = 1/ obs_priorityTensor
			w_batch_raw = (1/len(exp_replay) * p_batch)**beta
			max_weight = exp_replay.get_max_weight(beta)
			w_batch = w_batch_raw/max_weight
			w_batch = w_batch.type(Tensor)
			
			batch = Experience(*zip(*obs_samples))

			loss, new_weights = ddqn_compute_y(batch, num_samples_per_batch, model, target, gamma, w_batch, wLoss_func)
			loss_abs = torch.abs(new_weights)
			exp_replay.update(obs_ranks, loss_abs)

			optimizer.zero_grad()
			loss.backward()

			for param in model.parameters():
				param.grad.data.clamp_(-1,1)

			optimizer.step()
			loss_per_epoch.append(loss.data.cpu().numpy()[0])
		
		frames_per_episode+= frames_per_state

		if done:
			rewards_duration.append(rewards_per_episode)
			rewards_per_episode = 0
			frames_per_episode=1
			episodes_count+=1
			env.reset()
			current_state, _, _, _ = util.play_game(env, frames_per_state)

			if episodes_count % 100 == 0:
				avg_episode_reward = sum(rewards_duration)/100.0
				avg_reward_content = 'Episode from', episodes_count-99, ' to ', episodes_count, ' has an average of ', avg_episode_reward, ' reward and loss of ', sum(loss_per_epoch)
				print(avg_reward_content)
				logging.info(avg_reward_content)
				rewards_duration = []
				loss_per_epoch = []

		# update weights of target network for every TARGET_UPDATE_FREQ steps
		if frames_count % target_update_steps == 0:
			target.load_state_dict(model.state_dict())

		# sort memory replay every half of it's capacity iterations 
		if frames_count % int(rp_size/2) == 0:
			exp_replay.sort()


		#Save weights every 250k frames
		if frames_count % 250000 == 0:
			util.make_sure_path_exists(output_directory+'/'+envo+'/')
			torch.save(model.state_dict(), output_directory+'/'+envo+'/rank_weightedLoss_'+ str(frames_count)+'.pth')


		#Print frame count and sort experience replay for every 1000000 (one million) frames:
		if frames_count % 1000000 == 0:
			training_update = 'frame count: ', frames_count, 'episode count: ', episodes_count, 'epsilon: ', epsilon
			print(training_update)
			logging.info(training_update)
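
Both rank-based examples (コード例 #9 above and コード例 #10 below) sample transitions according to section 3.3 of the Prioritized Experience Replay paper, where p_i = 1/rank(i) and P(i) = p_i^alpha / sum_k p_k^alpha. A small self-contained sketch of that sampling distribution (the helper name is ours, not from either repository):

import numpy as np

def rank_based_probabilities(buffer_len, alpha):
    # p_i = 1 / rank(i); rank 1 is the transition with the largest TD error.
    priorities = (1.0 / np.arange(1, buffer_len + 1)) ** alpha
    return priorities / priorities.sum()  # P(i) = p_i^alpha / sum_k p_k^alpha

# For example, with alpha = 0.7 the top-ranked of 10 stored transitions is
# sampled roughly five times as often as the lowest-ranked one.
probs = rank_based_probabilities(10, alpha=0.7)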
コード例 #10
0
def ddqn_rank_train(env, exploreScheduler, betaScheduler,
                    optimizer_constructor, model_type, batch_size, rp_start,
                    rp_size, exp_frame, exp_initial, exp_final, prob_alpha,
                    gamma, target_update_steps, frames_per_epoch,
                    frames_per_state, output_directory, last_checkpoint,
                    max_frames, envo):
    """
	Implementation of the training algorithm for DDQN using Rank-based prioritization.
	Information with regards to the algorithm can be found in the paper, 
	"Prioritized Experience Replay" by Tom Schaul, John Quan, Ioannis Antonoglou and
	David Silver. Refer to section 3.3 in the paper for more info. 
	"""

    gym.undo_logger_setup()
    logging.basicConfig(filename=envo + '_' +
                        'ddqn_rank_weighted_training.log',
                        level=logging.INFO)
    num_actions = env.action_space.n
    env.reset()

    print('No. of actions: ', num_actions)
    print(env.unwrapped.get_action_meanings())

    # initialize action value and target network with the same weights
    model = DQN(num_actions)
    target = DQN(num_actions)

    if use_cuda:
        model.cuda()
        target.cuda()

    frames_count = 1

    if last_checkpoint:
        model.load_state_dict(torch.load(last_checkpoint))
        print(last_checkpoint)
        print('weights loaded...')

        #TODO: Implementation of resume
        # exp_replay = util.initialize_rank_replay_resume(env, rp_start, rp_size, frames_per_state,
        # 	model, target, gamma, batch_size)
        # frames_count = get_index_from_checkpoint_path(last_checkpoint)

    else:
        exp_replay = util.initialize_rank_replay(env, rp_start, rp_size,
                                                 frames_per_state, model,
                                                 target, gamma, prob_alpha)

    target.load_state_dict(model.state_dict())

    optimizer = optimizer_constructor.type(
        model.parameters(),
        lr=optimizer_constructor.kwargs['lr'],
        alpha=optimizer_constructor.kwargs['alpha'],
        eps=optimizer_constructor.kwargs['eps'])

    episodes_count = 1
    epsiodes_durations = []
    rewards_per_episode = 0
    rewards_duration = []
    loss_per_epoch = []
    current_state, _, _, _ = util.play_game(env, frames_per_state)
    wLoss_func = Weighted_Loss()

    print('Starting training...')

    for frames_count in range(1, max_frames):

        epsilon = exploreScheduler.anneal_linear(frames_count)
        beta = betaScheduler.anneal_linear(frames_count)
        choice = random.uniform(0, 1)

        # epsilon greedy algorithm
        if choice <= epsilon:
            action = LongTensor([[random.randrange(num_actions)]])

        else:
            action = util.get_greedy_action(model, current_state)

        curr_obs, reward, done, _ = util.play_game(env, frames_per_state,
                                                   action[0][0])

        rewards_per_episode += reward
        reward = Tensor([[reward]])
        td_error = 1

        temp_exp = Experience(current_state, action, reward, curr_obs,
                              td_error)
        current_state = curr_obs

        # compute y
        if len(exp_replay) >= batch_size:
            # Get batch samples

            # start = time.time()

            if frames_count % rp_size == 0:
                obs_samples, obs_priorityVals = exp_replay.sample(batch_size -
                                                                  1,
                                                                  prob_alpha,
                                                                  sort=True)
            else:
                obs_samples, obs_priorityVals = exp_replay.sample(batch_size -
                                                                  1,
                                                                  prob_alpha,
                                                                  sort=False)

            obs_samples.append(temp_exp)
            obs_priorityVals.append(td_error)
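            # The newest transition is appended to every batch with priority 1,
            # and its IS weight is forced to the batch maximum below, so it is
            # always trained on once before its true TD error is known.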

            obs_pVals_tensor = torch.from_numpy(np.array(obs_priorityVals))
            # print("P(i): ", obs_pVals_tensor)
            IS_weights = torch.pow((obs_pVals_tensor * rp_size), -beta)
            max_weight = torch.max(IS_weights)

            IS_weights_norm = torch.div(IS_weights, max_weight).type(Tensor)
            IS_weights_norm[-1] = torch.max(IS_weights_norm)

            # print("Norm W(i): ", IS_weights_norm)

            batch = Experience(*zip(*obs_samples))
            loss, new_weights = ddqn_compute_y(batch, batch_size, model,
                                               target, gamma, IS_weights_norm,
                                               wLoss_func)
            new_weights = torch.pow(new_weights, prob_alpha)
            new_exp = Experience(temp_exp.state, temp_exp.action,
                                 temp_exp.reward, temp_exp.next_state,
                                 new_weights[batch_size - 1])
            exp_replay.update(obs_samples, new_weights, new_exp)
            optimizer.zero_grad()
            loss.backward()
            # print("loss: ", loss.data)
            optimizer.step()
            loss_per_epoch.append(loss.data.cpu().numpy()[0])

        else:
            # new_exp only exists inside the branch above; push the raw transition.
            exp_replay.push(temp_exp.state, temp_exp.action, temp_exp.reward,
                            temp_exp.next_state, td_error)

        # end = time.time()

        # duration = end-start

        # print('duration : ', duration)

        if done:
            # print('Game: ', rewards_per_episode)
            rewards_duration.append(rewards_per_episode)
            rewards_per_episode = 0
            episodes_count += 1
            env.reset()
            current_state, _, _, _ = util.play_game(env, frames_per_state)

            if episodes_count % 100 == 0:
                avg_episode_reward = sum(rewards_duration) / 100.0
                avg_reward_content = 'Episode from', episodes_count - 99, ' to ', episodes_count, ' has an average of ', avg_episode_reward, ' reward and loss of ', sum(
                    loss_per_epoch)
                print(avg_reward_content)
                logging.info(avg_reward_content)
                rewards_duration = []
                loss_per_epoch = []

        # update weights of target network for every TARGET_UPDATE_FREQ steps
        if frames_count % target_update_steps == 0:
            target.load_state_dict(model.state_dict())

        #Save weights every 250k frames
        if frames_count % 250000 == 0:
            util.make_sure_path_exists(output_directory + '/' + envo + '/')
            torch.save(
                model.state_dict(), output_directory + '/' + envo +
                '/rank_uniform' + str(frames_count) + '.pth')

        #Print frame count and sort experience replay for every 1000000 (one million) frames:
        if frames_count % 1000000 == 0:
            training_update = 'frame count: ', frames_count, 'episode count: ', episodes_count, 'epsilon: ', epsilon
            print(training_update)
            logging.info(training_update)
コード例 #11
0
class Agent_DQN(Agent):
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example: 
            parameters for neural network
            initialize Q net and target Q net
            parameters for replay buffer
            parameters for q-learning; decaying epsilon-greedy
            ...
        """

        super(Agent_DQN,self).__init__(env)
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.epsilon_start = 1
        self.epsilon_end = 0.02
        self.epsilon_decay = 200000
        self.epsilon = self.epsilon_start
        
        self.gamma = 0.99
        self.env = env
        
        self.buffer_size = 30000
        self.buffer = deque(maxlen=30000)
        
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.policy_net = DQN().to(self.device)
        self.target_net = DQN().to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        
        self.optimizer = torch.optim.Adam(self.policy_net.parameters(),lr=0.00015)
        self.reward_array = []
        self.reward_x_axis = []
        self.batch_size = 32
        if args.test_dqn:
            #you can load your model here
            print('loading trained model')
            ###########################
            # YOUR IMPLEMENTATION HERE #
            

    def init_game_setting(self):
        """
        Testing function will call this function at the beginning of a new game
        Put anything you want to initialize if necessary.
        If no parameters need to be initialized, you can leave it as blank.
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.policy_net.load_state_dict(torch.load('policy_model'))
        self.target_net.load_state_dict(self.policy_net.state_dict()) 
        self.policy_net = self.policy_net.cuda()
        self.target_net = self.target_net.cuda()
        ###########################
        pass
    
    
    def make_action(self, observation, test=True):
        """
        Return predicted action of your agent
        Input:
            observation: np.array
                stack 4 last preprocessed frames, shape: (84, 84, 4)
        Return:
            action: int
                the predicted action from trained model
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        if test==True:
            self.epsilon = 0
        observation=torch.cuda.FloatTensor(observation.reshape((1,84,84,4))).transpose(1,3).transpose(2,3)
        q = self.policy_net(observation).data.cpu().numpy()
        if random.random() > self.epsilon:
           action  = np.argmax(q)
        else:
            action = random.randint(0,4)
        return action
    
    def push(self,data):
        """ You can add additional arguments as you need. 
        Push new data to buffer and remove the old one if the buffer is full.
        
        Hints:
        -----
            you can consider deque(maxlen = 10000) list
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.buffer.append(data)
        ###########################
        
        
    def replay_buffer(self,batch_size):
        """ You can add additional arguments as you need.
        Select batch from buffer.
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #      
        ###########################
        return random.sample(self.buffer,batch_size)
        
    def play_game(self,start_state):
        action = self.make_action(start_state)
        n_s,r,terminal,_ = self.env.step(action)
        self.push((start_state,action,r,n_s,terminal))
        return n_s,r,terminal
    
    def loss_function(self):
        data = self.replay_buffer(self.batch_size)
        s,a,r,n_s,terminal = zip(*data)
        s = torch.FloatTensor(np.float32(s)).permute(0,3,1,2).to(self.device)
        a = torch.LongTensor(a).to(self.device)
        r = torch.FloatTensor(r).to(self.device)
        n_s = torch.FloatTensor(np.float32(n_s)).permute(0,3,1,2).to(self.device).to(self.device)
        terminal = torch.FloatTensor(terminal).to(self.device)
        q = self.policy_net(s).gather(1,a.unsqueeze(1)).squeeze(1)
        n_q = self.target_net(n_s).detach().max(1)[0]
        expected_q = r + self.gamma * n_q * (1 - terminal)
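        # smooth_l1_loss is the Huber loss: quadratic for small TD errors and
        # linear for large ones, which keeps gradient magnitudes bounded.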
        loss = F.smooth_l1_loss(q, expected_q.data)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        
        
    def train(self):
        """
        Implement your training algorithm here
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        rewards_array = []
        reward_ = 0
        best_mean = 0
        print_rate = 100
        last_saved = None
        start_state = self.env.reset()
        for frames in range (3500000):
            self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * math.exp(-1. *frames / self.epsilon_decay)
            n_s,r,terminal = self.play_game(start_state)
            start_state = n_s
            reward_ += r
            if terminal:
                start_state = self.env.reset()
                rewards_array.append(reward_)
                if len(rewards_array) % print_rate==0:
                    print('%%%%%%%%%%%%%%%%%%%%%%%%%')
                    print('Frames = ', frames)
                    print('Current Epsilon = ', self.epsilon)
                    print('Episode = ', len(rewards_array))
                    print('Reward = ', np.mean(rewards_array[-100:]))#sum(rewards_array[-100:]) / 100)
                    print('Buffer Length = ', len(self.buffer))
                    self.reward_array.append(np.mean(rewards_array[-100:]))
                    self.reward_x_axis.append(len(rewards_array))
                    self.print_graph()
                    if last_saved != None:
                        print("last saved = ", best_mean)
                    print('%%%%%%%%%%%%%%%%%%%%%%%%%')
                reward_ = 0
                
            if len(self.buffer)<10000:
                continue   
            if len(self.buffer) > 10000 and frames % 4 ==0:
                    self.loss_function()
                
            if frames % 1000 == 0:
                    print("Target net updated")
                    self.target_net.load_state_dict(self.policy_net.state_dict())
                    
            mean_reward = np.mean(rewards_array[-100:])
            if mean_reward > best_mean and frames % 100==0:
                    print("Saving model with reward = ", mean_reward)
                    best_mean = mean_reward
                    last_saved = mean_reward
                    torch.save(self.policy_net.state_dict(), 'policy_model')  # match the filename loaded in init_game_setting
        ###########################
    
    def print_graph(self):
        fig = plt.figure()
        ax = plt.subplot(111)
        ax.plot(self.reward_x_axis,self.reward_array,label='$y = Rewards, $x = episodes')
        ax.legend()
        fig.savefig('plot.png')
コード例 #12
0
ファイル: play.py プロジェクト: P-Schumacher/Multiagent-Pong
import time
import torch
import gym.wrappers
# DQN and the custom `wrappers` module are defined elsewhere in this project.

'''
This function is for seeing models in action, it uses a slightly modified version of the environment which has longer 
timeouts and longer episode lengths
NOT WORKING
'''

DEFAULT_ENV_NAME = "RoboschoolPong-v1"  # Use a longer version of Pong for demonstration (needs to be defined in source)
MAKE_VIDEO = False  # Set true or false here to record video OR render, not both

env = gym.make(DEFAULT_ENV_NAME)
env = wrappers.action_space_discretizer(env, 2)
net = DQN(env.observation_space.shape[0], env.action_space.n)
net.load_state_dict(torch.load("RoboschoolPong-v1-time_update.dat"))
env.reset()
recorder = gym.wrappers.monitoring.video_recorder.VideoRecorder(env, "./recording.mp4", enabled=MAKE_VIDEO)
still_open = True

for i in range(1):
    obs = env.reset()
    while True:
        recorder.capture_frame()
        action = net(torch.tensor(obs, dtype=torch.float32)).max(0)[1]
        action = int(action.item())
        # The original excerpt never advanced the environment; stepping with the
        # chosen action (and stopping once the episode ends) is assumed here.
        obs, reward, done, _ = env.step(action)
        if not MAKE_VIDEO:
            still_open = env.render()
        if done or not still_open:
            break
recorder.close()
コード例 #13
0
ファイル: agent_dqn.py プロジェクト: AJ1897/DQN-Breakout
class Agent_DQN(Agent):
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example: 
            parameters for neural network
            initialize Q net and target Q net
            parameters for replay buffer
            parameters for q-learning; decaying epsilon-greedy
            ...
        """

        super(Agent_DQN,self).__init__(env)
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.epochs = 10
        self.n_episodes = 1000000
        self.env = env
        self.nA = self.env.action_space.n
        # self.nS = self.env.observation_space
        self.batch_size = 32
        self.DQN = DQN()
        self.Target_DQN = DQN()
        self.buffer_memory = 1000000
        self.train_buffer_size = 4
        self.min_buffer_size = 10000
        self.target_update_buffer = 10000
        self.learning_rate = 0.0001
        self.discount_factor = 0.999
        self.epsilon = 1
        self.min_epsilon = 0.01
        # self.decay_rate = 0.999
        self.ep_decrement = (self.epsilon - self.min_epsilon)/self.n_episodes
        self.criteria = nn.MSELoss()
        self.optimiser = optim.Adam(self.DQN.parameters(),self.learning_rate)
        self.buffer=[]
        self.Evaluation = 100000
        self.total_evaluation__episodes = 100
        self.full_train = 100000
        if args.test_dqn:
            #you can load your model here
            print('loading trained model')
            ###########################
            # YOUR IMPLEMENTATION HERE #
            

    def init_game_setting(self):
        """
        Testing function will call this function at the beginning of a new game
        Put anything you want to initialize if necessary.
        If no parameters need to be initialized, you can leave it as blank.
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        obs = self.env.reset()
        ###########################
        pass
    
    
    def make_action(self, observation, test=True):
        """
        Return predicted action of your agent
        Input:
            observation: np.array
                stack 4 last preprocessed frames, shape: (84, 84, 4)
        Return:
            action: int
                the predicted action from trained model
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        if not test:
            p = random.random()
            if p < self.epsilon:
                action = np.random.randint(0,self.nA)
            else:
                a = self.DQN(torch.from_numpy(observation).unsqueeze(0))
                action = np.argmax(a.detach().numpy())
        else:
            a = self.Target_DQN(torch.from_numpy(observation).unsqueeze(0))
            action = np.argmax(a.detach().numpy())
        ###########################
        return action
    
    def push(self,episode):
        """ You can add additional arguments as you need. 
        Push new data to buffer and remove the old one if the buffer is full.
        
        Hints:
        -----
            you can consider deque(maxlen = 10000) list
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        if len(self.buffer) < self.buffer_memory:
            self.buffer.append(episode)
        else:
            self.buffer.pop(0)
            self.buffer.append(episode)
        ###########################
        
        
    def replay_buffer(self):
        """ You can add additional arguments as you need.
        Select batch from buffer.
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        
        batch = random.sample(self.buffer,self.batch_size)
        # print(np.shape(batch[0][:]))
        batch = list(zip(*batch))
        # print(np.asarray(batch[1]))
        batch_x = torch.from_numpy(np.asarray(batch[0]))
        act = torch.from_numpy(np.vstack(batch[1]))
        rew = torch.from_numpy(np.asarray(batch[2]))
        dones = torch.from_numpy(np.asarray(batch[3]))
        batch_y = torch.from_numpy(np.asarray(batch[4]))
        # print(act.shape)
        ###########################
        return batch_x,act,rew,dones,batch_y
        

    def train(self):
        """
        Implement your training algorithm here
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        current = 1
        reward_list =[]
        loss_list= []
        current_train = 1
        current_target = 1
        for x in range(self.n_episodes):
            obs = np.transpose(self.env.reset(),(2,0,1))
            # print(obs[0][40][:])
            e_list=[]
            done = False
            accumulated_rewards = 0
            while not done:
                # self.env.render()
                action = self.make_action(obs,False)
                next_obs,reward,done,info = self.env.step(action)
                next_obs = np.transpose(next_obs,(2,0,1))
                # print(info['ale.lives'])
                # print(np.shape(e_list[-1]))
                accumulated_rewards+=reward
                self.push([obs,action,reward,done,next_obs])
                self.epsilon-=self.ep_decrement
                if current_train % self.train_buffer_size == 0 and len(self.buffer) > self.min_buffer_size:
                    batch_x,act,rew,dones,batch_y = self.replay_buffer()
                    self.optimiser.zero_grad()
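                    # One-step TD target y = r + gamma * max_a' Q_target(s', a');
                    # future_return[dones] = 0 removes the bootstrap term for
                    # terminal transitions.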
                    future_return =  self.Target_DQN(batch_y).max(1)[0].detach() * self.discount_factor
                    future_return[dones] = 0
                    y = rew + future_return
                    c_q = self.DQN(batch_x).gather(1,act)
                    loss = self.criteria(c_q.double(),(y.double()).unsqueeze(1))
                    loss_list.append(loss.detach())
                    loss.backward()
                    # self.env.render()
                    self.optimiser.step()
                    current_train = 1

                if current_target > self.target_update_buffer:
                    self.Target_DQN.load_state_dict(self.DQN.state_dict())
                    current_target = 1

                if current % self.full_train == 0:
                    # current = 1
                    # print("\n Weights: \n",list(self.DQN.parameters()),"\r")
                    dataset = my_dataset(self.buffer)
                    for i in range(self.epochs):
                        loader = torch.utils.data.DataLoader(dataset, batch_size = 32, shuffle = True)
                        # print(len(list(loader)))
                        for batch in list(loader):
                            batch_x,act,rew,dones,batch_y=batch
                            self.optimiser.zero_grad()
                            future_return =  self.Target_DQN(batch_y).max(1)[0].detach() * self.discount_factor
                            future_return[dones] = 0
                            y = rew + future_return
                            c_q = self.DQN(batch_x).gather(1,act.unsqueeze(1))
                            loss = self.criteria(c_q.double(),y.double().unsqueeze(1))
                            loss_list.append(loss.detach())
                            loss.backward()
                            self.optimiser.step()
                
                if current % self.Evaluation == 0:
                    # current = 1
                    # print("\n Weights: \n",list(self.DQN.parameters()),"\r")
                    print("\n","#" * 40, "Evaluation number %d"%(current/self.Evaluation),"#" * 40)
                    rewards = []
                    for i in range(self.total_evaluation__episodes):
                        state = np.transpose(self.env.reset(),(2,0,1))
                        done = False
                        episode_reward = 0.0
                        #playing one game
                        while(not done):
                            action = self.make_action(state, test=True)
                            state, reward, done, info = self.env.step(action)
                            episode_reward += reward
                            state = np.transpose(state,(2,0,1))
                        rewards.append(episode_reward)
                    print('Run %d episodes'%(self.total_evaluation__episodes))
                    print('Mean:', np.mean(rewards))
                    print("#" * 40, "Evaluation Ended!","#" * 40,"\n")


                current+=1
                current_train += 1
                current_target += 1
                obs = next_obs
            reward_list.append(accumulated_rewards)
            if len(reward_list) % 200 == 0:
                reward_list = reward_list[-150:]
                # print(reward_list)
                loss_list = loss_list[-150:]
            if x%100 == 0:
                print("Current = %d, episode = %d, Average_reward = %0.2f, epsilon = %0.2f"%(current, x+1, np.mean(reward_list[-100:]), self.epsilon))






        
        ###########################
class Agent_DQN_Trainer(object):
    def __init__(self, env, args):

        # Training Parameters
        self.args = args
        self.env = env
        self.batch_size = args.batch_size
        self.lr = args.lr
        self.gamma = args.gamma_reward_decay
        self.n_actions = env.action_space.n
        self.output_logs = args.output_logs
        self.step = 8e6
        self.curr_step = 0
        self.ckpt_path = args.save_dir
        self.epsilon = args.eps_start
        self.eps_end = args.eps_end
        self.target_update = args.update_target
        self.observe_steps = args.observe_steps
        self.explore_steps = args.explore_steps
        self.saver_steps = args.saver_steps
        self.resume = args.resume
        self.writer = TensorboardSummary(self.args.log_dir).create_summary()
        # Model Settings

        self.cuda = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.policy_net = DQN(4, self.n_actions)
        self.target_net = DQN(4, self.n_actions)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        if self.cuda:
            self.policy_net.to(self.cuda)
            self.target_net.to(self.cuda)

        self.target_net.eval()
        train_params = self.policy_net.parameters()
        self.optimizer = optim.RMSprop(train_params,
                                       self.lr,
                                       momentum=0.95,
                                       eps=0.01)
        self.memory = ReplayMemory(args.replay_memory_size)

        if args.resume:
            if not os.path.isfile(args.resume):
                raise RuntimeError("=> no checkpoint found at '{}'".format(
                    args.resume))
            checkpoint = torch.load(args.resume)

            self.epsilon = checkpoint['epsilon']
            self.curr_step = checkpoint['step']

            self.policy_net.load_state_dict(checkpoint['policy_state_dict'])
            self.target_net.load_state_dict(checkpoint['target_state_dict'])
            self.optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['episode']))

    def epsilon_greedy_policy(self, observation, nA, test=False):

        observation = to_float(observation).to(self.cuda)
        # print("size of observation->"+str(sys.getsizeof(observation.storage())))
        sample = random.random()

        if test:
            return self.policy_net(observation).max(1)[1].view(1, 1).item()

        if sample <= self.epsilon:
            action = torch.tensor([[random.randrange(self.n_actions)]],
                                  device=self.cuda,
                                  dtype=torch.long)
        else:
            with torch.no_grad():
                action = self.policy_net(observation).max(1)[1].view(1, 1)

        return action

    def optimize_model(self):

        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))

        non_final_mask = torch.tensor(tuple(
            map(lambda s: s is not None, batch.next_state)),
                                      device=self.cuda,
                                      dtype=torch.bool)
        non_final_next_states = torch.cat(
            [to_float(s) for s in batch.next_state if s is not None])
        state_batch = torch.cat(
            [to_float(s).to(self.cuda) for s in batch.state])
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)

        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)
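        # next_state_values starts at zero; only non-terminal entries are filled
        # with max_a' Q_target(s', a'), so terminal states contribute no
        # bootstrap term to the target.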
        next_state_values = torch.zeros(self.batch_size, device=self.cuda)
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).max(1)[0].detach()

        expected_state_action_values = (next_state_values *
                                        self.gamma) + reward_batch
        loss = F.smooth_l1_loss(
            state_action_values.float(),
            expected_state_action_values.unsqueeze(1).float())
        self.optimizer.zero_grad()
        loss.backward()

        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)

        self.optimizer.step()
        return loss.item()

    def train(self):

        current_loss = 0
        train_rewards = []
        train_episode_len = 0.0
        file_loss = open(self.output_logs, "a")
        file_loss.write("episode,step,epsilon,reward,loss,length\n")
        print("Training Started")
        episode = 0
        loss = 0.0

        while self.curr_step < self.step:
            state = to_tensor(self.env.reset())

            # * State is in torch.uint8 format , convert before passing to model*#
            done = False
            episode_reward = 0.0
            train_loss = 0
            s = 0  # length of episode
            while not done:
                # self.env.env.render()

                action = self.epsilon_greedy_policy(state, self.n_actions)

                new_state, reward, done, _ = self.env.step(
                    action.item())  # new_state torch.uint8 format
                new_state, reward = to_tensor(new_state).to(
                    self.cuda), torch.tensor([reward], device=self.cuda)
                episode_reward += reward
                self.memory.push(state, action, new_state, reward)

                if (self.curr_step > self.observe_steps) and (
                        self.curr_step % self.args.update_current) == 0:
                    loss = self.optimize_model()
                    train_loss += loss

                print(
                    'Step: %i,  Episode: %i,  Action: %i,  Reward: %.0f,  Epsilon: %.5f, Loss: %.5f'
                    % (self.curr_step, episode, action.item(), reward.item(),
                       self.epsilon, loss),
                    end='\r')

                if self.curr_step > self.observe_steps and self.curr_step % self.target_update == 0:
                    self.target_net.load_state_dict(
                        self.policy_net.state_dict())
                    # TO CHECK APPROXIMATELY HOW MUCH GPU MEMORY OUR REPLAY MEMORY IS CONSUMING
                    print(torch.cuda.get_device_name(0))
                    print('Memory Usage:')
                    print('Allocated:',
                          round(torch.cuda.memory_allocated(0) / 1024**3, 1),
                          'GB')
                    print('Cached:   ',
                          round(torch.cuda.memory_cached(0) / 1024**3, 1),
                          'GB')

                if self.epsilon > self.args.eps_end and self.curr_step > self.observe_steps:
                    interval = self.args.eps_start - self.args.eps_end
                    self.epsilon -= interval / float(self.args.explore_steps)

                self.curr_step += 1
                state = new_state
                s += 1

                if self.curr_step % self.args.saver_steps == 0 and episode != 0 and self.curr_step != 0:
                    k = {
                        'step': self.curr_step,
                        'epsilon': self.epsilon,
                        'episode': episode,
                        'policy_state_dict': self.policy_net.state_dict(),
                        'target_state_dict': self.target_net.state_dict(),
                        'optimizer': self.optimizer.state_dict()
                    }
                    filename = os.path.join(self.ckpt_path, 'ckpt.pth.tar')
                    torch.save(k, filename)

            episode += 1
            train_rewards.append(episode_reward.item())
            train_episode_len += s

            if episode % self.args.num_eval == 0 and episode != 0:
                current_loss = train_loss
                avg_reward_train = np.mean(train_rewards)
                train_rewards = []
                avg_episode_len_train = train_episode_len / float(
                    self.args.num_eval)
                train_episode_len = 0.0
                file_loss.write(
                    str(episode) + "," + str(self.curr_step) + "," +
                    "{:.4f}".format(self.epsilon) + "," +
                    "{:.2f}".format(avg_reward_train) + "," +
                    "{:.4f}".format(current_loss) + "," +
                    "{:.2f}".format(avg_episode_len_train) + "\n")
                file_loss.flush()
                self.writer.add_scalar('train_loss/episode(avg100)',
                                       current_loss, episode)
                self.writer.add_scalar('episode_reward/episode(avg100)',
                                       avg_reward_train, episode)
                self.writer.add_scalar('length of episode/episode(avg100)',
                                       avg_episode_len_train, episode)

            self.writer.add_scalar('train_loss/episode', train_loss, episode)
            self.writer.add_scalar('episode_reward/episode', episode_reward,
                                   episode)
            self.writer.add_scalar('epsilon/num_steps', self.epsilon,
                                   self.curr_step)
            self.writer.add_scalar('length of episode/episode', s, episode)

        print("GAME OVER")
コード例 #15
0
node_loc = np.random.randint(
    0, 101,
    U_num).tolist()  # edge-node locations, IDs 1-100 (so we suppose that node_num == U_num???)

user_loc = np.random.randint(0, 101, U_num).tolist()  # user locations, IDs 1-100
user_dis = random_displacement(user_loc)  # users' next displacement: up/down/left/right -> -10, 10, -1, 1
use_buff = np.random.randint(3, 8, U_num).tolist()  # resources required
state0 = [user_loc, user_dis, node_loc, use_buff]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# main program

policy_net = DQN(U_num, num_actions).to(device)  # initialize the Q network
target_net = DQN(U_num, num_actions).to(device)  # initialize the target Q network
target_net.load_state_dict(policy_net.state_dict())  # initialize the target Q network with the Q network's parameters
target_net.eval()
optimizer_policy = torch.optim.Adam(policy_net.parameters(),
                                    lr=learning_rate)  # define the Adam optimizer (can be swapped out)
buffer = ReplayBuffer(
    buffer_size)  # replay buffer: stores experience tuples that are later sampled at random to update the network parameters (see Buffer.py)
criterion = torch.nn.MSELoss(reduction='sum')

# training
for i_episode in range(num_episodes):

    #state0  # obtain an initial state

    for t in count():
        # select an action
        action = e_greedy_select_action(state0)
コード例 #16
0
class Agent_DQN(Agent):

    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example:
            parameters for neural network
            initialize Q net and target Q net
            parameters for replay buffer
            parameters for q-learning; decaying epsilon-greedy
            ...
        """
        super(Agent_DQN, self).__init__(env)

        if torch.cuda.is_available():
            self.device = 'cuda'
            print("Using GPU!!!!")
        else:
            self.device = 'cpu'
            print("WARNING")
            print("WARNING")
            print("Using CPU")

        self.state_size = env.get_state()[0].as1xnArray().shape[0]
        self.action_size = 4

        self.memory = deque(maxlen=10000)
        self.thirty_ep_ep = deque(maxlen=10000)
        self.thirty_ep_reward = deque(maxlen=10000)

        # Discount Factor
        self.gamma = 0.99
        # Exploration Rate: at the beginning do 100% exploration
        self.epsilon = 1.0
        # Decay epsilon so we can shift from exploration to exploitation
        self.epsilon_decay = 0.995
        # Set floor for how low epsilon can go
        self.epsilon_min = 0.01
        # Set the learning rate
        self.learning_rate = 0.00015
        # batch_size
        self.batch_size = 32

        self.epsilon_decay_frames = 1.0/500000

        self.policy_net = DQN(self.state_size, self.action_size).to(self.device)
        self.target_net = DQN(self.state_size, self.action_size).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.learning_rate)

        self.loss = 0

        self.file_path = 'trained_models_2/./Q_Network_Parameters_'

        with open('trained_models_2/log2.txt', 'w+') as log:
            log.write("episode,avg_reward,epsilon\n")

        if args.test_dqn:
            # load trained model
            print('loading trained model')
            file_number_to_load = 1933
            load_file_path = self.file_path+str(file_number_to_load)+'.pth'
            self.policy_net.load_state_dict(torch.load(load_file_path, map_location=lambda storage, loc: storage))

            # for name, param in self.policy_net.named_parameters():
            # print(name, '\t\t', param.shape)
            print('loaded weights')
            print(self.policy_net.head.weight)


    def train(self, n_episodes=100000):
        ep_epsilon = []
        accumulated_reward = 0
        rewards_30 = []

        for i_episode in range(n_episodes):
            results = self.env.reset()
            state, reward, done, _ = self.unpack(results)
            render = os.path.isfile('.makePicture')


            # Counters for Reward Averages per episode:
            ep_reward = 0.0

            while not done:
                action = self.make_action(state, False)
                results = self.env.step({0: action})
                next_state, reward, done, _ =  self.unpack(results)
                # print(reward, done)
                self.push(state, action, reward, next_state, done)
                state = next_state

                ep_reward += reward
                accumulated_reward += reward


            if i_episode > 1000 and len(self.memory) > self.batch_size:
                self.learn()
                if i_episode % 5000 == 0:
                    print('------------ UPDATING TARGET -------------')
                    self.target_net.load_state_dict(self.policy_net.state_dict())

            rewards_30.append(ep_reward)
            # print(rewards_30)
            if len(rewards_30) > 30:
                # print("IN HERE")
                del rewards_30[0]

            ep_epsilon.append(self.epsilon)
            # Print average reward for the episode:
            # print('Episode ', i_episode, 'had a reward of: ', ep_reward)
            # print('Epsilon: ', self.epsilon)

            # Logging the average reward over 30 episodes
            if i_episode % 30 == 0:
                self.thirty_ep_reward.append(accumulated_reward/30.0)
                self.thirty_ep_ep.append(i_episode)
                with open('trained_models_2/log.txt', 'a+') as log:
                    log.write(str(i_episode)+' had a reward of ' + str(accumulated_reward/30.0)+' over 30 ep\n')
                with open('trained_models_2/log2.txt', 'a+') as log:
                    log.write(str(i_episode) + ',' + str(sum(rewards_30)/30.0) + ',' + str(self.epsilon) + '\n')

                accumulated_reward = 0.0
                # Save network weights after we have started to learn
                if i_episode > 3000 and i_episode % 1000 == 0:

                    print('saving... ', i_episode)
                    save_file_path = self.file_path+str(i_episode)+'.pth'
                    torch.save(self.policy_net.state_dict(), save_file_path)


                fig = plt.figure()
                plt.plot(ep_epsilon)
                plt.title('Epsilon decay')
                plt.xlabel('Episodes')
                plt.ylabel('Epsilon Value')
                plt.savefig('trained_models_2/epsilon.png')
                plt.close()

                fig = plt.figure()
                plt.plot(self.thirty_ep_ep, self.thirty_ep_reward)
                plt.title('Average Reward per 30 Episodes')
                plt.xlabel('Episodes')
                plt.ylabel('Average Reward')
                plt.savefig('trained_models_2/reward.png')
                plt.close()

            if i_episode % 200 == 0:
                print('Episode: ',i_episode ,'Avg reward of last 30 episodes: ', sum(rewards_30)/30.0)

    def learn(self):
        sampled_batch = self.replay_buffer(self.batch_size)

        states, actions, rewards, next_states, dones = list(zip(*sampled_batch))

        states = torch.from_numpy(np.stack(states)).to(self.device)
        actions = torch.from_numpy(np.stack(actions)).to(self.device)
        rewards = torch.from_numpy(np.stack(rewards)).to(self.device)
        next_states = torch.from_numpy(np.stack(next_states)).to(self.device)
        dones = torch.from_numpy(np.stack(dones)).to(self.device)
        
        states = states.float()
        next_states = next_states.float()
        actions = actions.unsqueeze(1)
        qfun = self.policy_net(states)

        state_action_values = qfun.gather(1, actions.long()).squeeze()

        next_state_values = self.target_net(next_states).max(1).values.detach()
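        # Despite its name, TD_error below is the TD *target*
        # r + gamma * max_a' Q_target(s', a'), with (1 - dones) masking
        # terminal transitions.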

        TD_error = rewards + self.gamma*next_state_values*(1-dones)

        self.loss = f.smooth_l1_loss(state_action_values, TD_error)

        self.optimizer.zero_grad()
        self.loss.backward()

        # for param in self.policy_net.parameters():
        #     param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def make_action(self, observation, test=True):
        """
        Return predicted action of your agent
        Input:
            observation: np.array
                stack 4 last preprocessed frames, shape: (84, 84, 4)
        Return:
            action: int
                the predicted action from trained model
        """
        observation = torch.tensor(observation, dtype=torch.float32).to(self.device)
        observation = observation.unsqueeze(0)

        if not test:
            if np.random.rand() <= self.epsilon:
                action = random.randrange(self.action_size)
            else:
                with torch.no_grad():
                    # action = torch.argmax(self.policy_net(observation)).item()
                    action = self.target_net(observation).max(1)[1].view(1, 1).item()
                    # print(action)

            if self.epsilon > self.epsilon_min:
                self.epsilon = max(0, self.epsilon - self.epsilon_decay_frames)
        else:
            with torch.no_grad():
                action = torch.argmax(self.policy_net(observation)).item()
        return action

    def push(self, state, action, reward, next_state, done):
        """
        Push new data to buffer and remove the old one if the buffer is full.
        """
        action = np.array(action, dtype=np.uint8)
        reward = np.array(reward, dtype=np.float32)
        done = np.array(done, dtype=np.float32)
        self.memory.append((state, action, reward, next_state, done))

    def replay_buffer(self, batch_size):
        """
        Select batch from buffer.
        """
        return random.sample(self.memory, batch_size)

    def unpack(self, results):
        result = results[0]
        state, reward, done, info = result.asTuple()
        return state.as1xnArray(), reward, done, info

    def init_game_setting(self):
        pass
コード例 #17
0
class Agent_DQN(Agent):
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example: 
            parameters for neural network
            initialize Q net and target Q net
            parameters for replay buffer
            parameters for q-learning; decaying epsilon-greedy
            ...
        """

        super(Agent_DQN, self).__init__(env)
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.env = env
        self.buffer = ReplayBuffer()
        self.num_action = self.env.get_action_space().n
        self.cur_step = 0
        self.greedyPolicy = EpsilonGreedyStrategy(1, 0.025, 0.01)
        self.policy_net = DQN().to(self.device)
        self.target_net = DQN().to(self.device)
        self.num_episode = args.num_episode
        self.learning_rate = args.learning_rate
        self.sample_batch_size = args.sample_batch_size
        self.gamma = args.gamma
        self.e = 1
        if args.test_dqn:
            # you can load your model here
            print('loading trained model')
            ###########################
            # YOUR IMPLEMENTATION HERE #

    def init_game_setting(self):
        """
        Testing function will call this function at the beginning of a new game
        Put anything you want to initialize if necessary.
        If no parameters need to be initialized, you can leave it as blank.
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        ###########################
        pass

    def make_action(self, observation, test=True):
        """
        Return predicted action of your agent
        Input:
            observation: np.array
                stack 4 last preprocessed frames, shape: (84, 84, 4)
        Return:
            action: int
                the predicted action from trained model
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #

        if test:
            observation = self.transform(observation)
            with torch.no_grad():
                action = self.policy_net(observation).argmax(dim=1).item()
        else:
            if self.e > random.random():
                action = random.randrange(self.num_action)
            else:
                observation = self.transform(observation)
                with torch.no_grad():
                    action = self.policy_net(observation).argmax(dim=1).item()
            self.e -= self.greedyPolicy.get_exploration_rate()
        ###########################
        return action

    def push(self,experience):
        """ You can add additional arguments as you need. 
        Push new data to buffer and remove the old one if the buffer is full.
        
        Hints:
        -----
            you can consider deque(maxlen = 10000) list
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.buffer.append(experience)
        ###########################

    def replay_buffer(self, batch_size=32):
        """ You can add additional arguments as you need.
        Select batch from buffer.
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        experience = self.buffer.sample(batch_size)
        ###########################
        return  experience

    def transform(self, state):
        state = np.asarray(state) / 255.
        state = torch.tensor(state)
        state = state.unsqueeze(0)
        state = state.permute(0, 3, 1, 2)
        state = state.to(device=self.device, dtype=torch.float)
        return state

    def extract_tensors(self, experiences):
        batch = Experience(*zip(*experiences))
        t1 = batch.state
        t2 = batch.action
        t3 = batch.next_state
        t4 = batch.reward
        t5 = batch.termination
        return t1, t2, t3, t4, t5

    def get_current_q(self, states, actions):
        states = np.asarray(states) / 255.
        a = np.count_nonzero(states)
        states = torch.tensor(states, device=self.device, dtype=torch.float)
        states = states.permute(0, 3, 1, 2)
        actions = torch.tensor(np.asarray(actions), device=self.device, dtype=torch.long).unsqueeze(-1)
        QS = self.policy_net(states).gather(1,  actions)#.requires_grad_(True)
        QS = QS.permute(1, 0)
        return QS[0]

    def get_next_q(self, next_states, terminations):
        next_states = np.asarray(next_states) / 255.
        next_states = torch.tensor(next_states,device=self.device, dtype=torch.float)
        next_states = next_states.permute(0, 3, 1, 2)
        QS = self.target_net(next_states).max(1)[0].detach()#.requires_grad_(True)
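        # `terminations` stores (not done) for each transition (see the
        # Experience pushed in train), so this product zeroes the bootstrap
        # term for terminal states.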
        QS = QS * torch.tensor(terminations, device=self.device, dtype=torch.float)

        return QS

    def train(self):
        """
        Implement your training algorithm here
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        optimizor = optim.Adam(params=self.policy_net.parameters(), lr = self.learning_rate,
                               betas = (0.5, 0.999))
        max_reward = -1
        rewards_sum = 0
        reward_collection = []
        episode_collection = []
        print(self.device)

        for episode in range(self.num_episode):
            done = False
            state = self.env.reset()
            while not done:
                action = self.make_action(state, False)
                next_state, reward, done, info = self.env.step(action)
                self.push(
                    Experience(state,
                               action,
                               next_state,
                               reward,
                               (not done)
                               )
                          )
                rewards_sum += reward
                state = next_state
                if self.buffer.can_sample():
                    experiences = self.buffer.sample(self.sample_batch_size)

                    states, actions, next_states, rewards, terminations = self.extract_tensors(experiences)

                    current_q = self.get_current_q(states, actions)

                    next_q = self.get_next_q(next_states, terminations)

                    target_q = self.gamma * next_q +  torch.tensor(rewards, device=self.device, dtype=torch.float)

                    loss = F.smooth_l1_loss(current_q, target_q)

                    optimizer.zero_grad()
                    loss.backward()
                    # clip gradients element-wise to [-1, 1] for stability
                    for param in self.policy_net.parameters():
                        param.grad.data.clamp_(-1, 1)
                    optimizer.step()
            if episode % 3000 == 0:
                self.target_net.load_state_dict(self.policy_net.state_dict())
            if episode % 30 == 0:
                print("episode: ", episode, "\t", "average reward :", rewards_sum/30)
                reward_collection.append(rewards_sum/30)
                episode_collection.append(episode)

                if rewards_sum > max_reward:
                    max_reward = rewards_sum
                    torch.save(self.policy_net.state_dict(), "model/policy_net_max_reward.pth")
                rewards_sum = 0
            if episode%1000 == 0:
                torch.save(self.policy_net.state_dict(), "model/policy_net.pth")
        torch.save(self.policy_net.state_dict(), "model/policy_net.pth")
        x = episode_collection
        y = reward_collection
        plt.plot(x, y)
        plt.savefig('episode-reward.png')
        plt.show()
Code example #18
class Agent_DQN(Agent):
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example:
            parameters for neural network
            initialize Q net and target Q net
            parameters for replay buffer
            parameters for q-learning; decaying epsilon-greedy
            ...
        """

        super(Agent_DQN,self).__init__(env)
        ###########################
        # YOUR IMPLEMENTATION HERE #

        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        self.state_size = env.observation_space.shape
        self.action_size = env.action_space.n
        self.memory = deque(maxlen = 1000000)
        self.thirty_ep_reward = deque(maxlen = 100000)

        #print(self.state_size, self.action_size)
        # Discount Factor
        self.gamma = 0.99
        # Exploration Rate: at the beginning do 100% exploration
        self.epsilon = 1.0
        # Decay epsilon so we can shift from exploration to exploitation
        self.epsilon_decay = 0.995
        # Set floor for how low epsilon can go
        self.epsilon_min = 0.01
        # Set the learning rate
        self.learning_rate = 0.00015
        # batch_size
        self.batch_size = 32

        self.epsilon_decay_frames = 1.0/500000
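        # epsilon is annealed linearly by 1/500,000 per learning step in learn(), so it
        # takes roughly 500k updates to go from 1.0 down to epsilon_min; the multiplicative
        # epsilon_decay above is defined but not used by this example.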

        self.qnetwork = DQN(self.state_size[0], self.state_size[1], self.action_size).to(self.device)
        print('initial weights:')
        print(self.qnetwork.head.weight)
        self.q_prime = DQN(self.state_size[0], self.state_size[1], self.action_size).to(self.device)
        self.q_prime.load_state_dict(self.qnetwork.state_dict())

        self.optimizer = optim.Adam(self.qnetwork.parameters(), lr = self.learning_rate)

        self.loss = 0

        self.file_path = 'trained_models_2/Q_Network_Parameters_'

        if args.test_dqn:
            #you can load your model here
            print('loading trained model')
            ###########################
            # YOUR IMPLEMENTATION HERE #
            file_number_to_load = 1933
            load_file_path = self.file_path+str(file_number_to_load)+'.pth'
            self.qnetwork.load_state_dict(torch.load(load_file_path, map_location = lambda storage, loc: storage))

            #for name, param in self.qnetwork.named_parameters():
            #    print(name, '\t\t', param.shape)
            print('loaded weights')
            print(self.qnetwork.head.weight)
    def init_game_setting(self):
        """
        Testing function will call this function at the begining of new game
        Put anything you want to initialize if necessary.
        If no parameters need to be initialized, you can leave it as blank.
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.curr_state = self.env.reset()
        ###########################
        pass


    def make_action(self, observation, test=True):
        """
        Return predicted action of your agent
        Input:
            observation: np.array
                stack 4 last preprocessed frames, shape: (84, 84, 4)
        Return:
            action: int
                the predicted action from trained model
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        observation = observation[np.newaxis,:]
        observation = torch.tensor(observation, dtype = torch.float32).to(self.device)
        observation = observation.permute(0 , 3, 1, 2)
        if not test:
            if np.random.rand()<=self.epsilon:
                action = random.randrange(self.action_size)
            else:
                action = torch.argmax(self.qnetwork(observation)).item()
        else:
            action = torch.argmax(self.qnetwork(observation)).item()
        ###########################
        return action

    def push(self, state, action, reward, next_state, done):
        """ You can add additional arguments as you need.
        Push new data to buffer and remove the old one if the buffer is full.

        Hints:
        -----
            you can consider deque(maxlen = 10000) list
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        action = np.array(action, dtype = np.uint8)
        reward = np.array(reward, dtype = np.float32)
        done = np.array(done, dtype = np.float32)
        self.memory.append((state, action, reward, next_state, done))
        ###########################


    def replay_buffer(self, batch_size):
        """ You can add additional arguments as you need.
        Select batch from buffer.
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        minibatch = random.sample(self.memory, batch_size)
        ###########################
        return minibatch
    def learn(self):
        minibatch = self.replay_buffer(self.batch_size)

        states, actions, rewards, next_states, dones = list(zip(*minibatch))

        states = torch.from_numpy(np.stack(states)).to(self.device)
        actions = torch.from_numpy(np.stack(actions)).to(self.device)
        rewards = torch.from_numpy(np.stack(rewards)).to(self.device)
        next_states = torch.from_numpy(np.stack(next_states)).to(self.device)
        dones = torch.from_numpy(np.stack(dones)).to(self.device)

        states = states.permute(0 , 3, 1, 2).float()
        next_states = next_states.permute(0, 3, 1, 2).float()
        actions = actions.unsqueeze(1)
        qfun = self.qnetwork(states)

        #print('input...\n',states[1][1].shape)
        #fig = plt.figure()
        #plt.imshow(states[0,0,:,:].cpu())
        #plt.title('State')
        #plt.savefig('state.png')
        #plt.close()

        state_action_values = qfun.gather(1, actions.long()).squeeze()

        next_state_values = self.q_prime(next_states).max(1).values.detach()

        td_target = rewards + self.gamma * next_state_values * (1 - dones)

        self.loss = F.smooth_l1_loss(state_action_values, td_target)

        self.optimizer.zero_grad()
        self.loss.backward()

        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon - self.epsilon_decay_frames)


        for param in self.qnetwork.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        #print(torch.sum(self.qnetwork.conv1.weight.data))

    def train(self, n_episodes = 100000):
        """
        Implement your training algorithm here
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #

        # Initializing counters and lists for average reward over 30 episodes:
        ep_counter = 0.0
        time_steps = 0.0
        thirty_reward = 0.0
        ep_epsilon = []
        thirty_ep_reward = []
        thirty_ep_ep = []

        naming_counter = 0
        log = open('trained_models_2/log.txt', 'w+')
        log.write('Beginning of Log\n')
        log.close()

        frames = 0.0
        for e in range(n_episodes):

            running_loss = 0.0
            ep_counter += 1
            state = self.env.reset()
            done = False
            render = os.path.isfile('.makePicture')

            # Counters for Reward Averages per episode:
            ep_reward = 0.0
            counter = 0.0



            while not done:
                frames += 1
                counter += 1
                time_steps += 1

                if render: self.env.env.render()
                action = self.make_action(state, False)
                next_state, reward, done, _ = self.env.step(action)
                reward = np.clip(reward, -1, 1)

                self.push(state, action, reward, next_state, done)

                state = next_state
                #if done:
                #    reward = -1

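                # Learning and target-network syncing only start once 500,000 frames have been
                # collected; until then the agent just acts epsilon-greedily and fills the buffer.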
                if frames > 500000:
                    if len(self.memory) > self.batch_size:
                        self.learn()
                        if frames%5000 == 0:
                            print('------------ UPDATING TARGET -------------')
                            self.q_prime.load_state_dict(self.qnetwork.state_dict())

                running_loss += float(self.loss)
                ep_reward+=reward
                thirty_reward += reward

            ep_epsilon.append(self.epsilon)
            # Print average reward for the episode:
            print('Episode ', e, 'had a reward of: ', ep_reward)
            print('Epsilon: ', self.epsilon)

            # Logging the average reward over 30 episodes
            if ep_counter%30 == 0:
                print('Frame: ', frames)
                thirty_ep_reward.append(thirty_reward/30)
                thirty_ep_ep.append(e)
                print('The Average Reward over 30 Episodes: ', thirty_reward/30.0)
                with open('trained_models_2/log.txt', 'a+') as log:
                    log.write(str(naming_counter)+' had a reward of '+ str(thirty_reward/30.0)+' over 30 ep\n')

                time_steps = 0.0
                thirty_reward = 0.0
                # Save network weights after we have started to learn
                if e > 3000:

                    print('saving... ', naming_counter)
                    save_file_path = self.file_path+str(naming_counter)+'.pth'
                    torch.save(self.qnetwork.state_dict(), save_file_path)
                    naming_counter += 1


                fig = plt.figure()
                plt.plot(ep_epsilon)
                plt.title('Epsilon decay')
                plt.xlabel('Episodes')
                plt.ylabel('Epsilon Value')
                plt.savefig('trained_models_2/epsilon.png')
                plt.close()

                fig = plt.figure()
                plt.plot(thirty_ep_ep, thirty_ep_reward)
                plt.title('Average Reward per 30 Episodes')
                plt.xlabel('Episodes')
                plt.ylabel('Average Reward')
                plt.savefig('trained_models_2/reward.png')
                plt.close()
Code example #19
class Agent_DQN(Agent):
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example:
            parameters for neural network
            initialize Q net and target Q net
            parameters for replay buffer
            parameters for q-learning; decaying epsilon-greedy
            ...
        """

        super(Agent_DQN, self).__init__(env)
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.env = env
        self.batch_size = BATCH_SIZE
        self.gamma = 0.999
        self.eps_start = EPS_START
        self.eps_decay = EPS_DECAY
        self.TARGET_UPDATE = TARGET_UPDATE

        self.policy_net = DQN(self.env.action_space.n)
        self.target_net = DQN(self.env.action_space.n)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        if use_cuda:
            self.policy_net.cuda()
            self.target_net.cuda()

        self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=1e-5)
        self.memory = deque(maxlen=10000)

        if args.test_dqn:
            # you can load your model here
            print('loading trained model')
            ###########################
            # YOUR IMPLEMENTATION HERE #

    def init_game_setting(self):
        """
        Testing function will call this function at the begining of new game
        Put anything you want to initialize if necessary.
        If no parameters need to be initialized, you can leave it as blank.
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #

        ###########################
        pass

    def make_action(self, observation, test=True):
        """
        Return predicted action of your agent
        Input:
            observation: np.array
                stack 4 last preprocessed frames, shape: (84, 84, 4)
        Return:
            action: int
                the predicted action from trained model
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
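        # steps_done is a module-level counter shared across calls; it drives the
        # exponential epsilon schedule below.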
        global steps_done
        self.policy_net.eval()
        sample = random.random()
        eps_threshold = EPS_END + (EPS_START - EPS_END) * \
            math.exp(-1. * steps_done / EPS_DECAY)
        steps_done += 1
        if sample > eps_threshold:
            return self.policy_net(
                Variable(torch.from_numpy(observation),
                         volatile=True).type(FloatTensor)).data.max(1)[1].view(
                             1, 1)
        else:
            return LongTensor([[random.randrange(self.env.action_space.n)]])
        ###########################

    def push(self, s, a, r, s_, done):
        """ You can add additional arguments as you need.
        Push new data to buffer and remove the old one if the buffer is full.

        Hints:
        -----
            you can consider deque(maxlen = 10000) list
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.memory.append((s, a, r, s_, done))
        # deque(maxlen=10000) discards the oldest entry automatically when full

        ###########################

    def replay_buffer(self):
        """ You can add additional arguments as you need.
        Select batch from buffer.
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        #print("memory", len(self.memory), self.BATCH_SIZE)
        minibatch = random.sample(self.memory, self.batch_size)
        ###########################
        return minibatch

    def optimize_model(self):
        if len(self.memory) < BATCH_SIZE:
            return
        transitions = random.sample(self.memory, BATCH_SIZE)
        # Transpose the batch (see http://stackoverflow.com/a/19343/3343043 for
        # detailed explanation).
        batch = Transition(*zip(*transitions))

        # Compute a mask of non-final states and concatenate the batch elements
        non_final_mask = ByteTensor(
            tuple(map(lambda s: s is not None, batch.next_state)))
        non_final_next_states = Variable(torch.cat(
            [s for s in batch.next_state if s is not None]),
                                         volatile=True).cuda()
        state_batch = Variable(torch.cat(batch.state)).cuda()
        action_batch = Variable(torch.cat(batch.action)).cuda()
        reward_batch = Variable(torch.cat(batch.reward)).cuda()

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken
        self.policy_net.train()
        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        next_state_values = Variable(
            torch.zeros(BATCH_SIZE).type(Tensor)).cuda()
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).max(1)[0]
        # Compute the expected Q values
        expected_state_action_values = (next_state_values *
                                        GAMMA) + reward_batch
        # Undo volatility (which was used to prevent unnecessary gradients)
        expected_state_action_values = Variable(
            expected_state_action_values.data).cuda()

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values,
                                expected_state_action_values)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def train(self):
        """
        Implement your training algorithm here
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        num_episodes = 1400000
        for i_episode in range(num_episodes):
            # Initialize the environment and state
            observation = self.env.reset()
            observation = observation.transpose((2, 0, 1))
            observation = observation[np.newaxis, :]
            state = observation

            for t in count():
                # Select and perform an action
                action = self.make_action(state)
                next_state, reward, done, _ = self.env.step(action[0, 0])
                next_state = next_state.transpose((2, 0, 1))
                next_state = next_state[np.newaxis, :]
                reward = Tensor([reward])

                # Store the transition in memory; a terminal next state is stored
                # as None so optimize_model can mask it out when bootstrapping.
                next_state_tensor = torch.from_numpy(next_state) if not done else None
                self.memory.append(Transition(torch.from_numpy(state), action,
                                              next_state_tensor, reward))

                # Observe new state
                if not done:
                    state = next_state
                else:
                    state = None

                # Perform one step of the optimization (on the target network)
                self.optimize_model()
                if done:
                    print(
                        'resetting env. episode %d \'s reward total was %d.' %
                        (i_episode + 1, t + 1))
                    break
            # Update the target network
            if i_episode % TARGET_UPDATE == 0:
                self.target_net.load_state_dict(self.policy_net.state_dict())
            if i_episode % 50 == 0:
                checkpoint_path = os.path.join('save_dir', 'model-best.pth')
                torch.save(self.policy_net.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))
Code example #20
class Agent_DQN(Agent):
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example: 
            parameters for neural network
            initialize Q net and target Q net
            parameters for replay buffer
            parameters for q-learning; decaying epsilon-greedy
        """
        super(Agent_DQN, self).__init__(env)
        ###########################
        # initializations for replay memory
        self.env = env
        self.buffer = collections.deque(
            maxlen=REPLAY_SIZE)  # initializing a replay memory buffer

        #initializations of agent
        self._reset()
        self.last_action = 0
        self.net = DQN((4, 84, 84), self.env.action_space.n).to(DEVICE)
        self.target_net = DQN((4, 84, 84), self.env.action_space.n).to(DEVICE)
        LOAD_MODEL = True

        if args.test_dqn:
            #you can load your model here
            print('preparing to load trained model')
            ###########################
            LOAD_MODEL = True

        if LOAD_MODEL:
            self.net.load_state_dict(
                torch.load(MODEL, map_location=lambda storage, loc: storage))
            print('loaded trained model')
            self.target_net.load_state_dict(self.net.state_dict())

    def init_game_setting(self):
        """
        Testing function will call this function at the begining of new game
        Put anything you want to initialize if necessary.
        If no parameters need to be initialized, you can leave it as blank.
        """
        ###########################

        ###########################
        pass

    def push(self, experience):
        """ You can add additional arguments as you need. 
        Push new data to buffer and remove the old one if the buffer is full.
        """
        ###########################
        self.buffer.append(experience)
        ###########################

    def replay_buffer(self, batch_size):
        """ You can add additional arguments as you need.
        Select batch from buffer.

        sample a batch of 32 from the experience collected
        """
        ###########################
        indices = np.random.choice(len(self.buffer), batch_size, replace=False)
        states, actions, rewards, dones, next_states = zip(
            *[self.buffer[idx] for idx in indices])
        ###########################
        # The 'states' below are already in the transposed form because they are sampled from experience
        return np.array(states, dtype=np.float32), np.array(actions), np.array(
            rewards,
            dtype=np.float32), np.array(dones,
                                        dtype=np.bool), np.array(next_states)

    def _reset(self):
        self.state = self.env.reset()
        self.total_reward = 0.0

    def make_action(self, observation, test=True):
        """
        this is exclusively for testing our actions
        select action
        """
        state_a_test = np.array([observation.transpose(2, 0, 1)], copy=False)
        # wrapping the observation in a list adds the batch dimension of size 1
        state_v_test = torch.tensor(state_a_test).to('cpu')
        #feeding observation to the network
        Q_values_v_test = self.net.forward(state_v_test)
        # picking the action with the highest Q-value
        _, action_v_test = torch.max(Q_values_v_test, dim=1)
        # converting tensor to int
        action_test = int(action_v_test.item())
        ###########################
        return action_test

    def make_action_train(self, net, epsilon=0.0, device=DEVICE):
        """
        select action using epsilon greedy method for training purposes
        """

        if np.random.random() < self.epsilon:
            action = random.randrange(self.env.action_space.n)

        else:
            state_a = np.array([self.state.transpose(2, 0, 1)], copy=False)
            # wrapping the state in a list adds the batch dimension of size 1
            # and converts it to a tensor to be fed to the net
            state_v = Variable(torch.FloatTensor(state_a).to(device))

            #Q_values_v = self.net(state_v)
            Q_values_v = self.net.forward(state_v)

            #picking the best action
            _, action_v = torch.max(Q_values_v, dim=1)
            #converting tensor to int
            action = int(action_v.item())

        ###########################
        return action

    def take_a_step(self, net, epsilon=0.0, device=DEVICE):
        """
        execute action and take a step in the environment
        add the state,action,rewards to the experience replay
        return the total_reward
        """
        done_reward = None

        action_for_exp = self.make_action_train(self.net, self.epsilon, DEVICE)

        new_state, reward, is_done, _ = self.env.step(action_for_exp)

        #Here total reward is the reward for each episode
        self.total_reward += reward

        #remember that the state that comes in from taking a step in our environment
        # will be in the form of width X height X depth

        # But whatever state goes into experience will be in the form of depth X height X width
        # i.e the experience buffer will have state in the transposed format
        # because this is the format that pytorch input should look like
        exp = Experience(self.state.transpose(2, 0, 1), action_for_exp, reward,
                         is_done, new_state.transpose(2, 0, 1))

        #adding experiences in our replay memory
        self.push(exp)
        self.state = new_state

        if is_done:
            done_reward = self.total_reward
            self._reset()
        return done_reward

    def loss_function(self, batch, net, target_net, optimizer, device=DEVICE):
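        """
        One optimisation step: squared TD error between Q(s, a) from the online
        net and the target r + GAMMA * max_a' Q_target(s', a') * (1 - done).
        """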

        states, actions, rewards, dones, next_states = batch

        states_v = Variable(torch.FloatTensor(states).to(device))
        next_states_v = Variable(torch.FloatTensor(next_states).to(device))
        actions_v = Variable(torch.LongTensor(actions).to(device))
        rewards_v = Variable(torch.FloatTensor(rewards).to(device))
        done = Variable(torch.FloatTensor(dones).to(device))

        #Q_vals
        state_action_values = self.net(states_v).gather(
            1,
            actions_v.long().unsqueeze(-1)).squeeze(-1)

        #next_Q_vals (detached so no gradients flow into the target network)
        next_state_values = self.target_net(next_states_v).max(1)[0].detach()

        expected_state_action_values = rewards_v + next_state_values * GAMMA * (
            1 - done)

        loss = (state_action_values -
                expected_state_action_values).pow(2).mean()

        # we don't want to accumulate gradients,
        # hence it is important to zero them at every iteration

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        return loss

    def train(self):
        """
        Implement your training algorithm here
        """
        ###########################
        device = torch.device(DEVICE)

        #defining the optimizer for your neural network
        optimizer = optim.RMSprop(self.net.parameters(), lr=LEARNING_RATE)

        #empty list of total rewards
        total_rewards = []
        best_mean_reward = None
        # initializations for time and speed calculation
        frame_idx = 0
        timestep_frame = 0
        timestep = time.time()

        while True:

            frame_idx += 1
            self.epsilon = EPSILON_END + (EPSILON_START -
                                          EPSILON_END) * math.exp(
                                              -1. * frame_idx / EPSILON_DECAY)

            reward = self.take_a_step(self.net, self.epsilon, device=device)

            if reward is not None:
                #appending rewards in an empty list of total_rewards
                total_rewards.append(reward)

                # not asked to calculate speed
                speed = (frame_idx - timestep_frame) / (time.time() - timestep)
                timestep_frame = frame_idx
                timestep = time.time()

                #calculating mean of the most recent 100 rewards
                mean_reward = np.mean(total_rewards[-100:])

                print(
                    "{} frames: done {} games, mean reward {}, epsilon {}, speed {} frames/s"
                    .format(frame_idx, len(total_rewards),
                            round(mean_reward, 3), round(self.epsilon, 2),
                            round(speed, 2)))

                if best_mean_reward is None or best_mean_reward < mean_reward:

                    if best_mean_reward is not None:
                        print("New best mean reward {} -> {}, model saved".
                              format(round(best_mean_reward, 3),
                                     round(mean_reward, 3)))
                    best_mean_reward = mean_reward

            if frame_idx % SAVE_INTERVAL == 0:
                torch.save(self.net.state_dict(),
                           'breakoutNoFrameSkip-4v1' + '.dat')

            #checking the replay memory
            if len(self.buffer) < LEARNING_STARTS:
                continue

            #check if we need to update our target function
            if frame_idx % TARGET_UPDATE_INTERVAL == 0:
                self.target_net.load_state_dict(self.net.state_dict())

            # sampling a batch from buffer
            batch = self.replay_buffer(BATCH_SIZE)
            #calculate and backpropogate
            loss_t = self.loss_function(batch, self.net, self.target_net,
                                        optimizer, device)

            #printing loss every 100 episodes
            if len(total_rewards) % 100 == 0:
                print("loss at episode " + str(len(total_rewards)) + " is " +
                      str(float(loss_t.item())))

            with open('rewards_collection-100mean.csv', mode='w') as dataFile:
                writer = csv.writer(dataFile,
                                    delimiter=',',
                                    quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)
                writer.writerow(total_rewards)

        self.env.close()
Code example #21
class Agent_DQN(Agent):
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example: 
            parameters for neural network
            initialize Q net and target Q net
            parameters for replay buffer
            parameters for q-learning; decaying epsilon-greedy
            ...
        """
        super(Agent_DQN, self).__init__(env)
        ###########################
        # YOUR IMPLEMENTATION HERE #
        #Gym parameters
        self.num_actions = env.action_space.n

        # parameters for repaly buffer
        self.buffer_max_len = 20000
        self.buffer = deque(maxlen=self.buffer_max_len)
        self.episode_reward_list = []
        self.moving_reward_avg = []

        # paramters for neural network
        self.batch_size = 32
        self.gamma = 0.999
        self.eps_threshold = 0
        self.eps_start = 1
        self.eps_end = 0.025
        self.max_episode_decay = 10000
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        #Training
        self.steps_done = 0
        self.num_episode = 20000
        self.target_update = 5000
        self.learning_rate = 1.5e-4

        # Neural Network
        self.policy_net = DQN().to(self.device)
        self.target_net = DQN().to(self.device)
        self.optimizer = optim.Adam(self.policy_net.parameters(),
                                    lr=self.learning_rate)

        if args.test_dqn:
            #you can load your model here
            print('loading trained model')
            self.policy_net = torch.load('policy_net.hb5')
            self.policy_net.eval()
            ###########################
            # YOUR IMPLEMENTATION HERE #

    def init_game_setting(self):
        """
        Testing function will call this function at the begining of new game
        Put anything you want to initialize if necessary.
        If no parameters need to be initialized, you can leave it as blank.
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #

        ###########################
        pass

    def make_action(self, observation, test=True):
        """
        Return predicted action of your agent
        Input:
            observation: np.array
                stack 4 last preprocessed frames, shape: (84, 84, 4)
        Return:
            action: int
                the predicted action from trained model
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        with torch.no_grad():
            sample = random.random()

            ## Check if this is the best way to decline
            observation = torch.tensor(observation,
                                       dtype=torch.float,
                                       device=self.device).permute(
                                           2, 0, 1).unsqueeze(0)

            if test:
                return self.policy_net(observation).max(1)[1].item()

            if sample > self.eps_threshold:
                #print("Above threshold")
                return self.policy_net(observation).max(1)[1].item()
            else:
                #print("Below Threshold")
                return self.env.action_space.sample()
        ###########################

    def push(self, state, reward, action, next_state, done):
        """ You can add additional arguments as you need. 
        Push new data to buffer and remove the old one if the buffer is full.
        
        Hints:
        -----
            you can consider deque(maxlen = 10000) list
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.buffer.append((state, reward, action, next_state, done))
        ###########################

    def replay_buffer(self, batch_size):
        """ You can add additional arguments as you need.
        Select batch from buffer.
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        batch = random.sample(self.buffer, batch_size)
        states = []
        rewards = []
        actions = []
        next_states = []
        dones = []
        for sample in batch:
            state, reward, action, next_state, done = sample
            states.append(state)
            rewards.append(reward)
            actions.append(action)
            next_states.append(next_state)
            dones.append(done)
        ###########################
        return states, rewards, actions, next_states, dones

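    # update(): one optimisation step on a randomly sampled minibatch; skipped for the
    # first 5,000 environment steps while the replay buffer fills.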
    def update(self):
        if self.steps_done < 5000:
            return
        states, rewards, actions, next_states, dones = self.replay_buffer(
            self.batch_size)
        loss = self.compute_loss(states, rewards, actions, next_states, dones)
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def compute_loss(self, states, rewards, actions, next_states, dones):
        non_final_mask = [not done for done in dones]

        states = torch.tensor(states,
                              dtype=torch.float).permute(0, 3, 1,
                                                         2).to(self.device)
        rewards = torch.tensor(rewards, dtype=torch.float).to(self.device)
        actions = torch.tensor(actions, dtype=torch.long).to(self.device)
        next_states = torch.tensor(next_states, dtype=torch.float).permute(
            0, 3, 1, 2).to(self.device)
        dones = torch.tensor(dones, dtype=torch.long).to(self.device)

        Q_current = self.policy_net.forward(states).gather(
            1, actions.unsqueeze(1))
        Q_current = Q_current.squeeze(1)
        ## Should do this with no grad

        next_state_values = torch.zeros(self.batch_size, device=self.device)
        next_state_values[non_final_mask] = self.target_net(
            next_states[non_final_mask]).max(1)[0].detach()
        expected_state_action_values = (next_state_values *
                                        self.gamma) + rewards

        loss = F.smooth_l1_loss(Q_current, expected_state_action_values)

        del states, rewards, actions, next_states, dones, Q_current, next_state_values, expected_state_action_values

        return loss

    def train(self):
        """
        Implement your training algorithm here
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #

        for episode in range(self.num_episode):
            #Check this please
            observation = self.env.reset() / 255

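            # epsilon decays linearly from eps_start to eps_end over max_episode_decay episodes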
            self.eps_threshold = max(
                1 +
                (((self.eps_end - self.eps_start) / self.max_episode_decay) *
                 episode), self.eps_end)
            episode_steps = 0
            done = False
            episode_reward = 0
            ## Not sure if this is the right way to do this?
            while not done:
                action = self.make_action(observation, test=False)
                new_observation, reward, done, _ = self.env.step(action)

                new_observation = new_observation / 255
                episode_reward += reward
                self.steps_done += 1
                episode_steps += 1

                self.push(observation, reward, action, new_observation, done)

                ## Updating the network
                self.update()

                observation = new_observation

                if self.steps_done % self.target_update == 0:
                    self.target_net.load_state_dict(
                        self.policy_net.state_dict())
            self.episode_reward_list.append(episode_reward)

            if episode % 100 == 0:
                print('episode: {} reward: {} episode length: {}'.format(
                    episode, episode_reward, episode_steps))
                torch.save(self.policy_net.state_dict(), 'test_model.pt')
        ###########################
        print("Done")
Code example #22
File: dqn_eval.py  Project: raymondchua/dqn
def dqn_eval(env, scheduler, optimizer_constructor, model_type, batch_size, rp_start, rp_size, 
	exp_frame, exp_initial, exp_final, gamma, target_update_steps, frames_per_epoch, 
	frames_per_state, output_directory, last_checkpoint, envo):
	
	env.seed(7)
	random.seed(7)
	gym.undo_logger_setup()
	logging.basicConfig(filename=envo+'_'+model_type+'_eval.log',level=logging.INFO)
	num_actions = env.action_space.n
	
	print('No. of actions: ', num_actions)
	print(env.unwrapped.get_action_meanings())

	# initialize action value and target network with the same weights
	model = DQN(num_actions, use_bn=False)

	if use_cuda:
		model.cuda()

	saved_params = None
	directory = None
	index = []

	for (dirpath, dirnames, filenames) in os.walk(output_directory):
		directory = dirpath
		saved_params = filenames

	count = 0 
	counter = 0

	chckpoint_index = get_index_from_checkpoint_path(last_checkpoint)

	for x in saved_params:
		temp = get_index_from_checkpoint_file(x)
		if temp > chckpoint_index:
			index.append(temp)

	index = sorted(index, key=int)

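	# Evaluate each checkpoint saved after last_checkpoint: NUM_GAMES games per
	# checkpoint, epsilon-greedy with a fixed evaluation epsilon of 0.05.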
	for w in index:

		path = directory + '/' + model_type + '_weights_' + str(w) + '.pth'
		model.load_state_dict(torch.load(path))
		print(path)
		print('saved weights loaded...')

		eval_epsilon = 0.05
		env.reset()
		total_reward = []
		rewards_per_episode = 0

		action_value = torch.zeros(num_actions)

		current_state, _, _, _ = play_game(env, frames_per_state, action=0, evaluate=True)

		average_action = {k: [] for k in range(num_actions)}

		for i in range(NUM_GAMES):
			for frame in range(int(MAX_FRAMES_PER_GAME/frames_per_state)):

				eval_choice = random.uniform(0,1)
	
				# epsilon greedy algorithm
				if eval_choice <= eval_epsilon:
					action = LongTensor([[random.randrange(num_actions)]])

				else:
					action = get_greedy_action(model, current_state)

				curr_obs, reward, done, _ = play_game(env, frames_per_state, action[0][0], evaluate=True)

				average_action[action[0,0]].append(get_Q_value(model, action.view(1,1), curr_obs))

				current_state = curr_obs
				rewards_per_episode += reward

				if done:
					env.reset()
					total_reward.append(rewards_per_episode)
					rewards_per_episode = 0
					current_state, _, _, _ = play_game(env, frames_per_state, action=0, evaluate=True)
					break

		average_reward = sum(total_reward)/float(len(total_reward))

		total_action = 0
		for i in range(num_actions):
			total_action += sum(average_action[i])/len(average_action[i])

		average_action_value = total_action/num_actions

		#Compute Standard Deviation
		diff = 0
		for x in total_reward:
			diff += (x - average_reward)*(x - average_reward)
		var = diff/len(total_reward)
		std_dev = math.sqrt(var)

		eval_content = 'Average Score: ', average_reward
		eval_std_dev = 'Standard Deviation: ', std_dev
		average_action_value_content = 'Average Action Value: ', average_action_value
		print(average_action_value_content)
		print(eval_content)
		print(eval_std_dev)
		log_content = path + ' ' + str(average_reward) + ' ' + str(average_action_value) + ' ' + str(std_dev)
		logging.info(log_content)

		count += 1

	print(count)
Code example #23
class Agent_DQN(Agent):
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example: building your model
        """

        super(Agent_DQN, self).__init__(env)

        ##################
        # YOUR CODE HERE #
        ##################
        self.env = env
        self.batch_size = BATCH_SIZE
        self.gamma = GAMMA
        self.eps_start = EPS_START
        self.eps_decay = EPS_DECAY
        self.TARGET_UPDATE = TARGET_UPDATE

        self.policy_net = DQN(self.env.action_space.n)
        self.target_net = DQN(self.env.action_space.n)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.policy_net.to(device)
        self.target_net.to(device)
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=1.5e-4)
        self.memory = ReplayMemory(10000)

        if args.test_dqn:
            # you can load your model here
            print('loading trained model')
            self.policy_net.load_state_dict(
                torch.load(os.path.join('save_dir', 'model-best.pth'),
                           map_location=torch.device('cpu')))
            self.policy_net.eval()

    def init_game_setting(self):
        """

        Testing function will call this function at the begining of new game
        Put anything you want to initialize if necessary

        """
        ##################
        # YOUR CODE HERE #
        ##################
        pass

    def train(self):
        """
        Implement your training algorithm here
        """
        ##################
        # YOUR CODE HERE #
        ##################
        logfile = open('simple_dqn.log', 'w+')
        step = 0
        num_episodes = 1400000
        for i_episode in range(num_episodes):
            # Initialize the environment and state
            observation = self.env.reset()
            observation = observation.transpose((2, 0, 1))
            observation = observation[np.newaxis, :]
            state = observation
            sum_reward = 0
            for t in count():
                # Select and perform an action
                action = self.make_action(state, test=False)
                next_state, reward, done, _ = self.env.step(action.item())
                reward = np.clip(reward, -1., 1.)
                next_state = next_state.transpose((2, 0, 1))
                next_state = next_state[np.newaxis, :]
                sum_reward += reward
                reward = Tensor([reward])
                step += 1

                # Store the transition in memory; a terminal next state is stored
                # as None so optimize_model can mask it out when bootstrapping
                next_state_tensor = torch.from_numpy(
                    next_state) if not done else None
                self.memory.push(torch.from_numpy(state), action,
                                 next_state_tensor, reward)

                # Observe new state
                if not done:
                    state = next_state
                else:
                    state = None

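                # This variant only takes an optimisation step and syncs the target
                # network once every 5,000 environment steps.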
                if step >= 5000 and step % 5000 == 0:
                    self.optimize_model()
                    self.target_net.load_state_dict(
                        self.policy_net.state_dict())
                    # Perform one step of the optimization (on the target network)

                if done:
                    print(
                        'resetting env. episode %d \'s step=%d reward total was %d.'
                        % (i_episode + 1, step, sum_reward))
                    print(
                        'resetting env. episode %d \'s step=%d reward total was %d.'
                        % (i_episode + 1, step, sum_reward),
                        file=logfile)
                    logfile.flush()
                    break

            # Update the target network
            # if i_episode % TARGET_UPDATE == 0:
            #     print("Update the target net.")
            #     # print(self.policy_net.state_dict())
            #     self.target_net.load_state_dict(self.policy_net.state_dict())
            if i_episode % 50 == 0:
                checkpoint_path = os.path.join('save_dir', 'model-best.pth')
                torch.save(self.policy_net.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))

    def make_action(self, observation, test=True):
        """
        Return predicted action of your agent

        Input:
            observation: np.array
                stack 4 last preprocessed frames, shape: (84, 84, 4)

        Return:
            action: int
                the predicted action from trained model
        """
        ##################
        # YOUR CODE HERE #
        ##################
        global steps_done
        if test:
            observation = observation.transpose((2, 0, 1))
            observation = observation[np.newaxis, :]
            # self.policy_net.eval()
            return self.policy_net(
                Variable(torch.from_numpy(observation),
                         volatile=True).type(FloatTensor)).data.max(1)[1].view(
                             1, 1).item()
        else:
            self.policy_net.eval()
            sample = random.random()
            eps_threshold = EPS_END + (EPS_START - EPS_END) * \
                math.exp(-1. * steps_done / EPS_DECAY)
            steps_done += 1
            if sample > eps_threshold:
                return self.policy_net(
                    Variable(
                        torch.from_numpy(observation),
                        volatile=True).type(FloatTensor)).data.max(1)[1].view(
                            1, 1)
            else:
                return LongTensor([[random.randrange(self.env.action_space.n)]
                                   ])

    def optimize_model(self):
        if len(self.memory) < BATCH_SIZE:
            return
        transitions = self.memory.sample(BATCH_SIZE)
        # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
        # detailed explanation). This converts batch-array of Transitions
        # to Transition of batch-arrays.
        batch = Transition(*zip(*transitions))

        # Compute a mask of non-final states and concatenate the batch elements
        # (a final state would've been the one after which simulation ended)
        non_final_mask = torch.tensor(tuple(
            map(lambda s: s is not None, batch.next_state)),
                                      device=device,
                                      dtype=torch.bool)
        non_final_next_states = torch.cat(
            [s for s in batch.next_state if s is not None]).to(device)
        state_batch = torch.cat(batch.state).to(device)
        action_batch = torch.cat(batch.action).to(device)
        reward_batch = torch.cat(batch.reward).to(device)

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to policy_net
        state_action_values = self.policy_net(state_batch.float()).gather(
            1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        # Expected values of actions for non_final_next_states are computed based
        # on the "older" target_net; selecting their best reward with max(1)[0].
        # This is merged based on the mask, such that we'll have either the expected
        # state value or 0 in case the state was final.
        next_state_values = torch.zeros(BATCH_SIZE, device=device)
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states.float()).max(1)[0].detach()
        # Compute the expected Q values
        expected_state_action_values = (next_state_values *
                                        GAMMA) + reward_batch

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values,
                                expected_state_action_values.unsqueeze(1))

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
Code example #24
class Agent_DQN():
    def __init__(self, env, args):
        # Parameters for q-learning

        super(Agent_DQN, self).__init__()

        self.env = env
        state = env.reset()
        state = state.transpose(2, 0, 1)

        self.policy_net = DQN(state.shape,
                              self.env.action_space.n)  # Behavior Q
        self.target_net = DQN(state.shape, self.env.action_space.n)  # Target Q
        self.target_net.load_state_dict(self.policy_net.state_dict())
        #Initial Q

        if USE_CUDA:
            print("Using CUDA . . .     ")
            self.policy_net = self.policy_net.cuda()
            self.target_net = self.target_net.cuda()

        print('hyperparameters and network initialized')

        if args.test_dqn or LOAD == True:
            print('loading trained model')
            checkpoint = torch.load('trainData')
            self.policy_net.load_state_dict(checkpoint['model_state_dict'])

        self.target_net.load_state_dict(self.policy_net.state_dict())

    def init_game_setting(self):
        print('loading trained model')
        checkpoint = torch.load('trainData')
        self.policy_net.load_state_dict(checkpoint['model_state_dict'])

    def push(self, state, action, reward, next_state, done):
        state = np.expand_dims(state, 0)
        next_state = np.expand_dims(next_state, 0)

        memory.append((state, action, reward, next_state, done))

    def replay_buffer(self):
        state, action, reward, next_state, done = zip(
            *random.sample(memory, batch_size))
        return np.concatenate(state), action, reward, np.concatenate(
            next_state), done

    def __len__(self):
        return len(self.buffer)

    def make_action(self, observation, test=True):

        observation = observation.transpose(2, 0, 1)

        if np.random.random() > EPSILON or test == True:
            observation = Variable(torch.FloatTensor(
                np.float32(observation)).unsqueeze(0),
                                   volatile=True)
            q_value = self.policy_net.forward(observation)
            action = q_value.max(1)[1].data[0]
            action = int(action.item())
        else:
            action = random.randrange(self.env.action_space.n)
        return action

    def optimize_model(self):

        states, actions, rewards, next_states, dones = self.replay_buffer()

        states_v = Variable(torch.FloatTensor(np.float32(states)))
        next_states_v = Variable(torch.FloatTensor(np.float32(next_states)),
                                 volatile=True)
        actions_v = Variable(torch.LongTensor(actions))
        rewards_v = Variable(torch.FloatTensor(rewards))
        done = Variable(torch.FloatTensor(dones))

        state_action_values = self.policy_net(states_v).gather(
            1, actions_v.unsqueeze(1)).squeeze(1)
        next_state_values = self.target_net(next_states_v).max(1)[0]
        expected_q_value = rewards_v + next_state_values * GAMMA * (
            1 - done)  #+ rewards_v

        loss = (state_action_values -
                Variable(expected_q_value.data)).pow(2).mean()
        return loss

    def train(self):
        # EPSILON is updated here and read by make_action, so declare it global
        global EPSILON
        optimizer = optim.Adam(self.policy_net.parameters(), lr=ALPHA)

        # Fill the memory with experiences
        print('Gathering experiences ...')
        meanScore = 0
        AvgRewards = []
        AllScores = []
        step = 1
        iEpisode = 0

        while meanScore < 50:
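        # Train until the 100-episode moving average of the episode score reaches 50.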

            state = self.env.reset()
            done = False
            EpisodeScore = 0
            tBegin = time.time()

            while not done:

                action = self.make_action(state, test=False)
                nextState, reward, done, _ = self.env.step(action)
                self.push(state.transpose(2, 0, 1), action, reward,
                          nextState.transpose(2, 0, 1), done)

                state = nextState

                if len(memory) > StartLearning:
                    loss = self.optimize_model()
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                else:
                    iEpisode = 0
                    continue

                # Update exploration factor
                EPSILON = EPS_END + (EPS_START - EPS_END) * math.exp(
                    -1. * step / EPS_DECAY)
                storeEpsilon.append(EPSILON)
                step += 1

                EpisodeScore += reward

                if step % TARGET_UPDATE == 0:
                    print('Updating Target Network . . .')
                    self.target_net.load_state_dict(
                        self.policy_net.state_dict())

            iEpisode += 1
            AllScores.append(EpisodeScore)
            meanScore = np.mean(AllScores[-100:])
            AvgRewards.append(meanScore)

            if len(memory) > StartLearning:
                print('Episode: ', iEpisode, ' score:', EpisodeScore,
                      ' Avg Score:', meanScore, ' epsilon: ', EPSILON, ' t: ',
                      time.time() - tBegin, ' loss:', loss.item())
            else:
                print('Gathering Data . . .')

            if iEpisode % 500 == 0:
                torch.save(
                    {
                        'epoch': iEpisode,
                        'model_state_dict': self.policy_net.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'loss': loss,
                        'AvgRewards': AvgRewards
                    }, 'trainData')

                os.remove("Rewards.csv")
                with open('Rewards.csv', mode='w') as dataFile:
                    rewardwriter = csv.writer(dataFile,
                                              delimiter=',',
                                              quotechar='"',
                                              quoting=csv.QUOTE_MINIMAL)
                    rewardwriter.writerow(AvgRewards)

        print('======== Complete ========')
        torch.save(
            {
                'epoch': iEpisode,
                'model_state_dict': self.policy_net.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': loss,
                'AvgRewards': AvgRewards
            }, 'trainData')

        with open('Rewards.csv', mode='w') as dataFile:
            rewardwriter = csv.writer(dataFile,
                                      delimiter=',',
                                      quotechar='"',
                                      quoting=csv.QUOTE_MINIMAL)
            rewardwriter.writerow(AvgRewards)
Code example #25
File: ddqn_learn.py  Project: raymondchua/dqn
def ddqn_train(env, scheduler, optimizer_constructor, model_type, batch_size,
               rp_start, rp_size, exp_frame, exp_initial, exp_final, gamma,
               target_update_steps, frames_per_epoch, frames_per_state,
               output_directory, last_checkpoint, envo):
    """
	Implementation of the training algorithm for DDQN. 
	"""

    gym.undo_logger_setup()
    logging.basicConfig(filename=envo + '_' + model_type + 'ddqn_training.log',
                        level=logging.INFO)
    num_actions = env.action_space.n
    env.reset()

    print('No. of actions: ', num_actions)
    print(env.unwrapped.get_action_meanings())

    # initialize action value and target network with the same weights
    model = DQN(num_actions, use_bn=False)
    target = DQN(num_actions, use_bn=False)

    if use_cuda:
        model.cuda()
        target.cuda()

    exp_replay = None
    episodes_count = 1

    if last_checkpoint:
        model.load_state_dict(torch.load(last_checkpoint))
        print(last_checkpoint)
        print('weights loaded...')

        exp_replay = initialize_replay_resume(env, rp_start, rp_size,
                                              frames_per_state, model)
        episodes_count = get_index_from_checkpoint_path(last_checkpoint)

    else:
        exp_replay = initialize_replay(env, rp_start, rp_size,
                                       frames_per_state)

    target.load_state_dict(model.state_dict())

    # scheduler = Scheduler(exp_frame, exp_initial, exp_final)
    optimizer = optimizer_constructor.type(
        model.parameters(),
        lr=optimizer_constructor.kwargs['lr'],
        alpha=optimizer_constructor.kwargs['alpha'],
        eps=optimizer_constructor.kwargs['eps'])

    frames_count = 1
    frames_per_episode = 1
    episodes_durations = []
    rewards_per_episode = 0
    rewards_duration = []
    loss_per_epoch = []

    current_state, _, _, _ = play_game(env, frames_per_state)
    print('Starting training...')

    count = 0

    while True:

        epsilon = scheduler.anneal_linear(frames_count)
        choice = random.uniform(0, 1)

        # epsilon greedy algorithm
        if choice <= epsilon:
            action = LongTensor([[random.randrange(num_actions)]])

        else:
            action = get_greedy_action(model, current_state)

        curr_obs, reward, done, _ = play_game(env, frames_per_state,
                                              action[0][0])

        rewards_per_episode += reward
        reward = Tensor([reward])

        exp_replay.push(current_state, action, reward, curr_obs)

        current_state = curr_obs

        # compute y: train only once the buffer holds at least one full batch
        if len(exp_replay) >= batch_size:

            # sample a random mini-batch and unpack it into
            # states, actions, rewards and next_states
            obs_sample = exp_replay.sample(batch_size)
            batch = Experience(*zip(*obs_sample))

            loss = ddqn_compute_y(batch, batch_size, model, target, gamma)
            optimizer.zero_grad()
            loss.backward()

            for param in model.parameters():
                param.grad.data.clamp_(-1, 1)

            optimizer.step()
            loss_per_epoch.append(loss.data.cpu().numpy())

        frames_count += 1
        frames_per_episode += frames_per_state

        if done:
            rewards_duration.append(rewards_per_episode)
            rewards_per_episode = 0
            frames_per_episode = 1
            episodes_count += 1
            env.reset()
            current_state, _, _, _ = play_game(env, frames_per_state)

            if episodes_count % 100 == 0:
                avg_episode_reward = sum(rewards_duration) / 100.0
                avg_reward_content = 'Episodes {} to {}: average reward {}, total loss {}'.format(
                    episodes_count - 99, episodes_count, avg_episode_reward,
                    sum(loss_per_epoch))
                print(avg_reward_content)
                logging.info(avg_reward_content)
                rewards_duration = []
                loss_per_epoch = []

        # update weights of target network for every TARGET_UPDATE_FREQ steps
        if frames_count % target_update_steps == 0:
            target.load_state_dict(model.state_dict())
            # print('weights updated at frame no. ', frames_count)

        #Save weights every 250k frames
        if frames_count % 250000 == 0:
            # create the directory the checkpoint is actually written to
            save_dir = output_directory + envo + '_' + model_type + '/'
            util.make_sure_path_exists(save_dir)
            torch.save(model.state_dict(),
                       save_dir + 'weights_' + str(frames_count) + '.pth')

        # Print frame count every 1,000,000 (one million) frames
        if frames_count % 1000000 == 0:
            training_update = 'frame count: {}, episode count: {}, epsilon: {}'.format(
                frames_count, episodes_count, epsilon)
            print(training_update)
            logging.info(training_update)
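ddqn_compute_y itself is not shown in this example. As a rough sketch only (names, shapes and the absence of a terminal mask are assumptions, matching the push() call above which stores no done flag), the Double DQN target it is expected to compute selects the next action with the online network and evaluates it with the target network:

import torch
import torch.nn.functional as F

def ddqn_target_sketch(model, target, states, actions, rewards, next_states, gamma):
    """Double DQN: the online net picks a', the target net evaluates Q(s', a')."""
    q_sa = model(states).gather(1, actions)                    # Q(s, a), actions shaped (B, 1)
    next_actions = model(next_states).max(1)[1].unsqueeze(1)   # argmax_a' Q_online(s', a')
    next_q = target(next_states).gather(1, next_actions).detach()
    y = rewards + gamma * next_q                               # bootstrap target
    return F.smooth_l1_loss(q_sa, y)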
Code Example #26
def dqn_learing(env,
                q_func,
                optimizer_spec,
                exploration,
                stopping_criterion=None,
                replay_buffer_size=1000000,
                batch_size=32,
                gamma=0.99,
                learning_starts=50000,
                learning_freq=4,
                frame_history_len=4,
                target_update_freq=10000):
    """Run Deep Q-learning algorithm.

    You can specify your own convnet using q_func.

    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            input_channel: int
                number of channel of input.
            num_actions: int
                number of actions
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate schedule
        for the optimizer
    exploration: Schedule (defined in utils.schedule)
        schedule for probability of choosing random action.
    stopping_criterion: (env) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    num_actions = env.action_space.n

    # Construct an epsilon-greedy policy with the given exploration schedule
    def select_epsilon_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            # Use volatile = True if variable is only used in inference mode, i.e. don’t save the history
            return model(Variable(obs, volatile=True)).data.max(1)[1].cpu()
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Initialize target q function and q function, i.e. build the model.
    ######

    model = DQN(in_channels=input_arg, num_actions=num_actions)
    target_Q = DQN(in_channels=input_arg, num_actions=num_actions)

    if USE_CUDA:
        target_Q = target_Q.cuda()
        model = model.cuda()

    ######

    # Construct Q network optimizer function
    optimizer = optimizer_spec.constructor(model.parameters(),
                                           **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    # RUN ENV     #
    ###############
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000

    for t in count():
        ### 1. Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env):
            break

        ### 2. Step the env and store the transition
        # At this point, "last_obs" contains the latest observation that was
        # recorded from the simulator. Here, your code needs to store this
        # observation and its outcome (reward, next observation, etc.) into
        # the replay buffer while stepping the simulator forward one step.
        # At the end of this block of code, the simulator should have been
        # advanced one step, and the replay buffer should contain one more
        # transition.
        # Specifically, last_obs must point to the new latest observation.
        # Useful functions you'll need to call:
        # obs, reward, done, info = env.step(action)
        # this steps the environment forward one step
        # obs = env.reset()
        # this resets the environment if you reached an episode boundary.
        # Don't forget to call env.reset() to get a new observation if done
        # is true!!
        # Note that you cannot use "last_obs" directly as input
        # into your network, since it needs to be processed to include context
        # from previous frames. You should check out the replay buffer
        # implementation in dqn_utils.py to see what functionality the replay
        # buffer exposes. The replay buffer has a function called
        # encode_recent_observation that will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        # Don't forget to include epsilon greedy exploration!
        # And remember that the first time you enter this loop, the model
        # may not yet have been initialized (but of course, the first step
        # might as well be random, since you haven't trained your net...)
        ##### OUR CODE
        idx = replay_buffer.store_frame(last_obs)
        encoded_obs = replay_buffer.encode_recent_observation()

        if t > learning_starts:
            action = select_epsilon_greedy_action(model, encoded_obs, t)[0]
        else:
            action = random.randrange(num_actions)

        obs, reward, done, info = env.step(action)
        reward = max(-1.0, min(reward, 1.0))

        replay_buffer.store_effect(idx, action, reward, done)
        if done:
            obs = env.reset()
        last_obs = obs
        #####

        # at this point, the environment should have been advanced one step (and
        # reset if done was true), and last_obs should point to the new latest
        # observation

        ### 3. Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (t > learning_starts and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):

            # Here, you should perform training. Training consists of four steps:
            # 3.a: use the replay buffer to sample a batch of transitions (see the
            # replay buffer code for function definition, each batch that you sample
            # should consist of current observations, current actions, rewards,
            # next observations, and done indicator).
            # Note: Move the variables to the GPU if available

            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(
                batch_size)

            obs_batch = Variable(
                torch.from_numpy(obs_batch).type(dtype) / 255.0)

            next_obs_batch = Variable(
                torch.from_numpy(next_obs_batch).type(dtype) / 255.0)
            act_batch = Variable(
                torch.Tensor(act_batch).type(torch.LongTensor))
            rew_batch = Variable(torch.from_numpy(rew_batch))

            done_mask = Variable(
                torch.Tensor([1. if val == 0 else 0. for val in done_mask]))
            if USE_CUDA:
                done_mask = done_mask.cuda()
                act_batch = act_batch.cuda()
                rew_batch = rew_batch.cuda()
                obs_batch = obs_batch.cuda()
                next_obs_batch = next_obs_batch.cuda()

            # 3.b: fill in your own code to compute the Bellman error. This requires
            # evaluating the current and next Q-values and constructing the corresponding error.
            # Note: don't forget to clip the error between [-1,1], multiply it by -1 (since pytorch minimizes) and
            #       mask out post-terminal Q-values (see ReplayBuffer code).

            # We choose Q based on action taken.
            current_Q_values = model(obs_batch).gather(
                1, act_batch.unsqueeze(1))  #[0, act_batch]
            # 5. Obtain maxQ' and set our target value for chosen action using the bellman equation.
            next_max_q = target_Q(next_obs_batch).detach().max(1)[0]
            next_Q_values = torch.mul(done_mask, next_max_q)

            target_Q_values = rew_batch + (gamma * next_Q_values)
            if USE_CUDA:
                target_Q_values = target_Q_values.cuda()
            d_error = target_Q_values.unsqueeze(1) - current_Q_values
            d_error = d_error.clamp(-1, 1) * -1

            # 3.c: train the model. To do this, use the Bellman error you calculated previously.
            # Pytorch will differentiate this error for you, to backward the error use the following API:
            #       current.backward(d_error.data.unsqueeze(1))
            # Where "current" is the variable holding current Q Values and d_error is the clipped bellman error.
            # Your code should produce one scalar-valued tensor.
            # Note: don't forget to call optimizer.zero_grad() before the backward call and
            #       optimizer.step() after the backward call.

            optimizer.zero_grad()
            current_Q_values.backward(d_error)
            optimizer.step()
            num_param_updates += 1

            # 3.d: periodically update the target network by loading the current Q network weights into the
            #      target_Q network. see state_dict() and load_state_dict() methods.
            #      you should update every target_update_freq steps, and you may find the
            #      variable num_param_updates useful for this (it was initialized to 0)
            #####
            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(model.state_dict())

            # YOUR CODE HERE

            #####

        ### 4. Log progress and keep track of statistics
        episode_rewards = get_wrapper_by_name(env,
                                              "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
            if hasattr(exploration, 'add_reward'):
                exploration.add_reward(episode_rewards)
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward,
                                           mean_episode_reward)

        Statistic["mean_episode_rewards"].append(mean_episode_reward)
        Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward)

        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            print("Timestep %d" % (t, ))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            sys.stdout.flush()

            # Dump statistics to pickle
            with open('statistics.pkl', 'wb') as f:
                pickle.dump(Statistic, f)
                print("Saved to %s" % 'statistics.pkl')
Code Example #27
class Agent_DQN(Agent):
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example:
            parameters for the neural network
            initialize Q net and target Q net
            parameters for the replay buffer
            parameters for q-learning; decaying epsilon-greedy
            ...
        """

        super(Agent_DQN, self).__init__(env)
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.env = env
        self.args = args
        self.gamma = self.args.gamma
        self.batch_size = self.args.batch_size
        self.memory_cap = self.args.memory_cap
        self.n_episode = self.args.n_episode
        self.lr = self.args.learning_rate

        self.epsilon = self.args.epsilon
        self.epsilon_decay_window = self.args.epsilon_decay_window
        self.epsilon_min = self.args.epsilon_min
        self.epsilon_decay = (self.epsilon -
                              self.epsilon_min) / self.epsilon_decay_window

        self.n_step = self.args.n_step
        self.f_update = self.args.f_update
        self.load_model = self.args.load_model
        self.action_size = self.args.action_size
        #         self.algorithm = self.args.algorithm

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        print('using device ',
              torch.cuda.get_device_name(0) if self.use_cuda else 'cpu')
        self.FloatTensor = torch.cuda.FloatTensor if self.use_cuda else torch.FloatTensor
        self.LongTensor = torch.cuda.LongTensor if self.use_cuda else torch.LongTensor
        self.ByteTensor = torch.cuda.ByteTensor if self.use_cuda else torch.ByteTensor
        self.Tensor = self.FloatTensor

        # Create the policy net and the target net
        self.policy_net = DQN()
        self.policy_net.to(self.device)
        #         if self.algorithm == 'DDQN':
        #             self.policy_net_2 = DQN()
        #             self.policy_net_2.to(self.device)
        self.target_net = DQN()
        self.target_net.to(self.device)
        self.policy_net.train()
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.optimizer = optim.Adam(params=self.policy_net.parameters(),
                                    lr=self.lr)
        # buffer
        self.memory = []

        ##
        self.mean_window = 100
        self.print_frequency = 100
        self.out_dir = "DQN_Module_b1_1/"

        if args.test_dqn:
            #you can load your model here
            print('loading trained model')
            ###########################
            # YOUR IMPLEMENTATION HERE #
            self.policy_net.load_state_dict(
                torch.load('model.pth', map_location=self.device))
            self.target_net.load_state_dict(self.policy_net.state_dict())
            # self.algorithm and self.policy_net_2 are commented out above, so
            # guard this branch to avoid an AttributeError in test mode
            if getattr(self, 'algorithm', None) == 'DDQN':
                self.policy_net_2.load_state_dict(
                    torch.load('model.pth', map_location=self.device))
            self.print_test = True

    def init_game_setting(self):
        """
        Testing function will call this function at the beginning of a new game
        Put anything you want to initialize if necessary.
        If no parameters need to be initialized, you can leave it as blank.
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #

        ###########################
        pass

    def make_action(self, observation, test=False):
        """
        Return predicted action of your agent
        Input:
            observation: np.array
                stack 4 last preprocessed frames, shape: (84, 84, 4)
        Return:
            action: int
                the predicted action from trained model
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        if test:
            self.epsilon = self.epsilon_min * 0.5
            observation = observation / 255.
        else:
            self.epsilon = max(self.epsilon - self.epsilon_decay,
                               self.epsilon_min)
        if random.random() > self.epsilon:
            observation = self.Tensor(observation.reshape(
                (1, 84, 84, 4))).transpose(1, 3).transpose(2, 3)
            state_action_value = self.policy_net(
                observation).data.cpu().numpy()
            action = np.argmax(state_action_value)
        else:
            action = random.randint(0, self.action_size - 1)
        ###########################
        return action

    def push(self, state, action, reward, next_state, done):
        """ You can add additional arguments as you need. 
        Push new data to buffer and remove the old one if the buffer is full.
        
        Hints:
        -----
            you can consider deque(maxlen = 10000) list
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        if len(self.memory) >= self.memory_cap:
            self.memory.pop(0)
        self.memory.append((state, action, reward, next_state, done))
        ###########################

    def replay_buffer(self):
        """ You can add additional arguments as you need.
        Select batch from buffer.
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.mini_batch = random.sample(self.memory, self.batch_size)
        ###########################
        return

    def train(self):
        """
        Implement your training algorithm here
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.steps_done = 0
        self.steps = []
        self.rewards = []
        self.mean_rewards = []
        self.time = []
        self.best_reward = 0
        self.last_saved_reward = 0
        self.start_time = time.time()
        print('train')
        # continue training from where it stopped
        if self.load_model:
            self.policy_net.load_state_dict(
                torch.load(self.out_dir + 'model.pth',
                           map_location=self.device))
            self.target_net.load_state_dict(self.policy_net.state_dict())
            self.epsilon = self.epsilon_min
            print('Loaded')
        for episode in range(self.n_episode):
            # Initialize the environment and state
            state = self.env.reset() / 255.
            #             self.last_life = 5
            total_reward = 0
            self.step = 0
            done = False

            while (not done) and self.step < 10000:
                # move to next state
                self.step += 1
                self.steps_done += 1
                action = self.make_action(state)
                next_state, reward, done, life = self.env.step(action)
                # lives matter
                #                 self.now_life = life['ale.lives']
                #                 dead = self.now_life < self.last_life
                #                 self.last_life = self.now_life
                next_state = next_state / 255.
                # Store the transition in memory
                self.push(state, action, reward, next_state, done)
                state = next_state
                total_reward += reward

                if done:
                    self.rewards.append(total_reward)
                    self.mean_reward = np.mean(
                        self.rewards[-self.mean_window:])
                    self.mean_rewards.append(self.mean_reward)
                    self.time.append(time.time() - self.start_time)
                    self.steps.append(self.step)

                    # print the process to terminal
                    progress = "episode: " + str(
                        episode) + ",\t epsilon: " + str(
                            self.epsilon
                        ) + ",\t Current mean reward: " + "{:.2f}".format(
                            self.mean_reward)
                    progress += ',\t Best mean reward: ' + "{:.2f}".format(
                        self.best_reward) + ",\t time: " + time.strftime(
                            '%H:%M:%S', time.gmtime(self.time[-1]))
                    print(progress)

                    if episode % self.print_frequency == 0:
                        self.print_and_plot()
                    # save the best model
                    if self.mean_reward > self.best_reward and len(
                            self.memory) >= 5000:
                        print('~~~~~~~~~~<Model updated with best reward = ',
                              self.mean_reward, '>~~~~~~~~~~')
                        checkpoint_path = self.out_dir + 'model.pth'
                        torch.save(self.policy_net.state_dict(),
                                   checkpoint_path)
                        self.last_saved_reward = self.mean_reward
                        self.best_reward = self.mean_reward

                if len(self.memory) >= 5000 and self.steps_done % 4 == 0:
                    #                     if self.algorithm == 'DQN':
                    self.optimize_DQN()
                if self.steps_done % self.f_update == 0:
                    self.target_net.load_state_dict(
                        self.policy_net.state_dict())
                    # print('-------<target net updated at step', self.steps_done, '>-------')

        ###########################

    def optimize_DQN(self):
        # sample
        self.replay_buffer()
        state, action, reward, next_state, done = zip(*self.mini_batch)

        # permute (batch, 84, 84, 4) to (batch, 4, 84, 84), i.e. dims (0, 3, 1, 2)
        state = self.Tensor(np.float32(state)).permute(0, 3, 1,
                                                       2).to(self.device)
        action = self.LongTensor(action).to(self.device)
        reward = self.Tensor(reward).to(self.device)
        next_state = self.Tensor(np.float32(next_state)).permute(
            0, 3, 1, 2).to(self.device)

        done = self.Tensor(done).to(self.device)

        # Compute Q(s_t, a)
        state_action_values = self.policy_net(state).gather(
            1, action.unsqueeze(1)).squeeze(1)
        # Compute next Q, including the mask
        next_state_values = self.target_net(next_state).detach().max(1)[0]
        # Compute the expected Q value. stop update if done
        expected_state_action_values = reward + (next_state_values *
                                                 self.gamma) * (1 - done)
        # Compute Huber loss
        self.loss = F.smooth_l1_loss(state_action_values,
                                     expected_state_action_values.data)
        # Optimize the model
        self.optimizer.zero_grad()
        self.loss.backward()
        self.optimizer.step()
        return

    def print_and_plot(self):
        fig1 = plt.figure(1)
        plt.clf()
        plt.title('Training...')
        plt.xlabel('Episode')
        plt.ylabel('Steps')
        plt.plot(self.steps)
        fig1.savefig(self.out_dir + 'steps.png')

        fig2 = plt.figure(2)
        plt.clf()
        plt.title('Training...')
        plt.xlabel('Episode')
        plt.ylabel('Reward')
        plt.plot(self.mean_rewards)
        fig2.savefig(self.out_dir + 'rewards.png')

        fig2 = plt.figure(3)
        plt.clf()
        plt.title('Training...')
        plt.xlabel('Episode')
        plt.ylabel('Time')
        plt.plot(self.time)
        fig2.savefig(self.out_dir + 'time.png')
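A short evaluation sketch (not part of the original class) showing how a trained Agent_DQN could be rolled out greedily through make_action(observation, test=True); it assumes the same gym-style environment wrapper used in train(), which returns stacked 84x84x4 frames:

def evaluate(agent, n_episodes=10):
    """Greedy rollout of a trained Agent_DQN; returns the mean episode reward."""
    scores = []
    for _ in range(n_episodes):
        state = agent.env.reset()          # raw frames; test=True rescales by 255 internally
        done, total = False, 0.0
        while not done:
            action = agent.make_action(state, test=True)
            state, reward, done, _ = agent.env.step(action)
            total += reward
        scores.append(total)
    return sum(scores) / len(scores)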
Code Example #28
class Agent_DQN(Agent):
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example:
            parameters for the neural network
            initialize Q net and target Q net
            parameters for the replay buffer
            parameters for q-learning; decaying epsilon-greedy
            ...
        """
        super(Agent_DQN, self).__init__(env)
        ###########################
        # YOUR IMPLEMENTATION HERE #
        # import arguments
        self.args = args
        self.env = env
        self.batch_size = self.args.batch_size
        self.gamma = self.args.gamma
        self.lr = self.args.learning_rate
        self.memory_cap = self.args.memory_cap
        self.n_episode = self.args.n_episode
        self.n_step = self.args.n_step
        self.update_f = self.args.update_f
        self.explore_step = self.args.explore_step
        self.action_size = self.args.action_size
        self.algorithm = self.args.algorithm
        self.save_path = "dqn/"
        print('using algorithm ', self.algorithm)

        # whether continue training
        self.load_model = self.args.load_model

        # unify tensor tpye according to device names
        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        print('using device ',
              torch.cuda.get_device_name(0) if self.use_cuda else 'cpu')
        self.FloatTensor = torch.cuda.FloatTensor if self.use_cuda else torch.FloatTensor
        self.LongTensor = torch.cuda.LongTensor if self.use_cuda else torch.LongTensor
        self.ByteTensor = torch.cuda.ByteTensor if self.use_cuda else torch.ByteTensor
        self.Tensor = self.FloatTensor  # default type

        # epsilon decay
        self.epsilon = 1.0
        self.epsilon_min = 0.025
        self.epsilon_decay = (self.epsilon -
                              self.epsilon_min) / self.explore_step

        # Create the policy net and the target net
        self.policy_net = DQN()
        self.policy_net.to(self.device)
        if self.algorithm == 'DDQN':
            self.policy_net_2 = DQN()
            self.policy_net_2.to(self.device)
        self.target_net = DQN()
        self.target_net.to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        # replay buffer
        self.memory = []

        # optimizer
        self.optimizer = optim.Adam(params=self.policy_net.parameters(),
                                    lr=self.lr)
        if self.algorithm == 'DDQN':
            self.optimizer_2 = optim.Adam(
                params=self.policy_net_2.parameters(), lr=self.lr)

        # other
        self.f_skip = 4  # frame skip
        self.n_avg_reward = 100
        self.f_print = 100
        self.print_test = False

        if args.test_dqn:
            #you can load your model here
            print('loading trained model')
            ###########################
            # YOUR IMPLEMENTATION HERE #
            self.policy_net.load_state_dict(
                torch.load('model.pth', map_location=self.device))
            self.target_net.load_state_dict(self.policy_net.state_dict())
            if self.algorithm == 'DDQN':
                self.policy_net_2.load_state_dict(
                    torch.load('model.pth', map_location=self.device))
            self.print_test = True

    def init_game_setting(self):
        """
        Testing function will call this function at the beginning of a new game
        Put anything you want to initialize if necessary.
        If no parameters need to be initialized, you can leave it as blank.
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        state = self.env.reset() / 255.
        self.last_life = 5
        self.step = 0
        done = False
        total_reward = 0
        ###########################
        return state, done, total_reward

    def make_action(self, observation, test=False):
        """
        Return predicted action of your agent
        Input:
            observation: np.array
                stack 4 last preprocessed frames, shape: (84, 84, 4)
        Return:
            action: int
                the predicted action from trained model
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        if test:
            self.epsilon = self.epsilon_min
            observation = observation / 255.
        else:
            self.epsilon = max(self.epsilon - self.epsilon_decay,
                               self.epsilon_min)
        if random.random() > self.epsilon:
            observation = self.Tensor(observation.reshape(
                (1, 84, 84, 4))).transpose(1, 3).transpose(2, 3)
            state_action_value = self.policy_net(
                observation).data.cpu().numpy()
            action = np.argmax(state_action_value)
        else:
            action = random.randint(0, self.action_size - 1)
        ###########################
        return action

    def push(self, state, action, reward, next_state, dead, done):
        """ You can add additional arguments as you need. 
        Push new data to buffer and remove the old one if the buffer is full.
        
        Hints:
        -----
            you can consider deque(maxlen = 10000) list
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        if len(self.memory) >= self.memory_cap:
            self.memory.pop(0)
        self.memory.append((state, action, reward, next_state, dead, done))
        ###########################

    def replay_buffer(self):
        """ You can add additional arguments as you need.
        Select batch from buffer.
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.mini_batch = random.sample(self.memory, self.batch_size)
        ###########################
        return

    def train(self):
        """
        Implement your training algorithm here
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        # initialize
        self.steps_done = 0
        self.steps = []
        self.rewards = []
        self.mean_rewards = []
        self.best_reward = 0
        self.last_saved_reward = 0

        start = time.time()
        logfile = open('dqn.log', 'w+')
        # continue training
        if self.load_model:
            self.policy_net.load_state_dict(
                torch.load(self.save_path + 'model.pth',
                           map_location=self.device))
            self.target_net.load_state_dict(self.policy_net.state_dict())
            self.epsilon = self.epsilon_min

        for episode in range(self.n_episode):
            state, done, total_reward = self.init_game_setting()
            while (not done) and self.step < 10000:
                # move to next state
                self.step += 1
                self.steps_done += 1
                action = self.make_action(state)
                next_state, reward, done, life = self.env.step(action)
                # lives matter
                now_life = life['ale.lives']
                dead = (now_life < self.last_life)
                self.last_life = now_life
                next_state = next_state / 255.
                # Store the transition in memory
                self.push(state, action, reward, next_state, dead, done)
                state = next_state
                total_reward += reward

                if len(self.memory
                       ) >= self.n_step and self.steps_done % self.f_skip == 0:
                    if self.algorithm == 'DQN':
                        self.optimize_DQN()
                    elif self.algorithm == 'DDQN':
                        self.optimize_DDQN()
                if self.steps_done % self.update_f == 0:
                    self.target_net.load_state_dict(
                        self.policy_net.state_dict())

            self.rewards.append(total_reward)
            self.mean_reward = np.mean(self.rewards[-self.n_avg_reward:])
            self.mean_rewards.append(self.mean_reward)
            self.steps.append(self.step)
            # print progress in terminal
            progress = "Episode: " + str(
                episode) + ",\tCurrent mean reward: " + "{:.2f}".format(
                    self.mean_reward
                ) + ',\tBest mean reward: ' + "{:.2f}".format(self.best_reward)
            progress += ",\tCurerent Reward: " + str(
                total_reward) + ",\tTime: " + time.strftime(
                    '%H:%M:%S', time.gmtime(time.time() - start))
            print(progress)
            print(episode,
                  self.mean_reward,
                  self.best_reward,
                  total_reward,
                  time.time() - start,
                  file=logfile)
            logfile.flush()
            if (episode + 1) % self.f_print == 0:
                self.plots()
            # save the best model
            if self.mean_reward > self.best_reward and self.steps_done > self.n_step:
                checkpoint_path = self.save_path + 'model.pth'
                torch.save(self.policy_net.state_dict(), checkpoint_path)
                self.last_saved_reward = self.mean_reward
                self.best_reward = max(self.mean_reward, self.best_reward)
        ###########################

    def optimize_DQN(self):
        # sample
        self.replay_buffer()
        state, action, reward, next_state, dead, done = zip(*self.mini_batch)

        state = self.Tensor(np.float32(state)).permute(0, 3, 1,
                                                       2).to(self.device)
        action = self.LongTensor(action).to(self.device)
        reward = self.Tensor(reward).to(self.device)
        next_state = self.Tensor(np.float32(next_state)).permute(
            0, 3, 1, 2).to(self.device)
        dead = self.Tensor(dead).to(self.device)
        done = self.Tensor(done).to(self.device)

        # Compute Q(s_t, a)
        state_action_values = self.policy_net(state).gather(
            1, action.unsqueeze(1)).squeeze(1)
        # Compute next Q, including the mask
        next_state_values = self.target_net(next_state).detach().max(1)[0]
        # Compute the expected Q value. stop update if done
        expected_state_action_values = reward + (next_state_values *
                                                 self.gamma) * (1 - done)
        # Compute Huber loss
        self.loss = F.smooth_l1_loss(state_action_values,
                                     expected_state_action_values.data)
        # Optimize the model
        self.optimizer.zero_grad()
        self.loss.backward()
        self.optimizer.step()
        return

    def optimize_DDQN(self):
        # sample
        self.replay_buffer()
        state, action, reward, next_state, dead, done = zip(*self.mini_batch)

        # permute (batch, 84, 84, 4) to (batch, 4, 84, 84), i.e. dims (0, 3, 1, 2)
        state = self.Tensor(np.float32(state)).permute(0, 3, 1,
                                                       2).to(self.device)
        action = self.LongTensor(action).to(self.device)
        reward = self.Tensor(reward).to(self.device)
        next_state = self.Tensor(np.float32(next_state)).permute(
            0, 3, 1, 2).to(self.device)
        dead = self.Tensor(dead).to(self.device)
        done = self.Tensor(done).to(self.device)

        # Compute Q(s_t, a)
        state_action_values = self.policy_net(state).gather(
            1, action.unsqueeze(1)).squeeze(1)
        state_action_values_2 = self.policy_net_2(state).gather(
            1, action.unsqueeze(1)).squeeze(1)
        # Compute next Q, including the mask. Use two different networks for the
        # two estimates (clipped double-Q style); taking both from target_net
        # would make the min below a no-op.
        next_state_values = self.target_net(next_state).detach().max(1)[0]
        next_state_values_2 = self.policy_net_2(next_state).detach().max(1)[0]
        next_state_values = torch.min(next_state_values, next_state_values_2)
        # Compute the expected Q value. stop update if done
        expected_state_action_values = reward + (next_state_values *
                                                 self.gamma) * (1 - done)
        # Compute Huber loss
        self.loss = F.smooth_l1_loss(state_action_values,
                                     expected_state_action_values.data)
        self.loss_2 = F.smooth_l1_loss(state_action_values_2,
                                       expected_state_action_values.data)
        # Optimize the model
        self.optimizer.zero_grad()
        self.loss.backward()
        self.optimizer.step()
        self.optimizer_2.zero_grad()
        self.loss_2.backward()
        self.optimizer_2.step()
        return

    def plots(self):
        fig1 = plt.figure(1)
        plt.clf()
        plt.title('Training_Steps_per_Episode')
        plt.xlabel('Episode')
        plt.ylabel('Steps')
        plt.plot(self.steps)
        fig1.savefig(self.save_path + 'steps.png')

        fig2 = plt.figure(2)
        plt.clf()
        plt.title('Training...')
        plt.xlabel('Episode')
        plt.ylabel('Reward')
        plt.plot(self.rewards)

        if len(self.rewards) >= self.n_avg_reward:
            plt.plot(self.mean_rewards)
        fig2.savefig(self.save_path + 'rewards.png')

        rewards = np.array(self.rewards)
        np.save(self.save_path + 'rewards.npy', rewards)
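As a quick, self-contained illustration of the masked Bellman target used in optimize_DQN and optimize_DDQN above (values chosen arbitrarily): a terminal transition (done = 1) contributes only its immediate reward, since the (1 - done) factor zeroes out the bootstrapped term.

import torch

reward = torch.tensor([1.0, 0.0])
next_q = torch.tensor([5.0, 5.0])
done   = torch.tensor([0.0, 1.0])            # the second transition ends the episode
gamma  = 0.99
target = reward + (next_q * gamma) * (1 - done)
print(target)                                 # tensor([5.9500, 0.0000])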
Code Example #29
def ddqn_rankBatch_train(env, scheduler, optimizer_constructor, model_type,
                         batch_size, rp_start, rp_size, exp_frame, exp_initial,
                         exp_final, inital_beta, gamma, target_update_steps,
                         frames_per_epoch, frames_per_state, output_directory,
                         last_checkpoint):
    """
	Implementation of the training algorithm for DDQN using Rank-based prioritization.
	Information with regards to the algorithm can be found in the paper, 
	"Prioritized Experience Replay" by Tom Schaul, John Quan, Ioannis Antonoglou and
	David Silver. Refer to section 3.3 in the paper for more info. 
	"""

    gym.undo_logger_setup()
    logging.basicConfig(filename='ddqn_rank_training.log', level=logging.INFO)
    num_actions = env.action_space.n
    env.reset()

    print('No. of actions: ', num_actions)
    print(env.unwrapped.get_action_meanings())

    # initialize action value and target network with the same weights
    model = DQN(num_actions, use_bn=False)
    target = DQN(num_actions, use_bn=False)

    if use_cuda:
        model.cuda()
        target.cuda()

    frames_count = 1

    if last_checkpoint:
        model.load_state_dict(torch.load(last_checkpoint))
        print(last_checkpoint)
        print('weights loaded...')

        exp_replay = util.initialize_rank_replay_resume(
            env, rp_start, rp_size, frames_per_state, model, target, gamma,
            batch_size)
        frames_count = get_index_from_checkpoint_path(last_checkpoint)

    else:
        exp_replay = util.initialize_rank_replay(env, rp_start, rp_size,
                                                 frames_per_state, model,
                                                 target, gamma)

    target.load_state_dict(model.state_dict())

    optimizer = optimizer_constructor.type(
        model.parameters(),
        lr=optimizer_constructor.kwargs['lr'],
        alpha=optimizer_constructor.kwargs['alpha'],
        eps=optimizer_constructor.kwargs['eps'])

    episodes_count = 1
    frames_per_episode = 1
    episodes_durations = []
    rewards_per_episode = 0
    rewards_duration = []
    loss_per_epoch = []

    current_state, _, _, _ = util.play_game(env, frames_per_state)
    print('Starting training...')

    count = 0

    while True:

        epsilon = scheduler.anneal_linear(frames_count)
        choice = random.uniform(0, 1)

        # epsilon greedy algorithm
        if choice <= epsilon:
            action = LongTensor([[random.randrange(num_actions)]])

        else:
            action = util.get_greedy_action(model, current_state)

        curr_obs, reward, done, _ = util.play_game(env, frames_per_state,
                                                   action[0][0])

        rewards_per_episode += reward
        reward = Tensor([[reward]])
        current_state_ex = Variable(current_state, volatile=True)
        curr_obs_ex = Variable(curr_obs, volatile=True)
        action_ex = Variable(action, volatile=True)
        reward_ex = Variable(reward, volatile=True)

        #compute td-error for one sample
        td_error = ddqn_compute_td_error(batch_size=1,
                                         state_batch=current_state_ex,
                                         reward_batch=reward_ex,
                                         action_batch=action_ex,
                                         next_state_batch=curr_obs_ex,
                                         model=model,
                                         target=target,
                                         gamma=gamma)

        td_error = torch.abs(td_error)
        exp_replay.push(current_state_ex, action_ex, reward_ex, curr_obs_ex,
                        td_error)
        current_state = curr_obs

        # compute y
        if len(exp_replay) >= batch_size:
            # Get batch samples
            obs_samples, obs_ranks, obs_priorityVals = exp_replay.sample(
                batch_size)
            obs_priorityTensor = torch.from_numpy(np.array(obs_priorityVals))
            p_batch = 1 / obs_priorityTensor
            w_batch = (1 / len(exp_replay) * p_batch)**inital_beta
            max_weight = exp_replay.get_max_weight(inital_beta)
            params_grad = []

            for i in range(len(obs_samples)):
                sample = obs_samples[i]
                sample.state.volatile = False
                sample.next_state.volatile = False
                sample.reward.volatile = False
                sample.action.volatile = False
                loss = ddqn_compute_y(batch_size=1,
                                      state_batch=sample.state,
                                      reward_batch=sample.reward,
                                      action_batch=sample.action,
                                      next_state_batch=sample.next_state,
                                      model=model,
                                      target=target,
                                      gamma=gamma)
                loss_abs = torch.abs(loss)
                exp_replay.update(obs_ranks[i], loss_abs)

                for param in model.parameters():
                    if param.grad is not None:
                        param.grad.data.zero_()

                loss.backward()

                #accumulate weight change
                if i == 0:
                    for param in model.parameters():
                        tmp = ((w_batch[i] / max_weight) *
                               loss.data[0]) * param.grad.data
                        params_grad.append(tmp)

                else:
                    paramIndex = 0
                    for param in model.parameters():
                        tmp = ((w_batch[i] / max_weight) *
                               loss.data[0]) * param.grad.data
                        params_grad[paramIndex] = tmp + params_grad[paramIndex]
                        paramIndex += 1

            # update weights
            paramIndex = 0
            for param in model.parameters():
                param.data += params_grad[paramIndex].mul(
                    optimizer_constructor.kwargs['lr']).type(Tensor)
                paramIndex += 1

        frames_count += 1
        frames_per_episode += frames_per_state

        if done:
            rewards_duration.append(rewards_per_episode)
            rewards_per_episode = 0
            frames_per_episode = 1
            episodes_count += 1
            env.reset()
            current_state, _, _, _ = util.play_game(env, frames_per_state)

            if episodes_count % 100 == 0:
                avg_episode_reward = sum(rewards_duration) / 100.0
                avg_reward_content = 'Episodes {} to {}: average reward {}, total loss {}'.format(
                    episodes_count - 99, episodes_count, avg_episode_reward,
                    sum(loss_per_epoch))
                print(avg_reward_content)
                logging.info(avg_reward_content)
                rewards_duration = []
                loss_per_epoch = []

        # update weights of target network for every TARGET_UPDATE_FREQ steps
        if frames_count % target_update_steps == 0:
            target.load_state_dict(model.state_dict())
            # print('weights updated at frame no. ', frames_count)

        #Save weights every 250k frames
        if frames_count % 250000 == 0:
            # save into the directory that was just created, not the working directory
            save_dir = output_directory + model_type + '/'
            util.make_sure_path_exists(save_dir)
            torch.save(model.state_dict(),
                       save_dir + 'rank_weights_' + str(frames_count) + '.pth')

        # Print frame count and sort the experience replay every 1,000,000 (one million) frames
        if frames_count % 1000000 == 0:
            training_update = 'frame count: {}, episode count: {}, epsilon: {}'.format(
                frames_count, episodes_count, epsilon)
            print(training_update)
            logging.info(training_update)
            exp_replay.sort()
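The gradient-weighting loop above applies the importance-sampling correction w_i = (1 / (N * P(i)))**beta, normalized by the maximum weight, from rank-based Prioritized Experience Replay. A tiny numeric sketch of that formula follows; the priorities, N and beta are arbitrary illustrative values, and the batch maximum stands in for exp_replay.get_max_weight:

import numpy as np

priorities = np.array([0.5, 0.3, 0.2])   # P(i): sampling probabilities of the drawn ranks
N, beta = 100, 0.5                        # replay size and (annealed) beta
w = (1.0 / (N * priorities)) ** beta      # w_i = (1 / (N * P(i)))**beta
w_normalized = w / w.max()                # normalize by the maximum weight
print(w_normalized)                       # approx. [0.632, 0.816, 1.0]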