Example #1
import os

import torch
import torch.optim as optim

# DQN, ReplayMemory, env, device and LR are assumed to be defined elsewhere
# in the project.

test_time = False

n_steps = 8
n_actions = env.action_space.n
img_height = 64
img_width = 64
network_path = "target_net.pt"
if os.path.exists(network_path):
    policy_net = torch.load(network_path).to(device)
    print("successfully loaded existing network from file: " + network_path)
else:
    policy_net = DQN(img_height, img_width, n_actions).to(device)
target_net = DQN(img_height, img_width, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()
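# target_net stays in eval mode: its weights change only via load_state_dict
# from policy_net, never by gradient updates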
optimizer = optim.RMSprop(policy_net.parameters(), lr=LR)
memory = ReplayMemory(10000)

steps_done = 0
logfile = "train_log.txt"
with open(logfile, "w+") as f:
    f.write("CS4803 MineRL Project Logs:\n")
def append_log(s):
    with open(logfile, "a") as f:
        f.write(s + "\n")

def state_from_obs(obs):
    # get the camera image from the observation dict and convert the image
    # to the correct shape: (C, H, W)
    img = torch.tensor(obs["pov"] / 255.0, dtype=torch.float32)
    img = img.permute(2, 0, 1)  # (H, W, C) -> (C, H, W)
    return img.to(device)
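
# A minimal sketch of epsilon-greedy action selection built on the setup
# above. EPS_START, EPS_END and EPS_DECAY are hypothetical hyperparameters,
# and the exponential decay schedule is one common choice, not necessarily
# this project's.
import math
import random

EPS_START, EPS_END, EPS_DECAY = 0.9, 0.05, 2000  # assumed values

def select_action(state):
    global steps_done
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1.0 * steps_done / EPS_DECAY)
    steps_done += 1
    if random.random() > eps_threshold:
        with torch.no_grad():
            # exploit: pick the action with the highest predicted Q-value
            return policy_net(state.unsqueeze(0)).max(1)[1].view(1, 1)
    # explore: uniform random action
    return torch.tensor([[random.randrange(n_actions)]],
                        device=device, dtype=torch.long)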
Example #2
import numpy as np
import torch
import torch.optim as optim

# DQN, AgentReplayMemory and device are assumed to be defined elsewhere in
# the project.

class Agent:
    def __init__(
        self,
        state_size,
        action_size,
        n_agents,
        buffer_size: int = int(1e5),
        batch_size: int = 256,
        gamma: float = 0.995,
        tau: float = 1e-3,
        learning_rate: float = 7e-4,
        update_every: int = 4,
    ):
        """
        Initialize a DQN agent that stores per-agent experience in a shared
        replay buffer

        Args:
            state_size (int): Size of the state observation returned by the
                environment
            action_size (int): Action space size
            n_agents (int): Number of agents in the environment
            buffer_size (int): Desired total experience buffer size
            batch_size (int): Mini-batch size
            gamma (float): Discount factor
            tau (float): For soft update of target parameters
            learning_rate (float): Learning rate
            update_every (int): Number of steps between learning updates
        """

        self.state_size = state_size
        self.action_size = action_size
        self.n_agents = n_agents

        # Q-Networks: start the target network from the policy network's weights
        self.policy_net = DQN(state_size, action_size).to(device)
        self.target_net = DQN(state_size, action_size).to(device)
        self.target_net.load_state_dict(self.policy_net.state_dict())

        self.optimizer = optim.Adam(self.policy_net.parameters(),
                                    lr=learning_rate)
        self.memory = AgentReplayMemory(buffer_size, n_agents, state_size,
                                        device)

        self.t_step = 0

        self.update_every = update_every
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau

    def step(self, states, actions, rewards, next_states, dones):

        self.memory.push_agent_actions(states, actions, rewards, next_states,
                                       dones)

        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            if self.memory.at_capacity():
                experience = self.memory.sample(self.batch_size)
                self.learn(experience, self.gamma)

    def act(self, states, eps=0):
        states = torch.from_numpy(states).float().to(device)
        self.policy_net.eval()

        with torch.no_grad():
            action_values = self.policy_net(states)
        self.policy_net.train()

        r = np.random.random(size=self.n_agents)
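        # vectorized epsilon-greedy: agents whose draw in r exceeds eps act
        # greedily, the rest take a uniform random action (np.where below)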

        action_values = np.argmax(action_values.cpu().data.numpy(), axis=1)
        random_choices = np.random.randint(0,
                                           self.action_size,
                                           size=self.n_agents)

        return np.where(r > eps, action_values, random_choices)

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        criterion = torch.nn.MSELoss()
        self.policy_net.train()
        self.target_net.eval()

        # policy_net(states) has shape (batch_size, action_size); gather
        # selects the Q-value of the action taken -> shape (batch_size, 1)
        predicted_targets = self.policy_net(states).gather(1, actions)

        with torch.no_grad():
            # greedy Q-value from the "older" target network for each next state
            labels_next = self.target_net(next_states).max(1)[0].unsqueeze(1)

        # TD target: r + gamma * max_a' Q_target(s', a'), zeroed at terminal states
        labels = rewards + (gamma * labels_next * (1 - dones))

        loss = criterion(predicted_targets, labels)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.policy_net, self.target_net, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """
        Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Args:
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1 - tau) * target_param.data)
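
# A minimal usage sketch for Agent, assuming a hypothetical vectorized
# environment whose reset()/step() return per-agent numpy arrays:
#
#   agent = Agent(state_size=33, action_size=4, n_agents=20)
#   states = env.reset()
#   for t in range(1000):
#       actions = agent.act(states, eps=0.05)
#       next_states, rewards, dones = env.step(actions)
#       agent.step(states, actions, rewards, next_states, dones)
#       states = next_states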
Example #3
import os
import re

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim
import torchvision.transforms as T
from PIL import Image

# DQN, ReplayMemory, Transition and the default config `dcfg` are assumed
# to be defined elsewhere in the project.

class TrainNQL:
    def __init__(self, epi, cfg=dcfg, validation=False):
        # cpu or cuda
        torch.cuda.empty_cache()
        self.device = cfg.device
        self.state_dim = cfg.proc_frame_size  # state dimensionality, e.g. 84x84
        self.state_size = cfg.state_size
        self.t_eps = cfg.t_eps
        self.minibatch_size = cfg.minibatch_size
        # Q-learning parameters
        self.discount = cfg.discount  # discount factor
        self.replay_memory = cfg.replay_memory
        self.bufferSize = cfg.bufferSize
        self.target_q = cfg.target_q
        self.validation = validation
        if validation:
            self.episode = epi
        else:
            self.episode = int(epi) - 1
        self.cfg = cfg

        modelGray = 'results/ep' + str(self.episode) + '/modelGray.net'
        modelDepth = 'results/ep' + str(self.episode) + '/modelDepth.net'
        tModelGray = 'results/ep' + str(self.episode) + '/tModelGray.net'
        tModelDepth = 'results/ep' + str(self.episode) + '/tModelDepth.net'

        if all(os.path.exists(p)
               for p in (modelGray, modelDepth, tModelGray, tModelDepth)):
            print("Loading model")
            self.gray_policy_net = torch.load(modelGray).to(self.device)
            self.gray_target_net = torch.load(tModelGray).to(self.device)
            self.depth_policy_net = torch.load(modelDepth).to(self.device)
            self.depth_target_net = torch.load(tModelDepth).to(self.device)

        else:
            print("New model")

            def make_dqn():
                return DQN(noutputs=cfg.noutputs,
                           nfeats=cfg.nfeats,
                           nstates=cfg.nstates,
                           kernels=cfg.kernels,
                           strides=cfg.strides,
                           poolsize=cfg.poolsize).to(self.device)

            self.gray_policy_net = make_dqn()
            self.gray_target_net = make_dqn()
            self.depth_policy_net = make_dqn()
            self.depth_target_net = make_dqn()
            # start each target network from its policy network's weights
            self.gray_target_net.load_state_dict(
                self.gray_policy_net.state_dict())
            self.depth_target_net.load_state_dict(
                self.depth_policy_net.state_dict())

        # every `target_q` episodes, refresh the target networks by cloning
        # the current policy-network weights
        if not validation and self.target_q and self.episode % self.target_q == 0:
            print("cloning")
            self.gray_target_net.load_state_dict(
                self.gray_policy_net.state_dict())
            self.depth_target_net.load_state_dict(
                self.depth_policy_net.state_dict())

        self.gray_target_net.eval()
        self.depth_target_net.eval()

        self.gray_optimizer = optim.RMSprop(self.gray_policy_net.parameters())
        self.depth_optimizer = optim.RMSprop(
            self.depth_policy_net.parameters())
        self.memory = ReplayMemory(self.replay_memory)

    def get_tensor_from_image(self, file):
        convert = T.Compose([
            T.ToPILImage(),
            T.Resize((self.state_dim, self.state_dim),
                     interpolation=Image.BILINEAR),
            T.ToTensor()
        ])
        screen = Image.open(file)
        screen = np.ascontiguousarray(screen, dtype=np.float32) / 255
        screen = torch.from_numpy(screen)
        screen = convert(screen).unsqueeze(0).to(self.device)
        return screen
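
    # Note: the conversion above assumes single-channel (grayscale or depth)
    # PNGs, since T.ToPILImage expects an (H, W) or (C, H, W) tensor.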

    def get_data(self, episode, tsteps):
        # store file paths rather than tensors; the images are loaded lazily
        # in train() to keep memory usage low
        images = []
        depths = []
        dirname_rgb = 'dataset/RGB/ep' + str(episode)
        dirname_dep = 'dataset/Depth/ep' + str(episode)
        for step in range(tsteps):
            proc_image = []
            proc_depth = []
            for i in range(self.state_size):
                grayfile = dirname_rgb + '/image_' + str(step + 1) + '_' + str(
                    i + 1) + '.png'
                depthfile = dirname_dep + '/depth_' + str(
                    step + 1) + '_' + str(i + 1) + '.png'
                proc_image.append(grayfile)
                proc_depth.append(depthfile)
            images.append(proc_image)
            depths.append(proc_depth)
        return images, depths

    def load_data(self):

        rewards = torch.load('files/reward_history.dat')
        actions = torch.load('files/action_history.dat')
        ep_rewards = torch.load('files/ep_rewards.dat')

        print("Loading images")

        best_scores = range(len(actions))
        buffer_selection_mode = 'default'

        if buffer_selection_mode == 'success_handshake':
            eps_values = []
            for i in range(len(actions)):
                # count successful and failed handshake actions (action id 3)
                hspos = 0
                hsneg = 0
                for step in range(len(actions[i])):
                    if actions[i][step] == 3:
                        if rewards[i][step] > 0:
                            hspos = hspos + 1
                        elif rewards[i][step] == -0.1:
                            hsneg = hsneg + 1
                # guard against episodes with no handshake attempts
                total = hspos + hsneg
                accuracy = hspos / total if total > 0 else 0.0
                eps_values.append(accuracy)

            # episode indices sorted by handshake accuracy (ascending)
            best_scores = np.argsort(eps_values)

        for i in best_scores:
            print('Ep: ', i + 1)
            dirname_gray = 'dataset/RGB/ep' + str(i + 1)
            files = []
            if os.path.exists(dirname_gray):
                files = os.listdir(dirname_gray)

            # count recorded steps: each step is stored as 8 image files
            k = sum(1 for f in files if re.match(r"image.*\.png", f))
            k = int(k / 8)
            # round down to a multiple of 4 and cap at the buffer size
            while k % 4 != 0:
                k = k - 1
            k = min(k, self.bufferSize)
            print(k)

            images, depths = self.get_data(i + 1, k)
            print("Loading done")

            for step in range(k - 1):
                reward = self.cfg.neutral_reward
                if rewards[i][step] >= 1:
                    reward = self.cfg.hs_success_reward
                elif rewards[i][step] < 0:
                    reward = self.cfg.hs_fail_reward
                reward = torch.tensor([reward], device=self.device)
                action = torch.tensor([[actions[i][step]]],
                                      device=self.device,
                                      dtype=torch.long)
                image = images[step]
                depth = depths[step]
                next_image = images[step + 1]
                next_depth = depths[step + 1]
                self.memory.push(image, depth, action, next_image, next_depth,
                                 reward)
                #print("Memory size: ",getsizeof(self.memory))
                #torch.cuda.empty_cache()

    def train(self):
        if len(self.memory) < self.minibatch_size:
            return
        for i in range(0, len(self.memory), self.minibatch_size):
            transitions = self.memory.pull(self.minibatch_size)

            print('Batch train: ' + str(int(i / self.minibatch_size) + 1) +
                  "/" + str(int(len(self.memory) / self.minibatch_size) + 1))

            aux_transitions = []
            for t in transitions:
                proc_sgray = torch.Tensor(self.state_size, self.state_dim,
                                          self.state_dim).to(self.device)
                proc_sdepth = torch.Tensor(self.state_size, self.state_dim,
                                           self.state_dim).to(self.device)
                proc_next_sgray = torch.Tensor(self.state_size, self.state_dim,
                                               self.state_dim).to(self.device)
                proc_next_sdepth = torch.Tensor(self.state_size,
                                                self.state_dim,
                                                self.state_dim).to(self.device)
                count = 0
                for sgray, sdepth, next_sgray, next_sdepth in zip(
                        t.sgray, t.sdepth, t.next_sgray, t.next_sdepth):
                    proc_sgray[count] = self.get_tensor_from_image(sgray)
                    proc_sdepth[count] = self.get_tensor_from_image(sdepth)
                    proc_next_sgray[count] = self.get_tensor_from_image(
                        next_sgray)
                    proc_next_sdepth[count] = self.get_tensor_from_image(
                        next_sdepth)
                    count += 1

                proc_sgray = proc_sgray.unsqueeze(0).to(self.device)
                proc_sdepth = proc_sdepth.unsqueeze(0).to(self.device)
                proc_next_sgray = proc_next_sgray.unsqueeze(0).to(self.device)
                proc_next_sdepth = proc_next_sdepth.unsqueeze(0).to(
                    self.device)
                # Transition fields: (sgray, sdepth, action, next_sgray,
                # next_sdepth, reward)
                one_transition = Transition(proc_sgray, proc_sdepth, t.action,
                                            proc_next_sgray, proc_next_sdepth,
                                            t.reward)
                aux_transitions.append(one_transition)
            transitions = aux_transitions

            # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
            # detailed explanation). This converts batch-array of Transitions
            # to Transition of batch-arrays.
            batch = Transition(*zip(*transitions))
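            # e.g. [T(s1, a1, ...), T(s2, a2, ...)] becomes
            # T(sgray=(s1, s2), action=(a1, a2), ...)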

            # Compute a mask of non-final states and concatenate the batch elements
            # (a final state would've been the one after which simulation ended)
            gray_non_final_mask = torch.tensor(tuple(
                map(lambda s: s is not None, batch.next_sgray)),
                                               device=self.device,
                                               dtype=torch.bool)
            gray_non_final_next_states = torch.cat(
                [s for s in batch.next_sgray if s is not None])

            depth_non_final_mask = torch.tensor(tuple(
                map(lambda s: s is not None, batch.next_sdepth)),
                                                device=self.device,
                                                dtype=torch.bool)
            depth_non_final_next_states = torch.cat(
                [s for s in batch.next_sdepth if s is not None])
            sgray_batch = torch.cat(batch.sgray)
            sdepth_batch = torch.cat(batch.sdepth)

            action_batch = torch.cat(batch.action)
            reward_batch = torch.cat(batch.reward)

            # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
            # columns of actions taken. These are the actions which would've been taken
            # for each batch state according to policy_net
            sgray_action_values = self.gray_policy_net(sgray_batch).gather(
                1, action_batch)
            sdepth_action_values = self.depth_policy_net(sdepth_batch).gather(
                1, action_batch)

            # Compute V(s_{t+1}) for all next states.
            # Expected values of actions for non_final_next_states are computed based
            # on the "older" target_net; selecting their best reward with max(1)[0].
            # This is merged based on the mask, such that we'll have either the expected
            # state value or 0 in case the state was final.
            # the final batch pulled from memory may be smaller than
            # minibatch_size, so size the value tensors from the actual batch
            batch_size = sgray_batch.size(0)
            next_sgray_values = torch.zeros(batch_size, device=self.device)
            next_sgray_values[gray_non_final_mask] = self.gray_target_net(
                gray_non_final_next_states).max(1)[0].detach()

            next_sdepth_values = torch.zeros(batch_size, device=self.device)
            next_sdepth_values[depth_non_final_mask] = self.depth_target_net(
                depth_non_final_next_states).max(1)[0].detach()
            # Compute the expected Q values
            expected_sgray_action_values = (next_sgray_values *
                                            self.discount) + reward_batch
            expected_sdepth_action_values = (next_sdepth_values *
                                             self.discount) + reward_batch

            # Compute Huber loss
            gray_loss = F.smooth_l1_loss(
                sgray_action_values, expected_sgray_action_values.unsqueeze(1))
            depth_loss = F.smooth_l1_loss(
                sdepth_action_values,
                expected_sdepth_action_values.unsqueeze(1))

            # Optimize the model
            self.gray_optimizer.zero_grad()
            gray_loss.backward()
            for param in self.gray_policy_net.parameters():
                param.grad.data.clamp_(-1, 1)
            self.gray_optimizer.step()

            # Optimize the model
            self.depth_optimizer.zero_grad()
            depth_loss.backward()
            for param in self.depth_policy_net.parameters():
                param.grad.data.clamp_(-1, 1)
            self.depth_optimizer.step()
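
# A minimal usage sketch for TrainNQL (hypothetical episode number and save
# path; the constructor loads models from results/ep<episode>/):
#
#   trainer = TrainNQL(epi=2)
#   trainer.load_data()
#   trainer.train()
#   torch.save(trainer.gray_policy_net, 'results/ep2/modelGray.net')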