Example #1
class Testor:
    def __init__(self, model_dict, idx, num_channels=3, num_actions=19):
        import gym
        import minerl
        self.testor_idx = idx
        self.env = gym.make(ENV_NAME)
        self.port_number = 12340 + self.testor_idx
        print("testor environment %d initialized successfully" %
              self.testor_idx)
        self.env.make_interactive(port=self.port_number, realtime=False)

        self.testor_network = DQN(num_channels, num_actions).cuda()
        self.testor_network.load_state_dict(model_dict)
        print("testor network %d initialize successfully" % self.testor_idx)

        self.writer = SummaryWriter(f'runs/apex/test/testor{self.testor_idx}')

        self.max_epi = 100

    def explore(self):
        for num_epi in range(self.max_epi):
            obs = self.env.reset()
            state = converter(ENV_NAME, obs).cuda()
            state = state.float()
            done = False
            total_reward = 0
            steps = 0
            total_steps = 0

            while not done:
                steps += 1
                total_steps += 1
                action_tensor = self.testor_network.forward(state)
                print(action_tensor)
                action_index = torch.argmax(action_tensor).item()
                print(action_index)

                action = make_19action(self.env, action_index)
                #print(action)
                obs_prime, reward, done, info = self.env.step(action)
                total_reward += reward
                state_prime = converter(ENV_NAME, obs_prime).cuda().float()
                state = state_prime
                if done:
                    print("%d episode is done" % num_epi)
                    print("total rewards : %d " % total_reward)
                    self.writer.add_scalar('Rewards/test', total_reward,
                                           num_epi)
                    break
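How this tester is launched is not shown; below is a minimal driver sketch, assuming the learner process saved a state dict compatible with the project's DQN (the file name is hypothetical).

# Hypothetical driver; 'learner_model.pt', DQN and ENV_NAME come from the surrounding project.
# model_dict = torch.load('learner_model.pt')
# tester = Testor(model_dict, idx=0)
# tester.explore()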
Example #2
class DQNAgent:
    def __init__(self, config: Config):
        self.config = config
        self.is_training = True
        self.buffer = ReplayBuffer(self.config.max_buff)

        self.model = DQN(self.config.state_dim, self.config.action_dim)
        self.model_optim = Adam(self.model.parameters(), lr=self.config.learning_rate)

        if self.config.use_cuda:
            self.cuda()

    def act(self, state, epsilon=None):
        if epsilon is None: epsilon = self.config.epsilon_min
        if random.random() > epsilon or not self.is_training:
            state = torch.tensor(state, dtype=torch.float).unsqueeze(0)
            if self.config.use_cuda:
                state = state.cuda()
            q_value = self.model.forward(state)
            action = q_value.max(1)[1].item()
        else:
            action = random.randrange(self.config.action_dim)
        return action

    def learning(self, fr):
        s0, a, r, s1, done = self.buffer.sample(self.config.batch_size)

        s0 = torch.tensor(s0, dtype=torch.float)
        s1 = torch.tensor(s1, dtype=torch.float)
        a = torch.tensor(a, dtype=torch.long)
        r = torch.tensor(r, dtype=torch.float)
        done = torch.tensor(done, dtype=torch.float)

        if self.config.use_cuda:
            s0 = s0.cuda()
            s1 = s1.cuda()
            a = a.cuda()
            r = r.cuda()
            done = done.cuda()

        q_values = self.model(s0)
        next_q_values = self.model(s1)
        next_q_value = next_q_values.max(1)[0]

        q_value = q_values.gather(1, a.unsqueeze(1)).squeeze(1)
        expected_q_value = r + self.config.gamma * next_q_value * (1 - done)
        # detach expected_q_value so no gradient flows through the TD target
        loss = (q_value - expected_q_value.detach()).pow(2).mean()

        self.model_optim.zero_grad()
        loss.backward()
        self.model_optim.step()


        return loss.item()

    def cuda(self):
        self.model.cuda()

    def load_weights(self, model_path):
        if model_path is None: return
        self.model.load_state_dict(torch.load(model_path))

    def save_model(self, output, tag=''):
        torch.save(self.model.state_dict(), '%s/model_%s.pkl' % (output, tag))

    def save_config(self, output):
        with open(output + '/config.txt', 'w') as f:
            attr_val = get_class_attr_val(self.config)
            for k, v in attr_val.items():
                f.write(str(k) + " = " + str(v) + "\n")
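A sketch of the training loop that would drive this agent, assuming a Gym-style environment and that the project's ReplayBuffer exposes add(s, a, r, s2, done) and size() methods (both are assumptions, as is the epsilon schedule; the surrounding code is not shown).

# Hypothetical training loop; env, the config fields and ReplayBuffer's API are assumptions.
# agent = DQNAgent(config)
# state = env.reset()
# for frame in range(config.frames):
#     epsilon = max(config.epsilon_min, 1.0 - frame / config.eps_decay)
#     action = agent.act(state, epsilon)
#     next_state, reward, done, _ = env.step(action)
#     agent.buffer.add(state, action, reward, next_state, done)
#     state = env.reset() if done else next_state
#     if agent.buffer.size() > config.batch_size:
#         loss = agent.learning(frame)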
Example #3
# saving training variables
outliers = []
centroids = []
G = []
episode_rewards = []

mean_reward_episodes_list = []
best_reward_episodes_list = []
episode_rewards_list = []

for t in range(MAX_FRAMES):
    x = np.concatenate((s, g), axis=0).reshape((1, 5, 84, 84))
    if t < LEARNING_STARTS:
        a = env.action_space.sample()
    else:
        qt = Qt.forward(torch.Tensor(x).type(dtype) / 255)
        a = epsilon_greedy(qt.cpu().detach().numpy(),
                           epsilon=epsilon)  # epsilon-greedy action
    SP, r, terminal, step_info = step(a)
    episode_rewards.append(r)
    sp = four_frames_to_4_84_84(SP)
    xp = np.concatenate((sp, g), axis=0).reshape((1, 5, 84, 84))
    man_mask = get_man_mask(SP)
    man_loc = get_man_xy_np_coordinate(man_mask)
    # intrinsic_done_task = are_masks_align(man_mask, subgoal_mask)
    intrinsic_done_task = is_man_inside_subgoal_mask(man_mask, subgoal_mask)
    # outlier
    if r > 0:
        print('Outlier detected at', man_loc)
        outliers.append(man_loc)
    R += r
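The loop above calls an epsilon_greedy helper that is not shown. A minimal sketch of one plausible implementation over a 1 x num_actions Q-value array follows (the project's actual version may differ).

import numpy as np

def epsilon_greedy(q_values, epsilon=0.1):
    # Sketch: pick argmax(q_values) with probability 1 - epsilon, otherwise a uniform random action.
    q_values = np.asarray(q_values).flatten()
    if np.random.rand() < epsilon:
        return np.random.randint(len(q_values))
    return int(np.argmax(q_values))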
Example #4
class DDQNAgent:
    def __init__(self, config: Config, training=True):
        self.config = config
        self.is_training = training
        self.buffer = ReplayBuffer(self.config.max_buff)

        self.model = DQN(self.config.state_shape, self.config.action_dim)
        self.target_model = DQN(self.config.state_shape,
                                self.config.action_dim)
        self.target_model.load_state_dict(self.model.state_dict())

        self.optim = Adam(self.model.parameters(),
                          lr=self.config.learning_rate)

        self.model.cuda()
        self.target_model.cuda()

    def act(self, state, epsilon=None):
        if epsilon is None: epsilon = self.config.epsilon_min
        if random.random() > epsilon or not self.is_training:
            state = torch.tensor(state, dtype=torch.float).unsqueeze(0)
            state = state.cuda()
            q_value = self.model.forward(state)
            action = q_value.max(1)[1].item()
        else:
            action = random.randrange(self.config.action_dim)
        return action

    def learn(self, t):
        s, a, r, s2, done = self.buffer.sample(self.config.batch_size)

        s = torch.tensor(s, dtype=torch.float)
        a = torch.tensor(a, dtype=torch.long)
        r = torch.tensor(r, dtype=torch.float)
        s2 = torch.tensor(s2, dtype=torch.float)
        done = torch.tensor(done, dtype=torch.float)

        s = s.cuda()
        a = a.cuda()
        r = r.cuda()
        s2 = s2.cuda()
        done = done.cuda()

        q_values = self.model(s).cuda()
        next_q_values = self.model(s2).cuda()
        next_q_state_values = self.target_model(s2).cuda()

        q_value = q_values.gather(1, a.unsqueeze(1)).squeeze(1)
        next_q_value = next_q_state_values.gather(
            1,
            next_q_values.max(1)[1].unsqueeze(1)).squeeze(1)
        expected_q_value = r + self.config.gamma * next_q_value * (1 - done)

        loss = (q_value - expected_q_value.detach()).pow(2).mean()

        self.optim.zero_grad()
        loss.backward()
        self.optim.step()

        if t % self.config.update_interval == 0:
            self.target_model.load_state_dict(self.model.state_dict())

        return loss.item()

    def load_weights(self, model_path):
        model = torch.load(model_path)
        if 'model' in model:
            self.model.load_state_dict(model['model'])
        else:
            self.model.load_state_dict(model)

    def save_checkpoint(self):
        os.makedirs('ckpt', exist_ok=True)
        torch.save(self.model.state_dict(), 'ckpt/model.pt')

    def load_checkpoint(self):
        state_dict = torch.load('ckpt/model.pt')
        self.model.load_state_dict(state_dict)
        self.target_model.load_state_dict(state_dict)
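learn() above computes the Double DQN target: the online model selects the argmax action for s2 while the target model evaluates it. A self-contained illustration of just that target computation with dummy tensors (shapes only; the real values come from the networks in the class above).

import torch

batch_size, num_actions, gamma = 4, 6, 0.99
next_q_values = torch.randn(batch_size, num_actions)        # stands in for self.model(s2)
next_q_state_values = torch.randn(batch_size, num_actions)  # stands in for self.target_model(s2)
r = torch.randn(batch_size)
done = torch.zeros(batch_size)

best_actions = next_q_values.max(1)[1]  # action selection by the online net
next_q_value = next_q_state_values.gather(1, best_actions.unsqueeze(1)).squeeze(1)  # evaluation by the target net
expected_q_value = r + gamma * next_q_value * (1 - done)
print(expected_q_value.shape)  # torch.Size([4])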
Example #5
class Algorithm():
    def __init__(self, lr, gamma, act_dim, state_dim, memory_capacity, epsilon,
                 batch_size):
        self.model = DQN(state_dim, act_dim)
        self.state_dim = state_dim
        self.act_dim = act_dim
        self.lr = lr
        self.gamma = gamma
        self.epsilon = epsilon
        self.target_model = copy.deepcopy(self.model)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
        self.loss = nn.MSELoss()
        self.memory_capacity = memory_capacity
        self.replay_buffer = np.zeros((memory_capacity, 2 * state_dim + 3))
        self.memory_counter = 0
        self.batch_size = batch_size

    def sync_target(self):
        self.target_model.load_state_dict(self.model.state_dict())

    def pridict(self, obs):
        return self.model.forward(obs)

    def choose_action(self, state):
        state = torch.unsqueeze(torch.Tensor(state), 0)
        if np.random.rand() <= self.epsilon:
            action_value = self.model.forward(state)
            action = torch.max(action_value, dim=1)[1].numpy()[0]
        else:
            action = np.random.randint(0, self.act_dim)
        return action

    def store_transition(self, state, action, reward, next_state, done):
        transition = np.hstack((state, [action, reward], next_state, done))
        index = self.memory_counter % self.memory_capacity
        self.replay_buffer[index, :] = transition
        self.memory_counter += 1

    def learn(self):
        sample_index = np.random.choice(self.memory_capacity, self.batch_size)
        batch_memory = self.replay_buffer[sample_index, :]
        batch_state = torch.FloatTensor(batch_memory[:, :self.state_dim])
        batch_action = torch.LongTensor(
            batch_memory[:, self.state_dim:self.state_dim + 1].astype(int))
        batch_reward = torch.FloatTensor(batch_memory[:, self.state_dim +
                                                      1:self.state_dim + 2])
        batch_next_state = torch.FloatTensor(
            batch_memory[:, self.state_dim + 2:2 * self.state_dim + 2])
        batch_done = torch.FloatTensor(batch_memory[:, -1:])

        next_value = self.target_model.forward(batch_next_state)
        max_value = torch.max(next_value, dim=1)[0].detach()  # no gradient through the target

        target = batch_reward.squeeze() + self.gamma * (
            1 - batch_done).squeeze() * max_value

        q_value = self.model.forward(batch_state)
        behavior = torch.gather(q_value, dim=1, index=batch_action).squeeze()

        self.optimizer.zero_grad()

        output = self.loss(behavior, target)
        output.backward()
        self.optimizer.step()

        return output
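A hypothetical usage sketch on CartPole, assuming the project's DQN(state_dim, act_dim) network is importable. Note that choose_action above treats epsilon as the probability of acting greedily (it explores when the random draw exceeds epsilon), so a value like 0.9 means roughly 90% greedy actions.

# Hypothetical driver (the gym environment and hyperparameters are illustrative only).
# import gym
# env = gym.make('CartPole-v1')
# algo = Algorithm(lr=1e-3, gamma=0.99, act_dim=env.action_space.n,
#                  state_dim=env.observation_space.shape[0],
#                  memory_capacity=2000, epsilon=0.9, batch_size=32)
# s = env.reset()
# for step in range(20000):
#     a = algo.choose_action(s)
#     s2, r, done, _ = env.step(a)
#     algo.store_transition(s, a, r, s2, float(done))
#     s = env.reset() if done else s2
#     if algo.memory_counter > algo.memory_capacity:
#         algo.learn()
#     if step % 200 == 0:
#         algo.sync_target()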