    def __init__(self, name, n_o, n_a):
        self.agent_name = name  #name of the agent: 'predator' or 'prey'
        self.dtype = torch.float
        self.obs_dim = n_o  #observation space dimensions
        self.act_dim = n_a  #action space dimensions
        self.GAMMA = 0.99  #discount factor
        self.ALPHA = 1e-3  #learning rate
        self.BATCH_SIZE = 16  #number of samples in batch
        self.BUFFER_SIZE = 10000
        self.TAU = 0.001
        self.buffer = Memory(self.BUFFER_SIZE)
        self.noise = OU_noise(dt=0.05)

        self.online_actor = Actor(self.obs_dim, self.act_dim)
        self.online_critic = Critic(self.obs_dim, 1)

        self.target_actor = Actor(self.obs_dim, self.act_dim)
        self.target_critic = Critic(self.obs_dim, 1)
        '''Copy Online Parameters to target Parameters'''
        self.target_actor.policy.load_state_dict(
            self.online_actor.policy.state_dict())
        self.target_critic.value_func.load_state_dict(
            self.online_critic.value_func.state_dict())

        self.optim = torch.optim.Adam(
            self.online_critic.value_func.parameters(), lr=self.ALPHA)

        self.state = None  #temporary holding of the state before it is pushed to the replay buffer. 2D tensor [[state]]
        self.act = None  #temporary holding of the action before it is pushed to the replay buffer. 2D tensor [[action]]
Example #2
    def build(self):
        self.policy_net1 = DQN2D(84, 84, self.pars).to(self.device)
        self.target_net1 = DQN2D(84, 84, self.pars).to(self.device)
        self.target_net1.load_state_dict(self.policy_net1.state_dict())
        self.target_net1.eval()

        self.policy_net2 = DQN2D(84, 84, self.pars).to(self.device)
        self.target_net2 = DQN2D(84, 84, self.pars).to(self.device)
        self.target_net2.load_state_dict(self.policy_net2.state_dict())
        self.target_net2.eval()

        self.optimizer1 = optim.SGD(self.policy_net1.parameters(),
                                    lr=self.pars['lr'],
                                    momentum=self.pars['momentum'])  #
        self.optimizer2 = optim.SGD(self.policy_net2.parameters(),
                                    lr=self.pars['lr'],
                                    momentum=self.pars['momentum'])  #
        #self.optimizer1 = optim.Adam(self.policy_net1.parameters())
        #self.optimizer2 = optim.Adam(self.policy_net2.parameters())
        self.memory2 = ReplayMemory(10000)
        self.memory1 = ReplayMemory(10000)

        if self.pars['ppe'] == '1':
            self.memory1 = Memory(10000)
            self.memory2 = Memory(10000)
Example #3
    def build(self):
        self.policy_net = DQN(97, self.pars,
                              rec=self.pars['rec'] == 1).to(self.device)
        self.target_net = DQN(97, self.pars,
                              rec=self.pars['rec'] == 1).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        if self.pars['momentum'] > 0:
            self.optimizer = optim.SGD(self.policy_net.parameters(),
                                       lr=self.pars['lr'],
                                       momentum=self.pars['momentum'])  #
        else:
            self.optimizer = optim.Adam(self.policy_net.parameters())
        self.memory = ReplayMemory(10000)
        if 'ppe' in self.pars:
            self.memory = Memory(10000)
        if self.pars['load'] is not None:
            self.load(self.pars['load'])
            self.target_net.load_state_dict(self.policy_net.state_dict())
            print('loaded')
Example #4
    def build(self):
        self.policy_net = DQN2D(84, 84, self.pars,
                                rec=self.pars['rec'] == 1).to(self.device)
        self.q_net = DQN2D(84, 84, self.pars).to(self.device)
        self.target_net = DQN2D(84, 84, self.pars).to(self.device)
        self.target_net.load_state_dict(self.q_net.state_dict())
        self.target_net.eval()

        if self.pars['momentum'] > 0:
            self.optimizer = optim.SGD(self.q_net.parameters(),
                                       lr=self.pars['lr'],
                                       momentum=self.pars['momentum'])
            self.policy_optimizer = optim.SGD(self.policy_net.parameters(),
                                              lr=self.pars['lr'],
                                              momentum=self.pars['momentum'])
        else:
            self.optimizer = optim.Adam(self.q_net.parameters())
            self.policy_optimizer = optim.Adam(self.policy_net.parameters())
        self.memory = ReplayMemory(10000)

        if self.pars['ppe'] == '1':
            self.memory = Memory(10000)
        self.eps_threshold = 0.01
Example #5
def collect_batch(env: gym.Env, actor: torch.nn.Module, buffer: Memory,
                  batch_size: int, device: torch.device):
    while len(buffer) < batch_size:
        obs = env.reset()
        done = False
        obs = torch.tensor(obs, dtype=torch.float32, device=device)
        prev_idx = buffer.add_obs(obs)

        while not done:
            obs = torch.unsqueeze(obs, dim=0)
            action, action_logprobs = actor.act(obs)
            action = action.cpu().numpy()[0]
            obs, rew, done, _ = env.step(action)
            obs = torch.tensor(obs, dtype=torch.float32, device=device)
            next_idx = buffer.add_obs(obs)
            buffer.add_timestep(prev_idx, next_idx, action, action_logprobs,
                                rew, done)
            prev_idx = next_idx
        buffer.end_rollout()
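
# Usage sketch (hypothetical names): assumes an Actor exposing act() and a Memory
# buffer exposing the add_obs()/add_timestep()/end_rollout() interface used above.
#   env = gym.make("CartPole-v1")
#   actor, buffer = Actor(...), Memory()
#   collect_batch(env, actor, buffer, batch_size=2048, device=torch.device("cpu"))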
Example #6
class Agent:
    def __init__(self, name, pars, nrenvs=1, job=None, experiment=None):
        self.job = job
        self.name = name
        self.experiment = experiment
        self.pars = pars
        self.envs = [GameEnv(pars['subhid']) for i in range(nrenvs)]
        for env in self.envs:
            env.reset()
        self.BATCH_SIZE = pars['bs']
        self.GAMMA = 0.999
        self.rnnB = 3
        self.EPS_START = 0.9
        self.EPS_END = 0.05
        self.alpha = pars['alpha']
        self.EPS_DECAY = 200
        self.TARGET_UPDATE = pars['tg']
        self.nrf = pars['nrf']
        self.capmem = 0
        self.prob = 0.5
        self.idC = 0
        if pars['comm'] == '0':
            self.prob = -1
        if pars['comm'] == '1':
            self.idC = 1
        self.nopr = False
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.build()

        if pars['results_path']:
            result_path = pars['results_path'] + name
            if not os.path.exists(result_path):
                os.makedirs(result_path)

            result_path = result_path + '/results_' + str(0) + '.csv'
            self.result_out = open(result_path, 'w')
            csv_meta = '#' + json.dumps(pars) + '\n'
            self.result_out.write(csv_meta)
            self.writer = csv.DictWriter(self.result_out,
                                         fieldnames=['episode', 'reward'])
            self.writer.writeheader()

        self.steps_done = 0
        self.num_episodes = pars['numep']
        self.lr = pars['lr']
        self.momentum = pars['momentum']
        self.maxR = 0
        self.dru = DRU(0.2, comm_narrow=True, hard=False, device=self.device)

    def build(self):
        self.policy_net = DQN(97, self.pars,
                              rec=self.pars['rec'] == 1).to(self.device)
        self.target_net = DQN(97, self.pars,
                              rec=self.pars['rec'] == 1).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        if self.pars['momentum'] > 0:
            self.optimizer = optim.SGD(self.policy_net.parameters(),
                                       lr=self.pars['lr'],
                                       momentum=self.pars['momentum'])  #
        else:
            self.optimizer = optim.Adam(self.policy_net.parameters())
        self.memory = ReplayMemory(10000)
        if 'ppe' in self.pars:
            self.memory = Memory(10000)
        if self.pars['load'] is not None:
            self.load(self.pars['load'])
            self.target_net.load_state_dict(self.policy_net.state_dict())
            print('loaded')

    def getComm(self, mes, policy_net, state1_batch):
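        # With probability self.prob the learned communication vector produced by
        # policy_net is used; otherwise the zero message `mes` is passed through unchanged.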
        return policy_net(
            state1_batch, 1,
            mes)[self.idC] if np.random.rand() < self.prob else mes

    def optimize_model(self, policy_net, target_net, memory, optimizer):
        if self.pars['ppe'] != '1' and len(memory) < self.BATCH_SIZE:
            return
        if self.pars['ppe'] == '1' and self.capmem < self.BATCH_SIZE:
            return

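        # Prioritized experience replay branch ('ppe' == '1'): Memory.sample is
        # assumed to return tree indices and importance-sampling weights alongside
        # the batch; absolute TD errors are written back via batch_update further down.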
        if self.pars['ppe'] == '1':
            #state1, action1, next_state1, reward1, state2
            tree_idx, batch, ISWeights_mb = memory.sample(self.BATCH_SIZE)
            non_final_next_states = torch.cat([i[2] for i in batch])
            state_batch = torch.cat([i[0] for i in batch])
            action_batch = torch.cat([i[1] for i in batch])
            reward_batch = torch.cat([i[3] for i in batch])
            state1_batch = torch.cat([i[4] for i in batch])
        else:
            transitions = memory.sample(self.BATCH_SIZE)
            batch = Transition(*zip(*transitions))

            non_final_next_states = torch.cat(batch.next_state)
            state_batch = torch.cat(batch.state)
            action_batch = torch.cat(batch.action)
            reward_batch = torch.cat(batch.reward)
            state1_batch = torch.cat(batch.agent_index)

        mes = torch.tensor([[0, 0, 0, 0] for i in range(self.BATCH_SIZE)],
                           device=self.device)

        if self.pars['att'] == 1:
            _, comm, att = policy_net(state1_batch, 1, mes)
            if np.random.rand() < 0.0001:
                print(att.cpu().data.numpy()[:10, 0])
        else:
            comm = self.getComm(mes, policy_net, state1_batch)
        if self.pars['dru'] > 0:
            comm = self.dru.forward(comm, True)
        if self.pars['comm'] == '2':
            comm = comm.detach()

        q, _ = policy_net(state_batch, 1, comm)[:2]
        state_action_values = q.gather(1, action_batch)

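        # One-step DQN target: y = r + GAMMA * max_a' Q_target(s', a'), detached so
        # gradients flow only through policy_net; the smooth L1 (Huber) loss below
        # is taken on this TD error.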
        next_state_values = target_net(non_final_next_states, 1,
                                       mes)[0].max(1)[0].detach()
        expected_state_action_values = (next_state_values *
                                        self.GAMMA) + reward_batch.float()
        loss = F.smooth_l1_loss(state_action_values,
                                expected_state_action_values.unsqueeze(1))
        #loss = weighted_mse_loss(state_action_values, expected_state_action_values.unsqueeze(1),
        #                         torch.tensor(ISWeights_mb, device=self.device))
        #print(torch.tensor(ISWeights_mb, device=self.device).size())

        if self.pars['ppe'] == '1':
            absolute_errors = (state_action_values -
                               expected_state_action_values.unsqueeze(1)
                               ).abs().cpu().data.numpy().reshape((-1))
            memory.batch_update(tree_idx, absolute_errors)
        # Optimize the model
        if self.pars['att'] == 1:
            loss = loss + att.mean() * 0.001
        if self.pars['commr'] == 1:
            comm1 = torch.flip(comm.detach(), [0])
            q1 = policy_net(state_batch, 1, comm1)[0]
            #print(comm.detach(), comm1)
            #F.smooth_l1_loss(comm.detach().float(), comm1.float())# F.smooth_l1_loss(q1,q)#
            dc = 0.1 * ((comm.detach().float() - comm1.float())**2).mean(
                -1)  #F.kl_div(comm.detach().float(), comm1.float())
            dq = ((q1 - q)**2).mean(-1)  #F.kl_div(q1, q)
            loss = loss + 0.01 * ((dc - dq)**2).mean()

            if np.random.rand() < 0.0005:
                print('difc', dc.cpu().data.numpy()[:10])
                print('difq', dq.cpu().data.numpy()[:10])

        optimizer.zero_grad()
        loss.backward()
        for param in policy_net.parameters():
            if param.grad is not None:
                param.grad.data.clamp_(-1, 1)
        optimizer.step()

    def select_action(self, state, comm, policy_net):
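        # Epsilon-greedy action selection with an exponentially decaying threshold:
        #   eps = EPS_END + (EPS_START - EPS_END) * exp(-steps_done / EPS_DECAY),
        # annealing exploration from 0.9 towards 0.05 as steps_done grows.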
        sample = random.random()
        eps_threshold = self.EPS_END + (
            self.EPS_START - self.EPS_END) * math.exp(
                -1. * self.steps_done / self.EPS_DECAY)
        self.steps_done += 1
        if sample > eps_threshold:
            with torch.no_grad():
                return policy_net(state, 1, comm)[0].max(1)[1].view(1, 1)
        else:
            return torch.tensor([[random.randrange(4)]],
                                device=self.device,
                                dtype=torch.long)

    def getaction(self, state1, state2, test=False):
        mes = torch.tensor([[0, 0, 0, 0]], device=self.device)
        if test:
            comm2 = self.policy_net(
                state2, 0, mes)[self.idC].detach() if 0 < self.prob else mes
            comm1 = self.policy_net(
                state1, 0, mes)[self.idC].detach() if 0 < self.prob else mes
            if self.pars['dru'] > 0:
                comm1 = self.dru.forward(comm1, False)
                comm2 = self.dru.forward(comm2, False)
            action1 = self.policy_net(state1, 1, comm2)[0].max(1)[1].view(1, 1)
            action2 = self.policy_net(state2, 1, comm1)[0].max(1)[1].view(1, 1)
        else:
            comm2 = self.policy_net(state2, 0, mes)[
                self.idC].detach() if np.random.rand() < self.prob else mes
            comm1 = self.policy_net(state1, 0, mes)[
                self.idC].detach() if np.random.rand() < self.prob else mes
            if self.pars['dru'] > 0:
                comm1 = self.dru.forward(comm1, True)
                comm2 = self.dru.forward(comm2, True)
            action1 = self.select_action(state1, comm2, self.policy_net)
            action2 = self.select_action(state2, comm1, self.policy_net)
        return action1, action2, [comm1, comm2]

    def getStates(self, env):
        screen1 = env.render_env_1d(0)  #.transpose((2, 0, 1))
        screen2 = env.render_env_1d(1)  #.transpose((2, 0, 1))
        return torch.from_numpy(screen1).unsqueeze(0).to(
            self.device), torch.from_numpy(screen2).unsqueeze(0).to(
                self.device)

    def saveStates(self, state1, state2, action1, action2, next_state1,
                   next_state2, reward1, reward2, env_id, t=False):
        self.capmem += 2
        if self.pars['ppe'] != '1':
            self.memory.push(state2, action2, next_state2, reward2, state1)
            self.memory.push(state1, action1, next_state1, reward1, state2)
        else:
            self.rnnS1[-1].append(reward1)
            self.rnnS2[-1].append(reward2)
            if self.pars['rec'] == 1 and len(
                    self.rnnS1) < self.rnnB:  #always full steps
                self.rnnS1 = [self.rnnS1[0]] * (self.rnnB - len(self.rnnS1) +
                                                1) + self.rnnS1
                self.rnnS2 = [self.rnnS2[0]] * (self.rnnB - len(self.rnnS2) +
                                                1) + self.rnnS2
                #print(len(self.rnnS1),111)
            #print(len(self.rnnS1[-self.rnnB:]))
            self.memory.store([
                state1, action1, next_state1, reward1, state2,
                self.rnnS1[-self.rnnB:]
            ])
            self.memory.store([
                state2, action2, next_state2, reward2, state1,
                self.rnnS2[-self.rnnB:]
            ])

    def optimize(self):
        self.optimize_model(self.policy_net, self.target_net, self.memory,
                            self.optimizer)

    def getDB(self):
        with open(self.pars['pretrain']) as f:
            data = json.load(f)
        #self.pars['pretrain'] = None
        self.nopr = True
        return data

    def pretrain(self):
        db = self.getDB()
        num_episodes = len(db)
        nr = len(db) - 1
        totalN = len(db)
        sc = 5
        for i_episode in range(num_episodes):
            if i_episode % 10:
                sc -= 1
            sc = max(1, sc)
            if self.job is not None and self.job.stopEx:
                return
            for env_id, env in enumerate(self.envs[:1]):
                for i in db[nr:]:
                    env.getFrom(i[0])
                    state1, state2 = self.getStates(env)
                    env.getFrom(i[3])
                    next_state1, next_state2 = self.getStates(env)
                    action1 = torch.tensor([[i[1][0]]], device=self.device)
                    action2 = torch.tensor([[i[1][1]]], device=self.device)
                    reward1 = torch.tensor([i[2][0] * 1.], device=self.device)
                    reward2 = torch.tensor([i[2][1] * 1.], device=self.device)
                    self.saveStates(state1, state2, action1, action2,
                                    next_state1, next_state2, reward1, reward2,
                                    env_id)
                for t in range((totalN - nr) * sc):
                    self.bufs = [[] for i in range(len(self.envs) * 2)]
                    bs = 1
                    self.h2 = torch.zeros(1,
                                          bs,
                                          self.pars['en'],
                                          device=self.device)
                    self.h1 = torch.zeros(1,
                                          bs,
                                          self.pars['en'],
                                          device=self.device)
                    env.getFrom(db[nr][0])
                    self.buf1 = []
                    self.buf2 = []
                    state1, state2 = self.getStates(env)
                    rt = 0
                    ac = []
                    start_time = time.time()
                    buf1 = []
                    buf2 = []
                    ep1 = []

                    for t in range((totalN - nr)):

                        action1, action2, _ = self.getaction(state1, state2)
                        reward1, reward2 = env.move(
                            action1.item(), action2.item())  #multi envs??
                        rt += reward1 + reward2
                        ac.append(str(action1.item()))

                        reward1 = torch.tensor([
                            reward1 * self.alpha + reward2 * (1 - self.alpha)
                        ],
                                               device=self.device)
                        reward2 = torch.tensor([
                            reward2 * self.alpha + reward1 * (1 - self.alpha)
                        ],
                                               device=self.device)

                        next_state1, next_state2 = self.getStates(env)
                        self.saveStates(state1, state2, action1, action2,
                                        next_state1, next_state2, reward1,
                                        reward2, env_id)

                        state1 = next_state1
                        state2 = next_state2

                        self.optimize()
                        self.updateTarget(i_episode, step=True)
            if i_episode % self.pars['show'] == 0:
                print('ep', i_episode, 'reward train', rt, 'time',
                      time.time() - start_time, ','.join(ac[:20]))
            self.updateTarget(i_episode)
            nr -= 1
            if nr < 0:
                nr = 0

    def getInitState(self):
        return torch.zeros(1, 1, self.pars['en'], device=self.device)

    def train(self, num_episodes):
        if self.pars['pretrain'] is not None and not self.nopr:
            self.pretrain()

        for i_episode in range(num_episodes):
            if self.job is not None and self.job.stopEx:
                return
            self.bufs = [[] for i in range(len(self.envs) * 2)]
            bs = 1
            self.h2 = torch.zeros(1, bs, self.pars['en'], device=self.device)
            self.h1 = torch.zeros(1, bs, self.pars['en'], device=self.device)
            for env_id, env in enumerate(self.envs):
                env.reset()
                self.buf1 = []
                self.buf2 = []
                state1, state2 = self.getStates(env)
                rt = 0
                ac = []
                start_time = time.time()
                buf1 = []
                buf2 = []
                ep1 = []
                self.rnnS1 = []
                self.rnnS2 = []
                self.reset_hx()
                for t in range(self.pars['epsteps']):

                    action1, action2, _ = self.getaction(state1, state2)
                    reward1, reward2 = env.move(action1.item(),
                                                action2.item())  #multi envs??
                    rt += reward1 + reward2
                    ac.append(str(action1.item()))

                    reward1 = torch.tensor(
                        [reward1 * self.alpha + reward2 * (1 - self.alpha)],
                        device=self.device)
                    reward2 = torch.tensor(
                        [reward2 * self.alpha + reward1 * (1 - self.alpha)],
                        device=self.device)

                    next_state1, next_state2 = self.getStates(env)
                    self.saveStates(state1, state2, action1, action2,
                                    next_state1, next_state2, reward1, reward2,
                                    env_id, t == self.pars['epsteps'] - 1)

                    state1 = next_state1
                    state2 = next_state2

                    self.optimize()
                    self.updateTarget(i_episode, step=True)
            if i_episode % self.pars['show'] == 0:
                print('ep', i_episode, 'reward train', rt, 'time',
                      time.time() - start_time, ','.join(ac[:20]))
            self.updateTarget(i_episode)

    def test(self, tries=3, log=True, i_episode=-1):
        rs = []
        rt1 = 0
        ep = []
        #self.policy_net.eval()
        for i in range(tries):
            ep = []
            rt1 = 0
            self.envs[0].reset()
            bs = 1
            self.h2 = torch.zeros(1, bs, self.pars['en'], device=self.device)
            self.h1 = torch.zeros(1, bs, self.pars['en'], device=self.device)
            self.reset_hx()
            for t in range(100):  #sep function
                state1, state2 = self.getStates(self.envs[0])
                action1, action2, r = self.getaction(state1, state2, test=True)
                comm1, comm2 = r
                reward1, reward2 = self.envs[0].move(action1.item(),
                                                     action2.item())
                ep.append([
                    self.envs[0].render_env(),
                    [action1.item(), action2.item()], [reward1, reward2],
                    [
                        comm1.cpu().data.numpy()[0].tolist(),
                        comm2.cpu().data.numpy()[0].tolist()
                    ], [comm1.max(1)[1].item(),
                        comm2.max(1)[1].item()]
                ])
                rt1 += reward1 + reward2
            rs.append(rt1)
        rm = np.mean(rs)
        if log and i_episode > 0:
            if self.job is not None:
                self.job.log({'reward' + self.name: rm, 'ep': i_episode})
            if self.experiment is not None:
                self.experiment.set_step(i_episode)
                self.experiment.log_metric("reward" + self.name, rm)
            save_episode_and_reward_to_csv(self.result_out, self.writer,
                                           i_episode, rt1, ep, self.name,
                                           self.pars)
            if rm > self.maxR:
                self.maxR = rm
                self.save()
                print('saved')
        print('reward test', rm, rs, 'com',
              comm1.cpu().data.numpy()[0].tolist())
        #self.policy_net.train()
        return rm

    def updateTarget(self, i_episode, step=False):
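        # Hard target-network update: copy the online weights every TARGET_UPDATE
        # episodes (a per-step soft update is left commented out below).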
        #soft_update(self.target_net, self.policy_net, tau=0.01)
        if step:
            return
        if i_episode % self.TARGET_UPDATE == 0:
            self.target_net.load_state_dict(self.policy_net.state_dict())

    def reset_hx(self):
        pass

    def save(self):
        torch.save(self.policy_net.state_dict(),
                   self.pars['results_path'] + self.name + '/model')

    def load(self, PATH):
        #torch.cuda.is_available()
        self.policy_net.load_state_dict(
            torch.load(
                PATH,
                map_location='cuda' if torch.cuda.is_available() else 'cpu'))

    def perturb_learning_rate(self, i_episode, nolast=True):
        if nolast:
            new_lr_factor = 10**np.random.normal(scale=1.0)
            new_momentum_delta = np.random.normal(scale=0.1)
            self.EPS_DECAY += np.random.normal(scale=50.0)
            if self.EPS_DECAY < 50:
                self.EPS_DECAY = 50
            if self.prob >= 0:
                self.prob += np.random.normal(scale=0.05) - 0.025
                self.prob = min(max(0, self.prob), 1)
        for param_group in self.optimizer.param_groups:
            if nolast:
                param_group['lr'] *= new_lr_factor
                param_group['momentum'] += new_momentum_delta
            self.momentum = param_group['momentum']
            self.lr = param_group['lr']
        with open(
                os.path.join(self.pars['results_path'] + self.name,
                             'hyper-{}.json').format(i_episode),
                'w') as outfile:
            json.dump(
                {
                    'lr': self.lr,
                    'momentum': self.momentum,
                    'eps_decay': self.EPS_DECAY,
                    'prob': self.prob,
                    'i_episode': i_episode
                }, outfile)

    def clone(self, agent):
        state_dict = agent.policy_net.state_dict()
        self.policy_net.load_state_dict(state_dict)
        state_dict = agent.optimizer.state_dict()
        self.optimizer.load_state_dict(state_dict)
        self.EPS_DECAY = agent.EPS_DECAY
        self.prob = agent.prob
        self.target_net.load_state_dict(self.policy_net.state_dict())
Example #7
    def __init__(self, args, env_params):
        self.s_dim = env_params['o_dim'] + env_params['g_dim']
        self.a_dim = env_params['a_dim']
        self.f_dim = args.f_dim
        self.action_bound = env_params['action_max']
        self.max_timestep = env_params['max_timestep']
        self.max_episode = args.max_episode
        self.evaluate_episode = args.evaluate_episode
        self.evaluate_interval = args.evaluate_interval
        self.log_interval = args.log_interval
        self.save_model_interval = args.save_model_interval
        self.save_model_start = args.save_model_start

        self.lr = args.lr
        self.lr_model = args.lr_model
        self.gamma = args.gamma
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.eta = args.eta
        self.noise_eps = args.noise_eps
        self.device = torch.device(args.device)

        self.normalizer_s = Normalizer(size=self.s_dim,
                                       eps=1e-2,
                                       clip_range=1.)

        self.memory = Memory(size=args.memory_size,
                             s_dim=self.s_dim,
                             a_dim=self.a_dim)

        self.policy = Policy(s_dim=self.s_dim,
                             a_dim=self.a_dim).to(self.device)
        self.policy_target = Policy(s_dim=self.s_dim,
                                    a_dim=self.a_dim).to(self.device)
        self.Q = QFunction(s_dim=self.s_dim, a_dim=self.a_dim).to(self.device)
        self.Q_target = QFunction(s_dim=self.s_dim,
                                  a_dim=self.a_dim).to(self.device)

        self.optimizer_p = optim.Adam(self.policy.parameters(), lr=self.lr)
        self.optimizer_q = optim.Adam(self.Q.parameters(), lr=self.lr)

        self.encoder = StateEncoder(s_dim=self.s_dim,
                                    f_dim=self.f_dim).to(self.device)
        self.EnvForward = ForwardModel(f_dim=self.f_dim,
                                       a_dim=self.a_dim).to(self.device)
        self.EnvInverse = InverseModel(f_dim=self.f_dim,
                                       a_dim=self.a_dim).to(self.device)

        self.optimizer_forward = optim.Adam(
            [{
                'params': self.EnvForward.parameters()
            }, {
                'params': self.encoder.parameters()
            }],
            lr=self.lr_model)
        self.optimizer_inverse = optim.Adam(
            [{
                'params': self.EnvInverse.parameters()
            }, {
                'params': self.encoder.parameters()
            }],
            lr=self.lr_model)

        self.hard_update()

        self.update_num = 0
Example #8
class DDPG_Agent():
    def __init__(self, args, env_params):
        self.s_dim = env_params['o_dim'] + env_params['g_dim']
        self.a_dim = env_params['a_dim']
        self.f_dim = args.f_dim
        self.action_bound = env_params['action_max']
        self.max_timestep = env_params['max_timestep']
        self.max_episode = args.max_episode
        self.evaluate_episode = args.evaluate_episode
        self.evaluate_interval = args.evaluate_interval
        self.log_interval = args.log_interval
        self.save_model_interval = args.save_model_interval
        self.save_model_start = args.save_model_start

        self.lr = args.lr
        self.lr_model = args.lr_model
        self.gamma = args.gamma
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.eta = args.eta
        self.noise_eps = args.noise_eps
        self.device = torch.device(args.device)

        self.normalizer_s = Normalizer(size=self.s_dim,
                                       eps=1e-2,
                                       clip_range=1.)

        self.memory = Memory(size=args.memory_size,
                             s_dim=self.s_dim,
                             a_dim=self.a_dim)

        self.policy = Policy(s_dim=self.s_dim,
                             a_dim=self.a_dim).to(self.device)
        self.policy_target = Policy(s_dim=self.s_dim,
                                    a_dim=self.a_dim).to(self.device)
        self.Q = QFunction(s_dim=self.s_dim, a_dim=self.a_dim).to(self.device)
        self.Q_target = QFunction(s_dim=self.s_dim,
                                  a_dim=self.a_dim).to(self.device)

        self.optimizer_p = optim.Adam(self.policy.parameters(), lr=self.lr)
        self.optimizer_q = optim.Adam(self.Q.parameters(), lr=self.lr)

        self.encoder = StateEncoder(s_dim=self.s_dim,
                                    f_dim=self.f_dim).to(self.device)
        self.EnvForward = ForwardModel(f_dim=self.f_dim,
                                       a_dim=self.a_dim).to(self.device)
        self.EnvInverse = InverseModel(f_dim=self.f_dim,
                                       a_dim=self.a_dim).to(self.device)

        self.optimizer_forward = optim.Adam(
            [{
                'params': self.EnvForward.parameters()
            }, {
                'params': self.encoder.parameters()
            }],
            lr=self.lr_model)
        self.optimizer_inverse = optim.Adam(
            [{
                'params': self.EnvInverse.parameters()
            }, {
                'params': self.encoder.parameters()
            }],
            lr=self.lr_model)

        self.hard_update()

        self.update_num = 0

    def select_action(self, state, train_mode=True):
        s = self.normalize_input(state)
        s = torch.tensor(s, dtype=torch.float32).to(self.device)
        with torch.no_grad():
            action = self.policy(s).cpu().numpy()

        if train_mode:
            action += np.random.randn(
                self.a_dim
            ) * self.noise_eps * self.action_bound  #Gaussian Noise
        else:
            pass

        action = np.clip(action,
                         a_min=-self.action_bound,
                         a_max=self.action_bound)
        return action

    def get_intrisic_reward(self, s, a, s_):
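        # Curiosity-style intrinsic reward: the forward model predicts the next
        # state feature from (phi(s), a); the prediction error
        #   r_i = eta * ||phi_pred(s') - phi(s')||, clipped to [-0.1, 0.1],
        # rewards transitions the model cannot yet predict.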
        s, a, s_ = torch.from_numpy(s).to(
            self.device).float(), torch.from_numpy(a).to(
                self.device).float(), torch.from_numpy(s_).to(
                    self.device).float()
        with torch.no_grad():
            feature = self.encoder(s)
            next_feature_pred = self.EnvForward(feature, a)
            next_feature = self.encoder(s_)
        r_i = self.eta * torch.norm(next_feature_pred - next_feature)
        r_i = torch.clamp(r_i, min=-0.1, max=0.1)
        return r_i.cpu().detach().numpy()

    def train(self, env, logger=None):
        total_step = 0
        loss_pi, loss_q, loss_forward, loss_inverse = 0., 0., 0., 0.
        for i_episode in range(self.max_episode):
            obs = env.reset()
            s = get_state(obs)

            cumulative_r = 0.
            for i_step in range(self.max_timestep):
                a = self.select_action(s)
                obs_, r_e, done, info = env.step(a)
                s_ = get_state(obs_)

                r_i = self.get_intrisic_reward(s, a, s_)
                r = r_e + r_i

                self.memory.store(s, a, r, s_)
                s = s_

                if len(self.memory) > self.batch_size:
                    loss_pi, loss_q, loss_forward, loss_inverse = self.learn()
                cumulative_r += r_e
                total_step += 1

            print(
                'i_episode: {} total step: {} cumulative reward: {:.4f} is_success: {} '
                .format(i_episode, total_step, cumulative_r,
                        info['is_success']))
            if logger is not None and i_episode % self.log_interval == 0:
                logger.add_scalar('Indicator/cumulative reward', cumulative_r,
                                  i_episode)
                logger.add_scalar('Loss/pi_loss', loss_pi, i_episode)
                logger.add_scalar('Loss/q_loss', loss_q, i_episode)
                logger.add_scalar('Loss/forward_loss', loss_forward, i_episode)
                logger.add_scalar('Loss/inverse_loss', loss_inverse, i_episode)
            if i_episode % self.evaluate_interval == 0:
                success_rate = self.evaluate(env)
                if logger is not None:
                    logger.add_scalar('Indicator/success rate', success_rate,
                                      i_episode)

            if i_episode > self.save_model_start and i_episode % self.save_model_interval == 0:
                self.save_model(remarks='{}_{}'.format(env.spec.id, i_episode))

    def evaluate(self, env, render=False):
        success_count = 0
        for i_episode in range(self.evaluate_episode):
            obs = env.reset()
            s = get_state(obs)
            for i_step in range(self.max_timestep):
                if render:
                    env.render()
                a = self.select_action(s, train_mode=False)
                obs_, r_e, done, info = env.step(a)
                s_ = get_state(obs_)
                s = s_
            success_count += info['is_success']

        return success_count / self.evaluate_episode

    def learn(self):
        s, a, r, s_ = self.memory.sample_batch(batch_size=self.batch_size)
        self.normalizer_s.update(s)

        s, s_ = self.normalize_input(s, s_)
        s = torch.from_numpy(s).to(self.device)
        a = torch.from_numpy(a).to(self.device)
        r = torch.from_numpy(r).to(self.device).unsqueeze(dim=1)
        s_ = torch.from_numpy(s_).to(self.device)

        #update policy and Q
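        # DDPG critic target: y = r + gamma * Q_target(s', pi_target(s')),
        # computed under no_grad so only the online Q network receives gradients.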
        with torch.no_grad():
            a_next_tar = self.policy_target(s_)
            Q_next_tar = self.Q_target(s_, a_next_tar)
            loss_q_tar = r + self.gamma * Q_next_tar
        loss_q_pred = self.Q(s, a)
        loss_q = F.mse_loss(loss_q_pred, loss_q_tar.detach())
        self.optimizer_q.zero_grad()
        loss_q.backward()
        self.optimizer_q.step()

        loss_p = -self.Q(s, self.policy(s)).mean()
        self.optimizer_p.zero_grad()
        loss_p.backward()
        self.optimizer_p.step()

        self.soft_update()

        #update env model and encoder
        feature = self.encoder(s)
        next_feature = self.encoder(s_)
        a_pred = self.EnvInverse(feature, next_feature)
        loss_inverse = F.mse_loss(a_pred, a)

        next_feature_pred = self.EnvForward(feature, a)
        with torch.no_grad():
            next_feature_tar = self.encoder(s_)
        loss_forward = F.mse_loss(next_feature_pred, next_feature_tar.detach())

        self.optimizer_forward.zero_grad()
        self.optimizer_inverse.zero_grad()
        loss_forward.backward(retain_graph=True)
        loss_inverse.backward()
        self.optimizer_forward.step()
        self.optimizer_inverse.step()

        self.update_num += 1
        return loss_p.cpu().detach().numpy(), loss_q.cpu().detach().numpy(
        ), loss_forward.cpu().detach().numpy(), loss_inverse.cpu().detach(
        ).numpy()

    def update_normalizer(self, states):
        states = np.array(states, dtype=np.float32)
        self.normalizer_s.update(states)

    def hard_update(self):
        self.policy_target.load_state_dict(self.policy.state_dict())
        self.Q_target.load_state_dict(self.Q.state_dict())

    def soft_update(self):
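        # Polyak averaging of the target parameters:
        #   theta_target <- tau * theta + (1 - tau) * theta_target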
        for param, param_target in zip(self.policy.parameters(),
                                       self.policy_target.parameters()):
            param_target.data.copy_(param.data * self.tau + param_target.data *
                                    (1 - self.tau))
        for param, param_target in zip(self.Q.parameters(),
                                       self.Q_target.parameters()):
            param_target.data.copy_(param.data * self.tau + param_target.data *
                                    (1 - self.tau))

    def normalize_input(self, s, s_=None):
        s = self.normalizer_s.normalize(s)
        if s_ is not None:
            s_ = self.normalizer_s.normalize(s_)
            return s, s_
        else:
            return s

    def save_model(self, remarks):
        if not os.path.exists('pretrained_models_DDPG/'):
            os.mkdir('pretrained_models_DDPG/')
        path = 'pretrained_models_DDPG/{}.pt'.format(remarks)
        print('Saving model to {}'.format(path))
        torch.save([
            self.normalizer_s.mean, self.normalizer_s.std,
            self.policy.state_dict()
        ], path)

    def load_model(self, remark):
        print('Loading models with remark {}'.format(remark))
        self.normalizer_s.mean, self.normalizer_s.std, policy_model = torch.load(
            'pretrained_models_DDPG/{}.pt'.format(remark),
            map_location=lambda storage, loc: storage)
        self.policy.load_state_dict(policy_model)
Example #9

class Agent():
    def __init__(self, name, n_o, n_a):
        self.agent_name = name  #name of the agent: 'predator' or 'prey'
        self.dtype = torch.float
        self.obs_dim = n_o  #observation space dimensions
        self.act_dim = n_a  #action space dimensions
        self.GAMMA = 0.99  #discount factor
        self.ALPHA = 1e-3  #learning rate
        self.BATCH_SIZE = 16  #number of samples in batch
        self.BUFFER_SIZE = 10000
        self.TAU = 0.001
        self.buffer = Memory(self.BUFFER_SIZE)
        self.noise = OU_noise(dt=0.05)
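        # OU_noise (an Ornstein-Uhlenbeck process) supplies temporally correlated
        # exploration noise for the continuous DDPG actions.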

        self.online_actor = Actor(self.obs_dim, self.act_dim)
        self.online_critic = Critic(self.obs_dim, 1)

        self.target_actor = Actor(self.obs_dim, self.act_dim)
        self.target_critic = Critic(self.obs_dim, 1)
        '''Copy Online Parameters to target Parameters'''
        self.target_actor.policy.load_state_dict(
            self.online_actor.policy.state_dict())
        self.target_critic.value_func.load_state_dict(
            self.online_critic.value_func.state_dict())

        self.optim = torch.optim.Adam(
            self.online_critic.value_func.parameters(), lr=self.ALPHA)

        self.state = None  #temporary holding of the state before it is pushed to the replay buffer. 2D tensor [[state]]
        self.act = None  #temporary holding of the action before it is pushed to the replay buffer. 2D tensor [[action]]

    def agent_get_action(self, observation):
        assert isinstance(observation, torch.Tensor), \
            'The state is not a tensor! It is of type: %s' % type(observation)
        self.state = observation

        #get the action using online actor network and add noise for exploration
        action_d = self.online_actor.get_action(self.state) + self.noise.step()
        self.act = action_d.mean().unsqueeze(0).unsqueeze(1)
        #return action to world
        return action_d.detach().numpy()

    def agent_train(self, ns, r, done=False):
        #convert next state and reward to tensors
        #next_state_v = torch.tensor([next_state],dtype=dtype)
        #reward_v = torch.tensor([reward],dtype=dtype)

        #save the values in the replay buffer
        self.buffer.push(self.state, self.act, r, ns, done)
        #set the state to the next state to advance agent
        self.state = ns

        #if there are enough samples in replay buffer, perform network updates
        if len(self.buffer) >= self.BUFFER_SIZE:
            #get a mini batch from the replay buffer
            sample = self.buffer.sample(self.BATCH_SIZE)
            #make the data nice
            compressed_states, compressed_actions, compressed_next_states, compressed_rewards = utils.extract_data(
                sample)

            # Critic network training.
            # Target: y_t = r(s_t, a_t) + GAMMA * Q_target(s_{t+1}, mu_target(s_{t+1}))
            na_from_tactor_a = self.target_actor.get_action(
                compressed_next_states)
            na_from_tactor = na_from_tactor_a.mean(dim=1).unsqueeze(-1)
            v_from_tcritic = self.target_critic.get_state_value(
                compressed_next_states, na_from_tactor)

            # Calculate y_t = r(s_t, a_t) + GAMMA * Q_target(s_{t+1}, mu_target(s_{t+1}))
            target_v = compressed_rewards.unsqueeze(
                1) + self.GAMMA * v_from_tcritic
            actual_v = self.online_critic.get_state_value(
                compressed_states, compressed_actions)
            loss = nn.MSELoss()
            output = loss(actual_v, target_v)
            self.optim.zero_grad()
            output.backward(retain_graph=True)
            self.optim.step()

            self.online_critic.value_func.zero_grad()

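            # Deterministic policy gradient, applied sample by sample:
            #   grad_theta J ~= (1/N) * sum_i grad_a Q(s_i, a)|_{a=mu(s_i)} * grad_theta mu(s_i),
            # implemented below with manual parameter updates instead of an actor optimizer.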
            for s, a in zip(compressed_states.split(1),
                            compressed_actions.split(1)):
                # Detach the split views into leaf tensors so gradients w.r.t.
                # the state and action inputs can be requested below.
                s = s.detach().requires_grad_(True)
                a = a.detach().requires_grad_(True)
                online_v = self.online_critic.get_state_value(s, a)
                grad_wrt_a = torch.autograd.grad(online_v, (s, a))

                action = self.online_actor.get_action(s)
                action.mean().backward(retain_graph=True)

                for param in self.online_actor.policy.parameters():
                    param.data += self.ALPHA * (
                        param.grad * grad_wrt_a[1].item()) / (self.BATCH_SIZE)

                self.online_actor.policy.zero_grad()
                self.online_critic.value_func.zero_grad()

            # Soft (Polyak) update of the target networks:
            #   theta_target <- TAU * theta + (1 - TAU) * theta_target

            for param_o, param_t in zip(self.online_actor.policy.parameters(),
                                        self.target_actor.policy.parameters()):
                param_t.data = param_o.data * self.TAU + param_t.data * (
                    1 - self.TAU)

            for param_o, param_t in zip(
                    self.online_critic.value_func.parameters(),
                    self.target_critic.value_func.parameters()):
                param_t.data = param_o.data * self.TAU + param_t.data * (
                    1 - self.TAU)

            self.online_actor.policy.zero_grad()
            self.target_actor.policy.zero_grad()
            self.online_critic.value_func.zero_grad()
            self.target_critic.value_func.zero_grad()

            torch.save(self.target_actor.policy.state_dict(),
                       self.agent_name + 'target_actor_state_1.pt')
            torch.save(self.target_critic.value_func.state_dict(),
                       self.agent_name + 'target_critic_state_1.pt')
Example #10
def train_a2c(env_name=ENV_NAME,
              iterations=ITERATIONS,
              gamma=GAMMA,
              a_lr=A_LR,
              c_lr=C_LR,
              stats_freq=STATS_FREQ,
              batch_size=BATCH_SIZE,
              reward_done=REWARD_DONE,
              num_target_updates=NUM_TARGET_UPDATES,
              num_critic_updates=NUM_CRITIC_UPDATES,
              normalize_adv=NORMALIZE_ADV,
              filename=FILENAME,
              use_gpu=USE_GPU):
    if use_gpu and torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    env = gym.make(env_name)
    discrete = isinstance(env.action_space, gym.spaces.Discrete)
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    actor = Actor(ob_dim, ac_dim, discrete)
    critic = Critic(ob_dim)

    actor.to(device)
    critic.to(device)

    actor_optimizer = torch.optim.Adam(actor.parameters(), lr=a_lr)
    critic_optimizer = torch.optim.Adam(critic.parameters(), lr=c_lr)
    logger = StatsLogger()
    stats = []
    time_list = []
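    # A2C training loop: collect an on-policy batch, fit the critic and compute
    # advantages, then take a policy-gradient step on the actor.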
    for i in range(iterations):
        time = datetime.datetime.now()

        buffer = Memory()
        collect_batch(env, actor, buffer, batch_size, device)
        advantages = update_critic(critic,
                                   critic_optimizer,
                                   buffer,
                                   gamma=gamma,
                                   num_target_updates=num_target_updates,
                                   num_critic_updates=num_critic_updates,
                                   device=device)

        update_actor(actor,
                     actor_optimizer,
                     advantages,
                     buffer,
                     normalize_adv=normalize_adv)

        running_reward = logger.calc_running_reward(buffer)

        time_after = datetime.datetime.now()
        time_diff = time_after - time
        time_list.append(time_diff.total_seconds())

        if not i % stats_freq:
            logger.print_running_reward(i)
            stats.append([i, logger.running_reward])
            print('Average iteration is',
                  sum(time_list) / len(time_list), 'seconds')
            time_list = []

        if reward_done is not None and running_reward >= reward_done:
            logger.task_done(i)
            break

        if filename is not None:
            with open(filename + '_logs.pkl', 'wb') as f:
                pkl.dump(stats, f)

    if filename is not None:
        torch.save(actor.state_dict(), filename + '_model.pt')
        torch.save(critic.state_dict(), filename + '_critic_model.pt')