Example #1
class DDPGG(DDPG):
    def __init__(self, args, env, env_test, logger):
        super(DDPGG, self).__init__(args, env, env_test, logger)

    def init(self, args, env):
        names = ['s0', 'a', 's1', 'r', 't', 'g']
        metrics = ['loss_dqn', 'loss_actor']
        self.buffer = ReplayBuffer(limit=int(1e6),
                                   names=names.copy(),
                                   args=args)
        self.actorCritic = ActorCriticDDPGG(args, env)
        for metric in metrics:
            self.metrics[metric] = 0

    def train(self):

        if self.buffer.nb_entries > self.batch_size:
            exp = self.buffer.sample(self.batch_size)
            targets_dqn = self.actorCritic.get_targets_dqn(
                exp['r'], exp['t'], exp['s1'], exp['g'])
            inputs = [exp['s0'], exp['a'], exp['g'], targets_dqn]
            loss_dqn = self.actorCritic.trainQval(inputs)
            action, criticActionGrads, invertedCriticActionGrads = self.actorCritic.trainActor(
                [exp['s0'], exp['g']])
            self.metrics['loss_dqn'] += np.squeeze(loss_dqn)
            self.actorCritic.target_train()

    def make_input(self, state, mode):
        if mode == 'train':
            input = [np.expand_dims(i, axis=0) for i in [state, self.env.goal]]
        else:
            input = [
                np.expand_dims(i, axis=0) for i in [state, self.env_test.goal]
            ]
        return input

    def reset(self):

        if self.trajectory:
            self.env.end_episode(self.trajectory)
            for expe in self.trajectory:
                self.buffer.append(expe.copy())
            if self.args['--her'] != '0':
                augmented_ep = self.env.augment_episode(self.trajectory)
                for e in augmented_ep:
                    self.buffer.append(e)
            self.trajectory.clear()

        state = self.env.reset()
        self.episode_step = 0

        return state
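Every example on this page leans on a ReplayBuffer with the same small surface: an nb_entries count, append(experience), and sample(batch_size), keyed by the names list passed at construction. The buffer itself is not shown here, so the following is only a minimal sketch of that assumed interface, not the repository's actual class (which also takes args or N in some examples).

import numpy as np

class SimpleReplayBuffer:
    """Minimal sketch of the buffer interface the agents above assume."""

    def __init__(self, limit, names):
        self.limit = int(limit)
        self.names = list(names)
        self._storage = []

    @property
    def nb_entries(self):
        return len(self._storage)

    def append(self, experience):
        # Keep only the declared fields; evict the oldest entry when full.
        self._storage.append({k: experience[k] for k in self.names})
        if len(self._storage) > self.limit:
            self._storage.pop(0)

    def sample(self, batch_size):
        # Uniform sampling, returned column-wise as {name: array},
        # which is how the train() methods above index exp['s0'], exp['a'], ...
        idxs = np.random.randint(0, len(self._storage), size=batch_size)
        return {name: np.array([self._storage[i][name] for i in idxs])
                for name in self.names}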
Example #2
class Qoff(Agent):
    def __init__(self, args, env, env_test, logger):
        super(Qoff, self).__init__(args, env, env_test, logger)
        self.args = args
        self.gamma = 0.99
        self.lr = 0.1
        self.names = ['state0', 'action', 'state1', 'reward', 'terminal']
        self.init(args, env)

    def init(self, args, env):
        self.critic = np.zeros(shape=(5, 5, 4))
        self.buffer = ReplayBuffer(limit=int(1e6), names=self.names)

    def train(self):
        if self.buffer.nb_entries > self.batch_size:
            exp = self.buffer.sample(self.batch_size)
            s0, a0, s1, r, t = [exp[name] for name in self.names]
            for k in range(self.batch_size):
                target = r[k] + (1 - t[k]) * self.gamma * np.max(
                    self.critic[tuple(s1[k])])
                self.critic[tuple(s0[k])][a0[k]] = (
                    self.lr * target
                    + (1 - self.lr) * self.critic[tuple(s0[k])][a0[k]])

    def act(self, state):
        if np.random.rand() < 0.2:
            action = np.random.randint(self.env.action_space.n)
        else:
            action = np.argmax(self.critic[tuple(state)])
        return action

    def reset(self):

        if self.trajectory:
            self.env.processEp(self.trajectory)
            for expe in reversed(self.trajectory):
                self.buffer.append(expe.copy())
            self.trajectory.clear()

        state = self.env.reset()
        self.episode_step = 0

        return state
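Qoff.train applies the standard off-policy Q-learning update on a 5x5x4 table, written as the convex mix Q <- lr * target + (1 - lr) * Q. A standalone numeric sketch of a single update, with values chosen purely for illustration:

import numpy as np

gamma, lr = 0.99, 0.1
critic = np.zeros((5, 5, 4))      # Q[row, col, action]

s0, a0, s1 = (0, 0), 2, (0, 1)    # one illustrative transition
r, t = 1.0, 0                     # reward and terminal flag

target = r + (1 - t) * gamma * np.max(critic[s1])
critic[s0][a0] = lr * target + (1 - lr) * critic[s0][a0]
print(critic[s0][a0])             # 0.1, i.e. the equivalent of Q += lr * (target - Q)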
Example #3
class Agent():
    def __init__(self, s_dim, num_actions, lr):
        self.step = 0
        self.epStep = 0
        self.ep = 0
        self.tutorListened = True
        self.tutorInput = ''
        self.sDim = s_dim
        self.num_actions = num_actions
        self.learning_rate = lr
        self.names = ['state0', 'action', 'feedback', 'fWeight']
        self.buffer = ReplayBuffer(limit=int(1e6), names=self.names)
        self.batchSize = 64
        self.episode = deque(maxlen=400)
        self.model = self.create_model()

    def create_model(self):
        state = Input(shape=self.sDim)
        action = Input(shape=(1,), dtype='uint8')
        l1 = Dense(400, activation="relu")(state)
        feedback = Dense(self.num_actions, activation=None, kernel_initializer='random_uniform')(l1)
        feedback = Reshape((1, self.num_actions))(feedback)
        mask = Lambda(K.one_hot, arguments={'num_classes': self.num_actions},
                      output_shape=(self.num_actions,))(action)
        feedback = multiply([feedback, mask])
        feedback = Lambda(K.sum, arguments={'axis': 2})(feedback)
        feedbackModel = Model(inputs=[state, action], outputs=feedback)
        feedbackModel.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return feedbackModel

    def train(self):
        loss = 0
        if self.buffer.nb_entries > self.batchSize:
            samples = self.buffer.sample(self.batchSize)
            s, a, targets, weights = [np.array(samples[name]) for name in self.names]
            loss = self.model.train_on_batch(x=[s,a], y=targets, sample_weight=weights)
        return loss

    def tutorListener(self):
        self.tutorInput = input("> ")
        print("maybe updating...the kbdInput variable is: {}".format(self.tutorInput))
        self.tutorListened = True

    def run(self):
        state0 = np.random.randint(0, 4, size=(5,))
        while self.step < 100000:

            if self.tutorInput != '':
                print("Received new keyboard Input. Setting playing ID to keyboard input value")
                for i in range(1,10):
                    self.episode[-i]['fWeight'] = 1
                    self.episode[-i]['feedback'] = self.tutorInput
                self.tutorInput = ''
            else:
                action = np.random.randint(self.num_actions)
                state1 = np.random.randint(0, 4, size=(5,))
                self.step += 1
                self.epStep += 1
                experience = {'state0': state0, 'action': action, 'fWeight': 0}
                self.episode.append(experience)
                self.loss = self.train()
                state0 = state1
                time.sleep(0.001)

            if self.tutorListened:
                self.tutorListened = False
                self.listener = Thread(target=self.tutorListener)
                self.listener.start()

            if self.epStep >= 200:
                if self.ep > 0:
                    for s in range(self.epStep):
                        exp = self.episode.popleft()
                        if exp['fWeight'] != 0:
                            self.buffer.append(exp)
                self.epStep = 0
                self.ep += 1
                state0 = np.random.randint(0, 4, size=(5,))
            if self.step % 1000 == 0:
                print(self.step, self.loss)

    def input(self):
        while True:
            key = input()
            if key == '+':
                inputStep = self.step
                time.sleep(2)
                print('input +1, step = ', inputStep)
            elif key == '-':
                inputStep = self.step
                time.sleep(2)
                print('input -1, step = ', inputStep)
            else:
                print('wrong input')
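The tutor feedback in run depends on a background thread that blocks on input() while the main loop keeps stepping and merely polls a flag. A stripped-down sketch of that pattern (names and timings are illustrative, not the repository's):

import time
from threading import Thread

feedback = {'value': '', 'listened': True}

def listen():
    # Blocks on stdin without stalling the training loop.
    feedback['value'] = input("> ")
    feedback['listened'] = True

step = 0
while step < 5:                      # run interactively to try it
    if feedback['listened']:
        feedback['listened'] = False
        Thread(target=listen, daemon=True).start()
    if feedback['value']:
        print("got feedback:", feedback['value'])
        feedback['value'] = ''
    step += 1
    time.sleep(1.0)                  # the main loop never waits on the keyboard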
Example #4
class DQNAgent():
    '''
    Agent class. It controls all of the agent's functionality.
    '''
    rewards = []
    total_reward = 0
    birth_time = 0
    n_iter = 0
    n_games = 0
    ts_frame = 0
    ts = time.time()

    Memory = namedtuple('Memory',
                        ['obs', 'action', 'new_obs', 'reward', 'done'],
                        rename=False)

    def __init__(self, env, device, hyperparameters, summary_writer=None):
        '''
        Agent initialization. It creates the CentralControl that controls all
        the low-level operations.
        '''

        # The CentralControl is the 'brain' of the agent
        self.cc = CentralControl(env.observation_space.shape,
                                 env.action_space.n, hyperparameters['gamma'],
                                 hyperparameters['n_multi_step'],
                                 hyperparameters['double_DQN'],
                                 hyperparameters['noisy_net'],
                                 hyperparameters['dueling'], device)

        self.cc.set_optimizer(hyperparameters['learning_rate'])

        self.birth_time = time.time()

        self.iter_update_target = hyperparameters['n_iter_update_target']
        self.buffer_start_size = hyperparameters['buffer_start_size']

        self.epsilon_start = hyperparameters['epsilon_start']
        self.epsilon = hyperparameters['epsilon_start']
        self.epsilon_decay = hyperparameters['epsilon_decay']
        self.epsilon_final = hyperparameters['epsilon_final']

        self.accumulated_loss = []
        self.device = device

        # initialize the replay buffer (i.e. the memory) of the agent
        self.replay_buffer = ReplayBuffer(hyperparameters['buffer_capacity'],
                                          hyperparameters['n_multi_step'],
                                          hyperparameters['gamma'])
        self.summary_writer = summary_writer

        self.noisy_net = hyperparameters['noisy_net']

        self.env = env

    def act(self, obs):
        '''
        Greedy action output by the NN in the CentralControl.
        '''
        return self.cc.get_max_action(obs)

    def act_eps_greedy(self, obs):
        '''
        Epsilon-greedy action.
        '''

        # In case of a noisy net, it takes a greedy action
        if self.noisy_net:
            return self.act(obs)

        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        else:
            return self.act(obs)

    def add_env_feedback(self, obs, action, new_obs, reward, done):
        '''
        Acquire new feedback from the environment. The feedback consists of
        the new observation, the reward and the done boolean.
        '''

        # Create the new memory and update the buffer
        new_memory = self.Memory(obs=obs,
                                 action=action,
                                 new_obs=new_obs,
                                 reward=reward,
                                 done=done)
        self.replay_buffer.append(new_memory)

        # update the variables
        self.n_iter += 1
        # decrease epsilon
        self.epsilon = max(
            self.epsilon_final,
            self.epsilon_start - self.n_iter / self.epsilon_decay)
        self.total_reward += reward

    def sample_and_optimize(self, batch_size):
        '''
        Sample batch_size memories from the buffer and optimize them.
        '''

        if len(self.replay_buffer) > self.buffer_start_size:
            # sample
            mini_batch = self.replay_buffer.sample(batch_size)
            # optimize
            l_loss = self.cc.optimize(mini_batch)
            self.accumulated_loss.append(l_loss)

        # update target NN
        if self.n_iter % self.iter_update_target == 0:
            self.cc.update_target()

    def reset_stats(self):
        '''
        Reset the agent's statistics.
        '''
        self.rewards.append(self.total_reward)
        self.total_reward = 0
        self.accumulated_loss = []
        self.n_games += 1

    def print_info(self):
        '''
        Print information about the agent.
        '''
        fps = (self.n_iter - self.ts_frame) / (time.time() - self.ts)
        print('%d %d rew:%d mean_rew:%.2f eps:%.2f, fps:%d, loss:%.4f' %
              (self.n_iter, self.n_games, self.total_reward,
               np.mean(self.rewards[-40:]), self.epsilon, fps,
               np.mean(self.accumulated_loss)))

        self.ts_frame = self.n_iter
        self.ts = time.time()

        if self.summary_writer is not None:
            self.summary_writer.add_scalar('reward', self.total_reward,
                                           self.n_games)
            self.summary_writer.add_scalar('mean_reward',
                                           np.mean(self.rewards[-40:]),
                                           self.n_games)
            self.summary_writer.add_scalar('10_mean_reward',
                                           np.mean(self.rewards[-10:]),
                                           self.n_games)
            self.summary_writer.add_scalar('epsilon', self.epsilon,
                                           self.n_games)
            self.summary_writer.add_scalar('loss',
                                           np.mean(self.accumulated_loss),
                                           self.n_games)
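add_env_feedback decays epsilon linearly with the iteration count and clips it at epsilon_final. A quick numeric sketch of that schedule with made-up hyperparameters:

epsilon_start, epsilon_final, epsilon_decay = 1.0, 0.02, 10000   # illustrative values

for n_iter in (0, 2500, 5000, 10000, 50000):
    epsilon = max(epsilon_final, epsilon_start - n_iter / epsilon_decay)
    print(n_iter, round(epsilon, 3))
# 0 -> 1.0, 2500 -> 0.75, 5000 -> 0.5, then clipped at the 0.02 floor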
Example #5
class PlayroomGM(Wrapper):
    def __init__(self, env, args):
        super(PlayroomGM, self).__init__(env)

        self.gamma = float(args['--gamma'])
        self.eps = float(args['--eps'])
        self.demo_f = [int(f) for f in args['--demo'].split(',')]

        self.feat = np.array([int(f) for f in args['--features'].split(',')])
        self.N = self.feat.shape[0]
        vs = np.zeros(shape=(self.N, self.state_dim[0]))
        vs[np.arange(self.N), self.feat] = 1
        self.vs = vs / np.sum(vs, axis=1, keepdims=True)
        self.R = 100
        self.idx = -1
        self.v = np.zeros(shape=(self.state_dim[0], 1))
        self.g = np.ones(shape=(self.state_dim[0]))
        self.queues = [CompetenceQueue() for _ in range(self.N)]
        self.names = ['s0', 'r0', 'a', 's1', 'r1', 'g', 'v', 'o', 'u']
        self.buffer = ReplayBuffer(limit=int(1e5), names=self.names, N=self.N)

    def reset(self, exp):
        self.idx, self.v = self.sample_v(exp['s0'])
        exp['g'] = self.g
        exp['v'] = self.v
        return exp

    def get_r(self, s, g, v):
        return self.R * np.sum(np.multiply(v, s == g), axis=1, keepdims=True)

    def sample_v(self, s):
        remaining_v = [i for i in range(self.N) if s[self.feat[i]] != 1]
        probs = self.get_probs(idxs=remaining_v, eps=self.eps)
        idx = np.random.choice(remaining_v, p=probs)
        v = self.vs[idx]
        return idx, v

    def sampleT(self, batch_size):
        idxs = [
            i for i in range(self.N)
            if self.buffer._tutorBuffers[i]._numsamples > batch_size
        ]
        probs = self.get_probs(idxs=idxs, eps=self.eps)
        t = np.random.choice(idxs, p=probs)
        samples = self.buffer.sampleT(batch_size, t)
        return samples, t

    def end_episode(self, episode):
        term = episode[-1]['r1'][self.idx] == self.R
        self.queues[self.idx].process_ep(episode, term)
        base_util = np.zeros(shape=(self.N, ))
        base_util[self.idx] = 1
        self.process_trajectory(episode, base_util=base_util)

    def process_trajectory(self, trajectory, base_util=None):
        if base_util is None:
            u = np.zeros(shape=(self.N, ))
        else:
            u = base_util
        u = np.expand_dims(u, axis=1)
        # mcr = np.zeros(shape=(self.N,))
        for exp in reversed(trajectory):
            u = self.gamma * u
            u[np.where(exp['r1'] > exp['r0'])] = 1

            # u_idx = np.where(u != 0)
            # mcr[u_idx] = exp['r1'][u_idx] + self.gamma * mcr[u_idx]
            exp['u'] = u.squeeze()
            # exp['mcr'] = mcr
            if any(u != 0):
                self.buffer.append(exp.copy())

    # def sample(self, batchsize):
    #     probs = self.get_probs(idxs=range(self.N), eps=self.eps2)
    #     idx = np.random.choice(self.N, p=probs)
    #     samples = self.buffer.sample(batchsize, idx)
    #     if samples is not None:
    #         self.queues[idx].process_samples(samples)
    #     return idx, samples
    #
    # def sampleT(self, batchsize):
    #     probs = self.get_probs(idxs=range(self.N), eps=self.eps3)
    #     idx = np.random.choice(self.N, p=probs)
    #     samples = self.buffer.sampleT(batchsize, idx)
    #     if samples is not None:
    #         self.queues[idx].process_samplesT(samples)
    #     return idx, samples

    def get_demo(self):
        demo = []
        exp = {}
        exp['s0'] = self.env.reset()
        exp['r0'] = self.get_r(exp['s0'], self.g, self.vs).squeeze()
        exp['g'] = self.g
        task = np.random.choice(self.demo_f)
        exp['v'] = self.vs[list(self.feat).index(task)]
        while True:
            a, done = self.opt_action(task)
            if done:
                break
            else:
                exp['a'] = np.expand_dims(a, axis=1)
                exp['s1'] = self.env.step(exp['a'], True)[0]
                exp['r1'] = self.get_r(exp['s1'], self.g, self.vs).squeeze()
                exp['o'] = 1
                demo.append(exp.copy())
                exp['s0'] = exp['s1']
                exp['r0'] = exp['r1']

        return demo, task

    def opt_action(self, t):
        return self.env.opt_action(t)

    def get_stats(self):
        stats = {}
        for i, f in enumerate(self.feat):
            self.queues[i].update()
            for key, val in self.queues[i].get_stats().items():
                stats[key + str(f)] = val
            self.queues[i].init_stat()
        return stats

    def get_cps(self):
        return [np.maximum(abs(q.CP + 0.05) - 0.05, 0) for q in self.queues]

    def get_probs(self, idxs, eps):
        cps = self.get_cps()
        vals = [cps[idx] for idx in idxs]
        l = len(vals)
        s = np.sum(vals)
        if s == 0:
            probs = [1 / l] * l
        else:
            probs = [eps / l + (1 - eps) * v / s for v in vals]
        return probs

    @property
    def state_dim(self):
        return (8,)

    @property
    def goal_dim(self):
        return (8,)

    @property
    def action_dim(self):
        return 5
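get_probs turns the competence-progress (CP) values into a sampling distribution by mixing a uniform term eps / l with a proportional term (1 - eps) * v / s. A small worked example with arbitrary CP values:

import numpy as np

eps = 0.4
cps = [0.0, 0.1, 0.3]          # illustrative competence-progress values
s, l = sum(cps), len(cps)

probs = [eps / l + (1 - eps) * v / s for v in cps]
print(probs, sum(probs))       # [0.133..., 0.283..., 0.583...], sums to 1
task = np.random.choice(len(cps), p=probs)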
Example #6
class DDPG(Agent):
    def __init__(self, args, env, env_test, logger):
        super(DDPG, self).__init__(args, env, env_test, logger)
        self.args = args
        self.init(args, env)
        for metric in self.critic.model.metrics_names:
            self.metrics[self.critic.model.name + '_' + metric] = 0

    def init(self, args, env):
        names = ['state0', 'action', 'state1', 'reward', 'terminal']
        self.buffer = ReplayBuffer(limit=int(1e6), names=names.copy())
        self.actorCritic = ActorCriticDDPG(args, env)
        # __init__ and train() below rely on separate actor and critic
        # networks, so they need to be instantiated here as well:
        self.critic = CriticDDPG(args, env)
        self.actor = ActorDDPG(args, env)

    def train(self):

        if self.buffer.nb_entries > self.batch_size:
            exp = self.buffer.sample(self.batch_size)
            s0, a0, s1, r, t = [exp[name] for name in self.buffer.names]
            a1 = self.actor.target_model.predict_on_batch(s1)
            a1 = np.clip(a1, self.env.action_space.low,
                         self.env.action_space.high)
            q = self.critic.Tmodel.predict_on_batch([s1, a1])
            targets = r + (1 - t) * self.critic.gamma * np.squeeze(q)
            targets = np.clip(targets, self.env.minR / (1 - self.critic.gamma),
                              self.env.maxR)
            inputs = [s0, a0]
            loss = self.critic.model.train_on_batch(inputs, targets)
            for i, metric in enumerate(self.critic.model.metrics_names):
                self.metrics[metric] += loss[i]

            # a2 = self.actor.model.predict_on_batch(s0)
            # grads = self.critic.gradsModel.predict_on_batch([s0, a2])
            # low = self.env.action_space.low
            # high = self.env.action_space.high
            # for d in range(grads[0].shape[0]):
            #     width = high[d] - low[d]
            #     for k in range(self.batch_size):
            #         if grads[k][d] >= 0:
            #             grads[k][d] *= (high[d] - a2[k][d]) / width
            #         else:
            #             grads[k][d] *= (a2[k][d] - low[d]) / width
            # self.actor.train(s0, grads)

            self.actor.target_train()
            self.critic.target_train()

    def reset(self):

        if self.trajectory:
            T = int(self.trajectory[-1]['terminal'])
            R = np.sum([
                self.env.unshape(exp['reward'], exp['terminal'])
                for exp in self.trajectory
            ])
            S = len(self.trajectory)
            self.env.processEp(R, S, T)
            for expe in reversed(self.trajectory):
                self.buffer.append(expe.copy())

            self.trajectory.clear()

        state = self.env.reset()
        self.episode_step = 0

        return state

    def make_input(self, state):
        input = [np.reshape(state, (1, self.actor.s_dim[0]))]
        return input

    def act(self, state):
        input = self.make_input(state)
        action = self.actor.model.predict(input, batch_size=1)
        noise = np.random.normal(0., 0.1, size=action.shape)
        action = noise + action
        action = np.clip(action, self.env.action_space.low,
                         self.env.action_space.high)
        action = action.squeeze()
        return action
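The critic target in train is the clipped one-step bootstrap r + (1 - t) * gamma * Q_target(s1, mu_target(s1)), bounded below by minR / (1 - gamma). The sketch below reproduces just that arithmetic, with NumPy arrays standing in for the target networks' outputs:

import numpy as np

gamma, minR, maxR = 0.99, -1.0, 0.0            # illustrative reward bounds
r = np.array([-1.0, -1.0, -1.0, -1.0])         # per-step penalty rewards
t = np.array([0, 0, 1, 0])                     # terminal flags
q_next = np.array([-3.2, -5.0, -7.1, -0.5])    # stand-in for Q_target(s1, a1)

targets = r + (1 - t) * gamma * q_next
targets = np.clip(targets, minR / (1 - gamma), maxR)
print(targets)    # the terminal transition keeps only its reward, -1.0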
Example #7
class DQNAgent():
    """Deep Q-learning agent."""

    # def __init__(self,
    # env, device=DEVICE, summary_writer=writer,  # noqa
    # hyperparameters=DQN_HYPERPARAMS):  # noqa

    rewards = []
    total_reward = 0
    birth_time = 0
    n_iter = 0
    n_games = 0
    ts_frame = 0
    ts = time.time()

    # Memory = namedtuple(
    # 'Memory', ['obs', 'action', 'new_obs', 'reward', 'done'],
    # verbose=False, rename=False)
    Memory = namedtuple('Memory',
                        ['obs', 'action', 'new_obs', 'reward', 'done'],
                        rename=False)

    def __init__(self, env, hyperparameters, device, summary_writer=None):
        """Set parameters, initialize network."""

        state_space_shape = env.observation_space.shape
        action_space_size = env.action_space.n

        self.env = env

        self.online_network = DQN(state_space_shape,
                                  action_space_size).to(device)

        self.target_network = DQN(state_space_shape,
                                  action_space_size).to(device)

        # XXX maybe not really necessary?
        self.update_target_network()

        self.experience_replay = None

        self.accumulated_loss = []
        self.device = device

        self.optimizer = optim.Adam(self.online_network.parameters(),
                                    lr=hyperparameters['learning_rate'])

        self.double_DQN = hyperparameters['double_DQN']

        # Discount factor
        self.gamma = hyperparameters['gamma']

        # XXX ???
        self.n_multi_step = hyperparameters['n_multi_step']

        self.replay_buffer = ReplayBuffer(hyperparameters['buffer_capacity'],
                                          hyperparameters['n_multi_step'],
                                          hyperparameters['gamma'])

        self.birth_time = time.time()

        self.iter_update_target = hyperparameters['n_iter_update_target']
        self.buffer_start_size = hyperparameters['buffer_start_size']

        self.summary_writer = summary_writer

        # Greedy search hyperparameters
        self.epsilon_start = hyperparameters['epsilon_start']
        self.epsilon = hyperparameters['epsilon_start']
        self.epsilon_decay = hyperparameters['epsilon_decay']
        self.epsilon_final = hyperparameters['epsilon_final']

    def get_max_action(self, obs):
        '''
        Forward pass of the NN to obtain the action for the given observation.
        '''
        # convert the observation in tensor
        state_t = torch.tensor(np.array([obs])).to(self.device)

        # forward pass
        q_values_t = self.online_network(state_t)

        # get the maximum value of the output (i.e. the best action to take)
        _, act_t = torch.max(q_values_t, dim=1)

        return int(act_t.item())

    def act(self, obs):
        '''
        Greedy action output by the online network.
        '''
        return self.get_max_action(obs)

    def act_eps_greedy(self, obs):
        '''
        Epsilon-greedy action.
        '''

        # In case of a noisy net, it takes a greedy action
        # if self.noisy_net:
        # return self.act(obs)

        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        else:
            return self.act(obs)

    def update_target_network(self):
        """Update target network weights with current online network values."""

        self.target_network.load_state_dict(self.online_network.state_dict())

    def set_optimizer(self, learning_rate):
        self.optimizer = optim.Adam(self.online_network.parameters(),
                                    lr=learning_rate)

    def sample_and_optimize(self, batch_size):
        '''
        Sample batch_size memories from the buffer and optimize them
        '''

        # This should be the part where it waits until it has enough
        # experience
        if len(self.replay_buffer) > self.buffer_start_size:
            # sample
            mini_batch = self.replay_buffer.sample(batch_size)
            # optimize
            # l_loss = self.cc.optimize(mini_batch)
            l_loss = self.optimize(mini_batch)
            self.accumulated_loss.append(l_loss)

        # update target NN
        if self.n_iter % self.iter_update_target == 0:
            self.update_target_network()

    def optimize(self, mini_batch):
        '''
        Optimize the NN
        '''
        # reset the grads
        self.optimizer.zero_grad()
        # calculate the loss of the mini batch
        loss = self._calculate_loss(mini_batch)
        loss_v = loss.item()

        # do backpropagation
        loss.backward()
        # one step of optimization
        self.optimizer.step()

        return loss_v

    def _calculate_loss(self, mini_batch):
        '''
        Calculate the mini batch's MSE loss.
        It also supports the double DQN version.
        '''

        states, actions, next_states, rewards, dones = mini_batch

        # convert the data in tensors
        states_t = torch.as_tensor(states, device=self.device)
        next_states_t = torch.as_tensor(next_states, device=self.device)
        actions_t = torch.as_tensor(actions, device=self.device)
        rewards_t = torch.as_tensor(rewards,
                                    dtype=torch.float32,
                                    device=self.device)

        done_t = torch.as_tensor(dones, dtype=torch.bool,
                                 device=self.device)  # noqa

        # Value of the action taken previously (recorded in actions_t)
        # in states_t
        state_action_values = self.online_network(states_t).gather(
            1, actions_t[:, None]).squeeze(-1)

        # NB gather is a differentiable function

        # Next state value with Double DQN. (i.e. get the value predicted
        # by the target nn, of the best action predicted by the online nn)
        if self.double_DQN:
            double_max_action = self.online_network(next_states_t).max(1)[1]
            double_max_action = double_max_action.detach()
            target_output = self.target_network(next_states_t)

            # NB: [:,None] add an extra dimension
            next_state_values = torch.gather(
                target_output, 1, double_max_action[:, None]).squeeze(-1)

        # Next state value in the normal configuration
        else:
            next_state_values = self.target_network(next_states_t).max(1)[0]

        next_state_values[done_t] = 0.0  # no bootstrapping from terminal states
        next_state_values = next_state_values.detach()  # No backprop

        # Use the Bellman equation
        expected_state_action_values = rewards_t + \
            (self.gamma**self.n_multi_step) * next_state_values

        # compute the loss
        return nn.MSELoss()(state_action_values, expected_state_action_values)

    def reset_stats(self):
        '''
        Reset the agent's statistics
        '''
        self.rewards.append(self.total_reward)
        self.total_reward = 0
        self.accumulated_loss = []
        self.n_games += 1

    def add_env_feedback(self, obs, action, new_obs, reward, done):
        '''
        Acquire a new feedback from the environment. The feedback is
        constituted by the new observation, the reward and the done boolean.
        '''

        # Create the new memory and update the buffer
        new_memory = self.Memory(obs=obs,
                                 action=action,
                                 new_obs=new_obs,
                                 reward=reward,
                                 done=done)

        # Append it to the replay buffer
        self.replay_buffer.append(new_memory)

        # update the variables
        self.n_iter += 1

        # TODO check this...
        # decrease epsilon
        self.epsilon = max(
            self.epsilon_final,
            self.epsilon_start - self.n_iter / self.epsilon_decay)

        self.total_reward += reward

    def print_info(self):
        '''
        Print information about the agent
        '''

        fps = (self.n_iter - self.ts_frame) / (time.time() - self.ts)

        # TODO replace with proper logger
        print('%d %d rew:%d mean_rew:%.2f eps:%.2f, fps:%d, loss:%.4f' %
              (self.n_iter, self.n_games, self.total_reward,
               np.mean(self.rewards[-40:]), self.epsilon, fps,
               np.mean(self.accumulated_loss)))

        self.ts_frame = self.n_iter
        self.ts = time.time()

        if self.summary_writer is not None:
            self.summary_writer.add_scalar('reward', self.total_reward,
                                           self.n_games)
            self.summary_writer.add_scalar('mean_reward',
                                           np.mean(self.rewards[-40:]),
                                           self.n_games)
            self.summary_writer.add_scalar('10_mean_reward',
                                           np.mean(self.rewards[-10:]),
                                           self.n_games)
            self.summary_writer.add_scalar('epsilon', self.epsilon,
                                           self.n_games)
            self.summary_writer.add_scalar('loss',
                                           np.mean(self.accumulated_loss),
                                           self.n_games)
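The double DQN branch of _calculate_loss picks the argmax action with the online network but evaluates it with the target network via gather. A self-contained PyTorch sketch of just that selection step, using tiny linear layers as stand-ins for the real DQNs:

import torch
import torch.nn as nn

obs_dim, n_actions, batch = 4, 3, 5
online = nn.Linear(obs_dim, n_actions)      # stand-in for the online network
target = nn.Linear(obs_dim, n_actions)      # stand-in for the target network

next_states_t = torch.randn(batch, obs_dim)

double_max_action = online(next_states_t).max(1)[1].detach()     # argmax by the online net
target_output = target(next_states_t)
next_state_values = torch.gather(
    target_output, 1, double_max_action[:, None]).squeeze(-1)    # value by the target net
print(next_state_values.shape)              # torch.Size([5])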
Example #8
class DDPG(Agent):
    def __init__(self, args, env, env_test, logger):
        super(DDPG, self).__init__(args, env, env_test, logger)
        self.args = args
        self.init(args, env)

    def init(self, args, env):
        names = ['s0', 'a', 's1', 'r', 't']
        metrics = ['loss_dqn', 'loss_actor']
        self.buffer = ReplayBuffer(limit=int(1e6),
                                   names=names.copy(),
                                   args=args)
        self.actorCritic = ActorCriticDDPG(args, env)
        for metric in metrics:
            self.metrics[metric] = 0

    def train(self):

        if self.buffer.nb_entries > self.batch_size:
            exp = self.buffer.sample(self.batch_size)
            targets_dqn = self.actorCritic.get_targets_dqn(
                exp['r'], exp['t'], exp['s1'])
            inputs = [exp['s0'], exp['a'], targets_dqn]
            loss_dqn = self.actorCritic.trainQval(inputs)
            action, criticActionGrads, invertedCriticActionGrads = self.actorCritic.trainActor(
                [exp['s0']])
            self.metrics['loss_dqn'] += np.squeeze(loss_dqn)

            # a2 = self.actor.model.predict_on_batch(s0)
            # grads = self.critic.gradsModel.predict_on_batch([s0, a2])
            # low = self.env.action_space.low
            # high = self.env.action_space.high
            # for d in range(grads[0].shape[0]):
            #     width = high[d] - low[d]
            #     for k in range(self.batch_size):
            #         if grads[k][d] >= 0:
            #             grads[k][d] *= (high[d] - a2[k][d]) / width
            #         else:
            #             grads[k][d] *= (a2[k][d] - low[d]) / width
            # self.actor.train(s0, grads)

            self.actorCritic.target_train()

    def make_input(self, state, mode):
        input = [np.expand_dims(state, axis=0)]
        return input

    def reset(self):

        if self.trajectory:
            self.env.end_episode(self.trajectory)
            for expe in self.trajectory:
                self.buffer.append(expe.copy())

            self.trajectory.clear()

        state = self.env.reset()
        self.episode_step = 0

        return state

    def act(self, state, mode='train'):
        input = self.make_input(state, mode)
        action = self.actorCritic.action(input)[0]
        if mode == 'train':
            noise = np.random.normal(0., 0.1, size=action[0].shape)
            action = noise + action
        action = np.clip(action, self.env.action_space.low,
                         self.env.action_space.high)
        action = action.squeeze()
        return action

    def save_model(self):
        self.actorCritic.actionModel.save(os.path.join(self.logger.get_dir(),
                                                       'actor_model'),
                                          overwrite=True)
        self.actorCritic.qvalModel.save(os.path.join(self.logger.get_dir(),
                                                     'qval_model'),
                                        overwrite=True)
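In act, training-time exploration adds zero-mean Gaussian noise (sigma 0.1) to the deterministic policy output and clips the result back into the action box. A minimal NumPy sketch of that step, with illustrative bounds:

import numpy as np

low, high = np.array([-1.0, -1.0]), np.array([1.0, 1.0])   # illustrative action bounds
action = np.array([0.95, -0.2])                            # stand-in policy output

noise = np.random.normal(0.0, 0.1, size=action.shape)
action = np.clip(action + noise, low, high)
print(action)    # stays inside [low, high] even if the noise pushed it out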
Example #9
class ACDQNGM(DQNG):
    def __init__(self, args, env, env_test, logger):
        super(ACDQNGM, self).__init__(args, env, env_test, logger)

    def init(self, args, env):
        names = ['s0', 'a', 's1', 'r', 't', 'g', 'm', 'task', 'mcr']
        metrics = ['loss_dqn', 'qval', 'val']
        self.buffer = ReplayBuffer(limit=int(1e6),
                                   names=names.copy(),
                                   args=args)
        self.actorCritic = ActorCriticDQNGM(args, env)
        for metric in metrics:
            self.metrics[metric] = 0
        self.goalcounts = np.zeros((len(self.env.goals), ))

    def train(self):

        if self.buffer.nb_entries > 100 * self.batch_size:

            samples = self.buffer.sample(self.batch_size)
            samples = self.env.augment_samples(samples)
            targets = self.actorCritic.get_targets_dqn(samples['r'],
                                                       samples['t'],
                                                       samples['s1'],
                                                       samples['g'],
                                                       samples['m'])
            inputs = [
                samples['s0'], samples['a'], samples['g'], samples['m'],
                targets
            ]
            metricsCritic = self.actorCritic.trainCritic(inputs)
            self.metrics['loss_dqn'] += np.squeeze(metricsCritic[0])
            self.metrics['qval'] += np.mean(metricsCritic[1])
            self.goalcounts += np.bincount(samples['task'],
                                           minlength=len(self.env.goals))
            metricsActor = self.actorCritic.trainActor(
                [samples['s0'], samples['g'], samples['m']])
            if self.env_step % 1000 == 0:
                print(metricsActor[0], metricsActor[1])
            self.metrics['val'] += np.mean(metricsActor[2])
            self.actorCritic.target_train()

    def get_stats(self):
        sumsamples = np.sum(self.goalcounts)
        if sumsamples != 0:
            for i, goal in enumerate(self.env.goals):
                self.stats['samplecount_{}'.format(goal)] = float(
                    "{0:.3f}".format(self.goalcounts[i] / sumsamples))

    def make_input(self, state, mode):
        if mode == 'train':
            input = [
                np.expand_dims(i, axis=0)
                for i in [state, self.env.goal, self.env.mask]
            ]
        else:
            input = [
                np.expand_dims(i, axis=0)
                for i in [state, self.env_test.goal, self.env_test.mask]
            ]
        return input

    def act(self, exp, mode='train'):
        input = self.make_input(exp['s0'], mode)
        actionProbs = self.actorCritic.probs(input)[0].squeeze()
        # if self.env_step % 1000 == 0: print(actionProbs)
        if mode == 'train':
            action = np.random.choice(range(self.env.action_dim),
                                      p=actionProbs)
        else:
            action = np.argmax(actionProbs)
        prob = actionProbs[action]
        action = np.expand_dims(action, axis=1)
        exp['a'] = action
        # exp['p_a'] = prob
        return exp

    def reset(self):

        if self.trajectory:
            augmented_episode = self.env.end_episode(self.trajectory)
            for expe in augmented_episode:
                self.buffer.append(expe)
            # for expe in self.trajectory:
            #     self.buffer.append(expe.copy())
            # augmented_ep = self.env.augment_episode(self.trajectory)
            # for e in augmented_ep:
            #     self.buffer.append(e)
            self.trajectory.clear()

        state = self.env.reset()
        self.episode_step = 0

        return state

    def get_demo(self, rndprop):
        demo = []
        exp = {}
        exp['s0'] = self.env_test.env.reset()
        # obj = np.random.choice(self.env_test.env.objects)
        # goal = np.random.randint(obj.high[2]+1)
        obj = self.env_test.env.light
        goal = 1
        while True:
            if np.random.rand() < rndprop:
                a = np.random.randint(self.env_test.action_dim)
                done = False
            else:
                a, done = self.env_test.env.opt_action(obj, goal)
            if not done:
                exp['a'] = np.expand_dims(a, axis=1)
                exp['s1'] = self.env_test.env.step(exp['a'])[0]
                demo.append(exp.copy())
                exp['s0'] = exp['s1']
            else:
                break
        return demo

    def demo(self):
        if self.env_step % self.demo_freq == 0:
            for i in range(5):
                demo = self.get_demo(rndprop=0.)
                augmented_demo = self.env.augment_demo(demo)
                for exp in augmented_demo:
                    self.buffer.append(exp)
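act treats the actor output as a categorical distribution: it samples from it during training and takes the argmax at evaluation time. A short sketch of both modes with a made-up probability vector:

import numpy as np

action_dim = 5
actionProbs = np.array([0.1, 0.05, 0.6, 0.05, 0.2])   # illustrative policy output

train_action = np.random.choice(range(action_dim), p=actionProbs)   # stochastic, explores
eval_action = np.argmax(actionProbs)                                # greedy, index 2 here
print(train_action, eval_action)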
Example #10
class DQN(Agent):
    def __init__(self, args, env, env_test, logger):
        super(DQN, self).__init__(args, env, env_test, logger)
        self.args = args
        self.init(args, env)

    def init(self, args, env):
        names = ['state0', 'action', 'state1', 'reward', 'terminal']
        self.buffer = ReplayBuffer(limit=int(1e6), names=names.copy())
        self.critic = CriticDQN(args, env)
        for metric_name in ['loss_dqn', 'qval', 'val']:
            self.metrics[metric_name] = 0

    def train(self):

        if self.buffer.nb_entries > self.batch_size:
            exp = self.buffer.sample(self.batch_size)
            s0, a0, s1, r, t = [exp[name] for name in self.buffer.names]
            targets_dqn = self.critic.get_targets_dqn(r, t, s1)
            inputs = [s0, a0]
            loss = self.critic.criticModel.train_on_batch(inputs, targets_dqn)
            for i, metric in enumerate(self.critic.criticModel.metrics_names):
                self.metrics[metric] += loss[i]

            self.critic.target_train()

    def reset(self):

        if self.trajectory:
            R = np.sum([
                self.env.unshape(exp['reward'], exp['terminal'])
                for exp in self.trajectory
            ])
            self.env.processEp(R)
            for expe in reversed(self.trajectory):
                self.buffer.append(expe.copy())

            if self.args['--imit'] != '0':
                Es = [0]
                for i, expe in enumerate(reversed(self.trajectory)):
                    if self.trajectory[-1]['terminal']:
                        Es[0] = Es[0] * self.critic.gamma + expe['reward']
                        expe['expVal'] = Es[0]
                    else:
                        expe['expVal'] = -self.ep_steps
                    self.bufferImit.append(expe.copy())

            self.trajectory.clear()

        state = self.env.reset()
        self.episode_step = 0

        return state

    def make_input(self, state, mode):
        input = [np.reshape(state, (1, self.critic.s_dim[0]))]
        input.append(np.expand_dims([0.5], axis=0))
        return input

    def act(self, state, mode='train'):
        input = self.make_input(state, mode)
        actionProbs = self.critic.actionProbs(input)
        if mode == 'train':
            action = np.random.choice(range(self.env.action_dim),
                                      p=actionProbs[0].squeeze())
        else:
            action = np.argmax(actionProbs[0])
        return np.expand_dims(action, axis=1)
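The --imit branch of reset walks the trajectory backwards and accumulates Es = gamma * Es + reward, i.e. the discounted return from each step onwards when the episode ended in a terminal success. A small worked example of that backward pass:

import numpy as np

gamma = 0.99
rewards = [0.0, 0.0, 0.0, 1.0]     # illustrative episode, success on the last step

Es = 0.0
returns = []
for r in reversed(rewards):
    Es = Es * gamma + r
    returns.append(Es)
returns.reverse()
print(np.round(returns, 4))        # [0.9703 0.9801 0.99   1.    ]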