class DDPGG(DDPG):

    def __init__(self, args, env, env_test, logger):
        super(DDPGG, self).__init__(args, env, env_test, logger)

    def init(self, args, env):
        names = ['s0', 'a', 's1', 'r', 't', 'g']
        metrics = ['loss_dqn', 'loss_actor']
        self.buffer = ReplayBuffer(limit=int(1e6), names=names.copy(), args=args)
        self.actorCritic = ActorCriticDDPGG(args, env)
        for metric in metrics:
            self.metrics[metric] = 0

    def train(self):
        if self.buffer.nb_entries > self.batch_size:
            exp = self.buffer.sample(self.batch_size)
            targets_dqn = self.actorCritic.get_targets_dqn(
                exp['r'], exp['t'], exp['s1'], exp['g'])
            inputs = [exp['s0'], exp['a'], exp['g'], targets_dqn]
            loss_dqn = self.actorCritic.trainQval(inputs)
            action, criticActionGrads, invertedCriticActionGrads = \
                self.actorCritic.trainActor([exp['s0'], exp['g']])
            self.metrics['loss_dqn'] += np.squeeze(loss_dqn)
            self.actorCritic.target_train()

    def make_input(self, state, mode):
        if mode == 'train':
            input = [np.expand_dims(i, axis=0) for i in [state, self.env.goal]]
        else:
            input = [np.expand_dims(i, axis=0) for i in [state, self.env_test.goal]]
        return input

    def reset(self):
        if self.trajectory:
            self.env.end_episode(self.trajectory)
            for expe in self.trajectory:
                self.buffer.append(expe.copy())
            if self.args['--her'] != '0':
                augmented_ep = self.env.augment_episode(self.trajectory)
                for e in augmented_ep:
                    self.buffer.append(e)
            self.trajectory.clear()
        state = self.env.reset()
        self.episode_step = 0
        return state
class Qoff(Agent):

    def __init__(self, args, env, env_test, logger):
        super(Qoff, self).__init__(args, env, env_test, logger)
        self.args = args
        self.gamma = 0.99
        self.lr = 0.1
        self.names = ['state0', 'action', 'state1', 'reward', 'terminal']
        self.init(args, env)

    def init(self, args, env):
        self.critic = np.zeros(shape=(5, 5, 4))
        self.buffer = ReplayBuffer(limit=int(1e6), names=self.names)

    def train(self):
        if self.buffer.nb_entries > self.batch_size:
            exp = self.buffer.sample(self.batch_size)
            s0, a0, s1, r, t = [exp[name] for name in self.names]
            for k in range(self.batch_size):
                target = r[k] + (1 - t[k]) * self.gamma * np.max(
                    self.critic[tuple(s1[k])])
                self.critic[tuple(s0[k])][a0[k]] = self.lr * target + \
                    (1 - self.lr) * self.critic[tuple(s0[k])][a0[k]]

    def act(self, state):
        if np.random.rand() < 0.2:
            action = np.random.randint(self.env.action_space.n)
        else:
            action = np.argmax(self.critic[tuple(state)])
        return action

    def reset(self):
        if self.trajectory:
            self.env.processEp(self.trajectory)
            for expe in reversed(self.trajectory):
                self.buffer.append(expe.copy())
            self.trajectory.clear()
        state = self.env.reset()
        self.episode_step = 0
        return state
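# The loop in Qoff.train() above applies the standard tabular Q-learning update,
#     Q(s, a) <- (1 - lr) * Q(s, a) + lr * (r + (1 - t) * gamma * max_a' Q(s', a')),
# to every sampled transition. A minimal, self-contained sketch of one such
# update on a toy 5x5 grid with 4 actions (the table shape and the lr/gamma
# values mirror the class above; the transition itself is made up for
# illustration):
import numpy as np

Q = np.zeros((5, 5, 4))                 # one Q-value per (row, col, action)
lr, gamma = 0.1, 0.99

s0, a, r, s1, t = (0, 0), 2, 1.0, (0, 1), 0
target = r + (1 - t) * gamma * np.max(Q[s1])
Q[s0][a] = (1 - lr) * Q[s0][a] + lr * target
print(Q[s0][a])                         # 0.1 after a single update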
class Agent():

    def __init__(self, s_dim, num_actions, lr):
        self.step = 0
        self.epStep = 0
        self.ep = 0
        self.tutorListened = True
        self.tutorInput = ''
        self.sDim = s_dim
        self.num_actions = num_actions
        self.learning_rate = lr
        self.names = ['state0', 'action', 'feedback', 'fWeight']
        self.buffer = ReplayBuffer(limit=int(1e6), names=self.names)
        self.batchSize = 64
        self.episode = deque(maxlen=400)
        self.model = self.create_model()

    def create_model(self):
        state = Input(shape=self.sDim)
        action = Input(shape=(1,), dtype='uint8')
        l1 = Dense(400, activation="relu")(state)
        feedback = Dense(self.num_actions,
                         activation=None,
                         kernel_initializer='random_uniform')(l1)
        feedback = Reshape((1, self.num_actions))(feedback)
        mask = Lambda(K.one_hot,
                      arguments={'num_classes': self.num_actions},
                      output_shape=(self.num_actions,))(action)
        feedback = multiply([feedback, mask])
        feedback = Lambda(K.sum, arguments={'axis': 2})(feedback)
        feedbackModel = Model(inputs=[state, action], outputs=feedback)
        feedbackModel.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return feedbackModel

    def train(self):
        loss = 0
        if self.buffer.nb_entries > self.batchSize:
            samples = self.buffer.sample(self.batchSize)
            s, a, targets, weights = [np.array(samples[name]) for name in self.names]
            loss = self.model.train_on_batch(x=[s, a], y=targets, sample_weight=weights)
        return loss

    def tutorListener(self):
        self.tutorInput = input("> ")
        print("maybe updating...the kbdInput variable is: {}".format(self.tutorInput))
        self.tutorListened = True

    def run(self):
        state0 = np.random.randint(0, 4, size=(5,))
        while self.step < 100000:
            if self.tutorInput != '':
                print("Received new keyboard input. Setting playing ID to keyboard input value")
                for i in range(1, 10):
                    self.episode[-i]['fWeight'] = 1
                    self.episode[-i]['feedback'] = self.tutorInput
                self.tutorInput = ''
            else:
                action = np.random.randint(self.num_actions)
                state1 = np.random.randint(0, 4, size=(5,))
                self.step += 1
                self.epStep += 1
                experience = {'state0': state0, 'action': action, 'fWeight': 0}
                self.episode.append(experience)
                self.loss = self.train()
                state0 = state1
                time.sleep(0.001)
            if self.tutorListened:
                self.tutorListened = False
                self.listener = Thread(target=self.tutorListener)
                self.listener.start()
            if self.epStep >= 200:
                if self.ep > 0:
                    for s in range(self.epStep):
                        exp = self.episode.popleft()
                        if exp['fWeight'] != 0:
                            self.buffer.append(exp)
                self.epStep = 0
                self.ep += 1
                state0 = np.random.randint(0, 4, size=(5,))
            if self.step % 1000 == 0:
                print(self.step, self.loss)

    def input(self):
        while True:
            if input() == '+':
                inputStep = self.step
                time.sleep(2)
                print('input +1, step = ', inputStep)
            elif input() == '-':
                inputStep = self.step
                time.sleep(2)
                print('input -1, step = ', inputStep)
            else:
                print('wrong input')
class DQNAgent():
    '''
    Agent class. It controls all the agent functionalities.
    '''
    rewards = []
    total_reward = 0
    birth_time = 0
    n_iter = 0
    n_games = 0
    ts_frame = 0
    ts = time.time()

    Memory = namedtuple('Memory',
                        ['obs', 'action', 'new_obs', 'reward', 'done'],
                        rename=False)

    def __init__(self, env, device, hyperparameters, summary_writer=None):
        '''
        Agent initialization. It creates the CentralControl that controls
        all the low-level functionalities.
        '''
        # The CentralControl is the 'brain' of the agent
        self.cc = CentralControl(env.observation_space.shape,
                                 env.action_space.n,
                                 hyperparameters['gamma'],
                                 hyperparameters['n_multi_step'],
                                 hyperparameters['double_DQN'],
                                 hyperparameters['noisy_net'],
                                 hyperparameters['dueling'],
                                 device)
        self.cc.set_optimizer(hyperparameters['learning_rate'])
        self.birth_time = time.time()

        self.iter_update_target = hyperparameters['n_iter_update_target']
        self.buffer_start_size = hyperparameters['buffer_start_size']

        self.epsilon_start = hyperparameters['epsilon_start']
        self.epsilon = hyperparameters['epsilon_start']
        self.epsilon_decay = hyperparameters['epsilon_decay']
        self.epsilon_final = hyperparameters['epsilon_final']

        self.accumulated_loss = []
        self.device = device

        # initialize the replay buffer (i.e. the memory) of the agent
        self.replay_buffer = ReplayBuffer(hyperparameters['buffer_capacity'],
                                          hyperparameters['n_multi_step'],
                                          hyperparameters['gamma'])
        self.summary_writer = summary_writer

        self.noisy_net = hyperparameters['noisy_net']
        self.env = env

    def act(self, obs):
        '''
        Greedy action outputted by the NN in the CentralControl
        '''
        return self.cc.get_max_action(obs)

    def act_eps_greedy(self, obs):
        '''
        E-greedy action
        '''
        # In case of a noisy net, it takes a greedy action
        if self.noisy_net:
            return self.act(obs)

        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        else:
            return self.act(obs)

    def add_env_feedback(self, obs, action, new_obs, reward, done):
        '''
        Acquire a new feedback from the environment. The feedback is
        constituted by the new observation, the reward and the done boolean.
        '''
        # Create the new memory and update the buffer
        new_memory = self.Memory(obs=obs, action=action, new_obs=new_obs,
                                 reward=reward, done=done)
        self.replay_buffer.append(new_memory)

        # update the variables
        self.n_iter += 1
        # decrease epsilon
        self.epsilon = max(self.epsilon_final,
                           self.epsilon_start - self.n_iter / self.epsilon_decay)
        self.total_reward += reward

    def sample_and_optimize(self, batch_size):
        '''
        Sample batch_size memories from the buffer and optimize on them
        '''
        if len(self.replay_buffer) > self.buffer_start_size:
            # sample
            mini_batch = self.replay_buffer.sample(batch_size)
            # optimize
            l_loss = self.cc.optimize(mini_batch)
            self.accumulated_loss.append(l_loss)

        # update target NN
        if self.n_iter % self.iter_update_target == 0:
            self.cc.update_target()

    def reset_stats(self):
        '''
        Reset the agent's statistics
        '''
        self.rewards.append(self.total_reward)
        self.total_reward = 0
        self.accumulated_loss = []
        self.n_games += 1

    def print_info(self):
        '''
        Print information about the agent
        '''
        fps = (self.n_iter - self.ts_frame) / (time.time() - self.ts)
        print('%d %d rew:%d mean_rew:%.2f eps:%.2f, fps:%d, loss:%.4f' %
              (self.n_iter, self.n_games, self.total_reward,
               np.mean(self.rewards[-40:]), self.epsilon, fps,
               np.mean(self.accumulated_loss)))
        self.ts_frame = self.n_iter
        self.ts = time.time()

        if self.summary_writer is not None:
            self.summary_writer.add_scalar('reward', self.total_reward, self.n_games)
            self.summary_writer.add_scalar('mean_reward', np.mean(self.rewards[-40:]), self.n_games)
            self.summary_writer.add_scalar('10_mean_reward', np.mean(self.rewards[-10:]), self.n_games)
            self.summary_writer.add_scalar('epsilon', self.epsilon, self.n_games)
            self.summary_writer.add_scalar('loss', np.mean(self.accumulated_loss), self.n_games)
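# add_env_feedback() above anneals epsilon linearly with the number of
# environment steps: eps(n) = max(eps_final, eps_start - n / eps_decay).
# A tiny standalone sketch of that schedule; the three constants below are
# illustrative assumptions, not values taken from this codebase:
epsilon_start, epsilon_final, epsilon_decay = 1.0, 0.02, 10000
for n_iter in (0, 5000, 10000, 20000):
    epsilon = max(epsilon_final, epsilon_start - n_iter / epsilon_decay)
    print(n_iter, epsilon)              # 1.0, 0.5, then clamped at 0.02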
class PlayroomGM(Wrapper):

    def __init__(self, env, args):
        super(PlayroomGM, self).__init__(env)

        self.gamma = float(args['--gamma'])
        self.eps = float(args['--eps'])
        self.demo_f = [int(f) for f in args['--demo'].split(',')]

        self.feat = np.array([int(f) for f in args['--features'].split(',')])
        self.N = self.feat.shape[0]
        vs = np.zeros(shape=(self.N, self.state_dim[0]))
        vs[np.arange(self.N), self.feat] = 1
        self.vs = vs / np.sum(vs, axis=1, keepdims=True)
        self.R = 100
        self.idx = -1
        self.v = np.zeros(shape=(self.state_dim[0], 1))
        self.g = np.ones(shape=(self.state_dim[0]))

        self.queues = [CompetenceQueue() for _ in range(self.N)]
        self.names = ['s0', 'r0', 'a', 's1', 'r1', 'g', 'v', 'o', 'u']
        self.buffer = ReplayBuffer(limit=int(1e5), names=self.names, N=self.N)

    def reset(self, exp):
        self.idx, self.v = self.sample_v(exp['s0'])
        exp['g'] = self.g
        exp['v'] = self.v
        return exp

    def get_r(self, s, g, v):
        return self.R * np.sum(np.multiply(v, s == g), axis=1, keepdims=True)

    def sample_v(self, s):
        remaining_v = [i for i in range(self.N) if s[self.feat[i]] != 1]
        probs = self.get_probs(idxs=remaining_v, eps=self.eps)
        idx = np.random.choice(remaining_v, p=probs)
        v = self.vs[idx]
        return idx, v

    def sampleT(self, batch_size):
        idxs = [i for i in range(self.N)
                if self.buffer._tutorBuffers[i]._numsamples > batch_size]
        probs = self.get_probs(idxs=idxs, eps=self.eps)
        t = np.random.choice(idxs, p=probs)
        samples = self.buffer.sampleT(batch_size, t)
        return samples, t

    def end_episode(self, episode):
        term = episode[-1]['r1'][self.idx] == self.R
        self.queues[self.idx].process_ep(episode, term)
        base_util = np.zeros(shape=(self.N,))
        base_util[self.idx] = 1
        self.process_trajectory(episode, base_util=base_util)

    def process_trajectory(self, trajectory, base_util=None):
        if base_util is None:
            u = np.zeros(shape=(self.N,))
        else:
            u = base_util
        u = np.expand_dims(u, axis=1)
        # mcr = np.zeros(shape=(self.N,))
        for exp in reversed(trajectory):
            u = self.gamma * u
            u[np.where(exp['r1'] > exp['r0'])] = 1
            # u_idx = np.where(u != 0)
            # mcr[u_idx] = exp['r1'][u_idx] + self.gamma * mcr[u_idx]
            exp['u'] = u.squeeze()
            # exp['mcr'] = mcr
            if any(u != 0):
                self.buffer.append(exp.copy())

    # def sample(self, batchsize):
    #     probs = self.get_probs(idxs=range(self.N), eps=self.eps2)
    #     idx = np.random.choice(self.N, p=probs)
    #     samples = self.buffer.sample(batchsize, idx)
    #     if samples is not None:
    #         self.queues[idx].process_samples(samples)
    #     return idx, samples
    #
    # def sampleT(self, batchsize):
    #     probs = self.get_probs(idxs=range(self.N), eps=self.eps3)
    #     idx = np.random.choice(self.N, p=probs)
    #     samples = self.buffer.sampleT(batchsize, idx)
    #     if samples is not None:
    #         self.queues[idx].process_samplesT(samples)
    #     return idx, samples

    def get_demo(self):
        demo = []
        exp = {}
        exp['s0'] = self.env.reset()
        exp['r0'] = self.get_r(exp['s0'], self.g, self.vs).squeeze()
        exp['g'] = self.g
        task = np.random.choice(self.demo_f)
        exp['v'] = self.vs[list(self.feat).index(task)]
        while True:
            a, done = self.opt_action(task)
            if done:
                break
            else:
                exp['a'] = np.expand_dims(a, axis=1)
                exp['s1'] = self.env.step(exp['a'], True)[0]
                exp['r1'] = self.get_r(exp['s1'], self.g, self.vs).squeeze()
                exp['o'] = 1
                demo.append(exp.copy())
                exp['s0'] = exp['s1']
                exp['r0'] = exp['r1']
        return demo, task

    def opt_action(self, t):
        return self.env.opt_action(t)

    def get_stats(self):
        stats = {}
        for i, f in enumerate(self.feat):
            self.queues[i].update()
            for key, val in self.queues[i].get_stats().items():
                stats[key + str(f)] = val
            self.queues[i].init_stat()
        return stats

    def get_cps(self):
        return [np.maximum(abs(q.CP + 0.05) - 0.05, 0) for q in self.queues]

    def get_probs(self, idxs, eps):
        cps = self.get_cps()
        vals = [cps[idx] for idx in idxs]
        l = len(vals)
        s = np.sum(vals)
        if s == 0:
            probs = [1 / l] * l
        else:
            probs = [eps / l + (1 - eps) * v / s for v in vals]
        return probs

    @property
    def state_dim(self):
        return 8,

    @property
    def goal_dim(self):
        return 8,

    @property
    def action_dim(self):
        return 5
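# PlayroomGM.get_probs() above mixes a uniform distribution with a distribution
# proportional to each goal's competence progress (CP):
#     p_i = eps / K + (1 - eps) * cp_i / sum_j cp_j
# so every goal keeps at least eps / K probability of being sampled even when
# its measured progress is zero. A small standalone sketch with made-up CP
# values (the 0.2 exploration rate is an assumption, not a value from the code):
cps = [0.0, 0.3, 0.1]
eps, K, s = 0.2, len(cps), sum(cps)
probs = [eps / K + (1 - eps) * cp / s for cp in cps]
print(probs)                            # approximately [0.067, 0.667, 0.267], sums to 1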
class DDPG(Agent):

    def __init__(self, args, env, env_test, logger):
        super(DDPG, self).__init__(args, env, env_test, logger)
        self.args = args
        self.init(args, env)
        for metric in self.critic.model.metrics_names:
            self.metrics[self.critic.model.name + '_' + metric] = 0

    def init(self, args, env):
        names = ['state0', 'action', 'state1', 'reward', 'terminal']
        self.buffer = ReplayBuffer(limit=int(1e6), names=names.copy())
        # train(), reset() and act() below go through separate actor and critic
        # networks, so those are what get instantiated here.
        self.critic = CriticDDPG(args, env)
        self.actor = ActorDDPG(args, env)

    def train(self):
        if self.buffer.nb_entries > self.batch_size:
            exp = self.buffer.sample(self.batch_size)
            s0, a0, s1, r, t = [exp[name] for name in self.buffer.names]

            a1 = self.actor.target_model.predict_on_batch(s1)
            a1 = np.clip(a1, self.env.action_space.low, self.env.action_space.high)
            q = self.critic.Tmodel.predict_on_batch([s1, a1])
            targets = r + (1 - t) * self.critic.gamma * np.squeeze(q)
            targets = np.clip(targets, self.env.minR / (1 - self.critic.gamma), self.env.maxR)
            inputs = [s0, a0]
            loss = self.critic.model.train_on_batch(inputs, targets)
            for i, metric in enumerate(self.critic.model.metrics_names):
                self.metrics[metric] += loss[i]

            # a2 = self.actor.model.predict_on_batch(s0)
            # grads = self.critic.gradsModel.predict_on_batch([s0, a2])
            # low = self.env.action_space.low
            # high = self.env.action_space.high
            # for d in range(grads[0].shape[0]):
            #     width = high[d] - low[d]
            #     for k in range(self.batch_size):
            #         if grads[k][d] >= 0:
            #             grads[k][d] *= (high[d] - a2[k][d]) / width
            #         else:
            #             grads[k][d] *= (a2[k][d] - low[d]) / width
            # self.actor.train(s0, grads)

            self.actor.target_train()
            self.critic.target_train()

    def reset(self):
        if self.trajectory:
            T = int(self.trajectory[-1]['terminal'])
            R = np.sum([self.env.unshape(exp['reward'], exp['terminal'])
                        for exp in self.trajectory])
            S = len(self.trajectory)
            self.env.processEp(R, S, T)
            for expe in reversed(self.trajectory):
                self.buffer.append(expe.copy())
            self.trajectory.clear()
        state = self.env.reset()
        self.episode_step = 0
        return state

    def make_input(self, state):
        input = [np.reshape(state, (1, self.actor.s_dim[0]))]
        return input

    def act(self, state):
        input = self.make_input(state)
        action = self.actor.model.predict(input, batch_size=1)
        noise = np.random.normal(0., 0.1, size=action.shape)
        action = noise + action
        action = np.clip(action, self.env.action_space.low, self.env.action_space.high)
        action = action.squeeze()
        return action
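# DDPG.train() above builds the usual one-step critic target,
#     y = r + (1 - t) * gamma * Q_target(s1, pi_target(s1)),
# and clips it to [minR / (1 - gamma), maxR], i.e. to the range of discounted
# returns the environment can actually produce. A small numpy sketch with
# made-up numbers (gamma, minR and maxR are assumptions here, not values from
# this environment):
import numpy as np

gamma, minR, maxR = 0.99, -1.0, 0.0
r = np.array([-1.0, -1.0, 0.0])           # per-step rewards
t = np.array([0.0, 0.0, 1.0])             # terminal flags
q_next = np.array([-5.0, -500.0, -3.0])   # target-critic estimates of Q(s1, a1)
targets = r + (1 - t) * gamma * q_next
targets = np.clip(targets, minR / (1 - gamma), maxR)
print(targets)                            # [-5.95, -100.0, 0.0]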
class DQNAgent():
    """Deep Q-learning agent."""

    rewards = []
    total_reward = 0
    birth_time = 0
    n_iter = 0
    n_games = 0
    ts_frame = 0
    ts = time.time()

    Memory = namedtuple('Memory',
                        ['obs', 'action', 'new_obs', 'reward', 'done'],
                        rename=False)

    def __init__(self, env, hyperparameters, device, summary_writer=None):
        """Set parameters, initialize network."""
        state_space_shape = env.observation_space.shape
        action_space_size = env.action_space.n
        self.env = env

        self.online_network = DQN(state_space_shape, action_space_size).to(device)
        self.target_network = DQN(state_space_shape, action_space_size).to(device)
        # XXX maybe not really necessary?
        self.update_target_network()

        self.experience_replay = None

        self.accumulated_loss = []
        self.device = device

        self.optimizer = optim.Adam(self.online_network.parameters(),
                                    lr=hyperparameters['learning_rate'])
        self.double_DQN = hyperparameters['double_DQN']

        # Discount factor
        self.gamma = hyperparameters['gamma']

        # XXX ???
        self.n_multi_step = hyperparameters['n_multi_step']
        self.replay_buffer = ReplayBuffer(hyperparameters['buffer_capacity'],
                                          hyperparameters['n_multi_step'],
                                          hyperparameters['gamma'])
        self.birth_time = time.time()
        self.iter_update_target = hyperparameters['n_iter_update_target']
        self.buffer_start_size = hyperparameters['buffer_start_size']

        self.summary_writer = summary_writer

        # Greedy search hyperparameters
        self.epsilon_start = hyperparameters['epsilon_start']
        self.epsilon = hyperparameters['epsilon_start']
        self.epsilon_decay = hyperparameters['epsilon_decay']
        self.epsilon_final = hyperparameters['epsilon_final']

    def get_max_action(self, obs):
        '''
        Forward pass of the NN to obtain the action for the given observation
        '''
        # convert the observation into a tensor
        state_t = torch.tensor(np.array([obs])).to(self.device)
        # forward pass
        q_values_t = self.online_network(state_t)
        # get the index of the maximum output (i.e. the best action to take)
        _, act_t = torch.max(q_values_t, dim=1)
        return int(act_t.item())

    def act(self, obs):
        '''
        Greedy action outputted by the online network
        '''
        return self.get_max_action(obs)

    def act_eps_greedy(self, obs):
        '''
        E-greedy action
        '''
        # In case of a noisy net, it takes a greedy action
        # if self.noisy_net:
        #     return self.act(obs)
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        else:
            return self.act(obs)

    def update_target_network(self):
        """Update target network weights with current online network values."""
        self.target_network.load_state_dict(self.online_network.state_dict())

    def set_optimizer(self, learning_rate):
        self.optimizer = optim.Adam(self.online_network.parameters(),
                                    lr=learning_rate)

    def sample_and_optimize(self, batch_size):
        '''
        Sample batch_size memories from the buffer and optimize on them
        '''
        # This should be the part where it waits until it has enough experience
        if len(self.replay_buffer) > self.buffer_start_size:
            # sample
            mini_batch = self.replay_buffer.sample(batch_size)
            # optimize
            l_loss = self.optimize(mini_batch)
            self.accumulated_loss.append(l_loss)

        # update target NN
        if self.n_iter % self.iter_update_target == 0:
            self.update_target_network()

    def optimize(self, mini_batch):
        '''
        Optimize the NN
        '''
        # reset the grads
        self.optimizer.zero_grad()
        # calculate the loss of the mini batch
        loss = self._calculate_loss(mini_batch)
        loss_v = loss.item()
        # do backpropagation
        loss.backward()
        # one step of optimization
        self.optimizer.step()
        return loss_v

    def _calculate_loss(self, mini_batch):
        '''
        Calculate the mini batch's MSE loss.
        It also supports the double DQN version.
        '''
        states, actions, next_states, rewards, dones = mini_batch

        # convert the data into tensors
        states_t = torch.as_tensor(states, device=self.device)
        next_states_t = torch.as_tensor(next_states, device=self.device)
        actions_t = torch.as_tensor(actions, device=self.device)
        rewards_t = torch.as_tensor(rewards, dtype=torch.float32, device=self.device)
        done_t = torch.as_tensor(dones, dtype=torch.uint8, device=self.device)  # noqa

        # Value of the action taken previously (recorded in actions_t) in state_t
        state_action_values = self.online_network(states_t).gather(
            1, actions_t[:, None]).squeeze(-1)
        # NB gather is a differentiable function

        # Next state value with Double DQN (i.e. the value, predicted by the
        # target net, of the best action predicted by the online net)
        if self.double_DQN:
            double_max_action = self.online_network(next_states_t).max(1)[1]
            double_max_action = double_max_action.detach()
            target_output = self.target_network(next_states_t)
            # NB: [:, None] adds an extra dimension
            next_state_values = torch.gather(
                target_output, 1, double_max_action[:, None]).squeeze(-1)
        # Next state value in the normal configuration
        else:
            next_state_values = self.target_network(next_states_t).max(1)[0]
            next_state_values = next_state_values.detach()  # No backprop

        # Use the Bellman equation
        expected_state_action_values = rewards_t + \
            (self.gamma ** self.n_multi_step) * next_state_values

        # compute the loss
        return nn.MSELoss()(state_action_values, expected_state_action_values)

    def reset_stats(self):
        '''
        Reset the agent's statistics
        '''
        self.rewards.append(self.total_reward)
        self.total_reward = 0
        self.accumulated_loss = []
        self.n_games += 1

    def add_env_feedback(self, obs, action, new_obs, reward, done):
        '''
        Acquire a new feedback from the environment. The feedback is
        constituted by the new observation, the reward and the done boolean.
        '''
        # Create the new memory and append it to the replay buffer
        new_memory = self.Memory(obs=obs, action=action, new_obs=new_obs,
                                 reward=reward, done=done)
        self.replay_buffer.append(new_memory)

        # update the variables
        self.n_iter += 1
        # TODO check this...
        # decrease epsilon
        self.epsilon = max(self.epsilon_final,
                           self.epsilon_start - self.n_iter / self.epsilon_decay)
        self.total_reward += reward

    def print_info(self):
        '''
        Print information about the agent
        '''
        fps = (self.n_iter - self.ts_frame) / (time.time() - self.ts)
        # TODO replace with proper logger
        print('%d %d rew:%d mean_rew:%.2f eps:%.2f, fps:%d, loss:%.4f' %
              (self.n_iter, self.n_games, self.total_reward,
               np.mean(self.rewards[-40:]), self.epsilon, fps,
               np.mean(self.accumulated_loss)))
        self.ts_frame = self.n_iter
        self.ts = time.time()

        if self.summary_writer is not None:
            self.summary_writer.add_scalar('reward', self.total_reward, self.n_games)
            self.summary_writer.add_scalar('mean_reward', np.mean(self.rewards[-40:]), self.n_games)
            self.summary_writer.add_scalar('10_mean_reward', np.mean(self.rewards[-10:]), self.n_games)
            self.summary_writer.add_scalar('epsilon', self.epsilon, self.n_games)
            self.summary_writer.add_scalar('loss', np.mean(self.accumulated_loss), self.n_games)
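# A minimal sketch of how the DQNAgent methods above compose into a training
# loop. It assumes a Gym-style environment with the classic 4-tuple step API;
# the environment id, hyperparameter values and frame budget below are
# illustrative assumptions, not values taken from this code.
import gym

env = gym.make('CartPole-v1')
hyperparameters = {
    'learning_rate': 1e-4, 'gamma': 0.99, 'n_multi_step': 2, 'double_DQN': True,
    'buffer_capacity': 100000, 'buffer_start_size': 1000,
    'n_iter_update_target': 1000, 'epsilon_start': 1.0,
    'epsilon_decay': 100000, 'epsilon_final': 0.02,
}
agent = DQNAgent(env, hyperparameters, device='cpu')

obs = env.reset()
for _ in range(500000):
    action = agent.act_eps_greedy(obs)                # epsilon-greedy exploration
    new_obs, reward, done, _ = env.step(action)
    agent.add_env_feedback(obs, action, new_obs, reward, done)
    agent.sample_and_optimize(batch_size=32)          # trains once the buffer is warm
    obs = new_obs
    if done:
        agent.print_info()
        agent.reset_stats()
        obs = env.reset()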
class DDPG(Agent):

    def __init__(self, args, env, env_test, logger):
        super(DDPG, self).__init__(args, env, env_test, logger)
        self.args = args
        self.init(args, env)

    def init(self, args, env):
        names = ['s0', 'a', 's1', 'r', 't']
        metrics = ['loss_dqn', 'loss_actor']
        self.buffer = ReplayBuffer(limit=int(1e6), names=names.copy(), args=args)
        self.actorCritic = ActorCriticDDPG(args, env)
        for metric in metrics:
            self.metrics[metric] = 0

    def train(self):
        if self.buffer.nb_entries > self.batch_size:
            exp = self.buffer.sample(self.batch_size)
            targets_dqn = self.actorCritic.get_targets_dqn(exp['r'], exp['t'], exp['s1'])
            inputs = [exp['s0'], exp['a'], targets_dqn]
            loss_dqn = self.actorCritic.trainQval(inputs)
            action, criticActionGrads, invertedCriticActionGrads = \
                self.actorCritic.trainActor([exp['s0']])
            self.metrics['loss_dqn'] += np.squeeze(loss_dqn)
            self.actorCritic.target_train()

    def make_input(self, state, mode):
        input = [np.expand_dims(state, axis=0)]
        return input

    def reset(self):
        if self.trajectory:
            self.env.end_episode(self.trajectory)
            for expe in self.trajectory:
                self.buffer.append(expe.copy())
            self.trajectory.clear()
        state = self.env.reset()
        self.episode_step = 0
        return state

    def act(self, state, mode='train'):
        input = self.make_input(state, mode)
        action = self.actorCritic.action(input)[0]
        if mode == 'train':
            noise = np.random.normal(0., 0.1, size=action[0].shape)
            action = noise + action
            action = np.clip(action, self.env.action_space.low, self.env.action_space.high)
        action = action.squeeze()
        return action

    def save_model(self):
        self.actorCritic.actionModel.save(
            os.path.join(self.logger.get_dir(), 'actor_model'), overwrite=True)
        self.actorCritic.qvalModel.save(
            os.path.join(self.logger.get_dir(), 'qval_model'), overwrite=True)
class ACDQNGM(DQNG):

    def __init__(self, args, env, env_test, logger):
        super(ACDQNGM, self).__init__(args, env, env_test, logger)

    def init(self, args, env):
        names = ['s0', 'a', 's1', 'r', 't', 'g', 'm', 'task', 'mcr']
        metrics = ['loss_dqn', 'qval', 'val']
        self.buffer = ReplayBuffer(limit=int(1e6), names=names.copy(), args=args)
        self.actorCritic = ActorCriticDQNGM(args, env)
        for metric in metrics:
            self.metrics[metric] = 0
        self.goalcounts = np.zeros((len(self.env.goals),))

    def train(self):
        if self.buffer.nb_entries > 100 * self.batch_size:
            samples = self.buffer.sample(self.batch_size)
            samples = self.env.augment_samples(samples)
            targets = self.actorCritic.get_targets_dqn(samples['r'], samples['t'],
                                                       samples['s1'], samples['g'],
                                                       samples['m'])
            inputs = [samples['s0'], samples['a'], samples['g'], samples['m'], targets]
            metricsCritic = self.actorCritic.trainCritic(inputs)
            self.metrics['loss_dqn'] += np.squeeze(metricsCritic[0])
            self.metrics['qval'] += np.mean(metricsCritic[1])
            self.goalcounts += np.bincount(samples['task'], minlength=len(self.env.goals))
            metricsActor = self.actorCritic.trainActor(
                [samples['s0'], samples['g'], samples['m']])
            if self.env_step % 1000 == 0:
                print(metricsActor[0], metricsActor[1])
            self.metrics['val'] += np.mean(metricsActor[2])
            self.actorCritic.target_train()

    def get_stats(self):
        sumsamples = np.sum(self.goalcounts)
        if sumsamples != 0:
            for i, goal in enumerate(self.env.goals):
                self.stats['samplecount_{}'.format(goal)] = float(
                    "{0:.3f}".format(self.goalcounts[i] / sumsamples))

    def make_input(self, state, mode):
        if mode == 'train':
            input = [np.expand_dims(i, axis=0)
                     for i in [state, self.env.goal, self.env.mask]]
        else:
            input = [np.expand_dims(i, axis=0)
                     for i in [state, self.env_test.goal, self.env_test.mask]]
        return input

    def act(self, exp, mode='train'):
        input = self.make_input(exp['s0'], mode)
        actionProbs = self.actorCritic.probs(input)[0].squeeze()
        # if self.env_step % 1000 == 0: print(actionProbs)
        if mode == 'train':
            action = np.random.choice(range(self.env.action_dim), p=actionProbs)
        else:
            # actionProbs is already squeezed to a 1-D vector here
            action = np.argmax(actionProbs)
        prob = actionProbs[action]
        action = np.expand_dims(action, axis=1)
        exp['a'] = action
        # exp['p_a'] = prob
        return exp

    def reset(self):
        if self.trajectory:
            augmented_episode = self.env.end_episode(self.trajectory)
            for expe in augmented_episode:
                self.buffer.append(expe)
            # for expe in self.trajectory:
            #     self.buffer.append(expe.copy())
            # augmented_ep = self.env.augment_episode(self.trajectory)
            # for e in augmented_ep:
            #     self.buffer.append(e)
            self.trajectory.clear()
        state = self.env.reset()
        self.episode_step = 0
        return state

    def get_demo(self, rndprop):
        demo = []
        exp = {}
        exp['s0'] = self.env_test.env.reset()
        # obj = np.random.choice(self.env_test.env.objects)
        # goal = np.random.randint(obj.high[2]+1)
        obj = self.env_test.env.light
        goal = 1
        while True:
            if np.random.rand() < rndprop:
                a = np.random.randint(self.env_test.action_dim)
                done = False
            else:
                a, done = self.env_test.env.opt_action(obj, goal)
            if not done:
                exp['a'] = np.expand_dims(a, axis=1)
                exp['s1'] = self.env_test.env.step(exp['a'])[0]
                demo.append(exp.copy())
                exp['s0'] = exp['s1']
            else:
                break
        return demo

    def demo(self):
        if self.env_step % self.demo_freq == 0:
            for i in range(5):
                demo = self.get_demo(rndprop=0.)
                augmented_demo = self.env.augment_demo(demo)
                for exp in augmented_demo:
                    self.buffer.append(exp)
class DQN(Agent):

    def __init__(self, args, env, env_test, logger):
        super(DQN, self).__init__(args, env, env_test, logger)
        self.args = args
        self.init(args, env)

    def init(self, args, env):
        names = ['state0', 'action', 'state1', 'reward', 'terminal']
        self.buffer = ReplayBuffer(limit=int(1e6), names=names.copy())
        self.critic = CriticDQN(args, env)
        for metric_name in ['loss_dqn', 'qval', 'val']:
            self.metrics[metric_name] = 0

    def train(self):
        if self.buffer.nb_entries > self.batch_size:
            exp = self.buffer.sample(self.batch_size)
            s0, a0, s1, r, t = [exp[name] for name in self.buffer.names]
            targets_dqn = self.critic.get_targets_dqn(r, t, s1)
            inputs = [s0, a0]
            loss = self.critic.criticModel.train_on_batch(inputs, targets_dqn)
            for i, metric in enumerate(self.critic.criticModel.metrics_names):
                self.metrics[metric] += loss[i]
            self.critic.target_train()

    def reset(self):
        if self.trajectory:
            R = np.sum([self.env.unshape(exp['reward'], exp['terminal'])
                        for exp in self.trajectory])
            self.env.processEp(R)
            for expe in reversed(self.trajectory):
                self.buffer.append(expe.copy())
            if self.args['--imit'] != '0':
                Es = [0]
                for i, expe in enumerate(reversed(self.trajectory)):
                    if self.trajectory[-1]['terminal']:
                        Es[0] = Es[0] * self.critic.gamma + expe['reward']
                        expe['expVal'] = Es[0]
                    else:
                        expe['expVal'] = -self.ep_steps
                    self.bufferImit.append(expe.copy())
            self.trajectory.clear()
        state = self.env.reset()
        self.episode_step = 0
        return state

    def make_input(self, state, mode):
        input = [np.reshape(state, (1, self.critic.s_dim[0]))]
        input.append(np.expand_dims([0.5], axis=0))
        return input

    def act(self, state, mode='train'):
        input = self.make_input(state, mode)
        actionProbs = self.critic.actionProbs(input)
        if mode == 'train':
            action = np.random.choice(range(self.env.action_dim),
                                      p=actionProbs[0].squeeze())
        else:
            action = np.argmax(actionProbs[0])
        return np.expand_dims(action, axis=1)