def act(self, obs):
    with chainer.using_config('train', False), chainer.no_backprop_mode():
        action_value = self.model(
            self.batch_states([obs], self.xp, self.phi))
        embedding = self.model.embedding
        # Mix the parametric and non-parametric Q-values when the value
        # buffer already holds entries
        if len(self.value_buffer) > 0:
            q_np = self.value_buffer.compute_q(embedding=embedding)
            q_theta = action_value.q_values.array
            q = Variable((1 - self.lamda) * q_theta + self.lamda * q_np)
            q = DiscreteActionValue(q)
            q = float(q.max.array)
        else:
            q = float(action_value.max.array)
        action = cuda.to_cpu(action_value.greedy_actions.array)[0]

    # Update stats
    self.average_q *= self.average_q_decay
    self.average_q += (1 - self.average_q_decay) * q

    self.logger.debug('t:%s q:%s action_value:%s', self.t, q, action_value)

    self.eval_t += 1
    self._backup_if_necessary(self.eval_t, embedding)
    return action
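# The `self.value_buffer.compute_q(embedding=...)` call above is not shown in
# these snippets. Below is a minimal, hypothetical sketch of what such a buffer
# could look like for EVA-style non-parametric value estimates: it stores
# (embedding, Q-vector) pairs and answers a query by averaging the Q-vectors of
# the k nearest stored embeddings. The class name, `k`, and `capacity` are
# assumptions, not the authors' implementation.
import numpy as np


class SimpleValueBuffer(object):
    def __init__(self, capacity=2000, k=5):
        self.capacity = capacity
        self.k = k
        self.embeddings = []   # list of 1-D embedding arrays
        self.q_values = []     # list of per-action Q-vectors

    def __len__(self):
        return len(self.embeddings)

    def store(self, embedding, q_vector):
        # Drop the oldest entry once the buffer is full
        if len(self.embeddings) >= self.capacity:
            self.embeddings.pop(0)
            self.q_values.pop(0)
        self.embeddings.append(np.asarray(embedding, dtype=np.float32).ravel())
        self.q_values.append(np.asarray(q_vector, dtype=np.float32))

    def compute_q(self, embedding):
        """Average the Q-vectors of the k nearest stored embeddings."""
        query = np.asarray(embedding, dtype=np.float32).ravel()
        keys = np.stack(self.embeddings)
        dists = np.linalg.norm(keys - query, axis=1)
        nearest = np.argsort(dists)[:self.k]
        return np.mean(np.stack([self.q_values[i] for i in nearest]), axis=0)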
def __call__(self, x, eva=False, test=False):
    q = self.q_func(x)
    self.embedding = self.get_embedding()
    if not eva or self.lambdas == 0 or self.lambdas == 1:
        return q
    qnp = self.non_q.get_q(self.embedding.array)
    qout = self.lambdas * q.q_values + (1 - self.lambdas) * qnp
    return DiscreteActionValue(qout)
def __call__(self, x, eva=False, test=False):
    q = self.q_func(x)
    self.embedding = self.get_embedding()
    if not eva or self.lambdas == 0 or self.lambdas == 1:
        return q
    # Output Q-values from the value buffer
    qnp = self.non_q.get_q(self.embedding.array)
    # Q-value adjustment
    qout = self.lambdas * q.q_values + (1 - self.lambdas) * qnp
    return DiscreteActionValue(qout)
def __call__(self, x, eva=False, test=False): """TODO: stateを受け取って, Q値を返す""" q = self.q_func(x) self.embedding = self.get_embedding() if not eva or self.lambdas == 0: return q # Output the non-Q from value buffer qnp = self.non_q.get_q(self.embedding.array) #Q-value adjustment qout = self.lambdas * q.q_values + (1 - self.lambdas) * qnp return DiscreteActionValue(qout)
def forward(self, x):
    """Compute Q-values of actions for given observations."""
    # Split the two observation channels: a 2-D local view and a flattened vector
    x1 = x[:, 0, :, :].reshape((-1, 1, obs_size * 2 + 1, obs_size * 2 + 1))
    x2 = x[:, 1, :, :].reshape((-1, (obs_size * 2 + 1) ** 2))
    if x2.shape[0] == 1:
        x2 = np.tile(x2, (minibatch_size, 1))
    # NOTE: x2 is prepared above but not consumed by the layers below
    h = F.relu(self.bn1(self.conv1(x1)))
    h = F.relu(self.bn2(self.conv2(h)))
    h = F.relu(self.bn3(self.conv3(h)))
    h = self.l(h)
    return DiscreteActionValue(h)
def __call__(self, x, test=False):
    self.embedding = self.hout(x)
    activation = F.relu(self.embedding)
    batch_size = x.shape[0]

    # Advantage stream, centred by subtracting its mean over actions
    ya = self.a_stream(activation)
    mean = F.reshape(
        F.sum(ya, axis=1) / self.num_actions, (batch_size, 1))
    ya, mean = F.broadcast(ya, mean)
    ya -= mean

    # State-value stream
    ys = self.v_stream(activation)
    ya, ys = F.broadcast(ya, ys)
    q = ya + ys
    return DiscreteActionValue(q)
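# The dueling head above combines a scalar state value V(s) with a mean-centred
# advantage A(s, a): Q(s, a) = V(s) + A(s, a) - mean_a A(s, a). A NumPy sketch
# of the same aggregation on made-up numbers:
import numpy as np

v = np.array([[2.0]])                        # state value, shape (batch, 1)
a = np.array([[0.5, -0.5, 1.0]])             # advantages, shape (batch, n_actions)
q = v + a - a.mean(axis=1, keepdims=True)    # centring keeps V and A identifiable
print(q)  # [[2.16666667 1.16666667 2.66666667]]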
def __call__(self, x):
    h = self.model(x)
    return DiscreteActionValue(h)
def __call__(self, x):
    h = F.relu(self.fc(x))
    h = self.lstm(h)
    return DiscreteActionValue(self.out(h))
def __call__(self, x, test=False):
    h = F.relu(self.fc(x, test=test))
    h = self.lstm(h)
    return DiscreteActionValue(self.out(h))
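# The two recurrent __call__ variants above assume the surrounding chain
# defines `fc`, `lstm`, and `out` links. A minimal, hypothetical Chainer chain
# that the non-`test` variant would fit into (layer sizes are placeholders, not
# taken from the source):
import chainer
import chainer.functions as F
import chainer.links as L
from chainerrl.action_value import DiscreteActionValue


class RecurrentQFunction(chainer.Chain):
    def __init__(self, obs_size, n_actions, n_hidden=64):
        super(RecurrentQFunction, self).__init__()
        with self.init_scope():
            self.fc = L.Linear(obs_size, n_hidden)
            self.lstm = L.LSTM(n_hidden, n_hidden)
            self.out = L.Linear(n_hidden, n_actions)

    def reset_state(self):
        # The LSTM state must be cleared between episodes
        self.lstm.reset_state()

    def __call__(self, x):
        h = F.relu(self.fc(x))
        h = self.lstm(h)
        return DiscreteActionValue(self.out(h))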
def __call__(self, x, test=False):
    h = self.model(x, test=test)
    return DiscreteActionValue(h)
def __call__(self, x, test=False):
    self.embedding = self.hout(x)
    return DiscreteActionValue(self.qout(F.relu(self.embedding)))
def __call__(self, x):
    if self.use_tuple:
        batch_size = x[0].shape[0]
        h = x[0]
    else:
        batch_size = x.shape[0]
        h = x

    for l in self.conv_layers:
        h = self.activation(l(h))

    if self.use_tuple:
        h = F.reshape(h, (batch_size, -1))
        # concatenate additional observations
        h = F.concat((h, x[1]))

    # Advantage
    a1 = self.a_branch_1(h)
    a2 = self.a_branch_2(h)
    a3 = self.a_branch_3(h)
    a4 = self.a_branch_4(h)
    if len(self.branch_sizes) > 4:
        a5 = self.a_branch_5(h)

    # Compute means for each branch
    mean_a1 = F.sum(a1, axis=1) / self.branch_sizes[0]
    mean_a1 = F.reshape(mean_a1, (batch_size, 1))
    mean_a1 = F.broadcast_to(mean_a1, a1.shape)
    mean_a2 = F.sum(a2, axis=1) / self.branch_sizes[1]
    mean_a2 = F.reshape(mean_a2, (batch_size, 1))
    mean_a2 = F.broadcast_to(mean_a2, a2.shape)
    mean_a3 = F.sum(a3, axis=1) / self.branch_sizes[2]
    mean_a3 = F.reshape(mean_a3, (batch_size, 1))
    mean_a3 = F.broadcast_to(mean_a3, a3.shape)
    mean_a4 = F.sum(a4, axis=1) / self.branch_sizes[3]
    mean_a4 = F.reshape(mean_a4, (batch_size, 1))
    mean_a4 = F.broadcast_to(mean_a4, a4.shape)
    if len(self.branch_sizes) > 4:
        mean_a5 = F.sum(a5, axis=1) / self.branch_sizes[4]
        mean_a5 = F.reshape(mean_a5, (batch_size, 1))
        mean_a5 = F.broadcast_to(mean_a5, a5.shape)

    # Broadcast state values
    v = self.v_stream(h)
    v1 = F.broadcast_to(v, a1.shape)
    v2 = F.broadcast_to(v, a2.shape)
    v3 = F.broadcast_to(v, a3.shape)
    v4 = F.broadcast_to(v, a4.shape)
    if len(self.branch_sizes) > 4:
        v5 = F.broadcast_to(v, a5.shape)

    # Q-values
    q1 = v1 + a1 - mean_a1
    q2 = v2 + a2 - mean_a2
    q3 = v3 + a3 - mean_a3
    q4 = v4 + a4 - mean_a4
    if len(self.branch_sizes) > 4:
        q5 = v5 + a5 - mean_a5

    branches = []
    branches.append(DiscreteActionValue(q1))
    branches.append(DiscreteActionValue(q2))
    branches.append(DiscreteActionValue(q3))
    branches.append(DiscreteActionValue(q4))
    if len(self.branch_sizes) > 4:
        branches.append(DiscreteActionValue(q5))

    return BranchedActionValue(branches)
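# With branched action values, each branch holds Q-values for one action
# dimension and the greedy action is picked independently per branch. A NumPy
# illustration with two made-up branches (BranchedActionValue's real interface
# is not shown in these snippets):
import numpy as np

branch_qs = [
    np.array([[0.2, 1.3, -0.1]]),        # branch 1: 3 sub-actions
    np.array([[0.7, 0.1]]),              # branch 2: 2 sub-actions
]
greedy = [int(q.argmax(axis=1)[0]) for q in branch_qs]
print(greedy)  # [1, 0] -> one sub-action chosen per branch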
def act_and_train(self, obs, reward):
    with chainer.using_config('train', False), chainer.no_backprop_mode():
        # The model expects a batched input, so wrap obs in a list to turn
        # the 84x84 observation into a 1x84x84 batch.
        action_value = self.model(
            self.batch_states([obs], self.xp, self.phi))
        # Get the embedding computed by the model
        embedding = self.model.embedding
        # Mix the parametric and non-parametric Q-values
        if len(self.value_buffer) > 0:
            q_np = self.value_buffer.compute_q(embedding=embedding)
            q_theta = action_value.q_values.array
            q = Variable(self.lamda * q_theta + (1 - self.lamda) * q_np)
            q = DiscreteActionValue(q)
            q = float(q.max.array)
            # q_mixed = (1 - self.lamda) * q_theta + self.lamda * q_np
            # q = float(q_mixed.max())
            # greedy_action = cuda.to_cpu(q_mixed.argmax())
        else:
            q = float(action_value.max.array)
        greedy_action = cuda.to_cpu(action_value.greedy_actions.array)[0]

    # Update stats
    self.average_q *= self.average_q_decay
    self.average_q += (1 - self.average_q_decay) * q

    self.logger.debug('t:%s q:%s action_value:%s', self.t, q, action_value)

    action = self.explorer.select_action(
        self.t, lambda: greedy_action, action_value=action_value)
    self.t += 1

    # Update the target network
    if self.t % self.target_update_interval == 0:
        self.sync_target_network()

    if self.last_state is not None:
        assert self.last_action is not None
        assert self.last_embedding is not None
        # Add a transition to the replay buffer
        self.replay_buffer.add(
            state=self.last_state,
            action=self.last_action,
            reward=reward,
            embedding=self.last_embedding,
            next_state=obs,
            next_action=action,
            is_state_terminal=False)

    self._backup_if_necessary(self.t, embedding)

    self.last_state = obs
    self.last_action = action
    self.last_embedding = embedding

    self.replay_updater.update_if_necessary(self.t)

    self.logger.debug('t:%s r:%s a:%s', self.t, reward, action)

    return self.last_action
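# The `[obs]` wrapper noted in the comments above simply adds a batch axis, so
# a single 84x84 observation becomes a 1x84x84 batch before it is fed to the
# model. A NumPy sketch of that step (phi here is just an identity placeholder
# for whatever feature extractor the agent uses):
import numpy as np

obs = np.zeros((84, 84), dtype=np.float32)
phi = lambda x: x                      # identity feature extractor for the sketch
batch = np.asarray([phi(obs)])
print(batch.shape)                     # (1, 84, 84)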