def test_minimum_inconsistent_shapes(self):
    x1_data = numpy.random.uniform(-1, 1, (3, 2)).astype(self.dtype)
    x2_data = numpy.random.uniform(-1, 1, (2, 3)).astype(self.dtype)
    x1 = chainer.Variable(x1_data)
    x2 = chainer.Variable(x2_data)
    with self.assertRaises(type_check.InvalidType):
        functions.minimum(x1, x2)

def intersection(bbox0, bbox1):
    # bboxes are (center_x, center_y, width, height); the overlap along each axis
    # is clipped at zero with relu, so non-overlapping boxes yield an area of 0.
    x0, y0, w0, h0 = bbox0
    x1, y1, w1, h1 = bbox1

    w = F.relu(F.minimum(x0 + w0 / 2, x1 + w1 / 2) - F.maximum(x0 - w0 / 2, x1 - w1 / 2))
    h = F.relu(F.minimum(y0 + h0 / 2, y1 + h1 / 2) - F.maximum(y0 - h0 / 2, y1 - h1 / 2))

    return w * h

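# A minimal usage sketch (not from the original sources): computing an IoU from the
# intersection() helper above. The union() helper and the concrete box values are
# illustrative assumptions; bboxes are (center_x, center_y, width, height) tuples of
# chainer Variables with matching shapes.
import numpy as np
import chainer
import chainer.functions as F


def union(bbox0, bbox1, inter):
    _, _, w0, h0 = bbox0
    _, _, w1, h1 = bbox1
    return w0 * h0 + w1 * h1 - inter


def _as_bbox(cx, cy, w, h):
    return tuple(chainer.Variable(np.array([v], dtype=np.float32)) for v in (cx, cy, w, h))


bbox_a = _as_bbox(0.0, 0.0, 2.0, 2.0)       # square [-1, 1] x [-1, 1]
bbox_b = _as_bbox(1.0, 1.0, 2.0, 2.0)       # square [0, 2] x [0, 2]
inter = intersection(bbox_a, bbox_b)        # overlap area: 1.0
iou = inter / union(bbox_a, bbox_b, inter)  # 1 / (4 + 4 - 1) ~= 0.143
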
def _perform_gradient_step(self, iterator):
    batch = iterator.next()
    s_current, action, r, s_next, non_terminal = \
        concat_examples(batch, device=self._device)
    r = F.reshape(r, shape=(*r.shape, 1))
    non_terminal = F.reshape(non_terminal, shape=(*non_terminal.shape, 1))

    with chainer.using_config('enable_backprop', False), \
            chainer.using_config('train', False):
        a_next, log_pi = self._pi.action_with_log_pi(s_next)
        log_pi = F.reshape(log_pi, shape=(*log_pi.shape, 1))
        # update q
        target_q1 = self._target_q1(s_next, a_next)
        target_q2 = self._target_q2(s_next, a_next)
        min_q = F.minimum(target_q1, target_q2)
        q_target = r + self._gamma * non_terminal * (
            min_q - self._alpha.exp() * log_pi)

    q1 = self._q1(s_current, action)
    q2 = self._q2(s_current, action)
    q1_loss = 0.5 * F.mean_squared_error(q_target, q1)
    q2_loss = 0.5 * F.mean_squared_error(q_target, q2)
    q_loss = q1_loss + q2_loss
    self._q1_optimizer.target.cleargrads()
    self._q2_optimizer.target.cleargrads()
    q_loss.backward()
    q_loss.unchain_backward()
    self._q1_optimizer.update()
    self._q2_optimizer.update()

    # update pi
    pi_action, log_pi = self._pi.action_with_log_pi(s_current)
    log_pi = F.reshape(log_pi, shape=(*log_pi.shape, 1))
    q1 = self._q1(s_current, pi_action)
    q2 = self._q2(s_current, pi_action)
    min_q = F.minimum(q1, q2)
    pi_loss = F.mean(self._alpha.exp() * log_pi - min_q)
    self._pi_optimizer.target.cleargrads()
    pi_loss.backward()
    pi_loss.unchain_backward()
    self._pi_optimizer.update()

    # update temperature
    alpha_loss = -self._alpha.exp() * F.mean(log_pi + self._temperature_target)
    self._alpha_optimizer.target.cleargrads()
    alpha_loss.backward()
    alpha_loss.unchain_backward()
    self._alpha_optimizer.update()

    self._update_target_network(self._target_q1, self._q1, self._tau)
    self._update_target_network(self._target_q2, self._q2, self._tau)

def loss_func_dsgan(x, z, theta, tau=10):
    if x.shape[1] == 4:
        x = x[:, :3]
    loss_ds_1 = F.batch_l2_norm_squared(x[::2] - x[1::2]) / (
        F.batch_l2_norm_squared(z[::2] - z[1::2]) + 1e-8)
    loss_ds_2 = F.batch_l2_norm_squared(x[::2] - x[1::2]) / (
        F.absolute(theta[::2] - theta[1::2]) + 1e-8) / 1000
    xp = chainer.cuda.get_array_module(x.array)
    loss_ds_1 = F.minimum(F.sqrt(loss_ds_1), xp.full_like(loss_ds_1.array, tau))
    loss_ds_2 = F.minimum(F.sqrt(loss_ds_2), xp.full_like(loss_ds_2.array, tau))
    print(loss_ds_1.array.mean(), loss_ds_2.array.mean())
    return -F.mean(loss_ds_1) - F.mean(loss_ds_2)

def update_model(self):
    # start minibatch learning
    for t in range(self.num_train_per_episode):
        # get learning data
        with self.lock:
            states, actions, advantages = self.get_data_from_train_buffer()

        # get policy and value
        policies, values = self.model(states)
        old_policies, _ = self.old_model(states)

        # calculate loss
        loss_v = F.squared_error(values, np.array(advantages).astype(np.float32))
        loss_ent = -policies.entropy()
        r = (policies.get_prob(actions) + 1.0e-10) / (old_policies.get_prob(actions) + 1.0e-10)
        loss_clip = (advantages - values.data) * F.minimum(
            r, F.clip(r, 1.0 - self.eps, 1.0 + self.eps))
        loss = F.mean(-loss_clip + loss_v * 0.2 + 0.01 * loss_ent)

        self.model.cleargrads()
        loss.backward()
        self.optimizer.update()

    # update old model
    self.old_model = self.copy_model()
    self.clear_buffer()

def __call__(self, noise, tag, depth, alpha):
    h = F.reshape(noise, (len(noise), -1, 1, 1))
    h = F.leaky_relu(self.c0(h, False))
    h = F.leaky_relu(self.c1(h))
    for i in range(depth - 1):
        h = getattr(self, "b%d" % i)(h)
    if 0 < depth and alpha < 1:
        h2 = getattr(self, "b%d" % (depth - 1))(h, True)
        if depth == 1:
            #h = F.tanh(self.to_RGB(h))
            h = self.to_RGB(h, False)
        else:
            h = getattr(self, "b%d" % (depth - 2)).to_RGB(h, False)
        h = F.unpooling_2d(h, 2, 2, 0, outsize=(2 * h.shape[2], 2 * h.shape[3]))
        h = h * (1.0 - alpha) + h2 * alpha
    elif depth == 0:
        #h = F.tanh(self.to_RGB(h))
        h = self.to_RGB(h, False)
    else:
        h = getattr(self, "b%d" % (depth - 1))(h, True)
    h = F.minimum(h, xp.ones(h.shape).astype(np.float32))
    #h = F.maximum(h,-1 * xp.ones(h.shape).astype(np.float32))
    return h

def optimize_surrogate_loss(iterator, model, optimizer, alpha, args):
    optimizer.target.cleargrads()

    batch = iterator.next()
    s_current, action, _, _, log_likelihood, v_target, advantage = concat_examples(
        batch, device=args.gpu)

    log_pi_theta = model.compute_log_likelihood(s_current, action)
    log_pi_theta_old = log_likelihood
    # print('log_pi_theta: ', log_pi_theta, ' shape: ', log_pi_theta.shape)
    # print('log_pi_theta_old: ', log_pi_theta_old, ' shape: ', log_pi_theta_old.shape)
    # division of probability is exponential of difference between log probability
    probability_ratio = F.exp(log_pi_theta - log_pi_theta_old)
    clipped_ratio = F.clip(probability_ratio,
                           1 - args.epsilon * alpha,
                           1 + args.epsilon * alpha)
    lower_bounds = F.minimum(probability_ratio * advantage,
                             clipped_ratio * advantage)
    clip_loss = F.mean(lower_bounds)

    value = model.value(s_current)
    xp = chainer.backend.get_array_module(v_target)
    v_target = xp.reshape(v_target, newshape=value.shape)
    # print('value: ', value, ' shape: ', value.shape)
    # print('v_target: ', v_target, ' shape: ', v_target.shape)
    value_loss = F.mean_squared_error(value, v_target)

    entropy = model.compute_entropy(s_current)
    entropy_loss = F.mean(entropy)

    loss = -clip_loss + args.vf_coeff * value_loss - args.entropy_coeff * entropy_loss

    # Update parameter
    loss.backward()
    optimizer.update()
    loss.unchain_backward()

def update_policy_and_temperature(self, batch):
    """Compute loss for actor."""

    batch_state = batch['state']

    action_distrib = self.policy(batch_state)
    actions, log_prob = action_distrib.sample_with_log_prob()
    q1 = self.q_func1(batch_state, actions)
    q2 = self.q_func2(batch_state, actions)
    q = F.minimum(q1, q2)

    entropy_term = self.temperature * log_prob[..., None]
    assert q.shape == entropy_term.shape
    loss = F.mean(entropy_term - q)

    self.policy_optimizer.update(lambda: loss)

    if self.entropy_target is not None:
        self.update_temperature(log_prob.array)

    # Record entropy
    with chainer.no_backprop_mode():
        try:
            self.entropy_record.extend(
                cuda.to_cpu(action_distrib.entropy.array))
        except NotImplementedError:
            # Record - log p(x) instead
            self.entropy_record.extend(cuda.to_cpu(-log_prob.array))

def _lossfun(self, entropy, vs_pred, log_probs,
             vs_pred_old, log_probs_old, advs, vs_teacher):

    prob_ratio = F.exp(log_probs - log_probs_old)

    loss_policy = -F.mean(
        F.minimum(
            prob_ratio * advs,
            F.clip(prob_ratio, 1 - self.clip_eps, 1 + self.clip_eps) * advs))

    if self.clip_eps_vf is None:
        loss_value_func = F.mean_squared_error(vs_pred, vs_teacher)
    else:
        loss_value_func = F.mean(
            F.maximum(
                F.square(vs_pred - vs_teacher),
                F.square(
                    _elementwise_clip(vs_pred,
                                      vs_pred_old - self.clip_eps_vf,
                                      vs_pred_old + self.clip_eps_vf)
                    - vs_teacher)))

    loss_entropy = -F.mean(entropy)

    self.value_loss_record.append(float(loss_value_func.array))
    self.policy_loss_record.append(float(loss_policy.array))

    loss = (loss_policy
            + self.value_func_coef * loss_value_func
            + self.entropy_coef * loss_entropy)
    return loss

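# A minimal standalone sketch (not from the original source): the clipped PPO policy
# surrogate that _lossfun above builds with F.minimum, evaluated on small concrete
# arrays. clip_eps and the sample values are illustrative assumptions.
import numpy as np
import chainer.functions as F

clip_eps = 0.2
log_probs = np.array([-1.0, -0.5], dtype=np.float32)
log_probs_old = np.array([-1.2, -0.4], dtype=np.float32)
advs = np.array([1.0, -1.0], dtype=np.float32)

prob_ratio = F.exp(log_probs - log_probs_old)  # ratios ~ [1.22, 0.90]
# Element-wise minimum of the unclipped and clipped objectives, negated into a loss.
loss_policy = -F.mean(
    F.minimum(
        prob_ratio * advs,
        F.clip(prob_ratio, 1 - clip_eps, 1 + clip_eps) * advs))
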
def check_forward(self, x1_data, x2_data, y_expected):
    x1 = chainer.Variable(x1_data)
    x2 = chainer.Variable(x2_data)
    y = functions.minimum(x1, x2)
    self.assertEqual(y.data.dtype, self.dtype)
    testing.assert_allclose(
        y_expected, y.data, **self.check_forward_options)

def update_q_func(self, batch):
    """Compute loss for a given Q-function."""

    batch_next_state = batch['next_state']
    batch_rewards = batch['reward']
    batch_terminal = batch['is_state_terminal']
    batch_state = batch['state']
    batch_actions = batch['action']
    batch_discount = batch['discount']

    with chainer.no_backprop_mode(), chainer.using_config('train', False):
        next_actions = self.target_policy_smoothing_func(
            self.target_policy(batch_next_state).sample().array)
        next_q1 = self.target_q_func1(batch_next_state, next_actions)
        next_q2 = self.target_q_func2(batch_next_state, next_actions)
        next_q = F.minimum(next_q1, next_q2)

        target_q = batch_rewards + batch_discount * \
            (1.0 - batch_terminal) * F.flatten(next_q)

    predict_q1 = F.flatten(self.q_func1(batch_state, batch_actions))
    predict_q2 = F.flatten(self.q_func2(batch_state, batch_actions))

    loss1 = F.mean_squared_error(target_q, predict_q1)
    loss2 = F.mean_squared_error(target_q, predict_q2)

    # Update stats
    self.q1_record.extend(cuda.to_cpu(predict_q1.array))
    self.q2_record.extend(cuda.to_cpu(predict_q2.array))
    self.q_func1_loss_record.append(float(loss1.array))
    self.q_func2_loss_record.append(float(loss2.array))

    self.q_func1_optimizer.update(lambda: loss1)
    self.q_func2_optimizer.update(lambda: loss2)

def __call__(self, x):
    x = F.log_softmax(x)
    h = x + x * F.broadcast_to(self.W, x.shape) + F.broadcast_to(self.b, x.shape)
    # numerically stable log(exp(h) + exp(lb)) via the log-sum-exp trick
    mx = F.maximum(h, F.broadcast_to(self.lb, x.shape))
    mn = F.minimum(h, F.broadcast_to(self.lb, x.shape))
    y = mx + F.log(1.0 + F.exp(mn - mx))
    return y

def minimum(self, a, b):
    assert a.dtype == b.dtype
    if a.dtype.name.startswith('float'):
        x = F.minimum(a, b)
    else:
        # F.minimum only accepts float dtypes, so fall back to numpy for integer inputs
        x = Variable(np.minimum(a.data, b.data))
    return x

def train(self, replay_buffer, iterations, d, clip_value, gamma, tau):
    if not self._initialized:
        self._initialize_target_networks()
    iterator = self._prepare_iterator(replay_buffer)
    for i in range(iterations):
        batch = iterator.next()
        s_current, action, r, s_next, non_terminal = concat_examples(
            batch, device=self._device)

        epsilon = F.clip(
            self._sample_action_noise(shape=(self._batch_size)),
            -clip_value, clip_value)
        # Target policy smoothing: the target action is computed from the next state
        target_pi = self._target_pi(s_next)
        assert target_pi.shape == epsilon.shape
        a_tilde = target_pi + epsilon

        target_q1 = self._target_q1(s_next, a_tilde)
        target_q2 = self._target_q2(s_next, a_tilde)

        r = F.reshape(r, shape=(*r.shape, 1))
        non_terminal = F.reshape(non_terminal, shape=(*non_terminal.shape, 1))
        min_q = F.minimum(target_q1, target_q2)
        # print('r shape: ', r.shape)
        # print('done shape: ', non_terminal.shape)
        # print('min q shape: ', min_q.shape)
        y = r + gamma * non_terminal * min_q
        # print('y shape: ', y.shape)

        # Remove reference to avoid unexpected gradient update
        y.unchain()

        q1 = self._q1(s_current, action)
        q1_loss = F.mean_squared_error(y, q1)
        q2 = self._q2(s_current, action)
        q2_loss = F.mean_squared_error(y, q2)
        critic_loss = q1_loss + q2_loss

        self._q1_optimizer.target.cleargrads()
        self._q2_optimizer.target.cleargrads()
        critic_loss.backward()
        critic_loss.unchain_backward()
        self._q1_optimizer.update()
        self._q2_optimizer.update()

        if i % d == 0:
            a = self._pi(s_current)
            q1 = self._q1(s_current, a)
            pi_loss = -F.mean(q1)

            self._pi_optimizer.target.cleargrads()
            pi_loss.backward()
            pi_loss.unchain_backward()
            self._pi_optimizer.update()

            self._update_target_network(self._target_q1, self._q1, tau)
            self._update_target_network(self._target_q2, self._q2, tau)
            self._update_target_network(self._target_pi, self._pi, tau)

def multi_overlap(x1, len1, x2, len2):
    len1_half = len1 / 2
    len2_half = len2 / 2

    left = F.maximum(x1 - len1_half, x2 - len2_half)
    right = F.minimum(x1 + len1_half, x2 + len2_half)

    return right - left

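# A minimal usage sketch (not from the original source): multi_overlap() above returns
# the signed 1-D overlap of two intervals given in (center, length) form; multiplying
# the zero-clipped x- and y-overlaps gives an intersection area, as in YOLO-style IoU
# computations. The concrete values below are illustrative assumptions.
import numpy as np
import chainer
import chainer.functions as F

cx_a = chainer.Variable(np.array([0.0], dtype=np.float32))
w_a = chainer.Variable(np.array([2.0], dtype=np.float32))
cx_b = chainer.Variable(np.array([1.0], dtype=np.float32))
w_b = chainer.Variable(np.array([2.0], dtype=np.float32))

overlap_x = multi_overlap(cx_a, w_a, cx_b, w_b)  # intervals [-1, 1] and [0, 2] -> 1.0
overlap_x = F.relu(overlap_x)  # a negative value means no overlap along this axis
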
def calc_height_loss(self, height):
    # penalize bboxes that are not high enough to contain text (10 pixels)
    shifted_height = height - 10
    thresholded_height = F.minimum(shifted_height, self.xp.zeros_like(shifted_height))
    thresholded_height *= -1

    return F.average(thresholded_height)

def occupancy_grid_3d(points, *, pitch, origin, dims, threshold=1):
    d_IP, d_JP, d_KP = OccupancyGrid3D(pitch=pitch, origin=origin, dims=dims)(points)
    d_IJKP = F.sqrt(d_IP**2 + d_JP**2 + d_KP**2)
    d_IJK = F.min(d_IJKP, axis=3)
    m_IJK = F.relu(threshold - d_IJK)
    m_IJK = F.minimum(m_IJK, m_IJK.array * 0 + 1)
    return m_IJK

def greedy_actions(self):
    a = self.mu
    if self.min_action is not None:
        a = F.maximum(
            self.xp.broadcast_to(self.min_action, a.data.shape), a)
    if self.max_action is not None:
        a = F.minimum(
            self.xp.broadcast_to(self.max_action, a.data.shape), a)
    return a

def occupancy_grid_2d(points, *, pitch, origin, dimension, threshold=1):
    d_IK, d_JK = OccupancyGrid2D(pitch=pitch, origin=origin, dimension=dimension)(points)
    d_IJK = F.sqrt(d_IK**2 + d_JK**2)
    m_IJK = F.relu(threshold - F.absolute(d_IJK))
    m_IJK = F.minimum(m_IJK, m_IJK.array * 0 + 1)
    m = F.max(m_IJK, axis=2)
    return m

def calc_bboxes(self, predicted_bboxes, image_size, out_size):
    predicted_bboxes = (predicted_bboxes + 1) / 2
    x_points = predicted_bboxes[:, 0, ...] * image_size.width
    y_points = predicted_bboxes[:, 1, ...] * image_size.height
    top_left_x = F.get_item(x_points, [..., 0, 0])
    top_left_y = F.get_item(y_points, [..., 0, 0])
    bottom_right_x = F.get_item(x_points, [..., out_size.height - 1, out_size.width - 1])
    bottom_right_y = F.get_item(y_points, [..., out_size.height - 1, out_size.width - 1])

    bboxes = F.stack(
        [
            F.minimum(top_left_x, bottom_right_x),
            F.minimum(top_left_y, bottom_right_y),
            F.maximum(top_left_x, bottom_right_x),
            F.maximum(top_left_y, bottom_right_y),
        ],
        axis=1
    )
    return bboxes

def greedy_actions(self):
    with chainer.force_backprop_mode():
        a = self.mu
        if self.min_action is not None:
            a = F.maximum(
                self.xp.broadcast_to(self.min_action, a.array.shape), a)
        if self.max_action is not None:
            a = F.minimum(
                self.xp.broadcast_to(self.max_action, a.array.shape), a)
        return a

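# A minimal standalone sketch (not from the original sources) of the bounding pattern
# used by the greedy_actions methods above: actions are pushed into
# [min_action, max_action] with F.maximum / F.minimum, which are Chainer functions,
# so gradients can still flow through the clipped result when the inputs are
# Variables. The concrete values are illustrative assumptions.
import numpy as np
import chainer.functions as F

mu = np.array([[-2.0, 0.3, 1.5]], dtype=np.float32)
min_action = np.array([-1.0, -1.0, -1.0], dtype=np.float32)
max_action = np.array([1.0, 1.0, 1.0], dtype=np.float32)

a = F.maximum(np.broadcast_to(min_action, mu.shape), mu)
a = F.minimum(np.broadcast_to(max_action, a.shape), a)  # a.array == [[-1.0, 0.3, 1.0]]
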
def _compute_ppo_loss(self, obs, acts, at, vt, old_params):
    params = self._pi_f(obs)
    cv = F.flatten(self._vf_f(obs))
    ratio = F.exp(self._logp(params, acts) - self._logp(old_params, acts))
    surr1 = ratio * at
    surr2 = F.clip(ratio, 1 - self._ppo_clipparam, 1 + self._ppo_clipparam) * at
    ppo_surr_loss = (
        -sym_mean(F.minimum(surr1, surr2))
        + self._ppo_klcoeff * sym_mean(self.kl(old_params, params))
        + sym_mean(F.square(cv - vt)))
    return ppo_surr_loss

def batched_triangle_intersect_(p0, p1, p2, eps, fn, id, ro, rd, t0, t1):
    xp = chainer.backend.get_array_module(ro)
    BB = p0.shape[0]
    EB = p0.shape[0]
    _, _, H, W = ro.shape[:4]
    p0 = F.broadcast_to(p0.reshape((BB, 3, 1, 1)), (BB, 3, H, W))
    p1 = F.broadcast_to(p1.reshape((BB, 3, 1, 1)), (BB, 3, H, W))
    p2 = F.broadcast_to(p2.reshape((BB, 3, 1, 1)), (BB, 3, H, W))
    fn = F.broadcast_to(fn.reshape((BB, 3, 1, 1)), (BB, 3, H, W))
    id = F.broadcast_to(id.reshape((BB, 1, 1, 1)), (BB, 1, H, W))
    eps = F.broadcast_to(eps.reshape((EB, 1, 1, 1)), (BB, 1, H, W))
    ro = F.broadcast_to(ro.reshape((1, 3, H, W)), (BB, 3, H, W))
    rd = F.broadcast_to(rd.reshape((1, 3, H, W)), (BB, 3, H, W))
    t0 = F.broadcast_to(t0.reshape((1, 1, H, W)), (BB, 1, H, W))
    t1 = F.broadcast_to(t1.reshape((1, 1, H, W)), (BB, 1, H, W))

    aa = p0 - ro
    A = vdot(aa, fn)
    B = vdot(rd, fn)
    B = F.where(xp.abs(B.data) < eps.data, eps, B)
    #tx = F.where((xp.abs(A.data) < 1e-6)&(xp.abs(B.data) < 1e-6), t1, A / B)
    tx = F.maximum(t0, F.minimum(A / B, t1))
    p = ro + tx * rd

    e0 = p0.data - p.data
    e1 = p1.data - p.data
    e2 = p2.data - p.data
    n01 = vcross_(e0, e1, xp)
    n12 = vcross_(e1, e2, xp)
    n20 = vcross_(e2, e0, xp)
    MASK_P = is_positive_(vdot_(n01, n12, xp))
    MASK_Q = is_positive_(vdot_(n12, n20, xp))
    MASK_R = is_positive_(vdot_(n20, n01, xp))

    MASK_B = is_positive_(xp.abs(B.data))

    #MASK_TN = is_positive(tx)
    MASK_T0 = is_positive_(tx.data - t0.data)
    MASK_T1 = is_positive_(t1.data - tx.data)

    b = MASK_P & MASK_Q & MASK_R & MASK_B & MASK_T0 & MASK_T1

    t = F.where(b, tx, t1)
    p = ro + t * rd
    n = -xp.sign(vdot_(rd.data, fn.data, xp)) * fn
    return b, t, p, n, id

def get_aabb_corners(grids, image_size):
    _, _, height, width = grids.shape
    grids = (grids + 1) / 2
    x_points = grids[:, 0, ...] * image_size.width
    y_points = grids[:, 1, ...] * image_size.height
    x_points = F.clip(x_points, 0., float(image_size.width))
    y_points = F.clip(y_points, 0., float(image_size.height))

    top_left_x = F.get_item(x_points, [..., 0, 0])
    top_left_y = F.get_item(y_points, [..., 0, 0])
    top_right_x = F.get_item(x_points, [..., 0, width - 1])
    top_right_y = F.get_item(y_points, [..., 0, width - 1])
    bottom_right_x = F.get_item(x_points, [..., height - 1, width - 1])
    bottom_right_y = F.get_item(y_points, [..., height - 1, width - 1])
    bottom_left_x = F.get_item(x_points, [..., height - 1, 0])
    bottom_left_y = F.get_item(y_points, [..., height - 1, 0])

    top_left_x_aabb = F.minimum(top_left_x, bottom_left_x)
    top_left_y_aabb = F.minimum(top_left_y, top_right_y)
    bottom_right_x_aabb = F.maximum(top_right_x, bottom_right_x)
    bottom_right_y_aabb = F.maximum(bottom_left_y, bottom_right_y)

    return top_left_y_aabb, top_left_x_aabb, bottom_right_y_aabb, bottom_right_x_aabb

def _perform_gradient_step(self, iterator):
    batch = iterator.next()
    s_current, action, r, s_next, non_terminal = \
        concat_examples(batch, device=self._device)
    r = F.reshape(r, shape=(*r.shape, 1))
    non_terminal = F.reshape(non_terminal, shape=(*non_terminal.shape, 1))

    pi_action, log_pi = self._pi.action_with_log_pi(s_current)
    log_pi = F.reshape(log_pi, shape=(*log_pi.shape, 1))

    q1 = self._q1(s_current, pi_action)
    q2 = self._q2(s_current, pi_action)
    min_q = F.minimum(q1, q2)
    v_target = min_q - log_pi
    v_target.unchain()
    v = self._v(s_current)

    # update v
    v_loss = 0.5 * F.mean_squared_error(v_target, v)
    self._v_optimizer.target.cleargrads()
    v_loss.backward()
    self._v_optimizer.update()

    # update pi
    # Original implementation uses q1 as target instead of min_q
    pi_loss = F.mean(log_pi - q1)
    self._pi_optimizer.target.cleargrads()
    pi_loss.backward()
    self._pi_optimizer.update()

    v_loss.unchain_backward()
    pi_loss.unchain_backward()

    # update q functions
    q_target = r + self._gamma * non_terminal * self._v_target(s_next)
    q_target.unchain()
    q1 = self._q1(s_current, action)
    q2 = self._q2(s_current, action)
    q1_loss = 0.5 * F.mean_squared_error(q_target, q1)
    q2_loss = 0.5 * F.mean_squared_error(q_target, q2)
    q_loss = q1_loss + q2_loss
    self._q1_optimizer.target.cleargrads()
    self._q2_optimizer.target.cleargrads()
    q_loss.backward()
    q_loss.unchain_backward()
    self._q1_optimizer.update()
    self._q2_optimizer.update()

    self._update_target_network(self._v_target, self._v, self._tau)

def update_q_func(self, batch):
    """Compute loss for a given Q-function."""

    batch_next_state = batch['next_state']
    batch_rewards = batch['reward']
    batch_terminal = batch['is_state_terminal']
    batch_state = batch['state']
    batch_actions = batch['action']
    batch_discount = batch['discount']

    with chainer.no_backprop_mode(), chainer.using_config('train', False):
        next_action_distrib = self.policy(batch_next_state)
        next_actions, next_log_prob =\
            next_action_distrib.sample_with_log_prob()
        entropy_term = self.temperature * next_log_prob
        if self.is_discrete:
            next_q1 = F.select_item(self.target_q_func1(batch_next_state),
                                    next_actions)
            next_q2 = F.select_item(self.target_q_func2(batch_next_state),
                                    next_actions)
        else:
            next_q1 = self.target_q_func1(batch_next_state, next_actions)
            next_q2 = self.target_q_func2(batch_next_state, next_actions)
            entropy_term = entropy_term[..., None]
        next_q = F.minimum(next_q1, next_q2)
        assert next_q.shape == entropy_term.shape

        target_q = batch_rewards + batch_discount * \
            (1.0 - batch_terminal) * F.flatten(next_q - entropy_term)

    if self.is_discrete:
        predict_q1 = F.flatten(
            F.select_item(self.q_func1(batch_state), batch_actions))
        predict_q2 = F.flatten(
            F.select_item(self.q_func2(batch_state), batch_actions))
    else:
        predict_q1 = F.flatten(self.q_func1(batch_state, batch_actions))
        predict_q2 = F.flatten(self.q_func2(batch_state, batch_actions))

    loss1 = 0.5 * F.mean_squared_error(target_q, predict_q1)
    loss2 = 0.5 * F.mean_squared_error(target_q, predict_q2)

    # Update stats
    self.q1_record.extend(cuda.to_cpu(predict_q1.array))
    self.q2_record.extend(cuda.to_cpu(predict_q2.array))
    self.q_func1_loss_record.append(float(loss1.array))
    self.q_func2_loss_record.append(float(loss2.array))

    self.q_func1_optimizer.update(lambda: loss1)
    self.q_func2_optimizer.update(lambda: loss2)

def update_q_func(self, batch):
    """Compute loss for a given Q-function."""

    batch_next_state = batch['next_state']
    batch_rewards = batch['reward']
    batch_terminal = batch['is_state_terminal']
    batch_state = batch['state']
    batch_actions = batch['action']
    batch_discount = batch['discount']

    with chainer.no_backprop_mode(), chainer.using_config('train', False):
        next_action_distrib = self.policy(batch_next_state)
        next_actions, next_log_prob =\
            next_action_distrib.sample_with_log_prob()
        next_q1 = self.target_q_func1(batch_next_state, next_actions)
        next_q2 = self.target_q_func2(batch_next_state, next_actions)
        next_q = F.minimum(next_q1, next_q2)
        entropy_term = self.temperature * next_log_prob[..., None]
        assert next_q.shape == entropy_term.shape

        target_q = batch_rewards + batch_discount * \
            (1.0 - batch_terminal) * F.flatten(next_q - entropy_term)

    predict_q1 = F.flatten(self.q_func1(batch_state, batch_actions))
    predict_q2 = F.flatten(self.q_func2(batch_state, batch_actions))

    loss1 = 0.5 * F.mean_squared_error(target_q, predict_q1)
    loss2 = 0.5 * F.mean_squared_error(target_q, predict_q2)

    if self.use_mutual_learning:
        for idx, agent in enumerate(self.all_agents):
            if idx != self.assigned_idx:
                #self.logger.info('Mutual learn Q')
                other_predict_q1 = F.flatten(
                    agent.q_func1(batch_state, batch_actions))
                other_predict_q2 = F.flatten(
                    agent.q_func2(batch_state, batch_actions))
                loss1 += 0.5 * F.mean_squared_error(
                    predict_q1, other_predict_q1)
                loss2 += 0.5 * F.mean_squared_error(
                    predict_q2, other_predict_q2)

    # Update stats
    self.q1_record.extend(cuda.to_cpu(predict_q1.array))
    self.q2_record.extend(cuda.to_cpu(predict_q2.array))
    self.q_func1_loss_record.append(float(loss1.array))
    self.q_func2_loss_record.append(float(loss2.array))

    self.q_func1_optimizer.update(lambda: loss1)
    self.q_func2_optimizer.update(lambda: loss2)

def clamp(x, lower, upper):
    """Naive clamping in [2].

    [[x]]_b = min(max(x, b_lower), b_upper)

    :param x: input variable to clamp element-wise
    :param lower: per-element lower bound (same shape as ``x``)
    :param upper: per-element upper bound (same shape as ``x``)
    :return: ``x`` clamped into ``[lower, upper]``
    """
    # Not None
    assert x.shape == lower.shape
    assert x.shape == upper.shape
    assert (lower.array <= upper.array).all(), " lower is larger than upper" \
        + " lower: " + str(lower) + " upper: " + str(upper)
    return F.minimum(F.maximum(x, lower), upper)

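# A minimal usage sketch (not from the original source): element-wise clamping of a
# Variable into per-element bounds with the clamp() helper above. The concrete values
# are illustrative assumptions.
import numpy as np
import chainer

x = chainer.Variable(np.array([-2.0, 0.5, 3.0], dtype=np.float32))
lower = chainer.Variable(np.array([-1.0, -1.0, -1.0], dtype=np.float32))
upper = chainer.Variable(np.array([1.0, 1.0, 1.0], dtype=np.float32))
y = clamp(x, lower, upper)  # y.array == [-1.0, 0.5, 1.0]
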
def calc_loss(self, grids, image_size):
    top_left_x, top_right_x, bottom_left_x, top_left_y, top_right_y, bottom_left_y = \
        self.get_corners(grids, image_size, scale_to_image_size=False)

    # determine whether a point is out of the image, image range is [-1, 1]
    # everything outside of this increases the loss!
    bbox = F.concat([top_left_x, top_left_y, top_right_x, bottom_left_y], axis=0)
    top_loss = bbox + 1
    bottom_loss = bbox - 1

    # do not penalize anything inside the image
    top_loss = F.absolute(F.minimum(top_loss, self.xp.zeros_like(top_loss)))
    bottom_loss = F.maximum(bottom_loss, self.xp.zeros_like(bottom_loss))

    loss = F.sum(F.concat([top_loss, bottom_loss], axis=0))

    return loss

def __call__(self, t, condition):
    # t(timesteps): 1-T
    distribution = chainer.distributions.Normal(
        self.xp.array(0, dtype='f'), self.xp.array(1, dtype='f'))
    z = distribution.sample(t.shape)
    # z(timesteps): 1-T

    condition = self.encoder(condition)
    # condition(timesteps): 1-T

    s_means, s_scales = self.student(z, condition)
    s_clipped_scales = F.maximum(
        s_scales, self.scalar_to_tensor(s_scales, -7))
    # s_means, s_scales(timesteps): 2-(T+1)

    x = z[:, :, 1:] * F.exp(s_scales[:, :, :-1]) + s_means[:, :, :-1]
    # x(timesteps): 2-T

    with chainer.using_config('train', False):
        y = self.teacher(x, condition[:, :, 1:])
    t_means, t_scales = y[:, 1:2], y[:, 2:3]
    t_clipped_scales = F.maximum(
        t_scales, self.scalar_to_tensor(t_scales, -7))
    # t_means, t_scales(timesteps): 3-(T+1)

    s_distribution = chainer.distributions.Normal(
        s_means[:, :, 1:], log_scale=s_clipped_scales[:, :, 1:])
    t_distribution = chainer.distributions.Normal(
        t_means, log_scale=t_clipped_scales)
    # s_distribution, t_distribution(timesteps): 3-(T+1)

    kl = chainer.kl_divergence(s_distribution, t_distribution)
    kl = F.minimum(
        kl, self.scalar_to_tensor(kl, 100))
    kl = F.average(kl)

    regularization = F.mean_squared_error(
        t_scales, s_scales[:, :, 1:])

    spectrogram_frame_loss = F.mean_squared_error(
        self.stft.magnitude(t[:, :, 1:]), self.stft.magnitude(x))

    loss = kl + self.lmd * regularization + spectrogram_frame_loss

    chainer.reporter.report({
        'loss': loss,
        'kl_divergence': kl,
        'regularization': regularization,
        'spectrogram_frame_loss': spectrogram_frame_loss}, self)
    return loss

def forward(self, inputs, device):
    x1, x2 = inputs
    return functions.minimum(x1, x2),