# Compute bilateral-grid sampling coordinates: continuous positions (gx, gy, gz),
# their floor neighbors (fx, fy, fz), and ceil neighbors (cx, cy, cz), clamped
# to the grid extent.
def grid_coord(guide, xx, yy, sz, small_sz, sigma_r, bs):
    gx = ((xx + 0.5) / sz) * small_sz
    gy = ((yy + 0.5) / sz) * small_sz
    expanded_guide = C.reshape(guide, [bs, 1, sz, sz])
    gz = expanded_guide * sigma_r
    fx = C.floor(gx - 0.5)
    fy = C.floor(gy - 0.5)
    fz = C.clip(C.floor(gz - 0.5), 0, sigma_r - 1)
    cx = C.element_min(fx + 1, small_sz - 1)
    cy = C.element_min(fy + 1, small_sz - 1)
    cz = C.clip(fz + 1, 0, sigma_r - 1)
    return gx, gy, gz, fx, fy, fz, cx, cy, cz
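# A minimal invocation sketch for grid_coord (toy sizes; all shapes and the
# random guide are assumptions, following the index-grid pattern used in the
# bilateral-grid main() below):
import numpy as np
import cntk as C

bs, sz, small_sz, sigma_r = 1, 8, 4, 8
xx = C.Constant(np.arange(sz, dtype=np.float32).reshape(1, -1).repeat(sz, 0))
yy = C.Constant(np.arange(sz, dtype=np.float32).reshape(-1, 1).repeat(sz, 1))
guide = C.input_variable([bs, sz, sz], dynamic_axes=[])
gx, gy, gz, fx, fy, fz, cx, cy, cz = grid_coord(guide, xx, yy, sz, small_sz, sigma_r, bs)
print(gz.eval({guide: np.random.rand(bs, sz, sz).astype(np.float32)}).shape)  # expect (1, 1, 8, 8)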
def Loss(self):
    # Evaluating old actions and values:
    logprobs, state_value, dist_entropy = self.policy.evaluate()

    # Finding the ratio (pi_theta / pi_theta_old), i.e. importance sampling:
    c_old_logprobs = C.input_variable(logprobs.shape, name='old_log_probs')
    ratios = C.exp(logprobs - C.stop_gradient(c_old_logprobs))

    c_rewards = C.input_variable(1, name='rewards')
    advantages = c_rewards - C.stop_gradient(state_value)

    # Finding the surrogate loss:
    surr1 = ratios * advantages
    surr2 = C.clip(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * advantages
    neglog_loss = -C.element_min(surr1, surr2)
    entropy_loss = -0.01 * dist_entropy
    actor_loss = C.reduce_mean(neglog_loss + entropy_loss)
    critic_loss = 0.5 * C.reduce_mean(C.square(state_value - c_rewards))
    loss = actor_loss + critic_loss

    chunk = {
        'neglog_loss': neglog_loss,
        'entropy_loss': entropy_loss,
        'actor_loss': actor_loss,
        'critic_loss': critic_loss
    }

    trainer = C.Trainer(
        loss, (loss, None),
        C.adam(loss.parameters,
               C.learning_parameter_schedule_per_sample(self.lr),
               C.momentum_schedule_per_sample(self.betas[0]),
               variance_momentum=C.momentum_schedule_per_sample(self.betas[1])))
    # Alternative with a higher, fixed learning rate:
    # trainer = C.Trainer(loss, (loss, None),
    #                     C.adam(loss.parameters, C.learning_parameter_schedule(10),
    #                            C.momentum_schedule(0.9),
    #                            variance_momentum=C.momentum_schedule(0.999)))

    return loss, chunk, trainer
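# A tiny numeric check (made-up values) of the clipped-surrogate term above:
# element_min keeps the pessimistic surrogate, so the upside is capped at 1 + eps_clip.
import numpy as np
import cntk as C

eps_clip = 0.2
ratios = np.array([0.5, 1.0, 1.5], dtype=np.float32)
advantages = np.ones(3, dtype=np.float32)
surr1 = ratios * advantages
surr2 = np.clip(ratios, 1 - eps_clip, 1 + eps_clip) * advantages
print(C.element_min(surr1, surr2).eval())  # [0.5, 1.0, 1.2]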
def test_Min(tmpdir):
    data0 = np.asarray([1., 1., 1., 1.], dtype=np.float32)
    data1 = np.asarray([0.5, 0.25, 0.125, 0.], dtype=np.float32)
    model = C.element_min(data0, data1)
    verify_no_input(model, tmpdir, 'Min_0')
def test_Min(tmpdir, dtype):
    with C.default_options(dtype=dtype):
        data0 = np.asarray([1., 1., 1., 1.], dtype=dtype)
        data1 = np.asarray([0.5, 0.25, 0.125, 0.], dtype=dtype)
        model = C.element_min(data0, data1)
        verify_no_input(model, tmpdir, 'Min_0')
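# The dtype argument above is presumably supplied by a pytest parametrization in
# the surrounding test module, and verify_no_input is its save/reload helper.
# A self-contained sketch of the same check with a plain value assertion:
import numpy as np
import pytest
import cntk as C

@pytest.mark.parametrize('dtype', [np.float32, np.float64])
def test_element_min_values(dtype):
    data0 = np.asarray([1., 1., 1., 1.], dtype=dtype)
    data1 = np.asarray([0.5, 0.25, 0.125, 0.], dtype=dtype)
    # element_min is elementwise, so here the result equals data1
    assert np.allclose(C.element_min(data0, data1).eval(), data1)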
def validate_model(test_data, model, polymath):
    begin_logits = model.outputs[0]
    end_logits = model.outputs[1]
    loss = model.outputs[2]
    root = C.as_composite(loss.owner)
    mb_source, input_map = create_mb_and_map(root, test_data, polymath,
                                             randomize=False, repeat=False)
    begin_label = argument_by_name(root, 'ab')
    end_label = argument_by_name(root, 'ae')

    begin_prediction = C.sequence.input_variable(
        1, sequence_axis=begin_label.dynamic_axes[1], needs_gradient=True)
    end_prediction = C.sequence.input_variable(
        1, sequence_axis=end_label.dynamic_axes[1], needs_gradient=True)

    best_span_score = symbolic_best_span(begin_prediction, end_prediction)
    predicted_span = C.layers.Recurrence(C.plus)(
        begin_prediction - C.sequence.past_value(end_prediction))
    true_span = C.layers.Recurrence(C.plus)(
        begin_label - C.sequence.past_value(end_label))
    common_span = C.element_min(predicted_span, true_span)
    begin_match = C.sequence.reduce_sum(
        C.element_min(begin_prediction, begin_label))
    end_match = C.sequence.reduce_sum(C.element_min(end_prediction, end_label))

    predicted_len = C.sequence.reduce_sum(predicted_span)
    true_len = C.sequence.reduce_sum(true_span)
    common_len = C.sequence.reduce_sum(common_span)
    f1 = 2 * common_len / (predicted_len + true_len)
    exact_match = C.element_min(begin_match, end_match)
    precision = common_len / predicted_len
    recall = common_len / true_len
    overlap = C.greater(common_len, 0)

    s = lambda x: C.reduce_sum(x, axis=C.Axis.all_axes())
    stats = C.splice(s(f1), s(exact_match), s(precision), s(recall),
                     s(overlap), s(begin_match), s(end_match))

    # Evaluation parameters
    minibatch_size = 2048
    num_sequences = 0
    stat_sum = 0
    loss_sum = 0

    with tqdm(ncols=32) as progress_bar:
        while True:
            data = mb_source.next_minibatch(minibatch_size, input_map=input_map)
            if not data or not (begin_label in data) or data[begin_label].num_sequences == 0:
                break
            out = model.eval(data, outputs=[begin_logits, end_logits, loss],
                             as_numpy=False)
            testloss = out[loss]
            g = best_span_score.grad(
                {begin_prediction: out[begin_logits],
                 end_prediction: out[end_logits]},
                wrt=[begin_prediction, end_prediction], as_numpy=False)
            other_input_map = {
                begin_prediction: g[begin_prediction],
                end_prediction: g[end_prediction],
                begin_label: data[begin_label],
                end_label: data[end_label]
            }
            stat_sum += stats.eval(other_input_map)
            loss_sum += np.sum(testloss.asarray())
            num_sequences += data[begin_label].num_sequences
            progress_bar.update(data[begin_label].num_sequences)

    stat_avg = stat_sum / num_sequences
    loss_avg = loss_sum / num_sequences

    print("\nValidated {} sequences, loss {:.4f}, F1 {:.4f}, EM {:.4f}, "
          "precision {:4f}, recall {:4f} hasOverlap {:4f}, start_match {:4f}, "
          "end_match {:4f}".format(num_sequences, loss_avg, stat_avg[0],
                                   stat_avg[1], stat_avg[2], stat_avg[3],
                                   stat_avg[4], stat_avg[5], stat_avg[6]))

    return loss_avg
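# The span statistics above reduce to elementwise minima over 0/1 span
# indicators; a plain-numpy sketch of the same F1 computation on made-up masks:
import numpy as np

pred_span = np.array([0, 1, 1, 1, 0], dtype=np.float32)  # predicted span mask
true_span = np.array([0, 0, 1, 1, 1], dtype=np.float32)  # gold span mask
common = np.minimum(pred_span, true_span)                # overlap, as in element_min
f1 = 2 * common.sum() / (pred_span.sum() + true_span.sum())
print(f1)  # 2*2/(3+3) = 0.666...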
def main():
    show_image = False

    if show_image:
        bs = 1
        ci = 3
        co = 3
        cg = co * (ci + 1)
        gd = 8
        gh = 64
        gw = 64
        h = 256
        w = 256
    else:
        bs = 1
        ci = 3
        co = 3
        cg = co * (ci + 1)
        gd = 8
        gh = 64
        gw = 64
        h = 1024
        w = 1024

    im = C.input_variable([bs, ci, h, w], needs_gradient=True, dynamic_axes=[])
    guide = C.input_variable([bs, h, w], needs_gradient=True, dynamic_axes=[])
    guide_no_grad = C.input_variable([bs, h, w], needs_gradient=False,
                                     dynamic_axes=[])
    grid = C.input_variable([bs, cg, gd, gh, gw], needs_gradient=True,
                            dynamic_axes=[])

    # Create indices
    xx = np.arange(0, w).reshape(1, -1).repeat(h, 0).astype(np.float32)
    yy = np.arange(0, h).reshape(-1, 1).repeat(w, 1).astype(np.float32)
    xx = C.Constant(xx, xx.shape)
    yy = C.Constant(yy, yy.shape)

    gx = ((xx + 0.5) / w) * gw
    gy = ((yy + 0.5) / h) * gh
    gz = C.clip(guide, 0.0, 1.0) * gd
    gz_no_grad = C.clip(guide_no_grad, 0.0, 1.0) * gd

    fx = C.element_max(C.floor(gx - 0.5), 0.0)
    fy = C.element_max(C.floor(gy - 0.5), 0.0)
    fz = C.element_max(C.floor(gz - 0.5), 0.0)
    fz_no_grad = C.element_max(C.floor(gz_no_grad - 0.5), 0.0)

    wx = gx - 0.5 - fx
    wy = gy - 0.5 - fy
    wx = C.expand_dims(C.expand_dims(wx, -1 - len(wx.shape)), -1 - len(wx.shape))
    wy = C.expand_dims(C.expand_dims(wy, -1 - len(wy.shape)), -1 - len(wy.shape))
    wz = C.abs(gz - 0.5 - fz)
    wz = C.expand_dims(wz, 0)

    fx = C.expand_dims(C.expand_dims(fx, -1 - len(fx.shape)), -1 - len(fx.shape))
    fy = C.expand_dims(C.expand_dims(fy, -1 - len(fy.shape)), -1 - len(fy.shape))

    cx = C.element_min(fx + 1, gw - 1)
    cy = C.element_min(fy + 1, gh - 1)
    cz = C.element_min(fz_no_grad + 1, gd - 1)

    batch_idx = np.arange(bs).reshape(bs, 1, 1, 1).astype(np.float32)
    batch_idx = C.Constant(batch_idx, batch_idx.shape)

    out = []
    flat_grid = C.reshape(grid, [-1])
    for c_ in range(co):
        c_idx = np.arange((ci + 1) * c_, (ci + 1) * (c_ + 1)).reshape(
            1, ci + 1, 1, 1).astype(np.float32)
        c_idx = C.Constant(c_idx, c_idx.shape)

        def flatten_and_gather(x, y, z):
            linear_idx = x + gw * y + gw * gh * z + \
                c_idx * gw * gh * gd + batch_idx * gw * gh * gd * cg
            flat_linear_idx = C.reshape(linear_idx, [-1])
            return C.reshape(C.gather(flat_grid, flat_linear_idx),
                             linear_idx.shape)

        gather_fff = flatten_and_gather(fx, fy, fz_no_grad)
        gather_ffc = flatten_and_gather(fx, fy, cz)
        gather_fcf = flatten_and_gather(fx, cy, fz_no_grad)
        gather_fcc = flatten_and_gather(fx, cy, cz)
        gather_cff = flatten_and_gather(cx, fy, fz_no_grad)
        gather_cfc = flatten_and_gather(cx, fy, cz)
        gather_ccf = flatten_and_gather(cx, cy, fz_no_grad)
        gather_ccc = flatten_and_gather(cx, cy, cz)

        # Trilinear interpolation of the eight grid neighbors
        a = gather_fff*(1-wx)*(1-wy)*(1-wz) + \
            gather_ffc*(1-wx)*(1-wy)*(  wz) + \
            gather_fcf*(1-wx)*(  wy)*(1-wz) + \
            gather_fcc*(1-wx)*(  wy)*(  wz) + \
            gather_cff*(  wx)*(1-wy)*(1-wz) + \
            gather_cfc*(  wx)*(1-wy)*(  wz) + \
            gather_ccf*(  wx)*(  wy)*(1-wz) + \
            gather_ccc*(  wx)*(  wy)*(  wz)

        o = C.reduce_sum(a[:, :-1, ...] * im, 1) + a[:, -1, ...]
        print(o.shape)
        out.append(C.expand_dims(o, 0))

    out = C.splice(*out, axis=1)
    loss = C.reduce_l2(out)

    grid_val = np.random.rand(bs, cg, gd, gh, gw).astype(np.float32)
    if show_image:
        guide_val = skio.imread("/data/rgb.png").mean(2)[:h, :w].astype(np.float32)
        guide_val = np.expand_dims(guide_val / 255.0, 0)
        im_val = np.tile(np.expand_dims(guide_val, 1), [1, 3, 1, 1])
        out_val = out.eval({
            im: im_val,
            guide: guide_val,
            guide_no_grad: guide_val,
            grid: grid_val
        })
        out_val = np.clip(np.transpose(np.squeeze(out_val), [1, 2, 0]), 0, 1)
        skio.imsave("/output/imout.png", out_val)
    else:
        im_val = np.random.randn(bs, ci, h, w)
        guide_val = np.random.rand(bs, h, w).astype(np.float32)

        # Burn-in iterations
        for it in range(5):
            print('burning (', it, ')')
            g = loss.grad({
                im: im_val,
                guide: guide_val,
                guide_no_grad: guide_val,
                grid: grid_val
            })

        # Timed iterations
        start = time.time()
        for it in range(50):
            print('profiling (', it, ')')
            g = loss.grad({
                im: im_val,
                guide: guide_val,
                guide_no_grad: guide_val,
                grid: grid_val
            })
        end = time.time()

        runtime = (end - start) * 1000.0 / 50.0
        print('Runtime:', runtime)
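# flatten_and_gather linearizes a 5-D (batch, channel, z, y, x) index into the
# flattened grid; a quick numpy sanity check of that index arithmetic (toy
# sizes, not from the original source):
import numpy as np

bs, cg, gd, gh, gw = 1, 2, 2, 3, 4
grid = np.arange(bs * cg * gd * gh * gw, dtype=np.float32).reshape(bs, cg, gd, gh, gw)
b, c, z, y, x = 0, 1, 1, 2, 3
linear_idx = x + gw * y + gw * gh * z + c * gw * gh * gd + b * gw * gh * gd * cg
assert grid.reshape(-1)[linear_idx] == grid[b, c, z, y, x]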
import cntk

print("Tensor A = [1,2,3]")
print("Tensor B = [4,5,6]\n")

print("A+B:")
tensor_sum = cntk.plus([1, 2, 3], [4, 5, 6]).eval()
print("{}\n".format(tensor_sum))

print("A-B:")
difference = cntk.minus([1, 2, 3], [4, 5, 6]).eval()
print("{}\n".format(difference))

print("A*B:")
# element_times is the elementwise product; cntk.times would be a dot/matrix product
product = cntk.element_times([1, 2, 3], [4, 5, 6]).eval()
print("{}\n".format(product))

print("A/B:")
quotient = cntk.element_divide([1, 2, 3], [4, 5, 6]).eval()
print("{}\n".format(quotient))

print("A^B:")
power = cntk.pow([1, 2, 3], [4, 5, 6]).eval()
print("{}\n".format(power))

print("Min(A,B):")
# element_min/element_max take two operands; a third positional argument would
# be interpreted as the (string) name and raise an error
minimum = cntk.element_min([1, 2, 3], [4, 5, 6]).eval()
print("{}\n".format(minimum))

print("Max(A,B):")
maximum = cntk.element_max([1, 2, 3], [4, 5, 6]).eval()
print("{}\n".format(maximum))
def run(self):
    while self.episode < EPISODES:
        obs, action, pred, reward = self.get_batch()
        obs, action, pred, reward = (obs[:BUFFER_SIZE], action[:BUFFER_SIZE],
                                     pred[:BUFFER_SIZE], reward[:BUFFER_SIZE])
        old_prediction = pred

        # Keras reference implementation this was ported from:
        # pred_values = self.critic.predict(obs)
        # advantage = reward - pred_values
        # actor_loss = self.actor.fit([obs, advantage, old_prediction], [action],
        #                             batch_size=BATCH_SIZE, shuffle=True,
        #                             epochs=EPOCHS, verbose=False)
        # critic_loss = self.critic.fit([obs], [reward], batch_size=BATCH_SIZE,
        #                               shuffle=True, epochs=EPOCHS, verbose=False)
        # self.writer.add_scalar('Actor loss', actor_loss.history['loss'][-1], self.gradient_steps)
        # self.writer.add_scalar('Critic loss', critic_loss.history['loss'][-1], self.gradient_steps)

        #region actor training
        pred_values = self.critic.eval({self.critic.arguments[0]: obs})
        advantage = reward - pred_values

        c_action = C.input_variable(action.shape[-1], name='action')
        c_prediction = C.input_variable(old_prediction.shape[-1], name='old_prediction')
        c_advantage = C.input_variable(1, name='advantage')

        prob = C.reduce_sum(c_action * self.actor)
        old_prob = C.reduce_sum(c_action * c_prediction)
        ratio = prob / (old_prob + 1e-10)

        surr1 = c_advantage * ratio
        surr2 = c_advantage * C.clip(ratio, 1 - LOSS_CLIPPING, 1 + LOSS_CLIPPING)
        # loss = -C.reduce_mean(C.element_min(surr1, surr2) + ENTROPY_LOSS * -(prob * C.log(prob + 1e-10)))  # from keras
        neglog_loss = -C.element_min(surr1, surr2)
        entropy_loss = -ENTROPY_LOSS * -(prob * C.log(prob + 1e-10))
        loss = C.reduce_mean(neglog_loss + entropy_loss)
        actor_loss = loss
        trainer = C.Trainer(
            actor_loss, (actor_loss, None),
            C.adam(actor_loss.parameters,
                   C.learning_parameter_schedule_per_sample(LR),
                   C.momentum_schedule_per_sample(0.99)))

        avg = 0
        avg_out = {neglog_loss.output: 0, entropy_loss.output: 0}
        for epoch in range(EPOCHS):
            data_size = action.shape[0]
            shuffle_idx = random.sample(list(range(data_size)), data_size)
            mb_action = action[shuffle_idx]
            mb_obs = obs[shuffle_idx]
            mb_old_prediction = old_prediction[shuffle_idx]
            mb_advantage = advantage[shuffle_idx]
            # NOTE: feeding by position relies on the order of actor_loss.arguments
            updated, out = trainer.train_minibatch(
                dict(zip(actor_loss.arguments,
                         [mb_advantage, mb_action, mb_obs, mb_old_prediction])),
                outputs=[neglog_loss.output, entropy_loss.output])
            avg += trainer.previous_minibatch_loss_average
            avg_out[neglog_loss.output] += out[neglog_loss.output].mean()
            avg_out[entropy_loss.output] += out[entropy_loss.output].mean()
        #endregion

        self.writer.add_scalar('Actor loss', avg / EPOCHS, self.gradient_steps)
        self.writer.add_scalar('neglog loss',
                               avg_out[neglog_loss.output] / EPOCHS,
                               self.gradient_steps)
        self.writer.add_scalar('entropy loss',
                               avg_out[entropy_loss.output] / EPOCHS,
                               self.gradient_steps)

        #region critic training
        c_reward = C.input_variable(1, name='reward')
        loss = C.reduce_mean(C.square(self.critic - c_reward))
        critic_loss = loss
        trainer = C.Trainer(
            critic_loss, (critic_loss, None),
            C.adam(critic_loss.parameters,
                   C.learning_parameter_schedule_per_sample(LR),
                   C.momentum_schedule_per_sample(0.99)))

        avg = 0
        for epoch in range(EPOCHS):
            data_size = action.shape[0]
            shuffle_idx = random.sample(list(range(data_size)), data_size)
            mb_obs = obs[shuffle_idx]
            mb_reward = reward[shuffle_idx]
            trainer.train_minibatch(
                dict(zip(critic_loss.arguments, [mb_obs, mb_reward])))
            avg += trainer.previous_minibatch_loss_average
        #endregion

        self.writer.add_scalar('Critic loss', avg / EPOCHS, self.gradient_steps)
        self.gradient_steps += 1
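# Feeding train_minibatch by zipping over actor_loss.arguments relies on CNTK's
# argument ordering. A name-based mapping is less fragile; this is a sketch that
# uses the name= tags given above and assumes the one remaining unnamed
# argument is the actor's observation input:
args_by_name = {a.name: a for a in actor_loss.arguments}
obs_arg = next(a for a in actor_loss.arguments
               if a.name not in ('advantage', 'action', 'old_prediction'))
feed = {
    args_by_name['advantage']: mb_advantage,
    args_by_name['action']: mb_action,
    args_by_name['old_prediction']: mb_old_prediction,
    obs_arg: mb_obs,
}
trainer.train_minibatch(feed)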
def validate_model(test_data, model, polymath):
    begin_logits = model.outputs[0]
    end_logits = model.outputs[1]
    loss = model.outputs[2]
    root = C.as_composite(loss.owner)
    mb_source, input_map = create_mb_and_map(root, test_data, polymath,
                                             randomize=False, repeat=False)
    begin_label = argument_by_name(root, 'ab')
    end_label = argument_by_name(root, 'ae')

    begin_prediction = C.sequence.input_variable(
        1, sequence_axis=begin_label.dynamic_axes[1], needs_gradient=True)
    end_prediction = C.sequence.input_variable(
        1, sequence_axis=end_label.dynamic_axes[1], needs_gradient=True)

    best_span_score = symbolic_best_span(begin_prediction, end_prediction)
    predicted_span = C.layers.Recurrence(C.plus)(
        begin_prediction - C.sequence.past_value(end_prediction))
    true_span = C.layers.Recurrence(C.plus)(
        begin_label - C.sequence.past_value(end_label))
    common_span = C.element_min(predicted_span, true_span)
    begin_match = C.sequence.reduce_sum(
        C.element_min(begin_prediction, begin_label))
    end_match = C.sequence.reduce_sum(C.element_min(end_prediction, end_label))

    predicted_len = C.sequence.reduce_sum(predicted_span)
    true_len = C.sequence.reduce_sum(true_span)
    common_len = C.sequence.reduce_sum(common_span)
    f1 = 2 * common_len / (predicted_len + true_len)
    exact_match = C.element_min(begin_match, end_match)
    precision = common_len / predicted_len
    recall = common_len / true_len
    overlap = C.greater(common_len, 0)

    s = lambda x: C.reduce_sum(x, axis=C.Axis.all_axes())
    stats = C.splice(s(f1), s(exact_match), s(precision), s(recall),
                     s(overlap), s(begin_match), s(end_match))

    # Evaluation parameters
    minibatch_size = 20000
    num_sequences = 0
    stat_sum = 0
    loss_sum = 0

    while True:
        data = mb_source.next_minibatch(minibatch_size, input_map=input_map)
        if not data or not (begin_label in data) or data[begin_label].num_sequences == 0:
            break
        out = model.eval(data, outputs=[begin_logits, end_logits, loss],
                         as_numpy=False)
        testloss = out[loss]
        g = best_span_score.grad(
            {begin_prediction: out[begin_logits],
             end_prediction: out[end_logits]},
            wrt=[begin_prediction, end_prediction], as_numpy=False)
        other_input_map = {
            begin_prediction: g[begin_prediction],
            end_prediction: g[end_prediction],
            begin_label: data[begin_label],
            end_label: data[end_label]
        }
        stat_sum += stats.eval(other_input_map)
        loss_sum += np.sum(testloss.asarray())
        num_sequences += data[begin_label].num_sequences

    stat_avg = stat_sum / num_sequences
    loss_avg = loss_sum / num_sequences

    print("Validated {} sequences, loss {:.4f}, F1 {:.4f}, EM {:.4f}, "
          "precision {:4f}, recall {:4f} hasOverlap {:4f}, start_match {:4f}, "
          "end_match {:4f}".format(num_sequences, loss_avg, stat_avg[0],
                                   stat_avg[1], stat_avg[2], stat_avg[3],
                                   stat_avg[4], stat_avg[5], stat_avg[6]))

    return loss_avg
def self_attention_layer(in_dims: int,
                         out_dims: int,
                         name='self_attention',
                         as_block: bool = False,
                         k_ph: bool = False,
                         v_ph: bool = False,
                         mask_opt: bool = False) -> C.Function:
    sq_sa_dims = C.Constant(C.sqrt(out_dims).eval(), name='sq_dims')

    X = C.placeholder(in_dims,
                      (C.Axis.default_batch_axis(), C.Axis.default_dynamic_axis()),
                      name=name + '_ph')

    if k_ph is False and v_ph is False:
        # Equivalent to W_Q/W_K/W_V parameters of shape (in_dims, out_dims)
        q = C.layers.Dense(out_dims, name=name + '_q')(X)
        k = C.layers.Dense(out_dims, name=name + '_k')(X)
        v = C.layers.Dense(out_dims, name=name + '_v')(X)
    elif k_ph is True and v_ph is True:
        q = C.layers.Dense(out_dims, name=name + '_q')(X)
        k = C.placeholder(out_dims,
                          (C.Axis.default_batch_axis(), C.Axis('kv_seq')),
                          name=name + '_k_ph')
        v = C.placeholder(out_dims,
                          (C.Axis.default_batch_axis(), C.Axis('kv_seq')),
                          name=name + '_v_ph')
    else:
        raise Exception(f'k_ph:{k_ph}, v_ph:{v_ph}')

    q_ = C.sequence.unpack(q, 0, True, name=name + '_unpack_q')
    k_ = C.sequence.unpack(k, 0, True, name=name + '_unpack_k')
    v_ = C.sequence.unpack(v, 0, True, name=name + '_unpack_v')

    scores = C.times_transpose(q_, k_, name=name + '_score_matrix')
    scaled = scores / sq_sa_dims  # divide by sqrt(d_k)

    if mask_opt:
        mask = triangular_matrix_seq(2)(X)
        inf_mask = -np.inf * (mask - 0.5)
        inf_mask = C.as_block(inf_mask, [(X, X)], 'mask', 'mask')
        scaled = C.element_min(scaled, inf_mask)

    softmax = C.softmax(scaled, name=name + '_softmax')
    attention = C.times(softmax, v_, name=name + '_attention')

    result = C.to_sequence_like(attention, X)

    if as_block:
        if k_ph is False and v_ph is False:
            return C.as_block(result, [(X, X)], 'self_attention', 'self_attention_')
        elif k_ph is True and v_ph is True:
            return C.as_block(result, [(X, X), (k, k), (v, v)],
                              'self_attention', 'self_attention_')
        else:
            raise Exception(f'k_ph:{k_ph} v_ph:{v_ph}')
    else:
        return result
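# A minimal invocation sketch (dimensions and the random input are assumptions;
# triangular_matrix_seq is only needed when mask_opt=True, so it is left off here):
import numpy as np
import cntk as C

x = C.sequence.input_variable(16)
attn = self_attention_layer(16, 16, as_block=True)(x)
seq = np.random.rand(1, 5, 16).astype(np.float32)  # one sequence of length 5
print(attn.eval({x: seq}).shape)  # expect (1, 5, 16)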
def gpt2_self_attention(token_dims: int,
                        head_dims: int,
                        mask_opt: bool = False,
                        as_block: bool = False,
                        name: str = 'self_attention'):
    X = C.placeholder(token_dims,
                      dynamic_axes=(C.Axis.default_batch_axis(),
                                    C.Axis.default_dynamic_axis()),
                      name=name)

    # Single fused projection instead of separate q/k/v Dense layers:
    # q = C.layers.Dense(token_dims, name=name+'_q')(X)
    # k = C.layers.Dense(token_dims, name=name+'_k')(X)
    # v = C.layers.Dense(token_dims, name=name+'_v')(X)
    qkv = C.layers.Dense((3, token_dims), name='qkv')(X)
    q_seq, k_seq, v_seq = qkv[0], qkv[1], qkv[2]

    q_mh = C.reshape(q_seq, (head_dims, -1), name='multi_head_q')
    k_mh = C.reshape(k_seq, (head_dims, -1), name='multi_head_k')
    v_mh = C.reshape(v_seq, (head_dims, -1), name='multi_head_v')

    #region split multi-head attention
    q_heads = [C.squeeze(q_mh[i], name='single_head_q' + str(i))
               for i in range(head_dims)]
    k_heads = [C.squeeze(k_mh[i], name='single_head_k' + str(i))
               for i in range(head_dims)]
    v_heads = [C.squeeze(v_mh[i], name='single_head_v' + str(i))
               for i in range(head_dims)]
    #endregion

    attention_head = []
    for i in range(head_dims):
        q = q_heads[i]
        k = k_heads[i]
        v = v_heads[i]

        #region score
        # q_ = C.sequence.last(q, name='last_q'+str(i))  # q present
        q_ = C.sequence.unpack(q, 0, True, name='seq_q' + str(i))  # q seq
        k_ = C.sequence.unpack(k, 0, True, name='seq_k' + str(i))  # k seq
        v_ = C.sequence.unpack(v, 0, True, name='seq_v' + str(i))  # v seq

        scores = C.times_transpose(q_, k_)
        scaled = scores * (1 / C.sqrt(v_.shape[-1]))

        #region mask opt
        mask = triangular_matrix_seq(2)(X)
        inf_mask = -np.inf * (mask - 0.5)
        inf_mask = C.as_block(inf_mask, [(X, X)], 'mask', 'mask')
        scaled = C.element_min(scaled, inf_mask)
        #endregion

        softmax = C.softmax(scaled)
        #endregion

        #region sum
        attention = C.times(softmax, v_)
        attention_seq = C.to_sequence_like(attention, X)
        #endregion

        attention_head.append(attention_seq)

    #region merge attention heads
    attention = C.splice(*attention_head, name='merged_attention')
    #endregion

    #region project
    project = C.layers.Dense(token_dims, name='project')(attention)
    #endregion

    if as_block:
        return C.as_block(project, [(X, X)], 'gpt2_self_attention',
                          'gpt2_self_attention')
    return project
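# The causal mask drives future-position scores to -inf before the softmax; a
# plain-numpy illustration of the element_min trick (the 1-marks-blocked mask
# convention is an assumption, since triangular_matrix_seq is defined elsewhere):
import numpy as np

scores = np.arange(9, dtype=np.float32).reshape(3, 3) / 10.0
mask = np.triu(np.ones((3, 3), dtype=np.float32), k=1)  # 1 marks a future position
inf_mask = -np.inf * (mask - 0.5)   # -inf where blocked, +inf where allowed
masked = np.minimum(scores, inf_mask)
e = np.exp(masked - masked.max(-1, keepdims=True))
print(e / e.sum(-1, keepdims=True))  # rows sum to 1; future positions get weight 0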