Exemple #1
0
def grid_coord(guide, xx, yy, sz, small_sz, sigma_r, bs):
    """Convert pixel positions and guide values into grid coordinates.

    ``xx`` and ``yy`` are rescaled from a ``sz``-wide image to a
    ``small_sz``-wide grid; ``guide`` is reshaped to ``[bs, 1, sz, sz]`` and
    scaled by ``sigma_r``.  For every axis the lower (``f*``) and upper
    (``c*``) neighbouring cell indices are computed as well.

    Returns:
        The tuple ``(gx, gy, gz, fx, fy, fz, cx, cy, cz)``.

    Note:
        ``fz`` and ``cz`` are clipped to ``[0, sigma_r - 1]`` and ``cx``/``cy``
        are capped at ``small_sz - 1``, but ``fx``/``fy`` have no lower bound
        applied here.
    """
    last_xy = small_sz - 1
    last_z = sigma_r - 1

    # Continuous grid-space coordinates.
    gx = ((xx + 0.5) / sz) * small_sz
    gy = ((yy + 0.5) / sz) * small_sz
    gz = C.reshape(guide, [bs, 1, sz, sz]) * sigma_r

    # Lower neighbours.
    fx, fy = C.floor(gx - 0.5), C.floor(gy - 0.5)
    fz = C.clip(C.floor(gz - 0.5), 0, last_z)

    # Upper neighbours.
    cx = C.element_min(fx + 1, last_xy)
    cy = C.element_min(fy + 1, last_xy)
    cz = C.clip(fz + 1, 0, last_z)

    return gx, gy, gz, fx, fy, fz, cx, cy, cz
Exemple #2
0
    def Loss(self):
        """Assemble the clipped-surrogate actor/critic loss and its trainer.

        Two new input variables are created: ``'old_log_probs'`` (same shape
        as the log-probabilities returned by ``self.policy.evaluate()``) and
        ``'rewards'`` (shape 1).

        Returns:
            A ``(loss, chunk, trainer)`` tuple.  ``loss`` is the sum of the
            actor and critic losses, ``chunk`` maps a name to each individual
            loss term, and ``trainer`` is a ``C.Trainer`` that minimises
            ``loss`` with Adam using ``self.lr`` and ``self.betas``.
        """
        logprobs, state_value, dist_entropy = self.policy.evaluate()

        old_logprobs_in = C.input_variable(logprobs.shape,
                                           name='old_log_probs')
        rewards_in = C.input_variable(1, name='rewards')

        # Importance-sampling ratio pi_theta / pi_theta_old, computed in log
        # space.
        ratios = C.exp(logprobs - C.stop_gradient(old_logprobs_in))
        # The value estimate is treated as a constant inside the advantage.
        advantages = rewards_in - C.stop_gradient(state_value)

        # Surrogate objective: element-wise minimum of the raw and the
        # clipped ratio, each weighted by the advantage.
        lower, upper = 1 - self.eps_clip, 1 + self.eps_clip
        unclipped = ratios * advantages
        clipped = C.clip(ratios, lower, upper) * advantages
        neglog_loss = -C.element_min(unclipped, clipped)
        entropy_loss = -0.01 * dist_entropy

        actor_loss = C.reduce_mean(neglog_loss + entropy_loss)
        critic_loss = 0.5 * C.reduce_mean(C.square(state_value - rewards_in))
        loss = actor_loss + critic_loss

        chunk = dict(
            neglog_loss=neglog_loss,
            entropy_loss=entropy_loss,
            actor_loss=actor_loss,
            critic_loss=critic_loss,
        )

        lr_schedule = C.learning_parameter_schedule_per_sample(self.lr)
        first_moment = C.momentum_schedule_per_sample(self.betas[0])
        second_moment = C.momentum_schedule_per_sample(self.betas[1])
        learner = C.adam(loss.parameters,
                         lr_schedule,
                         first_moment,
                         variance_momentum=second_moment)
        # No separate evaluation metric is attached (second tuple slot is None).
        trainer = C.Trainer(loss, (loss, None), learner)
        # Alternative kept for reference (higher learning rate):
        # trainer = C.Trainer(loss, (loss, None), C.adam(loss.parameters, C.learning_parameter_schedule(10), C.momentum_schedule(0.9), variance_momentum=C.momentum_schedule(0.999)))

        return loss, chunk, trainer
Exemple #3
0
def test_Min(tmpdir):
    """Build an input-free ``element_min`` graph and check it as 'Min_0'.

    The graph takes the element-wise minimum of a vector of ones and a vector
    of smaller values, then hands it to ``verify_no_input`` together with the
    ``tmpdir`` fixture.
    """
    ones = np.ones(4, dtype=np.float32)
    smaller = np.array([0.5, 0.25, 0.125, 0.0], dtype=np.float32)
    verify_no_input(C.element_min(ones, smaller), tmpdir, 'Min_0')
Exemple #4
0
def test_Min(tmpdir, dtype):
    """Check an input-free ``element_min`` graph for the given ``dtype``.

    Both operands are created with ``dtype`` and the graph is built while
    ``dtype`` is the CNTK default, then passed to ``verify_no_input`` under
    the name 'Min_0'.
    """
    left_values = (1., 1., 1., 1.)
    right_values = (0.5, 0.25, 0.125, 0.)
    with C.default_options(dtype=dtype):
        left = np.array(left_values, dtype=dtype)
        right = np.array(right_values, dtype=dtype)
        verify_no_input(C.element_min(left, right), tmpdir, 'Min_0')
Exemple #5
0
def test_Min(tmpdir, dtype):
    """Verify ``C.element_min`` on two constant vectors of type ``dtype``.

    The model has no free inputs; it is built inside
    ``C.default_options(dtype=dtype)`` and forwarded to ``verify_no_input``
    with the case name 'Min_0'.
    """
    with C.default_options(dtype=dtype):
        operands = [
            np.ones(4, dtype=dtype),
            np.asarray([0.5, 0.25, 0.125, 0.], dtype=dtype),
        ]
        min_model = C.element_min(operands[0], operands[1])
        verify_no_input(min_model, tmpdir, 'Min_0')
def validate_model(test_data, model, polymath):
    """Evaluate ``model`` on ``test_data`` and print span-level statistics.

    The first three outputs of ``model`` are used as the begin logits, the end
    logits and the loss.  For every minibatch the logits are converted into a
    predicted span by taking the gradient of ``symbolic_best_span`` with
    respect to its two inputs (presumably an indicator of the chosen begin and
    end positions -- confirm against ``symbolic_best_span``).  That prediction
    is then compared with the labels bound to the arguments named ``'ab'`` and
    ``'ae'``.

    Args:
        test_data: Forwarded to ``create_mb_and_map``.
        model: CNTK function whose outputs are (begin logits, end logits,
            loss), in that order.
        polymath: Forwarded to ``create_mb_and_map``.

    Returns:
        The summed loss divided by the number of validated sequences.

    Raises:
        ZeroDivisionError: If the minibatch source yields no sequences.
    """
    begin_logits = model.outputs[0]
    end_logits = model.outputs[1]
    loss = model.outputs[2]
    root = C.as_composite(loss.owner)
    mb_source, input_map = create_mb_and_map(root,
                                             test_data,
                                             polymath,
                                             randomize=False,
                                             repeat=False)
    begin_label = argument_by_name(root, 'ab')
    end_label = argument_by_name(root, 'ae')

    # Stand-alone inputs for the predictions; they share the sequence axis of
    # the corresponding labels so the two can be combined element-wise.
    begin_prediction = C.sequence.input_variable(
        1, sequence_axis=begin_label.dynamic_axes[1], needs_gradient=True)
    end_prediction = C.sequence.input_variable(
        1, sequence_axis=end_label.dynamic_axes[1], needs_gradient=True)

    best_span_score = symbolic_best_span(begin_prediction, end_prediction)
    # Running sum over the sequence of (begin - previous step's end).
    predicted_span = C.layers.Recurrence(
        C.plus)(begin_prediction - C.sequence.past_value(end_prediction))
    true_span = C.layers.Recurrence(C.plus)(begin_label -
                                            C.sequence.past_value(end_label))
    common_span = C.element_min(predicted_span, true_span)
    begin_match = C.sequence.reduce_sum(
        C.element_min(begin_prediction, begin_label))
    end_match = C.sequence.reduce_sum(C.element_min(end_prediction, end_label))

    predicted_len = C.sequence.reduce_sum(predicted_span)
    true_len = C.sequence.reduce_sum(true_span)
    common_len = C.sequence.reduce_sum(common_span)
    f1 = 2 * common_len / (predicted_len + true_len)
    exact_match = C.element_min(begin_match, end_match)
    precision = common_len / predicted_len
    recall = common_len / true_len
    overlap = C.greater(common_len, 0)

    def s(x):
        # Sum over every axis so each statistic becomes one number per batch.
        return C.reduce_sum(x, axis=C.Axis.all_axes())

    stats = C.splice(s(f1), s(exact_match), s(precision), s(recall),
                     s(overlap), s(begin_match), s(end_match))

    # Evaluation parameters
    minibatch_size = 2048
    num_sequences = 0

    stat_sum = 0
    loss_sum = 0

    with tqdm(ncols=32) as progress_bar:
        while True:
            data = mb_source.next_minibatch(minibatch_size,
                                            input_map=input_map)
            if (not data or begin_label not in data
                    or data[begin_label].num_sequences == 0):
                break
            out = model.eval(data,
                             outputs=[begin_logits, end_logits, loss],
                             as_numpy=False)
            testloss = out[loss]
            # Gradient of the best-span score w.r.t. the logits, fed back in
            # below as the "prediction" inputs of the statistics graph.
            g = best_span_score.grad(
                {
                    begin_prediction: out[begin_logits],
                    end_prediction: out[end_logits]
                },
                wrt=[begin_prediction, end_prediction],
                as_numpy=False)
            other_input_map = {
                begin_prediction: g[begin_prediction],
                end_prediction: g[end_prediction],
                begin_label: data[begin_label],
                end_label: data[end_label]
            }
            stat_sum += stats.eval(other_input_map)
            loss_sum += np.sum(testloss.asarray())
            num_sequences += data[begin_label].num_sequences
            progress_bar.update(data[begin_label].num_sequences)

    stat_avg = stat_sum / num_sequences
    loss_avg = loss_sum / num_sequences

    # Every statistic is printed with four decimals ("{:.4f}"); a bare
    # "{:4f}" would only set a minimum field width.
    print(
        "\nValidated {} sequences, loss {:.4f}, F1 {:.4f}, EM {:.4f}, precision {:.4f}, recall {:.4f} hasOverlap {:.4f}, start_match {:.4f}, end_match {:.4f}"
        .format(num_sequences, loss_avg, stat_avg[0], stat_avg[1], stat_avg[2],
                stat_avg[3], stat_avg[4], stat_avg[5], stat_avg[6]))

    return loss_avg
Exemple #7
0
def main(show_image=False):
    """Build a grid-lookup graph in CNTK and either render or profile it.

    For every pixel the graph gathers the eight cells of ``grid`` surrounding
    the pixel's ``(x, y, guide)`` coordinate, blends them trilinearly, and
    applies the resulting ``ci + 1`` coefficients per output channel to ``im``
    as an affine transform (weighted channel sum plus offset).

    Args:
        show_image: When true, evaluate the graph once on a 256x256 crop of
            ``/data/rgb.png`` and write the result to ``/output/imout.png``.
            When false (the default), run 5 warm-up and 50 timed gradient
            evaluations on random 1024x1024 inputs and print the mean runtime
            per evaluation in milliseconds.
    """
    # Problem sizes; only the frame size differs between the two modes.
    bs = 1
    ci = 3
    co = 3
    cg = co * (ci + 1)
    gd = 8
    gh = 64
    gw = 64
    h = w = 256 if show_image else 1024
    warmup_iters = 5
    timed_iters = 50

    im = C.input_variable([bs, ci, h, w], needs_gradient=True, dynamic_axes=[])
    guide = C.input_variable([bs, h, w], needs_gradient=True, dynamic_axes=[])
    # Second copy of the guide without gradient; it feeds the integer cell
    # indices used for gathering.
    guide_no_grad = C.input_variable([bs, h, w],
                                     needs_gradient=False,
                                     dynamic_axes=[])
    grid = C.input_variable([bs, cg, gd, gh, gw],
                            needs_gradient=True,
                            dynamic_axes=[])

    # Per-pixel x / y indices, each of shape (h, w).
    xx = np.arange(0, w).reshape(1, -1).repeat(h, 0).astype(np.float32)
    yy = np.arange(0, h).reshape(-1, 1).repeat(w, 1).astype(np.float32)
    xx = C.Constant(xx, xx.shape)
    yy = C.Constant(yy, yy.shape)

    # Continuous grid coordinates and their lower neighbours (clamped at 0).
    gx = ((xx + 0.5) / w) * gw
    gy = ((yy + 0.5) / h) * gh
    gz = C.clip(guide, 0.0, 1.0) * gd
    gz_no_grad = C.clip(guide_no_grad, 0.0, 1.0) * gd
    fx = C.element_max(C.floor(gx - 0.5), 0.0)
    fy = C.element_max(C.floor(gy - 0.5), 0.0)
    fz = C.element_max(C.floor(gz - 0.5), 0.0)
    fz_no_grad = C.element_max(C.floor(gz_no_grad - 0.5), 0.0)

    def expand_twice(t):
        # Insert two singleton axes, both at position -1 - rank(t).
        axis = -1 - len(t.shape)
        return C.expand_dims(C.expand_dims(t, axis), axis)

    # Interpolation weights along each axis.
    wx = expand_twice(gx - 0.5 - fx)
    wy = expand_twice(gy - 0.5 - fy)
    wz = C.expand_dims(C.abs(gz - 0.5 - fz), 0)
    fx = expand_twice(fx)
    fy = expand_twice(fy)

    # Upper neighbours, capped at the last valid cell.
    cx = C.element_min(fx + 1, gw - 1)
    cy = C.element_min(fy + 1, gh - 1)
    cz = C.element_min(fz_no_grad + 1, gd - 1)

    batch_idx = np.arange(bs).reshape(bs, 1, 1, 1).astype(np.float32)
    batch_idx = C.Constant(batch_idx, batch_idx.shape)
    flat_grid = C.reshape(grid, [-1])

    def flatten_and_gather(x, y, z, c_idx):
        # Linear offset into the flattened [bs, cg, gd, gh, gw] grid.
        linear_idx = x + gw * y + gw * gh * z + c_idx * gw * gh * gd + batch_idx * gw * gh * gd * cg
        flat_linear_idx = C.reshape(linear_idx, [-1])
        return C.reshape(C.gather(flat_grid, flat_linear_idx),
                         linear_idx.shape)

    out = []
    for c_ in range(co):
        # Grid channels holding the ci weights and the offset of output c_.
        c_idx = np.arange((ci + 1) * c_,
                          (ci + 1) * (c_ + 1)).reshape(1, ci + 1, 1,
                                                       1).astype(np.float32)
        c_idx = C.Constant(c_idx, c_idx.shape)

        gather_fff = flatten_and_gather(fx, fy, fz_no_grad, c_idx)
        gather_ffc = flatten_and_gather(fx, fy, cz, c_idx)
        gather_fcf = flatten_and_gather(fx, cy, fz_no_grad, c_idx)
        gather_fcc = flatten_and_gather(fx, cy, cz, c_idx)
        gather_cff = flatten_and_gather(cx, fy, fz_no_grad, c_idx)
        gather_cfc = flatten_and_gather(cx, fy, cz, c_idx)
        gather_ccf = flatten_and_gather(cx, cy, fz_no_grad, c_idx)
        gather_ccc = flatten_and_gather(cx, cy, cz, c_idx)
        # Trilinear blend of the eight corners.
        a = (gather_fff * (1 - wx) * (1 - wy) * (1 - wz) +
             gather_ffc * (1 - wx) * (1 - wy) * wz +
             gather_fcf * (1 - wx) * wy * (1 - wz) +
             gather_fcc * (1 - wx) * wy * wz +
             gather_cff * wx * (1 - wy) * (1 - wz) +
             gather_cfc * wx * (1 - wy) * wz +
             gather_ccf * wx * wy * (1 - wz) +
             gather_ccc * wx * wy * wz)
        # The first ci coefficients weight the input channels; the last one
        # is the additive offset.
        o = C.reduce_sum(a[:, :-1, ...] * im, 1) + a[:, -1, ...]
        print(o.shape)
        out.append(C.expand_dims(o, 0))
    out = C.splice(*out, axis=1)
    loss = C.reduce_l2(out)

    grid_val = np.random.rand(bs, cg, gd, gh, gw).astype(np.float32)
    if show_image:
        guide_val = skio.imread("/data/rgb.png").mean(2)[:h, :w].astype(
            np.float32)
        guide_val = np.expand_dims(guide_val / 255.0, 0)
        im_val = np.tile(np.expand_dims(guide_val, 1), [1, 3, 1, 1])
        out_val = out.eval({
            im: im_val,
            guide: guide_val,
            guide_no_grad: guide_val,
            grid: grid_val
        })
        out_val = np.clip(np.transpose(np.squeeze(out_val), [1, 2, 0]), 0, 1)
        skio.imsave("/output/imout.png", out_val)
        # Nothing is timed in this mode, so no runtime is reported.
        return

    # Cast once up front so every feed has the same float32 dtype as the
    # other inputs instead of handing CNTK a float64 array on each call.
    im_val = np.random.randn(bs, ci, h, w).astype(np.float32)
    guide_val = np.random.rand(bs, h, w).astype(np.float32)
    feed = {
        im: im_val,
        guide: guide_val,
        guide_no_grad: guide_val,
        grid: grid_val
    }
    # Warm-up iterations, excluded from the measurement.
    for it in range(warmup_iters):
        print('burning (', it, ')')
        loss.grad(feed)
    # Timed iterations.
    start = time.time()
    for it in range(timed_iters):
        print('profiling (', it, ')')
        loss.grad(feed)
    end = time.time()
    runtime = (end - start) * 1000.0 / timed_iters
    print('Runtime:', runtime)
Exemple #8
0
def test_Min(tmpdir):
    """Run ``verify_no_input`` on a constant-only ``element_min`` model.

    The two float32 operands are ``[1, 1, 1, 1]`` and
    ``[0.5, 0.25, 0.125, 0]``; the case is registered under 'Min_0'.
    """
    lhs, rhs = (np.asarray(values, dtype=np.float32)
                for values in ([1., 1., 1., 1.], [0.5, 0.25, 0.125, 0.]))
    min_model = C.element_min(lhs, rhs)
    verify_no_input(min_model, tmpdir, 'Min_0')
Exemple #9
0
import cntk
# Small demo of CNTK's basic tensor operations on constant operands.
# Result variables are named so that they do not rebind the builtins
# ``sum``, ``pow``, ``min`` and ``max``.
print("Tensor A = [1,2,3]")
print("Tensor B = [4,5,6]\n")

print("A+B:")
added = cntk.plus([1, 2, 3], [4, 5, 6]).eval()
print("{}\n".format(added))

print("A-B:")
minus = cntk.minus([1, 2, 3], [4, 5, 6]).eval()
print("{}\n".format(minus))

# NOTE(review): the left operand here is [1, 3, 4], not the A announced
# above, and ``cntk.times`` is CNTK's tensor/matrix product rather than an
# element-wise multiply (``cntk.element_times``) -- confirm which is intended.
print("A*B:")
times = cntk.times([1, 3, 4], [4, 5, 6]).eval()
print("{}\n".format(times))

# NOTE(review): operands differ from the A and B announced above.
print("A/B:")
divide = cntk.element_divide([4, 32, 15], [2, 4, 5]).eval()
print("{}\n".format(divide))

# NOTE(review): operands differ from the A and B announced above.
print("A^B:")
powered = cntk.pow([1, 3, 4], [4, 2, 3]).eval()
print("{}\n".format(powered))

# Three operands are passed although the label only mentions A and B.
print("Min(A,B):")
smallest = cntk.element_min([1, 2, 3], [4, 5, 6], [2, 1, 0]).eval()
print("{}\n".format(smallest))

# Three operands are passed although the label only mentions A and B.
print("Max(A,B):")
largest = cntk.element_max([1, 2, 3], [4, 5, 6], [2, 9, 0]).eval()
print("{}\n".format(largest))
Exemple #10
0
    def run(self):
        """Train the actor and the critic until ``EPISODES`` is reached.

        Each pass of the outer loop:

        1. fetches a batch from ``self.get_batch()`` and keeps at most
           ``BUFFER_SIZE`` entries of every array,
        2. trains ``self.actor`` for ``EPOCHS`` epochs on a clipped-ratio
           surrogate loss with an entropy term,
        3. trains ``self.critic`` for ``EPOCHS`` epochs on the mean squared
           error against ``reward``,
        4. logs the epoch-averaged losses to ``self.writer`` and increments
           ``self.gradient_steps``.

        The loss graphs and their trainers are rebuilt on every pass, so the
        Adam state does not carry over between passes.
        """
        while self.episode < EPISODES:
            obs, action, pred, reward = self.get_batch()
            obs = obs[:BUFFER_SIZE]
            action = action[:BUFFER_SIZE]
            pred = pred[:BUFFER_SIZE]
            reward = reward[:BUFFER_SIZE]
            old_prediction = pred

            # ---- actor training ----
            pred_values = self.critic.eval({self.critic.arguments[0]: obs})
            advantage = reward - pred_values

            c_action = C.input_variable(action.shape[-1], name='action')
            c_prediction = C.input_variable(old_prediction.shape[-1],
                                            name='old_prediction')
            c_advantage = C.input_variable(1, name='advantage')
            # Observation input of the actor network (its first argument, the
            # same convention used for the critic above).
            actor_obs = self.actor.arguments[0]

            # Probability of the taken action under the current and the old
            # policy; the epsilon guards the division.
            prob = C.reduce_sum(c_action * self.actor)
            old_prob = C.reduce_sum(c_action * c_prediction)
            ratio = prob / (old_prob + 1e-10)
            surr1 = c_advantage * ratio
            surr2 = c_advantage * C.clip(ratio, 1 - LOSS_CLIPPING,
                                         1 + LOSS_CLIPPING)
            neglog_loss = -C.element_min(surr1, surr2)
            entropy_loss = -ENTROPY_LOSS * -(prob * C.log(prob + 1e-10))
            actor_loss = C.reduce_mean(neglog_loss + entropy_loss)

            # NOTE(review): the momentum argument is built with
            # learning_parameter_schedule_per_sample rather than
            # momentum_schedule_per_sample -- confirm this is intended.
            trainer = C.Trainer(
                actor_loss, (actor_loss, None),
                C.adam(actor_loss.parameters,
                       C.learning_parameter_schedule_per_sample(LR),
                       C.learning_parameter_schedule_per_sample(0.99)))

            avg = 0
            avg_out = {neglog_loss.output: 0, entropy_loss.output: 0}
            for epoch in range(EPOCHS):
                data_size = action.shape[0]
                # A fresh permutation of the whole batch for every epoch.
                shuffle_idx = random.sample(list(range(data_size)), data_size)

                # Bind every input explicitly instead of relying on the
                # traversal order of ``actor_loss.arguments``.
                actor_feed = {
                    c_advantage: advantage[shuffle_idx],
                    c_action: action[shuffle_idx],
                    actor_obs: obs[shuffle_idx],
                    c_prediction: old_prediction[shuffle_idx],
                }
                _, out = trainer.train_minibatch(
                    actor_feed,
                    outputs=[neglog_loss.output, entropy_loss.output])
                avg += trainer.previous_minibatch_loss_average
                avg_out[neglog_loss.output] += out[neglog_loss.output].mean()
                avg_out[entropy_loss.output] += out[entropy_loss.output].mean()

            self.writer.add_scalar('Actor loss', avg / EPOCHS,
                                   self.gradient_steps)
            self.writer.add_scalar('neglog loss',
                                   avg_out[neglog_loss.output] / EPOCHS,
                                   self.gradient_steps)
            self.writer.add_scalar('entropy loss',
                                   avg_out[entropy_loss.output] / EPOCHS,
                                   self.gradient_steps)

            # ---- critic training ----
            c_reward = C.input_variable(1, name='reward')
            critic_obs = self.critic.arguments[0]
            critic_loss = C.reduce_mean(C.square(self.critic - c_reward))

            trainer = C.Trainer(
                critic_loss, (critic_loss, None),
                C.adam(critic_loss.parameters,
                       C.learning_parameter_schedule_per_sample(LR),
                       C.learning_parameter_schedule_per_sample(0.99)))

            avg = 0
            for epoch in range(EPOCHS):
                data_size = action.shape[0]
                shuffle_idx = random.sample(list(range(data_size)), data_size)

                # Explicit binding, for the same reason as in the actor loop.
                critic_feed = {
                    critic_obs: obs[shuffle_idx],
                    c_reward: reward[shuffle_idx],
                }
                trainer.train_minibatch(critic_feed)
                avg += trainer.previous_minibatch_loss_average

            self.writer.add_scalar('Critic loss', avg / EPOCHS,
                                   self.gradient_steps)

            self.gradient_steps += 1
def validate_model(test_data, model, polymath):
    """Run ``model`` over ``test_data`` and report span-prediction metrics.

    ``model.outputs`` is read as (begin logits, end logits, loss).  Per
    minibatch, the logits are fed to ``symbolic_best_span`` and the gradient
    of its score with respect to the two inputs is used as the predicted
    begin/end markers (presumably a one-hot style indicator -- confirm against
    ``symbolic_best_span``).  These are compared with the label arguments
    named ``'ab'`` and ``'ae'`` to accumulate F1, exact match, precision,
    recall, overlap and begin/end match.

    Args:
        test_data: Forwarded to ``create_mb_and_map``.
        model: CNTK function exposing the three outputs described above.
        polymath: Forwarded to ``create_mb_and_map``.

    Returns:
        The summed loss divided by the number of validated sequences.

    Raises:
        ZeroDivisionError: If the minibatch source yields no sequences.
    """
    begin_logits, end_logits, loss = (model.outputs[0], model.outputs[1],
                                      model.outputs[2])
    root = C.as_composite(loss.owner)
    mb_source, input_map = create_mb_and_map(root, test_data, polymath, randomize=False, repeat=False)
    begin_label = argument_by_name(root, 'ab')
    end_label = argument_by_name(root, 'ae')

    # Prediction inputs live on the same sequence axis as their labels.
    begin_prediction = C.sequence.input_variable(1, sequence_axis=begin_label.dynamic_axes[1], needs_gradient=True)
    end_prediction = C.sequence.input_variable(1, sequence_axis=end_label.dynamic_axes[1], needs_gradient=True)

    best_span_score = symbolic_best_span(begin_prediction, end_prediction)
    # Running sum over the sequence of (begin - previous step's end).
    predicted_span = C.layers.Recurrence(C.plus)(begin_prediction - C.sequence.past_value(end_prediction))
    true_span = C.layers.Recurrence(C.plus)(begin_label - C.sequence.past_value(end_label))
    common_span = C.element_min(predicted_span, true_span)
    begin_match = C.sequence.reduce_sum(C.element_min(begin_prediction, begin_label))
    end_match = C.sequence.reduce_sum(C.element_min(end_prediction, end_label))

    predicted_len = C.sequence.reduce_sum(predicted_span)
    true_len = C.sequence.reduce_sum(true_span)
    common_len = C.sequence.reduce_sum(common_span)
    f1 = 2 * common_len / (predicted_len + true_len)
    exact_match = C.element_min(begin_match, end_match)
    precision = common_len / predicted_len
    recall = common_len / true_len
    overlap = C.greater(common_len, 0)

    def s(x):
        # Collapse every axis so each statistic is a single batch total.
        return C.reduce_sum(x, axis=C.Axis.all_axes())

    stats = C.splice(s(f1), s(exact_match), s(precision), s(recall), s(overlap), s(begin_match), s(end_match))

    # Evaluation parameters
    minibatch_size = 20000
    num_sequences = 0

    stat_sum = 0
    loss_sum = 0

    while True:
        data = mb_source.next_minibatch(minibatch_size, input_map=input_map)
        if not data or begin_label not in data or data[begin_label].num_sequences == 0:
            break
        out = model.eval(data, outputs=[begin_logits, end_logits, loss], as_numpy=False)
        testloss = out[loss]
        # Gradient of the best-span score w.r.t. the logits; reused below as
        # the prediction inputs of the statistics graph.
        g = best_span_score.grad(
            {begin_prediction: out[begin_logits], end_prediction: out[end_logits]},
            wrt=[begin_prediction, end_prediction],
            as_numpy=False)
        other_input_map = {
            begin_prediction: g[begin_prediction],
            end_prediction: g[end_prediction],
            begin_label: data[begin_label],
            end_label: data[end_label],
        }
        stat_sum += stats.eval(other_input_map)
        loss_sum += np.sum(testloss.asarray())
        num_sequences += data[begin_label].num_sequences

    stat_avg = stat_sum / num_sequences
    loss_avg = loss_sum / num_sequences

    # All statistics use four decimals ("{:.4f}"); a bare "{:4f}" would only
    # set a minimum field width and print six decimals.
    print("Validated {} sequences, loss {:.4f}, F1 {:.4f}, EM {:.4f}, precision {:.4f}, recall {:.4f} hasOverlap {:.4f}, start_match {:.4f}, end_match {:.4f}".format(
            num_sequences,
            loss_avg,
            stat_avg[0],
            stat_avg[1],
            stat_avg[2],
            stat_avg[3],
            stat_avg[4],
            stat_avg[5],
            stat_avg[6]))

    return loss_avg
Exemple #12
0
def self_attention_layer(in_dims: int,
                         out_dims: int,
                         name='self_attention',
                         as_block: bool = False,
                         k_ph: bool = False,
                         v_ph: bool = False,
                         mask_opt: bool = False) -> C.Function:
    """Build a scaled dot-product attention layer over a sequence placeholder.

    Args:
        in_dims: Static dimension of the input placeholder.
        out_dims: Output dimension of the query/key/value projections; the
            scores are divided by its square root.
        name: Prefix for the names of the created nodes.
        as_block: If true, wrap the result with ``C.as_block``.
        k_ph: Must equal ``v_ph``.  When both are ``False`` keys and values
            are dense projections of the input; when both are ``True`` they
            are placeholders on a ``'kv_seq'`` dynamic axis.
        v_ph: See ``k_ph``.
        mask_opt: If true, combine the scores with a mask derived from
            ``triangular_matrix_seq(2)`` before the softmax.

    Returns:
        The attention output converted back to a sequence shaped like the
        input placeholder (optionally wrapped as a block).

    Raises:
        Exception: If ``k_ph`` and ``v_ph`` are not both ``True`` or both
            ``False``.
    """
    scale = C.Constant(C.sqrt(out_dims).eval(), name='sq_dims')

    X = C.placeholder(
        in_dims, (C.Axis.default_batch_axis(), C.Axis.default_dynamic_axis()),
        name=name + '_ph')

    projected_kv = k_ph is False and v_ph is False
    external_kv = k_ph is True and v_ph is True
    if not (projected_kv or external_kv):
        raise Exception(f'k_ph:{k_ph}, v_ph:{v_ph}')

    def kv_placeholder(suffix):
        # Keys/values supplied by the caller live on their own sequence axis.
        axes = (C.Axis.default_batch_axis(), C.Axis('kv_seq'))
        return C.placeholder(out_dims, axes, name=name + suffix)

    # The query is always a dense projection of the input.
    q = C.layers.Dense(out_dims, name=name + '_q')(X)
    if projected_kv:
        k = C.layers.Dense(out_dims, name=name + '_k')(X)
        v = C.layers.Dense(out_dims, name=name + '_v')(X)
    else:
        k = kv_placeholder('_k_ph')
        v = kv_placeholder('_v_ph')

    # Unpack each sequence into a padded tensor (padding value 0, no mask
    # output) so whole-sequence matrix products can be taken.
    q_, k_, v_ = (C.sequence.unpack(seq, 0, True, name=name + tag)
                  for seq, tag in ((q, '_unpack_q'), (k, '_unpack_k'),
                                   (v, '_unpack_v')))

    scaled = C.times_transpose(q_, k_, name=name + '_score_matrix') / scale

    if mask_opt:
        # (mask - 0.5) * -inf is -inf where mask > 0.5 and +inf where
        # mask < 0.5, so the element-wise minimum keeps the score in the
        # latter case and replaces it with -inf in the former.
        tri = triangular_matrix_seq(2)(X)
        inf_mask = C.as_block(-np.inf * (tri - 0.5), [(X, X)], 'mask', 'mask')
        scaled = C.element_min(scaled, inf_mask)

    weights = C.softmax(scaled, name=name + '_softmax')
    attended = C.times(weights, v_, name=name + '_attention')
    result = C.to_sequence_like(attended, X)

    if not as_block:
        return result

    # External key/value placeholders must be exposed as block arguments too.
    block_args = [(X, X)] if projected_kv else [(X, X), (k, k), (v, v)]
    return C.as_block(result, block_args, 'self_attention', 'self_attention_')
Exemple #13
0
def gpt2_self_attention(token_dims: int,
                        head_dims: int,
                        mask_opt: bool = False,
                        as_block: bool = False,
                        name: str = 'self_attention'):
    """Build a multi-head masked self-attention layer over a placeholder.

    A single dense layer produces stacked query/key/value projections, each
    of which is reshaped to ``(head_dims, -1)`` and split along the first
    axis.  Every head computes masked, scaled dot-product attention; the head
    outputs are spliced together and projected back to ``token_dims``.

    Args:
        token_dims: Static dimension of the input and of the final projection.
        head_dims: Number of slices the projections are split into (it is
            used as the head count here).
        mask_opt: Accepted but not consulted; the mask derived from
            ``triangular_matrix_seq(2)`` is always applied.
        as_block: If true, wrap the result with ``C.as_block``.
        name: Name given to the input placeholder.

    Returns:
        The projected attention output, optionally wrapped as a block.
    """
    X = C.placeholder(token_dims,
                      dynamic_axes=(C.Axis.default_batch_axis(),
                                    C.Axis.default_dynamic_axis()),
                      name=name)

    # One dense layer yields q, k and v stacked along the first static axis.
    qkv = C.layers.Dense((3, token_dims), name='qkv')(X)
    q_seq, k_seq, v_seq = qkv[0], qkv[1], qkv[2]

    per_head_shape = (head_dims, -1)
    q_mh = C.reshape(q_seq, per_head_shape, name='multi_head_q')
    k_mh = C.reshape(k_seq, per_head_shape, name='multi_head_k')
    v_mh = C.reshape(v_seq, per_head_shape, name='multi_head_v')

    def split_heads(stacked):
        # All three projections share the same node-name prefix.
        return [
            C.squeeze(stacked[i], name='simgle_head_q' + str(i))
            for i in range(head_dims)
        ]

    q_heads = split_heads(q_mh)
    k_heads = split_heads(k_mh)
    v_heads = split_heads(v_mh)

    attention_head = []
    for i, (q, k, v) in enumerate(zip(q_heads, k_heads, v_heads)):
        tag = str(i)

        # Unpack each per-head sequence into a padded tensor (padding value
        # 0, no mask output).
        q_ = C.sequence.unpack(q, 0, True, name='seq_q' + tag)
        k_ = C.sequence.unpack(k, 0, True, name='seq_k' + tag)
        v_ = C.sequence.unpack(v, 0, True, name='seq_v' + tag)

        # Scores scaled by 1 / sqrt(last dimension of the unpacked values).
        scaled = C.times_transpose(q_, k_) * (1 / C.sqrt(v_.shape[-1]))

        # (mask - 0.5) * -inf is -inf where mask > 0.5 and +inf where
        # mask < 0.5; the element-wise minimum therefore either keeps the
        # score or replaces it with -inf.
        tri = triangular_matrix_seq(2)(X)
        inf_mask = C.as_block(-np.inf * (tri - 0.5), [(X, X)], 'mask', 'mask')
        scaled = C.element_min(scaled, inf_mask)

        weights = C.softmax(scaled)

        # Weighted sum of the values, converted back to a sequence like X.
        head_out = C.to_sequence_like(C.times(weights, v_), X)
        attention_head.append(head_out)

    # Merge the heads and project back to the token dimension.
    merged = C.splice(*attention_head, name='merged_attention')
    project = C.layers.Dense(token_dims, name='project')(merged)

    if not as_block:
        return project

    return C.as_block(project, [(X, X)], 'gpt2_self_attention',
                      'gpt2_self_attention')