Ejemplo n.º 1
0
Archivo: model.py Proyecto: yw774/neon
    def allocate_deltas(self):
        if getattr(self, 'global_deltas', None) is None:
            self.global_deltas = DeltasTree()
            self.layers.allocate_deltas(self.global_deltas)

            # allocate the buffers now that all the
            # nesting and max sizes have been determined
            self.global_deltas.allocate_buffers(self.be)

        # set the deltas
        self.layers.set_deltas(self.global_deltas)
Ejemplo n.º 2
0
def check_rnn(seq_len,
              input_size,
              hidden_size,
              batch_size,
              init_func,
              inp_moms=[0.0, 1.0]):
    # init_func is the initializer for the model params
    # inp_moms is the [ mean, std dev] of the random input
    input_shape = (input_size, seq_len * batch_size)
    output_shape = (hidden_size, seq_len * batch_size)
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    # ======== create models ========
    # neon RNN
    rnn = Recurrent(hidden_size, init_func, activation=Tanh())

    # reference numpy RNN
    rnn_ref = RefRecurrent(input_size, hidden_size)
    Wxh = rnn_ref.Wxh
    Whh = rnn_ref.Whh
    bh = rnn_ref.bh

    # ========= generate data =================
    # generate random input tensor
    inp = np.random.rand(*input_shape) * inp_moms[1] + inp_moms[0]
    inpa = rnn.be.array(inp)
    # generate random deltas tensor
    deltas = np.random.randn(*output_shape)

    # the reference code expects these shapes:
    # input_shape: (seq_len, input_size, batch_size)
    # output_shape: (seq_len, hidden_size, batch_size)
    inp_ref = inp.copy().T.reshape(seq_len, batch_size,
                                   input_size).swapaxes(1, 2)
    deltas_ref = deltas.copy().T.reshape(seq_len, batch_size,
                                         hidden_size).swapaxes(1, 2)

    # ========= running models ==========
    # run neon fprop
    rnn.configure((input_size, seq_len))
    rnn.prev_layer = True
    rnn.allocate()

    dtree = DeltasTree()
    rnn.allocate_deltas(dtree)
    dtree.allocate_buffers()
    rnn.set_deltas(dtree)

    rnn.fprop(inpa)

    # weights are only initialized after doing fprop, so now
    # make ref weights and biases the same with neon model
    Wxh[:] = rnn.W_input.get()
    Whh[:] = rnn.W_recur.get()
    bh[:] = rnn.b.get()

    (dWxh_ref, dWhh_ref, db_ref, h_ref_list, dh_ref_list,
     d_out_ref) = rnn_ref.lossFun(inp_ref, deltas_ref)

    # now test the bprop
    rnn.bprop(rnn.be.array(deltas))
    # grab the delta W from gradient buffer
    dWxh_neon = rnn.dW_input.get()
    dWhh_neon = rnn.dW_recur.get()
    db_neon = rnn.db.get()

    # comparing outputs
    neon_logger.display('====Verifying hidden states====')
    assert allclose_with_out(rnn.outputs.get(),
                             h_ref_list,
                             rtol=0.0,
                             atol=1.0e-5)

    neon_logger.display('fprop is verified')

    neon_logger.display('====Verifying update on W and b ====')
    neon_logger.display('dWxh')
    assert allclose_with_out(dWxh_neon, dWxh_ref, rtol=0.0, atol=1.0e-5)
    neon_logger.display('dWhh')
    assert allclose_with_out(dWhh_neon, dWhh_ref, rtol=0.0, atol=1.0e-5)

    neon_logger.display('====Verifying update on bias====')
    neon_logger.display('db')
    assert allclose_with_out(db_neon, db_ref, rtol=0.0, atol=1.0e-5)

    neon_logger.display('bprop is verified')

    return
Ejemplo n.º 3
0
def gradient_calc(seq_len,
                  input_size,
                  hidden_size,
                  batch_size,
                  epsilon=None,
                  rand_scale=None,
                  inp_bl=None):
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    input_shape = (input_size, seq_len * batch_size)

    # generate input if one is not given
    if inp_bl is None:
        inp_bl = np.random.randn(*input_shape)

    # neon rnn instance
    rnn = Recurrent(hidden_size, Gaussian(), activation=Tanh())
    inpa = rnn.be.array(np.copy(inp_bl))

    # run fprop on the baseline input
    rnn.configure((input_size, seq_len))
    rnn.prev_layer = True
    rnn.allocate()

    dtree = DeltasTree()
    rnn.allocate_deltas(dtree)
    dtree.allocate_buffers()
    rnn.set_deltas(dtree)

    out_bl = rnn.fprop(inpa).get()

    # random scaling/hash to generate fake loss
    if rand_scale is None:
        rand_scale = np.random.random(out_bl.shape) * 2.0 - 1.0
    # loss function would be:
    # loss_bl = np.sum(rand_scale * out_bl)

    # run back prop with rand_scale as the errors
    # use copy to avoid any interactions
    deltas_neon = rnn.bprop(rnn.be.array(np.copy(rand_scale))).get()

    # add a perturbation to each input element
    grads_est = np.zeros(inpa.shape)
    inp_pert = inp_bl.copy()
    for pert_ind in range(inpa.size):
        save_val = inp_pert.flat[pert_ind]

        inp_pert.flat[pert_ind] = save_val + epsilon
        reset_rnn(rnn)
        rnn.allocate()
        out_pos = rnn.fprop(rnn.be.array(inp_pert)).get()

        inp_pert.flat[pert_ind] = save_val - epsilon
        reset_rnn(rnn)
        rnn.allocate()
        out_neg = rnn.fprop(rnn.be.array(inp_pert)).get()

        # calculate the loss with perturbations
        loss_pos = np.sum(rand_scale * out_pos)
        loss_neg = np.sum(rand_scale * out_neg)
        # compute the gradient estimate
        grad = 0.5 * (loss_pos - loss_neg) / epsilon

        grads_est.flat[pert_ind] = grad

        # reset the perturbed input element
        inp_pert.flat[pert_ind] = save_val

    del rnn
    return (grads_est, deltas_neon)
Ejemplo n.º 4
0
def deltas_buffer_wref():
    # returns 2 empty DeltasTree object for
    # tests that need to allocate shared
    # deltas buffers for 2 models
    # (one test, one reference)
    return (DeltasTree(), DeltasTree())
Ejemplo n.º 5
0
def deltas_buffer():
    # empty DeltasTree object for tests that need
    # to allocate shared deltas buffers
    return DeltasTree()
Ejemplo n.º 6
0
def gradient_calc(seq_len,
                  input_size,
                  hidden_size,
                  batch_size,
                  add_init_state=False,
                  epsilon=None,
                  rand_scale=None,
                  inp_bl=None):
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    input_shape = (input_size, seq_len * batch_size)

    # generate input if one is not given
    if inp_bl is None:
        inp_bl = np.random.randn(*input_shape)

    # neon gru instance
    gru = GRU(hidden_size,
              init=Gaussian(),
              activation=Tanh(),
              gate_activation=Logistic())
    inpa = gru.be.array(np.copy(inp_bl))

    # run fprop on the baseline input
    gru.configure((input_size, seq_len))
    gru.prev_layer = True
    gru.allocate()

    test_buffer = DeltasTree()
    gru.allocate_deltas(test_buffer)
    test_buffer.allocate_buffers()
    gru.set_deltas(test_buffer)

    if add_init_state is True:
        slice_shape = (hidden_size, batch_size)
        ini_s = np.random.randn(*slice_shape)
        ini_s_dev = gru.be.array(ini_s.copy())
        out_bl = gru.fprop(inpa, ini_s_dev).get()
    else:
        out_bl = gru.fprop(inpa).get()

    # random scaling/hash to generate fake loss
    if rand_scale is None:
        rand_scale = np.random.random(out_bl.shape) * 2.0 - 1.0
    # loss function would be:
    # loss_bl = np.sum(rand_scale * out_bl)

    # run back prop with rand_scale as the errors
    # use copy to avoid any interactions
    deltas_neon = gru.bprop(gru.be.array(np.copy(rand_scale))).get()

    # add a perturbation to each input element
    grads_est = np.zeros(inpa.shape)
    inp_pert = inp_bl.copy()
    for pert_ind in range(inpa.size):
        save_val = inp_pert.flat[pert_ind]

        inp_pert.flat[pert_ind] = save_val + epsilon
        reset_gru(gru)
        gru.allocate()
        if add_init_state is True:
            ini_s_dev = gru.be.array(ini_s.copy())
            out_pos = gru.fprop(gru.be.array(inp_pert), ini_s_dev).get()
        else:
            out_pos = gru.fprop(gru.be.array(inp_pert)).get()

        inp_pert.flat[pert_ind] = save_val - epsilon
        reset_gru(gru)
        gru.allocate()
        if add_init_state is True:
            ini_s_dev = gru.be.array(ini_s.copy())
            out_neg = gru.fprop(gru.be.array(inp_pert), ini_s_dev).get()
        else:
            out_neg = gru.fprop(gru.be.array(inp_pert)).get()

        # calculate the loss with perturbations
        loss_pos = np.sum(rand_scale * out_pos)
        loss_neg = np.sum(rand_scale * out_neg)
        # compute the gradient estimate
        grad = 0.5 / float(epsilon) * (loss_pos - loss_neg)

        grads_est.flat[pert_ind] = grad

        # reset the perturbed input element
        inp_pert.flat[pert_ind] = save_val

    del gru
    return (grads_est, deltas_neon)
Ejemplo n.º 7
0
def check_gru(seq_len,
              input_size,
              hidden_size,
              batch_size,
              init_func,
              inp_moms=[0.0, 1.0],
              add_init_state=False):
    # init_func is the initializer for the model params
    # inp_moms is the [ mean, std dev] of the random input
    input_shape = (input_size, seq_len * batch_size)
    output_shape = (hidden_size, seq_len * batch_size)
    slice_shape = (hidden_size, batch_size)

    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    # neon GRU
    gru = GRU(hidden_size,
              init_func,
              activation=Tanh(),
              gate_activation=Logistic())

    # generate random input tensor
    inp = np.random.rand(*input_shape) * inp_moms[1] + inp_moms[0]
    inp_dev = gru.be.array(inp)
    # generate random deltas tensor
    deltas = np.random.randn(*output_shape)

    # run neon fprop
    gru.configure((input_size, seq_len))
    gru.prev_layer = True
    gru.allocate()

    test_buffer = DeltasTree()
    gru.allocate_deltas(test_buffer)
    test_buffer.allocate_buffers()
    gru.set_deltas(test_buffer)

    if add_init_state:
        init_state = np.random.rand(*slice_shape) * inp_moms[1] + inp_moms[0]
        init_state_dev = gru.be.array(init_state)
        gru.fprop(inp_dev, init_state=init_state_dev)
    else:
        gru.fprop(inp_dev)

    # reference numpy GRU
    gru_ref = RefGRU(input_size, hidden_size)
    WGRU = gru_ref.weights

    # make ref weights and biases the same with neon model
    r_range = list(range(hidden_size))
    z_range = list(range(hidden_size, hidden_size * 2))
    c_range = list(range(hidden_size * 2, hidden_size * 3))

    WGRU[gru_ref.weights_ind_br][:] = gru.b.get()[r_range]
    WGRU[gru_ref.weights_ind_bz][:] = gru.b.get()[z_range]
    WGRU[gru_ref.weights_ind_bc][:] = gru.b.get()[c_range]

    WGRU[gru_ref.weights_ind_Wxr][:] = gru.W_input.get()[r_range]
    WGRU[gru_ref.weights_ind_Wxz][:] = gru.W_input.get()[z_range]
    WGRU[gru_ref.weights_ind_Wxc][:] = gru.W_input.get()[c_range]

    WGRU[gru_ref.weights_ind_Rhr][:] = gru.W_recur.get()[r_range]
    WGRU[gru_ref.weights_ind_Rhz][:] = gru.W_recur.get()[z_range]
    WGRU[gru_ref.weights_ind_Rhc][:] = gru.W_recur.get()[c_range]

    # transpose input X and do fprop
    # the reference code expects these shapes:
    # input_shape: (seq_len, input_size, batch_size)
    # output_shape: (seq_len, hidden_size, batch_size)
    inp_ref = inp.copy().T.reshape(seq_len, batch_size,
                                   input_size).swapaxes(1, 2)
    deltas_ref = deltas.copy().T.reshape(seq_len, batch_size,
                                         hidden_size).swapaxes(1, 2)

    if add_init_state:
        init_state_ref = init_state.copy()
        (dWGRU_ref, h_ref_list, dh_ref_list, dr_ref_list, dz_ref_list,
         dc_ref_list) = gru_ref.lossFun(inp_ref, deltas_ref, init_state_ref)
    else:
        (dWGRU_ref, h_ref_list, dh_ref_list, dr_ref_list, dz_ref_list,
         dc_ref_list) = gru_ref.lossFun(inp_ref, deltas_ref)

    neon_logger.display('====Verifying hidden states====')
    assert allclose_with_out(gru.outputs.get(),
                             h_ref_list,
                             rtol=0.0,
                             atol=1.0e-5)

    neon_logger.display('fprop is verified')

    # now test the bprop
    neon_logger.display('Making sure neon GRU matches numpy GRU in bprop')
    gru.bprop(gru.be.array(deltas))
    # grab the delta W from gradient buffer
    dWinput_neon = gru.dW_input.get()
    dWrecur_neon = gru.dW_recur.get()
    db_neon = gru.db.get()
    dWxr_neon = dWinput_neon[r_range]
    dWxz_neon = dWinput_neon[z_range]
    dWxc_neon = dWinput_neon[c_range]
    dWrr_neon = dWrecur_neon[r_range]
    dWrz_neon = dWrecur_neon[z_range]
    dWrc_neon = dWrecur_neon[c_range]
    dbr_neon = db_neon[r_range]
    dbz_neon = db_neon[z_range]
    dbc_neon = db_neon[c_range]

    drzc_neon = gru.rzhcan_delta_buffer.get()
    dr_neon = drzc_neon[r_range]
    dz_neon = drzc_neon[z_range]
    dc_neon = drzc_neon[c_range]

    dWxr_ref = dWGRU_ref[gru_ref.dW_ind_Wxr]
    dWxz_ref = dWGRU_ref[gru_ref.dW_ind_Wxz]
    dWxc_ref = dWGRU_ref[gru_ref.dW_ind_Wxc]
    dWrr_ref = dWGRU_ref[gru_ref.dW_ind_Rhr]
    dWrz_ref = dWGRU_ref[gru_ref.dW_ind_Rhz]
    dWrc_ref = dWGRU_ref[gru_ref.dW_ind_Rhc]
    dbr_ref = dWGRU_ref[gru_ref.dW_ind_br]
    dbz_ref = dWGRU_ref[gru_ref.dW_ind_bz]
    dbc_ref = dWGRU_ref[gru_ref.dW_ind_bc]

    # neon_logger.display '====Verifying hidden deltas ===='
    neon_logger.display('====Verifying r deltas ====')
    assert allclose_with_out(dr_neon, dr_ref_list, rtol=0.0, atol=1.0e-5)

    neon_logger.display('====Verifying z deltas ====')
    assert allclose_with_out(dz_neon, dz_ref_list, rtol=0.0, atol=1.0e-5)

    neon_logger.display('====Verifying hcan deltas ====')
    assert allclose_with_out(dc_neon, dc_ref_list, rtol=0.0, atol=1.0e-5)

    neon_logger.display('====Verifying update on W_input====')
    neon_logger.display('dWxr')
    assert allclose_with_out(dWxr_neon, dWxr_ref, rtol=0.0, atol=1.0e-5)
    neon_logger.display('dWxz')
    assert allclose_with_out(dWxz_neon, dWxz_ref, rtol=0.0, atol=1.0e-5)
    neon_logger.display('dWxc')
    assert allclose_with_out(dWxc_neon, dWxc_ref, rtol=0.0, atol=1.0e-5)

    neon_logger.display('====Verifying update on W_recur====')

    neon_logger.display('dWrr')
    assert allclose_with_out(dWrr_neon, dWrr_ref, rtol=0.0, atol=1.0e-5)
    neon_logger.display('dWrz')
    assert allclose_with_out(dWrz_neon, dWrz_ref, rtol=0.0, atol=1.0e-5)
    neon_logger.display('dWrc')
    assert allclose_with_out(dWrc_neon, dWrc_ref, rtol=0.0, atol=1.0e-5)

    neon_logger.display('====Verifying update on bias====')
    neon_logger.display('dbr')
    assert allclose_with_out(dbr_neon, dbr_ref, rtol=0.0, atol=1.0e-5)
    neon_logger.display('dbz')
    assert allclose_with_out(dbz_neon, dbz_ref, rtol=0.0, atol=1.0e-5)
    neon_logger.display('dbc')
    assert allclose_with_out(dbc_neon, dbc_ref, rtol=0.0, atol=1.0e-5)

    neon_logger.display('bprop is verified')

    return
Ejemplo n.º 8
0
def test_branch_model(backend_gpu):
    np.random.seed(0)
    be = NervanaObject.be
    be.bsz = 64
    main1 = main_branch()
    i1 = inception([(32,), (32, 32), ('max', 16)])
    top = top_branch()
    neon_layer = Sequential(main1 + i1 + top)

    inshape = (4, 224, 224)
    insize = np.prod(inshape)
    inpa = np.random.random((insize, batch_size))
    neon_layer.configure(inshape)
    inp = neon_layer.be.array(inpa)
    neon_layer.allocate()
    neon_logger.display(neon_layer.nested_str())
    neon_layer.layers[0].prev_layer = True

    neon_layer.allocate_deltas()

    neon_out = neon_layer.fprop(inp).get()

    # Now make the reference pathways:
    main_trunk2 = Sequential(main_branch())
    main_trunk2.configure(inshape)
    main2 = main_trunk2.layers
    main2[0].prev_layer = True
    main2[0].deltas = be.iobuf(inshape)
    (b1, b2, b3) = inception_bare(i1, [(32,), (32, 32), ('max', 16)])

    for bb in (b1, b2, b3):
        oshape = inshape
        for ll in main2 + bb:
            oshape = ll.configure(oshape)

    main1_trunk = neon_layer.layers[:8]
    for ll, lo in zip(main2, main1_trunk):
        if ll.has_params:
            ll.set_params({'params': {'W': lo.W.get()}})
        ll.allocate()

        temp_buff = DeltasTree()
        ll.allocate_deltas(temp_buff)
        temp_buff.allocate_buffers()
        ll.set_deltas(temp_buff)

    for bb in (b1, b2, b3):
        for ll in bb:
            ll.allocate()
            temp_buff = DeltasTree()
            ll.allocate_deltas(temp_buff)
            temp_buff.allocate_buffers()
            ll.set_deltas(temp_buff)

    # Create the combined output buffer
    merge_output = be.empty_like(neon_layer.layers[8].outputs)

    x = inp
    for ll in main2:
        x = ll.fprop(x)

    start = 0
    for bb in (b1, b2, b3):
        xb = x
        for ll in bb:
            xb = ll.fprop(xb)
        end = start + xb.shape[0]
        merge_output[start:end] = xb
        start = end

    x = merge_output

    top_trunk = Sequential(top).layers
    for ll in top_trunk:
        x = ll.fprop(x)

    neon_out_ref = x.get()
    assert allclose_with_out(neon_out, neon_out_ref, rtol=0)

    neon_logger.display("Beginning Back prop")
    erra = np.random.random(neon_out.shape)
    err = be.array(erra)
    for ll in reversed(neon_layer.layers[8:]):
        err = ll.bprop(err)

    neon_deltas = err.get()
    for bb, errb in zip((b1, b2, b3), neon_layer.layers[8].error_views):
        for ll in reversed(bb):
            errb = ll.bprop(errb)

    # Now sum up the deltas at the root of the branch layer and compare
    ref_deltas = be.zeros_like(b1[0].deltas)
    ref_deltas[:] = b3[0].deltas + b2[0].deltas + b1[0].deltas

    neon_ref_deltas = ref_deltas.get()

    assert allclose_with_out(neon_deltas, neon_ref_deltas, rtol=0)
Ejemplo n.º 9
0
def test_branch_model_fork_cpu(backend_cpu64):
    from neon.layers import BranchNode, Tree
    np.random.seed(0)
    be = NervanaObject.be
    be.bsz = 32
    bnode = BranchNode()
    i1 = inception([(32,), (32, 32), ('max', 16)])
    top1 = top_branch()
    top2 = top_branch()
    p1 = Sequential(main_branch() + [bnode, i1] + top1)
    p2 = [bnode] + top2

    alpha2 = 0.3
    neon_layer = Tree([p1, p2], alphas=[1.0, alpha2])

    inshape = (4, 224, 224)
    insize = np.prod(inshape)
    inpa = np.random.random((insize, batch_size))
    neon_layer.configure(inshape)
    inp = neon_layer.be.array(inpa)

    neon_layer.allocate()

    neon_layer.layers[0].layers[0].prev_layer = True
    neon_layer.allocate_deltas()

    neon_out_dev = neon_layer.fprop(inp)
    neon_out = [d.get() for d in neon_out_dev]

    # Now make the reference pathways:
    main_trunk2 = Sequential(main_branch())
    main_trunk2.configure(inshape)
    main2 = main_trunk2.layers
    main2[0].prev_layer = True
    main2[0].deltas = be.iobuf(inshape)

    branch2 = Sequential(top_branch())
    lbranch2 = branch2.layers
    (b1, b2, b3) = inception_bare(i1, [(32,), (32, 32), ('max', 16)])

    for bb in (b1, b2, b3, lbranch2):
        oshape = inshape
        for ll in main2 + bb:
            oshape = ll.configure(oshape)

    main1_trunk = neon_layer.layers[0].layers[:8]
    for ll, lo in zip(main2, main1_trunk):
        if ll.has_params:
            ll.set_params({'params': {'W': lo.W.get()}})
        ll.allocate()
        temp_deltas = DeltasTree()
        temp_deltas.proc_layer(ll)
        temp_deltas.allocate_buffers()
        ll.set_deltas(temp_deltas)

    for ll, lo in zip(lbranch2, neon_layer.layers[1].layers[1:]):
        if ll.has_params:
            ll.set_params({'params': {'W': lo.W.get()}})

    for bb in (b1, b2, b3, lbranch2):
        for ll in bb:
            ll.allocate()
            temp_deltas = DeltasTree()
            temp_deltas.proc_layer(ll)
            temp_deltas.allocate_buffers()
            ll.set_deltas(temp_deltas)

    # Create the combined output buffer
    merge_output = be.empty_like(neon_layer.layers[0].layers[9].outputs)

    x = inp
    for ll in main2:
        x = ll.fprop(x)
    main2_out = x

    start = 0
    for bb in (b1, b2, b3):
        xb = main2_out
        for ll in bb:
            xb = ll.fprop(xb)
        end = start + xb.shape[0]
        merge_output[start:end] = xb
        start = end

    x = merge_output

    top_trunk = Sequential(top1).layers
    for ll in top_trunk:
        x = ll.fprop(x)

    neon_out_ref = x.get()
    assert allclose_with_out(neon_out_ref, neon_out[0], rtol=0)

    # Now do second branch
    neon_out_ref2 = branch2.fprop(main2_out).get()
    assert allclose_with_out(neon_out_ref2, neon_out[1])

    neon_logger.display("Beginning Back prop")
    erra = [np.random.random(d.shape) for d in neon_out]
    err = [be.array(d) for d in erra]
    neon_layer.layers[0].layers[0].deltas = be.iobuf(inshape)
    neon_layer.bprop(err)

    bottom_neon_deltas = neon_layer.layers[0].layers[1].deltas.get()
    middle_neon_deltas = neon_layer.layers[1].layers[1].deltas.get()

    err0 = err[0]
    for ll in reversed(top_trunk):
        err0 = ll.bprop(err0)

    err1 = err[1]
    for ll in reversed(lbranch2):
        err1 = ll.bprop(err1)

    for bb, errb in zip((b1, b2, b3), neon_layer.layers[0].layers[-5].error_views):
        for ll in reversed(bb):
            errb = ll.bprop(errb)

    # Now sum up the deltas at the root of the branch layer and compare
    ref_deltas = be.zeros_like(b1[0].deltas)
    ref_deltas[:] = alpha2 * lbranch2[0].deltas
    ref_deltas[:] = ref_deltas + b3[0].deltas + b2[0].deltas + b1[0].deltas
    neon_ref_deltas = ref_deltas.get()
    assert allclose_with_out(middle_neon_deltas, neon_ref_deltas, rtol=0)

    x = ref_deltas
    main2[0].deltas = be.iobuf(inshape)

    for ll in reversed(main2):
        x = ll.bprop(x)

    bottom_neon_ref_deltas = main2[1].deltas.get()
    assert allclose_with_out(bottom_neon_deltas, bottom_neon_ref_deltas, rtol=0)
Ejemplo n.º 10
0
def general_gradient_comp(layer,
                          inp,
                          epsilon=1.0e-5,
                          loss_scale=None,
                          lshape=None,
                          pert_inds=None,
                          pooling=False):
    # given a layer, test the bprop
    # using finite differences

    # run neon fprop
    layer.reset()
    inpa = layer.be.array(inp.copy())
    in_shape = lshape if lshape is not None else inpa.shape[0]
    layer.configure(in_shape)
    if layer.owns_delta:
        layer.prev_layer = True
    layer.allocate()

    dtree = DeltasTree()
    layer.allocate_deltas(dtree)
    dtree.allocate_buffers()
    layer.set_deltas(dtree)

    out = layer.fprop(inpa).get()

    out_shape = out.shape

    # scale out by random matrix...
    if loss_scale is None:
        loss_scale = np.random.random(out_shape) * 2.0 - 1.0

    # the loss function is:
    # loss_bl = np.sum(loss_scale * out)

    # run bprop, input deltas is rand_scale
    bprop_deltas = layer.bprop(layer.be.array(loss_scale.copy())).get()

    max_abs_err = -1.0
    max_rel_err = -1.0

    inp_pert = inp.copy()
    if pert_inds is None:
        pert_inds = list(range(inp.size))
    for pert_ind in pert_inds:
        save_val = inp_pert.flat[pert_ind]
        # add/subtract perturbations to input
        inp_pert.flat[pert_ind] = save_val + epsilon
        # and run fprop on perturbed input
        layer.reset()
        layer.configure(in_shape)
        layer.allocate()
        inpa = layer.be.array(inp_pert.copy())
        out_pos = layer.fprop(inpa).get().copy()

        inp_pert.flat[pert_ind] = save_val - epsilon
        inpa = layer.be.array(inp_pert.copy())
        layer.reset()
        layer.configure(in_shape)
        layer.allocate()
        out_neg = layer.fprop(inpa).get().copy()

        # calculate the loss on outputs
        loss_pos = np.sum(loss_scale * out_pos)
        loss_neg = np.sum(loss_scale * out_neg)
        grad_est = 0.5 * (loss_pos - loss_neg) / epsilon

        # reset input
        inp_pert.flat[pert_ind] = save_val

        bprop_val = bprop_deltas.flat[pert_ind]

        abs_err = abs(grad_est - bprop_val)
        if abs_err > max_abs_err:
            max_abs_err = abs_err
            max_abs_vals = [grad_est, bprop_val]

        if (abs(grad_est) + abs(bprop_val)) == 0.0:
            rel_err = 0.0
        else:
            rel_err = float(abs_err) / (abs(grad_est) + abs(bprop_val))
        if rel_err > max_rel_err:
            max_rel_err = rel_err
            max_rel_vals = [grad_est, bprop_val]

    neon_logger.display('Worst case diff %e, vals grad: %e, bprop: %e' %
                        (max_abs_err, max_abs_vals[0], max_abs_vals[1]))
    neon_logger.display('Worst case diff %e, vals grad: %e, bprop: %e' %
                        (max_rel_err, max_rel_vals[0], max_rel_vals[1]))
    return (max_abs_err, max_rel_err)
Ejemplo n.º 11
0
def check_lstm(seq_len, input_size, hidden_size,
               batch_size, init_func, inp_moms=[0.0, 1.0]):
    # init_func is the initializer for the model params
    # inp_moms is the [ mean, std dev] of the random input
    input_shape = (input_size, seq_len * batch_size)
    hidden_shape = (hidden_size, seq_len * batch_size)
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    # neon LSTM
    lstm = LSTM(hidden_size,
                init_func,
                activation=Tanh(),
                gate_activation=Logistic())

    inp = np.random.rand(*input_shape) * inp_moms[1] + inp_moms[0]
    inpa = lstm.be.array(inp)
    # run neon fprop
    lstm.configure((input_size, seq_len))
    lstm.prev_layer = True  # Hack to force allocating a delta buffer
    lstm.allocate()

    dtree = DeltasTree()
    lstm.allocate_deltas(dtree)
    dtree.allocate_buffers()
    lstm.set_deltas(dtree)

    lstm.fprop(inpa)

    # reference numpy LSTM
    lstm_ref = RefLSTM()
    WLSTM = lstm_ref.init(input_size, hidden_size)

    # make ref weights and biases with neon model
    WLSTM[0, :] = lstm.b.get().T
    WLSTM[1:input_size + 1, :] = lstm.W_input.get().T
    WLSTM[input_size + 1:] = lstm.W_recur.get().T

    # transpose input X and do fprop
    inp_ref = inp.copy().T.reshape(seq_len, batch_size, input_size)
    (Hout_ref, cprev, hprev, batch_cache) = lstm_ref.forward(inp_ref,
                                                             WLSTM)

    # the output needs transpose as well
    Hout_ref = Hout_ref.reshape(seq_len * batch_size, hidden_size).T
    IFOGf_ref = batch_cache['IFOGf'].reshape(seq_len * batch_size, hidden_size * 4).T
    Ct_ref = batch_cache['Ct'].reshape(seq_len * batch_size, hidden_size).T

    # compare results
    neon_logger.display('====Verifying IFOG====')
    assert allclose_with_out(lstm.ifog_buffer.get(),
                             IFOGf_ref,
                             rtol=0.0,
                             atol=1.5e-5)

    neon_logger.display('====Verifying cell states====')
    assert allclose_with_out(lstm.c_act_buffer.get(),
                             Ct_ref,
                             rtol=0.0,
                             atol=1.5e-5)

    neon_logger.display('====Verifying hidden states====')
    assert allclose_with_out(lstm.outputs.get(),
                             Hout_ref,
                             rtol=0.0,
                             atol=1.5e-5)

    neon_logger.display('fprop is verified')

    # now test the bprop
    # generate random deltas tensor
    deltas = np.random.randn(*hidden_shape)

    lstm.bprop(lstm.be.array(deltas))
    # grab the delta W from gradient buffer
    dWinput_neon = lstm.dW_input.get()
    dWrecur_neon = lstm.dW_recur.get()
    db_neon = lstm.db.get()

    deltas_ref = deltas.copy().T.reshape(seq_len, batch_size, hidden_size)
    (dX_ref, dWLSTM_ref, dc0_ref, dh0_ref) = lstm_ref.backward(deltas_ref,
                                                               batch_cache)
    dWrecur_ref = dWLSTM_ref[-hidden_size:, :]
    dWinput_ref = dWLSTM_ref[1:input_size + 1, :]
    db_ref = dWLSTM_ref[0, :]
    dX_ref = dX_ref.reshape(seq_len * batch_size, input_size).T

    # compare results
    neon_logger.display('Making sure neon LSTM match numpy LSTM in bprop')
    neon_logger.display('====Verifying update on W_recur====')

    assert allclose_with_out(dWrecur_neon,
                             dWrecur_ref.T,
                             rtol=0.0,
                             atol=1.5e-5)

    neon_logger.display('====Verifying update on W_input====')
    assert allclose_with_out(dWinput_neon,
                             dWinput_ref.T,
                             rtol=0.0,
                             atol=1.5e-5)

    neon_logger.display('====Verifying update on bias====')
    assert allclose_with_out(db_neon.flatten(),
                             db_ref,
                             rtol=0.0,
                             atol=1.5e-5)

    neon_logger.display('====Verifying output delta====')
    assert allclose_with_out(lstm.out_deltas_buffer.get(),
                             dX_ref,
                             rtol=0.0,
                             atol=1.5e-5)

    neon_logger.display('bprop is verified')

    return