Code example #1
def make_ff_controller(opt):

    b, h, m, p, k = opt.b, opt.h, opt.m, opt.p, opt.k

    H = 2*h
    in_size = k + h*m
    out_size = H*m + H + H + H*3 + H + h*m + h*m + p

    # Previous reads
    r_bhm = cgt.tensor3("r", fixed_shape = (b,h,m))
    # External inputs
    X_bk = cgt.matrix("x", fixed_shape = (b,k))
    r_b_hm = r_bhm.reshape([r_bhm.shape[0], r_bhm.shape[1]*r_bhm.shape[2]])
    # Input to controller
    inp_bq = cgt.concatenate([X_bk, r_b_hm], axis=1)

    hid_sizes = opt.ff_hid_sizes
    activation = cgt.tanh

    layer_out_sizes = [in_size] + hid_sizes + [out_size]
    last_out = inp_bq
    # feedforward part. we could simplify a bit by using nn.Affine
    for i in xrange(len(layer_out_sizes)-1):
        indim = layer_out_sizes[i]
        outdim = layer_out_sizes[i+1]        
        W = cgt.shared(.02*nr.randn(indim, outdim), name="W%i"%i, fixed_shape_mask="all")
        bias = cgt.shared(.02*nr.randn(1, outdim), name="b%i"%i, fixed_shape_mask="all")
        last_out = cgt.broadcast("+",last_out.dot(W),bias,"xx,1x")
        # Don't apply nonlinearity at the last layer
        if i != len(layer_out_sizes)-2: last_out = activation(last_out)

    idx = 0
    k_bHm = last_out[:,idx:idx+H*m];      idx += H*m;         k_bHm = k_bHm.reshape([b,H,m])
    beta_bH = last_out[:,idx:idx+H];      idx += H
    g_bH = last_out[:,idx:idx+H];         idx += H
    s_bH3 = last_out[:,idx:idx+3*H];      idx += 3*H;         s_bH3 = s_bH3.reshape([b,H,3])
    gamma_bH = last_out[:,idx:idx+H];     idx += H
    e_bhm = last_out[:,idx:idx+h*m];      idx += h*m;         e_bhm = e_bhm.reshape([b,h,m])
    a_bhm = last_out[:,idx:idx+h*m];      idx += h*m;         a_bhm = a_bhm.reshape([b,h,m])
    y_bp = last_out[:,idx:idx+p];         idx += p

    k_bHm = cgt.tanh(k_bHm)
    beta_bH = nn.softplus(beta_bH)
    g_bH = cgt.sigmoid(g_bH)
    s_bH3 = sum_normalize2(cgt.exp(s_bH3))
    gamma_bH = cgt.sigmoid(gamma_bH)+1
    e_bhm = cgt.sigmoid(e_bhm)
    a_bhm = cgt.tanh(a_bhm)
    # y_bp = y_bp

    assert infer_shape(k_bHm) == (b,H,m)
    assert infer_shape(beta_bH) == (b,H)
    assert infer_shape(g_bH) == (b,H)
    assert infer_shape(s_bH3) == (b,H,3)
    assert infer_shape(gamma_bH) == (b,H)
    assert infer_shape(e_bhm) == (b,h,m)
    assert infer_shape(a_bhm) == (b,h,m)
    assert infer_shape(y_bp) == (b,p)

    return nn.Module([r_bhm, X_bk], [k_bHm, beta_bH, g_bH, s_bH3, gamma_bH, e_bhm, a_bhm, y_bp])
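As a quick sanity check on the slicing above: the eight slice widths taken out of `last_out` add up to exactly `out_size`. A minimal sketch, with arbitrary made-up sizes (none of these values come from the original code):

# Hypothetical sanity check with arbitrary sizes.
h, m, p, k = 2, 5, 4, 3
H = 2*h
out_size = H*m + H + H + H*3 + H + h*m + h*m + p
slice_widths = [H*m, H, H, 3*H, H, h*m, h*m, p]   # k, beta, g, s, gamma, e, a, y
assert sum(slice_widths) == out_size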
Code example #2
def make_ntm_initial_states(opt):
    n, m, h, b = opt.n, opt.m, opt.h, opt.b
    M_1nm = cgt.shared(.1*nr.randn(1,n,m))
    winit_1Hn = cgt.shared(.1*nr.rand(1,2*h,n))
    winit_1Hn = sum_normalize2(cgt.exp(winit_1Hn))
    rinit_1hm = cgt.shared(np.zeros((1,h,m)))
    return [cgt.repeat(arr, b, axis=0) for arr in (M_1nm, winit_1Hn, rinit_1hm)]
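`sum_normalize2` is not defined anywhere in this listing. Judging from how it is applied to the `(1, 2*h, n)` weight array here and the `(b, H, 3)` shift array above, it presumably rescales values so that each slice along the last axis sums to 1; a NumPy sketch of that assumed behaviour:

# Assumed semantics of sum_normalize2 (the helper is not shown in these snippets).
import numpy as np

def sum_normalize2_np(x):
    return x / x.sum(axis=2, keepdims=True)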
Code example #3
def test_flatvec():
    cgt.reset_config()
    cgt.set_precision('double')
    cgt.core.update_config(backend="python")  # XXX

    N = 10
    K = 3

    Xval = np.random.randn(N, K)
    wval = np.random.randn(K)
    bval = np.random.randn()
    yval = np.random.randn(N)

    X_nk = cgt.shared(Xval, "X")
    y_n = cgt.shared(yval, "y")
    w_k = cgt.shared(wval, "w")
    b = cgt.shared(bval, name="b")

    ypred = cgt.dot(X_nk, w_k) + b

    err = cgt.sum(cgt.square(ypred - y_n))
    g = cgt.grad(err, [w_k, b])
    g = core.simplify(g)

    pars = [w_k, b]
    flatx = nn.setup_contiguous_storage(pars)
    f = cgt.function([], [err, cgt.flatcat(g)])
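The test stops right after compiling `f`. Assuming, as the name suggests, that `nn.setup_contiguous_storage` returns a flat NumPy array that aliases the parameters' storage, a hypothetical continuation could take a gradient step directly on that flat vector:

# Hypothetical continuation, not part of the original test.
errval, gradval = f()
assert gradval.shape == flatx.shape   # the flat gradient matches the flat parameter vector
flatx -= 0.01 * gradval               # in-place step on the shared parameter storage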
Code example #4
File: test_devices.py (project: zxie/cgt)
def test_devices():
    N = 10
    K = 3

    compile_info = cgt.compilation.get_compile_info()
    cuda_enabled = compile_info["CGT_ENABLE_CUDA"]
    if not cuda_enabled:
        raise SkipTest("cuda disabled")

    Xval = np.random.randn(N, K).astype(cgt.floatX)
    wval = np.random.randn(K).astype(cgt.floatX)
    bval = np.asarray(np.random.randn()).astype(cgt.floatX)
    yval = np.random.randn(N).astype(cgt.floatX)

    with cgt.scoped_update_config(default_device=cgt.Device(devtype="gpu")):

        X_nk = cgt.shared(Xval, "X", device=cgt.Device(devtype='gpu'))
        y_n = cgt.shared(yval, "y")
        w_k = cgt.shared(wval, "w")
        b = cgt.shared(bval, name="b")

        print "bval", bval

        ypred = cgt.dot(cgt.square(X_nk), w_k) + b

        err = cgt.sum(cgt.sin(ypred - y_n))
        g = cgt.grad(err, [w_k, b])
        outputs = [err] + g
        f = cgt.function([], [err] + g)
        results = f()
        print results
        assert np.allclose(
            results[0],
            np.sin(np.square(Xval).dot(wval) + bval - yval).sum())
Code example #5
File: _test_flatvec.py (project: EdsterG/cgt)
def test_flatvec():
    cgt.reset_config()
    cgt.set_precision('double')
    cgt.core.update_config(backend="python") # XXX

    N = 10
    K = 3

    Xval = np.random.randn(N,K)
    wval = np.random.randn(K)
    bval = np.random.randn()
    yval = np.random.randn(N)

    X_nk = cgt.shared(Xval, "X")
    y_n = cgt.shared(yval, "y")
    w_k = cgt.shared(wval, "w")
    b = cgt.shared(bval, name="b")

    ypred = cgt.dot(X_nk, w_k) + b

    err = cgt.sum(cgt.square(ypred - y_n))
    g = cgt.grad(err, [w_k, b])
    g = core.simplify(g)

    pars = [w_k, b]
    flatx = nn.setup_contiguous_storage(pars)
    f = cgt.function([], [err,cgt.flatcat(g)])
Code example #7
File: test_devices.py (project: EdsterG/cgt)
def test_devices():
    N = 10
    K = 3

    compile_info = cgt.compilation.get_compile_info()
    cuda_enabled = compile_info["CGT_ENABLE_CUDA"]
    if not cuda_enabled:
        raise SkipTest("cuda disabled")

    Xval = np.random.randn(N,K).astype(cgt.floatX)
    wval = np.random.randn(K).astype(cgt.floatX)
    bval = np.asarray(np.random.randn()).astype(cgt.floatX)
    yval = np.random.randn(N).astype(cgt.floatX)

    with cgt.scoped_update_config(default_device=cgt.Device(devtype="gpu")):

        X_nk = cgt.shared(Xval, "X", device=cgt.Device(devtype='gpu'))
        y_n = cgt.shared(yval, "y")
        w_k = cgt.shared(wval, "w")
        b = cgt.shared(bval, name="b")

        print "bval",bval

        ypred = cgt.dot(cgt.square(X_nk), w_k) + b

        err = cgt.sum(cgt.sin(ypred - y_n))
        g = cgt.grad(err, [w_k, b])
        outputs = [err]+g
        f = cgt.function([], [err]+g)
        results = f()
        print results
        assert np.allclose(results[0] , np.sin(np.square(Xval).dot(wval)+bval-yval).sum())
Code example #9
def adadelta(cost, params, learning_rate=1.0, rho=0.95, epsilon=1e-6):
    """ Adadelta updates
    The gradient step is rescaled by the ratio of the accumulated RMS of previous updates to the accumulated RMS of gradients.

    Math:
    * ``accu_new = rho * accu + (1 - rho) * grad ** 2``
    * ``update = (grad * cgt.sqrt(delta_accu + epsilon) / cgt.sqrt(accu_new + epsilon))``
    * ``param = param - learning_rate * update``
    * ``delta_accu_new = rho * delta_accu + (1 - rho) * update ** 2``

    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables. We generate update
            expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    rho : float
        Controls decay of gradient moving average.
    epsilon : float
        Avoid division by 0 while scaling. Small constant.

    Returns
    -------
    list of tuples of the form
    (param, updates), (accumulated_grads, accumulated_grads_new), (step_accum, step_accum_new)

    References
    ----------
    .. [1] Zeiler, M. D. (2012):
           ADADELTA: An Adaptive Learning Rate Method.
           arXiv Preprint arXiv:1212.5701.
    """
    updates = []
    grads = cgt.grad(cost, params)

    for param, grad in zip(params, grads):
        assert isinstance(param.op, core.GetData)
        accu = cgt.shared(np.zeros(param.op.get_shape(), dtype=param.dtype))
        delta_accu = cgt.shared(
            np.zeros(param.op.get_shape(), dtype=param.dtype))

        accu_new = rho * accu + (1 - rho) * grad**2
        updates.append((accu, accu_new))

        update = (grad * cgt.sqrt(delta_accu + epsilon) /
                  cgt.sqrt(accu_new + epsilon))
        updates.append((param, param - learning_rate * update))

        delta_accu_new = rho * delta_accu + (1 - rho) * update**2
        updates.append((delta_accu, delta_accu_new))

    return updates
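The list returned here is meant to be passed straight to `cgt.function(..., updates=...)`, as the optimizer tests later in this listing do via `nn.adadelta`. A minimal, made-up usage sketch (the scalar quadratic loss is purely illustrative):

# Minimal usage sketch with a hypothetical loss; not taken from the original code.
w = cgt.shared(5.0)
loss = cgt.square(w - 2.0)
step = cgt.function([], loss, updates=adadelta(loss, [w]))
for _ in range(100):
    step()   # w.op.get_value() drifts towards 2.0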
Code example #10
File: test_optimizers.py (project: EdsterG/cgt)
def test_nesterov_momentum():
    results = []
    for scale in scales:
        A = cgt.shared(1.0)
        B = cgt.shared(1.0)
        updates = nn.momentum(f(A, scale) + f(B, scale), [A, B], learning_rate=0.1, mu=0.5)
        do_update = cgt.function([], [], updates=updates)
        for _ in range(10):
            do_update()
        assert np.allclose(A.op.get_value(), B.op.get_value())
        results.append(A.op.get_value().copy())

    assert np.allclose(results, torch_values['nesterov_momentum'])
Code example #11
File: nn.py (project: EdsterG/cgt)
def adadelta(cost, params, learning_rate=1.0, rho=0.95, epsilon=1e-6):
    """ Adadelta updates
    The gradient step is rescaled by the ratio of the accumulated RMS of previous updates to the accumulated RMS of gradients.

    Math:
    * ``accu_new = rho * accu + (1 - rho) * grad ** 2``
    * ``update = (grad * cgt.sqrt(delta_accu + epsilon) / cgt.sqrt(accu_new + epsilon))``
    * ``param = param - learning_rate * update``
    * ``delta_accu_new = rho * delta_accu + (1 - rho) * update ** 2``

    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables. We generate update
            expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    rho : float
        Controls decay of gradient moving average.
    epsilon : float
        Avoid division by 0 while scaling. Small constant.

    Returns
    -------
    list of tuples of the form
    (param, updates), (accumulated_grads, accumulated_grads_new), (step_accum, step_accum_new)

    References
    ----------
    .. [1] Zeiler, M. D. (2012):
           ADADELTA: An Adaptive Learning Rate Method.
           arXiv Preprint arXiv:1212.5701.
    """
    updates = []
    grads = cgt.grad(cost, params)

    for param, grad in zip(params, grads):
        assert isinstance(param.op, core.GetData)
        accu = cgt.shared(np.zeros(param.op.get_shape(), dtype=param.dtype))
        delta_accu = cgt.shared(np.zeros(param.op.get_shape(), dtype=param.dtype))

        accu_new = rho * accu + (1 - rho) * grad ** 2
        updates.append((accu, accu_new))

        update = (grad * cgt.sqrt(delta_accu + epsilon) / cgt.sqrt(accu_new + epsilon))
        updates.append((param, param - learning_rate * update))

        delta_accu_new = rho * delta_accu + (1 - rho) * update ** 2
        updates.append((delta_accu, delta_accu_new))

    return updates
Code example #12
File: test_optimizers.py (project: zxie/cgt)
def test_sgd():
    results = []
    for scale in scales:
        A = cgt.shared(1.0)
        B = cgt.shared(1.0)
        updates = nn.sgd(f(A, scale) + f(B, scale), [A, B], learning_rate=0.1)
        do_update = cgt.function([], [], updates=updates)
        for _ in range(10):
            do_update()

        assert np.allclose(A.op.get_value(), B.op.get_value())
        results.append(A.op.get_value().copy())

    assert np.allclose(results, torch_values['sgd'])
Code example #13
File: test_optimizers.py (project: zxie/cgt)
def test_adadelta():
    results = []
    for scale in scales:
        A = cgt.shared(1.0)
        B = cgt.shared(1.0)
        updates = nn.adadelta(f(A, scale) + f(B, scale), [A, B])
        do_update = cgt.function([], [], updates=updates)
        for _ in range(10):
            do_update()

        assert np.allclose(A.op.get_value(), B.op.get_value())
        results.append(A.op.get_value().copy())

    assert np.allclose(results, torch_values['adadelta'])
Code example #14
File: test_optimizers.py (project: EdsterG/cgt)
def test_adadelta():
    results = []
    for scale in scales:
        A = cgt.shared(1.0)
        B = cgt.shared(1.0)
        updates = nn.adadelta(f(A, scale) + f(B, scale), [A, B])
        do_update = cgt.function([], [], updates=updates)
        for _ in range(10):
            do_update()

        assert np.allclose(A.op.get_value(), B.op.get_value())
        results.append(A.op.get_value().copy())

    assert np.allclose(results, torch_values['adadelta'])
Code example #15
File: test_optimizers.py (project: EdsterG/cgt)
def test_rmsprop():
    results = []
    for scale in scales:
        A = cgt.shared(1.0)
        B = cgt.shared(1.0)
        updates = nn.rmsprop(f(A, scale) + f(B, scale), [A, B], learning_rate=0.01)
        do_update = cgt.function([], [], updates=updates)
        for _ in range(10):
            do_update()

        assert np.allclose(A.op.get_value(), B.op.get_value())
        results.append(A.op.get_value().copy())

    assert np.allclose(results, torch_values['rmsprop'])
Code example #16
File: test_optimizers.py (project: mirwang666/cgt)
def run_nesterov_momentum():
    results = []
    for scale in scales:
        A = cgt.shared(1.0)
        B = cgt.shared(1.0)
        updates = nn.momentum(f(A, scale) + f(B, scale), [A, B], learning_rate=0.1, momentum=0.5)
        do_update = cgt.function([], [], updates=updates)
        for _ in range(10):
            do_update()

        assert np.allclose(A.op.get_value(), B.op.get_value())
        results.append(A.op.get_value().copy())

    assert np.allclose(results, torch_values['nesterov_momentum'])
Code example #17
def momentum(cost, params, learning_rate, mu=0.9):
    """Stochastic Gradient Descent (SGD) updates with momentum
    Math:
    * ``velocity := mu * velocity - learning_rate * grad``
    * ``param := param + velocity``
    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables. We generate update
            expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    mu : float
        Tunes the weight given to the velocity term.
    
    Returns
    -------
    list of tuples of the form (param, new_param) and (velocity, new_velocity)
    """
    updates = []
    grads = cgt.grad(cost, params)
    for param, grad in zip(params, grads):
        assert isinstance(param.op, core.GetData)
        velocity = cgt.shared(np.zeros(param.op.get_shape(),
                                       dtype=param.dtype))
        new_velocity = mu * velocity - learning_rate * grad
        new_param = param + new_velocity
        updates.append((velocity, new_velocity))
        updates.append((param, new_param))

    return updates
Code example #18
File: nn.py (project: x724/cgt)
def nesterov_momentum(cost, params, learning_rate, momentum=0.9):
    """Stochastic Gradient Descent (SGD) updates with Nesterov momentum

    Math:
    * ``velocity := momentum * velocity - learning_rate * grad``
    * ``param := momentum*velocity + param - learning_rate * grad``

    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables. We generate update
            expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    momentum: float
        Tunes the weight given to the velocity term.

    Returns
    -------
    list of tuples of the form [(param, param_update), (velocity, velocity_update)]
    """
    updates = []
    grads = cgt.grad(cost, params)

    for param, grad in zip(params, grads):
        value = param.op.get_value()
        velocity = cgt.shared(np.zeros(value.shape, dtype=value.dtype))
        x = momentum * velocity - learning_rate * grad
        updates.append((velocity, x))
        updates.append((param, momentum * x + param - learning_rate * grad))

    return updates
Code example #19
File: nn.py (project: EdsterG/cgt)
def momentum(cost, params, learning_rate, mu=0.9):
    """Stochastic Gradient Descent (SGD) updates with momentum
    Math:
    * ``velocity := mu * velocity - learning_rate * grad``
    * ``param := param + velocity``
    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables. We generate update
            expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    mu : float
        Tunes the weight given to the velocity term.
    
    Returns
    -------
    list of tuples of the form (param, new_param) and (velocity, new_velocity)
    """
    updates = []
    grads = cgt.grad(cost, params)
    for param, grad in zip(params, grads):
        assert isinstance(param.op, core.GetData)
        velocity = cgt.shared(np.zeros(param.op.get_shape(), dtype=param.dtype))
        new_velocity = mu * velocity - learning_rate * grad
        new_param = param + new_velocity
        updates.append((velocity, new_velocity))
        updates.append((param, new_param))

    return updates
Code example #20
File: nn.py (project: nebw/cgt)
def nesterov_momentum(cost, params, learning_rate, momentum=0.9):
    """Stochastic Gradient Descent (SGD) updates with Nesterov momentum

    Math:
    * ``velocity := momentum * velocity - learning_rate * grad``
    * ``param := momentum*velocity + param - learning_rate * grad``

    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables. We generate update
            expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    momentum: float
        Tunes the weight given to the velocity term.

    Returns
    -------
    list of tuples of the form [(param, param_update), (velocity, velocity_update)]
    """
    updates = []
    grads = cgt.grad(cost, params)

    for param, grad in zip(params, grads):
        value = param.op.get_value()
        velocity = cgt.shared(np.zeros(value.shape, dtype=value.dtype))
        x = momentum * velocity - learning_rate * grad
        updates.append((velocity, x))
        updates.append((param, momentum*x + param - learning_rate * grad))

    return updates
Code example #21
File: nn.py (project: x724/cgt)
def parameter(val, name=None, device=None):
    fixed_shape_mask = "all"
    out = cgt.shared(val,
                     name=name,
                     device=device,
                     fixed_shape_mask=fixed_shape_mask)
    out.props["is_parameter"] = True
    return out
Code example #22
def test_update():
    with cgt.scoped_update_config(parallel=True):
        xval = np.array(1.5)
        x = cgt.shared(xval)
        f = cgt.function([], x.sum(), updates=[(x, x + 1)])
        before = x.op.get_value().copy()
        f()
        after = x.op.get_value()
        assert np.allclose(after, before + 1)
Code example #23
File: FC.py (project: zuiwufenghua/VIN)
def adagrad_updates(cost, params, stepsize=0.001, rho=0.9, epsilon=1e-6):
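    # Note: despite its name, this follows the Adadelta-style rule shown elsewhere in this
    # listing (decaying accumulators for both squared gradients and squared updates), with
    # `stepsize` playing the role of `learning_rate`.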
    grads = cgt.grad(cost, params)
    updates = []
    for param, grad in zip(params, grads):
        value = param.op.get_value()
        accu = cgt.shared(np.zeros(value.shape, dtype=value.dtype))
        delta_accu = cgt.shared(np.zeros(value.shape, dtype=value.dtype))

        accu_new = rho * accu + (1 - rho) * grad**2
        updates.append((accu, accu_new))

        update = (grad * cgt.sqrt(delta_accu + epsilon) /
                  cgt.sqrt(accu_new + epsilon))
        updates.append((param, param - stepsize * update))

        delta_accu_new = rho * delta_accu + (1 - rho) * update**2
        updates.append((delta_accu, delta_accu_new))
    return updates
Code example #24
File: test_par_interp.py (project: ketranm/cgt)
def test_update():
    with cgt.scoped_update_config(parallel = True, backend="native"):
        xval = np.array(1.5)
        x = cgt.shared(xval)
        f = cgt.function([], x.sum(), updates=[(x,x+1)])
        before = x.op.get_value().copy()
        f()
        after = x.op.get_value()
        assert np.allclose(after , before+1)
Code example #25
File: demo_mnist.py (project: EdsterG/cgt)
def rmsprop_updates(cost, params, stepsize=0.001, rho=0.9, epsilon=1e-6):
    grads = cgt.grad(cost, params)
    updates = []
    for p, g in zip(params, grads):
        acc = cgt.shared(p.op.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * cgt.square(g)
        gradient_scaling = cgt.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - stepsize * g))
    return updates
Code example #26
def rmsprop_updates(cost, params, stepsize=0.001, rho=0.9, epsilon=1e-6):
    grads = cgt.grad(cost, params)
    updates = []
    for p, g in zip(params, grads):
        acc = cgt.shared(p.op.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * cgt.square(g)
        gradient_scaling = cgt.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - stepsize * g))
    return updates
Code example #27
File: __init__.py (project: AliceLane/tensorfuse)
def shared(val, name=None, broadcastable=None, borrow=False):
    if is_theano():
        return theano.shared(val, name=name, broadcastable=broadcastable)
    elif is_cgt():
        return cgt.shared(val, name=name)
    else:
        var = tf.Variable(val.astype(floatX), name=name)
        var._tensorfuse_shape_template = val.shape
        var._tensorfuse_shared = True
        compat.tf_add_blank_var(var)
        return var
Code example #28
def shared(val, name=None, broadcastable=None, borrow=False):
    if is_theano():
        return theano.shared(val, name=name, broadcastable=broadcastable)
    elif is_cgt():
        return cgt.shared(val, name=name)
    else:
        var = tf.Variable(val.astype(floatX), name=name)
        var._tensorfuse_shape_template = val.shape
        var._tensorfuse_shared = True
        compat.tf_add_blank_var(var)
        return var
Code example #29
    def __init__(self, xdim, args, dec="bernoulli"):
        self.xdim = xdim
        self.hdim = args.hdim
        self.zdim = args.zdim
        self.lmbda = args.lmbda  # weight decay coefficient * 2
        self.x = cgt.matrix("x", dtype=cgt.floatX)
        self.eps = cgt.matrix("eps", dtype=cgt.floatX)

        self.enc_mlp = GaussianMLP(self.x, self.xdim, self.hdim, self.zdim, nlayers=args.nlayers, eps=self.eps)
        if dec == "bernoulli":
            # log p(x | z) defined as -CE(x, y) = dec_mlp.cost(y)
            self.dec_mlp = BernoulliMLP(self.enc_mlp.out, self.zdim, self.hdim, self.xdim, nlayers=args.nlayers, y=self.x)
        elif dec == "gaussian":
            self.dec_mlp = GaussianMLP(self.enc_mlp.out, self.zdim, self.hdim, self.xdim, nlayers=args.nlayers, y=self.x)
        else:
            raise RuntimeError("unrecognized decoder %s" % dec)

        self.cost = (-cgt.sum(kld_unit_mvn(self.enc_mlp.mu, self.enc_mlp.var)) + self.dec_mlp.cost) / args.batch_size
        self.params = self.enc_mlp.params + self.dec_mlp.params
        # L2 regularization
        self.gparams = [cgt.grad(self.cost, [p])[0] + self.lmbda * p for p in self.params]
        self.gaccums = [cgt.shared(np.zeros(p.op.get_value().shape, dtype=cgt.floatX)) for p in self.params]

        # XXX replace w/ adagrad update from nn
        ADAGRAD_EPS = 1e-10  # for stability
        self.updates = [
            (param, param - args.lr * gparam / cgt.sqrt(gaccum + cgt.square(gparam) + ADAGRAD_EPS))
            for param, gparam, gaccum in zip(self.params, self.gparams, self.gaccums)
        ]
        self.updates += [
            (gaccum, gaccum + cgt.square(gparam))
            for gaccum, gparam in zip(self.gaccums, self.gparams)
        ]

        self.train = cgt.function(
            [self.x, self.eps],
            self.cost,
            updates=self.updates
        )
        self.test = cgt.function(
            [self.x, self.eps],
            self.cost,
            updates=None
        )
        # can be used for semi-supervised learning for example
        self.encode = cgt.function(
            [self.x, self.eps],
            self.enc_mlp.out
        )
Code example #30
def test_array_wrapper():
    xval = np.zeros(10)
    x = cgt.shared(xval)
    f = cgt.function([], [], updates=[(x, x + 1)])
    f()
    g = cgt.function([], x.sum())
    assert np.allclose(x.op.get_value(), xval + 1)
    xval2 = np.arange(10)
    x.op.set_value(xval2)
    print x.op.get_value()
    assert np.allclose(x.op.get_value(), xval2)
    assert g() == xval2.sum()
    f()
    assert np.allclose(x.op.get_value(), xval2 + 1)
    assert g() == (xval2 + 1).sum()
Code example #31
    def __init__(self, input, n_in, n_out, W=None, b=None,
                 activation=cgt.tanh, prefix=""):
        self.n_in = n_in
        self.n_out = n_out

        if W is None:
            # XXX replace with nn init
            W_values = np.asarray(
                rng.uniform(
                    low=-np.sqrt(6. / (n_in + n_out)),
                    high=np.sqrt(6. / (n_in + n_out)),
                    size=(n_in, n_out)
                ),
                dtype=cgt.floatX
            )
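            # The 4x scaling for sigmoid units below follows the Glorot & Bengio (2010)
            # initialization heuristic.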
            if activation == cgt.sigmoid:
                W_values *= 4

            W = cgt.shared(W_values, name=prefix+"_W")

        if b is None:
            b_values = np.zeros((n_out,), dtype=cgt.floatX)
            b = cgt.shared(b_values, name=prefix+"_b")

        self.W = W
        self.b = b

        # XXX broadcast api may change
        lin_output = cgt.broadcast("+", cgt.dot(input, self.W),
                cgt.dimshuffle(self.b, ["x", 0]), "xx,1x")
        self.output = (
            lin_output if activation is None
            else activation(lin_output)
        )
        # parameters of the model
        self.params = [self.W, self.b]
Code example #32
File: test_array_wrapper.py (project: EdsterG/cgt)
def test_array_wrapper():
    xval = np.zeros(10)
    x = cgt.shared(xval)
    f = cgt.function([],[],updates=[(x,x+1)])
    f()
    g = cgt.function([],x.sum())
    assert np.allclose(x.op.get_value(), xval+1)
    xval2 = np.arange(10)
    x.op.set_value(xval2)
    print x.op.get_value()
    assert np.allclose(x.op.get_value(), xval2)
    assert g() == xval2.sum()
    f()
    assert np.allclose(x.op.get_value(), xval2+1)
    assert g() == (xval2+1).sum()
Code example #33
File: test_inc_subtensor.py (project: zxie/cgt)
def test_incsubtensor0():
    # First let's test fancy slice along zeroth dimension

    W = cgt.shared(np.zeros((5, 3)), name="W")
    inc = cgt.matrix()  # we'll increment W by this matrix
    incval = np.arange(9).reshape(3, 3)

    inds = cgt.vector(dtype='i8')
    updates = {W: cgt.inc_subtensor(W, inds, inc)}
    f = cgt.function([inds, inc], [], updates=updates)
    f([1, 2, 4], incval)

    assert np.allclose(
        W.op.get_value(),
        np.array([[0., 0., 0.], [0., 1., 2.], [3., 4., 5.], [0., 0., 0.],
                  [6., 7., 8.]]))
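For reference, the expected result can be reproduced in plain NumPy, where `np.add.at` performs the same fancy-indexed increment:

# NumPy equivalent of the inc_subtensor update above.
Wnp = np.zeros((5, 3))
np.add.at(Wnp, [1, 2, 4], np.arange(9).reshape(3, 3))
# Wnp now equals the array asserted against W.op.get_value().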
Code example #34
File: test_array_wrapper.py (project: zclfly/cgt)
def runtest(backend, precision):
    with cgt.scoped_update_config(backend='native', precision=precision):
        xval = np.zeros(10)
        x = cgt.shared(xval)
        f = cgt.function([], [], updates=[(x, x + 1)])
        f()
        g = cgt.function([], x.sum())
        assert np.allclose(x.op.get_value(), xval + 1)
        xval2 = np.arange(10)
        x.op.set_value(xval2)
        print x.op.get_value()
        assert np.allclose(x.op.get_value(), xval2)
        assert g() == xval2.sum()
        f()
        assert np.allclose(x.op.get_value(), xval2 + 1)
        assert g() == (xval2 + 1).sum()
Code example #35
File: test_array_wrapper.py (project: ketranm/cgt)
def runtest(backend, precision):
    with cgt.scoped_update_config(backend='native',precision=precision):
        xval = np.zeros(10)
        x = cgt.shared(xval)
        f = cgt.function([],[],updates=[(x,x+1)])
        f()
        g = cgt.function([],x.sum())
        assert np.allclose(x.op.get_value(), xval+1)
        xval2 = np.arange(10)
        x.op.set_value(xval2)
        print x.op.get_value()
        assert np.allclose(x.op.get_value(), xval2)
        assert g() == xval2.sum()
        f()
        assert np.allclose(x.op.get_value(), xval2+1)
        assert g() == (xval2+1).sum()
Code example #36
File: gru.py (project: EdsterG/cgt)
    def __init__(self,input_sizes,mem_size,name_prefix=""):

        Wiz_vals = [normc(randnf(input_size,mem_size)) for input_size in input_sizes]
        self.Wizs = [cgt.shared(Wiz_val,name=name_prefix+"Wiz") for Wiz_val in Wiz_vals]
        Wmz_val = normc(randnf(mem_size,mem_size))
        self.Wmz = cgt.shared(Wmz_val,name=name_prefix+"Wmz")
        bz = np.zeros((1,mem_size),cgt.floatX)
        self.bz = cgt.shared(bz,name=name_prefix+"bz")

        Wir_vals = [normc(randnf(input_size,mem_size)) for input_size in input_sizes]
        self.Wirs = [cgt.shared(Wir_val,name=name_prefix+"Wir") for Wir_val in Wir_vals]
        Wmr_val = normc(randnf(mem_size,mem_size))
        self.Wmr = cgt.shared(Wmr_val,name=name_prefix+"Wmr")
        br = np.zeros((1,mem_size),cgt.floatX)
        self.br = cgt.shared(br,name=name_prefix+"br")

        Wim_vals = [normc(randnf(input_size,mem_size)) for input_size in input_sizes]
        self.Wims = [cgt.shared(Wim_val,name=name_prefix+"Wim") for Wim_val in Wim_vals]
        Wmm_val = normc(np.eye(mem_size,dtype=cgt.floatX))
        self.Wmm = cgt.shared(Wmm_val,name=name_prefix+"Wmm")
        bm = np.zeros((1,mem_size),cgt.floatX)
        self.bm = cgt.shared(bm,name=name_prefix+"bm")
Code example #37
    def __init__(self, input_sizes, mem_size, name_prefix=""):

        Wiz_vals = [
            normc(randnf(input_size, mem_size)) for input_size in input_sizes
        ]
        self.Wizs = [
            cgt.shared(Wiz_val, name=name_prefix + "Wiz")
            for Wiz_val in Wiz_vals
        ]
        Wmz_val = normc(randnf(mem_size, mem_size))
        self.Wmz = cgt.shared(Wmz_val, name=name_prefix + "Wmz")
        bz = np.zeros((1, mem_size), cgt.floatX)
        self.bz = cgt.shared(bz, name=name_prefix + "bz")

        Wir_vals = [
            normc(randnf(input_size, mem_size)) for input_size in input_sizes
        ]
        self.Wirs = [
            cgt.shared(Wir_val, name=name_prefix + "Wir")
            for Wir_val in Wir_vals
        ]
        Wmr_val = normc(randnf(mem_size, mem_size))
        self.Wmr = cgt.shared(Wmr_val, name=name_prefix + "Wmr")
        br = np.zeros((1, mem_size), cgt.floatX)
        self.br = cgt.shared(br, name=name_prefix + "br")

        Wim_vals = [
            normc(randnf(input_size, mem_size)) for input_size in input_sizes
        ]
        self.Wims = [
            cgt.shared(Wim_val, name=name_prefix + "Wim")
            for Wim_val in Wim_vals
        ]
        Wmm_val = normc(np.eye(mem_size, dtype=cgt.floatX))
        self.Wmm = cgt.shared(Wmm_val, name=name_prefix + "Wmm")
        bm = np.zeros((1, mem_size), cgt.floatX)
        self.bm = cgt.shared(bm, name=name_prefix + "bm")
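The `__call__` that consumes these parameters is not shown in the listing; the names (update gate z, reset gate r, candidate memory m) suggest the standard GRU recurrence. A rough NumPy sketch of the assumed step, for a single input stream and ignoring the per-input weight lists:

# Assumed GRU step implied by the parameter names above; illustration only.
import numpy as np

def sigmoid(a):
    return 1.0 / (1.0 + np.exp(-a))

def gru_step_np(x, m_prev, Wiz, Wmz, bz, Wir, Wmr, br, Wim, Wmm, bm):
    z = sigmoid(x.dot(Wiz) + m_prev.dot(Wmz) + bz)               # update gate
    r = sigmoid(x.dot(Wir) + m_prev.dot(Wmr) + br)               # reset gate
    m_cand = np.tanh(x.dot(Wim) + (r * m_prev).dot(Wmm) + bm)    # candidate memory
    return (1 - z) * m_prev + z * m_cand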
Code example #38
def test_lrn():
    if not get_compile_info()["CGT_ENABLE_CUDA"]:
        raise SkipTest("Skipping because CUDA disabled")

    nr.seed(0)
    Xval = nr.randn(4, 8, 16, 16)
    X = cgt.shared(Xval, name="X", fixed_shape_mask="all")
    # X = cgt.tensor4(name='X')
    y = cross_channel_lrn(X, localsize=4, alpha=.1, beta=.5)
    f = cgt.function([], y)
    print f().sum()
    print f().sum()
    print f().sum()
    assert np.isfinite(f().sum())
    # print f(Xval).sum()
    a = nr.rand(*cgt.infer_shape(y))
    loss = (y * a).sum()
    gradcheck_model(loss, [X], eps=1e-5)
Code example #39
File: test_inc_subtensor.py (project: zxie/cgt)
def test_incsubtensor2():
    W = cgt.shared(np.zeros((5, 3)), name="W")
    i0 = cgt.vector(dtype='i8')
    i1 = cgt.vector(dtype='i8')
    inc = cgt.vector()

    updates2 = {W: cgt.inc_subtensor(W, (i0, i1), inc)}
    f2 = cgt.function([i0, i1, inc], [], updates=updates2)
    f2([0, 1, 2, 2], [0, 1, 2, 2], [1, 2, 3, 4])
    assert np.allclose(
        W.op.get_value(),
        np.array([
            [1., 0., 0.],
            [0., 2., 0.],
            [0., 0., 7.],
            [0., 0., 0.],
            [0., 0., 0.],
        ]))
Code example #40
File: test_imgproc.py (project: EdsterG/cgt)
def test_lrn():
    if not get_compile_info()["CGT_ENABLE_CUDA"]:
        raise SkipTest("Skipping because CUDA disabled")

    nr.seed(0)
    Xval = nr.randn(4,8,16,16)
    X = cgt.shared(Xval, name="X", fixed_shape_mask="all")
    # X = cgt.tensor4(name='X')
    y = cross_channel_lrn(X, localsize=4, alpha=.1, beta=.5)
    f = cgt.function([],y)
    print f().sum()
    print f().sum()
    print f().sum()
    assert np.isfinite(f().sum())
    # print f(Xval).sum()
    a = nr.rand(*cgt.infer_shape(y))
    loss = (y*a).sum()
    gradcheck_model(loss, [X],eps=1e-5)
Code example #41
File: test_inc_subtensor.py (project: EdsterG/cgt)
def test_incsubtensor2():
    W = cgt.shared(np.zeros((5,3)), name="W")
    i0 = cgt.vector(dtype='i8')
    i1 = cgt.vector(dtype='i8')
    inc = cgt.vector()

    updates2 = {W : cgt.inc_subtensor(W, (i0,i1), inc)}
    f2 = cgt.function([i0,i1,inc],[],updates=updates2)
    f2([0,1,2,2],[0,1,2,2],[1,2,3,4])
    assert np.allclose(W.op.get_value(), 
        np.array(
        [
         [ 1.,  0.,  0.],
         [ 0.,  2.,  0.],
         [ 0.,  0.,  7.],
         [ 0.,  0.,  0.],
         [ 0.,  0.,  0.],
         ]))
Code example #42
File: nn.py (project: x724/cgt)
def rmsprop(cost, params, learning_rate=1.0, rho=0.9, epsilon=1e-6):
    """RMSProp updates
    Divides the learning rate by a moving root-mean-square of recent gradients; see [1].

    Math:
    * ``accu_new = rho * accu + (1 - rho) * grad ** 2``
    * ``param = param - (learning_rate * grad / cgt.sqrt(accu_new + epsilon))``

    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables. We generate update
            expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    rho : float
        Controls decay of gradient moving average.
    epsilon : float
        Avoid division by 0 while scaling. Small constant.

    Returns
    -------
    list of tuples of the form [(param, updates), (accumulated_RMS_grads, accumulated_RMS_grads_new)]

    References
    ----------
    .. [1] Yann N. Dauphin, Harm de Vries, Junyoung Chung, Yoshua Bengio (2015):
           RMSProp and equilibrated adaptive learning rates for non-convex optimization
           arXiv:1502.04390 http://arxiv.org/abs/1502.04390
    """

    updates = []
    grads = cgt.grad(cost, params)

    for param, grad in zip(params, grads):
        value = param.op.get_value()
        accu = cgt.shared(np.zeros(value.shape, dtype=value.dtype))
        accu_new = rho * accu + (1 - rho) * grad**2
        updates.append((accu, accu_new))
        updates.append(
            (param,
             param - (learning_rate * grad / cgt.sqrt(accu_new + epsilon))))

    return updates
Code example #43
File: test_inc_subtensor.py (project: zxie/cgt)
def test_incsubtensor1():
    W = cgt.shared(np.zeros((5, 3)), name="W")
    inc = cgt.matrix()  # we'll increment W by this matrix
    incval = np.arange(9).reshape(3, 3)

    start = cgt.scalar(dtype='i8')
    stop = cgt.scalar(dtype='i8')
    updates = {W: cgt.inc_subtensor(W, slice(start, stop), inc)}
    f = cgt.function([start, stop, inc], [], updates=updates)
    f(0, 3, incval)
    assert np.allclose(
        W.op.get_value(),
        np.array([
            [0., 1., 2.],
            [3., 4., 5.],
            [6., 7., 8.],
            [0., 0., 0.],
            [0., 0., 0.],
        ]))
Code example #44
File: test_inc_subtensor.py (project: EdsterG/cgt)
def test_incsubtensor1():
    W = cgt.shared(np.zeros((5,3)), name="W")
    inc = cgt.matrix() # we'll increment W by this matrix
    incval = np.arange(9).reshape(3,3)

    start = cgt.scalar(dtype='i8')
    stop = cgt.scalar(dtype='i8')
    updates = {W : cgt.inc_subtensor(W, slice(start, stop), inc)}
    f = cgt.function([start,stop,inc],[],updates=updates)
    f(0,3,incval)
    assert np.allclose(W.op.get_value(), 
        np.array(
        [
         [ 0.,  1.,  2.],
         [ 3.,  4.,  5.],
         [ 6.,  7.,  8.],
         [ 0.,  0.,  0.],
         [ 0.,  0.,  0.],
         ]))
Code example #45
File: nn.py (project: EdsterG/cgt)
def rmsprop(cost, params, learning_rate=1.0, rho=0.9, epsilon=1e-6):
    """RMSProp updates
    Divides the learning rate by a moving root-mean-square of recent gradients; see [1].

    Math:
    * ``accu_new = rho * accu + (1 - rho) * grad ** 2``
    * ``param = param - (learning_rate * grad / cgt.sqrt(accu_new + epsilon))``

    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables. We generate update
            expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    rho : float
        Controls decay of gradient moving average.
    epsilon : float
        Avoid division by 0 while scaling. Small constant.

    Returns
    -------
    list of tuples of the form (param, updates), (accumulated_RMS_grads, accumulated_RMS_grads_new)

    References
    ----------
    .. [1] Yann N. Dauphin, Harm de Vries, Junyoung Chung, Yoshua Bengio (2015):
           RMSProp and equilibrated adaptive learning rates for non-convex optimization
           arXiv:1502.04390 http://arxiv.org/abs/1502.04390
    """

    updates = []
    grads = cgt.grad(cost, params)

    for param, grad in zip(params, grads):
        assert isinstance(param.op, core.GetData)
        accu = cgt.shared(np.zeros(param.op.get_shape(), dtype=param.dtype))
        accu_new = rho * accu + (1 - rho) * grad ** 2
        updates.append((accu, accu_new))
        updates.append((param, param - (learning_rate * grad / cgt.sqrt(accu_new + epsilon))))

    return updates
Code example #46
File: test_inc_subtensor.py (project: EdsterG/cgt)
def test_incsubtensor0():
    # First let's test fancy slice along zeroth dimension

    W = cgt.shared(np.zeros((5,3)), name="W")
    inc = cgt.matrix() # we'll increment W by this matrix
    incval = np.arange(9).reshape(3,3)
    

    inds = cgt.vector(dtype='i8')
    updates = {W : cgt.inc_subtensor(W, inds, inc)}
    f = cgt.function([inds,inc],[],updates=updates)
    f([1,2,4],incval)

    assert np.allclose(W.op.get_value(), 
        np.array(
        [[ 0.,  0.,  0.],
         [ 0.,  1.,  2.],
         [ 3.,  4.,  5.],
         [ 0.,  0.,  0.],
         [ 6.,  7.,  8.]]))
Code example #47
File: nn.py (project: x724/cgt)
def adagrad(cost, params, learning_rate=1.0, epsilon=1e-6):
    """Adagrad updates
    The learning rate will be scaled by dividing it by the square root of the sum of accumulated squared gradients.

    Math:
    * ``accu_new = accu + grad ** 2``
    * ``param = param - (learning_rate * grad) / cgt.sqrt(accu_new + epsilon)``

    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables. We generate update
            expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    epsilon: avoids division close to zero. Small float.

    Returns
    -------
    list of tuples of the form [(param, updates), (accumulated_grads, accumulated_grads_new)]

    References
    ----------
    .. [1] Duchi, J., Hazan, E., & Singer, Y. (2011):
           Adaptive subgradient methods for online learning and stochastic
           optimization. JMLR, 12:2121-2159.
    """

    updates = []
    grads = cgt.grad(cost, params)

    for param, grad in zip(params, grads):
        value = param.op.get_value()
        accu = cgt.shared(np.zeros(value.shape, dtype=value.dtype))
        accu_new = accu + grad**2
        updates.append((accu, accu_new))
        updates.append(
            (param,
             param - (learning_rate * grad) / cgt.sqrt(accu_new + epsilon)))

    return updates
Code example #48
File: nn.py (project: EdsterG/cgt)
def adagrad(cost, params, learning_rate=1.0, epsilon=1e-6):
    """Adagrad updates
    The learning rate will be scaled by dividing it by the square root of the sum of accumulated squared gradients.

    Math:
    * ``accu_new = accu + grad ** 2``
    * ``param = param - (learning_rate * grad) / cgt.sqrt(accu_new + epsilon)``

    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables. We generate update
            expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    epsilon: avoids division close to zero. Small float.

    Returns
    -------
    list of tuples of the form [(param, updates), (accumulated_grads, accumulated_grads_new)]

    References
    ----------
    .. [1] Duchi, J., Hazan, E., & Singer, Y. (2011):
           Adaptive subgradient methods for online learning and stochastic
           optimization. JMLR, 12:2121-2159.
    """

    updates = []
    grads = cgt.grad(cost, params)

    for param, grad in zip(params, grads):
        assert isinstance(param.op, core.GetData)
        accu = cgt.shared(np.zeros(param.op.get_shape(), dtype=param.dtype))
        accu_new = accu + grad ** 2
        updates.append((accu, accu_new))
        updates.append((param, param - (learning_rate * grad) / cgt.sqrt(accu_new + epsilon)))

    return updates
Code example #49
def nesterov_momentum(cost, params, learning_rate, mu=0.9):
    """Stochastic Gradient Descent (SGD) updates with Nesterov momentum

    Math:
    * ``new_velocity := mu * velocity - learning_rate * grad``
    * ``param := param - mu * velocity + (1 + mu) * new_velocity``

    See http://arxiv.org/abs/1212.0901v2, first part of eq 7
    At each step we're returning the "peeked-ahead" parameters.


    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables. We generate update
            expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    mu: float
        Tunes the weight given to the velocity term.

    Returns
    -------
    list of tuples of the form (param, updates), (velocity, velocity_update)
    """
    updates = []
    grads = cgt.grad(cost, params)

    for param, grad in zip(params, grads):
        assert isinstance(param.op, core.GetData)
        velocity = cgt.shared(np.zeros(param.op.get_shape(),
                                       dtype=param.dtype))
        new_velocity = mu * velocity - learning_rate * grad
        new_param = param - mu * velocity + (mu + 1) * new_velocity
        updates.append((velocity, new_velocity))
        updates.append((param, new_param))

    return updates
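A quick check that this update really tracks the peeked-ahead point. Writing ``phi_t = theta_t + mu*v_t`` for the peeked-ahead parameters, and using the plain-momentum updates ``v_{t+1} = mu*v_t - learning_rate*grad`` and ``theta_{t+1} = theta_t + v_{t+1}``:

    phi_{t+1} = theta_{t+1} + mu*v_{t+1}
              = (theta_t + v_{t+1}) + mu*v_{t+1}
              = (phi_t - mu*v_t) + (1 + mu)*v_{t+1}

which is exactly ``new_param = param - mu * velocity + (mu + 1) * new_velocity``, with ``param`` standing for the peeked-ahead value ``phi``.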
Code example #50
File: nn.py (project: EdsterG/cgt)
def nesterov_momentum(cost, params, learning_rate, mu=0.9):
    """Stochastic Gradient Descent (SGD) updates with Nesterov momentum

    Math:
    * ``new_velocity := mu * velocity - learning_rate * grad``
    * ``param := param - mu * velocity + (1 + mu) * new_velocity``

    See http://arxiv.org/abs/1212.0901v2, first part of eq 7
    At each step we're returning the "peeked-ahead" parameters.


    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables. We generate update
            expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    mu: float
        Tunes the weight given to the velocity term.

    Returns
    -------
    list of tuples of the form (param, updates), (velocity, velocity_update)
    """
    updates = []
    grads = cgt.grad(cost, params)

    for param, grad in zip(params, grads):
        assert isinstance(param.op, core.GetData)
        velocity = cgt.shared(np.zeros(param.op.get_shape(), dtype=param.dtype))
        new_velocity = mu * velocity - learning_rate * grad
        new_param = param - mu * velocity + (mu + 1) * new_velocity
        updates.append((velocity, new_velocity))
        updates.append((param, new_param))

    return updates
Code example #51
File: demo_mnist.py (project: EdsterG/cgt)
def init_weights(*shape):
    return cgt.shared(np.random.randn(*shape) * 0.01, fixed_shape_mask='all')
Code example #52
File: caffe2cgt.py (project: EdsterG/cgt)
     crop_size = tp.crop_size
     chans = len(tp.mean_value)
     dp = layer.data_param
     batch_size = dp.batch_size
     output = [cgt.tensor(dtype=cgt.floatX,ndim=4,name=layer.name, fixed_shape=(batch_size,chans,crop_size,crop_size)),
               cgt.tensor(dtype='i8',ndim=2,name=layer.name, fixed_shape=(batch_size, 1))]
 elif layer.type == "Convolution":
     X = inputs[0]
     param = layer.convolution_param
     kh,kw = (param.kernel_size, param.kernel_size) if param.HasField("kernel_size")\
         else (param.kernel_h, param.kernel_w)
     nchanin = infer_shape(X)[0]
     Wshape = (param.num_output, nchanin, kh, kw)
     Wname = layer.param[0].name or layer.name+":W"
     Wval = np.empty(Wshape, dtype=cgt.floatX)
     W = name2node[Wname] = cgt.shared(Wval, name=Wname, fixed_shape_mask="all")
     bshape = (1, param.num_output, 1, 1)
     bname = layer.param[1].name or layer.name+":b"
     bval = np.empty(bshape, dtype=cgt.floatX)
     b = name2node[bname] = cgt.shared(bval, name=bname, fixed_shape_mask="all")
     sh,sw = (param.stride, param.stride) if param.HasField("stride")\
         else (param.stride_h, param.stride_w)
     output = [cgt.broadcast("+",nn.conv2d(X, W, subsample=(sh,sw)), b, "xxxx,1x11")]
 elif layer.type == "Pooling":
     param = layer.pooling_param
     X = inputs[0]
     pool_type = {param.MAX : "max", param.AVE : "mean"}[param.pool]
     height_in,width_in = infer_shape(X)[2:4]
     kernel = (param.kernel_size, param.kernel_size) if param.HasField("kernel_size")\
         else (param.kernel_h, param.kernel_w)
     stride = (param.stride, param.stride) if param.HasField("stride")\
Code example #53
def init_weights(*shape):
    return cgt.shared(np.random.randn(*shape) * 0.01, fixed_shape_mask='all')
Code example #54
File: caffe2cgt.py (project: zxie/cgt)
         cgt.tensor(dtype='i8',
                    ndim=2,
                    name=layer.name,
                    fixed_shape=(batch_size, 1))
     ]
 elif layer.type == "Convolution":
     X = inputs[0]
     param = layer.convolution_param
     kh,kw = (param.kernel_size, param.kernel_size) if param.HasField("kernel_size")\
         else (param.kernel_h, param.kernel_w)
     nchanin = infer_shape(X)[0]
     Wshape = (param.num_output, nchanin, kh, kw)
     Wname = layer.param[0].name or layer.name + ":W"
     Wval = np.empty(Wshape, dtype=cgt.floatX)
     W = name2node[Wname] = cgt.shared(Wval,
                                       name=Wname,
                                       fixed_shape_mask="all")
     bshape = (1, param.num_output, 1, 1)
     bname = layer.param[1].name or layer.name + ":b"
     bval = np.empty(bshape, dtype=cgt.floatX)
     b = name2node[bname] = cgt.shared(bval,
                                       name=bname,
                                       fixed_shape_mask="all")
     sh,sw = (param.stride, param.stride) if param.HasField("stride")\
         else (param.stride_h, param.stride_w)
     output = [
         cgt.broadcast("+", nn.conv2d(X, W, subsample=(sh, sw)), b,
                       "xxxx,1x11")
     ]
 elif layer.type == "Pooling":
     param = layer.pooling_param
Code example #55
# split data
X_train, X_test, Y_train, Y_test = train_test_split(data, targets, test_size=0.2, random_state=0)

# hyperparams
#
# Be careful when setting alpha! If it's too large
# here the cost will blow up.
alpha = 1e-7
epochs = 100

# Linear regression model
np.random.seed(0)
X = cgt.matrix("X", fixed_shape=(None, nfeats))
Y = cgt.vector("Y")
w = cgt.shared(np.random.randn(nfeats) * 0.01)

# prediction
ypred = cgt.dot(X, w)

# cost
cost = cgt.square(Y - ypred).mean()

# derivative with respect to w
dw = cgt.grad(cost=cost, wrt=w)
updates = [(w, w - dw * alpha)]

# training function
trainf = cgt.function(inputs=[X, Y], outputs=[], updates=updates)
# cost function, no updates
costf = cgt.function(inputs=[X, Y], outputs=cost)
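The snippet stops right after compiling the two functions; a hypothetical continuation of the training loop, using only names defined above, would be:

# Hypothetical continuation (not in the original snippet).
for i in xrange(epochs):
    trainf(X_train, Y_train)
    print i, costf(X_test, Y_test)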
Code example #56
File: nn.py (project: EdsterG/cgt)
def parameter(val, name=None, device=None):
    fixed_shape_mask = "all"
    out = cgt.shared(val, name=name, device=device, fixed_shape_mask=fixed_shape_mask)
    out.props["is_parameter"] = True
    return out