def __init__(self, n_actions):
        Serializable.__init__(self, n_actions)
        cgt.set_precision('double')
        n_in = 128
        o_no = cgt.matrix("o_no",fixed_shape=(None,n_in))
        a_n = cgt.vector("a_n",dtype='i8')
        q_n = cgt.vector("q_n")
        oldpdist_np = cgt.matrix("oldpdists")

        h0 = (o_no - 128.0)/128.0 
        nhid = 64
        h1 = cgt.tanh(nn.Affine(128,nhid,weight_init=nn.IIDGaussian(std=.1))(h0))
        probs_na = nn.softmax(nn.Affine(nhid,n_actions,weight_init=nn.IIDGaussian(std=0.01))(h1))
        logprobs_na = cgt.log(probs_na)
        b = cgt.size(o_no, 0)
        logps_n = logprobs_na[cgt.arange(b), a_n]
        surr = (logps_n*q_n).mean()
        kl = (oldpdist_np * cgt.log(oldpdist_np/probs_na)).sum(axis=1).mean()

        params = nn.get_parameters(surr)
        gradsurr = cgt.grad(surr, params)
        flatgrad = cgt.concatenate([p.flatten() for p in gradsurr])

        lam = cgt.scalar()
        penobj = surr - lam * kl
        self._f_grad_lagrangian = cgt.function([lam, oldpdist_np, o_no, a_n, q_n], 
            cgt.concatenate([p.flatten() for p in cgt.grad(penobj,params)]))
        self.f_pdist = cgt.function([o_no], probs_na)

        self.f_probs = cgt.function([o_no], probs_na)
        self.f_surr_kl = cgt.function([oldpdist_np, o_no, a_n, q_n], [surr, kl])
        self.f_gradlogp = cgt.function([oldpdist_np, o_no, a_n, q_n], flatgrad)

        self.pc = ParamCollection(params)
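A minimal smoke-test of the compiled functions above, assuming `policy` is an instance of this class built with n_actions=4; the batch arrays below are hypothetical, with shapes following the symbolic declarations:

# Hedged sketch: drive the compiled functions with a fake batch (assumes policy = TheClassAbove(n_actions=4)).
import numpy as np
N, n_actions = 16, 4
o_no = np.random.randint(0, 256, size=(N, 128)).astype('float64')   # RAM-style byte observations
a_n = np.random.randint(n_actions, size=N).astype('i8')             # sampled actions
q_n = np.random.randn(N)                                            # empirical action values
oldpdist_np = policy.f_probs(o_no)                                  # action distribution of the previous policy
surr, kl = policy.f_surr_kl(oldpdist_np, o_no, a_n, q_n)            # surrogate objective and mean KL
flat_grad = policy.f_gradlogp(oldpdist_np, o_no, a_n, q_n)          # flat gradient of the surrogate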
Example #2
def test_devices():
    N = 10
    K = 3

    compile_info = cgt.compilation.get_compile_info()
    cuda_enabled = compile_info["CGT_ENABLE_CUDA"]
    if not cuda_enabled:
        raise SkipTest("cuda disabled")

    Xval = np.random.randn(N, K).astype(cgt.floatX)
    wval = np.random.randn(K).astype(cgt.floatX)
    bval = np.asarray(np.random.randn()).astype(cgt.floatX)
    yval = np.random.randn(N).astype(cgt.floatX)

    with cgt.scoped_update_config(default_device=cgt.Device(devtype="gpu")):

        X_nk = cgt.shared(Xval, "X", device=cgt.Device(devtype='gpu'))
        y_n = cgt.shared(yval, "y")
        w_k = cgt.shared(wval, "w")
        b = cgt.shared(bval, name="b")

        print "bval", bval

        ypred = cgt.dot(cgt.square(X_nk), w_k) + b

        err = cgt.sum(cgt.sin(ypred - y_n))
        g = cgt.grad(err, [w_k, b])
        outputs = [err] + g
        f = cgt.function([], [err] + g)
        results = f()
        print results
        assert np.allclose(
            results[0],
            np.sin(np.square(Xval).dot(wval) + bval - yval).sum())
Example #3
def momentum(cost, params, learning_rate, mu=0.9):
    """Stochastic Gradient Descent (SGD) updates with momentum
    Math:
    * ``velocity := mu * velocity - learning_rate * grad``
    * ``param := param + velocity``
    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables. We generate update
            expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    mu : float
        Tunes the weight given to the velocity term.
    
    Returns
    -------
    list of tuples of the form (param, new_param) and (velocity, new_velocity)
    """
    updates = []
    grads = cgt.grad(cost, params)
    for param, grad in zip(params, grads):
        assert isinstance(param.op, core.GetData)
        velocity = cgt.shared(np.zeros(param.op.get_shape(), dtype=param.dtype))
        new_velocity = mu * velocity - learning_rate * grad
        new_param = param + new_velocity
        updates.append((velocity, new_velocity))
        updates.append((param, new_param))

    return updates
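The (variable, new_value) pairs returned above plug straight into cgt.function through its updates argument, which is how the updater examples further down consume them. A minimal sketch, assuming X, y, and a scalar loss built from nn-created parameters:

# Hedged sketch: X, y and loss are assumed symbolic variables; the parameters were created
# through cgt.nn so that nn.get_parameters can find them.
params = nn.get_parameters(loss)
updates = momentum(loss, params, learning_rate=1e-2, mu=0.9)
train_step = cgt.function([X, y], loss, updates=updates)  # applies both the velocity and the parameter updates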
Example #4
File: nn.py Project: x724/cgt
def nesterov_momentum(cost, params, learning_rate, momentum=0.9):
    """Stochastic Gradient Descent (SGD) updates with Nesterov momentum

    Math:
    * ``velocity := momentum * velocity - learning_rate * grad``
    * ``param := momentum*velocity + param - learning_rate * grad``

    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables. We generate update
            expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    momentum: float
        Tunes the weight given to the velocity term.

    Returns
    -------
    list of tuples of the form [(velocity, new_velocity), (param, new_param)]
    """
    updates = []
    grads = cgt.grad(cost, params)

    for param, grad in zip(params, grads):
        value = param.op.get_value()
        velocity = cgt.shared(np.zeros(value.shape, dtype=value.dtype))
        x = momentum * velocity - learning_rate * grad
        updates.append((velocity, x))
        updates.append((param, momentum * x + param - learning_rate * grad))

    return updates
Example #5
def test_devices():
    N = 10
    K = 3

    compile_info = cgt.compilation.get_compile_info()
    cuda_enabled = compile_info["CGT_ENABLE_CUDA"]
    if not cuda_enabled:
        raise SkipTest("cuda disabled")

    Xval = np.random.randn(N,K).astype(cgt.floatX)
    wval = np.random.randn(K).astype(cgt.floatX)
    bval = np.asarray(np.random.randn()).astype(cgt.floatX)
    yval = np.random.randn(N).astype(cgt.floatX)

    with cgt.scoped_update_config(default_device=cgt.Device(devtype="gpu")):

        X_nk = cgt.shared(Xval, "X", device=cgt.Device(devtype='gpu'))
        y_n = cgt.shared(yval, "y")
        w_k = cgt.shared(wval, "w")
        b = cgt.shared(bval, name="b")

        print "bval",bval

        ypred = cgt.dot(cgt.square(X_nk), w_k) + b

        err = cgt.sum(cgt.sin(ypred - y_n))
        g = cgt.grad(err, [w_k, b])
        outputs = [err]+g
        f = cgt.function([], [err]+g)
        results = f()
        print results
        assert np.allclose(results[0] , np.sin(np.square(Xval).dot(wval)+bval-yval).sum())
Example #6
def test_cudnn():
    with cgt.scoped_update_config(precision="double",backend="native"):
        if not get_compile_info()["CGT_ENABLE_CUDNN"]:
            raise SkipTest("CUDNN not enabled. Skipping this test")

        Xval = nr.randn(2,3,19,18)
        Wval = nr.randn(5,3,3,3)
        bval = nr.randn(1,5,1,1)

        X = cgt.tensor4("X", fixed_shape=Xval.shape)
        W = cgt.tensor4("W", fixed_shape=Wval.shape)
        b = cgt.tensor4("b", fixed_shape=bval.shape)


        Y = cgt.core.Result(cudnn_ops.CudnnConvForward(1,1,1,1),[X, W, b])

        Y2 = nr.randn(*cgt.core.infer_shape(Y))

        fY = cgt.function([X,W,b],Y)
        Yval = fY(Xval,Wval,bval)
        cost = (Y*Y2).sum()
        fcost = cgt.function([X,W,b],cost)
        fgrad = cgt.function([X,W,b],cgt.grad(cost, [X,W,b]))
        angrads = fgrad(Xval,Wval,bval)
        nugrads = numeric_grad_multi(fcost, [Xval, Wval, bval],eps=1e-3)
        for (nugrad,angrad) in zip(nugrads,angrads):
            assert np.allclose(nugrad, angrad)
Example #7
File: nn.py Project: nebw/cgt
def nesterov_momentum(cost, params, learning_rate, momentum=0.9):
    """Stochastic Gradient Descent (SGD) updates with Nesterov momentum

    Math:
    * ``velocity := momentum * velocity - learning_rate * grad``
    * ``param := momentum*velocity + param - learning_rate * grad``

    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables. We generate update
            expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    momentum: float
        Tunes the weight given to the velocity term.

    Returns
    -------
    list of tuples of the form [(velocity, new_velocity), (param, new_param)]
    """
    updates = []
    grads = cgt.grad(cost, params)

    for param, grad in zip(params, grads):
        value = param.op.get_value()
        velocity = cgt.shared(np.zeros(value.shape, dtype=value.dtype))
        x = momentum * velocity - learning_rate * grad
        updates.append((velocity, x))
        updates.append((param, momentum*x + param - learning_rate * grad))

    return updates
Example #8
def test_flatvec():
    cgt.reset_config()
    cgt.set_precision('double')
    cgt.core.update_config(backend="python")  # XXX

    N = 10
    K = 3

    Xval = np.random.randn(N, K)
    wval = np.random.randn(K)
    bval = np.random.randn()
    yval = np.random.randn(N)

    X_nk = cgt.shared(Xval, "X")
    y_n = cgt.shared(yval, "y")
    w_k = cgt.shared(wval, "w")
    b = cgt.shared(bval, name="b")

    ypred = cgt.dot(X_nk, w_k) + b

    err = cgt.sum(cgt.square(ypred - y_n))
    g = cgt.grad(err, [w_k, b])
    g = core.simplify(g)

    pars = [w_k, b]
    flatx = nn.setup_contiguous_storage(pars)
    f = cgt.function([], [err, cgt.flatcat(g)])
Example #9
def test_cudnn():
    if not get_compile_info()["CGT_ENABLE_CUDNN"]:
        raise SkipTest("CUDNN not enabled. Skipping this test")

    Xval = nr.randn(2, 3, 19, 18)
    Wval = nr.randn(5, 3, 3, 3)
    bval = nr.randn(1, 5, 1, 1)

    X = cgt.tensor4("X", fixed_shape=Xval.shape)
    W = cgt.tensor4("W", fixed_shape=Wval.shape)
    b = cgt.tensor4("b", fixed_shape=bval.shape)

    Y = cgt.core.Result(cudnn_ops.CudnnConvForward(1, 1, 1, 1), [X, W, b])

    Y2 = nr.randn(*cgt.core.infer_shape(Y))

    fY = cgt.function([X, W, b], Y)
    Yval = fY(Xval, Wval, bval)
    cost = (Y * Y2).sum()
    fcost = cgt.function([X, W, b], cost)
    fgrad = cgt.function([X, W, b], cgt.grad(cost, [X, W, b]))
    angrads = fgrad(Xval, Wval, bval)
    nugrads = numeric_grad_multi(fcost, [Xval, Wval, bval], eps=1e-3)
    for (nugrad, angrad) in zip(nugrads, angrads):
        assert np.allclose(nugrad, angrad)
Example #10
def test_flatvec():
    cgt.reset_config()
    cgt.set_precision('double')
    cgt.core.update_config(backend="python") # XXX

    N = 10
    K = 3

    Xval = np.random.randn(N,K)
    wval = np.random.randn(K)
    bval = np.random.randn()
    yval = np.random.randn(N)

    X_nk = cgt.shared(Xval, "X")
    y_n = cgt.shared(yval, "y")
    w_k = cgt.shared(wval, "w")
    b = cgt.shared(bval, name="b")

    ypred = cgt.dot(X_nk, w_k) + b

    err = cgt.sum(cgt.square(ypred - y_n))
    g = cgt.grad(err, [w_k, b])
    g = core.simplify(g)

    pars = [w_k, b]
    flatx = nn.setup_contiguous_storage(pars)
    f = cgt.function([], [err,cgt.flatcat(g)])
Example #11
def momentum(cost, params, learning_rate, mu=0.9):
    """Stochastic Gradient Descent (SGD) updates with momentum
    Math:
    * ``velocity := mu * velocity - learning_rate * grad``
    * ``param := param + velocity``
    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables. We generate update
            expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    mu : float
        Tunes the weight given to the velocity term.
    
    Returns
    -------
    list of tuples of the form (param, new_param) and (velocity, new_velocity)
    """
    updates = []
    grads = cgt.grad(cost, params)
    for param, grad in zip(params, grads):
        assert isinstance(param.op, core.GetData)
        velocity = cgt.shared(np.zeros(param.op.get_shape(),
                                       dtype=param.dtype))
        new_velocity = mu * velocity - learning_rate * grad
        new_param = param + new_velocity
        updates.append((velocity, new_velocity))
        updates.append((param, new_param))

    return updates
Example #12
def gradcheck_model(cost,
                    params,
                    extravars=(),
                    extravals=(),
                    atol=1e-8,
                    eps=1e-9):
    precision = cgt.get_precision()
    if precision == "single":
        cgt.utils.warn(
            "You're doing a gradient check with %s precision. Use double or better yet quad for best results"
            % (precision))
    assert all(param.is_input() for param in params)
    assert len(extravars) == len(extravals)

    # Convert to Argument nodes
    param_args = [
        cgt.core.Argument(typ=s.typ, name=s.name) if s.is_data() else s
        for s in params
    ]

    # Get the new cost in terms of the Argument nodes
    cost = cgt.core.clone(cost, replace=dict(zip(params, param_args)))

    grads = cgt.grad(cost, param_args)
    paramvals = [param.op.get_value() for param in params]
    fcost = cgt.function(param_args, cost, givens=zip(extravars, extravals))
    fgrad = cgt.function(param_args, grads, givens=zip(extravars, extravals))

    angrads = fgrad(*paramvals)
    nugrads = numeric_grad_multi(fcost, paramvals, eps=eps)

    for (angrad, nugrad) in zip(angrads, nugrads):
        assert np.allclose(angrad, nugrad, atol=atol)
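A hedged example of calling the checker on a tiny cost built from a single nn.parameter; the cost expression is made up for illustration:

# Hedged sketch: gradient-check a small hand-built cost.
cgt.set_precision('double')
w = nn.parameter(np.random.randn(3), name="w")
cost = cgt.sum(cgt.sin(w) * cgt.square(w))
gradcheck_model(cost, [w], eps=1e-7, atol=1e-6)  # raises AssertionError if analytic and numeric grads disagree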
Example #13
def test_cudnn():
    compile_info = get_compile_info()
    if not (compile_info["CGT_ENABLE_CUDNN"] and compile_info["CGT_ENABLE_CUDA"]):
        raise SkipTest("CUDNN not enabled. Skipping this test")

    Xval = nr.randn(2,3,19,18)
    Wval = nr.randn(5,3,3,3)
    bval = nr.randn(1,5,1,1)

    X = cgt.tensor4("X", fixed_shape=Xval.shape)
    W = cgt.tensor4("W", fixed_shape=Wval.shape)
    b = cgt.tensor4("b", fixed_shape=bval.shape)


    Y = cgt.core.Result(cudnn_ops.CudnnConvForward(1,1,1,1),[X, W, b])

    Y2 = nr.randn(*cgt.core.infer_shape(Y))

    fY = cgt.function([X,W,b],Y)
    Yval = fY(Xval,Wval,bval)
    cost = (Y*Y2).sum()
    fcost = cgt.function([X,W,b],cost)
    fgrad = cgt.function([X,W,b],cgt.grad(cost, [X,W,b]))
    angrads = fgrad(Xval,Wval,bval)
    nugrads = numeric_grad_multi(fcost, [Xval, Wval, bval],eps=1e-3)
    for (nugrad,angrad) in zip(nugrads,angrads):
        assert np.allclose(nugrad, angrad, rtol=9e-3, atol=1e-7) 
Example #14
def make_updater_fc():
    X = cgt.matrix("X", fixed_shape=(None, 28 * 28))
    y = cgt.vector("y", dtype="i8")
    stepsize = cgt.scalar("stepsize")
    loss = build_fc_return_loss(X, y)
    params = nn.get_parameters(loss)
    gparams = cgt.grad(loss, params)
    updates = [(p, p - stepsize * gp) for (p, gp) in zip(params, gparams)]
    return cgt.function([X, y, stepsize], loss, updates=updates)
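The returned function performs one full SGD step; a hedged sketch of how it would be driven, where X_batch and y_batch are an assumed MNIST minibatch:

# Hedged sketch: one training step with an assumed minibatch of MNIST digits and labels.
f_update = make_updater_fc()
loss = f_update(X_batch.reshape(-1, 28 * 28).astype(cgt.floatX), y_batch.astype('i8'), 1e-2)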
Example #15
def check_scalar_grads(precision, backend):
    cgt.reset_config()
    np.random.seed(0)
    cgt.set_precision(precision)
    cgt.core.update_config(backend=backend)
    x = cgt.scalar('x')
    y = cgt.scalar('y')
    z = cgt.scalar('z')
    vars = [x,y,z] #pylint: disable=W0622
    vals = nr.rand(len(vars))+1

    PROB2RESULT = {}

    for ((key,_), cls) in it.chain(
            it.izip(core.UNARY_INFO.items(),it.repeat(core.ElwiseUnary)),
            it.izip(core.BINARY_INFO.items(),it.repeat(core.ElwiseBinary))
            ):
        if key == "conj":
            print "skipping conj"
            continue
        utils.colorprint(utils.Color.YELLOW, "Testing %s\n"%key)
        if cls == core.ElwiseUnary:
            n_in = 1
            op = cls(key)
        else:
            n_in = 2
            op = cls(key, (True,True))
        inputvars = vars[0:n_in]
        inputvals = vals[0:n_in]
        out = core.Result(op, inputvars)
        f = cgt.function(inputvars, out)
        try:
            grads = cgt.grad(out, inputvars)
        except core.NonDifferentiable:
            print "nondiff"
            continue
        if DISPLAY:
            print "Function:"
            cgt.print_tree(out)
            print "Gradient original:"
            cgt.print_tree(grads)
            print "Gradient simplified:"
        grads_simple = core.simplify(grads)
        if DISPLAY: cgt.print_tree(grads_simple)
        gradf = cgt.function(inputvars, grads)
        eps = {"single":1e-4,"double":1e-9}[precision]
        nugrad = numeric_grad(lambda li: f(*li), inputvals,eps=eps) #pylint: disable=W0640
        cgtgrad = gradf(*inputvals)
        np.testing.assert_almost_equal(nugrad,cgtgrad,decimal={"single":3,"double":6}[precision])

        grad_count = core.count_nodes(grads_simple)
        PROB2RESULT[key] = {}
        PROB2RESULT[key]["grad"] = grad_count

    if DISPLAY:
        from thirdparty.tabulate import tabulate
        print tabulate([[key,val["grad"]] for (key,val) in PROB2RESULT.iteritems()],headers=["funcname","gradcount"])    
Example #16
def make_updater_fc():
    X = cgt.matrix("X", fixed_shape=(None, 28 * 28))
    y = cgt.vector("y", dtype='i8')
    stepsize = cgt.scalar("stepsize")
    loss = build_fc_return_loss(X, y)
    params = nn.get_parameters(loss)
    gparams = cgt.grad(loss, params)
    updates = [(p, p - stepsize * gp) for (p, gp) in zip(params, gparams)]
    return cgt.function([X, y, stepsize], loss, updates=updates)
Example #17
def initialize(self, loss, scale):
    self._iter = 0
    self.pc = Params(self.params)
    cur_val = self.pc.get_value_flat()
    idx = cur_val.nonzero()
    new_val = np.random.uniform(-scale, scale, size=(self.pc.get_total_size(),))
    new_val[idx] = cur_val[idx]
    self.sync(new_val)
    grad = cgt.concatenate([g.flatten() for g in cgt.grad(loss, self.params)])
    return grad
Example #18
    def make_updater_convnet():
        X = cgt.tensor4("X", fixed_shape=(None, 1, 28, 28))  # so shapes can be inferred
        y = cgt.vector("y", dtype="i8")
        stepsize = cgt.scalar("stepsize")
        loss = build_convnet_return_loss(X, y)

        params = nn.get_parameters(loss)
        gparams = cgt.grad(loss, params)
        updates = [(p, p - stepsize * gp) for (p, gp) in zip(params, gparams)]
        return cgt.function([X, y, stepsize], loss, updates=updates)
Example #19
    def __init__(self, obs_dim, ctrl_dim):

        cgt.set_precision('double')
        Serializable.__init__(self, obs_dim, ctrl_dim)

        self.obs_dim = obs_dim
        self.ctrl_dim = ctrl_dim

        o_no = cgt.matrix("o_no",fixed_shape=(None,obs_dim))
        a_na = cgt.matrix("a_na",fixed_shape = (None, ctrl_dim))
        adv_n = cgt.vector("adv_n")
        oldpdist_np = cgt.matrix("oldpdist", fixed_shape=(None, 2*ctrl_dim))
        self.logstd = logstd_1a = nn.parameter(np.zeros((1, self.ctrl_dim)), name="std_1a")
        std_1a = cgt.exp(logstd_1a)

        # Here's where we apply the network
        h0 = o_no
        nhid = 32
        h1 = cgt.tanh(nn.Affine(obs_dim,nhid,weight_init=nn.IIDGaussian(std=0.1))(h0))
        h2 = cgt.tanh(nn.Affine(nhid,nhid,weight_init=nn.IIDGaussian(std=0.1))(h1))
        mean_na = nn.Affine(nhid,ctrl_dim,weight_init=nn.IIDGaussian(std=0.01))(h2)

        b = cgt.size(o_no, 0)
        std_na = cgt.repeat(std_1a, b, axis=0)

        oldmean_na = oldpdist_np[:, 0:self.ctrl_dim]
        oldstd_na = oldpdist_np[:, self.ctrl_dim:2*self.ctrl_dim]

        logp_n = ((-.5) * cgt.square( (a_na - mean_na) / std_na ).sum(axis=1)) - logstd_1a.sum()
        oldlogp_n = ((-.5) * cgt.square( (a_na - oldmean_na) / oldstd_na ).sum(axis=1)) - cgt.log(oldstd_na).sum(axis=1)

        ratio_n = cgt.exp(logp_n - oldlogp_n)

        surr = (ratio_n*adv_n).mean()

        pdists_np = cgt.concatenate([mean_na, std_na], axis=1)
        # kl = cgt.log(sigafter/)

        params = nn.get_parameters(surr)

        oldvar_na = cgt.square(oldstd_na)
        var_na = cgt.square(std_na)
        kl = (cgt.log(std_na / oldstd_na) + (oldvar_na + cgt.square(oldmean_na - mean_na)) / (2 * var_na) - .5).sum(axis=1).mean()


        lam = cgt.scalar()
        penobj = surr - lam * kl
        self._compute_surr_kl = cgt.function([oldpdist_np, o_no, a_na, adv_n], [surr, kl])
        self._compute_grad_lagrangian = cgt.function([lam, oldpdist_np, o_no, a_na, adv_n], 
            cgt.concatenate([p.flatten() for p in cgt.grad(penobj,params)]))
        self.f_pdist = cgt.function([o_no], pdists_np)

        self.f_objs = cgt.function([oldpdist_np, o_no, a_na, adv_n], [surr, kl])

        self.pc = ParamCollection(params)
Example #20
def rmsprop_updates(cost, params, stepsize=0.001, rho=0.9, epsilon=1e-6):
    grads = cgt.grad(cost, params)
    updates = []
    for p, g in zip(params, grads):
        acc = cgt.shared(p.op.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * cgt.square(g)
        gradient_scaling = cgt.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - stepsize * g))
    return updates
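The same arithmetic for a single scalar parameter in plain numpy, to make the effective step size concrete (values chosen for illustration):

# One RMSProp-style step in plain numpy, mirroring the expressions above.
import numpy as np
p, g, acc = 1.0, 4.0, 0.0
rho, stepsize, epsilon = 0.9, 0.001, 1e-6
acc_new = rho * acc + (1 - rho) * g ** 2       # 1.6
scaled_g = g / np.sqrt(acc_new + epsilon)      # ~3.162: the large raw gradient is normalized by its RMS estimate
p_new = p - stepsize * scaled_g                # ~0.99684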
Example #21
def test_scalars():
    np.random.seed(0)
    x = cgt.scalar('x')
    y = cgt.scalar('y')
    z = cgt.scalar('z')
    vars = [x,y,z] #pylint: disable=W0622
    vals = nr.rand(len(vars))+1

    PROB2RESULT = {}

    for ((key,_), cls) in it.chain(
            it.izip(core.UNARY_INFO.items(),it.repeat(core.ElwiseUnary)),
            it.izip(core.BINARY_INFO.items(),it.repeat(core.ElwiseBinary))
            ):
        if key == "conj":
            print "skipping conj"
            continue
        utils.colorprint(utils.Color.YELLOW, "Testing %s\n"%key)
        if cls == core.ElwiseUnary:
            n_in = 1
            op = cls(key)
        else:
            n_in = 2
            op = cls(key, (True,True))
        inputvars = vars[0:n_in]
        inputvals = vals[0:n_in]
        out = core.Result(op, inputvars)
        f = cgt.function(inputvars, out)
        try:
            grads = cgt.grad(out, inputvars)
        except core.NonDifferentiable:
            print "nondiff"
            continue
        if DISPLAY:
            print "Function:"
            cgt.print_tree(out)
            print "Gradient original:"
            cgt.print_tree(grads)
            print "Gradient simplified:"
        grads_simple = core.simplify(grads)
        if DISPLAY: cgt.print_tree(grads_simple)
        gradf = cgt.function(inputvars, grads)
        eps = {"single":1e-4,"double":1e-9}[cgt.get_precision()]
        nugrad = numeric_grad(lambda li: f(*li), inputvals,eps=eps) #pylint: disable=W0640
        cgtgrad = gradf(*inputvals)
        np.testing.assert_almost_equal(nugrad,cgtgrad,decimal={"single":3,"double":6}[cgt.get_precision()])

        grad_count = core.count_nodes(grads_simple)
        PROB2RESULT[key] = {}
        PROB2RESULT[key]["grad"] = grad_count

    if DISPLAY:
        from thirdparty.tabulate import tabulate
        print tabulate([[key,val["grad"]] for (key,val) in PROB2RESULT.iteritems()],headers=["funcname","gradcount"])    
Example #22
def rmsprop_updates(cost, params, stepsize=0.001, rho=0.9, epsilon=1e-6):
    grads = cgt.grad(cost, params)
    updates = []
    for p, g in zip(params, grads):
        acc = cgt.shared(p.op.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * cgt.square(g)
        gradient_scaling = cgt.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - stepsize * g))
    return updates
Example #23
    def make_updater_convnet():
        X = cgt.tensor4("X", fixed_shape=(None, 1, 28,
                                          28))  # so shapes can be inferred
        y = cgt.vector("y", dtype='i8')
        stepsize = cgt.scalar("stepsize")
        loss = build_convnet_return_loss(X, y)

        params = nn.get_parameters(loss)
        gparams = cgt.grad(loss, params)
        updates = [(p, p - stepsize * gp) for (p, gp) in zip(params, gparams)]
        return cgt.function([X, y, stepsize], loss, updates=updates)
Example #24
def CGT_dvLJ(x):
    N = len(x)
    xt = cgt.vector('xt')
    vLJt = 0
    for j in range(1,N):
        for i in range(j):
            rho = ((xt[i*D:i*D+D] - xt[j*D:j*D+D])**2).sum()
            vLJt += rho**(-6.0)-(rho**(-3.0))
    
    dvLJc = cgt.grad(4*vLJt, xt)    
    df = cgt.function([xt],dvLJc)
    return df(np.ravel(x))
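A hedged usage sketch; it assumes the module-level dimensionality constant D (e.g. D = 3), and note that the symbolic graph is rebuilt and recompiled on every call and grows with the number of particle pairs:

# Hedged sketch (assumes D = 3 at module level; keep N small, the graph grows as N**2).
x = np.random.randn(5, 3)   # 5 particles in 3-D
g = CGT_dvLJ(x)             # flat gradient of the total Lennard-Jones energy, shape (15,)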
Example #25
def adadelta(cost, params, learning_rate=1.0, rho=0.95, epsilon=1e-6):
    """ Adadelta updates
    The learning rate is scaled by the ratio of the accumulated (RMS) step sizes to the accumulated (RMS) gradients.

    Math:
    * ``accu_new = rho * accu + (1 - rho) * grad ** 2``
    * ``update = (grad * cgt.sqrt(delta_accu + epsilon) / cgt.sqrt(accu_new + epsilon))``
    * ``param = param - learning_rate * update``
    * ``delta_accu_new = rho * delta_accu + (1 - rho) * update ** 2``

    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables. We generate update
            expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    rho : float
        Controls decay of gradient moving average.
    epsilon : float
        Avoid division by 0 while scaling. Small constant.

    Returns
    -------
    list of tuples of the form
    (param, updates), (accumulated_grads, accumulated_grads_new), (step_accum, step_accum_new)

    References
    ----------
    .. [1] Zeiler, M. D. (2012):
           ADADELTA: An Adaptive Learning Rate Method.
           arXiv Preprint arXiv:1212.5701.
    """
    updates = []
    grads = cgt.grad(cost, params)

    for param, grad in zip(params, grads):
        assert isinstance(param.op, core.GetData)
        accu = cgt.shared(np.zeros(param.op.get_shape(), dtype=param.dtype))
        delta_accu = cgt.shared(
            np.zeros(param.op.get_shape(), dtype=param.dtype))

        accu_new = rho * accu + (1 - rho) * grad**2
        updates.append((accu, accu_new))

        update = (grad * cgt.sqrt(delta_accu + epsilon) /
                  cgt.sqrt(accu_new + epsilon))
        updates.append((param, param - learning_rate * update))

        delta_accu_new = rho * delta_accu + (1 - rho) * update**2
        updates.append((delta_accu, delta_accu_new))

    return updates
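One step of the same recurrences in plain numpy for a single scalar parameter, showing why the very first Adadelta step is tiny (no step sizes have been accumulated yet):

# One Adadelta step in plain numpy, mirroring the expressions above.
import numpy as np
p, g = 1.0, 4.0
accu, delta_accu = 0.0, 0.0
rho, lr, eps = 0.95, 1.0, 1e-6
accu_new = rho * accu + (1 - rho) * g ** 2                          # 0.8
update = g * np.sqrt(delta_accu + eps) / np.sqrt(accu_new + eps)    # ~0.00447: first step is tiny
p_new = p - lr * update
delta_accu_new = rho * delta_accu + (1 - rho) * update ** 2         # the step accumulator starts growing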
Example #26
def check_affine(f, *nu_inputs):
    types = ",".join(["{%s,%s}" % (x.dtype, x.ndim) for x in nu_inputs])
    cgt.utils.colorprint(cgt.utils.Color.YELLOW,
                         "Testing %s(%s)\n" % (f.__name__, types))
    sy_inputs = map(tensor_like, nu_inputs)
    for (i, sy) in enumerate(sy_inputs):
        sy.name = "x%i" % i

    sy_result = f(*sy_inputs)

    def maybeprint(msg):
        if DISPLAY: print msg

    maybeprint("Function:")
    if DISPLAY: cgt.print_tree([sy_result])

    f_cgt = cgt.function(sy_inputs, sy_result)
    sy_grads = cgt.grad(sy_result, sy_inputs)
    gradf_cgt = cgt.function(sy_inputs, sy_grads)

    sy_result_simple = core.simplify([sy_result])
    sy_grads_simple = core.simplify(sy_grads)

    maybeprint("Gradient:")
    if DISPLAY: cgt.print_tree(sy_grads)

    maybeprint("Gradient after simplification:")
    if DISPLAY: cgt.print_tree(sy_grads_simple)

    out_true = f(*nu_inputs)
    out_cgt = f_cgt(*nu_inputs)

    grads_true = gradients_affine(f_cgt,
                                  nu_inputs,
                                  h=1e-4 if "max" in f.__name__ else 1e-1)
    grads_cgt = gradf_cgt(*nu_inputs)

    rtol = {"single": 1e-3, "double": 1e-5}[cgt.get_precision()]
    np.testing.assert_allclose(out_cgt, out_true, rtol=rtol)

    for (g_cgt, g_true) in zip(grads_cgt, grads_true):
        np.testing.assert_allclose(g_cgt, g_true, rtol=rtol)

    result_count = cgt.count_nodes(sy_result_simple)
    grad_count = cgt.count_nodes(sy_grads_simple)
    maybeprint("Result before: %i. after: %i" %
               (cgt.count_nodes([sy_result]), result_count))
    maybeprint("Grad before: %i. after: %i" %
               (cgt.count_nodes(sy_grads), grad_count))

    PROB2RESULT[f.__name__] = {}
    PROB2RESULT[f.__name__]["fn"] = result_count
    PROB2RESULT[f.__name__]["grad"] = grad_count
Example #27
def check_affine(f, *nu_inputs):
    types = ",".join(["{%s,%s}" % (x.dtype, x.ndim) for x in nu_inputs])
    cgt.utils.colorprint(cgt.utils.Color.YELLOW, "Testing %s(%s)\n" % (f.__name__, types))
    sy_inputs = map(tensor_like, nu_inputs)
    for (i, sy) in enumerate(sy_inputs):
        sy.name = "x%i" % i

    sy_result = f(*sy_inputs)

    def maybeprint(msg):
        if DISPLAY:
            print msg

    maybeprint("Function:")
    if DISPLAY:
        cgt.print_tree([sy_result])

    f_cgt = cgt.function(sy_inputs, sy_result)
    sy_grads = cgt.grad(sy_result, sy_inputs)
    gradf_cgt = cgt.function(sy_inputs, sy_grads)

    sy_result_simple = core.simplify([sy_result])
    sy_grads_simple = core.simplify(sy_grads)

    maybeprint("Gradient:")
    if DISPLAY:
        cgt.print_tree(sy_grads)

    maybeprint("Gradient after simplification:")
    if DISPLAY:
        cgt.print_tree(sy_grads_simple)

    out_true = f(*nu_inputs)
    out_cgt = f_cgt(*nu_inputs)

    grads_true = gradients_affine(f_cgt, nu_inputs, h=1e-4 if "max" in f.__name__ else 1e-1)
    grads_cgt = gradf_cgt(*nu_inputs)

    rtol = {"single": 1e-3, "double": 1e-5}[cgt.get_precision()]
    np.testing.assert_allclose(out_cgt, out_true, rtol=rtol)

    for (g_cgt, g_true) in zip(grads_cgt, grads_true):
        np.testing.assert_allclose(g_cgt, g_true, rtol=rtol)

    result_count = cgt.count_nodes(sy_result_simple)
    grad_count = cgt.count_nodes(sy_grads_simple)
    maybeprint("Result before: %i. after: %i" % (cgt.count_nodes([sy_result]), result_count))
    maybeprint("Grad before: %i. after: %i" % (cgt.count_nodes(sy_grads), grad_count))

    PROB2RESULT[f.__name__] = {}
    PROB2RESULT[f.__name__]["fn"] = result_count
    PROB2RESULT[f.__name__]["grad"] = grad_count
Example #28
    def __init__(self, n_actions):
        Serializable.__init__(self, n_actions)
        cgt.set_precision('double')
        n_in = 128
        o_no = cgt.matrix("o_no", fixed_shape=(None, n_in))
        a_n = cgt.vector("a_n", dtype='i8')
        q_n = cgt.vector("q_n")
        oldpdist_np = cgt.matrix("oldpdists")

        h0 = (o_no - 128.0) / 128.0
        nhid = 64
        h1 = cgt.tanh(
            nn.Affine(128, nhid, weight_init=nn.IIDGaussian(std=.1))(h0))
        probs_na = nn.softmax(
            nn.Affine(nhid, n_actions,
                      weight_init=nn.IIDGaussian(std=0.01))(h1))
        logprobs_na = cgt.log(probs_na)
        b = cgt.size(o_no, 0)
        logps_n = logprobs_na[cgt.arange(b), a_n]
        surr = (logps_n * q_n).mean()
        kl = (oldpdist_np * cgt.log(oldpdist_np / probs_na)).sum(axis=1).mean()

        params = nn.get_parameters(surr)
        gradsurr = cgt.grad(surr, params)
        flatgrad = cgt.concatenate([p.flatten() for p in gradsurr])

        lam = cgt.scalar()
        penobj = surr - lam * kl
        self._f_grad_lagrangian = cgt.function(
            [lam, oldpdist_np, o_no, a_n, q_n],
            cgt.concatenate([p.flatten() for p in cgt.grad(penobj, params)]))
        self.f_pdist = cgt.function([o_no], probs_na)

        self.f_probs = cgt.function([o_no], probs_na)
        self.f_surr_kl = cgt.function([oldpdist_np, o_no, a_n, q_n],
                                      [surr, kl])
        self.f_gradlogp = cgt.function([oldpdist_np, o_no, a_n, q_n], flatgrad)

        self.pc = ParamCollection(params)
Example #29
def make_loss_and_grad(net):
    X_b = inps[0] #cgt.matrix(dtype=cgt.floatX)
    y_onehot = cgt.matrix(dtype='i4')
    outputs = [logprobs]

    loss = nn.crossent(outputs[0], y_onehot) / b_size
    #gradloss = cgt.grad(loss, params)
    gradloss = cgt.grad(loss, param_list)

    # XXX use flatcat function
    grad = cgt.concatenate([x.flatten() for x in gradloss])
    #grad = gradloss
    return cgt.make_function([X_b, y_onehot], [loss, grad, logprobs])
Example #30
def adadelta(cost, params, learning_rate=1.0, rho=0.95, epsilon=1e-6):
    """ Adadelta updates
    The learning rate is scaled by the ratio of the accumulated (RMS) step sizes to the accumulated (RMS) gradients.

    Math:
    * ``accu_new = rho * accu + (1 - rho) * grad ** 2``
    * ``update = (grad * cgt.sqrt(delta_accu + epsilon) / cgt.sqrt(accu_new + epsilon))``
    * ``param = param - learning_rate * update``
    * ``delta_accu_new = rho * delta_accu + (1 - rho) * update ** 2``

    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables. We generate update
            expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    rho : float
        Controls decay of gradient moving average.
    epsilon : float
        Avoid division by 0 while scaling. Small constant.

    Returns
    -------
    list of tuples of the form
    (param, updates), (accumulated_grads, accumulated_grads_new), (step_accum, step_accum_new)

    References
    ----------
    .. [1] Zeiler, M. D. (2012):
           ADADELTA: An Adaptive Learning Rate Method.
           arXiv Preprint arXiv:1212.5701.
    """
    updates = []
    grads = cgt.grad(cost, params)

    for param, grad in zip(params, grads):
        assert isinstance(param.op, core.GetData)
        accu = cgt.shared(np.zeros(param.op.get_shape(), dtype=param.dtype))
        delta_accu = cgt.shared(np.zeros(param.op.get_shape(), dtype=param.dtype))

        accu_new = rho * accu + (1 - rho) * grad ** 2
        updates.append((accu, accu_new))

        update = (grad * cgt.sqrt(delta_accu + epsilon) / cgt.sqrt(accu_new + epsilon))
        updates.append((param, param - learning_rate * update))

        delta_accu_new = rho * delta_accu + (1 - rho) * update ** 2
        updates.append((delta_accu, delta_accu_new))

    return updates
Example #31
    def __init__(self, xdim, args, dec="bernoulli"):
        self.xdim = xdim
        self.hdim = args.hdim
        self.zdim = args.zdim
        self.lmbda = args.lmbda  # weight decay coefficient * 2
        self.x = cgt.matrix("x", dtype=cgt.floatX)
        self.eps = cgt.matrix("eps", dtype=cgt.floatX)

        self.enc_mlp = GaussianMLP(self.x, self.xdim, self.hdim, self.zdim, nlayers=args.nlayers, eps=self.eps)
        if dec == "bernoulli":
            # log p(x | z) defined as -CE(x, y) = dec_mlp.cost(y)
            self.dec_mlp = BernoulliMLP(self.enc_mlp.out, self.zdim, self.hdim, self.xdim, nlayers=args.nlayers, y=self.x)
        elif dec == "gaussian":
            self.dec_mlp = GaussianMLP(self.enc_mlp.out, self.zdim, self.hdim, self.xdim, nlayers=args.nlayers, y=self.x)
        else:
            raise RuntimeError("unrecognized decoder %s" % dec)

        self.cost = (-cgt.sum(kld_unit_mvn(self.enc_mlp.mu, self.enc_mlp.var)) + self.dec_mlp.cost) / args.batch_size
        self.params = self.enc_mlp.params + self.dec_mlp.params
        # L2 regularization
        self.gparams = [cgt.grad(self.cost, [p])[0] + self.lmbda * p for p in self.params]
        self.gaccums = [cgt.shared(np.zeros(p.op.get_value().shape, dtype=cgt.floatX)) for p in self.params]

        # XXX replace w/ adagrad update from nn
        ADAGRAD_EPS = 1e-10  # for stability
        self.updates = [
            (param, param - args.lr * gparam / cgt.sqrt(gaccum + cgt.square(gparam) + ADAGRAD_EPS))
            for param, gparam, gaccum in zip(self.params, self.gparams, self.gaccums)
        ]
        self.updates += [
            (gaccum, gaccum + cgt.square(gparam))
            for gaccum, gparam in zip(self.gaccums, self.gparams)
        ]

        self.train = cgt.function(
            [self.x, self.eps],
            self.cost,
            updates=self.updates
        )
        self.test = cgt.function(
            [self.x, self.eps],
            self.cost,
            updates=None
        )
        # can be used for semi-supervised learning for example
        self.encode = cgt.function(
            [self.x, self.eps],
            self.enc_mlp.out
        )
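A hedged sketch of one training call on the class above; `model`, `x_batch`, and the `args` namespace are assumptions, and the reparameterization noise is drawn with one row per example and one column per latent dimension:

# Hedged sketch (assumes model is an instance of the class above and x_batch has shape (args.batch_size, xdim)).
eps = np.random.randn(args.batch_size, args.zdim).astype(cgt.floatX)  # reparameterization noise
cost = model.train(x_batch.astype(cgt.floatX), eps)                   # one ADAGRAD step on all parameters
z = model.encode(x_batch.astype(cgt.floatX), eps)                     # latent codes for downstream use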
Example #32
def test_im2col():
    for settings in [ ((4,4),(0,0),(1,1)), ((3,3),(1,1),(2,2)), ((3,3),(1,1),(3,3)) ]:
        xval = np.arange(2*1*28*28).reshape(2,1,28,28).astype(cgt.floatX)
        x = cgt.tensor4("x", fixed_shape=xval.shape)
        y = im2col(x, *settings)
        h = cgt.constant(np.random.randn(*cgt.infer_shape(y)))
        cost = (y*h).sum()

        fcost = cgt.function([x],cost)
        fgrad = cgt.function([x], cgt.grad(cost, [x])[0])

        from cgt.numeric_diff import numeric_grad
        gnum = numeric_grad(fcost, xval,eps=1e-5)
        gana = fgrad(xval)
        assert np.allclose(gnum, gana)
Example #33
    def make_updater_fc_parallel():
        X = cgt.matrix("X", fixed_shape=(None,28*28))
        y = cgt.vector("y",dtype='i8')
        stepsize = cgt.scalar("stepsize")

        loss = build_fc_return_loss(X,y)
        params = nn.get_parameters(loss)        
        m = nn.Module([X,y], [loss])
        split_loss = 0
        for start in xrange(0, batch_size, batch_size//4):
            sli = slice(start, start+batch_size//4)
            split_loss += m([X[sli], y[sli]])[0]
        split_loss /= 4
        gparams = cgt.grad(split_loss, params)
        updates2 = [(p, p-stepsize*gp) for (p, gp) in zip(params, gparams)]
        return cgt.function([X,y, stepsize], split_loss, updates=updates2)
Example #34
    def make_updater_fc_parallel():
        X = cgt.matrix("X", fixed_shape=(None, 28 * 28))
        y = cgt.vector("y", dtype='i8')
        stepsize = cgt.scalar("stepsize")

        loss = build_fc_return_loss(X, y)
        params = nn.get_parameters(loss)
        m = nn.Module([X, y], [loss])
        split_loss = 0
        for start in xrange(0, batch_size, batch_size // 4):
            sli = slice(start, start + batch_size // 4)
            split_loss += m([X[sli], y[sli]])[0]
        split_loss /= 4
        gparams = cgt.grad(split_loss, params)
        updates2 = [(p, p - stepsize * gp) for (p, gp) in zip(params, gparams)]
        return cgt.function([X, y, stepsize], split_loss, updates=updates2)
Example #35
    def make_updater_convnet_parallel():
        X = cgt.tensor4("X", fixed_shape=(None, 1, 28, 28))  # so shapes can be inferred
        y = cgt.vector("y", dtype="i8")
        stepsize = cgt.scalar("stepsize")
        loss = build_convnet_return_loss(X, y)

        m = nn.Module([X, y], [loss])
        split_loss = 0
        for start in xrange(0, batch_size, batch_size // 4):
            sli = slice(start, start + batch_size // 4)
            split_loss += m([X[sli], y[sli]])[0]
        split_loss /= 4
        params = nn.get_parameters(loss)
        gparams = cgt.grad(split_loss, params)
        updates2 = [(p, p - stepsize * gp) for (p, gp) in zip(params, gparams)]
        return cgt.function([X, y, stepsize], split_loss, updates=updates2)
Example #36
def make_loss_and_grad_and_step(arch, size_input, size_output, size_mem,
                                size_batch, n_layers, n_unroll, k_in, k_h):
    # symbolic variables

    x_tnk = cgt.tensor3()
    targ_tnk = cgt.tensor3()
    #make_network = make_deep_lstm if arch=="lstm" else make_deep_gru
    make_network = make_deep_rrnn_rot_relu
    network = make_network(size_input, size_mem, n_layers, size_output,
                           size_batch, k_in, k_h)
    init_hiddens = [
        cgt.matrix() for _ in xrange(get_num_hiddens(arch, n_layers))
    ]
    # TODO fixed sizes

    cur_hiddens = init_hiddens
    loss = 0
    for t in xrange(n_unroll):
        outputs = network([x_tnk[t]] + cur_hiddens)
        cur_hiddens, prediction_logprobs = outputs[:-1], outputs[-1]
        # loss = loss + nn.categorical_negloglik(prediction_probs, targ_tnk[t]).sum()
        loss = loss - (prediction_logprobs * targ_tnk[t]).sum()
        cur_hiddens = outputs[:-1]

    final_hiddens = cur_hiddens

    loss = loss / (n_unroll * size_batch)

    params = network.get_parameters()
    gradloss = cgt.grad(loss, params)

    flatgrad = flatcat(gradloss)

    with utils.Message("compiling loss+grad"):
        f_loss_and_grad = cgt.function([x_tnk, targ_tnk] + init_hiddens,
                                       [loss, flatgrad] + final_hiddens)
    f_loss = cgt.function([x_tnk, targ_tnk] + init_hiddens, loss)

    assert len(init_hiddens) == len(final_hiddens)

    x_nk = cgt.matrix('x')
    outputs = network([x_nk] + init_hiddens)

    f_step = cgt.function([x_nk] + init_hiddens, outputs)

    # print "node count", cgt.count_nodes(flatgrad)
    return network, f_loss, f_loss_and_grad, f_step
Example #37
def test_im2col():
    for settings in [((4, 4), (0, 0), (1, 1)), ((3, 3), (1, 1), (2, 2)),
                     ((3, 3), (1, 1), (3, 3))]:
        xval = np.arange(2 * 1 * 28 * 28).reshape(2, 1, 28,
                                                  28).astype(cgt.floatX)
        x = cgt.tensor4("x", fixed_shape=xval.shape)
        y = im2col(x, *settings)
        h = cgt.constant(np.random.randn(*cgt.infer_shape(y)))
        cost = (y * h).sum()

        fcost = cgt.function([x], cost)
        fgrad = cgt.function([x], cgt.grad(cost, [x])[0])

        from cgt.numeric_diff import numeric_grad
        gnum = numeric_grad(fcost, xval, eps=1e-5)
        gana = fgrad(xval)
        assert np.allclose(gnum, gana)
Example #38
def test_pool(**kwargs):
    np.random.seed(0)
    x = cgt.tensor4("x", fixed_shape=(2,3,5,7))
    y = max_pool_2d(x, (4,4),(0,0),(1,1))
    xval = np.random.randn(2,3,5,7)
    hval = np.random.randn(*cgt.infer_shape(y))
    h = cgt.constant(hval)

    cost = (y*h).sum()

    fcost = cgt.function([x], cost)
    fgrad = cgt.function([x], cgt.grad(cost, [x])[0])

    from cgt.numeric_diff import numeric_grad
    gnum = numeric_grad(fcost, xval)
    gana = fgrad(xval)
    assert np.allclose(gnum,gana)
Example #39
def test_cpu_pool(**kwargs):
    np.random.seed(0)
    x = cgt.tensor4("x", fixed_shape=(2, 3, 5, 7))
    y = max_pool_2d(x, (4, 4), (0, 0), (1, 1))
    xval = np.random.randn(2, 3, 5, 7)
    hval = np.random.randn(*cgt.infer_shape(y))
    h = cgt.constant(hval)

    cost = (y * h).sum()

    fcost = cgt.function([x], cost)
    fgrad = cgt.function([x], cgt.grad(cost, [x])[0])

    from cgt.numeric_diff import numeric_grad
    gnum = numeric_grad(fcost, xval)
    gana = fgrad(xval)
    assert np.allclose(gnum, gana)
Example #40
def adagrad_updates(cost, params, stepsize=0.001, rho=0.9, epsilon=1e-6):
    grads = cgt.grad(cost, params)
    updates = []
    for param, grad in zip(params, grads):
        value = param.op.get_value()
        accu = cgt.shared(np.zeros(value.shape, dtype=value.dtype))
        delta_accu = cgt.shared(np.zeros(value.shape, dtype=value.dtype))

        accu_new = rho * accu + (1 - rho) * grad**2
        updates.append((accu, accu_new))

        update = (grad * cgt.sqrt(delta_accu + epsilon) /
                  cgt.sqrt(accu_new + epsilon))
        updates.append((param, param - stepsize * update))

        delta_accu_new = rho * delta_accu + (1 - rho) * update**2
        updates.append((delta_accu, delta_accu_new))
    return updates
Example #41
File: nn.py Project: x724/cgt
def rmsprop(cost, params, learning_rate=1.0, rho=0.9, epsilon=1e-6):
    """RMSProp updates
    Divides the learning rate by the RMS of an exponential moving average of squared gradients. See [1]

    Math:
    * ``accu_new = rho * accu + (1 - rho) * grad ** 2``
    * ``param = param - (learning_rate * grad / cgt.sqrt(accu_new + epsilon))``

    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables. We generate update
            expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    rho : float
        Controls decay of gradient moving average.
    epsilon : float
        Avoid division by 0 while scaling. Small constant.

    Returns
    -------
    list of tuples of the form [(param, updates), (accumulated_RMS_grads, accumulated_RMS_grads_new)]

    References
    ----------
    .. [1] Yann N. Dauphin, Harm de Vries, Junyoung Chung, Yoshua Bengio (2015):
           RMSProp and equilibrated adaptive learning rates for non-convex optimization
           arXiv:1502.04390 http://arxiv.org/abs/1502.04390
    """

    updates = []
    grads = cgt.grad(cost, params)

    for param, grad in zip(params, grads):
        value = param.op.get_value()
        accu = cgt.shared(np.zeros(value.shape, dtype=value.dtype))
        accu_new = rho * accu + (1 - rho) * grad**2
        updates.append((accu, accu_new))
        updates.append(
            (param,
             param - (learning_rate * grad / cgt.sqrt(accu_new + epsilon))))

    return updates
Example #42
def make_funcs(config, dbg_out=None):
    params, Xs, Ys, C_0, H_0, C_T, H_T, C_1, H_1 = lstm_network(
        config['rnn_steps'], config['num_inputs'], config['num_outputs'],
        config['num_units'], config['num_mems'])

    # basic
    size_batch = Xs[0].shape[0]
    dY = Ys[0].shape[-1]
    Ys_gt = [
        cgt.matrix(fixed_shape=(size_batch, dY), name='Y%d' % t)
        for t in range(len(Ys))
    ]
    Ys_var = [cgt.tensor3(fixed_shape=(size_batch, dY, dY)) for _ in Ys]
    net_inputs, net_outputs = Xs + C_0 + H_0 + Ys_var, Ys + C_T + H_T

    # calculate loss
    loss_vec = []
    for i in range(len(Ys)):
        #     if i == 0: continue
        _l = dist.gaussian.logprob(Ys_gt[i], Ys[i], Ys_var[i])
        loss_vec.append(_l)
    loss_vec = cgt.add_multi(loss_vec)
    if config['weight_decay'] > 0.:
        params_flat = cgt.concatenate([p.flatten() for p in params])
        loss_param = config['weight_decay'] * cgt.sum(params_flat**2)
        loss_vec -= loss_param  # / size_batch
    loss = cgt.sum(loss_vec) / config['rnn_steps'] / size_batch
    grad = cgt.grad(loss, params)

    # functions
    def f_init(size_batch):
        c_0, h_0 = [], []
        for _n_m in config['num_mems']:
            if _n_m > 0:
                c_0.append(np.zeros((size_batch, _n_m)))
                h_0.append(np.zeros((size_batch, _n_m)))
        return c_0, h_0

    f_step = cgt.function([Xs[0]] + C_0 + H_0, [Ys[0]] + C_1 + H_1)
    f_loss = cgt.function(net_inputs + Ys_gt, loss)
    f_grad = cgt.function(net_inputs + Ys_gt, grad)
    f_surr = cgt.function(net_inputs + Ys_gt, [loss] + net_outputs + grad)
    return params, f_step, f_loss, f_grad, f_init, f_surr
Example #43
def test_linreg():
    cgt.reset_config()
    cgt.set_precision('double')
    N = 10
    K = 3

    Xval = np.random.randn(N, K)
    wval = np.random.randn(K)
    bval = np.random.randn()
    yval = np.random.randn(N)

    X_nk = cgt.matrix("X")
    y_n = cgt.vector("y")
    w_k = cgt.vector("w")
    b = cgt.scalar(name="b")

    ypred = cgt.dot(X_nk, w_k) + b

    err = cgt.sum(cgt.square(ypred - y_n))
    g = cgt.grad(err, [w_k, b])

    g_simple, an, _ = cgt.core.simplify_and_analyze(g)

    print "Loss function:"
    cgt.print_tree([err])
    print "Gradient:"
    cgt.print_tree(g)

    print "Gradient simplified"
    cgt.print_tree(
        g_simple,
        nodefn=lambda node, o: o.write(" " + an["node2hash"][node][:5]))

    print "-------"

    d = {X_nk: Xval, w_k: wval, b: bval, y_n: yval}

    np.testing.assert_allclose(cgt.numeric_eval(err, d),
                               np.linalg.norm(Xval.dot(wval) + bval - yval)**2)
    np.testing.assert_allclose(cgt.numeric_eval(g[0], d),
                               2 * Xval.T.dot(Xval.dot(wval) + bval - yval))
    np.testing.assert_allclose(cgt.numeric_eval(g[1], d),
                               2 * np.sum(Xval.dot(wval) + bval - yval, 0))
Example #44
def __init__(self, num_features=None, num_hidden=100):
    stepsize = 0.01
    # X: input features, with shape (batchsize, ncols)
    X = cgt.matrix("X", fixed_shape=(1, num_features))
    # y: a symbolic variable representing the scalar reward target
    y = cgt.scalar("y", dtype='float64')

    hid1 = nn.rectify(
        nn.Affine(num_features, num_hidden, weight_init=nn.IIDGaussian(std=.1), bias_init=nn.Constant(1))(X)
    )
    # One final fully-connected layer, then a linear output for the reward
    output = nn.Affine(num_hidden, 1, weight_init=nn.IIDGaussian(std=.1), bias_init=nn.Constant(1))(hid1)
    abs_deviation = cgt.abs(output - y).mean()
    params = nn.get_parameters(abs_deviation)
    gparams = cgt.grad(abs_deviation, params)

    updates = [(p, p - stepsize * gp) for (p, gp) in zip(params, gparams)]
    self.predictor = cgt.function([X], output)
    self.updater = cgt.function([X, y], abs_deviation, updates=updates)
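Because X is declared with fixed shape (1, num_features), the compiled functions operate on one example at a time; a hedged sketch, assuming `model` was constructed with num_features=8:

# Hedged sketch: single-example prediction and one SGD update.
feats = np.random.randn(1, 8).astype(cgt.floatX)
pred = model.predictor(feats)    # predicted reward, shape (1, 1)
dev = model.updater(feats, 1.5)  # mean absolute deviation, with one gradient step applied as a side effect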
Example #45
def CGT_vLJ_Optimize(x):
    N = len(x)
    #cgt.set_precision('double')
    xt = cgt.vector('xt')
    vLJt = 0
    for j in range(1,N):
        for i in range(j):
            rho = ((xt[i*D:i*D+D] - xt[j*D:j*D+D])**2).sum()
            vLJt += rho**(-6.0)-(rho**(-3.0))
    
    f = cgt.function([xt],4*vLJt)
    dvLJc = cgt.grad(4*vLJt, xt)    
    df = cgt.function([xt],dvLJc)
    
    CGT_BFGSres = optimize.minimize(f, np.ravel(x), \
                                  method='L-BFGS-B',        \
                                  jac = df,     \
                                  options={'disp': False})
    return np.reshape(CGT_BFGSres.x, (N,D))
Example #46
def make_loss_and_grad_and_step(arch, size_input, size_output, size_mem, size_batch, n_layers, n_unroll, k_in, k_h):
    # symbolic variables

    x_tnk = cgt.tensor3()
    targ_tnk = cgt.tensor3()
    #make_network = make_deep_lstm if arch=="lstm" else make_deep_gru
    make_network = make_deep_rrnn_rot_relu
    network = make_network(size_input, size_mem, n_layers, size_output, size_batch, k_in, k_h)
    init_hiddens = [cgt.matrix() for _ in xrange(get_num_hiddens(arch, n_layers))]
    # TODO fixed sizes

    cur_hiddens = init_hiddens
    loss = 0
    for t in xrange(n_unroll):
        outputs = network([x_tnk[t]] + cur_hiddens)
        cur_hiddens, prediction_logprobs = outputs[:-1], outputs[-1]
        # loss = loss + nn.categorical_negloglik(prediction_probs, targ_tnk[t]).sum()
        loss = loss - (prediction_logprobs*targ_tnk[t]).sum()
        cur_hiddens = outputs[:-1]

    final_hiddens = cur_hiddens

    loss = loss / (n_unroll * size_batch)

    params = network.get_parameters()
    gradloss = cgt.grad(loss, params)

    flatgrad = flatcat(gradloss)

    with utils.Message("compiling loss+grad"):
        f_loss_and_grad = cgt.function([x_tnk, targ_tnk] + init_hiddens, [loss, flatgrad] + final_hiddens)
    f_loss = cgt.function([x_tnk, targ_tnk] + init_hiddens, loss)

    assert len(init_hiddens) == len(final_hiddens)

    x_nk = cgt.matrix('x')
    outputs = network([x_nk] + init_hiddens)

    f_step = cgt.function([x_nk]+init_hiddens, outputs)

    # print "node count", cgt.count_nodes(flatgrad)
    return network, f_loss, f_loss_and_grad, f_step
Example #47
def rmsprop(cost, params, learning_rate=1.0, rho=0.9, epsilon=1e-6):
    """RMSProp updates
    Divides the learning rate by the RMS of an exponential moving average of squared gradients. See [1]

    Math:
    * ``accu_new = rho * accu + (1 - rho) * grad ** 2``
    * ``param = param - (learning_rate * grad / cgt.sqrt(accu_new + epsilon))``

    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables. We generate update
            expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    rho : float
        Controls decay of gradient moving average.
    epsilon : float
        Avoid division by 0 while scaling. Small constant.

    Returns
    -------
    list of tuples of the form (param, updates), (accumulated_RMS_grads, accumulated_RMS_grads_new)

    References
    ----------
    .. [1] Yann N. Dauphin, Harm de Vries, Junyoung Chung, Yoshua Bengio (2015):
           RMSProp and equilibrated adaptive learning rates for non-convex optimization
           arXiv:1502.04390 http://arxiv.org/abs/1502.04390
    """

    updates = []
    grads = cgt.grad(cost, params)

    for param, grad in zip(params, grads):
        assert isinstance(param.op, core.GetData)
        accu = cgt.shared(np.zeros(param.op.get_shape(), dtype=param.dtype))
        accu_new = rho * accu + (1 - rho) * grad ** 2
        updates.append((accu, accu_new))
        updates.append((param, param - (learning_rate * grad / cgt.sqrt(accu_new + epsilon))))

    return updates
Example #48
def make_funcs(config, dbg_out=None):
    params, Xs, Ys, C_0, H_0, C_T, H_T, C_1, H_1 = lstm_network(
        config['rnn_steps'], config['num_inputs'], config['num_outputs'],
        config['num_units'], config['num_mems']
    )

    # basic
    size_batch = Xs[0].shape[0]
    dY = Ys[0].shape[-1]
    Ys_gt = [cgt.matrix(fixed_shape=(size_batch, dY), name='Y%d'%t)
             for t in range(len(Ys))]
    Ys_var = [cgt.tensor3(fixed_shape=(size_batch, dY, dY)) for _ in Ys]
    net_inputs, net_outputs = Xs + C_0 + H_0 + Ys_var, Ys + C_T + H_T

    # calculate loss
    loss_vec = []
    for i in range(len(Ys)):
        #     if i == 0: continue
        _l = dist.gaussian.logprob(Ys_gt[i], Ys[i], Ys_var[i])
        loss_vec.append(_l)
    loss_vec = cgt.add_multi(loss_vec)
    if config['weight_decay'] > 0.:
        params_flat = cgt.concatenate([p.flatten() for p in params])
        loss_param = config['weight_decay'] * cgt.sum(params_flat ** 2)
        loss_vec -= loss_param  # / size_batch
    loss = cgt.sum(loss_vec) / config['rnn_steps'] / size_batch
    grad = cgt.grad(loss, params)

    # functions
    def f_init(size_batch):
        c_0, h_0 = [], []
        for _n_m in config['num_mems']:
            if _n_m > 0:
                c_0.append(np.zeros((size_batch, _n_m)))
                h_0.append(np.zeros((size_batch, _n_m)))
        return c_0, h_0
    f_step = cgt.function([Xs[0]] + C_0 + H_0, [Ys[0]] + C_1 + H_1)
    f_loss = cgt.function(net_inputs + Ys_gt, loss)
    f_grad = cgt.function(net_inputs + Ys_gt, grad)
    f_surr = cgt.function(net_inputs + Ys_gt, [loss] + net_outputs + grad)
    return params, f_step, f_loss, f_grad, f_init, f_surr
Example #49
File: nn.py Project: x724/cgt
def adagrad(cost, params, learning_rate=1.0, epsilon=1e-6):
    """Adagrad updates
    The learning rate is scaled by dividing it by the square root of the sum of accumulated squared gradients.

    Math:
    * ``accu_new = accu + grad ** 2``
    * ``param = param - (learning_rate * grad) / cgt.sqrt(accu_new + epsilon)``

    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables. We generate update
            expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    epsilon : float
        Avoids division close to zero. Small constant.

    Returns
    -------
    list of tuples of the form [(param, updates), (accumulated_grads, accumulated_grads_new)]

    References
    ----------
    .. [1] Duchi, J., Hazan, E., & Singer, Y. (2011):
           Adaptive subgradient methods for online learning and stochastic
           optimization. JMLR, 12:2121-2159.
    """

    updates = []
    grads = cgt.grad(cost, params)

    for param, grad in zip(params, grads):
        value = param.op.get_value()
        accu = cgt.shared(np.zeros(value.shape, dtype=value.dtype))
        accu_new = accu + grad**2
        updates.append((accu, accu_new))
        updates.append(
            (param,
             param - (learning_rate * grad) / cgt.sqrt(accu_new + epsilon)))

    return updates
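Two successive steps in plain numpy highlight the property the docstring describes: the accumulator only grows, so the effective step size shrinks monotonically (unlike the decayed accumulators in rmsprop and adadelta above):

# Two Adagrad steps in plain numpy for one scalar parameter.
import numpy as np
lr, eps = 1.0, 1e-6
p, accu = 1.0, 0.0
for g in (4.0, 4.0):
    accu = accu + g ** 2                   # 16.0, then 32.0: never decays
    p = p - lr * g / np.sqrt(accu + eps)   # step shrinks from ~1.0 to ~0.707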
Example #50
def sgd(cost, params, learning_rate):
    """Stochastic Gradient Descent (SGD) updates
    Math:
    * ``param := param - learning_rate * gradient``
    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables. We generate update
            expressions w.r.t. these variables.
    learning_rate : float

    Returns
    -------
    list of tuples of the form (param, new_param)
    """
    updates = []
    grads = cgt.grad(cost, params)
    for param, grad in zip(params, grads):
        updates.append((param, param - learning_rate * grad))

    return updates
Example #51
File: nn.py Project: x724/cgt
def sgd(cost, params, learning_rate):
    """Stochastic Gradient Descent (SGD) updates
    Math:
    * ``param := param - learning_rate * gradient``
    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables. We generate update
            expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    Returns
    -------
    list of tuples of the form (param, updates)
    """
    updates = []
    grads = cgt.grad(cost, params)
    for param, grad in zip(params, grads):
        updates.append((param, param - learning_rate * grad))

    return updates
Example #52
def make_funcs(opt, ntm, total_time, loss_timesteps):
    x_tbk = cgt.tensor3("x", fixed_shape=(total_time, opt.b, opt.k))
    y_tbp = cgt.tensor3("y", fixed_shape=(total_time, opt.b, opt.p))
    loss_timesteps = set(loss_timesteps)

    initial_states = make_ntm_initial_states(opt)
    params = ntm.get_parameters() + get_parameters(initial_states)
    # params = ntm.get_parameters()

    lossCE = 0
    loss01 = 0

    state_arrs = initial_states
    for t in xrange(total_time):
        tmp = ntm([x_tbk[t]] + state_arrs)
        raw_pred = tmp[0]
        state_arrs = tmp[1:4]

        if t in loss_timesteps:
            p_pred = cgt.sigmoid(raw_pred)
            ce = bernoulli_crossentropy(
                y_tbp[t],
                p_pred).sum()  # cross-entropy of bernoulli distribution
            lossCE = lossCE + ce
            loss01 = loss01 + cgt.cast(cgt.equal(y_tbp[t], round01(p_pred)),
                                       cgt.floatX).sum()

    lossCE = lossCE / (len(loss_timesteps) * opt.p * opt.b) / np.log(2)
    loss01 = loss01 / (len(loss_timesteps) * opt.p * opt.b)
    gradloss = cgt.grad(lossCE, params)

    flatgrad = flatcat(gradloss)

    f_loss = cgt.function([x_tbk, y_tbp], lossCE)
    f_loss_and_grad = cgt.function([x_tbk, y_tbp], [lossCE, loss01, flatgrad])

    print "number of nodes in computation graph:", core.count_nodes(
        [lossCE, loss01, flatgrad])

    return f_loss, f_loss_and_grad, params
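A hedged sketch of calling the compiled pair on random binary sequences; the `opt` namespace (with fields b, k, p) and the `ntm` module are assumptions from the enclosing copy-task script:

# Hedged sketch (assumes opt.b, opt.k, opt.p and an ntm module built elsewhere in the script).
f_loss, f_loss_and_grad, params = make_funcs(opt, ntm, total_time=20, loss_timesteps=range(10, 20))
x = np.random.randint(0, 2, size=(20, opt.b, opt.k)).astype(cgt.floatX)
y = np.random.randint(0, 2, size=(20, opt.b, opt.p)).astype(cgt.floatX)
bits_per_symbol, frac_match, flat_grad = f_loss_and_grad(x, y)  # loss01 counts matching bits, despite its name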
Example #53
def test_linreg():
    N = 10
    K = 3

    Xval = np.random.randn(N,K)
    wval = np.random.randn(K)
    bval = np.random.randn()
    yval = np.random.randn(N)

    X_nk = cgt.matrix("X")
    y_n = cgt.vector("y")
    w_k = cgt.vector("w")
    b = cgt.scalar(name="b")

    ypred = cgt.dot(X_nk, w_k) + b

    err = cgt.sum(cgt.square(ypred - y_n))
    g = cgt.grad(err, [w_k, b])

    g_simple,an,_ = cgt.core.simplify_and_analyze(g)


    print "Loss function:"
    cgt.print_tree([err])
    print "Gradient:"
    cgt.print_tree(g)

    print "Gradient simplified"
    cgt.print_tree(g_simple, nodefn=lambda node,o: o.write(" " + an["node2hash"][node][:5]))

    print "-------"

    d = {X_nk : Xval, w_k : wval, b : bval, y_n : yval}

    np.testing.assert_allclose(cgt.numeric_eval(err,d), np.linalg.norm(Xval.dot(wval) + bval - yval)**2,
        atol={"single":1e-3,"double":1e-6}[cgt.get_precision()])
    np.testing.assert_allclose(cgt.numeric_eval(g[0],d), 2 * Xval.T.dot(Xval.dot(wval) + bval - yval),
        atol={"single":1e-3,"double":1e-6}[cgt.get_precision()])
    np.testing.assert_allclose(cgt.numeric_eval(g[1],d), 2 *  np.sum(Xval.dot(wval) + bval - yval, 0),
        atol={"single":1e-3,"double":1e-6}[cgt.get_precision()])
Example #54
def adagrad(cost, params, learning_rate=1.0, epsilon=1e-6):
    """Adagrad updates
    The learning rate is scaled by dividing it by the square root of the sum of accumulated squared gradients.

    Math:
    * ``accu_new = accu + grad ** 2``
    * ``param = param - (learning_rate * grad) / cgt.sqrt(accu_new + epsilon)``

    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables. We generate update
            expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    epsilon : float
        Avoids division close to zero. Small constant.

    Returns
    -------
    list of tuples of the form [(param, updates), (accumulated_grads, accumulated_grads_new)]

    References
    ----------
    .. [1] Duchi, J., Hazan, E., & Singer, Y. (2011):
           Adaptive subgradient methods for online learning and stochastic
           optimization. JMLR, 12:2121-2159.
    """

    updates = []
    grads = cgt.grad(cost, params)

    for param, grad in zip(params, grads):
        assert isinstance(param.op, core.GetData)
        accu = cgt.shared(np.zeros(param.op.get_shape(), dtype=param.dtype))
        accu_new = accu + grad ** 2
        updates.append((accu, accu_new))
        updates.append((param, param - (learning_rate * grad) / cgt.sqrt(accu_new + epsilon)))

    return updates
Example #55
def test_cpu_pool():
    with cgt.scoped_update_config(precision="quad", backend="native"):
        print cgt.get_precision()
        ci = get_compile_info()

        np.random.seed(0)
        x = cgt.tensor4("x", fixed_shape=(2, 3, 5, 7))
        y = max_pool_2d(x, (4, 4), (0, 0), (1, 1))
        xval = np.random.randn(2, 3, 5, 7)
        hval = np.random.randn(*cgt.infer_shape(y))
        h = cgt.constant(hval)

        cost = (y * h).sum()

        fcost = cgt.function([x], cost)
        fgrad = cgt.function([x], cgt.grad(cost, [x])[0])

        from cgt.numeric_diff import numeric_grad
        gnum = numeric_grad(fcost, xval)
        gana = fgrad(xval)

        assert np.allclose(gnum, gana)