Example #1
    def __init__(self, n_actions):
        Serializable.__init__(self, n_actions)
        cgt.set_precision('double')
        n_in = 128
        o_no = cgt.matrix("o_no",fixed_shape=(None,n_in))
        a_n = cgt.vector("a_n",dtype='i8')
        q_n = cgt.vector("q_n")
        oldpdist_np = cgt.matrix("oldpdists")

        h0 = (o_no - 128.0)/128.0 
        nhid = 64
        h1 = cgt.tanh(nn.Affine(128,nhid,weight_init=nn.IIDGaussian(std=.1))(h0))
        probs_na = nn.softmax(nn.Affine(nhid,n_actions,weight_init=nn.IIDGaussian(std=0.01))(h1))
        logprobs_na = cgt.log(probs_na)
        b = cgt.size(o_no, 0)
        logps_n = logprobs_na[cgt.arange(b), a_n]
        surr = (logps_n*q_n).mean()
        kl = (oldpdist_np * cgt.log(oldpdist_np/probs_na)).sum(axis=1).mean()

        params = nn.get_parameters(surr)
        gradsurr = cgt.grad(surr, params)
        flatgrad = cgt.concatenate([p.flatten() for p in gradsurr])

        lam = cgt.scalar()
        penobj = surr - lam * kl
        self._f_grad_lagrangian = cgt.function([lam, oldpdist_np, o_no, a_n, q_n], 
            cgt.concatenate([p.flatten() for p in cgt.grad(penobj,params)]))
        self.f_pdist = cgt.function([o_no], probs_na)

        self.f_probs = cgt.function([o_no], probs_na)
        self.f_surr_kl = cgt.function([oldpdist_np, o_no, a_n, q_n], [surr, kl])
        self.f_gradlogp = cgt.function([oldpdist_np, o_no, a_n, q_n], flatgrad)

        self.pc = ParamCollection(params)
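
For reference, the objective constructed above is the score-function surrogate with a KL penalty against the previous policy; a sketch of the math the code implements, averaging over the N sampled transitions:

\mathrm{surr} = \frac{1}{N}\sum_n \log \pi_\theta(a_n \mid o_n)\, q_n
\qquad
\mathrm{kl} = \frac{1}{N}\sum_n \sum_a \pi_{\mathrm{old}}(a \mid o_n)\,\log\frac{\pi_{\mathrm{old}}(a \mid o_n)}{\pi_\theta(a \mid o_n)}
\qquad
\mathrm{penobj} = \mathrm{surr} - \lambda\,\mathrm{kl}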
Example #2
    def __init__(self, input_feature_size, input_time_size, num_units,
                 weight_init=HeUniform(),
                 activation=cgt.sigmoid,
                 cell_out_init=IIDUniform(-0.1, 0.1),
                 hid_out_init=IIDUniform(-0.1, 0.1),
                 #cell_out_init=Constant(0.0),
                 #hid_out_init=Constant(0.0),
                 backwards=False):

        ingate = Gate(W_in=weight_init, W_hid=weight_init, W_cell=weight_init, nonlinearity=activation)
        forgetgate = Gate(W_in=weight_init, W_hid=weight_init, W_cell=weight_init, nonlinearity=activation)
        cell = Gate(W_cell=None, nonlinearity=cgt.tanh)
        outgate = Gate(W_in=weight_init, W_hid=weight_init, W_cell=weight_init, nonlinearity=activation)

        self.nonlinearity = activation
        self.num_units = num_units
        self.backwards = backwards
        self.timesteps = input_time_size

        def add_gate_params(gate, gate_name):
            """ Convenience function for adding layer parameters from a Gate
            instance. """
            return (parameter(init_array(gate.W_in, (input_feature_size, num_units)), name=None),
                    parameter(init_array(gate.W_hid, (num_units, num_units)), name=None),
                    parameter(init_array(gate.b, (1, num_units)), name=None),
                    gate.nonlinearity)

        # Add in parameters from the supplied Gate instances
        (self.W_in_to_ingate, self.W_hid_to_ingate, self.b_ingate, self.nonlinearity_ingate) = add_gate_params(ingate, 'ingate')

        (self.W_in_to_forgetgate, self.W_hid_to_forgetgate, self.b_forgetgate, self.nonlinearity_forgetgate) = add_gate_params(forgetgate, 'forgetgate')

        (self.W_in_to_cell, self.W_hid_to_cell, self.b_cell, self.nonlinearity_cell) = add_gate_params(cell, 'cell')

        (self.W_in_to_outgate, self.W_hid_to_outgate, self.b_outgate, self.nonlinearity_outgate) = add_gate_params(outgate, 'outgate')

        self.hid_init = parameter(init_array(hid_out_init, (1, num_units)), name=None)

        self.cell_init = parameter(init_array(cell_out_init, (1, num_units)), name=None)


        # Stack input weight matrices into a (num_inputs, 4*num_units) #checks out
        # matrix, which speeds up computation
        self.W_in_stacked = cgt.concatenate(
            [self.W_in_to_ingate, self.W_in_to_forgetgate,
             self.W_in_to_cell, self.W_in_to_outgate], axis=1)

        # Same for hidden weight matrices
        self.W_hid_stacked = cgt.concatenate(
            [self.W_hid_to_ingate, self.W_hid_to_forgetgate,
             self.W_hid_to_cell, self.W_hid_to_outgate], axis=1)

        # Stack biases into a (4*num_units) vector
        self.b_stacked = cgt.concatenate(
            [self.b_ingate, self.b_forgetgate,
             self.b_cell, self.b_outgate], axis=1)

        self.cell_prev = None
        self.hid_prev = None
Example #3
def circ_conv_1d(wg_bhn, s_bh3, axis=2):
    "VERY inefficient way to implement circular convolution for the special case of filter size 3"
    assert axis == 2
    n = cgt.size(wg_bhn,2)
    wback = cgt.concatenate([wg_bhn[:,:,n-1:n], wg_bhn[:,:,:n-1]], axis=2)
    w = wg_bhn
    wfwd = cgt.concatenate([wg_bhn[:,:,1:n], wg_bhn[:,:,0:1]], axis=2)
    return cgt.broadcast("*", s_bh3[:,:,0:1] , wback, "xx1,xxx")\
     + cgt.broadcast("*", s_bh3[:,:,1:2] , w, "xx1,xxx")\
     + cgt.broadcast("*", s_bh3[:,:,2:3] , wfwd, "xx1,xxx")
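
As a sanity check on the snippet above, here is a minimal NumPy sketch of the same size-3 circular convolution along the last axis (the function name and comments are illustrative, not part of the original code):

import numpy as np

def circ_conv_1d_np(w_bhn, s_bh3):
    # w_bhn: (batch, heads, n) weightings; s_bh3: (batch, heads, 3) shift kernel.
    # out[..., i] = s[..., 0]*w[..., i-1] + s[..., 1]*w[..., i] + s[..., 2]*w[..., i+1], indices mod n.
    wback = np.roll(w_bhn, 1, axis=2)    # position i holds w[i-1]
    wfwd = np.roll(w_bhn, -1, axis=2)    # position i holds w[i+1]
    return (s_bh3[:, :, 0:1] * wback
            + s_bh3[:, :, 1:2] * w_bhn
            + s_bh3[:, :, 2:3] * wfwd)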
Example #4
def circ_conv_1d(wg_bhn, s_bh3, axis=2):
    "VERY inefficient way to implement circular convolution for the special case of filter size 3"
    assert axis == 2
    n = cgt.size(wg_bhn,2)
    wback = cgt.concatenate([wg_bhn[:,:,n-1:n], wg_bhn[:,:,:n-1]], axis=2)
    w = wg_bhn
    wfwd = cgt.concatenate([wg_bhn[:,:,1:n], wg_bhn[:,:,0:1]], axis=2)
    return cgt.broadcast("*", s_bh3[:,:,0:1] , wback, "xx1,xxx")\
     + cgt.broadcast("*", s_bh3[:,:,1:2] , w, "xx1,xxx")\
     + cgt.broadcast("*", s_bh3[:,:,2:3] , wfwd, "xx1,xxx")
Example #5
    def __init__(self, obs_dim, ctrl_dim):

        cgt.set_precision('double')
        Serializable.__init__(self, obs_dim, ctrl_dim)

        self.obs_dim = obs_dim
        self.ctrl_dim = ctrl_dim

        o_no = cgt.matrix("o_no",fixed_shape=(None,obs_dim))
        a_na = cgt.matrix("a_na",fixed_shape = (None, ctrl_dim))
        adv_n = cgt.vector("adv_n")
        oldpdist_np = cgt.matrix("oldpdist", fixed_shape=(None, 2*ctrl_dim))
        self.logstd = logstd_1a = nn.parameter(np.zeros((1, self.ctrl_dim)), name="std_1a")
        std_1a = cgt.exp(logstd_1a)

        # Here's where we apply the network
        h0 = o_no
        nhid = 32
        h1 = cgt.tanh(nn.Affine(obs_dim,nhid,weight_init=nn.IIDGaussian(std=0.1))(h0))
        h2 = cgt.tanh(nn.Affine(nhid,nhid,weight_init=nn.IIDGaussian(std=0.1))(h1))
        mean_na = nn.Affine(nhid,ctrl_dim,weight_init=nn.IIDGaussian(std=0.01))(h2)

        b = cgt.size(o_no, 0)
        std_na = cgt.repeat(std_1a, b, axis=0)

        oldmean_na = oldpdist_np[:, 0:self.ctrl_dim]
        oldstd_na = oldpdist_np[:, self.ctrl_dim:2*self.ctrl_dim]

        logp_n = ((-.5) * cgt.square( (a_na - mean_na) / std_na ).sum(axis=1)) - logstd_1a.sum()
        oldlogp_n = ((-.5) * cgt.square( (a_na - oldmean_na) / oldstd_na ).sum(axis=1)) - cgt.log(oldstd_na).sum(axis=1)

        ratio_n = cgt.exp(logp_n - oldlogp_n)

        surr = (ratio_n*adv_n).mean()

        pdists_np = cgt.concatenate([mean_na, std_na], axis=1)
        # kl = cgt.log(sigafter/)

        params = nn.get_parameters(surr)

        oldvar_na = cgt.square(oldstd_na)
        var_na = cgt.square(std_na)
        kl = (cgt.log(std_na / oldstd_na) + (oldvar_na + cgt.square(oldmean_na - mean_na)) / (2 * var_na) - .5).sum(axis=1).mean()


        lam = cgt.scalar()
        penobj = surr - lam * kl
        self._compute_surr_kl = cgt.function([oldpdist_np, o_no, a_na, adv_n], [surr, kl])
        self._compute_grad_lagrangian = cgt.function([lam, oldpdist_np, o_no, a_na, adv_n], 
            cgt.concatenate([p.flatten() for p in cgt.grad(penobj,params)]))
        self.f_pdist = cgt.function([o_no], pdists_np)

        self.f_objs = cgt.function([oldpdist_np, o_no, a_na, adv_n], [surr, kl])

        self.pc = ParamCollection(params)
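
For reference, the surr and kl expressions above are, respectively, the importance-weighted surrogate and the closed-form KL divergence between the old and new diagonal-Gaussian action distributions (summed over action dimensions, averaged over the batch); a sketch of the math the code implements:

\mathrm{surr} = \frac{1}{N}\sum_n \frac{\pi_\theta(a_n \mid o_n)}{\pi_{\mathrm{old}}(a_n \mid o_n)}\,\mathrm{adv}_n
\qquad
\mathrm{KL}\!\left(\mathcal{N}(\mu_{\mathrm{old}}, \sigma_{\mathrm{old}}^2)\,\Vert\,\mathcal{N}(\mu, \sigma^2)\right)
 = \sum_i \left( \log\frac{\sigma_i}{\sigma_{\mathrm{old},i}} + \frac{\sigma_{\mathrm{old},i}^2 + (\mu_{\mathrm{old},i} - \mu_i)^2}{2\,\sigma_i^2} - \frac{1}{2} \right)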
Example #6
def make_ff_controller(opt):

    b, h, m, p, k = opt.b, opt.h, opt.m, opt.p, opt.k

    H = 2*h
    in_size = k + h*m
    out_size = H*m + H + H + H*3 + H + h*m + h*m + p

    # Previous reads
    r_bhm = cgt.tensor3("r", fixed_shape = (b,h,m))
    # External inputs
    X_bk = cgt.matrix("x", fixed_shape = (b,k))
    r_b_hm = r_bhm.reshape([r_bhm.shape[0], r_bhm.shape[1]*r_bhm.shape[2]])
    # Input to controller
    inp_bq = cgt.concatenate([X_bk, r_b_hm], axis=1)

    hid_sizes = opt.ff_hid_sizes
    activation = cgt.tanh

    layer_out_sizes = [in_size] + hid_sizes + [out_size]
    last_out = inp_bq
    # feedforward part. we could simplify a bit by using nn.Affine
    for i in xrange(len(layer_out_sizes)-1):
        indim = layer_out_sizes[i]
        outdim = layer_out_sizes[i+1]        
        W = cgt.shared(.02*nr.randn(indim, outdim), name="W%i"%i, fixed_shape_mask="all")
        bias = cgt.shared(.02*nr.randn(1, outdim), name="b%i"%i, fixed_shape_mask="all")
        last_out = cgt.broadcast("+",last_out.dot(W),bias,"xx,1x")
        # Don't apply nonlinearity at the last layer
        if i != len(layer_out_sizes)-2: last_out = activation(last_out)

    idx = 0
    k_bHm = last_out[:,idx:idx+H*m];      idx += H*m;         k_bHm = k_bHm.reshape([b,H,m])
    beta_bH = last_out[:,idx:idx+H];      idx += H
    g_bH = last_out[:,idx:idx+H];         idx += H
    s_bH3 = last_out[:,idx:idx+3*H];      idx += 3*H;         s_bH3 = s_bH3.reshape([b,H,3])
    gamma_bH = last_out[:,idx:idx+H];     idx += H
    e_bhm = last_out[:,idx:idx+h*m];      idx += h*m;         e_bhm = e_bhm.reshape([b,h,m])
    a_bhm = last_out[:,idx:idx+h*m];      idx += h*m;         a_bhm = a_bhm.reshape([b,h,m])
    y_bp = last_out[:,idx:idx+p];         idx += p

    k_bHm = cgt.tanh(k_bHm)
    beta_bH = nn.softplus(beta_bH)
    g_bH = cgt.sigmoid(g_bH)
    s_bH3 = sum_normalize2(cgt.exp(s_bH3))
    gamma_bH = cgt.sigmoid(gamma_bH)+1
    e_bhm = cgt.sigmoid(e_bhm)
    a_bhm = cgt.tanh(a_bhm)
    # y_bp = y_bp

    assert infer_shape(k_bHm) == (b,H,m)
    assert infer_shape(beta_bH) == (b,H)
    assert infer_shape(g_bH) == (b,H)
    assert infer_shape(s_bH3) == (b,H,3)
    assert infer_shape(gamma_bH) == (b,H)
    assert infer_shape(e_bhm) == (b,h,m)
    assert infer_shape(a_bhm) == (b,h,m)
    assert infer_shape(y_bp) == (b,p)

    return nn.Module([r_bhm, X_bk], [k_bHm, beta_bH, g_bH, s_bH3, gamma_bH, e_bhm, a_bhm, y_bp])
Example #7
File: sfnn.py Project: TZ2016/snn
def make_funcs(config, dbg_out={}):
    net_in, net_out = hybrid_network(config['num_inputs'], config['num_outputs'],
                                     config['num_units'], config['num_sto'],
                                     dbg_out=dbg_out)
    if not config['dbg_out_full']: dbg_out = {}
    # def f_sample(_inputs, num_samples=1, flatten=False):
    #     _mean, _var = f_step(_inputs)
    #     _samples = []
    #     for _m, _v in zip(_mean, _var):
    #         _s = np.random.multivariate_normal(_m, np.diag(np.sqrt(_v)), num_samples)
    #         if flatten: _samples.extend(_s)
    #         else: _samples.append(_s)
    #     return np.array(_samples)
    Y_gt = cgt.matrix("Y")
    Y_prec = cgt.tensor3('V', fixed_shape=(None, config['num_inputs'], config['num_inputs']))
    params = nn.get_parameters(net_out)
    size_batch, size_out = net_out.shape
    inputs, outputs = [net_in], [net_out]
    if config['no_bias']:
        print "Excluding bias"
        params = [p for p in params if not p.name.endswith(".b")]
    loss_vec = dist.gaussian.logprob(Y_gt, net_out, Y_prec)
    if config['weight_decay'] > 0.:
        print "Applying penalty on parameter norm"
        params_flat = cgt.concatenate([p.flatten() for p in params])
        loss_param = config['weight_decay'] * cgt.sum(params_flat ** 2)
        loss_vec -= loss_param # / size_batch
    loss = cgt.sum(loss_vec) / size_batch

    # TODO_TZ f_step seems not to fail if X has wrong dim
    f_step = cgt.function(inputs, outputs)
    f_surr = get_surrogate_func(inputs + [Y_prec, Y_gt], outputs,
                                [loss_vec], params, _dbg_out=dbg_out)

    return params, f_step, None, None, None, f_surr
Example #8
def make_ff_controller(opt):

    b, h, m, p, k = opt.b, opt.h, opt.m, opt.p, opt.k

    H = 2*h
    in_size = k + h*m
    out_size = H*m + H + H + H*3 + H + h*m + h*m + p

    # Previous reads
    r_bhm = cgt.tensor3("r", fixed_shape = (b,h,m))
    # External inputs
    X_bk = cgt.matrix("x", fixed_shape = (b,k))
    r_b_hm = r_bhm.reshape([r_bhm.shape[0], r_bhm.shape[1]*r_bhm.shape[2]])
    # Input to controller
    inp_bq = cgt.concatenate([X_bk, r_b_hm], axis=1)

    hid_sizes = opt.ff_hid_sizes
    activation = cgt.tanh

    layer_out_sizes = [in_size] + hid_sizes + [out_size]
    last_out = inp_bq
    # feedforward part. we could simplify a bit by using nn.Affine
    for i in xrange(len(layer_out_sizes)-1):
        indim = layer_out_sizes[i]
        outdim = layer_out_sizes[i+1]        
        W = cgt.shared(.02*nr.randn(indim, outdim), name="W%i"%i, fixed_shape_mask="all")
        bias = cgt.shared(.02*nr.randn(1, outdim), name="b%i"%i, fixed_shape_mask="all")
        last_out = cgt.broadcast("+",last_out.dot(W),bias,"xx,1x")
        # Don't apply nonlinearity at the last layer
        if i != len(layer_out_sizes)-2: last_out = activation(last_out)

    idx = 0
    k_bHm = last_out[:,idx:idx+H*m];      idx += H*m;         k_bHm = k_bHm.reshape([b,H,m])
    beta_bH = last_out[:,idx:idx+H];      idx += H
    g_bH = last_out[:,idx:idx+H];         idx += H
    s_bH3 = last_out[:,idx:idx+3*H];      idx += 3*H;         s_bH3 = s_bH3.reshape([b,H,3])
    gamma_bH = last_out[:,idx:idx+H];     idx += H
    e_bhm = last_out[:,idx:idx+h*m];      idx += h*m;         e_bhm = e_bhm.reshape([b,h,m])
    a_bhm = last_out[:,idx:idx+h*m];      idx += h*m;         a_bhm = a_bhm.reshape([b,h,m])
    y_bp = last_out[:,idx:idx+p];         idx += p

    k_bHm = cgt.tanh(k_bHm)
    beta_bH = nn.softplus(beta_bH)
    g_bH = cgt.sigmoid(g_bH)
    s_bH3 = sum_normalize2(cgt.exp(s_bH3))
    gamma_bH = cgt.sigmoid(gamma_bH)+1
    e_bhm = cgt.sigmoid(e_bhm)
    a_bhm = cgt.tanh(a_bhm)
    # y_bp = y_bp

    assert infer_shape(k_bHm) == (b,H,m)
    assert infer_shape(beta_bH) == (b,H)
    assert infer_shape(g_bH) == (b,H)
    assert infer_shape(s_bH3) == (b,H,3)
    assert infer_shape(gamma_bH) == (b,H)
    assert infer_shape(e_bhm) == (b,h,m)
    assert infer_shape(a_bhm) == (b,h,m)
    assert infer_shape(y_bp) == (b,p)

    return nn.Module([r_bhm, X_bk], [k_bHm, beta_bH, g_bH, s_bH3, gamma_bH, e_bhm, a_bhm, y_bp])
Example #9
File: layers.py Project: TZ2016/snn
def combo_layer(X, size_in, size_out, splits,
                s_funcs=None, o_funcs=None,
                name='?', dbg_out={}):
    """
    Create a combination of specified sub-layers and non-linearity.

    :param X: symbolic input
    :param size_in: input size (except batch)
    :param size_out: output size (except batch)
    :param splits: split points for applying each sub-layer
    :param s_funcs: list of functions to create each sub-layer
        of signature (input, in_size, out_size, name) -> output
    :param o_funcs: list of non-linearity functions
        of signature (input) -> output
    :param name: layer name
    :type name: str
    :return: symbolic output

    Note for s_funcs and o_funcs:
        - broadcasting enabled if not a list
        - element with value None has no effect (skip)
    """

    assert isinstance(splits, tuple) and len(splits) > 0
    assert all(splits[i] < splits[i+1] for i in xrange(len(splits) - 1))
    assert splits[0] >= 0 and splits[-1] <= size_out
    splits = list(splits)
    assert not isinstance(o_funcs, list) and not isinstance(s_funcs, list)
    o_funcs = list(o_funcs) if isinstance(o_funcs, tuple) \
        else [o_funcs] * (len(splits) + 1)
    s_funcs = list(s_funcs) if isinstance(s_funcs, tuple) \
        else [s_funcs] * (len(splits) + 1)
    assert len(splits) + 1 == len(o_funcs) == len(s_funcs)
    if splits[0] == 0:
        splits.pop(0)
        o_funcs.pop(0)
        s_funcs.pop(0)
    if len(splits) > 0 and splits[-1] == size_out:
        splits.pop()
        o_funcs.pop()
        s_funcs.pop()
    curr, names, ins, outs = 0, [], [], []
    splits.append(size_out)
    for _split, _f_s, _f_o in zip(splits, s_funcs, o_funcs):
        _name = 'L%s[%d:%d]' % (name, curr, _split)
        _s_out = _split - curr
        _i = X if _f_s is None else _f_s(X, size_in, _s_out, name=_name)
        _o = _i if _f_o is None else _f_o(_i)
        curr = _split
        ins.append(_i)
        outs.append(_o)
        names.append(_name)
    out = cgt.concatenate(outs, axis=1) if len(outs) > 1 else outs[0]
    dbg_out.update(dict(zip([_n + '~in' for _n in names], ins)))
    dbg_out.update(dict(zip([_n + '~out' for _n in names], outs)))
    return out
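
A minimal usage sketch of combo_layer, assuming cgt and nn are imported as in the surrounding examples; the helper affine_sub, the variable X, and the sizes are hypothetical, chosen only to illustrate the (input, in_size, out_size, name) sub-layer signature from the docstring:

# Hypothetical sub-layer constructor with the expected (input, in_size, out_size, name) signature.
def affine_sub(X, size_in, size_out, name):
    return nn.Affine(size_in, size_out, name=name)(X)

X = cgt.matrix("X", fixed_shape=(None, 10))
# Output columns [0, 4) get a sigmoid nonlinearity, columns [4, 10) get tanh;
# affine_sub is broadcast to both slices because it is not passed as a tuple.
out = combo_layer(X, size_in=10, size_out=10, splits=(4,),
                  s_funcs=affine_sub, o_funcs=(cgt.sigmoid, cgt.tanh))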
Example #10
 def initialize(self, loss, scale):
     self._iter = 0
     self.pc = Params(self.params)
     cur_val = self.pc.get_value_flat()
     idx = cur_val.nonzero()
     new_val = np.random.uniform(-scale, scale, size=(self.pc.get_total_size(),))
     new_val[idx] = cur_val[idx]
     self.sync(new_val)
     grad = cgt.concatenate([g.flatten() for g in cgt.grad(loss, self.params)])
     return grad
Example #11
def make_loss_and_grad(net):
    X_b = inps[0] #cgt.matrix(dtype=cgt.floatX)
    y_onehot = cgt.matrix(dtype='i4')
    outputs = [logprobs]

    loss = nn.crossent(outputs[0], y_onehot) / b_size
    #gradloss = cgt.grad(loss, params)
    gradloss = cgt.grad(loss, param_list)

    # XXX use flatcat function
    grad = cgt.concatenate([x.flatten() for x in gradloss])
    #grad = gradloss
    return cgt.make_function([X_b, y_onehot], [loss, grad, logprobs])
Example #12
    def __init__(self, n_actions):
        Serializable.__init__(self, n_actions)
        cgt.set_precision('double')
        n_in = 128
        o_no = cgt.matrix("o_no", fixed_shape=(None, n_in))
        a_n = cgt.vector("a_n", dtype='i8')
        q_n = cgt.vector("q_n")
        oldpdist_np = cgt.matrix("oldpdists")

        h0 = (o_no - 128.0) / 128.0
        nhid = 64
        h1 = cgt.tanh(
            nn.Affine(128, nhid, weight_init=nn.IIDGaussian(std=.1))(h0))
        probs_na = nn.softmax(
            nn.Affine(nhid, n_actions,
                      weight_init=nn.IIDGaussian(std=0.01))(h1))
        logprobs_na = cgt.log(probs_na)
        b = cgt.size(o_no, 0)
        logps_n = logprobs_na[cgt.arange(b), a_n]
        surr = (logps_n * q_n).mean()
        kl = (oldpdist_np * cgt.log(oldpdist_np / probs_na)).sum(axis=1).mean()

        params = nn.get_parameters(surr)
        gradsurr = cgt.grad(surr, params)
        flatgrad = cgt.concatenate([p.flatten() for p in gradsurr])

        lam = cgt.scalar()
        penobj = surr - lam * kl
        self._f_grad_lagrangian = cgt.function(
            [lam, oldpdist_np, o_no, a_n, q_n],
            cgt.concatenate([p.flatten() for p in cgt.grad(penobj, params)]))
        self.f_pdist = cgt.function([o_no], probs_na)

        self.f_probs = cgt.function([o_no], probs_na)
        self.f_surr_kl = cgt.function([oldpdist_np, o_no, a_n, q_n],
                                      [surr, kl])
        self.f_gradlogp = cgt.function([oldpdist_np, o_no, a_n, q_n], flatgrad)

        self.pc = ParamCollection(params)
Example #13
def hybrid_layer(X, size_in, size_out, size_random, dbg_out=[]):
    assert size_out >= size_random >= 0
    out = cgt.sigmoid(nn.Affine(
        size_in, size_out, name="InnerProd(%d->%d)" % (size_in, size_out)
    )(X))
    dbg_out.append(out)
    if size_random == 0:
        return out
    if size_random == size_out:
        out_s = cgt.bernoulli(out)
        return out_s
    out_s = cgt.bernoulli(out[:, :size_random])
    out = cgt.concatenate([out_s, out[:, size_random:]], axis=1)
    return out
Example #14
def pyramidLayer(nn_input, temporal_resolution_decrease=2):
    """
    Input is (batch, time, features). Decreases the temporal resolution and increases the feature dimension by the resolution-decrease factor.
    """
    t_steps = cgt.infer_shape(nn_input)[1]
    if t_steps % temporal_resolution_decrease != 0:
        raise ValueError('number of timesteps is not divisible by resolution decrease!')
    out_list = []
    for iter_step in range(0, t_steps, temporal_resolution_decrease):
        concentrate_list = []
        for sub_iter_step in range(0, temporal_resolution_decrease):
            concentrate_list.append(nn_input[:, iter_step + sub_iter_step, :])
        out_list.append(cgt.concatenate(concentrate_list, axis=1))
    return cgt.dimshuffle(cgt.stack(out_list), [1, 0, 2])
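
The intended effect described in the docstring is a pure reshaping: (batch, time, features) becomes (batch, time/k, features*k) by folding k consecutive timesteps into the feature axis. A minimal NumPy sketch of that reshaping (array names and sizes here are hypothetical):

import numpy as np

x = np.arange(2 * 6 * 3).reshape(2, 6, 3)   # (batch=2, time=6, features=3)
k = 2                                        # temporal_resolution_decrease
y = x.reshape(2, 6 // k, k * 3)              # (batch, time/k, features*k)
# y[b, i] is the concatenation of x[b, k*i], ..., x[b, k*i + k - 1] along the feature axis.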
Example #15
def hybrid_layer(X, size_in, size_out, size_random, dbg_out=[]):
    assert size_out >= size_random >= 0
    out = cgt.sigmoid(
        nn.Affine(size_in,
                  size_out,
                  name="InnerProd(%d->%d)" % (size_in, size_out))(X))
    dbg_out.append(out)
    if size_random == 0:
        return out
    if size_random == size_out:
        out_s = cgt.bernoulli(out)
        return out_s
    out_s = cgt.bernoulli(out[:, :size_random])
    out = cgt.concatenate([out_s, out[:, size_random:]], axis=1)
    return out
Example #16
File: layers.py Project: TZ2016/snn
def mask_layer(func, X, size_in, i_start, i_end=None):
    if i_end is None:
        i_start, i_end = 0, i_start
    assert isinstance(i_start, int) and isinstance(i_end, int)
    assert -1 < i_start <= i_end <= size_in
    if i_end == i_start:
        return X
    if i_end - i_start == size_in:
        return func(X)
    outs = []
    if i_start > 0:
        outs.append(X[:, :i_start])
    outs.append(func(X[:, i_start:i_end]))
    if i_end < size_in:
        outs.append(X[:, i_end:])
    out = cgt.concatenate(outs, axis=1)
    return out
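
A small usage sketch of mask_layer: apply a function to columns [i_start, i_end) of X and pass the remaining columns through unchanged. The input variable below is hypothetical, assuming cgt is imported as in the other examples:

X = cgt.matrix("X", fixed_shape=(None, 8))
# cgt.sigmoid is applied to columns [2, 6); columns [0, 2) and [6, 8) pass through unchanged.
out = mask_layer(cgt.sigmoid, X, size_in=8, i_start=2, i_end=6)
# With i_end omitted, mask_layer(f, X, size_in, k) applies f to the first k columns.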
Example #17
 def get_features_bengio(nn_input, num_units=256, recurrent_layer=None):
     if recurrent_layer is None:
         recurrent_layer = recurrentLayer
     w_init = IIDUniform(-0.1, 0.1)
     activation = cgt.sigmoid
     assert num_units % 2 == 0
     num_units /= 2
     l1_f = recurrent_layer(nn_input=nn_input, num_units=num_units, activation=activation, w_init=w_init)
     l1_b = recurrent_layer(nn_input=nn_input, num_units=num_units, activation=activation, w_init=w_init)
     l1_plus = cgt.concatenate([l1_f, l1_b], axis=2)
     #l2_f = recurrent_layer(nn_input=l1_plus, num_units=num_units, activation=activation, w_init=w_init)
     #l2_b = recurrent_layer(nn_input=l1_plus, num_units=num_units, activation=activation, w_init=w_init)
     #l2_plus = cgt.concatenate([l2_f, l2_b], axis=2)
     #l3_f = recurrent_layer(nn_input=l2_plus, num_units=num_units, activation=activation, w_init=w_init)
     #l3_b = recurrent_layer(nn_input=l2_plus, num_units=num_units, activation=activation, w_init=w_init)
     #l3_plus = cgt.concatenate([l3_f, l3_b], axis=2)
     return l1_plus
Example #18
File: layers.py Project: TZ2016/snn
def mask_layer(func, X, size_in, i_start, i_end=None):
    if i_end is None:
        i_start, i_end = 0, i_start
    assert isinstance(i_start, int) and isinstance(i_end, int)
    assert -1 < i_start <= i_end <= size_in
    if i_end == i_start:
        return X
    if i_end - i_start == size_in:
        return func(X)
    outs = []
    if i_start > 0:
        outs.append(X[:, :i_start])
    outs.append(func(X[:, i_start:i_end]))
    if i_end < size_in:
        outs.append(X[:, i_end:])
    out = cgt.concatenate(outs, axis=1)
    return out
Example #19
def main(num_epochs=NUM_EPOCHS):
    #cgt.set_precision('half')
    print("Building network ...")
    # Recurrent layers expect input of shape
    # (batch size, max sequence length, number of features)
    X = cgt.tensor3(name='X', fixed_shape=(N_BATCH, MAX_LENGTH, 2))
    l_forward = nnbuilder.recurrentLayer(nn_input=X, num_units=N_HIDDEN)
    l_backward = nnbuilder.recurrentLayer(nn_input=X, num_units=N_HIDDEN, backwards=True)
    #l_forward = nnbuilder.LSTMLayer(nn_input=X, num_units=N_HIDDEN, activation=cgt.sigmoid)
    #l_backward = nnbuilder.LSTMLayer(nn_input=X, num_units=N_HIDDEN, activation=cgt.sigmoid, backwards=True)
    #l_forward = nnbuilder.GRULayer(nn_input=X, num_units=N_HIDDEN, activation=nn.rectify)
    #l_backward = nnbuilder.GRULayer(nn_input=X, num_units=N_HIDDEN, activation=nn.rectify, backwards=True)
    l_forward_slice = l_forward[:, MAX_LENGTH-1, :]  # Take the last element in the forward slice time dimension
    l_backward_slice = l_backward[:, 0, :]  # And the first element in the backward slice time dimension
    l_sum = cgt.concatenate([l_forward_slice, l_backward_slice], axis=1)
    l_out = nnbuilder.denseLayer(l_sum, num_units=1, activation=cgt.tanh)
    target_values = cgt.vector('target_output')
    predicted_values = l_out[:, 0]  # For this task we only need the last value
    cost = cgt.mean((predicted_values - target_values)**2)
    # Compute SGD updates for training
    print("Computing updates ...")
    updates = nn.rmsprop(cost, nn.get_parameters(l_out), LEARNING_RATE)
    #updates = nn.nesterov_momentum(cost, nn.get_parameters(l_out), 0.05)
    # cgt functions for training and computing cost
    print("Compiling functions ...")
    train = cgt.function([X, target_values], cost, updates=updates)
    compute_cost = cgt.function([X, target_values], cost)

    # We'll use this "validation set" to periodically check progress
    X_val, y_val, mask_val = gen_data()

    print("Training ...")
    time_start = time.time()
    try:
        for epoch in range(num_epochs):
            for _ in range(EPOCH_SIZE):
                X, y, m = gen_data()
                train(X, y)
            cost_val = compute_cost(X_val, y_val)
            print("Epoch {} validation cost = {}".format(epoch+1, cost_val))
            print ('Epoch took ' + str(time.time() - time_start))
            time_start = time.time()
    except KeyboardInterrupt:
        pass
Example #20
def make_funcs(config, dbg_out=None):
    params, Xs, Ys, C_0, H_0, C_T, H_T, C_1, H_1 = lstm_network(
        config['rnn_steps'], config['num_inputs'], config['num_outputs'],
        config['num_units'], config['num_mems'])

    # basic
    size_batch = Xs[0].shape[0]
    dY = Ys[0].shape[-1]
    Ys_gt = [
        cgt.matrix(fixed_shape=(size_batch, dY), name='Y%d' % t)
        for t in range(len(Ys))
    ]
    Ys_var = [cgt.tensor3(fixed_shape=(size_batch, dY, dY)) for _ in Ys]
    net_inputs, net_outputs = Xs + C_0 + H_0 + Ys_var, Ys + C_T + H_T

    # calculate loss
    loss_vec = []
    for i in range(len(Ys)):
        #     if i == 0: continue
        _l = dist.gaussian.logprob(Ys_gt[i], Ys[i], Ys_var[i])
        loss_vec.append(_l)
    loss_vec = cgt.add_multi(loss_vec)
    if config['weight_decay'] > 0.:
        params_flat = cgt.concatenate([p.flatten() for p in params])
        loss_param = config['weight_decay'] * cgt.sum(params_flat**2)
        loss_vec -= loss_param  # / size_batch
    loss = cgt.sum(loss_vec) / config['rnn_steps'] / size_batch
    grad = cgt.grad(loss, params)

    # functions
    def f_init(size_batch):
        c_0, h_0 = [], []
        for _n_m in config['num_mems']:
            if _n_m > 0:
                c_0.append(np.zeros((size_batch, _n_m)))
                h_0.append(np.zeros((size_batch, _n_m)))
        return c_0, h_0

    f_step = cgt.function([Xs[0]] + C_0 + H_0, [Ys[0]] + C_1 + H_1)
    f_loss = cgt.function(net_inputs + Ys_gt, loss)
    f_grad = cgt.function(net_inputs + Ys_gt, grad)
    f_surr = cgt.function(net_inputs + Ys_gt, [loss] + net_outputs + grad)
    return params, f_step, f_loss, f_grad, f_init, f_surr
Example #21
File: rnn.py Project: TZ2016/snn
def make_funcs(config, dbg_out=None):
    params, Xs, Ys, C_0, H_0, C_T, H_T, C_1, H_1 = lstm_network(
        config['rnn_steps'], config['num_inputs'], config['num_outputs'],
        config['num_units'], config['num_mems']
    )

    # basic
    size_batch = Xs[0].shape[0]
    dY = Ys[0].shape[-1]
    Ys_gt = [cgt.matrix(fixed_shape=(size_batch, dY), name='Y%d'%t)
             for t in range(len(Ys))]
    Ys_var = [cgt.tensor3(fixed_shape=(size_batch, dY, dY)) for _ in Ys]
    net_inputs, net_outputs = Xs + C_0 + H_0 + Ys_var, Ys + C_T + H_T

    # calculate loss
    loss_vec = []
    for i in range(len(Ys)):
        #     if i == 0: continue
        _l = dist.gaussian.logprob(Ys_gt[i], Ys[i], Ys_var[i])
        loss_vec.append(_l)
    loss_vec = cgt.add_multi(loss_vec)
    if config['weight_decay'] > 0.:
        params_flat = cgt.concatenate([p.flatten() for p in params])
        loss_param = config['weight_decay'] * cgt.sum(params_flat ** 2)
        loss_vec -= loss_param  # / size_batch
    loss = cgt.sum(loss_vec) / config['rnn_steps'] / size_batch
    grad = cgt.grad(loss, params)

    # functions
    def f_init(size_batch):
        c_0, h_0 = [], []
        for _n_m in config['num_mems']:
            if _n_m > 0:
                c_0.append(np.zeros((size_batch, _n_m)))
                h_0.append(np.zeros((size_batch, _n_m)))
        return c_0, h_0
    f_step = cgt.function([Xs[0]] + C_0 + H_0, [Ys[0]] + C_1 + H_1)
    f_loss = cgt.function(net_inputs + Ys_gt, loss)
    f_grad = cgt.function(net_inputs + Ys_gt, grad)
    f_surr = cgt.function(net_inputs + Ys_gt, [loss] + net_outputs + grad)
    return params, f_step, f_loss, f_grad, f_init, f_surr
Example #22
File: sfnn.py Project: TZ2016/snn
def make_funcs(config, dbg_out={}):
    net_in, net_out = hybrid_network(config['num_inputs'],
                                     config['num_outputs'],
                                     config['num_units'],
                                     config['num_sto'],
                                     dbg_out=dbg_out)
    if not config['dbg_out_full']: dbg_out = {}
    # def f_sample(_inputs, num_samples=1, flatten=False):
    #     _mean, _var = f_step(_inputs)
    #     _samples = []
    #     for _m, _v in zip(_mean, _var):
    #         _s = np.random.multivariate_normal(_m, np.diag(np.sqrt(_v)), num_samples)
    #         if flatten: _samples.extend(_s)
    #         else: _samples.append(_s)
    #     return np.array(_samples)
    Y_gt = cgt.matrix("Y")
    Y_prec = cgt.tensor3('V',
                         fixed_shape=(None, config['num_inputs'],
                                      config['num_inputs']))
    params = nn.get_parameters(net_out)
    size_batch, size_out = net_out.shape
    inputs, outputs = [net_in], [net_out]
    if config['no_bias']:
        print "Excluding bias"
        params = [p for p in params if not p.name.endswith(".b")]
    loss_vec = dist.gaussian.logprob(Y_gt, net_out, Y_prec)
    if config['weight_decay'] > 0.:
        print "Applying penalty on parameter norm"
        params_flat = cgt.concatenate([p.flatten() for p in params])
        loss_param = config['weight_decay'] * cgt.sum(params_flat**2)
        loss_vec -= loss_param  # / size_batch
    loss = cgt.sum(loss_vec) / size_batch

    # TODO_TZ f_step seems not to fail if X has wrong dim
    f_step = cgt.function(inputs, outputs)
    f_surr = get_surrogate_func(inputs + [Y_prec, Y_gt],
                                outputs, [loss_vec],
                                params,
                                _dbg_out=dbg_out)

    return params, f_step, None, None, None, f_surr
Example #23
def make_funcs(net_in, net_out, config, dbg_out=None):
    def f_grad (*x):
        out = f_surr(*x)
        return out['loss'], out['surr_loss'], out['surr_grad']
    Y = cgt.matrix("Y")
    params = nn.get_parameters(net_out)
    if 'no_bias' in config and config['no_bias']:
        print "Excluding bias"
        params = [p for p in params if not p.name.endswith(".b")]
    size_out, size_batch = Y.shape[1], net_in.shape[0]
    f_step = cgt.function([net_in], [net_out])
    # loss_raw of shape (size_batch, 1); loss should be a scalar
    # sum-of-squares loss
    sigma = 0.1
    loss_raw = -cgt.sum((net_out - Y) ** 2, axis=1, keepdims=True) / sigma
    # negative log-likelihood
    # out_sigma = cgt.exp(net_out[:, size_out:]) + 1.e-6  # positive sigma
    # loss_raw = -gaussian_diagonal.logprob(
    #     Y, net_out,
        # out_sigma
        # cgt.fill(.01, [size_batch, size_out])
    # )
    if 'param_penal_wt' in config:
        print "Applying penalty on parameter norm"
        assert config['param_penal_wt'] > 0
        params_flat = cgt.concatenate([p.flatten() for p in params])
        loss_param = cgt.fill(cgt.sum(params_flat ** 2), [size_batch, 1])
        loss_param *= config['param_penal_wt']
        loss_raw += loss_param
    loss = cgt.sum(loss_raw) / size_batch
    # end of loss definition
    f_loss = cgt.function([net_in, Y], [net_out, loss])
    f_surr = get_surrogate_func([net_in, Y],
                                [net_out] + dbg_out,
                                [loss_raw], params)
    return params, f_step, f_loss, f_grad, f_surr
Example #24
def make_funcs(net_in, net_out, config, dbg_out=None):
    def f_grad(*x):
        out = f_surr(*x)
        return out['loss'], out['surr_loss'], out['surr_grad']

    Y = cgt.matrix("Y")
    params = nn.get_parameters(net_out)
    if 'no_bias' in config and config['no_bias']:
        print "Excluding bias"
        params = [p for p in params if not p.name.endswith(".b")]
    size_out, size_batch = Y.shape[1], net_in.shape[0]
    f_step = cgt.function([net_in], [net_out])
    # loss_raw of shape (size_batch, 1); loss should be a scalar
    # sum-of-squares loss
    sigma = 0.1
    loss_raw = -cgt.sum((net_out - Y)**2, axis=1, keepdims=True) / sigma
    # negative log-likelihood
    # out_sigma = cgt.exp(net_out[:, size_out:]) + 1.e-6  # positive sigma
    # loss_raw = -gaussian_diagonal.logprob(
    #     Y, net_out,
    # out_sigma
    # cgt.fill(.01, [size_batch, size_out])
    # )
    if 'param_penal_wt' in config:
        print "Applying penalty on parameter norm"
        assert config['param_penal_wt'] > 0
        params_flat = cgt.concatenate([p.flatten() for p in params])
        loss_param = cgt.fill(cgt.sum(params_flat**2), [size_batch, 1])
        loss_param *= config['param_penal_wt']
        loss_raw += loss_param
    loss = cgt.sum(loss_raw) / size_batch
    # end of loss definition
    f_loss = cgt.function([net_in, Y], [net_out, loss])
    f_surr = get_surrogate_func([net_in, Y], [net_out] + dbg_out, [loss_raw],
                                params)
    return params, f_step, f_loss, f_grad, f_surr
Example #25
    def take_one_step(self, nn_input_bf, hid_out):

        #PROBABLY BUGGED. SHOULD BE REWRITTEN.

        self.num_batches = cgt.infer_shape(nn_input_bf)[0]

        # (n_time_steps, n_batch, n_features)
        #input_bf = cgt.dimshuffle(nn_input_bf, [1, 0, 2])

        # Stack input weight matrices into a (num_inputs, 3*num_units)
        # matrix, which speeds up computation
        W_in_stacked = cgt.concatenate(
            [self.W_in_to_resetgate, self.W_in_to_updategate,
             self.W_in_to_hidden_update], axis=1)

        # Same for hidden weight matrices
        W_hid_stacked = cgt.concatenate(
            [self.W_hid_to_resetgate, self.W_hid_to_updategate,
             self.W_hid_to_hidden_update], axis=1)

        # Stack gate biases into a (3*num_units) vector
        b_stacked = cgt.concatenate(
            [self.b_resetgate, self.b_updategate,
             self.b_hidden_update], axis=1)


        # After the input projection below, input_n will be (n_batch, 3*num_units).
        # We define a slicing function that extracts the input to each GRU gate
        def slice_w(x, n):
            return x[:, n*self.num_units:(n+1)*self.num_units]

        # Create a single recurrent computation step function
        # input_n is the n'th vector of the input
        def step(input_n, hid_previous, W_hid_stacked, W_in_stacked, b_stacked):
            # Compute W_{hr} h_{t - 1}, W_{hu} h_{t - 1}, and W_{hc} h_{t - 1}
            hid_input = cgt.dot(hid_previous, W_hid_stacked)

            # Compute W_{xr}x_t + b_r, W_{xu}x_t + b_u, and W_{xc}x_t + b_c
            input_n = cgt.broadcast("+", input_n.dot(W_in_stacked), b_stacked, "xx,1x")

            # Reset and update gates
            resetgate = slice_w(hid_input, 0) + slice_w(input_n, 0)
            updategate = slice_w(hid_input, 1) + slice_w(input_n, 1)
            resetgate = self.nonlinearity_resetgate(resetgate)
            updategate = self.nonlinearity_updategate(updategate)

            # Compute W_{xc}x_t + r_t \odot (W_{hc} h_{t - 1})
            hidden_update_in = slice_w(input_n, 2)
            hidden_update_hid = slice_w(hid_input, 2)
            hidden_update = hidden_update_in + resetgate*hidden_update_hid

            # Compute (1 - u_t)h_{t - 1} + u_t c_t
            hid = (1 - updategate)*hid_previous + updategate*hidden_update
            return self.nonlinearity_hid(hid)  # adding this non-linearity seems to help stability.
            #return hid

        if hid_out is None:
            if self.hid_out is None:
                self.hid_out = cgt.dot(cgt.ones((self.num_batches, 1)), self.hid_init)
            hid_out = self.hid_out



        # Retrieve the dimensionality of the incoming layer
        hid_out = step(nn_input_bf, hid_out, W_hid_stacked, W_in_stacked, b_stacked)

        # dimshuffle back to (n_batch, n_time_steps, n_features)
        # self.hid_out = cgt.dimshuffle(self.hid_out, [1, 0, 2])

        # if scan is backward reverse the output
        if self.backwards:
            hid_out = cgt.flip(hid_out, [1])

        self.hid_out = hid_out

        return hid_out
Example #26
    def __call__(self, input_btf):

        # (n_time_steps, n_batch, n_features)
        input_tbf = cgt.dimshuffle(input_btf, [1, 0, 2])
        self.num_batches = cgt.infer_shape(input_tbf)[1]

        # Stack input weight matrices into a (num_inputs, 3*num_units)
        # matrix, which speeds up computation
        W_in_stacked = cgt.concatenate(
            [self.W_in_to_resetgate, self.W_in_to_updategate,
             self.W_in_to_hidden_update], axis=1)

        # Same for hidden weight matrices
        W_hid_stacked = cgt.concatenate(
            [self.W_hid_to_resetgate, self.W_hid_to_updategate,
             self.W_hid_to_hidden_update], axis=1)

        # Stack gate biases into a (3*num_units) vector
        b_stacked = cgt.concatenate(
            [self.b_resetgate, self.b_updategate,
             self.b_hidden_update], axis=1)


        # At each step, input_n will be (n_batch, 3*num_units) after the input projection.
        # We define a slicing function that extracts the input to each GRU gate
        def slice_w(x, n):
            return x[:, n*self.num_units:(n+1)*self.num_units]

        # Create a single recurrent computation step function
        # input_n is the n'th vector of the input
        def step(input_n, hid_previous, W_hid_stacked, W_in_stacked, b_stacked):
            # Compute W_{hr} h_{t - 1}, W_{hu} h_{t - 1}, and W_{hc} h_{t - 1}
            hid_input = cgt.dot(hid_previous, W_hid_stacked)

            # Compute W_{xr}x_t + b_r, W_{xu}x_t + b_u, and W_{xc}x_t + b_c
            input_n = cgt.broadcast("+", input_n.dot(W_in_stacked), b_stacked, "xx,1x")

            # Reset and update gates
            resetgate = slice_w(hid_input, 0) + slice_w(input_n, 0)
            updategate = slice_w(hid_input, 1) + slice_w(input_n, 1)
            resetgate = self.nonlinearity_resetgate(resetgate)
            updategate = self.nonlinearity_updategate(updategate)

            # Compute W_{xc}x_t + r_t \odot (W_{hc} h_{t - 1})
            hidden_update_in = slice_w(input_n, 2)
            hidden_update_hid = slice_w(hid_input, 2)
            hidden_update = hidden_update_in + resetgate*hidden_update_hid

            # Compute (1 - u_t)h_{t - 1} + u_t c_t
            hid = (1 - updategate)*hid_previous + updategate*hidden_update
            return hid

        sequences = [input_tbf]
        step_fun = step
        hid_init = cgt.dot(cgt.ones((self.num_batches, 1)), self.hid_init)

        # The hidden-to-hidden weight matrix is always used in step
        non_seqs = [W_hid_stacked]
        # When we aren't precomputing the input outside of scan, we need to
        # provide the input weights and biases to the step function
        non_seqs += [W_in_stacked, b_stacked]
        # theano.scan only allows for positional arguments, so when
        # self.precompute_input is True, we need to supply fake placeholder
        # arguments for the input weights and biases.

        # Retrieve the dimensionality of the incoming layer
        hid_out = unroll_lstm(
            fn=step_fun,
            sequences=sequences,
            outputs_info=[hid_init],
            go_backwards=self.backwards,
            non_sequences=non_seqs,
            n_steps=self.timesteps)[0]

        # dimshuffle back to (n_batch, n_time_steps, n_features)
        hid_out = cgt.dimshuffle(hid_out, [1, 0, 2])

        # if scan is backward reverse the output
        if self.backwards:
            hid_out = cgt.flip(hid_out, [1])

        return hid_out
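
Written out, the step function above computes the GRU update described in its comments; a sketch of the math, where σ_r and σ_u are the configured gate nonlinearities (note that in this snippet no extra nonlinearity is applied to the candidate c_t):

r_t = \sigma_r(W_{xr} x_t + W_{hr} h_{t-1} + b_r)
u_t = \sigma_u(W_{xu} x_t + W_{hu} h_{t-1} + b_u)
c_t = W_{xc} x_t + b_c + r_t \odot (W_{hc} h_{t-1})
h_t = (1 - u_t) \odot h_{t-1} + u_t \odot c_t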
Example #27
def concatenate(items, axis=0):
    return cgt.concatenate(items, axis=axis)
Example #28
def build_fcn_action_cond_encoder_net(input_shapes, levels=None):
    x_shape, u_shape = input_shapes
    x_c_dim = x_shape[0]
    x1_c_dim = 16
    levels = levels or [3]
    levels = sorted(set(levels))

    X = cgt.tensor4('X', fixed_shape=(None, ) + x_shape)
    U = cgt.matrix('U', fixed_shape=(None, ) + u_shape)

    # encoding
    Xlevels = {}
    for level in range(levels[-1] + 1):
        if level == 0:
            Xlevel = X
        else:
            if level == 1:
                xlevelm1_c_dim = x_c_dim
                xlevel_c_dim = x1_c_dim
            else:
                xlevelm1_c_dim = xlevel_c_dim
                xlevel_c_dim = 2 * xlevel_c_dim
            Xlevel_1 = nn.rectify(
                nn.SpatialConvolution(xlevelm1_c_dim,
                                      xlevel_c_dim,
                                      kernelshape=(3, 3),
                                      pad=(1, 1),
                                      stride=(1, 1),
                                      name='conv%d_1' % level,
                                      weight_init=nn.IIDGaussian(std=0.01))(
                                          Xlevels[level - 1]))
            Xlevel_2 = nn.rectify(
                nn.SpatialConvolution(
                    xlevel_c_dim,
                    xlevel_c_dim,
                    kernelshape=(3, 3),
                    pad=(1, 1),
                    stride=(1, 1),
                    name='conv%d_2' % level,
                    weight_init=nn.IIDGaussian(std=0.01))(Xlevel_1))
            Xlevel = nn.max_pool_2d(Xlevel_2,
                                    kernelshape=(2, 2),
                                    pad=(0, 0),
                                    stride=(2, 2))
        Xlevels[level] = Xlevel

    # bilinear
    Xlevels_next_pred_0 = {}
    Ylevels = OrderedDict()
    Ylevels_diff_pred = OrderedDict()
    for level in levels:
        Xlevel = Xlevels[level]
        Xlevel_diff_pred = Bilinear(input_shapes,
                                    b=None,
                                    axis=2,
                                    name='bilinear%d' % level)(Xlevel, U)
        Xlevels_next_pred_0[level] = Xlevel + Xlevel_diff_pred
        Ylevels[level] = Xlevel.reshape(
            (Xlevel.shape[0], cgt.mul_multi(Xlevel.shape[1:])))
        Ylevels_diff_pred[level] = Xlevel_diff_pred.reshape(
            (Xlevel_diff_pred.shape[0],
             cgt.mul_multi(Xlevel_diff_pred.shape[1:])))

    # decoding
    Xlevels_next_pred = {}
    for level in range(levels[-1] + 1)[::-1]:
        if level == levels[-1]:
            Xlevel_next_pred = Xlevels_next_pred_0[level]
        else:
            if level == 0:
                xlevelm1_c_dim = x_c_dim
            elif level < levels[-1] - 1:
                xlevel_c_dim = xlevelm1_c_dim
                xlevelm1_c_dim = xlevelm1_c_dim // 2
            Xlevel_next_pred_2 = SpatialDeconvolution(
                xlevel_c_dim,
                xlevel_c_dim,
                kernelshape=(2, 2),
                pad=(0, 0),
                stride=(2, 2),
                name='upsample%d' % (level + 1),
                weight_init=nn.IIDGaussian(std=0.01))(Xlevels_next_pred[
                    level +
                    1])  # TODO initialize with bilinear # TODO should rectify?
            Xlevel_next_pred_1 = nn.rectify(
                SpatialDeconvolution(
                    xlevel_c_dim,
                    xlevel_c_dim,
                    kernelshape=(3, 3),
                    pad=(1, 1),
                    stride=(1, 1),
                    name='deconv%d_2' % (level + 1),
                    weight_init=nn.IIDGaussian(std=0.01))(Xlevel_next_pred_2))
            nonlinearity = nn.rectify if level > 0 else cgt.tanh
            Xlevel_next_pred = nonlinearity(
                SpatialDeconvolution(
                    xlevel_c_dim,
                    xlevelm1_c_dim,
                    kernelshape=(3, 3),
                    pad=(1, 1),
                    stride=(1, 1),
                    name='deconv%d_1' % (level + 1),
                    weight_init=nn.IIDGaussian(std=0.01))(Xlevel_next_pred_1))
            if level in Xlevels_next_pred_0:
                coefs = nn.parameter(nn.init_array(nn.Constant(0.5), (2, )),
                                     name='sum%d.coef' % level)
                Xlevel_next_pred = coefs[0] * Xlevel_next_pred + coefs[
                    1] * Xlevels_next_pred_0[level]
            # TODO: tanh should be after sum
        Xlevels_next_pred[level] = Xlevel_next_pred

    X_next_pred = Xlevels_next_pred[0]
    Y = cgt.concatenate(Ylevels.values(), axis=1)
    Y_diff_pred = cgt.concatenate(Ylevels_diff_pred.values(), axis=1)

    X_diff = cgt.tensor4('X_diff', fixed_shape=(None, ) + x_shape)
    X_next = X + X_diff
    loss = ((X_next - X_next_pred)**2).mean(axis=0).sum() / 2.

    net_name = 'FcnActionCondEncoderNet_levels' + ''.join(
        str(level) for level in levels)
    input_vars = OrderedDict([(var.name, var) for var in [X, U, X_diff]])
    pred_vars = OrderedDict([('Y_diff_pred', Y_diff_pred), ('Y', Y),
                             ('X_next_pred', X_next_pred)])
    return net_name, input_vars, pred_vars, loss
Example #29
def stack(tensors, axis=0):
    if axis != 0:
        raise ValueError('only axis=0 is supported under cgt')
    return cgt.concatenate(map(lambda x: cgt.reshape(x, [1] + x.shape),
                               tensors),
                           axis=0)
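
A quick usage note for the stack helper above: each of the k input tensors of shape s is reshaped to (1,)+s and the results are concatenated, so the output has shape (k,)+s, mirroring np.stack along axis 0. The variables below are hypothetical, assuming cgt is imported as in the other examples:

a = cgt.matrix("a", fixed_shape=(4, 5))
b = cgt.matrix("b", fixed_shape=(4, 5))
c = stack([a, b])   # symbolic tensor of shape (2, 4, 5)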
Example #30
 def get_character_distribution(self, state_bf, context_bf):
     total_state = cgt.concatenate([state_bf, context_bf], axis=1)
     d1 = self.final_out_dense(total_state)
     return softmax(d1, axis=1)
Example #31
                                              fixed_shape_mask="all")
            yname = layer.top[0]
            output = [cgt.broadcast("+", X.dot(W), b, "xx,1x")]
        elif layer.type == "ReLU":
            output = [nn.rectify(inputs[0])]
        elif layer.type == "Softmax":
            output = [nn.softmax(inputs[0])]
        elif layer.type == "LRN":
            # XXX needs params
            param = layer.lrn_param
            output = [
                nn.lrn(inputs[0], param.alpha, param.beta, param.local_size)
            ]
        elif layer.type == "Concat":
            param = layer.concat_param
            output = [cgt.concatenate(inputs, param.concat_dim)]
        elif layer.type == "Dropout":
            output = [nn.dropout(inputs[0])]
        elif layer.type == "SoftmaxWithLoss":
            output = [nn.loglik_softmax(inputs[0], inputs[1])]
        elif layer.type == "Accuracy":
            output = [nn.zero_one_loss(inputs[0], inputs[1])]
        else:
            cgt.error("unrecognized layer type %s" % layer.type)

        assert output is not None

        # assert isinstance(output, cgt.Node)
        for i in xrange(len(layer.top)):
            name2node[layer.top[i]] = output[i]
        print "stored", layer.top[0]
Example #32
File: rrnn.py Project: zobot/rrnn
def flatcat(xs):
    return cgt.concatenate([x.flatten() for x in xs])
Example #33
def stack(tensors, axis=0):
    if axis != 0:
        raise ValueError('only axis=0 is supported under cgt')
    return cgt.concatenate(map(lambda x: cgt.reshape(x, [1] + x.shape), tensors), axis=0)
Example #34
            bname = layer.param[1].name or layer.name+":b"
            bval = np.empty(bshape, dtype=cgt.floatX)
            b = name2node[bname] = cgt.shared(bval, name=bname, fixed_shape_mask="all")
            yname = layer.top[0]
            output = [cgt.broadcast("+",X.dot(W), b, "xx,1x")          ]
        elif layer.type == "ReLU":
            output = [nn.rectify(inputs[0])]
        elif layer.type == "Softmax":
            output = [nn.softmax(inputs[0])]
        elif layer.type == "LRN":
            # XXX needs params
            param = layer.lrn_param
            output = [nn.lrn(inputs[0], param.alpha,param.beta, param.local_size)]
        elif layer.type == "Concat":
            param = layer.concat_param
            output = [cgt.concatenate(inputs, param.concat_dim)            ]
        elif layer.type == "Dropout":
            output = [nn.dropout(inputs[0])]
        elif layer.type == "SoftmaxWithLoss":
            output = [nn.loglik_softmax(inputs[0], inputs[1])]
        elif layer.type == "Accuracy":
            output = [nn.zero_one_loss(inputs[0], inputs[1])]
        else:
            cgt.error("unrecognized layer type %s"%layer.type)

        assert output is not None

        # assert isinstance(output, cgt.Node)
        for i in xrange(len(layer.top)): name2node[layer.top[i]] = output[i]
        print "stored", layer.top[0]
        if layer.type != "Data":
Example #35
 def get_decoder_state(self, context_bf, prev_out_bc, prev_decoder_state):
     input_bf = cgt.concatenate([context_bf, prev_out_bc], axis=1)
     l1 = self.recurrent_decoder_one(input_bf, prev_decoder_state)
     l2 = self.recurrent_decoder_two(l1)
     return l2
Example #36
def flatcat(xs):
    return cgt.concatenate([x.flatten() for x in xs])
Example #37
def concatenate(items, axis=0):
    return cgt.concatenate(items, axis=axis)
Example #38
    def __call__(self, nn_input_btf):

        # Because scan iterates over the first dimension we dimshuffle to
        # (n_time_steps, n_batch, n_features)
        nn_input_tbf = cgt.dimshuffle(nn_input_btf, [1, 0, 2])
        seq_len, num_batch = nn_input_tbf.shape[0], nn_input_tbf.shape[1]

        # Stack input weight matrices into a (num_inputs, 4*num_units) #checks out
        # matrix, which speeds up computation
        W_in_stacked = cgt.concatenate(
            [self.W_in_to_ingate, self.W_in_to_forgetgate,
             self.W_in_to_cell, self.W_in_to_outgate], axis=1)

        # Same for hidden weight matrices
        W_hid_stacked = cgt.concatenate(
            [self.W_hid_to_ingate, self.W_hid_to_forgetgate,
             self.W_hid_to_cell, self.W_hid_to_outgate], axis=1)

        # Stack biases into a (4*num_units) vector
        b_stacked = cgt.concatenate(
            [self.b_ingate, self.b_forgetgate,
             self.b_cell, self.b_outgate], axis=1)

        def slice_w(x, n):
            return x[:, n*self.num_units:(n+1)*self.num_units]

        # Create single recurrent computation step function
        # input_n is the n'th vector of the input
        def step(input_n, cell_previous, hid_previous, W_hid_stacked, W_in_stacked, b_stacked):

            input_n = cgt.broadcast("+", input_n.dot(W_in_stacked), b_stacked, "xx,1x")

            # Calculate gates pre-activations and slice
            gates = input_n + cgt.dot(hid_previous, W_hid_stacked)

            # Extract the pre-activation gate values
            ingate = slice_w(gates, 0)
            forgetgate = slice_w(gates, 1)
            cell_input = slice_w(gates, 2)
            outgate = slice_w(gates, 3)

            # Apply nonlinearities
            ingate = self.nonlinearity_ingate(ingate)
            forgetgate = self.nonlinearity_forgetgate(forgetgate)
            cell_input = self.nonlinearity_cell(cell_input)
            outgate = self.nonlinearity_outgate(outgate)

            # Compute new cell value
            cell = forgetgate*cell_previous + ingate*cell_input
            # Compute new hidden unit activation
            hid = outgate*self.nonlinearity(cell)
            return [cell, hid]

        sequences = nn_input_tbf
        step_fun = step

        ones = cgt.ones((num_batch, 1))
        cell_init = cgt.dot(ones, self.cell_init)
        hid_init = cgt.dot(ones, self.hid_init)

        # The hidden-to-hidden weight matrix is always used in step
        non_seqs = [W_hid_stacked]
        non_seqs += [W_in_stacked, b_stacked]
        cell_out, hid_out = unroll_lstm(
            fn=step_fun,
            sequences=sequences,
            outputs_info=[cell_init, hid_init],
            go_backwards=self.backwards,
            non_sequences=non_seqs,
            n_steps=self.timesteps)


        # dimshuffle back to (n_batch, n_time_steps, n_features)
        hid_out = cgt.dimshuffle(hid_out, [1, 0, 2])

        # if scan is backward reverse the output
        if self.backwards:
            hid_out = cgt.flip(hid_out, [1])

        return hid_out
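
Written out, the step function above is the standard LSTM update; a sketch of the math, with σ_i, σ_f, σ_o the configured gate nonlinearities, tanh the cell-input nonlinearity, and φ the layer's output nonlinearity (self.nonlinearity):

i_t = \sigma_i(W_{xi} x_t + W_{hi} h_{t-1} + b_i)
f_t = \sigma_f(W_{xf} x_t + W_{hf} h_{t-1} + b_f)
o_t = \sigma_o(W_{xo} x_t + W_{ho} h_{t-1} + b_o)
g_t = \tanh(W_{xg} x_t + W_{hg} h_{t-1} + b_g)
c_t = f_t \odot c_{t-1} + i_t \odot g_t
h_t = o_t \odot \varphi(c_t)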
Example #39
    def __init__(self, obs_dim, ctrl_dim):

        cgt.set_precision('double')
        Serializable.__init__(self, obs_dim, ctrl_dim)

        self.obs_dim = obs_dim
        self.ctrl_dim = ctrl_dim

        o_no = cgt.matrix("o_no", fixed_shape=(None, obs_dim))
        a_na = cgt.matrix("a_na", fixed_shape=(None, ctrl_dim))
        adv_n = cgt.vector("adv_n")
        oldpdist_np = cgt.matrix("oldpdist", fixed_shape=(None, 2 * ctrl_dim))
        self.logstd = logstd_1a = nn.parameter(np.zeros((1, self.ctrl_dim)),
                                               name="std_1a")
        std_1a = cgt.exp(logstd_1a)

        # Here's where we apply the network
        h0 = o_no
        nhid = 32
        h1 = cgt.tanh(
            nn.Affine(obs_dim, nhid, weight_init=nn.IIDGaussian(std=0.1))(h0))
        h2 = cgt.tanh(
            nn.Affine(nhid, nhid, weight_init=nn.IIDGaussian(std=0.1))(h1))
        mean_na = nn.Affine(nhid,
                            ctrl_dim,
                            weight_init=nn.IIDGaussian(std=0.01))(h2)

        b = cgt.size(o_no, 0)
        std_na = cgt.repeat(std_1a, b, axis=0)

        oldmean_na = oldpdist_np[:, 0:self.ctrl_dim]
        oldstd_na = oldpdist_np[:, self.ctrl_dim:2 * self.ctrl_dim]

        logp_n = ((-.5) * cgt.square(
            (a_na - mean_na) / std_na).sum(axis=1)) - logstd_1a.sum()
        oldlogp_n = ((-.5) * cgt.square(
            (a_na - oldmean_na) / oldstd_na).sum(axis=1)
                     ) - cgt.log(oldstd_na).sum(axis=1)

        ratio_n = cgt.exp(logp_n - oldlogp_n)

        surr = (ratio_n * adv_n).mean()

        pdists_np = cgt.concatenate([mean_na, std_na], axis=1)
        # kl = cgt.log(sigafter/)

        params = nn.get_parameters(surr)

        oldvar_na = cgt.square(oldstd_na)
        var_na = cgt.square(std_na)
        kl = (cgt.log(std_na / oldstd_na) +
              (oldvar_na + cgt.square(oldmean_na - mean_na)) / (2 * var_na) -
              .5).sum(axis=1).mean()

        lam = cgt.scalar()
        penobj = surr - lam * kl
        self._compute_surr_kl = cgt.function([oldpdist_np, o_no, a_na, adv_n],
                                             [surr, kl])
        self._compute_grad_lagrangian = cgt.function(
            [lam, oldpdist_np, o_no, a_na, adv_n],
            cgt.concatenate([p.flatten() for p in cgt.grad(penobj, params)]))
        self.f_pdist = cgt.function([o_no], pdists_np)

        self.f_objs = cgt.function([oldpdist_np, o_no, a_na, adv_n],
                                   [surr, kl])

        self.pc = ParamCollection(params)
Example #40
File: layers.py Project: TZ2016/snn
def combo_layer(X,
                size_in,
                size_out,
                splits,
                s_funcs=None,
                o_funcs=None,
                name='?',
                dbg_out={}):
    """
    Create a combination of specified sub-layers and non-linearity.

    :param X: symbolic input
    :param size_in: input size (except batch)
    :param size_out: output size (except batch)
    :param splits: split points for applying each sub-layer
    :param s_funcs: list of functions to create each sub-layer
        of signature (input, in_size, out_size, name) -> output
    :param o_funcs: list of non-linearity functions
        of signature (input) -> output
    :param name: layer name
    :type name: str
    :return: symbolic output

    Note for s_funcs and o_funcs:
        - broadcasting enabled if not a list
        - element with value None has no effect (skip)
    """

    assert isinstance(splits, tuple) and len(splits) > 0
    assert all(splits[i] < splits[i + 1] for i in xrange(len(splits) - 1))
    assert splits[0] >= 0 and splits[-1] <= size_out
    splits = list(splits)
    assert not isinstance(o_funcs, list) and not isinstance(s_funcs, list)
    o_funcs = list(o_funcs) if isinstance(o_funcs, tuple) \
        else [o_funcs] * (len(splits) + 1)
    s_funcs = list(s_funcs) if isinstance(s_funcs, tuple) \
        else [s_funcs] * (len(splits) + 1)
    assert len(splits) + 1 == len(o_funcs) == len(s_funcs)
    if splits[0] == 0:
        splits.pop(0)
        o_funcs.pop(0)
        s_funcs.pop(0)
    if len(splits) > 0 and splits[-1] == size_out:
        splits.pop()
        o_funcs.pop()
        s_funcs.pop()
    curr, names, ins, outs = 0, [], [], []
    splits.append(size_out)
    for _split, _f_s, _f_o in zip(splits, s_funcs, o_funcs):
        _name = 'L%s[%d:%d]' % (name, curr, _split)
        _s_out = _split - curr
        _i = X if _f_s is None else _f_s(X, size_in, _s_out, name=_name)
        _o = _i if _f_o is None else _f_o(_i)
        curr = _split
        ins.append(_i)
        outs.append(_o)
        names.append(_name)
    out = cgt.concatenate(outs, axis=1) if len(outs) > 1 else outs[0]
    dbg_out.update(dict(zip([_n + '~in' for _n in names], ins)))
    dbg_out.update(dict(zip([_n + '~out' for _n in names], outs)))
    return out