Example #1
def tinyconv_model(X, w, w2, p_drop):
    l1 = nn.conv2d(X, w, kernelshape=(3, 3), pad=(1, 1), stride=(3, 3))
    l1a = nn.dropout(l1, p_drop)  # apply dropout to the conv activations
    batchsize, channels, rows, cols = l1a.shape
    l1flat = cgt.reshape(l1a, [batchsize, channels * rows * cols])
    pyx = nn.softmax(l1flat.dot(w2))
    return l1, pyx
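A minimal usage sketch for tinyconv_model (not part of the original example): the input size, filter count, weight initialization, and the 10x10 feature-map arithmetic below are all assumptions chosen for illustration.

import numpy as np
import cgt
from cgt import nn

X = cgt.tensor4("X", fixed_shape=(None, 1, 28, 28))  # NCHW input, MNIST-sized
# filter layout assumed to be (out_channels, in_channels, rows, cols)
w = cgt.shared(0.1 * np.random.randn(8, 1, 3, 3).astype(cgt.floatX), name="w")
# with pad=(1, 1) and stride=(3, 3), a 28x28 image maps to a 10x10 feature map
w2 = cgt.shared(0.01 * np.random.randn(8 * 10 * 10, 10).astype(cgt.floatX), name="w2")

l1, pyx = tinyconv_model(X, w, w2, p_drop=0.)  # p_drop=0 gives a deterministic forward pass
predict = cgt.function([X], cgt.argmax(pyx, axis=1))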
Example #2
    def get_context(self, prev_state_bf):

        state_step_bf = self.states_mlp_bf(prev_state_bf)
        state_step_b1f = cgt.dimshuffle(state_step_bf, [0, 'x', 1])

        # Compute the inner product <phi(s_i), psi(h_u)> where phi and psi are MLPs.
        # The below line computes the pointwise product of phi(s_i) and psi(h_u) and then sums to get the inner product.
        # scalar_energies_vec_bt = cgt.sqrt(cgt.sum(cgt.broadcast('*', state_step_b1f, self.features_post_mlp_btf, 'x1x,xxx'), axis=2))

        # Compute tau=tanh(h_u*W + s_i*V), broadcasting to do all h_u mults at once.
        scalar_energies_vec_btf = cgt.tanh(cgt.broadcast('+', self.features_post_mlp_btf, state_step_b1f, 'xxx,x1x'))

        # The next two lines compute w^T*(tau) with a pointwise product and then a sum.
        scalar_energies_vec_btf = cgt.broadcast('*', self.mixing_vec_w, scalar_energies_vec_btf, '11x,xxx')
        scalar_energies_vec_bt = cgt.sum(scalar_energies_vec_btf, axis=2)

        # Softmax over the time dimension produces the blending weights.
        softmax_weights_bt = nn.softmax(scalar_energies_vec_bt, axis=1)

        # Add a singleton feature axis so each weight can multiply all features.
        extended_softmax_bt1 = cgt.dimshuffle(softmax_weights_bt, [0, 1, 'x'])
        # Weight the features by their time-dependent softmax weights.
        pre_blended = cgt.broadcast('*', extended_softmax_bt1, self.features_post_mlp_btf, 'xx1,xxx')
        # Integrate out time.
        blended_features_bf = cgt.sum(pre_blended, axis=1)

        return blended_features_bf
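The comments above describe additive (tanh) attention over the time axis. A plain numpy sketch of the same arithmetic, with made-up batch/time/feature sizes and weights, shows the broadcasting pattern outside of cgt:

import numpy as np

b, t, f = 2, 3, 4                          # assumed batch, time, and feature sizes
features_btf = np.random.randn(b, t, f)    # stands in for psi(h_u) at every time step
state_bf = np.random.randn(b, f)           # stands in for phi(s_i) at the current step

# tau = tanh(h_u*W + s_i*V): broadcast the state across the time axis
tau_btf = np.tanh(features_btf + state_bf[:, None, :])

# scalar energies w^T tau via a pointwise product and a sum over features
w_f = np.random.randn(f)
energies_bt = (tau_btf * w_f).sum(axis=2)

# softmax over the time axis gives the blending weights
e_bt = np.exp(energies_bt - energies_bt.max(axis=1, keepdims=True))
weights_bt = e_bt / e_bt.sum(axis=1, keepdims=True)

# weight the features and integrate out time
blended_bf = (weights_bt[:, :, None] * features_btf).sum(axis=1)
assert blended_bf.shape == (b, f)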
Example #3
    def __init__(self, n_actions):
        Serializable.__init__(self, n_actions)
        cgt.set_precision('double')
        n_in = 128
        o_no = cgt.matrix("o_no",fixed_shape=(None,n_in))
        a_n = cgt.vector("a_n",dtype='i8')
        q_n = cgt.vector("q_n")
        oldpdist_np = cgt.matrix("oldpdists")

        h0 = (o_no - 128.0)/128.0 
        nhid = 64
        h1 = cgt.tanh(nn.Affine(128,nhid,weight_init=nn.IIDGaussian(std=.1))(h0))
        probs_na = nn.softmax(nn.Affine(nhid,n_actions,weight_init=nn.IIDGaussian(std=0.01))(h1))
        logprobs_na = cgt.log(probs_na)
        b = cgt.size(o_no, 0)
        logps_n = logprobs_na[cgt.arange(b), a_n]
        surr = (logps_n*q_n).mean()
        kl = (oldpdist_np * cgt.log(oldpdist_np/probs_na)).sum(axis=1).mean()

        params = nn.get_parameters(surr)
        gradsurr = cgt.grad(surr, params)
        flatgrad = cgt.concatenate([p.flatten() for p in gradsurr])

        lam = cgt.scalar()
        penobj = surr - lam * kl
        self._f_grad_lagrangian = cgt.function([lam, oldpdist_np, o_no, a_n, q_n], 
            cgt.concatenate([p.flatten() for p in cgt.grad(penobj,params)]))
        self.f_pdist = cgt.function([o_no], probs_na)

        self.f_probs = cgt.function([o_no], probs_na)
        self.f_surr_kl = cgt.function([oldpdist_np, o_no, a_n, q_n], [surr, kl])
        self.f_gradlogp = cgt.function([oldpdist_np, o_no, a_n, q_n], flatgrad)

        self.pc = ParamCollection(params)
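The constructor above builds a softmax policy over 128-byte RAM observations and compiles functions for the action distribution, the surrogate objective and KL, and the gradient of the penalized objective surr - lam*kl. A hedged sketch of how these might be called follows; the class name AtariRAMPolicy, the batch size, and n_actions=6 are assumptions, not from the original code.

import numpy as np

policy = AtariRAMPolicy(n_actions=6)  # hypothetical name for the class defined above
obs_no = np.random.randint(0, 256, size=(32, 128)).astype('float64')  # RAM observations
acts_n = np.random.randint(0, 6, size=32).astype('i8')                # sampled actions
advs_n = np.random.randn(32)                                          # advantage estimates

old_pdist_np = policy.f_probs(obs_no)  # per-state action distribution
surr, kl = policy.f_surr_kl(old_pdist_np, obs_no, acts_n, advs_n)
# gradient of surr - lam*kl with respect to the flattened parameters, with lam = 0.1
g = policy._f_grad_lagrangian(0.1, old_pdist_np, obs_no, acts_n, advs_n)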
Example #4
def tinyconv_model(X, w, w2, p_drop):
    l1 = nn.conv2d(X, w, kernelshape=(3,3), pad=(1,1), stride=(3,3))
    l1a = nn.dropout(l1, p_drop)  # apply dropout to the conv activations
    batchsize, channels, rows, cols = l1a.shape
    l1flat = cgt.reshape(l1a, [batchsize, channels*rows*cols])
    pyx = nn.softmax(l1flat.dot(w2))
    return l1, pyx
Example #5
def dense_model(X, w_h, w_h2, w_o, p_drop_input, p_drop_hidden):
    X = nn.dropout(X, p_drop_input)
    h = nn.rectify(cgt.dot(X, w_h))

    h = nn.dropout(h, p_drop_hidden)
    h2 = nn.rectify(cgt.dot(h, w_h2))

    h2 = nn.dropout(h2, p_drop_hidden)
    py_x = nn.softmax(cgt.dot(h2, w_o))
    return py_x
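A minimal training-step sketch for dense_model, assuming MNIST-sized inputs; the init_weights helper, the layer sizes, and the learning rate are illustrative choices, and nn.sgd (used in the later examples) could replace the hand-written update list.

import numpy as np
import cgt
from cgt import nn

def init_weights(*shape):
    # hypothetical helper: small Gaussian init stored in a cgt shared variable
    return cgt.shared(0.01 * np.random.randn(*shape).astype(cgt.floatX))

X = cgt.matrix("X", fixed_shape=(None, 784))
y = cgt.vector("y", dtype='i8')
w_h, w_h2, w_o = init_weights(784, 625), init_weights(625, 625), init_weights(625, 10)

py_x = dense_model(X, w_h, w_h2, w_o, p_drop_input=0.2, p_drop_hidden=0.5)

# negative log-likelihood of the correct class, indexed as in the policy example above
b = cgt.size(X, 0)
cost = -cgt.log(py_x[cgt.arange(b), y]).mean()

params = [w_h, w_h2, w_o]
grads = cgt.grad(cost, params)
updates = [(p, p - 1e-3 * g) for p, g in zip(params, grads)]  # plain SGD step
train = cgt.function([X, y], cost, updates=updates)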
Example #7
def dense_model3(X, w_h, w_h2, w_h3, w_o, p_drop_input, p_drop_hidden):
    X = nn.dropout(X, p_drop_input)
    h = nn.rectify(cgt.dot(X, w_h))

    h = nn.dropout(h, p_drop_hidden[0])
    h2 = nn.rectify(cgt.dot(h, w_h2))

    h2 = nn.dropout(h2, p_drop_hidden[1])
    h3 = nn.rectify(cgt.dot(h2, w_h3))

    h3 = nn.dropout(h3, p_drop_hidden[2])
    py_x = nn.softmax(cgt.dot(h3, w_o))
    return py_x
Example #8
def convnet_model(X, w, w2, w3, w4, w_o, p_drop_conv, p_drop_hidden):
    l1a = nn.rectify(nn.conv2d(X, w, kernelshape=(3, 3), pad=(1, 1)))
    l1 = nn.max_pool_2d(l1a, kernelshape=(2, 2), stride=(2, 2))
    l1 = nn.dropout(l1, p_drop_conv)

    l2a = nn.rectify(nn.conv2d(l1, w2, kernelshape=(3, 3), pad=(1, 1)))
    l2 = nn.max_pool_2d(l2a, kernelshape=(2, 2), stride=(2, 2))
    l2 = nn.dropout(l2, p_drop_conv)

    l3a = nn.rectify(nn.conv2d(l2, w3, kernelshape=(3, 3), pad=(1, 1)))
    l3b = nn.max_pool_2d(l3a, kernelshape=(2, 2), stride=(2, 2))
    batchsize, channels, rows, cols = l3b.shape
    l3 = cgt.reshape(l3b, [batchsize, channels * rows * cols])
    l3 = nn.dropout(l3, p_drop_conv)

    l4 = nn.rectify(cgt.dot(l3, w4))
    l4 = nn.dropout(l4, p_drop_hidden)

    pyx = nn.softmax(cgt.dot(l4, w_o))
    return pyx
Example #9
def convnet_model(X, w, w2, w3, w4, w_o, p_drop_conv, p_drop_hidden):
    l1a = nn.rectify(nn.conv2d(X, w, kernelshape=(3,3), pad=(1,1)))
    l1 = nn.max_pool_2d(l1a, kernelshape=(2, 2), stride=(2,2))
    l1 = nn.dropout(l1, p_drop_conv)

    l2a = nn.rectify(nn.conv2d(l1, w2, kernelshape=(3,3), pad=(1,1)))
    l2 = nn.max_pool_2d(l2a, kernelshape=(2, 2), stride=(2,2))
    l2 = nn.dropout(l2, p_drop_conv)

    l3a = nn.rectify(nn.conv2d(l2, w3, kernelshape=(3,3), pad=(1,1)))
    l3b = nn.max_pool_2d(l3a, kernelshape=(2, 2), stride=(2,2))
    batchsize,channels,rows,cols = l3b.shape
    l3 = cgt.reshape(l3b, [batchsize, channels*rows*cols])
    l3 = nn.dropout(l3, p_drop_conv)

    l4 = nn.rectify(cgt.dot(l3, w4))
    l4 = nn.dropout(l4, p_drop_hidden)
    
    pyx = nn.softmax(cgt.dot(l4, w_o))
    return pyx
Example #10
    def get_context_backup(self, prev_state_bf):
        state_step_bf = cgt.sigmoid(self.states_mlp_bf(prev_state_bf))

        # Inner product <state, feature> for each of the three time steps.
        product_list = []
        for time_step in range(0, 3):
            inner_product = cgt.sum(state_step_bf*self.features_post_mlp_btf[:, time_step, :], axis=1)
            product_list.append(inner_product)
        st = cgt.stack(product_list)
        st = cgt.dimshuffle(st, [1, 0])
        # Softmax over the time axis gives the attention weights.
        softmax_weights = softmax(st)

        # Accumulate the weighted features over time.
        blended = None
        for time_step in range(0, 3):
            softmax_t_step = cgt.dimshuffle(softmax_weights[:, time_step], [0, 'x'])
            if blended is None:
                blended = cgt.broadcast('*', softmax_t_step, self.features_post_mlp_btf[:, time_step, :], 'x1,xx')
            else:
                blended += cgt.broadcast('*', softmax_t_step, self.features_post_mlp_btf[:, time_step, :], 'x1,xx')

        return blended
Example #11
    def __init__(self, n_actions):
        Serializable.__init__(self, n_actions)
        cgt.set_precision('double')
        n_in = 128
        o_no = cgt.matrix("o_no", fixed_shape=(None, n_in))
        a_n = cgt.vector("a_n", dtype='i8')
        q_n = cgt.vector("q_n")
        oldpdist_np = cgt.matrix("oldpdists")

        h0 = (o_no - 128.0) / 128.0
        nhid = 64
        h1 = cgt.tanh(
            nn.Affine(128, nhid, weight_init=nn.IIDGaussian(std=.1))(h0))
        probs_na = nn.softmax(
            nn.Affine(nhid, n_actions,
                      weight_init=nn.IIDGaussian(std=0.01))(h1))
        logprobs_na = cgt.log(probs_na)
        b = cgt.size(o_no, 0)
        logps_n = logprobs_na[cgt.arange(b), a_n]
        surr = (logps_n * q_n).mean()
        kl = (oldpdist_np * cgt.log(oldpdist_np / probs_na)).sum(axis=1).mean()

        params = nn.get_parameters(surr)
        gradsurr = cgt.grad(surr, params)
        flatgrad = cgt.concatenate([p.flatten() for p in gradsurr])

        lam = cgt.scalar()
        penobj = surr - lam * kl
        self._f_grad_lagrangian = cgt.function(
            [lam, oldpdist_np, o_no, a_n, q_n],
            cgt.concatenate([p.flatten() for p in cgt.grad(penobj, params)]))
        self.f_pdist = cgt.function([o_no], probs_na)

        self.f_probs = cgt.function([o_no], probs_na)
        self.f_surr_kl = cgt.function([oldpdist_np, o_no, a_n, q_n],
                                      [surr, kl])
        self.f_gradlogp = cgt.function([oldpdist_np, o_no, a_n, q_n], flatgrad)

        self.pc = ParamCollection(params)
Example #12
y = cgt.vector('y', dtype='i8')

conv1 = nn.rectify(
        nn.SpatialConvolution(1, 32, kernelshape=(3,3), stride=(1,1), pad=(1,1), weight_init=nn.IIDGaussian(std=.1))(X)
        )
pool1 = nn.max_pool_2d(conv1, kernelshape=(2,2), stride=(2,2))

conv2 = nn.rectify(
        nn.SpatialConvolution(32, 32, kernelshape=(3,3), stride=(1,1), pad=(1,1), weight_init=nn.IIDGaussian(std=.1))(pool1)
        )
pool2 = nn.max_pool_2d(conv2, kernelshape=(2,2), stride=(2,2))
d0, d1, d2, d3 = pool2.shape

flat = pool2.reshape([d0, d1*d2*d3])
nfeats = cgt.infer_shape(flat)[1]
probs = nn.softmax(nn.Affine(nfeats, 10)(flat))
cost = -categorical.loglik(y, probs).mean()

y_preds = cgt.argmax(probs, axis=1)
err = cgt.cast(cgt.not_equal(y, y_preds), cgt.floatX).mean()

params = nn.get_parameters(cost)
updates = nn.sgd(cost, params, 1e-3) 

# training function
f = cgt.function(inputs=[X, y], outputs=[], updates=updates)
# compute the cost and error
cost_and_err = cgt.function(inputs=[X, y], outputs=[cost, err])

for i in xrange(epochs):
    t0 = time.time()
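    # Sketch of a plausible body for this (truncated) loop; Xtrain, ytrain,
    # Xtest, ytest, batch_size, and epochs are assumed to exist in the full script.
    for start in xrange(0, Xtrain.shape[0], batch_size):
        f(Xtrain[start:start + batch_size], ytrain[start:start + batch_size])  # one SGD update
    cost_val, err_val = cost_and_err(Xtest, ytest)
    print "epoch %i (%.2fs): cost %.4f, error %.4f" % (i, time.time() - t0, cost_val, err_val)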
Example #13
 def get_character_distribution(self, state_bf, context_bf):
     total_state = cgt.concatenate([state_bf, context_bf], axis=1)
     d1 = self.final_out_dense(total_state)
     return softmax(d1, axis=1)
Example #14
     Wval = np.empty(Wshape, dtype=cgt.floatX)
     W = name2node[Wname] = cgt.shared(Wval,
                                       name=Wname,
                                       fixed_shape_mask="all")
     bshape = (1, param.num_output)
     bname = layer.param[1].name or layer.name + ":b"
     bval = np.empty(bshape, dtype=cgt.floatX)
     b = name2node[bname] = cgt.shared(bval,
                                       name=bname,
                                       fixed_shape_mask="all")
     yname = layer.top[0]
     output = [cgt.broadcast("+", X.dot(W), b, "xx,1x")]
 elif layer.type == "ReLU":
     output = [nn.rectify(inputs[0])]
 elif layer.type == "Softmax":
     output = [nn.softmax(inputs[0])]
 elif layer.type == "LRN":
     # XXX needs params
     param = layer.lrn_param
     output = [
         nn.lrn(inputs[0], param.alpha, param.beta, param.local_size)
     ]
 elif layer.type == "Concat":
     param = layer.concat_param
     output = [cgt.concatenate(inputs, param.concat_dim)]
 elif layer.type == "Dropout":
     output = [nn.dropout(inputs[0])]
 elif layer.type == "SoftmaxWithLoss":
     output = [nn.loglik_softmax(inputs[0], inputs[1])]
 elif layer.type == "Accuracy":
     output = [nn.zero_one_loss(inputs[0], inputs[1])]
Example #15
     param = layer.inner_product_param
     nchanin = infer_shape(X)[1]
     Wshape = (param.num_output, nchanin)
     Wname = layer.param[0].name or layer.name+":W"
     Wval = np.empty(Wshape, dtype=cgt.floatX)
     W = name2node[Wname] = cgt.shared(Wval, name=Wname, fixed_shape_mask="all")
     bshape = (1, param.num_output)
     bname = layer.param[1].name or layer.name+":b"
     bval = np.empty(bshape, dtype=cgt.floatX)
     b = name2node[bname] = cgt.shared(bval, name=bname, fixed_shape_mask="all")
     yname = layer.top[0]
     output = [cgt.broadcast("+",X.dot(W), b, "xx,1x")          ]
 elif layer.type == "ReLU":
     output = [nn.rectify(inputs[0])]
 elif layer.type == "Softmax":
     output = [nn.softmax(inputs[0])]
 elif layer.type == "LRN":
     # XXX needs params
     param = layer.lrn_param
     output = [nn.lrn(inputs[0], param.alpha, param.beta, param.local_size)]
 elif layer.type == "Concat":
     param = layer.concat_param
     output = [cgt.concatenate(inputs, param.concat_dim)]
 elif layer.type == "Dropout":
     output = [nn.dropout(inputs[0])]
 elif layer.type == "SoftmaxWithLoss":
     output = [nn.loglik_softmax(inputs[0], inputs[1])]
 elif layer.type == "Accuracy":
     output = [nn.zero_one_loss(inputs[0], inputs[1])]
 else:
     cgt.error("unrecognized layer type %s"%layer.type)
np.random.seed(42)
sortinds = np.random.permutation(Xtrain.shape[0])
Xtrain = Xtrain[sortinds]
ytrain = ytrain[sortinds]

# Model:
# Three affine layers with ReLU activations after the first two,
# followed by a softmax over the 10 classes.
X = cgt.matrix('X', fixed_shape=(None, 784))
y = cgt.vector('y', dtype='i8')

layer1 = nn.Affine(784, 400, weight_init=nn.XavierNormal())(X)
act1 = nn.rectify(layer1)
layer2 = nn.Affine(400, 400, weight_init=nn.XavierNormal())(act1)
act2 = nn.rectify(layer2)
probs = nn.softmax(nn.Affine(400, 10)(act2))

y_preds = cgt.argmax(probs, axis=1)
cost = -cgt.mean(categorical.loglik(y, probs))
err = cgt.cast(cgt.not_equal(y, y_preds), cgt.floatX).mean()

params = nn.get_parameters(cost)
updates = nn.sgd(cost, params, learning_rate) # train via sgd

# training function
f = cgt.function(inputs=[X, y], outputs=[], updates=updates)
# compute the cost and error
cost_and_err = cgt.function(inputs=[X, y], outputs=[cost, err])

for i in xrange(epochs):
    t0 = time.time()