Example #1
def test_get_train_objective():
    batch_size = 32
    feat_t_steps = 5
    feat_num_features = 256
    max_label_length = 5
    num_out_classes = 27
    feats = cgt.tensor3(fixed_shape=(batch_size, feat_t_steps, feat_num_features))
    ground_labels_basis = cgt.tensor3(fixed_shape=(batch_size, max_label_length, num_out_classes))
    seq2seq = nnbuilder.Seq2Seq(nn_input_btf=feats, num_out_classes=num_out_classes)
    train_objective = seq2seq.get_train_objective(max_label_length=max_label_length,
                                                  ground_labels_basis_btc=ground_labels_basis)
    train_shape = cgt.infer_shape(train_objective)
    assert train_shape == ()
    nn.get_parameters(train_objective)  # smoke check: collecting the trainable parameters should not raise
Example #2
def save_weights(network_out_layer, file_name_for_saving):
    all_params = get_parameters(network_out_layer)
    param_values = []
    for param in all_params:
        param_values.append(param.op.get_value())

    pickle.dump(param_values, open(file_name_for_saving+".p", "wb"))
Example #3
def make_funcs(config, dbg_out={}):
    net_in, net_out = hybrid_network(config['num_inputs'], config['num_outputs'],
                                     config['num_units'], config['num_sto'],
                                     dbg_out=dbg_out)
    if not config['dbg_out_full']: dbg_out = {}
    # def f_sample(_inputs, num_samples=1, flatten=False):
    #     _mean, _var = f_step(_inputs)
    #     _samples = []
    #     for _m, _v in zip(_mean, _var):
    #         _s = np.random.multivariate_normal(_m, np.diag(np.sqrt(_v)), num_samples)
    #         if flatten: _samples.extend(_s)
    #         else: _samples.append(_s)
    #     return np.array(_samples)
    Y_gt = cgt.matrix("Y")
    Y_prec = cgt.tensor3('V', fixed_shape=(None, config['num_inputs'], config['num_inputs']))
    params = nn.get_parameters(net_out)
    size_batch, size_out = net_out.shape
    inputs, outputs = [net_in], [net_out]
    if config['no_bias']:
        print "Excluding bias"
        params = [p for p in params if not p.name.endswith(".b")]
    loss_vec = dist.gaussian.logprob(Y_gt, net_out, Y_prec)
    if config['weight_decay'] > 0.:
        print "Applying penalty on parameter norm"
        params_flat = cgt.concatenate([p.flatten() for p in params])
        loss_param = config['weight_decay'] * cgt.sum(params_flat ** 2)
        loss_vec -= loss_param # / size_batch
    loss = cgt.sum(loss_vec) / size_batch

    # TODO_TZ f_step seems not to fail if X has wrong dim
    f_step = cgt.function(inputs, outputs)
    f_surr = get_surrogate_func(inputs + [Y_prec, Y_gt], outputs,
                                [loss_vec], params, _dbg_out=dbg_out)

    return params, f_step, None, None, None, f_surr
Example #4
    def __init__(self, n_actions):
        Serializable.__init__(self, n_actions)
        cgt.set_precision('double')
        n_in = 128
        o_no = cgt.matrix("o_no",fixed_shape=(None,n_in))
        a_n = cgt.vector("a_n",dtype='i8')
        q_n = cgt.vector("q_n")
        oldpdist_np = cgt.matrix("oldpdists")

        h0 = (o_no - 128.0)/128.0 
        nhid = 64
        h1 = cgt.tanh(nn.Affine(128,nhid,weight_init=nn.IIDGaussian(std=.1))(h0))
        probs_na = nn.softmax(nn.Affine(nhid,n_actions,weight_init=nn.IIDGaussian(std=0.01))(h1))
        logprobs_na = cgt.log(probs_na)
        b = cgt.size(o_no, 0)
        logps_n = logprobs_na[cgt.arange(b), a_n]
        surr = (logps_n*q_n).mean()
        kl = (oldpdist_np * cgt.log(oldpdist_np/probs_na)).sum(axis=1).mean()

        params = nn.get_parameters(surr)
        gradsurr = cgt.grad(surr, params)
        flatgrad = cgt.concatenate([p.flatten() for p in gradsurr])

        lam = cgt.scalar()
        penobj = surr - lam * kl
        self._f_grad_lagrangian = cgt.function([lam, oldpdist_np, o_no, a_n, q_n], 
            cgt.concatenate([p.flatten() for p in cgt.grad(penobj,params)]))
        self.f_pdist = cgt.function([o_no], probs_na)

        self.f_probs = cgt.function([o_no], probs_na)
        self.f_surr_kl = cgt.function([oldpdist_np, o_no, a_n, q_n], [surr, kl])
        self.f_gradlogp = cgt.function([oldpdist_np, o_no, a_n, q_n], flatgrad)

        self.pc = ParamCollection(params)
Example #5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--profile",action="store_true")
    parser.add_argument("--unittest",action="store_true")
    parser.add_argument("--epochs",type=int,default=10)
    args = parser.parse_args()

    batchsize = 64
    Xshape = (batchsize, 3, 32, 32)
    X = cgt.tensor4("X", fixed_shape = Xshape)
    y = cgt.vector("y", fixed_shape = (batchsize,), dtype='i4')

    conv1 = nn.SpatialConvolution(3, 32, kernelshape=(5,5), pad=(2,2), 
        weight_init=nn.IIDGaussian(std=1e-4))(X)
    relu1 = nn.rectify(conv1)
    pool1 = nn.max_pool_2d(relu1, kernelshape=(3,3), stride=(2,2))
    conv2 = nn.SpatialConvolution(32, 32, kernelshape=(5,5), pad=(2,2), 
        weight_init=nn.IIDGaussian(std=0.01))(pool1)
    relu2 = nn.rectify(conv2)
    pool2 = nn.max_pool_2d(relu2, kernelshape=(3,3), stride=(2,2))
    conv3 = nn.SpatialConvolution(32, 64, kernelshape=(5,5), pad=(2,2), 
        weight_init=nn.IIDGaussian(std=0.01))(pool2)
    pool3 = nn.max_pool_2d(conv3, kernelshape=(3,3), stride=(2,2))
    relu3 = nn.rectify(pool3)
    d0,d1,d2,d3 = relu3.shape
    flatlayer = relu3.reshape([d0,d1*d2*d3])
    nfeats = cgt.infer_shape(flatlayer)[1]
    ip1 = nn.Affine(nfeats, 10)(flatlayer)
    logprobs = nn.logsoftmax(ip1)
    loss = -logprobs[cgt.arange(batchsize), y].mean()

    params = nn.get_parameters(loss)
    updates = rmsprop_updates(loss, params, stepsize=1e-3)
    
    train = cgt.function(inputs=[X, y], outputs=[loss], updates=updates)

    if args.profile: cgt.profiler.start()

    data = np.load("/Users/joschu/Data/cifar-10-batches-py/cifar10.npz")
    Xtrain = data["X_train"]
    ytrain = data["y_train"]

    print fmt_row(10, ["Epoch","Train NLL","Train Err","Test NLL","Test Err","Epoch Time"])
    for i_epoch in xrange(args.epochs):
        for start in xrange(0, Xtrain.shape[0], batchsize):
            tstart = time.time()
            end = start+batchsize
            print train(Xtrain[start:end], ytrain[start:end]), time.time()-tstart
            if start > batchsize*5: break
        # elapsed = time.time() - tstart
        # trainerr, trainloss = computeloss(Xtrain[:len(Xtest)], ytrain[:len(Xtest)])
        # testerr, testloss = computeloss(Xtest, ytest)
        # print fmt_row(10, [i_epoch, trainloss, trainerr, testloss, testerr, elapsed])
        if args.profile: 
            cgt.profiler.print_stats()
            return
        if args.unittest:
            break
Example #6
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--profile",action="store_true")
    parser.add_argument("--unittest",action="store_true")
    parser.add_argument("--epochs",type=int,default=10)
    args = parser.parse_args()

    batchsize = 64
    Xshape = (batchsize, 3, 32, 32)
    X = cgt.tensor4("X", fixed_shape = Xshape)
    y = cgt.vector("y", fixed_shape = (batchsize,), dtype='i4')

    conv1 = nn.SpatialConvolution(3, 32, kernelshape=(5,5), pad=(2,2), 
        weight_init=nn.IIDGaussian(std=1e-4))(X)
    relu1 = nn.rectify(conv1)
    pool1 = nn.max_pool_2d(relu1, kernelshape=(3,3), stride=(2,2))
    conv2 = nn.SpatialConvolution(32, 32, kernelshape=(5,5), pad=(2,2), 
        weight_init=nn.IIDGaussian(std=0.01))(pool1)
    relu2 = nn.rectify(conv2)
    pool2 = nn.max_pool_2d(relu2, kernelshape=(3,3), stride=(2,2))
    conv3 = nn.SpatialConvolution(32, 64, kernelshape=(5,5), pad=(2,2), 
        weight_init=nn.IIDGaussian(std=0.01))(pool2)
    pool3 = nn.max_pool_2d(conv3, kernelshape=(3,3), stride=(2,2))
    relu3 = nn.rectify(pool3)
    d0,d1,d2,d3 = relu3.shape
    flatlayer = relu3.reshape([d0,d1*d2*d3])
    nfeats = cgt.infer_shape(flatlayer)[1]
    ip1 = nn.Affine(nfeats, 10)(flatlayer)
    logprobs = nn.logsoftmax(ip1)
    loss = -logprobs[cgt.arange(batchsize), y].mean()

    params = nn.get_parameters(loss)
    updates = rmsprop_updates(loss, params, stepsize=1e-3)
    
    train = cgt.function(inputs=[X, y], outputs=[loss], updates=updates)

    if args.profile: cgt.profiler.start()

    data = fetch_dataset("http://rll.berkeley.edu/cgt-data/cifar10.npz")
    Xtrain = data["X_train"]
    ytrain = data["y_train"]

    print fmt_row(10, ["Epoch","Train NLL","Train Err","Test NLL","Test Err","Epoch Time"])
    for i_epoch in xrange(args.epochs):
        for start in xrange(0, Xtrain.shape[0], batchsize):
            tstart = time.time()
            end = start+batchsize
            print train(Xtrain[start:end], ytrain[start:end]), time.time()-tstart
            if start > batchsize*5: break
        # elapsed = time.time() - tstart
        # trainerr, trainloss = computeloss(Xtrain[:len(Xtest)], ytrain[:len(Xtest)])
        # testerr, testloss = computeloss(Xtest, ytest)
        # print fmt_row(10, [i_epoch, trainloss, trainerr, testloss, testerr, elapsed])
        if args.profile: 
            cgt.profiler.print_stats()
            return
        if args.unittest:
            break
Example #7
 def make_updater_fc():
     X = cgt.matrix("X", fixed_shape=(None, 28 * 28))
     y = cgt.vector("y", dtype="i8")
     stepsize = cgt.scalar("stepsize")
     loss = build_fc_return_loss(X, y)
     params = nn.get_parameters(loss)
     gparams = cgt.grad(loss, params)
     updates = [(p, p - stepsize * gp) for (p, gp) in zip(params, gparams)]
     return cgt.function([X, y, stepsize], loss, updates=updates)
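The updater returned here is one compiled SGD step: it takes a minibatch and a step size, applies the parameter updates in place, and returns the loss. A minimal usage sketch follows; the load_mnist loader is hypothetical, standing in for whatever supplies flattened images and integer labels:

def train_fc_sketch(num_epochs=3, batch_size=128, stepsize=0.01):
    updater = make_updater_fc()
    Xdata, ydata = load_mnist()  # hypothetical loader: (N, 784) float images, (N,) integer labels
    for i_epoch in range(num_epochs):
        for start in range(0, Xdata.shape[0], batch_size):
            end = start + batch_size
            loss = updater(Xdata[start:end], ydata[start:end], stepsize)
        print("epoch %d, last minibatch loss %s" % (i_epoch, loss))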
Example #8
 def make_updater_fc():
     X = cgt.matrix("X", fixed_shape=(None, 28 * 28))
     y = cgt.vector("y", dtype='i8')
     stepsize = cgt.scalar("stepsize")
     loss = build_fc_return_loss(X, y)
     params = nn.get_parameters(loss)
     gparams = cgt.grad(loss, params)
     updates = [(p, p - stepsize * gp) for (p, gp) in zip(params, gparams)]
     return cgt.function([X, y, stepsize], loss, updates=updates)
Example #9
    def make_updater_convnet():
        X = cgt.tensor4("X", fixed_shape=(None, 1, 28, 28))  # so shapes can be inferred
        y = cgt.vector("y", dtype="i8")
        stepsize = cgt.scalar("stepsize")
        loss = build_convnet_return_loss(X, y)

        params = nn.get_parameters(loss)
        gparams = cgt.grad(loss, params)
        updates = [(p, p - stepsize * gp) for (p, gp) in zip(params, gparams)]
        return cgt.function([X, y, stepsize], loss, updates=updates)
Example #10
    def __init__(self, obs_dim, ctrl_dim):

        cgt.set_precision('double')
        Serializable.__init__(self, obs_dim, ctrl_dim)

        self.obs_dim = obs_dim
        self.ctrl_dim = ctrl_dim

        o_no = cgt.matrix("o_no",fixed_shape=(None,obs_dim))
        a_na = cgt.matrix("a_na",fixed_shape = (None, ctrl_dim))
        adv_n = cgt.vector("adv_n")
        oldpdist_np = cgt.matrix("oldpdist", fixed_shape=(None, 2*ctrl_dim))
        self.logstd = logstd_1a = nn.parameter(np.zeros((1, self.ctrl_dim)), name="std_1a")
        std_1a = cgt.exp(logstd_1a)

        # Here's where we apply the network
        h0 = o_no
        nhid = 32
        h1 = cgt.tanh(nn.Affine(obs_dim,nhid,weight_init=nn.IIDGaussian(std=0.1))(h0))
        h2 = cgt.tanh(nn.Affine(nhid,nhid,weight_init=nn.IIDGaussian(std=0.1))(h1))
        mean_na = nn.Affine(nhid,ctrl_dim,weight_init=nn.IIDGaussian(std=0.01))(h2)

        b = cgt.size(o_no, 0)
        std_na = cgt.repeat(std_1a, b, axis=0)

        oldmean_na = oldpdist_np[:, 0:self.ctrl_dim]
        oldstd_na = oldpdist_np[:, self.ctrl_dim:2*self.ctrl_dim]

        logp_n = ((-.5) * cgt.square( (a_na - mean_na) / std_na ).sum(axis=1)) - logstd_1a.sum()
        oldlogp_n = ((-.5) * cgt.square( (a_na - oldmean_na) / oldstd_na ).sum(axis=1)) - cgt.log(oldstd_na).sum(axis=1)

        ratio_n = cgt.exp(logp_n - oldlogp_n)

        surr = (ratio_n*adv_n).mean()

        pdists_np = cgt.concatenate([mean_na, std_na], axis=1)
        # kl = cgt.log(sigafter/)

        params = nn.get_parameters(surr)

        oldvar_na = cgt.square(oldstd_na)
        var_na = cgt.square(std_na)
        kl = (cgt.log(std_na / oldstd_na) + (oldvar_na + cgt.square(oldmean_na - mean_na)) / (2 * var_na) - .5).sum(axis=1).mean()


        lam = cgt.scalar()
        penobj = surr - lam * kl
        self._compute_surr_kl = cgt.function([oldpdist_np, o_no, a_na, adv_n], [surr, kl])
        self._compute_grad_lagrangian = cgt.function([lam, oldpdist_np, o_no, a_na, adv_n], 
            cgt.concatenate([p.flatten() for p in cgt.grad(penobj,params)]))
        self.f_pdist = cgt.function([o_no], pdists_np)

        self.f_objs = cgt.function([oldpdist_np, o_no, a_na, adv_n], [surr, kl])

        self.pc = ParamCollection(params)
Example #11
    def make_updater_convnet():
        X = cgt.tensor4("X", fixed_shape=(None, 1, 28,
                                          28))  # so shapes can be inferred
        y = cgt.vector("y", dtype='i8')
        stepsize = cgt.scalar("stepsize")
        loss = build_convnet_return_loss(X, y)

        params = nn.get_parameters(loss)
        gparams = cgt.grad(loss, params)
        updates = [(p, p - stepsize * gp) for (p, gp) in zip(params, gparams)]
        return cgt.function([X, y, stepsize], loss, updates=updates)
Example #12
    def make_updater_convnet_parallel():
        X = cgt.tensor4("X", fixed_shape=(None, 1, 28, 28))  # so shapes can be inferred
        y = cgt.vector("y", dtype="i8")
        stepsize = cgt.scalar("stepsize")
        loss = build_convnet_return_loss(X, y)

        m = nn.Module([X, y], [loss])
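        # wrap the whole loss graph as a reusable Module, then re-apply it to four
        # slices of the minibatch; the gradient is taken through the averaged split
        # losses, so the four sub-graphs can be evaluated in parallel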
        split_loss = 0
        for start in xrange(0, batch_size, batch_size // 4):
            sli = slice(start, start + batch_size // 4)
            split_loss += m([X[sli], y[sli]])[0]
        split_loss /= 4
        params = nn.get_parameters(loss)
        gparams = cgt.grad(split_loss, params)
        updates2 = [(p, p - stepsize * gp) for (p, gp) in zip(params, gparams)]
        return cgt.function([X, y, stepsize], split_loss, updates=updates2)
Example #13
    def make_updater_fc_parallel():
        X = cgt.matrix("X", fixed_shape=(None,28*28))
        y = cgt.vector("y",dtype='i8')
        stepsize = cgt.scalar("stepsize")

        loss = build_fc_return_loss(X,y)
        params = nn.get_parameters(loss)        
        m = nn.Module([X,y], [loss])
        split_loss = 0
        for start in xrange(0, batch_size, batch_size//4):
            sli = slice(start, start+batch_size//4)
            split_loss += m([X[sli], y[sli]])[0]
        split_loss /= 4
        gparams = cgt.grad(split_loss, params)
        updates2 = [(p, p-stepsize*gp) for (p, gp) in zip(params, gparams)]
        return cgt.function([X,y, stepsize], split_loss, updates=updates2)
Example #14
    def make_updater_fc_parallel():
        X = cgt.matrix("X", fixed_shape=(None, 28 * 28))
        y = cgt.vector("y", dtype='i8')
        stepsize = cgt.scalar("stepsize")

        loss = build_fc_return_loss(X, y)
        params = nn.get_parameters(loss)
        m = nn.Module([X, y], [loss])
        split_loss = 0
        for start in xrange(0, batch_size, batch_size // 4):
            sli = slice(start, start + batch_size // 4)
            split_loss += m([X[sli], y[sli]])[0]
        split_loss /= 4
        gparams = cgt.grad(split_loss, params)
        updates2 = [(p, p - stepsize * gp) for (p, gp) in zip(params, gparams)]
        return cgt.function([X, y, stepsize], split_loss, updates=updates2)
Example #15
def main():
    print("Loading data...")
    X = cgt.matrix("X", fixed_shape=(None, 28*28))
    y = cgt.vector("y", dtype='i8')

    model = build_model(X, 0.0)
    loss = -cgt.mean(categorical.loglik(y, model))

    updates = nn.rmsprop(loss, nn.get_parameters(loss), 0.01)
    train = cgt.function(inputs=[X, y], outputs=[], updates=updates)

    y_nodrop = cgt.argmax(model, axis=1)

    cost_nodrop = -cgt.mean(categorical.loglik(y, model))
    err_nodrop = cgt.cast(cgt.not_equal(y_nodrop, y), cgt.floatX).mean()

    computeloss = cgt.function(inputs=[X, y], outputs=[err_nodrop, cost_nodrop])


    batch_size=128
    Xdata, ydata = load_data()

    Xtrain = Xdata[0:60000]
    ytrain = ydata[0:60000]

    Xtest = Xdata[60000:70000]
    ytest = ydata[60000:70000]

    sortinds = np.random.permutation(60000)
    Xtrain = Xtrain[sortinds]
    ytrain = ytrain[sortinds]

    print fmt_row(10, ["Epoch","Train NLL","Train Err","Test NLL","Test Err","Epoch Time"])
    for i_epoch in xrange(3):
        tstart = time.time()
        for start in xrange(0, Xtrain.shape[0], batch_size):
            end = start+batch_size
            train(Xtrain[start:end], ytrain[start:end])
        elapsed = time.time() - tstart
        trainerr, trainloss = computeloss(Xtrain[:len(Xtest)], ytrain[:len(Xtest)])
        testerr, testloss = computeloss(Xtest, ytest)
        print fmt_row(10, [i_epoch, trainloss, trainerr, testloss, testerr, elapsed])

    nnbuilder.save_weights(model, 'mnist')
Example #16
def main(num_epochs=NUM_EPOCHS):
    #cgt.set_precision('half')
    print("Building network ...")
    # Recurrent layers expect input of shape
    # (batch size, max sequence length, number of features)
    X = cgt.tensor3(name='X', fixed_shape=(N_BATCH, MAX_LENGTH, 2))
    l_forward = nnbuilder.recurrentLayer(nn_input=X, num_units=N_HIDDEN)
    l_backward = nnbuilder.recurrentLayer(nn_input=X, num_units=N_HIDDEN, backwards=True)
    #l_forward = nnbuilder.LSTMLayer(nn_input=X, num_units=N_HIDDEN, activation=cgt.sigmoid)
    #l_backward = nnbuilder.LSTMLayer(nn_input=X, num_units=N_HIDDEN, activation=cgt.sigmoid, backwards=True)
    #l_forward = nnbuilder.GRULayer(nn_input=X, num_units=N_HIDDEN, activation=nn.rectify)
    #l_backward = nnbuilder.GRULayer(nn_input=X, num_units=N_HIDDEN, activation=nn.rectify, backwards=True)
    l_forward_slice = l_forward[:, MAX_LENGTH-1, :]  # Take the last element in the forward slice time dimension
    l_backward_slice = l_backward[:, 0, :]  # And the first element in the backward slice time dimension
    l_sum = cgt.concatenate([l_forward_slice, l_backward_slice], axis=1)
    l_out = nnbuilder.denseLayer(l_sum, num_units=1, activation=cgt.tanh)
    target_values = cgt.vector('target_output')
    predicted_values = l_out[:, 0]  # For this task we only need the last value
    cost = cgt.mean((predicted_values - target_values)**2)
    # Compute SGD updates for training
    print("Computing updates ...")
    updates = nn.rmsprop(cost, nn.get_parameters(l_out), LEARNING_RATE)
    #updates = nn.nesterov_momentum(cost, nn.get_parameters(l_out), 0.05)
    # cgt functions for training and computing cost
    print("Compiling functions ...")
    train = cgt.function([X, target_values], cost, updates=updates)
    compute_cost = cgt.function([X, target_values], cost)

    # We'll use this "validation set" to periodically check progress
    X_val, y_val, mask_val = gen_data()

    print("Training ...")
    time_start = time.time()
    try:
        for epoch in range(num_epochs):
            for _ in range(EPOCH_SIZE):
                X, y, m = gen_data()
                train(X, y)
            cost_val = compute_cost(X_val, y_val)
            print("Epoch {} validation cost = {}".format(epoch+1, cost_val))
            print ('Epoch took ' + str(time.time() - time_start))
            time_start = time.time()
    except KeyboardInterrupt:
        pass
Example #17
 def __init__(self, num_features=None, num_hidden=100):
     stepsize = 0.01
     # with shape (batchsize, ncols)
     X = cgt.matrix("X", fixed_shape=(1, num_features))
     # y: a symbolic variable representing the rewards, which are integers
     y = cgt.scalar("y", dtype='float64')
     
     hid1 = nn.rectify(
         nn.Affine(num_features, num_hidden, weight_init=nn.IIDGaussian(std=.1), bias_init=nn.Constant(1))(X)
     )
     # One final fully-connected layer, and then a linear activation output for reward
     output = nn.Affine(num_hidden, 1, weight_init=nn.IIDGaussian(std=.1), bias_init=nn.Constant(1))(hid1)
     abs_deviation = cgt.abs(output - y).mean()
     params = nn.get_parameters(abs_deviation)
     gparams = cgt.grad(abs_deviation, params)
     
     updates = [(p, p-stepsize*gp) for (p, gp) in zip(params, gparams)]
     self.predictor = cgt.function([X], output)
     self.updater = cgt.function([X, y], abs_deviation, updates=updates)
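These two compiled functions form a predict/update pair: self.predictor(X) returns the reward estimate for a single feature row, and self.updater(X, y) takes one SGD step on the mean absolute deviation and returns it. A minimal usage sketch, assuming the surrounding class (its name is not shown above; RewardPredictor is used purely for illustration) takes num_features in its constructor and that numpy and cgt are imported as in the other examples:

predictor = RewardPredictor(num_features=16)        # illustrative name for the class this __init__ belongs to
x_row = np.random.randn(1, 16).astype(cgt.floatX)   # one observation, shape (1, num_features)
for _ in range(100):
    err = predictor.updater(x_row, 3.0)             # regress the output toward a reward of 3.0
print(predictor.predictor(x_row))                   # prediction should move toward 3.0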
Example #18
def make_funcs(config, dbg_out={}):
    net_in, net_out = hybrid_network(config['num_inputs'],
                                     config['num_outputs'],
                                     config['num_units'],
                                     config['num_sto'],
                                     dbg_out=dbg_out)
    if not config['dbg_out_full']: dbg_out = {}
    # def f_sample(_inputs, num_samples=1, flatten=False):
    #     _mean, _var = f_step(_inputs)
    #     _samples = []
    #     for _m, _v in zip(_mean, _var):
    #         _s = np.random.multivariate_normal(_m, np.diag(np.sqrt(_v)), num_samples)
    #         if flatten: _samples.extend(_s)
    #         else: _samples.append(_s)
    #     return np.array(_samples)
    Y_gt = cgt.matrix("Y")
    Y_prec = cgt.tensor3('V',
                         fixed_shape=(None, config['num_inputs'],
                                      config['num_inputs']))
    params = nn.get_parameters(net_out)
    size_batch, size_out = net_out.shape
    inputs, outputs = [net_in], [net_out]
    if config['no_bias']:
        print "Excluding bias"
        params = [p for p in params if not p.name.endswith(".b")]
    loss_vec = dist.gaussian.logprob(Y_gt, net_out, Y_prec)
    if config['weight_decay'] > 0.:
        print "Applying penalty on parameter norm"
        params_flat = cgt.concatenate([p.flatten() for p in params])
        loss_param = config['weight_decay'] * cgt.sum(params_flat**2)
        loss_vec -= loss_param  # / size_batch
    loss = cgt.sum(loss_vec) / size_batch

    # TODO_TZ f_step seems not to fail if X has wrong dim
    f_step = cgt.function(inputs, outputs)
    f_surr = get_surrogate_func(inputs + [Y_prec, Y_gt],
                                outputs, [loss_vec],
                                params,
                                _dbg_out=dbg_out)

    return params, f_step, None, None, None, f_surr
Example #19
    def __init__(self, n_actions):
        Serializable.__init__(self, n_actions)
        cgt.set_precision('double')
        n_in = 128
        o_no = cgt.matrix("o_no", fixed_shape=(None, n_in))
        a_n = cgt.vector("a_n", dtype='i8')
        q_n = cgt.vector("q_n")
        oldpdist_np = cgt.matrix("oldpdists")

        h0 = (o_no - 128.0) / 128.0
        nhid = 64
        h1 = cgt.tanh(
            nn.Affine(128, nhid, weight_init=nn.IIDGaussian(std=.1))(h0))
        probs_na = nn.softmax(
            nn.Affine(nhid, n_actions,
                      weight_init=nn.IIDGaussian(std=0.01))(h1))
        logprobs_na = cgt.log(probs_na)
        b = cgt.size(o_no, 0)
        logps_n = logprobs_na[cgt.arange(b), a_n]
        surr = (logps_n * q_n).mean()
        kl = (oldpdist_np * cgt.log(oldpdist_np / probs_na)).sum(axis=1).mean()

        params = nn.get_parameters(surr)
        gradsurr = cgt.grad(surr, params)
        flatgrad = cgt.concatenate([p.flatten() for p in gradsurr])

        lam = cgt.scalar()
        penobj = surr - lam * kl
        self._f_grad_lagrangian = cgt.function(
            [lam, oldpdist_np, o_no, a_n, q_n],
            cgt.concatenate([p.flatten() for p in cgt.grad(penobj, params)]))
        self.f_pdist = cgt.function([o_no], probs_na)

        self.f_probs = cgt.function([o_no], probs_na)
        self.f_surr_kl = cgt.function([oldpdist_np, o_no, a_n, q_n],
                                      [surr, kl])
        self.f_gradlogp = cgt.function([oldpdist_np, o_no, a_n, q_n], flatgrad)

        self.pc = ParamCollection(params)
Example #20
def main():
    X = cgt.matrix(name='data', dtype=cgt.floatX, fixed_shape=(None, 2212))
    y = cgt.vector("y", dtype='i8')
    model = build_nn(X)
    loss = -cgt.mean(categorical.loglik(y, model))
    updates = nn.adagrad(loss, nn.get_parameters(loss), 0.01)

    y_nodrop = cgt.argmax(model, axis=1)

    cost_nodrop = -cgt.mean(categorical.loglik(y, model))
    err_nodrop = cgt.cast(cgt.not_equal(y_nodrop, y), cgt.floatX).mean()

    train = cgt.function(inputs=[X, y], outputs=[], updates=updates)
    computeloss = cgt.function(inputs=[X, y], outputs=[err_nodrop, cost_nodrop])

    batch_size = 20
    Xdata, ydata = load_data()

    Xtrain = Xdata[0:5200]
    ytrain = ydata[0:5200]

    Xtest = Xdata[5200:5573]
    ytest = ydata[5200:5573]

    sortinds = np.random.permutation(5200)
    Xtrain = Xtrain[sortinds]
    ytrain = ytrain[sortinds]

    print fmt_row(10, ["Epoch","Train NLL","Train Err","Test NLL","Test Err","Epoch Time"])
    for i_epoch in xrange(20):
        tstart = time.time()
        for start in xrange(0, Xtrain.shape[0], batch_size):
            end = start+batch_size
            train(Xtrain[start:end], ytrain[start:end])
        elapsed = time.time() - tstart
        trainerr, trainloss = computeloss(Xtrain[:len(Xtest)], ytrain[:len(Xtest)])
        testerr, testloss = computeloss(Xtest, ytest)
        print fmt_row(10, [i_epoch, trainloss, trainerr, testloss, testerr, elapsed])
Example #21
def make_funcs(net_in, net_out, config, dbg_out=None):
    if dbg_out is None: dbg_out = []  # dbg_out is concatenated with [net_out] below, so default to an empty list
    def f_grad(*x):
        out = f_surr(*x)
        return out['loss'], out['surr_loss'], out['surr_grad']

    Y = cgt.matrix("Y")
    params = nn.get_parameters(net_out)
    if 'no_bias' in config and config['no_bias']:
        print "Excluding bias"
        params = [p for p in params if not p.name.endswith(".b")]
    size_out, size_batch = Y.shape[1], net_in.shape[0]
    f_step = cgt.function([net_in], [net_out])
    # loss_raw of shape (size_batch, 1); loss should be a scalar
    # sum-of-squares loss
    sigma = 0.1
    loss_raw = -cgt.sum((net_out - Y)**2, axis=1, keepdims=True) / sigma
    # negative log-likelihood
    # out_sigma = cgt.exp(net_out[:, size_out:]) + 1.e-6  # positive sigma
    # loss_raw = -gaussian_diagonal.logprob(
    #     Y, net_out,
    # out_sigma
    # cgt.fill(.01, [size_batch, size_out])
    # )
    if 'param_penal_wt' in config:
        print "Applying penalty on parameter norm"
        assert config['param_penal_wt'] > 0
        params_flat = cgt.concatenate([p.flatten() for p in params])
        loss_param = cgt.fill(cgt.sum(params_flat**2), [size_batch, 1])
        loss_param *= config['param_penal_wt']
        loss_raw += loss_param
    loss = cgt.sum(loss_raw) / size_batch
    # end of loss definition
    f_loss = cgt.function([net_in, Y], [net_out, loss])
    f_surr = get_surrogate_func([net_in, Y], [net_out] + dbg_out, [loss_raw],
                                params)
    return params, f_step, f_loss, f_grad, f_surr
Example #22
def make_funcs(net_in, net_out, config, dbg_out=None):
    if dbg_out is None: dbg_out = []  # dbg_out is concatenated with [net_out] below, so default to an empty list
    def f_grad(*x):
        out = f_surr(*x)
        return out['loss'], out['surr_loss'], out['surr_grad']
    Y = cgt.matrix("Y")
    params = nn.get_parameters(net_out)
    if 'no_bias' in config and config['no_bias']:
        print "Excluding bias"
        params = [p for p in params if not p.name.endswith(".b")]
    size_out, size_batch = Y.shape[1], net_in.shape[0]
    f_step = cgt.function([net_in], [net_out])
    # loss_raw of shape (size_batch, 1); loss should be a scalar
    # sum-of-squares loss
    sigma = 0.1
    loss_raw = -cgt.sum((net_out - Y) ** 2, axis=1, keepdims=True) / sigma
    # negative log-likelihood
    # out_sigma = cgt.exp(net_out[:, size_out:]) + 1.e-6  # positive sigma
    # loss_raw = -gaussian_diagonal.logprob(
    #     Y, net_out,
    #     out_sigma
    #     cgt.fill(.01, [size_batch, size_out])
    # )
    if 'param_penal_wt' in config:
        print "Applying penalty on parameter norm"
        assert config['param_penal_wt'] > 0
        params_flat = cgt.concatenate([p.flatten() for p in params])
        loss_param = cgt.fill(cgt.sum(params_flat ** 2), [size_batch, 1])
        loss_param *= config['param_penal_wt']
        loss_raw += loss_param
    loss = cgt.sum(loss_raw) / size_batch
    # end of loss definition
    f_loss = cgt.function([net_in, Y], [net_out, loss])
    f_surr = get_surrogate_func([net_in, Y],
                                [net_out] + dbg_out,
                                [loss_raw], params)
    return params, f_step, f_loss, f_grad, f_surr
Example #23
    def __init__(self, num_features=None, num_hidden=100):
        stepsize = 0.01
        # with shape (batchsize, ncols)
        X = cgt.matrix("X", fixed_shape=(1, num_features))
        # y: a symbolic variable representing the rewards, which are integers
        y = cgt.scalar("y", dtype='float64')

        hid1 = nn.rectify(
            nn.Affine(num_features,
                      num_hidden,
                      weight_init=nn.IIDGaussian(std=.1),
                      bias_init=nn.Constant(1))(X))
        # One final fully-connected layer, and then a linear activation output for reward
        output = nn.Affine(num_hidden,
                           1,
                           weight_init=nn.IIDGaussian(std=.1),
                           bias_init=nn.Constant(1))(hid1)
        abs_deviation = cgt.abs(output - y).mean()
        params = nn.get_parameters(abs_deviation)
        gparams = cgt.grad(abs_deviation, params)

        updates = [(p, p - stepsize * gp) for (p, gp) in zip(params, gparams)]
        self.predictor = cgt.function([X], output)
        self.updater = cgt.function([X, y], abs_deviation, updates=updates)
Example #24
    def __init__(self, obs_dim, ctrl_dim):

        cgt.set_precision('double')
        Serializable.__init__(self, obs_dim, ctrl_dim)

        self.obs_dim = obs_dim
        self.ctrl_dim = ctrl_dim

        o_no = cgt.matrix("o_no", fixed_shape=(None, obs_dim))
        a_na = cgt.matrix("a_na", fixed_shape=(None, ctrl_dim))
        adv_n = cgt.vector("adv_n")
        oldpdist_np = cgt.matrix("oldpdist", fixed_shape=(None, 2 * ctrl_dim))
        self.logstd = logstd_1a = nn.parameter(np.zeros((1, self.ctrl_dim)),
                                               name="std_1a")
        std_1a = cgt.exp(logstd_1a)

        # Here's where we apply the network
        h0 = o_no
        nhid = 32
        h1 = cgt.tanh(
            nn.Affine(obs_dim, nhid, weight_init=nn.IIDGaussian(std=0.1))(h0))
        h2 = cgt.tanh(
            nn.Affine(nhid, nhid, weight_init=nn.IIDGaussian(std=0.1))(h1))
        mean_na = nn.Affine(nhid,
                            ctrl_dim,
                            weight_init=nn.IIDGaussian(std=0.01))(h2)

        b = cgt.size(o_no, 0)
        std_na = cgt.repeat(std_1a, b, axis=0)

        oldmean_na = oldpdist_np[:, 0:self.ctrl_dim]
        oldstd_na = oldpdist_np[:, self.ctrl_dim:2 * self.ctrl_dim]

        logp_n = ((-.5) * cgt.square(
            (a_na - mean_na) / std_na).sum(axis=1)) - logstd_1a.sum()
        oldlogp_n = ((-.5) * cgt.square(
            (a_na - oldmean_na) / oldstd_na).sum(axis=1)
                     ) - cgt.log(oldstd_na).sum(axis=1)

        ratio_n = cgt.exp(logp_n - oldlogp_n)

        surr = (ratio_n * adv_n).mean()

        pdists_np = cgt.concatenate([mean_na, std_na], axis=1)
        # kl = cgt.log(sigafter/)

        params = nn.get_parameters(surr)

        oldvar_na = cgt.square(oldstd_na)
        var_na = cgt.square(std_na)
        kl = (cgt.log(std_na / oldstd_na) +
              (oldvar_na + cgt.square(oldmean_na - mean_na)) / (2 * var_na) -
              .5).sum(axis=1).mean()

        lam = cgt.scalar()
        penobj = surr - lam * kl
        self._compute_surr_kl = cgt.function([oldpdist_np, o_no, a_na, adv_n],
                                             [surr, kl])
        self._compute_grad_lagrangian = cgt.function(
            [lam, oldpdist_np, o_no, a_na, adv_n],
            cgt.concatenate([p.flatten() for p in cgt.grad(penobj, params)]))
        self.f_pdist = cgt.function([o_no], pdists_np)

        self.f_objs = cgt.function([oldpdist_np, o_no, a_na, adv_n],
                                   [surr, kl])

        self.pc = ParamCollection(params)
Example #25
def test_seq_2_seq():
    batch_size = 32  # How many samples do you want to batch.
    feat_t_steps = 3  # How many 10ms sound clips.
    feat_num_features = 10  # The dimension of the 10ms clips.
    max_label_length = feat_t_steps  # The maximal label length of the transcription.
    num_out_classes = 27  # 26 letters and space.
    num_out_classes_true = 27 + 2  # Start and end tokens are added.
    num_batches = 512  # 1032
    num_epochs = 40

    feats = cgt.tensor3(fixed_shape=(batch_size, feat_t_steps, feat_num_features))
    ground_labels_basis = cgt.tensor3(fixed_shape=(batch_size, max_label_length, num_out_classes_true))

    last_time = time.time()
    print 'initializing seq2seq'
    seq2seq = nnbuilder.Seq2Seq(nn_input_btf=feats, num_out_classes=num_out_classes)
    print 'that took ' + str(time.time() - last_time) + ' seconds'

    last_time = time.time()
    print 'making train objective'
    train_objective = seq2seq.get_train_objective(max_label_length=max_label_length,
                                                  ground_labels_basis_btc=ground_labels_basis)
    print 'that took ' + str(time.time() - last_time) + ' seconds'

    last_time = time.time()
    print 'making updates'
    updates = nn.rmsprop(train_objective, nn.get_parameters(train_objective), learning_rate=0.0001)
    #updates = nn.nesterov_momentum(train_objective, nn.get_parameters(train_objective), learning_rate=0.0001, mu=0.4)
    #updates = nn.momentum(train_objective, nn.get_parameters(train_objective), learning_rate=0.00001, mu=0.4)
    #updates = nn.adadelta(train_objective, nn.get_parameters(train_objective), learning_rate=0.0001, rho=0.95)
    print 'that took ' + str(time.time() - last_time) + ' seconds'

    last_time = time.time()
    print 'compiling train function, test function, and prediction output function'
    train_function = cgt.function([feats, ground_labels_basis], [], updates=updates)
    test_function = cgt.function([feats, ground_labels_basis], [train_objective])
    pred = seq2seq.make_prediction(ground_labels_basis_btc=ground_labels_basis, max_label_length=feat_t_steps)
    pred_fun = cgt.function([feats, ground_labels_basis], [pred])
    print 'that took ' + str(time.time() - last_time) + ' seconds'

    test_data = np.load('test_data.npy')
    test_labels = np.load('test_labels.npy')
    data_mean = np.mean(test_data)
    data_sd = np.std(test_data)

    print 'now training'
    last_time = time.time()
    for one_epoch in range(0, num_epochs):
        tested = 0
        print 'starting epoch ' + str(one_epoch)
        for batch_iter in range(0, num_batches):
            batch, labels_basis = normalize_batch_and_labels(test_data, batch_iter, feat_t_steps, data_mean, data_sd,
                                                             test_labels, num_out_classes_true)
            train_function(batch, labels_basis)
        for batch_iter in range(0, num_batches):
            batch, labels_basis = normalize_batch_and_labels(test_data, batch_iter, feat_t_steps, data_mean, data_sd,
                                                             test_labels, num_out_classes_true)
            tested += test_function(batch, labels_basis)[0]

        tested = tested / batch_iter
        print 'train loss for batch ' + str(batch_iter) + ' is ' + str(tested)

        print 'an actual prediction is '
        print pred_fun(batch, labels_basis)[0]
        print 'the truth is'
        print test_labels[batch_iter, :, 0:feat_t_steps]

        print 'that took ' + str(time.time() - last_time) + ' seconds'
        last_time = time.time()


    prediction_final = pred_fun(batch, labels_basis)[0]
    print prediction_final
Example #26
def test_the_test_problem():
    #Works
    batch_size = 32  # How many samples do you want to batch.
    feat_t_steps = 20  # How many 10ms sound clips.
    feat_num_features = 10  # The dimension of the 10ms clips.
    max_label_length = feat_t_steps  # The maximal label length of the transcription. includes start character.
    num_out_classes = 27
    num_out_classes_true = num_out_classes + 2
    num_batches = 756
    num_epochs = 30

    feats = cgt.tensor3(fixed_shape=(batch_size, feat_t_steps, feat_num_features))
    ground_labels_basis = cgt.tensor3(fixed_shape=(batch_size, max_label_length, num_out_classes_true))

    last_time = time.time()
    print 'initializing temporal dense layer'
    d1 = nnbuilder.temporalDenseLayer(feats, num_units=128, activation=cgt.sigmoid)
    #d2 = nnbuilder.temporalDenseLayer(d1, num_units=128, activation=cgt.sigmoid)
    d3 = nnbuilder.temporalDenseLayer(d1, num_units=num_out_classes_true, activation=nnbuilder.linear)
    out = nn.three_d_softmax(d3, axis=2)

    # accumulate over time steps the log-probability that the network assigns to the
    # ground-truth character at each position (a per-step cross-entropy)
    log_probs = None
    for iter_step in range(0, max_label_length):
        this_character_dist_bc = out[:, iter_step, :]
        prev_out_bc = ground_labels_basis[:, iter_step, :]
        log_probs_pre = prev_out_bc * this_character_dist_bc
        log_probs_pre = cgt.log(cgt.sum(log_probs_pre, axis=1))
        if log_probs is None:
            log_probs = cgt.sum(log_probs_pre)
        else:
            log_probs += cgt.sum(log_probs_pre)

    log_probs = -log_probs

    print 'that took ' + str(time.time() - last_time) + ' seconds'

    last_time = time.time()
    print 'compiling objective function'
    updates = nn.rmsprop(log_probs, nn.get_parameters(log_probs), learning_rate=0.01)
    pred_train = cgt.function([feats, ground_labels_basis], [], updates=updates)
    pred_fun = cgt.function([feats, ground_labels_basis], [log_probs])
    most_likely_chars = cgt.argmax(out, axis=2)  # most probable character index at each time step
    actual_predictions = cgt.function([feats, ground_labels_basis], [most_likely_chars])
    print 'that took ' + str(time.time() - last_time) + ' seconds'

    test_data = np.load('test_data.npy')
    test_labels = np.load('test_labels.npy')
    data_mean = np.mean(test_data)
    data_sd = np.std(test_data)

    print 'now training'
    for one_epoch in range(0, num_epochs):
        trained = 0
        last_time = time.time()
        print 'starting epoch ' + str(one_epoch)
        for batch_iter in range(0, num_batches):
            batch, labels_basis = normalize_batch_and_labels(test_data, batch_iter, feat_t_steps, data_mean, data_sd,
                                                             test_labels, num_out_classes_true)
            pred_train(batch, labels_basis)

        for batch_iter in range(0, num_batches):
            batch, labels_basis = normalize_batch_and_labels(test_data, batch_iter, feat_t_steps, data_mean, data_sd,
                                                             test_labels, num_out_classes_true)
            trained += pred_fun(batch, labels_basis)[0]

        trained = trained/batch_iter
        print 'train loss is ' + str(trained)
        print 'that took ' + str(time.time() - last_time) + ' seconds'

        act_pred = actual_predictions(batch, labels_basis)[0]
        print 'an actual prediction is '
        print act_pred
Example #27
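# NOTE: partial snippet -- it assumes X, y, pool1, Xtrain, ytrain, Xtrainimg, Xtestimg,
# ytest, batch_size and epochs are defined earlier in the original script.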
conv2 = nn.rectify(
        nn.SpatialConvolution(32, 32, kernelshape=(3,3), stride=(1,1), pad=(1,1), weight_init=nn.IIDGaussian(std=.1))(pool1)
        )
pool2 = nn.max_pool_2d(conv2, kernelshape=(2,2), stride=(2,2))
d0, d1, d2, d3 = pool2.shape

flat = pool2.reshape([d0, d1*d2*d3])
nfeats = cgt.infer_shape(flat)[1]
probs = nn.softmax(nn.Affine(nfeats, 10)(flat))
cost = -categorical.loglik(y, probs).mean()

y_preds = cgt.argmax(probs, axis=1)
err = cgt.cast(cgt.not_equal(y, y_preds), cgt.floatX).mean()

params = nn.get_parameters(cost)
updates = nn.sgd(cost, params, 1e-3) 

# training function
f = cgt.function(inputs=[X, y], outputs=[], updates=updates)
# compute the cost and error
cost_and_err = cgt.function(inputs=[X, y], outputs=[cost, err])

for i in xrange(epochs):
    t0 = time.time()
    for start in xrange(0, Xtrain.shape[0], batch_size):
        end = batch_size + start
        f(Xtrainimg[start:end], ytrain[start:end])
    elapsed = time.time() - t0
    costval, errval = cost_and_err(Xtestimg, ytest)
    print("Epoch {} took {}, test cost = {}, test error = {}".format(i, elapsed, costval, errval))
Example #28
def set_all_weights_helper(network_out_layer, param_values):
    all_params = get_parameters(network_out_layer)
    for param, param_value in zip(all_params, param_values):
        param.op.set_value(param_value)
Example #29
def get_all_weights(network_out_layer):
    all_params = get_parameters(network_out_layer)
    param_values = []
    for param in all_params:
        param_values.append(param.op.get_value())
    return param_values
Example #30
def set_all_weights(network_out_layer, pickled_list_of_weights):
    all_params = get_parameters(network_out_layer)
    param_values = pickle.load(open(pickled_list_of_weights, "rb"))
    for param, param_value in zip(all_params, param_values):
        param.op.set_value(param_value)
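Taken together, the helpers above give a simple checkpoint round trip: save_weights pickles the current parameter values of the network identified by its output layer, and set_all_weights reads such a pickle back and copies each value into the corresponding parameter. A minimal sketch, assuming model is the output expression of an already-built network (as in the MNIST example above):

save_weights(model, "mnist_checkpoint")        # writes mnist_checkpoint.p
# ... later, after rebuilding the same architecture ...
set_all_weights(model, "mnist_checkpoint.p")   # restores every parameter value in order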