def make_updater_convnet_theano():
    X = TT.tensor4("X")  # so shapes can be inferred
    y = TT.ivector("y")
    np.random.seed(0)
    stepsize = TT.scalar("stepsize")

    layer1 = SpatialConvolutionTheano(1, 32, kernelshape=(3, 3), pad=(0, 0),
                                      weight_init=nn.IIDGaussian(std=.1))
    conv1 = nn.rectify(layer1(X))
    pool1 = theano.tensor.signal.downsample.max_pool_2d(conv1, ds=(3, 3), st=(2, 2))
    layer2 = SpatialConvolutionTheano(32, 32, kernelshape=(3, 3), pad=(0, 0),
                                      weight_init=nn.IIDGaussian(std=.1))
    conv2 = nn.rectify(layer2(pool1))
    pool2 = theano.tensor.signal.downsample.max_pool_2d(conv2, ds=(3, 3), st=(2, 2))

    d0, d1, d2, d3 = pool2.shape
    flatlayer = pool2.reshape([d0, d1*d2*d3])
    # Theano doesn't know how to calculate shapes before compiling the function,
    # so the flattened feature size needs to be computed by hand.
    nfeats = 800
    layer3 = AffineTheano(nfeats, 10)
    ip1 = layer3(flatlayer)
    logprobs = logsoftmax_theano(ip1)
    loss = -logprobs[TT.arange(X.shape[0]), y].mean()

    params = [layer1.weight, layer1.bias,
              layer2.weight, layer2.bias,
              layer3.weight, layer3.bias]
    gparams = TT.grad(loss, params)
    updates = [(p, p - stepsize*gp) for (p, gp) in zip(params, gparams)]
    return theano.function([X, y, stepsize], loss, updates=updates,
                           allow_input_downcast=True)

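# A minimal usage sketch (not from the original source): the compiled Theano
# updater takes a minibatch of images, integer labels, and a step size, performs
# one SGD step, and returns the minibatch loss. The variable names (Xtrainimg,
# ytrain, batch_size) and MNIST-shaped inputs (N, 1, 28, 28) are assumptions.
updater = make_updater_convnet_theano()
batch_size = 64
for start in xrange(0, Xtrainimg.shape[0], batch_size):
    end = start + batch_size
    loss = updater(Xtrainimg[start:end], ytrain[start:end], 0.01)
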
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--profile", action="store_true")
    parser.add_argument("--unittest", action="store_true")
    parser.add_argument("--epochs", type=int, default=10)
    args = parser.parse_args()

    batchsize = 64
    Xshape = (batchsize, 3, 32, 32)
    X = cgt.tensor4("X", fixed_shape=Xshape)
    y = cgt.vector("y", fixed_shape=(batchsize,), dtype='i4')

    conv1 = nn.SpatialConvolution(3, 32, kernelshape=(5, 5), pad=(2, 2),
                                  weight_init=nn.IIDGaussian(std=1e-4))(X)
    relu1 = nn.rectify(conv1)
    pool1 = nn.max_pool_2d(relu1, kernelshape=(3, 3), stride=(2, 2))
    conv2 = nn.SpatialConvolution(32, 32, kernelshape=(5, 5), pad=(2, 2),
                                  weight_init=nn.IIDGaussian(std=0.01))(pool1)
    relu2 = nn.rectify(conv2)
    pool2 = nn.max_pool_2d(relu2, kernelshape=(3, 3), stride=(2, 2))
    conv3 = nn.SpatialConvolution(32, 64, kernelshape=(5, 5), pad=(2, 2),
                                  weight_init=nn.IIDGaussian(std=0.01))(pool2)
    pool3 = nn.max_pool_2d(conv3, kernelshape=(3, 3), stride=(2, 2))
    relu3 = nn.rectify(pool3)

    d0, d1, d2, d3 = relu3.shape
    flatlayer = relu3.reshape([d0, d1*d2*d3])
    nfeats = cgt.infer_shape(flatlayer)[1]
    ip1 = nn.Affine(nfeats, 10)(flatlayer)
    logprobs = nn.logsoftmax(ip1)
    loss = -logprobs[cgt.arange(batchsize), y].mean()

    params = nn.get_parameters(loss)
    updates = rmsprop_updates(loss, params, stepsize=1e-3)
    train = cgt.function(inputs=[X, y], outputs=[loss], updates=updates)

    if args.profile: cgt.profiler.start()

    data = fetch_dataset("http://rll.berkeley.edu/cgt-data/cifar10.npz")
    Xtrain = data["X_train"]
    ytrain = data["y_train"]

    print fmt_row(10, ["Epoch", "Train NLL", "Train Err", "Test NLL", "Test Err", "Epoch Time"])
    for i_epoch in xrange(args.epochs):
        for start in xrange(0, Xtrain.shape[0], batchsize):
            tstart = time.time()
            end = start + batchsize
            print train(Xtrain[start:end], ytrain[start:end]), time.time()-tstart
            if start > batchsize*5: break
        # elapsed = time.time() - tstart
        # trainerr, trainloss = computeloss(Xtrain[:len(Xtest)], ytrain[:len(Xtest)])
        # testerr, testloss = computeloss(Xtest, ytest)
        # print fmt_row(10, [i_epoch, trainloss, trainerr, testloss, testerr, elapsed])
        if args.profile:
            cgt.profiler.print_stats()
            return
        if args.unittest:
            break

def dense_model(X, w_h, w_h2, w_o, p_drop_input, p_drop_hidden):
    X = nn.dropout(X, p_drop_input)
    h = nn.rectify(cgt.dot(X, w_h))

    h = nn.dropout(h, p_drop_hidden)
    h2 = nn.rectify(cgt.dot(h, w_h2))

    h2 = nn.dropout(h2, p_drop_hidden)
    py_x = nn.softmax(cgt.dot(h2, w_o))
    return py_x

def build_fc_return_loss(X, y):
    """
    Build fully connected network and return loss
    """
    np.random.seed(0)
    h1 = nn.rectify(nn.Affine(28*28, 256, weight_init=nn.IIDGaussian(std=0.1))(X))
    h2 = nn.rectify(nn.Affine(256, 256, weight_init=nn.IIDGaussian(std=0.1))(h1))
    logprobs = nn.logsoftmax(nn.Affine(256, 10, weight_init=nn.IIDGaussian(std=0.1))(h2))
    neglogliks = -logprobs[cgt.arange(X.shape[0]), y]
    loss = neglogliks.mean()
    return loss

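# A minimal sketch (not part of the original snippet) of turning the returned loss
# into a compiled SGD training step, following the CGT idioms used elsewhere in
# these examples (nn.get_parameters, cgt.grad, cgt.function). The symbolic input
# declarations and the 1e-2 step size are assumptions.
X = cgt.matrix("X", fixed_shape=(None, 28*28))
y = cgt.vector("y", dtype='i8')
loss = build_fc_return_loss(X, y)
params = nn.get_parameters(loss)
gparams = cgt.grad(loss, params)
updates = [(p, p - 1e-2*gp) for (p, gp) in zip(params, gparams)]
train_step = cgt.function([X, y], loss, updates=updates)
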
def dense_model3(X, w_h, w_h2, w_h3, w_o, p_drop_input, p_drop_hidden):
    X = nn.dropout(X, p_drop_input)
    h = nn.rectify(cgt.dot(X, w_h))

    h = nn.dropout(h, p_drop_hidden[0])
    h2 = nn.rectify(cgt.dot(h, w_h2))

    h2 = nn.dropout(h2, p_drop_hidden[1])
    h3 = nn.rectify(cgt.dot(h2, w_h3))

    h3 = nn.dropout(h3, p_drop_hidden[2])
    py_x = nn.softmax(cgt.dot(h3, w_o))
    return py_x

def build_convnet_return_loss(X, y):
    np.random.seed(0)
    conv1 = nn.rectify(
        nn.SpatialConvolution(1, 32, kernelshape=(3, 3), pad=(0, 0),
                              weight_init=nn.IIDGaussian(std=0.1))(X))
    pool1 = nn.max_pool_2d(conv1, kernelshape=(3, 3), stride=(2, 2))
    conv2 = nn.rectify(
        nn.SpatialConvolution(32, 32, kernelshape=(3, 3), pad=(0, 0),
                              weight_init=nn.IIDGaussian(std=0.1))(pool1))
    pool2 = nn.max_pool_2d(conv2, kernelshape=(3, 3), stride=(2, 2))

    d0, d1, d2, d3 = pool2.shape
    flatlayer = pool2.reshape([d0, d1*d2*d3])
    # Unlike the Theano version above, CGT can infer the flattened feature size symbolically.
    nfeats = cgt.infer_shape(flatlayer)[1]
    logprobs = nn.logsoftmax(nn.Affine(nfeats, 10)(flatlayer))
    loss = -logprobs[cgt.arange(X.shape[0]), y].mean()
    return loss

def make_updater_fc_theano():
    X = TT.matrix("X")
    y = TT.ivector("y")
    np.random.seed(0)
    stepsize = TT.scalar("stepsize")

    layer1 = AffineTheano(28*28, 256, weight_init=nn.IIDGaussian(std=0.1))
    h1 = nn.rectify(layer1(X))
    layer2 = AffineTheano(256, 256, weight_init=nn.IIDGaussian(std=0.1))
    h2 = nn.rectify(layer2(h1))
    layer3 = AffineTheano(256, 10, weight_init=nn.IIDGaussian(std=0.1))
    logprobs = logsoftmax_theano(layer3(h2))
    neglogliks = -logprobs[TT.arange(X.shape[0]), y]
    loss = neglogliks.mean()

    # include the output layer's parameters so it is trained along with the hidden layers
    params = [layer1.weight, layer1.bias,
              layer2.weight, layer2.bias,
              layer3.weight, layer3.bias]
    gparams = TT.grad(loss, params)
    updates = [(p, p - stepsize*gp) for (p, gp) in zip(params, gparams)]
    return theano.function([X, y, stepsize], loss, updates=updates,
                           allow_input_downcast=True)

def convnet_model(X, w, w2, w3, w4, w_o, p_drop_conv, p_drop_hidden):
    l1a = nn.rectify(nn.conv2d(X, w, kernelshape=(3, 3), pad=(1, 1)))
    l1 = nn.max_pool_2d(l1a, kernelshape=(2, 2), stride=(2, 2))
    l1 = nn.dropout(l1, p_drop_conv)

    l2a = nn.rectify(nn.conv2d(l1, w2, kernelshape=(3, 3), pad=(1, 1)))
    l2 = nn.max_pool_2d(l2a, kernelshape=(2, 2), stride=(2, 2))
    l2 = nn.dropout(l2, p_drop_conv)

    l3a = nn.rectify(nn.conv2d(l2, w3, kernelshape=(3, 3), pad=(1, 1)))
    l3b = nn.max_pool_2d(l3a, kernelshape=(2, 2), stride=(2, 2))
    batchsize, channels, rows, cols = l3b.shape
    l3 = cgt.reshape(l3b, [batchsize, channels*rows*cols])
    l3 = nn.dropout(l3, p_drop_conv)

    l4 = nn.rectify(cgt.dot(l3, w4))
    l4 = nn.dropout(l4, p_drop_hidden)
    pyx = nn.softmax(cgt.dot(l4, w_o))
    return pyx

def __init__(self, n_actions):
    Serializable.__init__(self, n_actions)
    cgt.set_precision('double')
    n_in = 128
    o_no = cgt.matrix("o_no", fixed_shape=(None, n_in))
    a_n = cgt.vector("a_n", dtype='i8')
    q_n = cgt.vector("q_n")
    oldpdist_np = cgt.matrix("oldpdists")

    nhid, nhid2 = 64, 64
    h0 = (o_no - 128.0)/128.0
    d0 = nn.dropout(h0, .2)
    h1 = nn.rectify(nn.Affine(128, nhid, weight_init=nn.IIDGaussian(std=.1))(d0))
    d1 = nn.dropout(h1, .2)
    h2 = nn.rectify(nn.Affine(nhid, nhid2, weight_init=nn.IIDGaussian(std=.1))(d1))
    # d2 = nn.dropout(h2, .2)  # dropout on the last hidden layer is disabled
    probs_na = nn.softmax(nn.Affine(nhid2, n_actions, weight_init=nn.IIDGaussian(std=0.01))(h2))
    logprobs_na = cgt.log(probs_na)

    b = cgt.size(o_no, 0)
    logps_n = logprobs_na[cgt.arange(b), a_n]
    surr = (logps_n*q_n).mean()
    kl = (oldpdist_np * cgt.log(oldpdist_np/probs_na)).sum(axis=1).mean()

    params = nn.get_parameters(surr)
    gradsurr = cgt.grad(surr, params)
    flatgrad = cgt.concatenate([p.flatten() for p in gradsurr])

    lam = cgt.scalar()
    penobj = surr - lam * kl
    self._f_grad_lagrangian = cgt.function(
        [lam, oldpdist_np, o_no, a_n, q_n],
        cgt.concatenate([p.flatten() for p in cgt.grad(penobj, params)]))
    self.f_pdist = cgt.function([o_no], probs_na)
    self.f_probs = cgt.function([o_no], probs_na)
    self.f_surr_kl = cgt.function([oldpdist_np, o_no, a_n, q_n], [surr, kl])
    self.f_gradlogp = cgt.function([oldpdist_np, o_no, a_n, q_n], flatgrad)

    self.pc = ParamCollection(params)

def make_deep_rrnn_rot_relu(size_input, size_mem, n_layers, size_output,
                            size_batch_in, k_in, k_h):
    inputs = [cgt.matrix() for i_layer in xrange(n_layers + 1)]
    outputs = []
    print 'input_size: ', size_input
    for i_layer in xrange(n_layers):
        # note that inputs[0] is the external input, so hidden states start at index 1
        prev_h = inputs[i_layer + 1]
        x = inputs[0] if i_layer == 0 else outputs[i_layer - 1]
        size_x = size_input if i_layer == 0 else size_mem
        size_batch = prev_h.shape[0]

        xform_h_param = nn.TensorParam((2 * k_h, size_mem), name="rotxform")
        xform_h_non = xform_h_param.weight
        xform_h_non.props["is_rotation"] = True
        xform_h_norm = cgt.norm(xform_h_non, axis=1, keepdims=True)
        xform_h = cgt.broadcast('/', xform_h_non, xform_h_norm, "xx,x1")

        add_in_lin = nn.Affine(size_x, size_mem)(x)
        add_in_relu = nn.rectify(add_in_lin)

        prev_h_scaled = nn.scale_mag(prev_h)
        h_in_added = prev_h_scaled + add_in_relu

        inters_h = [h_in_added]
        colon = slice(None, None, None)
        for i in xrange(2 * k_h):
            inter_in = inters_h[-1]
            r_cur = xform_h[i, :]
            # r_cur = cgt.subtensor(xform_h, [i, colon])
            r_cur_2_transpose = cgt.reshape(r_cur, (size_mem, 1))
            r_cur_2 = cgt.reshape(r_cur, (1, size_mem))
            # Householder-style reflection of the hidden state about the unit vector r_cur
            ref_cur = cgt.dot(cgt.dot(inter_in, r_cur_2_transpose), r_cur_2)
            inter_out = inter_in - 2 * ref_cur
            inters_h.append(inter_out)

        next_h = inters_h[-1]
        outputs.append(next_h)

    category_activations = nn.Affine(size_mem, size_output, name="pred")(outputs[-1])
    logprobs = nn.logsoftmax(category_activations)
    outputs.append(logprobs)

    # print 'len outputs:', len(outputs)
    # print 'len inputs:', len(inputs)
    return nn.Module(inputs, outputs)

def __init__(self, num_features=None, num_hidden=100):
    stepsize = 0.01
    # X: a symbolic variable with shape (batchsize, ncols)
    X = cgt.matrix("X", fixed_shape=(1, num_features))
    # y: a symbolic variable representing the reward
    y = cgt.scalar("y", dtype='float64')

    hid1 = nn.rectify(
        nn.Affine(num_features, num_hidden,
                  weight_init=nn.IIDGaussian(std=.1),
                  bias_init=nn.Constant(1))(X))
    # One final fully-connected layer, then a linear output for the reward
    output = nn.Affine(num_hidden, 1,
                       weight_init=nn.IIDGaussian(std=.1),
                       bias_init=nn.Constant(1))(hid1)

    abs_deviation = cgt.abs(output - y).mean()
    params = nn.get_parameters(abs_deviation)
    gparams = cgt.grad(abs_deviation, params)
    updates = [(p, p - stepsize*gp) for (p, gp) in zip(params, gparams)]

    self.predictor = cgt.function([X], output)
    self.updater = cgt.function([X, y], abs_deviation, updates=updates)

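# A hypothetical usage sketch (the class name RewardPredictor and the inputs are
# assumptions, not from the original source): X is declared with
# fixed_shape=(1, num_features), so the compiled functions take a single feature
# row and a scalar reward.
model = RewardPredictor(num_features=10)
x_row = np.random.randn(1, 10).astype(cgt.floatX)
deviation = model.updater(x_row, 1.0)   # one SGD step on mean |prediction - reward|
prediction = model.predictor(x_row)     # predicted reward, shape (1, 1)
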
Xtrain, Xtest, ytrain, ytest = load_mnist(onehot=False)

# shuffle the data
np.random.seed(42)
sortinds = np.random.permutation(Xtrain.shape[0])
Xtrain = Xtrain[sortinds]
ytrain = ytrain[sortinds]

# Model:
# Two linear/affine layers with a ReLU activation in between,
# followed by a softmax output.
X = cgt.matrix('X', fixed_shape=(None, 784))
y = cgt.vector('y', dtype='i8')

layer1 = nn.Affine(784, 400, weight_init=nn.XavierNormal())(X)
act1 = nn.rectify(layer1)
layer2 = nn.Affine(400, 400, weight_init=nn.XavierNormal())(act1)
act2 = nn.rectify(layer2)
probs = nn.softmax(nn.Affine(400, 10)(act2))

y_preds = cgt.argmax(probs, axis=1)
cost = -cgt.mean(categorical.loglik(y, probs))
err = cgt.cast(cgt.not_equal(y, y_preds), cgt.floatX).mean()

params = nn.get_parameters(cost)
updates = nn.sgd(cost, params, learning_rate)  # train via sgd

# training function
f = cgt.function(inputs=[X, y], outputs=[], updates=updates)
# compute the cost and error
cost_and_err = cgt.function(inputs=[X, y], outputs=[cost, err])

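# A minimal training-loop sketch (not part of the original fragment): iterate over
# minibatches with the compiled update function f, then report test cost and error.
# batch_size and n_epochs are assumptions; learning_rate is assumed to be defined
# earlier in the script, as in the fragment above.
for epoch in xrange(n_epochs):
    for start in xrange(0, Xtrain.shape[0], batch_size):
        end = start + batch_size
        f(Xtrain[start:end], ytrain[start:end])
    test_cost, test_err = cost_and_err(Xtest, ytest)
    print "epoch %i: test cost %.4f, test err %.4f" % (epoch, test_cost, test_err)
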
    # (fragment: this branch handles Caffe "InnerProduct" layers)
        if X.ndim == 4:
            X = cgt.reshape(X, [X.shape[0], X.shape[1]*X.shape[2]*X.shape[3]])
        param = layer.inner_product_param
        nchanin = infer_shape(X)[1]
        Wshape = (param.num_output, nchanin)
        Wname = layer.param[0].name or layer.name + ":W"
        Wval = np.empty(Wshape, dtype=cgt.floatX)
        W = name2node[Wname] = cgt.shared(Wval, name=Wname, fixed_shape_mask="all")
        bshape = (1, param.num_output)
        bname = layer.param[1].name or layer.name + ":b"
        bval = np.empty(bshape, dtype=cgt.floatX)
        b = name2node[bname] = cgt.shared(bval, name=bname, fixed_shape_mask="all")
        yname = layer.top[0]
        output = [cgt.broadcast("+", X.dot(W), b, "xx,1x")]
    elif layer.type == "ReLU":
        output = [nn.rectify(inputs[0])]
    elif layer.type == "Softmax":
        output = [nn.softmax(inputs[0])]
    elif layer.type == "LRN":
        # XXX needs params
        param = layer.lrn_param
        output = [nn.lrn(inputs[0], param.alpha, param.beta, param.local_size)]
    elif layer.type == "Concat":
        param = layer.concat_param
        output = [cgt.concatenate(inputs, param.concat_dim)]
    elif layer.type == "Dropout":
        output = [nn.dropout(inputs[0])]
    elif layer.type == "SoftmaxWithLoss":
        output = [nn.loglik_softmax(inputs[0], inputs[1])]
    elif layer.type == "Accuracy":
        output = [nn.zero_one_loss(inputs[0], inputs[1])]

ytrain = ytrain[sortinds]

# reshape for convnet
Xtrainimg = Xtrain.reshape(-1, 1, 28, 28)
Xtestimg = Xtest.reshape(-1, 1, 28, 28)

# Model:
# Make it VGG-like.
# VGG nets use 3x3 kernels with padding 1, and all max-pooling is 2x2.
#
# VGG is a large model, so here we'll just do a small part of it.
X = cgt.tensor4('X', fixed_shape=(None, 1, 28, 28))
y = cgt.vector('y', dtype='i8')

conv1 = nn.rectify(
    nn.SpatialConvolution(1, 32, kernelshape=(3, 3), stride=(1, 1), pad=(1, 1),
                          weight_init=nn.IIDGaussian(std=.1))(X))
pool1 = nn.max_pool_2d(conv1, kernelshape=(2, 2), stride=(2, 2))
conv2 = nn.rectify(
    nn.SpatialConvolution(32, 32, kernelshape=(3, 3), stride=(1, 1), pad=(1, 1),
                          weight_init=nn.IIDGaussian(std=.1))(pool1))
pool2 = nn.max_pool_2d(conv2, kernelshape=(2, 2), stride=(2, 2))

d0, d1, d2, d3 = pool2.shape
flat = pool2.reshape([d0, d1*d2*d3])
nfeats = cgt.infer_shape(flat)[1]
probs = nn.softmax(nn.Affine(nfeats, 10)(flat))
cost = -categorical.loglik(y, probs).mean()
y_preds = cgt.argmax(probs, axis=1)

def build_fcn_action_cond_encoder_net(input_shapes, levels=None):
    x_shape, u_shape = input_shapes
    x_c_dim = x_shape[0]
    x1_c_dim = 16
    levels = levels or [3]
    levels = sorted(set(levels))

    X = cgt.tensor4('X', fixed_shape=(None,) + x_shape)
    U = cgt.matrix('U', fixed_shape=(None,) + u_shape)

    # encoding
    Xlevels = {}
    for level in range(levels[-1] + 1):
        if level == 0:
            Xlevel = X
        else:
            if level == 1:
                xlevelm1_c_dim = x_c_dim
                xlevel_c_dim = x1_c_dim
            else:
                xlevelm1_c_dim = xlevel_c_dim
                xlevel_c_dim = 2 * xlevel_c_dim
            Xlevel_1 = nn.rectify(
                nn.SpatialConvolution(xlevelm1_c_dim, xlevel_c_dim,
                                      kernelshape=(3, 3), pad=(1, 1), stride=(1, 1),
                                      name='conv%d_1' % level,
                                      weight_init=nn.IIDGaussian(std=0.01))(Xlevels[level - 1]))
            Xlevel_2 = nn.rectify(
                nn.SpatialConvolution(xlevel_c_dim, xlevel_c_dim,
                                      kernelshape=(3, 3), pad=(1, 1), stride=(1, 1),
                                      name='conv%d_2' % level,
                                      weight_init=nn.IIDGaussian(std=0.01))(Xlevel_1))
            Xlevel = nn.max_pool_2d(Xlevel_2, kernelshape=(2, 2), pad=(0, 0), stride=(2, 2))
        Xlevels[level] = Xlevel

    # bilinear
    Xlevels_next_pred_0 = {}
    Ylevels = OrderedDict()
    Ylevels_diff_pred = OrderedDict()
    for level in levels:
        Xlevel = Xlevels[level]
        Xlevel_diff_pred = Bilinear(input_shapes, b=None, axis=2,
                                    name='bilinear%d' % level)(Xlevel, U)
        Xlevels_next_pred_0[level] = Xlevel + Xlevel_diff_pred
        Ylevels[level] = Xlevel.reshape((Xlevel.shape[0], cgt.mul_multi(Xlevel.shape[1:])))
        Ylevels_diff_pred[level] = Xlevel_diff_pred.reshape(
            (Xlevel_diff_pred.shape[0], cgt.mul_multi(Xlevel_diff_pred.shape[1:])))

    # decoding
    Xlevels_next_pred = {}
    for level in range(levels[-1] + 1)[::-1]:
        if level == levels[-1]:
            Xlevel_next_pred = Xlevels_next_pred_0[level]
        else:
            if level == 0:
                xlevelm1_c_dim = x_c_dim
            elif level < levels[-1] - 1:
                xlevel_c_dim = xlevelm1_c_dim
                xlevelm1_c_dim = xlevelm1_c_dim // 2
            Xlevel_next_pred_2 = SpatialDeconvolution(
                xlevel_c_dim, xlevel_c_dim,
                kernelshape=(2, 2), pad=(0, 0), stride=(2, 2),
                name='upsample%d' % (level + 1),
                weight_init=nn.IIDGaussian(std=0.01))(Xlevels_next_pred[level + 1])
            # TODO initialize with bilinear
            # TODO should rectify?
            Xlevel_next_pred_1 = nn.rectify(SpatialDeconvolution(
                xlevel_c_dim, xlevel_c_dim,
                kernelshape=(3, 3), pad=(1, 1), stride=(1, 1),
                name='deconv%d_2' % (level + 1),
                weight_init=nn.IIDGaussian(std=0.01))(Xlevel_next_pred_2))
            nonlinearity = nn.rectify if level > 0 else cgt.tanh
            Xlevel_next_pred = nonlinearity(SpatialDeconvolution(
                xlevel_c_dim, xlevelm1_c_dim,
                kernelshape=(3, 3), pad=(1, 1), stride=(1, 1),
                name='deconv%d_1' % (level + 1),
                weight_init=nn.IIDGaussian(std=0.01))(Xlevel_next_pred_1))
            if level in Xlevels_next_pred_0:
                coefs = nn.parameter(nn.init_array(nn.Constant(0.5), (2,)),
                                     name='sum%d.coef' % level)
                Xlevel_next_pred = coefs[0] * Xlevel_next_pred + coefs[1] * Xlevels_next_pred_0[level]
                # TODO: tanh should be after sum
        Xlevels_next_pred[level] = Xlevel_next_pred

    X_next_pred = Xlevels_next_pred[0]
    Y = cgt.concatenate(Ylevels.values(), axis=1)
    Y_diff_pred = cgt.concatenate(Ylevels_diff_pred.values(), axis=1)

    X_diff = cgt.tensor4('X_diff', fixed_shape=(None,) + x_shape)
    X_next = X + X_diff
    loss = ((X_next - X_next_pred) ** 2).mean(axis=0).sum() / 2.

    net_name = 'FcnActionCondEncoderNet_levels' + ''.join(str(level) for level in levels)
    input_vars = OrderedDict([(var.name, var) for var in [X, U, X_diff]])
    pred_vars = OrderedDict([('Y_diff_pred', Y_diff_pred), ('Y', Y),
                             ('X_next_pred', X_next_pred)])
    return net_name, input_vars, pred_vars, loss
