def __init__(self, input_shapes, axis=1, name=None,
             M=nn.IIDGaussian(std=0.001), N=nn.IIDGaussian(std=0.001),
             b=nn.Constant(0)):
    assert axis >= 1
    self.axis = axis
    name = "unnamed" if name is None else name
    self.y_shape, self.u_shape = input_shapes
    # y is flattened from `axis` onward (in batched terms) into a single
    # feature dimension; the control u must be a vector
    self.y_dim = int(np.prod(self.y_shape[self.axis - 1:]))
    self.u_dim, = self.u_shape
    self.M = nn.parameter(nn.init_array(M, (self.y_dim, self.y_dim, self.u_dim)),
                          name=name + ".M")
    self.N = nn.parameter(nn.init_array(N, (self.y_dim, self.u_dim)),
                          name=name + ".N")
    if b is None:
        self.b = None
    else:
        self.b = nn.parameter(nn.init_array(b, (self.y_dim,)),
                              name=name + ".b")  # TODO: not regularizable
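# A minimal NumPy sketch (illustrative, not part of the layer) of the map the
# Bilinear layer above parameterizes for a single example: the predicted
# change dy in the flattened feature y is bilinear in (y, u), plus a term
# linear in the control u and an optional bias b:
#     dy[i] = sum_jk M[i,j,k] y[j] u[k] + sum_k N[i,k] u[k] + b[i]
import numpy as np

def bilinear_forward_np(M, N, b, y, u):
    # np.einsum spells out the contraction above directly
    dy = np.einsum('ijk,j,k->i', M, y, u) + N.dot(u)
    return dy if b is None else dy + b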
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--profile", action="store_true")
    parser.add_argument("--unittest", action="store_true")
    parser.add_argument("--epochs", type=int, default=10)
    args = parser.parse_args()

    batchsize = 64
    Xshape = (batchsize, 3, 32, 32)
    X = cgt.tensor4("X", fixed_shape=Xshape)
    y = cgt.vector("y", fixed_shape=(batchsize,), dtype='i4')

    conv1 = nn.SpatialConvolution(3, 32, kernelshape=(5,5), pad=(2,2),
                                  weight_init=nn.IIDGaussian(std=1e-4))(X)
    relu1 = nn.rectify(conv1)
    pool1 = nn.max_pool_2d(relu1, kernelshape=(3,3), stride=(2,2))
    # conv2 takes pool1; the original passed relu1, leaving pool1 unused
    conv2 = nn.SpatialConvolution(32, 32, kernelshape=(5,5), pad=(2,2),
                                  weight_init=nn.IIDGaussian(std=0.01))(pool1)
    relu2 = nn.rectify(conv2)
    pool2 = nn.max_pool_2d(relu2, kernelshape=(3,3), stride=(2,2))
    conv3 = nn.SpatialConvolution(32, 64, kernelshape=(5,5), pad=(2,2),
                                  weight_init=nn.IIDGaussian(std=0.01))(pool2)
    pool3 = nn.max_pool_2d(conv3, kernelshape=(3,3), stride=(2,2))
    relu3 = nn.rectify(pool3)
    d0, d1, d2, d3 = relu3.shape
    flatlayer = relu3.reshape([d0, d1*d2*d3])
    nfeats = cgt.infer_shape(flatlayer)[1]
    ip1 = nn.Affine(nfeats, 10)(flatlayer)
    logprobs = nn.logsoftmax(ip1)
    loss = -logprobs[cgt.arange(batchsize), y].mean()

    params = nn.get_parameters(loss)
    updates = rmsprop_updates(loss, params, stepsize=1e-3)
    train = cgt.function(inputs=[X, y], outputs=[loss], updates=updates)

    if args.profile: cgt.profiler.start()

    data = np.load("/Users/joschu/Data/cifar-10-batches-py/cifar10.npz")  # machine-specific path
    Xtrain = data["X_train"]
    ytrain = data["y_train"]

    print fmt_row(10, ["Epoch", "Train NLL", "Train Err", "Test NLL", "Test Err", "Epoch Time"])
    for i_epoch in xrange(args.epochs):
        for start in xrange(0, Xtrain.shape[0], batchsize):
            tstart = time.time()
            end = start + batchsize
            print train(Xtrain[start:end], ytrain[start:end]), time.time() - tstart
            if start > batchsize*5:
                break
        # elapsed = time.time() - tstart
        # trainerr, trainloss = computeloss(Xtrain[:len(Xtest)], ytrain[:len(Xtest)])
        # testerr, testloss = computeloss(Xtest, ytest)
        # print fmt_row(10, [i_epoch, trainloss, trainerr, testloss, testerr, elapsed])
        if args.profile:
            cgt.profiler.print_stats()
            return
        if args.unittest:
            break
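# `rmsprop_updates` is assumed to be defined elsewhere in this file; if it is
# not, one plausible definition is the RMSProp form used in the CGT examples
# (running cache of squared gradients; hyperparameter values illustrative):
def rmsprop_updates(cost, params, stepsize=0.001, rho=0.9, epsilon=1e-6):
    grads = cgt.grad(cost, params)
    updates = []
    for p, g in zip(params, grads):
        acc = cgt.shared(p.op.get_value() * 0.)           # running avg of g^2
        acc_new = rho * acc + (1 - rho) * cgt.square(g)
        updates.append((acc, acc_new))
        updates.append((p, p - stepsize * g / cgt.sqrt(acc_new + epsilon)))
    return updates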
def build_fc_return_loss(X, y):
    """ Build fully connected network and return loss """
    np.random.seed(0)
    h1 = nn.rectify(
        nn.Affine(28 * 28, 256, weight_init=nn.IIDGaussian(std=.1))(X))
    h2 = nn.rectify(
        nn.Affine(256, 256, weight_init=nn.IIDGaussian(std=.1))(h1))
    logprobs = nn.logsoftmax(
        nn.Affine(256, 10, weight_init=nn.IIDGaussian(std=.1))(h2))
    neglogliks = -logprobs[cgt.arange(X.shape[0]), y]
    loss = neglogliks.mean()
    return loss
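# A usage sketch for the builder above (the wrapper function and stepsize are
# illustrative): compile the loss into a plain-SGD training step.
def _make_fc_updater(stepsize=1e-2):
    X = cgt.matrix("X", fixed_shape=(None, 28 * 28))
    y = cgt.vector("y", dtype='i8')
    loss = build_fc_return_loss(X, y)
    params = nn.get_parameters(loss)
    updates = [(p, p - stepsize * g)
               for (p, g) in zip(params, cgt.grad(loss, params))]
    return cgt.function([X, y], loss, updates=updates)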
def _init_optim_state(ws, reset=False):
    if 'optim_state' in ws and not reset:
        return
    config = ws['config']
    if 'optim_state' in ws:
        print "Reusing cached optim_state"
        theta = ws['optim_state']['theta']
    elif 'snapshot' in config:
        print "Loading optim_state from previous snapshot: %s" % config['snapshot']
        ws['optim_state'] = pickle.load(open(config['snapshot'], 'rb'))  # binary mode for pickle
        theta = ws['optim_state']['theta']
    else:
        init_method = config['init_theta']['distr']
        if init_method == 'XavierNormal':
            init_theta = nn.XavierNormal(**config['init_theta']['params'])
        elif init_method == 'gaussian':
            init_theta = nn.IIDGaussian(**config['init_theta']['params'])
        else:
            raise ValueError('unknown init distribution')
        theta = nn.init_array(init_theta,
                              (ws['param_col'].get_total_size(), 1)).flatten()
    method = config['opt_method'].lower()
    if method == 'rmsprop':
        optim_create = lambda t: rmsprop_create(t, step_size=config['step_size'])
    elif method == 'adam':
        optim_create = lambda t: adam_create(t, step_size=config['step_size'])
    else:
        raise ValueError('unknown optimization method: %s' % method)
    if reset or 'optim_state' not in ws:
        ws['optim_state'] = optim_create(theta)
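# A hypothetical `config` dict illustrating the keys _init_optim_state reads
# (key names match the lookups above; the values are only examples):
example_config = {
    'init_theta': {'distr': 'gaussian', 'params': {'std': 0.01}},
    'opt_method': 'adam',  # or 'rmsprop'
    'step_size': 1e-3,
    # 'snapshot': '.../optim_state.pkl',  # optional: resume from a pickled state
}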
def make_updater_convnet_theano():
    X = TT.tensor4("X")  # so shapes can be inferred
    y = TT.ivector("y")
    np.random.seed(0)
    stepsize = TT.scalar("stepsize")
    layer1 = SpatialConvolutionTheano(1, 32, kernelshape=(3, 3), pad=(0, 0),
                                      weight_init=nn.IIDGaussian(std=.1))
    conv1 = nn.rectify(layer1(X))
    pool1 = theano.tensor.signal.downsample.max_pool_2d(conv1, ds=(3, 3), st=(2, 2))
    layer2 = SpatialConvolutionTheano(32, 32, kernelshape=(3, 3), pad=(0, 0),
                                      weight_init=nn.IIDGaussian(std=.1))
    conv2 = nn.rectify(layer2(pool1))
    pool2 = theano.tensor.signal.downsample.max_pool_2d(conv2, ds=(3, 3), st=(2, 2))
    d0, d1, d2, d3 = pool2.shape
    flatlayer = pool2.reshape([d0, d1 * d2 * d3])
    # Theano doesn't know how to calculate shapes before compiling the
    # function, so this needs to be computed by hand
    nfeats = 800
    layer3 = AffineTheano(nfeats, 10)
    ip1 = layer3(flatlayer)
    logprobs = logsoftmax_theano(ip1)
    loss = -logprobs[TT.arange(X.shape[0]), y].mean()
    params = [layer1.weight, layer1.bias,
              layer2.weight, layer2.bias,
              layer3.weight, layer3.bias]
    gparams = TT.grad(loss, params)
    updates = [(p, p - stepsize * gp) for (p, gp) in zip(params, gparams)]
    return theano.function([X, y, stepsize], loss, updates=updates,
                           allow_input_downcast=True)
def make_updater_fc_theano():
    X = TT.matrix("X")
    y = TT.ivector("y")
    np.random.seed(0)
    stepsize = TT.scalar("stepsize")
    layer1 = AffineTheano(28 * 28, 256, weight_init=nn.IIDGaussian(std=.1))
    h1 = nn.rectify(layer1(X))
    layer2 = AffineTheano(256, 256, weight_init=nn.IIDGaussian(std=.1))
    h2 = nn.rectify(layer2(h1))
    layer3 = AffineTheano(256, 10, weight_init=nn.IIDGaussian(std=.1))
    logprobs = logsoftmax_theano(layer3(h2))
    neglogliks = -logprobs[TT.arange(X.shape[0]), y]
    loss = neglogliks.mean()
    # include the output layer's parameters (the original constructed the
    # final Affine inline and omitted it here, so it was never updated)
    params = [layer1.weight, layer1.bias, layer2.weight, layer2.bias,
              layer3.weight, layer3.bias]
    gparams = TT.grad(loss, params)
    updates = [(p, p - stepsize * gp) for (p, gp) in zip(params, gparams)]
    return theano.function([X, y, stepsize], loss, updates=updates,
                           allow_input_downcast=True)
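# Usage sketch for the Theano updaters above (random MNIST-shaped data; this
# demo function is illustrative, not from the source; the convnet updater is
# called the same way with (n, 1, 28, 28) inputs):
def _demo_fc_updater_theano():
    updater = make_updater_fc_theano()
    Xbatch = np.random.randn(64, 28 * 28).astype('float32')
    ybatch = np.random.randint(0, 10, size=64).astype('int32')
    for i in xrange(10):
        loss = updater(Xbatch, ybatch, 1e-2)  # one SGD step; returns scalar loss
        print "iter %d, loss %g" % (i, loss)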
def build_convnet_return_loss(X, y):
    """ Build a small convnet on X and return the mean negative log-likelihood of labels y """
    np.random.seed(0)
    conv1 = nn.rectify(
        nn.SpatialConvolution(1, 32, kernelshape=(3, 3), pad=(0, 0),
                              weight_init=nn.IIDGaussian(std=.1))(X))
    pool1 = nn.max_pool_2d(conv1, kernelshape=(3, 3), stride=(2, 2))
    conv2 = nn.rectify(
        nn.SpatialConvolution(32, 32, kernelshape=(3, 3), pad=(0, 0),
                              weight_init=nn.IIDGaussian(std=.1))(pool1))
    pool2 = nn.max_pool_2d(conv2, kernelshape=(3, 3), stride=(2, 2))
    d0, d1, d2, d3 = pool2.shape
    flatlayer = pool2.reshape([d0, d1 * d2 * d3])
    nfeats = cgt.infer_shape(flatlayer)[1]
    logprobs = nn.logsoftmax(nn.Affine(nfeats, 10)(flatlayer))
    loss = -logprobs[cgt.arange(X.shape[0]), y].mean()
    return loss
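# Usage sketch: unlike the Theano version above, CGT infers the flattened
# feature size symbolically, so no hand-computed `nfeats` is needed.
# (wrapper and shapes illustrative: MNIST-sized single-channel images)
def _make_convnet_loss_fn():
    X = cgt.tensor4("X", fixed_shape=(None, 1, 28, 28))
    y = cgt.vector("y", dtype='i8')
    loss = build_convnet_return_loss(X, y)
    return cgt.function([X, y], loss)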
def __init__(self, n_actions):
    Serializable.__init__(self, n_actions)
    cgt.set_precision('double')
    n_in = 128
    o_no = cgt.matrix("o_no", fixed_shape=(None, n_in))
    a_n = cgt.vector("a_n", dtype='i8')
    q_n = cgt.vector("q_n")
    oldpdist_np = cgt.matrix("oldpdists")
    h0 = (o_no - 128.0) / 128.0  # scale byte-valued RAM observations to ~[-1, 1]
    nhid = 64
    h1 = cgt.tanh(
        nn.Affine(128, nhid, weight_init=nn.IIDGaussian(std=.1))(h0))
    probs_na = nn.softmax(
        nn.Affine(nhid, n_actions, weight_init=nn.IIDGaussian(std=0.01))(h1))
    logprobs_na = cgt.log(probs_na)
    b = cgt.size(o_no, 0)
    logps_n = logprobs_na[cgt.arange(b), a_n]
    surr = (logps_n * q_n).mean()  # policy-gradient surrogate objective
    kl = (oldpdist_np * cgt.log(oldpdist_np / probs_na)).sum(axis=1).mean()
    params = nn.get_parameters(surr)
    gradsurr = cgt.grad(surr, params)
    flatgrad = cgt.concatenate([p.flatten() for p in gradsurr])
    lam = cgt.scalar()
    penobj = surr - lam * kl  # KL-penalized objective
    self._f_grad_lagrangian = cgt.function(
        [lam, oldpdist_np, o_no, a_n, q_n],
        cgt.concatenate([p.flatten() for p in cgt.grad(penobj, params)]))
    self.f_pdist = cgt.function([o_no], probs_na)
    self.f_probs = cgt.function([o_no], probs_na)
    self.f_surr_kl = cgt.function([oldpdist_np, o_no, a_n, q_n], [surr, kl])
    self.f_gradlogp = cgt.function([oldpdist_np, o_no, a_n, q_n], flatgrad)
    self.pc = ParamCollection(params)
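# Usage sketch (class name and numbers hypothetical; observations are the
# 128-byte Atari RAM vectors this policy expects, scaled internally):
# policy = AtariRAMPolicy(n_actions=6)
# probs_na = policy.f_probs(o_no)                           # (batch, n_actions)
# surr, kl = policy.f_surr_kl(oldpdist_np, o_no, a_n, q_n)  # objectives
# g = policy._f_grad_lagrangian(lam, oldpdist_np, o_no, a_n, q_n)  # grad of surr - lam*kl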
def __init__(self, num_features=None, num_hidden=100):
    stepsize = 0.01
    # X: a single input row, shape (batchsize=1, num_features)
    X = cgt.matrix("X", fixed_shape=(1, num_features))
    # y: a symbolic scalar for the target reward
    y = cgt.scalar("y", dtype='float64')
    hid1 = nn.rectify(
        nn.Affine(num_features, num_hidden,
                  weight_init=nn.IIDGaussian(std=.1),
                  bias_init=nn.Constant(1))(X))
    # one final fully connected layer, with a linear output for the reward
    output = nn.Affine(num_hidden, 1,
                       weight_init=nn.IIDGaussian(std=.1),
                       bias_init=nn.Constant(1))(hid1)
    abs_deviation = cgt.abs(output - y).mean()  # L1 regression loss
    params = nn.get_parameters(abs_deviation)
    gparams = cgt.grad(abs_deviation, params)
    updates = [(p, p - stepsize * gp) for (p, gp) in zip(params, gparams)]
    self.predictor = cgt.function([X], output)
    self.updater = cgt.function([X, y], abs_deviation, updates=updates)
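# Usage sketch (class name `RewardPredictor` is hypothetical): the net
# consumes one (1, num_features) row and one float target per update.
# net = RewardPredictor(num_features=10)
# for x_row, r in samples:            # x_row: (1, 10) array, r: float
#     l1 = net.updater(x_row, r)      # one SGD step on the L1 loss
# r_hat = net.predictor(x_row)        # predicted reward, shape (1, 1)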
def build_fcn_action_cond_encoder_net(input_shapes, levels=None):
    x_shape, u_shape = input_shapes
    x_c_dim = x_shape[0]
    x1_c_dim = 16
    levels = levels or [3]
    levels = sorted(set(levels))

    X = cgt.tensor4('X', fixed_shape=(None,) + x_shape)
    U = cgt.matrix('U', fixed_shape=(None,) + u_shape)

    # encoding: conv-conv-pool blocks, doubling the channel count per level
    Xlevels = {}
    for level in range(levels[-1] + 1):
        if level == 0:
            Xlevel = X
        else:
            if level == 1:
                xlevelm1_c_dim = x_c_dim
                xlevel_c_dim = x1_c_dim
            else:
                xlevelm1_c_dim = xlevel_c_dim
                xlevel_c_dim = 2 * xlevel_c_dim
            Xlevel_1 = nn.rectify(
                nn.SpatialConvolution(xlevelm1_c_dim, xlevel_c_dim,
                                      kernelshape=(3, 3), pad=(1, 1), stride=(1, 1),
                                      name='conv%d_1' % level,
                                      weight_init=nn.IIDGaussian(std=0.01))(Xlevels[level - 1]))
            Xlevel_2 = nn.rectify(
                nn.SpatialConvolution(xlevel_c_dim, xlevel_c_dim,
                                      kernelshape=(3, 3), pad=(1, 1), stride=(1, 1),
                                      name='conv%d_2' % level,
                                      weight_init=nn.IIDGaussian(std=0.01))(Xlevel_1))
            Xlevel = nn.max_pool_2d(Xlevel_2, kernelshape=(2, 2), pad=(0, 0), stride=(2, 2))
        Xlevels[level] = Xlevel

    # bilinear: predict the feature change at each requested level from (features, action)
    Xlevels_next_pred_0 = {}
    Ylevels = OrderedDict()
    Ylevels_diff_pred = OrderedDict()
    for level in levels:
        Xlevel = Xlevels[level]
        Xlevel_diff_pred = Bilinear(input_shapes, b=None, axis=2,
                                    name='bilinear%d' % level)(Xlevel, U)
        Xlevels_next_pred_0[level] = Xlevel + Xlevel_diff_pred
        Ylevels[level] = Xlevel.reshape(
            (Xlevel.shape[0], cgt.mul_multi(Xlevel.shape[1:])))
        Ylevels_diff_pred[level] = Xlevel_diff_pred.reshape(
            (Xlevel_diff_pred.shape[0], cgt.mul_multi(Xlevel_diff_pred.shape[1:])))

    # decoding: deconvolve back to input resolution, merging level predictions
    Xlevels_next_pred = {}
    for level in range(levels[-1] + 1)[::-1]:
        if level == levels[-1]:
            Xlevel_next_pred = Xlevels_next_pred_0[level]
        else:
            if level == 0:
                if levels[-1] > 1:
                    # keep the channel bookkeeping consistent at the bottom
                    # level (the original left xlevel_c_dim stale here when
                    # decoding spans more than one level)
                    xlevel_c_dim = xlevelm1_c_dim
                xlevelm1_c_dim = x_c_dim
            elif level < levels[-1] - 1:
                xlevel_c_dim = xlevelm1_c_dim
                xlevelm1_c_dim = xlevelm1_c_dim // 2
            Xlevel_next_pred_2 = SpatialDeconvolution(
                xlevel_c_dim, xlevel_c_dim,
                kernelshape=(2, 2), pad=(0, 0), stride=(2, 2),
                name='upsample%d' % (level + 1),
                weight_init=nn.IIDGaussian(std=0.01))(Xlevels_next_pred[level + 1])
            # TODO initialize with bilinear
            # TODO should rectify?
            Xlevel_next_pred_1 = nn.rectify(
                SpatialDeconvolution(
                    xlevel_c_dim, xlevel_c_dim,
                    kernelshape=(3, 3), pad=(1, 1), stride=(1, 1),
                    name='deconv%d_2' % (level + 1),
                    weight_init=nn.IIDGaussian(std=0.01))(Xlevel_next_pred_2))
            nonlinearity = nn.rectify if level > 0 else cgt.tanh
            Xlevel_next_pred = nonlinearity(
                SpatialDeconvolution(
                    xlevel_c_dim, xlevelm1_c_dim,
                    kernelshape=(3, 3), pad=(1, 1), stride=(1, 1),
                    name='deconv%d_1' % (level + 1),
                    weight_init=nn.IIDGaussian(std=0.01))(Xlevel_next_pred_1))
            if level in Xlevels_next_pred_0:
                coefs = nn.parameter(nn.init_array(nn.Constant(0.5), (2,)),
                                     name='sum%d.coef' % level)
                Xlevel_next_pred = coefs[0] * Xlevel_next_pred \
                    + coefs[1] * Xlevels_next_pred_0[level]
            # TODO: tanh should be after sum
        Xlevels_next_pred[level] = Xlevel_next_pred

    X_next_pred = Xlevels_next_pred[0]
    Y = cgt.concatenate(Ylevels.values(), axis=1)
    Y_diff_pred = cgt.concatenate(Ylevels_diff_pred.values(), axis=1)

    X_diff = cgt.tensor4('X_diff', fixed_shape=(None,) + x_shape)
    X_next = X + X_diff
    loss = ((X_next - X_next_pred) ** 2).mean(axis=0).sum() / 2.

    net_name = 'FcnActionCondEncoderNet_levels' + ''.join(str(level) for level in levels)
    input_vars = OrderedDict([(var.name, var) for var in [X, U, X_diff]])
    pred_vars = OrderedDict([('Y_diff_pred', Y_diff_pred), ('Y', Y),
                             ('X_next_pred', X_next_pred)])
    return net_name, input_vars, pred_vars, loss
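# Usage sketch for the builder above (wrapper and shapes illustrative:
# 3-channel 32x32 observations, 4-dimensional actions):
def _make_encoder_loss_fn():
    net_name, input_vars, pred_vars, loss = build_fcn_action_cond_encoder_net(
        ((3, 32, 32), (4,)), levels=[3])
    # compile the loss; in practice one would also pass optimizer updates
    return cgt.function(list(input_vars.values()), loss)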
def __init__(self, obs_dim, ctrl_dim):
    cgt.set_precision('double')
    Serializable.__init__(self, obs_dim, ctrl_dim)
    self.obs_dim = obs_dim
    self.ctrl_dim = ctrl_dim

    o_no = cgt.matrix("o_no", fixed_shape=(None, obs_dim))
    a_na = cgt.matrix("a_na", fixed_shape=(None, ctrl_dim))
    adv_n = cgt.vector("adv_n")
    oldpdist_np = cgt.matrix("oldpdist", fixed_shape=(None, 2 * ctrl_dim))
    self.logstd = logstd_1a = nn.parameter(np.zeros((1, self.ctrl_dim)),
                                           name="std_1a")
    std_1a = cgt.exp(logstd_1a)

    # Here's where we apply the network
    h0 = o_no
    nhid = 32
    h1 = cgt.tanh(
        nn.Affine(obs_dim, nhid, weight_init=nn.IIDGaussian(std=0.1))(h0))
    h2 = cgt.tanh(
        nn.Affine(nhid, nhid, weight_init=nn.IIDGaussian(std=0.1))(h1))
    mean_na = nn.Affine(nhid, ctrl_dim,
                        weight_init=nn.IIDGaussian(std=0.01))(h2)

    b = cgt.size(o_no, 0)
    std_na = cgt.repeat(std_1a, b, axis=0)
    # the old distribution is stored as [mean, std] concatenated per row
    oldmean_na = oldpdist_np[:, 0:self.ctrl_dim]
    oldstd_na = oldpdist_np[:, self.ctrl_dim:2 * self.ctrl_dim]

    # diagonal-Gaussian log-likelihoods (constant terms cancel in the ratio)
    logp_n = ((-.5) * cgt.square(
        (a_na - mean_na) / std_na).sum(axis=1)) - logstd_1a.sum()
    oldlogp_n = ((-.5) * cgt.square(
        (a_na - oldmean_na) / oldstd_na).sum(axis=1)
                 ) - cgt.log(oldstd_na).sum(axis=1)

    ratio_n = cgt.exp(logp_n - oldlogp_n)
    surr = (ratio_n * adv_n).mean()  # importance-sampled surrogate objective
    pdists_np = cgt.concatenate([mean_na, std_na], axis=1)

    params = nn.get_parameters(surr)

    # KL(old || new) for diagonal Gaussians
    oldvar_na = cgt.square(oldstd_na)
    var_na = cgt.square(std_na)
    kl = (cgt.log(std_na / oldstd_na) +
          (oldvar_na + cgt.square(oldmean_na - mean_na)) / (2 * var_na) -
          .5).sum(axis=1).mean()

    lam = cgt.scalar()
    penobj = surr - lam * kl
    self._compute_surr_kl = cgt.function([oldpdist_np, o_no, a_na, adv_n],
                                         [surr, kl])
    self._compute_grad_lagrangian = cgt.function(
        [lam, oldpdist_np, o_no, a_na, adv_n],
        cgt.concatenate([p.flatten() for p in cgt.grad(penobj, params)]))
    self.f_pdist = cgt.function([o_no], pdists_np)
    self.f_objs = cgt.function([oldpdist_np, o_no, a_na, adv_n], [surr, kl])
    self.pc = ParamCollection(params)
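# Usage sketch (class name, shapes, and the ParamCollection accessor are
# assumptions) of one dual-ascent step with this policy: evaluate the
# surrogate and KL, then follow the gradient of penobj = surr - lam * kl.
# policy = GaussianMLPPolicy(obs_dim=10, ctrl_dim=2)
# surr, kl = policy._compute_surr_kl(oldpdist_np, o_no, a_na, adv_n)
# flat_g = policy._compute_grad_lagrangian(lam, oldpdist_np, o_no, a_na, adv_n)
# theta = policy.pc.get_value_flat() + stepsize * flat_g  # ascend the surrogate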