def __init__(self, n_actions):
    Serializable.__init__(self, n_actions)
    cgt.set_precision('double')
    n_in = 128
    o_no = cgt.matrix("o_no", fixed_shape=(None, n_in))
    a_n = cgt.vector("a_n", dtype='i8')
    q_n = cgt.vector("q_n")
    oldpdist_np = cgt.matrix("oldpdists")

    h0 = (o_no - 128.0) / 128.0
    nhid = 64
    h1 = cgt.tanh(nn.Affine(128, nhid, weight_init=nn.IIDGaussian(std=.1))(h0))
    probs_na = nn.softmax(nn.Affine(nhid, n_actions, weight_init=nn.IIDGaussian(std=0.01))(h1))
    logprobs_na = cgt.log(probs_na)
    b = cgt.size(o_no, 0)
    logps_n = logprobs_na[cgt.arange(b), a_n]
    surr = (logps_n * q_n).mean()
    kl = (oldpdist_np * cgt.log(oldpdist_np / probs_na)).sum(axis=1).mean()

    params = nn.get_parameters(surr)
    gradsurr = cgt.grad(surr, params)
    flatgrad = cgt.concatenate([p.flatten() for p in gradsurr])

    lam = cgt.scalar()
    penobj = surr - lam * kl
    self._f_grad_lagrangian = cgt.function(
        [lam, oldpdist_np, o_no, a_n, q_n],
        cgt.concatenate([p.flatten() for p in cgt.grad(penobj, params)]))

    self.f_pdist = cgt.function([o_no], probs_na)
    self.f_probs = cgt.function([o_no], probs_na)
    self.f_surr_kl = cgt.function([oldpdist_np, o_no, a_n, q_n], [surr, kl])
    self.f_gradlogp = cgt.function([oldpdist_np, o_no, a_n, q_n], flatgrad)

    self.pc = ParamCollection(params)
def __init__(self, input_feature_size, input_time_size, num_units,
             weight_init=HeUniform(), activation=cgt.sigmoid,
             cell_out_init=IIDUniform(-0.1, 0.1),
             hid_out_init=IIDUniform(-0.1, 0.1),
             # cell_out_init=Constant(0.0),
             # hid_out_init=Constant(0.0),
             backwards=False):

    ingate = Gate(W_in=weight_init, W_hid=weight_init, W_cell=weight_init,
                  nonlinearity=activation)
    forgetgate = Gate(W_in=weight_init, W_hid=weight_init, W_cell=weight_init,
                      nonlinearity=activation)
    cell = Gate(W_cell=None, nonlinearity=cgt.tanh)
    outgate = Gate(W_in=weight_init, W_hid=weight_init, W_cell=weight_init,
                   nonlinearity=activation)

    self.nonlinearity = activation
    self.num_units = num_units
    self.backwards = backwards
    self.timesteps = input_time_size

    def add_gate_params(gate, gate_name):
        """ Convenience function for adding layer parameters from a Gate instance. """
        return (parameter(init_array(gate.W_in, (input_feature_size, num_units)), name=None),
                parameter(init_array(gate.W_hid, (num_units, num_units)), name=None),
                parameter(init_array(gate.b, (1, num_units)), name=None),
                gate.nonlinearity)

    # Add in parameters from the supplied Gate instances
    (self.W_in_to_ingate, self.W_hid_to_ingate, self.b_ingate,
     self.nonlinearity_ingate) = add_gate_params(ingate, 'ingate')

    (self.W_in_to_forgetgate, self.W_hid_to_forgetgate, self.b_forgetgate,
     self.nonlinearity_forgetgate) = add_gate_params(forgetgate, 'forgetgate')

    (self.W_in_to_cell, self.W_hid_to_cell, self.b_cell,
     self.nonlinearity_cell) = add_gate_params(cell, 'cell')

    (self.W_in_to_outgate, self.W_hid_to_outgate, self.b_outgate,
     self.nonlinearity_outgate) = add_gate_params(outgate, 'outgate')

    self.hid_init = parameter(init_array(hid_out_init, (1, num_units)), name=None)
    self.cell_init = parameter(init_array(cell_out_init, (1, num_units)), name=None)

    # Stack input weight matrices into a (num_inputs, 4*num_units)
    # matrix, which speeds up computation
    self.W_in_stacked = cgt.concatenate(
        [self.W_in_to_ingate, self.W_in_to_forgetgate,
         self.W_in_to_cell, self.W_in_to_outgate], axis=1)

    # Same for hidden weight matrices
    self.W_hid_stacked = cgt.concatenate(
        [self.W_hid_to_ingate, self.W_hid_to_forgetgate,
         self.W_hid_to_cell, self.W_hid_to_outgate], axis=1)

    # Stack biases into a (4*num_units) vector
    self.b_stacked = cgt.concatenate(
        [self.b_ingate, self.b_forgetgate,
         self.b_cell, self.b_outgate], axis=1)

    self.cell_prev = None
    self.hid_prev = None
def circ_conv_1d(wg_bhn, s_bh3, axis=2):
    "VERY inefficient way to implement circular convolution for the special case of filter size 3"
    assert axis == 2
    n = cgt.size(wg_bhn, 2)
    wback = cgt.concatenate([wg_bhn[:, :, n-1:n], wg_bhn[:, :, :n-1]], axis=2)
    w = wg_bhn
    wfwd = cgt.concatenate([wg_bhn[:, :, 1:n], wg_bhn[:, :, 0:1]], axis=2)
    return cgt.broadcast("*", s_bh3[:, :, 0:1], wback, "xx1,xxx") \
        + cgt.broadcast("*", s_bh3[:, :, 1:2], w, "xx1,xxx") \
        + cgt.broadcast("*", s_bh3[:, :, 2:3], wfwd, "xx1,xxx")
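# A hedged NumPy reference for the size-3 circular convolution above (not part
# of the original code; names are illustrative). The concatenations in
# circ_conv_1d are circular shifts of the weighting, so np.roll reproduces them
# and is handy for sanity-checking the index arithmetic.
import numpy as np

def circ_conv_1d_np(wg_bhn, s_bh3):
    """Circularly convolve each (batch, head) weighting with a 3-tap shift kernel."""
    wback = np.roll(wg_bhn, 1, axis=2)   # matches [w[..., n-1:n], w[..., :n-1]]
    wfwd = np.roll(wg_bhn, -1, axis=2)   # matches [w[..., 1:n], w[..., 0:1]]
    return (s_bh3[:, :, 0:1] * wback
            + s_bh3[:, :, 1:2] * wg_bhn
            + s_bh3[:, :, 2:3] * wfwd)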
def __init__(self, obs_dim, ctrl_dim):
    cgt.set_precision('double')
    Serializable.__init__(self, obs_dim, ctrl_dim)
    self.obs_dim = obs_dim
    self.ctrl_dim = ctrl_dim

    o_no = cgt.matrix("o_no", fixed_shape=(None, obs_dim))
    a_na = cgt.matrix("a_na", fixed_shape=(None, ctrl_dim))
    adv_n = cgt.vector("adv_n")
    oldpdist_np = cgt.matrix("oldpdist", fixed_shape=(None, 2*ctrl_dim))
    self.logstd = logstd_1a = nn.parameter(np.zeros((1, self.ctrl_dim)), name="std_1a")
    std_1a = cgt.exp(logstd_1a)

    # Here's where we apply the network
    h0 = o_no
    nhid = 32
    h1 = cgt.tanh(nn.Affine(obs_dim, nhid, weight_init=nn.IIDGaussian(std=0.1))(h0))
    h2 = cgt.tanh(nn.Affine(nhid, nhid, weight_init=nn.IIDGaussian(std=0.1))(h1))
    mean_na = nn.Affine(nhid, ctrl_dim, weight_init=nn.IIDGaussian(std=0.01))(h2)

    b = cgt.size(o_no, 0)
    std_na = cgt.repeat(std_1a, b, axis=0)

    oldmean_na = oldpdist_np[:, 0:self.ctrl_dim]
    oldstd_na = oldpdist_np[:, self.ctrl_dim:2*self.ctrl_dim]

    logp_n = ((-.5) * cgt.square((a_na - mean_na) / std_na).sum(axis=1)) - logstd_1a.sum()
    oldlogp_n = ((-.5) * cgt.square((a_na - oldmean_na) / oldstd_na).sum(axis=1)) \
        - cgt.log(oldstd_na).sum(axis=1)

    ratio_n = cgt.exp(logp_n - oldlogp_n)
    surr = (ratio_n * adv_n).mean()

    pdists_np = cgt.concatenate([mean_na, std_na], axis=1)
    # kl = cgt.log(sigafter/)

    params = nn.get_parameters(surr)

    oldvar_na = cgt.square(oldstd_na)
    var_na = cgt.square(std_na)
    kl = (cgt.log(std_na / oldstd_na) +
          (oldvar_na + cgt.square(oldmean_na - mean_na)) / (2 * var_na) - .5).sum(axis=1).mean()

    lam = cgt.scalar()
    penobj = surr - lam * kl

    self._compute_surr_kl = cgt.function([oldpdist_np, o_no, a_na, adv_n], [surr, kl])
    self._compute_grad_lagrangian = cgt.function(
        [lam, oldpdist_np, o_no, a_na, adv_n],
        cgt.concatenate([p.flatten() for p in cgt.grad(penobj, params)]))
    self.f_pdist = cgt.function([o_no], pdists_np)
    self.f_objs = cgt.function([oldpdist_np, o_no, a_na, adv_n], [surr, kl])

    self.pc = ParamCollection(params)
def make_ff_controller(opt):

    b, h, m, p, k = opt.b, opt.h, opt.m, opt.p, opt.k

    H = 2*h
    in_size = k + h*m
    out_size = H*m + H + H + H*3 + H + h*m + h*m + p

    # Previous reads
    r_bhm = cgt.tensor3("r", fixed_shape=(b, h, m))
    # External inputs
    X_bk = cgt.matrix("x", fixed_shape=(b, k))
    r_b_hm = r_bhm.reshape([r_bhm.shape[0], r_bhm.shape[1]*r_bhm.shape[2]])
    # Input to controller
    inp_bq = cgt.concatenate([X_bk, r_b_hm], axis=1)

    hid_sizes = opt.ff_hid_sizes
    activation = cgt.tanh

    layer_out_sizes = [in_size] + hid_sizes + [out_size]
    last_out = inp_bq
    # feedforward part. we could simplify a bit by using nn.Affine
    for i in xrange(len(layer_out_sizes)-1):
        indim = layer_out_sizes[i]
        outdim = layer_out_sizes[i+1]
        W = cgt.shared(.02*nr.randn(indim, outdim), name="W%i" % i, fixed_shape_mask="all")
        bias = cgt.shared(.02*nr.randn(1, outdim), name="b%i" % i, fixed_shape_mask="all")
        last_out = cgt.broadcast("+", last_out.dot(W), bias, "xx,1x")
        # Don't apply nonlinearity at the last layer
        if i != len(layer_out_sizes)-2:
            last_out = activation(last_out)

    idx = 0
    k_bHm = last_out[:, idx:idx+H*m];    idx += H*m;    k_bHm = k_bHm.reshape([b, H, m])
    beta_bH = last_out[:, idx:idx+H];    idx += H
    g_bH = last_out[:, idx:idx+H];       idx += H
    s_bH3 = last_out[:, idx:idx+3*H];    idx += 3*H;    s_bH3 = s_bH3.reshape([b, H, 3])
    gamma_bH = last_out[:, idx:idx+H];   idx += H
    e_bhm = last_out[:, idx:idx+h*m];    idx += h*m;    e_bhm = e_bhm.reshape([b, h, m])
    a_bhm = last_out[:, idx:idx+h*m];    idx += h*m;    a_bhm = a_bhm.reshape([b, h, m])
    y_bp = last_out[:, idx:idx+p];       idx += p

    k_bHm = cgt.tanh(k_bHm)
    beta_bH = nn.softplus(beta_bH)
    g_bH = cgt.sigmoid(g_bH)
    s_bH3 = sum_normalize2(cgt.exp(s_bH3))
    gamma_bH = cgt.sigmoid(gamma_bH) + 1
    e_bhm = cgt.sigmoid(e_bhm)
    a_bhm = cgt.tanh(a_bhm)
    # y_bp = y_bp

    assert infer_shape(k_bHm) == (b, H, m)
    assert infer_shape(beta_bH) == (b, H)
    assert infer_shape(g_bH) == (b, H)
    assert infer_shape(s_bH3) == (b, H, 3)
    assert infer_shape(gamma_bH) == (b, H)
    assert infer_shape(e_bhm) == (b, h, m)
    assert infer_shape(a_bhm) == (b, h, m)
    assert infer_shape(y_bp) == (b, p)

    return nn.Module([r_bhm, X_bk],
                     [k_bHm, beta_bH, g_bH, s_bH3, gamma_bH, e_bhm, a_bhm, y_bp])
def make_funcs(config, dbg_out={}):
    net_in, net_out = hybrid_network(config['num_inputs'], config['num_outputs'],
                                     config['num_units'], config['num_sto'],
                                     dbg_out=dbg_out)
    if not config['dbg_out_full']: dbg_out = {}
    # def f_sample(_inputs, num_samples=1, flatten=False):
    #     _mean, _var = f_step(_inputs)
    #     _samples = []
    #     for _m, _v in zip(_mean, _var):
    #         _s = np.random.multivariate_normal(_m, np.diag(np.sqrt(_v)), num_samples)
    #         if flatten: _samples.extend(_s)
    #         else: _samples.append(_s)
    #     return np.array(_samples)
    Y_gt = cgt.matrix("Y")
    Y_prec = cgt.tensor3('V', fixed_shape=(None, config['num_inputs'], config['num_inputs']))
    params = nn.get_parameters(net_out)
    size_batch, size_out = net_out.shape
    inputs, outputs = [net_in], [net_out]
    if config['no_bias']:
        print "Excluding bias"
        params = [p for p in params if not p.name.endswith(".b")]
    loss_vec = dist.gaussian.logprob(Y_gt, net_out, Y_prec)
    if config['weight_decay'] > 0.:
        print "Applying penalty on parameter norm"
        params_flat = cgt.concatenate([p.flatten() for p in params])
        loss_param = config['weight_decay'] * cgt.sum(params_flat ** 2)
        loss_vec -= loss_param  # / size_batch
    loss = cgt.sum(loss_vec) / size_batch

    # TODO_TZ f_step seems not to fail if X has wrong dim
    f_step = cgt.function(inputs, outputs)
    f_surr = get_surrogate_func(inputs + [Y_prec, Y_gt], outputs,
                                [loss_vec], params, _dbg_out=dbg_out)

    return params, f_step, None, None, None, f_surr
def combo_layer(X, size_in, size_out, splits,
                s_funcs=None, o_funcs=None, name='?', dbg_out={}):
    """
    Create a combination of specified sub-layers and non-linearity.

    :param X: symbolic input
    :param size_in: input size (except batch)
    :param size_out: output size (except batch)
    :param splits: split points for applying each sub-layer
    :param s_funcs: list of functions to create each sub-layer of signature
                    (input, in_size, out_size, name) -> output
    :param o_funcs: list of non-linearity functions of signature
                    (input) -> output
    :param name: layer name
    :type name: str
    :return: symbolic output

    Note for s_funcs and o_funcs:
        - broadcasting enabled if not a list
        - element with value None has no effect (skip)
    """
    assert isinstance(splits, tuple) and len(splits) > 0
    assert all(splits[i] < splits[i+1] for i in xrange(len(splits) - 1))
    assert splits[0] >= 0 and splits[-1] <= size_out
    splits = list(splits)
    assert not isinstance(o_funcs, list) and not isinstance(s_funcs, list)
    o_funcs = list(o_funcs) if isinstance(o_funcs, tuple) \
        else [o_funcs] * (len(splits) + 1)
    s_funcs = list(s_funcs) if isinstance(s_funcs, tuple) \
        else [s_funcs] * (len(splits) + 1)
    assert len(splits) + 1 == len(o_funcs) == len(s_funcs)
    if splits[0] == 0:
        splits.pop(0)
        o_funcs.pop(0)
        s_funcs.pop(0)
    if len(splits) > 0 and splits[-1] == size_out:
        splits.pop()
        o_funcs.pop()
        s_funcs.pop()
    curr, names, ins, outs = 0, [], [], []
    splits.append(size_out)
    for _split, _f_s, _f_o in zip(splits, s_funcs, o_funcs):
        _name = 'L%s[%d:%d]' % (name, curr, _split)
        _s_out = _split - curr
        _i = X if _f_s is None else _f_s(X, size_in, _s_out, name=_name)
        _o = _i if _f_o is None else _f_o(_i)
        curr = _split
        ins.append(_i)
        outs.append(_o)
        names.append(_name)
    out = cgt.concatenate(outs, axis=1) if len(outs) > 1 else outs[0]
    dbg_out.update(dict(zip([_n + '~in' for _n in names], ins)))
    dbg_out.update(dict(zip([_n + '~out' for _n in names], outs)))
    return out
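# Assumed usage sketch for combo_layer (not from the original source; the
# sub-layer helper and sizes are illustrative). Two column ranges of a 10-wide
# output use the same affine+sigmoid sub-layer, but only the first 6 columns
# get an extra tanh on top; the results are concatenated back along axis=1.
def _affine_sig(x, n_in, n_out, name):
    return cgt.sigmoid(nn.Affine(n_in, n_out, name=name)(x))

X_combo = cgt.matrix('X_combo', fixed_shape=(None, 20))
combo_out = combo_layer(X_combo, size_in=20, size_out=10, splits=(6,),
                        s_funcs=_affine_sig, o_funcs=(cgt.tanh, None))
# combo_out has shape (batch, 10): columns [0, 6) are tanh(sigmoid(affine)),
# columns [6, 10) are sigmoid(affine).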
def initialize(self, loss, scale):
    self._iter = 0
    self.pc = Params(self.params)
    cur_val = self.pc.get_value_flat()
    idx = cur_val.nonzero()
    new_val = np.random.uniform(-scale, scale, size=(self.pc.get_total_size(),))
    new_val[idx] = cur_val[idx]
    self.sync(new_val)
    grad = cgt.concatenate([g.flatten() for g in cgt.grad(loss, self.params)])
    return grad
def make_loss_and_grad(net):
    X_b = inps[0]  # cgt.matrix(dtype=cgt.floatX)
    y_onehot = cgt.matrix(dtype='i4')
    outputs = [logprobs]

    loss = nn.crossent(outputs[0], y_onehot) / b_size
    # gradloss = cgt.grad(loss, params)
    gradloss = cgt.grad(loss, param_list)

    # XXX use flatcat function
    grad = cgt.concatenate([x.flatten() for x in gradloss])
    # grad = gradloss
    return cgt.make_function([X_b, y_onehot], [loss, grad, logprobs])
def hybrid_layer(X, size_in, size_out, size_random, dbg_out=[]):
    assert size_out >= size_random >= 0
    out = cgt.sigmoid(nn.Affine(
        size_in, size_out, name="InnerProd(%d->%d)" % (size_in, size_out)
    )(X))
    dbg_out.append(out)
    if size_random == 0:
        return out
    if size_random == size_out:
        out_s = cgt.bernoulli(out)
        return out_s
    out_s = cgt.bernoulli(out[:, :size_random])
    out = cgt.concatenate([out_s, out[:, size_random:]], axis=1)
    return out
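# Assumed usage sketch (names and sizes are illustrative, not from the
# original source): a hybrid layer whose first 4 of 10 output units are
# Bernoulli samples of the sigmoid activations (stochastic) and whose
# remaining 6 stay deterministic, concatenated into one (batch, 10) output.
X_hyb = cgt.matrix('X_hyb', fixed_shape=(None, 20))
h_hyb = hybrid_layer(X_hyb, size_in=20, size_out=10, size_random=4)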
def pyramidLayer(nn_input, temporal_resolution_decrease=2):
    """
    Batch by time by features. Decreases temporal resolution and increases
    feature dimension by a resolution decrease factor.
    """
    t_steps = cgt.infer_shape(nn_input)[1]
    if t_steps % temporal_resolution_decrease != 0:
        raise ValueError('number of timesteps is not divisible by resolution decrease!')
    out_list = []
    for iter_step in range(0, t_steps, temporal_resolution_decrease):
        concentrate_list = []
        for sub_iter_step in range(0, temporal_resolution_decrease):
            concentrate_list.append(nn_input[:, iter_step + sub_iter_step, :])
        out_list.append(cgt.concatenate(concentrate_list, axis=1))
    return cgt.dimshuffle(cgt.stack(out_list), [1, 0, 2])
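# A minimal NumPy sketch of the same pyramid reshaping (not part of the
# original code), assuming a (batch, time, features) array: merging adjacent
# timesteps and concatenating their features is equivalent to one reshape.
import numpy as np

def pyramid_np(x_btf, factor=2):
    b, t, f = x_btf.shape
    assert t % factor == 0
    return x_btf.reshape(b, t // factor, factor * f)  # (b, t/factor, factor*f)

# e.g. pyramid_np(np.zeros((8, 20, 13))).shape == (8, 10, 26)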
def mask_layer(func, X, size_in, i_start, i_end=None):
    if i_end is None:
        i_start, i_end = 0, i_start
    assert isinstance(i_start, int) and isinstance(i_end, int)
    assert -1 < i_start <= i_end <= size_in
    if i_end == i_start:
        return X
    if i_end - i_start == size_in:
        return func(X)
    outs = []
    if i_start > 0:
        outs.append(X[:, :i_start])
    outs.append(func(X[:, i_start:i_end]))
    if i_end < size_in:
        outs.append(X[:, i_end:])
    out = cgt.concatenate(outs, axis=1)
    return out
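# Illustrative (hypothetical) use of mask_layer: apply a nonlinearity to one
# column slice of a symbolic matrix and pass the remaining columns through.
X_mask = cgt.matrix('X_mask', fixed_shape=(None, 10))
masked = mask_layer(cgt.sigmoid, X_mask, size_in=10, i_start=2, i_end=5)
# masked keeps the (batch, 10) shape: columns [2, 5) go through cgt.sigmoid,
# while columns [0, 2) and [5, 10) are concatenated back unchanged.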
def get_features_bengio(nn_input, num_units=256, recurrent_layer=None):
    if recurrent_layer is None:
        recurrent_layer = recurrentLayer
    w_init = IIDUniform(-0.1, 0.1)
    activation = cgt.sigmoid
    assert num_units % 2 == 0
    num_units /= 2
    l1_f = recurrent_layer(nn_input=nn_input, num_units=num_units, activation=activation, w_init=w_init)
    l1_b = recurrent_layer(nn_input=nn_input, num_units=num_units, activation=activation, w_init=w_init)
    l1_plus = cgt.concatenate([l1_f, l1_b], axis=2)
    # l2_f = recurrent_layer(nn_input=l1_plus, num_units=num_units, activation=activation, w_init=w_init)
    # l2_b = recurrent_layer(nn_input=l1_plus, num_units=num_units, activation=activation, w_init=w_init)
    # l2_plus = cgt.concatenate([l2_f, l2_b], axis=2)
    # l3_f = recurrent_layer(nn_input=l2_plus, num_units=num_units, activation=activation, w_init=w_init)
    # l3_b = recurrent_layer(nn_input=l2_plus, num_units=num_units, activation=activation, w_init=w_init)
    # l3_plus = cgt.concatenate([l3_f, l3_b], axis=2)
    return l1_plus
def main(num_epochs=NUM_EPOCHS):
    # cgt.set_precision('half')
    print("Building network ...")
    # Recurrent layers expect input of shape
    # (batch size, max sequence length, number of features)
    X = cgt.tensor3(name='X', fixed_shape=(N_BATCH, MAX_LENGTH, 2))
    l_forward = nnbuilder.recurrentLayer(nn_input=X, num_units=N_HIDDEN)
    l_backward = nnbuilder.recurrentLayer(nn_input=X, num_units=N_HIDDEN, backwards=True)
    # l_forward = nnbuilder.LSTMLayer(nn_input=X, num_units=N_HIDDEN, activation=cgt.sigmoid)
    # l_backward = nnbuilder.LSTMLayer(nn_input=X, num_units=N_HIDDEN, activation=cgt.sigmoid, backwards=True)
    # l_forward = nnbuilder.GRULayer(nn_input=X, num_units=N_HIDDEN, activation=nn.rectify)
    # l_backward = nnbuilder.GRULayer(nn_input=X, num_units=N_HIDDEN, activation=nn.rectify, backwards=True)
    l_forward_slice = l_forward[:, MAX_LENGTH-1, :]  # Take the last element in the forward slice time dimension
    l_backward_slice = l_backward[:, 0, :]  # And the first element in the backward slice time dimension
    l_sum = cgt.concatenate([l_forward_slice, l_backward_slice], axis=1)
    l_out = nnbuilder.denseLayer(l_sum, num_units=1, activation=cgt.tanh)

    target_values = cgt.vector('target_output')
    predicted_values = l_out[:, 0]  # For this task we only need the last value
    cost = cgt.mean((predicted_values - target_values)**2)

    # Compute SGD updates for training
    print("Computing updates ...")
    updates = nn.rmsprop(cost, nn.get_parameters(l_out), LEARNING_RATE)
    # updates = nn.nesterov_momentum(cost, nn.get_parameters(l_out), 0.05)

    # cgt functions for training and computing cost
    print("Compiling functions ...")
    train = cgt.function([X, target_values], cost, updates=updates)
    compute_cost = cgt.function([X, target_values], cost)

    # We'll use this "validation set" to periodically check progress
    X_val, y_val, mask_val = gen_data()

    print("Training ...")
    time_start = time.time()
    try:
        for epoch in range(num_epochs):
            for _ in range(EPOCH_SIZE):
                X, y, m = gen_data()
                train(X, y)
            cost_val = compute_cost(X_val, y_val)
            print("Epoch {} validation cost = {}".format(epoch+1, cost_val))
            print('Epoch took ' + str(time.time() - time_start))
            time_start = time.time()
    except KeyboardInterrupt:
        pass
def make_funcs(config, dbg_out=None):
    params, Xs, Ys, C_0, H_0, C_T, H_T, C_1, H_1 = lstm_network(
        config['rnn_steps'], config['num_inputs'], config['num_outputs'],
        config['num_units'], config['num_mems'])

    # basic
    size_batch = Xs[0].shape[0]
    dY = Ys[0].shape[-1]
    Ys_gt = [cgt.matrix(fixed_shape=(size_batch, dY), name='Y%d' % t)
             for t in range(len(Ys))]
    Ys_var = [cgt.tensor3(fixed_shape=(size_batch, dY, dY)) for _ in Ys]
    net_inputs, net_outputs = Xs + C_0 + H_0 + Ys_var, Ys + C_T + H_T

    # calculate loss
    loss_vec = []
    for i in range(len(Ys)):
        # if i == 0: continue
        _l = dist.gaussian.logprob(Ys_gt[i], Ys[i], Ys_var[i])
        loss_vec.append(_l)
    loss_vec = cgt.add_multi(loss_vec)
    if config['weight_decay'] > 0.:
        params_flat = cgt.concatenate([p.flatten() for p in params])
        loss_param = config['weight_decay'] * cgt.sum(params_flat ** 2)
        loss_vec -= loss_param  # / size_batch
    loss = cgt.sum(loss_vec) / config['rnn_steps'] / size_batch
    grad = cgt.grad(loss, params)

    # functions
    def f_init(size_batch):
        c_0, h_0 = [], []
        for _n_m in config['num_mems']:
            if _n_m > 0:
                c_0.append(np.zeros((size_batch, _n_m)))
                h_0.append(np.zeros((size_batch, _n_m)))
        return c_0, h_0

    f_step = cgt.function([Xs[0]] + C_0 + H_0, [Ys[0]] + C_1 + H_1)
    f_loss = cgt.function(net_inputs + Ys_gt, loss)
    f_grad = cgt.function(net_inputs + Ys_gt, grad)
    f_surr = cgt.function(net_inputs + Ys_gt, [loss] + net_outputs + grad)
    return params, f_step, f_loss, f_grad, f_init, f_surr
def make_funcs(net_in, net_out, config, dbg_out=None):
    def f_grad(*x):
        out = f_surr(*x)
        return out['loss'], out['surr_loss'], out['surr_grad']

    Y = cgt.matrix("Y")
    params = nn.get_parameters(net_out)
    if 'no_bias' in config and config['no_bias']:
        print "Excluding bias"
        params = [p for p in params if not p.name.endswith(".b")]
    size_out, size_batch = Y.shape[1], net_in.shape[0]
    f_step = cgt.function([net_in], [net_out])

    # loss_raw of shape (size_batch, 1); loss should be a scalar
    # sum-of-squares loss
    sigma = 0.1
    loss_raw = -cgt.sum((net_out - Y) ** 2, axis=1, keepdims=True) / sigma
    # negative log-likelihood
    # out_sigma = cgt.exp(net_out[:, size_out:]) + 1.e-6  # positive sigma
    # loss_raw = -gaussian_diagonal.logprob(
    #     Y, net_out,
    #     out_sigma
    #     cgt.fill(.01, [size_batch, size_out])
    # )
    if 'param_penal_wt' in config:
        print "Applying penalty on parameter norm"
        assert config['param_penal_wt'] > 0
        params_flat = cgt.concatenate([p.flatten() for p in params])
        loss_param = cgt.fill(cgt.sum(params_flat ** 2), [size_batch, 1])
        loss_param *= config['param_penal_wt']
        loss_raw += loss_param
    loss = cgt.sum(loss_raw) / size_batch
    # end of loss definition

    f_loss = cgt.function([net_in, Y], [net_out, loss])
    f_surr = get_surrogate_func([net_in, Y], [net_out] + dbg_out, [loss_raw], params)

    return params, f_step, f_loss, f_grad, f_surr
def take_one_step(self, nn_input_bf, hid_out):
    # PROBABLY BUGGED. SHOULD BE REWRITTEN.
    self.num_batches = cgt.infer_shape(nn_input_bf)[0]

    # (n_time_steps, n_batch, n_features)
    # input_bf = cgt.dimshuffle(nn_input_bf, [1, 0, 2])

    # Stack input weight matrices into a (num_inputs, 3*num_units)
    # matrix, which speeds up computation
    W_in_stacked = cgt.concatenate(
        [self.W_in_to_resetgate, self.W_in_to_updategate,
         self.W_in_to_hidden_update], axis=1)

    # Same for hidden weight matrices
    W_hid_stacked = cgt.concatenate(
        [self.W_hid_to_resetgate, self.W_hid_to_updategate,
         self.W_hid_to_hidden_update], axis=1)

    # Stack gate biases into a (3*num_units) vector
    b_stacked = cgt.concatenate(
        [self.b_resetgate, self.b_updategate,
         self.b_hidden_update], axis=1)

    # At each loop, input_n will be (n_time_steps, 3*num_units).
    # We define a slicing function that extracts the input to each GRU gate
    def slice_w(x, n):
        return x[:, n*self.num_units:(n+1)*self.num_units]

    # Create single recurrent computation step function
    # input_n is the n'th vector of the input
    def step(input_n, hid_previous, W_hid_stacked, W_in_stacked, b_stacked):
        # Compute W_{hr} h_{t-1}, W_{hu} h_{t-1}, and W_{hc} h_{t-1}
        hid_input = cgt.dot(hid_previous, W_hid_stacked)

        # Compute W_{xr}x_t + b_r, W_{xu}x_t + b_u, and W_{xc}x_t + b_c
        input_n = cgt.broadcast("+", input_n.dot(W_in_stacked), b_stacked, "xx,1x")

        # Reset and update gates
        resetgate = slice_w(hid_input, 0) + slice_w(input_n, 0)
        updategate = slice_w(hid_input, 1) + slice_w(input_n, 1)
        resetgate = self.nonlinearity_resetgate(resetgate)
        updategate = self.nonlinearity_updategate(updategate)

        # Compute W_{xc}x_t + r_t \odot (W_{hc} h_{t-1})
        hidden_update_in = slice_w(input_n, 2)
        hidden_update_hid = slice_w(hid_input, 2)
        hidden_update = hidden_update_in + resetgate*hidden_update_hid

        # Compute (1 - u_t)h_{t-1} + u_t c_t
        hid = (1 - updategate)*hid_previous + updategate*hidden_update
        return self.nonlinearity_hid(hid)  # adding this non-linearity seems to help stability.
        # return hid

    if hid_out is None:
        if self.hid_out is None:
            self.hid_out = cgt.dot(cgt.ones((self.num_batches, 1)), self.hid_init)
        hid_out = self.hid_out

    # Retrieve the dimensionality of the incoming layer
    hid_out = step(nn_input_bf, hid_out, W_hid_stacked, W_in_stacked, b_stacked)

    # dimshuffle back to (n_batch, n_time_steps, n_features)
    # self.hid_out = cgt.dimshuffle(self.hid_out, [1, 0, 2])

    # if scan is backward reverse the output
    if self.backwards:
        hid_out = cgt.flip(hid_out, [1])

    self.hid_out = hid_out
    return hid_out
def __call__(self, input_btf):
    # (n_time_steps, n_batch, n_features)
    input_tbf = cgt.dimshuffle(input_btf, [1, 0, 2])
    self.num_batches = cgt.infer_shape(input_tbf)[1]

    # Stack input weight matrices into a (num_inputs, 3*num_units)
    # matrix, which speeds up computation
    W_in_stacked = cgt.concatenate(
        [self.W_in_to_resetgate, self.W_in_to_updategate,
         self.W_in_to_hidden_update], axis=1)

    # Same for hidden weight matrices
    W_hid_stacked = cgt.concatenate(
        [self.W_hid_to_resetgate, self.W_hid_to_updategate,
         self.W_hid_to_hidden_update], axis=1)

    # Stack gate biases into a (3*num_units) vector
    b_stacked = cgt.concatenate(
        [self.b_resetgate, self.b_updategate,
         self.b_hidden_update], axis=1)

    # At each loop, input_n will be (n_time_steps, 3*num_units).
    # We define a slicing function that extracts the input to each GRU gate
    def slice_w(x, n):
        return x[:, n*self.num_units:(n+1)*self.num_units]

    # Create single recurrent computation step function
    # input_n is the n'th vector of the input
    def step(input_n, hid_previous, W_hid_stacked, W_in_stacked, b_stacked):
        # Compute W_{hr} h_{t-1}, W_{hu} h_{t-1}, and W_{hc} h_{t-1}
        hid_input = cgt.dot(hid_previous, W_hid_stacked)

        # Compute W_{xr}x_t + b_r, W_{xu}x_t + b_u, and W_{xc}x_t + b_c
        input_n = cgt.broadcast("+", input_n.dot(W_in_stacked), b_stacked, "xx,1x")

        # Reset and update gates
        resetgate = slice_w(hid_input, 0) + slice_w(input_n, 0)
        updategate = slice_w(hid_input, 1) + slice_w(input_n, 1)
        resetgate = self.nonlinearity_resetgate(resetgate)
        updategate = self.nonlinearity_updategate(updategate)

        # Compute W_{xc}x_t + r_t \odot (W_{hc} h_{t-1})
        hidden_update_in = slice_w(input_n, 2)
        hidden_update_hid = slice_w(hid_input, 2)
        hidden_update = hidden_update_in + resetgate*hidden_update_hid

        # Compute (1 - u_t)h_{t-1} + u_t c_t
        hid = (1 - updategate)*hid_previous + updategate*hidden_update
        return hid

    sequences = [input_tbf]
    step_fun = step
    hid_init = cgt.dot(cgt.ones((self.num_batches, 1)), self.hid_init)

    # The hidden-to-hidden weight matrix is always used in step
    non_seqs = [W_hid_stacked]
    # When we aren't precomputing the input outside of scan, we need to
    # provide the input weights and biases to the step function
    non_seqs += [W_in_stacked, b_stacked]
    # theano.scan only allows for positional arguments, so when
    # self.precompute_input is True, we need to supply fake placeholder
    # arguments for the input weights and biases.

    # Retrieve the dimensionality of the incoming layer
    hid_out = unroll_lstm(
        fn=step_fun,
        sequences=sequences,
        outputs_info=[hid_init],
        go_backwards=self.backwards,
        non_sequences=non_seqs,
        n_steps=self.timesteps)[0]

    # dimshuffle back to (n_batch, n_time_steps, n_features)
    hid_out = cgt.dimshuffle(hid_out, [1, 0, 2])

    # if scan is backward reverse the output
    if self.backwards:
        hid_out = cgt.flip(hid_out, [1])

    return hid_out
def concatenate(items, axis=0):
    return cgt.concatenate(items, axis=axis)
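# Assumed usage sketch for the thin wrapper above (names are illustrative):
# join two symbolic matrices column-wise.
a_n3 = cgt.matrix('a_n3', fixed_shape=(None, 3))
b_n5 = cgt.matrix('b_n5', fixed_shape=(None, 5))
ab_n8 = concatenate([a_n3, b_n5], axis=1)  # shape (batch, 8)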
def build_fcn_action_cond_encoder_net(input_shapes, levels=None):
    x_shape, u_shape = input_shapes
    x_c_dim = x_shape[0]
    x1_c_dim = 16
    levels = levels or [3]
    levels = sorted(set(levels))

    X = cgt.tensor4('X', fixed_shape=(None,) + x_shape)
    U = cgt.matrix('U', fixed_shape=(None,) + u_shape)

    # encoding
    Xlevels = {}
    for level in range(levels[-1] + 1):
        if level == 0:
            Xlevel = X
        else:
            if level == 1:
                xlevelm1_c_dim = x_c_dim
                xlevel_c_dim = x1_c_dim
            else:
                xlevelm1_c_dim = xlevel_c_dim
                xlevel_c_dim = 2 * xlevel_c_dim
            Xlevel_1 = nn.rectify(
                nn.SpatialConvolution(xlevelm1_c_dim, xlevel_c_dim,
                                      kernelshape=(3, 3), pad=(1, 1), stride=(1, 1),
                                      name='conv%d_1' % level,
                                      weight_init=nn.IIDGaussian(std=0.01))(Xlevels[level - 1]))
            Xlevel_2 = nn.rectify(
                nn.SpatialConvolution(xlevel_c_dim, xlevel_c_dim,
                                      kernelshape=(3, 3), pad=(1, 1), stride=(1, 1),
                                      name='conv%d_2' % level,
                                      weight_init=nn.IIDGaussian(std=0.01))(Xlevel_1))
            Xlevel = nn.max_pool_2d(Xlevel_2, kernelshape=(2, 2), pad=(0, 0), stride=(2, 2))
        Xlevels[level] = Xlevel

    # bilinear
    Xlevels_next_pred_0 = {}
    Ylevels = OrderedDict()
    Ylevels_diff_pred = OrderedDict()
    for level in levels:
        Xlevel = Xlevels[level]
        Xlevel_diff_pred = Bilinear(input_shapes, b=None, axis=2,
                                    name='bilinear%d' % level)(Xlevel, U)
        Xlevels_next_pred_0[level] = Xlevel + Xlevel_diff_pred
        Ylevels[level] = Xlevel.reshape((Xlevel.shape[0], cgt.mul_multi(Xlevel.shape[1:])))
        Ylevels_diff_pred[level] = Xlevel_diff_pred.reshape(
            (Xlevel_diff_pred.shape[0], cgt.mul_multi(Xlevel_diff_pred.shape[1:])))

    # decoding
    Xlevels_next_pred = {}
    for level in range(levels[-1] + 1)[::-1]:
        if level == levels[-1]:
            Xlevel_next_pred = Xlevels_next_pred_0[level]
        else:
            if level == 0:
                xlevelm1_c_dim = x_c_dim
            elif level < levels[-1] - 1:
                xlevel_c_dim = xlevelm1_c_dim
                xlevelm1_c_dim = xlevelm1_c_dim // 2
            # TODO initialize with bilinear
            # TODO should rectify?
            Xlevel_next_pred_2 = SpatialDeconvolution(
                xlevel_c_dim, xlevel_c_dim,
                kernelshape=(2, 2), pad=(0, 0), stride=(2, 2),
                name='upsample%d' % (level + 1),
                weight_init=nn.IIDGaussian(std=0.01))(Xlevels_next_pred[level + 1])
            Xlevel_next_pred_1 = nn.rectify(
                SpatialDeconvolution(
                    xlevel_c_dim, xlevel_c_dim,
                    kernelshape=(3, 3), pad=(1, 1), stride=(1, 1),
                    name='deconv%d_2' % (level + 1),
                    weight_init=nn.IIDGaussian(std=0.01))(Xlevel_next_pred_2))
            nonlinearity = nn.rectify if level > 0 else cgt.tanh
            Xlevel_next_pred = nonlinearity(
                SpatialDeconvolution(
                    xlevel_c_dim, xlevelm1_c_dim,
                    kernelshape=(3, 3), pad=(1, 1), stride=(1, 1),
                    name='deconv%d_1' % (level + 1),
                    weight_init=nn.IIDGaussian(std=0.01))(Xlevel_next_pred_1))
            if level in Xlevels_next_pred_0:
                coefs = nn.parameter(nn.init_array(nn.Constant(0.5), (2,)),
                                     name='sum%d.coef' % level)
                Xlevel_next_pred = coefs[0] * Xlevel_next_pred \
                    + coefs[1] * Xlevels_next_pred_0[level]
                # TODO: tanh should be after sum
        Xlevels_next_pred[level] = Xlevel_next_pred

    X_next_pred = Xlevels_next_pred[0]
    Y = cgt.concatenate(Ylevels.values(), axis=1)
    Y_diff_pred = cgt.concatenate(Ylevels_diff_pred.values(), axis=1)

    X_diff = cgt.tensor4('X_diff', fixed_shape=(None,) + x_shape)
    X_next = X + X_diff
    loss = ((X_next - X_next_pred) ** 2).mean(axis=0).sum() / 2.

    net_name = 'FcnActionCondEncoderNet_levels' + ''.join(str(level) for level in levels)
    input_vars = OrderedDict([(var.name, var) for var in [X, U, X_diff]])
    pred_vars = OrderedDict([('Y_diff_pred', Y_diff_pred), ('Y', Y),
                             ('X_next_pred', X_next_pred)])
    return net_name, input_vars, pred_vars, loss
def stack(tensors, axis=0):
    if axis != 0:
        raise ValueError('only axis=0 is supported under cgt')
    return cgt.concatenate(map(lambda x: cgt.reshape(x, [1] + x.shape), tensors), axis=0)
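# Assumed usage sketch (names are illustrative): stack three (batch, features)
# matrices into a single (3, batch, features) tensor; this helper only
# supports stacking along axis 0.
x0 = cgt.matrix('x0', fixed_shape=(None, 4))
x1 = cgt.matrix('x1', fixed_shape=(None, 4))
x2 = cgt.matrix('x2', fixed_shape=(None, 4))
stacked = stack([x0, x1, x2])  # shape (3, batch, 4)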
def get_character_distribution(self, state_bf, context_bf):
    total_state = cgt.concatenate([state_bf, context_bf], axis=1)
    d1 = self.final_out_dense(total_state)
    return softmax(d1, axis=1)
fixed_shape_mask="all") yname = layer.top[0] output = [cgt.broadcast("+", X.dot(W), b, "xx,1x")] elif layer.type == "ReLU": output = [nn.rectify(inputs[0])] elif layer.type == "Softmax": output = [nn.softmax(inputs[0])] elif layer.type == "LRN": # XXX needs params param = layer.lrn_param output = [ nn.lrn(inputs[0], param.alpha, param.beta, param.local_size) ] elif layer.type == "Concat": param = layer.concat_param output = [cgt.concatenate(inputs, param.concat_dim)] elif layer.type == "Dropout": output = [nn.dropout(inputs[0])] elif layer.type == "SoftmaxWithLoss": output = [nn.loglik_softmax(inputs[0], inputs[1])] elif layer.type == "Accuracy": output = [nn.zero_one_loss(inputs[0], inputs[1])] else: cgt.error("unrecognized layer type %s" % layer.type) assert output is not None # assert isinstance(output, cgt.Node) for i in xrange(len(layer.top)): name2node[layer.top[i]] = output[i] print "stored", layer.top[0]
def flatcat(xs):
    return cgt.concatenate([x.flatten() for x in xs])
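# Assumed usage sketch for flatcat (names are illustrative): concatenate a
# parameter list, or its gradients, into one flat symbolic vector, as several
# of the snippets above do inline.
import numpy as np
W_fc = cgt.shared(np.zeros((3, 4)), name="W_fc")
b_fc = cgt.shared(np.zeros((1, 4)), name="b_fc")
theta = flatcat([W_fc, b_fc])  # symbolic vector of length 3*4 + 1*4 = 16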
        bname = layer.param[1].name or layer.name + ":b"
        bval = np.empty(bshape, dtype=cgt.floatX)
        b = name2node[bname] = cgt.shared(bval, name=bname, fixed_shape_mask="all")
        yname = layer.top[0]
        output = [cgt.broadcast("+", X.dot(W), b, "xx,1x")]
    elif layer.type == "ReLU":
        output = [nn.rectify(inputs[0])]
    elif layer.type == "Softmax":
        output = [nn.softmax(inputs[0])]
    elif layer.type == "LRN":
        # XXX needs params
        param = layer.lrn_param
        output = [nn.lrn(inputs[0], param.alpha, param.beta, param.local_size)]
    elif layer.type == "Concat":
        param = layer.concat_param
        output = [cgt.concatenate(inputs, param.concat_dim)]
    elif layer.type == "Dropout":
        output = [nn.dropout(inputs[0])]
    elif layer.type == "SoftmaxWithLoss":
        output = [nn.loglik_softmax(inputs[0], inputs[1])]
    elif layer.type == "Accuracy":
        output = [nn.zero_one_loss(inputs[0], inputs[1])]
    else:
        cgt.error("unrecognized layer type %s" % layer.type)

    assert output is not None
    # assert isinstance(output, cgt.Node)
    for i in xrange(len(layer.top)):
        name2node[layer.top[i]] = output[i]
    print "stored", layer.top[0]
    if layer.type != "Data":
def get_decoder_state(self, context_bf, prev_out_bc, prev_decoder_state):
    input_bf = cgt.concatenate([context_bf, prev_out_bc], axis=1)
    l1 = self.recurrent_decoder_one(input_bf, prev_decoder_state)
    l2 = self.recurrent_decoder_two(l1)
    return l2
def __call__(self, nn_input_btf):
    # Because scan iterates over the first dimension we dimshuffle to
    # (n_time_steps, n_batch, n_features)
    nn_input_tbf = cgt.dimshuffle(nn_input_btf, [1, 0, 2])
    seq_len, num_batch = nn_input_tbf.shape[0], nn_input_tbf.shape[1]

    # Stack input weight matrices into a (num_inputs, 4*num_units)
    # matrix, which speeds up computation
    W_in_stacked = cgt.concatenate(
        [self.W_in_to_ingate, self.W_in_to_forgetgate,
         self.W_in_to_cell, self.W_in_to_outgate], axis=1)

    # Same for hidden weight matrices
    W_hid_stacked = cgt.concatenate(
        [self.W_hid_to_ingate, self.W_hid_to_forgetgate,
         self.W_hid_to_cell, self.W_hid_to_outgate], axis=1)

    # Stack biases into a (4*num_units) vector
    b_stacked = cgt.concatenate(
        [self.b_ingate, self.b_forgetgate,
         self.b_cell, self.b_outgate], axis=1)

    def slice_w(x, n):
        return x[:, n*self.num_units:(n+1)*self.num_units]

    # Create single recurrent computation step function
    # input_n is the n'th vector of the input
    def step(input_n, cell_previous, hid_previous, W_hid_stacked, W_in_stacked, b_stacked):
        input_n = cgt.broadcast("+", input_n.dot(W_in_stacked), b_stacked, "xx,1x")

        # Calculate gates pre-activations and slice
        gates = input_n + cgt.dot(hid_previous, W_hid_stacked)

        # Extract the pre-activation gate values
        ingate = slice_w(gates, 0)
        forgetgate = slice_w(gates, 1)
        cell_input = slice_w(gates, 2)
        outgate = slice_w(gates, 3)

        # Apply nonlinearities
        ingate = self.nonlinearity_ingate(ingate)
        forgetgate = self.nonlinearity_forgetgate(forgetgate)
        cell_input = self.nonlinearity_cell(cell_input)
        outgate = self.nonlinearity_outgate(outgate)

        # Compute new cell value
        cell = forgetgate*cell_previous + ingate*cell_input

        # Compute new hidden unit activation
        hid = outgate*self.nonlinearity(cell)
        return [cell, hid]

    sequences = nn_input_tbf
    step_fun = step

    ones = cgt.ones((num_batch, 1))
    cell_init = cgt.dot(ones, self.cell_init)
    hid_init = cgt.dot(ones, self.hid_init)

    # The hidden-to-hidden weight matrix is always used in step
    non_seqs = [W_hid_stacked]
    non_seqs += [W_in_stacked, b_stacked]

    cell_out, hid_out = unroll_lstm(
        fn=step_fun,
        sequences=sequences,
        outputs_info=[cell_init, hid_init],
        go_backwards=self.backwards,
        non_sequences=non_seqs,
        n_steps=self.timesteps)

    # dimshuffle back to (n_batch, n_time_steps, n_features)
    hid_out = cgt.dimshuffle(hid_out, [1, 0, 2])

    # if scan is backward reverse the output
    if self.backwards:
        hid_out = cgt.flip(hid_out, [1])

    return hid_out