def __init__(self, input_size, output_size, name="", weight_init=HeUniform(1.0), bias_init=Constant(0)): """ Initialize an Feedforward cell. """ self.W = parameter(init_array(weight_init, (input_size, output_size)), name=name + ".W") self.b = parameter(init_array(bias_init, (1, output_size)), name=name + ".b")
def __init__(self, input_shapes, axis=1, name=None, M=nn.IIDGaussian(std=0.001),
             N=nn.IIDGaussian(std=0.001), b=nn.Constant(0)):
    assert axis >= 1
    self.axis = axis
    name = "unnamed" if name is None else name
    self.y_shape, self.u_shape = input_shapes
    self.y_dim = int(np.prod(self.y_shape[self.axis - 1:]))
    self.u_dim, = self.u_shape
    self.M = nn.parameter(nn.init_array(M, (self.y_dim, self.y_dim, self.u_dim)), name=name + ".M")
    self.N = nn.parameter(nn.init_array(N, (self.y_dim, self.u_dim)), name=name + ".N")
    if b is None:
        self.b = None
    else:
        self.b = nn.parameter(nn.init_array(b, (self.y_dim,)), name=name + ".b")  # TODO: not regularizable
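# A NumPy sketch (an assumption, not the layer's actual __call__) of the bilinear form the
# parameter shapes above suggest: for a single feature vector y and action u, the predicted
# feature change combines a bilinear term in (y, u), a linear term in u, and a bias, matching
# M: (y_dim, y_dim, u_dim), N: (y_dim, u_dim), b: (y_dim,).
import numpy as np

def bilinear_diff_np(M, N, b, y, u):
    y_diff = np.einsum('ijk,j,k->i', M, y, u) + N.dot(u)
    return y_diff if b is None else y_diff + b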
def add_gate_params(gate, gate_name):
    """ Convenience function for adding layer parameters from a Gate instance. """
    # Give the input-to-hidden and hidden-to-hidden weights distinct names so the
    # parameters do not collide under the same name.
    return (parameter(init_array(gate.W_in, (input_feature_size, num_units)), name=gate_name + ".W_in"),
            parameter(init_array(gate.W_hid, (num_units, num_units)), name=gate_name + ".W_hid"),
            parameter(init_array(gate.b, (1, num_units)), name=gate_name + ".b"),
            gate.nonlinearity)
def __init__(self, input_channels, output_channels, kernelshape, pad, stride=(1, 1), name=None,
             weight_init=nn.Constant(0), bias_init=nn.Constant(0)):
    # type conversion
    self.input_channels = int(input_channels)
    self.output_channels = int(output_channels)
    self.kernelshape = tuple(map(int, kernelshape))
    self.pad = tuple(map(int, pad))
    self.stride = tuple(map(int, stride))
    name = "unnamed" if name is None else name
    self.weight = theano.shared(nn.init_array(weight_init,
                                              (self.output_channels, self.input_channels) + self.kernelshape),
                                name=name + ".W")
    self.bias = theano.shared(nn.init_array(bias_init, (1, self.output_channels, 1, 1)),
                              name=name + ".b")
    self.bias.type.broadcastable = (True, False, True, True)
def __init__(self, input_feature_size, input_time_size, num_units, weight_init=HeUniform(),
             activation=cgt.sigmoid,
             cell_out_init=IIDUniform(-0.1, 0.1),
             hid_out_init=IIDUniform(-0.1, 0.1),
             # cell_out_init=Constant(0.0),
             # hid_out_init=Constant(0.0),
             backwards=False):
    ingate = Gate(W_in=weight_init, W_hid=weight_init, W_cell=weight_init, nonlinearity=activation)
    forgetgate = Gate(W_in=weight_init, W_hid=weight_init, W_cell=weight_init, nonlinearity=activation)
    cell = Gate(W_cell=None, nonlinearity=cgt.tanh)
    outgate = Gate(W_in=weight_init, W_hid=weight_init, W_cell=weight_init, nonlinearity=activation)

    self.nonlinearity = activation
    self.num_units = num_units
    self.backwards = backwards
    self.timesteps = input_time_size

    def add_gate_params(gate, gate_name):
        """ Convenience function for adding layer parameters from a Gate instance. """
        return (parameter(init_array(gate.W_in, (input_feature_size, num_units)), name=None),
                parameter(init_array(gate.W_hid, (num_units, num_units)), name=None),
                parameter(init_array(gate.b, (1, num_units)), name=None),
                gate.nonlinearity)

    # Add in parameters from the supplied Gate instances
    (self.W_in_to_ingate, self.W_hid_to_ingate, self.b_ingate,
     self.nonlinearity_ingate) = add_gate_params(ingate, 'ingate')
    (self.W_in_to_forgetgate, self.W_hid_to_forgetgate, self.b_forgetgate,
     self.nonlinearity_forgetgate) = add_gate_params(forgetgate, 'forgetgate')
    (self.W_in_to_cell, self.W_hid_to_cell, self.b_cell,
     self.nonlinearity_cell) = add_gate_params(cell, 'cell')
    (self.W_in_to_outgate, self.W_hid_to_outgate, self.b_outgate,
     self.nonlinearity_outgate) = add_gate_params(outgate, 'outgate')

    self.hid_init = parameter(init_array(hid_out_init, (1, num_units)), name=None)
    self.cell_init = parameter(init_array(cell_out_init, (1, num_units)), name=None)

    # Stack input weight matrices into a (num_inputs, 4*num_units) matrix,
    # which speeds up computation  # checks out
    self.W_in_stacked = cgt.concatenate(
        [self.W_in_to_ingate, self.W_in_to_forgetgate,
         self.W_in_to_cell, self.W_in_to_outgate], axis=1)

    # Same for hidden weight matrices
    self.W_hid_stacked = cgt.concatenate(
        [self.W_hid_to_ingate, self.W_hid_to_forgetgate,
         self.W_hid_to_cell, self.W_hid_to_outgate], axis=1)

    # Stack biases into a (4*num_units) vector
    self.b_stacked = cgt.concatenate(
        [self.b_ingate, self.b_forgetgate, self.b_cell, self.b_outgate], axis=1)

    self.cell_prev = None
    self.hid_prev = None
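# A one-step sketch (an assumption, not the layer's actual step method) showing how the
# stacked matrices above are typically used: compute all four gate pre-activations in a
# single matrix product, then slice them apart. Names, slicing layout, and broadcasting
# pattern are illustrative.
def lstm_step_sketch(layer, x_bf, hid_prev, cell_prev, num_units):
    gates = cgt.dot(x_bf, layer.W_in_stacked) + cgt.dot(hid_prev, layer.W_hid_stacked)
    gates = cgt.broadcast("+", gates, layer.b_stacked, "xx,1x")
    ingate = layer.nonlinearity_ingate(gates[:, 0:num_units])
    forgetgate = layer.nonlinearity_forgetgate(gates[:, num_units:2 * num_units])
    cell_input = layer.nonlinearity_cell(gates[:, 2 * num_units:3 * num_units])
    outgate = layer.nonlinearity_outgate(gates[:, 3 * num_units:4 * num_units])
    cell = forgetgate * cell_prev + ingate * cell_input
    hid = outgate * cgt.tanh(cell)
    return hid, cell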
def __init__(self, input_size, output_size, name=None, weight_init=nn.Zeros(), bias_init=nn.Zeros()):
    input_size = int(input_size)
    output_size = int(output_size)
    name = "unnamed" if name is None else name
    self.weight = theano.shared(nn.init_array(weight_init, (input_size, output_size)), name=name + ".W")
    self.bias = theano.shared(nn.init_array(bias_init, (1, output_size)), name=name + ".b")
    self.bias.type.broadcastable = (True, False)
def make_prediction(self, max_label_length, ground_labels_basis_btc):
    context_i_bf = parameter(init_array(IIDGaussian(0.1), (self.batch_size, self.feature_size)), name=None)
    state_i_bf = parameter(init_array(IIDGaussian(0.1), (self.batch_size, self.decoder_size)), name=None)
    char_list = []
    for iter_step in range(0, max_label_length):
        prev_out_bc = ground_labels_basis_btc[:, iter_step, :]  # Is this right?
        state_i_bf = self.get_decoder_state(context_i_bf, prev_out_bc, state_i_bf)
        context_i_bf = self.get_context(state_i_bf)
        this_character_dist = self.get_character_distribution(state_i_bf, context_i_bf)
        char_list.append(cgt.argmax(this_character_dist, axis=1))
    final = cgt.dimshuffle(cgt.stack(char_list), [1, 0])
    return final
def __init__(self, input_size, rnn_size, name="", weight_init=HeUniform(1.0)):
    """
    LSTM cell.
    """
    # TODO: add bias
    # forget gate weights
    self.W_xf = parameter(init_array(weight_init, (input_size, rnn_size)), name=name + ".W_xf")
    self.W_hf = parameter(init_array(weight_init, (rnn_size, rnn_size)), name=name + ".W_hf")
    # input gate weights
    self.W_xi = parameter(init_array(weight_init, (input_size, rnn_size)), name=name + ".W_xi")
    self.W_hi = parameter(init_array(weight_init, (rnn_size, rnn_size)), name=name + ".W_hi")
    # output gate weights
    self.W_xo = parameter(init_array(weight_init, (input_size, rnn_size)), name=name + ".W_xo")
    self.W_ho = parameter(init_array(weight_init, (rnn_size, rnn_size)), name=name + ".W_ho")
    # candidate value weights
    self.W_xc = parameter(init_array(weight_init, (input_size, rnn_size)), name=name + ".W_xc")
    self.W_hc = parameter(init_array(weight_init, (rnn_size, rnn_size)), name=name + ".W_hc")
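# A hedged sketch (not the cell's actual step method) of how the weights above would be
# combined in one LSTM step; biases are omitted to match the TODO. Function and argument
# names are illustrative.
def lstm_cell_step(cell, x_bi, h_prev, c_prev):
    f = cgt.sigmoid(cgt.dot(x_bi, cell.W_xf) + cgt.dot(h_prev, cell.W_hf))
    i = cgt.sigmoid(cgt.dot(x_bi, cell.W_xi) + cgt.dot(h_prev, cell.W_hi))
    o = cgt.sigmoid(cgt.dot(x_bi, cell.W_xo) + cgt.dot(h_prev, cell.W_ho))
    c_tilde = cgt.tanh(cgt.dot(x_bi, cell.W_xc) + cgt.dot(h_prev, cell.W_hc))
    c = f * c_prev + i * c_tilde
    h = o * cgt.tanh(c)
    return h, c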
def _init_optim_state(ws, reset=False):
    if 'optim_state' in ws and not reset:
        return
    config = ws['config']
    if 'optim_state' in ws:
        print "Reusing cached optim_state"
        theta = ws['optim_state']['theta']
    elif 'snapshot' in config:
        print "Loading optim_state from previous snapshot: %s" % config['snapshot']
        ws['optim_state'] = pickle.load(open(config['snapshot'], 'r'))
        theta = ws['optim_state']['theta']
    else:
        init_method = config['init_theta']['distr']
        if init_method == 'XavierNormal':
            init_theta = nn.XavierNormal(**config['init_theta']['params'])
        elif init_method == 'gaussian':
            init_theta = nn.IIDGaussian(**config['init_theta']['params'])
        else:
            raise ValueError('unknown init distribution')
        theta = nn.init_array(init_theta, (ws['param_col'].get_total_size(), 1)).flatten()
    method = config['opt_method'].lower()
    if method == 'rmsprop':
        optim_create = lambda t: rmsprop_create(t, step_size=config['step_size'])
    elif method == 'adam':
        optim_create = lambda t: adam_create(t, step_size=config['step_size'])
    else:
        raise ValueError('unknown optimization method: %s' % method)
    if reset or 'optim_state' not in ws:
        ws['optim_state'] = optim_create(theta)
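# An illustrative (assumed) shape of the `ws` workspace and its config, inferred from the
# keys accessed above; the concrete values are placeholders.
# ws = {
#     'param_col': param_col,                         # must provide get_total_size()
#     'config': {
#         'init_theta': {'distr': 'gaussian', 'params': {'std': 0.01}},
#         'opt_method': 'rmsprop',                    # or 'adam'
#         'step_size': 1e-3,
#         # 'snapshot': 'optim_state.pkl',            # optional: resume from a pickle
#     },
# }
# _init_optim_state(ws)                # populates ws['optim_state']
# _init_optim_state(ws, reset=True)    # rebuilds the optimizer state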
def test_get_decoder_state():
    batch_size = 32
    feat_t_steps = 20
    feat_num_features = 42
    num_out_classes = 28
    num_out_classes_true = num_out_classes + 2  # start and end tokens are added
    decoder_size = 50
    tau = np.reshape(np.random.normal(0.1, 0.2, batch_size * feat_t_steps * feat_num_features),
                     (batch_size, feat_t_steps, feat_num_features))
    tau2 = np.reshape(np.random.normal(0.1, 0.2, batch_size * feat_num_features),
                      (batch_size, feat_num_features))
    tau3 = np.reshape(np.random.normal(0.1, 0.2, batch_size * num_out_classes_true),
                      (batch_size, num_out_classes_true))
    feats = cgt.tensor3(fixed_shape=(batch_size, feat_t_steps, feat_num_features))
    s = nnbuilder.Seq2Seq(nn_input_btf=feats, num_out_classes=num_out_classes,
                          decoder_size=decoder_size, feature_size=feat_num_features)
    context_bf = cgt.matrix(fixed_shape=(batch_size, feat_num_features))
    prev_out_bc = cgt.matrix(fixed_shape=(batch_size, num_out_classes_true))
    state_i_bf = nn.parameter(nn.init_array(nn.IIDGaussian(0.1), (batch_size, decoder_size)),
                              name="decoder_init")
    decoder_out = s.get_decoder_state(context_bf, prev_out_bc, state_i_bf)
    decode_fun = cgt.function([feats, context_bf, prev_out_bc], [decoder_out])
    m = decode_fun(tau, tau2, tau3)[0]
    assert m.shape == (batch_size, decoder_size)
    assert np.mean(m) < 1.0
def __init__(self, input_size, hidden_size, name="", weight_init=HeUniform(1.0)):
    """
    Initialize an RNN cell.
    """
    # input to hidden
    self.W_xh = parameter(init_array(weight_init, (input_size, hidden_size)), name=name + ".W_xh")
    # hidden to hidden
    self.W_hh = parameter(init_array(weight_init, (hidden_size, hidden_size)), name=name + ".W_hh")
    # hidden to output
    self.W_ho = parameter(init_array(weight_init, (hidden_size, hidden_size)), name=name + ".W_ho")
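# A hedged sketch (not the cell's actual step method) of a vanilla RNN step using the three
# weight matrices above; names are illustrative and biases are omitted since the cell
# defines none.
def rnn_cell_step(cell, x_bi, h_prev):
    h = cgt.tanh(cgt.dot(x_bi, cell.W_xh) + cgt.dot(h_prev, cell.W_hh))
    out = cgt.dot(h, cell.W_ho)
    return h, out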
def __init__(self, input_size, output_size, name=None, weight_init=nn.Constant(0), bias_init=nn.Constant(0)):
    input_size = int(input_size)
    output_size = int(output_size)
    name = "unnamed" if name is None else name
    self.weight = theano.shared(nn.init_array(weight_init, (input_size, output_size)), name=name + ".W")
    self.bias = theano.shared(nn.init_array(bias_init, (1, output_size)), name=name + ".b")
    self.bias.type.broadcastable = (True, False)
def __init__(self, input_feature_size, input_time_size, num_units, weight_init=XavierNormal(),
             activation=cgt.sigmoid, hid_out_init=IIDUniform(0, 1), backwards=False):
    self.num_units = num_units
    self.timesteps = input_time_size
    self.num_batches = None
    self.backwards = backwards
    self.input_feature_size = input_feature_size

    resetgate = Gate(W_in=weight_init, W_hid=weight_init, W_cell=None, nonlinearity=activation)
    updategate = Gate(W_in=weight_init, W_hid=weight_init, W_cell=None, nonlinearity=activation)
    hidden_update = Gate(W_in=weight_init, W_hid=weight_init, W_cell=None, nonlinearity=cgt.tanh)

    def add_gate_params(gate, gate_name):
        """ Convenience function for adding layer parameters from a Gate instance. """
        return (parameter(init_array(gate.W_in, (input_feature_size, num_units)), name=gate_name + ".W_in"),
                parameter(init_array(gate.W_hid, (num_units, num_units)), name=gate_name + ".W_hid"),
                parameter(init_array(gate.b, (1, num_units)), name=gate_name + ".b"),
                gate.nonlinearity)

    # Add in all parameters from gates
    (self.W_in_to_updategate, self.W_hid_to_updategate, self.b_updategate,
     self.nonlinearity_updategate) = add_gate_params(updategate, 'updategate')
    (self.W_in_to_resetgate, self.W_hid_to_resetgate, self.b_resetgate,
     self.nonlinearity_resetgate) = add_gate_params(resetgate, 'resetgate')
    (self.W_in_to_hidden_update, self.W_hid_to_hidden_update, self.b_hidden_update,
     self.nonlinearity_hid) = add_gate_params(hidden_update, 'hidden_update')

    self.hid_init = parameter(init_array(hid_out_init, (1, num_units)), name='.hid_out_init')
    self.hid_out = None
def __init__(self, num_units, input_feature_size, input_time_size, activation=rectify,
             backwards=False, weight_init=XavierNormal(), hid_out_init=IIDUniform(0, 1)):
    self.in_to_hid = Affine(input_size=input_feature_size, output_size=num_units, weight_init=weight_init)
    self.hid_to_hid = Affine(input_size=num_units, output_size=num_units, weight_init=weight_init)
    self.activation = activation
    self.hid_init = parameter(init_array(hid_out_init, (1, num_units)), name='.hid_out_init')
    self.timesteps = input_time_size
    self.backwards = backwards
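# A hedged sketch of what a single step of this recurrent layer would look like. The class
# itself exposes take_one_step (used elsewhere in this code); this stand-alone helper is an
# illustrative assumption, relying on the Affine sublayers being callable on a batch matrix.
def recurrent_step_sketch(layer, x_bf, hid_prev):
    return layer.activation(layer.in_to_hid(x_bf) + layer.hid_to_hid(hid_prev))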
def get_train_objective(self, max_label_length, ground_labels_basis_btc):
    context_i_bf = parameter(init_array(IIDUniform(-0.1, 0.1), (self.batch_size, self.feature_size)), name=None)
    state_i_bf = parameter(init_array(IIDUniform(-0.1, 0.1), (self.batch_size, self.decoder_size)), name=None)
    prev_out_bc = cgt.zeros((self.batch_size, self.true_number_classes), dtype='i8')  # + self.start_token_index
    log_probs = None
    for iter_step in range(0, max_label_length):
        state_i_bf = self.get_decoder_state(context_i_bf, prev_out_bc, state_i_bf)
        context_i_bf = self.get_context(state_i_bf)
        this_character_dist_bc = self.get_character_distribution(state_i_bf, context_i_bf)
        prev_out_bc = ground_labels_basis_btc[:, iter_step, :]
        log_probs_pre = prev_out_bc * this_character_dist_bc
        log_probs_pre = cgt.log(cgt.sum(log_probs_pre, axis=1))
        if log_probs is None:
            log_probs = cgt.sum(log_probs_pre)
        else:
            log_probs += cgt.sum(log_probs_pre)
    log_probs = -log_probs
    return log_probs
def train(args, X, Y, dbg_iter=None, dbg_epoch=None, dbg_done=None):
    dbg_out = []
    net_in, net_out = hybrid_network(args.num_inputs, args.num_outputs,
                                     args.num_units, args.num_sto, dbg_out=dbg_out)
    params, f_step, f_loss, f_grad, f_surr = \
        make_funcs(net_in, net_out, args, dbg_out=dbg_out)
    param_col = ParamCollection(params)
    init_params = nn.init_array(args.init_conf, (param_col.get_total_size(), 1))
    param_col.set_value_flat(init_params.flatten())
    init_params = [
        np.array([[0., 1.]]),    # W_1
        np.array([[0., 0.]]),    # b_1
        np.array([[1.], [1.]]),  # W_3
        np.array([[0.]]),        # b_3
    ]
    param_col.set_values(init_params)
    if 'snapshot' in args:
        print "Loading params from previous snapshot"
        snapshot = pickle.load(open(args['snapshot'], 'r'))
        param_col.set_values(snapshot)
    # param_col.set_value_flat(
    #     np.random.normal(0., 1., size=param_col.get_total_size())
    # )
    # optim_state = Table(theta=param_col.get_value_flat(),
    #                     scratch=param_col.get_value_flat(),
    #                     step_size=args.step_size)
    optim_state = make_rmsprop_state(theta=param_col.get_value_flat(),
                                     step_size=args.step_size,
                                     decay_rate=args.decay_rate)
    for i_epoch in range(args.n_epochs):
        for i_iter in range(X.shape[0]):
            ind = np.random.choice(X.shape[0], args['size_batch'])
            x, y = X[ind], Y[ind]  # not sure this works for multi-dim
            info = f_surr(x, y, num_samples=args['size_sample'])
            loss, loss_surr, grad = info['loss'], info['surr_loss'], info['surr_grad']
            # loss, loss_surr, grad = f_grad(x, y)
            # update
            rmsprop_update(param_col.flatten_values(grad), optim_state)
            # optim_state.scratch = param_col.flatten_values(grad)
            # optim_state.theta -= optim_state.step_size * optim_state.scratch
            param_col.set_value_flat(optim_state.theta)
            print param_col.get_value_flat()
            if dbg_iter:
                dbg_iter(i_epoch, i_iter, param_col, optim_state, info)
        if dbg_epoch:
            dbg_epoch(i_epoch, param_col, f_surr)
    if dbg_done:
        dbg_done(param_col, optim_state, f_surr)
    return optim_state
def __init__(self, input_feature_size, input_time_size, num_units, weight_init=XavierNormal(),
             activation=rectify, cell_out_init=IIDUniform(0, 1), hid_out_init=IIDUniform(0, 1),
             backwards=False):
    ingate = Gate(W_in=weight_init, W_hid=weight_init, W_cell=weight_init, nonlinearity=activation)
    forgetgate = Gate(W_in=weight_init, W_hid=weight_init, W_cell=weight_init, nonlinearity=activation)
    cell = Gate(W_cell=None, nonlinearity=cgt.tanh)
    outgate = Gate(W_in=weight_init, W_hid=weight_init, W_cell=weight_init, nonlinearity=activation)

    self.nonlinearity = activation
    self.num_units = num_units
    self.backwards = backwards
    self.timesteps = input_time_size

    def add_gate_params(gate, gate_name):
        """ Convenience function for adding layer parameters from a Gate instance. """
        return (parameter(init_array(gate.W_in, (input_feature_size, num_units)), name=gate_name + ".W_in"),
                parameter(init_array(gate.W_hid, (num_units, num_units)), name=gate_name + ".W_hid"),
                parameter(init_array(gate.b, (1, num_units)), name=gate_name + ".b"),
                gate.nonlinearity)

    # Add in parameters from the supplied Gate instances
    (self.W_in_to_ingate, self.W_hid_to_ingate, self.b_ingate,
     self.nonlinearity_ingate) = add_gate_params(ingate, 'ingate')
    (self.W_in_to_forgetgate, self.W_hid_to_forgetgate, self.b_forgetgate,
     self.nonlinearity_forgetgate) = add_gate_params(forgetgate, 'forgetgate')
    (self.W_in_to_cell, self.W_hid_to_cell, self.b_cell,
     self.nonlinearity_cell) = add_gate_params(cell, 'cell')
    (self.W_in_to_outgate, self.W_hid_to_outgate, self.b_outgate,
     self.nonlinearity_outgate) = add_gate_params(outgate, 'outgate')

    self.hid_init = parameter(init_array(hid_out_init, (1, num_units)), name='.hid_out_init')
    self.cell_init = parameter(init_array(cell_out_init, (1, num_units)), name='.cell_out_init')
def __init__(self, nn_input_btf, num_out_classes, get_features_fun=None, feature_size=40,
             decoder_size=40, w_init=IIDUniform(-0.1, 0.1)):
    self.start_token_index = num_out_classes
    self.end_token_index = self.start_token_index + 1
    self.true_number_classes = num_out_classes + 2  # add dims for start and end token.
    self.batch_size = cgt.infer_shape(nn_input_btf)[0]
    self.w_init = w_init
    self.feature_size = feature_size
    self.decoder_size = decoder_size

    if get_features_fun is not None:
        self.get_features_fun = get_features_fun
    else:
        self.get_features_fun = self.get_features_bengio
    features_btf = self.get_features_fun(nn_input_btf, num_units=self.feature_size)

    # Compute psi<h_u> over all u (timesteps), the features from the ground data.
    # This is for computing the context c_i. The features are put through a dense layer.
    self.features_post_mlp_btf = temporalDenseLayer(features_btf, self.feature_size,
                                                    w_init=self.w_init, activation=linear,
                                                    bias_init=Constant(0.0))
    self.mixing_vec_w = parameter(init_array(w_init, (1, 1, self.feature_size)), name=None)

    # These are for the decoder mechanism, which computes s_i.
    rnn_activation = cgt.sigmoid
    recurrence = Recurrent
    self.recurrent_decoder_one = recurrence(num_units=self.decoder_size, input_time_size=None,
                                            input_feature_size=self.feature_size + self.true_number_classes,
                                            weight_init=self.w_init,
                                            activation=rnn_activation).take_one_step
    self.recurrent_decoder_two = linear
    # self.recurrent_decoder_two = recurrence(num_units=self.decoder_size, input_time_size=None,
    #                                         input_feature_size=self.decoder_size,
    #                                         weight_init=self.w_init,
    #                                         activation=rnn_activation).take_one_step

    # Multiply s_i by V to make it have the same dimension as h_u.
    self.states_mlp_bf = Affine(self.decoder_size, self.feature_size,
                                weight_init=self.w_init, bias_init=Constant(0.0))

    # This is the final dense layer, which computes the class probs at the end of all things.
    self.final_out_dense = Affine(self.decoder_size + self.feature_size, self.true_number_classes,
                                  weight_init=w_init, bias_init=Constant(0.0))
def __init__(self, input_size, hidden_size, name="", weight_init=HeUniform(1.0)):
    """
    Chung, Junyoung, et al. "Empirical Evaluation of Gated Recurrent Neural Networks
    on Sequence Modeling." arXiv preprint arXiv:1412.3555 (2014).

    In the above paper:
        z is used as notation for the update gate
        r as notation for the reset gate
    """
    # TODO: bias
    # The paper makes no mention of bias in equations or text,
    # so I'm not sure we need it.

    # reset gate
    self.W_xr = parameter(init_array(weight_init, (input_size, hidden_size)), name=name + ".W_input_to_reset")
    self.W_hr = parameter(init_array(weight_init, (hidden_size, hidden_size)), name=name + ".W_hidden_to_reset")
    # update gate
    self.W_xz = parameter(init_array(weight_init, (input_size, hidden_size)), name=name + ".W_input_to_update")
    self.W_hz = parameter(init_array(weight_init, (hidden_size, hidden_size)), name=name + ".W_hidden_to_update")
    # ~hidden is the candidate activation, so we'll denote it as c
    self.W_xc = parameter(init_array(weight_init, (input_size, hidden_size)), name=name + ".W_input_to_candidate")
    self.W_hc = parameter(init_array(weight_init, (hidden_size, hidden_size)), name=name + ".W_hidden_to_candidate")
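# A hedged sketch (not the cell's actual step method) of one GRU step using the weights
# above, following the z/r notation of Chung et al. (2014); biases are omitted as in the
# cell itself, and names are illustrative.
def gru_cell_step(cell, x_bi, h_prev):
    r = cgt.sigmoid(cgt.dot(x_bi, cell.W_xr) + cgt.dot(h_prev, cell.W_hr))
    z = cgt.sigmoid(cgt.dot(x_bi, cell.W_xz) + cgt.dot(h_prev, cell.W_hz))
    c = cgt.tanh(cgt.dot(x_bi, cell.W_xc) + cgt.dot(r * h_prev, cell.W_hc))
    return (1 - z) * h_prev + z * c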
def build_fcn_action_cond_encoder_net(input_shapes, levels=None):
    x_shape, u_shape = input_shapes
    x_c_dim = x_shape[0]
    x1_c_dim = 16
    levels = levels or [3]
    levels = sorted(set(levels))

    X = cgt.tensor4('X', fixed_shape=(None,) + x_shape)
    U = cgt.matrix('U', fixed_shape=(None,) + u_shape)

    # encoding
    Xlevels = {}
    for level in range(levels[-1] + 1):
        if level == 0:
            Xlevel = X
        else:
            if level == 1:
                xlevelm1_c_dim = x_c_dim
                xlevel_c_dim = x1_c_dim
            else:
                xlevelm1_c_dim = xlevel_c_dim
                xlevel_c_dim = 2 * xlevel_c_dim
            Xlevel_1 = nn.rectify(
                nn.SpatialConvolution(xlevelm1_c_dim, xlevel_c_dim, kernelshape=(3, 3), pad=(1, 1),
                                      stride=(1, 1), name='conv%d_1' % level,
                                      weight_init=nn.IIDGaussian(std=0.01))(Xlevels[level - 1]))
            Xlevel_2 = nn.rectify(
                nn.SpatialConvolution(xlevel_c_dim, xlevel_c_dim, kernelshape=(3, 3), pad=(1, 1),
                                      stride=(1, 1), name='conv%d_2' % level,
                                      weight_init=nn.IIDGaussian(std=0.01))(Xlevel_1))
            Xlevel = nn.max_pool_2d(Xlevel_2, kernelshape=(2, 2), pad=(0, 0), stride=(2, 2))
        Xlevels[level] = Xlevel

    # bilinear
    Xlevels_next_pred_0 = {}
    Ylevels = OrderedDict()
    Ylevels_diff_pred = OrderedDict()
    for level in levels:
        Xlevel = Xlevels[level]
        Xlevel_diff_pred = Bilinear(input_shapes, b=None, axis=2, name='bilinear%d' % level)(Xlevel, U)
        Xlevels_next_pred_0[level] = Xlevel + Xlevel_diff_pred
        Ylevels[level] = Xlevel.reshape((Xlevel.shape[0], cgt.mul_multi(Xlevel.shape[1:])))
        Ylevels_diff_pred[level] = Xlevel_diff_pred.reshape(
            (Xlevel_diff_pred.shape[0], cgt.mul_multi(Xlevel_diff_pred.shape[1:])))

    # decoding
    Xlevels_next_pred = {}
    for level in range(levels[-1] + 1)[::-1]:
        if level == levels[-1]:
            Xlevel_next_pred = Xlevels_next_pred_0[level]
        else:
            if level == 0:
                xlevelm1_c_dim = x_c_dim
            elif level < levels[-1] - 1:
                xlevel_c_dim = xlevelm1_c_dim
                xlevelm1_c_dim = xlevelm1_c_dim // 2
            Xlevel_next_pred_2 = SpatialDeconvolution(
                xlevel_c_dim, xlevel_c_dim, kernelshape=(2, 2), pad=(0, 0), stride=(2, 2),
                name='upsample%d' % (level + 1),
                weight_init=nn.IIDGaussian(std=0.01))(Xlevels_next_pred[level + 1])  # TODO initialize with bilinear  # TODO should rectify?
            Xlevel_next_pred_1 = nn.rectify(SpatialDeconvolution(
                xlevel_c_dim, xlevel_c_dim, kernelshape=(3, 3), pad=(1, 1), stride=(1, 1),
                name='deconv%d_2' % (level + 1),
                weight_init=nn.IIDGaussian(std=0.01))(Xlevel_next_pred_2))
            nonlinearity = nn.rectify if level > 0 else cgt.tanh
            Xlevel_next_pred = nonlinearity(SpatialDeconvolution(
                xlevel_c_dim, xlevelm1_c_dim, kernelshape=(3, 3), pad=(1, 1), stride=(1, 1),
                name='deconv%d_1' % (level + 1),
                weight_init=nn.IIDGaussian(std=0.01))(Xlevel_next_pred_1))
            if level in Xlevels_next_pred_0:
                coefs = nn.parameter(nn.init_array(nn.Constant(0.5), (2,)), name='sum%d.coef' % level)
                Xlevel_next_pred = coefs[0] * Xlevel_next_pred + coefs[1] * Xlevels_next_pred_0[level]
                # TODO: tanh should be after sum
        Xlevels_next_pred[level] = Xlevel_next_pred

    X_next_pred = Xlevels_next_pred[0]
    Y = cgt.concatenate(Ylevels.values(), axis=1)
    Y_diff_pred = cgt.concatenate(Ylevels_diff_pred.values(), axis=1)

    X_diff = cgt.tensor4('X_diff', fixed_shape=(None,) + x_shape)
    X_next = X + X_diff
    loss = ((X_next - X_next_pred) ** 2).mean(axis=0).sum() / 2.

    net_name = 'FcnActionCondEncoderNet_levels' + ''.join(str(level) for level in levels)
    input_vars = OrderedDict([(var.name, var) for var in [X, U, X_diff]])
    pred_vars = OrderedDict([('Y_diff_pred', Y_diff_pred), ('Y', Y), ('X_next_pred', X_next_pred)])
    return net_name, input_vars, pred_vars, loss
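# A minimal usage sketch (not from the original source): build the network for a hypothetical
# 3x32x32 image and 4-dim action, then compile the loss with cgt.function. The shapes are
# illustrative assumptions; input_vars is ordered (X, U, X_diff) as constructed above.
# net_name, input_vars, pred_vars, loss = build_fcn_action_cond_encoder_net(
#     ((3, 32, 32), (4,)), levels=[3])
# f_loss = cgt.function(list(input_vars.values()), [loss])
# batch_loss = f_loss(X_batch, U_batch, X_diff_batch)[0]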