def create_model(vocab_size, rlayer_type): """ Create LSTM/GRU model for bAbI dataset. Args: vocab_size (int) : String of bAbI data. rlayer_type (string) : Type of recurrent layer to use (gru or lstm). Returns: Model : Model of the created network """ # recurrent layer parameters (default gru) rlayer_obj = GRU if rlayer_type == 'gru' else LSTM rlayer_params = dict(output_size=100, reset_cells=True, init=GlorotUniform(), init_inner=Orthonormal(0.5), activation=Tanh(), gate_activation=Logistic()) # if using lstm, swap the activation functions if rlayer_type == 'lstm': rlayer_params.update(dict(activation=Logistic(), gate_activation=Tanh())) # lookup layer parameters lookup_params = dict(vocab_size=vocab_size, embedding_dim=50, init=Uniform(-0.05, 0.05)) # Model construction story_path = [LookupTable(**lookup_params), rlayer_obj(**rlayer_params)] query_path = [LookupTable(**lookup_params), rlayer_obj(**rlayer_params)] layers = [MergeMultistream(layers=[story_path, query_path], merge="stack"), Affine(vocab_size, init=GlorotUniform(), activation=Softmax())] return Model(layers=layers)
def construct_network():
    """
    Construct the layers of our RCNN architecture.

    It is similar to AlexNet but simplified to only a few convolutional
    layers and 3 LSTM layers.

    Returns:
        Model: the assembled RCNN network.
    """
    layers = [
        Conv((11, 11, 64), init=Gaussian(scale=0.01), bias=Constant(0),
             activation=Rectlin(), padding=3, strides=4),
        Pooling(3, strides=2),
        Conv((7, 7, 128), init=Gaussian(scale=0.01), bias=Constant(1),
             activation=Rectlin(), padding=2),
        Pooling(3, strides=2),
        Conv((5, 5, 256), init=Gaussian(scale=0.03), bias=Constant(0),
             activation=Rectlin(), padding=1),
        Conv((3, 3, 256), init=Gaussian(scale=0.03), bias=Constant(1),
             activation=Rectlin(), padding=1),
        Pooling(3, strides=2),
        Affine(nout=4096, init=Gaussian(scale=0.01), bias=Constant(1),
               activation=Rectlin()),
        DropoutBinary(keep=0.5),
        # NOTE(review): activation=Rectlin()/gate_activation=Tanh() is unusual
        # for LSTMs (convention is activation=Tanh, gate_activation=Logistic);
        # preserved as-is -- confirm it was intended
        LSTM(512, init=Gaussian(scale=0.03), activation=Rectlin(), gate_activation=Tanh()),
        LSTM(512, init=Gaussian(scale=0.03), activation=Rectlin(), gate_activation=Tanh()),
        LSTM(512, init=Gaussian(scale=0.03), activation=Rectlin(), gate_activation=Tanh()),
        Affine(nout=4096, init=Gaussian(scale=0.01), bias=Constant(1),
               activation=Rectlin()),
        DropoutBinary(keep=0.5),
        # 101 output classes (UCF-101 style); large negative bias for softmax
        Affine(nout=101, init=Gaussian(scale=0.01), bias=Constant(-7),
               activation=Softmax())
    ]
    return Model(layers=layers)


# Backward-compatible alias preserving the original (misspelled) public name.
constuct_network = construct_network
def test_multi_optimizer(backend_default):
    """
    Check that MultiOptimizer maps each optimizer to the layer class names
    given in its mapping dict, with unnamed layer classes falling back to
    the 'default' optimizer.
    """
    opt_gdm = GradientDescentMomentum(learning_rate=0.001, momentum_coef=0.9, wdecay=0.005)
    opt_ada = Adadelta()
    opt_adam = Adam()
    opt_rms = RMSProp()
    opt_rms_1 = RMSProp(gradient_clip_value=5)

    # one shared initializer for every layer
    init_one = Gaussian(scale=0.01)

    l1 = Conv((11, 11, 64), strides=4, padding=3,
              init=init_one, bias=Constant(0), activation=Rectlin())
    l2 = Affine(nout=4096, init=init_one, bias=Constant(1), activation=Rectlin())
    l3 = LSTM(output_size=1000, init=init_one, activation=Logistic(), gate_activation=Tanh())
    l4 = GRU(output_size=100, init=init_one, activation=Logistic(), gate_activation=Tanh())
    layers = [l1, l2, l3, l4]

    # flatten: compound layers (e.g. Conv, Affine) may expand to sublayer lists
    layer_list = []
    for layer in layers:
        if isinstance(layer, list):
            layer_list.extend(layer)
        else:
            layer_list.append(layer)

    opt = MultiOptimizer({
        'default': opt_gdm,
        'Bias': opt_ada,
        'Convolution': opt_adam,
        'Linear': opt_rms,
        'LSTM': opt_rms_1,
        'GRU': opt_rms_1
    })

    map_list = opt._map_optimizers(layer_list)
    assert map_list[opt_adam][0].__class__.__name__ == 'Convolution'
    assert map_list[opt_ada][0].__class__.__name__ == 'Bias'
    assert map_list[opt_rms][0].__class__.__name__ == 'Linear'
    # Activation layers are not named in the mapping, so they use the default
    assert map_list[opt_gdm][0].__class__.__name__ == 'Activation'
    assert map_list[opt_rms_1][0].__class__.__name__ == 'LSTM'
    assert map_list[opt_rms_1][1].__class__.__name__ == 'GRU'
def gradient_calc(seq_len, input_size, hidden_size, batch_size,
                  epsilon=None, rand_scale=None, inp_bl=None):
    """
    Numerically estimate the input gradients of a GRU layer via central
    finite differences and return them alongside neon's analytic bprop deltas.

    Args:
        seq_len (int): number of time steps.
        input_size (int): input feature dimension.
        hidden_size (int): number of GRU units.
        batch_size (int): minibatch size.
        epsilon (float): perturbation size; no default is substituted, so a
            numeric value must be supplied -- TODO confirm callers always pass it.
        rand_scale (ndarray): random scaling applied to the output to form a
            fake scalar loss; generated randomly if not given.
        inp_bl (ndarray): baseline input; generated randomly if not given.

    Returns:
        tuple: (grads_est, deltas_neon) numeric and analytic input gradients.
    """
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    input_shape = (input_size, seq_len * batch_size)

    # generate input if one is not given
    if inp_bl is None:
        inp_bl = np.random.randn(*input_shape)

    # neon gru instance
    gru = GRU(hidden_size, init=Gaussian(), activation=Tanh(), gate_activation=Logistic())
    inpa = gru.be.array(np.copy(inp_bl))

    # run fprop on the baseline input
    gru.configure((input_size, seq_len))
    gru.prev_layer = True
    gru.allocate()
    gru.set_deltas([gru.be.iobuf(gru.in_shape)])
    out_bl = gru.fprop(inpa).get()

    # random scaling/hash to generate fake loss
    if rand_scale is None:
        rand_scale = np.random.random(out_bl.shape) * 2.0 - 1.0
    # loss function would be:
    # loss_bl = np.sum(rand_scale * out_bl)

    # run back prop with rand_scale as the errors
    # use copy to avoid any interactions
    deltas_neon = gru.bprop(gru.be.array(np.copy(rand_scale))).get()

    # add a perturbation to each input element
    grads_est = np.zeros(inpa.shape)
    inp_pert = inp_bl.copy()
    for pert_ind in range(inpa.size):
        save_val = inp_pert.flat[pert_ind]

        # positive perturbation; reset the layer state before each fprop
        inp_pert.flat[pert_ind] = save_val + epsilon
        reset_gru(gru)
        gru.allocate()
        out_pos = gru.fprop(gru.be.array(inp_pert)).get()

        # negative perturbation
        inp_pert.flat[pert_ind] = save_val - epsilon
        reset_gru(gru)
        gru.allocate()
        out_neg = gru.fprop(gru.be.array(inp_pert)).get()

        # calculate the loss with perturbations
        loss_pos = np.sum(rand_scale * out_pos)
        loss_neg = np.sum(rand_scale * out_neg)

        # compute the central-difference gradient estimate
        grad = 0.5 / float(epsilon) * (loss_pos - loss_neg)
        grads_est.flat[pert_ind] = grad

        # reset the perturbed input element
        inp_pert.flat[pert_ind] = save_val

    del gru
    return (grads_est, deltas_neon)
def __init__(self):
    """
    Build and initialize a small two-stream (image + sentence) model whose
    streams are merged into a recurrent stream, followed by an LSTM and a
    softmax classifier with a masked cross-entropy cost.
    """
    # input shapes for the two streams -- presumably (image features,
    # (sentence steps, features)); TODO confirm against the data iterator
    self.in_shape = [1024, (2538, 38)]
    init = Constant(0)
    image_path = Sequential([Affine(20, init, bias=init), Affine(10, init, bias=init)])
    sent_path = Sequential([Affine(30, init, bias=init), Affine(10, init)])

    layers = [
        MergeMultistream(layers=[image_path, sent_path], merge="recurrent"),
        Dropout(keep=0.5),
        LSTM(4, init, activation=Logistic(), gate_activation=Tanh(), reset_cells=True),
        Affine(20, init, bias=init, activation=Softmax())
    ]
    self.layers = layers
    self.cost = GeneralizedCostMask(CrossEntropyMulti())
    self.model = Model(layers=layers)
    self.model.initialize(self.in_shape, cost=self.cost)
def test_biLSTM_bprop(backend_default, fargs):
    """
    Sanity-check BiLSTM bprop: with the backward-direction weights tied to
    the forward-direction weights, a sequence and its time-reversed copy
    must produce mirrored outputs and mirrored input deltas.
    """
    # basic sanity check with 0 weights random inputs
    seq_len, input_size, hidden_size, batch_size = fargs
    in_shape = (input_size, seq_len)
    out_shape = (hidden_size, seq_len)
    NervanaObject.be.bsz = batch_size

    # setup the bi-directional rnn
    init_glorot = GlorotUniform()
    bilstm = BiLSTM(hidden_size, gate_activation=Logistic(), activation=Tanh(),
                    init=init_glorot, reset_cells=True)
    bilstm.configure(in_shape)
    bilstm.prev_layer = True
    bilstm.allocate()
    bilstm.set_deltas([bilstm.be.iobuf(bilstm.in_shape)])

    # same weight for bi-rnn backward and rnn weights (tie directions)
    nout = hidden_size
    bilstm.W_input_b[:] = bilstm.W_input_f
    bilstm.W_recur_b[:] = bilstm.W_recur_f
    bilstm.b_b[:] = bilstm.b_f
    bilstm.dW[:] = 0

    # inputs and views: lr is the sequence, rl the same sequence reversed in time
    lr = np.random.random((input_size, seq_len * batch_size))
    lr_rev = list(reversed(get_steps(lr.copy(), in_shape)))
    rl = con(lr_rev, axis=1)

    # allocate gpu buffers
    inp_lr = bilstm.be.array(lr)
    inp_rl = bilstm.be.array(rl)

    # outputs; reset hidden buffer between runs so runs are independent
    out_lr_g = bilstm.fprop(inp_lr)
    out_lr = out_lr_g.get().copy()
    del_lr = bilstm.bprop(out_lr_g).get().copy()
    bilstm.h_buffer[:] = 0
    out_rl_g = bilstm.fprop(inp_rl)
    out_rl = out_rl_g.get().copy()
    del_rl = bilstm.bprop(out_rl_g).get().copy()

    # views: forward-direction rows are [:nout], backward-direction rows [nout:]
    out_lr_f_s = get_steps(out_lr[:nout], out_shape)
    out_lr_b_s = get_steps(out_lr[nout:], out_shape)
    out_rl_f_s = get_steps(out_rl[:nout], out_shape)
    out_rl_b_s = get_steps(out_rl[nout:], out_shape)

    # asserts: forward over lr must mirror backward over rl, and vice versa
    for x_f, x_b, y_f, y_b in zip(out_lr_f_s, out_lr_b_s,
                                  reversed(out_rl_f_s), reversed(out_rl_b_s)):
        assert np.allclose(x_f, y_b, rtol=0.0, atol=1.0e-5)
        assert np.allclose(x_b, y_f, rtol=0.0, atol=1.0e-5)

    del_lr_s = get_steps(del_lr, in_shape)
    del_rl_s = get_steps(del_rl, in_shape)

    for (x, y) in zip(del_lr_s, reversed(del_rl_s)):
        assert np.allclose(x, y, rtol=0.0, atol=1.0e-5)
def gradient_calc(seq_len, input_size, hidden_size, batch_size,
                  epsilon=None, rand_scale=None, inp_bl=None):
    """
    Numerically estimate the input gradients of a Recurrent layer via
    central finite differences and return them alongside neon's analytic
    bprop deltas.

    Args:
        seq_len (int): number of time steps.
        input_size (int): input feature dimension.
        hidden_size (int): number of hidden units.
        batch_size (int): minibatch size.
        epsilon (float): perturbation size; no default is substituted, so a
            numeric value must be supplied -- TODO confirm callers always pass it.
        rand_scale (ndarray): random scaling applied to the output to form a
            fake scalar loss; generated randomly if not given.
        inp_bl (ndarray): baseline input; generated randomly if not given.

    Returns:
        tuple: (grads_est, deltas_neon) numeric and analytic input gradients.
    """
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    input_shape = (input_size, seq_len * batch_size)

    # generate input if one is not given
    if inp_bl is None:
        inp_bl = np.random.randn(*input_shape)

    # neon rnn instance
    rnn = Recurrent(hidden_size, Gaussian(), Tanh())
    inpa = rnn.be.array(np.copy(inp_bl))

    # run fprop on the baseline input
    out_bl = rnn.fprop(inpa).get()

    # random scaling/hash to generate fake loss
    if rand_scale is None:
        rand_scale = np.random.random(out_bl.shape) * 2.0 - 1.0
    # loss function would be:
    # loss_bl = np.sum(rand_scale * out_bl)

    # run back prop with rand_scale as the errors
    # use copy to avoid any interactions
    deltas_neon = rnn.bprop(rnn.be.array(np.copy(rand_scale))).get()

    # add a perturbation to each input element
    grads_est = np.zeros(inpa.shape)
    inp_pert = inp_bl.copy()
    for pert_ind in range(inpa.size):
        save_val = inp_pert.flat[pert_ind]

        # positive perturbation; reset the layer state before each fprop
        inp_pert.flat[pert_ind] = save_val + epsilon
        reset_rnn(rnn)
        out_pos = rnn.fprop(rnn.be.array(inp_pert)).get()

        # negative perturbation
        inp_pert.flat[pert_ind] = save_val - epsilon
        reset_rnn(rnn)
        out_neg = rnn.fprop(rnn.be.array(inp_pert)).get()

        # calculate the loss with perturbations
        loss_pos = np.sum(rand_scale * out_pos)
        loss_neg = np.sum(rand_scale * out_neg)

        # compute the central-difference gradient estimate
        grad = 0.5 * (loss_pos - loss_neg) / epsilon
        grads_est.flat[pert_ind] = grad

        # reset the perturbed input element
        inp_pert.flat[pert_ind] = save_val

    del rnn
    return (grads_est, deltas_neon)
def test_tanh_derivative(backend):
    """Check Tanh() bprop against the analytic derivative 1 - tanh(x)^2."""
    points = [0, 1, -2]
    # the derivative is evaluated on the layer's output, i.e. on tanh(x)
    inputs = np.array([true_tanh(p) for p in points]).reshape((3, 1))
    outputs = np.array([1 - true_tanh(p) ** 2 for p in points]).reshape((3, 1))
    compare_tensors(Tanh(), inputs, outputs, deriv=True, tol=1e-7)
def test_biLSTM_fprop(backend_default, fargs):
    """
    Sanity-check BiLSTM fprop: with the backward-direction weights tied to
    the forward-direction weights, a sequence and its time-reversed copy
    must produce mirrored forward/backward outputs.
    """
    # basic sanity check with 0 weights random inputs
    seq_len, input_size, hidden_size, batch_size = fargs
    in_shape = (input_size, seq_len)
    out_shape = (hidden_size, seq_len)
    NervanaObject.be.bsz = batch_size

    # setup the bi-directional rnn
    init_glorot = GlorotUniform()
    bilstm = BiLSTM(hidden_size, gate_activation=Logistic(),
                    init=init_glorot, activation=Tanh(), reset_cells=True)
    bilstm.configure(in_shape)
    bilstm.prev_layer = True
    bilstm.allocate()

    # same weight (tie backward-direction weights to the forward-direction ones)
    nout = hidden_size
    bilstm.W_input_b[:] = bilstm.W_input_f
    bilstm.W_recur_b[:] = bilstm.W_recur_f
    bilstm.b_b[:] = bilstm.b_f
    bilstm.dW[:] = 0

    # inputs - random and flipped left-to-right inputs
    lr = np.random.random((input_size, seq_len * batch_size))
    lr_rev = list(reversed(get_steps(lr.copy(), in_shape)))
    rl = con(lr_rev, axis=1)
    inp_lr = bilstm.be.array(lr)
    inp_rl = bilstm.be.array(rl)

    # outputs; reset hidden buffer between runs so runs are independent
    out_lr = bilstm.fprop(inp_lr).get().copy()
    bilstm.h_buffer[:] = 0
    out_rl = bilstm.fprop(inp_rl).get().copy()

    # views: forward-direction rows are [:nout], backward-direction rows [nout:]
    out_lr_f_s = get_steps(out_lr[:nout], out_shape)
    out_lr_b_s = get_steps(out_lr[nout:], out_shape)
    out_rl_f_s = get_steps(out_rl[:nout], out_shape)
    out_rl_b_s = get_steps(out_rl[nout:], out_shape)

    # asserts: forward over lr must mirror backward over rl, and vice versa
    for x_f, x_b, y_f, y_b in zip(out_lr_f_s, out_lr_b_s,
                                  reversed(out_rl_f_s), reversed(out_rl_b_s)):
        assert allclose_with_out(x_f, y_b, rtol=0.0, atol=1.0e-5)
        assert allclose_with_out(x_b, y_f, rtol=0.0, atol=1.0e-5)
def test_multi_optimizer(backend_default_mkl):
    """
    A test for MultiOptimizer.

    Builds conv/affine/LSTM/GRU layers, optimizes them in two groups, and
    verifies that each optimizer is mapped to the expected layer class name.
    """
    opt_gdm = GradientDescentMomentum(
        learning_rate=0.001, momentum_coef=0.9, wdecay=0.005)
    opt_ada = Adadelta()
    opt_adam = Adam()
    opt_rms = RMSProp()
    opt_rms_1 = RMSProp(gradient_clip_value=5)

    # one shared initializer for every layer
    init_one = Gaussian(scale=0.01)

    l1 = Conv((11, 11, 64), strides=4, padding=3,
              init=init_one, bias=Constant(0), activation=Rectlin())
    l2 = Affine(nout=4096, init=init_one,
                bias=Constant(1), activation=Rectlin())
    l3 = LSTM(output_size=1000, init=init_one, activation=Logistic(), gate_activation=Tanh())
    l4 = GRU(output_size=100, init=init_one, activation=Logistic(), gate_activation=Tanh())
    layers = [l1, l2, l3, l4]

    # flatten: compound layers may expand to sublayer lists
    layer_list = []
    for layer in layers:
        if isinstance(layer, list):
            layer_list.extend(layer)
        else:
            layer_list.append(layer)

    for lyr in layer_list:
        lyr.configure(in_obj=(16, 28, 28))
        lyr.allocate()

    # separate layer_list into two, the last two recurrent layers and the rest
    layer_list1, layer_list2 = layer_list[:-2], layer_list[-2:]

    opt = MultiOptimizer({'default': opt_gdm,
                          'Bias': opt_ada,
                          'Convolution': opt_adam,
                          'Convolution_bias': opt_adam,
                          'Linear': opt_rms,
                          'LSTM': opt_rms_1,
                          'GRU': opt_rms_1})

    layers_to_optimize1 = [lyr for lyr in layer_list1 if isinstance(lyr, ParameterLayer)]
    layers_to_optimize2 = [lyr for lyr in layer_list2 if isinstance(lyr, ParameterLayer)]

    opt.optimize(layers_to_optimize1, 0)
    # bug fix: the original used `is 'Convolution_bias'`, which compares string
    # *identity* (implementation-dependent interning) rather than equality
    assert opt.map_list[opt_adam][0].__class__.__name__ == 'Convolution_bias'
    assert opt.map_list[opt_rms][0].__class__.__name__ == 'Linear'

    opt.optimize(layers_to_optimize2, 0)
    assert opt.map_list[opt_rms_1][0].__class__.__name__ == 'LSTM'
    assert opt.map_list[opt_rms_1][1].__class__.__name__ == 'GRU'
# this data, splits into individual words. This can be passed into the Text # object during dataset creation as seen below. def tokenizer(s): return s.replace('\n', '<eos>').split() # load data and parse on word-level train_set = Text(time_steps, train_path, tokenizer=tokenizer, onehot_input=False) valid_set = Text(time_steps, valid_path, vocab=train_set.vocab, tokenizer=tokenizer, onehot_input=False) # weight initialization init = Uniform(low=-0.1, high=0.1) # model initialization rlayer_params = {"output_size": hidden_size, "init": init, "activation": Tanh(), "gate_activation": Logistic()} if args.rlayer_type == 'lstm': rlayer1, rlayer2 = LSTM(**rlayer_params), LSTM(**rlayer_params) else: rlayer1, rlayer2 = GRU(**rlayer_params), GRU(**rlayer_params) layers = [ LookupTable(vocab_size=len(train_set.vocab), embedding_dim=hidden_size, init=init), rlayer1, rlayer2, Affine(len(train_set.vocab), init, bias=init, activation=Softmax()) ] cost = GeneralizedCost(costfunc=CrossEntropyMulti(usebits=True)) model = Model(layers=layers)
def check_lstm(seq_len, input_size, hidden_size, batch_size, init_func,
               inp_moms=(0.0, 1.0)):
    """
    Compare the neon LSTM layer against the reference numpy LSTM (RefLSTM)
    in both fprop and bprop.

    Args:
        seq_len (int): number of time steps.
        input_size (int): input feature dimension.
        hidden_size (int): number of hidden units.
        batch_size (int): minibatch size.
        init_func: initializer for the model params.
        inp_moms: the [mean, std dev] of the random input.
    """
    # NOTE: default changed from a mutable list to a tuple (read-only use)
    input_shape = (input_size, seq_len * batch_size)
    hidden_shape = (hidden_size, seq_len * batch_size)
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    # neon LSTM
    lstm = LSTM(hidden_size, init_func, activation=Tanh(), gate_activation=Logistic())

    inp = np.random.rand(*input_shape) * inp_moms[1] + inp_moms[0]
    inpa = lstm.be.array(inp)

    # run neon fprop
    lstm.fprop(inpa)

    # reference numpy LSTM
    lstm_ref = RefLSTM()
    WLSTM = lstm_ref.init(input_size, hidden_size)

    # copy neon weights and biases into the reference layout (transposed)
    WLSTM[0, :] = lstm.b.get().T
    WLSTM[1:input_size + 1, :] = lstm.W_input.get().T
    WLSTM[input_size + 1:] = lstm.W_recur.get().T

    # the reference code expects time-major input: (seq_len, batch, features)
    inp_ref = inp.copy().T.reshape(seq_len, batch_size, input_size)
    (Hout_ref, cprev, hprev, batch_cache) = lstm_ref.forward(inp_ref, WLSTM)

    # transpose reference outputs back to neon's (features, seq*batch) layout
    Hout_ref = Hout_ref.reshape(seq_len * batch_size, hidden_size).T
    IFOGf_ref = batch_cache['IFOGf'].reshape(seq_len * batch_size, hidden_size * 4).T
    Ct_ref = batch_cache['Ct'].reshape(seq_len * batch_size, hidden_size).T

    # compare fprop results; asserted so failures are actually reported
    # (the original discarded the comparison results)
    print('====Verifying IFOG====')
    assert allclose_with_out(lstm.ifog_buffer.get(), IFOGf_ref, rtol=0.0, atol=1.0e-5)
    print('====Verifying cell states====')
    assert allclose_with_out(lstm.c_act_buffer.get(), Ct_ref, rtol=0.0, atol=1.0e-5)
    print('====Verifying hidden states====')
    assert allclose_with_out(lstm.h_buffer.get(), Hout_ref, rtol=0.0, atol=1.0e-5)
    print('fprop is verified')

    # now test the bprop with a random deltas tensor
    deltas = np.random.randn(*hidden_shape)
    lstm.bprop(lstm.be.array(deltas))
    # grab the delta W from gradient buffer
    dWinput_neon = lstm.dW_input.get()
    dWrecur_neon = lstm.dW_recur.get()
    db_neon = lstm.db.get()

    deltas_ref = deltas.copy().T.reshape(seq_len, batch_size, hidden_size)
    (dX_ref, dWLSTM_ref, dc0_ref, dh0_ref) = lstm_ref.backward(deltas_ref, batch_cache)
    dWrecur_ref = dWLSTM_ref[-hidden_size:, :]
    dWinput_ref = dWLSTM_ref[1:input_size + 1, :]
    db_ref = dWLSTM_ref[0, :]
    dX_ref = dX_ref.reshape(seq_len * batch_size, input_size).T

    # compare bprop results
    print('Making sure neon LSTM match numpy LSTM in bprop')
    print('====Verifying update on W_recur====')
    assert allclose_with_out(dWrecur_neon, dWrecur_ref.T, rtol=0.0, atol=1.0e-5)
    print('====Verifying update on W_input====')
    assert allclose_with_out(dWinput_neon, dWinput_ref.T, rtol=0.0, atol=1.0e-5)
    print('====Verifying update on bias====')
    assert allclose_with_out(db_neon.flatten(), db_ref, rtol=0.0, atol=1.0e-5)
    print('====Verifying output delta====')
    assert allclose_with_out(lstm.out_deltas_buffer.get(), dX_ref, rtol=0.0, atol=1.0e-5)
    print('bprop is verified')

    return
# sent2vec network nhidden = 2400 gradient_clip_norm = 5.0 train_set = SentenceHomogenous(data_file=data_file, sent_name='train', text_name='report_train', nwords=vocab_size_layer, max_len=args.max_len_w, index_from=index_from) if valid_split and valid_split > 0.0: valid_set = SentenceHomogenous(data_file=data_file, sent_name='valid', text_name='report_valid', nwords=vocab_size_layer, max_len=args.max_len_w, index_from=index_from) skip = SkipThought(vocab_size_layer, embed_dim, init_embed_dev, nhidden, rec_layer=GRU, init_rec=Orthonormal(), activ_rec=Tanh(), activ_rec_gate=Logistic(), init_ff=Uniform(low=-0.1, high=0.1), init_const=Constant(0.0)) model = Model(skip) if args.model_file and os.path.isfile(args.model_file): neon_logger.display("Loading saved weights from: {}".format(args.model_file)) model_dict = load_obj(args.model_file) model.deserialize(model_dict, load_states=True) elif args.model_file: neon_logger.display("Unable to find model file {}, restarting training.". format(args.model_file)) cost = Multicost(costs=[GeneralizedCostMask(costfunc=CrossEntropyMulti(usebits=True)), GeneralizedCostMask(costfunc=CrossEntropyMulti(usebits=True))], weights=[1, 1])
# NOTE(review): Python 2 print statements -- this script cannot run on Python 3
# without porting these to print() calls
print "Vocab size - ", vocab_size
print "Sentence Length - ", sentence_length
print "# of train sentences", X_train.shape[0]
print "# of test sentence", X_test.shape[0]

# binary-classification data iterators
train_set = DataIterator(X_train, y_train, nclass=2)
valid_set = DataIterator(X_test, y_test, nclass=2)

# weight initialization: embedding scaled by its dimension
init_emb = Uniform(low=-0.1/embedding_dim, high=0.1/embedding_dim)
init_glorot = GlorotUniform()

# embedding -> LSTM -> sum over time -> dropout -> 2-way softmax
layers = [
    LookupTable(vocab_size=vocab_size, embedding_dim=embedding_dim, init=init_emb),
    LSTM(hidden_size, init_glorot, activation=Tanh(),
         gate_activation=Logistic(), reset_cells=True),
    RecurrentSum(),
    Dropout(keep=0.5),
    Affine(2, init_glorot, bias=init_glorot, activation=Softmax())
]

cost = GeneralizedCost(costfunc=CrossEntropyMulti(usebits=True))
metric = Accuracy()

model = Model(layers=layers)

optimizer = Adagrad(learning_rate=0.01, clip_gradients=clip_gradients)

# configure callbacks
seq_len, return_sequences=return_sequences)
valid_set = DataIteratorSequence(time_series.test, seq_len,
                                 return_sequences=return_sequences)

# define weights initialization
init = GlorotUniform()  # Uniform(low=-0.08, high=0.08)

# define model: model is different for the 2 strategies (sequence target or not)
# NOTE(review): activation=Logistic()/gate_activation=Tanh() is the reverse of
# the usual LSTM convention (activation=Tanh, gate_activation=Logistic);
# preserved as-is -- confirm it was intended
if return_sequences is True:
    # sequence-to-sequence: predict a target at every time step
    layers = [
        LSTM(hidden, init, activation=Logistic(), gate_activation=Tanh(), reset_cells=False),
        Affine(train_set.nfeatures, init, bias=init, activation=Identity())
    ]
else:
    # sequence-to-one: keep only the last recurrent output
    layers = [
        LSTM(hidden, init, activation=Logistic(), gate_activation=Tanh(), reset_cells=True),
        RecurrentLast(),
        Affine(train_set.nfeatures, init, bias=init, activation=Identity())
    ]

model = Model(layers=layers)
# load data train_set = ImageCaption(path=data_path, max_images=-1) # weight initialization init = Uniform(low=-0.08, high=0.08) init2 = Constant(val=train_set.be.array(train_set.bias_init)) # model initialization image_path = Sequential([Affine(hidden_size, init, bias=Constant(val=0.0))]) sent_path = Sequential([Affine(hidden_size, init, linear_name='sent')]) layers = [ MergeMultistream(layers=[image_path, sent_path], merge="recurrent"), Dropout(keep=0.5), LSTM(hidden_size, init, activation=Logistic(), gate_activation=Tanh(), reset_cells=True), Affine(train_set.vocab_size, init, bias=init2, activation=Softmax()) ] cost = GeneralizedCostMask(costfunc=CrossEntropyMulti(usebits=True)) # configure callbacks checkpoint_model_path = "~/image_caption2.pickle" if args.callback_args['save_path'] is None: args.callback_args['save_path'] = checkpoint_model_path if args.callback_args['serialize'] is None: args.callback_args['serialize'] = 1 model = Model(layers=layers)
tokenizer=tokenizer, onehot_input=False)
valid_set = Text(time_steps, valid_path, vocab=train_set.vocab,
                 tokenizer=tokenizer, onehot_input=False)

# weight initialization
init = Uniform(low=-0.1, high=0.1)

# model initialization: two stacked LSTM or GRU layers selected by CLI flag
rlayer_params = {
    "output_size": hidden_size,
    "init": init,
    "activation": Tanh(),
    "gate_activation": Logistic()
}
if args.rlayer_type == 'lstm':
    rlayer1, rlayer2 = LSTM(**rlayer_params), LSTM(**rlayer_params)
else:
    rlayer1, rlayer2 = GRU(**rlayer_params), GRU(**rlayer_params)

layers = [
    LookupTable(vocab_size=len(train_set.vocab), embedding_dim=hidden_size, init=init),
    rlayer1,
    rlayer2,
    Affine(len(train_set.vocab), init, bias=init, activation=Softmax())
]

cost = GeneralizedCost(costfunc=CrossEntropyMulti(usebits=True))
           Affine(nout=1, init=init, bias=init, activation=lrelu)]  #E primary
branch3 = [b1, Linear(1, init=Constant(val=1.0))]  #SUM ECAL
D_layers = Tree([branch1, branch2, branch3], name="Discriminator")
# keep weight between branches equal to 1. for now (alphas=(1.,1.,1.) as by default )

# generator using convolution layers
init_gen = Gaussian(scale=0.001)
relu = Rectlin(slope=0)  # relu for generator

# 3-D conv parameter bundles for the deconv/conv stages of the generator
pad1 = dict(pad_h=2, pad_w=2, pad_d=2)
str1 = dict(str_h=2, str_w=2, str_d=2)
conv1 = dict(init=init_gen, batch_norm=False, activation=lrelu,
             padding=pad1, strides=str1, bias=init_gen)

pad2 = dict(pad_h=2, pad_w=2, pad_d=2)
str2 = dict(str_h=2, str_w=2, str_d=2)
conv2 = dict(init=init_gen, batch_norm=False, activation=lrelu,
             padding=pad2, strides=str2, bias=init_gen)

pad3 = dict(pad_h=0, pad_w=0, pad_d=0)
str3 = dict(str_h=1, str_w=1, str_d=1)
conv3 = dict(init=init_gen, batch_norm=False, activation=Tanh(),
             padding=pad3, strides=str3, bias=init_gen)

bg = BranchNode("bg")
branchg = [bg,
           Affine(1024, init=init_gen, bias=init_gen, activation=relu),
           BatchNorm(),
           Affine(8 * 7 * 7 * 7, init=init_gen, bias=init_gen),
           Reshape((8, 7, 7, 7)),
           Deconv((6, 6, 6, 6), **conv1),  #14x14x14
           BatchNorm(),
           # Linear(5 * 14 * 14 * 14, init=init),
           # Reshape((5, 14, 14, 14)),
           Deconv((5, 5, 5, 64), **conv2),  #27x27x27
           BatchNorm(),
           Conv((3, 3, 3, 1), **conv3)
           ]
def test_tanh(backend):
    """Verify Tanh() fprop against the reference tanh at a few sample points."""
    points = [0, 1, -2]
    inputs = np.array(points).reshape((3, 1))
    outputs = np.array([true_tanh(p) for p in points]).reshape((3, 1))
    compare_tensors(Tanh(), inputs, outputs, tol=1e-7)
gradient_clip_value = None # setup backend be = gen_backend(**extract_valid_args(args, gen_backend)) # download penn treebank dataset = PTB(time_steps, path=args.data_dir) train_set = dataset.train_iter valid_set = dataset.valid_iter # weight initialization init = Uniform(low=-0.08, high=0.08) # model initialization layers = [ Recurrent(hidden_size, init, activation=Tanh()), Affine(len(train_set.vocab), init, bias=init, activation=Softmax()) ] cost = GeneralizedCost(costfunc=CrossEntropyMulti(usebits=True)) model = Model(layers=layers) optimizer = RMSProp(gradient_clip_value=gradient_clip_value, stochastic_round=args.rounding) # configure callbacks callbacks = Callbacks(model, eval_set=valid_set, **args.callback_args) # train model model.fit(train_set,
def test_reshape_layer_model(backend_default, fargs):
    """
    Check that Reshape composes with other layer types end to end.

    test cases:
    - conv before RNNs
    - conv after RNNs
    - conv after LUT

    Each configuration must build, fprop, and bprop without error.
    """
    np.random.seed(seed=0)
    nin, nout, bsz = fargs

    be = backend_default
    be.bsz = bsz
    input_size = (nin, be.bsz)

    init = Uniform(-0.1, 0.1)
    g_uni = GlorotUniform()

    inp_np = np.random.rand(nin, be.bsz)
    delta_np = np.random.rand(nout, be.bsz)

    inp = be.array(inp_np)
    delta = be.array(delta_np)

    conv_lut_1 = [
        LookupTable(vocab_size=2000, embedding_dim=400, init=init),
        Reshape(reshape=(4, 100, -1)),
        Conv((3, 3, 16), init=init),
        LSTM(64, g_uni, activation=Tanh(), gate_activation=Logistic(),
             reset_cells=True),
        RecurrentSum(),
        Affine(nout, init, bias=init, activation=Softmax())
    ]

    conv_lut_2 = [
        LookupTable(vocab_size=1000, embedding_dim=400, init=init),
        Reshape(reshape=(4, 50, -1)),
        Conv((3, 3, 16), init=init),
        Pooling(2, strides=2),
        Affine(nout=nout, init=init, bias=init, activation=Softmax()),
    ]

    conv_rnn_1 = [
        LookupTable(vocab_size=2000, embedding_dim=400, init=init),
        LSTM(64, g_uni, activation=Tanh(), gate_activation=Logistic(),
             reset_cells=True),
        Reshape(reshape=(4, 32, -1)),
        Conv((3, 3, 16), init=init),
        Affine(nout, init, bias=init, activation=Softmax())
    ]

    conv_rnn_2 = [
        LookupTable(vocab_size=2000, embedding_dim=400, init=init),
        Recurrent(64, g_uni, activation=Tanh(), reset_cells=True),
        Reshape(reshape=(4, -1, 32)),
        Conv((3, 3, 16), init=init),
        Affine(nout, init, bias=init, activation=Softmax())
    ]

    lut_sum_1 = [
        LookupTable(vocab_size=1000, embedding_dim=128, init=init),
        RecurrentSum(),
        Affine(nout=nout, init=init, bias=init, activation=Softmax()),
    ]

    lut_birnn_1 = [
        LookupTable(vocab_size=1000, embedding_dim=200, init=init),
        DeepBiRNN(32, init=GlorotUniform(), batch_norm=True, activation=Tanh(),
                  reset_cells=True, depth=1),
        Reshape((4, 32, -1)),
        Conv((3, 3, 16), init=init),
        Affine(nout=nout, init=init, bias=init, activation=Softmax())
    ]

    layers_test = [conv_lut_1, conv_lut_2, conv_rnn_1, conv_rnn_2, lut_sum_1, lut_birnn_1]

    # build each configuration and run one fprop/bprop pass
    for lg in layers_test:
        model = Model(layers=lg)
        cost = GeneralizedCost(costfunc=CrossEntropyBinary())
        model.initialize(input_size, cost)
        model.fprop(inp)
        model.bprop(delta)
# training iterator feeds previous targets for teacher forcing; validation
# does not (decoder runs on its own predictions)
train_set = TextNMT(time_steps, train_path, get_prev_target=True, onehot_input=False,
                    split='train', dataset=dataset, subset_pct=args.subset_pct)
valid_set = TextNMT(time_steps, train_path, get_prev_target=False, onehot_input=False,
                    split='valid', dataset=dataset)

# weight initialization
init = Uniform(low=-0.08, high=0.08)

# Standard or Conditional encoder / decoder:
encoder = [LookupTable(vocab_size=len(train_set.s_vocab), embedding_dim=embedding_dim,
                       init=init, name="LUT_en")]
decoder = [LookupTable(vocab_size=len(train_set.t_vocab), embedding_dim=embedding_dim,
                       init=init, name="LUT_de")]

# link up recurrent layers: one encoder GRU connects to one decoder GRU per depth
decoder_connections = []
for ii in range(num_layers):
    encoder.append(GRU(hidden_size, init, activation=Tanh(), gate_activation=Logistic(),
                       reset_cells=True, name="GRU1Enc"))
    decoder.append(GRU(hidden_size, init, activation=Tanh(), gate_activation=Logistic(),
                       reset_cells=True, name="GRU1Dec"))
    decoder_connections.append(ii)

decoder.append(Affine(train_set.nout, init, bias=init, activation=Softmax(),
                      name="Affout"))

layers = Seq2Seq([encoder, decoder],
                 decoder_connections=decoder_connections,
                 name="Seq2Seq")

cost = GeneralizedCost(costfunc=CrossEntropyMulti(usebits=True))

model = Model(layers=layers)

optimizer = RMSProp(gradient_clip_value=gradient_clip_value,
                    stochastic_round=args.rounding)
# setup backend; batch of 1 for single-sentence inference
be = gen_backend(**extract_valid_args(args, gen_backend))
be.bsz = 1

# define same model as in train
init_glorot = GlorotUniform()
init_emb = Uniform(low=-0.1 / embedding_dim, high=0.1 / embedding_dim)
nclass = 2
layers = [
    LookupTable(vocab_size=vocab_size, embedding_dim=embedding_dim, init=init_emb,
                pad_idx=0, update=True),
    LSTM(hidden_size, init_glorot, activation=Tanh(), gate_activation=Logistic(),
         reset_cells=True),
    RecurrentSum(),
    Dropout(keep=0.5),
    Affine(nclass, init_glorot, bias=init_glorot, activation=Softmax())
]

# load the weights
print("Initialized the models - ")
model_new = Model(layers=layers)
print("Loading the weights from {0}".format(args.model_weights))

model_new.load_params(args.model_weights)
model_new.initialize(dataset=(sentence_length, batch_size))
def check_gru(seq_len, input_size, hidden_size, batch_size, init_func,
              inp_moms=[0.0, 1.0], add_init_state=False):
    """
    Compare the neon GRU layer against the reference numpy GRU (RefGRU)
    in both fprop and bprop.

    Args:
        seq_len (int): number of time steps.
        input_size (int): input feature dimension.
        hidden_size (int): number of GRU units.
        batch_size (int): minibatch size.
        init_func: initializer for the model params.
        inp_moms: the [mean, std dev] of the random input.
        add_init_state (bool): if True, fprop with a random initial state.
    """
    input_shape = (input_size, seq_len * batch_size)
    output_shape = (hidden_size, seq_len * batch_size)
    slice_shape = (hidden_size, batch_size)
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    # neon GRU
    gru = GRU(hidden_size, init_func, activation=Tanh(), gate_activation=Logistic())

    # generate random input tensor
    inp = np.random.rand(*input_shape) * inp_moms[1] + inp_moms[0]
    inp_dev = gru.be.array(inp)
    # generate random deltas tensor
    deltas = np.random.randn(*output_shape)

    # run neon fprop
    gru.configure((input_size, seq_len))
    gru.prev_layer = True
    gru.allocate()

    test_buffer = DeltasTree()
    gru.allocate_deltas(test_buffer)
    test_buffer.allocate_buffers()
    gru.set_deltas(test_buffer)

    if add_init_state:
        init_state = np.random.rand(*slice_shape) * inp_moms[1] + inp_moms[0]
        init_state_dev = gru.be.array(init_state)
        gru.fprop(inp_dev, init_state=init_state_dev)
    else:
        gru.fprop(inp_dev)

    # reference numpy GRU
    gru_ref = RefGRU(input_size, hidden_size)
    WGRU = gru_ref.weights

    # make ref weights and biases the same with neon model;
    # neon stacks the r/z/c gate blocks along the first axis
    r_range = list(range(hidden_size))
    z_range = list(range(hidden_size, hidden_size * 2))
    c_range = list(range(hidden_size * 2, hidden_size * 3))

    WGRU[gru_ref.weights_ind_br][:] = gru.b.get()[r_range]
    WGRU[gru_ref.weights_ind_bz][:] = gru.b.get()[z_range]
    WGRU[gru_ref.weights_ind_bc][:] = gru.b.get()[c_range]

    WGRU[gru_ref.weights_ind_Wxr][:] = gru.W_input.get()[r_range]
    WGRU[gru_ref.weights_ind_Wxz][:] = gru.W_input.get()[z_range]
    WGRU[gru_ref.weights_ind_Wxc][:] = gru.W_input.get()[c_range]

    WGRU[gru_ref.weights_ind_Rhr][:] = gru.W_recur.get()[r_range]
    WGRU[gru_ref.weights_ind_Rhz][:] = gru.W_recur.get()[z_range]
    WGRU[gru_ref.weights_ind_Rhc][:] = gru.W_recur.get()[c_range]

    # transpose input X and do fprop
    # the reference code expects these shapes:
    # input_shape: (seq_len, input_size, batch_size)
    # output_shape: (seq_len, hidden_size, batch_size)
    inp_ref = inp.copy().T.reshape(seq_len, batch_size, input_size).swapaxes(1, 2)
    deltas_ref = deltas.copy().T.reshape(seq_len, batch_size, hidden_size).swapaxes(1, 2)

    if add_init_state:
        init_state_ref = init_state.copy()
        (dWGRU_ref, h_ref_list, dh_ref_list,
         dr_ref_list, dz_ref_list, dc_ref_list) = gru_ref.lossFun(inp_ref,
                                                                  deltas_ref,
                                                                  init_state_ref)
    else:
        (dWGRU_ref, h_ref_list, dh_ref_list,
         dr_ref_list, dz_ref_list, dc_ref_list) = gru_ref.lossFun(inp_ref,
                                                                  deltas_ref)

    neon_logger.display('====Verifying hidden states====')
    assert allclose_with_out(gru.outputs.get(), h_ref_list, rtol=0.0, atol=1.0e-5)
    neon_logger.display('fprop is verified')

    # now test the bprop
    neon_logger.display('Making sure neon GRU matches numpy GRU in bprop')
    gru.bprop(gru.be.array(deltas))
    # grab the delta W from gradient buffer
    dWinput_neon = gru.dW_input.get()
    dWrecur_neon = gru.dW_recur.get()
    db_neon = gru.db.get()

    # slice neon gradients into per-gate blocks for comparison
    dWxr_neon = dWinput_neon[r_range]
    dWxz_neon = dWinput_neon[z_range]
    dWxc_neon = dWinput_neon[c_range]
    dWrr_neon = dWrecur_neon[r_range]
    dWrz_neon = dWrecur_neon[z_range]
    dWrc_neon = dWrecur_neon[c_range]
    dbr_neon = db_neon[r_range]
    dbz_neon = db_neon[z_range]
    dbc_neon = db_neon[c_range]

    drzc_neon = gru.rzhcan_delta_buffer.get()
    dr_neon = drzc_neon[r_range]
    dz_neon = drzc_neon[z_range]
    dc_neon = drzc_neon[c_range]

    dWxr_ref = dWGRU_ref[gru_ref.dW_ind_Wxr]
    dWxz_ref = dWGRU_ref[gru_ref.dW_ind_Wxz]
    dWxc_ref = dWGRU_ref[gru_ref.dW_ind_Wxc]
    dWrr_ref = dWGRU_ref[gru_ref.dW_ind_Rhr]
    dWrz_ref = dWGRU_ref[gru_ref.dW_ind_Rhz]
    dWrc_ref = dWGRU_ref[gru_ref.dW_ind_Rhc]
    dbr_ref = dWGRU_ref[gru_ref.dW_ind_br]
    dbz_ref = dWGRU_ref[gru_ref.dW_ind_bz]
    dbc_ref = dWGRU_ref[gru_ref.dW_ind_bc]

    # neon_logger.display '====Verifying hidden deltas ===='
    neon_logger.display('====Verifying r deltas ====')
    assert allclose_with_out(dr_neon, dr_ref_list, rtol=0.0, atol=1.0e-5)
    neon_logger.display('====Verifying z deltas ====')
    assert allclose_with_out(dz_neon, dz_ref_list, rtol=0.0, atol=1.0e-5)
    neon_logger.display('====Verifying hcan deltas ====')
    assert allclose_with_out(dc_neon, dc_ref_list, rtol=0.0, atol=1.0e-5)

    neon_logger.display('====Verifying update on W_input====')
    neon_logger.display('dWxr')
    assert allclose_with_out(dWxr_neon, dWxr_ref, rtol=0.0, atol=1.0e-5)
    neon_logger.display('dWxz')
    assert allclose_with_out(dWxz_neon, dWxz_ref, rtol=0.0, atol=1.0e-5)
    neon_logger.display('dWxc')
    assert allclose_with_out(dWxc_neon, dWxc_ref, rtol=0.0, atol=1.0e-5)

    neon_logger.display('====Verifying update on W_recur====')
    neon_logger.display('dWrr')
    assert allclose_with_out(dWrr_neon, dWrr_ref, rtol=0.0, atol=1.0e-5)
    neon_logger.display('dWrz')
    assert allclose_with_out(dWrz_neon, dWrz_ref, rtol=0.0, atol=1.0e-5)
    neon_logger.display('dWrc')
    assert allclose_with_out(dWrc_neon, dWrc_ref, rtol=0.0, atol=1.0e-5)

    neon_logger.display('====Verifying update on bias====')
    neon_logger.display('dbr')
    assert allclose_with_out(dbr_neon, dbr_ref, rtol=0.0, atol=1.0e-5)
    neon_logger.display('dbz')
    assert allclose_with_out(dbz_neon, dbz_ref, rtol=0.0, atol=1.0e-5)
    neon_logger.display('dbc')
    assert allclose_with_out(dbc_neon, dbc_ref, rtol=0.0, atol=1.0e-5)

    neon_logger.display('bprop is verified')

    return
# NOTE(review): Python 2 print statements -- this script cannot run on Python 3
# without porting these to print() calls
print "Vocab size - ", vocab_size
print "Sentence Length - ", sentence_length
print "# of train sentences", X_train.shape[0]
print "# of test sentence", X_test.shape[0]

# binary-classification data iterators
train_set = ArrayIterator(X_train, y_train, nclass=2)
valid_set = ArrayIterator(X_test, y_test, nclass=2)

# weight initialization: embedding scaled by its dimension
uni = Uniform(low=-0.1 / embedding_dim, high=0.1 / embedding_dim)
g_uni = GlorotUniform()

# recurrent layer variant selected by CLI flag
if args.rlayer_type == 'lstm':
    rlayer = LSTM(hidden_size, g_uni, activation=Tanh(),
                  gate_activation=Logistic(), reset_cells=True)
elif args.rlayer_type == 'bilstm':
    rlayer = DeepBiLSTM(hidden_size, g_uni, activation=Tanh(), depth=1,
                        gate_activation=Logistic(), reset_cells=True)
elif args.rlayer_type == 'rnn':
    rlayer = Recurrent(hidden_size, g_uni, activation=Tanh(), reset_cells=True)
elif args.rlayer_type == 'birnn':
    rlayer = DeepBiRNN(hidden_size, g_uni, activation=Tanh(),
default_dtype=args.datatype)

# download penn treebank
train_path = load_text('ptb-train', path=args.data_dir)
valid_path = load_text('ptb-valid', path=args.data_dir)

# load data and parse on character-level
train_set = Text(time_steps, train_path)
valid_set = Text(time_steps, valid_path, vocab=train_set.vocab)

# weight initialization
init = Uniform(low=-0.08, high=0.08)

# model initialization: single vanilla RNN layer plus softmax classifier
layers = [
    Recurrent(hidden_size, init, Tanh()),
    Affine(len(train_set.vocab), init, bias=init, activation=Softmax())
]

cost = GeneralizedCost(costfunc=CrossEntropyMulti(usebits=True))

model = Model(layers=layers)
optimizer = RMSProp(clip_gradients=clip_gradients, stochastic_round=args.rounding)

# configure callbacks
callbacks = Callbacks(model, train_set, output_file=args.output_file,
                      valid_set=valid_set, valid_freq=args.validation_freq,
                      progress_bar=args.progress_bar)

# train model
def check_rnn(seq_len, input_size, hidden_size, batch_size, init_func,
              inp_moms=(0.0, 1.0)):
    """
    Compare neon's Recurrent layer against a NumPy reference implementation.

    Runs fprop and bprop through both the neon RNN and the reference RNN on
    identical random data and weights, and asserts that the hidden states and
    all gradients (W_input, W_recur, bias) agree to within atol=1e-5.

    Args:
        seq_len (int): number of unrolled time steps
        input_size (int): feature dimension of the input
        hidden_size (int): number of hidden units
        batch_size (int): minibatch size
        init_func: neon initializer used for the layer weights
        inp_moms (sequence): [mean, std dev] of the random input values
            (fixed: was a mutable-list default; now an immutable tuple)

    Raises:
        AssertionError: if any neon result diverges from the reference.
    """
    input_shape = (input_size, seq_len * batch_size)
    output_shape = (hidden_size, seq_len * batch_size)
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    # ======== create models ========
    # neon RNN
    rnn = Recurrent(hidden_size, init_func, Tanh())
    # reference numpy RNN
    rnn_ref = RefRecurrent(input_size, hidden_size)
    Wxh = rnn_ref.Wxh
    Whh = rnn_ref.Whh
    bh = rnn_ref.bh

    # ========= generate data =================
    # random input tensor, scaled/shifted by the requested moments
    inp = np.random.rand(*input_shape) * inp_moms[1] + inp_moms[0]
    inpa = rnn.be.array(inp)
    # random deltas tensor for bprop
    deltas = np.random.randn(*output_shape)

    # the reference code expects these shapes:
    #   input_shape:  (seq_len, input_size, batch_size)
    #   output_shape: (seq_len, hidden_size, batch_size)
    inp_ref = inp.copy().T.reshape(
        seq_len, batch_size, input_size).swapaxes(1, 2)
    deltas_ref = deltas.copy().T.reshape(
        seq_len, batch_size, hidden_size).swapaxes(1, 2)

    # ========= running models ==========
    # run neon fprop
    rnn.fprop(inpa)

    # weights are only initialized after doing fprop, so now
    # make ref weights and biases the same with neon model
    Wxh[:] = rnn.W_input.get()
    Whh[:] = rnn.W_recur.get()
    bh[:] = rnn.b.get()
    (dWxh_ref, dWhh_ref, db_ref, h_ref_list,
     dh_ref_list, d_out_ref) = rnn_ref.lossFun(inp_ref, deltas_ref)

    # now test the bprop
    rnn.bprop(rnn.be.array(deltas))
    # grab the delta W from gradient buffer
    dWxh_neon = rnn.dW_input.get()
    dWhh_neon = rnn.dW_recur.get()
    db_neon = rnn.db.get()

    # comparing outputs
    # NOTE: converted from Python 2 print statements (a SyntaxError under
    # Python 3); the fprop comparison is now asserted rather than merely
    # printed, matching the bprop checks below and the sibling GRU test.
    print('====Verifying hidden states====')
    assert allclose_with_out(rnn.h_buffer.get(), h_ref_list,
                             rtol=0.0, atol=1.0e-5)
    print('fprop is verified')

    print('====Verifying update on W and b ====')
    print('dWxh')
    assert allclose_with_out(dWxh_neon, dWxh_ref, rtol=0.0, atol=1.0e-5)
    print('dWhh')
    assert allclose_with_out(dWhh_neon, dWhh_ref, rtol=0.0, atol=1.0e-5)

    print('====Verifying update on bias====')
    print('db')
    assert allclose_with_out(db_neon, db_ref, rtol=0.0, atol=1.0e-5)
    print('bprop is verified')
    return
def create_model(dis_model='dc', gen_model='dc', cost_type='wasserstein',
                 noise_type='normal', im_size=64, n_chan=3, n_noise=100,
                 n_gen_ftr=64, n_dis_ftr=64, depth=4, n_extra_layers=0,
                 batch_norm=True, gen_squash=None, dis_squash=None,
                 dis_iters=5, wgan_param_clamp=None, wgan_train_sched=False):
    """
    Create a GAN model and associated GAN cost function for image generation

    Arguments:
        dis_model (str): Discriminator type, can be 'mlp' for a simple MLP or
                         'dc' for a DC-GAN style model. (defaults to 'dc')
        gen_model (str): Generator type, can be 'mlp' for a simple MLP or
                         'dc' for a DC-GAN style model. (defaults to 'dc')
        cost_type (str): Cost type, can be 'original', 'modified' following
                         Goodfellow2014 or 'wasserstein' following Arjovsky2017
                         (defaults to 'wasserstein')
        noise_type (str): Noise distribution, can be 'uniform or' 'normal'
                          (defaults to 'normal')
        im_size (int): Image size (defaults to 64)
        n_chan (int): Number of image channels (defaults to 3)
        n_noise (int): Number of noise dimensions (defaults to 100)
        n_gen_ftr (int): Number of generator feature maps (defaults to 64)
        n_dis_ftr (int): Number of discriminator feature maps (defaults to 64)
        depth (int): Depth of layers in case of MLP (defaults to 4)
        n_extra_layers (int): Number of extra conv layers in case of DC
                              (defaults to 0)
        batch_norm (bool): Enable batch normalization (defaults to True)
        gen_squash (str or None): Squashing function at the end of generator
                                  (defaults to None)
        dis_squash (str or None): Squashing function at the end of discriminator
                                  (defaults to None)
        dis_iters (int): Number of critics for discriminator (defaults to 5)
        wgan_param_clamp (float or None): In case of WGAN weight clamp value,
                                          None for others
        wgan_train_sched (bool): Enable training schedule of number of critics
                                 (defaults to False)

    Returns:
        tuple: a two-element tuple of the assembled ``GAN`` model and the
        ``GeneralizedGANCost`` wrapping the requested ``GANCost`` function.
    """
    # validate the categorical arguments up front with explicit messages
    assert dis_model in ['mlp', 'dc'], \
        "Unsupported model type for discriminator net, supported: 'mlp' and 'dc'"
    assert gen_model in ['mlp', 'dc'], \
        "Unsupported model type for generator net, supported: 'mlp' and 'dc'"
    assert cost_type in ['original', 'modified', 'wasserstein'], \
        "Unsupported GAN cost function type, supported: 'original', 'modified' and 'wasserstein'"

    # types of final squashing functions
    squash_func = dict(nosquash=Identity(), sym=Tanh(), asym=Logistic())

    # fill in squashing-function defaults only where the caller passed None;
    # the defaults depend on both the cost type and the generator type
    if cost_type == 'wasserstein':
        if gen_model == 'mlp':
            gen_squash = gen_squash or 'nosquash'
        elif gen_model == 'dc':
            gen_squash = gen_squash or 'sym'
        dis_squash = dis_squash or 'nosquash'
    else:  # for all GAN costs other than Wasserstein
        gen_squash = gen_squash or 'sym'
        dis_squash = dis_squash or 'asym'

    assert gen_squash in ['nosquash', 'sym', 'asym'], \
        "Unsupported final squashing function for generator," \
        " supported: 'nosquash', 'sym' and 'asym'"
    assert dis_squash in ['nosquash', 'sym', 'asym'], \
        "Unsupported final squashing function for discriminator," \
        " supported: 'nosquash', 'sym' and 'asym'"

    # look up the actual activation objects for the chosen squash names
    gfa = squash_func[gen_squash]
    dfa = squash_func[dis_squash]

    # create model layers
    # NOTE: the MLP generator/discriminator factories are called with
    # batch_norm=False regardless of the batch_norm argument; only the DC
    # variants honor it
    if gen_model == 'mlp':
        gen = create_mlp_generator(im_size, n_chan, n_gen_ftr, depth,
                                   batch_norm=False, finact=gfa)
        # MLP generator consumes a flat noise vector
        noise_dim = (n_noise, )
    elif gen_model == 'dc':
        gen = create_dc_generator(im_size, n_chan, n_noise, n_gen_ftr,
                                  n_extra_layers, batch_norm, finact=gfa)
        # DC generator consumes noise shaped as a 1x1 feature map
        noise_dim = (n_noise, 1, 1)
    if dis_model == 'mlp':
        dis = create_mlp_discriminator(im_size, n_dis_ftr, depth,
                                       batch_norm=False, finact=dfa)
    elif dis_model == 'dc':
        dis = create_dc_discriminator(im_size, n_chan, n_dis_ftr,
                                      n_extra_layers, batch_norm, finact=dfa)

    layers = GenerativeAdversarial(generator=Sequential(gen, name="Generator"),
                                   discriminator=Sequential(
                                       dis, name="Discriminator"))
    return GAN(layers=layers, noise_dim=noise_dim,
               noise_type=noise_type, k=dis_iters,
               wgan_param_clamp=wgan_param_clamp,
               wgan_train_sched=wgan_train_sched), \
        GeneralizedGANCost(costfunc=GANCost(func=cost_type))
gradient_clip_value = 5 # download shakespeare text data_path = load_shakespeare(path=args.data_dir) train_path, valid_path = Text.create_valid_file(data_path) # load data and parse on character-level train_set = Text(time_steps, train_path) valid_set = Text(time_steps, valid_path, vocab=train_set.vocab) # weight initialization init = Uniform(low=-0.08, high=0.08) # model initialization layers = [ LSTM(hidden_size, init, activation=Logistic(), gate_activation=Tanh()), Affine(len(train_set.vocab), init, bias=init, activation=Softmax()) ] model = Model(layers=layers) cost = GeneralizedCost(costfunc=CrossEntropyMulti(usebits=True)) optimizer = RMSProp(gradient_clip_value=gradient_clip_value, stochastic_round=args.rounding) # configure callbacks callbacks = Callbacks(model, eval_set=valid_set, **args.callback_args) # fit and validate model.fit(train_set, optimizer=optimizer,
default_dtype=args.datatype) # download penn treebank train_path = load_text('ptb-train', path=args.data_dir) valid_path = load_text('ptb-valid', path=args.data_dir) # load data and parse on character-level train_set = Text(time_steps, train_path) valid_set = Text(time_steps, valid_path, vocab=train_set.vocab) # weight initialization init = Uniform(low=-0.08, high=0.08) # model initialization if rlayer_type == 'lstm': rlayer = LSTM(hidden_size, init, Logistic(), Tanh()) elif rlayer_type == 'gru': rlayer = GRU(hidden_size, init, activation=Tanh(), gate_activation=Logistic()) else: raise NotImplementedError('%s layer not implemented' % rlayer_type) layers = [ rlayer, Affine(len(train_set.vocab), init, bias=init, activation=Softmax()) ] cost = GeneralizedCost(costfunc=CrossEntropyMulti(usebits=True)) model = Model(layers=layers) optimizer = RMSProp(clip_gradients=clip_gradients, stochastic_round=args.rounding)