def layers(self):
    return [
        Conv((7, 7, 96), init=Gaussian(scale=0.0001), bias=Constant(0),
             activation=Rectlin(), padding=3, strides=1),
        LRN(31, ascale=0.001, bpower=0.75),
        Pooling(3, strides=2, padding=1),
        Conv((5, 5, 256), init=Gaussian(scale=0.01), bias=Constant(0),
             activation=Rectlin(), padding=2, strides=1),
        LRN(31, ascale=0.001, bpower=0.75),
        Pooling(3, strides=2, padding=1),
        Conv((3, 3, 384), init=Gaussian(scale=0.01), bias=Constant(0),
             activation=Rectlin(), padding=1, strides=1),
        Conv((3, 3, 384), init=Gaussian(scale=0.01), bias=Constant(0),
             activation=Rectlin(), padding=1, strides=1),
        Conv((3, 3, 256), init=Gaussian(scale=0.01), bias=Constant(0),
             activation=Rectlin(), padding=1, strides=1),
        Pooling(3, strides=2, padding=1),
        Affine(nout=4096, init=Gaussian(scale=0.01), bias=Constant(0),
               activation=Identity()),
        Dropout(keep=0.5),
        Affine(nout=4096, init=Gaussian(scale=0.01), bias=Constant(0),
               activation=Identity()),
        Dropout(keep=0.5),
        Affine(nout=self.noutputs, init=Gaussian(scale=0.01), bias=Constant(0),
               activation=Softmax() if self.use_softmax else Logistic(shortcut=True))
    ]
def create_model(vocab_size, rlayer_type):
    """
    Create LSTM/GRU model for the bAbI dataset.

    Args:
        vocab_size (int): Size of the vocabulary built from the bAbI data.
        rlayer_type (string): Type of recurrent layer to use (gru or lstm).

    Returns:
        Model: Model of the created network.
    """
    # recurrent layer parameters (default gru)
    rlayer_obj = GRU if rlayer_type == 'gru' else LSTM
    rlayer_params = dict(output_size=100, reset_cells=True,
                         init=GlorotUniform(), init_inner=Orthonormal(0.5),
                         activation=Tanh(), gate_activation=Logistic())

    # if using lstm, swap the activation functions
    if rlayer_type == 'lstm':
        rlayer_params.update(dict(activation=Logistic(), gate_activation=Tanh()))

    # lookup layer parameters
    lookup_params = dict(vocab_size=vocab_size, embedding_dim=50,
                         init=Uniform(-0.05, 0.05))

    # model construction: story and query each get their own embedding + RNN path
    story_path = [LookupTable(**lookup_params), rlayer_obj(**rlayer_params)]
    query_path = [LookupTable(**lookup_params), rlayer_obj(**rlayer_params)]

    layers = [MergeMultistream(layers=[story_path, query_path], merge="stack"),
              Affine(vocab_size, init=GlorotUniform(), activation=Softmax())]

    return Model(layers=layers)
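# A minimal usage sketch (an assumption, not from the original source): the
# backend settings and vocab_size below are illustrative only.
from neon.backends import gen_backend

gen_backend(backend='cpu', batch_size=32)
model = create_model(vocab_size=100, rlayer_type='gru')
# training would then follow the usual neon pattern, e.g.:
# model.fit(train_set, optimizer=Adam(),
#           cost=GeneralizedCost(costfunc=CrossEntropyMulti()),
#           num_epochs=10, callbacks=Callbacks(model))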
def test_biLSTM_fprop(backend_default, fargs):
    # basic sanity check with shared weights and random inputs
    seq_len, input_size, hidden_size, batch_size = fargs
    in_shape = (input_size, seq_len)
    out_shape = (hidden_size, seq_len)
    NervanaObject.be.bsz = batch_size

    # setup the bi-directional lstm
    init_glorot = GlorotUniform()
    bilstm = BiLSTM(hidden_size, gate_activation=Logistic(), init=init_glorot,
                    activation=Tanh(), reset_cells=True)
    bilstm.configure(in_shape)
    bilstm.prev_layer = True
    bilstm.allocate()

    # copy the forward weights into the backward direction so both match
    nout = hidden_size
    bilstm.W_input_b[:] = bilstm.W_input_f
    bilstm.W_recur_b[:] = bilstm.W_recur_f
    bilstm.b_b[:] = bilstm.b_f
    bilstm.dW[:] = 0

    # inputs - random and flipped left-to-right inputs
    lr = np.random.random((input_size, seq_len * batch_size))
    lr_rev = list(reversed(get_steps(lr.copy(), in_shape)))
    rl = np.concatenate(lr_rev, axis=1)
    inp_lr = bilstm.be.array(lr)
    inp_rl = bilstm.be.array(rl)

    # outputs
    out_lr = bilstm.fprop(inp_lr).get().copy()
    bilstm.h_buffer[:] = 0
    out_rl = bilstm.fprop(inp_rl).get().copy()

    # views
    out_lr_f_s = get_steps(out_lr[:nout], out_shape)
    out_lr_b_s = get_steps(out_lr[nout:], out_shape)
    out_rl_f_s = get_steps(out_rl[:nout], out_shape)
    out_rl_b_s = get_steps(out_rl[nout:], out_shape)

    # asserts: the forward pass over the reversed sequence must match the
    # backward pass over the original sequence, and vice versa
    for x_f, x_b, y_f, y_b in zip(out_lr_f_s, out_lr_b_s,
                                  reversed(out_rl_f_s), reversed(out_rl_b_s)):
        assert allclose_with_out(x_f, y_b, rtol=0.0, atol=1.0e-5)
        assert allclose_with_out(x_b, y_f, rtol=0.0, atol=1.0e-5)
def test_model_N_S_setter(backend_default):
    # weight initialization
    init = Constant(0.08)

    # model initialization
    layers = [Recurrent(150, init, activation=Logistic()),
              Affine(100, init, bias=init, activation=Rectlin())]

    model = Model(layers=layers)
    model.set_batch_size(20)
    model.set_seq_len(10)
def test_multi_optimizer(backend_default):
    opt_gdm = GradientDescentMomentum(learning_rate=0.001,
                                      momentum_coef=0.9, wdecay=0.005)
    opt_ada = Adadelta()
    opt_adam = Adam()
    opt_rms = RMSProp()
    opt_rms_1 = RMSProp(gradient_clip_value=5)
    init_one = Gaussian(scale=0.01)

    l1 = Conv((11, 11, 64), strides=4, padding=3,
              init=init_one, bias=Constant(0), activation=Rectlin())
    l2 = Affine(nout=4096, init=init_one,
                bias=Constant(1), activation=Rectlin())
    l3 = LSTM(output_size=1000, init=init_one,
              activation=Logistic(), gate_activation=Tanh())
    l4 = GRU(output_size=100, init=init_one,
             activation=Logistic(), gate_activation=Tanh())
    layers = [l1, l2, l3, l4]
    layer_list = []
    for layer in layers:
        if isinstance(layer, list):
            layer_list.extend(layer)
        else:
            layer_list.append(layer)

    opt = MultiOptimizer({'default': opt_gdm,
                          'Bias': opt_ada,
                          'Convolution': opt_adam,
                          'Linear': opt_rms,
                          'LSTM': opt_rms_1,
                          'GRU': opt_rms_1})

    map_list = opt.map_optimizers(layer_list)
    assert map_list[opt_adam][0].__class__.__name__ == 'Convolution'
    assert map_list[opt_ada][0].__class__.__name__ == 'Bias'
    assert map_list[opt_rms][0].__class__.__name__ == 'Linear'
    assert map_list[opt_gdm][0].__class__.__name__ == 'Activation'
    assert map_list[opt_rms_1][0].__class__.__name__ == 'LSTM'
    assert map_list[opt_rms_1][1].__class__.__name__ == 'GRU'
def __init__(self):
    self.in_shape = (1, 32, 32)
    init_norm = Gaussian(loc=0.0, scale=0.01)
    normrelu = dict(init=init_norm, activation=Rectlin())
    normsigm = dict(init=init_norm, activation=Logistic(shortcut=True))
    normsoft = dict(init=init_norm, activation=Softmax())

    # setup model layers
    b1 = BranchNode(name="b1")
    b2 = BranchNode(name="b2")
    p1 = [Affine(nout=100, name="main1", **normrelu),
          b1,
          Affine(nout=32, name="main2", **normrelu),
          Affine(nout=160, name="main3", **normrelu),
          b2,
          Affine(nout=32, name="main2", **normrelu),
          # make next layer big to check sizing
          Affine(nout=320, name="main2", **normrelu),
          Affine(nout=10, name="main4", **normsoft)]
    p2 = [b1,
          Affine(nout=16, name="branch1_1", **normrelu),
          Affine(nout=10, name="branch1_2", **normsigm)]
    p3 = [b2,
          Affine(nout=16, name="branch2_1", **normrelu),
          Affine(nout=10, name="branch2_2", **normsigm)]

    self.cost = Multicost(costs=[GeneralizedCost(costfunc=CrossEntropyMulti()),
                                 GeneralizedCost(costfunc=CrossEntropyBinary()),
                                 GeneralizedCost(costfunc=CrossEntropyBinary())],
                          weights=[1, 0., 0.])

    self.layers = SingleOutputTree([p1, p2, p3], alphas=[1, .2, .2])
    self.model = Model(layers=self.layers)
    self.model.initialize(self.in_shape, cost=self.cost)
def __init__(self, backend, dataset, subj):
    ad = {'type': 'adadelta',
          'lr_params': {'rho': 0.9, 'epsilon': 1e-10}}
    self.layers = []
    self.add(DataLayer(is_local=True, nofm=dataset.nchannels,
                       ofmshape=[1, dataset.nsamples]))
    self.add(ConvLayer(nofm=64, fshape=[1, 3],
                       activation=RectLin(), lrule_init=ad))
    self.add(PoolingLayer(op='max', fshape=[1, 2], stride=2))
    self.add(FCLayer(nout=128, activation=RectLin(), lrule_init=ad))
    self.add(FCLayer(nout=dataset.nclasses, activation=Logistic(),
                     lrule_init=ad))
    self.add(CostLayer(cost=CrossEntropy()))
    self.model = MLP(num_epochs=1, batch_size=128, layers=self.layers)
    self.backend = backend
    self.dataset = dataset
def test_model_get_outputs(backend_default):
    (X_train, y_train), (X_test, y_test), nclass = load_mnist()
    train_set = DataIterator(X_train[:backend_default.bsz * 3])

    init_norm = Gaussian(loc=0.0, scale=0.1)
    layers = [Affine(nout=20, init=init_norm, bias=init_norm,
                     activation=Rectlin()),
              Affine(nout=10, init=init_norm,
                     activation=Logistic(shortcut=True))]
    mlp = Model(layers=layers)

    out_list = []
    mlp.initialize(train_set)
    for x, t in train_set:
        x = mlp.fprop(x)
        out_list.append(x.get().T.copy())
    ref_output = np.vstack(out_list)

    train_set.reset()
    output = mlp.get_outputs(train_set)
    assert np.allclose(output, ref_output)
def layers(self):
    bn = True
    # the first conv layer either uses batch norm or a bias term
    if self.bn_first_layer:
        first_conv = Conv((7, 7, 96), init=Kaiming(), activation=Explin(),
                          batch_norm=bn, padding=3, strides=1)
    else:
        first_conv = Conv((7, 7, 96), init=Kaiming(), bias=Constant(0),
                          activation=Explin(), padding=3, strides=1)
    return [
        first_conv,
        Pooling(3, strides=2, padding=1),
        Conv((7, 7, 128), init=Kaiming(), activation=Explin(),
             batch_norm=bn, padding=3, strides=1),
        Pooling(3, strides=2, padding=1),
        Conv((5, 5, 256), init=Kaiming(), activation=Explin(),
             batch_norm=bn, padding=2, strides=1),
        Pooling(3, strides=2, padding=1),
        Conv((3, 3, 384), init=Kaiming(), activation=Explin(),
             batch_norm=bn, padding=1, strides=1),
        Conv((3, 3, 384), init=Kaiming(), activation=Explin(),
             batch_norm=bn, padding=1, strides=1),
        Conv((3, 3, 384), init=Kaiming(), activation=Explin(),
             batch_norm=bn, padding=1, strides=1),
        Pooling(3, strides=2, padding=1, op='avg'),
        Affine(nout=self.noutputs, init=Kaiming(), activation=Explin(),
               batch_norm=bn),
        Affine(nout=self.noutputs, init=Kaiming(), activation=Explin(),
               batch_norm=bn),
        Affine(nout=self.noutputs, init=Kaiming(), bias=Constant(0),
               activation=Softmax() if self.use_softmax else Logistic(shortcut=True))
    ]
def test_model_get_outputs_rnn(backend_default, data):
    data_path = load_text('ptb-valid', path=data)
    data_set = Text(time_steps=50, path=data_path)

    # weight initialization
    init = Constant(0.08)

    # model initialization
    layers = [Recurrent(150, init, Logistic()),
              Affine(len(data_set.vocab), init, bias=init,
                     activation=Rectlin())]

    model = Model(layers=layers)
    output = model.get_outputs(data_set)
    assert output.shape == (data_set.ndata, data_set.seq_length,
                            data_set.nclass)
def gen_model(backend_type):
    # setup backend
    gen_backend(backend=backend_type,
                batch_size=batch_size,
                rng_seed=2,
                device_id=args.device_id,
                datatype=args.datatype)

    init_uni = Uniform(low=-0.1, high=0.1)

    # set up the model layers
    layers = []
    layers.append(Conv((5, 5, 16), init=init_uni, bias=Constant(0),
                       activation=Rectlin()))
    layers.append(Pooling(2))
    layers.append(Conv((5, 5, 32), init=init_uni, activation=Rectlin()))
    layers.append(Pooling(2))
    layers.append(Affine(nout=500, init=init_uni, activation=Rectlin()))
    layers.append(Affine(nout=10, init=init_uni,
                         activation=Logistic(shortcut=True)))

    mlp = Model(layers=layers)
    return mlp
def input_layers(self, analytics_input, init, activation, gate):
    """
    Return the input layers. Convolutional and LSTM input paths are
    currently supported.

    :return: list of input layers, or a MergeMultistream container
    """
    if self.recurrent:
        if analytics_input:
            # support analytics + content
            input_layers = MergeMultistream(
                [[LSTM(300, init, init_inner=Kaiming(), activation=activation,
                       gate_activation=gate, reset_cells=True),
                  RecurrentSum()],
                 [Affine(30, init, activation=activation)]],
                'stack')
        else:
            # content only
            input_layers = [LSTM(300, init, init_inner=Kaiming(),
                                 activation=activation, gate_activation=gate,
                                 reset_cells=True),
                            RecurrentSum()]
    else:
        if analytics_input:
            # support analytics + content
            input_layers = MergeMultistream(
                [self.conv_net(activation),
                 [Affine(30, init, activation=Logistic())]],
                'stack')
        else:
            # content only
            input_layers = self.conv_net(activation)
    return input_layers
def create_network(stage_depth):
    if stage_depth in (18,):
        stages = (2, 2, 2, 2)
    elif stage_depth in (34, 50):
        stages = (3, 4, 6, 3)
    elif stage_depth in (68, 101):
        stages = (3, 4, 23, 3)
    elif stage_depth in (102, 152):
        stages = (3, 8, 36, 3)
    else:
        raise ValueError('Invalid stage_depth value {}'.format(stage_depth))

    bottleneck = False
    if stage_depth in (50, 101, 152):
        bottleneck = True

    layers = [Conv(**conv_params(7, 64, strides=2)), Pooling(3, strides=2)]

    # Structure of the deep residual part of the network:
    # stage_depth modules of 2 convolutional layers each at feature map depths
    # of 64, 128, 256, 512
    nfms = list(itt.chain.from_iterable(
        [itt.repeat(2**(x + 6), r) for x, r in enumerate(stages)]))
    strides = [-1] + [1 if cur == prev else 2
                      for cur, prev in zip(nfms[1:], nfms[:-1])]

    for nfm, stride in zip(nfms, strides):
        layers.append(module_factory(nfm, bottleneck, stride))

    layers.append(Pooling('all', op='avg'))
    layers.append(Affine(10, init=Kaiming(local=False),
                         batch_norm=True, activation=Rectlin()))
    layers.append(Affine(1, init=Kaiming(local=False), activation=Logistic()))

    return Model(layers=layers), GeneralizedCost(costfunc=CrossEntropyBinary())
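# A hedged usage sketch (not from the original source): build a ResNet-50
# style network and its cost. The data pipeline, conv_params, and
# module_factory helpers are assumed to be defined as above.
model, cost = create_network(stage_depth=50)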
def build(self):
    """
    Build the model's layers.
    """
    first_layer_dens = 64
    second_layer_dens = 64
    output_layer_dens = 2

    # setup weight initialization function
    init_norm = Gaussian(scale=0.01)

    # setup model layers
    layers = [Affine(nout=first_layer_dens, init=init_norm,
                     activation=Rectlin()),
              Affine(nout=second_layer_dens, init=init_norm,
                     activation=Rectlin()),
              Affine(nout=output_layer_dens, init=init_norm,
                     activation=Logistic(shortcut=True))]

    # initialize model object
    self.model = Model(layers=layers)
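# A minimal sketch of how this class might be driven (assumptions: the
# enclosing class name `net`, and the data arrays X and y, are hypothetical).
# net.build()
# train_set = ArrayIterator(X, y, nclass=2)
# net.model.fit(train_set,
#               optimizer=GradientDescentMomentum(0.1, momentum_coef=0.9),
#               cost=GeneralizedCost(costfunc=CrossEntropyBinary()),
#               num_epochs=10, callbacks=Callbacks(net.model))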
def test_model_get_outputs(backend_default, data):
    dataset = MNIST(path=data)
    train_set = dataset.train_iter

    init_norm = Gaussian(loc=0.0, scale=0.1)
    layers = [Affine(nout=20, init=init_norm, bias=init_norm,
                     activation=Rectlin()),
              Affine(nout=10, init=init_norm,
                     activation=Logistic(shortcut=True))]
    mlp = Model(layers=layers)

    out_list = []
    mlp.initialize(train_set)
    for x, t in train_set:
        x = mlp.fprop(x)
        out_list.append(x.get().T.copy())
    ref_output = np.vstack(out_list)

    train_set.reset()
    output = mlp.get_outputs(train_set)
    assert np.allclose(output, ref_output[:output.shape[0], :])

    # test model benchmark inference
    mlp.benchmark(train_set, inference=True, niterations=5)
def layers(self):
    init_uni = Uniform(low=-0.1, high=0.1)
    bn = False
    return [
        Conv((5, 5, 16), init=init_uni, activation=Rectlin(), batch_norm=bn),
        Pooling((2, 2)),
        Conv((5, 5, 32), init=init_uni, activation=Rectlin(), batch_norm=bn),
        Pooling((2, 2)),
        Affine(nout=500, init=init_uni, activation=Rectlin(), batch_norm=bn),
        Affine(nout=self.noutputs, init=init_uni, bias=Constant(0),
               activation=Softmax() if self.use_softmax else Logistic(shortcut=True))
    ]
def test_multi_optimizer(backend_default_mkl):
    """
    A test for MultiOptimizer.
    """
    opt_gdm = GradientDescentMomentum(learning_rate=0.001,
                                      momentum_coef=0.9, wdecay=0.005)
    opt_ada = Adadelta()
    opt_adam = Adam()
    opt_rms = RMSProp()
    opt_rms_1 = RMSProp(gradient_clip_value=5)
    init_one = Gaussian(scale=0.01)

    l1 = Conv((11, 11, 64), strides=4, padding=3,
              init=init_one, bias=Constant(0), activation=Rectlin())
    l2 = Affine(nout=4096, init=init_one,
                bias=Constant(1), activation=Rectlin())
    l3 = LSTM(output_size=1000, init=init_one,
              activation=Logistic(), gate_activation=Tanh())
    l4 = GRU(output_size=100, init=init_one,
             activation=Logistic(), gate_activation=Tanh())
    layers = [l1, l2, l3, l4]
    layer_list = []
    for layer in layers:
        if isinstance(layer, list):
            layer_list.extend(layer)
        else:
            layer_list.append(layer)
    for l in layer_list:
        l.configure(in_obj=(16, 28, 28))
        l.allocate()

    # separate layer_list into two: the last two recurrent layers and the rest
    layer_list1, layer_list2 = layer_list[:-2], layer_list[-2:]
    opt = MultiOptimizer({'default': opt_gdm,
                          'Bias': opt_ada,
                          'Convolution': opt_adam,
                          'Convolution_bias': opt_adam,
                          'Linear': opt_rms,
                          'LSTM': opt_rms_1,
                          'GRU': opt_rms_1})

    layers_to_optimize1 = [l for l in layer_list1
                           if isinstance(l, ParameterLayer)]
    layers_to_optimize2 = [l for l in layer_list2
                           if isinstance(l, ParameterLayer)]

    opt.optimize(layers_to_optimize1, 0)
    assert opt.map_list[opt_adam][0].__class__.__name__ == 'Convolution_bias'
    assert opt.map_list[opt_rms][0].__class__.__name__ == 'Linear'

    opt.optimize(layers_to_optimize2, 0)
    assert opt.map_list[opt_rms_1][0].__class__.__name__ == 'LSTM'
    assert opt.map_list[opt_rms_1][1].__class__.__name__ == 'GRU'
parser = NeonArgparser(__doc__)
args = parser.parse_args()

dataset = MNIST(path=args.data_dir)
train_set = dataset.train_iter
valid_set = dataset.valid_iter

# weight initialization
init_norm = Gaussian(loc=0.0, scale=0.01)

# initialize model
layers = []
layers.append(Affine(nout=100, init=init_norm, bias=Constant(0),
                     activation=Rectlin()))
layers.append(Affine(nout=10, init=init_norm, bias=Constant(0),
                     activation=Logistic(shortcut=True),
                     name='special_linear'))

cost = GeneralizedCost(costfunc=CrossEntropyBinary())
mlp = Model(layers=layers)

# fit and validate
optimizer_one = GradientDescentMomentum(learning_rate=0.1, momentum_coef=0.9)
optimizer_two = RMSProp()

# all bias layers and the last linear layer will use optimizer_two;
# all other layers will use optimizer_one
opt = MultiOptimizer({'default': optimizer_one,
                      'Bias': optimizer_two,
                      'special_linear': optimizer_two})
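# Hedged completion (not part of the original excerpt; it assumes the
# standard neon fit pattern used in the other MNIST examples here).
callbacks = Callbacks(mlp, eval_set=valid_set, **args.callback_args)
mlp.fit(train_set, optimizer=opt, num_epochs=args.epochs,
        cost=cost, callbacks=callbacks)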
print "Sentence Length - ", sentence_length print "# of train sentences", X_train.shape[0] print "# of test sentence", X_test.shape[0] train_set = ArrayIterator(X_train, y_train, nclass=2) valid_set = ArrayIterator(X_test, y_test, nclass=2) # weight initialization uni = Uniform(low=-0.1 / embedding_dim, high=0.1 / embedding_dim) g_uni = GlorotUniform() if args.rlayer_type == 'lstm': rlayer = LSTM(hidden_size, g_uni, activation=Tanh(), gate_activation=Logistic(), reset_cells=True) elif args.rlayer_type == 'bilstm': rlayer = DeepBiLSTM(hidden_size, g_uni, activation=Tanh(), depth=1, gate_activation=Logistic(), reset_cells=True) elif args.rlayer_type == 'rnn': rlayer = Recurrent(hidden_size, g_uni, activation=Tanh(), reset_cells=True) elif args.rlayer_type == 'birnn': rlayer = DeepBiRNN(hidden_size, g_uni, activation=Tanh(), depth=1,
def test_model_serialize(backend):
    (X_train, y_train), (X_test, y_test), nclass = load_mnist()
    train_set = DataIterator([X_train, X_train], y_train,
                             nclass=nclass, lshape=(1, 28, 28))

    init_norm = Gaussian(loc=0.0, scale=0.01)

    # initialize model
    path1 = [Conv((5, 5, 16), init=init_norm, bias=Constant(0),
                  activation=Rectlin()),
             Pooling(2),
             Affine(nout=20, init=init_norm, bias=init_norm,
                    activation=Rectlin())]
    path2 = [Dropout(keep=0.5),
             Affine(nout=20, init=init_norm, bias=init_norm,
                    activation=Rectlin())]
    layers = [MergeConcat([path1, path2]),
              Affine(nout=20, init=init_norm, bias=init_norm,
                     activation=Rectlin()),
              BatchNorm(),
              Affine(nout=10, init=init_norm,
                     activation=Logistic(shortcut=True))]

    tmp_save = 'test_model_serialize_tmp_save.pickle'
    mlp = Model(layers=layers)
    mlp.optimizer = GradientDescentMomentum(learning_rate=0.1,
                                            momentum_coef=0.9)
    mlp.cost = GeneralizedCost(costfunc=CrossEntropyBinary())

    n_test = 3
    num_epochs = 3

    # train model for num_epochs and n_test batches
    for epoch in range(num_epochs):
        for i, (x, t) in enumerate(train_set):
            x = mlp.fprop(x)
            delta = mlp.cost.get_errors(x, t)
            mlp.bprop(delta)
            mlp.optimizer.optimize(mlp.layers_to_optimize, epoch=epoch)
            if i > n_test:
                break

    # get expected outputs of n_test batches and states of all layers
    outputs_exp = []
    pdicts_exp = [l.get_params_serialize() for l in mlp.layers_to_optimize]
    for i, (x, t) in enumerate(train_set):
        outputs_exp.append(mlp.fprop(x, inference=True))
        if i > n_test:
            break

    # serialize model
    save_obj(mlp.serialize(keep_states=True), tmp_save)

    # load model
    mlp = Model(layers=layers)
    mlp.load_weights(tmp_save)

    outputs = []
    pdicts = [l.get_params_serialize() for l in mlp.layers_to_optimize]
    for i, (x, t) in enumerate(train_set):
        outputs.append(mlp.fprop(x, inference=True))
        if i > n_test:
            break

    # check outputs, states, and params are the same
    for output, output_exp in zip(outputs, outputs_exp):
        assert np.allclose(output.get(), output_exp.get())

    for pd, pd_exp in zip(pdicts, pdicts_exp):
        for s, s_e in zip(pd['states'], pd_exp['states']):
            if isinstance(s, list):  # this is the batch norm case
                for _s, _s_e in zip(s, s_e):
                    assert np.allclose(_s, _s_e)
            else:
                assert np.allclose(s, s_e)
        for p, p_e in zip(pd['params'], pd_exp['params']):
            if isinstance(p, list):  # this is the batch norm case
                for _p, _p_e in zip(p, p_e):
                    assert np.allclose(_p, _p_e)
            else:
                assert np.allclose(p, p_e)

    os.remove(tmp_save)
# load data
train_set = ImageCaption(path=data_path, max_images=-1)

# weight initialization
init = Uniform(low=-0.08, high=0.08)
init2 = Constant(val=train_set.be.array(train_set.bias_init))

# model initialization
image_path = Sequential([Affine(hidden_size, init, bias=Constant(val=0.0))])
sent_path = Sequential([Affine(hidden_size, init, linear_name='sent')])

layers = [
    MergeMultistream(layers=[image_path, sent_path], merge="recurrent"),
    Dropout(keep=0.5),
    LSTM(hidden_size, init, activation=Logistic(),
         gate_activation=Tanh(), reset_cells=True),
    Affine(train_set.vocab_size, init, bias=init2, activation=Softmax())
]

cost = GeneralizedCostMask(costfunc=CrossEntropyMulti(usebits=True))

# configure callbacks
checkpoint_model_path = "~/image_caption2.pickle"
if args.callback_args['save_path'] is None:
    args.callback_args['save_path'] = checkpoint_model_path

if args.callback_args['serialize'] is None:
    args.callback_args['serialize'] = 1

model = Model(layers=layers)
def check_lstm(seq_len, input_size, hidden_size, batch_size,
               init_func, inp_moms=[0.0, 1.0]):
    # init_func is the initializer for the model params
    # inp_moms is the [mean, std dev] of the random input
    input_shape = (input_size, seq_len * batch_size)
    hidden_shape = (hidden_size, seq_len * batch_size)
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    # neon LSTM
    lstm = LSTM(hidden_size, init_func,
                activation=Tanh(), gate_activation=Logistic())

    inp = np.random.rand(*input_shape) * inp_moms[1] + inp_moms[0]
    inpa = lstm.be.array(inp)

    # run neon fprop
    lstm.fprop(inpa)

    # reference numpy LSTM
    lstm_ref = RefLSTM()
    WLSTM = lstm_ref.init(input_size, hidden_size)

    # make the ref weights and biases match the neon model
    WLSTM[0, :] = lstm.b.get().T
    WLSTM[1:input_size + 1, :] = lstm.W_input.get().T
    WLSTM[input_size + 1:] = lstm.W_recur.get().T

    # transpose input X and do fprop
    inp_ref = inp.copy().T.reshape(seq_len, batch_size, input_size)
    (Hout_ref, cprev, hprev, batch_cache) = lstm_ref.forward(inp_ref, WLSTM)

    # the output needs a transpose as well
    Hout_ref = Hout_ref.reshape(seq_len * batch_size, hidden_size).T
    IFOGf_ref = batch_cache['IFOGf'].reshape(seq_len * batch_size,
                                             hidden_size * 4).T
    Ct_ref = batch_cache['Ct'].reshape(seq_len * batch_size, hidden_size).T

    # compare results
    print('====Verifying IFOG====')
    allclose_with_out(lstm.ifog_buffer.get(), IFOGf_ref, rtol=0.0, atol=1.0e-5)
    print('====Verifying cell states====')
    allclose_with_out(lstm.c_act_buffer.get(), Ct_ref, rtol=0.0, atol=1.0e-5)
    print('====Verifying hidden states====')
    allclose_with_out(lstm.h_buffer.get(), Hout_ref, rtol=0.0, atol=1.0e-5)
    print('fprop is verified')

    # now test the bprop: generate a random deltas tensor
    deltas = np.random.randn(*hidden_shape)
    lstm.bprop(lstm.be.array(deltas))

    # grab the delta W from the gradient buffer
    dWinput_neon = lstm.dW_input.get()
    dWrecur_neon = lstm.dW_recur.get()
    db_neon = lstm.db.get()

    deltas_ref = deltas.copy().T.reshape(seq_len, batch_size, hidden_size)
    (dX_ref, dWLSTM_ref, dc0_ref, dh0_ref) = lstm_ref.backward(deltas_ref,
                                                               batch_cache)
    dWrecur_ref = dWLSTM_ref[-hidden_size:, :]
    dWinput_ref = dWLSTM_ref[1:input_size + 1, :]
    db_ref = dWLSTM_ref[0, :]
    dX_ref = dX_ref.reshape(seq_len * batch_size, input_size).T

    # compare results
    print('Making sure the neon LSTM matches the numpy LSTM in bprop')
    print('====Verifying update on W_recur====')
    assert allclose_with_out(dWrecur_neon, dWrecur_ref.T,
                             rtol=0.0, atol=1.0e-5)
    print('====Verifying update on W_input====')
    assert allclose_with_out(dWinput_neon, dWinput_ref.T,
                             rtol=0.0, atol=1.0e-5)
    print('====Verifying update on bias====')
    assert allclose_with_out(db_neon.flatten(), db_ref,
                             rtol=0.0, atol=1.0e-5)
    print('====Verifying output delta====')
    assert allclose_with_out(lstm.out_deltas_buffer.get(), dX_ref,
                             rtol=0.0, atol=1.0e-5)
    print('bprop is verified')
    return
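# Example invocation (a sketch, assuming a backend has already been
# generated so that NervanaObject.be is set, and that RefLSTM is in scope):
# check_lstm(seq_len=5, input_size=10, hidden_size=16, batch_size=4,
#            init_func=GlorotUniform())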
train_set = DataIteratorSequence(time_series.train, seq_len,
                                 return_sequences=return_sequences)
valid_set = DataIteratorSequence(time_series.test, seq_len,
                                 return_sequences=return_sequences)

# define weights initialization
init = GlorotUniform()  # Uniform(low=-0.08, high=0.08)

# define model: the topology differs between the two strategies
# (sequence target or not); see the continuation sketch below
if return_sequences is True:
    layers = [
        LSTM(hidden, init, activation=Logistic(),
             gate_activation=Tanh(), reset_cells=False),
        Affine(train_set.nfeatures, init, bias=init, activation=Identity())
    ]
else:
    layers = [
        LSTM(hidden, init, activation=Logistic(),
             gate_activation=Tanh(), reset_cells=True),
        RecurrentLast(),
        Affine(train_set.nfeatures, init, bias=init, activation=Identity())
    ]
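# Hedged continuation (an assumption, not from the original excerpt): a
# regression cost and optimizer in the style of neon's time-series examples.
model = Model(layers=layers)
cost = GeneralizedCost(costfunc=MeanSquared())
optimizer = RMSProp()
# model.fit(train_set, optimizer=optimizer, num_epochs=2,
#           cost=cost, callbacks=Callbacks(model, eval_set=valid_set))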
gradient_clip_value = 5

# load data and parse on character-level
ticker_task = CopyTask(seq_len_max, vec_size)
train_set = Ticker(ticker_task)

# weight initialization
init = Uniform(low=-0.08, high=0.08)

output_size = 8
N = 120  # number of memory locations
M = 8    # size of a memory location

# model initialization
layers = [
    GRU(hidden_size, init, activation=Tanh(), gate_activation=Logistic()),
    Affine(train_set.nout, init, bias=init, activation=Logistic())
]

cost = GeneralizedCostMask(costfunc=CrossEntropyBinary())

model = Model(layers=layers)

optimizer = RMSProp(gradient_clip_value=gradient_clip_value,
                    stochastic_round=args.rounding)

# configure callbacks
callbacks = Callbacks(model, **args.callback_args)

# we can use the training set as the validation set,
# since the data is synthetically generated
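# Hedged completion (not in the original excerpt; assumes the standard
# neon fit pattern and that args.epochs is provided by NeonArgparser).
model.fit(train_set, optimizer=optimizer, num_epochs=args.epochs,
          cost=cost, callbacks=callbacks)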
gradient_clip_value = 5

# download shakespeare text
data_path = load_shakespeare(path=args.data_dir)
train_path, valid_path = Text.create_valid_file(data_path)

# load data and parse on character-level
train_set = Text(time_steps, train_path)
valid_set = Text(time_steps, valid_path, vocab=train_set.vocab)

# weight initialization
init = Uniform(low=-0.08, high=0.08)

# model initialization
layers = [
    LSTM(hidden_size, init, activation=Logistic(), gate_activation=Tanh()),
    Affine(len(train_set.vocab), init, bias=init, activation=Softmax())
]

model = Model(layers=layers)

cost = GeneralizedCost(costfunc=CrossEntropyMulti(usebits=True))

optimizer = RMSProp(gradient_clip_value=gradient_clip_value,
                    stochastic_round=args.rounding)

# configure callbacks
callbacks = Callbacks(model, eval_set=valid_set, **args.callback_args)

# fit and validate
model.fit(train_set, optimizer=optimizer, num_epochs=args.epochs,
          cost=cost, callbacks=callbacks)
# The tokenizer below splits the data into individual words. It can be
# passed into the Text object during dataset creation, as seen below.
def tokenizer(s):
    return s.replace('\n', '<eos>').split()

# load data and parse on word-level
train_set = Text(time_steps, train_path,
                 tokenizer=tokenizer, onehot_input=False)
valid_set = Text(time_steps, valid_path, vocab=train_set.vocab,
                 tokenizer=tokenizer, onehot_input=False)

# weight initialization
init = Uniform(low=-0.1, high=0.1)

# model initialization
rlayer_params = {"output_size": hidden_size, "init": init,
                 "activation": Tanh(), "gate_activation": Logistic()}
if args.rlayer_type == 'lstm':
    rlayer1, rlayer2 = LSTM(**rlayer_params), LSTM(**rlayer_params)
else:
    rlayer1, rlayer2 = GRU(**rlayer_params), GRU(**rlayer_params)

layers = [
    LookupTable(vocab_size=len(train_set.vocab),
                embedding_dim=hidden_size, init=init),
    rlayer1,
    rlayer2,
    Affine(len(train_set.vocab), init, bias=init, activation=Softmax())
]

cost = GeneralizedCost(costfunc=CrossEntropyMulti(usebits=True))

model = Model(layers=layers)
# hyperparameters
num_epochs = args.epochs

(X_train, y_train), (X_test, y_test), nclass = load_mnist(path=args.data_dir)
train_set = ArrayIterator([X_train, X_train], y_train,
                          nclass=nclass, lshape=(1, 28, 28))
valid_set = ArrayIterator([X_test, X_test], y_test,
                          nclass=nclass, lshape=(1, 28, 28))

# weight initialization
init_norm = Gaussian(loc=0.0, scale=0.01)

# initialize model
path1 = Sequential(layers=[Affine(nout=100, init=init_norm,
                                  activation=Rectlin()),
                           Affine(nout=100, init=init_norm,
                                  activation=Rectlin())])
path2 = Sequential(layers=[Affine(nout=100, init=init_norm,
                                  activation=Rectlin()),
                           Affine(nout=100, init=init_norm,
                                  activation=Rectlin())])

layers = [MergeMultistream(layers=[path1, path2], merge="stack"),
          Affine(nout=10, init=init_norm, activation=Logistic(shortcut=True))]

model = Model(layers=layers)
cost = GeneralizedCost(costfunc=CrossEntropyBinary())

# fit and validate
optimizer = GradientDescentMomentum(learning_rate=0.1, momentum_coef=0.9)

# configure callbacks
callbacks = Callbacks(model, eval_set=valid_set, **args.callback_args)

model.fit(train_set, cost=cost, optimizer=optimizer,
          num_epochs=num_epochs, callbacks=callbacks)
args = parser.parse_args()

# load up the mnist data set
# split into train and test sets
(X_train, y_train), (X_test, y_test), nclass = load_mnist(path=args.data_dir)

# setup a training set iterator
train_set = DataIterator(X_train, y_train, nclass=nclass)
# setup a validation data set iterator
valid_set = DataIterator(X_test, y_test, nclass=nclass)

# setup weight initialization function
init_norm = Gaussian(loc=0.0, scale=0.01)
normrelu = dict(init=init_norm, activation=Rectlin())
normsigm = dict(init=init_norm, activation=Logistic(shortcut=True))
normsoft = dict(init=init_norm, activation=Softmax())

# setup model layers
b1 = BranchNode(name="b1")
b2 = BranchNode(name="b2")
p1 = [Affine(nout=100, linear_name="m_l1", **normrelu),
      b1,
      Affine(nout=32, linear_name="m_l2", **normrelu),
      Affine(nout=16, linear_name="m_l3", **normrelu),
      b2,
      Affine(nout=10, linear_name="m_l4", **normsoft)]
p2 = [b1,
def test_reshape_layer_model(backend_default, fargs):
    """
    test cases:
    - conv before RNNs
    - conv after RNNs
    - conv after LUT
    """
    np.random.seed(seed=0)
    nin, nout, bsz = fargs

    be = backend_default
    be.bsz = bsz
    input_size = (nin, be.bsz)

    init = Uniform(-0.1, 0.1)
    g_uni = GlorotUniform()

    inp_np = np.random.rand(nin, be.bsz)
    delta_np = np.random.rand(nout, be.bsz)

    inp = be.array(inp_np)
    delta = be.array(delta_np)

    conv_lut_1 = [
        LookupTable(vocab_size=2000, embedding_dim=400, init=init),
        Reshape(reshape=(4, 100, -1)),
        Conv((3, 3, 16), init=init),
        LSTM(64, g_uni, activation=Tanh(),
             gate_activation=Logistic(), reset_cells=True),
        RecurrentSum(),
        Affine(nout, init, bias=init, activation=Softmax())
    ]

    conv_lut_2 = [
        LookupTable(vocab_size=1000, embedding_dim=400, init=init),
        Reshape(reshape=(4, 50, -1)),
        Conv((3, 3, 16), init=init),
        Pooling(2, strides=2),
        Affine(nout=nout, init=init, bias=init, activation=Softmax())
    ]

    conv_rnn_1 = [
        LookupTable(vocab_size=2000, embedding_dim=400, init=init),
        LSTM(64, g_uni, activation=Tanh(),
             gate_activation=Logistic(), reset_cells=True),
        Reshape(reshape=(4, 32, -1)),
        Conv((3, 3, 16), init=init),
        Affine(nout, init, bias=init, activation=Softmax())
    ]

    conv_rnn_2 = [
        LookupTable(vocab_size=2000, embedding_dim=400, init=init),
        Recurrent(64, g_uni, activation=Tanh(), reset_cells=True),
        Reshape(reshape=(4, -1, 32)),
        Conv((3, 3, 16), init=init),
        Affine(nout, init, bias=init, activation=Softmax())
    ]

    lut_sum_1 = [
        LookupTable(vocab_size=1000, embedding_dim=128, init=init),
        RecurrentSum(),
        Affine(nout=nout, init=init, bias=init, activation=Softmax())
    ]

    lut_birnn_1 = [
        LookupTable(vocab_size=1000, embedding_dim=200, init=init),
        DeepBiRNN(32, init=GlorotUniform(), batch_norm=True,
                  activation=Tanh(), reset_cells=True, depth=1),
        Reshape((4, 32, -1)),
        Conv((3, 3, 16), init=init),
        Affine(nout=nout, init=init, bias=init, activation=Softmax())
    ]

    layers_test = [conv_lut_1, conv_lut_2, conv_rnn_1,
                   conv_rnn_2, lut_sum_1, lut_birnn_1]

    for lg in layers_test:
        model = Model(layers=lg)
        cost = GeneralizedCost(costfunc=CrossEntropyBinary())
        model.initialize(input_size, cost)
        model.fprop(inp)
        model.bprop(delta)