def create_model(vocab_size, rlayer_type):
    """
    Create LSTM/GRU model for the bAbI dataset.

    Args:
        vocab_size (int): Size of the vocabulary of the bAbI data.
        rlayer_type (string): Type of recurrent layer to use (gru or lstm).

    Returns:
        Model: Model of the created network.
    """
    # recurrent layer parameters (default gru)
    rlayer_obj = GRU if rlayer_type == 'gru' else LSTM
    rlayer_params = dict(output_size=100, reset_cells=True,
                         init=GlorotUniform(), init_inner=Orthonormal(0.5),
                         activation=Tanh(), gate_activation=Logistic())

    # if using lstm, swap the activation functions
    if rlayer_type == 'lstm':
        rlayer_params.update(dict(activation=Logistic(), gate_activation=Tanh()))

    # lookup layer parameters
    lookup_params = dict(vocab_size=vocab_size, embedding_dim=50,
                         init=Uniform(-0.05, 0.05))

    # model construction
    story_path = [LookupTable(**lookup_params), rlayer_obj(**rlayer_params)]
    query_path = [LookupTable(**lookup_params), rlayer_obj(**rlayer_params)]

    layers = [MergeMultistream(layers=[story_path, query_path], merge="stack"),
              Affine(vocab_size, init=GlorotUniform(), activation=Softmax())]

    return Model(layers=layers)
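# A minimal usage sketch for create_model, assuming the neon classes used
# above are in scope. The vocabulary size below is a made-up placeholder;
# in the real script it would be derived from the parsed bAbI dataset.
babi_vocab_size = 100  # hypothetical value, not taken from real data
model_gru = create_model(vocab_size=babi_vocab_size, rlayer_type='gru')
model_lstm = create_model(vocab_size=babi_vocab_size, rlayer_type='lstm')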
def __init__(self, overlapping_classes=None, exclusive_classes=None,
             analytics_input=True, network_type='conv_net', num_words=60,
             width=100, lookup_size=0, lookup_dim=0, optimizer=Adam()):
    assert (overlapping_classes is not None) or (exclusive_classes is not None)
    self.width = width
    self.num_words = num_words
    self.overlapping_classes = overlapping_classes
    self.exclusive_classes = exclusive_classes
    self.analytics_input = analytics_input
    self.recurrent = network_type == 'lstm'
    self.lookup_size = lookup_size
    self.lookup_dim = lookup_dim

    init = GlorotUniform()
    activation = Rectlin(slope=1E-05)
    gate = Logistic()

    input_layers = self.input_layers(analytics_input, init, activation, gate)

    if self.overlapping_classes is None:
        output_layers = [Affine(len(self.exclusive_classes), init,
                                activation=Softmax())]
    elif self.exclusive_classes is None:
        output_layers = [Affine(len(self.overlapping_classes), init,
                                activation=Logistic())]
    else:
        output_branch = BranchNode(name='exclusive_overlapping')
        output_layers = Tree([[SkipNode(),
                               output_branch,
                               Affine(len(self.exclusive_classes), init,
                                      activation=Softmax())],
                              [output_branch,
                               Affine(len(self.overlapping_classes), init,
                                      activation=Logistic())]])

    layers = [input_layers,
              # this is where inputs meet, and where we may want to add depth or
              # additional functionality
              Dropout(keep=0.8),
              output_layers]
    super(ClassifierNetwork, self).__init__(layers, optimizer=optimizer)
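# A construction sketch for ClassifierNetwork (the enclosing class definition
# is not shown here). The class lists are illustrative placeholders, not taken
# from the original application.
exclusive = ['spam', 'not_spam']
overlapping = ['urgent', 'personal', 'finance']
net = ClassifierNetwork(overlapping_classes=overlapping,
                        exclusive_classes=exclusive,
                        network_type='lstm')  # or 'conv_net' (the default)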
def test_multi_optimizer(backend_default):
    opt_gdm = GradientDescentMomentum(learning_rate=0.001, momentum_coef=0.9,
                                      wdecay=0.005)
    opt_ada = Adadelta()
    opt_adam = Adam()
    opt_rms = RMSProp()
    opt_rms_1 = RMSProp(gradient_clip_value=5)
    init_one = Gaussian(scale=0.01)

    l1 = Conv((11, 11, 64), strides=4, padding=3,
              init=init_one, bias=Constant(0), activation=Rectlin())
    l2 = Affine(nout=4096, init=init_one, bias=Constant(1), activation=Rectlin())
    l3 = LSTM(output_size=1000, init=init_one,
              activation=Logistic(), gate_activation=Tanh())
    l4 = GRU(output_size=100, init=init_one,
             activation=Logistic(), gate_activation=Tanh())
    layers = [l1, l2, l3, l4]

    layer_list = []
    for layer in layers:
        if isinstance(layer, list):
            layer_list.extend(layer)
        else:
            layer_list.append(layer)

    opt = MultiOptimizer({'default': opt_gdm,
                          'Bias': opt_ada,
                          'Convolution': opt_adam,
                          'Linear': opt_rms,
                          'LSTM': opt_rms_1,
                          'GRU': opt_rms_1})

    map_list = opt._map_optimizers(layer_list)
    assert map_list[opt_adam][0].__class__.__name__ == 'Convolution'
    assert map_list[opt_ada][0].__class__.__name__ == 'Bias'
    assert map_list[opt_rms][0].__class__.__name__ == 'Linear'
    assert map_list[opt_gdm][0].__class__.__name__ == 'Activation'
    assert map_list[opt_rms_1][0].__class__.__name__ == 'LSTM'
    assert map_list[opt_rms_1][1].__class__.__name__ == 'GRU'
def test_biRNN_bprop(backend_default, fargs):
    # basic sanity check with 0 weights, random inputs
    seq_len, input_size, hidden_size, batch_size = fargs
    in_shape = (input_size, seq_len)
    NervanaObject.be.bsz = batch_size

    # setup the bi-directional rnn
    init_glorot = GlorotUniform()
    birnn = BiRNN(hidden_size, activation=Logistic(), init=init_glorot)
    birnn.configure(in_shape)
    birnn.prev_layer = True
    birnn.allocate()
    birnn.set_deltas([birnn.be.iobuf(birnn.in_shape)])

    # use the same weights for the backward direction as the forward direction
    birnn.W_input_b[:] = birnn.W_input_f
    birnn.W_recur_b[:] = birnn.W_recur_f
    birnn.b_b[:] = birnn.b_f
    birnn.dW[:] = 0

    # set up a uni-directional rnn with the same initialization for reference
    init_glorot = GlorotUniform()
    rnn = Recurrent(hidden_size, activation=Logistic(), init=init_glorot)
    rnn.configure(in_shape)
    rnn.prev_layer = True
    rnn.allocate()
    rnn.set_deltas([rnn.be.iobuf(rnn.in_shape)])

    # inputs and views
    lr = np.random.random((input_size, seq_len * batch_size))
    lr_rev = list(reversed(get_steps(lr.copy(), in_shape)))
    rl = con(lr_rev, axis=1)

    # allocate gpu buffers
    inp_lr = birnn.be.array(lr)
    inp_rl = birnn.be.array(rl)

    # outputs
    out_lr_g = birnn.fprop(inp_lr)
    del_lr = birnn.bprop(out_lr_g).get().copy()
    birnn.h_buffer[:] = 0
    out_rl_g = birnn.fprop(inp_rl)
    del_rl = birnn.bprop(out_rl_g).get().copy()

    del_lr_s = get_steps(del_lr, in_shape)
    del_rl_s = get_steps(del_rl, in_shape)

    for (x, y) in zip(del_lr_s, reversed(del_rl_s)):
        assert np.allclose(x, y, rtol=0.0, atol=1.0e-5)
def __init__(self, backend, dataset, subj):
    ad = {'type': 'adadelta',
          'lr_params': {'rho': 0.9, 'epsilon': 1e-9}}
    self.layers = []
    self.add(DataLayer(is_local=True, nofm=dataset.nchannels,
                       ofmshape=[1, dataset.nsamples]))
    self.add(ConvLayer(nofm=64, fshape=[1, 3],
                       activation=RectLin(), lrule_init=ad))
    self.add(PoolingLayer(op='max', fshape=[1, 2], stride=2))
    if subj != 2:
        self.add(FCLayer(nout=128, activation=RectLin(), lrule_init=ad))
    self.add(FCLayer(nout=dataset.nclasses, activation=Logistic(),
                     lrule_init=ad))
    self.add(CostLayer(cost=CrossEntropy()))
    self.model = MLP(num_epochs=1, batch_size=128, layers=self.layers)
    self.dataset = dataset
def test_logistic_derivative(backend_default):
    # the derivative is evaluated on the logistic output:
    # sigma'(x) = sigma(x) * (1 - sigma(x))
    inputs = np.array([0, 1, -2]).reshape((3, 1))
    inputs = 1.0 / (1.0 + np.exp(-inputs))
    outputs = inputs * (1.0 - inputs)
    compare_tensors(Logistic(shortcut=False),
                    inputs, outputs, deriv=True, tol=1e-7)
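# A quick pure-numpy check of the identity the test above relies on,
# sigma'(x) = sigma(x) * (1 - sigma(x)), using a central finite difference.
# This is independent of neon; the sample points and epsilon are illustrative.
def _sigmoid(v):
    return 1.0 / (1.0 + np.exp(-v))

_x = np.array([0.0, 1.0, -2.0])
_eps = 1e-6
_numeric = (_sigmoid(_x + _eps) - _sigmoid(_x - _eps)) / (2 * _eps)
_analytic = _sigmoid(_x) * (1.0 - _sigmoid(_x))
assert np.allclose(_numeric, _analytic, atol=1e-8)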
def gradient_calc(seq_len, input_size, hidden_size, batch_size,
                  epsilon=None, rand_scale=None, inp_bl=None):
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    input_shape = (input_size, seq_len * batch_size)

    # generate input if one is not given
    if inp_bl is None:
        inp_bl = np.random.randn(*input_shape)

    # neon gru instance
    gru = GRU(hidden_size, init=Gaussian(), activation=Tanh(),
              gate_activation=Logistic())
    inpa = gru.be.array(np.copy(inp_bl))

    # run fprop on the baseline input
    gru.configure((input_size, seq_len))
    gru.prev_layer = True
    gru.allocate()
    gru.set_deltas([gru.be.iobuf(gru.in_shape)])
    out_bl = gru.fprop(inpa).get()

    # random scaling/hash to generate fake loss
    if rand_scale is None:
        rand_scale = np.random.random(out_bl.shape) * 2.0 - 1.0
    # loss function would be:
    # loss_bl = np.sum(rand_scale * out_bl)

    # run back prop with rand_scale as the errors
    # use copy to avoid any interactions
    deltas_neon = gru.bprop(gru.be.array(np.copy(rand_scale))).get()

    # add a perturbation to each input element
    grads_est = np.zeros(inpa.shape)
    inp_pert = inp_bl.copy()
    for pert_ind in range(inpa.size):
        save_val = inp_pert.flat[pert_ind]

        inp_pert.flat[pert_ind] = save_val + epsilon
        reset_gru(gru)
        gru.allocate()
        out_pos = gru.fprop(gru.be.array(inp_pert)).get()

        inp_pert.flat[pert_ind] = save_val - epsilon
        reset_gru(gru)
        gru.allocate()
        out_neg = gru.fprop(gru.be.array(inp_pert)).get()

        # calculate the loss with perturbations
        loss_pos = np.sum(rand_scale * out_pos)
        loss_neg = np.sum(rand_scale * out_neg)

        # compute the gradient estimate
        grad = 0.5 / float(epsilon) * (loss_pos - loss_neg)
        grads_est.flat[pert_ind] = grad

        # reset the perturbed input element
        inp_pert.flat[pert_ind] = save_val

    del gru
    return (grads_est, deltas_neon)
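# A sketch of how gradient_calc is typically consumed, assuming a backend has
# already been generated: the finite-difference estimate should agree with the
# analytic bprop deltas. The dimensions and tolerance below are illustrative,
# not taken from the original test suite.
grads_est, deltas_neon = gradient_calc(seq_len=5, input_size=3, hidden_size=4,
                                       batch_size=2, epsilon=1e-5)
assert np.allclose(grads_est, deltas_neon, rtol=0.0, atol=1e-3)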
def __init__(self):
    self.in_shape = [1024, (2538, 38)]
    init = Constant(0)
    image_path = Sequential([Affine(20, init, bias=init),
                             Affine(10, init, bias=init)])
    sent_path = Sequential([Affine(30, init, bias=init),
                            Affine(10, init)])
    layers = [
        MergeMultistream(layers=[image_path, sent_path], merge="recurrent"),
        Dropout(keep=0.5),
        LSTM(4, init, activation=Logistic(), gate_activation=Tanh(),
             reset_cells=True),
        Affine(20, init, bias=init, activation=Softmax())
    ]
    self.layers = layers
    self.cost = GeneralizedCostMask(CrossEntropyMulti())
    self.model = Model(layers=layers)
    self.model.initialize(self.in_shape, cost=self.cost)
def layers(self):
    init_uni = Uniform(low=-0.1, high=0.1)
    bn = False
    return [
        DOG((5.0, 4.0, 3.0, 1.6), 1.8),
        Conv((5, 5, 16), init=init_uni, activation=Rectlin(), batch_norm=bn),
        Pooling((2, 2)),
        Conv((5, 5, 32), init=init_uni, activation=Rectlin(), batch_norm=bn),
        Pooling((2, 2)),
        Affine(nout=500, init=init_uni, activation=Rectlin(), batch_norm=bn),
        Affine(nout=self.noutputs, init=init_uni, bias=Constant(0),
               activation=Softmax() if self.use_softmax else Logistic(shortcut=True))
    ]
def test_model_get_outputs_rnn(backend_default, data):
    data_path = load_ptb_test(path=data)
    data_set = Text(time_steps=50, path=data_path)

    # weight initialization
    init = Constant(0.08)

    # model initialization
    layers = [
        Recurrent(150, init, activation=Logistic()),
        Affine(len(data_set.vocab), init, bias=init, activation=Rectlin())
    ]

    model = Model(layers=layers)
    output = model.get_outputs(data_set)

    assert output.shape == (data_set.ndata, data_set.seq_length, data_set.nclass)

    # since the weights are all constant and the model is untrained:
    # along the feature dim, the values should all be the same
    assert np.allclose(output[0, 0], output[0, 0, 0], rtol=0, atol=1e-5)
    assert np.allclose(output[0, 1], output[0, 1, 0], rtol=0, atol=1e-5)

    # along the time dim, the values should be increasing:
    assert np.alltrue(output[0, 2] > output[0, 1])
    assert np.alltrue(output[0, 1] > output[0, 0])
def main(args):
    # load up the mnist data set
    dataset = MNIST(path=args.data_dir)

    # initialize model object
    mlp = Model(layers=[
        Affine(nout=100, init=Gaussian(loc=0.0, scale=0.01),
               activation=Rectlin()),
        Affine(nout=10, init=Gaussian(loc=0.0, scale=0.01),
               activation=Logistic(shortcut=True))
    ])

    # setup optimizer
    optimizer = GradientDescentMomentum(0.1, momentum_coef=0.9,
                                        stochastic_round=args.rounding)

    # configure callbacks
    callbacks = Callbacks(mlp, eval_set=dataset.valid_iter,
                          **args.callback_args)

    # run fit, with binary cross-entropy as the cost function
    mlp.fit(dataset.train_iter, optimizer=optimizer, num_epochs=args.epochs,
            cost=GeneralizedCost(costfunc=CrossEntropyBinary()),
            callbacks=callbacks)

    error_rate = mlp.eval(dataset.valid_iter, metric=Misclassification())
    neon_logger.display('Classification accuracy = %.4f' % (1 - error_rate))
def build_model(self):
    # setup weight initialization function
    init_norm = Gaussian(loc=0.0, scale=0.01)

    # setup model layers
    layers = [
        Affine(nout=100, init=init_norm, bias=Uniform(), activation=Rectlin()),
        Affine(nout=10, init=init_norm, bias=Uniform(),
               activation=Logistic(shortcut=True))
    ]

    # setup cost function as binary cross-entropy
    self.cost = GeneralizedCost(costfunc=CrossEntropyBinary())

    # setup optimizer
    self.optimizer = GradientDescentMomentum(
        0.1, momentum_coef=0.9, stochastic_round=self.args.rounding)

    # initialize model object
    self.model = ModelDist(layers=layers)
def test_biLSTM_bprop(backend_default, fargs):
    # basic sanity check with 0 weights, random inputs
    seq_len, input_size, hidden_size, batch_size = fargs
    in_shape = (input_size, seq_len)
    out_shape = (hidden_size, seq_len)
    NervanaObject.be.bsz = batch_size

    # setup the bi-directional lstm
    init_glorot = GlorotUniform()
    bilstm = BiLSTM(hidden_size, gate_activation=Logistic(),
                    activation=Tanh(), init=init_glorot, reset_cells=True)
    bilstm.configure(in_shape)
    bilstm.prev_layer = True
    bilstm.allocate()
    bilstm.set_deltas([bilstm.be.iobuf(bilstm.in_shape)])

    # use the same weights for the forward and backward directions
    nout = hidden_size
    bilstm.W_input_b[:] = bilstm.W_input_f
    bilstm.W_recur_b[:] = bilstm.W_recur_f
    bilstm.b_b[:] = bilstm.b_f
    bilstm.dW[:] = 0

    # inputs and views
    lr = np.random.random((input_size, seq_len * batch_size))
    lr_rev = list(reversed(get_steps(lr.copy(), in_shape)))
    rl = con(lr_rev, axis=1)

    # allocate gpu buffers
    inp_lr = bilstm.be.array(lr)
    inp_rl = bilstm.be.array(rl)

    # outputs
    out_lr_g = bilstm.fprop(inp_lr)
    out_lr = out_lr_g.get().copy()
    del_lr = bilstm.bprop(out_lr_g).get().copy()
    bilstm.h_buffer[:] = 0
    out_rl_g = bilstm.fprop(inp_rl)
    out_rl = out_rl_g.get().copy()
    del_rl = bilstm.bprop(out_rl_g).get().copy()

    # views
    out_lr_f_s = get_steps(out_lr[:nout], out_shape)
    out_lr_b_s = get_steps(out_lr[nout:], out_shape)
    out_rl_f_s = get_steps(out_rl[:nout], out_shape)
    out_rl_b_s = get_steps(out_rl[nout:], out_shape)

    # asserts
    for x_f, x_b, y_f, y_b in zip(out_lr_f_s, out_lr_b_s,
                                  reversed(out_rl_f_s),
                                  reversed(out_rl_b_s)):
        assert np.allclose(x_f, y_b, rtol=0.0, atol=1.0e-5)
        assert np.allclose(x_b, y_f, rtol=0.0, atol=1.0e-5)

    del_lr_s = get_steps(del_lr, in_shape)
    del_rl_s = get_steps(del_rl, in_shape)

    for (x, y) in zip(del_lr_s, reversed(del_rl_s)):
        assert np.allclose(x, y, rtol=0.0, atol=1.0e-5)
def gradient_calc(seq_len, input_size, hidden_size, batch_size,
                  epsilon=None, rand_scale=None, inp_bl=None):
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    input_shape = (input_size, seq_len * batch_size)

    # generate input if one is not given
    if inp_bl is None:
        inp_bl = np.random.randn(*input_shape)

    # neon lstm instance
    lstm = LSTM(hidden_size, Gaussian(), Tanh(), Logistic())
    inpa = lstm.be.array(np.copy(inp_bl))

    # run fprop on the baseline input
    out_bl = lstm.fprop(inpa).get()

    # random scaling/hash to generate fake loss
    if rand_scale is None:
        rand_scale = np.random.random(out_bl.shape) * 2.0 - 1.0
    # loss function would be:
    # loss_bl = np.sum(rand_scale * out_bl)

    # run back prop with rand_scale as the errors
    # use copy to avoid any interactions
    deltas_neon = lstm.bprop(lstm.be.array(np.copy(rand_scale))).get()

    # add a perturbation to each input element
    grads_est = np.zeros(inpa.shape)
    inp_pert = inp_bl.copy()
    for pert_ind in range(inpa.size):
        save_val = inp_pert.flat[pert_ind]

        inp_pert.flat[pert_ind] = save_val + epsilon
        reset_lstm(lstm)
        out_pos = lstm.fprop(lstm.be.array(inp_pert)).get()

        inp_pert.flat[pert_ind] = save_val - epsilon
        reset_lstm(lstm)
        out_neg = lstm.fprop(lstm.be.array(inp_pert)).get()

        # calculate the loss with perturbations
        loss_pos = np.sum(rand_scale * out_pos)
        loss_neg = np.sum(rand_scale * out_neg)

        # compute the gradient estimate
        grad = 0.5 * (loss_pos - loss_neg) / epsilon
        grads_est.flat[pert_ind] = grad

        # reset the perturbed input element
        inp_pert.flat[pert_ind] = save_val

    del lstm
    return (grads_est, deltas_neon)
def create_model(nin):
    layers = []
    layers.append(DataLayer(nout=nin))
    layers.append(FCLayer(nout=100, activation=RectLin()))
    layers.append(FCLayer(nout=10, activation=Logistic()))
    layers.append(CostLayer(cost=CrossEntropy()))

    model = MLP(num_epochs=10, batch_size=128, layers=layers)
    return model
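# Usage sketch for the (older neon API) create_model above. 784 is the
# flattened input size for 28x28 MNIST images; any other input width would
# work the same way.
mnist_mlp = create_model(nin=784)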
def test_multi_optimizer(backend_default_mkl):
    """A test for MultiOptimizer."""
    opt_gdm = GradientDescentMomentum(learning_rate=0.001,
                                      momentum_coef=0.9, wdecay=0.005)
    opt_ada = Adadelta()
    opt_adam = Adam()
    opt_rms = RMSProp()
    opt_rms_1 = RMSProp(gradient_clip_value=5)
    init_one = Gaussian(scale=0.01)

    l1 = Conv((11, 11, 64), strides=4, padding=3,
              init=init_one, bias=Constant(0), activation=Rectlin())
    l2 = Affine(nout=4096, init=init_one, bias=Constant(1), activation=Rectlin())
    l3 = LSTM(output_size=1000, init=init_one,
              activation=Logistic(), gate_activation=Tanh())
    l4 = GRU(output_size=100, init=init_one,
             activation=Logistic(), gate_activation=Tanh())
    layers = [l1, l2, l3, l4]

    layer_list = []
    for layer in layers:
        if isinstance(layer, list):
            layer_list.extend(layer)
        else:
            layer_list.append(layer)
    for l in layer_list:
        l.configure(in_obj=(16, 28, 28))
        l.allocate()

    # separate layer_list into two: the last two recurrent layers and the rest
    layer_list1, layer_list2 = layer_list[:-2], layer_list[-2:]

    opt = MultiOptimizer({'default': opt_gdm,
                          'Bias': opt_ada,
                          'Convolution': opt_adam,
                          'Convolution_bias': opt_adam,
                          'Linear': opt_rms,
                          'LSTM': opt_rms_1,
                          'GRU': opt_rms_1})

    layers_to_optimize1 = [l for l in layer_list1 if isinstance(l, ParameterLayer)]
    layers_to_optimize2 = [l for l in layer_list2 if isinstance(l, ParameterLayer)]

    opt.optimize(layers_to_optimize1, 0)
    # compare strings with ==, not `is` (identity comparison is a bug here)
    assert opt.map_list[opt_adam][0].__class__.__name__ == 'Convolution_bias'
    assert opt.map_list[opt_rms][0].__class__.__name__ == 'Linear'

    opt.optimize(layers_to_optimize2, 0)
    assert opt.map_list[opt_rms_1][0].__class__.__name__ == 'LSTM'
    assert opt.map_list[opt_rms_1][1].__class__.__name__ == 'GRU'
def layers(self):
    return [
        Conv((7, 7, 96), init=Gaussian(scale=0.0001), bias=Constant(0),
             activation=Rectlin(), padding=3, strides=1),
        LRN(31, ascale=0.001, bpower=0.75),
        Pooling(3, strides=2, padding=1),
        Conv((5, 5, 256), init=Gaussian(scale=0.01), bias=Constant(0),
             activation=Rectlin(), padding=2, strides=1),
        LRN(31, ascale=0.001, bpower=0.75),
        Pooling(3, strides=2, padding=1),
        Conv((3, 3, 384), init=Gaussian(scale=0.01), bias=Constant(0),
             activation=Rectlin(), padding=1, strides=1),
        Conv((3, 3, 384), init=Gaussian(scale=0.01), bias=Constant(0),
             activation=Rectlin(), padding=1, strides=1),
        Conv((3, 3, 256), init=Gaussian(scale=0.01), bias=Constant(0),
             activation=Rectlin(), padding=1, strides=1),
        Pooling(3, strides=2, padding=1),
        Affine(nout=4096, init=Gaussian(scale=0.01), bias=Constant(0),
               activation=Identity()),
        Dropout(keep=0.5),
        Affine(nout=4096, init=Gaussian(scale=0.01), bias=Constant(0),
               activation=Identity()),
        Dropout(keep=0.5),
        Affine(nout=self.noutputs, init=Gaussian(scale=0.01), bias=Constant(0),
               activation=Softmax() if self.use_softmax else Logistic(shortcut=True))
    ]
def test_biLSTM_fprop(backend_default, fargs):
    # basic sanity check with 0 weights, random inputs
    seq_len, input_size, hidden_size, batch_size = fargs
    in_shape = (input_size, seq_len)
    out_shape = (hidden_size, seq_len)
    NervanaObject.be.bsz = batch_size

    # setup the bi-directional lstm
    init_glorot = GlorotUniform()
    bilstm = BiLSTM(hidden_size, gate_activation=Logistic(),
                    init=init_glorot, activation=Tanh(), reset_cells=True)
    bilstm.configure(in_shape)
    bilstm.prev_layer = True
    bilstm.allocate()

    # use the same weights for the forward and backward directions
    nout = hidden_size
    bilstm.W_input_b[:] = bilstm.W_input_f
    bilstm.W_recur_b[:] = bilstm.W_recur_f
    bilstm.b_b[:] = bilstm.b_f
    bilstm.dW[:] = 0

    # inputs - random and flipped left-to-right inputs
    lr = np.random.random((input_size, seq_len * batch_size))
    lr_rev = list(reversed(get_steps(lr.copy(), in_shape)))
    rl = con(lr_rev, axis=1)
    inp_lr = bilstm.be.array(lr)
    inp_rl = bilstm.be.array(rl)

    # outputs
    out_lr = bilstm.fprop(inp_lr).get().copy()
    bilstm.h_buffer[:] = 0
    out_rl = bilstm.fprop(inp_rl).get().copy()

    # views
    out_lr_f_s = get_steps(out_lr[:nout], out_shape)
    out_lr_b_s = get_steps(out_lr[nout:], out_shape)
    out_rl_f_s = get_steps(out_rl[:nout], out_shape)
    out_rl_b_s = get_steps(out_rl[nout:], out_shape)

    # asserts
    for x_f, x_b, y_f, y_b in zip(out_lr_f_s, out_lr_b_s,
                                  reversed(out_rl_f_s),
                                  reversed(out_rl_b_s)):
        assert allclose_with_out(x_f, y_b, rtol=0.0, atol=1.0e-5)
        assert allclose_with_out(x_b, y_f, rtol=0.0, atol=1.0e-5)
def test_model_N_S_setter(backend_default):
    # weight initialization
    init = Constant(0.08)

    # model initialization
    layers = [
        Recurrent(150, init, activation=Logistic()),
        Affine(100, init, bias=init, activation=Rectlin())
    ]

    model = Model(layers=layers)
    model.set_batch_size(20)
    model.set_seq_len(10)
def __init__(self):
    self.in_shape = (1, 32, 32)
    init_norm = Gaussian(loc=0.0, scale=0.01)
    normrelu = dict(init=init_norm, activation=Rectlin())
    normsigm = dict(init=init_norm, activation=Logistic(shortcut=True))
    normsoft = dict(init=init_norm, activation=Softmax())

    # setup model layers
    b1 = BranchNode(name="b1")
    b2 = BranchNode(name="b2")
    p1 = [
        Affine(nout=100, name="main1", **normrelu),
        b1,
        Affine(nout=32, name="main2", **normrelu),
        Affine(nout=160, name="main3", **normrelu),
        b2,
        Affine(nout=32, name="main2", **normrelu),
        # make next layer big to check sizing
        Affine(nout=320, name="main2", **normrelu),
        Affine(nout=10, name="main4", **normsoft)
    ]
    p2 = [
        b1,
        Affine(nout=16, name="branch1_1", **normrelu),
        Affine(nout=10, name="branch1_2", **normsigm)
    ]
    p3 = [
        b2,
        Affine(nout=16, name="branch2_1", **normrelu),
        Affine(nout=10, name="branch2_2", **normsigm)
    ]

    self.cost = Multicost(costs=[GeneralizedCost(costfunc=CrossEntropyMulti()),
                                 GeneralizedCost(costfunc=CrossEntropyBinary()),
                                 GeneralizedCost(costfunc=CrossEntropyBinary())],
                          weights=[1, 0., 0.])

    self.layers = SingleOutputTree([p1, p2, p3], alphas=[1, .2, .2])
    self.model = Model(layers=self.layers)
    self.model.initialize(self.in_shape, cost=self.cost)
def test_model_get_outputs(backend_default):
    (X_train, y_train), (X_test, y_test), nclass = load_mnist()
    train_set = DataIterator(X_train[:backend_default.bsz * 3])

    init_norm = Gaussian(loc=0.0, scale=0.1)
    layers = [
        Affine(nout=20, init=init_norm, bias=init_norm, activation=Rectlin()),
        Affine(nout=10, init=init_norm, activation=Logistic(shortcut=True))
    ]
    mlp = Model(layers=layers)

    out_list = []
    mlp.initialize(train_set)
    for x, t in train_set:
        x = mlp.fprop(x)
        out_list.append(x.get().T.copy())
    ref_output = np.vstack(out_list)

    train_set.reset()
    output = mlp.get_outputs(train_set)
    assert np.allclose(output, ref_output)
def test_model_get_outputs_rnn(backend_default, data):
    data_path = load_text('ptb-valid', path=data)
    data_set = Text(time_steps=50, path=data_path)

    # weight initialization
    init = Constant(0.08)

    # model initialization
    layers = [
        Recurrent(150, init, Logistic()),
        Affine(len(data_set.vocab), init, bias=init, activation=Rectlin())
    ]

    model = Model(layers=layers)
    output = model.get_outputs(data_set)

    assert output.shape == (data_set.ndata, data_set.seq_length, data_set.nclass)
def layers(self):
    bn = True
    first_conv = (Conv((7, 7, 96), init=Kaiming(), activation=Explin(),
                       batch_norm=bn, padding=3, strides=1)
                  if self.bn_first_layer else
                  Conv((7, 7, 96), init=Kaiming(), bias=Constant(0),
                       activation=Explin(), padding=3, strides=1))
    return [
        first_conv,
        Pooling(3, strides=2, padding=1),
        Conv((7, 7, 128), init=Kaiming(), activation=Explin(), batch_norm=bn,
             padding=3, strides=1),
        Pooling(3, strides=2, padding=1),
        Conv((5, 5, 256), init=Kaiming(), activation=Explin(), batch_norm=bn,
             padding=2, strides=1),
        Pooling(3, strides=2, padding=1),
        Conv((3, 3, 384), init=Kaiming(), activation=Explin(), batch_norm=bn,
             padding=1, strides=1),
        Conv((3, 3, 384), init=Kaiming(), activation=Explin(), batch_norm=bn,
             padding=1, strides=1),
        Conv((3, 3, 384), init=Kaiming(), activation=Explin(), batch_norm=bn,
             padding=1, strides=1),
        Pooling(3, strides=2, padding=1, op='avg'),
        Affine(nout=self.noutputs, init=Kaiming(), activation=Explin(),
               batch_norm=bn),
        Affine(nout=self.noutputs, init=Kaiming(), activation=Explin(),
               batch_norm=bn),
        Affine(nout=self.noutputs, init=Kaiming(), bias=Constant(0),
               activation=Softmax() if self.use_softmax else Logistic(shortcut=True))
    ]
def gen_model(backend_type):
    # setup backend
    gen_backend(backend=backend_type,
                batch_size=batch_size,
                rng_seed=2,
                device_id=args.device_id,
                datatype=args.datatype)

    init_uni = Uniform(low=-0.1, high=0.1)

    # set up the model layers
    layers = []
    layers.append(Conv((5, 5, 16), init=init_uni, bias=Constant(0),
                       activation=Rectlin()))
    layers.append(Pooling(2))
    layers.append(Conv((5, 5, 32), init=init_uni, activation=Rectlin()))
    layers.append(Pooling(2))
    layers.append(Affine(nout=500, init=init_uni, activation=Rectlin()))
    layers.append(Affine(nout=10, init=init_uni,
                         activation=Logistic(shortcut=True)))

    mlp = Model(layers=layers)
    return mlp
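# Usage sketch: build the same network on a chosen backend. The backend name
# is illustrative; which backends are available depends on the local neon
# installation, and gen_model also expects the module-level `batch_size` and
# `args` globals from the surrounding script to be defined.
cpu_model = gen_model('cpu')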
def input_layers(self, analytics_input, init, activation, gate):
    """
    Return the input layers. We currently support convolutional and LSTM.
    """
    if self.recurrent:
        if analytics_input:
            # support analytics + content
            input_layers = MergeMultistream(
                [[LSTM(300, init, init_inner=Kaiming(), activation=activation,
                       gate_activation=gate, reset_cells=True),
                  RecurrentSum()],
                 [Affine(30, init, activation=activation)]],
                'stack')
        else:
            # content only
            input_layers = [LSTM(300, init, init_inner=Kaiming(),
                                 activation=activation, gate_activation=gate,
                                 reset_cells=True),
                            RecurrentSum()]
    else:
        if analytics_input:
            # support analytics + content
            input_layers = MergeMultistream(
                [self.conv_net(activation),
                 [Affine(30, init, activation=Logistic())]],
                'stack')
        else:
            # content only
            input_layers = self.conv_net(activation)

    return input_layers
def create_network(stage_depth):
    if stage_depth in (18,):
        stages = (2, 2, 2, 2)
    elif stage_depth in (34, 50):
        stages = (3, 4, 6, 3)
    elif stage_depth in (68, 101):
        stages = (3, 4, 23, 3)
    elif stage_depth in (102, 152):
        stages = (3, 8, 36, 3)
    else:
        raise ValueError('Invalid stage_depth value {}'.format(stage_depth))

    bottleneck = False
    if stage_depth in (50, 101, 152):
        bottleneck = True

    layers = [Conv(**conv_params(7, 64, strides=2)), Pooling(3, strides=2)]

    # Structure of the deep residual part of the network:
    # stage_depth modules of 2 convolutional layers each at feature map depths
    # of 64, 128, 256, 512
    nfms = list(itt.chain.from_iterable(
        [itt.repeat(2**(x + 6), r) for x, r in enumerate(stages)]))
    strides = [-1] + [1 if cur == prev else 2
                      for cur, prev in zip(nfms[1:], nfms[:-1])]

    for nfm, stride in zip(nfms, strides):
        layers.append(module_factory(nfm, bottleneck, stride))

    layers.append(Pooling('all', op='avg'))
    layers.append(Affine(10, init=Kaiming(local=False),
                         batch_norm=True, activation=Rectlin()))
    layers.append(Affine(1, init=Kaiming(local=False), activation=Logistic()))

    return Model(layers=layers), GeneralizedCost(costfunc=CrossEntropyBinary())
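# Usage sketch for create_network: the depth argument selects the stage
# configuration above, and depths 50/101/152 switch on the bottleneck blocks.
# Assumes conv_params and module_factory from the surrounding script are in scope.
model, cost = create_network(stage_depth=50)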
def build(self):
    """
    Build the model's layers.
    """
    first_layer_dens = 64
    second_layer_dens = 64
    output_layer_dens = 2

    # setup weight initialization function
    init_norm = Gaussian(scale=0.01)

    # setup model layers
    layers = [
        Affine(nout=first_layer_dens, init=init_norm, activation=Rectlin()),
        Affine(nout=second_layer_dens, init=init_norm, activation=Rectlin()),
        Affine(nout=output_layer_dens, init=init_norm,
               activation=Logistic(shortcut=True))
    ]

    # initialize model object
    self.model = Model(layers=layers)
def test_model_get_outputs(backend_default, data):
    dataset = MNIST(path=data)
    train_set = dataset.train_iter

    init_norm = Gaussian(loc=0.0, scale=0.1)
    layers = [
        Affine(nout=20, init=init_norm, bias=init_norm, activation=Rectlin()),
        Affine(nout=10, init=init_norm, activation=Logistic(shortcut=True))
    ]
    mlp = Model(layers=layers)

    out_list = []
    mlp.initialize(train_set)
    for x, t in train_set:
        x = mlp.fprop(x)
        out_list.append(x.get().T.copy())
    ref_output = np.vstack(out_list)

    train_set.reset()
    output = mlp.get_outputs(train_set)
    assert np.allclose(output, ref_output[:output.shape[0], :])

    # test model benchmark inference
    mlp.benchmark(train_set, inference=True, niterations=5)
args = parser.parse_args()

# load up the mnist data set
# split into train and test sets
(X_train, y_train), (X_test, y_test), nclass = load_mnist(path=args.data_dir)

# setup a training set iterator
train_set = DataIterator(X_train, y_train, nclass=nclass)
# setup a validation data set iterator
valid_set = DataIterator(X_test, y_test, nclass=nclass)

# setup weight initialization function
init_norm = Gaussian(loc=0.0, scale=0.01)
normrelu = dict(init=init_norm, activation=Rectlin())
normsigm = dict(init=init_norm, activation=Logistic(shortcut=True))
normsoft = dict(init=init_norm, activation=Softmax())

# setup model layers
b1 = BranchNode(name="b1")
b2 = BranchNode(name="b2")

p1 = [Affine(nout=100, linear_name="m_l1", **normrelu),
      b1,
      Affine(nout=32, linear_name="m_l2", **normrelu),
      Affine(nout=16, linear_name="m_l3", **normrelu),
      b2,
      Affine(nout=10, linear_name="m_l4", **normsoft)]

p2 = [b1,
# hyperparameters
num_epochs = args.epochs

(X_train, y_train), (X_test, y_test), nclass = load_mnist(path=args.data_dir)
train_set = ArrayIterator([X_train, X_train], y_train, nclass=nclass,
                          lshape=(1, 28, 28))
valid_set = ArrayIterator([X_test, X_test], y_test, nclass=nclass,
                          lshape=(1, 28, 28))

# weight initialization
init_norm = Gaussian(loc=0.0, scale=0.01)

# initialize model
path1 = Sequential(layers=[Affine(nout=100, init=init_norm, activation=Rectlin()),
                           Affine(nout=100, init=init_norm, activation=Rectlin())])
path2 = Sequential(layers=[Affine(nout=100, init=init_norm, activation=Rectlin()),
                           Affine(nout=100, init=init_norm, activation=Rectlin())])
layers = [MergeMultistream(layers=[path1, path2], merge="stack"),
          Affine(nout=10, init=init_norm, activation=Logistic(shortcut=True))]

model = Model(layers=layers)
cost = GeneralizedCost(costfunc=CrossEntropyBinary())

# fit and validate
optimizer = GradientDescentMomentum(learning_rate=0.1, momentum_coef=0.9)

# configure callbacks
callbacks = Callbacks(model, eval_set=valid_set, **args.callback_args)

model.fit(train_set, cost=cost, optimizer=optimizer, num_epochs=num_epochs,
          callbacks=callbacks)