import numpy as np
import theano
from theano import tensor

from blocks import initialization
from blocks.bricks import Linear
from blocks.bricks.parallel import Fork
from blocks.bricks.recurrent import GatedRecurrent
from blocks.initialization import Constant


def example2():
    """GRU"""
    x = tensor.tensor3('x')
    dim = 3

    # Fork the input into the state part ("linear", dim) and the
    # concatenated update/reset gate part ("gates", 2 * dim).
    fork = Fork(input_dim=dim, output_dims=[dim, dim * 2], name='fork',
                output_names=["linear", "gates"],
                weights_init=initialization.Identity(),
                biases_init=Constant(0))
    gru = GatedRecurrent(dim=dim, weights_init=initialization.Identity(),
                         biases_init=Constant(0))
    fork.initialize()
    gru.initialize()

    linear, gate_inputs = fork.apply(x)
    h = gru.apply(linear, gate_inputs)

    f = theano.function([x], h)
    print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX)))

    # Same network, but the input is first doubled by a Linear brick
    # whose weights are initialized to 2 * identity.
    doubler = Linear(input_dim=dim, output_dim=dim,
                     weights_init=initialization.Identity(2),
                     biases_init=initialization.Constant(0))
    doubler.initialize()

    lin, gate = fork.apply(doubler.apply(x))
    h_doubler = gru.apply(lin, gate)

    f = theano.function([x], h_doubler)
    print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX)))
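For reference, the recurrence these bricks implement (and which the tests below check numerically) is, writing g for the configured gate_activation (logistic by default; the tests use Tanh) and x_t, x_t^z, x_t^r for the forked "linear" input and the two halves of the "gates" input:

    z_t = g(W_z h_{t-1} + x_t^z)
    r_t = g(W_r h_{t-1} + x_t^r)
    \tilde{h}_t = \tanh\big(W (r_t \odot h_{t-1}) + x_t\big)
    h_t = z_t \odot \tilde{h}_t + (1 - z_t) \odot h_{t-1}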
import itertools
import unittest

import numpy
import theano
from numpy.testing import assert_allclose
from theano import tensor

from blocks.bricks import Tanh
from blocks.bricks.recurrent import GatedRecurrent
from blocks.initialization import Constant, IsotropicGaussian

floatX = theano.config.floatX


class TestGatedRecurrent(unittest.TestCase):
    def setUp(self):
        self.gated = GatedRecurrent(
            dim=3, weights_init=Constant(2),
            activation=Tanh(), gate_activation=Tanh())
        self.gated.initialize()
        self.reset_only = GatedRecurrent(
            dim=3, weights_init=IsotropicGaussian(),
            activation=Tanh(), gate_activation=Tanh(),
            use_update_gate=False, rng=numpy.random.RandomState(1))
        self.reset_only.initialize()

    def test_one_step(self):
        h0 = tensor.matrix('h0')
        x = tensor.matrix('x')
        z = tensor.matrix('z')
        r = tensor.matrix('r')
        h1 = self.gated.apply(x, z, r, h0, iterate=False)
        next_h = theano.function(inputs=[h0, x, z, r], outputs=[h1])

        h0_val = 0.1 * numpy.array([[1, 1, 0], [0, 1, 1]], dtype=floatX)
        x_val = 0.1 * numpy.array([[1, 2, 3], [4, 5, 6]], dtype=floatX)
        zi_val = (h0_val + x_val) / 2
        ri_val = -x_val
        W_val = 2 * numpy.ones((3, 3), dtype=floatX)

        z_val = numpy.tanh(h0_val.dot(W_val) + zi_val)
        r_val = numpy.tanh(h0_val.dot(W_val) + ri_val)
        h1_val = (z_val * numpy.tanh((r_val * h0_val).dot(W_val) + x_val)
                  + (1 - z_val) * h0_val)
        assert_allclose(h1_val, next_h(h0_val, x_val, zi_val, ri_val)[0],
                        rtol=1e-6)

    def test_reset_only_many_steps(self):
        x = tensor.tensor3('x')
        ri = tensor.tensor3('ri')
        mask = tensor.matrix('mask')
        h = self.reset_only.apply(x, reset_inputs=ri, mask=mask)
        calc_h = theano.function(inputs=[x, ri, mask], outputs=[h])

        x_val = 0.1 * numpy.asarray(list(itertools.permutations(range(4))),
                                    dtype=floatX)
        x_val = numpy.ones((24, 4, 3), dtype=floatX) * x_val[..., None]
        ri_val = 0.3 - x_val
        mask_val = numpy.ones((24, 4), dtype=floatX)
        mask_val[12:24, 3] = 0
        h_val = numpy.zeros((25, 4, 3), dtype=floatX)
        W = self.reset_only.state_to_state.get_value()
        U = self.reset_only.state_to_reset.get_value()

        for i in range(1, 25):
            r_val = numpy.tanh(h_val[i - 1].dot(U) + ri_val[i - 1])
            h_val[i] = numpy.tanh((r_val * h_val[i - 1]).dot(W)
                                  + x_val[i - 1])
            h_val[i] = (mask_val[i - 1, :, None] * h_val[i]
                        + (1 - mask_val[i - 1, :, None]) * h_val[i - 1])
        h_val = h_val[1:]

        # TODO Figure out why this tolerance needs to be so big
        assert_allclose(h_val, calc_h(x_val, ri_val, mask_val)[0], 1e-03)
# A later version of the same test, written against the API where the update
# and reset inputs are concatenated into a single gate_inputs argument.
# Same imports as above, plus:
from blocks.filter import VariableFilter
from blocks.graph import ComputationGraph
from blocks.roles import INITIAL_STATE
from blocks.utils import is_shared_variable


class TestGatedRecurrent(unittest.TestCase):
    def setUp(self):
        self.gated = GatedRecurrent(
            dim=3, activation=Tanh(), gate_activation=Tanh(),
            weights_init=Constant(2))
        self.gated.initialize()
        self.reset_only = GatedRecurrent(
            dim=3, activation=Tanh(), gate_activation=Tanh(),
            weights_init=IsotropicGaussian(), seed=1)
        self.reset_only.initialize()

    def test_one_step(self):
        h0 = tensor.matrix('h0')
        x = tensor.matrix('x')
        gi = tensor.matrix('gi')
        h1 = self.gated.apply(x, gi, h0, iterate=False)
        next_h = theano.function(inputs=[h0, x, gi], outputs=[h1])

        h0_val = 0.1 * numpy.array([[1, 1, 0], [0, 1, 1]],
                                   dtype=theano.config.floatX)
        x_val = 0.1 * numpy.array([[1, 2, 3], [4, 5, 6]],
                                  dtype=theano.config.floatX)
        zi_val = (h0_val + x_val) / 2
        ri_val = -x_val
        W_val = 2 * numpy.ones((3, 3), dtype=theano.config.floatX)

        z_val = numpy.tanh(h0_val.dot(W_val) + zi_val)
        r_val = numpy.tanh(h0_val.dot(W_val) + ri_val)
        h1_val = (z_val * numpy.tanh((r_val * h0_val).dot(W_val) + x_val)
                  + (1 - z_val) * h0_val)
        assert_allclose(
            h1_val,
            next_h(h0_val, x_val, numpy.hstack([zi_val, ri_val]))[0],
            rtol=1e-6)

    def test_many_steps(self):
        x = tensor.tensor3('x')
        gi = tensor.tensor3('gi')
        mask = tensor.matrix('mask')
        h = self.reset_only.apply(x, gi, mask=mask)
        calc_h = theano.function(inputs=[x, gi, mask], outputs=[h])

        x_val = 0.1 * numpy.asarray(list(itertools.permutations(range(4))),
                                    dtype=theano.config.floatX)
        x_val = numpy.ones((24, 4, 3),
                           dtype=theano.config.floatX) * x_val[..., None]
        ri_val = 0.3 - x_val
        zi_val = 2 * ri_val
        mask_val = numpy.ones((24, 4), dtype=theano.config.floatX)
        mask_val[12:24, 3] = 0
        h_val = numpy.zeros((25, 4, 3), dtype=theano.config.floatX)
        W = self.reset_only.state_to_state.get_value()
        # Gate weights are stored side by side: update first, then reset.
        Wz = self.reset_only.state_to_gates.get_value()[:, :3]
        Wr = self.reset_only.state_to_gates.get_value()[:, 3:]

        for i in range(1, 25):
            z_val = numpy.tanh(h_val[i - 1].dot(Wz) + zi_val[i - 1])
            r_val = numpy.tanh(h_val[i - 1].dot(Wr) + ri_val[i - 1])
            h_val[i] = numpy.tanh((r_val * h_val[i - 1]).dot(W)
                                  + x_val[i - 1])
            h_val[i] = z_val * h_val[i] + (1 - z_val) * h_val[i - 1]
            h_val[i] = (mask_val[i - 1, :, None] * h_val[i]
                        + (1 - mask_val[i - 1, :, None]) * h_val[i - 1])
        h_val = h_val[1:]

        # TODO Figure out why this tolerance needs to be so big
        assert_allclose(
            h_val,
            calc_h(x_val,
                   numpy.concatenate([zi_val, ri_val], axis=2),
                   mask_val)[0],
            1e-04)

        # Also test that the initial state is a parameter
        initial_state, = VariableFilter(roles=[INITIAL_STATE])(
            ComputationGraph(h))
        assert is_shared_variable(initial_state)
        assert initial_state.name == 'initial_state'
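The loop body above doubles as a reference implementation of one GRU step. Pulled out as a minimal NumPy sketch (gru_step_numpy is a hypothetical helper, not part of Blocks; Tanh gates are assumed, as in these tests):

import numpy as np


def gru_step_numpy(h_prev, x, zi, ri, W, Wz, Wr):
    # zi, ri are the pre-computed gate inputs; the brick receives them
    # concatenated as gate_inputs = hstack([zi, ri]), matching the
    # [update | reset] column layout of state_to_gates.
    z = np.tanh(h_prev.dot(Wz) + zi)            # update gate
    r = np.tanh(h_prev.dot(Wr) + ri)            # reset gate
    h_tilde = np.tanh((r * h_prev).dot(W) + x)  # candidate state
    return z * h_tilde + (1 - z) * h_prev

# With the one-step test values above (where Wz = Wr = W = W_val):
# gru_step_numpy(h0_val, x_val, zi_val, ri_val, W_val, W_val, W_val)
# reproduces h1_val.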
import numpy as np
from theano import tensor

from blocks.bricks import Initializable
from blocks.bricks.base import application, lazy
from blocks.bricks.recurrent import GatedRecurrent
from blocks.initialization import Constant
# GatedRecurrentFast is this codebase's own drop-in variant of GatedRecurrent
# with separate state_to_update / state_to_reset parameters.


class GatedRecurrentFull(Initializable):
    """A wrapper around the GatedRecurrent brick that improves usability.

    It contains:

    * A fork to map inputs to the reset and the update units.
    * Better initialization of the different pieces.

    While this works, there is probably a better, more elegant way to do
    this.

    Parameters
    ----------
    hidden_dim : int
        Dimension of the hidden state.
    activation : :class:`.Brick`
    gate_activation : :class:`.Brick`
    state_to_state_init : object
        Weight initialization.
    state_to_update_init : object
        Weight initialization.
    state_to_reset_init : object
        Weight initialization.
    input_to_state_transform : :class:`.Brick`
        [CvMG14] uses a Linear transform.
    input_to_update_transform : :class:`.Brick`
        [CvMG14] uses a Linear transform.
    input_to_reset_transform : :class:`.Brick`
        [CvMG14] uses a Linear transform.

    References
    ----------
    .. [CvMG14] Kyunghyun Cho, Bart van Merriënboer, Çağlar Gülçehre,
        Dzmitry Bahdanau, Fethi Bougares, Holger Schwenk, and Yoshua
        Bengio, *Learning Phrase Representations using RNN Encoder-Decoder
        for Statistical Machine Translation*, EMNLP (2014), pp. 1724-1734.
    """
    @lazy(allocation=['hidden_dim', 'state_to_state_init',
                      'state_to_update_init', 'state_to_reset_init'],
          initialization=['input_to_state_transform',
                          'input_to_update_transform',
                          'input_to_reset_transform'])
    def __init__(self, hidden_dim, activation=None, gate_activation=None,
                 state_to_state_init=None, state_to_update_init=None,
                 state_to_reset_init=None, input_to_state_transform=None,
                 input_to_update_transform=None,
                 input_to_reset_transform=None, **kwargs):
        super(GatedRecurrentFull, self).__init__(**kwargs)
        self.hidden_dim = hidden_dim

        self.state_to_state_init = state_to_state_init
        self.state_to_update_init = state_to_update_init
        self.state_to_reset_init = state_to_reset_init

        self.input_to_state_transform = input_to_state_transform
        self.input_to_update_transform = input_to_update_transform
        self.input_to_reset_transform = input_to_reset_transform
        self.input_to_state_transform.name += "_input_to_state_transform"
        self.input_to_update_transform.name += "_input_to_update_transform"
        self.input_to_reset_transform.name += "_input_to_reset_transform"

        self.use_mine = True
        if self.use_mine:
            self.rnn = GatedRecurrentFast(
                weights_init=Constant(np.nan), dim=self.hidden_dim,
                activation=activation, gate_activation=gate_activation)
        else:
            self.rnn = GatedRecurrent(
                weights_init=Constant(np.nan), dim=self.hidden_dim,
                activation=activation, gate_activation=gate_activation)

        self.children = [self.rnn,
                         self.input_to_state_transform,
                         self.input_to_update_transform,
                         self.input_to_reset_transform]
        self.children.extend(self.rnn.children)

    def initialize(self):
        super(GatedRecurrentFull, self).initialize()

        self.input_to_state_transform.initialize()
        self.input_to_update_transform.initialize()
        self.input_to_reset_transform.initialize()
        self.rnn.initialize()

        weight_shape = (self.hidden_dim, self.hidden_dim)
        state_to_state = self.state_to_state_init.generate(
            rng=self.rng, shape=weight_shape)
        state_to_update = self.state_to_update_init.generate(
            rng=self.rng, shape=weight_shape)
        state_to_reset = self.state_to_reset_init.generate(
            rng=self.rng, shape=weight_shape)

        self.rnn.state_to_state.set_value(state_to_state)
        if self.use_mine:
            self.rnn.state_to_update.set_value(state_to_update)
            self.rnn.state_to_reset.set_value(state_to_reset)
        else:
            self.rnn.state_to_gates.set_value(
                np.hstack((state_to_update, state_to_reset)))

    @application(inputs=['input_'], outputs=['output'])
    def apply(self, input_, mask=None):
        """
        Parameters
        ----------
        input_ : :class:`~tensor.TensorVariable`
            Sequence to feed into the GRU. Axes are batch, sequence,
            features.
        mask : :class:`~tensor.TensorVariable`
            A binary array: 1 where data is available, 0 where it is not.

        Returns
        -------
        output : :class:`theano.tensor.TensorVariable`
            Output sequence. Axes are batch, sequence, features.
        """
        states_from_in = self.input_to_state_transform.apply(input_)
        update_from_in = self.input_to_update_transform.apply(input_)
        reset_from_in = self.input_to_reset_transform.apply(input_)

        gate_inputs = tensor.concatenate(
            [update_from_in, reset_from_in], axis=2)

        if self.use_mine:
            output = self.rnn.apply(inputs=states_from_in,
                                    update_inputs=update_from_in,
                                    reset_inputs=reset_from_in,
                                    mask=mask)
        else:
            output = self.rnn.apply(inputs=states_from_in,
                                    gate_inputs=gate_inputs,
                                    mask=mask)  # mask was dropped here before

        return output
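A minimal construction sketch may help here. Everything below is a placeholder assumption (dimensions, init scales, the Logistic gate activation), not values taken from the original code; it also assumes the default GatedRecurrentFast path and that GatedRecurrentFull is importable from this module:

from theano import tensor
from blocks.bricks import Linear, Logistic, Tanh
from blocks.initialization import Constant, IsotropicGaussian

dim = 8  # hypothetical hidden dimension


def in_transform():
    # [CvMG14]-style Linear transform from input to a pre-activation
    return Linear(input_dim=dim, output_dim=dim,
                  weights_init=IsotropicGaussian(0.02),
                  biases_init=Constant(0))

gru_full = GatedRecurrentFull(
    hidden_dim=dim, activation=Tanh(), gate_activation=Logistic(),
    state_to_state_init=IsotropicGaussian(0.02),
    state_to_update_init=IsotropicGaussian(0.02),
    state_to_reset_init=IsotropicGaussian(0.02),
    input_to_state_transform=in_transform(),
    input_to_update_transform=in_transform(),
    input_to_reset_transform=in_transform())
gru_full.initialize()

x = tensor.tensor3('x')  # axes as described in the apply() docstring
h = gru_full.apply(x)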
def training(runname, rnnType, maxPackets, packetTimeSteps, packetReverse,
             padOldTimeSteps, wtstd, lr, decay, clippings, dimIn, dim,
             attentionEnc, attentionContext, numClasses, batch_size, epochs,
             trainPercent, dataPath, loadPrepedData, channel):  # pragma: no cover
    print locals()
    print

    X = T.tensor4('inputs')
    Y = T.matrix('targets')

    linewt_init = IsotropicGaussian(wtstd)
    line_bias = Constant(1.0)
    rnnwt_init = IsotropicGaussian(wtstd)
    rnnbias_init = Constant(0.0)
    classifierWts = IsotropicGaussian(wtstd)
    # Assumed fix: the original referenced attnWts below without defining it.
    attnWts = IsotropicGaussian(wtstd)

    learning_rateClass = theano.shared(np.array(lr, dtype=theano.config.floatX))
    learning_decay = np.array(decay, dtype=theano.config.floatX)

    ###DATA PREP
    print 'loading data'
    if loadPrepedData:
        hexSessions = loadFile(dataPath)
    else:
        sessioner = sessionizer.HexSessionizer(dataPath)
        hexSessions = sessioner.read_pcap()
        hexSessions = removeBadSessionizer(hexSessions)

    numSessions = len(hexSessions)
    print str(numSessions) + ' sessions found'
    hexSessionsKeys = order_keys(hexSessions)
    hexDict = hexTokenizer()

    print 'creating dictionary of ip communications'
    comsDict, uniqIPs = srcIpDict(hexSessions)
    comsDict = dictUniquerizer(comsDict)

    print 'initializing network graph'
    ###ENCODER
    if rnnType == 'gru':
        rnn = GatedRecurrent(dim=dim, weights_init=rnnwt_init,
                             biases_init=rnnbias_init, name='gru')
        dimMultiplier = 2
    else:
        rnn = LSTM(dim=dim, weights_init=rnnwt_init,
                   biases_init=rnnbias_init, name='lstm')
        dimMultiplier = 4

    fork = Fork(output_names=['linear', 'gates'], name='fork',
                input_dim=dimIn, output_dims=[dim, dim * dimMultiplier],
                weights_init=linewt_init, biases_init=line_bias)

    ###CONTEXT
    if rnnType == 'gru':
        rnnContext = GatedRecurrent(dim=dim, weights_init=rnnwt_init,
                                    biases_init=rnnbias_init,
                                    name='gruContext')
    else:
        rnnContext = LSTM(dim=dim, weights_init=rnnwt_init,
                          biases_init=rnnbias_init, name='lstmContext')

    forkContext = Fork(output_names=['linearContext', 'gatesContext'],
                       name='forkContext', input_dim=dim,
                       output_dims=[dim, dim * dimMultiplier],
                       weights_init=linewt_init, biases_init=line_bias)

    forkDec = Fork(output_names=['linear', 'gates'], name='forkDec',
                   input_dim=dim, output_dims=[dim, dim * dimMultiplier],
                   weights_init=linewt_init, biases_init=line_bias)

    #CLASSIFIER
    bmlp = BatchNormalizedMLP(activations=[Tanh(), Tanh()],
                              dims=[dim, dim, numClasses],
                              weights_init=classifierWts,
                              biases_init=Constant(0.0001))

    #initialize the weights in all the functions
    fork.initialize()
    rnn.initialize()
    forkContext.initialize()
    rnnContext.initialize()
    forkDec.initialize()
    bmlp.initialize()

    def onestepEnc(X):
        data1, data2 = fork.apply(X)
        if rnnType == 'gru':
            hEnc = rnn.apply(data1, data2)
        else:
            hEnc, _ = rnn.apply(data2)
        return hEnc

    hEnc, _ = theano.scan(onestepEnc, X)  # (mini*numPackets, packetLen, 1, hexdictLen)

    if attentionEnc:
        attentionmlpEnc = MLP(activations=[Tanh()], dims=[dim, 1],
                              weights_init=attnWts, biases_init=Constant(1.0))
        attentionmlpEnc.initialize()

        hEncAttn = T.reshape(hEnc, (-1, packetTimeSteps, dim))

        def onestepEncAttn(hEncAttn):
            preEncattn = attentionmlpEnc.apply(hEncAttn)
            attEncsoft = Softmax()
            attEncpyx = attEncsoft.apply(preEncattn.flatten())
            attEncpred = attEncpyx.flatten()
            attenc = T.mul(hEncAttn.dimshuffle(1, 0),
                           attEncpred).dimshuffle(1, 0)
            return attenc

        attenc, _ = theano.scan(onestepEncAttn, hEncAttn)

        hEncReshape = T.reshape(T.sum(attenc, axis=1),
                                (-1, maxPackets, 1, dim))
    else:
        #[:,-1] takes the last rep for each packet
        #(mini, numPackets, 1, dimReduced)
        hEncReshape = T.reshape(hEnc[:, -1], (-1, maxPackets, 1, dim))

    def onestepContext(hEncReshape):
        data3, data4 = forkContext.apply(hEncReshape)
        if rnnType == 'gru':
            hContext = rnnContext.apply(data3, data4)
        else:
            hContext, _ = rnnContext.apply(data4)
        return hContext

    hContext, _ = theano.scan(onestepContext, hEncReshape)

    if attentionContext:
        attentionmlpContext = MLP(activations=[Tanh()], dims=[dim, 1],
                                  weights_init=attnWts,
                                  biases_init=Constant(1.0))
        attentionmlpContext.initialize()

        hContextAttn = T.reshape(hContext, (-1, maxPackets, dim))

        def onestepContextAttn(hContextAttn):
            preContextatt = attentionmlpContext.apply(hContextAttn)
            attContextsoft = Softmax()
            attContextpyx = attContextsoft.apply(preContextatt.flatten())
            attContextpred = attContextpyx.flatten()
            attcontext = T.mul(hContextAttn.dimshuffle(1, 0),
                               attContextpred).dimshuffle(1, 0)
            return attcontext

        attcontext, _ = theano.scan(onestepContextAttn, hContextAttn)
        hContextReshape = T.sum(attcontext, axis=1)
    else:
        hContextReshape = T.reshape(hContext[:, -1], (-1, dim))

    data5, _ = forkDec.apply(hContextReshape)
    pyx = bmlp.apply(data5)
    softmax = Softmax()
    softoutClass = softmax.apply(pyx)
    costClass = T.mean(CategoricalCrossEntropy().apply(Y, softoutClass))

    #CREATE GRAPH
    cgClass = ComputationGraph([costClass])
    paramsClass = VariableFilter(roles=[PARAMETER])(cgClass.variables)
    learning = learningfunctions.Learning(costClass, paramsClass,
                                          learning_rateClass,
                                          l1=0., l2=0., maxnorm=0.,
                                          c=clippings)
    updatesClass = learning.Adam()

    module_logger.info('starting graph compilation')
    classifierTrain = theano.function([X, Y],
                                      [costClass, hEnc, hContext, pyx,
                                       softoutClass],
                                      updates=updatesClass,
                                      allow_input_downcast=True)
    classifierPredict = theano.function([X], softoutClass,
                                        allow_input_downcast=True)
    module_logger.info('graph compilation finished')
    print 'finished graph compilation'

    trainIndex = int(len(hexSessionsKeys) * trainPercent)

    epochCost = []
    gradNorms = []
    trainAcc = []
    testAcc = []

    costCollect = []
    trainCollect = []

    module_logger.info('beginning training')
    iteration = 0
    #epoch
    for epoch in xrange(epochs):
        #iteration/minibatch
        for start, end in zip(range(0, trainIndex, batch_size),
                              range(batch_size, trainIndex, batch_size)):
            trainingTargets = []
            trainingSessions = []

            #create one minibatch with 0.5 normal and 0.5 abby normal traffic
            for trainKey in range(start, end):
                sessionForEncoding = list(
                    hexSessions[hexSessions.keys()[trainKey]][0])
                adfun = adversarialfunctions.Adversary(sessionForEncoding)
                adversaryList = [sessionForEncoding,
                                 adfun.dstIpSwapOut(comsDict, uniqIPs),
                                 adfun.portDirSwitcher(),
                                 adfun.ipDirSwitcher()]
                abbyIndex = random.sample(range(len(adversaryList)), 1)[0]

                targetClasses = [0] * numClasses
                targetClasses[abbyIndex] = 1
                abbyTarget = np.array(targetClasses,
                                      dtype=theano.config.floatX)
                # abbyOneHotSes is presumably the one-hot encoding of
                # adversaryList[abbyIndex]; the encoding step is missing
                # from the original listing.
                trainingSessions.append(abbyOneHotSes[0])
                trainingTargets.append(abbyTarget)

            sessionsMinibatch = np.asarray(trainingSessions).reshape(
                (-1, packetTimeSteps, 1, dimIn))
            targetsMinibatch = np.asarray(trainingTargets)

            costfun = classifierTrain(sessionsMinibatch, targetsMinibatch)

            if iteration % (numSessions / (10 * batch_size)) == 0:
                costCollect.append(costfun[0])
                trainCollect.append(
                    np.mean(np.argmax(costfun[-1], axis=1) ==
                            np.argmax(targetsMinibatch, axis=1)))
                # logging calls use %s placeholders; the originals passed the
                # values as extra args with no placeholder, which logging drops
                module_logger.info(' Iteration: %s', iteration)
                module_logger.info(' Cost: %s', np.mean(costCollect))
                module_logger.info(' TRAIN accuracy: %s', np.mean(trainCollect))
                print ' Iteration: ', iteration
                print ' Cost: ', np.mean(costCollect)
                print ' TRAIN accuracy: ', np.mean(trainCollect)

            iteration += 1

            #testing accuracy
            if iteration % (numSessions / (2 * batch_size)) == 0:
                predtar, acttar, testCollect = predictClass(
                    classifierPredict, hexSessions, comsDict, uniqIPs,
                    hexDict, hexSessionsKeys, numClasses, trainPercent,
                    dimIn, maxPackets, packetTimeSteps, padOldTimeSteps)
                binaryPrecisionRecall(predtar, acttar, numClasses)
                module_logger.info(str(testCollect))

            #save the models
            if iteration % (numSessions / (5 * batch_size)) == 0:
                save_model(classifierPredict)

        epochCost.append(np.mean(costCollect))
        trainAcc.append(np.mean(trainCollect))

        module_logger.info('Epoch: %s', epoch)
        module_logger.info('Epoch cost average: %s', epochCost[-1])
        module_logger.info('Epoch TRAIN accuracy: %s', trainAcc[-1])
        print 'Epoch: ', epoch
        print 'Epoch cost average: ', epochCost[-1]
        print 'Epoch TRAIN accuracy: ', trainAcc[-1]

    return classifierTrain, classifierPredict
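Given the long signature, a hypothetical invocation may help; every value below is an illustrative placeholder, not a setting from the original code:

classifierTrain, classifierPredict = training(
    runname='demo', rnnType='gru',
    maxPackets=8,              # packets kept per session
    packetTimeSteps=80,        # hex tokens kept per packet
    packetReverse=False, padOldTimeSteps=True,
    wtstd=0.2,                 # stddev for the IsotropicGaussian inits
    lr=1e-4, decay=0.99, clippings=1.0,
    dimIn=257,                 # one-hot token dimension (placeholder)
    dim=100,                   # hidden state size
    attentionEnc=False, attentionContext=False,
    numClasses=4,              # matches the four entries in adversaryList
    batch_size=20, epochs=10, trainPercent=0.9,
    dataPath='traffic.pcap', loadPrepedData=False, channel=None)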