class InputModule(MergeLayer):
    # Input Module, which uses SemMemModule and GRULayer (lasagne)
    def __init__(self, incomings, voc_size, hid_state_size, SemMem=None, GRU=None, **kwargs):
        super(InputModule, self).__init__(incomings, **kwargs)
        if SemMem is not None:
            self.SemMem = SemMem
        else:
            self.SemMem = SemMemModule(incomings[0], voc_size, hid_state_size, **kwargs)
        if GRU is not None:
            self.GRU = GRU
        else:
            self.GRU = GRULayer(SemMem, hid_state_size)
        self.voc_size = voc_size
        self.hid_state_size = hid_state_size

    def get_params(self, **tags):
        # Because InputModule uses an external GRULayer's parameters,
        # we have to expose them here so the GRU's parameters get trained.
        return self.GRU.get_params(**tags)

    def get_output_shape_for(self, input_shape):
        return (None, None, self.hid_state_size)

    def get_output_for(self, inputs, **kwargs):
        # input with size (batch, sentences, words)
        input = inputs[0]
        # original size of input_word is (batch, sentences)
        # input_word with size (batch x sentences, ) after flatten
        input_word = T.flatten(inputs[1])
        word_dropout = inputs[2]

        # Apply word embedding
        # With size (batch x sentence, word, emb_dim)
        sentence_rep = self.SemMem.get_output_for([input, word_dropout])

        # Apply GRU Layer
        # 'gru_outs' with size (batch x sentence, word, hid_state_size)
        gru_outs = self.GRU.get_output_for([sentence_rep])

        # Extract candidate fact from GRU's output by input_word variable,
        # resolving input padded with additional words
        # e.g. John went to the hallway nil nil nil -> [GRU1, ... , GRU8] -> GRU5
        #
        # hid_extract with size (batch x sentence, hid_state_size)
        hid_extract = gru_outs[T.arange(gru_outs.shape[0], dtype='int16'), input_word - 1]

        # candidate_facts with size (batch, sentences, hid_state_size)
        candidate_facts = T.reshape(x=hid_extract,
                                    newshape=(-1, input.shape[1], self.hid_state_size))
        return candidate_facts
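The indexing trick above (`gru_outs[T.arange(...), input_word - 1]`) appears to pick, for each padded sentence, the GRU hidden state at the position of its last real word. A minimal NumPy sketch of the same idea, with made-up shapes, may make the shape bookkeeping easier to follow:

import numpy as np

# Hypothetical shapes: 6 sentences (batch x sentences flattened), 8 word slots, 4 hidden units.
gru_outs = np.random.randn(6, 8, 4).astype('float32')   # per-word GRU states
sentence_lengths = np.array([5, 3, 8, 1, 7, 2])          # number of real (non-nil) words

# Row i of `facts` is the hidden state after the last real word of sentence i,
# i.e. gru_outs[i, sentence_lengths[i] - 1, :].
facts = gru_outs[np.arange(6), sentence_lengths - 1]
assert facts.shape == (6, 4)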
def test_gru_hid_init_layer_eval():
    # Test `hid_init` as a `Layer` with some dummy input. Compare the output of
    # a network with a `Layer` as input to `hid_init` to a network with a
    # `np.array` as input to `hid_init`
    n_units = 7
    n_test_cases = 2
    in_shp = (n_test_cases, 2, 3)
    in_h_shp = (1, n_units)

    # dummy inputs
    X_test = np.ones(in_shp, dtype=theano.config.floatX)
    Xh_test = np.ones(in_h_shp, dtype=theano.config.floatX)
    Xh_test_batch = np.tile(Xh_test, (n_test_cases, 1))

    # network with `Layer` initializer for hid_init
    l_inp = InputLayer(in_shp)
    l_inp_h = InputLayer(in_h_shp)
    l_rec_inp_layer = GRULayer(l_inp, n_units, hid_init=l_inp_h)

    # network with `np.array` initializer for hid_init
    l_rec_nparray = GRULayer(l_inp, n_units, hid_init=Xh_test)

    # copy network parameters from l_rec_inp_layer to l_rec_nparray
    l_il_param = dict([(p.name, p) for p in l_rec_inp_layer.get_params()])
    l_rn_param = dict([(p.name, p) for p in l_rec_nparray.get_params()])
    for k, v in l_rn_param.items():
        if k in l_il_param:
            v.set_value(l_il_param[k].get_value())

    # build the theano functions
    X = T.tensor3()
    Xh = T.matrix()
    output_inp_layer = lasagne.layers.get_output(l_rec_inp_layer,
                                                 {l_inp: X, l_inp_h: Xh})
    output_nparray = lasagne.layers.get_output(l_rec_nparray, {l_inp: X})

    # test both nets with dummy input
    output_val_inp_layer = output_inp_layer.eval({X: X_test, Xh: Xh_test_batch})
    output_val_nparray = output_nparray.eval({X: X_test})

    # check output given `Layer` is the same as with `np.array`
    assert np.allclose(output_val_inp_layer, output_val_nparray)
class InputModule(MergeLayer):
    # Input Module, which uses SemMemModule and GRULayer (lasagne)
    def __init__(self, incomings, voc_size, hid_state_size, SemMem=None, GRU=None, **kwargs):
        super(InputModule, self).__init__(incomings, **kwargs)
        if SemMem is not None:
            self.SemMem = SemMem
        else:
            self.SemMem = SemMemModule(incomings[0], voc_size, hid_state_size, **kwargs)
        if GRU is not None:
            self.GRU = GRU
        else:
            self.GRU = GRULayer(SemMem, hid_state_size)
        self.voc_size = voc_size
        self.hid_state_size = hid_state_size

    def get_params(self, **tags):
        # Because InputModule uses an external GRULayer's parameters,
        # we have to expose them here so they get trained.
        return self.GRU.get_params(**tags)

    def get_output_shape_for(self, input_shape):
        return (None, None, self.hid_state_size)

    def get_output_for(self, inputs, **kwargs):
        input = inputs[0]
        input_word = T.flatten(inputs[1])
        word_dropout = inputs[2]

        # Apply word embedding
        sentence_rep = self.SemMem.get_output_for([input, word_dropout])

        # Apply GRU Layer
        gru_outs = self.GRU.get_output_for([sentence_rep])

        # Extract candidate fact from GRU's output by input_word variable,
        # resolving input padded with additional words
        # e.g. John went to the hallway nil nil nil -> [GRU1, ... , GRU8] -> GRU5
        candidate_facts = T.reshape(
            gru_outs[T.arange(gru_outs.shape[0], dtype='int32'), input_word - 1],
            (-1, input.shape[1], self.hid_state_size))
        return candidate_facts
def test_gru_bck():
    num_batch, seq_len, n_features1 = 2, 3, 4
    num_units = 2
    x = T.tensor3()
    in_shp = (num_batch, seq_len, n_features1)
    l_inp = InputLayer(in_shp)
    x_in = np.ones(in_shp).astype('float32')

    # need to set random seed.
    lasagne.random.get_rng().seed(1234)
    l_gru_fwd = GRULayer(l_inp, num_units=num_units, backwards=False)
    lasagne.random.get_rng().seed(1234)
    l_gru_bck = GRULayer(l_inp, num_units=num_units, backwards=True)
    output_fwd = helper.get_output(l_gru_fwd, x)
    output_bck = helper.get_output(l_gru_bck, x)

    output_fwd_val = output_fwd.eval({x: x_in})
    output_bck_val = output_bck.eval({x: x_in})

    # test that the backwards model reverses its final input
    np.testing.assert_almost_equal(output_fwd_val, output_bck_val[:, ::-1])
def test_gru_hid_init_mask():
    # test that you can set hid_init to be a layer when a mask is provided
    l_inp = InputLayer((2, 2, 3))
    l_inp_h = InputLayer((2, 5))
    l_inp_msk = InputLayer((2, 2))
    l_gru = GRULayer(l_inp, 5, hid_init=l_inp_h, mask_input=l_inp_msk)

    x = T.tensor3()
    h = T.matrix()
    msk = T.matrix()

    inputs = {l_inp: x, l_inp_h: h, l_inp_msk: msk}
    output = lasagne.layers.get_output(l_gru, inputs)
def test_gru_passthrough():
    # Tests that the GRU can simply pass through its input
    l_in = InputLayer((4, 5, 6))
    zero = lasagne.init.Constant(0.)
    one = lasagne.init.Constant(1.)
    pass_gate = Gate(zero, zero, None, one, None)
    no_gate = Gate(zero, zero, None, zero, None)
    in_pass_gate = Gate(
        np.eye(6).astype(theano.config.floatX), zero, None, zero, None)
    l_rec = GRULayer(l_in, 6, no_gate, pass_gate, in_pass_gate)
    out = lasagne.layers.get_output(l_rec)
    inp = np.arange(4 * 5 * 6).reshape(4, 5, 6).astype(theano.config.floatX)
    np.testing.assert_almost_equal(out.eval({l_in.input_var: inp}), inp)
def test_gru_return_shape():
    num_batch, seq_len, n_features1, n_features2 = 5, 3, 10, 11
    num_units = 6
    x = T.tensor4()
    in_shp = (num_batch, seq_len, n_features1, n_features2)
    l_inp = InputLayer(in_shp)
    l_rec = GRULayer(l_inp, num_units=num_units)

    x_in = np.random.random(in_shp).astype('float32')
    output = helper.get_output(l_rec, x)
    output_val = output.eval({x: x_in})

    assert helper.get_output_shape(l_rec, x_in.shape) == output_val.shape
    assert output_val.shape == (num_batch, seq_len, num_units)
def _add_decoder(self):
    """
    The decoder returns the batch of sequences of thought vectors, each corresponding
    to a decoded token, and reshapes this 3d tensor to a 2d matrix so that the next
    Dense layer can convert each thought vector to a probability distribution vector.
    """
    self._net['hid_states_decoder'] = InputLayer(
        shape=(None, self._decoder_depth, None),
        input_var=T.tensor3('hid_inits_decoder'),
        name='hid_states_decoder')

    # repeat along the sequence axis output_seq_len times, where output_seq_len is inferred from input tensor
    self._net['enc_repeated'] = RepeatLayer(
        incoming=self._net['enc_result'],  # input shape = (batch_size, encoder_output_dimension)
        n=self._output_seq_len,
        name='repeat_layer')

    self._net['emb_condition_id_repeated'] = RepeatLayer(
        incoming=self._net['emb_condition_id'],
        n=self._output_seq_len,
        name='embedding_condition_id_repeated')

    self._net['dec_concated_input'] = ConcatLayer(
        incomings=[
            self._net['emb_y'],
            self._net['enc_repeated'],
            self._net['emb_condition_id_repeated']
        ],
        axis=2,
        name='decoder_concated_input')

    # shape = (batch_size, input_seq_len, encoder_output_dimension)
    self._net['dec_0'] = self._net['dec_concated_input']

    for dec_layer_id in xrange(1, self._decoder_depth + 1):
        # input shape = (batch_size, input_seq_len, embedding_dimension + hidden_dimension)
        self._net['dec_' + str(dec_layer_id)] = GRULayer(
            incoming=self._net['dec_' + str(dec_layer_id - 1)],
            num_units=self._hidden_layer_dim,
            grad_clipping=self._grad_clip,
            only_return_final=False,
            name='decoder_' + str(dec_layer_id),
            mask_input=self._net['input_y_mask'],
            hid_init=SliceLayer(self._net['hid_states_decoder'], dec_layer_id - 1, axis=1))

    self._net['dec'] = self._net['dec_' + str(self._decoder_depth)]
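RepeatLayer is not a stock Lasagne layer; from the comments above it presumably tiles a (batch_size, num_features) encoding n times along a new time axis so it can be concatenated with per-step decoder inputs. A minimal sketch of such a layer, under that assumption (not this project's actual implementation), could look like this:

import theano.tensor as T
from lasagne.layers import Layer

class RepeatLayer(Layer):
    """Repeat a (batch, features) input n times along a new axis 1 -> (batch, n, features)."""
    def __init__(self, incoming, n, **kwargs):
        super(RepeatLayer, self).__init__(incoming, **kwargs)
        self.n = n

    def get_output_shape_for(self, input_shape):
        return (input_shape[0], self.n, input_shape[1])

    def get_output_for(self, input, **kwargs):
        # (batch, features) -> (batch, 1, features) -> (batch, n, features)
        return T.repeat(input.dimshuffle(0, 'x', 1), self.n, axis=1)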
def test_gru_precompute():
    num_batch, seq_len, n_features1 = 2, 3, 4
    num_units = 2
    in_shp = (num_batch, seq_len, n_features1)
    l_inp = InputLayer(in_shp)
    l_mask_inp = InputLayer(in_shp[:2])

    x_in = np.random.random(in_shp).astype('float32')
    mask_in = np.ones((num_batch, seq_len), dtype='float32')

    # need to set random seed.
    lasagne.random.get_rng().seed(1234)
    l_gru_precompute = GRULayer(l_inp,
                                num_units=num_units,
                                precompute_input=True,
                                mask_input=l_mask_inp)
    lasagne.random.get_rng().seed(1234)
    l_gru_no_precompute = GRULayer(l_inp,
                                   num_units=num_units,
                                   precompute_input=False,
                                   mask_input=l_mask_inp)
    output_precompute = helper.get_output(l_gru_precompute).eval(
        {l_inp.input_var: x_in, l_mask_inp.input_var: mask_in})
    output_no_precompute = helper.get_output(l_gru_no_precompute).eval(
        {l_inp.input_var: x_in, l_mask_inp.input_var: mask_in})

    # test that precomputing the input gives the same output as not precomputing it
    np.testing.assert_almost_equal(output_precompute, output_no_precompute)
def test_gru_variable_input_size():
    # check that seq_len and batch_size of None work
    num_batch, n_features1 = 6, 5
    num_units = 13
    x = T.tensor3()

    in_shp = (None, None, n_features1)
    l_inp = InputLayer(in_shp)
    x_in1 = np.ones((num_batch + 1, 10, n_features1)).astype('float32')
    x_in2 = np.ones((num_batch, 15, n_features1)).astype('float32')

    l_rec = GRULayer(l_inp, num_units=num_units, backwards=False)
    output = helper.get_output(l_rec, x)

    output.eval({x: x_in1})
    output.eval({x: x_in2})
def test_gru_unroll_scan_fwd():
    num_batch, seq_len, n_features1 = 2, 3, 4
    num_units = 2
    in_shp = (num_batch, seq_len, n_features1)
    l_inp = InputLayer(in_shp)
    l_mask_inp = InputLayer(in_shp[:2])

    x_in = np.random.random(in_shp).astype('float32')
    mask_in = np.ones(in_shp[:2]).astype('float32')

    # need to set random seed.
    lasagne.random.get_rng().seed(1234)
    l_gru_scan = GRULayer(l_inp,
                          num_units=num_units,
                          backwards=False,
                          unroll_scan=False,
                          mask_input=l_mask_inp)
    lasagne.random.get_rng().seed(1234)
    l_gru_unrolled = GRULayer(l_inp,
                              num_units=num_units,
                              backwards=False,
                              unroll_scan=True,
                              mask_input=l_mask_inp)
    output_scan = helper.get_output(l_gru_scan)
    output_unrolled = helper.get_output(l_gru_unrolled)

    output_scan_val = output_scan.eval(
        {l_inp.input_var: x_in, l_mask_inp.input_var: mask_in})
    output_unrolled_val = output_unrolled.eval(
        {l_inp.input_var: x_in, l_mask_inp.input_var: mask_in})

    np.testing.assert_almost_equal(output_scan_val, output_unrolled_val)
def gru_column(input, num_units, hidden, **kwargs):
    kwargs.pop("only_return_final", None)
    assert isinstance(hidden, (list, tuple))
    name = kwargs.pop("name", "default")
    column = [input]
    for i, l_hidden in enumerate(hidden):
        kwargs_ = kwargs.copy()
        if isinstance(l_hidden, Layer):
            kwargs_.pop("learn_init", None)
            kwargs_["hid_init"] = l_hidden
        layer = GRULayer(column[-1], num_units,
                         name=os.path.join(name, "gru_%02d" % i),
                         **kwargs_)
        column.append(layer)
    return column[1:]
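A usage sketch with hypothetical names and sizes: the call below stacks three GRULayers of 256 units on top of l_in; passing a Layer instance inside hidden would instead seed that depth's hid_init with it.

l_in = InputLayer((None, None, 128), name="column_input")   # hypothetical (batch, time, features)
layers = gru_column(l_in, num_units=256, hidden=[None, None, None],
                    grad_clipping=5.0, name="encoder")
l_top = layers[-1]   # topmost GRULayer of the column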
def _add_utterance_encoder(self):
    # input shape = (batch_size * input_context_size, input_seq_len, embedding_dimension)
    self._add_forward_backward_encoder_layer()

    for enc_layer_id in xrange(1, self._encoder_depth):
        is_last_encoder_layer = enc_layer_id == self._encoder_depth - 1
        return_only_final_state = is_last_encoder_layer

        # input shape = (batch_size * input_context_size, input_seq_len, embedding_dimension)
        self._net['enc_' + str(enc_layer_id)] = GRULayer(
            incoming=self._net['enc_' + str(enc_layer_id - 1)],
            num_units=self._hidden_layer_dim,
            grad_clipping=self._grad_clip,
            only_return_final=return_only_final_state,
            name='encoder_' + str(enc_layer_id),
            mask_input=self._net['input_x_mask'])

    self._net['enc'] = self._net['enc_' + str(self._encoder_depth - 1)]
def test_gru_nparams_hid_init_layer():
    # test that you can see layers through hid_init
    l_inp = InputLayer((2, 2, 3))
    l_inp_h = InputLayer((2, 5))
    l_inp_h_de = DenseLayer(l_inp_h, 7)
    l_gru = GRULayer(l_inp, 7, hid_init=l_inp_h_de)

    # directly check the layers can be seen through hid_init
    assert lasagne.layers.get_all_layers(l_gru) == [l_inp, l_inp_h, l_inp_h_de, l_gru]

    # 3*n_gates + 2
    # the 3 is because we have hid_to_gate, in_to_gate and bias for each gate
    # 2 is for the W and b parameters in the DenseLayer
    assert len(lasagne.layers.get_all_params(l_gru, trainable=True)) == 11

    # GRU bias params(3) + Dense bias params(1)
    assert len(lasagne.layers.get_all_params(l_gru, regularizable=False)) == 4
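If the counts above look opaque, listing the parameter names makes the 3*n_gates + 2 breakdown visible. A small sketch, assuming it runs right after the assertions above and using only standard Lasagne calls:

params = lasagne.layers.get_all_params(l_gru, trainable=True)
for p in params:
    print p.name, p.get_value().shape
# Expected: W_in_to_*, W_hid_to_* and b_* for each of resetgate, updategate and
# hidden_update (9 parameters), plus W and b from the DenseLayer (2 more) = 11.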
def gated_layer(incoming, num_units, grad_clipping, only_return_final, backwards,
                gated_layer_type, mask_input=None,
                cell_init=lasagne.init.Constant(0.),
                hid_init=lasagne.init.Constant(0.),
                resetgate=lasagne.layers.Gate(W_cell=None),
                updategate=lasagne.layers.Gate(W_cell=None),
                hidden_update=lasagne.layers.Gate(
                    W_cell=None, nonlinearity=lasagne.nonlinearities.tanh),
                name=None):
    if gated_layer_type == "gru":
        return GRULayer(incoming, num_units,
                        mask_input=mask_input,
                        grad_clipping=grad_clipping,
                        only_return_final=only_return_final,
                        backwards=backwards,
                        hid_init=hid_init,
                        resetgate=resetgate,
                        updategate=updategate,
                        hidden_update=hidden_update,
                        name=name)
    else:
        return LSTMLayer(incoming, num_units,
                         mask_input=mask_input,
                         grad_clipping=grad_clipping,
                         nonlinearity=lasagne.nonlinearities.tanh,
                         only_return_final=only_return_final,
                         backwards=backwards,
                         cell_init=cell_init,
                         hid_init=hid_init,
                         resetgate=resetgate,
                         updategate=updategate,
                         hidden_update=hidden_update,
                         name=name)
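A usage sketch under assumed shapes (none of these layer names appear in the original): the same call site can select the recurrent cell type purely through gated_layer_type. Note that the non-"gru" branch forwards resetgate/updategate/hidden_update to LSTMLayer, so it assumes an LSTMLayer variant that accepts those keyword names.

l_in = lasagne.layers.InputLayer((None, None, 300))   # hypothetical (batch, time, features)
l_mask = lasagne.layers.InputLayer((None, None))

# GRU encoder that returns only the final hidden state
l_enc = gated_layer(l_in, num_units=128, grad_clipping=10,
                    only_return_final=True, backwards=False,
                    gated_layer_type="gru", mask_input=l_mask,
                    name="enc_gru")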
def _add_context_encoder(self):
    self._net['batched_enc'] = reshape(
        self._net['enc'],
        (self._batch_size, self._input_context_size, get_output_shape(self._net['enc'])[-1]))

    self._net['context_enc'] = GRULayer(
        incoming=self._net['batched_enc'],
        num_units=self._hidden_layer_dim,
        grad_clipping=self._grad_clip,
        only_return_final=True,
        name='context_encoder')

    self._net['switch_enc_to_tv'] = T.iscalar(name='switch_enc_to_tv')

    self._net['thought_vector'] = InputLayer(
        shape=(None, self._hidden_layer_dim),
        input_var=T.fmatrix(name='thought_vector'),
        name='thought_vector')

    self._net['enc_result'] = SwitchLayer(
        incomings=[self._net['thought_vector'], self._net['context_enc']],
        condition=self._net['switch_enc_to_tv'])

    # We need the following to pass as 'givens' argument when compiling theano functions:
    self._default_thoughts_vector = T.zeros((self._batch_size, self._hidden_layer_dim))
    self._default_input_x = T.zeros(
        shape=(self._net['thought_vector'].input_var.shape[0], 1, 1), dtype=np.int32)
def test_gru_grad_clipping():
    # test that you can set grad_clip variable
    x = T.tensor3()
    l_rec = GRULayer(InputLayer((2, 2, 3)), 5, grad_clipping=1)
    output = lasagne.layers.get_output(l_rec, x)
    hidden.append(slice_)
    return hidden


###############################################################################
#                                   ENCODER                                   #
###############################################################################

# Encoder's Recurrent subnetwork
l_encoder_mask = InputLayer((None, None), name="encoder/mask")
l_encoder_embed = InputLayer((None, None, n_embed_char), name="encoder/input")

bidi_gru = []
bidi_gru.append(
    GRULayer(l_encoder_embed, n_hidden_encoder, learn_init=True,
             name="encoder/gru_f", backwards=False))
bidi_gru.append(
    GRULayer(l_encoder_embed, n_hidden_encoder, learn_init=True,
             name="encoder/gru_b", backwards=True))
l_encoder_context = ConcatLayer(bidi_gru, axis=-1, name="encoder/cat")

###############################################################################
#                                   DECODER                                   #
###############################################################################

# Decoder's Recurrent subnetwork
def test_memory_cells(batch_size=3, seq_len=50, input_dim=8, n_hidden=16):
    # lasagne way
    l_in = InputLayer(
        (None, seq_len, input_dim),
        input_var=theano.shared(np.random.normal(size=[batch_size, seq_len, input_dim])),
        name='input seq')

    l_lstm0 = LSTMLayer(l_in, n_hidden, name='lstm')
    l_gru0 = GRULayer(l_in, n_hidden, name='gru')

    f_predict0 = theano.function([], get_output([l_lstm0, l_gru0]))

    # agentnet way
    s_in = InputLayer((None, input_dim), name='in')

    s_prev_cell = InputLayer((None, n_hidden), name='cell')
    s_prev_hid = InputLayer((None, n_hidden), name='hid')
    s_lstm_cell, s_lstm_hid = LSTMCell(s_prev_cell, s_prev_hid, s_in, name='lstm')

    s_prev_gru = InputLayer((None, n_hidden), name='hid')
    s_gru = GRUCell(s_prev_gru, s_in, name='gru')

    rec = Recurrence(state_variables=OrderedDict({
                         s_lstm_cell: s_prev_cell,
                         s_lstm_hid: s_prev_hid,
                         s_gru: s_prev_gru
                     }),
                     input_sequences={s_in: l_in},
                     unroll_scan=False)

    state_seqs, _ = rec.get_sequence_layers()

    l_lstm1 = state_seqs[s_lstm_hid]
    l_gru1 = state_seqs[s_gru]

    f_predict1 = theano.function([], get_output([l_lstm1, l_gru1]))

    # lstm param transfer
    old_params = sorted(get_all_params(l_lstm0, trainable=True), key=lambda p: p.name)
    new_params = sorted(get_all_params(s_lstm_hid, trainable=True), key=lambda p: p.name)

    for old, new in zip(old_params, new_params):
        print old.name, '<-', new.name
        assert tuple(old.shape.eval()) == tuple(new.shape.eval())
        old.set_value(new.get_value())

    # gru param transfer
    old_params = sorted(get_all_params(l_gru0, trainable=True), key=lambda p: p.name)
    new_params = sorted(get_all_params(s_gru, trainable=True), key=lambda p: p.name)

    for old, new in zip(old_params, new_params):
        print old.name, '<-', new.name
        assert tuple(old.shape.eval()) == tuple(new.shape.eval())
        old.set_value(new.get_value())

    lstm0_out, gru0_out = f_predict0()
    lstm1_out, gru1_out = f_predict1()

    assert np.allclose(lstm0_out, lstm1_out)
    assert np.allclose(gru0_out, gru1_out)
class PRAE: def __init__(self, num_batch, max_len, n_features, hidden=[200, 200], **kwargs): self.num_batch = num_batch self.n_features = n_features self.max_len = max_len self.hidden = hidden rng = np.random.RandomState(123) self.drng = rng self.rng = RandomStreams(rng.randint(2 ** 30)) # params initial_W = np.asarray( rng.uniform( low=-4 * np.sqrt(6. / (self.hidden[1] + self.n_features)), high=4 * np.sqrt(6. / (self.hidden[1] + self.n_features)), size=(self.hidden[1], self.n_features) ), dtype=theano.config.floatX ) self.W = theano.shared(value=initial_W, name='W', borrow=True) # # self.W_y_kappa = theano.shared(value=initial_W, name='W_y_kappa', borrow=True) self.b = theano.shared( value=np.zeros( self.n_features, dtype=theano.config.floatX ), borrow=True ) # self.b_y_kappa = theano.shared( # value=np.zeros( # self.n_features, # dtype=theano.config.floatX # ), # name='b', # borrow=True # ) # I could directly create the model here since it is fixed self.l_in = InputLayer(shape=(None, self.max_len, self.n_features)) self.mask_input = InputLayer(shape=(None, self.max_len)) first_hidden = GRULayer(self.l_in, mask_input=self.mask_input, num_units=hidden[0]) # l_shp = ReshapeLayer(first_hidden, (-1, hidden[0])) # l_dense = DenseLayer(l_shp, num_units=self.hidden[0], nonlinearity=rectify) # l_drop = DropoutLayer(l_dense, p=0.5) # l_shp = ReshapeLayer(l_drop, (-1, self.max_len, self.hidden[0])) self.model = GRULayer(first_hidden, num_units=hidden[1]) # self.model = ConcatLayer([first_hidden, second_hidden], axis=2) # l_shp = ReshapeLayer(second_hidden, (-1, hidden[1])) # l_dense = DenseLayer(l_shp, num_units=self.n_features, nonlinearity=rectify) # To reshape back to our original shape, we can use the symbolic shape # variables we retrieved above. #self.model = ReshapeLayer(l_dense, (-1, self.max_len, self.n_features)) # if now I put a dense layer this will collect all the output temporally which is what I want, I'll have to fix # the dimensions probably later # For every gaussian in the sum I need 3 values plus a value for the total scale # the output of this layer will be (num_batch, num_units, max_len) TODO check size def get_output_shape_for(self): return self.model.get_output_shape_for(self.num_batch, self.max_len, self.hidden[1]) def get_output_y(self, x): return T.nnet.relu(T.dot(x, self.W) + self.b) def build_model(self, train_x, train_mask_x, train_mask_out, train_target, test_x, test_mask_x, test_mask_out, test_target): self.train_x = train_x self.train_mask_x = train_mask_x self.train_mask_out = train_mask_out self.train_target = train_target self.test_x = test_x self.test_mask_x = test_mask_x self.test_mask_out = test_mask_out self.test_target = test_target self.index = T.iscalar('index') self.num_batch_test = T.iscalar('index') self.b_slice = slice(self.index * self.num_batch, (self.index + 1) * self.num_batch) sym_x = T.dtensor3() sym_mask_x = T.dmatrix() sym_target = T.dtensor3() sym_mask_out = T.dtensor3() # sym_mask_out = T.dtensor3() should not be useful since output is still zero # TODO think about this if it is true out = lasagne.layers.get_output(self.model, inputs={self.l_in: sym_x, self.mask_input: sym_mask_x}) out_out = self.get_output_y(out) loss = T.mean(lasagne.objectives.squared_error(out_out, sym_target)) / self.num_batch out_test = lasagne.layers.get_output(self.model, inputs={self.l_in: sym_x, self.mask_input: sym_mask_x}) out_out_test = self.get_output_y(out_test) loss_test = T.mean(lasagne.objectives.squared_error(out_out_test, sym_target)) / self.num_batch_test 
all_params = [self.W] + [self.b] +lasagne.layers.get_all_params(self.model) all_grads_target = [T.clip(g, -3, 3) for g in T.grad(loss, all_params)] all_grads_target = lasagne.updates.total_norm_constraint(all_grads_target, 3) updates_target = adam(all_grads_target, all_params) train_model = theano.function([self.index], [loss, out_out], givens={sym_x: self.train_x[self.b_slice], sym_mask_x: self.train_mask_x[self.b_slice], sym_target: self.train_target[self.b_slice], }, updates=updates_target) test_model = theano.function([self.num_batch_test], [loss_test, out_out_test], givens={sym_x: self.test_x, sym_mask_x: self.test_mask_x, sym_target: self.test_target, }) return train_model, test_model
def test_gru_init_val_error():
    # check if errors are raised when init is non matrix tensorVariable
    vector = T.vector()
    with pytest.raises(ValueError):
        l_rec = GRULayer(InputLayer((2, 2, 3)), 5, hid_init=vector)
def __init__(self, model_name, max_seq_len, num_features_pitch, num_features_duration, num_gru_layer_units=25, set_x_input_to_zero=False, in_dropout_p=0, out_dropout_p=0, use_l2_penalty=False): super(GRU_Network, self).__init__(model_name, max_seq_len, num_features_pitch, num_features_duration, num_gru_layer_units, set_x_input_to_zero, in_dropout_p, use_l2_penalty) self.out_dropout_p = out_dropout_p ##### THE LAYERS OF THE NEXT-STEP PREDICTION GRU NETWORK ##### ### INPUT NETWORK ### # Two input layers receiving Onehot-encoded data l_in_pitch = InputLayer((None, None, self.num_features_pitch), name="l_in_pitch") l_in_duration = InputLayer((None, None, self.num_features_duration), name="l_in_duration") # Layer merging the two input layers l_in_merge = ConcatLayer([l_in_pitch, l_in_duration], axis=2, name="l_in_merge") # Dropout in input network l_in_intermediate = l_in_merge if self.in_dropout_p > 0: l_in_intermediate = DropoutLayer(l_in_intermediate, rescale=False, p=self.in_dropout_p, shared_axes=(1,2)) # The mask layer for ignoring time-steps after <eos> in the GRU layer l_in_mask = InputLayer((None, self.max_seq_len), name="l_in_mask") ### OUTPUT NETWORK ### # A normal GRU layer self.l_out_gru = GRULayer(l_in_intermediate, num_units=self.num_gru_layer_units, name='GRULayer', mask_input=l_in_mask) # Dropout in output network l_out_intermediate = self.l_out_gru if self.out_dropout_p > 0: l_out_intermediate = DropoutLayer(l_out_intermediate, rescale=False, p=self.out_dropout_p) # We need to do some reshape voodo to connect a softmax layer. # See http://lasagne.readthedocs.org/en/latest/modules/layers/recurrent.html#examples # In short this line changes the shape from # (batch_size, decode_len, num_dec_units) -> (batch_size*decodelen,num_dec_units). # We need to do this since the softmax is applied to the last dimension and we want to # softmax the output at each position individually l_out_reshape = ReshapeLayer(l_out_intermediate, (-1, [2]), name="l_out_reshape") # Setting up the output-layers as softmax-encoded pitch and duration vectors from the dense layers. (Two dense layers with softmax output, e.g. prediction probabilities for next note in melody) l_out_softmax_pitch = DenseLayer(l_out_reshape, num_units=self.num_features_pitch, nonlinearity=lasagne.nonlinearities.softmax, name='SoftmaxOutput_pitch') l_out_softmax_duration = DenseLayer(l_out_reshape, num_units=self.num_features_duration, nonlinearity=lasagne.nonlinearities.softmax, name='SoftmaxOutput_duration') # reshape back to 3d format (batch_size, decode_len, num_dec_units). Here we tied the batch size to the shape of the symbolic variable for X allowing #us to use different batch sizes in the model. self.l_out_pitch = ReshapeLayer(l_out_softmax_pitch, (-1, self.max_seq_len, self.num_features_pitch), name="l_out_pitch") self.l_out_duration = ReshapeLayer(l_out_softmax_duration, (-1, self.max_seq_len, self.num_features_duration), name="l_out_duration") ### NETWORK OUTPUTS ### # Setting up the output as softmax-encoded pitch and duration vectors from the dense softmax layers. 
# (OBS: This is bypassing the onehot layers, so we evaluate the model on the softmax-outputs directly) output_pitch_train = get_output(self.l_out_pitch, {l_in_pitch: self.x_pitch_sym, l_in_duration: self.x_duration_sym, l_in_mask: self.x_mask_sym}, deterministic = False) output_duration_train = get_output(self.l_out_duration, {l_in_pitch: self.x_pitch_sym, l_in_duration: self.x_duration_sym, l_in_mask: self.x_mask_sym}, deterministic = False) output_pitch_eval = get_output(self.l_out_pitch, {l_in_pitch: self.x_pitch_sym, l_in_duration: self.x_duration_sym, l_in_mask: self.x_mask_sym}, deterministic = True) output_duration_eval = get_output(self.l_out_duration, {l_in_pitch: self.x_pitch_sym, l_in_duration: self.x_duration_sym, l_in_mask: self.x_mask_sym}, deterministic = True) output_gru = get_output(self.l_out_gru, {l_in_pitch: self.x_pitch_sym, l_in_duration: self.x_duration_sym, l_in_mask: self.x_mask_sym}, deterministic = True) #Get parameters from all layers except nondeterministic (dropout) all_parameters = get_all_params([self.l_out_pitch, self.l_out_duration], trainable=True) print "Trainable Model Parameters" print "-"*40 for param in all_parameters: print param, param.get_value().shape print "-"*40 # Compute costs # For indeterministic training cost_pitch_train, acc_pitch_train = eval(output_pitch_train, self.y_pitch_sym, self.num_features_pitch, self.y_mask_sym) cost_duration_train, acc_duration_train = eval(output_duration_train, self.y_duration_sym, self.num_features_duration, self.y_mask_sym) if self.use_l2_penalty: l2_penalty = regularize_layer_params([self.l_out_pitch, self.l_out_duration], l2) else: l2_penalty = 0 total_cost = cost_pitch_train + cost_duration_train + l2_penalty # and deterministic evaluation cost_pitch_eval, acc_pitch_eval = eval(output_pitch_eval, self.y_pitch_sym, self.num_features_pitch, self.y_mask_sym) cost_duration_eval, acc_duration_eval = eval(output_duration_eval, self.y_duration_sym, self.num_features_duration, self.y_mask_sym) #add grad clipping to avoid exploding gradients all_grads = [T.clip(g,-3,3) for g in T.grad(total_cost, all_parameters)] all_grads = lasagne.updates.total_norm_constraint(all_grads,3) #Compile Theano functions. updates = lasagne.updates.adam(all_grads, all_parameters, learning_rate=0.005) self.f_train = theano.function([self.x_pitch_sym, self.y_pitch_sym, self.x_duration_sym, self.y_duration_sym, self.x_mask_sym, self.y_mask_sym], [cost_pitch_train, acc_pitch_train, output_pitch_train, cost_duration_train, acc_duration_train, output_duration_train], updates=updates) #since we have stochasticity in the network when dropout is used we will use the evaluation graph without any updates given and deterministic=True. self.f_eval = theano.function([self.x_pitch_sym, self.y_pitch_sym, self.x_duration_sym, self.y_duration_sym, self.x_mask_sym, self.y_mask_sym], [cost_pitch_eval, acc_pitch_eval, output_pitch_eval, cost_duration_eval, acc_duration_eval, output_duration_eval]) self.f_eval_gru = theano.function([self.x_pitch_sym, self.x_duration_sym, self.x_mask_sym], output_gru)
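The "reshape voodoo" used around the softmax layers above is the standard Lasagne pattern for applying a per-timestep softmax to recurrent output. A stripped-down sketch of that pattern, with hypothetical sizes unrelated to the model above:

from lasagne.layers import InputLayer, GRULayer, ReshapeLayer, DenseLayer
from lasagne.nonlinearities import softmax

seq_len, num_classes, num_units = 20, 50, 100
l_in = InputLayer((None, seq_len, 64))                          # (batch, time, features)
l_gru = GRULayer(l_in, num_units=num_units)                     # (batch, time, num_units)
l_flat = ReshapeLayer(l_gru, (-1, num_units))                   # (batch*time, num_units)
l_soft = DenseLayer(l_flat, num_classes, nonlinearity=softmax)  # row-wise softmax
l_out = ReshapeLayer(l_soft, (-1, seq_len, num_classes))        # back to (batch, time, num_classes)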
class Sent2Vec: def __init__(self, in_path, concat=False, wsi_path=None, dat_path='data/dat.pkl', supp_path='data/supp.pkl'): self.in_path = in_path self.concat = concat self.wsi_path = wsi_path self.lm_mode = 'default' with open(self.in_path, 'rb') as f: p = pk.load(f) self.do_brnn = False if 'do_brnn' in p: self.do_brnn = p['do_brnn'] self.is_lstm = 'Wxo' in p['params'] self.is_gru = 'Whr' in p['params'] if 'Wt' not in p: self.lm_mode = 'iden' elif p['Wt'].get_value().ndim == 1: self.lm_mode = 'diag' self.params = p['params'] self.dwe = self.params['dwe'] # disambiguated word embeddings self.td = self.dwe.get_value().shape[1] self.hd = self.params['L'].get_value().shape[1] self.gc = 2 self.l_mask = InputLayer((None, None), trainable=False) if self.is_lstm: self.l_gru_emb = LSTMLayer((None, None, self.td), self.hd, grad_clipping=self.gc, \ ingate=Gate(W_in=self.params['Wxr'], W_hid=self.params['Whr'], b=self.params['br'], W_cell=None), \ forgetgate=Gate(W_in=self.params['Wxu'], W_hid=self.params['Whu'], b=self.params['bu'], W_cell=None), \ outgate=Gate(W_in=self.params['Wxo'], W_hid=self.params['Who'], b=self.params['bo'], W_cell=None), \ cell=Gate(W_in=self.params['Wxc'], W_hid=self.params['Whc'], b=self.params['bc'], W_cell=None,\ nonlinearity=nonlinearities.tanh),mask_input=self.l_mask, peepholes=False) if self.do_brnn: self.l_bgru_emb = LSTMLayer((None, None, self.td), self.hd, grad_clipping=self.gc, \ ingate=Gate(W_in=self.params['bWxr'], W_hid=self.params['bWhr'], b=self.params['bbr'], W_cell=None), \ forgetgate=Gate(W_in=self.params['bWxu'], W_hid=self.params['bWhu'], b=self.params['bbu'], W_cell=None), \ outgate=Gate(W_in=self.params['bWxo'], W_hid=self.params['bWho'], b=self.params['bbo'], W_cell=None), \ cell=Gate(W_in=self.params['bWxc'], W_hid=self.params['bWhc'], b=self.params['bbc'], W_cell=None,\ nonlinearity=nonlinearities.tanh),mask_input=self.l_mask, peepholes=False, backwards=True) elif self.is_gru: self.l_gru_emb = GRULayer((None, None, self.td), self.hd, grad_clipping=self.gc, \ resetgate=Gate(W_in=self.params['Wxr'], W_hid=self.params['Whr'], b=self.params['br'], W_cell=None), \ updategate=Gate(W_in=self.params['Wxu'], W_hid=self.params['Whu'], b=self.params['bu'], W_cell=None), \ hidden_update=Gate(W_in=self.params['Wxc'], W_hid=self.params['Whc'], b=self.params['bc'], W_cell=None,\ nonlinearity=nonlinearities.tanh),mask_input=self.l_mask) if self.do_brnn: self.l_bgru_emb = GRULayer((None, None, self.td), self.hd, grad_clipping=self.gc, \ resetgate=Gate(W_in=self.params['bWxr'], W_hid=self.params['bWhr'], b=self.params['bbr'], W_cell=None), \ updategate=Gate(W_in=self.params['bWxu'], W_hid=self.params['bWhu'], b=self.params['bbu'], W_cell=None), \ hidden_update=Gate(W_in=self.params['bWxc'], W_hid=self.params['bWhc'], b=self.params['bbc'], W_cell=None,\ nonlinearity=nonlinearities.tanh),mask_input=self.l_mask, backwards=True) else: self.is_nlm = True with open(dat_path, 'rb') as f: d = pk.load(f) self.nw, self.mw, self.ms = d['def'].shape # num words, max num of words, max num of senses self.dw = d['dw'] # dw to index self.aw = d['aw'] self.no = len(d['aw']) if 'spriors' in d: self.sense_priors = d['spriors'] else: self.sense_priors = np.ones((self.no, self.ms)) with open(supp_path, 'rb') as f: s = pk.load(f) self.id2aw = s['id2aw'] self.id2dw = s['id2dw'] self.aw2dw = s['aw2dw'] self.build_encoder() # assume xml-style input # output: 'lemma.pos instance-id sense-name/rating' def perform_wsi(self): expr = '[' + string.punctuation + ']' jaccard = False for d in 
os.listdir(self.wsi_path): f = os.path.join(self.wsi_path, d) if not os.path.isfile(f): continue with open(f) as fin: wsi = xd.parse(fin.read()) for inst in wsi['instances']['instance']: tok = inst['@token'] txt = re.sub(expr, ' ', inst['#text']) lemma = inst['@lemma'] pos = inst['@partOfSpeech'] inst_id = inst['@id'] ind = txt.split().index(tok) s, m, ptmp = self.to_indexes(txt, token=tok, pos=pos, lem=lemma) '''s = s.reshape(1, *s.shape) m = m.reshape(1, *m.shape) fu = np.asarray([ptmp]).astype(np.int32) weights = self.get_weights(s, m, fu, np.ones_like(s).astype(np.float32)) # mw x ms''' weights = self.get_vector([txt], mode='w', token=tok, pos=pos, lem=lemma) senses = s[ind, :] sweight = weights[0][ind, :] ratings = [(self.id2dw[senses[i]], sweight[i]) \ for i in range(len(sweight)) \ if self.id2dw[senses[i]].split('.')[0] == lemma and sweight[i] > 0.02] ratings.sort(key=lambda k: k[1], reverse=True) if len(ratings) == 0: pdb.set_trace() l = min(3, len(ratings)) if jaccard: r = [k[0] for k in ratings[0:2]] else: r = [k[0] + '/' + str(k[1]) for k in ratings[0:l]] print '{}.{} {} {}'.format(lemma, pos, inst_id, ' '.join(r)) def build_encoder(self): def to_vect(d, m, p): L0 = self.params['L0'] hid_inp = self.dwe[d, :] # mw x ms x hd logit = T.exp(T.dot(hid_inp, L0)[:,:,p])# (mw x ms) x mw mk = T.switch(T.lt(p, 0), 0, 1) # mw: word-level mask (different mask from m) mask = mk.dimshuffle(0, 'x', 'x') l2 = logit * mask # mw x ms x mw l2 = T.sum(l2 * mk.dimshuffle('x', 'x', 0), axis=2) * m # mw x ms w0 = l2 / T.sum(l2, axis=1).dimshuffle(0, 'x') w1 = T.switch(T.isnan(w0), 0, w0) w = w1.dimshuffle(0, 1, 'x') # mw x ms x 1 res = T.sum(w * hid_inp, axis=1) # mw x hd return res #, logit, weights def to_weights(d, m, p, prior): hid_inp = self.dwe[d, :] # mw x ms x hd if self.is_lstm or self.is_gru: logit = T.exp(T.dot(hid_inp, L0)[:,:,p])# (mw x ms) x mw mk = T.switch(T.lt(p, 0), 0, 1) # mw: word-level mask (different mask from m) mask = mk.dimshuffle(0, 'x', 'x') l2 = logit * mask # mw x ms x mw l2 = T.sum(l2 * mk.dimshuffle('x', 'x', 0), axis=2) * m # mw x ms w0 = l2 / T.sum(l2, axis=1).dimshuffle(0, 'x') w1 = T.switch(T.isnan(w0), 0, w0) else: if self.lm_mode == 'diag': B = hid_inp * Wt.dimshuffle('x', 'x', 0) tmp = T.tensordot(B, B.T, axes = 1) elif self.lm_mode == 'iden': logit = T.tensordot(self.dwe[d, :], self.dwe.T, axes=1)[:,:,d] # mw x ms x mw x ms cnt = T.sum(m, axis=1).dimshuffle('x', 'x', 0) # 1 x 1 x mw logit = T.sum(logit * m.dimshuffle('x', 'x', 0, 1), axis=3) / cnt # mw x ms x mw logit = T.exp(10*T.switch(T.isnan(logit), 0, logit)) # mw x ms x mw logit = T.prod(logit, axis=2) * prior # mw x ms sm = T.sum(logit * m, axis=1, keepdims=True) # mw x 1 logit = (logit * m) / sm # mw x ms return T.switch(T.or_(T.isnan(logit), T.isinf(logit)), 0, logit) else: tmp = T.tensordot(T.dot(hid_inp, self.params['Wt']), hid_inp.T, axes=1) # mw x ms x ms x mw tmp = T.exp(tmp.dimshuffle(0, 1, 3, 2)) # mw x ms x mw x ms tmp = tmp * m.dimshuffle('x', 'x', 0, 1) nrm = T.sum(tmp, axis=3) tmp = tmp / nrm.dimshuffle(0, 1, 2, 'x') tmp = T.switch(T.isnan(tmp), 0, tmp) mk = T.switch(T.lt(p, 0), 0, 1) # mw: word-level mask (different mask from m) tmp = T.max(tmp, axis=3) * mk.dimshuffle('x', 'x', 0) # mw x ms x mw tmp = T.exp(T.sum(T.log(T.switch(T.eq(tmp, 0), 1, tmp)), axis=2)) * m # mw x ms tmp = tmp * prior tmp = tmp / T.sum(tmp, axis=1).dimshuffle(0, 'x') w1 = T.switch(T.isnan(tmp), 0, tmp) return w1 st = T.itensor3('st') # bs x len x ms pd = T.imatrix('wi') # bs x len mk = T.itensor3('mk') # bs x len x ms 
wv = T.dmatrix('wv') # bs x hd pe = T.imatrix('pe') # bs x mew pr = T.tensor3('pr') # bs x len x ms weights, _ = theano.scan(fn = to_weights, sequences = [st, mk, pd, pr]) # bs x mw x ms mask = T.ones_like(pd).astype(theano.config.floatX) # bs x len if self.is_lstm or self.is_gru: enc, _ = theano.scan(fn = to_vect, sequences = [st, mk, pd]) # bs x mw x hd enc = enc.astype(theano.config.floatX) fdef_emb = self.l_gru_emb.get_output_for([enc, mask]) # bs x hd if self.do_brnn: bdef_emb = self.l_bgru_emb.get_output_for([enc, mask]) if self.concat: def_emb = T.concatenate([fdef_emb[:,-1,:], bdef_emb[:,0,:]], axis=1) else: def_emb = T.dot(fdef_emb[:, -1, :], self.params['Wf']) + \ T.dot(bdef_emb[:, 0, :], self.params['Wb']) + \ self.params['by'].dimshuffle('x', 0) # bs x hd else: def_emb = fdef_emb[:, -1, :] else: hid_inp = self.dwe[st, :] dat = T.sum(weights.dimshuffle(0, 1, 2, 'x') * hid_inp, axis=2) def_emb = T.sum(T.dot(dat, self.params['L']), axis = 1) self.encode = theano.function([st, mk, pd, pr], def_emb) self.get_weights = theano.function([st, mk, pd, pr], weights) def preproc_word(self, w, pos=None): if pos == 'j': pos = 'a' w = re.sub(r'[\$,\{\}\[\]\(\)`\'\":;!\?\.]', '', w).lower() w = re.sub(r'\-', '_', w) # hyphen -> underscore if w == 'an': w = 'a' # dirty hack.... if w == 'oclock': w = 'o\'clock' if w.isdigit(): w = '<NUM>' wp = wn.morphy(w, pos=pos) if wp is None: wp = w return wp # 'sents' is a list of sentences def get_vector(self, sents, mode='v', token=None, pos='any', lem=None): mw = max([len(s.split()) for s in sents]) s = np.ones((len(sents), mw, self.ms), dtype=np.int32) * -1 m = np.zeros(s.shape, dtype=np.int32) p = np.ones((len(sents), mw), dtype=np.int32) * -1 pr = np.ones((len(sents), mw, self.ms), dtype=np.float32) sp = self.sense_priors # no x ms for (si, sn) in enumerate(sents): s[si], m[si], p_tmp = self.to_indexes(sn, mw, token=token, pos=pos, lem=lem) p[si][0:len(p_tmp)] = p_tmp for i in range(mw): if i >= len(sn): break pwid = p[si][i] pr[si][i] = sp[pwid, :] if mode == 'v': return self.encode(s, m, p, pr) else: return self.get_weights(s, m, p, pr) # 'sent' is a single string # mw is the maximum number of words (if called from get_vector()) # Setting token = w and pos = p will restrict the processing of 'w' to ones having POS tag 'p' def to_indexes(self, sent, mw = None, token = None, pos = None, lem = None): def same_pos(a, b): if a is None or b is None or a == b: return True if (a == 'a' or a == 's') and b == 'j': return True return False p_tmp = [] sn = sent.split() if mw is None: mw = len(sn) s = np.ones((mw, self.ms), dtype=np.int32) * -1 m = np.zeros(s.shape, dtype=np.int32) for (ind, w) in enumerate(sn): filt = (token is not None) and (w == token) #filter the token using pos if filt: _pos = pos else: _pos = None w = self.preproc_word(w, pos=_pos) if w not in self.aw2dw or len(self.aw2dw[w]) == 0: s[ind, 0] = self.dw['<UNK>'] m[ind, 0] = 1.0 else: l = min(10, len(self.aw2dw[w])) if filt: cands = [] if lem is not None: w = lem for wp in self.aw2dw[w]: try: if same_pos(wn.synset(wp).pos(), pos) and wp.split('.')[0] == w: cands.append(wp) except: continue #cands = [wp for wp in self.aw2dw[w] if same_pos(wn.synset(wp).pos(), pos)] l = min(25, len(cands)) s[ind][0:l] = [self.dw[wp] for wp in cands][0:25] else: s[ind][0:l] = [self.dw[wp] for wp in self.aw2dw[w][0:l]] m[ind][0:l] = np.ones((l,)) if l == 0: pdb.set_trace() if w in self.aw: p_tmp.append(self.aw[w]) else: p_tmp.append(0) return s, m, p_tmp
class PRAE:
    def __init__(self, num_batch, max_len, n_features, hidden=[200, 200], **kwargs):
        self.num_batch = num_batch
        self.n_features = n_features
        self.max_len = max_len
        self.hidden = hidden
        rng = np.random.RandomState(123)
        self.drng = rng
        self.rng = RandomStreams(rng.randint(2 ** 30))

        # params
        initial_W = np.asarray(
            rng.uniform(
                low=1e-5,
                high=1,
                size=(self.hidden[1], self.n_features)
            ),
            dtype=theano.config.floatX
        )

        self.W_y_theta = theano.shared(value=initial_W, name='W_y_theta', borrow=True)
        # self.W_y_kappa = theano.shared(value=initial_W, name='W_y_kappa', borrow=True)
        self.b_y_theta = theano.shared(
            value=np.zeros(
                self.n_features,
                dtype=theano.config.floatX
            ),
            borrow=True
        )
        # self.b_y_kappa = theano.shared(
        #     value=np.zeros(
        #         self.n_features,
        #         dtype=theano.config.floatX
        #     ),
        #     name='b',
        #     borrow=True
        # )

        # I could directly create the model here since it is fixed
        self.l_in = InputLayer(shape=(self.num_batch, self.max_len, self.n_features))
        self.mask_input = InputLayer(shape=(self.num_batch, self.max_len))
        first_hidden = GRULayer(self.l_in, mask_input=self.mask_input, num_units=hidden[0])
        self.model = GRULayer(first_hidden, num_units=hidden[1])
        # need some reshape voodoo
        # l_shp = ReshapeLayer(second_hidden, (-1, hidden[1]))
        # after the reshape I have batch*max_len X features
        # self.model = DenseLayer(l_shp, num_units=self.n_features, nonlinearity=rectify)
        # if now I put a dense layer this will collect all the output temporally which is what I want, I'll have to fix
        # the dimensions probably later
        # For every gaussian in the sum I need 3 values plus a value for the total scale
        # the output of this layer will be (num_batch, num_units, max_len) TODO check size

    def get_output_shape_for(self):
        return self.model.get_output_shape_for(self.num_batch, self.max_len, self.hidden[2])

    def get_output_y(self, output):
        # (batch, time, hidden) X (hidden, features) + (, features) => (batch, time, features)
        theta_out = T.nnet.relu(T.dot(output, self.W_y_theta) + self.b_y_theta)
        # kappa_out = T.nnet.relu(T.dot(output, self.W_y_kappa) + self.b_y_kappa)
        return theta_out

    def get_log_x(self, x, theta_out):
        # DIM = (batch, time, hidden)
        # everything is elementwise
        log_x = T.log(theta_out + 1e-8) - theta_out * x
        log_x = log_x.sum(axis=2, dtype=theano.config.floatX)  # sum over x cause I assume they are independent
        return log_x

    def build_model(self, train_x, train_mask_x, train_mask_out, train_target,
                    test_x, test_mask_x, test_mask_out, test_target):
        self.train_x = train_x
        self.train_mask_x = train_mask_x
        self.train_mask_out = train_mask_out
        self.train_target = train_target
        self.test_x = test_x
        self.test_mask_x = test_mask_x
        self.test_mask_out = test_mask_out
        self.test_target = test_target

        self.index = T.iscalar('index')
        self.num_batch_test = T.iscalar('index')
        self.b_slice = slice(self.index * self.num_batch, (self.index + 1) * self.num_batch)

        sym_x = T.dtensor3()
        sym_mask_x = T.dmatrix()
        sym_target = T.dtensor3()
        # sym_mask_out = T.dtensor3() should not be useful since output is still zero
        # TODO think about this if it is true

        output = lasagne.layers.get_output(
            self.model, inputs={self.l_in: sym_x, self.mask_input: sym_mask_x})
        theta = self.get_output_y(output)
        log_px = self.get_log_x(sym_target, theta)
        log_px_sum_time = log_px.sum(axis=1, dtype=theano.config.floatX)  # sum over time
        loss = -T.sum(log_px_sum_time) / self.num_batch  # average over batch

        ##
        log_px_test = self.get_log_x(sym_target, theta)
        log_px_sum_time_test = log_px_test.sum(axis=1, dtype=theano.config.floatX)  # sum over time
        loss_test = -T.sum(log_px_sum_time_test) / self.num_batch_test  # average over batch
        # loss = T.mean(lasagne.objectives.squared_error(mu, sym_target))

        all_params = [self.W_y_theta] + [self.b_y_theta] + lasagne.layers.get_all_params(self.model)
        all_grads_target = [T.clip(g, -3, 3) for g in T.grad(loss, all_params)]
        all_grads_target = lasagne.updates.total_norm_constraint(all_grads_target, 3)
        updates_target = adam(all_grads_target, all_params)

        train_model = theano.function([self.index],
                                      [loss, theta, log_px],
                                      givens={sym_x: self.train_x[self.b_slice],
                                              sym_mask_x: self.train_mask_x[self.b_slice],
                                              sym_target: self.train_target[self.b_slice]},
                                      updates=updates_target)

        test_model = theano.function([self.num_batch_test],
                                     [loss_test, theta],
                                     givens={sym_x: self.test_x,
                                             sym_mask_x: self.test_mask_x,
                                             sym_target: self.test_target})

        return train_model, test_model
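get_log_x above computes the elementwise log-density of an exponential distribution with rate theta, log p(x|theta) = log(theta) - theta*x (plus a 1e-8 guard inside the log). A quick NumPy/SciPy check of that identity on made-up numbers, as a sanity sketch:

import numpy as np
from scipy import stats

theta = np.array([0.5, 2.0, 3.0])   # hypothetical rates
x = np.array([1.0, 0.2, 0.7])       # hypothetical observations

manual = np.log(theta) - theta * x                      # what get_log_x computes per element
reference = stats.expon.logpdf(x, scale=1.0 / theta)    # SciPy's exponential log-density
assert np.allclose(manual, reference)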
def recurrence(i, h_tm1, w_a, M_a, *args, **kwargs): """ notes Headers from paper in all caps mem = n_article slots if is_article else n_title_slots :param i: center index of sliding window :param h_tm1: h_{t-1} (hidden state) :param w_a: attention weights for article memory :param M_a: article memory :param args: gru_weights, maybe w_t, maybe M_t gru_weights: weights with which to initialize GRULayer on each time step w_t: attention weights for titles memory M_t: titles memory :param kwargs: is_training, is_article is_training: is_article: we use different parts of memory when working with a article :return: [y = model outputs, i + 1 = increment index, h w, M (see above)] """ is_training = kwargs['is_training'] is_article = kwargs['is_article'] gru_weights = args[:depth] if len(args) > depth: w_t = args[depth] M_t = args[depth + 1] i_type = T.iscalar if is_article or is_training else T.ivector assert i.type == i_type if not is_article: assert w_t is not None and M_t is not None word_idxs = i if is_article or is_training: # get representation of word window document = articles if is_article else titles # [instances, bucket_width] word_idxs = document[:, i:i + 1] # [instances, 1] # x_i = self.emb[word_idxs].flatten(ndim=2) # [instances, embedding_dim] input = InputLayer(shape=(None, 1), input_var=word_idxs) embed = EmbeddingLayer(input, num_embeddings, embedding_dim) gru = GRULayer(incoming=embed, num_units=embedding_dim, hid_init=self.gru0) for weight in gru_weights: gru = GRULayer(incoming=gru, num_units=embedding_dim, hid_init=weight) x_i = get_output(gru).flatten(ndim=2) x_i = Print('x_i')(x_i) # [instances, embedding_dim] gru_weights = [] if is_article: M_read = M_a # [instances, memory_size, n_article_slots] w_read = w_a # [instances, n_article_slots] else: M_read = T.concatenate( [M_a, M_t], axis=2) # [instances, memory_size, n_title_slots] w_read = T.concatenate([w_a, w_t], axis=1) # [instances, n_title_slots] # eqn 15 c = T.batched_dot(M_read, w_read) # [instances, memory_size] # EXTERNAL MEMORY READ def get_attention(Wg, bg, M, w): g = T.nnet.sigmoid(T.dot(x_i, Wg) + bg) # [instances, mem] # eqn 11 k = T.dot(h_tm1, self.Wk) + self.bk # [instances, memory_size] # eqn 13 beta = T.dot(h_tm1, self.Wb) + self.bb beta = T.nnet.softplus(beta) beta = T.addbroadcast(beta, 1) # [instances, 1] # eqn 12 w_hat = T.nnet.softmax(beta * cosine_dist(M, k)) # eqn 14 return (1 - g) * w + g * w_hat # [instances, mem] w_a = get_attention(self.Wg_a, self.bg_a, M_a, w_a) # [instances, n_article_slots] if not is_article: w_t = get_attention(self.Wg_t, self.bg_t, M_t, w_t) # [instances, n_title_slots] # MODEL INPUT AND OUTPUT # eqn 9 h = T.dot(c, self.Wh) + T.dot( x_i, self.Wx) + self.bh # [instances, hidden_size] # eqn 10 y = T.nnet.softmax(T.dot(h, self.W) + self.b) # [instances, nclasses] # EXTERNAL MEMORY UPDATE def update_memory(We, be, w_update, M_update): # eqn 17 e = T.nnet.sigmoid(T.dot(h_tm1, We) + be) # [instances, mem] f = 1. 
- w_update * e # [instances, mem] # eqn 16 v = T.tanh(T.dot(h, self.Wv) + self.bv) # [instances, memory_size] # need to add broadcast layers for memory update f = f.dimshuffle(0, 'x', 1) # [instances, 1, mem] u = w_update.dimshuffle(0, 'x', 1) # [instances, 1, mem] v = v.dimshuffle(0, 1, 'x') # [instances, memory_size, 1] # eqn 19 return M_update * f + T.batched_dot(v, u) * ( 1 - f) # [instances, memory_size, mem] M_a = update_memory(self.We_a, self.be_a, w_a, M_a) attention_and_memory = [w_a, M_a] if not is_article: M_t = update_memory(self.We_t, self.be_t, w_t, M_t) attention_and_memory += [w_t, M_t] y_max = y.argmax(axis=1).astype(int32) next_idxs = i + 1 if is_training or is_article else y_max return [y, y_max, next_idxs, h] + attention_and_memory