def __init__(self, name, config):
    """Build an encoder-decoder translation model from `config`.

    Presumably an attention-based NMT model: source/target embedding
    tables, a tied emission layer, a bidirectional LSTM encoder pair
    and an attentional LSTM decoder.  Also compiles two convenience
    Theano functions (`predict_fun`, `encode_fun`).
    """
    super().__init__(name)
    self.config = config
    # Source/target embedding tables, initialized with a Gaussian
    # scaled by the embedding dimensionality.
    self.param('src_embeddings',
               (len(config['src_encoder']), config['src_embedding_dims']),
               init_f=Gaussian(fan_in=config['src_embedding_dims']))
    self.param('trg_embeddings',
               (len(config['trg_encoder']), config['trg_embedding_dims']),
               init_f=Gaussian(fan_in=config['trg_embedding_dims']))
    self.add(Linear('hidden',
                    config['decoder_state_dims'],
                    config['trg_embedding_dims']))
    # Emission weights are tied to the transposed target embeddings.
    # NOTE(review): `self._trg_embeddings` is presumably created by
    # the `self.param('trg_embeddings', ...)` call above — confirm
    # against the base class's `param()` implementation.
    self.add(Linear('emission',
                    config['trg_embedding_dims'],
                    len(config['trg_encoder']),
                    w=self._trg_embeddings.T))
    # Two encoder directions; the backward pass additionally receives
    # the forward encoder's states (hence the widened input size).
    for prefix, backwards in (('fwd', False), ('back', True)):
        self.add(Sequence(
            prefix+'_encoder', LSTM, backwards,
            config['src_embedding_dims'] + (
                config['encoder_state_dims'] if backwards else 0),
            config['encoder_state_dims'],
            layernorm=config['encoder_layernorm'],
            dropout=config['encoder_dropout'],
            trainable_initial=True,
            offset=0))
    # Attentional decoder over the concatenated (2x) encoder states.
    self.add(Sequence(
        'decoder', LSTM, False,
        config['trg_embedding_dims'],
        config['decoder_state_dims'],
        layernorm=config['decoder_layernorm'],
        dropout=config['decoder_dropout'],
        attention_dims=config['attention_dims'],
        attended_dims=2*config['encoder_state_dims'],
        trainable_initial=False,
        offset=-1))
    # Compiled function: decoder hidden state -> softmax over symbols.
    h_t = T.matrix('h_t')
    self.predict_fun = function(
        [h_t],
        T.nnet.softmax(self.emission(T.tanh(self.hidden(h_t)))))
    # Compiled function: encode a masked batch of source sentences.
    inputs = T.lmatrix('inputs')
    inputs_mask = T.bmatrix('inputs_mask')
    self.encode_fun = function(
        [inputs, inputs_mask],
        self.encode(inputs, inputs_mask))
def __init__(self, name, config):
    """Construct the language model's layers from `config`.

    Wires up a shared embedding table, a tanh hidden projection, an
    emission layer tied to the transposed embedding weights, and an
    LSTM decoder sequence; finally compiles a one-step softmax
    prediction function.
    """
    super().__init__(name)
    self.config = config

    # Log the configuration so it shows up in training output.
    pprint(config)
    sys.stdout.flush()

    n_symbols = config['n_symbols']
    embedding_dims = config['embedding_dims']
    state_dims = config['state_dims']

    self.add(Embeddings('embeddings', n_symbols, embedding_dims))
    self.add(Linear('hidden', state_dims, embedding_dims,
                    dropout=config['dropout'],
                    layernorm=config['layernorm']))
    # Emission weights are tied to the (transposed) embedding matrix.
    self.add(Linear('emission', embedding_dims, n_symbols,
                    w=self.embeddings._w.T))
    self.add(Sequence('decoder', LSTM, False,
                      embedding_dims, state_dims,
                      dropout=config['recurrent_dropout'],
                      layernorm=config['recurrent_layernorm'],
                      trainable_initial=True, offset=-1))

    # Compiled function: hidden state -> distribution over symbols.
    state = T.matrix('h_t')
    emit = self.emission(T.tanh(self.hidden(state)))
    self.predict_fun = function([state], T.nnet.softmax(emit))
def step_fun(self):
    """Return a compiled single-step Theano function, built lazily.

    On first use, collects the step inputs — the sequence input and
    its mask, the fed-back recurrent state variables, and any
    non-sequence inputs — and compiles `self.step` over them.  The
    compiled function is cached in `self._step_fun` and reused on
    subsequent calls.
    """
    if self._step_fun is None:
        all_inputs = [T.matrix('inputs'), T.vector('inputs_mask')]
        # Recurrent states are fed back into the step, except those
        # marked OutputOnly (produced but never consumed).
        # (Was `not rec.init == OutputOnly`; `!=` is the PEP 8 form.)
        all_inputs.extend(
            rec.variable for rec in self.recurrences
            if rec.init != OutputOnly)
        all_inputs.extend(
            nonseq.variable for nonseq in self.non_sequences)
        self._step_fun = function(
            all_inputs, self.step(*all_inputs),
            name='{}_step_fun'.format(self.name))
    return self._step_fun
def __init__(self, name, config):
    """Set up the language model: embeddings, hidden projection,
    tied emission layer and LSTM decoder, plus a compiled one-step
    prediction function.
    """
    super().__init__(name)
    self.config = config

    # Dump the configuration for the training log.
    pprint(config)
    sys.stdout.flush()

    self.add(Embeddings(
        'embeddings', config['n_symbols'], config['embedding_dims']))
    self.add(Linear(
        'hidden', config['state_dims'], config['embedding_dims'],
        layernorm=config['layernorm'], dropout=config['dropout']))
    # Output layer shares (transposed) weights with the embeddings.
    self.add(Linear(
        'emission', config['embedding_dims'], config['n_symbols'],
        w=self.embeddings._w.T))
    self.add(Sequence(
        'decoder', LSTM, False,
        config['embedding_dims'], config['state_dims'],
        layernorm=config['recurrent_layernorm'],
        dropout=config['recurrent_dropout'],
        offset=-1, trainable_initial=True))

    # predict_fun: hidden state -> softmax distribution over symbols.
    h_t = T.matrix('h_t')
    probs = T.nnet.softmax(self.emission(T.tanh(self.hidden(h_t))))
    self.predict_fun = function([h_t], probs)
# NOTE(review): this chunk begins inside a `loss()` method whose `def`
# line is outside the visible source; the first two statements below
# are that method's tail and need the enclosing def/class restored.
loss = super().loss()
# Total loss = base-class loss (presumably regularization) + MSE.
return loss + ((self(inputs) - outputs)**2).mean()

def __call__(self, inputs):
    # Forward pass: tanh hidden layer followed by a sigmoid output.
    return T.nnet.sigmoid(self.output(T.tanh(self.hidden(inputs))))

if __name__ == '__main__':
    # The four XOR input patterns and their targets.
    x = np.array([[0, 0], [0, 1], [1, 0], [1, 1]],
                 dtype=theano.config.floatX)
    y = np.array([[0], [1], [1], [0]], dtype=theano.config.floatX)
    # Rescale from {0,1} into [0.1, 0.9], away from sigmoid saturation.
    x = x * 0.8 + 0.1
    y = y * 0.8 + 0.1
    inputs = T.matrix('inputs')
    outputs = T.matrix('outputs')
    xor = MLP('xor')
    optimizer = Adam(xor.parameters(), xor.loss(inputs, outputs),
                     [inputs], [outputs])
    # Train for a fixed number of steps, aborting on divergence.
    for i in range(1000):
        loss = optimizer.step(x, y)
        if np.isnan(loss):
            print('NaN at iteration %d!' % (i + 1))
            break
    print('Last loss = %g. Predictions vs targets:' % loss)
    predict = function([inputs], xor(inputs), name='XOR_predict')
    print(np.hstack([predict(x), y]))
# NOTE(review): this chunk starts and ends mid-scope; names such as
# `lm`, `sents`, `test_size`, `encoder`, `config`, `n_epochs` and
# `batch_size` are defined outside the visible source.
# Symbolic batch of target sequences plus its padding mask.
sym_outputs = T.lmatrix('outputs')
sym_outputs_mask = T.bmatrix('outputs_mask')

# Create an optimizer instance, manually specifying which
# parameters to optimize, which loss function to use, which inputs
# (none) and outputs are used for the model. We also specify the
# gradient clipping threshold.
optimizer = Adam(
    lm.parameters(), lm.loss(sym_outputs, sym_outputs_mask),
    [], [sym_outputs, sym_outputs_mask],
    grad_max_norm=5.0)

# Compile a function to compute cross-entropy of a batch.
cross_entropy = function(
    [sym_outputs, sym_outputs_mask],
    lm.cross_entropy(sym_outputs, sym_outputs_mask))

# Hold out the first `test_size` sentences for evaluation.
test_set = sents[:test_size]
train_set = sents[test_size:]

# Get one batch of testing data, encoded as a masked matrix.
test_outputs, test_outputs_mask = encoder.pad_sequences(
    test_set, max_length=config['max_length'])

batch_nr = 0
sent_nr = 0
for i in range(n_epochs):
    for batch in iterate_batches(train_set, batch_size, len):
        # NOTE(review): the chunk is cut off inside this loop body;
        # the training step that consumes outputs/outputs_mask follows
        # outside the visible source.
        outputs, outputs_mask = encoder.pad_sequences(
            batch, max_length=config['max_length'])
# NOTE(review): this chunk begins inside a `loss()` method whose `def`
# line is outside the visible source; the `return` below is its tail.
# Total loss = base loss computed earlier + mean squared error.
return loss + ((self(inputs) - outputs) ** 2).mean()

def __call__(self, inputs):
    # Forward pass: tanh hidden layer followed by a sigmoid output.
    return T.nnet.sigmoid(self.output(T.tanh(self.hidden(inputs))))

if __name__ == '__main__':
    # The four XOR input patterns and their targets.
    x = np.array([[0, 0], [0, 1], [1, 0], [1, 1]],
                 dtype=theano.config.floatX)
    y = np.array([[0], [1], [1], [0]], dtype=theano.config.floatX)
    # Rescale from {0,1} into [0.1, 0.9], away from sigmoid saturation.
    x = x*0.8 + 0.1
    y = y*0.8 + 0.1
    inputs = T.matrix('inputs')
    outputs = T.matrix('outputs')
    xor = MLP('xor')
    optimizer = Adam(xor.parameters(), xor.loss(inputs, outputs),
                     [inputs], [outputs])
    # Train for a fixed number of steps, aborting on divergence.
    for i in range(1000):
        loss = optimizer.step(x, y)
        if np.isnan(loss):
            print('NaN at iteration %d!' % (i+1))
            break
    print('Last loss = %g. Predictions vs targets:' % loss)
    predict = function([inputs], xor(inputs), name='XOR_predict')
    print(np.hstack([predict(x), y]))
# Create the model. sym_outputs = T.lmatrix('outputs') sym_outputs_mask = T.bmatrix('outputs_mask') # Create an optimizer instance, manually specifying which # parameters to optimize, which loss function to use, which inputs # (none) and outputs are used for the model. We also specify the # gradient clipping threshold. optimizer = Adam(lm.parameters(), lm.loss(sym_outputs, sym_outputs_mask), [], [sym_outputs, sym_outputs_mask], grad_max_norm=5.0) # Compile a function to compute cross-entropy of a batch. cross_entropy = function([sym_outputs, sym_outputs_mask], lm.cross_entropy(sym_outputs, sym_outputs_mask)) test_set = sents[:test_size] train_set = sents[test_size:] # Get one batch of testing data, encoded as a masked matrix. test_outputs, test_outputs_mask = encoder.pad_sequences( test_set, max_length=config['max_length']) batch_nr = 0 sent_nr = 0 for i in range(n_epochs): for batch in iterate_batches(train_set, batch_size, len): outputs, outputs_mask = encoder.pad_sequences( batch, max_length=config['max_length'])
def __init__(self, name, config):
    """Build a character-aware encoder-decoder translation model.

    Presumably a hybrid word/character NMT model: a character-level
    bidirectional LSTM encoder feeds a word-level bidirectional LSTM
    encoder, and an attentional LSTM decoder produces target symbols.
    Also compiles convenience Theano functions for prediction,
    encoding, cross-entropy and the forward pass.
    """
    super().__init__(name)
    self.config = config
    # Character embeddings; a dummy table of size 1 is created when
    # there is no sub-encoder (no character-level vocabulary).
    self.add(Embeddings(
        'source_char_embeddings',
        1 if config['source_encoder'].sub_encoder is None
        else len(config['source_encoder'].sub_encoder),
        config['source_char_embedding_dims'],
        dropout=config['char_embeddings_dropout']))
    self.add(Embeddings(
        'source_embeddings',
        len(config['source_encoder']),
        config['source_embedding_dims'],
        dropout=config['embeddings_dropout']))
    self.add(Embeddings(
        'target_embeddings',
        len(config['target_encoder']),
        config['target_embedding_dims']))
    self.add(Linear(
        'hidden',
        config['decoder_state_dims'],
        config['target_embedding_dims'],
        dropout=config['dropout'],
        layernorm=config['layernorm']))
    # Emission weights tied to the transposed target embeddings.
    self.add(Linear(
        'emission',
        config['target_embedding_dims'],
        len(config['target_encoder']),
        w=self.target_embeddings._w.T))
    # Projections from the final encoder state to the decoder's
    # initial hidden (h0) and cell (c0) states.
    self.add(Linear(
        'proj_h0',
        config['encoder_state_dims'],
        config['decoder_state_dims'],
        dropout=config['dropout'],
        layernorm=config['layernorm']))
    self.add(Linear(
        'proj_c0',
        config['encoder_state_dims'],
        config['decoder_state_dims'],
        dropout=config['dropout'],
        layernorm=config['layernorm']))
    # The total loss is
    # lambda_o*xent(target sentence) + lambda_a*xent(alignment)
    self.lambda_o = theano.shared(
        np.array(1.0, dtype=theano.config.floatX))
    self.lambda_a = theano.shared(
        np.array(config['alignment_loss'], dtype=theano.config.floatX))
    # Bidirectional character-level encoder; each direction produces
    # half of the word embedding dimensionality, and the backward pass
    # also consumes the forward pass's states.
    for prefix, backwards in (('fwd', False), ('back', True)):
        self.add(LSTMSequence(
            prefix+'_char_encoder', backwards,
            config['source_char_embedding_dims'] + (
                (config['source_embedding_dims'] // 2)
                if backwards else 0),
            config['source_embedding_dims'] // 2,
            layernorm=config['encoder_layernorm'],
            dropout=config['recurrent_dropout'],
            trainable_initial=True,
            offset=0))
    # Bidirectional word-level encoder with the same fwd/back scheme.
    for prefix, backwards in (('fwd', False), ('back', True)):
        self.add(LSTMSequence(
            prefix+'_encoder', backwards,
            config['source_embedding_dims'] + (
                config['encoder_state_dims'] if backwards else 0),
            config['encoder_state_dims'],
            layernorm=config['encoder_layernorm'],
            dropout=config['recurrent_dropout'],
            trainable_initial=True,
            offset=0))
    # Attentional decoder over the concatenated (2x) encoder states.
    self.add(LSTMSequence(
        'decoder', False,
        config['target_embedding_dims'],
        config['decoder_state_dims'],
        layernorm=config['decoder_layernorm'],
        dropout=config['recurrent_dropout'],
        attention_dims=config['attention_dims'],
        attended_dims=2*config['encoder_state_dims'],
        trainable_initial=False,
        contextgate=(config['decoder_gate'] == 'context'),
        offset=-1))
    # Compiled function: decoder hidden state -> softmax over symbols.
    h_t = T.matrix('h_t')
    self.predict_fun = function(
        [h_t],
        T.nnet.softmax(self.emission(T.tanh(self.hidden(h_t)))))
    # Symbolic model inputs (x) and outputs (y); `attention` carries
    # supervision for the alignment loss.
    inputs = T.lmatrix('inputs')
    inputs_mask = T.bmatrix('inputs_mask')
    chars = T.lmatrix('chars')
    chars_mask = T.bmatrix('chars_mask')
    outputs = T.lmatrix('outputs')
    outputs_mask = T.bmatrix('outputs_mask')
    attention = T.tensor3('attention')
    self.x = [inputs, inputs_mask, chars, chars_mask]
    self.y = [outputs, outputs_mask, attention]
    # Compiled helper functions over the symbolic graph.
    self.encode_fun = function(self.x, self.encode(*self.x))
    self.xent_fun = function(self.x+self.y, self.xent(*(self.x+self.y)))
    self.pred_fun = function(self.x+self.y[:-1],
                             self(*(self.x+self.y[:-1])))
def ns_func(self, ns):
    """Return a compiled function for non-sequence `ns`, memoized.

    The compiled Theano function is cached in `self._ns_func_cache`
    keyed by `ns`, so each non-sequence's graph is compiled at most
    once.
    """
    try:
        return self._ns_func_cache[ns]
    except KeyError:
        compiled = function([ns.variable], ns.func(ns.variable))
        self._ns_func_cache[ns] = compiled
        return compiled
def main():
    """Command-line entry point: train (or resume) an NMT model.

    Loads an existing model if `--model` exists, otherwise reads a
    `|||`-separated parallel corpus from `--corpus`, builds vocabulary
    encoders and a fresh model, then trains for up to 24 hours,
    periodically reporting test cross-entropy and sample translations,
    and finally pickles the config and model to `--model`.
    """
    import argparse
    import pickle
    import sys
    import os.path
    from time import time

    parser = argparse.ArgumentParser(
        description='Neural machine translation')
    parser.add_argument('--model', type=str, required=True,
                        help='name of the model file')
    parser.add_argument('--corpus', type=str,
                        help='name of parallel corpus file')
    args = parser.parse_args()

    if os.path.exists(args.model):
        # Resume: the pickled config precedes the saved weights in
        # the same file.
        with open(args.model, 'rb') as f:
            config = pickle.load(f)
            model = NMT('nmt', config)
            model.load(f)
    else:
        n_epochs = 1
        batch_size = 64
        test_size = batch_size
        max_length = 30
        with open(args.corpus, 'r', encoding='utf-8') as f:
            def read_pairs():
                # Yield (source, target) token-list pairs, keeping
                # only pairs where both sides have 2..max_length
                # tokens.
                for line in f:
                    fields = [s.strip() for s in line.split('|||')]
                    if len(fields) == 2:
                        pair = tuple(map(str.split, fields))
                        lens = tuple(map(len, pair))
                        if min(lens) >= 2 and max(lens) <= max_length:
                            yield pair
            src_sents, trg_sents = list(zip(*read_pairs()))
        src_encoder = TextEncoder(sequences=src_sents, max_vocab=10000)
        trg_encoder = TextEncoder(sequences=trg_sents, max_vocab=10000)
        sent_pairs = list(zip(src_sents, trg_sents))
        print('Read %d sentences, vocabulary size %d/%d' % (
            len(sent_pairs), len(src_encoder), len(trg_encoder)),
            flush=True)
        config = {
            'src_encoder': src_encoder,
            'trg_encoder': trg_encoder,
            'src_embedding_dims': 512,
            'trg_embedding_dims': 512,
            'encoder_dropout': 0.2,
            'decoder_dropout': 0.2,
            'encoder_state_dims': 1024,
            'decoder_state_dims': 1024,
            'attention_dims': 1024,
            'encoder_layernorm': 'ba1',
            'decoder_layernorm': 'ba1',
        }
        model = NMT('nmt', config)

    # NOTE(review): the flattened source does not show whether the
    # code below was nested under the `else` branch. Names such as
    # `sent_pairs`, `test_size` and `max_length` are only bound on the
    # fresh-model path, so on the resume path this placement would
    # raise NameError — confirm against the original layout.
    sym_inputs = T.lmatrix('inputs')
    sym_inputs_mask = T.bmatrix('inputs_mask')
    sym_outputs = T.lmatrix('outputs')
    sym_outputs_mask = T.bmatrix('outputs_mask')
    # Adam with gradient clipping over the full translation loss.
    optimizer = Adam(
        model.parameters(),
        model.loss(sym_inputs, sym_inputs_mask,
                   sym_outputs, sym_outputs_mask),
        [sym_inputs, sym_inputs_mask],
        [sym_outputs, sym_outputs_mask],
        grad_max_norm=5.0)
    # Compiled cross-entropy for held-out evaluation.
    xent = function(
        [sym_inputs, sym_inputs_mask, sym_outputs, sym_outputs_mask],
        model.xent(sym_inputs, sym_inputs_mask, sym_outputs,
                   sym_outputs_mask))

    # Hold out the first batch of sentence pairs for evaluation.
    test_set = sent_pairs[:test_size]
    train_set = sent_pairs[test_size:]
    test_src, test_trg = list(zip(*test_set))
    test_inputs, test_inputs_mask = src_encoder.pad_sequences(test_src)
    test_outputs, test_outputs_mask = trg_encoder.pad_sequences(test_trg)

    # Train for a fixed wall-clock budget of 24 hours.
    start_time = time()
    end_time = start_time + 24*3600
    batch_nr = 0
    while time() < end_time:
        def pair_len(pair):
            # Batch sentences by the longer side of each pair.
            return max(map(len, pair))
        for batch_pairs in iterate_batches(train_set, 64, pair_len):
            src_batch, trg_batch = list(zip(*batch_pairs))
            inputs, inputs_mask = src_encoder.pad_sequences(src_batch)
            outputs, outputs_mask = trg_encoder.pad_sequences(trg_batch)
            t0 = time()
            train_loss = optimizer.step(
                inputs, inputs_mask, outputs, outputs_mask)
            print('Train loss: %.3f (%.2f s)' % (train_loss, time()-t0),
                  flush=True)
            batch_nr += 1
            # Every 10 batches: report held-out cross-entropy.
            if batch_nr % 10 == 0:
                test_xent = xent(test_inputs, test_inputs_mask,
                                 test_outputs, test_outputs_mask)
                print('Test xent: %.3f' % test_xent, flush=True)
            # Every 100 batches: print sample translations with
            # scores; indices <= 1 presumably are padding/special
            # symbols and are skipped on the source side.
            if batch_nr % 100 == 0:
                pred, pred_mask, scores = model.search(
                    test_inputs, test_inputs_mask, max_length)
                for src_sent, sent, sent_mask, score in zip(
                        test_inputs.T, pred[-1].T, pred_mask[-1].T,
                        scores[-1].T):
                    print(' '.join(
                        src_encoder.vocab[x]
                        for x in src_sent.flatten() if x > 1))
                    print('%.2f'%score, ' '.join(
                        trg_encoder.vocab[x]
                        for x, there in zip(sent.flatten(),
                                            sent_mask.flatten())
                        if bool(there)))
                print('-'*72, flush=True)
            if time() >= end_time:
                break
    # Persist config followed by model weights in a single file (the
    # same layout the resume branch reads back).
    with open(args.model, 'wb') as f:
        pickle.dump(config, f)
        model.save(f)