Example #1
    def __init__(self, name, config):
        super().__init__(name)
        self.config = config

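        # Source and target embedding matrices, one row per vocabulary item.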
        self.param('src_embeddings',
                   (len(config['src_encoder']), config['src_embedding_dims']),
                   init_f=Gaussian(fan_in=config['src_embedding_dims']))
        self.param('trg_embeddings',
                   (len(config['trg_encoder']), config['trg_embedding_dims']),
                   init_f=Gaussian(fan_in=config['trg_embedding_dims']))
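        # Output projection: decoder state -> target embedding space -> vocabulary,
        # with the emission weights tied to the transposed target embeddings.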
        self.add(Linear('hidden',
                        config['decoder_state_dims'],
                        config['trg_embedding_dims']))
        self.add(Linear('emission',
                        config['trg_embedding_dims'],
                        len(config['trg_encoder']),
                        w=self._trg_embeddings.T))
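        # Bidirectional encoder: the backward LSTM also receives the forward
        # encoder's states as extra input (hence its larger input dimension).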
        for prefix, backwards in (('fwd', False), ('back', True)):
            self.add(Sequence(
                prefix+'_encoder', LSTM, backwards,
                config['src_embedding_dims'] + (
                    config['encoder_state_dims'] if backwards else 0),
                config['encoder_state_dims'],
                layernorm=config['encoder_layernorm'],
                dropout=config['encoder_dropout'],
                trainable_initial=True,
                offset=0))
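        # Attention-based LSTM decoder attending over the concatenated forward
        # and backward encoder states (2 * encoder_state_dims).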
        self.add(Sequence(
            'decoder', LSTM, False,
            config['trg_embedding_dims'],
            config['decoder_state_dims'],
            layernorm=config['decoder_layernorm'],
            dropout=config['decoder_dropout'],
            attention_dims=config['attention_dims'],
            attended_dims=2*config['encoder_state_dims'],
            trainable_initial=False,
            offset=-1))

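        # Compile a function mapping decoder states to output symbol probabilities.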
        h_t = T.matrix('h_t')
        self.predict_fun = function(
                [h_t],
                T.nnet.softmax(self.emission(T.tanh(self.hidden(h_t)))))

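        # Compile a function that encodes a padded and masked input batch.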
        inputs = T.lmatrix('inputs')
        inputs_mask = T.bmatrix('inputs_mask')
        self.encode_fun = function(
                [inputs, inputs_mask],
                self.encode(inputs, inputs_mask))
Example #2
    def __init__(self, name, config):
        super().__init__(name)

        self.config = config

        pprint(config)
        sys.stdout.flush()

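        # Symbol embeddings; the emission layer below ties its weights to their transpose.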
        self.add(Embeddings(
            'embeddings', config['n_symbols'], config['embedding_dims']))
        self.add(Linear(
            'hidden',
            config['state_dims'], config['embedding_dims'],
            dropout=config['dropout'],
            layernorm=config['layernorm']))
        self.add(Linear(
            'emission',
            config['embedding_dims'], config['n_symbols'],
            w=self.embeddings._w.T))
        self.add(Sequence(
            'decoder', LSTM, False,
            config['embedding_dims'], config['state_dims'],
            dropout=config['recurrent_dropout'],
            layernorm=config['recurrent_layernorm'],
            trainable_initial=True, offset=-1))

        h_t = T.matrix('h_t')
        self.predict_fun = function(
                [h_t],
                T.nnet.softmax(self.emission(T.tanh(self.hidden(h_t)))))
Example #3
    def step_fun(self):
        if self._step_fun is None:
            all_inputs = [T.matrix('inputs'), T.vector('inputs_mask')]
            all_inputs.extend((rec.variable for rec in self.recurrences
                               if not rec.init == OutputOnly))
            all_inputs.extend(
                (nonseq.variable for nonseq in self.non_sequences))
            self._step_fun = function(all_inputs,
                                      self.step(*all_inputs),
                                      name='{}_step_fun'.format(self.name))
        return self._step_fun
Example #4
    def __init__(self, name, config):
        super().__init__(name)

        self.config = config

        pprint(config)
        sys.stdout.flush()

        self.add(
            Embeddings('embeddings', config['n_symbols'],
                       config['embedding_dims']))
        self.add(
            Linear('hidden',
                   config['state_dims'],
                   config['embedding_dims'],
                   dropout=config['dropout'],
                   layernorm=config['layernorm']))
        self.add(
            Linear('emission',
                   config['embedding_dims'],
                   config['n_symbols'],
                   w=self.embeddings._w.T))
        self.add(
            Sequence('decoder',
                     LSTM,
                     False,
                     config['embedding_dims'],
                     config['state_dims'],
                     dropout=config['recurrent_dropout'],
                     layernorm=config['recurrent_layernorm'],
                     trainable_initial=True,
                     offset=-1))

        h_t = T.matrix('h_t')
        self.predict_fun = function(
            [h_t], T.nnet.softmax(self.emission(T.tanh(self.hidden(h_t)))))
Example #5
    def loss(self, inputs, outputs):
        loss = super().loss()
        return loss + ((self(inputs) - outputs)**2).mean()

    def __call__(self, inputs):
        return T.nnet.sigmoid(self.output(T.tanh(self.hidden(inputs))))


if __name__ == '__main__':
    x = np.array([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=theano.config.floatX)
    y = np.array([[0], [1], [1], [0]], dtype=theano.config.floatX)

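    # Rescale the 0/1 targets to 0.1/0.9 so the sigmoid outputs need not saturate.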
    x = x * 0.8 + 0.1
    y = y * 0.8 + 0.1

    inputs = T.matrix('inputs')
    outputs = T.matrix('outputs')
    xor = MLP('xor')
    optimizer = Adam(xor.parameters(), xor.loss(inputs, outputs), [inputs],
                     [outputs])

    for i in range(1000):
        loss = optimizer.step(x, y)
        if np.isnan(loss):
            print('NaN at iteration %d!' % (i + 1))
            break

    print('Last loss = %g. Predictions vs targets:' % loss)

    predict = function([inputs], xor(inputs), name='XOR_predict')
    print(np.hstack([predict(x), y]))
Example #6
        sym_outputs = T.lmatrix('outputs')
        sym_outputs_mask = T.bmatrix('outputs_mask')

        # Create an optimizer instance, manually specifying which
        # parameters to optimize, which loss function to use, which inputs
        # (none) and outputs are used for the model. We also specify the
        # gradient clipping threshold.
        optimizer = Adam(
                lm.parameters(),
                lm.loss(sym_outputs, sym_outputs_mask),
                [], [sym_outputs, sym_outputs_mask],
                grad_max_norm=5.0)

        # Compile a function to compute cross-entropy of a batch.
        cross_entropy = function(
                [sym_outputs, sym_outputs_mask],
                lm.cross_entropy(sym_outputs, sym_outputs_mask))

        test_set = sents[:test_size]
        train_set = sents[test_size:]

        # Get one batch of testing data, encoded as a masked matrix.
        test_outputs, test_outputs_mask = encoder.pad_sequences(
                test_set, max_length=config['max_length'])

        batch_nr = 0
        sent_nr = 0
        for i in range(n_epochs):
            for batch in iterate_batches(train_set, batch_size, len):
                outputs, outputs_mask = encoder.pad_sequences(
                        batch, max_length=config['max_length'])
Example #7
    def loss(self, inputs, outputs):
        loss = super().loss()
        return loss + ((self(inputs) - outputs) ** 2).mean()

    def __call__(self, inputs):
        return T.nnet.sigmoid(self.output(T.tanh(self.hidden(inputs))))


if __name__ == '__main__':
    x = np.array([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=theano.config.floatX)
    y = np.array([[0],    [1],    [1],    [0]],    dtype=theano.config.floatX)

    x = x*0.8 + 0.1
    y = y*0.8 + 0.1

    inputs = T.matrix('inputs')
    outputs = T.matrix('outputs')
    xor = MLP('xor')
    optimizer = Adam(xor.parameters(), xor.loss(inputs, outputs),
                     [inputs], [outputs])

    for i in range(1000):
        loss = optimizer.step(x, y)
        if np.isnan(loss):
            print('NaN at iteration %d!' % (i+1))
            break

    print('Last loss = %g. Predictions vs targets:' % loss)

    predict = function([inputs], xor(inputs), name='XOR_predict')
    print(np.hstack([predict(x), y]))

Example #8
        # Create the model.
        sym_outputs = T.lmatrix('outputs')
        sym_outputs_mask = T.bmatrix('outputs_mask')

        # Create an optimizer instance, manually specifying which
        # parameters to optimize, which loss function to use, which inputs
        # (none) and outputs are used for the model. We also specify the
        # gradient clipping threshold.
        optimizer = Adam(lm.parameters(),
                         lm.loss(sym_outputs, sym_outputs_mask), [],
                         [sym_outputs, sym_outputs_mask],
                         grad_max_norm=5.0)

        # Compile a function to compute cross-entropy of a batch.
        cross_entropy = function([sym_outputs, sym_outputs_mask],
                                 lm.cross_entropy(sym_outputs,
                                                  sym_outputs_mask))

        test_set = sents[:test_size]
        train_set = sents[test_size:]

        # Get one batch of testing data, encoded as a masked matrix.
        test_outputs, test_outputs_mask = encoder.pad_sequences(
            test_set, max_length=config['max_length'])

        batch_nr = 0
        sent_nr = 0
        for i in range(n_epochs):
            for batch in iterate_batches(train_set, batch_size, len):
                outputs, outputs_mask = encoder.pad_sequences(
                    batch, max_length=config['max_length'])
Example #9
	def __init__(self, name, config):
		super().__init__(name)
		self.config = config

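		# Character-level source embeddings (dummy size 1 when no sub-word encoder is used).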
		self.add(Embeddings(
			'source_char_embeddings',
			1 if config['source_encoder'].sub_encoder is None \
					else len(config['source_encoder'].sub_encoder),
			config['source_char_embedding_dims'],
			dropout=config['char_embeddings_dropout']))

		self.add(Embeddings(
			'source_embeddings',
			len(config['source_encoder']),
			config['source_embedding_dims'],
			dropout=config['embeddings_dropout']))

		self.add(Embeddings(
			'target_embeddings',
			len(config['target_encoder']),
			config['target_embedding_dims']))

		self.add(Linear(
			'hidden',
			config['decoder_state_dims'],
			config['target_embedding_dims'],
			dropout=config['dropout'],
			layernorm=config['layernorm']))

		self.add(Linear(
			'emission',
			config['target_embedding_dims'],
			len(config['target_encoder']),
			w=self.target_embeddings._w.T))

		self.add(Linear(
			'proj_h0',
			config['encoder_state_dims'],
			config['decoder_state_dims'],
			dropout=config['dropout'],
			layernorm=config['layernorm']))

		self.add(Linear(
			'proj_c0',
			config['encoder_state_dims'],
			config['decoder_state_dims'],
			dropout=config['dropout'],
			layernorm=config['layernorm']))

		# The total loss is
		#   lambda_o*xent(target sentence) + lambda_a*xent(alignment)
		self.lambda_o = theano.shared(
				np.array(1.0, dtype=theano.config.floatX))
		self.lambda_a = theano.shared(
				np.array(config['alignment_loss'], dtype=theano.config.floatX))
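		# Bidirectional character-level encoders composing character embeddings into word-level vectors.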
		for prefix, backwards in (('fwd', False), ('back', True)):
			self.add(LSTMSequence(
				prefix+'_char_encoder', backwards,
				config['source_char_embedding_dims'] + (
					(config['source_embedding_dims'] // 2) if backwards else 0),
				config['source_embedding_dims'] // 2,
				layernorm=config['encoder_layernorm'],
				dropout=config['recurrent_dropout'],
				trainable_initial=True,
				offset=0))
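		# Bidirectional word-level encoders over the source sentence.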
		for prefix, backwards in (('fwd', False), ('back', True)):
			self.add(LSTMSequence(
				prefix+'_encoder', backwards,
				config['source_embedding_dims'] + (
					config['encoder_state_dims'] if backwards else 0),
				config['encoder_state_dims'],
				layernorm=config['encoder_layernorm'],
				dropout=config['recurrent_dropout'],
				trainable_initial=True,
				offset=0))
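		# Attention-based LSTM decoder attending over the concatenated encoder states.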
		self.add(LSTMSequence(
			'decoder', False,
			config['target_embedding_dims'],
			config['decoder_state_dims'],
			layernorm=config['decoder_layernorm'],
			dropout=config['recurrent_dropout'],
			attention_dims=config['attention_dims'],
			attended_dims=2*config['encoder_state_dims'],
			trainable_initial=False,
			contextgate=(config['decoder_gate'] == 'context'),
			offset=-1))

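		# Compile helper functions for prediction, encoding and loss computation.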
		h_t = T.matrix('h_t')
		self.predict_fun = function(
				[h_t],
				T.nnet.softmax(self.emission(T.tanh(self.hidden(h_t)))))

		inputs = T.lmatrix('inputs')
		inputs_mask = T.bmatrix('inputs_mask')
		chars = T.lmatrix('chars')
		chars_mask = T.bmatrix('chars_mask')
		outputs = T.lmatrix('outputs')
		outputs_mask = T.bmatrix('outputs_mask')
		attention = T.tensor3('attention')

		self.x = [inputs, inputs_mask, chars, chars_mask]
		self.y = [outputs, outputs_mask, attention]

		self.encode_fun = function(self.x,
			self.encode(*self.x))
		self.xent_fun = function(self.x+self.y,
			self.xent(*(self.x+self.y)))
		self.pred_fun = function(self.x+self.y[:-1],
			self(*(self.x+self.y[:-1])))
Example #10
    def ns_func(self, ns):
        if ns not in self._ns_func_cache:
            self._ns_func_cache[ns] = function([ns.variable],
                                               ns.func(ns.variable))
        return self._ns_func_cache[ns]
Example #11
def main():
    import argparse
    import pickle
    import sys
    import os.path
    from time import time

    parser = argparse.ArgumentParser(
            description='Neural machine translation')

    parser.add_argument('--model', type=str, required=True,
            help='name of the model file')
    parser.add_argument('--corpus', type=str,
            help='name of parallel corpus file')

    args = parser.parse_args()

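    # Load a previously trained model if the file exists; otherwise train a new one.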
    if os.path.exists(args.model):
        with open(args.model, 'rb') as f:
            config = pickle.load(f)
            model = NMT('nmt', config)
            model.load(f)
    else:
        n_epochs = 1
        batch_size = 64
        test_size = batch_size
        max_length = 30

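        # Read parallel sentences ("source ||| target") and build the vocabularies.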
        with open(args.corpus, 'r', encoding='utf-8') as f:
            def read_pairs():
                for line in f:
                    fields = [s.strip() for s in line.split('|||')]
                    if len(fields) == 2:
                        pair = tuple(map(str.split, fields))
                        lens = tuple(map(len, pair))
                        if min(lens) >= 2 and max(lens) <= max_length:
                            yield pair
            src_sents, trg_sents = list(zip(*read_pairs()))
            src_encoder = TextEncoder(sequences=src_sents, max_vocab=10000)
            trg_encoder = TextEncoder(sequences=trg_sents, max_vocab=10000)
            sent_pairs = list(zip(src_sents, trg_sents))
            print('Read %d sentences, vocabulary size %d/%d' % (
                len(sent_pairs), len(src_encoder), len(trg_encoder)),
                flush=True)
            
        config = {
            'src_encoder': src_encoder,
            'trg_encoder': trg_encoder,
            'src_embedding_dims': 512,
            'trg_embedding_dims': 512,
            'encoder_dropout': 0.2,
            'decoder_dropout': 0.2,
            'encoder_state_dims': 1024,
            'decoder_state_dims': 1024,
            'attention_dims': 1024,
            'encoder_layernorm': 'ba1',
            'decoder_layernorm': 'ba1',
            }
        
        model = NMT('nmt', config)

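        # Symbolic variables for the padded input/output batches and their masks.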
        sym_inputs = T.lmatrix('inputs')
        sym_inputs_mask = T.bmatrix('inputs_mask')
        sym_outputs = T.lmatrix('outputs')
        sym_outputs_mask = T.bmatrix('outputs_mask')

        optimizer = Adam(
                model.parameters(),
                model.loss(sym_inputs, sym_inputs_mask,
                           sym_outputs, sym_outputs_mask),
                [sym_inputs, sym_inputs_mask],
                [sym_outputs, sym_outputs_mask],
                grad_max_norm=5.0)

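        # Compile a function computing cross-entropy on the held-out test batch.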
        xent = function(
                [sym_inputs, sym_inputs_mask, sym_outputs, sym_outputs_mask],
                model.xent(sym_inputs, sym_inputs_mask,
                           sym_outputs, sym_outputs_mask))

        test_set = sent_pairs[:test_size]
        train_set = sent_pairs[test_size:]

        test_src, test_trg = list(zip(*test_set))
        test_inputs, test_inputs_mask = src_encoder.pad_sequences(test_src)
        test_outputs, test_outputs_mask = trg_encoder.pad_sequences(test_trg)

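        # Train for at most 24 hours of wall-clock time.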
        start_time = time()
        end_time = start_time + 24*3600
        batch_nr = 0

        while time() < end_time:
            def pair_len(pair): return max(map(len, pair))
            for batch_pairs in iterate_batches(train_set, 64, pair_len):
                src_batch, trg_batch = list(zip(*batch_pairs))
                inputs, inputs_mask = src_encoder.pad_sequences(src_batch)
                outputs, outputs_mask = trg_encoder.pad_sequences(trg_batch)
                t0 = time()
                train_loss = optimizer.step(
                        inputs, inputs_mask, outputs, outputs_mask)
                print('Train loss: %.3f (%.2f s)' % (train_loss, time()-t0),
                      flush=True)
                batch_nr += 1
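                # Every 10 batches, report cross-entropy on the held-out batch.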
                if batch_nr % 10 == 0:
                    test_xent = xent(test_inputs, test_inputs_mask,
                                     test_outputs, test_outputs_mask)
                    print('Test xent: %.3f' % test_xent, flush=True)
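                # Every 100 batches, decode the held-out batch and print the translations.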
                if batch_nr % 100 == 0:
                    pred, pred_mask, scores = model.search(
                            test_inputs, test_inputs_mask, max_length)
                    for src_sent, sent, sent_mask, score in zip(
                            test_inputs.T,
                            pred[-1].T, pred_mask[-1].T, scores[-1].T):
                        print(' '.join(
                            src_encoder.vocab[x] for x in src_sent.flatten()
                            if x > 1))
                        print('%.2f'%score, ' '.join(
                            trg_encoder.vocab[x] for x, there
                            in zip(sent.flatten(), sent_mask.flatten())
                            if bool(there)))
                        print('-'*72, flush=True)

                if time() >= end_time: break

        with open(args.model, 'wb') as f:
            pickle.dump(config, f)
            model.save(f)