Example #1
class Model(base.Model):
    # overwrite config
    name = 'char2word_to_char/europarl-ende'
    # datasets
    train_x_files = ['data/train/europarl-v7.de-en.en']
    train_t_files = ['data/train/europarl-v7.de-en.de']
    valid_x_files = ['data/valid/devtest2006.en']
    valid_t_files = ['data/valid/devtest2006.de']
    test_x_files = ['data/valid/test2007.en']
    test_t_files = ['data/valid/test2007.de']

    # settings that are local to the model
    alphabet_src = Alphabet('data/alphabet/dict_europarl.de-en.en', eos='*')
    alphabet_tar = Alphabet('data/alphabet/dict_europarl.de-en.de', eos='*', sos='')
Example #2
    def __init__(self):
        self.word_alphabet = Alphabet('word')
        self.char_alphabet = Alphabet('char')
        self.label_alphabet = Alphabet('label', is_label=True)

        self.number_normalized = True
        self.norm_word_emb = False
        self.norm_char_emb = False

        self.pretrain_word_embedding = None
        self.pretrain_char_embedding = None

        self.max_char_length = 0

        self.word_num = 0
        self.char_num = 0
        self.label_num = 0
Example #3
    def __init__(self):
        self.word_alphabet = Alphabet('word')
        self.category_alphabet = Alphabet('category', is_category=True)
        self.label_alphabet = Alphabet('label', is_label=True)
        self.char_alphabet = Alphabet('char')
        self.pos_alphabet = Alphabet('pos')
        self.parent_alphabet = Alphabet('parent')
        self.rel_alphabet = Alphabet('rel')

        self.number_normalized = True
        self.norm_word_emb = False
        self.norm_char_emb = False

        self.pretrain_word_embedding = None
        self.pretrain_char_embedding = None

        self.max_char_length = 0

        self.word_num = 0
        self.char_num = 0
        self.label_num = 0
        self.category_num = 0
        self.parent_num = 0
        self.pos_num = 0
        self.rel_num = 0
Example #4
    def __init__(self):
        self.word_alphabet = Alphabet('word')
        self.char_alphabet = Alphabet('char')
        self.bichar_alphabet = Alphabet('bichar')

        self.pos_alphabet = Alphabet('pos', is_label=True)
        self.char_type_alphabet = Alphabet('type', is_label=True)

        self.extchar_alphabet = Alphabet('extchar')
        self.extbichar_alphabet = Alphabet('extbichar')

        self.segpos_alphabet = Alphabet('segpos', is_label=True)
        self.wordlen_alphabet = Alphabet('wordlen')

        self.number_normalized = False
        self.norm_word_emb = False
        self.norm_char_emb = False
        self.norm_bichar_emb = False

        self.pretrain_word_embedding = None
        self.pretrain_char_embedding = None
        self.pretrain_bichar_embedding = None

        self.wordPadID = 0
        self.charPadID = 0
        self.bicharPadID = 0
        self.charTypePadID = 0
        self.wordlenPadID = 0
        self.posPadID = 0
        self.appID = 0
        self.actionPadID = 0

        self.word_num = 0
        self.char_num = 0
        self.pos_num = 0
        self.bichar_num = 0
        self.segpos_num = 0
        self.wordlen_num = 0
        self.char_type_num = 0

        self.extchar_num = 0
        self.extbichar_num = 0

        self.wordlen_max = 7
Example #5
class Model(base.Model):
    # overwrite config
    name = 'char2word_to_char/wmt-deen'
    train_x_files = [
        'data/train/europarl-v7.de-en.de.tok',
        'data/train/commoncrawl.de-en.de.tok',
        'data/train/news-commentary-v10.de-en.de.tok'
    ]
    train_t_files = [
        'data/train/europarl-v7.de-en.en.tok',
        'data/train/commoncrawl.de-en.en.tok',
        'data/train/news-commentary-v10.de-en.en.tok'
    ]
    valid_x_files = ['data/valid/newstest2013.de.tok']
    valid_t_files = ['data/valid/newstest2013.en.tok']
    test_x_files = ['data/valid/newstest2014.deen.de.tok']
    test_t_files = ['data/valid/newstest2014.deen.en.tok']

    # settings that are local to the model
    alphabet_src = Alphabet('data/alphabet/dict_wmt_tok.de-en.de', eos='*')
    alphabet_tar = Alphabet('data/alphabet/dict_wmt_tok.de-en.en',
                            eos='*',
                            sos='')
Example #6
        """Add Start Of Sequence character to an array of sequences."""
        sos_col = np.ones([self.latest_batch_size, 1]) * alphabet.sos_id
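        # shift the targets one step right: prepend a column of SOS ids and
        # drop the last column so the sequence length stays the same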
        return np.concatenate([sos_col, array[:, :-1]], 1)


if __name__ == '__main__':
    from data.alphabet import Alphabet
    SEQ_LEN_X = 250
    SEQ_LEN_T = 500
    BATCH_SIZE = 76800

    text_loader = TextLoader(
        ['data/train/europarl-v7.de-en.en'],
        ['data/train/europarl-v7.de-en.de'], SEQ_LEN_X, SEQ_LEN_T)

    alphabet_src = Alphabet('data/alphabet/dict_wmt_tok.de-en.en', eos='*')
    alphabet_tar = Alphabet('data/alphabet/dict_wmt_tok.de-en.de', eos='*', sos='')

    text_batch_gen = TextBatchGenerator(text_loader,
                                        BATCH_SIZE,
                                        alphabet_src,
                                        alphabet_tar,
                                        use_dynamic_array_sizes=True)

    print("running warmup for 20 iterations, and 180 iterations with bucket")
    line = ""
    for i, batch in enumerate(text_batch_gen.gen_batch(variable_bucket_schedule)):
        print(batch["x_encoded"].shape, batch["t_encoded"].shape)
        if i == 200:
            break
Example #7
class Model:
    # settings that affect train.py
    batch_size_train = 100000
    batch_size_valid = 128
    seq_len_x = 50
    seq_len_t = 50
    name = None  # (string) For saving logs and checkpoints. (None to disable.)
    visualize_freq = 10000  # Visualize training X, y, and t. (0 to disable.)
    log_freq = 100  # How often to print updates during training.
    save_freq = 1000  # How often to save checkpoints. (0 to disable.)
    valid_freq = 500  # How often to validate.
    iterations = 5 * 32000  # How many iterations to train for before stopping.
    train_feedback = False  # Enable feedback during training?
    tb_log_freq = 500  # How often to save logs for TensorBoard
    max_to_keep = 100

    # datasets
    #train_x_files = ['data/train/europarl-v7.de-en.en']
    #train_t_files = ['data/train/europarl-v7.de-en.de']
    train_x_files = [
        'data/train/europarl-v7.de-en.en.tok',
        'data/train/commoncrawl.de-en.en.tok',
        'data/train/news-commentary-v10.de-en.en.tok'
    ]
    train_t_files = [
        'data/train/europarl-v7.de-en.de.tok',
        'data/train/commoncrawl.de-en.de.tok',
        'data/train/news-commentary-v10.de-en.de.tok'
    ]
    #valid_x_files = ['data/valid/devtest2006.en', 'data/valid/test2006.en',
    #                 'data/valid/test2007.en', 'data/valid/test2008.en']
    #valid_t_files = ['data/valid/devtest2006.de', 'data/valid/test2006.de',
    #                 'data/valid/test2007.de', 'data/valid/test2008.de']
    valid_x_files = ['data/valid/newstest2013.en.tok']
    valid_t_files = ['data/valid/newstest2013.de.tok']
    test_x_files = ['data/valid/newstest2014.deen.en.tok']
    test_t_files = ['data/valid/newstest2014.deen.de.tok']

    # settings that are local to the model
    alphabet_src_size = 310  # size of the source alphabet
    alphabet_tar_size = 310  # size of the target alphabet
    alphabet_src = Alphabet('data/alphabet/dict_wmt_tok.de-en.en', eos='*')
    alphabet_tar = Alphabet('data/alphabet/dict_wmt_tok.de-en.de',
                            eos='*',
                            sos='')
    char_encoder_units = 400  # number of units in character-level encoder
    word_encoder_units = 400  # number of units in the word-level encoders (forward and backward)
    attn_units = 300  # num units used for attention in the decoder.
    embedd_dims = 256  # size of character embeddings
    learning_rate = 0.001
    reg_scale = 0.000001
    clip_norm = 1

    swap_schedule = {0: 0.0}

    # kwargs for scheduling function
    schedule_kwargs = {'fuzzyness': 3}

    def __init__(self):
        self.max_x_seq_len = self.seq_len_x
        self.max_t_seq_len = self.seq_len_t

        # TF placeholders
        self.setup_placeholders()

        # schedule functions
        self.train_schedule_function = tl.variable_bucket_schedule
        self.valid_schedule_function = None  # falls back to frostings.default_schedule
        self.test_schedule_function = None

        print("Model instantiation")
        self.build()
        self.loss, self.accuracy = self.build_loss(self.out, self.out_tensor)
        self.valid_loss, self.valid_accuracy = self.build_valid_loss()
        self.ys = self.build_prediction(self.out_tensor)
        self.valid_ys = self.build_valid_prediction()
        self.build_training()

        # Create TensorBoard scalar summaries
        tf.scalar_summary('train/loss', self.loss)
        tf.scalar_summary('train/accuracy', self.accuracy)

        # setup batch generators
        self.setup_batch_generators()

    def setup_placeholders(self):
        shape = [None, None]
        self.Xs = tf.placeholder(tf.int32, shape=shape, name='X_input')
        self.ts = tf.placeholder(tf.int32, shape=shape, name='t_input')
        self.ts_go = tf.placeholder(tf.int32, shape=shape, name='t_input_go')
        self.X_len = tf.placeholder(tf.int32, shape=[None], name='X_len')
        self.t_len = tf.placeholder(tf.int32, shape=[None], name='t_len')
        self.feedback = tf.placeholder(tf.bool, name='feedback_indicator')
        self.x_mask = tf.placeholder(tf.float32, shape=shape, name='x_mask')
        self.t_mask = tf.placeholder(tf.float32, shape=shape, name='t_mask')

        shape = [None, None]
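        # X_spaces holds the positions of the spaces (word boundaries) in each
        # input sequence; build() gathers character-encoder outputs at these
        # positions to form word-level representations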
        self.X_spaces = tf.placeholder(tf.int32, shape=shape, name='X_spaces')
        self.X_spaces_len = tf.placeholder(tf.int32,
                                           shape=[None],
                                           name='X_spaces_len')

    def build(self):
        print('Building model')
        self.x_embeddings = tf.Variable(tf.random_normal(
            [self.alphabet_src_size, self.embedd_dims], stddev=0.1),
                                        name='x_embeddings')
        self.t_embeddings = tf.Variable(tf.random_normal(
            [self.alphabet_tar_size, self.embedd_dims], stddev=0.1),
                                        name='t_embeddings')

        X_embedded = tf.gather(self.x_embeddings, self.Xs, name='embed_X')
        t_embedded = tf.gather(self.t_embeddings, self.ts_go, name='embed_t')

        with tf.variable_scope('dense_out'):
            W_out = tf.get_variable(
                'W_out', [self.word_encoder_units * 2, self.alphabet_tar_size])
            b_out = tf.get_variable('b_out', [self.alphabet_tar_size])

        # forward encoding
        char_enc_state, char_enc_out = encoder(X_embedded, self.X_len,
                                               'char_encoder',
                                               self.char_encoder_units)
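        # _grid_gather (defined elsewhere in this repo) picks the character-encoder
        # outputs at the positions given by X_spaces, producing one vector per word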
        char2word = _grid_gather(char_enc_out, self.X_spaces)
        char2word.set_shape([None, None, self.char_encoder_units])
        word_enc_state, word_enc_out = encoder(char2word, self.X_spaces_len,
                                               'word_encoder',
                                               self.word_encoder_units)

        # backward encoding words
        char2word = tf.reverse_sequence(char2word,
                                        tf.to_int64(self.X_spaces_len), 1)
        char2word.set_shape([None, None, self.char_encoder_units])
        word_enc_state_bck, word_enc_out_bck = encoder(
            char2word, self.X_spaces_len, 'word_encoder_backwards',
            self.word_encoder_units)
        word_enc_out_bck = tf.reverse_sequence(word_enc_out_bck,
                                               tf.to_int64(self.X_spaces_len),
                                               1)

        word_enc_state = tf.concat(1, [word_enc_state, word_enc_state_bck])
        word_enc_out = tf.concat(2, [word_enc_out, word_enc_out_bck])
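        # word_enc_state/word_enc_out now hold the concatenated forward and
        # backward word-level encodings, i.e. a bidirectional representation
        # that is fed to the attention decoder below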

        # decoding
        dec_state, dec_out, valid_dec_out, valid_attention_tracker = (
            attention_decoder(word_enc_out, self.X_spaces_len, word_enc_state,
                              t_embedded, self.t_len, self.attn_units,
                              self.t_embeddings, W_out, b_out))

        out_tensor = tf.reshape(dec_out, [-1, self.word_encoder_units * 2])
        out_tensor = tf.matmul(out_tensor, W_out) + b_out
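        # build the dynamic output shape [batch_size, target_seq_len, alphabet_tar_size]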
        out_shape = tf.concat(0, [
            tf.expand_dims(tf.shape(self.X_len)[0], 0),
            tf.expand_dims(tf.shape(t_embedded)[1], 0),
            tf.expand_dims(tf.constant(self.alphabet_tar_size), 0)
        ])
        self.valid_attention_tracker = valid_attention_tracker.pack()
        self.out_tensor = tf.reshape(out_tensor, out_shape)
        self.out_tensor.set_shape([None, None, self.alphabet_tar_size])

        valid_out_tensor = tf.reshape(valid_dec_out,
                                      [-1, self.word_encoder_units * 2])
        valid_out_tensor = tf.matmul(valid_out_tensor, W_out) + b_out
        self.valid_out_tensor = tf.reshape(valid_out_tensor, out_shape)

        self.out = None

        # add TensorBoard summaries for all variables
        tf.contrib.layers.summarize_variables()

    def build_loss(self, out, out_tensor):
        """Build a loss function and accuracy for the model."""
        print('  Building loss and accuracy')

        with tf.variable_scope('accuracy'):
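            # t_mask zeroes out padding positions so they do not count
            # towards the accuracy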
            argmax = tf.to_int32(tf.argmax(out_tensor, 2))
            correct = tf.to_float(tf.equal(argmax, self.ts)) * self.t_mask
            accuracy = tf.reduce_sum(correct) / tf.reduce_sum(self.t_mask)

        with tf.variable_scope('loss'):
            loss = sequence_loss_tensor(out_tensor, self.ts, self.t_mask,
                                        self.alphabet_tar_size)

            with tf.variable_scope('regularization'):
                regularize = tf.contrib.layers.l2_regularizer(self.reg_scale)
                params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
                reg_term = sum([regularize(param) for param in params])

            loss += reg_term

        return loss, accuracy

    def build_valid_loss(self):
        return self.build_loss(self.out, self.valid_out_tensor)

    def build_prediction(self, out_tensor):
        print('  Building prediction')
        with tf.variable_scope('prediction'):
            # out_tensor has shape [batch_size, target_seq_len, alphabet_size];
            # take the argmax over the alphabet dimension to get predicted ids.
            return tf.argmax(out_tensor, dimension=2)

    def build_valid_prediction(self):
        return self.build_prediction(self.valid_out_tensor)

    def build_training(self):
        print('  Building training')
        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        optimizer = tf.train.AdamOptimizer(self.learning_rate)

        # Do gradient clipping
        # NOTE: this is the correct, but slower, clipping by global norm.
        # It may be worth trying the faster tf.clip_by_norm() instead.
        # (See the documentation for tf.clip_by_global_norm() for more info.)
        grads_and_vars = optimizer.compute_gradients(self.loss)
        gradients, variables = zip(*grads_and_vars)  # unzip list of tuples
        clipped_gradients, global_norm = (tf.clip_by_global_norm(
            gradients, self.clip_norm))
        clipped_grads_and_vars = zip(clipped_gradients, variables)

        # Create TensorBoard scalar summary for global gradient norm
        tf.scalar_summary('train/global gradient norm', global_norm)

        # Create TensorBoard summaries for gradients
        # for grad, var in grads_and_vars:
        #     # Sparse tensor updates can't be summarized, so avoid doing that:
        #     if isinstance(grad, tf.Tensor):
        #         tf.histogram_summary('grad_' + var.name, grad)

        # make training op for applying the gradients
        self.train_op = optimizer.apply_gradients(clipped_grads_and_vars,
                                                  global_step=self.global_step)

    def setup_batch_generators(self):
        """Load the datasets"""
        self.batch_generator = dict()

        # load training set
        print('Load training set')
        train_loader = tl.TextLoader(paths_X=self.train_x_files,
                                     paths_t=self.train_t_files,
                                     seq_len_x=self.seq_len_x,
                                     seq_len_t=self.seq_len_t)
        self.batch_generator['train'] = tl.TextBatchGenerator(
            loader=train_loader,
            batch_size=self.batch_size_train,
            alphabet_src=self.alphabet_src,
            alphabet_tar=self.alphabet_tar,
            use_dynamic_array_sizes=True,
            **self.schedule_kwargs)

        # load validation set
        print('Load validation set')
        valid_loader = tl.TextLoader(paths_X=self.valid_x_files,
                                     paths_t=self.valid_t_files,
                                     seq_len_x=self.seq_len_x,
                                     seq_len_t=self.seq_len_t)
        self.batch_generator['valid'] = tl.TextBatchGenerator(
            loader=valid_loader,
            batch_size=self.batch_size_valid,
            alphabet_src=self.alphabet_src,
            alphabet_tar=self.alphabet_tar,
            use_dynamic_array_sizes=True)

        # load test set
        print('Load test set')
        test_loader = tl.TextLoader(paths_X=self.test_x_files,
                                    paths_t=self.test_t_files,
                                    seq_len_x=self.seq_len_x,
                                    seq_len_t=self.seq_len_t)
        self.batch_generator['test'] = tl.TextBatchGenerator(
            loader=test_loader,
            batch_size=self.batch_size_valid,
            alphabet_src=self.alphabet_src,
            alphabet_tar=self.alphabet_tar,
            use_dynamic_array_sizes=True)

    def valid_dict(self, batch, feedback=True):
        """ Return feed_dict for validation """
        return {
            self.Xs: batch['x_encoded'],
            self.ts: batch['t_encoded'],
            self.ts_go: batch['t_encoded_go'],
            self.X_len: batch['x_len'],
            self.t_len: batch['t_len'],
            self.x_mask: batch['x_mask'],
            self.t_mask: batch['t_mask'],
            self.feedback: feedback,
            self.X_spaces: batch['x_spaces'],
            self.X_spaces_len: batch['x_spaces_len']
        }

    def train_dict(self, batch):
        """ Return feed_dict for training.
        Reuse validation feed_dict because the only difference is feedback.
        """
        return self.valid_dict(batch, feedback=False)

    def build_feed_dict(self, batch, validate=False):
        return self.valid_dict(batch) if validate else self.train_dict(batch)

    def get_generator(self, split):
        assert split in ['train', 'valid', 'test']
        return self.batch_generator[split].gen_batch

    def next_train_feed(self):
        generator = self.get_generator('train')
        for t_batch in generator(self.train_schedule_function):
            extra = {'t_len': t_batch['t_len']}
            yield (self.build_feed_dict(t_batch), extra)

    def next_valid_feed(self):
        generator = self.get_generator('valid')
        for v_batch in generator(self.valid_schedule_function):
            yield self.build_feed_dict(v_batch, validate=True)

    def next_test_feed(self):
        generator = self.get_generator('test')
        for p_batch in generator(self.test_schedule_function):
            yield self.build_feed_dict(p_batch, validate=True)

    def get_alphabet_src(self):
        return self.batch_generator['train'].alphabet_src

    def get_alphabet_tar(self):
        return self.batch_generator['train'].alphabet_tar