def main(_):
    print("Loading hyperparameters..")
    params = util.load_params(FLAGS.params_file)

    print("Building model..")
    model_dir = FLAGS.model_dir
    if FLAGS.clean_model_dir:
        util.clean_model_dir(model_dir)
    if FLAGS.model_cls == "transformer":
        model_cls = TransformerEstimator
    elif FLAGS.model_cls == "seq2seq":
        model_cls = Seq2SeqEstimator
    else:
        raise ValueError("Model class not supported.")
    model = model_cls(model_dir, params)

    print("Getting sources..")
    fields = {"train/inputs": "int", "train/targets": "int"}
    train_source = DataSource(FLAGS.train_file, fields)
    test_source = DataSource(FLAGS.test_file, fields)

    field_map = {"inputs": "train/inputs", "targets": "train/targets"}
    train_input_fn = train_source.get_input_fn(
        "train_in", field_map, None, FLAGS.batch_size)
    test_input_fn = test_source.get_input_fn(
        "test_in", field_map, 1, FLAGS.batch_size)

    print("Processing model..")
    model.train(train_input_fn, steps=FLAGS.train_batches)
    model.evaluate(test_input_fn)

    if FLAGS.interactive:
        print("Interactive decoding...")
        vocab = Vocabulary(fname=params["vocab_file"])
        decoding.cmd_decode(model, vocab)
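
# The FLAGS referenced in main() above are assumed to be defined at module
# level; a minimal sketch with tf.flags (the flag names appear in the code
# above, the defaults here are hypothetical):
import tensorflow as tf

FLAGS = tf.flags.FLAGS
tf.flags.DEFINE_string("params_file", "", "Path to the .params file.")
tf.flags.DEFINE_string("model_dir", "/tmp/icecaps_model", "Checkpoint directory.")
tf.flags.DEFINE_string("model_cls", "seq2seq", "Either 'transformer' or 'seq2seq'.")
tf.flags.DEFINE_string("train_file", "", "Training TFRecord path.")
tf.flags.DEFINE_string("test_file", "", "Test TFRecord path.")
tf.flags.DEFINE_integer("batch_size", 32, "Batch size.")
tf.flags.DEFINE_integer("train_batches", 1000, "Number of training steps.")
tf.flags.DEFINE_boolean("clean_model_dir", False, "Wipe model_dir before training.")
tf.flags.DEFINE_boolean("interactive", False, "Decode interactively after training.")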
 def write_to_tfrecord(self, out_file, pipeline=None, max_lines=None):
     print("Writing to TFRecord..")
     writer = tf.python_io.TFRecordWriter(out_file)
     line_ctr = 0
     for row in self.row_gen():
         if not self.process_row(pipeline, row):
             continue
         feature = dict()
         for i in range(len(row)):
             key_ = self.headers[i].name
             type_ = self.headers[i].data_type
             vocab_ = self.headers[i].vocab_file
             mode_ = self.headers[i].vocab_mode
             if type_ == "text":
                 if vocab_ not in self.vocabs:
                     if mode_ != "write":
                         self.vocabs[vocab_] = Vocabulary(fname=vocab_)
                     else:
                         self.vocabs[vocab_] = Vocabulary()
                 row[i] = self.vocabs[vocab_].tokenize(
                     row[i], fixed_vocab=(mode_ == "read"))
                 feature[key_] = self.int64_feature(row[i])
             elif type_ == "int":
                 feature[key_] = self.int64_feature([int(row[i])])
             elif type_ == "float":
                 feature[key_] = self.float_feature([float(row[i])])
             else:
                 raise ValueError("Header type " + str(type_) +
                                  " not supported.")
         example = tf.train.Example(features=tf.train.Features(
             feature=feature))
         writer.write(example.SerializeToString())
         line_ctr = self.print_lines_processed(line_ctr)
         if max_lines is not None and line_ctr >= max_lines:
             break
     writer.close()
 def build_vocab_files(self, count_cutoff=0):
     print("Building vocabularies..")
     read_only = True
     self.vocabs = dict()
     for i in range(len(self.headers)):
         vocab_ = self.headers[i].vocab_file
         mode = self.headers[i].vocab_mode
         if ((vocab_ is not None) and (vocab_ not in self.vocabs)
                 and (mode != "read")):
             read_only = False
             if mode == "write":
                 self.vocabs[vocab_] = Vocabulary()
             elif mode == "append":
                 self.vocabs[vocab_] = Vocabulary(fname=vocab_)
             else:
                 raise ValueError("Vocab mode " + str(mode) +
                                  " not supported.")
         elif vocab_ is not None and mode == "read":
             self.vocabs[vocab_] = Vocabulary(fname=vocab_)
     if read_only:
         return
     line_ctr = 0
     for row in self.row_gen():
         for i in range(len(row)):
             vocab_ = self.headers[i].vocab_file
             if vocab_ in self.vocabs:
                 self.vocabs[vocab_].tokenize(row[i], fixed_vocab=False)
         line_ctr = self.print_lines_processed(line_ctr)
     for vocab_ in self.vocabs:
         if count_cutoff >= 0:
             self.vocabs[vocab_].count_cutoff(count_cutoff)
         with open(vocab_, "w", encoding="utf8") as vocab_f:
             for word in self.vocabs[vocab_].words:
                 vocab_f.write(word + "\n")
     for i in range(len(self.headers)):
         self.headers[i].vocab_mode = "read"
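      # Hedged usage sketch (the owning dataset class and its constructor are
      # assumed; the method names are as defined above):
      #     dataset = SomeTextDataSet(...)  # hypothetical constructor
      #     dataset.build_vocab_files(count_cutoff=2)
      #     dataset.write_to_tfrecord("train.tfrecord", max_lines=100000)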
def main(_):
    print("Loading hyperparameters..")
    params = util.load_params(FLAGS.params_file)

    print("Building model..")
    validation_config = tf.estimator.RunConfig(
        save_checkpoints_steps=100,
        keep_checkpoint_max=None,
    )
    model_dir = FLAGS.model_dir
    if FLAGS.clean_model_dir:
        util.clean_model_dir(model_dir)
    if FLAGS.model_cls == "transformer":
        model_cls = TransformerEstimator
    elif FLAGS.model_cls == "seq2seq":
        model_cls = Seq2SeqEstimator
    else:
        raise ValueError("Model class not supported.")
    model = model_cls(model_dir, params, config=validation_config)

    print("Getting sources..")
    fields = {"train/inputs": "int", "train/targets": "int"}
    train_source = DataSource(FLAGS.train_file, fields)
    test_source = DataSource(FLAGS.test_file, fields)

    field_map = {"inputs": "train/inputs", "targets": "train/targets"}
    train_input_fn = train_source.get_input_fn("train_in", field_map, None,
                                               FLAGS.batch_size)
    test_input_fn = test_source.get_input_fn("test_in", field_map, 1,
                                             FLAGS.batch_size)

    print("Processing model..")
    model.train(train_input_fn, steps=FLAGS.train_batches)
    model.choose_best_checkpoint(test_input_fn)
    model.evaluate(test_input_fn)

    if FLAGS.interaction != "off":
        print("Interactive decoding...")
        vocab = Vocabulary(fname=params["vocab_file"])
        if FLAGS.interaction == "cmd":
            decoding.cmd_decode(model, vocab, persona=True)
        elif FLAGS.interaction == "gui":
            decoding.gui_decode(model, vocab)
 def write_to_tfrecord(self,
                       out_file,
                       pipeline=None,
                       max_lines=None,
                       line_gen=None,
                       line_shard_len=None,
                       streamline=True,
                       traversal="depth_first",
                       max_pos_len=32):
     print("Writing to TFRecord..")
     writer = tf.python_io.TFRecordWriter(out_file)
     line_ctr = 0
     if line_gen is None:
         line_gen = self.row_gen()
     for row in line_gen:
         if not self.process_row(row, pipeline):
             continue
         feature = {}
         for i in range(len(row)):
             key_ = self.headers[i].name
             type_ = self.headers[i].data_type
             vocab_ = self.headers[i].vocab_file
             mode_ = self.headers[i].vocab_mode
             if type_ == "text" or type_ == "tree":
                 if vocab_ not in self.vocabs:
                     if mode_ != "write":
                         self.vocabs[vocab_] = Vocabulary(fname=vocab_)
                     else:
                         self.vocabs[vocab_] = Vocabulary()
                 if type_ == "text":
                     row[i] = self.vocabs[vocab_].tokenize(
                         row[i], fixed_vocab=(mode_ == "read"))
                     feature[key_] = self.int64_feature(row[i])
                 else:
                     tree_ints = []
                     tree_pos = []
                     for node in row[i].choose_traversal(traversal):
                         if streamline:
                             # Drop _NULL nodes that carry no structural
                             # information.
                             if node.value == "_NULL" and (
                                     not node.parent or
                                     node.parent.children[0].value == "_NULL"):
                                 continue
                             node.value = str(node.value)
                             # Tag internal nodes whose first child is _NULL:
                             # "_0" if both children are _NULL, "_1" if only
                             # the first is.
                             if (not node.is_leaf() and
                                     node.children[0].value == "_NULL"):
                                 if node.children[1].value == "_NULL":
                                     node.value = str(node.value) + "_0"
                                 else:
                                     node.value = str(node.value) + "_1"
                             # With a fixed vocabulary, map OOV values to the
                             # matching _UNK variant.
                             if (mode_ == "read" and node.value not in
                                     self.vocabs[vocab_].word2idx):
                                 if (len(node.value) > 2
                                         and node.value[-2:] == "_0"):
                                     node.value = "_UNK_0"
                                 elif (len(node.value) > 2
                                       and node.value[-2:] == "_1"):
                                     node.value = "_UNK_1"
                                 else:
                                     node.value = "_UNK"
                         tree_ints.append(self.vocabs[vocab_].get_token_id(
                             node.value, mode_ == "read"))
                         tree_pos += node.get_padded_positional_encoding(
                             max_pos_len)
                     field = self.headers[i].name
                     feature[field] = self.int64_feature(tree_ints)
                     feature[field + "_pos"] = self.float_feature(tree_pos)
             elif type_ == "int":
                 feature[key_] = self.int64_feature([int(row[i])])
             elif type_ == "float":
                 feature[key_] = self.float_feature([float(row[i])])
             else:
                 raise ValueError("Header type " + str(type_) +
                                  " not supported.")
         example = tf.train.Example(features=tf.train.Features(
             feature=feature))
         writer.write(example.SerializeToString())
         line_ctr = self.print_lines_processed(line_ctr, "trees")
         if max_lines is not None and line_ctr >= max_lines:
             break
     writer.close()
 def apply_byte_pair_encodings(self, out_file, max_lines=None):
     self.build_vocab_files()
     print("Applying byte pair encodings..")
     all_bpe_vocabs = dict()
     word_encodings = dict()
     for vocab_ in self.vocabs:
         all_bpe_vocabs[vocab_] = Vocabulary(fname=vocab_)
         word_encodings[vocab_] = dict()
     length_headers = OrderedDict()
     for i in range(len(self.headers)):
         if self.headers[i].vocab_file is not None:
             length_headers[self.headers[i].name] = DataHeader(
                 self.headers[i].name + "/_length", "int")
     for header_name in length_headers:
         self.headers.append(length_headers[header_name])
     with open(out_file, "w", encoding="utf8") as out_f:
         line_ctr = 0
         for row in self.row_gen():
             row_extension = []
             for i in range(len(row)):
                 vocab_ = self.headers[i].vocab_file
                 if vocab_ is not None:
                     row_extension.append(len(row[i].strip().split()))
                     new_elem = ""
                     for word in row[i].strip().split():
                         if word in word_encodings[vocab_]:
                             encoding = word_encodings[vocab_][word]
                         else:
                             encoding = list(word) + ["</EOW>"]
                             bigrams = dict()
                             for j in range(len(encoding) - 1):
                                 bigram = encoding[j] + encoding[j + 1]
                                 if bigram in all_bpe_vocabs[
                                         vocab_].word2idx:
                                     bigrams[j] = all_bpe_vocabs[
                                         vocab_].word2idx[bigram]
                             # Greedily merge the known bigram with the lowest
                             # vocabulary index until no known bigram remains.
                             while len(bigrams) > 0:
                                 bigrams_argmin = None
                                 for idx in bigrams:
                                     if (bigrams_argmin is None
                                             or bigrams[idx] <
                                             bigrams[bigrams_argmin]):
                                         bigrams_argmin = idx
                                 encoding = (
                                     encoding[:bigrams_argmin] +
                                     [encoding[bigrams_argmin] +
                                      encoding[bigrams_argmin + 1]] +
                                     encoding[bigrams_argmin + 2:])
                                 # Recompute candidate bigrams after the merge.
                                 bigrams = dict()
                                 for j in range(len(encoding) - 1):
                                     bigram = encoding[j] + encoding[j + 1]
                                     if bigram in all_bpe_vocabs[
                                             vocab_].word2idx:
                                         bigrams[j] = all_bpe_vocabs[
                                             vocab_].word2idx[bigram]
                             word_encodings[vocab_][word] = encoding
                         for subword in encoding:
                             new_elem += subword + " "
                     row[i] = new_elem
             row += row_extension
             out_f.write(self.concatenate_segments(row))
             line_ctr = self.print_lines_processed(line_ctr)
             if max_lines is not None and line_ctr >= max_lines:
                 break
     self.in_files = [out_file]
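      # Worked example (hypothetical BPE vocabulary): if the vocabulary
      # contains "lo", "low", and "low</EOW>", the word "low" is encoded as
      # ['l', 'o', 'w', '</EOW>'] -> ['lo', 'w', '</EOW>'] ->
      # ['low', '</EOW>'] -> ['low</EOW>'], merging the known bigram with the
      # lowest vocabulary index at each step.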
def main(_):
    print("Loading parameters..")
    params = util.load_params(FLAGS.params_file)

    print("Building model..")
    model_dir = FLAGS.model_dir
    if FLAGS.clean_model_dir:
        util.clean_model_dir(model_dir)
    first_model = PersonaSeq2SeqEstimator(model_dir, params, scope="first")
    second_model_encoder = Seq2SeqEncoderEstimator(model_dir,
                                                   params,
                                                   scope="second_encoder")
    second_model = EstimatorChain([second_model_encoder, first_model.decoder],
                                  model_dir,
                                  params,
                                  scope="second")
    mmi_model = PersonaSeq2SeqEstimator(model_dir,
                                        params,
                                        scope="mmi",
                                        is_mmi_model=True)
    model_group = EstimatorGroup([first_model, second_model, mmi_model],
                                 model_dir,
                                 params,
                                 scope="group")

    print("Getting sources..")
    fields = {
        "train/inputs": "int",
        "train/targets": "int",
        "train/speakers": "int"
    }
    train_source = DataSource(FLAGS.train_file, fields)
    autoenc_source = DataSource(FLAGS.autoenc_file, fields)
    test_source = DataSource(FLAGS.test_file, fields)

    train_field_map = {
        "inputs": "train/inputs",
        "targets": "train/targets",
        "speaker_ids": "train/speakers"
    }
    autoenc_field_map = {
        "inputs": "train/inputs",
        "targets": "train/inputs",
        "speaker_ids": "train/speakers"
    }
    mmi_field_map = {
        "inputs": "train/targets",
        "targets": "train/inputs",
        "speaker_ids": "train/speakers"
    }

    paired_input_fn = train_source.get_input_fn("paired_in", train_field_map,
                                                None, FLAGS.batch_size)
    autoenc_input_fn = train_source.get_input_fn("autoenc_in",
                                                 autoenc_field_map, None,
                                                 FLAGS.batch_size)
    mmi_input_fn = train_source.get_input_fn("mmi_in", mmi_field_map, None,
                                             FLAGS.batch_size)
    train_input_fn = DataSource.group_input_fns(
        ["first", "second", "mmi"],
        [paired_input_fn, autoenc_input_fn, mmi_input_fn])
    test_input_fn = test_source.get_input_fn("test_in", train_field_map, 1,
                                             FLAGS.batch_size)

    print("Processing models..")
    print("Pretraining primary model..")
    model_group.train(train_input_fn,
                      first_model,
                      steps=FLAGS.pretrain_batches)
    print("Multitask training..")
    model_group.train(train_input_fn, {
        "first": 1,
        "second": 1,
        "mmi": 0
    },
                      steps=FLAGS.train_batches)
    print("Training MMI model..")
    model_group.train(train_input_fn, mmi_model, steps=FLAGS.mmi_batches)
    print("Evaluating..")
    model_group.evaluate(test_input_fn, first_model)

    if FLAGS.interactive:
        print("Interactive decoding...")
        vocab = Vocabulary(fname=params["vocab_file"])
        decoding.cmd_decode(first_model,
                            vocab,
                            persona=True,
                            mmi_component=mmi_model)
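
# Imports assumed by the transformer estimator class below (a sketch; the
# library-internal import paths for AbstractIcecapsEstimator and Vocabulary
# are omitted):
import math

import numpy as np
import tensorflow as tf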
class AbstractTransformerEstimator(AbstractIcecapsEstimator):
    @classmethod
    def construct_expected_params(cls):
        expected_params = super().construct_expected_params()
        expected_params["vocab_file"] = cls.make_param(
            "icecaps/examples/dummy_data/vocab.dic")
        expected_params["vocab_size"] = cls.make_param(0)
        expected_params["depth"] = cls.make_param(1)
        expected_params["num_heads"] = cls.make_param(8)
        expected_params["d_model"] = cls.make_param(32)
        expected_params["d_pos"] = cls.make_param(32)
        expected_params["d_ff"] = cls.make_param(64)
        expected_params["max_length"] = cls.make_param(10)
        expected_params["min_wavelength"] = cls.make_param(1.0)
        expected_params["max_wavelength"] = cls.make_param(1000.0)
        expected_params["warmup_steps"] = cls.make_param(4000.0)
        expected_params["fixed_learning_rate"] = cls.make_param(False)
        expected_params["learn_wavelengths"] = cls.make_param(False)
        expected_params["modality"] = cls.make_param("seq")
        expected_params["tree_depth"] = cls.make_param(256)
        expected_params["tree_width"] = cls.make_param(2)
        expected_params["learn_positional_embeddings"] = cls.make_param(False)
        return expected_params

    def extract_args(self, features, mode, params):
        super().extract_args(features, mode, params)
        self.d_k = self.hparams.d_model // self.hparams.num_heads
        # When left at 0, fall back to d_model-based defaults (assumed intent;
        # matches the d_pos = d_model and d_ff = 2 * d_model ratios of the
        # parameter defaults).
        self.d_pos = self.hparams.d_model if self.hparams.d_pos == 0 else self.hparams.d_pos
        self.d_ff = 2 * self.hparams.d_model if self.hparams.d_ff == 0 else self.hparams.d_ff
        if self.hparams.vocab_size > 0:
            self.vocab = Vocabulary(size=self.hparams.vocab_size)
        else:
            self.vocab = Vocabulary(fname=self.hparams.vocab_file)
        if not self.hparams.fixed_learning_rate:
            self.train_step = tf.get_variable(
                'train_step',
                shape=[],
                dtype=tf.float32,
                initializer=tf.zeros_initializer(),
                trainable=False)
            self.learning_rate = (  # LR schedule from "Attention Is All You Need"
                tf.sqrt(1.0 / self.hparams.d_model) * tf.minimum(
                    self.train_step * tf.pow(self.hparams.warmup_steps, -1.5),
                    tf.pow(self.train_step, -0.5)))
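        # With the defaults from construct_expected_params (d_model=32,
        # warmup_steps=4000), this schedule peaks at 1/sqrt(32 * 4000) ≈
        # 2.8e-3 at step 4000 and then decays in proportion to step**-0.5.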

    def build_embeddings(self):
        # Precompute sinusoidal positional encodings for up to 2048 positions,
        # with wavelengths log-spaced between min_wavelength and
        # max_wavelength.
        position = tf.expand_dims(
            tf.cast(tf.range(0, 2048), dtype=tf.float32), 1)
        if self.hparams.learn_wavelengths:
            wavelength_logs = tf.get_variable("wavelength_logs",
                                              [self.d_pos // 2], tf.float32)
        else:
            wavelength_logs = tf.linspace(
                math.log(self.hparams.min_wavelength),
                math.log(self.hparams.max_wavelength), self.d_pos // 2)
        div_term = tf.expand_dims(tf.exp(-wavelength_logs), 0)
        outer_product = tf.matmul(position, div_term)
        cosines = tf.cos(outer_product)
        sines = tf.sin(outer_product)
        self.positional_embeddings = tf.concat([cosines, sines], -1)
        if self.hparams.learn_positional_embeddings:
            self.positional_embeddings = tf.get_variable(
                name='positional_embeddings',
                shape=[self.hparams.max_length, self.hparams.d_model
                       ]) * np.sqrt(float(self.hparams.d_model))
        self.token_embeddings = tf.get_variable(
            name='token_embeddings',
            shape=[self.vocab.size(), self.hparams.d_model]) * np.sqrt(
                float(self.hparams.d_model))
        if self.hparams.modality == "tree":
            self.d_tree_param = self.d_pos // (self.hparams.tree_depth *
                                               self.hparams.tree_width)
            self.tree_params = tf.tanh(
                tf.get_variable("tree_params", [self.d_tree_param]))
            self.tiled_tree_params = tf.tile(
                tf.reshape(self.tree_params, [1, 1, -1]),
                [self.hparams.tree_depth, self.hparams.tree_width, 1])
            self.tiled_depths = tf.tile(
                tf.reshape(tf.range(self.hparams.tree_depth, dtype=tf.float32),
                           [-1, 1, 1]),
                [1, self.hparams.tree_width, self.d_tree_param])
            self.tree_norm = tf.sqrt(
                (1 - tf.square(self.tree_params)) * self.hparams.d_model / 2)
            self.tree_weights = tf.reshape(
                tf.pow(self.tiled_tree_params, self.tiled_depths) *
                self.tree_norm, [
                    self.hparams.tree_depth * self.hparams.tree_width,
                    self.d_tree_param
                ])

    def treeify_positions(self, positions):
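        # Expands flattened tree positions of shape
        # [..., tree_depth * tree_width] to [..., d_pos] by scaling each
        # position slot with its learned decay weights (d_tree_param features
        # per slot, as set up in build_embeddings).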
        treeified = tf.expand_dims(positions, -1) * self.tree_weights
        shape = tf.shape(treeified)
        shape = tf.concat([shape[:-2], [self.d_pos]], -1)
        treeified = tf.reshape(treeified, shape)
        return treeified

    def init_inputs(self):
        self.inputs_sparse = tf.cast(self.features["inputs"], tf.int32)
        self.mask = tf.cast(
            tf.not_equal(self.inputs_sparse, self.vocab.end_token_id),
            tf.float32)
        self.inputs_length = tf.cast(tf.count_nonzero(self.mask, -1), tf.int32)
        self.inputs_max_length = tf.reduce_max(self.inputs_length)
        self.batch_size = tf.shape(self.inputs_sparse)[0]
        self.inputs_sparse = tf.slice(
            self.inputs_sparse, [0, 0],
            [self.batch_size, self.inputs_max_length])
        self.mask = tf.slice(self.mask, [0, 0],
                             [self.batch_size, self.inputs_max_length])
        self.inputs_dense = tf.nn.embedding_lookup(
            params=self.token_embeddings, ids=self.inputs_sparse)
        if self.hparams.modality == "seq":
            self.positions = tf.slice(self.positional_embeddings, [0, 0],
                                      [self.inputs_max_length, self.d_pos])
        elif self.hparams.modality == "tree":
            self.positions = tf.reshape(self.features["inputs_positions"], [
                self.batch_size, self.inputs_max_length,
                self.hparams.tree_depth * self.hparams.tree_width
            ])
            self.positions = self.treeify_positions(self.positions)
        else:
            raise ValueError("This input modality is not supported.")
        if self.d_pos != self.hparams.d_model:
            self.positions = tf.layers.dense(self.positions,
                                             self.hparams.d_model)
        self.inputs_dense = self.inputs_dense + self.positions
        self.inputs_dense = tf.nn.dropout(self.inputs_dense, self.keep_prob)
        self.inputs_dense = tf.transpose(
            tf.transpose(self.inputs_dense) * tf.transpose(self.mask))

    def build_layer_norm(self, x):
        return tf.contrib.layers.layer_norm(x, begin_norm_axis=-1)

    def build_sublayer_fn(self, x, f):
        x = self.build_layer_norm(x)
        x = x + tf.nn.dropout(f(x), self.keep_prob)
        return x

    def attention(self, query, key, value, d_k, enc_mask=None, dec_mask=None):
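        # Scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V. Masked
        # positions are pushed toward -inf via the -1e24 terms before the
        # softmax, so they receive effectively zero attention weight.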
        scores = tf.matmul(query, tf.transpose(key,
                                               [0, 1, 3, 2])) / math.sqrt(d_k)
        if enc_mask is not None:
            scores = tf.transpose(
                scores, [1, 2, 0, 3]) * enc_mask - 1e24 * (1.0 - enc_mask)
            scores = tf.transpose(scores, [2, 0, 1, 3])
        if dec_mask is not None:
            scores = scores * dec_mask - 1e24 * (1.0 - dec_mask)
        p_attn = tf.nn.softmax(scores)
        p_attn = tf.nn.dropout(p_attn, keep_prob=self.keep_prob)
        attended_values = tf.matmul(p_attn, value)
        return attended_values, p_attn

    def mha_fn(self, query, key, value, batch_size, enc_mask_, dec_mask_):
        with tf.variable_scope("mha", reuse=tf.AUTO_REUSE) as scope:
            query = tf.transpose(
                tf.reshape(
                    tf.layers.dense(query, self.hparams.d_model,
                                    use_bias=True),
                    [batch_size, -1, self.hparams.num_heads, self.d_k]),
                [0, 2, 1, 3])
            key = tf.transpose(
                tf.reshape(
                    tf.layers.dense(key, self.hparams.d_model, use_bias=True),
                    [batch_size, -1, self.hparams.num_heads, self.d_k]),
                [0, 2, 1, 3])
            value = tf.transpose(
                tf.reshape(
                    tf.layers.dense(value, self.hparams.d_model,
                                    use_bias=True),
                    [batch_size, -1, self.hparams.num_heads, self.d_k]),
                [0, 2, 1, 3])
            attended, _ = self.attention(query, key, value, self.d_k,
                                         enc_mask_, dec_mask_)
            attended = tf.reshape(tf.transpose(attended, [0, 2, 1, 3]),
                                  [batch_size, -1, self.hparams.d_model])
            return attended

    def build_mha_sublayer(self,
                           x,
                           m,
                           batch_size,
                           enc_mask=None,
                           dec_mask=None):
        with tf.variable_scope("attn", reuse=tf.AUTO_REUSE) as scope:
            return self.build_sublayer_fn(
                x, lambda q: tf.layers.dense(
                    self.mha_fn(q, m, m, batch_size, enc_mask, dec_mask), self.
                    hparams.d_model))

    def build_ffn_sublayer(self, x, d_ff):
        with tf.variable_scope("ffn", reuse=tf.AUTO_REUSE) as scope:

            def ffn_fn(q):
                return tf.layers.dense(tf.layers.dense(q, d_ff, tf.nn.relu),
                                       self.hparams.d_model)

            return self.build_sublayer_fn(x, ffn_fn)

    def build_optimizer(self, trainable_params=None):
        super().build_optimizer(trainable_params)
        self.step_update_op = tf.assign_add(self.train_step, 1.0)
        with tf.control_dependencies([self.step_update_op]):
            self.train_op = tf.group([self.step_update_op, self.train_op])
 def __init__(self, fname, fields, vocab=None):
     self.fname = fname
     self.parse_fields(fields)
     self.input_fns = dict()
     self.vocab = vocab if vocab is not None else Vocabulary()
class AbstractRecurrentEstimator(AbstractIcecapsEstimator):
    @classmethod
    def construct_expected_params(cls):
        expected_params = super().construct_expected_params()
        expected_params["max_length"] = cls.make_param(50)
        expected_params["cell_type"] = cls.make_param('gru')
        expected_params["hidden_units"] = cls.make_param(32)
        expected_params["depth"] = cls.make_param(1)
        expected_params["token_embed_dim"] = cls.make_param(16)
        expected_params["tie_token_embeddings"] = cls.make_param(True)
        expected_params["beam_width"] = cls.make_param(8)
        expected_params["vocab_file"] = cls.make_param(
            "./dummy_data/vocab.dic")
        expected_params["vocab_size"] = cls.make_param(0)
        expected_params["skip_tokens"] = cls.make_param('')
        expected_params["skip_tokens_start"] = cls.make_param('')
        return expected_params

    def extract_args(self, features, mode, params):
        super().extract_args(features, mode, params)
        if self.hparams.vocab_size > 0:
            self.vocab = Vocabulary(size=self.hparams.vocab_size)
        else:
            self.vocab = Vocabulary(
                fname=self.hparams.vocab_file,
                skip_tokens=self.hparams.skip_tokens,
                skip_tokens_start=self.hparams.skip_tokens_start)

    def build_cell(self, name=None):
        if self.hparams.cell_type == 'linear':
            cell = BasicRNNCell(self.hparams.hidden_units,
                                activation=tf.identity,
                                name=name)
        elif self.hparams.cell_type == 'tanh':
            cell = BasicRNNCell(self.hparams.hidden_units,
                                activation=tf.tanh,
                                name=name)
        elif self.hparams.cell_type == 'relu':
            cell = BasicRNNCell(self.hparams.hidden_units,
                                activation=tf.nn.relu,
                                name=name)
        elif self.hparams.cell_type == 'gru':
            cell = GRUCell(self.hparams.hidden_units, name=name)
        elif self.hparams.cell_type == 'lstm':
            cell = LSTMCell(self.hparams.hidden_units, name=name)
        else:
            raise ValueError('Provided cell type not supported.')
        return cell

    def build_deep_cell(self,
                        cell_list=None,
                        name=None,
                        return_raw_list=False):
        if name is None:
            name = "cell"
        if cell_list is None:
            cell_list = []
            for i in range(self.hparams.depth):
                cell = self.build_cell(name=name + "_" + str(i))
                cell = DropoutWrapper(cell, output_keep_prob=self.keep_prob)
                cell_list.append(cell)
        if return_raw_list:
            return cell_list
        if len(cell_list) == 1:
            return cell_list[0]
        return MultiRNNCell(cell_list)
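    # E.g. with depth=2 and cell_type='gru', build_deep_cell() returns a
    # MultiRNNCell over two DropoutWrapper(GRUCell) layers; with depth=1 it
    # returns the single wrapped cell directly.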

    def build_rnn(self, input_key="inputs"):
        with tf.variable_scope('rnn'):
            self.cell = self.build_deep_cell()
            self.build_inputs(input_key)
            self.outputs, self.last_state = tf.nn.dynamic_rnn(
                cell=self.cell,
                inputs=self.inputs_dense,
                sequence_length=self.inputs_length,
                time_major=False,
                dtype=tf.float32
            )  # [batch_size, max_time_step, cell_output_size], [batch_size, cell_output_size]

    def build_embeddings(self):
        if "token_embeddings" in self.features and self.hparams.tie_token_embeddings:
            self.token_embeddings = self.features["token_embeddings"]
        else:
            self.token_embeddings = tf.get_variable(
                name='token_embeddings',
                shape=[self.vocab.size(), self.hparams.token_embed_dim])
            if self.hparams.token_embed_dim != self.hparams.hidden_units:
                projection = tf.get_variable(name='token_embed_proj',
                                             shape=[
                                                 self.hparams.token_embed_dim,
                                                 self.hparams.hidden_units
                                             ])
                self.token_embeddings = self.token_embeddings @ projection

    def embed_sparse_to_dense(self, sparse):
        with tf.variable_scope('embed_sparse_to_dense', reuse=tf.AUTO_REUSE):
            dense = tf.nn.embedding_lookup(self.token_embeddings, sparse)
        return dense

    def build_inputs(self, input_key):
        self.build_embeddings()
        self.inputs_sparse_untrimmed = tf.cast(self.features[input_key],
                                               tf.int32)
        self.inputs_length = tf.cast(
            tf.count_nonzero(
                self.inputs_sparse_untrimmed - self.vocab.end_token_id, -1),
            tf.int32)
        self.inputs_max_length = tf.reduce_max(self.inputs_length)
        self.inputs_sparse = tf.slice(self.inputs_sparse_untrimmed, [0, 0],
                                      [-1, self.inputs_max_length])
        self.inputs_dense = self.embed_sparse_to_dense(self.inputs_sparse)
        self.batch_size = tf.shape(self.inputs_sparse)[0]

    def build_loss(self):
        with tf.name_scope('build_loss'):
            self.loss = seq2seq.sequence_loss(
                logits=self.logits,
                targets=self.targets_sparse,
                weights=self.target_mask,
                average_across_timesteps=True,
                average_across_batch=True,
            )
        self.reported_loss = tf.identity(self.loss, 'reported_loss')
def main(_):
    '''
    This is a simple example of how to build an Icecaps training script, and is essentially
    the "Hello World" of Icecaps. Icecaps training scripts follow a basic five-phase pattern
    that we describe here. We train a basic model on the paired data stored in
    dummy_data/paired_personalized.tfrecord. For information on how to build TFRecords
    from text data files, please see data_processing_example.py.
    '''

    print("Loading hyperparameters..")
    # The first phase is to load hyperparameters from a .params file. These files follow a
    # simple colon-delimited format (e.g. see dummy_params/simple_example_seq2seq.params).
    params = util.load_params(FLAGS.params_file)
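    # A minimal .params file in this format might look like (hypothetical
    # values; the hyperparameter names appear in this library's estimators):
    #     use_default_params: True
    #     hidden_units: 128
    #     vocab_file: dummy_data/vocab.dic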

    print("Building model..")
    # Second, we build our architecture based on our loaded hyperparameters. Our architecture
    # here is very basic: we use a simple LSTM-based seq2seq model. For information on more
    # complex architectures, see train_persona_mmi_example.py.
    model_dir = FLAGS.model_dir
    if FLAGS.clean_model_dir:
        util.clean_model_dir(model_dir)
    model_cls = Seq2SeqEstimator

    # Every estimator expects a different set of hyperparameters. If you set use_default_params
    # to True in your .params file, the estimator will employ default values for any unspecified
    # hyperparameters. To view the list of hyperparameters with default values, you can run the
    # class method list_params(). E.g. you can open a Python session and run
    # Seq2SeqEstimator.list_params() to view what hyperparameters our seq2seq estimator expects.
    model = model_cls(model_dir, params)

    print("Getting sources..")
    # Third, we set up our data sources. DataSource objects allow you to build input_fns that
    # efficiently feed data into the training pipeline from TFRecord files. In our simple example,
    # we set up two data sources: one for training and one for testing.

    # TFRecords are created with named variables per data point. You must create a fields dictionary
    # to tell the DataSource which variables to load and what their types are.
    fields = {"train/inputs": "int", "train/targets": "int"}
    train_source = DataSource(FLAGS.train_file, fields)
    test_source = DataSource(FLAGS.test_file, fields)

    # Then, you must create a field_map dictionary to tell your estimator how to map the TFRecord's
    # variable names to the names expected by the estimator. While this may seem like unnecessary
    # overhead in this simple example, it provides useful flexibility in more complex scenarios.
    field_map = {"inputs": "train/inputs", "targets": "train/targets"}

    # Finally, build input_fns from your DataSources.
    train_input_fn = train_source.get_input_fn(
        "train_in", field_map, None,
        FLAGS.batch_size)  # None lets our input_fn run for an unbounded
    # number of epochs.
    test_input_fn = test_source.get_input_fn(
        "test_in", field_map, 1,
        FLAGS.batch_size)  # For testing, we only want to run the input_fn
    # for one epoch instead.

    print("Processing model..")
    # Fourth, we pipe our input_fns through our model for training and evaluation.
    model.train(train_input_fn, steps=FLAGS.train_batches)
    model.evaluate(test_input_fn)

    if FLAGS.interactive:
        print("Interactive decoding...")
        # Fifth, you may optionally set up an interactive session to test your system by directly
        # engaging with it.
        vocab = Vocabulary(fname=params["vocab_file"])
        decoding.cmd_decode(model, vocab)
class RNNEstimator(AbstractRecurrentEstimator):
    def _model_fn(self, features, mode, params):
        with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE):
            self.extract_args(features, mode, params)
            self.init_inputs()
            self.build_cell()
            self.build_obj()
            if mode == tf.estimator.ModeKeys.PREDICT:
                self.build_rt_decoder()
                self.predictions = {
                    "inputs": self.features["inputs"],
                    "outputs": self.hypotheses,
                    "scores": self.scores
                }
                if "metadata" in self.features:
                    self.predictions["metadata"] = self.features["metadata"]
                return tf.estimator.EstimatorSpec(mode,
                                                  predictions=self.predictions)
            self.init_targets()
            self.build_loss()
            if mode == tf.estimator.ModeKeys.TRAIN:
                self.build_optimizer()
                for var in tf.trainable_variables():
                    # Add histograms for trainable variables
                    tf.summary.histogram(var.op.name, var)
                return tf.estimator.EstimatorSpec(mode,
                                                  loss=self.reported_loss,
                                                  train_op=self.train_op)
            if mode == tf.estimator.ModeKeys.EVAL:
                print("Number of parameters: " +
                      str(self.get_num_model_params()))
                self.eval_metric_ops = dict()
                return tf.estimator.EstimatorSpec(
                    mode,
                    loss=self.reported_loss,
                    eval_metric_ops=self.eval_metric_ops)

    @classmethod
    def construct_expected_params(cls):
        expected_params = super().construct_expected_params()
        expected_params["src_vocab_file"] = cls.make_param("")
        expected_params["tgt_vocab_file"] = cls.make_param("")
        expected_params["src_vocab_size"] = cls.make_param(0)
        expected_params["tgt_vocab_size"] = cls.make_param(0)
        return expected_params

    def extract_args(self, features, mode, params):
        super().extract_args(features, mode, params)
        if (self.hparams.src_vocab_size == 0
                and self.hparams.tgt_vocab_size == 0
                and self.hparams.src_vocab_file == ""
                and self.hparams.tgt_vocab_file == ""):
            self.src_vocab = self.vocab
            self.tgt_vocab = self.vocab
        else:
            if self.hparams.src_vocab_size > 0:
                self.src_vocab = Vocabulary(size=self.hparams.src_vocab_size)
            else:
                self.src_vocab = Vocabulary(fname=self.hparams.src_vocab_file)
            if self.hparams.tgt_vocab_size > 0:
                self.tgt_vocab = Vocabulary(size=self.hparams.tgt_vocab_size)
            else:
                self.tgt_vocab = Vocabulary(fname=self.hparams.tgt_vocab_file)

    def init_inputs(self):
        with tf.name_scope('init_encoder'):
            inputs = tf.cast(self.features["inputs"], tf.int32)
            self.batch_size = tf.shape(inputs)[0]
            inputs_length = tf.cast(
                tf.count_nonzero(inputs - self.vocab.end_token_id, -1),
                tf.int32)
            inputs_max_length = tf.reduce_max(inputs_length)
            end_token = tf.ones(shape=[
                self.batch_size, self.hparams.max_length - inputs_max_length
            ],
                                dtype=tf.int32) * self.vocab.end_token_id
            # [batch_size, max_time_steps + 1]
            self.inputs_sparse = tf.concat([inputs, end_token], axis=1)

    def init_targets(self):
        with tf.name_scope('init_decoder'):
            targets = tf.cast(self.features["targets"], tf.int32)
            targets_length = tf.cast(
                tf.count_nonzero(targets - self.vocab.end_token_id, -1),
                tf.int32)
            targets_max_length = tf.reduce_max(targets_length)
            end_token = tf.ones(shape=[
                self.batch_size, self.hparams.max_length - targets_max_length
            ],
                                dtype=tf.int32) * self.vocab.end_token_id
            # [batch_size, max_time_steps + 1]
            self.targets_sparse = tf.concat([targets, end_token], axis=1)
            self.targets_length = targets_length + 1
            self.target_mask = tf.sequence_mask(lengths=self.targets_length,
                                                maxlen=self.hparams.max_length,
                                                dtype=tf.float32)

    def build_cell(self):
        sequence_length = tf.ones([self.batch_size],
                                  dtype=tf.int32) * self.hparams.max_length
        super().build_cell(sequence_length, self.src_vocab.size())

    def build_obj(self):
        output_layer = Dense(self.tgt_vocab.size(), name='output_projection')
        self.logits = output_layer(self.outputs)

    def build_rt_decoder(self):
        with tf.name_scope('predict_decoder'):
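            # Greedy decoding: the hypothesis is the per-step argmax token,
            # scored by summing the per-step maximum log-softmax values.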
            self.hypotheses = tf.argmax(self.logits, -1)
            self.scores = tf.reduce_sum(
                tf.reduce_max(tf.nn.log_softmax(self.logits), -1), -1)
def main(_):
    '''
    This is a more complex example in which we build an Icecaps script involving
    component chaining and multi-task learning. We recommend you start with
    train_simple_example.py. In this example, we build a personalized conversation system
    that combines paired and unpaired data, and applies MMI during decoding.
    '''

    print("Loading parameters..")
    # When multiple estimators are involved, you can specify which hyperparameters in your
    # params file belong to which estimator using scoping. See dummy_params/persona_mmi_example.params
    # for an example. If no scope is specified, the hyperparameter is provided to all
    # models in your architecture.
    params = util.load_params(FLAGS.params_file)
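    # A scoped params file might contain lines like the following (the scope
    # syntax here is illustrative; see the referenced example file for the
    # actual format):
    #     first/hidden_units: 128
    #     mmi/hidden_units: 64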

    print("Building model..")
    model_dir = FLAGS.model_dir
    if FLAGS.clean_model_dir:
        util.clean_model_dir(model_dir)

    # For this system, we will need to build three different estimators.
    # The first estimator is a personalized seq2seq estimator that will be responsible for
    # learning the conversational model.
    first_model = PersonaSeq2SeqEstimator(model_dir, params, scope="first")

    # The second estimator is a personalized seq2seq estimator that shares its decoder with
    # the first model. This model will learn an autoencoder on an unpaired personalized
    # data set. The purpose of this configuration is to influence the first model with
    # stylistic information from the unpaired dataset.

    # To construct this second estimator, we first build a seq2seq encoder separate from
    # the first model. Then, we use an EstimatorChain to chain that encoder to the first
    # model's decoder, allowing the two models to share that decoder.
    second_model_encoder = Seq2SeqEncoderEstimator(model_dir,
                                                   params,
                                                   scope="second_encoder")
    second_model = EstimatorChain([second_model_encoder, first_model.decoder],
                                  model_dir,
                                  params,
                                  scope="second")

    # The third estimator is used for MMI decoding. This model will learn the inverse
    # function of the first model. During decoding, this estimator will be used to rerank
    # hypotheses generated by the first model during beam search decoding. While this
    # won't have much of an effect on our toy data sets, the purpose of this model in
    # real-world settings is to penalize generic responses applicable to many contexts
    # such as "I don't know."
    mmi_model = PersonaSeq2SeqEstimator(model_dir,
                                        params,
                                        scope="mmi",
                                        is_mmi_model=True)
    model_group = EstimatorGroup([first_model, second_model, mmi_model],
                                 model_dir,
                                 params,
                                 scope="group")

    print("Getting sources..")
    # We will use two DataSources for training and one for testing.
    fields = {
        "train/inputs": "int",
        "train/targets": "int",
        "train/speakers": "int"
    }
    paired_source = DataSource(FLAGS.paired_file, fields)
    unpaired_source = DataSource(FLAGS.unpaired_file, fields)
    test_source = DataSource(FLAGS.test_file, fields)

    # We construct three field maps.
    # The paired field map is similar to the field map shown in train_simple_example.py.
    # The unpaired field map maps train/inputs to both the estimator's inputs and targets,
    # in order to train an autoencoder.
    # The mmi field map maps train/inputs to targets and train/targets to inputs, in
    # order to learn the inverse of the first estimator.
    paired_field_map = {
        "inputs": "train/inputs",
        "targets": "train/targets",
        "speaker_ids": "train/speakers"
    }
    unpaired_field_map = {
        "inputs": "train/inputs",
        "targets": "train/inputs",
        "speaker_ids": "train/speakers"
    }
    mmi_field_map = {
        "inputs": "train/targets",
        "targets": "train/inputs",
        "speaker_ids": "train/speakers"
    }

    paired_input_fn = paired_source.get_input_fn("paired_in", paired_field_map,
                                                 None, FLAGS.batch_size)
    unpaired_input_fn = unpaired_source.get_input_fn("unpaired_in",
                                                     unpaired_field_map, None,
                                                     FLAGS.batch_size)
    mmi_input_fn = paired_source.get_input_fn("mmi_in", mmi_field_map, None,
                                              FLAGS.batch_size)
    # For multi-task learning, you will need to group your input_fns together with group_input_fns().
    train_input_fn = DataSource.group_input_fns(
        ["first", "second", "mmi"],
        [paired_input_fn, unpaired_input_fn, mmi_input_fn])
    test_input_fn = test_source.get_input_fn("test_in", paired_field_map, 1,
                                             FLAGS.batch_size)

    print("Processing models..")
    # Icecaps supports flexible multi-task training pipelines. You can set up multiple phases
    # where each phase trains your architecture with different weights across your objectives.
    # In this example, we will first pre-train the first model by itself, then jointly train
    # the first and second models, then finally train the MMI model by itself.
    print("Pretraining primary model..")
    model_group.train(train_input_fn,
                      first_model,
                      steps=FLAGS.pretrain_batches)
    print("Multitask training..")
    model_group.train(train_input_fn, {
        "first": 1,
        "second": 1,
        "mmi": 0
    },
                      steps=FLAGS.train_batches)
    print("Training MMI model..")
    model_group.train(train_input_fn, mmi_model, steps=FLAGS.mmi_batches)
    print("Evaluating..")
    model_group.evaluate(test_input_fn, first_model)

    if FLAGS.interactive:
        print("Interactive decoding...")
        vocab = Vocabulary(fname=params["vocab_file"])
        # To decode with MMI, you can pass in your MMI model to cmd_decode().
        # lambda_balance represents how the first model and MMI model's scores are weighted during decoding.
        decoding.cmd_decode(first_model,
                            vocab,
                            persona=True,
                            mmi_component=mmi_model,
                            lambda_balance=FLAGS.lambda_balance)
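
# Example invocation of this script (hypothetical paths and flag values):
#     python train_persona_mmi_example.py \
#         --params_file=dummy_params/persona_mmi_example.params \
#         --paired_file=paired.tfrecord --unpaired_file=unpaired.tfrecord \
#         --test_file=test.tfrecord --batch_size=32 --interactive=True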