def __init__(self,
             pretrained_model_name: Optional[str] = None,
             cache_dir: Optional[str] = None,
             hparams=None):
    super().__init__(hparams=hparams)

    self.load_pretrained_config(pretrained_model_name, cache_dir)

    # Word embedding
    self.word_embedder = WordEmbedder(
        vocab_size=self._hparams.vocab_size,
        hparams=self._hparams.embed)

    # Segment embedding for each type of tokens
    self.segment_embedder = WordEmbedder(
        vocab_size=self._hparams.type_vocab_size,
        hparams=self._hparams.segment_embed)

    # Position embedding
    self.position_embedder = PositionEmbedder(
        position_size=self._hparams.position_size,
        hparams=self._hparams.position_embed)

    # The BERT encoder (a TransformerEncoder)
    self.encoder = TransformerEncoder(hparams=self._hparams.encoder)

    self.pooler = nn.Sequential(
        nn.Linear(self._hparams.hidden_size, self._hparams.hidden_size),
        nn.Tanh())

    self.init_pretrained_weights()
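The constructor above only builds the sub-modules. A minimal sketch of how they typically combine in a BERT-style forward pass follows; the variable names, the position-id construction, and the call pattern on `model` are illustrative assumptions, not the module's actual `forward` implementation.

import torch

batch, seq_len = 2, 8                                   # illustrative sizes
input_ids = torch.randint(30522, (batch, seq_len))      # token ids
segment_ids = torch.zeros(batch, seq_len, dtype=torch.long)
pos_ids = torch.arange(seq_len).unsqueeze(0).expand(batch, -1)

# With a constructed `model`, the three embeddings are summed before encoding:
# embeds = (model.word_embedder(input_ids)
#           + model.segment_embedder(segment_ids)
#           + model.position_embedder(pos_ids))
# output = model.encoder(embeds, sequence_length=...)
# pooled = model.pooler(output[:, 0])   # pool the first ([CLS]) position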
def test_infer_helpers(self):
    """Tests inference helpers.
    """
    def _test_fn(helper):
        _, next_inputs, _ = helper.next_inputs(
            time=1,
            outputs=tf.ones([self._batch_size, self._vocab_size]),  # Not used
            state=None,  # Not used
            sample_ids=tf.ones([self._batch_size], dtype=tf.int32))
        self.assertEqual(helper.sample_ids_shape, tf.TensorShape([]))
        self.assertEqual(next_inputs.get_shape(),
                         tf.TensorShape([self._batch_size, self._emb_dim]))

        # Test in an RNN decoder
        output_layer = tf.layers.Dense(self._vocab_size)
        decoder = BasicRNNDecoder(vocab_size=self._vocab_size,
                                  output_layer=output_layer)
        outputs, final_state, sequence_lengths = decoder(
            helper=helper, max_decoding_length=self._max_seq_length)

        cell_dim = decoder.hparams.rnn_cell.kwargs.num_units
        with self.test_session() as sess:
            sess.run(tf.global_variables_initializer())
            outputs_, final_state_, sequence_lengths_ = sess.run(
                [outputs, final_state, sequence_lengths])
            max_length = max(sequence_lengths_)
            self.assertEqual(
                outputs_.logits.shape,
                (self._batch_size, max_length, self._vocab_size))
            self.assertEqual(outputs_.sample_id.shape,
                             (self._batch_size, max_length))
            self.assertEqual(final_state_[0].shape,
                             (self._batch_size, cell_dim))

    # case-(1): a raw embedding tensor
    helper = GreedyEmbeddingHelper(
        self._embedding, self._start_tokens, self._end_token)
    _test_fn(helper)

    # case-(2): a WordEmbedder instance
    embedder = WordEmbedder(self._embedding)
    helper = GreedyEmbeddingHelper(
        embedder, self._start_tokens, self._end_token)
    _test_fn(helper)

    # case-(3): a callable combining word and position embeddings
    word_embedder = WordEmbedder(self._embedding)
    pos_embedder = PositionEmbedder(position_size=self._max_seq_length)

    def _emb_fn(ids, times):
        return word_embedder(ids) + pos_embedder(times)

    helper = GreedyEmbeddingHelper(
        _emb_fn, self._start_tokens, self._end_token)
    _test_fn(helper)
def test_embedder_multi_calls(self):
    """Tests embedders called multiple times.
    """
    hparams = {
        "dim": 1024,
        "dropout_rate": 0.3,
        "dropout_strategy": "item"
    }
    embedder = WordEmbedder(vocab_size=100, hparams=hparams)
    inputs = torch.ones([64, 16], dtype=torch.int32)
    outputs = embedder(inputs)

    emb_dim = embedder.dim
    if not isinstance(emb_dim, (list, tuple)):
        emb_dim = [emb_dim]
    self.assertEqual(list(outputs.shape), [64, 16] + emb_dim)

    # Call with inputs in a different shape
    inputs = torch.ones([64, 10, 20], dtype=torch.int32)
    outputs = embedder(inputs)
    self.assertEqual(list(outputs.shape), [64, 10, 20] + emb_dim)
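The `"item"` dropout strategy used above drops whole embedding vectors per token rather than individual elements. One way to picture it (a standalone sketch in plain torch, assuming that reading of the strategy; not texar's internal code):

import torch

emb = torch.randn(2, 3, 4)                   # (batch, time, dim) embeddings
keep = (torch.rand(2, 3, 1) > 0.3).float()   # one Bernoulli draw per token
dropped = emb * keep / (1 - 0.3)             # inverted-dropout rescaling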
def _test_word_embedder(self, hparams):
    """Tests :class:`texar.modules.WordEmbedder`.
    """
    embedder = WordEmbedder(vocab_size=100, hparams=hparams)

    inputs = torch.randint(embedder.vocab_size, (64, 16), dtype=torch.long)
    outputs = embedder(inputs)

    inputs_soft = torch.randn(
        (64, 16, embedder.vocab_size), dtype=torch.float32)
    outputs_soft = embedder(soft_ids=inputs_soft)

    if isinstance(embedder.dim, (list, tuple)):
        emb_dim = tuple(embedder.dim)
    else:
        emb_dim = (embedder.dim,)

    if isinstance(hparams["dim"], (list, tuple)):
        hparams_dim = tuple(hparams["dim"])
    else:
        hparams_dim = (hparams["dim"],)

    self.assertEqual(outputs.size(), (64, 16) + emb_dim)
    self.assertEqual(outputs_soft.size(), (64, 16) + emb_dim)
    self.assertEqual(emb_dim, hparams_dim)
    self.assertEqual(embedder.vocab_size, 100)
def _test_word_embedder(self, hparams):
    """Tests :class:`texar.modules.WordEmbedder`.
    """
    embedder = WordEmbedder(vocab_size=100, hparams=hparams)

    inputs = torch.ones([64, 16], dtype=torch.int32)
    outputs = embedder(inputs)

    inputs_soft = torch.ones(
        [64, 16, embedder.vocab_size], dtype=torch.float32)
    outputs_soft = embedder(soft_ids=inputs_soft)

    # Normalize both dims to lists so the comparisons below are
    # type-consistent (`dim` may be an int, a list, or a tuple).
    emb_dim = embedder.dim
    if not isinstance(emb_dim, (list, tuple)):
        emb_dim = [emb_dim]
    emb_dim = list(emb_dim)

    hparams_dim = hparams["dim"]
    if not isinstance(hparams_dim, (list, tuple)):
        hparams_dim = [hparams_dim]
    hparams_dim = list(hparams_dim)

    self.assertEqual(list(outputs.shape), [64, 16] + emb_dim)
    self.assertEqual(list(outputs_soft.shape), [64, 16] + emb_dim)
    self.assertEqual(emb_dim, hparams_dim)
    self.assertEqual(embedder.vocab_size, 100)
def __init__(self, config_model, config_data):
    ModuleBase.__init__(self)
    self.config_model = config_model
    self.config_data = config_data

    with open(config_data.vocab_file, "rb") as f:
        id2w = pickle.load(f)
    self.id2w = id2w
    self.vocab_size = len(id2w)
    self.pad_token_id, self.bos_token_id = (0, 1)
    self.eos_token_id, self.unk_token_id = (2, 3)

    self.word_embedder = WordEmbedder(
        vocab_size=self.vocab_size,
        hparams=config_model.emb)
    self.pos_embedder = SinusoidsPositionEmbedder(
        position_size=config_data.max_decoding_length,
        hparams=config_model.position_embedder_hparams)

    self.encoder = TransformerEncoder(hparams=config_model.encoder)
    self.decoder = TransformerDecoder(
        vocab_size=self.vocab_size,
        output_layer=self.word_embedder.embedding,
        hparams=config_model.decoder)

    self.smoothed_loss_func = LabelSmoothingLoss(
        label_confidence=self.config_model.loss_label_confidence,
        tgt_vocab_size=self.vocab_size,
        ignore_index=0)
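Passing `output_layer=self.word_embedder.embedding` ties the decoder's output projection to the input embedding table. Conceptually (a standalone sketch with illustrative shapes, not texar's internal code), the logits come from a product with the transposed embedding matrix:

import torch

hidden = torch.randn(8, 16, 512)      # decoder states: (batch, time, hidden)
embedding = torch.randn(30000, 512)   # shared table: (vocab, hidden)
logits = hidden @ embedding.t()       # (batch, time, vocab), tied weights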
def __init__(self, hparams=None):
    super().__init__(hparams)

    self.word_embedder = WordEmbedder(
        vocab_size=self._hparams.vocab_size,
        hparams=self._hparams.embed)

    # Segment embedding for each type of tokens
    self.segment_embedder = WordEmbedder(
        vocab_size=self._hparams.type_vocab_size,
        hparams=self._hparams.segment_embed)

    # Position embedding
    self.position_embedder = PositionEmbedder(
        position_size=self._hparams.position_size,
        hparams=self._hparams.position_embed)

    # The BERT encoder (a TransformerEncoder)
    self.encoder = TransformerEncoder(hparams=self._hparams.encoder)

    self.pooler = nn.Sequential(
        nn.Linear(self._hparams.hidden_size, self._hparams.hidden_size),
        nn.Tanh(),
        nn.Dropout(self._hparams.dropout))

    self._num_classes = self._hparams.num_classes
    if self._num_classes > 0:
        logit_kwargs = self._hparams.logit_layer_kwargs
        if logit_kwargs is None:
            logit_kwargs = {}
        elif not isinstance(logit_kwargs, HParams):
            raise ValueError("hparams['logit_layer_kwargs'] "
                             "must be a dict.")
        else:
            logit_kwargs = logit_kwargs.todict()
        self.logits_layer = nn.Linear(
            self._hparams.hidden_size, self._num_classes, **logit_kwargs)
    else:
        self.logits_layer = None

    self.step_iteration = 0
def test_word_embedder_soft_ids(self):
    """Tests the correctness of using soft ids.
    """
    init_value = np.expand_dims(np.arange(5), 1)
    embedder = WordEmbedder(init_value=init_value)

    ids = np.array([3])
    # Soft ids are mixture weights and must be floating point for the
    # underlying matrix product with the embedding table.
    soft_ids = np.array([[0, 0, 0, 1, 0]], dtype=np.float32)

    outputs = embedder(ids=torch.from_numpy(ids))
    soft_outputs = embedder(soft_ids=torch.from_numpy(soft_ids))
    self.assertEqual(outputs, soft_outputs)
def test_word_embedder_soft_ids(self):
    """Tests the correctness of using soft ids.
    """
    init_value = np.expand_dims(np.arange(5), 1)
    embedder = WordEmbedder(init_value=init_value)

    ids = torch.tensor([3])
    soft_ids = torch.tensor([0, 0, 0, 1, 0], dtype=torch.float)

    outputs = embedder(ids=ids)
    soft_outputs = embedder(soft_ids=soft_ids)
    self.assertEqual(outputs, soft_outputs)
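The equivalence these tests check follows from soft ids being a weighted average of embedding rows: a one-hot weight vector selects exactly one row. A standalone illustration in plain torch, independent of `WordEmbedder`:

import torch

# Embedding table matching the tests above: row i holds the value i.
table = torch.arange(5, dtype=torch.float).unsqueeze(1)   # (5, 1)
hard = table[3]                              # ordinary lookup of id 3
one_hot = torch.tensor([0., 0., 0., 1., 0.])
soft = one_hot @ table                       # weighted sum over all rows
assert torch.equal(hard, soft)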
def setUp(self):
    self._vocab_size = 4
    self._max_time = 8
    self._batch_size = 16
    self._emb_dim = 20
    self._inputs = torch.randint(
        self._vocab_size, size=(self._batch_size, self._max_time))
    embedding = torch.rand(
        self._vocab_size, self._emb_dim, dtype=torch.float)
    self._embedder = WordEmbedder(init_value=embedding)
    self._hparams = HParams(None, BasicRNNDecoder.default_hparams())
def test_encode_with_embedder(self):
    """Tests encoding combined with :mod:`texar.modules.embedders`.
    """
    embedder = WordEmbedder(vocab_size=20, hparams={"dim": 100})
    inputs = tf.ones([64, 16], dtype=tf.int32)
    encoder = UnidirectionalRNNEncoder()
    outputs, state = encoder(embedder(inputs))

    cell_dim = encoder.hparams.rnn_cell.kwargs.num_units
    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())
        outputs_, state_ = sess.run([outputs, state])
        self.assertEqual(outputs_.shape, (64, 16, cell_dim))
        self.assertEqual(state_[0].shape, (64, cell_dim))
def setUp(self):
    self._vocab_size = 10
    self._max_time = 16
    self._batch_size = 8
    self._emb_dim = 20
    self._attention_dim = 256
    self._inputs = torch.randint(
        self._vocab_size, size=(self._batch_size, self._max_time))
    embedding = torch.rand(
        self._vocab_size, self._emb_dim, dtype=torch.float)
    self._embedder = WordEmbedder(init_value=embedding)
    self._encoder_output = torch.rand(
        self._batch_size, self._max_time, 64)

    self._test_hparams = {}  # (cell_type, is_multi) -> hparams
    for cell_type in ["RNNCell", "LSTMCell", "GRUCell"]:
        hparams = {
            "rnn_cell": {
                "type": cell_type,
                "kwargs": {
                    "num_units": 256,
                },
            },
            "attention": {
                "kwargs": {
                    "num_units": self._attention_dim,
                },
            },
        }
        self._test_hparams[(cell_type, False)] = HParams(
            hparams, AttentionRNNDecoder.default_hparams())

    hparams = {
        "rnn_cell": {
            "type": "LSTMCell",
            "kwargs": {
                "num_units": 256,
            },
            "num_layers": 3,
        },
        "attention": {
            "kwargs": {
                "num_units": self._attention_dim,
            },
        },
    }
    self._test_hparams[("LSTMCell", True)] = HParams(
        hparams, AttentionRNNDecoder.default_hparams())
def test_word_embedder_soft_ids(self):
    """Tests the correctness of using soft ids.
    """
    init_value = np.expand_dims(np.arange(5), 1)
    embedder = WordEmbedder(init_value=init_value)

    ids = np.array([3])
    # Soft ids are mixture weights; use float so the matrix product with
    # the embedding table type-checks.
    soft_ids = np.array([[0, 0, 0, 1, 0]], dtype=np.float32)

    outputs = embedder(ids=ids)
    soft_outputs = embedder(soft_ids=soft_ids)

    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())
        outputs_, soft_outputs_ = sess.run([outputs, soft_outputs])
        self.assertEqual(outputs_, soft_outputs_)
def __init__(self, gpt2_config, top_k, temperature):
    super().__init__()

    self.word_embedder = WordEmbedder(
        vocab_size=gpt2_config.vocab_size,
        hparams=gpt2_config.embed)

    self.pos_embedder = PositionEmbedder(
        position_size=gpt2_config.position_size,
        hparams=gpt2_config.pos_embed)

    self.decoder = TransformerDecoder(
        vocab_size=gpt2_config.vocab_size,
        output_layer=self.word_embedder.embedding,
        hparams=gpt2_config.decoder)

    self.top_k = top_k
    self.temperature = temperature

    self._embedding_fn = lambda x, y: (
        self.word_embedder(x) + self.pos_embedder(y))
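During inference the decoder invokes the embedding function with token ids and step positions, which is why `_embedding_fn` sums the word and position embeddings. A sketch of the call pattern with illustrative shapes; `model` is a constructed instance of the class above, and the exact invocation inside texar's decoding loop is an assumption here:

import torch

step_ids = torch.tensor([50, 287, 11])             # (batch,) last sampled ids
positions = torch.full((3,), 5, dtype=torch.long)  # all at decoding step t = 5
# inputs_t = model._embedding_fn(step_ids, positions)  # (batch, hidden)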
def test_embedder_multi_calls(self):
    """Tests embedders called multiple times.
    """
    hparams = {"dim": 26, "dropout_rate": 0.3, "dropout_strategy": "item"}
    embedder = WordEmbedder(vocab_size=100, hparams=hparams)
    inputs = torch.randint(embedder.vocab_size, (64, 16), dtype=torch.long)
    outputs = embedder(inputs)

    if isinstance(embedder.dim, (list, tuple)):
        emb_dim = tuple(embedder.dim)
    else:
        emb_dim = (embedder.dim,)
    self.assertEqual(outputs.size(), (64, 16) + emb_dim)

    # Call with inputs in a different shape
    inputs = torch.randint(
        embedder.vocab_size, (64, 10, 20), dtype=torch.long)
    outputs = embedder(inputs)
    self.assertEqual(outputs.size(), (64, 10, 20) + emb_dim)
def _test_word_embedder(self, hparams):
    """Tests :class:`texar.modules.WordEmbedder`.
    """
    embedder = WordEmbedder(vocab_size=100, hparams=hparams)

    inputs = tf.ones([64, 16], dtype=tf.int32)
    outputs = embedder(inputs)

    inputs_soft = tf.ones([64, 16, embedder.vocab_size], dtype=tf.float32)
    outputs_soft = embedder(soft_ids=inputs_soft)

    emb_dim = embedder.dim
    if not isinstance(emb_dim, (list, tuple)):
        emb_dim = [emb_dim]

    hparams_dim = hparams["dim"]
    if not isinstance(hparams["dim"], (list, tuple)):
        hparams_dim = [hparams["dim"]]

    self.assertEqual(outputs.shape, [64, 16] + emb_dim)
    self.assertEqual(outputs_soft.shape, [64, 16] + emb_dim)
    self.assertEqual(emb_dim, hparams_dim)
    self.assertEqual(embedder.vocab_size, 100)
    self.assertEqual(len(embedder.trainable_variables), 1)

    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())
        outputs_, outputs_soft_ = sess.run(
            [outputs, outputs_soft],
            feed_dict={global_mode(): tf.estimator.ModeKeys.TRAIN})
        self.assertEqual(outputs_.shape, (64, 16) + tuple(emb_dim))
        self.assertEqual(outputs_soft_.shape, (64, 16) + tuple(emb_dim))

    # Tests unknown input shapes
    inputs = tf.placeholder(dtype=tf.int64, shape=[None, None])
    outputs = embedder(inputs)
    self.assertEqual(len(outputs.get_shape()), 2 + len(hparams_dim))

    # Soft ids are float-valued mixture weights, so the placeholder
    # must be a float dtype.
    inputs_soft = tf.placeholder(dtype=tf.float32, shape=[None, None, None])
    outputs_soft = embedder(soft_ids=inputs_soft)
    self.assertEqual(len(outputs_soft.get_shape()), 2 + len(hparams_dim))
def __init__(self,
             pretrained_model_name: Optional[str] = None,
             cache_dir: Optional[str] = None,
             hparams=None):
    self.load_pretrained_config(pretrained_model_name, cache_dir, hparams)

    # Word embedding
    word_embedder = WordEmbedder(
        vocab_size=self._hparams.vocab_size,
        hparams=self._hparams.embed)

    # Position embedding
    position_embedder = PositionEmbedder(
        position_size=self._hparams.position_size,
        hparams=self._hparams.position_embed)

    # The GPT2 encoder (a TransformerEncoder)
    super().__init__(hparams=None)

    # Register modules after `__init__` is called.
    self.word_embedder = word_embedder
    self.position_embedder = position_embedder

    self.init_pretrained_weights(load_output_layer=False)
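The "register modules after `__init__`" comment reflects a PyTorch constraint: `nn.Module.__setattr__` records sub-modules in `self._modules`, which exists only after `nn.Module.__init__` has run. A minimal standalone illustration (the class names are hypothetical):

import torch.nn as nn

class Good(nn.Module):
    def __init__(self):
        super().__init__()            # sets up self._modules first
        self.layer = nn.Linear(4, 4)  # registration succeeds

class Bad(nn.Module):
    def __init__(self):
        # Instantiating this raises:
        # "cannot assign module before Module.__init__() call"
        self.layer = nn.Linear(4, 4)
        super().__init__()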