Example 2
    def __init__(self, hp_LM):
        super().__init__()
        self.embeddings = nn.Embedding(hp_LM.vocab_size,
                                       hp_LM.num_hidden_LM,
                                       padding_idx=0)
        self.encoder = Encoder(hp_LM)
        self.linear = nn.Linear(hp_LM.num_hidden_LM, hp_LM.vocab_size)
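A quick way to sanity-check this wrapper's shapes, assuming Encoder preserves the (batch, seq_len, num_hidden_LM) shape of its input; the stand-in module and values below are illustrative, not part of the original code:

import torch
import torch.nn as nn

class ToyLM(nn.Module):
    def __init__(self, vocab_size=1000, num_hidden=64):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, num_hidden, padding_idx=0)
        self.encoder = nn.Identity()  # stand-in for Encoder(hp_LM)
        self.linear = nn.Linear(num_hidden, vocab_size)

    def forward(self, token_ids):
        # token_ids: (batch, seq_len) integers; id 0 is padding
        return self.linear(self.encoder(self.embeddings(token_ids)))

logits = ToyLM()(torch.randint(1, 1000, (2, 16)))
print(logits.shape)  # torch.Size([2, 16, 1000])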
Example 3
    def __init__(self, hp):
        super().__init__()
        # Architecture settings are read from the hp object; the attribute
        # names below are assumed to mirror the original keyword arguments
        src_vocab = hp.vocab_size
        self.CTC_training = hp.CTC_training

        self.encoder = Encoder(src_vocab, hp.d_model_encoder, hp.N_e,
                               hp.n_head_encoder,
                               hp.ff_conv_kernel_size_encoder,
                               hp.concat_after_encoder, hp.dropout)
        self.decoder = Decoder(hp.trg_vocab,
                               hp.d_model_decoder,
                               hp.N_d,
                               hp.n_head_decoder,
                               hp.ff_conv_kernel_size_decoder,
                               hp.concat_after_decoder,
                               hp.dropout,
                               hp.dropout_prenet,
                               multi_speaker=hp.multi_speaker,
                               spk_emb_dim=hp.spk_emb_dim,
                               output_type=hp.output_type)
        self.stop_token = nn.Linear(hp.d_model_decoder, hp.reduction_rate)
        if self.CTC_training:
            self.CTC_linear = nn.Linear(hp.d_model_decoder, src_vocab)
        self.postnet = PostConvNet(hp.d_model_decoder,
                                   hp.trg_vocab,
                                   hp.reduction_rate,
                                   hp.dropout_postnet,
                                   output_type=hp.output_type,
                                   num_group=hp.num_group)
Example 4
    def __init__(self):
        super(CTCModel, self).__init__()
        if hp.encoder_type == 'Wave':
            self.encoder = WaveEncoder()
        else:
            self.encoder = Encoder()

        self.decoder = nn.Linear(hp.num_hidden_nodes * 2, hp.num_classes)
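        # hp is a module-level hyperparameter object here; the factor of 2 in
        # num_hidden_nodes * 2 suggests a bidirectional encoder whose forward
        # and backward states are concatenated before the linear decoder.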
Example 5
    def __init__(self, hp):
        super(AttModel, self).__init__()
        self.hp = hp
        if hp.encoder_type == 'Wave':
            self.encoder = WaveEncoder()
        elif hp.encoder_type == 'Conformer':
            self.encoder = TransformerEncoder(hp.lmfb_dim, 256, 16, 4, 0.1)
        else:
            self.encoder = Encoder(hp)
        self.decoder = Decoder(hp)
Example 6
    def __init__(self):
        super(AttModel, self).__init__()
        if hp.encoder_type == 'CNN':
            self.encoder = CNN_Encoder()
        elif hp.encoder_type == 'Wave':
            self.encoder = WaveEncoder()
        else:
            self.encoder = Encoder()
        self.decoder = Decoder()
        if hp.combined_ASR:
            self.wordlevel = Wordlevelencoder()
            self.acencoder = Acencoder()
        if hp.ASR_based:
            self.wordlevel = Wordlevelencoder()
Example 7
    def __init__(self, hp):
        super().__init__()
        self.d_model_e = hp.d_model_e
        self.d_model_d = hp.d_model_d
        self.trg_vocab = hp.vocab_size
        self.encoder_type = hp.encoder
        self.decoder_type = hp.decoder
        #self.mode = hp.mode
        self.use_ctc = hp.use_ctc
        self.hp = hp
        self.frame_stacking = True if hp.frame_stacking > 1 else False

        if self.hp.dev_mode:
            self.emb_real_tts = nn.Embedding(2, hp.mel_dim)

        if not self.frame_stacking:
            if hp.cnn_avepool:
                self.cnn_encoder = CNN_embedding_avepool(hp)
            else:
                self.cnn_encoder = CNN_embedding(hp)
        else:
            self.embedder = nn.Linear(hp.mel_dim * hp.frame_stacking,
                                      self.d_model_e)

        if self.encoder_type == 'Conformer':
            self.encoder = ConformerEncoder(hp)
        else:
            self.encoder = Encoder(hp)
        if self.decoder_type.lower() == 'transformer':
            self.decoder = Decoder(hp)
            self.out = nn.Linear(self.d_model_d, self.trg_vocab)
        elif self.decoder_type.lower() == 'ctc':
            self.out = nn.Linear(self.d_model_d, self.trg_vocab)
        elif self.decoder_type.lower() == 'transducer':
            self.decoder = TransducerDecoder(hp)
        else:
            self.decoder = LSTMDecoder(hp)

        if self.use_ctc:
            self.out_ctc = nn.Linear(self.d_model_e, self.trg_vocab)
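For context on the frame_stacking branch above: when hp.frame_stacking > 1, consecutive mel frames are concatenated along the feature axis before self.embedder, trading time resolution for a wider input. The stacking itself is not shown in this snippet, so the helper below is an assumed implementation, not the original code:

import torch

def stack_frames(mel, factor):
    # mel: (batch, time, mel_dim) -> (batch, time // factor, mel_dim * factor)
    b, t, d = mel.shape
    t = (t // factor) * factor  # drop trailing frames that do not fill a group
    return mel[:, :t, :].reshape(b, t // factor, d * factor)

mel = torch.randn(2, 100, 80)
print(stack_frames(mel, 3).shape)  # torch.Size([2, 33, 240])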
Example 8
class Transformer(tf.keras.Model):
    def __init__(self, config):
        """
        Transformer model that wraps over separate Encoder and Decoder classes, as well as providing the methods used
        during training and inference.
        :param config: config object containing the specification for how to build the encoder-decoder architecture
        """
        super(Transformer, self).__init__()
        self.config = config
        pes = []
        for i in range(self.config.max_length):
            pes.append(self.positional_embedding(i, self.config.model_size))
        pes = np.concatenate(pes, axis=0)
        pes = tf.constant(pes, dtype=tf.float32)
        # Original implementation of Vaswani et al.'s [2017] Attention Is All You Need encoder-decoder Transformer,
        # leveraging https://trungtran.io/2019/04/29/create-the-transformer-with-tensorflow-2-0/, onto which
        # substantial modifications have been made to accommodate the hybrid architecture.
        self.encoder = Encoder(self.config.vocab_size,
                               self.config.model_size,
                               self.config.num_layers,
                               self.config.h,
                               self.config.tokenizer.vectors,
                               pes=pes,
                               multitask=self.config.multitask)
        self.decoder = Decoder(self.config.vocab_size,
                               self.config.model_size,
                               self.config.num_layers,
                               self.config.h,
                               self.config.tokenizer.vectors,
                               pes=pes)

        self.multitask = self.config.multitask
        self.stopwords = self.config.stopwords
        self.stopword_l = self.config.stopword_l

    def train_step(self, inputs):
        """
        Called during each update step.
        :param inputs: A tuple of 2D tensors of encoder_inputs, decoder_inputs, and decoder_outputs
        :return: A dictionary mapping task names to loss values
        """
        encoder_inputs_contexts, decoder_inputs, decoder_outputs = inputs
        with tf.GradientTape() as tape:
            if self.multitask:
                # Create inputs for retrieval and re-ranking tasks
                encoder_inputs_responses = tf.identity(decoder_inputs)
                encoder_inputs_distractors = tf.identity(decoder_inputs)
                # tf.random.shuffle returns a shuffled copy rather than shuffling in place
                encoder_inputs_distractors = tf.random.shuffle(
                    encoder_inputs_distractors)

                # Create padding mask to block attention on padding (PAD has id 0)
                contexts_padding_mask = 1 - tf.cast(
                    tf.equal(encoder_inputs_contexts, 0), dtype=tf.float32)
                responses_padding_mask = 1 - tf.cast(
                    tf.equal(encoder_inputs_responses, 0), dtype=tf.float32)
                distractors_padding_mask = 1 - tf.cast(
                    tf.equal(encoder_inputs_distractors, 0), dtype=tf.float32)

                # Add additional dimension to mask (batch_size, 1, seq_len)
                contexts_padding_mask = tf.expand_dims(contexts_padding_mask,
                                                       axis=1)
                responses_padding_mask = tf.expand_dims(responses_padding_mask,
                                                        axis=1)
                distractors_padding_mask = tf.expand_dims(
                    distractors_padding_mask, axis=1)
                masks = {
                    "contexts": contexts_padding_mask,
                    "responses": responses_padding_mask,
                    "distractors": distractors_padding_mask
                }

                encoder_output = self.encoder.multitask_forward([
                    encoder_inputs_contexts, encoder_inputs_responses,
                    encoder_inputs_distractors
                ], masks)
                retrieval_loss = encoder_output["contrastive_loss"]
                reranker_loss = encoder_output["reranker_loss"]

                decoder_output = self.decoder(
                    decoder_inputs, encoder_output["encoder_outputs"],
                    masks["contexts"])

                generator_loss = self.loss_func(decoder_outputs,
                                                decoder_output)
                losses = {
                    "generator": np.mean(generator_loss.numpy()),
                    "retrieval": np.mean(retrieval_loss.numpy()),
                    "reranker": np.mean(reranker_loss.numpy())
                }
                loss = generator_loss + retrieval_loss + reranker_loss

            else:
                encoder_outputs = self.encoder(encoder_inputs_contexts)
                pred = self.decoder(decoder_inputs, encoder_outputs)
                generator_loss = self.loss_func(decoder_outputs, pred)
                losses = {"generator": np.mean(generator_loss.numpy())}
                loss = generator_loss

        variables = self.encoder.trainable_variables + self.decoder.trainable_variables
        gradients = tape.gradient(loss, variables)
        self.optimizer.apply_gradients(zip(gradients, variables))

        return losses

    def load_weights(self, folder):
        self.encoder.load_weights(
            f"{folder}/{self.config.model_name}_encoder.h5")
        self.decoder.load_weights(
            f"{folder}/{self.config.model_name}_decoder.h5")

    def predict(self,
                test_source_text=None,
                top_k=None,
                return_probabilities=False):
        """
        Used at inference, to take an input sentence and produce a response.
        :param test_source_text: A string specifying the context; a random dummy context will be used if not provided
        :param top_k: An integer determining the number of candidates to sample from at each step of top-k decoding.
        If not provided, greedy decoding is used
        :param return_probabilities: Boolean; set to True to return a tuple of (response, probability), otherwise only
        the response is returned
        :return: A string response or a tuple of string response and float probability
        """
        # If no test sentence is provided, randomly pick one from the list below
        if test_source_text is None:
            test_source_text = np.random.choice([
                "hello, how are you?", "what is your name?", "hi there",
                "what's up?"
            ])
            print(test_source_text)

        # Tokenize the test sentence to obtain source sequence
        test_source_seq = self.config.tokenizer.encode_ids([test_source_text])

        # Convert to tensor and truncate
        en_output = self.encoder(
            tf.constant(test_source_seq)[:, :self.config.max_length])

        de_input = tf.constant([[1]], dtype=tf.int64)  # start with the BOS token (id 1)
        out_words = []
        probability = 1.

        while True:
            de_output = self.decoder(de_input, en_output)

            if top_k is None:
                # Take the last token as the predicted token
                new_word = tf.expand_dims(tf.argmax(de_output, -1)[:, -1],
                                          axis=1)

                out_words.append(int(new_word.numpy()[0][0]))

                # The next input is a new sequence that
                # contains both the input sequence and the predicted token
                de_input = tf.concat((de_input, new_word), axis=1)

            else:
                # Select top_k tokens
                p = []
                array = de_output.numpy()
                argsort = np.argsort(array[0, -1, :])
                candidates = argsort[-top_k:]
                for k in range(top_k):
                    p.append(array[0, -1, candidates[k]])
                new_word = np.random.choice(candidates, p=softmax(p))
                probability *= softmax(array[0, -1, :])[int(new_word)]

                out_words.append(int(new_word))

                # The next input is a new sequence that
                # contains both the input sequence and the predicted token
                de_input = tf.concat(
                    (de_input, tf.cast(tf.constant([[new_word]]), tf.int64)),
                    axis=1)

            # Stop when the <end> token (id 2) is produced or the length reaches 14
            if out_words[-1] == 2 or len(out_words) >= 14:
                break

        if return_probabilities:
            return self.config.tokenizer.decode_ids(out_words), probability

        else:
            return self.config.tokenizer.decode_ids(out_words)

    def test(self, contexts, responses):
        """
        Generates predictions based on the contexts provided, and stores as a pickle file.
        :param contexts: list of strings
        :param responses: list of strings
        :return: None
        """
        if self.config.multitask:
            # Load retrieval candidates
            try:
                retrieval_vectors = pickle.load(
                    open("Save/response_vectors", "rb"))
                retrieval_texts = pickle.load(open("Save/response_texts",
                                                   "rb"))
            except FileNotFoundError as e:
                print(e)
                print("Must initialise retrieval candidates first")
                sys.exit()

        predictions = []
        generative_usage = 0
        for context in contexts:
            generated_responses = []
            if not self.config.multitask:
                # Generate response candidates
                probs = []
                for _ in range(self.config.num_generated):
                    response, prob = self.predict(test_source_text=context,
                                                  top_k=5,
                                                  return_probabilities=True)
                    generated_responses.append(response)
                    probs.append(prob)
                argmax = int(np.argmax(np.asarray(probs)))

            else:
                for _ in range(self.config.num_generated):
                    generated_responses.append(self.predict(context, top_k=5))

                # Rerank candidates
                # Encode context and duplicate
                context_encoded = self.config.tokenizer.encode_ids(
                    [context])[0]
                context_encoded = context_encoded[:self.config.max_length]
                context_encoded = [
                    context_encoded for _ in range(self.config.num_generated +
                                                   self.config.num_retrieved)
                ]

                # Encode the context and score retrieval candidates by dot product
                retrieval_encode_context = self.encoder.encode_contexts(
                    tf.constant([context_encoded[0]])).numpy()
                scores = np.dot(retrieval_encode_context[0],
                                retrieval_vectors.T)
                argsort = np.argsort(scores)
                retrieval_candidates = argsort[-self.config.num_retrieved:]

                for i in range(self.config.num_retrieved):
                    generated_responses.append(retrieval_texts[int(
                        retrieval_candidates[i])])

                # Encode candidates
                candidates_encoded = self.config.tokenizer.encode_ids_with_bos_eos(
                    generated_responses)
                candidates = np.zeros(
                    [len(generated_responses), self.config.max_length])
                for j, c in enumerate(candidates_encoded):
                    # Leave off EOS, as the Encoder was only trained on responses with a BOS token
                    for k in range(len(c) - 1):
                        candidates[j, k] = c[k]

                        if k == (self.config.max_length - 1):
                            break

                context_encoded = tf.constant(context_encoded)
                candidates = tf.constant(candidates)

                scores = self.encoder.rerank(context_encoded, candidates)
                argmax = np.argmax(scores.numpy()[:, 0])

            print("=====================")
            print(context)
            pred = generated_responses[int(argmax)]
            # Indices below num_generated are generated (rather than retrieved)
            # candidates; the hard-coded 9 assumes num_generated == 10
            if int(argmax) <= 9:
                generative_usage += 1
            predictions.append(pred)
            print(pred)
            print("====================")

        pickle.dump((contexts, responses, predictions),
                    open(f"Save/{self.config.model_name}_test", "wb"))
        print(generative_usage)

    def validation_loss(self, inputs):
        """
        Completes a forward pass on the model with validation data and calculates the loss
        :param inputs: tuple containing 2D tensors of encoder_inputs, decoder_inputs, and decoder_outputs
        :return: A dictionary mapping task losses to values
        """
        encoder_inputs_contexts, decoder_inputs, decoder_outputs = inputs
        if self.multitask:
            # Create inputs for retrieval and re-ranking tasks
            encoder_inputs_responses = tf.identity(decoder_inputs)
            encoder_inputs_distractors = tf.identity(decoder_inputs)
            # tf.random.shuffle returns a shuffled copy rather than shuffling in place
            encoder_inputs_distractors = tf.random.shuffle(
                encoder_inputs_distractors)
            encoder_outputs = self.encoder.multitask_forward([
                encoder_inputs_contexts, encoder_inputs_responses,
                encoder_inputs_distractors
            ])
            retrieval_loss = encoder_outputs["contrastive_loss"]
            reranker_loss = encoder_outputs["reranker_loss"]

            pred = self.decoder(decoder_inputs,
                                encoder_outputs["encoder_outputs"])
            generator_loss = self.loss_func(decoder_outputs, pred)
            losses = {
                "generator": np.mean(generator_loss.numpy()),
                "retrieval": np.mean(retrieval_loss.numpy()),
                "reranker": np.mean(reranker_loss.numpy())
            }

        else:
            encoder_outputs = self.encoder(encoder_inputs_contexts)
            pred = self.decoder(decoder_inputs, encoder_outputs)
            generator_loss = self.loss_func(decoder_outputs, pred)
            losses = {"generator": np.mean(generator_loss.numpy())}

        return losses

    def loss_func(self, targets, logits):
        """
        Calculates cross-entropy loss, applying a mask to the padding tokens and down-weighting the stopwords if the
        model requires it.
        :param targets: 2D tensor of target indices
        :param logits: 3D tensor of predictions
        :return: cross-entropy loss
        """
        crossentropy = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True)
        mask = tf.cast(tf.math.logical_not(tf.math.equal(targets, 0)),
                       tf.float64)
        if self.stopwords is not None:
            for id_ in self.stopwords:
                tmp_mask = tf.cast(tf.math.equal(targets, id_),
                                   tf.float64) * -self.stopword_l
                mask = tf.add(mask, tmp_mask)
        loss = crossentropy(targets, logits, sample_weight=mask)

        return loss
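
    # Worked example of the masking above (illustrative values): with
    # targets = [5, 0, 7], stopwords = [7] and stopword_l = 0.5, the padding
    # mask is [1., 0., 1.] and the stopword adjustment is [0., 0., -0.5],
    # so the final sample weights are [1., 0., 0.5]: padding is ignored and
    # the stopword token contributes at half weight.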

    @staticmethod
    def positional_embedding(pos, model_size):
        """
        Encodes position according to the positional embedding used in Vaswani et al.'s [2017] Attention Is All You Need.
        :param pos: integer position in the sentence
        :param model_size: integer dimensionality of the embedding
        :return: 2D numpy array
        """
        PE = np.zeros((1, model_size))
        for i in range(model_size):
            if i % 2 == 0:
                PE[:, i] = np.sin(pos / 10000**(i / model_size))
            else:
                PE[:, i] = np.cos(pos / 10000**((i - 1) / model_size))

        return PE
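To make the table construction in __init__ concrete: stacking one such row per position yields a (max_length, model_size) matrix in which even columns follow sin(pos / 10000^(i / model_size)) and odd columns the matching cosine. A standalone check with toy sizes (NumPy only):

import numpy as np

def positional_embedding(pos, model_size):
    pe = np.zeros((1, model_size))
    for i in range(model_size):
        if i % 2 == 0:
            pe[:, i] = np.sin(pos / 10000 ** (i / model_size))
        else:
            pe[:, i] = np.cos(pos / 10000 ** ((i - 1) / model_size))
    return pe

pes = np.concatenate([positional_embedding(p, 8) for p in range(20)], axis=0)
print(pes.shape)  # (20, 8)
print(pes[0])     # position 0: sine columns are 0.0, cosine columns are 1.0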
Example 9
    def __init__(self,
                 src_vocab,
                 trg_vocab,
                 d_model_encoder,
                 N_e,
                 n_head_encoder,
                 ff_conv_kernel_size_encoder,
                 concat_after_encoder,
                 d_model_decoder,
                 N_d,
                 n_head_decoder,
                 ff_conv_kernel_size_decoder,
                 concat_after_decoder,
                 reduction_rate,
                 dropout,
                 CTC_training,
                 n_bins,
                 f0_min,
                 f0_max,
                 energy_min,
                 energy_max,
                 pitch_pred=True,
                 energy_pred=True,
                 output_type=None,
                 num_group=None,
                 log_offset=1.,
                 multi_speaker=False,
                 spk_emb_dim=None,
                 spkr_emb=None):
        super().__init__()

        if spkr_emb and 'encoder' in spkr_emb:  # guard against spkr_emb=None
            self.encoder = Encoder(src_vocab, d_model_encoder, N_e,
                                   n_head_encoder, ff_conv_kernel_size_encoder,
                                   concat_after_encoder, dropout,
                                   multi_speaker, spk_emb_dim)
        else:
            self.encoder = Encoder(src_vocab,
                                   d_model_encoder,
                                   N_e,
                                   n_head_encoder,
                                   ff_conv_kernel_size_encoder,
                                   concat_after_encoder,
                                   dropout=0.0,
                                   multi_speaker=False,
                                   spk_emb_dim=None)

        self.variance_adaptor = VarianceAdaptor(d_model_encoder, n_bins,
                                                f0_min, f0_max, energy_min,
                                                energy_max, log_offset,
                                                pitch_pred, energy_pred)

        if spkr_emb and 'decoder' in spkr_emb:  # guard against spkr_emb=None
            self.decoder = Encoder(d_model_encoder,
                                   d_model_decoder,
                                   N_d,
                                   n_head_decoder,
                                   ff_conv_kernel_size_decoder,
                                   concat_after_decoder,
                                   dropout,
                                   multi_speaker,
                                   spk_emb_dim,
                                   embedding=False)
        else:
            self.decoder = Encoder(d_model_encoder,
                                   d_model_decoder,
                                   N_d,
                                   n_head_decoder,
                                   ff_conv_kernel_size_decoder,
                                   concat_after_decoder,
                                   dropout=0.0,
                                   multi_speaker=False,
                                   spk_emb_dim=None,
                                   embedding=False)

        self.postnet = PostConvNet(d_model_decoder, trg_vocab, reduction_rate,
                                   0.5, output_type, num_group)
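
The spkr_emb argument in this last example works as a switch for where speaker conditioning is injected: e.g. spkr_emb=('encoder',) builds a speaker-aware encoder and a speaker-independent decoder. A minimal sketch of that dispatch pattern in isolation (nn.Linear is a stand-in for the real Encoder, and the dimensions are illustrative):

import torch.nn as nn

def build_block(speaker_aware, d_model=256, spk_emb_dim=64):
    # A speaker-aware block receives the speaker embedding concatenated
    # onto its input; a plain block does not.
    in_dim = d_model + (spk_emb_dim if speaker_aware else 0)
    return nn.Linear(in_dim, d_model)

spkr_emb = ('encoder',)  # illustrative value
encoder = build_block(bool(spkr_emb and 'encoder' in spkr_emb))
decoder = build_block(bool(spkr_emb and 'decoder' in spkr_emb))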