Example #1
    def forward(self, memory, decoder_inputs, memory_lengths, teacher_forcing):
        """ Decoder forward pass for training
        PARAMS
        ------
        memory: Encoder outputs
        decoder_inputs: Decoder inputs for teacher forcing, i.e. the
            ground-truth acoustic features.
        memory_lengths: Encoder output lengths for attention masking.
        teacher_forcing: Probability of feeding the ground-truth frame
            (rather than the model's previous prediction) at each step.

        RETURNS
        -------
        acoustic_outputs: acoustic outputs from the decoder
        gate_outputs: gate outputs from the decoder
        alignments: sequence of attention weights from the decoder
        """

        # Prepend the all-zeros <GO> frame to the ground-truth frames and run
        # the prenet over the whole teacher-forcing sequence in one pass.
        decoder_input = self.get_go_frame(memory).unsqueeze(0)
        decoder_inputs = self.parse_decoder_inputs(decoder_inputs)
        decoder_inputs = torch.cat((decoder_input, decoder_inputs), dim=0)
        decoder_inputs = self.prenet(decoder_inputs)

        self.initialize_decoder_states(
            memory, mask=~get_mask_from_lengths(memory_lengths))

        acoustic_outputs, gate_outputs, alignments = [], [], []
        # Sample one flag per decoder step: 1 = feed the ground-truth frame
        # (teacher forcing), 0 = feed the model's previous prediction.
        teach_force_flags = np.random.choice(
            2, [decoder_inputs.size(0) - 1],
            p=[1 - teacher_forcing, teacher_forcing])

        while len(acoustic_outputs) < decoder_inputs.size(0) - 1:
            step = len(acoustic_outputs)
            # Step 0 has no previous prediction, so it always consumes the
            # teacher-forced <GO> frame; later steps respect the sampled flag.
            # Model predictions have not been through the prenet yet, so apply
            # it here; the teacher-forced inputs were prenet-ed above.
            if step > 0 and not teach_force_flags[step]:
                decoder_input = self.prenet(acoustic_output)
            else:
                decoder_input = decoder_inputs[step]
            # Optionally restrict attention to a window around the current
            # step, in addition to the usual length-based padding mask.
            if self.attention_window_size is not None:
                attention_windowed_mask = get_mask_from_lengths_window_and_time_step(
                    memory_lengths, self.attention_window_size, step)
            else:
                attention_windowed_mask = None

            acoustic_output, gate_output, attention_weights = self.decode(
                decoder_input, attention_windowed_mask)

            acoustic_outputs += [acoustic_output.squeeze(1)]
            gate_outputs += [gate_output.squeeze()]
            alignments += [attention_weights]

        acoustic_outputs, gate_outputs, alignments = self.parse_decoder_outputs(
            acoustic_outputs, gate_outputs, alignments)

        return acoustic_outputs, gate_outputs, alignments
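
A minimal training-step sketch for the forward pass above. The names
`decoder`, `encoder_outputs`, `acoustic_targets`, and `text_lengths` are
hypothetical stand-ins for objects the snippet assumes but does not define,
and the tensor shapes are assumptions following the usual Tacotron-style
convention.

    import torch
    import torch.nn.functional as F

    # Hypothetical shapes (assumptions, not taken from the snippet):
    #   encoder_outputs:  (B, T_in, encoder_dim)
    #   acoustic_targets: (B, n_acoustic_feats, T_out)
    #   text_lengths:     (B,)
    acoustic_outputs, gate_outputs, alignments = decoder(
        encoder_outputs, acoustic_targets, text_lengths,
        teacher_forcing=0.9)  # feed ground-truth frames 90% of the time
    loss = F.mse_loss(acoustic_outputs, acoustic_targets)
    loss.backward()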
Example #2
    def inference(self, memory, memory_lengths):
        """ Decoder inference
        PARAMS
        ------
        memory: Encoder outputs
        memory_lengths: Encoder output lengths, used for windowed attention
            masking and to cap the number of decoder steps.

        RETURNS
        -------
        acoustic_outputs: acoustic outputs from the decoder
        gate_outputs: gate outputs from the decoder
        alignments: sequence of attention weights from the decoder
        """
        decoder_input = self.get_go_frame(memory)

        # No padding mask at inference time (a single, unpadded utterance).
        self.initialize_decoder_states(memory, mask=None)

        acoustic_outputs, gate_outputs, alignments = [], [], []
        while True:
            decoder_input = self.prenet(decoder_input)

            if self.attention_window_size is not None:
                time_step = len(acoustic_outputs)
                attention_windowed_mask = get_mask_from_lengths_window_and_time_step(
                    memory_lengths, self.attention_window_size, time_step)
            else:
                attention_windowed_mask = None

            acoustic_output, gate_output, alignment = self.decode(
                decoder_input, attention_windowed_mask)

            acoustic_outputs += [acoustic_output.squeeze(1)]
            gate_outputs += [gate_output]
            alignments += [alignment]
            # This variant ignores the gate prediction and instead stops after
            # a fixed number of steps tied to the input length. Note that
            # memory_lengths.item() assumes a batch size of 1.
            if len(acoustic_outputs) == min(self.max_decoder_steps,
                                            int(memory_lengths.item())):
                break

            decoder_input = acoustic_output

        acoustic_outputs, gate_outputs, alignments = self.parse_decoder_outputs(
            acoustic_outputs, gate_outputs, alignments)

        return acoustic_outputs, gate_outputs, alignments
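
A hedged inference sketch for this variant. Because it calls
memory_lengths.item(), a batch size of 1 is assumed; `decoder` and
`encoder_outputs` are hypothetical names as above.

    import torch

    with torch.no_grad():
        memory_lengths = torch.tensor([encoder_outputs.size(1)])
        acoustic_outputs, gate_outputs, alignments = decoder.inference(
            encoder_outputs, memory_lengths)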
Example #3
    def inference(self, memory, memory_lengths):
        """ Decoder inference
        PARAMS
        ------
        memory: Encoder outputs
        memory_lengths: Encoder output lengths, used for windowed attention
            masking.

        RETURNS
        -------
        acoustic_outputs: acoustic outputs from the decoder
        gate_outputs: gate outputs from the decoder
        alignments: sequence of attention weights from the decoder
        """
        decoder_input = self.get_go_frame(memory)

        self.initialize_decoder_states(memory, mask=None)

        acoustic_outputs, gate_outputs, alignments = [], [], []
        while True:
            decoder_input = self.prenet(decoder_input)

            if self.attention_window_size is not None:
                time_step = len(acoustic_outputs)
                attention_windowed_mask = \
                    get_mask_from_lengths_window_and_time_step(
                        memory_lengths, self.attention_window_size, time_step)
            else:
                attention_windowed_mask = None

            acoustic_output, gate_output, alignment = self.decode(
                decoder_input, attention_windowed_mask)

            acoustic_outputs += [acoustic_output.squeeze(1)]
            gate_outputs += [gate_output]
            alignments += [alignment]

            # Stop once the sigmoid of the gate logit predicts end of
            # sequence; otherwise bail out after max_decoder_steps so the
            # loop cannot run forever.
            if torch.sigmoid(gate_output.data) > self.gate_threshold:
                break
            elif len(acoustic_outputs) == self.max_decoder_steps:
                print("Warning! Reached max decoder steps")
                break

            decoder_input = acoustic_output

        acoustic_outputs, gate_outputs, alignments = self.parse_decoder_outputs(
            acoustic_outputs, gate_outputs, alignments)

        return acoustic_outputs, gate_outputs, alignments
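
For intuition on the gate-based stop in this last variant, a standalone
sketch of the criterion. The 0.5 threshold is a common default, not a value
taken from the snippet.

    import torch

    gate_threshold = 0.5
    gate_output = torch.tensor([2.3])  # raw (pre-sigmoid) gate logit
    stop = torch.sigmoid(gate_output) > gate_threshold
    print(bool(stop))  # True: sigmoid(2.3) ~ 0.909 > 0.5, decoding stops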