Example #1
def init_ext_scorer(self, beam_alpha, beam_beta, language_model_path,
                    vocab_list):
    """Initialize the external scorer.

    :param beam_alpha: Parameter associated with language model.
    :type beam_alpha: float
    :param beam_beta: Parameter associated with word count.
    :type beam_beta: float
    :param language_model_path: Filepath for language model. If it is
                                empty, the external scorer will be set to
                                None, and the decoding method will be pure
                                beam search without scorer.
    :type language_model_path: basestring|None
    :param vocab_list: List of tokens in the vocabulary, for decoding.
    :type vocab_list: list
    """
    if language_model_path != '':
        self.logger.info("begin to initialize the external scorer "
                         "for decoding")
        self._ext_scorer = Scorer(beam_alpha, beam_beta,
                                  language_model_path, vocab_list)
        lm_char_based = self._ext_scorer.is_character_based()
        lm_max_order = self._ext_scorer.get_max_order()
        lm_dict_size = self._ext_scorer.get_dict_size()
        self.logger.info("language model: "
                         "is_character_based = %d," % lm_char_based +
                         " max_order = %d," % lm_max_order +
                         " dict_size = %d" % lm_dict_size)
        self.logger.info("end initializing scorer")
    else:
        self._ext_scorer = None
        self.logger.info("no language model provided, "
                         "decoding by pure beam search without scorer.")
Example #2
class DeepSpeech2Model(object):
    """DeepSpeech2Model class.

    :param vocab_size: Decoding vocabulary size.
    :type vocab_size: int
    :param share_rnn_weights: Whether to share input-hidden weights between
                              forward and backward directional RNNs. Notice
                              that for GRU, weight sharing is not supported.
    :type share_rnn_weights: bool
    """

    def __init__(self, model,
                 ds2_model_path,
                 vocab_list,
                 device):
        self.vocab_size = len(vocab_list)
        self.vocab_list = vocab_list
        self.device = device
        self.model = self._create_model(model,
                                        self.vocab_size,
                                        self.device)
        self._load_paddle_pretrained(ds2_model_path)
        self.model.to(self.device)

        self._inferer = None
        self._loss_inferer = None
        self._ext_scorer = None
        # the model only contains 2 conv layers
        self._num_conv_layers = 2

        self.logger = logging.getLogger("")
        self.logger.setLevel(level=logging.INFO)



    def train(self,
              train_dataset,
              train_batchsize,
              val_dataset,
              val_batchsize,
              collate_fn,
              lr_key,
              exclude_lr_key,
              learning_rate,
              scheduler_gamma,
              gradient_clipping,
              num_passes,
              output_dir,
              writer,
              num_iterations_print=100,
              feeding_dict=None,
              sortN_epoch=0,
              num_workers=10,
              specific_lr_dict=None
              ):
        """Train the model for one epoch

        :param train_batch_reader: Train data reader.
        :type train_batch_reader: callable

        :param dev_batch_reader: Validation data reader.
        :type dev_batch_reader: callable

        :param feeding_dict: Feeding is a map of field name and tuple index
                             of the data that reader returns.
        :type feeding_dict: dict|list

        :param learning_rate: Learning rate for ADAM optimizer.
        :type learning_rate: float

        :param gradient_clipping: Gradient clipping threshold.
        :type gradient_clipping: float

        :param num_passes: Number of training epochs.
        :type num_passes: int

        :param num_iterations_print: Number of training iterations for printing
                                     a training loss.
        :type rnn_iteratons_print: int

        :param output_dir: Directory for saving the model (every pass).
        :type output_dir: basestring
        """

        self.model.train()
        self.logger.info("DNN structure: \n {}\n".format(self.model))
        self.logger.info("Learning rate: \n {}\n".format(learning_rate))
        self.logger.info("scheduler_gamma: \n {}\n".format(scheduler_gamma))
        # prepare model output directory
        assert os.path.exists(output_dir)


        # adapt the feeding dict and reader according to the network
        # adapted_train_batch_reader = self._adapt_data(train_batch_reader)
        # adapted_dev_batch_reader = self._adapt_data(dev_batch_reader)

        # create loss
        # PaddlePaddle's DeepSpeech2 puts the blank at the end of the vocabulary
        self.criterion = CTCLoss(blank=self.vocab_size, reduction="mean", zero_infinity=True)

        # optimizer
        # Prepare optimizer and schedule (linear warmup and decay)
        tuned_param = {n: p for n, p in self.model.named_parameters()
                       if any(nd in n for nd in lr_key)}

        if exclude_lr_key:
            # drop parameters whose name matches any excluded key
            tuned_param = {n: tuned_param[n] for n in tuned_param
                           if not any(nd in n for nd in exclude_lr_key)}

        # TODO: implement a flexible optimizer that can customize the learning
        # rate for each layer.
        if specific_lr_dict:
            assert isinstance(specific_lr_dict, dict)

            special_param = []
            # parameters that do not match any specific-lr key keep the
            # default learning rate
            common_param = [{"params": tuned_param[n]} for n in tuned_param
                            if not any(nd in n for nd in specific_lr_dict)]

            for n in tuned_param:
                key_loc = [nd in n for nd in specific_lr_dict]
                if any(key_loc):
                    # take the learning rate of the first matching key
                    special_param.append({
                        "params": tuned_param[n],
                        "lr": list(compress(specific_lr_dict.values(), key_loc))[0]
                    })

            optim_param = common_param + special_param
        else:
            optim_param = [{"params": tuned_param[n]} for n in tuned_param]

        self.logger.info("learnable parameter name: {}\n".format(tuned_param.keys()))
        optimizer = optim.AdamW(optim_param, lr=learning_rate)
        # optimizer = optim.Adam(tuned_param.values(), lr=learning_rate,
        #                         betas=(0.9, 0.98), eps=1e-9)

        scheduler = optim.lr_scheduler.ExponentialLR(optimizer,
                                                     gamma=scheduler_gamma,
                                                     last_epoch=-1)

        val_dataloader = DataLoader(val_dataset, batch_size=val_batchsize,
                                    shuffle=False, num_workers=num_workers,
                                    collate_fn=collate_fn)

        self.logger.info("Start training process")
        global_iter = 0
        last_lr = learning_rate
        for epoch in range(num_passes):
            shuffle = epoch >= sortN_epoch
            train_dataloader = DataLoader(train_dataset, batch_size=train_batchsize,
                                          shuffle=shuffle, num_workers=num_workers,
                                          collate_fn=collate_fn)
            for index, batch in enumerate(train_dataloader):
                if not self.model.training:
                    self.model.train()

                # self.optimizer.zero_grad()
                self.model.zero_grad()
                loss = self.compute_loss(batch)
                # loss != loss is true only for NaN; abort the epoch if it occurs
                if loss != loss:
                    self.logger.info("loss: {} \ninput {}".format(loss.item(), batch["uttid"]))
                    break

                loss.backward()
                torch.nn.utils.clip_grad_norm_(tuned_param.values(), max_norm=gradient_clipping)
                optimizer.step()

                self.logger.debug("epoch{}, global_iter{} train loss: {}".format(epoch, global_iter, loss.item()))
                if global_iter % num_iterations_print == 0:
                    val_loss = self.evaluate(val_dataloader)

                    self.logger.info("epoch: {}, global_iter: {}, last_lr: {},loss/train: {}, loss/val: {}".format(epoch, global_iter, last_lr, loss.item(), val_loss))
                    writer.add_scalar('Loss/train', loss.item(), global_iter)
                    writer.add_scalar('Loss/val', val_loss, global_iter)

                    torch.save(self.model.state_dict(),
                               os.path.join(output_dir, "models/model_{}.pth".format(global_iter)))

                    val_detail = self.decode(val_dataloader)
                    val_detail.to_pickle(os.path.join(output_dir, "vals/val_detail_{}.pkl".format(global_iter)))


                global_iter += 1

            scheduler.step()
            last_lr = scheduler.get_last_lr()

        val_loss = self.evaluate(val_dataloader)
        self.logger.info("global_iter: {}, loss/train: {}, loss/val: {}".format(global_iter, loss.item(), val_loss))
        writer.add_scalar('Loss/train', loss.item(), global_iter)
        writer.add_scalar('Loss/val', val_loss, global_iter)
        torch.save(self.model.state_dict(),
                   os.path.join(output_dir, "models/model_{}.pth".format("final")))


    def decode(self, dataloader):
        outputs = defaultdict(list)
        for i_batch, sample_batched in enumerate(dataloader):
            batch_results = self.infer_batch_probs(infer_data=sample_batched)
            batch_transcripts_beam = self.decode_batch_beam_search(probs_split=batch_results,
                                                                   beam_alpha=2,
                                                                   beam_beta=0.35,
                                                                   beam_size=500,
                                                                   cutoff_prob=1.0,
                                                                   cutoff_top_n=40,
                                                                   num_processes=5)
            outputs["uttid"].extend(sample_batched["uttid"])
            outputs["probs"].extend(batch_results)
            outputs["asr"].extend(batch_transcripts_beam)
            outputs["text"].extend(sample_batched["trans"])

        outputs = pd.DataFrame.from_dict(outputs)
        return outputs

    def evaluate(self, dataloader) -> float:
        total_loss = []
        for index, batch in enumerate(dataloader):
            loss = self.compute_loss(batch)
            total_loss.append(loss.item())

            # log batch details when the loss is NaN or inf
            if loss != loss or loss == float('inf'):
                for i, _ in enumerate(batch["uttid"]):
                    self.logger.debug("uttid: {}, length_spec: {}, text: {}".format(
                        batch["uttid"][i], batch["length_spec"][i], batch["trans"][i]))

        return np.mean(total_loss)



    def compute_loss(self, batch):
        self.model.zero_grad()
        batch = self._adapt_data(batch)
        refs = batch["text"]
        length_refs = batch["length_text"]
        flattened_refs = DeepSpeech2Model.flatten_padded_seq(refs, length_refs)
        hyps, length_hyps, other = self.model(batch)
        hyps = hyps[0]
        flattened_refs = flattened_refs.to(self.device)
        # (log_probs, targets) must on the same device
        # input_lengths, target_lengths
        loss = self.criterion(log_probs=hyps,
                              targets=flattened_refs,
                              input_lengths=length_hyps,
                              target_lengths=length_refs)

        if loss != loss:  # NaN check
            self.logger.debug("uttid: {}".format(batch["uttid"]))
            self.logger.debug("length_hyps: {}, length_refs: {}".format(length_hyps, length_refs))
            self.logger.debug("hyps: {}".format(hyps))
            self.logger.debug("other: {}".format(other))
        return loss


    @staticmethod
    def flatten_padded_seq(text, length):
        assert isinstance(text, torch.IntTensor), "{}".format(text.type())
        assert isinstance(length, torch.IntTensor), "{}".format(length.type())
        flattened_text = torch.cat([text[i][:length[i]] for i in range(text.shape[0])])
        return flattened_text
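        # Hedged illustration (made-up values):
        #   text   = torch.IntTensor([[1, 2, 0], [3, 0, 0]])
        #   length = torch.IntTensor([2, 1])
        #   flatten_padded_seq(text, length) -> tensor([1, 2, 3], dtype=torch.int32)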

    def infer_batch_probs(self, infer_data):
        """Infer the prob matrices for a batch of speech utterances.

        :param infer_data: List of utterances to infer, with each utterance
                           consisting of a tuple of audio features and
                           transcription text (empty string).
        :type infer_data: list
        :return: List of 2-D probability matrices, each consisting of the prob
                 vectors for one speech utterance.
        :rtype: List of matrix
        """
        if self.model.training:
            self.model.eval()
        # define inferer
        adapted_infer_data = self._adapt_data(infer_data)

        # run inference
        with torch.no_grad():
            infer_results = self.model(adapted_infer_data)
            results, lengths, _ = infer_results

            results = results[0].data.cpu().numpy()
            probs_split = []
            for i in range(results.shape[0]):
                probs_split.append(results[i][:lengths[i]])

        return probs_split

    def decode_batch_greedy(self, probs_split):
        """Decode by best path for a batch of probs matrix input.

        :param probs_split: List of 2-D probability matrix, and each consists
                            of prob vectors for one speech utterancce.
        :param probs_split: List of matrix
        :param vocab_list: List of tokens in the vocabulary, for decoding.
        :type vocab_list: list
        :return: List of transcription texts.
        :rtype: List of basestring
        """
        results = []
        for i, probs in enumerate(probs_split):
            output_transcription = ctc_greedy_decoder(
                probs_seq=probs, vocabulary=self.vocab_list)
            results.append(output_transcription)
        return results

    def init_ext_scorer(self, beam_alpha, beam_beta, language_model_path):
        """Initialize the external scorer. This is where we use the language model

        :param beam_alpha: Parameter associated with language model.
        :type beam_alpha: float
        :param beam_beta: Parameter associated with word count.
        :type beam_beta: float
        :param language_model_path: Filepath for language model. If it is
                                    empty, the external scorer will be set to
                                    None, and the decoding method will be pure
                                    beam search without scorer.
        :type language_model_path: basestring|None
        """
        if language_model_path != '':
            self.logger.info("begin to initialize the external scorer "
                             "for decoding")
            self._ext_scorer = Scorer(beam_alpha, beam_beta,
                                      language_model_path, self.vocab_list)
            lm_char_based = self._ext_scorer.is_character_based()
            lm_max_order = self._ext_scorer.get_max_order()
            lm_dict_size = self._ext_scorer.get_dict_size()
            self.logger.info("language model: "
                             "is_character_based = %d," % lm_char_based +
                             " max_order = %d," % lm_max_order +
                             " dict_size = %d" % lm_dict_size)
            self.logger.info("end initializing scorer")
        else:
            self._ext_scorer = None
            self.logger.info("no language model provided, "
                             "decoding by pure beam search without scorer.")

    def decode_batch_beam_search(self, probs_split, beam_alpha, beam_beta,
                                 beam_size, cutoff_prob, cutoff_top_n,
                                 num_processes):
        """Decode by beam search for a batch of probs matrix input.
           Beam Search already take language model into account

        :param probs_split: List of 2-D probability matrix, and each consists
                            of prob vectors for one speech utterancce.
        :param probs_split: List of matrix
        :param beam_alpha: Parameter associated with language model.
        :type beam_alpha: float
        :param beam_beta: Parameter associated with word count.
        :type beam_beta: float
        :param beam_size: Width for Beam search.
        :type beam_size: int
        :param cutoff_prob: Cutoff probability in pruning,
                            default 1.0, no pruning.
        :type cutoff_prob: float
        :param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n
                        characters with highest probs in vocabulary will be
                        used in beam search, default 40.
        :type cutoff_top_n: int
        :param num_processes: Number of processes (CPU) for decoder.
        :type num_processes: int
        :return: List of transcription texts.
        :rtype: List of basestring
        """
        if self._ext_scorer is not None:
            self._ext_scorer.reset_params(beam_alpha, beam_beta)
        # beam search decode
        num_processes = min(num_processes, len(probs_split))
        beam_search_results = ctc_beam_search_decoder_batch(
            probs_split=probs_split,
            vocabulary=self.vocab_list,
            beam_size=beam_size,
            num_processes=num_processes,
            ext_scoring_func=self._ext_scorer,
            cutoff_prob=cutoff_prob,
            cutoff_top_n=cutoff_top_n)

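        # each batch entry is a best-first list of (score, transcript) pairs;
        # keep only the top transcript for every utterance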
        results = [result[0][1] for result in beam_search_results]
        return results


    def _adapt_data(self, batch):
        """Adapt data according to network struct.

        For each convolution layer in the conv_group, to remove impacts from
        padding data, we can multiply zero to the padding part of the outputs
        of each batch normalization layer. We add a scale_sub_region layer after
        each batch normalization layer to reset the padding data.
        For rnn layers, to remove impacts from padding data, we can truncate the
        padding part before output data feeded into the first rnn layer. We use
        sub_seq layer to achieve this.

        :param data: Data from data_provider.
        :type data: list|function
        :return: Adapted data.
        :rtype: list|function
        """
        assert "length_spec" in batch.keys()
        adapted_batch = batch
        # no padding part
        audio_lens = batch["length_spec"]

        # Stride size for conv0 is (3, 2)
        # Stride size for conv1 to convN is (1, 2)
        # Same as the network, hard-coded here
        valid_w = (audio_lens - 1) // 3 + 1

        # add conv-layer masking info; DeepSpeech's CNN layers do not shrink
        # the time dimension after the first layer, so every layer keeps the
        # same valid width
        mask_length = [valid_w for _ in range(self._num_conv_layers)]
        adapted_batch["cnn_masks"] = mask_length
        return adapted_batch

    def _load_paddle_pretrained(self, model_path):
        """Load pretrained DeepSpeech parameters."""
        assert self.model
        self.model.load_paddle_pretrained(model_path)


    def _create_model(self, model, vocab_size, device):
        """Create data layers and model network."""
        return model(device)

    def load_weights(self, path):
        """Load weights"""
        pretrained_dict = torch.load(path)
        model_dict = self.model.state_dict()
        # 1. filter out unnecessary keys
        pretrained_matched_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
        # 2. overwrite entries in the existing state dict
        model_dict.update(pretrained_matched_dict)
        # 3. load the new state dict
        self.model.load_state_dict(model_dict)
        self.logger.info("load weights from: {}".format(path))
        self.logger.info("excluded weights: {}".format(set(pretrained_dict.keys())- set(model_dict)))
Example #3
class DeepSpeech2Model(object):
    """DeepSpeech2Model class.

    :param vocab_size: Decoding vocabulary size.
    :type vocab_size: int
    :param num_conv_layers: Number of stacking convolution layers.
    :type num_conv_layers: int
    :param num_rnn_layers: Number of stacking RNN layers.
    :type num_rnn_layers: int
    :param rnn_layer_size: RNN layer size (number of RNN cells).
    :type rnn_layer_size: int
    :param use_gru: Use gru if set True. Use simple rnn if set False.
    :type use_gru: bool
    :param share_rnn_weights: Whether to share input-hidden weights between
                              forward and backward directional RNNs. Notice
                              that for GRU, weight sharing is not supported.
    :type share_rnn_weights: bool
    :param place: Program running place.
    :type place: CPUPlace or CUDAPlace
    :param init_from_pretrained_model: Pretrained model path. If None, will
                                       train from scratch.
    :type init_from_pretrained_model: string|None
    :param output_model_dir: Output model directory. If None, output to current directory. 
    :type output_model_dir: string|None
    """
    def __init__(self,
                 vocab_size,
                 num_conv_layers,
                 num_rnn_layers,
                 rnn_layer_size,
                 use_gru=False,
                 share_rnn_weights=True,
                 place=fluid.CPUPlace(),
                 init_from_pretrained_model=None,
                 output_model_dir=None):
        self._vocab_size = vocab_size
        self._num_conv_layers = num_conv_layers
        self._num_rnn_layers = num_rnn_layers
        self._rnn_layer_size = rnn_layer_size
        self._use_gru = use_gru
        self._share_rnn_weights = share_rnn_weights
        self._place = place
        self._init_from_pretrained_model = init_from_pretrained_model
        self._output_model_dir = output_model_dir
        self._ext_scorer = None
        self.logger = logging.getLogger("")
        self.logger.setLevel(level=logging.INFO)

    def create_network(self, is_infer=False):
        """Create data layers and model network.
        :param is_training: Whether to create a network for training.
        :type is_training: bool 
        :return reader: Reader for input.
        :rtype reader: read generater
        :return log_probs: An output unnormalized log probability layer.
        :rtype lig_probs: Varable
        :return loss: A ctc loss layer.
        :rtype loss: Variable
        """

        if not is_infer:
            input_fields = {
                'names': ['audio_data', 'text_data', 'seq_len_data', 'masks'],
                'shapes': [[None, 161, None], [None, 1], [None, 1],
                           [None, 32, 81, None]],
                'dtypes': ['float32', 'int32', 'int64', 'float32'],
                'lod_levels': [0, 1, 0, 0]
            }

            inputs = [
                fluid.data(name=input_fields['names'][i],
                           shape=input_fields['shapes'][i],
                           dtype=input_fields['dtypes'][i],
                           lod_level=input_fields['lod_levels'][i])
                for i in range(len(input_fields['names']))
            ]

            reader = fluid.io.DataLoader.from_generator(feed_list=inputs,
                                                        capacity=64,
                                                        iterable=False,
                                                        use_double_buffer=True)

            (audio_data, text_data, seq_len_data, masks) = inputs
        else:
            audio_data = fluid.data(name='audio_data',
                                    shape=[None, 161, None],
                                    dtype='float32',
                                    lod_level=0)
            seq_len_data = fluid.data(name='seq_len_data',
                                      shape=[None, 1],
                                      dtype='int64',
                                      lod_level=0)
            masks = fluid.data(name='masks',
                               shape=[None, 32, 81, None],
                               dtype='float32',
                               lod_level=0)
            text_data = None
            reader = fluid.DataFeeder([audio_data, seq_len_data, masks],
                                      self._place)

        log_probs, loss = deep_speech_v2_network(
            audio_data=audio_data,
            text_data=text_data,
            seq_len_data=seq_len_data,
            masks=masks,
            dict_size=self._vocab_size,
            num_conv_layers=self._num_conv_layers,
            num_rnn_layers=self._num_rnn_layers,
            rnn_size=self._rnn_layer_size,
            use_gru=self._use_gru,
            share_rnn_weights=self._share_rnn_weights)
        return reader, log_probs, loss

    def init_from_pretrained_model(self, exe, program):
        '''Init params from the pretrained model.'''

        assert isinstance(self._init_from_pretrained_model, str)

        if not os.path.exists(self._init_from_pretrained_model):
            raise IOError("The pretrained params do not exist at %s." %
                          self._init_from_pretrained_model)
        fluid.io.load_params(exe,
                             self._init_from_pretrained_model,
                             main_program=program,
                             filename="params.pdparams")

        print("finish initing model from pretrained params from %s" %
              (self._init_from_pretrained_model))

        pre_epoch = 0
        dir_name = self._init_from_pretrained_model.split('_')
        if len(dir_name) >= 2 and dir_name[-2].endswith(
                'epoch') and dir_name[-1].isdigit():
            pre_epoch = int(dir_name[-1])

        return pre_epoch + 1

    def save_param(self, exe, program, dirname):
        '''Save model params to dirname'''

        assert isinstance(self._output_model_dir, str)

        param_dir = self._output_model_dir

        if not os.path.exists(param_dir):
            os.mkdir(param_dir)

        fluid.io.save_params(exe,
                             os.path.join(param_dir, dirname),
                             main_program=program,
                             filename="params.pdparams")
        print("save parameters at %s" % (os.path.join(param_dir, dirname)))

        return True

    def test(self, exe, dev_batch_reader, test_program, test_reader,
             fetch_list):
        '''Test the model.

        :param exe:The executor of program.
        :type exe: Executor
        :param dev_batch_reader: The reader of test data.
        :type dev_batch_reader: reader generator
        :param test_program: The program of test.
        :type test_program: Program
        :param test_reader: Reader of test.
        :type test_reader: Reader
        :param fetch_list: Fetch list.
        :type fetch_list: list
        :return: An output unnormalized log probability. 
        :rtype: array
        '''
        test_reader.start()
        epoch_loss = []
        while True:
            try:
                each_loss = exe.run(program=test_program,
                                    fetch_list=fetch_list,
                                    return_numpy=False)
                epoch_loss.extend(np.array(each_loss[0]))

            except fluid.core.EOFException:
                test_reader.reset()
                break
        return np.mean(np.array(epoch_loss))

    def train(self,
              train_batch_reader,
              dev_batch_reader,
              feeding_dict,
              learning_rate,
              gradient_clipping,
              num_epoch,
              batch_size,
              num_samples,
              save_epoch=100,
              num_iterations_print=100,
              test_off=False):
        """Train the model.

        :param train_batch_reader: Train data reader.
        :type train_batch_reader: callable
        :param dev_batch_reader: Validation data reader.
        :type dev_batch_reader: callable
        :param feeding_dict: Feeding is a map of field name and tuple index
                             of the data that reader returns.
        :type feeding_dict: dict|list
        :param learning_rate: Learning rate for ADAM optimizer.
        :type learning_rate: float
        :param gradient_clipping: Gradient clipping threshold.
        :type gradient_clipping: float
        :param num_epoch: Number of training epochs.
        :type num_epoch: int
        :param batch_size: Number of batch size.
        :type batch_size: int
        :param num_samples: The num of train samples.
        :type num_samples: int
        :param save_epoch: Number of epochs between saving checkpoints and params.
        :type save_epoch: int
        :param num_iterations_print: Number of training iterations for printing
                                     a training loss.
        :type num_iterations_print: int
        :param test_off: Turn off testing.
        :type test_off: bool
        """
        # prepare model output directory
        if not os.path.exists(self._output_model_dir):
            mkpath(self._output_model_dir)

        # adapt the feeding dict according to the network
        adapted_feeding_dict = self._adapt_feeding_dict(feeding_dict)

        if isinstance(self._place, fluid.CUDAPlace):
            dev_count = fluid.core.get_cuda_device_count()
        else:
            dev_count = int(os.environ.get('CPU_NUM', 1))

        # prepare the network
        train_program = fluid.Program()
        startup_prog = fluid.Program()
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_reader, log_probs, ctc_loss = self.create_network()
                # prepare optimizer
                optimizer = fluid.optimizer.AdamOptimizer(
                    learning_rate=fluid.layers.exponential_decay(
                        learning_rate=learning_rate,
                        decay_steps=num_samples / batch_size / dev_count,
                        decay_rate=0.83,
                        staircase=True))
                fluid.clip.set_gradient_clip(
                    clip=fluid.clip.GradientClipByGlobalNorm(
                        clip_norm=gradient_clipping))
                optimizer.minimize(loss=ctc_loss)

        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_reader, _, ctc_loss = self.create_network()

        test_prog = test_prog.clone(for_test=True)

        exe = fluid.Executor(self._place)
        exe.run(startup_prog)

        # init from some pretrain models, to better solve the current task
        pre_epoch = 0
        if self._init_from_pretrained_model:
            pre_epoch = self.init_from_pretrained_model(exe, train_program)

        build_strategy = compiler.BuildStrategy()
        exec_strategy = fluid.ExecutionStrategy()

        # pass the build_strategy to with_data_parallel API
        compiled_prog = compiler.CompiledProgram(
            train_program).with_data_parallel(loss_name=ctc_loss.name,
                                              build_strategy=build_strategy,
                                              exec_strategy=exec_strategy)

        train_reader.set_batch_generator(train_batch_reader)
        test_reader.set_batch_generator(dev_batch_reader)

        # run train
        for epoch_id in range(num_epoch):
            train_reader.start()
            epoch_loss = []
            time_begin = time.time()
            batch_id = 0
            step = 0
            while True:
                try:
                    fetch_list = [ctc_loss.name]

                    if batch_id % num_iterations_print == 0:
                        fetch = exe.run(program=compiled_prog,
                                        fetch_list=fetch_list,
                                        return_numpy=False)
                        each_loss = fetch[0]
                        epoch_loss.extend(np.array(each_loss[0]) / batch_size)

                        print("epoch: %d, batch: %d, train loss: %f\n" %
                              (epoch_id, batch_id,
                               np.mean(each_loss[0]) / batch_size))

                    else:
                        each_loss = exe.run(program=compiled_prog,
                                            fetch_list=[],
                                            return_numpy=False)

                    batch_id = batch_id + 1
                except fluid.core.EOFException:
                    train_reader.reset()
                    break
            time_end = time.time()
            used_time = time_end - time_begin
            if test_off:
                print("\n--------Time: %f sec, epoch: %d, train loss: %f\n" %
                      (used_time, epoch_id, np.mean(np.array(epoch_loss))))
            else:
                print('\n----------Begin test...')
                test_loss = self.test(exe,
                                      dev_batch_reader=dev_batch_reader,
                                      test_program=test_prog,
                                      test_reader=test_reader,
                                      fetch_list=[ctc_loss])
                print(
                    "--------Time: %f sec, epoch: %d, train loss: %f, test loss: %f"
                    % (used_time, epoch_id + pre_epoch,
                       np.mean(np.array(epoch_loss)), test_loss / batch_size))
            if (epoch_id + 1) % save_epoch == 0:
                self.save_param(exe, train_program,
                                "epoch_" + str(epoch_id + pre_epoch))

        self.save_param(exe, train_program, "step_final")

        print("\n------------Training finished!!!-------------")

    def infer_batch_probs(self, infer_data, feeding_dict):
        """Infer the prob matrices for a batch of speech utterances.
        :param infer_data: List of utterances to infer, with each utterance
                           consisting of a tuple of audio features and
                           transcription text (empty string).
        :type infer_data: list
        :param feeding_dict: Feeding is a map of field name and tuple index
                             of the data that reader returns.
        :type feeding_dict: dict|list
        :return: List of 2-D probability matrices, each consisting of the prob
                 vectors for one speech utterance.
        :rtype: List of matrix
        """
        # define inferer
        infer_program = fluid.Program()
        startup_prog = fluid.Program()

        # adapt the feeding dict according to the network
        adapted_feeding_dict = self._adapt_feeding_dict(feeding_dict)

        # prepare the network
        with fluid.program_guard(infer_program, startup_prog):
            with fluid.unique_name.guard():
                feeder, log_probs, _ = self.create_network(is_infer=True)

        infer_program = infer_program.clone(for_test=True)
        exe = fluid.Executor(self._place)
        exe.run(startup_prog)

        # init param from pretrained_model
        if not self._init_from_pretrained_model:
            exit("No pretrained model file path!")
        self.init_from_pretrained_model(exe, infer_program)

        infer_results = []
        time_begin = time.time()

        # run inference
        for i in range(infer_data[0].shape[0]):
            each_log_probs = exe.run(program=infer_program,
                                     feed=feeder.feed([[
                                         infer_data[0][i], infer_data[2][i],
                                         infer_data[3][i]
                                     ]]),
                                     fetch_list=[log_probs],
                                     return_numpy=False)
            infer_results.extend(np.array(each_log_probs[0]))

        # slice result
        infer_results = np.array(infer_results)
        seq_len = (infer_data[2] - 1) // 3 + 1

        start_pos = [0] * (infer_data[0].shape[0] + 1)
        for i in range(infer_data[0].shape[0]):
            start_pos[i + 1] = start_pos[i] + seq_len[i][0]
        probs_split = [
            infer_results[start_pos[i]:start_pos[i + 1]]
            for i in range(0, infer_data[0].shape[0])
        ]

        return probs_split

    def decode_batch_greedy(self, probs_split, vocab_list):
        """Decode by best path for a batch of probs matrix input.
        :param probs_split: List of 2-D probability matrices, each consisting
                            of the prob vectors for one speech utterance.
        :type probs_split: List of matrix
        :param vocab_list: List of tokens in the vocabulary, for decoding.
        :type vocab_list: list
        :return: List of transcription texts.
        :rtype: List of basestring
        """
        results = []
        for i, probs in enumerate(probs_split):
            output_transcription = ctc_greedy_decoder(probs_seq=probs,
                                                      vocabulary=vocab_list)
            results.append(output_transcription)
        return results

    def init_ext_scorer(self, beam_alpha, beam_beta, language_model_path,
                        vocab_list):
        """Initialize the external scorer.
        :param beam_alpha: Parameter associated with language model.
        :type beam_alpha: float
        :param beam_beta: Parameter associated with word count.
        :type beam_beta: float
        :param language_model_path: Filepath for language model. If it is
                                    empty, the external scorer will be set to
                                    None, and the decoding method will be pure
                                    beam search without scorer.
        :type language_model_path: basestring|None
        :param vocab_list: List of tokens in the vocabulary, for decoding.
        :type vocab_list: list
        """
        if language_model_path != '':
            self.logger.info("begin to initialize the external scorer "
                             "for decoding")
            self._ext_scorer = Scorer(beam_alpha, beam_beta,
                                      language_model_path, vocab_list)
            lm_char_based = self._ext_scorer.is_character_based()
            lm_max_order = self._ext_scorer.get_max_order()
            lm_dict_size = self._ext_scorer.get_dict_size()
            self.logger.info("language model: "
                             "is_character_based = %d," % lm_char_based +
                             " max_order = %d," % lm_max_order +
                             " dict_size = %d" % lm_dict_size)
            self.logger.info("end initializing scorer")
        else:
            self._ext_scorer = None
            self.logger.info("no language model provided, "
                             "decoding by pure beam search without scorer.")

    def decode_batch_beam_search(self, probs_split, beam_alpha, beam_beta,
                                 beam_size, cutoff_prob, cutoff_top_n,
                                 vocab_list, num_processes):
        """Decode by beam search for a batch of probs matrix input.
        :param probs_split: List of 2-D probability matrices, each consisting
                            of the prob vectors for one speech utterance.
        :type probs_split: List of matrix
        :param beam_alpha: Parameter associated with language model.
        :type beam_alpha: float
        :param beam_beta: Parameter associated with word count.
        :type beam_beta: float
        :param beam_size: Width for Beam search.
        :type beam_size: int
        :param cutoff_prob: Cutoff probability in pruning,
                            default 1.0, no pruning.
        :type cutoff_prob: float
        :param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n
                        characters with highest probs in vocabulary will be
                        used in beam search, default 40.
        :type cutoff_top_n: int
        :param vocab_list: List of tokens in the vocabulary, for decoding.
        :type vocab_list: list
        :param num_processes: Number of processes (CPU) for decoder.
        :type num_processes: int
        :return: List of transcription texts.
        :rtype: List of basestring
        """
        if self._ext_scorer is not None:
            self._ext_scorer.reset_params(beam_alpha, beam_beta)
        # beam search decode
        num_processes = min(num_processes, len(probs_split))
        beam_search_results = ctc_beam_search_decoder_batch(
            probs_split=probs_split,
            vocabulary=vocab_list,
            beam_size=beam_size,
            num_processes=num_processes,
            ext_scoring_func=self._ext_scorer,
            cutoff_prob=cutoff_prob,
            cutoff_top_n=cutoff_top_n)

        results = [result[0][1] for result in beam_search_results]
        return results

    def _adapt_feeding_dict(self, feeding_dict):
        """Adapt feeding dict according to network struct.

        To remove impacts from padding part, we add scale_sub_region layer and
        sub_seq layer. For sub_seq layer, 'sequence_offset' and
        'sequence_length' fields are appended. For each scale_sub_region layer
        'convN_index_range' field is appended.

        :param feeding_dict: Feeding is a map of field name and tuple index
                             of the data that reader returns.
        :type feeding_dict: dict|list
        :return: Adapted feeding dict.
        :rtype: dict|list
        """
        adapted_feeding_dict = copy.deepcopy(feeding_dict)
        if isinstance(feeding_dict, dict):
            adapted_feeding_dict["sequence_offset"] = len(adapted_feeding_dict)
            adapted_feeding_dict["sequence_length"] = len(adapted_feeding_dict)
            for i in range(self._num_conv_layers):
                adapted_feeding_dict["conv%d_index_range" %i] = \
                        len(adapted_feeding_dict)
        elif isinstance(feeding_dict, list):
            adapted_feeding_dict.append("sequence_offset")
            adapted_feeding_dict.append("sequence_length")
            for i in range(self._num_conv_layers):
                adapted_feeding_dict.append("conv%d_index_range" % i)
        else:
            raise ValueError("Type of feeding_dict is %s, not supported." %
                             type(feeding_dict))

        return adapted_feeding_dict
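Since _adapt_feeding_dict is pure bookkeeping over the feeding map, its effect is easy to preview in isolation. A sketch assuming two conv layers and a hypothetical two-field feeding dict:

feeding = {"audio_spectrogram": 0, "transcript_text": 1}
adapted = model._adapt_feeding_dict(feeding)
# adapted == {"audio_spectrogram": 0, "transcript_text": 1,
#             "sequence_offset": 2, "sequence_length": 3,
#             "conv0_index_range": 4, "conv1_index_range": 5}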
Example #4
class DeepSpeech2Model(object):
    """DeepSpeech2Model class.

    :param vocab_size: Decoding vocabulary size.
    :type vocab_size: int
    :param num_conv_layers: Number of stacking convolution layers.
    :type num_conv_layers: int
    :param num_rnn_layers: Number of stacking RNN layers.
    :type num_rnn_layers: int
    :param rnn_layer_size: RNN layer size (number of RNN cells).
    :type rnn_layer_size: int
    :param pretrained_model_path: Pretrained model path. If None, will train
                                  from scratch.
    :type pretrained_model_path: basestring|None
    :param share_rnn_weights: Whether to share input-hidden weights between
                              forward and backward directional RNNs. Notice
                              that for GRU, weight sharing is not supported.
    :type share_rnn_weights: bool
    """
    def __init__(self, vocab_size, num_conv_layers, num_rnn_layers,
                 rnn_layer_size, use_gru, pretrained_model_path,
                 share_rnn_weights):
        self._create_network(vocab_size, num_conv_layers, num_rnn_layers,
                             rnn_layer_size, use_gru, share_rnn_weights)
        self._create_parameters(pretrained_model_path)
        self._inferer = None
        self._loss_inferer = None
        self._ext_scorer = None
        self._num_conv_layers = num_conv_layers
        self.logger = logging.getLogger("")
        self.logger.setLevel(level=logging.INFO)

    def train(self,
              train_batch_reader,
              dev_batch_reader,
              feeding_dict,
              learning_rate,
              gradient_clipping,
              num_passes,
              output_model_dir,
              is_local=True,
              num_iterations_print=100,
              test_off=False):
        """Train the model.

        :param train_batch_reader: Train data reader.
        :type train_batch_reader: callable
        :param dev_batch_reader: Validation data reader.
        :type dev_batch_reader: callable
        :param feeding_dict: Feeding is a map of field name and tuple index
                             of the data that reader returns.
        :type feeding_dict: dict|list
        :param learning_rate: Learning rate for ADAM optimizer.
        :type learning_rate: float
        :param gradient_clipping: Gradient clipping threshold.
        :type gradient_clipping: float
        :param num_passes: Number of training epochs.
        :type num_passes: int
        :param num_iterations_print: Number of training iterations for printing
                                     a training loss.
        :type num_iterations_print: int
        :param is_local: Set to False if running with pserver with multi-nodes.
        :type is_local: bool
        :param output_model_dir: Directory for saving the model (every pass).
        :type output_model_dir: basestring
        :param test_off: Turn off testing.
        :type test_off: bool
        """
        # prepare model output directory
        if not os.path.exists(output_model_dir):
            mkpath(output_model_dir)

        # adapt the feeding dict and reader according to the network
        adapted_feeding_dict = self._adapt_feeding_dict(feeding_dict)
        adapted_train_batch_reader = self._adapt_data(train_batch_reader)
        adapted_dev_batch_reader = self._adapt_data(dev_batch_reader)

        # prepare optimizer and trainer
        optimizer = paddle.optimizer.Adam(
            learning_rate=learning_rate,
            gradient_clipping_threshold=gradient_clipping)
        trainer = paddle.trainer.SGD(cost=self._loss,
                                     parameters=self._parameters,
                                     update_equation=optimizer,
                                     is_local=is_local)

        # create event handler
        def event_handler(event):
            global start_time, cost_sum, cost_counter
            if isinstance(event, paddle.event.EndIteration):
                cost_sum += event.cost
                cost_counter += 1
                if (event.batch_id + 1) % num_iterations_print == 0:
                    output_model_path = os.path.join(output_model_dir,
                                                     "params.latest.tar.gz")
                    with gzip.open(output_model_path, 'w') as f:
                        trainer.save_parameter_to_tar(f)
                    print("\nPass: %d, Batch: %d, TrainCost: %f" %
                          (event.pass_id, event.batch_id + 1,
                           cost_sum / cost_counter))
                    cost_sum, cost_counter = 0.0, 0
                else:
                    sys.stdout.write('.')
                    sys.stdout.flush()
            if isinstance(event, paddle.event.BeginPass):
                start_time = time.time()
                cost_sum, cost_counter = 0.0, 0
            if isinstance(event, paddle.event.EndPass):
                if test_off:
                    print("\n------- Time: %d sec,  Pass: %d" %
                          (time.time() - start_time, event.pass_id))
                else:
                    result = trainer.test(reader=adapted_dev_batch_reader,
                                          feeding=adapted_feeding_dict)
                    print(
                        "\n------- Time: %d sec,  Pass: %d, "
                        "ValidationCost: %s" %
                        (time.time() - start_time, event.pass_id, result.cost))
                output_model_path = os.path.join(
                    output_model_dir, "params.pass-%d.tar.gz" % event.pass_id)
                with gzip.open(output_model_path, 'w') as f:
                    trainer.save_parameter_to_tar(f)

        # run train
        trainer.train(reader=adapted_train_batch_reader,
                      event_handler=event_handler,
                      num_passes=num_passes,
                      feeding=adapted_feeding_dict)

    # TODO(@pkuyym) merge this function into infer_batch
    def infer_loss_batch(self, infer_data):
        """Model inference. Infer the ctc loss for a batch of speech
        utterances.

        :param infer_data: List of utterances to infer, with each utterance a
                           tuple of audio features and transcription text (empty
                           string).
        :type infer_data: list
        :return: List of ctc loss.
        :rtype: List of float
        """
        # define inferer
        if self._loss_inferer is None:
            self._loss_inferer = paddle.inference.Inference(
                output_layer=self._loss, parameters=self._parameters)
        # run inference
        return self._loss_inferer.infer(input=infer_data)

    def infer_batch_probs(self, infer_data, feeding_dict):
        """Infer the prob matrices for a batch of speech utterances.

        :param infer_data: List of utterances to infer, with each utterance
                           consisting of a tuple of audio features and
                           transcription text (empty string).
        :type infer_data: list
        :param feeding_dict: Feeding is a map of field name and tuple index
                             of the data that reader returns.
        :type feeding_dict: dict|list
        :return: List of 2-D probability matrices, each consisting of the prob
                 vectors for one speech utterance.
        :rtype: List of matrix
        """
        # define inferer
        if self._inferer is None:
            self._inferer = paddle.inference.Inference(
                output_layer=self._log_probs, parameters=self._parameters)
        adapted_feeding_dict = self._adapt_feeding_dict(feeding_dict)
        adapted_infer_data = self._adapt_data(infer_data)
        # run inference
        infer_results = self._inferer.infer(input=adapted_infer_data,
                                            feeding=adapted_feeding_dict)
        start_pos = [0] * (len(adapted_infer_data) + 1)
        for i in range(len(adapted_infer_data)):
            start_pos[i + 1] = start_pos[i] + adapted_infer_data[i][3][0]
        probs_split = [
            infer_results[start_pos[i]:start_pos[i + 1]]
            for i in range(0, len(adapted_infer_data))
        ]
        return probs_split

    def decode_batch_greedy(self, probs_split, vocab_list):
        """Decode by best path for a batch of probs matrix input.

        :param probs_split: List of 2-D probability matrices, each consisting
                            of the prob vectors for one speech utterance.
        :type probs_split: List of matrix
        :param vocab_list: List of tokens in the vocabulary, for decoding.
        :type vocab_list: list
        :return: List of transcription texts.
        :rtype: List of basestring
        """
        results = []
        for i, probs in enumerate(probs_split):
            output_transcription = ctc_greedy_decoder(probs_seq=probs,
                                                      vocabulary=vocab_list)
            results.append(output_transcription)
        return results

    def init_ext_scorer(self, beam_alpha, beam_beta, language_model_path,
                        vocab_list):
        """Initialize the external scorer.

        :param beam_alpha: Parameter associated with language model.
        :type beam_alpha: float
        :param beam_beta: Parameter associated with word count.
        :type beam_beta: float
        :param language_model_path: Filepath for language model. If it is
                                    empty, the external scorer will be set to
                                    None, and the decoding method will be pure
                                    beam search without scorer.
        :type language_model_path: basestring|None
        :param vocab_list: List of tokens in the vocabulary, for decoding.
        :type vocab_list: list
        """
        if language_model_path != '':
            self.logger.info("begin to initialize the external scorer "
                             "for decoding")
            self._ext_scorer = Scorer(beam_alpha, beam_beta,
                                      language_model_path, vocab_list)
            lm_char_based = self._ext_scorer.is_character_based()
            lm_max_order = self._ext_scorer.get_max_order()
            lm_dict_size = self._ext_scorer.get_dict_size()
            self.logger.info("language model: "
                             "is_character_based = %d," % lm_char_based +
                             " max_order = %d," % lm_max_order +
                             " dict_size = %d" % lm_dict_size)
            self.logger.info("end initializing scorer")
        else:
            self._ext_scorer = None
            self.logger.info("no language model provided, "
                             "decoding by pure beam search without scorer.")

    def decode_batch_beam_search(self, probs_split, beam_alpha, beam_beta,
                                 beam_size, cutoff_prob, cutoff_top_n,
                                 vocab_list, num_processes):
        """Decode by beam search for a batch of probs matrix input.

        :param probs_split: List of 2-D probability matrices, each consisting
                            of the prob vectors for one speech utterance.
        :type probs_split: List of matrix
        :param beam_alpha: Parameter associated with language model.
        :type beam_alpha: float
        :param beam_beta: Parameter associated with word count.
        :type beam_beta: float
        :param beam_size: Width for Beam search.
        :type beam_size: int
        :param cutoff_prob: Cutoff probability in pruning,
                            default 1.0, no pruning.
        :type cutoff_prob: float
        :param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n
                        characters with highest probs in vocabulary will be
                        used in beam search, default 40.
        :type cutoff_top_n: int
        :param vocab_list: List of tokens in the vocabulary, for decoding.
        :type vocab_list: list
        :param num_processes: Number of processes (CPU) for decoder.
        :type num_processes: int
        :return: List of transcription texts.
        :rtype: List of basestring
        """
        if self._ext_scorer is not None:
            self._ext_scorer.reset_params(beam_alpha, beam_beta)
        # beam search decode
        num_processes = min(num_processes, len(probs_split))
        beam_search_results = ctc_beam_search_decoder_batch(
            probs_split=probs_split,
            vocabulary=vocab_list,
            beam_size=beam_size,
            num_processes=num_processes,
            ext_scoring_func=self._ext_scorer,
            cutoff_prob=cutoff_prob,
            cutoff_top_n=cutoff_top_n)

        results = [result[0][1] for result in beam_search_results]
        return results

    def _adapt_feeding_dict(self, feeding_dict):
        """Adapt feeding dict according to network struct.

        To remove impacts from padding part, we add scale_sub_region layer and
        sub_seq layer. For sub_seq layer, 'sequence_offset' and
        'sequence_length' fields are appended. For each scale_sub_region layer
        'convN_index_range' field is appended.

        :param feeding_dict: Feeding is a map of field name and tuple index
                             of the data that reader returns.
        :type feeding_dict: dict|list
        :return: Adapted feeding dict.
        :rtype: dict|list
        """
        adapted_feeding_dict = copy.deepcopy(feeding_dict)
        if isinstance(feeding_dict, dict):
            adapted_feeding_dict["sequence_offset"] = len(adapted_feeding_dict)
            adapted_feeding_dict["sequence_length"] = len(adapted_feeding_dict)
            for i in range(self._num_conv_layers):
                adapted_feeding_dict["conv%d_index_range" %i] = \
                        len(adapted_feeding_dict)
        elif isinstance(feeding_dict, list):
            adapted_feeding_dict.append("sequence_offset")
            adapted_feeding_dict.append("sequence_length")
            for i in range(self._num_conv_layers):
                adapted_feeding_dict.append("conv%d_index_range" % i)
        else:
            raise ValueError("Type of feeding_dict is %s, not supported." %
                             type(feeding_dict))

        return adapted_feeding_dict

    def _adapt_data(self, data):
        """Adapt data according to network struct.

        For each convolution layer in the conv_group, to remove impacts from
        padding data, we can multiply zero to the padding part of the outputs
        of each batch normalization layer. We add a scale_sub_region layer after
        each batch normalization layer to reset the padding data.
        For rnn layers, to remove impacts from padding data, we can truncate the
        padding part before output data feeded into the first rnn layer. We use
        sub_seq layer to achieve this.

        :param data: Data from data_provider.
        :type data: list|function
        :return: Adapted data.
        :rtype: list|function
        """
        def adapt_instance(instance):
            if len(instance) < 2 or len(instance) > 3:
                raise ValueError("Size of instance should be 2 or 3.")
            padded_audio = instance[0]
            text = instance[1]
            # no padding part
            if len(instance) == 2:
                audio_len = padded_audio.shape[1]
            else:
                audio_len = instance[2]
            adapted_instance = [padded_audio, text]
            # Stride size for conv0 is (3, 2)
            # Stride size for conv1 to convN is (1, 2)
            # Same as the network, hard-coded here
            padded_conv0_h = (padded_audio.shape[0] - 1) // 2 + 1
            padded_conv0_w = (padded_audio.shape[1] - 1) // 3 + 1
            valid_w = (audio_len - 1) // 3 + 1
            adapted_instance += [
                [0],  # sequence offset, always 0
                [valid_w],  # valid sequence length
                # Index ranges for channel, height and width
                # Please refer scale_sub_region layer to see details
                [1, 32, 1, padded_conv0_h, valid_w + 1, padded_conv0_w]
            ]
            pre_padded_h = padded_conv0_h
            for i in range(self._num_conv_layers - 1):
                padded_h = (pre_padded_h - 1) // 2 + 1
                pre_padded_h = padded_h
                adapted_instance += [[
                    1, 32, 1, padded_h, valid_w + 1, padded_conv0_w
                ]]
            return adapted_instance

        if isinstance(data, list):
            return [adapt_instance(instance) for instance in data]
        elif inspect.isgeneratorfunction(data):

            def adapted_reader():
                for batch in data():
                    yield [adapt_instance(instance) for instance in batch]

            return adapted_reader
        else:
            raise ValueError("Type of data is %s, not supported." % type(data))

    def _create_parameters(self, model_path=None):
        """Load or create model parameters."""
        if model_path is None:
            self._parameters = paddle.parameters.create(self._loss)
        else:
            self._parameters = paddle.parameters.Parameters.from_tar(
                gzip.open(model_path))

    def _create_network(self, vocab_size, num_conv_layers, num_rnn_layers,
                        rnn_layer_size, use_gru, share_rnn_weights):
        """Create data layers and model network."""
        # paddle.data_type.dense_array is used for variable batch input.
        # The size 161 * 161 is only a placeholder value; the real shape
        # of input batch data will be induced during training.
        audio_data = paddle.layer.data(name="audio_spectrogram",
                                       type=paddle.data_type.dense_array(161 *
                                                                         161))
        text_data = paddle.layer.data(
            name="transcript_text",
            type=paddle.data_type.integer_value_sequence(vocab_size))
        seq_offset_data = paddle.layer.data(
            name='sequence_offset',
            type=paddle.data_type.integer_value_sequence(1))
        seq_len_data = paddle.layer.data(
            name='sequence_length',
            type=paddle.data_type.integer_value_sequence(1))
        index_range_datas = []
        for i in range(num_conv_layers):  # one index range per conv layer
            index_range_datas.append(
                paddle.layer.data(name='conv%d_index_range' % i,
                                  type=paddle.data_type.dense_vector(6)))

        self._log_probs, self._loss = deep_speech_v2_network(
            audio_data=audio_data,
            text_data=text_data,
            seq_offset_data=seq_offset_data,
            seq_len_data=seq_len_data,
            index_range_datas=index_range_datas,
            dict_size=vocab_size,
            num_conv_layers=num_conv_layers,
            num_rnn_layers=num_rnn_layers,
            rnn_size=rnn_layer_size,
            use_gru=use_gru,
            share_rnn_weights=share_rnn_weights)
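The width arithmetic in _adapt_data mirrors conv0's hard-coded time stride of 3. A standalone check of the formula (plain Python, no framework required; the helper name is ours):

# number of conv0 output frames for an utterance of audio_len unpadded
# spectrogram frames, given conv0's time stride of 3
def conv0_valid_width(audio_len):
    return (audio_len - 1) // 3 + 1

assert conv0_valid_width(3) == 1
assert conv0_valid_width(4) == 2
assert conv0_valid_width(300) == 100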