Example 1
    def __init__(self, params, model, num_workers=1, worker_id=0):
        super(TransformerDataLayer, self).__init__(params, model, num_workers,
                                                   worker_id)
        self.src_vocab_file = self.params['src_vocab_file']
        # if tgt vocab isn't specified - assume common vocab file
        self.tgt_vocab_file = self.params.get('tgt_vocab_file',
                                              self.src_vocab_file)

        # load source and target vocabularies to RAM
        # pre-processed vocab starts from PAD, EOS
        self.src_seq2idx = load_pre_existing_vocabulary(self.src_vocab_file,
                                                        min_idx=PAD_ID)
        self.tgt_seq2idx = load_pre_existing_vocabulary(self.tgt_vocab_file,
                                                        min_idx=PAD_ID)

        self.src_idx2seq = {idx: w for w, idx in self.src_seq2idx.items()}
        self.tgt_idx2seq = {idx: w for w, idx in self.tgt_seq2idx.items()}

        self.params['src_vocab_size'] = len(self.src_seq2idx)
        self.params['tgt_vocab_size'] = len(self.tgt_seq2idx)
        self.params['target_seq2idx'] = self.tgt_seq2idx
        self.params['source_seq2idx'] = self.src_seq2idx
        self.params['target_idx2seq'] = self.tgt_idx2seq
        self.params['source_idx2seq'] = self.src_idx2seq

        self._num_workers = num_workers
        self._worker_id = worker_id

        self._input_tensors = {}
        self._iterator = None
        self.batched_dataset = None
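
The pattern in this constructor — load a token-to-index vocabulary, invert it for decoding, and publish both maps plus their sizes through self.params — is easy to verify in isolation. Below is a minimal sketch with a made-up four-token vocabulary (the real maps come from load_pre_existing_vocabulary):

# Toy stand-in for the seq2idx/idx2seq bookkeeping above; the vocabulary
# contents are invented for illustration.
src_seq2idx = {'<PAD>': 0, '<EOS>': 1, 'hello': 2, 'world': 3}
src_idx2seq = {idx: w for w, idx in src_seq2idx.items()}  # inverse mapping

assert src_idx2seq[2] == 'hello'
assert len(src_idx2seq) == len(src_seq2idx)  # holds because ids are unique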
Example 2
    def __init__(self, params, model, num_workers, worker_id):
        """Speech-to-text data layer constructor.
        See parent class for arguments description.
        Config parameters:
        * **num_audio_features** (int) --- number of audio features to extract.
        * **input_type** (str) --- could be either "spectrogram" or "mfcc".
        * **vocab_file** (str) --- path to vocabulary file or sentencepiece model.
        * **dataset_files** (list) --- list with paths to all dataset .csv files.
        * **augmentation** (dict) --- optional dictionary with data augmentation
          parameters. Can contain "time_stretch_ratio", "noise_level_min" and
          "noise_level_max" parameters, e.g.::
            {
              'time_stretch_ratio': 0.05,
              'noise_level_min': -90,
              'noise_level_max': -60,
            }
          For additional details on these parameters see
          :func:`data.speech2text.speech_utils.augment_audio_signal` function.
        * **autoregressive** (bool) --- whether the model is autoregressive.
          Defaults to False.
        """
        super(Speech2TextDataLayer, self).__init__(params, model, num_workers,
                                                   worker_id)

        self.params['autoregressive'] = self.params.get(
            'autoregressive', False)
        self.autoregressive = self.params['autoregressive']
        self.params['bpe'] = self.params.get('bpe', False)
        # default target padding; the autoregressive branch overrides it below
        self.target_pad_value = 0
        if self.params['bpe']:
            self.sp = spm.SentencePieceProcessor()
            self.sp.Load(self.params['vocab_file'])
            self.params['tgt_vocab_size'] = len(self.sp) + 1
        else:
            self.params['char2idx'] = load_pre_existing_vocabulary(
                self.params['vocab_file'],
                read_chars=True,
            )
            if not self.autoregressive:
                # add one for implied blank token
                self.params['tgt_vocab_size'] = len(
                    self.params['char2idx']) + 1
            else:
                num_chars_orig = len(self.params['char2idx'])
                self.params['tgt_vocab_size'] = num_chars_orig + 2
                self.start_index = num_chars_orig
                self.end_index = num_chars_orig + 1
                self.params['char2idx']['<S>'] = self.start_index
                self.params['char2idx']['</S>'] = self.end_index
                self.target_pad_value = self.end_index
            self.params['idx2char'] = {
                i: w
                for w, i in self.params['char2idx'].items()
            }

        self._files = None
        if self.params["interactive"]:
            return
        for csv in params['dataset_files']:
            files = pd.read_csv(csv, encoding='utf-8')
            if self._files is None:
                self._files = files
            else:
                # DataFrame.append was removed in pandas 2.0; use concat
                self._files = pd.concat([self._files, files],
                                        ignore_index=True)

        if self.params['mode'] != 'infer':
            cols = ['wav_filename', 'transcript']
        else:
            cols = 'wav_filename'

        self.all_files = self._files.loc[:, cols].values
        self._files = self.split_data(self.all_files)

        self._size = self.get_size_in_samples()
        self._dataset = None
        self._iterator = None
        self._input_tensors = None

        self.params['max_duration'] = params.get('max_duration', None)
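
The vocabulary bookkeeping above can be checked on its own: non-autoregressive (CTC-style) models reserve one extra index for the implied blank token, while autoregressive models instead append explicit <S>/</S> ids and pad targets with the end id. A toy sketch with an invented four-character vocabulary:

char2idx = {'a': 0, 'b': 1, 'c': 2, ' ': 3}  # invented vocabulary

# CTC-style: one extra index reserved for the implied blank token
ctc_vocab_size = len(char2idx) + 1           # -> 5

# autoregressive: append explicit start/end tokens instead
num_chars_orig = len(char2idx)
char2idx['<S>'] = num_chars_orig             # start id = 4
char2idx['</S>'] = num_chars_orig + 1        # end id = 5
ar_vocab_size = num_chars_orig + 2           # -> 6
target_pad_value = char2idx['</S>']          # targets padded with the end id

idx2char = {i: w for w, i in char2idx.items()}
assert ctc_vocab_size == 5 and ar_vocab_size == 6 and idx2char[5] == '</S>'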
Example 3
  def __init__(self, params, model, num_workers=None, worker_id=None):
    """Text-to-speech data layer constructor.

    See parent class for arguments description.

    Config parameters:

    * **dataset** (str) --- The dataset to use. Currently 'LJ' (the LJSpeech
      1.1 dataset) and 'MAILABS' are supported.
    * **num_audio_features** (int) --- number of audio features to extract.
    * **output_type** (str) --- could be "magnitude", "mel", or "both".
    * **vocab_file** (str) --- path to vocabulary file.
    * **dataset_files** (list) --- list with paths to all dataset .csv files.
      Files are assumed to use "|" as the field separator.
    * **dataset_location** (string) --- string with path to directory where wavs
      are stored.
    * **feature_normalize** (bool) --- whether to normalize the data with a
      preset mean and std.
    * **feature_normalize_mean** (float) --- mean used for feature
      normalization. Defaults to 0.
    * **feature_normalize_std** (float) --- std used for feature
      normalization. Defaults to 1.
    * **mag_power** (int) --- the power to which the magnitude spectrogram is
      scaled:
      1 for energy spectrogram,
      2 for power spectrogram.
      Defaults to 2.
    * **pad_EOS** (bool) --- whether to apply EOS tokens to both the text and
      the speech signal. Will pad at least 1 token regardless of pad_to value.
      Defaults to True.
    * **pad_value** (float) --- The value we pad the spectrogram with. Defaults
      to np.log(data_min).
    * **pad_to** (int) --- data is padded so that the resulting length is a
      multiple of pad_to. Defaults to 8.
    * **trim** (bool) --- Whether to trim silence via librosa or not. Defaults
      to False.
    * **data_min** (float) --- min clip value prior to taking the log. Defaults
      to 1e-5. Please change to 1e-2 if using htk mels.
    * **duration_min** (int) --- Minimum duration in steps for speech signal.
      All signals less than this will be cut from the training set. Defaults to
      0.
    * **duration_max** (int) --- Maximum duration in steps for speech signal.
      All signals greater than this will be cut from the training set. Defaults 
      to 4000.
    * **mel_type** (str) --- one of ['slaney', 'htk']. Decides which algorithm
      to use to compute mel specs. Defaults to 'htk'.

    """
    super(Text2SpeechDataLayer, self).__init__(
        params,
        model,
        num_workers,
        worker_id
    )

    names = ['wav_filename', 'raw_transcript', 'transcript']
    sep = '\x7c'  # '|' (pipe) field separator
    header = None

    if self.params["dataset"] == "LJ":
      self._sampling_rate = 22050
      self._n_fft = 1024
    elif self.params["dataset"] == "MAILABS":
      self._sampling_rate = 16000
      self._n_fft = 800

    # Character level vocab
    self.params['char2idx'] = load_pre_existing_vocabulary(
        self.params['vocab_file'],
        min_idx=3,
        read_chars=True,
    )
    # Add the pad, start, and end chars
    self.params['char2idx']['<p>'] = 0
    self.params['char2idx']['<s>'] = 1
    self.params['char2idx']['</s>'] = 2
    self.params['idx2char'] = {i: w for w, i in self.params['char2idx'].items()}
    self.params['src_vocab_size'] = len(self.params['char2idx'])

    n_feats = self.params['num_audio_features']
    if "both" in self.params["output_type"]:
      self._both = True
      if self.params["feature_normalize"]:
        raise ValueError(
            "feature normalize is not currently enabled for both mode"
        )
      if not isinstance(n_feats, dict):
        raise ValueError(
            "num_audio_features must be a dictionary for both mode"
        )
      else:
        if ("mel" not in n_feats or
            "magnitude" not in n_feats):
          raise ValueError(
              "num_audio_features must contain mel and magnitude keys"
          )
        elif (not isinstance(n_feats["mel"], int) or
              not isinstance(n_feats["magnitude"], int)):
          raise ValueError(
              "num_audio_features entries must be ints"
          )
      n_mels = n_feats['mel']
      data_min = self.params.get("data_min", None)
      if data_min is not None:
        if not isinstance(data_min, dict):
          raise ValueError(
              "data_min must be a dictionary for both mode"
          )
        else:
          if "mel" not in data_min and "magnitude" not in data_min:
            raise ValueError(
              "data_min must contain mel and magnitude keys"
            )
          elif (not isinstance(data_min["mel"], float) or 
                not isinstance(data_min["magnitude"], float)):
            raise ValueError(
                "data_min must be a float"
            )
      self._exp_mag = self.params.get("exp_mag", True)
    else:
      if not isinstance(n_feats, int):
        raise ValueError(
            "num_audio_features must be an int for mel or magnitude mode"
        )
      if not isinstance(self.params.get("data_min", 1.0), float):
        raise ValueError(
            "data_min must be a float for mel or magnitude mode"
        )
      self._both = False
      self._exp_mag = False
      n_mels = n_feats

    self._mel = "mel" in self.params["output_type"]

    if self._mel or self._both:
      htk = True
      norm = None
      if self.params.get('mel_type', 'htk') == 'slaney':
        htk = False
        norm = 1
      self._mel_basis = librosa.filters.mel(
          sr=self._sampling_rate,
          n_fft=self._n_fft,
          n_mels=n_mels,
          htk=htk,
          norm=norm
      )
    else:
      self._mel_basis = None

    if self.params["interactive"]:
      return

    # Load csv files
    self._files = None
    for csvs in params['dataset_files']:
      files = pd.read_csv(
          csvs,
          encoding='utf-8',
          sep=sep,
          header=header,
          names=names,
          quoting=3
      )
      if self._files is None:
        self._files = files
      else:
        # DataFrame.append was removed in pandas 2.0; use concat
        self._files = pd.concat([self._files, files], ignore_index=True)

    if self.params['mode'] != 'infer':
      cols = ['wav_filename', 'transcript']
    else:
      cols = 'transcript'

    all_files = self._files.loc[:, cols].values
    self._files = self.split_data(all_files)

    self._size = self.get_size_in_samples()
    self._dataset = None
    self._iterator = None
    self._input_tensors = None
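
In "both" mode the constructor expects num_audio_features (and data_min, if given) to be dictionaries with mel and magnitude keys. Below is a hedged sketch of a params fragment that would pass those checks; all values are illustrative, not recommended settings:

# Illustrative "both"-mode params fragment; values are invented.
params_both = {
    'dataset': 'LJ',                                   # 22050 Hz, n_fft=1024
    'output_type': 'both',
    'num_audio_features': {'mel': 80, 'magnitude': 513},
    'data_min': {'mel': 1e-2, 'magnitude': 1e-5},
    'feature_normalize': False,                        # required in both mode
}

# The same shape checks the constructor performs, in miniature:
n_feats = params_both['num_audio_features']
assert isinstance(n_feats, dict)
assert 'mel' in n_feats and 'magnitude' in n_feats
assert all(isinstance(v, int) for v in n_feats.values())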
Example 4
    def __init__(self, params, model, num_workers=1, worker_id=0):
        super(ParallelTextDataLayer, self).__init__(params, model, num_workers,
                                                    worker_id)
        self._batch_size = self.params['batch_size']
        self.source_file = self.params['source_file']
        self._use_targets = self.params.get('use_targets', True)
        if not self._use_targets:
            self.target_file = self.source_file
            if 'target_file' in self.params:
                print("WARNING: target file was specified but was "
                      "ignored by data layer because 'use_targets'=False")
        else:
            self.target_file = self.params['target_file']
        self.src_vocab_file = self.params['src_vocab_file']
        self.tgt_vocab_file = self.params['tgt_vocab_file']
        self.max_len = self.params['max_length']
        self._delimiter = self.params.get('delimiter', ' ')
        self._map_parallel_calls = self.params.get('map_parallel_calls', 8)
        self._pad_lengths_to_eight = self.params.get('pad_lengths_to_eight',
                                                     False)
        self._prefetch_buffer_size = self.params.get('prefetch_buffer_size',
                                                     tf.contrib.data.AUTOTUNE)
        self._shuffle_buffer_size = self.params.get('shuffle_buffer_size', -1)
        self._num_workers = num_workers
        self._worker_id = worker_id
        self._use_start_token = self.params.get('use_start_token', True)
        if self._pad_lengths_to_eight and self.max_len % 8 != 0:
            raise ValueError("If padding to 8 in data layer, then "
                             "max_length should be multiple of 8")

        def file_len(fname):
            # count lines; i = -1 makes an empty file return 0
            i = -1
            with open(fname) as f:
                for i, _ in enumerate(f):
                    pass
            return i + 1

        self.dataset_size = file_len(self.source_file)
        special_tokens_already_in_vocab = self.params.get(
            'special_tokens_already_in_vocab', True)

        # load source and target vocabularies to RAM
        self.src_seq2idx = load_pre_existing_vocabulary(
            self.src_vocab_file,
            min_idx=0 if special_tokens_already_in_vocab else
            SpecialTextTokens.UNK_ID.value + 1)
        self.tgt_seq2idx = load_pre_existing_vocabulary(
            self.tgt_vocab_file,
            min_idx=0 if special_tokens_already_in_vocab else
            SpecialTextTokens.UNK_ID.value + 1)

        if not special_tokens_already_in_vocab:
            # manually add special tokens: unknown, start, end, padding
            for special in (SpecialTextTokens.UNK_ID, SpecialTextTokens.S_ID,
                            SpecialTextTokens.EOS_ID,
                            SpecialTextTokens.PAD_ID):
                token = SpecialTextTokens.to_string(special.value)
                self.src_seq2idx[token] = special.value
                self.tgt_seq2idx[token] = special.value

        if self.params.get('pad_vocab_to_eight', False):
            self.src_seq2idx = pad_vocab_to_eight(self.src_seq2idx)
            self.tgt_seq2idx = pad_vocab_to_eight(self.tgt_seq2idx)

        self.src_idx2seq = {idx: w for w, idx in self.src_seq2idx.items()}
        self.tgt_idx2seq = {idx: w for w, idx in self.tgt_seq2idx.items()}

        self.params['src_vocab_size'] = len(self.src_seq2idx)
        self.params['tgt_vocab_size'] = len(self.tgt_seq2idx)
        self.params['target_seq2idx'] = self.tgt_seq2idx
        self.params['source_seq2idx'] = self.src_seq2idx
        self.params['target_idx2seq'] = self.tgt_idx2seq
        self.params['source_idx2seq'] = self.src_idx2seq

        self._input_tensors = {}
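
The inline file_len helper is the easiest piece here to get wrong: without a sentinel initial value the loop variable is undefined for an empty file, which is why the version above initializes i = -1 so that an empty file counts as 0 lines. A standalone check using temporary files:

import os
import tempfile

def file_len(fname):
    # count lines; i = -1 makes an empty file return 0
    i = -1
    with open(fname) as f:
        for i, _ in enumerate(f):
            pass
    return i + 1

with tempfile.NamedTemporaryFile('w', delete=False) as tmp:
    tmp.write('one\ntwo\nthree\n')
assert file_len(tmp.name) == 3
os.remove(tmp.name)

with tempfile.NamedTemporaryFile('w', delete=False) as tmp:
    pass  # empty file
assert file_len(tmp.name) == 0
os.remove(tmp.name)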
Example 5
  def __init__(self, params, model, num_workers, worker_id):
    """Speech-to-text data layer constructor.
    See parent class for arguments description.
    Config parameters:
    * **num_audio_features** (int) --- number of audio features to extract.
    * **input_type** (str) --- could be "spectrogram", "mfcc", or "logfbank".
    * **vocab_file** (str) --- path to vocabulary file or sentencepiece model.
    * **dataset_files** (list) --- list with paths to all dataset .csv files.
    * **augmentation** (dict) --- optional dictionary with data augmentation
      parameters. Can contain "time_stretch_ratio", "noise_level_min" and
      "noise_level_max" parameters, e.g.::
        {
          'time_stretch_ratio': 0.05,
          'noise_level_min': -90,
          'noise_level_max': -60,
        }
      For additional details on these parameters see
      :func:`data.speech2text.speech_utils.augment_audio_signal` function.
    * **autoregressive** (bool) --- whether the model is autoregressive.
      Defaults to False.
    * **syn_enable** (bool) --- whether the model is using synthetic data.
    * **syn_subdirs** (list) --- must be defined if using synthetic mode.
      Contains a list of subdirectories that hold the synthetic wav files.
    """
    super(Speech2TextDataLayer, self).__init__(params, model,
                                               num_workers, worker_id)
    # we need this until python_speech_features gets updated on pypi.org
    self.apply_window = 'winfunc' in inspect.getfullargspec(psf.logfbank).args
    if not self.apply_window and \
        self.params['input_type'] in ('mfcc', 'logfbank'):
      print('WARNING: using python_speech_features WITHOUT windowing function')
      print('Please install the latest python_speech_features (from GitHub)')
    self.params['autoregressive'] = self.params.get('autoregressive', False)
    self.autoregressive = self.params['autoregressive']
    self.params['bpe'] = self.params.get('bpe', False)
    # default target padding; the autoregressive branch overrides it below
    self.target_pad_value = 0
    if self.params['bpe']:
      self.sp = spm.SentencePieceProcessor()
      self.sp.Load(self.params['vocab_file'])
      self.params['tgt_vocab_size'] = len(self.sp) + 1
    else:
      self.params['char2idx'] = load_pre_existing_vocabulary(
          self.params['vocab_file'], read_chars=True,
      )
      if not self.autoregressive:
        # add one for implied blank token
        self.params['tgt_vocab_size'] = len(self.params['char2idx']) + 1
      else:
        num_chars_orig = len(self.params['char2idx'])
        self.params['tgt_vocab_size'] = num_chars_orig + 2
        self.start_index = num_chars_orig
        self.end_index = num_chars_orig + 1
        self.params['char2idx']['<S>'] = self.start_index
        self.params['char2idx']['</S>'] = self.end_index
        self.target_pad_value = self.end_index
      self.params['idx2char'] = {
          i: w for w, i in self.params['char2idx'].items()
      }

    self._files = None
    if self.params["interactive"]:
      self.params['max_duration'] = params.get('max_duration', -1.0)
      self.params['window_size'] = params.get('window_size', 20e-3)
      self.params['window_stride'] = params.get('window_stride', 10e-3)
      return
    for csv in params['dataset_files']:
      files = pd.read_csv(csv, encoding='utf-8')
      if self._files is None:
        self._files = files
      else:
        # DataFrame.append was removed in pandas 2.0; use concat
        self._files = pd.concat([self._files, files], ignore_index=True)

    if self.params['mode'] != 'infer':
      cols = ['wav_filename', 'transcript']
    else:
      cols = 'wav_filename'

    self.all_files = self._files.loc[:, cols].values
    self._files = self.split_data(self.all_files)

    self._size = self.get_size_in_samples()
    self._dataset = None
    self._iterator = None
    self._input_tensors = None

    self.params['max_duration'] = params.get('max_duration', -1.0)
    self.params['window_size'] = params.get('window_size', 20e-3)
    self.params['window_stride'] = params.get('window_stride', 10e-3)
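
The winfunc check at the top of this constructor is a general feature-detection idiom: inspect the callee's signature rather than pin a library version. A self-contained sketch of the same idiom, using local stand-ins for the old and new python_speech_features APIs:

import inspect

def logfbank_old(signal, samplerate=16000):                # stand-in: old API
    return signal

def logfbank_new(signal, samplerate=16000, winfunc=None):  # stand-in: new API
    return signal

def supports_winfunc(fn):
    # the same membership test the constructor applies to psf.logfbank
    return 'winfunc' in inspect.getfullargspec(fn).args

assert not supports_winfunc(logfbank_old)
assert supports_winfunc(logfbank_new)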