Example #1
  def __init__(self, config, mode):
    super().__init__(config, mode)

    self.mode = mode
    self.dataconf = self.config['data']
    self.taskconf = self.dataconf['task']
    self.solverconf = self.config['solver']

    self.data_type = self.taskconf['data_type']

    self.chunk_size_seconds = self.taskconf['audio']['clip_size']
    # TODO: configurable frame rate
    self.chunk_size_frames = self.chunk_size_seconds * 100
    self.feature_dims = self.taskconf['audio']['feature_size']
    # TODO: delta features
    self.feature_shape = (self.chunk_size_frames, self.feature_dims, 1)
    # TODO: text input is useless
    self.max_text_len = 10

    self._cmvn_path = self.taskconf['audio']['cmvn_path']

    # meta data
    self.meta = KaldiMetaData()
    self._classes = {}

    logging.info('Loading meta data ...')
    self.load_meta_data()

    self.sampler = ChunkSampler(self.chunk_size_frames)
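
For reference, a minimal sketch of the nested config this constructor reads. The key names mirror the lookups above; the concrete values and the plain-dict layout are assumptions for illustration only.

# Minimal config sketch (illustrative values; a plain nested dict is assumed).
config = {
    'data': {
        'task': {
            'data_type': 'KaldiDataDirectory',
            'audio': {
                'clip_size': 30,         # seconds per chunk -> 3000 frames at 100 frames/sec
                'feature_size': 40,      # feature dimension
                'cmvn_path': 'exp/cmvn.npy',
            },
        },
    },
    'solver': {},
}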
Example #2
  def __init__(self, config, mode):
    super().__init__(config, mode)
    self.dataconf = self.config['data']
    self.taskconf = self.dataconf['task']
    self.solverconf = self.config['solver']

    self.data_type = self.taskconf['data_type']

    if 'whole_utt_inference' in self.taskconf['audio']:
      self.whole_utt_inference = self.taskconf['audio']['whole_utt_inference']
    else:
      self.whole_utt_inference = False

    if 'add_random_offset' in self.taskconf['audio']:
      self.add_random_offset = self.taskconf['audio']['add_random_offset']
    else:
      self.add_random_offset = False

    if 'drop_short_chunks' in self.taskconf['audio']:
      self.drop_short_chunks = self.taskconf['audio']['drop_short_chunks']
    else:
      self.drop_short_chunks = 0.0

    if 'single_chunk' in self.taskconf['audio']:
      self.single_chunk = self.taskconf['audio']['single_chunk']
    else:
      self.single_chunk = 0.0

    if 'select_by_spk_train' in self.taskconf['audio']:
      self.select_by_spk_train = self.taskconf['audio']['select_by_spk_train']
    else:
      self.select_by_spk_train = False

    if 'select_by_spk_eval' in self.taskconf['audio']:
      self.select_by_spk_eval = self.taskconf['audio']['select_by_spk_eval']
    else:
      self.select_by_spk_eval = False

    if 'num_repeats' in self.taskconf['audio']:
      self.num_repeats = self.taskconf['audio']['num_repeats']
    else:
      self.num_repeats = 1

    self.chunk_size_seconds = self.taskconf['audio']['clip_size']
    # TODO: configurable frame rate
    self.chunk_size_frames = self.chunk_size_seconds * 100
    self.feature_dims = self.taskconf['audio']['feature_size']
    # TODO: delta features
    # The time dimension is left unspecified (None) so that both fixed-size
    # chunks and whole-utterance inference inputs fit the same feature spec;
    # the actual chunk length is still controlled by the sampler below.
    self.feature_shape = (None, self.feature_dims, 1)

    # TODO: not implemented
    self.uniform_resample = False

    # 10k samples/utterances are usually enough for CMVN estimation.
    self.cmvn_max_samples = 10000

    self._cmvn_path = self.taskconf['audio']['cmvn_path']

    # meta data
    self.meta = KaldiMetaData()
    self._classes = {}

    logging.info('Loading meta data ...')
    self.load_meta_data()

    if self.mode == utils.INFER and self.whole_utt_inference:
      # Do whole utterance inference.
      logging.info('Set chunk_size = 10M and padding = False.')
      self.sampler = ChunkSampler(self.meta, 10000000)
      self.sampler.pad_chunks = False
    else:
      self.sampler = ChunkSampler(self.meta, self.chunk_size_frames)

    if self.mode != utils.INFER and self.add_random_offset:
      self.sampler.add_random_offset = True

    if self.mode != utils.INFER and self.drop_short_chunks > 0.0:
      logging.info('Dropping chunks < %f.' % self.drop_short_chunks)
      self.sampler.drop_short_chunks = self.drop_short_chunks

    if self.mode != utils.INFER:
      if self.single_chunk > 0.0:
        logging.info('Single chunk sampling enabled.')
      self.sampler.single_chunk = self.single_chunk

    if self.mode == utils.TRAIN:
      if self.select_by_spk_train:
        logging.info('Utt selection by spk enabled for training.')
      self.sampler.select_by_spk = self.select_by_spk_train
    elif self.mode == utils.EVAL:
      if self.select_by_spk_eval:
        logging.info('Utt selection by spk enabled for evaluation.')
      self.sampler.select_by_spk = self.select_by_spk_eval

    if self.mode == utils.TRAIN:
      logging.info('Num repeats = %d.' % (self.num_repeats))
      self.sampler.num_repeats = self.num_repeats
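
The chain of `'key' in self.taskconf['audio']` checks above could be collapsed with `dict.get()`; a sketch assuming `taskconf['audio']` is a plain dict (a custom config object might not expose `.get()`).

# Equivalent defaulting with dict.get(), assuming a plain dict config.
audioconf = self.taskconf['audio']
self.whole_utt_inference = audioconf.get('whole_utt_inference', False)
self.add_random_offset = audioconf.get('add_random_offset', False)
self.drop_short_chunks = audioconf.get('drop_short_chunks', 0.0)
self.single_chunk = audioconf.get('single_chunk', 0.0)
self.select_by_spk_train = audioconf.get('select_by_spk_train', False)
self.select_by_spk_eval = audioconf.get('select_by_spk_eval', False)
self.num_repeats = audioconf.get('num_repeats', 1)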
Example #3
class SpeakerClsTask(SpeechTask):
  ''' Speaker Classification Task '''

  def __init__(self, config, mode):
    super().__init__(config, mode)
    self.dataconf = self.config['data']
    self.taskconf = self.dataconf['task']
    self.solverconf = self.config['solver']

    self.data_type = self.taskconf['data_type']

    if 'whole_utt_inference' in self.taskconf['audio']:
      self.whole_utt_inference = self.taskconf['audio']['whole_utt_inference']
    else:
      self.whole_utt_inference = False

    if 'add_random_offset' in self.taskconf['audio']:
      self.add_random_offset = self.taskconf['audio']['add_random_offset']
    else:
      self.add_random_offset = False

    if 'drop_short_chunks' in self.taskconf['audio']:
      self.drop_short_chunks = self.taskconf['audio']['drop_short_chunks']
    else:
      self.drop_short_chunks = 0.0

    if 'single_chunk' in self.taskconf['audio']:
      self.single_chunk = self.taskconf['audio']['single_chunk']
    else:
      self.single_chunk = 0.0

    if 'select_by_spk_train' in self.taskconf['audio']:
      self.select_by_spk_train = self.taskconf['audio']['select_by_spk_train']
    else:
      self.select_by_spk_train = False

    if 'select_by_spk_eval' in self.taskconf['audio']:
      self.select_by_spk_eval = self.taskconf['audio']['select_by_spk_eval']
    else:
      self.select_by_spk_eval = False

    if 'num_repeats' in self.taskconf['audio']:
      self.num_repeats = self.taskconf['audio']['num_repeats']
    else:
      self.num_repeats = 1

    self.chunk_size_seconds = self.taskconf['audio']['clip_size']
    # TODO: configurable frame rate
    self.chunk_size_frames = self.chunk_size_seconds * 100
    self.feature_dims = self.taskconf['audio']['feature_size']
    # TODO: delta features
    # The time dimension is left unspecified (None) so that both fixed-size
    # chunks and whole-utterance inference inputs fit the same feature spec;
    # the actual chunk length is still controlled by the sampler below.
    self.feature_shape = (None, self.feature_dims, 1)

    # TODO: not implemented
    self.uniform_resample = False

    # 10k samples/utterances are usually enough for CMVN estimation.
    self.cmvn_max_samples = 10000

    self._cmvn_path = self.taskconf['audio']['cmvn_path']

    # meta data
    self.meta = KaldiMetaData()
    self._classes = {}

    logging.info('Loading meta data ...')
    self.load_meta_data()

    if self.mode == utils.INFER and self.whole_utt_inference:
      # Do whole utterance inference.
      logging.info('Set chunk_size = 10M and padding = False.')
      self.sampler = ChunkSampler(self.meta, 10000000)
      self.sampler.pad_chunks = False
    else:
      self.sampler = ChunkSampler(self.meta, self.chunk_size_frames)

    if self.mode != utils.INFER and self.add_random_offset:
      self.sampler.add_random_offset = True

    if self.mode != utils.INFER and self.drop_short_chunks > 0.0:
      logging.info('Dropping chunks < %f.' % self.drop_short_chunks)
      self.sampler.drop_short_chunks = self.drop_short_chunks

    if self.mode != utils.INFER:
      if self.single_chunk > 0.0:
        logging.info('Single chunk sampling enabled.')
      self.sampler.single_chunk = self.single_chunk

    if self.mode == utils.TRAIN:
      if self.select_by_spk_train:
        logging.info('Utt selection by spk enabled for training.')
      self.sampler.select_by_spk = self.select_by_spk_train
    elif self.mode == utils.EVAL:
      if self.select_by_spk_eval:
        logging.info('Utt selection by spk enabled for evaluation.')
      self.sampler.select_by_spk = self.select_by_spk_eval

    if self.mode == utils.TRAIN:
      logging.info('Num repeats = %d.' % (self.num_repeats))
      self.sampler.num_repeats = self.num_repeats

  def load_meta_data(self):
    ''' Load meta data. '''
    data_paths = self.dataconf[self.mode]['paths']
    logging.info('Loading mode %s dirs: %s ...' % (self.mode, data_paths))
    if len(data_paths) != 1:
      raise ValueError('More than one data dir is not supported for now.')
    for data_path in data_paths:
      logging.info('Loading dir %s ...' % (data_path))
      if self.data_type == 'KaldiDataDirectory':
        self.meta.load(data_path)
      else:
        raise ValueError('Unsupported data type: %s' % (self.data_type))
    self._classes = self.meta.spk2id

  @property
  def num_class(self):
    ''' Return number of classes. '''
    return len(self._classes)

  @property
  def classes(self):
    ''' Return a map from class names to label ids. '''
    return self._classes

  def class_id(self, class_name):
    ''' Return the numeric label of a given class name. '''
    return self._classes[class_name]

  def generate_feat(self, filelist, dry_run=False):
    ''' Stub. Not implemented because we use Kaldi features. '''

  def generate_cmvn(self, filelist=None, dry_run=False):  # pylint: disable=unused-argument
    ''' Generate mean and vars of features. '''
    sums, square, count = utils.create_cmvn_statis(
        self.taskconf['audio']['feature_size'],
        self.taskconf['audio']['add_delta_deltas'])

    self.sampler.chunk_size = 100000
    self.sampler.pad_chunks = False

    num_done = 0
    for inputs, _, _, _, _ in \
        self.generate_data():
      # update stats
      if inputs.ndim == 3:
        inputs = np.expand_dims(inputs, axis=0)
      sums, square, count = utils.update_cmvn_statis(
          inputs, sums, square, count, axis=(0, 1))
      num_done += 1
      if num_done % 100 == 0:
        logging.info('Done %d samples.' % (num_done))
      if num_done > self.cmvn_max_samples:
        break
    # compute cmvn
    mean, var = utils.compute_cmvn(sums, square, count)
    if dry_run:
      logging.info('Dry run: would save cmvn to {}.'.format(self._cmvn_path))
    else:
      np.save(self._cmvn_path, (mean, var))
    logging.info('generate cmvn done')
    logging.info(mean)
    logging.info(var)

  def generate_data(self):
    '''
    Yields samples.

    Multiprocessing is used except in INFER mode, since Estimator.predict
    may not work well with multiple processes.

    Yields:
      (inputs, label, filename, clip_id, soft_label)
    '''
    class_num = self.taskconf['classes']['num']

    def process_sample(sample, clip_id):
      '''
      Pack various info into a tuple, to be further processed by the Dataset.

      Args:
        sample: a clip (or chunk) of an utterance generated previously.
        clip_id: the index of clip in the entire utterance.

      Returns:
        a tuple of feature, label and everything else for training.
      '''
      inputs, label, utt_key = sample
      filename = utt_key
      clip_id = clip_id
      soft_label = np.zeros((1,))  # disabled for speaker model
      return inputs, label, filename, clip_id, soft_label

    if self.mode == utils.INFER:
      # Estimator.predict might cause multiprocessing to fail.
      multiprocess = False
    else:
      multiprocess = True

    self.sampler.reset()

    if multiprocess:
      q = ImapUnorderedDataQueue
      data_queue = q(self.meta, self.sampler, num_processes=4)
      data_queue.start()
      for samples in data_queue.get_items():
        for idx, sample in enumerate(samples):
          yield process_sample(sample, idx)
    else:
      for item in self.meta.utts.items():
        samples = self.sampler.utt_to_samples((None, item))
        for idx, sample in enumerate(samples):
          yield process_sample(sample, idx)

  def feature_spec(self):
    output_shapes = (
        tf.TensorShape(self.feature_shape),  # audio_feat e.g. (3000, 40, 3)
        tf.TensorShape([]),  # label
        tf.TensorShape([]),  # filename
        tf.TensorShape([]),  # clip_id
        tf.TensorShape([1]),  # soft_label, disabled for speaker model
    )
    output_types = (
        tf.float32,
        tf.int32,
        tf.string,
        tf.int32,
        tf.float32,
    )
    assert len(output_shapes) == len(output_types)
    return output_shapes, output_types

  def preprocess_batch(self, batch):
    return batch

  def dataset(self, mode, batch_size, num_epoch):  # pylint: disable=unused-argument
    shapes, types = self.feature_spec()
    data = tf.data.Dataset.from_generator(
        generator=lambda: self.generate_data(),  # pylint: disable=unnecessary-lambda
        output_types=types,
        output_shapes=shapes,
    )

    buffer_size = self.taskconf['shuffle_buffer_size']
    logging.info('Using buffer size of %d samples in shuffle().' %
                 (buffer_size))
    if mode == utils.TRAIN:
      data = data.shuffle(buffer_size=buffer_size)
      if self.uniform_resample:

        def class_func(inputs, labels, filenames, clip_ids, soft_labels):
          ''' Return the label of a sample tuple. '''
          return labels

        target_dist = tf.ones((self.num_class,), dtype=tf.float32) / \
                      self.num_class
        data = data.apply(
            tf.data.experimental.rejection_resample(class_func, target_dist))

    def make_example(inputs, labels, filenames, clip_ids, soft_labels):
      features = {
          'inputs': inputs,
          'labels': labels,
          'filepath': filenames,
          'clipid': clip_ids,
          'soft_labels': soft_labels,
      }
      return features, labels

    if self.mode == utils.INFER and self.whole_utt_inference:
      # To avoid length difference since padding = False.
      logging.info('Inference mode, set batch_size to 1.')
      batch_size = 1
    return data.map(make_example, num_parallel_calls=10).\
                batch(batch_size, drop_remainder=False).\
                prefetch(tf.data.experimental.AUTOTUNE)
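
A hypothetical usage sketch for the class above. The config contents, the utils mode constants, and eager-mode iteration over the tf.data pipeline are assumptions.

# Hypothetical usage sketch; not part of the original example.
task = SpeakerClsTask(config, utils.TRAIN)
print('num speakers:', task.num_class)

ds = task.dataset(utils.TRAIN, batch_size=32, num_epoch=1)
for features, labels in ds.take(1):
  # features['inputs']: (batch, frames, feature_size, channels)
  print(features['inputs'].shape, labels.shape)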
Example #4
class SpeakerClsTask(SpeechTask):
  ''' Speaker Classification Task '''

  def __init__(self, config, mode):
    super().__init__(config, mode)

    self.mode = mode
    self.dataconf = self.config['data']
    self.taskconf = self.dataconf['task']
    self.solverconf = self.config['solver']

    self.data_type = self.taskconf['data_type']

    self.chunk_size_seconds = self.taskconf['audio']['clip_size']
    # TODO: configurable frame rate
    self.chunk_size_frames = self.chunk_size_seconds * 100
    self.feature_dims = self.taskconf['audio']['feature_size']
    # TODO: delta features
    self.feature_shape = (self.chunk_size_frames, self.feature_dims, 1)
    # TODO: text input is useless
    self.max_text_len = 10

    self._cmvn_path = self.taskconf['audio']['cmvn_path']

    # meta data
    self.meta = KaldiMetaData()
    self._classes = {}

    logging.info('Loading meta data ...')
    self.load_meta_data()

    self.sampler = ChunkSampler(self.chunk_size_frames)

  def load_meta_data(self):
    ''' Load meta data. '''
    data_paths = self.dataconf[self.mode]['paths']
    logging.info('Loading mode %s dirs: %s ...' % (self.mode, data_paths))
    if len(data_paths) != 1:
      raise ValueError('More than one data dir is not supported for now.')
    for data_path in data_paths:
      logging.info('Loading dir %s ...' % (data_path))
      if self.data_type == 'KaldiDataDirectory':
        self.meta.load(data_path)
      else:
        raise ValueError('Unsupported data type: %s' % (self.data_type))
    self._classes = self.meta.spk2id

  @property
  def num_class(self):
    ''' Return number of classes. '''
    return len(self._classes)

  @property
  def classes(self):
    ''' Return a map from class names to label ids. '''
    return self._classes

  def class_id(self, class_name):
    ''' Return the numeric label of a given class name. '''
    return self._classes[class_name]

  def generate_feat(self, filelist, dry_run=False):
    ''' Stub. Not implemented because we use Kaldi features. '''

  def generate_cmvn(self, filelist=None, dry_run=False):  # pylint: disable=unused-argument
    ''' Generate mean and vars of features. '''
    sums, square, count = utils.create_cmvn_statis(
        self.taskconf['audio']['feature_size'],
        self.taskconf['audio']['add_delta_deltas'])

    self.sampler.set_chunk_size(100000)
    self.sampler.set_pad_chunks(False)
    for inputs, _, _, _, _, _ in \
        self.generate_data():
      # update stats
      if inputs.ndim == 3:
        inputs = np.expand_dims(inputs, axis=0)
      sums, square, count = utils.update_cmvn_statis(
          inputs, sums, square, count, axis=(0, 1))
    # compute cmvn
    mean, var = utils.compute_cmvn(sums, square, count)
    if dry_run:
      logging.info('Dry run: would save cmvn to {}.'.format(self._cmvn_path))
    else:
      np.save(self._cmvn_path, (mean, var))
    logging.info('generate cmvn done')
    logging.info(mean)
    logging.info(var)

  def generate_data(self):
    '''
    Yields samples.

    Multiprocessing is used except in INFER mode, since Estimator.predict
    may not work well with multiple processes.

    Yields:
      (inputs, texts, label, filename, clip_id, soft_label)
    '''
    class_num = self.taskconf['classes']['num']

    def process_sample(sample, clip_id):
      '''
      Pack various info into a tuple, to be further processed by the Dataset.

      Args:
        sample: a clip (or chunk) of an utterance generated previously.
        clip_id: the index of clip in the entire utterance.

      Returns:
        a tuple of feature, label and everything else for training.
      '''
      inputs, label, utt_key = sample
      texts = np.array([0] * self.max_text_len)
      filename = utt_key
      clip_id = clip_id
      soft_label = np.zeros((1,))  # disabled for speaker model
      return inputs, texts, label, filename, clip_id, soft_label

    if self.mode == utils.INFER:
      # Estimator.predict might cause multiprocessing to fail.
      multiprocess = False
    else:
      multiprocess = True

    if multiprocess:
      q = ImapUnorderedDataQueue
      data_queue = q(self.meta, self.sampler, num_processes=4)
      data_queue.start()
      for samples in data_queue.get_items():
        for idx, sample in enumerate(samples):
          yield process_sample(sample, idx)
    else:
      for item in self.meta.utts.items():
        samples = self.sampler.utt_to_samples((None, item))
        for idx, sample in enumerate(samples):
          yield process_sample(sample, idx)

  def feature_spec(self):
    output_shapes = (
        tf.TensorShape(self.feature_shape),  # audio_feat e.g. (3000, 40, 3)
        tf.TensorShape([self.max_text_len]),  # text
        tf.TensorShape([]),  # label
        tf.TensorShape([]),  # filename
        tf.TensorShape([]),  # clip_id
        tf.TensorShape([1]),  # soft_label, disabled for speaker model
    )
    output_types = (
        tf.float32,
        tf.int32,
        tf.int32,
        tf.string,
        tf.int32,
        tf.float32,
    )
    return output_shapes, output_types

  def preprocess_batch(self, batch):
    return batch

  def dataset(self, mode, batch_size, num_epoch):  # pylint: disable=unused-argument
    shapes, types = self.feature_spec()
    data = tf.data.Dataset.from_generator(
        generator=lambda: self.generate_data(),  # pylint: disable=unnecessary-lambda
        output_types=types,
        output_shapes=shapes,
    )

    if 'tf_shuffle_buffer_size' in self.solverconf['optimizer']:
      buffer_size = self.solverconf['optimizer']['tf_shuffle_buffer_size']
    else:
      buffer_size = 100000
    logging.info('Using buffer size of %d samples in shuffle_and_repeat().' %
                 (buffer_size))
    if mode == utils.TRAIN:
      data = data.apply(
          tf.data.experimental.shuffle_and_repeat(
              buffer_size=buffer_size, count=1, seed=None))

    def make_example(inputs, texts, labels, filenames, clip_ids, soft_labels):
      features = {
          'inputs': inputs,
          'texts': texts,
          'labels': labels,
          'filepath': filenames,
          'clipid': clip_ids,
          'soft_labels': soft_labels,
      }
      return features, labels

    return data.map(make_example, num_parallel_calls=10).\
                batch(batch_size, drop_remainder=False).\
                prefetch(tf.data.experimental.AUTOTUNE)
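
A small sketch of consuming the statistics written by generate_cmvn() above. Loading the saved (mean, var) pair with allow_pickle and the variance-floor epsilon are assumptions.

# Hypothetical helper: apply saved CMVN stats to a feature array.
import numpy as np

def apply_cmvn(feat, cmvn_path):
  ''' feat: (frames, feature_size, channels); cmvn_path: .npy written above. '''
  mean, var = np.load(cmvn_path, allow_pickle=True)
  return (feat - mean) / np.sqrt(var + 1e-6)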