Example #1
  def build_inputs(self, params: cfg.DataConfig, input_context=None):
    """Returns tf.data.Dataset for sentence_prediction task."""
    if params.input_path == 'dummy':

      def dummy_data(_):
        dummy_ids = tf.zeros((1, params.seq_length), dtype=tf.int32)
        x = dict(
            input_word_ids=dummy_ids,
            input_mask=dummy_ids,
            input_type_ids=dummy_ids)

        # Include some -1 label_ids, which are ignored in the loss/metrics.
        y = tf.random.uniform(
            shape=(1, params.seq_length),
            minval=-1,
            maxval=len(self.task_config.class_names),
            dtype=tf.dtypes.int32)
        return (x, y)

      dataset = tf.data.Dataset.range(1)
      dataset = dataset.repeat()
      dataset = dataset.map(
          dummy_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
      return dataset

    return data_loader_factory.get_data_loader(params).load(input_context)
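The dummy branch above is self-contained enough to run on its own. A minimal sketch, where SEQ_LENGTH and NUM_CLASSES are placeholder stand-ins for params.seq_length and len(self.task_config.class_names):

import tensorflow as tf

SEQ_LENGTH = 8   # stands in for params.seq_length
NUM_CLASSES = 3  # stands in for len(self.task_config.class_names)

def dummy_data(_):
  dummy_ids = tf.zeros((1, SEQ_LENGTH), dtype=tf.int32)
  x = dict(
      input_word_ids=dummy_ids,
      input_mask=dummy_ids,
      input_type_ids=dummy_ids)
  # Labels drawn from [-1, NUM_CLASSES); the -1 entries are ignored downstream.
  y = tf.random.uniform(
      (1, SEQ_LENGTH), minval=-1, maxval=NUM_CLASSES, dtype=tf.int32)
  return x, y

dataset = tf.data.Dataset.range(1).repeat().map(
    dummy_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
features, labels = next(iter(dataset))
assert labels.shape == (1, SEQ_LENGTH)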
Example #2
    def build_inputs(self, params, input_context=None):
        """Returns tf.data.Dataset for pretraining."""
        # Copied from masked_lm.py for testing.
        if params.input_path == 'dummy':

            def dummy_data(_):
                dummy_ids = tf.zeros((1, params.seq_length), dtype=tf.int32)
                dummy_lm = tf.zeros((1, params.max_predictions_per_seq),
                                    dtype=tf.int32)
                return dict(input_word_ids=dummy_ids,
                            input_mask=dummy_ids,
                            input_type_ids=dummy_ids,
                            masked_lm_positions=dummy_lm,
                            masked_lm_ids=dummy_lm,
                            masked_lm_weights=tf.cast(dummy_lm,
                                                      dtype=tf.float32),
                            next_sentence_labels=tf.zeros((1, 1),
                                                          dtype=tf.int32))

            dataset = tf.data.Dataset.range(1)
            dataset = dataset.repeat()
            dataset = dataset.map(
                dummy_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
            return dataset

        return data_loader_factory.get_data_loader(params).load(input_context)
Example #3
  def build_inputs(self, params, input_context=None):
    """Returns tf.data.Dataset for sentence_prediction task."""
    if params.input_path == 'dummy':
      # Dummy training data for unit test.
      def dummy_data(_):
        dummy_ids = tf.zeros((1, params.seq_length), dtype=tf.int32)
        x = dict(
            input_word_ids=dummy_ids,
            input_mask=dummy_ids,
            input_type_ids=dummy_ids)
        y = dict(
            start_positions=tf.constant(0, dtype=tf.int32),
            end_positions=tf.constant(1, dtype=tf.int32))
        return (x, y)

      dataset = tf.data.Dataset.range(1)
      dataset = dataset.repeat()
      dataset = dataset.map(
          dummy_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
      return dataset

    if params.is_training:
      dataloader_params = params
    else:
      input_path = self._tf_record_input_path
      dataloader_params = params.replace(input_path=input_path)

    return data_loader_factory.get_data_loader(dataloader_params).load(
        input_context)
Example #4
  def build_dataset(self):
    """Creates the training and evaluation datasets."""
    # Set the datasets to None when the input_path is 'dummy'.
    if self.train_dataset_config.input_path == 'dummy':
      self.train_dataset = None
      self.eval_dataset = None
      return
    # Non-distributed datasets.
    train_dataset = data_loader_factory.get_data_loader(
        self.train_dataset_config).load()
    eval_dataset = data_loader_factory.get_data_loader(
        self.eval_dataset_config).load()
    # Distributed datasets.
    self.train_dataset = orbit.utils.make_distributed_dataset(
        self.strategy, train_dataset)
    self.eval_dataset = orbit.utils.make_distributed_dataset(
        self.strategy, eval_dataset)
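A minimal sketch of the make_distributed_dataset step in isolation, assuming the default (single-device) strategy and a toy dataset in place of the loader output; as the snippet above suggests, orbit.utils.make_distributed_dataset accepts a tf.data.Dataset directly:

import orbit
import tensorflow as tf

strategy = tf.distribute.get_strategy()     # default strategy for this sketch
dataset = tf.data.Dataset.range(8).batch(4)  # toy stand-in for .load()

dist_dataset = orbit.utils.make_distributed_dataset(strategy, dataset)
for batch in dist_dataset:
  print(batch)  # per-replica values under a multi-device strategy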
Example #5
  def build_inputs(self,
                   params: cfg.DataConfig,
                   input_context: Optional[tf.distribute.InputContext] = None):
    """Returns a dataset."""
    if params.is_training:
      dataloader_params = params
    else:
      input_path = self._tf_record_input_path
      # Read from padded tf records instead.
      dataloader_params = params.replace(
          input_path=input_path,
          tfds_name="",
          tfds_split="",
          has_unique_id=True)
    dataloader_params = dataloader_params.replace(
        sentencepiece_model_path=self._sentencepiece_model_path)
    return data_loader_factory.get_data_loader(dataloader_params).load(
        input_context)
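The replace() calls above rely on dataclass-style configs that return a modified copy rather than mutating in place. A toy sketch of that override pattern; ToyDataConfig and all values here are illustrative stand-ins, not the real cfg.DataConfig:

import dataclasses

@dataclasses.dataclass(frozen=True)
class ToyDataConfig:
  input_path: str = ""
  tfds_name: str = ""
  has_unique_id: bool = False

  def replace(self, **kwargs):
    # Returns a new config with the given fields overridden.
    return dataclasses.replace(self, **kwargs)

params = ToyDataConfig(tfds_name="some_tfds_dataset")
eval_params = params.replace(
    input_path="/tmp/eval.tf_record", tfds_name="", has_unique_id=True)
assert params.tfds_name == "some_tfds_dataset"  # original left unchanged
assert eval_params.has_unique_id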
Example #6
    def build_inputs(self, params, input_context=None):
        """Returns tf.data.Dataset for sentence_prediction task."""
        if params.input_path == 'dummy':
            dataset = tf.data.Dataset.range(1)
            dataset = dataset.repeat()
            dummy_data = functools.partial(self._dummy_data, params)
            dataset = dataset.map(
                dummy_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
            return dataset

        if params.is_training:
            dataloader_params = params
        else:
            input_path = self._tf_record_input_path
            dataloader_params = params.replace(input_path=input_path)

        return data_loader_factory.get_data_loader(dataloader_params).load(
            input_context)
Example #7
    def build_inputs(self, params, input_context=None):
        """Returns tf.data.Dataset for sentence_prediction task."""
        if params.input_path == 'dummy':

            def dummy_data(_):
                dummy_ids = tf.zeros((1, params.seq_length), dtype=tf.int32)
                x = dict(input_word_ids=dummy_ids,
                         input_mask=dummy_ids,
                         input_type_ids=dummy_ids)
                y = tf.zeros((1, 1), dtype=tf.int32)
                return (x, y)

            dataset = tf.data.Dataset.range(1)
            dataset = dataset.repeat()
            dataset = dataset.map(
                dummy_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
            return dataset

        return data_loader_factory.get_data_loader(params).load(input_context)
Example #8
def write_test_record(params, model_dir):
  """Writes the test input to a tfrecord."""
  # Get raw data from tfds.
  params = params.replace(transform_and_batch=False)
  dataset = data_loader_factory.get_data_loader(params).load()
  references = []
  total_samples = 0
  output_file = os.path.join(model_dir, "eval.tf_record")
  writer = tf.io.TFRecordWriter(output_file)
  for d in dataset:
    references.append(d[params.tgt_lang].numpy().decode())
    example = tf.train.Example(
        features=tf.train.Features(
            feature={
                "unique_id": tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[total_samples])),
                params.src_lang: tf.train.Feature(
                    bytes_list=tf.train.BytesList(
                        value=[d[params.src_lang].numpy()])),
                params.tgt_lang: tf.train.Feature(
                    bytes_list=tf.train.BytesList(
                        value=[d[params.tgt_lang].numpy()])),
            }))
    writer.write(example.SerializeToString())
    total_samples += 1
  batch_size = params.global_batch_size
  # Pad with empty examples so the total is a multiple of the batch size
  # (adds nothing when total_samples is already aligned).
  num_dummy_example = -total_samples % batch_size
  for i in range(num_dummy_example):
    example = tf.train.Example(
        features=tf.train.Features(
            feature={
                "unique_id": tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[total_samples + i])),
                params.src_lang: tf.train.Feature(
                    bytes_list=tf.train.BytesList(value=[b""])),
                params.tgt_lang: tf.train.Feature(
                    bytes_list=tf.train.BytesList(value=[b""])),
            }))
    writer.write(example.SerializeToString())
  writer.close()
  return references, output_file
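To verify what write_test_record produced, the file can be read back with tf.data. A hedged sketch: the feature keys mirror the writer above, while "en"/"de" and the path are placeholder assumptions for params.src_lang, params.tgt_lang, and model_dir:

import tensorflow as tf

feature_description = {
    "unique_id": tf.io.FixedLenFeature([], tf.int64),
    "en": tf.io.FixedLenFeature([], tf.string),  # placeholder for params.src_lang
    "de": tf.io.FixedLenFeature([], tf.string),  # placeholder for params.tgt_lang
}

dataset = tf.data.TFRecordDataset("/tmp/model_dir/eval.tf_record").map(
    lambda record: tf.io.parse_single_example(record, feature_description))
for example in dataset.take(2):
  print(int(example["unique_id"]), example["en"].numpy().decode())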
Example #9
    def build_inputs(self, params, input_context=None):
        """Returns tf.data.Dataset for sentence_prediction task."""
        if params.input_path == 'dummy':

            def dummy_data(_):
                dummy_ids = tf.zeros((1, params.seq_length), dtype=tf.int32)
                x = dict(input_word_ids=dummy_ids,
                         input_mask=dummy_ids,
                         input_type_ids=dummy_ids)

                if self.task_config.model.num_classes == 1:
                    y = tf.zeros((1, ), dtype=tf.float32)
                else:
                    y = tf.zeros((1, 1), dtype=tf.int32)
                x[self.label_field] = y
                return x

            dataset = tf.data.Dataset.range(1)
            dataset = dataset.repeat()
            dataset = dataset.map(
                dummy_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
            return dataset

        return data_loader_factory.get_data_loader(params).load(input_context)
Example #10
def build_inputs(data_params, input_context=None):
    """Returns tf.data.Dataset for sentence_prediction task."""
    return data_loader_factory.get_data_loader(data_params).load(input_context)
Example #11
  def test_register_and_load(self):
    train_config = MyDataConfig()
    train_loader = data_loader_factory.get_data_loader(train_config)
    self.assertTrue(train_loader.params.is_training)
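This test only passes if MyDataConfig and a matching loader class were registered with the factory beforehand. A reconstructed sketch of that setup, assuming the factory exposes a register_data_loader_cls decorator; the trivial MyDataLoader body is illustrative:

import dataclasses

from official.core import config_definitions as cfg
from official.nlp.data import data_loader_factory

@dataclasses.dataclass
class MyDataConfig(cfg.DataConfig):
  is_training: bool = True

@data_loader_factory.register_data_loader_cls(MyDataConfig)
class MyDataLoader:
  """Trivial loader that only records its params."""

  def __init__(self, params):
    self.params = params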