Example #1
def download_and_prepare(dataset_name, data_dir):
    """Downloads and prepares T2T or TFDS dataset.

  Args:
    dataset_name: tfds dataset or t2t problem name prefixed by 't2t_'.
    data_dir: location of existing dataset or None.

  Returns:
    data_dir: path string of downloaded data.
  """
    if not data_dir:
        data_dir = os.path.expanduser('~/tensorflow_datasets/')
        dl_dir = os.path.join(data_dir, 'download')
        logging.info(
            'No dataset directory provided. '
            'Downloading and generating dataset for %s inside data directory %s. '
            'For large datasets it is better to prepare datasets manually!',
            dataset_name, data_dir)
        if dataset_name.startswith('t2t_'):
            # Download and run dataset generator for T2T problem.
            data_dir = os.path.join(data_dir, dataset_name)
            tf.io.gfile.makedirs(data_dir)
            tf.io.gfile.makedirs(dl_dir)
            t2t_problems.problem(dataset_name[len('t2t_'):]).generate_data(
                data_dir, dl_dir)
        else:
            # Download and prepare TFDS dataset.
            tfds_builder = tfds.builder(dataset_name)
            tfds_builder.download_and_prepare(download_dir=dl_dir)
    else:
        data_dir = os.path.expanduser(data_dir)
    return data_dir
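A minimal usage sketch (hypothetical; the dataset names below are illustrative, and with `data_dir=None` the call really downloads data):

# Hypothetical usage of download_and_prepare; names are illustrative only.
# With data_dir=None the function downloads and prepares the data under
# ~/tensorflow_datasets/ and returns the resulting directory.
prepared_dir = download_and_prepare('mnist', None)
print('TFDS data prepared under:', prepared_dir)

# With an explicit data_dir the function assumes the data already exists
# there and only expands the path.
existing_dir = download_and_prepare('t2t_translate_ende_wmt32k', '~/my_data')
print('using existing data in:', existing_dir)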
Example #2
def _train_and_eval_dataset_v1(problem_name, data_dir, train_shuffle_files,
                               eval_shuffle_files):
    """Return train and evaluation datasets, feature info and supervised keys."""
    with tf.device('cpu:0'):
        problem = t2t_problems.problem(problem_name)
        hparams = None
        if problem_name == 'video_bair_robot_pushing':
            hparams = problem.get_hparams()
            bair_robot_pushing_hparams(hparams)
        train_dataset = problem.dataset(tf.estimator.ModeKeys.TRAIN,
                                        data_dir,
                                        shuffle_files=train_shuffle_files,
                                        hparams=hparams)
        train_dataset = train_dataset.map(_select_features)
        eval_dataset = problem.dataset(tf.estimator.ModeKeys.EVAL,
                                       data_dir,
                                       shuffle_files=eval_shuffle_files,
                                       hparams=hparams)
        eval_dataset = eval_dataset.map(_select_features)
        # TODO(lukaszkaiser): remove this need for one example, just input_key.
        examples = list(tfds.as_numpy(train_dataset.take(1)))
    # We use 'inputs' as input except for purely auto-regressive tasks like
    # language models where 'targets' are used as input_key.
    input_key = 'inputs' if 'inputs' in examples[0] else 'targets'
    supervised_keys = ([input_key], ['targets'])
    return train_dataset, eval_dataset, supervised_keys
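A hypothetical call of this variant, assuming the T2T problem data has already been generated under `data_dir` (for instance with `download_and_prepare` from Example #1):

# Hypothetical usage; assumes the problem data already exists in data_dir.
import os
import tensorflow_datasets as tfds

train_ds, eval_ds, keys = _train_and_eval_dataset_v1(
    'translate_ende_wmt32k',
    os.path.expanduser('~/tensorflow_datasets/'),
    train_shuffle_files=True,
    eval_shuffle_files=False)
print(keys)  # e.g. (['inputs'], ['targets']) for a translation problem
for example in tfds.as_numpy(eval_ds.take(1)):
    print({name: array.shape for name, array in example.items()})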
Example #3
def _train_and_eval_dataset_v1(problem_name, data_dir):
    """Return train and evaluation datasets, feature info and supervised keys."""
    assert not tf.executing_eagerly(), "tf.eager mode must be turned off."
    problem = t2t_problems.problem(problem_name)
    train_dataset = problem.dataset(tf.estimator.ModeKeys.TRAIN, data_dir)
    train_dataset = train_dataset.map(_select_features)
    eval_dataset = problem.dataset(tf.estimator.ModeKeys.EVAL, data_dir)
    eval_dataset = eval_dataset.map(_select_features)
    hparams = problem.get_hparams()
    # We take a few training examples to guess the shapes.
    input_shapes, target_shapes = [], []
    example_tensor = train_dataset.make_one_shot_iterator().get_next()
    sess = tf.Session()
    example1 = sess.run(example_tensor)
    example2 = sess.run(example_tensor)
    example3 = sess.run(example_tensor)
    # We use "inputs" as input except for purely auto-regressive tasks like
    # language models where "targets" are used as input_key.
    input_key = "inputs" if "inputs" in example1 else "targets"
    supervised_keys = ([input_key], ["targets"])
    for example in [example1, example2, example3]:
        input_shapes.append(list(example[input_key].shape))
        target_shapes.append(list(example["targets"].shape))
    input_vocab_size = hparams.vocab_size[input_key]
    target_vocab_size = hparams.vocab_size["targets"]
    input_info = _make_info(input_shapes, input_vocab_size)
    target_info = _make_info(target_shapes, target_vocab_size)
    info = {input_key: input_info, "targets": target_info}
    return train_dataset, eval_dataset, info, supervised_keys
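A hypothetical graph-mode call of this variant (it asserts that eager execution is disabled, matching the TF1-era session usage above):

# Hypothetical graph-mode usage; assumes eager execution is disabled and the
# problem data already exists in the given data directory.
import os

train_ds, eval_ds, info, keys = _train_and_eval_dataset_v1(
    'translate_ende_wmt32k', os.path.expanduser('~/tensorflow_datasets/'))
input_keys, target_keys = keys
print('input feature:', input_keys[0])     # 'inputs' or 'targets'
print('input info:', info[input_keys[0]])  # shapes guessed from 3 examples
print('target info:', info[target_keys[0]])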
Example #4
def _train_and_eval_dataset_v1(problem_name, data_dir, train_shuffle_files,
                               eval_shuffle_files):
    """Return train and evaluation datasets, feature info and supervised keys."""
    with tf.device('cpu:0'):
        problem = t2t_problems.problem(problem_name)
        hparams = None
        if problem_name == 'video_bair_robot_pushing':
            hparams = problem.get_hparams()
            bair_robot_pushing_hparams(hparams)
        train_dataset = problem.dataset(tf.estimator.ModeKeys.TRAIN,
                                        data_dir,
                                        shuffle_files=train_shuffle_files,
                                        hparams=hparams)
        train_dataset = train_dataset.map(_select_features)
        eval_dataset = problem.dataset(tf.estimator.ModeKeys.EVAL,
                                       data_dir,
                                       shuffle_files=eval_shuffle_files,
                                       hparams=hparams)
        eval_dataset = eval_dataset.map(_select_features)
        hparams = problem.get_hparams()
        # We take a few training examples to guess the shapes.
        input_shapes, target_shapes, examples = [], [], []
        if tf.executing_eagerly():
            for example in _eager_dataset_iterator(train_dataset.take(3)):
                examples.append(example)
        else:
            example_tensor = train_dataset.make_one_shot_iterator().get_next()
            sess = tf.Session()
            example1 = sess.run(example_tensor)
            example2 = sess.run(example_tensor)
            example3 = sess.run(example_tensor)
            examples = [example1, example2, example3]
    # We use 'inputs' as input except for purely auto-regressive tasks like
    # language models where 'targets' are used as input_key.
    input_key = 'inputs' if 'inputs' in examples[0] else 'targets'
    supervised_keys = ([input_key], ['targets'])
    for example in examples:
        input_shapes.append(list(example[input_key].shape))
        target_shapes.append(list(example['targets'].shape))
    input_vocab_size = hparams.vocab_size[input_key]
    target_vocab_size = hparams.vocab_size['targets']
    input_dtype = examples[0][input_key].dtype
    target_dtype = examples[0]['targets'].dtype
    input_info = _make_info(input_shapes, input_vocab_size, input_dtype)
    target_info = _make_info(target_shapes, target_vocab_size, target_dtype)
    info = {input_key: input_info, 'targets': target_info}
    return train_dataset, eval_dataset, info, supervised_keys
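The `_eager_dataset_iterator` helper referenced above is not part of this example; a plausible sketch of it (an assumption, not the verbatim implementation) just converts each eager example to numpy arrays:

# A plausible sketch of the _eager_dataset_iterator helper referenced above;
# this is an assumption, the real helper may differ. In eager mode a
# tf.data.Dataset is directly iterable, so each example only needs its
# tensors converted to numpy arrays.
import tensorflow as tf


def _eager_dataset_iterator(dataset):
    for item in dataset:
        flat = tf.nest.flatten(item)
        flat = [el.numpy() for el in flat]
        yield tf.nest.pack_sequence_as(item, flat)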
Example #5
def inputs(n_devices,
           dataset_name,
           data_dir=None,
           input_name=None,
           n_chunks=0):
    """Make Inputs for built-in datasets.

  Args:
    n_devices: how many devices to build the inputs for.
    dataset_name: a TFDS or T2T dataset name. If it's a T2T dataset name, prefix
      with "t2t_".
    data_dir: data directory.
    input_name: optional, name of the inputs from the dictionary.
    n_chunks: optional, into how many pieces should we chunk (large inputs).

  Returns:
    trax.inputs.Inputs
  """
    if not data_dir:
        data_dir = os.path.expanduser('~/tensorflow_datasets/')
        dl_dir = os.path.join(data_dir, 'download')
        tf.logging.info((
            'No dataset directory provided. '
            'Downloading and generating dataset for %s inside data directory %s. '
            'For large datasets it is better to prepare datasets manually!') %
                        (dataset_name, data_dir))
        if dataset_name.startswith('t2t_'):
            # Download and run dataset generator for T2T problem.
            data_dir = os.path.join(data_dir, dataset_name)
            tf.gfile.MakeDirs(data_dir)
            tf.gfile.MakeDirs(dl_dir)
            t2t_problems.problem(dataset_name[4:]).generate_data(
                data_dir, dl_dir)
        else:
            # Download and prepare TFDS dataset.
            tfds_builder = tfds.builder(dataset_name)
            tfds_builder.download_and_prepare(download_dir=dl_dir)
    else:
        data_dir = os.path.expanduser(data_dir)

    (train_batches, train_eval_batches, eval_batches, input_name, input_shape,
     input_dtype, target_shape,
     target_dtype) = _train_and_eval_batches(dataset_name, data_dir,
                                             input_name, n_devices)

    if isinstance(input_dtype, tf.DType):
        input_dtype = input_dtype.as_numpy_dtype
    if isinstance(target_dtype, tf.DType):
        target_dtype = target_dtype.as_numpy_dtype

    if input_dtype == np.uint8:  # TPUs don't like uint8s; cast them to int32.
        input_dtype = np.int32
    if target_dtype == np.uint8:
        target_dtype = np.int32

    def numpy_stream(dataset):
        return dataset_to_stream(dataset, input_name, n_chunks=n_chunks)

    if n_chunks > 0:
        length = input_shape[0]
        input_shape = tuple(
            [tuple([length // n_chunks] + list(input_shape)[1:])] * n_chunks)
        input_dtype = tuple([input_dtype] * n_chunks)
        target_shape = tuple(
            [tuple([length // n_chunks] + list(target_shape)[1:])] * n_chunks)
        target_dtype = tuple([target_dtype] * n_chunks)

    return Inputs(train_stream=lambda: numpy_stream(train_batches),
                  train_eval_stream=lambda: numpy_stream(train_eval_batches),
                  eval_stream=lambda: numpy_stream(eval_batches),
                  input_shape=input_shape,
                  input_dtype=input_dtype,
                  target_shape=target_shape,
                  target_dtype=target_dtype)
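A hypothetical call, assuming the surrounding module also defines `Inputs`, `dataset_to_stream` and `_train_and_eval_batches` as referenced above:

# Hypothetical usage; the dataset name and device count are illustrative.
trax_inputs = inputs(n_devices=1, dataset_name='t2t_translate_ende_wmt32k')
print(trax_inputs.input_shape, trax_inputs.input_dtype)
print(trax_inputs.target_shape, trax_inputs.target_dtype)

# Each stream is a generator of numpy batches; the exact batch structure is
# produced by dataset_to_stream, which is not shown in this example.
first_batch = next(trax_inputs.train_stream())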