# Imports needed by this section; the 't2t_problems' alias for the
# tensor2tensor problem registry is an assumption about the original module,
# which does not show its import block.
import os

from absl import logging

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

from tensor2tensor import problems as t2t_problems


def download_and_prepare(dataset_name, data_dir):
  """Downloads and prepares T2T or TFDS dataset.

  Args:
    dataset_name: tfds dataset or t2t problem name prefixed by 't2t_'.
    data_dir: location of existing dataset or None.

  Returns:
    data_dir: path string of downloaded data.
  """
  if not data_dir:
    data_dir = os.path.expanduser('~/tensorflow_datasets/')
    dl_dir = os.path.join(data_dir, 'download')
    logging.info(
        'No dataset directory provided. '
        'Downloading and generating dataset for %s inside data directory %s '
        'For large datasets it is better to prepare datasets manually!',
        dataset_name, data_dir)
    if dataset_name.startswith('t2t_'):
      # Download and run dataset generator for T2T problem.
      data_dir = os.path.join(data_dir, dataset_name)
      tf.io.gfile.makedirs(data_dir)
      tf.io.gfile.makedirs(dl_dir)
      t2t_problems.problem(dataset_name[len('t2t_'):]).generate_data(
          data_dir, dl_dir)
    else:
      # Download and prepare TFDS dataset.
      tfds_builder = tfds.builder(dataset_name)
      tfds_builder.download_and_prepare(download_dir=dl_dir)
  else:
    data_dir = os.path.expanduser(data_dir)
  return data_dir
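
# A minimal usage sketch, assuming network access and a writable home
# directory; the dataset names below are illustrative, not part of this
# module. With data_dir=None everything lands under ~/tensorflow_datasets/.
def _example_download_and_prepare():
  # TFDS dataset: prepared in place, the (expanded) data_dir is returned.
  tfds_dir = download_and_prepare('mnist', None)
  # T2T problem: note the 't2t_' prefix; data is generated into a
  # problem-specific subdirectory, which is what gets returned.
  t2t_dir = download_and_prepare('t2t_sentiment_imdb', None)
  return tfds_dir, t2t_dir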
def _train_and_eval_dataset_v1(problem_name, data_dir, train_shuffle_files,
                               eval_shuffle_files):
  """Return train and evaluation datasets and supervised keys."""
  with tf.device('cpu:0'):
    problem = t2t_problems.problem(problem_name)
    hparams = None
    if problem_name == 'video_bair_robot_pushing':
      hparams = problem.get_hparams()
      bair_robot_pushing_hparams(hparams)
    train_dataset = problem.dataset(tf.estimator.ModeKeys.TRAIN, data_dir,
                                    shuffle_files=train_shuffle_files,
                                    hparams=hparams)
    train_dataset = train_dataset.map(_select_features)
    eval_dataset = problem.dataset(tf.estimator.ModeKeys.EVAL, data_dir,
                                   shuffle_files=eval_shuffle_files,
                                   hparams=hparams)
    eval_dataset = eval_dataset.map(_select_features)
    # TODO(lukaszkaiser): remove this need for one example, just input_key.
    examples = list(tfds.as_numpy(train_dataset.take(1)))
  # We use 'inputs' as input except for purely auto-regressive tasks like
  # language models where 'targets' are used as input_key.
  input_key = 'inputs' if 'inputs' in examples[0] else 'targets'
  supervised_keys = ([input_key], ['targets'])
  return train_dataset, eval_dataset, supervised_keys
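
# The take(1)/as_numpy peek above works on any dict-structured tf.data
# pipeline. A toy standalone version of the input_key decision (the dataset
# contents here are made up for illustration; runs in eager mode):
def _example_input_key_peek():
  dataset = tf.data.Dataset.from_tensors(
      {'inputs': [1, 2, 3], 'targets': [4, 5, 6]})
  examples = list(tfds.as_numpy(dataset.take(1)))
  # A purely auto-regressive task would lack 'inputs' and fall back to
  # 'targets' as the input key.
  input_key = 'inputs' if 'inputs' in examples[0] else 'targets'
  return ([input_key], ['targets'])  # -> (['inputs'], ['targets'])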
def _train_and_eval_dataset_v1(problem_name, data_dir):
  """Return train and evaluation datasets, feature info and supervised keys."""
  assert not tf.executing_eagerly(), "tf.eager mode must be turned off."
  problem = t2t_problems.problem(problem_name)
  train_dataset = problem.dataset(tf.estimator.ModeKeys.TRAIN, data_dir)
  train_dataset = train_dataset.map(_select_features)
  eval_dataset = problem.dataset(tf.estimator.ModeKeys.EVAL, data_dir)
  eval_dataset = eval_dataset.map(_select_features)
  hparams = problem.get_hparams()
  # We take a few training examples to guess the shapes.
  input_shapes, target_shapes = [], []
  example_tensor = train_dataset.make_one_shot_iterator().get_next()
  sess = tf.Session()
  example1 = sess.run(example_tensor)
  example2 = sess.run(example_tensor)
  example3 = sess.run(example_tensor)
  # We use "inputs" as input except for purely auto-regressive tasks like
  # language models where "targets" are used as input_key.
  input_key = "inputs" if "inputs" in example1 else "targets"
  supervised_keys = ([input_key], ["targets"])
  for example in [example1, example2, example3]:
    input_shapes.append(list(example[input_key].shape))
    target_shapes.append(list(example["targets"].shape))
  input_vocab_size = hparams.vocab_size[input_key]
  target_vocab_size = hparams.vocab_size["targets"]
  input_info = _make_info(input_shapes, input_vocab_size)
  target_info = _make_info(target_shapes, target_vocab_size)
  info = {input_key: input_info, "targets": target_info}
  return train_dataset, eval_dataset, info, supervised_keys
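
# The session-based peeking above relies on the TF1 graph-mode pattern shown
# in this standalone sketch (a toy dataset stands in for the problem data;
# must run with eager execution disabled, matching the assert above):
def _example_graph_mode_peek():
  dataset = tf.data.Dataset.from_tensor_slices([[1, 2], [3, 4], [5, 6]])
  example_tensor = dataset.make_one_shot_iterator().get_next()
  with tf.Session() as sess:
    # Each run() advances the iterator and returns one example as numpy.
    return [sess.run(example_tensor) for _ in range(3)]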
def _train_and_eval_dataset_v1(problem_name, data_dir, train_shuffle_files,
                               eval_shuffle_files):
  """Return train and evaluation datasets, feature info and supervised keys."""
  with tf.device('cpu:0'):
    problem = t2t_problems.problem(problem_name)
    hparams = None
    if problem_name == 'video_bair_robot_pushing':
      hparams = problem.get_hparams()
      bair_robot_pushing_hparams(hparams)
    train_dataset = problem.dataset(tf.estimator.ModeKeys.TRAIN, data_dir,
                                    shuffle_files=train_shuffle_files,
                                    hparams=hparams)
    train_dataset = train_dataset.map(_select_features)
    eval_dataset = problem.dataset(tf.estimator.ModeKeys.EVAL, data_dir,
                                   shuffle_files=eval_shuffle_files,
                                   hparams=hparams)
    eval_dataset = eval_dataset.map(_select_features)
    hparams = problem.get_hparams()
    # We take a few training examples to guess the shapes.
    input_shapes, target_shapes, examples = [], [], []
    if tf.executing_eagerly():
      for example in _eager_dataset_iterator(train_dataset.take(3)):
        examples.append(example)
    else:
      example_tensor = train_dataset.make_one_shot_iterator().get_next()
      sess = tf.Session()
      example1 = sess.run(example_tensor)
      example2 = sess.run(example_tensor)
      example3 = sess.run(example_tensor)
      examples = [example1, example2, example3]
  # We use 'inputs' as input except for purely auto-regressive tasks like
  # language models where 'targets' are used as input_key.
  input_key = 'inputs' if 'inputs' in examples[0] else 'targets'
  supervised_keys = ([input_key], ['targets'])
  for example in examples:
    input_shapes.append(list(example[input_key].shape))
    target_shapes.append(list(example['targets'].shape))
  input_vocab_size = hparams.vocab_size[input_key]
  target_vocab_size = hparams.vocab_size['targets']
  input_dtype = examples[0][input_key].dtype
  target_dtype = examples[0]['targets'].dtype
  input_info = _make_info(input_shapes, input_vocab_size, input_dtype)
  target_info = _make_info(target_shapes, target_vocab_size, target_dtype)
  info = {input_key: input_info, 'targets': target_info}
  return train_dataset, eval_dataset, info, supervised_keys
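
# _eager_dataset_iterator is referenced above but not shown in this section.
# A minimal sketch of what such a helper needs to do, assuming each dataset
# element is a flat dict of tensors: convert eager tensors to numpy so the
# shape-guessing code can treat both execution modes uniformly.
def _example_eager_dataset_iterator(dataset):
  for example in dataset:
    yield {name: tensor.numpy() for name, tensor in example.items()}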
def inputs(n_devices, dataset_name, data_dir=None, input_name=None,
           n_chunks=0):
  """Make Inputs for built-in datasets.

  Args:
    n_devices: how many devices to build the inputs for.
    dataset_name: a TFDS or T2T dataset name. If it's a T2T dataset name,
      prefix with "t2t_".
    data_dir: data directory.
    input_name: optional, name of the inputs from the dictionary.
    n_chunks: optional, into how many pieces should we chunk (large inputs).

  Returns:
    trax.inputs.Inputs
  """
  if not data_dir:
    data_dir = os.path.expanduser('~/tensorflow_datasets/')
    dl_dir = os.path.join(data_dir, 'download')
    tf.logging.info(
        ('No dataset directory provided. '
         'Downloading and generating dataset for %s inside data directory %s '
         'For large datasets it is better to prepare datasets manually!')
        % (dataset_name, data_dir))
    if dataset_name.startswith('t2t_'):
      # Download and run dataset generator for T2T problem.
      data_dir = os.path.join(data_dir, dataset_name)
      tf.gfile.MakeDirs(data_dir)
      tf.gfile.MakeDirs(dl_dir)
      t2t_problems.problem(dataset_name[4:]).generate_data(
          data_dir, dl_dir)
    else:
      # Download and prepare TFDS dataset.
      tfds_builder = tfds.builder(dataset_name)
      tfds_builder.download_and_prepare(download_dir=dl_dir)
  else:
    data_dir = os.path.expanduser(data_dir)
  (train_batches, train_eval_batches, eval_batches, input_name, input_shape,
   input_dtype, target_shape, target_dtype) = _train_and_eval_batches(
       dataset_name, data_dir, input_name, n_devices)
  if isinstance(input_dtype, tf.DType):
    input_dtype = input_dtype.as_numpy_dtype
  if isinstance(target_dtype, tf.DType):
    target_dtype = target_dtype.as_numpy_dtype
  # TPUs don't like uint8s, we cast to ints.
  if input_dtype == np.uint8:
    input_dtype = np.int32
  if target_dtype == np.uint8:
    target_dtype = np.int32

  def numpy_stream(dataset):
    return dataset_to_stream(dataset, input_name, n_chunks=n_chunks)

  if n_chunks > 0:
    length = input_shape[0]
    input_shape = tuple(
        [tuple([length // n_chunks] + list(input_shape)[1:])] * n_chunks)
    input_dtype = tuple([input_dtype] * n_chunks)
    target_shape = tuple(
        [tuple([length // n_chunks] + list(target_shape)[1:])] * n_chunks)
    target_dtype = tuple([target_dtype] * n_chunks)
  return Inputs(train_stream=lambda: numpy_stream(train_batches),
                train_eval_stream=lambda: numpy_stream(train_eval_batches),
                eval_stream=lambda: numpy_stream(eval_batches),
                input_shape=input_shape,
                input_dtype=input_dtype,
                target_shape=target_shape,
                target_dtype=target_dtype)
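
# A usage sketch for inputs(); the dataset name and device count are
# illustrative, and the first call downloads/prepares data if data_dir is
# unset. Per the constructor call above, the *_stream fields are callables
# returning generators of numpy batches.
def _example_inputs_usage():
  trax_inputs = inputs(n_devices=1, dataset_name='t2t_sentiment_imdb')
  train_stream = trax_inputs.train_stream()
  first_batch = next(train_stream)  # one batch of numpy arrays
  return trax_inputs.input_shape, trax_inputs.input_dtype, first_batch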