class DrugCardiotoxicityDatasetTest(tf.test.TestCase, parameterized.TestCase):

  @parameterized.named_parameters(
      ('Train', tfds.Split.TRAIN, True, 6523),
      ('Validation', tfds.Split.VALIDATION, False, 1631),
      ('Test', tfds.Split.TEST, False, 839),
      ('Test2', tfds.Split('test2'), False, 177))
  def testDatasetSize(self, split, is_training, expected_size):
    dataset_builder = ub.datasets.DrugCardiotoxicityDataset(
        split=split,
        is_training=is_training,
        shuffle_buffer_size=20)
    self.assertEqual(dataset_builder.num_examples, expected_size)

  @parameterized.named_parameters(
      ('Train', tfds.Split.TRAIN, True),
      ('Validation', tfds.Split.VALIDATION, False),
      ('Test', tfds.Split.TEST, False),
      ('Test2', tfds.Split('test2'), False))
  def testDatasetShape(self, split, is_training):
    batch_size = 128
    dataset_builder = ub.datasets.DrugCardiotoxicityDataset(
        split=split,
        is_training=is_training,
        shuffle_buffer_size=20)
    dataset = dataset_builder.load(batch_size=batch_size).take(1)
    element = next(iter(dataset))
    atoms = element['features']['atoms']
    pairs = element['features']['pairs']
    atom_mask = element['features']['atom_mask']
    pair_mask = element['features']['pair_mask']
    molecule_id = element['features']['molecule_id']
    labels = element['labels']
    self.assertEqual(atoms.shape, (batch_size, 60, 27))
    self.assertEqual(pairs.shape, (batch_size, 60, 60, 12))
    self.assertEqual(atom_mask.shape, (batch_size, 60))
    self.assertEqual(pair_mask.shape, (batch_size, 60, 60))
    self.assertEqual(molecule_id.shape, (batch_size,))
    self.assertEqual(labels.shape, (batch_size, 2))
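Test modules like this are conventionally executed through TensorFlow's test runner. The entry point below is an assumed addition for completeness, not part of the original snippet:

# Assumed main guard for running the tests above (not in the original file).
if __name__ == '__main__':
  tf.test.main()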
def _as_dataset(
    self,
    split: tfds.Split,
    decoders=None,
    read_config=None,
    shuffle_files=False) -> tf.data.Dataset:
  """Constructs a `tf.data.Dataset`."""
  del decoders
  del read_config
  del shuffle_files
  if split == tfds.Split.TRAIN:
    return _build_dataset(
        glob_dir=os.path.join(self._data_dir, self._file_names['train']),
        is_training=True)
  elif split == tfds.Split.VALIDATION:
    return _build_dataset(
        glob_dir=os.path.join(self._data_dir, self._file_names['validation']),
        is_training=False)
  elif split == tfds.Split.TEST:
    return _build_dataset(
        glob_dir=os.path.join(self._data_dir, self._file_names['test']),
        is_training=False)
  elif split == tfds.Split('test2'):
    return _build_dataset(
        glob_dir=os.path.join(self._data_dir, self._file_names['test2']),
        is_training=False)
  raise ValueError('Unsupported split given: {}.'.format(split))
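The `_build_dataset` helper called above is not shown in this excerpt. The following is only a minimal sketch of what such a helper might look like, assuming the split files are TFRecord shards matched by a glob pattern; the signature and parsing behavior are assumptions, not the project's actual implementation:

def _build_dataset(glob_dir: str, is_training: bool) -> tf.data.Dataset:
  """Hypothetical sketch: reads record shards matching `glob_dir`."""
  filenames = tf.io.gfile.glob(glob_dir)
  dataset = tf.data.TFRecordDataset(filenames)
  if is_training:
    # Training reads are typically shuffled; evaluation reads are left in order.
    dataset = dataset.shuffle(buffer_size=1024)
  return dataset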
def main(argv: Sequence[str]):
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')

  tf.io.gfile.makedirs(FLAGS.output_dir)
  logging.info('Saving checkpoints at %s', FLAGS.output_dir)

  tf.random.set_seed(FLAGS.seed)

  if not FLAGS.use_gpu:
    logging.info('Using TPU for training.')
    strategy = utils.get_tpu_strategy(FLAGS.tpu)
  else:
    logging.info('Using GPU for training.')
    strategy = tf.distribute.MirroredStrategy()

  train_dataset, steps_per_epoch = utils.load_dataset(
      FLAGS.data_dir, tfds.Split.TRAIN, FLAGS.batch_size)
  eval_identifiers = ['tune', 'test1', 'test2']
  splits = [tfds.Split.VALIDATION, tfds.Split.TEST, tfds.Split('test2')]
  eval_datasets, steps_per_eval = utils.load_eval_datasets(
      eval_identifiers, splits, FLAGS.data_dir, FLAGS.batch_size)
  logging.info('Steps for eval datasets: %s', steps_per_eval)

  graph_augmenter = None
  if FLAGS.augmentations:
    graph_augmenter = augmentation_utils.GraphAugment(
        FLAGS.augmentations, FLAGS.aug_ratio, FLAGS.aug_prob,
        FLAGS.perturb_node_features, FLAGS.drop_edges_only,
        FLAGS.perturb_edge_features, FLAGS.initialize_edge_features_randomly,
        FLAGS.mask_mean, FLAGS.mask_stddev)

  params = utils.ModelParameters(
      num_heads=FLAGS.num_heads,
      num_layers=FLAGS.num_layers,
      message_layer_size=FLAGS.message_layer_size,
      readout_layer_size=FLAGS.readout_layer_size,
      use_gp_layer=False,
      learning_rate=FLAGS.learning_rate,
      augmentations=FLAGS.augmentations,
      num_epochs=FLAGS.num_epochs,
      steps_per_epoch=steps_per_epoch)

  model_dir = FLAGS.output_dir
  utils.write_params(
      dataclasses.asdict(params), os.path.join(model_dir, 'params.json'))

  summary_writer = tf.summary.create_file_writer(
      os.path.join(model_dir, 'summaries'))
  run(train_dataset=train_dataset,
      eval_datasets=eval_datasets,
      steps_per_eval=steps_per_eval,
      params=params,
      model_dir=model_dir,
      strategy=strategy,
      summary_writer=summary_writer,
      loss_type=FLAGS.loss_type,
      graph_augmenter=graph_augmenter)
def _split_generators(self, dl_manager):
  """Specify dataset splits, setting up calls to _generate_examples.

  This is the first entrypoint for tfds's download_and_prepare function.

  Args:
    dl_manager: (DownloadManager) Download manager to download the data.

  Returns:
    `list<tfds.core.SplitGenerator>`.
  """
  sys.setrecursionlimit(10000)
  del dl_manager  # Unused.
  return [
      tfds.core.SplitGenerator(
          name=tfds.Split("train"),
          gen_kwargs={"split": "train"}),
  ]
def _as_dataset(
    self,
    split: tfds.Split,
    decoders=None,
    read_config=None,
    shuffle_files=False) -> tf.data.Dataset:
  """Constructs a `tf.data.Dataset`.

  Args:
    split: `tfds.Split` which subset of the data to read.
    decoders: Unused.
    read_config: Unused.
    shuffle_files: Unused.

  Returns:
    `tf.data.Dataset`
  """
  del decoders
  del read_config
  del shuffle_files

  is_training = False
  if isinstance(split, tfds.core.ReadInstruction):
    logging.warning(
        'ReadInstruction splits are currently not supported. Using '
        'the split name `%s` instead of `%s`.', split.split_name, split)
    split = tfds.Split(split.split_name)
  if split == tfds.Split.TRAIN:
    file_pattern = 'train-*-of-*'
    is_training = True
  elif split == tfds.Split.VALIDATION:
    file_pattern = 'validation-*-of-*'
  elif split == tfds.Split.TEST:
    file_pattern = 'test-*-of-*'
  else:
    raise ValueError('Unsupported split given: {}.'.format(split))
  return _build_dataset(
      glob_dir=os.path.join(self._data_dir, file_pattern),
      is_training=is_training)
def load_dataset(data_dir, split, batch_size):
  """Loads a single dataset with a specific split."""
  known_splits = [
      tfds.Split.TRAIN, tfds.Split.VALIDATION, tfds.Split.TEST,
      tfds.Split('test2')
  ]
  if split in known_splits:
    is_training = split == tfds.Split.TRAIN
  else:
    raise ValueError(
        'Received ambiguous split {}, must set is_training for splits other '
        'than "train", "validation", "test".'.format(split))

  builder = DrugCardiotoxicityDataset(
      split=split,
      data_dir=data_dir,
      is_training=is_training)
  dataset = builder.load(
      batch_size=batch_size).map(lambda x: (x['features'], x['labels']))
  steps = builder.num_examples // batch_size
  if not is_training:
    # Add one step so evaluation covers the final partial batch.
    steps += 1
  return dataset, steps
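A short usage sketch of this helper; the data directory and batch size below are illustrative placeholders rather than values from the original code:

# Illustrative call only: the path and batch size are assumed placeholders.
test2_dataset, test2_steps = load_dataset(
    data_dir='/tmp/cardiotox_data',
    split=tfds.Split('test2'),
    batch_size=128)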
def _info(self) -> tfds.core.DatasetInfo:
  """Returns the `tfds.core.DatasetInfo` object."""
  features = {
      _LABEL_NAME:
          tfds.features.ClassLabel(num_classes=_NUM_CLASSES),
      _NODES_FEATURE_NAME:
          tfds.features.Tensor(
              shape=[_MAX_NODES, _NODE_FEATURE_LENGTH], dtype=tf.float32),
      _EDGES_FEATURE_NAME:
          tfds.features.Tensor(
              shape=[_MAX_NODES, _MAX_NODES, _EDGE_FEATURE_LENGTH],
              dtype=tf.float32),
      _NODE_MASK_FEATURE_NAME:
          tfds.features.Tensor(shape=[_MAX_NODES], dtype=tf.float32),
      _EDGE_MASK_FEATURE_NAME:
          tfds.features.Tensor(
              shape=[_MAX_NODES, _MAX_NODES], dtype=tf.float32),
      _DISTANCE_TO_TRAIN_NAME:
          tfds.features.Tensor(shape=[1], dtype=tf.float32),
      _EXAMPLE_NAME:
          tfds.features.Tensor(shape=[], dtype=tf.string),
  }
  info = tfds.core.DatasetInfo(
      builder=self,
      description=_DESCRIPTION,
      features=tfds.features.FeaturesDict(features),
      homepage='https://www.tensorflow.org/datasets/catalog/cardiotox',
      citation=_CITATION,
      # Note that while metadata seems to be the most appropriate way to store
      # arbitrary info, it will not be printed when printing out the dataset
      # info.
      metadata=tfds.core.MetadataDict(
          max_nodes=_MAX_NODES,
          node_features=_NODE_FEATURE_LENGTH,
          edge_features=_EDGE_FEATURE_LENGTH))

  # Instead of having a single-element shard_lengths, we should really have a
  # list of the number of elements in each file shard in each split.
  split_infos = [
      tfds.core.SplitInfo(
          name=tfds.Split.VALIDATION,
          shard_lengths=[self._num_examples['validation']],
          num_bytes=0,
      ),
      tfds.core.SplitInfo(
          name=tfds.Split.TEST,
          shard_lengths=[self._num_examples['test']],
          num_bytes=0,
      ),
      tfds.core.SplitInfo(
          name=tfds.Split('test2'),
          shard_lengths=[self._num_examples['test2']],
          num_bytes=0,
      ),
      tfds.core.SplitInfo(
          name=tfds.Split.TRAIN,
          shard_lengths=[self._num_examples['train']],
          num_bytes=0,
      ),
  ]
  split_dict = tfds.core.SplitDict(
      split_infos, dataset_name='__drug_cardiotoxicity_dataset_builder')
  info.set_splits(split_dict)
  return info
def main(argv: Sequence[str]):
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')

  tf.io.gfile.makedirs(FLAGS.output_dir)
  logging.info('Saving checkpoints at %s', FLAGS.output_dir)

  tf.random.set_seed(FLAGS.seed)

  if not FLAGS.use_gpu:
    logging.info('Using TPU for training.')
    strategy = utils.get_tpu_strategy(FLAGS.tpu)
  else:
    logging.info('Using GPU for training.')
    strategy = tf.distribute.MirroredStrategy()

  train_dataset, steps_per_epoch = utils.load_dataset(
      FLAGS.data_dir, tfds.Split.TRAIN, FLAGS.batch_size)
  eval_identifiers = ['tune', 'test1', 'test2']
  splits = [tfds.Split.VALIDATION, tfds.Split.TEST, tfds.Split('test2')]
  eval_datasets, steps_per_eval = utils.load_eval_datasets(
      eval_identifiers, splits, FLAGS.data_dir, FLAGS.batch_size)
  logging.info('Steps for eval datasets: %s', steps_per_eval)

  params = utils.ModelParameters(
      num_heads=FLAGS.num_heads,
      num_layers=FLAGS.num_layers,
      message_layer_size=FLAGS.message_layer_size,
      readout_layer_size=FLAGS.readout_layer_size,
      use_gp_layer=FLAGS.use_gp_layer,
      learning_rate=FLAGS.learning_rate,
      num_epochs=FLAGS.num_epochs,
      steps_per_epoch=steps_per_epoch)
  gp_layer_kwargs = dict(
      num_inducing=FLAGS.gp_num_inducing,
      gp_kernel_scale=FLAGS.gp_kernel_scale,
      gp_output_bias=FLAGS.gp_output_bias,
      normalize_input=FLAGS.gp_normalize_input,
      gp_cov_momentum=FLAGS.gp_cov_momentum,
      gp_cov_ridge_penalty=FLAGS.gp_cov_ridge_penalty)

  model_dir = FLAGS.output_dir
  utils.write_params(
      dataclasses.asdict(params), os.path.join(model_dir, 'params.json'))
  utils.write_params(
      gp_layer_kwargs, os.path.join(model_dir, 'gp_layer_kwargs.json'))

  summary_writer = tf.summary.create_file_writer(
      os.path.join(model_dir, 'summaries'))
  run(train_dataset=train_dataset,
      eval_datasets=eval_datasets,
      steps_per_eval=steps_per_eval,
      params=params,
      model_dir=model_dir,
      gp_layer_kwargs=gp_layer_kwargs,
      strategy=strategy,
      summary_writer=summary_writer,
      loss_type=FLAGS.loss_type,
      use_spec_norm=FLAGS.use_spec_norm,
      spec_norm_multiplier=FLAGS.spec_norm_multiplier,
      use_spec_norm_mp=FLAGS.use_spec_norm_mp,
      spec_norm_multiplier_mp=FLAGS.spec_norm_multiplier_mp)
mnist, info = tfds.load(name='mnist', split='train', with_info=True)
print(info)
print(info.homepage)
print(info.features['image'])
print(info.features['label'])
print(info.splits['train'].num_examples)
print(info.splits['test'].num_examples)

# as_supervised=True returns the dataset as (image, label) tuples;
# otherwise each element is a feature dictionary.
mnist1 = tfds.load(name='mnist', as_supervised=True)
for image, label in mnist1['train'].take(2):
  print(image.shape, label.shape)

split = tfds.Split('test')
mnist2 = tfds.load(name='mnist', split=split)
print(mnist2)

# Dataset builder
mnist_builder = tfds.builder('mnist')
mnist_builder.download_and_prepare()
mnist4 = mnist_builder.as_dataset(split=tfds.Split.TRAIN)
print(mnist4)

# EXTRACT
dataset = tfds.load(name="mnist", split="train")
# TRANSFORM (shuffle returns a new dataset, so reassign it)
dataset = dataset.shuffle(100)
# LOAD
for data in dataset.take(1):
  print(data['image'].shape, data['label'])
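Beyond the named splits used above, TFDS also accepts the slicing split syntax, which composes with the same load call. A small illustrative example, not part of the original snippet:

# Load only the first 10% of the training split and the full test split.
mnist_small_train = tfds.load(name='mnist', split='train[:10%]')
mnist_test = tfds.load(name='mnist', split='test')
print(mnist_small_train)
print(mnist_test)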