class DrugCardiotoxicityDatasetTest(tf.test.TestCase, parameterized.TestCase):
    @parameterized.named_parameters(
        ('Train', tfds.Split.TRAIN, True, 6523),
        ('Validation', tfds.Split.VALIDATION, False, 1631),
        ('Test', tfds.Split.TEST, False, 839),
        ('Test2', tfds.Split('test2'), False, 177))
    def testDatasetSize(self, split, is_training, expected_size):
        dataset_builder = ub.datasets.DrugCardiotoxicityDataset(
            split=split, is_training=is_training, shuffle_buffer_size=20)
        self.assertEqual(dataset_builder.num_examples, expected_size)

    @parameterized.named_parameters(
        ('Train', tfds.Split.TRAIN, True),
        ('Validation', tfds.Split.VALIDATION, False),
        ('Test', tfds.Split.TEST, False),
        ('Test2', tfds.Split('test2'), False))
    def testDatasetShape(self, split, is_training):
        batch_size = 128
        dataset_builder = ub.datasets.DrugCardiotoxicityDataset(
            split=split, is_training=is_training, shuffle_buffer_size=20)
        dataset = dataset_builder.load(batch_size=batch_size).take(1)
        element = next(iter(dataset))
        atoms = element['features']['atoms']
        pairs = element['features']['pairs']
        atom_mask = element['features']['atom_mask']
        pair_mask = element['features']['pair_mask']
        molecule_id = element['features']['molecule_id']
        labels = element['labels']

        self.assertEqual(atoms.shape, (batch_size, 60, 27))
        self.assertEqual(pairs.shape, (batch_size, 60, 60, 12))
        self.assertEqual(atom_mask.shape, (batch_size, 60))
        self.assertEqual(pair_mask.shape, (batch_size, 60, 60))
        self.assertEqual(molecule_id.shape, (batch_size, ))
        self.assertEqual(labels.shape, (batch_size, 2))
  def _as_dataset(
      self,
      split: tfds.Split,
      decoders=None,
      read_config=None,
      shuffle_files=False) -> tf.data.Dataset:
    """Constructs a `tf.data.Dataset`."""
    del decoders
    del read_config
    del shuffle_files
    if split == tfds.Split.TRAIN:
      return _build_dataset(
          glob_dir=os.path.join(self._data_dir, self._file_names['train']),
          is_training=True)
    elif split == tfds.Split.VALIDATION:
      return _build_dataset(
          glob_dir=os.path.join(self._data_dir, self._file_names['validation']),
          is_training=False)
    elif split == tfds.Split.TEST:
      return _build_dataset(
          glob_dir=os.path.join(self._data_dir, self._file_names['test']),
          is_training=False)
    elif split == tfds.Split('test2'):
      return _build_dataset(
          glob_dir=os.path.join(self._data_dir, self._file_names['test2']),
          is_training=False)
    raise ValueError('Unsupported split given: {}.'.format(split))
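For context, a minimal sketch of what a helper like `_build_dataset` might look like, assuming the split files are TFRecord shards; the real helper presumably also parses each serialized example into the graph features.

import tensorflow as tf

def _build_dataset(glob_dir: str, is_training: bool) -> tf.data.Dataset:
  # Sketch only: list shard files matching the glob and interleave record reads.
  files = tf.data.Dataset.list_files(glob_dir, shuffle=is_training)
  dataset = files.interleave(
      tf.data.TFRecordDataset, num_parallel_calls=tf.data.AUTOTUNE)
  if is_training:
    # Shuffle example records for the training split only.
    dataset = dataset.shuffle(buffer_size=1024)
  return dataset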
Example #3
def main(argv: Sequence[str]):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    tf.io.gfile.makedirs(FLAGS.output_dir)
    logging.info('Saving checkpoints at %s', FLAGS.output_dir)
    tf.random.set_seed(FLAGS.seed)

    if not FLAGS.use_gpu:
        logging.info('Using TPU for training.')
        strategy = utils.get_tpu_strategy(FLAGS.tpu)
    else:
        logging.info('Using GPU for training.')
        strategy = tf.distribute.MirroredStrategy()

    train_dataset, steps_per_epoch = utils.load_dataset(
        FLAGS.data_dir, tfds.Split.TRAIN, FLAGS.batch_size)

    eval_identifiers = ['tune', 'test1', 'test2']
    splits = [tfds.Split.VALIDATION, tfds.Split.TEST, tfds.Split('test2')]
    eval_datasets, steps_per_eval = utils.load_eval_datasets(
        eval_identifiers, splits, FLAGS.data_dir, FLAGS.batch_size)

    logging.info('Steps for eval datasets: %s', steps_per_eval)
    graph_augmenter = None
    if FLAGS.augmentations:
        graph_augmenter = augmentation_utils.GraphAugment(
            FLAGS.augmentations, FLAGS.aug_ratio, FLAGS.aug_prob,
            FLAGS.perturb_node_features, FLAGS.drop_edges_only,
            FLAGS.perturb_edge_features,
            FLAGS.initialize_edge_features_randomly, FLAGS.mask_mean,
            FLAGS.mask_stddev)

    params = utils.ModelParameters(num_heads=FLAGS.num_heads,
                                   num_layers=FLAGS.num_layers,
                                   message_layer_size=FLAGS.message_layer_size,
                                   readout_layer_size=FLAGS.readout_layer_size,
                                   use_gp_layer=False,
                                   learning_rate=FLAGS.learning_rate,
                                   augmentations=FLAGS.augmentations,
                                   num_epochs=FLAGS.num_epochs,
                                   steps_per_epoch=steps_per_epoch)

    model_dir = FLAGS.output_dir
    utils.write_params(dataclasses.asdict(params),
                       os.path.join(model_dir, 'params.json'))

    summary_writer = tf.summary.create_file_writer(
        os.path.join(model_dir, 'summaries'))
    run(train_dataset=train_dataset,
        eval_datasets=eval_datasets,
        steps_per_eval=steps_per_eval,
        params=params,
        model_dir=model_dir,
        strategy=strategy,
        summary_writer=summary_writer,
        loss_type=FLAGS.loss_type,
        graph_augmenter=graph_augmenter)
  def _split_generators(self, dl_manager):
    """Specify dataset splits, setting up calls to _generate_examples.

    This is the first entrypoint for tfds's download_and_prepare function.

    Args:
      dl_manager: (DownloadManager) Download manager to download the data.

    Returns:
      `list<tfds.core.SplitGenerator>`.
    """
    sys.setrecursionlimit(10000)
    del dl_manager  # Unused.
    return [
        tfds.core.SplitGenerator(
            name=tfds.Split("train"),
            gen_kwargs={"split": "train"}),
    ]
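Since `_split_generators` is the first entrypoint for `download_and_prepare`, a typical driver looks roughly like the sketch below; the builder class name and data directory are hypothetical placeholders.

import tensorflow_datasets as tfds

# Hypothetical driver sketch: `MyDatasetBuilder` and the data_dir are placeholders.
builder = MyDatasetBuilder(data_dir='/tmp/tfds')
builder.download_and_prepare()  # invokes _split_generators, then _generate_examples
train_dataset = builder.as_dataset(split=tfds.Split('train'))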
Example #5
  def _as_dataset(
      self,
      split: tfds.Split,
      decoders=None,
      read_config=None,
      shuffle_files=False) -> tf.data.Dataset:
    """Constructs a `tf.data.Dataset`.

    Args:
      split: `tfds.Split` which subset of the data to read.
      decoders: Unused.
      read_config: Unused.
      shuffle_files: Unused.

    Returns:
      `tf.data.Dataset`
    """
    del decoders
    del read_config
    del shuffle_files
    is_training = False
    if isinstance(split, tfds.core.ReadInstruction):
      logging.warn(
          'ReadInstruction splits are currently not supported. Using '
          'the split name `%s` instead of `%s`.', split.split_name, split)
      split = tfds.Split(split.split_name)
    if split == tfds.Split.TRAIN:
      file_pattern = 'train-*-of-*'
      is_training = True
    elif split == tfds.Split.VALIDATION:
      file_pattern = 'validation-*-of-*'
    elif split == tfds.Split.TEST:
      file_pattern = 'test-*-of-*'
    else:
      raise ValueError('Unsupported split given: {}.'.format(split))
    return _build_dataset(
        glob_dir=os.path.join(self._data_dir, file_pattern),
        is_training=is_training)
Example #6
def load_dataset(data_dir, split, batch_size):
    """Loads a single dataset with specific split."""
    known_splits = [
        tfds.Split.TRAIN, tfds.Split.VALIDATION, tfds.Split.TEST,
        tfds.Split('test2')
    ]
    if split in known_splits:
        is_training = split == tfds.Split.TRAIN
    else:
        raise ValueError(
            'Received ambiguous split {}, must set is_training for splits other '
            'than "train", "validation", "test".'.format(split))

    builder = DrugCardiotoxicityDataset(split=split,
                                        data_dir=data_dir,
                                        is_training=is_training)
    dataset = builder.load(
        batch_size=batch_size).map(lambda x: (x['features'], x['labels']))
    steps = builder.num_examples // batch_size
    if not is_training:
        steps += 1

    return dataset, steps
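A brief usage sketch of `load_dataset`; the data directory path and batch size here are illustrative placeholders.

import tensorflow_datasets as tfds

# Illustrative call only; substitute a real data_dir.
train_dataset, steps_per_epoch = load_dataset(
    data_dir='/path/to/cardiotox', split=tfds.Split.TRAIN, batch_size=128)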
  def _info(self) -> tfds.core.DatasetInfo:
    """Returns the `tfds.core.DatasetInfo` object."""
    features = {
        _LABEL_NAME:
            tfds.features.ClassLabel(num_classes=_NUM_CLASSES),
        _NODES_FEATURE_NAME:
            tfds.features.Tensor(
                shape=[_MAX_NODES, _NODE_FEATURE_LENGTH], dtype=tf.float32),
        _EDGES_FEATURE_NAME:
            tfds.features.Tensor(
                shape=[_MAX_NODES, _MAX_NODES, _EDGE_FEATURE_LENGTH],
                dtype=tf.float32),
        _NODE_MASK_FEATURE_NAME:
            tfds.features.Tensor(shape=[_MAX_NODES], dtype=tf.float32),
        _EDGE_MASK_FEATURE_NAME:
            tfds.features.Tensor(
                shape=[_MAX_NODES, _MAX_NODES], dtype=tf.float32),
        _DISTANCE_TO_TRAIN_NAME:
            tfds.features.Tensor(shape=[1], dtype=tf.float32),
        _EXAMPLE_NAME:
            tfds.features.Tensor(shape=[], dtype=tf.string),
    }
    info = tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=tfds.features.FeaturesDict(features),
        homepage='https://www.tensorflow.org/datasets/catalog/cardiotox',
        citation=_CITATION,
        # Note that while metadata seems to be the most appropriate way to store
        # arbitrary info, it will not be printed when printing out the dataset
        # info.
        metadata=tfds.core.MetadataDict(max_nodes=_MAX_NODES,
                                        node_features=_NODE_FEATURE_LENGTH,
                                        edge_features=_EDGE_FEATURE_LENGTH))

    # Instead of having a single element shard_lengths, we should really have a
    # list of the number of elements in each file shard in each split.
    split_infos = [
        tfds.core.SplitInfo(
            name=tfds.Split.VALIDATION,
            shard_lengths=[self._num_examples['validation']],
            num_bytes=0,
        ),
        tfds.core.SplitInfo(
            name=tfds.Split.TEST,
            shard_lengths=[self._num_examples['test']],
            num_bytes=0,
        ),
        tfds.core.SplitInfo(
            name=tfds.Split('test2'),
            shard_lengths=[self._num_examples['test2']],
            num_bytes=0,
        ),
        tfds.core.SplitInfo(
            name=tfds.Split.TRAIN,
            shard_lengths=[self._num_examples['train']],
            num_bytes=0,
        ),
    ]
    split_dict = tfds.core.SplitDict(
        split_infos, dataset_name='__drug_cardiotoxicity_dataset_builder')
    info.set_splits(split_dict)
    return info
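As the comments above note, the metadata is not printed with the rest of the dataset info, but it can still be read back from the builder; a small sketch, assuming `builder` is an instance of this dataset builder:

# `builder` is assumed to be an instance of the dataset builder defined above.
info = builder.info
print(info.metadata['max_nodes'])         # value stored as _MAX_NODES
print(info.splits['train'].num_examples)  # per-split counts from set_splits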
Example #8
def main(argv: Sequence[str]):
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')

  tf.io.gfile.makedirs(FLAGS.output_dir)
  logging.info('Saving checkpoints at %s', FLAGS.output_dir)
  tf.random.set_seed(FLAGS.seed)

  if not FLAGS.use_gpu:
    logging.info('Using TPU for training.')
    strategy = utils.get_tpu_strategy(FLAGS.tpu)
  else:
    logging.info('Using GPU for training.')
    strategy = tf.distribute.MirroredStrategy()

  train_dataset, steps_per_epoch = utils.load_dataset(FLAGS.data_dir,
                                                      tfds.Split.TRAIN,
                                                      FLAGS.batch_size)

  eval_identifiers = ['tune', 'test1', 'test2']
  splits = [tfds.Split.VALIDATION, tfds.Split.TEST, tfds.Split('test2')]
  eval_datasets, steps_per_eval = utils.load_eval_datasets(
      eval_identifiers, splits, FLAGS.data_dir, FLAGS.batch_size)

  logging.info('Steps for eval datasets: %s', steps_per_eval)

  params = utils.ModelParameters(
      num_heads=FLAGS.num_heads,
      num_layers=FLAGS.num_layers,
      message_layer_size=FLAGS.message_layer_size,
      readout_layer_size=FLAGS.readout_layer_size,
      use_gp_layer=FLAGS.use_gp_layer,
      learning_rate=FLAGS.learning_rate,
      num_epochs=FLAGS.num_epochs,
      steps_per_epoch=steps_per_epoch)

  gp_layer_kwargs = dict(
      num_inducing=FLAGS.gp_num_inducing,
      gp_kernel_scale=FLAGS.gp_kernel_scale,
      gp_output_bias=FLAGS.gp_output_bias,
      normalize_input=FLAGS.gp_normalize_input,
      gp_cov_momentum=FLAGS.gp_cov_momentum,
      gp_cov_ridge_penalty=FLAGS.gp_cov_ridge_penalty)

  model_dir = FLAGS.output_dir
  utils.write_params(
      dataclasses.asdict(params), os.path.join(model_dir, 'params.json'))
  utils.write_params(gp_layer_kwargs,
                     os.path.join(model_dir, 'gp_layer_kwargs.json'))
  summary_writer = tf.summary.create_file_writer(
      os.path.join(model_dir, 'summaries'))
  run(train_dataset=train_dataset,
      eval_datasets=eval_datasets,
      steps_per_eval=steps_per_eval,
      params=params,
      model_dir=model_dir,
      gp_layer_kwargs=gp_layer_kwargs,
      strategy=strategy,
      summary_writer=summary_writer,
      loss_type=FLAGS.loss_type,
      use_spec_norm=FLAGS.use_spec_norm,
      spec_norm_multiplier=FLAGS.spec_norm_multiplier,
      use_spec_norm_mp=FLAGS.use_spec_norm_mp,
      spec_norm_multiplier_mp=FLAGS.spec_norm_multiplier_mp)
import tensorflow_datasets as tfds

mnist, info = tfds.load(name='mnist', split='train', with_info=True)
print(info)

print(info.homepage)
print(info.features['image'])
print(info.features['label'])
print(info.splits['train'].num_examples)
print(info.splits['test'].num_examples)

# as_supervised=True returns the dataset as (image, label) tuples; otherwise a dictionary of features is returned
mnist1 = tfds.load(name='mnist', as_supervised=True)
for image, label in mnist1['train'].take(2):
    print(image.shape, label.shape)

split = tfds.Split('test')
mnist2 = tfds.load(name='mnist', split=split)
print(mnist2)

# dataset builder
mnist_builder = tfds.builder('mnist')
mnist_builder.download_and_prepare()
mnist4 = mnist_builder.as_dataset(split=tfds.Split.TRAIN)
print(mnist4)

# EXTRACT
dataset = tfds.load(name="mnist", split="train")
# TRANSFORM
dataset = dataset.shuffle(100)
# LOAD
for data in dataset.take(1):
    print(data['image'].shape, data['label'])
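A fuller TRANSFORM stage typically chains `shuffle`, `batch`, and `prefetch`, reassigning the result since `tf.data` transformations are not in-place; a minimal sketch:

import tensorflow as tf
import tensorflow_datasets as tfds

# Chained EXTRACT -> TRANSFORM -> LOAD pipeline.
dataset = (tfds.load(name='mnist', split='train')
           .shuffle(1024)
           .batch(32)
           .prefetch(tf.data.AUTOTUNE))
for batch in dataset.take(1):
    print(batch['image'].shape, batch['label'].shape)  # (32, 28, 28, 1) (32,)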