def train_and_evaluate(lr=0.001, weight_decay=2, batch_size=BATCH_SIZE):
    hvd.init()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = FF_NN(num_features=num_features,
                  num_classes=2,
                  drop_prob=drop_prob,
                  embedding_table_shapes=embeddings,
                  num_continuous=num_continuous,
                  emb_dropout=emb_dropout)
    criterion = torch.nn.CrossEntropyLoss()

    # All model parameters are optimized.
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=weight_decay)

    # Decay LR by a factor of 0.1 every 7 epochs
    exp_lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

    with BatchedDataLoader(make_batch_reader(dataset_url_or_urls='file:///dbfs/tmp/assembled_t',
                                             num_epochs=None,
                                             transform_spec=None,
                                             shuffle_row_groups=False,
                                             workers_count=8,
                                             cur_shard=hvd.rank(),
                                             shard_count=hvd.size()),
                           batch_size=batch_size) as train_dataloader, \
         BatchedDataLoader(make_batch_reader(dataset_url_or_urls='file:///dbfs/tmp/assembled_v',
                                             num_epochs=None,
                                             transform_spec=None,
                                             shuffle_row_groups=False,
                                             workers_count=8,
                                             cur_shard=hvd.rank(),
                                             shard_count=hvd.size()),
                           batch_size=batch_size) as val_dataloader:
        train_dataloader_iter = iter(train_dataloader)
        steps_per_epoch = train_df_size // batch_size

        val_dataloader_iter = iter(val_dataloader)
        validation_steps = max(1, val_df_size // batch_size)

        for epoch in range(NUM_EPOCHS):
            print('Epoch {}/{}'.format(epoch + 1, NUM_EPOCHS))
            print('-' * 10)

            train_loss, train_acc = train_one_epoch(model, optimizer, exp_lr_scheduler,
                                                    train_dataloader_iter,
                                                    steps_per_epoch, epoch, device)
            val_loss, val_acc, val_f1 = evaluate(model, val_dataloader_iter,
                                                 validation_steps, device)

        return val_loss

def setup(self, stage=None):
    # Assign train/val datasets for use in dataloaders
    from petastorm import make_batch_reader

    if stage == 'fit' or stage is None:
        self.train_reader = make_batch_reader(self.train_dir,
                                              num_epochs=self.num_reader_epochs,
                                              cur_shard=self.cur_shard,
                                              shard_count=self.shard_count,
                                              hdfs_driver='libhdfs',
                                              schema_fields=self.schema_fields,
                                              storage_options=self.storage_options)
        if self.has_val:
            self.val_reader = make_batch_reader(self.val_dir,
                                                num_epochs=self.num_reader_epochs,
                                                cur_shard=self.cur_shard,
                                                shard_count=self.shard_count,
                                                hdfs_driver='libhdfs',
                                                schema_fields=self.schema_fields,
                                                storage_options=self.storage_options)

def training(epochs):
    for epoch in range(epochs):
        epoch_loss_avg = tf.keras.metrics.Mean()
        epoch_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

        with make_batch_reader(train_path, num_epochs=5) as reader_train:
            train_dataset = Input.get_dataset(reader_train,
                                              shuffle=10,
                                              batch=model.config.batch_size)

            # Training loop over batches of model.config.batch_size examples
            for x, y in train_dataset:
                # Optimize the model
                loss_value = train_step(model, optimizer, x, y)

                # Track progress
                epoch_loss_avg.update_state(loss_value)  # Add current batch loss
                # Compare predicted label to actual label
                # training=True is needed only if there are layers with different
                # behavior during training versus inference (e.g. Dropout).
                epoch_accuracy.update_state(y, model(x, training=True))

        # End of epoch: record the metrics
        train_loss_results.append(epoch_loss_avg.result())
        train_accuracy_results.append(epoch_accuracy.result())

        tf.print("Finished epoch", epoch)
        if epoch % 2 == 0:
            tf.print("Epoch {:03d}: Loss: {:.3f}, Accuracy: {:.3%}".format(
                epoch, epoch_loss_avg.result(), epoch_accuracy.result()))

def test_mem_cache_num_epochs_without_mem_cache_error(two_columns_non_petastorm_dataset):
    error_string = "num_epochs should not be specified when inmemory_cache_all is not enabled."
    with make_batch_reader(two_columns_non_petastorm_dataset.url, num_epochs=1) as reader:
        with pytest.raises(ValueError, match=error_string):
            BatchedDataLoader(reader, num_epochs=2)

def training2(epochs, num_of_iters):
    epoch_loss_avg = tf.keras.metrics.Mean()
    epoch_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

    with make_batch_reader(train_path, num_epochs=None) as reader_train:
        train_dataset = Input.get_dataset(reader_train,
                                          shuffle=10,
                                          batch=model.config.batch_size)
        epoch_steps = 0
        for epoch in range(epochs):
            for iteration, (input, target) in enumerate(train_dataset):
                # Break if the number of computed batches exceeds the
                # total number of the examples
                if iteration % num_of_iters == 0 and iteration > 0:
                    epoch_steps += 1
                    train_loss_results.append(epoch_loss_avg.result())
                    train_accuracy_results.append(epoch_accuracy.result())
                    if epoch_steps % 2 == 0:
                        print("Epoch {:03d}: Loss: {:.3f}, Accuracy: {:.3%}".format(
                            epoch_steps, epoch_loss_avg.result(), epoch_accuracy.result()))
                    break

                # Perform one training step
                loss_value = train_step(model, optimizer, input, target)

                # Track progress
                epoch_loss_avg.update_state(loss_value)  # Add current batch loss
                # Compare predicted label to actual label
                # training=True is needed only if there are layers with different
                # behavior during training versus inference (e.g. Dropout).
                epoch_accuracy.update_state(target, model(input, training=True))

def test_torch_dataloader_advanced_params(mock_torch_make_batch_reader, test_ctx):
    SHARD_COUNT = 3
    df = test_ctx.spark.range(100).repartition(SHARD_COUNT)
    conv = make_spark_converter(df)
    mock_torch_make_batch_reader.return_value = make_batch_reader(conv.cache_dir_url)

    with conv.make_torch_dataloader(reader_pool_type='dummy',
                                    cur_shard=1,
                                    shard_count=SHARD_COUNT) as _:
        pass
    peta_args = mock_torch_make_batch_reader.call_args.kwargs
    assert peta_args['reader_pool_type'] == 'dummy' and \
        peta_args['cur_shard'] == 1 and \
        peta_args['shard_count'] == SHARD_COUNT and \
        peta_args['num_epochs'] is None and \
        peta_args['workers_count'] == 4

    # Test overriding the default argument values.
    with conv.make_torch_dataloader(num_epochs=1, workers_count=2) as _:
        pass
    peta_args = mock_torch_make_batch_reader.call_args.kwargs
    assert peta_args['num_epochs'] == 1 and peta_args['workers_count'] == 2

def __enter__(self):
    # import locally to avoid importing tensorflow globally.
    from petastorm.tf_utils import make_petastorm_dataset
    import tensorflow.compat.v1 as tf  # pylint: disable=import-error

    _wait_file_available(self.parquet_file_url_list)
    self.reader = make_batch_reader(self.parquet_file_url_list, **self.petastorm_reader_kwargs)

    # unroll dataset
    dataset = make_petastorm_dataset(self.reader).flat_map(
        tf.data.Dataset.from_tensor_slices)

    # TODO: auto tune best batch size in default case.
    batch_size = self.batch_size or 32
    dataset = dataset.batch(batch_size=batch_size)

    prefetch = self.prefetch
    if prefetch is None:
        if LooseVersion(tf.__version__) >= LooseVersion('1.14'):
            # We can make prefetch optimization
            prefetch = tf.data.experimental.AUTOTUNE
        else:
            prefetch = 1
    dataset = dataset.prefetch(prefetch)

    return dataset

def __init__(self, data_url, batch_size, prefetch, preproc_fn, preproc_parallelism):
    """
    :param data_url: A string specifying the data URL.
    :param batch_size: batch size of the generated tf.data.dataset
    :param prefetch: prefetch for tf dataset
    :param preproc_fn: preprocessing function
    :param preproc_parallelism: parallelism for preprocessing function
    """
    from petastorm.tf_utils import make_petastorm_dataset
    import tensorflow as tf

    def support_prefetch_and_autotune():
        return LooseVersion(tf.__version__) >= LooseVersion('1.14')

    self.reader = petastorm.make_batch_reader(data_url)
    self.dataset = make_petastorm_dataset(self.reader) \
        .flat_map(tf.data.Dataset.from_tensor_slices)
    self.dataset = self.dataset.batch(batch_size=batch_size)

    if support_prefetch_and_autotune():
        if prefetch is None:
            prefetch = tf.data.experimental.AUTOTUNE
        if prefetch != 0:
            self.dataset = self.dataset.prefetch(prefetch)

    if preproc_fn is not None:
        if preproc_parallelism is None:
            if support_prefetch_and_autotune():
                preproc_parallelism = tf.data.experimental.AUTOTUNE
            else:
                preproc_parallelism = 1
        self.dataset = self.dataset.map(preproc_fn, preproc_parallelism)

def python_hello_world(dataset_url='file:///tmp/external_dataset'):
    # Reading data from the non-Petastorm Parquet via pure Python
    with make_batch_reader(dataset_url, schema_fields=["id", "value1", "value2"]) as reader:
        for schema_view in reader:
            # make_batch_reader() returns batches of rows instead of individual rows
            print("Batched read:\nid: {0} value1: {1} value2: {2}".format(
                schema_view.id, schema_view.value1, schema_view.value2))

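# A minimal sketch (not from the original snippets) of generating the plain,
# non-Petastorm Parquet dataset that python_hello_world() above reads. The
# column names id/value1/value2 and the /tmp/external_dataset path follow the
# defaults in that function; the helper name, row count, and file layout are
# assumptions. Requires pandas plus pyarrow (or fastparquet) for Parquet support.
import os

import numpy as np
import pandas as pd


def generate_external_dataset(output_dir='/tmp/external_dataset', num_rows=100):
    os.makedirs(output_dir, exist_ok=True)
    df = pd.DataFrame({
        'id': np.arange(num_rows),
        'value1': np.random.rand(num_rows),
        'value2': np.random.rand(num_rows),
    })
    # Write a single Parquet file that make_batch_reader() can pick up.
    df.to_parquet(os.path.join(output_dir, 'part-00000.parquet'), index=False)
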
def __enter__(self):
    from petastorm.pytorch import DataLoader

    _wait_file_available(self.parquet_file_url_list)
    self.reader = make_batch_reader(self.parquet_file_url_list, **self.petastorm_reader_kwargs)
    self.loader = DataLoader(reader=self.reader, batch_size=self.batch_size)
    return self.loader

def __init__(self, data_url):
    """
    :param data_url: A string specifying the data URL.
    """
    from petastorm.tf_utils import make_petastorm_dataset

    self.reader = make_batch_reader(data_url)
    self.dataset = make_petastorm_dataset(self.reader)

def tensorflow_hello_world(dataset_url='file:///tmp/external_dataset'):
    # Example: tf_tensors will return tensors with dataset data
    with make_batch_reader(dataset_url) as reader:
        tensor = tf_tensors(reader)
        with tf.Session() as sess:
            # Because we are using make_batch_reader(), each read returns a batch of rows
            # instead of a single row
            batched_sample = sess.run(tensor)
            print("id batch: {0}".format(batched_sample.id))

    # Example: use tf.data.Dataset API
    with make_batch_reader(dataset_url) as reader:
        dataset = make_petastorm_dataset(reader)
        iterator = dataset.make_one_shot_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            batched_sample = sess.run(tensor)
            print("id batch: {0}".format(batched_sample.id))

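# A hedged TF 2.x / eager-mode sketch of the same read as tensorflow_hello_world()
# above; it assumes make_batch_reader and make_petastorm_dataset are imported as in
# that snippet and that TF 2.x is installed. In eager mode the tf.data.Dataset can
# be iterated directly, without tf.Session or one-shot iterators.
def tensorflow2_hello_world(dataset_url='file:///tmp/external_dataset'):
    with make_batch_reader(dataset_url) as reader:
        dataset = make_petastorm_dataset(reader)
        for batched_sample in dataset.take(1):
            # Each element is a named tuple of batched tensors, one per column.
            print("id batch: {0}".format(batched_sample.id))
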
def get_data_loader(data_path: str = None, num_epochs: int = 1, batch_size: int = 16):
    if not data_path:
        return None
    return DataLoader(make_batch_reader(dataset_url=data_path, num_epochs=num_epochs),
                      batch_size=batch_size)

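# Hypothetical usage of get_data_loader() above; the data path is an assumption.
# petastorm's pytorch DataLoader is a context manager, so exiting the with-block
# also closes the underlying reader.
def iterate_once(data_path='file:///tmp/train_data'):
    loader = get_data_loader(data_path, num_epochs=1, batch_size=32)
    if loader is None:
        return
    with loader:
        for batch in loader:
            # Each batch is a dict mapping column names to torch tensors.
            print({name: tensor.shape for name, tensor in batch.items()})
            break
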
def test_make_batch_reader_with_url_list(scalar_dataset):
    url_list = _get_local_fs_url_list(scalar_dataset.url)
    url_list = list(filter(lambda x: x.endswith('.parquet'), url_list))
    with make_batch_reader(url_list, workers_count=1) as reader:
        row_count = 0
        for batch in reader:
            row_count += len(batch.id)
        assert row_count == 100

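# A possible implementation sketch of the _get_local_fs_url_list() helper used in
# the test above (the real helper lives in the test utilities and may differ);
# it assumes the dataset URL points at a local file:// directory.
import glob
from urllib.parse import urlparse


def _get_local_fs_url_list(dataset_url):
    path = urlparse(dataset_url).path
    # Return file:// URLs for everything under the dataset directory; the caller
    # filters for '.parquet' suffixes.
    return ['file://' + f for f in sorted(glob.glob(path + '/**/*', recursive=True))]
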
def test_transform_function_batched(scalar_dataset):
    def double_float64(sample):
        sample['float64'] *= 2
        return sample

    with make_batch_reader(scalar_dataset.url,
                           transform_spec=TransformSpec(double_float64)) as reader:
        actual = next(reader)
        for actual_id, actual_float64 in zip(actual.id, actual.float64):
            original_sample = next(d for d in scalar_dataset.data if d['id'] == actual_id)
            expected_matrix = original_sample['float64'] * 2
            np.testing.assert_equal(expected_matrix, actual_float64)

def test_simple_read_tensorflow_with_non_petastorm_many_columns_dataset(many_columns_non_petastorm_dataset):
    """Read a couple of rows. Make sure all tensors have static shape sizes assigned
    and the data matches reference data"""
    with make_batch_reader(dataset_url_or_urls=many_columns_non_petastorm_dataset.url) as reader:
        row_tensors = tf_tensors(reader)

        # Make sure we have static shape info for all fields
        for column in row_tensors:
            assert column.get_shape().as_list() == [None]

        with _tf_session() as sess:
            batch = sess.run(row_tensors)._asdict()

        assert set(batch.keys()) == set(many_columns_non_petastorm_dataset.data[0].keys())

def __enter__(self):
    from petastorm.pytorch import DataLoader

    _wait_file_available(self.parquet_file_url_list)
    self.reader = make_batch_reader(self.parquet_file_url_list, **self.petastorm_reader_kwargs)
    data_loader_fn = self.data_loader_fn or DataLoader
    self.loader = data_loader_fn(reader=self.reader,
                                 batch_size=self.batch_size,
                                 shuffling_queue_capacity=self.shuffling_queue_capacity)
    return self.loader

def test_transform_function_batched_deleting_column(scalar_dataset):
    def delete_float64(sample):
        del sample['float64']
        return sample

    with make_batch_reader(scalar_dataset.url,
                           transform_spec=TransformSpec(delete_float64,
                                                        removed_fields=['float64'])) as reader:
        actual = next(reader)
        assert 'float64' not in actual._fields

def __init__(self, url, features, training_set_metadata):
    self.url = url
    self.training_set_metadata = training_set_metadata

    with make_batch_reader(self.url) as reader:
        self.size = reader.dataset.metadata.num_rows

    self.reshape_features = {
        feature[PROC_COLUMN]: list((-1, *training_set_metadata[feature[NAME]]['reshape']))
        for feature in features
        if 'reshape' in training_set_metadata[feature[NAME]]
    }

def test_with_batch_reader(scalar_dataset, shuffling_queue_capacity, data_loader_type):
    """See if we are getting correct batch sizes when using DataLoader with make_batch_reader"""
    pytorch_compatible_fields = [k for k, v in scalar_dataset.data[0].items()
                                 if not isinstance(v, (np.datetime64, np.unicode_))]
    with data_loader_type(make_batch_reader(scalar_dataset.url, schema_fields=pytorch_compatible_fields),
                          batch_size=3,
                          shuffling_queue_capacity=shuffling_queue_capacity) as loader:
        batches = list(loader)
        assert len(scalar_dataset.data) == sum(batch['id'].shape[0] for batch in batches)

        # list types are broken in pyarrow 0.15.0. Don't test list-of-int field
        if pa.__version__ != '0.15.0':
            assert len(scalar_dataset.data) == sum(batch['int_fixed_size_list'].shape[0] for batch in batches)
            assert batches[0]['int_fixed_size_list'].shape[1] == len(scalar_dataset.data[0]['int_fixed_size_list'])

def test_pyarrow_filters_make_batch_reader():
    path = tempfile.mkdtemp()
    url = 'file://' + path
    create_test_scalar_dataset(url, 3000, partition_by=['id_div_700'])

    with make_batch_reader(url, filters=[('id_div_700', '=', 2)]) as reader:
        uv = set()
        for data in reader:
            for _id_div_700 in data.id_div_700:
                uv.add(_id_div_700)
        assert uv == {2}

def _init_petaloader(self):
    def _transform_row(df_batch):
        return df_batch

    transform = TransformSpec(_transform_row, removed_fields=['cat_id', 'store_id', 'state_id'])
    reader = make_batch_reader(self.filename,
                               schema_fields=['id', 'item_id', 'dept_id', 'cat_id', 'day_id',
                                              'sales', 'day_date_str', 'month_id', 'date',
                                              'wm_yr_wk', 'snap_flag', 'sell_price',
                                              'sales_dollars', 'store_id', 'state_id'],
                               workers_count=1
                               # , transform_spec=transform
                               )
    return PetaDataLoader(reader=reader, batch_size=128, shuffling_queue_capacity=100000)

def load_data(data: Union[str, Sequence[str]],
              ignore: Optional[Sequence[str]] = None,
              indices: Optional[Sequence[int]] = None,
              **kwargs) -> pd.DataFrame:
    _assert_petastorm_installed()
    with petastorm.make_batch_reader(data) as reader:
        shards = [pd.DataFrame(batch._asdict()) for batch in reader]
        local_df = pd.concat(shards, copy=False)
        if ignore:
            local_df = local_df[local_df.columns.difference(ignore)]
        return local_df

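# Hypothetical usage of load_data() above; the dataset URL and the ignored
# column name are assumptions, chosen only to illustrate the call signature.
def load_data_example():
    df = load_data('file:///tmp/scalar_dataset', ignore=['string_col'])
    print(df.shape)
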
def test_transform_function_with_predicate_batched(scalar_dataset):
    def double_float64(sample):
        assert all(sample['id'] % 2 == 0)
        sample['float64'] *= 2
        return sample

    with make_batch_reader(scalar_dataset.url,
                           transform_spec=TransformSpec(double_float64),
                           predicate=in_lambda(['id'], lambda id: id % 2 == 0)) as reader:
        actual = next(reader)
        for actual_id, actual_float64 in zip(actual.id, actual.float64):
            assert actual_id % 2 == 0
            original_sample = next(d for d in scalar_dataset.data if d['id'] == actual_id)
            expected_matrix = original_sample['float64'] * 2
            np.testing.assert_equal(expected_matrix, actual_float64)

def __init__(self, url, features, training_set_metadata):
    self.url = to_url(url)
    self.features = [feature[PROC_COLUMN] for feature in features]
    self.training_set_metadata = training_set_metadata

    with make_batch_reader(self.url) as reader:
        self.size = sum(piece.get_metadata().num_rows for piece in reader.dataset.pieces)

    self.reshape_features = {
        feature[PROC_COLUMN]: list((-1, *training_set_metadata[feature[NAME]]['reshape']))
        for feature in features
        if 'reshape' in training_set_metadata[feature[NAME]]
    }

def test_simple_read_tensorflow_with_parquet_dataset(scalar_dataset):
    """Read a couple of rows. Make sure all tensors have static shape sizes assigned
    and the data matches reference data"""
    with make_batch_reader(dataset_url=scalar_dataset.url) as reader:
        row_tensors = tf_tensors(reader)

        # Make sure we have static shape info for all fields
        for column in row_tensors:
            assert column.get_shape().as_list() == [None]

        with _tf_session() as sess:
            for _ in range(2):
                batch = sess.run(row_tensors)._asdict()
                for i, id_value in enumerate(batch['id']):
                    expected_row = next(d for d in scalar_dataset.data if d['id'] == id_value)
                    for field_name in expected_row.keys():
                        _assert_fields_eq(batch[field_name][i], expected_row[field_name])

def test_non_petastorm_with_many_colums_with_one_shot_iterator(many_columns_non_petastorm_dataset):
    """Reads a bunch of entries and compares all values to the expected values"""
    with make_batch_reader(many_columns_non_petastorm_dataset.url, workers_count=1) as reader:
        dataset = make_petastorm_dataset(reader)
        iterator = dataset.make_one_shot_iterator()

        # Make sure we have static shape info for all fields
        for shape in dataset.output_shapes:
            # TODO(yevgeni): check that the shapes are actually correct, not just not None
            assert shape.dims is not None

        # Read a bunch of entries from the dataset and compare the data to reference
        with tf.Session() as sess:
            iterator = iterator.get_next()
            sample = sess.run(iterator)._asdict()

            assert set(sample.keys()) == set(many_columns_non_petastorm_dataset.data[0].keys())

def __init__(self, data_url, batch_size, num_epochs, workers_count, cur_shard,
             shard_count, **petastorm_reader_kwargs):
    """
    :param data_url: A string specifying the data URL.
    See `SparkDatasetConverter.make_torch_dataloader()` for the definitions
    of the other parameters.
    """
    from petastorm.pytorch import DataLoader

    petastorm_reader_kwargs["num_epochs"] = num_epochs
    if workers_count is not None:
        petastorm_reader_kwargs["workers_count"] = workers_count
    petastorm_reader_kwargs["cur_shard"] = cur_shard
    petastorm_reader_kwargs["shard_count"] = shard_count

    self.reader = petastorm.make_batch_reader(data_url, **petastorm_reader_kwargs)
    self.loader = DataLoader(reader=self.reader, batch_size=batch_size)

def get_dataloader(self, dataset: Dataset, identity: str = "Default"):
    batch_preprocessor = self.build_batch_preprocessor()
    reader_options = self.reader_options
    assert reader_options
    data_reader = make_batch_reader(
        # pyre-fixme[16]: `HiveDataSetClass` has no attribute `parquet_url`.
        dataset.parquet_url,
        num_epochs=1,
        reader_pool_type=reader_options.petastorm_reader_pool_type,
    )
    # NOTE: must be wrapped by DataLoaderWrapper to call __exit__() on end of epoch
    dataloader = DataLoader(
        data_reader,
        batch_size=reader_options.minibatch_size,
        collate_fn=collate_and_preprocess(
            batch_preprocessor=batch_preprocessor, use_gpu=False),
    )
    return _closing_iter(dataloader)

def test_transform_function_new_field_batched(scalar_dataset):
    def double_float64(sample):
        sample['new_float64'] = sample['float64'] * 2
        del sample['float64']
        return sample

    with make_batch_reader(scalar_dataset.url,
                           reader_pool_type='dummy',
                           transform_spec=TransformSpec(double_float64,
                                                        [('new_float64', np.float64, (), False)],
                                                        ['float64'])) as reader:
        row_tensors = tf_tensors(reader)
        with _tf_session() as sess:
            actual = sess.run(row_tensors)

        for actual_id, actual_float64 in zip(actual.id, actual.new_float64):
            original_sample = next(d for d in scalar_dataset.data if d['id'] == actual_id)
            expected = original_sample['float64'] * 2
            np.testing.assert_equal(expected, actual_float64)