Code example #1
File: petastorm.py Project: afogarty85/petastorm
def train_and_evaluate(lr=0.001, weight_decay=2, batch_size=BATCH_SIZE):
    hvd.init()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = FF_NN(num_features=num_features,
                  num_classes=2,
                  drop_prob=drop_prob,
                  embedding_table_shapes=embeddings,
                  num_continuous=num_continuous,
                  emb_dropout=emb_dropout)
    criterion = torch.nn.CrossEntropyLoss()

    # Optimize all model parameters with SGD.
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=lr,
                                momentum=0.9,
                                weight_decay=weight_decay)

    # Decay LR by a factor of 0.1 every 7 epochs
    exp_lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                       step_size=7,
                                                       gamma=0.1)

    with BatchedDataLoader(make_batch_reader(dataset_url_or_urls='file:///dbfs/tmp/assembled_t',
                                             num_epochs=None,
                                             transform_spec=None,
                                             shuffle_row_groups=False,
                                             workers_count=8,
                                             cur_shard=hvd.rank(),
                                             shard_count=hvd.size()),
                           batch_size=BATCH_SIZE) as train_dataloader, \
         BatchedDataLoader(make_batch_reader(dataset_url_or_urls='file:///dbfs/tmp/assembled_v',
                                             num_epochs=None,
                                             transform_spec=None,
                                             shuffle_row_groups=False,
                                             workers_count=8,
                                             cur_shard=hvd.rank(),
                                             shard_count=hvd.size()),
                           batch_size=BATCH_SIZE) as val_dataloader:

        train_dataloader_iter = iter(train_dataloader)
        steps_per_epoch = train_df_size // BATCH_SIZE

        val_dataloader_iter = iter(val_dataloader)
        validation_steps = max(1, val_df_size // (BATCH_SIZE))

        for epoch in range(NUM_EPOCHS):
            print('Epoch {}/{}'.format(epoch + 1, NUM_EPOCHS))
            print('-' * 10)

            train_loss, train_acc = train_one_epoch(model, optimizer,
                                                    exp_lr_scheduler,
                                                    train_dataloader_iter,
                                                    steps_per_epoch, epoch,
                                                    device)
            val_loss, val_acc, val_f1 = evaluate(model, val_dataloader_iter,
                                                 validation_steps, device)

    return val_loss
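The function above assumes it is invoked once per Horovod worker. A minimal launch sketch, assuming horovod[spark] is installed and a SparkSession is already running (the launch mechanism is not part of the original snippet):

import horovod.spark

# Runs train_and_evaluate once per Spark task; inside the function,
# hvd.rank()/hvd.size() select the matching Petastorm shard.
# horovod.spark.run returns a list with one return value per worker.
results = horovod.spark.run(train_and_evaluate, num_proc=2)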
Code example #2
    def setup(self, stage=None):
        # Assign train/val datasets for use in dataloaders
        from petastorm import make_batch_reader
        if stage == 'fit' or stage is None:
            self.train_reader = make_batch_reader(self.train_dir, num_epochs=self.num_reader_epochs,
                                                  cur_shard=self.cur_shard, shard_count=self.shard_count,
                                                  hdfs_driver='libhdfs',
                                                  schema_fields=self.schema_fields,
                                                  storage_options=self.storage_options)
            if self.has_val:
                self.val_reader = make_batch_reader(self.val_dir, num_epochs=self.num_reader_epochs,
                                                    cur_shard=self.cur_shard, shard_count=self.shard_count,
                                                    hdfs_driver='libhdfs',
                                                    schema_fields=self.schema_fields,
                                                    storage_options=self.storage_options)
def training(epochs):
    for epoch in range(epochs):
        epoch_loss_avg = tf.keras.metrics.Mean()
        epoch_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()
        with make_batch_reader(train_path, num_epochs=5) as reader_train:
            train_dataset = Input.get_dataset(
                reader_train,
                shuffle=10,
                batch=model.config.batch_size)

            # Training loop over batches from the Petastorm reader (batch size = model.config.batch_size)
            for x, y in train_dataset:
                # Optimize the model
                loss_value = train_step(model, optimizer, x, y)

                # Track progress
                epoch_loss_avg.update_state(loss_value)  # Add current batch loss
                # Compare predicted label to actual label
                # training=True is needed only if there are layers with different
                # behavior during training versus inference (e.g. Dropout).
                epoch_accuracy.update_state(y, model(x, training=True))

            # End epoch
            train_loss_results.append(epoch_loss_avg.result())
            train_accuracy_results.append(epoch_accuracy.result())

            tf.print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", epoch)

            if epoch % 2 == 0:
                tf.print("Epoch {:03d}: Loss: {:.3f}, Accuracy: {:.3%}".format(epoch,
                                                                            epoch_loss_avg.result(),
                                                                            epoch_accuracy.result()))
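Input.get_dataset is not shown in the snippet above. A minimal sketch of what such a helper might look like, assuming it simply wraps petastorm.tf_utils.make_petastorm_dataset with a shuffle buffer and a batch size:

from petastorm.tf_utils import make_petastorm_dataset
import tensorflow as tf

def get_dataset(reader, shuffle, batch):
    # Hypothetical helper: unroll the Parquet row-group batches into single rows,
    # shuffle with a small buffer, then re-batch to the requested size.
    return make_petastorm_dataset(reader) \
        .flat_map(tf.data.Dataset.from_tensor_slices) \
        .shuffle(shuffle) \
        .batch(batch)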
Code example #4
def test_mem_cache_num_epochs_without_mem_cache_error(
        two_columns_non_petastorm_dataset):
    error_string = "num_epochs should not be specified when inmemory_cache_all is not enabled."
    with make_batch_reader(two_columns_non_petastorm_dataset.url,
                           num_epochs=1) as reader:
        with pytest.raises(ValueError, match=error_string):
            BatchedDataLoader(reader, num_epochs=2)
def training2(epochs, num_of_iters):
    epoch_loss_avg = tf.keras.metrics.Mean()
    epoch_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()
    with make_batch_reader(train_path, num_epochs=None) as reader_train:
        train_dataset = Input.get_dataset(
            reader_train,
            shuffle=10,
            batch=model.config.batch_size)

        epoch_steps = 0

        for epoch in range(epochs):
            for iteration, (input, target) in enumerate(train_dataset):
                # Close out a logical epoch once num_of_iters batches have been
                # consumed (the reader itself runs forever with num_epochs=None).
                if iteration % num_of_iters == 0 and iteration > 0:
                    epoch_steps += 1
                    train_loss_results.append(epoch_loss_avg.result())
                    train_accuracy_results.append(epoch_accuracy.result())
                    if epoch_steps % 2 == 0:
                        print("Epoch {:03d}: Loss: {:.3f}, Accuracy: {:.3%}".format(epoch_steps,
                                                                                    epoch_loss_avg.result(),
                                                                                    epoch_accuracy.result()))
                    break
                # HERE WE PERFORM ONE TRAINING STEP

                loss_value = train_step(model, optimizer, input, target)

                # Track progress
                epoch_loss_avg.update_state(loss_value)  # Add current batch loss
                # Compare predicted label to actual label
                # training=True is needed only if there are layers with different
                # behavior during training versus inference (e.g. Dropout).
                epoch_accuracy.update_state(target, model(input, training=True))
Code example #6
def test_torch_dataloader_advanced_params(mock_torch_make_batch_reader,
                                          test_ctx):
    SHARD_COUNT = 3
    df = test_ctx.spark.range(100).repartition(SHARD_COUNT)
    conv = make_spark_converter(df)

    mock_torch_make_batch_reader.return_value = \
        make_batch_reader(conv.cache_dir_url)

    with conv.make_torch_dataloader(reader_pool_type='dummy',
                                    cur_shard=1,
                                    shard_count=SHARD_COUNT) as _:
        pass
    peta_args = mock_torch_make_batch_reader.call_args.kwargs
    assert peta_args['reader_pool_type'] == 'dummy' and \
        peta_args['cur_shard'] == 1 and \
        peta_args['shard_count'] == SHARD_COUNT and \
        peta_args['num_epochs'] is None and \
        peta_args['workers_count'] == 4

    # Test default value overridden arguments.
    with conv.make_torch_dataloader(num_epochs=1, workers_count=2) as _:
        pass
    peta_args = mock_torch_make_batch_reader.call_args.kwargs
    assert peta_args['num_epochs'] == 1 and peta_args['workers_count'] == 2
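For context, the converter API exercised by this test is normally used as sketched below, assuming an active SparkSession named spark and a writable cache directory (both are assumptions, not part of the test):

from petastorm.spark import SparkDatasetConverter, make_spark_converter

# The cache directory URL below is a placeholder for this sketch.
spark.conf.set(SparkDatasetConverter.PARENT_CACHE_DIR_URL_CONF, 'file:///tmp/petastorm_cache')
converter = make_spark_converter(spark.range(100))
with converter.make_torch_dataloader(batch_size=32, num_epochs=1) as dataloader:
    for batch in dataloader:
        # Each batch is a dict mapping column names to torch tensors.
        print(batch['id'].shape)
        break
converter.delete()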
Code example #7
    def __enter__(self):
        # import locally to avoid importing tensorflow globally.
        from petastorm.tf_utils import make_petastorm_dataset
        import tensorflow.compat.v1 as tf  # pylint: disable=import-error

        _wait_file_available(self.parquet_file_url_list)
        self.reader = make_batch_reader(self.parquet_file_url_list, **self.petastorm_reader_kwargs)

        # unroll dataset
        dataset = make_petastorm_dataset(self.reader).flat_map(
            tf.data.Dataset.from_tensor_slices)

        # TODO: auto tune best batch size in default case.
        batch_size = self.batch_size or 32
        dataset = dataset.batch(batch_size=batch_size)

        prefetch = self.prefetch

        if prefetch is None:
            if LooseVersion(tf.__version__) >= LooseVersion('1.14'):
                # We can make prefetch optimization
                prefetch = tf.data.experimental.AUTOTUNE
            else:
                prefetch = 1

        dataset = dataset.prefetch(prefetch)

        return dataset
Code example #8
    def __init__(self, data_url, batch_size, prefetch, preproc_fn,
                 preproc_parallelism):
        """
        :param data_url: A string specifying the data URL.
        :param batch_size: batch size of the generated tf.data.dataset
        :param prefetch: prefetch for tf dataset
        :param preproc_fn: preprocessing function
        :param preproc_parallelism: parallelism for preprocessing function
        """
        from petastorm.tf_utils import make_petastorm_dataset
        import tensorflow as tf

        def support_prefetch_and_autotune():
            return LooseVersion(tf.__version__) >= LooseVersion('1.14')

        self.reader = petastorm.make_batch_reader(data_url)
        self.dataset = make_petastorm_dataset(self.reader) \
            .flat_map(tf.data.Dataset.from_tensor_slices)

        self.dataset = self.dataset.batch(batch_size=batch_size)

        if support_prefetch_and_autotune():
            if prefetch is None:
                prefetch = tf.data.experimental.AUTOTUNE
            if prefetch != 0:
                self.dataset = self.dataset.prefetch(prefetch)

        if preproc_fn is not None:
            if preproc_parallelism is None:
                if support_prefetch_and_autotune():
                    preproc_parallelism = tf.data.experimental.AUTOTUNE
                else:
                    preproc_parallelism = 1
            self.dataset = self.dataset.map(preproc_fn, preproc_parallelism)
Code example #9
def python_hello_world(dataset_url='file:///tmp/external_dataset'):
    # Reading data from the non-Petastorm Parquet via pure Python
    with make_batch_reader(dataset_url,
                           schema_fields=["id", "value1", "value2"]) as reader:
        for schema_view in reader:
            # make_batch_reader() returns batches of rows instead of individual rows
            print("Batched read:\nid: {0} value1: {1} value2: {2}".format(
                schema_view.id, schema_view.value1, schema_view.value2))
Code example #10
    def __enter__(self):
        from petastorm.pytorch import DataLoader

        _wait_file_available(self.parquet_file_url_list)
        self.reader = make_batch_reader(self.parquet_file_url_list,
                                        **self.petastorm_reader_kwargs)
        self.loader = DataLoader(reader=self.reader, batch_size=self.batch_size)
        return self.loader
Code example #11
    def __init__(self, data_url):
        """
        :param data_url: A string specifying the data URL.
        """
        from petastorm.tf_utils import make_petastorm_dataset

        self.reader = make_batch_reader(data_url)
        self.dataset = make_petastorm_dataset(self.reader)
Code example #12
def tensorflow_hello_world(dataset_url='file:///tmp/external_dataset'):
    # Example: tf_tensors will return tensors with dataset data
    with make_batch_reader(dataset_url) as reader:
        tensor = tf_tensors(reader)
        with tf.Session() as sess:
            # Because we are using make_batch_reader(), each read returns a batch of rows instead of a single row
            batched_sample = sess.run(tensor)
            print("id batch: {0}".format(batched_sample.id))

    # Example: use tf.data.Dataset API
    with make_batch_reader(dataset_url) as reader:
        dataset = make_petastorm_dataset(reader)
        iterator = dataset.make_one_shot_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            batched_sample = sess.run(tensor)
            print("id batch: {0}".format(batched_sample.id))
Code example #13
def get_data_loader(data_path: str = None,
                    num_epochs: int = 1,
                    batch_size: int = 16):
    if not data_path:
        return None

    return DataLoader(make_batch_reader(dataset_url=data_path,
                                        num_epochs=num_epochs),
                      batch_size=batch_size)
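A short usage sketch, assuming the DataLoader above is petastorm.pytorch.DataLoader and that the URL below points at an existing non-Petastorm Parquet dataset (the path is a placeholder, not part of the original snippet):

loader = get_data_loader('file:///tmp/external_dataset', num_epochs=1, batch_size=16)
if loader is not None:
    with loader:
        for batch in loader:
            # Each batch is a dict mapping column names to torch tensors.
            print(batch['id'].shape)
            break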
Code example #14
def test_make_batch_reader_with_url_list(scalar_dataset):
    url_list = _get_local_fs_url_list(scalar_dataset.url)
    url_list = list(filter(lambda x: x.endswith('.parquet'), url_list))

    with make_batch_reader(url_list, workers_count=1) as reader:
        row_count = 0
        for batch in reader:
            row_count += len(batch.id)

        assert row_count == 100
Code example #15
File: test_end_to_end.py Project: suluner/petastorm
def test_transform_function_batched(scalar_dataset):
    def double_float64(sample):
        sample['float64'] *= 2
        return sample

    with make_batch_reader(scalar_dataset.url, transform_spec=TransformSpec(double_float64)) as reader:
        actual = next(reader)
        for actual_id, actual_float64 in zip(actual.id, actual.float64):
            original_sample = next(d for d in scalar_dataset.data if d['id'] == actual_id)
            expected_matrix = original_sample['float64'] * 2
            np.testing.assert_equal(expected_matrix, actual_float64)
Code example #16
def test_simple_read_tensorflow_with_non_petastorm_many_columns_dataset(many_columns_non_petastorm_dataset):
    """Read couple of rows. Make sure all tensors have static shape sizes assigned and the data matches reference
    data"""
    with make_batch_reader(dataset_url_or_urls=many_columns_non_petastorm_dataset.url) as reader:
        row_tensors = tf_tensors(reader)
        # Make sure we have static shape info for all fields
        for column in row_tensors:
            assert column.get_shape().as_list() == [None]

        with _tf_session() as sess:
            batch = sess.run(row_tensors)._asdict()
            assert set(batch.keys()) == set(many_columns_non_petastorm_dataset.data[0].keys())
Code example #17
    def __enter__(self):
        from petastorm.pytorch import DataLoader

        _wait_file_available(self.parquet_file_url_list)

        self.reader = make_batch_reader(self.parquet_file_url_list, **self.petastorm_reader_kwargs)

        data_loader_fn = self.data_loader_fn or DataLoader
        self.loader = data_loader_fn(reader=self.reader,
                                     batch_size=self.batch_size,
                                     shuffling_queue_capacity=self.shuffling_queue_capacity)
        return self.loader
Code example #18
def test_transform_function_batched_deleting_column(scalar_dataset):
    def double_float64(sample):
        del sample['float64']
        return sample

    with make_batch_reader(scalar_dataset.url,
                           transform_spec=TransformSpec(double_float64,
                                                        removed_fields=[
                                                            'float64'
                                                        ])) as reader:
        actual = next(reader)
        assert 'float64' not in actual._fields
Code example #19
    def __init__(self, url, features, training_set_metadata):
        self.url = url
        self.training_set_metadata = training_set_metadata

        with make_batch_reader(self.url) as reader:
            self.size = reader.dataset.metadata.num_rows

        self.reshape_features = {
            feature[PROC_COLUMN]: list(
                (-1, *training_set_metadata[feature[NAME]]['reshape']))
            for feature in features
            if 'reshape' in training_set_metadata[feature[NAME]]
        }
Code example #20
def test_with_batch_reader(scalar_dataset, shuffling_queue_capacity, data_loader_type):
    """See if we are getting correct batch sizes when using DataLoader with make_batch_reader"""
    pytorch_compatible_fields = [k for k, v in scalar_dataset.data[0].items()
                                 if not isinstance(v, (np.datetime64, np.unicode_))]
    with data_loader_type(make_batch_reader(scalar_dataset.url, schema_fields=pytorch_compatible_fields),
                          batch_size=3, shuffling_queue_capacity=shuffling_queue_capacity) as loader:
        batches = list(loader)
        assert len(scalar_dataset.data) == sum(batch['id'].shape[0] for batch in batches)

        # list types are broken in pyarrow 0.15.0. Don't test list-of-int field
        if pa.__version__ != '0.15.0':
            assert len(scalar_dataset.data) == sum(batch['int_fixed_size_list'].shape[0] for batch in batches)
            assert batches[0]['int_fixed_size_list'].shape[1] == len(scalar_dataset.data[0]['int_fixed_size_list'])
Code example #21
def test_pyarrow_filters_make_batch_reader():
    path = tempfile.mkdtemp()
    url = 'file://' + path
    create_test_scalar_dataset(url, 3000, partition_by=['id_div_700'])
    with make_batch_reader(url, filters=[
        ('id_div_700', '=', 2),
    ]) as reader:
        uv = set()
        for data in reader:
            for _id_div_700 in data.id_div_700:
                uv.add(_id_div_700)

        assert uv == {2}
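The filters argument is forwarded to pyarrow and applied before rows are read. A small follow-on sketch on the same partitioned dataset, assuming pyarrow's 'in' operator is accepted here:

with make_batch_reader(url, filters=[('id_div_700', 'in', [1, 2])]) as reader:
    seen = set()
    for data in reader:
        seen.update(data.id_div_700)
    assert seen == {1, 2}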
Code example #22
    def _init_petaloader(self):
        def _transform_row(df_batch):
            return df_batch

        transform = TransformSpec(_transform_row, removed_fields=['cat_id', 'store_id', 'state_id'])
        reader = make_batch_reader(self.filename,
                                   schema_fields=['id', 'item_id', 'dept_id', 'cat_id', 'day_id',
                                                  'sales', 'day_date_str', 'month_id', 'date', 'wm_yr_wk',
                                                  'snap_flag', 'sell_price', 'sales_dollars', 'store_id', 'state_id'],
                                   workers_count=1
                                   # ,transform_spec=transform
                                   )
        return PetaDataLoader(reader=reader, batch_size=128, shuffling_queue_capacity=100000)
Code example #23
File: petastorm.py Project: ijrsvt/xgboost_ray
    def load_data(data: Union[str, Sequence[str]],
                  ignore: Optional[Sequence[str]] = None,
                  indices: Optional[Sequence[int]] = None,
                  **kwargs) -> pd.DataFrame:
        _assert_petastorm_installed()
        with petastorm.make_batch_reader(data) as reader:
            shards = [pd.DataFrame(batch._asdict()) for batch in reader]

        local_df = pd.concat(shards, copy=False)

        if ignore:
            local_df = local_df[local_df.columns.difference(ignore)]

        return local_df
Code example #24
File: test_end_to_end.py Project: suluner/petastorm
def test_transform_function_with_predicate_batched(scalar_dataset):
    def double_float64(sample):
        assert all(sample['id'] % 2 == 0)
        sample['float64'] *= 2
        return sample

    with make_batch_reader(scalar_dataset.url, transform_spec=TransformSpec(double_float64),
                           predicate=in_lambda(['id'], lambda id: id % 2 == 0)) as reader:
        actual = next(reader)
        for actual_id, actual_float64 in zip(actual.id, actual.float64):
            assert actual_id % 2 == 0
            original_sample = next(d for d in scalar_dataset.data if d['id'] == actual_id)
            expected_matrix = original_sample['float64'] * 2
            np.testing.assert_equal(expected_matrix, actual_float64)
Code example #25
    def __init__(self, url, features, training_set_metadata):
        self.url = to_url(url)
        self.features = [feature[PROC_COLUMN] for feature in features]
        self.training_set_metadata = training_set_metadata

        with make_batch_reader(self.url) as reader:
            self.size = sum(piece.get_metadata().num_rows
                            for piece in reader.dataset.pieces)

        self.reshape_features = {
            feature[PROC_COLUMN]: list(
                (-1, *training_set_metadata[feature[NAME]]['reshape']))
            for feature in features
            if 'reshape' in training_set_metadata[feature[NAME]]
        }
Code example #26
File: test_tf_utils.py Project: rgruener/petastorm
def test_simple_read_tensorflow_with_parquet_dataset(scalar_dataset):
    """Read couple of rows. Make sure all tensors have static shape sizes assigned and the data matches reference
    data"""
    with make_batch_reader(dataset_url=scalar_dataset.url) as reader:
        row_tensors = tf_tensors(reader)
        # Make sure we have static shape info for all fields
        for column in row_tensors:
            assert column.get_shape().as_list() == [None]

        with _tf_session() as sess:
            for _ in range(2):
                batch = sess.run(row_tensors)._asdict()
                for i, id_value in enumerate(batch['id']):
                    expected_row = next(d for d in scalar_dataset.data if d['id'] == id_value)
                    for field_name in expected_row.keys():
                        _assert_fields_eq(batch[field_name][i], expected_row[field_name])
Code example #27
File: test_tf_dataset.py Project: wxrui/petastorm
def test_non_petastorm_with_many_colums_with_one_shot_iterator(many_columns_non_petastorm_dataset):
    """Just a bunch of read and compares of all values to the expected values"""
    with make_batch_reader(many_columns_non_petastorm_dataset.url, workers_count=1) as reader:
        dataset = make_petastorm_dataset(reader)
        iterator = dataset.make_one_shot_iterator()

        # Make sure we have static shape info for all fields
        for shape in dataset.output_shapes:
            # TODO(yevgeni): check that the shapes are actually correct, not just not None
            assert shape.dims is not None

        # Read a bunch of entries from the dataset and compare the data to reference
        with tf.Session() as sess:
            iterator = iterator.get_next()
            sample = sess.run(iterator)._asdict()
            assert set(sample.keys()) == set(many_columns_non_petastorm_dataset.data[0].keys())
Code example #28
    def __init__(self, data_url, batch_size, num_epochs, workers_count,
                 cur_shard, shard_count, **petastorm_reader_kwargs):
        """
        :param data_url: A string specifying the data URL.
        See `SparkDatasetConverter.make_torch_dataloader()` for the definitions
        of the other parameters.
        """
        from petastorm.pytorch import DataLoader

        petastorm_reader_kwargs["num_epochs"] = num_epochs
        if workers_count is not None:
            petastorm_reader_kwargs["workers_count"] = workers_count
        petastorm_reader_kwargs["cur_shard"] = cur_shard
        petastorm_reader_kwargs["shard_count"] = shard_count

        self.reader = petastorm.make_batch_reader(data_url,
                                                  **petastorm_reader_kwargs)
        self.loader = DataLoader(reader=self.reader, batch_size=batch_size)
Code example #29
    def get_dataloader(self, dataset: Dataset, identity: str = "Default"):
        batch_preprocessor = self.build_batch_preprocessor()
        reader_options = self.reader_options
        assert reader_options
        data_reader = make_batch_reader(
            # pyre-fixme[16]: `HiveDataSetClass` has no attribute `parquet_url`.
            dataset.parquet_url,
            num_epochs=1,
            reader_pool_type=reader_options.petastorm_reader_pool_type,
        )
        # NOTE: must be wrapped by DataLoaderWrapper to call __exit__() on end of epoch
        dataloader = DataLoader(
            data_reader,
            batch_size=reader_options.minibatch_size,
            collate_fn=collate_and_preprocess(
                batch_preprocessor=batch_preprocessor, use_gpu=False),
        )
        return _closing_iter(dataloader)
Code example #30
def test_transform_function_new_field_batched(scalar_dataset):
    def double_float64(sample):
        sample['new_float64'] = sample['float64'] * 2
        del sample['float64']
        return sample

    with make_batch_reader(scalar_dataset.url, reader_pool_type='dummy',
                           transform_spec=TransformSpec(double_float64,
                                                        [('new_float64', np.float64, (), False)],
                                                        ['float64'])) as reader:
        row_tensors = tf_tensors(reader)
        with _tf_session() as sess:
            actual = sess.run(row_tensors)

        for actual_id, actual_float64 in zip(actual.id, actual.new_float64):
            original_sample = next(d for d in scalar_dataset.data if d['id'] == actual_id)
            expected = original_sample['float64'] * 2
            np.testing.assert_equal(expected, actual_float64)