Code example #1
def test_remove_field_transform():
    one_removed = transform_schema(
        TestSchema,
        TransformSpec(lambda x: x, edit_fields=None, removed_fields=['int']))
    assert set(one_removed.fields.keys()) == {'string', 'double'}

    two_removed = transform_schema(
        TestSchema,
        TransformSpec(lambda x: x,
                      edit_fields=None,
                      removed_fields=['int', 'double']))
    assert set(two_removed.fields.keys()) == {'string'}
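
These schema-transform tests all run against the same three-field TestSchema, whose definition is not shown in this listing. A minimal sketch of what it plausibly looks like, with field names taken from the assertions above and codecs omitted:

import numpy as np
from petastorm.unischema import Unischema, UnischemaField

# Assumed definition: three scalar fields matching the assertions in these tests.
TestSchema = Unischema('TestSchema', [
    UnischemaField('string', np.str_, (), None, False),
    UnischemaField('int', np.int32, (), None, False),
    UnischemaField('double', np.float64, (), None, False),
])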
Code example #2
File: dataloader.py  Project: ssheikholeslami/maggy
    def __init__(
        self, dataset: str, batch_size: int = 1, transform_spec: Callable = None
    ):
        """Initializes a reader depending on the dataset (Petastorm/Parquet).

        :param dataset: Path to the dataset.
        :param batch_size: How many samples per batch to load (default: ``1``).
        :param transform_spec: Transform function for data augmentation; it is
            wrapped into a Petastorm ``TransformSpec`` below.
        """
        num_workers = int(os.environ["WORLD_SIZE"])  # Is set at lagom startup.
        rank = int(os.environ["RANK"])
        is_peta_ds = EnvSing.get_instance().exists(
            dataset.rstrip("/") + "/_common_metadata"
        )
        # Pick the reader type: make_reader for Petastorm datasets,
        # make_batch_reader for plain Parquet.
        ds_type = "Petastorm" if is_peta_ds else "Parquet"
        print(f"{ds_type} dataset detected in folder {dataset}")
        reader_factory = make_reader if is_peta_ds else make_batch_reader
        reader = reader_factory(
            dataset,
            cur_shard=rank,
            shard_count=num_workers,
            transform_spec=TransformSpec(transform_spec),
        )
        super().__init__(reader, batch_size=batch_size)
        self.iterator = None
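
The class that owns this __init__ is not shown in the excerpt. A hypothetical instantiation, assuming the class is named PetastormDataLoader and that lagom has already exported WORLD_SIZE and RANK into the environment:

# Hypothetical usage; the class name, dataset path and transform function
# are assumptions, not part of the excerpt above.
def add_noise(batch):
    batch["feature"] = batch["feature"] + 0.01  # 'feature' is a placeholder column
    return batch

loader = PetastormDataLoader(
    "hdfs:///Projects/demo/dataset",  # assumed path
    batch_size=64,
    transform_spec=add_noise,  # plain function; __init__ wraps it in a TransformSpec
)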
Code example #3
def test_select_field_transform():
    test_list = [['string', 'double', 'int'], ['int', 'string', 'double'],
                 ['string', 'int'], ['int']]
    for selected_fields in test_list:
        transformed = transform_schema(
            TestSchema, TransformSpec(selected_fields=selected_fields))
        assert list(transformed.fields.keys()) == selected_fields
Code example #4
def test_unknown_fields_in_remove_field_transform():
    with pytest.warns(UserWarning, match='not part of the schema.*unknown_1'):
        one_removed = transform_schema(
            TestSchema,
            TransformSpec(lambda x: x,
                          edit_fields=None,
                          removed_fields=['int', 'unknown_1', 'unknown_2']))
    assert set(one_removed.fields.keys()) == {'string', 'double'}
Code example #5
def test_change_field_transform():
    one_changed = transform_schema(
        TestSchema,
        TransformSpec(lambda x: x,
                      edit_fields=[
                          UnischemaField('double', np.float16, (), None, False)
                      ]))
    assert one_changed.fields['double'].numpy_dtype == np.float16
Code example #6
def test_add_field_transform():
    one_added = transform_schema(
        TestSchema,
        TransformSpec(lambda x: x,
                      edit_fields=[
                          UnischemaField('double2', np.float64, (), None,
                                         False)
                      ]))
    assert set(
        one_added.fields.keys()) == {'string', 'double', 'double2', 'int'}
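
Taken together, examples #5 and #6 suggest that edit_fields replaces a field whose name already exists in the schema and appends a field whose name is new, so both operations should compose in a single spec. A sketch under that assumption:

# Sketch: change 'double' to float16 and add 'double2' in one TransformSpec.
both = transform_schema(
    TestSchema,
    TransformSpec(lambda x: x,
                  edit_fields=[
                      UnischemaField('double', np.float16, (), None, False),
                      UnischemaField('double2', np.float64, (), None, False),
                  ]))
assert set(both.fields.keys()) == {'string', 'double', 'double2', 'int'}
assert both.fields['double'].numpy_dtype == np.float16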
Code example #7
def test_transform_spec_support_return_tensor(scalar_dataset, reader_factory):
    field1 = UnischemaField(name='abc', shape=(2, 3), numpy_dtype=np.float32)

    with pytest.raises(ValueError, match='field abc must be numpy array type'):
        ArrowReaderWorker._check_shape_and_ravel('xyz', field1)

    with pytest.raises(ValueError, match='field abc must be the shape'):
        ArrowReaderWorker._check_shape_and_ravel(np.zeros((2, 5)), field1)

    with pytest.raises(
            ValueError,
            match='field abc error: only support row major multi-dimensional array'):
        ArrowReaderWorker._check_shape_and_ravel(np.zeros((2, 3), order='F'), field1)

    assert (6, ) == ArrowReaderWorker._check_shape_and_ravel(
        np.zeros((2, 3)), field1).shape

    for partial_shape in [(2, None), (None, ), (None, None)]:
        field_with_unknown_dim = UnischemaField(name='abc',
                                                shape=partial_shape,
                                                numpy_dtype=np.float32)
        with pytest.raises(
                ValueError,
                match='All dimensions of a shape.*must be constant'):
            ArrowReaderWorker._check_shape_and_ravel(
                np.zeros((2, 3), order='F'), field_with_unknown_dim)

    def preproc_fn1(x):
        return pd.DataFrame({
            'tensor_col_1': x['id'].map(lambda _: np.random.rand(2, 3)),
            'tensor_col_2': x['id'].map(lambda _: np.random.rand(3, 4, 5)),
        })

    edit_fields = [
        ('tensor_col_1', np.float32, (2, 3), False),
        ('tensor_col_2', np.float32, (3, 4, 5), False),
    ]

    # This spec removes all input columns and returns two new columns:
    # 'tensor_col_1' with shape (2, 3) and 'tensor_col_2' with shape (3, 4, 5).
    spec1 = TransformSpec(preproc_fn1,
                          edit_fields=edit_fields,
                          removed_fields=list(scalar_dataset.data[0].keys()))

    with reader_factory(scalar_dataset.url, transform_spec=spec1) as reader:
        sample = next(reader)._asdict()
        assert len(sample) == 2
        assert (2, 3) == sample['tensor_col_1'].shape[1:] and \
               (3, 4, 5) == sample['tensor_col_2'].shape[1:]
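
Note that this example passes edit_fields entries as plain (name, numpy_dtype, shape, nullable) tuples, whereas examples #5 and #6 use UnischemaField instances; both spellings describe the same edit:

# The tuple form and the UnischemaField form of the same edited field.
as_tuple = ('tensor_col_1', np.float32, (2, 3), False)
as_field = UnischemaField('tensor_col_1', np.float32, (2, 3), None, False)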
Code example #8
    def __init__(self,
                 train_path,
                 val_path,
                 batch_size=16,
                 num_minibatches=8,
                 num_workers=4,
                 text_column="",
                 target_column="target"):
        # Keep only the text and target columns; all others are dropped.
        remove_other = lambda rows: rows[[text_column, target_column]]
        transform_spec = TransformSpec(
            remove_other, selected_fields=[text_column, target_column])

        super().__init__(train_path, val_path, batch_size, num_minibatches,
                         num_workers, transform_spec)
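
For context, a hypothetical instantiation of this dataloader; the class name and file paths below are assumptions:

# TextDataLoader is an assumed name for the class whose __init__ is shown above.
loader = TextDataLoader(train_path="Data/Train.parquet",
                        val_path="Data/Val.parquet",
                        text_column="readme",
                        target_column="target")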
Code example #9
# Pass 1: time the custom IterableParquetDataset.
start = default_timer()
dataset = IterableParquetDataset("Data/Train.parquet", process_rows)
dataloader = DataLoader(dataset, num_workers=4)
list(dataloader)
end = default_timer()
print(end - start)

del dataset, dataloader
del start, end

# Pass 2: time the manual variant, IterableManualParquetDataset.
start = default_timer()
dataset = IterableManualParquetDataset("Data/Train.parquet", process_rows)
dataloader = DataLoader(dataset, num_workers=4)
list(dataloader)
end = default_timer()
print(end - start)

del dataset, dataloader
del start, end

# Pass 3: time Petastorm's make_batch_reader with an equivalent TransformSpec.
start = default_timer()
dataset = make_batch_reader(Path("Data/Train.parquet").absolute().as_uri(),
                            workers_count=4,
                            transform_spec=TransformSpec(
                                lambda rows: rows[["readme", "target"]],
                                selected_fields=["readme", "target"]))
dataloader = TransformersDataLoader(dataset, string_column="readme")
list(dataloader)
end = default_timer()
print(end - start)
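
process_rows is not defined in this excerpt. A hypothetical stand-in, consistent with the final Petastorm pass above that keeps only the readme and target columns:

# Hypothetical; mirrors the TransformSpec used in the make_batch_reader pass.
def process_rows(rows):
    return rows[["readme", "target"]]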
Code example #10
def test_noop_transform():
    transformed_schema = transform_schema(
        TestSchema,
        TransformSpec(lambda x: x, edit_fields=None, removed_fields=None))
    assert set(transformed_schema.fields) == set(TestSchema.fields)
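
Finally, for context beyond these schema-level tests: a TransformSpec like the ones above is normally handed to a Petastorm reader, which applies it on the fly while reading. A minimal sketch, where the dataset path is a placeholder:

from petastorm import make_reader
from petastorm.transform import TransformSpec

# Drop the 'int' field while reading; the URL below is an assumed placeholder.
spec = TransformSpec(lambda row: row, removed_fields=['int'])
with make_reader('file:///tmp/test_dataset', transform_spec=spec) as reader:
    for row in reader:
        pass  # rows come back without the 'int' field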