Example #1
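Note (applies to every example on this page): the snippets omit their imports. From the names used, they appear to assume the following common preamble (a reconstruction, not part of the original tests):

import pickle

import pytest

import pyarrow as pa
import pyarrow.csv  # noqa: F401 -- makes pa.csv available (used in Example #9)
import pyarrow.dataset as ds
import pyarrow.fs as fs
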
def test_construct_from_invalid_sources_raise(multisourcefs):
    child1 = ds.FileSystemDatasetFactory(
        multisourcefs,
        fs.FileSelector('/plain'),
        format=ds.ParquetFileFormat()
    )
    child2 = ds.FileSystemDatasetFactory(
        multisourcefs,
        fs.FileSelector('/schema'),
        format=ds.ParquetFileFormat()
    )

    with pytest.raises(TypeError, match='Expected.*FileSystemDatasetFactory'):
        ds.dataset([child1, child2])

    expected = (
        "Expected a list of path-like or dataset objects. The given list "
        "contains the following types: int"
    )
    with pytest.raises(TypeError, match=expected):
        ds.dataset([1, 2, 3])

    expected = (
        "Expected a path-like, list of path-likes or a list of Datasets "
        "instead of the given type: NoneType"
    )
    with pytest.raises(TypeError, match=expected):
        ds.dataset(None)
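
For contrast, a sketch of the valid call implied by the first assertion (reusing child1 and child2 from the test): the factories must first be finished into Dataset objects before they can be combined.

# hedged sketch: finish each factory, then union the resulting datasets
union = ds.dataset([child1.finish(), child2.finish()])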
Example #2
def test_file_format_pickling():
    formats = [
        ds.IpcFileFormat(),
        ds.ParquetFileFormat(),
        ds.ParquetFileFormat(read_options=ds.ParquetReadOptions(
            use_buffered_stream=True)),
        ds.ParquetFileFormat(read_options={
            'use_buffered_stream': True,
            'buffer_size': 4096,
        })
    ]
    for file_format in formats:
        assert pickle.loads(pickle.dumps(file_format)) == file_format
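
Equality for these format objects compares their options, which is what makes the round-trip assertion meaningful; conversely, differently configured formats should compare unequal (a quick hedged check):

# formats with different read options are not equal
assert ds.ParquetFileFormat() != ds.ParquetFileFormat(
    read_options={'use_buffered_stream': True})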
Example #3
def test_filesystem_data_source(mockfs):
    file_format = ds.ParquetFileFormat()

    paths = ['subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet']
    partitions = [ds.ScalarExpression(True), ds.ScalarExpression(True)]

    source = ds.FileSystemDataSource(mockfs,
                                     paths,
                                     partitions,
                                     source_partition=None,
                                     file_format=file_format)

    source_partition = ds.ComparisonExpression(ds.CompareOperator.Equal,
                                               ds.FieldExpression('source'),
                                               ds.ScalarExpression(1337))
    partitions = [
        ds.ComparisonExpression(ds.CompareOperator.Equal,
                                ds.FieldExpression('part'),
                                ds.ScalarExpression(1)),
        ds.ComparisonExpression(ds.CompareOperator.Equal,
                                ds.FieldExpression('part'),
                                ds.ScalarExpression(2))
    ]
    source = ds.FileSystemDataSource(mockfs,
                                     paths,
                                     partitions,
                                     source_partition=source_partition,
                                     file_format=file_format)
    assert source.partition_expression.equals(source_partition)
Example #4
def test_partitioning_factory(mockfs):
    paths_or_selector = fs.FileSelector('subdir', recursive=True)
    format = ds.ParquetFileFormat()

    options = ds.FileSystemFactoryOptions('subdir')
    partitioning_factory = ds.DirectoryPartitioning.discover(['group', 'key'])
    assert isinstance(partitioning_factory, ds.PartitioningFactory)
    options.partitioning_factory = partitioning_factory

    factory = ds.FileSystemDatasetFactory(
        mockfs, paths_or_selector, format, options
    )
    inspected_schema = factory.inspect()
    # i64/f64 from data, group/key from "/1/xxx" and "/2/yyy" paths
    expected_schema = pa.schema([
        ("i64", pa.int64()),
        ("f64", pa.float64()),
        ("str", pa.string()),
        ("group", pa.int32()),
        ("key", pa.string()),
    ])
    assert inspected_schema.equals(expected_schema)

    hive_partitioning_factory = ds.HivePartitioning.discover()
    assert isinstance(hive_partitioning_factory, ds.PartitioningFactory)
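
Beyond discovery, a concrete partitioning can translate a path segment into a filter expression; a sketch against the schema inspected above (current pyarrow API):

part = ds.DirectoryPartitioning(
    pa.schema([('group', pa.int32()), ('key', pa.string())]))
# yields the expression (group == 1) and (key == "xxx")
expr = part.parse('/1/xxx')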
Example #5
def test_open_dataset_from_source_additional_kwargs(multisourcefs):
    child = ds.FileSystemDatasetFactory(
        multisourcefs, fs.FileSelector('/plain'),
        format=ds.ParquetFileFormat()
    )
    with pytest.raises(ValueError, match="cannot pass any additional"):
        ds.dataset(child, format="parquet")
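
Since the factory already carries its format and filesystem, the valid call simply omits the extra keyword; a one-line sketch:

dataset = ds.dataset(child)  # no additional kwargs needed for a factory child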
Example #6
def test_filesystem_source(mockfs):
    schema = pa.schema([])

    file_format = ds.ParquetFileFormat()

    paths = ['subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet']
    partitions = [ds.ScalarExpression(True), ds.ScalarExpression(True)]

    source = ds.FileSystemSource(schema,
                                 source_partition=None,
                                 file_format=file_format,
                                 filesystem=mockfs,
                                 paths_or_selector=paths,
                                 partitions=partitions)

    source_partition = ds.ComparisonExpression(ds.CompareOperator.Equal,
                                               ds.FieldExpression('source'),
                                               ds.ScalarExpression(1337))
    partitions = [
        ds.ComparisonExpression(ds.CompareOperator.Equal,
                                ds.FieldExpression('part'),
                                ds.ScalarExpression(1)),
        ds.ComparisonExpression(ds.CompareOperator.Equal,
                                ds.FieldExpression('part'),
                                ds.ScalarExpression(2))
    ]
    source = ds.FileSystemSource(paths_or_selector=paths,
                                 schema=schema,
                                 source_partition=source_partition,
                                 filesystem=mockfs,
                                 partitions=partitions,
                                 file_format=file_format)
    assert source.partition_expression.equals(source_partition)
Example #7
def test_filesystem_factory(mockfs, paths_or_selector):
    format = ds.ParquetFileFormat(read_options=ds.ParquetReadOptions(
        dictionary_columns={"str"}))

    options = ds.FileSystemFactoryOptions('subdir')
    options.partitioning = ds.DirectoryPartitioning(
        pa.schema(
            [pa.field('group', pa.int32()),
             pa.field('key', pa.string())]))
    assert options.partition_base_dir == 'subdir'
    assert options.ignore_prefixes == ['.', '_']
    assert options.exclude_invalid_files is False

    factory = ds.FileSystemDatasetFactory(mockfs, paths_or_selector, format,
                                          options)
    inspected_schema = factory.inspect()

    assert inspected_schema.equals(pa.schema([
        pa.field('i64', pa.int64()),
        pa.field('f64', pa.float64()),
        pa.field('str', pa.dictionary(pa.int32(), pa.string())),
        pa.field('const', pa.int64()),
        pa.field('group', pa.int32()),
        pa.field('key', pa.string()),
    ]), check_metadata=False)

    assert isinstance(factory.inspect_schemas(), list)
    assert isinstance(factory.finish(inspected_schema), ds.FileSystemDataset)
    assert factory.root_partition.equals(ds.ScalarExpression(True))

    dataset = factory.finish()
    assert isinstance(dataset, ds.FileSystemDataset)
    assert len(list(dataset.scan())) == 2

    scanner = ds.Scanner(dataset)
    expected_i64 = pa.array([0, 1, 2, 3, 4], type=pa.int64())
    expected_f64 = pa.array([0, 1, 2, 3, 4], type=pa.float64())
    expected_str = pa.DictionaryArray.from_arrays(
        pa.array([0, 1, 2, 3, 4], type=pa.int32()),
        pa.array("0 1 2 3 4".split(), type=pa.string()))
    for task, group, key in zip(scanner.scan(), [1, 2], ['xxx', 'yyy']):
        expected_group = pa.array([group] * 5, type=pa.int32())
        expected_key = pa.array([key] * 5, type=pa.string())
        expected_const = pa.array([group - 1] * 5, type=pa.int64())
        for batch in task.execute():
            assert batch.num_columns == 6
            assert batch[0].equals(expected_i64)
            assert batch[1].equals(expected_f64)
            assert batch[2].equals(expected_str)
            assert batch[3].equals(expected_const)
            assert batch[4].equals(expected_group)
            assert batch[5].equals(expected_key)

    table = dataset.to_table()
    assert isinstance(table, pa.Table)
    assert len(table) == 10
    assert table.num_columns == 6
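
The same dataset also supports column projection and predicate pushdown at read time; with current pyarrow that looks roughly like:

# hedged sketch: project two columns, filter on a partition field
table = dataset.to_table(columns=['i64', 'str'],
                         filter=ds.field('group') == 1)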
Example #8
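# likely a pytest fixture (the decorator is not shown in this listing):
# builds a directory-partitioned FileSystemDataset over the mock filesystem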
def dataset(mockfs):
    format = ds.ParquetFileFormat()
    selector = fs.FileSelector('subdir', recursive=True)
    options = ds.FileSystemFactoryOptions('subdir')
    options.partitioning = ds.DirectoryPartitioning(
        pa.schema(
            [pa.field('group', pa.int32()),
             pa.field('key', pa.string())]))
    factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options)
    return factory.finish()
Example #9
def test_file_format_pickling():
    formats = [
        ds.IpcFileFormat(),
        ds.CsvFileFormat(),
        ds.CsvFileFormat(pa.csv.ParseOptions(delimiter='\t',
                                             ignore_empty_lines=True)),
        ds.ParquetFileFormat(),
        ds.ParquetFileFormat(
            read_options=ds.ParquetReadOptions(use_buffered_stream=True)
        ),
        ds.ParquetFileFormat(
            read_options={
                'use_buffered_stream': True,
                'buffer_size': 4096,
            }
        )
    ]
    for file_format in formats:
        assert pickle.loads(pickle.dumps(file_format)) == file_format
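
The CSV variants show that parse options take part in equality as well; wiring such a format into the datasets API might look like this (the path is hypothetical):

tsv_format = ds.CsvFileFormat(pa.csv.ParseOptions(delimiter='\t'))
dataset = ds.dataset('path/to/tsv_dir', format=tsv_format)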
Example #10
def test_filesystem_dataset(mockfs):
    schema = pa.schema([pa.field('const', pa.int64())])

    file_format = ds.ParquetFileFormat()

    paths = ['subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet']
    partitions = [ds.ScalarExpression(True), ds.ScalarExpression(True)]

    dataset = ds.FileSystemDataset(schema,
                                   root_partition=None,
                                   file_format=file_format,
                                   filesystem=mockfs,
                                   paths_or_selector=paths,
                                   partitions=partitions)
    assert isinstance(dataset.format, ds.ParquetFileFormat)

    root_partition = ds.ComparisonExpression(ds.CompareOperator.Equal,
                                             ds.FieldExpression('level'),
                                             ds.ScalarExpression(1337))
    partitions = [
        ds.ComparisonExpression(ds.CompareOperator.Equal,
                                ds.FieldExpression('part'),
                                ds.ScalarExpression(1)),
        ds.ComparisonExpression(ds.CompareOperator.Equal,
                                ds.FieldExpression('part'),
                                ds.ScalarExpression(2))
    ]
    dataset = ds.FileSystemDataset(paths_or_selector=paths,
                                   schema=schema,
                                   root_partition=root_partition,
                                   filesystem=mockfs,
                                   partitions=partitions,
                                   file_format=file_format)
    assert dataset.partition_expression.equals(root_partition)
    assert set(dataset.files) == set(paths)

    fragments = list(dataset.get_fragments())
    for fragment, partition, path in zip(fragments, partitions, paths):
        assert fragment.partition_expression.equals(
            ds.AndExpression(root_partition, partition))
        assert fragment.path == path
        assert isinstance(fragment, ds.ParquetFileFragment)
        assert fragment.row_groups is None

        row_group_fragments = list(fragment.get_row_group_fragments())
        assert len(row_group_fragments) == 1
        assert isinstance(fragment, ds.ParquetFileFragment)
        assert row_group_fragments[0].path == path
        assert row_group_fragments[0].row_groups == {0}

    # test predicate pushdown using row group metadata
    fragments = list(dataset.get_fragments(filter=ds.field("const") == 0))
    assert len(fragments) == 2
    assert len(list(fragments[0].get_row_group_fragments())) == 1
    assert len(list(fragments[1].get_row_group_fragments())) == 0
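
get_row_group_fragments() comes from an older pyarrow; current releases expose the same row-group pruning as ParquetFileFragment.split_by_row_group, roughly:

# hedged sketch against the current API
for fragment in dataset.get_fragments():
    pruned = fragment.split_by_row_group(ds.field('const') == 0)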
Example #11
def test_filesystem_dataset(mockfs):
    schema = pa.schema([])

    file_format = ds.ParquetFileFormat()

    paths = ['subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet']
    partitions = [ds.ScalarExpression(True), ds.ScalarExpression(True)]

    dataset = ds.FileSystemDataset(
        schema,
        root_partition=None,
        file_format=file_format,
        filesystem=mockfs,
        paths_or_selector=paths,
        partitions=partitions
    )
    assert isinstance(dataset.format, ds.ParquetFileFormat)

    root_partition = ds.ComparisonExpression(
        ds.CompareOperator.Equal,
        ds.FieldExpression('level'),
        ds.ScalarExpression(1337)
    )
    partitions = [
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('part'),
            ds.ScalarExpression(1)
        ),
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('part'),
            ds.ScalarExpression(2)
        )
    ]
    dataset = ds.FileSystemDataset(
        paths_or_selector=paths,
        schema=schema,
        root_partition=root_partition,
        filesystem=mockfs,
        partitions=partitions,
        file_format=file_format
    )
    assert dataset.partition_expression.equals(root_partition)
    assert set(dataset.files) == set(paths)

    fragments = list(dataset.get_fragments())
    assert fragments[0].partition_expression.equals(
        ds.AndExpression(root_partition, partitions[0]))
    assert fragments[1].partition_expression.equals(
        ds.AndExpression(root_partition, partitions[1]))
    assert fragments[0].path == paths[0]
    assert fragments[1].path == paths[1]
Example #12
def dataset(mockfs):
    format = ds.ParquetFileFormat()
    selector = fs.FileSelector('subdir', recursive=True)
    options = ds.FileSystemDiscoveryOptions('subdir')
    options.partition_scheme = ds.SchemaPartitionScheme(
        pa.schema(
            [pa.field('group', pa.int32()),
             pa.field('key', pa.string())]))
    discovery = ds.FileSystemDataSourceDiscovery(mockfs, selector, format,
                                                 options)
    schema = discovery.inspect()
    source = discovery.finish()
    return ds.Dataset([source], schema)
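
This fixture targets the pre-0.17 "discovery" API; the classes were later renamed. An approximate mapping for readers on current pyarrow (from memory of the rename, so treat as an assumption):

# ds.FileSystemDiscoveryOptions     -> ds.FileSystemFactoryOptions
# ds.SchemaPartitionScheme          -> ds.DirectoryPartitioning
# ds.FileSystemDataSourceDiscovery  -> ds.FileSystemDatasetFactory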
Example #13
def test_dataset_union(multisourcefs):
    child = ds.FileSystemDatasetFactory(
        multisourcefs, fs.FileSelector('/plain'),
        format=ds.ParquetFileFormat()
    )
    factory = ds.UnionDatasetFactory([child])

    # TODO(bkietz) reintroduce factory.children property
    assert len(factory.inspect_schemas()) == 1
    assert all(isinstance(s, pa.Schema) for s in factory.inspect_schemas())
    assert factory.inspect_schemas()[0].equals(child.inspect())
    assert factory.inspect().equals(child.inspect())
    assert isinstance(factory.finish(), ds.Dataset)
Example #14
def test_file_system_factory(mockfs, paths_or_selector):
    format = ds.ParquetFileFormat()

    options = ds.FileSystemFactoryOptions('subdir')
    options.partitioning = ds.DirectoryPartitioning(
        pa.schema([
            pa.field('group', pa.int32()),
            pa.field('key', pa.string())
        ])
    )
    assert options.partition_base_dir == 'subdir'
    assert options.ignore_prefixes == ['.', '_']
    assert options.exclude_invalid_files is True

    factory = ds.FileSystemSourceFactory(
        mockfs, paths_or_selector, format, options
    )
    inspected_schema = factory.inspect()

    assert isinstance(factory.inspect(), pa.Schema)
    assert isinstance(factory.inspect_schemas(), list)
    assert isinstance(factory.finish(inspected_schema),
                      ds.FileSystemSource)
    assert factory.root_partition.equals(ds.ScalarExpression(True))

    source = factory.finish()
    assert isinstance(source, ds.Source)

    dataset = ds.Dataset([source], inspected_schema)

    scanner = dataset.new_scan().finish()
    assert len(list(scanner.scan())) == 2

    expected_i64 = pa.array([0, 1, 2, 3, 4], type=pa.int64())
    expected_f64 = pa.array([0, 1, 2, 3, 4], type=pa.float64())
    for task, group, key in zip(scanner.scan(), [1, 2], ['xxx', 'yyy']):
        expected_group_column = pa.array([group] * 5, type=pa.int32())
        expected_key_column = pa.array([key] * 5, type=pa.string())
        for batch in task.execute():
            assert batch.num_columns == 4
            assert batch[0].equals(expected_i64)
            assert batch[1].equals(expected_f64)
            assert batch[2].equals(expected_group_column)
            assert batch[3].equals(expected_key_column)

    table = scanner.to_table()
    assert isinstance(table, pa.Table)
    assert len(table) == 10
    assert table.num_columns == 4
Example #15
0
def test_make_fragment(multisourcefs):
    parquet_format = ds.ParquetFileFormat()
    dataset = ds.dataset('/plain', filesystem=multisourcefs,
                         format=parquet_format)

    for path in dataset.files:
        fragment = parquet_format.make_fragment(path, multisourcefs)
        row_group_fragment = parquet_format.make_fragment(path, multisourcefs,
                                                          row_groups=[0])
        for f in [fragment, row_group_fragment]:
            assert isinstance(f, ds.ParquetFileFragment)
            assert f.path == path
            assert isinstance(f.filesystem, type(multisourcefs))
        assert fragment.row_groups is None
        assert row_group_fragment.row_groups == {0}
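
A fragment built this way can be scanned on its own; a minimal follow-on sketch using the locals above:

# materialize a whole fragment, or just its first row group
full_table = fragment.to_table()
first_row_group = row_group_fragment.to_table()
assert first_row_group.num_rows <= full_table.num_rows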
Example #16
    def _read_data(self) -> None:
        _logger.debug("Starting data read")

        # only read the columns that are both projected and present in this file
        cols_to_read = prune_columns(self._file_schema, self._expected_schema)

        with profile("read data", self._stats):
            try:
                read_fs = FS_MAP[type(self._input.fs)]
            except KeyError as exc:
                raise FileSystemNotFound(
                    f"No mapped filesystem found for {type(self._input.fs)}"
                ) from exc

            arrow_dataset = ds.FileSystemDataset.from_paths(
                [self._input.location()],
                schema=self._arrow_file.schema_arrow,
                format=ds.ParquetFileFormat(),
                filesystem=read_fs())

            arrow_table = arrow_dataset.to_table(columns=cols_to_read,
                                                 filter=self._filter)

        # process schema evolution if needed
        with profile("schema_evol_proc", self._stats):
            processed_tbl = self.migrate_schema(arrow_table)
            for i, field in self.get_missing_fields():
                dtype_func = DTYPE_MAP.get(field.type.type_id)
                if dtype_func is None:
                    raise RuntimeError(
                        "Unable to create null column for type %s" %
                        field.type.type_id)

                dtype = dtype_func(field)
                processed_tbl = processed_tbl.add_column(
                    i,
                    pa.field(field.name, dtype[0], True, None),
                    ParquetReader.create_null_column(processed_tbl[0], dtype))
        self._table = processed_tbl
        self.materialized_table = True
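
FileSystemDataset.from_paths, used above, wraps a known file list without any discovery step; stripped to its essentials it looks like this (path and filesystem are hypothetical):

dataset = ds.FileSystemDataset.from_paths(
    ['data/file0.parquet'],            # hypothetical path
    schema=schema,                     # a pa.Schema known up front
    format=ds.ParquetFileFormat(),
    filesystem=fs.LocalFileSystem())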
Example #17
def test_filesystem_dataset(mockfs):
    schema = pa.schema([
        pa.field('const', pa.int64())
    ])

    file_format = ds.ParquetFileFormat()

    paths = ['subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet']
    partitions = [ds.scalar(True), ds.scalar(True)]

    dataset = ds.FileSystemDataset(
        schema=schema,
        format=file_format,
        filesystem=mockfs,
        paths_or_selector=paths,
        partitions=partitions
    )

    assert isinstance(dataset.format, ds.ParquetFileFormat)

    # the root_partition and partitions keywords have defaults
    dataset = ds.FileSystemDataset(
        paths, schema, format=file_format, filesystem=mockfs,
    )

    assert isinstance(dataset.format, ds.ParquetFileFormat)

    # validation of required arguments
    with pytest.raises(TypeError, match="incorrect type"):
        ds.FileSystemDataset(paths, format=file_format, filesystem=mockfs)
    with pytest.raises(TypeError, match="incorrect type"):
        ds.FileSystemDataset(paths, schema=schema, filesystem=mockfs)
    with pytest.raises(TypeError, match="incorrect type"):
        ds.FileSystemDataset(paths, schema=schema, format=file_format)
    # validation of root_partition
    with pytest.raises(TypeError, match="incorrect type"):
        ds.FileSystemDataset(paths, schema=schema, format=file_format,
                             filesystem=mockfs, root_partition=1)

    root_partition = ds.field('level') == ds.scalar(1337)
    partitions = [ds.field('part') == x for x in range(1, 3)]
    dataset = ds.FileSystemDataset(
        paths_or_selector=paths,
        schema=schema,
        root_partition=root_partition,
        filesystem=mockfs,
        partitions=partitions,
        format=file_format
    )
    assert dataset.partition_expression.equals(root_partition)
    assert set(dataset.files) == set(paths)

    fragments = list(dataset.get_fragments())
    for fragment, partition, path in zip(fragments, partitions, paths):
        assert fragment.partition_expression.equals(partition)
        assert fragment.path == path
        assert isinstance(fragment.format, ds.ParquetFileFormat)
        assert isinstance(fragment, ds.ParquetFileFragment)
        assert fragment.row_groups is None

        row_group_fragments = list(fragment.get_row_group_fragments())
        assert len(row_group_fragments) == 1
        assert isinstance(fragment, ds.ParquetFileFragment)
        assert row_group_fragments[0].path == path
        assert row_group_fragments[0].row_groups == {0}

    fragments = list(dataset.get_fragments(filter=ds.field("const") == 0))
    assert len(fragments) == 2
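
The ds.field/ds.scalar helpers used in this last example compose into richer filters with the bitwise operators; for instance:

# expressions combine with &, |, ~ (not Python's and/or/not)
expr = (ds.field('part') == 1) & (ds.field('const') >= ds.scalar(0))
table = dataset.to_table(filter=expr)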