Beispiel #1
0
def dataset(mockfs):
    """Build a ``ds.Dataset`` from the Parquet files discovered in *mockfs*.

    Discovery selects everything under ``'subdir'`` recursively, uses
    ``'subdir'`` as the discovery options argument, and attaches a schema
    partition scheme with the fields ``group`` (int32) and ``key`` (string).

    Parameters
    ----------
    mockfs
        Filesystem object handed to ``ds.FileSystemDataSourceDiscovery``.

    Returns
    -------
    ds.Dataset
        A dataset wrapping the single discovered source, using the schema
        reported by ``discovery.inspect()``.
    """
    partition_schema = pa.schema([
        pa.field('group', pa.int32()),
        pa.field('key', pa.string()),
    ])

    discovery_options = ds.FileSystemDiscoveryOptions('subdir')
    discovery_options.partition_scheme = ds.SchemaPartitionScheme(
        partition_schema)

    discovery = ds.FileSystemDataSourceDiscovery(
        mockfs,
        fs.FileSelector('subdir', recursive=True),
        ds.ParquetFileFormat(),
        discovery_options,
    )

    # Inspect before finishing, mirroring the order the API is exercised in.
    inspected_schema = discovery.inspect()
    data_source = discovery.finish()
    return ds.Dataset([data_source], inspected_schema)
Beispiel #2
0
def test_file_system_discovery(mockfs, paths_or_selector):
    """Check discovery, scanning and table conversion of a partitioned source.

    The test configures discovery options with ``'subdir'`` and a schema
    partition scheme (``group``: int32, ``key``: string), verifies the
    option values and the types returned by the discovery object, then scans
    the resulting dataset and checks every record batch and the final table.
    """
    parquet_format = ds.ParquetFileFormat()

    partition_fields = [pa.field('group', pa.int32()),
                        pa.field('key', pa.string())]
    discovery_options = ds.FileSystemDiscoveryOptions('subdir')
    discovery_options.partition_scheme = ds.SchemaPartitionScheme(
        pa.schema(partition_fields))

    # Option values expected after constructing with 'subdir'.
    assert discovery_options.partition_base_dir == 'subdir'
    assert discovery_options.ignore_prefixes == ['.', '_']
    assert discovery_options.exclude_invalid_files is True

    discovery = ds.FileSystemDataSourceDiscovery(
        mockfs, paths_or_selector, parquet_format, discovery_options)
    inspected_schema = discovery.inspect()

    # Return types of the discovery API.
    assert isinstance(discovery.inspect(), pa.Schema)
    assert isinstance(discovery.inspect_schemas(), list)
    source_with_schema = discovery.finish(inspected_schema)
    assert isinstance(source_with_schema, ds.FileSystemDataSource)
    assert discovery.root_partition.equals(ds.ScalarExpression(True))

    data_source = discovery.finish()
    assert isinstance(data_source, ds.DataSource)

    scanner = ds.Dataset(
        [data_source], inspected_schema).new_scan().finish()
    assert len(list(scanner.scan())) == 2

    i64_column = pa.array([0, 1, 2, 3, 4], type=pa.int64())
    f64_column = pa.array([0, 1, 2, 3, 4], type=pa.float64())
    # One scan task is expected per (group, key) partition, in this order.
    expected_partitions = [(1, 'xxx'), (2, 'yyy')]
    for task, (group, key) in zip(scanner.scan(), expected_partitions):
        expected_columns = [
            i64_column,
            f64_column,
            pa.array([group] * 5, type=pa.int32()),
            pa.array([key] * 5, type=pa.string()),
        ]
        for batch in task.execute():
            assert batch.num_columns == 4
            for position, expected in enumerate(expected_columns):
                assert batch[position].equals(expected)

    table = scanner.to_table()
    assert isinstance(table, pa.Table)
    assert len(table) == 10
    assert table.num_columns == 4
Beispiel #3
0
def test_partition_scheme():
    """Check construction and path parsing of the partition scheme classes.

    Both ``ds.SchemaPartitionScheme`` and ``ds.HivePartitionScheme`` must be
    ``ds.PartitionScheme`` instances.  Parsing a path must produce an
    ``AndExpression`` of per-field equality comparisons, and a path that does
    not fit the schema must raise ``pa.ArrowInvalid``.
    """
    def field_equals(name, value):
        """Return the expression comparing field *name* for equality."""
        return ds.ComparisonExpression(ds.CompareOperator.Equal,
                                       ds.FieldExpression(name),
                                       ds.ScalarExpression(value))

    numeric_schema = pa.schema([
        pa.field('i64', pa.int64()),
        pa.field('f64', pa.float64()),
    ])
    for scheme_class in [ds.SchemaPartitionScheme, ds.HivePartitionScheme]:
        scheme = scheme_class(numeric_schema)
        assert isinstance(scheme, ds.PartitionScheme)

    # Schema-based scheme: path segments map positionally onto the fields.
    scheme = ds.SchemaPartitionScheme(pa.schema([
        pa.field('group', pa.int64()),
        pa.field('key', pa.float64()),
    ]))
    parsed = scheme.parse('/3/3.14')
    assert isinstance(parsed, ds.Expression)

    expected = ds.AndExpression(field_equals('group', 3),
                                field_equals('key', 3.14))
    assert parsed.equals(expected)

    with pytest.raises(pa.ArrowInvalid):
        scheme.parse('/prefix/3/aaa')

    # Hive-style scheme: path segments are written as ``name=value``.
    scheme = ds.HivePartitionScheme(pa.schema([
        pa.field('alpha', pa.int64()),
        pa.field('beta', pa.int64()),
    ]))
    parsed = scheme.parse('/alpha=0/beta=3')
    expected = ds.AndExpression(field_equals('alpha', 0),
                                field_equals('beta', 3))
    assert parsed.equals(expected)