def test_filesystem_source(mockfs):
    """FileSystemSource can be built positionally or with keywords, and the
    source-level partition expression round-trips through the constructor."""
    schema = pa.schema([])
    file_format = ds.ParquetFileFormat()
    paths = ['subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet']
    partitions = [ds.ScalarExpression(True), ds.ScalarExpression(True)]

    # Construction with no source-level partition expression.
    source = ds.FileSystemSource(schema, source_partition=None,
                                 file_format=file_format,
                                 filesystem=mockfs,
                                 paths_or_selector=paths,
                                 partitions=partitions)

    # Construction with explicit per-source and per-file partition
    # expressions, using keyword arguments in a different order.
    source_partition = ds.ComparisonExpression(
        ds.CompareOperator.Equal,
        ds.FieldExpression('source'),
        ds.ScalarExpression(1337)
    )
    partitions = [
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('part'),
            ds.ScalarExpression(1)
        ),
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('part'),
            ds.ScalarExpression(2)
        ),
    ]
    source = ds.FileSystemSource(paths_or_selector=paths, schema=schema,
                                 source_partition=source_partition,
                                 filesystem=mockfs,
                                 partitions=partitions,
                                 file_format=file_format)
    assert source.partition_expression.equals(source_partition)
def test_filesystem_data_source(mockfs):
    """FileSystemDataSource exposes the source_partition passed at
    construction via its partition_expression property."""
    file_format = ds.ParquetFileFormat()
    paths = ['subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet']
    partitions = [ds.ScalarExpression(True), ds.ScalarExpression(True)]

    # First build without a source-level partition expression.
    source = ds.FileSystemDataSource(mockfs, paths, partitions,
                                     source_partition=None,
                                     file_format=file_format)

    # Then rebuild with explicit expressions at both levels.
    source_partition = ds.ComparisonExpression(
        ds.CompareOperator.Equal,
        ds.FieldExpression('source'),
        ds.ScalarExpression(1337)
    )
    partitions = [
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('part'),
            ds.ScalarExpression(1)
        ),
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('part'),
            ds.ScalarExpression(2)
        ),
    ]
    source = ds.FileSystemDataSource(mockfs, paths, partitions,
                                     source_partition=source_partition,
                                     file_format=file_format)
    assert source.partition_expression.equals(source_partition)
def test_expression_ergonomics():
    """The ds.scalar / ds.field shorthand and Python operator overloads
    produce the same expressions as explicit *Expression constructors."""
    zero = ds.scalar(0)
    one = ds.scalar(1)
    true = ds.scalar(True)
    false = ds.scalar(False)
    string = ds.scalar("string")
    field = ds.field("field")

    # Shorthand factories are equivalent to the explicit classes.
    assert one.equals(ds.ScalarExpression(1))
    assert zero.equals(ds.ScalarExpression(0))
    assert true.equals(ds.ScalarExpression(True))
    assert false.equals(ds.ScalarExpression(False))
    assert string.equals(ds.ScalarExpression("string"))
    assert field.equals(ds.FieldExpression("field"))

    # & and | accept plain Python values on either side.
    expected = ds.AndExpression(ds.ScalarExpression(1), ds.ScalarExpression(0))
    for expr in [one & zero, 1 & zero, one & 0]:
        assert expr.equals(expected)

    expected = ds.OrExpression(ds.ScalarExpression(1), ds.ScalarExpression(0))
    for expr in [one | zero, 1 | zero, one | 0]:
        assert expr.equals(expected)

    # Each rich-comparison operator maps to the matching CompareOperator.
    comparison_ops = [
        (operator.eq, ds.CompareOperator.Equal),
        (operator.ne, ds.CompareOperator.NotEqual),
        (operator.ge, ds.CompareOperator.GreaterEqual),
        (operator.le, ds.CompareOperator.LessEqual),
        (operator.lt, ds.CompareOperator.Less),
        (operator.gt, ds.CompareOperator.Greater),
    ]
    for op, compare_op in comparison_ops:
        expr = op(zero, one)
        expected = ds.ComparisonExpression(compare_op, zero, one)
        assert expr.equals(expected)

    # ~ builds a NotExpression.
    expr = ~true == false
    expected = ds.ComparisonExpression(
        ds.CompareOperator.Equal,
        ds.NotExpression(ds.ScalarExpression(True)),
        ds.ScalarExpression(False)
    )
    assert expr.equals(expected)

    # .cast() accepts a type name string or a pyarrow DataType.
    for typ in ("bool", pa.bool_()):
        expr = field.cast(typ) == true
        expected = ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.CastExpression(ds.FieldExpression("field"), pa.bool_()),
            ds.ScalarExpression(True)
        )
        assert expr.equals(expected)

    # .isin() requires an iterable of values.
    expr = field.isin([1, 2])
    expected = ds.InExpression(ds.FieldExpression("field"), pa.array([1, 2]))
    assert expr.equals(expected)

    with pytest.raises(TypeError):
        field.isin(1)
def test_expression():
    """Expression construction, accessors, validate/assume simplification,
    string representation, and pickle round-tripping."""
    a = ds.ScalarExpression(1)
    b = ds.ScalarExpression(1.1)
    c = ds.ScalarExpression(True)
    d = ds.ScalarExpression("string")
    e = ds.ScalarExpression(None)

    equal = ds.ComparisonExpression(ds.CompareOperator.Equal, a, b)
    greater = a > b
    assert equal.op == ds.CompareOperator.Equal

    # Binary expressions expose their operands and compare structurally.
    and_ = ds.AndExpression(a, b)
    assert and_.left_operand.equals(a)
    assert and_.right_operand.equals(b)
    assert and_.equals(ds.AndExpression(a, b))
    assert and_.equals(and_)

    or_ = ds.OrExpression(a, b)
    not_ = ds.NotExpression(ds.OrExpression(a, b))
    is_valid = ds.IsValidExpression(a)
    cast_safe = ds.CastExpression(a, pa.int32())
    cast_unsafe = ds.CastExpression(a, pa.int32(), safe=False)
    in_ = ds.InExpression(a, pa.array([1, 2, 3]))

    assert is_valid.operand == a
    assert in_.set_.equals(pa.array([1, 2, 3]))
    assert cast_unsafe.to == pa.int32()
    assert cast_unsafe.safe is False
    assert cast_safe.safe is True

    # Validating a comparison against a schema yields a boolean type.
    condition = ds.ComparisonExpression(
        ds.CompareOperator.Greater,
        ds.FieldExpression('i64'),
        ds.ScalarExpression(5)
    )
    schema = pa.schema([
        pa.field('i64', pa.int64()),
        pa.field('f64', pa.float64())
    ])
    assert condition.validate(schema) == pa.bool_()

    # assume() simplifies a condition given a known fact about a field.
    i64_is_5 = ds.ComparisonExpression(
        ds.CompareOperator.Equal,
        ds.FieldExpression('i64'),
        ds.ScalarExpression(5)
    )
    i64_is_7 = ds.ComparisonExpression(
        ds.CompareOperator.Equal,
        ds.FieldExpression('i64'),
        ds.ScalarExpression(7)
    )
    assert condition.assume(i64_is_5).equals(ds.ScalarExpression(False))
    assert condition.assume(i64_is_7).equals(ds.ScalarExpression(True))

    assert str(condition) == "(i64 > 5:int64)"
    assert "(i64 > 5:int64)" in repr(condition)

    # Every expression kind survives a pickle round trip.
    all_exprs = [
        a, b, c, d, e, equal, greater, and_, or_, not_, is_valid,
        cast_unsafe, cast_safe, in_, condition, i64_is_5, i64_is_7
    ]
    for expr in all_exprs:
        restored = pickle.loads(pickle.dumps(expr))
        assert expr.equals(restored)
def test_partitioning():
    """Directory- and Hive-style partitionings parse paths into the
    conjunction of per-segment equality expressions."""
    schema = pa.schema([
        pa.field('i64', pa.int64()),
        pa.field('f64', pa.float64())
    ])
    # Both concrete classes are Partitioning subclasses.
    for klass in [ds.DirectoryPartitioning, ds.HivePartitioning]:
        partitioning = klass(schema)
        assert isinstance(partitioning, ds.Partitioning)

    # Directory partitioning: segment order follows the schema.
    partitioning = ds.DirectoryPartitioning(
        pa.schema([
            pa.field('group', pa.int64()),
            pa.field('key', pa.float64())
        ])
    )
    expr = partitioning.parse('/3/3.14')
    assert isinstance(expr, ds.Expression)

    expected = ds.AndExpression(
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('group'),
            ds.ScalarExpression(3)
        ),
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('key'),
            ds.ScalarExpression(3.14)
        )
    )
    assert expr.equals(expected)

    # A path that does not match the schema is rejected.
    with pytest.raises(pa.ArrowInvalid):
        partitioning.parse('/prefix/3/aaa')

    # Hive partitioning: segments are key=value pairs.
    partitioning = ds.HivePartitioning(
        pa.schema([
            pa.field('alpha', pa.int64()),
            pa.field('beta', pa.int64())
        ])
    )
    expr = partitioning.parse('/alpha=0/beta=3')
    expected = ds.AndExpression(
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('alpha'),
            ds.ScalarExpression(0)
        ),
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('beta'),
            ds.ScalarExpression(3)
        )
    )
    assert expr.equals(expected)
def test_filesystem_dataset(mockfs):
    """FileSystemDataset construction, fragment enumeration, row-group
    fragments, and predicate pushdown via row-group statistics."""
    schema = pa.schema([pa.field('const', pa.int64())])
    file_format = ds.ParquetFileFormat()
    paths = ['subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet']
    partitions = [ds.ScalarExpression(True), ds.ScalarExpression(True)]

    dataset = ds.FileSystemDataset(schema, root_partition=None,
                                   file_format=file_format,
                                   filesystem=mockfs,
                                   paths_or_selector=paths,
                                   partitions=partitions)
    assert isinstance(dataset.format, ds.ParquetFileFormat)

    root_partition = ds.ComparisonExpression(
        ds.CompareOperator.Equal,
        ds.FieldExpression('level'),
        ds.ScalarExpression(1337)
    )
    partitions = [
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('part'),
            ds.ScalarExpression(1)
        ),
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('part'),
            ds.ScalarExpression(2)
        ),
    ]
    dataset = ds.FileSystemDataset(paths_or_selector=paths, schema=schema,
                                   root_partition=root_partition,
                                   filesystem=mockfs,
                                   partitions=partitions,
                                   file_format=file_format)
    assert dataset.partition_expression.equals(root_partition)
    assert set(dataset.files) == set(paths)

    # Each fragment carries (root AND file) partition expression and its path.
    fragments = list(dataset.get_fragments())
    for fragment, partition, path in zip(fragments, partitions, paths):
        assert fragment.partition_expression.equals(
            ds.AndExpression(root_partition, partition))
        assert fragment.path == path
        assert isinstance(fragment, ds.ParquetFileFragment)
        # row_groups is unset until fragments are split per row group.
        assert fragment.row_groups is None

        row_group_fragments = list(fragment.get_row_group_fragments())
        assert len(row_group_fragments) == 1
        assert isinstance(fragment, ds.ParquetFileFragment)
        assert row_group_fragments[0].path == path
        assert row_group_fragments[0].row_groups == {0}

    # test predicate pushdown using row group metadata
    fragments = list(dataset.get_fragments(filter=ds.field("const") == 0))
    assert len(fragments) == 2
    assert len(list(fragments[0].get_row_group_fragments())) == 1
    assert len(list(fragments[1].get_row_group_fragments())) == 0
def test_filesystem_dataset_partition_expressions(mockfs):
    """FileSystemDataset exposes root/fragment partition expressions.

    NOTE(review): this function was previously also named
    ``test_filesystem_dataset``, duplicating the sibling test of that name in
    this module; Python redefinition meant only one of the two ever ran under
    pytest. Renamed so both tests are collected and executed.
    """
    schema = pa.schema([])
    file_format = ds.ParquetFileFormat()
    paths = ['subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet']
    partitions = [ds.ScalarExpression(True), ds.ScalarExpression(True)]

    dataset = ds.FileSystemDataset(
        schema,
        root_partition=None,
        file_format=file_format,
        filesystem=mockfs,
        paths_or_selector=paths,
        partitions=partitions
    )
    assert isinstance(dataset.format, ds.ParquetFileFormat)

    root_partition = ds.ComparisonExpression(
        ds.CompareOperator.Equal,
        ds.FieldExpression('level'),
        ds.ScalarExpression(1337)
    )
    partitions = [
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('part'),
            ds.ScalarExpression(1)
        ),
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('part'),
            ds.ScalarExpression(2)
        )
    ]
    dataset = ds.FileSystemDataset(
        paths_or_selector=paths,
        schema=schema,
        root_partition=root_partition,
        filesystem=mockfs,
        partitions=partitions,
        file_format=file_format
    )
    assert dataset.partition_expression.equals(root_partition)
    assert set(dataset.files) == set(paths)

    # Fragment partition expressions are (root AND per-file) conjunctions.
    fragments = list(dataset.get_fragments())
    assert fragments[0].partition_expression.equals(
        ds.AndExpression(root_partition, partitions[0]))
    assert fragments[1].partition_expression.equals(
        ds.AndExpression(root_partition, partitions[1]))
    assert fragments[0].path == paths[0]
    assert fragments[1].path == paths[1]
def test_expression_construction():
    """Construction and basic behavior of every Expression subclass.

    NOTE(review): this function was previously also named
    ``test_expression``, duplicating the sibling test of that name in this
    module; Python redefinition meant only one of the two ever ran under
    pytest. Renamed so both tests are collected and executed.
    """
    a = ds.ScalarExpression(1)
    b = ds.ScalarExpression(1.1)
    c = ds.ScalarExpression(True)
    d = ds.ScalarExpression("string")

    equal = ds.ComparisonExpression(ds.CompareOperator.Equal, a, b)
    assert equal.op() == ds.CompareOperator.Equal

    and_ = ds.AndExpression(a, b)
    assert and_.left_operand.equals(a)
    assert and_.right_operand.equals(b)
    assert and_.equals(ds.AndExpression(a, b))
    assert and_.equals(and_)

    # Construction-only smoke checks (including variadic forms).
    ds.AndExpression(a, b, c)
    ds.OrExpression(a, b)
    ds.OrExpression(a, b, c, d)
    ds.NotExpression(ds.OrExpression(a, b, c))
    ds.IsValidExpression(a)
    ds.CastExpression(a, pa.int32())
    ds.CastExpression(a, pa.int32(), safe=True)
    ds.InExpression(a, pa.array([1, 2, 3]))

    # Validating a comparison against a schema yields a boolean type.
    condition = ds.ComparisonExpression(
        ds.CompareOperator.Greater,
        ds.FieldExpression('i64'),
        ds.ScalarExpression(5)
    )
    schema = pa.schema([
        pa.field('i64', pa.int64()),
        pa.field('f64', pa.float64())
    ])
    assert condition.validate(schema) == pa.bool_()

    # assume() simplifies a condition given a known fact about a field.
    i64_is_5 = ds.ComparisonExpression(
        ds.CompareOperator.Equal,
        ds.FieldExpression('i64'),
        ds.ScalarExpression(5)
    )
    i64_is_7 = ds.ComparisonExpression(
        ds.CompareOperator.Equal,
        ds.FieldExpression('i64'),
        ds.ScalarExpression(7)
    )
    assert condition.assume(i64_is_5).equals(ds.ScalarExpression(False))
    assert condition.assume(i64_is_7).equals(ds.ScalarExpression(True))

    assert str(condition) == "(i64 > 5:int64)"
    assert "(i64 > 5:int64)" in repr(condition)
def test_dataset(dataset):
    """End-to-end scan of the dataset fixture: scan tasks, table
    materialization, and filtered scanning."""
    assert isinstance(dataset, ds.Dataset)
    assert isinstance(dataset.schema, pa.Schema)

    # TODO(kszucs): test non-boolean expressions for filter do raise
    builder = dataset.new_scan()
    assert isinstance(builder, ds.ScannerBuilder)

    scanner = builder.finish()
    assert isinstance(scanner, ds.Scanner)
    assert len(list(scanner.scan())) == 2

    # Every batch of every task holds the same known column contents.
    expected_i64 = pa.array([0, 1, 2, 3, 4], type=pa.int64())
    expected_f64 = pa.array([0, 1, 2, 3, 4], type=pa.float64())
    for task in scanner.scan():
        assert isinstance(task, ds.ScanTask)
        for batch in task.execute():
            assert batch.column(0).equals(expected_i64)
            assert batch.column(1).equals(expected_f64)

    table = scanner.to_table()
    assert isinstance(table, pa.Table)
    assert len(table) == 10

    # Filtered scan: i64 == 1 matches one row per partition.
    condition = ds.ComparisonExpression(
        ds.CompareOperator.Equal,
        ds.FieldExpression('i64'),
        ds.ScalarExpression(1)
    )
    scanner = dataset.new_scan().use_threads(True).filter(condition).finish()
    result = scanner.to_table().to_pydict()

    # don't rely on the scanning order
    assert result['i64'] == [1, 1]
    assert result['f64'] == [1., 1.]
    assert sorted(result['group']) == [1, 2]
    assert sorted(result['key']) == ['xxx', 'yyy']
def test_filter_implicit_cast(tempdir):
    """Filtering an int8 column against an int scalar implicitly casts the
    comparison operands (regression test for ARROW-7652)."""
    table = pa.table({'a': pa.array([0, 1, 2, 3, 4, 5], type=pa.int8())})
    _, path = _create_single_file(tempdir, table)
    dataset = ds.dataset(str(path))

    # a > 2 — the scalar 2 is a plain Python int, the column is int8.
    filter_ = ds.ComparisonExpression(
        ds.CompareOperator.Greater,
        ds.FieldExpression('a'),
        ds.ScalarExpression(2)
    )
    scanner_builder = dataset.new_scan()
    scanner_builder.filter(filter_)
    result = scanner_builder.finish().to_table()

    # Values 3, 4 and 5 pass the predicate.
    assert len(result) == 3