Code example #1
import pyarrow as pa
import pyarrow.dataset as ds
import pytest


def test_partitioning():
    schema = pa.schema(
        [pa.field('i64', pa.int64()),
         pa.field('f64', pa.float64())])
    for klass in [ds.DirectoryPartitioning, ds.HivePartitioning]:
        partitioning = klass(schema)
        assert isinstance(partitioning, ds.Partitioning)

    partitioning = ds.DirectoryPartitioning(
        pa.schema(
            [pa.field('group', pa.int64()),
             pa.field('key', pa.float64())]))
    expr = partitioning.parse('/3/3.14')
    assert isinstance(expr, ds.Expression)

    expected = (ds.field('group') == 3) & (ds.field('key') == 3.14)
    assert expr.equals(expected)

    with pytest.raises(pa.ArrowInvalid):
        partitioning.parse('/prefix/3/aaa')

    partitioning = ds.HivePartitioning(
        pa.schema(
            [pa.field('alpha', pa.int64()),
             pa.field('beta', pa.int64())]))
    expr = partitioning.parse('/alpha=0/beta=3')
    expected = ((ds.field('alpha') == ds.scalar(0)) &
                (ds.field('beta') == ds.scalar(3)))
    assert expr.equals(expected)
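
For reference, these partitioning objects are normally handed to dataset discovery rather than called via parse() directly. A minimal sketch, assuming a hypothetical directory layout under /data (e.g. /data/3/3.14/part-0.parquet):

import pyarrow as pa
import pyarrow.dataset as ds

partitioning = ds.DirectoryPartitioning(
    pa.schema([pa.field('group', pa.int64()),
               pa.field('key', pa.float64())]))
dataset = ds.dataset('/data', format='parquet', partitioning=partitioning)

# Hive-style layouts (/data/alpha=0/beta=3/...) can also be selected by name:
dataset = ds.dataset('/data', format='parquet', partitioning='hive')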
Code example #2
File: test_dataset.py  Project: xiepeini/arrow
import pyarrow as pa
import pyarrow.dataset as ds
import pytest


def test_expression_construction():
    zero = ds.scalar(0)
    one = ds.scalar(1)
    true = ds.scalar(True)
    false = ds.scalar(False)
    string = ds.scalar("string")
    field = ds.field("field")

    # construction via operator overloads should not raise; results unused
    zero | one == string
    ~true == false
    for typ in ("bool", pa.bool_()):
        field.cast(typ) == true

    field.isin([1, 2])

    with pytest.raises(TypeError):
        field.isin(1)

    # operations with non-scalar values
    with pytest.raises(TypeError):
        field == [1]

    with pytest.raises(TypeError):
        field != {1}

    with pytest.raises(TypeError):
        field & [1]

    with pytest.raises(TypeError):
        field | [1]
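
Expressions built this way are typically used as scan filters. A minimal sketch against a temporary Parquet file (the file name and column are illustrative):

import os
import tempfile

import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq

with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, 'data.parquet')
    pq.write_table(pa.table({'i64': [0, 1, 2, 3, 4]}), path)
    dataset = ds.dataset(path, format='parquet')
    # the same operator overloads tested above produce the filter expression
    filtered = dataset.to_table(
        filter=(ds.field('i64') > 1) & (ds.field('i64') < 4))
    assert filtered['i64'].to_pylist() == [2, 3]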
Code example #3
import operator

import pyarrow as pa
import pyarrow.dataset as ds
import pytest


def test_expression_ergonomics():
    zero = ds.scalar(0)
    one = ds.scalar(1)
    true = ds.scalar(True)
    false = ds.scalar(False)
    string = ds.scalar("string")
    field = ds.field("field")

    assert one.equals(ds.ScalarExpression(1))
    assert zero.equals(ds.ScalarExpression(0))
    assert true.equals(ds.ScalarExpression(True))
    assert false.equals(ds.ScalarExpression(False))
    assert string.equals(ds.ScalarExpression("string"))
    assert field.equals(ds.FieldExpression("field"))

    expected = ds.AndExpression(ds.ScalarExpression(1), ds.ScalarExpression(0))
    for expr in [one & zero, 1 & zero, one & 0]:
        assert expr.equals(expected)

    expected = ds.OrExpression(ds.ScalarExpression(1), ds.ScalarExpression(0))
    for expr in [one | zero, 1 | zero, one | 0]:
        assert expr.equals(expected)

    comparison_ops = [
        (operator.eq, ds.CompareOperator.Equal),
        (operator.ne, ds.CompareOperator.NotEqual),
        (operator.ge, ds.CompareOperator.GreaterEqual),
        (operator.le, ds.CompareOperator.LessEqual),
        (operator.lt, ds.CompareOperator.Less),
        (operator.gt, ds.CompareOperator.Greater),
    ]
    for op, compare_op in comparison_ops:
        expr = op(zero, one)
        expected = ds.ComparisonExpression(compare_op, zero, one)
        assert expr.equals(expected)

    expr = ~true == false
    expected = ds.ComparisonExpression(
        ds.CompareOperator.Equal,
        ds.NotExpression(ds.ScalarExpression(True)),
        ds.ScalarExpression(False)
    )
    assert expr.equals(expected)

    for typ in ("bool", pa.bool_()):
        expr = field.cast(typ) == true
        expected = ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.CastExpression(ds.FieldExpression("field"), pa.bool_()),
            ds.ScalarExpression(True)
        )
        assert expr.equals(expected)

    expr = field.isin([1, 2])
    expected = ds.InExpression(ds.FieldExpression("field"), pa.array([1, 2]))
    assert expr.equals(expected)

    with pytest.raises(TypeError):
        field.isin(1)
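
Note that the granular expression classes asserted against here (ds.ScalarExpression, ds.ComparisonExpression, and so on) belong to an older pyarrow API; in recent releases the equivalent expressions are built solely through ds.field / ds.scalar and operator overloads. A rough sketch of the equivalent constructions, assuming a recent pyarrow:

import pyarrow as pa
import pyarrow.dataset as ds

expr = ~ds.scalar(True) == ds.scalar(False)
cast_expr = ds.field('field').cast(pa.bool_()) == ds.scalar(True)
isin_expr = ds.field('field').isin([1, 2])
assert isinstance(expr, ds.Expression)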
Code example #4
File: test_dataset.py  Project: xiepeini/arrow
import pickle

import pyarrow as pa
import pyarrow.dataset as ds


def test_expression_serialization():
    a = ds.scalar(1)
    b = ds.scalar(1.1)
    c = ds.scalar(True)
    d = ds.scalar("string")
    e = ds.scalar(None)

    condition = ds.field('i64') > 5
    schema = pa.schema([
        pa.field('i64', pa.int64()),
        pa.field('f64', pa.float64())
    ])
    assert condition.validate(schema) == pa.bool_()

    assert condition.assume(ds.field('i64') == 5).equals(
        ds.scalar(False))

    assert condition.assume(ds.field('i64') == 7).equals(
        ds.scalar(True))

    all_exprs = [a, b, c, d, e, a == b, a > b, a & b, a | b, ~c,
                 d.is_valid(), a.cast(pa.int32(), safe=False),
                 a.cast(pa.int32(), safe=False), a.isin([1, 2, 3]),
                 ds.field('i64') > 5, ds.field('i64') == 5,
                 ds.field('i64') == 7]
    for expr in all_exprs:
        assert isinstance(expr, ds.Expression)
        restored = pickle.loads(pickle.dumps(expr))
        assert expr.equals(restored)
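
Because expressions survive a pickle round trip, a filter can be built once and shipped to worker processes (for example with multiprocessing) before scanning. A minimal sketch:

import pickle

import pyarrow.dataset as ds

filter_expr = (ds.field('i64') > 5) & ds.field('f64').is_valid()
payload = pickle.dumps(filter_expr)   # safe to send to another process
restored = pickle.loads(payload)
assert restored.equals(filter_expr)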
Code example #5
File: table.py  Project: SpikingNeurons/toolcraft
import operator

import pyarrow.dataset as pds

# Maps filter-tuple operator strings to plain Python operators
# (the leading entries are omitted in this excerpt).
_OP_MAPPER = {
    '!=': operator.ne,
    'and': operator.and_,
    'or': operator.or_,
    'in': lambda _x, _l: _x in _l,
    'not in': lambda _x, _l: _x not in _l,
    # todo: unclear how this should be used with filter tuples that have
    #  three elements
    # '~': lambda _x: ~_x,
    # todo: not sure about this
    # 'not': lambda _x: ~_x,
}

# Do not try to build these operator lambdas by looping over _OP_MAPPER:
# closures created in a loop all capture the loop variable's final value
# (Python's late-binding closure behavior).
_OP_MAPPER_EXP = {
    '=': lambda _x, _y: operator.eq(pds.field(_x), pds.scalar(_y)),
    '==': lambda _x, _y: operator.eq(pds.field(_x), pds.scalar(_y)),
    '<': lambda _x, _y: operator.lt(pds.field(_x), pds.scalar(_y)),
    '>': lambda _x, _y: operator.gt(pds.field(_x), pds.scalar(_y)),
    '<=': lambda _x, _y: operator.le(pds.field(_x), pds.scalar(_y)),
    '>=': lambda _x, _y: operator.ge(pds.field(_x), pds.scalar(_y)),
    '!=': lambda _x, _y: operator.ne(pds.field(_x), pds.scalar(_y)),
    'and': lambda _x, _y: operator.and_(pds.field(_x), pds.scalar(_y)),
    'or': lambda _x, _y: operator.or_(pds.field(_x), pds.scalar(_y)),
    'in': lambda _x, _l: pds.field(_x).isin(_l),
    'not in': lambda _x, _l: operator.inv(pds.field(_x).isin(_l)),
    # todo: unclear how this should be used with filter tuples that have
    #  three elements
    # '~': lambda _x: operator.inv(pds.scalar(_x)),
    # todo: not sure about this
    # 'not': lambda _x: operator.inv(pds.scalar(_x)),
}
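
The warning in the comment above is the classic late-binding closure pitfall. A short standalone demonstration (the names are illustrative):

import operator

ops = {'<': operator.lt, '>': operator.gt}

# Broken: each lambda closes over the loop variable, so after the loop
# every entry calls the last operator (operator.gt).
broken = {sym: (lambda _x, _y: fn(_x, _y)) for sym, fn in ops.items()}
assert broken['<'](2, 1) is True   # actually operator.gt(2, 1)!

# Fixed: bind the current operator as a default argument.
fixed = {sym: (lambda _x, _y, _fn=fn: _fn(_x, _y)) for sym, fn in ops.items()}
assert fixed['<'](1, 2) is True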
Code example #6
File: test_dataset.py  Project: xiepeini/arrow
import pyarrow as pa
import pyarrow.dataset as ds


# `mockfs` and `paths_or_selector` are pytest fixtures defined elsewhere
# in test_dataset.py.
def test_filesystem_factory(mockfs, paths_or_selector):
    format = ds.ParquetFileFormat(
        read_options=ds.ParquetReadOptions(dictionary_columns={"str"})
    )

    options = ds.FileSystemFactoryOptions('subdir')
    options.partitioning = ds.DirectoryPartitioning(
        pa.schema([
            pa.field('group', pa.int32()),
            pa.field('key', pa.string())
        ])
    )
    assert options.partition_base_dir == 'subdir'
    assert options.selector_ignore_prefixes == ['.', '_']
    assert options.exclude_invalid_files is False

    factory = ds.FileSystemDatasetFactory(
        mockfs, paths_or_selector, format, options
    )
    inspected_schema = factory.inspect()

    assert factory.inspect().equals(pa.schema([
        pa.field('i64', pa.int64()),
        pa.field('f64', pa.float64()),
        pa.field('str', pa.dictionary(pa.int32(), pa.string())),
        pa.field('const', pa.int64()),
        pa.field('group', pa.int32()),
        pa.field('key', pa.string()),
    ]), check_metadata=False)

    assert isinstance(factory.inspect_schemas(), list)
    assert isinstance(factory.finish(inspected_schema),
                      ds.FileSystemDataset)
    assert factory.root_partition.equals(ds.scalar(True))

    dataset = factory.finish()
    assert isinstance(dataset, ds.FileSystemDataset)
    assert len(list(dataset.scan())) == 2

    scanner = ds.Scanner.from_dataset(dataset)
    expected_i64 = pa.array([0, 1, 2, 3, 4], type=pa.int64())
    expected_f64 = pa.array([0, 1, 2, 3, 4], type=pa.float64())
    expected_str = pa.DictionaryArray.from_arrays(
        pa.array([0, 1, 2, 3, 4], type=pa.int32()),
        pa.array("0 1 2 3 4".split(), type=pa.string())
    )
    for task, group, key in zip(scanner.scan(), [1, 2], ['xxx', 'yyy']):
        expected_group = pa.array([group] * 5, type=pa.int32())
        expected_key = pa.array([key] * 5, type=pa.string())
        expected_const = pa.array([group - 1] * 5, type=pa.int64())
        for batch in task.execute():
            assert batch.num_columns == 6
            assert batch[0].equals(expected_i64)
            assert batch[1].equals(expected_f64)
            assert batch[2].equals(expected_str)
            assert batch[3].equals(expected_const)
            assert batch[4].equals(expected_group)
            assert batch[5].equals(expected_key)

    table = dataset.to_table()
    assert isinstance(table, pa.Table)
    assert len(table) == 10
    assert table.num_columns == 6
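
The factory plumbing exercised above is what the high-level ds.dataset() entry point drives internally. A sketch of the rough user-level equivalent (the 'subdir' path and field names mirror the test fixtures and are hypothetical outside them):

import pyarrow as pa
import pyarrow.dataset as ds

dataset = ds.dataset(
    'subdir',
    format=ds.ParquetFileFormat(
        read_options=ds.ParquetReadOptions(dictionary_columns={'str'})),
    partitioning=ds.DirectoryPartitioning(
        pa.schema([pa.field('group', pa.int32()),
                   pa.field('key', pa.string())])),
)
table = dataset.to_table()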
Code example #7
File: test_dataset.py  Project: xiepeini/arrow
import pyarrow as pa
import pyarrow.dataset as ds
import pytest


# `mockfs` is a pytest fixture defined elsewhere in test_dataset.py.
def test_filesystem_dataset(mockfs):
    schema = pa.schema([
        pa.field('const', pa.int64())
    ])

    file_format = ds.ParquetFileFormat()

    paths = ['subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet']
    partitions = [ds.scalar(True), ds.scalar(True)]

    dataset = ds.FileSystemDataset(
        schema=schema,
        format=file_format,
        filesystem=mockfs,
        paths_or_selector=paths,
        partitions=partitions
    )

    assert isinstance(dataset.format, ds.ParquetFileFormat)

    # the root_partition and partitions keywords have defaults
    dataset = ds.FileSystemDataset(
        paths, schema, format=file_format, filesystem=mockfs,
    )

    assert isinstance(dataset.format, ds.ParquetFileFormat)

    # validation of required arguments
    with pytest.raises(TypeError, match="incorrect type"):
        ds.FileSystemDataset(paths, format=file_format, filesystem=mockfs)
    with pytest.raises(TypeError, match="incorrect type"):
        ds.FileSystemDataset(paths, schema=schema, filesystem=mockfs)
    with pytest.raises(TypeError, match="incorrect type"):
        ds.FileSystemDataset(paths, schema=schema, format=file_format)
    # validation of root_partition
    with pytest.raises(TypeError, match="incorrect type"):
        ds.FileSystemDataset(paths, schema=schema, format=file_format,
                             filesystem=mockfs, root_partition=1)

    root_partition = ds.field('level') == ds.scalar(1337)
    partitions = [ds.field('part') == x for x in range(1, 3)]
    dataset = ds.FileSystemDataset(
        paths_or_selector=paths,
        schema=schema,
        root_partition=root_partition,
        filesystem=mockfs,
        partitions=partitions,
        format=file_format
    )
    assert dataset.partition_expression.equals(root_partition)
    assert set(dataset.files) == set(paths)

    fragments = list(dataset.get_fragments())
    for fragment, partition, path in zip(fragments, partitions, paths):
        assert fragment.partition_expression.equals(partition)
        assert fragment.path == path
        assert isinstance(fragment.format, ds.ParquetFileFormat)
        assert isinstance(fragment, ds.ParquetFileFragment)
        assert fragment.row_groups is None

        row_group_fragments = list(fragment.get_row_group_fragments())
        assert len(row_group_fragments) == 1
        assert isinstance(fragment, ds.ParquetFileFragment)
        assert row_group_fragments[0].path == path
        assert row_group_fragments[0].row_groups == {0}

    fragments = list(dataset.get_fragments(filter=ds.field("const") == 0))
    assert len(fragments) == 2
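
Fragments can also be read one at a time, which is handy for distributing work. A minimal sketch, assuming a dataset built as above (the filter column is illustrative):

import pyarrow.dataset as ds

for fragment in dataset.get_fragments(filter=ds.field('const') == 0):
    table = fragment.to_table()   # reads only this fragment's file
    print(fragment.path, table.num_rows)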