def test_schema_merge():
    """Exercise ``pa.unify_schemas`` on compatible and conflicting schemas.

    Three schemas with distinct or identically-typed fields are unified and
    compared against the expected combined schema, two schemas that disagree
    on the type of ``foo`` are expected to raise ``pa.ArrowInvalid``, and the
    first check is repeated with a tuple of schemas instead of a list.
    """
    foo_int32 = pa.field('foo', pa.int32())
    bar_string = pa.field('bar', pa.string())
    baz_int8_list = pa.field('baz', pa.list_(pa.int8()))
    qux_bool = pa.field('qux', pa.bool_())
    quux_dict = pa.field('quux', pa.dictionary(pa.int32(), pa.string()))

    wide = pa.schema([foo_int32, bar_string, baz_int8_list])
    narrow = pa.schema([foo_int32, qux_bool])
    dict_only = pa.schema([quux_dict])
    # Same field names as ``narrow`` but 'foo' is int64 rather than int32.
    clashing = pa.schema([pa.field('foo', pa.int64()), qux_bool])

    expected = pa.schema([
        foo_int32,
        bar_string,
        baz_int8_list,
        qux_bool,
        quux_dict,
    ])

    merged = pa.unify_schemas([wide, narrow, dict_only])
    assert merged.equals(expected)

    with pytest.raises(pa.ArrowInvalid):
        pa.unify_schemas([narrow, clashing])

    # ARROW-14002: Try with tuple instead of list
    merged = pa.unify_schemas((wide, narrow, dict_only))
    assert merged.equals(expected)
def _union_dataset(children, schema=None, **kwargs): if any(v is not None for v in kwargs.values()): raise ValueError( "When passing a list of Datasets, you cannot pass any additional " "arguments") if schema is None: # unify the children datasets' schemas schema = pa.unify_schemas([child.schema for child in children]) # create datasets with the requested schema children = [child.replace_schema(schema) for child in children] return UnionDataset(schema, children)