def test_schema():
    """Exercise the basic accessors, repr and input validation of a schema."""
    names = ['foo', 'bar', 'baz']
    types = [pa.int32(), pa.string(), pa.list_(pa.int8())]
    fields = [pa.field(name, typ) for name, typ in zip(names, types)]
    sch = pa.schema(fields)

    assert sch.names == ['foo', 'bar', 'baz']
    assert sch.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]
    assert len(sch) == 3

    # Positional access and lookup by name must both yield the first field.
    first = sch[0]
    assert first.name == 'foo'
    assert first.type == fields[0].type
    by_name = sch.field_by_name('foo')
    assert by_name.name == 'foo'
    assert by_name.type == fields[0].type

    expected_repr = """\
foo: int32
bar: string
baz: list<item: int8>
  child 0, item: int8"""
    assert repr(sch) == expected_repr

    # A schema cannot be built from something that is not a field.
    with pytest.raises(TypeError):
        pa.schema([None])
def test_timestamps_notimezone_nulls(self):
    """Round-trip tz-naive timestamp columns with nulls at ms and ns unit."""
    cases = [
        # (unit, raw values, timestamps_to_ms flag)
        ('ms',
         ['2007-07-13T01:23:34.123', None, '2010-08-13T05:46:57.437'],
         True),
        ('ns',
         ['2007-07-13T01:23:34.123456789', None,
          '2010-08-13T05:46:57.437699912'],
         False),
    ]
    for unit, raw, to_ms in cases:
        column = np.array(raw, dtype='datetime64[{}]'.format(unit))
        df = pd.DataFrame({'datetime64': column})
        schema = pa.schema([pa.field('datetime64', pa.timestamp(unit))])
        self._check_pandas_roundtrip(
            df,
            timestamps_to_ms=to_ms,
            expected_schema=schema,
        )
def test_custom_nulls(self):
    """Check type inference when the set of null spellings is customized."""
    # Custom null spellings: only 'Xxx' and 'Zzz' count as null.
    custom = ConvertOptions(null_values=['Xxx', 'Zzz'])
    table = self.read_bytes(b"a,b,c,d\nZzz,Xxx,1,2\nXxx,#N/A,,Zzz\n",
                            convert_options=custom)
    expected_schema = pa.schema([
        ('a', pa.null()),
        ('b', pa.string()),
        ('c', pa.string()),
        ('d', pa.int64()),
    ])
    assert table.schema == expected_schema
    assert table.to_pydict() == {
        'a': [None, None],
        'b': [u"Xxx", u"#N/A"],
        'c': [u"1", u""],
        'd': [2, None],
    }

    # No null spellings at all: everything is read back as a string.
    no_nulls = ConvertOptions(null_values=[])
    table = self.read_bytes(b"a,b\n#N/A,\n", convert_options=no_nulls)
    expected_schema = pa.schema([('a', pa.string()), ('b', pa.string())])
    assert table.schema == expected_schema
    assert table.to_pydict() == {
        'a': [u"#N/A"],
        'b': [u""],
    }
def test_schema_equals_propagates_check_metadata():
    """ARROW-4088: ``check_metadata`` must reach the field comparison."""
    foo = pa.field('foo', pa.int32())
    plain_bar = pa.field('bar', pa.string())
    annotated_bar = pa.field('bar', pa.string(), metadata={'a': 'alpha'})

    without_metadata = pa.schema([foo, plain_bar])
    with_metadata = pa.schema([foo, annotated_bar])

    # Field metadata differs, so the default comparison fails ...
    assert not without_metadata.equals(with_metadata)
    # ... but succeeds once metadata is explicitly ignored.
    assert without_metadata.equals(with_metadata, check_metadata=False)
def test_schema_equals():
    """Schemas built from equal fields compare equal; a shorter one does not."""
    fields = [
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8())),
    ]
    full_a = pa.schema(fields)
    full_b = pa.schema(fields)
    assert full_a.equals(full_b)

    # Drop the trailing field in place and rebuild.
    fields.pop()
    shorter = pa.schema(fields)
    assert not full_a.equals(shorter)
def test_table_from_pydict():
    """Build tables from dicts of arrays, chunked arrays and plain lists."""

    def check_shape_and_schema(tbl, expected_schema):
        # Every non-empty case below has two columns of three rows.
        assert tbl.num_columns == 2
        assert tbl.num_rows == 3
        assert tbl.schema == expected_schema

    empty = pa.Table.from_pydict({})
    assert empty.num_columns == 0
    assert empty.num_rows == 0
    assert empty.schema == pa.schema([])
    assert empty.to_pydict() == {}

    # With arrays as values
    data = OrderedDict([
        ('strs', pa.array([u'', u'foo', u'bar'])),
        ('floats', pa.array([4.5, 5, None])),
    ])
    schema = pa.schema([('strs', pa.utf8()), ('floats', pa.float64())])
    check_shape_and_schema(pa.Table.from_pydict(data), schema)

    # With chunked arrays as values
    data = OrderedDict([
        ('strs', pa.chunked_array([[u''], [u'foo', u'bar']])),
        ('floats', pa.chunked_array([[4.5], [5, None]])),
    ])
    check_shape_and_schema(pa.Table.from_pydict(data), schema)

    # With lists as values
    data = OrderedDict([
        ('strs', [u'', u'foo', u'bar']),
        ('floats', [4.5, 5, None]),
    ])
    table = pa.Table.from_pydict(data)
    check_shape_and_schema(table, schema)
    assert table.to_pydict() == data

    # With metadata and inferred schema
    metadata = {b'foo': b'bar'}
    schema = schema.add_metadata(metadata)
    inferred = pa.Table.from_pydict(data, metadata=metadata)
    # With explicit schema
    explicit = pa.Table.from_pydict(data, schema=schema)
    for table in (inferred, explicit):
        assert table.schema == schema
        assert table.schema.metadata == metadata
        assert table.to_pydict() == data

    # Cannot pass both schema and metadata
    with pytest.raises(ValueError):
        pa.Table.from_pydict(data, schema=schema, metadata=metadata)
def test_type_schema_pickling():
    """Data types, fields and schemas must survive a pickle round trip."""
    struct_type = pa.struct([
        pa.field('a', 'int8'),
        pa.field('b', 'string'),
    ])
    cases = [
        pa.int8(),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.string()),
        struct_type,
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.date64(),
        pa.timestamp('ms'),
        pa.timestamp('ns'),
        pa.decimal(12, 2),
        pa.field('a', 'string', metadata={b'foo': b'bar'}),
    ]

    for original in cases:
        restored = pickle.loads(pickle.dumps(original))
        assert original == restored

    # Wrap bare types in generated fields; keep real fields as they are.
    fields = [
        case if isinstance(case, pa.Field)
        else pa.field('_f{}'.format(position), case)
        for position, case in enumerate(cases)
    ]

    schema = pa.schema(fields, metadata={b'foo': b'bar'})
    restored = pickle.loads(pickle.dumps(schema))
    assert schema == restored
def test_float_nulls(self):
    """Masked float arrays must come back from Arrow with NaN for nulls."""
    num_values = 100
    null_mask = np.random.randint(0, 10, size=num_values) < 3

    names = ['f4', 'f8']
    arrow_types = {'f4': pa.float32(), 'f8': pa.float64()}

    arrays, fields, expected_cols = [], [], []
    for name in names:
        values = np.random.randn(num_values).astype(name)
        arrays.append(pa.array(values, from_pandas=True, mask=null_mask))
        fields.append(pa.field(name, arrow_types[name]))

        # The pandas side represents the masked slots as NaN.
        values[null_mask] = np.nan
        expected_cols.append(values)

    expected_frame = pd.DataFrame(dict(zip(names, expected_cols)),
                                  columns=names)

    table = pa.Table.from_arrays(arrays, names)
    assert table.schema.equals(pa.schema(fields))
    tm.assert_frame_equal(table.to_pandas(), expected_frame)
def test_table_safe_casting():
    """Casting a table to a narrower schema works when no value is lost."""
    column_names = tuple('abcd')
    strings = ['ab', 'bc', 'cd', 'de', 'ef']

    source = pa.Table.from_arrays([
        pa.array(range(5), type=pa.int64()),
        pa.array([-10, -5, 0, 5, 10], type=pa.int32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()),
        pa.array(strings, type=pa.string()),
    ], names=column_names)

    expected = pa.Table.from_arrays([
        pa.array(range(5), type=pa.int32()),
        pa.array([-10, -5, 0, 5, 10], type=pa.int16()),
        pa.array([1, 2, 3, 4, 5], type=pa.int64()),
        pa.array(strings, type=pa.string()),
    ], names=column_names)

    target_types = [pa.int32(), pa.int16(), pa.int64(), pa.string()]
    target_schema = pa.schema([
        pa.field(name, typ) for name, typ in zip(column_names, target_types)
    ])

    assert source.cast(target_schema).equals(expected)
def test_orcfile_empty():
    """Reading the empty ORC example yields zero rows and the full schema."""
    from pyarrow import orc

    orc_file = orc.ORCFile(path_for_orc_example('TestOrcFile.emptyFile'))
    table = orc_file.read()
    assert table.num_rows == 0

    def inner_struct():
        # The same (int1, string1) struct appears at three nesting sites.
        return pa.struct([
            ('int1', pa.int32()),
            ('string1', pa.string()),
        ])

    map_entry = pa.struct([
        ('key', pa.string()),
        ('value', inner_struct()),
    ])
    expected_schema = pa.schema([
        ('boolean1', pa.bool_()),
        ('byte1', pa.int8()),
        ('short1', pa.int16()),
        ('int1', pa.int32()),
        ('long1', pa.int64()),
        ('float1', pa.float32()),
        ('double1', pa.float64()),
        ('bytes1', pa.binary()),
        ('string1', pa.string()),
        ('middle', pa.struct([('list', pa.list_(inner_struct()))])),
        ('list', pa.list_(inner_struct())),
        ('map', pa.list_(map_entry)),
    ])
    assert table.schema == expected_schema
def make_recordbatch(length):
    """Return a two-column int16 record batch of random values.

    Columns are named ``f0`` and ``f1``; each holds ``length`` integers drawn
    from ``np.random.randint(0, 255)``.
    """
    column_names = ('f0', 'f1')
    schema = pa.schema([pa.field(name, pa.int16()) for name in column_names])
    columns = [
        pa.array(np.random.randint(0, 255, size=length, dtype=np.int16))
        for _ in column_names
    ]
    return pa.RecordBatch.from_arrays(columns, schema)
def test_recordbatch_basics():
    """Check size accessors, dict conversion and explicit-schema creation."""
    data = [
        pa.array(range(5)),
        pa.array([-10, -5, 0, 5, 10]),
    ]

    batch = pa.RecordBatch.from_arrays(data, ['c0', 'c1'])
    assert not batch.schema.metadata
    assert len(batch) == 5
    assert batch.num_rows == 5
    assert batch.num_columns == len(data)

    expected_dict = OrderedDict([
        ('c0', [0, 1, 2, 3, 4]),
        ('c1', [-10, -5, 0, 5, 10]),
    ])
    assert batch.to_pydict() == expected_dict

    with pytest.raises(IndexError):
        # bounds checking
        batch[2]

    # Schema passed explicitly
    schema = pa.schema(
        [pa.field('c0', pa.int16()), pa.field('c1', pa.int32())],
        metadata={b'foo': b'bar'},
    )
    batch = pa.RecordBatch.from_arrays(data, schema)
    assert batch.schema == schema
def test_table_unsafe_casting():
    """A lossy float->int cast is rejected unless ``safe=False`` is given."""
    column_names = tuple('abcd')
    strings = ['ab', 'bc', 'cd', 'de', 'ef']

    source = pa.Table.from_arrays([
        pa.array(range(5), type=pa.int64()),
        pa.array([-10, -5, 0, 5, 10], type=pa.int32()),
        pa.array([1.1, 2.2, 3.3, 4.4, 5.5], type=pa.float64()),
        pa.array(strings, type=pa.string()),
    ], names=column_names)

    expected = pa.Table.from_arrays([
        pa.array(range(5), type=pa.int32()),
        pa.array([-10, -5, 0, 5, 10], type=pa.int16()),
        pa.array([1, 2, 3, 4, 5], type=pa.int64()),
        pa.array(strings, type=pa.string()),
    ], names=column_names)

    target_types = [pa.int32(), pa.int16(), pa.int64(), pa.string()]
    target_schema = pa.schema([
        pa.field(name, typ) for name, typ in zip(column_names, target_types)
    ])

    # Column 'c' has fractional parts, so the default (safe) cast must fail.
    with pytest.raises(pa.ArrowInvalid,
                       match='Floating point value truncated'):
        source.cast(target_schema)

    truncated = source.cast(target_schema, safe=False)
    assert truncated.equals(expected)
def test_int_object_nulls(self):
    """An object column of ints and None round-trips as an int64 field."""
    raw = np.array([None, 1, np.int64(3)] * 5, dtype=object)
    frame = pd.DataFrame({'ints': raw})
    numeric_frame = pd.DataFrame({'ints': pd.to_numeric(raw)})
    expected_schema = pa.schema([pa.field('ints', pa.int64())])
    self._check_pandas_roundtrip(frame, expected=numeric_frame,
                                 expected_schema=expected_schema)
def test_unicode(self):
    """Unicode strings mixed with None/NaN round-trip as a string column."""
    repeats = 1000
    values = [u'foo', None, u'bar', u'mañana', np.nan]
    frame = pd.DataFrame({'strings': values * repeats})
    expected_schema = pa.schema([pa.field('strings', pa.string())])
    self._check_pandas_roundtrip(frame, expected_schema=expected_schema)
def test_empty_table():
    """``Schema.empty_table`` returns a zero-row Table with that schema."""
    schema = pa.schema([pa.field('oneField', pa.int64())])
    empty = schema.empty_table()

    assert isinstance(empty, pa.Table)
    assert empty.num_rows == 0
    assert empty.schema == schema
def test_boolean_no_nulls(self):
    """A null-free boolean column round-trips as an Arrow bool field."""
    num_values = 100

    # Fixed seed so the generated column is reproducible.
    np.random.seed(0)
    bools = np.random.randn(num_values) > 0

    frame = pd.DataFrame({'bools': bools})
    expected_schema = pa.schema([pa.field('bools', pa.bool_())])
    self._check_pandas_roundtrip(frame, expected_schema=expected_schema)
def test_schema_from_tuples():
    """A schema can be built from ``(name, type)`` tuples."""
    pairs = [
        ('foo', pa.int32()),
        ('bar', pa.string()),
        ('baz', pa.list_(pa.int8())),
    ]
    sch = pa.schema(pairs)

    assert sch.names == ['foo', 'bar', 'baz']
    assert sch.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]
    assert len(sch) == 3

    expected_repr = """\
foo: int32
bar: string
baz: list<item: int8>
  child 0, item: int8"""
    assert repr(sch) == expected_repr

    # The type slot of a tuple must be an actual data type.
    with pytest.raises(TypeError):
        pa.schema([('foo', None)])
def test_fixed_size_bytes(self):
    """Equal-length byte strings convert to ``binary(3)`` and back."""
    values = [b'foo', None, b'bar', None, None, b'hey']
    frame = pd.DataFrame({'strings': values})
    schema = pa.schema([pa.field('strings', pa.binary(3))])

    table = pa.Table.from_pandas(frame, schema=schema)
    actual_field, wanted_field = table.schema[0], schema[0]
    assert actual_field.type == wanted_field.type
    assert actual_field.name == wanted_field.name

    tm.assert_frame_equal(table.to_pandas(), frame)
def test_table_from_batches_and_schema():
    """``Table.from_batches`` honours an explicit schema and rejects mismatches."""
    schema = pa.schema([
        pa.field('a', pa.int64()),
        pa.field('b', pa.float64()),
    ])
    ints = pa.array([1])
    floats = pa.array([3.14])
    batch = pa.RecordBatch.from_arrays([ints, floats], names=['a', 'b'])

    table = pa.Table.from_batches([batch], schema)
    assert table.schema.equals(schema)
    assert table.column(0) == pa.column('a', pa.array([1]))
    assert table.column(1) == pa.column('b', pa.array([3.14]))

    # A schema with fewer fields than the batch is rejected ...
    one_field_schema = pa.schema([pa.field('a', pa.int64())])
    with pytest.raises(pa.ArrowInvalid):
        pa.Table.from_batches([batch], one_field_schema)

    # ... and so is a batch with fewer columns than the schema.
    one_column_batch = pa.RecordBatch.from_arrays([pa.array([1])], ['a'])
    with pytest.raises(pa.ArrowInvalid):
        pa.Table.from_batches([one_column_batch], schema)
def test_table_str_to_categorical(self):
    """``strings_to_categorical=True`` yields a pandas Categorical column."""
    values = [None, 'a', 'b', np.nan]
    frame = pd.DataFrame({'strings': values})
    schema = pa.schema([pa.field('strings', pa.string())])

    table = pa.Table.from_pandas(frame, schema=schema)
    converted = table.to_pandas(strings_to_categorical=True)

    wanted = pd.DataFrame({'strings': pd.Categorical(values)})
    tm.assert_frame_equal(converted, wanted, check_dtype=True)
def dataframe_with_arrays(include_index=False):
    """
    Dataframe with numpy array columns of every possible primitive type.

    Parameters
    ----------
    include_index : bool, default False
        When true, an ``__index_level_0__`` int64 field is appended to the
        returned schema.

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    numeric_types = [
        ('i1', pa.int8()), ('i2', pa.int16()),
        ('i4', pa.int32()), ('i8', pa.int64()),
        ('u1', pa.uint8()), ('u2', pa.uint16()),
        ('u4', pa.uint32()), ('u8', pa.uint64()),
        ('f4', pa.float32()), ('f8', pa.float64()),
    ]

    columns = OrderedDict()
    fields = []

    # One list-typed column per numeric dtype, each with a null entry.
    for code, arrow_type in numeric_types:
        fields.append(pa.field(code, pa.list_(arrow_type)))
        columns[code] = [
            np.arange(length, dtype=code) if length is not None else None
            for length in (10, 5, None, 1)
        ]

    fields.append(pa.field('str', pa.list_(pa.string())))
    columns['str'] = [
        np.array([u"1", u"ä"], dtype="object"),
        None,
        np.array([u"1"], dtype="object"),
        np.array([u"1", u"2", u"3"], dtype="object"),
    ]

    fields.append(pa.field('datetime64', pa.list_(pa.timestamp('ms'))))
    first_stamps = np.array(
        ['2007-07-13T01:23:34.123456789', None,
         '2010-08-13T05:46:57.437699912'],
        dtype='datetime64[ms]')
    last_stamps = np.array(
        ['2007-07-13T02', None, '2010-08-13T05:46:57.437699912'],
        dtype='datetime64[ms]')
    columns['datetime64'] = [first_stamps, None, None, last_stamps]

    if include_index:
        fields.append(pa.field('__index_level_0__', pa.int64()))

    return pd.DataFrame(columns), pa.schema(fields)
def test_decimal_128_from_pandas(self):
    """26-digit decimals are inferred as ``decimal(26, 11)``."""
    values = [
        decimal.Decimal('394092382910493.12341234678'),
        -decimal.Decimal('314292388910493.12343437128'),
    ]
    frame = pd.DataFrame({'decimals': values})
    table = pa.Table.from_pandas(frame, preserve_index=False)

    wanted = pa.schema([pa.field('decimals', pa.decimal(26, 11))])
    assert table.schema.equals(wanted)
def test_decimal_64_from_pandas(self):
    """12-digit decimals are inferred as ``decimal(12, 6)``."""
    values = [
        decimal.Decimal('-129934.123331'),
        decimal.Decimal('129534.123731'),
    ]
    frame = pd.DataFrame({'decimals': values})
    table = pa.Table.from_pandas(frame, preserve_index=False)

    wanted = pa.schema([pa.field('decimals', pa.decimal(12, 6))])
    assert table.schema.equals(wanted)
def test_decimal_32_from_pandas(self):
    """7-digit decimals are inferred as ``decimal128(7, 3)``."""
    values = [
        decimal.Decimal('-1234.123'),
        decimal.Decimal('1234.439'),
    ]
    frame = pd.DataFrame({'decimals': values})
    table = pa.Table.from_pandas(frame, preserve_index=False)

    wanted = pa.schema([pa.field('decimals', pa.decimal128(7, 3))])
    assert table.schema.equals(wanted)
def test_convert_options():
    """Check defaults, setters and constructor keywords of ConvertOptions."""
    cls = ConvertOptions
    opts = cls()

    # Boolean flags: verify the default, flip it, verify again.
    assert opts.check_utf8 is True
    opts.check_utf8 = False
    assert opts.check_utf8 is False

    assert opts.strings_can_be_null is False
    opts.strings_can_be_null = True
    assert opts.strings_can_be_null is True

    assert opts.column_types == {}
    # Pass column_types as mapping
    opts.column_types = {'b': pa.int16(), 'c': pa.float32()}
    assert opts.column_types == {'b': pa.int16(), 'c': pa.float32()}
    opts.column_types = {'v': 'int16', 'w': 'null'}
    assert opts.column_types == {'v': pa.int16(), 'w': pa.null()}
    # Pass column_types as schema
    opts.column_types = pa.schema([('a', pa.int32()), ('b', pa.string())])
    assert opts.column_types == {'a': pa.int32(), 'b': pa.string()}
    # Pass column_types as sequence
    opts.column_types = [('x', pa.binary())]
    assert opts.column_types == {'x': pa.binary()}

    with pytest.raises(TypeError, match='DataType expected'):
        opts.column_types = {'a': None}
    with pytest.raises(TypeError):
        opts.column_types = 0

    null_values = opts.null_values
    assert isinstance(null_values, list)
    assert '' in null_values
    assert 'N/A' in null_values
    opts.null_values = ['xxx', 'yyy']
    assert opts.null_values == ['xxx', 'yyy']

    # true_values and false_values share the same list-valued contract.
    for attr in ('true_values', 'false_values'):
        assert isinstance(getattr(opts, attr), list)
        setattr(opts, attr, ['xxx', 'yyy'])
        assert getattr(opts, attr) == ['xxx', 'yyy']

    # Every option can also be supplied through the constructor.
    opts = cls(
        check_utf8=False,
        column_types={'a': pa.null()},
        null_values=['N', 'nn'],
        true_values=['T', 'tt'],
        false_values=['F', 'ff'],
        strings_can_be_null=True,
    )
    assert opts.check_utf8 is False
    assert opts.column_types == {'a': pa.null()}
    assert opts.null_values == ['N', 'nn']
    assert opts.false_values == ['F', 'ff']
    assert opts.true_values == ['T', 'tt']
    assert opts.strings_can_be_null is True
def test_simple_timestamps(self):
    """Date-like strings are inferred as a second-resolution timestamp."""
    # Column 'a' holds bare years (integers); column 'b' holds ISO dates.
    csv_bytes = b"a,b\n1970,1970-01-01\n1989,1989-07-14\n"
    table = self.read_bytes(csv_bytes)

    wanted_schema = pa.schema([
        ('a', pa.int64()),
        ('b', pa.timestamp('s')),
    ])
    assert table.schema == wanted_schema

    wanted_values = {
        'a': [1970, 1989],
        'b': [datetime(1970, 1, 1), datetime(1989, 7, 14)],
    }
    assert table.to_pydict() == wanted_values
def test_table_cast_to_incompatible_schema():
    """Casting to a schema with different field names raises ValueError."""
    table = pa.Table.from_arrays(
        [pa.array(range(5)), pa.array([-10, -5, 0, 5, 10])],
        names=tuple('ab'),
    )

    # First field name differs only by case.
    renamed = pa.schema([
        pa.field('A', pa.int32()),
        pa.field('b', pa.int16()),
    ])
    # One field is missing altogether.
    truncated = pa.schema([
        pa.field('a', pa.int32()),
    ])

    message = ("Target schema's field names are not matching the table's "
               "field names:.*")
    for bad_schema in (renamed, truncated):
        with pytest.raises(ValueError, match=message):
            table.cast(bad_schema)
def test_list_metadata(self):
    """Pandas metadata describes a list column as ``list[int64]``."""
    frame = pd.DataFrame({'data': [[1], [2, 3, 4], [5] * 7]})
    schema = pa.schema([pa.field('data', type=pa.list_(pa.int64()))])
    table = pa.Table.from_pandas(frame, schema=schema)

    pandas_blob = table.schema.metadata[b'pandas']
    assert b'mixed' not in pandas_blob

    decoded = json.loads(pandas_blob.decode('utf8'))
    column_info = decoded['columns'][0]
    assert column_info['pandas_type'] == 'list[int64]'
    assert column_info['numpy_type'] == 'object'
def test_partial_schema(self):
    """Columns missing from a partial schema are still type-inferred."""
    frame = pd.DataFrame(OrderedDict([
        ('a', [0, 1, 2, 3, 4]),
        ('b', np.array([-10, -5, 0, 5, 10], dtype=np.int32)),
        ('c', [-10, -5, 0, 5, 10]),
    ]))

    field_a = pa.field('a', pa.int64())
    field_b = pa.field('b', pa.int32())
    field_c = pa.field('c', pa.int64())

    # Only 'a' and 'b' are specified; 'c' has to be inferred.
    partial_schema = pa.schema([field_a, field_b])
    expected_schema = pa.schema([field_a, field_b, field_c])

    self._check_pandas_roundtrip(frame, schema=partial_schema,
                                 expected_schema=expected_schema)
def test_fixed_size_bytes_does_not_accept_varying_lengths(self):
    """A two-byte value cannot be stored in a ``binary(3)`` column."""
    # b'ba' is one byte short of the fixed width.
    values = [b'foo', None, b'ba', None, None, b'hey']
    frame = pd.DataFrame({'strings': values})
    fixed_width_schema = pa.schema([pa.field('strings', pa.binary(3))])

    with self.assertRaises(pa.ArrowInvalid):
        pa.Table.from_pandas(frame, schema=fixed_width_schema)
def test_column_selection(tempdir):
    """Select ORC columns by name, by dotted path and by field index."""
    from pyarrow import orc

    # create a table with nested types
    inner = pa.field('inner', pa.int64())
    middle = pa.field('middle', pa.struct([inner]))
    list_field = pa.field('list', pa.list_(pa.field('item', pa.int32())))
    struct_field = pa.field(
        'struct', pa.struct([middle, pa.field('inner2', pa.int64())]))
    list_struct_item = pa.field(
        'item',
        pa.struct([
            pa.field('inner1', pa.int64()),
            pa.field('inner2', pa.int64()),
        ]))
    list_struct_field = pa.field('list-struct', pa.list_(list_struct_item))
    fields = [
        pa.field('basic', pa.int32()),
        list_field,
        struct_field,
        list_struct_field,
        pa.field('basic2', pa.int64()),
    ]
    arrs = [
        [0],
        [[1, 2]],
        [{"middle": {"inner": 3}, "inner2": 4}],
        [[{"inner1": 5, "inner2": 6}, {"inner1": 7, "inner2": 8}]],
        [9],
    ]
    table = pa.table(arrs, schema=pa.schema(fields))

    path = str(tempdir / 'test.orc')
    orc.write_table(table, path)
    orc_file = orc.ORCFile(path)

    # default selecting all columns
    assert orc_file.read().equals(table)

    # selecting with columns names
    for names in (["basic", "basic2"], ["list", "struct", "basic2"]):
        assert orc_file.read(columns=names).equals(table.select(names))

    # using dotted paths
    dotted_cases = [
        (["struct.middle.inner"],
         pa.table({"struct": [{"middle": {"inner": 3}}]})),
        (["struct.inner2"],
         pa.table({"struct": [{"inner2": 4}]})),
        (["list", "struct.middle.inner", "struct.inner2"],
         table.select(["list", "struct"])),
        (["list-struct.inner1"],
         pa.table({"list-struct": [[{"inner1": 5}, {"inner1": 7}]]})),
    ]
    for columns, expected in dotted_cases:
        assert orc_file.read(columns=columns).equals(expected)

    # selecting with (Arrow-based) field indices
    index_cases = [
        ([0, 4], ["basic", "basic2"]),
        ([1, 2, 3], ["list", "struct", "list-struct"]),
    ]
    for indices, names in index_cases:
        assert orc_file.read(columns=indices).equals(table.select(names))

    # error on non-existing name or index
    with pytest.raises(IOError):
        # liborc returns ParseError, which gets translated into IOError
        # instead of ValueError
        orc_file.read(columns=["wrong"])

    with pytest.raises(ValueError):
        orc_file.read(columns=[5])
def test_schema_to_string_with_metadata(): lorem = """\ Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla accumsan vel turpis et mollis. Aliquam tincidunt arcu id tortor blandit blandit. Donec eget leo quis lectus scelerisque varius. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Praesent faucibus, diam eu volutpat iaculis, tellus est porta ligula, a efficitur turpis nulla facilisis quam. Aliquam vitae lorem erat. Proin a dolor ac libero dignissim mollis vitae eu mauris. Quisque posuere tellus vitae massa pellentesque sagittis. Aenean feugiat, diam ac dignissim fermentum, lorem sapien commodo massa, vel volutpat orci nisi eu justo. Nulla non blandit sapien. Quisque pretium vestibulum urna eu vehicula.""" # ARROW-7063 my_schema = pa.schema([pa.field("foo", "int32", False, metadata={"key1": "value1"}), pa.field("bar", "string", True, metadata={"key3": "value3"})], metadata={"lorem": lorem}) assert my_schema.to_string() == """\ foo: int32 not null -- field metadata -- key1: 'value1' bar: string -- field metadata -- key3: 'value3' -- schema metadata -- lorem: '""" + lorem[:65] + "' + " + str(len(lorem) - 65) # Metadata that exactly fits result = pa.schema([('f0', 'int32')], metadata={'key': 'value' + 'x' * 62}).to_string() assert result == """\ f0: int32 -- schema metadata -- key: 'valuexxxxxxxxxxxxxxxxxxxxxxxxxxxxx\ xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'""" assert my_schema.to_string(truncate_metadata=False) == """\ foo: int32 not null -- field metadata -- key1: 'value1' bar: string -- field metadata -- key3: 'value3' -- schema metadata -- lorem: '{}'""".format(lorem) assert my_schema.to_string(truncate_metadata=False, show_field_metadata=False) == """\ foo: int32 not null bar: string -- schema metadata -- lorem: '{}'""".format(lorem) assert my_schema.to_string(truncate_metadata=False, show_schema_metadata=False) == """\ foo: int32 not null -- field metadata -- key1: 'value1' bar: string -- field metadata -- key3: 
'value3'""" assert my_schema.to_string(truncate_metadata=False, show_field_metadata=False, show_schema_metadata=False) == """\
def list_flights(self, context, criteria):
    """Yield one FlightInfo for ``/foo``, then fail with an internal error.

    ``context`` and ``criteria`` are accepted but not used.
    """
    descriptor = flight.FlightDescriptor.for_path('/foo')
    empty_schema = pa.schema([])
    # No endpoints; -1 for both trailing count arguments.
    yield flight.FlightInfo(empty_schema, descriptor, [], -1, -1)
    raise flight.FlightInternalError("foo")
def list_flights(self, context, criteria):
    """Yield one FlightInfo for ``/foo`` only when ``criteria`` matches.

    Nothing is yielded unless ``criteria`` equals ``self.CRITERIA``;
    ``context`` is not used.
    """
    if criteria != self.CRITERIA:
        return
    descriptor = flight.FlightDescriptor.for_path('/foo')
    empty_schema = pa.schema([])
    yield flight.FlightInfo(empty_schema, descriptor, [], -1, -1)
PQ_TYPES = { 'flow_id': pa.int64(), 'net_src': pa.string(), 'ip_src': pa.string(), 'ip_dst': pa.string(), 'port_src': pa.int64(), 'port_dst': pa.int64(), 'tcp_flags_0': pa.int64(), 'start_time': pa.int64(), 'end_time': pa.int64(), 'duration': pa.int64(), 'proto': pa.int64(), 'total_byte_cnt_0': pa.int64(), 'total_pkt_cnt_0': pa.int64() } PQ_SCHEMA = pa.schema([pa.field(it, PQ_TYPES[it]) for it in PQ_TYPES.keys()]) def timestamps_to_index_dates(start_ts, end_ts, margin=600): def ts_to_day_start(ts): return int(time.mktime(datetime.fromtimestamp(ts).date().timetuple())) start_day_ts = ts_to_day_start(start_ts - margin) end_day_ts = ts_to_day_start(end_ts + margin) gen = xrange(start_day_ts, end_day_ts + DAY_SECS, DAY_SECS) return [datetime.fromtimestamp(it).strftime('%y%m%d00') for it in gen] def get_flow_batch(start_ts, end_ts): es_client = Elasticsearch(ES_CONF, **ES_OPTS) indices = ','.join([
+ bq_attr_fields + [ f"{ds.GetRasterBand(iband + 1).GetDescription()}:{outtype}" for iband in range(ds.RasterCount) ] ) ) # create parquet schema fields = [ ("geography", pa.string()), ("geography_polygon", pa.string()) ] \ + pq_attr_fields \ + [(ds.GetRasterBand(iband + 1).GetDescription(), pq_type) for iband in range(ds.RasterCount)] pq_schema = pa.schema(fields) # Pandas dataframe column names columns = [field_name for (field_name, field_type) in fields] # process the file only when the output file specified if len(sys.argv) <= 2: exit(0) outfile = sys.argv[2] # prepare transfrom from pixel to raster coordinates gt = ds.GetGeoTransform() hw = (0.5 * gt[1]) hh = (0.5 * -gt[5]) with pq.ParquetWriter(outfile, pq_schema) as writer:
def initialize_write(
    df,
    fs,
    path,
    append=False,
    partition_on=None,
    ignore_divisions=False,
    division_info=None,
    schema=None,
    index_cols=None,
    **kwargs,
):
    """Resolve the Arrow schema and validate an append before writing.

    Parameters
    ----------
    df
        Collection to be written. The code reads ``_meta_nonempty``,
        ``columns``, ``dtypes``, ``npartitions`` and ``to_delayed()`` from it
        (presumably a dask DataFrame - confirm against callers).
    fs
        Filesystem object; ``fs.mkdirs(path, exist_ok=True)`` is called and it
        is handed to ``pq.ParquetDataset``.
    path
        Target dataset directory.
    append : bool
        Try to append to an existing dataset at ``path``. Silently falls back
        to a fresh write when opening the dataset raises ``IOError``,
        ``ValueError`` or ``IndexError``.
    partition_on : sequence of str, optional
        Columns excluded from the appended-columns comparison. ``None`` is
        treated as "no partition columns".
    ignore_divisions : bool
        Skip the check that new divisions do not overlap existing ones.
        Forced to ``True`` when appending without ``division_info`` or when
        the division column is not stored in the existing dataset.
    division_info : dict, optional
        Mapping with keys ``"name"`` and ``"divisions"``.
    schema : "infer", dict, pyarrow.Schema or None
        ``"infer"`` derives the schema from ``df``; a dict overrides individual
        fields of the inferred schema; anything else is returned unchanged.
    index_cols : optional
        Columns set as index on ``_meta_nonempty`` before schema inference.
    **kwargs
        Accepted and ignored.

    Returns
    -------
    (fmd, schema, i_offset)
        ``fmd`` is the existing dataset metadata (``None`` unless appending),
        ``schema`` the resolved schema, ``i_offset`` the number of pieces
        already present in the dataset (0 unless appending).

    Raises
    ------
    NotImplementedError
        Appending with division checks to a dataset without ``_metadata``.
    ValueError
        Appended columns or dtypes differ, or divisions overlap.
    """
    # Infer schema if "infer"
    # (also start with inferred schema if user passes a dict)
    if schema == "infer" or isinstance(schema, dict):

        # Start with schema from _meta_nonempty
        _schema = pa.Schema.from_pandas(
            df._meta_nonempty.set_index(index_cols)
            if index_cols else df._meta_nonempty)

        # Use dict to update our inferred schema
        if isinstance(schema, dict):
            schema = pa.schema(schema)
            for name in schema.names:
                dst = _schema.get_field_index(name)
                src = schema.get_field_index(name)
                _schema = _schema.set(dst, schema.field(src))

        # If we have object columns, we need to sample partitions
        # until we find non-null data for each column in `sample`
        sample = [col for col in df.columns if df[col].dtype == "object"]
        if schema_field_supported and sample and schema == "infer":
            delayed_schema_from_pandas = delayed(pa.Schema.from_pandas)
            for partition in range(df.npartitions):
                # Keep data on worker
                _s = delayed_schema_from_pandas(
                    df[sample].to_delayed()[partition]).compute()
                for name, typ in zip(_s.names, _s.types):
                    if typ != "null":
                        dst = _schema.get_field_index(name)
                        src = _s.get_field_index(name)
                        _schema = _schema.set(dst, _s.field(src))
                        sample.remove(name)
                if not sample:
                    break

        # Final (inferred) schema
        schema = _schema

    dataset = fmd = None
    i_offset = 0
    if append and division_info is None:
        ignore_divisions = True
    fs.mkdirs(path, exist_ok=True)

    if append:
        try:
            # Allow append if the dataset exists.
            # Also need dataset.metadata object if
            # ignore_divisions is False (to check divisions)
            dataset = pq.ParquetDataset(path, filesystem=fs)
            if not dataset.metadata and not ignore_divisions:
                # TODO: Be more flexible about existing metadata.
                raise NotImplementedError(
                    "_metadata file needed to `append` "
                    "with `engine='pyarrow'` "
                    "unless `ignore_divisions` is `True`")
            fmd = dataset.metadata
        except (IOError, ValueError, IndexError):
            # Original dataset does not exist - cannot append
            append = False

    if append:
        # NOTE(review): if the dataset has no `_metadata` and divisions are
        # ignored, `dataset.metadata` is falsy here and the next line will
        # fail; left as-is because the intended fallback is not visible.
        names = dataset.metadata.schema.names
        arrow_schema = dataset.schema.to_arrow_schema()
        has_pandas_metadata = (
            arrow_schema.metadata is not None
            and b"pandas" in arrow_schema.metadata)
        if has_pandas_metadata:
            pandas_metadata = json.loads(
                arrow_schema.metadata[b"pandas"].decode("utf8"))
            categories = [
                c["name"] for c in pandas_metadata["columns"]
                if c["pandas_type"] == "categorical"
            ]
        else:
            categories = None
        dtypes = _get_pyarrow_dtypes(arrow_schema, categories)

        # `partition_on` defaults to None, which set() cannot consume.
        if set(names) != set(df.columns) - set(partition_on or ()):
            raise ValueError("Appended columns not the same.\n"
                             "Previous: {} | New: {}".format(
                                 names, list(df.columns)))
        elif (pd.Series(dtypes).loc[names] != df[names].dtypes).any():
            # TODO Coerce values for compatible but different dtypes
            # Series.iteritems() was removed in pandas 2.0; items() is the
            # equivalent spelling available in all supported versions.
            raise ValueError("Appended dtypes differ.\n{}".format(
                set(dtypes.items()) ^ set(df.dtypes.items())))

        i_offset = len(dataset.pieces)

        # division_info may legitimately be None (ignore_divisions was
        # already forced to True above in that case), so guard the lookup.
        if division_info is not None and division_info["name"] not in names:
            ignore_divisions = True

        if not ignore_divisions:
            old_end = None
            row_groups = [
                dataset.metadata.row_group(i)
                for i in range(dataset.metadata.num_row_groups)
            ]
            for row_group in row_groups:
                for i, name in enumerate(names):
                    if name != division_info["name"]:
                        continue
                    column = row_group.column(i)
                    if column.statistics:
                        # Compare against None explicitly: a maximum of 0 is
                        # a valid previous end and must not be overwritten.
                        if old_end is None:
                            old_end = column.statistics.max
                        else:
                            old_end = max(old_end, column.statistics.max)
                        break

            divisions = division_info["divisions"]
            # Without any statistics there is nothing to compare against.
            if old_end is not None and divisions[0] < old_end:
                raise ValueError(
                    "Appended divisions overlapping with the previous ones"
                    " (set ignore_divisions=True to append anyway).\n"
                    "Previous: {} | New: {}".format(old_end, divisions[0]))

    return fmd, schema, i_offset
def _determine_schemas_to_compare(schemas, ignore_pandas):
    """
    Iterate over a list of `pyarrow.Schema` objects and prepare them for
    comparison by picking a reference and determining all null columns.

    .. note::

        If pandas metadata exists, the version stored in the metadata is
        overwritten with the currently installed version since we expect to
        stay backwards compatible.

    Parameters
    ----------
    schemas: Iterable
        Schemas to prepare. Items that are not already a ``SchemaWrapper``
        are wrapped with ``origin=None``.
    ignore_pandas: bool
        If true, pandas metadata is neither required nor normalized.

    Returns
    -------
    reference: Schema
        A reference schema which is picked from the input list. It is the
        schema whose set of null columns is a subset of the null columns of
        every schema in ``list_of_schemas``. When two schemas have null
        column sets where neither contains the other, the one with fewer
        null columns is taken and its remaining null fields are filled in
        from the other schema. ``None`` if ``schemas`` is empty.
    list_of_schemas: List[Tuple[Schema, List]]
        A list holding pairs of (Schema, null_columns) where the null_columns
        are all columns which are null and must be removed before comparing
        the schemas.

    Raises
    ------
    ValueError
        If pandas metadata is expected (some schema carries it and
        ``ignore_pandas`` is false) but a schema has none.
    """
    has_pandas = _pandas_in_schemas(schemas) and not ignore_pandas
    schemas_to_evaluate = []
    reference = None
    null_cols_in_reference = set()

    for schema in schemas:
        if not isinstance(schema, SchemaWrapper):
            schema = SchemaWrapper(schema, None)

        if has_pandas:
            metadata = schema.metadata
            if metadata is None or b"pandas" not in metadata:
                raise ValueError(
                    "Pandas and non-Pandas schemas are not comparable. "
                    "Use ignore_pandas=True if you only want to compare "
                    "on Arrow level.")
            pandas_metadata = load_json(metadata[b"pandas"].decode("utf8"))

            # we don't care about the pandas version, since we assume it's safe
            # to read datasets that were written by older or newer versions.
            pandas_metadata["pandas_version"] = "{}".format(pd.__version__)

            # Work on a copy so the caller's metadata mapping stays untouched.
            metadata_clean = deepcopy(metadata)
            metadata_clean[b"pandas"] = _dict_to_binary(pandas_metadata)
            current = SchemaWrapper(pa.schema(schema, metadata_clean), schema.origin)
        else:
            current = schema

        # If a field is null we cannot compare it and must therefore reject it
        null_columns = {
            field.name for field in current if field.type == pa.null()
        }

        # Determine a valid reference schema. A valid reference schema is
        # considered to be the schema of all input schemas with the least
        # empty columns.
        # The reference schema ought to be a schema whose empty columns are a
        # subset of every other schema's empty columns. This ensures that the
        # actual reference schema is the schema with the most information
        # possible. A schema which doesn't fulfil this requirement would
        # weaken the comparison and would allow for false positives.

        # Trivial case: the first schema seen becomes the initial reference.
        if reference is None:
            reference = current
            null_cols_in_reference = null_columns
        # The reference has enough information to validate against current schema.
        # Append it to the list of schemas to be verified
        elif null_cols_in_reference.issubset(null_columns):
            schemas_to_evaluate.append((current, null_columns))
        # current schema includes all information of reference and more.
        # Add reference to schemas_to_evaluate and update reference
        elif null_columns.issubset(null_cols_in_reference):
            schemas_to_evaluate.append((reference, null_cols_in_reference))
            reference = current
            null_cols_in_reference = null_columns
        # If there is no clear subset available elect the schema with the
        # least null columns as `reference`. Iterate over the null columns of
        # `reference` and replace each with the non-null field of the
        # `current` schema, which recovers the loop invariant (null columns of
        # `reference` is a subset of those of `current`).
        else:
            if len(null_columns) < len(null_cols_in_reference):
                reference, current = current, reference
                null_cols_in_reference, null_columns = (
                    null_columns,
                    null_cols_in_reference,
                )

            # The set difference is a new set, so removing from
            # `null_cols_in_reference` inside the loop is safe.
            for col in null_cols_in_reference - null_columns:
                # Enrich the information in the reference by grabbing the
                # missing fields from the current iteration. This assumes that
                # we only check for global validity and that it isn't relevant
                # where the reference comes from.
                reference = _swap_fields_by_name(reference, current, col)
                null_cols_in_reference.remove(col)
            schemas_to_evaluate.append((current, null_columns))

    # Fallback: promote a collected schema if no reference was chosen.
    if reference is None and schemas_to_evaluate:
        reference = schemas_to_evaluate.pop()[0]

    return reference, schemas_to_evaluate
def map(self, function, with_indices: bool = False, batched: bool = False, batch_size: Optional[int] = 1000,
        remove_columns: Optional[List[str]] = None, keep_in_memory: bool = False,
        load_from_cache_file: bool = True, cache_file_name: Optional[str] = None,
        writer_batch_size: Optional[int] = 1000, arrow_schema: Optional[pa.Schema] = None,
        disable_nullable: bool = True):
    """ Apply a function to all the elements in the table (individually or in batches)
        and update the table (if the function returns updated examples).

        Note that, before the main loop runs, `function` is called on the first example
        (or on the first two examples when `batched=True`): once to check whether it returns a
        `Mapping`, and once more to build the sample output the schema is inferred from.
        Functions with side effects will observe these extra calls.

        Args:
            `function` (`callable`): with one of the following signature:
                - `function(example: Dict) -> Union[Dict, Any]` if `batched=False` and `with_indices=False`
                - `function(example: Dict, indices: int) -> Union[Dict, Any]` if `batched=False` and `with_indices=True`
                - `function(batch: Dict[List]) -> Union[Dict, Any]` if `batched=True` and `with_indices=False`
                - `function(batch: Dict[List], indices: List[int]) -> Union[Dict, Any]` if `batched=True` and `with_indices=True`
            `with_indices` (`bool`, default: `False`): Provide example indices to `function`
            `batched` (`bool`, default: `False`): Provide batch of examples to `function`
            `batch_size` (`Optional[int]`, default: `1000`): Number of examples per batch provided to `function` if `batched=True`
                `batch_size <= 0` or `batch_size == None`: Provide the full dataset as a single batch to `function`
            `remove_columns` (`Optional[List[str]]`, default: `None`): Remove a selection of columns while doing the mapping.
                Columns will be removed before updating the examples with the output of `function`, i.e. if `function` is adding
                columns with names in `remove_columns`, these columns will be kept.
            `keep_in_memory` (`bool`, default: `False`): Keep the dataset in memory instead of writing it to a cache file.
            `load_from_cache_file` (`bool`, default: `True`): If a cache file storing the current computation from `function`
                can be identified, use it instead of recomputing.
            `cache_file_name` (`Optional[str]`, default: `None`): Provide the name of a cache file to use to store the
                results of the computation instead of the automatically generated cache file name.
            `writer_batch_size` (`int`, default: `1000`): Number of rows per write operation for the cache file writer.
                Higher value gives smaller cache files, lower value consume less temporary memory while running `.map()`.
            `arrow_schema` (`Optional[pa.Schema]`, default: `None`): Use a specific Apache Arrow Schema to store the cache file
                instead of the automatically generated one.
            `disable_nullable` (`bool`, default: `True`): Disallow null values in the table. When the schema is
                inferred (i.e. `arrow_schema` is `None`), every field is declared with `nullable=False`.
                Has no effect on an explicitly provided `arrow_schema`.

        Returns:
            `self` when the dataset is empty or when `function` does not return a `Mapping`
            (in that case `function` is still applied to every example/batch, but nothing is written);
            otherwise a new `Dataset` loaded from the cache file or from the in-memory buffer.

        Raises:
            `ValueError`: if `remove_columns` names a column that is not in the dataset.
    """
    # If the array is empty we do nothing
    if len(self) == 0:
        return self

    # Select the columns (arrow columns) to process
    if remove_columns is not None and any(col not in self._data.column_names for col in remove_columns):
        raise ValueError("Column to remove {} not in the dataset. Current columns in the dataset: {}".format(
            list(filter(lambda col: col not in self._data.column_names, remove_columns)),
            self._data.column_names))

    # If we do batch computation but no batch size is provided, default to the full dataset
    if batched and (batch_size is None or batch_size <= 0):
        batch_size = self._data.num_rows

    # Check if the function returns updated examples
    def does_function_return_dict(inputs, indices):
        """ Call `function` on `inputs` and report whether its result is a `Mapping`. """
        processed_inputs = function(inputs, indices) if with_indices else function(inputs)
        return isinstance(processed_inputs, Mapping)

    # We only update the data table (and use the cache) if the function returns a dict.
    # Test it on the first element or a small batch (0, 1) for batched inputs
    test_inputs = self[:2] if batched else self[0]
    test_indices = [0, 1] if batched else 0
    update_data = does_function_return_dict(test_inputs, test_indices)

    def apply_function_on_filtered_inputs(inputs, indices):
        """ Call `function`, drop `remove_columns` from `inputs` and merge the output into `inputs`.

            Returns `None` when the table is not being updated. `inputs` is modified in place.
        """
        processed_inputs = function(inputs, indices) if with_indices else function(inputs)
        if not update_data:
            return None  # Nothing to update, let's move on
        if remove_columns is not None:
            for column in remove_columns:
                inputs.pop(column)
        # Columns are removed first, so a column re-added by `function` survives
        inputs.update(processed_inputs)
        return inputs

    # Find the output schema if none is given
    # (fresh test inputs are fetched because the probe above already handed the previous ones to `function`)
    test_inputs = self[:2] if batched else self[0]
    test_indices = [0, 1] if batched else 0
    test_output = apply_function_on_filtered_inputs(test_inputs, test_indices)
    if arrow_schema is None and update_data:
        if not batched:
            test_output = self.nest(test_output)
        test_output = convert_tuples_in_lists(test_output)
        arrow_schema = pa.Table.from_pydict(test_output).schema
        if disable_nullable:
            # Rebuild the inferred schema with every field marked as non-nullable
            arrow_schema = pa.schema(pa.field(field.name, field.type, nullable=False) for field in arrow_schema)

    # Check if we've already cached this computation (indexed by a hash)
    if self._data_files and update_data:
        if cache_file_name is None:
            # we create a unique hash from the function, current dataset file and the mapping args
            cache_kwargs = {"with_indices": with_indices,
                            "batched": batched,
                            "batch_size": batch_size,
                            "remove_columns": remove_columns,
                            "keep_in_memory": keep_in_memory,
                            "load_from_cache_file": load_from_cache_file,
                            "cache_file_name": cache_file_name,
                            "writer_batch_size": writer_batch_size,
                            "arrow_schema": arrow_schema,
                            "disable_nullable": disable_nullable,}
            cache_file_name = self._get_cache_file_path(function, cache_kwargs)
        if os.path.exists(cache_file_name) and load_from_cache_file:
            logger.info("Loading cached processed dataset at %s", cache_file_name)
            return Dataset.from_file(cache_file_name)

    # Prepare output buffer and batched writer in memory or on file if we update the table
    if update_data:
        if keep_in_memory or not self._data_files:
            buf_writer = pa.BufferOutputStream()
            writer = ArrowWriter(schema=arrow_schema, stream=buf_writer, writer_batch_size=writer_batch_size)
        else:
            # `buf_writer is None` is used below as the marker for "written to a file"
            buf_writer = None
            logger.info("Caching processed dataset at %s", cache_file_name)
            writer = ArrowWriter(schema=arrow_schema, path=cache_file_name, writer_batch_size=writer_batch_size)

    # Loop over single examples or batches and write to buffer/file if examples are to be updated
    if not batched:
        for i, example in tqdm(enumerate(self)):
            example = apply_function_on_filtered_inputs(example, i)
            if update_data:
                writer.write(example)
    else:
        for i in tqdm(range(0, len(self), batch_size)):
            batch = self[i:i+batch_size]
            # slice.indices() clips the last batch to the number of rows in the table
            indices = list(range(*(slice(i, i+batch_size).indices(self._data.num_rows))))  # Something simpler?
            batch = apply_function_on_filtered_inputs(batch, indices)
            if update_data:
                writer.write_batch(batch)

    if update_data:
        writer.finalize()  # close_stream=bool(buf_writer is None))  # We only close if we are writing in a file

        # Create new Dataset from buffer or file
        if buf_writer is None:
            return Dataset.from_file(cache_file_name)
        else:
            return Dataset.from_buffer(buf_writer.getvalue())
    else:
        return self
def test_recordbatch_from_arrays_validate_schema():
    """ARROW-6263: an int array paired with a list<utf8> schema must be rejected."""
    # ARROW-6263
    int_values = pa.array([1, 2])
    list_of_strings = pa.list_(pa.utf8())
    mismatched_schema = pa.schema([pa.field('f0', list_of_strings)])

    with pytest.raises(NotImplementedError):
        pa.record_batch([int_values], schema=mismatched_schema)
AVRO_SCHEMA = { 'namespace': 'avro.wordcount', 'type': 'record', 'name': 'WordCount', 'fields': [{ 'name': 'word', 'type': 'string' }, { 'name': 'count', 'type': 'int' }] } PARQUET_SCHEMA = pyarrow.schema([('word', pyarrow.string()), ('count', pyarrow.int64())]) DEFAULT_CODEC = 'snappy' class WordExtractingDoFn(beam.DoFn): """Parse each line of input text into words.""" def __init__(self): self.words_counter = Metrics.counter(self.__class__, 'words') self.word_lengths_counter = Metrics.counter(self.__class__, 'word_lengths') self.word_lengths_dist = Metrics.distribution(self.__class__, 'word_len_dist') self.empty_line_counter = Metrics.counter(self.__class__, 'empty_lines') def process(self, element):
def test_filesystem_factory(mockfs, paths_or_selector):
    """Exercise ``FileSystemDatasetFactory``: options, inspection, finishing and scanning."""
    parquet_format = ds.ParquetFileFormat(
        read_options=ds.ParquetReadOptions(dictionary_columns={"str"})
    )

    factory_options = ds.FileSystemFactoryOptions('subdir')
    partition_schema = pa.schema([
        pa.field('group', pa.int32()),
        pa.field('key', pa.string()),
    ])
    factory_options.partitioning = ds.DirectoryPartitioning(partition_schema)

    # Values read back from the options object
    assert factory_options.partition_base_dir == 'subdir'
    assert factory_options.selector_ignore_prefixes == ['.', '_']
    assert factory_options.exclude_invalid_files is False

    factory = ds.FileSystemDatasetFactory(
        mockfs, paths_or_selector, parquet_format, factory_options
    )
    inspected_schema = factory.inspect()

    # File columns followed by the two partition columns
    full_schema = pa.schema([
        pa.field('i64', pa.int64()),
        pa.field('f64', pa.float64()),
        pa.field('str', pa.dictionary(pa.int32(), pa.string())),
        pa.field('const', pa.int64()),
        pa.field('group', pa.int32()),
        pa.field('key', pa.string()),
    ])
    assert factory.inspect().equals(full_schema, check_metadata=False)

    assert isinstance(factory.inspect_schemas(), list)
    finished_with_schema = factory.finish(inspected_schema)
    assert isinstance(finished_with_schema, ds.FileSystemDataset)
    assert factory.root_partition.equals(ds.scalar(True))

    dataset = factory.finish()
    assert isinstance(dataset, ds.FileSystemDataset)
    scan_tasks = list(dataset.scan())
    assert len(scan_tasks) == 2

    scanner = ds.Scanner.from_dataset(dataset)
    expected_i64 = pa.array([0, 1, 2, 3, 4], type=pa.int64())
    expected_f64 = pa.array([0, 1, 2, 3, 4], type=pa.float64())
    expected_str = pa.DictionaryArray.from_arrays(
        pa.array([0, 1, 2, 3, 4], type=pa.int32()),
        pa.array("0 1 2 3 4".split(), type=pa.string())
    )

    groups = [1, 2]
    keys = ['xxx', 'yyy']
    for task, group, key in zip(scanner.scan(), groups, keys):
        # Columns in the order the batch is expected to expose them
        expected_columns = [
            expected_i64,
            expected_f64,
            expected_str,
            pa.array([group - 1] * 5, type=pa.int64()),   # const
            pa.array([group] * 5, type=pa.int32()),       # group
            pa.array([key] * 5, type=pa.string()),        # key
        ]
        for batch in task.execute():
            assert batch.num_columns == 6
            for position, expected in enumerate(expected_columns):
                assert batch[position].equals(expected)

    table = dataset.to_table()
    assert isinstance(table, pa.Table)
    assert len(table) == 10
    assert table.num_columns == 6
def dataframe_to_arrays(df, schema, preserve_index, nthreads=1, columns=None,
                        safe=True):
    """Convert the columns of ``df`` to Arrow arrays and build the matching schema.

    Parameters
    ----------
    df : the frame to convert.
    schema : optional target schema; when ``None`` one is built from the
        converted arrays, otherwise the index columns are appended to it.
    preserve_index, columns : forwarded to ``_get_columns_to_convert``.
    nthreads : number of conversion threads; ``None`` picks ``pa.cpu_count()``
        when the frame has more than 100 rows per column, else 1.
    safe : forwarded to ``pa.array``.

    Returns
    -------
    (arrays, schema) : the converted arrays and the schema with the metadata
        from ``construct_metadata`` attached.

    Raises
    ------
    ValueError
        If a field is declared non-nullable but its converted array has nulls.
    """
    selection = _get_columns_to_convert(df, schema, preserve_index, columns)
    (all_names,
     column_names,
     index_column_names,
     index_descriptors,
     index_columns,
     columns_to_convert,
     convert_fields) = selection

    # NOTE(wesm): If nthreads=None, then we use a heuristic to decide whether
    # using a thread pool is worth it. Currently the heuristic is whether the
    # nrows > 100 * ncols.
    if nthreads is None:
        nrows, ncols = len(df), len(df.columns)
        nthreads = pa.cpu_count() if nrows > ncols * 100 else 1

    def convert_column(col, field):
        # Without a target field the type is inferred and nulls are allowed
        if field is not None:
            field_nullable = field.nullable
            type_ = field.type
        else:
            field_nullable = True
            type_ = None

        try:
            result = pa.array(col, type=type_, from_pandas=True, safe=safe)
        except (pa.ArrowInvalid,
                pa.ArrowNotImplementedError,
                pa.ArrowTypeError) as e:
            # Attach the offending column to the error before re-raising it
            context = "Conversion failed for column {0!s} with type {1!s}".format(
                col.name, col.dtype)
            e.args += (context,)
            raise e

        if result.null_count > 0 and not field_nullable:
            raise ValueError("Field {} was non-nullable but pandas column "
                             "had {} null values".format(str(field),
                                                         result.null_count))
        return result

    if nthreads == 1:
        arrays = []
        for col, field in zip(columns_to_convert, convert_fields):
            arrays.append(convert_column(col, field))
    else:
        from concurrent import futures
        with futures.ThreadPoolExecutor(nthreads) as executor:
            converted = executor.map(convert_column, columns_to_convert,
                                     convert_fields)
            arrays = list(converted)

    types = [array.type for array in arrays]

    if schema is None:
        # No schema supplied: derive every field from the converted arrays
        schema = pa.schema([
            pa.field(name if name is not None else 'None', type_)
            for name, type_ in zip(all_names, types)
        ])
    else:
        # add index columns
        index_types = types[len(column_names):]
        for name, type_ in zip(index_column_names, index_types):
            field_name = name if name is not None else 'None'
            schema = schema.append(pa.field(field_name, type_))

    metadata = construct_metadata(df, column_names, index_columns,
                                  index_descriptors, preserve_index,
                                  types)
    return arrays, schema.add_metadata(metadata)
def test_boolean_object_nulls(self):
    """Round-trip an object column mixing booleans and ``None`` as an Arrow bool column."""
    values = np.array([False, None, True] * 100, dtype=object)
    frame = pd.DataFrame({'bools': values})
    expected = pa.schema([pa.field('bools', pa.bool_())])
    self._check_pandas_roundtrip(frame, expected_schema=expected)
def make_meta(obj, origin, partition_keys=None):
    """
    Create metadata object for DataFrame.

    .. note::
        This function can, for convenience reasons, also be applied to schema objects in which case they are just
        returned.

    .. warning::
        Information for categoricals will be stripped!

    :meth:`normalize_type` will be applied to normalize type information and :meth:`normalize_column_order` will be
    applied to reorder column information.

    Parameters
    ----------
    obj: Union[DataFrame, Schema]
        Object to extract metadata from.
    origin: str
        Origin of the schema data, used for debugging and error reporting.
    partition_keys: Union[None, List[str]]
        Partition keys used to split the dataset.

    Returns
    -------
    schema: SchemaWrapper
        Schema information for DataFrame.

    Raises
    ------
    ValueError
        If ``obj`` is neither a schema nor a pandas DataFrame.
    """
    if isinstance(obj, SchemaWrapper):
        return obj
    if isinstance(obj, pa.Schema):
        return SchemaWrapper(obj, origin)

    if not isinstance(obj, pd.DataFrame):
        raise ValueError(
            "Input must be a pyarrow schema, or a pandas dataframe")

    if ARROW_LARGER_EQ_0130:
        schema = pa.Schema.from_pandas(obj)
    else:
        # Older Arrow: go through a full table conversion and drop the table again
        table = pa.Table.from_pandas(obj)
        schema = table.schema
        del table
    pandas_metadata = _pandas_meta_from_schema(schema)

    # normalize types
    fields = {field.name: field.type for field in schema}
    for cmd in pandas_metadata["columns"]:
        name = cmd.get("name")
        if name is None:
            # Unnamed entries are left untouched
            continue
        field_name = cmd["field_name"]
        field_idx = schema.get_field_index(field_name)
        field = schema[field_idx]
        if pa.types.is_dictionary(field.type):
            # TODO: remove this with Arrow 0.10 when we can access the dictionary
            #       information.
            # Derive the value type from a throw-away frame that only holds the categories.
            tmp_df = pd.DataFrame({field.name: obj[field.name].cat.categories})
            tmp_schema = pa.Table.from_pandas(tmp_df).schema
            field = tmp_schema[0]
            # The pandas metadata must describe the same temporary column as `field`,
            # so read it from `tmp_schema` rather than from the schema of the full frame
            # (whose first column is unrelated to this categorical).
            tmp_metadata = _pandas_meta_from_schema(tmp_schema)
            # NOTE(review): `cmd` now points into `tmp_metadata`, so the normalized pandas/numpy
            # types below are not written back to `pandas_metadata` for categoricals - confirm
            # this is intended.
            cmd = tmp_metadata["columns"][0]
        (
            fields[field_name],
            cmd["pandas_type"],
            cmd["numpy_type"],
            cmd["metadata"],
        ) = normalize_type(field.type, cmd["pandas_type"],
                           cmd["numpy_type"], cmd["metadata"])
    metadata = schema.metadata
    metadata[b"pandas"] = _dict_to_binary(pandas_metadata)
    schema = pa.schema([pa.field(n, t) for n, t in fields.items()], metadata)
    return normalize_column_order(SchemaWrapper(schema, origin), partition_keys)
def test_filesystem_dataset(mockfs):
    """Check ``FileSystemDataset`` construction, argument validation and fragment listing."""
    schema = pa.schema([pa.field('const', pa.int64())])
    file_format = ds.ParquetFileFormat()
    paths = ['subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet']

    trivial_partitions = [ds.scalar(True), ds.scalar(True)]
    dataset = ds.FileSystemDataset(
        schema=schema,
        format=file_format,
        filesystem=mockfs,
        paths_or_selector=paths,
        partitions=trivial_partitions
    )
    assert isinstance(dataset.format, ds.ParquetFileFormat)

    # the root_partition and partitions keywords have defaults
    dataset = ds.FileSystemDataset(
        paths, schema, format=file_format, filesystem=mockfs,
    )
    assert isinstance(dataset.format, ds.ParquetFileFormat)

    rejected_keywords = [
        # validation of required arguments
        dict(format=file_format, filesystem=mockfs),
        dict(schema=schema, filesystem=mockfs),
        dict(schema=schema, format=file_format),
        # validation of root_partition
        dict(schema=schema, format=file_format, filesystem=mockfs,
             root_partition=1),
    ]
    for keywords in rejected_keywords:
        with pytest.raises(TypeError, match="incorrect type"):
            ds.FileSystemDataset(paths, **keywords)

    root_partition = ds.field('level') == ds.scalar(1337)
    partitions = [ds.field('part') == number for number in range(1, 3)]
    dataset = ds.FileSystemDataset(
        paths_or_selector=paths,
        schema=schema,
        root_partition=root_partition,
        filesystem=mockfs,
        partitions=partitions,
        format=file_format
    )
    assert dataset.partition_expression.equals(root_partition)
    assert set(dataset.files) == set(paths)

    all_fragments = list(dataset.get_fragments())
    for fragment, partition, path in zip(all_fragments, partitions, paths):
        assert fragment.partition_expression.equals(partition)
        assert fragment.path == path
        assert isinstance(fragment.format, ds.ParquetFileFormat)
        assert isinstance(fragment, ds.ParquetFileFragment)
        assert fragment.row_groups is None

        # Each file is expected to split into exactly one row-group fragment
        per_row_group = list(fragment.get_row_group_fragments())
        assert len(per_row_group) == 1
        assert isinstance(fragment, ds.ParquetFileFragment)
        only_row_group = per_row_group[0]
        assert only_row_group.path == path
        assert only_row_group.row_groups == {0}

    filtered = list(dataset.get_fragments(filter=ds.field("const") == 0))
    assert len(filtered) == 2
def normalize_column_order(schema, partition_keys=None):
    """
    Bring the columns of a schema into a canonical order.

    The resulting order is:

    1. Partition keys, in the order given by ``partition_keys``
    2. Named (payload) columns, sorted alphabetically by name
    3. Entries without a name, in their original order (as generated by pyarrow, mostly index columns)

    Parameters
    ----------
    schema: SchemaWrapper
        Schema information for DataFrame. Anything else is wrapped in a ``SchemaWrapper`` without origin.
    partition_keys: Union[None, List[str]]
        Partition keys used to split the dataset.

    Returns
    -------
    schema: SchemaWrapper
        Schema information for DataFrame.
    """
    if not isinstance(schema, SchemaWrapper):
        schema = SchemaWrapper(schema, None)

    partition_keys = [] if partition_keys is None else list(partition_keys)

    pandas_metadata = _pandas_meta_from_schema(schema)
    origin = schema.origin

    by_partition_key = {}
    payload = []
    unnamed = []
    for column_md in pandas_metadata["columns"]:
        column_name = column_md.get("name")
        position = schema.get_field_index(column_md["field_name"])
        # A negative index means the metadata entry has no field in the schema
        arrow_field = schema[position] if position >= 0 else None

        if column_name is None:
            unnamed.append((column_md, arrow_field))
        elif column_name in partition_keys:
            by_partition_key[column_name] = (column_md, arrow_field)
        else:
            payload.append((column_name, column_md, arrow_field))

    ordered = [by_partition_key[key] for key in partition_keys
               if key in by_partition_key]
    for _name, column_md, arrow_field in sorted(payload, key=lambda entry: entry[0]):
        ordered.append((column_md, arrow_field))
    ordered.extend(unnamed)

    pandas_metadata["columns"] = [column_md for column_md, _ in ordered]
    # Entries without a backing field only survive in the pandas metadata
    fields = [arrow_field for _, arrow_field in ordered if arrow_field is not None]

    metadata = schema.metadata
    metadata[b"pandas"] = _dict_to_binary(pandas_metadata)
    return SchemaWrapper(pa.schema(fields, metadata), origin)
def make_schema():
    """Return a schema with a single int64 column named ``field``."""
    columns = [('field', pa.int64())]
    return pa.schema(columns)
def _to_arrow_schema(row_type):
    """Build an Arrow schema with one field per (name, type) pair of ``row_type``.

    Each field takes its Arrow type from ``to_arrow_type`` and its nullability
    from the ``_nullable`` attribute of the source type.
    """
    arrow_fields = []
    for field_name, field_type in zip(row_type.field_names(),
                                      row_type.field_types()):
        arrow_type = to_arrow_type(field_type)
        arrow_fields.append(pa.field(field_name, arrow_type, field_type._nullable))
    return pa.schema(arrow_fields)
def test_flight_get_schema():
    """Make sure GetSchema returns correct schema."""
    with GetInfoFlightServer() as server:
        address = ('localhost', server.port)
        client = FlightClient(address)
        descriptor = flight.FlightDescriptor.for_command(b'')
        result = client.get_schema(descriptor)
        expected = pa.schema([('a', pa.int32())])
        assert result.schema == expected
def _parse_data_to_pandas(filepath: str, table_params: dict, metadata: dict):
    """
    Reads in the data from the given filepath and returns
    a dataframe

    Args:
        filepath: path of the data file; a path starting with ``s3://`` is read
            through ``fs.S3FileSystem``, anything else through ``fs.LocalFileSystem``.
        table_params: per-table options. Keys read here: ``expect-header``
            (default True), ``row-limit``, ``headers-ignore-case`` and
            ``only-test-cols-in-metadata`` (default False).
        metadata: table metadata; ``columns`` and ``file_format`` are required,
            ``partitions`` is optional.

    Returns:
        The loaded dataframe, optionally sampled down to ``row-limit`` rows,
        with lower-cased headers and/or restricted to the metadata columns.

    Raises:
        ValueError: if ``file_format`` is none of csv, json or parquet.
    """
    partition_names = metadata.get("partitions", [])
    meta_col_names = [
        c["name"] for c in metadata["columns"] if c["name"] not in partition_names
    ]

    # For string based file types make the arrow readers read date/time columns in as strings.
    # Validators will still treat these as dates but will run validation against strings
    # cols expecting values to match a timestamp format
    if "json" in metadata["file_format"] or "csv" in metadata["file_format"]:
        md_obj = Metadata.from_dict(metadata)
        cols = md_obj.columns

        cols_to_force_str_read_in = []
        for c in cols:
            if c["type"].startswith(("time", "date")):
                c["type"] = "string"
                c["type_category"] = "string"
                cols_to_force_str_read_in.append(c["name"])

        md_obj.columns = cols
        ac = ArrowConverter()
        arrow_schema = ac.generate_from_meta(md_obj)

        # Sub-schema holding only the columns that were forced to string
        ts_as_str_schema = pa.schema([])
        for cname in cols_to_force_str_read_in:
            ts_as_str_schema = ts_as_str_schema.append(arrow_schema.field(cname))

    # Set the reader type
    if filepath.startswith("s3://"):
        reader_fs = fs.S3FileSystem(region="eu-west-1")
        fp_for_file_reader = filepath.replace("s3://", "", 1)
    else:
        reader_fs = fs.LocalFileSystem()
        fp_for_file_reader = filepath

    with reader_fs.open_input_stream(fp_for_file_reader) as f:
        if "csv" in metadata["file_format"]:
            # Safer CSV load for newlines_in_values set to True
            # NOTE(review): pyarrow documents `column_names` on csv.ReadOptions, not on
            # csv.ParseOptions - confirm the header-less branch works as intended.
            if table_params.get("expect-header", True):
                po = csv.ParseOptions(newlines_in_values=True)
            else:
                po = csv.ParseOptions(newlines_in_values=True, column_names=meta_col_names)

            # An empty schema is falsy, i.e. no column needs an explicit string type
            if ts_as_str_schema:
                co = csv.ConvertOptions(column_types=ts_as_str_schema)
            else:
                co = None

            df = pa_read_csv_to_pandas(
                input_file=f,
                schema=arrow_schema,
                expect_full_schema=False,
                parse_options=po,
                convert_options=co,
            )
            # dates/datetimes == string

        elif "json" in metadata["file_format"]:
            po = json.ParseOptions(
                newlines_in_values=True,
                explicit_schema=ts_as_str_schema if ts_as_str_schema else None,
            )

            df = pa_read_json_to_pandas(
                input_file=f,
                schema=arrow_schema,
                expect_full_schema=False,
                parse_options=po,
            )
            # dates/datetimes == string

        elif "parquet" in metadata["file_format"]:
            df = arrow_to_pandas(pq.read_table(f))
            # dates/datetimes == datetime / date

        else:
            raise ValueError(
                f"Unknown file_format in metadata: {metadata['file_format']}."
            )

    row_limit = table_params.get("row-limit")
    if row_limit:
        # DataFrame.sample refuses to draw more rows than exist (without replacement),
        # so a limit larger than the data simply keeps every row.
        df = df.sample(min(row_limit, len(df)))

    if table_params.get("headers-ignore-case"):
        df_cols = [c.lower() for c in df.columns]
        df.columns = df_cols

    if table_params.get("only-test-cols-in-metadata", False):
        keep_cols = [c for c in df.columns if c in meta_col_names]
        df = df[keep_cols]

    return df
def do_get(self, context, ticket):
    """Return a generator stream of ``self.slow_stream()`` with schema ``a: int32``."""
    stream_schema = pa.schema([('a', pa.int32())])
    payload = self.slow_stream()
    return flight.GeneratorStream(stream_schema, payload)
import pyarrow as pa PQ_SCHEMAS = dict() # site_visits fields = [ pa.field('visit_id', pa.int64(), nullable=False), pa.field('crawl_id', pa.uint32(), nullable=False), pa.field('instance_id', pa.uint32(), nullable=False), pa.field('site_url', pa.string(), nullable=False), pa.field('site_rank', pa.uint32()) ] PQ_SCHEMAS['site_visits'] = pa.schema(fields) # crawl_history fields = [ pa.field('crawl_id', pa.uint32(), nullable=False), pa.field('visit_id', pa.int64(), nullable=False), pa.field('instance_id', pa.uint32(), nullable=False), pa.field('command', pa.string()), pa.field('arguments', pa.string()), pa.field('retry_number', pa.int8()), pa.field('command_status', pa.string()), pa.field('error', pa.string()), pa.field('traceback', pa.string()) ] PQ_SCHEMAS['crawl_history'] = pa.schema(fields) # http_requests fields = [ pa.field('incognito', pa.int32()),
def test_to_column_info():
    """Check the column description ``to_column_info`` yields for each Arrow type."""

    def described(name, type_name, precision=0, scale=0):
        # Every expected entry is nullable; only name, type, precision and scale vary
        return {
            "Name": name,
            "Nullable": "NULLABLE",
            "Precision": precision,
            "Scale": scale,
            "Type": type_name,
        }

    struct_type = pa.struct([pa.field("a", pa.int32()),
                             pa.field("b", pa.int32())])
    arrow_fields = [
        pa.field("col_boolean", pa.bool_()),
        pa.field("col_tinyint", pa.int32()),
        pa.field("col_smallint", pa.int32()),
        pa.field("col_int", pa.int32()),
        pa.field("col_bigint", pa.int64()),
        pa.field("col_float", pa.float32()),
        pa.field("col_double", pa.float64()),
        pa.field("col_string", pa.string()),
        pa.field("col_varchar", pa.string()),
        pa.field("col_timestamp", pa.timestamp("ns")),
        pa.field("col_date", pa.date32()),
        pa.field("col_binary", pa.binary()),
        pa.field("col_array", pa.list_(pa.field("array_element", pa.int32()))),
        pa.field("col_map", pa.map_(pa.int32(), pa.field("entries", pa.int32()))),
        pa.field("col_struct", struct_type),
        pa.field("col_decimal", pa.decimal128(10, 1)),
    ]
    schema = pa.schema(arrow_fields)

    expected = (
        described("col_boolean", "boolean"),
        described("col_tinyint", "integer", precision=10),
        described("col_smallint", "integer", precision=10),
        described("col_int", "integer", precision=10),
        described("col_bigint", "bigint", precision=19),
        described("col_float", "float", precision=17),
        described("col_double", "double", precision=17),
        described("col_string", "varchar", precision=2147483647),
        described("col_varchar", "varchar", precision=2147483647),
        described("col_timestamp", "timestamp", precision=3),
        described("col_date", "date"),
        described("col_binary", "varbinary", precision=1073741824),
        described("col_array", "array"),
        described("col_map", "map"),
        described("col_struct", "row"),
        described("col_decimal", "decimal", precision=10, scale=1),
    )
    assert to_column_info(schema) == expected
def convert_csv_to_parquet(csv_file: str, parquet_dir: str, partitioned: bool):
    """Read a ``;``-delimited CSV file with pyarrow and write it out as a parquet dataset.

    The CSV is read with a fixed list of seven column names. Explicit types are
    given for every column except ``start`` and ``stop``.

    Args:
        csv_file: path of the CSV file to read.
        parquet_dir: root directory of the parquet dataset to write.
        partitioned: when True, an existing ``parquet_dir`` is deleted first and
            the dataset is partitioned by ``start_year``.
    """
    print("Start ", datetime.now())
    print(csv_file)
    print(parquet_dir)
    print("Abs path of csv file: " + os.path.abspath(csv_file))

    # Remove old partitions
    if partitioned and Path(parquet_dir).is_dir():
        shutil.rmtree(parquet_dir)

    # ReadOptions: https://arrow.apache.org/docs/python/generated/pyarrow.csv.ReadOptions.html#pyarrow.csv.ReadOptions
    column_names = [
        "unit_id", "value", "start", "stop", "start_year",
        "start_epoch_days", "stop_epoch_days",
    ]
    read_options = pv.ReadOptions(skip_rows=0, encoding="utf8",
                                  column_names=column_names)

    # ParseOptions: https://arrow.apache.org/docs/python/generated/pyarrow.csv.ParseOptions.html#pyarrow.csv.ParseOptions
    parse_options = pv.ParseOptions(delimiter=';')

    # Types: https://arrow.apache.org/docs/python/api/datatypes.html
    # TODO nullable parameter does not work as expected!
    typed_columns = [
        # (name, arrow type, nullable)
        ('start_year', pa.string(), True),
        ('unit_id', pa.uint64(), False),
        ('value', pa.string(), False),
        ('start_epoch_days', pa.int16(), True),
        ('stop_epoch_days', pa.int16(), True),
    ]
    data_schema = pa.schema([
        pa.field(name=column, type=arrow_type, nullable=is_nullable)
        for column, arrow_type, is_nullable in typed_columns
    ])

    # ConvertOptions: https://arrow.apache.org/docs/python/generated/pyarrow.csv.ConvertOptions.html#pyarrow.csv.ConvertOptions
    convert_options = pv.ConvertOptions(column_types=data_schema)

    # read_csv: https://arrow.apache.org/docs/python/generated/pyarrow.csv.read_csv.html#pyarrow.csv.read_csv
    table = pv.read_csv(input_file=csv_file,
                        read_options=read_options,
                        parse_options=parse_options,
                        convert_options=convert_options)

    if not partitioned:
        pq.write_to_dataset(table, root_path=parquet_dir)
    else:
        # write with partitions
        pq.write_to_dataset(table, root_path=parquet_dir,
                            partition_cols=['start_year'])

    print("End ", datetime.now())
def test_flight_get_schema():
    """Make sure GetSchema returns correct schema."""
    with flight_server(GetInfoFlightServer) as server_location:
        client = flight.FlightClient.connect(server_location)
        descriptor = flight.FlightDescriptor.for_command(b'')
        result = client.get_schema(descriptor)
        expected = pa.schema([('a', pa.int32())])
        assert result.schema == expected
def test_union_dataset_from_other_datasets(tempdir, multisourcefs):
    """Combine child datasets into a ``UnionDataset`` and check the resulting schemas."""
    plain = ds.dataset('/plain', filesystem=multisourcefs, format='parquet')
    by_schema = ds.dataset('/schema', filesystem=multisourcefs,
                           format='parquet', partitioning=['week', 'color'])
    hive = ds.dataset('/hive', filesystem=multisourcefs, format='parquet',
                      partitioning='hive')

    plain_schema, by_schema_schema, hive_schema = (
        plain.schema, by_schema.schema, hive.schema)
    assert plain_schema != by_schema_schema and by_schema_schema != hive_schema

    assembled = ds.dataset([plain, by_schema, hive])
    assert isinstance(assembled, ds.UnionDataset)

    with pytest.raises(ValueError,
                       match='cannot pass any additional arguments'):
        ds.dataset([plain, by_schema], filesystem=multisourcefs)

    def assert_schema(dataset, expected):
        # Both the dataset and the table materialized from it must agree
        assert dataset.schema.equals(expected)
        assert dataset.to_table().schema.equals(expected)

    assert_schema(assembled, pa.schema([
        ('date', pa.date32()),
        ('index', pa.int64()),
        ('value', pa.float64()),
        ('color', pa.string()),
        ('week', pa.int32()),
        ('year', pa.int32()),
        ('month', pa.int32()),
    ]))

    assert_schema(ds.dataset([plain, hive]), pa.schema([
        ('date', pa.date32()),
        ('index', pa.int64()),
        ('value', pa.float64()),
        ('color', pa.string()),
        ('year', pa.int32()),
        ('month', pa.int32()),
    ]))

    # With an explicit schema only the materialized table is checked
    explicit_schemas = [
        pa.schema([
            ('month', pa.int32()),
            ('color', pa.string()),
            ('date', pa.date32()),
        ]),
        pa.schema([
            ('month', pa.int32()),
            ('color', pa.string()),
            ('unkown', pa.string())  # fill with nulls
        ]),
    ]
    for explicit in explicit_schemas:
        assembled = ds.dataset([plain, hive], schema=explicit)
        assert assembled.to_table().schema.equals(explicit)

    # incompatible schemas, date and index columns have conflicting types
    conflicting = pa.table(
        [range(9), [0.] * 4 + [1.] * 5, 'abcdefghj'],
        names=['date', 'value', 'index'])
    _, path = _create_single_file(tempdir, table=conflicting)
    incompatible = ds.dataset(path)
    with pytest.raises(pa.ArrowInvalid, match='Unable to merge'):
        ds.dataset([plain, incompatible])
def schema(self):
    """Return the Arrow schema built from ``self.features.type``, or ``None`` without features."""
    if self.features is None:
        return None
    return pa.schema(self.features.type)
import pyarrow as pa input_schema = pa.schema([ pa.field("input", pa.uint8(), False).with_metadata({b'fletcher_epc': b'8'}) ]).with_metadata({ b'fletcher_mode': b'read', b'fletcher_name': b'input' }) pa.output_stream("in.as").write(input_schema.serialize()) test_rec = """{ "timestamp": "2005-09-09T11:59:06-10:01", "accel_decel": 11446688, "timezone": 883, "vin": 8834555, "odometer": 99711112, "hypermiling": false, "avgspeed": 156, "sec_in_band": [3403, 893, 2225, 78, 162, 2332, 1473, 2587, 3446, 178, 997, 2403], "miles_in_time_range": [3376, 2553, 2146, 919, 2241, 1044, 1079, 3751, 1665, 2062, 46, 2868, 375, 3305, 4109, 3319, 627, 3523, 2225, 357, 1653, 2757, 3477, 3549], "const_speed_miles_in_band": [4175, 2541, 2841, 157, 2922, 651, 315, 2484, 2696, 165, 1366, 958], "vary_speed_miles_in_band": [2502, 155, 1516, 1208, 2229, 1850, 4032, 3225, 2704, 2064, 484, 3073], "sec_decel": [722, 2549, 547, 3468, 844, 3064, 2710, 1515, 763, 2972], "sec_accel": [4175, 2541, 2841, 157, 2922, 651, 315, 2484, 2696, 165, 1366, 958], "accel": [1780, 228, 1267, 2389, 437, 871], "orientation": false, "braking": [724, 2549, 547], "small_speed_var": [724, 2549, 54788], "large_speed_var": [724, 2549, 5478], "speed_changes": 156