def test_table_from_pandas_schema():
    # passed schema is source of truth for the columns
    import pandas as pd
    df = pd.DataFrame(
        OrderedDict([('strs', ['', 'foo', 'bar']),
                     ('floats', [4.5, 5, None])]))

    # with different but compatible schema
    schema = pa.schema([('strs', pa.utf8()), ('floats', pa.float32())])
    table = pa.Table.from_pandas(df, schema=schema)
    assert pa.types.is_float32(table.column('floats').type)
    assert table.schema.remove_metadata() == schema

    # with different and incompatible schema
    schema = pa.schema([('strs', pa.utf8()), ('floats', pa.timestamp('s'))])
    with pytest.raises((NotImplementedError, TypeError)):
        pa.Table.from_pandas(df, schema=schema)

    # schema has columns not present in data -> error
    schema = pa.schema([('strs', pa.utf8()), ('floats', pa.float64()),
                        ('ints', pa.int64())])
    with pytest.raises(KeyError, match='ints'):
        pa.Table.from_pandas(df, schema=schema)

    # data has columns not present in schema -> ignored
    schema = pa.schema([('strs', pa.utf8())])
    table = pa.Table.from_pandas(df, schema=schema)
    assert table.num_columns == 1
    assert table.schema.remove_metadata() == schema
    assert table.column_names == ['strs']

def _infer_output_column_type(column: pyarrow.ChunkedArray) -> ColumnType:
    # utf8 columns and dictionary-of-utf8 columns are both Text;
    # everything else is Number
    if column.type == pyarrow.utf8() or (
        hasattr(column.type, "value_type")
        and column.type.value_type == pyarrow.utf8()
    ):
        return ColumnType.Text()
    else:
        return ColumnType.Number()

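# Usage sketch (hedged): exercises _infer_output_column_type on the three
# column shapes it distinguishes. ColumnType is the host project's class;
# the equality checks assume its Text()/Number() values compare by value.
import pyarrow

_text_col = pyarrow.chunked_array([pyarrow.array(["a", "b"])])
_dict_col = pyarrow.chunked_array(
    [pyarrow.array(["a", "b"]).dictionary_encode()])
_num_col = pyarrow.chunked_array([pyarrow.array([1.0, 2.0])])
assert _infer_output_column_type(_text_col) == ColumnType.Text()
assert _infer_output_column_type(_dict_col) == ColumnType.Text()
assert _infer_output_column_type(_num_col) == ColumnType.Number()
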
def test_render_v0_add_retweet_status_screen_name():
    # Migration: what happens when we accumulate tweets where the old stored
    # table does not have retweeted_status_screen_name? We should consider
    # those to have just None in that column.
    input_table = pa.table({
        "screen_name": ["TheTweepyTester", "TheTweepyTester"],
        "created_at": pa.array(
            [dt("2016-11-05T21:38:46Z"), dt("2016-11-05T21:37:13Z")],
            pa.timestamp("ns"),
        ),
        "text": ["Hello", "testing 1000 https://t.co/3vt8ITRQ3w"],
        "retweet_count": [0, 0],
        "favorite_count": [0, 0],
        "in_reply_to_screen_name": pa.array([None, None], pa.utf8()),
        "user_description": ["", ""],
        "source": ["Twitter Web Client", "Tweepy dev"],
        "lang": ["en", "en"],
        "id": [795017539831103489, 795017147651162112],
    })
    with _temp_parquet_file(input_table) as parquet_path:
        _assert_render(
            twitter.FetchResult(parquet_path, []),
            P(accumulate=False),
            input_table.add_column(
                6,
                "retweeted_status_screen_name",
                pa.array([None, None], pa.utf8()),
            ),
            [],
        )

def test_render_v0_truncate_fetch_results():
    all_rows = pa.table({
        "screen_name": ["TheTweepyTester", "TheTweepyTester"],
        "created_at": pa.array(
            [dt("2016-11-05T21:38:46Z"), dt("2016-11-05T21:37:13Z")],
            pa.timestamp("ns"),
        ),
        "text": ["Hello", "testing 1000 https://t.co/3vt8ITRQ3w"],
        "retweet_count": [0, 0],
        "favorite_count": [0, 0],
        "in_reply_to_screen_name": pa.array([None, None], pa.utf8()),
        "retweeted_status_screen_name": pa.array([None, None], pa.utf8()),
        "user_description": ["", ""],
        "source": ["Twitter Web Client", "Tweepy dev"],
        "lang": ["en", "en"],
        "id": [795017539831103489, 795017147651162112],
    })
    with _temp_parquet_file(all_rows) as parquet_path:
        _assert_render(
            twitter.FetchResult(parquet_path, []),
            P(accumulate=False),
            all_rows.slice(0, 1),
            [],
        )

def test_render_v0_recover_after_bug_160258591():
    # https://www.pivotaltracker.com/story/show/160258591
    # 'id', 'retweet_count' and 'favorite_count' had the wrong type after
    # accumulating an empty table. Now the bad data is in our database;
    # let's convert back to the type we want.
    input_table = pa.table({
        "screen_name": ["TheTweepyTester", "TheTweepyTester"],
        "created_at": pa.array(
            [dt("2016-11-05T21:38:46Z"), dt("2016-11-05T21:37:13Z")],
            pa.timestamp("ns"),
        ),
        "text": ["Hello", "testing 1000 https://t.co/3vt8ITRQ3w"],
        "retweet_count": ["0", "0"],
        "favorite_count": ["0", "0"],
        "in_reply_to_screen_name": pa.array([None, None], pa.utf8()),
        "retweeted_status_screen_name": pa.array([None, None], pa.utf8()),
        "user_description": ["", ""],
        "source": ["Twitter Web Client", "Tweepy dev"],
        "lang": ["en", "en"],
        "id": ["795017539831103489", "795017147651162112"],
    })
    expected = (
        input_table
        .set_column(3, "retweet_count", pa.array([0, 0]))
        .set_column(4, "favorite_count", pa.array([0, 0]))
        .set_column(10, "id",
                    pa.array([795017539831103489, 795017147651162112]))
    )
    with _temp_parquet_file(input_table) as parquet_path:
        _assert_render(
            twitter.FetchResult(parquet_path, []),
            P(accumulate=False),
            expected,
            [],
        )

def test_table_from_pydict_arrow_arrays(data, klass):
    data = OrderedDict([('strs', klass(data[0])),
                        ('floats', klass(data[1]))])
    schema = pa.schema([('strs', pa.utf8()), ('floats', pa.float64())])

    # With arrays as values
    table = pa.Table.from_pydict(data)
    assert table.num_columns == 2
    assert table.num_rows == 3
    assert table.schema == schema

    # With explicit (matching) schema
    table = pa.Table.from_pydict(data, schema=schema)
    assert table.num_columns == 2
    assert table.num_rows == 3
    assert table.schema == schema

    # with different but compatible schema
    schema = pa.schema([('strs', pa.utf8()), ('floats', pa.float32())])
    table = pa.Table.from_pydict(data, schema=schema)
    assert pa.types.is_float32(table.column('floats').type)
    assert table.num_columns == 2
    assert table.num_rows == 3
    assert table.schema == schema

    # with different and incompatible schema
    schema = pa.schema([('strs', pa.utf8()), ('floats', pa.timestamp('s'))])
    with pytest.raises((NotImplementedError, TypeError)):
        pa.Table.from_pydict(data, schema=schema)

def test_from_arrays_schema(data, klass):
    data = [klass(data[0]), klass(data[1])]

    schema = pa.schema([('strs', pa.utf8()), ('floats', pa.float32())])
    table = pa.Table.from_arrays(data, schema=schema)
    assert table.num_columns == 2
    assert table.num_rows == 3
    assert table.schema == schema

    # length of data and schema not matching
    schema = pa.schema([('strs', pa.utf8())])
    with pytest.raises(ValueError):
        pa.Table.from_arrays(data, schema=schema)

    # with different but compatible schema
    schema = pa.schema([('strs', pa.utf8()), ('floats', pa.float32())])
    table = pa.Table.from_arrays(data, schema=schema)
    assert pa.types.is_float32(table.column('floats').type)
    assert table.num_columns == 2
    assert table.num_rows == 3
    assert table.schema == schema

    # with different and incompatible schema
    schema = pa.schema([('strs', pa.utf8()), ('floats', pa.timestamp('s'))])
    with pytest.raises((NotImplementedError, TypeError)):
        pa.Table.from_arrays(data, schema=schema)

    # Cannot pass both schema and metadata / names
    with pytest.raises(ValueError):
        pa.Table.from_arrays(data, schema=schema, names=['strs', 'floats'])
    with pytest.raises(ValueError):
        pa.Table.from_arrays(data, schema=schema, metadata={b'foo': b'bar'})

def test_column_types_dict(self):
    # Ask for dict-encoded column types in ConvertOptions
    column_types = [('a', pa.dictionary(pa.int32(), pa.utf8())),
                    ('b', pa.dictionary(pa.int32(), pa.int64())),
                    ('c', pa.dictionary(pa.int32(), pa.decimal128(11, 2))),
                    ('d', pa.dictionary(pa.int32(), pa.large_utf8()))]
    opts = ConvertOptions(column_types=dict(column_types))
    rows = (b"a,b,c,d\n"
            b"abc,123456,1.0,zz\n"
            b"defg,123456,0.5,xx\n"
            b"abc,N/A,1.0,xx\n")
    table = self.read_bytes(rows, convert_options=opts)

    schema = pa.schema(column_types)
    expected = {
        'a': ["abc", "defg", "abc"],
        'b': [123456, 123456, None],
        'c': [Decimal("1.00"), Decimal("0.50"), Decimal("1.00")],
        'd': ["zz", "xx", "xx"],
    }
    assert table.schema == schema
    assert table.to_pydict() == expected

    # Unsupported index type
    column_types[0] = ('a', pa.dictionary(pa.int8(), pa.utf8()))
    opts = ConvertOptions(column_types=dict(column_types))
    with pytest.raises(NotImplementedError):
        table = self.read_bytes(rows, convert_options=opts)

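# Standalone sketch of the same ConvertOptions usage against pyarrow.csv's
# public read_csv (the test above goes through a read_bytes harness instead):
import io
import pyarrow as pa
from pyarrow import csv

_opts = csv.ConvertOptions(
    column_types={"a": pa.dictionary(pa.int32(), pa.utf8())})
_table = csv.read_csv(io.BytesIO(b"a\nx\ny\nx\n"), convert_options=_opts)
assert pa.types.is_dictionary(_table.column("a").type)
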
def test_render_undefined_language_is_null():
    # https://blog.twitter.com/developer/en_us/a/2013/introducing-new-metadata-for-tweets.html
    with _temp_tarfile([
        lambda: _temp_json_path_lz4(
            "1088215462867959800.json.lz4",
            Path("tests/files/1_1_one_undefined_lang.json"),
            {"cjw:apiEndpoint": "1.1/statuses/user_timeline.json"},
        )
    ]) as tar_path:
        _assert_render(
            twitter.FetchResult(tar_path, []),
            P(accumulate=False),
            pa.table({
                "screen_name": ["workbenchdata"],
                "created_at": pa.array(
                    [dt("Wed Jan 23 23:22:39 +0000 2019")],
                    pa.timestamp("ns")),
                "text": ["🤖 https://t.co/FOhOfZT9MZ"],
                "retweet_count": [0],
                "favorite_count": [1],
                "in_reply_to_screen_name": pa.nulls(1, pa.utf8()),
                "retweeted_status_screen_name": pa.nulls(1, pa.utf8()),
                "user_description": [
                    "Scrape, clean and analyze data without code. Create reproducible data workflows that can be shared with others"
                ],
                "source": ["Twitter for iPhone"],
                # "lang" is the key data point we're testing
                "lang": pa.nulls(1, pa.utf8()),
                "id": [1088215462867959800],
            }),
            [],
        )

def test_large_list_type():
    ty = pa.large_list(pa.utf8())
    assert isinstance(ty, pa.LargeListType)
    assert ty.value_type == pa.utf8()
    assert ty.value_field == pa.field("item", pa.utf8(), nullable=True)

    with pytest.raises(TypeError):
        pa.large_list(None)

def test_arrow():
    a = pl.Series("a", [1, 2, 3, None])
    out = a.to_arrow()
    assert out == pa.array([1, 2, 3, None])

    a = pa.array(["foo", "bar"], pa.dictionary(pa.int32(), pa.utf8()))
    s = pl.Series("a", a)
    assert s.dtype == pl.Categorical
    assert (
        pl.from_arrow(
            pa.array([["foo"], ["foo", "bar"]], pa.list_(pa.utf8()))
        ).dtype
        == pl.List
    )

def test_has_header_when_n_rows_is_1(self):
    with _temp_csv("A,B") as path:
        assert_csv_result_equals(
            _internal_parse_csv(path, has_header=True),
            ParseCsvResult(
                pa.table({
                    "A": pa.array([], pa.utf8()),
                    "B": pa.array([], pa.utf8()),
                }),
                [],
            ),
        )

def reencode_dictionary_array(array: pa.Array) -> pa.Array:
    if len(array.indices) <= len(array.dictionary):
        # Groupby often reduces the number of values considerably. Let's shy
        # away from dictionary when it gives us literally nothing.
        return array.cast(pa.utf8())

    used = np.zeros(len(array.dictionary), np.bool_)
    used[array.indices] = True
    if np.all(used):
        return array  # no edit

    return array.cast(pa.utf8()).dictionary_encode()  # TODO optimize

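# Usage sketch for reencode_dictionary_array: after a filter drops "b"
# entirely, re-encoding rebuilds a smaller dictionary. Illustrative only;
# it assumes the pyarrow behaviors the function relies on (Array.filter,
# numpy fancy indexing with an Arrow index array).
import numpy as np
import pyarrow as pa

_arr = pa.array(["a", "b", "a", "a"]).dictionary_encode()
_kept = _arr.filter(pa.array([True, False, True, True]))  # "b" disappears
_out = reencode_dictionary_array(_kept)
assert _out.dictionary.to_pylist() == ["a"]
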
def test_map_type():
    ty = pa.map_(pa.utf8(), pa.int32())
    assert isinstance(ty, pa.MapType)
    assert ty.key_type == pa.utf8()
    assert ty.key_field == pa.field("key", pa.utf8(), nullable=False)
    assert ty.item_type == pa.int32()
    assert ty.item_field == pa.field("value", pa.int32(), nullable=True)

    with pytest.raises(TypeError):
        pa.map_(None)
    with pytest.raises(TypeError):
        pa.map_(pa.int32(), None)
    with pytest.raises(TypeError):
        pa.map_(pa.field("name", pa.string(), nullable=True), pa.int64())

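# Companion sketch: building an actual map array for the type exercised
# above (construction from lists of key/value tuples is standard pyarrow):
import pyarrow as pa

_m = pa.array([[("a", 1), ("b", 2)], []],
              type=pa.map_(pa.utf8(), pa.int32()))
assert _m.type.key_type == pa.utf8()
assert _m.type.item_type == pa.int32()
assert len(_m) == 2
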
def test_is_map():
    m = pa.map_(pa.utf8(), pa.int32())
    assert types.is_map(m)
    assert not types.is_map(pa.int32())

    fields = pa.map_(pa.field('key_name', pa.utf8(), nullable=False),
                     pa.field('value_name', pa.int32()))
    assert types.is_map(fields)

    entries_type = pa.struct([pa.field('key', pa.int8()),
                              pa.field('value', pa.int8())])
    list_type = pa.list_(entries_type)
    assert not types.is_map(list_type)

def test_view():
    # ARROW-5992
    arr = pa.array(['foo', 'bar', 'baz'], type=pa.utf8())
    expected = pa.array(['foo', 'bar', 'baz'], type=pa.binary())

    assert arr.view(pa.binary()).equals(expected)
    assert arr.view('binary').equals(expected)

def test_cast_from_null():
    in_data = [None] * 3
    in_type = pa.null()
    out_types = [
        pa.null(),
        pa.uint8(),
        pa.float16(),
        pa.utf8(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int16()),
        pa.large_list(pa.uint8()),
        pa.decimal128(19, 4),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.list_(pa.int8())),
                   pa.field('c', pa.string())]),
    ]
    for out_type in out_types:
        _check_cast_case((in_data, in_type, in_data, out_type))

    out_types = [
        pa.dictionary(pa.int32(), pa.string()),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_SPARSE),
    ]
    in_arr = pa.array(in_data, type=pa.null())
    for out_type in out_types:
        with pytest.raises(NotImplementedError):
            in_arr.cast(out_type)

def test_type_ids():
    # These ids must stay fixed: we rely on them internally when parsing
    # from Python.
    for idx, arrow_type in [
        (0, pa.null()),
        (1, pa.bool_()),
        (2, pa.uint8()),
        (3, pa.int8()),
        (4, pa.uint16()),
        (5, pa.int16()),
        (6, pa.uint32()),
        (7, pa.int32()),
        (8, pa.uint64()),
        (9, pa.int64()),
        (10, pa.float16()),
        (11, pa.float32()),
        (12, pa.float64()),
        (13, pa.string()),
        (13, pa.utf8()),  # utf8 is an alias for string
        (14, pa.binary()),
        (16, pa.date32()),
        (17, pa.date64()),
        (18, pa.timestamp("us")),
        (19, pa.time32("s")),
        (20, pa.time64("us")),
        (23, pa.decimal128(8, 1)),
        (34, pa.large_utf8()),
        (35, pa.large_binary()),
    ]:
        assert idx == arrow_type.id

def test_statistics_convert_logical_types(tempdir):
    # ARROW-5166, ARROW-4139

    # (min, max, type)
    cases = [
        (10, 11164359321221007157, pa.uint64()),
        (10, 4294967295, pa.uint32()),
        ("ähnlich", "öffentlich", pa.utf8()),
        (datetime.time(10, 30, 0, 1000), datetime.time(15, 30, 0, 1000),
         pa.time32('ms')),
        (datetime.time(10, 30, 0, 1000), datetime.time(15, 30, 0, 1000),
         pa.time64('us')),
        (datetime.datetime(2019, 6, 24, 0, 0, 0, 1000),
         datetime.datetime(2019, 6, 25, 0, 0, 0, 1000),
         pa.timestamp('ms')),
        (datetime.datetime(2019, 6, 24, 0, 0, 0, 1000),
         datetime.datetime(2019, 6, 25, 0, 0, 0, 1000),
         pa.timestamp('us')),
    ]

    for i, (min_val, max_val, typ) in enumerate(cases):
        t = pa.Table.from_arrays([pa.array([min_val, max_val], type=typ)],
                                 ['col'])
        path = str(tempdir / ('example{}.parquet'.format(i)))
        pq.write_table(t, path, version='2.0')
        pf = pq.ParquetFile(path)
        stats = pf.metadata.row_group(0).column(0).statistics
        assert stats.min == min_val
        assert stats.max == max_val

def coerce_arrow(array: "pa.Array") -> "pa.Array":
    # also coerces timezone to naive representation
    # units are accounted for by pyarrow
    if "timestamp" in str(array.type):
        warnings.warn(
            "Conversion of (potentially) timezone aware to naive datetimes. "
            "TZ information may be lost",
        )
        ts_ms = pa.compute.cast(array, pa.timestamp("ms"), safe=False)
        ms = pa.compute.cast(ts_ms, pa.int64())
        del ts_ms
        array = pa.compute.cast(ms, pa.date64())
        del ms
    # note: Decimal256 could not be cast to float
    elif isinstance(array.type, pa.Decimal128Type):
        array = pa.compute.cast(array, pa.float64())
    # simplest solution is to cast to (large)-string arrays
    # this is copy and expensive
    elif isinstance(array, pa.DictionaryArray):
        if array.dictionary.type == pa.string():
            array = pa.compute.cast(
                pa.compute.cast(array, pa.utf8()), pa.large_utf8()
            )
        else:
            raise ValueError(
                "polars does not support dictionary encoded types "
                "other than strings"
            )

    if hasattr(array, "num_chunks") and array.num_chunks > 1:
        array = array.combine_chunks()
    return array

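# Usage sketch for coerce_arrow: a dictionary-of-string column comes out as
# large_utf8, matching the dictionary branch above.
import pyarrow as pa
import pyarrow.compute  # coerce_arrow relies on pa.compute being importable

_dict_arr = pa.array(["x", "y", "x"], pa.dictionary(pa.int32(), pa.utf8()))
assert coerce_arrow(_dict_arr).type == pa.large_utf8()
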
def test_header_rows_convert_to_str():
    workbook = xl.Workbook()
    sheet = workbook.add_sheet("X")
    sheet.write(
        0, 0, datetime.date(2020, 1, 25),
        xl.easyxf(num_format_str="dd-mmm-yyyy")
    )
    sheet.write(0, 1, 123.4213)
    sheet.write(0, 2, 123.4213, xl.easyxf(num_format_str="#.00"))
    # Leave D1 blank
    # It'd be nice to set E1="", but xlwt treats "" as blank
    sheet.write(1, 0, "a")
    sheet.write(1, 1, "b")
    sheet.write(1, 2, "c")
    sheet.write(1, 3, "d")
    with tempfile.NamedTemporaryFile(suffix="-headers.arrow") as header_file:
        # ignore result
        do_convert_data(workbook, header_rows="0-1",
                        header_rows_file=header_file.name)
        with pyarrow.ipc.open_file(header_file.name) as header_reader:
            header_table = header_reader.read_all()
            assert_table_equals(
                header_table,
                pyarrow.table(
                    {
                        "A": ["25-Jan-2020"],
                        "B": ["123.4213"],
                        "C": ["123.42"],
                        "D": pyarrow.array([None], pyarrow.utf8()),
                    }
                ),
            )

def _query_conversations(db: sqlite3.Connection) -> pa.Table:
    with contextlib.closing(db.cursor()) as cursor:
        cursor.execute(CONVERSATIONS_SQL)
        table1 = _cursor_to_table(cursor)

    # Extract messages in Python, not an SQLite UDF, so it's easy to debug
    # the query as described in the README.
    last_message_pattern = re.compile("(?:.*\n\u2063)*(.*)", re.DOTALL)

    def extract_last_message(messages_str: str) -> str:
        r"""Omit all but the final message from a conversation.

        Check doesn't use a JSON Array to delimit separate message texts.
        Instead, it delimits them by '\n\u2063'.
        """
        if messages_str is None:
            return None
        return last_message_pattern.match(messages_str).group(1)

    user_messages_list = table1["user_messages"].to_pylist()
    last_user_message_list = [
        extract_last_message(m) for m in user_messages_list
    ]
    return table1.add_column(
        table1.column_names.index("user_messages") + 1,
        "last_user_message",
        pa.array(last_user_message_list, pa.utf8()),
    )

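# Behavior sketch for the last-message regex above: messages are joined
# with "\n\u2063", and only the text after the final delimiter survives.
import re

_pat = re.compile("(?:.*\n\u2063)*(.*)", re.DOTALL)
assert _pat.match("hi\n\u2063there\n\u2063bye").group(1) == "bye"
assert _pat.match("only one message").group(1) == "only one message"
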
def _convert_to_parquet(self):
    import pyarrow as pa

    final_schema = []
    PRIMITIVE = {
        'string': pa.utf8(),
        'int': pa.int32(),
        'long': pa.int64(),
        'boolean': pa.bool_(),
        'double': pa.float64(),
        'float': pa.float32(),
    }
    for field in self._metadata['fields']:
        name = field['name']
        writer_type = field['type']
        if not isinstance(writer_type, list):
            writer_type = [writer_type]
        comment = {}
        nullable = False
        final_type = None
        for nntype in writer_type:
            if isinstance(nntype, str):
                if nntype == 'null':
                    nullable = True
                else:
                    final_type = PRIMITIVE[nntype]
            elif isinstance(nntype, dict):
                if 'logicalType' in nntype:
                    logical_type = nntype['logicalType']
                    if logical_type == 'decimal':
                        scale = nntype.get('scale', 10)
                        precision = nntype.get('precision', 38)
                        final_type = pa.decimal128(precision, scale)
                    elif logical_type == 'timestamp-millis':
                        final_type = pa.timestamp('ms')
                    elif logical_type == 'timestamp-micros':
                        final_type = pa.timestamp('us')
                        comment['timeunit'] = 'us'
                    elif logical_type == 'date':
                        final_type = pa.date32()
                    else:
                        raise NotImplementedError(
                            'logicalType {} is not implemented'.format(
                                logical_type))
                elif 'type' in nntype:
                    if nntype['type'] in PRIMITIVE:
                        final_type = PRIMITIVE[nntype['type']]
                        if 'connect.name' in nntype:
                            comment['connect_type'] = nntype['connect.name']
                    else:
                        raise NotImplementedError(
                            'Type {} is not implemented'.format(nntype))
                else:
                    raise NotImplementedError(
                        'Type {} is not implemented'.format(nntype))
            else:
                raise NotImplementedError(
                    'Type {} is not implemented'.format(nntype))
        final_schema.append(pa.field(name, final_type, nullable, comment))
    return pa.schema(final_schema)

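# Shape sketch of the Avro-style field metadata the converter above consumes
# (field names and values here are illustrative, not from any real schema):
_example_fields = [
    {"name": "id", "type": "long"},                        # required int64
    {"name": "name", "type": ["null", "string"]},          # nullable utf8
    {"name": "price",
     "type": ["null",
              {"logicalType": "decimal", "precision": 18, "scale": 2}]},
]
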
def case_basic_required(size=1):
    int64 = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    float64 = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]
    string = ["Hello", "bbb", "aa", "", "bbb", "abc", "bbb", "bbb", "def",
              "aaa"]
    boolean = [True, True, False, False, False, True, True, True, True,
               True]

    fields = [
        pa.field('int64', pa.int64(), nullable=False),
        pa.field('float64', pa.float64(), nullable=False),
        pa.field('string', pa.utf8(), nullable=False),
        pa.field('bool', pa.bool_(), nullable=False),
        pa.field('date', pa.timestamp('ms'), nullable=False),
        pa.field('uint32', pa.uint32(), nullable=False),
    ]
    schema = pa.schema(fields)

    return {
        "int64": int64 * size,
        "float64": float64 * size,
        "string": string * size,
        "bool": boolean * size,
        "date": int64 * size,
        "uint32": int64 * size,
    }, schema, f"basic_required_{size*10}.parquet"

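# Usage sketch: materialize one generated case as a parquet file. pa.table
# converts the plain-int 'date' and 'uint32' lists according to the schema;
# the helper above supplies data, schema and a file name.
import pyarrow as pa
import pyarrow.parquet as pq

_data, _schema, _path = case_basic_required(size=1)
pq.write_table(pa.table(_data, schema=_schema), _path)
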
def test_render_retweeted_status_full_text_twitter_api_v1():
    with _temp_tarfile([
        lambda: _temp_json_path_lz4(
            "1105492514289512400.json.lz4",
            Path("tests/files/1_1_one_extended_retweet.json"),
            {"cjw:apiEndpoint": "1.1/statuses/user_timeline.json"},
        )
    ]) as tar_path:
        _assert_render(
            twitter.FetchResult(tar_path, []),
            P(accumulate=False),
            pa.table({
                "screen_name": ["workbenchdata"],
                "created_at": pa.array(
                    [dt("Tue Mar 12 15:35:29 +0000 2019")],
                    pa.timestamp("ns")),
                "text": [
                    # "text" is the key data point we're testing
                    "RT @JacopoOttaviani: ⚡️ I'm playing with @workbenchdata: absolutely mindblowing. It's like a fusion between ScraperWiki, OpenRefine and Datawrapper. All of it online in the cloud and for free 👉🏽 https://t.co/fleqjI1qCI https://t.co/mmWHJLDjT2 #ddj #dataviz"
                ],
                "retweet_count": [7],
                "favorite_count": [0],
                "in_reply_to_screen_name": pa.nulls(1, pa.utf8()),
                "retweeted_status_screen_name": ["JacopoOttaviani"],
                "user_description": [
                    "Scrape, clean and analyze data without code. Create reproducible data workflows that can be shared with others"
                ],
                "source": ["Twitter for iPhone"],
                "lang": ["en"],
                "id": [1105492514289512400],
            }),
            [],
        )

def test_dictionary_delta(stream_fixture):
    ty = pa.dictionary(pa.int8(), pa.utf8())
    data = [
        ["foo", "foo", None],
        ["foo", "bar", "foo"],         # potential delta
        ["foo", "bar"],
        ["foo", None, "bar", "quux"],  # potential delta
        ["bar", "quux"],               # replacement
    ]
    batches = [
        pa.RecordBatch.from_arrays([pa.array(v, type=ty)], names=['dicts'])
        for v in data
    ]
    schema = batches[0].schema

    def write_batches():
        with stream_fixture._get_writer(pa.MockOutputStream(),
                                        schema) as writer:
            for batch in batches:
                writer.write_batch(batch)
        return writer.stats

    st = write_batches()
    assert st.num_record_batches == 5
    assert st.num_dictionary_batches == 4
    assert st.num_replaced_dictionaries == 3
    assert st.num_dictionary_deltas == 0

    stream_fixture.use_legacy_ipc_format = None
    stream_fixture.options = pa.ipc.IpcWriteOptions(
        emit_dictionary_deltas=True)
    st = write_batches()
    assert st.num_record_batches == 5
    assert st.num_dictionary_batches == 4
    assert st.num_replaced_dictionaries == 1
    assert st.num_dictionary_deltas == 2

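# Minimal sketch of enabling dictionary deltas on a plain IPC stream writer
# (pa.ipc.new_stream is the public counterpart of the fixture's _get_writer;
# this assumes a pyarrow version that supports emit_dictionary_deltas):
import pyarrow as pa

_ty = pa.dictionary(pa.int8(), pa.utf8())
_batch = pa.RecordBatch.from_arrays([pa.array(["a", "b"], type=_ty)], ["d"])
_sink = pa.BufferOutputStream()
_options = pa.ipc.IpcWriteOptions(emit_dictionary_deltas=True)
with pa.ipc.new_stream(_sink, _batch.schema, options=_options) as _writer:
    _writer.write_batch(_batch)
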
def test_request_serialized_example(self):
    example = text_format.Parse(
        """
        features {
          feature { key: "x_bytes" value { bytes_list { value: ["ASa8asdf"] }}}
          feature { key: "x" value { bytes_list { value: "JLK7ljk3" }}}
          feature { key: "y" value { int64_list { value: [1, 2] }}}
        }
        """, tf.train.Example())
    serialized_example_remote = [example.SerializeToString()]
    record_batch_remote = pa.RecordBatch.from_arrays(
        [
            pa.array([["ASa8asdf"]], type=pa.list_(pa.binary())),
            pa.array([["JLK7ljk3"]], type=pa.list_(pa.utf8())),
            pa.array([[1, 2]], type=pa.list_(pa.int32())),
            pa.array([[4.5, 5, 5.5]], type=pa.list_(pa.float32())),
            serialized_example_remote,
        ],
        ['x_bytes', 'x', 'y', 'z', _RECORDBATCH_COLUMN],
    )
    result = list(bsl_util.RecordToJSON(record_batch_remote, True))
    self.assertEqual(result, [{
        'b64': base64.b64encode(example.SerializeToString()).decode()
    }])

def python_to_arrow_type(cls, python_type: type) -> pa.DataType:
    if python_type == bool:
        return pa.bool_()
    if python_type == int:
        return pa.int64()
    if python_type == float:
        return pa.float64()
    if python_type == decimal.Decimal:
        return pa.decimal128(cls.__TRAC_DECIMAL_PRECISION,
                             cls.__TRAC_DECIMAL_SCALE)
    if python_type == str:
        return pa.utf8()
    if python_type == dt.date:
        return pa.date32()
    if python_type == dt.datetime:
        return pa.timestamp(cls.__TRAC_TIMESTAMP_UNIT,
                            cls.__TRAC_TIMESTAMP_ZONE)

    raise _ex.ETracInternal(
        f"No Arrow type mapping available for Python type [{python_type}]")

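# The constant-free rows of the mapping above, restated as a plain lookup
# table for reference (the decimal and timestamp cases depend on class
# constants and are omitted; this table is illustrative, not part of the
# original class):
import datetime as _dt
import pyarrow as pa

_SIMPLE_PYTHON_TO_ARROW = {
    bool: pa.bool_(),
    int: pa.int64(),
    float: pa.float64(),
    str: pa.utf8(),
    _dt.date: pa.date32(),
}
assert _SIMPLE_PYTHON_TO_ARROW[str] == pa.utf8()
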
def simple_dicts_table():
    dict_values = pa.array(["foo", "baz", "quux"], type=pa.utf8())
    data = [
        pa.chunked_array([
            pa.DictionaryArray.from_arrays([1, 0, None], dict_values),
            pa.DictionaryArray.from_arrays([2, 1], dict_values),
        ]),
    ]
    return pa.Table.from_arrays(data, names=['some_dicts'])

def test_cast_binary_to_utf8():
    binary_arr = pa.array([b'foo', b'bar', b'baz'], type=pa.binary())
    utf8_arr = binary_arr.cast(pa.utf8())
    expected = pa.array(['foo', 'bar', 'baz'], type=pa.utf8())
    assert utf8_arr.equals(expected)

    non_utf8_values = [(u'mañana').encode('utf-16-le')]
    non_utf8_binary = pa.array(non_utf8_values)
    assert non_utf8_binary.type == pa.binary()
    with pytest.raises(ValueError):
        non_utf8_binary.cast(pa.string())

    non_utf8_all_null = pa.array(non_utf8_values, mask=np.array([True]),
                                 type=pa.binary())
    # No error
    casted = non_utf8_all_null.cast(pa.string())
    assert casted.null_count == 1

def test_table_from_pydict():
    table = pa.Table.from_pydict({})
    assert table.num_columns == 0
    assert table.num_rows == 0
    assert table.schema == pa.schema([])
    assert table.to_pydict() == {}

    # With arrays as values
    data = OrderedDict([('strs', pa.array([u'', u'foo', u'bar'])),
                        ('floats', pa.array([4.5, 5, None]))])
    schema = pa.schema([('strs', pa.utf8()), ('floats', pa.float64())])
    table = pa.Table.from_pydict(data)
    assert table.num_columns == 2
    assert table.num_rows == 3
    assert table.schema == schema

    # With chunked arrays as values
    data = OrderedDict([('strs', pa.chunked_array([[u''], [u'foo', u'bar']])),
                        ('floats', pa.chunked_array([[4.5], [5, None]]))])
    table = pa.Table.from_pydict(data)
    assert table.num_columns == 2
    assert table.num_rows == 3
    assert table.schema == schema

    # With lists as values
    data = OrderedDict([('strs', [u'', u'foo', u'bar']),
                        ('floats', [4.5, 5, None])])
    table = pa.Table.from_pydict(data)
    assert table.num_columns == 2
    assert table.num_rows == 3
    assert table.schema == schema
    assert table.to_pydict() == data

    # With metadata and inferred schema
    metadata = {b'foo': b'bar'}
    schema = schema.add_metadata(metadata)
    table = pa.Table.from_pydict(data, metadata=metadata)
    assert table.schema == schema
    assert table.schema.metadata == metadata
    assert table.to_pydict() == data

    # With explicit schema
    table = pa.Table.from_pydict(data, schema=schema)
    assert table.schema == schema
    assert table.schema.metadata == metadata
    assert table.to_pydict() == data

    # Cannot pass both schema and metadata
    with pytest.raises(ValueError):
        pa.Table.from_pydict(data, schema=schema, metadata=metadata)