def test_arrow_table_roundtrip():
    import pyarrow as pa

    from pandas.core.arrays._arrow_utils import ArrowPeriodType

    arr = PeriodArray([1, 2, 3], freq="D")
    arr[1] = pd.NaT
    df = pd.DataFrame({"a": arr})

    table = pa.table(df)
    assert isinstance(table.field("a").type, ArrowPeriodType)
    result = table.to_pandas()
    assert isinstance(result["a"].dtype, PeriodDtype)
    tm.assert_frame_equal(result, df)

    table2 = pa.concat_tables([table, table])
    result = table2.to_pandas()
    expected = pd.concat([df, df], ignore_index=True)
    tm.assert_frame_equal(result, expected)

def test_parquet_nested_extension(tmpdir):
    # Parquet support for extension types nested in struct or list
    import pyarrow.parquet as pq

    ext_type = IntegerType()
    storage = pa.array([4, 5, 6, 7], type=pa.int64())
    ext_array = pa.ExtensionArray.from_storage(ext_type, storage)

    struct_array = pa.StructArray.from_arrays(
        [storage, ext_array], names=['ints', 'exts'])

    orig_table = pa.table({'structs': struct_array})
    filename = tmpdir / 'nested_extension_type.parquet'
    pq.write_table(orig_table, filename)

    table = pq.read_table(filename)
    assert table.column(0).type == struct_array.type
    assert table == orig_table

def test_arrow_table_roundtrip(breaks):
    import pyarrow as pa

    from pandas.core.arrays._arrow_utils import ArrowIntervalType

    arr = IntervalArray.from_breaks(breaks)
    arr[1] = None
    df = pd.DataFrame({"a": arr})

    table = pa.table(df)
    assert isinstance(table.field("a").type, ArrowIntervalType)
    result = table.to_pandas()
    assert isinstance(result["a"].dtype, pd.IntervalDtype)
    tm.assert_frame_equal(result, df)

    table2 = pa.concat_tables([table, table])
    result = table2.to_pandas()
    expected = pd.concat([df, df], ignore_index=True)
    tm.assert_frame_equal(result, expected)

def test_only_dictionary_encode_for_big_savings(self):
    no_values = ["A", "B", "C"] * 10  # dictionary would give ~10x savings
    yes_values = ["A", "B"] * 15  # dictionary would give ~15x savings
    csv = "\n".join(f"{no},{yes}" for no, yes in zip(no_values, yes_values))
    with _temp_csv(csv) as path:
        assert_csv_result_equals(
            _internal_parse_csv(path, has_header=False),
            ParseCsvResult(
                pa.table(
                    {
                        "Column 1": no_values,
                        "Column 2": pa.array(yes_values).dictionary_encode(),
                    }
                ),
                [],
            ),
        )

def test_ipc_format(tempdir):
    table = pa.table({'a': pa.array([1, 2, 3], type="int8"),
                      'b': pa.array([.1, .2, .3], type="float64")})

    path = str(tempdir / 'test.arrow')
    with pa.output_stream(path) as sink:
        writer = pa.RecordBatchFileWriter(sink, table.schema)
        writer.write_batch(table.to_batches()[0])
        writer.close()

    dataset = ds.dataset(path, format=ds.IpcFileFormat())
    result = dataset.to_table()
    assert result.equals(table)

    for format_str in ["ipc", "arrow"]:
        dataset = ds.dataset(path, format=format_str)
        result = dataset.to_table()
        assert result.equals(table)

def test_table_repr_to_string():
    # Schema passed explicitly
    schema = pa.schema([pa.field('c0', pa.int16(), metadata={'key': 'value'}),
                        pa.field('c1', pa.int32())],
                       metadata={b'foo': b'bar'})

    tab = pa.table([pa.array([1, 2, 3, 4], type='int16'),
                    pa.array([1, 2, 3, 4], type='int32')], schema=schema)
    assert str(tab) == """pyarrow.Table
c0: int16
c1: int32"""

    assert tab.to_string(show_metadata=True) == """\

def test_dictionary_encode_empty(self):
    # All empty strings => 0 bytes of text data. So Arrow doesn't create
    # a buffer ... and our buffer-size math must account for buf=None.
    with _temp_csv("A,B\n,\n,\n,\n,\n,\n") as path:
        assert_csv_result_equals(
            _internal_parse_csv(
                path, has_header=True, autoconvert_text_to_numbers=False
            ),
            ParseCsvResult(
                pa.table(
                    {
                        "A": pa.array(["", "", "", "", ""]).dictionary_encode(),
                        "B": pa.array(["", "", "", "", ""]).dictionary_encode(),
                    }
                ),
                [],
            ),
        )

def test_v2_lz4_default_compression():
    # ARROW-8750: Make sure that the compression=None option selects lz4 if
    # it's available
    if not pa.Codec.is_available('lz4_frame'):
        pytest.skip("LZ4 compression support is not built in C++")

    # some highly compressible data
    t = pa.table([np.repeat(0, 100000)], names=['f0'])

    buf = io.BytesIO()
    write_feather(t, buf)
    default_result = buf.getvalue()

    buf = io.BytesIO()
    write_feather(t, buf, compression='uncompressed')
    uncompressed_result = buf.getvalue()

    assert len(default_result) < len(uncompressed_result)

def test_filter_errors():
    arr = pa.chunked_array([["a", None], ["c", "d", "e"]])
    batch = pa.record_batch(
        [pa.array(["a", None, "c", "d", "e"])], names=["a'"])
    table = pa.table(
        [pa.array(["a", None, "c", "d", "e"])], names=["a"])

    for obj in [arr, batch, table]:
        # non-boolean dtype
        mask = pa.array([0, 1, 0, 1, 0])
        with pytest.raises(NotImplementedError,
                           match="no kernel matching input types"):
            obj.filter(mask)

        # wrong length
        mask = pa.array([True, False, True])
        with pytest.raises(pa.ArrowInvalid,
                           match="must all be the same length"):
            obj.filter(mask)

def sssp(graph: PropertyGraph, source, length_property, shift, property_name):
    dists = create_distance_array(graph, source, length_property)

    init_bag = InsertBag[UpdateRequest]()
    init_bag.push((source, 0))

    t = StatTimer("Total SSSP")
    t.start()
    for_each(
        init_bag,
        sssp_operator(graph, dists, graph.get_edge_property(length_property)),
        worklist=OrderedByIntegerMetric(obim_indexer(shift)),
        disable_conflict_detection=True,
        loop_name="SSSP",
    )
    t.stop()
    print("Elapsed time: ", t.get(), "milliseconds.")

    graph.add_node_property(pyarrow.table({property_name: dists}))

def accel_convert_to_table(accel, stream_name, user_name, start_time, total_records, frequency):
    ts = get_timestamps(start_time=start_time, total_records=accel[:, 0].size, frequency=frequency)
    print("Converting into Table: Stream Name: ", stream_name)
    try:
        ndarray_table = pa.table(
            {
                "timestamp": ts.get("timestamp"),
                "localtime": ts.get("localtime"),
                "x": accel[:, 0],
                "y": accel[:, 1],
                "z": accel[:, 2]
            }
        )
        file_path = (data_folder_path + "parsed/stream=" + stream_name
                     + "/version=1/user=" + user_name + "/").lower()
        Path(file_path).mkdir(parents=True, exist_ok=True)
        pq.write_table(ndarray_table, file_path + "data.parquet")
    except Exception as e:
        print(e)

def test_projection(primitive_type_test_file, pyarrow_primitive_array, pyarrow_schema):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get())
    ])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {}, Expressions.always_true(), True)

    source_table = pa.table(pyarrow_primitive_array, schema=pyarrow_schema)

    num_cols = source_table.num_columns
    for i in range(1, num_cols - 1):
        source_table = source_table.remove_column(num_cols - i)

    assert source_table == reader.read()

def test_chunked_array_to_pandas_preserve_name():
    # https://issues.apache.org/jira/browse/ARROW-7709
    import pandas as pd
    import pandas.testing as tm

    for data in [
            pa.array([1, 2, 3]),
            pa.array(pd.Categorical(["a", "b", "a"])),
            pa.array(pd.date_range("2012", periods=3)),
            pa.array(pd.date_range("2012", periods=3, tz="Europe/Brussels")),
            pa.array([1, 2, 3], pa.timestamp("ms")),
            pa.array([1, 2, 3], pa.timestamp("ms", "Europe/Brussels"))]:
        table = pa.table({"name": data})
        result = table.column("name").to_pandas()
        assert result.name == "name"
        expected = pd.Series(data.to_pandas(), name="name")
        tm.assert_series_equal(result, expected)

def test_truncate_do_not_cause_invalid_utf8():
    workbook = xl.Workbook()
    sheet = workbook.add_sheet("X")
    for i, s in enumerate(
        [
            # Examples from https://en.wikipedia.org/wiki/UTF-8
            "AAAA",
            "AA\u00A2",  # ¢ (2 bytes) -- keep
            "AAA\u00A2",  # ¢ (2 bytes) -- drop both bytes
            "A\u0939",  # ह (3 bytes) -- keep
            "AA\u0939",  # ह (3 bytes) -- drop all three bytes
            "AAA\u0939",  # ह (3 bytes) -- drop all three bytes
            "\U00010348",  # 𐍈 (4 bytes) -- keep
            "A\U00010348",  # 𐍈 (4 bytes) -- drop all four bytes
            "AA\U00010348",  # 𐍈 (4 bytes) -- drop all four bytes
            "AAA\U00010348",  # 𐍈 (4 bytes) -- drop all four bytes
        ]
    ):
        sheet.write(i, 0, s)
    result, stdout = do_convert_data(
        workbook, max_bytes_per_value=4, header_rows="", include_stdout=True
    )
    expected = pyarrow.table(
        {
            "A": [
                "AAAA",
                "AA\u00A2",
                "AAA",
                "A\u0939",
                "AA",
                "AAA",
                "\U00010348",
                "A",
                "AA",
                "AAA",
            ]
        }
    )
    assert_table_equals(result, expected)
    assert stdout == b"truncated 6 values (value byte limit is 4; see row 2 column A)\n"

def test_column_add(primitive_type_test_file):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(3, "string_col", StringType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(13, "int_col2", IntegerType.get())
    ])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {}, Expressions.always_true(), True)

    pyarrow_array = [
        pa.array([1, 2, 3, 4, 5], type=pa.int32()),
        pa.array([1, 2, 3, None, 5], type=pa.int64()),
        pa.array(['us', 'can', 'us', 'us', 'can'], type=pa.string()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()),
        pa.array([None, None, None, None, None], type=pa.int32())
    ]
    source_table = pa.table(pyarrow_array, schema=pa.schema([
        pa.field("int_col", pa.int32(), nullable=False),
        pa.field("bigint_col", pa.int64(), nullable=True),
        pa.field("string_col", pa.string(), nullable=True),
        pa.field("float_col", pa.float32(), nullable=True),
        pa.field("dbl_col", pa.float64(), nullable=True),
        pa.field("int_col2", pa.int32(), nullable=True),
    ]))

    target_table = reader.read()
    assert source_table == target_table

def write(parquet_path: Path, table: pyarrow.Table) -> None:
    """
    Write an Arrow table to a Parquet file, overwriting if needed.

    We aim to keep the file format "stable": all future versions of
    parquet.read() should support all files written by today's version of
    this function.

    Dictionary-encoded columns will stay dictionary-encoded. Practically,
    `parquet.write(path, table); table = parquet.read(path)` does not change
    `table`.
    """
    if table.num_rows == 0:
        # Workaround for https://issues.apache.org/jira/browse/ARROW-6568
        # If table is zero-length, guarantee it has a RecordBatch so Arrow
        # won't crash when writing a DictionaryArray.
        def empty_array_for_field(field):
            if pyarrow.types.is_dictionary(field.type):
                return pyarrow.DictionaryArray.from_arrays(
                    pyarrow.array([], type=field.type.index_type),
                    pyarrow.array([], type=field.type.value_type),
                )
            else:
                return pyarrow.array([], type=field.type)

        table = pyarrow.table(
            {field.name: empty_array_for_field(field) for field in table.schema}
        )

    pyarrow.parquet.write_table(
        table,
        str(parquet_path),
        version="2.0",
        compression="SNAPPY",
        # Preserve whatever dictionaries we have in Pandas. Write+read
        # should return an exact copy.
        use_dictionary=[
            name.encode("utf-8")
            for name, column in zip(table.column_names, table.columns)
            if pyarrow.types.is_dictionary(column.type)
        ],
    )

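# Hedged usage sketch (not part of the module above): it exercises the
# docstring's round-trip claim with a dictionary-encoded column. The helper
# name, temporary path, and sample data are assumptions made up for
# illustration; the module's own parquet.read() is the intended counterpart.
def _example_write_roundtrip(tmp_path: Path) -> None:
    original = pyarrow.table(
        {"city": pyarrow.array(["Oslo", "Lima", "Oslo"]).dictionary_encode()}
    )
    parquet_path = tmp_path / "example.parquet"
    write(parquet_path, original)
    roundtripped = pyarrow.parquet.read_table(str(parquet_path))
    # Values survive unchanged; whether the dictionary encoding itself is
    # restored here depends on the reader used to open the file.
    assert roundtripped.column("city").to_pylist() == original.column("city").to_pylist()
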
def test_use_nullable_dtypes(self, engine):
    import pyarrow.parquet as pq

    if engine == "fastparquet":
        # We are manually disabling fastparquet's
        # nullable dtype support pending discussion
        pytest.skip("Fastparquet nullable dtype support is disabled")

    table = pyarrow.table(
        {
            "a": pyarrow.array([1, 2, 3, None], "int64"),
            "b": pyarrow.array([1, 2, 3, None], "uint8"),
            "c": pyarrow.array(["a", "b", "c", None]),
            "d": pyarrow.array([True, False, True, None]),
            # Test that nullable dtypes used even in absence of nulls
            "e": pyarrow.array([1, 2, 3, 4], "int64"),
        }
    )
    with tm.ensure_clean() as path:
        # write manually with pyarrow to write integers
        pq.write_table(table, path)
        result1 = read_parquet(path, engine=engine)
        result2 = read_parquet(path, engine=engine, use_nullable_dtypes=True)

    assert result1["a"].dtype == np.dtype("float64")
    expected = pd.DataFrame(
        {
            "a": pd.array([1, 2, 3, None], dtype="Int64"),
            "b": pd.array([1, 2, 3, None], dtype="UInt8"),
            "c": pd.array(["a", "b", "c", None], dtype="string"),
            "d": pd.array([True, False, True, None], dtype="boolean"),
            "e": pd.array([1, 2, 3, 4], dtype="Int64"),
        }
    )
    if engine == "fastparquet":
        # Fastparquet doesn't support string columns yet
        # Only int and boolean
        result2 = result2.drop("c", axis=1)
        expected = expected.drop("c", axis=1)
    tm.assert_frame_equal(result2, expected)

def transpose(self, in_place: bool = True):
    """\
    Transpose whole object.

    Data matrix is transposed, observations and variables are interchanged.
    """
    scdata = self.to_memory().read_all().select(self.var).to_pandas().transpose()
    obs = self.format_obs(pa.table(self.var), self.to_memory())
    var = self.obs
    if not in_place:
        return SCData(scdata, obs, var, self.uns, self.obsm, self.varm)
    else:
        scdata: pa.Table = pa.Table.from_pandas(
            df=scdata, preserve_index=True, nthreads=self.use_cores
        )
        scdata = self.ensure_scdata_format(scdata, obs, var, self.obsm, self.varm, self.uns)
        self.update_scdata(scdata)

def test_convert_uint8_uint16_uint32():
    # parquet only stores int32/int64 values natively. These are upcast to
    # be encoded.
    _test_convert_via_arrow(
        pyarrow.table(
            {
                "u8": pyarrow.array([1, 138, None], type=pyarrow.uint8()),
                "u16": pyarrow.array([1, 38383, None], type=pyarrow.uint16()),
                "u32": pyarrow.array([1, 4294967291, None], type=pyarrow.uint32()),
            }
        ),
        "u8,u16,u32\r\n1,1,1\r\n138,38383,4294967291\r\n,,",
        [
            dict(u8=1, u16=1, u32=1),
            dict(u8=138, u16=38383, u32=4294967291),
            dict(u8=None, u16=None, u32=None),
        ],
    )

def test_parquet_period(tmpdir, registered_period_type):
    # Parquet support for primitive extension types
    period_type, period_class = registered_period_type
    storage = pa.array([1, 2, 3, 4], pa.int64())
    arr = pa.ExtensionArray.from_storage(period_type, storage)
    table = pa.table([arr], names=["ext"])

    import pyarrow.parquet as pq
    filename = tmpdir / 'period_extension_type.parquet'
    pq.write_table(table, filename)

    # Stored in parquet as storage type but with extension metadata saved
    # in the serialized arrow schema
    meta = pq.read_metadata(filename)
    assert meta.schema.column(0).physical_type == "INT64"
    assert b"ARROW:schema" in meta.metadata

    import base64
    decoded_schema = base64.b64decode(meta.metadata[b"ARROW:schema"])
    schema = pa.ipc.read_schema(pa.BufferReader(decoded_schema))
    # Since the type could be reconstructed, the extension type metadata is
    # absent.
    assert schema.field("ext").metadata == {}

    # When reading in, properly create extension type if it is registered
    result = pq.read_table(filename)
    assert result.schema.field("ext").type == period_type
    assert result.schema.field("ext").metadata == {b'PARQUET:field_id': b'1'}
    # Get the exact array class defined by the registered type.
    result_array = result.column("ext").chunk(0)
    assert type(result_array) is period_class

    # When the type is not registered, read in as storage type
    pa.unregister_extension_type(period_type.extension_name)
    result = pq.read_table(filename)
    assert result.schema.field("ext").type == pa.int64()
    # The extension metadata is present for roundtripping.
    assert result.schema.field("ext").metadata == {
        b'ARROW:extension:metadata': b'freq=D',
        b'ARROW:extension:name': b'test.period',
        b'PARQUET:field_id': b'1',
    }

def test_sort_indices_table():
    table = pa.table({"a": [1, 1, 0], "b": [1, 0, 1]})

    result = pc.sort_indices(table, sort_keys=[("a", "ascending")])
    assert result.to_pylist() == [2, 0, 1]

    result = pc.sort_indices(
        table, sort_keys=[("a", "ascending"), ("b", "ascending")]
    )
    assert result.to_pylist() == [2, 1, 0]

    with pytest.raises(ValueError, match="Must specify one or more sort keys"):
        pc.sort_indices(table)

    with pytest.raises(ValueError, match="Nonexistent sort key column"):
        pc.sort_indices(table, sort_keys=[("unknown", "ascending")])

    with pytest.raises(ValueError, match="not a valid order"):
        pc.sort_indices(table, sort_keys=[("a", "nonscending")])

def test_truncate_csv_repair_utf8(self):
    with _temp_csv("A,B\na,b\nc,d\né,f\ng,h") as path:
        assert_csv_result_equals(
            _internal_parse_csv(path, has_header=True),
            ParseCsvResult(
                pa.table({"A": ["a", "c", "�"], "B": ["b", "d", None]}),
                [
                    ParseCsvWarning.TruncatedFile(20, 13),
                    ParseCsvWarning.RepairedEncoding(
                        encoding="utf-8",
                        first_invalid_byte=195,
                        first_invalid_byte_position=12,
                    ),
                ],
            ),
        )

def encode(self):
    global Data_decompressed
    '''
    This function uses the decoder2 function defined above to decode data.
    This model creates a parquet file for the decoded columns.
    '''
    encoded_array = self.encoder_model.generateEncodings(
        Data_decompressed, Mask_decompressed)
    parquetDic = {}
    for i in range(encoded_array.shape[1]):
        name = f'col_{i+1}'
        parquetDic[name] = encoded_array[:, i]
    print(f'Encoder Columns shape: {encoded_array.shape}')
    log2(encoded_array)
    ndarray_table = pa.table(parquetDic)
    pq.write_table(ndarray_table, 'my_encoder.parquet')
    print('File my_encoder.parquet created')

def ndarray_to_file(
        np_array: np.ndarray,
        path: str,
        file_system: AbstractFileSystem,
        content_type: str = ContentType.PARQUET.value,
        **kwargs):
    """
    Writes the given Numpy ndarray to a file.
    """
    # PyArrow only supports 1D ndarrays, so convert to list of 1D arrays
    np_arrays = [array for array in np_array]
    pa_utils.table_to_file(
        pa.table({"data": np_arrays}),
        path,
        file_system,
        content_type,
        **kwargs
    )

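# Illustrative sketch (standalone, not from the source module above): shows
# how the "list of 1D arrays" conversion maps a 2D ndarray onto a single
# list-typed Arrow column, one list per ndarray row. Variable names here are
# made up for the example.
import numpy as np
import pyarrow as pa

matrix = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int64)
sketch_table = pa.table({"data": [row for row in matrix]})
assert sketch_table.column("data").type == pa.list_(pa.int64())
assert sketch_table.column("data").to_pylist() == [[1, 2, 3], [4, 5, 6]]
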
def test_read_fastparquet_text_categorical():
    # To write this file, install fastparquet and run:
    #
    # import fastparquet
    # import pandas as pd
    # fastparquet.write(
    #     'x.parquet',
    #     pd.DataFrame({"A": pd.Series(["x", None, "y", "x", "x"], dtype="category")})
    # )
    path = (
        Path(__file__).parent / "files" / "column-A-dictionary-from-fastparquet.parquet"
    )
    result = do_convert(path)
    assert_table_equals(
        result,
        pyarrow.table(
            {"A": pyarrow.array(["x", None, "y", "x", "x"]).dictionary_encode()}
        ),
    )

def test_encode_nested_arrays_and_objects(self):
    assert_json_result_equals(
        _parse_json_with_defaults(
            [
                {
                    "value": {
                        "x": ["y", {"z": True, "Z": ["a", None]}, ["b", "c"]],
                        "X": {},
                    }
                }
            ]
        ),
        ParseJsonResult(
            pyarrow.table(
                {"value": ['{"x":["y",{"z":true,"Z":["a",null]},["b","c"]],"X":{}}']}
            ),
            [],
        ),
    )

def dataframe_to_arrow_table(
    dataframe: pd.DataFrame, columns: List[Column], path: Path
) -> None:
    """Write `dataframe` to an Arrow file."""
    arrays = []
    for column in columns:
        arrays.append(series_to_arrow_array(dataframe[column.name]))

    arrow_table_without_metadata = pa.Table.from_arrays(
        arrays, names=[c.name for c in columns]
    )
    fields = [
        _fix_arrow_field(arrow_table_without_metadata.schema.field(i), columns[i].type)
        for i in range(len(columns))
    ]
    arrow_table = pa.table(arrow_table_without_metadata.columns, pa.schema(fields))

    with pa.RecordBatchFileWriter(str(path), arrow_table.schema) as writer:
        writer.write_table(arrow_table)

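# Hedged sketch (standalone, not from the module above): the key move in
# dataframe_to_arrow_table() is re-wrapping existing column data with a new
# schema via pa.table(columns, schema), shown here by attaching field
# metadata. The file name and metadata values are made up for illustration.
import pyarrow as pa

plain = pa.table({"id": [1, 2], "name": ["a", "b"]})
annotated_schema = pa.schema(
    [
        plain.schema.field(i).with_metadata({"example-key": "example-value"})
        for i in range(plain.num_columns)
    ]
)
annotated = pa.table(plain.columns, schema=annotated_schema)

with pa.RecordBatchFileWriter("example.arrow", annotated.schema) as writer:
    writer.write_table(annotated)
# Reading back with pa.ipc.open_file("example.arrow").read_all() preserves
# the schema, including the per-field metadata.
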
def render(arrow_table, params, output_path, *, columns, **kwargs):
    # Test the "columns" kwarg
    self.assertEqual(columns, input_columns)
    table = pa.table(
        {
            "A": [1],
            "B": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
            "C": ["a"],
            "D": [1],
            "E": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
            "F": ["a"],
            "G": [1],
            "H": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
            "I": ["a"],
        }
    )
    with pa.ipc.RecordBatchFileWriter(output_path, table.schema) as writer:
        writer.write_table(table)
    return []

def pandas_to_pydf(
    data: "pd.DataFrame",
    columns: Optional[Sequence[str]] = None,
    rechunk: bool = True,
    nan_to_none: bool = True,
) -> "PyDataFrame":
    """
    Construct a PyDataFrame from a pandas DataFrame.
    """
    if not _PYARROW_AVAILABLE:
        raise ImportError(
            "'pyarrow' is required when constructing a PyDataFrame from a pandas DataFrame."
        )
    arrow_dict = {
        str(col): _pandas_series_to_arrow(data[col], nan_to_none=nan_to_none)
        for col in data.columns
    }
    arrow_table = pa.table(arrow_dict)
    return arrow_to_pydf(arrow_table, columns=columns, rechunk=rechunk)

def test_orcfile_readwrite():
    from pyarrow import orc

    buffer_output_stream = pa.BufferOutputStream()
    a = pa.array([1, None, 3, None])
    b = pa.array([None, "Arrow", None, "ORC"])
    table = pa.table({"int64": a, "utf8": b})
    orc.write_table(table, buffer_output_stream)
    buffer_reader = pa.BufferReader(buffer_output_stream.getvalue())
    output_table = orc.ORCFile(buffer_reader).read()
    assert table.equals(output_table)

    # deprecated keyword order
    buffer_output_stream = pa.BufferOutputStream()
    with pytest.warns(FutureWarning):
        orc.write_table(buffer_output_stream, table)
    buffer_reader = pa.BufferReader(buffer_output_stream.getvalue())
    output_table = orc.ORCFile(buffer_reader).read()
    assert table.equals(output_table)