def test_render_undefined_language_is_null():
    """A tweet with an undefined Twitter-detected language renders a null "lang".

    The fixture (1_1_one_undefined_lang.json) holds one 1.1-API status; the
    expected table below pins "lang" to an all-null utf8 column.  Background:
    https://blog.twitter.com/developer/en_us/a/2013/introducing-new-metadata-for-tweets.html
    """
    with _temp_tarfile([
        lambda: _temp_json_path_lz4(
            "1088215462867959800.json.lz4",
            Path("tests/files/1_1_one_undefined_lang.json"),
            {"cjw:apiEndpoint": "1.1/statuses/user_timeline.json"},
        )
    ]) as tar_path:
        _assert_render(
            twitter.FetchResult(tar_path, []),
            P(accumulate=False),
            pa.table({
                "screen_name": ["workbenchdata"],
                "created_at": pa.array(
                    [dt("Wed Jan 23 23:22:39 +0000 2019")], pa.timestamp("ns")
                ),
                "text": ["🤖 https://t.co/FOhOfZT9MZ"],
                "retweet_count": [0],
                "favorite_count": [1],
                "in_reply_to_screen_name": pa.nulls(1, pa.utf8()),
                "retweeted_status_screen_name": pa.nulls(1, pa.utf8()),
                "user_description": [
                    "Scrape, clean and analyze data without code. Create reproducible data workflows that can be shared with others"
                ],
                "source": ["Twitter for iPhone"],
                # "lang" is the key data point we're testing
                "lang": pa.nulls(1, pa.utf8()),
                "id": [1088215462867959800],
            }),
            [],
        )
def to_arrow(self):
    """Convert this struct column to a ``pa.StructArray``.

    Children that are entirely null are emitted as untyped arrow null
    arrays; the parent's validity mask (if any) is copied to host memory
    and handed to arrow as the struct's validity buffer.
    """
    arrow_children = []
    for child in self.children:
        if len(child) == child.null_count:
            # all-null child: cheap untyped null array instead of a full convert
            arrow_children.append(pa.nulls(len(child)))
        else:
            arrow_children.append(child.to_arrow())

    struct_type = pa.struct(
        {name: arr.type for name, arr in zip(self.dtype.fields, arrow_children)}
    )

    if self.nullable:
        mask_buffer = pa.py_buffer(self.mask.to_host_array().view("int8"))
        buffers = (mask_buffer,)
    else:
        buffers = (None,)

    return pa.StructArray.from_buffers(
        struct_type, len(self), buffers, children=arrow_children
    )
def _pandas_series_to_arrow(
    values: Union["pd.Series", "pd.DatetimeIndex"],
    nan_to_none: bool = True,
    min_len: Optional[int] = None,
) -> "pa.Array":
    """
    Convert a pandas Series to an Arrow Array.

    Parameters
    ----------
    values
        Series to convert to arrow
    nan_to_none
        Interpret `NaN` as missing values
    min_len
        in case of null values, this length will be used to create a dummy
        f64 array (with all values set to null)

    Returns
    -------
    pa.Array
    """
    dtype = values.dtype
    if dtype == "object" and len(values) > 0:
        # BUGFIX: inspect the first *non-null* value, not values[0].
        # A leading None used to make an otherwise-string column skip the
        # large_utf8 path (and, with min_len set, be replaced by a dummy
        # all-null array even though real data followed).
        first_non_none = next((v for v in values.values if v is not None), None)
        if isinstance(first_non_none, str):
            return pa.array(values, pa.large_utf8(), from_pandas=nan_to_none)
        # array is null array, we set to a float64 array
        if first_non_none is None and min_len is not None:
            return pa.nulls(min_len, pa.float64())
        return pa.array(values, from_pandas=nan_to_none)
    else:
        return pa.array(values, from_pandas=nan_to_none)
def _pandas_series_to_arrow(
    values: pd.Series | pd.DatetimeIndex,
    nan_to_none: bool = True,
    min_len: int | None = None,
) -> pa.Array:
    """
    Convert a pandas Series to an Arrow Array.

    Parameters
    ----------
    values
        Series to convert to arrow
    nan_to_none
        Interpret `NaN` as missing values
    min_len
        in case of null values, this length will be used to create a dummy
        large_utf8 null array
        (NOTE(review): pa.nulls needs an int here — presumably callers always
        pass min_len for all-null input; confirm)

    Returns
    -------
    pa.Array
    """
    # Non-object (or empty) data: let arrow infer the type directly.
    if values.dtype != "object" or len(values) == 0:
        return pa.array(values, from_pandas=nan_to_none)

    first_non_none = _get_first_non_none(values.values)  # type: ignore[arg-type]
    if isinstance(first_non_none, str):
        # String data goes to large_utf8 explicitly.
        return pa.array(values, pa.large_utf8(), from_pandas=nan_to_none)
    if first_non_none is None:
        # Entirely-null object column: dummy null array of the requested length.
        return pa.nulls(min_len, pa.large_utf8())
    return pa.array(values, from_pandas=nan_to_none)
def _filler(self, n, dtype):
    """Return ``n`` filler values of ``dtype``.

    Produces an arrow all-null array when no fill value is configured,
    otherwise a constant array of ``self.fill_value``.
    """
    assert n > 0
    if self.fill_value is not None:
        return vaex.array_types.full(n, self.fill_value, dtype=dtype)
    arrow_type = vaex.array_types.to_arrow_type(dtype)
    return pa.nulls(n, type=arrow_type)
def test_render_retweeted_status_full_text_twitter_api_v1():
    """An extended-mode 1.1-API retweet renders its full, untruncated text.

    The fixture (1_1_one_extended_retweet.json) holds one retweet; the
    expected "text" cell below is the complete "RT @...:" string, and
    "retweeted_status_screen_name" carries the original author.
    """
    with _temp_tarfile([
        lambda: _temp_json_path_lz4(
            "1105492514289512400.json.lz4",
            Path("tests/files/1_1_one_extended_retweet.json"),
            {"cjw:apiEndpoint": "1.1/statuses/user_timeline.json"},
        )
    ]) as tar_path:
        _assert_render(
            twitter.FetchResult(tar_path, []),
            P(accumulate=False),
            pa.table({
                "screen_name": ["workbenchdata"],
                "created_at": pa.array(
                    [dt("Tue Mar 12 15:35:29 +0000 2019")], pa.timestamp("ns")
                ),
                "text": [
                    # "text" is the key data point we're testing
                    "RT @JacopoOttaviani: ⚡️ I'm playing with @workbenchdata: absolutely mindblowing. It's like a fusion between ScraperWiki, OpenRefine and Datawrapper. All of it online in the cloud and for free 👉🏽 https://t.co/fleqjI1qCI https://t.co/mmWHJLDjT2 #ddj #dataviz"
                ],
                "retweet_count": [7],
                "favorite_count": [0],
                "in_reply_to_screen_name": pa.nulls(1, pa.utf8()),
                "retweeted_status_screen_name": ["JacopoOttaviani"],
                "user_description": [
                    "Scrape, clean and analyze data without code. Create reproducible data workflows that can be shared with others"
                ],
                "source": ["Twitter for iPhone"],
                "lang": ["en"],
                "id": [1105492514289512400],
            }),
            [],
        )
def test_concat_with_null_type(typed):
    """Concatenating a float32 Arrow column with a (typed or untyped) null
    column must yield an Arrow float32 column, with nulls for the missing
    rows."""
    arrow_type = pa.float32()
    left = vaex.from_arrays(x=pa.array([1, 2], type=arrow_type))
    right = vaex.from_arrays(x=pa.nulls(2, type=arrow_type if typed else None))
    combined = left.concat(right)
    assert combined.x.data_type() == arrow_type
    assert combined.x.tolist() == [1, 2, None, None]
def align(cls, N, ar, type, shape):
    """Return ``ar`` coerced to ``type`` (length ``N`` when ``ar`` is None).

    numpy arrays stay numpy (cast if needed); everything else is routed
    through arrow.  ``shape`` is accepted for interface compatibility but
    not used here.
    """
    numpy_pair = isinstance(ar, np.ndarray) and isinstance(type, np.dtype)
    if numpy_pair:
        if ar.dtype == type:
            # fast path: nothing to do
            return ar
        # needs a cast (or byteflip)
        return ar.astype(type)
    if ar is None:
        # missing data: an all-null arrow array of the requested type
        return pa.nulls(N, type=vaex.array_types.to_arrow_type(type))
    result = vaex.array_types.to_arrow(ar)
    if pa.types.is_null(result.type):
        # convert untyped null arrays to typed null arrays
        result = pa.nulls(len(result), type=type)
    if result.type != type:
        result = result.cast(type)
    return result
def _recover_from_160258591(table: pa.Table) -> pa.Table:
    """Return ``table`` with column types normalized to the canonical schema.

    If ``table`` already matches ``ARROW_SCHEMA`` it is returned unchanged.
    Otherwise a *new* table is built (the input is not modified): each
    expected column is cast to its canonical type, and expected columns
    missing from ``table`` are filled with all-null arrays of the right
    length.  Recovers data written before the fix for
    https://www.pivotaltracker.com/story/show/160258591.
    """
    if table.schema == ARROW_SCHEMA:
        return table
    else:
        # https://www.pivotaltracker.com/story/show/160258591
        return pa.table({
            column.name: table[column.name].cast(column.dtype)
            if column.name in table.column_names
            else pa.nulls(len(table), column.dtype)
            for column in Columns
        })
def test_concat_with_null_type_numpy_and_arrow(typed):
    """Concatenating a numpy float32 column with a (typed or untyped) null
    column must yield an *Arrow* float32 column.

    Rationale: Arrow uses less memory; numpy has no efficient way to
    represent all-missing data.
    """
    arrow_type = pa.float32()
    left = vaex.from_arrays(x=np.array([1, 2], dtype=np.float32()))
    right = vaex.from_arrays(x=pa.nulls(2, type=arrow_type if typed else None))
    combined = left.concat(right)
    assert combined.x.data_type() == arrow_type
    assert combined.x.tolist() == [1, 2, None, None]
def to_arrow(self):
    """Convert this list column to a ``pa.ListArray``.

    The offsets child supplies the list boundaries; an all-null elements
    child is emitted as an untyped arrow null array.  When the column is
    nullable, its mask is copied to host memory and used as the validity
    buffer alongside the offsets' data buffer.
    """
    pa_offsets = self.offsets.to_arrow()
    if len(self.elements) == self.elements.null_count:
        # all-null elements: cheap untyped null array instead of a full convert
        pa_elements = pa.nulls(len(self.elements))
    else:
        pa_elements = self.elements.to_arrow()
    list_type = pa.list_(pa_elements.type)
    if self.nullable:
        mask_buffer = pa.py_buffer(self.mask.to_host_array().view("int8"))
        buffers = (mask_buffer, pa_offsets.buffers()[1])
    else:
        buffers = pa_offsets.buffers()
    return pa.ListArray.from_buffers(
        list_type, len(self), buffers, children=[pa_elements]
    )
def _get_field(struct_array: pa.StructArray, field: Union[str, int]) -> pa.Array:
    """Returns struct_array.field(field) with null propagation.

    Equivalent to struct_array.field() but correctly propagates the parent
    struct's null values to the child.

    Args:
      struct_array: A struct array which should be queried.
      field: The request field to retrieve.

    Returns:
      A pa.Array containing the requested field.

    Raises:
      KeyError: If field is not a child field in struct_array.
    """
    child = struct_array.field(field)

    # Fast path: with no parent nulls there is nothing to propagate, and we
    # skip the cost of constructing and flattening a wrapper array.
    if struct_array.null_count == 0:
        return child

    # is_valid() yields a BooleanArray whose buffer 0 is always None and
    # whose buffer 1 holds the validity bits.
    # (https://arrow.apache.org/docs/format/Columnar.html#buffer-listing-for-each-layout)
    parent_validity = struct_array.is_valid().buffers()[1]

    # Wrap the child in a one-field struct carrying the parent's validity;
    # flatten() then merges the validity bitmaps correctly.
    wrapper_type = pa.struct([pa.field(field, child.type)])
    if child.null_count == 0 and child.offset != 0:
        # TODO(https://issues.apache.org/jira/browse/ARROW-14156): Remove this
        # special handling once flattening a struct that has children that were
        # sliced produces arrays with a correct validity bitmap.
        child = pa.concat_arrays([pa.nulls(0, child.type), child])
    wrapped = pa.StructArray.from_buffers(
        wrapper_type,
        len(struct_array),
        [parent_validity],
        null_count=struct_array.null_count,
        children=[child],
    )
    return wrapped.flatten()[0]
def _pandas_series_to_arrow(
    values: Union["pd.Series", "pd.DatetimeIndex"],
    nan_to_none: bool = True,
    min_len: Optional[int] = None,
) -> "pa.Array":
    """
    Convert a pandas Series to an Arrow Array.

    Parameters
    ----------
    values
        Series to convert to arrow
    nan_to_none
        Interpret `NaN` as missing values
    min_len
        in case of null values, this length will be used to create a dummy
        f64 array (with all values set to null)

    Returns
    -------
    pa.Array
    """
    dtype = values.dtype
    if dtype == "datetime64[ns]":
        # We first cast to ms because that's the unit of Datetime,
        # then we cast via int64 to timestamp. Casting directly to Datetime
        # leads to loss of time information:
        # https://github.com/pola-rs/polars/issues/476
        arr = pa.array(
            np.array(values.values, dtype="datetime64[ms]"), from_pandas=nan_to_none
        )
        arr = pa.compute.cast(arr, pa.int64())
        return pa.compute.cast(arr, pa.timestamp("ms"))
    elif dtype == "object" and len(values) > 0:
        # BUGFIX: inspect the first *non-null* value, not values[0].
        # A leading None used to make an otherwise-string column skip the
        # large_utf8 path (and, with min_len set, be replaced by a dummy
        # all-null array even though real data followed).
        first_non_none = next((v for v in values.values if v is not None), None)
        if isinstance(first_non_none, str):
            return pa.array(values, pa.large_utf8(), from_pandas=nan_to_none)
        # array is null array, we set to a float64 array
        if first_non_none is None and min_len is not None:
            return pa.nulls(min_len, pa.float64())
        return pa.array(values, from_pandas=nan_to_none)
    else:
        return pa.array(values, from_pandas=nan_to_none)