Example #1
def test_render_undefined_language_is_null():
    # https://blog.twitter.com/developer/en_us/a/2013/introducing-new-metadata-for-tweets.html
    with _temp_tarfile([
            lambda: _temp_json_path_lz4(
                "1088215462867959800.json.lz4",
                Path("tests/files/1_1_one_undefined_lang.json"),
                {"cjw:apiEndpoint": "1.1/statuses/user_timeline.json"},
            )
    ]) as tar_path:
        _assert_render(
            twitter.FetchResult(tar_path, []),
            P(accumulate=False),
            pa.table({
                "screen_name": ["workbenchdata"],
                "created_at":
                pa.array([dt("Wed Jan 23 23:22:39 +0000 2019")],
                         pa.timestamp("ns")),
                "text": ["🤖 https://t.co/FOhOfZT9MZ"],
                "retweet_count": [0],
                "favorite_count": [1],
                "in_reply_to_screen_name":
                pa.nulls(1, pa.utf8()),
                "retweeted_status_screen_name":
                pa.nulls(1, pa.utf8()),
                "user_description": [
                    "Scrape, clean and analyze data without code. Create reproducible data workflows that can be shared with others"
                ],
                "source": ["Twitter for iPhone"],
                # "lang" is the key data point we're testing
                "lang":
                pa.nulls(1, pa.utf8()),
                "id": [1088215462867959800],
            }),
            [],
        )
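For reference, a minimal standalone sketch (plain pyarrow, nothing from the test harness assumed) of what the pa.nulls(1, pa.utf8()) used for the expected "lang" column produces:

import pyarrow as pa

# pa.nulls(n, type) builds an all-null array of the requested type.
lang = pa.nulls(1, pa.utf8())
assert lang.type == pa.utf8()
assert lang.null_count == 1
assert lang.to_pylist() == [None]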
Example #2
    def to_arrow(self):
        children = [
            pa.nulls(len(child))
            if len(child) == child.null_count
            else child.to_arrow()
            for child in self.children
        ]

        pa_type = pa.struct(
            {
                field: child.type
                for field, child in zip(self.dtype.fields, children)
            }
        )

        if self.nullable:
            nbuf = self.mask.to_host_array().view("int8")
            nbuf = pa.py_buffer(nbuf)
            buffers = (nbuf,)
        else:
            buffers = (None,)

        return pa.StructArray.from_buffers(
            pa_type, len(self), buffers, children=children
        )
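A hedged, CPU-only sketch of the same construction in plain pyarrow (the field names are made up, and no device-side mask is involved): an all-null child passed to pa.StructArray.from_buffers behaves like the pa.nulls(len(child)) branch above.

import pyarrow as pa

# Hypothetical two-field struct; field "b" is entirely null. buffers=[None]
# means there is no top-level validity bitmap, i.e. no struct-level nulls.
children = [pa.array([1, 2, 3]), pa.nulls(3)]
pa_type = pa.struct({"a": children[0].type, "b": children[1].type})
arr = pa.StructArray.from_buffers(pa_type, 3, [None], children=children)
assert arr.to_pylist() == [
    {"a": 1, "b": None},
    {"a": 2, "b": None},
    {"a": 3, "b": None},
]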
Example #3
def _pandas_series_to_arrow(
    values: Union["pd.Series", "pd.DatetimeIndex"],
    nan_to_none: bool = True,
    min_len: Optional[int] = None,
) -> "pa.Array":
    """
    Convert a pandas Series to an Arrow Array.

    Parameters
    ----------
    values
        Series to convert to arrow
    nan_to_none
        Interpret `NaN` as missing values
    min_len
        If the values are all null, this length is used to create a dummy f64 array (with every value set to null)

    Returns
    -------
    """
    dtype = values.dtype
    if dtype == "object" and len(values) > 0:
        if isinstance(values.values[0], str):
            return pa.array(values, pa.large_utf8(), from_pandas=nan_to_none)

        # The array is all-null; fall back to a typed float64 null array
        if values.values[0] is None and min_len is not None:
            return pa.nulls(min_len, pa.float64())
        else:
            return pa.array(values, from_pandas=nan_to_none)
    else:
        return pa.array(values, from_pandas=nan_to_none)
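The nan_to_none flag is forwarded to pyarrow's from_pandas option; a small illustration of the difference it makes:

import numpy as np
import pyarrow as pa

# With from_pandas=True, NaN is treated as a missing value (null);
# with from_pandas=False it stays an ordinary float NaN.
vals = np.array([1.0, np.nan])
assert pa.array(vals, from_pandas=True).null_count == 1
assert pa.array(vals, from_pandas=False).null_count == 0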
Example #4
def _pandas_series_to_arrow(
    values: pd.Series | pd.DatetimeIndex,
    nan_to_none: bool = True,
    min_len: int | None = None,
) -> pa.Array:
    """
    Convert a pandas Series to an Arrow Array.

    Parameters
    ----------
    values
        Series to convert to arrow
    nan_to_none
        Interpret `NaN` as missing values
    min_len
        If the values are all null, this length is used to create a dummy f64 array (with every value set to null)

    Returns
    -------
    """
    dtype = values.dtype
    if dtype == "object" and len(values) > 0:
        first_non_none = _get_first_non_none(values.values)  # type: ignore[arg-type]

        if isinstance(first_non_none, str):
            return pa.array(values, pa.large_utf8(), from_pandas=nan_to_none)
        if first_non_none is None and min_len is not None:
            return pa.nulls(min_len, pa.large_utf8())

        return pa.array(values, from_pandas=nan_to_none)
    else:
        return pa.array(values, from_pandas=nan_to_none)
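_get_first_non_none is not shown in this snippet; a plausible stand-in (an assumption, not necessarily the actual polars helper) would be:

def _get_first_non_none(values):
    # Hypothetical helper: return the first element that is not None,
    # or None if every element is missing.
    for value in values:
        if value is not None:
            return value
    return None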
Example #5
    def _filler(self, n, dtype):
        assert n > 0
        if self.fill_value is None:
            type = vaex.array_types.to_arrow_type(dtype)
            return pa.nulls(n, type=type)
        else:
            return vaex.array_types.full(n, self.fill_value, dtype=dtype)
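A hedged illustration of the two branches, with plain pyarrow/numpy standing in for the vaex.array_types helpers:

import numpy as np
import pyarrow as pa

# No fill value: a typed all-null Arrow array of length n.
assert pa.nulls(4, pa.float32()).to_pylist() == [None] * 4
# With a fill value: a constant array of the requested dtype
# (np.full stands in for vaex.array_types.full here).
assert np.full(4, 0.5, dtype=np.float32).tolist() == [0.5] * 4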
Example #6
def test_render_retweeted_status_full_text_twitter_api_v1():
    with _temp_tarfile([
            lambda: _temp_json_path_lz4(
                "1105492514289512400.json.lz4",
                Path("tests/files/1_1_one_extended_retweet.json"),
                {"cjw:apiEndpoint": "1.1/statuses/user_timeline.json"},
            )
    ]) as tar_path:
        _assert_render(
            twitter.FetchResult(tar_path, []),
            P(accumulate=False),
            pa.table({
                "screen_name": ["workbenchdata"],
                "created_at":
                pa.array([dt("Tue Mar 12 15:35:29 +0000 2019")],
                         pa.timestamp("ns")),
                "text": [
                    # "text" is the key data point we're testing
                    "RT @JacopoOttaviani: ⚡️ I'm playing with @workbenchdata: absolutely mindblowing. It's like a fusion between ScraperWiki, OpenRefine and Datawrapper. All of it online in the cloud and for free 👉🏽 https://t.co/fleqjI1qCI https://t.co/mmWHJLDjT2 #ddj #dataviz"
                ],
                "retweet_count": [7],
                "favorite_count": [0],
                "in_reply_to_screen_name":
                pa.nulls(1, pa.utf8()),
                "retweeted_status_screen_name": ["JacopoOttaviani"],
                "user_description": [
                    "Scrape, clean and analyze data without code. Create reproducible data workflows that can be shared with others"
                ],
                "source": ["Twitter for iPhone"],
                "lang": ["en"],
                "id": [1105492514289512400],
            }),
            [],
        )
Example #7
@pytest.mark.parametrize("typed", [False, True])
def test_concat_with_null_type(typed):
    # When concatenating an Arrow float32 column with a null-typed column, the resulting type should be Arrow float32.
    arrow_type = pa.float32()
    x1 = pa.array([1, 2], type=arrow_type)
    x2 = pa.nulls(2, type=arrow_type if typed else None)
    df1 = vaex.from_arrays(x=x1)
    df2 = vaex.from_arrays(x=x2)
    df = df1.concat(df2)
    assert df.x.data_type() == arrow_type
    assert df.x.tolist() == [1, 2, None, None]
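The promotion the test expects can be reproduced in plain pyarrow, as a hedged sketch: an untyped null array casts to any concrete type, after which an ordinary concatenation applies.

import pyarrow as pa

x1 = pa.array([1, 2], type=pa.float32())
x2 = pa.nulls(2).cast(pa.float32())  # null type -> typed nulls
combined = pa.concat_arrays([x1, x2])
assert combined.type == pa.float32()
assert combined.to_pylist() == [1.0, 2.0, None, None]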
Example #8
    def align(cls, N, ar, type, shape):
        # fast path for numpy
        if (isinstance(ar, np.ndarray) and isinstance(type, np.dtype)
                and ar.dtype == type):
            return ar
        # needs a cast (or byteflip)
        if isinstance(ar, np.ndarray) and isinstance(type, np.dtype):
            return ar.astype(type)
        if ar is None:
            type = vaex.array_types.to_arrow_type(type)
            ar = pa.nulls(N, type=type)
        else:
            ar = vaex.array_types.to_arrow(ar)
            # convert null types to typed null types
            if pa.types.is_null(ar.type):
                ar = pa.nulls(len(ar), type=type)
        if ar.type != type:
            ar = ar.cast(type)
        return ar
Example #9
def _recover_from_160258591(table: pa.Table) -> pa.Table:
    """Reset types of columns, in-place."""
    if table.schema == ARROW_SCHEMA:
        return table
    else:
        # https://www.pivotaltracker.com/story/show/160258591
        return pa.table({
            column.name: (
                table[column.name].cast(column.dtype)
                if column.name in table.column_names
                else pa.nulls(len(table), column.dtype)
            )
            for column in Columns
        })
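A self-contained sketch of the same recovery pattern (the schema and column names here are made up, not the workbench ones):

import pyarrow as pa

schema = pa.schema([("a", pa.int64()), ("b", pa.utf8())])
table = pa.table({"a": pa.array([1, 2], pa.int32())})  # "b" is missing
fixed = pa.table({
    field.name: table[field.name].cast(field.type)
    if field.name in table.column_names
    else pa.nulls(len(table), field.type)
    for field in schema
})
assert fixed["a"].type == pa.int64()
assert fixed["b"].to_pylist() == [None, None]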
Example #10
@pytest.mark.parametrize("typed", [False, True])
def test_concat_with_null_type_numpy_and_arrow(typed):
    # When concatenating a numpy float32 column with a null-typed column, the resulting type should be Arrow float32.
    # Rationale: Arrow will use less memory, and numpy has no efficient way to represent all-missing data.
    numpy_type = np.float32()
    arrow_type = pa.float32()
    x1 = np.array([1, 2], dtype=numpy_type)
    x2 = pa.nulls(2, type=arrow_type if typed else None)
    df1 = vaex.from_arrays(x=x1)
    df2 = vaex.from_arrays(x=x2)
    df = df1.concat(df2)
    assert df.x.data_type() == arrow_type
    assert df.x.tolist() == [1, 2, None, None]
Example #11
    def to_arrow(self):
        offsets = self.offsets.to_arrow()
        elements = (
            pa.nulls(len(self.elements))
            if len(self.elements) == self.elements.null_count
            else self.elements.to_arrow()
        )
        pa_type = pa.list_(elements.type)

        if self.nullable:
            nbuf = self.mask.to_host_array().view("int8")
            nbuf = pa.py_buffer(nbuf)
            buffers = (nbuf, offsets.buffers()[1])
        else:
            buffers = offsets.buffers()
        return pa.ListArray.from_buffers(
            pa_type, len(self), buffers, children=[elements]
        )
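A hedged, CPU-only sketch of the list construction in plain pyarrow (the data is made up): pair the offsets of one array with a new child via pa.ListArray.from_buffers.

import pyarrow as pa

offsets = pa.array([0, 2, 3], pa.int32())
elements = pa.array([1, 2, 3], pa.int64())
pa_type = pa.list_(elements.type)
# List layout buffers: [validity bitmap, offsets]; None means no nulls.
buffers = [None, offsets.buffers()[1]]
arr = pa.ListArray.from_buffers(pa_type, 2, buffers, children=[elements])
assert arr.to_pylist() == [[1, 2], [3]]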
Example #12
def _get_field(struct_array: pa.StructArray, field: Union[str, int]) -> pa.Array:
    """Returns struct_array.field(field) with null propagation.

  This function is equivalent to struct_array.field() but correctly handles
  null propagation (the parent struct's null values are propagated to children).

  Args:
    struct_array: A struct array which should be queried.
    field: The request field to retrieve.

  Returns:
    A pa.Array containing the requested field.

  Raises:
    KeyError: If field is not a child field in struct_array.
  """
    child_array = struct_array.field(field)

    # In case all values are present then there's no need for special handling.
    # We can return child_array as is to avoid a performance penalty caused by
    # constructing and flattening the returned array.
    if struct_array.null_count == 0:
        return child_array
    # is_valid returns a BooleanArray with two buffers the buffer at offset
    # 0 is always None and buffer 1 contains the data on which fields are
    # valid/not valid.
    # (https://arrow.apache.org/docs/format/Columnar.html#buffer-listing-for-each-layout)
    validity_bitmap_buffer = struct_array.is_valid().buffers()[1]

    # Construct a new struct array with a single field.  Calling flatten() on the
    # new array guarantees validity bitmaps are merged correctly.
    new_type = pa.struct([pa.field(field, child_array.type)])
    if (child_array.null_count == 0 and child_array.offset != 0):
        # TODO(https://issues.apache.org/jira/browse/ARROW-14156): Remove this
        # special handling once flattening a struct that has children that were
        # sliced produces arrays with a correct validity bitmap.
        child_array = pa.concat_arrays(
            [pa.nulls(0, child_array.type), child_array])
    filtered_struct = pa.StructArray.from_buffers(
        new_type,
        len(struct_array), [validity_bitmap_buffer],
        null_count=struct_array.null_count,
        children=[child_array])
    return filtered_struct.flatten()[0]
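A hedged demonstration of the problem _get_field solves: a struct built from buffers can hold a stale child value under a parent null, which .field() exposes and flatten() correctly masks.

import pyarrow as pa

child = pa.array([1, 2], pa.int64())
typ = pa.struct([pa.field("a", pa.int64())])
# Validity bitmap reading: row 0 valid, row 1 null.
validity = pa.array([True, False]).buffers()[1]
struct = pa.StructArray.from_buffers(typ, 2, [validity], children=[child])
assert struct.field("a").to_pylist() == [1, 2]       # parent null ignored
assert struct.flatten()[0].to_pylist() == [1, None]  # null propagated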
Example #13
def _pandas_series_to_arrow(
    values: Union["pd.Series", "pd.DatetimeIndex"],
    nan_to_none: bool = True,
    min_len: Optional[int] = None,
) -> "pa.Array":
    """
    Convert a pandas Series to an Arrow Array.

    Parameters
    ----------
    values
        Series to convert to arrow
    nan_to_none
        Interpret `NaN` as missing values
    min_len
        If the values are all null, this length is used to create a dummy f64 array (with every value set to null)

    Returns
    -------
    """
    dtype = values.dtype
    if dtype == "datetime64[ns]":
        # We first cast to ms because that's the unit of Datetime.
        # We then cast via int64 to datetime, since casting directly to
        # Datetime led to a loss of time information:
        # https://github.com/pola-rs/polars/issues/476
        arr = pa.array(np.array(values.values, dtype="datetime64[ms]"),
                       from_pandas=nan_to_none)
        arr = pa.compute.cast(arr, pa.int64())
        return pa.compute.cast(arr, pa.timestamp("ms"))
    elif dtype == "object" and len(values) > 0:
        if isinstance(values.values[0], str):
            return pa.array(values, pa.large_utf8(), from_pandas=nan_to_none)

        # The array is all-null; fall back to a typed float64 null array
        if values.values[0] is None and min_len is not None:
            return pa.nulls(min_len, pa.float64())
        else:
            return pa.array(values, from_pandas=nan_to_none)
    else:
        return pa.array(values, from_pandas=nan_to_none)
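A self-contained check of the millisecond round trip described in the comment above (hedged; the values are made up):

import numpy as np
import pyarrow as pa
import pyarrow.compute

vals = np.array(["2021-01-01T00:00:00.123"], dtype="datetime64[ms]")
arr = pa.array(vals)                            # timestamp[ms]
arr = pa.compute.cast(arr, pa.int64())          # epoch milliseconds
arr = pa.compute.cast(arr, pa.timestamp("ms"))  # back to timestamp[ms]
assert arr.to_pylist()[0].isoformat() == "2021-01-01T00:00:00.123000"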