Example #1
0
def test_compare_scalar(typ):
    """Check the comparison kernels for an integer container vs. a scalar.

    ``typ`` selects the container under test: ``"array"`` builds inputs and
    expected results with ``pa.array``; any other value wraps them in a
    single-chunk ``pa.chunked_array``.
    """
    if typ == "array":
        build = pa.array
    else:

        def build(values):
            return pa.chunked_array([values])

    data = build([1, 2, 3, None])
    two = pa.scalar(2)

    assert pc.equal(data, two).equals(build([False, True, False, None]))

    if typ == "array":
        # Comparing against a null scalar is expected to give null everywhere.
        null_scalar = pa.scalar(None, type="int64")
        assert pc.equal(data, null_scalar).to_pylist() == [None, None, None, None]

    # Remaining kernels, each paired with its expected element-wise outcome.
    remaining = (
        (pc.not_equal, [True, False, True, None]),
        (pc.less, [True, False, False, None]),
        (pc.less_equal, [True, True, False, None]),
        (pc.greater, [False, False, True, None]),
        (pc.greater_equal, [False, True, True, None]),
    )
    for kernel, expected in remaining:
        assert kernel(data, two).equals(build(expected))
Example #2
0
def test_compare_string_scalar(typ):
    """Check the comparison kernels for a string container vs. a scalar.

    ``typ`` selects the container under test: ``"array"`` builds inputs and
    expected results with ``pa.array``; any other value wraps them in a
    single-chunk ``pa.chunked_array``.
    """
    if typ == "array":
        make = pa.array
    else:
        def make(values):
            return pa.chunked_array([values])

    strings = make(['a', 'b', 'c', None])
    pivot = pa.scalar('b')

    assert pc.equal(strings, pivot).equals(make([False, True, False, None]))

    if typ == "array":
        # Comparing against a null scalar is expected to give null everywhere.
        null_scalar = pa.scalar(None, type="string")
        all_null = pc.is_null(pc.equal(strings, null_scalar))
        assert all_null.equals(make([True, True, True, True]))

    # Remaining kernels, each paired with its expected element-wise outcome.
    remaining = (
        (pc.not_equal, [True, False, True, None]),
        (pc.less, [True, False, False, None]),
        (pc.less_equal, [True, True, False, None]),
        (pc.greater, [False, False, True, None]),
        (pc.greater_equal, [False, True, True, None]),
    )
    for kernel, expected in remaining:
        assert kernel(strings, pivot).equals(make(expected))
Example #3
0
def clean_hot(arr, categories=None, drop_first=False):
    """One-hot encode ``arr`` as one boolean mask per category.

    Parameters
    ----------
    arr
        Array-like object exposing ``cast`` and ``unique``; it is cast to
        ``pa.string()`` before any comparison.
    categories : sequence, optional
        Values to encode. When omitted or empty, the categories are taken
        from the truthy distinct values of ``arr`` (nulls and empty strings
        are skipped).
    drop_first : bool, default False
        When true, the first category and its mask are left out of the
        result.

    Returns
    -------
    tuple
        ``(masks, categories)`` where ``masks[i]`` is
        ``c.equal(arr, categories[i])`` with nulls filled as ``False``.
    """
    arr = arr.cast(pa.string())
    if not categories:
        # Nothing supplied: derive categories from the data. The truthiness
        # filter drops both None and "".
        categories = [u for u in arr.unique().to_pylist() if u]
    clns = [c.equal(arr, v).fill_null(False) for v in categories]
    start = 1 if drop_first else 0
    return clns[start:], categories[start:]
Example #4
0
def test_compare_array(typ):
    """Check the comparison kernels between two equal-length containers.

    ``typ`` selects the container under test: ``"array"`` builds inputs and
    expected results with ``pa.array``; any other value wraps them in a
    single-chunk ``pa.chunked_array``.
    """
    if typ == "array":
        build = pa.array
    else:

        def build(values):
            return pa.chunked_array([values])

    left = build([1, 2, 3, 4, None])
    right = build([1, 1, 4, None, 4])

    # Each kernel with its expected element-wise outcome; the last two
    # positions have a null on one side and are expected to come out null.
    cases = (
        (pc.equal, [True, False, False, None, None]),
        (pc.not_equal, [False, True, True, None, None]),
        (pc.less, [False, False, True, None, None]),
        (pc.less_equal, [True, False, True, None, None]),
        (pc.greater, [False, True, False, None, None]),
        (pc.greater_equal, [True, True, False, None, None]),
    )
    for kernel, expected in cases:
        assert kernel(left, right).equals(build(expected))
Example #5
0
def test_compare_scalar(typ):
    """Check the comparison kernels for an integer container vs. a scalar.

    ``typ`` selects the container under test: ``"array"`` builds inputs and
    expected results with ``pa.array``; any other value wraps them in a
    single-chunk ``pa.chunked_array``.
    """
    if typ == "array":

        def con(values):
            return pa.array(values)
    else:

        def con(values):
            return pa.chunked_array([values])

    arr = con([1, 2, 3, None])
    # Build the scalar directly instead of reducing a one-element array.
    scalar = pa.scalar(2)

    result = pc.equal(arr, scalar)
    assert result.equals(con([False, True, False, None]))

    result = pc.not_equal(arr, scalar)
    assert result.equals(con([True, False, True, None]))

    result = pc.less(arr, scalar)
    assert result.equals(con([True, False, False, None]))

    result = pc.less_equal(arr, scalar)
    assert result.equals(con([True, True, False, None]))

    result = pc.greater(arr, scalar)
    assert result.equals(con([False, False, True, None]))

    result = pc.greater_equal(arr, scalar)
    assert result.equals(con([False, True, True, None]))
Example #6
0
def pyarrow_transform(batch: pa.Table) -> pa.Table:
    """Keep the "Versicolor" rows and swap sepal length for a normalized copy.

    Rows whose ``"variety"`` equals ``"Versicolor"`` are kept. A new column
    ``"normalized.sepal.length"`` holds ``"sepal.length"`` divided by its
    maximum over the kept rows, and the original ``"sepal.length"`` column is
    dropped from the returned table.
    """
    is_versicolor = pac.equal(batch["variety"], "Versicolor")
    kept = batch.filter(is_versicolor)

    sepal_length = kept["sepal.length"]
    normalized = pac.divide(sepal_length, pac.max(sepal_length))

    with_normalized = kept.append_column("normalized.sepal.length", normalized)
    return with_normalized.drop(["sepal.length"])
Example #7
0
File: array.py Project: tnir/pandas
    def _mode(self: ArrowExtensionArrayT,
              dropna: bool = True) -> ArrowExtensionArrayT:
        """
        Return the most frequent value(s) of the ExtensionArray.

        The result is always wrapped in ``type(self)``, even when there is
        only a single mode.

        Parameters
        ----------
        dropna : bool, default True
            Don't consider counts of NA values.
            Accepted but not used by this implementation.

        Returns
        -------
        same type as self
            Every value whose count equals the highest count.

        Raises
        ------
        NotImplementedError
            If ``pa_version_under6p0`` is true.
        """
        if pa_version_under6p0:
            raise NotImplementedError(
                "mode only supported for pyarrow version >= 6.0")
        # Request as many modes as there are distinct values, so every
        # value/count pair is returned.
        n_distinct = pc.count_distinct(self._data).as_py()
        value_counts = pc.mode(self._data, n_distinct)
        counts = value_counts.field(1)
        # Counts are expected in descending order, so counts[0] is the max;
        # keep every value tied with it.
        is_top = pc.equal(counts, counts[0])
        return type(self)(value_counts.field(0).filter(is_top))
Example #8
0
def test_input_type_conversion():
    """Compute kernels should accept plain Python lists and scalars."""
    # Two Python lists are converted to arrays; None yields a null result.
    summed = pc.add([1, 2], [4, None])
    assert summed.to_pylist() == [5, None]
    # A bare Python number is accepted as a scalar operand.
    shifted = pc.add([1, 2], 4)
    assert shifted.to_pylist() == [5, 6]
    # The same holds for a string scalar.
    matches = pc.equal(["foo", "bar", None], "foo")
    assert matches.to_pylist() == [True, False, None]
Example #9
0
def binary_col(op, l, r):
    """Evaluate a binary operator expression on two operands.

    ``op`` is the operator symbol (for example ``"+"``, ``"<>"`` or
    ``"and"``); ``l`` and ``r`` are forwarded unchanged, in that order, to
    the matching function of the ``compute`` module.

    Raises ``Exception("binary op not implemented")`` for any other ``op``.
    """
    # (operator symbol, name of the ``compute`` function that implements it)
    kernels = (
        ("+", "add_checked"),
        ("*", "multiply_checked"),
        ("-", "subtract_checked"),
        ("=", "equal"),
        ("<>", "not_equal"),
        ("!=", "not_equal"),
        ("or", "or_"),
        ("<", "less"),
        (">", "greater"),
        ("/", "divide_checked"),
        ("and", "and_"),
        ("in", "is_in"),
        ("==", "equal"),
        ("<=", "less_equal"),
        (">=", "greater_equal"),
    )
    for symbol, kernel_name in kernels:
        if op == symbol:
            # Resolve the kernel only once the operator has matched.
            return getattr(compute, kernel_name)(l, r)
    raise Exception("binary op not implemented")
Example #10
0
def _get_row_mask_per_channel(ch_column: pa.array,
                              channels: List[int]) -> List[int]:
    """Build one row mask per channel.

    ``channels`` is first passed through ``_match_channels_to_reference``
    together with ``ch_column``. For every channel that call returns, the
    result list holds ``pc.equal(channel, ch_column)``, i.e. a per-row flag
    telling whether the row belongs to that channel.

    NOTE(review): the return annotation says ``List[int]`` but the elements
    are the results of ``pc.equal`` — confirm and adjust the annotation.
    """
    matched_channels = _match_channels_to_reference(ch_column, channels)
    return [pc.equal(channel, ch_column) for channel in matched_channels]
Example #11
0
def test_compare_chunked_array_mixed():
    """``pc.equal`` should work across plain and differently chunked inputs.

    The same five values are held as a plain array and as two chunked arrays
    with different chunk boundaries; every pairing must compare equal
    element-wise, with a null in the final slot.
    """
    plain = pa.array([1, 2, 3, 4, None])
    chunked_3_2 = pa.chunked_array([[1, 2, 3], [4, None]])
    chunked_2_3 = pa.chunked_array([[1, 2], [3, 4, None]])

    all_equal = pa.chunked_array([[True, True, True, True, None]])

    operand_pairs = (
        (plain, chunked_3_2),
        (chunked_3_2, plain),
        (chunked_3_2, chunked_2_3),
    )
    for lhs, rhs in operand_pairs:
        assert pc.equal(lhs, rhs).equals(all_equal)
    def get_vectors_for_date_df(
        self,
        date: datetime.datetime,
        vector_names: Sequence[str],
        realizations: Optional[Sequence[int]] = None,
    ) -> pd.DataFrame:
        """Return the requested vectors at a single date as a DataFrame.

        Reads the "DATE" and "REAL" columns plus ``vector_names`` through
        ``self._get_or_read_table``, keeps the rows whose "DATE" equals
        ``date`` and, when ``realizations`` is non-empty, whose "REAL" is one
        of the given realizations. The "DATE" column is dropped from the
        result. Timings of each step are written to the debug log.
        """
        timer = PerfTimer()

        table = self._get_or_read_table(["DATE", "REAL", *vector_names])
        et_read_ms = timer.lap_ms()

        # Note that we use MS here to be aligned with storage type in arrow file
        date_scalar = pa.scalar(date, type=pa.timestamp("ms"))
        row_filter = pc.equal(table["DATE"], date_scalar)

        # None or an empty sequence means "all realizations".
        if realizations:
            wanted_reals = pa.array(realizations)
            in_wanted_reals = pc.is_in(table["REAL"], value_set=wanted_reals)
            row_filter = pc.and_(row_filter, in_wanted_reals)

        # The filter was computed from the full table, so "DATE" can be
        # dropped before the rows are selected.
        table = table.drop(["DATE"]).filter(row_filter)
        et_filter_ms = timer.lap_ms()

        df = table.to_pandas()
        et_to_pandas_ms = timer.lap_ms()

        LOGGER.debug(
            f"get_vectors_for_date_df() took: {timer.elapsed_ms()}ms ("
            f"read={et_read_ms}ms, "
            f"filter={et_filter_ms}ms, "
            f"to_pandas={et_to_pandas_ms}ms), "
            f"#vecs={len(vector_names)}, "
            f"#real={len(realizations) if realizations else 'all'}, "
            f"df.shape={df.shape}, file={Path(self._arrow_file_name).name}")

        return df
Example #13
0
# Exploratory script: CSV conversion options, column sizes and a few
# pyarrow.compute calls on the resulting table.

# NOTE(review): `table` is used here before any assignment visible in this
# snippet — presumably it was loaded earlier; confirm.
table_df = table.to_pandas()

# Force "VendorID" to boolean; true_values / false_values list the strings
# that are read as True / False during conversion.
convert_options = csv.ConvertOptions(
    column_types={
        "VendorID": pa.bool_(),
        # "trip_distance": pa.float16()
    },
    true_values=["Y", "1"],
    false_values=["N", "2"])
table = csv.read_csv("../sec1-intro/yellow_tripdata_2020-01.csv.gz",
                     convert_options=convert_options)
# Distinct store_and_fwd_flag values, then column sizes: store_and_fwd_flag
# in MiB, VendorID in KiB, store_and_fwd_flag in KiB.
print(table["store_and_fwd_flag"].unique(),
      table["store_and_fwd_flag"].nbytes // (1024**2),
      table["VendorID"].nbytes // 1024,
      table["store_and_fwd_flag"].nbytes // 1024)

# Round-trip a boolean array through string and back to boolean.
x = pa.array([False, True]).cast(pa.string()).cast(pa.bool_())

table_df = table.to_pandas()
print(table_df.store_and_fwd_flag)
# NOTE(review): self_destruct=True presumably lets pyarrow release the
# table's memory during conversion, yet `table` is still used below —
# confirm this is safe.
mission_impossible = table.to_pandas(self_destruct=True)

import pyarrow.compute as pc
# Compare total_amount with an int literal and with a float literal; the
# results are discarded.
pc.equal(table["total_amount"], 0)
pc.equal(table["total_amount"], 0.0)
# Keep only rows with a non-zero total_amount so the division below never
# divides by zero.
t0 = table.filter(pc.not_equal(table["total_amount"], 0.0))

# Mean of tip_amount / total_amount over the filtered rows.
pc.mean(pc.divide(t0["tip_amount"], t0["total_amount"]))  # 18ms
# The fair comparison is (also do on other computer)