def do_scan(file, cols):
    """Load ``file`` as a dataset and run four aggregates over column 4.

    The dataset is opened with the module-level ``format_`` and read into a
    table on a single thread (``use_threads=False``), which is then
    flattened.  The row count is printed.  The standard deviation, variance,
    mean and sum of the column at index 4 are computed and discarded
    (presumably this function exists to be timed -- confirm with the caller).

    Args:
        file: Dataset source handed to ``ds.dataset``.
        cols: Not used by this function.

    Returns:
        None.
    """
    dataset = ds.dataset(file, format=format_)
    flat = dataset.to_table(use_threads=False).flatten()
    print(flat.num_rows)

    # All four aggregates run over the same column, so look it up once.
    target = flat.column(4)
    for aggregate in (pc.stddev, pc.variance, pc.mean, pc.sum):
        aggregate(target)
Esempio n. 2
0
# NOTE(review): `table` is not assigned anywhere in the visible part of this
# script before this line; presumably it comes from an earlier step -- confirm.
table_df = table.to_pandas()

# Force VendorID to be parsed as a boolean column.  `true_values` and
# `false_values` list the raw strings that are read as True / False.
convert_options = csv.ConvertOptions(
    column_types={
        "VendorID": pa.bool_(),
        # "trip_distance": pa.float16()
    },
    true_values=["Y", "1"],
    false_values=["N", "2"])
table = csv.read_csv("../sec1-intro/yellow_tripdata_2020-01.csv.gz",
                     convert_options=convert_options)
# Distinct flag values, the flag column size in MiB, then the VendorID and
# flag column sizes in KiB.
print(table["store_and_fwd_flag"].unique(),
      table["store_and_fwd_flag"].nbytes // (1024**2),
      table["VendorID"].nbytes // 1024,
      table["store_and_fwd_flag"].nbytes // 1024)

# Round-trip a boolean array through its string representation.
x = pa.array([False, True]).cast(pa.string()).cast(pa.bool_())

table_df = table.to_pandas()
print(table_df.store_and_fwd_flag)

import pyarrow.compute as pc
# Comparing against an int literal and against a float literal; the results
# are discarded (these only have a visible effect in an interactive session).
pc.equal(table["total_amount"], 0)
pc.equal(table["total_amount"], 0.0)
# Drop rows with a zero total so the division below never divides by zero.
t0 = table.filter(pc.not_equal(table["total_amount"], 0.0))

# Mean tip fraction (tip_amount / total_amount) over the remaining rows.
pc.mean(pc.divide(t0["tip_amount"], t0["total_amount"]))  # 18ms
# TODO: unfinished note kept from the original:
# The fair comparison is (also do on other computer)

# `self_destruct=True` lets pyarrow release the Arrow buffers while it
# converts, and pyarrow documents that using the table afterwards can crash
# the program.  This conversion therefore runs last, after every other use
# of `table` above.
mission_impossible = table.to_pandas(self_destruct=True)
Esempio n. 3
0
 def update(self, arr):
     """Recompute ``self.mean``, ``self.min`` and ``self.max`` from ``arr``.

     Each statistic is computed through ``c``, converted with ``as_py()``
     and stored as a Python ``float``.
     """
     self.mean = float(c.mean(arr).as_py())

     # min_max yields both extremes in one result, keyed by 'min' and 'max'.
     bounds = c.min_max(arr)
     lowest = float(bounds['min'].as_py())
     highest = float(bounds['max'].as_py())
     self.min = lowest
     self.max = highest
Esempio n. 4
0
import time

from pyarrow import csv
import pyarrow.compute as pc
import pyarrow.plasma as plasma

# Worker loop: polls the plasma store for objects whose ID starts with "csv-",
# computes the mean tip fraction of the stored table and writes it back under
# a matching "result-" ID.  Runs forever, polling every 50 ms.

# Connect once; the same client is reused for every polling round.
client = plasma.connect("/tmp/fast_python")
while True:
    for plid in client.list():
        try:
            plid_str = plid.binary().decode("us-ascii")
        except UnicodeDecodeError:
            # IDs that are not plain ASCII are not handled by this worker.
            continue
        if not plid_str.startswith("csv-"):
            continue

        original_pid = plid_str[4:]
        # The result ID is padded/truncated to exactly 20 bytes
        # (the length plasma ObjectIDs use).
        result_plid = plasma.ObjectID(
            f"result-{original_pid}".ljust(20, " ")[:20].encode("us-ascii"))
        if client.contains(result_plid):
            # A result for this input is already in the store.
            continue

        print(f"Working on: {plid_str}")
        table = client.get(plid)
        # Skip rows with a zero total so the division is well defined.
        t0 = table.filter(pc.not_equal(table["total_amount"], 0.0))
        my_mean = pc.mean(pc.divide(t0["tip_amount"],
                                    t0["total_amount"])).as_py()
        client.put(my_mean, result_plid)
    time.sleep(0.05)
Esempio n. 5
0
    raise Exception("Could not find UDF named %s" % name)


#
# Prepopulate registry with simple functions
#
registry = UDFRegistry.registry()

# Scalar string helpers: each casts its column to string before changing case.
registry.add(
  ScalarUDF("lower", 1, lambda col: compute.utf8_lower(col.cast(string())))
)
registry.add(
  ScalarUDF("upper", 1, lambda col: compute.utf8_upper(col.cast(string())))
)

#
# Prepopulate with incremental aggregation functions
#

# Every aggregate below casts its result to float64.
registry.add(
  AggUDF("count", 1, lambda col: compute.count(col).cast(float64()))
)
registry.add(
  AggUDF("avg", 1, lambda col: compute.mean(col).cast(float64()))
)
registry.add(
  AggUDF("sum", 1, lambda col: compute.sum(col).cast(float64()))
)

# Welford's algorithm for online std
def std_init():
  """Return a fresh accumulator state ``[0, 0., 0]`` for the online std.

  ``std_update`` treats the three slots as the running count, the running
  mean and the running sum of squared deviations.
  """
  return [0, 0., 0]
def std_update(s, v):
  """Fold one value ``v`` into the accumulator state ``s`` in place.

  ``s[0]`` is the count, ``s[1]`` the running mean and ``s[2]`` the running
  sum of squared deviations.  The same list object is returned.
  """
  count = s[0] + 1
  delta = v - s[1]              # distance from the mean before this value
  mean = s[1] + delta / count
  s[0] = count
  s[1] = mean
  s[2] += delta * (v - mean)    # uses the mean after this value as well
  return s
def std_finalize(s):
  """Turn an accumulator state into the sample standard deviation.

  ``s`` is the ``[count, mean, sum_of_squared_deviations]`` list maintained
  by ``std_update``.

  Returns:
    NaN when fewer than two values were accumulated, otherwise the square
    root of ``s[2] / (s[0] - 1)``.
  """
  if s[0] < 2:
    return float('nan')
  # s[2] / (n - 1) is the sample variance; the standard deviation is its root.
  return (s[2] / (s[0] - 1)) ** 0.5

# Register "std" with np.std plus the init/update/finalize triple defined above.
# NOTE(review): np.std defaults to ddof=0 (population), whereas std_finalize
# divides by n - 1 -- confirm which of the two is intended.
registry.add(
  IncAggUDF("std", 1, np.std, std_init, std_update, std_finalize)
)
Esempio n. 6
0
def my_mean(arr):
    """Return the mean of ``arr`` as computed by ``pc.mean``."""
    result = pc.mean(arr)
    return result