def data_frame_group_by_aggregate_14668():
    """Time a three-key group-by aggregate on the vehicles data.

    Loads ``vehicles.csv`` via ``data_frame`` before the clock starts, then
    times grouping by make, model and year and computing the group size and
    the means of ``hwy`` and ``cty``. The aggregate result is discarded.

    NOTE(review): the 14668 in the name is presumably the number of groups
    produced -- confirm against the data file.

    Returns:
        Elapsed time in seconds (float) for the group-by and aggregate only.
    """
    data = data_frame("vehicles.csv")
    # perf_counter is monotonic and high-resolution, unlike time.time(),
    # which can jump when the system clock is adjusted.
    start = time.perf_counter()
    (data
     .group_by("make", "model", "year")
     .aggregate(
         n=di.count(),
         hwy=di.mean("hwy"),
         cty=di.mean("cty")))
    return time.perf_counter() - start
def data_frame_group_by_aggregate_100000_short_numba():
    """Time a single-key group-by aggregate with ``dataiter.USE_NUMBA`` on.

    ``dataiter.USE_NUMBA`` is patched to ``True`` for the duration of the
    run. Random data is generated via ``data_frame_random`` before the clock
    starts; only grouping by ``g`` and computing the mean and standard
    deviation of ``a`` is timed. The aggregate result is discarded.

    NOTE(review): the arguments to ``data_frame_random`` are presumably the
    row count and the group count -- confirm against its definition.

    Returns:
        Elapsed time in seconds (float) for the group-by and aggregate only.
    """
    with patch("dataiter.USE_NUMBA", True):
        data = data_frame_random(1_000_000, 100_000)
        # perf_counter is monotonic and high-resolution, unlike time.time(),
        # which can jump when the system clock is adjusted.
        start = time.perf_counter()
        (data
         .group_by("g")
         .aggregate(
             a_mean=di.mean("a"),
             a_std=di.std("a")))
        return time.perf_counter() - start
def test_aggregate(self):
    """Check a grouped median/mean aggregate on the vehicles data."""
    vehicles = test.data_frame("vehicles.csv")
    # Drop rows with a missing value in either aggregated column; the second
    # filter is evaluated against the already-filtered frame.
    has_cyl = vehicles.filter_out(vehicles.cyl.is_na())
    complete = has_cyl.filter_out(has_cyl.displ.is_na())
    grouped = complete.group_by("make", "model")
    stat = grouped.aggregate(
        cyl=di.median("cyl"),
        displ=di.mean("displ"),
    )
    assert stat.nrow == 3240
    assert stat.ncol == 4
    # Sorting by the group keys must leave the result unchanged.
    assert stat.sort(make=1, model=1) == stat
    cyl_total = np.sum(stat.cyl)
    assert np.isclose(cyl_total, 19964.5, atol=0.1)
    displ_total = np.sum(stat.displ)
    assert np.isclose(displ_total, 11430.1, atol=0.1)
# NOTE(review): the two statements below are the tail of a helper whose
# `def` lies outside this view (presumably the `read_csv` wrapper used
# below -- confirm); its original indentation is not recoverable here.
data[name] = np.char.lower(data[name])
return data

# AGGREGATE
# Read the vehicles data, derive a boolean `fuel_regular` column, group by
# make and model, and compute one output column per aggregation function.
# The mean, standard deviation and variance columns are then rounded to two
# decimals before the result is written to aggregate.df.csv.
# NOTE(review): the rounding is presumably there to keep the written floats
# comparable across implementations -- confirm.
(read_csv("../data/vehicles.csv")
 .modify(fuel_regular=lambda x: x.fuel == "regular")
 .group_by("make", "model")
 .aggregate(
     all_fuel_regular=di.all("fuel_regular"),
     any_fuel_regular=di.any("fuel_regular"),
     count=di.count(),
     count_unique_cyl=di.count_unique("cyl"),
     first_hwy=di.first("hwy"),
     last_hwy=di.last("hwy"),
     max_hwy=di.max("hwy"),
     mean_hwy=di.mean("hwy"),
     median_hwy=di.median("hwy"),
     min_hwy=di.min("hwy"),
     mode_year=di.mode("year"),
     nth_id=di.nth("id", 0),
     quantile_hwy=di.quantile("hwy", 0.75),
     # ddof=1 is passed explicitly to both std and var.
     std_hwy=di.std("hwy", ddof=1),
     sum_hwy=di.sum("hwy"),
     var_hwy=di.var("hwy", ddof=1))
 .modify(mean_hwy=lambda x: x.mean_hwy.round(2))
 .modify(std_hwy =lambda x: x.std_hwy.round(2))
 .modify(var_hwy =lambda x: x.var_hwy.round(2))
 .write_csv("aggregate.df.csv"))

# ANTI JOIN
# Load the listings reviews data; the rest of this section is outside this view.
reviews = read_csv("../data/listings-reviews.csv")