コード例 #1
0
ファイル: benchmark.py プロジェクト: otsaloma/dataiter
def data_frame_group_by_aggregate_14668():
    """Time a group-by-aggregate over the vehicles data.

    The data frame is loaded from "vehicles.csv" before the clock starts,
    so only the ``group_by("make", "model", "year")`` call and the
    following ``aggregate`` (group count, mean of "hwy", mean of "cty")
    are measured.

    Returns:
        Elapsed time in seconds as a float.
    """
    data = data_frame("vehicles.csv")
    # perf_counter is monotonic and high-resolution; time.time() follows
    # the system clock and can jump (NTP, manual adjustment) mid-benchmark.
    start = time.perf_counter()
    (data
     .group_by("make", "model", "year")
     .aggregate(
         n=di.count(),
         hwy=di.mean("hwy"),
         cty=di.mean("cty")))
    return time.perf_counter() - start
コード例 #2
0
ファイル: benchmark.py プロジェクト: otsaloma/dataiter
def data_frame_group_by_aggregate_100000_short_numba():
    """Time a group-by-aggregate with ``dataiter.USE_NUMBA`` patched to True.

    The input is built with ``data_frame_random(1_000_000, 100_000)``
    inside the patch context but before the clock starts, so only the
    ``group_by("g")`` call and the following ``aggregate`` (mean and
    standard deviation of "a") are measured.

    Returns:
        Elapsed time in seconds as a float.
    """
    with patch("dataiter.USE_NUMBA", True):
        data = data_frame_random(1_000_000, 100_000)
        # perf_counter is monotonic and high-resolution; time.time() follows
        # the system clock and can jump (NTP, manual adjustment) mid-benchmark.
        start = time.perf_counter()
        (data
         .group_by("g")
         .aggregate(
             a_mean=di.mean("a"),
             a_std=di.std("a")))
        return time.perf_counter() - start
コード例 #3
0
    def test_aggregate(self):
        """Aggregate vehicles by make and model and verify the summary.

        Rows with a missing "cyl" or "displ" value are dropped first; the
        result is then checked for its shape, its sort order and the sums
        of the two aggregated columns.
        """
        vehicles = test.data_frame("vehicles.csv")
        vehicles = vehicles.filter_out(vehicles.cyl.is_na())
        vehicles = vehicles.filter_out(vehicles.displ.is_na())
        by_make_model = vehicles.group_by("make", "model")
        stat = by_make_model.aggregate(
            cyl=di.median("cyl"),
            displ=di.mean("displ"),
        )

        # Shape of the aggregated frame.
        assert stat.nrow == 3240
        assert stat.ncol == 4
        # Sorting by the group keys must not change the frame.
        assert stat.sort(make=1, model=1) == stat
        # Column totals, compared with an absolute tolerance of 0.1.
        cyl_total = np.sum(stat.cyl)
        assert np.isclose(cyl_total, 19964.5, atol=0.1)
        displ_total = np.sum(stat.displ)
        assert np.isclose(displ_total, 11430.1, atol=0.1)
コード例 #4
0
ファイル: generate-df.py プロジェクト: otsaloma/dataiter
            data[name] = np.char.lower(data[name])
    return data

# AGGREGATE
# Read the vehicles data, derive a boolean "fuel_regular" column, group by
# make and model, compute one output column per aggregation function listed
# below, round three of the floating-point columns and write the result to
# "aggregate.df.csv".
(read_csv("../data/vehicles.csv")
 # Boolean helper column used as input for di.all and di.any below.
 .modify(fuel_regular=lambda x: x.fuel == "regular")
 .group_by("make", "model")
 .aggregate(
     all_fuel_regular=di.all("fuel_regular"),
     any_fuel_regular=di.any("fuel_regular"),
     count=di.count(),
     count_unique_cyl=di.count_unique("cyl"),
     first_hwy=di.first("hwy"),
     last_hwy=di.last("hwy"),
     max_hwy=di.max("hwy"),
     mean_hwy=di.mean("hwy"),
     median_hwy=di.median("hwy"),
     min_hwy=di.min("hwy"),
     mode_year=di.mode("year"),
     # Index 0 is requested from each group.
     nth_id=di.nth("id", 0),
     quantile_hwy=di.quantile("hwy", 0.75),
     # std and var are both computed with ddof=1.
     std_hwy=di.std("hwy", ddof=1),
     sum_hwy=di.sum("hwy"),
     var_hwy=di.var("hwy", ddof=1))
 # Round the mean, std and var columns to two decimals before writing.
 # NOTE(review): presumably this keeps the written CSV stable against
 # floating-point noise -- confirm.
 .modify(mean_hwy=lambda x: x.mean_hwy.round(2))
 .modify(std_hwy =lambda x: x.std_hwy.round(2))
 .modify(var_hwy =lambda x: x.var_hwy.round(2))
 .write_csv("aggregate.df.csv"))

# ANTI JOIN
reviews = read_csv("../data/listings-reviews.csv")