def generate_data(self, rtd: dd.DataFrame) -> pd.DataFrame:
    """Aggregate per-stop delay data into one row per (time bucket, station).

    Rounds each stop's arrival time (falling back to departure time when the
    arrival is missing) to ``self.FREQ`` buckets, then aggregates delay and
    happened-counters grouped by the (bucket, station) pair.

    Parameters
    ----------
    rtd : dd.DataFrame
        Dask DataFrame with at least the columns ``ar_pt``, ``dp_pt``,
        ``ar_delay``, ``ar_happened``, ``dp_delay``, ``dp_happened``,
        ``station``, ``lat`` and ``lon``.
        (Assumes ``ar_pt``/``dp_pt`` are datetime-like — TODO confirm.)

    Returns
    -------
    pd.DataFrame
        The computed aggregation with the multi-level groupby columns
        flattened by ``groupby_index_to_flat``.
    """
    # Use a local dask cluster: the groupby is complex and scales well on it.
    from dask.distributed import Client

    # BUGFIX: the original created the Client and never closed it, leaking
    # the local cluster (and its worker processes) — especially on exception.
    # The context manager guarantees shutdown; while it is active it is the
    # default scheduler, so .compute() below runs on this cluster.
    with Client(n_workers=min(16, os.cpu_count())):
        # Bucket each stop into self.FREQ-sized time slots; prefer the
        # arrival time, fall back to departure when arrival is missing.
        rtd["stop_hour"] = rtd["ar_pt"].fillna(value=rtd["dp_pt"]).dt.round(self.FREQ)
        rtd = rtd.drop(columns=["ar_pt", "dp_pt"])

        # Dask groups more efficiently on a single key, so fuse the
        # (time bucket, station) pair into one string column.
        rtd["single_index_for_groupby"] = (
            rtd["stop_hour"].astype("str") + rtd["station"].astype("str")
        )

        data: pd.DataFrame = (
            rtd.groupby("single_index_for_groupby", sort=False)
            .agg({
                "ar_delay": ["mean"],
                "ar_happened": ["sum"],
                "dp_delay": ["mean"],
                "dp_happened": ["sum"],
                # Carry the group keys and coordinates through the aggregation;
                # "first" is safe as they are constant within a group.
                "stop_hour": ["first"],
                "station": ["first"],
                "lat": ["first"],
                "lon": ["first"],
            })
            .compute()
        )

    # Flatten the (column, aggfunc) MultiIndex produced by .agg().
    data = groupby_index_to_flat(data)
    return data
def generate_data(rtd: dd.DataFrame) -> pd.DataFrame:
    """Aggregate delay statistics per ``pp`` group.

    Groups the input by the ``pp`` column (unsorted, preserving dask's
    partition order) and computes count/mean of the delay columns and the
    mean of the happened-flags, then flattens the resulting MultiIndex
    columns via ``groupby_index_to_flat``.

    Parameters
    ----------
    rtd : dd.DataFrame
        Dask DataFrame with columns ``pp``, ``ar_delay``, ``ar_happened``,
        ``dp_delay`` and ``dp_happened``.

    Returns
    -------
    pd.DataFrame
        The computed, flattened aggregation.
    """
    # Which statistics to compute per column.
    aggregations = {
        'ar_delay': ['count', 'mean'],
        'ar_happened': ['mean'],
        'dp_delay': ['count', 'mean'],
        'dp_happened': ['mean'],
    }
    grouped = rtd.groupby('pp', sort=False).agg(aggregations)
    # Materialize the lazy dask graph into a pandas DataFrame.
    result = grouped.compute()
    return groupby_index_to_flat(result)