Example No. 1
0
    def generate_data(self, rtd: dd.DataFrame) -> pd.DataFrame:
        """Aggregate delay statistics per (stop_hour, station) bucket.

        Rounds each stop's time (arrival, falling back to departure) to
        ``self.FREQ``, groups by the combined (stop_hour, station) key, and
        computes mean delays / happened-counts plus first lat/lon per group.

        Args:
            rtd: Dask DataFrame with at least the columns ``ar_pt``, ``dp_pt``,
                ``station``, ``ar_delay``, ``ar_happened``, ``dp_delay``,
                ``dp_happened``, ``lat``, ``lon``. ``ar_pt``/``dp_pt`` are
                assumed to be datetime-like (``.dt`` accessor is used).

        Returns:
            A pandas DataFrame with the grouped multi-level columns flattened
            by ``groupby_index_to_flat``.
        """
        # Use dask Client to do groupby as the groupby is complex and scales
        # well on a local cluster.
        from dask.distributed import Client

        # Fixes over the previous version:
        # - `with` guarantees the client (and its worker processes) is shut
        #   down even if the groupby raises; it previously leaked.
        # - `os.cpu_count()` may return None on some platforms, which would
        #   make `min(16, None)` raise TypeError; fall back to 1 worker.
        with Client(n_workers=min(16, os.cpu_count() or 1)):
            # Index with self.FREQ resolution for groupby over time and station.
            rtd["stop_hour"] = rtd["ar_pt"].fillna(value=rtd["dp_pt"]).dt.round(self.FREQ)
            rtd = rtd.drop(columns=['ar_pt', 'dp_pt'])
            # Dask groupby on a single string key; concatenating hour + station
            # yields one flat key per (time bucket, station) pair.
            rtd["single_index_for_groupby"] = rtd["stop_hour"].astype("str") + rtd[
                "station"
            ].astype("str")

            data: pd.DataFrame = (
                rtd.groupby("single_index_for_groupby", sort=False)
                .agg({
                    "ar_delay": ["mean"],
                    "ar_happened": ["sum"],
                    "dp_delay": ["mean"],
                    "dp_happened": ["sum"],
                    "stop_hour": ["first"],
                    "station": ["first"],
                    "lat": ['first'],
                    "lon": ['first'],
                })
                .compute()
            )

        # Flatten the (column, aggfunc) MultiIndex produced by .agg().
        data = groupby_index_to_flat(data)
        return data
Example No. 2
0
    def generate_data(rtd: dd.DataFrame) -> pd.DataFrame:
        """Compute per-'pp' delay statistics.

        Groups the Dask DataFrame by the ``pp`` column (unsorted), aggregates
        delay counts/means and happened-rates, materializes the result, and
        flattens the grouped multi-level columns.

        Args:
            rtd: Dask DataFrame with columns ``pp``, ``ar_delay``,
                ``ar_happened``, ``dp_delay``, ``dp_happened``.

        Returns:
            A flat-column pandas DataFrame, one row per ``pp`` value.
        """
        # count + mean for the delays, mean rate for the happened flags.
        aggregations = {
            'ar_delay': ['count', 'mean'],
            'ar_happened': ['mean'],
            'dp_delay': ['count', 'mean'],
            'dp_happened': ['mean'],
        }

        grouped = rtd.groupby('pp', sort=False).agg(aggregations)
        result = grouped.compute()

        # Collapse the (column, aggfunc) MultiIndex into flat column names.
        return groupby_index_to_flat(result)