def separate_stations():
    """Split the full RTD data set into one Parquet file per station."""
    rtd_ray = RtdRay()
    rtd = rtd_ray.load_for_ml_model(label_encode=False,
                                    return_times=True).compute()
    storage_path = CACHE_PATH + '/station_rtd/part.{}.parquet'

    stations = rtd['station'].cat.categories
    print('separating stations')
    with progressbar.ProgressBar(max_value=len(stations)) as bar:
        for i, station in enumerate(stations):
            # Write all rows belonging to the current station to their own file.
            mask = rtd['station'] == station
            station_rtd = rtd.loc[mask, :]
            station_rtd.to_parquet(storage_path.format(str(i)),
                                   engine='pyarrow')
            bar.update(i)
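
As a quick sanity check of the output (a sketch; CACHE_PATH is assumed to be the same cache-directory constant used above):

import pandas as pd

# Each part file should hold exactly one station; index 0 corresponds to
# the first entry of rtd['station'].cat.categories.
part = pd.read_parquet(CACHE_PATH + '/station_rtd/part.0.parquet')
print(part['station'].unique())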
Example 2
def train_models(**load_parameters):
    """Train one binary classifier per delay class, for arrival and departure."""
    rtd_ray = RtdRay()
    train = rtd_ray.load_for_ml_model(**load_parameters).compute()
    status_encoder = {}
    status_encoder["ar"] = pickle.load(
        open(ENCODER_PATH.format(encoder="ar_cs"), "rb"))
    status_encoder["dp"] = pickle.load(
        open(ENCODER_PATH.format(encoder="dp_cs"), "rb"))

    # Keep rows that have a known delay or whose change status equals 'c'.
    ar_train = train.loc[~train["ar_delay"].isna() |
                         (train["ar_cs"] == status_encoder["ar"]["c"])]
    dp_train = train.loc[~train["dp_delay"].isna() |
                         (train["dp_cs"] == status_encoder["dp"]["c"])]
    del train

    ar_labels = {}
    dp_labels = {}
    for label in CLASSES_TO_COMPUTE:
        # Arrival class n: delayed at most n minutes; departure class n + 1:
        # delayed at least n + 1 minutes. Rows with change status 'c' are
        # never counted as positives.
        ar_labels[label] = (ar_train["ar_delay"] <= label) & (
            ar_train["ar_cs"] != status_encoder["ar"]["c"])
        dp_labels[label + 1] = (dp_train["dp_delay"] >= (label + 1)) & (
            dp_train["dp_cs"] != status_encoder["dp"]["c"])

    ar_train = ar_train.drop(columns=["ar_delay", "dp_delay", "ar_cs", "dp_cs"])
    dp_train = dp_train.drop(columns=["ar_delay", "dp_delay", "ar_cs", "dp_cs"])

    os.makedirs("cache/models", exist_ok=True)

    parameters = pickle.load(open(CACHE_PATH + "/hyperparameters.pkl", "rb"))

    for label in CLASSES_TO_COMPUTE:
        model_name = f"ar_{label}"
        print("training", model_name)
        pickle.dump(
            train_model(ar_train, ar_labels[label], **parameters[label]),
            open(MODEL_PATH.format(model_name), "wb"),
        )

        model_name = f"dp_{label + 1}"
        print("training", model_name)
        pickle.dump(
            train_model(dp_train, dp_labels[label + 1], **parameters[label]),
            open(MODEL_PATH.format(model_name), "wb"),
        )
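
To make the labelling scheme concrete: for each class n, the arrival model learns whether a train arrived at most n minutes late, and the departure model for class n + 1 learns whether it departed at least n + 1 minutes late. A toy sketch with invented values:

import pandas as pd

delays = pd.Series([0.0, 4.0, 12.0])  # invented delays in minutes
label = 5
ar_target = delays <= label      # [True, True, False]
dp_target = delays >= label + 1  # [False, False, True]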
Example 3
    def __init__(self, max_date=datetime.now() - timedelta(hours=3)):
        from helpers import RtdRay

        min_date = max_date + timedelta(1)
        self._rtd_d = RtdRay().load_data(columns=[
            "dp_delay", "ar_delay", "dp_pt", "ar_pt", "ar_cs", "dp_cs"
        ])
        # Note: every non-NaT timestamp is either >= or <= min_date, so this
        # filter only drops rows where both ar_pt and dp_pt are missing.
        self._rtd_d_new = self._rtd_d.loc[(self._rtd_d["ar_pt"] >= min_date) |
                                          (self._rtd_d["dp_pt"] >= min_date) |
                                          (self._rtd_d["ar_pt"] <= min_date) |
                                          (self._rtd_d["dp_pt"] <= min_date)]
Example 4
            'dp_delay': ['count', 'mean'],
            'dp_happened': ['mean'],
        }).compute()

        data = groupby_index_to_flat(data)

        return data

    @staticmethod
    def group_uncommon(data: pd.DataFrame) -> pd.DataFrame:
        # No grouping of uncommon categories is applied here (pass-through).
        return data


if __name__ == '__main__':
    import helpers.fancy_print_tcp
    rtd_ray = RtdRay()
    rtd = rtd_ray.load_data(columns=[
        'c',
        'o',
        'f',
        't',
        'n',
        'pp',
        'ar_pt',
        'dp_pt',
        'ar_delay',
        'ar_happened',
        'dp_delay',
        'dp_happened',
    ])
    rtd['c'] = rtd['c'].astype(str)
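
groupby_index_to_flat is a project helper that is not shown here. A common pandas pattern for what its name suggests, flattening the MultiIndex columns produced by .agg, is sketched below (an assumption about the helper, not its actual code):

import pandas as pd

df = pd.DataFrame({'station': ['a', 'a', 'b'],
                   'dp_delay': [1.0, 3.0, 2.0]})
data = df.groupby('station').agg({'dp_delay': ['count', 'mean']})
# ('dp_delay', 'count') becomes 'dp_delay_count', and so on.
data.columns = ['_'.join(col) for col in data.columns]
data = data.reset_index()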
Example 5
            self.ax.set_title(plot_name.replace("_", ":").replace('-', ' - '), fontsize=12)
            memory_buffer = io.BytesIO()
            self.fig.savefig(memory_buffer, dpi=300, transparent=True)
            image_to_webp(memory_buffer, plot_path)
        else:
            plot_path = f"{CACHE_PATH}/plot_cache/{self.version}_no data available.webp"

        return plot_path
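
image_to_webp is a project helper that is not shown here. A minimal equivalent based on Pillow could look like the following sketch (an assumption, not the project's actual implementation):

import io
from PIL import Image

def image_to_webp(buffer: io.BytesIO, path: str) -> None:
    # Rewind the in-memory image written by fig.savefig and re-encode
    # it as WebP via Pillow (assumed behaviour of the real helper).
    buffer.seek(0)
    Image.open(buffer).save(path, format='WEBP')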


if __name__ == "__main__":
    import helpers.fancy_print_tcp

    rtd_ray = RtdRay()
    rtd_df = rtd_ray.load_data(
        columns=[
            "ar_pt",
            "dp_pt",
            "station",
            "ar_delay",
            "ar_happened",
            "dp_delay",
            "dp_happened",
            "lat",
            "lon",
        ],
        min_date=datetime.datetime(2021, 3, 1)
    )
Example 6
        Returns
        -------
        pd.Series
            date and hour combined.
        """
        return pd.Series(
            [
                datetime.datetime.combine(
                    row['date'], datetime.time(hour=int(row['floating_hour'])))
                for _, row in df.iterrows()
            ],
            index=df.index,
        )
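
The iterrows loop above is correct but slow on large frames. Assuming 'floating_hour' holds non-negative whole hours, a vectorized equivalent (returning pandas Timestamps rather than datetime objects) would be:

import pandas as pd

def combine_date_and_hour(df: pd.DataFrame) -> pd.Series:
    # date + hour as a single timestamp, computed column-wise.
    return pd.to_datetime(df['date']) + pd.to_timedelta(
        df['floating_hour'].astype(int), unit='h')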


if __name__ == '__main__':
    import helpers.fancy_print_tcp
    rtd_ray = RtdRay()
    rtd_df = rtd_ray.load_data(columns=[
        'ar_pt', 'dp_pt', 'ar_delay', 'ar_happened', 'dp_delay', 'dp_happened'
    ])

    print('grouping over hour')
    time = OverHour(rtd_df, use_cache=True)
    time.plot()

    print('grouping over day')
    time = OverDay(rtd_df, use_cache=True)
    time.plot()

    print('grouping over week')
    time = OverWeek(rtd_df, use_cache=True)
    time.plot()
Example 7
    train_models(
        min_date=datetime.datetime(2021, 3, 14),
        long_distance_only=False,
        return_status=True,
    )

    status_encoder = {}
    status_encoder["ar"] = pickle.load(
        open(ENCODER_PATH.format(encoder="ar_cs"), "rb"))
    status_encoder["dp"] = pickle.load(
        open(ENCODER_PATH.format(encoder="dp_cs"), "rb"))

    rtd_ray = RtdRay()
    test = rtd_ray.load_for_ml_model(
        min_date=datetime.datetime(2021, 3, 14),
        long_distance_only=False,
        return_status=True,
    ).compute()
    ar_test = test.loc[~test["ar_delay"].isna() |
                       (test["ar_cs"] == status_encoder["ar"]["c"]),
                       ["ar_delay", "ar_cs"]]
    dp_test = test.loc[~test["dp_delay"].isna() |
                       (test["dp_cs"] == status_encoder["dp"]["c"]),
                       ["dp_delay", "dp_cs"]]
Example 8
import os
import sys

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import pandas as pd
import dask.dataframe as dd
import datetime
import numpy as np
import matplotlib.pyplot as plt
from sklearn import neighbors
from sklearn.preprocessing import MinMaxScaler
from sklearn import linear_model
from helpers import RtdRay

if __name__ == '__main__':
    import helpers.fancy_print_tcp
    from dask.distributed import Client
    client = Client(n_workers=min(16, os.cpu_count()))
    rtd_ray = RtdRay()
    rtd = rtd_ray.load_data(
        columns=['station', 'date_id', 'lat', 'lon', 'ar_ct', 'ar_delay'])

    rtd = rtd.loc[(rtd['ar_ct'] < datetime.datetime(2020, 11, 28)) &
                  (rtd['ar_ct'] > datetime.datetime(2020, 11, 10)), :]
    rtd['ar_ct'] = rtd['ar_ct'].astype(int)
    data = rtd.to_numpy()
    del rtd
    labels = data[:, 3]
    scaler = MinMaxScaler()
    scaler.fit(data[:, :3])
    data = scaler.transform(data[:, :3])
    ten_min_distance = pd.DataFrame({
        'lat': [0, 0],
        'lon': [0, 0],
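
The snippet cuts off mid-DataFrame. Judging from the sklearn imports above, the scaled coordinates likely feed a nearest-neighbours model; a hedged sketch of that pattern (a guess from the imports, not the original code):

from sklearn import neighbors

# Fit a KNN regressor on the MinMax-scaled features to predict the
# delay labels extracted above (hypothetical continuation).
knn = neighbors.KNeighborsRegressor(n_neighbors=5)
knn.fit(data, labels)
print(knn.predict(data[:5]))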
Example 9
    non_obstacle_rtd.groupby('ar_delay')['ar_pt'].count().rename(
        'No obstacles').plot(logy=True, legend=True,
                             title=priorities_text[priority_col])
    obstacle_rtd.groupby('ar_delay')['ar_pt'].count().rename(
        'obstacles').plot(logy=True, legend=True)
    plt.savefig(f'data/{priority_col}_ar.png')
    plt.close()
    non_obstacle_rtd.groupby('dp_delay')['dp_pt'].count().rename(
        'No obstacles').plot(logy=True, legend=True,
                             title=priorities_text[priority_col])
    obstacle_rtd.groupby('dp_delay')['dp_pt'].count().rename(
        'obstacles').plot(logy=True, legend=True)
    plt.savefig(f'data/{priority_col}_dp.png')
    plt.close()

if __name__ == '__main__':
    import helpers.fancy_print_tcp

    rtd_ray = RtdRay()
    # obstacle_rtd = rtd_ray.load_data(min_date=datetime.datetime(2021, 3, 14)).compute()
    # non_obstacle_rtd = obstacle_rtd.loc[
    #     ~(obstacle_rtd['obstacles_priority_24'] > 0) &
    #     ~(obstacle_rtd['obstacles_priority_37'] > 0) &
    #     ~(obstacle_rtd['obstacles_priority_63'] > 0) &
    #     ~(obstacle_rtd['obstacles_priority_65'] > 0) &
    #     ~(obstacle_rtd['obstacles_priority_70'] > 0) &
    #     ~(obstacle_rtd['obstacles_priority_80'] > 0)
    # ]
    # non_obstacle_rtd.to_pickle(CACHE_PATH + '/non_obstacle_rtd.pkl')
    # obstacle_rtd = obstacle_rtd.loc[
    #     (obstacle_rtd['obstacles_priority_24'] > 0) |
    #     (obstacle_rtd['obstacles_priority_37'] > 0) |
    #     (obstacle_rtd['obstacles_priority_63'] > 0) |
    #     (obstacle_rtd['obstacles_priority_65'] > 0) |