def separate_stations():
    # Write one parquet file per station so single-station data can later be
    # loaded without scanning the full dataset.
    rtd_ray = RtdRay()
    rtd = rtd_ray.load_for_ml_model(label_encode=False, return_times=True).compute()  # .persist()
    # rtd = rtd.reset_index()
    # rtd['station'] = rtd['station'].cat.as_ordered()
    # rtd = rtd.set_index('station')
    # rtd.to_parquet(CACHE_PATH + '/station_rtd')
    storage_path = CACHE_PATH + '/station_rtd/part.{}.parquet'

    # Earlier groupby-based approach, kept for reference:
    # stations = rtd['station'].cat.categories
    # rtd = rtd.groupby('station')
    # i = 0
    # for group in tqdm(stations):
    #     rtd.get_group(group).compute().to_parquet(storage_path.format(str(i)), engine='pyarrow')
    #     i += 1

    stations = rtd['station'].cat.categories
    print('separating stations')
    # station_rtd = []
    with progressbar.ProgressBar(max_value=len(stations)) as bar:
        for i, station in enumerate(stations):
            mask = rtd['station'] == station
            station_rtd = rtd.loc[mask, :]
            station_rtd.to_parquet(storage_path.format(str(i)), engine='pyarrow')
            # rtd = rtd.loc[~mask, :]
            bar.update(i)
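# --- Hedged usage sketch (not part of the original module) ---------------------
# Reading one station back from the per-station parquet parts written above.
# CACHE_PATH and the 'part.{i}.parquet' naming come from separate_stations();
# the function name and the pandas-based loading are illustrative assumptions.
import pandas as pd

def load_station_rtd_sketch(station_index: int) -> pd.DataFrame:
    # The i-th category in rtd['station'].cat.categories was written to part.i.
    return pd.read_parquet(
        CACHE_PATH + '/station_rtd/part.{}.parquet'.format(station_index),
        engine='pyarrow',
    )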
def train_models(**load_parameters):
    rtd_ray = RtdRay()
    train = rtd_ray.load_for_ml_model(**load_parameters).compute()

    status_encoder = {}
    status_encoder["ar"] = pickle.load(open(ENCODER_PATH.format(encoder="ar_cs"), "rb"))
    status_encoder["dp"] = pickle.load(open(ENCODER_PATH.format(encoder="dp_cs"), "rb"))

    # Keep rows that either have a known delay or were cancelled ('c' status).
    ar_train = train.loc[~train["ar_delay"].isna()
                         | (train["ar_cs"] == status_encoder["ar"]["c"])]
    dp_train = train.loc[~train["dp_delay"].isna()
                         | (train["dp_cs"] == status_encoder["dp"]["c"])]
    del train

    # Binary targets: arrival at most `label` minutes late, departure at least
    # `label + 1` minutes late; cancelled stops never count as on time.
    ar_labels = {}
    dp_labels = {}
    for label in CLASSES_TO_COMPUTE:
        ar_labels[label] = (ar_train["ar_delay"] <= label) & (
            ar_train["ar_cs"] != status_encoder["ar"]["c"])
        dp_labels[label + 1] = (dp_train["dp_delay"] >= (label + 1)) & (
            dp_train["dp_cs"] != status_encoder["dp"]["c"])

    del ar_train["ar_delay"]
    del ar_train["dp_delay"]
    del ar_train["ar_cs"]
    del ar_train["dp_cs"]
    del dp_train["ar_delay"]
    del dp_train["dp_delay"]
    del dp_train["ar_cs"]
    del dp_train["dp_cs"]

    newpath = "cache/models"
    if not os.path.exists(newpath):
        os.makedirs(newpath)

    parameters = pickle.load(open(CACHE_PATH + "/hyperparameters.pkl", "rb"))

    for label in CLASSES_TO_COMPUTE:
        model_name = f"ar_{label}"
        print("training", model_name)
        pickle.dump(
            train_model(ar_train, ar_labels[label], **parameters[label]),
            open(MODEL_PATH.format(model_name), "wb"),
        )

        label += 1
        model_name = f"dp_{label}"
        print("training", model_name)
        pickle.dump(
            train_model(dp_train, dp_labels[label],
                        **parameters[label - 1]),  # **parameters[label]  # n_estimators=50, max_depth=6
            open(MODEL_PATH.format(model_name), "wb"),
        )
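# --- Hedged inference sketch (assumption, not the project's own prediction code) ---
# The loop above pickles one binary classifier per delay class. If those models
# expose a scikit-learn style predict_proba() (an assumption here), loading and
# applying one of them could look like this; `features` is assumed to be a frame
# produced by RtdRay().load_for_ml_model() with the training columns.
import pickle

def predict_class_probability_sketch(features, label, event="ar"):
    # event is "ar" (arrival) or "dp" (departure); label matches the keys used above.
    model = pickle.load(open(MODEL_PATH.format(f"{event}_{label}"), "rb"))
    return model.predict_proba(features)[:, 1]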
def __init__(self, max_date=None):
    from helpers import RtdRay

    # Evaluate the default inside the body: a default argument such as
    # `datetime.now() - timedelta(hours=3)` would be computed only once,
    # at import time, and then go stale.
    if max_date is None:
        max_date = datetime.now() - timedelta(hours=3)
    min_date = max_date + timedelta(days=1)

    self._rtd_d = RtdRay().load_data(columns=[
        "dp_delay", "ar_delay", "dp_pt", "ar_pt", "ar_cs", "dp_cs"
    ])

    # HYPER OPTIMISATION SPEED DATA LOADING
    # Keep only rows whose planned arrival or departure is at or after min_date;
    # also OR-ing in the mirrored `<= min_date` conditions (as before) would
    # select every row and make the filter a no-op.
    self._rtd_d_new = self._rtd_d.loc[(self._rtd_d["ar_pt"] >= min_date)
                                      | (self._rtd_d["dp_pt"] >= min_date)]
            'dp_delay': ['count', 'mean'],
            'dp_happened': ['mean'],
        }).compute()
        data = groupby_index_to_flat(data)
        return data

    @staticmethod
    def group_uncommon(data: pd.DataFrame) -> pd.DataFrame:
        return data


if __name__ == '__main__':
    import helpers.fancy_print_tcp

    rtd_ray = RtdRay()
    rtd = rtd_ray.load_data(columns=[
        'c', 'o', 'f', 't', 'n', 'pp',
        'ar_pt', 'dp_pt',
        'ar_delay', 'ar_happened',
        'dp_delay', 'dp_happened',
    ])
    rtd['c'] = rtd['c'].astype(str)
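# --- Hedged sketch of an assumed helper ----------------------------------------
# groupby_index_to_flat() is a project helper not shown in this excerpt. The
# pattern it stands for, flattening the MultiIndex columns produced by
# .agg({'col': ['count', 'mean'], ...}) and restoring the group key as a column,
# could look roughly like this (illustrative only):
import pandas as pd

def groupby_index_to_flat_sketch(data: pd.DataFrame) -> pd.DataFrame:
    data = data.copy()
    data.columns = [
        '_'.join(col).rstrip('_') if isinstance(col, tuple) else col
        for col in data.columns
    ]
    return data.reset_index()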
            self.ax.set_title(plot_name.replace("_", ":").replace('-', ' - '),
                              fontsize=12)
            memory_buffer = io.BytesIO()
            self.fig.savefig(memory_buffer, dpi=300, transparent=True)
            image_to_webp(memory_buffer, plot_path)
        else:
            plot_path = f"{CACHE_PATH}/plot_cache/{self.version}_no data available.webp"
        return plot_path


if __name__ == "__main__":
    import helpers.fancy_print_tcp

    rtd_ray = RtdRay()
    rtd_df = rtd_ray.load_data(
        columns=[
            "ar_pt",
            "dp_pt",
            "station",
            "ar_delay",
            "ar_happened",
            "dp_delay",
            "dp_happened",
            "lat",
            "lon",
        ],
        min_date=datetime.datetime(2021, 3, 1),
    )
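# --- Hedged sketch of an assumed helper ----------------------------------------
# image_to_webp() is a project helper not shown in this excerpt. A minimal
# Pillow-based version of the pattern it is used for above (converting the figure
# bytes held in a BytesIO buffer into a .webp file) might look like this:
import io
from PIL import Image

def image_to_webp_sketch(memory_buffer: io.BytesIO, plot_path: str) -> None:
    memory_buffer.seek(0)  # rewind the buffer that fig.savefig() just wrote to
    Image.open(memory_buffer).save(plot_path, format='WEBP')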
    Returns
    -------
    pd.Series
        date and hour combined.
    """
    return pd.Series([
        datetime.datetime.combine(row['date'],
                                  datetime.time(hour=int(row['floating_hour'])))
        for i, row in df.iterrows()
    ], index=df['date'].index)


if __name__ == '__main__':
    import helpers.fancy_print_tcp

    rtd_ray = RtdRay()
    rtd_df = rtd_ray.load_data(columns=[
        'ar_pt', 'dp_pt', 'ar_delay', 'ar_happened', 'dp_delay', 'dp_happened'
    ])

    print('grouping over hour')
    time = OverHour(rtd_df, use_cache=True)
    time.plot()

    print('grouping over day')
    time = OverDay(rtd_df, use_cache=True)
    time.plot()

    print('grouping over week')
    time = OverWeek(rtd_df, use_cache=True)
    time.plot()
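# --- Hedged performance note (illustrative alternative, not project code) -------
# The iterrows() loop in the combine helper above is easy to read but slow on
# large frames. Assuming 'date' holds calendar dates and 'floating_hour' is
# numeric (as the loop implies), a vectorized equivalent could be:
import pandas as pd

def combine_date_and_hour_vectorized(df: pd.DataFrame) -> pd.Series:
    # Convert to datetime64 first so adding an hour offset keeps the time part.
    return pd.to_datetime(df['date']) + pd.to_timedelta(
        df['floating_hour'].astype(int), unit='h')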
train_models(
    # max_date=datetime.datetime(2021, 2, 1),
    min_date=datetime.datetime(2021, 3, 14),
    # min_date=datetime.datetime(2021, 2, 1) - datetime.timedelta(days=7 * 2),
    long_distance_only=False,
    return_status=True,
)

status_encoder = {}
status_encoder["ar"] = pickle.load(open(ENCODER_PATH.format(encoder="ar_cs"), "rb"))
status_encoder["dp"] = pickle.load(open(ENCODER_PATH.format(encoder="dp_cs"), "rb"))

rtd_ray = RtdRay()
test = rtd_ray.load_for_ml_model(
    # max_date=datetime.datetime(2021, 2, 1),
    min_date=datetime.datetime(2021, 3, 14),
    # min_date=datetime.datetime(2021, 2, 1) - datetime.timedelta(days=7 * 2),
    long_distance_only=False,
    return_status=True,
).compute()

ar_test = test.loc[~test["ar_delay"].isna()
                   | (test["ar_cs"] == status_encoder["ar"]["c"]),
                   ["ar_delay", "ar_cs"]]
dp_test = test.loc[~test["dp_delay"].isna()
                   | (test["dp_cs"] == status_encoder["dp"]["c"]),
                   ["dp_delay", "dp_cs"]]
# ar_test = test[['ar_delay', 'ar_cs']].dropna(subset=["ar_delay"])
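# --- Hedged evaluation sketch (assumption, mirroring train_models above) --------
# Test labels for a given arrival threshold can be built exactly like the training
# labels and compared against a trivial majority-class baseline. The threshold of
# 5 minutes below is an arbitrary example value.
def ar_test_labels_sketch(ar_test, label, status_encoder):
    return (ar_test["ar_delay"] <= label) & (
        ar_test["ar_cs"] != status_encoder["ar"]["c"])

# majority_share = ar_test_labels_sketch(ar_test, 5, status_encoder).mean()
# baseline_accuracy = max(majority_share, 1 - majority_share)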
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import pandas as pd
import dask.dataframe as dd
import datetime
import numpy as np
import matplotlib.pyplot as plt
from sklearn import neighbors
from sklearn.preprocessing import MinMaxScaler
from sklearn import linear_model

from helpers import RtdRay

if __name__ == '__main__':
    import helpers.fancy_print_tcp
    from dask.distributed import Client

    client = Client(n_workers=min(16, os.cpu_count()))

    rtd_ray = RtdRay()
    rtd = rtd_ray.load_data(
        columns=['station', 'date_id', 'lat', 'lon', 'ar_ct', 'ar_delay'])
    rtd = rtd.loc[(rtd['ar_ct'] < datetime.datetime(2020, 11, 28))
                  & (rtd['ar_ct'] > datetime.datetime(2020, 11, 10)), :]
    rtd['ar_ct'] = rtd['ar_ct'].astype(int)
    data = rtd.to_numpy()
    del rtd

    labels = data[:, 3]
    scaler = MinMaxScaler()
    scaler.fit(data[:, :3])
    data = scaler.transform(data[:, :3])

    ten_min_distance = pd.DataFrame({
        'lat': [0, 0],
        'lon': [0, 0],
    non_obstacle_rtd.groupby('ar_delay')['ar_pt'].count().rename('No obstacles').plot(
        logy=True, legend=True, title=priorities_text[priority_col])
    obstacle_rtd.groupby('ar_delay')['ar_pt'].count().rename('obstacles').plot(
        logy=True, legend=True)
    plt.savefig(f'data/{priority_col}_ar.png')
    plt.close()
    # plt.show()

    non_obstacle_rtd.groupby('dp_delay')['dp_pt'].count().rename('No obstacles').plot(
        logy=True, legend=True, title=priorities_text[priority_col])
    obstacle_rtd.groupby('dp_delay')['dp_pt'].count().rename('obstacles').plot(
        logy=True, legend=True)
    plt.savefig(f'data/{priority_col}_dp.png')
    plt.close()
    # plt.show()


if __name__ == '__main__':
    import helpers.fancy_print_tcp

    rtd_ray = RtdRay()
    # obstacle_rtd = rtd_ray.load_data(min_date=datetime.datetime(2021, 3, 14)).compute()
    # non_obstacle_rtd = obstacle_rtd.loc[
    #     ~(obstacle_rtd['obstacles_priority_24'] > 0) &
    #     ~(obstacle_rtd['obstacles_priority_37'] > 0) &
    #     ~(obstacle_rtd['obstacles_priority_63'] > 0) &
    #     ~(obstacle_rtd['obstacles_priority_65'] > 0) &
    #     ~(obstacle_rtd['obstacles_priority_70'] > 0) &
    #     ~(obstacle_rtd['obstacles_priority_80'] > 0)
    # ]
    # non_obstacle_rtd.to_pickle(CACHE_PATH + '/non_obstacle_rtd.pkl')
    # obstacle_rtd = obstacle_rtd.loc[
    #     (obstacle_rtd['obstacles_priority_24'] > 0) |
    #     (obstacle_rtd['obstacles_priority_37'] > 0) |
    #     (obstacle_rtd['obstacles_priority_63'] > 0) |
    #     (obstacle_rtd['obstacles_priority_65'] > 0) |