def load_ts_in_range_flattern(start_date=None, end_date=None, diff_between=60):
    """Return the training targets of all series in a date range as one array.

    The flattened array is cached as a ``.npy`` file below
    ``Config.base_dir() + "data/"``; the file name encodes ``start_date`` and
    ``end_date``. If that file exists it is loaded and returned directly.
    Otherwise the configs are built with ``DataFactory.load_ts_in_range``,
    their ``train.y`` values are concatenated in order, and the result is
    saved to the cache file before being returned.

    Args:
        start_date: Start of the range; defaults to 2018-06-15.
        end_date: End of the range; defaults to 2018-08-15.
        diff_between: Forwarded unchanged to
            ``DataFactory.load_ts_in_range``.

    Returns:
        A ``numpy.ndarray`` holding the concatenated ``train.y`` values
        (empty when no configs are produced).
    """
    if start_date is None:
        start_date = datetime.datetime(2018, 6, 15)
    if end_date is None:
        end_date = datetime.datetime(2018, 8, 15)

    # NOTE(review): diff_between is not part of the cache file name, so a call
    # with a different diff_between may be served an array cached for another
    # value - confirm whether that is intended.
    file_name = ("data/flattern_ts_in_range-"
                 + start_date.strftime("%d_%m_%Y") + "-"
                 + end_date.strftime("%d_%m_%Y") + ".npy")
    cache_path = Config.base_dir() + file_name

    try:
        return np.load(cache_path)
    except FileNotFoundError:
        # No cached array yet: fall through and build it.
        pass

    configs = DataFactory.load_ts_in_range(start_date=start_date,
                                           end_date=end_date,
                                           diff_between=diff_between)

    # Concatenate once instead of re-copying a growing array per config.
    # The leading empty float array keeps the float promotion the previous
    # implementation got from starting with ``[]`` and makes the
    # "no configs" case return an empty ndarray.
    parts = [np.empty(0)]
    parts.extend(c.train.y.values for c in configs)
    train = np.concatenate(parts)

    np.save(cache_path, train)
    return train
def _prebuild_data():
    """Derive ``data/reservations.csv`` from ``data/reservations_all.csv``.

    Steps:
      1. Read the raw export, parsing its date/time columns.
      2. For every (HOTEL, GODINA, SIF_REZERVACIJE) group keep only the rows
         whose ``VRIJEME_ZAMRZAVANJA`` equals the group's maximum
         (presumably the most recent snapshot of each reservation - confirm).
      3. Treat a missing ``STATUS_REZERVACIJE`` as ``"F"``.
      4. Keep only rows with an accepted status and an accepted ``KANAL_ID``.
      5. Write the five columns used downstream to ``data/reservations.csv``
         (the DataFrame index is written as well, as before).

    Returns:
        None. The result is written to disk.
    """
    statuses = ["EF", "EO", "EP", "F", "NA", "NF", "O", "P", "PF", "PO"]
    channels = ["A", "I", "B"]

    data = pd.read_csv(
        Config.base_dir() + "data/reservations_all.csv",
        parse_dates=[
            "DATUM_KREIRANJA", "DATUM_OD", "DATUM_DO", "DATUM_STORNA",
            "VRIJEME_ZAMRZAVANJA"
        ],
        low_memory=False)

    # "max" (string) selects the groupby max directly; passing the Python
    # builtin ``max`` is deprecated by pandas.
    grouped = data.groupby(["HOTEL", "GODINA", "SIF_REZERVACIJE"])
    latest = grouped["VRIJEME_ZAMRZAVANJA"].transform("max")

    # Explicit copy so the column assignment below acts on an independent
    # frame rather than on a slice of the raw data.
    data = data[data["VRIJEME_ZAMRZAVANJA"] == latest].copy()

    # Assign the filled column back: a chained ``fillna(..., inplace=True)``
    # on an attribute access is not guaranteed to update the frame.
    data["STATUS_REZERVACIJE"] = data["STATUS_REZERVACIJE"].fillna("F")

    data = data[data["STATUS_REZERVACIJE"].isin(statuses)]
    data = data[data["KANAL_ID"].isin(channels)]

    columns = [
        "DATUM_KREIRANJA", "DATUM_OD", "DATUM_DO", "BROJ_SOBA_BOOK", "HOTEL"
    ]
    data = data[columns]
    data.to_csv(Config.base_dir() + "data/reservations.csv")
def _folder():
    """Return the statistics folder path: the base dir plus ``data/stats/``."""
    stats_subdir = "data/stats/"
    return Config.base_dir() + stats_subdir
def load_data():
    """Read ``data/reservations.csv`` (below the base dir) into a DataFrame.

    The three date columns ``DATUM_KREIRANJA``, ``DATUM_OD`` and ``DATUM_DO``
    are parsed as datetimes.
    """
    csv_path = Config.base_dir() + "data/reservations.csv"
    date_columns = ["DATUM_KREIRANJA", "DATUM_OD", "DATUM_DO"]
    return pd.read_csv(csv_path, parse_dates=date_columns, low_memory=False)
def load_ts(end_date=None, target_date=None, data=None, use_cache=True):
    """Build the booking time series for one target date as a ``Config``.

    The series has one point per calendar day from three years before
    ``target_date`` up to and including ``target_date``. Each point is the sum
    of ``BROJ_SOBA_BOOK`` over the reservations that were created on that day
    (``DATUM_KREIRANJA``) and whose stay, after moving ``DATUM_OD`` and
    ``DATUM_DO`` into the year 2018, covers ``target_date``
    (``DATUM_OD <= target_date < DATUM_DO``). 29 February never gets a point
    of its own: stay dates on that day are mapped to 28 February 2018 and
    bookings created on that day are added to the preceding day's point.

    Args:
        end_date: Passed to ``Config.build``; also part of the cache file
            name. Defaults to 2018-05-03.
        target_date: The date whose occupancy build-up is reconstructed.
            Defaults to 2018-07-01. Stay dates are compared against it after
            being moved to 2018, so it is presumably expected to lie in 2018
            - confirm with callers.
        data: Optional reservations DataFrame with datetime columns
            ``DATUM_KREIRANJA``, ``DATUM_OD``, ``DATUM_DO`` and a
            ``BROJ_SOBA_BOOK`` column. When omitted, ``data/reservations.csv``
            is read (and built first via ``DataFactory._prebuild_data`` if the
            file is missing). A supplied frame is copied, not modified.
        use_cache: When true, return the pickled result from ``.temp/`` if it
            can be read, and store a freshly built result there.

    Returns:
        The object returned by ``Config.build(ts, end_date, target_date)``, or
        the previously pickled one when the cache is hit.
    """
    if target_date is None:
        target_date = datetime.datetime(2018, 7, 1)
    if end_date is None:
        end_date = datetime.datetime(2018, 5, 3)

    temp_file_path = (Config.base_dir() + ".temp/config_"
                      + end_date.strftime("%d_%m_%Y") + "_"
                      + target_date.strftime("%d_%m_%Y") + ".pkl")

    def load_from_temp():
        # The cache is a local file written by save_to_temp below; pickle must
        # not be pointed at files from an untrusted source.
        with open(temp_file_path, 'rb') as cache_file:
            return pickle.load(cache_file)

    def save_to_temp(c):
        import os
        # Use the real parent directory of the cache file; splitting the path
        # on the text "config" breaks when the base dir contains that word.
        os.makedirs(os.path.dirname(temp_file_path), exist_ok=True)
        with open(temp_file_path, 'wb') as cache_file:
            pickle.dump(c, cache_file, pickle.HIGHEST_PROTOCOL)

    def load_data():
        return pd.read_csv(
            Config.base_dir() + "data/reservations.csv",
            parse_dates=["DATUM_KREIRANJA", "DATUM_OD", "DATUM_DO"],
            low_memory=False)

    def is_leap_day(d):
        return d.month == 2 and d.day == 29

    def to_reference_year(d):
        # 29 February does not exist in 2018, so every leap day (not only the
        # one in 2016) is folded onto 28 February.
        if is_leap_day(d):
            return datetime.date(2018, 2, 28)
        return d.replace(year=2018)

    if use_cache:
        try:
            return load_from_temp()
        except Exception:
            # Best effort: a missing or unreadable cache entry just means the
            # series is rebuilt below.
            pass

    if data is None:
        try:
            data = load_data()
        except FileNotFoundError:
            DataFactory._prebuild_data()
            data = load_data()
    else:
        # Work on a private copy so the caller's frame is left untouched by
        # the normalisation and helper columns added below.
        data = data.copy()

    data.DATUM_OD = data.DATUM_OD.dt.normalize()
    data.DATUM_DO = data.DATUM_DO.dt.normalize()
    data.DATUM_KREIRANJA = data.DATUM_KREIRANJA.dt.normalize()

    min_date = target_date.replace(year=target_date.year - 3)
    max_date = target_date

    data["DATUM_OD_C"] = pd.to_datetime(
        [to_reference_year(x) for x in data.DATUM_OD.dt.date.values])
    data["DATUM_DO_C"] = pd.to_datetime(
        [to_reference_year(x) for x in data.DATUM_DO.dt.date.values])

    target_date_data = data[(data.DATUM_OD_C <= target_date)
                            & (data.DATUM_DO_C > target_date)]

    range_dates = [
        min_date + datetime.timedelta(days=x)
        for x in range(0, (max_date - min_date).days + 1)
    ]
    assert min_date == range_dates[0] and max_date == range_dates[-1], \
        "Range of dates generated incorrectly"

    elem_list = []
    for obs_date in range_dates:
        date_data = target_date_data[
            target_date_data.DATUM_KREIRANJA == obs_date]
        rooms_booked = date_data.BROJ_SOBA_BOOK.sum()

        # Ignore 29.2. and add it to 28.2. The dates are consecutive, so the
        # previously appended element is always 28 February.
        if is_leap_day(obs_date) and elem_list:
            elem_list[-1]["y"] += rooms_booked
            continue

        elem_list.append({"X": obs_date, "y": rooms_booked})

    ts = pd.DataFrame(elem_list)
    ts = ts.set_index("X", drop=True)

    config = Config.build(ts, end_date, target_date)
    if use_cache:
        save_to_temp(config)
    return config
def load_ts2(target_date=None, end_date=None, use_cache=True):
    """Build one ``DataFactory.load_ts`` config per hotel for a target date.

    The reservations are read from ``data/reservations.csv`` (built first via
    ``DataFactory._prebuild_data`` if the file is missing), split by the
    ``HOTEL`` column for the fixed hotel codes ``H1`` .. ``H12``, and each
    subset is passed to ``DataFactory.load_ts`` with its own cache disabled.
    The complete list is cached as one pickle below ``.temp/``.

    Args:
        target_date: Forwarded to ``DataFactory.load_ts``; defaults to
            2018-07-01.
        end_date: Forwarded to ``DataFactory.load_ts``; defaults to
            2018-05-03.
        use_cache: When true, return the pickled list if it can be read, and
            store a freshly built list.

    Returns:
        A list of dicts ``{"hotel": <code>, "config": <config>}``, one per
        hotel code, in ``H1`` .. ``H12`` order.
    """
    if target_date is None:
        target_date = datetime.datetime(2018, 7, 1)
    if end_date is None:
        end_date = datetime.datetime(2018, 5, 3)

    temp_file_path = (Config.base_dir() + ".temp/ts2_method_configs_"
                      + end_date.strftime("%d_%m_%Y") + "_"
                      + target_date.strftime("%d_%m_%Y") + ".pkl")

    def load_from_temp():
        # The cache is a local file written by save_to_temp below; pickle must
        # not be pointed at files from an untrusted source.
        with open(temp_file_path, 'rb') as cache_file:
            return pickle.load(cache_file)

    def save_to_temp(c):
        import os
        # Use the real parent directory of the cache file; splitting the path
        # on the text "ts2_method" breaks when the base dir contains it.
        os.makedirs(os.path.dirname(temp_file_path), exist_ok=True)
        with open(temp_file_path, 'wb') as cache_file:
            pickle.dump(c, cache_file, pickle.HIGHEST_PROTOCOL)

    def load():
        return pd.read_csv(
            Config.base_dir() + "data/reservations.csv",
            parse_dates=["DATUM_KREIRANJA", "DATUM_OD", "DATUM_DO"],
            low_memory=False)

    if use_cache:
        try:
            return load_from_temp()
        except Exception:
            # Best effort: a missing or unreadable cache entry just means the
            # configs are rebuilt below.
            pass

    try:
        data = load()
    except FileNotFoundError:
        # Same recovery as load_ts: build the derived CSV only when it is
        # actually missing, instead of on any error (including Ctrl-C).
        DataFactory._prebuild_data()
        data = load()

    # Hotel codes are fixed rather than taken from data.HOTEL.unique().
    hotels = ["H" + str(i + 1) for i in range(12)]

    configs = list()
    for h in hotels:
        filtered_data = data[data.HOTEL == h].copy()
        config = DataFactory.load_ts(end_date=end_date,
                                     target_date=target_date,
                                     use_cache=False,
                                     data=filtered_data)
        configs.append({"hotel": h, "config": config})

    if use_cache:
        save_to_temp(configs)
    return configs