def download(self) -> TimeseriesBundle:
    """Download the M3 competition workbook and parse it into a TimeseriesBundle.

    Fetches M3C.xls, reads the four seasonal-pattern sheets and converts every
    row into a Timeseries. Rows whose start date cannot be determined keep an
    Unknown time unit.

    Returns:
        TimeseriesBundle with one entry per M3 series; each carries its
        seasonal pattern in ``meta['seasonal_pattern']``.
    """
    dataset_url = 'https://forecasters.org/data/m3comp/M3C.xls'
    raw_dataset_path = os.path.join(self.path, 'M3C.xls')
    download_url(dataset_url, raw_dataset_path)
    timeseries = []
    for sp in ['M3Year', 'M3Quart', 'M3Month', 'M3Other']:
        dataset = pd.read_excel(raw_dataset_path, sheet_name=sp)
        for _, row in dataset.iterrows():
            frequency = 1
            starting_date = Unknown.date()
            time_unit = Unknown()
            year = month = day = 1
            if 'Starting Year' in row.index:
                year = row['Starting Year']
                time_unit = Year()
            if 'Starting Quarter' in row.index:
                # quarters are mapped to their first month; quarterly series
                # advance 3 months per step
                month = 3 * (int(row['Starting Quarter']) - 1) + 1
                frequency = 3
                time_unit = Month()
            elif 'Starting Month' in row.index:
                month = int(row['Starting Month'])
                time_unit = Month()
            if not isinstance(time_unit, Unknown):
                try:
                    # int() guards against Excel reading the year as a float
                    # (or NaN); any invalid component falls back to Unknown.
                    starting_date = datetime(year=int(year), month=month, day=day)
                except (ValueError, TypeError, OverflowError):
                    time_unit = Unknown()
            timeseries.append(
                Timeseries(id=str(row['Series']),
                           start_date=starting_date,
                           time_unit=time_unit,
                           frequency=frequency,
                           period=1,
                           # values start at column 6; row.N holds the length
                           values=row.T[6:row.N + 6].values.astype(np.float32),
                           meta={'seasonal_pattern': sp}))
    return TimeseriesBundle(timeseries)
def download(self) -> TimeseriesBundle:
    """Download the UCI electricity load dataset (LD2011_2014) and build hourly series.

    The raw file has one 15-minute record per line with decimal commas;
    each group of four records is summed into an hourly value and one
    Timeseries is produced per client column.
    """
    archive_file = os.path.join(self.path, 'dataset.zip')
    raw_file = os.path.join(self.path, 'LD2011_2014.txt')
    download_url('https://archive.ics.uci.edu/ml/machine-learning-databases/00321/LD2011_2014.txt.zip',
                 archive_file)
    patoolib.extract_archive(archive_file, outdir=self.path)
    with open(raw_file, 'r') as f:
        raw = f.readlines()
    # Decimal commas become dots; the first column (timestamp) is dropped.
    # np.float was removed in NumPy 1.24 — np.float64 is the exact equivalent.
    parsed_values = np.array(list(map(
        lambda raw_line: np.array(raw_line.replace(',', '.').strip().split(';')[1:]).astype(np.float64),
        tqdm(raw[1:])
    )))
    # sum each group of four 15-minute rows into one hourly value
    aggregated = []
    for i in tqdm(range(0, parsed_values.shape[0], 4)):
        aggregated.append(parsed_values[i:i + 4, :].sum(axis=0))
    aggregated = np.array(aggregated)
    # regarding time labels, in dataset description authors specify
    # "Every year in March time change day (which has only 23 hours) the values between 1:00 am and 2:00 am
    # are zero for all points."
    # But I could not prove that claim for "2011-03-27 01:15:00" (lines 8165-8167),
    # neither for "2012-03-25 01:45:00", thus it's not clear how to deal with daylight saving time change in this
    # dataset. Taking into account this uncertainty the starting date is treated as UTC (without time changes).
    start_date = datetime(2011, 1, 1, 1, 0, 0)  # aggregated towards next hour instead of current hour.
    dataset = aggregated.T  # use time step as second dimension.
    timeseries = []
    for i, values in enumerate(dataset):
        timeseries.append(Timeseries(id=str(i),
                                     start_date=start_date,
                                     time_unit=Hour(),
                                     frequency=1,
                                     period=ElectricityMeta.period,
                                     values=values,
                                     meta={}))
    return TimeseriesBundle(timeseries)
def download(self) -> TimeseriesBundle:
    """Download the M3 competitors' point forecasts and pair each with its training series.

    Splits the cached M3 dataset at each series' horizon to recover the
    training portion, then reads every model's sheet from M3Forecast.xls and
    attaches the forecast values as future values of the matching series.
    """
    xls_path = os.path.join(M3Meta.forecasts_path, 'M3Forecast.xls')
    download_url('https://forecasters.org/data/m3comp/M3Forecast.xls', xls_path)

    source_bundle = M3Dataset(M3Meta().dataset_path).load_cache()
    horizons = M3Meta().horizons_map()
    # keep only the in-sample part of each series (drop the final horizon)
    train_bundle, _ = source_bundle.split(
        lambda t: t.split(-horizons[t.meta['seasonal_pattern']]))
    train_series = train_bundle.timeseries

    collected = []
    for model_name in tqdm(M3Meta.models):
        sheet = pd.read_excel(xls_path, sheet_name=model_name, header=None)
        for idx, row in sheet.iterrows():
            # column 1 holds the forecast horizon; values follow from column 2
            horizon = row[1]
            forecast_values = row.T[2:horizon + 2].values.astype(np.float32)
            forecast_series = train_series[idx].future_values(forecast_values)
            forecast_series.meta = {**forecast_series.meta, 'model': model_name}
            collected.append(forecast_series)
    return TimeseriesBundle(collected)
def download(self) -> TimeseriesBundle:
    """Download the PEMS-SF traffic dataset and build hourly per-lane series.

    The archive stores one shuffled day per line; ``randperm`` maps the
    shuffled order back to calendar order. Days excluded from the original
    dataset (holidays/anomalies) are filled with the same weekday of the
    previous week, then the 10-minute samples are averaged into hours.
    """
    archive_file = os.path.join(self.path, 'dataset.zip')
    train_raw_file = os.path.join(self.path, 'PEMS_train')
    test_raw_file = os.path.join(self.path, 'PEMS_test')
    perm_raw_file = os.path.join(self.path, 'randperm')
    download_url(
        'https://archive.ics.uci.edu/ml/machine-learning-databases/00204/PEMS-SF.zip',
        archive_file)
    patoolib.extract_archive(archive_file, outdir=self.path)
    with open(train_raw_file, 'r') as f:
        train_raw_data = f.readlines()
    with open(test_raw_file, 'r') as f:
        test_raw_data = f.readlines()
    with open(perm_raw_file, 'r') as f:
        permutations = f.readlines()
    # np.int was removed in NumPy 1.24; the builtin int is equivalent here.
    permutations = np.array(
        permutations[0].rstrip()[1:-1].split(' ')).astype(int)
    raw_data = train_raw_data + test_raw_data
    # start date per https://archive.ics.uci.edu/ml/datasets/PEMS-SF
    # skip 2008-01-01 because it's holiday.
    # the number of days between 2008-01-01 and 2009-03-30 is 455 but based on provided labels (which are days of week)
    # the sequence of days had only 10 gaps by 1 day, where the first 6 correspond to a holiday or anomalous day which
    # was excluded from the dataset, but the other 4 gaps happen on unexplained dates.
    # More over with only 10 gaps it's not possible to fill dates up to 2009-03-30, it should be 15 gaps
    # (if 2009-01-01 included, 14 otherwise).
    # Taking into consideration all the concerns above, we decided to assume the following dates were skipped
    # (first 7 seem to be aligned with labels and description):
    # - Jan. 1, 2008
    # - Jan. 21, 2008
    # - Feb. 18, 2008
    # - Mar. 9, 2008 - Anomaly
    # - May 26, 2008
    # - Jul. 4, 2008
    # - Sep. 1, 2008
    # - Oct. 13, 2008 - Columbus Day
    # - Nov. 11, 2008
    # - Nov. 27, 2008
    # - Dec. 25, 2008
    # - Jan. 1, 2009
    # - Jan. 19, 2009
    # - Feb. 16, 2009
    # - Mar. 8, 2009 - Anomaly
    # ------------------------------------------
    # Thus 455 - 15 = 440 days from 2008-01-01 to 2009-03-30 (incl.)
    start_date = datetime.strptime('2008-01-02', '%Y-%m-%d')  # 2008-01-01 is a holiday
    current_date = start_date
    excluded_dates = [
        datetime.strptime('2008-01-21', '%Y-%m-%d'),
        datetime.strptime('2008-02-18', '%Y-%m-%d'),
        datetime.strptime('2008-03-09', '%Y-%m-%d'),
        datetime.strptime('2008-05-26', '%Y-%m-%d'),
        datetime.strptime('2008-07-04', '%Y-%m-%d'),
        datetime.strptime('2008-09-01', '%Y-%m-%d'),
        datetime.strptime('2008-10-13', '%Y-%m-%d'),
        datetime.strptime('2008-11-11', '%Y-%m-%d'),
        datetime.strptime('2008-11-27', '%Y-%m-%d'),
        datetime.strptime('2008-12-25', '%Y-%m-%d'),
        datetime.strptime('2009-01-01', '%Y-%m-%d'),
        datetime.strptime('2009-01-19', '%Y-%m-%d'),
        datetime.strptime('2009-02-16', '%Y-%m-%d'),
        datetime.strptime('2009-03-08', '%Y-%m-%d'),
    ]
    values = []
    # the original loop bound two identical counters via enumerate(range(...));
    # only the permutation index is needed.
    for i in tqdm(range(len(permutations))):
        if current_date not in excluded_dates:
            # randperm is 1-based: find the line holding day i+1
            matrix = raw_data[np.where(permutations == i + 1)[0][0]].rstrip()[1:-1]
            daily = []
            for row_vector in matrix.split(';'):
                daily.append(
                    np.array(row_vector.split(' ')).astype(np.float32))
            daily = np.array(daily)
            if len(values) == 0:
                values = daily
            else:
                values = np.concatenate([values, daily], axis=1)
        else:
            # should never be in the first 24*7 records.
            # fill gaps with same day of previous week (one day = 144 columns
            # of 10-minute samples).
            values = np.concatenate(
                [values, values[:, -24 * 7 * 6:-24 * 6 * 6]], axis=1)
        current_date += timedelta(days=1)
    # aggregate 10 minutes events to hourly
    hourly = np.array([
        list(map(np.mean, zip(*(iter(lane), ) * 6))) for lane in tqdm(values)
    ])
    timeseries = [
        Timeseries(id=str(i),
                   start_date=start_date,
                   time_unit=Hour(),
                   frequency=1,
                   period=24 * 7,
                   values=values,
                   meta={}) for i, values in enumerate(hourly)
    ]
    return TimeseriesBundle(timeseries=timeseries)
def download(self) -> TimeseriesBundle:
    """Download the M4 dataset plus reference forecasts and build a TimeseriesBundle.

    Fetches M4-info, the per-pattern train/test CSVs, the Naive2 forecasts
    (needed for the OWA metric) and the competition winner's forecasts, then
    concatenates each series' train and test values into one Timeseries.
    """
    url_template = 'https://github.com/Mcompetitions/M4-methods/raw/master/Dataset/{}/{}-{}.csv'
    m4_info_url = 'https://github.com/Mcompetitions/M4-methods/raw/master/Dataset/M4-info.csv'
    m4_info_path = os.path.join(self.path, 'M4info.csv')
    # NOTE(review): this disables certificate verification process-wide — a
    # security risk; kept deliberately so the downloads succeed on hosts with
    # certificate issues.
    ssl._create_default_https_context = ssl._create_unverified_context
    download_url(m4_info_url, m4_info_path)
    for sp in M4Meta.seasonal_patterns:
        training_url = url_template.format("Train", sp, "train")
        download_url(training_url,
                     os.path.join(M4Meta.dataset_path, f'{sp}-train.csv'))
        test_url = url_template.format("Test", sp, "test")
        download_url(test_url,
                     os.path.join(M4Meta.dataset_path, f'{sp}-test.csv'))
    # Download naive2 forecasts, needed for OWA metric
    m4_naive2_archive = os.path.join(self.path, 'naive2.rar')
    download_url(
        'https://github.com/M4Competition/M4-methods/raw/master/Point%20Forecasts/submission-Naive2.rar',
        m4_naive2_archive)
    patoolib.extract_archive(m4_naive2_archive, outdir=self.path)
    os.remove(m4_naive2_archive)
    # Download m4 competition winner predictions, for summary testing purposes only
    m4_winner_archive = os.path.join(self.path, 'submission-118.rar')
    download_url(
        'https://github.com/M4Competition/M4-methods/raw/master/Point%20Forecasts/submission-118.rar',
        m4_winner_archive)
    patoolib.extract_archive(m4_winner_archive, outdir=self.path)
    os.remove(m4_winner_archive)
    m4_info = pd.read_csv(m4_info_path)
    m4_info.set_index('M4id', inplace=True)
    # (time unit, steps per period) per seasonal pattern
    time_units_mapping = {
        'Yearly': (Year(), 1),
        'Quarterly': (Month(), 3),
        'Monthly': (Month(), 1),
        'Weekly': (Day(), 7),
        'Daily': (Day(), 1),
        'Hourly': (Hour(), 1)
    }
    all_timeseries = []
    for sp in M4Meta.seasonal_patterns:
        training_set = pd.read_csv(
            os.path.join(M4Meta.dataset_path, f'{sp}-train.csv'))
        test_set = pd.read_csv(
            os.path.join(M4Meta.dataset_path, f'{sp}-test.csv'))
        time_unit, frequency = time_units_mapping[sp]
        for i, row in tqdm(training_set.iterrows()):
            timeseries_id = str(row['V1'])
            training_values = row.values[1:].astype(np.float32)
            # training rows are NaN-padded to the longest series; strip padding
            training_values = training_values[~np.isnan(training_values)]
            test_values = test_set.loc[i].values[1:].astype(np.float32)
            timeseries_info = m4_info.loc[timeseries_id]
            # M4-info mixes two date formats; stop at the first that parses
            # (the original loop had no break, so later formats could re-run).
            parsing_formats = ['%d-%m-%y %H:%M', '%Y-%m-%d %H:%M:%S']
            parsed_date = None
            for parsing_format in parsing_formats:
                try:
                    parsed_date = datetime.strptime(
                        timeseries_info.StartingDate, parsing_format)
                    break
                except ValueError:
                    continue
            if parsed_date is None:
                raise ValueError(
                    f'Could not parse {timeseries_info.StartingDate} for {timeseries_id}'
                )
            timeseries = Timeseries(id=timeseries_id,
                                    start_date=parsed_date,
                                    time_unit=time_unit,
                                    frequency=frequency,
                                    period=int(timeseries_info.Frequency),
                                    values=np.concatenate(
                                        [training_values, test_values]),
                                    meta={'seasonal_pattern': sp})
            all_timeseries.append(timeseries)
    return TimeseriesBundle(all_timeseries)