Example #1
    def download(self) -> TimeseriesBundle:
        dataset_url = 'https://forecasters.org/data/m3comp/M3C.xls'
        raw_dataset_path = os.path.join(self.path, 'M3C.xls')

        download_url(dataset_url, raw_dataset_path)

        timeseries = []

        # each sheet of M3C.xls corresponds to one seasonal pattern
        for sp in ['M3Year', 'M3Quart', 'M3Month', 'M3Other']:
            dataset = pd.read_excel(raw_dataset_path, sheet_name=sp)

            for _, row in dataset.iterrows():
                frequency = 1
                starting_date = Unknown.date()
                time_unit = Unknown()
                year = month = day = 1

                if 'Starting Year' in row.index:
                    year = row['Starting Year']
                    time_unit = Year()

                if 'Starting Quarter' in row.index:
                    month = 3 * (int(row['Starting Quarter']) - 1) + 1
                    frequency = 3
                    time_unit = Month()
                elif 'Starting Month' in row.index:
                    month = int(row['Starting Month'])
                    time_unit = Month()

                if not isinstance(time_unit, Unknown):
                    try:
                        starting_date = datetime(year=year,
                                                 month=month,
                                                 day=day)
                    except Exception:
                        # fall back to an unknown time unit if the date is invalid
                        time_unit = Unknown()

                timeseries.append(
                    Timeseries(id=str(row['Series']),
                               start_date=starting_date,
                               time_unit=time_unit,
                               frequency=frequency,
                               period=1,
                               values=row.T[6:row.N + 6].values.astype(
                                   np.float32),
                               meta={'seasonal_pattern': sp}))
        return TimeseriesBundle(timeseries)
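A minimal standalone sketch of the start-date logic above, assuming the M3C.xls column layout used by the loader (parse_start_date is a hypothetical helper, not part of the source):

from datetime import datetime

def parse_start_date(year, quarter=None, month=None):
    # Quarters map to their first month: quarter 1 -> January, 2 -> April, ...
    if quarter is not None:
        month = 3 * (int(quarter) - 1) + 1
    elif month is None:
        month = 1
    try:
        return datetime(year=int(year), month=int(month), day=1)
    except (ValueError, TypeError):
        return None  # mirrors the fallback to Unknown() in the loader

assert parse_start_date(1990, quarter=2) == datetime(1990, 4, 1)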
Example #2
    def download(self) -> TimeseriesBundle:
        archive_file = os.path.join(self.path, 'dataset.zip')
        raw_file = os.path.join(self.path, 'LD2011_2014.txt')
        download_url('https://archive.ics.uci.edu/ml/machine-learning-databases/00321/LD2011_2014.txt.zip',
                     archive_file)
        patoolib.extract_archive(archive_file, outdir=self.path)

        with open(raw_file, 'r') as f:
            raw = f.readlines()

        parsed_values = np.array(list(map(
            lambda raw_line: np.array(
                raw_line.replace(',', '.').strip().split(';')[1:]
            ).astype(np.float64),
            tqdm(raw[1:])
        )))

        # aggregate 15-minute readings into hourly totals (4 rows per hour)
        aggregated = []
        for i in tqdm(range(0, parsed_values.shape[0], 4)):
            aggregated.append(parsed_values[i:i + 4, :].sum(axis=0))
        aggregated = np.array(aggregated)

        # Regarding time labels, the dataset description specifies:
        # "Every year in March time change day (which has only 23 hours) the values between 1:00 am and 2:00 am
        # are zero for all points."
        # However, that claim could not be verified for "2011-03-27 01:15:00" (lines 8165-8167),
        # nor for "2012-03-25 01:45:00", so it is unclear how to handle the daylight saving time change in this
        # dataset. Given this uncertainty, the starting date is treated as UTC (without time changes).

        start_date = datetime(2011, 1, 1, 1, 0, 0)  # values are aggregated towards the next hour rather than the current one.

        dataset = aggregated.T  # use time step as second dimension.
        timeseries = []

        for i, values in enumerate(dataset):
            timeseries.append(Timeseries(id=str(i),
                                         start_date=start_date,
                                         time_unit=Hour(),
                                         frequency=1,
                                         period=ElectricityMeta.period,
                                         values=values,
                                         meta={}))
        return TimeseriesBundle(timeseries)
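The 4-row summation above turns the 15-minute readings of LD2011_2014 into hourly totals. A vectorized sketch of the same aggregation (equivalent to the loop, minus the tqdm progress bar):

import numpy as np

def aggregate_to_hourly(values_15min):
    # Sum every 4 consecutive 15-minute rows into one hourly row.
    # Assumes the row count is divisible by 4, as in LD2011_2014.
    n_hours = values_15min.shape[0] // 4
    return values_15min[:n_hours * 4].reshape(n_hours, 4, -1).sum(axis=1)

demo = np.arange(16, dtype=np.float64).reshape(8, 2)  # 8 quarter-hours x 2 clients
print(aggregate_to_hourly(demo).shape)  # (2, 2)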
Example #3
    def download(self) -> TimeseriesBundle:
        raw_file_path = os.path.join(M3Meta.forecasts_path, 'M3Forecast.xls')
        download_url('https://forecasters.org/data/m3comp/M3Forecast.xls',
                     raw_file_path)

        original_timeseries = M3Dataset(M3Meta().dataset_path).load_cache()
        horizon_mapping = M3Meta().horizons_map()
        training_set, _ = original_timeseries.split(
            lambda t: t.split(-horizon_mapping[t.meta['seasonal_pattern']]))
        training_timeseries = training_set.timeseries

        models_forecasts = []
        for model_name in tqdm(M3Meta.models):
            forecast = pd.read_excel(raw_file_path,
                                     sheet_name=model_name,
                                     header=None)
            for i, row in forecast.iterrows():
                ts = training_timeseries[i].future_values(
                    row.T[2:row[1] + 2].values.astype(np.float32))
                ts.meta = {**ts.meta, 'model': model_name}
                models_forecasts.append(ts)
        return TimeseriesBundle(models_forecasts)
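For reference, a hedged sketch of how one forecast row is sliced, assuming the M3Forecast.xls layout implied above (series id in column 0, forecast horizon in column 1, point forecasts thereafter):

import numpy as np
import pandas as pd

def forecast_values(row):
    # Column 1 holds the horizon; columns 2..2+horizon hold the forecasts.
    horizon = int(row[1])
    return row.values[2:2 + horizon].astype(np.float32)

# hypothetical row: series N0001, horizon 6, then six forecasts
demo = pd.Series(['N0001', 6, 10., 11., 12., 13., 14., 15.])
print(forecast_values(demo))  # [10. 11. 12. 13. 14. 15.]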
Example #4
    def download(self) -> TimeseriesBundle:
        archive_file = os.path.join(self.path, 'dataset.zip')
        train_raw_file = os.path.join(self.path, 'PEMS_train')
        test_raw_file = os.path.join(self.path, 'PEMS_test')
        perm_raw_file = os.path.join(self.path, 'randperm')
        download_url(
            'https://archive.ics.uci.edu/ml/machine-learning-databases/00204/PEMS-SF.zip',
            archive_file)
        patoolib.extract_archive(archive_file, outdir=self.path)
        with open(train_raw_file, 'r') as f:
            train_raw_data = f.readlines()
        with open(test_raw_file, 'r') as f:
            test_raw_data = f.readlines()
        with open(perm_raw_file, 'r') as f:
            permutations = f.readlines()
        permutations = np.array(
            permutations[0].rstrip()[1:-1].split(' ')).astype(int)

        raw_data = train_raw_data + test_raw_data

        # start date per https://archive.ics.uci.edu/ml/datasets/PEMS-SF
        # Skip 2008-01-01 because it's a holiday.
        # The number of days between 2008-01-01 and 2009-03-30 is 455, but based on the provided labels
        # (which are days of the week) the sequence of days has only 10 one-day gaps, where the first 6
        # correspond to a holiday or anomalous day excluded from the dataset, while the other 4 gaps
        # fall on unexplained dates.
        # Moreover, with only 10 gaps it is not possible to fill dates up to 2009-03-30; there should be
        # 15 gaps (if 2009-01-01 is included, 14 otherwise).
        # Taking all of the concerns above into account, we decided to assume the following dates were
        # skipped (the first 7 seem to be aligned with the labels and the description):
        #  - Jan. 1, 2008
        #  - Jan. 21, 2008
        #  - Feb. 18, 2008
        #  - Mar. 9, 2008 - Anomaly
        #  - May 26, 2008
        #  - Jul. 4, 2008
        #  - Sep. 1, 2008
        #  - Oct. 13, 2008 - Columbus Day
        #  - Nov. 11, 2008
        #  - Nov. 27, 2008
        #  - Dec. 25, 2008
        #  - Jan. 1, 2009
        #  - Jan. 19, 2009
        #  - Feb. 16, 2009
        #  - Mar. 8, 2009 - Anomaly
        #  ------------------------------------------
        # Thus 455 - 15 = 440 days from 2008-01-01 to 2009-03-30 (incl.)
        start_date = datetime.strptime('2008-01-02',
                                       '%Y-%m-%d')  # 2008-01-01 is a holiday
        current_date = start_date
        excluded_dates = [
            datetime.strptime('2008-01-21', '%Y-%m-%d'),
            datetime.strptime('2008-02-18', '%Y-%m-%d'),
            datetime.strptime('2008-03-09', '%Y-%m-%d'),
            datetime.strptime('2008-05-26', '%Y-%m-%d'),
            datetime.strptime('2008-07-04', '%Y-%m-%d'),
            datetime.strptime('2008-09-01', '%Y-%m-%d'),
            datetime.strptime('2008-10-13', '%Y-%m-%d'),
            datetime.strptime('2008-11-11', '%Y-%m-%d'),
            datetime.strptime('2008-11-27', '%Y-%m-%d'),
            datetime.strptime('2008-12-25', '%Y-%m-%d'),
            datetime.strptime('2009-01-01', '%Y-%m-%d'),
            datetime.strptime('2009-01-19', '%Y-%m-%d'),
            datetime.strptime('2009-02-16', '%Y-%m-%d'),
            datetime.strptime('2009-03-08', '%Y-%m-%d'),
        ]
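        # Sanity check of the arithmetic in the comment above: 455 calendar days
        # (inclusive) minus the 15 skipped dates (14 listed here plus 2008-01-01)
        # leaves 440 observed days.
        assert (datetime(2009, 3, 30) - datetime(2008, 1, 1)).days + 1 \
            - (len(excluded_dates) + 1) == 440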

        values = []
        for i in tqdm(range(len(permutations))):
            if current_date not in excluded_dates:
                matrix = raw_data[np.where(permutations == i +
                                           1)[0][0]].rstrip()[1:-1]
                daily = []
                for row_vector in matrix.split(';'):
                    daily.append(
                        np.array(row_vector.split(' ')).astype(np.float32))
                daily = np.array(daily)
                if len(values) == 0:
                    values = daily
                else:
                    values = np.concatenate([values, daily], axis=1)
            else:  # excluded dates never fall within the first 24*7 records.
                # fill the gap with the same day of the previous week
                # (10-minute resolution: 24 * 6 samples per day).
                values = np.concatenate(
                    [values, values[:, -24 * 7 * 6:-24 * 6 * 6]], axis=1)
            current_date += timedelta(days=1)

        # aggregate 10-minute samples into hourly means
        hourly = np.array([
            list(map(np.mean, zip(*(iter(lane), ) * 6)))
            for lane in tqdm(values)
        ])
        timeseries = [
            Timeseries(id=str(i),
                       start_date=start_date,
                       time_unit=Hour(),
                       frequency=1,
                       period=24 * 7,
                       values=values,
                       meta={}) for i, values in enumerate(hourly)
        ]
        return TimeseriesBundle(timeseries=timeseries)
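The zip(*(iter(lane),) * 6) idiom above groups six consecutive 10-minute samples into non-overlapping 6-tuples and averages each into one hourly value. A small self-contained demo:

import numpy as np

lane = np.arange(12, dtype=np.float32)  # two hours of 10-minute samples
# six references to one iterator -> zip consumes it in non-overlapping 6-tuples
hourly = [float(np.mean(chunk)) for chunk in zip(*(iter(lane),) * 6)]
print(hourly)  # [2.5, 8.5]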
Example #5
    def download(self) -> TimeseriesBundle:
        url_template = 'https://github.com/Mcompetitions/M4-methods/raw/master/Dataset/{}/{}-{}.csv'
        m4_info_url = 'https://github.com/Mcompetitions/M4-methods/raw/master/Dataset/M4-info.csv'
        m4_info_path = os.path.join(self.path, 'M4info.csv')

        # disable SSL certificate verification for the downloads below
        ssl._create_default_https_context = ssl._create_unverified_context

        download_url(m4_info_url, m4_info_path)
        for sp in M4Meta.seasonal_patterns:
            training_url = url_template.format("Train", sp, "train")
            download_url(training_url,
                         os.path.join(M4Meta.dataset_path, f'{sp}-train.csv'))
            test_url = url_template.format("Test", sp, "test")
            download_url(test_url,
                         os.path.join(M4Meta.dataset_path, f'{sp}-test.csv'))

        # Download naive2 forecasts, needed for OWA metric
        m4_naive2_archive = os.path.join(self.path, 'naive2.rar')
        download_url(
            'https://github.com/M4Competition/M4-methods/raw/master/Point%20Forecasts/submission-Naive2.rar',
            m4_naive2_archive)
        patoolib.extract_archive(m4_naive2_archive, outdir=self.path)
        os.remove(m4_naive2_archive)

        # Download m4 competition winner predictions, for summary testing purposes only
        m4_winner_archive = os.path.join(self.path, 'submission-118.rar')
        download_url(
            'https://github.com/M4Competition/M4-methods/raw/master/Point%20Forecasts/submission-118.rar',
            m4_winner_archive)
        patoolib.extract_archive(m4_winner_archive, outdir=self.path)
        os.remove(m4_winner_archive)

        m4_info = pd.read_csv(m4_info_path)
        m4_info.set_index('M4id', inplace=True)

        time_units_mapping = {
            'Yearly': (Year(), 1),
            'Quarterly': (Month(), 3),
            'Monthly': (Month(), 1),
            'Weekly': (Day(), 7),
            'Daily': (Day(), 1),
            'Hourly': (Hour(), 1)
        }

        all_timeseries = []
        for sp in M4Meta.seasonal_patterns:
            training_set = pd.read_csv(
                os.path.join(M4Meta.dataset_path, f'{sp}-train.csv'))
            test_set = pd.read_csv(
                os.path.join(M4Meta.dataset_path, f'{sp}-test.csv'))

            time_unit, frequency = time_units_mapping[sp]

            for i, row in tqdm(training_set.iterrows()):
                timeseries_id = str(row['V1'])
                training_values = row.values[1:].astype(np.float32)
                training_values = training_values[~np.isnan(training_values)]

                test_values = test_set.loc[i].values[1:].astype(np.float32)

                timeseries_info = m4_info.loc[timeseries_id]

                parsing_formats = ['%d-%m-%y %H:%M', '%Y-%m-%d %H:%M:%S']
                parsed_date = None
                for parsing_format in parsing_formats:
                    try:
                        parsed_date = datetime.strptime(
                            timeseries_info.StartingDate, parsing_format)
                        break  # first matching format wins
                    except ValueError:
                        continue
                if parsed_date is None:
                    raise ValueError(
                        f'Could not parse {timeseries_info.StartingDate} for {timeseries_id}'
                    )

                timeseries = Timeseries(id=timeseries_id,
                                        start_date=parsed_date,
                                        time_unit=time_unit,
                                        frequency=frequency,
                                        period=int(timeseries_info.Frequency),
                                        values=np.concatenate(
                                            [training_values, test_values]),
                                        meta={'seasonal_pattern': sp})
                all_timeseries.append(timeseries)

        return TimeseriesBundle(all_timeseries)
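The two StartingDate formats are tried in order, and the first match wins (hence the break added above). A minimal standalone sketch of that fallback parsing, assuming the two formats observed in M4-info.csv:

from datetime import datetime

def parse_m4_date(raw):
    # First matching format wins; strptime raises ValueError on mismatch.
    for fmt in ('%d-%m-%y %H:%M', '%Y-%m-%d %H:%M:%S'):
        try:
            return datetime.strptime(raw, fmt)
        except ValueError:
            continue
    raise ValueError(f'Could not parse {raw!r}')

print(parse_m4_date('01-01-90 12:00'))  # 1990-01-01 12:00:00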