Example No. 1
import datetime
import pytest
import utils  # project-local module under test (import path assumed)


def test_build_db_ts_range_unsupported_frequency():
    cur_date = datetime.datetime(2020, 8, 5)
    request_year = 2019
    frequency = 'bad_frequency_string'

    with pytest.raises(NotImplementedError):
        utils.fetch_ts_start_end(cur_date, request_year, frequency)
Example No. 2
import datetime
import pytest
import utils  # project-local module under test (import path assumed)


def test_build_db_ts_range_bad_year():
    cur_date = datetime.datetime(2020, 8, 5)
    request_year = 2021  # later than current
    frequency = 'H'

    with pytest.raises(ValueError):
        utils.fetch_ts_start_end(cur_date=cur_date,
                                 request_year=request_year,
                                 frequency=frequency)
Example No. 3
import datetime
import utils  # project-local module under test (import path assumed)


def test_build_db_ts_range_general():
    cur_date = datetime.datetime(2020, 8, 5)
    request_year = 2019
    frequency = 'H'

    exp_start = f'{request_year}-01-01 00:00:00'
    exp_end = f'{request_year}-12-31 23:00:00'

    actual = utils.fetch_ts_start_end(cur_date, request_year, frequency)
    expected = (exp_start, exp_end)

    # sort for testing ease
    expected = sorted(expected)
    actual = sorted(actual)

    assert actual == expected
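
Taken together, the three tests pin down the contract of utils.fetch_ts_start_end: it returns (start, end) timestamp strings spanning the requested year, raises ValueError when the requested year lies after cur_date, and raises NotImplementedError for an unsupported frequency string. Below is a minimal sketch consistent with these tests; it is illustrative only, and the set of supported frequencies and the handling of the current, partial year are assumptions, not the project's actual implementation:

import pandas as pd


def fetch_ts_start_end(cur_date, request_year, frequency):
    """Sketch of the behaviour the tests above assert (not the real utils code)."""
    if request_year > cur_date.year:
        raise ValueError(f'Requested year {request_year} is after {cur_date.year}.')
    if frequency not in {'H', '5T'}:  # supported frequencies are an assumption
        raise NotImplementedError(f'Frequency {frequency!r} is not supported.')
    start = f'{request_year}-01-01 00:00:00'
    # Last timestamp of the year at the given frequency, e.g. 23:00:00 for 'H'
    last = pd.Timestamp(f'{request_year + 1}-01-01') - pd.tseries.frequencies.to_offset(frequency)
    return start, last.strftime('%Y-%m-%d %H:%M:%S')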
Example No. 4
    # Method of the database-construction class; relies on module-level imports
    # (datetime, pytz, pandas as pd, pathlib as pl, utils) and EXTERNAL_TFLOWS_MAP
    def construct_database(self):
        """Constructs database from raw datafiles and saves it in UTC"""
        # Determine expected timestamps for dataset
        self.curr_date = datetime.now(tz=pytz.timezone('US/Eastern'))  # update current time after download

        start, end = utils.fetch_ts_start_end(self.curr_date, self.year, self.dataset_details.f)
        timestamps = pd.date_range(start, end, freq=self.dataset_details.f, tz='US/Eastern')

        # Construct Database
        print('Constructing DB...')
        files = sorted(pl.Path(self.download_dir).glob('*.csv'))
        if not files:
            print('Warning: No raw datafiles found!')
            return  # skip the rest
        else:
            # Concatenate all CSVs into a DataFrame
            frames = [pd.read_csv(file, index_col=0) for file in files]
            df = pd.concat(frames, sort=False)
            df.index = pd.to_datetime(df.index)

            # If self.dataset_details.col is None then there is no need to pivot
            if ('Time Zone' in df.columns) or (self.dataset_details.col is None):
                # Make index timezone aware (US/Eastern)
                if 'Time Zone' in df.columns:
                    df = df.tz_localize('US/Eastern', ambiguous=df['Time Zone'] == 'EST')
                elif self.dataset_details.col is None:
                    df = df.tz_localize('US/Eastern', ambiguous='infer')
                # Convert to UTC so that pivot can work without throwing an error for
                # duplicate indices (due to the repeated hour at the daylight saving fall-back)
                df = df.sort_index(axis='index').tz_convert('UTC')
                if 'Time Zone' in df.columns:
                    print('Pivoting Data...')
                    df = df.pivot(columns=self.dataset_details.col, values=self.dataset_details.val_col)  # long to wide: one column per category
                print('Resampling...')
                df = df.resample(self.dataset_details.f).mean()
                df = utils.check_and_interpolate_nans(df)
            # When there is no timezone column and the data is 'stacked'
            else:
                print('Data is stacked...')
                frames = []
                for ctype, subdf in df.groupby(by=self.dataset_details.col):
                    subdf = subdf.tz_localize('US/Eastern', ambiguous='infer').tz_convert('UTC')
                    subdf = subdf.resample(self.dataset_details.f).mean()
                    subdf = utils.check_and_interpolate_nans(subdf)
                    subdf.loc[:, self.dataset_details.col] = ctype
                    frames.append(subdf)
                df = pd.concat(frames)
                # check that every region/interface flow name appears the same number of times
                if df[self.dataset_details.col].value_counts().nunique() > 1:
                    print('Warning: There seems to be underlying missing data.\n{}'.format(
                        df[self.dataset_details.col].value_counts()))

            if self.dataset_details.type == 'load':
                df['NYCA'] = df.sum(axis='columns')  # Calculate statewide load based on interpolated values
            if self.dataset_details.type == 'interface_flows':
                # remap external interface names to match website
                df['Interface Name'] = df['Interface Name'].map(EXTERNAL_TFLOWS_MAP).fillna(df['Interface Name'])
            df = df.rename(columns={'Flow (MWH)': 'Flow (MW)',
                                    # 'Postitive' (sic) in the key matches the spelling in the raw files
                                    'Postitive Limit (MWH)': 'Positive Limit (MW)',
                                    'Negative Limit (MWH)': 'Negative Limit (MW)'})

            # Convert back to US/Eastern to select time period based on local time
            df = df.tz_convert('US/Eastern')
            df = df.loc[start:end]

            # Check to make sure that all the expected timestamps exist
            assert timestamps[~timestamps.isin(df.index)].empty, 'Index is missing data! {}'.format(
                timestamps[~timestamps.isin(df.index)])
            assert not df.isnull().values.any(), 'NaNs found! Resampling and interpolation should have handled this.'
            # Save the dataset in UTC and keep a reference on the instance
            df = df.tz_convert('UTC')
            filepath = pl.Path(self.output_dir, f'{self.year}_{self.dataset}.pkl')
            df.to_pickle(filepath)  # the pickle preserves timezone and frequency information
            if self.create_csvs:
                df.to_csv(filepath.with_suffix('.csv'))  # write CSV to its own path instead of overwriting the pickle
            self.df = df
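
Since construct_database pickles the frame with its timezone intact, a quick sanity check on the saved artifact might look like the sketch below. The path is a placeholder following the f'{self.year}_{self.dataset}.pkl' pattern above, not an actual file from the project:

import pandas as pd

df = pd.read_pickle('output/2019_load.pkl')  # hypothetical path
print(df.index.tz)                           # UTC -- preserved by the pickle, unlike a round trip through CSV
print(pd.infer_freq(df.index))               # e.g. 'H'; the frequency is recoverable from the index
assert not df.isnull().values.any()          # interpolation upstream should leave no NaNs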