def postprocess(self, raw_predictions, index):
    '''Turn raw model output into a labelled data frame of predictions.

    The parent class's ``postprocess`` builds the frame. This override then
    relabels its columns from ``self.columns``, undoes target scaling when
    ``self.standardize`` is set, and zeroes the night-time rows when
    ``self.daylight_only`` is set.

    Arguments:
        raw_predictions:
            The raw model output, forwarded unchanged to the parent class's
            ``postprocess``.
        index:
            The times of the predictions. Coerced with
            ``apollo.DatetimeIndex`` and given the name ``'time'``.

    Returns:
        The predictions, with one column per entry of ``self.columns``.
    '''
    # Reconstruct the data frame.
    logger.debug('postprocess: constructing data frame')
    index = apollo.DatetimeIndex(index, name='time')
    predictions = super().postprocess(raw_predictions, index)

    # Set the columns. The parent must produce exactly one column per name.
    cols = list(self.columns)
    assert len(predictions.columns) == len(cols)
    predictions.columns = pd.Index(cols)

    # Unscale the predictions.
    if self.standardize:
        logger.debug('postprocess: unscaling predictions')
        predictions[cols] = self.target_scaler.inverse_transform(predictions)

    # Set overnight predictions to zero (optionally).
    if self.daylight_only:
        logger.debug('postprocess: setting night time to zero')
        # NOTE(review): the right-hand side of this unpacking was missing
        # from the reviewed text, which left `lat`/`lon` unbound when passed
        # to `is_daylight`. `self.center` is an assumption -- confirm that
        # this is the attribute holding the (lat, lon) pair.
        (lat, lon) = self.center
        night = ~apollo.is_daylight(index, lat, lon)
        predictions.loc[night, :] = 0

    return predictions
def times_to_reftimes(times):
    '''Find the reference times of every forecast that may cover ``times``.

    Each time is floored to a 6-hour boundary. The result holds those
    boundaries together with the six preceding 6-hour steps, i.e. everything
    from 0 to 36 hours earlier.

    On the edge case, this may select one extra forecast per time.

    Arguments:
        times (numpy.ndarray like):
            A series of forecast times.

    Returns:
        apollo.DatetimeIndex:
            The set of reftimes for forecasts containing the given times.
    '''
    unique_times = apollo.DatetimeIndex(times, name='reftime').unique()
    latest = unique_times.floor('6h').unique()

    # Fold in the earlier steps nearest-first: 6h, 12h, ..., 36h back.
    combined = latest
    for hours_back in range(6, 42, 6):
        combined = combined.union(latest - pd.Timedelta(f'{hours_back}h'))
    return combined
def postprocess(self, times, raw_predictions):
    '''Turn raw model output into a labelled data frame of predictions.

    Builds a frame indexed by time with columns ``self.cols``, undoes target
    scaling when ``self.standardize`` is set, and zeroes the night-time rows
    when ``self.daylight_only`` is set.

    Arguments:
        times:
            The times of the predictions. Coerced with
            ``apollo.DatetimeIndex`` and given the name ``'time'``.
        raw_predictions:
            The raw model output; used as the data of the frame, so it must
            be compatible with ``len(times)`` rows and ``self.cols`` columns.

    Returns:
        pandas.DataFrame:
            The predictions, indexed by time.
    '''
    # Reconstruct the data frame.
    logger.debug('postprocess: constructing data frame')
    cols = self.cols
    index = apollo.DatetimeIndex(times, name='time')
    predictions = pd.DataFrame(raw_predictions, index=index, columns=cols)

    # Unscale the predictions.
    if self.standardize:
        logger.debug('postprocess: unscaling predictions')
        predictions[cols] = self.target_scaler.inverse_transform(raw_predictions)

    # Set overnight predictions to zero (optionally).
    if self.daylight_only:
        logger.debug('postprocess: setting night time to zero')
        # NOTE(review): the right-hand side of this unpacking was missing
        # from the reviewed text, which left `lat`/`lon` unbound when passed
        # to `is_daylight`. `self.center` is an assumption -- confirm that
        # this is the attribute holding the (lat, lon) pair.
        (lat, lon) = self.center
        # The result is used as a row mask below, so it must be negated
        # elementwise with `~`. `not` would try to reduce the whole array to
        # a single truth value instead.
        night = ~apollo.is_daylight(index, lat, lon)
        predictions.loc[night, :] = 0

    return predictions
def load_data(self, index, dedupe_strategy='best'):
    '''Load input data for the given times.

    Arguments:
        index (pandas.DatetimeIndex):
            The times to forecast. They are floored to the hour and
            deduplicated before loading.
        dedupe_strategy (str or int):
            The strategy for selecting between duplicate forecasts of the
            same time. ``'best'`` keeps the rows with the smallest forecast
            offset for each time. An integer ``n`` less than 6 keeps the
            rows whose forecast offset lies in ``[6h * n, 6h * (n + 1))``.

    Returns:
        pandas.DataFrame:
            A data frame indexed by the forecast time.

    Raises:
        ValueError:
            If ``dedupe_strategy`` is neither ``'best'`` nor an integer
            less than 6.
    '''
    index = apollo.DatetimeIndex(index)
    index = index.floor('1h').unique()

    # Load the xarray data.
    logger.debug('load: loading netcdf')
    reftimes = nam.times_to_reftimes(index)
    # NOTE(review): the call target and first argument of this statement
    # were missing from the reviewed text (only `, on_miss='skip')` was
    # left). `nam.open(reftimes, ...)` is an assumption -- confirm the
    # loader name and how it expects the reftimes to be passed.
    data = nam.open(reftimes, on_miss='skip')
    data = data[self.features]
    data = data.astype('float32')

    # Select geographic area.
    logger.debug('load: slicing geographic area')
    # NOTE(review): the second argument was missing from the reviewed text.
    # `center=self.center` is an assumption -- confirm against `slice_geo`.
    data = nam.slice_geo(data, center=self.center, shape=self.shape)

    # Create a data frame.
    # This will have a multi-index of `(reftime, forecast, x, y, *z)`,
    # where `*z` is all of the different z-axes in the dataset.
    logger.debug('load: converting to dataframe')
    data = data.to_dataframe().drop(['lat', 'lon'], axis=1)

    # Replace `reftime` and `forecast` levels with `time`.
    # `forecast` is kept as a column because deduplication needs it below.
    logger.debug('load: reindex by time')
    old_index = data.index
    data = data.set_index('time', append=True)
    data = data.reorder_levels(['time', *old_index.names])
    data = data.reset_index('reftime', drop=True)
    data = data.reset_index('forecast', drop=False)

    # Filter to only the times requested.
    data = data.loc[index]

    # Handle duplicates.
    logger.debug(f'load: selecting forecast hour (dedupe_strategy={dedupe_strategy})')
    if dedupe_strategy == 'best':
        data = data.groupby(data.index) \
            .apply(lambda g: g[g.forecast == g.forecast.min()]) \
            .droplevel(0)
    elif isinstance(dedupe_strategy, int) and dedupe_strategy < 6:
        delta = pd.Timedelta('6h')
        lo = delta * dedupe_strategy
        hi = lo + delta
        data = data.groupby(data.index) \
            .apply(lambda g: g[(lo <= g.forecast) & (g.forecast < hi)]) \
            .droplevel(0)
    else:
        raise ValueError(f'invalid dedupe_strategy {repr(dedupe_strategy)}')

    # We no longer need the forecast hour.
    data = data.drop('forecast', axis=1)

    # Unstack until we're indexed by time alone, then flatten the columns.
    # The loop stops at two index levels; the final unstack removes the last
    # non-time level.
    logger.debug('load: unstacking geographic indices')
    while 2 < len(data.index.levels):
        data = data.unstack()
    data = data.unstack()
    data.columns = data.columns.to_flat_index()

    # The index has lost its timezone information. Fix it.
    data.index = apollo.DatetimeIndex(data.index)

    # We're done.
    return data