def test_frame_tz_localize(self): rng = date_range("1/1/2011", periods=100, freq="H") df = DataFrame({"a": 1}, index=rng) result = df.tz_localize("utc") expected = DataFrame({"a": 1}, rng.tz_localize("UTC")) assert result.index.tz.zone == "UTC" tm.assert_frame_equal(result, expected) df = df.T result = df.tz_localize("utc", axis=1) assert result.columns.tz.zone == "UTC" tm.assert_frame_equal(result, expected.T)
def test_frame_tz_localize(self): rng = date_range('1/1/2011', periods=100, freq='H') df = DataFrame({'a': 1}, index=rng) result = df.tz_localize('utc') expected = DataFrame({'a': 1}, rng.tz_localize('UTC')) assert result.index.tz.zone == 'UTC' tm.assert_frame_equal(result, expected) df = df.T result = df.tz_localize('utc', axis=1) assert result.columns.tz.zone == 'UTC' tm.assert_frame_equal(result, expected.T)
def correct_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
    """Localize ``df`` to the source zone, convert it to the destination zone.

    Reads ``self.tz_source``, ``self.ambiguous``, ``self.tz_dest`` and
    ``self.neutralize_tz``; ``self.level_name`` is only consulted when the
    index is a ``MultiIndex``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose index (or the named index level) holds the timestamps.

    Returns
    -------
    pd.DataFrame
        The converted frame, sorted by index. When ``self.neutralize_tz`` is
        truthy a further ``tz_convert(None)`` is applied.
    """
    # A level name is only meaningful for a MultiIndex.
    if isinstance(df.index, pd.MultiIndex):
        level = self.level_name
    else:
        level = None

    localized = df.tz_localize(
        self.tz_source,
        ambiguous=self.ambiguous,
        copy=False,
        level=level,
    )
    res = localized.tz_convert(self.tz_dest, level=level)

    if self.neutralize_tz:
        res = res.tz_convert(None, level=level)

    res.sort_index(inplace=True)
    return res
def _standardize_index(self, df_in: pd.DataFrame, symbol: str = None, datatype: str = None, barsize: str = None, tz: str = None): """Normalize input DataFrame index to MarketDataBlock standard. """ # Add or starndardize index names in the input. if isinstance(df_in.index, pd.MultiIndex): df_in.reset_index(inplace=True) # Rename ambiguous column names. df_in.columns = [ col_rename.get(col.strip().lower(), col.strip().lower()) for col in df_in.columns ] # Insert Symbol, DataType, Barsize columns from arguments if not # found in the input dataframe. for col in MarketDataBlock.data_index: if col not in df_in.columns: if locals().get(col.lower(), None) is None: raise KeyError( 'No {0} argument and no {0} column in the DataFrame.'. format(col)) df_in.insert(0, col, locals()[col.lower()]) # Convert datetime strings to pandas DatetimeIndex df_in['TickerTime'] = pd.DatetimeIndex(df_in['TickerTime'].apply( pd.Timestamp)) # Standardize BarSize strings df_in['BarSize'] = df_in['BarSize'].map(timedur_standardize) # Set index to class-defined MultiIndex df_in.set_index(MarketDataBlock.data_index, inplace=True) # Set time zone so all DatetimeIndex are tz-aware df_in_tz = df_in.index.levels[self.__class__.dtlevel].tz if df_in_tz is None or isinstance(df_in_tz, timezone) or \ isinstance(df_in_tz, pytz._FixedOffset): # Input df has naive time index, or tzinfo is not pytz.timezone() if tz is None: raise ValueError( 'Argument tz=None, and TickerTime.tzinfo is None(naive),' 'datetime.timezone, or pytz._FixedOffset.') if df_in_tz is None: df_in = df_in.tz_localize(tz, level=self.__class__.dtlevel) else: df_in = df_in.tz_convert(tz, level=self.__class__.dtlevel) return df_in
def test_tz_localize(self, frame_or_series): rng = date_range("1/1/2011", periods=100, freq="H") obj = DataFrame({"a": 1}, index=rng) obj = tm.get_obj(obj, frame_or_series) result = obj.tz_localize("utc") expected = DataFrame({"a": 1}, rng.tz_localize("UTC")) expected = tm.get_obj(expected, frame_or_series) assert result.index.tz.zone == "UTC" tm.assert_equal(result, expected)
def test_tz_localize(self, frame_or_series): rng = date_range("1/1/2011", periods=100, freq="H") obj = DataFrame({"a": 1}, index=rng) if frame_or_series is not DataFrame: obj = obj["a"] result = obj.tz_localize("utc") expected = DataFrame({"a": 1}, rng.tz_localize("UTC")) if frame_or_series is not DataFrame: expected = expected["a"] assert result.index.tz.zone == "UTC" tm.assert_equal(result, expected)
def test_series_frame_tz_localize(self): rng = date_range('1/1/2011', periods=100, freq='H') ts = Series(1, index=rng) result = ts.tz_localize('utc') self.assert_(result.index.tz.zone == 'UTC') df = DataFrame({'a': 1}, index=rng) result = df.tz_localize('utc') expected = DataFrame({'a': 1}, rng.tz_localize('UTC')) self.assert_(result.index.tz.zone == 'UTC') assert_frame_equal(result, expected) df = df.T result = df.tz_localize('utc', axis=1) self.assert_(result.columns.tz.zone == 'UTC') assert_frame_equal(result, expected.T) # Can't localize if already tz-aware rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') ts = Series(1, index=rng) self.assertRaises(Exception, ts.tz_localize, 'US/Eastern')
def test_series_frame_tz_localize(self): rng = date_range("1/1/2011", periods=100, freq="H") ts = Series(1, index=rng) result = ts.tz_localize("utc") self.assertEqual(result.index.tz.zone, "UTC") df = DataFrame({"a": 1}, index=rng) result = df.tz_localize("utc") expected = DataFrame({"a": 1}, rng.tz_localize("UTC")) self.assertEqual(result.index.tz.zone, "UTC") assert_frame_equal(result, expected) df = df.T result = df.tz_localize("utc", axis=1) self.assertEqual(result.columns.tz.zone, "UTC") assert_frame_equal(result, expected.T) # Can't localize if already tz-aware rng = date_range("1/1/2011", periods=100, freq="H", tz="utc") ts = Series(1, index=rng) tm.assertRaisesRegexp(TypeError, "Already tz-aware", ts.tz_localize, "US/Eastern")
def _standardize_index( self, df_in: pd.DataFrame, symbol: str=None, datatype: str=None, barsize: str=None, tz: str=None): """Normalize input DataFrame index to MarketDataBlock standard. """ # Add or starndardize index names in the input. if isinstance(df_in.index, pd.MultiIndex): df_in.reset_index(inplace=True) # Rename ambiguous column names. df_in.columns = [ col_rename.get(col.strip().lower(), col.strip().lower()) for col in df_in.columns] # Insert Symbol, DataType, Barsize columns from arguments if not # found in the input dataframe. for col in MarketDataBlock.data_index: if col not in df_in.columns: if locals().get(col.lower(), None) is None: raise KeyError( 'No {0} argument and no {0} column in the DataFrame.' .format(col)) df_in.insert(0, col, locals()[col.lower()]) # Convert datetime strings to pandas DatetimeIndex df_in['TickerTime'] = pd.DatetimeIndex( df_in['TickerTime'].apply(pd.Timestamp)) # Standardize BarSize strings df_in['BarSize'] = df_in['BarSize'].map(timedur_standardize) # Set index to class-defined MultiIndex df_in.set_index(MarketDataBlock.data_index, inplace=True) # Set time zone so all DatetimeIndex are tz-aware df_in_tz = df_in.index.levels[self.__class__.dtlevel].tz if df_in_tz is None or isinstance(df_in_tz, timezone) or \ isinstance(df_in_tz, pytz._FixedOffset): # Input df has naive time index, or tzinfo is not pytz.timezone() if tz is None: raise ValueError( 'Argument tz=None, and TickerTime.tzinfo is None(naive),' 'datetime.timezone, or pytz._FixedOffset.') if df_in_tz is None: df_in = df_in.tz_localize(tz, level=self.__class__.dtlevel) else: df_in = df_in.tz_convert(tz, level=self.__class__.dtlevel) return df_in
def get_home(data: pd.DataFrame, periods: dict) -> tuple:
    """Prepare location samples and period bounds, then call ``infer_home``.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain ``timestamp``, ``double_latitude`` and
        ``double_longitude`` columns. The two coordinate columns are renamed
        in place on the caller's frame (and its index labels are cast to
        ``str``), exactly as before.
    periods : dict
        Iterated with ``enumerate``; every item must expose ``'start'`` and
        ``'end'`` mappings holding ``year``, ``month``, ``day``, ``hour``,
        ``minute`` and ``second``.
        NOTE(review): the annotation says ``dict`` but the loop treats each
        item as a mapping, so a sequence of dicts is presumably expected.

    Returns
    -------
    Whatever ``infer_home`` returns (annotated as ``tuple``).
    """

    def _to_epoch(stamp):
        # Local-time epoch seconds for a {'year': ..., 'second': ...} mapping.
        moment = datetime.datetime(stamp['year'], stamp['month'],
                                   stamp['day'], stamp['hour'],
                                   stamp['minute'], stamp['second'])
        return time.mktime(moment.timetuple())

    # NOTE this is to match the names library expects. Ideally library should
    # work with the same names as the AWARE database schema
    data.rename(index=str,
                columns={
                    'double_latitude': 'latitude',
                    'double_longitude': 'longitude'
                },
                inplace=True)

    # NOTE this is to imitate data in CMU's use of the library and mainly
    # because there has been cases (e.g. screen) where columns have been
    # accessed by their position and not their name
    #
    # NOTE the sort is again to imitate data in CMU's use of the library and
    # out of caution. I do not know if being sorted is assumed by the library.
    # Sorting returns a new frame instead of sorting the column subset in
    # place, which avoids pandas' chained-assignment warning.
    data = data[['timestamp', 'latitude', 'longitude']].sort_values(
        by='timestamp', ascending=True)

    # The return value is ignored; the 'time' column used below is presumably
    # added to ``data`` in place by this helper.
    convert_timezone(data, 'US/Pacific', {'timestamp': 'time'})
    data.set_index("time", inplace=True)
    data = data.tz_localize(None)
    # QUESTION why to first set timezone as UTC and then change it back?
    #          why not setting it as local time zone from the beginning?

    # One (start, end) row of epoch seconds per period; the float returned by
    # mktime is truncated on assignment into the int64 array.
    periodranges = np.ndarray(shape=(len(periods), 2), dtype=np.int64)
    for index, period in enumerate(periods):
        periodranges[index, 0] = _to_epoch(period['start'])
        periodranges[index, 1] = _to_epoch(period['end'])

    nightranges = getDaywiseSplitsForEpoch("night")
    home_location = infer_home(data, periodranges, nightranges)
    return home_location
def __init__(self, name: str, data: pd.DataFrame, interval: str = DEFAULT_FREQ,
             indicators: List[str] = None, drop_columns=None):
    """Store the price data in UTC, resample it and attach indicators.

    Parameters
    ----------
    name : str
        Stored as ``self.name``.
    data : pd.DataFrame
        Naive-indexed frame; it is localized to ``DEFAULT_TIMEZONE``
        (ambiguous times inferred) and converted to UTC.
    interval : str
        Target frequency; a downsample is only performed when it differs
        from ``DEFAULT_FREQ``.
    indicators : list of str, optional
        Names passed one by one to ``Function``; ``None`` means no indicators.
    drop_columns : optional
        Columns removed from the data once indicators have been applied.
    """
    # Init OHLC data
    self.name = name
    localized = data.tz_localize(DEFAULT_TIMEZONE, ambiguous='infer')
    self._data = localized.tz_convert('UTC')

    self.upsample(DEFAULT_FREQ)
    if interval != DEFAULT_FREQ:
        self.downsample(interval)
    self._data = self._data.astype(COLUMN_DTYPES)

    # Init indicators
    built = {}
    for indicator_name in (indicators or []):
        built[indicator_name] = Function(indicator_name)
    self._indicators = built
    self._apply_indicators()

    if drop_columns:
        self._data = self._data.drop(drop_columns, axis=1)
def test_getitem_with_datestring_with_UTC_offset(self, start, end):
    """Slice a tz-aware frame with offset-bearing bounds (GH 24076, GH 16785).

    ``start`` and ``end`` are supplied by the test parametrization.
    """
    # GH 24076
    index = date_range(
        start="2018-12-02 14:50:00-07:00",
        end="2018-12-02 14:50:00-07:00",
        freq="1min",
    )
    df = DataFrame(1, index=index, columns=["A"])
    tm.assert_frame_equal(df[start:end], df.iloc[0:3, :])

    # GH 16785
    start = str(start)
    end = str(end)

    # End bound whose UTC offset differs from the start bound's.
    mismatched_end = end[:-4] + "1:00"
    with pytest.raises(ValueError, match="Both dates must"):
        df[start:mismatched_end]

    with pytest.raises(ValueError, match="The index must be timezone"):
        # Offset-bearing strings against a naive index.
        df = df.tz_localize(None)
        df[start:end]
# NOTE(review): this is a fragment of a larger routine; ``source``, ``cols``
# and ``sym`` are defined before the visible code. ``xrange`` and
# ``string.letters`` imply Python 2, where ``/`` on two ints floors.

# Picks random element(s) of ``x``; the sample size is itself drawn from [1].
month = lambda x: np.random.choice([abc for abc in x],
                                   np.random.choice([1]))

# Random contract codes: a letter followed by 14, 15 or 16, each repeated
# ``len(cols)`` times.
contracts = np.ravel([[(''.join(month(string.letters[:26])) +
                        str(np.random.choice([14, 15, 16])))] * len(cols)
                      for x in xrange(len(source.columns) / len(cols) / 2)])

# Tile the contract codes so there is one label per source column.
level_1 = len(source.columns) / len(contracts) * list(contracts)

numsyms = len(source.columns) / (len(set(level_1)) * len(cols))
underlyings = [''.join(sym(string.letters[:26])) for x in xrange(numsyms)]
# Note: the loop variable below reuses the name ``sym`` that is called as a
# function on the previous line.
level_0 = np.ravel([[sym] * len(set(level_1)) * len(cols)
                    for sym in underlyings])

# Columns become (underlying, contract, original column) tuples.
source.columns = pd.MultiIndex.from_tuples(
    zip(level_0, level_1, source.columns))

# Daily index ending now, one entry per existing row.
source.index = pd.date_range(
    start=dt.datetime.utcnow() - dt.timedelta(days=len(source.index) - 1),
    end=dt.datetime.utcnow(),
    freq='D')

futdata = FuturesDataFrameSource(source.tz_localize('UTC'))


class FrontTrader(TradingAlgorithm):
    # The ``roll`` argument keeps the rows whose open interest equals the
    # maximum open interest.
    @roll(lambda x: x[x['open_interest'] == x['open_interest'].max()])
    def handle_data(self, data):
        # Order 2 units of every (symbol, contract) pair present in ``data``.
        for sym in data.keys():
            self.order((sym, data[sym]['contract']), 2)
        return data


bot = FrontTrader()
stats = bot.run(futdata)
def get_canonical_data(
        df: pd.DataFrame,
        time_col: str = TIME_COL,
        value_col: str = VALUE_COL,
        freq: str = None,
        date_format: str = None,
        tz: str = None,
        train_end_date: datetime = None,
        regressor_cols: List[str] = None,
        anomaly_info: Optional[Union[Dict, List[Dict]]] = None):
    """Loads data to internal representation. Parses date column,
    sets timezone aware index.
    Checks for irregularities and raises an error if input is invalid.
    Adjusts for anomalies according to ``anomaly_info``.

    Parameters
    ----------
    df : `pandas.DataFrame`
        Input timeseries. A data frame which includes the timestamp column
        as well as the value column.
    time_col : `str`
        The column name in ``df`` representing time for the time series data.
        The time column can be anything that can be parsed by pandas DatetimeIndex.
    value_col: `str`
        The column name which has the value of interest to be forecasted.
    freq : `str`, optional, default None
        Timeseries frequency, DateOffset alias, If None automatically inferred.
    date_format : `str`, optional, default None
        strftime format to parse time column, eg ``%m/%d/%Y``.
        Note that ``%f`` will parse all the way up to nanoseconds.
        If None (recommended), inferred by `pandas.to_datetime`.
    tz : `str` or pytz.timezone object, optional, default None
        Passed to `pandas.tz_localize` to localize the timestamp.
    train_end_date : `datetime.datetime`, optional, default None
        Last date to use for fitting the model. Forecasts are generated after this date.
        If None, it is set to the minimum of ``self.last_date_for_val`` and
        ``self.last_date_for_reg``.
    regressor_cols: `list` [`str`], optional, default None
        A list of regressor columns used in the training and prediction DataFrames.
        If None, no regressor columns are used.
        Regressor columns that are unavailable in ``df`` are dropped.
    anomaly_info : `dict` or `list` [`dict`] or None, default None
        Anomaly adjustment info. Anomalies in ``df``
        are corrected before any forecasting is done.

        If None, no adjustments are made.

        A dictionary containing the parameters to
        `~greykite.common.features.adjust_anomalous_data.adjust_anomalous_data`.
        See that function for details.
        The possible keys are:

            ``"value_col"`` : `str`
                The name of the column in ``df`` to adjust. You may adjust the value
                to forecast as well as any numeric regressors.
            ``"anomaly_df"`` : `pandas.DataFrame`
                Adjustments to correct the anomalies.
            ``"start_date_col"``: `str`, default START_DATE_COL
                Start date column in ``anomaly_df``.
            ``"end_date_col"``: `str`, default END_DATE_COL
                End date column in ``anomaly_df``.
            ``"adjustment_delta_col"``: `str` or None, default None
                Impact column in ``anomaly_df``.
            ``"filter_by_dict"``: `dict` or None, default None
                Used to filter ``anomaly_df`` to the relevant anomalies for
                the ``value_col`` in this dictionary.
                Key specifies the column name, value specifies the filter value.
            ``"filter_by_value_col"``: `str` or None, default None
                Adds ``{filter_by_value_col: value_col}`` to ``filter_by_dict``
                if not None, for the ``value_col`` in this dictionary.
            ``"adjustment_method"`` : `str` ("add" or "subtract"), default "add"
                How to make the adjustment, if ``adjustment_delta_col`` is provided.

        Accepts a list of such dictionaries to adjust multiple columns in ``df``.

    Returns
    -------
    canonical_data_dict : `dict`
        Dictionary containing the dataset in canonical form, and information such as
        train end date. Keys:

            ``"df"`` : `pandas.DataFrame`
                Data frame containing timestamp and value, with standardized column names for internal use
                (TIME_COL, VALUE_COL). Rows are sorted by time index, and missing gaps between dates are filled
                in so that dates are spaced at regular intervals. Values are adjusted for anomalies
                according to ``anomaly_info``.
                The index can be timezone aware (but TIME_COL is not).
            ``"df_before_adjustment"`` : `pandas.DataFrame` or None
                ``df`` before adjustment by ``anomaly_info``.
                If ``anomaly_info`` is None, this is None.
            ``"fit_df"`` : `pandas.DataFrame`
                A subset of the returned ``df``, with data up until ``train_end_date``.
            ``"freq"`` : `str`
                timeseries frequency, inferred if not provided
            ``"time_stats"`` : `dict`
                Information about the time column:

                    ``"gaps"``: missing_dates
                    ``"added_timepoints"``: added_timepoints
                    ``"dropped_timepoints"``: dropped_timepoints

            ``"regressor_cols"`` : `list` [`str`]
                A list of regressor columns.
            ``"fit_cols"`` : `list` [`str`]
                Names of time column, value column, and regressor columns.
            ``"train_end_date"`` : `datetime.datetime`
                Last date or timestamp for training. It is always less than or equal to
                minimum non-null values of ``last_date_for_val`` and ``last_date_for_reg``.
            ``"last_date_for_val"`` : `datetime.datetime`
                Date or timestamp corresponding to last non-null value in ``df[value_col]``.
            ``"last_date_for_reg"`` : `datetime.datetime` or None
                Date or timestamp corresponding to last non-null value in ``df[regressor_cols]``.
                If ``regressor_cols`` is None, ``last_date_for_reg`` is None.

    Raises
    ------
    ValueError
        If ``time_col`` or ``value_col`` is not a column of ``df``, or if
        ``df`` has fewer than 3 rows.
    """
    if time_col not in df.columns:
        raise ValueError(f"{time_col} column is not in input data")
    if value_col not in df.columns:
        raise ValueError(f"{value_col} column is not in input data")
    if df.shape[0] <= 2:
        raise ValueError(
            "Time series has < 3 observations. More data are needed for forecasting.")

    # Standardizes the time column name.
    # `value_col` is standardized after anomalies are adjusted.
    df_standardized = df.rename({
        time_col: TIME_COL,
    }, axis=1)
    # NOTE(review): recent pandas releases deprecate ``infer_datetime_format``;
    # it is kept here so parsing behaves as before on older versions.
    df_standardized[TIME_COL] = pd.to_datetime(
        df_standardized[TIME_COL],
        format=date_format,
        infer_datetime_format=True)

    # Drops data points from duplicate time stamps
    df_standardized.drop_duplicates(
        subset=[TIME_COL],
        keep='first',
        inplace=True)
    if df.shape[0] > df_standardized.shape[0]:
        warnings.warn(
            "Duplicate timestamps have been removed.",
            UserWarning)
    df = df_standardized.sort_values(by=TIME_COL)

    # Infers data frequency
    inferred_freq = pd.infer_freq(df[TIME_COL])
    if freq is None:
        freq = inferred_freq
    elif inferred_freq is not None and freq != inferred_freq:
        warnings.warn(
            f"Provided frequency '{freq}' does not match inferred frequency '{inferred_freq}'."
            f" Using '{freq}'.", UserWarning)  # NB: with missing data, it's better to provide freq

    # Handles gaps in time series
    missing_dates = find_missing_dates(df[TIME_COL])
    df, added_timepoints, dropped_timepoints = fill_missing_dates(
        df,
        time_col=TIME_COL,
        freq=freq)
    time_stats = {
        "gaps": missing_dates,
        "added_timepoints": added_timepoints,
        "dropped_timepoints": dropped_timepoints
    }

    # Creates index with localized timestamp
    df.index = df[TIME_COL]
    df.index.name = None
    if tz is not None:
        df = df.tz_localize(tz)

    df_before_adjustment = None
    if anomaly_info is not None:
        # Saves values before adjustment.
        df_before_adjustment = df.copy()
        # Adjusts columns in df (e.g. `value_col`, `regressor_cols`)
        # using the anomaly info. One dictionary of parameters
        # for `adjust_anomalous_data` is provided for each column to adjust.
        if not isinstance(anomaly_info, (list, tuple)):
            anomaly_info = [anomaly_info]
        for single_anomaly_info in anomaly_info:
            adjusted_df_dict = adjust_anomalous_data(
                df=df,
                time_col=TIME_COL,
                **single_anomaly_info)
            # `df` with values for single_anomaly_info["value_col"] adjusted.
            df = adjusted_df_dict["adjusted_df"]

        # Standardizes `value_col` name
        df_before_adjustment.rename({
            value_col: VALUE_COL
        }, axis=1, inplace=True)
    # Standardizes `value_col` name
    df.rename({
        value_col: VALUE_COL
    }, axis=1, inplace=True)

    # Finds date of last available value
    last_date_available = df[TIME_COL].max()
    last_date_for_val = df[df[VALUE_COL].notnull()][TIME_COL].max()
    last_date_for_reg = None
    if regressor_cols:
        available_regressor_cols = [col for col in df.columns if col not in [TIME_COL, VALUE_COL]]
        cols_not_selected = set(regressor_cols) - set(available_regressor_cols)
        regressor_cols = [col for col in regressor_cols if col in available_regressor_cols]
        if cols_not_selected:
            warnings.warn(f"The following columns are not available to use as "
                          f"regressors: {sorted(cols_not_selected)}")
        last_date_for_reg = df[df[regressor_cols].notnull().any(axis=1)][TIME_COL].max()
        max_train_end_date = min(last_date_for_val, last_date_for_reg)
    else:
        max_train_end_date = last_date_for_val

    # Chooses appropriate train_end_date
    if train_end_date is None:
        train_end_date = max_train_end_date
        if train_end_date < last_date_available:
            warnings.warn(
                f"{value_col} column of the provided TimeSeries contains "
                f"null values at the end. Setting 'train_end_date' to the last timestamp with a "
                f"non-null value ({train_end_date}).",
                UserWarning)
    elif train_end_date > max_train_end_date:
        # The trailing space after "or" keeps the column name from being
        # fused onto the previous word in the emitted message.
        warnings.warn(
            f"Input timestamp for the parameter 'train_end_date' "
            f"({train_end_date}) either exceeds the last available timestamp or "
            f"{value_col} column of the provided TimeSeries contains null "
            f"values at the end. Setting 'train_end_date' to the last timestamp with a "
            f"non-null value ({max_train_end_date}).",
            UserWarning)
        train_end_date = max_train_end_date

    if regressor_cols is None:
        regressor_cols = []
    fit_cols = [TIME_COL, VALUE_COL] + regressor_cols
    fit_df = df[df[TIME_COL] <= train_end_date][fit_cols]

    return {
        "df": df,
        "df_before_adjustment": df_before_adjustment,
        "fit_df": fit_df,
        "freq": freq,
        "time_stats": time_stats,
        "regressor_cols": regressor_cols,
        "fit_cols": fit_cols,
        "train_end_date": train_end_date,
        "last_date_for_val": last_date_for_val,
        "last_date_for_reg": last_date_for_reg,
    }
def make_tz_unaware_df(df: DataFrame) -> DataFrame:
    """Return ``df`` with the time zone removed from its index.

    Delegates to ``tz_localize(None)``; the input frame is left untouched.
    """
    naive = df.tz_localize(None)
    return naive
(''.join(month(string.letters[:26])) + str(np.random.choice([14, 15, 16]))) ] * len(cols) for x in range(len(source.columns) / len(cols) / 2)]) level_1 = len(source.columns) / len(contracts) * list(contracts) numsyms = len(source.columns) / (len(set(level_1)) * len(cols)) underlyings = [''.join(sym(string.letters[:26])) for x in range(numsyms)] level_0 = np.ravel([[sym] * len(set(level_1)) * len(cols) for sym in underlyings]) source.columns = pd.MultiIndex.from_tuples( list(zip(level_0, level_1, source.columns))) source.index = pd.date_range(start=dt.datetime.utcnow() - dt.timedelta(days=len(source.index) - 1), end=dt.datetime.utcnow(), freq='D') futdata = FuturesDataFrameSource(source.tz_localize('UTC')) class FrontTrader(TradingAlgorithm): @roll(lambda x: x[x['open_interest'] == x['open_interest'].max()]) def handle_data(self, data): for sym in list(data.keys()): self.order((sym, data[sym]['contract']), 2) return data bot = FrontTrader() stats = bot.run(futdata)
def set_timezone(self, df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` with its index localized to ``self.data_tz``.

    Delegates to ``tz_localize``; the input frame is left untouched.
    """
    target_tz = self.data_tz
    localized = df.tz_localize(target_tz)
    return localized