def merge_ownership_periods(mappings):
    """
    Build gap-free ownership periods for every key in ``mappings``.

    Each value of ``mappings`` is a list of OwnershipPeriod objects. The
    periods are sorted and rebuilt so that every period ends exactly where
    the following one starts. The end date of the last period is pushed
    forward to the max Timestamp (localized to UTC).

    Returns a dict with the same keys whose values are tuples of new
    OwnershipPeriod objects; the input objects are not modified.
    """
    def close_gaps(periods):
        ordered = sorted(periods)
        # Fake trailing period: only its ``start`` is ever read, and it makes
        # the last real period end at the max timestamp.
        terminator = OwnershipPeriod(
            safe_tz_localize(pd.Timestamp.max, 'utc'),
            None,
            None,
            None,
        )
        adjacent = sliding_window(2, concatv(ordered, [terminator]))
        return tuple(
            OwnershipPeriod(
                current.start,
                following.start,
                current.sid,
                current.value,
            )
            for current, following in adjacent
        )

    return valmap(close_gaps, mappings)
def load_from_yahoo(indexes=None, stocks=None, start=None, end=None,
                    adjusted=True):
    """
    Build a dataframe of Yahoo prices with one column per requested asset.

    The price column is Yahoo's 'Adj Close' (splits and dividends removed)
    unless 'adjusted' is False, in which case the raw 'Close' field is used.
    The index of the returned frame is localized to UTC.

    :param indexes: Financial indexes to load.
    :type indexes: dict
    :param stocks: Stock closing prices to load.
    :type stocks: list
    :param start: Retrieve prices from start date on.
    :type start: datetime
    :param end: Retrieve prices until end date.
    :type end: datetime
    :param adjusted: Adjust the price for splits and dividends.
    :type adjusted: bool
    """
    raw = _load_raw_yahoo_data(indexes, stocks, start, end)
    column = 'Adj Close' if adjusted else 'Close'

    closes = {}
    for name, frame in iteritems(raw):
        closes[name] = frame[column]

    prices = pd.DataFrame(closes)
    prices.index = safe_tz_localize(prices.index, pytz.utc)
    return prices
def test_nearest_unequal_elements(self, tz):
    # Index with gaps of different sizes between consecutive entries.
    dts = safe_tz_localize(
        pd.to_datetime(
            ['2014-01-01', '2014-01-05', '2014-01-06', '2014-01-09']),
        tz,
    )

    def as_ts(label):
        if label is None:
            return None
        return pd.Timestamp(label, tz=tz)

    # (query date, expected element before, expected element after)
    cases = (
        ('2013-12-30', None, '2014-01-01'),
        ('2013-12-31', None, '2014-01-01'),
        ('2014-01-01', None, '2014-01-05'),
        ('2014-01-02', '2014-01-01', '2014-01-05'),
        ('2014-01-03', '2014-01-01', '2014-01-05'),
        ('2014-01-04', '2014-01-01', '2014-01-05'),
        ('2014-01-05', '2014-01-01', '2014-01-06'),
        ('2014-01-06', '2014-01-05', '2014-01-09'),
        ('2014-01-07', '2014-01-06', '2014-01-09'),
        ('2014-01-08', '2014-01-06', '2014-01-09'),
        ('2014-01-09', '2014-01-06', None),
        ('2014-01-10', '2014-01-09', None),
        ('2014-01-11', '2014-01-09', None),
    )

    for query, before, after in cases:
        self.assertEqual(
            nearest_unequal_elements(dts, as_ts(query)),
            (as_ts(before), as_ts(after)),
        )
def init_class_fixtures(cls):
    # Let the parent fixtures run first; the attributes read below
    # (readers, raw_data, asset_finder) are expected to exist afterwards.
    super(PipelineAlgorithmTestCase, cls).init_class_fixtures()

    loader = USEquityPricingLoader(
        cls.bcolz_equity_daily_bar_reader,
        cls.adjustment_reader,
        USEquityPricing,
    )
    cls.pipeline_loader = loader

    aapl_index = cls.raw_data[cls.AAPL].index
    cls.dates = safe_tz_localize(aapl_index, 'UTC')
    cls.AAPL_split_date = Timestamp("2014-06-09", tz='UTC')

    sids = cls.ASSET_FINDER_EQUITY_SIDS
    cls.assets = cls.asset_finder.retrieve_all(sids)
def _maybe_update_symbol_frame(self, start_time, api_key, cache, symbol,
                               calendar, start_session, end_session,
                               data_frequency, retries):
    """
    Return the data frame for ``symbol``, re-fetching only when needed.

    Returns a ``(frame, should_sleep)`` pair. ``should_sleep`` is True only
    when a fetch was attempted, which tells the calling iterator to pause
    before the next asset so the data source's rate limit is not exceeded.
    """
    cache_key = '{sym}.{freq}.frame'.format(sym=symbol, freq=data_frequency)
    try:
        cached = cache[cache_key]
    except KeyError:
        cached = None

    # Freshness is measured from the newest cached row when there is one,
    # otherwise from the provided `start_session`.
    if cached is not None and len(cached) > 0:
        newest = safe_tz_localize(cached.index[-1], 'UTC')
    else:
        newest = start_session

    # Cached data is treated as stale two days after its newest row.
    stale_after = newest + pd.Timedelta(days=2)
    if start_time <= stale_after and cached is not None:
        # Fresh enough to reuse: no API call was made, so no pause needed.
        return cached, False

    fetched = self._fetch_symbol_frame(
        api_key,
        symbol,
        calendar,
        start_session,
        end_session,
        data_frequency,
        retries=retries,
    )

    # Remember the latest data for the next run.
    cache[cache_key] = fetched
    return fetched, True
def load_frame(url, skiprows):
    """
    Read a Bank of Canada CSV from ``url`` into a DataFrame.

    The frame is indexed by the parsed "Date" column, rows that are entirely
    empty are dropped, the result is localized to UTC and its columns are
    renamed through COLUMN_NAMES.
    """
    raw = pd.read_csv(
        url,
        skiprows=skiprows,
        skipinitialspace=True,
        na_values=["Bank holiday", "Not available"],
        parse_dates=["Date"],
        index_col="Date",
    )
    populated = raw.dropna(how='all')
    localized = safe_tz_localize(populated, 'UTC')
    return localized.rename(columns=COLUMN_NAMES)
def _load_cached_data(filename, first_date, last_date, now, resource_name,
                      environ=None):
    """
    Try to load previously downloaded data from the on-disk cache.

    Parameters
    ----------
    filename : str
        Name of the cache file, resolved through ``get_data_filepath``.
    first_date, last_date
        Bounds handed to ``has_data_for_dates`` to decide whether the cached
        frame covers the requested range.
    now
        Current time, compared against the file's last-modified time.
    resource_name : str
        Human-readable name of the resource, used only in log messages.
    environ : optional
        Forwarded to ``get_data_filepath``.

    Returns
    -------
    pd.DataFrame or None
        The cached frame (index localized to UTC) when it covers the
        requested dates or when the file was written within the last hour;
        ``None`` when the cache is missing, unreadable or out of date.
    """
    # Path for the cache.
    path = get_data_filepath(filename, environ)

    # If the path does not exist, it means the first download has not happened
    # yet, so don't try to read from 'path'.
    if os.path.exists(path):
        try:
            # ``DataFrame.from_csv`` was deprecated in pandas 0.21 and removed
            # in 1.0. This ``read_csv`` call is its documented equivalent and
            # works on old and new pandas alike.
            data = pd.read_csv(path, index_col=0, parse_dates=True)
            if data.empty:
                raise ValueError("File is empty.")
            data.index = safe_tz_localize(
                pd.to_datetime(
                    data.index,
                    infer_datetime_format=True,
                    errors='coerce',
                ),
                'UTC',
            )
            if has_data_for_dates(data, first_date, last_date):
                return data

            # Don't re-download if we've successfully downloaded and written a
            # file in the last hour.
            last_download_time = last_modified_time(path)
            if (now - last_download_time) <= ONE_HOUR:
                logger.warn(
                    "Refusing to download new {resource} data because a "
                    "download succeeded at {time}.",
                    resource=resource_name,
                    time=last_download_time,
                )
                return data

        except (OSError, IOError, ValueError) as e:
            # These can all be raised by various versions of pandas on various
            # classes of malformed input.  Treat them all as cache misses.
            logger.info(
                "Loading data for {path} failed with error [{error}].",
                path=path,
                error=e,
            )

    logger.info(
        "Cache at {path} does not have data from {start} to {end}.",
        start=first_date,
        end=last_date,
        path=path,
    )
    return None
def test_nearest_unequal_elements_short_dts(self, tz):
    def as_ts(label):
        if label is None:
            return None
        return pd.Timestamp(label, tz=tz)

    def check(dts, cases):
        # Each case is (query date, expected before, expected after).
        for query, before, after in cases:
            self.assertEqual(
                nearest_unequal_elements(dts, as_ts(query)),
                (as_ts(before), as_ts(after)),
            )

    # Length 1.
    check(
        safe_tz_localize(pd.to_datetime(['2014-01-01']), tz),
        (
            ('2013-12-31', None, '2014-01-01'),
            ('2014-01-01', None, None),
            ('2014-01-02', '2014-01-01', None),
        ),
    )

    # Length 0
    check(
        safe_tz_localize(pd.to_datetime([]), tz),
        (
            ('2013-12-31', None, None),
            ('2014-01-01', None, None),
            ('2014-01-02', None, None),
        ),
    )
def _prelude(self, dt, field):
    """
    Gather the values shared by every lookup of ``field`` at minute ``dt``.

    Returns ``(market_open, prev_dt, dt_value, entries)`` where ``prev_dt``
    is ``dt.value`` minus one minute, or None when ``dt`` is the market open
    of its session, and ``entries`` is the per-session dict kept in the
    field's cache.
    """
    session = self._trading_calendar.minute_to_session_label(dt)
    dt_value = dt.value

    cached = self._caches[field]
    if cached is None or cached[0] != session:
        # First lookup for this session: start over with an empty entry dict.
        session_open = self._market_opens.loc[session]
        cached = (session, session_open, {})
        self._caches[field] = cached

    _, market_open, entries = cached
    market_open = safe_tz_localize(market_open, 'UTC')

    prev_dt = dt_value - self._one_min if dt != market_open else None
    return market_open, prev_dt, dt_value, entries
def open_and_close_for_session(self, session_label):
    """
    Look up the open and close of a single session.

    Parameters
    ----------
    session_label: pd.Timestamp
        The session whose open and close are desired.

    Returns
    -------
    (Timestamp, Timestamp)
        The open and close for the given session.
    """
    schedule = self.schedule

    # The schedule columns should be timezone aware, but pandas 0.16.1 does
    # not appear to support this, so each value is localized on the way out:
    # http://pandas.pydata.org/pandas-docs/stable/whatsnew.html#datetime-with-tz  # noqa
    bounds = []
    for column in ('market_open', 'market_close'):
        bounds.append(
            safe_tz_localize(schedule.at[session_label, column], 'UTC')
        )
    return tuple(bounds)
def _dt_to_epoch_ns(dt_series):
    """Express a timeseries as integer nanoseconds since the epoch.

    Parameters
    ----------
    dt_series : pd.Series
        The timeseries to convert.

    Returns
    -------
    idx
        The values of ``dt_series`` as a UTC datetime index, viewed as
        ``np.int64``.
    """
    utc_index = safe_tz_localize(pd.to_datetime(dt_series.values), 'UTC')
    return utc_index.view(np.int64)
def load_bars_from_yahoo(indexes=None, stocks=None, start=None, end=None,
                         adjusted=True):
    """
    Build a panel of Yahoo bar data, one item per requested security, with
    these columns:
        - open
        - high
        - low
        - close
        - volume
        - price

    'price' is Yahoo's 'Adjusted Close', which removes the impact of splits
    and dividends. When 'adjusted' is True the open, high, low and close
    values are scaled by price / close as well.

    :param indexes: Financial indexes to load.
    :type indexes: dict
    :param stocks: Stock closing prices to load.
    :type stocks: list
    :param start: Retrieve prices from start date on.
    :type start: datetime
    :param end: Retrieve prices until end date.
    :type end: datetime
    :param adjusted: Adjust open/high/low/close for splits and dividends.
        The 'price' field is always adjusted.
    :type adjusted: bool
    """
    raw = _load_raw_yahoo_data(indexes, stocks, start, end)
    panel = pd.Panel(raw)

    # Rename columns
    panel.minor_axis = ['open', 'high', 'low', 'close', 'volume', 'price']
    panel.major_axis = safe_tz_localize(panel.major_axis, pytz.utc)

    if not adjusted:
        return panel

    for ticker in panel.items:
        # The ratio is taken once, before 'close' itself gets rescaled.
        ratio = (panel[ticker]['price'] / panel[ticker]['close'])
        scale = ratio.fillna(0).values
        for column in ('open', 'high', 'low', 'close'):
            panel[ticker][column] *= scale

    return panel
def days_at_time(days, t, tz, day_offset=0):
    """
    Create an index of days at time ``t``, interpreted in timezone ``tz``.

    The returned index is localized to UTC, including when ``days`` is
    empty.

    Parameters
    ----------
    days : DatetimeIndex
        An index of dates (represented as midnight).
    t : datetime.time
        The time to apply as an offset to each day in ``days``.
    tz : pytz.timezone
        The timezone to use to interpret ``t``.
    day_offset : int
        The number of days we want to offset @days by

    Returns
    -------
    DatetimeIndex
        ``days`` shifted by ``day_offset`` days plus the time ``t``, read as
        wall-clock time in ``tz`` and converted to UTC.

    Examples
    --------
    In the example below, the times switch from 13:45 to 12:45 UTC because
    March 13th is the daylight savings transition for US/Eastern.  All the
    times are still 8:45 when interpreted in US/Eastern.

    >>> import pandas as pd; import datetime; import pprint
    >>> dts = pd.date_range('2016-03-12', '2016-03-14')
    >>> dts_at_845 = days_at_time(dts, datetime.time(8, 45), 'US/Eastern')
    >>> pprint.pprint([str(dt) for dt in dts_at_845])
    ['2016-03-12 13:45:00+00:00',
     '2016-03-13 12:45:00+00:00',
     '2016-03-14 12:45:00+00:00']
    """
    # Offset days without tz to avoid timezone issues.
    days = DatetimeIndex(days).tz_localize(None)

    if len(days) == 0:
        # Nothing to shift, but callers still expect a UTC index. The index
        # was made naive just above, so a plain tz_localize cannot fail on
        # an already-aware input.
        return days.tz_localize('UTC')

    delta = pd.Timedelta(
        days=day_offset,
        hours=t.hour,
        minutes=t.minute,
        seconds=t.second,
    )
    return safe_tz_localize((days + delta), tz).tz_convert('UTC')
def get_treasury_data(start_date, end_date):
    """
    Download the Federal Reserve H15 series and return the rows between
    ``start_date`` and ``end_date`` as fractions (2.57% becomes 0.0257),
    localized to UTC.
    """
    url = (
        "https://www.federalreserve.gov/datadownload/Output.aspx"
        "?rel=H15"
        "&series=bf17364827e38702b42a58cf8eaa3f78"
        "&lastObs="
        "&from="  # An unbounded query is ~2x faster than specifying dates.
        "&to="
        "&filetype=csv"
        "&label=include"
        "&layout=seriescolumn"
        "&type=package"
    )
    raw = pd.read_csv(
        url,
        skiprows=5,  # First 5 rows are useless headers.
        parse_dates=['Time Period'],
        na_values=['ND'],  # Presumably this stands for "No Data".
        index_col=0,
    )
    window = raw.loc[start_date:end_date].dropna(how='all')
    renamed = window.rename(columns=parse_treasury_csv_column)

    # Convert from 2.57% to 0.0257.
    return safe_tz_localize(renamed, 'UTC') * 0.01
def get_benchmark_returns(symbol, first_date, last_date):
    """
    Get a Series of benchmark returns from Google associated with `symbol`.
    Default is `SPY`.

    Parameters
    ----------
    symbol : str
        Benchmark symbol for which we're getting the returns.
    first_date : pd.Timestamp
        First date for which we want to get data.
    last_date : pd.Timestamp
        Last date for which we want to get data.

    Returns
    -------
    pd.Series
        Daily percentage changes of the close, indexed by UTC-localized
        date and sorted chronologically.

    The furthest date that Google goes back to is 1993-02-01. It has missing
    data for 2008-12-15, 2009-08-11, and 2012-02-02, so we add data for the
    dates for which Google is missing data, carrying the previous session's
    close forward. A placeholder is only added when the date is absent from
    the download, lies after the first downloaded row (so there is a close
    to carry forward) and does not lie beyond the requested range.

    We're also limited to 4000 days worth of data per request. If we make a
    request for data that extends past 4000 trading days, we'll still only
    receive 4000 days of data.

    first_date is **not** included because we need the close from day N - 1 to
    compute the returns for day N.
    """
    if symbol == '^GSPC':
        symbol = 'spy'

    data = pd_reader.DataReader(symbol, 'google', first_date, last_date)
    # Work on a copy so the placeholders are not written into a column that
    # was merely selected from the downloaded frame.
    closes = data['Close'].copy()

    if len(closes):
        first_row = closes.index.min()
        upper = closes.index.max()
        if last_date is not None:
            requested_end = pd.Timestamp(last_date)
            if requested_end.tzinfo is not None:
                # The placeholders written below are naive timestamps, so
                # compare against a naive bound.
                requested_end = requested_end.tz_localize(None)
            upper = max(upper, requested_end)

        for day in ('2008-12-15', '2009-08-11', '2012-02-02'):
            missing = pd.Timestamp(day)
            if first_row < missing <= upper and missing not in closes.index:
                closes[missing] = np.nan

    # Sort BEFORE forward-filling: the placeholders were appended at the end
    # of the series, so filling first would copy the last close of the whole
    # range into them instead of the close of the preceding session.
    closes = closes.sort_index().ffill()
    return safe_tz_localize(closes, 'UTC').pct_change(1).iloc[1:]
def _all_minutes_with_interval(self, interval):
    """
    Returns a DatetimeIndex representing all the minutes in this calendar.

    Parameters
    ----------
    interval : int
        Spacing, in minutes, between consecutive entries within a session.

    Returns
    -------
    DatetimeIndex
        For every (open, close) pair in ``self._opens`` / ``self._closes``,
        the timestamps from the open up to the close in steps of
        ``interval`` minutes, concatenated and localized to UTC.
    """
    opens_in_ns = \
        self._opens.values.astype('datetime64[ns]')

    closes_in_ns = \
        self._closes.values.astype('datetime64[ns]')

    # Length of each session (close - open), one entry per session.
    deltas = closes_in_ns - opens_in_ns

    nanos_in_interval = interval * NANOS_IN_MINUTE

    # + 1 because we want 390 minutes per standard day, not 389: the count
    # has to include the opening minute itself.
    daily_sizes = (deltas / nanos_in_interval) + 1
    num_minutes = np.sum(daily_sizes).astype(np.int64)

    # One allocation for the entire thing. This assumes that each day
    # represents a contiguous block of minutes.
    all_minutes = np.empty(num_minutes, dtype='datetime64[ns]')

    # Write position of the next session inside ``all_minutes``.
    idx = 0
    for day_idx, size in enumerate(daily_sizes):
        # lots of small allocations, but it's fast enough for now.

        # size is a np.timedelta64, so we need to int it
        size_int = int(size)
        # The stop value is one minute past the close because np.arange
        # excludes its stop.
        all_minutes[idx:(idx + size_int)] = \
            np.arange(
                opens_in_ns[day_idx],
                closes_in_ns[day_idx] + NANOS_IN_MINUTE,
                nanos_in_interval
            )

        idx += size_int
    return safe_tz_localize(DatetimeIndex(all_minutes), "UTC")
class EventLoaderUtilsTestCase(ZiplineTestCase):
    """
    Tests for ``normalize_timestamp_to_query_time``.
    """
    # These cases test the following:
    # 1. Shuffling timestamps in DST/EST produces the correct normalized
    # timestamps
    # 2. Timestamps at query time boundaries are normalized correctly

    # Timestamps just before, exactly at and just after the 8:45 query time
    # used by the test below.
    boundary_dates = [pd.Timestamp('2013-01-04 8:44:59'),
                      pd.Timestamp('2013-01-04 8:45:00'),
                      pd.Timestamp('2013-01-04 8:46:00')]
    us_boundary_dates = [safe_tz_localize(date, 'US/Eastern')
                         for date in boundary_dates]
    moscow_boundary_dates = [safe_tz_localize(date, 'Europe/Moscow')
                             for date in boundary_dates]
    mixed_tz_dates = [pd.Timestamp('2013-12-30'),
                      pd.Timestamp('2013-01-24'),
                      pd.Timestamp('2013-01-31 20:00:00'),
                      pd.Timestamp('2013-04-04'),
                      pd.Timestamp('2013-04-21'),
                      pd.Timestamp('2013-06-01')]

    # The localized boundary dates are converted to UTC and then made naive;
    # the entries of ``mixed_tz_dates`` are naive to begin with.
    us_dates = pd.to_datetime(us_boundary_dates + mixed_tz_dates,
                              utc=True).tz_localize(None)
    moscow_dates = pd.to_datetime(moscow_boundary_dates + mixed_tz_dates,
                                  utc=True).tz_localize(None)
    all_combos = list(map(np.array, itertools.permutations(np.arange(len(
        boundary_dates + mixed_tz_dates)
    ))))
    # The combined list has 9 dates, so ``all_combos`` holds 9! = 362,880
    # permutations, which makes testing all of them take too long.
    # Sampling every 100th permutation (about 3,600 of them) still gives us
    # good coverage of the different interleavings.
    # NOTE(review): every permutation is materialized before sampling;
    # consider sampling lazily if building this list is slow.
    combos = all_combos[::100]
    expected_us = pd.Series(
        [pd.Timestamp('2013-01-04'),
         pd.Timestamp('2013-01-05'),
         pd.Timestamp('2013-01-05'),
         pd.Timestamp('2013-12-30'),
         pd.Timestamp('2013-01-24'),
         pd.Timestamp('2013-02-01'),
         pd.Timestamp('2013-04-04'),
         pd.Timestamp('2013-04-21'),
         pd.Timestamp('2013-06-01')]
    ).values
    # Russia's TZ offset is +4
    expected_russia = pd.Series(
        [pd.Timestamp('2013-01-04'),
         pd.Timestamp('2013-01-05'),
         pd.Timestamp('2013-01-05'),
         pd.Timestamp('2013-12-30'),
         pd.Timestamp('2013-01-24'),
         pd.Timestamp('2013-01-31'),
         pd.Timestamp('2013-04-04'),
         pd.Timestamp('2013-04-21'),
         pd.Timestamp('2013-06-01')]
    ).values

    # Test with timezones on either side of the meridian
    @parameterized.expand([(expected_us, 'US/Eastern', us_dates),
                           (expected_russia, 'Europe/Moscow', moscow_dates)])
    def test_normalize_to_query_time(self, expected, tz, dates):
        # Order matters in pandas 0.18.2. Prior to that, using tz_convert on
        # a DatetimeIndex with DST/EST timestamps mixed resulted in some of
        # them being an hour off (1 hour past midnight).
        for scrambler in self.combos:
            # Input and expected values are reordered with the same
            # permutation; both sides are sorted before being compared.
            df = pd.DataFrame({"timestamp": dates[scrambler]})
            result = normalize_timestamp_to_query_time(df,
                                                       time(8, 45),
                                                       tz,
                                                       inplace=False,
                                                       ts_field='timestamp')
            timestamps = result['timestamp'].values
            check_arrays(np.sort(timestamps),
                         np.sort(expected[scrambler]))
def session_close(self, session_label):
    """
    Return the 'market_close' entry of the schedule for ``session_label``,
    localized to UTC.
    """
    close = self.schedule.at[session_label, 'market_close']
    return safe_tz_localize(close, 'UTC')
def ingest_csv(self, path, data_frequency, empty_rows_behavior='strip',
               duplicates_threshold=100):
    """
    Ingest price data from a CSV file.

    The file needs a header row naming the columns ``symbol``,
    ``last_traded``, ``open``, ``high``, ``low``, ``close`` and ``volume``.
    ``last_traded`` is parsed as a date and localized to UTC.

    Parameters
    ----------
    path: str
        Location of the CSV file.
    data_frequency: str
        Frequency of the data; 'daily' and 'minute' are the values the
        method distinguishes.
    empty_rows_behavior: str
        Forwarded unchanged to ``ingest_df``.
    duplicates_threshold: int
        Forwarded unchanged to ``ingest_df``.

    Returns
    -------
    list[str]
        A list of potential problems detected during ingestion.

    Raises
    ------
    ValueError
        If a symbol of the file has no market on the exchange.
    """
    log.info('ingesting csv file: {}'.format(path))

    if self.exchange is None:
        # Avoid circular dependencies
        from catalyst.exchange.utils.factory import get_exchange
        self.exchange = get_exchange(self.exchange_name)

    problems = []
    df = pd.read_csv(path,
                     header=0,
                     sep=',',
                     dtype=dict(symbol=np.object_,
                                last_traded=np.object_,
                                open=np.float64,
                                high=np.float64,
                                low=np.float64,
                                close=np.float64,
                                volume=np.float64),
                     parse_dates=['last_traded'],
                     index_col=None)
    min_start_dt = None
    max_end_dt = None

    symbols = df['symbol'].unique()

    # Apply the timezone before creating an index for simplicity
    df['last_traded'] = safe_tz_localize(df['last_traded'].dt, pytz.UTC)
    df.set_index(['symbol', 'last_traded'], drop=True, inplace=True)

    # These depend only on the frame, which is not modified below, so they
    # are computed once instead of once per symbol.
    # NOTE(review): the range spans every symbol in the file rather than
    # each symbol individually - confirm that this is intended.
    traded_at = df.index.get_level_values(1)
    start_dt = traded_at.min()
    end_dt = traded_at.max()
    end_dt_key = 'end_{}'.format(data_frequency)

    assets = dict()
    for symbol in symbols:
        market = self.exchange.get_market(symbol)
        if market is None:
            raise ValueError('symbol not available in the exchange.')

        params = dict(
            exchange=self.exchange.name,
            data_source='local',
            exchange_symbol=market['id'],
        )
        mixin_market_params(self.exchange_name, params, market)

        asset_def = self.exchange.get_asset_def(market, True)
        if asset_def is not None:
            # Widen the known asset range so that it also covers this file.
            params['symbol'] = asset_def['symbol']

            params['start_date'] = asset_def['start_date'] \
                if asset_def['start_date'] < start_dt else start_dt

            params['end_date'] = asset_def[end_dt_key] \
                if asset_def[end_dt_key] > end_dt else end_dt

            params['end_daily'] = end_dt \
                if data_frequency == 'daily' else asset_def['end_daily']

            params['end_minute'] = end_dt \
                if data_frequency == 'minute' else asset_def['end_minute']

        else:
            params['symbol'] = get_catalyst_symbol(market)

            params['end_daily'] = end_dt \
                if data_frequency == 'daily' else 'N/A'
            params['end_minute'] = end_dt \
                if data_frequency == 'minute' else 'N/A'

        if min_start_dt is None or start_dt < min_start_dt:
            min_start_dt = start_dt

        if max_end_dt is None or end_dt > max_end_dt:
            max_end_dt = end_dt

        asset = TradingPair(**params)
        assets[market['id']] = asset

    save_exchange_symbols(self.exchange_name, assets, True)

    writer = self.get_writer(
        start_dt=min_start_dt.replace(hour=00, minute=00),
        end_dt=max_end_dt.replace(hour=23, minute=59),
        data_frequency=data_frequency,
    )

    for symbol in assets:
        # here the symbol is the market['id']
        asset = assets[symbol]
        ohlcv_df = df.loc[
            (df.index.get_level_values(0) == asset.symbol)
        ]  # type: pd.DataFrame
        ohlcv_df.index = ohlcv_df.index.droplevel(0)

        period_start = start_dt.replace(hour=00, minute=00)
        period_end = end_dt.replace(hour=23, minute=59)
        periods = self.get_calendar_periods_range(
            period_start, period_end, data_frequency
        )

        # We're not really resampling but ensuring that each frame
        # contains data
        ohlcv_df = ohlcv_df.reindex(periods, method='ffill')
        ohlcv_df['volume'] = ohlcv_df['volume'].fillna(0)

        problems += self.ingest_df(
            ohlcv_df=ohlcv_df,
            data_frequency=data_frequency,
            asset=asset,
            writer=writer,
            empty_rows_behavior=empty_rows_behavior,
            duplicates_threshold=duplicates_threshold,
        )

    # Return a real list as documented; ``filter`` would hand back a
    # one-shot iterator on Python 3.
    return [problem for problem in problems if problem is not None]
def get_trading_days(start, end, trading_day=trading_day):
    """
    Return the UTC-localized dates from ``start.date()`` through
    ``end.date()`` generated with ``trading_day`` as the frequency.
    """
    sessions = pd.date_range(
        start=start.date(),
        end=end.date(),
        freq=trading_day,
    )
    return safe_tz_localize(sessions, 'UTC')