def _infer_timezone(prices):
    """
    Infers the timezone from the component securities if possible.
    """
    if "Timezone" not in prices.index.get_level_values("Field"):
        raise ParameterError(
            "cannot infer timezone because Timezone field is missing, "
            "please specify timezone or include Timezone in master_fields")

    timezones = prices.loc["Timezone"].stack().unique()

    if len(timezones) > 1:
        raise ParameterError(
            "cannot infer timezone because multiple timezones are present "
            "in data, please specify timezone explicitly (timezones: {0})".format(
                ", ".join(timezones)))

    return timezones[0]
def intraday_to_daily(results):
    """
    Roll up a DataFrame of intraday performance results to daily, dropping
    the "Time" level from the multi-index.

    Parameters
    ----------
    results : DataFrame, required
        a DataFrame of intraday Moonshot backtest results, with a "Time"
        level in the index

    Returns
    -------
    DataFrame
        a DataFrame of daily Moonshot backtest results, without a "Time"
        level in the index

    Examples
    --------
    >>> intraday_results = read_moonshot_csv("moonshot_intraday_backtest.csv")
    >>> daily_results = intraday_to_daily(intraday_results)
    """
    try:
        import pandas as pd
    except ImportError:
        raise ImportError("pandas must be installed to use this function")

    if "Time" not in results.index.names:
        raise ParameterError(
            "results DataFrame must have 'Time' level in index")

    fields_in_results = results.index.get_level_values("Field").unique()

    daily_results = {}

    # how to aggregate each field when rolling up to daily
    aggregation_methods = {
        "sum": ["Return", "Turnover", "Commission", "Slippage"],
        "max": ["AbsExposure", "TotalHoldings", "AbsWeight"],
        "mean": ["NetExposure", "Weight"],
        "last": ["Benchmark"],
    }

    for aggregation_method, fields_to_aggregate in aggregation_methods.items():
        for field in fields_to_aggregate:
            if field not in fields_in_results:
                continue

            # cast to float64 ("float64" avoids the deprecated pd.np alias)
            field_results = results.loc[field].astype("float64")
            grouped = field_results.groupby(
                field_results.index.get_level_values("Date"))
            daily_results[field] = getattr(grouped, aggregation_method)()

    daily_results = pd.concat(daily_results, names=["Field", "Date"])

    return daily_results
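# Illustration of the per-field roll-up above: "Return" is summed across the
# times within each date, while "AbsExposure" takes the intraday max. A rough
# sketch on a toy DataFrame (the conid 12345 and all values are hypothetical):
#
#     import pandas as pd
#
#     idx = pd.MultiIndex.from_tuples(
#         [("Return", "2019-01-02", "09:30:00"),
#          ("Return", "2019-01-02", "15:45:00"),
#          ("AbsExposure", "2019-01-02", "09:30:00"),
#          ("AbsExposure", "2019-01-02", "15:45:00")],
#         names=["Field", "Date", "Time"])
#     intraday = pd.DataFrame({12345: [0.01, -0.005, 0.5, 1.0]}, index=idx)
#
#     daily = intraday_to_daily(intraday)
#     # daily.loc["Return"] holds 0.005 (sum of the two times);
#     # daily.loc["AbsExposure"] holds 1.0 (intraday max)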
def get_prices(codes, start_date=None, end_date=None,
               universes=None, conids=None,
               exclude_universes=None, exclude_conids=None,
               times=None, fields=None,
               timezone=None, infer_timezone=None,
               cont_fut=None, master_fields=None):
    """
    Query one or more history databases and/or real-time aggregate
    databases and load prices into a DataFrame.

    For bar sizes smaller than 1-day, the resulting DataFrame will have a
    MultiIndex with levels (Field, Date, Time). For bar sizes of 1-day or
    larger, the MultiIndex will have levels (Field, Date).

    Parameters
    ----------
    codes : str or list of str, required
        the code(s) of one or more databases to query. If multiple databases
        are specified, they must have the same bar size. List databases in
        order of priority (highest priority first). If multiple databases
        provide the same field for the same conid on the same datetime, the
        first database's value will be used.

    start_date : str (YYYY-MM-DD), optional
        limit to data on or after this date

    end_date : str (YYYY-MM-DD), optional
        limit to data on or before this date

    universes : list of str, optional
        limit to these universes (default is to return all securities in database)

    conids : list of int, optional
        limit to these conids

    exclude_universes : list of str, optional
        exclude these universes

    exclude_conids : list of int, optional
        exclude these conids

    times : list of str (HH:MM:SS), optional
        limit to these times, specified in the timezone of the relevant
        exchange. See additional information in the Notes section regarding
        the timezone to use.

    fields : list of str, optional
        only return these fields. (If querying multiple databases that have
        different fields, provide the complete list of desired fields; only
        the supported fields for each database will be queried.)

    timezone : str, optional
        convert timestamps to this timezone, for example America/New_York
        (see `pytz.all_timezones` for choices); ignored for non-intraday
        bar sizes

    infer_timezone : bool
        infer the timezone from the securities master Timezone field;
        defaults to True if using intraday bars and no `timezone` specified;
        ignored for non-intraday bars, or if `timezone` is passed

    cont_fut : str
        stitch futures into continuous contracts using this method (default
        is not to stitch together). Only applicable to history databases.
        Possible choices: concat

    master_fields : list of str, optional
        [DEPRECATED] append these fields from the securities master database
        (pass ['?'] or any invalid fieldname to see available fields). This
        parameter is deprecated and will be removed in a future release. For
        better performance, use `quantrocket.master.get_securities_reindexed_like`
        to get securities master data shaped like prices.

    Returns
    -------
    DataFrame
        a MultiIndex DataFrame of prices

    Notes
    -----
    The `times` parameter, if provided, is applied differently for history
    databases vs real-time aggregate databases. For history databases, the
    parameter is applied when querying the database. For real-time aggregate
    databases, the parameter is not applied when querying the database;
    rather, all available times are retrieved and the `times` filter is
    applied to the resulting DataFrame after casting it to the appropriate
    timezone (as inferred from the securities master Timezone field or as
    explicitly specified with the `timezone` parameter). The rationale for
    this behavior is that history databases store intraday data in the
    timezone of the relevant exchange whereas real-time aggregate databases
    store data in UTC. By applying the `times` filter as described, users can
    specify the times in the timezone of the relevant exchange for both types
    of databases.

    Examples
    --------
    Load intraday prices:

    >>> prices = get_prices('stk-sample-5min', fields=["Close", "Volume"])
    >>> prices.head()
    ConId                       265598    38708077
    Field   Date        Time
    Close   2017-07-26  09:30:00    153.62    2715.0
                        09:35:00    153.46    2730.0
                        09:40:00    153.21    2725.0
                        09:45:00    153.28    2725.0
                        09:50:00    153.18    2725.0

    Isolate the closes:

    >>> closes = prices.loc["Close"]
    >>> closes.head()
    ConId                   265598    38708077
    Date        Time
    2017-07-26  09:30:00    153.62    2715.0
                09:35:00    153.46    2730.0
                09:40:00    153.21    2725.0
                09:45:00    153.28    2725.0
                09:50:00    153.18    2725.0

    Isolate the 15:45:00 prices:

    >>> session_closes = closes.xs("15:45:00", level="Time")
    >>> session_closes.head()
    ConId         265598    38708077
    Date
    2017-07-26    153.29    2700.00
    2017-07-27    150.10    2660.00
    2017-07-28    149.43    2650.02
    2017-07-31    148.99    2650.34
    2017-08-01    149.72    2675.50
    """
    # Import pandas lazily since it can take a moment to import
    try:
        import pandas as pd
    except ImportError:
        raise ImportError("pandas must be installed to use this function")

    try:
        import pytz
    except ImportError:
        raise ImportError("pytz must be installed to use this function")

    if timezone and timezone not in pytz.all_timezones:
        raise ParameterError(
            "invalid timezone: {0} (see `pytz.all_timezones` for choices)".format(
                timezone))

    dbs = codes
    if not isinstance(dbs, (list, tuple)):
        dbs = [dbs]

    fields = fields or []
    if not isinstance(fields, (list, tuple)):
        fields = [fields]

    # separate history dbs from realtime dbs
    history_dbs = set(list_history_databases())
    realtime_dbs = list_realtime_databases()
    realtime_agg_dbs = set(itertools.chain(*realtime_dbs.values()))

    history_dbs.intersection_update(set(dbs))
    realtime_agg_dbs.intersection_update(set(dbs))

    unknown_dbs = set(dbs) - history_dbs - realtime_agg_dbs

    if unknown_dbs:
        tick_dbs = set(realtime_dbs.keys()).intersection(unknown_dbs)
        # Improve error message if possible
        if tick_dbs:
            raise ParameterError(
                "{} is a real-time tick database, only history databases or "
                "real-time aggregate databases are supported".format(
                    ", ".join(tick_dbs)))
        raise ParameterError(
            "no history or real-time aggregate databases called {}".format(
                ", ".join(unknown_dbs)))

    db_bar_sizes = set()
    db_bar_sizes_parsed = set()
    db_domains = set()
    history_db_fields = {}
    realtime_db_fields = {}

    for db in history_dbs:
        db_config = get_history_db_config(db)
        bar_size = db_config.get("bar_size")
        db_bar_sizes.add(bar_size)
        # to validate uniform bar sizes, we need to parse them in case dbs
        # store different but equivalent timedelta strings. History db
        # strings may need massaging to be parsable.
        if bar_size.endswith("s"):
            # strip s from secs, mins, hours to get valid pandas timedelta
            bar_size = bar_size[:-1]
        elif bar_size == "1 week":
            bar_size = "7 day"
        elif bar_size == "1 month":
            bar_size = "30 day"
        db_bar_sizes_parsed.add(pd.Timedelta(bar_size))
        db_domain = db_config.get("domain", "main")
        db_domains.add(db_domain)
        history_db_fields[db] = db_config.get("fields", [])

    for db in realtime_agg_dbs:
        db_config = get_realtime_db_config(db)
        bar_size = db_config.get("bar_size")
        db_bar_sizes.add(bar_size)
        db_bar_sizes_parsed.add(pd.Timedelta(bar_size))
        # NOTE: aggregate dbs don't include a domain key; if available, this
        # would probably have to be obtained from the associated tick db.
        # This isn't an issue unless/until real-time data comes from multiple
        # vendors. As it now stands, real-time data is always from "main".
        db_domain = db_config.get("domain", "main")
        db_domains.add(db_domain)
        realtime_db_fields[db] = db_config.get("fields", [])

    if len(db_bar_sizes_parsed) > 1:
        raise ParameterError(
            "all databases must contain same bar size but {0} have different "
            "bar sizes: {1}".format(", ".join(dbs), ", ".join(db_bar_sizes)))

    if len(db_domains) > 1:
        raise ParameterError(
            "all databases must use the same securities master domain but {0} "
            "use different domains: {1}".format(", ".join(dbs), ", ".join(db_domains)))

    all_prices = []

    for db in dbs:

        if db in history_dbs:

            # different DBs might support different fields so only request the
            # subset of supported fields
            fields_for_db = set(fields).intersection(set(history_db_fields[db]))

            kwargs = dict(
                start_date=start_date,
                end_date=end_date,
                universes=universes,
                conids=conids,
                exclude_universes=exclude_universes,
                exclude_conids=exclude_conids,
                times=times,
                cont_fut=cont_fut,
                fields=list(fields_for_db),
                tz_naive=False)

            tmp_filepath = "{dir}{sep}history.{db}.{pid}.{time}.csv".format(
                dir=TMP_DIR, sep=os.path.sep, db=db,
                pid=os.getpid(), time=time.time())

            try:
                download_history_file(db, tmp_filepath, **kwargs)
            except NoHistoricalData:
                # don't complain about NoHistoricalData if we're checking
                # multiple databases, unless none of them have data
                if len(dbs) == 1:
                    raise
                else:
                    continue

            prices = pd.read_csv(tmp_filepath)
            all_prices.append(prices)

            os.remove(tmp_filepath)

        if db in realtime_agg_dbs:

            fields_for_db = set(fields).intersection(set(realtime_db_fields[db]))

            kwargs = dict(
                start_date=start_date,
                end_date=end_date,
                universes=universes,
                conids=conids,
                exclude_universes=exclude_universes,
                exclude_conids=exclude_conids,
                fields=list(fields_for_db))

            tmp_filepath = "{dir}{sep}realtime.{db}.{pid}.{time}.csv".format(
                dir=TMP_DIR, sep=os.path.sep, db=db,
                pid=os.getpid(), time=time.time())

            try:
                download_market_data_file(db, tmp_filepath, **kwargs)
            except NoRealtimeData:
                # don't complain about NoRealtimeData if we're checking
                # multiple databases, unless none of them have data
                if len(dbs) == 1:
                    raise
                else:
                    continue

            prices = pd.read_csv(tmp_filepath)
            all_prices.append(prices)

            os.remove(tmp_filepath)

    # complain if multiple dbs and none had data
    if len(dbs) > 1 and not all_prices:
        raise NoHistoricalData(
            "no price data matches the query parameters in any of {0}".format(
                ", ".join(dbs)))

    prices = pd.concat(all_prices, sort=False)

    try:
        prices = prices.pivot(index="ConId", columns="Date").T
    except ValueError as e:
        if "duplicate" not in repr(e):
            raise
        # There are duplicates, likely due to querying multiple databases,
        # both of which return one or more identical dates for identical
        # conids. To resolve, we group by conid and date and take the first
        # available value for each field. This means that the order of
        # `codes` matters. The use of groupby.first() instead of simply
        # drop_duplicates allows us to retain one field from one db and
        # another field from another db.
        grouped = prices.groupby(["ConId", "Date"])
        prices = pd.concat(
            dict([(col, grouped[col].first()) for col in prices.columns
                  if col not in ("ConId", "Date")]), axis=1)
        prices = prices.reset_index().pivot(index="ConId", columns="Date").T

    prices.index.set_names(["Field", "Date"], inplace=True)

    master_fields = master_fields or []
    if master_fields:
        import warnings
        # DeprecationWarning is ignored by default but we want the user
        # to see it
        warnings.simplefilter("always", DeprecationWarning)
        warnings.warn(
            "`master_fields` parameter is deprecated and will be removed in a "
            "future release. For better performance, please use "
            "`quantrocket.master.get_securities_reindexed_like` "
            "to get securities master data shaped like prices.", DeprecationWarning)

        if isinstance(master_fields, tuple):
            master_fields = list(master_fields)
        elif not isinstance(master_fields, list):
            master_fields = [master_fields]

    # master fields that are required internally but shouldn't be returned to
    # the user (potentially Timezone)
    internal_master_fields = []

    is_intraday = list(db_bar_sizes_parsed)[0] < pd.Timedelta("1 day")

    if is_intraday and not timezone and infer_timezone is not False:
        infer_timezone = True
        if not master_fields or "Timezone" not in master_fields:
            internal_master_fields.append("Timezone")

    # Next, get the master file
    if master_fields or internal_master_fields:

        conids = list(prices.columns)
        domain = list(db_domains)[0] if db_domains else None

        f = six.StringIO()
        download_master_file(
            f,
            conids=conids,
            fields=master_fields + internal_master_fields,
            domain=domain)
        securities = pd.read_csv(f, index_col="ConId")

        if "Delisted" in securities.columns:
            securities.loc[:, "Delisted"] = securities.Delisted.astype(bool)

        if "Etf" in securities.columns:
            securities.loc[:, "Etf"] = securities.Etf.astype(bool)

        # Infer timezone if needed
        if not timezone and infer_timezone:
            timezones = securities.Timezone.unique()

            if len(timezones) > 1:
                raise ParameterError(
                    "cannot infer timezone because multiple timezones are present "
                    "in data, please specify timezone explicitly (timezones: {0})".format(
                        ", ".join(timezones)))

            timezone = timezones[0]

        # Drop any internal-only fields
        if internal_master_fields:
            securities = securities.drop(internal_master_fields, axis=1)

        if not securities.empty:

            # Append securities, indexed to the min date, to allow easy ffill on demand
            securities = pd.DataFrame(securities.T, columns=prices.columns)
            securities.index.name = "Field"
            idx = pd.MultiIndex.from_product(
                (securities.index,
                 [prices.index.get_level_values("Date").min()]),
                names=["Field", "Date"])

            securities = securities.reindex(index=idx, level="Field")
            prices = pd.concat((prices, securities))

    if is_intraday:
        dates = pd.to_datetime(prices.index.get_level_values("Date"), utc=True)

        if timezone:
            dates = dates.tz_convert(timezone)
    else:
        dates = pd.to_datetime(prices.index.get_level_values("Date"))

    prices.index = pd.MultiIndex.from_arrays(
        (prices.index.get_level_values("Field"),
         dates),
        names=("Field", "Date"))

    # Split date and time
    dts = prices.index.get_level_values("Date")
    dates = pd.to_datetime(dts.date).tz_localize(None)  # drop tz-aware in Date index
    prices.index = pd.MultiIndex.from_arrays(
        (prices.index.get_level_values("Field"),
         dates,
         dts.strftime("%H:%M:%S")),
        names=["Field", "Date", "Time"])

    # Align dates if there are any duplicates. Explanation: Suppose there are
    # two timezones represented in the data (e.g. history db in security
    # timezone vs real-time db in UTC). After parsing these dates into a
    # common timezone, they will align properly, but we pivoted before
    # parsing the dates (for performance reasons), so they may not be
    # aligned. Thus we need to dedupe the index.
    prices = prices.groupby(prices.index).first()
    prices.index = pd.MultiIndex.from_tuples(prices.index)
    prices.index.set_names(["Field", "Date", "Time"], inplace=True)

    # Drop time if not intraday
    if not is_intraday:
        prices.index = prices.index.droplevel("Time")
        return prices

    # If intraday, fill missing times so that each date has the same set of
    # times, allowing easier comparisons. Example implications:
    # - if history is retrieved intraday, this ensures that today will have
    #   NaN entries for future times
    # - early close dates will have a full set of times, with NaNs after the
    #   early close
    unique_fields = prices.index.get_level_values("Field").unique()
    unique_dates = prices.index.get_level_values("Date").unique()
    unique_times = prices.index.get_level_values("Time").unique()
    interpolated_index = None
    for field in unique_fields:
        if master_fields and field in master_fields:
            min_date = prices.loc[field].index.min()
            field_idx = pd.MultiIndex.from_tuples([(field, min_date[0], min_date[1])])
        else:
            field_idx = pd.MultiIndex.from_product(
                [[field], unique_dates, unique_times]).sort_values()
        if interpolated_index is None:
            interpolated_index = field_idx
        else:
            interpolated_index = interpolated_index.append(field_idx)

    prices = prices.reindex(interpolated_index)
    prices.index.set_names(["Field", "Date", "Time"], inplace=True)

    # Apply times filter if needed (see Notes in docstring)
    if times and realtime_agg_dbs:
        if not isinstance(times, (list, tuple)):
            times = [times]
        prices = prices.loc[prices.index.get_level_values("Time").isin(times)]

    return prices
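# Rough usage sketch for get_prices, combining a history database with a
# real-time aggregate database. The database codes below are hypothetical, and
# it is assumed both databases expose "Close" and "Volume" fields. Because
# codes are listed in priority order, the history database's values win
# wherever both databases provide the same field/conid/datetime:
#
#     prices = get_prices(
#         ["usa-stk-1min", "usa-stk-trades-1min"],   # history db first = higher priority
#         start_date="2019-01-01",
#         fields=["Close", "Volume"],
#         times=["09:30:00", "15:45:00"],            # exchange-timezone times
#         timezone="America/New_York")
#
#     closes = prices.loc["Close"]                    # (Date, Time) x ConId
#     session_closes = closes.xs("15:45:00", level="Time")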
def get_reuters_estimates_reindexed_like(reindex_like, codes, fields=["Actual"],
                                         period_types=["Q"], max_lag=None):
    """
    Return a multiindex (Indicator, Field, Date) DataFrame of point-in-time
    Reuters estimates and actuals for one or more indicator codes, reindexed
    to match the index (dates) and columns (conids) of `reindex_like`.
    Estimates and actuals are forward-filled in order to provide the latest
    reading at any given date, indexed to the UpdatedDate field. UpdatedDate
    is shifted forward 1 day to avoid lookahead bias.

    Parameters
    ----------
    reindex_like : DataFrame, required
        a DataFrame (usually of prices) with dates for the index and conids
        for the columns, to which the shape of the resulting DataFrame will
        be conformed

    codes : list of str, required
        the indicator code(s) to query. Use the `list_reuters_codes`
        function to see available codes.

    fields : list of str
        a list of fields to include in the resulting DataFrame. Defaults to
        simply including the Actual field.

    period_types : list of str, optional
        limit to these fiscal period types. Possible choices: A, Q, S, where
        A=Annual, Q=Quarterly, S=Semi-Annual. Default is Q/Quarterly.

    max_lag : str, optional
        maximum amount of time a data point can be used after the associated
        fiscal period has ended. Setting a limit can prevent using data that
        is reported long after the fiscal period ended, or can limit how far
        data is forward filled in the absence of subsequent data. Specify as
        a Pandas offset alias, e.g. '500D'. By default, no maximum limit is
        applied.

    Returns
    -------
    DataFrame
        a multiindex (Indicator, Field, Date) DataFrame of estimates and
        actuals, shaped like the input DataFrame

    Examples
    --------
    Query book value per share (code BVPS):

    >>> closes = prices.loc["Close"]
    >>> estimates = get_reuters_estimates_reindexed_like(closes, codes=["BVPS"])
    >>> book_values_per_share = estimates.loc["BVPS"].loc["Actual"]
    """
    try:
        import pandas as pd
    except ImportError:
        raise ImportError("pandas must be installed to use this function")

    index_levels = reindex_like.index.names

    if "Time" in index_levels:
        raise ParameterError(
            "reindex_like should not have 'Time' in index, please take a cross-section first, "
            "for example: `prices.loc['Close'].xs('15:45:00', level='Time')`")

    if index_levels != ["Date"]:
        raise ParameterError(
            "reindex_like must have index called 'Date', but has {0}".format(
                ",".join([str(name) for name in index_levels])))

    if not hasattr(reindex_like.index, "date"):
        raise ParameterError("reindex_like must have a DatetimeIndex")

    conids = list(reindex_like.columns)
    start_date = reindex_like.index.min().date()
    # Since financial reports are sparse, start well before the reindex_like
    # min date
    start_date -= pd.Timedelta(days=365 + 180)
    start_date = start_date.isoformat()
    end_date = reindex_like.index.max().date().isoformat()

    f = six.StringIO()
    query_fields = list(fields)  # copy fields on Py2 or 3: https://stackoverflow.com/a/2612815/417414
    if "UpdatedDate" not in query_fields:
        query_fields.append("UpdatedDate")
    download_reuters_estimates(codes, f, conids=conids,
                               start_date=start_date, end_date=end_date,
                               fields=query_fields, period_types=period_types)
    parse_dates = ["UpdatedDate", "FiscalPeriodEndDate"]
    if "AnnounceDate" in fields:
        parse_dates.append("AnnounceDate")
    estimates = pd.read_csv(f, parse_dates=parse_dates)

    # Drop records with no actuals (i.e. no UpdatedDate)
    estimates = estimates.loc[estimates.UpdatedDate.notnull()]

    # Convert UTC UpdatedDate to security timezone, and cast to date for
    # index
    f = six.StringIO()
    download_master_file(f, conids=list(estimates.ConId.unique()),
                         delisted=True, fields=["Timezone"])
    timezones = pd.read_csv(f, index_col="ConId")
    estimates = estimates.join(timezones, on="ConId")
    if estimates.Timezone.isnull().any():
        conids_missing_timezones = list(
            estimates.ConId[estimates.Timezone.isnull()].unique())
        raise MissingData(
            "timezones are missing for some conids so cannot convert UTC "
            "estimates to timezone of security (conids missing timezone: {0})".format(
                ",".join([str(conid) for conid in conids_missing_timezones])))

    # If only 1 timezone in data, use a faster method
    if len(estimates.Timezone.unique()) == 1:
        timezone = list(estimates.Timezone.unique())[0]
        estimates["Date"] = pd.to_datetime(
            estimates.UpdatedDate.values).tz_localize("UTC").tz_convert(timezone)
    else:
        estimates["Date"] = estimates.apply(
            lambda row: row.UpdatedDate.tz_localize("UTC").tz_convert(row.Timezone),
            axis=1)

    # Convert to dates (i.e. time = 00:00:00); pd.Timestamp.combine avoids
    # the deprecated pd.datetime alias
    estimates.loc[:, "Date"] = pd.to_datetime(
        estimates.Date.apply(
            lambda x: pd.Timestamp.combine(x, pd.Timestamp("00:00:00").time())))

    # Drop any fields we don't need
    needed_fields = set(fields)
    needed_fields.update(set(("ConId", "Date", "Indicator")))
    if max_lag:
        needed_fields.add("FiscalPeriodEndDate")
    unneeded_fields = set(estimates.columns) - needed_fields
    if unneeded_fields:
        estimates = estimates.drop(unneeded_fields, axis=1)

    # if reindex_like.index is tz-aware, make estimates tz-aware so they can
    # be joined (tz-aware or tz-naive are both fine, as the Date column has
    # already been converted to the local timezone of the security above)
    if reindex_like.index.tz:
        estimates.loc[:, "Date"] = estimates.Date.dt.tz_localize(
            reindex_like.index.tz.zone)
        deduped_updated_dates = estimates.Date.drop_duplicates()
    else:
        # joining to tz-naive requires using values, whereas joining to
        # tz-aware requires not using it. Why?
        deduped_updated_dates = estimates.Date.drop_duplicates().values

    # Create a unioned index of input DataFrame and UpdatedDate
    union_date_idx = reindex_like.index.union(deduped_updated_dates).sort_values()

    all_estimates = {}
    for code in codes:
        estimates_for_code = estimates.loc[estimates.Indicator == code]
        if estimates_for_code.empty:
            continue
        if "Indicator" not in fields:
            estimates_for_code = estimates_for_code.drop("Indicator", axis=1)
        # There might be duplicate UpdatedDates if a company announced
        # reports for several fiscal periods at once. In this case we keep
        # only the last value (i.e. latest fiscal period)
        estimates_for_code = estimates_for_code.drop_duplicates(
            subset=["ConId", "Date"], keep="last")
        estimates_for_code = estimates_for_code.pivot(
            index="ConId", columns="Date").T

        multiidx = pd.MultiIndex.from_product(
            (estimates_for_code.index.get_level_values(0).unique(), union_date_idx),
            names=["Field", "Date"])

        estimates_for_code = estimates_for_code.reindex(
            index=multiidx, columns=reindex_like.columns)

        # estimates are sparse so ffill (one field at a time)
        all_fields_for_code = {}
        for field in estimates_for_code.index.get_level_values("Field").unique():
            field_for_code = estimates_for_code.loc[field].fillna(method="ffill")

            # Shift to avoid lookahead bias
            field_for_code = field_for_code.shift()

            all_fields_for_code[field] = field_for_code

        # Filter stale values if asked to
        if max_lag:
            fiscal_period_end_dates = all_fields_for_code["FiscalPeriodEndDate"]
            # subtract the max_lag from the index date to get the
            # earliest possible fiscal period end date for that row
            earliest_allowed_fiscal_period_end_dates = fiscal_period_end_dates.apply(
                lambda x: fiscal_period_end_dates.index - pd.Timedelta(max_lag))
            within_max_timedelta = fiscal_period_end_dates.apply(
                pd.to_datetime) >= earliest_allowed_fiscal_period_end_dates

            for field, field_for_code in all_fields_for_code.items():
                field_for_code = field_for_code.where(within_max_timedelta)
                all_fields_for_code[field] = field_for_code

            # Clean up if FiscalPeriodEndDate was just kept around for this purpose
            if "FiscalPeriodEndDate" not in fields:
                del all_fields_for_code["FiscalPeriodEndDate"]

        estimates_for_code = pd.concat(all_fields_for_code, names=["Field", "Date"])

        # In case the estimates included dates not in the input
        # DataFrame, drop those now that we've ffilled
        extra_dates = union_date_idx.difference(reindex_like.index)
        if not extra_dates.empty:
            estimates_for_code.drop(extra_dates, axis=0, level="Date", inplace=True)

        all_estimates[code] = estimates_for_code

    estimates = pd.concat(all_estimates, names=["Indicator", "Field", "Date"])

    return estimates
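# Rough usage sketch for the function above: comparing actuals to the
# consensus estimate to get a surprise measure. The indicator code "EPS" and
# the "Mean" field are assumptions about the available Reuters data, not
# definitions from this module; substitute whatever `list_reuters_codes`
# reports as available:
#
#     closes = prices.loc["Close"]
#     estimates = get_reuters_estimates_reindexed_like(
#         closes, codes=["EPS"], fields=["Actual", "Mean"], max_lag="180D")
#     actuals = estimates.loc["EPS"].loc["Actual"]
#     consensus = estimates.loc["EPS"].loc["Mean"]
#     # surprise as a fraction of consensus; zero consensus masked to NaN
#     surprises = (actuals - consensus) / consensus.where(consensus != 0)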
def get_reuters_financials_reindexed_like(reindex_like, coa_codes, fields=["Amount"],
                                          interim=False, exclude_restatements=False,
                                          max_lag=None):
    """
    Return a multiindex (CoaCode, Field, Date) DataFrame of point-in-time
    Reuters financial statements for one or more Chart of Account (COA)
    codes, reindexed to match the index (dates) and columns (conids) of
    `reindex_like`. Financial values are forward-filled in order to provide
    the latest reading at any given date. Financials are indexed to the
    SourceDate field, i.e. the date on which the financial statement was
    released. SourceDate is shifted forward 1 day to avoid lookahead bias.

    Parameters
    ----------
    reindex_like : DataFrame, required
        a DataFrame (usually of prices) with dates for the index and conids
        for the columns, to which the shape of the resulting DataFrame will
        be conformed

    coa_codes : list of str, required
        the Chart of Account (COA) code(s) to query. Use the
        `list_reuters_codes` function to see available codes.

    fields : list of str
        a list of fields to include in the resulting DataFrame. Defaults to
        simply including the Amount field.

    interim : bool
        query interim/quarterly reports (default is to query annual reports,
        which provide deeper history)

    exclude_restatements : bool, optional
        exclude restatements (default is to include them)

    max_lag : str, optional
        maximum amount of time a data point can be used after the associated
        fiscal period has ended. Setting a limit can prevent using data that
        is reported long after the fiscal period ended, or can limit how far
        data is forward filled in the absence of subsequent data. Specify as
        a Pandas offset alias, e.g. '500D'. By default, no maximum limit is
        applied.

    Returns
    -------
    DataFrame
        a multiindex (CoaCode, Field, Date) DataFrame of financials, shaped
        like the input DataFrame

    Examples
    --------
    Let's calculate book value per share, defined as:

        (Total Assets - Total Liabilities) / Number of shares outstanding

    The COA codes for these metrics are 'ATOT' (Total Assets), 'LTLL' (Total
    Liabilities), and 'QTCO' (Total Common Shares Outstanding).

    >>> closes = prices.loc["Close"]
    >>> financials = get_reuters_financials_reindexed_like(closes, coa_codes=["ATOT", "LTLL", "QTCO"])
    >>> tot_assets = financials.loc["ATOT"].loc["Amount"]
    >>> tot_liabilities = financials.loc["LTLL"].loc["Amount"]
    >>> shares_out = financials.loc["QTCO"].loc["Amount"]
    >>> book_values_per_share = (tot_assets - tot_liabilities)/shares_out
    """
    try:
        import pandas as pd
    except ImportError:
        raise ImportError("pandas must be installed to use this function")

    index_levels = reindex_like.index.names

    if "Time" in index_levels:
        raise ParameterError(
            "reindex_like should not have 'Time' in index, please take a cross-section first, "
            "for example: `prices.loc['Close'].xs('15:45:00', level='Time')`")

    if index_levels != ["Date"]:
        raise ParameterError(
            "reindex_like must have index called 'Date', but has {0}".format(
                ",".join([str(name) for name in index_levels])))

    if not hasattr(reindex_like.index, "date"):
        raise ParameterError("reindex_like must have a DatetimeIndex")

    conids = list(reindex_like.columns)
    start_date = reindex_like.index.min().date()
    # Since financial reports are sparse, start well before the reindex_like
    # min date
    start_date -= pd.Timedelta(days=365 + 180)
    start_date = start_date.isoformat()
    end_date = reindex_like.index.max().date().isoformat()

    f = six.StringIO()
    download_reuters_financials(coa_codes, f, conids=conids,
                                start_date=start_date, end_date=end_date,
                                fields=fields, interim=interim,
                                exclude_restatements=exclude_restatements)
    financials = pd.read_csv(
        f, parse_dates=["SourceDate", "FiscalPeriodEndDate"])

    # Rename SourceDate to match price history index name
    financials = financials.rename(columns={"SourceDate": "Date"})

    # Drop any fields we don't need
    needed_fields = set(fields)
    needed_fields.update(set(("ConId", "Date", "CoaCode")))
    if max_lag:
        needed_fields.add("FiscalPeriodEndDate")
    unneeded_fields = set(financials.columns) - needed_fields
    if unneeded_fields:
        financials = financials.drop(unneeded_fields, axis=1)

    # if reindex_like.index is tz-aware, make financials tz-aware so they can
    # be joined (tz-aware or tz-naive are both fine, as SourceDate represents
    # dates which are assumed to be in the local timezone of the reporting
    # company)
    if reindex_like.index.tz:
        financials.loc[:, "Date"] = financials.Date.dt.tz_localize(
            reindex_like.index.tz.zone)
        deduped_source_dates = financials.Date.drop_duplicates()
    else:
        # joining to tz-naive requires using values, whereas joining to
        # tz-aware requires not using it. Why?
        deduped_source_dates = financials.Date.drop_duplicates().values

    # Create a unioned index of input DataFrame and statement SourceDates
    union_date_idx = reindex_like.index.union(deduped_source_dates).sort_values()

    all_financials = {}
    for code in coa_codes:
        financials_for_code = financials.loc[financials.CoaCode == code]
        if financials_for_code.empty:
            continue
        if "CoaCode" not in fields:
            financials_for_code = financials_for_code.drop("CoaCode", axis=1)
        # There might be duplicate SourceDates if a company announced
        # reports for several fiscal periods at once. In this case we keep
        # only the last value (i.e. latest fiscal period)
        financials_for_code = financials_for_code.drop_duplicates(
            subset=["ConId", "Date"], keep="last")
        financials_for_code = financials_for_code.pivot(
            index="ConId", columns="Date").T

        multiidx = pd.MultiIndex.from_product(
            (financials_for_code.index.get_level_values(0).unique(), union_date_idx),
            names=["Field", "Date"])

        financials_for_code = financials_for_code.reindex(
            index=multiidx, columns=reindex_like.columns)

        # financial values are sparse so ffill (one field at a time)
        all_fields_for_code = {}
        for field in financials_for_code.index.get_level_values("Field").unique():
            field_for_code = financials_for_code.loc[field].fillna(method="ffill")

            # Shift to avoid lookahead bias
            field_for_code = field_for_code.shift()

            all_fields_for_code[field] = field_for_code

        # Filter stale values if asked to
        if max_lag:
            fiscal_period_end_dates = all_fields_for_code["FiscalPeriodEndDate"]
            # subtract the max_lag from the index date to get the
            # earliest possible fiscal period end date for that row
            earliest_allowed_fiscal_period_end_dates = fiscal_period_end_dates.apply(
                lambda x: fiscal_period_end_dates.index - pd.Timedelta(max_lag))
            within_max_timedelta = fiscal_period_end_dates.apply(
                pd.to_datetime) >= earliest_allowed_fiscal_period_end_dates

            for field, field_for_code in all_fields_for_code.items():
                field_for_code = field_for_code.where(within_max_timedelta)
                all_fields_for_code[field] = field_for_code

            # Clean up if FiscalPeriodEndDate was just kept around for this purpose
            if "FiscalPeriodEndDate" not in fields:
                del all_fields_for_code["FiscalPeriodEndDate"]

        financials_for_code = pd.concat(all_fields_for_code, names=["Field", "Date"])

        # In case the statements included dates not in the input
        # DataFrame, drop those now that we've ffilled
        extra_dates = union_date_idx.difference(reindex_like.index)
        if not extra_dates.empty:
            financials_for_code.drop(extra_dates, axis=0, level="Date", inplace=True)

        all_financials[code] = financials_for_code

    financials = pd.concat(all_financials, names=["CoaCode", "Field", "Date"])

    return financials
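# A short sketch of the max_lag filter above: with max_lag="370D", a reported
# Amount stops being forward-filled once the index date is more than roughly a
# year past the fiscal period end, so delisted or non-reporting companies
# don't carry stale values indefinitely. The COA code 'ATOT' and `closes` are
# taken from the docstring example above:
#
#     financials = get_reuters_financials_reindexed_like(
#         closes, coa_codes=["ATOT"], fields=["Amount"], max_lag="370D")
#     tot_assets = financials.loc["ATOT"].loc["Amount"]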
def _get_stockloan_data_reindexed_like(stockloan_func, stockloan_field,
                                       reindex_like, time=None):
    """
    Common base function for get_shortable_shares_reindexed_like and
    get_borrow_fees_reindexed_like.
    """
    try:
        import pandas as pd
    except ImportError:
        raise ImportError("pandas must be installed to use this function")

    index_levels = reindex_like.index.names

    if "Time" in index_levels:
        raise ParameterError(
            "reindex_like should not have 'Time' in index, please take a cross-section first, "
            "for example: `prices.loc['Close'].xs('15:45:00', level='Time')`")

    if index_levels != ["Date"]:
        raise ParameterError(
            "reindex_like must have index called 'Date', but has {0}".format(
                ",".join([str(name) for name in index_levels])))

    if not hasattr(reindex_like.index, "date"):
        raise ParameterError("reindex_like must have a DatetimeIndex")

    conids = list(reindex_like.columns)
    start_date = reindex_like.index.min().date()
    # Stockloan data is sparse but batched in monthly files, so start >1-month
    # before the reindex_like min date
    start_date -= pd.Timedelta(days=45)
    start_date = start_date.isoformat()
    end_date = reindex_like.index.max().date().isoformat()

    f = six.StringIO()
    stockloan_func(f, conids=conids, start_date=start_date, end_date=end_date)
    stockloan_data = pd.read_csv(f)
    stockloan_data.loc[:, "Date"] = pd.to_datetime(stockloan_data.Date, utc=True)

    # Determine timezone, from:
    # - time param if provided
    # - else reindex_like.index.tz if set
    # - else component securities if all have same timezone
    timezone = None

    if time and " " in time:
        time, timezone = time.split(" ", 1)

        if reindex_like.index.tz and reindex_like.index.tz.zone != timezone:
            raise ParameterError((
                "cannot use timezone {0} because reindex_like timezone is {1}, "
                "these must match".format(timezone, reindex_like.index.tz.zone)))

    if not timezone:
        if reindex_like.index.tz:
            timezone = reindex_like.index.tz.zone
        else:
            # try to infer from component securities
            f = six.StringIO()
            download_master_file(f, conids=list(stockloan_data.ConId.unique()),
                                 delisted=True, fields=["Timezone"])
            security_timezones = pd.read_csv(f, index_col="ConId")
            security_timezones = list(security_timezones.Timezone.unique())
            if len(security_timezones) > 1:
                raise ParameterError(
                    "no timezone specified and cannot infer because multiple timezones are "
                    "present in data, please specify timezone (timezones in data: {0})".format(
                        ", ".join(security_timezones)))
            timezone = security_timezones[0]

    # Create an index of `reindex_like` dates at `time`
    if time:
        try:
            time = pd.Timestamp(time).time()
        except ValueError as e:
            raise ParameterError("could not parse time '{0}': {1}".format(
                time, str(e)))
        # pd.Timestamp.combine avoids the deprecated pd.datetime alias
        index_at_time = pd.Index(reindex_like.index.to_series().apply(
            lambda x: pd.Timestamp.combine(x, time)))
    else:
        index_at_time = reindex_like.index

    if not index_at_time.tz:
        index_at_time = index_at_time.tz_localize(timezone)

    index_at_time = index_at_time.tz_convert("UTC")

    stockloan_data = stockloan_data.pivot(index="ConId", columns="Date").T
    stockloan_data = stockloan_data.loc[stockloan_field]

    # Create a unioned index of requested times and stockloan data timestamps
    unioned_idx = index_at_time.union(stockloan_data.index)
    # performance boost: don't need to reindex and ffill earlier than the
    # stockloan data start date
    unioned_idx = unioned_idx[unioned_idx >= stockloan_data.index.min()]
    stockloan_data = stockloan_data.reindex(index=unioned_idx,
                                            columns=reindex_like.columns)
    stockloan_data = stockloan_data.fillna(method="ffill")

    # Keep only the requested times, now that we've ffilled
    stockloan_data = stockloan_data.reindex(index=index_at_time)

    # Replace index_at_time with the original reindex_like index (this needs
    # to be done because index_at_time is tz-aware and reindex_like may not
    # be, and because index_at_time uses the requested `time` if provided.
    # Since index_at_time was derived from reindex_like.index and has the
    # same shape, it is safe to replace it.)
    stockloan_data.index = reindex_like.index

    return stockloan_data
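# Sketch of how the public wrappers named in the docstring presumably call
# this base function. The download function name and the field name below are
# assumptions for illustration, not definitions from this module:
#
#     def get_shortable_shares_reindexed_like(reindex_like, time=None):
#         return _get_stockloan_data_reindexed_like(
#             download_shortable_shares,  # must accept (f, conids=, start_date=, end_date=)
#             "Quantity",                 # assumed name of the shortable-shares field
#             reindex_like, time=time)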
def create_agg_db(code, tick_db_code, bar_size, fields=None):
    """
    Create an aggregate database from a tick database.

    Aggregate databases provide rolled-up views of the underlying tick data,
    aggregated to a desired frequency (such as 1-minute bars).

    Parameters
    ----------
    code : str, required
        the code to assign to the aggregate database (lowercase alphanumerics
        and hyphens only)

    tick_db_code : str, required
        the code of the tick database to aggregate

    bar_size : str, required
        the time frequency to aggregate to (use a Pandas timedelta string,
        for example 10s or 1m or 2h or 1d)

    fields : dict of list of str, optional
        include these fields in aggregate database, aggregated in these ways.
        Provide a dict mapping tick db fields to lists of aggregate functions
        to apply to the field. Available aggregate functions are "Close",
        "Open", "High", "Low", "Mean", "Sum", and "Count". See examples
        section. If not specified, defaults to including the "Close" for each
        tick db field.

    Returns
    -------
    dict
        status message

    Examples
    --------
    Create an aggregate database of 1 minute bars consisting of OHLC trades
    and volume, from a tick database of US stocks, resulting in fields called
    LastPriceOpen, LastPriceHigh, LastPriceLow, LastPriceClose, and VolumeClose:

    >>> create_agg_db("usa-stk-trades-1min",
                      tick_db_code="usa-stk-trades",
                      bar_size="1m",
                      fields={"LastPrice": ["Open", "High", "Low", "Close"],
                              "Volume": ["Close"]})

    Create an aggregate database of 1 second bars containing the closing bid
    and ask and the mean bid size and ask size, from a tick database of
    futures trades and quotes, resulting in fields called BidPriceClose,
    AskPriceClose, BidSizeMean, and AskSizeMean:

    >>> create_agg_db("globex-fut-taq-1sec",
                      tick_db_code="globex-fut-taq",
                      bar_size="1s",
                      fields={"BidPrice": ["Close"],
                              "AskPrice": ["Close"],
                              "BidSize": ["Mean"],
                              "AskSize": ["Mean"]})
    """
    params = {}
    params["bar_size"] = bar_size
    if fields:
        if not isinstance(fields, dict):
            raise ParameterError("fields must be a dict")

        # convert lists to comma-separated strings
        _fields = {}
        for k, v in fields.items():
            if isinstance(v, (list, tuple)):
                v = ",".join(v)
            _fields[k] = v
        params["fields"] = dict_to_dict_strs(_fields)

    response = houston.put(
        "/realtime/databases/{0}/aggregates/{1}".format(tick_db_code, code),
        params=params)

    houston.raise_for_status_with_json(response)
    return response.json()
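# Once created, an aggregate database can be queried with get_prices like any
# other database; the queryable field names are the ones produced by the
# `fields` mapping (e.g. LastPriceClose for the "Close" of "LastPrice").
# A rough sketch continuing the first docstring example above (database codes
# as in that example):
#
#     create_agg_db("usa-stk-trades-1min",
#                   tick_db_code="usa-stk-trades",
#                   bar_size="1m",
#                   fields={"LastPrice": ["Open", "High", "Low", "Close"],
#                           "Volume": ["Close"]})
#     prices = get_prices("usa-stk-trades-1min",
#                         fields=["LastPriceClose", "VolumeClose"])
#     closes = prices.loc["LastPriceClose"]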
def get_historical_prices(codes, start_date=None, end_date=None,
                          universes=None, conids=None,
                          exclude_universes=None, exclude_conids=None,
                          times=None, cont_fut=None, fields=None,
                          master_fields=None, timezone=None,
                          infer_timezone=None):
    """
    Query one or more history databases and load prices into a DataFrame.

    For bar sizes smaller than 1-day, the resulting DataFrame will have a
    MultiIndex with levels (Field, Date, Time). For bar sizes of 1-day or
    larger, the MultiIndex will have levels (Field, Date).

    Parameters
    ----------
    codes : str or list of str, required
        the code(s) of one or more databases to query. If multiple databases
        are specified, they must have the same bar size.

    start_date : str (YYYY-MM-DD), optional
        limit to history on or after this date

    end_date : str (YYYY-MM-DD), optional
        limit to history on or before this date

    universes : list of str, optional
        limit to these universes (default is to return all securities in database)

    conids : list of int, optional
        limit to these conids

    exclude_universes : list of str, optional
        exclude these universes

    exclude_conids : list of int, optional
        exclude these conids

    times : list of str (HH:MM:SS), optional
        limit to these times

    cont_fut : str
        stitch futures into continuous contracts using this method (default
        is not to stitch together). Possible choices: concat

    fields : list of str, optional
        only return these fields (pass ['?'] or any invalid fieldname to see
        available fields)

    master_fields : list of str, optional
        append these fields from the securities master database (pass ['?']
        or any invalid fieldname to see available fields)

    timezone : str, optional
        convert timestamps to this timezone, for example America/New_York
        (see `pytz.all_timezones` for choices); ignored for non-intraday
        bar sizes

    infer_timezone : bool
        infer the timezone from the securities master Timezone field;
        defaults to True if using intraday bars and no `timezone` specified;
        ignored for non-intraday bars, or if `timezone` is passed

    Returns
    -------
    DataFrame
        a MultiIndex DataFrame of prices

    Examples
    --------
    Load intraday prices:

    >>> prices = get_historical_prices('stk-sample-5min', fields=["Close", "Volume"],
                                       timezone="America/New_York")
    >>> prices.head()
    ConId                       265598    38708077
    Field   Date        Time
    Close   2017-07-26  09:30:00    153.62    2715.0
                        09:35:00    153.46    2730.0
                        09:40:00    153.21    2725.0
                        09:45:00    153.28    2725.0
                        09:50:00    153.18    2725.0

    Isolate the closes:

    >>> closes = prices.loc["Close"]
    >>> closes.head()
    ConId                   265598    38708077
    Date        Time
    2017-07-26  09:30:00    153.62    2715.0
                09:35:00    153.46    2730.0
                09:40:00    153.21    2725.0
                09:45:00    153.28    2725.0
                09:50:00    153.18    2725.0

    Isolate the 15:45:00 prices:

    >>> session_closes = closes.xs("15:45:00", level="Time")
    >>> session_closes.head()
    ConId         265598    38708077
    Date
    2017-07-26    153.29    2700.00
    2017-07-27    150.10    2660.00
    2017-07-28    149.43    2650.02
    2017-07-31    148.99    2650.34
    2017-08-01    149.72    2675.50
    """
    # Import pandas lazily since it can take a moment to import
    try:
        import pandas as pd
    except ImportError:
        raise ImportError("pandas must be installed to use this function")

    try:
        import pytz
    except ImportError:
        raise ImportError("pytz must be installed to use this function")

    if timezone and timezone not in pytz.all_timezones:
        raise ParameterError(
            "invalid timezone: {0} (see `pytz.all_timezones` for choices)".format(
                timezone))

    dbs = codes
    if not isinstance(dbs, (list, tuple)):
        dbs = [dbs]

    if master_fields:
        if isinstance(master_fields, tuple):
            master_fields = list(master_fields)
        elif not isinstance(master_fields, list):
            master_fields = [master_fields]

    db_universes = set()
    db_bar_sizes = set()
    for db in dbs:
        db_config = get_db_config(db)
        _db_universes = db_config.get("universes", None)
        if _db_universes:
            db_universes.update(set(_db_universes))
        bar_size = db_config.get("bar_size")
        db_bar_sizes.add(bar_size)

    db_universes = list(db_universes)
    db_bar_sizes = list(db_bar_sizes)

    if len(db_bar_sizes) > 1:
        raise ParameterError(
            "all databases must contain same bar size but {0} have different "
            "bar sizes: {1}".format(", ".join(dbs), ", ".join(db_bar_sizes)))

    all_prices = []

    for db in dbs:

        kwargs = dict(
            start_date=start_date,
            end_date=end_date,
            universes=universes,
            conids=conids,
            exclude_universes=exclude_universes,
            exclude_conids=exclude_conids,
            times=times,
            cont_fut=cont_fut,
            fields=fields,
            tz_naive=False)

        tmp_filepath = "{0}/history.{1}.{2}.{3}.csv".format(
            TMP_DIR, db, os.getpid(), time.time())

        download_history_file(db, tmp_filepath, **kwargs)

        prices = pd.read_csv(tmp_filepath)
        all_prices.append(prices)

        os.remove(tmp_filepath)

    prices = pd.concat(all_prices)

    prices = prices.pivot(index="ConId", columns="Date").T
    prices.index.set_names(["Field", "Date"], inplace=True)

    is_intraday = db_bar_sizes[0] not in ("1 day", "1 week", "1 month")

    if is_intraday and not timezone and infer_timezone is not False:
        infer_timezone = True
        if not master_fields:
            master_fields = []
        if "Timezone" not in master_fields:
            master_fields.append("Timezone")

    # Next, get the master file
    if master_fields:

        if not conids and not universes:
            universes = db_universes
            if not universes:
                conids = list(prices.columns)

        f = six.StringIO()
        download_master_file(f, conids=conids, universes=universes,
                             exclude_conids=exclude_conids,
                             exclude_universes=exclude_universes,
                             fields=master_fields, delisted=True)
        securities = pd.read_csv(f, index_col="ConId")

        if "Delisted" in securities.columns:
            securities.loc[:, "Delisted"] = securities.Delisted.astype(bool)

        if "Etf" in securities.columns:
            securities.loc[:, "Etf"] = securities.Etf.astype(bool)

        # Append securities, indexed to the min date, to allow easy ffill on demand
        securities = pd.DataFrame(securities.T, columns=prices.columns)
        securities.index.name = "Field"
        idx = pd.MultiIndex.from_product(
            (securities.index,
             [prices.index.get_level_values("Date").min()]),
            names=["Field", "Date"])

        securities = securities.reindex(index=idx, level="Field")
        prices = pd.concat((prices, securities))

    if is_intraday:
        dates = pd.to_datetime(prices.index.get_level_values("Date"), utc=True)

        if not timezone and infer_timezone:
            timezone = _infer_timezone(prices)

        if timezone:
            dates = dates.tz_convert(timezone)
    else:
        dates = pd.to_datetime(prices.index.get_level_values("Date"))

    prices.index = pd.MultiIndex.from_arrays(
        (prices.index.get_level_values("Field"),
         dates),
        names=("Field", "Date"))

    # Split date and time
    dts = prices.index.get_level_values("Date")
    dates = pd.to_datetime(dts.date).tz_localize(None)  # drop tz-aware in Date index
    prices.index = pd.MultiIndex.from_arrays(
        (prices.index.get_level_values("Field"),
         dates,
         dts.strftime("%H:%M:%S")),
        names=["Field", "Date", "Time"])

    # Align dates if there are any duplicates. Explanation: Suppose there are
    # two timezones represented in the data. After parsing these dates into a
    # common timezone, they will align properly, but we pivoted before
    # parsing the dates (for performance reasons), so they may not be
    # aligned. Thus we need to dedupe the index.
    prices = prices.groupby(prices.index).first()
    prices.index = pd.MultiIndex.from_tuples(prices.index)
    prices.index.set_names(["Field", "Date", "Time"], inplace=True)

    # Drop time if not intraday
    if not is_intraday:
        prices.index = prices.index.droplevel("Time")
        return prices

    # If intraday, fill missing times so that each date has the same set of
    # times, allowing easier comparisons. Example implications:
    # - if history is retrieved intraday, this ensures that today will have
    #   NaN entries for future times
    # - early close dates will have a full set of times, with NaNs after the
    #   early close
    unique_fields = prices.index.get_level_values("Field").unique()
    unique_dates = prices.index.get_level_values("Date").unique()
    unique_times = prices.index.get_level_values("Time").unique()
    interpolated_index = None
    for field in unique_fields:
        if master_fields and field in master_fields:
            min_date = prices.loc[field].index.min()
            field_idx = pd.MultiIndex.from_tuples([(field, min_date[0], min_date[1])])
        else:
            field_idx = pd.MultiIndex.from_product([[field], unique_dates, unique_times])
        if interpolated_index is None:
            interpolated_index = field_idx
        else:
            interpolated_index = interpolated_index.append(field_idx)

    prices = prices.reindex(interpolated_index)
    prices.index.set_names(["Field", "Date", "Time"], inplace=True)

    return prices
def get_contract_nums_reindexed_like(reindex_like, limit=5):
    """
    From a DataFrame of futures (with dates as the index and conids as
    columns), return a DataFrame of integers representing each conid's
    sequence in the futures chain as of each date, where 1 is the front
    contract, 2 is the second nearest contract, etc.

    Sequences are based on the RolloverDate field in the securities master
    file, which is based on configurable rollover rules.

    Parameters
    ----------
    reindex_like : DataFrame, required
        a DataFrame (usually of prices) with dates for the index and conids
        for the columns, to which the shape of the resulting DataFrame will
        be conformed

    limit : int
        how many contracts ahead to sequence. For example, assuming quarterly
        contracts, a limit of 5 will sequence 5 quarters out. Default 5.

    Returns
    -------
    DataFrame
        a DataFrame of futures chain sequence numbers, shaped like the input
        DataFrame

    Examples
    --------
    Get a Boolean mask of front-month contracts:

    >>> closes = prices.loc["Close"]
    >>> contract_nums = get_contract_nums_reindexed_like(closes)
    >>> are_front_months = contract_nums == 1
    """
    try:
        import pandas as pd
    except ImportError:
        raise ImportError("pandas must be installed to use this function")

    index_levels = reindex_like.index.names

    if "Date" not in index_levels:
        raise ParameterError(
            "reindex_like must have index called 'Date', but has {0}".format(
                ",".join([str(name) for name in index_levels])))

    reindex_like_dt_index = reindex_like.index.get_level_values("Date")

    if not hasattr(reindex_like_dt_index, "date"):
        raise ParameterError("reindex_like must have a DatetimeIndex")

    f = six.StringIO()
    download_master_file(f, conids=list(reindex_like.columns),
                         fields=["RolloverDate", "UnderConId", "SecType"])
    rollover_dates = pd.read_csv(f, parse_dates=["RolloverDate"])
    rollover_dates = rollover_dates[rollover_dates.SecType == "FUT"].drop(
        "SecType", axis=1)

    if rollover_dates.empty:
        raise ParameterError(
            "input DataFrame does not appear to contain any futures contracts")

    if reindex_like_dt_index.tz:
        rollover_dates.loc[:, "RolloverDate"] = rollover_dates.RolloverDate.dt.tz_localize(
            reindex_like_dt_index.tz.zone)

    min_date = reindex_like_dt_index.min()
    max_date = max([rollover_dates.RolloverDate.max(), reindex_like_dt_index.max()])

    # Stack conids by underlying (1 column per underlying)
    rollover_dates = rollover_dates.set_index(
        ["RolloverDate", "UnderConId"]).ConId.unstack()

    contract_nums = None

    for i in range(limit):

        # backshift conids
        _rollover_dates = rollover_dates.apply(lambda col: col.dropna().shift(-i))

        # Reindex to daily frequency
        _rollover_dates = _rollover_dates.reindex(
            index=pd.date_range(start=min_date, end=max_date))

        # RolloverDate is when we roll out of the contract, hence we backfill
        _rollover_dates = _rollover_dates.fillna(method="bfill")

        # Stack to Series of Date, nth conid
        _rollover_dates = _rollover_dates.stack()
        _rollover_dates.index = _rollover_dates.index.droplevel("UnderConId")
        _rollover_dates.index.name = "Date"

        # Pivot Series to DataFrame
        _rollover_dates = _rollover_dates.reset_index(name="ConId")
        _rollover_dates["ContractNum"] = i + 1
        _rollover_dates = _rollover_dates.set_index(["Date", "ConId"])
        _contract_nums = _rollover_dates.ContractNum.unstack()

        # If MultiIndex input, broadcast across Time level
        if len(index_levels) > 1:
            _contract_nums = _contract_nums.reindex(index=reindex_like.index,
                                                    level="Date")
            _contract_nums = _contract_nums.reindex(columns=reindex_like.columns)
        else:
            _contract_nums = _contract_nums.reindex(index=reindex_like_dt_index,
                                                    columns=reindex_like.columns)

        if contract_nums is None:
            contract_nums = _contract_nums
        else:
            contract_nums = contract_nums.fillna(_contract_nums)

    return contract_nums
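# Sketch of a common use of the sequence numbers above: restricting a prices
# DataFrame to front-month (or second-month) contracts only. This continues
# the docstring example, where `closes` is a (Date x ConId) DataFrame of
# futures closes:
#
#     contract_nums = get_contract_nums_reindexed_like(closes, limit=2)
#     front_month_closes = closes.where(contract_nums == 1)   # NaN elsewhere
#     second_month_closes = closes.where(contract_nums == 2)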