Code Example #1
def _infer_timezone(prices):
    """
    Infers the timezone from the component securities if possible.
    """
    if "Timezone" not in prices.index.get_level_values("Field"):
        raise ParameterError(
            "cannot infer timezone because Timezone field is missing, "
            "please specify timezone or include Timezone in master_fields")

    timezones = prices.loc["Timezone"].stack().unique()

    if len(timezones) > 1:
        raise ParameterError(
            "cannot infer timezone because multiple timezones are present "
            "in data, please specify timezone explicitly (timezones: {0})".
            format(", ".join(timezones)))

    return timezones[0]
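
A minimal sketch of how this helper behaves, using a small hand-built (Field, Date) prices DataFrame and a stand-in ParameterError class (both are illustrations only; in the library, get_historical_prices below calls _infer_timezone on real price data):

import pandas as pd

class ParameterError(ValueError):
    """Stand-in for the library's ParameterError, for this sketch only."""

# A (Field, Date) MultiIndex DataFrame whose "Timezone" row carries the
# securities master Timezone for each conid column
idx = pd.MultiIndex.from_tuples(
    [("Close", "2017-07-26"), ("Timezone", "2017-07-26")],
    names=["Field", "Date"])
prices = pd.DataFrame(
    {265598: [153.62, "America/New_York"],
     38708077: [2715.0, "America/New_York"]},
    index=idx)

# Both securities report the same timezone, so inference succeeds
print(_infer_timezone(prices))  # America/New_York
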
Code Example #2
def intraday_to_daily(results):
    """
    Roll up a DataFrame of intraday performance results to daily, dropping
    the "Time" level from the multi-index.

    Parameters
    ----------
    results : DataFrame, required
         a DataFrame of intraday Moonshot backtest results, with a "Time" level
         in the index

    Returns
    -------
    DataFrame
        a DataFrame of daily Moonshot backtest results, without a "Time" level in
        the index

    Examples
    --------
    >>> intraday_results = read_moonshot_csv("moonshot_intraday_backtest.csv")
    >>> daily_results = intraday_to_daily(intraday_results)
    """
    try:
        import pandas as pd
    except ImportError:
        raise ImportError("pandas must be installed to use this function")

    if "Time" not in results.index.names:
        raise ParameterError(
            "results DataFrame must have 'Time' level in index")

    fields_in_results = results.index.get_level_values("Field").unique()

    daily_results = {}

    # how to aggregate by field
    aggregation_methods = {
        "sum": ["Return", "Turnover", "Commission", "Slippage"],
        "max": ["AbsExposure", "TotalHoldings", "AbsWeight"],
        "mean": ["NetExposure", "Weight"],
        "last": ["Benchmark"],
    }

    for aggregation_method, fields_to_aggregate in aggregation_methods.items():
        for field in fields_to_aggregate:
            if field not in fields_in_results:
                continue

            field_results = results.loc[field].astype("float64")
            grouped = field_results.groupby(
                field_results.index.get_level_values("Date"))
            daily_results[field] = getattr(grouped, aggregation_method)()

    daily_results = pd.concat(daily_results, names=["Field", "Date"])
    return daily_results
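
A hedged usage sketch following the docstring example above; the import path for read_moonshot_csv is an assumption, and the cumulative-return calculation at the end is just one possible follow-up analysis:

from quantrocket.moonshot import read_moonshot_csv  # assumed import path

intraday_results = read_moonshot_csv("moonshot_intraday_backtest.csv")
daily_results = intraday_to_daily(intraday_results)

# "Return" rows were summed across each day's bars, so the daily returns
# can be compounded directly
daily_returns = daily_results.loc["Return"]
cum_returns = (1 + daily_returns).cumprod() - 1
print(cum_returns.tail())
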
Code Example #3
def get_prices(codes, start_date=None, end_date=None,
               universes=None, conids=None,
               exclude_universes=None, exclude_conids=None,
               times=None, fields=None,
               timezone=None, infer_timezone=None,
               cont_fut=None, master_fields=None):
    """
    Query one or more history databases and/or real-time aggregate databases
    and load prices into a DataFrame.

    For bar sizes smaller than 1-day, the resulting DataFrame will have a MultiIndex
    with levels (Field, Date, Time). For bar sizes of 1-day or larger, the MultiIndex
    will have levels (Field, Date).

    Parameters
    ----------
    codes : str or list of str, required
        the code(s) of one or more databases to query. If multiple databases
        are specified, they must have the same bar size. List databases in order of
        priority (highest priority first). If multiple databases provide the same
        field for the same conid on the same datetime, the first database's value will
        be used.

    start_date : str (YYYY-MM-DD), optional
        limit to data on or after this date

    end_date : str (YYYY-MM-DD), optional
        limit to data on or before this date

    universes : list of str, optional
        limit to these universes (default is to return all securities in database)

    conids : list of int, optional
        limit to these conids

    exclude_universes : list of str, optional
        exclude these universes

    exclude_conids : list of int, optional
        exclude these conids

    times : list of str (HH:MM:SS), optional
        limit to these times, specified in the timezone of the relevant exchange. See
        additional information in the Notes section regarding the timezone to use.

    fields : list of str, optional
        only return these fields. (If querying multiple databases that have different fields,
        provide the complete list of desired fields; only the supported fields for each
        database will be queried.)

    timezone : str, optional
        convert timestamps to this timezone, for example America/New_York (see
        `pytz.all_timezones` for choices); ignored for non-intraday bar sizes

    infer_timezone : bool
        infer the timezone from the securities master Timezone field; defaults to
        True if using intraday bars and no `timezone` specified; ignored for
        non-intraday bars, or if `timezone` is passed

    cont_fut : str
        stitch futures into continuous contracts using this method (default is not
        to stitch together). Only applicable to history databases. Possible choices:
        concat

    master_fields : list of str, optional
        [DEPRECATED] append these fields from the securities master database (pass ['?'] or any
        invalid fieldname to see available fields). This parameter is deprecated and
        will be removed in a future release. For better performance, use
        `quantrocket.master.get_securities_reindexed_like` to get securities master
        data shaped like prices.

    Returns
    -------
    DataFrame
        a MultiIndex DataFrame of prices

    Notes
    -----
    The `times` parameter, if provided, is applied differently for history databases vs
    real-time aggregate databases. For history databases, the parameter is applied when
    querying the database. For real-time aggregate databases, the parameter is not applied
    when querying the database; rather, all available times are retrieved and the `times`
    filter is applied to the resulting DataFrame after casting it to the appropriate timezone
    (as inferred from the securities master Timezone field or as explicitly specified with
    the `timezone` parameter). The rationale for this behavior is that history databases store
    intraday data in the timezone of the relevant exchange whereas real-time aggregate
    databases store data in UTC. By applying the `times` filter as described, users can specify
    the times in the timezone of the relevant exchange for both types of databases.

    Examples
    --------
    Load intraday prices:

    >>> prices = get_prices('stk-sample-5min', fields=["Close", "Volume"])
    >>> prices.head()
    ConId                       265598  38708077
    Field Date       Time
    Close 2017-07-26 09:30:00   153.62    2715.0
                     09:35:00   153.46    2730.0
                     09:40:00   153.21    2725.0
                     09:45:00   153.28    2725.0
                     09:50:00   153.18    2725.0

    Isolate the closes:

    >>> closes = prices.loc["Close"]
    >>> closes.head()
    ConId                265598  38708077
    Date       Time
    2017-07-26 09:30:00  153.62    2715.0
               09:35:00  153.46    2730.0
               09:40:00  153.21    2725.0
               09:45:00  153.28    2725.0
               09:50:00  153.18    2725.0

    Isolate the 15:45:00 prices:

    >>> session_closes = closes.xs("15:45:00", level="Time")
    >>> session_closes.head()
    ConId       265598  38708077
    Date
    2017-07-26  153.29   2700.00
    2017-07-27  150.10   2660.00
    2017-07-28  149.43   2650.02
    2017-07-31  148.99   2650.34
    2017-08-01  149.72   2675.50
    """
    # Import pandas lazily since it can take a moment to import
    try:
        import pandas as pd
    except ImportError:
        raise ImportError("pandas must be installed to use this function")

    try:
        import pytz
    except ImportError:
        raise ImportError("pytz must be installed to use this function")

    if timezone and timezone not in pytz.all_timezones:
        raise ParameterError(
            "invalid timezone: {0} (see `pytz.all_timezones` for choices)".format(
                timezone))

    dbs = codes
    if not isinstance(dbs, (list, tuple)):
        dbs = [dbs]

    fields = fields or []
    if not isinstance(fields, (list, tuple)):
        fields = [fields]

    # separate history dbs from realtime dbs
    history_dbs = set(list_history_databases())
    realtime_dbs = list_realtime_databases()
    realtime_agg_dbs = set(itertools.chain(*realtime_dbs.values()))

    history_dbs.intersection_update(set(dbs))
    realtime_agg_dbs.intersection_update(set(dbs))

    unknown_dbs = set(dbs) - history_dbs - realtime_agg_dbs

    if unknown_dbs:
        tick_dbs = set(realtime_dbs.keys()).intersection(unknown_dbs)
        # Improve error message if possible
        if tick_dbs:
            raise ParameterError("{} is a real-time tick database, only history databases or "
                                 "real-time aggregate databases are supported".format(
                                     ", ".join(tick_dbs)))
        raise ParameterError(
            "no history or real-time aggregate databases called {}".format(
                ", ".join(unknown_dbs)))

    db_bar_sizes = set()
    db_bar_sizes_parsed = set()
    db_domains = set()
    history_db_fields = {}
    realtime_db_fields = {}

    for db in history_dbs:
        db_config = get_history_db_config(db)
        bar_size = db_config.get("bar_size")
        db_bar_sizes.add(bar_size)
        # to validate uniform bar sizes, we need to parse them in case dbs
        # store different but equivalent timedelta strings. History db
        # strings may need massaging to be parsable.
        if bar_size.endswith("s"):
            # strip s from secs, mins, hours to get valid pandas timedelta
            bar_size = bar_size[:-1]
        elif bar_size == "1 week":
            bar_size = "7 day"
        elif bar_size == "1 month":
            bar_size = "30 day"
        db_bar_sizes_parsed.add(pd.Timedelta(bar_size))
        db_domain = db_config.get("domain", "main")
        db_domains.add(db_domain)
        history_db_fields[db] = db_config.get("fields", [])

    for db in realtime_agg_dbs:
        db_config = get_realtime_db_config(db)
        bar_size = db_config.get("bar_size")
        db_bar_sizes.add(bar_size)
        db_bar_sizes_parsed.add(pd.Timedelta(bar_size))
        # NOTE: aggregate dbs don't include a domain key; if available, this
        # would probably have to be obtained from the associated tick db.
        # This isn't an issue unless/until real-time data comes from multiple
        # vendors. As it now stands, real-time data is always from "main".
        db_domain = db_config.get("domain", "main")
        db_domains.add(db_domain)
        realtime_db_fields[db] = db_config.get("fields", [])

    if len(db_bar_sizes_parsed) > 1:
        raise ParameterError(
            "all databases must contain same bar size but {0} have different "
            "bar sizes: {1}".format(", ".join(dbs), ", ".join(db_bar_sizes))
        )

    if len(db_domains) > 1:
        raise ParameterError(
            "all databases must use the same securities master domain but {0} "
            "use different domains: {1}".format(", ".join(dbs), ", ".join(db_domains))
        )

    all_prices = []

    for db in dbs:

        if db in history_dbs:
            # different DBs might support different fields so only request the
            # subset of supported fields
            fields_for_db = set(fields).intersection(set(history_db_fields[db]))

            kwargs = dict(
                start_date=start_date,
                end_date=end_date,
                universes=universes,
                conids=conids,
                exclude_universes=exclude_universes,
                exclude_conids=exclude_conids,
                times=times,
                cont_fut=cont_fut,
                fields=list(fields_for_db),
                tz_naive=False
            )

            tmp_filepath = "{dir}{sep}history.{db}.{pid}.{time}.csv".format(
                dir=TMP_DIR, sep=os.path.sep, db=db, pid=os.getpid(), time=time.time())

            try:
                download_history_file(db, tmp_filepath, **kwargs)
            except NoHistoricalData as e:
                # don't complain about NoHistoricalData if we're checking
                # multiple databases, unless none of them have data
                if len(dbs) == 1:
                    raise
                else:
                    continue

            prices = pd.read_csv(tmp_filepath)
            all_prices.append(prices)

            os.remove(tmp_filepath)

        if db in realtime_agg_dbs:

            fields_for_db = set(fields).intersection(set(realtime_db_fields[db]))

            kwargs = dict(
                start_date=start_date,
                end_date=end_date,
                universes=universes,
                conids=conids,
                exclude_universes=exclude_universes,
                exclude_conids=exclude_conids,
                fields=list(fields_for_db))

            tmp_filepath = "{dir}{sep}realtime.{db}.{pid}.{time}.csv".format(
                dir=TMP_DIR, sep=os.path.sep, db=db, pid=os.getpid(), time=time.time())

            try:
                download_market_data_file(db, tmp_filepath, **kwargs)
            except NoRealtimeData as e:
                # don't complain about NoRealtimeData if we're checking
                # multiple databases, unless none of them have data
                if len(dbs) == 1:
                    raise
                else:
                    continue

            prices = pd.read_csv(tmp_filepath)
            all_prices.append(prices)

            os.remove(tmp_filepath)

    # complain if multiple dbs and none had data
    if len(dbs) > 1 and not all_prices:
        raise NoHistoricalData("no price data matches the query parameters in any of {0}".format(
            ", ".join(dbs)
        ))

    prices = pd.concat(all_prices, sort=False)

    try:
        prices = prices.pivot(index="ConId", columns="Date").T
    except ValueError as e:
        if "duplicate" not in repr(e):
            raise
        # There are duplicates, likely due to querying multiple databases,
        # both of which return one or more identical dates for identical
        # conids. To resolve, we group by conid and date and take the first
        # available value for each field. This means that the order of
        # `codes` matters. The use of groupby.first() instead of simply
        # drop_duplicates allows us to retain one field from one db and
        # another field from another db.
        grouped = prices.groupby(["ConId","Date"])
        prices = pd.concat(
            dict([(col, grouped[col].first()) for col in prices.columns
                  if col not in ("ConId","Date")]), axis=1)
        prices = prices.reset_index().pivot(index="ConId", columns="Date").T

    prices.index.set_names(["Field", "Date"], inplace=True)

    master_fields = master_fields or []
    if master_fields:
        import warnings
        # DeprecationWarning is ignored by default but we want the user
        # to see it
        warnings.simplefilter("always", DeprecationWarning)
        warnings.warn(
            "`master_fields` parameter is deprecated and will be removed in a "
            "future release. For better performance, please use "
            "`quantrocket.master.get_securities_reindexed_like` "
            "to get securities master data shaped like prices.", DeprecationWarning)

        if isinstance(master_fields, tuple):
            master_fields = list(master_fields)
        elif not isinstance(master_fields, list):
            master_fields = [master_fields]

    # master fields that are required internally but shouldn't be returned to
    # the user (potentially Timezone)
    internal_master_fields = []

    is_intraday = list(db_bar_sizes_parsed)[0] < pd.Timedelta("1 day")

    if is_intraday and not timezone and infer_timezone is not False:
        infer_timezone = True
        if not master_fields or "Timezone" not in master_fields:
            internal_master_fields.append("Timezone")

    # Next, get the master file
    if master_fields or internal_master_fields:
        conids = list(prices.columns)

        domain = list(db_domains)[0] if db_domains else None

        f = six.StringIO()
        download_master_file(
            f,
            conids=conids,
            fields=master_fields + internal_master_fields,
            domain=domain
        )
        securities = pd.read_csv(f, index_col="ConId")

        if "Delisted" in securities.columns:
            securities.loc[:, "Delisted"] = securities.Delisted.astype(bool)

        if "Etf" in securities.columns:
            securities.loc[:, "Etf"] = securities.Etf.astype(bool)

        # Infer timezone if needed
        if not timezone and infer_timezone:
            timezones = securities.Timezone.unique()

            if len(timezones) > 1:
                raise ParameterError(
                    "cannot infer timezone because multiple timezones are present "
                    "in data, please specify timezone explicitly (timezones: {0})".format(
                        ", ".join(timezones)))

            timezone = timezones[0]

        # Drop any internal-only fields
        if internal_master_fields:
            securities = securities.drop(internal_master_fields, axis=1)

        if not securities.empty:
            # Append securities, indexed to the min date, to allow easy ffill on demand
            securities = pd.DataFrame(securities.T, columns=prices.columns)
            securities.index.name = "Field"
            idx = pd.MultiIndex.from_product(
                (securities.index, [prices.index.get_level_values("Date").min()]),
                names=["Field", "Date"])

            securities = securities.reindex(index=idx, level="Field")
            prices = pd.concat((prices, securities))

    if is_intraday:
        dates = pd.to_datetime(prices.index.get_level_values("Date"), utc=True)

        if timezone:
            dates = dates.tz_convert(timezone)
    else:
        dates = pd.to_datetime(prices.index.get_level_values("Date"))

    prices.index = pd.MultiIndex.from_arrays((
        prices.index.get_level_values("Field"),
        dates
        ), names=("Field", "Date"))

    # Split date and time
    dts = prices.index.get_level_values("Date")
    dates = pd.to_datetime(dts.date).tz_localize(None) # drop tz-aware in Date index
    prices.index = pd.MultiIndex.from_arrays(
        (prices.index.get_level_values("Field"),
         dates,
         dts.strftime("%H:%M:%S")),
        names=["Field", "Date", "Time"]
    )

    # Align dates if there are any duplicates. Explanation: Suppose there are
    # two timezones represented in the data (e.g. history db in security
    # timezone vs real-time db in UTC). After parsing these dates into a
    # common timezone, they will align properly, but we pivoted before
    # parsing the dates (for performance reasons), so they may not be
    # aligned. Thus we need to dedupe the index.
    prices = prices.groupby(prices.index).first()
    prices.index = pd.MultiIndex.from_tuples(prices.index)
    prices.index.set_names(["Field", "Date", "Time"], inplace=True)

    # Drop time if not intraday
    if not is_intraday:
        prices.index = prices.index.droplevel("Time")
        return prices

    # If intraday, fill missing times so that each date has the same set of
    # times, allowing easier comparisons. Example implications:
    # - if history is retrieved intraday, this ensures that today will have NaN
    #   entries for future times
    # - early close dates will have a full set of times, with NaNs after the
    #   early close
    unique_fields = prices.index.get_level_values("Field").unique()
    unique_dates = prices.index.get_level_values("Date").unique()
    unique_times = prices.index.get_level_values("Time").unique()
    interpolated_index = None
    for field in unique_fields:
        if master_fields and field in master_fields:
            min_date = prices.loc[field].index.min()
            field_idx = pd.MultiIndex.from_tuples([(field,min_date[0], min_date[1])])
        else:
            field_idx = pd.MultiIndex.from_product([[field], unique_dates, unique_times]).sort_values()
        if interpolated_index is None:
            interpolated_index = field_idx
        else:
            interpolated_index = interpolated_index.append(field_idx)

    prices = prices.reindex(interpolated_index)
    prices.index.set_names(["Field", "Date", "Time"], inplace=True)

    # Apply times filter if needed (see Notes in docstring)
    if times and realtime_agg_dbs:
        if not isinstance(times, (list, tuple)):
            times = [times]
        prices = prices.loc[prices.index.get_level_values("Time").isin(times)]

    return prices
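
A usage sketch illustrating the Notes section: a history database and a real-time aggregate database are queried together, with times specified in the exchange timezone. The database codes are hypothetical and the import path is an assumption:

from quantrocket import get_prices  # assumed import path

# History db listed first, so it takes priority for overlapping timestamps;
# the real-time aggregate db supplies the most recent bars.
prices = get_prices(
    ["usa-stk-1min", "usa-stk-trades-1min"],  # hypothetical database codes
    start_date="2019-01-01",
    fields=["Close", "Volume", "LastPriceClose", "VolumeClose"],
    times=["09:30:00", "15:45:00"],  # exchange-local times for both db types
)

# Intraday bars yield a (Field, Date, Time) MultiIndex
session_closes = prices.loc["Close"].xs("15:45:00", level="Time")
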
Code Example #4
def get_reuters_estimates_reindexed_like(reindex_like,
                                         codes,
                                         fields=["Actual"],
                                         period_types=["Q"],
                                         max_lag=None):
    """
    Return a multiindex (Indicator, Field, Date) DataFrame of point-in-time
    Reuters estimates and actuals for one or more indicator codes, reindexed
    to match the index (dates) and columns (conids) of `reindex_like`.
    Estimates and actuals are forward-filled in order to provide the latest
    reading at any given date, indexed to the UpdatedDate field.
    UpdatedDate is shifted forward 1 day to avoid lookahead bias.

    Parameters
    ----------
    reindex_like : DataFrame, required
        a DataFrame (usually of prices) with dates for the index and conids
        for the columns, to which the shape of the resulting DataFrame will
        be conformed

    codes : list of str, required
        the indicator code(s) to query. Use the `list_reuters_codes`
        function to see available codes.

    fields : list of str
        a list of fields to include in the resulting DataFrame. Defaults to
        simply including the Actual field.

    period_types : list of str, optional
        limit to these fiscal period types. Possible choices: A, Q, S, where
        A=Annual, Q=Quarterly, S=Semi-Annual. Default is Q/Quarterly.

    max_lag : str, optional
        maximum amount of time a data point can be used after the
        associated fiscal period has ended. Setting a limit can prevent
        using data that is reported long after the fiscal period ended, or
        can limit how far data is forward filled in the absence of subsequent
        data. Specify as a Pandas offset alias, e.g. '500D'. By default, no
        maximum limit is applied.

    Returns
    -------
    DataFrame
        a multiindex (Indicator, Field, Date) DataFrame of estimates and actuals,
        shaped like the input DataFrame

    Examples
    --------
    Query book value per share (code BVPS):

    >>> closes = prices.loc["Close"]
    >>> estimates = get_reuters_estimates_reindexed_like(closes, codes=["BVPS"])
    >>> book_values_per_share = estimates.loc["BVPS"].loc["Actual"]
    """
    try:
        import pandas as pd
    except ImportError:
        raise ImportError("pandas must be installed to use this function")

    index_levels = reindex_like.index.names
    if "Time" in index_levels:
        raise ParameterError(
            "reindex_like should not have 'Time' in index, please take a cross-section first, "
            "for example: `prices.loc['Close'].xs('15:45:00', level='Time')`")

    if index_levels != ["Date"]:
        raise ParameterError(
            "reindex_like must have index called 'Date', but has {0}".format(
                ",".join([str(name) for name in index_levels])))

    if not hasattr(reindex_like.index, "date"):
        raise ParameterError("reindex_like must have a DatetimeIndex")

    conids = list(reindex_like.columns)
    start_date = reindex_like.index.min().date()
    # Since financial reports are sparse, start well before the reindex_like
    # min date
    start_date -= pd.Timedelta(days=365 + 180)
    start_date = start_date.isoformat()
    end_date = reindex_like.index.max().date().isoformat()

    f = six.StringIO()
    query_fields = list(
        fields
    )  # copy fields on Py2 or 3: https://stackoverflow.com/a/2612815/417414
    if "UpdatedDate" not in query_fields:
        query_fields.append("UpdatedDate")
    download_reuters_estimates(codes,
                               f,
                               conids=conids,
                               start_date=start_date,
                               end_date=end_date,
                               fields=query_fields,
                               period_types=period_types)
    parse_dates = ["UpdatedDate", "FiscalPeriodEndDate"]
    if "AnnounceDate" in fields:
        parse_dates.append("AnnounceDate")
    estimates = pd.read_csv(f, parse_dates=parse_dates)

    # Drop records with no UpdatedDate (i.e. records for which no actuals
    # have been reported)
    estimates = estimates.loc[estimates.UpdatedDate.notnull()]

    # Convert UTC UpdatedDate to security timezone, and cast to date for
    # index
    f = six.StringIO()
    download_master_file(f,
                         conids=list(estimates.ConId.unique()),
                         delisted=True,
                         fields=["Timezone"])
    timezones = pd.read_csv(f, index_col="ConId")
    estimates = estimates.join(timezones, on="ConId")
    if estimates.Timezone.isnull().any():
        conids_missing_timezones = list(
            estimates.ConId[estimates.Timezone.isnull()].unique())
        raise MissingData(
            "timezones are missing for some conids so cannot convert UTC "
            "estimates to timezone of security (conids missing timezone: {0})".
            format(",".join([str(conid)
                             for conid in conids_missing_timezones])))

    # If only 1 timezone in data, use a faster method
    if len(estimates.Timezone.unique()) == 1:
        timezone = list(estimates.Timezone.unique())[0]
        estimates["Date"] = pd.to_datetime(
            estimates.UpdatedDate.values).tz_localize("UTC").tz_convert(
                timezone)
    else:
        estimates["Date"] = estimates.apply(
            lambda row: row.UpdatedDate.tz_localize("UTC").tz_convert(
                row.Timezone),
            axis=1)

    # Convert to dates (i.e. time = 00:00:00)
    estimates.loc[:, "Date"] = pd.to_datetime(
        estimates.Date.apply(
            lambda x: pd.Timestamp.combine(x,
                                           pd.Timestamp("00:00:00").time())))

    # Drop any fields we don't need
    needed_fields = set(fields)
    needed_fields.update(set(("ConId", "Date", "Indicator")))
    if max_lag:
        needed_fields.add("FiscalPeriodEndDate")
    unneeded_fields = set(estimates.columns) - needed_fields
    if unneeded_fields:
        estimates = estimates.drop(unneeded_fields, axis=1)

    # if reindex_like.index is tz-aware, make estimates tz-aware so they can
    # be joined (tz-aware or tz-naive are both fine, as the Date column has
    # already been converted to the local timezone of each security)
    if reindex_like.index.tz:
        estimates.loc[:, "Date"] = estimates.Date.dt.tz_localize(
            reindex_like.index.tz.zone)
        deduped_updated_dates = estimates.Date.drop_duplicates()
    else:
        # joining to tz-naive requires using values, whereas joining to
        # tz-aware requires not using it (the reason for this asymmetry is unclear)
        deduped_updated_dates = estimates.Date.drop_duplicates().values

    # Create a unioned index of input DataFrame and UpdatedDate
    union_date_idx = reindex_like.index.union(
        deduped_updated_dates).sort_values()

    all_estimates = {}
    for code in codes:
        estimates_for_code = estimates.loc[estimates.Indicator == code]
        if estimates_for_code.empty:
            continue
        if "Indicator" not in fields:
            estimates_for_code = estimates_for_code.drop("Indicator", axis=1)
        # There might be duplicate UpdatedDates if a company announced
        # reports for several fiscal periods at once. In this case we keep
        # only the last value (i.e. latest fiscal period)
        estimates_for_code = estimates_for_code.drop_duplicates(
            subset=["ConId", "Date"], keep="last")
        estimates_for_code = estimates_for_code.pivot(index="ConId",
                                                      columns="Date").T
        multiidx = pd.MultiIndex.from_product(
            (estimates_for_code.index.get_level_values(0).unique(),
             union_date_idx),
            names=["Field", "Date"])
        estimates_for_code = estimates_for_code.reindex(
            index=multiidx, columns=reindex_like.columns)

        # estimates are sparse so ffill (one field at a time)
        all_fields_for_code = {}
        for field in estimates_for_code.index.get_level_values(
                "Field").unique():
            field_for_code = estimates_for_code.loc[field].fillna(
                method="ffill")

            # Shift to avoid lookahead bias
            field_for_code = field_for_code.shift()

            all_fields_for_code[field] = field_for_code

        # Filter stale values if asked to
        if max_lag:
            fiscal_period_end_dates = all_fields_for_code[
                "FiscalPeriodEndDate"]
            # subtract the max_lag from the index date to get the
            # earliest possible fiscal period end date for that row
            earliest_allowed_fiscal_period_end_dates = fiscal_period_end_dates.apply(
                lambda x: fiscal_period_end_dates.index - pd.Timedelta(max_lag
                                                                       ))
            within_max_timedelta = fiscal_period_end_dates.apply(
                pd.to_datetime) >= earliest_allowed_fiscal_period_end_dates

            for field, field_for_code in all_fields_for_code.items():
                field_for_code = field_for_code.where(within_max_timedelta)
                all_fields_for_code[field] = field_for_code

            # Clean up if FiscalPeriodEndDate was just kept around for this purpose
            if "FiscalPeriodEndDate" not in fields:
                del all_fields_for_code["FiscalPeriodEndDate"]

        estimates_for_code = pd.concat(all_fields_for_code,
                                       names=["Field", "Date"])

        # In case the estimates included dates not in the input
        # DataFrame, drop those now that we've ffilled
        extra_dates = union_date_idx.difference(reindex_like.index)
        if not extra_dates.empty:
            estimates_for_code.drop(extra_dates,
                                    axis=0,
                                    level="Date",
                                    inplace=True)

        all_estimates[code] = estimates_for_code

    estimates = pd.concat(all_estimates, names=["Indicator", "Field", "Date"])

    return estimates
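
A usage sketch extending the docstring example above into a price-to-book ratio (the import path is an assumption, and `prices` is assumed to be a daily prices DataFrame like the one returned by get_prices in Code Example #3):

from quantrocket.fundamental import get_reuters_estimates_reindexed_like  # assumed path

closes = prices.loc["Close"]  # daily closes, dates x conids
estimates = get_reuters_estimates_reindexed_like(closes, codes=["BVPS"])
book_values_per_share = estimates.loc["BVPS"].loc["Actual"]

# Because the result is reindexed like `closes`, elementwise math lines up
price_to_book = closes / book_values_per_share
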
Code Example #5
def get_reuters_financials_reindexed_like(reindex_like,
                                          coa_codes,
                                          fields=["Amount"],
                                          interim=False,
                                          exclude_restatements=False,
                                          max_lag=None):
    """
    Return a multiindex (CoaCode, Field, Date) DataFrame of point-in-time
    Reuters financial statements for one or more Chart of Account (COA)
    codes, reindexed to match the index (dates) and columns (conids) of
    `reindex_like`. Financial values are forward-filled in order to provide
    the latest reading at any given date. Financials are indexed to the
    SourceDate field, i.e. the date on which the financial statement was
    released. SourceDate is shifted forward 1 day to avoid lookahead bias.

    Parameters
    ----------
    reindex_like : DataFrame, required
        a DataFrame (usually of prices) with dates for the index and conids
        for the columns, to which the shape of the resulting DataFrame will
        be conformed

    coa_codes : list of str, required
        the Chart of Account (COA) code(s) to query. Use the `list_reuters_codes`
        function to see available codes.

    fields : list of str
        a list of fields to include in the resulting DataFrame. Defaults to
        simply including the Amount field.

    interim : bool
        query interim/quarterly reports (default is to query annual reports,
        which provide deeper history)

    exclude_restatements : bool, optional
        exclude restatements (default is to include them)

    max_lag : str, optional
        maximum amount of time a data point can be used after the
        associated fiscal period has ended. Setting a limit can prevent
        using data that is reported long after the fiscal period ended, or
        can limit how far data is forward filled in the absence of subsequent
        data. Specify as a Pandas offset alias, e.g. '500D'. By default, no
        maximum limit is applied.

    Returns
    -------
    DataFrame
        a multiindex (CoaCode, Field, Date) DataFrame of financials,
        shaped like the input DataFrame

    Examples
    --------
    Let's calculate book value per share, defined as:

        (Total Assets - Total Liabilities) / Number of shares outstanding

    The COA codes for these metrics are 'ATOT' (Total Assets), 'LTLL' (Total
    Liabilities), and 'QTCO' (Total Common Shares Outstanding).


    >>> closes = prices.loc["Close"]
    >>> financials = get_reuters_financials_reindexed_like(closes, coa_codes=["ATOT", "LTLL", "QTCO"])
    >>> tot_assets = financials.loc["ATOT"].loc["Amount"]
    >>> tot_liabilities = financials.loc["LTLL"].loc["Amount"]
    >>> shares_out = financials.loc["QTCO"].loc["Amount"]
    >>> book_values_per_share = (tot_assets - tot_liabilities)/shares_out

    """
    try:
        import pandas as pd
    except ImportError:
        raise ImportError("pandas must be installed to use this function")

    index_levels = reindex_like.index.names
    if "Time" in index_levels:
        raise ParameterError(
            "reindex_like should not have 'Time' in index, please take a cross-section first, "
            "for example: `prices.loc['Close'].xs('15:45:00', level='Time')`")

    if index_levels != ["Date"]:
        raise ParameterError(
            "reindex_like must have index called 'Date', but has {0}".format(
                ",".join([str(name) for name in index_levels])))

    if not hasattr(reindex_like.index, "date"):
        raise ParameterError("reindex_like must have a DatetimeIndex")

    conids = list(reindex_like.columns)
    start_date = reindex_like.index.min().date()
    # Since financial reports are sparse, start well before the reindex_like
    # min date
    start_date -= pd.Timedelta(days=365 + 180)
    start_date = start_date.isoformat()
    end_date = reindex_like.index.max().date().isoformat()

    f = six.StringIO()
    download_reuters_financials(coa_codes,
                                f,
                                conids=conids,
                                start_date=start_date,
                                end_date=end_date,
                                fields=fields,
                                interim=interim,
                                exclude_restatements=exclude_restatements)
    financials = pd.read_csv(f,
                             parse_dates=["SourceDate", "FiscalPeriodEndDate"])

    # Rename SourceDate to match price history index name
    financials = financials.rename(columns={"SourceDate": "Date"})

    # Drop any fields we don't need
    needed_fields = set(fields)
    needed_fields.update(set(("ConId", "Date", "CoaCode")))
    if max_lag:
        needed_fields.add("FiscalPeriodEndDate")
    unneeded_fields = set(financials.columns) - needed_fields
    if unneeded_fields:
        financials = financials.drop(unneeded_fields, axis=1)

    # if reindex_like.index is tz-aware, make financials tz-aware so they can
    # be joined (tz-aware or tz-naive are both fine, as SourceDate represents
    # dates which are assumed to be in the local timezone of the reported
    # company)
    if reindex_like.index.tz:
        financials.loc[:, "Date"] = financials.Date.dt.tz_localize(
            reindex_like.index.tz.zone)
        deduped_source_dates = financials.Date.drop_duplicates()
    else:
        # joining to tz-naive requires using values, whereas joining to
        # tz-aware requires not using it (the reason for this asymmetry is unclear)
        deduped_source_dates = financials.Date.drop_duplicates().values

    # Create a unioned index of input DataFrame and statement SourceDates
    union_date_idx = reindex_like.index.union(
        deduped_source_dates).sort_values()

    all_financials = {}
    for code in coa_codes:
        financials_for_code = financials.loc[financials.CoaCode == code]
        if financials_for_code.empty:
            continue
        if "CoaCode" not in fields:
            financials_for_code = financials_for_code.drop("CoaCode", axis=1)
        # There might be duplicate SourceDates if a company announced
        # reports for several fiscal periods at once. In this case we keep
        # only the last value (i.e. latest fiscal period)
        financials_for_code = financials_for_code.drop_duplicates(
            subset=["ConId", "Date"], keep="last")
        financials_for_code = financials_for_code.pivot(index="ConId",
                                                        columns="Date").T
        multiidx = pd.MultiIndex.from_product(
            (financials_for_code.index.get_level_values(0).unique(),
             union_date_idx),
            names=["Field", "Date"])
        financials_for_code = financials_for_code.reindex(
            index=multiidx, columns=reindex_like.columns)

        # financial values are sparse so ffill (one field at a time)
        all_fields_for_code = {}
        for field in financials_for_code.index.get_level_values(
                "Field").unique():
            field_for_code = financials_for_code.loc[field].fillna(
                method="ffill")

            # Shift to avoid lookahead bias
            field_for_code = field_for_code.shift()

            all_fields_for_code[field] = field_for_code

        # Filter stale values if asked to
        if max_lag:
            fiscal_period_end_dates = all_fields_for_code[
                "FiscalPeriodEndDate"]
            # subtract the max_lag from the index date to get the
            # earliest possible fiscal period end date for that row
            earliest_allowed_fiscal_period_end_dates = fiscal_period_end_dates.apply(
                lambda x: fiscal_period_end_dates.index - pd.Timedelta(max_lag
                                                                       ))
            within_max_timedelta = fiscal_period_end_dates.apply(
                pd.to_datetime) >= earliest_allowed_fiscal_period_end_dates

            for field, field_for_code in all_fields_for_code.items():
                field_for_code = field_for_code.where(within_max_timedelta)
                all_fields_for_code[field] = field_for_code

            # Clean up if FiscalPeriodEndDate was just kept around for this purpose
            if "FiscalPeriodEndDate" not in fields:
                del all_fields_for_code["FiscalPeriodEndDate"]

        financials_for_code = pd.concat(all_fields_for_code,
                                        names=["Field", "Date"])

        # In case the statements included dates not in the input
        # DataFrame, drop those now that we've ffilled
        extra_dates = union_date_idx.difference(reindex_like.index)
        if not extra_dates.empty:
            financials_for_code.drop(extra_dates,
                                     axis=0,
                                     level="Date",
                                     inplace=True)

        all_financials[code] = financials_for_code

    financials = pd.concat(all_financials, names=["CoaCode", "Field", "Date"])

    return financials
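
A usage sketch extending the docstring example above into a price-to-book ratio (the import path is an assumption; `prices` is assumed to be a daily prices DataFrame as in Code Example #3):

from quantrocket.fundamental import get_reuters_financials_reindexed_like  # assumed path

closes = prices.loc["Close"]  # daily closes, dates x conids
financials = get_reuters_financials_reindexed_like(
    closes, coa_codes=["ATOT", "LTLL", "QTCO"], interim=True)

tot_assets = financials.loc["ATOT"].loc["Amount"]
tot_liabilities = financials.loc["LTLL"].loc["Amount"]
shares_out = financials.loc["QTCO"].loc["Amount"]

book_values_per_share = (tot_assets - tot_liabilities) / shares_out
price_to_book = closes / book_values_per_share
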
Code Example #6
def _get_stockloan_data_reindexed_like(stockloan_func,
                                       stockloan_field,
                                       reindex_like,
                                       time=None):
    """
    Common base function for get_shortable_shares_reindexed_like and
    get_borrow_fees_reindexed_like.
    """
    try:
        import pandas as pd
    except ImportError:
        raise ImportError("pandas must be installed to use this function")

    index_levels = reindex_like.index.names
    if "Time" in index_levels:
        raise ParameterError(
            "reindex_like should not have 'Time' in index, please take a cross-section first, "
            "for example: `prices.loc['Close'].xs('15:45:00', level='Time')`")

    if index_levels != ["Date"]:
        raise ParameterError(
            "reindex_like must have index called 'Date', but has {0}".format(
                ",".join([str(name) for name in index_levels])))

    if not hasattr(reindex_like.index, "date"):
        raise ParameterError("reindex_like must have a DatetimeIndex")

    conids = list(reindex_like.columns)
    start_date = reindex_like.index.min().date()
    # Stockloan data is sparse but batched in monthly files, so start >1-month
    # before the reindex_like min date
    start_date -= pd.Timedelta(days=45)
    start_date = start_date.isoformat()
    end_date = reindex_like.index.max().date().isoformat()

    f = six.StringIO()
    stockloan_func(f, conids=conids, start_date=start_date, end_date=end_date)
    stockloan_data = pd.read_csv(f)
    stockloan_data.loc[:, "Date"] = pd.to_datetime(stockloan_data.Date,
                                                   utc=True)

    # Determine timezone, from:
    # - time param if provided
    # - else reindex_like.index.tz if set
    # - else component securities if all have same timezone
    timezone = None

    if time and " " in time:
        time, timezone = time.split(" ", 1)

        if reindex_like.index.tz and reindex_like.index.tz.zone != timezone:
            raise ParameterError((
                "cannot use timezone {0} because reindex_like timezone is {1}, "
                "these must match".format(timezone,
                                          reindex_like.index.tz.zone)))

    if not timezone:
        if reindex_like.index.tz:
            timezone = reindex_like.index.tz.zone
        else:
            # try to infer from component securities
            f = six.StringIO()
            download_master_file(f,
                                 conids=list(stockloan_data.ConId.unique()),
                                 delisted=True,
                                 fields=["Timezone"])
            security_timezones = pd.read_csv(f, index_col="ConId")
            security_timezones = list(security_timezones.Timezone.unique())
            if len(security_timezones) > 1:
                raise ParameterError(
                    "no timezone specified and cannot infer because multiple timezones are "
                    "present in data, please specify timezone (timezones in data: {0})"
                    .format(", ".join(security_timezones)))
            timezone = security_timezones[0]

    # Create an index of `reindex_like` dates at `time`
    if time:
        try:
            time = pd.Timestamp(time).time()
        except ValueError as e:
            raise ParameterError("could not parse time '{0}': {1}".format(
                time, str(e)))
        index_at_time = pd.Index(reindex_like.index.to_series().apply(
            lambda x: pd.Timestamp.combine(x, time)))
    else:
        index_at_time = reindex_like.index

    if not index_at_time.tz:
        index_at_time = index_at_time.tz_localize(timezone)

    index_at_time = index_at_time.tz_convert("UTC")

    stockloan_data = stockloan_data.pivot(index="ConId", columns="Date").T
    stockloan_data = stockloan_data.loc[stockloan_field]

    # Create a unioned index of requested times and stockloan data timestamps
    unioned_idx = index_at_time.union(stockloan_data.index)
    # performance boost: don't need to reindex and ffill earlier than the
    # stockloan data start date
    unioned_idx = unioned_idx[unioned_idx >= stockloan_data.index.min()]
    stockloan_data = stockloan_data.reindex(index=unioned_idx,
                                            columns=reindex_like.columns)

    stockloan_data = stockloan_data.fillna(method="ffill")

    # Keep only the requested times, now that we've ffilled
    stockloan_data = stockloan_data.reindex(index=index_at_time)

    # Replace index_at_time with the original reindex_like index (this needs
    # to be done because index_at_time is tz-aware and reindex_like may not
    # be, and because index_at_time uses the requested `time` if provided.
    # Since index_at_time was derived from reindex_like.index and has the
    # same shape, it is safe to replace it.)
    stockloan_data.index = reindex_like.index

    return stockloan_data
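
As the docstring notes, this is the common base for get_shortable_shares_reindexed_like and get_borrow_fees_reindexed_like. Below is a hedged sketch of how such a wrapper might delegate to it; the downloader function and the "Quantity" field name are assumptions for illustration and may differ from the library's actual wrappers:

def get_shortable_shares_reindexed_like(reindex_like, time=None):
    """
    Return a DataFrame of shortable share quantities, shaped like
    `reindex_like` (see the base function for the `time` semantics).
    """
    return _get_stockloan_data_reindexed_like(
        download_shortable_shares,  # assumed downloader (writes CSV to a buffer)
        "Quantity",                 # assumed field name in the stockloan data
        reindex_like,
        time=time)
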
Code Example #7
def create_agg_db(code, tick_db_code, bar_size, fields=None):
    """
    Create an aggregate database from a tick database.

    Aggregate databases provide rolled-up views of the underlying tick data,
    aggregated to a desired frequency (such as 1-minute bars).

    Parameters
    ----------
    code : str, required
        the code to assign to the aggregate database (lowercase alphanumerics and hyphens only)

    tick_db_code : str, required
        the code of the tick database to aggregate

    bar_size : str, required
        the time frequency to aggregate to (use a Pandas timedelta string, for example
        10s or 1m or 2h or 1d)

    fields : dict of list of str, optional
        include these fields in aggregate database, aggregated in these ways. Provide a dict
        mapping tick db fields to lists of aggregate functions to apply to the field. Available
        aggregate functions are "Close", "Open", "High", "Low", "Mean", "Sum", and "Count".
        See examples section. If not specified, defaults to including the "Close" for each tick
        db field.

    Returns
    -------
    dict
        status message

    Examples
    --------
    Create an aggregate database of 1 minute bars consisting of OHLC trades and volume,
    from a tick database of US stocks, resulting in fields called LastPriceOpen, LastPriceHigh,
    LastPriceLow, LastPriceClose, and VolumeClose:

    >>> create_agg_db("usa-stk-trades-1min", tick_db_code="usa-stk-trades",
                      bar_size="1m",
                      fields={"LastPrice":["Open","High","Low","Close"],
                              "Volume": ["Close"]})

    Create an aggregate database of 1 second bars containing the closing bid and ask and
    the mean bid size and ask size, from a tick database of futures trades and
    quotes, resulting in fields called BidPriceClose, AskPriceClose, BidSizeMean, and AskSizeMean:

    >>> create_agg_db("globex-fut-taq-1sec", tick_db_code="globex-fut-taq",
                      bar_size="1s",
                      fields={"BidPrice":["Close"],
                              "AskPrice": ["Close"],
                              "BidSize": ["Mean"],
                              "AskSize": ["Mean"]
                              })
    """
    params = {}
    params["bar_size"] = bar_size
    if fields:
        if not isinstance(fields, dict):
            raise ParameterError("fields must be a dict")

        # convert lists to comma-separated strings
        _fields = {}
        for k, v in fields.items():
            if isinstance(v, (list, tuple)):
                v = ",".join(v)
            _fields[k] = v
        params["fields"] = dict_to_dict_strs(_fields)

    response = houston.put("/realtime/databases/{0}/aggregates/{1}".format(tick_db_code, code), params=params)

    houston.raise_for_status_with_json(response)
    return response.json()
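
A usage sketch tying the first docstring example to Code Example #3: once the aggregate database has collected data, it can be queried like any other price database. The database codes are hypothetical and the import paths are assumptions:

from quantrocket.realtime import create_agg_db  # assumed import path
from quantrocket import get_prices              # assumed import path

create_agg_db("usa-stk-trades-1min", tick_db_code="usa-stk-trades",
              bar_size="1m",
              fields={"LastPrice": ["Open", "High", "Low", "Close"],
                      "Volume": ["Close"]})

# The aggregate fields are named <TickField><AggFunc>, e.g. LastPriceClose
prices = get_prices("usa-stk-trades-1min",
                    fields=["LastPriceClose", "VolumeClose"])
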
Code Example #8
def get_historical_prices(codes,
                          start_date=None,
                          end_date=None,
                          universes=None,
                          conids=None,
                          exclude_universes=None,
                          exclude_conids=None,
                          times=None,
                          cont_fut=None,
                          fields=None,
                          master_fields=None,
                          timezone=None,
                          infer_timezone=None):
    """
    Query one or more history databases and load prices into a DataFrame.

    For bar sizes smaller than 1-day, the resulting DataFrame will have a MultiIndex
    with levels (Field, Date, Time). For bar sizes of 1-day or larger, the MultiIndex
    will have levels (Field, Date).

    Parameters
    ----------
    codes : str or list of str, required
        the code(s) of one or more databases to query. If multiple databases
        are specified, they must have the same bar size.

    start_date : str (YYYY-MM-DD), optional
        limit to history on or after this date

    end_date : str (YYYY-MM-DD), optional
        limit to history on or before this date

    universes : list of str, optional
        limit to these universes (default is to return all securities in database)

    conids : list of int, optional
        limit to these conids

    exclude_universes : list of str, optional
        exclude these universes

    exclude_conids : list of int, optional
        exclude these conids

    times : list of str (HH:MM:SS), optional
        limit to these times

    cont_fut : str
        stitch futures into continuous contracts using this method (default is not
        to stitch together). Possible choices: concat

    fields : list of str, optional
        only return these fields (pass ['?'] or any invalid fieldname to see
        available fields)

    master_fields : list of str, optional
        append these fields from the securities master database (pass ['?'] or any
        invalid fieldname to see available fields)

    timezone : str, optional
        convert timestamps to this timezone, for example America/New_York (see
        `pytz.all_timezones` for choices); ignored for non-intraday bar sizes

    infer_timezone : bool
        infer the timezone from the securities master Timezone field; defaults to
        True if using intraday bars and no `timezone` specified; ignored for
        non-intraday bars, or if `timezone` is passed

    Returns
    -------
    DataFrame
        a MultiIndex DataFrame of prices

    Examples
    --------
    Load intraday prices:

    >>> prices = get_historical_prices('stk-sample-5min', fields=["Close", "Volume"], timezone="America/New_York")
    >>> prices.head()
    ConId                       265598  38708077
    Field Date       Time
    Close 2017-07-26 09:30:00   153.62    2715.0
                     09:35:00   153.46    2730.0
                     09:40:00   153.21    2725.0
                     09:45:00   153.28    2725.0
                     09:50:00   153.18    2725.0

    Isolate the closes:

    >>> closes = prices.loc["Close"]
    >>> closes.head()
    ConId                265598  38708077
    Date       Time
    2017-07-26 09:30:00  153.62    2715.0
               09:35:00  153.46    2730.0
               09:40:00  153.21    2725.0
               09:45:00  153.28    2725.0
               09:50:00  153.18    2725.0

    Isolate the 15:45:00 prices:

    >>> session_closes = closes.xs("15:45:00", level="Time")
    >>> session_closes.head()
    ConId       265598  38708077
    Date
    2017-07-26  153.29   2700.00
    2017-07-27  150.10   2660.00
    2017-07-28  149.43   2650.02
    2017-07-31  148.99   2650.34
    2017-08-01  149.72   2675.50
    """
    # Import pandas lazily since it can take a moment to import
    try:
        import pandas as pd
    except ImportError:
        raise ImportError("pandas must be installed to use this function")

    try:
        import pytz
    except ImportError:
        raise ImportError("pytz must be installed to use this function")

    if timezone and timezone not in pytz.all_timezones:
        raise ParameterError(
            "invalid timezone: {0} (see `pytz.all_timezones` for choices)".
            format(timezone))

    dbs = codes
    if not isinstance(dbs, (list, tuple)):
        dbs = [dbs]

    if master_fields:
        if isinstance(master_fields, tuple):
            master_fields = list(master_fields)
        elif not isinstance(master_fields, list):
            master_fields = [master_fields]

    db_universes = set()
    db_bar_sizes = set()
    for db in dbs:
        db_config = get_db_config(db)
        _db_universes = db_config.get("universes", None)
        if _db_universes:
            db_universes.update(set(_db_universes))
        bar_size = db_config.get("bar_size")
        db_bar_sizes.add(bar_size)

    db_universes = list(db_universes)
    db_bar_sizes = list(db_bar_sizes)

    if len(db_bar_sizes) > 1:
        raise ParameterError(
            "all databases must contain same bar size but {0} have different "
            "bar sizes: {1}".format(", ".join(dbs), ", ".join(db_bar_sizes)))

    all_prices = []

    for db in dbs:

        kwargs = dict(start_date=start_date,
                      end_date=end_date,
                      universes=universes,
                      conids=conids,
                      exclude_universes=exclude_universes,
                      exclude_conids=exclude_conids,
                      times=times,
                      cont_fut=cont_fut,
                      fields=fields,
                      tz_naive=False)

        tmp_filepath = "{0}/history.{1}.{2}.{3}.csv".format(
            TMP_DIR, db, os.getpid(), time.time())
        download_history_file(db, tmp_filepath, **kwargs)

        prices = pd.read_csv(tmp_filepath)
        all_prices.append(prices)

        os.remove(tmp_filepath)

    prices = pd.concat(all_prices)

    prices = prices.pivot(index="ConId", columns="Date").T
    prices.index.set_names(["Field", "Date"], inplace=True)

    is_intraday = db_bar_sizes[0] not in ("1 day", "1 week", "1 month")

    if is_intraday and not timezone and infer_timezone is not False:
        infer_timezone = True
        if not master_fields:
            master_fields = []
        if "Timezone" not in master_fields:
            master_fields.append("Timezone")

    # Next, get the master file
    if master_fields:
        universes = universes
        conids = conids
        if not conids and not universes:
            universes = db_universes
            if not universes:
                conids = list(prices.columns)

        f = six.StringIO()
        download_master_file(f,
                             conids=conids,
                             universes=universes,
                             exclude_conids=exclude_conids,
                             exclude_universes=exclude_universes,
                             fields=master_fields,
                             delisted=True)
        securities = pd.read_csv(f, index_col="ConId")

        if "Delisted" in securities.columns:
            securities.loc[:, "Delisted"] = securities.Delisted.astype(bool)

        if "Etf" in securities.columns:
            securities.loc[:, "Etf"] = securities.Etf.astype(bool)

        # Append securities, indexed to the min date, to allow easy ffill on demand
        securities = pd.DataFrame(securities.T, columns=prices.columns)
        securities.index.name = "Field"
        idx = pd.MultiIndex.from_product(
            (securities.index, [prices.index.get_level_values("Date").min()]),
            names=["Field", "Date"])

        securities = securities.reindex(index=idx, level="Field")
        prices = pd.concat((prices, securities))

    if is_intraday:
        dates = pd.to_datetime(prices.index.get_level_values("Date"), utc=True)

        if not timezone and infer_timezone:
            timezone = _infer_timezone(prices)

        if timezone:
            dates = dates.tz_convert(timezone)
    else:
        dates = pd.to_datetime(prices.index.get_level_values("Date"))

    prices.index = pd.MultiIndex.from_arrays(
        (prices.index.get_level_values("Field"), dates),
        names=("Field", "Date"))

    # Split date and time
    dts = prices.index.get_level_values("Date")
    dates = pd.to_datetime(dts.date).tz_localize(
        None)  # drop tz-aware in Date index
    prices.index = pd.MultiIndex.from_arrays(
        (prices.index.get_level_values("Field"), dates,
         dts.strftime("%H:%M:%S")),
        names=["Field", "Date", "Time"])

    # Align dates if there are any duplicates. Explanation: Suppose there are
    # two timezones represented in the data. After parsing these dates into a
    # common timezone, they will align properly, but we pivoted before
    # parsing the dates (for performance reasons), so they may not be
    # aligned. Thus we need to dedupe the index.
    prices = prices.groupby(prices.index).first()
    prices.index = pd.MultiIndex.from_tuples(prices.index)
    prices.index.set_names(["Field", "Date", "Time"], inplace=True)

    # Drop time if not intraday
    if not is_intraday:
        prices.index = prices.index.droplevel("Time")
        return prices

    # If intraday, fill missing times so that each date has the same set of
    # times, allowing easier comparisons. Example implications:
    # - if history is retrieved intraday, this ensures that today will have NaN
    #   entries for future times
    # - early close dates will have a full set of times, with NaNs after the
    #   early close
    unique_fields = prices.index.get_level_values("Field").unique()
    unique_dates = prices.index.get_level_values("Date").unique()
    unique_times = prices.index.get_level_values("Time").unique()
    interpolated_index = None
    for field in unique_fields:
        if master_fields and field in master_fields:
            min_date = prices.loc[field].index.min()
            field_idx = pd.MultiIndex.from_tuples([(field, min_date[0],
                                                    min_date[1])])
        else:
            field_idx = pd.MultiIndex.from_product([[field], unique_dates,
                                                    unique_times])
        if interpolated_index is None:
            interpolated_index = field_idx
        else:
            interpolated_index = interpolated_index.append(field_idx)

    prices = prices.reindex(interpolated_index)
    prices.index.set_names(["Field", "Date", "Time"], inplace=True)

    return prices
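
A usage sketch following the docstring example: pull 5-minute bars, take the 15:45:00 cross-section, and compute daily returns. The import path is an assumption:

from quantrocket.history import get_historical_prices  # assumed import path

prices = get_historical_prices("stk-sample-5min",
                               fields=["Close"],
                               timezone="America/New_York")
session_closes = prices.loc["Close"].xs("15:45:00", level="Time")
daily_returns = session_closes.pct_change()
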
Code Example #9
File: master.py  Project: jmscraig/quantrocket-client
def get_contract_nums_reindexed_like(reindex_like, limit=5):
    """
    From a DataFrame of futures (with dates as the index and conids as columns),
    return a DataFrame of integers representing each conid's sequence in the
    futures chain as of each date, where 1 is the front contract, 2 is the second
    nearest contract, etc.

    Sequences are based on the RolloverDate field in the securities master
    file, which is based on configurable rollover rules.

    Parameters
    ----------
    reindex_like : DataFrame, required
        a DataFrame (usually of prices) with dates for the index and conids
        for the columns, to which the shape of the resulting DataFrame will
        be conformed

    limit : int
        how many contracts ahead to sequence. For example, assuming quarterly
        contracts, a limit of 5 will sequence 5 quarters out. Default 5.

    Returns
    -------
    DataFrame
        a DataFrame of futures chain sequence numbers, shaped like the input
        DataFrame

    Examples
    --------
    Get a Boolean mask of front-month contracts:

    >>> closes = prices.loc["Close"]
    >>> contract_nums = get_contract_nums_reindexed_like(closes)
    >>> are_front_months = contract_nums == 1
    """
    try:
        import pandas as pd
    except ImportError:
        raise ImportError("pandas must be installed to use this function")

    index_levels = reindex_like.index.names

    if "Date" not in index_levels:
        raise ParameterError(
            "reindex_like must have index called 'Date', but has {0}".format(
                ",".join([str(name) for name in index_levels])))

    reindex_like_dt_index = reindex_like.index.get_level_values("Date")

    if not hasattr(reindex_like_dt_index, "date"):
        raise ParameterError("reindex_like must have a DatetimeIndex")

    f = six.StringIO()
    download_master_file(f,
                         conids=list(reindex_like.columns),
                         fields=["RolloverDate", "UnderConId", "SecType"])
    rollover_dates = pd.read_csv(f, parse_dates=["RolloverDate"])
    rollover_dates = rollover_dates[rollover_dates.SecType == "FUT"].drop(
        "SecType", axis=1)

    if rollover_dates.empty:
        raise ParameterError(
            "input DataFrame does not appear to contain any futures contracts")

    if reindex_like_dt_index.tz:
        rollover_dates.loc[:,
                           "RolloverDate"] = rollover_dates.RolloverDate.dt.tz_localize(
                               reindex_like_dt_index.tz.zone)

    min_date = reindex_like_dt_index.min()
    max_date = max(
        [rollover_dates.RolloverDate.max(),
         reindex_like_dt_index.max()])

    # Stack conids by underlying (1 column per underlying)
    rollover_dates = rollover_dates.set_index(["RolloverDate",
                                               "UnderConId"]).ConId.unstack()

    contract_nums = None

    for i in range(limit):

        # backshift conids
        _rollover_dates = rollover_dates.apply(
            lambda col: col.dropna().shift(-i))

        # Reindex to daily frequency
        _rollover_dates = _rollover_dates.reindex(
            index=pd.date_range(start=min_date, end=max_date))

        # RolloverDate is when we roll out of the contract, hence we backfill
        _rollover_dates = _rollover_dates.fillna(method="bfill")

        # Stack to Series of Date, nth conid
        _rollover_dates = _rollover_dates.stack()
        _rollover_dates.index = _rollover_dates.index.droplevel("UnderConId")
        _rollover_dates.index.name = "Date"

        # Pivot Series to DataFrame
        _rollover_dates = _rollover_dates.reset_index(name="ConId")
        _rollover_dates["ContractNum"] = i + 1
        _rollover_dates = _rollover_dates.set_index(["Date", "ConId"])
        _contract_nums = _rollover_dates.ContractNum.unstack()

        # If MultiIndex input, broadcast across Time level
        if len(index_levels) > 1:
            _contract_nums = _contract_nums.reindex(index=reindex_like.index,
                                                    level="Date")
            _contract_nums = _contract_nums.reindex(
                columns=reindex_like.columns)
        else:
            _contract_nums = _contract_nums.reindex(
                index=reindex_like_dt_index, columns=reindex_like.columns)

        if contract_nums is None:
            contract_nums = _contract_nums
        else:
            contract_nums = contract_nums.fillna(_contract_nums)

    return contract_nums
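
A usage sketch extending the docstring example: use the sequence numbers to keep only front-month prices. The import path is an assumption; `closes` is assumed to be a DataFrame of futures closes with dates as the index and conids as the columns:

from quantrocket.master import get_contract_nums_reindexed_like  # assumed import path

contract_nums = get_contract_nums_reindexed_like(closes, limit=2)
front_month_closes = closes.where(contract_nums == 1)  # NaN for non-front months
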