Example #1
    def _init(self, *args, **kwargs):
        # not sure why I have to use this awkward convention to get the param
        code = kwargs["params"][0][1]
        f = io.StringIO()
        download_master_file(f, universes=code, fields="Sid")
        self._real_sids = pd.read_csv(f).Sid.tolist()
        return super(Universe, self)._init(*args, **kwargs)
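The same master-file lookup can also be written as a small standalone helper; the following is a minimal sketch assuming the QuantRocket client is installed, and `sids_for_universe` is a hypothetical helper name:

import io

import pandas as pd
from quantrocket.master import download_master_file

def sids_for_universe(code):
    """Return the Sids belonging to a universe code (hypothetical helper)."""
    f = io.StringIO()
    download_master_file(f, universes=code, fields="Sid")
    return pd.read_csv(f).Sid.tolist()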
Example #2

def collect_combo(universe, contract_months, tick_db, until=None):
    """
    Creates a combo and initiates real-time data collection for it.

    Parameters
    ----------
    universe : str, required
        the universe of futures from which to select the contract months

    contract_months : tuple, required
        a tuple of contract months which should make up the legs of the spread,
        for example, (1, 2) for the first and second closest months to expiration

    tick_db : str, required
        the tick db code for real-time data collection

    until : str, optional
        collect real-time data until this time (for example, '16:30:00 America/New_York')
    """

    # query non-expired futures contracts
    f = io.StringIO()
    download_master_file(f,
                         universes=universe,
                         fields="RolloverDate",
                         exclude_expired=True)
    contracts = pd.read_csv(f,
                            index_col="RolloverDate",
                            parse_dates=["RolloverDate"])

    # sort by RolloverDate
    sids = contracts.Sid.sort_index().tolist()

    sid_1 = sids[contract_months[0] - 1]
    sid_2 = sids[contract_months[1] - 1]

    # Create the combo if it doesn't already exist
    result = create_ibkr_combo([["BUY", 1, sid_1], ["SELL", 1, sid_2]])

    combo_sid = result["sid"]

    # initiate data collection
    collect_market_data(tick_db, sids=combo_sid, until=until)
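
A usage sketch (the universe and tick database codes below are hypothetical and must already exist):

# build a calendar spread from the two front months of a futures universe
# named "cl-fut" and collect ticks into the "cl-combo-tick" database
collect_combo(
    "cl-fut",
    contract_months=(1, 2),
    tick_db="cl-combo-tick",
    until="16:30:00 America/New_York",
)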
Example #3

    def _get_securities(self):
        """
        Queries the master db and gets a securities master file.
        """
        f = io.StringIO()

        download_master_file(f,
                             universes=self.universes,
                             conids=self.conids,
                             exclude_universes=self.exclude_universes,
                             exclude_conids=self.exclude_conids,
                             delisted=True,
                             fields=[
                                 "ConId", "PrimaryExchange", "Symbol",
                                 "SecType", "LocalSymbol", "LongName",
                                 "MinTick", "Multiplier", "LastTradeDate",
                                 "ContractMonth", "Timezone", "UnderConId"
                             ])

        self.securities = pd.read_csv(
            f, index_col="ConId").sort_values(by="Symbol")
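
A hedged usage sketch, assuming `bundle` is an instance of the class this method belongs to (the result is stored on `self.securities` rather than returned):

bundle._get_securities()
# inspect a few of the downloaded master fields
print(bundle.securities[["Symbol", "SecType", "PrimaryExchange"]].head())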
Example #4
def get_prices(codes, start_date=None, end_date=None,
               universes=None, conids=None,
               exclude_universes=None, exclude_conids=None,
               times=None, fields=None,
               timezone=None, infer_timezone=None,
               cont_fut=None, master_fields=None):
    """
    Query one or more history databases and/or real-time aggregate databases
    and load prices into a DataFrame.

    For bar sizes smaller than 1-day, the resulting DataFrame will have a MultiIndex
    with levels (Field, Date, Time). For bar sizes of 1-day or larger, the MultiIndex
    will have levels (Field, Date).

    Parameters
    ----------
    codes : str or list of str, required
        the code(s) of one or more databases to query. If multiple databases
        are specified, they must have the same bar size. List databases in order of
        priority (highest priority first). If multiple databases provide the same
        field for the same conid on the same datetime, the first database's value will
        be used.

    start_date : str (YYYY-MM-DD), optional
        limit to data on or after this date

    end_date : str (YYYY-MM-DD), optional
        limit to data on or before this date

    universes : list of str, optional
        limit to these universes (default is to return all securities in database)

    conids : list of int, optional
        limit to these conids

    exclude_universes : list of str, optional
        exclude these universes

    exclude_conids : list of int, optional
        exclude these conids

    times : list of str (HH:MM:SS), optional
        limit to these times, specified in the timezone of the relevant exchange. See
        additional information in the Notes section regarding the timezone to use.

    fields : list of str, optional
        only return these fields. (If querying multiple databases that have different fields,
        provide the complete list of desired fields; only the supported fields for each
        database will be queried.)

    timezone : str, optional
        convert timestamps to this timezone, for example America/New_York (see
        `pytz.all_timezones` for choices); ignored for non-intraday bar sizes

    infer_timezone : bool
        infer the timezone from the securities master Timezone field; defaults to
        True if using intraday bars and no `timezone` specified; ignored for
        non-intraday bars, or if `timezone` is passed

    cont_fut : str
        stitch futures into continuous contracts using this method (default is not
        to stitch together). Only applicable to history databases. Possible choices:
        concat

    master_fields : list of str, optional
        [DEPRECATED] append these fields from the securities master database (pass ['?'] or any
        invalid fieldname to see available fields). This parameter is deprecated and
        will be removed in a future release. For better performance, use
        `quantrocket.master.get_securities_reindexed_like` to get securities master
        data shaped like prices.

    Returns
    -------
    DataFrame
        a MultiIndex DataFrame of prices

    Notes
    -----
    The `times` parameter, if provided, is applied differently for history databases vs
    real-time aggregate databases. For history databases, the parameter is applied when
    querying the database. For real-time aggregate databases, the parameter is not applied
    when querying the database; rather, all available times are retrieved and the `times`
    filter is applied to the resulting DataFrame after casting it to the appropriate timezone
    (as inferred from the securities master Timezone field or as explicitly specified with
    the `timezone` parameter). The rationale for this behavior is that history databases store
    intraday data in the timezone of the relevant exchange whereas real-time aggregate
    databases store data in UTC. By applying the `times` filter as described, users can specify
    the times in the timezone of the relevant exchange for both types of databases.

    Examples
    --------
    Load intraday prices:

    >>> prices = get_prices('stk-sample-5min', fields=["Close", "Volume"])
    >>> prices.head()
                                ConId   	265598	38708077
    Field	Date	        Time
    Close	2017-07-26      09:30:00	153.62	2715.0
                                09:35:00	153.46	2730.0
                                09:40:00	153.21	2725.0
                                09:45:00	153.28	2725.0
                                09:50:00	153.18	2725.0

    Isolate the closes:

    >>> closes = prices.loc["Close"]
    >>> closes.head()
                ConId	        265598  38708077
    Date        Time
    2017-07-26	09:30:00	153.62	2715.0
                09:35:00	153.46	2730.0
                09:40:00	153.21	2725.0
                09:45:00	153.28	2725.0
                09:50:00	153.18	2725.0

    Isolate the 15:45:00 prices:

    >>> session_closes = closes.xs("15:45:00", level="Time")
    >>> session_closes.head()
        ConId	265598	38708077
    Date
    2017-07-26	153.29	2700.00
    2017-07-27 	150.10	2660.00
    2017-07-28	149.43	2650.02
    2017-07-31 	148.99	2650.34
    2017-08-01 	149.72	2675.50
    """
    # Import pandas lazily since it can take a moment to import
    try:
        import pandas as pd
    except ImportError:
        raise ImportError("pandas must be installed to use this function")

    try:
        import pytz
    except ImportError:
        raise ImportError("pytz must be installed to use this function")

    if timezone and timezone not in pytz.all_timezones:
        raise ParameterError(
            "invalid timezone: {0} (see `pytz.all_timezones` for choices)".format(
                timezone))

    dbs = codes
    if not isinstance(dbs, (list, tuple)):
        dbs = [dbs]

    fields = fields or []
    if not isinstance(fields, (list, tuple)):
        fields = [fields]

    # separate history dbs from realtime dbs
    history_dbs = set(list_history_databases())
    realtime_dbs = list_realtime_databases()
    realtime_agg_dbs = set(itertools.chain(*realtime_dbs.values()))

    history_dbs.intersection_update(set(dbs))
    realtime_agg_dbs.intersection_update(set(dbs))

    unknown_dbs = set(dbs) - history_dbs - realtime_agg_dbs

    if unknown_dbs:
        tick_dbs = set(realtime_dbs.keys()).intersection(unknown_dbs)
        # Improve error message if possible
        if tick_dbs:
            raise ParameterError("{} is a real-time tick database, only history databases or "
                                 "real-time aggregate databases are supported".format(
                                     ", ".join(tick_dbs)))
        raise ParameterError(
            "no history or real-time aggregate databases called {}".format(
                ", ".join(unknown_dbs)))

    db_bar_sizes = set()
    db_bar_sizes_parsed = set()
    db_domains = set()
    history_db_fields = {}
    realtime_db_fields = {}

    for db in history_dbs:
        db_config = get_history_db_config(db)
        bar_size = db_config.get("bar_size")
        db_bar_sizes.add(bar_size)
        # to validate uniform bar sizes, we need to parse them in case dbs
        # store different but equivalent timedelta strings. History db
        # strings may need massaging to be parsable.
        if bar_size.endswith("s"):
            # strip s from secs, mins, hours to get valid pandas timedelta
            bar_size = bar_size[:-1]
        elif bar_size == "1 week":
            bar_size = "7 day"
        elif bar_size == "1 month":
            bar_size = "30 day"
        db_bar_sizes_parsed.add(pd.Timedelta(bar_size))
        db_domain = db_config.get("domain", "main")
        db_domains.add(db_domain)
        history_db_fields[db] = db_config.get("fields", [])

    for db in realtime_agg_dbs:
        db_config = get_realtime_db_config(db)
        bar_size = db_config.get("bar_size")
        db_bar_sizes.add(bar_size)
        db_bar_sizes_parsed.add(pd.Timedelta(bar_size))
        # NOTE: aggregate dbs don't include a domain key; if available, this
        # would probably have to be obtained from the associated tick db.
        # This isn't an issue unless/until real-time data comes from multiple
        # vendors. As it now stands, real-time data is always from "main".
        db_domain = db_config.get("domain", "main")
        db_domains.add(db_domain)
        realtime_db_fields[db] = db_config.get("fields", [])

    if len(db_bar_sizes_parsed) > 1:
        raise ParameterError(
            "all databases must contain same bar size but {0} have different "
            "bar sizes: {1}".format(", ".join(dbs), ", ".join(db_bar_sizes))
        )

    if len(db_domains) > 1:
        raise ParameterError(
            "all databases must use the same securities master domain but {0} "
            "use different domains: {1}".format(", ".join(dbs), ", ".join(db_domains))
        )

    all_prices = []

    for db in dbs:

        if db in history_dbs:
            # different DBs might support different fields so only request the
            # subset of supported fields
            fields_for_db = set(fields).intersection(set(history_db_fields[db]))

            kwargs = dict(
                start_date=start_date,
                end_date=end_date,
                universes=universes,
                conids=conids,
                exclude_universes=exclude_universes,
                exclude_conids=exclude_conids,
                times=times,
                cont_fut=cont_fut,
                fields=list(fields_for_db),
                tz_naive=False
            )

            tmp_filepath = "{dir}{sep}history.{db}.{pid}.{time}.csv".format(
                dir=TMP_DIR, sep=os.path.sep, db=db, pid=os.getpid(), time=time.time())

            try:
                download_history_file(db, tmp_filepath, **kwargs)
            except NoHistoricalData as e:
                # don't complain about NoHistoricalData if we're checking
                # multiple databases, unless none of them have data
                if len(dbs) == 1:
                    raise
                else:
                    continue

            prices = pd.read_csv(tmp_filepath)
            all_prices.append(prices)

            os.remove(tmp_filepath)

        if db in realtime_agg_dbs:

            fields_for_db = set(fields).intersection(set(realtime_db_fields[db]))

            kwargs = dict(
                start_date=start_date,
                end_date=end_date,
                universes=universes,
                conids=conids,
                exclude_universes=exclude_universes,
                exclude_conids=exclude_conids,
                fields=list(fields_for_db))

            tmp_filepath = "{dir}{sep}realtime.{db}.{pid}.{time}.csv".format(
                dir=TMP_DIR, sep=os.path.sep, db=db, pid=os.getpid(), time=time.time())

            try:
                download_market_data_file(db, tmp_filepath, **kwargs)
            except NoRealtimeData as e:
                # don't complain about NoRealtimeData if we're checking
                # multiple databases, unless none of them have data
                if len(dbs) == 1:
                    raise
                else:
                    continue

            prices = pd.read_csv(tmp_filepath)
            all_prices.append(prices)

            os.remove(tmp_filepath)

    # complain if multiple dbs and none had data
    if len(dbs) > 1 and not all_prices:
        raise NoHistoricalData("no price data matches the query parameters in any of {0}".format(
            ", ".join(dbs)
        ))

    prices = pd.concat(all_prices, sort=False)

    try:
        prices = prices.pivot(index="ConId", columns="Date").T
    except ValueError as e:
        if "duplicate" not in repr(e):
            raise
        # There are duplicates, likely due to querying multiple databases,
        # both of which return one or more identical dates for identical
        # conids. To resolve, we group by conid and date and take the first
        # available value for each field. This means that the orders of
        # [codes] matters. The use of groupby.first() instead of simply
        # drop_duplicates allows us to retain one field from one db and
        # another field from another db.
        grouped = prices.groupby(["ConId","Date"])
        prices = pd.concat(
            dict([(col, grouped[col].first()) for col in prices.columns
                  if col not in ("ConId","Date")]), axis=1)
        prices = prices.reset_index().pivot(index="ConId", columns="Date").T

    prices.index.set_names(["Field", "Date"], inplace=True)

    master_fields = master_fields or []
    if master_fields:
        import warnings
        # DeprecationWarning is ignored by default but we want the user
        # to see it
        warnings.simplefilter("always", DeprecationWarning)
        warnings.warn(
            "`master_fields` parameter is deprecated and will be removed in a "
            "future release. For better performance, please use "
            "`quantrocket.master.get_securities_reindexed_like` "
            "to get securities master data shaped like prices.", DeprecationWarning)

        if isinstance(master_fields, tuple):
            master_fields = list(master_fields)
        elif not isinstance(master_fields, list):
            master_fields = [master_fields]

    # master fields that are required internally but shouldn't be returned to
    # the user (potentially Timezone)
    internal_master_fields = []

    is_intraday = list(db_bar_sizes_parsed)[0] < pd.Timedelta("1 day")

    if is_intraday and not timezone and infer_timezone is not False:
        infer_timezone = True
        if not master_fields or "Timezone" not in master_fields:
            internal_master_fields.append("Timezone")

    # Next, get the master file
    if master_fields or internal_master_fields:
        conids = list(prices.columns)

        domain = list(db_domains)[0] if db_domains else None

        f = six.StringIO()
        download_master_file(
            f,
            conids=conids,
            fields=master_fields + internal_master_fields,
            domain=domain
        )
        securities = pd.read_csv(f, index_col="ConId")

        if "Delisted" in securities.columns:
            securities.loc[:, "Delisted"] = securities.Delisted.astype(bool)

        if "Etf" in securities.columns:
            securities.loc[:, "Etf"] = securities.Etf.astype(bool)

        # Infer timezone if needed
        if not timezone and infer_timezone:
            timezones = securities.Timezone.unique()

            if len(timezones) > 1:
                raise ParameterError(
                    "cannot infer timezone because multiple timezones are present "
                    "in data, please specify timezone explicitly (timezones: {0})".format(
                        ", ".join(timezones)))

            timezone = timezones[0]

        # Drop any internal-only fields
        if internal_master_fields:
            securities = securities.drop(internal_master_fields, axis=1)

        if not securities.empty:
            # Append securities, indexed to the min date, to allow easy ffill on demand
            securities = pd.DataFrame(securities.T, columns=prices.columns)
            securities.index.name = "Field"
            idx = pd.MultiIndex.from_product(
                (securities.index, [prices.index.get_level_values("Date").min()]),
                names=["Field", "Date"])

            securities = securities.reindex(index=idx, level="Field")
            prices = pd.concat((prices, securities))

    if is_intraday:
        dates = pd.to_datetime(prices.index.get_level_values("Date"), utc=True)

        if timezone:
            dates = dates.tz_convert(timezone)
    else:
        dates = pd.to_datetime(prices.index.get_level_values("Date"))

    prices.index = pd.MultiIndex.from_arrays((
        prices.index.get_level_values("Field"),
        dates
        ), names=("Field", "Date"))

    # Split date and time
    dts = prices.index.get_level_values("Date")
    dates = pd.to_datetime(dts.date).tz_localize(None) # drop tz-aware in Date index
    prices.index = pd.MultiIndex.from_arrays(
        (prices.index.get_level_values("Field"),
         dates,
         dts.strftime("%H:%M:%S")),
        names=["Field", "Date", "Time"]
    )

    # Align dates if there are any duplicates. Explanation: Suppose there are
    # two timezones represented in the data (e.g. history db in security
    # timezone vs real-time db in UTC). After parsing these dates into a
    # common timezone, they will align properly, but we pivoted before
    # parsing the dates (for performance reasons), so they may not be
    # aligned. Thus we need to dedupe the index.
    prices = prices.groupby(prices.index).first()
    prices.index = pd.MultiIndex.from_tuples(prices.index)
    prices.index.set_names(["Field", "Date", "Time"], inplace=True)

    # Drop time if not intraday
    if not is_intraday:
        prices.index = prices.index.droplevel("Time")
        return prices

    # If intraday, fill missing times so that each date has the same set of
    # times, allowing easier comparisons. Example implications:
    # - if history is retrieved intraday, this ensures that today will have NaN
    #   entries for future times
    # - early close dates will have a full set of times, with NaNs after the
    #   early close
    unique_fields = prices.index.get_level_values("Field").unique()
    unique_dates = prices.index.get_level_values("Date").unique()
    unique_times = prices.index.get_level_values("Time").unique()
    interpolated_index = None
    for field in unique_fields:
        if master_fields and field in master_fields:
            min_date = prices.loc[field].index.min()
            field_idx = pd.MultiIndex.from_tuples([(field,min_date[0], min_date[1])])
        else:
            field_idx = pd.MultiIndex.from_product([[field], unique_dates, unique_times]).sort_values()
        if interpolated_index is None:
            interpolated_index = field_idx
        else:
            interpolated_index = interpolated_index.append(field_idx)

    prices = prices.reindex(interpolated_index)
    prices.index.set_names(["Field", "Date", "Time"], inplace=True)

    # Apply times filter if needed (see Notes in docstring)
    if times and realtime_agg_dbs:
        if not isinstance(times, (list, tuple)):
            times = [times]
        prices = prices.loc[prices.index.get_level_values("Time").isin(times)]

    return prices
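
As the deprecation note above suggests, securities master data can be fetched separately and shaped like the prices. A minimal sketch, assuming the history database code "usa-stk-1d" exists and that the installed client provides `get_securities_reindexed_like` with this signature:

from quantrocket.master import get_securities_reindexed_like

prices = get_prices("usa-stk-1d", fields=["Close", "Volume"])
closes = prices.loc["Close"]
# securities master data shaped like the prices (field list is an assumption)
securities = get_securities_reindexed_like(closes, fields=["Timezone"])
timezones = securities.loc["Timezone"]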
Example #5
    def get_historical_prices(self,
                              start_date,
                              end_date=None,
                              nlv=None,
                              max_cache=None):
        """
        Downloads historical prices from a history db. Downloads security
        details from the master db and broadcasts the values to be shaped
        like the historical prices.
        """
        if start_date:
            start_date = self._get_start_date_with_lookback(start_date)

        dbs = self.DB
        if not isinstance(dbs, (list, tuple)):
            dbs = [self.DB]

        db_universes = set()
        db_bar_sizes = set()
        for db in dbs:
            db_config = get_db_config(db)
            universes = db_config.get("universes", None)
            if universes:
                db_universes.update(set(universes))
            bar_size = db_config.get("bar_size")
            db_bar_sizes.add(bar_size)

        db_universes = list(db_universes)
        db_bar_sizes = list(db_bar_sizes)

        if len(db_bar_sizes) > 1:
            raise MoonshotParameterError(
                "databases must contain same bar size but have different bar sizes "
                "(databases: {0}; bar sizes: {1})".format(
                    ", ".join(dbs), ", ".join(db_bar_sizes)))

        all_prices = []

        for db in dbs:

            kwargs = dict(start_date=start_date,
                          end_date=end_date,
                          universes=self.UNIVERSES,
                          conids=self.CONIDS,
                          exclude_universes=self.EXCLUDE_UNIVERSES,
                          exclude_conids=self.EXCLUDE_CONIDS,
                          times=self.DB_TIME_FILTERS,
                          cont_fut=self.CONT_FUT,
                          fields=self.DB_FIELDS,
                          tz_naive=False)

            if max_cache:
                prices = HistoryCache.load(db, kwargs, max_cache)

                if prices is not None:
                    all_prices.append(prices)
                    continue

            if max_cache:
                f = HistoryCache.get_filepath(db, kwargs)
            else:
                f = io.StringIO()
            download_history_file(db, f, **kwargs)

            prices = pd.read_csv(f)
            all_prices.append(prices)

        prices = pd.concat(all_prices)

        prices = prices.pivot(index="ConId", columns="Date").T
        prices.index.set_names(["Field", "Date"], inplace=True)

        # Next, get the master file
        universes = self.UNIVERSES
        conids = self.CONIDS
        if not conids and not universes:
            universes = db_universes
            if not universes:
                conids = list(prices.columns)

        f = io.StringIO()
        download_master_file(f,
                             conids=conids,
                             universes=universes,
                             exclude_conids=self.EXCLUDE_CONIDS,
                             exclude_universes=self.EXCLUDE_UNIVERSES,
                             fields=self.MASTER_FIELDS)
        securities = pd.read_csv(f, index_col="ConId")

        nlv = nlv or self._get_nlv()
        if nlv:
            missing_nlvs = set(securities.Currency) - set(nlv.keys())
            if missing_nlvs:
                raise ValueError(
                    "NLV dict is missing values for required currencies: {0}".
                    format(", ".join(missing_nlvs)))

            securities['Nlv'] = securities.apply(
                lambda row: nlv.get(row.Currency, None), axis=1)

        # Append securities, indexed to the min date, to allow easy ffill on demand
        securities = pd.DataFrame(securities.T, columns=prices.columns)
        securities.index.name = "Field"
        idx = pd.MultiIndex.from_product(
            (securities.index, [prices.index.get_level_values("Date").min()]),
            names=["Field", "Date"])

        securities = securities.reindex(index=idx, level="Field")
        prices = pd.concat((prices, securities))

        timezone = self.TIMEZONE or self._infer_timezone(prices)

        dates = pd.to_datetime(prices.index.get_level_values("Date"), utc=True)
        dates = dates.tz_convert(timezone)

        prices.index = pd.MultiIndex.from_arrays(
            (prices.index.get_level_values("Field"), dates),
            names=("Field", "Date"))

        # Split date and time
        dts = prices.index.get_level_values("Date")
        dates = pd.to_datetime(dts.date)
        dates.tz = timezone
        prices.index = pd.MultiIndex.from_arrays(
            (prices.index.get_level_values("Field"), dates,
             dts.strftime("%H:%M:%S")),
            names=["Field", "Date", "Time"])

        if db_bar_sizes[0] in ("1 day", "1 week", "1 month"):
            prices.index = prices.index.droplevel("Time")

        return prices
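
A hedged illustration of consuming the result inside a strategy method, assuming a daily-bar database: the securities master fields are appended as extra Field rows at the earliest date only, so forward-fill them before use (whether "Currency" is present depends on MASTER_FIELDS):

prices = self.get_historical_prices("2017-01-01", end_date="2017-12-31")
closes = prices.loc["Close"]
# master fields are written only at the earliest date; ffill to all dates
currencies = prices.loc["Currency"].reindex(closes.index, method="ffill")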
Example #6

def get_reuters_estimates_reindexed_like(reindex_like,
                                         codes,
                                         fields=["Actual"],
                                         period_types=["Q"],
                                         max_lag=None):
    """
    Return a multiindex (Indicator, Field, Date) DataFrame of point-in-time
    Reuters estimates and actuals for one or more indicator codes, reindexed
    to match the index (dates) and columns (conids) of `reindex_like`.
    Estimates and actuals are forward-filled in order to provide the latest
    reading at any given date, indexed to the UpdatedDate field.
    UpdatedDate is shifted forward 1 day to avoid lookahead bias.

    Parameters
    ----------
    reindex_like : DataFrame, required
        a DataFrame (usually of prices) with dates for the index and conids
        for the columns, to which the shape of the resulting DataFrame will
        be conformed

    codes : list of str, required
        the indicator code(s) to query. Use the `list_reuters_codes`
        function to see available codes.

    fields : list of str
        a list of fields to include in the resulting DataFrame. Defaults to
        simply including the Actual field.

    period_types : list of str, optional
        limit to these fiscal period types. Possible choices: A, Q, S, where
        A=Annual, Q=Quarterly, S=Semi-Annual. Default is Q/Quarterly.

    max_lag : str, optional
        maximum amount of time a data point can be used after the
        associated fiscal period has ended. Setting a limit can prevent
        using data that is reported long after the fiscal period ended, or
        can limit how far data is forward filled in the absence of subsequent
        data. Specify as a Pandas offset alias, e.g. '500D'. By default, no
        maximum limit is applied.

    Returns
    -------
    DataFrame
        a multiindex (Indicator, Field, Date) DataFrame of estimates and actuals,
        shaped like the input DataFrame

    Examples
    --------
    Query book value per share (code BVPS):

    >>> closes = prices.loc["Close"]
    >>> estimates = get_reuters_estimates_reindexed_like(closes, codes=["BVPS"])
    >>> book_values_per_share = estimates.loc["BVPS"].loc["Actual"]
    """
    try:
        import pandas as pd
    except ImportError:
        raise ImportError("pandas must be installed to use this function")

    index_levels = reindex_like.index.names
    if "Time" in index_levels:
        raise ParameterError(
            "reindex_like should not have 'Time' in index, please take a cross-section first, "
            "for example: `prices.loc['Close'].xs('15:45:00', level='Time')`")

    if index_levels != ["Date"]:
        raise ParameterError(
            "reindex_like must have index called 'Date', but has {0}".format(
                ",".join([str(name) for name in index_levels])))

    if not hasattr(reindex_like.index, "date"):
        raise ParameterError("reindex_like must have a DatetimeIndex")

    conids = list(reindex_like.columns)
    start_date = reindex_like.index.min().date()
    # Since financial reports are sparse, start well before the reindex_like
    # min date
    start_date -= pd.Timedelta(days=365 + 180)
    start_date = start_date.isoformat()
    end_date = reindex_like.index.max().date().isoformat()

    f = six.StringIO()
    query_fields = list(
        fields
    )  # copy fields on Py2 or 3: https://stackoverflow.com/a/2612815/417414
    if "UpdatedDate" not in query_fields:
        query_fields.append("UpdatedDate")
    download_reuters_estimates(codes,
                               f,
                               conids=conids,
                               start_date=start_date,
                               end_date=end_date,
                               fields=query_fields,
                               period_types=period_types)
    parse_dates = ["UpdatedDate", "FiscalPeriodEndDate"]
    if "AnnounceDate" in fields:
        parse_dates.append("AnnounceDate")
    estimates = pd.read_csv(f, parse_dates=parse_dates)

    # Drop records with no actuals
    estimates = estimates.loc[estimates.UpdatedDate.notnull()]

    # Convert UTC UpdatedDate to security timezone, and cast to date for
    # index
    f = six.StringIO()
    download_master_file(f,
                         conids=list(estimates.ConId.unique()),
                         delisted=True,
                         fields=["Timezone"])
    timezones = pd.read_csv(f, index_col="ConId")
    estimates = estimates.join(timezones, on="ConId")
    if estimates.Timezone.isnull().any():
        conids_missing_timezones = list(
            estimates.ConId[estimates.Timezone.isnull()].unique())
        raise MissingData(
            "timezones are missing for some conids so cannot convert UTC "
            "estimates to timezone of security (conids missing timezone: {0})".
            format(",".join([str(conid)
                             for conid in conids_missing_timezones])))

    # If only 1 timezone in data, use a faster method
    if len(estimates.Timezone.unique()) == 1:
        timezone = list(estimates.Timezone.unique())[0]
        estimates["Date"] = pd.to_datetime(
            estimates.UpdatedDate.values).tz_localize("UTC").tz_convert(
                timezone)
    else:
        estimates["Date"] = estimates.apply(
            lambda row: row.UpdatedDate.tz_localize("UTC").tz_convert(
                row.Timezone),
            axis=1)

    # Convert to dates (i.e. time = 00:00:00)
    estimates.loc[:, "Date"] = pd.to_datetime(
        estimates.Date.apply(
            lambda x: pd.datetime.combine(x,
                                          pd.Timestamp("00:00:00").time())))

    # Drop any fields we don't need
    needed_fields = set(fields)
    needed_fields.update(set(("ConId", "Date", "Indicator")))
    if max_lag:
        needed_fields.add("FiscalPeriodEndDate")
    unneeded_fields = set(estimates.columns) - needed_fields
    if unneeded_fields:
        estimates = estimates.drop(unneeded_fields, axis=1)

    # if reindex_like.index is tz-aware, make financials tz-aware so they can
    # be joined (tz-aware or tz-naive are both fine, as SourceDate represents
    # dates which are assumed to be in the local timezone of the reported
    # company)
    if reindex_like.index.tz:
        estimates.loc[:, "Date"] = estimates.Date.dt.tz_localize(
            reindex_like.index.tz.zone)
        deduped_updated_dates = estimates.Date.drop_duplicates()
    else:
        # joining to tz-naive requires using values, whereas joining to
        # tz-aware requires not using it. Why?
        deduped_updated_dates = estimates.Date.drop_duplicates().values

    # Create a unioned index of input DataFrame and UpdatedDate
    union_date_idx = reindex_like.index.union(
        deduped_updated_dates).sort_values()

    all_estimates = {}
    for code in codes:
        estimates_for_code = estimates.loc[estimates.Indicator == code]
        if estimates_for_code.empty:
            continue
        if "Indicator" not in fields:
            estimates_for_code = estimates_for_code.drop("Indicator", axis=1)
        # There might be duplicate UpdatedDates if a company announced
        # reports for several fiscal periods at once. In this case we keep
        # only the last value (i.e. latest fiscal period)
        estimates_for_code = estimates_for_code.drop_duplicates(
            subset=["ConId", "Date"], keep="last")
        estimates_for_code = estimates_for_code.pivot(index="ConId",
                                                      columns="Date").T
        multiidx = pd.MultiIndex.from_product(
            (estimates_for_code.index.get_level_values(0).unique(),
             union_date_idx),
            names=["Field", "Date"])
        estimates_for_code = estimates_for_code.reindex(
            index=multiidx, columns=reindex_like.columns)

        # estimates are sparse so ffill (one field at a time)
        all_fields_for_code = {}
        for field in estimates_for_code.index.get_level_values(
                "Field").unique():
            field_for_code = estimates_for_code.loc[field].fillna(
                method="ffill")

            # Shift to avoid lookahead bias
            field_for_code = field_for_code.shift()

            all_fields_for_code[field] = field_for_code

        # Filter stale values if asked to
        if max_lag:
            fiscal_period_end_dates = all_fields_for_code[
                "FiscalPeriodEndDate"]
            # subtract the max_lag from the index date to get the
            # earliest possible fiscal period end date for that row
            earliest_allowed_fiscal_period_end_dates = fiscal_period_end_dates.apply(
                lambda x: fiscal_period_end_dates.index - pd.Timedelta(max_lag))
            within_max_timedelta = fiscal_period_end_dates.apply(
                pd.to_datetime) >= earliest_allowed_fiscal_period_end_dates

            for field, field_for_code in all_fields_for_code.items():
                field_for_code = field_for_code.where(within_max_timedelta)
                all_fields_for_code[field] = field_for_code

            # Clean up if FiscalPeriodEndDate was just kept around for this purpose
            if "FiscalPeriodEndDate" not in fields:
                del all_fields_for_code["FiscalPeriodEndDate"]

        estimates_for_code = pd.concat(all_fields_for_code,
                                       names=["Field", "Date"])

        # In case the estimates included dates not in the input
        # DataFrame, drop those now that we've ffilled
        extra_dates = union_date_idx.difference(reindex_like.index)
        if not extra_dates.empty:
            estimates_for_code.drop(extra_dates,
                                    axis=0,
                                    level="Date",
                                    inplace=True)

        all_estimates[code] = estimates_for_code

    estimates = pd.concat(all_estimates, names=["Indicator", "Field", "Date"])

    return estimates
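
A hedged example of the `max_lag` option described in the parameters: cap how long an estimate can be forward-filled after its fiscal period ends. The "BVPS" code and "500D" offset are taken from the docstring above:

closes = prices.loc["Close"]
estimates = get_reuters_estimates_reindexed_like(
    closes, codes=["BVPS"], fields=["Actual"], max_lag="500D")
book_values_per_share = estimates.loc["BVPS"].loc["Actual"]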
Example #7

def _get_stockloan_data_reindexed_like(stockloan_func,
                                       stockloan_field,
                                       reindex_like,
                                       time=None):
    """
    Common base function for get_shortable_shares_reindexed_like and
    get_borrow_fees_reindexed_like.
    """
    try:
        import pandas as pd
    except ImportError:
        raise ImportError("pandas must be installed to use this function")

    index_levels = reindex_like.index.names
    if "Time" in index_levels:
        raise ParameterError(
            "reindex_like should not have 'Time' in index, please take a cross-section first, "
            "for example: `prices.loc['Close'].xs('15:45:00', level='Time')`")

    if index_levels != ["Date"]:
        raise ParameterError(
            "reindex_like must have index called 'Date', but has {0}".format(
                ",".join([str(name) for name in index_levels])))

    if not hasattr(reindex_like.index, "date"):
        raise ParameterError("reindex_like must have a DatetimeIndex")

    conids = list(reindex_like.columns)
    start_date = reindex_like.index.min().date()
    # Stockloan data is sparse but batched in monthly files, so start >1-month
    # before the reindex_like min date
    start_date -= pd.Timedelta(days=45)
    start_date = start_date.isoformat()
    end_date = reindex_like.index.max().date().isoformat()

    f = six.StringIO()
    stockloan_func(f, conids=conids, start_date=start_date, end_date=end_date)
    stockloan_data = pd.read_csv(f)
    stockloan_data.loc[:, "Date"] = pd.to_datetime(stockloan_data.Date,
                                                   utc=True)

    # Determine timezone, from:
    # - time param if provided
    # - else reindex_like.index.tz if set
    # - else component securities if all have same timezone
    timezone = None

    if time and " " in time:
        time, timezone = time.split(" ", 1)

        if reindex_like.index.tz and reindex_like.index.tz.zone != timezone:
            raise ParameterError((
                "cannot use timezone {0} because reindex_like timezone is {1}, "
                "these must match".format(timezone,
                                          reindex_like.index.tz.zone)))

    if not timezone:
        if reindex_like.index.tz:
            timezone = reindex_like.index.tz.zone
        else:
            # try to infer from component securities
            f = six.StringIO()
            download_master_file(f,
                                 conids=list(stockloan_data.ConId.unique()),
                                 delisted=True,
                                 fields=["Timezone"])
            security_timezones = pd.read_csv(f, index_col="ConId")
            security_timezones = list(security_timezones.Timezone.unique())
            if len(security_timezones) > 1:
                raise ParameterError(
                    "no timezone specified and cannot infer because multiple timezones are "
                    "present in data, please specify timezone (timezones in data: {0})"
                    .format(", ".join(security_timezones)))
            timezone = security_timezones[0]

    # Create an index of `reindex_like` dates at `time`
    if time:
        try:
            time = pd.Timestamp(time).time()
        except ValueError as e:
            raise ParameterError("could not parse time '{0}': {1}".format(
                time, str(e)))
        index_at_time = pd.Index(reindex_like.index.to_series().apply(
            lambda x: pd.datetime.combine(x, time)))
    else:
        index_at_time = reindex_like.index

    if not index_at_time.tz:
        index_at_time = index_at_time.tz_localize(timezone)

    index_at_time = index_at_time.tz_convert("UTC")

    stockloan_data = stockloan_data.pivot(index="ConId", columns="Date").T
    stockloan_data = stockloan_data.loc[stockloan_field]

    # Create a unioned index of requested times and stockloan data timestamps
    unioned_idx = index_at_time.union(stockloan_data.index)
    # performance boost: don't need to reindex and ffill earlier than the
    # stockloan data start date
    unioned_idx = unioned_idx[unioned_idx >= stockloan_data.index.min()]
    stockloan_data = stockloan_data.reindex(index=unioned_idx,
                                            columns=reindex_like.columns)

    stockloan_data = stockloan_data.fillna(method="ffill")

    # Keep only the requested times, now that we've ffilled
    stockloan_data = stockloan_data.reindex(index=index_at_time)

    # Replace index_at_time with the original reindex_like index (this needs
    # to be done because index_at_time is tz-aware and reindex_like may not
    # be, and because index_at_time uses the requested `time` if provided.
    # Since index_at_time was derived from reindex_like.index and has the
    # same shape, it is safe to replace it.)
    stockloan_data.index = reindex_like.index

    return stockloan_data
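
A hedged sketch of how the two public wrappers named in this docstring might be layered on the base function. `download_shortable_shares` and `download_borrow_fees` are the QuantRocket download functions, but the column names ("Quantity", "FeeRate") and the exact wiring here are assumptions:

from quantrocket.fundamental import (
    download_shortable_shares,
    download_borrow_fees,
)

def get_shortable_shares_reindexed_like(reindex_like, time=None):
    return _get_stockloan_data_reindexed_like(
        download_shortable_shares, stockloan_field="Quantity",
        reindex_like=reindex_like, time=time)

def get_borrow_fees_reindexed_like(reindex_like, time=None):
    return _get_stockloan_data_reindexed_like(
        download_borrow_fees, stockloan_field="FeeRate",
        reindex_like=reindex_like, time=time)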
Example #8
def get_historical_prices(codes,
                          start_date=None,
                          end_date=None,
                          universes=None,
                          conids=None,
                          exclude_universes=None,
                          exclude_conids=None,
                          times=None,
                          cont_fut=None,
                          fields=None,
                          master_fields=None,
                          timezone=None,
                          infer_timezone=None):
    """
    Query one or more history databases and load prices into a DataFrame.

    For bar sizes smaller than 1-day, the resulting DataFrame will have a MultiIndex
    with levels (Field, Date, Time). For bar sizes of 1-day or larger, the MultiIndex
    will have levels (Field, Date).

    Parameters
    ----------
    codes : str or list of str, required
        the code(s) of one or more databases to query. If multiple databases
        are specified, they must have the same bar size.

    start_date : str (YYYY-MM-DD), optional
        limit to history on or after this date

    end_date : str (YYYY-MM-DD), optional
        limit to history on or before this date

    universes : list of str, optional
        limit to these universes (default is to return all securities in database)

    conids : list of int, optional
        limit to these conids

    exclude_universes : list of str, optional
        exclude these universes

    exclude_conids : list of int, optional
        exclude these conids

    times : list of str (HH:MM:SS), optional
        limit to these times

    cont_fut : str
        stitch futures into continuous contracts using this method (default is not
        to stitch together). Possible choices: concat

    fields : list of str, optional
        only return these fields (pass ['?'] or any invalid fieldname to see
        available fields)

    master_fields : list of str, optional
        append these fields from the securities master database (pass ['?'] or any
        invalid fieldname to see available fields)

    timezone : str, optional
        convert timestamps to this timezone, for example America/New_York (see
        `pytz.all_timezones` for choices); ignored for non-intraday bar sizes

    infer_timezone : bool
        infer the timezone from the securities master Timezone field; defaults to
        True if using intraday bars and no `timezone` specified; ignored for
        non-intraday bars, or if `timezone` is passed

    Returns
    -------
    DataFrame
        a MultiIndex DataFrame of prices

    Examples
    --------
    Load intraday prices:

    >>> prices = get_historical_prices('stk-sample-5min', fields=["Close", "Volume"], timezone="America/New_York")
    >>> prices.head()
                                ConId   	265598	38708077
    Field	Date	        Time
    Close	2017-07-26      09:30:00	153.62	2715.0
                                09:35:00	153.46	2730.0
                                09:40:00	153.21	2725.0
                                09:45:00	153.28	2725.0
                                09:50:00	153.18	2725.0

    Isolate the closes:

    >>> closes = prices.loc["Close"]
    >>> closes.head()
                ConId	        265598  38708077
    Date        Time
    2017-07-26	09:30:00	153.62	2715.0
                09:35:00	153.46	2730.0
                09:40:00	153.21	2725.0
                09:45:00	153.28	2725.0
                09:50:00	153.18	2725.0

    Isolate the 15:45:00 prices:

    >>> session_closes = closes.xs("15:45:00", level="Time")
    >>> session_closes.head()
        ConId	265598	38708077
    Date
    2017-07-26	153.29	2700.00
    2017-07-27 	150.10	2660.00
    2017-07-28	149.43	2650.02
    2017-07-31 	148.99	2650.34
    2017-08-01 	149.72	2675.50
    """
    # Import pandas lazily since it can take a moment to import
    try:
        import pandas as pd
    except ImportError:
        raise ImportError("pandas must be installed to use this function")

    try:
        import pytz
    except ImportError:
        raise ImportError("pytz must be installed to use this function")

    if timezone and timezone not in pytz.all_timezones:
        raise ParameterError(
            "invalid timezone: {0} (see `pytz.all_timezones` for choices)".
            format(timezone))

    dbs = codes
    if not isinstance(dbs, (list, tuple)):
        dbs = [dbs]

    if master_fields:
        if isinstance(master_fields, tuple):
            master_fields = list(master_fields)
        elif not isinstance(master_fields, list):
            master_fields = [master_fields]

    db_universes = set()
    db_bar_sizes = set()
    for db in dbs:
        db_config = get_db_config(db)
        _db_universes = db_config.get("universes", None)
        if _db_universes:
            db_universes.update(set(_db_universes))
        bar_size = db_config.get("bar_size")
        db_bar_sizes.add(bar_size)

    db_universes = list(db_universes)
    db_bar_sizes = list(db_bar_sizes)

    if len(db_bar_sizes) > 1:
        raise ParameterError(
            "all databases must contain same bar size but {0} have different "
            "bar sizes: {1}".format(", ".join(dbs), ", ".join(db_bar_sizes)))

    all_prices = []

    for db in dbs:

        kwargs = dict(start_date=start_date,
                      end_date=end_date,
                      universes=universes,
                      conids=conids,
                      exclude_universes=exclude_universes,
                      exclude_conids=exclude_conids,
                      times=times,
                      cont_fut=cont_fut,
                      fields=fields,
                      tz_naive=False)

        tmp_filepath = "{0}/history.{1}.{2}.{3}.csv".format(
            TMP_DIR, db, os.getpid(), time.time())
        download_history_file(db, tmp_filepath, **kwargs)

        prices = pd.read_csv(tmp_filepath)
        all_prices.append(prices)

        os.remove(tmp_filepath)

    prices = pd.concat(all_prices)

    prices = prices.pivot(index="ConId", columns="Date").T
    prices.index.set_names(["Field", "Date"], inplace=True)

    is_intraday = db_bar_sizes[0] not in ("1 day", "1 week", "1 month")

    if is_intraday and not timezone and infer_timezone is not False:
        infer_timezone = True
        if not master_fields:
            master_fields = []
        if "Timezone" not in master_fields:
            master_fields.append("Timezone")

    # Next, get the master file
    if master_fields:
        universes = universes
        conids = conids
        if not conids and not universes:
            universes = db_universes
            if not universes:
                conids = list(prices.columns)

        f = six.StringIO()
        download_master_file(f,
                             conids=conids,
                             universes=universes,
                             exclude_conids=exclude_conids,
                             exclude_universes=exclude_universes,
                             fields=master_fields,
                             delisted=True)
        securities = pd.read_csv(f, index_col="ConId")

        if "Delisted" in securities.columns:
            securities.loc[:, "Delisted"] = securities.Delisted.astype(bool)

        if "Etf" in securities.columns:
            securities.loc[:, "Etf"] = securities.Etf.astype(bool)

        # Append securities, indexed to the min date, to allow easy ffill on demand
        securities = pd.DataFrame(securities.T, columns=prices.columns)
        securities.index.name = "Field"
        idx = pd.MultiIndex.from_product(
            (securities.index, [prices.index.get_level_values("Date").min()]),
            names=["Field", "Date"])

        securities = securities.reindex(index=idx, level="Field")
        prices = pd.concat((prices, securities))

    if is_intraday:
        dates = pd.to_datetime(prices.index.get_level_values("Date"), utc=True)

        if not timezone and infer_timezone:
            timezone = _infer_timezone(prices)

        if timezone:
            dates = dates.tz_convert(timezone)
    else:
        dates = pd.to_datetime(prices.index.get_level_values("Date"))

    prices.index = pd.MultiIndex.from_arrays(
        (prices.index.get_level_values("Field"), dates),
        names=("Field", "Date"))

    # Split date and time
    dts = prices.index.get_level_values("Date")
    dates = pd.to_datetime(dts.date).tz_localize(
        None)  # drop tz-aware in Date index
    prices.index = pd.MultiIndex.from_arrays(
        (prices.index.get_level_values("Field"), dates,
         dts.strftime("%H:%M:%S")),
        names=["Field", "Date", "Time"])

    # Align dates if there are any duplicates. Explanation: Suppose there are
    # two timezones represented in the data. After parsing these dates into a
    # common timezone, they will align properly, but we pivoted before
    # parsing the dates (for performance reasons), so they may not be
    # aligned. Thus we need to dedupe the index.
    prices = prices.groupby(prices.index).first()
    prices.index = pd.MultiIndex.from_tuples(prices.index)
    prices.index.set_names(["Field", "Date", "Time"], inplace=True)

    # Drop time if not intraday
    if not is_intraday:
        prices.index = prices.index.droplevel("Time")
        return prices

    # If intraday, fill missing times so that each date has the same set of
    # times, allowing easier comparisons. Example implications:
    # - if history is retrieved intraday, this ensures that today will have NaN
    #   entries for future times
    # - early close dates will have a full set of times, with NaNs after the
    #   early close
    unique_fields = prices.index.get_level_values("Field").unique()
    unique_dates = prices.index.get_level_values("Date").unique()
    unique_times = prices.index.get_level_values("Time").unique()
    interpolated_index = None
    for field in unique_fields:
        if master_fields and field in master_fields:
            min_date = prices.loc[field].index.min()
            field_idx = pd.MultiIndex.from_tuples([(field, min_date[0],
                                                    min_date[1])])
        else:
            field_idx = pd.MultiIndex.from_product([[field], unique_dates,
                                                    unique_times])
        if interpolated_index is None:
            interpolated_index = field_idx
        else:
            interpolated_index = interpolated_index.append(field_idx)

    prices = prices.reindex(interpolated_index)
    prices.index.set_names(["Field", "Date", "Time"], inplace=True)

    return prices
Example #9
    def _get_historical_prices(self, start_date, end_date, nlv=None):
        """
        Downloads historical prices from a history db. Downloads security
        details from the master db and broadcasts the values to be shaped
        like the historical prices.
        """
        if start_date:
            start_date = self._get_start_date_with_lookback(start_date)

        f = io.StringIO()
        download_history_file(self.DB,
                              f,
                              start_date=start_date,
                              end_date=end_date,
                              universes=self.UNIVERSES,
                              conids=self.CONIDS,
                              exclude_universes=self.EXCLUDE_UNIVERSES,
                              exclude_conids=self.EXCLUDE_CONIDS,
                              times=self.DB_TIME_FILTERS,
                              cont_fut=self.CONT_FUT,
                              fields=self.DB_FIELDS,
                              tz_naive=False)

        prices = pd.read_csv(f, parse_dates=["Date"])
        prices = prices.pivot(index="ConId", columns="Date").T
        prices.index.set_names(["Field", "Date"], inplace=True)

        # Next, get the master file
        universes = self.UNIVERSES
        conids = self.CONIDS
        if not conids and not universes:
            universes = get_db_config(self.DB).get("universes", None)
            if not universes:
                conids = list(prices.columns)

        f = io.StringIO()
        download_master_file(f,
                             conids=conids,
                             universes=universes,
                             exclude_conids=self.EXCLUDE_CONIDS,
                             exclude_universes=self.EXCLUDE_UNIVERSES,
                             fields=self.MASTER_FIELDS)
        securities = pd.read_csv(f, index_col="ConId")

        nlv = nlv or self._get_nlv()
        if nlv:
            missing_nlvs = set(securities.Currency) - set(nlv.keys())
            if missing_nlvs:
                raise ValueError(
                    "NLV dict is missing values for required currencies: {0}".
                    format(", ".join(missing_nlvs)))

            securities['Nlv'] = securities.apply(
                lambda row: nlv.get(row.Currency, None), axis=1)

        securities = pd.DataFrame(securities.T, columns=prices.columns)
        securities.index.name = "Field"
        idx = pd.MultiIndex.from_product(
            (securities.index, prices.index.get_level_values("Date").unique()),
            names=["Field", "Date"])

        broadcast_securities = securities.reindex(index=idx, level="Field")
        prices = pd.concat((prices, broadcast_securities))

        return prices
Example #10

    def _write_assets(self, prices, asset_db_writer):
        """
        Queries the master service and prepares the securities for the asset
        writer.
        """

        f = io.StringIO()

        download_master_file(f,
                             universes=self.universes,
                             delisted=True,
                             fields=[
                                 "ConId", "PrimaryExchange", "Symbol",
                                 "SecType", "LocalSymbol", "LongName",
                                 "MinTick", "Multiplier", "LastTradeDate",
                                 "ContractMonth", "Timezone", "UnderConId"
                             ])

        assets = pd.read_csv(f, index_col="ConId")
        assets[["Symbol", "LocalSymbol"]] = assets[["Symbol",
                                                    "LocalSymbol"]].astype(str)

        grouped_by_conid = prices.reset_index().groupby("ConId")
        max_dates = grouped_by_conid.Date.max()
        min_dates = grouped_by_conid.Date.min()
        min_dates.name = "start_date"
        max_dates.name = "end_date"
        assets = assets.join(min_dates).join(max_dates)
        assets["first_traded"] = assets["start_date"]

        exchanges = pd.DataFrame(assets,
                                 columns=["PrimaryExchange",
                                          "Timezone"]).drop_duplicates()
        exchanges = exchanges.rename(columns={
            "PrimaryExchange": "exchange",
            "Timezone": "timezone"
        })

        equities = assets[assets.SecType == "STK"].copy()
        if equities.empty:
            equities = None
        else:
            equities = pd.DataFrame(equities,
                                    columns=[
                                        "PrimaryExchange", "Symbol",
                                        "LongName", "start_date", "end_date",
                                        "first_traded"
                                    ])
            equities = equities.rename(
                columns={
                    "PrimaryExchange": "exchange",
                    "Symbol": "symbol",
                    "LongName": "asset_name"
                })

        futures = assets[assets.SecType == "FUT"].copy()
        if futures.empty:
            futures = None
            root_symbols = None
        else:
            # Concat local symbol plus contract month to ensure unique symbol
            futures["symbol"] = futures.LocalSymbol.str.cat(
                futures.ContractMonth.astype(str), "-")

            futures = futures.rename(
                columns={
                    "PrimaryExchange": "exchange",
                    "Symbol": "root_symbol",
                    "LongName": "asset_name",
                    "Multiplier": "multiplier",
                    "MinTick": "tick_size",
                    "LastTradeDate": "auto_close_date",
                    "UnderConId": "root_symbol_id"
                })
            futures["expiration_date"] = futures.auto_close_date
            root_symbols = pd.DataFrame(
                futures, columns=["root_symbol", "exchange",
                                  "root_symbol_id"]).drop_duplicates()
            futures = futures.drop(
                ["root_symbol_id", "LocalSymbol", "ContractMonth", "SecType"],
                axis=1)

        asset_db_writer.write(equities=equities,
                              futures=futures,
                              exchanges=exchanges,
                              root_symbols=root_symbols)
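
For context, a hedged sketch of where a method like `_write_assets` would typically be called: inside a zipline-style bundle ingest function that receives the `asset_db_writer`. The bundle class and its other methods are hypothetical stand-ins:

from zipline.data.bundles import register

def ingest(environ, asset_db_writer, minute_bar_writer, daily_bar_writer,
           adjustment_writer, calendar, start_session, end_session, cache,
           show_progress, output_dir):
    bundle = QuantRocketBundle(universes=["my-universe"])  # hypothetical class
    prices = bundle._get_prices()                          # hypothetical method
    bundle._write_assets(prices, asset_db_writer)

register("quantrocket-bundle", ingest)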